diff --git a/pyproject.toml b/pyproject.toml
index 7938d0f..41a9b84 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,11 +31,7 @@ sentence-transformers = "^5.2.2"
spacy = "^3.8.11"
refinedoc = "^1.0.0"
qdrant-client = "1.16.2"
-<<<<<<< Feature/external-id-scientif-journals
-python-dotenv = "^1.2.1"
-=======
python-dotenv = "^1.2.2"
->>>>>>> main
beautifulsoup4 = "^4.14.3"
pyphen = "^0.17.2"
ijson = "^3.4.0"
diff --git a/tests/url_collector/resources/atom_file_doi.xml b/tests/url_collector/resources/atom_file_doi.xml
new file mode 100644
index 0000000..1254355
--- /dev/null
+++ b/tests/url_collector/resources/atom_file_doi.xml
@@ -0,0 +1,43 @@
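+<?xml version="1.0" encoding="UTF-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+  <title>Example Atom Feed</title>
+  <link href="https://www.example.com/feed" rel="self"/>
+  <link href="https://www.example.com/"/>
+  <id>https://doi.org/10.1234/feed</id>
+  <updated>2023-09-12T00:00:00Z</updated>
+  <author>
+    <name>Your Name</name>
+    <email>your-email@example.com</email>
+  </author>
+
+  <entry>
+    <title>First Entry</title>
+    <link href="https://www.example.com/entry1" rel="alternate"/>
+    <id>https://doi.org/10.1234/entry1</id>
+    <published>2023-09-12T12:00:00Z</published>
+    <updated>2023-09-12T12:00:00Z</updated>
+    <summary>This is the summary of the first entry.</summary>
+    <content type="html">&lt;p&gt;This is the content of the first entry.&lt;/p&gt;</content>
+  </entry>
+
+  <entry>
+    <title>Second Entry</title>
+    <link href="https://www.example.com/entry2" rel="alternate"/>
+    <id>https://doi.org/10.1234/entry2</id>
+    <published>2023-09-13T09:30:00Z</published>
+    <updated>2023-09-13T09:30:00Z</updated>
+    <summary>This is the summary of the second entry.</summary>
+    <content type="html">&lt;p&gt;This is the content of the second entry.&lt;/p&gt;</content>
+  </entry>
+
+  <entry>
+    <title>Third Entry</title>
+    <link href="https://www.example.com/entry3" rel="alternate"/>
+    <id>https://doi.org/10.1234/entry3</id>
+    <published>2023-09-14T15:45:00Z</published>
+    <updated>2023-09-14T15:45:00Z</updated>
+    <summary>This is the summary of the third entry.</summary>
+    <content type="html">&lt;p&gt;This is the content of the third entry.&lt;/p&gt;</content>
+  </entry>
+
+</feed>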
\ No newline at end of file
diff --git a/tests/url_collector/test_atom_collector.py b/tests/url_collector/test_atom_collector.py
index 8b69339..e38efb9 100644
--- a/tests/url_collector/test_atom_collector.py
+++ b/tests/url_collector/test_atom_collector.py
@@ -2,6 +2,7 @@
from unittest import TestCase
from unittest.mock import Mock, patch
+from welearn_database.data.enumeration import ExternalIdType
from welearn_database.data.models import Corpus
from welearn_datastack.collectors.atom_collector import AtomURLCollector
@@ -10,11 +11,16 @@
class Test(TestCase):
def setUp(self) -> None:
self.rss_file_path = Path(__file__).parent / "resources" / "atom_file.xml"
+ self.rss_file_path_doi = (
+ Path(__file__).parent / "resources" / "atom_file_doi.xml"
+ )
self.mock_corpus = Corpus(
source_name="test", is_fix=True, main_url="https://www.example.com"
)
with self.rss_file_path.open(mode="r") as f:
self.rss_content = f.read()
+ with self.rss_file_path_doi.open(mode="r") as f:
+ self.rss_content_doi = f.read()
@patch("welearn_datastack.collectors.atom_collector.get_new_https_session")
def test_atom_urlcollector(self, mock_get_new_https_session):
@@ -39,6 +45,35 @@ def test_atom_urlcollector(self, mock_get_new_https_session):
for i in range(0, len(collected)):
self.assertEqual(collected[i].url, f"https://www.example.com/entry{i+1}")
self.assertEqual(collected[i].corpus.source_name, "test")
+ self.assertEqual(collected[i].external_id, f"entry{i+1}")
+ self.assertEqual(collected[i].external_id_type, ExternalIdType.SLUG)
+ self.assertEqual(collected[i].corpus.is_fix, True)
+
+ @patch("welearn_datastack.collectors.atom_collector.get_new_https_session")
+ def test_atom_urlcollector_with_id_doi(self, mock_get_new_https_session):
+ """
+        Test the collect method of the AtomURLCollector class, on an Atom feed with 3 entries whose ids are DOIs
+ """
+ mock_session = Mock()
+ mock_response = Mock()
+ mock_get_new_https_session.return_value = mock_session
+ mock_session.ok.return_value = True
+ mock_session.get.return_value = mock_response
+
+ mock_response.content = self.rss_content_doi.encode("utf-8")
+
+ rss_collector = AtomURLCollector(
+ feed_url="https://www.example.com",
+ corpus=self.mock_corpus,
+ )
+ collected = rss_collector.collect()
+ self.assertEqual(3, len(collected))
+
+ for i in range(0, len(collected)):
+ self.assertEqual(collected[i].url, f"https://www.example.com/entry{i+1}")
+ self.assertEqual(collected[i].corpus.source_name, "test")
+ self.assertEqual(collected[i].external_id, f"10.1234/entry{i+1}")
+ self.assertEqual(collected[i].external_id_type, ExternalIdType.DOI)
self.assertEqual(collected[i].corpus.is_fix, True)
@patch("welearn_datastack.collectors.atom_collector.get_new_https_session")
@@ -55,7 +90,7 @@ def test_atom_urlcollector_different_domain(self, mock_get_new_https_session):
localmock_corpus = Corpus(
source_name="test_org", is_fix=True, main_url="https://www.example.org"
)
- local_atom = self.rss_content.replace("example.com", "example.org")
+        local_atom = self.rss_content.replace("example.com", "example.org")
mock_response.content = local_atom.encode("utf-8")
diff --git a/tests/url_collector/test_oe_books_collector.py b/tests/url_collector/test_oe_books_collector.py
index c2da2f6..13c684d 100644
--- a/tests/url_collector/test_oe_books_collector.py
+++ b/tests/url_collector/test_oe_books_collector.py
@@ -2,9 +2,11 @@
from pathlib import Path
from unittest.mock import Mock, patch
+from welearn_database.data.enumeration import ExternalIdType
from welearn_database.data.models import Corpus, WeLearnDocument
from welearn_datastack.collectors.oe_books_collector import OpenEditionBooksURLCollector
+from welearn_datastack.modules.url_utils import extract_url_parts_post_netloc
from welearn_datastack.modules.xml_extractor import XMLExtractor
from welearn_datastack.plugins.scrapers import OpenEditionBooksCollector
@@ -122,6 +124,8 @@ def test_collect_book_accessible_no_chapters(
self.assertEqual(len(collected), 1)
self.assertEqual(collected[0].url, self.url_list[0])
+ self.assertEqual(collected[0].external_id, "examplepub0/0")
+ self.assertEqual(collected[0].external_id_type, ExternalIdType.SLUG)
@patch("welearn_datastack.collectors.oe_books_collector.get_new_https_session")
@patch("welearn_datastack.collectors.oe_books_collector.RssURLCollector.collect")
@@ -151,6 +155,8 @@ def test_collect_book_accessible_license_unauthorized(
self.assertEqual(len(collected), 1)
self.assertEqual(collected[0].url, self.url_list[0])
+ self.assertEqual(collected[0].external_id, "examplepub0/0")
+ self.assertEqual(collected[0].external_id_type, ExternalIdType.SLUG)
@patch("welearn_datastack.collectors.oe_books_collector.get_new_https_session")
@patch("welearn_datastack.collectors.oe_books_collector.RssURLCollector.collect")
@@ -185,6 +191,16 @@ def test_collect_book_accessible_license_authorized(
"https://books.openedition.org/examplepub0/8088",
]
+ wanted_external_ids = [
+ extract_url_parts_post_netloc(url) for url in wanted_urls
+ ]
+ collected_external_ids = [doc.external_id for doc in collected]
+ wanted_external_ids.sort()
+ collected_external_ids.sort()
+
collected_url.sort()
wanted_urls.sort()
self.assertListEqual(collected_url, wanted_urls)
+ self.assertListEqual(collected_external_ids, wanted_external_ids)
+ for doc in collected:
+ self.assertEqual(doc.external_id_type, ExternalIdType.SLUG)
diff --git a/tests/url_collector/test_openalex_collector.py b/tests/url_collector/test_openalex_collector.py
index 3bfffae..7143b9f 100644
--- a/tests/url_collector/test_openalex_collector.py
+++ b/tests/url_collector/test_openalex_collector.py
@@ -6,6 +6,7 @@
from unittest.mock import MagicMock, Mock, patch
from zoneinfo import ZoneInfo
+from welearn_database.data.enumeration import ExternalIdType
from welearn_database.data.models import Corpus, WeLearnDocument
from welearn_datastack.collectors.open_alex_collector import OpenAlexURLCollector
@@ -79,6 +80,11 @@ def test_collect(self, mock__get_oa_json):
full_content = self.content_json1["results"] + self.content_json2["results"]
awaited_urls = [v["id"] for v in full_content]
+ awaited_external_ids = [v["id"].split("/")[-1] for v in full_content]
self.assertListEqual(returned_urls, awaited_urls)
self.assertEqual(len(returned_urls), 400)
+ for wldoc, awaited_external_id in zip(returned_wldoc, awaited_external_ids):
+ self.assertEqual(wldoc.corpus, self.mock_corpus)
+ self.assertEqual(wldoc.external_id, awaited_external_id)
+ self.assertEqual(wldoc.external_id_type, ExternalIdType.API_ID)
diff --git a/welearn_datastack/collectors/atom_collector.py b/welearn_datastack/collectors/atom_collector.py
index 7e937d2..3d7e11f 100644
--- a/welearn_datastack/collectors/atom_collector.py
+++ b/welearn_datastack/collectors/atom_collector.py
@@ -1,15 +1,34 @@
+import logging
+import os
from typing import List
from urllib.parse import urlparse
+from welearn_database.data.enumeration import ExternalIdType
from welearn_database.data.models import Corpus, WeLearnDocument
-from welearn_datastack.collectors.helpers.feed_helpers import (
- extracted_url_to_url_datastore,
- lines_to_url,
-)
from welearn_datastack.data.url_collector import URLCollector
+from welearn_datastack.modules.url_utils import (
+ extract_doi_number,
+ extract_url_parts_post_netloc,
+)
+from welearn_datastack.modules.validation import validate_doi
+from welearn_datastack.modules.xml_extractor import XMLExtractor
from welearn_datastack.utils_.http_client_utils import get_new_https_session
+# logging.getLevelName maps a known level name to its number; an unknown name
+# comes back as the string "Level <name>", which the check below rejects.
+log_level: int = logging.getLevelName(os.getenv("LOG_LEVEL", "INFO"))
+log_format: str = os.getenv(
+    "LOG_FORMAT", "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
+)
+
+if not isinstance(log_level, int):
+    # Exceptions do not interpolate %-style arguments, so build the message here.
+    raise ValueError(f"Log level is not recognized : '{log_level}'")
+
+logging.basicConfig(
+    level=log_level,
+    format=log_format,
+)
+logger = logging.getLogger(__name__)
+
url_illegal_characters = ['"', "<", ">"]
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
@@ -31,23 +50,57 @@ def __init__(
self.corpus = corpus
def collect(self) -> List[WeLearnDocument]:
+ logger.info(
+ f"Collecting URLs from feed {self.feed_url} for corpus {self.corpus.id}"
+ )
domain = "https://" + urlparse(self.corpus.main_url).netloc
client = get_new_https_session()
res = client.get(url=self.feed_url, headers=headers)
content = res.content.decode("utf-8")
- flag = False
- link_lines: List[str] = []
- for line in content.split("\n"):
- # If we are in the entry section and we find a link
- # The definition, especially "rel" part is empirical
- if flag and line.strip().startswith('"):
- flag = True
-
- urls = lines_to_url(domain, link_lines)
-
- ret = extracted_url_to_url_datastore(urls=urls, corpus=self.corpus)
+        logger.debug(f"Content of the feed {self.feed_url} : {content}")
+        ret: list[WeLearnDocument] = []
+        entries = XMLExtractor(content).extract_content("entry")
+        logger.info(f"Found {len(entries)} entries in the feed {self.feed_url}")
+        for entry in entries:
+            entry_extractor = XMLExtractor(entry.content)
+            link = entry_extractor.extract_content_attribute_filter(
+                tag="link",
+                attribute_name="rel",
+                attribute_value="alternate",
+            )
+            if len(link) == 0:
+                # No alternate link means no document URL: nothing to collect.
+                continue
+            link_url = link[0].attributes["href"]
+            if not link_url.startswith(domain):
+                # Links pointing outside the corpus domain are ignored.
+                continue
+
+            # Read the <id> leniently: an entry without one (or with several) must
+            # not abort the whole collection, so only the first one is considered
+            # and a missing id simply falls through to the slug branch below.
+            xml_external_ids = entry_extractor.extract_content("id")
+            external_id = (
+                xml_external_ids[0].content.strip() if xml_external_ids else ""
+            )
+            if validate_doi(external_id, resolve_doi=False):
+                # If the external ID is a valid DOI, we can use it directly as the external ID and set the type to DOI
+                logger.info(
+                    f"External ID {external_id} is a valid DOI for URL {link_url}"
+                )
+                external_id = extract_doi_number(external_id)
+                external_id_type = ExternalIdType.DOI
+            else:
+                # Otherwise, we can use the part of the URL after the domain as the external ID and set the type to SLUG
+                logger.info(
+                    f"External ID {external_id} is not a valid DOI for URL {link_url}, using the part of the URL after the domain as the external ID"
+                )
+                external_id = extract_url_parts_post_netloc(
+                    link_url, remove_start_slash=True
+                )
+                external_id_type = ExternalIdType.SLUG
+            ret.append(
+                WeLearnDocument(
+                    url=link_url,
+                    corpus=self.corpus,
+                    external_id=external_id,
+                    external_id_type=external_id_type,
+                )
+            )
+        logger.info(
+            f"Collected {len(ret)} URLs from feed {self.feed_url} for corpus {self.corpus.id}"
+        )
return ret
diff --git a/welearn_datastack/collectors/oe_books_collector.py b/welearn_datastack/collectors/oe_books_collector.py
index bed2cc3..2079aea 100644
--- a/welearn_datastack/collectors/oe_books_collector.py
+++ b/welearn_datastack/collectors/oe_books_collector.py
@@ -2,6 +2,7 @@
import re
from typing import Dict, List
+from welearn_database.data.enumeration import ExternalIdType
from welearn_database.data.models import Corpus, WeLearnDocument
from welearn_datastack.collectors.rss_collector import RssURLCollector
@@ -11,6 +12,7 @@
MD_OE_BOOKS_BASE_URL,
)
from welearn_datastack.data.url_collector import URLCollector
+from welearn_datastack.modules.url_utils import extract_url_parts_post_netloc
from welearn_datastack.modules.xml_extractor import XMLExtractor
from welearn_datastack.utils_.http_client_utils import get_new_https_session
@@ -81,7 +83,7 @@ def collect(self) -> List[WeLearnDocument]:
client = get_new_https_session()
for book_url in rss_urls:
logger.info("Collecting book: %s", book_url)
- md_id = book_url.url.replace("https://books.openedition.org/", "")
+ md_id = extract_url_parts_post_netloc(book_url.url)
md_url = MD_OE_BOOKS_BASE_URL.replace("", md_id)
md_res = client.get(url=md_url, headers=HEADERS)
@@ -112,20 +114,39 @@ def collect(self) -> List[WeLearnDocument]:
# Weird case where there is no chapters
logger.warning("No chapters found for book: %s", book_url.url)
ret.append(
- WeLearnDocument(url=book_url.url, corpus=self.corpus)
+ WeLearnDocument(
+ url=book_url.url,
+ corpus=self.corpus,
+ external_id=md_id,
+ external_id_type=ExternalIdType.SLUG,
+ )
)
continue
else:
for chapter_url in chapters_urls:
logger.info("--Collecting chapter: %s", chapter_url)
ret.append(
- WeLearnDocument(url=chapter_url, corpus=self.corpus)
+ WeLearnDocument(
+ url=chapter_url,
+ corpus=self.corpus,
+ external_id=extract_url_parts_post_netloc(
+ chapter_url
+ ),
+ external_id_type=ExternalIdType.SLUG,
+ )
)
else:
logger.info(
"Book chapters are not legally usable : %s", book_url.url
)
- ret.append(WeLearnDocument(url=book_url.url, corpus=self.corpus))
+ ret.append(
+ WeLearnDocument(
+ url=book_url.url,
+ corpus=self.corpus,
+ external_id=md_id,
+ external_id_type=ExternalIdType.SLUG,
+ )
+ )
continue
else:
logger.info("Book is not open access: %s", book_url.url)
diff --git a/welearn_datastack/collectors/open_alex_collector.py b/welearn_datastack/collectors/open_alex_collector.py
index 45cdc0c..6af969a 100644
--- a/welearn_datastack/collectors/open_alex_collector.py
+++ b/welearn_datastack/collectors/open_alex_collector.py
@@ -6,10 +6,12 @@
from typing import Dict, List
from zoneinfo import ZoneInfo
+from welearn_database.data.enumeration import ExternalIdType
from welearn_database.data.models import Corpus, WeLearnDocument
from welearn_datastack.constants import OPEN_ALEX_BASE_URL, PUBLISHERS_TO_AVOID
from welearn_datastack.data.url_collector import URLCollector
+from welearn_datastack.modules.url_utils import extract_url_parts_post_netloc
from welearn_datastack.utils_.http_client_utils import get_new_https_session
logger = logging.getLogger(__name__)
@@ -138,7 +140,14 @@ def collect(self) -> List[WeLearnDocument]:
for i in range(0, iteration_quantity):
logger.info(f"Iteration {i+1}/{iteration_quantity}")
for work in json_from_oa["results"]:
- ret.append(WeLearnDocument(url=work["id"], corpus=self.corpus))
+ ret.append(
+ WeLearnDocument(
+ url=work["id"],
+ corpus=self.corpus,
+ external_id=extract_url_parts_post_netloc(work["id"]),
+ external_id_type=ExternalIdType.API_ID,
+ )
+ )
if json_from_oa["meta"]["next_cursor"]:
params["cursor"] = json_from_oa["meta"]["next_cursor"]
diff --git a/welearn_datastack/collectors/wikipedia_collector.py b/welearn_datastack/collectors/wikipedia_collector.py
index b6a1b1c..9943b50 100644
--- a/welearn_datastack/collectors/wikipedia_collector.py
+++ b/welearn_datastack/collectors/wikipedia_collector.py
@@ -148,7 +148,6 @@ def get_page_translation(
return ret
def collect(self, batch_id: int | None = None) -> List[WeLearnDocument]:
-
portals_to_process: List[WikipediaContainer]
categories_to_process: List[WikipediaContainer]
diff --git a/welearn_datastack/data/enumerations.py b/welearn_datastack/data/enumerations.py
index f97ac9d..40f167c 100644
--- a/welearn_datastack/data/enumerations.py
+++ b/welearn_datastack/data/enumerations.py
@@ -1,4 +1,4 @@
-from enum import Enum, auto
+from enum import Enum, StrEnum, auto
class PluginType(Enum):
@@ -16,19 +16,6 @@ class DeletePart(Enum):
after = 2
-# class Step(Enum):
-# URL_RETRIEVED = "url_retrieved"
-# DOCUMENT_SCRAPED = "document_scraped"
-# DOCUMENT_VECTORIZED = "document_vectorized"
-# DOCUMENT_CLASSIFIED_SDG = "document_classified_sdg"
-# DOCUMENT_CLASSIFIED_NON_SDG = "document_classified_non_sdg"
-# DOCUMENT_KEYWORDS_EXTRACTED = "document_with_keywords"
-# DOCUMENT_IN_QDRANT = "document_in_qdrant"
-# DOCUMENT_IS_INVALID = "document_is_invalid"
-# KEPT_FOR_TRACE = "kept_for_trace"
-# DOCUMENT_IS_IRRETRIEVABLE = "document_is_irretrievable"
-
-
class MLModelsType(Enum):
BI_CLASSIFIER = auto()
N_CLASSIFIER = auto()
@@ -49,3 +36,12 @@ class URLStatus(Enum):
UPDATE = 2
DELETE = 3
UNKNOWN = 4
+
+
+class URLParts(Enum):
+    """Components of a parsed URL, numbered in the order urlparse returns them."""
+
+    SCHEME = 1
+    NETLOC = 2
+    PATH = 3
+    PARAMS = 4
+    QUERY = 5
+    FRAGMENT = 6
diff --git a/welearn_datastack/modules/url_utils.py b/welearn_datastack/modules/url_utils.py
new file mode 100644
index 0000000..7827942
--- /dev/null
+++ b/welearn_datastack/modules/url_utils.py
@@ -0,0 +1,81 @@
+from urllib.parse import urlparse
+
+from welearn_datastack.data.enumerations import URLParts
+
+
+def extract_url_parts(
+    url: str, parts_to_extract: list[URLParts], concat: bool
+) -> str | list[str]:
+    """
+    Extract the specified parts of a URL and return them as a concatenated string or a list of strings.
+
+    Parts are always emitted in URL order (scheme, netloc, path, params, query, fragment),
+    whatever the order of parts_to_extract. Concatenation is a plain join: the delimiters
+    removed by urlparse ("://", ";", "?", "#") are not re-inserted.
+    :param url: The URL to extract parts from
+    :param parts_to_extract: A list of URLParts enum values specifying which parts to extract
+    :param concat: If True, return the extracted parts as a concatenated string. If False, return the extracted parts as a list of strings.
+    :return: The extracted parts as a concatenated string or a list of the non-empty extracted parts,
+    depending on the value of concat. If no parts are extracted, return an empty string or an empty list.
+    """
+    parsed = urlparse(url)
+    # One slot per URL component, in the order of the URLParts enum
+    extracted_parts = [""] * 6
+
+    for part in parts_to_extract:
+        match part:
+            case URLParts.SCHEME:
+                extracted_parts[0] = parsed.scheme
+            case URLParts.NETLOC:
+                extracted_parts[1] = parsed.netloc
+            case URLParts.PATH:
+                extracted_parts[2] = parsed.path
+            case URLParts.PARAMS:
+                extracted_parts[3] = parsed.params
+            case URLParts.QUERY:
+                extracted_parts[4] = parsed.query
+            case URLParts.FRAGMENT:
+                extracted_parts[5] = parsed.fragment
+
+    if concat:
+        return "".join(extracted_parts)
+    # Keep every non-empty part: a single extracted part is a valid result too
+    return [i for i in extracted_parts if i != ""]
+
+
+def extract_url_parts_post_netloc(url: str, remove_start_slash: bool = True) -> str:
+    """
+    Return everything that follows the network location of a URL: path, params, query and fragment.
+
+    The parts are re-assembled with their delimiters (";" before params, "?" before the query,
+    "#" before the fragment), so "https://host/a/b?x=1#top" gives "a/b?x=1#top". Empty parts
+    are left out together with their delimiter.
+    :param url: The URL to extract parts from
+    :param remove_start_slash: If True, remove the starting slash from the path part if it exists. Default is True.
+    :return: The path, params, query and fragment of the URL as a single string, with the starting
+    slash removed if remove_start_slash is True and the result starts with a slash.
+    If the URL has none of these parts, return an empty string.
+    """
+    parsed = urlparse(url)
+    ret = parsed.path
+    # urlparse drops the delimiters; put them back so that distinct URLs
+    # cannot collapse into the same string ("/a?b=1" vs "/ab=1").
+    for delimiter, component in (
+        (";", parsed.params),
+        ("?", parsed.query),
+        ("#", parsed.fragment),
+    ):
+        if component:
+            ret += delimiter + component
+    if remove_start_slash and ret.startswith("/"):
+        ret = ret[1:]
+    return ret
+
+
+def extract_doi_number(url: str, strict: bool = False) -> str:
+    """
+    Extract the DOI number from the path of a URL. The DOI number is expected to be in the format "10.xxxx/xxxxx".
+    :param url: The URL (or bare string) to extract the DOI number from
+    :param strict: If True, return an empty string when the path is not a slash followed by "10.".
+    If False (default), return the parsed path unchanged in that case.
+    :return: The path without its leading slash when it starts with "/10."; otherwise an empty
+    string (strict) or the unmodified path (non strict)
+    """
+    path = urlparse(url).path
+    candidate = path.removeprefix("/")
+    # A DOI is only recognised when the path is "/" immediately followed by "10."
+    if path.startswith("/") and candidate.startswith("10."):
+        return candidate
+    return "" if strict else path
diff --git a/welearn_datastack/modules/validation.py b/welearn_datastack/modules/validation.py
index 84575b7..53fc4f8 100644
--- a/welearn_datastack/modules/validation.py
+++ b/welearn_datastack/modules/validation.py
@@ -1,5 +1,10 @@
+import re
+
+import requests
from welearn_database.data.models import WeLearnDocument
+from welearn_datastack.regular_expression import DOI_REGEX
+
def validate_non_null_fields_document(doc: WeLearnDocument) -> bool:
"""
@@ -9,3 +14,28 @@ def validate_non_null_fields_document(doc: WeLearnDocument) -> bool:
is_desc_empty = not doc.description or doc.description.strip() == ""
is_content_empty = not doc.full_content or doc.full_content.strip() == ""
return not (is_desc_empty or is_content_empty)
+
+
+def validate_doi(s: str, resolve_doi: bool = True) -> bool:
+    """
+    Validate that a string is a well-formed DOI and, optionally, that it exists.
+    - The input is stripped and a leading "https://doi.org/" prefix is removed.
+    - The remainder must match DOI_REGEX in full: a DOI followed by any other text is rejected.
+    - If resolve_doi is True, a request is made to the DOI API; the DOI is accepted only when
+      the API answers with a 200 status code.
+    - Any request error during the API call makes the function return False.
+    :param s: The string to validate as a DOI.
+    :param resolve_doi: Whether to check if the DOI exists by making a request to the DOI API. If False, only the format will be validated.
+    :return: True if the string is a valid DOI (and exists if resolve_doi is True), False otherwise.
+    """
+    s = s.strip().removeprefix("https://doi.org/")
+    # fullmatch rather than match: a valid DOI prefix followed by other text is not a DOI
+    if not re.fullmatch(DOI_REGEX, s):
+        return False
+    if not resolve_doi:
+        return True
+    try:
+        r = requests.get(f"https://doi.org/api/handles/{s}", timeout=5)
+    except requests.RequestException:
+        return False
+    return r.status_code == 200
diff --git a/welearn_datastack/modules/xml_extractor.py b/welearn_datastack/modules/xml_extractor.py
index 49a3e71..b61501f 100644
--- a/welearn_datastack/modules/xml_extractor.py
+++ b/welearn_datastack/modules/xml_extractor.py
@@ -7,6 +7,7 @@
from welearn_datastack.regular_expression import (
SIMPLE_XML_ATTRIBUTE_REGEX,
simple_xml_tag_format_regex,
+ simple_xml_tag_format_regex_autoclosing,
)
logger = logging.getLogger(__name__)
@@ -63,9 +64,11 @@ def extract_content(self, tag: str) -> List[XMLData]:
attr_pattern = re.compile(SIMPLE_XML_ATTRIBUTE_REGEX)
# Find all matches of the pattern in the XML raw data
- matches = re.findall(
- simple_xml_tag_format_regex(tag), self.xml_raw_data, re.DOTALL
- )
+ pattern = simple_xml_tag_format_regex(tag)
+ autoclosing_pattern = simple_xml_tag_format_regex_autoclosing(tag)
+ matches = re.findall(pattern, self.xml_raw_data, re.DOTALL)
+ if len(matches) == 0:
+ matches = re.findall(autoclosing_pattern, self.xml_raw_data, re.DOTALL)
logger.info("Found %d matches for tag %s", len(matches), tag)
ret = []
diff --git a/welearn_datastack/plugins/rest_requesters/fao_open_knowledge.py b/welearn_datastack/plugins/rest_requesters/fao_open_knowledge.py
index ba43965..9f0e8d3 100644
--- a/welearn_datastack/plugins/rest_requesters/fao_open_knowledge.py
+++ b/welearn_datastack/plugins/rest_requesters/fao_open_knowledge.py
@@ -7,6 +7,7 @@
import pydantic
import requests
+from welearn_database.data.enumeration import ExternalIdType
from welearn_database.data.models import WeLearnDocument
from welearn_database.modules.text_cleaning import clean_text
@@ -24,7 +25,6 @@
NoContent,
NoDescriptionFoundError,
NotExpectedMoreThanOneItem,
- PDFFileSizeExceedLimit,
UnauthorizedLicense,
UnauthorizedState,
)
@@ -227,7 +227,7 @@ def _extract_details(self, fao_document: Item) -> dict:
[publication_date] = parsed_metadata.get("dc.date.available", empty_entry)
[update_date] = parsed_metadata.get("dc.date.lastModified", empty_entry)
[isbn] = parsed_metadata.get("dc.identifier.isbn", empty_entry)
- [doi] = parsed_metadata.get("dc.identifier.doi", empty_entry)
+ [doi] = parsed_metadata.get("fao.identifier.doi", empty_entry)
[type_] = parsed_metadata.get("fao.taxonomy.type", empty_entry)
ret: dict[str, Any] = {
"publication_date": (
diff --git a/welearn_datastack/plugins/rest_requesters/oapen.py b/welearn_datastack/plugins/rest_requesters/oapen.py
index 2a47674..fa3b238 100644
--- a/welearn_datastack/plugins/rest_requesters/oapen.py
+++ b/welearn_datastack/plugins/rest_requesters/oapen.py
@@ -10,6 +10,7 @@
from typing import Dict, Iterable, List
from lingua import Language
+from welearn_database.data.enumeration import ExternalIdType
from welearn_database.data.models import WeLearnDocument
from welearn_datastack.constants import AUTHORIZED_LICENSES, HEADERS
diff --git a/welearn_datastack/plugins/rest_requesters/open_alex.py b/welearn_datastack/plugins/rest_requesters/open_alex.py
index ee03a15..147f288 100644
--- a/welearn_datastack/plugins/rest_requesters/open_alex.py
+++ b/welearn_datastack/plugins/rest_requesters/open_alex.py
@@ -44,6 +44,7 @@
remove_hyphens,
replace_ligatures,
)
+from welearn_datastack.modules.url_utils import extract_doi_number
from welearn_datastack.plugins.interface import IPluginRESTCollector
from welearn_datastack.utils_.http_client_utils import (
get_http_code_from_exception,
@@ -199,8 +200,6 @@ def _update_welearn_document(self, wrapper: WrapperRawData) -> WeLearnDocument:
wrapper.document.description = document_desc
wrapper.document.content = document_content
wrapper.document.details = document_details
- wrapper.document.external_id = self._get_doi(wrapper)
- wrapper.document.external_id_type = ExternalIdType.DOI
return wrapper.document
@@ -242,7 +241,7 @@ def _build_details(
def _get_doi(wrapper: WrapperRawData) -> str | None:
doi = wrapper.raw_data.ids.doi
if doi.startswith("https://doi.org/"):
- doi = doi.replace("https://doi.org/", "")
+ doi = extract_doi_number(doi)
return doi
@staticmethod
diff --git a/welearn_datastack/plugins/scrapers/oe_books.py b/welearn_datastack/plugins/scrapers/oe_books.py
index 4a373af..e529150 100644
--- a/welearn_datastack/plugins/scrapers/oe_books.py
+++ b/welearn_datastack/plugins/scrapers/oe_books.py
@@ -5,11 +5,13 @@
from bs4 import BeautifulSoup # type: ignore
from requests import Session # type: ignore
+from welearn_database.data.enumeration import ExternalIdType
from welearn_database.data.models import WeLearnDocument
from welearn_datastack.constants import AUTHORIZED_LICENSES, MD_OE_BOOKS_BASE_URL
from welearn_datastack.data.db_wrapper import WrapperRetrieveDocument
from welearn_datastack.exceptions import ClosedAccessContent
+from welearn_datastack.modules.url_utils import extract_url_parts_post_netloc
from welearn_datastack.modules.xml_extractor import XMLExtractor
from welearn_datastack.plugins.interface import IPluginScrapeCollector
from welearn_datastack.utils_.http_client_utils import (
@@ -85,8 +87,12 @@ def _scrape_url(self, document: WeLearnDocument) -> WeLearnDocument:
case "book":
details["type"] = "book"
if not root_extractor:
- logger.warning("Weird case, cannot accessed to API before :%s", url)
- md_id, mets_api_res = self._get_mets_metadata(https_session, url)
+ logger.warning(
+ "Weird case, cannot accessed to API before :%s", document.url
+ )
+ md_id, mets_api_res = self._get_mets_metadata(
+ https_session, document.url
+ )
root_extractor = XMLExtractor(mets_api_res.content.decode("utf-8"))
if not self._is_open_access(root_extractor):
@@ -353,7 +359,7 @@ def _get_mets_metadata(self, https_session, url):
:param url: The url of the book
:return: The md_id and the response of the API
"""
- md_id = url.replace("https://books.openedition.org/", "")
+ md_id = extract_url_parts_post_netloc(url)
md_url = MD_OE_BOOKS_BASE_URL.replace("", md_id)
mets_api_res = https_session.get(url=md_url, timeout=self.timeout)
return md_id, mets_api_res
diff --git a/welearn_datastack/plugins/scrapers/peerj.py b/welearn_datastack/plugins/scrapers/peerj.py
index eebe26a..954d9e5 100644
--- a/welearn_datastack/plugins/scrapers/peerj.py
+++ b/welearn_datastack/plugins/scrapers/peerj.py
@@ -239,8 +239,6 @@ def _scrape_url(self, document: WeLearnDocument) -> WeLearnDocument:
doi = document.details.get("doi", None)
if not doi:
raise NoDOIFoundError(f"No DOI found for '{document.url}'")
- document.external_id = doi
- document.external_id_type = ExternalIdType.DOI
return document
diff --git a/welearn_datastack/plugins/scrapers/plos.py b/welearn_datastack/plugins/scrapers/plos.py
index 9be91bf..e3e82f5 100644
--- a/welearn_datastack/plugins/scrapers/plos.py
+++ b/welearn_datastack/plugins/scrapers/plos.py
@@ -14,6 +14,7 @@
from welearn_datastack.constants import AUTHORIZED_LICENSES
from welearn_datastack.data.db_wrapper import WrapperRetrieveDocument
from welearn_datastack.exceptions import UnauthorizedLicense
+from welearn_datastack.modules.url_utils import extract_doi_number
from welearn_datastack.plugins.interface import IPluginScrapeCollector
from welearn_datastack.regular_expression import ANTI_URL_REGEX
from welearn_datastack.utils_.http_client_utils import (
@@ -124,7 +125,7 @@ def extract_doi(self, article_meta: BeautifulSoup) -> str:
doi_extract = article_meta.find("article-id", {"pub-id-type": "doi"})
doi = self.extract_property(doi_extract)
if doi.startswith("https://doi.org/"):
- doi = doi.replace("https://doi.org/", "")
+ doi = extract_doi_number(doi)
return doi
@staticmethod
@@ -273,8 +274,6 @@ def extract_data_from_plos_xml(
document.description = clean_return_to_line(doc_desc)
document.full_content = clean_doc_content
document.details = self._get_document_details(soup=soup)
- document.external_id = self.extract_doi(article_meta)
- document.external_id_type = ExternalIdType.DOI
return document
diff --git a/welearn_datastack/regular_expression.py b/welearn_datastack/regular_expression.py
index 4486129..9ebade1 100644
--- a/welearn_datastack/regular_expression.py
+++ b/welearn_datastack/regular_expression.py
@@ -75,6 +75,13 @@
# limit: Treats underscores as word characters and does not handle hyphenated words or contractions as single tokens.
WORDS_REGEX = r"\w+"
+# description: Matches a DOI (Digital Object Identifier) in its standard format, starting with "10." followed by a numeric prefix and a suffix.
+# example: "10.1000/xyz123" -> matches "10.1000/xyz123"
+# limit: The regex is quite permissive in the suffix part, allowing any
+# non-whitespace characters except quotes, ampersands and angle brackets, which may lead to false positives in some contexts.
+# Source : https://stackoverflow.com/a/10324802/31019364
+DOI_REGEX = r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)"
+
def simple_xml_tag_format_regex(tag: str) -> str:
"""
@@ -86,3 +93,8 @@ def simple_xml_tag_format_regex(tag: str) -> str:
:return: A regular expression string to match the specified XML tag.
"""
return rf"<{tag}([^>]*)>(.*?){tag}>"
+
+
+def simple_xml_tag_format_regex_autoclosing(tag: str) -> str:
+    """
+    Generates a regular expression to match a self-closing XML tag ("<tag ... />").
+
+    description: The first group captures the attribute text; the second group is always empty,
+    so that matches keep the two-group shape produced by simple_xml_tag_format_regex.
+    example: tag "link" on '<link href="a"/><link href="b"/>' -> two matches, one per tag
+    limit: The attribute text cannot contain ">", so a match never runs past the end of its own tag.
+    :param tag: The name of the XML tag to match.
+    :return: A regular expression string to match the specified self-closing XML tag.
+    """
+    # [^>]*? keeps the match inside one tag; a lazy ".*?" under DOTALL would
+    # run on to the next "/>" and merge consecutive self-closing tags.
+    return rf"<{tag}([^>]*?)()/>"