diff --git a/pyproject.toml b/pyproject.toml index 7938d0f..41a9b84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,11 +31,7 @@ sentence-transformers = "^5.2.2" spacy = "^3.8.11" refinedoc = "^1.0.0" qdrant-client = "1.16.2" -<<<<<<< Feature/external-id-scientif-journals -python-dotenv = "^1.2.1" -======= python-dotenv = "^1.2.2" ->>>>>>> main beautifulsoup4 = "^4.14.3" pyphen = "^0.17.2" ijson = "^3.4.0" diff --git a/tests/url_collector/resources/atom_file_doi.xml b/tests/url_collector/resources/atom_file_doi.xml new file mode 100644 index 0000000..1254355 --- /dev/null +++ b/tests/url_collector/resources/atom_file_doi.xml @@ -0,0 +1,43 @@ + + + Example Atom Feed + + + https://doi.org/10.1234/feed + 2023-09-12T00:00:00Z + + Your Name + your-email@example.com + + + + First Entry + + https://doi.org/10.1234/entry1 + 2023-09-12T12:00:00Z + 2023-09-12T12:00:00Z + This is the summary of the first entry. + <p>This is the content of the first entry.</p> + + + + Second Entry + + https://doi.org/10.1234/entry2 + 2023-09-13T09:30:00Z + 2023-09-13T09:30:00Z + This is the summary of the second entry. + <p>This is the content of the second entry.</p> + + + + Third Entry + + https://doi.org/10.1234/entry3 + 2023-09-14T15:45:00Z + 2023-09-14T15:45:00Z + This is the summary of the third entry. 
@patch("welearn_datastack.collectors.atom_collector.get_new_https_session")
def test_atom_urlcollector_with_id_doi(self, mock_get_new_https_session):
    """
    Collect from the Atom fixture whose entry ids are DOI URLs.

    The HTTP session is mocked so that the feed request returns the content of
    ``atom_file_doi.xml``. Three documents are expected, each carrying the DOI
    number as external id and ``ExternalIdType.DOI`` as external id type.
    """
    # Build the fake HTTP layer: session.get(...) hands back the fixture bytes.
    response = Mock()
    response.content = self.rss_content_doi.encode("utf-8")
    session = Mock()
    session.ok.return_value = True
    session.get.return_value = response
    mock_get_new_https_session.return_value = session

    collector = AtomURLCollector(
        feed_url="https://www.example.com",
        corpus=self.mock_corpus,
    )
    collected = collector.collect()
    self.assertEqual(3, len(collected))

    # Entries are numbered from 1 in the fixture.
    for position, document in enumerate(collected, start=1):
        self.assertEqual(document.url, f"https://www.example.com/entry{position}")
        self.assertEqual(document.corpus.source_name, "test")
        self.assertEqual(document.external_id, f"10.1234/entry{position}")
        self.assertEqual(document.external_id_type, ExternalIdType.DOI)
        self.assertEqual(document.corpus.is_fix, True)
self.assertEqual(len(collected), 1) self.assertEqual(collected[0].url, self.url_list[0]) + self.assertEqual(collected[0].external_id, "examplepub0/0") + self.assertEqual(collected[0].external_id_type, ExternalIdType.SLUG) @patch("welearn_datastack.collectors.oe_books_collector.get_new_https_session") @patch("welearn_datastack.collectors.oe_books_collector.RssURLCollector.collect") @@ -151,6 +155,8 @@ def test_collect_book_accessible_license_unauthorized( self.assertEqual(len(collected), 1) self.assertEqual(collected[0].url, self.url_list[0]) + self.assertEqual(collected[0].external_id, "examplepub0/0") + self.assertEqual(collected[0].external_id_type, ExternalIdType.SLUG) @patch("welearn_datastack.collectors.oe_books_collector.get_new_https_session") @patch("welearn_datastack.collectors.oe_books_collector.RssURLCollector.collect") @@ -185,6 +191,16 @@ def test_collect_book_accessible_license_authorized( "https://books.openedition.org/examplepub0/8088", ] + wanted_external_ids = [ + extract_url_parts_post_netloc(url) for url in wanted_urls + ] + collected_external_ids = [doc.external_id for doc in collected] + wanted_external_ids.sort() + collected_external_ids.sort() + collected_url.sort() wanted_urls.sort() self.assertListEqual(collected_url, wanted_urls) + self.assertListEqual(collected_external_ids, wanted_external_ids) + for doc in collected: + self.assertEqual(doc.external_id_type, ExternalIdType.SLUG) diff --git a/tests/url_collector/test_openalex_collector.py b/tests/url_collector/test_openalex_collector.py index 3bfffae..7143b9f 100644 --- a/tests/url_collector/test_openalex_collector.py +++ b/tests/url_collector/test_openalex_collector.py @@ -6,6 +6,7 @@ from unittest.mock import MagicMock, Mock, patch from zoneinfo import ZoneInfo +from welearn_database.data.enumeration import ExternalIdType from welearn_database.data.models import Corpus, WeLearnDocument from welearn_datastack.collectors.open_alex_collector import OpenAlexURLCollector @@ -79,6 +80,11 @@ 
# Logging configuration for this module, driven by the LOG_LEVEL and LOG_FORMAT
# environment variables.
_raw_log_level: str = os.getenv("LOG_LEVEL", "INFO")
# logging.getLevelName maps a known level name (e.g. "INFO") to its integer
# value and returns the string "Level <name>" for a name it does not know.
log_level: int = logging.getLevelName(_raw_log_level)
log_format: str = os.getenv(
    "LOG_FORMAT", "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
)

if not isinstance(log_level, int):
    # Exceptions do not apply %-formatting to their arguments, so the message
    # is built explicitly and reports the value that was actually configured.
    raise ValueError(f"Log level is not recognized : '{_raw_log_level}'")

logging.basicConfig(
    level=log_level,
    format=log_format,
)
logger = logging.getLogger(__name__)
def collect(self) -> List[WeLearnDocument]:
    """
    Collect the documents advertised by the Atom feed at ``self.feed_url``.

    Each ``entry`` of the feed is kept when it has a ``link`` tag with
    ``rel="alternate"`` whose ``href`` starts with the https domain of the
    corpus main URL. The external id of the document is the DOI number when
    the entry ``id`` looks like a DOI (format check only, no network
    resolution), otherwise the part of the link URL located after the domain.

    An entry without a usable ``id`` no longer aborts the whole collection:
    it simply falls back to the URL-based external id. When several ``id``
    matches are found, the first one is used.

    :return: One WeLearnDocument per retained entry, in feed order.
    """
    logger.info(
        "Collecting URLs from feed %s for corpus %s", self.feed_url, self.corpus.id
    )
    domain = "https://" + urlparse(self.corpus.main_url).netloc
    client = get_new_https_session()
    res = client.get(url=self.feed_url, headers=headers)
    content = res.content.decode("utf-8")
    logger.debug("Content of the feed %s : %s", self.feed_url, content)

    ret: list[WeLearnDocument] = []
    entries = XMLExtractor(content).extract_content("entry")
    logger.info("Found %d entries in the feed %s", len(entries), self.feed_url)
    for entry in entries:
        entry_extractor = XMLExtractor(entry.content)
        link = entry_extractor.extract_content_attribute_filter(
            tag="link",
            attribute_name="rel",
            attribute_value="alternate",
        )
        if not link:
            continue
        link_url = link[0].attributes["href"]
        if not link_url.startswith(domain):
            # Links pointing outside the corpus domain are not collected.
            continue

        # The id is only read for entries that are kept, and a missing id is
        # tolerated instead of raising on unpacking.
        id_matches = entry_extractor.extract_content("id")
        feed_id = id_matches[0].content.strip() if id_matches else ""

        if feed_id and validate_doi(feed_id, resolve_doi=False):
            logger.info(
                "External ID %s is a valid DOI for URL %s", feed_id, link_url
            )
            external_id = extract_doi_number(feed_id)
            external_id_type = ExternalIdType.DOI
        else:
            logger.info(
                "External ID %s is not a valid DOI for URL %s, using the part "
                "of the URL after the domain as the external ID",
                feed_id,
                link_url,
            )
            external_id = extract_url_parts_post_netloc(
                link_url, remove_start_slash=True
            )
            external_id_type = ExternalIdType.SLUG

        ret.append(
            WeLearnDocument(
                url=link_url,
                corpus=self.corpus,
                external_id=external_id,
                external_id_type=external_id_type,
            )
        )
    logger.info(
        "Collected %d URLs from feed %s for corpus %s",
        len(ret),
        self.feed_url,
        self.corpus.id,
    )
    return ret
corpus=self.corpus, + external_id=md_id, + external_id_type=ExternalIdType.SLUG, + ) ) continue else: for chapter_url in chapters_urls: logger.info("--Collecting chapter: %s", chapter_url) ret.append( - WeLearnDocument(url=chapter_url, corpus=self.corpus) + WeLearnDocument( + url=chapter_url, + corpus=self.corpus, + external_id=extract_url_parts_post_netloc( + chapter_url + ), + external_id_type=ExternalIdType.SLUG, + ) ) else: logger.info( "Book chapters are not legally usable : %s", book_url.url ) - ret.append(WeLearnDocument(url=book_url.url, corpus=self.corpus)) + ret.append( + WeLearnDocument( + url=book_url.url, + corpus=self.corpus, + external_id=md_id, + external_id_type=ExternalIdType.SLUG, + ) + ) continue else: logger.info("Book is not open access: %s", book_url.url) diff --git a/welearn_datastack/collectors/open_alex_collector.py b/welearn_datastack/collectors/open_alex_collector.py index 45cdc0c..6af969a 100644 --- a/welearn_datastack/collectors/open_alex_collector.py +++ b/welearn_datastack/collectors/open_alex_collector.py @@ -6,10 +6,12 @@ from typing import Dict, List from zoneinfo import ZoneInfo +from welearn_database.data.enumeration import ExternalIdType from welearn_database.data.models import Corpus, WeLearnDocument from welearn_datastack.constants import OPEN_ALEX_BASE_URL, PUBLISHERS_TO_AVOID from welearn_datastack.data.url_collector import URLCollector +from welearn_datastack.modules.url_utils import extract_url_parts_post_netloc from welearn_datastack.utils_.http_client_utils import get_new_https_session logger = logging.getLogger(__name__) @@ -138,7 +140,14 @@ def collect(self) -> List[WeLearnDocument]: for i in range(0, iteration_quantity): logger.info(f"Iteration {i+1}/{iteration_quantity}") for work in json_from_oa["results"]: - ret.append(WeLearnDocument(url=work["id"], corpus=self.corpus)) + ret.append( + WeLearnDocument( + url=work["id"], + corpus=self.corpus, + external_id=extract_url_parts_post_netloc(work["id"]), + 
class URLParts(Enum):
    """Components of a URL, listed in the order they appear in a URL."""

    # Values are the same 1-based integers that ``auto()`` would assign.
    SCHEME = 1
    NETLOC = 2
    PATH = 3
    PARAMS = 4
    QUERY = 5
    FRAGMENT = 6
def extract_url_parts(
    url: str, parts_to_extract: list[URLParts], concat: bool
) -> str | list[str]:
    """
    Extract the requested components of a URL.

    Components are always emitted in URL order (scheme, netloc, path, params,
    query, fragment), whatever the order of ``parts_to_extract``.

    :param url: The URL to extract parts from
    :param parts_to_extract: The URLParts members to extract
    :param concat: If True, return the selected components joined without any
        separator (the ``://``, ``;``, ``?`` and ``#`` delimiters are not
        restored). If False, return the non-empty selected components as a list.
    :return: A string when ``concat`` is True, otherwise a list of strings
        (empty when nothing was extracted).
    """
    parsed = urlparse(url)
    wanted = set(parts_to_extract)
    # Fixed URL order; members absent from `wanted` are skipped.
    ordered_components = (
        (URLParts.SCHEME, parsed.scheme),
        (URLParts.NETLOC, parsed.netloc),
        (URLParts.PATH, parsed.path),
        (URLParts.PARAMS, parsed.params),
        (URLParts.QUERY, parsed.query),
        (URLParts.FRAGMENT, parsed.fragment),
    )
    extracted_parts = [
        value for part, value in ordered_components if part in wanted
    ]

    if concat:
        return "".join(extracted_parts)
    # A single extracted component is a valid result and must be returned too.
    return [value for value in extracted_parts if value != ""]


def extract_url_parts_post_netloc(url: str, remove_start_slash: bool = True) -> str:
    """
    Return everything located after the network location of a URL.

    The path, params, query and fragment are concatenated into one string.

    :param url: The URL to extract parts from
    :param remove_start_slash: If True, drop one leading slash from the result
        when present. Default is True.
    :return: The concatenated components, or an empty string when the URL has
        none of them.
    """
    # NOTE(review): the components are joined without their ";", "?" and "#"
    # delimiters, so "/a?b=1" yields "ab=1". Kept as is because the result is
    # used as an external id; confirm before changing.
    ret = extract_url_parts(
        url=url,
        parts_to_extract=[
            URLParts.PATH,
            URLParts.PARAMS,
            URLParts.QUERY,
            URLParts.FRAGMENT,
        ],
        concat=True,
    )
    if remove_start_slash and ret.startswith("/"):
        ret = ret[1:]
    return ret
def extract_doi_number(url: str, strict: bool = False) -> str:
    """
    Extract the DOI number ("10.xxxx/xxxxx") from a DOI URL or a bare DOI.

    Only the path component of ``url`` is considered, minus one leading slash.

    :param url: A DOI URL (e.g. "https://doi.org/10.1234/abc") or a bare DOI
    :param strict: If True, return an empty string when the path does not
        start with "10.". If False (default), return the URL path unchanged in
        that case.
    :return: The DOI number, or the fallback described for ``strict``.
    """
    # NOTE(review): urlparse splits ";", "?" and "#" out of the path, so a DOI
    # suffix containing one of these characters is truncated here.
    path = urlparse(url).path
    # A bare DOI has no leading slash; a DOI URL has exactly one.
    candidate = path.removeprefix("/")
    if candidate.startswith("10."):
        return candidate
    return "" if strict else path
def validate_doi(s: str, resolve_doi: bool = True) -> bool:
    """
    Check that a string looks like a DOI and, optionally, that it resolves.

    The input is stripped and a leading "https://doi.org/" is removed. The
    beginning of the remaining text must match ``DOI_REGEX``. When
    ``resolve_doi`` is True, the doi.org handle API is then queried and the DOI
    is considered valid only if the response status is 200. Any request error
    is reported as an invalid DOI.

    :param s: The string to validate as a DOI.
    :param resolve_doi: Whether to check that the DOI exists by querying the
        doi.org handle API. If False, only the format is validated.
    :return: True if the string is a valid DOI (and exists when
        ``resolve_doi`` is True), False otherwise.
    """
    from urllib.parse import quote

    s = s.strip().removeprefix("https://doi.org/")
    if not re.match(DOI_REGEX, s):
        return False
    if not resolve_doi:
        return True

    # DOI suffixes may contain characters such as "?", "#" or "%" that would
    # otherwise be read as URL syntax; only "/" is left unescaped.
    handle_url = "https://doi.org/api/handles/" + quote(s, safe="/")
    try:
        r = requests.get(handle_url, timeout=5)
    except requests.RequestException:
        return False
    return r.status_code == 200
re.findall(autoclosing_pattern, self.xml_raw_data, re.DOTALL) logger.info("Found %d matches for tag %s", len(matches), tag) ret = [] diff --git a/welearn_datastack/plugins/rest_requesters/fao_open_knowledge.py b/welearn_datastack/plugins/rest_requesters/fao_open_knowledge.py index ba43965..9f0e8d3 100644 --- a/welearn_datastack/plugins/rest_requesters/fao_open_knowledge.py +++ b/welearn_datastack/plugins/rest_requesters/fao_open_knowledge.py @@ -7,6 +7,7 @@ import pydantic import requests +from welearn_database.data.enumeration import ExternalIdType from welearn_database.data.models import WeLearnDocument from welearn_database.modules.text_cleaning import clean_text @@ -24,7 +25,6 @@ NoContent, NoDescriptionFoundError, NotExpectedMoreThanOneItem, - PDFFileSizeExceedLimit, UnauthorizedLicense, UnauthorizedState, ) @@ -227,7 +227,7 @@ def _extract_details(self, fao_document: Item) -> dict: [publication_date] = parsed_metadata.get("dc.date.available", empty_entry) [update_date] = parsed_metadata.get("dc.date.lastModified", empty_entry) [isbn] = parsed_metadata.get("dc.identifier.isbn", empty_entry) - [doi] = parsed_metadata.get("dc.identifier.doi", empty_entry) + [doi] = parsed_metadata.get("fao.identifier.doi", empty_entry) [type_] = parsed_metadata.get("fao.taxonomy.type", empty_entry) ret: dict[str, Any] = { "publication_date": ( diff --git a/welearn_datastack/plugins/rest_requesters/oapen.py b/welearn_datastack/plugins/rest_requesters/oapen.py index 2a47674..fa3b238 100644 --- a/welearn_datastack/plugins/rest_requesters/oapen.py +++ b/welearn_datastack/plugins/rest_requesters/oapen.py @@ -10,6 +10,7 @@ from typing import Dict, Iterable, List from lingua import Language +from welearn_database.data.enumeration import ExternalIdType from welearn_database.data.models import WeLearnDocument from welearn_datastack.constants import AUTHORIZED_LICENSES, HEADERS diff --git a/welearn_datastack/plugins/rest_requesters/open_alex.py 
b/welearn_datastack/plugins/rest_requesters/open_alex.py index ee03a15..147f288 100644 --- a/welearn_datastack/plugins/rest_requesters/open_alex.py +++ b/welearn_datastack/plugins/rest_requesters/open_alex.py @@ -44,6 +44,7 @@ remove_hyphens, replace_ligatures, ) +from welearn_datastack.modules.url_utils import extract_doi_number from welearn_datastack.plugins.interface import IPluginRESTCollector from welearn_datastack.utils_.http_client_utils import ( get_http_code_from_exception, @@ -199,8 +200,6 @@ def _update_welearn_document(self, wrapper: WrapperRawData) -> WeLearnDocument: wrapper.document.description = document_desc wrapper.document.content = document_content wrapper.document.details = document_details - wrapper.document.external_id = self._get_doi(wrapper) - wrapper.document.external_id_type = ExternalIdType.DOI return wrapper.document @@ -242,7 +241,7 @@ def _build_details( def _get_doi(wrapper: WrapperRawData) -> str | None: doi = wrapper.raw_data.ids.doi if doi.startswith("https://doi.org/"): - doi = doi.replace("https://doi.org/", "") + doi = extract_doi_number(doi) return doi @staticmethod diff --git a/welearn_datastack/plugins/scrapers/oe_books.py b/welearn_datastack/plugins/scrapers/oe_books.py index 4a373af..e529150 100644 --- a/welearn_datastack/plugins/scrapers/oe_books.py +++ b/welearn_datastack/plugins/scrapers/oe_books.py @@ -5,11 +5,13 @@ from bs4 import BeautifulSoup # type: ignore from requests import Session # type: ignore +from welearn_database.data.enumeration import ExternalIdType from welearn_database.data.models import WeLearnDocument from welearn_datastack.constants import AUTHORIZED_LICENSES, MD_OE_BOOKS_BASE_URL from welearn_datastack.data.db_wrapper import WrapperRetrieveDocument from welearn_datastack.exceptions import ClosedAccessContent +from welearn_datastack.modules.url_utils import extract_url_parts_post_netloc from welearn_datastack.modules.xml_extractor import XMLExtractor from welearn_datastack.plugins.interface import 
IPluginScrapeCollector from welearn_datastack.utils_.http_client_utils import ( @@ -85,8 +87,12 @@ def _scrape_url(self, document: WeLearnDocument) -> WeLearnDocument: case "book": details["type"] = "book" if not root_extractor: - logger.warning("Weird case, cannot accessed to API before :%s", url) - md_id, mets_api_res = self._get_mets_metadata(https_session, url) + logger.warning( + "Weird case, cannot accessed to API before :%s", document.url + ) + md_id, mets_api_res = self._get_mets_metadata( + https_session, document.url + ) root_extractor = XMLExtractor(mets_api_res.content.decode("utf-8")) if not self._is_open_access(root_extractor): @@ -353,7 +359,7 @@ def _get_mets_metadata(self, https_session, url): :param url: The url of the book :return: The md_id and the response of the API """ - md_id = url.replace("https://books.openedition.org/", "") + md_id = extract_url_parts_post_netloc(url) md_url = MD_OE_BOOKS_BASE_URL.replace("", md_id) mets_api_res = https_session.get(url=md_url, timeout=self.timeout) return md_id, mets_api_res diff --git a/welearn_datastack/plugins/scrapers/peerj.py b/welearn_datastack/plugins/scrapers/peerj.py index eebe26a..954d9e5 100644 --- a/welearn_datastack/plugins/scrapers/peerj.py +++ b/welearn_datastack/plugins/scrapers/peerj.py @@ -239,8 +239,6 @@ def _scrape_url(self, document: WeLearnDocument) -> WeLearnDocument: doi = document.details.get("doi", None) if not doi: raise NoDOIFoundError(f"No DOI found for '{document.url}'") - document.external_id = doi - document.external_id_type = ExternalIdType.DOI return document diff --git a/welearn_datastack/plugins/scrapers/plos.py b/welearn_datastack/plugins/scrapers/plos.py index 9be91bf..e3e82f5 100644 --- a/welearn_datastack/plugins/scrapers/plos.py +++ b/welearn_datastack/plugins/scrapers/plos.py @@ -14,6 +14,7 @@ from welearn_datastack.constants import AUTHORIZED_LICENSES from welearn_datastack.data.db_wrapper import WrapperRetrieveDocument from welearn_datastack.exceptions import 
UnauthorizedLicense +from welearn_datastack.modules.url_utils import extract_doi_number from welearn_datastack.plugins.interface import IPluginScrapeCollector from welearn_datastack.regular_expression import ANTI_URL_REGEX from welearn_datastack.utils_.http_client_utils import ( @@ -124,7 +125,7 @@ def extract_doi(self, article_meta: BeautifulSoup) -> str: doi_extract = article_meta.find("article-id", {"pub-id-type": "doi"}) doi = self.extract_property(doi_extract) if doi.startswith("https://doi.org/"): - doi = doi.replace("https://doi.org/", "") + doi = extract_doi_number(doi) return doi @staticmethod @@ -273,8 +274,6 @@ def extract_data_from_plos_xml( document.description = clean_return_to_line(doc_desc) document.full_content = clean_doc_content document.details = self._get_document_details(soup=soup) - document.external_id = self.extract_doi(article_meta) - document.external_id_type = ExternalIdType.DOI return document diff --git a/welearn_datastack/regular_expression.py b/welearn_datastack/regular_expression.py index 4486129..9ebade1 100644 --- a/welearn_datastack/regular_expression.py +++ b/welearn_datastack/regular_expression.py @@ -75,6 +75,13 @@ # limit: Treats underscores as word characters and does not handle hyphenated words or contractions as single tokens. WORDS_REGEX = r"\w+" +# description: Matches a DOI (Digital Object Identifier) in its standard format, starting with "10." followed by a numeric prefix and a suffix. +# example: "10.1000/xyz123" -> matches "10.1000/xyz123" +# limit: The regex is quite permissive in the suffix part, allowing any +# non-whitespace characters except quotes and angle brackets, which may lead to false positives in some contexts. 
def simple_xml_tag_format_regex_autoclosing(tag: str) -> str:
    """
    Generate a regular expression matching a self-closing XML tag.

    example: tag "link" matches '<link rel="alternate" href="https://a/b"/>'
    limit: an attribute value containing ">" is not supported, and ``tag`` is
    inserted as is (no escaping, no word boundary).

    The pattern keeps two capture groups so that ``re.findall`` yields
    ``(attributes, content)`` tuples like ``simple_xml_tag_format_regex``; the
    content group is always empty because a self-closing tag has no body.

    :param tag: The name of the XML tag to match.
    :return: A regular expression string to match the self-closing XML tag.
    """
    # The second group is deliberately empty: a lazy "(.*?)" combined with
    # re.DOTALL would let a match run from a non-self-closing "<tag ...>" up to
    # any later "/>" in the document.
    return rf"<{tag}([^>]*)()/>"