Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
9d10a19
feat: add URL parts extraction utility and refactor DOI handling
lpi-tn Apr 28, 2026
6a0f764
feat: set external ID and type for WeLearnDocument in OAPEN processing
lpi-tn Apr 28, 2026
33a1685
feat: update DOI handling to use new identifier and set external ID type
lpi-tn Apr 28, 2026
18f63f1
Merge remote-tracking branch 'origin/Feature/external-id-scientif-jou…
lpi-tn Apr 28, 2026
4959ba1
feat: update python-dotenv to version 1.2.2 and improve DOI extractio…
lpi-tn Apr 28, 2026
0f25c73
feat: remove external ID and type assignment from document processing
lpi-tn Apr 28, 2026
2c9c854
feat: remove external ID and type assignment from document processing…
lpi-tn Apr 28, 2026
7a9aaf6
feat: add external ID and type assignment to WeLearnDocument in OpenA…
lpi-tn Apr 28, 2026
c2c1dbb
feat: add external ID and type assignment to WeLearnDocument in oe_bo…
lpi-tn Apr 28, 2026
8134397
Merge branch 'main' into Feature/external-id-scientif-journals
lpi-tn Apr 28, 2026
8ca9ccb
Merge branch 'main' into Feature/external-id-scientif-journals
lpi-tn Apr 28, 2026
a874c30
feat: add DOI validation function and regex pattern for document vali…
lpi-tn Apr 28, 2026
531b29b
typo
lpi-tn Apr 28, 2026
2bcb9be
feat: enhance XML extraction and add regex for autoclosing tags
lpi-tn Apr 28, 2026
e94a1f7
feat: add logging and DOI validation in URL collection process
lpi-tn Apr 28, 2026
a97f879
feat: add DOI extraction and update Atom URL collector for DOI handling
lpi-tn Apr 28, 2026
9304e7b
typo
lpi-tn Apr 28, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,7 @@ sentence-transformers = "^5.2.2"
spacy = "^3.8.11"
refinedoc = "^1.0.0"
qdrant-client = "1.16.2"
python-dotenv = "^1.2.2"
beautifulsoup4 = "^4.14.3"
pyphen = "^0.17.2"
ijson = "^3.4.0"
Expand Down
43 changes: 43 additions & 0 deletions tests/url_collector/resources/atom_file_doi.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="https://www.w3.org/2005/Atom">
<title>Example Atom Feed</title>
<link href="https://www.example.com" rel="self"/>
<link rel="alternate" href="https://www.example.com"/>
<id>https://doi.org/10.1234/feed</id>
<updated>2023-09-12T00:00:00Z</updated>
<author>
<name>Your Name</name>
<email>your-email@example.com</email>
</author>

<entry>
<title>First Entry</title>
<link rel="alternate" href="https://www.example.com/entry1"/>
<id>https://doi.org/10.1234/entry1</id>
<published>2023-09-12T12:00:00Z</published>
<updated>2023-09-12T12:00:00Z</updated>
<summary>This is the summary of the first entry.</summary>
<content type="html">&lt;p&gt;This is the content of the first entry.&lt;/p&gt;</content>
</entry>

<entry>
<title>Second Entry</title>
<link rel="alternate" href="https://www.example.com/entry2"/>
<id>https://doi.org/10.1234/entry2</id>
<published>2023-09-13T09:30:00Z</published>
<updated>2023-09-13T09:30:00Z</updated>
<summary>This is the summary of the second entry.</summary>
<content type="html">&lt;p&gt;This is the content of the second entry.&lt;/p&gt;</content>
</entry>

<entry>
<title>Third Entry</title>
<link rel="alternate" href="https://www.example.com/entry3"/>
<id>https://doi.org/10.1234/entry3</id>
<published>2023-09-14T15:45:00Z</published>
<updated>2023-09-14T15:45:00Z</updated>
<summary>This is the summary of the third entry.</summary>
<content type="html">&lt;p&gt;This is the content of the third entry.&lt;/p&gt;</content>
</entry>

</feed>
37 changes: 36 additions & 1 deletion tests/url_collector/test_atom_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from unittest import TestCase
from unittest.mock import Mock, patch

from welearn_database.data.enumeration import ExternalIdType
from welearn_database.data.models import Corpus

from welearn_datastack.collectors.atom_collector import AtomURLCollector
Expand All @@ -10,11 +11,16 @@
class Test(TestCase):
def setUp(self) -> None:
    """Load the Atom fixtures and build the corpus shared by the tests.

    Two fixture files are read from the ``resources`` directory located
    next to this module: ``atom_file.xml`` and ``atom_file_doi.xml``.
    Their text is kept on the instance as ``rss_content`` and
    ``rss_content_doi``.
    """
    resources_dir = Path(__file__).parent / "resources"
    self.rss_file_path = resources_dir / "atom_file.xml"
    self.rss_file_path_doi = resources_dir / "atom_file_doi.xml"
    self.mock_corpus = Corpus(
        source_name="test", is_fix=True, main_url="https://www.example.com"
    )
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default encoding: the tests re-encode this text with
    # ``.encode("utf-8")`` before handing it to the mocked HTTP response.
    self.rss_content = self.rss_file_path.read_text(encoding="utf-8")
    self.rss_content_doi = self.rss_file_path_doi.read_text(encoding="utf-8")

@patch("welearn_datastack.collectors.atom_collector.get_new_https_session")
def test_atom_urlcollector(self, mock_get_new_https_session):
Expand All @@ -39,6 +45,35 @@ def test_atom_urlcollector(self, mock_get_new_https_session):
for i in range(0, len(collected)):
self.assertEqual(collected[i].url, f"https://www.example.com/entry{i+1}")
self.assertEqual(collected[i].corpus.source_name, "test")
self.assertEqual(collected[i].external_id, f"entry{i+1}")
self.assertEqual(collected[i].external_id_type, ExternalIdType.SLUG)
self.assertEqual(collected[i].corpus.is_fix, True)

@patch("welearn_datastack.collectors.atom_collector.get_new_https_session")
def test_atom_urlcollector_with_id_doi(self, mock_get_new_https_session):
    """
    Run AtomURLCollector.collect against the DOI fixture and check that
    its three entries come back with a DOI-typed external id.
    """
    # Fake HTTP response serving the DOI fixture as bytes
    fake_response = Mock()
    fake_response.content = self.rss_content_doi.encode("utf-8")

    # Fake session returned by the patched session factory
    fake_session = Mock()
    fake_session.ok.return_value = True
    fake_session.get.return_value = fake_response
    mock_get_new_https_session.return_value = fake_session

    collector = AtomURLCollector(
        feed_url="https://www.example.com",
        corpus=self.mock_corpus,
    )
    collected = collector.collect()
    self.assertEqual(3, len(collected))

    # Fixture entries are numbered from 1, in feed order
    for position, document in enumerate(collected, start=1):
        self.assertEqual(
            document.url, f"https://www.example.com/entry{position}"
        )
        self.assertEqual(document.corpus.source_name, "test")
        self.assertEqual(document.external_id, f"10.1234/entry{position}")
        self.assertEqual(document.external_id_type, ExternalIdType.DOI)
        self.assertEqual(document.corpus.is_fix, True)

@patch("welearn_datastack.collectors.atom_collector.get_new_https_session")
Expand All @@ -55,7 +90,7 @@ def test_atom_urlcollector_different_domain(self, mock_get_new_https_session):
localmock_corpus = Corpus(
source_name="test_org", is_fix=True, main_url="https://www.example.org"
)
local_atom = self.rss_content.replace("example.com", "example.org")
local_atom = self.rss_content.replace("example.com", "example.org")

mock_response.content = local_atom.encode("utf-8")

Expand Down
16 changes: 16 additions & 0 deletions tests/url_collector/test_oe_books_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
from pathlib import Path
from unittest.mock import Mock, patch

from welearn_database.data.enumeration import ExternalIdType
from welearn_database.data.models import Corpus, WeLearnDocument

from welearn_datastack.collectors.oe_books_collector import OpenEditionBooksURLCollector
from welearn_datastack.modules.url_utils import extract_url_parts_post_netloc
from welearn_datastack.modules.xml_extractor import XMLExtractor
from welearn_datastack.plugins.scrapers import OpenEditionBooksCollector

Expand Down Expand Up @@ -122,6 +124,8 @@ def test_collect_book_accessible_no_chapters(

self.assertEqual(len(collected), 1)
self.assertEqual(collected[0].url, self.url_list[0])
self.assertEqual(collected[0].external_id, "examplepub0/0")
self.assertEqual(collected[0].external_id_type, ExternalIdType.SLUG)

@patch("welearn_datastack.collectors.oe_books_collector.get_new_https_session")
@patch("welearn_datastack.collectors.oe_books_collector.RssURLCollector.collect")
Expand Down Expand Up @@ -151,6 +155,8 @@ def test_collect_book_accessible_license_unauthorized(

self.assertEqual(len(collected), 1)
self.assertEqual(collected[0].url, self.url_list[0])
self.assertEqual(collected[0].external_id, "examplepub0/0")
self.assertEqual(collected[0].external_id_type, ExternalIdType.SLUG)

@patch("welearn_datastack.collectors.oe_books_collector.get_new_https_session")
@patch("welearn_datastack.collectors.oe_books_collector.RssURLCollector.collect")
Expand Down Expand Up @@ -185,6 +191,16 @@ def test_collect_book_accessible_license_authorized(
"https://books.openedition.org/examplepub0/8088",
]

wanted_external_ids = [
extract_url_parts_post_netloc(url) for url in wanted_urls
]
collected_external_ids = [doc.external_id for doc in collected]
wanted_external_ids.sort()
collected_external_ids.sort()

collected_url.sort()
wanted_urls.sort()
self.assertListEqual(collected_url, wanted_urls)
self.assertListEqual(collected_external_ids, wanted_external_ids)
for doc in collected:
self.assertEqual(doc.external_id_type, ExternalIdType.SLUG)
6 changes: 6 additions & 0 deletions tests/url_collector/test_openalex_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from unittest.mock import MagicMock, Mock, patch
from zoneinfo import ZoneInfo

from welearn_database.data.enumeration import ExternalIdType
from welearn_database.data.models import Corpus, WeLearnDocument

from welearn_datastack.collectors.open_alex_collector import OpenAlexURLCollector
Expand Down Expand Up @@ -79,6 +80,11 @@ def test_collect(self, mock__get_oa_json):

full_content = self.content_json1["results"] + self.content_json2["results"]
awaited_urls = [v["id"] for v in full_content]
awaited_external_ids = [v["id"].split("/")[-1] for v in full_content]

self.assertListEqual(returned_urls, awaited_urls)
self.assertEqual(len(returned_urls), 400)
for wldoc, awaited_external_id in zip(returned_wldoc, awaited_external_ids):
self.assertEqual(wldoc.corpus, self.mock_corpus)
self.assertEqual(wldoc.external_id, awaited_external_id)
self.assertEqual(wldoc.external_id_type, ExternalIdType.API_ID)
87 changes: 70 additions & 17 deletions welearn_datastack/collectors/atom_collector.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,34 @@
import logging
import os
from typing import List
from urllib.parse import urlparse

from welearn_database.data.enumeration import ExternalIdType
from welearn_database.data.models import Corpus, WeLearnDocument

from welearn_datastack.collectors.helpers.feed_helpers import (
extracted_url_to_url_datastore,
lines_to_url,
)
from welearn_datastack.data.url_collector import URLCollector
from welearn_datastack.modules.url_utils import (
extract_doi_number,
extract_url_parts_post_netloc,
)
from welearn_datastack.modules.validation import validate_doi
from welearn_datastack.modules.xml_extractor import XMLExtractor
from welearn_datastack.utils_.http_client_utils import get_new_https_session

# Logging configuration is read from the environment once, at import time.
_raw_log_level: str = os.getenv("LOG_LEVEL", "INFO")
# logging.getLevelName maps a registered level name to its numeric value;
# for an unknown name it returns the string "Level <name>" instead of an int,
# which is what the isinstance check below detects.
log_level: int = logging.getLevelName(_raw_log_level)
log_format: str = os.getenv(
    "LOG_FORMAT", "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
)

if not isinstance(log_level, int):
    # Exceptions do not interpolate %-style arguments, so build the message
    # here and report the value that was actually configured.
    raise ValueError(f"Log level is not recognized : '{_raw_log_level}'")

logging.basicConfig(
    level=log_level,
    format=log_format,
)
logger = logging.getLogger(__name__)

url_illegal_characters = ['"', "<", ">"]
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
Expand All @@ -31,23 +50,57 @@ def __init__(
self.corpus = corpus

def collect(self) -> List[WeLearnDocument]:
    """Collect the documents advertised by the Atom feed at ``self.feed_url``.

    Every ``<entry>`` of the feed yields at most one document, built from
    the ``href`` of its first ``<link rel="alternate">``. Entries without
    such a link, or whose link does not start with the corpus domain
    (``https://`` + netloc of ``self.corpus.main_url``), are skipped.

    The external id is chosen as follows:

    * if the entry ``<id>`` passes ``validate_doi``, the value returned by
      ``extract_doi_number`` is used, typed ``ExternalIdType.DOI``;
    * otherwise (including when the entry has no ``<id>``), the part of the
      link URL after the domain is used, typed ``ExternalIdType.SLUG``.

    Returns:
        The collected documents, in feed order.
    """
    logger.info(
        f"Collecting URLs from feed {self.feed_url} for corpus {self.corpus.id}"
    )
    domain = "https://" + urlparse(self.corpus.main_url).netloc
    client = get_new_https_session()
    res = client.get(url=self.feed_url, headers=headers)
    content = res.content.decode("utf-8")
    # Lazy %-args: the whole feed body is only formatted when DEBUG is on.
    logger.debug("Content of the feed %s : %s", self.feed_url, content)

    ret: list[WeLearnDocument] = []
    entries = XMLExtractor(content).extract_content("entry")
    logger.info(f"Found {len(entries)} entries in the feed {self.feed_url}")
    for entry in entries:
        entry_extractor = XMLExtractor(entry.content)
        link = entry_extractor.extract_content_attribute_filter(
            tag="link",
            attribute_name="rel",
            attribute_value="alternate",
        )
        if len(link) == 0:
            continue
        link_url = link[0].attributes["href"]
        if not link_url.startswith(domain):
            # Links pointing outside the corpus domain are not collected.
            continue

        # Read <id> only for entries that will be kept, and tolerate a
        # missing one: a single malformed entry must not abort the whole
        # feed. If several <id> elements are returned the first is used.
        id_elements = entry_extractor.extract_content("id")
        declared_id = id_elements[0].content.strip() if id_elements else None

        if declared_id is not None and validate_doi(
            declared_id, resolve_doi=False
        ):
            # The declared id is a valid DOI: use it as the external ID
            logger.info(
                f"External ID {declared_id} is a valid DOI for URL {link_url}"
            )
            external_id = extract_doi_number(declared_id)
            external_id_type = ExternalIdType.DOI
        else:
            # Otherwise fall back to the part of the URL after the domain
            logger.info(
                f"External ID {declared_id} is not a valid DOI for URL {link_url}, using the part of the URL after the domain as the external ID"
            )
            external_id = extract_url_parts_post_netloc(
                link_url, remove_start_slash=True
            )
            external_id_type = ExternalIdType.SLUG

        ret.append(
            WeLearnDocument(
                url=link_url,
                corpus=self.corpus,
                external_id=external_id,
                external_id_type=external_id_type,
            )
        )
    logger.info(
        f"Collected {len(ret)} URLs from feed {self.feed_url} for corpus {self.corpus.id}"
    )
    return ret
29 changes: 25 additions & 4 deletions welearn_datastack/collectors/oe_books_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re
from typing import Dict, List

from welearn_database.data.enumeration import ExternalIdType
from welearn_database.data.models import Corpus, WeLearnDocument

from welearn_datastack.collectors.rss_collector import RssURLCollector
Expand All @@ -11,6 +12,7 @@
MD_OE_BOOKS_BASE_URL,
)
from welearn_datastack.data.url_collector import URLCollector
from welearn_datastack.modules.url_utils import extract_url_parts_post_netloc
from welearn_datastack.modules.xml_extractor import XMLExtractor
from welearn_datastack.utils_.http_client_utils import get_new_https_session

Expand Down Expand Up @@ -81,7 +83,7 @@ def collect(self) -> List[WeLearnDocument]:
client = get_new_https_session()
for book_url in rss_urls:
logger.info("Collecting book: %s", book_url)
md_id = book_url.url.replace("https://books.openedition.org/", "")
md_id = extract_url_parts_post_netloc(book_url.url)
md_url = MD_OE_BOOKS_BASE_URL.replace("<md_id>", md_id)

md_res = client.get(url=md_url, headers=HEADERS)
Expand Down Expand Up @@ -112,20 +114,39 @@ def collect(self) -> List[WeLearnDocument]:
# Weird case where there is no chapters
logger.warning("No chapters found for book: %s", book_url.url)
ret.append(
WeLearnDocument(url=book_url.url, corpus=self.corpus)
WeLearnDocument(
url=book_url.url,
corpus=self.corpus,
external_id=md_id,
external_id_type=ExternalIdType.SLUG,
)
)
continue
else:
for chapter_url in chapters_urls:
logger.info("--Collecting chapter: %s", chapter_url)
ret.append(
WeLearnDocument(url=chapter_url, corpus=self.corpus)
WeLearnDocument(
url=chapter_url,
corpus=self.corpus,
external_id=extract_url_parts_post_netloc(
chapter_url
),
external_id_type=ExternalIdType.SLUG,
)
)
else:
logger.info(
"Book chapters are not legally usable : %s", book_url.url
)
ret.append(WeLearnDocument(url=book_url.url, corpus=self.corpus))
ret.append(
WeLearnDocument(
url=book_url.url,
corpus=self.corpus,
external_id=md_id,
external_id_type=ExternalIdType.SLUG,
)
)
continue
else:
logger.info("Book is not open access: %s", book_url.url)
Expand Down
Loading
Loading