CyberCRI · lpi-tn · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -31,11 +31,7 @@ sentence-transformers = "^5.2.2"
 spacy = "^3.8.11"
 refinedoc = "^1.0.0"
 qdrant-client = "1.16.2"
-<<<<<<< Feature/external-id-scientif-journals
-python-dotenv = "^1.2.1"
-=======
 python-dotenv = "^1.2.2"
->>>>>>> main
 beautifulsoup4 = "^4.14.3"
 pyphen = "^0.17.2"
 ijson = "^3.4.0"

diff --git a/tests/url_collector/resources/atom_file_doi.xml b/tests/url_collector/resources/atom_file_doi.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<feed xmlns="https://www.w3.org/2005/Atom">
+  <title>Example Atom Feed</title>
+  <link href="https://www.example.com" rel="self"/>
+  <link rel="alternate" href="https://www.example.com"/>
+  <id>https://doi.org/10.1234/feed</id>
+  <updated>2023-09-12T00:00:00Z</updated>
+  <author>
+    <name>Your Name</name>
+    <email>your-email@example.com</email>
+  </author>
+
+  <entry>
+    <title>First Entry</title>
+    <link rel="alternate" href="https://www.example.com/entry1"/>
+    <id>https://doi.org/10.1234/entry1</id>
+    <published>2023-09-12T12:00:00Z</published>
+    <updated>2023-09-12T12:00:00Z</updated>
+    <summary>This is the summary of the first entry.</summary>
+    <content type="html">&lt;p&gt;This is the content of the first entry.&lt;/p&gt;</content>
+  </entry>
+
+  <entry>
+    <title>Second Entry</title>
+    <link rel="alternate" href="https://www.example.com/entry2"/>
+    <id>https://doi.org/10.1234/entry2</id>
+    <published>2023-09-13T09:30:00Z</published>
+    <updated>2023-09-13T09:30:00Z</updated>
+    <summary>This is the summary of the second entry.</summary>
+    <content type="html">&lt;p&gt;This is the content of the second entry.&lt;/p&gt;</content>
+  </entry>
+
+  <entry>
+    <title>Third Entry</title>
+    <link rel="alternate"  href="https://www.example.com/entry3"/>
+    <id>https://doi.org/10.1234/entry3</id>
+    <published>2023-09-14T15:45:00Z</published>
+    <updated>2023-09-14T15:45:00Z</updated>
+    <summary>This is the summary of the third entry.</summary>
+    <content type="html">&lt;p&gt;This is the content of the third entry.&lt;/p&gt;</content>
+  </entry>
+
+</feed>
diff --git a/tests/url_collector/test_atom_collector.py b/tests/url_collector/test_atom_collector.py
@@ -2,6 +2,7 @@
 from unittest import TestCase
 from unittest.mock import Mock, patch
 
+from welearn_database.data.enumeration import ExternalIdType
 from welearn_database.data.models import Corpus
 
 from welearn_datastack.collectors.atom_collector import AtomURLCollector
@@ -10,11 +11,16 @@
 class Test(TestCase):
     def setUp(self) -> None:
         self.rss_file_path = Path(__file__).parent / "resources" / "atom_file.xml"
+        self.rss_file_path_doi = (
+            Path(__file__).parent / "resources" / "atom_file_doi.xml"
+        )
         self.mock_corpus = Corpus(
             source_name="test", is_fix=True, main_url="https://www.example.com"
         )
         with self.rss_file_path.open(mode="r") as f:
             self.rss_content = f.read()
+        with self.rss_file_path_doi.open(mode="r") as f:
+            self.rss_content_doi = f.read()
 
     @patch("welearn_datastack.collectors.atom_collector.get_new_https_session")
     def test_atom_urlcollector(self, mock_get_new_https_session):
@@ -39,6 +45,35 @@ def test_atom_urlcollector(self, mock_get_new_https_session):
         for i in range(0, len(collected)):
             self.assertEqual(collected[i].url, f"https://www.example.com/entry{i+1}")
             self.assertEqual(collected[i].corpus.source_name, "test")
+            self.assertEqual(collected[i].external_id, f"entry{i+1}")
+            self.assertEqual(collected[i].external_id_type, ExternalIdType.SLUG)
+            self.assertEqual(collected[i].corpus.is_fix, True)
+
+    @patch("welearn_datastack.collectors.atom_collector.get_new_https_session")
+    def test_atom_urlcollector_with_id_doi(self, mock_get_new_https_session):
+        """
+        Test AtomURLCollector.collect on an Atom feed whose 3 entry ids are DOI URLs
+        """
+        mock_session = Mock()
+        mock_response = Mock()
+        mock_get_new_https_session.return_value = mock_session
+        mock_session.ok.return_value = True
+        mock_session.get.return_value = mock_response
+
+        mock_response.content = self.rss_content_doi.encode("utf-8")
+
+        rss_collector = AtomURLCollector(
+            feed_url="https://www.example.com",
+            corpus=self.mock_corpus,
+        )
+        collected = rss_collector.collect()
+        self.assertEqual(3, len(collected))
+
+        for i in range(0, len(collected)):
+            self.assertEqual(collected[i].url, f"https://www.example.com/entry{i+1}")
+            self.assertEqual(collected[i].corpus.source_name, "test")
+            self.assertEqual(collected[i].external_id, f"10.1234/entry{i+1}")
+            self.assertEqual(collected[i].external_id_type, ExternalIdType.DOI)
             self.assertEqual(collected[i].corpus.is_fix, True)
 
     @patch("welearn_datastack.collectors.atom_collector.get_new_https_session")
@@ -55,7 +90,7 @@ def test_atom_urlcollector_different_domain(self, mock_get_new_https_session):
         localmock_corpus = Corpus(
             source_name="test_org", is_fix=True, main_url="https://www.example.org"
         )
-        local_atom = self.rss_content.replace("example.com", "example.org")
+        local_atom = self.rss_content.replace("example.com", "example.org")
 
         mock_response.content = local_atom.encode("utf-8")
 

diff --git a/tests/url_collector/test_oe_books_collector.py b/tests/url_collector/test_oe_books_collector.py
@@ -2,9 +2,11 @@
 from pathlib import Path
 from unittest.mock import Mock, patch
 
+from welearn_database.data.enumeration import ExternalIdType
 from welearn_database.data.models import Corpus, WeLearnDocument
 
 from welearn_datastack.collectors.oe_books_collector import OpenEditionBooksURLCollector
+from welearn_datastack.modules.url_utils import extract_url_parts_post_netloc
 from welearn_datastack.modules.xml_extractor import XMLExtractor
 from welearn_datastack.plugins.scrapers import OpenEditionBooksCollector
 
@@ -122,6 +124,8 @@ def test_collect_book_accessible_no_chapters(
 
         self.assertEqual(len(collected), 1)
         self.assertEqual(collected[0].url, self.url_list[0])
+        self.assertEqual(collected[0].external_id, "examplepub0/0")
+        self.assertEqual(collected[0].external_id_type, ExternalIdType.SLUG)
 
     @patch("welearn_datastack.collectors.oe_books_collector.get_new_https_session")
     @patch("welearn_datastack.collectors.oe_books_collector.RssURLCollector.collect")
@@ -151,6 +155,8 @@ def test_collect_book_accessible_license_unauthorized(
 
         self.assertEqual(len(collected), 1)
         self.assertEqual(collected[0].url, self.url_list[0])
+        self.assertEqual(collected[0].external_id, "examplepub0/0")
+        self.assertEqual(collected[0].external_id_type, ExternalIdType.SLUG)
 
     @patch("welearn_datastack.collectors.oe_books_collector.get_new_https_session")
     @patch("welearn_datastack.collectors.oe_books_collector.RssURLCollector.collect")
@@ -185,6 +191,16 @@ def test_collect_book_accessible_license_authorized(
             "https://books.openedition.org/examplepub0/8088",
         ]
 
+        wanted_external_ids = [
+            extract_url_parts_post_netloc(url) for url in wanted_urls
+        ]
+        collected_external_ids = [doc.external_id for doc in collected]
+        wanted_external_ids.sort()
+        collected_external_ids.sort()
+
         collected_url.sort()
         wanted_urls.sort()
         self.assertListEqual(collected_url, wanted_urls)
+        self.assertListEqual(collected_external_ids, wanted_external_ids)
+        for doc in collected:
+            self.assertEqual(doc.external_id_type, ExternalIdType.SLUG)
diff --git a/tests/url_collector/test_openalex_collector.py b/tests/url_collector/test_openalex_collector.py
@@ -6,6 +6,7 @@
 from unittest.mock import MagicMock, Mock, patch
 from zoneinfo import ZoneInfo
 
+from welearn_database.data.enumeration import ExternalIdType
 from welearn_database.data.models import Corpus, WeLearnDocument
 
 from welearn_datastack.collectors.open_alex_collector import OpenAlexURLCollector
@@ -79,6 +80,11 @@ def test_collect(self, mock__get_oa_json):
 
         full_content = self.content_json1["results"] + self.content_json2["results"]
         awaited_urls = [v["id"] for v in full_content]
+        awaited_external_ids = [v["id"].split("/")[-1] for v in full_content]
 
         self.assertListEqual(returned_urls, awaited_urls)
         self.assertEqual(len(returned_urls), 400)
+        for wldoc, awaited_external_id in zip(returned_wldoc, awaited_external_ids):
+            self.assertEqual(wldoc.corpus, self.mock_corpus)
+            self.assertEqual(wldoc.external_id, awaited_external_id)
+            self.assertEqual(wldoc.external_id_type, ExternalIdType.API_ID)
diff --git a/welearn_datastack/collectors/atom_collector.py b/welearn_datastack/collectors/atom_collector.py
@@ -1,15 +1,34 @@
+import logging
+import os
 from typing import List
 from urllib.parse import urlparse
 
+from welearn_database.data.enumeration import ExternalIdType
 from welearn_database.data.models import Corpus, WeLearnDocument
 
-from welearn_datastack.collectors.helpers.feed_helpers import (
-    extracted_url_to_url_datastore,
-    lines_to_url,
-)
 from welearn_datastack.data.url_collector import URLCollector
+from welearn_datastack.modules.url_utils import (
+    extract_doi_number,
+    extract_url_parts_post_netloc,
+)
+from welearn_datastack.modules.validation import validate_doi
+from welearn_datastack.modules.xml_extractor import XMLExtractor
 from welearn_datastack.utils_.http_client_utils import get_new_https_session
 
+log_level: int = logging.getLevelName(os.getenv("LOG_LEVEL", "INFO"))
+log_format: str = os.getenv(
+    "LOG_FORMAT", "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
+)
+
+if not isinstance(log_level, int):
+    raise ValueError(f"Log level is not recognized : '{log_level}'")
+
+logging.basicConfig(
+    level=log_level,
+    format=log_format,
+)
+logger = logging.getLogger(__name__)
+
 url_illegal_characters = ['"', "<", ">"]
 headers = {
     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
@@ -31,23 +50,57 @@ def __init__(
         self.corpus = corpus
 
     def collect(self) -> List[WeLearnDocument]:
+        logger.info(
+            f"Collecting URLs from feed {self.feed_url} for corpus {self.corpus.id}"
+        )
         domain = "https://" + urlparse(self.corpus.main_url).netloc
         client = get_new_https_session()
         res = client.get(url=self.feed_url, headers=headers)
         content = res.content.decode("utf-8")
 
-        flag = False
-        link_lines: List[str] = []
-        for line in content.split("\n"):
-            # If we are in the entry section and we find a link
-            # The definition, especially "rel" part is empirical
-            if flag and line.strip().startswith('<link rel="alternate"'):
-                link_lines.append(line.strip())
-            if line.strip().startswith("<entry>"):
-                flag = True
-
-        urls = lines_to_url(domain, link_lines)
-
-        ret = extracted_url_to_url_datastore(urls=urls, corpus=self.corpus)
+        logger.debug(f"Content of the feed {self.feed_url} : {content}")
 
+        ret: list[WeLearnDocument] = []
+        entries = XMLExtractor(content).extract_content("entry")
+        logger.info(f"Found {len(entries)} entries in the feed {self.feed_url}")
+        for entry in entries:
+            entry_extractor = XMLExtractor(entry.content)
+            link = entry_extractor.extract_content_attribute_filter(
+                tag="link",
+                attribute_name="rel",
+                attribute_value="alternate",
+            )
+            if len(link) == 0:
+                continue
+            ids = entry_extractor.extract_content("id")
+            external_id = ids[0].content.strip() if ids else ""  # <id> may be absent
+            link_url = link[0].attributes["href"]
+            if link_url.startswith(domain):
+                if validate_doi(external_id, resolve_doi=False):
+                    # If the external ID is a valid DOI, we can use it directly as the external ID and set the type to DOI
+                    logger.info(
+                        f"External ID {external_id} is a valid DOI for URL {link_url}"
+                    )
+                    external_id = extract_doi_number(external_id)
+                    external_id_type = ExternalIdType.DOI
+                else:
+                    # Otherwise, we can use the part of the URL after the domain as the external ID and set the type to SLUG
+                    logger.info(
+                        f"External ID {external_id} is not a valid DOI for URL {link_url}, using the part of the URL after the domain as the external ID"
+                    )
+                    external_id = extract_url_parts_post_netloc(
+                        link_url, remove_start_slash=True
+                    )
+                    external_id_type = ExternalIdType.SLUG
+                ret.append(
+                    WeLearnDocument(
+                        url=link_url,
+                        corpus=self.corpus,
+                        external_id=external_id,
+                        external_id_type=external_id_type,
+                    )
+                )
+        logger.info(
+            f"Collected {len(ret)} URLs from feed {self.feed_url} for corpus {self.corpus.id}"
+        )
         return ret
diff --git a/welearn_datastack/collectors/oe_books_collector.py b/welearn_datastack/collectors/oe_books_collector.py
@@ -2,6 +2,7 @@
 import re
 from typing import Dict, List
 
+from welearn_database.data.enumeration import ExternalIdType
 from welearn_database.data.models import Corpus, WeLearnDocument
 
 from welearn_datastack.collectors.rss_collector import RssURLCollector
@@ -11,6 +12,7 @@
     MD_OE_BOOKS_BASE_URL,
 )
 from welearn_datastack.data.url_collector import URLCollector
+from welearn_datastack.modules.url_utils import extract_url_parts_post_netloc
 from welearn_datastack.modules.xml_extractor import XMLExtractor
 from welearn_datastack.utils_.http_client_utils import get_new_https_session
 
@@ -81,7 +83,7 @@ def collect(self) -> List[WeLearnDocument]:
         client = get_new_https_session()
         for book_url in rss_urls:
             logger.info("Collecting book: %s", book_url)
-            md_id = book_url.url.replace("https://books.openedition.org/", "")
+            md_id = extract_url_parts_post_netloc(book_url.url)
             md_url = MD_OE_BOOKS_BASE_URL.replace("<md_id>", md_id)
 
             md_res = client.get(url=md_url, headers=HEADERS)
@@ -112,20 +114,39 @@ def collect(self) -> List[WeLearnDocument]:
                         # Weird case where there is no chapters
                         logger.warning("No chapters found for book: %s", book_url.url)
                         ret.append(
-                            WeLearnDocument(url=book_url.url, corpus=self.corpus)
+                            WeLearnDocument(
+                                url=book_url.url,
+                                corpus=self.corpus,
+                                external_id=md_id,
+                                external_id_type=ExternalIdType.SLUG,
+                            )
                         )
                         continue
                     else:
                         for chapter_url in chapters_urls:
                             logger.info("--Collecting chapter: %s", chapter_url)
                             ret.append(
-                                WeLearnDocument(url=chapter_url, corpus=self.corpus)
+                                WeLearnDocument(
+                                    url=chapter_url,
+                                    corpus=self.corpus,
+                                    external_id=extract_url_parts_post_netloc(
+                                        chapter_url
+                                    ),
+                                    external_id_type=ExternalIdType.SLUG,
+                                )
                             )
                 else:
                     logger.info(
                         "Book chapters are not legally usable : %s", book_url.url
                     )
-                    ret.append(WeLearnDocument(url=book_url.url, corpus=self.corpus))
+                    ret.append(
+                        WeLearnDocument(
+                            url=book_url.url,
+                            corpus=self.corpus,
+                            external_id=md_id,
+                            external_id_type=ExternalIdType.SLUG,
+                        )
+                    )
                     continue
             else:
                 logger.info("Book is not open access: %s", book_url.url)