From 145951fa807cb4eacc98c930e70479327c7ab1c9 Mon Sep 17 00:00:00 2001 From: hanig Date: Sat, 7 Feb 2026 15:19:24 -0800 Subject: [PATCH] Fix Zotero add-paper to extract metadata from arXiv, PubMed, and generic publisher URLs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, URLs from arXiv and most publishers fell through to the webpage fallback, storing only the raw link. Now _extract_doi_from_url handles arXiv (via 10.48550 DOI), PubMed (PMID→DOI via NCBI API), and a generic fallback that catches any DOI embedded in a URL path or query string (Wiley, Springer, T&F, ACS, SAGE, Oxford, Frontiers, PLoS, etc.). Co-Authored-By: Claude Opus 4.6 --- src/integrations/zotero_client.py | 50 +++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/integrations/zotero_client.py b/src/integrations/zotero_client.py index 240541f..ea30aa7 100644 --- a/src/integrations/zotero_client.py +++ b/src/integrations/zotero_client.py @@ -633,6 +633,56 @@ def _extract_doi_from_url(self, url: str) -> str | None: if match: return match.group(1) + # arXiv: arxiv.org/abs/2601.07372 or arxiv.org/pdf/2601.07372 + if "arxiv.org/" in url: + match = re.search(r"arxiv\.org/(?:abs|pdf|html)/(\d+\.\d+)", url) + if match: + return f"10.48550/arXiv.{match.group(1)}" + + # PubMed: pubmed.ncbi.nlm.nih.gov/12345678 + if "pubmed.ncbi.nlm.nih.gov/" in url: + match = re.search(r"pubmed\.ncbi\.nlm\.nih\.gov/(\d+)", url) + if match: + pmid = match.group(1) + doi = self._resolve_pmid_to_doi(pmid) + if doi: + return doi + + # Generic fallback: look for a DOI embedded anywhere in the URL + # Covers Wiley, Springer, T&F, ACS, SAGE, Oxford, Frontiers, PLoS, etc. + match = re.search(r"(?:^|[/=])(10\.\d{4,}/[^\s&#]+)", url) + if match: + return match.group(1).rstrip("/") + + return None + + def _resolve_pmid_to_doi(self, pmid: str) -> str | None: + """Resolve a PubMed ID to a DOI via the NCBI API. + + Args: + pmid: The PubMed ID (numeric string). + + Returns: + DOI string or None if not found. + """ + import httpx + + ncbi_url = ( + "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi" + f"?db=pubmed&id={pmid}&retmode=json" + ) + try: + with httpx.Client() as client: + response = client.get(ncbi_url, timeout=10.0) + if response.status_code == 200: + data = response.json() + article = data.get("result", {}).get(pmid, {}) + for id_entry in article.get("articleids", []): + if id_entry.get("idtype") == "doi": + return id_entry["value"] + except Exception as e: + logger.warning(f"NCBI lookup failed for PMID {pmid}: {e}") + return None def add_item_by_url(