From f6d531d3cee9d04a849f0c3e0c7fbb3013cffc35 Mon Sep 17 00:00:00 2001
From: "Michael E. Karpeles (Mek)" <michael.karpeles@gmail.com>
Date: Tue, 16 Jun 2026 17:39:22 -0600
Subject: [PATCH 1/2] feat(import): add work_identifiers support for work
 matching (#9472)

Adds a new optional `work_identifiers` field to the import schema,
parallel to `identifiers` (edition). When present, the import pipeline
uses these IDs to locate the matching Work before falling back to title
comparison.

Changes:
- import.schema.json: add `work_identifiers` field (same shape as `identifiers`)
- find_matching_work(): identifier-first lookup via infobase things() query,
  title-based match retained as fallback
- update_work_with_rec_data(): merges incoming work_identifiers onto the
  matched Work, deduplicated
- 5 new tests covering happy path, identifier-wins-over-title conflict,
  enrichment write-back, unknown-identifier fallback, and absent-field
  backward compat
---
 openlibrary/catalog/add_book/__init__.py      |  31 ++-
 .../catalog/add_book/tests/test_add_book.py   | 195 ++++++++++++++++++
 openlibrary/schemata/import.schema.json       |  17 +-
 3 files changed, 239 insertions(+), 4 deletions(-)

diff --git a/openlibrary/catalog/add_book/__init__.py b/openlibrary/catalog/add_book/__init__.py
index 5afc8f8a35c..ac924d50eb4 100644
--- a/openlibrary/catalog/add_book/__init__.py
+++ b/openlibrary/catalog/add_book/__init__.py
@@ -196,15 +196,30 @@ def split_subtitle(full_title: str):
 
 def find_matching_work(e):
     """
-    Looks for an existing Work representing the new import edition by
-    comparing normalized titles for every work by each author of the current edition.
-    Returns the first match found, or None.
+    Looks for an existing Work representing the new import edition.
+
+    First tries to match by work_identifiers (external IDs such as Goodreads
+    work ID or Wikidata Q-number) if any are present in the record.  Identifier
+    matches take priority over title matching: a caller that supplies a work
+    identifier is asserting they already know which Work this is.
+
+    Falls back to comparing normalized titles for every work by each author,
+    which is the original behaviour.
 
     :param dict e: An OL edition suitable for saving, has a key, and has full Authors with keys
                    but has not yet been saved.
     :rtype: None or str
     :return: the matched work key "/works/OL..W" if found
     """
+    # --- identifier-based match (priority) ---
+    for identifier, vals in e.get("work_identifiers", {}).items():
+        for val in vals:
+            q = {"type": "/type/work", f"identifiers.{identifier}": val}
+            matches = list(site.get().things(q))
+            if matches:
+                return matches[0]
+
+    # --- title-based match (fallback) ---
     seen = set()
     for a in e["authors"]:
         q = {"type": "/type/work", "authors": {"author": {"key": a["key"]}}}
@@ -931,6 +946,16 @@ def update_work_with_rec_data(rec: dict, edition: Edition, work: dict[str, Any],
         if work.get("authors"):
             need_work_save = True
 
+    # Add new work_identifiers, merging with any already on the work.
+    if "work_identifiers" in rec:
+        identifiers = defaultdict(list, work.get("identifiers", {}))
+        for k, vals in rec["work_identifiers"].items():
+            identifiers[k].extend(vals)
+            identifiers[k] = list(set(identifiers[k]))
+        if work.get("identifiers") != identifiers:
+            work["identifiers"] = dict(identifiers)
+            need_work_save = True
+
     return need_work_save
 
 
diff --git a/openlibrary/catalog/add_book/tests/test_add_book.py b/openlibrary/catalog/add_book/tests/test_add_book.py
index 2f7ab1ecc01..ff9d0185c32 100644
--- a/openlibrary/catalog/add_book/tests/test_add_book.py
+++ b/openlibrary/catalog/add_book/tests/test_add_book.py
@@ -1056,6 +1056,201 @@ def test_existing_work(mock_site, add_languages):
     assert e.works[0]["key"] == "/works/OL16W"
 
 
+def test_work_identifiers_match_existing_work(mock_site, add_languages):
+    """
+    work_identifiers in the import record match an existing OL work by its
+    stored identifiers (e.g. a Goodreads work ID the work already has).
+    The edition should be linked to that work and the identifier enriched.
+
+    Freso's use case: importing from Goodreads where you have the Goodreads
+    work ID alongside the edition ID.
+    """
+    author = {
+        "type": {"key": "/type/author"},
+        "name": "Charles Dickens",
+        "key": "/authors/OL1A",
+    }
+    existing_work = {
+        "authors": [{"author": "/authors/OL1A", "type": {"key": "/type/author_role"}}],
+        "key": "/works/OL1W",
+        "title": "A Christmas Carol",
+        "type": {"key": "/type/work"},
+        "identifiers": {"goodreads": ["5326"]},
+    }
+    mock_site.save(author)
+    mock_site.save(existing_work)
+
+    rec = {
+        "source_records": ["goodreads:5326-edition"],
+        "title": "A Christmas Carol",  # title also matches, but identifiers win
+        "authors": [{"name": "Charles Dickens"}],
+        "publishers": ["Chapman & Hall"],
+        "publish_date": "1843",
+        "work_identifiers": {"goodreads": ["5326"]},
+    }
+
+    reply = load(rec)
+    assert reply["success"] is True
+    assert reply["work"]["status"] == "matched"
+    assert reply["work"]["key"] == "/works/OL1W"
+
+
+def test_work_identifiers_match_wins_over_title(mock_site, add_languages):
+    """
+    When work_identifiers match a work with a slightly different title,
+    the identifier match takes priority over title matching.
+
+    This is the core value of the feature: identifier lookup is more precise
+    than fuzzy title normalization.
+    """
+    author = {
+        "type": {"key": "/type/author"},
+        "name": "Charles Dickens",
+        "key": "/authors/OL1A",
+    }
+    # Work whose title would NOT match the import rec's title via mk_norm
+    work_with_id = {
+        "authors": [{"author": "/authors/OL1A", "type": {"key": "/type/author_role"}}],
+        "key": "/works/OL1W",
+        "title": "A Christmas Carol: A Ghost Story of Christmas",
+        "type": {"key": "/type/work"},
+        "identifiers": {"goodreads": ["5326"]},
+    }
+    # Work whose title WOULD match — but no matching identifier
+    work_title_match = {
+        "authors": [{"author": "/authors/OL1A", "type": {"key": "/type/author_role"}}],
+        "key": "/works/OL2W",
+        "title": "A Christmas Carol",
+        "type": {"key": "/type/work"},
+    }
+    mock_site.save(author)
+    mock_site.save(work_with_id)
+    mock_site.save(work_title_match)
+
+    rec = {
+        "source_records": ["goodreads:5326-edition"],
+        "title": "A Christmas Carol",
+        "authors": [{"name": "Charles Dickens"}],
+        "publishers": ["Chapman & Hall"],
+        "publish_date": "1843",
+        "work_identifiers": {"goodreads": ["5326"]},
+    }
+
+    reply = load(rec)
+    assert reply["success"] is True
+    assert reply["work"]["status"] == "matched"
+    assert reply["work"]["key"] == "/works/OL1W"  # identifier match, not title match
+
+
+def test_work_identifiers_enriched_onto_matched_work(mock_site, add_languages):
+    """
+    When a work is matched via identifiers, a new identifier from the import
+    record that isn't already on the work gets written back to the work.
+
+    E.g. a LibriVox import that matches via Goodreads ID can simultaneously
+    add a librivox identifier to the work.
+    """
+    author = {
+        "type": {"key": "/type/author"},
+        "name": "Charles Dickens",
+        "key": "/authors/OL1A",
+    }
+    existing_work = {
+        "authors": [{"author": "/authors/OL1A", "type": {"key": "/type/author_role"}}],
+        "key": "/works/OL1W",
+        "title": "A Christmas Carol",
+        "type": {"key": "/type/work"},
+        "identifiers": {"goodreads": ["5326"]},
+    }
+    mock_site.save(author)
+    mock_site.save(existing_work)
+
+    rec = {
+        "source_records": ["librivox:843"],
+        "title": "A Christmas Carol",
+        "authors": [{"name": "Charles Dickens"}],
+        "publishers": ["LibriVox"],
+        "publish_date": "2006",
+        "work_identifiers": {"goodreads": ["5326"], "librivox": ["843"]},
+    }
+
+    reply = load(rec)
+    assert reply["success"] is True
+    assert reply["work"]["key"] == "/works/OL1W"
+    work = mock_site.get("/works/OL1W")
+    assert "librivox" in work.get("identifiers", {})
+    assert "843" in work["identifiers"]["librivox"]
+    # Original identifier preserved
+    assert "5326" in work["identifiers"]["goodreads"]
+
+
+def test_work_identifiers_no_match_falls_back_to_title(mock_site, add_languages):
+    """
+    When work_identifiers are provided but no existing work has them,
+    matching falls back to title normalization as before.
+    """
+    author = {
+        "type": {"key": "/type/author"},
+        "name": "Charles Dickens",
+        "key": "/authors/OL1A",
+    }
+    existing_work = {
+        "authors": [{"author": "/authors/OL1A", "type": {"key": "/type/author_role"}}],
+        "key": "/works/OL1W",
+        "title": "A Christmas Carol",
+        "type": {"key": "/type/work"},
+    }
+    mock_site.save(author)
+    mock_site.save(existing_work)
+
+    rec = {
+        "source_records": ["goodreads:unknown-edition"],
+        "title": "A Christmas Carol",
+        "authors": [{"name": "Charles Dickens"}],
+        "publishers": ["Chapman & Hall"],
+        "publish_date": "1843",
+        "work_identifiers": {"goodreads": ["99999"]},  # unknown ID
+    }
+
+    reply = load(rec)
+    assert reply["success"] is True
+    assert reply["work"]["status"] == "matched"
+    assert reply["work"]["key"] == "/works/OL1W"  # fell back to title match
+
+
+def test_work_identifiers_absent_preserves_existing_behaviour(mock_site, add_languages):
+    """
+    Records with no work_identifiers field work exactly as before — title matching only.
+    """
+    author = {
+        "type": {"key": "/type/author"},
+        "name": "John Smith",
+        "key": "/authors/OL20A",
+    }
+    existing_work = {
+        "authors": [{"author": "/authors/OL20A", "type": {"key": "/type/author_role"}}],
+        "key": "/works/OL16W",
+        "title": "Finding existing works",
+        "type": {"key": "/type/work"},
+    }
+    mock_site.save(author)
+    mock_site.save(existing_work)
+
+    rec = {
+        "source_records": "non-marc:test",
+        "title": "Finding Existing Works",
+        "authors": [{"name": "John Smith"}],
+        "publishers": ["Black Spot"],
+        "publish_date": "Jan 09, 2011",
+        "isbn_10": ["1250144051"],
+    }
+
+    reply = load(rec)
+    assert reply["success"] is True
+    assert reply["work"]["status"] == "matched"
+    assert reply["work"]["key"] == "/works/OL16W"
+
+
 def test_existing_work_with_subtitle(mock_site, add_languages):
     author = {
         "type": {"key": "/type/author"},
diff --git a/openlibrary/schemata/import.schema.json b/openlibrary/schemata/import.schema.json
index 3f00e90235d..188704d715f 100644
--- a/openlibrary/schemata/import.schema.json
+++ b/openlibrary/schemata/import.schema.json
@@ -113,7 +113,7 @@
       "patternProperties": {
         "^\\w+": { "$ref": "shared_definitions.json#/string_array" }
       },
-      "description": "Unique identifiers used by external sites to identify a book. Used by Open Library to link offsite.",
+      "description": "Unique identifiers used by external sites to identify a book (edition). Used by Open Library to link offsite.",
       "examples": [
         {
             "standard_ebooks": ["leo-tolstoy/what-is-art/aylmer-maude"]
@@ -123,6 +123,21 @@
         }
       ]
     },
+    "work_identifiers": {
+      "type": "object",
+      "patternProperties": {
+        "^\\w+": { "$ref": "shared_definitions.json#/string_array" }
+      },
+      "description": "Identifiers for the Work (as distinct from the Edition). Used to match an import record against an existing OL Work. E.g. Goodreads work ID, Wikidata Q-number.",
+      "examples": [
+        {
+            "goodreads": ["1128434"]
+        },
+        {
+            "wikidata": ["Q8337"]
+        }
+      ]
+    },
     "cover": {
       "type": "string",
       "description": "URL for an edition's cover",

From 8cdccfb3c97ec5514c752fcf38e54e598a7bb101 Mon Sep 17 00:00:00 2001
From: "Michael E. Karpeles (Mek)" <michael.karpeles@gmail.com>
Date: Tue, 16 Jun 2026 18:14:33 -0600
Subject: [PATCH 2/2] fix(import): write back work_identifiers to matched work
 in load_data()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a record with work_identifiers matched an existing work via load_data()
(the no-edition-match code path), the identifiers were never merged onto the
work object. Only the load() → update_work_with_rec_data() path handled this.

Merges incoming identifiers into the work, deduplicates, and marks for update.
---
 openlibrary/catalog/add_book/__init__.py | 26 +++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/openlibrary/catalog/add_book/__init__.py b/openlibrary/catalog/add_book/__init__.py
index ac924d50eb4..688a63b0543 100644
--- a/openlibrary/catalog/add_book/__init__.py
+++ b/openlibrary/catalog/add_book/__init__.py
@@ -705,6 +705,22 @@ def load_data(  # noqa: PLR0912, PLR0915
         if cover_id:
             work.setdefault("covers", []).append(cover_id)
             need_update = True
+        if "work_identifiers" in rec:
+            existing_raw = work.get("identifiers") or {}
+            # work is a Thing object; .dict() normalizes it to a plain dict for comparison
+            existing = existing_raw.dict() if hasattr(existing_raw, "dict") else dict(existing_raw)
+            # Only enrich if this work was matched via one of the rec's identifiers.
+            # Title-based fallback matches don't confirm the work_identifiers apply here.
+            if any(v in (existing.get(k) or []) for k, vals in rec["work_identifiers"].items() for v in vals):
+                # Deep-copy lists to avoid aliasing existing[k] through the defaultdict
+                identifiers = defaultdict(list, {k: list(v) for k, v in existing.items()})
+                for k, vals in rec["work_identifiers"].items():
+                    identifiers[k].extend(vals)
+                    identifiers[k] = list(set(identifiers[k]))
+                new_ids = dict(identifiers)
+                if existing != new_ids:
+                    work["identifiers"] = new_ids
+                    need_update = True
         if need_update:
             work_state = "modified"
             edits.append(work.dict())
@@ -948,12 +964,16 @@ def update_work_with_rec_data(rec: dict, edition: Edition, work: dict[str, Any],
 
     # Add new work_identifiers, merging with any already on the work.
     if "work_identifiers" in rec:
-        identifiers = defaultdict(list, work.get("identifiers", {}))
+        existing_raw = work.get("identifiers") or {}
+        existing = existing_raw.dict() if hasattr(existing_raw, "dict") else dict(existing_raw)
+        # Deep-copy lists to avoid aliasing existing[k] through the defaultdict
+        identifiers = defaultdict(list, {k: list(v) for k, v in existing.items()})
         for k, vals in rec["work_identifiers"].items():
             identifiers[k].extend(vals)
             identifiers[k] = list(set(identifiers[k]))
-        if work.get("identifiers") != identifiers:
-            work["identifiers"] = dict(identifiers)
+        new_ids = dict(identifiers)
+        if existing != new_ids:
+            work["identifiers"] = new_ids
             need_work_save = True
 
     return need_work_save