From f6d531d3cee9d04a849f0c3e0c7fbb3013cffc35 Mon Sep 17 00:00:00 2001 From: "Michael E. Karpeles (Mek)" Date: Tue, 16 Jun 2026 17:39:22 -0600 Subject: [PATCH 1/2] feat(import): add work_identifiers support for work matching (#9472) Adds a new optional `work_identifiers` field to the import schema, parallel to `identifiers` (edition). When present, the import pipeline uses these IDs to locate the matching Work before falling back to title comparison. Changes: - import.schema.json: add `work_identifiers` field (same shape as `identifiers`) - find_matching_work(): identifier-first lookup via infobase things() query, title-based match retained as fallback - update_work_with_rec_data(): merges incoming work_identifiers onto the matched Work, deduplicated - 5 new tests covering happy path, identifier-wins-over-title conflict, enrichment write-back, unknown-identifier fallback, and absent-field backward compat --- openlibrary/catalog/add_book/__init__.py | 31 ++- .../catalog/add_book/tests/test_add_book.py | 195 ++++++++++++++++++ openlibrary/schemata/import.schema.json | 17 +- 3 files changed, 239 insertions(+), 4 deletions(-) diff --git a/openlibrary/catalog/add_book/__init__.py b/openlibrary/catalog/add_book/__init__.py index 5afc8f8a35c..ac924d50eb4 100644 --- a/openlibrary/catalog/add_book/__init__.py +++ b/openlibrary/catalog/add_book/__init__.py @@ -196,15 +196,30 @@ def split_subtitle(full_title: str): def find_matching_work(e): """ - Looks for an existing Work representing the new import edition by - comparing normalized titles for every work by each author of the current edition. - Returns the first match found, or None. + Looks for an existing Work representing the new import edition. + + First tries to match by work_identifiers (external IDs such as Goodreads + work ID or Wikidata Q-number) if any are present in the record. Identifier + matches take priority over title matching: a caller that supplies a work + identifier is asserting they already know which Work this is. + + Falls back to comparing normalized titles for every work by each author, + which is the original behaviour. :param dict e: An OL edition suitable for saving, has a key, and has full Authors with keys but has not yet been saved. :rtype: None or str :return: the matched work key "/works/OL..W" if found """ + # --- identifier-based match (priority) --- + for identifier, vals in e.get("work_identifiers", {}).items(): + for val in vals: + q = {"type": "/type/work", f"identifiers.{identifier}": val} + matches = list(site.get().things(q)) + if matches: + return matches[0] + + # --- title-based match (fallback) --- seen = set() for a in e["authors"]: q = {"type": "/type/work", "authors": {"author": {"key": a["key"]}}} @@ -931,6 +946,16 @@ def update_work_with_rec_data(rec: dict, edition: Edition, work: dict[str, Any], if work.get("authors"): need_work_save = True + # Add new work_identifiers, merging with any already on the work. + if "work_identifiers" in rec: + identifiers = defaultdict(list, work.get("identifiers", {})) + for k, vals in rec["work_identifiers"].items(): + identifiers[k].extend(vals) + identifiers[k] = list(set(identifiers[k])) + if work.get("identifiers") != identifiers: + work["identifiers"] = dict(identifiers) + need_work_save = True + return need_work_save diff --git a/openlibrary/catalog/add_book/tests/test_add_book.py b/openlibrary/catalog/add_book/tests/test_add_book.py index 2f7ab1ecc01..ff9d0185c32 100644 --- a/openlibrary/catalog/add_book/tests/test_add_book.py +++ b/openlibrary/catalog/add_book/tests/test_add_book.py @@ -1056,6 +1056,201 @@ def test_existing_work(mock_site, add_languages): assert e.works[0]["key"] == "/works/OL16W" +def test_work_identifiers_match_existing_work(mock_site, add_languages): + """ + work_identifiers in the import record match an existing OL work by its + stored identifiers (e.g. a Goodreads work ID the work already has). + The edition should be linked to that work and the identifier enriched. + + Freso's use case: importing from Goodreads where you have the Goodreads + work ID alongside the edition ID. + """ + author = { + "type": {"key": "/type/author"}, + "name": "Charles Dickens", + "key": "/authors/OL1A", + } + existing_work = { + "authors": [{"author": "/authors/OL1A", "type": {"key": "/type/author_role"}}], + "key": "/works/OL1W", + "title": "A Christmas Carol", + "type": {"key": "/type/work"}, + "identifiers": {"goodreads": ["5326"]}, + } + mock_site.save(author) + mock_site.save(existing_work) + + rec = { + "source_records": ["goodreads:5326-edition"], + "title": "A Christmas Carol", # title also matches, but identifiers win + "authors": [{"name": "Charles Dickens"}], + "publishers": ["Chapman & Hall"], + "publish_date": "1843", + "work_identifiers": {"goodreads": ["5326"]}, + } + + reply = load(rec) + assert reply["success"] is True + assert reply["work"]["status"] == "matched" + assert reply["work"]["key"] == "/works/OL1W" + + +def test_work_identifiers_match_wins_over_title(mock_site, add_languages): + """ + When work_identifiers match a work with a slightly different title, + the identifier match takes priority over title matching. + + This is the core value of the feature: identifier lookup is more precise + than fuzzy title normalization. + """ + author = { + "type": {"key": "/type/author"}, + "name": "Charles Dickens", + "key": "/authors/OL1A", + } + # Work whose title would NOT match the import rec's title via mk_norm + work_with_id = { + "authors": [{"author": "/authors/OL1A", "type": {"key": "/type/author_role"}}], + "key": "/works/OL1W", + "title": "A Christmas Carol: A Ghost Story of Christmas", + "type": {"key": "/type/work"}, + "identifiers": {"goodreads": ["5326"]}, + } + # Work whose title WOULD match — but no matching identifier + work_title_match = { + "authors": [{"author": "/authors/OL1A", "type": {"key": "/type/author_role"}}], + "key": "/works/OL2W", + "title": "A Christmas Carol", + "type": {"key": "/type/work"}, + } + mock_site.save(author) + mock_site.save(work_with_id) + mock_site.save(work_title_match) + + rec = { + "source_records": ["goodreads:5326-edition"], + "title": "A Christmas Carol", + "authors": [{"name": "Charles Dickens"}], + "publishers": ["Chapman & Hall"], + "publish_date": "1843", + "work_identifiers": {"goodreads": ["5326"]}, + } + + reply = load(rec) + assert reply["success"] is True + assert reply["work"]["status"] == "matched" + assert reply["work"]["key"] == "/works/OL1W" # identifier match, not title match + + +def test_work_identifiers_enriched_onto_matched_work(mock_site, add_languages): + """ + When a work is matched via identifiers, a new identifier from the import + record that isn't already on the work gets written back to the work. + + E.g. a LibriVox import that matches via Goodreads ID can simultaneously + add a librivox identifier to the work. + """ + author = { + "type": {"key": "/type/author"}, + "name": "Charles Dickens", + "key": "/authors/OL1A", + } + existing_work = { + "authors": [{"author": "/authors/OL1A", "type": {"key": "/type/author_role"}}], + "key": "/works/OL1W", + "title": "A Christmas Carol", + "type": {"key": "/type/work"}, + "identifiers": {"goodreads": ["5326"]}, + } + mock_site.save(author) + mock_site.save(existing_work) + + rec = { + "source_records": ["librivox:843"], + "title": "A Christmas Carol", + "authors": [{"name": "Charles Dickens"}], + "publishers": ["LibriVox"], + "publish_date": "2006", + "work_identifiers": {"goodreads": ["5326"], "librivox": ["843"]}, + } + + reply = load(rec) + assert reply["success"] is True + assert reply["work"]["key"] == "/works/OL1W" + work = mock_site.get("/works/OL1W") + assert "librivox" in work.get("identifiers", {}) + assert "843" in work["identifiers"]["librivox"] + # Original identifier preserved + assert "5326" in work["identifiers"]["goodreads"] + + +def test_work_identifiers_no_match_falls_back_to_title(mock_site, add_languages): + """ + When work_identifiers are provided but no existing work has them, + matching falls back to title normalization as before. + """ + author = { + "type": {"key": "/type/author"}, + "name": "Charles Dickens", + "key": "/authors/OL1A", + } + existing_work = { + "authors": [{"author": "/authors/OL1A", "type": {"key": "/type/author_role"}}], + "key": "/works/OL1W", + "title": "A Christmas Carol", + "type": {"key": "/type/work"}, + } + mock_site.save(author) + mock_site.save(existing_work) + + rec = { + "source_records": ["goodreads:unknown-edition"], + "title": "A Christmas Carol", + "authors": [{"name": "Charles Dickens"}], + "publishers": ["Chapman & Hall"], + "publish_date": "1843", + "work_identifiers": {"goodreads": ["99999"]}, # unknown ID + } + + reply = load(rec) + assert reply["success"] is True + assert reply["work"]["status"] == "matched" + assert reply["work"]["key"] == "/works/OL1W" # fell back to title match + + +def test_work_identifiers_absent_preserves_existing_behaviour(mock_site, add_languages): + """ + Records with no work_identifiers field work exactly as before — title matching only. + """ + author = { + "type": {"key": "/type/author"}, + "name": "John Smith", + "key": "/authors/OL20A", + } + existing_work = { + "authors": [{"author": "/authors/OL20A", "type": {"key": "/type/author_role"}}], + "key": "/works/OL16W", + "title": "Finding existing works", + "type": {"key": "/type/work"}, + } + mock_site.save(author) + mock_site.save(existing_work) + + rec = { + "source_records": "non-marc:test", + "title": "Finding Existing Works", + "authors": [{"name": "John Smith"}], + "publishers": ["Black Spot"], + "publish_date": "Jan 09, 2011", + "isbn_10": ["1250144051"], + } + + reply = load(rec) + assert reply["success"] is True + assert reply["work"]["status"] == "matched" + assert reply["work"]["key"] == "/works/OL16W" + + def test_existing_work_with_subtitle(mock_site, add_languages): author = { "type": {"key": "/type/author"}, diff --git a/openlibrary/schemata/import.schema.json b/openlibrary/schemata/import.schema.json index 3f00e90235d..188704d715f 100644 --- a/openlibrary/schemata/import.schema.json +++ b/openlibrary/schemata/import.schema.json @@ -113,7 +113,7 @@ "patternProperties": { "^\\w+": { "$ref": "shared_definitions.json#/string_array" } }, - "description": "Unique identifiers used by external sites to identify a book. Used by Open Library to link offsite.", + "description": "Unique identifiers used by external sites to identify a book (edition). Used by Open Library to link offsite.", "examples": [ { "standard_ebooks": ["leo-tolstoy/what-is-art/aylmer-maude"] @@ -123,6 +123,21 @@ } ] }, + "work_identifiers": { + "type": "object", + "patternProperties": { + "^\\w+": { "$ref": "shared_definitions.json#/string_array" } + }, + "description": "Identifiers for the Work (as distinct from the Edition). Used to match an import record against an existing OL Work. E.g. Goodreads work ID, Wikidata Q-number.", + "examples": [ + { + "goodreads": ["1128434"] + }, + { + "wikidata": ["Q8337"] + } + ] + }, "cover": { "type": "string", "description": "URL for an edition's cover", From 8cdccfb3c97ec5514c752fcf38e54e598a7bb101 Mon Sep 17 00:00:00 2001 From: "Michael E. Karpeles (Mek)" Date: Tue, 16 Jun 2026 18:14:33 -0600 Subject: [PATCH 2/2] fix(import): write back work_identifiers to matched work in load_data() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a record with work_identifiers matched an existing work via load_data() (the no-edition-match code path), the identifiers were never merged onto the work object. Only the load() → update_work_with_rec_data() path handled this. Merges incoming identifiers into the work, deduplicates, and marks for update. --- openlibrary/catalog/add_book/__init__.py | 26 +++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/openlibrary/catalog/add_book/__init__.py b/openlibrary/catalog/add_book/__init__.py index ac924d50eb4..688a63b0543 100644 --- a/openlibrary/catalog/add_book/__init__.py +++ b/openlibrary/catalog/add_book/__init__.py @@ -705,6 +705,22 @@ def load_data( # noqa: PLR0912, PLR0915 if cover_id: work.setdefault("covers", []).append(cover_id) need_update = True + if "work_identifiers" in rec: + existing_raw = work.get("identifiers") or {} + # work is a Thing object; .dict() normalizes it to a plain dict for comparison + existing = existing_raw.dict() if hasattr(existing_raw, "dict") else dict(existing_raw) + # Only enrich if this work was matched via one of the rec's identifiers. + # Title-based fallback matches don't confirm the work_identifiers apply here. + if any(v in (existing.get(k) or []) for k, vals in rec["work_identifiers"].items() for v in vals): + # Deep-copy lists to avoid aliasing existing[k] through the defaultdict + identifiers = defaultdict(list, {k: list(v) for k, v in existing.items()}) + for k, vals in rec["work_identifiers"].items(): + identifiers[k].extend(vals) + identifiers[k] = list(set(identifiers[k])) + new_ids = dict(identifiers) + if existing != new_ids: + work["identifiers"] = new_ids + need_update = True if need_update: work_state = "modified" edits.append(work.dict()) @@ -948,12 +964,16 @@ def update_work_with_rec_data(rec: dict, edition: Edition, work: dict[str, Any], # Add new work_identifiers, merging with any already on the work. if "work_identifiers" in rec: - identifiers = defaultdict(list, work.get("identifiers", {})) + existing_raw = work.get("identifiers") or {} + existing = existing_raw.dict() if hasattr(existing_raw, "dict") else dict(existing_raw) + # Deep-copy lists to avoid aliasing existing[k] through the defaultdict + identifiers = defaultdict(list, {k: list(v) for k, v in existing.items()}) for k, vals in rec["work_identifiers"].items(): identifiers[k].extend(vals) identifiers[k] = list(set(identifiers[k])) - if work.get("identifiers") != identifiers: - work["identifiers"] = dict(identifiers) + new_ids = dict(identifiers) + if existing != new_ids: + work["identifiers"] = new_ids need_work_save = True return need_work_save