Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 48 additions & 3 deletions openlibrary/catalog/add_book/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,15 +196,30 @@ def split_subtitle(full_title: str):

def find_matching_work(e):
"""
Looks for an existing Work representing the new import edition by
comparing normalized titles for every work by each author of the current edition.
Returns the first match found, or None.
Looks for an existing Work representing the new import edition.

First tries to match by work_identifiers (external IDs such as Goodreads
work ID or Wikidata Q-number) if any are present in the record. Identifier
matches take priority over title matching: a caller that supplies a work
identifier is asserting they already know which Work this is.

Falls back to comparing normalized titles for every work by each author,
which is the original behaviour.

:param dict e: An OL edition suitable for saving, has a key, and has full Authors with keys
but has not yet been saved.
:rtype: None or str
:return: the matched work key "/works/OL..W" if found
"""
# --- identifier-based match (priority) ---
for identifier, vals in e.get("work_identifiers", {}).items():
for val in vals:
q = {"type": "/type/work", f"identifiers.{identifier}": val}
matches = list(site.get().things(q))
if matches:
return matches[0]

# --- title-based match (fallback) ---
seen = set()
for a in e["authors"]:
q = {"type": "/type/work", "authors": {"author": {"key": a["key"]}}}
Expand Down Expand Up @@ -690,6 +705,22 @@ def load_data( # noqa: PLR0912, PLR0915
if cover_id:
work.setdefault("covers", []).append(cover_id)
need_update = True
if "work_identifiers" in rec:
existing_raw = work.get("identifiers") or {}
# work is a Thing object; .dict() normalizes it to a plain dict for comparison
existing = existing_raw.dict() if hasattr(existing_raw, "dict") else dict(existing_raw)
# Only enrich if this work was matched via one of the rec's identifiers.
# Title-based fallback matches don't confirm the work_identifiers apply here.
if any(v in (existing.get(k) or []) for k, vals in rec["work_identifiers"].items() for v in vals):
# Deep-copy lists to avoid aliasing existing[k] through the defaultdict
identifiers = defaultdict(list, {k: list(v) for k, v in existing.items()})
for k, vals in rec["work_identifiers"].items():
identifiers[k].extend(vals)
identifiers[k] = list(set(identifiers[k]))
new_ids = dict(identifiers)
if existing != new_ids:
work["identifiers"] = new_ids
need_update = True
if need_update:
work_state = "modified"
edits.append(work.dict())
Expand Down Expand Up @@ -931,6 +962,20 @@ def update_work_with_rec_data(rec: dict, edition: Edition, work: dict[str, Any],
if work.get("authors"):
need_work_save = True

# Add new work_identifiers, merging with any already on the work.
if "work_identifiers" in rec:
existing_raw = work.get("identifiers") or {}
existing = existing_raw.dict() if hasattr(existing_raw, "dict") else dict(existing_raw)
# Deep-copy lists to avoid aliasing existing[k] through the defaultdict
identifiers = defaultdict(list, {k: list(v) for k, v in existing.items()})
for k, vals in rec["work_identifiers"].items():
identifiers[k].extend(vals)
identifiers[k] = list(set(identifiers[k]))
new_ids = dict(identifiers)
if existing != new_ids:
work["identifiers"] = new_ids
need_work_save = True

return need_work_save


Expand Down
195 changes: 195 additions & 0 deletions openlibrary/catalog/add_book/tests/test_add_book.py
Original file line number Diff line number Diff line change
Expand Up @@ -1056,6 +1056,201 @@ def test_existing_work(mock_site, add_languages):
assert e.works[0]["key"] == "/works/OL16W"


def test_work_identifiers_match_existing_work(mock_site, add_languages):
    """
    An import record carrying work_identifiers that an existing OL work
    already stores (here a Goodreads work ID) is reported as matched to
    that work.

    Motivating case: a Goodreads import where the Goodreads work ID is
    available alongside the edition ID.

    Note that the record's title matches the stored work as well, so this
    test checks only the resulting work key and status, not which matching
    strategy produced them.
    """
    dickens = {
        "type": {"key": "/type/author"},
        "name": "Charles Dickens",
        "key": "/authors/OL1A",
    }
    carol_work = {
        "authors": [{"author": "/authors/OL1A", "type": {"key": "/type/author_role"}}],
        "key": "/works/OL1W",
        "title": "A Christmas Carol",
        "type": {"key": "/type/work"},
        "identifiers": {"goodreads": ["5326"]},
    }
    # Author first, then the work that references it.
    for doc in (dickens, carol_work):
        mock_site.save(doc)

    record = {
        "source_records": ["goodreads:5326-edition"],
        # Same title as the stored work; the Goodreads ID is the intended match key.
        "title": "A Christmas Carol",
        "authors": [{"name": "Charles Dickens"}],
        "publishers": ["Chapman & Hall"],
        "publish_date": "1843",
        "work_identifiers": {"goodreads": ["5326"]},
    }

    result = load(record)
    assert result["success"] is True
    matched_work = result["work"]
    assert matched_work["status"] == "matched"
    assert matched_work["key"] == "/works/OL1W"


def test_work_identifiers_match_wins_over_title(mock_site, add_languages):
    """
    An identifier match takes priority over a title match.

    Two works by the same author are stored: one carries the Goodreads ID
    but has a longer title, the other has the exact title but no
    identifiers. The import must resolve to the work that holds the
    identifier.
    """
    dickens = {
        "type": {"key": "/type/author"},
        "name": "Charles Dickens",
        "key": "/authors/OL1A",
    }
    # Holds the identifier; its title is intended NOT to match the record's via mk_norm.
    id_bearing_work = {
        "authors": [{"author": "/authors/OL1A", "type": {"key": "/type/author_role"}}],
        "key": "/works/OL1W",
        "title": "A Christmas Carol: A Ghost Story of Christmas",
        "type": {"key": "/type/work"},
        "identifiers": {"goodreads": ["5326"]},
    }
    # Title is identical to the record's, but there are no identifiers to match on.
    title_only_work = {
        "authors": [{"author": "/authors/OL1A", "type": {"key": "/type/author_role"}}],
        "key": "/works/OL2W",
        "title": "A Christmas Carol",
        "type": {"key": "/type/work"},
    }
    for doc in (dickens, id_bearing_work, title_only_work):
        mock_site.save(doc)

    record = {
        "source_records": ["goodreads:5326-edition"],
        "title": "A Christmas Carol",
        "authors": [{"name": "Charles Dickens"}],
        "publishers": ["Chapman & Hall"],
        "publish_date": "1843",
        "work_identifiers": {"goodreads": ["5326"]},
    }

    result = load(record)
    assert result["success"] is True
    matched_work = result["work"]
    assert matched_work["status"] == "matched"
    # OL1W is the identifier match; OL2W would be the title match.
    assert matched_work["key"] == "/works/OL1W"


def test_work_identifiers_enriched_onto_matched_work(mock_site, add_languages):
    """
    Identifiers in the import record that the matched work lacks are
    written back to that work, and identifiers it already had are kept.

    Example: a LibriVox import that shares the work's Goodreads ID also
    supplies a librivox identifier, which should end up on the work.
    """
    dickens = {
        "type": {"key": "/type/author"},
        "name": "Charles Dickens",
        "key": "/authors/OL1A",
    }
    carol_work = {
        "authors": [{"author": "/authors/OL1A", "type": {"key": "/type/author_role"}}],
        "key": "/works/OL1W",
        "title": "A Christmas Carol",
        "type": {"key": "/type/work"},
        "identifiers": {"goodreads": ["5326"]},
    }
    for doc in (dickens, carol_work):
        mock_site.save(doc)

    record = {
        "source_records": ["librivox:843"],
        "title": "A Christmas Carol",
        "authors": [{"name": "Charles Dickens"}],
        "publishers": ["LibriVox"],
        "publish_date": "2006",
        # goodreads is already on the work; librivox is the new identifier.
        "work_identifiers": {"goodreads": ["5326"], "librivox": ["843"]},
    }

    result = load(record)
    assert result["success"] is True
    assert result["work"]["key"] == "/works/OL1W"

    # Re-read the stored work to inspect what was persisted.
    saved_work = mock_site.get("/works/OL1W")
    assert "librivox" in saved_work.get("identifiers", {})
    assert "843" in saved_work["identifiers"]["librivox"]
    # The identifier that was there before the import must survive the merge.
    assert "5326" in saved_work["identifiers"]["goodreads"]


def test_work_identifiers_no_match_falls_back_to_title(mock_site, add_languages):
    """
    If the record supplies work_identifiers that no stored work carries,
    matching still succeeds through the title comparison.
    """
    dickens = {
        "type": {"key": "/type/author"},
        "name": "Charles Dickens",
        "key": "/authors/OL1A",
    }
    # Stored without any identifiers, so only the title can match.
    carol_work = {
        "authors": [{"author": "/authors/OL1A", "type": {"key": "/type/author_role"}}],
        "key": "/works/OL1W",
        "title": "A Christmas Carol",
        "type": {"key": "/type/work"},
    }
    for doc in (dickens, carol_work):
        mock_site.save(doc)

    record = {
        "source_records": ["goodreads:unknown-edition"],
        "title": "A Christmas Carol",
        "authors": [{"name": "Charles Dickens"}],
        "publishers": ["Chapman & Hall"],
        "publish_date": "1843",
        # An ID that is not present on any saved work.
        "work_identifiers": {"goodreads": ["99999"]},
    }

    result = load(record)
    assert result["success"] is True
    matched_work = result["work"]
    assert matched_work["status"] == "matched"
    # Reached through the title fallback.
    assert matched_work["key"] == "/works/OL1W"


def test_work_identifiers_absent_preserves_existing_behaviour(mock_site, add_languages):
    """
    A record with no work_identifiers field is still matched to an existing
    work by title alone.
    """
    smith = {
        "type": {"key": "/type/author"},
        "name": "John Smith",
        "key": "/authors/OL20A",
    }
    stored_work = {
        "authors": [{"author": "/authors/OL20A", "type": {"key": "/type/author_role"}}],
        "key": "/works/OL16W",
        "title": "Finding existing works",
        "type": {"key": "/type/work"},
    }
    for doc in (smith, stored_work):
        mock_site.save(doc)

    # No "work_identifiers" key; the title differs from the stored work only in letter case.
    record = {
        "source_records": "non-marc:test",
        "title": "Finding Existing Works",
        "authors": [{"name": "John Smith"}],
        "publishers": ["Black Spot"],
        "publish_date": "Jan 09, 2011",
        "isbn_10": ["1250144051"],
    }

    result = load(record)
    assert result["success"] is True
    matched_work = result["work"]
    assert matched_work["status"] == "matched"
    assert matched_work["key"] == "/works/OL16W"


def test_existing_work_with_subtitle(mock_site, add_languages):
author = {
"type": {"key": "/type/author"},
Expand Down
17 changes: 16 additions & 1 deletion openlibrary/schemata/import.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@
"patternProperties": {
"^\\w+": { "$ref": "shared_definitions.json#/string_array" }
},
"description": "Unique identifiers used by external sites to identify a book. Used by Open Library to link offsite.",
"description": "Unique identifiers used by external sites to identify a book (edition). Used by Open Library to link offsite.",
"examples": [
{
"standard_ebooks": ["leo-tolstoy/what-is-art/aylmer-maude"]
Expand All @@ -123,6 +123,21 @@
}
]
},
"work_identifiers": {
"type": "object",
"patternProperties": {
"^\\w+": { "$ref": "shared_definitions.json#/string_array" }
},
"description": "Identifiers for the Work (as distinct from the Edition). Used to match an import record against an existing OL Work. E.g. Goodreads work ID, Wikidata Q-number.",
"examples": [
{
"goodreads": ["1128434"]
},
{
"wikidata": ["Q8337"]
}
]
},
"cover": {
"type": "string",
"description": "URL for an edition's cover",
Expand Down