From ddc4ef10304a3fd60c97a28587fe5d534d7c4039 Mon Sep 17 00:00:00 2001 From: David Rodger Date: Sun, 31 May 2026 09:27:17 -0400 Subject: [PATCH 1/4] Load .env in verify_performer_references script The script hand-rolls its own sys.path bootstrap instead of using ScriptBase, and so never loaded .env. db_utils then read empty DB_* vars and fell back to the local Postgres socket, failing with host=None. Add the same dotenv-loading block script_base.py uses so the script connects using the configured database. Co-Authored-By: Claude Opus 4.8 --- backend/scripts/verify_performer_references.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/backend/scripts/verify_performer_references.py b/backend/scripts/verify_performer_references.py index ac7b09f0..da5ec26e 100755 --- a/backend/scripts/verify_performer_references.py +++ b/backend/scripts/verify_performer_references.py @@ -23,6 +23,17 @@ backend_dir = Path(__file__).resolve().parent.parent sys.path.insert(0, str(backend_dir)) +# Load environment variables from .env in the backend directory (matches +# script_base.py). Without this, db_utils sees empty DB_* vars and the +# connection falls back to the local Postgres socket. +try: + from dotenv import load_dotenv + env_path = backend_dir / '.env' + if env_path.exists(): + load_dotenv(env_path) +except ImportError: + pass # python-dotenv not installed, skip + # Third-party imports import requests From 32e94805ac18e83499fdefd17b6ded10767bf9b8 Mon Sep 17 00:00:00 2001 From: David Rodger Date: Sun, 31 May 2026 09:37:38 -0400 Subject: [PATCH 2/4] Reduce verify_performer_references noise and idle time Bulk runs over 30k+ performers were dominated by per-row log lines and loop-level sleep: - Demote "unchanged"/"skipped" rows to DEBUG so the INFO stream shows only actual changes and items needing review. - Add a progress heartbeat every PROGRESS_INTERVAL (50) performers with running counts and an ETA. - Drop the 1.5s per-performer sleep. 
WikipediaSearcher.rate_limit() and the MusicBrainz path already throttle their own live requests, so the extra loop sleep was pure idle time (~13h over 30k rows). Co-Authored-By: Claude Opus 4.8 --- .../scripts/verify_performer_references.py | 42 +++++++++++++++---- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/backend/scripts/verify_performer_references.py b/backend/scripts/verify_performer_references.py index da5ec26e..b9c25f0a 100755 --- a/backend/scripts/verify_performer_references.py +++ b/backend/scripts/verify_performer_references.py @@ -53,6 +53,9 @@ ) logger = logging.getLogger(__name__) +# Emit a progress heartbeat every N performers during a full run. +PROGRESS_INTERVAL = 50 + class PerformerReferenceVerifier: """Verify and update external references for performers""" @@ -249,15 +252,18 @@ def _log_performer_status(self, name, performer_id, old_url, new_url, status): old_display = old_url if old_url else "none" new_display = new_url if new_url else "none" - # Use different formatting based on status + # Use different formatting based on status. No-op outcomes + # (unchanged / skipped) are demoted to DEBUG so the INFO stream + # highlights only actual changes and items needing review; with + # 30k+ performers the per-row lines otherwise drown out signal. 
if status == "unchanged": - logger.info(f"✓ {name} | ID: {performer_id} | Old: {old_display} | New: {new_display} | Status: {status}") + logger.debug(f"✓ {name} | ID: {performer_id} | Old: {old_display} | New: {new_display} | Status: {status}") elif status == "changed": logger.info(f"✎ {name} | ID: {performer_id} | Old: {old_display} | New: {new_display} | Status: {status}") elif status == "manual_inspection": logger.info(f"⚠ {name} | ID: {performer_id} | Old: {old_display} | New: {new_display} | Status: {status}") elif status == "skipped": - logger.info(f"⊘ {name} | ID: {performer_id} | Old: {old_display} | New: {new_display} | Status: {status}") + logger.debug(f"⊘ {name} | ID: {performer_id} | Old: {old_display} | New: {new_display} | Status: {status}") elif status == "error": logger.info(f"✗ {name} | ID: {performer_id} | Old: {old_display} | New: {new_display} | Status: {status}") else: @@ -660,17 +666,35 @@ def run(self): logger.info(f" (No performer found with ID '{self.id_filter}')") return True - logger.info(f"Found {len(performers)} performer(s) to process") + total = len(performers) + logger.info(f"Found {total} performer(s) to process") logger.info("") - + # Process each performer + start_time = time.time() for performer in performers: self.stats['performers_processed'] += 1 success, made_api_calls = self.process_performer(performer) - - # Only sleep if we made actual API calls (not cached data) - if made_api_calls: - time.sleep(1.5) + + # Heartbeat every PROGRESS_INTERVAL performers (and at the end), + # since per-row "unchanged"/"skipped" lines are now suppressed. 
+ processed = self.stats['performers_processed'] + if processed % PROGRESS_INTERVAL == 0 or processed == total: + elapsed = time.time() - start_time + rate = processed / elapsed if elapsed > 0 else 0 + eta_min = (total - processed) / rate / 60 if rate > 0 else 0 + logger.info( + f"… {processed}/{total} ({processed * 100 // total}%) | " + f"added {self.stats['references_added']}, " + f"removed {self.stats['references_removed']}, " + f"review {self.stats['invalid_references'] - self.stats['references_removed']}, " + f"errors {self.stats['errors']} | " + f"{rate:.1f}/s, ETA {eta_min:.0f}m" + ) + + # No per-performer sleep here: WikipediaSearcher.rate_limit() and + # the MusicBrainz path already throttle their own live requests, so + # an extra loop-level sleep just added idle time (~13h over 30k rows). # Print summary self.print_summary() From e3e591a03d4e50989488b13554f8d993d1d5e190 Mon Sep 17 00:00:00 2001 From: David Rodger Date: Sun, 31 May 2026 09:54:34 -0400 Subject: [PATCH 3/4] Strip quoted nicknames in Wikipedia performer matching Performer names stored with a decorative quoted nickname (e.g. "Brother" Jack McDuff, 'Papa' John DeFrancesco) failed Wikipedia lookup two ways: OpenSearch returned an album/redirect instead of the canonical article, and the name comparison scored only a partial match (45 < the 50-point threshold). - Add WikipediaSearcher._strip_nickname() - removes paired double quotes and smart single quotes. Lone apostrophes (O'Brien, D'Angelo) have no opener and are left untouched. The stripped form is only used when it still has >= 2 tokens (a plausible first + last name); collapsing to a single bare surname is rejected and the original kept, since a lone surname fuzzy-matches unrelated famous people ('Doc' West -> Kanye West, 'Bugs' Bower -> Kris Bowers). - verify_wikipedia_reference now strips nicknames from both the performer name and the page title before matching, so the stripped legal name is an exact (not partial) match. 
- search_wikipedia queries the nickname-stripped (canonical) form first, then the stored name, merging candidates - so the canonical article is found and preferred over an album/redirect. Extracted the OpenSearch call into _opensearch(), collapsing the duplicated force/non-force branches and no longer caching transient request failures. - Add unit tests for _strip_nickname, including the single-surname guard. Improves the shared searcher used by both the batch verifier and the core ingestion path. Co-Authored-By: Claude Opus 4.8 --- backend/integrations/wikipedia/utils.py | 170 +++++++++++++---------- backend/tests/test_wikipedia_nickname.py | 61 ++++++++ 2 files changed, 160 insertions(+), 71 deletions(-) create mode 100644 backend/tests/test_wikipedia_nickname.py diff --git a/backend/integrations/wikipedia/utils.py b/backend/integrations/wikipedia/utils.py index 9a82963a..306e53cc 100644 --- a/backend/integrations/wikipedia/utils.py +++ b/backend/integrations/wikipedia/utils.py @@ -252,6 +252,31 @@ def rate_limit(self): time.sleep(sleep_time) self.last_request_time = time.time() + def _strip_nickname(self, name): + """Remove a decorative quoted nickname and normalize smart quotes. + + Performer names sometimes embed a nickname in quotes, e.g. + '“Brother” Jack McDuff' or '‘Papa’ John DeFrancesco'. The quoted part is + decorative; the legal name ('Jack McDuff') is what Wikipedia titles and + our matching want. Only paired double quotes ("..." / “...”) and paired + smart single quotes (‘...’) are stripped — a lone straight apostrophe + (O'Brien, 'Night Sweet Pea) has no opener, so such names are untouched. + + The stripped form is only returned when it still has at least two + tokens (a plausible first + last name). If stripping collapses the + name to a single bare surname (e.g. 
'‘Doc’ West' -> 'West'), the + original is returned instead: a lone surname is too generic — it + fuzzy-matches unrelated famous people ('West' -> Kanye West) and + partial-matches any 'First Surname' page. + """ + s = name.replace('“', '"').replace('”', '"') + s = re.sub(r'"[^"]*"', ' ', s) # "nickname" + s = re.sub(r'‘[^’]*’', ' ', s) # ‘nickname’ + s = re.sub(r'\s+', ' ', s).strip() + if s and len(s.split()) >= 2: + return s + return name.strip() + def verify_wikipedia_reference(self, performer_name, wikipedia_url, context): """ Verify that a Wikipedia URL is valid and refers to the correct performer @@ -381,9 +406,13 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context): 'score': 0 } - # Remove disambiguation parentheses like "(saxophonist)" - page_name = re.sub(r'\s*\([^)]*\)\s*$', '', page_title_text).strip().lower() - performer_name_lower = performer_name.lower() + # Remove disambiguation parentheses like "(saxophonist)" and + # strip decorative quoted nicknames from both sides so e.g. + # '“Brother” Jack McDuff' matches the page titled 'Jack McDuff' + # as an exact (not merely partial) name match. + page_name = re.sub(r'\s*\([^)]*\)\s*$', '', page_title_text).strip() + page_name = self._strip_nickname(page_name).lower() + performer_name_lower = self._strip_nickname(performer_name).lower() name_match = False if page_name == performer_name_lower: @@ -597,94 +626,93 @@ def _string_similarity(self, s1, s2): return 1.0 - (edit_distance / max_len) + def _opensearch(self, query): + """Return candidate Wikipedia article URLs for a query via the + OpenSearch API, honoring the 7-day search cache. + + Sets self.last_made_api_call. Returns a list of URLs (possibly empty); + a genuine empty result is cached so bulk re-runs skip it. Returns None + on a transient request failure so the caller can tell 'no results' + apart from 'lookup failed' (and we avoid caching the failure). 
+ """ + if not self.force_refresh: + cached = self._load_search_from_cache(query) + if cached is not None: + self.last_made_api_call = False + return cached + + self.last_made_api_call = True + self.rate_limit() + try: + response = self.session.get( + "https://en.wikipedia.org/w/api.php", + params={'action': 'opensearch', 'search': query, + 'limit': 5, 'namespace': 0, 'format': 'json'}, + timeout=10) + except requests.RequestException as e: + logger.warning(f" OpenSearch request failed for {query!r}: {e}") + return None + + if response.status_code != 200: + return None + + data = response.json() + urls = data[3] if len(data) >= 4 and data[3] else [] + if not self.force_refresh: + self._save_search_to_cache(query, urls) + return urls + def search_wikipedia(self, performer_name, context): """ Search Wikipedia for a performer - + Args: performer_name: Name to search for context: Dict with additional info for verification - + Returns: Wikipedia URL if found with reasonable confidence, None otherwise """ try: - # Check cache first (unless force_refresh is enabled) - if not self.force_refresh: - cached_results = self._load_search_from_cache(performer_name) - if cached_results is not None: # Changed from 'if cached_results:' to handle empty lists - self.last_made_api_call = False # Using cached search results - if not cached_results: # Empty list means no results found previously - logger.debug(f" Using cached empty search results (no Wikipedia page found)") - return None - logger.debug(f" Using cached search results") - urls = cached_results - else: - # Perform API search - self.last_made_api_call = True - search_url = "https://en.wikipedia.org/w/api.php" - params = { - 'action': 'opensearch', - 'search': performer_name, - 'limit': 5, - 'namespace': 0, - 'format': 'json' - } - - self.rate_limit() - response = self.session.get(search_url, params=params, timeout=10) - - if response.status_code != 200: - return None - - data = response.json() - if len(data) < 4 or not 
data[3]: - # Cache empty results so we don't search again - self._save_search_to_cache(performer_name, []) - return None - - urls = data[3] - - # Save to cache - self._save_search_to_cache(performer_name, urls) - else: - # Force refresh - skip cache - self.last_made_api_call = True - search_url = "https://en.wikipedia.org/w/api.php" - params = { - 'action': 'opensearch', - 'search': performer_name, - 'limit': 5, - 'namespace': 0, - 'format': 'json' - } - - self.rate_limit() - response = self.session.get(search_url, params=params, timeout=10) - - if response.status_code != 200: - return None - - data = response.json() - if len(data) < 4 or not data[3]: - return None - - urls = data[3] + # Search the nickname-stripped (more canonical) form first, then + # the name as stored. e.g. '“Brother” Jack McDuff' searches + # 'Jack McDuff' first so the canonical article is found and + # preferred over an album/redirect that the decorated name returns. + queries = [] + stripped = self._strip_nickname(performer_name) + if stripped.lower() != performer_name.lower(): + queries.append(stripped) + queries.append(performer_name) + + candidate_urls = [] + any_api_call = False + for query in queries: + urls = self._opensearch(query) + any_api_call = any_api_call or self.last_made_api_call + for url in (urls or []): + if url not in candidate_urls: + candidate_urls.append(url) + self.last_made_api_call = any_api_call + + if not candidate_urls: + logger.debug(" No Wikipedia search results") + return None - # Verify each URL until we find a good match + # Verify each candidate until we find a good match # Note: verify_wikipedia_reference will also set last_made_api_call - for url in urls[:5]: + for url in candidate_urls[:8]: verification = self.verify_wikipedia_reference(performer_name, url, context) logger.debug(f" Checked {url}: valid={verification['valid']}, confidence={verification['confidence']}, score={verification.get('score', 0)}, reason={verification['reason']}") if 
verification['valid']: logger.debug(f" Found Wikipedia: {url} (confidence: {verification['confidence']}, score: {verification.get('score', 0)})") logger.debug(f" Reason: {verification['reason']}") return url - - # No valid URL found - cache empty results + + # No candidate verified - cache empty under the stored name so a + # re-run skips re-verifying (preserves bulk-run speed). self._save_search_to_cache(performer_name, []) return None - + except Exception as e: logger.error(f"Error searching Wikipedia for {performer_name}: {e}") return None \ No newline at end of file diff --git a/backend/tests/test_wikipedia_nickname.py b/backend/tests/test_wikipedia_nickname.py new file mode 100644 index 00000000..49c84738 --- /dev/null +++ b/backend/tests/test_wikipedia_nickname.py @@ -0,0 +1,61 @@ +""" +Tests for WikipediaSearcher._strip_nickname. + +Performer names sometimes carry a decorative nickname in quotes +(e.g. '“Brother” Jack McDuff'). The searcher strips that quoted segment so +the lookup and name-matching use the legal name ('Jack McDuff') — which is +what Wikipedia article titles use. Crucially, lone apostrophes in real names +(O'Brien, D'Angelo, leading-apostrophe titles) must be left untouched. + +_strip_nickname is pure (no DB/network), so these run without fixtures. +""" + +import pytest + +from integrations.wikipedia.utils import WikipediaSearcher + + +@pytest.fixture(scope="module") +def searcher(): + # Construction only sets up cache dirs + an HTTP session; no DB/network. 
+ return WikipediaSearcher() + + +@pytest.mark.parametrize( + "name, expected", + [ + # Smart double-quote nickname (the Jack McDuff case) + ("“Brother” Jack McDuff", "Jack McDuff"), + # Straight double-quote nickname + ('"Brother" Jack McDuff', "Jack McDuff"), + # Smart single-quote nickname + ("‘Papa’ John DeFrancesco", "John DeFrancesco"), + # Plain names are unchanged + ("Miles Davis", "Miles Davis"), + ("John Coltrane", "John Coltrane"), + # Lone apostrophes must NOT be treated as nickname delimiters + ("Jack O'Brien", "Jack O'Brien"), + ("D'Angelo", "D'Angelo"), + ("'Night, Sweet Pea", "'Night, Sweet Pea"), + # Trailing quoted segment (album-style title) still strips the quotes + ("“Brother” Jack McDuff Live!", "Jack McDuff Live!"), + # Whitespace left by stripping is collapsed + ("“Brother” Jack McDuff", "Jack McDuff"), + # Guard: stripping that leaves a single bare surname is rejected and + # the original is kept (a lone surname fuzzy-matches famous people: + # 'West' -> Kanye West, 'Bower' -> Kris Bowers). + ("‘Doc’ West", "‘Doc’ West"), + ("“Bumps” Myers", "“Bumps” Myers"), + ("“Bugs” Bower", "“Bugs” Bower"), + ('"Dizzy" Gillespie', '"Dizzy" Gillespie'), + ], +) +def test_strip_nickname(searcher, name, expected): + assert searcher._strip_nickname(name) == expected + + +def test_strip_nickname_never_empties(searcher): + """A name that is *only* a quoted nickname falls back to the trimmed + original rather than returning an empty string.""" + assert searcher._strip_nickname("“Brother”") == "“Brother”" + assert searcher._strip_nickname(" ") == "" From 0ed7ba3ab463513ceed6fa1b925d4d872e1d756d Mon Sep 17 00:00:00 2001 From: David Rodger Date: Sun, 31 May 2026 15:20:36 -0400 Subject: [PATCH 4/4] Reject non-musician and ambiguous Wikipedia matches Nickname stripping can yield a common "First Last" that collides with a famous *different* person, and exact-name + generic music keywords (50) was too weak to tell them apart. 
Two false positives seen in the wild: "Captain" Kirk Douglas matched the actor Kirk Douglas, and "Virginia" Joe Jones matched a different Joe Jones (the Fluxus musician). Add two precision guards to verify_wikipedia_reference: - Non-musician guard: if the infobox/lead establishes a non-musician subject (actor, athlete, politician, ...) and carries no music signal, reject. A music term in the infobox/lead protects genuine musicians (e.g. "jazz organist"), so McDuff/DeFrancesco are unaffected. - Disambiguation-corroboration guard: a parenthetically disambiguated title ("Joe Jones (Fluxus musician)") means several same-named people exist, so require a birth/death-year or song match before accepting. Also strip hatnotes ("For the musician, see ...") before reading the page text. They are cross-references to other subjects; letting their keywords leak in mis-scored pages (the Kirk Douglas actor hatnote even mentions "musician" and points at the real performer). Both bad cases now resolve to no match; McDuff, DeFrancesco and Miles Davis still verify. Adds offline unit tests (crafted HTML) for the guards. Co-Authored-By: Claude Opus 4.8 --- backend/integrations/wikipedia/utils.py | 77 +++++++++++++-- backend/tests/test_wikipedia_verify_guards.py | 98 +++++++++++++++++++ 2 files changed, 169 insertions(+), 6 deletions(-) create mode 100644 backend/tests/test_wikipedia_verify_guards.py diff --git a/backend/integrations/wikipedia/utils.py b/backend/integrations/wikipedia/utils.py index 306e53cc..f29dbe7a 100644 --- a/backend/integrations/wikipedia/utils.py +++ b/backend/integrations/wikipedia/utils.py @@ -19,6 +19,26 @@ logger = logging.getLogger(__name__) +# Terms that mark a page's primary subject as a non-musician. Used to reject +# pages whose infobox/lead clearly describe an actor, athlete, politician, etc. +# Kept focused to avoid false rejects; only applied when NO music term is also +# present in the infobox/lead. 
+_NON_MUSICIAN_TERMS = [ + 'actor', 'actress', 'filmmaker', 'screenwriter', 'comedian', + 'basketball', 'footballer', 'baseball', 'quarterback', 'athlete', + 'politician', 'senator', 'congressman', 'governor', 'mayor', 'president', + 'novelist', 'painter', 'sculptor', 'economist', 'physicist', 'philosopher', +] + +# Music terms whose presence in the infobox/lead protects a genuine musician +# from the non-musician guard above (e.g. 'jazz organist'). +_MUSICIAN_TERMS = [ + 'musician', 'singer', 'vocalist', 'pianist', 'organist', 'guitarist', + 'bassist', 'drummer', 'saxophonist', 'trumpeter', 'trombonist', + 'composer', 'bandleader', 'jazz', 'blues', 'bebop', 'swing', +] + + class WikipediaSearcher: """Shared Wikipedia search functionality with caching""" @@ -308,6 +328,12 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context): # Get the main content area (skip navigation/menus) content_div = soup.find('div', {'id': 'mw-content-text'}) or soup.find('div', {'class': 'mw-parser-output'}) if content_div: + # Drop hatnotes ("For the musician, see ...") before reading the + # text: they are cross-references to OTHER subjects, and letting + # their keywords leak in mis-scores the page (e.g. the actor Kirk + # Douglas hatnote mentions "musician" and points at the real one). 
+ for hatnote in content_div.select('div.hatnote, .hatnote'): + hatnote.decompose() page_text = content_div.get_text().lower() else: page_text = soup.get_text().lower() @@ -364,10 +390,11 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context): reasons = [] # Check name similarity + page_title_text = '' page_title = soup.find('h1', {'id': 'firstHeading'}) if page_title: page_title_text = page_title.get_text().strip() - + # Check if the title disambiguation clearly indicates a NON-musician # Extract the disambiguation term in parentheses (e.g., "(basketball)" from "Sam Jones (basketball)") disambiguation_match = re.search(r'\(([^)]+)\)$', page_title_text) @@ -455,6 +482,7 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context): reasons.append(f"Name mismatch: expected '{performer_name}', page is '{page_title_text}'") # Look for infobox (strong signal this is a musician page) + infobox_text = '' infobox = soup.find('table', {'class': 'infobox'}) if infobox: infobox_text = infobox.get_text().lower() @@ -503,28 +531,65 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context): # Generic terms only get partial credit and only if we have other signals confidence_score += 5 reasons.append(f"Found generic music keywords: {', '.join(found_generic[:2])}") - + + # Guard: reject pages whose primary subject is clearly a non-musician + # (actor, athlete, politician, ...). We look only at the infobox and + # the lead sentence — an incidental "musician" mention later in the + # body must not rescue e.g. the actor Kirk Douglas. A music term in + # the infobox/lead protects genuine musicians ('jazz organist'). 
+ lead_text = page_text[:600] + subject_text = f"{infobox_text} {lead_text}" + non_musician_hits = [t for t in _NON_MUSICIAN_TERMS if self._word_in_text(t, subject_text)] + music_hits = [t for t in _MUSICIAN_TERMS if self._word_in_text(t, subject_text)] + if non_musician_hits and not music_hits: + logger.debug(f"Primary subject looks non-musician ({non_musician_hits[:2]}), no music signal - rejecting") + return { + 'valid': False, + 'confidence': 'high', + 'reason': f"Page subject appears to be a {non_musician_hits[0]}, not a musician", + 'score': 0 + } + # Check birth/death dates if available + has_corroboration = False if context.get('birth_date'): birth_year = str(context['birth_date'].year) if hasattr(context['birth_date'], 'year') else str(context['birth_date'])[:4] if birth_year in page_text[:2000]: confidence_score += 25 + has_corroboration = True reasons.append(f"Birth year {birth_year} found on page") - + if context.get('death_date'): death_year = str(context['death_date'].year) if hasattr(context['death_date'], 'year') else str(context['death_date'])[:4] if death_year in page_text[:2000]: confidence_score += 20 + has_corroboration = True reasons.append(f"Death year {death_year} found on page") - + # Check if any of the performer's songs are mentioned if context.get('sample_songs'): - song_mentions = [song for song in context['sample_songs'] + song_mentions = [song for song in context['sample_songs'] if song and song.lower() in page_text] if song_mentions: confidence_score += 25 + has_corroboration = True reasons.append(f"Found song references: {', '.join(song_mentions[:2])}") - + + # Guard: a parenthetically disambiguated title (e.g. + # "Joe Jones (Fluxus musician)") means several same-named people + # exist, so a bare name + generic music keywords isn't enough to + # know which one this is. Require corroboration (birth/death year or + # a song on the page) before accepting such a page. 
+ title_disambiguated = bool(re.search(r'\([^)]+\)\s*$', page_title_text)) + if title_disambiguated and not has_corroboration: + logger.debug(f"Disambiguated title '{page_title_text}' without corroboration - not accepting") + return { + 'valid': False, + 'confidence': 'low', + 'reason': f"Disambiguated page '{page_title_text}' needs birth/death or song corroboration (score: {confidence_score})", + 'score': confidence_score + } + # Determine validity based on confidence score # Require at least 50 points (medium confidence) to accept if confidence_score >= 50: diff --git a/backend/tests/test_wikipedia_verify_guards.py b/backend/tests/test_wikipedia_verify_guards.py new file mode 100644 index 00000000..625189ff --- /dev/null +++ b/backend/tests/test_wikipedia_verify_guards.py @@ -0,0 +1,98 @@ +""" +Tests for the precision guards in WikipediaSearcher.verify_wikipedia_reference. + +These guard against nickname-stripped names colliding with a famous *different* +person: + +- Non-musician guard: a page whose infobox/lead describes an actor/athlete/etc. + (and has no music signal) is rejected — catches the actor Kirk Douglas. +- Disambiguation-corroboration guard: a parenthetically disambiguated title + ("Joe Jones (Fluxus musician)") needs a birth/death-year or song match before + it's accepted — catches the wrong Joe Jones. +- Hatnote stripping: cross-reference hatnotes ("For the musician, see ...") are + removed before scoring so the *other* subject's keywords don't leak in. + +The page fetch is monkeypatched with crafted HTML, so these run offline. +""" + +import pytest + +from integrations.wikipedia.utils import WikipediaSearcher + + +@pytest.fixture(scope="module") +def searcher(): + return WikipediaSearcher() + + +def _page(title, lead, occupation=None, hatnote=None): + """Minimal Wikipedia-shaped HTML: h1 heading, optional hatnote, optional + infobox with an Occupation row, and a lead paragraph.""" + infobox = ( + f'<table class="infobox">' + f'<tr><th>Occupation</th><td>{occupation}</td></tr></table>' if occupation else '' + ) + hat = f'<div class="hatnote">{hatnote}</div>' if hatnote else '' + return ( + '<html><body>' + f'<h1 id="firstHeading">{title}</h1>' + '<div id="mw-content-text">' + f'{hat}{infobox}<p>{lead}</p>' + '</div></body></html>' + ) + + +def _verify(searcher, monkeypatch, performer, html, context=None): + monkeypatch.setattr(searcher, "_fetch_wikipedia_page", lambda url: html) + ctx = context or {"birth_date": None, "death_date": None, "sample_songs": []} + return searcher.verify_wikipedia_reference( + performer, "https://en.wikipedia.org/wiki/X", ctx + ) + + +def test_non_musician_subject_rejected(searcher, monkeypatch): + html = _page("John Smith", + "John Smith was an American actor and filmmaker.", + occupation="Actor, filmmaker") + result = _verify(searcher, monkeypatch, "John Smith", html) + assert result["valid"] is False + assert result["score"] == 0 + + +def test_musician_with_incidental_non_music_word_kept(searcher, monkeypatch): + # A music term in the lead protects a genuine musician even if a + # non-musician word also appears. + html = _page("Jane Doe", + "Jane Doe was an American jazz organist and occasional actor.", + occupation="Musician") + result = _verify(searcher, monkeypatch, "Jane Doe", html) + assert result["valid"] is True + + +def test_disambiguated_title_without_corroboration_rejected(searcher, monkeypatch): + html = _page("Joe Test (musician)", + "Joe Test was an American jazz drummer.", + occupation="Musician") + result = _verify(searcher, monkeypatch, "Joe Test", html) + assert result["valid"] is False + + +def test_disambiguated_title_with_song_corroboration_accepted(searcher, monkeypatch): + html = _page("Joe Test (musician)", + "Joe Test was an American jazz drummer known for Blue Moon.", + occupation="Musician") + ctx = {"birth_date": None, "death_date": None, "sample_songs": ["Blue Moon"]} + result = _verify(searcher, monkeypatch, "Joe Test", html, ctx) + assert result["valid"] is True + + +def test_hatnote_keywords_do_not_rescue_non_musician(searcher, monkeypatch): + # The hatnote mentions "musician" and points elsewhere; it must be stripped + # so the actual (actor) subject is still rejected. 
+ html = _page("Bob Star", + "Bob Star was an American actor.", + occupation="Actor", + hatnote="For the musician, see Bob Star (bandleader).") + result = _verify(searcher, monkeypatch, "Bob Star", html) + assert result["valid"] is False + assert result["score"] == 0