From ddc4ef10304a3fd60c97a28587fe5d534d7c4039 Mon Sep 17 00:00:00 2001
From: David Rodger <dave@davidrodger.com>
Date: Sun, 31 May 2026 09:27:17 -0400
Subject: [PATCH 1/4] Load .env in verify_performer_references script

The script hand-rolls its own sys.path bootstrap instead of using
ScriptBase, and so never loaded .env. db_utils then read empty DB_*
vars and fell back to the local Postgres socket, failing with
host=None. Add the same dotenv-loading block script_base.py uses so
the script connects using the configured database.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 backend/scripts/verify_performer_references.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/backend/scripts/verify_performer_references.py b/backend/scripts/verify_performer_references.py
index ac7b09f0..da5ec26e 100755
--- a/backend/scripts/verify_performer_references.py
+++ b/backend/scripts/verify_performer_references.py
@@ -23,6 +23,17 @@
 backend_dir = Path(__file__).resolve().parent.parent
 sys.path.insert(0, str(backend_dir))
 
+# Load environment variables from .env in the backend directory (matches
+# script_base.py). Without this, db_utils sees empty DB_* vars and the
+# connection falls back to the local Postgres socket.
+try:
+    from dotenv import load_dotenv
+    env_path = backend_dir / '.env'
+    if env_path.exists():
+        load_dotenv(env_path)
+except ImportError:
+    pass  # python-dotenv not installed, skip
+
 # Third-party imports
 import requests
 

From 32e94805ac18e83499fdefd17b6ded10767bf9b8 Mon Sep 17 00:00:00 2001
From: David Rodger <dave@davidrodger.com>
Date: Sun, 31 May 2026 09:37:38 -0400
Subject: [PATCH 2/4] Reduce verify_performer_references noise and idle time

Bulk runs over 30k+ performers were dominated by per-row log lines and
loop-level sleep:

- Demote "unchanged"/"skipped" rows to DEBUG so the INFO stream shows
  only actual changes and items needing review.
- Add a progress heartbeat every PROGRESS_INTERVAL (50) performers with
  running counts and an ETA.
- Drop the 1.5s per-performer sleep. WikipediaSearcher.rate_limit() and
  the MusicBrainz path already throttle their own live requests, so the
  extra loop sleep was pure idle time (~13h over 30k rows).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../scripts/verify_performer_references.py    | 42 +++++++++++++++----
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/backend/scripts/verify_performer_references.py b/backend/scripts/verify_performer_references.py
index da5ec26e..b9c25f0a 100755
--- a/backend/scripts/verify_performer_references.py
+++ b/backend/scripts/verify_performer_references.py
@@ -53,6 +53,9 @@
 )
 logger = logging.getLogger(__name__)
 
+# Emit a progress heartbeat every N performers during a full run.
+PROGRESS_INTERVAL = 50
+
 
 class PerformerReferenceVerifier:
     """Verify and update external references for performers"""
@@ -249,15 +252,18 @@ def _log_performer_status(self, name, performer_id, old_url, new_url, status):
         old_display = old_url if old_url else "none"
         new_display = new_url if new_url else "none"
         
-        # Use different formatting based on status
+        # Use different formatting based on status. No-op outcomes
+        # (unchanged / skipped) are demoted to DEBUG so the INFO stream
+        # highlights only actual changes and items needing review; with
+        # 30k+ performers the per-row lines otherwise drown out signal.
         if status == "unchanged":
-            logger.info(f"✓ {name} | ID: {performer_id} | Old: {old_display} | New: {new_display} | Status: {status}")
+            logger.debug(f"✓ {name} | ID: {performer_id} | Old: {old_display} | New: {new_display} | Status: {status}")
         elif status == "changed":
             logger.info(f"✎ {name} | ID: {performer_id} | Old: {old_display} | New: {new_display} | Status: {status}")
         elif status == "manual_inspection":
             logger.info(f"⚠ {name} | ID: {performer_id} | Old: {old_display} | New: {new_display} | Status: {status}")
         elif status == "skipped":
-            logger.info(f"⊘ {name} | ID: {performer_id} | Old: {old_display} | New: {new_display} | Status: {status}")
+            logger.debug(f"⊘ {name} | ID: {performer_id} | Old: {old_display} | New: {new_display} | Status: {status}")
         elif status == "error":
             logger.info(f"✗ {name} | ID: {performer_id} | Old: {old_display} | New: {new_display} | Status: {status}")
         else:
@@ -660,17 +666,35 @@ def run(self):
                 logger.info(f"  (No performer found with ID '{self.id_filter}')")
             return True
         
-        logger.info(f"Found {len(performers)} performer(s) to process")
+        total = len(performers)
+        logger.info(f"Found {total} performer(s) to process")
         logger.info("")
-        
+
         # Process each performer
+        start_time = time.time()
         for performer in performers:
             self.stats['performers_processed'] += 1
             success, made_api_calls = self.process_performer(performer)
-            
-            # Only sleep if we made actual API calls (not cached data)
-            if made_api_calls:
-                time.sleep(1.5)
+
+            # Heartbeat every PROGRESS_INTERVAL performers (and at the end),
+            # since per-row "unchanged"/"skipped" lines are now suppressed.
+            processed = self.stats['performers_processed']
+            if processed % PROGRESS_INTERVAL == 0 or processed == total:
+                elapsed = time.time() - start_time
+                rate = processed / elapsed if elapsed > 0 else 0
+                eta_min = (total - processed) / rate / 60 if rate > 0 else 0
+                logger.info(
+                    f"… {processed}/{total} ({processed * 100 // total}%) | "
+                    f"added {self.stats['references_added']}, "
+                    f"removed {self.stats['references_removed']}, "
+                    f"review {self.stats['invalid_references'] - self.stats['references_removed']}, "
+                    f"errors {self.stats['errors']} | "
+                    f"{rate:.1f}/s, ETA {eta_min:.0f}m"
+                )
+
+            # No per-performer sleep here: WikipediaSearcher.rate_limit() and
+            # the MusicBrainz path already throttle their own live requests, so
+            # an extra loop-level sleep just added idle time (~13h over 30k rows).
         
         # Print summary
         self.print_summary()

From e3e591a03d4e50989488b13554f8d993d1d5e190 Mon Sep 17 00:00:00 2001
From: David Rodger <dave@davidrodger.com>
Date: Sun, 31 May 2026 09:54:34 -0400
Subject: [PATCH 3/4] Strip quoted nicknames in Wikipedia performer matching

Performer names stored with a decorative quoted nickname (e.g.
"Brother" Jack McDuff, ‘Papa’ John DeFrancesco) failed Wikipedia lookup
in two ways: OpenSearch returned an album/redirect instead of the
canonical article, and the name comparison scored only a partial match
(45, below the 50-point threshold).

- Add WikipediaSearcher._strip_nickname() - removes a nickname enclosed
  in paired double quotes (straight or smart) or paired smart single
  quotes. Straight apostrophes (O'Brien, D'Angelo) are never treated as
  delimiters, so such names are left untouched. The stripped form is only
  used when it still has >= 2 tokens (a plausible first + last name);
  collapsing to a single bare surname is rejected and the original kept,
  since a lone surname fuzzy-matches unrelated famous people
  (‘Doc’ West -> Kanye West, “Bugs” Bower -> Kris Bowers).
- verify_wikipedia_reference now strips nicknames from both the performer
  name and the page title before matching, so the stripped legal name is
  an exact (not partial) match.
- search_wikipedia queries the nickname-stripped (canonical) form first,
  then the stored name, merging candidates - so the canonical article is
  found and preferred over an album/redirect. Extracted the OpenSearch
  call into _opensearch(), collapsing the duplicated force/non-force
  branches and no longer caching transient request failures.
- Add unit tests for _strip_nickname, including the single-surname guard.

Improves the shared searcher used by both the batch verifier and the
core ingestion path.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 backend/integrations/wikipedia/utils.py  | 170 +++++++++++++----------
 backend/tests/test_wikipedia_nickname.py |  61 ++++++++
 2 files changed, 160 insertions(+), 71 deletions(-)
 create mode 100644 backend/tests/test_wikipedia_nickname.py

diff --git a/backend/integrations/wikipedia/utils.py b/backend/integrations/wikipedia/utils.py
index 9a82963a..306e53cc 100644
--- a/backend/integrations/wikipedia/utils.py
+++ b/backend/integrations/wikipedia/utils.py
@@ -252,6 +252,31 @@ def rate_limit(self):
             time.sleep(sleep_time)
         self.last_request_time = time.time()
 
+    def _strip_nickname(self, name):
+        """Remove a decorative quoted nickname and normalize smart quotes.
+
+        Performer names sometimes embed a nickname in quotes, e.g.
+        '“Brother” Jack McDuff' or '‘Papa’ John DeFrancesco'. The quoted part is
+        decorative; the legal name ('Jack McDuff') is what Wikipedia titles and
+        our matching want. Only paired double quotes ("..." / “...”) and paired
+        smart single quotes (‘...’) are stripped — a lone straight apostrophe
+        (O'Brien, 'Night Sweet Pea) has no opener, so such names are untouched.
+
+        The stripped form is only returned when it still has at least two
+        tokens (a plausible first + last name). If stripping collapses the
+        name to a single bare surname (e.g. '‘Doc’ West' -> 'West'), the
+        original is returned instead: a lone surname is too generic — it
+        fuzzy-matches unrelated famous people ('West' -> Kanye West) and
+        partial-matches any 'First Surname' page.
+        """
+        s = name.replace('“', '"').replace('”', '"')
+        s = re.sub(r'"[^"]*"', ' ', s)                  # "nickname"
+        s = re.sub(r'‘[^’]*’', ' ', s)   # ‘nickname’
+        s = re.sub(r'\s+', ' ', s).strip()
+        if s and len(s.split()) >= 2:
+            return s
+        return name.strip()
+
     def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
         """
         Verify that a Wikipedia URL is valid and refers to the correct performer
@@ -381,9 +406,13 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
                             'score': 0
                         }
                 
-                # Remove disambiguation parentheses like "(saxophonist)"
-                page_name = re.sub(r'\s*\([^)]*\)\s*$', '', page_title_text).strip().lower()
-                performer_name_lower = performer_name.lower()
+                # Remove disambiguation parentheses like "(saxophonist)" and
+                # strip decorative quoted nicknames from both sides so e.g.
+                # '“Brother” Jack McDuff' matches the page titled 'Jack McDuff'
+                # as an exact (not merely partial) name match.
+                page_name = re.sub(r'\s*\([^)]*\)\s*$', '', page_title_text).strip()
+                page_name = self._strip_nickname(page_name).lower()
+                performer_name_lower = self._strip_nickname(performer_name).lower()
 
                 name_match = False
                 if page_name == performer_name_lower:
@@ -597,94 +626,93 @@ def _string_similarity(self, s1, s2):
         return 1.0 - (edit_distance / max_len)
 
 
+    def _opensearch(self, query):
+        """Return candidate Wikipedia article URLs for a query via the
+        OpenSearch API, honoring the 7-day search cache.
+
+        Sets self.last_made_api_call. Returns a list of URLs (possibly empty);
+        a genuine empty result is cached so bulk re-runs skip it. Returns None
+        on a transient request failure so the caller can tell 'no results'
+        apart from 'lookup failed' (and we avoid caching the failure).
+        """
+        if not self.force_refresh:
+            cached = self._load_search_from_cache(query)
+            if cached is not None:
+                self.last_made_api_call = False
+                return cached
+
+        self.last_made_api_call = True
+        self.rate_limit()
+        try:
+            response = self.session.get(
+                "https://en.wikipedia.org/w/api.php",
+                params={'action': 'opensearch', 'search': query,
+                        'limit': 5, 'namespace': 0, 'format': 'json'},
+                timeout=10)
+        except requests.RequestException as e:
+            logger.warning(f"  OpenSearch request failed for {query!r}: {e}")
+            return None
+
+        if response.status_code != 200:
+            return None
+
+        data = response.json()
+        urls = data[3] if len(data) >= 4 and data[3] else []
+        if not self.force_refresh:
+            self._save_search_to_cache(query, urls)
+        return urls
+
     def search_wikipedia(self, performer_name, context):
         """
         Search Wikipedia for a performer
-        
+
         Args:
             performer_name: Name to search for
             context: Dict with additional info for verification
-            
+
         Returns:
             Wikipedia URL if found with reasonable confidence, None otherwise
         """
         try:
-            # Check cache first (unless force_refresh is enabled)
-            if not self.force_refresh:
-                cached_results = self._load_search_from_cache(performer_name)
-                if cached_results is not None:  # Changed from 'if cached_results:' to handle empty lists
-                    self.last_made_api_call = False  # Using cached search results
-                    if not cached_results:  # Empty list means no results found previously
-                        logger.debug(f"  Using cached empty search results (no Wikipedia page found)")
-                        return None
-                    logger.debug(f"  Using cached search results")
-                    urls = cached_results
-                else:
-                    # Perform API search
-                    self.last_made_api_call = True
-                    search_url = "https://en.wikipedia.org/w/api.php"
-                    params = {
-                        'action': 'opensearch',
-                        'search': performer_name,
-                        'limit': 5,
-                        'namespace': 0,
-                        'format': 'json'
-                    }
-                    
-                    self.rate_limit()
-                    response = self.session.get(search_url, params=params, timeout=10)
-                    
-                    if response.status_code != 200:
-                        return None
-                    
-                    data = response.json()
-                    if len(data) < 4 or not data[3]:
-                        # Cache empty results so we don't search again
-                        self._save_search_to_cache(performer_name, [])
-                        return None
-                    
-                    urls = data[3]
-                    
-                    # Save to cache
-                    self._save_search_to_cache(performer_name, urls)
-            else:
-                # Force refresh - skip cache
-                self.last_made_api_call = True
-                search_url = "https://en.wikipedia.org/w/api.php"
-                params = {
-                    'action': 'opensearch',
-                    'search': performer_name,
-                    'limit': 5,
-                    'namespace': 0,
-                    'format': 'json'
-                }
-                
-                self.rate_limit()
-                response = self.session.get(search_url, params=params, timeout=10)
-                
-                if response.status_code != 200:
-                    return None
-                
-                data = response.json()
-                if len(data) < 4 or not data[3]:
-                    return None
-                
-                urls = data[3]
+            # Search the nickname-stripped (more canonical) form first, then
+            # the name as stored. e.g. '“Brother” Jack McDuff' searches
+            # 'Jack McDuff' first so the canonical article is found and
+            # preferred over an album/redirect that the decorated name returns.
+            queries = []
+            stripped = self._strip_nickname(performer_name)
+            if stripped.lower() != performer_name.lower():
+                queries.append(stripped)
+            queries.append(performer_name)
+
+            candidate_urls = []
+            any_api_call = False
+            for query in queries:
+                urls = self._opensearch(query)
+                any_api_call = any_api_call or self.last_made_api_call
+                for url in (urls or []):
+                    if url not in candidate_urls:
+                        candidate_urls.append(url)
+            self.last_made_api_call = any_api_call
+
+            if not candidate_urls:
+                logger.debug("  No Wikipedia search results")
+                return None
 
-            # Verify each URL until we find a good match
+            # Verify each candidate until we find a good match
             # Note: verify_wikipedia_reference will also set last_made_api_call
-            for url in urls[:5]:
+            for url in candidate_urls[:8]:
                 verification = self.verify_wikipedia_reference(performer_name, url, context)
                 logger.debug(f"  Checked {url}: valid={verification['valid']}, confidence={verification['confidence']}, score={verification.get('score', 0)}, reason={verification['reason']}")
                 if verification['valid']:
                     logger.debug(f"  Found Wikipedia: {url} (confidence: {verification['confidence']}, score: {verification.get('score', 0)})")
                     logger.debug(f"    Reason: {verification['reason']}")
                     return url
-            
-            # No valid URL found - cache empty results
+
+            # No candidate verified - cache empty under the stored name so a re-run
+            # skips it. NOTE(review): a stripped-name query's cached URLs remain - confirm.
             self._save_search_to_cache(performer_name, [])
             return None
-            
+
         except Exception as e:
             logger.error(f"Error searching Wikipedia for {performer_name}: {e}")
             return None
\ No newline at end of file
diff --git a/backend/tests/test_wikipedia_nickname.py b/backend/tests/test_wikipedia_nickname.py
new file mode 100644
index 00000000..49c84738
--- /dev/null
+++ b/backend/tests/test_wikipedia_nickname.py
@@ -0,0 +1,61 @@
+"""
+Tests for WikipediaSearcher._strip_nickname.
+
+Performer names sometimes carry a decorative nickname in quotes
+(e.g. '“Brother” Jack McDuff'). The searcher strips that quoted segment so
+the lookup and name-matching use the legal name ('Jack McDuff') — which is
+what Wikipedia article titles use. Crucially, lone apostrophes in real names
+(O'Brien, D'Angelo, leading-apostrophe titles) must be left untouched.
+
+_strip_nickname is pure (no DB/network), so these need only a bare searcher fixture.
+"""
+
+import pytest
+
+from integrations.wikipedia.utils import WikipediaSearcher
+
+
+@pytest.fixture(scope="module")
+def searcher():
+    # Construction only sets up cache dirs + an HTTP session; no DB/network.
+    return WikipediaSearcher()
+
+
+@pytest.mark.parametrize(
+    "name, expected",
+    [
+        # Smart double-quote nickname (the Jack McDuff case)
+        ("“Brother” Jack McDuff", "Jack McDuff"),
+        # Straight double-quote nickname
+        ('"Brother" Jack McDuff', "Jack McDuff"),
+        # Smart single-quote nickname
+        ("‘Papa’ John DeFrancesco", "John DeFrancesco"),
+        # Plain names are unchanged
+        ("Miles Davis", "Miles Davis"),
+        ("John Coltrane", "John Coltrane"),
+        # Lone apostrophes must NOT be treated as nickname delimiters
+        ("Jack O'Brien", "Jack O'Brien"),
+        ("D'Angelo", "D'Angelo"),
+        ("'Night, Sweet Pea", "'Night, Sweet Pea"),
+        # Trailing extra words (album-style title): the quoted nickname is still stripped
+        ("“Brother” Jack McDuff Live!", "Jack McDuff Live!"),
+        # Whitespace left by stripping is collapsed
+        ("“Brother”  Jack   McDuff", "Jack McDuff"),
+        # Guard: stripping that leaves a single bare surname is rejected and
+        # the original is kept (a lone surname fuzzy-matches famous people:
+        # 'West' -> Kanye West, 'Bower' -> Kris Bowers).
+        ("‘Doc’ West", "‘Doc’ West"),
+        ("“Bumps” Myers", "“Bumps” Myers"),
+        ("“Bugs” Bower", "“Bugs” Bower"),
+        ('"Dizzy" Gillespie', '"Dizzy" Gillespie'),
+    ],
+)
+def test_strip_nickname(searcher, name, expected):
+    assert searcher._strip_nickname(name) == expected
+
+
+def test_strip_nickname_never_empties(searcher):
+    """A name that is *only* a quoted nickname falls back to the trimmed
+    original rather than an empty string; only blank input yields ''."""
+    assert searcher._strip_nickname("“Brother”") == "“Brother”"
+    assert searcher._strip_nickname("   ") == ""

From 0ed7ba3ab463513ceed6fa1b925d4d872e1d756d Mon Sep 17 00:00:00 2001
From: David Rodger <dave@davidrodger.com>
Date: Sun, 31 May 2026 15:20:36 -0400
Subject: [PATCH 4/4] Reject non-musician and ambiguous Wikipedia matches

Nickname stripping can yield a common "First Last" that collides with a
famous *different* person, and exact-name + generic music keywords (50)
was too weak to tell them apart. Two false positives seen in the wild:
"Captain" Kirk Douglas matched the actor Kirk Douglas, and "Virginia"
Joe Jones matched a different Joe Jones (the Fluxus musician).

Add two precision guards to verify_wikipedia_reference:

- Non-musician guard: if the infobox/lead establishes a non-musician
  subject (actor, athlete, politician, ...) and carries no music signal,
  reject. A music term in the infobox/lead protects genuine musicians
  (e.g. "jazz organist"), so McDuff/DeFrancesco are unaffected.
- Disambiguation-corroboration guard: a parenthetically disambiguated
  title ("Joe Jones (Fluxus musician)") means several same-named people
  exist, so require a birth/death-year or song match before accepting.

Also strip hatnotes ("For the musician, see ...") before reading the
page text. They are cross-references to other subjects, and letting
their keywords leak into the page text caused pages to be mis-scored
(the hatnote on the actor Kirk Douglas's page even mentions "musician"
and points at the real performer).

Both bad cases now resolve to no match; McDuff, DeFrancesco and Miles
Davis still verify. Adds offline unit tests (crafted HTML) for the
guards.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 backend/integrations/wikipedia/utils.py       | 77 +++++++++++++--
 backend/tests/test_wikipedia_verify_guards.py | 98 +++++++++++++++++++
 2 files changed, 169 insertions(+), 6 deletions(-)
 create mode 100644 backend/tests/test_wikipedia_verify_guards.py

diff --git a/backend/integrations/wikipedia/utils.py b/backend/integrations/wikipedia/utils.py
index 306e53cc..f29dbe7a 100644
--- a/backend/integrations/wikipedia/utils.py
+++ b/backend/integrations/wikipedia/utils.py
@@ -19,6 +19,26 @@
 
 logger = logging.getLogger(__name__)
 
+# Terms that mark a page's primary subject as a non-musician. Used to reject
+# pages whose infobox/lead clearly describe an actor, athlete, politician, etc.
+# Kept focused to avoid false rejects; only applied when NO music term is also
+# present in the infobox/lead.
+_NON_MUSICIAN_TERMS = [
+    'actor', 'actress', 'filmmaker', 'screenwriter', 'comedian',
+    'basketball', 'footballer', 'baseball', 'quarterback', 'athlete',
+    'politician', 'senator', 'congressman', 'governor', 'mayor', 'president',
+    'novelist', 'painter', 'sculptor', 'economist', 'physicist', 'philosopher',
+]
+
+# Music terms whose presence in the infobox/lead protects a genuine musician
+# from the non-musician guard above (e.g. 'jazz organist').
+_MUSICIAN_TERMS = [
+    'musician', 'singer', 'vocalist', 'pianist', 'organist', 'guitarist',
+    'bassist', 'drummer', 'saxophonist', 'trumpeter', 'trombonist',
+    'composer', 'bandleader', 'jazz', 'blues', 'bebop', 'swing',
+]
+
+
 class WikipediaSearcher:
     """Shared Wikipedia search functionality with caching"""
     
@@ -308,6 +328,12 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
             # Get the main content area (skip navigation/menus)
             content_div = soup.find('div', {'id': 'mw-content-text'}) or soup.find('div', {'class': 'mw-parser-output'})
             if content_div:
+                # Drop hatnotes ("For the musician, see ...") before reading the
+                # text: they are cross-references to OTHER subjects, and letting
+                # their keywords leak in mis-scores the page (e.g. the actor Kirk
+                # Douglas hatnote mentions "musician" and points at the real one).
+                for hatnote in content_div.select('div.hatnote, .hatnote'):
+                    hatnote.decompose()
                 page_text = content_div.get_text().lower()
             else:
                 page_text = soup.get_text().lower()
@@ -364,10 +390,11 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
             reasons = []
             
             # Check name similarity
+            page_title_text = ''
             page_title = soup.find('h1', {'id': 'firstHeading'})
             if page_title:
                 page_title_text = page_title.get_text().strip()
-                
+
                 # Check if the title disambiguation clearly indicates a NON-musician
                 # Extract the disambiguation term in parentheses (e.g., "(basketball)" from "Sam Jones (basketball)")
                 disambiguation_match = re.search(r'\(([^)]+)\)$', page_title_text)
@@ -455,6 +482,7 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
                     reasons.append(f"Name mismatch: expected '{performer_name}', page is '{page_title_text}'")
             
             # Look for infobox (strong signal this is a musician page)
+            infobox_text = ''
             infobox = soup.find('table', {'class': 'infobox'})
             if infobox:
                 infobox_text = infobox.get_text().lower()
@@ -503,28 +531,65 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
                 # Generic terms only get partial credit and only if we have other signals
                 confidence_score += 5
                 reasons.append(f"Found generic music keywords: {', '.join(found_generic[:2])}")
-            
+
+            # Guard: reject pages whose primary subject is clearly a non-musician
+            # (actor, athlete, politician, ...). We look only at the infobox and
+            # the first 600 characters of page text (roughly the lead) — a
+            # "musician" mention later in the body must not rescue e.g. the actor
+            # Kirk Douglas. A music term there protects musicians ('jazz organist').
+            lead_text = page_text[:600]
+            subject_text = f"{infobox_text} {lead_text}"
+            non_musician_hits = [t for t in _NON_MUSICIAN_TERMS if self._word_in_text(t, subject_text)]
+            music_hits = [t for t in _MUSICIAN_TERMS if self._word_in_text(t, subject_text)]
+            if non_musician_hits and not music_hits:
+                logger.debug(f"Primary subject looks non-musician ({non_musician_hits[:2]}), no music signal - rejecting")
+                return {
+                    'valid': False,
+                    'confidence': 'high',
+                    'reason': f"Page subject appears to be a {non_musician_hits[0]}, not a musician",
+                    'score': 0
+                }
+
             # Check birth/death dates if available
+            has_corroboration = False
             if context.get('birth_date'):
                 birth_year = str(context['birth_date'].year) if hasattr(context['birth_date'], 'year') else str(context['birth_date'])[:4]
                 if birth_year in page_text[:2000]:
                     confidence_score += 25
+                    has_corroboration = True
                     reasons.append(f"Birth year {birth_year} found on page")
-            
+
             if context.get('death_date'):
                 death_year = str(context['death_date'].year) if hasattr(context['death_date'], 'year') else str(context['death_date'])[:4]
                 if death_year in page_text[:2000]:
                     confidence_score += 20
+                    has_corroboration = True
                     reasons.append(f"Death year {death_year} found on page")
-            
+
             # Check if any of the performer's songs are mentioned
             if context.get('sample_songs'):
-                song_mentions = [song for song in context['sample_songs'] 
+                song_mentions = [song for song in context['sample_songs']
                                if song and song.lower() in page_text]
                 if song_mentions:
                     confidence_score += 25
+                    has_corroboration = True
                     reasons.append(f"Found song references: {', '.join(song_mentions[:2])}")
-            
+
+            # Guard: a parenthetically disambiguated title (e.g.
+            # "Joe Jones (Fluxus musician)") means several same-named people
+            # exist, so a bare name + generic music keywords isn't enough to
+            # know which one this is. Require corroboration (birth/death year or
+            # a song on the page) before accepting such a page.
+            title_disambiguated = bool(re.search(r'\([^)]+\)\s*$', page_title_text))
+            if title_disambiguated and not has_corroboration:
+                logger.debug(f"Disambiguated title '{page_title_text}' without corroboration - not accepting")
+                return {
+                    'valid': False,
+                    'confidence': 'low',
+                    'reason': f"Disambiguated page '{page_title_text}' needs birth/death or song corroboration (score: {confidence_score})",
+                    'score': confidence_score
+                }
+
             # Determine validity based on confidence score
             # Require at least 50 points (medium confidence) to accept
             if confidence_score >= 50:
diff --git a/backend/tests/test_wikipedia_verify_guards.py b/backend/tests/test_wikipedia_verify_guards.py
new file mode 100644
index 00000000..625189ff
--- /dev/null
+++ b/backend/tests/test_wikipedia_verify_guards.py
@@ -0,0 +1,98 @@
+"""
+Tests for the precision guards in WikipediaSearcher.verify_wikipedia_reference.
+
+These guard against nickname-stripped names colliding with a famous *different*
+person:
+
+- Non-musician guard: a page whose infobox/lead describes an actor/athlete/etc.
+  (and has no music signal) is rejected — catches the actor Kirk Douglas.
+- Disambiguation-corroboration guard: a parenthetically disambiguated title
+  ("Joe Jones (Fluxus musician)") needs a birth/death-year or song match before
+  it's accepted — catches the wrong Joe Jones.
+- Hatnote stripping: cross-reference hatnotes ("For the musician, see ...") are
+  removed before scoring so the *other* subject's keywords don't leak in.
+
+The page fetch is monkeypatched with crafted HTML, so these run offline.
+"""
+
+import pytest
+
+from integrations.wikipedia.utils import WikipediaSearcher
+
+
+@pytest.fixture(scope="module")
+def searcher():
+    return WikipediaSearcher()
+
+
+def _page(title, lead, occupation=None, hatnote=None):
+    """Minimal Wikipedia-shaped HTML: h1 heading, optional hatnote, optional
+    infobox with an Occupation row, and a lead paragraph."""
+    infobox = (
+        f'<table class="infobox"><tr><th>Occupation</th>'
+        f'<td>{occupation}</td></tr></table>' if occupation else ''
+    )
+    hat = f'<div class="hatnote">{hatnote}</div>' if hatnote else ''
+    return (
+        '<html><body>'
+        f'<h1 id="firstHeading">{title}</h1>'
+        '<div id="mw-content-text"><div class="mw-parser-output">'
+        f'{hat}{infobox}<p>{lead}</p>'
+        '</div></div></body></html>'
+    )
+
+
+def _verify(searcher, monkeypatch, performer, html, context=None):
+    monkeypatch.setattr(searcher, "_fetch_wikipedia_page", lambda url: html)
+    ctx = context or {"birth_date": None, "death_date": None, "sample_songs": []}
+    return searcher.verify_wikipedia_reference(
+        performer, "https://en.wikipedia.org/wiki/X", ctx
+    )
+
+
+def test_non_musician_subject_rejected(searcher, monkeypatch):
+    html = _page("John Smith",
+                 "John Smith was an American actor and filmmaker.",
+                 occupation="Actor, filmmaker")
+    result = _verify(searcher, monkeypatch, "John Smith", html)
+    assert result["valid"] is False
+    assert result["score"] == 0
+
+
+def test_musician_with_incidental_non_music_word_kept(searcher, monkeypatch):
+    # A music term in the lead protects a genuine musician even if a
+    # non-musician word also appears.
+    html = _page("Jane Doe",
+                 "Jane Doe was an American jazz organist and occasional actor.",
+                 occupation="Musician")
+    result = _verify(searcher, monkeypatch, "Jane Doe", html)
+    assert result["valid"] is True
+
+
+def test_disambiguated_title_without_corroboration_rejected(searcher, monkeypatch):
+    html = _page("Joe Test (musician)",
+                 "Joe Test was an American jazz drummer.",
+                 occupation="Musician")
+    result = _verify(searcher, monkeypatch, "Joe Test", html)
+    assert result["valid"] is False
+
+
+def test_disambiguated_title_with_song_corroboration_accepted(searcher, monkeypatch):
+    html = _page("Joe Test (musician)",
+                 "Joe Test was an American jazz drummer known for Blue Moon.",
+                 occupation="Musician")
+    ctx = {"birth_date": None, "death_date": None, "sample_songs": ["Blue Moon"]}
+    result = _verify(searcher, monkeypatch, "Joe Test", html, ctx)
+    assert result["valid"] is True
+
+
+def test_hatnote_keywords_do_not_rescue_non_musician(searcher, monkeypatch):
+    # The hatnote mentions "musician" and points elsewhere; it must be stripped
+    # so the actual (actor) subject is still rejected.
+    html = _page("Bob Star",
+                 "Bob Star was an American actor.",
+                 occupation="Actor",
+                 hatnote="For the musician, see Bob Star (bandleader).")
+    result = _verify(searcher, monkeypatch, "Bob Star", html)
+    assert result["valid"] is False
+    assert result["score"] == 0