dprodger · dprodger · May 31, 2026 · May 31, 2026 · May 31, 2026 · May 31, 2026
diff --git a/backend/integrations/wikipedia/utils.py b/backend/integrations/wikipedia/utils.py
@@ -19,6 +19,26 @@
 
 logger = logging.getLogger(__name__)
 
+# Terms that mark a page's primary subject as a non-musician (actor, athlete,
+# politician, ...). A match rejects the page only when NO _MUSICIAN_TERMS entry
+# is also present in the infobox/lead, so keep the list focused. Order matters:
+# the first term that matches is the one quoted in the rejection reason.
+_NON_MUSICIAN_TERMS = [
+    'actor', 'actress', 'filmmaker', 'screenwriter', 'comedian',
+    'basketball', 'footballer', 'baseball', 'quarterback', 'athlete',
+    'politician', 'senator', 'congressman', 'governor', 'mayor', 'president',
+    'novelist', 'painter', 'sculptor', 'economist', 'physicist', 'philosopher',
+]
+
+# Music terms whose presence in the infobox/lead overrides a non-musician
+# match, protecting genuine musicians (e.g. 'jazz organist') from rejection.
+_MUSICIAN_TERMS = [
+    'musician', 'singer', 'vocalist', 'pianist', 'organist', 'guitarist',
+    'bassist', 'drummer', 'saxophonist', 'trumpeter', 'trombonist',
+    'composer', 'bandleader', 'jazz', 'blues', 'bebop', 'swing',
+]
+
+
 class WikipediaSearcher:
     """Shared Wikipedia search functionality with caching"""
 
@@ -252,6 +272,31 @@ def rate_limit(self):
             time.sleep(sleep_time)
         self.last_request_time = time.time()
 
+    def _strip_nickname(self, name):
+        """Return *name* without a decorative quoted nickname.
+
+        Stored performer names may carry a nickname in quotes, such as
+        '“Brother” Jack McDuff' or '‘Papa’ John DeFrancesco'; Wikipedia titles
+        and our name matching want the plain form ('Jack McDuff'). Curly
+        double quotes are first normalized to straight ones, then every paired
+        "..." span and every paired ‘...’ span is removed and whitespace is
+        collapsed. A lone straight apostrophe (O'Brien, 'Night Sweet Pea) is
+        never treated as a quote, so such names keep all their words.
+
+        The stripped form is returned only if it keeps at least two tokens
+        (a plausible first + last name). When stripping would leave a single
+        bare surname (e.g. '‘Doc’ West' -> 'West'), the original name is
+        returned, trimmed: a lone surname is too generic and fuzzy-matches
+        unrelated famous people ('West' -> Kanye West).
+        """
+        text = name.translate(str.maketrans('“”', '""'))
+        for quoted in (r'"[^"]*"', r'‘[^’]*’'):   # "nickname", then ‘nickname’
+            text = re.sub(quoted, ' ', text)
+        tokens = text.split()
+        if len(tokens) < 2:
+            return name.strip()
+        return ' '.join(tokens)
+
     def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
         """
         Verify that a Wikipedia URL is valid and refers to the correct performer
@@ -283,6 +328,12 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
             # Get the main content area (skip navigation/menus)
             content_div = soup.find('div', {'id': 'mw-content-text'}) or soup.find('div', {'class': 'mw-parser-output'})
             if content_div:
+                # Drop hatnotes ("For the musician, see ...") before reading the
+                # text: they are cross-references to OTHER subjects, and letting
+                # their keywords leak in mis-scores the page (e.g. the actor Kirk
+                # Douglas hatnote mentions "musician" and points at the real one).
+                for hatnote in content_div.select('div.hatnote, .hatnote'):
+                    hatnote.decompose()
                 page_text = content_div.get_text().lower()
             else:
                 page_text = soup.get_text().lower()
@@ -339,10 +390,11 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
             reasons = []
 
             # Check name similarity
+            page_title_text = ''
             page_title = soup.find('h1', {'id': 'firstHeading'})
             if page_title:
                 page_title_text = page_title.get_text().strip()
-                
+
                 # Check if the title disambiguation clearly indicates a NON-musician
                 # Extract the disambiguation term in parentheses (e.g., "(basketball)" from "Sam Jones (basketball)")
                 disambiguation_match = re.search(r'\(([^)]+)\)$', page_title_text)
@@ -381,9 +433,13 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
                             'score': 0
                         }
 
-                # Remove disambiguation parentheses like "(saxophonist)"
-                page_name = re.sub(r'\s*\([^)]*\)\s*$', '', page_title_text).strip().lower()
-                performer_name_lower = performer_name.lower()
+                # Remove disambiguation parentheses like "(saxophonist)" and
+                # strip decorative quoted nicknames from both sides so e.g.
+                # '“Brother” Jack McDuff' matches the page titled 'Jack McDuff'
+                # as an exact (not merely partial) name match.
+                page_name = re.sub(r'\s*\([^)]*\)\s*$', '', page_title_text).strip()
+                page_name = self._strip_nickname(page_name).lower()
+                performer_name_lower = self._strip_nickname(performer_name).lower()
 
                 name_match = False
                 if page_name == performer_name_lower:
@@ -426,6 +482,7 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
                     reasons.append(f"Name mismatch: expected '{performer_name}', page is '{page_title_text}'")
 
             # Look for infobox (strong signal this is a musician page)
+            infobox_text = ''
             infobox = soup.find('table', {'class': 'infobox'})
             if infobox:
                 infobox_text = infobox.get_text().lower()
@@ -474,28 +531,65 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
                 # Generic terms only get partial credit and only if we have other signals
                 confidence_score += 5
                 reasons.append(f"Found generic music keywords: {', '.join(found_generic[:2])}")
-
+
+            # Guard: reject pages whose primary subject is clearly a non-musician
+            # (actor, athlete, politician, ...). We look only at the infobox and
+            # the first 600 characters of the page text (intended as the lead) —
+            # an incidental "musician" mention later in the body must not rescue
+            # e.g. the actor Kirk Douglas. A music term there protects musicians.
+            lead_text = page_text[:600]
+            subject_text = f"{infobox_text} {lead_text}"
+            non_musician_hits = [t for t in _NON_MUSICIAN_TERMS if self._word_in_text(t, subject_text)]
+            music_hits = [t for t in _MUSICIAN_TERMS if self._word_in_text(t, subject_text)]
+            if non_musician_hits and not music_hits:
+                logger.debug(f"Primary subject looks non-musician ({non_musician_hits[:2]}), no music signal - rejecting")
+                return {
+                    'valid': False,
+                    'confidence': 'high',
+                    'reason': f"Page subject appears to be a {non_musician_hits[0]}, not a musician",
+                    'score': 0
+                }
+
             # Check birth/death dates if available
+            has_corroboration = False
             if context.get('birth_date'):
                 birth_year = str(context['birth_date'].year) if hasattr(context['birth_date'], 'year') else str(context['birth_date'])[:4]
                 if birth_year in page_text[:2000]:
                     confidence_score += 25
+                    has_corroboration = True
                     reasons.append(f"Birth year {birth_year} found on page")
-            
+
             if context.get('death_date'):
                 death_year = str(context['death_date'].year) if hasattr(context['death_date'], 'year') else str(context['death_date'])[:4]
                 if death_year in page_text[:2000]:
                     confidence_score += 20
+                    has_corroboration = True
                     reasons.append(f"Death year {death_year} found on page")
-            
+
             # Check if any of the performer's songs are mentioned
             if context.get('sample_songs'):
-                song_mentions = [song for song in context['sample_songs'] 
+                song_mentions = [song for song in context['sample_songs']
                                if song and song.lower() in page_text]
                 if song_mentions:
                     confidence_score += 25
+                    has_corroboration = True
                     reasons.append(f"Found song references: {', '.join(song_mentions[:2])}")
-
+
+            # Guard: a parenthetically disambiguated title (e.g.
+            # "Joe Jones (Fluxus musician)") means several same-named people
+            # exist, so a bare name + generic music keywords isn't enough to
+            # know which one this is. Require corroboration (birth/death year or
+            # a song on the page) before accepting such a page.
+            title_disambiguated = bool(re.search(r'\([^)]+\)\s*$', page_title_text))
+            if title_disambiguated and not has_corroboration:
+                logger.debug(f"Disambiguated title '{page_title_text}' without corroboration - not accepting")
+                return {
+                    'valid': False,
+                    'confidence': 'low',
+                    'reason': f"Disambiguated page '{page_title_text}' needs birth/death or song corroboration (score: {confidence_score})",
+                    'score': confidence_score
+                }
+
             # Determine validity based on confidence score
             # Require at least 50 points (medium confidence) to accept
             if confidence_score >= 50:
@@ -597,94 +691,93 @@ def _string_similarity(self, s1, s2):
         return 1.0 - (edit_distance / max_len)
 
 
+    def _opensearch(self, query):
+        """Return candidate Wikipedia article URLs for a query via the
+        OpenSearch API, honoring the 7-day search cache.
+
+        Sets self.last_made_api_call. Returns a list of URLs; a genuine empty
+        list is cached so bulk re-runs skip it. Returns None, caching nothing,
+        if the lookup failed (request error, non-200, non-JSON body, or a JSON
+        error object), so callers can tell 'no results' from 'lookup failed'.
+        """
+        if not self.force_refresh:
+            cached = self._load_search_from_cache(query)
+            if cached is not None:
+                self.last_made_api_call = False
+                return cached
+
+        self.last_made_api_call = True
+        self.rate_limit()
+        try:
+            response = self.session.get(
+                "https://en.wikipedia.org/w/api.php",
+                params={'action': 'opensearch', 'search': query,
+                        'limit': 5, 'namespace': 0, 'format': 'json'},
+                timeout=10)
+            data = response.json() if response.status_code == 200 else None
+        except (requests.RequestException, ValueError) as e:
+            logger.warning(f"  OpenSearch request failed for {query!r}: {e}")
+            return None
+
+        # A non-list payload (non-200, JSON error object) is a failed lookup.
+        if not isinstance(data, list):
+            return None
+        urls = data[3] if len(data) >= 4 and data[3] else []
+        if not self.force_refresh:
+            self._save_search_to_cache(query, urls)
+        return urls
+
     def search_wikipedia(self, performer_name, context):
         """
         Search Wikipedia for a performer
-        
+
         Args:
             performer_name: Name to search for
             context: Dict with additional info for verification
-            
+
         Returns:
             Wikipedia URL if found with reasonable confidence, None otherwise
         """
         try:
-            # Check cache first (unless force_refresh is enabled)
-            if not self.force_refresh:
-                cached_results = self._load_search_from_cache(performer_name)
-                if cached_results is not None:  # Changed from 'if cached_results:' to handle empty lists
-                    self.last_made_api_call = False  # Using cached search results
-                    if not cached_results:  # Empty list means no results found previously
-                        logger.debug(f"  Using cached empty search results (no Wikipedia page found)")
-                        return None
-                    logger.debug(f"  Using cached search results")
-                    urls = cached_results
-                else:
-                    # Perform API search
-                    self.last_made_api_call = True
-                    search_url = "https://en.wikipedia.org/w/api.php"
-                    params = {
-                        'action': 'opensearch',
-                        'search': performer_name,
-                        'limit': 5,
-                        'namespace': 0,
-                        'format': 'json'
-                    }
-
-                    self.rate_limit()
-                    response = self.session.get(search_url, params=params, timeout=10)
-
-                    if response.status_code != 200:
-                        return None
-
-                    data = response.json()
-                    if len(data) < 4 or not data[3]:
-                        # Cache empty results so we don't search again
-                        self._save_search_to_cache(performer_name, [])
-                        return None
-
-                    urls = data[3]
-
-                    # Save to cache
-                    self._save_search_to_cache(performer_name, urls)
-            else:
-                # Force refresh - skip cache
-                self.last_made_api_call = True
-                search_url = "https://en.wikipedia.org/w/api.php"
-                params = {
-                    'action': 'opensearch',
-                    'search': performer_name,
-                    'limit': 5,
-                    'namespace': 0,
-                    'format': 'json'
-                }
-
-                self.rate_limit()
-                response = self.session.get(search_url, params=params, timeout=10)
-
-                if response.status_code != 200:
-                    return None
-
-                data = response.json()
-                if len(data) < 4 or not data[3]:
-                    return None
-
-                urls = data[3]
+            # Search the nickname-stripped (more canonical) form first, then
+            # the name as stored. e.g. '“Brother” Jack McDuff' searches
+            # 'Jack McDuff' first so the canonical article is found and
+            # preferred over an album/redirect that the decorated name returns.
+            queries = []
+            stripped = self._strip_nickname(performer_name)
+            if stripped.lower() != performer_name.lower():
+                queries.append(stripped)
+            queries.append(performer_name)
+
+            candidate_urls = []
+            any_api_call = False
+            for query in queries:
+                urls = self._opensearch(query)
+                any_api_call = any_api_call or self.last_made_api_call
+                for url in (urls or []):
+                    if url not in candidate_urls:
+                        candidate_urls.append(url)
+            self.last_made_api_call = any_api_call
+
+            if not candidate_urls:
+                logger.debug("  No Wikipedia search results")
+                return None
 
-            # Verify each URL until we find a good match
+            # Verify each candidate until we find a good match
             # Note: verify_wikipedia_reference will also set last_made_api_call
-            for url in urls[:5]:
+            for url in candidate_urls[:8]:
                 verification = self.verify_wikipedia_reference(performer_name, url, context)
                 logger.debug(f"  Checked {url}: valid={verification['valid']}, confidence={verification['confidence']}, score={verification.get('score', 0)}, reason={verification['reason']}")
                 if verification['valid']:
                     logger.debug(f"  Found Wikipedia: {url} (confidence: {verification['confidence']}, score: {verification.get('score', 0)})")
                     logger.debug(f"    Reason: {verification['reason']}")
                     return url
-
-            # No valid URL found - cache empty results
+
+            # No candidate verified - cache empty under the stored name so a re-run
+            # skips it (URLs cached for a nickname-stripped query still re-verify).
             self._save_search_to_cache(performer_name, [])
             return None
-            
+
         except Exception as e:
             logger.error(f"Error searching Wikipedia for {performer_name}: {e}")
             return None