Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
247 changes: 170 additions & 77 deletions backend/integrations/wikipedia/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,26 @@

logger = logging.getLogger(__name__)

# Terms that mark a page's primary subject as a non-musician. Used to reject
# pages whose infobox/lead clearly describe an actor, athlete, politician, etc.
# Kept focused to avoid false rejects; only applied when NO music term is also
# present in the infobox/lead.
_NON_MUSICIAN_TERMS = [
'actor', 'actress', 'filmmaker', 'screenwriter', 'comedian',
'basketball', 'footballer', 'baseball', 'quarterback', 'athlete',
'politician', 'senator', 'congressman', 'governor', 'mayor', 'president',
'novelist', 'painter', 'sculptor', 'economist', 'physicist', 'philosopher',
]

# Music terms whose presence in the infobox/lead protects a genuine musician
# from the non-musician guard above (e.g. 'jazz organist').
_MUSICIAN_TERMS = [
'musician', 'singer', 'vocalist', 'pianist', 'organist', 'guitarist',
'bassist', 'drummer', 'saxophonist', 'trumpeter', 'trombonist',
'composer', 'bandleader', 'jazz', 'blues', 'bebop', 'swing',
]


class WikipediaSearcher:
"""Shared Wikipedia search functionality with caching"""

Expand Down Expand Up @@ -252,6 +272,31 @@ def rate_limit(self):
time.sleep(sleep_time)
self.last_request_time = time.time()

def _strip_nickname(self, name):
"""Remove a decorative quoted nickname and normalize smart quotes.

Performer names sometimes embed a nickname in quotes, e.g.
'“Brother” Jack McDuff' or '‘Papa’ John DeFrancesco'. The quoted part is
decorative; the legal name ('Jack McDuff') is what Wikipedia titles and
our matching want. Only paired double quotes ("..." / “...”) and paired
smart single quotes (‘...’) are stripped — a lone straight apostrophe
(O'Brien, 'Night Sweet Pea) has no opener, so such names are untouched.

The stripped form is only returned when it still has at least two
tokens (a plausible first + last name). If stripping collapses the
name to a single bare surname (e.g. '‘Doc’ West' -> 'West'), the
original is returned instead: a lone surname is too generic — it
fuzzy-matches unrelated famous people ('West' -> Kanye West) and
partial-matches any 'First Surname' page.
"""
s = name.replace('“', '"').replace('”', '"')
s = re.sub(r'"[^"]*"', ' ', s) # "nickname"
s = re.sub(r'‘[^’]*’', ' ', s) # ‘nickname’
s = re.sub(r'\s+', ' ', s).strip()
if s and len(s.split()) >= 2:
return s
return name.strip()

def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
"""
Verify that a Wikipedia URL is valid and refers to the correct performer
Expand Down Expand Up @@ -283,6 +328,12 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
# Get the main content area (skip navigation/menus)
content_div = soup.find('div', {'id': 'mw-content-text'}) or soup.find('div', {'class': 'mw-parser-output'})
if content_div:
# Drop hatnotes ("For the musician, see ...") before reading the
# text: they are cross-references to OTHER subjects, and letting
# their keywords leak in mis-scores the page (e.g. the actor Kirk
# Douglas hatnote mentions "musician" and points at the real one).
for hatnote in content_div.select('div.hatnote, .hatnote'):
hatnote.decompose()
page_text = content_div.get_text().lower()
else:
page_text = soup.get_text().lower()
Expand Down Expand Up @@ -339,10 +390,11 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
reasons = []

# Check name similarity
page_title_text = ''
page_title = soup.find('h1', {'id': 'firstHeading'})
if page_title:
page_title_text = page_title.get_text().strip()

# Check if the title disambiguation clearly indicates a NON-musician
# Extract the disambiguation term in parentheses (e.g., "(basketball)" from "Sam Jones (basketball)")
disambiguation_match = re.search(r'\(([^)]+)\)$', page_title_text)
Expand Down Expand Up @@ -381,9 +433,13 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
'score': 0
}

# Remove disambiguation parentheses like "(saxophonist)"
page_name = re.sub(r'\s*\([^)]*\)\s*$', '', page_title_text).strip().lower()
performer_name_lower = performer_name.lower()
# Remove disambiguation parentheses like "(saxophonist)" and
# strip decorative quoted nicknames from both sides so e.g.
# '“Brother” Jack McDuff' matches the page titled 'Jack McDuff'
# as an exact (not merely partial) name match.
page_name = re.sub(r'\s*\([^)]*\)\s*$', '', page_title_text).strip()
page_name = self._strip_nickname(page_name).lower()
performer_name_lower = self._strip_nickname(performer_name).lower()

name_match = False
if page_name == performer_name_lower:
Expand Down Expand Up @@ -426,6 +482,7 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
reasons.append(f"Name mismatch: expected '{performer_name}', page is '{page_title_text}'")

# Look for infobox (strong signal this is a musician page)
infobox_text = ''
infobox = soup.find('table', {'class': 'infobox'})
if infobox:
infobox_text = infobox.get_text().lower()
Expand Down Expand Up @@ -474,28 +531,65 @@ def verify_wikipedia_reference(self, performer_name, wikipedia_url, context):
# Generic terms only get partial credit and only if we have other signals
confidence_score += 5
reasons.append(f"Found generic music keywords: {', '.join(found_generic[:2])}")


# Guard: reject pages whose primary subject is clearly a non-musician
# (actor, athlete, politician, ...). We look only at the infobox and
# the lead sentence — an incidental "musician" mention later in the
# body must not rescue e.g. the actor Kirk Douglas. A music term in
# the infobox/lead protects genuine musicians ('jazz organist').
lead_text = page_text[:600]
subject_text = f"{infobox_text} {lead_text}"
non_musician_hits = [t for t in _NON_MUSICIAN_TERMS if self._word_in_text(t, subject_text)]
music_hits = [t for t in _MUSICIAN_TERMS if self._word_in_text(t, subject_text)]
if non_musician_hits and not music_hits:
logger.debug(f"Primary subject looks non-musician ({non_musician_hits[:2]}), no music signal - rejecting")
return {
'valid': False,
'confidence': 'high',
'reason': f"Page subject appears to be a {non_musician_hits[0]}, not a musician",
'score': 0
}

# Check birth/death dates if available
has_corroboration = False
if context.get('birth_date'):
birth_year = str(context['birth_date'].year) if hasattr(context['birth_date'], 'year') else str(context['birth_date'])[:4]
if birth_year in page_text[:2000]:
confidence_score += 25
has_corroboration = True
reasons.append(f"Birth year {birth_year} found on page")

if context.get('death_date'):
death_year = str(context['death_date'].year) if hasattr(context['death_date'], 'year') else str(context['death_date'])[:4]
if death_year in page_text[:2000]:
confidence_score += 20
has_corroboration = True
reasons.append(f"Death year {death_year} found on page")

# Check if any of the performer's songs are mentioned
if context.get('sample_songs'):
song_mentions = [song for song in context['sample_songs']
song_mentions = [song for song in context['sample_songs']
if song and song.lower() in page_text]
if song_mentions:
confidence_score += 25
has_corroboration = True
reasons.append(f"Found song references: {', '.join(song_mentions[:2])}")


# Guard: a parenthetically disambiguated title (e.g.
# "Joe Jones (Fluxus musician)") means several same-named people
# exist, so a bare name + generic music keywords isn't enough to
# know which one this is. Require corroboration (birth/death year or
# a song on the page) before accepting such a page.
title_disambiguated = bool(re.search(r'\([^)]+\)\s*$', page_title_text))
if title_disambiguated and not has_corroboration:
logger.debug(f"Disambiguated title '{page_title_text}' without corroboration - not accepting")
return {
'valid': False,
'confidence': 'low',
'reason': f"Disambiguated page '{page_title_text}' needs birth/death or song corroboration (score: {confidence_score})",
'score': confidence_score
}

# Determine validity based on confidence score
# Require at least 50 points (medium confidence) to accept
if confidence_score >= 50:
Expand Down Expand Up @@ -597,94 +691,93 @@ def _string_similarity(self, s1, s2):
return 1.0 - (edit_distance / max_len)


def _opensearch(self, query):
"""Return candidate Wikipedia article URLs for a query via the
OpenSearch API, honoring the 7-day search cache.

Sets self.last_made_api_call. Returns a list of URLs (possibly empty);
a genuine empty result is cached so bulk re-runs skip it. Returns None
on a transient request failure so the caller can tell 'no results'
apart from 'lookup failed' (and we avoid caching the failure).
"""
if not self.force_refresh:
cached = self._load_search_from_cache(query)
if cached is not None:
self.last_made_api_call = False
return cached

self.last_made_api_call = True
self.rate_limit()
try:
response = self.session.get(
"https://en.wikipedia.org/w/api.php",
params={'action': 'opensearch', 'search': query,
'limit': 5, 'namespace': 0, 'format': 'json'},
timeout=10)
except requests.RequestException as e:
logger.warning(f" OpenSearch request failed for {query!r}: {e}")
return None

if response.status_code != 200:
return None

data = response.json()
urls = data[3] if len(data) >= 4 and data[3] else []
if not self.force_refresh:
self._save_search_to_cache(query, urls)
return urls

def search_wikipedia(self, performer_name, context):
    """
    Search Wikipedia for a performer

    Args:
        performer_name: Name to search for
        context: Dict with additional info for verification

    Returns:
        Wikipedia URL if found with reasonable confidence, None otherwise
        (also None if any unexpected error occurs; the error is logged).
    """
    try:
        # Search the nickname-stripped (more canonical) form first, then
        # the name as stored. e.g. '“Brother” Jack McDuff' searches
        # 'Jack McDuff' first so the canonical article is found and
        # preferred over an album/redirect that the decorated name returns.
        queries = []
        stripped = self._strip_nickname(performer_name)
        if stripped.lower() != performer_name.lower():
            queries.append(stripped)
        queries.append(performer_name)

        candidate_urls = []
        any_api_call = False
        lookup_failed = False
        for query in queries:
            urls = self._opensearch(query)
            any_api_call = any_api_call or self.last_made_api_call
            if urls is None:
                # _opensearch returns None for a failed lookup (as opposed
                # to [] for 'no results'); remember it so we do not
                # negative-cache below on incomplete information.
                lookup_failed = True
                continue
            for url in urls:
                # Keep first-seen order; drop duplicates across queries.
                if url not in candidate_urls:
                    candidate_urls.append(url)
        self.last_made_api_call = any_api_call

        if not candidate_urls:
            logger.debug(" No Wikipedia search results")
            return None

        # Verify each candidate until we find a good match
        # Note: verify_wikipedia_reference will also set last_made_api_call
        for url in candidate_urls[:8]:
            verification = self.verify_wikipedia_reference(performer_name, url, context)
            logger.debug(f" Checked {url}: valid={verification['valid']}, confidence={verification['confidence']}, score={verification.get('score', 0)}, reason={verification['reason']}")
            if verification['valid']:
                logger.debug(f" Found Wikipedia: {url} (confidence: {verification['confidence']}, score: {verification.get('score', 0)})")
                logger.debug(f" Reason: {verification['reason']}")
                return url

        # No candidate verified - cache empty under the stored name so a
        # re-run skips re-verifying (preserves bulk-run speed). Skip this
        # when a lookup failed: the candidate list was incomplete, and
        # caching 'empty' would hide results a retry could still find.
        # NOTE(review): only the stored name is negative-cached; results
        # cached under the stripped query are left as-is - confirm intended.
        if not lookup_failed:
            self._save_search_to_cache(performer_name, [])
        return None

    except Exception as e:
        logger.error(f"Error searching Wikipedia for {performer_name}: {e}")
        return None
Loading
Loading