From 5cc5caed4f4ffe39803042fe38cc843a33539d96 Mon Sep 17 00:00:00 2001
From: David Rodger <dave@davidrodger.com>
Date: Sun, 14 Jun 2026 15:16:55 -0400
Subject: [PATCH 1/3] admin: editable MB work IDs and alternate titles on song
 detail

Add inline editors to the admin song detail page:

- MB Work ID and Secondary MB Work ID are both shown and editable.
  Entering an ID runs a MusicBrainz work lookup (cached via
  MusicBrainzSearcher) that surfaces the id, title, and composer/
  writer/lyricist credits; the Save button is gated on a successful
  lookup so an unverified ID is not saved from the UI (the endpoint
  itself only checks the UUID format). Slots with a value get Clear.
- Alternate titles render under the song title as chips with an
  add/remove editor that saves to songs.alt_titles.

Backing endpoints:
- GET  /admin/musicbrainz/work/<id>/lookup
- POST /admin/songs/<id>/mb-id        (slot: primary|second)
- POST /admin/songs/<id>/alt-titles

CSRF is handled by the existing admin.js fetch wrapper.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 backend/routes/admin.py                       | 130 ++++++++
 .../templates/admin/browse_song_detail.html   | 313 +++++++++++++++++-
 2 files changed, 426 insertions(+), 17 deletions(-)
diff --git a/backend/routes/admin.py b/backend/routes/admin.py
index 3dbe306..e8fd96e 100644
--- a/backend/routes/admin.py
+++ b/backend/routes/admin.py
@@ -4434,6 +4434,136 @@ def songs_browse_detail(song_id):
     )
 
 
+def _is_uuid(value):
+    """True only if `value` is a canonical hyphenated UUID (as MB work IDs are)."""
+    import uuid as _uuid
+    try:
+        text = str(value)  # UUID() alone also accepts braces/'urn:uuid:'/bare hex
+        return str(_uuid.UUID(text)) == text.lower()
+    except (ValueError, AttributeError, TypeError):
+        return False
+
+
+@admin_bp.route('/musicbrainz/work/<work_id>/lookup', methods=['GET'])
+def musicbrainz_work_lookup(work_id):
+    """Resolve a MusicBrainz work ID to its title and creator credits.
+
+    Serves the inline MB Work ID editor on the song detail page. The work is
+    fetched through MusicBrainzSearcher and the JSON reply carries its id,
+    title and composer/writer/lyricist credits so an admin can check the ID
+    before saving. Failures return a JSON error with status 400, 404 or 502.
+    """
+    work_id = (work_id or '').strip()
+    if not _is_uuid(work_id):
+        return jsonify({'error': 'Not a valid MusicBrainz work ID (expected a UUID).'}), 400
+
+    try:
+        work = MusicBrainzSearcher().get_work_recordings(work_id)
+    except Exception as exc:
+        logger.error("MB work lookup failed for %s: %s", work_id, exc)
+        return jsonify({'error': 'MusicBrainz lookup failed. Try again.'}), 502
+
+    if not work:
+        return jsonify({'error': 'No MusicBrainz work found for that ID.'}), 404
+
+    # Collect credits from the work's relations in one pass. The dict is keyed
+    # by artist name, so a repeated name keeps only its first credit, and
+    # insertion order keeps the credits in the order the relations list them.
+    credit_types = ('composer', 'writer', 'lyricist')
+    credits_by_name = {}
+    for rel in work.get('relations', []):
+        kind = rel.get('type')
+        if kind not in credit_types:
+            continue
+        artist_name = (rel.get('artist') or {}).get('name')
+        if artist_name and artist_name not in credits_by_name:
+            credits_by_name[artist_name] = {'name': artist_name, 'type': kind}
+
+    return jsonify({
+        'id': work.get('id') or work_id,
+        'title': work.get('title'),
+        'composers': list(credits_by_name.values()),
+    })
+
+
+@admin_bp.route('/songs/<song_id>/mb-id', methods=['POST'])
+def songs_update_mb_id(song_id):
+    """Set or clear a song's primary or secondary MusicBrainz work ID.
+
+    Body (JSON):
+        slot:  'primary' | 'second' (required)
+        mb_id: UUID string, or '' / null to clear the slot.
+    """
+    body = request.get_json(silent=True)
+    body = body if isinstance(body, dict) else {}  # tolerate list/scalar JSON
+    slot = str(body.get('slot') or '').strip()  # non-string slots fail the lookup
+    column = {'primary': 'musicbrainz_id', 'second': 'second_mb_id'}.get(slot)
+    if not column:
+        return jsonify({'error': "slot must be 'primary' or 'second'"}), 400
+    raw = body.get('mb_id')
+    mb_id = (raw.strip() if isinstance(raw, str) else raw) or None
+    if mb_id is not None and not (isinstance(mb_id, str) and _is_uuid(mb_id)):
+        return jsonify({'error': 'Not a valid MusicBrainz work ID (expected a UUID).'}), 400
+
+    with get_db_connection() as conn:
+        with conn.cursor() as cur:
+            cur.execute(
+                f"UPDATE songs SET {column} = %s, updated_at = CURRENT_TIMESTAMP "
+                "WHERE id = %s RETURNING id",
+                (mb_id, song_id),
+            )
+            if cur.fetchone() is None:
+                conn.rollback()
+                return jsonify({'error': 'Song not found'}), 404
+        conn.commit()
+
+    logger.info("admin set %s=%s on song %s", column, mb_id, song_id)
+    return jsonify({'success': True, 'slot': slot, 'mb_id': mb_id})
+
+
+@admin_bp.route('/songs/<song_id>/alt-titles', methods=['POST'])
+def songs_update_alt_titles(song_id):
+    """Replace a song's alternate-title list (songs.alt_titles TEXT[]).
+
+    Body (JSON): { alt_titles: ["...", ...] }. Entries are trimmed, blanks
+    dropped, duplicates removed (case-insensitive, first spelling wins). An
+    empty list clears the column to NULL.
+    """
+    body = request.get_json(silent=True)
+    # A JSON list/scalar body has no .get(); treat it like a missing field.
+    raw = body.get('alt_titles') if isinstance(body, dict) else None
+    if not isinstance(raw, list):
+        return jsonify({'error': 'alt_titles must be a list of strings'}), 400
+
+    # Trim each entry; skip blanks, non-strings and case-insensitive repeats.
+    cleaned = []
+    seen = set()
+    for item in raw:
+        title = item.strip() if isinstance(item, str) else ''
+        key = title.lower()
+        if not title or key in seen:
+            continue
+        seen.add(key)
+        cleaned.append(title)
+
+    stored = cleaned or None  # empty list -> NULL
+
+    with get_db_connection() as conn:
+        with conn.cursor() as cur:
+            cur.execute(
+                "UPDATE songs SET alt_titles = %s, updated_at = CURRENT_TIMESTAMP "
+                "WHERE id = %s RETURNING id",
+                (stored, song_id),
+            )
+            if cur.fetchone() is None:
+                conn.rollback()
+                return jsonify({'error': 'Song not found'}), 404
+        conn.commit()
+
+    logger.info("admin set alt_titles=%s on song %s", cleaned, song_id)
+    return jsonify({'success': True, 'alt_titles': cleaned})
+
+
 @admin_bp.route('/releases/<release_id>')
 def releases_browse_detail(release_id):
     """Release detail.
diff --git a/backend/templates/admin/browse_song_detail.html b/backend/templates/admin/browse_song_detail.html
index 8b140a7..27f8e7b 100644
--- a/backend/templates/admin/browse_song_detail.html
+++ b/backend/templates/admin/browse_song_detail.html
@@ -163,6 +163,51 @@
             margin-left: 4px;
         }
         .empty { color: #888; padding: 24px; text-align: center; background: #fff; border: 1px solid #e0e0e0; border-radius: 8px; }
+
+        /* Inline editors (MB IDs + alt titles) */
+        .link-btn {
+            background: none; border: none; color: #0066cc; cursor: pointer;
+            font-size: 12px; padding: 0; margin-left: 8px; font-family: inherit;
+        }
+        .link-btn:hover { text-decoration: underline; }
+        .btn {
+            font-size: 12px; padding: 4px 10px; border-radius: 6px;
+            border: 1px solid #d0d0d0; background: #fff; cursor: pointer; font-family: inherit;
+        }
+        .btn:hover { background: #f5f5f7; }
+        .btn-primary { background: #0066cc; border-color: #0066cc; color: #fff; }
+        .btn-primary:hover { background: #0052a3; }
+        .btn-primary:disabled { background: #b0c4de; border-color: #b0c4de; cursor: not-allowed; }
+        .btn-danger { color: #cc0000; border-color: #e0b4b4; }
+        .btn-danger:hover { background: #fdf2f2; }
+        .mb-input, .alt-input {
+            font-family: 'SF Mono', Monaco, monospace; font-size: 12px;
+            padding: 5px 8px; border: 1px solid #ccc; border-radius: 6px;
+            width: 340px; max-width: 100%;
+        }
+        .alt-input { font-family: inherit; }
+        .mb-editor { margin-top: 8px; }
+        .editor-row { display: flex; gap: 8px; align-items: center; flex-wrap: wrap; }
+        .mb-result { margin-top: 8px; font-size: 12px; min-height: 1px; }
+        .mb-result .ok { color: #1a7f37; font-weight: 600; }
+        .mb-result .err { color: #cc0000; }
+        .mb-result dl { display: grid; grid-template-columns: max-content 1fr; gap: 2px 10px; margin-top: 4px; }
+        .mb-result dt { color: #888; }
+        .editor-actions { display: flex; gap: 8px; margin-top: 10px; align-items: center; }
+
+        .alt-titles { margin: 2px 0 12px; display: flex; align-items: center; flex-wrap: wrap; gap: 6px; }
+        .alt-label { color: #888; font-size: 12px; }
+        .alt-chip {
+            display: inline-flex; align-items: center; gap: 4px;
+            background: #eef1f4; border-radius: 12px; padding: 3px 4px 3px 10px;
+            font-size: 12px; color: #333;
+        }
+        .alt-chip button {
+            background: none; border: none; cursor: pointer; color: #999;
+            font-size: 15px; line-height: 1; padding: 0 4px; font-family: inherit;
+        }
+        .alt-chip button:hover { color: #cc0000; }
+        [hidden] { display: none !important; }
     </style>
 </head>
 <body>
@@ -174,39 +219,74 @@
             Song
         </div>
 
-        <div class="summary">
+        <div class="summary" data-song-id="{{ song.id }}">
             <h1>{{ song.title }}</h1>
             {% if song.composer %}
               <div class="composer">{{ song.composer }}</div>
             {% endif %}
+
+            {# Alternate titles, editable. JS owns the chip rendering from the
+               initial array below so display and editor stay in sync. #}
+            <div class="alt-titles" id="alt-titles-view">
+                <span class="alt-label">Alt titles:</span>
+                <span id="alt-titles-chips-view"></span>
+                <button type="button" class="link-btn" id="alt-titles-edit-btn">Edit</button>
+            </div>
+            <div class="alt-titles-editor mb-editor" id="alt-titles-editor" hidden>
+                <div id="alt-titles-chips-edit" class="alt-titles" style="margin:0 0 8px;"></div>
+                <div class="editor-row">
+                    <input type="text" class="alt-input" id="alt-title-input"
+                           placeholder="Add an alternate title, then press Enter or Add" />
+                    <button type="button" class="btn" id="alt-title-add-btn">Add</button>
+                </div>
+                <div class="editor-actions">
+                    <button type="button" class="btn btn-primary" id="alt-titles-save-btn">Save alt titles</button>
+                    <button type="button" class="btn" id="alt-titles-cancel-btn">Cancel</button>
+                    <span id="alt-titles-status" style="font-size:12px;"></span>
+                </div>
+            </div>
+
             <dl>
                 <dt>Song ID</dt>
                 <dd class="mono">{{ song.id }}</dd>
 
-                <dt>MB Work ID</dt>
-                <dd>
-                  {% if song.musicbrainz_id %}
-                    <a class="mono" href="https://musicbrainz.org/work/{{ song.musicbrainz_id }}" target="_blank" rel="noopener">{{ song.musicbrainz_id }}</a>
-                  {% else %}<span class="muted">—</span>{% endif %}
-                </dd>
-
-                {% if song.second_mb_id %}
-                <dt>Secondary MB Work ID</dt>
+                {# Primary + secondary MB work IDs, each with an inline,
+                   verify-before-save editor (see JS at the bottom). #}
+                {% for slot, label, value in [
+                    ('primary', 'MB Work ID', song.musicbrainz_id),
+                    ('second', 'Secondary MB Work ID', song.second_mb_id)
+                ] %}
+                <dt>{{ label }}</dt>
                 <dd>
-                    <a class="mono" href="https://musicbrainz.org/work/{{ song.second_mb_id }}" target="_blank" rel="noopener">{{ song.second_mb_id }}</a>
+                    <span class="mb-view" data-slot="{{ slot }}">
+                        {% if value %}
+                          <a class="mono" href="https://musicbrainz.org/work/{{ value }}" target="_blank" rel="noopener">{{ value }}</a>
+                        {% else %}<span class="muted">—</span>{% endif %}
+                        <button type="button" class="link-btn mb-edit-btn" data-slot="{{ slot }}">{{ 'Edit' if value else 'Add' }}</button>
+                    </span>
+                    <div class="mb-editor" data-slot="{{ slot }}" hidden>
+                        <div class="editor-row">
+                            <input type="text" class="mb-input" value="{{ value or '' }}"
+                                   placeholder="MusicBrainz work UUID" />
+                            <button type="button" class="btn mb-lookup-btn">Look up</button>
+                        </div>
+                        <div class="mb-result"></div>
+                        <div class="editor-actions">
+                            <button type="button" class="btn btn-primary mb-save-btn" disabled>Save</button>
+                            {% if value %}
+                            <button type="button" class="btn btn-danger mb-clear-btn">Clear</button>
+                            {% endif %}
+                            <button type="button" class="btn mb-cancel-btn">Cancel</button>
+                        </div>
+                    </div>
                 </dd>
-                {% endif %}
+                {% endfor %}
 
                 {% if song.composed_year or song.composed_key %}
                 <dt>Composed</dt>
                 <dd>{{ song.composed_year or '' }}{% if song.composed_key %} · key {{ song.composed_key }}{% endif %}</dd>
                 {% endif %}
 
-                {% if song.alt_titles %}
-                <dt>Alt Titles</dt>
-                <dd>{{ song.alt_titles | join(' · ') }}</dd>
-                {% endif %}
-
                 {% if song.wikipedia_url %}
                 <dt>Wikipedia</dt>
                 <dd><a href="{{ song.wikipedia_url }}" target="_blank" rel="noopener">{{ song.wikipedia_url }}</a></dd>
@@ -214,6 +294,8 @@ <h1>{{ song.title }}</h1>
             </dl>
         </div>
 
+        <script id="alt-titles-data" type="application/json">{{ (song.alt_titles or []) | tojson }}</script>
+
         <div class="section-header">
             <h2>Recordings ({{ recordings | length }})</h2>
             <span class="hint">Click a column header to sort. Click a row to open the recording.</span>
@@ -338,5 +420,202 @@ <h2>Recordings ({{ recordings | length }})</h2>
         });
     })();
     </script>
+
+    <script>
+    // Inline editors for the MusicBrainz work IDs and the alternate-title
+    // list. State-changing fetches to /admin/* are CSRF-stamped automatically
+    // by static/js/admin.js (loaded via _nav.html).
+    (function () {
+        const summary = document.querySelector('.summary[data-song-id]');
+        if (!summary) return;
+        const songId = summary.dataset.songId;
+        const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
+
+        function esc(s) {
+            // HTML-escape any value for use in innerHTML; null/undefined become ''.
+            const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
+            return (s == null ? '' : String(s)).replace(/[&<>"']/g, ch => entities[ch]);
+        }
+
+        // ---- MusicBrainz work ID editors --------------------------------
+        summary.querySelectorAll('.mb-view .mb-edit-btn').forEach(editBtn => {
+            const slot = editBtn.dataset.slot;
+            const view = summary.querySelector(`.mb-view[data-slot="${slot}"]`);
+            const editor = summary.querySelector(`.mb-editor[data-slot="${slot}"]`);
+            const input = editor.querySelector('.mb-input');
+            const lookupBtn = editor.querySelector('.mb-lookup-btn');
+            const saveBtn = editor.querySelector('.mb-save-btn');
+            const clearBtn = editor.querySelector('.mb-clear-btn');
+            const cancelBtn = editor.querySelector('.mb-cancel-btn');
+            const result = editor.querySelector('.mb-result');
+            const original = input.value;
+            let verifiedId = null;   // the id confirmed by the last lookup
+
+            function open() { view.hidden = true; editor.hidden = false; input.focus(); }
+            function close() {
+                editor.hidden = true; view.hidden = false;
+                input.value = original; result.innerHTML = '';
+                saveBtn.disabled = true; verifiedId = null;
+            }
+            function invalidate() { saveBtn.disabled = true; verifiedId = null; }
+
+            editBtn.addEventListener('click', open);
+            cancelBtn.addEventListener('click', close);
+            input.addEventListener('input', () => { result.innerHTML = ''; invalidate(); });
+
+            async function doLookup() {  // verify the typed ID against MusicBrainz
+                const id = input.value.trim();
+                if (!UUID_RE.test(id)) {
+                    result.innerHTML = '<span class="err">Enter a valid MusicBrainz work ID (a UUID).</span>';
+                    invalidate();
+                    return;
+                }
+                lookupBtn.disabled = true;
+                result.innerHTML = 'Looking up…';
+                try {
+                    const resp = await fetch(`/admin/musicbrainz/work/${encodeURIComponent(id)}/lookup`, {
+                        headers: { 'Accept': 'application/json' },
+                    });
+                    const data = await resp.json();
+                    if (input.value.trim() !== id) return;  // box edited mid-flight: drop stale answer
+                    if (!resp.ok) throw new Error(data.error || ('HTTP ' + resp.status));
+                    const composers = (data.composers || [])
+                        .map(c => `${esc(c.name)} <span style="color:#888">(${esc(c.type)})</span>`)
+                        .join(', ') || '<span class="muted">—</span>';
+                    result.innerHTML =
+                        '<div class="ok">✓ Found on MusicBrainz</div>' +
+                        '<dl>' +
+                        `<dt>ID</dt><dd class="mono">${esc(data.id)}</dd>` +
+                        `<dt>Title</dt><dd>${esc(data.title) || '<span class="muted">—</span>'}</dd>` +
+                        `<dt>Composers</dt><dd>${composers}</dd>` +
+                        '</dl>';
+                    verifiedId = data.id; saveBtn.disabled = false;  // only now may Save fire
+                } catch (e) {
+                    result.innerHTML = `<span class="err">${esc(e.message)}</span>`;
+                    invalidate();
+                } finally {
+                    lookupBtn.disabled = false;
+                }
+            }
+            lookupBtn.addEventListener('click', doLookup);
+            input.addEventListener('keydown', e => { if (e.key === 'Enter') { e.preventDefault(); doLookup(); } });
+
+            async function save(mbId) {  // POST the slot's new ID ('' clears it), then reload
+                saveBtn.disabled = true;
+                if (clearBtn) clearBtn.disabled = true;
+                try {
+                    const resp = await fetch(`/admin/songs/${encodeURIComponent(songId)}/mb-id`, {
+                        method: 'POST',
+                        headers: { 'Content-Type': 'application/json', 'Accept': 'application/json' },
+                        body: JSON.stringify({ slot, mb_id: mbId }),
+                    });
+                    const data = await resp.json();
+                    if (!resp.ok || !data.success) throw new Error(data.error || ('HTTP ' + resp.status));
+                    window.location.reload();
+                } catch (e) {
+                    result.innerHTML = `<span class="err">Save failed: ${esc(e.message)}</span>`;
+                    saveBtn.disabled = !verifiedId;  // Save stays gated on a verified lookup
+                    if (clearBtn) clearBtn.disabled = false;
+                }
+            }
+            saveBtn.addEventListener('click', () => { if (verifiedId) save(verifiedId); });
+            if (clearBtn) {
+                clearBtn.addEventListener('click', () => {
+                    if (confirm('Clear this MusicBrainz work ID?')) save('');
+                });
+            }
+        });
+
+        // ---- Alternate-title editor -------------------------------------
+        (function () {
+            const dataEl = document.getElementById('alt-titles-data');
+            let initial = [];
+            try { initial = JSON.parse(dataEl.textContent) || []; } catch (_e) {}
+
+            const viewWrap = document.getElementById('alt-titles-view');
+            const viewChips = document.getElementById('alt-titles-chips-view');
+            const editWrap = document.getElementById('alt-titles-editor');
+            const editChips = document.getElementById('alt-titles-chips-edit');
+            const input = document.getElementById('alt-title-input');
+            const addBtn = document.getElementById('alt-title-add-btn');
+            const saveBtn = document.getElementById('alt-titles-save-btn');
+            const cancelBtn = document.getElementById('alt-titles-cancel-btn');
+            const editBtn = document.getElementById('alt-titles-edit-btn');
+            const status = document.getElementById('alt-titles-status');
+
+            let working = [];
+
+            function renderView() {
+                // Read-only chips for the saved titles, or a muted "none" placeholder.
+                const hasTitles = initial.length > 0;
+                const chipHtml = initial.map(title =>
+                    `<span class="alt-chip" style="padding:3px 10px">${esc(title)}</span>`).join(' ');
+                viewChips.innerHTML = hasTitles ? chipHtml
+                    : '<span class="muted" style="font-size:12px">none</span>';
+                editBtn.textContent = hasTitles ? 'Edit' : 'Add';
+            }
+
+            function renderEdit() {  // rebuild the removable chips from `working`
+                const chips = working.map((title, idx) =>
+                    `<span class="alt-chip">${esc(title)}<button type="button" data-i="${idx}" aria-label="Remove">×</button></span>`);
+                editChips.innerHTML = chips.length ? chips.join(' ')
+                    : '<span class="muted" style="font-size:12px">No alternate titles yet.</span>';
+                for (const removeBtn of editChips.querySelectorAll('button[data-i]')) {
+                    removeBtn.addEventListener('click', () => {  // data-i is the chip's index
+                        working.splice(parseInt(removeBtn.dataset.i, 10), 1); renderEdit();
+                    });
+                }
+            }
+
+            function addFromInput() {
+                // Append the trimmed box text unless it is blank or a case-insensitive dup.
+                const typed = input.value.trim();
+                if (typed === '') return;
+                if (working.every(kept => kept.toLowerCase() !== typed.toLowerCase())) working.push(typed);
+                input.value = ''; input.focus();
+                renderEdit();
+            }
+
+            editBtn.addEventListener('click', () => {
+                working = initial.slice();
+                status.textContent = '';
+                renderEdit();
+                viewWrap.hidden = true;
+                editWrap.hidden = false;
+                input.focus();
+            });
+            cancelBtn.addEventListener('click', () => {
+                editWrap.hidden = true; viewWrap.hidden = false; input.value = '';
+            });
+            addBtn.addEventListener('click', addFromInput);
+            input.addEventListener('keydown', e => { if (e.key === 'Enter') { e.preventDefault(); addFromInput(); } });
+
+            saveBtn.addEventListener('click', async () => {
+                addFromInput();  // fold any pending text in the box
+                saveBtn.disabled = true;
+                status.textContent = 'Saving…';
+                status.style.color = '';  // clear the red left behind by a failed save
+                try {
+                    const resp = await fetch(`/admin/songs/${encodeURIComponent(songId)}/alt-titles`, {
+                        method: 'POST',
+                        headers: { 'Content-Type': 'application/json', 'Accept': 'application/json' },
+                        body: JSON.stringify({ alt_titles: working }),
+                    });
+                    const data = await resp.json();
+                    if (!resp.ok || !data.success) throw new Error(data.error || ('HTTP ' + resp.status));
+                    initial = data.alt_titles || [];
+                    renderView();
+                    editWrap.hidden = true; viewWrap.hidden = false; input.value = '';
+                } catch (e) {
+                    status.textContent = 'Save failed: ' + e.message;
+                    status.style.color = '#cc0000';
+                }
+                saveBtn.disabled = false;  // also after success, so a later edit can be saved
+            });
+
+            renderView();
+        })();
+    })();
+    </script>
 </body>
 </html>

From 8eea90685a3c6a4cf0f1d17a743a0b818eea4907 Mon Sep 17 00:00:00 2001
From: David Rodger <dave@davidrodger.com>
Date: Sun, 14 Jun 2026 17:47:01 -0400
Subject: [PATCH 2/3] backend: pull Wikipedia song intro into the research
 pipeline

The song intro shown in the app lives in songs.structure, fetched from
the song's Wikipedia URL via the MediaWiki extracts API. That logic only
ever ran as a one-time backfill (scripts/onetime_scripts/
one_time_song_wiki_intro.py); it was never wired into ongoing import, so
newly imported songs got a wikipedia_url but never the intro text.

- Add integrations/wikipedia/song_intro.py: parse_wikipedia_url,
  fetch_wikipedia_intro, and update_song_wikipedia_intro(song_id). The
  updater reads the wikipedia_url already on the song, fetches the lead
  section, and writes it to songs.structure. Idempotent like the sibling
  MB updaters (skips if structure is already set), unless force_refresh
  is passed, in which case it re-pulls the intro.
- Wire it as Step 1.8 in core.song_research.research_song, after the
  Wikipedia-URL step it depends on, passing through force_refresh.

Lives under integrations/wikipedia (not the MusicBrainz updaters in
song_updates.py) because it talks to MediaWiki, not MusicBrainz.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 backend/core/song_research.py                |   9 +
 backend/integrations/wikipedia/song_intro.py | 172 +++++++++++++++++++
 2 files changed, 181 insertions(+)
 create mode 100644 backend/integrations/wikipedia/song_intro.py

diff --git a/backend/core/song_research.py b/backend/core/song_research.py
index d9f4eae..9b6738e 100644
--- a/backend/core/song_research.py
+++ b/backend/core/song_research.py
@@ -23,6 +23,7 @@
 from integrations.musicbrainz.release_importer import MBReleaseImporter
 from db_utils import get_db_connection, execute_query
 from integrations.musicbrainz.utils import MusicBrainzSearcher, update_song_composer, update_song_wikipedia_url, update_song_composed_year
+from integrations.wikipedia.song_intro import update_song_wikipedia_intro
 from core import research_queue, research_jobs
 from core import performer_reference_verification
 logger = logging.getLogger(__name__)
@@ -217,6 +218,14 @@ def progress_callback(phase: str, current: int, total: int):
         if not composed_year_updated:
             logger.debug("Composed year not updated (already set or not found)")
 
+        # Step 1.8: Pull the Wikipedia intro into songs.structure. Runs after
+        # Step 1.6 so it can consume the wikipedia_url just resolved off the
+        # MB work. Idempotent unless force_refresh — see update_song_wikipedia_intro.
+        logger.info("Checking for Wikipedia intro update...")
+        intro_updated = update_song_wikipedia_intro(str(song_id), force_refresh=force_refresh)
+        if not intro_updated:
+            logger.debug("Wikipedia intro not updated (already set, no URL, or not found)")
+
         # Spotify, Apple Music, and YouTube matching all run on the
         # durable research queue (research_worker/handlers/*). Their
         # per-job stats live on the research_jobs row's `result` field —
diff --git a/backend/integrations/wikipedia/song_intro.py b/backend/integrations/wikipedia/song_intro.py
new file mode 100644
index 0000000..c8d393e
--- /dev/null
+++ b/backend/integrations/wikipedia/song_intro.py
@@ -0,0 +1,172 @@
+"""Wikipedia song-intro fetcher + updater.
+
+Pulls the lead-section extract for a song's Wikipedia article — the
+plain-text intro shown in the app's song detail — and writes it into
+songs.structure.
+
+This is the reusable, pipeline-wired version of the one-time backfill in
+scripts/onetime_scripts/one_time_song_wiki_intro.py. The backfill populated
+existing rows once; this module is called from core.song_research so every
+newly imported / refreshed song with a wikipedia_url gets its intro pulled
+in too (the import path previously set wikipedia_url but never the intro).
+
+It lives under integrations/wikipedia (not the MusicBrainz updaters in
+integrations/musicbrainz/song_updates.py) because it talks to the MediaWiki
+extracts API, not MusicBrainz — it consumes the wikipedia_url that the MB
+updater has already resolved onto the song.
+"""
+
+import logging
+from urllib.parse import unquote, urlparse
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+# MediaWiki asks API clients to send a descriptive User-Agent.
+USER_AGENT = "ApproachNote/1.0 (+support@approachnote.com)"
+DEFAULT_SENTENCES = 4
+REQUEST_TIMEOUT = 15
+
+
+def parse_wikipedia_url(wikipedia_url: str):
+    """Return (api_url, page_title) for a Wikipedia article URL, or (None, None).
+
+    Honors the language subdomain so de.wikipedia.org URLs hit the right API.
+    """
+    try:
+        parsed = urlparse(wikipedia_url)
+        if not parsed.netloc or '/wiki/' not in parsed.path:  # no host, or not an article path
+            return None, None
+        title = parsed.path.split('/wiki/', 1)[1]  # everything after the first /wiki/
+        title = title.split('#', 1)[0]  # safeguard: urlparse normally strips the fragment already
+        title = unquote(title)  # percent-decode (e.g. %27 -> ')
+        if not title:
+            return None, None
+        api_url = f"{parsed.scheme}://{parsed.netloc}/w/api.php"  # same scheme/host as article
+        return api_url, title
+    except Exception:  # any parse failure is reported as "not a usable Wikipedia URL"
+        return None, None
+
+
+def fetch_wikipedia_intro(page_title: str, api_url: str,
+                          sentences: int = DEFAULT_SENTENCES,
+                          session: requests.Session = None):
+    """Fetch the lead-section plain-text extract for a Wikipedia page.
+
+    Returns the extract string, or None if the page is missing / empty / the
+    request fails. Raises nothing for HTTP-level non-200s (logs + returns
+    None); network exceptions propagate to the caller.
+    """
+    sess = session or requests.Session()
+    params = {
+        'action': 'query',
+        'format': 'json',
+        'prop': 'extracts',  # page extracts (MediaWiki TextExtracts)
+        'titles': page_title,
+        'redirects': 1,  # resolve redirects to the target article
+        'exintro': 1,  # only content before the first section heading
+        'explaintext': 1,
+        'exsentences': sentences,
+    }
+    headers = {'User-Agent': USER_AGENT, 'Accept': 'application/json'}
+    resp = sess.get(api_url, params=params, headers=headers, timeout=REQUEST_TIMEOUT)
+    if resp.status_code != 200:
+        logger.warning("Wikipedia returned status %s for %s", resp.status_code, page_title)
+        return None
+    pages = resp.json().get('query', {}).get('pages', {})  # .json() raises on a non-JSON body
+    if not pages:
+        return None
+    page = next(iter(pages.values()))  # one title was requested, so use the first entry
+    if 'missing' in page:  # MediaWiki marks nonexistent titles with a 'missing' key
+        logger.warning("Wikipedia page missing: %s", page_title)
+        return None
+    extract = (page.get('extract') or '').strip()  # absent/None extract becomes ''
+    return extract or None  # empty text is reported as None
+
+
+def update_song_wikipedia_intro(song_id: str,
+                                sentences: int = DEFAULT_SENTENCES,
+                                force_refresh: bool = False,
+                                dry_run: bool = False) -> bool:
+    """Populate songs.structure with the song's Wikipedia intro.
+
+    Reads the wikipedia_url already on the song (set earlier in the research
+    pipeline by update_song_wikipedia_url), fetches the lead-section extract,
+    and stores it in songs.structure.
+
+    Idempotent like the sibling MB updaters: skips a song that already has
+    structure text, UNLESS force_refresh is set — a deep refresh re-pulls
+    the intro so edits/expansions on Wikipedia flow through.
+
+    Args:
+        song_id: UUID of the song
+        sentences: Number of intro sentences to request from MediaWiki
+        force_refresh: Overwrite existing structure text if True
+        dry_run: Log what would happen without writing to the DB
+
+    Returns:
+        bool: True if structure was updated (or would be, in dry-run); False on skip or error.
+    """
+    from db_utils import get_db_connection
+
+    try:
+        with get_db_connection() as conn:  # read-only lookup; released before the HTTP call
+            with conn.cursor() as cur:
+                cur.execute(
+                    "SELECT wikipedia_url, structure, title FROM songs WHERE id = %s",
+                    (song_id,),
+                )
+                row = cur.fetchone()
+
+        if not row:  # unknown song_id
+            return False
+
+        wikipedia_url = row['wikipedia_url']  # assumes dict-style rows from db_utils (confirm)
+        current_structure = row['structure']
+        song_title = row['title']
+
+        if not wikipedia_url:
+            logger.debug("Song has no Wikipedia URL, skipping intro update")
+            return False
+
+        # Don't clobber an existing intro unless explicitly refreshing.
+        if (current_structure or '').strip() and not force_refresh:
+            logger.debug("Song '%s' already has intro text, skipping", song_title)
+            return False
+
+        api_url, page_title = parse_wikipedia_url(wikipedia_url)
+        if not api_url:
+            logger.warning("Could not parse Wikipedia URL for '%s': %s",
+                           song_title, wikipedia_url)
+            return False
+
+        intro = fetch_wikipedia_intro(page_title, api_url, sentences=sentences)
+        if not intro:  # nothing fetched: any existing structure text is left untouched
+            logger.debug("No Wikipedia intro returned for '%s'", song_title)
+            return False
+
+        if dry_run:
+            logger.info("[DRY RUN] Would update intro for '%s' (%d chars)",
+                        song_title, len(intro))
+            return True
+
+        with get_db_connection() as conn:  # fresh connection for the write
+            with conn.cursor() as cur:
+                cur.execute(
+                    "UPDATE songs SET structure = %s, updated_at = CURRENT_TIMESTAMP "
+                    "WHERE id = %s",
+                    (intro, song_id),
+                )
+                conn.commit()
+
+        logger.info("✓ Updated Wikipedia intro for '%s' (%d chars)",
+                    song_title, len(intro))
+        return True
+
+    except requests.RequestException as e:  # network-level failure raised by the fetch
+        logger.error("Wikipedia request error updating intro for song %s: %s", song_id, e)
+        return False
+    except Exception as e:  # best-effort: anything else is logged and reported as not updated
+        logger.error("Error updating Wikipedia intro for song %s: %s", song_id, e)
+        return False

From 27de003a7c8fda0e555c2e78bfd580c8ec5c1c3f Mon Sep 17 00:00:00 2001
From: David Rodger <dave@davidrodger.com>
Date: Sun, 14 Jun 2026 17:52:31 -0400
Subject: [PATCH 3/3] backend: add shared HTTP session factory, use it for the
 song intro

Introduce core/http_client.py with HTTP_USER_AGENT and make_session(), a
single home for the outbound User-Agent that's currently copy-pasted into
~20 files. song_intro.py now builds its default session via make_session()
instead of passing a hardcoded UA header on every request; a caller-supplied
session is used as-is, so it must carry its own User-Agent. A follow-up PR
will sweep the remaining call sites onto the factory.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 backend/core/http_client.py                  | 34 ++++++++++++++++++++
 backend/integrations/wikipedia/song_intro.py |  9 +++---
 2 files changed, 38 insertions(+), 5 deletions(-)
 create mode 100644 backend/core/http_client.py

diff --git a/backend/core/http_client.py b/backend/core/http_client.py
new file mode 100644
index 0000000..593a378
--- /dev/null
+++ b/backend/core/http_client.py
@@ -0,0 +1,34 @@
+"""Shared HTTP client configuration.
+
+A single home for the outbound User-Agent and a `requests.Session` factory,
+so every crawler/integration identifies us the same way and a version bump
+is a one-line change rather than a sweep across ~20 files.
+
+Most external services (MusicBrainz, Wikipedia/MediaWiki, Cover Art Archive,
+Wikimedia Commons, etc.) expect — and in some cases require — a descriptive
+User-Agent. Use `make_session()` to get a session that already carries it.
+
+Note: this only handles identification/headers, not per-service rate
+limiting. Clients that must throttle (e.g. MusicBrainz) keep their own
+rate-limit logic on top of the session.
+"""
+
+import requests
+
+# Outbound identity sent on every API/crawl request. Bump the version here.
+HTTP_USER_AGENT = "ApproachNote/1.0 (+support@approachnote.com)"
+
+
+def make_session(accept_json: bool = True) -> requests.Session:
+    """Return a requests.Session preconfigured with our User-Agent.
+
+    Args:
+        accept_json: Also set ``Accept: application/json`` (the common case
+            for the JSON APIs we call). Pass False for HTML/binary fetches.
+    """
+    session = requests.Session()
+    headers = {'User-Agent': HTTP_USER_AGENT}
+    if accept_json:
+        headers['Accept'] = 'application/json'
+    session.headers.update(headers)  # session-level defaults sent with every request
+    return session
diff --git a/backend/integrations/wikipedia/song_intro.py b/backend/integrations/wikipedia/song_intro.py
index c8d393e..dacbbbe 100644
--- a/backend/integrations/wikipedia/song_intro.py
+++ b/backend/integrations/wikipedia/song_intro.py
@@ -21,10 +21,10 @@
 
 import requests
 
+from core.http_client import make_session
+
 logger = logging.getLogger(__name__)
 
-# MediaWiki asks API clients to send a descriptive User-Agent.
-USER_AGENT = "ApproachNote/1.0 (+support@approachnote.com)"
 DEFAULT_SENTENCES = 4
 REQUEST_TIMEOUT = 15
 
@@ -58,7 +58,7 @@ def fetch_wikipedia_intro(page_title: str, api_url: str,
     request fails. Raises nothing for HTTP-level non-200s (logs + returns
     None); network exceptions propagate to the caller.
     """
-    sess = session or requests.Session()
+    sess = session or make_session()
     params = {
         'action': 'query',
         'format': 'json',
@@ -69,8 +69,7 @@ def fetch_wikipedia_intro(page_title: str, api_url: str,
         'explaintext': 1,
         'exsentences': sentences,
     }
-    headers = {'User-Agent': USER_AGENT, 'Accept': 'application/json'}
-    resp = sess.get(api_url, params=params, headers=headers, timeout=REQUEST_TIMEOUT)
+    resp = sess.get(api_url, params=params, timeout=REQUEST_TIMEOUT)
     if resp.status_code != 200:
         logger.warning("Wikipedia returned status %s for %s", resp.status_code, page_title)
         return None