diff --git a/core/prompts.py b/core/prompts.py index 27e0c13..da65c20 100644 --- a/core/prompts.py +++ b/core/prompts.py @@ -66,11 +66,46 @@ When in doubt, return null. - confidence: "high" if the row is clearly legible, "medium" if you had to guess one or two characters, "low" if mostly illegible - - notes: null in the common case. Use one of these tags only when relevant: - * "continuation" — this row is a wrap of the previous row, not its own entry - * "double_height" — handwriting takes up two physical rows of the grid - * "crossed_out" — the entry is struck through - * "illegible" — you could not read enough to attempt a transcription + - notes: null in the common case (>90% of rows). Use one of these tags + ONLY when the visual cue is unambiguous. Read each definition before + choosing a tag. + + * "double_height" — ONE handwritten entry that visually spans two + printed grid rows. The DJ wrote a single song's text but the + handwriting is tall, or the artist/track text flows from the upper + printed line down into the lower printed line as one continuous + piece of writing. Emit a SINGLE Entry on the UPPER printed grid + row, with raw_text containing the whole song's text, and tag it + "double_height". Do NOT also emit a separate Entry on the lower + printed grid row for the same handwritten content — that produces + a phantom duplicate row. Example: the artist "Buffy Sainte-Marie" + is written on the printed row N and the track "God is Alive, + Magic is Afoot" continues on printed row N+1 as one piece of + handwriting → emit one Entry with row_index=N, raw_text="Buffy + Sainte-Marie - God is Alive, Magic is Afoot", notes="double_height". + + * "continuation" — RARE in this corpus. 
Use ONLY when the lower + printed grid row contains a SEPARATE handwritten fragment that + completes a song started on the row above (different ink stroke, + an explicit arrow or bracket carrying the text down, or the DJ + clearly ran out of room and re-wrote the tail of the entry below). + Emit it as its own Entry on the lower grid row, with the + continuation fragment as raw_text and notes="continuation". + If you find yourself wanting to use "continuation" because the + handwriting itself is tall and flows visually across two rows, + use "double_height" instead — that is the same-entry case. + + * "crossed_out" — a clear horizontal line (or angry scribble) drawn + THROUGH the artist/track text, indicating the DJ retracted the + entry. Tag this ONLY when the strike-through crosses the song + text itself. Margin doodles next to a row, asterisks, arrows, + underlines drawn UNDER the text, a mark in the left-margin type + column, or a scratch through ONLY the left-margin circle are NOT + crossed_out — capture those as entry-level oddities instead. + When in doubt, leave notes=null. + + * "illegible" — you could not read enough to attempt a transcription. + Set confidence="low" as well. Always return EXACTLY FOUR quadrants in this fixed order: 1. top_left 2. top_right 3. bottom_left 4. bottom_right @@ -172,11 +207,46 @@ When in doubt, return null. - confidence: "high" if the row is clearly legible, "medium" if you had to guess one or two characters, "low" if mostly illegible - - notes: null in the common case. Use one of these tags only when relevant: - * "continuation" — this row is a wrap of the previous row, not its own entry - * "double_height" — handwriting takes up two physical rows of the grid - * "crossed_out" — the entry is struck through - * "illegible" — you could not read enough to attempt a transcription + - notes: null in the common case (>90% of rows). Use one of these tags + ONLY when the visual cue is unambiguous. 
Read each definition before + choosing a tag. + + * "double_height" — ONE handwritten entry that visually spans two + printed grid rows. The DJ wrote a single song's text but the + handwriting is tall, or the artist/track text flows from the upper + printed line down into the lower printed line as one continuous + piece of writing. Emit a SINGLE Entry on the UPPER printed grid + row, with raw_text containing the whole song's text, and tag it + "double_height". Do NOT also emit a separate Entry on the lower + printed grid row for the same handwritten content — that produces + a phantom duplicate row. Example: the artist "Buffy Sainte-Marie" + is written on the printed row N and the track "God is Alive, + Magic is Afoot" continues on printed row N+1 as one piece of + handwriting → emit one Entry with row_index=N, raw_text="Buffy + Sainte-Marie - God is Alive, Magic is Afoot", notes="double_height". + + * "continuation" — RARE in this corpus. Use ONLY when the lower + printed grid row contains a SEPARATE handwritten fragment that + completes a song started on the row above (different ink stroke, + an explicit arrow or bracket carrying the text down, or the DJ + clearly ran out of room and re-wrote the tail of the entry below). + Emit it as its own Entry on the lower grid row, with the + continuation fragment as raw_text and notes="continuation". + If you find yourself wanting to use "continuation" because the + handwriting itself is tall and flows visually across two rows, + use "double_height" instead — that is the same-entry case. + + * "crossed_out" — a clear horizontal line (or angry scribble) drawn + THROUGH the artist/track text, indicating the DJ retracted the + entry. Tag this ONLY when the strike-through crosses the song + text itself. 
async def main() -> None:
    """Re-extract every page in ``PAGES`` and dump one JSON file per page.

    Configuration is read from the environment after ``load_dotenv``:
    ``GEMINI_API_KEY`` is required (a missing key raises ``KeyError``), while
    ``GEMINI_MODEL`` and ``GEMINI_MEDIA_RESOLUTION`` fall back to defaults.
    Each page is written to ``OUT_DIR`` as ``<stem>.fresh.json``; a page whose
    extraction raised is written as ``{"error": "<message>"}`` instead of
    aborting the whole run.
    """
    load_dotenv(override=False)
    api_key = os.environ["GEMINI_API_KEY"]
    model = os.environ.get("GEMINI_MODEL", "gemini-3.1-pro-preview")
    resolution_name = os.environ.get("GEMINI_MEDIA_RESOLUTION", "high")
    media_resolution = MediaResolution.from_string(resolution_name)

    # The SDK is imported only once the environment has been validated.
    from google import genai

    sdk = genai.Client(api_key=api_key)
    client = GeminiClient(sdk=sdk, model=model, media_resolution=media_resolution)
    # NOTE(review): the original comment calls a failed cache "harmless" --
    # presumably create_cache() returns a falsy value rather than raising;
    # confirm against GeminiClient.
    cache_state = "on" if await client.create_cache() else "off"
    print(f"model={model} cache={cache_state}")

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    async def extract(page_no: int) -> tuple[int, dict | str]:
        """Return ``(page_no, payload)``; the payload is an error string on failure."""
        image = PAGES_DIR / f"page-{page_no:02d}.png"
        try:
            page_result = await client.extract_page(image)
            return page_no, page_result.model_dump(mode="json")
        except Exception as exc:  # noqa: BLE001
            # One bad page must not sink the rest of the sample.
            return page_no, f"ERROR: {type(exc).__name__}: {exc}"

    outcomes = await asyncio.gather(*[extract(page_no) for page_no in PAGES])
    for page_no, payload in outcomes:
        target = OUT_DIR / f"1990-04apr0106-page{page_no:02d}.fresh.json"
        failed = isinstance(payload, str)
        if failed:
            print(f"page {page_no:02d}: {payload}")
        document = {"error": payload} if failed else payload
        target.write_text(json.dumps(document, indent=2))
        if not failed:
            print(f"page {page_no:02d}: wrote {target}")
PAGES = [2, 6, 9, 14, 16, 19]
VERIFIED_DIR = Path("data/verifier-pulled-refresh")
FRESH_DIR = Path("data/notes-revalidation-2026-06-04")


def index_notes(page: dict) -> dict[tuple[str, int], str | None]:
    """Map ``(quadrant position, row_index)`` to the entry's ``notes`` value.

    A missing or ``null`` ``quadrants`` / ``entries`` value is treated as
    empty, so an ``{"error": ...}`` payload indexes to ``{}`` instead of
    raising. If two entries share a key, the later one wins.
    """
    return {
        (quadrant.get("position"), entry.get("row_index")): entry.get("notes")
        for quadrant in page.get("quadrants") or []
        for entry in quadrant.get("entries") or []
    }


def _load_json(path: Path) -> dict:
    """Read one JSON document, decoding explicitly as UTF-8."""
    return json.loads(path.read_text(encoding="utf-8"))


def _compare_notes(
    truth_notes: dict[tuple[str, int], str | None],
    fresh_notes: dict[tuple[str, int], str | None],
) -> dict:
    """Tally tag agreement between one verified page and one fresh page.

    Only rows present on BOTH sides feed ``emit`` / ``truth`` / ``keep`` /
    ``recall_hit``. A row present on one side only is counted as unmatched,
    and only when it carries a non-null tag.
    """
    emit: Counter = Counter()  # fresh tag -> count
    truth: Counter = Counter()  # verified tag -> count
    keep: Counter = Counter()  # fresh tag -> count where verified agrees
    recall_hit: Counter = Counter()  # verified tag -> count where fresh agrees
    unmatched_fresh = 0
    unmatched_truth = 0

    for key in truth_notes.keys() | fresh_notes.keys():
        if key not in truth_notes:
            # Fresh emitted a tagged row the verified page does not have.
            if fresh_notes[key] is not None:
                unmatched_fresh += 1
            continue
        if key not in fresh_notes:
            if truth_notes[key] is not None:
                unmatched_truth += 1
            continue
        fresh_tag = fresh_notes[key]
        truth_tag = truth_notes[key]
        if fresh_tag is not None:
            emit[fresh_tag] += 1
        if truth_tag is not None:
            truth[truth_tag] += 1
        if fresh_tag is not None and fresh_tag == truth_tag:
            keep[fresh_tag] += 1
            recall_hit[truth_tag] += 1

    return {
        "emit": emit,
        "truth": truth,
        "keep": keep,
        "recall_hit": recall_hit,
        "unmatched_fresh_rows": unmatched_fresh,
        "unmatched_truth_rows": unmatched_truth,
    }


def _recall_line(label: str, hit: int, total: int) -> str:
    """Format one ``<label> = hit/total = NN%`` line, or ``N/A`` when total is 0."""
    if not total:
        return f"{label} = N/A"
    return f"{label} = {hit}/{total} = {(hit / total * 100):.0f}%"


def main(
    fresh_dir: Path = FRESH_DIR,
    *,
    verified_dir: Path = VERIFIED_DIR,
    pages: list[int] | None = None,
) -> None:
    """Print per-page and aggregate notes-tag metrics for a revalidation run.

    Args:
        fresh_dir: Directory holding ``<stem>.fresh.json`` files. Defaults to
            the baseline run; pass another directory to score a different run
            against the same verified files.
        verified_dir: Directory holding ``<stem>.verified.json`` files.
        pages: Page numbers to score; defaults to ``PAGES``.

    A fresh file holding only an ``{"error": ...}`` payload (what the
    extraction scripts write when a page fails) is reported and skipped.
    Scoring it as an empty page would silently inflate the unmatched-truth
    count.

    Raises:
        FileNotFoundError: if a verified or fresh file is missing.
    """
    page_numbers = PAGES if pages is None else pages

    rows: list[dict] = []
    skipped: list[int] = []
    agg_emit: Counter = Counter()
    agg_truth: Counter = Counter()
    agg_keep: Counter = Counter()
    agg_recall_hit: Counter = Counter()
    agg_unmatched_fresh = 0
    agg_unmatched_truth = 0

    for p in page_numbers:
        verified = _load_json(verified_dir / f"1990-04apr0106-page{p:02d}.verified.json")
        fresh = _load_json(fresh_dir / f"1990-04apr0106-page{p:02d}.fresh.json")

        if "error" in fresh and "quadrants" not in fresh:
            print(f"page {p:02d}: fresh extraction failed, skipped ({fresh['error']})")
            skipped.append(p)
            continue

        stats = _compare_notes(index_notes(verified), index_notes(fresh))
        rows.append({"page": p, **stats})

        agg_emit.update(stats["emit"])
        agg_truth.update(stats["truth"])
        agg_keep.update(stats["keep"])
        agg_recall_hit.update(stats["recall_hit"])
        agg_unmatched_fresh += stats["unmatched_fresh_rows"]
        agg_unmatched_truth += stats["unmatched_truth_rows"]

    print("== Per-page (fresh emits / Alex truth) ==")
    tags = ["crossed_out", "continuation", "double_height", "illegible", "other"]
    header = f"{'page':>4} " + " ".join(f"{t[:5]:>11}" for t in tags) + " unmatched(F/T)"
    print(header)
    for r in rows:
        cells = [f"{r['emit'].get(t, 0):>4}/{r['truth'].get(t, 0):<5}" for t in tags]
        print(
            f"{r['page']:>4} "
            + " ".join(cells)
            + f" {r['unmatched_fresh_rows']}/{r['unmatched_truth_rows']}"
        )

    print()
    print("== Aggregate ==")
    print(f"fresh-Gemini emit counts: {dict(agg_emit)}")
    print(f"Alex truth counts: {dict(agg_truth)}")
    print(f"matched (fresh==truth): {dict(agg_keep)}")
    print(f"truth rows recalled: {dict(agg_recall_hit)}")
    print(f"unmatched fresh entries (row Alex doesn't have): {agg_unmatched_fresh}")
    print(f"unmatched truth entries (row fresh-Gemini lacks): {agg_unmatched_truth}")
    if skipped:
        print(f"pages skipped (failed fresh extraction): {skipped}")

    print()
    print("== Headline metrics ==")
    # crossed_out precision: of the fresh crossed_out emits on matched rows,
    # how many the verified data also marks crossed_out.
    co_emit = agg_emit.get("crossed_out", 0)
    co_keep = agg_keep.get("crossed_out", 0)
    co_shown = f"{co_keep / co_emit * 100:.0f}%" if co_emit else "None"
    print(f"crossed_out precision = {co_keep}/{co_emit} = {co_shown}")

    # Recall per tag: of the verified rows carrying the tag, how many fresh matched.
    for tag in ("continuation", "double_height"):
        print(_recall_line(f"{tag} recall", agg_recall_hit.get(tag, 0), agg_truth.get(tag, 0)))

    # illegible recall is a bonus line, printed only when the truth has any.
    il_truth = agg_truth.get("illegible", 0)
    if il_truth:
        print(_recall_line("illegible recall", agg_recall_hit.get("illegible", 0), il_truth))


if __name__ == "__main__":
    import sys

    # Optional first argument: the fresh-results directory to score, e.g.
    # data/notes-revalidation-newprompt. Defaults to the baseline run.
    main(Path(sys.argv[1]) if len(sys.argv) > 1 else FRESH_DIR)
async def main() -> None:
    """Run the 6-page sample through the current prompts and save the raw results.

    Settings come from the environment (after ``load_dotenv``):
    ``GEMINI_API_KEY`` must be set, while ``GEMINI_MODEL`` and
    ``GEMINI_MEDIA_RESOLUTION`` are optional. One ``<stem>.fresh.json`` file
    per page lands in ``OUT_DIR``; a page that failed to extract is recorded
    as ``{"error": "<message>"}`` so the other pages still get written.
    """
    load_dotenv(override=False)
    api_key = os.environ["GEMINI_API_KEY"]
    model = os.environ.get("GEMINI_MODEL", "gemini-3.1-pro-preview")
    media_resolution = MediaResolution.from_string(
        os.environ.get("GEMINI_MEDIA_RESOLUTION", "high")
    )

    # Deferred import: the SDK is only needed once configuration is known.
    from google import genai

    client = GeminiClient(
        sdk=genai.Client(api_key=api_key),
        model=model,
        media_resolution=media_resolution,
    )
    cache_enabled = await client.create_cache()
    cache_label = "on" if cache_enabled else "off"
    print(f"model={model} cache={cache_label}")

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    async def attempt(number: int) -> tuple[int, dict | str]:
        """Extract one page; any exception becomes an ``ERROR: ...`` string."""
        source = PAGES_DIR / f"page-{number:02d}.png"
        try:
            extracted = await client.extract_page(source)
            payload: dict | str = extracted.model_dump(mode="json")
        except Exception as exc:  # noqa: BLE001
            payload = f"ERROR: {type(exc).__name__}: {exc}"
        return number, payload

    for number, payload in await asyncio.gather(*map(attempt, PAGES)):
        destination = OUT_DIR / f"1990-04apr0106-page{number:02d}.fresh.json"
        if not isinstance(payload, str):
            destination.write_text(json.dumps(payload, indent=2))
            print(f"page {number:02d}: wrote {destination}")
            continue
        # Failure path: report first, then persist the error marker.
        print(f"page {number:02d}: {payload}")
        destination.write_text(json.dumps({"error": payload}, indent=2))
def _notes_tag_clause(prompt: str, tag: str) -> str:
    """Return the bullet text for one notes tag, with whitespace collapsed.

    The clause runs from the ``* "<tag>"`` bullet up to the next ``* "``
    bullet, or to the end of the prompt for the last bullet. Whitespace is
    collapsed first because the prompt is hard-wrapped: a phrase split across
    two source lines (e.g. "type" / "column") would otherwise never match.
    Case is preserved; callers lower-case when they need to.
    """
    collapsed = " ".join(prompt.split())
    marker = f'* "{tag}"'
    start = collapsed.find(marker)
    assert start != -1, f"prompt has no bullet for the {tag!r} notes tag"
    end = collapsed.find('* "', start + len(marker))
    return collapsed[start:] if end == -1 else collapsed[start:end]


def _assert_double_height_emits_single_entry(prompt: str) -> None:
    """The double_height clause must demand ONE Entry and forbid the split shape."""
    clause = _notes_tag_clause(prompt, "double_height")
    assert "SINGLE Entry" in clause
    # The negation: do not also emit a row on the lower line. Allow either
    # "separate" or "second" wording -- both are valid phrasings of the rule.
    lowered = clause.lower()
    assert "do not also emit a separate entry on the lower" in lowered or (
        "do not also emit a second entry on the lower" in lowered
    )


def _assert_continuation_excludes_tall_handwriting(prompt: str) -> None:
    """The continuation clause must route tall handwriting to double_height."""
    clause = _notes_tag_clause(prompt, "continuation")
    assert 'use "double_height" instead' in clause


def _assert_crossed_out_excludes_margin_marks(prompt: str) -> None:
    """The crossed_out clause must name the common false-positive shapes."""
    clause = _notes_tag_clause(prompt, "crossed_out").lower()
    assert "through the artist/track text" in clause
    false_positives = ["doodle", "asterisk", "arrow", "underline", "type column"]
    found = sum(1 for fp in false_positives if fp in clause)
    assert found >= 3, (
        f"expected at least 3 of {false_positives} in the crossed_out clause, found {found}"
    )


def test_prompt_double_height_emits_single_entry_on_upper_row() -> None:
    """Drift observed 2026-06-04: fresh Gemini was splitting a 2-grid-row
    handwritten entry into two Entries and tagging the second one
    `continuation`, instead of emitting one Entry tagged `double_height`.
    The prompt must explicitly direct against the split shape -- a single
    `double_height` definition without the negation reproduces the drift."""
    _assert_double_height_emits_single_entry(PAGE_EXTRACTION_PROMPT)


def test_prompt_continuation_definition_excludes_tall_handwriting() -> None:
    """`continuation` is for the SEPARATE-fragment case (visible arrow /
    re-write below). The wording must steer the model away from using
    `continuation` for tall handwriting that spans two grid rows -- that
    case belongs to `double_height`."""
    _assert_continuation_excludes_tall_handwriting(PAGE_EXTRACTION_PROMPT)


def test_prompt_crossed_out_excludes_margin_marks() -> None:
    """`crossed_out` was at 27% precision; the prompt must enumerate the
    common false-positive shapes (margin doodles, asterisks, arrows,
    underlines, type-column-only marks) so the model recognises them as
    non-crossed-out."""
    _assert_crossed_out_excludes_margin_marks(PAGE_EXTRACTION_PROMPT)


def test_quadrant_template_double_height_emits_single_entry_on_upper_row() -> None:
    """Quadrant-template counterpart of the page-prompt double_height rule."""
    _assert_double_height_emits_single_entry(QUADRANT_EXTRACTION_PROMPT_TEMPLATE)


def test_quadrant_template_continuation_excludes_tall_handwriting() -> None:
    """Quadrant-template counterpart of the page-prompt continuation rule."""
    _assert_continuation_excludes_tall_handwriting(QUADRANT_EXTRACTION_PROMPT_TEMPLATE)


def test_quadrant_template_crossed_out_excludes_margin_marks() -> None:
    """Quadrant-template counterpart of the page-prompt crossed_out rule."""
    _assert_crossed_out_excludes_margin_marks(QUADRANT_EXTRACTION_PROMPT_TEMPLATE)
test_quadrant_template_lists_every_confidence_value(confidence: str) -> None: assert f'"{confidence}"' in QUADRANT_EXTRACTION_PROMPT_TEMPLATE