diff --git a/core/prompts.py b/core/prompts.py index da65c20..0bd4f87 100644 --- a/core/prompts.py +++ b/core/prompts.py @@ -70,30 +70,21 @@ ONLY when the visual cue is unambiguous. Read each definition before choosing a tag. - * "double_height" — ONE handwritten entry that visually spans two - printed grid rows. The DJ wrote a single song's text but the - handwriting is tall, or the artist/track text flows from the upper - printed line down into the lower printed line as one continuous - piece of writing. Emit a SINGLE Entry on the UPPER printed grid - row, with raw_text containing the whole song's text, and tag it - "double_height". Do NOT also emit a separate Entry on the lower - printed grid row for the same handwritten content — that produces - a phantom duplicate row. Example: the artist "Buffy Sainte-Marie" - is written on the printed row N and the track "God is Alive, - Magic is Afoot" continues on printed row N+1 as one piece of - handwriting → emit one Entry with row_index=N, raw_text="Buffy - Sainte-Marie - God is Alive, Magic is Afoot", notes="double_height". - - * "continuation" — RARE in this corpus. Use ONLY when the lower - printed grid row contains a SEPARATE handwritten fragment that - completes a song started on the row above (different ink stroke, - an explicit arrow or bracket carrying the text down, or the DJ - clearly ran out of room and re-wrote the tail of the entry below). - Emit it as its own Entry on the lower grid row, with the - continuation fragment as raw_text and notes="continuation". - If you find yourself wanting to use "continuation" because the - handwriting itself is tall and flows visually across two rows, - use "double_height" instead — that is the same-entry case. + * "double_height" — a single handwritten entry whose handwriting + is tall enough to occupy two printed grid rows but is written as + one continuous piece. Emit a SINGLE Entry on the upper printed + grid row with the whole song's text in raw_text and notes set to + "double_height". (If the entry instead reads as two visually + distinct fragments — e.g. the artist on the upper row and a + clearly separate track-name fragment on the lower row — use the + continuation tag below instead.) + + * "continuation" — the entry's text wraps onto the next printed + grid row. Emit it as its own Entry on the lower printed grid + row, with that wrap-fragment as raw_text and notes set to + "continuation". A downstream baker merges these into the prior + row at write time, so transcribe both rows verbatim — do not + try to inline the wrap into the prior entry yourself. * "crossed_out" — a clear horizontal line (or angry scribble) drawn THROUGH the artist/track text, indicating the DJ retracted the @@ -211,30 +202,21 @@ ONLY when the visual cue is unambiguous. Read each definition before choosing a tag. - * "double_height" — ONE handwritten entry that visually spans two - printed grid rows. The DJ wrote a single song's text but the - handwriting is tall, or the artist/track text flows from the upper - printed line down into the lower printed line as one continuous - piece of writing. Emit a SINGLE Entry on the UPPER printed grid - row, with raw_text containing the whole song's text, and tag it - "double_height". Do NOT also emit a separate Entry on the lower - printed grid row for the same handwritten content — that produces - a phantom duplicate row. Example: the artist "Buffy Sainte-Marie" - is written on the printed row N and the track "God is Alive, - Magic is Afoot" continues on printed row N+1 as one piece of - handwriting → emit one Entry with row_index=N, raw_text="Buffy - Sainte-Marie - God is Alive, Magic is Afoot", notes="double_height". - - * "continuation" — RARE in this corpus. Use ONLY when the lower - printed grid row contains a SEPARATE handwritten fragment that - completes a song started on the row above (different ink stroke, - an explicit arrow or bracket carrying the text down, or the DJ - clearly ran out of room and re-wrote the tail of the entry below). - Emit it as its own Entry on the lower grid row, with the - continuation fragment as raw_text and notes="continuation". - If you find yourself wanting to use "continuation" because the - handwriting itself is tall and flows visually across two rows, - use "double_height" instead — that is the same-entry case. + * "double_height" — a single handwritten entry whose handwriting + is tall enough to occupy two printed grid rows but is written as + one continuous piece. Emit a SINGLE Entry on the upper printed + grid row with the whole song's text in raw_text and notes set to + "double_height". (If the entry instead reads as two visually + distinct fragments — e.g. the artist on the upper row and a + clearly separate track-name fragment on the lower row — use the + continuation tag below instead.) + + * "continuation" — the entry's text wraps onto the next printed + grid row. Emit it as its own Entry on the lower printed grid + row, with that wrap-fragment as raw_text and notes set to + "continuation". A downstream baker merges these into the prior + row at write time, so transcribe both rows verbatim — do not + try to inline the wrap into the prior entry yourself. * "crossed_out" — a clear horizontal line (or angry scribble) drawn THROUGH the artist/track text, indicating the DJ retracted the diff --git a/scripts/make_verifier_bundle.py b/scripts/make_verifier_bundle.py index da344b2..1176689 100644 --- a/scripts/make_verifier_bundle.py +++ b/scripts/make_verifier_bundle.py @@ -103,6 +103,12 @@ def _merge_with_spans(entries: list[Entry]) -> list[tuple[Entry, int]]: - notes="continuation": folds into the previous logical entry's raw_text (verbatim with the existing merge rules) and adds 1 to its span. - notes="double_height": stays as a single logical entry but spans 2 rows. + - notes="crossed_out": stripped to None before any further processing. + Empirical precision of Gemini's `crossed_out` is ~22% on the + verified corpus; surfacing the tag generates more false-positive + review work than true-positive value. The raw_text is preserved + verbatim — only the notes value is reset, so Alex can mark genuine + strike-throughs by toggling the dropdown. - All others: span 1. A leading "continuation" with nothing above it is preserved as-is with @@ -112,7 +118,15 @@ def _merge_with_spans(entries: list[Entry]) -> list[tuple[Entry, int]]: a verifier-geometry concern. The on-disk pipeline doesn't need it. """ result: list[tuple[Entry, int]] = [] - for entry in entries: + for raw_entry in entries: + # Strip unreliable `crossed_out` tags before any merge / span logic. + # Done first so a stripped crossed_out predecessor can still absorb + # a following continuation row instead of blocking the merge. + entry = ( + raw_entry.model_copy(update={"notes": None}) + if raw_entry.notes == "crossed_out" + else raw_entry + ) if entry.notes == "continuation" and result: prior, prior_span = result[-1] joined = f"{prior.raw_text.rstrip()} {entry.raw_text.lstrip()}".strip() diff --git a/tests/unit/test_make_verifier_bundle.py b/tests/unit/test_make_verifier_bundle.py index b51c090..ca45694 100644 --- a/tests/unit/test_make_verifier_bundle.py +++ b/tests/unit/test_make_verifier_bundle.py @@ -343,6 +343,43 @@ def test_merge_with_spans_empty_input() -> None: assert _merge_with_spans([]) == [] +def test_merge_with_spans_drops_crossed_out_tag() -> None: + """Empirical precision of Gemini's `crossed_out` is ~22% (8 false + positives per 11 emits, measured n=20 on 1990-04apr0106 and reproduced + on 1990-04apr1318). Stripping the tag at bake time eliminates the + false-positive review action; Alex marks the few true positives + himself by toggling the dropdown. The raw_text is preserved verbatim — + only the notes value is reset.""" + entries = [ + Entry(row_index=0, raw_text="Pixies - Debaser", confidence="high", notes="crossed_out"), + Entry(row_index=1, raw_text="Sonic Youth - Sugar Kane", confidence="high"), + ] + result = _merge_with_spans(entries) + assert len(result) == 2 + merged_first, span_first = result[0] + assert merged_first.notes is None, "expected crossed_out to be stripped from the bundle output" + assert merged_first.raw_text == "Pixies - Debaser" + assert span_first == 1 + + +def test_merge_with_spans_drops_crossed_out_before_continuation_merge() -> None: + """`crossed_out` stripping must happen before the continuation merge, + so a crossed_out predecessor doesn't suppress the merge or carry the + tag onto a logically-multi-row entry.""" + entries = [ + Entry(row_index=0, raw_text="Galaxie 500 -", confidence="high", notes="crossed_out"), + Entry(row_index=1, raw_text="Tugboat", confidence="medium", notes="continuation"), + ] + result = _merge_with_spans(entries) + assert len(result) == 1 + merged, span = result[0] + assert merged.raw_text == "Galaxie 500 - Tugboat" + assert span == 2 + # The merged predecessor's tag should be `double_height` from the merge + # rule, NOT `crossed_out` (which we just stripped). + assert merged.notes == "double_height" + + # -- make_bundle ------------------------------------------------------------ diff --git a/tests/unit/test_prompts.py b/tests/unit/test_prompts.py index cea4cc7..9de6d22 100644 --- a/tests/unit/test_prompts.py +++ b/tests/unit/test_prompts.py @@ -41,30 +41,19 @@ def test_prompt_lists_every_phase1_notes_tag(tag: str) -> None: assert tag in PAGE_EXTRACTION_PROMPT -def test_prompt_double_height_emits_single_entry_on_upper_row() -> None: - """Drift observed 2026-06-04: fresh Gemini was splitting a 2-grid-row - handwritten entry into two Entries and tagging the second one - `continuation`, instead of emitting one Entry tagged `double_height`. - The prompt must explicitly direct against the split shape — a single - `double_height` definition without the negation reproduces the drift.""" +def test_prompt_continuation_describes_split_shape_for_wraps() -> None: + """For multi-line wraps, the model should emit the wrap as a separate + Entry tagged `continuation` (the natural split shape). The bundle baker + merges those into the prior row at write time, so the prompt must NOT + ask the model to inline the wrap itself — that would suppress the + second printed line's text on cases the model can't visually classify + as one-vs-two entries.""" text = PAGE_EXTRACTION_PROMPT - assert "SINGLE Entry" in text - # The negation: do not also emit a row on the lower line. Allow either - # "separate" or "second" wording — both are valid phrasings of the rule. - # Normalise whitespace because the prompt is wrapped. - normalised = " ".join(text.lower().split()) - assert "do not also emit a separate entry on the lower" in normalised or ( - "do not also emit a second entry on the lower" in normalised - ) - - -def test_prompt_continuation_definition_excludes_tall_handwriting() -> None: - """`continuation` is for the SEPARATE-fragment case (visible arrow / - re-write below). The wording must steer the model away from using - `continuation` for tall handwriting that spans two grid rows — that - case belongs to `double_height`.""" - # The directive: tall handwriting routes to double_height, not continuation. - assert 'use "double_height" instead' in PAGE_EXTRACTION_PROMPT + # The wrap-fragment-as-its-own-Entry directive. + assert "its own Entry on the lower" in text + # The "don't inline" negation that prevents the iter-1 truncation regression. + lowered = " ".join(text.lower().split()) + assert "do not try to inline the wrap" in lowered def test_prompt_crossed_out_excludes_margin_marks() -> None: @@ -234,17 +223,14 @@ def test_quadrant_template_lists_every_phase1_notes_tag(tag: str) -> None: assert tag in QUADRANT_EXTRACTION_PROMPT_TEMPLATE -def test_quadrant_template_double_height_emits_single_entry_on_upper_row() -> None: +def test_quadrant_template_continuation_describes_split_shape_for_wraps() -> None: + """Parallel of the PAGE version: wraps emit as their own Entry tagged + `continuation`, and the prompt must NOT ask the model to inline the + wrap into the prior row.""" text = QUADRANT_EXTRACTION_PROMPT_TEMPLATE - assert "SINGLE Entry" in text - normalised = " ".join(text.lower().split()) - assert "do not also emit a separate entry on the lower" in normalised or ( - "do not also emit a second entry on the lower" in normalised - ) - - -def test_quadrant_template_continuation_excludes_tall_handwriting() -> None: - assert 'use "double_height" instead' in QUADRANT_EXTRACTION_PROMPT_TEMPLATE + assert "its own Entry on the lower" in text + lowered = " ".join(text.lower().split()) + assert "do not try to inline the wrap" in lowered def test_quadrant_template_crossed_out_excludes_margin_marks() -> None: