diff --git a/docs/DECISIONS.md b/docs/DECISIONS.md index 0ad5f59..4117f3c 100644 --- a/docs/DECISIONS.md +++ b/docs/DECISIONS.md @@ -448,3 +448,24 @@ the target product is home iPhone video, not GuitarSet. A silent default would hide a dataset-specific learned bias inside every decode. Keeping it explicit preserves baseline behavior while allowing the coordinator to run the exact home-video ablation before deciding whether to promote it. + +--- + +## 2026-05-07 — Phase 8 smoke eval is deterministic without external data + +**Phase:** 8 (eval harness hardening) +**Decision tree:** Phase 8 determinism gate and deferred Phase 1.5/3/4 debt audit +**Branch taken:** **Use a dependency-light synthetic smoke scope for CI, and +emit explicit blockers for full model-backed eval until the manifest and labels +exist.** +**Evidence:** `docs/EVAL_REPORTS/eval_full_20260507T000000Z.json` and +`.md` report 0 manifest clips, all four required tiers missing, Phase 3 +preflight/fretboard labels at 0/10 and 0/5, and Phase 4 hand labels at +0/100. `python -m scripts.eval.run --scope smoke --twice-and-diff +--timestamp 2026-05-07T00:00:00Z` reports `deterministic=true` with a +180-second smoke budget. +**Reasoning:** Full eval cannot honestly produce audio-only/audio+vision/prior +or confidence-calibration metrics without the Phase 1.5 manifest and external +media/annotations. The smoke scope still exercises the same report writer and +fixed output format in CI, so Phase 8 hardening can progress without masking the +remaining data-bound acceptance debt. 
diff --git a/docs/EVAL_REPORTS/eval_full_20260507T000000Z.json b/docs/EVAL_REPORTS/eval_full_20260507T000000Z.json new file mode 100644 index 0000000..7b98422 --- /dev/null +++ b/docs/EVAL_REPORTS/eval_full_20260507T000000Z.json @@ -0,0 +1,156 @@ +{ + "ablations": [ + { + "blocker": "No complete Phase 1.5 manifest plus local media/annotations are available for model-backed eval.", + "chord_accuracy": null, + "onset_f1": null, + "pitch_f1": null, + "status": "blocked", + "tab_f1": null, + "variant": "audio_only" + }, + { + "blocker": "No complete Phase 1.5 manifest plus local media/annotations are available for model-backed eval.", + "chord_accuracy": null, + "onset_f1": null, + "pitch_f1": null, + "status": "blocked", + "tab_f1": null, + "variant": "audio_vision" + }, + { + "blocker": "No complete Phase 1.5 manifest plus local media/annotations are available for model-backed eval.", + "chord_accuracy": null, + "onset_f1": null, + "pitch_f1": null, + "status": "blocked", + "tab_f1": null, + "variant": "audio_vision_prior" + } + ], + "confidence_calibration": { + "bins": 10, + "blocker": "confidence calibration is blocked until scored predictions with per-event confidence exist for manifest clips (currently 0 clips).", + "ece": null, + "metric": "ece", + "status": "blocked" + }, + "manifest": { + "clip_count": 0, + "clip_ids": [], + "items": [ + { + "clip_id": null, + "code": "MISSING_TIER", + "message": "Add at least one eval clip for required tier 'clean_acoustic_single_line'.", + "severity": "fail" + }, + { + "clip_id": null, + "code": "MISSING_TIER", + "message": "Add at least one eval clip for required tier 'clean_acoustic_strummed'.", + "severity": "fail" + }, + { + "clip_id": null, + "code": "MISSING_TIER", + "message": "Add at least one eval clip for required tier 'clean_electric'.", + "severity": "fail" + }, + { + "clip_id": null, + "code": "MISSING_TIER", + "message": "Add at least one eval clip for required tier 'distorted_electric'.", + "severity": "fail" 
+ }, + { + "clip_id": null, + "code": "TOO_FEW_CLIPS", + "message": "Phase 1.5 requires >= 15 clips; found 0.", + "severity": "fail" + } + ], + "manifest_path": "data/eval/manifest.toml", + "missing_tiers": [ + "clean_acoustic_single_line", + "clean_acoustic_strummed", + "clean_electric", + "distorted_electric" + ], + "passed": false, + "present_tiers": [] + }, + "phase_debt": { + "phase_1_5": { + "command": "tabvision-eval --manifest tabvision/data/eval/manifest.toml --check", + "gate": "manifest completeness and all required tiers represented" + }, + "phase_3": { + "fretboard": { + "command": "pytest -m fretboard_eval tests/eval/test_phase3_eval.py", + "required_labels": 5, + "status": "blocked", + "usable_labels": 0 + }, + "guitar_detector": { + "evidence": "docs/DECISIONS.md#2026-05-05-phase-3-detector-acceptance", + "metric": "neck mAP50=0.995", + "status": "passed_documented" + }, + "preflight": { + "command": "pytest -m preflight_eval tests/eval/test_phase3_eval.py", + "required_labels": 10, + "status": "blocked", + "usable_labels": 0 + } + }, + "phase_4": { + "hand": { + "command": "pytest -m hand_eval tests/eval/test_phase4_eval.py", + "required_fretting_labels": 100, + "status": "blocked", + "usable_fretting_labels": 0 + } + } + }, + "schema_version": 1, + "scope": "full", + "seed": 0, + "smoke": { + "enabled": false, + "passed": false, + "subset": [] + }, + "smoke_budget_s": 180.0, + "tier_breakdown": [ + { + "clip_count": 0, + "status": "blocked", + "tab_f1": null, + "tab_f1_target": 0.94, + "tier": "clean_acoustic_single_line" + }, + { + "clip_count": 0, + "status": "blocked", + "tab_f1": null, + "tab_f1_target": 0.86, + "tier": "clean_acoustic_strummed" + }, + { + "clip_count": 0, + "status": "blocked", + "tab_f1": null, + "tab_f1_target": 0.9, + "tier": "clean_electric" + }, + { + "clip_count": 0, + "status": "blocked", + "tab_f1": null, + "tab_f1_target": 0.82, + "tier": "distorted_electric" + } + ], + "timestamp": "2026-05-07T00:00:00Z" +} diff 
--git a/docs/EVAL_REPORTS/eval_full_20260507T000000Z.md b/docs/EVAL_REPORTS/eval_full_20260507T000000Z.md new file mode 100644 index 0000000..a5d2bab --- /dev/null +++ b/docs/EVAL_REPORTS/eval_full_20260507T000000Z.md @@ -0,0 +1,44 @@ +# Eval Debt And Harness Report (full) + +Timestamp: `2026-05-07T00:00:00Z` +Seed: `0` +Smoke budget target: < 180 s + +## Phase 1.5 Manifest + +- Passed: `False` +- Clips: `0` +- Missing tiers: `clean_acoustic_single_line, clean_acoustic_strummed, clean_electric, distorted_electric` + +## Per-Tier Breakdown + +| Tier | Clips | Target Tab F1 | Status | Current Tab F1 | +|---|---:|---:|---|---:| +| clean_acoustic_single_line | 0 | 0.94 | blocked | | +| clean_acoustic_strummed | 0 | 0.86 | blocked | | +| clean_electric | 0 | 0.90 | blocked | | +| distorted_electric | 0 | 0.82 | blocked | | + +## Phase 3/4 Acceptance Debt + +| Gate | Status | Evidence / Blocker | Command | +|---|---|---|---| +| Phase 3 guitar detector | passed_documented | neck mAP50=0.995 (docs/DECISIONS.md#2026-05-05-phase-3-detector-acceptance) | current report | +| Phase 3 preflight | blocked | 0/10 labels | `pytest -m preflight_eval tests/eval/test_phase3_eval.py` | +| Phase 3 fretboard | blocked | 0/5 labels | `pytest -m fretboard_eval tests/eval/test_phase3_eval.py` | +| Phase 4 hand | blocked | 0/100 fretting labels | `pytest -m hand_eval tests/eval/test_phase4_eval.py` | + +## Ablations + +| Variant | Status | Onset F1 | Pitch F1 | Tab F1 | Chord Acc | Blocker | +|---|---|---:|---:|---:|---:|---| +| audio_only | blocked | | | | | No complete Phase 1.5 manifest plus local media/annotations are available for model-backed eval. | +| audio_vision | blocked | | | | | No complete Phase 1.5 manifest plus local media/annotations are available for model-backed eval. | +| audio_vision_prior | blocked | | | | | No complete Phase 1.5 manifest plus local media/annotations are available for model-backed eval. 
| + +## Confidence Calibration + +- Status: `blocked` +- Metric: `ece` with `10` bins +- ECE: `` +- Blocker: confidence calibration is blocked until scored predictions with per-event confidence exist for manifest clips (currently 0 clips). diff --git a/tabvision/data/eval/manifest.toml b/tabvision/data/eval/manifest.toml new file mode 100644 index 0000000..da79dc5 --- /dev/null +++ b/tabvision/data/eval/manifest.toml @@ -0,0 +1,17 @@ +# Phase 1.5 eval manifest. +# +# This file is intentionally a checked-in placeholder until the external +# datasets are registered under $TABVISION_DATA_ROOT. The validator reports +# the concrete missing clips and tiers: +# +# tabvision-eval --manifest tabvision/data/eval/manifest.toml --check +# +# Required clip schema: +# +# [[clips]] +# id = "guitarset-05-example" +# tier = "clean_acoustic_single_line" +# source = "GuitarSet" +# split = "validation" +# media_path = "$TABVISION_DATA_ROOT/guitarset/audio_mono-mic/05_example_mic.wav" +# annotation_path = "$TABVISION_DATA_ROOT/guitarset/annotation/05_example.jams" diff --git a/tabvision/pyproject.toml b/tabvision/pyproject.toml index 3b71a8e..9294a7e 100644 --- a/tabvision/pyproject.toml +++ b/tabvision/pyproject.toml @@ -80,6 +80,7 @@ dev = [ [project.scripts] tabvision = "tabvision.cli:main" +tabvision-eval = "tabvision.eval.runner:main" [tool.hatch.version] path = "tabvision/__init__.py" diff --git a/tabvision/scripts/eval/run.py b/tabvision/scripts/eval/run.py new file mode 100644 index 0000000..97298f6 --- /dev/null +++ b/tabvision/scripts/eval/run.py @@ -0,0 +1,13 @@ +"""One-command eval runner. 
+ +Examples: + python -m scripts.eval.run --scope full + python -m scripts.eval.run --scope smoke --twice-and-diff +""" + +from __future__ import annotations + +from tabvision.eval.runner import main + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tabvision/tabvision/eval/manifest.py b/tabvision/tabvision/eval/manifest.py new file mode 100644 index 0000000..e9d9878 --- /dev/null +++ b/tabvision/tabvision/eval/manifest.py @@ -0,0 +1,247 @@ +"""Eval manifest validation for Phase 1.5 and Phase 8 reports.""" + +from __future__ import annotations + +import json +import tomllib +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Literal + +REQUIRED_TIERS: tuple[str, ...] = ( + "clean_acoustic_single_line", + "clean_acoustic_strummed", + "clean_electric", + "distorted_electric", +) + +OPTIONAL_TIERS: tuple[str, ...] = ("iphone_ood",) +ALLOWED_TIERS = REQUIRED_TIERS + OPTIONAL_TIERS +REQUIRED_CLIP_FIELDS: tuple[str, ...] = ( + "id", + "tier", + "source", + "split", + "media_path", + "annotation_path", +) +ALLOWED_SPLITS: tuple[str, ...] 
def validate_manifest(path: str | Path) -> ManifestValidation:
    """Validate the Phase 1.5 eval manifest at *path* and summarize the result.

    The manifest is a TOML file with ``[[clips]]`` tables. Validation never
    raises for a missing, unreadable, or malformed manifest; every problem is
    reported as a ``ManifestIssue`` with severity ``"fail"`` instead.

    Args:
        path: Location of the manifest TOML file.

    Returns:
        A ``ManifestValidation`` whose ``passed`` flag is true only when no
        ``"fail"`` issue was recorded. ``clip_count`` counts every entry of
        the ``clips`` list (including entries that are not tables), and
        ``clip_ids`` / ``present_tiers`` are sorted for stable output.
    """
    manifest_path = Path(path)

    def _rejected(issues: list[ManifestIssue]) -> ManifestValidation:
        # Common result shape for "the manifest could not be loaded at all".
        return ManifestValidation(
            manifest_path=str(manifest_path),
            passed=False,
            clip_count=0,
            clip_ids=[],
            present_tiers=[],
            missing_tiers=list(REQUIRED_TIERS),
            items=issues,
        )

    if not manifest_path.exists():
        missing_items = [
            ManifestIssue(
                severity="fail",
                code="MANIFEST_MISSING",
                message=f"Create eval manifest at {manifest_path}.",
            ),
            ManifestIssue(
                severity="fail",
                code="TOO_FEW_CLIPS",
                message=f"Phase 1.5 requires >= {MIN_PHASE15_CLIPS} clips; found 0.",
            ),
        ]
        missing_items.extend(_missing_tier_issues(REQUIRED_TIERS))
        return _rejected(missing_items)

    try:
        payload = tomllib.loads(manifest_path.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError) as exc:
        # The path exists but cannot be read as UTF-8 text (e.g. it is a
        # directory, permissions are missing, or the bytes are not UTF-8).
        # Report it like any other manifest failure rather than crashing.
        return _rejected(
            [
                ManifestIssue(
                    severity="fail",
                    code="MANIFEST_READ_ERROR",
                    message=f"Cannot read eval manifest at {manifest_path}: {exc}",
                )
            ]
        )
    except tomllib.TOMLDecodeError as exc:
        return _rejected(
            [
                ManifestIssue(
                    severity="fail",
                    code="MANIFEST_PARSE_ERROR",
                    message=str(exc),
                )
            ]
        )

    raw_clips = payload.get("clips", [])
    clips = raw_clips if isinstance(raw_clips, list) else []
    items: list[ManifestIssue] = []
    if not isinstance(raw_clips, list):
        items.append(
            ManifestIssue(
                severity="fail",
                code="CLIPS_NOT_LIST",
                message="Manifest must define [[clips]] entries.",
            )
        )

    ids: list[str] = []
    tiers: set[str] = set()
    seen_ids: set[str] = set()
    for index, clip in enumerate(clips):
        if not isinstance(clip, dict):
            items.append(
                ManifestIssue(
                    severity="fail",
                    code="CLIP_NOT_TABLE",
                    message=f"Clip entry {index} must be a TOML table.",
                )
            )
            continue

        # A clip without a usable id gets a per-index placeholder so that
        # several id-less clips stay distinguishable in the report and are not
        # misreported as duplicates of one another (MISSING_ID still fires).
        clip_id = _string_field(clip, "id") or f"<clip-{index}>"
        if clip_id in seen_ids:
            items.append(
                ManifestIssue(
                    severity="fail",
                    code="DUPLICATE_ID",
                    message=f"Duplicate clip id {clip_id!r}.",
                    clip_id=clip_id,
                )
            )
        seen_ids.add(clip_id)
        ids.append(clip_id)

        for field in REQUIRED_CLIP_FIELDS:
            if not _string_field(clip, field):
                items.append(
                    ManifestIssue(
                        severity="fail",
                        code=f"MISSING_{field.upper()}",
                        message=f"Clip {clip_id!r} is missing required field {field!r}.",
                        clip_id=clip_id,
                    )
                )

        tier = _string_field(clip, "tier")
        if tier:
            if tier not in ALLOWED_TIERS:
                items.append(
                    ManifestIssue(
                        severity="fail",
                        code="UNKNOWN_TIER",
                        message=(
                            f"Clip {clip_id!r} has tier {tier!r}; expected one of "
                            f"{', '.join(ALLOWED_TIERS)}."
                        ),
                        clip_id=clip_id,
                    )
                )
            else:
                tiers.add(tier)

        split = _string_field(clip, "split")
        if split and split not in ALLOWED_SPLITS:
            items.append(
                ManifestIssue(
                    severity="fail",
                    code="UNKNOWN_SPLIT",
                    message=(
                        f"Clip {clip_id!r} has split {split!r}; expected one of "
                        f"{', '.join(ALLOWED_SPLITS)}."
                    ),
                    clip_id=clip_id,
                )
            )

    if len(clips) < MIN_PHASE15_CLIPS:
        items.append(
            ManifestIssue(
                severity="fail",
                code="TOO_FEW_CLIPS",
                message=(f"Phase 1.5 requires >= {MIN_PHASE15_CLIPS} clips; found {len(clips)}."),
            )
        )
    missing_tiers = [tier for tier in REQUIRED_TIERS if tier not in tiers]
    items.extend(_missing_tier_issues(missing_tiers))
    # Sort on a total key so the serialized report is byte-stable across runs.
    items.sort(key=lambda item: (item.severity, item.code, item.clip_id or "", item.message))

    return ManifestValidation(
        manifest_path=str(manifest_path),
        passed=not any(item.severity == "fail" for item in items),
        clip_count=len(clips),
        clip_ids=sorted(ids),
        present_tiers=sorted(tiers),
        missing_tiers=missing_tiers,
        items=items,
    )
def run_eval(
    *,
    manifest_path: str | Path = DEFAULT_MANIFEST,
    output_dir: str | Path = DEFAULT_OUTPUT_DIR,
    scope: Scope = "full",
    seed: int = 0,
    timestamp: str | None = None,
    eval_root: str | Path = DEFAULT_EVAL_ROOT,
) -> EvalRunResult:
    """Build the eval/debt report and write it as JSON and Markdown.

    No model is executed here. The report combines manifest validation,
    label-count based phase debt, per-tier rows, ablation rows, a confidence
    calibration section, and the smoke summary. Both files are written to
    ``output_dir`` (created if needed) under the stem
    ``eval_<scope>_<timestamp slug>``.

    Args:
        manifest_path: Eval manifest to validate.
        output_dir: Directory that receives the ``.json`` and ``.md`` files.
        scope: ``"full"`` or ``"smoke"``.
        seed: Recorded verbatim in the report payload.
        timestamp: Report timestamp; the current UTC time is used when falsy.
        eval_root: Directory scanned for Phase 3/4 label files.

    Returns:
        An ``EvalRunResult`` with the written paths and the exact content.
        ``passed`` mirrors the smoke summary for ``scope="smoke"`` and the
        manifest validation otherwise.
    """
    report_ts = timestamp if timestamp else _utc_timestamp()
    validation = validate_manifest(manifest_path)
    debt_section = _phase_debt(eval_root)
    tier_rows = _tier_breakdown(validation)
    ablation_rows = _ablation_rows(scope)
    calibration_section = _confidence_calibration(scope, validation)
    smoke_section = _smoke_summary(scope)

    report: dict[str, object] = {
        "schema_version": 1,
        "timestamp": report_ts,
        "seed": seed,
        "scope": scope,
        "smoke_budget_s": SMOKE_BUDGET_S,
        "manifest": validation.to_dict(),
        "tier_breakdown": tier_rows,
        "phase_debt": debt_section,
        "ablations": ablation_rows,
        "confidence_calibration": calibration_section,
        "smoke": smoke_section,
    }
    encoded_json = _json_bytes(report)
    rendered_markdown = _markdown_report(report)

    # Both artifacts share one stem so a run is easy to locate on disk.
    destination = Path(output_dir)
    destination.mkdir(parents=True, exist_ok=True)
    base_name = f"eval_{scope}_{_timestamp_slug(report_ts)}"
    json_target = destination / f"{base_name}.json"
    markdown_target = destination / f"{base_name}.md"
    json_target.write_bytes(encoded_json)
    markdown_target.write_text(rendered_markdown, encoding="utf-8")

    if scope == "smoke":
        outcome = bool(smoke_section["passed"])
    else:
        outcome = bool(validation.passed)

    return EvalRunResult(
        json_path=json_target,
        markdown_path=markdown_target,
        json_bytes=encoded_json,
        markdown=rendered_markdown,
        passed=outcome,
    )
def _tier_breakdown(manifest: ManifestValidation) -> list[dict[str, object]]:
    """Return one report row per required tier with its manifest clip count.

    The manifest file named by ``manifest.manifest_path`` is re-read here. A
    manifest that is missing, unreadable, not valid UTF-8/TOML, or whose
    ``clips`` value is not a list contributes zero clips to every tier, so
    this function never raises for a bad manifest.

    Args:
        manifest: Validation result; only ``manifest_path`` is consulted.

    Returns:
        Rows in ``REQUIRED_TIERS`` order. ``status`` is ``"blocked"`` for a
        tier with no clips and ``"pending_metrics"`` otherwise; ``tab_f1`` is
        always ``None`` because no metric is computed here.
    """
    import tomllib

    counts = dict.fromkeys(REQUIRED_TIERS, 0)
    try:
        payload = tomllib.loads(Path(manifest.manifest_path).read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: TOMLDecodeError and
        # UnicodeDecodeError are both ValueError subclasses.
        payload = {}

    clips = payload.get("clips", [])
    if not isinstance(clips, list):
        # e.g. ``clips = 5`` or a ``[clips]`` table: nothing countable, and a
        # scalar must not be iterated.
        clips = []

    for clip in clips:
        if not isinstance(clip, dict):
            continue
        tier = clip.get("tier")
        if isinstance(tier, str) and tier in counts:
            counts[tier] += 1

    return [
        {
            "tier": tier,
            "clip_count": counts[tier],
            "status": "blocked" if counts[tier] == 0 else "pending_metrics",
            "tab_f1_target": _tier_target(tier),
            "tab_f1": None,
        }
        for tier in REQUIRED_TIERS
    ]
def _confidence_calibration(scope: Scope, manifest: ManifestValidation) -> dict[str, object]:
    """Return the confidence-calibration section of the report.

    For ``scope == "smoke"`` the section carries the fixed synthetic values
    (ECE ``0.0``, no blocker). For any other scope the section is marked
    ``"blocked"`` with an explanatory message that quotes
    ``manifest.clip_count``.
    """
    is_smoke = scope == "smoke"
    section: dict[str, object] = {
        "status": "synthetic_smoke" if is_smoke else "blocked",
        "metric": "ece",
        "bins": 10,
        "ece": 0.0 if is_smoke else None,
        "blocker": None,
    }
    if is_smoke:
        return section

    clip_total = manifest.clip_count
    section["blocker"] = (
        "confidence calibration is blocked until scored predictions with per-event "
        f"confidence exist for manifest clips (currently {clip_total} clips)."
    )
    return section
lines.extend(_phase_debt_rows(payload["phase_debt"])) + lines.extend( + [ + "", + "## Ablations", + "", + "| Variant | Status | Onset F1 | Pitch F1 | Tab F1 | Chord Acc | Blocker |", + "|---|---|---:|---:|---:|---:|---|", + ] + ) + for row in _list_of_dicts(payload["ablations"]): + lines.append( + f"| {row['variant']} | {row['status']} | {_metric_value(row['onset_f1'])} | " + f"{_metric_value(row['pitch_f1'])} | {_metric_value(row['tab_f1'])} | " + f"{_metric_value(row['chord_accuracy'])} | {row['blocker'] or ''} |" + ) + + lines.extend( + [ + "", + "## Confidence Calibration", + "", + f"- Status: `{calibration['status']}`", + f"- Metric: `{calibration['metric']}` with `{calibration['bins']}` bins", + f"- ECE: `{_metric_value(calibration['ece'])}`", + ] + ) + if calibration.get("blocker"): + lines.append(f"- Blocker: {calibration['blocker']}") + return "\n".join(lines) + "\n" + + +def _phase_debt_rows(phase_debt: object) -> list[str]: + assert isinstance(phase_debt, dict) + phase3 = phase_debt["phase_3"] + phase4 = phase_debt["phase_4"] + assert isinstance(phase3, dict) + assert isinstance(phase4, dict) + rows: list[str] = [] + guitar = phase3["guitar_detector"] + preflight = phase3["preflight"] + fretboard = phase3["fretboard"] + hand = phase4["hand"] + assert isinstance(guitar, dict) + assert isinstance(preflight, dict) + assert isinstance(fretboard, dict) + assert isinstance(hand, dict) + rows.append( + "| Phase 3 guitar detector | " + f"{guitar['status']} | {guitar['metric']} ({guitar['evidence']}) | current report |" + ) + rows.append( + "| Phase 3 preflight | " + f"{preflight['status']} | {preflight['usable_labels']}/" + f"{preflight['required_labels']} labels | `{preflight['command']}` |" + ) + rows.append( + "| Phase 3 fretboard | " + f"{fretboard['status']} | {fretboard['usable_labels']}/" + f"{fretboard['required_labels']} labels | `{fretboard['command']}` |" + ) + rows.append( + "| Phase 4 hand | " + f"{hand['status']} | 
def _hand_fretting_label_count(path: Path) -> int:
    """Count fingers marked ``"is_fretting": true`` in the label files of *path*.

    Every ``*.json`` file directly inside *path* is read. The expected layout
    is ``{"frames": [{"fingers": [{"is_fretting": true, ...}, ...]}, ...]}``.
    Anything that does not match that layout is skipped rather than raising:
    unreadable files, files that are not UTF-8 or not valid JSON, a top-level
    value that is not an object, and non-list ``frames`` / ``fingers`` values.

    Args:
        path: Directory holding the fingering label files.

    Returns:
        The number of finger entries whose ``is_fretting`` is exactly ``True``;
        ``0`` when *path* is not a directory.
    """
    if not path.is_dir():
        return 0

    total = 0
    # Sorted for a stable traversal order regardless of filesystem ordering.
    for label_path in sorted(path.glob("*.json")):
        try:
            payload = json.loads(label_path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            continue
        if not isinstance(payload, dict):
            # Valid JSON but not an object (e.g. a top-level list): no frames.
            continue

        frames = payload.get("frames", [])
        if not isinstance(frames, list):
            continue

        for frame in frames:
            if not isinstance(frame, dict):
                continue
            fingers = frame.get("fingers", [])
            if not isinstance(fingers, list):
                continue
            # ``is True`` on purpose: truthy non-boolean values do not count.
            total += sum(
                1
                for finger in fingers
                if isinstance(finger, dict) and finger.get("is_fretting") is True
            )
    return total
def _main_twice_and_diff(args: argparse.Namespace) -> int:
    """Run the smoke eval twice with identical inputs and compare the output.

    Both runs write into separate sub-directories of one temporary directory
    and always use ``scope="smoke"``. The timestamp comes from
    ``args.timestamp`` or, when that is falsy, a fixed default so the two
    reports can match. Prints ``deterministic=<bool>`` and
    ``smoke_budget_s=<int>``.

    Returns:
        ``0`` when the JSON bytes and the Markdown text of both runs are
        identical, ``1`` otherwise.
    """
    pinned_ts = args.timestamp if args.timestamp else "2026-05-07T00:00:00Z"
    with tempfile.TemporaryDirectory() as scratch:
        scratch_root = Path(scratch)
        baseline, repeat = (
            run_eval(
                manifest_path=args.manifest,
                output_dir=scratch_root / label,
                scope="smoke",
                seed=args.seed,
                timestamp=pinned_ts,
                eval_root=args.eval_root,
            )
            for label in ("a", "b")
        )

    # The in-memory content is compared, so the temp directory may be gone.
    matches = (baseline.json_bytes, baseline.markdown) == (repeat.json_bytes, repeat.markdown)
    print(f"deterministic={str(matches).lower()}")
    print(f"smoke_budget_s={int(SMOKE_BUDGET_S)}")
    return 0 if matches else 1
def test_phase8_smoke_eval_is_deterministic_and_under_budget(tmp_path) -> None:
    """Two smoke runs with identical inputs match and finish within budget.

    The manifest and the eval root both point at paths inside ``tmp_path``
    that do not exist, so the result does not depend on label files that may
    or may not be present in the checkout running the test.
    """
    # Imported here so the asserted budget cannot drift from the runner's own.
    from tabvision.eval.runner import SMOKE_BUDGET_S

    manifest = tmp_path / "missing-manifest.toml"
    eval_root = tmp_path / "missing-eval-root"
    started = time.perf_counter()

    first = run_eval(
        manifest_path=manifest,
        output_dir=tmp_path / "run-a",
        scope="smoke",
        seed=0,
        timestamp="2026-05-07T00:00:00Z",
        eval_root=eval_root,
    )
    second = run_eval(
        manifest_path=manifest,
        output_dir=tmp_path / "run-b",
        scope="smoke",
        seed=0,
        timestamp="2026-05-07T00:00:00Z",
        eval_root=eval_root,
    )

    elapsed_s = time.perf_counter() - started
    assert first.json_bytes == second.json_bytes
    assert first.markdown == second.markdown
    assert elapsed_s < SMOKE_BUDGET_S
assert not result.passed + assert result.clip_count == 0 + assert result.missing_tiers == list(REQUIRED_TIERS) + assert any(item.code == "MANIFEST_MISSING" for item in result.items) + + +def test_manifest_reports_missing_required_clip_fields(tmp_path: Path) -> None: + manifest = tmp_path / "manifest.toml" + manifest.write_text( + """ +[[clips]] +id = "clip-a" +tier = "clean_acoustic_single_line" +source = "GuitarSet" +split = "validation" +""".strip() + + "\n", + encoding="utf-8", + ) + + result = validate_manifest(manifest) + + assert not result.passed + assert result.clip_count == 1 + assert {item.code for item in result.items if item.clip_id == "clip-a"} >= { + "MISSING_MEDIA_PATH", + "MISSING_ANNOTATION_PATH", + } + + +def test_manifest_validation_is_json_serializable_and_sorted(tmp_path: Path) -> None: + manifest = tmp_path / "manifest.toml" + manifest.write_text( + """ +[[clips]] +id = "b" +tier = "distorted_electric" +source = "EGDB" +split = "test" +media_path = "$TABVISION_DATA_ROOT/egdb/b.wav" +annotation_path = "$TABVISION_DATA_ROOT/egdb/b.jams" + +[[clips]] +id = "a" +tier = "clean_acoustic_strummed" +source = "GuitarSet" +split = "validation" +media_path = "$TABVISION_DATA_ROOT/guitarset/a.wav" +annotation_path = "$TABVISION_DATA_ROOT/guitarset/a.jams" +""".strip() + + "\n", + encoding="utf-8", + ) + + first = validate_manifest(manifest).to_json_bytes() + second = validate_manifest(manifest).to_json_bytes() + + assert first == second + payload = json.loads(first) + assert payload["clip_ids"] == ["a", "b"] + assert payload["present_tiers"] == ["clean_acoustic_strummed", "distorted_electric"] + assert tomllib.loads(manifest.read_text(encoding="utf-8"))["clips"][0]["id"] == "b" diff --git a/tabvision/tests/unit/test_eval_runner_report.py b/tabvision/tests/unit/test_eval_runner_report.py new file mode 100644 index 0000000..7c54131 --- /dev/null +++ b/tabvision/tests/unit/test_eval_runner_report.py @@ -0,0 +1,78 @@ +"""Unit tests for deterministic eval 
report generation.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from tabvision.eval.runner import run_eval + + +def test_smoke_eval_writes_byte_identical_reports(tmp_path: Path) -> None: + manifest = tmp_path / "missing-manifest.toml" + out_a = tmp_path / "a" + out_b = tmp_path / "b" + + first = run_eval( + manifest_path=manifest, + output_dir=out_a, + scope="smoke", + seed=7, + timestamp="2026-05-07T00:00:00Z", + ) + second = run_eval( + manifest_path=manifest, + output_dir=out_b, + scope="smoke", + seed=7, + timestamp="2026-05-07T00:00:00Z", + ) + + assert first.json_bytes == second.json_bytes + assert first.markdown == second.markdown + assert first.json_path.read_bytes() == second.json_path.read_bytes() + assert first.markdown_path.read_text(encoding="utf-8") == second.markdown_path.read_text( + encoding="utf-8" + ) + assert "Smoke budget target: < 180 s" in first.markdown + + +def test_full_eval_reports_phase_debt_and_ablation_blockers(tmp_path: Path) -> None: + manifest = tmp_path / "manifest.toml" + manifest.write_text( + """ +[[clips]] +id = "clip-a" +tier = "clean_acoustic_single_line" +source = "GuitarSet" +split = "validation" +media_path = "$TABVISION_DATA_ROOT/guitarset/a.wav" +annotation_path = "$TABVISION_DATA_ROOT/guitarset/a.jams" +""".strip() + + "\n", + encoding="utf-8", + ) + + result = run_eval( + manifest_path=manifest, + output_dir=tmp_path / "reports", + scope="full", + seed=0, + timestamp="2026-05-07T00:00:00Z", + ) + + payload = json.loads(result.json_bytes) + assert payload["scope"] == "full" + assert payload["manifest"]["missing_tiers"] == [ + "clean_acoustic_strummed", + "clean_electric", + "distorted_electric", + ] + assert [row["variant"] for row in payload["ablations"]] == [ + "audio_only", + "audio_vision", + "audio_vision_prior", + ] + assert all(row["status"] == "blocked" for row in payload["ablations"]) + assert "Phase 3/4 Acceptance Debt" in result.markdown + assert "confidence calibration 
is blocked" in result.markdown