diff --git a/docs/2026-05-07-session-handoff.md b/docs/2026-05-07-session-handoff.md new file mode 100644 index 0000000..62fa6a9 --- /dev/null +++ b/docs/2026-05-07-session-handoff.md @@ -0,0 +1,127 @@ +# 2026-05-07 Session Handoff + +## Current Focus + +Phase 5 is blocked on trustworthy audio-to-tab mapping before any more +video-calibration or `lambda_vision` tuning. + +The key finding from today is that highres audio is strong at GuitarSet +onset/pitch, but default pitch-to-string/fret decoding is weak. A learned +GuitarSet train-split position prior improves Tab F1 substantially, but +it should not become an unconditional production default yet. + +## What Changed Today + +- Added a raw GuitarSet audio-only evaluator: + - `tabvision/tabvision/eval/guitarset_audio.py` + - `tabvision/scripts/eval/guitarset_audio_eval.py` + - `tabvision/tests/unit/test_guitarset_audio_eval.py` +- Added learned pitch-position prior support: + - `tabvision/tabvision/fusion/position_prior.py` + - `tabvision/tests/unit/test_position_prior.py` +- Added a Modal L4 GPU runner for full highres eval: + - `tabvision/scripts/eval/guitarset_highres_modal.py` +- Recorded the Phase 5 position-prior decision in `docs/DECISIONS.md`. + +The existing GuitarSet TFRecords were inspected and are insufficient for +Tab F1 because they do not retain string/fret labels. Raw JAMS/WAV is the +correct source for Tab F1. + +## Metric Snapshot + +Full GuitarSet validation split is player `05` (60 tracks). + +Modal L4 highres eval, full validation: + +| Run | Onset F1 | Pitch F1 | Tab F1 | +| --- | ---: | ---: | ---: | +| no position prior | `0.9218` | `0.9022` | `0.3878` | +| GuitarSet train prior | `0.9218` | `0.9022` | `0.6104` | + +Delta from prior: `+22.26 pp` Tab F1, with onset/pitch unchanged. + +Per-track effect: 51/60 improved, 8/60 regressed, 1/60 unchanged. Mean +track Tab F1 moved from `0.347` to `0.589`. 
+ +Reports: + +- `tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-none-2026-05-07.md` +- `tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-guitarset-train-2026-05-07.md` + +## Promotion Decision + +Do not make the GuitarSet train prior an unconditional production default +yet. + +Recommended path: + +1. Promote the position prior as a versioned/configured production option. +2. Create a checked-in prior artifact or generator so raw GuitarSet is not + required at runtime. +3. Classify the 8 validation regressions, especially the SS/comp cases + where no-prior was already strong. +4. Run the home-video Phase 5 benchmark with and without the prior. +5. Make it the default only if the home-video benchmark has no regression + and the GuitarSet regressions are understood or accepted. + +## Modal State + +The Modal GPU path is now working. + +Useful commands: + +```bash +tabvision-server/venv/bin/modal run tabvision/scripts/eval/guitarset_highres_modal.py --limit 1 +tabvision-server/venv/bin/modal run tabvision/scripts/eval/guitarset_highres_modal.py +tabvision-server/venv/bin/modal run tabvision/scripts/eval/guitarset_highres_modal.py --position-prior none +``` + +The Modal volume `tabvision-guitarset` has been hydrated with 360 raw +GuitarSet JAMS/WAV tracks from `taohu/guitarset`. + +## Verification At Handoff + +Last full fast suite: + +```bash +cd tabvision +.venv/bin/python -m pytest -q +``` + +Result: `249 passed, 9 skipped`. + +Focused checks after the eval/Modal changes: + +```bash +cd tabvision +.venv/bin/python -m pytest tests/unit/test_guitarset_audio_eval.py tests/unit/test_position_prior.py -q +.venv/bin/python -m ruff check tabvision/eval/guitarset_audio.py tabvision/fusion/position_prior.py scripts/eval/guitarset_audio_eval.py scripts/eval/guitarset_highres_modal.py tests/unit/test_guitarset_audio_eval.py tests/unit/test_position_prior.py +``` + +Result: 11 focused tests passed; ruff passed. 
+ +## Worktree Note + +The worktree is still dirty. Some dirty files predate this handoff. The +most relevant new files from this session are: + +- `tabvision/tabvision/eval/guitarset_audio.py` +- `tabvision/tabvision/fusion/position_prior.py` +- `tabvision/scripts/eval/guitarset_audio_eval.py` +- `tabvision/scripts/eval/guitarset_highres_modal.py` +- `tabvision/tests/unit/test_guitarset_audio_eval.py` +- `tabvision/tests/unit/test_position_prior.py` +- `tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-none-2026-05-07.{md,csv}` +- `tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-guitarset-train-2026-05-07.{md,csv}` + +There are also Phase 5 eval/fusion/video diagnostics and prior Phase 5 +implementation changes in the worktree. Do not assume every dirty file +belongs to the GuitarSet work. + +## Next Best Step + +Build the production integration for the position prior as an explicit +config option/artifact, then run home-video Phase 5 with prior on/off. + +Do not proceed to Phase 6 until Phase 5 is recorded and the user explicitly +says `proceed`. diff --git a/docs/DECISIONS.md b/docs/DECISIONS.md index dcdc09b..e5275e0 100644 --- a/docs/DECISIONS.md +++ b/docs/DECISIONS.md @@ -380,3 +380,45 @@ to validate; "the hand is around frets 3-6" is a stronger, more stable visual prior for resolving audio's same-pitch string/fret ambiguity. Keeping the signal as a prior lets audio and playability override it when the visual evidence is weak or wrong. + +--- + +## 2026-05-07 — Phase 5 GuitarSet pitch-to-tab bottleneck + +**Phase:** 5 (audio-to-tab mapping) +**Decision tree:** GuitarSet audio-only diagnostic — if pitch F1 is good +but Tab F1 is bad, fix string/fret candidate selection before tuning video +calibration or `lambda_vision`. 
+**Branch taken:** **Add an optional learned pitch-position prior.** Raw +GuitarSet JAMS provide held-out string/fret labels; the evaluator now learns +`P(string,fret | pitch)` from train players and attaches it via the existing +2D `AudioEvent.fret_prior` path before audio-only Viterbi decode. +**Evidence:** On full validation, oracle gold-onset/gold-pitch events scored +only `0.4335` Tab F1 with the default decoder, proving the mapping is bad even +when audio extraction is perfect. On the first 10 validation tracks, about +two-thirds of same-pitch events landed on the wrong adjacent string/fret. Most +errors were low-fret equivalents such as G-string notes decoded on B or B-string +notes decoded on high E. A GuitarSet train-split prior raised oracle +full-validation Tab F1 to `0.6802`. On the 3-track highres smoke, Tab F1 moved +from `0.3356` to `0.7260` while onset F1 (`0.9692`) and pitch F1 (`0.9555`) +stayed unchanged. +**Reasoning:** This confirms the immediate bottleneck is pitch-to-position +ambiguity, not highres onset/pitch extraction and not Phase 5 vision weighting. +The prior is optional for now; it does not change public fusion APIs or the +default production decode until a full validation run and home-video check +justify promoting it. + +**Follow-up evidence:** A Modal L4 full-validation highres run completed on +2026-05-07. With no position prior: onset F1 `0.9218`, pitch F1 `0.9022`, +Tab F1 `0.3878`. With the GuitarSet train-split prior: onset F1 `0.9218`, +pitch F1 `0.9022`, Tab F1 `0.6104` (`+22.26 pp`). Per-track, 51/60 improved, +8/60 regressed, and 1/60 was unchanged. Mean track Tab F1 moved from `0.347` +to `0.589`. 
+**Promotion decision:** **Do not make this an unconditional production default +yet.** Promote the prior next as a versioned/configured production option, then +make it the default only after (a) a checked-in prior artifact is available +without requiring raw GuitarSet at runtime, (b) same-pitch regressions are +classified and reduced or accepted, and (c) the home-video Phase 5 benchmark +shows no regression. The full GuitarSet result is strong enough to justify the +production integration path, but the 8 regressed validation clips make a silent +global default premature. diff --git a/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-2026-05-07.csv b/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-2026-05-07.csv new file mode 100644 index 0000000..c35d4e1 --- /dev/null +++ b/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-2026-05-07.csv @@ -0,0 +1,4 @@ +track_id,backend,gold_notes,audio_events,decoded_events,onset_f1,pitch_f1,tab_f1,tab_tp,tab_fp,tab_fn +05_BN1-129-Eb_comp,highres,148,150,150,0.979866,0.973154,0.221477,33,117,115 +05_BN1-129-Eb_solo,highres,44,46,46,0.933333,0.933333,0.111111,5,41,39 +05_BN1-147-Gb_comp,highres,96,100,100,0.969388,0.938776,0.612245,60,40,36 diff --git a/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-2026-05-07.md b/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-2026-05-07.md new file mode 100644 index 0000000..0445079 --- /dev/null +++ b/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-2026-05-07.md @@ -0,0 +1,22 @@ +# GuitarSet Audio Eval (highres) + +Split: **validation** +Tracks: **3** +Gold notes: **288** +Audio events: **296** + +## Aggregate + +| Metric | Mean F1 | Micro P | Micro R | Micro F1 | +| --- | ---: | ---: | ---: | ---: | +| Onset | 0.961 | 0.956 | 0.983 | 0.969 | +| Pitch | 0.948 | 0.943 | 0.969 | 0.955 | +| Tab | 0.315 | 0.331 | 0.340 | 0.336 | + +## Per Track + +| Track | Gold 
| Audio | Decoded | Onset F1 | Pitch F1 | Tab F1 | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| `05_BN1-129-Eb_comp` | 148 | 150 | 150 | 0.980 | 0.973 | 0.221 | +| `05_BN1-129-Eb_solo` | 44 | 46 | 46 | 0.933 | 0.933 | 0.111 | +| `05_BN1-147-Gb_comp` | 96 | 100 | 100 | 0.969 | 0.939 | 0.612 | diff --git a/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-guitarset-train-2026-05-07.csv b/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-guitarset-train-2026-05-07.csv new file mode 100644 index 0000000..4e11a4b --- /dev/null +++ b/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-guitarset-train-2026-05-07.csv @@ -0,0 +1,61 @@ +track_id,backend,gold_notes,audio_events,decoded_events,onset_f1,pitch_f1,tab_f1,tab_tp,tab_fp,tab_fn +05_BN1-129-Eb_comp,highres,148,150,150,0.979866,0.973154,0.932886,139,11,9 +05_BN1-129-Eb_solo,highres,44,46,46,0.933333,0.933333,0.377778,17,29,27 +05_BN1-147-Gb_comp,highres,96,100,100,0.969388,0.938776,0.571429,56,44,40 +05_BN1-147-Gb_solo,highres,27,32,32,0.847458,0.847458,0.203390,6,26,21 +05_BN2-131-B_comp,highres,202,203,203,0.987654,0.972840,0.809877,164,39,38 +05_BN2-131-B_solo,highres,69,76,76,0.937931,0.937931,0.593103,43,33,26 +05_BN2-166-Ab_comp,highres,173,162,162,0.955224,0.931343,0.859701,144,18,29 +05_BN2-166-Ab_solo,highres,67,69,69,0.941176,0.926471,0.691176,47,22,20 +05_BN3-119-G_comp,highres,180,178,178,0.977654,0.977654,0.720670,129,49,51 +05_BN3-119-G_solo,highres,62,64,64,0.873016,0.873016,0.619048,39,25,23 +05_BN3-154-E_comp,highres,185,180,180,0.958904,0.926027,0.695890,127,53,58 +05_BN3-154-E_solo,highres,56,59,59,0.921739,0.904348,0.260870,15,44,41 +05_Funk1-114-Ab_comp,highres,181,162,153,0.844311,0.838323,0.808383,135,18,46 +05_Funk1-114-Ab_solo,highres,60,56,56,0.844828,0.793103,0.637931,37,19,23 +05_Funk1-97-C_comp,highres,177,172,172,0.974212,0.951289,0.796562,139,33,38 
+05_Funk1-97-C_solo,highres,58,57,57,0.921739,0.921739,0.330435,19,38,39 +05_Funk2-108-Eb_comp,highres,245,271,260,0.843564,0.807921,0.673267,170,90,75 +05_Funk2-108-Eb_solo,highres,61,61,61,0.983607,0.983607,0.491803,30,31,31 +05_Funk2-119-G_comp,highres,182,192,184,0.907104,0.890710,0.841530,154,30,28 +05_Funk2-119-G_solo,highres,99,96,96,0.943590,0.923077,0.256410,25,71,74 +05_Funk3-112-C#_comp,highres,212,206,206,0.928230,0.913876,0.837321,175,31,37 +05_Funk3-112-C#_solo,highres,70,64,64,0.955224,0.955224,0.686567,46,18,24 +05_Funk3-98-A_comp,highres,184,182,162,0.901734,0.895954,0.815029,141,21,43 +05_Funk3-98-A_solo,highres,65,65,65,0.969231,0.969231,0.538462,35,30,30 +05_Jazz1-130-D_comp,highres,141,118,118,0.880309,0.849421,0.687259,89,29,52 +05_Jazz1-130-D_solo,highres,58,61,61,0.924370,0.890756,0.554622,33,28,25 +05_Jazz1-200-B_comp,highres,175,168,163,0.852071,0.781065,0.491124,83,80,92 +05_Jazz1-200-B_solo,highres,50,49,49,0.969697,0.969697,0.464646,23,26,27 +05_Jazz2-110-Bb_comp,highres,252,252,242,0.910931,0.890688,0.781377,193,49,59 +05_Jazz2-110-Bb_solo,highres,103,97,97,0.940000,0.940000,0.700000,70,27,33 +05_Jazz2-187-F#_comp,highres,247,226,219,0.862661,0.836910,0.669528,156,63,91 +05_Jazz2-187-F#_solo,highres,61,60,60,0.975207,0.975207,0.280992,17,43,44 +05_Jazz3-137-Eb_comp,highres,235,231,227,0.857143,0.800866,0.683983,158,69,77 +05_Jazz3-137-Eb_solo,highres,77,76,76,0.967320,0.967320,0.496732,38,38,39 +05_Jazz3-150-C_comp,highres,227,226,219,0.910314,0.874439,0.291480,65,154,162 +05_Jazz3-150-C_solo,highres,69,67,67,0.955882,0.955882,0.764706,52,15,17 +05_Rock1-130-A_comp,highres,312,349,349,0.883510,0.877458,0.166415,55,294,257 +05_Rock1-130-A_solo,highres,60,62,62,0.950820,0.950820,0.606557,37,25,23 +05_Rock1-90-C#_comp,highres,277,276,276,0.936709,0.871609,0.867993,240,36,37 +05_Rock1-90-C#_solo,highres,99,106,106,0.926829,0.907317,0.809756,83,23,16 +05_Rock2-142-D_comp,highres,460,450,450,0.890110,0.841758,0.336264,153,297,307 
+05_Rock2-142-D_solo,highres,86,86,86,0.906977,0.906977,0.290698,25,61,61 +05_Rock2-85-F_comp,highres,325,296,296,0.856683,0.821256,0.615137,191,105,134 +05_Rock2-85-F_solo,highres,125,124,124,0.955823,0.939759,0.698795,87,37,38 +05_Rock3-117-Bb_comp,highres,398,401,401,0.878598,0.868586,0.403004,161,240,237 +05_Rock3-117-Bb_solo,highres,70,65,65,0.962963,0.962963,0.518519,35,30,35 +05_Rock3-148-C_comp,highres,249,286,286,0.927103,0.923364,0.818692,219,67,30 +05_Rock3-148-C_solo,highres,61,60,60,0.991736,0.991736,0.495868,30,30,31 +05_SS1-100-C#_comp,highres,102,95,95,0.964467,0.964467,0.395939,39,56,63 +05_SS1-100-C#_solo,highres,60,64,64,0.887097,0.870968,0.774194,48,16,12 +05_SS1-68-E_comp,highres,143,148,148,0.982818,0.975945,0.783505,114,34,29 +05_SS1-68-E_solo,highres,78,74,74,0.894737,0.881579,0.328947,25,49,53 +05_SS2-107-Ab_comp,highres,222,227,227,0.971047,0.953229,0.726058,163,64,59 +05_SS2-107-Ab_solo,highres,87,85,85,0.941860,0.941860,0.546512,47,38,40 +05_SS2-88-F_comp,highres,196,190,190,0.963731,0.943005,0.621762,120,70,76 +05_SS2-88-F_solo,highres,93,97,97,0.978947,0.968421,0.589474,56,41,37 +05_SS3-84-Bb_comp,highres,211,215,215,0.985915,0.985915,0.671362,143,72,68 +05_SS3-84-Bb_solo,highres,111,119,119,0.947826,0.947826,0.330435,38,81,73 +05_SS3-98-C_comp,highres,199,187,187,0.937824,0.932642,0.735751,142,45,57 +05_SS3-98-C_solo,highres,93,94,94,0.973262,0.973262,0.288770,27,67,66 diff --git a/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-guitarset-train-2026-05-07.md b/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-guitarset-train-2026-05-07.md new file mode 100644 index 0000000..3354c72 --- /dev/null +++ b/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-guitarset-train-2026-05-07.md @@ -0,0 +1,80 @@ +# GuitarSet Audio Eval (highres) + +Split: **validation** +Position prior: **guitarset-train** +Tracks: **60** +Gold notes: **8715** +Audio events: **8690** + +## Aggregate + 
+| Metric | Mean F1 | Micro P | Micro R | Micro F1 | +| --- | ---: | ---: | ---: | ---: | +| Onset | 0.930 | 0.928 | 0.916 | 0.922 | +| Pitch | 0.915 | 0.908 | 0.897 | 0.902 | +| Tab | 0.589 | 0.614 | 0.607 | 0.610 | + +## Per Track + +| Track | Gold | Audio | Decoded | Onset F1 | Pitch F1 | Tab F1 | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| `05_BN1-129-Eb_comp` | 148 | 150 | 150 | 0.980 | 0.973 | 0.933 | +| `05_BN1-129-Eb_solo` | 44 | 46 | 46 | 0.933 | 0.933 | 0.378 | +| `05_BN1-147-Gb_comp` | 96 | 100 | 100 | 0.969 | 0.939 | 0.571 | +| `05_BN1-147-Gb_solo` | 27 | 32 | 32 | 0.847 | 0.847 | 0.203 | +| `05_BN2-131-B_comp` | 202 | 203 | 203 | 0.988 | 0.973 | 0.810 | +| `05_BN2-131-B_solo` | 69 | 76 | 76 | 0.938 | 0.938 | 0.593 | +| `05_BN2-166-Ab_comp` | 173 | 162 | 162 | 0.955 | 0.931 | 0.860 | +| `05_BN2-166-Ab_solo` | 67 | 69 | 69 | 0.941 | 0.926 | 0.691 | +| `05_BN3-119-G_comp` | 180 | 178 | 178 | 0.978 | 0.978 | 0.721 | +| `05_BN3-119-G_solo` | 62 | 64 | 64 | 0.873 | 0.873 | 0.619 | +| `05_BN3-154-E_comp` | 185 | 180 | 180 | 0.959 | 0.926 | 0.696 | +| `05_BN3-154-E_solo` | 56 | 59 | 59 | 0.922 | 0.904 | 0.261 | +| `05_Funk1-114-Ab_comp` | 181 | 162 | 153 | 0.844 | 0.838 | 0.808 | +| `05_Funk1-114-Ab_solo` | 60 | 56 | 56 | 0.845 | 0.793 | 0.638 | +| `05_Funk1-97-C_comp` | 177 | 172 | 172 | 0.974 | 0.951 | 0.797 | +| `05_Funk1-97-C_solo` | 58 | 57 | 57 | 0.922 | 0.922 | 0.330 | +| `05_Funk2-108-Eb_comp` | 245 | 271 | 260 | 0.844 | 0.808 | 0.673 | +| `05_Funk2-108-Eb_solo` | 61 | 61 | 61 | 0.984 | 0.984 | 0.492 | +| `05_Funk2-119-G_comp` | 182 | 192 | 184 | 0.907 | 0.891 | 0.842 | +| `05_Funk2-119-G_solo` | 99 | 96 | 96 | 0.944 | 0.923 | 0.256 | +| `05_Funk3-112-C#_comp` | 212 | 206 | 206 | 0.928 | 0.914 | 0.837 | +| `05_Funk3-112-C#_solo` | 70 | 64 | 64 | 0.955 | 0.955 | 0.687 | +| `05_Funk3-98-A_comp` | 184 | 182 | 162 | 0.902 | 0.896 | 0.815 | +| `05_Funk3-98-A_solo` | 65 | 65 | 65 | 0.969 | 0.969 | 0.538 | +| `05_Jazz1-130-D_comp` | 141 | 118 | 118 
| 0.880 | 0.849 | 0.687 | +| `05_Jazz1-130-D_solo` | 58 | 61 | 61 | 0.924 | 0.891 | 0.555 | +| `05_Jazz1-200-B_comp` | 175 | 168 | 163 | 0.852 | 0.781 | 0.491 | +| `05_Jazz1-200-B_solo` | 50 | 49 | 49 | 0.970 | 0.970 | 0.465 | +| `05_Jazz2-110-Bb_comp` | 252 | 252 | 242 | 0.911 | 0.891 | 0.781 | +| `05_Jazz2-110-Bb_solo` | 103 | 97 | 97 | 0.940 | 0.940 | 0.700 | +| `05_Jazz2-187-F#_comp` | 247 | 226 | 219 | 0.863 | 0.837 | 0.670 | +| `05_Jazz2-187-F#_solo` | 61 | 60 | 60 | 0.975 | 0.975 | 0.281 | +| `05_Jazz3-137-Eb_comp` | 235 | 231 | 227 | 0.857 | 0.801 | 0.684 | +| `05_Jazz3-137-Eb_solo` | 77 | 76 | 76 | 0.967 | 0.967 | 0.497 | +| `05_Jazz3-150-C_comp` | 227 | 226 | 219 | 0.910 | 0.874 | 0.291 | +| `05_Jazz3-150-C_solo` | 69 | 67 | 67 | 0.956 | 0.956 | 0.765 | +| `05_Rock1-130-A_comp` | 312 | 349 | 349 | 0.884 | 0.877 | 0.166 | +| `05_Rock1-130-A_solo` | 60 | 62 | 62 | 0.951 | 0.951 | 0.607 | +| `05_Rock1-90-C#_comp` | 277 | 276 | 276 | 0.937 | 0.872 | 0.868 | +| `05_Rock1-90-C#_solo` | 99 | 106 | 106 | 0.927 | 0.907 | 0.810 | +| `05_Rock2-142-D_comp` | 460 | 450 | 450 | 0.890 | 0.842 | 0.336 | +| `05_Rock2-142-D_solo` | 86 | 86 | 86 | 0.907 | 0.907 | 0.291 | +| `05_Rock2-85-F_comp` | 325 | 296 | 296 | 0.857 | 0.821 | 0.615 | +| `05_Rock2-85-F_solo` | 125 | 124 | 124 | 0.956 | 0.940 | 0.699 | +| `05_Rock3-117-Bb_comp` | 398 | 401 | 401 | 0.879 | 0.869 | 0.403 | +| `05_Rock3-117-Bb_solo` | 70 | 65 | 65 | 0.963 | 0.963 | 0.519 | +| `05_Rock3-148-C_comp` | 249 | 286 | 286 | 0.927 | 0.923 | 0.819 | +| `05_Rock3-148-C_solo` | 61 | 60 | 60 | 0.992 | 0.992 | 0.496 | +| `05_SS1-100-C#_comp` | 102 | 95 | 95 | 0.964 | 0.964 | 0.396 | +| `05_SS1-100-C#_solo` | 60 | 64 | 64 | 0.887 | 0.871 | 0.774 | +| `05_SS1-68-E_comp` | 143 | 148 | 148 | 0.983 | 0.976 | 0.784 | +| `05_SS1-68-E_solo` | 78 | 74 | 74 | 0.895 | 0.882 | 0.329 | +| `05_SS2-107-Ab_comp` | 222 | 227 | 227 | 0.971 | 0.953 | 0.726 | +| `05_SS2-107-Ab_solo` | 87 | 85 | 85 | 0.942 | 0.942 | 0.547 | +| 
`05_SS2-88-F_comp` | 196 | 190 | 190 | 0.964 | 0.943 | 0.622 | +| `05_SS2-88-F_solo` | 93 | 97 | 97 | 0.979 | 0.968 | 0.589 | +| `05_SS3-84-Bb_comp` | 211 | 215 | 215 | 0.986 | 0.986 | 0.671 | +| `05_SS3-84-Bb_solo` | 111 | 119 | 119 | 0.948 | 0.948 | 0.330 | +| `05_SS3-98-C_comp` | 199 | 187 | 187 | 0.938 | 0.933 | 0.736 | +| `05_SS3-98-C_solo` | 93 | 94 | 94 | 0.973 | 0.973 | 0.289 | diff --git a/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-none-2026-05-07.csv b/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-none-2026-05-07.csv new file mode 100644 index 0000000..a6257fd --- /dev/null +++ b/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-none-2026-05-07.csv @@ -0,0 +1,61 @@ +track_id,backend,gold_notes,audio_events,decoded_events,onset_f1,pitch_f1,tab_f1,tab_tp,tab_fp,tab_fn +05_BN1-129-Eb_comp,highres,148,150,150,0.979866,0.973154,0.221477,33,117,115 +05_BN1-129-Eb_solo,highres,44,46,46,0.933333,0.933333,0.111111,5,41,39 +05_BN1-147-Gb_comp,highres,96,100,100,0.969388,0.938776,0.612245,60,40,36 +05_BN1-147-Gb_solo,highres,27,32,32,0.847458,0.847458,0.033898,1,31,26 +05_BN2-131-B_comp,highres,202,203,203,0.987654,0.972840,0.340741,69,134,133 +05_BN2-131-B_solo,highres,69,76,76,0.937931,0.937931,0.124138,9,67,60 +05_BN2-166-Ab_comp,highres,173,162,162,0.955224,0.931343,0.507463,85,77,88 +05_BN2-166-Ab_solo,highres,67,69,69,0.941176,0.926471,0.323529,22,47,45 +05_BN3-119-G_comp,highres,180,178,178,0.977654,0.977654,0.312849,56,122,124 +05_BN3-119-G_solo,highres,62,64,64,0.873016,0.873016,0.047619,3,61,59 +05_BN3-154-E_comp,highres,185,180,180,0.958904,0.926027,0.093151,17,163,168 +05_BN3-154-E_solo,highres,56,59,59,0.921739,0.904348,0.034783,2,57,54 +05_Funk1-114-Ab_comp,highres,181,162,153,0.844311,0.838323,0.784431,131,22,50 +05_Funk1-114-Ab_solo,highres,60,56,56,0.844828,0.793103,0.206897,12,44,48 +05_Funk1-97-C_comp,highres,177,172,172,0.974212,0.951289,0.229226,40,132,137 
+05_Funk1-97-C_solo,highres,58,57,57,0.921739,0.921739,0.260870,15,42,43 +05_Funk2-108-Eb_comp,highres,245,271,260,0.843564,0.807921,0.467327,118,142,127 +05_Funk2-108-Eb_solo,highres,61,61,61,0.983607,0.983607,0.459016,28,33,33 +05_Funk2-119-G_comp,highres,182,192,184,0.907104,0.890710,0.819672,150,34,32 +05_Funk2-119-G_solo,highres,99,96,96,0.943590,0.923077,0.225641,22,74,77 +05_Funk3-112-C#_comp,highres,212,206,206,0.928230,0.913876,0.803828,168,38,44 +05_Funk3-112-C#_solo,highres,70,64,64,0.955224,0.955224,0.313433,21,43,49 +05_Funk3-98-A_comp,highres,184,182,162,0.901734,0.895954,0.485549,84,78,100 +05_Funk3-98-A_solo,highres,65,65,65,0.969231,0.969231,0.107692,7,58,58 +05_Jazz1-130-D_comp,highres,141,118,118,0.880309,0.849421,0.416988,54,64,87 +05_Jazz1-130-D_solo,highres,58,61,61,0.924370,0.890756,0.235294,14,47,44 +05_Jazz1-200-B_comp,highres,175,168,163,0.852071,0.781065,0.165680,28,135,147 +05_Jazz1-200-B_solo,highres,50,49,49,0.969697,0.969697,0.161616,8,41,42 +05_Jazz2-110-Bb_comp,highres,252,252,242,0.910931,0.890688,0.570850,141,101,111 +05_Jazz2-110-Bb_solo,highres,103,97,97,0.940000,0.940000,0.320000,32,65,71 +05_Jazz2-187-F#_comp,highres,247,226,219,0.862661,0.836910,0.313305,73,146,174 +05_Jazz2-187-F#_solo,highres,61,60,60,0.975207,0.975207,0.297521,18,42,43 +05_Jazz3-137-Eb_comp,highres,235,231,227,0.857143,0.800866,0.415584,96,131,139 +05_Jazz3-137-Eb_solo,highres,77,76,76,0.967320,0.967320,0.104575,8,68,69 +05_Jazz3-150-C_comp,highres,227,226,219,0.910314,0.874439,0.291480,65,154,162 +05_Jazz3-150-C_solo,highres,69,67,67,0.955882,0.955882,0.367647,25,42,44 +05_Rock1-130-A_comp,highres,312,349,349,0.883510,0.877458,0.172466,57,292,255 +05_Rock1-130-A_solo,highres,60,62,62,0.950820,0.950820,0.065574,4,58,56 +05_Rock1-90-C#_comp,highres,277,276,276,0.936709,0.871609,0.466546,129,147,148 +05_Rock1-90-C#_solo,highres,99,106,106,0.926829,0.907317,0.263415,27,79,72 +05_Rock2-142-D_comp,highres,460,450,450,0.890110,0.841758,0.204396,93,357,367 
+05_Rock2-142-D_solo,highres,86,86,86,0.906977,0.906977,0.081395,7,79,79 +05_Rock2-85-F_comp,highres,325,296,296,0.856683,0.821256,0.373591,116,180,209 +05_Rock2-85-F_solo,highres,125,124,124,0.955823,0.939759,0.345382,43,81,82 +05_Rock3-117-Bb_comp,highres,398,401,401,0.878598,0.868586,0.122653,49,352,349 +05_Rock3-117-Bb_solo,highres,70,65,65,0.962963,0.962963,0.444444,30,35,40 +05_Rock3-148-C_comp,highres,249,286,286,0.927103,0.923364,0.695327,186,100,63 +05_Rock3-148-C_solo,highres,61,60,60,0.991736,0.991736,0.132231,8,52,53 +05_SS1-100-C#_comp,highres,102,95,95,0.964467,0.964467,0.710660,70,25,32 +05_SS1-100-C#_solo,highres,60,64,64,0.887097,0.870968,0.645161,40,24,20 +05_SS1-68-E_comp,highres,143,148,148,0.982818,0.975945,0.824742,120,28,23 +05_SS1-68-E_solo,highres,78,74,74,0.894737,0.881579,0.092105,7,67,71 +05_SS2-107-Ab_comp,highres,222,227,227,0.971047,0.953229,0.476615,107,120,115 +05_SS2-107-Ab_solo,highres,87,85,85,0.941860,0.941860,0.127907,11,74,76 +05_SS2-88-F_comp,highres,196,190,190,0.963731,0.943005,0.787565,152,38,44 +05_SS2-88-F_solo,highres,93,97,97,0.978947,0.968421,0.242105,23,74,70 +05_SS3-84-Bb_comp,highres,211,215,215,0.985915,0.985915,0.774648,165,50,46 +05_SS3-84-Bb_solo,highres,111,119,119,0.947826,0.947826,0.191304,22,97,89 +05_SS3-98-C_comp,highres,199,187,187,0.937824,0.932642,0.803109,155,32,44 +05_SS3-98-C_solo,highres,93,94,94,0.973262,0.973262,0.192513,18,76,75 diff --git a/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-none-2026-05-07.md b/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-none-2026-05-07.md new file mode 100644 index 0000000..a5451b1 --- /dev/null +++ b/tabvision-server/tools/outputs/guitarset_audio_eval-highres-validation-none-2026-05-07.md @@ -0,0 +1,80 @@ +# GuitarSet Audio Eval (highres) + +Split: **validation** +Position prior: **none** +Tracks: **60** +Gold notes: **8715** +Audio events: **8690** + +## Aggregate + +| Metric | Mean F1 | Micro P | Micro R | 
Micro F1 | +| --- | ---: | ---: | ---: | ---: | +| Onset | 0.930 | 0.928 | 0.916 | 0.922 | +| Pitch | 0.915 | 0.908 | 0.897 | 0.902 | +| Tab | 0.347 | 0.390 | 0.385 | 0.388 | + +## Per Track + +| Track | Gold | Audio | Decoded | Onset F1 | Pitch F1 | Tab F1 | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| `05_BN1-129-Eb_comp` | 148 | 150 | 150 | 0.980 | 0.973 | 0.221 | +| `05_BN1-129-Eb_solo` | 44 | 46 | 46 | 0.933 | 0.933 | 0.111 | +| `05_BN1-147-Gb_comp` | 96 | 100 | 100 | 0.969 | 0.939 | 0.612 | +| `05_BN1-147-Gb_solo` | 27 | 32 | 32 | 0.847 | 0.847 | 0.034 | +| `05_BN2-131-B_comp` | 202 | 203 | 203 | 0.988 | 0.973 | 0.341 | +| `05_BN2-131-B_solo` | 69 | 76 | 76 | 0.938 | 0.938 | 0.124 | +| `05_BN2-166-Ab_comp` | 173 | 162 | 162 | 0.955 | 0.931 | 0.507 | +| `05_BN2-166-Ab_solo` | 67 | 69 | 69 | 0.941 | 0.926 | 0.324 | +| `05_BN3-119-G_comp` | 180 | 178 | 178 | 0.978 | 0.978 | 0.313 | +| `05_BN3-119-G_solo` | 62 | 64 | 64 | 0.873 | 0.873 | 0.048 | +| `05_BN3-154-E_comp` | 185 | 180 | 180 | 0.959 | 0.926 | 0.093 | +| `05_BN3-154-E_solo` | 56 | 59 | 59 | 0.922 | 0.904 | 0.035 | +| `05_Funk1-114-Ab_comp` | 181 | 162 | 153 | 0.844 | 0.838 | 0.784 | +| `05_Funk1-114-Ab_solo` | 60 | 56 | 56 | 0.845 | 0.793 | 0.207 | +| `05_Funk1-97-C_comp` | 177 | 172 | 172 | 0.974 | 0.951 | 0.229 | +| `05_Funk1-97-C_solo` | 58 | 57 | 57 | 0.922 | 0.922 | 0.261 | +| `05_Funk2-108-Eb_comp` | 245 | 271 | 260 | 0.844 | 0.808 | 0.467 | +| `05_Funk2-108-Eb_solo` | 61 | 61 | 61 | 0.984 | 0.984 | 0.459 | +| `05_Funk2-119-G_comp` | 182 | 192 | 184 | 0.907 | 0.891 | 0.820 | +| `05_Funk2-119-G_solo` | 99 | 96 | 96 | 0.944 | 0.923 | 0.226 | +| `05_Funk3-112-C#_comp` | 212 | 206 | 206 | 0.928 | 0.914 | 0.804 | +| `05_Funk3-112-C#_solo` | 70 | 64 | 64 | 0.955 | 0.955 | 0.313 | +| `05_Funk3-98-A_comp` | 184 | 182 | 162 | 0.902 | 0.896 | 0.486 | +| `05_Funk3-98-A_solo` | 65 | 65 | 65 | 0.969 | 0.969 | 0.108 | +| `05_Jazz1-130-D_comp` | 141 | 118 | 118 | 0.880 | 0.849 | 0.417 | +| 
`05_Jazz1-130-D_solo` | 58 | 61 | 61 | 0.924 | 0.891 | 0.235 | +| `05_Jazz1-200-B_comp` | 175 | 168 | 163 | 0.852 | 0.781 | 0.166 | +| `05_Jazz1-200-B_solo` | 50 | 49 | 49 | 0.970 | 0.970 | 0.162 | +| `05_Jazz2-110-Bb_comp` | 252 | 252 | 242 | 0.911 | 0.891 | 0.571 | +| `05_Jazz2-110-Bb_solo` | 103 | 97 | 97 | 0.940 | 0.940 | 0.320 | +| `05_Jazz2-187-F#_comp` | 247 | 226 | 219 | 0.863 | 0.837 | 0.313 | +| `05_Jazz2-187-F#_solo` | 61 | 60 | 60 | 0.975 | 0.975 | 0.298 | +| `05_Jazz3-137-Eb_comp` | 235 | 231 | 227 | 0.857 | 0.801 | 0.416 | +| `05_Jazz3-137-Eb_solo` | 77 | 76 | 76 | 0.967 | 0.967 | 0.105 | +| `05_Jazz3-150-C_comp` | 227 | 226 | 219 | 0.910 | 0.874 | 0.291 | +| `05_Jazz3-150-C_solo` | 69 | 67 | 67 | 0.956 | 0.956 | 0.368 | +| `05_Rock1-130-A_comp` | 312 | 349 | 349 | 0.884 | 0.877 | 0.172 | +| `05_Rock1-130-A_solo` | 60 | 62 | 62 | 0.951 | 0.951 | 0.066 | +| `05_Rock1-90-C#_comp` | 277 | 276 | 276 | 0.937 | 0.872 | 0.467 | +| `05_Rock1-90-C#_solo` | 99 | 106 | 106 | 0.927 | 0.907 | 0.263 | +| `05_Rock2-142-D_comp` | 460 | 450 | 450 | 0.890 | 0.842 | 0.204 | +| `05_Rock2-142-D_solo` | 86 | 86 | 86 | 0.907 | 0.907 | 0.081 | +| `05_Rock2-85-F_comp` | 325 | 296 | 296 | 0.857 | 0.821 | 0.374 | +| `05_Rock2-85-F_solo` | 125 | 124 | 124 | 0.956 | 0.940 | 0.345 | +| `05_Rock3-117-Bb_comp` | 398 | 401 | 401 | 0.879 | 0.869 | 0.123 | +| `05_Rock3-117-Bb_solo` | 70 | 65 | 65 | 0.963 | 0.963 | 0.444 | +| `05_Rock3-148-C_comp` | 249 | 286 | 286 | 0.927 | 0.923 | 0.695 | +| `05_Rock3-148-C_solo` | 61 | 60 | 60 | 0.992 | 0.992 | 0.132 | +| `05_SS1-100-C#_comp` | 102 | 95 | 95 | 0.964 | 0.964 | 0.711 | +| `05_SS1-100-C#_solo` | 60 | 64 | 64 | 0.887 | 0.871 | 0.645 | +| `05_SS1-68-E_comp` | 143 | 148 | 148 | 0.983 | 0.976 | 0.825 | +| `05_SS1-68-E_solo` | 78 | 74 | 74 | 0.895 | 0.882 | 0.092 | +| `05_SS2-107-Ab_comp` | 222 | 227 | 227 | 0.971 | 0.953 | 0.477 | +| `05_SS2-107-Ab_solo` | 87 | 85 | 85 | 0.942 | 0.942 | 0.128 | +| `05_SS2-88-F_comp` | 196 | 190 | 190 | 
0.964 | 0.943 | 0.788 | +| `05_SS2-88-F_solo` | 93 | 97 | 97 | 0.979 | 0.968 | 0.242 | +| `05_SS3-84-Bb_comp` | 211 | 215 | 215 | 0.986 | 0.986 | 0.775 | +| `05_SS3-84-Bb_solo` | 111 | 119 | 119 | 0.948 | 0.948 | 0.191 | +| `05_SS3-98-C_comp` | 199 | 187 | 187 | 0.938 | 0.933 | 0.803 | +| `05_SS3-98-C_solo` | 93 | 94 | 94 | 0.973 | 0.973 | 0.193 | diff --git a/tabvision/scripts/eval/guitarset_audio_eval.py b/tabvision/scripts/eval/guitarset_audio_eval.py new file mode 100644 index 0000000..6dd1a18 --- /dev/null +++ b/tabvision/scripts/eval/guitarset_audio_eval.py @@ -0,0 +1,6 @@ +"""CLI wrapper for the v1 GuitarSet audio-only evaluator.""" + +from tabvision.eval.guitarset_audio import main + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tabvision/scripts/eval/guitarset_highres_modal.py b/tabvision/scripts/eval/guitarset_highres_modal.py new file mode 100644 index 0000000..69bccf7 --- /dev/null +++ b/tabvision/scripts/eval/guitarset_highres_modal.py @@ -0,0 +1,218 @@ +"""Modal GPU runner for the v1 GuitarSet highres audio eval. + +Runs the same evaluator as ``scripts/eval/guitarset_audio_eval.py`` on a +Modal L4 GPU, with raw GuitarSet hydrated into a persistent Modal Volume +from the ``taohu/guitarset`` Hugging Face mirror. + +Usage from repo root: + + tabvision-server/venv/bin/modal run tabvision/scripts/eval/guitarset_highres_modal.py + tabvision-server/venv/bin/modal run tabvision/scripts/eval/guitarset_highres_modal.py --limit 3 +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import modal + + +def _local_repo_root() -> Path: + """Find the repo root when this file is imported by the local Modal CLI. + + Modal re-imports the script inside the remote container from ``/root``; + in that context the local source tree is not present, so fall back to + cwd instead of indexing fixed parents. 
+ """ + script = Path(__file__).resolve() + for candidate in (Path.cwd().resolve(), *script.parents): + if (candidate / "tabvision" / "tabvision").is_dir(): + return candidate + return Path.cwd().resolve() + + +REPO_ROOT = _local_repo_root() +V1_ROOT = REPO_ROOT / "tabvision" +PACKAGE_LOCAL = V1_ROOT / "tabvision" +OUTPUT_LOCAL = REPO_ROOT / "tabvision-server" / "tools" / "outputs" + +APP_NAME = "tabvision-guitarset-highres-eval" +VOLUME_NAME = "tabvision-guitarset" +HF_REPO = "taohu/guitarset" +HF_REPO_TYPE = "dataset" +SHARD_FILES = [f"data/train-{i:05d}-of-00005.parquet" for i in range(5)] + +REMOTE_CODE = "/code" +REMOTE_VOLUME = "/data" +REMOTE_GUITARSET = f"{REMOTE_VOLUME}/guitarset" +REMOTE_OUTPUT = "/output" +SENTINEL = f"{REMOTE_GUITARSET}/.complete" + +volume = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True) + +image = ( + modal.Image.from_registry("pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime", add_python=None) + .apt_install("ffmpeg", "git", "libsndfile1") + .pip_install( + "git+https://github.com/xavriley/hf_midi_transcription.git", + "huggingface-hub>=0.16.0", + "librosa>=0.10.0", + "mir_eval>=0.7", + "numpy<2", + "pretty_midi>=0.2.10", + "pyarrow>=15.0", + "safetensors>=0.3.0", + "scipy>=1.10.0", + "soundfile>=0.12.0", + ) + .add_local_dir(str(PACKAGE_LOCAL), f"{REMOTE_CODE}/tabvision") +) + +app = modal.App(APP_NAME, image=image) + + +@app.function(gpu="L4", timeout=60 * 120, volumes={REMOTE_VOLUME: volume}) +def run_highres_eval( + limit: int | None = None, + force_data_refresh: bool = False, + position_prior: str = "guitarset-train", +) -> dict: + """Run validation eval on a GPU and return report artifacts.""" + import logging + import os + import shutil + import time + from pathlib import Path as RemotePath + + import torch + + logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s") + log = logging.getLogger("guitarset-highres-eval") + + log.info( + "torch=%s cuda=%s gpus=%d", + torch.__version__, + 
def _ensure_guitarset_data(*, force: bool, log) -> None:
    """Hydrate raw GuitarSet JAMS/WAV files into the mounted Modal volume.

    Args:
        force: Re-download and rewrite every track even when the sentinel
            file reports that the volume is already hydrated.
        log: Logger-like object; only ``log.info`` is called.

    The sentinel file at ``SENTINEL`` marks a complete hydration. It is
    written only after every shard has been extracted, and it is removed
    before a (re-)hydration starts so that an interrupted run cannot leave
    a stale "complete" marker next to partially written data.
    """
    from pathlib import Path as RemotePath

    root = RemotePath(REMOTE_GUITARSET)
    sentinel = RemotePath(SENTINEL)
    if sentinel.exists() and not force:
        log.info("GuitarSet volume already hydrated at %s", root)
        return

    # Download-only dependencies: imported after the fast path so an
    # already-hydrated volume never pays for (or fails on) these imports.
    import pyarrow.parquet as pq
    from huggingface_hub import hf_hub_download

    # Invalidate the previous marker before touching any data file.
    sentinel.unlink(missing_ok=True)

    annotation_dir = root / "annotation"
    audio_dir = root / "audio_mono-mic"
    annotation_dir.mkdir(parents=True, exist_ok=True)
    audio_dir.mkdir(parents=True, exist_ok=True)

    n_tracks = 0
    for shard in SHARD_FILES:
        log.info("downloading %s from %s", shard, HF_REPO)
        local = hf_hub_download(repo_id=HF_REPO, filename=shard, repo_type=HF_REPO_TYPE)
        table = pq.read_table(local, columns=["track_id", "jams", "audio_mic"])
        log.info(" shard rows: %d", len(table))
        for row in table.to_pylist():
            track_id = row["track_id"]
            (annotation_dir / f"{track_id}.jams").write_text(row["jams"], encoding="utf-8")
            (audio_dir / f"{track_id}_mic.wav").write_bytes(row["audio_mic"]["bytes"])
            n_tracks += 1

    # Marker goes last: its presence means every shard above was extracted.
    sentinel.write_text(f"tracks={n_tracks}\n", encoding="utf-8")
    volume.commit()
    log.info(
        "hydrated %d tracks into %s (annotation=%d audio=%d)",
        n_tracks,
        root,
        len(list(annotation_dir.glob("*.jams"))),
        len(list(audio_dir.glob("*_mic.wav"))),
    )
def main(argv: list[str] | None = None) -> int:
    """Run the fusion-consumption diagnostic from the command line.

    Args:
        argv: Argument list to parse; ``None`` lets ``argparse`` read the
            process arguments.

    Returns:
        ``0`` once the formatted report has been printed.
    """
    cli = argparse.ArgumentParser(
        description="Inspect whether Phase 5 fusion consumes video evidence."
    )
    cli.add_argument("--clip-id", default="training-01")
    cli.add_argument("--video", type=Path, default=None)
    cli.add_argument(
        "--lambdas",
        type=float,
        nargs="+",
        default=list(DEFAULT_LAMBDAS),
        help="lambda_vision values to decode; 0.0 is audio-only.",
    )
    cli.add_argument("--sample-events", type=int, default=5)
    opts = cli.parse_args(argv)

    # An explicit --video skips the benchmark index lookup entirely, so no
    # benchmark entry is handed to the diagnostic in that case.
    if opts.video is None:
        benchmark = _benchmark_for_clip(opts.clip_id)
        video_path = REPO_ROOT / benchmark["video_path"]
    else:
        benchmark = None
        video_path = opts.video

    report = diagnose_fusion_consumption(
        video_path,
        benchmark=benchmark,
        lambdas=opts.lambdas,
        sample_events=opts.sample_events,
    )
    print(_format_report(opts.clip_id, video_path, report))
    return 0
_run_video_stack, + ) + from tabvision.types import GuitarConfig, SessionConfig + + cfg = GuitarConfig() + session = SessionConfig() + + audio_demuxed = demux(video) + audio_backend = make_audio_backend("highres") + audio_events = list( + audio_backend.transcribe(audio_demuxed.wav, audio_demuxed.sample_rate, session) + ) + + video_demuxed = demux(video) + hand_backend = _make_hand_backend() + try: + video_result = _run_video_stack( + video_demuxed.frame_iterator, + stride=3, + cfg=cfg, + guitar_backend=_make_guitar_backend(), + fretboard_backend=_make_fretboard_backend(), + hand_backend=hand_backend, + ) + finally: + close = getattr(hand_backend, "close", None) + if close is not None: + close() + + fingerings = video_result.fingerings + anchors = video_result.neck_anchors + enriched_events = apply_neck_anchor_priors(audio_events, anchors, cfg) + gold_events, aligned_gold, gold_offset, gold_matches = _aligned_gold_events( + benchmark=benchmark, + audio_events=audio_events, + video_duration_s=audio_demuxed.duration_s, + ) + + prior_events = [ev for ev in enriched_events if ev.fret_prior is not None] + nearby_fingerings = [ + playability.find_fingering_at(ev.onset_s, fingerings) for ev in audio_events + ] + nearby_count = sum(f is not None for f in nearby_fingerings) + both_count = sum( + ev.fret_prior is not None and f is not None + for ev, f in zip(enriched_events, nearby_fingerings, strict=True) + ) + + audio_only = list(fuse(audio_events, [], cfg, session, lambda_vision=0.0)) + decoded = {0.0: audio_only} + for lambda_vision in lambdas: + if lambda_vision == 0.0: + continue + decoded[lambda_vision] = list( + fuse( + enriched_events, + fingerings, + cfg, + session, + lambda_vision=lambda_vision, + ) + ) + + decode_rows = [] + for lambda_vision in sorted(decoded): + tabs = decoded[lambda_vision] + decode_rows.append( + { + "lambda": lambda_vision, + "tab_events": len(tabs), + "different_from_audio_only": _position_differences(audio_only, tabs), + "unique_positions": 
def _benchmark_for_clip(clip_id: str) -> dict:
    """Return the benchmark index entry whose ``id`` equals *clip_id*.

    Raises:
        FileNotFoundError: The benchmark index, or the video the matching
            entry points at, does not exist.
        KeyError: No entry in the index has the requested id.
    """
    if not BENCHMARK_INDEX.exists():
        raise FileNotFoundError(f"benchmark index not found: {BENCHMARK_INDEX}")
    # Decode explicitly as UTF-8 instead of the platform default encoding.
    index = json.loads(BENCHMARK_INDEX.read_text(encoding="utf-8"))
    for bench in index.get("benchmarks", []):
        if bench.get("id") == clip_id:
            video = REPO_ROOT / bench["video_path"]
            if not video.exists():
                raise FileNotFoundError(f"benchmark video not found: {video}")
            return bench
    raise KeyError(f"benchmark clip id not found: {clip_id}")


def _position_differences(base: Sequence, other: Sequence) -> int:
    """Count how many decoded events differ between two decodes.

    Events are compared pairwise, in order, on ``(string_idx, fret)``.
    Every event one sequence has beyond the length of the other also
    counts as one difference.
    """
    mismatched = sum(
        (a.string_idx, a.fret) != (b.string_idx, b.fret)
        for a, b in zip(base, other)  # stops at the shorter sequence
    )
    return mismatched + abs(len(base) - len(other))


def _prior_stats(events: Sequence, cfg) -> dict:
    """Summarise the fret-prior value at every candidate position.

    Events without a ``fret_prior`` are ignored. For the others, the prior
    is read at each position returned by ``candidate_positions`` for the
    event's pitch, and the collected values are reduced with ``_stats``.
    """
    values: list[float] = []
    for ev in events:
        if ev.fret_prior is None:
            continue
        arr = np.asarray(ev.fret_prior, dtype=np.float64)
        values.extend(
            float(arr[c.string_idx, c.fret])
            for c in candidate_positions(ev.pitch_midi, cfg)
        )
    return _stats(values)
def _posterior_gold_alignment_stats(
    events: Sequence,
    fingerings: Sequence,
    aligned_gold: Sequence,
    cfg,
    *,
    max_dt_s: float = 0.15,
) -> dict:
    """Summarise how highly the video posterior ranks matched gold positions.

    An event contributes only when its nearest gold note has the same
    pitch, lies within *max_dt_s* seconds, a fingering is found at the
    event onset, the pitch has at least one candidate position, and the
    gold ``(string_idx, fret)`` is one of those candidates.

    Returns:
        A dict with the number of matched events, rank summaries for all /
        open-string / fretted gold notes, and the first eight matches as
        examples.
    """
    from tabvision.fusion import playability

    matched = []
    for ev in events:
        gold = _nearest_gold_event(ev, aligned_gold)
        if gold is None or gold.pitch_midi != ev.pitch_midi:
            continue
        if abs(ev.onset_s - gold.onset_s) > max_dt_s:
            continue
        fingering = playability.find_fingering_at(ev.onset_s, fingerings)
        if fingering is None:
            continue
        marginal = fingering.marginal_string_fret()
        candidates = candidate_positions(ev.pitch_midi, cfg)
        if not candidates:
            continue

        # Same-pitch candidates ordered from most to least probable.
        candidate_probs = sorted(
            ((c, float(marginal[c.string_idx, c.fret])) for c in candidates),
            key=lambda item: item[1],
            reverse=True,
        )
        same_pitch_rank = next(
            (
                rank
                for rank, (c, _prob) in enumerate(candidate_probs, start=1)
                if c.string_idx == gold.string_idx and c.fret == gold.fret
            ),
            None,
        )
        if same_pitch_rank is None:
            # The gold cell is not a candidate position for this pitch under
            # ``cfg``; skip it rather than let StopIteration escape.
            continue

        top_same_pitch, top_same_pitch_prob = candidate_probs[0]
        matched.append(
            {
                "dt_s": float(ev.onset_s - gold.onset_s),
                "gold_string_idx": int(gold.string_idx),
                "gold_fret": int(gold.fret),
                "gold_prob": float(marginal[gold.string_idx, gold.fret]),
                "global_rank": _rank_cell(marginal, gold.string_idx, gold.fret),
                "same_pitch_rank": same_pitch_rank,
                "top_global": _top_posterior_cells(marginal, n=1)[0],
                "top_same_pitch": {
                    "string_idx": int(top_same_pitch.string_idx),
                    "fret": int(top_same_pitch.fret),
                    "prob": float(top_same_pitch_prob),
                },
                "is_open": gold.fret == 0,
            }
        )

    return {
        "matched_events": len(matched),
        "all": _posterior_rank_summary(matched),
        "open": _posterior_rank_summary([m for m in matched if m["is_open"]]),
        "fretted": _posterior_rank_summary([m for m in matched if not m["is_open"]]),
        "examples": matched[:8],
    }


def _rank_cell(marginal: np.ndarray, string_idx: int, fret: int) -> int:
    """Return the 1-based rank of one cell among all cells of *marginal*.

    The rank is one plus the number of cells holding a strictly larger
    value, so cells with equal values share the same rank.
    """
    target = float(marginal[string_idx, fret])
    return int(np.sum(marginal > target) + 1)
def _aligned_gold_events(
    *,
    benchmark: dict | None,
    audio_events: Sequence,
    video_duration_s: float,
) -> tuple[list, list, float, int]:
    """Load the benchmark's gold tab and align it to the audio events.

    Returns:
        ``(gold, aligned_gold, offset_s, matches)``. Without a benchmark
        entry this is ``([], [], 0.0, 0)``.
    """
    if benchmark is None:
        return [], [], 0.0, 0

    from tests.eval.test_phase5_eval import (
        _align_gold_to_audio_only,
        _load_gold_tab_events,
    )

    gold = _load_gold_tab_events(
        REPO_ROOT / benchmark["ground_truth_path"],
        bpm=benchmark.get("bpm"),
        video_duration_s=video_duration_s,
    )
    # Align exactly once. With no audio events this list is empty, which is
    # the same call the empty-input case made before.
    audio_like = [
        _PitchTimeEvent(onset_s=ev.onset_s, pitch_midi=ev.pitch_midi)
        for ev in audio_events
    ]
    aligned, offset_s, matches = _align_gold_to_audio_only(
        audio_only=audio_like,
        gold=gold,
        video_duration_s=video_duration_s,
    )
    return gold, aligned, offset_s, matches


class _PitchTimeEvent:
    """Minimal onset/pitch record handed to the gold-alignment helper."""

    def __init__(self, *, onset_s: float, pitch_midi: int) -> None:
        self.onset_s = onset_s
        self.pitch_midi = pitch_midi


def _nearest_gold_event(event, aligned_gold: Sequence):
    """Return the gold event closest in onset to *event*, or ``None``.

    Gold events with the same pitch are preferred; only when none share
    the pitch is the whole gold list searched.
    """
    if not aligned_gold:
        return None
    pitch_matches = [g for g in aligned_gold if g.pitch_midi == event.pitch_midi]
    pool = pitch_matches or aligned_gold
    return min(pool, key=lambda g: abs(g.onset_s - event.onset_s))


def _gold_summary(event, gold) -> dict | None:
    """Describe *gold* relative to *event* as plain numbers for the report.

    ``dt_s`` is the event onset minus the gold onset. Returns ``None``
    when there is no gold event.
    """
    if gold is None:
        return None
    return {
        "dt_s": float(event.onset_s - gold.onset_s),
        "string_idx": int(gold.string_idx),
        "fret": int(gold.fret),
        "pitch_midi": int(gold.pitch_midi),
    }
def _candidate_terms(event, candidate: Candidate, marginal: np.ndarray) -> dict:
    """Break one candidate position's cost into its individual terms.

    The prior and vision probabilities are read at the candidate's
    ``(string_idx, fret)`` cell and converted to negative-log costs (the
    prior floored at ``EPS``, the vision probability at
    ``playability.VISION_FLOOR``). The low-fret/open-string term is
    ``LOW_FRET_BIAS * fret``, reduced by ``OPEN_STRING_BONUS`` for fret 0.
    """
    from tabvision.fusion import playability

    cell = (candidate.string_idx, candidate.fret)
    prior = float(np.asarray(event.fret_prior)[cell])
    vision_prob = float(marginal[cell])

    fret_bias = playability.LOW_FRET_BIAS * candidate.fret
    low_fret_open = (
        fret_bias - playability.OPEN_STRING_BONUS if candidate.fret == 0 else fret_bias
    )

    prior_cost = -math.log(max(prior, EPS))
    vision_cost = -math.log(max(vision_prob, playability.VISION_FLOOR))

    terms = {"string_idx": candidate.string_idx, "fret": candidate.fret}
    terms["prior"] = prior
    terms["vision_prob"] = vision_prob
    terms["prior_cost"] = prior_cost
    terms["vision_cost"] = vision_cost
    terms["low_fret_open_cost"] = low_fret_open
    terms["total_lambda1"] = prior_cost + vision_cost + low_fret_open
    return terms


def _best(rows: Sequence[dict], key: str) -> dict:
    """Return a compact view of the row with the smallest ``row[key]``.

    The chosen value is reported under ``"cost"``; when several rows tie,
    the first one wins.
    """
    winner = min(rows, key=lambda candidate_row: candidate_row[key])
    summary = {name: winner[name] for name in ("string_idx", "fret")}
    summary["cost"] = winner[key]
    summary.update(prior=winner["prior"], vision_prob=winner["vision_prob"])
    return summary


def _stats(values: Sequence[float]) -> dict:
    """Return min/mean/max of *values*, or all ``None`` for an empty input."""
    if values:
        return {"min": min(values), "mean": fmean(values), "max": max(values)}
    return {"min": None, "mean": None, "max": None}
gold_alignment_matches=" + f"{report['gold_alignment_matches']}" + ), + _stat_line("candidate_prior_probability", report["prior_stats"]), + "posterior_vs_aligned_gold:", + _posterior_summary_line("all", report["posterior_gold_alignment"]["all"]), + _posterior_summary_line("open", report["posterior_gold_alignment"]["open"]), + _posterior_summary_line( + "fretted", + report["posterior_gold_alignment"]["fretted"], + ), + "decode_by_lambda:", + ] + for row in report["decode_rows"]: + lines.append( + " " + f"lambda={row['lambda']:.2f} tab_events={row['tab_events']} " + f"diffs_vs_audio_only={row['different_from_audio_only']} " + f"unique_positions={row['unique_positions']}" + ) + lines.append("sample_emission_terms:") + for sample in report["samples"]: + lines.append( + " " + f"t={sample['onset_s']:.3f} pitch={sample['pitch_midi']} " + f"candidates={sample['candidate_count']}" + ) + if sample["nearest_gold"] is not None: + gold = sample["nearest_gold"] + lines.append( + " " + f"nearest_gold: dt={gold['dt_s']:+.3f}s " + f"s={gold['string_idx']} f={gold['fret']} " + f"pitch={gold['pitch_midi']}" + ) + lines.append(" top_posterior:") + for cell in sample["top_posterior_cells"]: + lines.append( + " " + f"s={cell['string_idx']} f={cell['fret']} " + f"p={cell['prob']:.6f}" + ) + lines.append(" same_pitch_candidates:") + for c in sample["same_pitch_candidates"]: + lines.append( + " " + f"s={c['string_idx']} f={c['fret']} " + f"vision={c['vision_prob']:.6f} prior={c['prior']:.6f}" + ) + for label in ("best_prior", "best_vision", "best_total_lambda1"): + best = sample[label] + lines.append( + " " + f"{label}: s={best['string_idx']} f={best['fret']} " + f"cost={best['cost']:.3f} prior={best['prior']:.6f} " + f"vision={best['vision_prob']:.6f}" + ) + lines.append(" " + _stat_line("prior_cost", sample["prior_cost"])) + lines.append(" " + _stat_line("vision_cost", sample["vision_cost"])) + lines.append( + " " + + _stat_line("low_fret_open_cost", sample["low_fret_open_cost"]) + ) + 
def _posterior_summary_line(label: str, stats: dict) -> str:
    """Format one posterior-rank summary as a single report line.

    An empty summary (``count == 0``) yields only the count; otherwise
    the mean gold probability, mean ranks and top-1/top-5 hit counts are
    listed after it.
    """
    if stats["count"] != 0:
        return (
            f" {label}: count={stats['count']} "
            f"gold_prob_mean={stats['gold_prob']['mean']:.6f} "
            f"global_rank_mean={stats['global_rank']['mean']:.2f} "
            f"same_pitch_rank_mean={stats['same_pitch_rank']['mean']:.2f} "
            f"global_top1={stats['global_top1']}/{stats['count']} "
            f"global_top5={stats['global_top5']}/{stats['count']} "
            f"same_pitch_top1={stats['same_pitch_top1']}/{stats['count']}"
        )
    return f" {label}: count=0"


def _stat_line(label: str, stats: dict) -> str:
    """Format a min/mean/max dict as ``label: min=… mean=… max=…``.

    A dict whose ``"min"`` is ``None`` (no values) is rendered as
    ``label: none``.
    """
    has_values = stats["min"] is not None
    if has_values:
        return (
            f"{label}: min={stats['min']:.6f} mean={stats['mean']:.6f} "
            f"max={stats['max']:.6f}"
        )
    return f"{label}: none"
+ ) + parser.add_argument("--clip-id", default="training-01") + parser.add_argument("--video", type=Path, default=None) + parser.add_argument("--sample-frames", type=int, default=10) + args = parser.parse_args(argv) + + video = args.video if args.video is not None else _video_for_clip(args.clip_id) + report = diagnose_video_evidence(video, sample_frames=args.sample_frames) + print(_format_report(args.clip_id, video, report)) + return 0 + + +def diagnose_video_evidence(video: Path, *, sample_frames: int = 10) -> dict: + from tabvision.demux import demux + from tabvision.pipeline import ( + _make_fretboard_backend, + _make_guitar_backend, + _make_hand_backend, + _run_video_stack, + ) + from tabvision.types import GuitarConfig + + cfg = GuitarConfig() + demuxed = demux(video) + hand_backend = _make_hand_backend() + try: + result = _run_video_stack( + demuxed.frame_iterator, + stride=3, + cfg=cfg, + guitar_backend=_make_guitar_backend(), + fretboard_backend=_make_fretboard_backend(), + hand_backend=hand_backend, + ) + finally: + close = getattr(hand_backend, "close", None) + if close is not None: + close() + + fingerings = result.fingerings + anchors = result.neck_anchors + homography_conf = [float(ff.homography_confidence) for ff in fingerings] + logits = [np.asarray(ff.finger_pos_logits, dtype=np.float64) for ff in fingerings] + sums = [float(arr.sum()) for arr in logits] + stds = [float(arr.std()) for arr in logits] + maxes = [float(arr.max()) for arr in logits] + + report = { + "fingerings": len(fingerings), + "anchors": len(anchors), + "homography_conf": _stats(homography_conf), + "homography_positive": sum(c > 0.0 for c in homography_conf), + "logits_nonzero": sum(s != 0.0 for s in sums), + "logits_nonuniform": sum(s > 1e-9 for s in stds), + "logits_std": _stats(stds), + "logits_max": _stats(maxes), + "samples": [], + "anchor_center": {}, + "anchor_conf": {}, + "anchor_samples": [], + } + + for ff, arr, std in zip(fingerings[:sample_frames], logits, stds, 
def _video_for_clip(clip_id: str) -> Path:
    """Return the video path of the benchmark entry whose ``id`` is *clip_id*.

    Raises:
        FileNotFoundError: The benchmark index, or the video the matching
            entry points at, does not exist.
        KeyError: No entry in the index has the requested id.
    """
    if not BENCHMARK_INDEX.exists():
        raise FileNotFoundError(f"benchmark index not found: {BENCHMARK_INDEX}")
    # Decode explicitly as UTF-8 instead of the platform default encoding.
    index = json.loads(BENCHMARK_INDEX.read_text(encoding="utf-8"))
    for bench in index.get("benchmarks", []):
        if bench.get("id") == clip_id:
            video = REPO_ROOT / bench["video_path"]
            if not video.exists():
                raise FileNotFoundError(f"benchmark video not found: {video}")
            return video
    raise KeyError(f"benchmark clip id not found: {clip_id}")


def _stats(values: list[float]) -> dict:
    """Return min/mean/max of *values*, or all ``None`` for an empty list."""
    if not values:
        return {"min": None, "mean": None, "max": None}
    return {"min": min(values), "mean": fmean(values), "max": max(values)}
def _stat_line(label: str, stats: dict, *, suffix: str = "") -> str:
    """Format a min/mean/max dict as one report line, followed by *suffix*.

    A dict whose ``"min"`` is ``None`` (no values) is rendered as
    ``label: none`` plus the suffix.
    """
    has_values = stats["min"] is not None
    if has_values:
        return (
            f"{label}: min={stats['min']:.6f} mean={stats['mean']:.6f} "
            f"max={stats['max']:.6f}{suffix}"
        )
    return f"{label}: none{suffix}"
def parse_guitarset_jams(
    jams_path: str | Path,
    cfg: GuitarConfig | None = None,
) -> list[TabEvent]:
    """Parse GuitarSet note_midi annotations into v1 TabEvent gold notes.

    GuitarSet stores one ``note_midi`` annotation per string. Its
    ``data_source`` convention is already low-E to high-E as ``0..5``,
    matching v1 ``string_idx``. Fret is derived from MIDI minus the open
    string pitch so bent/float MIDI labels still land on the nearest fret.

    Args:
        jams_path: Path to one GuitarSet ``.jams`` file (UTF-8 JSON).
        cfg: Guitar configuration supplying string count, tuning, capo and
            maximum fret; a default ``GuitarConfig()`` is used when omitted.

    Returns:
        Gold notes sorted by ``(onset_s, string_idx, fret)``. Annotations
        with an unusable ``data_source``, malformed rows, and notes whose
        fret falls outside ``[cfg.capo, cfg.max_fret]`` are skipped.
    """
    if cfg is None:
        cfg = GuitarConfig()

    payload = json.loads(Path(jams_path).read_text(encoding="utf-8"))

    out: list[TabEvent] = []
    # ``or`` guards treat an explicit JSON null like a missing key, matching
    # the existing handling of a null ``data`` list below.
    for ann in payload.get("annotations") or []:
        if ann.get("namespace") != "note_midi":
            continue

        source = (ann.get("annotation_metadata") or {}).get("data_source")
        try:
            string_idx = int(source)
        except (TypeError, ValueError):
            continue
        if not 0 <= string_idx < cfg.n_strings:
            continue

        open_pitch = cfg.tuning_midi[string_idx]
        for row in ann.get("data") or []:
            try:
                onset_s = float(row["time"])
                duration_s = float(row["duration"])
                pitch_midi = int(round(float(row["value"])))
            except (KeyError, TypeError, ValueError, OverflowError):
                # OverflowError: ``round`` of an infinite value. Such a row
                # is as unusable as any other malformed one.
                continue

            fret = pitch_midi - open_pitch
            if fret < cfg.capo or fret > cfg.max_fret:
                continue
            out.append(
                TabEvent(
                    onset_s=onset_s,
                    duration_s=max(0.0, duration_s),  # clamp negative durations
                    string_idx=string_idx,
                    fret=fret,
                    pitch_midi=pitch_midi,
                    confidence=1.0,
                )
            )

    out.sort(key=lambda ev: (ev.onset_s, ev.string_idx, ev.fret))
    return out
def _score_event_f1(
    predicted: Sequence[TabEvent],
    gold: Sequence[TabEvent],
    *,
    match_pitch: bool,
    onset_tolerance_s: float = DEFAULT_ONSET_TOLERANCE_S,
) -> EventF1Result:
    """Greedily match predictions to gold notes by onset and score the result.

    Predictions are visited in onset order. Each one claims the closest
    still-unused gold note within *onset_tolerance_s* (and, when
    *match_pitch* is set, with the same ``pitch_midi``). Unmatched
    predictions are false positives, unclaimed gold notes false negatives.
    Precision, recall and F1 are ``0.0`` when their denominator is zero.
    """
    pred_sorted = sorted(predicted, key=lambda ev: ev.onset_s)
    gold_sorted = sorted(gold, key=lambda ev: ev.onset_s)
    gold_used = [False] * len(gold_sorted)
    tp = 0
    fp = 0

    for pred in pred_sorted:
        best_j = -1
        best_gap = onset_tolerance_s + 1e-9
        for j, ref in enumerate(gold_sorted):
            if gold_used[j]:
                continue
            if match_pitch and pred.pitch_midi != ref.pitch_midi:
                continue
            # Named ``gap`` so the module's ``dt`` (datetime) alias is not
            # shadowed inside this function.
            gap = abs(pred.onset_s - ref.onset_s)
            if gap <= onset_tolerance_s and gap < best_gap:
                best_j = j
                best_gap = gap
        if best_j >= 0:
            gold_used[best_j] = True
            tp += 1
        else:
            fp += 1

    fn = gold_used.count(False)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return EventF1Result(
        precision=precision,
        recall=recall,
        f1=f1,
        true_positives=tp,
        false_positives=fp,
        false_negatives=fn,
    )


def score_audio_only(
    audio_events: Sequence[AudioEvent],
    gold: Sequence[TabEvent],
    *,
    cfg: GuitarConfig | None = None,
    session: SessionConfig | None = None,
    onset_tolerance_s: float = DEFAULT_ONSET_TOLERANCE_S,
) -> AudioOnlyScore:
    """Decode audio events without video evidence and score them against gold.

    The events are decoded with ``fuse`` using no fingerings and
    ``lambda_vision=0.0``, then scored three ways: onset only, onset plus
    pitch, and full tab position (``tab_f1``).
    """
    if cfg is None:
        cfg = GuitarConfig()
    if session is None:
        session = SessionConfig()

    # Materialise once: the decode is consumed by three scorers and stored on
    # the result, so a lazy iterable from ``fuse`` must not be exhausted by
    # the first consumer.
    decoded = list(fuse(audio_events, [], cfg, session, lambda_vision=0.0))
    onset = _score_event_f1(
        decoded,
        gold,
        match_pitch=False,
        onset_tolerance_s=onset_tolerance_s,
    )
    pitch = _score_event_f1(
        decoded,
        gold,
        match_pitch=True,
        onset_tolerance_s=onset_tolerance_s,
    )
    tab = tab_f1(decoded, gold, onset_tolerance_s=onset_tolerance_s)
    return AudioOnlyScore(onset=onset, pitch=pitch, tab=tab, decoded=decoded)


def build_guitarset_position_prior(
    data_home: str | Path = DEFAULT_DATA_HOME,
    *,
    training_split: str = "train",
    validation_player: str = DEFAULT_VALIDATION_PLAYER,
    alpha: float = DEFAULT_POSITION_PRIOR_ALPHA,
    power: float = DEFAULT_POSITION_PRIOR_POWER,
    cfg: GuitarConfig | None = None,
) -> PitchPositionPrior:
    """Learn a pitch-position prior from raw GuitarSet tab annotations.

    Every track of *training_split* is parsed and its gold notes are passed
    to ``learn_pitch_position_prior`` together with *alpha* and *power*.

    Raises:
        RuntimeError: The split yields no notes under *data_home*.
    """
    if cfg is None:
        cfg = GuitarConfig()

    annotation_dir = Path(data_home) / "annotation"  # loop-invariant
    examples: list[TabEvent] = []
    for track_id in list_guitarset_track_ids(
        data_home,
        split=training_split,
        validation_player=validation_player,
    ):
        examples.extend(parse_guitarset_jams(annotation_dir / f"{track_id}.jams", cfg))
    if not examples:
        raise RuntimeError(
            f"no GuitarSet prior-training notes for split={training_split!r} under {data_home}"
        )
    return learn_pitch_position_prior(examples, cfg=cfg, alpha=alpha, power=power)
arr.ndim == 2: + arr = arr.mean(axis=1) + if arr.ndim != 1: + raise ValueError(f"expected mono/stereo audio, got shape {arr.shape}") + return arr, int(sr) + + +def evaluate_track( + track_id: str, + backend_name: str, + *, + data_home: str | Path = DEFAULT_DATA_HOME, + cfg: GuitarConfig | None = None, + session: SessionConfig | None = None, + position_prior: PitchPositionPrior | None = None, + backend: AudioBackend | None = None, +) -> TrackEvalResult: + if cfg is None: + cfg = GuitarConfig() + if session is None: + session = SessionConfig() + + root = Path(data_home) + audio_path = root / "audio_mono-mic" / f"{track_id}_mic.wav" + jams_path = root / "annotation" / f"{track_id}.jams" + if not audio_path.is_file(): + raise FileNotFoundError(f"missing GuitarSet audio: {audio_path}") + if not jams_path.is_file(): + raise FileNotFoundError(f"missing GuitarSet JAMS: {jams_path}") + + gold = parse_guitarset_jams(jams_path, cfg) + wav, sr = load_mono_audio(audio_path) + + if backend is None: + from tabvision.audio.backend import make + + backend = make(backend_name) + audio_events = list(backend.transcribe(wav, sr, session)) + if position_prior is not None: + audio_events = apply_pitch_position_prior(audio_events, position_prior) + scored = score_audio_only(audio_events, gold, cfg=cfg, session=session) + return TrackEvalResult( + track_id=track_id, + backend=backend_name, + gold_notes=len(gold), + audio_events=len(audio_events), + decoded_events=len(scored.decoded), + onset=scored.onset, + pitch=scored.pitch, + tab=scored.tab, + ) + + +def summarize_results( + results: Sequence[TrackEvalResult], + *, + backend: str, + split: str, + position_prior: str = "none", +) -> EvalSummary: + total_gold = sum(r.gold_notes for r in results) + total_audio = sum(r.audio_events for r in results) + n_tracks = len(results) + mean_onset = _mean(r.onset.f1 for r in results) + mean_pitch = _mean(r.pitch.f1 for r in results) + mean_tab = _mean(r.tab.f1 for r in results) + return EvalSummary( 
+ backend=backend, + split=split, + position_prior=position_prior, + n_tracks=n_tracks, + total_gold_notes=total_gold, + total_audio_events=total_audio, + mean_onset_f1=mean_onset, + mean_pitch_f1=mean_pitch, + mean_tab_f1=mean_tab, + micro_onset=_sum_event_f1(r.onset for r in results), + micro_pitch=_sum_event_f1(r.pitch for r in results), + micro_tab=_sum_tab_f1(r.tab for r in results), + ) + + +def _mean(values: Iterable[float]) -> float: + collected = list(values) + return sum(collected) / len(collected) if collected else 0.0 + + +def _sum_event_f1(results: Iterable[EventF1Result]) -> EventF1Result: + collected = list(results) + tp = sum(r.true_positives for r in collected) + fp = sum(r.false_positives for r in collected) + fn = sum(r.false_negatives for r in collected) + precision = tp / (tp + fp) if tp + fp else 0.0 + recall = tp / (tp + fn) if tp + fn else 0.0 + f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0 + return EventF1Result(precision, recall, f1, tp, fp, fn) + + +def _sum_tab_f1(results: Iterable[TabF1Result]) -> TabF1Result: + collected = list(results) + tp = sum(r.true_positives for r in collected) + fp = sum(r.false_positives for r in collected) + fn = sum(r.false_negatives for r in collected) + precision = tp / (tp + fp) if tp + fp else 0.0 + recall = tp / (tp + fn) if tp + fn else 0.0 + f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0 + return TabF1Result(precision, recall, f1, tp, fp, fn) + + +def run_eval( + *, + backend_name: str, + data_home: str | Path = DEFAULT_DATA_HOME, + split: str = "validation", + limit: int | None = None, + validation_player: str = DEFAULT_VALIDATION_PLAYER, + position_prior_name: str = "none", + backend_kwargs: Mapping[str, object] | None = None, +) -> tuple[list[TrackEvalResult], EvalSummary]: + track_ids = list_guitarset_track_ids( + data_home, + split=split, + validation_player=validation_player, + ) + if limit is not None: + track_ids = 
track_ids[:limit] + if not track_ids: + raise RuntimeError(f"no GuitarSet tracks found for split={split!r} under {data_home}") + + position_prior: PitchPositionPrior | None = None + if position_prior_name == "guitarset-train": + position_prior = build_guitarset_position_prior( + data_home, + validation_player=validation_player, + ) + elif position_prior_name != "none": + raise ValueError( + f"unknown position prior: {position_prior_name!r}; expected none or guitarset-train" + ) + + from tabvision.audio.backend import make + + backend = make(backend_name, **(dict(backend_kwargs or {}))) + results: list[TrackEvalResult] = [] + for track_id in track_ids: + results.append( + evaluate_track( + track_id, + backend_name, + data_home=data_home, + position_prior=position_prior, + backend=backend, + ) + ) + return results, summarize_results( + results, + backend=backend_name, + split=split, + position_prior=position_prior_name, + ) + + +def write_report( + results: Sequence[TrackEvalResult], + summary: EvalSummary, + *, + output_dir: str | Path = DEFAULT_OUTPUT_DIR, +) -> tuple[Path, Path]: + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + today = dt.date.today().isoformat() + prior_slug = summary.position_prior.replace("_", "-") + stem = f"guitarset_audio_eval-{summary.backend}-{summary.split}-{prior_slug}-{today}" + csv_path = out_dir / f"{stem}.csv" + md_path = out_dir / f"{stem}.md" + + with csv_path.open("w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow( + [ + "track_id", + "backend", + "gold_notes", + "audio_events", + "decoded_events", + "onset_f1", + "pitch_f1", + "tab_f1", + "tab_tp", + "tab_fp", + "tab_fn", + ] + ) + for r in results: + writer.writerow( + [ + r.track_id, + r.backend, + r.gold_notes, + r.audio_events, + r.decoded_events, + f"{r.onset.f1:.6f}", + f"{r.pitch.f1:.6f}", + f"{r.tab.f1:.6f}", + r.tab.true_positives, + r.tab.false_positives, + r.tab.false_negatives, + ] + ) + + lines = [ + f"# 
GuitarSet Audio Eval ({summary.backend})", + "", + f"Split: **{summary.split}**", + f"Position prior: **{summary.position_prior}**", + f"Tracks: **{summary.n_tracks}**", + f"Gold notes: **{summary.total_gold_notes}**", + f"Audio events: **{summary.total_audio_events}**", + "", + "## Aggregate", + "", + "| Metric | Mean F1 | Micro P | Micro R | Micro F1 |", + "| --- | ---: | ---: | ---: | ---: |", + _metric_row("Onset", summary.mean_onset_f1, summary.micro_onset), + _metric_row("Pitch", summary.mean_pitch_f1, summary.micro_pitch), + _metric_row("Tab", summary.mean_tab_f1, summary.micro_tab), + "", + "## Per Track", + "", + "| Track | Gold | Audio | Decoded | Onset F1 | Pitch F1 | Tab F1 |", + "| --- | ---: | ---: | ---: | ---: | ---: | ---: |", + ] + for r in results: + lines.append( + f"| `{r.track_id}` | {r.gold_notes} | {r.audio_events} | " + f"{r.decoded_events} | {r.onset.f1:.3f} | {r.pitch.f1:.3f} | " + f"{r.tab.f1:.3f} |" + ) + md_path.write_text("\n".join(lines) + "\n", encoding="utf-8") + return md_path, csv_path + + +def _metric_row(name: str, mean_f1: float, result: EventF1Result | TabF1Result) -> str: + return ( + f"| {name} | {mean_f1:.3f} | {result.precision:.3f} | " + f"{result.recall:.3f} | {result.f1:.3f} |" + ) + + +def main(argv: Sequence[str] | None = None) -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--backend", + default="highres", + choices=["highres", "highres-fl", "basicpitch"], + ) + parser.add_argument("--data-home", default=str(DEFAULT_DATA_HOME)) + parser.add_argument("--split", default="validation", choices=["validation", "train", "all"]) + parser.add_argument("--limit", type=int, default=None) + parser.add_argument("--validation-player", default=DEFAULT_VALIDATION_PLAYER) + parser.add_argument( + "--device", + default=None, + help="optional backend device override, e.g. 
cuda for GPU runners", + ) + parser.add_argument( + "--position-prior", + default="none", + choices=["none", "guitarset-train"], + help="optional pitch-to-string/fret prior attached before audio-only fusion", + ) + parser.add_argument("--output-dir", default=str(DEFAULT_OUTPUT_DIR)) + args = parser.parse_args(argv) + + try: + results, summary = run_eval( + backend_name=args.backend, + data_home=args.data_home, + split=args.split, + limit=args.limit, + validation_player=args.validation_player, + position_prior_name=args.position_prior, + backend_kwargs={"device": args.device} if args.device else None, + ) + except (BackendError, FileNotFoundError, RuntimeError) as exc: + print(f"setup_blocker={exc}", file=sys.stderr) + return 2 + md_path, csv_path = write_report(results, summary, output_dir=args.output_dir) + print(f"tracks={summary.n_tracks}") + print(f"onset_f1={summary.micro_onset.f1:.4f}") + print(f"pitch_f1={summary.micro_pitch.f1:.4f}") + print(f"tab_f1={summary.micro_tab.f1:.4f}") + print(f"report={md_path}") + print(f"csv={csv_path}") + return 0 + + +__all__ = [ + "AudioOnlyScore", + "build_guitarset_position_prior", + "EvalSummary", + "EventF1Result", + "TrackEvalResult", + "evaluate_track", + "list_guitarset_track_ids", + "load_mono_audio", + "main", + "parse_guitarset_jams", + "run_eval", + "score_audio_only", + "summarize_results", + "write_report", +] diff --git a/tabvision/tabvision/fusion/position_prior.py b/tabvision/tabvision/fusion/position_prior.py new file mode 100644 index 0000000..5065736 --- /dev/null +++ b/tabvision/tabvision/fusion/position_prior.py @@ -0,0 +1,97 @@ +"""Learned pitch-to-position priors for audio-only tab decoding.""" + +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from dataclasses import dataclass + +import numpy as np + +from tabvision.types import AudioEvent, GuitarConfig, TabEvent + + +@dataclass(frozen=True) +class PitchPositionPrior: + """Mapping from MIDI pitch to a normalized 
``(string, fret)`` prior.""" + + by_pitch: Mapping[int, np.ndarray] + + def matrix_for_pitch(self, pitch_midi: int) -> np.ndarray | None: + return self.by_pitch.get(int(pitch_midi)) + + +def learn_pitch_position_prior( + examples: Sequence[TabEvent], + cfg: GuitarConfig | None = None, + *, + alpha: float = 1.0, + power: float = 2.0, +) -> PitchPositionPrior: + """Estimate ``P(string, fret | pitch)`` from tab-labelled examples. + + Smoothing is applied only to playable candidates for each pitch. The + optional ``power`` sharpens observed preferences while preserving zero + probability for impossible positions. + """ + if cfg is None: + cfg = GuitarConfig() + if alpha < 0: + raise ValueError("alpha must be non-negative") + if power <= 0: + raise ValueError("power must be positive") + + priors: dict[int, np.ndarray] = {} + for pitch in range(128): + arr = np.zeros((cfg.n_strings, cfg.max_fret + 1), dtype=np.float64) + for string_idx, open_pitch in enumerate(cfg.tuning_midi): + fret = pitch - open_pitch + if cfg.capo <= fret <= cfg.max_fret: + arr[string_idx, fret] = alpha + priors[pitch] = arr + + for ev in examples: + if ev.pitch_midi not in priors: + continue + if not (0 <= ev.string_idx < cfg.n_strings): + continue + if not (0 <= ev.fret <= cfg.max_fret): + continue + priors[ev.pitch_midi][ev.string_idx, ev.fret] += 1.0 + + normalized: dict[int, np.ndarray] = {} + for pitch, arr in priors.items(): + sharpened = arr**power + total = float(sharpened.sum()) + if total > 0: + normalized[pitch] = sharpened / total + return PitchPositionPrior(normalized) + + +def apply_pitch_position_prior( + events: Sequence[AudioEvent], + prior: PitchPositionPrior, +) -> list[AudioEvent]: + """Return copies of audio events with a pitch-position prior attached.""" + out: list[AudioEvent] = [] + for ev in events: + matrix = prior.matrix_for_pitch(ev.pitch_midi) + out.append( + AudioEvent( + onset_s=ev.onset_s, + offset_s=ev.offset_s, + pitch_midi=ev.pitch_midi, + velocity=ev.velocity, + 
confidence=ev.confidence, + pitch_logits=ev.pitch_logits, + fret_prior=matrix if matrix is not None else ev.fret_prior, + tags=ev.tags, + ) + ) + return out + + +__all__ = [ + "PitchPositionPrior", + "apply_pitch_position_prior", + "learn_pitch_position_prior", +] diff --git a/tabvision/tabvision/video/fretboard/keypoint.py b/tabvision/tabvision/video/fretboard/keypoint.py index e4be112..910ce54 100644 --- a/tabvision/tabvision/video/fretboard/keypoint.py +++ b/tabvision/tabvision/video/fretboard/keypoint.py @@ -15,17 +15,17 @@ backends without changing their math): - x-axis: along the neck, ``0 = nut``, ``1 = body end of detected region``. -- y-axis: across the strings, ``0 = top edge (high E side)``, - ``1 = bottom edge (low E side)``. +- y-axis: across the strings, ``0 = high-E side``, + ``1 = low-E side``. - The four corners of the unit square map to ``top_left``, ``top_right``, ``bottom_right``, ``bottom_left`` in that order. -The high-E vs low-E assignment uses image-Y (smaller Y = top of frame = -high-E side), which is correct for the standard iPhone-on-lap framing -the spec assumes (§7 Phase 3, §1 user setup). Clips with the guitar -flipped will still produce a valid homography but with the canonical -y-axis inverted; downstream consumers that depend on orientation should -sanity-check with the hand pipeline. +The high-E vs low-E assignment uses a lap-framing heuristic: when the +headstock/nut side is to the right of the body side in the image, the +player-facing camera view usually puts high-E on the lower image edge, so +canonical y is flipped. Otherwise smaller image-Y is treated as high-E. +Clips with unusual camera/player handedness may still need a preflight +orientation check. 
""" from __future__ import annotations @@ -91,6 +91,9 @@ def predictions_to_homography(preds: OBBPredictions) -> Homography: nut = preds.best_nut() nut_xy = (nut.cx, nut.cy) if nut else None ordered = _order_corners_by_neck_anatomy(corners, nut_xy) + if nut_xy is not None: + ordered = _extend_nut_edge_to_detection(ordered, nut_xy) + ordered = _orient_string_axis_for_lap_framing(ordered) H = _homography_from_quad(ordered) # noqa: N806 — math-convention name # Confidence: weight neck heavily, boost a bit if we also pinned the @@ -197,6 +200,61 @@ def _order_corners_by_neck_anatomy( ) +def _extend_nut_edge_to_detection( + ordered_corners: np.ndarray, + nut_xy: tuple[float, float], + *, + min_extension_px: float = 5.0, +) -> np.ndarray: + """Move the nut-side edge to an external nut detection when needed. + + The YOLO ``neck`` OBB often starts at the first visible fret rather than + the true nut. In that case canonical x=0 is too far down the neck and + every fingertip projects to an artificially low fret. We keep the OBB's + cross-neck edge vector, but slide that whole edge along the neck axis to + the detected nut center when the nut lies beyond the current nut edge. 
+ """ + if ordered_corners.shape != (4, 2): + raise ValueError(f"expected (4, 2) corners, got {ordered_corners.shape}") + + out = ordered_corners.astype(np.float64, copy=True) + nut_mid = (out[0] + out[3]) / 2.0 + body_mid = (out[1] + out[2]) / 2.0 + neck_axis = body_mid - nut_mid + norm = float(np.linalg.norm(neck_axis)) + if norm <= 1e-9: + return out + + axis_u = neck_axis / norm + nut_pt = np.array(nut_xy, dtype=np.float64) + signed_distance = float(np.dot(nut_pt - nut_mid, axis_u)) + if signed_distance >= -min_extension_px: + return out + + cross_edge = out[3] - out[0] + new_nut_mid = nut_mid + signed_distance * axis_u + out[0] = new_nut_mid - cross_edge / 2.0 + out[3] = new_nut_mid + cross_edge / 2.0 + return out + + +def _orient_string_axis_for_lap_framing(ordered_corners: np.ndarray) -> np.ndarray: + """Infer canonical string-side orientation for common iPhone/lap videos. + + ``ordered_corners`` arrives as [nut top, body top, body bottom, nut bottom] + in image-Y terms. When the nut/headstock is to the right of the body side, + the front-facing lap view typically shows low-E on the image-top edge and + high-E on the image-bottom edge. Since canonical y=0 means high-E, swap the + top/bottom edges in that case. 
+ """ + out = ordered_corners.astype(np.float64, copy=True) + nut_mid = (out[0] + out[3]) / 2.0 + body_mid = (out[1] + out[2]) / 2.0 + if nut_mid[0] > body_mid[0]: + out = np.array([out[3], out[2], out[1], out[0]], dtype=np.float64) + return out + + def _split_top_bottom(corners: np.ndarray, i: int, j: int) -> tuple[int, int]: """Return (top_idx, bot_idx) for the two corner indices ``i, j``.""" if corners[i, 1] <= corners[j, 1]: diff --git a/tabvision/tests/eval/test_phase5_eval.py b/tabvision/tests/eval/test_phase5_eval.py index e8df108..9b5dfba 100644 --- a/tabvision/tests/eval/test_phase5_eval.py +++ b/tabvision/tests/eval/test_phase5_eval.py @@ -30,18 +30,26 @@ import datetime as _dt import json +import shutil +import subprocess +import sys from collections.abc import Sequence +from dataclasses import replace +from functools import cache from pathlib import Path import pytest from tabvision.eval.metrics import ( - ChordAccuracyResult, - TabF1Result, chord_instance_accuracy, tab_f1, ) -from tabvision.types import TabEvent +from tabvision.types import ( + DEFAULT_TUNING_MIDI, + AudioEvent, + FrameFingering, + TabEvent, +) PHASE5_TAB_F1_DELTA_GATE = 0.08 """SPEC §5: audio+vision must beat audio-only by at least this much on Tab F1.""" @@ -52,6 +60,18 @@ PHASE5_CHORD_ACCURACY_GATE = 0.80 """SPEC §5: chord-instance accuracy gate.""" +PHASE5_ALIGNMENT_TOLERANCE_S = 0.50 +"""Loose pitch-only window used only to find per-clip gold alignment offsets.""" + +PHASE5_ALIGNMENT_STEP_S = 0.05 +"""Offset-search step. 
Fine enough for later 50 ms strict Tab F1 scoring.""" + +PHASE5_LAMBDA_SWEEP = (0.0, 0.5, 1.0, 2.0, 5.0) +"""Diagnostic sweep when default ``lambda_vision=1.0`` misses the delta gate.""" + +LEGACY_MAX_FRET = 24 +"""Max fret used by the frozen v0 tab parser when disambiguating 2-digit frets.""" + REPO_ROOT = Path(__file__).resolve().parents[3] BENCHMARK_INDEX = ( REPO_ROOT / "tabvision-server" / "tests" / "fixtures" / "benchmarks" / "index.json" @@ -69,58 +89,17 @@ def test_phase5_audio_plus_vision_beats_audio_only(): is unavailable, *or* when the video-stack-into-pipeline integration is still a TODO in ``_run_pipeline``. """ - pytest.importorskip("torch", reason="highres backend needs torch.") - pytest.importorskip( - "mediapipe", - reason="MediaPipe needed for video evidence; install with pip install '.[vision]'.", - ) - pytest.importorskip("cv2", reason="opencv-python needed for video frames.") + _require_eval_readiness() - benchmarks = _load_benchmarks() - if not benchmarks: - pytest.skip("no benchmarks defined in index.json") - - audio_only_scores: list[TabF1Result] = [] - audio_video_scores: list[TabF1Result] = [] - chord_scores: list[ChordAccuracyResult] = [] - rows: list[dict] = [] + rows = _collect_phase5_rows(lambda_vision=1.0) - for bench in benchmarks: - video = REPO_ROOT / bench["video_path"] - gold_path = REPO_ROOT / bench["ground_truth_path"] - if not video.exists() or not gold_path.exists(): - continue - gold = _load_gold_tab_events(gold_path) - if not gold: - continue - - ao = _run_pipeline(video, lambda_vision=0.0) - av = _run_pipeline(video, lambda_vision=1.0) - - ao_score = tab_f1(ao, gold) - av_score = tab_f1(av, gold) - chord_score = chord_instance_accuracy(av, gold) - - audio_only_scores.append(ao_score) - audio_video_scores.append(av_score) - chord_scores.append(chord_score) - rows.append( - { - "id": bench["id"], - "ao_f1": ao_score.f1, - "av_f1": av_score.f1, - "delta": av_score.f1 - ao_score.f1, - "chord_acc": chord_score.accuracy, - } - ) 
- - if not rows: - pytest.skip("no benchmark videos / ground truth files were available") - - ao_mean = _mean([r.f1 for r in audio_only_scores]) - av_mean = _mean([r.f1 for r in audio_video_scores]) - chord_mean = _mean([r.accuracy for r in chord_scores]) + ao_mean = _mean([r["ao_f1"] for r in rows]) + av_mean = _mean([r["av_f1"] for r in rows]) + chord_mean = _mean([r["chord_acc"] for r in rows]) delta = av_mean - ao_mean + sweep_rows = [] + if delta < PHASE5_TAB_F1_DELTA_GATE: + sweep_rows = _run_lambda_sweep() _write_report( rows=rows, @@ -128,6 +107,7 @@ def test_phase5_audio_plus_vision_beats_audio_only(): av_mean=av_mean, delta=delta, chord_mean=chord_mean, + sweep_rows=sweep_rows, ) assert delta >= PHASE5_TAB_F1_DELTA_GATE, ( @@ -146,30 +126,10 @@ def test_phase5_audio_plus_vision_beats_audio_only(): strict=False, ) def test_phase5_absolute_tab_f1(): - pytest.importorskip("torch") - pytest.importorskip("mediapipe") - pytest.importorskip("cv2") - - benchmarks = _load_benchmarks() - if not benchmarks: - pytest.skip("no benchmarks defined in index.json") + _require_eval_readiness() - scores: list[TabF1Result] = [] - for bench in benchmarks: - video = REPO_ROOT / bench["video_path"] - gold_path = REPO_ROOT / bench["ground_truth_path"] - if not video.exists() or not gold_path.exists(): - continue - gold = _load_gold_tab_events(gold_path) - if not gold: - continue - av = _run_pipeline(video, lambda_vision=1.0) - scores.append(tab_f1(av, gold)) - - if not scores: - pytest.skip("no benchmark videos available") - - mean_f1 = _mean([s.f1 for s in scores]) + rows = _collect_phase5_rows(lambda_vision=1.0) + mean_f1 = _mean([r["av_f1"] for r in rows]) assert mean_f1 >= PHASE5_TAB_F1_ABSOLUTE_GATE, ( f"absolute Tab F1 {mean_f1:.3f} < {PHASE5_TAB_F1_ABSOLUTE_GATE}" ) @@ -177,85 +137,490 @@ def test_phase5_absolute_tab_f1(): @pytest.mark.eval def test_phase5_chord_accuracy(): - pytest.importorskip("torch") - pytest.importorskip("mediapipe") - pytest.importorskip("cv2") + 
_require_eval_readiness() + + rows = _collect_phase5_rows(lambda_vision=1.0) + mean_acc = _mean([r["chord_acc"] for r in rows]) + assert mean_acc >= PHASE5_CHORD_ACCURACY_GATE, ( + f"chord accuracy {mean_acc:.3f} < {PHASE5_CHORD_ACCURACY_GATE}" + ) + + +# ---------- helpers ---------- + + +@pytest.fixture(autouse=True) +def _phase5_eval_requires_marker(request: pytest.FixtureRequest) -> None: + markexpr = str(getattr(request.config.option, "markexpr", "") or "") + if request.node.get_closest_marker("eval") and "eval" not in markexpr: + pytest.skip( + "Phase 5 eval is opt-in; run with " + "`pytest -m eval tests/eval/test_phase5_eval.py`." + ) + + +def _load_benchmarks() -> list[dict]: + if not BENCHMARK_INDEX.exists(): + return [] + return json.loads(BENCHMARK_INDEX.read_text()).get("benchmarks", []) + +def _require_eval_readiness() -> None: + """Skip only for optional heavy dependencies / model artifacts. + + Benchmark-data problems fail later because those are repo issues, not + optional local-environment issues. 
+ """ + pytest.importorskip("torch", reason="highres backend needs torch.") + pytest.importorskip( + "hf_midi_transcription", + reason="highres backend needs hf-midi-transcription.", + ) + pytest.importorskip("soundfile", reason="highres backend needs soundfile.") + pytest.importorskip("scipy.signal", reason="highres backend needs scipy.") + pytest.importorskip("pretty_midi", reason="highres backend needs pretty_midi.") + pytest.importorskip( + "mediapipe", + reason="MediaPipe needed for video evidence; install with pip install '.[vision]'.", + ) + pytest.importorskip("cv2", reason="opencv-python needed for video frames.") + + if not shutil.which("ffmpeg"): + pytest.skip("ffmpeg not on PATH; required by tabvision.demux") + if not shutil.which("ffprobe"): + pytest.skip("ffprobe not on PATH; required by tabvision.demux") + + from tabvision.video.guitar.yolo_backend import _default_checkpoint_path + from tabvision.video.hand.mediapipe_backend import _default_model_path + + hand_model = _default_model_path() + if not hand_model.exists(): + pytest.skip(f"MediaPipe hand model not found at {hand_model}") + _require_mediapipe_landmarker_loads(hand_model) + + yolo_checkpoint = _default_checkpoint_path() + if not yolo_checkpoint.exists(): + pytest.skip(f"YOLO-OBB checkpoint not found at {yolo_checkpoint}") + + +def _require_mediapipe_landmarker_loads(hand_model: Path) -> None: + """Probe native MediaPipe readiness out-of-process so segfaults become skips.""" + probe = """ +import sys +from mediapipe.tasks import python +from mediapipe.tasks.python import vision + +base_options = python.BaseOptions(model_asset_path=sys.argv[1]) +options = vision.HandLandmarkerOptions(base_options=base_options, num_hands=1) +landmarker = vision.HandLandmarker.create_from_options(options) +landmarker.close() +""" + try: + proc = subprocess.run( + [sys.executable, "-c", probe, str(hand_model)], + capture_output=True, + check=False, + text=True, + timeout=30, + ) + except 
subprocess.TimeoutExpired: + pytest.skip("MediaPipe HandLandmarker readiness probe timed out.") + if proc.returncode == 0: + return + + details = (proc.stderr or proc.stdout).strip().splitlines() + reason = details[-1] if details else f"process exited {proc.returncode}" + pytest.skip( + "MediaPipe HandLandmarker readiness probe failed; install compatible " + f"system OpenGL/GLES runtime libraries. Last error: {reason}" + ) + + +@cache +def _collect_phase5_rows(*, lambda_vision: float) -> tuple[dict, ...]: benchmarks = _load_benchmarks() if not benchmarks: - pytest.skip("no benchmarks defined in index.json") + pytest.fail(f"no benchmarks defined in {BENCHMARK_INDEX}") - scores: list[ChordAccuracyResult] = [] + rows: list[dict] = [] + available = 0 + parsed_gold = 0 + runner = _Phase5Runner(lambda_vision=lambda_vision) for bench in benchmarks: video = REPO_ROOT / bench["video_path"] gold_path = REPO_ROOT / bench["ground_truth_path"] if not video.exists() or not gold_path.exists(): continue - gold = _load_gold_tab_events(gold_path) + available += 1 + + video_duration_s = _video_duration_s(video) + gold = _load_gold_tab_events( + gold_path, + bpm=bench.get("bpm"), + video_duration_s=video_duration_s, + ) if not gold: continue - av = _run_pipeline(video, lambda_vision=1.0) - scores.append(chord_instance_accuracy(av, gold)) + parsed_gold += 1 - if not scores: - pytest.skip("no benchmark videos available") + ao, av = runner.run(video) + aligned_gold, offset_s, alignment_matches = _align_gold_to_audio_only( + audio_only=ao, + gold=gold, + video_duration_s=video_duration_s, + ) - mean_acc = _mean([s.accuracy for s in scores]) - assert mean_acc >= PHASE5_CHORD_ACCURACY_GATE, ( - f"chord accuracy {mean_acc:.3f} < {PHASE5_CHORD_ACCURACY_GATE}" + ao_score = tab_f1(ao, aligned_gold) + av_score = tab_f1(av, aligned_gold) + chord_score = chord_instance_accuracy(av, aligned_gold) + + rows.append( + { + "id": bench["id"], + "lambda": lambda_vision, + "ao_f1": ao_score.f1, + "av_f1": 
av_score.f1, + "delta": av_score.f1 - ao_score.f1, + "chord_acc": chord_score.accuracy, + "offset_s": offset_s, + "alignment_matches": alignment_matches, + "gold_count": len(aligned_gold), + "ao_count": len(ao), + "av_count": len(av), + } + ) + + if available == 0: + pytest.fail("benchmark index exists, but no referenced video/ground-truth files exist") + if parsed_gold == 0: + pytest.fail("benchmark files exist, but no ground-truth notes parsed") + if rows and all(r["alignment_matches"] == 0 for r in rows): + pytest.fail( + "pitch-only alignment found zero matches on every clip; inspect audio backend " + "output before trusting strict Tab F1" + ) + return tuple(rows) + + +class _Phase5Runner: + """Eval-only runner that reuses expensive per-clip evidence.""" + + def __init__(self, *, lambda_vision: float) -> None: + from tabvision.types import GuitarConfig, SessionConfig + + self.lambda_vision = lambda_vision + self.cfg = GuitarConfig() + self.session = SessionConfig() + + def run(self, video: Path) -> tuple[list[TabEvent], list[TabEvent]]: + from tabvision.fusion import apply_neck_anchor_priors, fuse + + audio_events = list(_phase5_audio_events(video)) + audio_only = list( + fuse( + audio_events, + [], + self.cfg, + self.session, + lambda_vision=0.0, + ) + ) + + fingerings = [] + av_audio_events = audio_events + if self.lambda_vision > 0.0: + fingerings, neck_anchors = _phase5_video_evidence(video) + if neck_anchors: + av_audio_events = apply_neck_anchor_priors( + audio_events, + neck_anchors, + self.cfg, + ) + + audio_vision = list( + fuse( + av_audio_events, + fingerings, + self.cfg, + self.session, + lambda_vision=self.lambda_vision, + ) + ) + return audio_only, audio_vision + + +@cache +def _phase5_audio_backend(): + from tabvision.pipeline import _make_audio_backend + + return _make_audio_backend("highres") + + +@cache +def _phase5_video_backends(): + from tabvision.pipeline import ( + _make_fretboard_backend, + _make_guitar_backend, + _make_hand_backend, ) + 
return _make_guitar_backend(), _make_fretboard_backend(), _make_hand_backend() -# ---------- helpers ---------- +@cache +def _phase5_audio_events(video: Path) -> tuple[AudioEvent, ...]: + from tabvision.demux import demux + from tabvision.types import SessionConfig -def _load_benchmarks() -> list[dict]: - if not BENCHMARK_INDEX.exists(): - return [] - return json.loads(BENCHMARK_INDEX.read_text()).get("benchmarks", []) + demuxed = demux(video) + return tuple( + _phase5_audio_backend().transcribe( + demuxed.wav, + demuxed.sample_rate, + SessionConfig(), + ) + ) -def _load_gold_tab_events(path: Path) -> list[TabEvent]: - """Parse the legacy benchmark ground-truth ``.txt`` format into TabEvents. +@cache +def _phase5_video_evidence(video: Path) -> tuple[tuple[FrameFingering, ...], tuple]: + from tabvision.demux import demux + from tabvision.pipeline import _run_video_stack + from tabvision.types import GuitarConfig + + guitar_backend, fretboard_backend, hand_backend = _phase5_video_backends() + demuxed = demux(video) + result = _run_video_stack( + demuxed.frame_iterator, + stride=3, + cfg=GuitarConfig(), + guitar_backend=guitar_backend, + fretboard_backend=fretboard_backend, + hand_backend=hand_backend, + ) + return tuple(result.fingerings), tuple(result.neck_anchors) - The legacy parser lives in ``tabvision-server/evaluate_transcription.py``; - this helper imports it lazily to keep the eval module's deps minimal. - Returns an empty list if the legacy module isn't importable (e.g. when - the test runs from an environment without the server checked out). 
- """ - try: - import sys +def _load_gold_tab_events( + path: Path, + *, + bpm: float | int | None, + video_duration_s: float, +) -> list[TabEvent]: + """Parse legacy tab text and convert beat positions into real seconds.""" + text = path.read_text() + parsed = _parse_ground_truth_tabs(text) + return _gold_notes_to_tab_events(parsed, bpm=bpm, video_duration_s=video_duration_s) + + +def _parse_ground_truth_tabs(text: str) -> list[dict]: + """Use v0's parser when importable; otherwise mirror its lightweight logic.""" + try: server_path = REPO_ROOT / "tabvision-server" if str(server_path) not in sys.path: sys.path.insert(0, str(server_path)) from evaluate_transcription import parse_ground_truth_tabs except Exception: # noqa: BLE001 — broad: optional dep, want graceful skip + return _parse_legacy_tab_text(text) + + return parse_ground_truth_tabs(text) + + +def _parse_legacy_tab_text(text: str) -> list[dict]: + """Mirror ``tabvision-server/evaluate_transcription.py`` tab parser.""" + string_map = {"e": 1, "B": 2, "G": 3, "D": 4, "A": 5, "E": 6} + notes: list[dict] = [] + + for line in text.strip().splitlines(): + if "|" not in line: + continue + + parts = line.split("|") + if len(parts) < 2: + continue + + string_id = None + for char in parts[0].strip(): + if char in string_map: + string_id = string_map[char] + break + if string_id is None: + continue + + content = "|".join(parts[1:]) + i = 0 + beat_position = 0.0 + while i < len(content): + char = content[i] + if char == "|": + i += 1 + elif char == "-": + beat_position += 0.25 + i += 1 + elif char.isdigit(): + fret_str = char + if i + 1 < len(content) and content[i + 1].isdigit(): + two_digit_fret = int(char + content[i + 1]) + if two_digit_fret <= LEGACY_MAX_FRET: + fret_str = char + content[i + 1] + i += 1 + notes.append( + { + "string": string_id, + "fret": int(fret_str), + "beat": beat_position, + } + ) + beat_position += 0.25 + i += 1 + elif char in ("X", "x", "/"): + if char in ("X", "x"): + notes.append( + { + 
"string": string_id, + "fret": "X", + "beat": beat_position, + } + ) + beat_position += 0.25 + i += 1 + else: + i += 1 + + return sorted(notes, key=lambda n: (n["beat"], n["string"])) + + +def _gold_notes_to_tab_events( + notes: Sequence[dict], + *, + bpm: float | int | None, + video_duration_s: float, +) -> list[TabEvent]: + """Convert legacy ``parse_ground_truth_tabs`` dicts to timed TabEvents.""" + pitched = [n for n in notes if n.get("fret") not in ("X", "x")] + if not pitched: return [] - text = path.read_text() - parsed = parse_ground_truth_tabs(text) - # The legacy parser returns beats; we need seconds. The benchmarks - # don't carry duration, so this helper currently returns the parsed - # raw notes without timing. Phase 5 acceptance defers timing - # alignment to the per-video runner that knows the video duration — - # see ``_run_pipeline``. + max_beat = max(float(n["beat"]) for n in pitched) + if bpm is not None and float(bpm) > 0.0: + beat_to_time = 60.0 / float(bpm) + else: + beat_to_time = video_duration_s / max_beat if max_beat > 0.0 else 1.0 + out: list[TabEvent] = [] - for note in parsed: + for note in pitched: + fret = int(note["fret"]) + string_idx = 6 - int(note["string"]) + if string_idx < 0 or string_idx >= len(DEFAULT_TUNING_MIDI): + continue out.append( TabEvent( - onset_s=float(note["beat"]), # placeholder — runner aligns + onset_s=float(note["beat"]) * beat_to_time, duration_s=0.25, - # Legacy uses 1=high E, 6=low E; spec uses 0=low E, 5=high E. 
- string_idx=6 - int(note["string"]), - fret=0 if note["fret"] == "X" else int(note["fret"]), - pitch_midi=0, # not needed for Tab F1 + string_idx=string_idx, + fret=fret, + pitch_midi=DEFAULT_TUNING_MIDI[string_idx] + fret, confidence=1.0, ) ) return out +def _video_duration_s(video: Path) -> float: + from tabvision.demux import _probe_metadata + + duration_s, _fps = _probe_metadata(video) + return duration_s + + +def _align_gold_to_audio_only( + *, + audio_only: Sequence[TabEvent], + gold: Sequence[TabEvent], + video_duration_s: float, +) -> tuple[list[TabEvent], float, int]: + offset_s, matches = _find_best_pitch_offset( + predicted=audio_only, + gold=gold, + video_duration_s=video_duration_s, + tolerance_s=PHASE5_ALIGNMENT_TOLERANCE_S, + step_s=PHASE5_ALIGNMENT_STEP_S, + ) + return [replace(g, onset_s=g.onset_s + offset_s) for g in gold], offset_s, matches + + +def _find_best_pitch_offset( + *, + predicted: Sequence[TabEvent], + gold: Sequence[TabEvent], + video_duration_s: float, + tolerance_s: float, + step_s: float, +) -> tuple[float, int]: + """Search positive global offsets using pitch-only matches.""" + if not predicted or not gold: + return 0.0, 0 + + first_gold = min(g.onset_s for g in gold) + last_gold = max(g.onset_s for g in gold) + gt_span = max(0.0, last_gold - first_gold) + pred_max = max((p.onset_s for p in predicted), default=video_duration_s) + search_duration = max(video_duration_s, pred_max + tolerance_s) + max_offset = max(0.0, search_duration - gt_span) + + n_steps = int(max_offset / step_s) + 1 + candidate_offsets = [i * step_s for i in range(n_steps)] + best_offset = 0.0 + best_matches = -1 + best_error = float("inf") + + for offset in candidate_offsets: + matches, error = _pitch_match_stats(predicted, gold, offset, tolerance_s) + if matches > best_matches or ( + matches == best_matches and error < best_error + ): + best_matches = matches + best_offset = offset + best_error = error + + return best_offset, max(best_matches, 0) + + +def 
_count_pitch_matches( + predicted: Sequence[TabEvent], + gold: Sequence[TabEvent], + offset_s: float, + tolerance_s: float, +) -> int: + return _pitch_match_stats(predicted, gold, offset_s, tolerance_s)[0] + + +def _pitch_match_stats( + predicted: Sequence[TabEvent], + gold: Sequence[TabEvent], + offset_s: float, + tolerance_s: float, +) -> tuple[int, float]: + gold_used = [False] * len(gold) + matches = 0 + total_error = 0.0 + for pred in sorted(predicted, key=lambda t: t.onset_s): + best_j = -1 + best_dt = tolerance_s + 1e-9 + for j, g in enumerate(gold): + if gold_used[j] or pred.pitch_midi != g.pitch_midi: + continue + dt = abs(pred.onset_s - (g.onset_s + offset_s)) + if dt <= tolerance_s and dt < best_dt: + best_j = j + best_dt = dt + if best_j >= 0: + gold_used[best_j] = True + matches += 1 + total_error += best_dt + return matches, total_error if matches else float("inf") + + def _run_pipeline( video: Path, *, @@ -273,6 +638,7 @@ def _run_pipeline( video, audio_backend_name=audio_backend_name, lambda_vision=lambda_vision, + video_enabled=lambda_vision > 0.0, ) @@ -280,13 +646,32 @@ def _mean(values: list[float]) -> float: return sum(values) / len(values) if values else 0.0 +def _run_lambda_sweep() -> list[dict]: + sweep_rows: list[dict] = [] + for lambda_vision in PHASE5_LAMBDA_SWEEP: + rows = _collect_phase5_rows(lambda_vision=lambda_vision) + ao_mean = _mean([r["ao_f1"] for r in rows]) + av_mean = _mean([r["av_f1"] for r in rows]) + sweep_rows.append( + { + "lambda": lambda_vision, + "ao_mean": ao_mean, + "av_mean": av_mean, + "delta": av_mean - ao_mean, + "chord_mean": _mean([r["chord_acc"] for r in rows]), + } + ) + return sweep_rows + + def _write_report( *, - rows: list[dict], + rows: Sequence[dict], ao_mean: float, av_mean: float, delta: float, chord_mean: float, + sweep_rows: list[dict], ) -> None: """Emit ``tools/outputs/phase5_eval-YYYY-MM-DD.md`` summary report.""" EVAL_OUTPUT_DIR.mkdir(parents=True, exist_ok=True) @@ -307,15 +692,38 @@ def 
_write_report( f"| Mean chord-instance accuracy | {chord_mean:.4f} |", f"| Phase 5 +{PHASE5_TAB_F1_DELTA_GATE * 100:.0f}pp gate | " f"{'PASS' if delta >= PHASE5_TAB_F1_DELTA_GATE else 'FAIL'} |", + f"| Chord accuracy gate | " + f"{'PASS' if chord_mean >= PHASE5_CHORD_ACCURACY_GATE else 'FAIL'} |", + f"| Absolute Tab F1 gate | " + f"{'PASS' if av_mean >= PHASE5_TAB_F1_ABSOLUTE_GATE else 'DEFER/FAIL'} |", "", "## Per-video", "", - "| id | audio-only F1 | audio+vision F1 | delta | chord acc |", - "|---|---:|---:|---:|---:|", + "| id | audio-only F1 | audio+vision F1 | delta | chord acc | " + "offset | align matches | gold | ao notes | av notes |", + "|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|", ] for r in rows: lines.append( f"| {r['id']} | {r['ao_f1']:.3f} | {r['av_f1']:.3f} | " - f"{r['delta']:+.3f} | {r['chord_acc']:.3f} |" + f"{r['delta']:+.3f} | {r['chord_acc']:.3f} | " + f"{r['offset_s']:.2f}s | {r['alignment_matches']} | " + f"{r['gold_count']} | {r['ao_count']} | {r['av_count']} |" ) + + if sweep_rows: + lines.extend( + [ + "", + "## Diagnostic lambda sweep", + "", + "| lambda_vision | audio-only F1 | audio+vision F1 | delta | chord acc |", + "|---:|---:|---:|---:|---:|", + ] + ) + for r in sweep_rows: + lines.append( + f"| {r['lambda']:.1f} | {r['ao_mean']:.3f} | {r['av_mean']:.3f} | " + f"{r['delta']:+.3f} | {r['chord_mean']:.3f} |" + ) out.write_text("\n".join(lines) + "\n") diff --git a/tabvision/tests/unit/test_fretboard_keypoint.py b/tabvision/tests/unit/test_fretboard_keypoint.py index 0093054..078182c 100644 --- a/tabvision/tests/unit/test_fretboard_keypoint.py +++ b/tabvision/tests/unit/test_fretboard_keypoint.py @@ -13,9 +13,11 @@ from tabvision.types import Homography from tabvision.video.fretboard.keypoint import ( + _extend_nut_edge_to_detection, _homography_from_quad, _obb_to_corners, _order_corners_by_neck_anatomy, + _orient_string_axis_for_lap_framing, predictions_to_homography, ) from tabvision.video.guitar.yolo_backend import ( @@ 
-121,6 +123,83 @@ def test_order_corners_falls_back_to_smaller_x_when_nut_missing(): assert ordered[3].tolist() == [50.0, 140.0] +def test_extend_nut_edge_to_external_nut_detection(): + ordered = np.array( + [ + [50.0, 100.0], + [450.0, 100.0], + [450.0, 140.0], + [50.0, 140.0], + ], + dtype=np.float64, + ) + + extended = _extend_nut_edge_to_detection(ordered, (-50.0, 120.0)) + + assert extended[0].tolist() == pytest.approx([-50.0, 100.0]) + assert extended[3].tolist() == pytest.approx([-50.0, 140.0]) + assert extended[1].tolist() == pytest.approx([450.0, 100.0]) + assert extended[2].tolist() == pytest.approx([450.0, 140.0]) + + +def test_extend_nut_edge_leaves_internal_nut_detection_alone(): + ordered = np.array( + [ + [50.0, 100.0], + [450.0, 100.0], + [450.0, 140.0], + [50.0, 140.0], + ], + dtype=np.float64, + ) + + extended = _extend_nut_edge_to_detection(ordered, (55.0, 120.0)) + + assert np.allclose(extended, ordered) + + +def test_orient_string_axis_flips_when_nut_is_right_of_body(): + ordered = np.array( + [ + [450.0, 100.0], + [50.0, 100.0], + [50.0, 140.0], + [450.0, 140.0], + ], + dtype=np.float64, + ) + + oriented = _orient_string_axis_for_lap_framing(ordered) + + assert np.allclose( + oriented, + np.array( + [ + [450.0, 140.0], + [50.0, 140.0], + [50.0, 100.0], + [450.0, 100.0], + ] + ), + ) + + +def test_orient_string_axis_keeps_left_nut_top_as_high_e(): + ordered = np.array( + [ + [50.0, 100.0], + [450.0, 100.0], + [450.0, 140.0], + [50.0, 140.0], + ], + dtype=np.float64, + ) + + oriented = _orient_string_axis_for_lap_framing(ordered) + + assert np.allclose(oriented, ordered) + + # ----- homography construction ----- @@ -225,6 +304,22 @@ def test_predictions_canonical_origin_maps_to_nut_when_nut_detected(): assert proj[0, 1] == pytest.approx(200.0, abs=1.0) +def test_predictions_canonical_origin_extends_to_external_nut_detection(): + """When the nut lies beyond the neck OBB edge, canonical x=0 follows it.""" + preds = OBBPredictions( + 
neck=[_neck(250, 200, 400, 60, conf=0.8)], + nut=[_nut(-50, 200, conf=0.5)], + ) + homog = predictions_to_homography(preds) + + pt = np.array([[0.0, 0.5, 1.0]]) + proj = (homog.H @ pt.T).T + proj = proj[:, :2] / proj[:, 2:] + + assert proj[0, 0] == pytest.approx(-50.0, abs=1.0) + assert proj[0, 1] == pytest.approx(200.0, abs=1.0) + + def test_predictions_canonical_top_maps_to_smaller_image_y(): """Canonical y=0 (top, high-E) maps to the smaller image-y edge.""" preds = OBBPredictions( @@ -240,3 +335,21 @@ def test_predictions_canonical_top_maps_to_smaller_image_y(): proj_bot = (homog.H @ pt_bot.T).T proj_bot = proj_bot[:, :2] / proj_bot[:, 2:] assert proj_top[0, 1] < proj_bot[0, 1] + + +def test_predictions_canonical_top_maps_to_lower_edge_when_nut_on_right(): + """For common lap framing with headstock right, high-E is lower in image.""" + preds = OBBPredictions( + neck=[_neck(250, 200, 400, 60, conf=0.8)], + nut=[_nut(450, 200, conf=0.5)], + ) + homog = predictions_to_homography(preds) + + pt_top = np.array([[0.0, 0.0, 1.0]]) + pt_bot = np.array([[0.0, 1.0, 1.0]]) + proj_top = (homog.H @ pt_top.T).T + proj_top = proj_top[:, :2] / proj_top[:, 2:] + proj_bot = (homog.H @ pt_bot.T).T + proj_bot = proj_bot[:, :2] / proj_bot[:, 2:] + + assert proj_top[0, 1] > proj_bot[0, 1] diff --git a/tabvision/tests/unit/test_guitarset_audio_eval.py b/tabvision/tests/unit/test_guitarset_audio_eval.py new file mode 100644 index 0000000..d31eea4 --- /dev/null +++ b/tabvision/tests/unit/test_guitarset_audio_eval.py @@ -0,0 +1,258 @@ +"""Unit tests for the v1 GuitarSet audio-only eval helpers.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import numpy as np +import pytest + +from tabvision.errors import BackendError +from tabvision.eval.guitarset_audio import ( + EventF1Result, + TrackEvalResult, + _score_event_f1, + build_guitarset_position_prior, + list_guitarset_track_ids, + parse_guitarset_jams, + score_audio_only, + summarize_results, +) +from 
tabvision.eval.metrics import TabF1Result +from tabvision.types import AudioEvent, TabEvent + + +def _write_jams(path: Path) -> None: + payload = { + "annotations": [ + { + "namespace": "note_midi", + "annotation_metadata": {"data_source": "0"}, + "data": [ + {"time": 0.10, "duration": 0.25, "value": 44.1}, + {"time": 0.60, "duration": 0.30, "value": 38.9}, + ], + }, + { + "namespace": "note_midi", + "annotation_metadata": {"data_source": "5"}, + "data": [ + {"time": 1.00, "duration": 0.40, "value": 72.0}, + ], + }, + { + "namespace": "pitch_contour", + "annotation_metadata": {"data_source": "5"}, + "data": {"time": [], "duration": [], "value": [], "confidence": []}, + }, + ] + } + path.write_text(json.dumps(payload), encoding="utf-8") + + +def test_parse_guitarset_jams_retains_string_fret_and_pitch(tmp_path: Path): + jams_path = tmp_path / "clip.jams" + _write_jams(jams_path) + + notes = parse_guitarset_jams(jams_path) + + assert [(n.onset_s, n.duration_s, n.string_idx, n.fret, n.pitch_midi) for n in notes] == [ + (0.10, 0.25, 0, 4, 44), + (1.00, 0.40, 5, 8, 72), + ] + + +def test_validation_track_listing_uses_held_out_player(tmp_path: Path): + ann = tmp_path / "annotation" + audio = tmp_path / "audio_mono-mic" + ann.mkdir() + audio.mkdir() + for track_id in ["00_alpha", "05_beta", "05_gamma"]: + (ann / f"{track_id}.jams").write_text("{}", encoding="utf-8") + (audio / f"{track_id}_mic.wav").write_bytes(b"RIFF") + + assert list_guitarset_track_ids(tmp_path, split="validation") == ["05_beta", "05_gamma"] + + +def test_build_guitarset_position_prior_uses_train_split_only(tmp_path: Path): + ann = tmp_path / "annotation" + audio = tmp_path / "audio_mono-mic" + ann.mkdir() + audio.mkdir() + + train_jams = tmp_path / "annotation" / "00_train.jams" + train_jams.write_text( + json.dumps( + { + "annotations": [ + { + "namespace": "note_midi", + "annotation_metadata": {"data_source": "3"}, + "data": [{"time": 0.0, "duration": 0.2, "value": 69.0}], + } + ] + } + ), + 
encoding="utf-8", + ) + validation_jams = tmp_path / "annotation" / "05_validation.jams" + validation_jams.write_text( + json.dumps( + { + "annotations": [ + { + "namespace": "note_midi", + "annotation_metadata": {"data_source": "5"}, + "data": [{"time": 0.0, "duration": 0.2, "value": 69.0}], + } + ] + } + ), + encoding="utf-8", + ) + (audio / "00_train_mic.wav").write_bytes(b"RIFF") + (audio / "05_validation_mic.wav").write_bytes(b"RIFF") + + prior = build_guitarset_position_prior(tmp_path) + matrix = prior.matrix_for_pitch(69) + + assert matrix is not None + assert matrix[3, 14] > matrix[5, 5] + + +def test_event_f1_can_score_onsets_separately_from_pitch(): + pred = [ + TabEvent(1.00, 0.2, string_idx=0, fret=5, pitch_midi=45, confidence=1.0), + TabEvent(2.00, 0.2, string_idx=5, fret=3, pitch_midi=67, confidence=1.0), + ] + gold = [ + TabEvent(1.02, 0.2, string_idx=0, fret=4, pitch_midi=44, confidence=1.0), + TabEvent(2.02, 0.2, string_idx=5, fret=3, pitch_midi=67, confidence=1.0), + ] + + onset = _score_event_f1(pred, gold, match_pitch=False) + pitch = _score_event_f1(pred, gold, match_pitch=True) + + assert onset.f1 == 1.0 + assert pitch.true_positives == 1 + assert pitch.false_positives == 1 + assert pitch.false_negatives == 1 + + +def test_score_audio_only_separates_pitch_from_tab_candidate_selection(): + gold = [TabEvent(0.0, 0.2, string_idx=3, fret=14, pitch_midi=69, confidence=1.0)] + predicted_audio = [ + AudioEvent(0.01, 0.21, pitch_midi=69, velocity=0.8, confidence=0.9), + ] + + scored = score_audio_only(predicted_audio, gold) + + assert scored.onset.f1 == 1.0 + assert scored.pitch.f1 == 1.0 + assert scored.tab.f1 == 0.0 + assert scored.decoded[0].string_idx == 5 + assert scored.decoded[0].fret == 5 + + +def test_summarize_results_uses_all_micro_counts(): + result = TrackEvalResult( + track_id="clip", + backend="highres", + gold_notes=3, + audio_events=4, + decoded_events=4, + onset=EventF1Result(0.50, 0.67, 0.57, 2, 2, 1), + pitch=EventF1Result(0.25, 
0.33, 0.29, 1, 3, 2), + tab=TabF1Result(0.25, 0.33, 0.29, 1, 3, 2), + ) + + summary = summarize_results([result], backend="highres", split="validation") + + assert summary.micro_onset.true_positives == 2 + assert summary.micro_onset.false_positives == 2 + assert summary.micro_onset.false_negatives == 1 + assert summary.micro_onset.f1 == pytest.approx(4 / 7) + assert summary.micro_tab.true_positives == 1 + assert summary.micro_tab.false_positives == 3 + assert summary.micro_tab.false_negatives == 2 + assert summary.micro_tab.f1 == pytest.approx(2 / 7) + + +def test_main_reports_backend_setup_blocker(monkeypatch: pytest.MonkeyPatch, capsys): + import tabvision.eval.guitarset_audio as guitarset_audio + + def _raise_blocker(**_kwargs): + raise BackendError("basic-pitch is not installed") + + monkeypatch.setattr(guitarset_audio, "run_eval", _raise_blocker) + + code = guitarset_audio.main(["--backend", "basicpitch", "--limit", "1"]) + + assert code == 2 + assert "setup_blocker=basic-pitch is not installed" in capsys.readouterr().err + + +def test_run_eval_reuses_backend_across_tracks( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +): + import tabvision.audio.backend as backend_registry + import tabvision.eval.guitarset_audio as guitarset_audio + + ann = tmp_path / "annotation" + audio = tmp_path / "audio_mono-mic" + ann.mkdir() + audio.mkdir() + for track_id in ["05_one", "05_two"]: + _write_jams(ann / f"{track_id}.jams") + (audio / f"{track_id}_mic.wav").write_bytes(b"RIFF") + + class FakeBackend: + name = "fake" + + def __init__(self) -> None: + self.transcribe_calls = 0 + + def transcribe(self, _wav, _sr, _session): + self.transcribe_calls += 1 + return [ + AudioEvent( + onset_s=0.10, + offset_s=0.35, + pitch_midi=44, + velocity=1.0, + confidence=1.0, + ) + ] + + fake = FakeBackend() + make_calls = 0 + + seen_kwargs = [] + + def fake_make(name: str, **kwargs): + nonlocal make_calls + assert name == "fake" + make_calls += 1 + seen_kwargs.append(kwargs) + return 
fake + + monkeypatch.setattr(backend_registry, "make", fake_make) + monkeypatch.setattr( + guitarset_audio, + "load_mono_audio", + lambda _path: (np.zeros(8, dtype=np.float32), 8_000), + ) + + results, _summary = guitarset_audio.run_eval( + backend_name="fake", + data_home=tmp_path, + split="validation", + backend_kwargs={"device": "cuda"}, + ) + + assert len(results) == 2 + assert make_calls == 1 + assert seen_kwargs == [{"device": "cuda"}] + assert fake.transcribe_calls == 2 diff --git a/tabvision/tests/unit/test_phase5_eval_helpers.py b/tabvision/tests/unit/test_phase5_eval_helpers.py new file mode 100644 index 0000000..1ca0be6 --- /dev/null +++ b/tabvision/tests/unit/test_phase5_eval_helpers.py @@ -0,0 +1,112 @@ +"""Unit tests for Phase 5 acceptance-eval timing/alignment helpers.""" + +from __future__ import annotations + +import pytest + +from tabvision.eval.metrics import tab_f1 +from tabvision.types import TabEvent +from tests.eval.test_phase5_eval import ( + _align_gold_to_audio_only, + _find_best_pitch_offset, + _gold_notes_to_tab_events, + _parse_legacy_tab_text, +) + + +def _event(t: float, string_idx: int, fret: int, pitch: int) -> TabEvent: + return TabEvent( + onset_s=t, + duration_s=0.25, + string_idx=string_idx, + fret=fret, + pitch_midi=pitch, + confidence=1.0, + ) + + +def test_gold_conversion_uses_bpm_for_seconds_and_computes_midi(): + notes = [ + {"string": 1, "fret": 0, "beat": 1.0}, + {"string": 6, "fret": 3, "beat": 2.0}, + ] + + events = _gold_notes_to_tab_events(notes, bpm=120, video_duration_s=99.0) + + assert [e.onset_s for e in events] == [0.5, 1.0] + assert [(e.string_idx, e.fret, e.pitch_midi) for e in events] == [ + (5, 0, 64), # high E open + (0, 3, 43), # low E fret 3 + ] + + +def test_gold_conversion_falls_back_to_duration_when_bpm_missing(): + notes = [ + {"string": 1, "fret": 0, "beat": 2.0}, + {"string": 2, "fret": 1, "beat": 8.0}, + ] + + events = _gold_notes_to_tab_events(notes, bpm=None, video_duration_s=4.0) + + assert 
[e.onset_s for e in events] == [1.0, 4.0] + + +def test_gold_conversion_skips_muted_notes(): + notes = [ + {"string": 1, "fret": "X", "beat": 0.0}, + {"string": 1, "fret": 3, "beat": 1.0}, + ] + + events = _gold_notes_to_tab_events(notes, bpm=60, video_duration_s=10.0) + + assert len(events) == 1 + assert events[0].fret == 3 + assert events[0].pitch_midi == 67 + + +def test_legacy_tab_parser_fallback_handles_tabs_and_muted_notes(): + notes = _parse_legacy_tab_text( + """ +e|--12-x-| +B|--3----| +""" + ) + + assert notes == [ + {"string": 1, "fret": 12, "beat": 0.5}, + {"string": 2, "fret": 3, "beat": 0.5}, + {"string": 1, "fret": "X", "beat": 1.0}, + ] + + +def test_find_best_pitch_offset_recovers_known_offset(): + gold = [_event(0.0, 5, 0, 64), _event(1.0, 5, 2, 66)] + predicted = [_event(2.0, 5, 0, 64), _event(3.0, 5, 2, 66)] + + offset, matches = _find_best_pitch_offset( + predicted=predicted, + gold=gold, + video_duration_s=5.0, + tolerance_s=0.01, + step_s=0.05, + ) + + assert offset == pytest.approx(2.0) + assert matches == 2 + + +def test_alignment_from_audio_only_is_reused_for_audio_video_scoring(): + gold = [_event(0.0, 5, 0, 64)] + audio_only = [_event(1.25, 5, 0, 64)] + audio_video = [_event(1.25, 5, 0, 64)] + + aligned_gold, offset, matches = _align_gold_to_audio_only( + audio_only=audio_only, + gold=gold, + video_duration_s=3.0, + ) + + assert offset == pytest.approx(1.25) + assert matches == 1 + assert tab_f1(audio_only, aligned_gold).f1 == 1.0 + assert tab_f1(audio_video, aligned_gold).f1 == 1.0 diff --git a/tabvision/tests/unit/test_position_prior.py b/tabvision/tests/unit/test_position_prior.py new file mode 100644 index 0000000..7cad49c --- /dev/null +++ b/tabvision/tests/unit/test_position_prior.py @@ -0,0 +1,73 @@ +"""Unit tests for learned pitch-position priors.""" + +from __future__ import annotations + +import numpy as np + +from tabvision.fusion import fuse +from tabvision.fusion.position_prior import ( + PitchPositionPrior, + 
apply_pitch_position_prior, + learn_pitch_position_prior, +) +from tabvision.types import AudioEvent, GuitarConfig, TabEvent + + +def _gold(t: float, string_idx: int, fret: int, pitch: int) -> TabEvent: + return TabEvent( + onset_s=t, + duration_s=0.25, + string_idx=string_idx, + fret=fret, + pitch_midi=pitch, + confidence=1.0, + ) + + +def _audio(t: float, pitch: int) -> AudioEvent: + return AudioEvent( + onset_s=t, + offset_s=t + 0.25, + pitch_midi=pitch, + velocity=1.0, + confidence=1.0, + ) + + +def test_learned_prior_prefers_observed_string_fret_for_pitch(): + prior = learn_pitch_position_prior( + [_gold(0.0, string_idx=3, fret=14, pitch=69)], + alpha=0.1, + power=2.0, + ) + + matrix = prior.matrix_for_pitch(69) + + assert matrix.shape == (6, 25) + assert matrix[3, 14] > matrix[5, 5] + assert matrix[0, 0] == 0.0 + assert np.isclose(matrix.sum(), 1.0) + + +def test_prior_attachment_copies_audio_events_without_mutating_original(): + prior = PitchPositionPrior({69: np.ones((6, 25), dtype=np.float64) / 150}) + event = _audio(0.0, 69) + + attached = apply_pitch_position_prior([event], prior) + + assert attached[0] is not event + assert event.fret_prior is None + assert attached[0].fret_prior is prior.matrix_for_pitch(69) + + +def test_learned_prior_can_override_lowest_fret_audio_only_pick(): + prior = learn_pitch_position_prior( + [_gold(0.0, string_idx=3, fret=14, pitch=69) for _ in range(4)], + alpha=0.1, + power=2.0, + ) + + event = apply_pitch_position_prior([_audio(0.0, 69)], prior)[0] + decoded = fuse([event], [], GuitarConfig(), lambda_vision=0.0) + + assert [(ev.string_idx, ev.fret) for ev in decoded] == [(3, 14)]