From 2e07407d8c8ae1d9b35e1f8c5249ac197eb344c6 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 04:35:11 -0700 Subject: [PATCH 01/70] =?UTF-8?q?docs(verification):=20visual-bug=20detect?= =?UTF-8?q?ion=20design=20phase=20=E2=80=94=20report,=20prior-art,=20spec,?= =?UTF-8?q?=20plan?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Strategy report (5-tier reftests-first pyramid) re-grounded on canonical main; 5 prior-art folders (wpt-reftests, vello, skia-gold, flutter-golden-testing, wgpu-testing); the buiy-verification-design multi-file spec realizing foundation gates #2/#5/#11/#12; and the phased TDD implementation plan. docs/README catalog wired. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/README.md | 12 + .../2026-06-15-buiy-verification-impl.md | 3659 +++++++++++++++++ .../flutter-golden-testing/README.md | 86 + .../determinism-knobs.md | 50 + .../ecosystem-toolkit-alchemist.md | 68 + .../flutter-gold-infra.md | 71 + .../flutter-golden-testing/glossary.md | 61 + .../flutter-golden-testing/lessons.md | 58 + .../flutter-golden-testing/matches-golden.md | 74 + .../obscure-text-font.md | 51 + .../flutter-golden-testing/open-problems.md | 35 + docs/prior-art/skia-gold/README.md | 82 + docs/prior-art/skia-gold/ecosystem-tools.md | 67 + docs/prior-art/skia-gold/glossary.md | 66 + docs/prior-art/skia-gold/gold-architecture.md | 72 + docs/prior-art/skia-gold/lessons.md | 74 + docs/prior-art/skia-gold/open-problems.md | 48 + docs/prior-art/skia-gold/storage-scale.md | 63 + docs/prior-art/vello/README.md | 71 + docs/prior-art/vello/architecture.md | 73 + docs/prior-art/vello/cpu-gpu-testing.md | 68 + docs/prior-art/vello/ecosystem-maturity.md | 81 + docs/prior-art/vello/glossary.md | 68 + docs/prior-art/vello/lessons.md | 77 + docs/prior-art/vello/metric-and-kompari.md | 66 + docs/prior-art/vello/open-problems.md | 53 + docs/prior-art/vello/sparse-strips.md | 52 + docs/prior-art/wgpu-testing/README.md | 87 + 
.../wgpu-testing/determinism-rasterizer.md | 92 + docs/prior-art/wgpu-testing/glossary.md | 64 + .../wgpu-testing/gpu-test-harness.md | 100 + docs/prior-art/wgpu-testing/image-compare.md | 86 + docs/prior-art/wgpu-testing/lessons.md | 70 + docs/prior-art/wgpu-testing/open-problems.md | 64 + docs/prior-art/wpt-reftests/README.md | 84 + docs/prior-art/wpt-reftests/consumers.md | 53 + docs/prior-art/wpt-reftests/fuzzy-matching.md | 51 + docs/prior-art/wpt-reftests/gecko-reftests.md | 83 + docs/prior-art/wpt-reftests/glossary.md | 64 + docs/prior-art/wpt-reftests/lessons.md | 83 + docs/prior-art/wpt-reftests/methodology.md | 50 + docs/prior-art/wpt-reftests/open-problems.md | 56 + docs/prior-art/wpt-reftests/wpt.md | 62 + ...026-06-14-visual-bug-detection-strategy.md | 272 ++ .../README.md | 103 + .../coverage.md | 325 ++ .../determinism.md | 196 + .../goldens.md | 344 ++ .../invariants.md | 273 ++ .../metric.md | 341 ++ .../open-questions.md | 139 + .../reftests.md | 201 + .../snapshots.md | 277 ++ 53 files changed, 8926 insertions(+) create mode 100644 docs/plans/2026-06-15-buiy-verification-impl.md create mode 100644 docs/prior-art/flutter-golden-testing/README.md create mode 100644 docs/prior-art/flutter-golden-testing/determinism-knobs.md create mode 100644 docs/prior-art/flutter-golden-testing/ecosystem-toolkit-alchemist.md create mode 100644 docs/prior-art/flutter-golden-testing/flutter-gold-infra.md create mode 100644 docs/prior-art/flutter-golden-testing/glossary.md create mode 100644 docs/prior-art/flutter-golden-testing/lessons.md create mode 100644 docs/prior-art/flutter-golden-testing/matches-golden.md create mode 100644 docs/prior-art/flutter-golden-testing/obscure-text-font.md create mode 100644 docs/prior-art/flutter-golden-testing/open-problems.md create mode 100644 docs/prior-art/skia-gold/README.md create mode 100644 docs/prior-art/skia-gold/ecosystem-tools.md create mode 100644 docs/prior-art/skia-gold/glossary.md create mode 100644 
docs/prior-art/skia-gold/gold-architecture.md create mode 100644 docs/prior-art/skia-gold/lessons.md create mode 100644 docs/prior-art/skia-gold/open-problems.md create mode 100644 docs/prior-art/skia-gold/storage-scale.md create mode 100644 docs/prior-art/vello/README.md create mode 100644 docs/prior-art/vello/architecture.md create mode 100644 docs/prior-art/vello/cpu-gpu-testing.md create mode 100644 docs/prior-art/vello/ecosystem-maturity.md create mode 100644 docs/prior-art/vello/glossary.md create mode 100644 docs/prior-art/vello/lessons.md create mode 100644 docs/prior-art/vello/metric-and-kompari.md create mode 100644 docs/prior-art/vello/open-problems.md create mode 100644 docs/prior-art/vello/sparse-strips.md create mode 100644 docs/prior-art/wgpu-testing/README.md create mode 100644 docs/prior-art/wgpu-testing/determinism-rasterizer.md create mode 100644 docs/prior-art/wgpu-testing/glossary.md create mode 100644 docs/prior-art/wgpu-testing/gpu-test-harness.md create mode 100644 docs/prior-art/wgpu-testing/image-compare.md create mode 100644 docs/prior-art/wgpu-testing/lessons.md create mode 100644 docs/prior-art/wgpu-testing/open-problems.md create mode 100644 docs/prior-art/wpt-reftests/README.md create mode 100644 docs/prior-art/wpt-reftests/consumers.md create mode 100644 docs/prior-art/wpt-reftests/fuzzy-matching.md create mode 100644 docs/prior-art/wpt-reftests/gecko-reftests.md create mode 100644 docs/prior-art/wpt-reftests/glossary.md create mode 100644 docs/prior-art/wpt-reftests/lessons.md create mode 100644 docs/prior-art/wpt-reftests/methodology.md create mode 100644 docs/prior-art/wpt-reftests/open-problems.md create mode 100644 docs/prior-art/wpt-reftests/wpt.md create mode 100644 docs/reports/2026-06-14-visual-bug-detection-strategy.md create mode 100644 docs/specs/2026-06-15-buiy-verification-design/README.md create mode 100644 docs/specs/2026-06-15-buiy-verification-design/coverage.md create mode 100644 
docs/specs/2026-06-15-buiy-verification-design/determinism.md create mode 100644 docs/specs/2026-06-15-buiy-verification-design/goldens.md create mode 100644 docs/specs/2026-06-15-buiy-verification-design/invariants.md create mode 100644 docs/specs/2026-06-15-buiy-verification-design/metric.md create mode 100644 docs/specs/2026-06-15-buiy-verification-design/open-questions.md create mode 100644 docs/specs/2026-06-15-buiy-verification-design/reftests.md create mode 100644 docs/specs/2026-06-15-buiy-verification-design/snapshots.md diff --git a/docs/README.md b/docs/README.md index e8b4a0a..52ec807 100644 --- a/docs/README.md +++ b/docs/README.md @@ -46,6 +46,7 @@ If a doc spans areas, file it under its primary area only. Reference any adjacen **Specs** - [Buiy foundation design](specs/2026-05-07-buiy-foundation/README.md) — feature inventory, architectural foundation, sub-spec roadmap (multi-file). `[draft]` +- [Buiy verification design](specs/2026-06-15-buiy-verification-design/README.md) — realizes the foundation `verification.md` gates #2/#5/#11/#12 as a reftests-first 5-tier pyramid: perceptual metric, structured snapshots, property invariants, reftests + CPU/GPU cross-check, golden persistence + determinism + coverage-by-construction (multi-file). Realizes the [visual-bug detection report](reports/2026-06-14-visual-bug-detection-strategy.md). `[draft]` **Plans** @@ -111,6 +112,7 @@ If a doc spans areas, file it under its primary area only. Reference any adjacen ### Reports - [Text-editing design-readiness review](reports/2026-06-13-text-editing-design-readiness.md) — three-verifier audit of `editing-and-ime.md` against current `main` before the `buiy-text-editing` campaign: every integration seam confirmed in code, the no-new-GPU-work painting claim upheld, and OQ#1 (edit→layout frame-ordering) resolved as accepted one-frame latency. Verdict: ready-with-patches (now applied). 
`[2026-06-13]` +- [Visual-bug detection strategy](reports/2026-06-14-visual-bug-detection-strategy.md) — how to catch visual regressions as Buiy scales: a five-tier pyramid (layout-number → display-list/paint-order → metamorphic/property → reftests + CPU-cross-check → golden screenshots), reftests-first, grounded on canonical `main`. Audits the existing golden/forced-colors/text-shaping infra and names the gaps; input to `buiy-verification-design`. Pairs with the five `prior-art/` folders below. `[2026-06-14]` ### Docs infrastructure @@ -173,6 +175,16 @@ The reference *implementations* of the CSS modules Buiy implements a typed Rust - [AccessKit](prior-art/accesskit/) — load-bearing cross-platform a11y bridge; Pneuma Solutions-stewarded. Windows / macOS / Linux production; Android pre-1.0; **iOS adapter shipped 2026-05-11** (Buiy spec needs update); web adapter NOT yet shipped. Buiy is the *producer*, `accesskit_consumer` is for adapter-side code. Consult before any spec on a11y tree construction, AccessKit integration, ACCNAME 1.2, focus model, or per-window adapter ownership. `[active]` - [WAI-ARIA APG](prior-art/wai-aria-apg/) — the W3C contract source Buiy implements: WAI-ARIA 1.2 Recommendation (6 June 2023) + ACCNAME 1.2 Working Draft (20 May 2026) + WCAG 2.2 Recommendation (5 October 2023). The Authoring Practices Guide enumerates **32 widget design patterns** at , each pinning keyboard contract + ARIA role/state/property emission + name/description sourcing. **Inverted framing** (not "learns from"; this is the contract Buiy MUST implement): Implements (every Buiy widget follows the APG keyboard contract + ARIA mapping; ACCNAME 1.2 lives in `buiy_core`; WCAG 2.2 Level A + AA gated in CI; live regions via global announcer; `:focus-visible` + roving tabindex + `aria-activedescendant` + `inert` are foundation-tier). 
Diverge (gamepad navigation, spatial focus, 3D-anchored / diegetic UI, render-to-texture surfaces, game-specific widgets — APG covers none of these; Buiy extends honestly). Implementation strategy (per-widget specs under `buiy-widget-catalog-design`; verification gates 3 / 4 / 7 + linters; real-AT testing as manual-release-gate). Consult before any spec on a widget contract, keyboard interaction, ACCNAME, WCAG verification, focus management, or platform a11y bindings. `[active]` +### Verification & visual testing + +External systems Buiy's visual-bug-detection strategy ([report](reports/2026-06-14-visual-bug-detection-strategy.md)) learns from. Created 2026-06-14. + +- [wpt-reftests](prior-art/wpt-reftests/) — reference-comparison visual testing (Gecko reftests + web-platform-tests): the `==`/`!=` model, `fuzzy-if()` two-axis matching, and the shared WPT CSS-conformance corpus that needs ZERO stored goldens, plus how Servo/Blink consume it. The methodology behind Buiy's reftests-first Tier 4. Consult before any spec on the reftest harness or fuzzy-tolerance budgets. `[active]` +- [vello](prior-art/vello/) — Linebender's GPU-compute 2D renderer (wgpu); the closest neighbor for the CPU-vs-GPU reference-oracle pattern (`vello_cpu` as an f32 oracle), the sparse-strip hybrid, and the FLIP/Kompari perceptual metric. Consult before any spec on the CPU-SDF cross-check or the perceptual metric. `[active]` +- [skia-gold](prior-art/skia-gold/) — Skia/Chromium Gold + the golden storage & triage ecosystem (reg-suit, Chromatic, Argos): content-addressed digests, multi-positive baselines, time-boxed ignores — the escape hatch for when a golden set explodes. Consult before any spec on golden storage/triage at scale. `[active]` +- [flutter-golden-testing](prior-art/flutter-golden-testing/) — Flutter golden-file regression: `matchesGoldenFile`, the obscure-text/Ahem layout-determinism font, `debugDisableShadows`, golden_toolkit + Alchemist. The canonical glyph-golden flake fight. 
Consult before any spec on text goldens or determinism knobs. `[active]` +- [wgpu-testing](prior-art/wgpu-testing/) — wgpu's CI/GPU test infrastructure: the `#[gpu_test]` harness, `FailureCase`-per-backend expectations, the pinned-lavapipe determinism recipe, and `nv_flip` image comparison. Buiy's closest determinism model (same wgpu abstraction). Consult before any spec on the determinism stack or CI GPU class. `[active]` + ### Archived - [bevy_cosmic_edit](prior-art/bevy-cosmic-edit/) — third-party Bevy plugin bridging cosmic-text into bevy_ui and 2D sprites. **Repo archived 2025-03-21**; final release 0.26.0 (2024-12-07, pinned to Bevy 0.15). Documented as a structural anti-pattern case study (bridge crate between two fast-moving Rust UI ecosystems). Validates Buiy's commitment to own its text-edit surface end-to-end. `[archived]` diff --git a/docs/plans/2026-06-15-buiy-verification-impl.md b/docs/plans/2026-06-15-buiy-verification-impl.md new file mode 100644 index 0000000..2741a6c --- /dev/null +++ b/docs/plans/2026-06-15-buiy-verification-impl.md @@ -0,0 +1,3659 @@ +# Buiy verification harness — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: superpowers:subagent-driven-development to implement task-by-task. Steps use checkbox (- [ ]) syntax. + +**Date:** 2026-06-15 +**Status:** active +**Spec:** specs/2026-06-15-buiy-verification-design/README.md +**Goal:** Build the five-tier, reftests-first visual-bug-detection pyramid (`buiy_verify`: metric, snapshots, invariants, reftests, goldens, determinism, coverage) on the landed `GoldenConfig` capture path, retiring the two naive metrics and closing foundation gates #2/#5/#11/#12. +**Architecture:** The harness is `buiy_verify` (pure-CPU metric, snapshot/invariant formatters, reftest pairing logic, golden persistence, `DeterministicApp`, coverage matrix) plus a device-coupled capture seam promoted into `buiy_core::render::golden` (`capture_to_image` / `capture_app`). 
`buiy_core` cannot depend on `buiy_verify` in its normal graph, so the L1 `perceptual_diff` is deprecated in place and a dev-only dependency cycle lets `#[ignore]` GPU tests reach the unified metric. Pure-CPU tiers gate headless; GPU tiers are `#[ignore]` and run on a real adapter (RX 6700 XT locally, pinned lavapipe in CI). +**Tech Stack:** Rust, Bevy 0.18, wgpu, `image` 0.25, `proptest`, `insta`, the vendored pixelmatch YIQ algorithm + `image-compare` MSSIM, `toml`/`base64` (golden ledger + triage), `inventory` (coverage catalog); GPU lane on a real adapter. + +--- + +## File structure + +Every file created (`C`), modified (`M`), or deleted (`D`) across the campaign, with its one-line responsibility. Paths are relative to the repository root (the worktree at `/mnt/storage/projects/buiy/.claude/worktrees/visual-bug-detection-report/`). + +### `buiy_core` (capture seam + deprecation) + +| File | C/M | Responsibility | +|---|---|---| +| `crates/buiy_core/Cargo.toml` | M | Add `image.workspace = true` (direct dep for `capture_to_image`); add `buiy_verify` under `[dev-dependencies]` (dev-only cycle). | +| `crates/buiy_core/src/render/golden.rs` | M | Canonical `Dpr` milliscale type; promoted `capture_to_image` + `readback_rgba_into` + `capture_app`/`capture_app_scaled`; `CAPTURE_MSAA`/`CAPTURE_DITHER_OFF` consts; `GoldenConfig` extension (`FontMode`, `dpr`, `fidelity()`); quiescence flush; `#[deprecated]` `perceptual_diff`. | +| `crates/buiy_core/src/lib.rs` | M | `#[allow(deprecated)]` on the `perceptual_diff` re-export. | +| `crates/buiy_core/src/layout/systems.rs` | M | Extract the nested `tier_rank` fn (`:4113`) into `pub fn top_layer_paint_rank`; `compose_transform` is `pub(super)` at `:3775`. | +| `crates/buiy_core/src/layout/mod.rs` | M | Re-export `top_layer_paint_rank`. | +| `crates/buiy_core/tests/support/mod.rs` | M | Delegate `readback_rgba` / `gpu_render_app_with_resolution` to the promoted src builders (single-body anti-drift). 
| +| `crates/buiy_core/tests/render_golden_harness.rs` | M | `#[ignore]` dimension meta-test for `capture_to_image`; file-level `#![allow(deprecated)]`. | +| `crates/buiy_core/tests/render_capture_app_gpu.rs` | C | `#[ignore]` test: `capture_app` paints a non-blank frame. | +| `crates/buiy_core/tests/render_capture_quiescence.rs` | C | Quiescence-panic (`#[ignore]` GPU) + no-`Instant::now` grep-lint (headless). | +| `crates/buiy_core/tests/render_golden_config.rs` | C | `GoldenConfig::deterministic()`/`fidelity()` defaults. | +| `crates/buiy_core/tests/text_gpu.rs` | M | Migrate the 5 stable re-capture sites + 2 anti-tests onto `metric::compare`. | +| `crates/buiy_core/tests/{text_decoration_gpu,text_golden_suite_gpu,text_selection_caret_gpu}.rs` | M | File-level `#![allow(deprecated)]` (stay on `perceptual_diff` until Phase 3). | +| `crates/buiy_core/tests/{render_extract,render_buckets,render_paint_order,render_instance,top_layer}.rs` | M | Migrate per-field `assert_eq!` blocks to `assert_display_list_snapshot` / `assert_instance_hex_snapshot`. | +| `crates/buiy_core/tests/layout.rs` | M | Migrate the `< 0.5` flex-row asserts to `assert_layout_snapshot`. | +| `crates/buiy_core/tests/layout_stacking.rs` | M | `top_layer_paint_rank` mapping assert. | +| `crates/buiy_core/tests/fixtures/fonts/Ahem.ttf` (+ license) | C | Committed WPT Ahem box-font fixture. | +| `crates/buiy_core/tests/snapshots/` | C | Committed `.snap` files for the migrated core tests. | + +### `buiy_verify` (the harness) + +| File | C/M | Responsibility | +|---|---|---| +| `crates/buiy_verify/Cargo.toml` | M | Add `image-compare`, `insta`, `toml`, `base64`, `inventory`; remove nothing (pixelmatch is vendored, not depended on). | +| `crates/buiy_verify/src/lib.rs` | M | Register `metric`, `reftest`, `support`, `snapshot`, `invariant`, `golden`, `determinism`, `coverage`; drop `visual`. 
| +| `crates/buiy_verify/src/metric.rs` | C | AA-aware two-axis perceptual diff: `Diff`/`FuzzBudget`/`CompareOpts`, vendored YIQ `color_delta` + AA sibling exclusion, MSSIM, `passes`/`within`, `reftest_default()`, diff heatmap. | +| `crates/buiy_verify/src/visual.rs` | D (delete) | RMSE `compare_images` removed (superseded by `metric`). | +| `crates/buiy_verify/src/reftest.rs` | C | `RefKind`/`RefCase`/`RefOutcome`, `evaluate_outcome`, `mismatch_floor_ok`, `run_reftest`, `reftest!` macro, independence lint, `sdf_oracle`, `run_sdf_cross_check`. | +| `crates/buiy_verify/src/support.rs` | C | GPU-capture glue (`reftest_app`, `clear_reftest_scene`) — the one place Phase 3 swaps for `DeterministicApp`. | +| `crates/buiy_verify/src/snapshot/{mod,dump,layout,display_list}.rs` | C | Tier-1 layout dump + Tier-2 display-list/`PackedInstance`-hex dumps + shared `round`/version headers. | +| `crates/buiy_verify/src/invariant/{mod,scene,predicates,bidi}.rs` | C | Tier-3 proptest scene generators + predicate fns + BiDi caret round-trip. | +| `crates/buiy_verify/src/golden.rs` (+ `golden/report.rs`) | C | Tier-5 `GoldenKey`/`Backend`/`BlessLedger`, `check_golden`/`assert_golden`, multi-positive corpus, HTML triage. | +| `crates/buiy_verify/src/determinism/mod.rs` | C | `DeterministicApp` builder; re-exports `FontMode`/`Dpr` from `buiy_core::render::golden`. | +| `crates/buiy_verify/src/coverage/{mod,fixture,matrix,key,enroll,forced_colors}.rs` | C | `Fixture`/`fixture!`/`catalog`, `Matrix`/`Cell`/`CoverageKey`, `enroll_all`/`build_app`, live-catalog forced-colors producer. | +| `crates/buiy_verify/fixtures//.rs` | C | The single-source-of-truth BSN fixture corpus (`inventory`-registered, `glob!`-discoverable). | +| `crates/buiy_verify/tests/metric.rs` | C | Known-answer metric meta-suite + constants tripwire. | +| `crates/buiy_verify/tests/visual.rs` | M | Migrated off `compare_images` onto `metric::compare`. 
| +| `crates/buiy_verify/tests/smoke.rs` | M | Drop the `visual` re-export reference. | +| `crates/buiy_verify/tests/reftest_engine_gpu.rs` | C | `#[ignore]` known-good/known-bad engine pairs. | +| `crates/buiy_verify/tests/reftest_macro_gpu.rs` | C | `#[ignore]` macro-generated case. | +| `crates/buiy_verify/tests/reftest_independence.rs` | C | Headless RED/GREEN independence-lint self-test. | +| `crates/buiy_verify/tests/sdf_oracle.rs` | C | Headless full-tile CPU SDF oracle point-probes. | +| `crates/buiy_verify/tests/sdf_cross_check_gpu.rs` | C | `#[ignore]` GPU-vs-CPU SDF cross-check. | +| `crates/buiy_verify/tests/reftest_cases_gpu.rs` | C | Two real reftest cases (flex-justify `==`, content-visibility `!=`). | +| `crates/buiy_verify/tests/snapshot_*.rs` | C | Tier-1/2 dump self-tests (`_dump`, `_layout`, `_instance_hex`, `_display_list`, `_animation`). | +| `crates/buiy_verify/tests/invariant_mutations.rs` | C | Tier-3 mutation fixtures (the harness has teeth). | +| `crates/buiy_verify/tests/{golden_keys,golden_persistence,golden_report}.rs` | C | Tier-5 pure-CPU persistence/ledger/triage self-tests. | +| `crates/buiy_verify/tests/{determinism_ahem,determinism_capture}.rs` | C | Ahem-sole-family (headless) + `#[ignore]` idempotent/knob-sensitivity GPU. | +| `crates/buiy_verify/tests/goldens.rs` (+ `tests/goldens/` corpus) | C | `#[ignore]` end-to-end goldens per residue class + blessed PNGs. | +| `crates/buiy_verify/tests/coverage_{layout,display_list,invariants,golden,meta,forced_colors}.rs` | C | Per-tier enrollment drivers + coverage self-tests + live forced-colors scan. | +| `crates/buiy_verify/tests/snapshots/` + `proptest-regressions/` | C | Committed `.snap`s and minimized proptest counterexamples. | + +### Repo-level + +| File | C/M | Responsibility | +|---|---|---| +| `Cargo.toml` (workspace) | M | Add `toml`, `base64` to `[workspace.dependencies]` (and `insta` with `glob` if not already). 
| +| `deny.toml` | M | Add any new transitive SPDX id to the `[licenses] allow` list (never via an exceptions hack). | +| `.gitattributes` | M | Pin `crates/buiy_verify/tests/goldens/*.png -text`. | +| `.github/actions/install-mesa/action.yml` | C | CI lavapipe pin (consume `gfx-rs/ci-build` tarball; write own ICD JSON). | +| `.github/workflows/*` (CI) | M | Invoke the lavapipe action on the golden leg; export `VK_DRIVER_FILES`/`WGPU_ADAPTER_NAME`. | +| `docs/specs/2026-06-15-buiy-verification-design/*.md` | M | Flip `draft` → `active`/`implemented` with per-file "landed" notes (Phase 4.7). | +| `docs/README.md` | M | Flip the verification-design catalog tag `[draft]` → `[active]`; add this plan under Plans. | +| `docs/plans/follow-ups.md` | M | Resolve the live-catalog seam; keep `BoxShadow` visual reftest open; record deferred golden primitives. | +| `docs/specs/2026-05-07-buiy-foundation/verification.md` | M | Mark gates #2/#5/#11/#12 mechanisms landed. | + +--- + +## Phasing & ordering + +Five phases; **Phase 0** is the cross-cutting prerequisite, then the spec's reftests-first build order (metric+reftests → snapshots+invariants → goldens+determinism → coverage). 
+ +| Phase | Name | Depends on | Gate | +|---|---|---|---| +| **0** | Cross-cutting prerequisites (deps, dev-cycle edge, `Dpr`, `capture_to_image`) | — | Headless; one `#[ignore]` GPU meta-test (0.4) on the GPU lane | +| **1a** | Perceptual metric (`buiy_verify::metric`) + naive-metric retirement | 0 | Headless; the GPU-site migration (1a.10/1a.11) runs on the GPU lane | +| **1b** | Reftest harness + CPU/GPU SDF cross-check (`buiy_verify::reftest`) | 0, 1a | Pure-CPU meta-tests headless; reftest cases / cross-check / engine pairs `#[ignore]` GPU lane | +| **2** | Tier 1-2 snapshots + Tier 3 invariants | 0 (insta), 1a (metric) | **Wholly headless** (no `#[ignore]`) | +| **3** | Determinism stack + Tier 5 golden persistence | 0 (`Dpr`/`capture_to_image`), 1 (metric/reftest) | Pure-CPU half headless; capture/golden GPU half `#[ignore]` GPU lane; CI lavapipe leg | +| **4** | Coverage-by-construction + forced-colors live wiring + docs flip | 2, 3 | Coverage self-tests + enrollment drivers headless; `coverage_golden` `#[ignore]` GPU lane | + +**Dependency order rationale.** Phase 0 lands every shared seam (the metric/snapshot deps, the dev-only `buiy_core → buiy_verify` edge, the canonical `Dpr`, the promoted `capture_to_image`) — nothing in Phase 1+ compiles without them. Phase 1a's `metric` is the shared primitive both pixel tiers (1b reftests, 3 goldens) consume, so it precedes them. Phase 1b's `run_reftest` and Phase 3's `DeterministicApp` both build on the same capture seam; 1b uses the landed `gpu_render_app`-derived `capture_app` directly and Phase 3 swaps that one line for `DeterministicApp::build` (identical `&mut App → RgbaImage` contract). Phase 2 is independent of the GPU tiers (it needs only `insta` + `metric`) and can land in parallel after Phase 1a. Phase 4 composes everything: it Cartesian-products the fixture corpus across all five tiers and ends with the docs flip. 
+ +**Gate legend.** *Headless gate* (run before each commit) = `cargo fmt --all -- --check && cargo clippy --workspace --all-targets -- -D warnings && RUSTDOCFLAGS="-D warnings" cargo doc --workspace --no-deps && xvfb-run -a cargo test --workspace` (no `--ignored`, no adapter). *GPU lane* (additive) = `cargo test -p buiy_verify -j 2 -- --ignored --test-threads=1` (and the `buiy_core` `#[ignore]` files) on a real adapter (RX 6700 XT locally; lavapipe in CI). New deps gate on `cargo deny check` first. Commit per task; Conventional Commits; body ends `Co-Authored-By: Claude Opus 4.8 (1M context) `. + +--- + +## Phase 0 — Cross-cutting prerequisites + +These four tasks land the shared seams every later tier imports: the metric/snapshot crate dependencies (0.1), the dev-only `buiy_core → buiy_verify` edge that lets the `#[ignore]` GPU tests reach `buiy_verify::metric` (0.2), the canonical `Dpr` type (0.3), and the promoted `capture_to_image` capture primitive (0.4). Nothing in Phase 1+ compiles without them. Each is independently committable and leaves the tree green (`cargo test --workspace` headless gate passes after every task). + +**Spec anchors:** `metric.md` § "Crate choice" (pixelmatch over dify; `image-compare` for MSSIM), `metric.md` § "Migration" (dev-dep cycle), `determinism.md` § "Extending `GoldenConfig`" (`Dpr`), `determinism.md` § "Where the code lives" (`capture_to_image` promotion), README § "Crate-dependency note" (`image` is the only new `buiy_core` dep). + +--- + +### Task 0.1 — Add the metric + snapshot deps to `buiy_verify`, gated by `cargo deny check` + +`buiy_verify` gains the snapshot/MSSIM crates the metric and snapshot tiers consume: `image-compare` (advisory MSSIM channel; `metric.md` § "Advisory MSSIM") and `insta` (snapshot assertions; `snapshots.md`). **`pixelmatch` is NOT added** — Phase 1a vendors its algorithm (see Phase 1a's deviation note); this task does not depend on that decision because nothing here consumes `pixelmatch`. 
Exact patch pins (`=`) so a metric-crate bump cannot silently shift baselines (`metric.md` `cargo deny check` note). The supply-chain gate (`cargo deny check`) must pass — `deny.toml` is allow-list-only, so any new transitive license fails CI until added explicitly, never via an exception hack (CLAUDE.md). + +**Files:** +- Modify: `crates/buiy_verify/Cargo.toml` (`[dependencies]`, after `proptest.workspace = true`) +- Test: the `cargo deny check` + `cargo build -p buiy_verify` runs below (no Rust test file — this task only proves the deps resolve + pass the license gate; the metric/snapshot code that *uses* them lands in Phase 1/2) + +Steps: + +- [ ] **Run `cargo deny check` on the unchanged tree to capture the green baseline.** From the repo root: + ```sh + cargo deny check + ``` + Expected: `advisories ok`, `bans ok`, `licenses ok`, `sources ok` (the lone `paste` RUSTSEC-2024-0436 is already in `deny.toml`'s `ignore`). This is the "before" — proves the gate is green so a post-add failure is attributable to the new deps. + +- [ ] **Add the two deps to `crates/buiy_verify/Cargo.toml`.** Append to the `[dependencies]` table (after the `proptest.workspace = true` line): + ```toml + # Advisory MSSIM channel (metric.md § "Advisory MSSIM"): catches global + # gamma/blend drift a small pixel budget under-weights. NEVER the primary + # gate — surfaced as `Diff::mssim: Option`. The `cargo deny check` below + # confirms its license set + no RUSTSEC advisories. + image-compare = "=0.5.0" + # Tier-1/2 snapshot assertions (snapshots.md): insta drives the layout-number + # and display-list `Display` dumps. Dev-time crate, but lives in `[dependencies]` + # because the harness re-exports snapshot helpers from `src/`. The `glob` feature + # drives the coverage fixture-dir fan-out (Phase 4). 
+ insta = { version = "=1.43.2", features = ["glob"] } + ``` + (Pin `insta` to the exact latest 1.x patch resolved at implementation time — run `cargo search insta` and substitute; `=1.43.2` is the placeholder. `insta` carries no baseline-shifting risk like the metric crates, but exact-pinning keeps the dep set reproducible.) + +- [ ] **Resolve the new deps and confirm they compile.** From the repo root: + ```sh + cargo build -p buiy_verify + ``` + Expected: `image-compare v0.5.0`, `insta v1.43.2` (+ their transitives, notably `moxcms`/`pxfm`/`thiserror`/`itertools`/`byteorder-lite` under `image-compare`) appear in the `Compiling …` output, then `Finished`. If a version does not exist, Cargo errors here — pick the nearest existing patch and re-pin. + +- [ ] **Run the supply-chain gate on the new dep graph.** From the repo root: + ```sh + cargo deny check + ``` + Expected: PASS. If `licenses` now FAILS, read which SPDX id the new transitive introduced, confirm it is OSI-permissive (MIT / Apache-2.0 / BSD / Unicode / Zlib), and add that exact short SPDX id to `deny.toml`'s `[licenses] allow` list with a one-line comment naming the crate that pulled it — **never** via a `[licenses] exceptions` hack (CLAUDE.md). If `advisories` FAILS on a new RUSTSEC id, stop and surface it — do not bulk-suppress. + +- [ ] **Run the headless gate to confirm the workspace still builds + tests green** (the new deps must not break the existing `buiy_verify` smoke/visual tests, which Phase 1 migrates): + ```sh + cargo clippy --workspace --all-targets -- -D warnings && xvfb-run -a cargo test -p buiy_verify + ``` + Expected: clippy clean, existing `buiy_verify` tests pass (they still reference `visual::compare_images` — that migration is Phase 1, not here). 
+ +- [ ] **Commit.** + ```sh + git commit -am "build(verify): add image-compare + insta deps (deny-gated) + + Phase 0.1 of the verification pyramid: the advisory MSSIM channel + (image-compare) and the tier-1/2 snapshot driver (insta, glob feature) + land in buiy_verify with exact patch pins. cargo deny check passes; any + new transitive license is added explicitly to deny.toml's allow list. + pixelmatch is NOT added here — Phase 1a vendors its algorithm. + + No code consumes them yet — the metric/snapshot modules land in Phase 1/2. + Spec: docs/specs/2026-06-15-buiy-verification-design/metric.md § Crate choice. + + Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 0.2 — Add `buiy_verify` as a `[dev-dependencies]` of `buiy_core` (the dev-only cycle) + +The `#[ignore]` GPU re-capture tests in `crates/buiy_core/tests/text_*_gpu.rs` (~20 sites, e.g. `text_gpu.rs:114`/`:152`/`:271`) migrate off the deprecated `buiy_core::render::golden::perceptual_diff` (L1) onto `buiy_verify::metric::compare` (`metric.md` § "Migration"). For those *tests* to name `buiy_verify`, `buiy_core` needs a dev-dependency edge to it. This is a **dev-only dependency cycle** (`buiy_core → buiy_verify → buiy_core`): Cargo permits it because a `[dev-dependencies]` edge does not participate in the normal build graph, so it creates no real cycle, does not affect the production `cargo build -p buiy_core`, and does not enter `cargo deny`'s normal-graph audit. The edge is visible only to `buiy_core`'s test, example, and bench targets — here the integration tests under `crates/buiy_core/tests/` — never to its library code. No committed test *consumes* it in this task (only a throwaway probe, removed before the commit) — Phase 1a migrates the call sites; here we only prove the edge resolves. 
**This is the canonical site of the dev-dep edge; Phase 1a.10 assumes it already exists.** + +**Files:** +- Modify: `crates/buiy_core/Cargo.toml` (`[dev-dependencies]` — today lists only `naga = "27"`) +- Test: the `cargo build -p buiy_core --tests` run below (proves the dev-dep edge resolves without a cycle error; no Rust test file — the consuming migration is Phase 1a) + +Steps: + +- [ ] **Confirm the edge does not yet exist (compile a probe that should fail).** Append a throwaway probe to the bottom of `crates/buiy_core/tests/render_golden_harness.rs`: + ```rust + #[test] + fn buiy_verify_is_reachable_from_buiy_core_tests() { + // Phase 0.2 tripwire: proves the dev-only buiy_core → buiy_verify edge + // resolves (Cargo permits the dev-dep cycle). Throwaway — deleted at the + // end of this task; the real call sites move to metric::compare in Phase 1a. + let _ = buiy_verify::visual::compare_images; + } + ``` + (The probe targets the still-present `visual` module because `metric::compare` does not exist until Phase 1a.) Then run: + ```sh + cargo build -p buiy_core --tests 2>&1 | head -5 + ``` + Expected FAILURE: `error[E0433]: failed to resolve: use of undeclared crate or module 'buiy_verify'`. + +- [ ] **Add the dev-dependency edge.** Edit `crates/buiy_core/Cargo.toml`'s `[dev-dependencies]`: + ```toml + [dev-dependencies] + naga = "27" + # Dev-only dependency edge for the #[ignore] GPU re-capture tests, which + # migrate off the deprecated `render::golden::perceptual_diff` (L1) onto + # `buiy_verify::metric::compare` (metric.md § Migration). This forms a + # DEV-ONLY cycle (buiy_core → buiy_verify → buiy_core): a [dev-dependencies] + # edge is excluded from the normal build graph, so Cargo permits it, the + # production `cargo build -p buiy_core` is unaffected, and it adds no + # `cargo deny` surface. Confined to #[cfg(test)]. 
+ buiy_verify = { path = "../buiy_verify" } + ``` + +- [ ] **Verify the edge resolves (no cycle error).** + ```sh + cargo build -p buiy_core --tests 2>&1 | tail -5 + ``` + Expected: `Finished` — Cargo resolves the dev-dep edge with no `cyclic package dependency` error (the tripwire test compiles, proving `buiy_verify` is now reachable). If Cargo *does* error with a cycle, the edge was mistakenly added to `[dependencies]` instead of `[dev-dependencies]` — fix and re-run. + +- [ ] **Remove the temporary tripwire test** (its job — proving the edge resolves — is done; leaving it would block the Phase 1a deletion of `visual::compare_images`). Delete the `buiy_verify_is_reachable_from_buiy_core_tests` fn from `render_golden_harness.rs`. + +- [ ] **Run the headless gate** to confirm the edge introduced no breakage: + ```sh + cargo clippy --workspace --all-targets -- -D warnings && xvfb-run -a cargo test -p buiy_core + ``` + Expected: clippy clean, all `buiy_core` headless tests pass. + +- [ ] **Commit.** + ```sh + git commit -am "build(core): add buiy_verify as a dev-dependency (dev-only cycle) + + Phase 0.2 of the verification pyramid: the #[ignore] GPU re-capture tests + in tests/text_*_gpu.rs migrate (Phase 1a) off the deprecated L1 + perceptual_diff onto buiy_verify::metric::compare, so buiy_core's tests need + to name buiy_verify. Added under [dev-dependencies] only — this forms a + DEV-ONLY cycle (core → verify → core) that Cargo permits because dev-dep + edges are excluded from the normal build graph. Confined to #[cfg(test)]. + + Spec: docs/specs/2026-06-15-buiy-verification-design/metric.md § Migration. + + Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 0.3 — Define the canonical `Dpr` type in `buiy_core::render::golden` + +`Dpr` is device-pixel-ratio as **integer milliscale** (1000 = 1.0×, 2000 = 2.0×) so it is `Eq + Hash + Ord` with no float pitfalls — it is a *fixture axis* that keys goldens and coverage cells, never a tolerance. 
Defined **once** here (`determinism.md` § "Extending `GoldenConfig`"); `goldens.md`'s `GoldenKey.dpr` and `coverage.md`'s `Matrix.dprs`/`CoverageKey.dpr` import this type, they do not redefine it. The capture boundary converts the window's `f32` `scale_factor` via `Dpr::from_f32` and back via `Dpr::as_f32`. TDD: the round-trip unit test is written first and must fail (the type does not exist), then the type makes it pass. + +**Files:** +- Modify: `crates/buiy_core/src/render/golden.rs` (insert after the `GoldenConfig` impl, ~line 46) +- Test: `crates/buiy_core/src/render/golden.rs` (new `#[cfg(test)] mod tests` at the file tail — a pure-CPU unit test, runs under the headless gate, no `#[ignore]`) + +Steps: + +- [ ] **Write the failing round-trip unit test.** Append to the tail of `crates/buiy_core/src/render/golden.rs`: + ```rust + #[cfg(test)] + mod tests { + use super::*; + + #[test] + fn dpr_milliscale_round_trips_f32() { + // The canonical fixture axis: integer milliscale so it is Eq+Hash+Ord, + // but it must convert losslessly to/from the f32 scale_factor the + // window/extract path carries (determinism.md § Extending GoldenConfig). + assert_eq!(Dpr::from_f32(1.0), Dpr::X1); + assert_eq!(Dpr::from_f32(2.0), Dpr::X2); + assert_eq!(Dpr::X1.as_f32(), 1.0); + assert_eq!(Dpr::X2.as_f32(), 2.0); + // Round-trip through both directions for a fractional ratio (1.5×). + assert_eq!(Dpr::from_f32(1.5), Dpr(1500)); + assert_eq!(Dpr(1500).as_f32(), 1.5); + // from_f32 rounds to nearest milliscale (no truncation drift). + assert_eq!(Dpr::from_f32(1.2345), Dpr(1235)); + } + + #[test] + fn dpr_is_ord_and_hashable() { + // It keys a golden/coverage cell, so Ord + Hash must hold (the reason + // for milliscale over f32). A plain compile-and-run proof. 
+ use std::collections::HashSet; + assert!(Dpr::X1 < Dpr::X2); + let mut set = HashSet::new(); + assert!(set.insert(Dpr::X1)); + assert!(!set.insert(Dpr::X1)); // already present — Hash + Eq agree + assert!(set.insert(Dpr::X2)); + } + } + ``` + +- [ ] **Run to verify it fails to compile** (the type does not exist): + ```sh + cargo test -p buiy_core --lib render::golden 2>&1 | head -15 + ``` + Expected FAILURE: `error[E0433]: failed to resolve: use of undeclared type 'Dpr'` (and `Dpr::X1`, `from_f32`, `as_f32` all unresolved). + +- [ ] **Write the minimal `Dpr` definition.** Insert into `crates/buiy_core/src/render/golden.rs` immediately after the `GoldenConfig` impl block (after the closing `}` on line 46, before the `perceptual_diff` doc comment on line 48): + ```rust + /// **Canonical device-pixel-ratio type.** Integer *milliscale* (1000 = 1.0×, + /// 2000 = 2.0×) so it is `Eq + Hash + Ord` without float pitfalls — it is a + /// *fixture axis* that keys a golden / coverage cell, **never** a tolerance. + /// + /// Defined ONCE here; `buiy_verify::golden::GoldenKey.dpr` and + /// `buiy_verify::coverage::{Matrix.dprs, CoverageKey.dpr}` import this type, + /// they do **not** redefine it (verification-design `determinism.md`). The + /// capture boundary converts the window's `f32` `scale_factor` via + /// [`Dpr::from_f32`] and back via [`Dpr::as_f32`] when sizing the offscreen + /// target. Derives `serde` so the golden bless ledger can persist it directly; + /// `buiy_core` already carries `serde` as a workspace dep. + #[derive( + Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, + serde::Serialize, serde::Deserialize, + )] + pub struct Dpr(pub u32); + + impl Dpr { + /// 1.0× device-pixel-ratio (the headless capture default). + pub const X1: Self = Dpr(1000); + /// 2.0× device-pixel-ratio (the HiDPI fixture axis). + pub const X2: Self = Dpr(2000); + + /// Round an `f32` scale factor to integer milliscale (`1.0 → Dpr(1000)`). 
+ /// Rounds to nearest so a `1.5×` window maps to `Dpr(1500)` exactly. + pub fn from_f32(scale: f32) -> Self { + Dpr((scale * 1000.0).round() as u32) + } + + /// Back to the `f32` scale factor the window / extract path consumes. + pub fn as_f32(&self) -> f32 { + self.0 as f32 / 1000.0 + } + } + ``` + (`serde` is already in `buiy_core`'s dep graph via the workspace `serde` dep used elsewhere in `render/`; if `cargo doc`/`clippy` flags `serde` as not a direct dep, add `serde.workspace = true` to `crates/buiy_core/Cargo.toml`'s `[dependencies]` in this same step and note it in the commit — but verify first, as bevy re-exports may already satisfy it.) + +- [ ] **Run to verify the tests pass:** + ```sh + cargo test -p buiy_core --lib render::golden 2>&1 | tail -10 + ``` + Expected: `dpr_milliscale_round_trips_f32 ... ok`, `dpr_is_ord_and_hashable ... ok`. + +- [ ] **Run the doc + clippy gate** (the new `pub` type carries doc comments that must pass `RUSTDOCFLAGS="-D warnings"`, and the `serde` derive must not trip clippy): + ```sh + cargo clippy -p buiy_core --all-targets -- -D warnings && RUSTDOCFLAGS="-D warnings" cargo doc -p buiy_core --no-deps + ``` + Expected: both clean. + +- [ ] **Commit.** + ```sh + git commit -am "feat(core): canonical Dpr milliscale type in render::golden + + Phase 0.3 of the verification pyramid: Dpr is device-pixel-ratio as integer + milliscale (1000 = 1×, 2000 = 2×) so it is Eq+Hash+Ord — a fixture axis that + keys goldens/coverage cells, never a tolerance. Defined ONCE here; goldens + and coverage import it. from_f32/as_f32 round-trip the window's f32 + scale_factor at the capture boundary; serde-derived for the bless ledger. + + Spec: docs/specs/2026-06-15-buiy-verification-design/determinism.md + § Extending GoldenConfig. 
+ + Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 0.4 — Promote `capture_to_image(&mut App, &GoldenConfig) -> image::RgbaImage` into `render/golden.rs` src + +The shared capture seam moves out of `crates/buiy_core/tests/support/mod.rs` (where only `buiy_core`'s own tests can reach it) into `render/golden.rs` *src*, so `buiy_verify`'s reftest and golden tiers can call it (`determinism.md` § "Where the code lives"; README § Architecture). The promoted body extracts the existing `render_to_image` (offscreen target sized to **physical** pixels = `logical × dpr`) + `spawn_capture_camera` (`CAPTURE_MSAA = Msaa::Off`, opaque-black clear) + frame-drive + `readback_rgba` machinery and assembles the un-padded RGBA8 bytes into an `image::RgbaImage`. This requires `buiy_core` to gain `image = "0.25"` as a direct dep (README "Crate-dependency note": the *only* new GPU dep). GPU-coupled, so its meta-test is `#[ignore]`. + +**Scope boundary (honest):** Phase 0.4 promotes the *capture mechanics only* — size-to-physical, paint, readback, assemble `RgbaImage` — reusing the landed `gpu_render_app_scaled`/`readback_rgba` path. The full four-condition quiescence flush (asset-server + pipeline-cache gates) and the `cfg.dpr` `scale_factor` assertion described in `determinism.md` § "Async-asset flush" are **Phase 3.3**'s additions to this same function, not Phase 0. Phase 0.4 drives a bounded fixed frame count + the existing `wait_for_text_ready`-style atlas settle, exactly as `render_golden_harness.rs` does today, so the seam exists and is callable; Phase 3 hardens it. 
+ +**Files:** +- Modify: `crates/buiy_core/Cargo.toml` (`[dependencies]` — add `image`) +- Modify: `crates/buiy_core/src/render/golden.rs` (new `capture_to_image` fn + `readback_rgba_into` + the `CAPTURE_MSAA`/`CAPTURE_DITHER_OFF` constants; src, production-callable infra) +- Test: `crates/buiy_core/tests/render_golden_harness.rs` (new `#[ignore]` GPU test asserting `capture_to_image` returns an `RgbaImage` of the expected physical dimensions) + +Steps: + +- [ ] **Add `image` as a direct dep of `buiy_core`.** Edit `crates/buiy_core/Cargo.toml`'s `[dependencies]` — append after the last existing dep: + ```toml + # The promoted `render::golden::capture_to_image` returns an + # `image::RgbaImage` (verification-design README § Crate-dependency note: the + # ONLY new GPU dep buiy_core gains). Rides the existing workspace `image` + # pin — no second image-decode stack enters the tree. + image.workspace = true + ``` + +- [ ] **Write the failing `#[ignore]` GPU dimension meta-test.** Append to `crates/buiy_core/tests/render_golden_harness.rs`: + ```rust + // Needs a wgpu adapter (real GPU or lavapipe). Proves the promoted + // `capture_to_image` seam paints a fixture and returns an `image::RgbaImage` + // of the expected PHYSICAL dimensions (logical × dpr). Run with: + // cargo test -p buiy_core --test render_golden_harness -- --ignored --nocapture + #[test] + #[ignore = "needs a wgpu adapter (real GPU or lavapipe); run with --ignored"] + fn capture_to_image_returns_physical_dimensions() { + use bevy::prelude::*; + use buiy_core::Node; + use buiy_core::layout::{Inset, Length, Sizing, Style}; + use buiy_core::render::color::ColorToken; + use buiy_core::render::components::Background; + use buiy_core::render::golden::{GoldenConfig, capture_to_image}; + use std::borrow::Cow; + + const LOGICAL_W: u32 = 48; + const LOGICAL_H: u32 = 32; + + // 1.0× capture: physical == logical. (Phase 0.4 sizes via the literal 1.0 + // path; GoldenConfig has no `dpr` field until Phase 3.1.) 
+ let cfg = GoldenConfig::deterministic(); + let mut app = support::gpu_render_app_scaled(LOGICAL_W, LOGICAL_H, 1.0); + + // A known opaque fill so the capture is non-trivial (a blank frame would + // pass the dimension check vacuously; this proves real paint flows through). + { + let mut theme = app.world_mut().resource_mut::(); + theme + .colors + .insert("cap.fill".into(), Color::srgb(0.2, 0.6, 0.9)); + } + let fill = app + .world_mut() + .spawn(( + Node, + Style::default() + .absolute() + .inset(Inset { + top: Sizing::Length(Length::px(4.0)), + left: Sizing::Length(Length::px(4.0)), + ..default() + }) + .width_px(16.0) + .height_px(16.0), + Background { + color: ColorToken::Token(Cow::Borrowed("cap.fill")), + }, + )) + .id(); + app.world_mut() + .spawn((Node, Style::default())) + .add_children(&[fill]); + + let img = capture_to_image(&mut app, &cfg); + + assert_eq!( + (img.width(), img.height()), + (LOGICAL_W, LOGICAL_H), + "1× capture is logical-sized in physical pixels" + ); + // Non-vacuous: at least one pixel differs from the opaque-black clear. + let any_painted = img.pixels().any(|p| p.0 != [0, 0, 0, 255]); + assert!(any_painted, "capture produced non-clear pixels"); + } + ``` + +- [ ] **Run to verify it fails to compile** (`capture_to_image` does not exist): + ```sh + cargo test -p buiy_core --test render_golden_harness 2>&1 | head -15 + ``` + Expected FAILURE: `error[E0432]: unresolved import 'buiy_core::render::golden::capture_to_image'`. + +- [ ] **Write the `capture_to_image` implementation + capture constants.** Insert into `crates/buiy_core/src/render/golden.rs` after the `Dpr` impl (Phase 0.3) and before `perceptual_diff`. 
The body mirrors the proven `render_golden_harness.rs` capture flow but lives in src and returns an `RgbaImage`: + ```rust + /// Single-sampled capture: a 4× MSAA resolve antialiases edges + /// nondeterministically across drivers, while Buiy's in-shader analytic AA is + /// deterministic given identical FP — so MSAA buys nothing here and costs + /// determinism. Mirrors the capture camera's landed `Msaa::Off` + /// (verification-design `determinism.md`). + pub const CAPTURE_MSAA: bevy::render::view::Msaa = bevy::render::view::Msaa::Off; + + /// Deband dither perturbs the low bits of the tonemapped output; the capture + /// camera pins it off. A `true` sentinel the capture path documents (the + /// camera spawns with no `DebandDither::Enabled`). + pub const CAPTURE_DITHER_OFF: bool = true; + + /// **The shared capture seam** (verification-design README § Architecture): + /// render the already-built, fixture-populated `app` into an offscreen target + /// sized to the window's PHYSICAL pixel grid and read it back as an + /// `image::RgbaImage`. Re-runnable against one `App` (a reftest calls it twice + /// on one device; spec § "Resolved during synthesis" #4). + /// + /// Phase-0 scope: the capture mechanics (size-to-physical, paint, readback, + /// assemble). The four-condition quiescence flush and the + /// `scale_factor == cfg.dpr` assertion are Phase 3.3's hardening of this same + /// function (`determinism.md` § Async-asset flush). + /// + /// Drives `MAX_CAPTURE_FRAMES` update frames after finishing the app (pipeline + /// async-compile + extract + prepare + paint settle), then reads back the + /// offscreen target's un-padded RGBA8 bytes. 
+ pub fn capture_to_image(app: &mut bevy::app::App, _cfg: &GoldenConfig) -> image::RgbaImage { + use bevy::asset::RenderAssetUsages; + use bevy::camera::RenderTarget; + use bevy::image::Image; + use bevy::prelude::*; + use bevy::render::render_resource::{TextureFormat, TextureUsages}; + + // Physical pixel grid the offscreen target must match: the primary + // window's physical size (logical × scale_factor), which the view uniform + // is built from (extract fills `logical_size` from the primary window). + let (phys_w, phys_h) = { + let window = app + .world_mut() + .query::<&bevy::window::Window>() + .single(app.world()) + .expect("primary window for capture sizing"); + let r = window.resolution.physical_size(); + (r.x, r.y) + }; + + // Offscreen Rgba8UnormSrgb target with COPY_SRC for the readback copy and + // RenderAssetUsages::all() so the GpuImage exists in the render world. + let target = { + let mut image = + Image::new_target_texture(phys_w, phys_h, TextureFormat::Rgba8UnormSrgb, None); + image.texture_descriptor.usage |= TextureUsages::COPY_SRC; + image.asset_usage = RenderAssetUsages::all(); + app.world_mut().resource_mut::<Assets<Image>>().add(image) + }; + + // Capture camera: opaque-black clear, CAPTURE_MSAA (single-sampled), + // dither off (bare Camera2d at Msaa::Off carries no DebandDither::Enabled). + app.world_mut().spawn(( + Camera2d, + RenderTarget::from(target.clone()), + CAPTURE_MSAA, + Camera { + clear_color: ClearColorConfig::Custom(Color::BLACK), + ..default() + }, + )); + + // Finish materializes the device + pipelines; drive frames so layout → + // extract → prepare → paint settle before the readback poll. 
+ const MAX_CAPTURE_FRAMES: usize = 3; + app.finish(); + app.cleanup(); + for _ in 0..MAX_CAPTURE_FRAMES { + app.update(); + } + + let bytes = readback_rgba_into(app, &target, phys_w, phys_h); + image::RgbaImage::from_raw(phys_w, phys_h, bytes) + .expect("readback byte count matches phys_w * phys_h * 4") + } + ``` + Then add `readback_rgba_into` directly below `capture_to_image` — the src twin of the test-support `readback_rgba` (the readback poll cannot stay in `tests/support`, so promote its body too). Copy the proven poll + 256-byte row-padding strip from `tests/support/mod.rs:353`–end, with the signature `fn readback_rgba_into(app: &mut bevy::app::App, target: &bevy::asset::Handle<bevy::image::Image>, w: u32, h: u32) -> Vec<u8>`. Keep it `pub(crate)` if no external caller needs it; `pub` if a reftest reads back directly. Verify the exact `Readback`/`ReadbackComplete` import paths against `tests/support/mod.rs` when copying. + +- [ ] **Migrate `tests/support` to delegate** (DRY — the support helper must not duplicate the now-promoted logic). Re-point `tests/support/mod.rs`'s `readback_rgba` to call `buiy_core::render::golden::readback_rgba_into` (or, if `readback_rgba_into` was made `pub(crate)` and is thus unreachable from tests, keep both and note the intentional duplication is temporary until Phase 3 consolidates). Prefer the delegation; confirm `cargo build -p buiy_core --tests` still compiles. + +- [ ] **Run the `#[ignore]` GPU meta-test on the real adapter** (this host, AMD RX 6700 XT): + ```sh + cargo test -p buiy_core --test render_golden_harness -- --ignored --test-threads=1 capture_to_image_returns_physical_dimensions --nocapture + ``` + Expected: `capture_to_image_returns_physical_dimensions ... ok` — the returned `RgbaImage` is `48×32` and has non-clear pixels. 
+ +- [ ] **Run the full headless gate + the doc gate** (the new `pub` fn + constants must pass clippy and `RUSTDOCFLAGS="-D warnings"`; the headless gate must stay green with `image` now a direct dep): + ```sh + cargo clippy --workspace --all-targets -- -D warnings && RUSTDOCFLAGS="-D warnings" cargo doc -p buiy_core --no-deps && xvfb-run -a cargo test -p buiy_core + ``` + Expected: all clean/green. (The headless leg does NOT run the `#[ignore]` capture test — that is the GPU lane above — but it confirms the src compiles + the support migration did not break the existing headless tests.) + +- [ ] **Run the supply-chain gate** (`image` is already a workspace dep, so no new license — but the gate is cheap insurance that adding it as a *direct* `buiy_core` dep changed nothing): + ```sh + cargo deny check + ``` + Expected: PASS (no new transitive — `image = "0.25"` is already resolved for `buiy_verify`). + +- [ ] **Commit.** + ```sh + git commit -am "feat(core): promote capture_to_image into render::golden src + + Phase 0.4 of the verification pyramid: the shared GPU capture seam moves out + of tests/support into render::golden src as + capture_to_image(&mut App, &GoldenConfig) -> image::RgbaImage, so + buiy_verify's reftest + golden tiers can call it. Sizes the offscreen target + to the window's physical pixel grid, paints under CAPTURE_MSAA (single- + sampled, dither off), and reads back into an RgbaImage. buiy_core gains + image as a direct dep (README § Crate-dependency note: the only new GPU + dep). #[ignore] GPU meta-test asserts physical dimensions + non-vacuous paint. + + Phase-0 scope is the capture mechanics; the four-condition quiescence flush + and the scale_factor==dpr assertion are Phase 3.3's hardening. + Spec: docs/specs/2026-06-15-buiy-verification-design/determinism.md + § Where the code lives. 
+ + Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +**Phase 0 exit criteria:** `cargo deny check` green with the two new metric/snapshot deps; `cargo build -p buiy_core --tests` resolves the dev-only `buiy_verify` edge with no cycle error; `Dpr` round-trips `f32` under the headless gate; `capture_to_image` returns a correctly-sized non-blank `RgbaImage` on the GPU lane. Phase 1 (metric + reftests) now has every seam it imports. + +--- + +## Phase 1a — Perceptual metric + +Realizes [`metric.md`](../specs/2026-06-15-buiy-verification-design/metric.md). Builds `buiy_verify::metric` — the AA-aware two-axis perceptual diff shared by tiers 4 (reftests) and 5 (goldens) — then retires the two naive metrics it supersedes. Pure CPU; every task here runs under the headless gate with **no** `--ignored`, except the final two GPU-site migration tasks (which only *compile* under the headless gate and *run* on the GPU lane). + +> **Critical deviation from `metric.md` — `pixelmatch = "0.1.0"` is not usable as specified; vendor the algorithm instead.** Verified against the published crate source (`~/.cargo/registry/.../pixelmatch-0.1.0/src/lib.rs`): the crate's only public surface is `pixelmatch(img1: impl Read, img2: impl Read, out, w, h, Options) -> Result` — it (a) consumes **PNG-encoded byte streams**, not `image::RgbaImage`; (b) returns only a **flat changed-pixel `usize`**, exposing neither the per-pixel YIQ delta nor an L∞ channel delta, so it cannot feed `Diff`'s `max_channel_delta` axis; (c) keeps `color_delta` / `antialiased` **private**, contradicting metric.md's "It exposes the `colorDelta`/`antialiased` primitives `compare` wraps"; and (d) is pinned to the **`image` 0.24** API (`ImageOutputFormat`, `DynamicImage::from_decoder`), which **does not compile against the workspace `image = "0.25"`**. The spec's own directive is "adopt the reference algorithm — don't re-derive the `35215`/YIQ constants." 
We honor that directive precisely by **vendoring the ~150 LOC reference algorithm** (the exact `color_delta` luminance-weighted YIQ delta + the `antialiased` brightest/darkest-sibling test + `has_many_siblings`) into `metric.rs`, ported verbatim from pixelmatch's MIT source onto `image` 0.25 / `RgbaImage`, with a provenance comment. This is strictly *more* faithful to metric.md's intent than depending on an unusable, `image`-incompatible crate, and it is what gives `compare` the per-pixel hooks the two-axis `Diff` requires. **Net dependency delta for Phase 1a: `image-compare = "=0.5.0"` only** (MSSIM; landed in Phase 0.1); **no `pixelmatch` dependency is added.** The constants are guarded against drift by the known-answer unit tests below, which is exactly the protection a version pin would give. *(This deviation should be reflected back into `metric.md` § "Crate choice" / "Migration" — see Self-review § gaps.)* + +--- + +### Task 1a.0 — Confirm `image-compare` resolves (Phase 0.1 already added it) + +Phase 0.1 added `image-compare = "=0.5.0"` to `buiy_verify`. This task is a thin re-confirmation that the dep is present and the supply-chain gate is green before the metric code consumes it. (If Phase 0.1 was skipped or the dep is absent, add it here per the Phase 0.1 step.) + +**Files:** +- Verify: `crates/buiy_verify/Cargo.toml` (the `image-compare = "=0.5.0"` line from Phase 0.1) + +- [ ] Step — confirm the dep resolves and the gate is green: + ```sh + grep -n "image-compare" crates/buiy_verify/Cargo.toml && cargo build -p buiy_verify && cargo deny check 2>&1 | tail -8 + ``` + Expected: the dep line is present; `Finished`; `advisories ok`, `licenses ok`, `bans ok`, `sources ok`. If `image-compare` is missing, add it now per Phase 0.1 and re-run `cargo deny check`, recording any new SPDX id added to `deny.toml`'s allow list in the eventual commit. No commit for this confirmation-only task. 
+ +--- + +### Task 1a.1 — Module skeleton: `Diff` / `FuzzBudget` / `CompareOpts` types + identity-on-empty `compare` stub + +Smallest red-green slice that pins the type shapes and wires the module into `lib.rs`. `compare` is a deliberately-incomplete stub (returns the empty/identity `Diff`) so the type-shape tests bind before the algorithm lands. + +**Files:** +- Create: `crates/buiy_verify/src/metric.rs` +- Modify: `crates/buiy_verify/src/lib.rs` (add `pub mod metric;`) +- Test (inline `#[cfg(test)]` for type-shape unit checks): `crates/buiy_verify/src/metric.rs` + +- [ ] Step — write the failing test. Append this `#[cfg(test)]` block at the end of the new `metric.rs` (it references types that do not yet exist, so it fails to compile — the RED state): + ```rust + #[cfg(test)] + mod tests { + use super::*; + + #[test] + fn exact_budget_is_zero_zero() { + assert_eq!(FuzzBudget::EXACT.max_channel_delta, 0); + assert_eq!(FuzzBudget::EXACT.max_diff_pixels, 0); + } + + #[test] + fn default_opts_are_lenient_aware() { + let o = CompareOpts::default(); + assert_eq!(o.threshold, 0.1); + assert!(!o.include_aa); + assert!(o.mssim); + assert!(!o.emit_diff_image); + } + + #[test] + fn empty_vs_empty_is_zero_diff() { + let e = image::RgbaImage::new(0, 0); + let d = compare(&e, &e, &CompareOpts::default()); + assert_eq!(d.differing_pixels, 0); + assert_eq!(d.max_channel_delta, 0); + assert_eq!(d.total_pixels, 0); + assert_eq!(d.mssim, None); + assert!(d.diff_image.is_none()); + } + } + ``` + +- [ ] Step — run to verify it fails (compile error — types/fn undefined): + ```sh + cargo test -p buiy_verify --lib metric 2>&1 | tail -20 + ``` + Expected: `error[E0433]`/`E0432` — cannot find `FuzzBudget`, `CompareOpts`, `Diff`, or `compare` in `super`. + +- [ ] Step — write the minimal implementation. Prepend the module doc + types + the empty-only `compare` stub above the `#[cfg(test)]` block, so `metric.rs` begins: + ```rust + //! 
Perceptual image diff — the shared metric for reftests (tier 4) and goldens + //! (tier 5). Luminance-weighted YIQ colorDelta + antialias-sibling exclusion, + //! gated on a two-axis FuzzBudget. Supersedes render::golden::perceptual_diff + //! (L1) and visual::compare_images (RMSE). + //! + //! The per-pixel YIQ `color_delta`, the `antialiased` brightest/darkest-sibling + //! test, and `has_many_siblings` are ported verbatim from the canonical + //! pixelmatch reference (MIT; mapbox/pixelmatch, the Rust `pixelmatch` 0.1.0 + //! crate). They are vendored, not depended on: the published crate consumes + //! PNG byte streams, returns only a flat count, keeps these primitives private, + //! and is image-0.24-bound — none of which fits `Diff`'s two-axis shape on + //! image 0.25. Vendoring is metric.md's "adopt the reference algorithm, don't + //! re-derive the 35215/YIQ constants" applied exactly. + + use image::RgbaImage; + + /// Outcome of one comparison. All counts are over the diffed (overlapping) + /// pixel set. `diff_image` is emitted only when `CompareOpts::emit_diff_image`. + #[derive(Clone, Debug)] + pub struct Diff { + /// Non-AA pixels whose YIQ colorDelta exceeded the per-pixel threshold. + pub differing_pixels: u32, + /// Largest single-channel L∞ delta over all pixels (diagnostic; 0..=255). + pub max_channel_delta: u8, + /// Total pixels compared (== w*h; 0 only for empty/degenerate input). + pub total_pixels: u32, + /// Advisory MSSIM in [0,1] (1 == identical). `None` when skipped. + pub mssim: Option<f64>, + /// Heatmap: AA pixels dimmed, differing pixels painted (pixelmatch palette). + pub diff_image: Option<RgbaImage>, + } + + /// The two-axis gate. A Diff PASSES iff BOTH hold. Default after determinism is + /// (0, 0); widen per fixture with a documented reason. + #[derive(Clone, Copy, Debug, PartialEq, Eq)] + pub struct FuzzBudget { + /// No single channel of any pixel may differ by more than this (L∞). 
+ pub max_channel_delta: u8, + /// At most this many non-AA pixels may exceed the per-pixel YIQ threshold. + pub max_diff_pixels: u32, + } + + impl FuzzBudget { + /// The post-determinism default: bit-exact within one pinned rasterizer. + pub const EXACT: FuzzBudget = FuzzBudget { max_channel_delta: 0, max_diff_pixels: 0 }; + } + + /// Per-pixel and AA-detection knobs. `threshold` feeds the + /// `max_delta = 35215 · threshold²` luminance model; `include_aa = true` makes + /// AA pixels COUNT (for the few tests that assert AA exactly). + #[derive(Clone, Copy, Debug)] + pub struct CompareOpts { + /// Matching sensitivity in [0,1]; default 0.1. Smaller = stricter. + pub threshold: f64, + /// Treat antialiased pixels as differences instead of excluding them. + pub include_aa: bool, + /// Also compute the advisory MSSIM channel (image-compare). Default true. + pub mssim: bool, + /// Allocate and fill `Diff::diff_image`. Off in the hot reftest path. + pub emit_diff_image: bool, + } + + impl Default for CompareOpts { + fn default() -> Self { + Self { threshold: 0.1, include_aa: false, mssim: true, emit_diff_image: false } + } + } + + /// Compare two RGBA images. **Infallible** — returns a `Diff`, never a + /// `Result`. (Stub: only the empty case is correct until 1a.2/1a.3 land.) + pub fn compare(a: &RgbaImage, b: &RgbaImage, _opts: &CompareOpts) -> Diff { + let _ = (a, b); + Diff { + differing_pixels: 0, + max_channel_delta: 0, + total_pixels: 0, + mssim: None, + diff_image: None, + } + } + ``` + Then add the module to `lib.rs` so the `pub mod` block reads (alphabetical): + ```rust + pub mod a11y; + pub mod contrast; + pub mod metric; + pub mod visual; + ``` + +- [ ] Step — run to verify it passes: + ```sh + cargo test -p buiy_verify --lib metric 2>&1 | tail -12 + ``` + Expected: `test result: ok. 3 passed` (`exact_budget_is_zero_zero`, `default_opts_are_lenient_aware`, `empty_vs_empty_is_zero_diff`). 
+ +- [ ] Step — commit: + ```sh + git add crates/buiy_verify/src/metric.rs crates/buiy_verify/src/lib.rs + git commit -m "feat(verify): metric module skeleton — Diff/FuzzBudget/CompareOpts + +Type shapes + empty-case compare stub, wired into lib.rs. Algorithm +lands next. Realizes metric.md § Types. + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1a.2 — Vendored per-pixel core: YIQ `color_delta` + the differing-pixel count + `max_channel_delta` + +Ports the pixelmatch YIQ luminance model and fills the two non-AA-dependent axes of `Diff`. AA exclusion comes in 1a.3; here every over-threshold pixel counts (i.e. behaves as `include_aa = true`). This lets the YIQ-weighting and L∞ axis tests bind before the sibling test complicates them. + +**Files:** +- Modify: `crates/buiy_verify/src/metric.rs` (replace the `compare` stub body; add private `color_delta`/`rgb2y`/`rgb2i`/`rgb2q`/`blend` helpers; extend `#[cfg(test)]`) +- Test: inline `#[cfg(test)] mod tests` + +- [ ] Step — write the failing tests. Add inside `mod tests`: + ```rust + /// Solid w×h image of one color. + fn solid(w: u32, h: u32, px: [u8; 4]) -> image::RgbaImage { + image::RgbaImage::from_pixel(w, h, image::Rgba(px)) + } + + #[test] + fn identity_is_zero_diff() { + let img = solid(8, 8, [10, 200, 30, 255]); + let d = compare(&img, &img, &CompareOpts::default()); + assert_eq!(d.differing_pixels, 0); + assert_eq!(d.max_channel_delta, 0); + assert_eq!(d.total_pixels, 64); + } + + #[test] + fn single_wrong_pixel_survives_every_scale() { + // The §4 regression: one wrong-by-200 pixel must be caught at any N. 
+ for n in [16u32, 256, 2048] { + let a = solid(n, n, [0, 0, 0, 255]); + let mut b = a.clone(); + b.put_pixel(n / 2, n / 2, image::Rgba([200, 200, 200, 255])); + let d = compare(&a, &b, &CompareOpts { include_aa: true, mssim: false, ..Default::default() }); + assert_eq!(d.differing_pixels, 1, "N={n}: exactly one differing pixel"); + assert!(d.max_channel_delta >= 200, "N={n}: L∞ caught the 200 delta"); + assert_eq!(d.total_pixels, n * n); + } + } + + #[test] + fn yiq_luminance_outweighs_chroma() { + // Equal raw L∞ (delta 60 on one channel) but a luma-shifted pixel must + // score a larger YIQ delta than a chroma-only shift — pins the weighting. + let base = solid(4, 4, [120, 120, 120, 255]); + let mut luma = base.clone(); + luma.put_pixel(0, 0, image::Rgba([180, 180, 180, 255])); // +60 all channels: pure luma + let mut chroma = base.clone(); + chroma.put_pixel(0, 0, image::Rgba([180, 120, 60, 255])); // +60 R / -60 B: chroma-leaning, same L∞=60 + // threshold 0.2 → max_delta = 35215 · 0.2² ≈ 1408.6, which sits between the + // luma delta (0.5053 · 60² ≈ 1819) and the chroma delta (≈ 976). At 0.05 + // (max_delta ≈ 88) BOTH pixels would trip and the chroma assert would fail. + let opts = CompareOpts { include_aa: true, mssim: false, threshold: 0.2, ..Default::default() }; + let dl = compare(&base, &luma, &opts); + let dc = compare(&base, &chroma, &opts); + // At a threshold where luma trips but the lower-weighted chroma delta does + // not, the luma case differs and the chroma case does not. 
+ assert_eq!(dl.differing_pixels, 1, "luma shift exceeds threshold"); + assert_eq!(dc.differing_pixels, 0, "chroma-only shift is under-weighted below threshold"); + } + ``` + +- [ ] Step — run to verify it fails: + ```sh + cargo test -p buiy_verify --lib metric 2>&1 | tail -20 + ``` + Expected: `single_wrong_pixel_survives_every_scale`, `yiq_luminance_outweighs_chroma` FAIL (`differing_pixels == 0`, `max_channel_delta == 0` from the stub); `identity_is_zero_diff` also fails on `total_pixels == 64` (stub returns 0). + +- [ ] Step — write the minimal implementation. 
Replace the `compare` stub body and add the vendored helpers (drop the `let _ = (a, b);`): + ```rust + pub fn compare(a: &RgbaImage, b: &RgbaImage, opts: &CompareOpts) -> Diff { + // Empty: nothing to observe (matches compare_images's 0.0 empty case). + if a.width() == 0 || a.height() == 0 { + return Diff { + differing_pixels: 0, + max_channel_delta: 0, + total_pixels: 0, + mssim: None, + diff_image: None, + }; + } + // Dimension mismatch handled in 1a.4 (saturated Diff). For now assume equal. + let (w, h) = a.dimensions(); + let total_pixels = w * h; + let max_delta = 35_215_f64 * opts.threshold * opts.threshold; + + let mut differing_pixels = 0u32; + let mut max_channel_delta = 0u8; + for (pa, pb) in a.pixels().zip(b.pixels()) { + for ch in 0..4 { + let d = (pa[ch] as i16 - pb[ch] as i16).unsigned_abs() as u8; + max_channel_delta = max_channel_delta.max(d); + } + let delta = color_delta(pa, pb, false); + if delta.abs() > max_delta { + // AA exclusion is layered in 1a.3; here every over-threshold pixel counts. + differing_pixels += 1; + } + } + + Diff { + differing_pixels, + max_channel_delta, + total_pixels, + mssim: None, // wired in 1a.5 + diff_image: None, // wired in 1a.6 + } + } + + // ---- Vendored from pixelmatch (MIT). Verbatim constants; ported to image 0.25. + // "Measuring perceived color difference using YIQ NTSC transmission color space" + // (Kotsarenko & Ramos). `y_only` returns the signed luminance delta (used by the + // AA sibling test); otherwise the luminance-weighted YIQ squared delta, signed + // by which pixel is brighter. 
+ fn color_delta(p1: &image::Rgba<u8>, p2: &image::Rgba<u8>, y_only: bool) -> f64 { + let (mut r1, mut g1, mut b1, mut a1) = + (p1[0] as f64, p1[1] as f64, p1[2] as f64, p1[3] as f64); + let (mut r2, mut g2, mut b2, mut a2) = + (p2[0] as f64, p2[1] as f64, p2[2] as f64, p2[3] as f64); + + if (a1 - a2).abs() < f64::EPSILON + && (r1 - r2).abs() < f64::EPSILON + && (g1 - g2).abs() < f64::EPSILON + && (b1 - b2).abs() < f64::EPSILON + { + return 0.0; + } + if a1 < 255.0 { + a1 /= 255.0; + r1 = blend(r1, a1); + g1 = blend(g1, a1); + b1 = blend(b1, a1); + } + if a2 < 255.0 { + a2 /= 255.0; + r2 = blend(r2, a2); + g2 = blend(g2, a2); + b2 = blend(b2, a2); + } + let y1 = rgb2y(r1, g1, b1); + let y2 = rgb2y(r2, g2, b2); + let y = y1 - y2; + if y_only { + return y; + } + let i = rgb2i(r1, g1, b1) - rgb2i(r2, g2, b2); + let q = rgb2q(r1, g1, b1) - rgb2q(r2, g2, b2); + let delta = 0.5053 * y * y + 0.299 * i * i + 0.1957 * q * q; + if y1 > y2 { -delta } else { delta } + } + + // blend semi-transparent color with white + fn blend(c: f64, a: f64) -> f64 { + 255.0 + (c - 255.0) * a + } + fn rgb2y(r: f64, g: f64, b: f64) -> f64 { + r * 0.298_895_31 + g * 0.586_622_47 + b * 0.114_482_23 + } + fn rgb2i(r: f64, g: f64, b: f64) -> f64 { + r * 0.595_977_99 - g * 0.274_176_10 - b * 0.321_801_89 + } + fn rgb2q(r: f64, g: f64, b: f64) -> f64 { + r * 0.211_470_17 - g * 0.522_617_11 + b * 0.311_146_94 + } + ``` + +- [ ] Step — run to verify it passes: + ```sh + cargo test -p buiy_verify --lib metric 2>&1 | tail -12 + ``` + Expected: `test result: ok. 6 passed` (the three new + the three from 1a.1). + +- [ ] Step — commit: + ```sh + git add crates/buiy_verify/src/metric.rs + git commit -m "feat(verify): vendored YIQ color_delta + two-axis pixel scan + +Ports pixelmatch's luminance-weighted YIQ delta (verbatim constants) +and adds the raw L∞ max_channel_delta scan. Single-wrong-pixel is now +caught at N in {16,256,2048} — the §4 dilution regression. AA exclusion +and MSSIM follow.
+ +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1a.3 — Antialias sibling exclusion (the brightest/darkest-neighbor test) + +Adds the one feature both naive metrics lack: a differing pixel that is AA in *either* image is excluded from `differing_pixels` unless `include_aa`. + +**Files:** +- Modify: `crates/buiy_verify/src/metric.rs` (gate the `differing_pixels += 1` behind `opts.include_aa || (!aa(a,..) && !aa(b,..))`; add private `antialiased` + `has_many_siblings`) +- Test: inline `#[cfg(test)] mod tests` + +- [ ] Step — write the failing test. Add inside `mod tests`: + ```rust + /// A 1px-wide vertical AA band: black | gray | white, where the gray boundary + /// column has a different intensity in `b`. Each gray pixel has a strictly + /// darker and a strictly brighter sibling and only two equal neighbors, so the + /// sibling test reads it as AA. (A hard black/white edge would NOT qualify: + /// every edge pixel has 3+ same-color neighbors and is treated as a real edge.) + fn aa_edge_pair() -> (image::RgbaImage, image::RgbaImage) { + let (w, h) = (16u32, 16u32); + let mut a = image::RgbaImage::new(w, h); + let mut b = image::RgbaImage::new(w, h); + for y in 0..h { + for x in 0..w { + // Column 8 is the AA ramp: gray 192 in a, gray 64 in b; flat fields either side. + let (pa, pb) = match x { + 0..=7 => ([0, 0, 0, 255], [0, 0, 0, 255]), + 8 => ([192, 192, 192, 255], [64, 64, 64, 255]), + _ => ([255, 255, 255, 255], [255, 255, 255, 255]), + }; + a.put_pixel(x, y, image::Rgba(pa)); + b.put_pixel(x, y, image::Rgba(pb)); + } + } + (a, b) + } + + #[test] + fn aa_pixels_excluded_by_default_but_counted_with_include_aa() { + let (a, b) = aa_edge_pair(); + let excluded = compare(&a, &b, &CompareOpts { mssim: false, ..Default::default() }); + let counted = compare(&a, &b, &CompareOpts { include_aa: true, mssim: false, ..Default::default() }); + assert_eq!(excluded.differing_pixels, 0, "edge pixels read as AA, excluded"); + assert!(counted.differing_pixels > 0, "include_aa counts the same pixels"); + } + + #[test] + fn real_defect_is_not_excluded_as_aa() { + // An isolated wrong pixel on a flat field has no brighter+darker sibling + // pair, so it is NOT AA — it must still count with default opts.
+ let a = solid(16, 16, [0, 0, 0, 255]); + let mut b = a.clone(); + b.put_pixel(8, 8, image::Rgba([200, 200, 200, 255])); + let d = compare(&a, &b, &CompareOpts { mssim: false, ..Default::default() }); + assert_eq!(d.differing_pixels, 1, "isolated defect is not AA-excluded"); + } + ``` + +- [ ] Step — run to verify it fails: + ```sh + cargo test -p buiy_verify --lib metric 2>&1 | tail -20 + ``` + Expected: `aa_pixels_excluded_by_default_but_counted_with_include_aa` FAILS (`excluded.differing_pixels` is `> 0`, not `0`); `real_defect_is_not_excluded_as_aa` passes already (no sibling pair). + +- [ ] Step — write the minimal implementation. In `compare`, the AA decision needs each pixel's `(x, y)`, so switch the loop to `enumerate_pixels` and gate the count. Replace the per-pixel loop body: + ```rust + let mut differing_pixels = 0u32; + let mut max_channel_delta = 0u8; + for (x, y, pa) in a.enumerate_pixels() { + let pb = b.get_pixel(x, y); + for ch in 0..4 { + let d = (pa[ch] as i16 - pb[ch] as i16).unsigned_abs() as u8; + max_channel_delta = max_channel_delta.max(d); + } + let delta = color_delta(pa, pb, false); + if delta.abs() > max_delta { + let is_aa = !opts.include_aa + && (antialiased(a, x, y, w, h, b) || antialiased(b, x, y, w, h, a)); + if !is_aa { + differing_pixels += 1; + } + } + } + ``` + Then add the two vendored predicates (verbatim port; `image` 0.25 `get_pixel` is inherent on `RgbaImage`): + ```rust + // Vendored from pixelmatch (MIT): "Anti-aliased Pixel and Intensity Slope + // Detector" (Vyšniauskas, 2009). A pixel is AA iff it has a strictly brighter + // and a strictly darker sibling and that extreme has 3+ equal siblings in BOTH + // images (so it is an intensity slope, not a real edge in both). 
+ fn antialiased(img1: &RgbaImage, x: u32, y: u32, w: u32, h: u32, img2: &RgbaImage) -> bool { + let mut zeroes: u8 = u8::from(x == 0 || y == 0 || x == w - 1 || y == h - 1); + let (mut min, mut max) = (0.0f64, 0.0f64); + let (mut min_x, mut min_y, mut max_x, mut max_y) = (0u32, 0u32, 0u32, 0u32); + let center = img1.get_pixel(x, y); + + let x0 = x.saturating_sub(1); + let x1 = if x < w - 1 { x + 1 } else { x }; + let y0 = y.saturating_sub(1); + let y1 = if y < h - 1 { y + 1 } else { y }; + for ax in x0..=x1 { + for ay in y0..=y1 { + if ax == x && ay == y { + continue; + } + let delta = color_delta(center, img1.get_pixel(ax, ay), true); + if delta == 0.0 { + zeroes += 1; + if zeroes > 2 { + return false; + } + continue; + } + if delta < min { + min = delta; + min_x = ax; + min_y = ay; + continue; + } + if delta > max { + max = delta; + max_x = ax; + max_y = ay; + } + } + } + if min == 0.0 || max == 0.0 { + return false; + } + (has_many_siblings(img1, min_x, min_y, w, h) && has_many_siblings(img2, min_x, min_y, w, h)) + || (has_many_siblings(img1, max_x, max_y, w, h) + && has_many_siblings(img2, max_x, max_y, w, h)) + } + + // Vendored from pixelmatch (MIT): 3+ adjacent pixels of identical color. + fn has_many_siblings(img: &RgbaImage, x: u32, y: u32, w: u32, h: u32) -> bool { + let mut zeroes: u8 = u8::from(x == 0 || y == 0 || x == w - 1 || y == h - 1); + let center = img.get_pixel(x, y); + let x0 = x.saturating_sub(1); + let x1 = if x < w - 1 { x + 1 } else { x }; + let y0 = y.saturating_sub(1); + let y1 = if y < h - 1 { y + 1 } else { y }; + for ax in x0..=x1 { + for ay in y0..=y1 { + if ax == x && ay == y { + continue; + } + if center == img.get_pixel(ax, ay) { + zeroes += 1; + if zeroes > 2 { + return true; + } + } + } + } + false + } + ``` + +- [ ] Step — run to verify it passes: + ```sh + cargo test -p buiy_verify --lib metric 2>&1 | tail -12 + ``` + Expected: `test result: ok. 8 passed`. 
+ +- [ ] Step — commit: + ```sh + git add crates/buiy_verify/src/metric.rs + git commit -m "feat(verify): antialias sibling exclusion (pixelmatch port) + +A differing pixel that is AA in either image is excluded unless +include_aa. EXACT (0,0) now holds across residual AA jitter while still +catching an isolated real defect. Vendored verbatim from pixelmatch. + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1a.4 — `Diff::passes` / `Diff::within` + the saturated dimension-mismatch `Diff` + +Adds the gate methods and the loud-red mismatch handling (the spec's explicit replacement for the naive silent `1.0`). + +**Files:** +- Modify: `crates/buiy_verify/src/metric.rs` (add `impl Diff { passes, within }`; add the dim-mismatch early return in `compare`) +- Test: inline `#[cfg(test)] mod tests` + +- [ ] Step — write the failing tests. Add inside `mod tests`: + ```rust + #[test] + fn passes_requires_both_axes() { + // One pixel off by 255: trips max_channel_delta, one differing pixel. + let a = solid(8, 8, [0, 0, 0, 255]); + let mut b = a.clone(); + b.put_pixel(0, 0, image::Rgba([255, 255, 255, 255])); + let d = compare(&a, &b, &CompareOpts { mssim: false, ..Default::default() }); + assert!(!d.passes(&FuzzBudget::EXACT), "EXACT rejects any diff"); + assert!(!d.passes(&FuzzBudget { max_channel_delta: 255, max_diff_pixels: 0 }), + "diff-pixel axis still binds when channel axis is satisfied"); + assert!(!d.passes(&FuzzBudget { max_channel_delta: 0, max_diff_pixels: 1 }), + "channel axis still binds when diff-pixel axis is satisfied"); + assert!(d.passes(&FuzzBudget { max_channel_delta: 255, max_diff_pixels: 1 }), + "both axes satisfied -> pass"); + } + + #[test] + fn within_floor_catches_unexpectedly_clean() { + // A clean render (0,0) must FAIL a widened budget whose min floor is > 0. 
+ let a = solid(8, 8, [5, 5, 5, 255]); + let clean = compare(&a, &a, &CompareOpts { mssim: false, ..Default::default() }); + let min = FuzzBudget { max_channel_delta: 1, max_diff_pixels: 1 }; + let max = FuzzBudget { max_channel_delta: 10, max_diff_pixels: 50 }; + assert!(!clean.within(&min, &max), "a clean render is below the expected floor"); + } + + #[test] + fn dimension_mismatch_is_saturated_and_fails_every_budget() { + let a = solid(4, 4, [0, 0, 0, 255]); + let b = solid(5, 4, [0, 0, 0, 255]); + let d = compare(&a, &b, &CompareOpts::default()); + assert_eq!(d.max_channel_delta, 255); + assert_eq!(d.differing_pixels, d.total_pixels); + assert_eq!(d.total_pixels, 20, "total = max(area) = 5*4"); + assert_eq!(d.mssim, Some(0.0)); + // Fails even a hypothetical maximal budget. + let maximal = FuzzBudget { max_channel_delta: 255, max_diff_pixels: u32::MAX }; + assert!(!d.passes(&maximal), "saturated diff fails the loudest budget too"); + } + + #[test] + fn empty_capture_forbidden_by_explicit_assertion() { + // The metric returns total_pixels == 0 for empty; harnesses forbid it. + let e = image::RgbaImage::new(0, 0); + let d = compare(&e, &e, &CompareOpts::default()); + assert_eq!(d.total_pixels, 0); + } + ``` + +- [ ] Step — run to verify it fails: + ```sh + cargo test -p buiy_verify --lib metric 2>&1 | tail -20 + ``` + Expected: `passes_requires_both_axes`, `within_floor_catches_unexpectedly_clean` FAIL to compile (no `passes`/`within`); `dimension_mismatch_is_saturated_and_fails_every_budget` FAILS at runtime once the others compile (current `compare` assumes equal dims). + +- [ ] Step — write the minimal implementation. Add the dim-mismatch early return at the very top of `compare`, **before** the empty guard (that guard inspects only `a`, so placed after it an empty `a` against a non-empty `b` would return the zero `Diff` and silently pass every budget): + ```rust + if a.dimensions() != b.dimensions() { + // Loud-red sentinel (metric.md): a saturated Diff fails EVERY budget. + // total = max(area) so the saturation count is well-defined.
+ let total = a.width().saturating_mul(a.height()) + .max(b.width().saturating_mul(b.height())); + return Diff { + differing_pixels: total, + max_channel_delta: 255, + total_pixels: total, + mssim: Some(0.0), + diff_image: None, + }; + } + ``` + And add the `impl Diff`: + ```rust + impl Diff { + /// PASS iff `max_channel_delta <= budget.max_channel_delta` + /// AND `differing_pixels <= budget.max_diff_pixels`. MSSIM is advisory and + /// never gates here. The saturated dimension-mismatch sentinel is rejected + /// up front: the two-axis comparison alone can never fail a + /// `(255, u32::MAX)` budget, so "fails EVERY budget" needs this explicit check. + pub fn passes(&self, budget: &FuzzBudget) -> bool { + !self.is_dimension_mismatch() + && self.max_channel_delta <= budget.max_channel_delta + && self.differing_pixels <= budget.max_diff_pixels + } + + /// True for the exact saturated shape `compare` returns on mismatched + /// dimensions (full channel delta, every pixel differing, `mssim` pinned to 0.0). + fn is_dimension_mismatch(&self) -> bool { + self.max_channel_delta == u8::MAX + && self.differing_pixels == self.total_pixels + && self.mssim == Some(0.0) + } + + /// Mozilla `fuzzy-if` "ranges must not include 0": PASS iff the diff meets + /// the `max` budget AND exceeds the `min` floor on at least one axis, so a + /// suddenly-clean render (below an expected difference) is flagged. + pub fn within(&self, min: &FuzzBudget, max: &FuzzBudget) -> bool { + let over_floor = self.max_channel_delta > min.max_channel_delta + || self.differing_pixels > min.max_diff_pixels; + self.passes(max) && over_floor + } + } + ``` + +- [ ] Step — run to verify it passes: + ```sh + cargo test -p buiy_verify --lib metric 2>&1 | tail -12 + ``` + Expected: `test result: ok. 12 passed`. + +- [ ] Step — commit: + ```sh + git add crates/buiy_verify/src/metric.rs + git commit -m "feat(verify): Diff::passes/within + saturated dim-mismatch Diff + +Two-axis gate (both bind); within() pins the fuzzy-if floor so an +unexpectedly-clean render reds. A dimension mismatch folds into a +saturated Diff that fails EVERY budget — the loud-red replacement for +the naive silent 1.0. + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1a.5 — Advisory MSSIM channel (`image-compare`), never gating + +Wires the secondary advisory channel and proves it never participates in `passes`.
+ +**Files:** +- Modify: `crates/buiy_verify/src/metric.rs` (compute `mssim` when `opts.mssim` for the equal-dims path) +- Test: inline `#[cfg(test)] mod tests` + +- [ ] Step — write the failing tests. Add inside `mod tests`: + ```rust + #[test] + fn identity_reports_full_mssim() { + let img = solid(16, 16, [40, 90, 160, 255]); + let d = compare(&img, &img, &CompareOpts::default()); // mssim on by default + assert_eq!(d.differing_pixels, 0); + let s = d.mssim.expect("mssim computed when opts.mssim"); + assert!(s > 0.999, "identical images report MSSIM ~1.0, got {s}"); + } + + #[test] + fn mssim_skipped_when_disabled() { + let img = solid(8, 8, [1, 2, 3, 255]); + let d = compare(&img, &img, &CompareOpts { mssim: false, ..Default::default() }); + assert_eq!(d.mssim, None); + } + + #[test] + fn mssim_never_gates() { + // A global 1-LSB wash: 0 differing pixels (under YIQ threshold) but a + // measurably-below-1 MSSIM. The raw L∞ axis still reads 1, so the budget + // admits exactly that delta (EXACT would fail on the channel axis, not on + // MSSIM); passes() must hold, proving MSSIM adds no gate of its own. + let a = solid(32, 32, [128, 128, 128, 255]); + let b = solid(32, 32, [129, 129, 129, 255]); + let d = compare(&a, &b, &CompareOpts::default()); + assert_eq!(d.differing_pixels, 0, "1-LSB shift is under the YIQ threshold"); + let one_lsb = FuzzBudget { max_channel_delta: 1, max_diff_pixels: 0 }; + assert!(d.passes(&one_lsb), "MSSIM is advisory — never gates passes()"); + } + ``` + +- [ ] Step — run to verify it fails: + ```sh + cargo test -p buiy_verify --lib metric 2>&1 | tail -20 + ``` + Expected: `identity_reports_full_mssim` FAILS (`mssim` is `None`); the other two pass (current behavior matches). + +- [ ] Step — write the minimal implementation. After the per-pixel loop, before constructing the returned `Diff`, compute `mssim`: + ```rust + let mssim = if opts.mssim { + // Advisory MSSIM via image-compare's rgba blended hybrid compare, + // premultiplied against an opaque (white) background — captures are + // opaque, so the background is never sampled in practice.
+ use image_compare::{rgba_blended_hybrid_compare, BlendInput}; + let bg = image::Rgb([255u8, 255, 255]); + rgba_blended_hybrid_compare(BlendInput::from(a), BlendInput::from(b), bg) + .map(|sim| sim.score) + .ok() + } else { + None + }; + ``` + Then use `mssim` in the returned `Diff` (replace the `mssim: None, // wired in 1a.5` line with `mssim,`). + *(Verify the exact `image-compare` API surface — `rgba_blended_hybrid_compare` / `BlendInput::from` / `Similarity::score` — against the resolved 0.5.0 crate docs at impl time; substitute the correct symbol if the 0.5.x API differs, keeping the `Option` contract.)* + +- [ ] Step — run to verify it passes: + ```sh + cargo test -p buiy_verify --lib metric 2>&1 | tail -12 + ``` + Expected: `test result: ok. 15 passed`. + +- [ ] Step — commit: + ```sh + git add crates/buiy_verify/src/metric.rs + git commit -m "feat(verify): advisory MSSIM channel via image-compare + +Diff::mssim from rgba_blended_hybrid_compare, Option (None when +disabled/errored — never silently 0.0). Proven non-gating: a 1-LSB wash +(0 differing pixels) still passes EXACT despite sub-1 MSSIM. + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1a.6 — `diff_image` heatmap on `emit_diff_image` + +Fills the optional triage heatmap (consumed by tier-5 golden HTML in Phase 3). AA pixels yellow, differing pixels red — the pixelmatch palette. + +**Files:** +- Modify: `crates/buiy_verify/src/metric.rs` (allocate + paint `diff_image` when `opts.emit_diff_image`) +- Test: inline `#[cfg(test)] mod tests` + +- [ ] Step — write the failing test. 
Add inside `mod tests`: + ```rust + #[test] + fn diff_image_paints_differing_pixels() { + let a = solid(8, 8, [0, 0, 0, 255]); + let mut b = a.clone(); + b.put_pixel(3, 3, image::Rgba([255, 255, 255, 255])); + let d = compare(&a, &b, &CompareOpts { emit_diff_image: true, mssim: false, ..Default::default() }); + let img = d.diff_image.expect("emit_diff_image fills the heatmap"); + assert_eq!(img.dimensions(), (8, 8)); + // The differing pixel is painted red (pixelmatch diff_color). + assert_eq!(*img.get_pixel(3, 3), image::Rgba([255, 0, 0, 255])); + } + + #[test] + fn diff_image_absent_by_default() { + let a = solid(4, 4, [10, 10, 10, 255]); + let d = compare(&a, &a, &CompareOpts::default()); + assert!(d.diff_image.is_none()); + } + ``` + +- [ ] Step — run to verify it fails: + ```sh + cargo test -p buiy_verify --lib metric 2>&1 | tail -20 + ``` + Expected: `diff_image_paints_differing_pixels` FAILS (`diff_image` is `None`). + +- [ ] Step — write the minimal implementation. Allocate the heatmap before the loop and paint inside it; wire it into the returned `Diff`. Add before the per-pixel loop: + ```rust + let mut diff_image = opts.emit_diff_image.then(|| RgbaImage::new(w, h)); + ``` + Inside the loop, in the over-threshold branch, paint by AA/real classification (replace the `if !is_aa { differing_pixels += 1; }` block): + ```rust + if delta.abs() > max_delta { + let is_aa = !opts.include_aa + && (antialiased(a, x, y, w, h, b) || antialiased(b, x, y, w, h, a)); + if is_aa { + if let Some(out) = &mut diff_image { + out.put_pixel(x, y, image::Rgba([255, 255, 0, 255])); // AA: yellow + } + } else { + differing_pixels += 1; + if let Some(out) = &mut diff_image { + out.put_pixel(x, y, image::Rgba([255, 0, 0, 255])); // diff: red + } + } + } + ``` + Then set `diff_image` in the returned `Diff` (replace `diff_image: None, // wired in 1a.6` with `diff_image,`). 
+ +- [ ] Step — run to verify it passes: + ```sh + cargo test -p buiy_verify --lib metric 2>&1 | tail -12 + ``` + Expected: `test result: ok. 17 passed`. + +- [ ] Step — run the full crate clippy (catch any warnings the gate would reject): + ```sh + cargo clippy -p buiy_verify --all-targets -- -D warnings 2>&1 | tail -8 + ``` + Expected: `Finished`, no warnings. + +- [ ] Step — commit: + ```sh + git add crates/buiy_verify/src/metric.rs + git commit -m "feat(verify): diff_image heatmap on emit_diff_image + +pixelmatch palette: differing pixels red, AA pixels yellow. Off in the +hot reftest path; on for tier-5 golden triage HTML. + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1a.7 — Promote the known-answer suite into `tests/metric.rs` + the constants-pin + +metric.md § Verification specifies the meta-tests live in `crates/buiy_verify/tests/metric.rs` (integration tier), and a checked-in 8×8 pair + its expected `Diff` guards the vendored constants against drift. The inline `src` tests stay (they exercise private helpers); this task adds the public-surface integration suite and the constants tripwire. + +> **Note — `insta` is available (Phase 0.1) but the snapshot upgrade is deferred.** metric.md's constants-pin uses a floats-redacted `insta` snapshot. To keep this task self-contained and the tripwire un-vacuous, the constants pin here is a **plain `assert_eq!` on the exact integer `Diff` fields** (`mssim` asserted with tolerance, not snapshotted). Phase 2 (which introduces the snapshot dump infra) converts this to the redacted `insta` snapshot metric.md calls for; the assertion form is behavior-identical and cannot pass vacuously. + +**Files:** +- Create: `crates/buiy_verify/tests/metric.rs` +- Test: itself (integration test) + +- [ ] Step — write the integration suite (RED: `tests/metric.rs` does not exist): + ```rust + //! Known-answer meta-tests for `buiy_verify::metric` (metric.md § Verification). + //! Pure CPU, no GPU lane. 
+ + use buiy_verify::metric::{compare, CompareOpts, Diff, FuzzBudget}; + use image::{Rgba, RgbaImage}; + + fn solid(w: u32, h: u32, px: [u8; 4]) -> RgbaImage { + RgbaImage::from_pixel(w, h, Rgba(px)) + } + + #[test] + fn identity_zero_diff_full_mssim() { + let img = solid(8, 8, [12, 34, 56, 255]); + let d = compare(&img, &img, &CompareOpts::default()); + assert_eq!(d.differing_pixels, 0); + assert_eq!(d.max_channel_delta, 0); + assert!(d.mssim.unwrap() > 0.999); + assert!(d.passes(&FuzzBudget::EXACT)); + } + + #[test] + fn single_defect_survives_scale() { + for n in [16u32, 256, 2048] { + let a = solid(n, n, [0, 0, 0, 255]); + let mut b = a.clone(); + b.put_pixel(n / 2, n / 2, Rgba([200, 200, 200, 255])); + let d = compare(&a, &b, &CompareOpts { include_aa: true, mssim: false, ..Default::default() }); + assert_eq!(d.differing_pixels, 1, "N={n}"); + assert!(!d.passes(&FuzzBudget::EXACT), "N={n}"); + } + } + + #[test] + fn dimension_mismatch_fails_every_budget() { + let a = solid(4, 4, [0, 0, 0, 255]); + let b = solid(4, 5, [0, 0, 0, 255]); + let d = compare(&a, &b, &CompareOpts::default()); + assert_eq!(d.differing_pixels, d.total_pixels); + assert_eq!(d.max_channel_delta, 255); + assert!(!d.passes(&FuzzBudget { max_channel_delta: 255, max_diff_pixels: u32::MAX })); + } + + /// Constants tripwire: a fixed 8×8 pair yields an exact integer Diff. A + /// pixelmatch-constant drift changes these numbers and reds this test. (Phase 2 + /// upgrades this to the floats-redacted insta snapshot metric.md specifies.) + #[test] + fn vendored_constants_are_pinned() { + let mut a = solid(8, 8, [0, 0, 0, 255]); + let mut b = solid(8, 8, [0, 0, 0, 255]); + // Three deterministic, isolated, non-AA defects of known magnitude. 
+ a.put_pixel(1, 1, Rgba([0, 0, 0, 255])); + b.put_pixel(1, 1, Rgba([255, 0, 0, 255])); // luma-heavy + a.put_pixel(4, 4, Rgba([0, 0, 0, 255])); + b.put_pixel(4, 4, Rgba([0, 255, 0, 255])); + a.put_pixel(6, 2, Rgba([10, 10, 10, 255])); + b.put_pixel(6, 2, Rgba([250, 250, 250, 255])); + let d = compare(&a, &b, &CompareOpts { mssim: false, ..Default::default() }); + // EXPECTED: re-bless intentionally if the algorithm changes. + let Diff { differing_pixels, max_channel_delta, total_pixels, .. } = d; + assert_eq!( + (differing_pixels, max_channel_delta, total_pixels), + (3, 255, 64), + "vendored YIQ/AA constants drifted — re-derive deliberately, do not patch the number", + ); + } + ``` + +- [ ] Step — run to verify it fails / confirm the tuple: + ```sh + cargo test -p buiy_verify --test metric 2>&1 | tail -20 + ``` + Expected: compiles and runs; **if** the `(3, 255, 64)` tuple does not match the real output, the test FAILS and prints the actual tuple. (This is the one task whose expected literal must be confirmed against the real run — see next step.) + +- [ ] Step — bless the pinned tuple. If `vendored_constants_are_pinned` failed only on the tuple value, read the actual `(differing_pixels, max_channel_delta, total_pixels)` from the failure message and replace `(3, 255, 64)` with it (the three defects are isolated and non-AA by construction, so `differing_pixels` should be `3` and `max_channel_delta` `255`; only confirm). Re-run: + ```sh + cargo test -p buiy_verify --test metric 2>&1 | tail -12 + ``` + Expected: `test result: ok. 4 passed`. + +- [ ] Step — run the headless gate slice for the crate: + ```sh + cargo fmt -p buiy_verify -- --check && cargo clippy -p buiy_verify --all-targets -- -D warnings && cargo test -p buiy_verify 2>&1 | tail -15 + ``` + Expected: all green; `metric` (lib) 17 passed, `metric` (test) 4 passed, plus the existing `visual`/`smoke`/`a11y`/`contrast` suites (still present — deleted next task). 
+ +- [ ] Step — commit: + ```sh + git add crates/buiy_verify/tests/metric.rs + git commit -m "test(verify): known-answer meta-suite + constants tripwire for metric + +metric.md § Verification: identity, scale-invariant single defect, +saturated dim-mismatch, and an exact-integer constants pin guarding the +vendored YIQ/AA numbers. (insta-snapshot upgrade deferred to Phase 2.) + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1a.8 — Delete the RMSE metric (`visual.rs`) + migrate its 4 callers + the smoke symbol + +metric.md § Migration step 1: `buiy_verify::visual::compare_images` (RMSE) is deleted; its 4 callers in `tests/visual.rs` move to `metric::compare` + `Diff::passes`; the 5th reference (`smoke.rs`) is deleted. + +**Files:** +- Delete: `crates/buiy_verify/src/visual.rs` +- Modify: `crates/buiy_verify/src/lib.rs` (drop `pub mod visual;`) +- Modify: `crates/buiy_verify/tests/visual.rs` (rewrite the 4 tests onto `metric`) +- Modify: `crates/buiy_verify/tests/smoke.rs` (drop the `visual` import + `visual::compare_images` line) +- Delete: `crates/buiy_verify/tests/fixtures/visual/baseline.png`, `.../tinted.png` (the new tests are in-memory) + +- [ ] Step — rewrite the migrated callers FIRST (RED via deleted symbol). Replace the entire contents of `crates/buiy_verify/tests/visual.rs`: + ```rust + //! Migrated from the deleted RMSE `visual::compare_images` to the unified + //! `buiy_verify::metric` (metric.md § Migration). In-memory fixtures; the old + //! baseline/tinted PNGs are gone. 
+ + use buiy_verify::metric::{compare, CompareOpts, FuzzBudget}; + use image::{Rgba, RgbaImage}; + + fn solid(w: u32, h: u32, px: [u8; 4]) -> RgbaImage { + RgbaImage::from_pixel(w, h, Rgba(px)) + } + + #[test] + fn identical_images_pass_exact() { + let img = solid(16, 16, [30, 60, 90, 255]); + let d = compare(&img, &img, &CompareOpts::default()); + assert_eq!(d.differing_pixels, 0); + assert!(d.passes(&FuzzBudget::EXACT), "identical images pass the exact budget"); + } + + #[test] + fn tinted_image_fails_exact() { + let a = solid(16, 16, [40, 40, 40, 255]); + let b = solid(16, 16, [40, 40, 200, 255]); // uniform blue tint + let d = compare(&a, &b, &CompareOpts { include_aa: true, ..Default::default() }); + assert!(d.differing_pixels > 0, "a uniform tint differs"); + assert!(!d.passes(&FuzzBudget::EXACT), "tinted image fails the exact budget"); + } + + #[test] + fn dimension_mismatch_fails_every_budget() { + let a = solid(2, 2, [0, 0, 0, 255]); + let b = solid(3, 2, [0, 0, 0, 255]); + let d = compare(&a, &b, &CompareOpts::default()); + assert!(!d.passes(&FuzzBudget { max_channel_delta: 255, max_diff_pixels: u32::MAX }), + "mismatched dims saturate and fail even a maximal budget"); + } + + #[test] + fn empty_vs_empty_is_zero_diff() { + let e = RgbaImage::new(0, 0); + let d = compare(&e, &e, &CompareOpts::default()); + assert_eq!(d.total_pixels, 0); + assert!(d.passes(&FuzzBudget::EXACT), "empty-vs-empty observes no difference"); + } + ``` + +- [ ] Step — rewrite `crates/buiy_verify/tests/smoke.rs` (drop the deleted symbol): + ```rust + #[test] + fn re_exports_compile() { + use buiy_verify::{a11y, contrast, metric}; + let _ = metric::compare; + let _ = a11y::snapshot_tree; + let _ = contrast::wcag2_ratio; + } + ``` + *(Confirm the exact `a11y`/`contrast` symbol names against the live crate at impl time — `snapshot_tree`/`wcag2_ratio` are placeholders for whatever the smoke test references today.)* + +- [ ] Step — delete the module and its re-export: + ```sh + git rm 
crates/buiy_verify/src/visual.rs crates/buiy_verify/tests/fixtures/visual/baseline.png crates/buiy_verify/tests/fixtures/visual/tinted.png + ``` + Then edit `crates/buiy_verify/src/lib.rs` so the module block reads: + ```rust + pub mod a11y; + pub mod contrast; + pub mod metric; + ``` + (the lib doc line mentioning "visual regression" also needs updating to "perceptual metric" so the doc gate's wording stays honest.) + +- [ ] Step — run to verify the migration compiles and passes (RED→GREEN: the deleted symbol forced the rewrite): + ```sh + cargo test -p buiy_verify 2>&1 | tail -20 + grep -rn "compare_images\|DiffResult\|mod visual\|::visual" crates/buiy_verify/ 2>&1 + ``` + Expected: all green; `visual` test now 4 passed (migrated), `smoke` 1 passed; the grep returns no matches. + +- [ ] Step — commit: + ```sh + git add -A crates/buiy_verify + git commit -m "refactor(verify): delete RMSE visual::compare_images, migrate callers to metric + +metric.md § Migration step 1: the RMSE metric and DiffResult are gone; +tests/visual.rs and smoke.rs move onto metric::compare + Diff::passes +(in-memory fixtures replace baseline/tinted PNGs). One metric now. + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1a.9 — Deprecate `buiy_core::render::golden::perceptual_diff` (keep body) + +metric.md § Migration step 2: `perceptual_diff` is **deprecated in place** (`buiy_core` cannot depend on `buiy_verify` in its normal graph). Its L1 body stays so the `#[ignore]` GPU re-capture tests still link until they migrate (1a.10 migrates the `text_gpu.rs` subset; the rest defer to Phase 3 goldens). 
+ +**Files:** +- Modify: `crates/buiy_core/src/render/golden.rs` (add `#[deprecated]` above `pub fn perceptual_diff`) +- Modify: `crates/buiy_core/src/lib.rs` (the `pub use` re-export — `#[allow(deprecated)]`) +- Modify: each remaining in-crate `perceptual_diff` caller test file to `#![allow(deprecated)]` + +- [ ] Step — confirm the full caller set the deprecation will warn: + ```sh + grep -rln "perceptual_diff" crates/buiy_core/tests/ crates/buiy_core/src/ + ``` + Expected files: `tests/render_golden_harness.rs`, `tests/text_gpu.rs`, `tests/text_decoration_gpu.rs`, `tests/text_golden_suite_gpu.rs`, `tests/text_selection_caret_gpu.rs`, and `src/lib.rs` (the re-export). (1a.10 removes `text_gpu.rs` from this list.) + +- [ ] Step — write the change. Add the attribute above `pub fn perceptual_diff`: + ```rust + #[deprecated(note = "use buiy_verify::metric::compare; kept only for unmigrated #[ignore] GPU re-capture tests")] + pub fn perceptual_diff(a: &[u8], b: &[u8]) -> f32 { + ``` + At the `src/lib.rs` re-export, suppress the deprecation locally so the prod build stays `-D warnings`-clean: + ```rust + #[allow(deprecated)] + pub use render::golden::{GoldenConfig, perceptual_diff}; + ``` + In each remaining caller test file (`render_golden_harness.rs`, `text_decoration_gpu.rs`, `text_golden_suite_gpu.rs`, `text_selection_caret_gpu.rs` — NOT `text_gpu.rs`, migrated next), add at the top of the file: + ```rust + #![allow(deprecated)] // perceptual_diff is deprecated; these GPU sites migrate to buiy_verify::metric in Phase 3 (tier-5 goldens). + ``` + +- [ ] Step — run the headless gate slice for `buiy_core`: + ```sh + cargo clippy -p buiy_core --all-targets -- -D warnings 2>&1 | tail -10 + ``` + Expected: `Finished`, no `use of deprecated function` warnings (each site is `allow`-gated or migrated next task). 
+ +- [ ] Step — commit: + ```sh + git add crates/buiy_core/src/render/golden.rs crates/buiy_core/src/lib.rs crates/buiy_core/tests/render_golden_harness.rs crates/buiy_core/tests/text_decoration_gpu.rs crates/buiy_core/tests/text_golden_suite_gpu.rs crates/buiy_core/tests/text_selection_caret_gpu.rs + git commit -m "refactor(core): deprecate perceptual_diff in place + +metric.md § Migration step 2: buiy_core cannot depend on buiy_verify in +its normal graph, so perceptual_diff carries a #[deprecated] gravestone +pointing at buiy_verify::metric::compare; its L1 body stays for the +unmigrated #[ignore] GPU re-capture tests (Phase 3). Callers gain a +file-level allow(deprecated) until they migrate. + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1a.10 — Migrate the `text_gpu.rs` re-capture / anti-test sites onto `metric::compare` + +metric.md § "Re-capture determinism / anti-tests": the `text_gpu.rs` sites are **not** goldens — they diff two in-process captures. They migrate now onto `metric::compare` via the dev-dependency cycle (**already landed in Phase 0.2** — `buiy_core → buiy_verify` under `[dev-dependencies]`). The stable (`<`) sites become `passes(&EXACT)`; the two named anti-tests (`> 5e-4` at `:152`, `> 1e-4` at `:271`) become `!compare(..).passes(&EXACT)` via an `assert_differs` helper. The `text_golden_suite_gpu.rs` / `text_decoration_gpu.rs` / `text_selection_caret_gpu.rs` stored-baseline-shaped sites stay on deprecated `perceptual_diff` until Phase 3's `assert_golden` (per the 1a.9 `allow`). + +> **GPU lane.** These are `#[ignore]` tests. They compile under the headless gate but only RUN on the GPU lane (`cargo test -p buiy_core -j 2 -- --ignored --test-threads=1`), which this host (RX 6700 XT) can do. The bridging wrinkle: `readback_rgba` returns the raw RGBA bytes as a `Vec<u8>`; `metric::compare` wants `&RgbaImage`. A test-local `img(&[u8]) -> RgbaImage` wraps the bytes at the known `W`/`H` (`= 128`/`64`). 
+ +**Files:** +- Modify: `crates/buiy_core/tests/text_gpu.rs` (imports; add `img` + `assert_differs` helpers; rewrite the 5 stable `<` sites + 2 `>` anti-tests; drop the file from the deprecated set so no `allow` is needed) + +- [ ] Step — confirm the dev-dependency edge from Phase 0.2 is present (no re-add): + ```sh + grep -n "buiy_verify" crates/buiy_core/Cargo.toml && cargo build -p buiy_core --tests 2>&1 | tail -5 + ``` + Expected: the `buiy_verify = { path = "../buiy_verify" }` line is under `[dev-dependencies]`; `Finished` (no cyclic-dependency error). If the line is absent, add it per Phase 0.2 in this step. + +- [ ] Step — write the migration. In `crates/buiy_core/tests/text_gpu.rs`, change the import line from: + ```rust + use buiy_core::render::golden::{GoldenConfig, perceptual_diff}; + ``` + to: + ```rust + use buiy_core::render::golden::GoldenConfig; + use buiy_verify::metric::{compare, CompareOpts, FuzzBudget}; + ``` + Add the two helpers near the `W`/`H` consts: + ```rust + /// Wrap a raw RGBA readback (W×H) as an `RgbaImage` for `metric::compare`. + fn img(bytes: &[u8]) -> image::RgbaImage { + image::RgbaImage::from_raw(W, H, bytes.to_vec()) + .expect("readback length == W*H*4") + } + + /// The anti-test spelling: two captures must NOT match at the exact budget — + /// proof the input change actually moved pixels (metric.md § anti-tests). + fn assert_differs(a: &[u8], b: &[u8], msg: &str) { + let d = compare(&img(a), &img(b), &CompareOpts::default()); + assert!(!d.passes(&FuzzBudget::EXACT), "{msg}"); + } + ``` + Then rewrite each site (the line numbers are the pre-migration positions; locate by the surrounding assert): + - `:114` stable — `perceptual_diff(&frame_a, &frame_b) < 1e-4` → `compare(&img(&frame_a), &img(&frame_b), &CompareOpts::default()).passes(&FuzzBudget::EXACT)` with message "two fresh captures diverged (must be bit-exact within the pinned rasterizer)". + - `:152` anti-test — `assert!(perceptual_diff(..) 
> 5e-4, ..)` → `assert_differs(&frame_a, &frame_b, "the retint is visible in the framebuffer (byte-identity is not vacuous)")`. + - `:216` stable — `perceptual_diff(..) < 1e-4` → the `passes(&EXACT)` form, message "retained frames render identically". + - `:271` anti-test — `assert!(perceptual_diff(&frame_a, &frame_c) > 1e-4, ..)` → `assert_differs(&frame_a, &frame_c, "stale UVs sampled the filler — the silent corruption § 6.3's un-gated touch pass exists to prevent")`. + - `:359` stable — the `passes(&EXACT)` form, message "two independent captures are byte-stable (deterministic fonts + resolver)". + - `:452` stable — the `passes(&EXACT)` form, message "the storm is invisible: same bytes, same shaping, same pixels". + - `:544` stable — the `passes(&EXACT)` form, dropping the `{diff}` interpolation. + +- [ ] Step — verify the file compiles under the headless gate (the `#[ignore]` bodies must build even though they won't run here): + ```sh + cargo test -p buiy_core --test text_gpu --no-run 2>&1 | tail -10 + grep -n "perceptual_diff" crates/buiy_core/tests/text_gpu.rs 2>&1 + ``` + Expected: `Finished` / `Executable …`; the grep returns no matches (so the file needs no `allow(deprecated)`). + +- [ ] Step — RUN the migrated tests on the GPU lane (this host has the adapter) to prove behavior is preserved: + ```sh + cargo test -p buiy_core --test text_gpu -j 2 -- --ignored --test-threads=1 2>&1 | tail -25 + ``` + Expected: all previously-passing `text_gpu` `#[ignore]` tests still pass; the two anti-test owners (`retint_real_text_leaves_atlas_byte_identical`, `touch_pass_prevents_stale_uv_corruption`) pass. If any stable site now FAILS at `EXACT` where it passed at `< 1e-4`, that is a real finding — the old L1 tolerance was masking sub-threshold drift — investigate per `systematic-debugging`; do **not** widen the budget without a documented reason. 
+ +- [ ] Step — commit: + ```sh + git add crates/buiy_core/tests/text_gpu.rs + git commit -m "refactor(core): migrate text_gpu re-capture/anti-tests to metric::compare + +The #[ignore] GPU re-capture tests reach the unified metric over the +dev-only buiy_core -> buiy_verify edge (landed Phase 0.2). Stable +re-capture sites -> passes(&EXACT); the must-differ anti-tests (:152, +:271) -> !passes(&EXACT) via assert_differs. Verified on the RX 6700 XT +GPU lane. The stored-baseline sites in the other text_*_gpu.rs files +stay on deprecated perceptual_diff until Phase 3. + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1a.11 — Phase 1a gate: full headless gate + supply-chain + GPU lane + +Final verification that Phase 1a is gate-clean before Phase 1b (reftests) builds on `metric`. Verification-only — no commit. + +- [ ] Step — run the full project gate (the "all checks" command): + ```sh + cargo fmt --all -- --check && cargo clippy --workspace --all-targets -- -D warnings && RUSTDOCFLAGS="-D warnings" cargo doc --workspace --no-deps && xvfb-run -a cargo test --workspace 2>&1 | tail -30 + ``` + Expected: all green. The `buiy_verify` `metric` lib + integration suites pass; the migrated `visual`/`smoke` pass; `buiy_core` builds with the deprecation + dev-edge; no `-D warnings` violations. + +- [ ] Step — re-run the supply-chain audit: + ```sh + cargo deny check 2>&1 | tail -12 + ``` + Expected: `advisories ok`, `licenses ok`, `bans ok`, `sources ok`. + +- [ ] Step — run the GPU lane (additive) to confirm the migrated `text_gpu.rs` and the still-deprecated re-capture suites all pass together: + ```sh + cargo test -p buiy_core -j 2 -- --ignored --test-threads=1 2>&1 | tail -30 + ``` + Expected: the full `#[ignore]` GPU suite passes. + +- [ ] Step — no commit. If the gate surfaced any warning or failure, root-cause it per `systematic-debugging` and fix in a follow-up task before declaring Phase 1a done. 
+ +--- + +**Phase 1a exit criteria:** the headless gate is green with the unified `metric` (17 lib + 4 integration meta-tests), the RMSE metric deleted and its callers migrated, `perceptual_diff` deprecated-in-place with every kept caller `allow`-gated; the `text_gpu.rs` re-capture/anti-test sites run bit-exact on the GPU lane; `cargo deny check` clean. Phase 1b (reftests) now has `metric::compare`/`Diff`/`FuzzBudget`/`CompareOpts` to build on. + +--- + +## Phase 1b — Reftest harness + CPU/GPU cross-check + +Realizes [`reftests.md`](../specs/2026-06-15-buiy-verification-design/reftests.md) (Tier 4 + Tier 4.5). Builds `crates/buiy_verify/src/reftest.rs` (new module): `RefCase`/`RefKind`/`RefOutcome`, the `reftest!` macro, `run_reftest` (two captures in one app via `buiy_core::render::golden::capture_to_image`, diffed by `buiy_verify::metric::compare`), the reference-independence structural lint, the CPU-vs-GPU SDF cross-check, and at least two real reftest cases. + +**Depends on:** Phase 0 (the `Dpr` type, the promoted `capture_to_image`, the dev-dep edge) **and** Phase 1a (`buiy_verify::metric` — `Diff`, `FuzzBudget`, `FuzzBudget::EXACT`, `CompareOpts`, `compare`, `Diff::passes`). Phase 1b's pure-CPU meta-tests run in the headless gate; the GPU reftest cases, the SDF cross-check, and the known-good/known-bad pairs are `#[ignore]` and run on the GPU lane. + +> **Scope note — single-reference only in v1.** `reftests.md` § "Reference independence" #3 specs a `RefCase::multi` / `reference: &[fn]` multiple-references aggregation (`Match` = OR over references, `Mismatch` = AND). Phase 1b builds the **single-reference** `RefCase` (`reference: fn(&mut App)`) only — it covers both real cases and the cross-check. Multiple-references is a **deferred follow-up** (see Self-review § gaps); the `evaluate_outcome` split keeps the aggregation logic addable without reworking the engine. 
+ +--- + +### Task 1b.1 — Add `CompareOpts::reftest_default()` to the metric + +The reftest path needs AA-exclusion on, MSSIM kept advisory, and `emit_diff_image` off in the hot loop. A thin constructor on the already-landed `CompareOpts`. + +**Files:** +- Modify: `crates/buiy_verify/src/metric.rs` (add an `impl CompareOpts` block after the `Default` impl) +- Test: `crates/buiy_verify/tests/metric.rs` (append) + +Steps: + +- [ ] **Write the failing test.** Append to `crates/buiy_verify/tests/metric.rs`: + ```rust + #[test] + fn reftest_default_excludes_aa_and_skips_diff_image() { + let opts = buiy_verify::metric::CompareOpts::reftest_default(); + assert!(!opts.include_aa, "reftest excludes AA-sibling pixels"); + assert!(opts.mssim, "MSSIM stays computed (advisory)"); + assert!(!opts.emit_diff_image, "hot reftest path allocates no diff image"); + assert_eq!(opts.threshold, 0.1, "pixelmatch default sensitivity"); + } + ``` + +- [ ] **Run to verify it fails.** + ```sh + cargo test -p buiy_verify --test metric reftest_default_excludes_aa_and_skips_diff_image + ``` + Expected: compile error `no function or associated item named 'reftest_default' found for struct 'CompareOpts'`. + +- [ ] **Write the minimal implementation.** In `crates/buiy_verify/src/metric.rs`, after the `impl Default for CompareOpts` block, add: + ```rust + impl CompareOpts { + /// The reftest-tier options: AA-sibling pixels excluded (two CSS-subset + /// code paths can legitimately differ by one AA pixel on a shared corner), + /// MSSIM advisory-on, and no diff-image allocation in the hot capture loop + /// (the report is emitted with `emit_diff_image` only on failure). + pub fn reftest_default() -> Self { + Self { + threshold: 0.1, + include_aa: false, + mssim: true, + emit_diff_image: false, + } + } + } + ``` + +- [ ] **Run to verify it passes.** + ```sh + cargo test -p buiy_verify --test metric reftest_default_excludes_aa_and_skips_diff_image + ``` + Expected: `test result: ok. 1 passed`. 
+ +- [ ] **Commit.** + ```sh + git add crates/buiy_verify/src/metric.rs crates/buiy_verify/tests/metric.rs + git commit -m "feat(verify): add CompareOpts::reftest_default for tier-4 + +AA-exclusion on, MSSIM advisory, no diff-image alloc in the hot path — +the options run_reftest passes to metric::compare (reftests.md § API). + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1b.2 — `RefKind` enum + `reftest_kind` parser + module skeleton + +The macro stringifies its kind token (`match`/`mismatch`) into a `RefKind`. Land the enum and the `&str → RefKind` constructor, with unit tests. Start the new module file. + +**Files:** +- Create: `crates/buiy_verify/src/reftest.rs` +- Modify: `crates/buiy_verify/src/lib.rs` (add `pub mod reftest;`) +- Test: inline `#[cfg(test)]` in `reftest.rs` + +Steps: + +- [ ] **Create the module file with the failing test.** Write `crates/buiy_verify/src/reftest.rs`: + ```rust + //! Tier 4 — reftests + the CPU-vs-GPU SDF cross-check (reftests.md). + //! + //! A reftest renders a `test` and a `reference` scene with the SAME engine in + //! ONE process and asserts their bitmaps match (`==`) or differ (`!=`), never + //! against a stored baseline — so every platform-variance term (driver SDF + //! rounding, glyph-atlas AA, sRGB encode, clock) cancels in the diff. The + //! harness stores ZERO bytes. GPU-coupled cases are `#[ignore]`; the pairing / + //! aggregation logic and the independence lint are pure-CPU and gate headless. + + /// Whether a [`RefCase`] passes on equality or on difference. + #[derive(Clone, Copy, PartialEq, Eq, Debug)] + pub enum RefKind { + /// Pass iff `test` and `reference` render to the same bitmap within `fuzz`. + Match, + /// Pass iff they render DIFFERENTLY (a `!=` anti-test guards silent no-ops). + Mismatch, + } + + impl RefKind { + /// Parse the `reftest!` macro's kind token (`stringify!($kind)`). + /// Panics on any other token — the macro only ever passes these two. 
+ pub fn reftest_kind(token: &str) -> Self { + match token { + "match" => RefKind::Match, + "mismatch" => RefKind::Mismatch, + other => panic!("reftest! kind must be `match` or `mismatch`, got `{other}`"), + } + } + } + + #[cfg(test)] + mod tests { + use super::*; + + #[test] + fn reftest_kind_parses_both_tokens() { + assert_eq!(RefKind::reftest_kind("match"), RefKind::Match); + assert_eq!(RefKind::reftest_kind("mismatch"), RefKind::Mismatch); + } + + #[test] + #[should_panic(expected = "must be `match` or `mismatch`")] + fn reftest_kind_rejects_garbage() { + let _ = RefKind::reftest_kind("nope"); + } + } + ``` + +- [ ] **Register the module.** In `crates/buiy_verify/src/lib.rs`, add `pub mod reftest;` alongside the existing `pub mod` lines. + +- [ ] **Run to verify it passes (new code, both tests).** + ```sh + cargo test -p buiy_verify --lib reftest::tests + ``` + Expected: `test result: ok. 2 passed`. + +- [ ] **Commit.** + ```sh + git add crates/buiy_verify/src/reftest.rs crates/buiy_verify/src/lib.rs + git commit -m "feat(verify): reftest module skeleton + RefKind parser + +RefKind{Match,Mismatch} and reftest_kind(&str) — the token parser the +reftest! macro calls. reftests.md § Module & public API. + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1b.3 — `RefCase` + `RefOutcome` types + +The data the harness operates on: one pairing (`name`, `kind`, `test`/`reference` scene builders, per-pairing `fuzz`) and the outcome (`passed`, the `Diff`, an optional report path). 
+ +**Files:** +- Modify: `crates/buiy_verify/src/reftest.rs` (add types above `#[cfg(test)]`) +- Test: inline `#[cfg(test)]` in `reftest.rs` + +Steps: + +- [ ] **Write the failing test.** Append inside `mod tests`: + ```rust + #[test] + fn refcase_is_constructible_with_zero_fuzz_default() { + use buiy_verify::metric::FuzzBudget; + use bevy::app::App; + fn noop(_: &mut App) {} + let case = RefCase { + name: "constructs", + kind: RefKind::Match, + test: noop, + reference: noop, + fuzz: FuzzBudget::EXACT, + }; + assert_eq!(case.name, "constructs"); + assert_eq!(case.fuzz, FuzzBudget::EXACT); + } + ``` + +- [ ] **Run to verify it fails.** + ```sh + cargo test -p buiy_verify --lib reftest::tests::refcase_is_constructible_with_zero_fuzz_default + ``` + Expected: compile error `cannot find struct, variant or union type 'RefCase'`. + +- [ ] **Write the minimal implementation.** In `crates/buiy_verify/src/reftest.rs`, after `impl RefKind` (above `#[cfg(test)]`), add: + ```rust + use buiy_verify::metric::{Diff, FuzzBudget}; + use bevy::app::App; + + /// One reftest pairing. `test` and `reference` each build a scene into a + /// fresh, deterministic `App` (spawn entities; do NOT drive frames — + /// `run_reftest` owns the capture loop). Co-locate the expectation with the + /// `#[test]` the `reftest!` macro generates. + pub struct RefCase { + pub name: &'static str, + pub kind: RefKind, + /// Builds the scene exercising the feature under test. + pub test: fn(&mut App), + /// Builds the independent-oracle scene (see "Reference independence"). + pub reference: fn(&mut App), + /// Per-pairing fuzz, à la Mozilla `fuzzy-if`. Default `(0,0)` once the + /// determinism stack is in (determinism.md); widen with a documented reason. + pub fuzz: FuzzBudget, + } + + /// The result of running one [`RefCase`]. + #[derive(Debug)] + pub struct RefOutcome { + pub passed: bool, + pub diff: Diff, + /// On failure, a self-contained local HTML triage report (test | ref | + /// diff). 
Path printed to stderr; never committed. + pub report_path: Option<std::path::PathBuf>, + } + ``` + (Note: inside the `buiy_verify` lib target the crate's own name is not an in-scope path, so `use buiy_verify::...` is a hard unresolved-import compile error — not a lint — unless `lib.rs` declares `extern crate self as buiy_verify;`. If it does not, write `crate::metric::{...}` here and in the inline `mod tests` imports instead.) + +- [ ] **Run to verify it passes.** + ```sh + cargo test -p buiy_verify --lib reftest::tests::refcase_is_constructible_with_zero_fuzz_default + ``` + Expected: `test result: ok. 1 passed`. + +- [ ] **Commit.** + ```sh + git add crates/buiy_verify/src/reftest.rs + git commit -m "feat(verify): RefCase + RefOutcome reftest types + +The pairing (name/kind/test/reference/fuzz) and its outcome +(passed/diff/report_path). reftests.md § Module & public API. + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1b.4 — Pure-CPU pass-decision logic: `evaluate_outcome` + +Split the `Match`/`Mismatch` pass decision out of `run_reftest` into a pure, GPU-free `fn evaluate_outcome(kind, &Diff, &FuzzBudget) -> bool` so it gates headless via the aggregation truth-table meta-test (`reftests.md` § Verification #1). + +**Files:** +- Modify: `crates/buiy_verify/src/reftest.rs` (add `evaluate_outcome` above `#[cfg(test)]`) +- Test: inline `#[cfg(test)]` in `reftest.rs` + +Steps: + +- [ ] **Write the failing truth-table test.** Append inside `mod tests`: + ```rust + use buiy_verify::metric::Diff; + + /// A stub Diff with `n` differing pixels and `max_channel_delta = d`, no MSSIM. 
+ fn stub_diff(n: u32, d: u8) -> Diff { + Diff { + differing_pixels: n, + max_channel_delta: d, + total_pixels: 1024, + mssim: None, + diff_image: None, + } + } + + #[test] + fn match_passes_within_fuzz_fails_outside() { + assert!(evaluate_outcome(RefKind::Match, &stub_diff(0, 0), &FuzzBudget::EXACT)); + assert!(!evaluate_outcome(RefKind::Match, &stub_diff(1, 200), &FuzzBudget::EXACT)); + assert!(evaluate_outcome( + RefKind::Match, + &stub_diff(1, 8), + &FuzzBudget { max_channel_delta: 8, max_diff_pixels: 1 } + )); + } + + #[test] + fn mismatch_passes_outside_fuzz_fails_within() { + assert!(evaluate_outcome(RefKind::Mismatch, &stub_diff(50, 200), &FuzzBudget::EXACT)); + // A scene that did NOT change (zero diff) FAILS a mismatch — the no-op guard. + assert!(!evaluate_outcome(RefKind::Mismatch, &stub_diff(0, 0), &FuzzBudget::EXACT)); + } + ``` + +- [ ] **Run to verify it fails.** + ```sh + cargo test -p buiy_verify --lib reftest::tests::match_passes_within_fuzz_fails_outside + ``` + Expected: compile error `cannot find function 'evaluate_outcome'`. + +- [ ] **Write the minimal implementation.** In `crates/buiy_verify/src/reftest.rs`, above `#[cfg(test)]`: + ```rust + /// The pure pass-decision: `Match` passes iff the diff fits the budget; + /// `Mismatch` passes iff it does NOT (the feature must *do* something). Split + /// out of `run_reftest` so it gates headless via the aggregation truth table — + /// no GPU. The `(0,0)`-floor enforcement for `Mismatch` lives at macro + /// expansion time, so `evaluate_outcome` takes the budget as given. + pub fn evaluate_outcome(kind: RefKind, diff: &Diff, fuzz: &FuzzBudget) -> bool { + match kind { + RefKind::Match => diff.passes(fuzz), + RefKind::Mismatch => !diff.passes(fuzz), + } + } + ``` + +- [ ] **Run to verify both pass.** (`cargo test` accepts only one positional test-name filter; additional filters go after `--` to the test harness.) + ```sh + cargo test -p buiy_verify --lib -- reftest::tests::match_passes_within_fuzz_fails_outside reftest::tests::mismatch_passes_outside_fuzz_fails_within + ``` + Expected: `test result: ok. 2 passed`. 
+ +- [ ] **Commit.** + ```sh + git add crates/buiy_verify/src/reftest.rs + git commit -m "feat(verify): pure evaluate_outcome pass-decision + truth table + +Match passes within budget, Mismatch passes outside it (the silent-no-op +guard). Pure CPU so it gates headless. reftests.md § Verification #1. + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1b.5 — `run_reftest`: two captures in one app, diffed by the metric (GPU) + +The engine: build one painting app, capture `test` then `reference` via `capture_to_image` in the **same** `App` (re-target + re-readback), `compare` with `CompareOpts::reftest_default()`, decide with `evaluate_outcome`, emit a triage report on failure. `#[ignore]` — GPU. + +> **Phase ordering note.** `DeterministicApp` (determinism.md) lands in Phase 3. Phase 1b builds `run_reftest` against the **already-landed** capture seam directly: `capture_to_image(&mut app, &GoldenConfig::deterministic())` on a `capture_app(w, h)`-built app (promoted in Task 1b.6). Phase 3 swaps the `reftest_app` body for `DeterministicApp::new(w, h).build()` in one place (the seam is identical: `&mut App` in, `RgbaImage` out). The `#[ignore]` reftest cases pin behavior across that swap. + +**Files:** +- Modify: `crates/buiy_verify/src/reftest.rs` (add `run_reftest` + `capture_to_image_with` + `emit_report`) +- Create: `crates/buiy_verify/src/support.rs` (the build-seam glue — `reftest_app`, `clear_reftest_scene`) +- Modify: `crates/buiy_verify/src/lib.rs` (add `pub mod support;`) +- Test: `crates/buiy_verify/tests/reftest_engine_gpu.rs` (the self-vs-self / two-different known-good/known-bad pairs) + +Steps: + +- [ ] **Write the failing GPU known-good/known-bad test.** Create `crates/buiy_verify/tests/reftest_engine_gpu.rs`: + ```rust + //! GPU lane (`--ignored`): proves the reftest engine can both PASS and FAIL. + //! reftests.md § Verification #3 — a scene-vs-itself match passes at (0,0); a + //! 
scene-vs-different match fails (guards a vacuous green); a scene-vs-itself + //! mismatch fails. Real adapter (RX 6700 XT here) / pinned lavapipe in CI. + + use bevy::prelude::*; + use buiy_core::layout::{Inset, Length, Sizing, Style}; + use buiy_core::render::components::Background; + use buiy_core::render::ColorToken; + use buiy_core::components::Node; + use buiy_verify::metric::FuzzBudget; + use buiy_verify::reftest::{run_reftest, RefCase, RefKind}; + use std::borrow::Cow; + + /// A single 40×40 fill at (left,8) in `token` color. + fn box_at(app: &mut App, left: f32, token: &'static str) { + let e = app + .world_mut() + .spawn(( + Node, + Style::default() + .absolute() + .inset(Inset { + top: Sizing::Length(Length::px(8.0)), + left: Sizing::Length(Length::px(left)), + ..default() + }) + .width_px(40.0) + .height_px(40.0), + Background { color: ColorToken::Token(Cow::Borrowed(token)) }, + )) + .id(); + app.world_mut().spawn((Node, Style::default())).add_children(&[e]); + } + + fn red_at_8(app: &mut App) { box_at(app, 8.0, "test.fill.a"); } + fn red_at_120(app: &mut App) { box_at(app, 120.0, "test.fill.a"); } + + #[test] + #[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] + fn match_of_scene_with_itself_passes() { + let case = RefCase { + name: "self_match", kind: RefKind::Match, + test: red_at_8, reference: red_at_8, fuzz: FuzzBudget::EXACT, + }; + let outcome = run_reftest(&case); + assert!(outcome.passed, "self-match must pass at (0,0): {:?}", outcome.diff); + } + + #[test] + #[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] + fn match_of_two_different_scenes_fails() { + let case = RefCase { + name: "different_match_fails", kind: RefKind::Match, + test: red_at_8, reference: red_at_120, fuzz: FuzzBudget::EXACT, + }; + let outcome = run_reftest(&case); + assert!(!outcome.passed, "differing scenes must NOT match (vacuous-green guard)"); + assert!(outcome.report_path.is_some(), "failure emits a triage 
report"); + } + + #[test] + #[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] + fn mismatch_of_scene_with_itself_fails() { + let case = RefCase { + name: "self_mismatch_fails", kind: RefKind::Mismatch, + test: red_at_8, reference: red_at_8, fuzz: FuzzBudget::EXACT, + }; + let outcome = run_reftest(&case); + assert!(!outcome.passed, "a scene cannot mismatch itself"); + } + ``` + +- [ ] **Run to verify it fails (compile).** + ```sh + cargo test -p buiy_verify --test reftest_engine_gpu --no-run + ``` + Expected: compile error `cannot find function 'run_reftest' in module 'buiy_verify::reftest'`. + +- [ ] **Write `run_reftest` + helpers.** In `crates/buiy_verify/src/reftest.rs`, above `#[cfg(test)]`: + ```rust + use buiy_verify::metric::{compare, CompareOpts}; + use buiy_core::render::golden::{capture_to_image, GoldenConfig}; + + /// The capture viewport for reftest pairings, in logical px. Both halves are + /// captured at this size in one app run; large enough that a single 40px box + /// and a 120px-shifted twin do not overlap (so a moved box is a real diff). + const REFTEST_LOGICAL: (u32, u32) = (200, 120); + + /// Render BOTH scenes via the buiy_core capture seam in ONE app run and diff + /// with `metric::compare`. Platform variance cancels because both halves share + /// one `wgpu::Device`, driver, atlas, and virtual clock. GPU-coupled. + /// + /// Until the determinism stack lands this builds the app via `reftest_app` + /// (the canonical `capture_app` seam); Phase 3 swaps that one line for + /// `DeterministicApp::build` with an identical `&mut App`→capture contract. 
+ pub fn run_reftest(case: &RefCase) -> RefOutcome { + assert!( + mismatch_floor_ok(case.kind, &case.fuzz), + "reftest `{}`: a Mismatch with a non-(0,0) fuzz floor is vacuous", + case.name + ); + let (w, h) = REFTEST_LOGICAL; + let mut app = crate::support::reftest_app(w, h); + let cfg = GoldenConfig::deterministic(); + + let test_img = capture_to_image_with(&mut app, case.test, &cfg); + let ref_img = capture_to_image_with(&mut app, case.reference, &cfg); + + let diff = compare(&test_img, &ref_img, &CompareOpts::reftest_default()); + let passed = evaluate_outcome(case.kind, &diff, &case.fuzz); + let report_path = if passed { + None + } else { + Some(emit_report(case.name, &test_img, &ref_img, &diff)) + }; + RefOutcome { passed, diff, report_path } + } + + /// Clear the previous scene, spawn `scene`, capture via the buiy_core seam. + fn capture_to_image_with( + app: &mut bevy::app::App, + scene: fn(&mut bevy::app::App), + cfg: &GoldenConfig, + ) -> image::RgbaImage { + crate::support::clear_reftest_scene(app); + scene(app); + capture_to_image(app, cfg) + } + + /// Write a self-contained HTML triage report (test | ref | diff) to a temp + /// path and return it. Phase 3 swaps this for the golden-tier emitter; until + /// then, a minimal three-PNG dump. Never committed. + fn emit_report( + name: &str, + test: &image::RgbaImage, + reference: &image::RgbaImage, + diff: &Diff, + ) -> std::path::PathBuf { + let dir = std::env::temp_dir().join("buiy-reftest"); + let _ = std::fs::create_dir_all(&dir); + let base = dir.join(name); + let _ = test.save(base.with_extension("test.png")); + let _ = reference.save(base.with_extension("ref.png")); + if let Some(img) = &diff.diff_image { + let _ = img.save(base.with_extension("diff.png")); + } + let report = base.with_extension("html"); + let _ = std::fs::write( + &report, + format!( + "

reftest {name} FAILED

differing_pixels={} max_channel_delta={}

\ + ", + diff.differing_pixels, diff.max_channel_delta + ), + ); + eprintln!("reftest {name} report: {}", report.display()); + report + } + ``` + (`mismatch_floor_ok` lands in Task 1b.7; until then drop the leading `assert!` or stub `mismatch_floor_ok` returning `true`. Cleaner: land 1b.7 before running this engine on the GPU lane — the macro/engine guard and the engine compile together. The plan orders 1b.7 after 1b.6's green checkpoint; if 1b.5/1b.6 must compile first, temporarily inline `true` and replace it in 1b.7.) + +- [ ] **Add the build-seam glue.** Create `crates/buiy_verify/src/support.rs`: + ```rust + //! GPU-capture glue for the reftest/golden tiers — the ONE place that names + //! the concrete app builder, so Phase 3 swaps it for `DeterministicApp` in a + //! single edit. `pub` so `tests/` integration tests reach it. + + use bevy::prelude::*; + + /// Build the headless painting app both reftest captures share. Until the + /// determinism builder lands this delegates to the promoted + /// `buiy_core::render::golden::capture_app` (Task 1b.6). + pub fn reftest_app(logical_w: u32, logical_h: u32) -> App { + buiy_core::render::golden::capture_app(logical_w, logical_h) + } + + /// Despawn the previous scene's spawned roots between the two captures so the + /// second scene renders alone. Keeps the camera + render-target entities. + pub fn clear_reftest_scene(app: &mut App) { + let roots: Vec = app + .world_mut() + .query_filtered::, Without)>() + .iter(app.world()) + .collect(); + for e in roots { + app.world_mut().entity_mut(e).despawn(); + } + } + ``` + Register it in `crates/buiy_verify/src/lib.rs`: `pub mod support;`. Confirm `image`'s PNG `save` is available (workspace `image = "0.25"` default features include `png`); if `cargo build` reports `save` missing, add `image = { workspace = true, features = ["png"] }` to `crates/buiy_verify/Cargo.toml` and re-run `cargo deny check`. 
+ +- [ ] **Defer the compile to 1b.6.** `run_reftest`/`reftest_app` reference `capture_app`, added next. Do NOT run yet; the green checkpoint is at the end of Task 1b.6. **No standalone commit** — commit 1b.5 + 1b.6 together at 1b.6's checkpoint to keep the tree green. + +--- + +### Task 1b.6 — Promote the painting-app builder into `render/golden.rs` src (`capture_app`) + +`run_reftest` needs a painting-capable `App` from `src` (not test-only `tests/support`). Promote the canonical single-body plugin stack into `buiy_core::render::golden` as `capture_app`, mirroring the already-promoted `capture_to_image`. Closes the compile from Task 1b.5. + +**Files:** +- Modify: `crates/buiy_core/src/render/golden.rs` (add `capture_app` + `capture_app_scaled`; reuse the exact plugin list `gpu_render_app_with_resolution` uses, `tests/support/mod.rs:168`) +- Test: `crates/buiy_core/tests/render_capture_app_gpu.rs` (proves `capture_app` builds a painting app that captures a non-blank frame; GPU `#[ignore]`) + +Steps: + +- [ ] **Write the failing GPU test.** Create `crates/buiy_core/tests/render_capture_app_gpu.rs`: + ```rust + //! GPU lane: `render::golden::capture_app` builds a painting-capable headless + //! App identical to the test-support `gpu_render_app` stack, so the reftest / + //! golden tiers in buiy_verify build their app from `src` (reftests.md § build + //! seam). #[ignore] — needs a real adapter. 
+ + use bevy::prelude::*; + use buiy_core::layout::{Inset, Length, Sizing, Style}; + use buiy_core::render::components::Background; + use buiy_core::render::golden::{capture_app, capture_to_image, GoldenConfig}; + use buiy_core::render::ColorToken; + use buiy_core::components::Node; + use std::borrow::Cow; + + #[test] + #[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] + fn capture_app_paints_a_non_blank_frame() { + let mut app = capture_app(64, 64); + let e = app + .world_mut() + .spawn(( + Node, + Style::default() + .absolute() + .inset(Inset { + top: Sizing::Length(Length::px(8.0)), + left: Sizing::Length(Length::px(8.0)), + ..default() + }) + .width_px(40.0) + .height_px(40.0), + Background { color: ColorToken::Token(Cow::Borrowed("test.fill.a")) }, + )) + .id(); + app.world_mut().spawn((Node, Style::default())).add_children(&[e]); + + let img = capture_to_image(&mut app, &GoldenConfig::deterministic()); + assert_eq!(img.dimensions(), (64, 64)); + let painted = img.pixels().any(|p| p.0 != [0, 0, 0, 255]); + assert!(painted, "capture_app must paint the box, not a blank frame"); + } + ``` + +- [ ] **Run to verify it fails (compile).** + ```sh + cargo test -p buiy_core --test render_capture_app_gpu --no-run + ``` + Expected: compile error `cannot find function 'capture_app' in module 'buiy_core::render::golden'`. + +- [ ] **Write the minimal implementation.** In `crates/buiy_core/src/render/golden.rs`, add `capture_app` + `capture_app_scaled`, moving the canonical plugin-stack body from `tests/support/mod.rs:168`'s `gpu_render_app_with_resolution` into src (the test-support `gpu_render_app*` then delegate to these — single source so the scaled / test-support builders cannot drift): + ```rust + use bevy::app::App; + + /// Build the canonical headless painting App at a logical viewport size, + /// promoted from `tests/support/mod.rs` into src so `buiy_verify`'s reftest / + /// golden tiers build their app without the test crate. 
NOT finished: + /// `capture_to_image` finishes + drives to quiescence + reads back. + pub fn capture_app(logical_w: u32, logical_h: u32) -> App { + capture_app_scaled(logical_w, logical_h, 1.0) + } + + /// [`capture_app`] at an explicit window scale factor (the DPR-pin builder + /// determinism.md sizes the offscreen target through). Bevy 0.18 + /// `WindowResolution::new` takes PHYSICAL units; pass `logical × scale` plus + /// the override so `resolution.size()` reads back the logical size the view + /// uniform is built from. + pub fn capture_app_scaled(logical_w: u32, logical_h: u32, scale_factor: f32) -> App { + use bevy::window::{Window, WindowPlugin, WindowResolution}; + let resolution = WindowResolution::new( + (logical_w as f32 * scale_factor).round() as u32, + (logical_h as f32 * scale_factor).round() as u32, + ) + .with_scale_factor_override(scale_factor); + + let mut app = App::new(); + app.add_plugins(bevy::MinimalPlugins) + .add_plugins(WindowPlugin { + primary_window: Some(Window { resolution, ..bevy::prelude::default() }), + ..bevy::prelude::default() + }) + .add_plugins(bevy::asset::AssetPlugin::default()) + .add_plugins(bevy::render::RenderPlugin::default()) + .add_plugins(bevy::image::ImagePlugin::default()) + .add_plugins(bevy::camera::CameraPlugin) + .add_plugins(bevy::core_pipeline::CorePipelinePlugin) + .add_plugins(crate::theme::ThemePlugin) + .add_plugins(crate::layout::LayoutPlugin) + .add_plugins(crate::CorePlugin) + .add_plugins(crate::text::BuiyTextPlugin::default()) + .add_plugins(crate::render::BuiyRenderPlugin); + app.init_asset::</* TODO: asset type missing from this plan — copy it from `gpu_render_app_with_resolution` (tests/support/mod.rs:168) */>(); + app + } + ``` + **Verify the exact plugin list against `tests/support/mod.rs:168` at impl time** — the list above mirrors the documented stack but must match the canonical builder byte-for-byte (plugin set + init order, including the `init_asset` type argument). Then make `gpu_render_app_with_resolution` delegate to `capture_app_scaled` so there is one body. The existing `render_golden_harness.rs` GPU test transitively re-verifies the stack. 
+ +- [ ] **Run the new GPU test (GPU lane).** + ```sh + cargo test -p buiy_core --test render_capture_app_gpu -j 2 -- --ignored --test-threads=1 + ``` + Expected: `test result: ok. 1 passed`. + +- [ ] **Run the reftest engine GPU test (now compiles + runs).** + ```sh + cargo test -p buiy_verify --test reftest_engine_gpu -j 2 -- --ignored --test-threads=1 + ``` + Expected: `test result: ok. 3 passed`. (If `run_reftest` still references the not-yet-landed `mismatch_floor_ok`, inline `true` per 1b.5's note, run, then restore in 1b.7.) + +- [ ] **Run the headless gate (no regressions; the pure-CPU reftest meta-tests stay green).** + ```sh + cargo fmt --all -- --check && cargo clippy --workspace --all-targets -- -D warnings && xvfb-run -a cargo test --workspace + ``` + Expected: all green; the `#[ignore]` GPU tests are skipped here. + +- [ ] **Commit (Tasks 1b.5 + 1b.6 together — first green checkpoint for the engine).** + ```sh + git add crates/buiy_core/src/render/golden.rs crates/buiy_core/tests/render_capture_app_gpu.rs crates/buiy_core/tests/support/mod.rs crates/buiy_verify/src/reftest.rs crates/buiy_verify/src/support.rs crates/buiy_verify/src/lib.rs crates/buiy_verify/tests/reftest_engine_gpu.rs crates/buiy_verify/Cargo.toml + git commit -m "feat(verify): run_reftest engine + promote capture_app to src + +run_reftest captures test+reference in ONE app via capture_to_image +(re-target + re-readback) and diffs with metric::compare; the painting-app +builder is promoted from tests/support into render::golden::capture_app so +buiy_verify builds its app from src. GPU known-good/known-bad pairs prove +the harness can both pass and fail (vacuous-green guard). reftests.md §§ +API, Verification #3. + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1b.7 — Mismatch-floor guard (compile-time + run-time) + +A `Mismatch` whose fuzz budget tolerates difference is vacuous (`reftests.md` § Verification #2). 
The `reftest!` macro forces `(0,0)` for `mismatch` at expansion (Task 1b.8); `run_reftest` also rejects a non-`(0,0)` floor on a `Mismatch` at run time as a belt. This task adds the run-time guard + its meta-test. + +**Files:** +- Modify: `crates/buiy_verify/src/reftest.rs` (add `mismatch_floor_ok`; the `assert!` at the top of `run_reftest` from 1b.5 now calls the real fn) +- Test: inline `#[cfg(test)]` in `reftest.rs` + +Steps: + +- [ ] **Write the failing test.** Append inside `mod tests`: + ```rust + #[test] + fn mismatch_requires_zero_fuzz_floor() { + assert!(mismatch_floor_ok(RefKind::Mismatch, &FuzzBudget::EXACT)); + assert!(!mismatch_floor_ok(RefKind::Mismatch, &FuzzBudget { max_channel_delta: 1, max_diff_pixels: 0 })); + assert!(!mismatch_floor_ok(RefKind::Mismatch, &FuzzBudget { max_channel_delta: 0, max_diff_pixels: 1 })); + // Match may carry any budget. + assert!(mismatch_floor_ok(RefKind::Match, &FuzzBudget { max_channel_delta: 8, max_diff_pixels: 4 })); + } + ``` + +- [ ] **Run to verify it fails.** + ```sh + cargo test -p buiy_verify --lib reftest::tests::mismatch_requires_zero_fuzz_floor + ``` + Expected: compile error `cannot find function 'mismatch_floor_ok'` (unless 1b.5 stubbed it `true` — in that case the test fails on the non-`(0,0)` assertions). + +- [ ] **Write the minimal implementation.** In `crates/buiy_verify/src/reftest.rs`, above `#[cfg(test)]`: + ```rust + /// A `Mismatch` budget that tolerates difference is meaningless — its floor + /// must be `(0,0)`. `Match` may carry any widening. Pure CPU so it gates + /// headless (reftests.md § Verification #2); the `reftest!` macro enforces the + /// same at expansion time, and `run_reftest` asserts it as a belt. 
+ pub fn mismatch_floor_ok(kind: RefKind, fuzz: &FuzzBudget) -> bool { + match kind { + RefKind::Mismatch => *fuzz == FuzzBudget::EXACT, + RefKind::Match => true, + } + } + ``` + Confirm the `assert!(mismatch_floor_ok(...))` at the top of `run_reftest` (from 1b.5) now references this fn (restore it if 1b.5 inlined `true`). + +- [ ] **Run to verify it passes.** + ```sh + cargo test -p buiy_verify --lib reftest::tests::mismatch_requires_zero_fuzz_floor + ``` + Expected: `test result: ok. 1 passed`. + +- [ ] **Commit.** + ```sh + git add crates/buiy_verify/src/reftest.rs + git commit -m "feat(verify): reject non-(0,0) fuzz floor on a Mismatch + +A != that tolerates difference is vacuous — mismatch_floor_ok gates it +pure-CPU and run_reftest asserts it as a belt. reftests.md § Verification #2. + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1b.8 — The `reftest!` macro + +Generate one `#[test] #[ignore]` per pairing from `reftest!(kind, fn_ident, test_fn, ref_fn[, fuzz = (d, p)])`, parse the kind token via `RefKind::reftest_kind`, default fuzz to `(0,0)`, and reject a non-`(0,0)` floor on `mismatch` at **compile time** (a `const` assertion). `#[macro_export]`. + +> **Function-name surface (load-bearing).** The generated `fn` cannot be named `match` (a keyword), and two `reftest!(match, …)` in one module would collide. So the macro surface takes the **generated test fn name as an `$fn:ident`** (`reftest!(match, flex_justify_eq_literal, test, ref)`), with `stringify!($fn)` as `RefCase.name`. This is the spelling the real cases in 1b.12 use. + +**Files:** +- Modify: `crates/buiy_verify/src/reftest.rs` (add `macro_rules! reftest` + `@gen` internal rule) +- Test: `crates/buiy_verify/tests/reftest_macro_gpu.rs` (a macro-generated case, `#[ignore]`) + +Steps: + +- [ ] **Write the failing test.** Create `crates/buiy_verify/tests/reftest_macro_gpu.rs`: + ```rust + //! GPU lane: the `reftest!` macro generates an `#[ignore]` test per pairing. + //! 
Uses the same self-match scene as the engine test to prove the macro wires + //! through to a passing run. reftests.md § "The reftest! macro". + + use bevy::prelude::*; + use buiy_core::layout::{Inset, Length, Sizing, Style}; + use buiy_core::render::components::Background; + use buiy_core::render::ColorToken; + use buiy_core::components::Node; + use std::borrow::Cow; + + fn solid_box(app: &mut App) { + let e = app + .world_mut() + .spawn(( + Node, + Style::default() + .absolute() + .inset(Inset { + top: Sizing::Length(Length::px(8.0)), + left: Sizing::Length(Length::px(8.0)), + ..default() + }) + .width_px(40.0) + .height_px(40.0), + Background { color: ColorToken::Token(Cow::Borrowed("test.fill.a")) }, + )) + .id(); + app.world_mut().spawn((Node, Style::default())).add_children(&[e]); + } + + buiy_verify::reftest!(match, macro_self_match, solid_box, solid_box); + ``` + +- [ ] **Run to verify it fails (compile).** + ```sh + cargo test -p buiy_verify --test reftest_macro_gpu --no-run + ``` + Expected: compile error `cannot find macro 'reftest' in crate 'buiy_verify'`. + +- [ ] **Write the minimal implementation.** In `crates/buiy_verify/src/reftest.rs`, at module scope: + ```rust + /// Generate one `#[test] #[ignore]` per reftest pairing — keeps each case at + /// the unit/integration tier under the existing `cargo test -- --ignored` GPU + /// lane, no new CI infra, no manifest file (the type system IS the manifest). + /// + /// Argument order is `kind, generated_fn_name, test_scene, reference_scene`: + /// + /// ```ignore + /// reftest!(match, flex_justify_end, flex_test, literal_offsets_ref); + /// reftest!(mismatch, cv_hidden_hides, cv_hidden, cv_visible); + /// reftest!(match, transform_xy, xfm_test, literal_ref, fuzz = (1, 8)); + /// ``` + /// + /// A non-`(0,0)` fuzz floor on a `mismatch` fails to COMPILE (a `const` + /// assertion), not at runtime — reftests.md § Verification #2. + #[macro_export] + macro_rules! reftest { + // mismatch with explicit fuzz → compile-time reject of a non-zero floor. 
+ (mismatch, $fn:ident, $test:path, $reference:path, fuzz = ($d:literal, $p:literal)) => { + const _: () = assert!( + $d == 0 && $p == 0, + concat!("reftest mismatch `", stringify!($fn), "`: a non-(0,0) fuzz floor is vacuous"), + ); + $crate::reftest!(@gen mismatch, $fn, $test, $reference, ($d, $p)); + }; + // match with explicit fuzz. + (match, $fn:ident, $test:path, $reference:path, fuzz = ($d:literal, $p:literal)) => { + $crate::reftest!(@gen match, $fn, $test, $reference, ($d, $p)); + }; + // no explicit fuzz → (0,0) for either kind. + ($kind:ident, $fn:ident, $test:path, $reference:path) => { + $crate::reftest!(@gen $kind, $fn, $test, $reference, (0, 0)); + }; + // internal: emit the #[ignore] test named $fn. + (@gen $kind:ident, $fn:ident, $test:path, $reference:path, ($d:literal, $p:literal)) => { + #[test] + #[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] + fn $fn() { + let case = $crate::reftest::RefCase { + name: stringify!($fn), + kind: $crate::reftest::RefKind::reftest_kind(stringify!($kind)), + test: $test, + reference: $reference, + fuzz: $crate::metric::FuzzBudget { + max_channel_delta: $d, + max_diff_pixels: $p, + }, + }; + let outcome = $crate::reftest::run_reftest(&case); + assert!( + outcome.passed, + "reftest {} failed: {:?} (report: {:?})", + stringify!($fn), outcome.diff, outcome.report_path + ); + } + }; + } + ``` + +- [ ] **Run to verify it compiles + the generated test is `#[ignore]`.** + ```sh + cargo test -p buiy_verify --test reftest_macro_gpu --no-run + cargo test -p buiy_verify --test reftest_macro_gpu 2>&1 | grep -E "macro_self_match|ignored" + ``` + Expected: compiles; the generated `macro_self_match` is listed as `ignored` in the headless run (`1 ignored`). + +- [ ] **Run the generated case on the GPU lane.** + ```sh + cargo test -p buiy_verify --test reftest_macro_gpu -j 2 -- --ignored --test-threads=1 + ``` + Expected: `test result: ok. 1 passed`. 
+ +- [ ] **Commit.** + ```sh + git add crates/buiy_verify/src/reftest.rs crates/buiy_verify/tests/reftest_macro_gpu.rs + git commit -m "feat(verify): reftest! macro generating #[ignore] GPU cases + +reftest!(kind, fn_ident, test, reference[, fuzz=(d,p)]) emits one +#[test] #[ignore] per pairing; a non-(0,0) floor on a mismatch fails to +COMPILE via a const assert. reftests.md § 'The reftest! macro'. + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1b.9 — Reference-independence structural lint (`assert_reference_independent`) + +A reference must not carry the marker component the feature-under-test exercises (`reftests.md` § "Reference independence", mechanism 2). Build a headless no-GPU `App`, run the case's `reference` scene, query for forbidden component markers, assert none present. The value-encoded-feature caveat is documented as human-review. + +**Files:** +- Modify: `crates/buiy_verify/src/reftest.rs` (add `ComponentMarker`, `IndependenceRule`, `assert_reference_independent`, `default_rules`) +- Test: `crates/buiy_verify/tests/reftest_independence.rs` (pure CPU, **not** `#[ignore]`: the RED/GREEN self-test) + +Steps: + +- [ ] **Write the failing RED/GREEN self-test.** Create `crates/buiy_verify/tests/reftest_independence.rs`: + ```rust + //! Pure-CPU lint self-test (NOT #[ignore]): a reference that ILLEGALLY carries + //! the forbidden marker trips assert_reference_independent (RED); the canonical + //! disjoint reference passes (GREEN). reftests.md § Verification #4. The lint + //! is itself tested, not trusted. 
+ + use bevy::prelude::*; + use buiy_core::layout::{ContentVisibility, Style}; + use buiy_core::layout::components::Containment; + use buiy_core::components::Node; + use buiy_verify::metric::FuzzBudget; + use buiy_verify::reftest::{ + assert_reference_independent, default_rules, ComponentMarker, IndependenceRule, RefCase, RefKind, + }; + + fn empty(_: &mut App) {} + + fn visible_box(app: &mut App) { + app.world_mut().spawn((Node, Style::default())); + } + + fn hidden_box(app: &mut App) { + app.world_mut().spawn(( + Node, + Style::default(), + Containment { content_visibility: ContentVisibility::Hidden, ..default() }, + )); + } + + #[test] + fn legal_reference_passes_the_lint() { + let case = RefCase { + name: "cv_green", kind: RefKind::Mismatch, + test: empty, reference: visible_box, fuzz: FuzzBudget::EXACT, + }; + assert_reference_independent(&case, &default_rules()); + } + + #[test] + #[should_panic(expected = "reference for `content-visibility` illegally contains")] + fn illegal_reference_trips_the_lint() { + let case = RefCase { + name: "cv_red", kind: RefKind::Mismatch, + test: empty, reference: hidden_box, fuzz: FuzzBudget::EXACT, + }; + assert_reference_independent(&case, &[IndependenceRule { + feature: "content-visibility", + forbidden_in_reference: &[ComponentMarker::ContentVisibilityHidden], + }]); + } + ``` + +- [ ] **Run to verify it fails (compile).** + ```sh + cargo test -p buiy_verify --test reftest_independence --no-run + ``` + Expected: compile errors `cannot find type 'ComponentMarker'` / `cannot find function 'assert_reference_independent'`. + +- [ ] **Write the minimal implementation.** In `crates/buiy_verify/src/reftest.rs`, above `#[cfg(test)]`: + ```rust + use bevy::prelude::World; + + /// A structural marker the independence lint can query for in a built world. + /// Each variant maps to a `buiy_core` component whose *presence* proves a + /// reference re-used the feature under test. 
Value-encoded features + /// (`justify-content`, `direction`, `gap` — fields on a shared `Style`) have NO + /// marker here and fall to human review (see `assert_reference_independent`). + #[derive(Clone, Copy, PartialEq, Eq, Debug)] + pub enum ComponentMarker { + ContentVisibilityHidden, + ContainerQuery, + TopLayer, + Translate, + } + + impl ComponentMarker { + /// True iff ANY entity in `world` carries this marker. + fn present_in(self, world: &mut World) -> bool { + use buiy_core::layout::components::Containment; + use buiy_core::layout::{ContainerQuery, ContentVisibility, Translate}; + use buiy_core::components::TopLayer; + match self { + ComponentMarker::ContentVisibilityHidden => world + .query::<&Containment>() + .iter(world) + .any(|c| c.content_visibility == ContentVisibility::Hidden), + ComponentMarker::ContainerQuery => + world.query::<&ContainerQuery>().iter(world).next().is_some(), + ComponentMarker::TopLayer => + world.query::<&TopLayer>().iter(world).next().is_some(), + ComponentMarker::Translate => + world.query::<&Translate>().iter(world).next().is_some(), + } + } + } + + /// What a reference scene is FORBIDDEN to contain, per feature under test. + pub struct IndependenceRule { + pub feature: &'static str, + pub forbidden_in_reference: &'static [ComponentMarker], + } + + /// The registered marker rules for marker-bearing features. Value-encoded + /// features (flex `justify-content`, `direction`, `gap`) are deliberately + /// ABSENT — component-presence cannot distinguish them, so they fall to the + /// PR-time review checklist. `assert_reference_independent` checks every rule + /// it is handed against every reference (`RefCase` carries no feature tag), so + /// a marker-bearing feature with NO rule here goes unchecked — add its rule (or + /// a documented waiver) in the same change as its first pairing, so that + /// independence stays opt-out-impossible for marker features. 
+ pub fn default_rules() -> Vec<IndependenceRule> { + vec![ + IndependenceRule { feature: "content-visibility", forbidden_in_reference: &[ComponentMarker::ContentVisibilityHidden] }, + IndependenceRule { feature: "@container", forbidden_in_reference: &[ComponentMarker::ContainerQuery] }, + IndependenceRule { feature: "top-layer", forbidden_in_reference: &[ComponentMarker::TopLayer] }, + IndependenceRule { feature: "translate", forbidden_in_reference: &[ComponentMarker::Translate] }, + ] + } + + /// Assert the case's `reference` scene carries NONE of the marker components a + /// rule forbids. Builds the reference into a headless **no-GPU** `App` (layout + /// types registered, no render plugins) and queries the built world. Panics + /// naming the feature + marker on violation. + /// + /// **Limit — value-encoded features fall to human review.** Features that are + /// field *values* on a shared `Style`/`Node` (`justify-content`, `direction`, + /// `gap`) have no distinct marker, so this lint cannot see them; mechanism 1 + /// (route the reference through the primitive literal-`Node` layer) keeps THOSE + /// independent, and the PR-time checklist enforces it. This backstops only + /// marker-bearing features. 
+ pub fn assert_reference_independent(case: &RefCase, rules: &[IndependenceRule]) { + let mut app = bevy::app::App::new(); + app.add_plugins(buiy_core::layout::LayoutPlugin); + (case.reference)(&mut app); + let world = app.world_mut(); + for rule in rules { + for &marker in rule.forbidden_in_reference { + assert!( + !marker.present_in(world), + "reference for `{}` illegally contains {:?} — it re-uses the \ + feature under test, so the comparison would pass vacuously \ + (reftests.md § Reference independence)", + rule.feature, marker + ); + } + } + } + ``` + (If `LayoutPlugin` requires render/asset plugins to build, substitute a minimal `App::new()` + `register_type` and direct `world.spawn` through the scene fn; the query only needs the components to exist as data, not the plugin systems. Confirm `ContainerQuery`/`Translate`/`TopLayer`/`Containment` import paths against the live crate.) + +- [ ] **Run to verify both pass (GREEN passes, RED panics-as-expected).** + ```sh + cargo test -p buiy_verify --test reftest_independence + ``` + Expected: `test result: ok. 2 passed`. + +- [ ] **Run the headless gate.** + ```sh + cargo fmt --all -- --check && cargo clippy --workspace --all-targets -- -D warnings && xvfb-run -a cargo test --workspace + ``` + Expected: all green (this lint runs in the headless gate — pure CPU, not `#[ignore]`). + +- [ ] **Commit.** + ```sh + git add crates/buiy_verify/src/reftest.rs crates/buiy_verify/tests/reftest_independence.rs + git commit -m "feat(verify): reference-independence structural lint + +assert_reference_independent builds the reference into a no-GPU App and +rejects any forbidden marker (ContentVisibility/ContainerQuery/TopLayer/ +Translate). Value-encoded features fall to human review (documented). The +lint is itself RED/GREEN-tested. reftests.md §§ Reference independence, +Verification #4. 
+ +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1b.10 — CPU SDF rasterizer oracle (`rasterize_sdf_rect`) + point-probe pin + +Promote the CPU SDF port from three scalar probes (`tests/render_instance.rs:12`) to a full-tile rasterizer that mirrors `shader.wgsl:60` (`sdf_rounded_rect`) and `:76–:79` (`fwidth → smoothstep(-aa, aa, d)` AA). Pin it to the existing point-probes (`reftests.md` § Verification #5) — pure CPU, no GPU. + +**Files:** +- Modify: `crates/buiy_verify/src/reftest.rs` (add a `sdf_oracle` submodule with `rasterize_sdf_rect`) +- Test: `crates/buiy_verify/tests/sdf_oracle.rs` (pure CPU) + +Steps: + +- [ ] **Write the failing pure-CPU test.** Create `crates/buiy_verify/tests/sdf_oracle.rs`: + ```rust + //! Pure-CPU (NOT #[ignore]): the full-tile CPU SDF oracle must reproduce the + //! scalar `d` the existing render_instance.rs point-probes assert — center + //! inside (filled), 2× half-extent outside (empty). Pins the full-tile port to + //! the unit-tested shader formula. reftests.md § Verification #5. + + use bevy::prelude::*; + use buiy_core::render::DrawData; + use buiy_verify::reftest::sdf_oracle::rasterize_sdf_rect; + + #[test] + fn oracle_fills_center_and_clears_far_outside() { + let inset = DrawData::new(Vec2::new(50.0, 25.0), Vec2::new(40.0, 20.0), Color::WHITE, 0.0); + let img = rasterize_sdf_rect(&inset, 200, 100); + assert_eq!(img.dimensions(), (200, 100)); + assert_eq!(img.get_pixel(5, 5).0[3], 0, "far outside the box is empty"); + assert_eq!(img.get_pixel(70, 35).0[3], 255, "inside the inset box is filled"); + } + + #[test] + fn oracle_edge_band_is_partial_alpha() { + // The AA band must be neither fully 0 nor fully 255 for at least one pixel + // (proves the smoothstep coverage step is live) — the property the GPU + // shader's fwidth→smoothstep produces. 
+ let draw = DrawData::new(Vec2::new(50.0, 25.0), Vec2::new(40.0, 20.0), Color::WHITE, 8.0); + let img = rasterize_sdf_rect(&draw, 200, 100); + let has_partial = img.pixels().any(|p| { let a = p.0[3]; a > 0 && a < 255 }); + assert!(has_partial, "a rounded-rect edge must produce AA partial-alpha pixels"); + } + ``` + *(Confirm the `DrawData::new` constructor signature `(position, size, color, radius)` against `crates/buiy_core/src/render/instance.rs` at impl time; adjust field order/names if the real API differs.)* + +- [ ] **Run to verify it fails (compile).** + ```sh + cargo test -p buiy_verify --test sdf_oracle --no-run + ``` + Expected: compile error `could not find 'sdf_oracle' in 'reftest'`. + +- [ ] **Write the minimal implementation.** In `crates/buiy_verify/src/reftest.rs`, add the submodule: + ```rust + /// Pure-CPU per-pixel evaluation of the WGSL SDF + AA coverage step, the + /// golden-free oracle for SDF corner AA (Tier 4.5). The SDF formula is shared + /// 1:1 with `shader.wgsl:60` / `:76-:79` — the port and the shader must stay + /// identical, pinned by the point-probe test that re-derives the values + /// `tests/render_instance.rs:12` already asserts. + pub mod sdf_oracle { + use bevy::math::Vec2; + use buiy_core::render::DrawData; + + /// 1:1 CPU port of `shader.wgsl::sdf_rounded_rect`. + pub fn sdf_rounded_rect(p: Vec2, half_size: Vec2, r: f32) -> f32 { + let q = p.abs() - half_size + Vec2::splat(r); + q.max(Vec2::ZERO).length() + q.x.max(q.y).min(0.0) - r + } + + /// Rasterize one `DrawData` rounded-rect into a `w×h` RGBA tile, mirroring + /// the fragment shader: SDF in logical px, AA via a `fwidth` estimate (the + /// per-pixel SDF gradient via central difference) fed to + /// `smoothstep(-aa, aa, d)`. 
+ pub fn rasterize_sdf_rect(draw: &DrawData, w: u32, h: u32) -> image::RgbaImage { + let half = draw.size * 0.5; + let center = draw.position + half; + let r = draw.radius; + let lin = bevy::color::LinearRgba::from(draw.color); + let srgba = bevy::color::Srgba::from(lin); + let (rr, gg, bb) = ( + (srgba.red * 255.0).round() as u8, + (srgba.green * 255.0).round() as u8, + (srgba.blue * 255.0).round() as u8, + ); + let base_a = draw.color.alpha(); + + let mut img = image::RgbaImage::new(w, h); + for y in 0..h { + for x in 0..w { + let p = Vec2::new(x as f32 + 0.5, y as f32 + 0.5) - center; + let d = sdf_rounded_rect(p, half, r); + let dx = (sdf_rounded_rect(p + Vec2::X, half, r) + - sdf_rounded_rect(p - Vec2::X, half, r)).abs() * 0.5; + let dy = (sdf_rounded_rect(p + Vec2::Y, half, r) + - sdf_rounded_rect(p - Vec2::Y, half, r)).abs() * 0.5; + let aa = (dx + dy).max(1e-4); + let coverage = 1.0 - smoothstep(-aa, aa, d); + let a = (base_a * coverage * 255.0).round().clamp(0.0, 255.0) as u8; + img.put_pixel(x, y, image::Rgba([rr, gg, bb, a])); + } + } + img + } + + /// `smoothstep` matching WGSL `smoothstep(edge0, edge1, x)`. + fn smoothstep(edge0: f32, edge1: f32, x: f32) -> f32 { + let t = ((x - edge0) / (edge1 - edge0)).clamp(0.0, 1.0); + t * t * (3.0 - 2.0 * t) + } + } + ``` + > **AA-estimate fidelity note.** The GPU `fwidth` is a screen-space derivative; the CPU central-difference approximates it. This is *intended* — the cross-check (1b.11) tolerates sub-pixel AA noise via `fuzz`. The oracle catches *implementation* drift (wrong half-extent, radius clamp, premultiply), not a *spec* error in the shared `sdf_rounded_rect` (both paths share it — that is Tier 5's job). + +- [ ] **Run to verify both pass.** + ```sh + cargo test -p buiy_verify --test sdf_oracle + ``` + Expected: `test result: ok. 2 passed`. 
+ +- [ ] **Run the headless gate.** + ```sh + cargo fmt --all -- --check && cargo clippy --workspace --all-targets -- -D warnings && xvfb-run -a cargo test --workspace + ``` + Expected: all green. + +- [ ] **Commit.** + ```sh + git add crates/buiy_verify/src/reftest.rs crates/buiy_verify/tests/sdf_oracle.rs + git commit -m "feat(verify): full-tile CPU SDF oracle (rasterize_sdf_rect) + +Promotes the CPU SDF port from scalar probes to a full-tile rasterizer +mirroring shader.wgsl:60/:76-:79 (sdf_rounded_rect + fwidth→smoothstep). +Pinned to the render_instance.rs point-probes. reftests.md §§ CPU-vs-GPU +cross-check, Verification #5. + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1b.11 — `run_sdf_cross_check`: GPU rounded-rect vs CPU oracle (GPU) + +Render one rounded-rect on the GPU (single-instance capture via `capture_app` + `capture_to_image`), rasterize the same `DrawData` with `rasterize_sdf_rect`, diff via `metric::compare` within a documented `fuzz` budget (`reftests.md` § "CPU-vs-GPU SDF cross-check"). Zero stored bytes. `#[ignore]` — GPU. + +> **`fn(&mut App)` vs a captured `DrawData` (load-bearing).** `RefCase` builders are `fn(&mut App)` (no captured environment), but `run_sdf_cross_check` must spawn a box matching the *runtime* `draw`. So it bypasses the `RefCase` path: it spawns the single primitive inline against `&mut app`, then calls `capture_to_image` directly. + +**Files:** +- Modify: `crates/buiy_verify/src/reftest.rs` (add `run_sdf_cross_check` + `spawn_single_primitive`) +- Test: `crates/buiy_verify/tests/sdf_cross_check_gpu.rs` (GPU `#[ignore]`) + +Steps: + +- [ ] **Write the failing GPU test.** Create `crates/buiy_verify/tests/sdf_cross_check_gpu.rs`: + ```rust + //! GPU lane (`--ignored`): the GPU rounded-rect render and the CPU SDF oracle + //! must agree within a documented AA fuzz budget — the golden-free oracle for + //! SDF corner AA (Tier 4.5). A wrong half-extent / radius-clamp / premultiply + //! 
in the shader would diverge here. reftests.md § CPU-vs-GPU SDF cross-check. + + use bevy::prelude::*; + use buiy_core::render::DrawData; + use buiy_verify::metric::FuzzBudget; + use buiy_verify::reftest::run_sdf_cross_check; + + #[test] + #[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] + fn gpu_rounded_rect_matches_cpu_oracle() { + let draw = DrawData::new(Vec2::new(40.0, 20.0), Vec2::new(120.0, 80.0), Color::WHITE, 16.0); + // AA band tolerance: a sub-pixel rim may differ between the GPU `fwidth` + // derivative and the CPU central-difference — the documented AA residue, + // NOT a regression. Interior + exterior match bit-exactly; only the ~1px + // rim is fuzzed. (Record the measured rim pixel count here after the run.) + let fuzz = FuzzBudget { max_channel_delta: 12, max_diff_pixels: 600 }; + let outcome = run_sdf_cross_check(&draw, &fuzz); + assert!( + outcome.passed, + "GPU vs CPU-SDF oracle diverged: {:?} (report: {:?})", + outcome.diff, outcome.report_path + ); + } + ``` + +- [ ] **Run to verify it fails (compile).** + ```sh + cargo test -p buiy_verify --test sdf_cross_check_gpu --no-run + ``` + Expected: compile error `cannot find function 'run_sdf_cross_check'`. + +- [ ] **Write the minimal implementation.** In `crates/buiy_verify/src/reftest.rs`: + ```rust + /// Render the same single primitive on the GPU (one-instance capture) and on + /// the CPU oracle, diff with the AA-aware metric. Tolerates sub-pixel AA noise + /// via `fuzz`; zero stored bytes. Catches SDF AA / implementation drift no + /// markup reftest can, and is kept PERMANENTLY (one shared analytic + /// `sdf_rounded_rect`). A *spec* error in `sdf_rounded_rect` is invisible here + /// (both paths share it) — that is Tier 5's job. 
+ pub fn run_sdf_cross_check( + draw: &buiy_core::render::DrawData, + fuzz: &FuzzBudget, + ) -> RefOutcome { + let (w, h) = REFTEST_LOGICAL; + let cfg = GoldenConfig::deterministic(); + + let mut app = crate::support::reftest_app(w, h); + crate::support::clear_reftest_scene(&mut app); + spawn_single_primitive(&mut app, draw); + let gpu = capture_to_image(&mut app, &cfg); + + let cpu = sdf_oracle::rasterize_sdf_rect(draw, w, h); + + let diff = compare(&gpu, &cpu, &CompareOpts::reftest_default()); + let passed = diff.passes(fuzz); + let report_path = if passed { + None + } else { + Some(emit_report("sdf_cross_check", &gpu, &cpu, &diff)) + }; + RefOutcome { passed, diff, report_path } + } + + /// Spawn one rounded-rect under a root, mapping `DrawData`'s position/size/ + /// radius to the layout components the extract path turns back into one + /// `DrawData`. (Confirm the exact `Radius` component spelling against + /// `render::components` at impl time.) + fn spawn_single_primitive(app: &mut bevy::app::App, draw: &buiy_core::render::DrawData) { + use bevy::prelude::*; + use buiy_core::components::Node; + use buiy_core::layout::{Inset, Length, Sizing, Style}; + use buiy_core::render::components::{Background, Radius}; + use buiy_core::render::ColorToken; + use std::borrow::Cow; + // The capture path resolves a token; install draw.color under a fixed key. 
+ let key = "sdf.cross.fill"; + { + let mut theme = app.world_mut().resource_mut::<buiy_core::theme::Theme>(); + theme.colors.insert(key.into(), draw.color); + } + let e = app + .world_mut() + .spawn(( + Node, + Style::default() + .absolute() + .inset(Inset { + left: Sizing::Length(Length::px(draw.position.x)), + top: Sizing::Length(Length::px(draw.position.y)), + ..default() + }) + .width_px(draw.size.x) + .height_px(draw.size.y), + Background { color: ColorToken::Token(Cow::Borrowed(key)) }, + Radius::circular(draw.radius), + )) + .id(); + app.world_mut().spawn((Node, Style::default())).add_children(&[e]); + } + ``` + *(`Radius::circular(px)` / `Radius::ZERO` is the real API at `render/components.rs:112-126`. The `buiy_core::theme::Theme` path, `Theme::colors.insert` + `Background`/`ColorToken` spellings are placeholders — confirm against the live `render::components` / `theme` API at impl time. The intent: one box whose extracted `DrawData` matches `draw`.)* + +- [ ] **Run to verify it compiles.** + ```sh + cargo test -p buiy_verify --test sdf_cross_check_gpu --no-run + ``` + Expected: compiles clean. + +- [ ] **Run on the GPU lane.** + ```sh + cargo test -p buiy_verify --test sdf_cross_check_gpu -j 2 -- --ignored --test-threads=1 + ``` + Expected: `test result: ok. 1 passed`. If the AA rim exceeds the budget, adjust `fuzz` in the test with a *measured* comment — do NOT widen `max_channel_delta` past the interior's bit-exact agreement (the interior must match at delta 0; only the ~1px rim is fuzzed). Record the measured rim pixel count in the test comment. **NOTE(review):** `rasterize_sdf_rect` (Task 1b.10) writes straight alpha — `0` for every pixel outside the shape — whereas the Task 1b.6 test treats `[0, 0, 0, 255]` as the blank GPU readback. Unless `metric::compare` composites before diffing (confirm), composite the oracle tile over the capture's clear color first; otherwise every exterior pixel counts as a diff and no rim-sized budget can pass. + +- [ ] **Commit.** + ```sh + git add crates/buiy_verify/src/reftest.rs crates/buiy_verify/tests/sdf_cross_check_gpu.rs + git commit -m "feat(verify): CPU-vs-GPU SDF cross-check (run_sdf_cross_check) + +Renders one rounded-rect on the GPU and via the CPU oracle, diffs within a +documented AA fuzz budget. Zero stored bytes; kept permanently (one shared +analytic SDF). reftests.md § CPU-vs-GPU SDF cross-check. 
+ +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 1b.12 — Two real reftest cases: flex-justify `==` literal offsets; content-visibility `!=` visible + +The harness's payoff. Case 1 (`match`): a flex row with `justify-content: SpaceBetween` of three 40px boxes in a 200px row renders the same as three boxes at literal x = 0, 80, 160 via the primitive layer (reference routes through literal `Node` offsets, NOT flex — mechanism 1). Case 2 (`mismatch`): a subtree with `ContentVisibility::Hidden` renders **differently** from the identical visible subtree. Both `#[ignore]` — GPU; the cv reference's independence is asserted pure-CPU. + +**Files:** +- Create: `crates/buiy_verify/tests/reftest_cases_gpu.rs` (the two `reftest!`-generated cases + scenes + a headless independence assertion for case 2) + +Steps: + +- [ ] **Write the two cases.** Create `crates/buiy_verify/tests/reftest_cases_gpu.rs`: + ```rust + //! GPU lane (`--ignored`): two real Tier-4 reftest pairings. + //! 1. flex `justify-content: SpaceBetween` == three literal-offset boxes + //! (reference routes through the literal-Node layer — NOT flex). `match`. + //! 2. `content-visibility: hidden` != the identical VISIBLE subtree — the + //! `!=` anti-test proving the feature suppresses paint. `mismatch`. + //! reftests.md § Authoring patterns. 
+ + use bevy::prelude::*; + use buiy_core::components::Node; + use buiy_core::layout::components::Containment; + use buiy_core::layout::{ + ContentVisibility, FlexAxis, Inset, JustifyContent, Length, Sizing, Style, + }; + use buiy_core::render::components::Background; + use buiy_core::render::ColorToken; + use std::borrow::Cow; + + fn fill_box(width: f32) -> impl Bundle { + ( + Node, + Style::default().width_px(width).height_px(40.0), + Background { color: ColorToken::Token(Cow::Borrowed("test.fill.a")) }, + ) + } + + fn abs_box(app: &mut App, left: f32) -> Entity { + app.world_mut() + .spawn(( + Node, + Style::default() + .absolute() + .inset(Inset { + left: Sizing::Length(Length::px(left)), + top: Sizing::Length(Length::px(0.0)), + ..default() + }) + .width_px(40.0) + .height_px(40.0), + Background { color: ColorToken::Token(Cow::Borrowed("test.fill.a")) }, + )) + .id() + } + + fn flex_justify(app: &mut App) { + let a = app.world_mut().spawn(fill_box(40.0)).id(); + let b = app.world_mut().spawn(fill_box(40.0)).id(); + let c = app.world_mut().spawn(fill_box(40.0)).id(); + app.world_mut() + .spawn(( + Node, + Style::default() + .flex() + .flex_axis(FlexAxis::Row) + .justify_content(JustifyContent::SpaceBetween) + .width_px(200.0) + .height_px(40.0), + )) + .add_children(&[a, b, c]); + } + + fn literal_offsets(app: &mut App) { + let a = abs_box(app, 0.0); + let b = abs_box(app, 80.0); + let c = abs_box(app, 160.0); + app.world_mut().spawn((Node, Style::default())).add_children(&[a, b, c]); + } + + fn subtree(app: &mut App, hidden: bool) { + let child = app.world_mut().spawn(fill_box(80.0)).id(); + let mut parent = app.world_mut().spawn(( + Node, + Style::default() + .absolute() + .inset(Inset { + left: Sizing::Length(Length::px(20.0)), + top: Sizing::Length(Length::px(20.0)), + ..default() + }) + .width_px(80.0) + .height_px(40.0), + )); + if hidden { + parent.insert(Containment { content_visibility: ContentVisibility::Hidden, ..default() }); + } + let p = 
parent.id(); + app.world_mut().entity_mut(p).add_children(&[child]); + app.world_mut().spawn((Node, Style::default())).add_children(&[p]); + } + + fn cv_visible(app: &mut App) { subtree(app, false); } + fn cv_hidden(app: &mut App) { subtree(app, true); } + + buiy_verify::reftest!(match, flex_justify_eq_literal, flex_justify, literal_offsets); + buiy_verify::reftest!(mismatch, cv_hidden_actually_hides, cv_visible, cv_hidden); + + #[test] + fn cv_hidden_reference_is_independent() { + use buiy_verify::metric::FuzzBudget; + use buiy_verify::reftest::{assert_reference_independent, default_rules, RefCase, RefKind}; + // The REFERENCE in case 2 is `cv_visible`; it must carry NO Hidden marker. + let case = RefCase { + name: "cv_hidden_actually_hides", kind: RefKind::Mismatch, + test: cv_hidden, reference: cv_visible, fuzz: FuzzBudget::EXACT, + }; + assert_reference_independent(&case, &default_rules()); + } + ``` + *(Confirm `.flex()`/`.flex_axis`/`.justify_content`/`FlexAxis`/`JustifyContent` spellings against `crates/buiy_core/src/layout/style.rs`; the test must compile headless first.)* + +- [ ] **Run the independence guard + confirm the GPU cases are ignored headless.** + ```sh + cargo test -p buiy_verify --test reftest_cases_gpu cv_hidden_reference_is_independent + cargo test -p buiy_verify --test reftest_cases_gpu 2>&1 | grep -E "flex_justify_eq_literal|cv_hidden_actually_hides|ignored" + ``` + Expected: `cv_hidden_reference_is_independent` passes; the two `reftest!` cases show as `ignored` headless. + +- [ ] **Run the two real cases on the GPU lane.** + ```sh + cargo test -p buiy_verify --test reftest_cases_gpu -j 2 -- --ignored --test-threads=1 + ``` + Expected: `test result: ok. 2 passed`. 
If `flex_justify_eq_literal` fails by a 1px AA rim on a shared edge, widen its fuzz with `reftest!(match, flex_justify_eq_literal, flex_justify, literal_offsets, fuzz = (8, N))` citing the *measured* rim pixel count `N` (Mozilla discipline — a non-zero `Match` budget needs a measured reason; ranges must not include 0). + +- [ ] **Run the full project gate (headless).** + ```sh + cargo fmt --all -- --check && cargo clippy --workspace --all-targets -- -D warnings && RUSTDOCFLAGS="-D warnings" cargo doc --workspace --no-deps && xvfb-run -a cargo test --workspace + ``` + Expected: all green; the GPU test files contribute only `ignored` counts here. + +- [ ] **Commit.** + ```sh + git add crates/buiy_verify/tests/reftest_cases_gpu.rs + git commit -m "feat(verify): two real Tier-4 reftest cases + +flex justify-content: SpaceBetween == three literal-offset boxes (reference +routes through the primitive layer, NOT flex — independence by construction); +content-visibility: hidden != the visible subtree (the != anti-test). The +cv reference's independence is asserted pure-CPU. reftests.md § Authoring +patterns. + +Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +**Phase 1b exit criteria:** +- **Headless gate green:** the pure-CPU reftest meta-tests (RefKind parse, `evaluate_outcome` truth table, `mismatch_floor_ok`, the independence lint RED/GREEN, the SDF-oracle point-probes, the two cases' independence guard) all pass; every GPU test reports `ignored`. +- **GPU lane green:** `reftest_engine_gpu` (3), `reftest_macro_gpu` (1), `sdf_cross_check_gpu` (1), `reftest_cases_gpu` (2), `render_capture_app_gpu` (1) all pass on the RX 6700 XT. +- **The harness can fail:** `match_of_two_different_scenes_fails` + `mismatch_of_scene_with_itself_fails` confirm a vacuous green is impossible. 
+- **Reference independence is enforced + tested:** `assert_reference_independent` rejects an illegal marker (RED) and passes the disjoint one (GREEN); the value-encoded caveat is documented. +- **Deliberately absent:** the forced-colors `BoxShadow` visual reftest (BLOCKED on the unlanded `BoxShadow` extract/draw path) is NOT authored; multiple-references aggregation is a deferred follow-up. + +--- + +## Phase 2-4 — Task outlines (JIT-expanded) + +These three phases are **outlines**, not bite-sized tasks: each lists files to create/modify (exact paths), the spec signatures to implement, the RED tests to write (intent + assertion), success criteria, and the gate to run. JIT-expand each task into the canonical RED→verify-fail→GREEN→verify-pass→commit five-step shape (per the writing-plans format used in Phase 0/1) when you reach it — the signatures and assertions below are pinned, so the expansion is mechanical. **Phase 2** (snapshots + invariants) is pure-CPU/headless and closes gates #5 + #12; **Phase 3** (determinism + golden persistence) carries the GPU `#[ignore]` parts; **Phase 4** (coverage + forced-colors live wiring + docs flip) composes everything and ends the campaign. + +> **Prerequisite (all three phases):** Phase 0 (deps + `Dpr` + `capture_to_image`) and Phase 1 (`metric` + `reftest`) are landed. Phase 2 needs only Phase 0's `insta` dep + Phase 1's `metric`; Phase 3 needs Phase 0's `Dpr`/`capture_to_image` + Phase 1's `metric` + `reftest`; Phase 4 needs all of Phase 2/3. The headless/GPU gate definitions are in § "Phasing & ordering" above. + +--- + +### Phase 2 — Tier 1-2 snapshots + Tier 3 invariants (pure-CPU, headless) + +**Closes gate #5 (layout snapshots) and gate #12 (property invariants); adds the missing Tier-2 display-list gate.** Every task runs under the **headless gate** — no `#[ignore]`, no GPU adapter. `insta` (Phase 0.1) is the only dep; `proptest` is already a `buiy_verify` dep so Tier 3 adds zero crates. 
The migration discipline is **replace, don't duplicate**: keep each test's scene construction + intent comment, collapse only the trailing field-by-field assert block into one snapshot; asserts pinning a *single named invariant* (e.g. `render_buckets.rs:9` `Shadow.paint_order() < Quad…`, the GC cardinality checks) **stay** as `assert!`/`assert_eq!`. + +#### Task 2.1 — Shared dump primitives: `round` + format-version headers + +- **Files — create:** `crates/buiy_verify/src/snapshot/mod.rs`, `crates/buiy_verify/src/snapshot/dump.rs`. **Modify:** `crates/buiy_verify/src/lib.rs` (add `pub mod snapshot;`). **Test:** `crates/buiy_verify/tests/snapshot_dump.rs` (new). +- **Signatures:** `const ROUND_DP: usize = 2;` and `fn round(f32) -> String` (shared by Tier 1 + Tier 2); `const LAYOUT_DUMP_VERSION: &str = "# buiy-layout-dump v1"` and `const DISPLAY_LIST_DUMP_VERSION: &str = "# buiy-display-list-dump v1"`. +- **RED test:** `round_table` — `round(1.005) == "1.0"`, `round(50.0) == "50"`, `round(-0.001) == "0"` (sub-ULP + negative inputs, snapshots.md § Verification #2). +- **Success:** `round` deterministic across the table; constants exist. (Header tripwire tests land with the dumps in 2.2/2.4.) +- **Gate:** headless. + +#### Task 2.2 — Tier 1: `layout_dump` + `assert_layout_snapshot` + +- **Files — create:** `crates/buiy_verify/src/snapshot/layout.rs`. **Modify:** `crates/buiy_verify/src/snapshot/mod.rs` (re-export). **Test:** `crates/buiy_verify/tests/snapshot_layout.rs` (new self-tests) + the migration in `crates/buiy_core/tests/layout.rs:33`. +- **Signatures:** `pub fn layout_dump(world: &World) -> String` (`(name, position, size)` per entity, sorted by `Name` then `Entity` index, indentation via `ChildOf`, floats via `round`, `LAYOUT_DUMP_VERSION` header, `entity#` fallback for unnamed); `pub fn assert_layout_snapshot(app: &mut App, name: &str)` (one `update()`, then `insta::assert_snapshot!`). 
+- **RED tests (self-tests, plain asserts):** `dump_is_entity_order_invariant` (same fixture in two differently-spawned apps ⇒ `assert_eq!(layout_dump(a), layout_dump(b))`, snapshots.md § Verification #1); `layout_dump_has_version_header` (first line `== LAYOUT_DUMP_VERSION`, § Verification #4). +- **Migration:** `layout.rs:33` — replace the `(layout.size.x - 50.0).abs() < 0.5` pair in `layout_resolves_a_simple_flex_row` with `assert_layout_snapshot(&mut app, "flex_row_basic")` after tagging entities with `Name`. The two `layout_tree_garbage_collects_*` tests **stay** plain `assert_eq!` (cardinality, not geometry). +- **Success:** self-tests green; first run blesses `flex_row_basic.snap`; reviewer diffs the `.snap` against the old `< 0.5` asserts. `.snap` committed under `tests/snapshots/`. `INSTA_UPDATE=no` in CI ⇒ an unreviewed `.snap.new` fails. +- **Gate:** headless. + +#### Task 2.3 — Tier 2: `NameLookup` + `instance_hex` / `assert_instance_hex_snapshot` + +- **Files — create:** `crates/buiy_verify/src/snapshot/display_list.rs` (NameLookup + instance hex first; the dump in 2.4). **Test:** `crates/buiy_verify/tests/snapshot_instance_hex.rs` (new). +- **Signatures:** `pub struct NameLookup(HashMap)` + `NameLookup::from_world(world)` (World-built once so the dump stays World-free, per README § Resolved #5); `pub fn instance_hex(p: &PackedInstance) -> String` (`bytemuck::bytes_of`, host-endian — document the little-endian-x86-64 assumption); `pub fn assert_instance_hex_snapshot(p: &PackedInstance, name: &str)`. +- **RED test:** `hex_round_trips_bytes` — `instance_hex(p)` → parse → `bytemuck::pod_read_unaligned::` → `assert_eq!` reconstructed `== p` (snapshots.md § Verification #3). `PackedInstance` shape per `render/instance.rs:41`. +- **Success:** round-trip green; hex byte-exact and format-version-free. +- **Gate:** headless. 
+ +#### Task 2.4 — Tier 2: `display_list_dump` + `assert_display_list_snapshot` + +- **Files — modify:** `crates/buiy_verify/src/snapshot/display_list.rs`. **Test:** `crates/buiy_verify/tests/snapshot_display_list.rs` (new self-tests). +- **Signatures:** `pub fn display_list_dump(nodes: &ExtractedNodes, names: &NameLookup) -> String` (nodes in `painters_z` stored order — never re-sorted, `extract.rs:141`; color as `token:` when resolvable else `#rrggbbaa`; `clip=none`/`min..max`; `group=|none`; then `pack_view()` `InstanceBuckets` in `BTreeMap` draw order with per-batch `xN` counts; `DISPLAY_LIST_DUMP_VERSION` header); `pub fn assert_display_list_snapshot(nodes: &ExtractedNodes, name: &str, names: &NameLookup)`. +- **RED tests (self-tests):** `display_dump_is_entity_order_invariant` (Name-keyed, two differently-spawned worlds equal); `display_dump_has_version_header`; `missing_token_surfaces_as_magenta` (a `MISSING_TOKEN_FALLBACK`-color node dumps as `#ff00ffff`). +- **Success:** self-tests green; the dump pins per-node set + batched draw order in one artifact; `InstanceBuckets` counts in the readable dump, exact payload in the hex check (complementary). +- **Gate:** headless. + +#### Task 2.5 — Migrate the five render `assert_eq!` tests to snapshots + +- **Files — modify (keep scene + intent comment, collapse trailing asserts):** + - `crates/buiy_core/tests/render_extract.rs` — per-field `node.position/size/color/clip` + the `assemble_context_tree` order `assert_eq!` (`:423`) → one `assert_display_list_snapshot`. + - `crates/buiy_core/tests/render_buckets.rs` — `b.len(q0)`/`total_instances`/`batch[0]`/`PackedPartition` field asserts (`:239`) → display-list dump + `assert_instance_hex_snapshot`. **Keep** `render_buckets.rs:9` `Shadow.paint_order() < Quad…` as a plain assert. + - `crates/buiy_core/tests/render_paint_order.rs` — `assert_eq!(tail, vec![...])` (`:64`) → display-list dump. 
+ - `crates/buiy_core/tests/render_instance.rs` — per-field `PackedInstance` asserts incl. the half-size sign regression → `assert_instance_hex_snapshot`. + - `crates/buiy_core/tests/top_layer.rs` — `partition_top_layer` order asserts → display-list dump. + - Each `use buiy_verify::snapshot::…` (Phase 0 already gave `buiy_core` the `buiy_verify` + `insta` dev-deps). +- **RED step (per file):** first `cargo test` produces `.snap.new` (RED: no committed `.snap`); `cargo insta review` the diff against the old per-field asserts, accept, commit the `.snap`. +- **Migration behavior-preserving (mandatory):** for `render_instance.rs`, after migrating, *re-introduce* the half-size sign bug in a scratch edit and confirm `assert_instance_hex_snapshot` now fails (hex flips) — then revert. Same mutation check for the `assemble_context_tree` order in `render_extract.rs` (snapshots.md § Verification #5). +- **Success:** all migrated tests green with committed `.snap`s; the mutation checks confirm teeth; headless gate green (§ Verification #6). `.snap` under `crates/buiy_core/tests/snapshots/`. +- **Gate:** headless. + +#### Task 2.6 — Tier 2 opt-in: per-timestamp animation snapshots + +- **Files — modify:** `crates/buiy_verify/src/snapshot/display_list.rs`. **Test:** `crates/buiy_verify/tests/snapshot_animation.rs` (new). +- **Signature:** `pub fn assert_display_list_snapshot_at(app: &mut App, name: &str, steps: &[std::time::Duration])` — advances `Time` to each absolute logical time (the landed manual clock, `tests/text_caret_selection.rs:178`), emits `display_list_dump` per step, keyed `@`. Default three timestamps. +- **RED test:** `per_timestamp_is_deterministic` — drive a caret-blink fixture through `&[ZERO, mid, end]` twice on fresh apps; `assert_eq!` the per-step dumps. Opt-in: this fixture enrolls *because* its timing curve is the behavior under test. +- **Success:** one `.snap` per step; a timing regression shows as a diff in exactly the drifted frame. Pure-CPU. 
+- **Gate:** headless. + +#### Task 2.7 — Tier 3 scaffolding: `invariant` module + `Scene` model + generators + +- **Files — create:** `crates/buiy_verify/src/invariant/mod.rs`, `…/invariant/scene.rs`. **Modify:** `crates/buiy_verify/src/lib.rs` (`pub mod invariant;`). **Test:** `crates/buiy_verify/tests/scene_generator_smoke.rs` (bounds) + the predicate proptests in 2.9–2.10. +- **Signatures:** `pub struct SceneNode { name, children, z_index: Option, isolation: bool, top_layer: TopLayer, transform: GenTransform, size: (f32,f32), background: Option }`; `pub struct Scene { roots: Vec }`; `pub struct SceneParams { max_depth=4, max_breadth=4, max_nodes=24, p_stacking=0.3, p_top_layer=0.1 }`; `pub fn arb_scene(p: SceneParams) -> impl Strategy` (`prop_recursive`); `pub fn realize(scene: &Scene) -> ExtractedNodes` (through the production `assemble_context_tree`/`partition_top_layer`, no GPU). `GenTransform` draws `Translate -512..512`, `Rotate 0..2π`, `Scale 0.1..8.0` (identity reachable); `z_index` from `{-1,0,1,2}`; `top_layer` all five variants skewed to `None`; name uniqueness via pre-order rename `n0..nK`. **No new dep.** +- **RED test:** `arb_scene_respects_bounds` — `proptest!`: every realized scene has `node_count <= max_nodes` and `depth <= max_depth`. +- **Success:** generator terminates, shrinks legibly; `realize` round-trips a `Scene` through the CPU paint path into a flat paint-ordered list. +- **Gate:** headless. + +#### Task 2.8 — `buiy_core` surface add: promote `tier_rank` → `top_layer_paint_rank` + +- **Files — modify:** `crates/buiy_core/src/layout/systems.rs:4113` (extract the private `tier_rank` closure body into `pub fn top_layer_paint_rank(t: TopLayer) -> u8`; have the existing layout sort call it). **Modify:** `crates/buiy_core/src/layout/mod.rs` (re-export). **Test:** `crates/buiy_core/tests/layout_stacking.rs` (rank-mapping assert). 
+- **Signature:** `pub fn buiy_core::layout::top_layer_paint_rank(TopLayer) -> u8` mapping `Fullscreen→0, Tooltip→1, Popover→2, Modal→3, None→u8::MAX` (README § Resolved #3 / invariants.md deviation #3 — the *declared* enum order is NOT the paint order, so `#[derive(Ord)]` is wrong; compare via this rank). +- **RED test:** `paint_rank_matches_documented_order` — `assert_eq!(top_layer_paint_rank(Fullscreen), 0)` … `(None) == u8::MAX`; and the layout sort still produces the same tail order (behavior-preserving). +- **Success:** the closure is gone, the `pub fn` is the only rank source; existing `layout_stacking.rs` tests stay green. A small, accepted `buiy_core` surface add. +- **Gate:** headless. + +#### Task 2.9 — Tier 3 predicates #1–#5 (paint-order, transform, top-layer, finiteness, contexts) + +- **Files — create:** `crates/buiy_verify/src/invariant/predicates.rs`. **Modify:** `…/invariant/mod.rs` (the `proptest!` harness + mutation-fixture `#[test]`s). **Test:** `…/mod.rs` (proptest blocks) + `crates/buiy_verify/tests/invariant_mutations.rs`. +- **Signatures (each `pub fn … -> Result<(), Violation>`; `Violation { rule: &'static str, detail: String }`, no `thiserror`):** + - `paint_order_is_total(nodes)` — no entity twice; equal-paint-key pairs keep document order (`extract.rs:139`). + - `transform_roundtrips(t: &GenTransform)` — on the **composed** `Mat4` from `compose_transform` (`systems.rs:3775`, compose `T·R·S·M`), within `EPS`: `translate(d)·translate(-d) ≈ I`; `rotate(2π) ≈ I`; `scale(k)` scales geometry by `k`, off-diagonals stay 0. + - `top_layer_dominates(nodes)` — every `top_layer != None` paints after every normal node; escaped tail ordered by `top_layer_paint_rank` (Task 2.8), **never** the discriminant. + - `all_finite(nodes)` — every `ExtractedNode.size.{x,y} ≥ 0` and finite (`extract.rs:73`). 
+ - `all_finite_packed(packed)` — every field finite and `rect_size[1] ≥ 0` *directly* (y-flip in the view uniform, `instance.rs:46`, deviation #2 — no un-flip). + - `contexts_do_not_interleave(nodes, scene)` — no entity of context A between two of context B. +- **RED tests — proptest blocks** (`ProptestConfig { cases: 256, max_shrink_iters: 4096 }`): `prop_paint_order_total`, `prop_transform_roundtrips`, `prop_top_layer_dominates`, `prop_all_finite`, `prop_contexts_no_interleave`. +- **RED tests — mutation fixtures (teeth):** duplicate-entity ⇒ `Err`; mis-composed `S·R·T` ⇒ `Err`; `Modal` (rank 3) before `Fullscreen` (rank 0) ⇒ `Err` (**fails if anyone "fixes" the predicate to use the discriminant** — pins deviation #3); `NaN`/negative `size.y` ⇒ `Err`; positive packed `rect_size[1]` ⇒ `Ok`; hand-built interleaved list ⇒ `Err`. Each with a passing control. +- **Persistence:** `proptest-regressions/invariant/.txt` is **committed**, not gitignored. +- **Success:** all proptest blocks green at 256 cases; every mutation fixture rejects its one broken relation; controls pass. +- **Gate:** headless. + +#### Task 2.10 — Tier 3 predicate #6: BiDi caret round-trip (`bidi.rs`) + +- **Files — create:** `crates/buiy_verify/src/invariant/bidi.rs`. **Modify:** `…/invariant/mod.rs`. **Test:** same `mod.rs`. +- **Signatures:** `pub fn arb_bidi_text(max_runs, max_run_len) -> impl Strategy` (alternating LTR/RTL runs + neutrals); `pub fn bidi_caret_roundtrips(text: &str, metrics: Metrics) -> Result<(), Violation>` — on the **landed shaper** (`cosmic_text::Buffer` through the production text stack, same path as `tests/text_shaping_snapshots.rs`): **#6a** logical↔visual caret round-trip is identity over every grapheme boundary; **#6b** within one `LayoutRun`, visual caret order is monotonic in logical order for `rtl==false`, strictly reversed for `rtl==true`; **#6c** the run partition covers every codepoint exactly once across `Buffer::layout_runs()`. 
Uses `cosmic_text::Cursor` (`text/components.rs:10`). +- **RED tests:** `prop_bidi_caret_roundtrips` (`proptest` over `arb_bidi_text` ⇒ `.is_ok()`); mutation fixtures — the six shaping-snapshot scripts as known-good controls ⇒ `Ok`; an off-by-one caret-map fixture ⇒ `Err`. +- **Success:** proptest green; controls pass; the off-by-one fixture rejected. **Closes gate #12.** +- **Gate:** headless. + +**Phase 2 exit criteria:** headless gate fully green; gate #5 satisfied by `assert_layout_snapshot`; gate #12 satisfied by the six proptest predicates + mutation fixtures; the new Tier-2 display-list gate live; all migrated `render_*`/`layout.rs`/`top_layer.rs` tests carry committed `.snap`s; `proptest-regressions/` committed. No GPU touched. + +--- + +### Phase 3 — Determinism stack + Tier 5 golden persistence (headless + GPU `#[ignore]`) + +**Realizes the determinism substrate (`DeterministicApp` + `GoldenConfig` extensions + CI lavapipe pin) and the Tier-5 stored-golden corpus.** Two halves: the **pure-CPU half** (config types, golden persistence/ledger/triage — headless gate) and the **GPU half** (`capture_to_image` quiescence, knob-sensitivity, end-to-end goldens — `#[ignore]`, GPU lane). New deps: `toml = "0.8"` + `base64 = "0.22"` — gate on `cargo deny check` before adding. The Ahem `.ttf` is a committed fixture, not a dep. + +> Phase 0 already promoted `capture_to_image` and defined the canonical `Dpr`. Phase 3 *extends* `GoldenConfig` and *hardens* that capture primitive. + +#### Task 3.1 — Extend `GoldenConfig`: `FontMode`, `Dpr` field, MSAA/dither constants + +- **Files — modify:** `crates/buiy_core/src/render/golden.rs` (add `font_mode: FontMode` + `dpr: Dpr` fields; `enum FontMode { Real, Ahem }`; `deterministic()` defaults `font_mode: Ahem`, `dpr: Dpr::X1`; `fidelity()` = `font_mode: Real`; the `CAPTURE_MSAA`/`CAPTURE_DITHER_OFF` consts already landed in Phase 0.4). Struct stays `Copy`. **Test:** `crates/buiy_core/tests/render_golden_config.rs` (new). 
+- **RED tests:** `deterministic_defaults_collapse_font_axis` (`deterministic().font_mode == FontMode::Ahem`, `.dpr == Dpr::X1`); `fidelity_uses_real_font` (`fidelity().font_mode == FontMode::Real`, other knobs pinned). +- **Note:** Phase 0.4's `capture_to_image` ignores `cfg.dpr` (it sizes via the window). Phase 3.3 makes `capture_to_image` assert `scale_factor == cfg.dpr.as_f32()`; the field exists from this task so 3.3 can read it. **MSAA/dither are constants, never per-fixture knobs.** +- **Success:** config compiles `Copy`; defaults are the deterministic values. +- **Gate:** headless. + +#### Task 3.2 — Ahem font asset + registration through the production bytes path + +- **Files — create:** `crates/buiy_core/tests/fixtures/fonts/Ahem.ttf` (committed WPT Ahem, license file beside it, mirroring the `OFL-*.txt` precedent). **Modify:** `crates/buiy_verify/src/determinism/mod.rs` (font wiring helper). **Test:** `crates/buiy_verify/tests/determinism_ahem.rs` (new). +- **Wiring:** register via `FontRegistry::register_bytes("Ahem", ahem_bytes, FontFaceDescriptors::default())` (`registry.rs:165`) under family `"Ahem"`; when `font_mode == Ahem`, make it the **sole resolvable family** for fixture text (disable system-font loading; fixtures run bundled-only, `tests/support/mod.rs:292`/`:306`). +- **RED test:** `ahem_is_sole_family_under_ahem_mode` — register Ahem, resolve a fixture string under `FontMode::Ahem`, assert the resolved face family is `"Ahem"` (no fallback face). Pure-CPU (shaping/resolve, no rasterizer). +- **Success:** Ahem loads through the real bytes path; under Ahem mode it is the only resolvable family. Real vs Ahem is a per-fixture declaration (default Ahem). +- **Gate:** headless. 
+ +#### Task 3.3 — Async-asset flush to quiescence in `capture_to_image` + +- **Files — modify:** `crates/buiy_core/src/render/golden.rs` (the `capture_to_image` body — replace Phase 0.4's bounded fixed-frame loop with: drive `app.update()` until the four conditions hold, bounded by `MAX_SETTLE_FRAMES`, panic naming the unmet condition; add the `scale_factor == cfg.dpr.as_f32()` assertion). **Test:** `crates/buiy_core/tests/render_capture_quiescence.rs` (new). +- **Quiescence conditions (generalizes `wait_for_text_ready`, `support/mod.rs:266`):** (1) `asset_server` pending loads `== 0`; (2) `AtlasWarmupQueue::is_empty()` (`golden.rs:87`); (3) `fonts_ready(atlas, warmup, &keys)` (`golden.rs:82`); (4) `PipelineCache` has no `Queued`/`Compiling` Buiy pipeline. Time advances via `Time::::advance_by`, never `Instant::now()`. +- **RED tests:** **GPU `#[ignore]`** `quiescence_panics_on_never_loading_asset` — inject a never-loading asset (or undrained warmup queue), assert `capture_to_image` **panics naming the unmet condition** (determinism.md § Verification #3 — fail loudly). **Headless** `capture_path_has_no_instant_now` — a grep-lint `#[test]` asserting `Instant::now()` does not appear in the capture path source (§ Verification #4). +- **Success:** quiescence loop terminates deterministically under the fixed clock; the panic fires with the named condition; no wall-clock read. +- **Gate:** the grep-lint headless; the never-loading-asset panic test **GPU lane** (`#[ignore]`). + +#### Task 3.4 — `DeterministicApp` builder (`buiy_verify::determinism`) + +- **Files — create:** `crates/buiy_verify/src/determinism/mod.rs` (+ `lib.rs` `pub mod determinism;`). Re-export `FontMode`/`Dpr` from `buiy_core::render::golden`, do **not** redefine. **Test:** the idempotent/knob-sensitivity tests in 3.5. 
+- **Signatures:** `pub struct DeterministicApp { cfg: GoldenConfig, logical: (u32,u32) }`; `new(w,h)`, `with(cfg)`, `font_mode(m)`, `dpr(d)`; `pub fn build(self) -> App` (a **single-bodied** wrapper over the landed `capture_app_scaled(w, h, cfg.dpr.as_f32())` so it cannot drift; applies `TimeUpdateStrategy::ManualDuration(0)` + manual `Time`; registers Ahem + sole-family when `font_mode==Ahem`; capture camera at `CAPTURE_MSAA`, dither off); `pub fn capture(self, fixture: impl FnOnce(&mut App)) -> RgbaImage` (`build` + spawn + `capture_to_image`). +- **RED test:** `build_applies_dpr_and_msaa` — `DeterministicApp::new(64,64).dpr(Dpr::X2).build()`; assert the window `scale_factor == 2.0` and the capture camera carries `Msaa::Off`. (CPU-observable on the built app; no readback.) +- **Success:** `build` is a thin single-bodied wrapper; knobs applied + asserted; **`run_reftest`'s `support::reftest_app` (Phase 1b) is re-pointed to `DeterministicApp::new(w,h).build()` in this task** — the one-line swap the 1b seam was designed for; the 1b reftest `#[ignore]` cases re-run green to pin behavior across the swap. +- **Gate:** headless (building/inspecting the app); the *capture* is GPU (3.5). The reftest re-run is GPU lane. + +#### Task 3.5 — Determinism self-tests: idempotent capture + knob sensitivity (GPU) + +- **Files — create:** `crates/buiy_verify/tests/determinism_capture.rs` (`#[ignore]`, GPU lane). +- **RED tests (all `#[ignore]`):** `idempotent_capture` — capture the same fixture twice in two fresh `DeterministicApp`s ⇒ `compare(a, b, &CompareOpts::default()).passes(&FuzzBudget::EXACT)` (determinism.md § Verification #1); `knob_sensitivity_dpr` (`dpr(X1)` vs `dpr(X2)` **differ** — `!passes(&EXACT)`); `knob_sensitivity_font_mode` (`Real` vs `Ahem` of a text fixture differ); `knob_sensitivity_msaa` (a fixture with MSAA forced on differs from `CAPTURE_MSAA`). § Verification #2 — the knobs are load-bearing. 
+- **Success:** idempotent capture passes at `(0,0)`; every knob flip changes the bytes. +- **Gate:** **GPU lane**; headless stays green independently. + +#### Task 3.6 — Tier-5 keys + ledger types (`buiy_verify::golden`, pure-CPU) + +- **Files — create:** `crates/buiy_verify/src/golden.rs` (+ `lib.rs` `pub mod golden;`). **Deps:** add `toml = "0.8"` + `base64 = "0.22"` to `[workspace.dependencies]` and `buiy_verify` — **run `cargo deny check` first** (both MIT/Apache-2.0). **Test:** `crates/buiy_verify/tests/golden_keys.rs` (new). +- **Signatures:** `pub struct GoldenKey { widget, state, theme, viewport, backend: Backend, dpr: Dpr }` (imports canonical `Dpr` from `buiy_core::render::golden`); `pub enum Backend { Lavapipe, Vulkan, Gl, Metal, Dx12 }`; `GoldenKey::slug()` (deterministic lower-kebab `widget/state/theme__viewport__backend__dpr`), `GoldenKey::dir(root)`; `pub struct BlessLedger { key, positives: Vec }`; `pub struct Positive { file, blessed_commit, blessed_at, budget: FuzzBudget, reason }` (serde, TOML on disk). +- **RED test:** `key_slug_round_trips` — `proptest`: a `GoldenKey` round-trips through `slug()`→parse; two distinct keys never collide (goldens.md § Verification #6). +- **Success:** key schema is **fixed before any golden is generated** (skia-gold lesson); ledger serializes to human-diffable TOML. +- **Gate:** headless. + +#### Task 3.7 — `check_golden`/`assert_golden` + multi-positive + bless workflow + +- **Files — modify:** `crates/buiy_verify/src/golden.rs`. **Test:** `crates/buiy_verify/tests/golden_persistence.rs` (new; all pure-CPU — synthesize `RgbaImage`s in memory). 
+- **Signatures:** `pub fn check_golden(key, actual: &RgbaImage, budget: &FuzzBudget) -> GoldenOutcome` (compares `actual` against each stored positive via `metric::compare`, passes if *any* `Diff::passes(budget)`; on fail carries the best/smallest-Diff candidate); `pub enum GoldenOutcome { Pass{matched_positive, diff}, Fail{best, report}, Blessed{positive, was_new} }`; `pub fn assert_golden(key, actual: &RgbaImage, budget: &FuzzBudget)` (panics on non-bless `Fail`; on `BUIY_BLESS=1` blesses). Default budget after the determinism pin is `(0,0)`. +- **RED tests (pure-CPU, goldens.md § Verification #1–#4):** `match_and_mismatch`; `multi_positive_any_matches` (bless two positives, image matching the second ⇒ `Pass { matched_positive: 1 }`); `bless_round_trip` (`BUIY_BLESS=1` blesses to a temp corpus, re-run without env passes, ledger records commit/timestamp/reason); `fail_closed_on_empty_corpus` (empty corpus + unset env ⇒ `assert_golden` **panics with the bless instruction**, à la `text_shaping_snapshots.rs:301`). +- **Success:** set-valued match + budget gate work without a renderer; bless env-gated (`BUIY_BLESS`, modeled on `BUIY_ACCEPT_SHAPING`), never a silent overwrite. The stale-positive guard (`golden-prune` bin) is **advisory, deferred**. +- **Gate:** headless. + +#### Task 3.8 — Diff-PNG + self-contained HTML triage report + +- **Files — modify:** `crates/buiy_verify/src/golden.rs` (or `…/golden/report.rs`). **Test:** `crates/buiy_verify/tests/golden_report.rs` (new). +- **Signatures:** `pub struct TriageReport { path, cards: Vec }`; `pub struct TriageCard { key, actual_png, baseline_png, diff_png, diff: Diff, budget: FuzzBudget }`; `open_or_create(path)`, `push(card)`, `write()` (one self-contained HTML: side-by-side expected|actual, JS opacity-slider overlay, diff heatmap — all PNGs base64-inlined). On any `Fail`: write `target/buiy-goldens/.diff.png` (the `Diff::diff_image` heatmap) + append a card to `target/buiy-goldens/report.html`. 
+- **RED test:** `report_is_self_contained` — generate a `TriageReport` with one card, `write()`, assert the HTML **contains the base64 PNGs and references no external URL** (grep for `http`/`src="./"` ⇒ absent). Offline-first (goldens.md § Verification #5). +- **Success:** the report opens straight from CI artifacts, no network/SaaS. Time-boxed-ignore + flaky-auto-ignore are **deferred follow-ups**. +- **Gate:** headless. + +#### Task 3.9 — End-to-end goldens per residue class (GPU) + storage hygiene + +- **Files — create:** `crates/buiy_verify/tests/goldens.rs` (`#[ignore]`, GPU lane) + the blessed corpus under `crates/buiy_verify/tests/goldens/`. **Modify:** `.gitattributes` (add `crates/buiy_verify/tests/goldens/*.png -text`, mirroring the `*.snap -text` pin). +- **RED tests (`#[ignore]`, one per residue class — goldens.md § Verification #7):** SDF corner AA (beyond the CPU cross-check), shadow blur kernel, real-font glyph (one pinned bundled OFL font, `FontMode::Real`), color-emoji (the irreducible golden — pinned bundled emoji font, generous per-fixture budget). Plus the **Ahem layout-class** golden asserting *both* byte-identity across two fresh captures **and** equality to the stored positive (proving the box-font collapse holds). Each captured via `capture_to_image` under `DeterministicApp`, blessed once with `BUIY_BLESS=1 cargo test -p buiy_verify --test goldens -- --ignored --test-threads=1`. +- **Storage:** positives in-git under `tests/goldens/`, reviewed as the PR diff. **Migration trigger named now:** total in-git golden bytes > 50 MB OR positive count > 500 ⇒ move to commit-hash-keyed object storage (a *step, not a crisis*). +- **Success:** each residue class has a blessed positive passing on the pinned rasterizer; the Ahem golden double-asserts the collapse. +- **Gate:** **GPU lane**; headless stays green. 
+ +#### Task 3.10 — CI lavapipe pin (composite action + env contract) + +- **Files — create:** `.github/actions/install-mesa/action.yml` (consume `gfx-rs/ci-build`'s prebuilt lavapipe tarball — no self-build; pin `MESA_VERSION` + `ci-binary-build` tag explicitly). **Modify:** the CI workflow to invoke it on the golden leg and export the env contract. +- **Env contract:** `VK_DRIVER_FILES=$PWD/icd.json` (the action writes its **own** ICD JSON so the loader sees *only* lavapipe); `WGPU_ADAPTER_NAME=llvmpipe`. **NOT set:** `LP_NUM_THREADS` (README § Resolved #6 / determinism.md deviation #1 — determinism comes from the pinned Mesa version). Use `VK_DRIVER_FILES`, not the deprecated `VK_ICD_FILENAMES` (deviation #2). +- **RED test (CI-only smoke, determinism.md § Verification #5):** `lavapipe_adapter_selected` — on the lavapipe leg, assert the selected adapter name contains `llvmpipe` **before any golden runs** (the pin is active, not silently falling back to hardware). +- **Success:** CI goldens run on pinned lavapipe; the local lane runs on the RX 6700 XT but does **not** compare against the lavapipe baseline (cross-rasterizer pixels are non-comparable — it runs the rasterizer-internal determinism/reftest checks). One canonical rasterizer ⇒ one golden per cell; `backend` is a constant today. +- **Gate:** CI (lavapipe leg). The smoke guard is CI-only; locally the GPU lane runs against real hardware for determinism/reftest checks. + +**Phase 3 exit criteria:** headless gate green (config types, golden persistence/ledger/triage, all pure-CPU self-tests); GPU lane green on the RX 6700 XT (idempotent capture at `(0,0)`, knob-sensitivity negatives, end-to-end goldens per residue class, the 1b reftests re-run through `DeterministicApp`); CI lavapipe pin wired with the `llvmpipe` smoke guard; `toml`+`base64` cleared by `cargo deny check`; `.gitattributes` pins `goldens/*.png -text`. 
+ +--- + +### Phase 4 — Coverage-by-construction + forced-colors live wiring + docs flip (headless + GPU `#[ignore]`) + +**Composes every prior tier: a `Fixture` corpus × a global `Matrix` Cartesian product auto-enrolls each fixture across all five tiers, and `forced_colors_analyzer` is re-pointed from hand-built `CatalogPaint` at the live widget catalog (closing gate #11's live-catalog half).** New dep: `inventory = "0.3"` — gate on `cargo deny check`. `insta`'s `glob` feature (Phase 0.1) drives the snapshot tiers' fixture-dir fan-out. The pure-CPU coverage self-tests run on the **headless gate**; only `coverage_golden` is `#[ignore]` GPU. + +#### Task 4.1 — `Fixture` corpus + `fixture!` macro + `inventory` catalog + +- **Files — create:** `crates/buiy_verify/src/coverage/mod.rs`, `…/coverage/fixture.rs` (+ `lib.rs` `pub mod coverage;`); the first fixtures under `crates/buiy_verify/fixtures/<widget>/<state>.rs` (start with `button/resting.rs` from the `hello_button` spawn). **Deps:** add `inventory = "0.3"` — **run `cargo deny check` first**. **Test:** the `verify_catalog_matches_glob` self-test in 4.5. +- **Signatures:** `pub struct Fixture { name: &'static str, state: &'static str, spawn: fn(&mut App) }`; `pub fn catalog() -> &'static [Fixture]` (`inventory`-collected); the `fixture!` macro emitting **both** an `inventory::submit!` **and** a glob-discoverable file. The `spawn` MUST spawn a `Camera2d` and tag the widget root with a `Name`. State (resting/hover/focus/pressed/disabled) is **per-fixture** (one file per state), encoded by spawning the widget already in that state. +- **RED test:** (deferred to 4.5's `verify_catalog_matches_glob`). +- **Success:** a fixture is the catalog row authored once, the same `fn(&mut App)` every tier consumes; `catalog()` enumerates via `inventory`. +- **Gate:** headless. + +#### Task 4.2 — `Matrix` + `Cell` + `CoverageKey` + +- **Files — create:** `crates/buiy_verify/src/coverage/matrix.rs`, `…/coverage/key.rs`.
**Test:** the `verify_keys_unique` / `verify_cell_count_under_ceiling` self-tests in 4.5. +- **Signatures:** `pub struct Matrix { themes: Vec<ThemeAxis>, viewports: Vec<Viewport>, forced_colors: Vec<bool>, dprs: Vec<Dpr> }` (imports canonical `Dpr`, **not** a local `f32`); `enum ThemeAxis { Light, ForcedColors }` (`build() -> Theme` via `default_light_theme`/`forced_colors_theme`; `key()`); `struct Viewport { w, h, key }`; `Matrix::ci_default()` (≈ 2 themes × 3 viewports × 2 fc × 2 dpr = 24 cells/fixture); `Matrix::cells() -> impl Iterator<Item = Cell>` (stable axis-declaration order); `struct Cell { theme, viewport, forced_colors, dpr }`; `pub struct CoverageKey { widget, state, theme, viewport, forced_colors, dpr: Dpr, backend: Backend }` derives `Eq + Hash` (**because `dpr: Dpr` is `Eq + Hash`** — the old `f32` made this impossible, the bug this fix unblocks); `CoverageKey::for_cell(fx, cell, backend)`; `CoverageKey::stem()` (e.g. `button.resting.forced.desktop.fc1.dpr2.lavapipe`). +- **RED test:** (keying self-tests in 4.5). +- **Success:** `CoverageKey` derives `Eq + Hash` so keys collect into a `HashSet`; `backend` is `cpu` for Tiers 1–3, the rasterizer name for GPU — reserved now to avoid the painful retrofit. +- **Gate:** headless. + +#### Task 4.3 — `enroll_all` + `build_app` (the one-body-per-tier driver) + +- **Files — create:** `crates/buiy_verify/src/coverage/enroll.rs`. **Test:** the `enrollment_fan_out` self-test in 4.5. +- **Signatures:** `pub fn build_app(fx: &Fixture, cell: &Cell) -> App` (`DeterministicApp` with `cell.theme.build()` installed, viewport + `DeterministicApp::dpr(cell.dpr)` pinned — the `Dpr`→`f32` conversion happens **here** via `cell.dpr.as_f32()` — `forced_colors` set on `UserPreferences`, then the fixture spawned); `pub fn enroll_all(matrix: &Matrix, body: impl Fn(App, CoverageKey))` (drives `body` across `catalog() × matrix.cells()`). +- **RED test:** (fan-out totality in 4.5).
+- **Success:** each tier is a thin caller of `enroll_all`; no per-widget test code exists; the `Dpr` milliscale stays the key, the window `scale_factor` is the derived `f32`. +- **Gate:** headless. + +#### Task 4.4 — Per-tier enrollment tests (the five `coverage_*.rs` drivers) + +- **Files — create:** `crates/buiy_verify/tests/coverage_layout.rs` (Tier 1, gate #5), `coverage_display_list.rs` (Tier 2), `coverage_invariants.rs` (Tier 3), `coverage_golden.rs` (Tier 5, `#[ignore]` GPU). Each is a `#[test]` calling `enroll_all(&Matrix::ci_default(), |app, key| { … })`. +- **Bodies:** layout → `assert_layout_snapshot(&key.stem(), &app)` (the `insta` tiers additionally use `glob!` over the fixture dir as the collection-time fan-out); display-list → `assert_display_list_snapshot(&key.stem(), …)`; invariants → for each Tier-3 predicate, assert on the realized scene; golden → `let img = capture_to_image(&mut app, &cfg); assert_golden(&key-derived GoldenKey, &img, &budget_for(&key))`. +- **RED step:** first run of the snapshot drivers produces `.snap.new` per cell ⇒ `cargo insta review` → accept → commit. The golden driver is GPU `#[ignore]`, blessed via `BUIY_BLESS=1`. +- **Success — the decisive property:** adding `fixtures/slider/resting.rs` enrolls a slider into **all five tiers at once** **with no edit to any test file**. +- **Gate:** `coverage_layout`/`_display_list`/`_invariants` headless; `coverage_golden` **GPU lane**. + +#### Task 4.5 — Coverage harness self-tests + +- **Files — create:** `crates/buiy_verify/tests/coverage_meta.rs` (all pure-CPU, headless). 
+- **RED tests (coverage.md § Verification #1–#5):** `verify_catalog_matches_glob` (`catalog()` and the `glob!` walk enumerate the identical `name×state` set); `verify_keys_unique` (over `catalog() × Matrix::ci_default().cells()`, every `stem()` unique and round-trips; keys collect into a `HashSet`); `verify_cell_count_under_ceiling` (product size below the named CI ceiling); `enrollment_fan_out` (a stub tier body pushing its `CoverageKey` into a `Vec` asserts `enroll_all` invokes the body exactly `fixtures × cells` times with **no duplicate key**). +- **Success:** enumeration/keying verified independent of any tier's pass/fail. +- **Gate:** headless. + +#### Task 4.6 — `forced_colors_analyzer` live-catalog producer (gate #11) + +- **Files — create:** `crates/buiy_verify/src/coverage/forced_colors.rs`. **Test:** `crates/buiy_verify/tests/coverage_forced_colors.rs` (new, pure-CPU, gate #11). +- **Signature:** `pub fn live_catalog_paint() -> Vec<CatalogPaint>` — walk the live catalog: for each fixture build its app, query the spawned `Background`/`Border`/`Outline` (+ shadow-only-delta) off the `Name`-tagged root, project into the **existing** `CatalogPaint`. The analyzer (`analyze_forced_colors`/`analyze_shadow_only`, `forced_colors_analyzer.rs:51`/`:89`) is called **unchanged** — only its *input source* moves from hand-built fixtures (`tests/render_forced_colors_analyzer.rs:11`) to the live tree (the live components exist: `buiy_widgets/src/button.rs:18,47` spawns `Background`/`Border`/`Corners`/`Radius`, closing follow-ups.md:469–473).
+- **RED tests:** `live_catalog_has_no_forced_colors_violations` (`analyze_forced_colors(&live_catalog_paint(), &forced_colors_theme()).is_empty()` + `analyze_shadow_only(...).is_empty()`); `broken_fixture_produces_violation` (a `#[cfg(test)]`-only fixture painting a **brand** token under forced-colors **must** produce a `NonSystemColor` violation through `live_catalog_paint` — proves the producer observes *real paint*; excluded from the real `catalog()` so it never reds production). +- **Success:** gate #11's live-catalog half falls out of the same enrollment; every new widget auto-enrolls into the forced-colors scan by construction. +- **Gate:** headless. + +> **BLOCKED — forced-colors `BoxShadow` *visual* reftest (do NOT plan as runnable).** The residual forced-colors *visual* half — the `BoxShadow` draw-skip under `forced-colors: active` — is a Tier-4 reftest **blocked on the unlanded `BoxShadow` extract/draw path** (`extract_buiy_nodes` has no `BoxShadow` branch yet; follow-ups.md:474–478). Coverage only enrolls the forced-colors **mode** (`forced_colors: true` cell) into every tier so the visual reftest is *matrixed* once it exists. The structured `analyze_forced_colors`/`analyze_shadow_only` gate (Task 4.6) covers the rest **now**. Track the visual reftest as a follow-up keyed to the `BoxShadow` pipeline landing; **do not author a runnable RED test for it**. + +#### Task 4.7 — Docs flip (spec draft→active; README; follow-ups; verification gate progress) + +- **Files — modify:** + - `docs/specs/2026-06-15-buiy-verification-design/README.md` + each child file (`metric.md`, `snapshots.md`, `invariants.md`, `reftests.md`, `goldens.md`, `determinism.md`, `coverage.md`) — flip `**Status:** draft` → `**Status:** active` (or `implemented`, matching the T9 closure precedent), with a one-line "landed" note per file pointing at this plan/commits. **`metric.md` additionally records the pixelmatch-vendoring deviation** (it no longer depends on the crate). 
+ - `docs/README.md` — line 49 catalog entry: `[draft]` → `[active]`; add this plan under **Foundation → Plans** (`[landed]`). + - `docs/plans/follow-ups.md` — mark the forced-colors live-catalog seam (lines 462–473) **resolved**; leave the `BoxShadow` draw-skip visual reftest (lines 474–478) **open**, cross-referenced to this campaign; record the deferred golden primitives (time-boxed ignore, flaky auto-ignore, `golden-prune` advisory bin, object-store migration trigger) **and the deferred reftest multiple-references aggregation** as named follow-ups. + - `docs/specs/2026-05-07-buiy-foundation/verification.md` — mark gate progress in the CI-gates table: **#2** (visual — relational reftests + residue goldens + metric + determinism landed), **#5** (layout snapshots — `assert_layout_snapshot` landed), **#11** (forced-colors — live-catalog token-flow scan landed; the *visual* `BoxShadow` half still blocked — note it), **#12** (property tests — the six invariants + mutation fixtures landed). +- **Verify step:** docs-only — the "test" is a consistency check: `grep` confirms no child file still says `draft`; the `docs/README.md` tag matches; the verification.md rows reference the landed mechanisms. Run the **headless gate** once more (`RUSTDOCFLAGS="-D warnings" cargo doc` catches stale intra-doc links if doc-comments reference the new modules). +- **Success:** the spec is `active`/`implemented`, not contradicted by the code; the docs index current; gates #2/#5/#11/#12 show landed mechanisms; follow-ups records what was deferred + still blocked. +- **Gate:** headless (the full gate, as a final closeout run). 
+ +**Phase 4 exit criteria:** headless gate green (the four coverage self-tests, the three pure-CPU enrollment drivers, the forced-colors live-catalog scan + its broken-fixture teeth test); GPU lane green (`coverage_golden`); `inventory` cleared by `cargo deny check`; adding one fixture file demonstrably enrolls it across all five tiers with zero test-file edits; the docs flip complete and gates #2/#5/#11/#12 reflect the landed mechanisms. The `BoxShadow` forced-colors *visual* reftest remains an open, blocked follow-up. + +--- + +## Self-review + +Run against the writing-plans self-review checklist and the spec. + +### (a) Spec coverage — every tier / gate → the task that implements it + +| Spec element | Child file | Implementing task(s) | +|---|---|---| +| Capture seam promotion (`capture_to_image`) | README § Architecture; determinism.md § "Where the code lives" | **0.4** (mechanics) + **3.3** (quiescence + dpr assertion) | +| Canonical `Dpr` milliscale type | determinism.md § "Extending GoldenConfig" | **0.3** | +| Dev-only `buiy_core → buiy_verify` cycle | metric.md § Migration | **0.2** (edge) + **1a.10** (consumed) | +| **Tier 4/5 metric** (`Diff`/`FuzzBudget`/`CompareOpts`/`compare`/`passes`/`within`) | metric.md | **1a.1–1a.6** | +| Metric known-answer meta-suite (§4 dilution, AA, dim-mismatch) | metric.md § Verification | **1a.2** (scale-invariant), **1a.3** (AA), **1a.4** (dim-mismatch), **1a.7** (suite + constants pin) | +| Advisory MSSIM (never gates) | metric.md § "Advisory MSSIM" | **1a.5** | +| Migrate the two naive metrics | metric.md § Migration | **1a.8** (RMSE delete), **1a.9** (perceptual_diff deprecate), **1a.10** (text_gpu sites) | +| **Tier 4 reftests** (`RefCase`/`RefKind`/`RefOutcome`/`reftest!`/`run_reftest`) | reftests.md | **1b.2–1b.8** | +| Reftest aggregation truth table + mismatch-floor guard | reftests.md § Verification #1, #2 | **1b.4** (truth table), **1b.7** (floor) | +| Harness-can-fail (vacuous-green guard) | reftests.md § 
Verification #3 | **1b.5/1b.6** (known-good/known-bad GPU pairs) | +| Reference-independence lint (RED/GREEN-tested) | reftests.md § "Reference independence", Verification #4 | **1b.9** | +| **Tier 4.5** CPU-vs-GPU SDF cross-check | reftests.md § "CPU-vs-GPU cross-check", Verification #5 | **1b.10** (oracle) + **1b.11** (cross-check) | +| Two real reftest cases | reftests.md § "Authoring patterns" | **1b.12** | +| **Tier 1** layout-number snapshots (gate #5) | snapshots.md | **2.1**, **2.2** | +| **Tier 2** display-list / `PackedInstance`-hex snapshots | snapshots.md | **2.3**, **2.4**, **2.5**, **2.6** | +| Snapshot order-invariance + version tripwire + behavior-preserving migration | snapshots.md § Verification | **2.2/2.4** (order + header), **2.5** (mutation re-check) | +| `top_layer_paint_rank` promotion | README § Resolved #3; invariants.md deviation #3 | **2.8** | +| **Tier 3** proptest predicates (gate #12) | invariants.md | **2.7** (generators), **2.9** (predicates #1–5 + mutations), **2.10** (BiDi #6) | +| `GoldenConfig` extensions (FontMode, Dpr field, MSAA/dither) | determinism.md | **3.1** (config), **0.4** (MSAA/dither consts) | +| Ahem layout-determinism font | determinism.md § "Ahem"; flutter prior-art | **3.2** | +| Quiescence flush + no-`Instant::now` | determinism.md § "Async-asset flush", Verification #3/#4 | **3.3** | +| `DeterministicApp` builder | determinism.md | **3.4** (+ re-points the 1b reftest seam) | +| Idempotent capture + knob-sensitivity negatives | determinism.md § Verification #1/#2 | **3.5** | +| **Tier 5** goldens (`GoldenKey`/`Backend`/`BlessLedger`/`check_golden`/`assert_golden`) | goldens.md | **3.6**, **3.7** | +| Multi-positive + bless + fail-closed | goldens.md § Verification #1–#4 | **3.7** | +| Self-contained HTML triage report | goldens.md § Verification #5 | **3.8** | +| End-to-end goldens per residue class | goldens.md § Verification #7 | **3.9** | +| CI lavapipe pin (`VK_DRIVER_FILES`, no `LP_NUM_THREADS`) | 
determinism.md § "lavapipe pin"; README § Resolved #6 | **3.10** | +| **Coverage** `Fixture`/`Matrix`/`Cell`/`CoverageKey`/`enroll_all` | coverage.md | **4.1**, **4.2**, **4.3**, **4.4** | +| Coverage self-tests (catalog↔glob, key-uniqueness, fan-out) | coverage.md § Verification #1–#5 | **4.5** | +| `forced_colors_analyzer` live-catalog producer (gate #11) | coverage.md; README § gate #11 | **4.6** | +| Docs flip (spec→active, README, follow-ups, verification.md) | CLAUDE.md docs discipline | **4.7** | + +Every spec tier, every named `§ Verification` meta-test, and every foundation gate (#2/#5/#11/#12) maps to at least one task. No spec element is unaddressed. + +### (b) Placeholder scan of Phase 0/1 — must be clean (real code, no TBD) + +Phases 0, 1a, 1b contain **full, real code in every implementation step** — every type, fn body, test, and command is concrete. There are **no `TBD`/`TODO`/`???`/`<placeholder>` tokens in any code block.** The non-code "confirm against the live API at impl time" notes are deliberate and bounded — each names the exact symbol to verify (e.g. `image-compare`'s `rgba_blended_hybrid_compare`, the `Radius::all` spelling, the `tests/support/mod.rs:168` plugin list, the `a11y`/`contrast` smoke symbols) and gives the contract to preserve if the spelling differs. These are *grounding instructions*, not unwritten code: the algorithm, control flow, and assertions are all present. The one literal requiring a live-run confirmation — the `(3, 255, 64)` constants tuple in **1a.7** — has an explicit bless step that reads the actual value from the failure message.
**Scan result: clean.** + +### (c) Type-consistency check across tasks + +Verified the load-bearing type names are identical across every task and match the spec child files: + +- **`Diff`** — fields `differing_pixels: u32`, `max_channel_delta: u8`, `total_pixels: u32`, `mssim: Option`, `diff_image: Option` — identical in 1a.1 (def), 1a.4 (`passes`/`within`), 1b.4 (`stub_diff`), 2.9 (predicates consume `ExtractedNodes`, not `Diff`), 3.7/3.8 (golden). ✓ +- **`FuzzBudget`** — `{ max_channel_delta: u8, max_diff_pixels: u32 }` + `EXACT` const — identical in 1a.1, 1b (all reftest tasks), 3.x (goldens). Matches metric.md §73–82. ✓ +- **`CompareOpts`** — `{ threshold, include_aa, mssim, emit_diff_image }` + `Default` + `reftest_default()` — 1a.1 (def), 1b.1 (`reftest_default`). ✓ +- **`Dpr`** — `Dpr(u32)` milliscale, `X1`/`X2`, `from_f32`/`as_f32` — defined once in 0.3; imported (not redefined) by 3.1 (`GoldenConfig.dpr`), 3.6 (`GoldenKey.dpr`), 4.2 (`Matrix.dprs`/`CoverageKey.dpr`). Matches determinism.md §68–75. ✓ +- **`RefCase`** — `{ name, kind, test: fn(&mut App), reference: fn(&mut App), fuzz: FuzzBudget }` — identical in 1b.3 (def), 1b.5/1b.8/1b.9/1b.12 (use). Single-reference (the spec's multi-reference is a deferred follow-up, flagged below). ✓ +- **`RefKind`** / **`RefOutcome`** — `{Match, Mismatch}` / `{ passed, diff, report_path }` — consistent across 1b.2–1b.12. ✓ +- **`Backend`** — `{ Lavapipe, Vulkan, Gl, Metal, Dx12 }` — `GoldenKey.backend` (3.6) and `CoverageKey.backend` (4.2) name the same enum. Matches goldens.md §58. ✓ +- **`GoldenKey` / `CoverageKey`** — both carry `dpr: Dpr` + `backend: Backend`; `CoverageKey` derives `Eq + Hash` *because* `Dpr` is `Eq + Hash` (the milliscale payoff, stated identically in 0.3, 4.2, and coverage.md §122). ✓ +- **`FontMode`** — `{ Real, Ahem }` defined on `GoldenConfig` (3.1), re-exported by `determinism` (3.4), never redefined. 
✓ + +No type is defined twice; every cross-task reference uses the canonical name and shape. + +### Gaps fixed / flagged during assembly + +1. **`pixelmatch` is vendored, not depended on** (1a deviation note). The published crate is unusable (PNG-stream input, flat-count output, private primitives, `image` 0.24-bound). The plan vendors the ~150 LOC YIQ + AA algorithm into `metric.rs`. **`metric.md` § "Crate choice" / "Migration" should be corrected** to say "vendored from the pixelmatch reference," not "depends on `pixelmatch`." Net Phase-1a dep delta is `image-compare` only. *(Flagged for the doc-flip in Task 4.7.)* +2. **Dev-dep edge de-duplicated.** The Phase 0 and Phase 1a drafts both added the `buiy_core → buiy_verify` dev-dep. Resolved: **0.2 is the canonical site**; **1a.10** now only *verifies* it is present (re-adds defensively if absent). No double-add. +3. **`capture_app` promotion is an addition to Phase 0's scope.** Phase 1b needs a painting `App` builder from `src`; the spec only promoted `capture_to_image`. The plan promotes `capture_app`/`capture_app_scaled` in **1b.6** (single-body with the test-support builder, anti-drift). This is consistent with README § Architecture's "promote the shared seam into `render/golden.rs` src" but extends it — noted so the impl author honors it. +4. **`reftest!` macro surface uses an `$fn:ident`**, not the spec's `$name:literal`, because `match` is a keyword and two `reftest!(match, …)` would collide. The generated fn is named from the ident; `stringify!($fn)` is the `RefCase.name`. Documented in 1b.8. +5. **Multiple-references aggregation is deferred.** reftests.md § "Reference independence" #3 specs `RefCase::multi` / `reference: &[fn]` (Match = OR, Mismatch = AND). Phase 1b builds single-reference only — it covers both real cases + the cross-check. The `evaluate_outcome` split keeps the aggregation addable without reworking the engine. 
**Recorded as a follow-up in Task 4.7's `follow-ups.md` edit.** *(Open gap for the reviewer: confirm single-reference is acceptable for v1, or pull multi-reference forward into Phase 1b.)* +6. **`insta` constants-pin deferred within Phase 1a.** metric.md § Verification wants the constants tripwire as a floats-redacted `insta` snapshot; 1a.7 uses an exact-integer `assert_eq!` instead (Phase 2 introduces the snapshot dump infra). Behavior-identical, no vacuous pass. The upgrade is folded into Phase 2's snapshot work. +7. **MSAA/dither constants land in Phase 0.4, not Phase 3.** The drafts had `CAPTURE_MSAA`/`CAPTURE_DITHER_OFF` in both Phase 0.4 (capture camera) and Phase 3.1 (`GoldenConfig`). Resolved: the **consts land in 0.4** (the capture camera needs them); **3.1** only adds the `FontMode`/`dpr` config fields and references the existing consts. diff --git a/docs/prior-art/flutter-golden-testing/README.md b/docs/prior-art/flutter-golden-testing/README.md new file mode 100644 index 0000000..e0de393 --- /dev/null +++ b/docs/prior-art/flutter-golden-testing/README.md @@ -0,0 +1,86 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Flutter golden-file visual regression — `matchesGoldenFile`, Flutter Gold, the obscure-text determinism font, and the third-party toolkit ecosystem (golden_toolkit, Alchemist) + +# Flutter golden-file visual regression + +Flutter's golden ecosystem is one of the most heavily-exercised large-scale glyph-golden systems among open-source GUI toolkits, and a canonical case study for both the *value* and the *flake-tax* of pixel goldens. The public API is `matchesGoldenFile(key, {version})`, a thin async matcher that captures the first `RepaintBoundary`'s rendered image and delegates all comparison to an ambient, swappable `goldenFileComparator` — see [matches-golden.md](matches-golden.md). 
The default `LocalFileComparator` does a **pixel-for-pixel, zero-tolerance** decode-and-compare, which is precisely what flakes across hosts: different OSes rasterize fonts and antialias with different engines, so a byte-exact PNG baked on macOS fails `==` on a Linux CI runner. Flutter's framework dodges this by moving the source of truth to **Flutter Gold** (a Skia Gold instance), a content-addressed server that holds *many* approved digests per test and requires human pre-submit triage — see [flutter-gold-infra.md](flutter-gold-infra.md). The two reusable wins are the *determinism knobs*, not the pixel-diff plumbing: an obscure box-glyph test font (**Ahem**, now **FlutterTest**) that collapses the font axis for layout goldens ([obscure-text-font.md](obscure-text-font.md)), and a `debugDisableShadows` flag that swaps blurred shadows for flat fills ([determinism-knobs.md](determinism-knobs.md)). The third-party tooling — discontinued `golden_toolkit`, active `Alchemist` — institutionalizes the load-bearing **two-tier split**: a broad obscure-text/flat-shadow CI tier plus a narrow real-font fidelity tier ([ecosystem-toolkit-alchemist.md](ecosystem-toolkit-alchemist.md)). + +This is the Tier-5 (golden/screenshot) and text-determinism prior-art for [`docs/reports/2026-06-14-visual-bug-detection-strategy.md`](../../reports/2026-06-14-visual-bug-detection-strategy.md). Buiy's strategy is reftests-first — keep the flaky pixel tier a minimal residue — and this folder is the empirical argument *for* that thesis: even Google-scale tooling does not fully tame host-rasterization flake. Buiy borrows the knobs (box-glyph font with power-of-2 UPM, shadow killswitch, curated-accept), not the hosted service. The decision file is [lessons.md](lessons.md). + +## Key facts + +| Fact | Value | Source | +|---|---|---| +| Public matcher | `AsyncMatcher matchesGoldenFile(Object key, {int? 
version})`; key must be `Uri`/`String` | [matchesGoldenFile API](https://api.flutter.dev/flutter/flutter_test/matchesGoldenFile.html) | +| Comparison delegation | Matcher does no compare itself; delegates to ambient `goldenFileComparator` | [matchesGoldenFile API](https://api.flutter.dev/flutter/flutter_test/matchesGoldenFile.html) | +| Default backend | `LocalFileComparator` — **pixel-for-pixel exact match**, zero tolerance | [LocalFileComparator API](https://api.flutter.dev/flutter/flutter_test/LocalFileComparator-class.html) | +| Raw `flutter_test` default | `TrivialComparator` (no-op) | [flutter_goldens.dart](https://github.com/flutter/flutter/blob/master/packages/flutter_goldens/lib/flutter_goldens.dart) | +| Refresh command | `flutter test --update-goldens` | [matchesGoldenFile API](https://api.flutter.dev/flutter/flutter_test/matchesGoldenFile.html) | +| Framework CI backend | Flutter Gold (a Skia Gold instance); cross-platform via content-addressed digests | [Writing-a-golden-file-test wiki](https://github.com/flutter/flutter/blob/master/docs/contributing/testing/Writing-a-golden-file-test-for-package-flutter.md) | +| Gold client | Google's `goldctl` (`imgtest add`, `--luci` on CI) | [DeepWiki testing-infra](https://deepwiki.com/flutter/flutter/5.3-engine-versioning-and-artifacts) | +| Engine Gold instance | Separate — `flutter-engine-gold.skia.org`, `dart:ui`-only pixel tests | [issue #76565](https://github.com/flutter/flutter/issues/76565) | +| Current default test font | **FlutterTest** — box-glyph, ascent 0.75em / descent 0.25em, **UPM 1024 (power of 2)**, line-gap 0 | [Flutter-Test-Fonts.md](https://github.com/flutter/flutter/blob/master/docs/contributing/testing/Flutter-Test-Fonts.md) | +| Legacy default test font | **Ahem** — box-glyph, ascent 0.8em / descent 0.2em, **UPM 1000** | [Flutter-Test-Fonts.md](https://github.com/flutter/flutter/blob/master/docs/contributing/testing/Flutter-Test-Fonts.md) | +| Shadow killswitch | `debugDisableShadows`; 
`disableShadows` on the test binding, **default `true`** | [disableShadows API](https://api.flutter.dev/flutter/flutter_test/AutomatedTestWidgetsFlutterBinding/disableShadows.html) | +| `golden_toolkit` (eBay) | **discontinued**; latest **0.15.0**, published **2023-02-21** | [pub.dev/packages/golden_toolkit](https://pub.dev/packages/golden_toolkit) | +| `alchemist` (Betterment) | **active**, MIT, **v0.14.0 (2026-03-13)**, ~298 GitHub stars *(point-in-time)* | [github.com/Betterment/alchemist](https://github.com/Betterment/alchemist) | +| Docker-pinned flake | Even one identical Ubuntu Docker image leaks host-OS rasterization: "a random smattering of mismatched pixels" | [issue #131559](https://github.com/flutter/flutter/issues/131559) | + +## Contents + +Each file is independently skimmable with its own `## Sources`. + +**The matcher and the local default** + +- [**matches-golden.md**](matches-golden.md) — `matchesGoldenFile`, the `GoldenFileComparator` extension seam, the brutally-strict `LocalFileComparator`, and exactly why pixel-exact local compares flake across hosts. + +**The framework's server-side answer** + +- [**flutter-gold-infra.md**](flutter-gold-infra.md) — How `flutter_goldens` swaps in a Skia Gold backend by sniffing env vars, the `goldctl`/`flutter-gold` per-PR triage workflow, the many-positives content-addressed model, scale, and the verbatim warts (Docker leakage #131559, force-push pending, skipped-flaky-goldens). + +**The determinism knobs Buiy actually borrows** + +- [**obscure-text-font.md**](obscure-text-font.md) — The Ahem → FlutterTest box-glyph test font: why rectangular glyphs remove curve-rasterization variance and why a **power-of-2 units-per-em** removes metric rounding variance. The single cheapest determinism lever. +- [**determinism-knobs.md**](determinism-knobs.md) — `debugDisableShadows` (default-on in tests, swaps shadows for solid blocks), `obscureText`, and the layered fixed-font + shadow-killswitch + colored-rectangle stack. 
+ +**The third-party ecosystem and the two-tier split** + +- [**ecosystem-toolkit-alchemist.md**](ecosystem-toolkit-alchemist.md) — `golden_toolkit` (`loadAppFonts`, `multiScreenGolden`, now discontinued) and **Alchemist** (the clearest articulation of platform-tests-vs-CI-tests, `obscureText`/`renderShadows`/`diffThreshold`). The distilled rectangle/real-font split Buiy should mirror. + +**Reference** + +- [**open-problems.md**](open-problems.md) — What Flutter's golden system structurally does *not* solve: host-rasterization leakage through containers, the no-way-to-verify-flaky-fixes trap, the hosted-service operational floor, and the irreducible color-emoji golden. +- [**lessons.md**](lessons.md) — **The consult-this-when-designing decision file.** `## Validates` / `## Avoid` / `## Borrow`. This is where Buiy implications live. +- [**glossary.md**](glossary.md) — System-specific terms: golden file, comparator, Ahem/FlutterTest, units-per-em, Flutter Gold, digest/triage, obscure text, `--update-goldens`. + +## Reading order + +1. [lessons.md](lessons.md) — the decisions. Start here if you are designing Buiy's text goldens or determinism knobs. +2. [obscure-text-font.md](obscure-text-font.md) — the single cheapest determinism lever, and the one Buiy mirrors most directly. +3. [matches-golden.md](matches-golden.md) — the matcher and why the local default flakes (the problem the rest exists to solve). +4. [determinism-knobs.md](determinism-knobs.md) — the shadow killswitch and the layered determinism stack. +5. [ecosystem-toolkit-alchemist.md](ecosystem-toolkit-alchemist.md) — the two-tier split, institutionalized. +6. [flutter-gold-infra.md](flutter-gold-infra.md) — the server-side escape hatch (what Buiy deliberately does *not* build). +7. [open-problems.md](open-problems.md) — the limits, so Buiy doesn't expect the tier to do more than it can. +8. [glossary.md](glossary.md) — reference when a term is unclear. 
+ +## How to use + +**Framing disclosure.** These docs are written from Buiy's stance — an AccessKit-first, wgpu + Taffy + cosmic-text, parallel-to-bevy_ui retained-mode engine building a reftests-first layered visual-bug-detection strategy. The "Implications for Buiy" / lessons framing reads Flutter golden-file visual regression through that lens; readers auditing whether that strategy is itself right should weigh the corpus accordingly — it is a learn-from artifact, not a neutral catalog. + +Concretely, this corpus is written from the stance that **Buiy is local-first (Rust, offline, MIT/Apache, no SaaS) and reftests-first, with the pixel/golden tier a deliberately-minimal residue**. The Flutter record is read as a clean empirical argument *for* that thesis: even Google-scale Gold tooling (content-addressing, `goldctl`, per-PR human triage, Docker pinning) does not fully tame host-rasterization flake (#131559). "Implications for Buiy" therefore lean toward borrowing the *determinism knobs* (box-glyph font, shadow killswitch, curated-accept) and away from the *hosted service*. A reader weighing whether Buiy should adopt a real-font golden tier at all, or a hosted triage UI, should weigh the corpus accordingly — its evidence skews toward "deterministic-font-first, real-glyph-narrow, hosted-service-never." 
+ +## Sources + +- `matchesGoldenFile` API: https://api.flutter.dev/flutter/flutter_test/matchesGoldenFile.html +- `LocalFileComparator` API: https://api.flutter.dev/flutter/flutter_test/LocalFileComparator-class.html +- `GoldenFileComparator` API: https://api.flutter.dev/flutter/flutter_test/GoldenFileComparator-class.html +- Flutter-Test-Fonts.md: https://github.com/flutter/flutter/blob/master/docs/contributing/testing/Flutter-Test-Fonts.md +- `debugDisableShadows` / `disableShadows`: https://api.flutter.dev/flutter/rendering/debugDisableShadows.html · https://api.flutter.dev/flutter/flutter_test/AutomatedTestWidgetsFlutterBinding/disableShadows.html +- Writing-a-golden-file-test wiki: https://github.com/flutter/flutter/blob/master/docs/contributing/testing/Writing-a-golden-file-test-for-package-flutter.md +- golden_toolkit (pub.dev, discontinued): https://pub.dev/packages/golden_toolkit +- Alchemist: https://github.com/Betterment/alchemist · https://pub.dev/packages/alchemist +- flutter/flutter#131559 (Docker rasterization leak): https://github.com/flutter/flutter/issues/131559 +- Buiy visual-bug-detection strategy: [`docs/reports/2026-06-14-visual-bug-detection-strategy.md`](../../reports/2026-06-14-visual-bug-detection-strategy.md) +- Per-file `## Sources` sections cite the specific URLs each file relies on. 
diff --git a/docs/prior-art/flutter-golden-testing/determinism-knobs.md b/docs/prior-art/flutter-golden-testing/determinism-knobs.md new file mode 100644 index 0000000..f6898f5 --- /dev/null +++ b/docs/prior-art/flutter-golden-testing/determinism-knobs.md @@ -0,0 +1,50 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Flutter's golden determinism knobs — `debugDisableShadows`, `obscureText`, and the layered fixed-font + shadow-killswitch + colored-rectangle stack + +# Determinism knobs + +Flutter's golden-test stack solves text/shadow nondeterminism with three layered knobs: a fixed-metric test font, a shadow killswitch, and (in higher-level packages) "obscure text as colored rectangles." The fixed-metric font has its own file ([obscure-text-font.md](obscure-text-font.md)); this file covers the shadow killswitch and how the knobs layer. + +## `debugDisableShadows` — the shadow killswitch + +`debugDisableShadows` is a global flag (in the `rendering` library) that "replaces all shadows with solid color blocks… because shadow rendering is not guaranteed to be pixel-for-pixel identical from version to version or even from run to run" ([debugDisableShadows API](https://api.flutter.dev/flutter/rendering/debugDisableShadows.html)). + +Key facts: + +- It is exposed on the test binding as `disableShadows`, and is **`true` by default** in `AutomatedTestWidgetsFlutterBinding` ([disableShadows API](https://api.flutter.dev/flutter/flutter_test/AutomatedTestWidgetsFlutterBinding/disableShadows.html)). So in the standard widget-test environment, shadows are *off by default* for goldens. +- Mechanically, it forces `BoxShadow.toPaint` to behave as if `blurStyle == BlurStyle.normal`, i.e. it **disables the blur kernel**. The blur math is the non-deterministic part (it varies version-to-version and run-to-run), so removing it removes the flake. 
+- `BoxDecoration`/`ShapeDecoration` compensate automatically, but **custom painters must account for it** — a painter that draws its own shadow must check the flag. +- It can only be toggled inside a single test case. + +The takeaway: the blur/SDF-shadow kernel is a top flake source, so the framework ships a render-time flag that swaps shadows for flat fills in golden mode and turns it on by default. + +## `obscureText` — colored-rectangle text (higher-level packages) + +Above the framework, packages like Alchemist add an `obscureText` knob that replaces text blocks with colored rectangles for the CI/layout tier (full treatment in [ecosystem-toolkit-alchemist.md](ecosystem-toolkit-alchemist.md)). This is conceptually the same move as the box-glyph font — remove glyph rasterization from the comparison — applied at the widget level rather than the font level. The package docs frame it as "useful for circumventing issues with Flutter's font rendering between host platforms." + +## How the knobs layer + +The three knobs collapse three independent flake axes: + +| Knob | Flake axis removed | Where | +|---|---|---| +| Fixed box-glyph font (FlutterTest/Ahem) | font-engine curve rasterization + metric rounding | framework default ([obscure-text-font.md](obscure-text-font.md)) | +| `debugDisableShadows` (default on) | shadow blur-kernel non-determinism | framework default | +| `obscureText` / colored rectangles | glyph rasterization at the widget level | higher-level packages (CI tier) | + +The split worth stealing, stated plainly: **render text as rectangles and shadows as flat fills for the broad layout-golden tier; keep a narrow real-font, real-shadow suite for fidelity** (and accept that the fidelity suite is platform-bound and threshold-tolerant). The broad tier is the bulk of the suite; the fidelity tier is deliberately tiny. + +## Implications for Buiy + +Buiy's SDF shadow pass is exactly the `debugDisableShadows` kind of risk — blur math plus GPU rounding. 
Add a `BUIY_DISABLE_SHADOWS` flag that swaps the SDF shadow for a flat fill in the cheap tiers (layout/structured/reftest), leaving real shadow rendering to the host-pinned golden-screenshot tier. There is an open Flutter issue to push this killswitch into the engine as a runtime flag ([flutter/flutter#105475](https://github.com/flutter/flutter/issues/105475)) — Buiy should implement it **engine-side from the start**, not as a debug-build hack, so it is available in release-mode test binaries. + +More broadly, Buiy's existing `GoldenConfig` flake-mitigation triad (fixed clock, font-load sync, atlas warmup) is the analog of Flutter's layered knobs; the box-glyph font and shadow killswitch are the two highest-value additions to it. See [lessons.md](lessons.md). + +## Sources + +- `debugDisableShadows`: https://api.flutter.dev/flutter/rendering/debugDisableShadows.html +- `disableShadows` (default `true` in AutomatedTestWidgetsFlutterBinding): https://api.flutter.dev/flutter/flutter_test/AutomatedTestWidgetsFlutterBinding/disableShadows.html +- flutter/flutter#105475 (push shadow killswitch into the engine as a runtime flag): https://github.com/flutter/flutter/issues/105475 +- Alchemist (`obscureText` / `renderShadows`): https://github.com/Betterment/alchemist +- Sibling files: [obscure-text-font.md](obscure-text-font.md), [ecosystem-toolkit-alchemist.md](ecosystem-toolkit-alchemist.md), [lessons.md](lessons.md) diff --git a/docs/prior-art/flutter-golden-testing/ecosystem-toolkit-alchemist.md b/docs/prior-art/flutter-golden-testing/ecosystem-toolkit-alchemist.md new file mode 100644 index 0000000..fa64736 --- /dev/null +++ b/docs/prior-art/flutter-golden-testing/ecosystem-toolkit-alchemist.md @@ -0,0 +1,68 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Third-party Flutter golden tooling — golden_toolkit (eBay, discontinued) and Alchemist (Betterment, active), and the platform-tests-vs-CI-tests two-tier split + +# Third-party ecosystem: golden_toolkit and 
Alchemist + +Two community packages sit on top of `matchesGoldenFile`. The first (`golden_toolkit`) was the de-facto standard and is now discontinued; the second (`Alchemist`) is its active successor and is the **clearest articulation of the rectangle/real-font split** that Buiy should mirror. + +## golden_toolkit (eBay) — discontinued + +Built directly on `matchesGoldenFile`. Notable APIs: + +- **`loadAppFonts()`** — "By default, flutter test only uses a single 'test' font called Ahem… loadAppFonts will automatically load the Roboto font, and any fonts included from packages you depend on." Recommended from a `flutter_test_config.dart` so it runs once for the whole suite ([loadAppFonts docs](https://pub.dev/documentation/golden_toolkit/latest/golden_toolkit/loadAppFonts.html)). This is the *opposite* of the obscure-text approach — it loads **real** fonts so goldens look human-readable, at the cost of host-dependence. + + **The suite-wide hook itself is the reusable mechanism.** `flutter_test_config.dart` is how Flutter makes a font (or any setup) the default across a whole test suite without per-test boilerplate: the framework "scan[s] up the directory hierarchy, starting from the directory in which the test file resides, looking for a file named `flutter_test_config.dart`," and if found expects a top-level `Future<void> testExecutable(FutureOr<void> Function() testMain)` that wraps and invokes the test's own `main()` ([flutter_test library docs](https://api.flutter.dev/flutter/flutter_test/)). The closest such file to the test wins; all others are ignored. Font registration (`loadAppFonts`, or Alchemist's Ahem-forcing) is wired here precisely because it must run once before any test. **Implication for Buiy:** "how do I make the test font the default across a whole suite?" 
is the first question a Buiy golden-test author hits — Buiy needs an equivalent directory-scoped, run-once setup hook (registering `BUIY_TEST_FONT` and the determinism knobs) so individual tests don't each opt in. +- **`multiScreenGolden()`** — runs a widget across a device list, emitting one PNG per device with the device name appended, auto-sizing the surface to capture scrollables ([golden_toolkit README](https://github.com/eBay/flutter_glove_box/blob/master/packages/golden_toolkit/README.md)). + +**Wart (verified):** `golden_toolkit` is **discontinued**. Current version **0.15.0**, publisher **eBay.com** (`ebay.com` on pub.dev), last published **2023-02-21** (per the pub.dev API; ~3 years before this writing). Treat it as historical prior art, not a live dependency. Its abandonment is what pushed the community to Alchemist. + +## Alchemist (Betterment + Very Good Ventures) — active + +Alchemist institutionalizes the two-tier split. It generates **two** snapshot sets ([Alchemist GitHub](https://github.com/Betterment/alchemist)): + +- **Platform tests** — generate "golden files with human readable text," run locally per-OS into `goldens/<platform>/`. Host-dependent; **not committed** to source control. +- **CI tests** — identical *except* "the text blocks are replaced with colored squares," stored in `goldens/ci/`. CI tests are "always run using the Ahem font family … to ensure that CI tests are platform agnostic — their output is always consistent regardless of the host platform." Only these are tracked in source control. + +The rationale, verbatim: "individual platforms are known to render text differently than others… causing CI systems to fail the test." + +**Controls:** + +- **`obscureText`** — on for CI, off for platform; toggles whether text "should be obscured by colored rectangles… useful for circumventing issues with Flutter's font rendering between host platforms." 
+- **`renderShadows`** — replaces shadows with "opaque colors… because shadow rendering can be inconsistent between test runs" (the package-level analog of `debugDisableShadows`, see [determinism-knobs.md](determinism-knobs.md)). +- **`diffThreshold`** — per-config tolerance, set on `PlatformGoldensConfig` / `CiGoldensConfig` (under `AlchemistConfig`). + +**Status (verified):** active, MIT, **v0.14.0 (2026-03-13)**, ~298 GitHub stars *(point-in-time figure)*. + +## The split, distilled + +| | CI / layout tier | Real-font fidelity tier | +|---|---|---| +| Text | obscured → colored rectangles / Ahem box glyphs | real fonts, human-readable | +| Shadows | flat opaque fills | real shadows | +| Where run | any host / CI | locally, one canonical OS | +| Committed? | yes (deterministic, tiny diffs) | no (host-dependent) | +| Catches | layout, composition, stacking, sizing | kerning, fallback, emoji, real shaping | +| Size of suite | the **bulk** | deliberately **narrow** | +| Tolerance | exact / near-exact | threshold-tolerant, flake accepted | + +CI/layout goldens obscure text → no glyph rasterization in the comparison → stable across OS / engine / font-revision. A small real-font tier catches genuine text-fidelity regressions, accepting that it is platform-bound. + +## Implications for Buiy + +Buiy should mirror this two-tier shape directly: + +1. **Broad tier** — box-glyph (`BUIY_TEST_FONT`, see [obscure-text-font.md](obscure-text-font.md)) + flat-shadow (`BUIY_DISABLE_SHADOWS`, see [determinism-knobs.md](determinism-knobs.md)) mode for the reftest/structured/golden bulk. Deterministic, committed, tiny diffs. +2. **Narrow tier** — a tiny real-`cosmic-text`/`harfrust` fidelity suite, pinned to one bundled OFL font and a controlled rasterizer, where real shaping fidelity (kerning, fallback, emoji) is asserted and flake is an accepted cost. 
+ +**What Buiy should *not* center on:** golden_toolkit's `loadAppFonts()` real-font-everywhere approach — it is exactly the host-dependence the obscure-text font exists to avoid, and the package is discontinued. The lineage's verdict is deterministic-font-first, real-glyph-narrow. Alchemist's `diffThreshold`-per-config is also worth borrowing: the broad tier gets near-zero tolerance, the fidelity tier gets a generous budget. See [lessons.md](lessons.md). + +## Sources + +- golden_toolkit (pub.dev, discontinued): https://pub.dev/packages/golden_toolkit +- golden_toolkit README: https://github.com/eBay/flutter_glove_box/blob/master/packages/golden_toolkit/README.md +- loadAppFonts docs: https://pub.dev/documentation/golden_toolkit/latest/golden_toolkit/loadAppFonts.html +- `flutter_test_config.dart` / `testExecutable` (suite-wide setup hook): https://api.flutter.dev/flutter/flutter_test/ +- Alchemist GitHub: https://github.com/Betterment/alchemist +- alchemist (pub.dev): https://pub.dev/packages/alchemist +- Sibling files: [obscure-text-font.md](obscure-text-font.md), [determinism-knobs.md](determinism-knobs.md), [lessons.md](lessons.md) diff --git a/docs/prior-art/flutter-golden-testing/flutter-gold-infra.md b/docs/prior-art/flutter-golden-testing/flutter-gold-infra.md new file mode 100644 index 0000000..c5c5e85 --- /dev/null +++ b/docs/prior-art/flutter-golden-testing/flutter-gold-infra.md @@ -0,0 +1,71 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Flutter framework golden infrastructure — how `flutter_goldens` wires `matchesGoldenFile` to Flutter Gold (Skia Gold), the per-PR triage workflow, scale, and the verbatim warts + +# Flutter framework golden infrastructure (Flutter Gold) + +Flutter's framework goldens are among the most heavily-exercised large-scale golden systems in open-source GUI toolkits. 
The framework itself does **not** use `LocalFileComparator` in CI — it moves the source of truth to a Skia Gold instance (**Flutter Gold**) so that comparison happens in an external, multi-positive service rather than as a zero-tolerance byte-compare on the test machine. (For the Skia Gold service model in depth, see the sibling [`skia-gold`](../skia-gold/README.md) prior-art folder; this file covers Flutter's *use* of it.) + +## Wiring: how the comparator gets swapped + +The public matcher `matchesGoldenFile(key, {version})` delegates to the ambient `goldenFileComparator` ([matchesGoldenFile API](https://api.flutter.dev/flutter/flutter_test/matchesGoldenFile.html)); the default for raw `flutter_test` is the no-op `TrivialComparator`, and for `flutter test` it is `LocalFileComparator` (see [matches-golden.md](matches-golden.md)). + +The internal `flutter_goldens` package (in-tree at `packages/flutter_goldens/`, **not** on pub.dev) swaps in a Skia-Gold–backed comparator at test bootstrap. `testExecutable()` picks a `FlutterGoldenFileComparator` subclass "based on the current environment" ([flutter_goldens.dart](https://github.com/flutter/flutter/blob/master/packages/flutter_goldens/lib/flutter_goldens.dart)): + +- **`FlutterPostSubmitFileComparator`** — uploads images to the Skia Gold dashboard via `goldctl`. +- **`FlutterPreSubmitFileComparator`** — "will always return true since golden file test failures are managed in pre-submit checks by the flutter-gold status check." +- **`FlutterSkippingFileComparator`** — skips on unsupported environments. +- **Local fallback** — requests baselines from Skia Gold for the current device. + +The selection sniffs env vars — notably `SWARMING_TASK_ID` and `GOLDCTL` — running through Gold only on CI, excluding tryjob-only contexts, and historically gating on the main branch (surfaced from [PR #33688 "Part 1: Skia Gold Testing"](https://github.com/flutter/flutter/pull/33688)). 
Under the hood, `SkiaGoldClient` shells out to Google's `goldctl` (`goldctl imgtest add`, authenticated with `--luci` on CI) to push images to the Gold instance ([DeepWiki testing-infra](https://deepwiki.com/flutter/flutter/5.3-engine-versioning-and-artifacts)). + +Baselines are **not** checked into the repo as PNGs the way third-party setups do — they live in Gold, mirrored locally under `bin/cache/pkg/skia_goldens/...`. + +*Unverified:* the precise current env-var gating predicate (`SWARMING_TASK_ID`/`GOLDCTL`, main-branch-only) is drawn from search summaries of PR #33688 and the in-tree comparator, not a line-by-line read of current `flutter_goldens.dart`. Treat the exact predicate as approximate. + +## The per-PR triage workflow + +Every framework PR that touches golden tests runs them and diffs against Gold. The `flutter-gold` PR check "is applied to pull requests in flutter/flutter that execute golden file tests and are ready for review." On any image delta it ([Writing-a-golden-file-test wiki](https://github.com/flutter/flutter/blob/master/docs/contributing/testing/Writing-a-golden-file-test-for-package-flutter.md)): + +- "will hold a pending state," +- Gold leaves a PR comment linking the image results in its [ChangeLists dashboard](https://flutter-gold.skia.org/changelists), and +- a human in the `flutter-hackers` group must triage (approve) each new image before the check "go[es] green within five minutes." + +Unapproved images that land trigger a post-submit error: *"Skia Gold received an unapproved image in post-submit testing."* Gold's content-addressed model means an already-triaged pixel hash is auto-approved forever, which is what makes per-PR triage tractable at framework scale. + +The **engine** has its own separate instance at `flutter-engine-gold.skia.org`, with `dart:ui`-only pixel tests ([issue #76565](https://github.com/flutter/flutter/issues/76565)). 
+ +## The many-positives model — the design answer to zero-tolerance + +The wiki concedes goldens run for "Linux, Mac, Windows, and Web platforms. It is common for there to be slight differences between them," requiring "multiple golden masters for a given test." Gold's triage UI tolerates *multiple accepted masters per test* to absorb cross-platform rendering differences — this is the direct design answer to `LocalFileComparator`'s zero-tolerance flake: move the source of truth to a server that holds many approved variants, instead of demanding one byte-exact file. + +Threshold tolerance is a partial mitigation too: the engine/Impeller pixel harness passes when "less than 1% of pixels are different by less than 4 color component deltas" ([engine PR #40824](https://github.com/flutter/engine/pull/40824)). The two knobs that express this are `maxDiffPixelsPercent` (the fraction of pixels allowed to differ) and `pixelColorDelta` (the max per-channel color delta a differing pixel may have) — a two-axis budget worth borrowing for a fuzzy comparator (see [lessons.md](lessons.md) `## Borrow`). *Note:* that threshold is the engine/Impeller harness, not the framework widget goldens, which remain effectively exact-match modulo Gold triage. + +## Determinism: the Ahem font + +The load-bearing trick that keeps these goldens stable is the obscure-text **Ahem** font: "the Flutter framework uses a font called 'Ahem' which shows squares instead of characters" — every glyph a solid box filling the em square ("black spaces for every character and icon"). This removes per-platform font-rasterization variance from any golden that isn't specifically testing glyph rendering, making the *layout* deterministic across OSes. Full treatment in [obscure-text-font.md](obscure-text-font.md). 
+ +## Verbatim warts + +- **Docker doesn't fully save you (#131559, Open, P2).** Even with one *identical Ubuntu Docker image*, generating on a Windows host and verifying on a Mac host yields *"a random smattering of mismatched pixels"* ranging *"from single pixels to 30-90 pixel mismatches"* (matthew-carroll, 2023-07-29). Host-OS rasterization leaks through the container. +- **Flaky goldens get skipped, and skipping blinds you.** "When we skip a test we stop sending to Skia Gold entirely," so there is "no way to verify flaky golden test fixes" short of speculatively un-skipping — "The cost of a mistake is closed tree, P0s, wasted time, and other sadness" (yjbanov, 2022-09-10, [#111325](https://github.com/flutter/flutter/issues/111325)). +- **Force-push leaves the check stuck.** A known Skia Gold issue leaves the `flutter-gold` check stuck pending after `git push -f`; the only remedy is *"Try rebasing again. This side-effect is flaky"* (wiki, verbatim). + +*Unverified:* the exact verbatim comment text of #111325 could not be loaded from a primary fetch beyond the quoted fragments; re-check against the live thread before treating as exact. + +## Implications for Buiy + +Flutter Gold is the cautionary "what Buiy does *not* build." It is a hosted Google-Cloud service; for a local-first library it is both the wrong dependency and a documented flake source (#131559, #111325). The reusable wins are the *determinism knobs and the curated-accept discipline*, not the pixel-diff plumbing. Concretely: commit small box-font goldens to the repo (deterministic → tiny diffs), keep a host-pinned local rasterizer for the irreducible suite, and generalize Buiy's existing `BUIY_ACCEPT_SHAPING` curated-accept gate to all snapshot tiers — Flutter's `--update-goldens` + human pre-submit triage is the precedent for that explicit-accept model. See [lessons.md](lessons.md). 
+ +## Sources + +- `matchesGoldenFile` API: https://api.flutter.dev/flutter/flutter_test/matchesGoldenFile.html +- flutter_goldens.dart: https://github.com/flutter/flutter/blob/master/packages/flutter_goldens/lib/flutter_goldens.dart +- PR #33688 "Part 1: Skia Gold Testing": https://github.com/flutter/flutter/pull/33688 +- DeepWiki Flutter testing-infra: https://deepwiki.com/flutter/flutter/5.3-engine-versioning-and-artifacts +- Writing-a-golden-file-test wiki: https://github.com/flutter/flutter/blob/master/docs/contributing/testing/Writing-a-golden-file-test-for-package-flutter.md +- flutter/flutter#131559 (Docker rasterization leak): https://github.com/flutter/flutter/issues/131559 +- flutter/flutter#111325 (no way to verify flaky golden fixes; force-push pending): https://github.com/flutter/flutter/issues/111325 +- flutter/flutter#76565 (separate engine Gold instance): https://github.com/flutter/flutter/issues/76565 +- flutter/engine#40824 (1% / 4-component-delta threshold): https://github.com/flutter/engine/pull/40824 +- Skia Gold service model: [`docs/prior-art/skia-gold/`](../skia-gold/README.md) diff --git a/docs/prior-art/flutter-golden-testing/glossary.md b/docs/prior-art/flutter-golden-testing/glossary.md new file mode 100644 index 0000000..831bee4 --- /dev/null +++ b/docs/prior-art/flutter-golden-testing/glossary.md @@ -0,0 +1,61 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Glossary of Flutter golden-testing terms used across this folder + +# Glossary + +System-specific terms for the Flutter golden-testing prior-art folder. For Skia Gold's own vocabulary (digest, param, trace, corpus, baseline), see the [`skia-gold`](../skia-gold/README.md) folder's glossary. + +- **Golden file** — a reference PNG that a test's rendered output is compared against. "Golden" ≡ the blessed/expected image. + +- **`matchesGoldenFile(key, {version})`** — `flutter_test`'s async golden matcher. 
Captures the first `RepaintBoundary`'s rendered image and delegates comparison to the ambient `goldenFileComparator`. Driven via `await expectLater(...)`. See [matches-golden.md](matches-golden.md). + +- **`goldenFileComparator`** — the top-level ambient instance that `matchesGoldenFile` delegates to. Swapping it is how Flutter changes golden backends. + +- **`GoldenFileComparator`** — the abstract class (methods `compare` / `update`) that is the extension seam. Subclasses implement local diffing, Gold upload, or skip-on-unsupported. + +- **`LocalFileComparator`** — the default backend for `flutter test`. Loads goldens as paths relative to the test file and does a **pixel-for-pixel, zero-tolerance** decoded-PNG comparison. The canonical cross-host flake source. + +- **`TrivialComparator`** — the no-op default in raw `flutter_test` before `flutter_goldens` wires a real backend. + +- **`--update-goldens`** — the `flutter test` flag that regenerates/refreshes golden files instead of comparing against them. + +- **Ahem** — the legacy obscure-text test font: every glyph a solid box filling the em square. Units-per-em **1000** (not a power of 2 → per-platform metric rounding). Designed "to show black spaces for every character and icon." + +- **FlutterTest** — the current default test font. Box glyphs like Ahem, but units-per-em **1024** (a power of 2) → more precise, **font-engine-agnostic** metrics. Ascent 0.75 em, descent 0.25 em, line-gap 0. Ships shaped variants (Square, Ascent/Descent Flushed, Full/½/⅓ x-advance). + +- **Units-per-em (UPM)** — the font's internal coordinate scale; metrics are expressed in these units and divided by UPM to scale. A **power-of-2 UPM** (1024) makes that division bit-exact across font engines — the load-bearing determinism property. See [obscure-text-font.md](obscure-text-font.md). 
+ +- **Obscure text** — rendering text as featureless boxes (box-glyph font) or colored rectangles (`obscureText`) so glyph rasterization is removed from the golden comparison; makes layout deterministic across hosts. + +- **`debugDisableShadows` / `disableShadows`** — global/test-binding flag (**default `true`** in tests) that replaces all shadows with solid color blocks, disabling the non-deterministic blur kernel. See [determinism-knobs.md](determinism-knobs.md). + +- **`RepaintBoundary`** — the Flutter widget whose rendered image `matchesGoldenFile` captures (the first such ancestor of the matched `Finder`). + +- **Flutter Gold** — the Skia Gold instance (`flutter-gold.skia.org`) that the framework uses as its CI golden backend instead of local file compares. A separate engine instance lives at `flutter-engine-gold.skia.org`. + +- **`flutter_goldens`** — the in-tree (not-on-pub.dev) package that swaps in a Gold-backed `FlutterGoldenFileComparator` subclass at test bootstrap based on environment. + +- **`goldctl`** — Google's CLI client that uploads images + metadata to a Gold instance (`goldctl imgtest add`, `--luci` on CI). + +- **`flutter-gold` check** — the per-PR status check that holds pending on any image delta until a `flutter-hackers` human triages (approves) each new image in the Gold dashboard. + +- **Triage** — the human act of approving (or rejecting) a new image in Gold. An approved digest auto-passes thereafter (content-addressed). + +- **Multiple golden masters / many positives** — Gold's tolerance for several approved images per logical test, absorbing cross-platform rendering differences. The design answer to `LocalFileComparator`'s zero tolerance. + +- **`golden_toolkit`** — eBay's community golden helper (`loadAppFonts`, `multiScreenGolden`). **Discontinued** (latest 0.15.0). 
+ +- **`loadAppFonts()`** — golden_toolkit helper that loads **real** fonts (Roboto + package fonts) instead of the box-glyph test font — the opposite of the obscure-text approach, trading determinism for human-readable goldens. + +- **Alchemist** — Betterment/Very Good Ventures' active golden package. Splits into **platform tests** (real fonts, local, uncommitted) and **CI tests** (Ahem/obscured, committed); controls `obscureText`, `renderShadows`, `diffThreshold`. + +- **Platform tests vs CI tests (Alchemist)** — the two-tier golden split: human-readable real-font goldens run per-OS and not committed, vs obscured-text goldens forced to Ahem, committed, platform-agnostic. + +## Sources + +- All sibling files in this folder. +- Flutter-Test-Fonts.md: https://github.com/flutter/flutter/blob/master/docs/contributing/testing/Flutter-Test-Fonts.md +- `matchesGoldenFile` / `LocalFileComparator` / `debugDisableShadows` API docs (api.flutter.dev) +- Alchemist: https://github.com/Betterment/alchemist +- Skia Gold vocabulary: [`docs/prior-art/skia-gold/`](../skia-gold/README.md) diff --git a/docs/prior-art/flutter-golden-testing/lessons.md b/docs/prior-art/flutter-golden-testing/lessons.md new file mode 100644 index 0000000..b2d65b3 --- /dev/null +++ b/docs/prior-art/flutter-golden-testing/lessons.md @@ -0,0 +1,58 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Flutter golden testing — Validates / Avoid / Borrow decision file for Buiy's visual-bug detection (text goldens + determinism knobs) + +This is the consult-this-when-designing file. The other files in this folder are evidence; this is synthesis. When designing any Buiy visual-test feature that touches text goldens, determinism knobs, or the Tier-5 golden harness — `buiy-verification-design`, the text-golden suite, the forced-colors/shadow determinism work — start here. + +The one-line lesson: Flutter's record is an empirical argument for **reftests-over-pixels and determinism-knobs-over-pixel-plumbing**. 
Even Google-scale Gold tooling does not fully tame host-rasterization flake (#131559). Buiy borrows the knobs, not the hosted service. + +## Validates + +These Buiy design choices are confirmed by Flutter's experience: + +- **Reftests-first / keep the pixel tier minimal.** Flutter's whole golden apparatus exists to fight glyph-rasterization flake, and even with Gold content-addressing, `goldctl`, per-PR human triage, and Docker pinning, host-OS rasterization still leaks ([#131559](https://github.com/flutter/flutter/issues/131559)). This is direct evidence for the strategy report's pyramid: push detection *down* into deterministic tiers and shrink the flaky golden tier to a residue. See [open-problems.md](open-problems.md). +- **An obscure box-glyph test font as the default determinism mode.** Flutter swaps all unspecified text to a box-glyph font (Ahem → FlutterTest) precisely so layout/golden output is identical across OSes. This validates Buiy's plan for a `BUIY_TEST_FONT`. See [obscure-text-font.md](obscure-text-font.md). +- **A shadow killswitch in golden mode.** `debugDisableShadows` is **default-on** in Flutter's test binding because shadow blur is non-deterministic version-to-version and run-to-run. Buiy's SDF shadow pass is the same risk class; a swap-to-flat-fill knob is validated. See [determinism-knobs.md](determinism-knobs.md). +- **A two-tier split: broad obscure-text/flat-shadow + narrow real-font.** Alchemist institutionalizes exactly this (CI tests forced to Ahem and committed; platform tests with real fonts, run locally, not committed). Buiy's "box-glyph bulk + tiny real-`cosmic-text` fidelity tier" mirrors it. See [ecosystem-toolkit-alchemist.md](ecosystem-toolkit-alchemist.md). +- **A curated-accept workflow with explicit blessing.** Flutter goldens are regenerated with `flutter test --update-goldens` and approved by a human pre-submit. 
Buiy already has `BUIY_ACCEPT_SHAPING` as a curated-accept gate; this validates generalizing it across tiers (`BUIY_ACCEPT_*`). See [flutter-gold-infra.md](flutter-gold-infra.md). +- **The comparator-as-swappable-backend seam.** `matchesGoldenFile` does no compare itself; an abstract `GoldenFileComparator` is the extension point. Buiy's `GoldenConfig` / backend-selection is the analogous seam. See [matches-golden.md](matches-golden.md). + +## Avoid + +| Pitfall | Source | Buiy mitigation | +|---|---|---| +| **A pixel-exact local comparator as the primary text-golden backend.** `LocalFileComparator` is zero-tolerance and "a golden file generated on Windows … will likely differ from the one produced by another operating system." | [matches-golden.md](matches-golden.md), [LocalFileComparator](https://api.flutter.dev/flutter/flutter_test/LocalFileComparator-class.html) | Collapse the font axis upstream (box-glyph + power-of-2 UPM) so the local comparator has nothing host-dependent to disagree about; reserve real-font goldens for one pinned host with a fuzzy budget. | +| **Treating "boxes instead of curves" as the whole determinism win.** Ahem (UPM 1000) still "yields slightly different metrics on different platforms." | [obscure-text-font.md](obscure-text-font.md) | The win is boxes **AND** a power-of-2 UPM (FlutterTest = 1024) with pinned ascent/descent → integer-exact, font-engine-agnostic metrics. Pick UPM 1024 for `BUIY_TEST_FONT`. | +| **Goldening everything with real fonts (the golden_toolkit `loadAppFonts` shape).** Real fonts everywhere = host-dependence everywhere; culprits are "system fonts, missing glyphs, font fallbacks." | [ecosystem-toolkit-alchemist.md](ecosystem-toolkit-alchemist.md) | Real-font goldens are a deliberately *narrow* tier, pinned to one bundled OFL font + controlled rasterizer. `golden_toolkit` is also **discontinued** — do not center a strategy on it. 
| +| **A hosted Gold-class triage service.** Flutter Gold is Google-Cloud-operated, has an operational floor, gets stuck pending on force-push, and is a documented flake source. | [flutter-gold-infra.md](flutter-gold-infra.md), [open-problems.md](open-problems.md) | Buiy is local-first: commit small box-font goldens to the repo (tiny diffs), keep a host-pinned local rasterizer for the residue, no SaaS. The [`skia-gold`](../skia-gold/README.md) folder covers the storage tradeoffs. | +| **Skipping flaky goldens as a flake remedy.** "When we skip a test we stop sending to Skia Gold entirely" → "no way to verify flaky golden test fixes." | [open-problems.md](open-problems.md), [#111325](https://github.com/flutter/flutter/issues/111325) | Make the tier deterministic-by-construction (box font, flat shadows, fixed clock, warm atlas) so flake doesn't arise; don't manage it by disabling. | +| **A debug-build-only shadow killswitch.** Flutter's `debugDisableShadows` only toggles inside a single test case; there is an open ask to push it into the engine as a runtime flag. | [determinism-knobs.md](determinism-knobs.md), [#105475](https://github.com/flutter/flutter/issues/105475) | Implement `BUIY_DISABLE_SHADOWS` **engine-side from the start** so it works in release-mode test binaries, not as a `debug_assertions`-gated hack. | +| **Trying to make color emoji deterministic.** Color-emoji rendering is platform-divergent and unstable even within Flutter; a box font cannot collapse it. | [open-problems.md](open-problems.md) | Treat Buiy's color-emoji path as the irreducible real golden: pinned hardware + font, generous diff tolerance. Don't fight it with determinism knobs. | +| **Snapshotting a wrong-but-blessed golden as "correct."** Goldens assert "matches the blessed image," not "is correct"; Gold auto-approves a triaged digest forever. 
| [open-problems.md](open-problems.md) | Prefer reftests (assert `render-A == / != render-B` — a relational oracle) and property invariants (laws, no oracle) below the golden tier; goldens are last resort. This is the strategy report's thesis. | + +## Borrow + +Concrete primitives and patterns from Flutter worth adapting into Buiy: + +1. **A box-glyph test font with power-of-2 UPM.** Build/ship `BUIY_TEST_FONT` with UPM **1024**, pinned ascent/descent (e.g. 0.75/0.25 em like FlutterTest), line-gap 0, every glyph a solid em-box. Default it for the layout-number → reftest tiers. This is the single cheapest determinism lever. *(Confirm the original Ahem's redistribution license before bundling — unverified — or generate a clean-room font.)* See [obscure-text-font.md](obscure-text-font.md). +2. **A shadow killswitch (`BUIY_DISABLE_SHADOWS`).** Swap the SDF shadow for a flat fill in the cheap tiers; default-on in golden mode, engine-side. See [determinism-knobs.md](determinism-knobs.md). +3. **The two-tier golden shape.** Broad obscure-text + flat-shadow tier (deterministic, committed, near-zero tolerance) + a narrow real-`cosmic-text`/`harfrust` fidelity tier (one pinned font, controlled rasterizer, generous threshold). Borrow Alchemist's `diffThreshold`-per-config idea. See [ecosystem-toolkit-alchemist.md](ecosystem-toolkit-alchemist.md). +4. **Generalized curated-accept (`BUIY_ACCEPT_*`).** Extend the existing `BUIY_ACCEPT_SHAPING` pattern so accepting any golden is an explicit, reviewable, diffable act — not a silent overwrite. Flutter's `--update-goldens` + human pre-submit triage is the precedent. See [flutter-gold-infra.md](flutter-gold-infra.md). +5. **The comparator-as-backend seam.** Keep golden comparison behind a swappable backend (Buiy's `GoldenConfig`), so local pixel-diff, re-capture-determinism checks, and any future host-pinned rasterizer are interchangeable. See [matches-golden.md](matches-golden.md). +6. 
**A two-parameter fuzzy-comparison budget for the host-pinned residue tier.** Flutter's Impeller golden harness gates on two knobs, not one: `maxDiffPixelsPercent` (what fraction of pixels may differ) **and** `pixelColorDelta` (the max per-channel color delta a differing pixel may have) — "less than 1% of pixels are different by less than 4 color component deltas" ([engine PR #40824](https://github.com/flutter/engine/pull/40824)). This two-axis shape (a count budget *and* a per-pixel magnitude budget) is the concrete primitive Buiy's fuzzy comparator needs for the narrow real-font tier; Buiy's current naive L1/RMSE metrics collapse both axes into one scalar and lack an AA-aware budget (strategy report §4). Build the comparator backend to take both. See [matches-golden.md](matches-golden.md), [flutter-gold-infra.md](flutter-gold-infra.md). +7. **Borrow as *cautionary baseline*, not as a build target: hosted Gold.** Study Gold's many-positives + content-addressing as ideas (already covered in [`skia-gold`](../skia-gold/README.md)/lessons), but do not build the service. + +## How to use this file + +When designing a Buiy visual-test feature: (1) find the **Avoid** row nearest your design, follow the linked evidence file, apply the mitigation; (2) find the **Borrow** item nearest the primitive you're building, read the evidence for shape, adapt for Buiy. Promote any decision into a spec under `docs/specs/` — this file captures what we learn from Flutter, not Buiy's own decisions. + +## Sources + +- All sibling files in this folder. 
+- `LocalFileComparator`: https://api.flutter.dev/flutter/flutter_test/LocalFileComparator-class.html +- `debugDisableShadows`: https://api.flutter.dev/flutter/rendering/debugDisableShadows.html +- Flutter-Test-Fonts.md: https://github.com/flutter/flutter/blob/master/docs/contributing/testing/Flutter-Test-Fonts.md +- flutter/flutter#131559, #111325, #105475 +- Alchemist: https://github.com/Betterment/alchemist +- Buiy visual-bug-detection strategy: [`docs/reports/2026-06-14-visual-bug-detection-strategy.md`](../../reports/2026-06-14-visual-bug-detection-strategy.md) +- Skia Gold storage/triage tradeoffs: [`docs/prior-art/skia-gold/`](../skia-gold/README.md) diff --git a/docs/prior-art/flutter-golden-testing/matches-golden.md b/docs/prior-art/flutter-golden-testing/matches-golden.md new file mode 100644 index 0000000..5cae9ef --- /dev/null +++ b/docs/prior-art/flutter-golden-testing/matches-golden.md @@ -0,0 +1,74 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** `matchesGoldenFile` and the `GoldenFileComparator` backend — Flutter's golden matcher and why its local default flakes across hosts + +# matchesGoldenFile and the comparator backend + +## The matcher + +`matchesGoldenFile` is `flutter_test`'s golden-file matcher. The signature is `AsyncMatcher matchesGoldenFile(Object key, {int? version})` ([matchesGoldenFile API](https://api.flutter.dev/flutter/flutter_test/matchesGoldenFile.html)): + +- **`key`** — a `Uri`/`String` URL identifying the golden image. Any other type throws an `ArgumentError`. +- **`version`** — an optional `int` "to differentiate historical golden files." + +Because it is asynchronous, it must be driven via `await expectLater(...)`. It accepts: + +- a **`Finder`** — which must match exactly one widget; it then captures the rendered image of that widget's **first `RepaintBoundary` ancestor**, +- a **`Future`**, or +- a **`ui.Image`**. + +Golden images are written or refreshed with `flutter test --update-goldens`. 
+ +The matcher itself **does no comparison**. It delegates to the top-level ambient `goldenFileComparator`, "which acts as the backend for this matcher" ([matchesGoldenFile API](https://api.flutter.dev/flutter/flutter_test/matchesGoldenFile.html)). That indirection is the entire extension story — swapping the comparator is how Flutter switches between local pixel-diffing and the Skia-Gold cloud backend. + +## The comparator extension seam + +`GoldenFileComparator` is an abstract class with two methods ([GoldenFileComparator API](https://api.flutter.dev/flutter/flutter_test/GoldenFileComparator-class.html)): + +- **`compare`** — "Compares the pixels of decoded png `imageBytes` against the golden file identified by `golden`." +- **`update`** — "Updates the golden file identified by `golden` with `imageBytes`." + +Comparators run "in the `TestWidgetsFlutterBinding.runAsync` zone and are thus not subject to the fake async constraints." Being abstract is the seam: subclasses implement local pixel-diffing, Skia-Gold upload, or skip-on-unsupported-environment behavior. The framework picks among them at test bootstrap (see [flutter-gold-infra.md](flutter-gold-infra.md)). + +## The default: `LocalFileComparator` is brutally strict + +The default backend for `flutter test` is `LocalFileComparator`. It ([LocalFileComparator API](https://api.flutter.dev/flutter/flutter_test/LocalFileComparator-class.html)): + +- "loads golden files from the local file system, treating the golden key as a relative path from the test file's directory," and +- "performs a pixel-for-pixel comparison of the decoded PNGs, returning true only if there's an exact match." + +**Zero tolerance.** No AA-exclusion, no fuzzy budget, no per-channel delta. That exactness is exactly what makes it flake across hosts. + +(The no-op default in raw `flutter_test` — before `flutter_goldens` wires anything — is `TrivialComparator`; see [flutter-gold-infra.md](flutter-gold-infra.md).) 
+ +## Why the local default flakes across hosts + +The API docs warn directly: "Custom fonts may render differently across different platforms, or between different versions of Flutter. For example, a golden file generated on Windows with fonts will likely differ from the one produced by another operating system" ([GoldenFileComparator API](https://api.flutter.dev/flutter/flutter_test/GoldenFileComparator-class.html), [LocalFileComparator API](https://api.flutter.dev/flutter/flutter_test/LocalFileComparator-class.html)). + +The root cause: different OSes scale and rasterize fonts with **different font engines**, so a pixel-exact PNG baked on macOS fails a byte-comparison on a Linux CI runner. Subpixel font smoothing and anti-aliasing produce differences that a human eye cannot see but a `==` on decoded bytes catches every time. + +The standard practitioner mitigations all exist because `LocalFileComparator` itself offers no tolerance: + +- bundle fonts; never depend on system fonts, +- pin device-pixel-ratio, +- run goldens on one canonical OS. + +**Device-pixel-ratio and surface size are their own flake axis.** `matchesGoldenFile` captures whatever physical-pixel surface the test renders, so the device-pixel-ratio (DPR) and logical surface size must be pinned, not just the OS and font. The same widget at DPR 1.0 vs 2.0 produces different physical pixel counts and different sub-pixel snap positions; an unpinned DPR makes a golden non-reproducible even on one host. Flutter's widget-test binding fixes a default test surface and DPR for exactly this reason, and the higher-level packages let you fan out a fixed device list (golden_toolkit's `multiScreenGolden`, see [ecosystem-toolkit-alchemist.md](ecosystem-toolkit-alchemist.md)) so each DPR/size is its own deterministic golden rather than a flake source. 
**Implication for Buiy:** because Buiy targets wgpu/HiDPI, the golden harness must pin both the logical surface size and the device-pixel-ratio per golden (and treat logical-vs-physical scaling as an explicit, asserted axis), not assume a 1:1 mapping. + +Flutter's deeper answers — collapsing the font axis with an obscure box-glyph test font ([obscure-text-font.md](obscure-text-font.md)) and moving the source of truth to a multi-positive server ([flutter-gold-infra.md](flutter-gold-infra.md)) — are both responses to this same zero-tolerance pixel-exactness. + +## Implications for Buiy + +The load-bearing fact for Buiy: a pixel-exact *local* comparator flakes across hosts, so any Buiy text/pixel golden tier needs **either** a tolerance knob **or** a server-side multi-positive backend (which Buiy, being local-first, declines — see [lessons.md](lessons.md) `## Avoid`). The cleaner answer Buiy reaches for first is *upstream determinism*: collapse the font axis so the local comparator has nothing host-dependent to disagree about. + +When a tolerance knob *is* needed (the narrow real-font residue tier), the shape to copy is Flutter's Impeller golden harness, which uses **two** parameters rather than one scalar: `maxDiffPixelsPercent` (the fraction of pixels allowed to differ) **and** `pixelColorDelta` (the max per-channel color delta a differing pixel may have) — passing when "less than 1% of pixels are different by less than 4 color component deltas" ([engine PR #40824](https://github.com/flutter/engine/pull/40824)). That two-axis budget (a count *and* a per-pixel magnitude) is what an AA-aware comparator needs; Buiy's current naive L1/RMSE metrics collapse both axes into one and lack such a budget (strategy report §4). Build the fuzzy backend to take both knobs — see [lessons.md](lessons.md) `## Borrow`. + +The comparator-as-swappable-seam pattern itself is worth borrowing — Buiy's `GoldenConfig` is the analogous backend-selection point. 
+ +## Sources + +- `matchesGoldenFile` API: https://api.flutter.dev/flutter/flutter_test/matchesGoldenFile.html +- `GoldenFileComparator` API: https://api.flutter.dev/flutter/flutter_test/GoldenFileComparator-class.html +- `LocalFileComparator` API: https://api.flutter.dev/flutter/flutter_test/LocalFileComparator-class.html +- flutter/engine#40824 (`maxDiffPixelsPercent` + `pixelColorDelta` golden threshold): https://github.com/flutter/engine/pull/40824 +- Sibling files: [flutter-gold-infra.md](flutter-gold-infra.md), [obscure-text-font.md](obscure-text-font.md), [lessons.md](lessons.md) diff --git a/docs/prior-art/flutter-golden-testing/obscure-text-font.md b/docs/prior-art/flutter-golden-testing/obscure-text-font.md new file mode 100644 index 0000000..98374fb --- /dev/null +++ b/docs/prior-art/flutter-golden-testing/obscure-text-font.md @@ -0,0 +1,51 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** The Ahem → FlutterTest obscure-text font — why box glyphs and a power-of-2 units-per-em make layout/text golden output font-engine-agnostic + +# The obscure-text determinism font (Ahem → FlutterTest) + +To stop *text* from being the flake source, `flutter test` substitutes a single obscure test font for all unspecified text: "if fontFamily isn't specified or the specified font families are not available, the default test font FlutterTest will be used" ([Flutter-Test-Fonts.md](https://github.com/flutter/flutter/blob/master/docs/contributing/testing/Flutter-Test-Fonts.md)). The rationale, verbatim: "Rectangles are used for text to avoid curves that might cause irrelevant test failure when comparing pixels." This is the single cheapest determinism lever in the whole golden stack. + +## Two failure modes the font removes + +The lineage layers two distinct fixes: + +1. 
**Curve-rasterization variance.** Real glyph outlines are curves; different platform font engines anti-alias those curves differently, so the same glyph rasterizes to slightly different pixels per OS. A glyph that is a **solid box filling the em square** has no curves — nothing for the rasterizer to disagree about. +2. **Metric rounding variance.** Even with box glyphs, the *metrics* (advance width, ascent, descent, baseline) are computed by the platform font engine and can round differently. This is where the **units-per-em** choice matters. + +## Ahem vs FlutterTest — the verified metrics + +| Font | Ascent | Descent | Units-per-em | Line-gap | +|---|---|---|---|---| +| **FlutterTest** (current default) | 768 (0.75 em) | 256 (0.25 em) | **1024** | 0 | +| **Ahem** (legacy default) | 800 (0.8 em) | 200 (0.2 em) | **1000** | 0 | + +Historically the default was **Ahem**, "designed to show black spaces for every character and icon" — solid boxes filling the em square. Flutter now defaults to **FlutterTest**. + +## The load-bearing detail: a power-of-2 units-per-em + +The decisive difference is not the glyph shape (both are boxes) — it is the em size. FlutterTest's "`1024 units-per-em` is a power of 2, making it less likely to introduce precision loss in metrics calculations, when used as a divisor… FlutterTest generally provides more precise and font-engine-agnostic font/glyph metrics than `Ahem`" ([Flutter-Test-Fonts.md](https://github.com/flutter/flutter/blob/master/docs/contributing/testing/Flutter-Test-Fonts.md)). + +The doc names Ahem's exact failing directly: "with the `Ahem` font you would get slightly different metrics on different platforms, since they use different font engines to scale the font." Ahem's UPM of **1000** is not a power of 2, so dividing by it in metric scaling introduces floating-point precision loss that diverges per engine. 
The power-of-2 em makes the divisions exact (or at least bit-identical across engines), which is what makes the metrics font-engine-agnostic. + +**The takeaway is not "boxes instead of curves" — it is "boxes AND a power-of-2 UPM with pinned ascent/descent."** Boxes kill the curve-rasterization axis; the power-of-2 em kills the metric-rounding axis. You need both for integer-exact, engine-independent layout numbers. + +## Shaped variants + +FlutterTest also ships shaped variants for exercising specific layout cases: **Square**, **"Ascent Flushed,"** **"Descent Flushed,"** and varying x-advance glyphs (**Full**, **1/2**, **1/3**) — all with "no outlines in the glyph." These let tests assert advance-width and baseline behavior with predictable, integer-clean metrics. + +## Implications for Buiy + +This is the determinism knob Buiy should mirror most directly. Ship a `BUIY_TEST_FONT` — an Ahem-style box-glyph font — and make it the default for the cheap, broad tiers (layout-number snapshots through reftests). When choosing/building it: + +- **Pick a power-of-2 units-per-em** (1024, like FlutterTest) and **pin ascent/descent** so glyph metrics are integer-exact. This is the *actual* determinism win — it makes cosmic-text/harfrust shaping + Taffy line-breaking produce byte-identical layout numbers regardless of the host's FreeType/HarfBuzz build, collapsing the font axis for the bulk of text-bearing goldens. +- **Box glyphs are still real coverage.** The glyph atlas still exercises its rasterize/pack/upload path; the *only* variable left is Buiy's own code, not the system font stack. +- **License caveat (unverified).** The original Ahem font ships with WebKit/Blink/Flutter as a permissively-licensed test asset, but its exact redistribution terms were not confirmed against a primary license file in this pass. Confirm before bundling, or generate a clean-room box font. 
+ +Buiy keeps a separate narrow real-font fidelity tier for the cases a box font cannot test (kerning, fallback, emoji) — see [ecosystem-toolkit-alchemist.md](ecosystem-toolkit-alchemist.md) and [lessons.md](lessons.md). + +## Sources + +- Flutter-Test-Fonts.md: https://github.com/flutter/flutter/blob/master/docs/contributing/testing/Flutter-Test-Fonts.md +- Flutter rendering breaking-changes (Ahem → FlutterTest default swap): https://docs.flutter.dev/release/breaking-changes/rendering-changes +- Sibling files: [matches-golden.md](matches-golden.md), [determinism-knobs.md](determinism-knobs.md), [ecosystem-toolkit-alchemist.md](ecosystem-toolkit-alchemist.md), [lessons.md](lessons.md) diff --git a/docs/prior-art/flutter-golden-testing/open-problems.md b/docs/prior-art/flutter-golden-testing/open-problems.md new file mode 100644 index 0000000..a13153a --- /dev/null +++ b/docs/prior-art/flutter-golden-testing/open-problems.md @@ -0,0 +1,35 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** What Flutter's golden system structurally does NOT solve — host-rasterization leakage, the flaky-fix blind spot, the hosted-service floor, and the irreducible color-emoji golden + +# Open problems + +What the Flutter golden ecosystem — even at Google scale, with Gold content-addressing, `goldctl`, per-PR human triage, and Docker pinning — structurally does *not* solve. These are the limits Buiy should not expect any golden tier to exceed. + +## 1. Host-OS rasterization leaks through containers + +Pixel goldens remain platform-sensitive even when the OS is pinned. Issue [#131559](https://github.com/flutter/flutter/issues/131559) (Open, P2): with **one identical Ubuntu Docker image**, generating on a Windows host and verifying on a Mac host yields *"a random smattering of mismatched pixels"* ranging *"from single pixels to 30-90 pixel mismatches."* Host-OS font/AA rasterization leaks through the container boundary. 
**Implication for Buiy:** a Docker image is not a determinism guarantee for the pixel tier; the real fix is upstream (box-glyph font, flat shadows) so the comparison has nothing host-dependent in it. The irreducible pixel residue genuinely needs *one* canonical host, pinned. + +## 2. Skipping a flaky golden makes the fix unverifiable + +When a framework golden flakes it is simply skipped, and "when we skip a test we stop sending to Skia Gold entirely," so there is "no way to verify flaky golden test fixes" short of speculatively un-skipping — "The cost of a mistake is closed tree, P0s, wasted time, and other sadness" ([#111325](https://github.com/flutter/flutter/issues/111325), 2022-09-10). The act of disabling the flake destroys the signal you need to confirm the flake is gone. **Implication for Buiy:** flake at the pixel tier is not just noise — it actively erodes the ability to fix it. Another argument for keeping that tier minimal and deterministic-by-construction rather than fighting flake after the fact. + +## 3. The hosted-service operational floor + +Flutter Gold is a hosted Skia Gold instance on Google Cloud — a GCS bucket plus a frontend, Google-operated. It carries an operational floor: the `flutter-gold` check gets stuck pending on force-push (remedy: *"Try rebasing again. This side-effect is flaky"*), and unapproved-image post-submit failures have historically been mis-reported as flaky. A hosted triage service is a standing cost and a standing dependency. **Implication for Buiy:** for a local-first, offline, MIT/Apache library, this central service is the wrong dependency *and* a flake source. Buiy commits goldens to the repo and reserves a host-pinned local rasterizer for the residue — no hosted service. (For the storage/triage tradeoffs in depth, see the [`skia-gold`](../skia-gold/README.md) folder.) + +## 4. 
Color emoji is the irreducible golden + +Color-emoji (CBDT/CBLC, COLR/CPAL, sbix) rendering is genuinely platform-divergent and unstable even within Flutter — e.g. "Color emoji renders as question mark boxes on iOS simulator (Impeller, Skia removed)" ([#183828](https://github.com/flutter/flutter/issues/183828), closed, filed 2026-03-18). A box-glyph font cannot collapse it, and it resists metric-only assertion (the *point* is the rasterized color bitmap). **Implication for Buiy:** treat Buiy's color-emoji path as the one case that *must* be a real golden screenshot on pinned hardware/font, with generous diff tolerance. Do not try to make it deterministic — accept it as the irreducible residue. + +## 5. The oracle problem persists + +Goldens (like all snapshot tests) assert "matches the blessed image," not "is correct." A wrong-but-blessed golden silently passes forever. Gold's content-addressing makes a triaged-wrong digest *auto-approve* indefinitely. The triage human is the only oracle, and human triage at scale is itself a known fatigue/error source. **Implication for Buiy:** the cheaper tiers (reftests assert *relations* — `render-A == render-B` — that encode a correctness oracle without a blessed image; property invariants assert laws with no oracle at all) are structurally better than goldens here, which is the strategy report's core thesis. Goldens are the last resort, not the first. 
+ +## Sources + +- flutter/flutter#131559 (Docker rasterization leak): https://github.com/flutter/flutter/issues/131559 +- flutter/flutter#111325 (no way to verify flaky golden fixes): https://github.com/flutter/flutter/issues/111325 +- flutter/flutter#183828 (color emoji as tofu on iOS sim; closed, filed 2026-03-18): https://github.com/flutter/flutter/issues/183828 +- Skia Gold service model and storage tradeoffs: [`docs/prior-art/skia-gold/`](../skia-gold/README.md) +- Buiy visual-bug-detection strategy: [`docs/reports/2026-06-14-visual-bug-detection-strategy.md`](../../reports/2026-06-14-visual-bug-detection-strategy.md) diff --git a/docs/prior-art/skia-gold/README.md b/docs/prior-art/skia-gold/README.md new file mode 100644 index 0000000..3079747 --- /dev/null +++ b/docs/prior-art/skia-gold/README.md @@ -0,0 +1,82 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Skia/Chromium Gold + the visual-golden storage & triage ecosystem — the escape hatch for when a golden set explodes + +# Skia Gold & the visual-golden storage/triage ecosystem + +**Skia Gold** is an image-diff *service* (Go backend + Polymer frontend, run on Google Cloud) that the Skia team built to compare images produced by their bots against known baselines. Its defining architectural move — versus committing golden PNGs to a repo — is that **comparison happens in an external service, not on the test machine**: a test produces a PNG, hands it to the `goldctl` client with hardware/software metadata, and goldctl checks whether the image's content hash (its *digest*) is in the list of approved hashes. Match → silent pass with no upload; miss → upload image + metadata to a GCS bucket, exit non-zero, and surface an untriaged image in the triage UI. Baselines live "outside of Git, but in lockstep with Git commits," tagged with open-ended key/value **params** (`OS=Android`, `GPU=Nvidia770GTX`) that turn the OS×GPU×backend matrix into *dimensions of one logical test* rather than N committed files. 
Gold supports **multiple approved images per test** (anti-aliasing nondeterminism makes one-baseline-per-test untenable on GPUs), inexact/fuzzy/Sobel matching for noisy tests, and time-boxed ignore rules for flaky configs. + +This folder treats Gold as Buiy's **storage + triage escape hatch** — the precedent to reach for *when (not before)* a Buiy golden set explodes — and surrounds it with the comparison set Buiy will actually choose from: **reg-suit** (the OSS, commit-hash-keyed, self-hostable reference design), the SaaS triad **Chromatic / Percy / Argos**, and the OSS leaf tools and diff engines (BackstopJS, jest-image-snapshot, pixelmatch, odiff). The bottom-line decision lives in [lessons.md](lessons.md): Buiy should **not** build a Gold-class service; it should build a reg-suit-shaped *local* harness and copy four of Gold's *ideas* — params/traces keying, multi-positive baselines, tunable inexact matching, expiring ignores — without its infrastructure. + +This is the Tier-5 (golden/screenshot) prior-art for [`docs/reports/2026-06-14-visual-bug-detection-strategy.md`](../../reports/2026-06-14-visual-bug-detection-strategy.md). The strategy's whole point is to keep Tier 5 a *minimal residue* by catching most regressions in Tiers 1–4; this folder documents what to do for the residue that genuinely needs stored rasterized images. 
+ +## Key facts + +| Fact | Value | Source | +|---|---|---| +| Gold language / repo | Go (+ Polymer frontend); `github.com/google/skia-buildbot` under `//golden/` (service) and `//gold-client/` (client) | [skia.org skiagold](https://skia.org/docs/dev/testing/skiagold/) | +| Architectural choice | Comparison in an **external service**, not on the test machine | [Chromium Gold doc](https://chromium.googlesource.com/chromium/src/+/HEAD/docs/gpu/gpu_pixel_testing_with_gold.md) | +| Client | `goldctl` — `github.com/google/skia-buildbot/gold-client/cmd/goldctl`, BSD-3-Clause; built from source/CIPD, **no verifiable tagged semver release** | [pkg.go.dev goldctl](https://pkg.go.dev/github.com/google/skia-buildbot/gold-client/cmd/goldctl) | +| `goldctl` subcommands | `auth`, `imgtest` (`init` / `add` / `finalize`), `validate` | [pkg.go.dev goldctl](https://pkg.go.dev/github.com/google/skia-buildbot/gold-client/cmd/goldctl) | +| Storage backend | GCS bucket (bytes) + GCE/k8s frontend (ingest + triage); Google-operated | [skia-buildbot golden README](https://github.com/google/skia-buildbot/blob/main/golden/docs/README.md) | +| Digest | Content hash of a PNG's pixel content (+ possibly colorspace metadata); "digest" ≡ "image" | [flutter-gold help](https://flutter-gold.skia.org/help) | +| Image-digest algorithm | **Unverified** — docs say "hash of pixel content" but do not name it; the MD5 reference is for the `Expectations` struct, NOT the image | flagged below | +| Triage labels | `positive` / `negative` / `untriaged` (binary triage; pass if hash matches *any* positive) | [flutter-gold help](https://flutter-gold.skia.org/help) | +| Multi-positive | One trace/test may have many approved digests (GPU AA nondeterminism) | [flutter-gold help](https://flutter-gold.skia.org/help) | +| Inexact matching | Per-test `matching_algorithm` — Fuzzy (`max_different_pixels`, `pixel_per_channel_delta_threshold`) / Sobel (`edge_threshold`) | [Chromium Gold 
doc](https://chromium.googlesource.com/chromium/src/+/HEAD/docs/gpu/gpu_pixel_testing_with_gold.md) | +| Ignores | Time-boxed (hours-scale) ignore rules keyed by params; gardener-owned | [skia.org skiagold](https://skia.org/docs/dev/testing/skiagold/) | +| Scale framing | "Each commit creates >500k images" | [skia-buildbot golden README](https://github.com/google/skia-buildbot/blob/main/golden/docs/README.md) | +| Verified adopters | Skia, Chromium, PDFium, Flutter framework (`flutter-gold.skia.org`) | [Chromium Gold doc](https://chromium.googlesource.com/chromium/src/+/HEAD/docs/gpu/gpu_pixel_testing_with_gold.md), [skia.org skiagold](https://skia.org/docs/dev/testing/skiagold/) | +| reg-suit | `reg-viz/reg-suit`, npm **0.14.5** (2025-08-26), **MIT**, no SaaS backend | [reg-suit repo](https://github.com/reg-viz/reg-suit) | +| Argos license | **MIT** (self-hostable; no hosting restriction) | [Argos LICENSE](https://github.com/argos-ci/argos/blob/main/LICENSE) | +| odiff | `dmtrKovalenko/odiff`, npm `odiff-bin` **4.3.8**; Zig + SIMD; ~6.67×–7.65× faster than pixelmatch/ImageMagick **on Cypress images** (lower — 5.24×–5.50× — on an 8K image; not a universal speedup) | [odiff README](https://github.com/dmtrKovalenko/odiff/blob/main/README.md) | +| SaaS dollar figures | **All unverified** (vendor pricing pages / secondary comparisons only) | flagged below | + +## Contents + +Each file is independently skimmable with its own `## Sources`. + +**Skia Gold itself** + +- [**gold-architecture.md**](gold-architecture.md) — The service model: digests, params/traces, corpus→test grouping, `goldctl` flow, GCS as source of truth, the many-positives model, triage labels + UI, time-boxed ignores, inexact/Sobel matching, adopters, and the "heavy infrastructure" wart. 
+- [**storage-scale.md**](storage-scale.md) — *How* Gold dodges golden-storage explosion: content-addressed digests out of repo + a mutable expectations DB; per-config params instead of N committed files; multi-positive baselines; the git/git-LFS pathology (Screenshotbot's verbatim critique); how the peer tools (reg-suit, Chromatic, Argos, Percy) all converge on out-of-repo + branch-scoped baselines. + +**The comparison set** + +- [**ecosystem-tools.md**](ecosystem-tools.md) — The OSS-vs-SaaS split (OSS owns the diff engine, SaaS owns storage/triage); per-tool deep notes on reg-suit, Chromatic (modes = the explosion engine), Argos (flaky auto-ignore), Percy (carry-forward approvals), BackstopJS, jest-image-snapshot, and the engine layer (pixelmatch / odiff); the full comparison table. + +**Reference** + +- [**open-problems.md**](open-problems.md) — What this ecosystem structurally does *not* solve: the oracle problem, stale-positive accumulation, flake without a manual gardener, cross-machine reproducibility, cost/ops floor, commit-key resolution edge cases. +- [**lessons.md**](lessons.md) — **The consult-this-when-designing decision file.** `## Validates` / `## Avoid` / `## Borrow`. This is where Buiy implications live. +- [**glossary.md**](glossary.md) — System-specific terms: digest, param, trace, corpus, baseline, expectation, positive/negative/untriaged, keygen/publisher plugin, mode, snapshot, carry-forward, fuzzy/Sobel matching, TurboSnap. + +## Reading order + +1. [lessons.md](lessons.md) — the decisions. Start here if you are designing Buiy's Tier-5 harness. +2. [gold-architecture.md](gold-architecture.md) — what Gold actually is, so the lessons have a referent. +3. [storage-scale.md](storage-scale.md) — the storage-explosion problem the whole ecosystem exists to solve. +4. [ecosystem-tools.md](ecosystem-tools.md) — the menu Buiy chooses from (reg-suit is the closest analog). +5. 
[open-problems.md](open-problems.md) — the limits, so Buiy doesn't expect the tier to do more than it can. +6. [glossary.md](glossary.md) — reference when a term is unclear. + +## How to use + +**Framing disclosure.** These docs are written from Buiy's stance — an AccessKit-first, wgpu + Taffy + cosmic-text, parallel-to-bevy_ui retained-mode engine building a reftests-first layered visual-bug-detection strategy. The "Implications for Buiy" / lessons framing reads Skia/Chromium Gold + the visual-golden storage & triage ecosystem through that lens; readers auditing whether that strategy is itself right should weigh the corpus accordingly — it is a learn-from artifact, not a neutral catalog. + +**Corpus-specific framing.** This corpus is written from the stance that **Buiy lands in the OSS camp (Rust, offline-first, MIT/Apache, no SaaS), and Tier 5 is a deliberately-minimal residue**. "Implications for Buiy" lines therefore lean toward reg-suit's self-hostable shape and treat the SaaS tools mostly as cautionary baseline-multiplication and cost evidence. A reader evaluating whether Buiy should adopt a *hosted* triage UI at all — or whether the golden tier is worth building before the pyramid's cheaper tiers are exhausted — should weigh the corpus accordingly. The strategy report's own thesis is that Tiers 1–4 shrink this tier to almost nothing; if that holds, much of Gold's machinery is moot for Buiy. + +**Why "minimal" matters — an order-of-magnitude on the matrix.** Buiy's own `(widget × state × theme × viewport × backend × dpr)` key schema fans out fast. A rough count — say 40 widgets × 4 states × 2 themes — is already ~320 cells *before* any rendering axis; cross with 3 viewports it is ~1k, and a full fan-out over 4 backends (CPU/Vulkan/GL/Metal) × 2 dpr lands near **~7–8k goldens** for a modest v1 catalog. 
That is the Chromatic "modes" multiplication (see [ecosystem-tools.md](ecosystem-tools.md)) made concrete for Buiy, and it is the number that makes "keep Tier 5 minimal" a quantitative discipline, not a slogan: every widget/state pair pushed *down* to a deterministic structured snapshot or reftest removes a whole backend×dpr column of stored pixels. (Counts are illustrative, not committed; they exist to size the decision, and the lean cut — one backend pinned, one dpr — is ~8× smaller, i.e. back at the ~1k cells counted before the 4-backend × 2-dpr fan-out.) + +## Sources + +- Skia Gold docs: https://skia.org/docs/dev/testing/skiagold/ +- Chromium GPU Pixel Testing With Gold: https://chromium.googlesource.com/chromium/src/+/HEAD/docs/gpu/gpu_pixel_testing_with_gold.md +- Flutter Gold help: https://flutter-gold.skia.org/help +- skia-buildbot golden README: https://github.com/google/skia-buildbot/blob/main/golden/docs/README.md +- goldctl on pkg.go.dev: https://pkg.go.dev/github.com/google/skia-buildbot/gold-client/cmd/goldctl +- reg-suit: https://github.com/reg-viz/reg-suit +- Argos LICENSE (MIT): https://github.com/argos-ci/argos/blob/main/LICENSE +- odiff: https://github.com/dmtrKovalenko/odiff +- Buiy visual-bug-detection strategy: [`docs/reports/2026-06-14-visual-bug-detection-strategy.md`](../../reports/2026-06-14-visual-bug-detection-strategy.md) +- Per-file `## Sources` sections cite the specific URLs each file relies on. diff --git a/docs/prior-art/skia-gold/ecosystem-tools.md b/docs/prior-art/skia-gold/ecosystem-tools.md new file mode 100644 index 0000000..f993f62 --- /dev/null +++ b/docs/prior-art/skia-gold/ecosystem-tools.md @@ -0,0 +1,67 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** The visual-testing tool ecosystem (comparison set) — reg-suit, Chromatic, Argos, Percy, BackstopJS, jest-image-snapshot, and the diff-engine layer (pixelmatch / odiff) + +# The visual-testing tool ecosystem + +This is the OSS+SaaS comparison set Buiy's strategy treats as the "storage + triage escape hatch" precedent. 
The split is sharp: **OSS tools own the diff engine and leave storage/triage to you; SaaS tools own storage/triage and rent you the diff.** Buiy will likely land in the OSS camp (Rust, no SaaS), so the reg-suit "plugin-the-storage" model and the odiff diff engine are the closest analogs. See [storage-scale.md](storage-scale.md) for why all of them reject committed files, and [lessons.md](lessons.md) for the Buiy decision. + +## reg-suit — the self-hostable, commit-hash-keyed reference design + +reg-suit (`reg-viz/reg-suit`, npm **0.14.5**, published **2025-08-26**, verified via `npm view`; **MIT**) is the most architecturally relevant: it has **no SaaS backend**. It is a plugin host with three plugin categories ([README](https://github.com/reg-viz/reg-suit/blob/master/README.md)): + +- **Key-generator plugins** answer "what commit should I compare to?" `reg-keygen-git-hash-plugin` (v0.14.5) "detects automatically the parent's commit which is the source of the topic branch" by walking the git branch graph, and uses that commit's snapshot as the expected baseline ([keygen README](https://github.com/reg-viz/reg-suit/blob/master/packages/reg-keygen-git-hash-plugin/README.md)). `reg-simple-keygen-plugin` allows arbitrary string keys. **Wart:** the keygen special-cases merge commits ("if your topic branch has the merge commit from the parent branch, this plugin uses this merge commit hash as the expected snapshot key") and the README is thin on rebased-branch / multi-parent edge cases — commit-key resolution is the part that breaks in practice ([open-problems.md](open-problems.md)). +- **Publisher plugins** are the storage layer: `reg-publish-s3-plugin` / `reg-publish-gcs-plugin` fetch the previous (expected) snapshots from object storage, then push current snapshots + the HTML report back, keyed by the generated hash. Config uses runtime placeholder substitution, e.g. `"bucketName": "$S3_BUCKET_NAME"`. 
+- **Notifier plugins** (GitHub, GitLab, Slack, Chatwork) post commit status / PR comments. + +Diff engine is **x-img-diff-js** (structural, not just pixel; OpenCV-via-WebAssembly), gated on `core.ximgdiff`, with `thresholdRate` (0–1 ratio) and `thresholdPixel` (absolute) knobs, plus `matchingThreshold` (YUV distance). The `reg-cli`/reg-suit output is a **static HTML report** (expected/actual/diff) generated locally — no server. ximgdiff mode overlays structural diffs: cyan = matched, red = changed, purple = unmatched keypoints. **Key wart:** the README has no first-class "approve" command — triage is the HTML report plus the next commit's snapshot *becoming* the new baseline. **There is no durable per-image accept ledger; acceptance is implicit in git history.** That is exactly the "golden set explodes, triage is manual" failure mode Buiy's strategy aims to avoid — but the keygen+publisher split is still the cleanest OSS pattern for "object storage keyed by commit." + +## Chromatic — baselines multiplied by "modes" (the explosion engine) + +Chromatic (SaaS, Storybook-native) stores baselines server-side and compares each build against the last *approved* build. Its defining mechanic is **Story Modes**: combinations of globals (viewport, theme, locale) saved as a named "mode" via the `chromatic.modes` parameter. "These modes are treated separately, with independent baselines and distinct approvals" — two stories × two modes = **four** independently-approved tests ([Modes docs](https://www.chromatic.com/docs/modes/)). This is the literal baseline-multiplication Buiy must budget for. Billing unit is the **snapshot** (one story × one browser × one viewport); a story in 3 browsers × 3 viewports = 9 snapshots. **TurboSnap** uses git + dependency-graph analysis to re-snapshot only changed stories, billing copied snapshots at **1/5 rate** ([TurboSnap docs](https://www.chromatic.com/docs/turbosnap/)). 
Pricing (free tier 5,000 snapshots/mo, Pro from **$149/mo** for 35,000) is a **vendor-page figure, unverified** — confirm against [chromatic.com/pricing](https://www.chromatic.com/pricing). + +## Argos — git-history baselines + flaky auto-ignore + +Argos (`argos-ci/argos`, **MIT** — self-hostable, no hosting restriction) picks baselines from git: the most recent candidate build with the same build name, all tests passed, not a *subset*, and "whose commit is an ancestor of the merge base between the triggered build's commit and the baseline branch" ([docs/llms-full.txt](https://argos-ci.com/docs/llms-full.txt)). Standout triage feature is **flaky auto-ignore**: "deterministic pixel diffing" runs multiple diff passes at different thresholds plus **pixel clustering** to separate noise from real change; a project setting — "*Minimum occurrences to consider a change flaky (last 7 days)*" — controls "how many times the same change must appear in the last 7 days before Argos starts ignoring it automatically." GitHub integration sets commit status, posts PR summary comments, runs inside the **merge queue**, and can block merges via branch protection ([GitHub docs](https://argos-ci.com/docs/github)). This "minimum occurrences in 7 days → auto-ignore" is the most concrete prior art for taming flaky-golden noise. + +## Percy — branch-scoped "carry-forward" approvals + +Percy (BrowserStack) compares against the **last approved build on the same branchline**, not a fixed golden. Approval is at **snapshot** granularity (you cannot approve an individual browser/width screenshot) and approvals are "carried forward" — identical snapshots are approved once per branch lifetime ([approval docs](https://www.browserstack.com/docs/percy/build-results/approval); [baseline docs](https://www.browserstack.com/docs/percy/visual-testing-workflows/baseline-management/overview)). Pricing reportedly "from $39/mo" — **vendor figure, unverified**. 
Per third-party comparison, Percy's per-screenshot model "scales significantly with volume" (a 10-person / 100k-screenshot team ≈ $5,000/mo) — **secondary source, dollar figure unverified** ([vizzly comparison](https://vizzly.dev/visual-testing-tools-comparison/)). + +## OSS leaf tools + +- **BackstopJS** (`garris/BackstopJS`): Puppeteer (capture) + **ResembleJS** (diff). Three-verb workflow — `generate` / `test` / `approve`, where `approve` overwrites local reference PNGs; HTML report has a before/after scrubber. Storage = local filesystem ([README](https://github.com/garris/BackstopJS/blob/master/README.md)). +- **jest-image-snapshot** (`americanexpress/jest-image-snapshot`): Jest matcher; baselines in `__image_snapshots__/`, accept via `jest -u`. Default engine **pixelmatch** (per-pixel `threshold` default 0.01); experimental **ssim** structural mode "may become the default" ([npm](https://www.npmjs.com/package/jest-image-snapshot)). +- **pixelmatch / odiff**: the engine layer. **odiff** (`dmtrKovalenko/odiff`, npm `odiff-bin` **4.3.8**) — "originally written in OCaml, currently in **Zig** with SIMD (SSE2/AVX2/AVX512/NEON)," same YIQ-NTSC + antialiasing detection as pixelmatch, CLI + Node binding. Benchmarks (hyperfine, from the README's `relative` column): on Cypress screenshots odiff **1.168s** vs pixelmatch **7.712s** (**6.67×**) and ImageMagick **8.881s** (**7.65×**); on an 8K image odiff **1.951s** vs pixelmatch 10.614s (**5.50×**) and ImageMagick 9.326s (**5.24×**) ([README](https://github.com/dmtrKovalenko/odiff/blob/main/README.md)). odiff is the production-grade engine to wrap if Buiy builds Tier-5 goldens, and a Rust SIMD equivalent is a natural fit. **Note on the headline number:** the 6.67×–7.65× figure is Cypress-specific; on the 8K image the speedup is lower (5.24×–5.50×), so the Cypress range is not odiff's universal speedup. The README's own prose rounds this to "6 times faster" — there is no verified "8× faster" claim. 
+ +## Comparison table + +| Tool | Storage model | Baseline keying | Accept workflow | Triage UX | Diff engine | +|---|---|---|---|---|---| +| **reg-suit** | Self-host S3/GCS (publisher plugin) | git-graph parent commit (keygen plugin) | Implicit: next commit's snapshot becomes baseline; **no accept command** | Static HTML report; GitHub PR comment | x-img-diff-js (structural) | +| **Chromatic** | SaaS, server-side | Last approved build; per-**mode** baselines | Per-test approve in web app; modes approved separately | Web UI; TurboSnap skips unchanged | Proprietary (standardized browser) | +| **Argos** | SaaS (MIT, self-hostable) | git merge-base ancestor build | Per-build/test approve in UI; flaky auto-ignore | Web UI; auto-ignore + manual "Ignore" | Deterministic pixel diff + clustering | +| **Percy** | SaaS | Last approved on branchline | Snapshot-granular approve; carried forward per branch | Web review UI | Proprietary | +| **BackstopJS** | Local FS | Path/scenario name | `backstop approve` overwrites refs | HTML report w/ scrubber | ResembleJS | +| **jest-image-snapshot** | Local FS (`__image_snapshots__`) | Test/file name | `jest -u` | None (CI fail + diff PNG) | pixelmatch (default) / ssim | +| **odiff / pixelmatch** | n/a (engine only) | n/a | n/a | n/a | YIQ-NTSC + antialiasing | + +## Takeaways for Buiy + +1. reg-suit's **keygen+publisher plugin split** is the cleanest OSS pattern for "object storage keyed by commit," but its lack of a durable accept ledger is the warning — Buiy needs an explicit accept command that writes the digest into the baseline set. +2. Chromatic "modes" make explicit that **baseline count = stories × viewport × theme × locale × browser**; Buiy's reftest-first strategy is partly a hedge against this multiplication. +3. **odiff** is the engine to wrap (or re-implement in Rust SIMD) if Buiy builds Tier-5 goldens. +4. 
Argos's **"minimum occurrences in 7 days → auto-ignore"** is the most concrete prior art for taming flaky-golden noise (Gold has no such auto-mechanism). + +**Unverified:** all SaaS dollar figures and snapshot quotas (vendor pages / secondary comparisons only). (odiff's speedup figures are verified from the README's hyperfine `relative` column — 6.67×–7.65× on Cypress, 5.24×–5.50× on 8K — and are *not* in the unverified set; a generic "8× faster" claim does not appear in the README.) + +## Sources + +- reg-suit: https://github.com/reg-viz/reg-suit · keygen: https://github.com/reg-viz/reg-suit/blob/master/packages/reg-keygen-git-hash-plugin/README.md · reg-cli: https://github.com/reg-viz/reg-cli · x-img-diff-js: https://github.com/reg-viz/x-img-diff-js +- Chromatic Modes: https://www.chromatic.com/docs/modes/ · TurboSnap: https://www.chromatic.com/docs/turbosnap/ · pricing (unverified): https://www.chromatic.com/pricing +- Argos: https://argos-ci.com/docs/llms-full.txt · GitHub integration: https://argos-ci.com/docs/github · LICENSE (MIT): https://github.com/argos-ci/argos/blob/main/LICENSE +- Percy approval: https://www.browserstack.com/docs/percy/build-results/approval · baseline overview: https://www.browserstack.com/docs/percy/visual-testing-workflows/baseline-management/overview +- BackstopJS: https://github.com/garris/BackstopJS/blob/master/README.md +- jest-image-snapshot: https://www.npmjs.com/package/jest-image-snapshot +- odiff: https://github.com/dmtrKovalenko/odiff/blob/main/README.md +- vizzly comparison (secondary, unverified $): https://vizzly.dev/visual-testing-tools-comparison/ diff --git a/docs/prior-art/skia-gold/glossary.md b/docs/prior-art/skia-gold/glossary.md new file mode 100644 index 0000000..4380085 --- /dev/null +++ b/docs/prior-art/skia-gold/glossary.md @@ -0,0 +1,66 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Skia Gold + visual-golden ecosystem — system-specific terminology glossary + +# Glossary + +System-specific terms 
used throughout this folder. Gold terms first; then the storage/triage concepts shared across the ecosystem; then per-tool terms. Definitions are scoped to how each term is used in the Gold / visual-golden context, not the broader testing literature. + +## Skia Gold core + +- **Gold** — Skia's image-diff **service** (Go backend + Polymer frontend, Google Cloud). Compares bot-produced images against approved baselines in an external service, not on the test machine. Code in `github.com/google/skia-buildbot` under `//golden/`. +- **digest** — The content hash of a PNG's pixel content (possibly + colorspace metadata). Used synonymously with "image." Identity is content-addressed: identical pixels → identical digest. *Hash algorithm not named in public docs — see [open-problems.md](open-problems.md); NOT verified to be MD5.* +- **param** — A key/value pair labeling how a digest was produced, e.g. `OS=Android`, `GPU=Nvidia770GTX`. Open-ended; Gold ingests new params automatically with no pre-registration. +- **trace** — All digests seen for a unique set of params. Rendered in the UI as a line of colored dots (color = digest). Belongs to exactly one test in one corpus. +- **test** — A named visual case; the grouping under a corpus. A trace belongs to one test. +- **corpus** — The top-level grouping above tests. A Gold instance config lists corpora explicitly via `grouping_param_keys_by_corpus`. +- **positive** — Triage label: this digest is acceptable. A test passes if its hash matches *any* approved positive. +- **negative** — Triage label: this digest must not recur; requires a fix. +- **untriaged** — Triage label: a not-yet-classified digest; "generally means a test has started producing different output." +- **baseline / baseline set** — The set of approved (positive) digests for a test under a given revision. On trybots it is "the union of the master baselines for the current revision and any baselines that are unique to the CL." 
+- **expectations** — Gold's mutable database mapping (test, digest) → triage label, stored out-of-Git but in lockstep with Git commits. (The `Expectations` *struct's* hash uses MD5 — distinct from the image digest.) +- **goldctl** — The Gold client ("gold-control"), BSD-3-Clause, built from source/CIPD (no verifiable tagged semver release). Subcommands: `auth`, `imgtest` (`init`/`add`/`finalize`), `validate`. Checks an image's hash against approved hashes; uploads + exits non-zero on miss. +- **multi-positive** — Gold's model allowing many approved digests per test, to absorb GPU anti-aliasing nondeterminism. +- **matching_algorithm** — A per-`PixelTestPage` setting selecting inexact comparison. See Fuzzy / Sobel. +- **Fuzzy matching** — `FuzzyMatchingAlgorithm`: passes when differences are within `max_different_pixels` and `pixel_per_channel_delta_threshold`. +- **Sobel matching** — `SobelMatchingAlgorithm`: applies a Sobel edge filter with `edge_threshold` to mask anti-aliased edges before comparison (`pixel_delta_threshold`). Rationale: skia bug 9527. +- **time-boxed ignore** — An ignore rule for a config that carries an expiry (hours scale), so flaky configs aren't permanently muted. Gardener-owned. *Post-expiry re-activation semantics unverified.* +- **gardener** — The human role that triages untriaged digests, sets ignores, and tunes inexact thresholds. Gold's flake answer is this role + inexact matching, not automation. + +## Shared storage/triage concepts + +- **content-addressed** — Identity derived from the bytes (the hash) rather than a filename or path. The basis of Gold's "instant pass with no upload" behavior. +- **out-of-repo storage** — Keeping image bytes in object storage (S3/GCS) instead of committed to git, with only a key/hash referenced from version control. The ecosystem-wide answer to golden-storage explosion. 
+- **branch-scoped / branchline baseline** — A baseline resolved per branch (Percy) or by walking git ancestry (Argos), rather than a single fixed golden. "A baseline is not a file—it is a decision." +- **carry-forward approval** — Identical snapshots approved once per branch lifetime, then reused (Percy). +- **golden-storage explosion** — The `O(configs × commits)` growth of committed binary baselines, where one font/color change rewrites the whole grid. The problem the whole ecosystem exists to defuse. + +## Per-tool terms + +- **reg-suit** — OSS, self-hostable, plugin-host visual-regression tool (MIT). No SaaS backend. +- **key-generator plugin** — reg-suit plugin answering "what commit do I compare to?" `reg-keygen-git-hash-plugin` walks the git branch graph to find the topic branch's parent commit. +- **publisher plugin** — reg-suit plugin that is the storage layer: fetches expected snapshots from and pushes current snapshots + HTML report to S3/GCS, keyed by the generated hash. +- **notifier plugin** — reg-suit plugin posting commit status / PR comments (GitHub, GitLab, Slack, Chatwork). +- **x-img-diff-js / ximgdiff** — reg-suit's structural diff engine (OpenCV via WebAssembly): cyan = matched region, red = changed, purple = unmatched keypoint. +- **reg-cli** — reg-suit's CLI that generates the local static HTML report (expected/actual/diff). +- **mode (Story Mode)** — Chromatic combination of globals (viewport/theme/locale) saved via `chromatic.modes`. Each mode gets an **independent baseline and distinct approval** — the baseline-multiplication mechanic. +- **snapshot** — The billing/granularity unit: one story × one browser × one viewport (Chromatic); the approval-granularity unit in Percy. +- **TurboSnap** — Chromatic's git + dependency-graph analysis that re-snapshots only changed stories, billing copied snapshots at 1/5 rate. +- **flaky auto-ignore** — Argos feature: after a change recurs ≥ N times in the last 7 days, Argos auto-ignores it. 
Paired with pixel clustering. The closest prior art to automatic flake quarantine (Gold has none). +- **pixelmatch** — JS per-pixel diff engine (YIQ-NTSC + AA detection); default for jest-image-snapshot. +- **ssim** — Structural-similarity diff mode (experimental in jest-image-snapshot). +- **odiff** — `dmtrKovalenko/odiff`, Zig+SIMD diff engine (npm `odiff-bin`), same YIQ + AA detection as pixelmatch, ~6.67×–7.65× faster on Cypress images. The production engine to wrap. +- **ResembleJS** — BackstopJS's diff engine. + +## Sources + +- Skia Gold docs: https://skia.org/docs/dev/testing/skiagold/ +- Chromium GPU Pixel Testing With Gold: https://chromium.googlesource.com/chromium/src/+/HEAD/docs/gpu/gpu_pixel_testing_with_gold.md +- Flutter Gold help: https://flutter-gold.skia.org/help +- skia-buildbot golden README: https://github.com/google/skia-buildbot/blob/main/golden/docs/README.md +- goldctl on pkg.go.dev: https://pkg.go.dev/github.com/google/skia-buildbot/gold-client/cmd/goldctl +- reg-suit: https://github.com/reg-viz/reg-suit +- Chromatic Modes: https://www.chromatic.com/docs/modes/ · TurboSnap: https://www.chromatic.com/docs/turbosnap/ +- Argos GitHub docs: https://argos-ci.com/docs/github +- Percy baseline management: https://www.browserstack.com/docs/percy/visual-testing-workflows/baseline-management/overview +- odiff: https://github.com/dmtrKovalenko/odiff diff --git a/docs/prior-art/skia-gold/gold-architecture.md b/docs/prior-art/skia-gold/gold-architecture.md new file mode 100644 index 0000000..18cc504 --- /dev/null +++ b/docs/prior-art/skia-gold/gold-architecture.md @@ -0,0 +1,72 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Skia Gold — the service model, digest/params/traces data shape, goldctl flow, triage, ignores, and inexact matching + +# Gold architecture + +Skia Gold is an image-diff **service**, not a local library. 
Per Skia's own docs it is "a web application that compares the images produced by our bots against known baseline images," with baselines "managed in Gold outside of Git, but in lockstep with Git commits" ([skia.org/docs/dev/testing/skiagold](https://skia.org/docs/dev/testing/skiagold/)). It is written in Go with a Polymer frontend; code lives in the Skia Infra repo `github.com/google/skia-buildbot` under `//golden/` (service) and `//gold-client/` (client). The one architectural decision that defines it relative to a checked-in golden file: **comparison happens in an external service, not on the test machine** ([Chromium Gold doc](https://chromium.googlesource.com/chromium/src/+/HEAD/docs/gpu/gpu_pixel_testing_with_gold.md)). + +## The data model: digest → params → trace → test → corpus + +- **Digest.** A *digest* is the content hash of a PNG's pixel content — Flutter's help page: "a hash digest of their pixel content (and potentially other metadata like colorspace)"; "the terms digest and image are used synonymously" ([flutter-gold.skia.org/help](https://flutter-gold.skia.org/help)). Identity is content-addressed: identical pixels → identical digest → instant pass with no upload. *The hash algorithm is not named in the public docs; do not assume MD5 — see [open-problems.md](open-problems.md).* +- **Params.** Each uploaded digest is tagged with *params* — "key/value pairs … generally used to label how a digest was produced, for example `OS=Android` or `GPU=Nvidia770GTX`." Params are open-ended: "Gold will automatically identify and process any new params produced by a test." No server-side pre-registration of keys. +- **Trace.** A *trace* is "all digests seen belonging to a unique set of params," rendered in the UI as "lines of colored dots where a color refers to a specific digest." Each trace belongs to exactly one test in one corpus. +- **Corpus → test.** Data is grouped first by **corpus**, then by **test** (Flutter help; Skia README). 
The Gold instance config requires "an explicit list of corpora, specified via the `grouping_param_keys_by_corpus` field" ([golden README](https://github.com/google/skia-buildbot/blob/main/golden/docs/README.md)). + +This is the structural answer to the configuration matrix: OS/GPU/backend are *dimensions of one logical test*, not separate committed files. See [storage-scale.md](storage-scale.md) for how this dodges golden-storage explosion. + +## goldctl — the client flow + +The client is `goldctl` ("gold-control"), import path `github.com/google/skia-buildbot/gold-client/cmd/goldctl`, BSD-3-Clause. pkg.go.dev lists only a synthetic `v0.0.0-…` pseudo-version — goldctl is built from source / distributed via CIPD, **not** released as a tagged semver binary (no tagged release verifiable). Subcommands: `auth`, `imgtest` (with `init` / `add` / `finalize`), `validate` ([pkg.go.dev/.../goldctl](https://pkg.go.dev/github.com/google/skia-buildbot/gold-client/cmd/goldctl)). + +The flow (Chromium docs): a test "produces an image and passes it to `goldctl`, along with some information about the hardware and software configuration … the test name, etc." goldctl "checks whether the hash of the produced image is in the list of approved hashes"; if matched it passes silently, else "`goldctl` uploads the image and metadata to the storage bucket and exits with a failing return code." + +- `imgtest init` sets a work dir + `keys.json` once, so later `add` calls are terse. +- `imgtest add` validates that the test carries every param required by its corpus grouping. +- `imgtest finalize` closes out the run. + +**GCS is the source of truth.** "All data uploaded from tests will live here and be interpreted by Gold" (Skia README). The frontend ingests the bucket: "the server sees the new data … and ingests it, showing a new untriaged image in the GUI" (Chromium docs). 
+ +## The many-positives model (the key distinction from one-baseline goldens) + +"Gold supports multiple approved images per test … any of those images are acceptable." Why: "A trace (or test) is allowed to have multiple positive digests; in practice this happens due to things like nondeterminism in anti-aliasing algorithms for certain GPUs" (Flutter help). Triage classifies a digest: + +- **`positive`** — acceptable. +- **`negative`** — must-not-recur; requires a fix. +- **`untriaged`** — "generally means that a test has started producing different output." + +A test **passes if its output hash matches *any* approved positive**. On the waterfall there is one baseline set; on trybots the baseline is "the union of the master baselines for the current revision and any baselines that are unique to the CL," so reviewers can triage before landing (Chromium docs). Triage is immediate — once triaged, a digest is available for future runs without a CL round-trip. + +## Triage UI + +Digests are "automatically compared to another digest from the same test; in fact, the most similar digest," with zoom and a `u` shortcut to jump to the largest pixel difference (Flutter help). The frontend is the thing Buiy would *not* build — see the local-HTML-report alternative in [lessons.md](lessons.md) and [ecosystem-tools.md](ecosystem-tools.md) (reg-cli). + +## Ignore rules, including time-boxed + +The Ignores view lets you "create a new, short-interval (hours) ignore for the most affected configuration(s)" — ignores carry an expiry so flaky configs don't get permanently muted ([Skia docs](https://skia.org/docs/dev/testing/skiagold/)). 
*Caveat: the docs describe hours-scale time-boxing but do not specify exact post-expiry re-activation semantics — unverified from primary sources.* Note also: Gold's answer to flakiness is **inexact matching + manual time-boxed ignores**, NOT an automatic flake-quarantine mechanism (no such mechanism is documented for Gold — Argos has one; see [ecosystem-tools.md](ecosystem-tools.md)). + +## Inexact-matching escape hatch + +For noisy tests, admins set a `matching_algorithm` on the `PixelTestPage`: + +- **Fuzzy** — `max_different_pixels` + `pixel_per_channel_delta_threshold` (e.g. `=2`): pass on "only minor differences" instead of exact-hash equality. +- **Sobel** — adds `edge_threshold` to mask anti-aliased edges before comparison (`pixel_delta_threshold` e.g. `=30`); rationale in [skia bug 9527](https://groups.google.com/a/skia.org/g/bugs/c/uLPDZS_hKYQ). + +Gold ships `determine_gold_inexact_parameters.py` with `binary_search` / `local_minima` optimizers to *tune* these per test — the borrowable idea is **tunable per-test tolerances**, not hand-picked global thresholds. The very existence of this escape hatch acknowledges that pure content-addressing is brittle under GPU nondeterminism. + +## Adopters (verified) + +Skia, Chromium, PDFium, and the Flutter framework (`flutter-gold.skia.org`) all run Gold instances. Both Chromium and Skia docs note Gold "was originally developed for Skia's usage but has been adopted by other projects such as Chromium and PDFium." + +## The wart worth flagging for Buiy + +Gold is heavy infrastructure — a GCS bucket + a GCE/k8s frontend + per-corpus config + a human triage queue. It is the escape hatch *when a golden set explodes*, not a cheap default. Its own scale framing ("Each commit creates >500k images," Skia README) shows it is built for an org that has already accepted golden-screenshot triage as a standing cost. Buiy's strategy is the opposite: keep Tier 5 minimal so this machinery is never needed. 
See [lessons.md](lessons.md) `## Avoid`. + +## Sources + +- Skia Gold docs: https://skia.org/docs/dev/testing/skiagold/ +- Chromium GPU Pixel Testing With Gold: https://chromium.googlesource.com/chromium/src/+/HEAD/docs/gpu/gpu_pixel_testing_with_gold.md +- Flutter Gold help: https://flutter-gold.skia.org/help +- skia-buildbot golden README: https://github.com/google/skia-buildbot/blob/main/golden/docs/README.md +- goldctl on pkg.go.dev: https://pkg.go.dev/github.com/google/skia-buildbot/gold-client/cmd/goldctl +- Sobel filter rationale (skia bug 9527): https://groups.google.com/a/skia.org/g/bugs/c/uLPDZS_hKYQ diff --git a/docs/prior-art/skia-gold/lessons.md b/docs/prior-art/skia-gold/lessons.md new file mode 100644 index 0000000..6d9187a --- /dev/null +++ b/docs/prior-art/skia-gold/lessons.md @@ -0,0 +1,74 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Lessons for Buiy — the storage+triage escape hatch: which choices it validates, which traps to avoid, which primitives to borrow + +# Lessons for Buiy + +This is the consult-this-when-designing file. The other files are evidence; this file is decisions. Scope: the **storage + triage escape hatch** — what to copy from the visual-golden ecosystem when (not before) a Buiy golden set explodes. Comparison set: Skia/Chromium **Gold** (the Go service), **reg-suit** (OSS, commit-keyed object store), and the SaaS triad **Chromatic / Percy / Argos**. + +**Bottom line up front.** Buiy should **not** build a Gold-class service. Build a **reg-suit-shaped *local* harness**, and copy four of Gold's *ideas* — params/traces keying, multi-positive baselines, tunable inexact matching, expiring ignores — not its infrastructure. 
And remember the meta-point from [`docs/reports/2026-06-14-visual-bug-detection-strategy.md`](../../reports/2026-06-14-visual-bug-detection-strategy.md): Tier 5 is a *minimal residue*; if the cheaper tiers (structured snapshots, metamorphic invariants, reftests, CPU-vs-GPU cross-check) do their job, much of this folder is contingency, not roadmap. + +## Validates + +These Buiy strategy choices are confirmed by the ecosystem's experience: + +- **Keep Tier 5 minimal; don't make pixels the default.** Gold's own scale framing ("each commit creates >500k images") and its heavy infrastructure exist *because* Skia/Chromium accepted golden triage as a standing cost. Buiy's pyramid — pushing detection down to deterministic structured snapshots and relational reftests — is validated by every limit in [open-problems.md](open-problems.md): the oracle, flake, and reproducibility problems are all irreducible at the golden tier and tractable at cheaper ones. +- **Store goldens out-of-repo, keyed, not committed as files.** The whole ecosystem (Gold's GCS digests, reg-suit's S3/GCS publisher, Argos/Percy's git-history baselines) rejects committed PNGs. Screenshotbot's verbatim git-LFS critique ([storage-scale.md](storage-scale.md)) names the `O(configs × commits)` pathology Buiy must not recreate. When Buiy graduates past in-repo goldens, the commit-keyed object store is the right first step. +- **A `(widget, state, theme, viewport, backend, dpr)` key schema is the right shape.** It maps directly onto Gold's params-and-traces model — that *is* the trace identity. Validates the strategy report's combinatorial-surface framing. +- **Set-valued baselines per test, not one-baseline.** Gold "supports multiple approved images per test … not uncommon for tests to produce images that are visually indistinguishable, but differ in a handful of pixels." For a GPU library this is essential, and it validates Buiy treating a key as mapping to a *set* of accepted digests. 
+- **A curated `--accept` / structured-snapshot precedent already exists in-repo.** Buiy's shaping snapshots (`tests/text_shaping_snapshots.rs` with `BUIY_ACCEPT_SHAPING=1`) are an in-repo precedent for both the accept model and structured snapshots — exactly the durable accept ledger reg-suit lacks. Validates extending that pattern rather than importing reg-suit's implicit-in-git-history acceptance. + +## Avoid + +| Pitfall | Source | Buiy stance | +|---|---|---| +| Standing up a Gold-class service (GCS bucket + GCE/k8s frontend + per-corpus config + triage queue) | [gold-architecture.md](gold-architecture.md) | Gross overkill for Buiy. Gold is right for Chromium/Flutter's thousand-config matrix and "500k images/commit." Do **not** build a database-backed trace store + web frontend until the golden set's scale actually demands it — and the pyramid is designed so it never does. | +| Treating reg-suit's implicit-in-git-history acceptance as sufficient | [ecosystem-tools.md](ecosystem-tools.md) | reg-suit has **no first-class approve command** and **no durable per-image accept ledger** — acceptance is "the next commit's snapshot becomes the baseline." That is the "golden set explodes, triage is manual" failure mode. Buiy must ship an explicit accept command that writes the new digest into a stored baseline set (the shaping-snapshot pattern). | +| Naive commit-key resolution | [ecosystem-tools.md](ecosystem-tools.md), [open-problems.md](open-problems.md) | reg-suit's keygen special-cases merge commits and is thin on rebased / multi-parent / squash histories. Commit-key resolution is the part that breaks. If Buiy keys by commit, design the rebase/squash/merge edge cases up front and keep a durable accept ledger as a second source of truth. | +| Hand-picked global pixel thresholds | [gold-architecture.md](gold-architecture.md) | Gold ships `determine_gold_inexact_parameters.py` (binary-search / local-minima) precisely because global thresholds fail. 
Buiy should make tolerances **tunable per test/fixture**, not a single global L1/RMSE cutoff. (The strategy report's two-axis fuzzy model is the metric; this is the per-fixture-tolerance discipline.) | +| Expecting multi-positive to be free | [storage-scale.md](storage-scale.md), [open-problems.md](open-problems.md) | Multi-positive baselines accumulate **stale positives silently** — a real regression can match an old wrong positive. If Buiy adopts set-valued baselines, also design pruning / aging, or the set drifts into "everything anyone ever blessed." | +| Assuming a golden harness solves flake | [open-problems.md](open-problems.md) | Gold mitigates flake with inexact matching + manual time-boxed ignores owned by a gardener — it does not eliminate it, and has **no automatic flaky-quarantine** (that's Argos). Buiy must not promise a low-flake golden tier; the flake is held at bay by ongoing labor unless detection moves to deterministic tiers. | +| Coupling the metric to the existing naive L1/RMSE | strategy report §4, [ecosystem-tools.md](ecosystem-tools.md) | Buiy's two naive metrics (`golden.rs` L1, `visual.rs` RMSE) have no AA-exclusion and no two-axis budget — they cannot express Mozilla-style `fuzzy(d_lo-d_hi, p_lo-p_hi)` or Gold's Sobel edge-masking. Fix the metric before building goldens *or* reftests; wrap odiff's YIQ + AA-detection rather than re-deriving a naive engine. | +| Asserting the image digest is MD5 | [storage-scale.md](storage-scale.md), [open-problems.md](open-problems.md) | **Unverified.** Public docs say "hash of pixel content" but do not name the algorithm; the MD5 reference is for the `Expectations` struct, not the image. Don't cite MD5 for image content-addressing in any Buiy doc. | +| Quoting SaaS dollar figures as fact | [ecosystem-tools.md](ecosystem-tools.md) | Chromatic ($149/mo), Percy ($39/mo, ≈$5,000/mo at 100k screenshots) are **vendor-page / secondary, unverified**. 
For an offline-first MIT/Apache project these tools are disqualified on cost + dependency grounds regardless; don't lean on specific numbers. | + +## Borrow + +Concepts to copy into a *local* harness (licenses align — reg-suit and Argos are MIT, matching Buiy's dual license): + +1. **The commit-keyed object store** (from reg-suit's keygen+publisher split). A content-addressed bucket: local dir → optional S3/GCS, keyed by commit hash, with the baseline fetched as "the parent commit's snapshot." This is the cleanest OSS "storage keyed by commit" pattern. Borrow the *shape*; supply the missing durable accept ledger. + +2. **The params/traces key schema** (from Gold). A digest is tagged with key/value params; a trace is the unique param set. Fix Buiy's `(widget, state, theme, viewport, backend, dpr)` schema **before generating any goldens** — retrofitting keys means re-baselining everything. Reserve `backend` to enumerate CPU/Vulkan/GL/Metal and `dpr` to be numeric. Copy the **schema concept**, not a CLI contract — `goldctl`'s exact key-passing flags are not in the public docs. + +3. **Set-valued baselines (multi-positive)** (from Gold). A key maps to a *set* of accepted digests, not one — essential for GPU AA nondeterminism. Borrow with stale-positive pruning attached (see Avoid). + +4. **Tunable inexact + Sobel-style edge masking** (from Gold). `FuzzyMatchingAlgorithm` (`max_different_pixels`, `pixel_per_channel_delta_threshold`) and `SobelMatchingAlgorithm` (`edge_threshold` to mask anti-aliased edges before comparison). Copy the idea of **tunable per-test tolerances** and edge-masking, optimized rather than hand-picked. + +5. **Time-boxed ignore rules keyed by params** (from Gold). Ignores carry an expiry (hours-scale) so flaky configs aren't permanently muted, owned by a gardener role. Borrow the **expiring-ignore primitive**. 
Note: "flaky-auto-ignore" is a pattern to *design* (anchored on Argos's "minimum occurrences in 7 days → auto-ignore"), not one Gold offers — Gold's answer is inexact match + manual ignores. + +6. **The local self-contained HTML diff report** (from reg-cli / x-img-diff-js), as the alternative to a hosted triage UI. Emit one self-contained HTML file per run (expected/actual/diff, ideally with structural overlay: matched/changed/unmatched), openable from CI artifacts. Triage = a human eyeballs it and runs an accept command that writes the new digest into the baseline set. This is the right altitude — never stand up Gold's web frontend. + +7. **The odiff diff engine** (from `dmtrKovalenko/odiff`). Production-grade Zig+SIMD YIQ-NTSC + AA-detection; ~6.67×–7.65× faster than pixelmatch/ImageMagick on Cypress images. Wrap it, or re-implement the algorithm in Rust SIMD (a natural fit), rather than shipping the existing naive L1/RMSE. + +8. **Argos's flaky-occurrence heuristic** (from Argos). "Minimum occurrences to consider a change flaky (last 7 days)" + pixel clustering is the most concrete prior art for auto-taming flaky-golden noise if Buiy ever needs it. + +## SaaS comparison — why not buy + +**Argos** is genuinely **MIT** with no hosting restriction — the one self-hostable option if Buiy ever wants a hosted-style UI without vendor lock-in. **Chromatic** and **Percy** are per-snapshot-priced SaaS (figures unverified; Percy "scales significantly with volume"); for an offline-first MIT/Apache project these are disqualifying on cost and dependency grounds. **Net guidance:** copy reg-suit's commit-keyed store + local HTML report; copy Gold's params/traces + multi-positive + tunable inexact-match + expiring ignores as *concepts*; defer (do not build) a Gold service until the golden set's scale genuinely demands a database-backed trace store — which, if the pyramid holds, it won't. 
+ +## How to use this file + +When designing Buiy's Tier-5 (or Tier-4 reftest) harness, locate the relevant Avoid row to understand the trap, then the matching Borrow item for the primitive to adopt. Promote any decision into the `buiy-verification-design` spec under `docs/specs/` — this file captures what we learn from the golden ecosystem, not Buiy's own commitments. Re-verify versions and any SaaS facts against live sources before lifting concrete details. + +## Sources + +- Skia Gold docs: https://skia.org/docs/dev/testing/skiagold/ +- Chromium GPU Pixel Testing With Gold: https://chromium.googlesource.com/chromium/src/+/HEAD/docs/gpu/gpu_pixel_testing_with_gold.md +- Flutter Gold help: https://flutter-gold.skia.org/help +- reg-suit: https://github.com/reg-viz/reg-suit · keygen: https://github.com/reg-viz/reg-suit/blob/master/packages/reg-keygen-git-hash-plugin/README.md · reg-cli: https://github.com/reg-viz/reg-cli · x-img-diff-js: https://github.com/reg-viz/x-img-diff-js +- Argos LICENSE (MIT): https://github.com/argos-ci/argos/blob/main/LICENSE +- odiff: https://github.com/dmtrKovalenko/odiff/blob/main/README.md +- Sobel filter rationale (skia bug 9527): https://groups.google.com/a/skia.org/g/bugs/c/uLPDZS_hKYQ/m/_7uliqajCAAJ +- vizzly pricing comparison (secondary, unverified $): https://vizzly.dev/visual-testing-tools-comparison/ +- Buiy visual-bug-detection strategy: [`docs/reports/2026-06-14-visual-bug-detection-strategy.md`](../../reports/2026-06-14-visual-bug-detection-strategy.md) +- Sibling files: [gold-architecture.md](gold-architecture.md), [storage-scale.md](storage-scale.md), [ecosystem-tools.md](ecosystem-tools.md), [open-problems.md](open-problems.md), [glossary.md](glossary.md) diff --git a/docs/prior-art/skia-gold/open-problems.md b/docs/prior-art/skia-gold/open-problems.md new file mode 100644 index 0000000..8d2c9c7 --- /dev/null +++ b/docs/prior-art/skia-gold/open-problems.md @@ -0,0 +1,48 @@ +**Date:** 2026-06-14 
+**Status:** active +**Subject:** What the visual-golden storage/triage ecosystem structurally does NOT solve + +# Open problems + +What Gold, reg-suit, and the SaaS triad **structurally do not solve**, no matter how well-operated. These are the limits Buiy should not expect a Tier-5 golden harness to overcome. + +## 1. The oracle problem is not solved — only deferred to a human + +None of these tools know what *correct* looks like. A golden/expectation only encodes "this matched a human-approved image at some point." The first time a test runs, every image is `untriaged` and someone must *decide*. Gold's triage UI, reg-suit's HTML report, and the SaaS review screens all make the human decision faster, but none remove it. A regression that a human approves by mistake becomes the new truth. This is why Buiy's strategy pushes detection *down* to Tiers 1–4 (structured snapshots, metamorphic invariants, reftests), which assert *relations* and need no per-image human oracle. Gold is the tier where the oracle problem is irreducible. + +## 2. Stale positives accumulate silently + +Gold's multi-positive model (a test passes if it matches *any* approved digest) has no garbage collection: an approved digest that no live config produces anymore stays approved forever. Over time the positive set drifts from a curated "these are the acceptable renders" into "everything anyone ever blessed." A real regression that happens to match an old, now-wrong positive passes silently. Nothing in the documented Gold workflow prunes this; it is operational debt the gardener must manage by hand. + +## 3. Flake is mitigated, not eliminated — and the mitigations require a standing human role + +Gold's flaky answer is **inexact matching (Fuzzy/Sobel) + manual time-boxed ignores**, both owned by a gardener. There is **no documented automatic flaky-auto-quarantine in Gold** (unlike Argos's "minimum occurrences in 7 days → auto-ignore"). 
Inexact thresholds must be tuned per test (Gold ships `determine_gold_inexact_parameters.py` precisely because hand-picking fails), and ignores expire on an hours scale, so a chronically-flaky config needs repeated human attention. The flake never goes away; it is held at bay by ongoing labor. *Unverified: exact post-expiry re-activation semantics of time-boxed ignores are not specified in primary docs.* + +## 4. Cross-machine pixel reproducibility is out of scope + +Gold's entire params/traces design is an admission that **the same scene rasterizes differently across OS / GPU / driver / AA setting**, and it copes by treating each config as a separate trace plus allowing multiple positives. It does not make the pixels reproducible — it *catalogs* the irreproducibility. For Buiy, whose least-deterministic artifact is the pixel (FP non-associativity, FMA contraction, `fwidth` derivatives, sRGB encode on GPU write — see [`docs/reports/2026-06-14-visual-bug-detection-strategy.md`](../../reports/2026-06-14-visual-bug-detection-strategy.md)), this means a Gold-style harness inherits a baseline-per-backend×dpr explosion that no amount of triage tooling collapses. + +## 5. The cost/ops floor is high and not amortizable away + +Gold is a GCS bucket + a GCE/k8s frontend + per-corpus config + a human triage queue — heavy standing infrastructure justified only at the scale of "each commit creates >500k images" ([golden README](https://github.com/google/skia-buildbot/blob/main/golden/docs/README.md)). The self-hosted cost/ops figure for a small project is **unverified** (the Skia/Chromium backends are Google-operated). The SaaS alternatives convert capital cost into per-snapshot billing that "scales significantly with volume" (Percy ≈ $5,000/mo for a 100k-screenshot team per a secondary, **unverified** comparison). Either way the golden tier carries a cost floor that the cheaper pyramid tiers do not. + +## 6. 
Commit-key resolution is the part that breaks in the OSS reference design + +reg-suit's `reg-keygen-git-hash-plugin` walks the branch graph to find "the parent's commit which is the source of the topic branch" and special-cases merge commits, but the README is thin on rebased branches, squash-merges, and multi-parent histories. When the keygen picks the wrong parent, the "expected" baseline is wrong and every comparison is noise. There is no durable accept ledger to fall back on (acceptance is implicit in git history), so a mis-resolved key has no second source of truth. This is the concrete fragility Buiy inherits if it copies the commit-keyed-store pattern naively. + +## 7. Structural diff still misses semantic intent + +Even reg-suit's structural x-img-diff (matched/changed/unmatched keypoints) and Argos's clustering only describe *where pixels moved*, not *whether the change was intended*. "The button is now blue" and "the button regressed to blue" produce identical diffs. The tools surface the change; only the pyramid's higher tiers (token-set snapshots, reftests asserting `==`/`!=` relations) can encode *intent* without a human in the loop. + +## Implications for Buiy + +These seven limits are the case *for* the strategy report's pyramid: every problem here is irreducible at the golden tier and tractable at a cheaper one. Buiy should (a) keep Tier 5 a minimal residue, (b) never expect a golden harness to solve the oracle/flake/reproducibility problems, and (c) if it does build the residue, copy the *concepts* (params/traces, multi-positive, tunable inexact match, expiring ignores) without the standing service. See [lessons.md](lessons.md). 
+ +## Sources + +- skia-buildbot golden README: https://github.com/google/skia-buildbot/blob/main/golden/docs/README.md +- Skia Gold docs: https://skia.org/docs/dev/testing/skiagold/ +- Chromium GPU Pixel Testing With Gold: https://chromium.googlesource.com/chromium/src/+/HEAD/docs/gpu/gpu_pixel_testing_with_gold.md +- reg-keygen-git-hash-plugin: https://github.com/reg-viz/reg-suit/blob/master/packages/reg-keygen-git-hash-plugin/README.md +- vizzly comparison (secondary, unverified $): https://vizzly.dev/visual-testing-tools-comparison/ +- Buiy visual-bug-detection strategy: [`docs/reports/2026-06-14-visual-bug-detection-strategy.md`](../../reports/2026-06-14-visual-bug-detection-strategy.md) diff --git a/docs/prior-art/skia-gold/storage-scale.md b/docs/prior-art/skia-gold/storage-scale.md new file mode 100644 index 0000000..90e9d8b --- /dev/null +++ b/docs/prior-art/skia-gold/storage-scale.md @@ -0,0 +1,63 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** How the visual-golden ecosystem dodges golden-storage explosion — content-addressed digests out of repo, per-config params, multi-positive baselines, and the git-LFS pathology + +# Storage & scale: dodging golden-storage explosion + +The whole ecosystem exists to answer one problem: a naive golden suite stores an `O(configs × commits)` matrix of binary PNGs in version control, where a single font or color tweak rewrites the entire grid. This file documents the techniques that defuse it. + +## Gold's core move: stop treating goldens as files + +Skia Gold stops treating goldens as *files* and starts treating them as **content-addressed digests stored out-of-repo, with a separate, mutable expectations database**. Per Flutter Gold help, "Images uploaded to Gold are uniquely identified by a hash digest of their pixel content (and potentially other metadata like colorspace)." 
The bytes live in cloud storage, not git: a Gold instance "consists of two parts: a Google Storage bucket that data is uploaded to and a server running on GCE that ingests the data and provides a way to triage diffs," and `goldctl` "checks whether the hash of the produced image is in the list of approved hashes" — if absent it uploads the image + metadata and exits non-zero ([Chromium Gold doc](https://chromium.googlesource.com/chromium/src/+/HEAD/docs/gpu/gpu_pixel_testing_with_gold.md)). The git repo therefore stores **zero PNGs**; approval state is "managed in Gold outside of Git, but in lockstep with Git commits" ([skia.org skiagold](https://skia.org/docs/dev/testing/skiagold/)). See [gold-architecture.md](gold-architecture.md) for the full data shape. + +## Pillar 1: per-config expectations via params/traces + +Every digest is tagged with key/value **params** ("These keys … are generally used to label how a digest was produced, for example `OS=Android` or `GPU=Nvidia770GTX`"), and a **trace** is "all digests seen belonging to a unique set of params" ([flutter-gold help](https://flutter-gold.skia.org/help)). This is the structural answer to the OS/GPU matrix: configurations are *dimensions of one logical test*, not N separate committed files. Skia tests "across a range of dimensions, e.g.: OS (Windows, Linux, Mac, Android, iOS), Architectures (Intel, ARM), Backends (CPU, OpenGL, Vulkan etc.)" ([skia.org skiagold](https://skia.org/docs/dev/testing/skiagold/)). Chromium indexes reference data "by version number, OS, GPU vendor, GPU device, and whether or not antialiasing is enabled" (Chromium Gold doc). New params need **no** server-side pre-config; Gold ingests arbitrary keys automatically. + +**Implications for Buiy.** Buiy's proposed `(widget, state, theme, viewport, backend, dpr)` key schema maps directly onto Gold's params-and-traces model — that *is* the trace identity. 
Fix the key schema *before* generating any goldens; retrofitting keys means re-baselining everything. Reserve `backend` to enumerate CPU/Vulkan/GL/Metal and `dpr` to be numeric — the axes a GPU UI library fans out on. (Lesson detail in [lessons.md](lessons.md).) + +## Pillar 2: multi-positive baselines + +The second pillar keeps legitimate variants from reddening the suite: "A trace (or test) is allowed to have multiple positive digests; in practice this happens due to things like nondeterminism in anti-aliasing algorithms for certain GPUs" (Flutter help). Triage is binary per digest — `positive` = acceptable, `negative` = needs a fix — but a test **passes if its output hash matches *any* approved positive**. Chromium frames this as the explicit reason it left fuzzy/threshold matching: with 2–3 valid variants, "being able to say that any of those images are acceptable is simpler and less error-prone." Triage is also immediate: "new golden images don't need to go through the CQ … Once an image is triaged in Gold, it becomes immediately available for future test runs" — versus a committed-baseline workflow that needs a CL round-trip. + +**Cost of the pillar.** Multi-positive means **stale positives accumulate silently** — nothing prunes an approved digest that no config produces anymore. See [open-problems.md](open-problems.md). + +## The contrast: committing PNGs / git-LFS + +Chromium's predecessor stored approved *hashes* committed to the repo with images in a GS bucket, where "the only thing the user had to go on was a hash"; Gold "moves the images out of the repository, but provides a GUI interface for easily seeing which images are currently approved" (Chromium Gold doc). The git-LFS escape valve does not scale for goldens. 
Screenshotbot's critique, verbatim ([screenshotbot.io/blog/can-git-lfs-scale](https://screenshotbot.io/blog/can-git-lfs-scale)): + +> "If you have 100 commits that change almost all of the screenshots (say a font or color change), you'll soon be using 5GB of storage!" + +> "Each CI job needs to fetch all of the current screenshots. This slows down the clone step, which blocks CI for all your developers (whether or not they are making UI changes)." + +> "If your screenshots are in Git LFS, the history is going to be slow to fetch, which means developers are unlikely to actually use" bisection. + +> "Many teams have dedicated engineers just to manage Git LFS." + +The pathology is fundamental: an `O(configs × commits)` matrix of binaries committed to history, where one font tweak rewrites the whole grid. + +## Peers converge on out-of-repo + branch-scoped baselines + +The whole comparison set rejects committed files (details in [ecosystem-tools.md](ecosystem-tools.md)): + +- **reg-suit** "automatically stores snapshot images to external cloud storage (e.g. AWS S3, Google Cloud Storage)" and keys baselines via a git-hash key-generator plugin — images out of repo, key in git ([reg-suit repo](https://github.com/reg-viz/reg-suit)). +- **Chromatic** keeps per-permutation baselines: "One story captured in 3 browsers at 3 viewports equals 9 snapshots … independent baselines and distinct approvals," with TurboSnap copying unchanged baselines forward ([TurboSnap docs](https://www.chromatic.com/docs/turbosnap/)). +- **Argos** and **Percy** resolve baselines by walking git history rather than storing files — "A baseline is not a file—it is a decision"; Percy assigns "each branch … its own branch-level baseline" ([Argos baseline](https://argos-ci.com/docs/baseline-build); [Percy git baselines](https://www.browserstack.com/docs/percy/baseline-management/git)). 
+ +## Warts / unverified + +- Even exact-hash + multi-positive needs babysitting: Chromium flags tests "prone to noise which causes them to need additional triaging at times" (Chromium Gold doc), motivating fuzzy/Sobel-mask requests ([skia bug 9527](https://groups.google.com/a/skia.org/g/bugs/c/uLPDZS_hKYQ)). +- **Unverified:** the docs say the image digest is a "hash of its pixel content" but do **not** name the algorithm. The "MD5" reference in Gold's Go API applies to the `Expectations` struct hash, **not** the image digest — do not assert MD5 for image content-addressing. +- **Unverified:** the Skia/Chromium backend (GCS + GCE) is Google-operated; a self-host cost/ops figure for Buiy was not found in primary sources. + +## Sources + +- Skia Gold docs: https://skia.org/docs/dev/testing/skiagold/ +- Chromium GPU Pixel Testing With Gold: https://chromium.googlesource.com/chromium/src/+/HEAD/docs/gpu/gpu_pixel_testing_with_gold.md +- Flutter Gold help: https://flutter-gold.skia.org/help +- Screenshotbot — can git LFS scale: https://screenshotbot.io/blog/can-git-lfs-scale +- reg-suit: https://github.com/reg-viz/reg-suit +- Chromatic TurboSnap: https://www.chromatic.com/docs/turbosnap/ +- Argos baseline: https://argos-ci.com/docs/baseline-build +- Percy git baseline management: https://www.browserstack.com/docs/percy/baseline-management/git +- Sobel filter rationale (skia bug 9527): https://groups.google.com/a/skia.org/g/bugs/c/uLPDZS_hKYQ diff --git a/docs/prior-art/vello/README.md b/docs/prior-art/vello/README.md new file mode 100644 index 0000000..63617a8 --- /dev/null +++ b/docs/prior-art/vello/README.md @@ -0,0 +1,71 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Vello — Linebender GPU-compute 2D renderer with a CPU reference rasterizer (the closest greenfield neighbor to Buiy's CPU-SDF-oracle plan) + +# Vello + +Vello is "A GPU compute-centric 2D renderer" (the verbatim crate description on [crates.io](https://crates.io/crates/vello)) — the 
rasterization engine of the Linebender ecosystem, sibling to the Xilem/Masonry/Parley stack. Unlike a traditional renderer that builds on the GPU's fixed-function rasterizer, the flagship `vello` crate runs nearly the entire pipeline as a chain of WGSL **compute** shaders over `wgpu`. To backstop the resulting GPU-compute-portability problems, Linebender built a second-generation **sparse strips** family — `vello_cpu` (pure software) and `vello_hybrid` (CPU strip generation + GPU fine rasterization) — sharing one scene format. That CPU/GPU split, and the test harness that diffs the two against each other with a perceptual metric, is exactly the pattern Buiy is reaching for: promoting its own CPU SDF port to an oracle that cross-checks the GPU rasterizer without checked-in golden PNGs. + +This folder is the consumer-side deep-dive on **Vello's testing strategy and its CPU/GPU split** specifically. The wider Linebender substrate (Vello's capability set as a render target, Parley vs cosmic-text, Linebender Color/Kurbo) is covered from the framework angle in [`../xilem-masonry/`](../xilem-masonry/); this folder does not re-derive that. 
+ +## Key facts + +| Fact | Value | Source | +|---|---|---| +| Self-description | "A GPU compute-centric 2D renderer" | crates.io crate description | +| Flagship crate version | `vello` **0.9.0** (2026-05-15) | crates.io API `created_at` (authoritative; a GitHub HTML scrape misreported 2025) | +| Sparse-strips crates | `vello_cpu` / `vello_hybrid` / `vello_common` at **0.0.9** (2026-05-30) — still `0.0.x` | crates.io / GitHub releases | +| Maturity | "can currently be considered in an alpha state" (verbatim) — no 1.0; no stable roadmap found | README | +| License | Apache-2.0 OR MIT (shaders additionally Unlicense for research reuse) | README | +| Steward | Linebender — informal volunteer collective; **Raph Levien** "informally leads and drives the work" | linebender.org/about | +| MSRV | Rust 1.88 (`vello`) | README | +| Substrate | `kurbo`, `peniko`, `color`, `skrifa`, `wgpu` | README + release notes | +| Test crate | `vello_tests/` — `nv_flip` perceptual mean-error gate (NOT exact pixel match, NOT yet Kompari) | repo source | +| Adoption | Xilem/Masonry, `bevy_vello`, `woodpecker_ui` | repo / third-party crates | +| Stars (point-in-time) | `vello` ~4.1k; `parley` ~622 | GitHub, approximate | +| Downloads (2026-06-14) | `vello` 384,404 lifetime; `parley` 1,285,183 (Parley useful standalone) | crates.io API | +| Funding | Google / Google Fonts have sponsored Linebender ecosystem work — **exact figures/terms unverified** | blog history | + +## Table of contents + +- [`architecture.md`](architecture.md) — the classic GPU-compute pipeline: `Scene` → `Encoding` → the WGSL compute-stage sequence (prefix-sum, sort-middle coarse/fine tiling). +- [`sparse-strips.md`](sparse-strips.md) — the second-gen family: `vello_cpu` / `vello_hybrid` / `vello_common`, the SIMD `Level` pattern, and the `u8` vs `f32` pipelines. 
**The part most relevant to Buiy's oracle.** +- [`cpu-gpu-testing.md`](cpu-gpu-testing.md) — the `vello_tests` harness: three test tiers, GPU-as-source-of-truth, the blessing flow, `vello_cpu`'s `f32` pipeline as the snapshot generator. +- [`metric-and-kompari.md`](metric-and-kompari.md) — `nv_flip` mean-error gate, the **contested-inside-Linebender** xilem tolerance-16 counter-position, and the Kompari convergence plan. +- [`ecosystem-maturity.md`](ecosystem-maturity.md) — what Vello is and isn't (renderer, not toolkit; no a11y), the three variants/maturity levels, Parley pairing, adoption, version numbers, governance. +- [`open-problems.md`](open-problems.md) — what Vello structurally does NOT solve (compute portability, conflation artifacts, GPU memory allocation, glyph caching, oracle status). +- [`lessons.md`](lessons.md) — **the decision file.** Validates / Avoid / Borrow, framed for Buiy. Read this first if you are designing. +- [`glossary.md`](glossary.md) — Vello/Linebender-specific terms. + +## Reading order + +If you are consulting this folder for a Buiy visual-bug-detection design decision: + +1. **Start here:** [`lessons.md`](lessons.md) — Validates / Avoid / Borrow; this is where the CPU-SDF-oracle and `nv-flip`-vs-pixelmatch decisions live. +2. **The mechanism Buiy is copying:** [`cpu-gpu-testing.md`](cpu-gpu-testing.md) then [`metric-and-kompari.md`](metric-and-kompari.md). +3. **The CPU/GPU split that makes it work:** [`sparse-strips.md`](sparse-strips.md). +4. **Why Vello is shaped the way it is (and the wart that justifies the CPU variant):** [`open-problems.md`](open-problems.md), then [`architecture.md`](architecture.md). +5. **Maturity / adoption / governance context:** [`ecosystem-maturity.md`](ecosystem-maturity.md). 
+ +## How to use + +**Framing disclosure.** These docs are written from Buiy's stance — an AccessKit-first, wgpu + Taffy + cosmic-text, parallel-to-bevy_ui retained-mode engine building a reftests-first layered visual-bug-detection strategy. The "Implications for Buiy" / lessons framing reads Vello through that lens; readers auditing whether that strategy is itself right should weigh the corpus accordingly — it is a learn-from artifact, not a neutral catalog. + +## Honesty / verification notes + +Several dossier facts are flagged uncertain and carried as-such in the files below: + +- The **sparse-strips thesis attribution** (Laurenz Stampl, ETH Zürich master's thesis) is single-sourced to a Linebender blog post and could not be re-verified against the thesis PDF — see [`sparse-strips.md`](sparse-strips.md). +- **Funding figures** beyond "Google / Google Fonts have sponsored Linebender work" are unverified — see [`ecosystem-maturity.md`](ecosystem-maturity.md). +- The `assert_mean_less_than` "**< 0.1 in almost all cases**" string is paraphrased from a WebFetch read of `compare.rs`, not byte-exact — see [`cpu-gpu-testing.md`](cpu-gpu-testing.md). +- Whether **Kompari has replaced `nv_flip`** in `vello_tests` by June 2026 is unconfirmed; as read, the live source still calls `nv_flip` — see [`metric-and-kompari.md`](metric-and-kompari.md). +- A **1.0 timeline** for `vello` could not be found in primary sources — treat as unverified. +- Version dates use crates.io API `created_at` timestamps (authoritative); a GitHub-releases HTML scrape misreported several as 2025. 
+ +## Sources + +- Vello repo / README: https://github.com/linebender/vello +- crates.io crate page and version API: https://crates.io/crates/vello, https://crates.io/api/v1/crates/vello/versions +- DeepWiki architecture index: https://deepwiki.com/linebender/vello/1.1-architecture +- Linebender about / governance: https://linebender.org/about/ +- Sibling Buiy prior-art: [`../xilem-masonry/`](../xilem-masonry/), [`../cosmic-text/`](../cosmic-text/), [`../taffy/`](../taffy/) diff --git a/docs/prior-art/vello/architecture.md b/docs/prior-art/vello/architecture.md new file mode 100644 index 0000000..af17561 --- /dev/null +++ b/docs/prior-art/vello/architecture.md @@ -0,0 +1,73 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Vello's classic GPU-compute pipeline — `Scene` → `Encoding` → a fixed sequence of WGSL compute stages + +# The classic compute pipeline (the `vello` crate) + +This file documents the *flagship* `vello` crate's renderer. Buiy should read it mainly to understand **what to NOT copy** (see [`lessons.md`](lessons.md) Avoid): the compute-centric architecture solves a problem Buiy does not have. The transferable lesson lives in [`cpu-gpu-testing.md`](cpu-gpu-testing.md) and [`sparse-strips.md`](sparse-strips.md), not here. + +## The scene-description front end + +A `Scene` exposes a canvas-like API — `fill()`, `stroke()`, `push_layer()` — that does not draw immediately. Instead it appends into an internal **`Encoding`**, a compact binary format owned by the `vello_encoding` crate. 
The encoding is split into several *parallel streams* so the GPU can process each independently ([DeepWiki architecture](https://deepwiki.com/linebender/vello/1.1-architecture)): + +- `tag_stream` — per-path-segment tags +- `path_stream` — packed coordinates +- `draw_stream` — draw objects (brushes, blends) +- `transform_stream` — affine transforms +- `linewidth_stream` — stroke widths + +This stream-of-arrays layout is deliberate: it lets each compute stage scan one homogeneous array rather than chasing a tagged-union tree, which is what makes the GPU prefix-sum approach tractable. + +## Recording and executing commands + +At render time `WgpuEngine` records a list of `Command`s against a `ResourcePool`/`BindMap`, then submits them through `wgpu`. The command vocabulary is small ([DeepWiki](https://deepwiki.com/linebender/vello/1.1-architecture)): + +- `Upload` / `UploadUniform` — push buffers to the GPU +- `Dispatch` / `DispatchIndirect` — run a compute shader (indirect = workgroup count comes from a GPU buffer, needed because some stage sizes are only known on-GPU) +- `Download` — read a buffer back to the CPU + +## The fixed stage sequence + +The pipeline is a chain of WGSL compute shaders dispatched in a fixed order (WGSL file names, per DeepWiki): + +``` +pathtag_reduce → pathtag_scan (prefix-sum over path-segment tags) +bbox_clear +flatten (curves → line segments) +draw_reduce → draw_leaf +clip_reduce → clip_leaf +binning (segments → tiles) +tile_alloc +backdrop +coarse (per-tile command lists) +fine_* (final antialiased pixels) +``` + +Two structural ideas dominate: + +1. **GPU prefix-sums.** Inherently sequential work (assigning each path segment its cumulative position, resolving nested clips) is parallelized via reduce-then-scan prefix-sum passes — the `*_reduce` / `*_scan` / `*_leaf` pairs. This is the technique Raph Levien has written about extensively; it is also the source of the portability problems (see [`open-problems.md`](open-problems.md)). +2. 
**Sort-middle, coarse-then-fine tiling.** `binning` sorts segments into screen tiles; `coarse` builds a per-tile command list; `fine_*` walks each tile's list to produce the final antialiased pixels. This is a "sort-middle" architecture in GPU-rendering terms. + +The fine stage *ideally* samples all scene images in a single pass, but the docs note "that's not really possible in WebGPU 1.0" — a stated limitation, not a solved problem ([README](https://github.com/linebender/vello/blob/main/README.md)). + +## Substrate the pipeline sits on + +Vello is built on the Linebender 2D substrate (the same crates Buiy studies from the render-target angle in [`../xilem-masonry/linebender-stack.md`](../xilem-masonry/linebender-stack.md)): + +- **`kurbo`** — curves and affines (`Circle`, `Affine`, path flattening). +- **`peniko`** — `Color`, `Fill`, brushes, blend/compose primitives. +- **`color`** — color-space-aware interpolation. +- **`skrifa`** — font/glyph outlines (incl. VARC variable-composite glyphs); `vello` 0.9.0 builds on skrifa 0.42 (latest is 0.43.x as of 2026-06). + +README confirms `kurbo`, `peniko`, `wgpu` directly; `color` and `skrifa` are documented in release notes rather than the README. + +## Implications for Buiy + +Buiy's renderer is **instanced quads + a per-fragment SDF**, not a sort-middle compute pipeline. The entire `Scene` → `Encoding` → prefix-sum → tile machinery above is overkill for that model. The one thing worth internalizing is the *separation of concerns*: Vello has a renderer-agnostic scene description (`Encoding`) that multiple backends (`vello`, `vello_cpu`, `vello_hybrid`) can each consume and produce comparable output from. That comparability is what makes the CPU/GPU cross-check test ([`cpu-gpu-testing.md`](cpu-gpu-testing.md)) possible — and Buiy gets the same comparability for free, because its CPU oracle and GPU shader evaluate *the same closed-form SDF*. 
+ +## Sources + +- DeepWiki Vello architecture: https://deepwiki.com/linebender/vello/1.1-architecture +- Vello README: https://github.com/linebender/vello/blob/main/README.md +- `vello_encoding` crate: https://crates.io/crates/vello_encoding +- "Requiem for piet-gpu-hal" (Raph Levien, on retiring the bespoke HAL for wgpu): https://github.com/raphlinus/raphlinus.github.io/issues/86 diff --git a/docs/prior-art/vello/cpu-gpu-testing.md b/docs/prior-art/vello/cpu-gpu-testing.md new file mode 100644 index 0000000..3b59061 --- /dev/null +++ b/docs/prior-art/vello/cpu-gpu-testing.md @@ -0,0 +1,68 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Vello's `vello_tests` harness — three test tiers, GPU-as-source-of-truth, CPU-as-oracle cross-check, blessing flow + +# The `vello_tests` crate — render-correctness testing + +This is the directly-transferable file for Buiy's visual-bug-detection strategy. The metric details (`nv_flip`, the contested xilem counter-position, Kompari) live in [`metric-and-kompari.md`](metric-and-kompari.md); this file covers the harness structure and the test tiers. 
+ +## Crate layout (verified from the repo tree) + +Vello's render-correctness tests live in a dedicated workspace crate, `vello_tests/`: + +- `src/` — the harness (`lib.rs` plus `snapshot`/`compare` modules) +- `tests/` — `#[test]` cases +- `snapshots/` — reference PNGs (bulk fetched via **Git LFS**) +- `smoke_snapshots/` — a small subset committed directly (not LFS) +- `current/`, `comparisons/`, `debug_outputs/` — working/diagnostic outputs + +`src/lib.rs` defines **`TestParams`** (`width`, `height`, `base_color`, `use_cpu`, `name`, `anti_aliasing`) and re-exports the comparison entry points from two modules: + +- `snapshot` — `snapshot_test`, `snapshot_test_sync`, `smoke_snapshot_test_sync` +- `compare` — `compare_gpu_cpu`, `compare_gpu_cpu_sync`, `GpuCpuComparison` + +Core rendering helpers: `get_scene_image`, `render_then_debug` / `render_then_debug_sync`, `encode_test_scene`, `write_png_to_file`. + +## The three test tiers (per the crate README) + +| Tier | What it does | Oracle | +|---|---|---| +| **1. Property tests** | Run a generated scene through both GPU and CPU; check invariants. | invariant assertions | +| **2. Snapshot / golden tests** | Render a scene; diff against a committed PNG. **GPU output is the source of truth**; the CPU path is also exercised. | checked-in PNG | +| **3. Comparison tests** | Render the *same* scene on GPU and CPU; assert they agree (validates GPU against the CPU reference). | the CPU renderer itself | + +Tier 3 is the **golden-free CPU-as-oracle cross-check** — and the single most transferable idea for Buiy. Crucially, the README explicitly states the team "hope to largely phase these out in favour of additional snapshot tests" — i.e. the direct CPU-vs-GPU cross-check is treated as **transitional scaffolding**, not the long-term oracle. 
(See [`lessons.md`](lessons.md) for why Buiy's situation differs: Buiy's CPU and GPU paths evaluate the *same analytic SDF*, so their agreement is a more durable invariant against implementation drift than Vello's two-different-pipelines agreement — though, as that file notes, sharing one function means the cross-check cannot catch a bug in the SDF *itself*; that residual class is the golden/reftest tiers' job, not the oracle's.) + +Note that even the snapshot tier (tier 2) uses "a non-exact comparison metric, because of small differences between rendering on different platforms" — Vello never asserts exact pixel equality anywhere. + +## `vello_cpu`'s `f32` pipeline is the snapshot generator + +The reference rasterizer is `vello_cpu` (the sparse-strips CPU renderer; see [`sparse-strips.md`](sparse-strips.md)). Its **`f32` pipeline** ("slower but has more accurate results, and is especially useful for rendering test snapshots") is the intended snapshot/oracle generator, backing `RenderMode::OptimizeQuality`. The higher-precision CPU path is deliberately the one used to produce references. + +## The blessing / update flow (env-var driven) + +Snapshot tests are self-updating via environment variables (per `src/snapshot.rs`): + +- A **missing reference** plus `VELLO_TEST_CREATE` writes the new PNG into `snapshots/`; otherwise it writes to an update path and bails with instructions to set `VELLO_TEST_CREATE=all`. +- `VELLO_TEST_UPDATE` converts mismatches into overwrites of the reference. + +This env-var blessing pattern (no special CLI, just a plain `cargo test` run + an env flag) is a clean, reusable shape for Buiy's snapshot tier. + +## What could NOT be verified (carried flags) + +- The single comparison assertion is `assert_mean_less_than(&mut self, value: f32)`, which reads `stats.mean()` off the FLIP pool and fails if the mean error exceeds the caller-supplied bound. 
The harness notes "**Mean should be less than 0.1 in almost all cases for a successful test.**" **This string is paraphrased from a WebFetch read of `compare.rs`, not byte-exact.** +- No `assert_all_less_than` / percentile assertion was found in `compare.rs` — only the **mean** assertion was present. +- Vello-specific **CI config** invoking this harness was not surfaced in search; do not assert CI specifics beyond "the harness exists and uses `nv_flip` mean-error." + +## Implications for Buiy + +Buiy can adopt Vello's harness *skeleton* wholesale, independent of the metric: **render A, render B, perceptual-diff, assert below threshold.** B can be a CPU oracle (oracle mode, Buiy's tier just above layout-number snapshots) or a checked-in PNG (golden mode, Buiy's top tier). One harness, two tiers. The `TestParams`-style config struct and the env-var blessing flow are both directly liftable shapes. The key adaptation: where Vello's tier-3 oracle is a *second, independently-implemented renderer* (hence "phase these out"), Buiy's oracle is a CPU port of the *same SDF function* the GPU evaluates — so Buiy's cross-check is a more durable invariant against implementation drift and need not be treated as scaffolding. (The flip side, per [`lessons.md`](lessons.md): a shared SDF that is wrong in the *spec* leaves both paths wrong identically, so this tier still needs the golden/reftest tiers above it.) 
+ +## Sources + +- `vello_tests` tree: https://github.com/linebender/vello/tree/main/vello_tests +- `vello_tests/src/lib.rs`: https://github.com/linebender/vello/blob/main/vello_tests/src/lib.rs +- `vello_tests/README.md`: https://github.com/linebender/vello/blob/main/vello_tests/README.md +- `vello_tests/src/compare.rs`: https://github.com/linebender/vello/blob/main/vello_tests/src/compare.rs +- `vello_tests/src/snapshot.rs`: https://github.com/linebender/vello/blob/main/vello_tests/src/snapshot.rs +- DeepWiki testing & validation: https://deepwiki.com/linebender/vello/5.2-testing-and-validation diff --git a/docs/prior-art/vello/ecosystem-maturity.md b/docs/prior-art/vello/ecosystem-maturity.md new file mode 100644 index 0000000..3681ccf --- /dev/null +++ b/docs/prior-art/vello/ecosystem-maturity.md @@ -0,0 +1,81 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** What Vello is and isn't — three variants/maturity levels, Parley pairing, adoption, versions, governance + +# Ecosystem & maturity + +## What Vello is (and isn't) + +Vello is "A GPU compute-centric 2D renderer" — a **renderer, not a UI toolkit**. It consumes a scene (paths, fills, gradients, glyph runs, clips, blends) and rasterizes it. It has **no widget tree, no layout, and no accessibility** — a11y lives entirely in the consuming framework (Masonry/Xilem via AccessKit), never in Vello. The lineage is **piet-gpu → piet-gpu-hal → Vello**; Raph Levien retired the bespoke `piet-gpu-hal` HAL in favor of `wgpu` ("Requiem for piet-gpu-hal"). + +This "renderer with no a11y by design" boundary is itself a data point for Buiy: Vello deliberately keeps a11y *out* of the renderer. Buiy's decomposed a11y components live above the render layer for the same reason. 
+ +## Three variants, three maturity levels + +Per [DeepWiki architecture](https://deepwiki.com/linebender/vello/1.1-architecture) (wording cross-checked against Linebender blogs): + +| Variant | Pipeline | Status | Requires | +|---|---|---|---| +| `vello` (GPU compute) | flagship; prefix-sum `flatten → binning → coarse → fine` | **alpha** (README verbatim) | WebGPU **compute** support | +| `vello_hybrid` | CPU path-processing + GPU fragment-shader rasterization of sparse strips; "targets WebGL2 and resource-constrained GPUs" | **Experimental** | any GPU (incl. web) | +| `vello_cpu` | pure-CPU sparse-strip rasterizer, "for devices without GPU support or debugging purposes" | **Alpha** | none | + +The README states Vello "can currently be considered in an alpha state" and "the web is not currently a primary target … WebGPU implementations are incomplete, so you might run into issues" (verbatim). **There is no 1.0, and no stable-release roadmap was found in primary sources — mark the 1.0 timeline as unverified.** + +Note the framing nuance that matters for Buiy: DeepWiki positions `vello_cpu` "for debugging purposes," **not** as an authoritative *reference oracle*. So Buiy's plan to promote its own CPU SDF port to a first-class oracle goes *beyond* what Vello formally claims for `vello_cpu`. The basis for that stronger claim is narrow: Buiy's CPU and GPU paths share one analytic function, so their agreement is a durable invariant against implementation drift — but, as [`lessons.md`](lessons.md) records, a bug in the shared SDF itself stays invisible to the cross-check, so the oracle does not replace Buiy's golden/reftest tiers. See [`lessons.md`](lessons.md). + +## Ecosystem pairing — Parley + +Vello pairs with **Parley** (rich-text layout) as the text companion. Parley (latest v0.10.0, 2026-06-01) shapes with **HarfRust** (Google Fonts' Rust HarfBuzz port), which replaced Swash; the switch reached Xilem users with the Masonry/Xilem 0.4 release (October 2025) — the exact Parley version that introduced it is not pinned here. 
Xilem (SwiftUI-inspired) renders via Vello atop Masonry's retained widget tree. (Buiy uses cosmic-text rather than Parley — both shape via harfrust, so the shaper substrate converges even though the layout API diverges; see [`../xilem-masonry/text-and-rendering.md`](../xilem-masonry/text-and-rendering.md).) + +## Adoption + +Confirmed Vello consumers: + +- **Xilem / Masonry** — Linebender's own GUI stack. +- **`bevy_vello`** — third-party Bevy integration; v0.6.0 upgraded to wgpu v26 for Bevy 0.17. +- **`woodpecker_ui`** — StarArawn's Bevy ECS UI crate uses Vello. + +**Important correction on the Bevy rumor:** Bevy migrated its *text* stack to **Parley** (replacing cosmic-text), merged 2026-02-11, targeting **Bevy 0.19** ([bevy#21767](https://github.com/bevyengine/bevy/issues/21767), [PR #22879](https://github.com/bevyengine/bevy/pull/22879)). That PR is **Parley-only — it does NOT adopt Vello** as Bevy's renderer (verified: "makes no mention of Vello"). Do not propagate the "Bevy adopted Vello" rumor. + +## Versions (crates.io API `created_at` — authoritative) + +Note: a GitHub-releases HTML scrape misreported several of these as 2025; the crates.io API timestamps are authoritative. + +- **`vello`** (GPU compute): **0.9.0** (2026-05-15); 0.8.0 (2026-03-20); 0.7.0 (2026-01-13); 0.6.0 (2025-10-03). 0.9.0 moved to wgpu v29, peniko 0.6.1, skrifa 0.42. First published 2024-03-04. +- **`vello_cpu` / `vello_hybrid` / `vello_common`**: still **0.0.x** — latest **0.0.9** (2026-05-30); 0.0.8 (2026-05-15); 0.0.7 (2026-03-24). The sub-1.0 `0.0.x` versioning is an explicit "do not depend on stability" signal. 
+ +## Numbers (crates.io / GitHub, verified 2026-06-14) + +| Metric | Value | +|---|---| +| `vello` latest version | 0.9.0 (2026-05-15) | +| `vello` first published | 2024-03-04 | +| `vello` total downloads | 384,404 (178,450 recent) | +| `vello` GitHub stars | ~4.1k (point-in-time, approximate) | +| `parley` latest version | 0.10.0 | +| `parley` total downloads | 1,285,183 (≫ vello — Parley is useful standalone) | +| `parley` GitHub stars | ~622 (approximate) | + +The Parley-≫-Vello download gap reflects the substrate-vs-framework adoption split Linebender exhibits generally: standalone substrate crates outpace the renderer that ties them together. + +## Governance & funding + +Linebender is an **informal volunteer collective** — "a group of volunteers and enthusiasts who hang out on our Zulip," with all work done in the open and decisions emerging from community discussion rather than a formal hierarchy ([linebender.org/about](https://linebender.org/about/)). **Raph Levien** founded it and "informally leads and drives the work forward" (SIMD, stroke expansion, new rendering approaches). + +**Funding (uncertain):** the About page lists none, but blog history records **Google** and **Google Fonts** sponsorship of ecosystem work (Xilem/Masonry/Vello). Exact current dollar figures and sponsor terms could not be pinned to a primary source — **treat the funding specifics as unverified beyond "Google / Google Fonts have sponsored Linebender work."** + +## Implications for Buiy + +Vello is a moving target: flagship at 0.9.0 alpha, sparse-strips crates at `0.0.x`, no 1.0 in sight. This reinforces the [`lessons.md`](lessons.md) Avoid row — **study Vello's testing pattern, do not take a runtime dependency on `vello` / `vello_cpu`.** The capability set (anti-aliased path fill, gradients in arbitrary color spaces, blur, blend, arbitrary clip-path) is worth modeling as a render-pipeline *target*, but Buiy's renderer is independent. 
+ +## Sources + +- DeepWiki architecture: https://deepwiki.com/linebender/vello/1.1-architecture +- Vello README: https://github.com/linebender/vello +- crates.io API: https://crates.io/api/v1/crates/vello/versions +- Parley repo: https://github.com/linebender/parley +- bevy#21767 / PR #22879 (Bevy → Parley, NOT Vello): https://github.com/bevyengine/bevy/issues/21767 , https://github.com/bevyengine/bevy/pull/22879 +- woodpecker_ui: https://github.com/StarArawn/woodpecker_ui +- Linebender about / governance: https://linebender.org/about/ +- "Requiem for piet-gpu-hal": https://github.com/raphlinus/raphlinus.github.io/issues/86 diff --git a/docs/prior-art/vello/glossary.md b/docs/prior-art/vello/glossary.md new file mode 100644 index 0000000..9606117 --- /dev/null +++ b/docs/prior-art/vello/glossary.md @@ -0,0 +1,68 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Vello / Linebender-specific terms used across this folder + +# Glossary + +Terms specific to Vello and the Linebender ecosystem, as used in the sibling files. General Rust/GPU terms are omitted. + +- **Vello** — Linebender's "GPU compute-centric 2D renderer." The flagship `vello` crate; also the umbrella name for the family (`vello` / `vello_cpu` / `vello_hybrid`). See [`architecture.md`](architecture.md). + +- **`Scene`** — Vello's canvas-like front-end API (`fill()`, `stroke()`, `push_layer()`) that appends into an `Encoding` rather than drawing immediately. See [`architecture.md`](architecture.md). + +- **`Encoding`** — the compact binary scene format owned by the `vello_encoding` crate, split into parallel streams (`tag_stream`, `path_stream`, `draw_stream`, `transform_stream`, `linewidth_stream`). The renderer-agnostic representation all backends consume. + +- **Sparse strips** — Linebender's second-generation rasterization architecture (a strip = a horizontal run of pixels with a coverage value). Shared by `vello_cpu` and `vello_hybrid`. 
`vello_hybrid` rasterizes each strip as two triangles via a *fragment* shader. See [`sparse-strips.md`](sparse-strips.md). + +- **`vello` (flagship)** — the GPU-compute renderer; runs a prefix-sum compute pipeline. Requires WebGPU compute support. + +- **`vello_cpu`** — pure-software sparse-strip rasterizer, "optimized for SIMD and multithreaded execution"; the reference rasterizer. DeepWiki frames it "for debugging purposes," not as a formal oracle. + +- **`vello_hybrid`** — CPU strip generation + GPU fragment-shader fine rasterization; targets WebGL2 and resource-constrained GPUs / the web. + +- **`vello_common`** — shared infrastructure (geometry, paints, glyph plumbing) re-exported by the sparse-strips crates. + +- **`vello_tests`** — the dedicated test crate: `TestParams`, `snapshot`/`compare` modules, `snapshots/` (LFS) + `smoke_snapshots/` (committed). See [`cpu-gpu-testing.md`](cpu-gpu-testing.md). + +- **`u8` / `f32` pipelines** — `vello_cpu`'s two render modes. `u8` = `OptimizeSpeed`; `f32` = `OptimizeQuality` (slower, more accurate, "especially useful for rendering test snapshots" — the oracle generator). + +- **`Level` (enum)** — `vello_cpu`'s runtime SIMD-detection enum (x86 / aarch64 / wasm), with a scalar fallback. + +- **`RenderContext`** — `vello_cpu`'s primary interface (`set_paint`, `fill_path`, `stroke_path`, `glyph_run`). + +- **`RenderMode`** — selects `OptimizeSpeed` (`u8`) vs `OptimizeQuality` (`f32`). + +- **Comparison test (tier 3)** — render the *same* scene on GPU and CPU, assert they agree; the CPU-as-oracle cross-check. Slated to be "phased out" in Vello because its two paths are different implementations. + +- **`GpuCpuComparison`** — the `vello_tests` struct holding `statistics: Option<FlipPool>`, the two `ImageData` buffers + paths, and `TestParams`. Exposes `assert_mean_less_than`. + +- **`nv_flip` / FLIP / ꟻLIP** — NVIDIA's FLIP perceptual image-difference metric, and its Rust binding crate (`nv-flip`, via `nv-flip-sys` over C++). 
Models the difference perceived when *flipping* between two images; summarized by the **mean** of its error map. See [`metric-and-kompari.md`](metric-and-kompari.md). + +- **`FlipPool`** — the FLIP error-map statistics object; `.mean()` is the load-bearing summary. + +- **`DEFAULT_PIXELS_PER_DEGREE`** — FLIP's viewing-distance constant (67.0); parameterizes the perceptual model. + +- **Blessing** — accepting a new/changed render as the reference. In `vello_tests`, driven by env vars `VELLO_TEST_CREATE` (write missing reference) and `VELLO_TEST_UPDATE` (overwrite mismatches). + +- **Kompari** — `linebender/kompari`, a pre-alpha snapshot-diff tool (HTML reports + interactive-blessing HTTP server) intended to converge Linebender's snapshot testing. No published releases. Contributed by Ada Böhm. See [`metric-and-kompari.md`](metric-and-kompari.md). + +- **Conflation artifacts** — visible AA seams where adjacent primitives meet; one of Vello's four named open problems. + +- **Prefix sum (scan)** — the parallel primitive Vello uses to turn sequential per-segment work into GPU-parallel work (`*_reduce` / `*_scan` / `*_leaf` stages). The source of its portability problems. See [`open-problems.md`](open-problems.md). + +- **Sort-middle / coarse-then-fine** — Vello's tiling strategy: `binning` sorts segments into tiles, `coarse` builds per-tile command lists, `fine_*` produces final pixels. + +- **piet-gpu / piet-gpu-hal** — Vello's predecessors. `piet-gpu-hal` was Linebender's bespoke GPU HAL, retired in favor of `wgpu`. + +- **Linebender** — the informal volunteer collective (founded / informally led by Raph Levien) behind Vello, Parley, Xilem, Masonry, Kurbo, Peniko, Color, Skrifa, Kompari. + +- **Parley** — Linebender's rich-text layout crate; Vello's text companion. Shapes via HarfRust as of v0.10.0. (Buiy uses cosmic-text instead.) 
+ +- **`kurbo` / `peniko` / `color` / `skrifa`** — the Linebender 2D substrate crates Vello builds on (curves+affines / paint primitives / color spaces / font outlines). + +## Sources + +- DeepWiki architecture: https://deepwiki.com/linebender/vello/1.1-architecture +- `vello_tests` source: https://github.com/linebender/vello/tree/main/vello_tests +- docs.rs/nv-flip: https://docs.rs/nv-flip/latest/nv_flip/ +- Kompari README: https://github.com/linebender/kompari/blob/main/README.md diff --git a/docs/prior-art/vello/lessons.md b/docs/prior-art/vello/lessons.md new file mode 100644 index 0000000..3d055ea --- /dev/null +++ b/docs/prior-art/vello/lessons.md @@ -0,0 +1,77 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Vello — Validates / Avoid / Borrow decisions for Buiy's visual-bug-detection strategy + +# Lessons for Buiy + +The consult-this-when-designing file. The other Vello files are evidence; this file is decisions. Buiy implications live here. + +Vello is the **canonical precedent** for a wgpu 2D renderer paired with a CPU reference rasterizer (`vello_cpu`) born specifically to backstop GPU shortcomings — directly analogous to Buiy promoting its CPU SDF port to an oracle. It sits in the visual-bug-detection corpus as the closest *greenfield neighbor*: same substrate (wgpu, Rust, MIT-OR-Apache-2.0), same idea (CPU oracle cross-checks GPU output with a perceptual metric in tests). Buiy's strategy doc is `docs/reports/2026-06-14-visual-bug-detection-strategy.md`; this file feeds its reftests-first pyramid. + +## Top of file: one finding reframes the rest + +### Buiy's oracle is *stronger* than Vello's, because Buiy's two paths share one analytic function. + +Vello's tier-3 CPU-vs-GPU "comparison tests" are slated to be "**largely phased out**" because `vello_cpu` and `vello` are two *independently-implemented* pipelines — their agreement is a weak invariant (each could be wrong in the same way, or the test could drift as either evolves). 
That is why Vello calls them transitional scaffolding ([`cpu-gpu-testing.md`](cpu-gpu-testing.md)). + +Buiy's situation is different: the CPU oracle and the GPU shader evaluate **the same closed-form `sdf_rounded_rect`**. They are not two implementations of "draw a rounded rect"; they are two evaluations of one function. Their agreement-to-float-tolerance is therefore a *durable* invariant, and any divergence **localizes a real shader bug** — wrong half-extent, radius clamp, premultiply error. Buiy should NOT inherit Vello's "phase out the cross-check" posture. The cross-check is Buiy's tier-2 backbone, permanently. + +**The asymmetry has a cost, and it is the same one Buiy levels at Vello.** Sharing one function means the cross-check catches *shader-implementation* divergence (premultiply, clamp, half-extent, AA step) but is blind to a *spec* error in the SDF itself: if `sdf_rounded_rect` is wrong, the CPU port and the GPU shader are wrong *identically*, the buffers still match, and the test stays green — the exact "each could be wrong in the same way" failure mode this section uses to discount Vello's two-implementation cross-check. The shared-SDF oracle is strictly stronger against implementation drift and strictly *no help* against a wrong SDF. That residual class — does the rendered shape match the *intended* shape — is precisely what the golden-screenshot and reftest tiers exist to cover; the oracle tier does not subsume them. + +--- + +## Validates + +Buiy decisions confirmed by Vello's experience: + +- **CPU reference rasterizer as a deliberate backstop for GPU output.** `vello_cpu` was built *specifically* to backstop the flagship GPU renderer's shortcomings. Buiy's plan to promote its CPU SDF port to an oracle is the same move, validated by the canonical wgpu-2D precedent. See [`sparse-strips.md`](sparse-strips.md). 
+- **Non-exact, perceptual image comparison for GPU output.** Vello never asserts exact pixel equality anywhere — GPU fast-math and precision differences guarantee small divergence even when both renderers are correct (verbatim rationale in [`metric-and-kompari.md`](metric-and-kompari.md)). Buiy's instinct to tolerate sub-pixel AA noise rather than demand exact match is correct. +- **A high-precision (`f32`) CPU path as the snapshot/oracle generator.** `vello_cpu`'s `f32` `OptimizeQuality` pipeline exists "especially … for rendering test snapshots." Confirms that an oracle should be the *most accurate* available evaluation of the spec, even if slow — it only runs in tests. See [`sparse-strips.md`](sparse-strips.md). +- **Renderer-with-no-a11y boundary.** Vello keeps a11y out of the renderer entirely (it lives in the framework). Buiy's render layer likewise carries no a11y; the decomposed a11y components sit above it. See [`ecosystem-maturity.md`](ecosystem-maturity.md). +- **One scene format, multiple backends with comparable output.** Vello's `Encoding` is consumed by three backends. Buiy gets comparability for free because its oracle and shader share the SDF function — no separate scene-encoding layer needed. + +## Avoid + +| Pitfall | Source | Buiy mitigation | +|---|---|---| +| **The compute-centric architecture.** Vello's pipeline is a fixed chain of WGSL compute stages (prefix-sum reduce/scan passes feeding flatten → binning → coarse → fine) plus a `Scene`→`Encoding`→stream machinery — solving GPU-compute-portability problems Buiy doesn't have. | [`architecture.md`](architecture.md), [`open-problems.md`](open-problems.md) | Borrow the *testing* idea, not the renderer. Buiy is instanced quads + per-fragment SDF; its oracle is a trivial per-pixel function eval, far simpler than porting a sparse-strip rasterizer. 
| +| **Runtime dependency on pre-1.0 `vello` / `vello_cpu` as the oracle.** Flagship is alpha; sparse-strips crates are `0.0.x` ("do not depend on stability"); glyph/blur/memory strategies churn — the output would be a moving target. MSRV Rust 1.88. | [`ecosystem-maturity.md`](ecosystem-maturity.md), [`sparse-strips.md`](sparse-strips.md) | Buiy's oracle is its *own* CPU SDF port (already half-built), not a Vello dependency. No external renderer in the correctness loop. | +| **Treating the CPU-vs-GPU cross-check as transitional.** Vello phases it out because its two paths are different implementations. | [`cpu-gpu-testing.md`](cpu-gpu-testing.md) | Buiy's two paths share one analytic SDF → the cross-check is a durable invariant, kept permanently (Top of file). | +| **Assuming one image-diff metric fits all tiers.** Linebender accidentally runs two (FLIP for `vello_tests`, tolerance-16 pixel diff for xilem) because FLIP has false negatives on sub-perceptual changes ("dark grey and white … very similar"). | [`metric-and-kompari.md`](metric-and-kompari.md) | Buiy picks per failure mode *deliberately*: FLIP for the oracle-agreement tier (tolerate GPU noise), tight pixel tolerance for the golden-screenshot tier (catch small intentional regressions). | +| **Adopting Vello's threshold number blindly.** Vello's mean-error bound is tuned to *its* AA model. | [`metric-and-kompari.md`](metric-and-kompari.md) | Calibrate Buiy's threshold on a known-good Buiy frame. | +| **`nv-flip`'s native-toolchain cost in any shipping path.** `nv-flip` is pre-1.0 (0.1.2, unchanged since 2023) and wraps a C++ lib via `nv-flip-sys`, adding a build-time native cost. | [`metric-and-kompari.md`](metric-and-kompari.md) | Acceptable as a **dev-dependency** in the test harness only; never in a shipping path. (Or consider a pure-Rust FLIP/ΔE if the native cost bites CI.) | +| **Git LFS PNG reference store for the cross-check tier.** Vello keeps `snapshots/*.png` in LFS. 
| [`cpu-gpu-testing.md`](cpu-gpu-testing.md) | The CPU-oracle approach lets Buiy *defer* LFS entirely for the rasterization cross-check; reserve LFS for the genuine golden-screenshot top tier only. | + +## Borrow + +Concrete primitives worth adapting: + +1. **CPU SDF as oracle — promote the existing port.** `crates/buiy_core/tests/render_instance.rs` (the comment + `fn sdf_rounded_rect` at lines 10-15) already contains a "Pure-CPU port of `shader.wgsl::sdf_rounded_rect`" that mirrors the GPU SDF 1:1. Today it is used only for **scalar distance assertions at a few sample points** (`logical_sdf_inside_is_filled_outside_is_empty`, lines 17-34: probes center-inside and 2×-half-extent-outside). The Vello lesson is to **promote it to a full-tile rasterizer**: evaluate `sdf_rounded_rect` per-pixel with the same AA coverage step the WGSL uses, producing a coverage/color buffer, then diff that buffer against a real GPU readback of the same instance. Three point-probes become a dense per-pixel oracle with **zero checked-in PNGs**. Because the SDF is analytic, CPU and GPU should agree to within float/AA tolerance; divergence localizes a real shader-implementation bug — exactly the failure class the existing single-point `d_center` assertion only gestures at. (Reminder of the boundary from Top-of-file: this catches implementation drift, *not* a wrong SDF — both paths would share that error.) **Prerequisite to scope before committing to this Borrow:** the "real GPU readback" half is not free. It needs a headless GPU path in CI — adapter selection, an `xvfb`-style virtual display (Buiy's gate already wraps tests in `xvfb-run`), and a buffer-readback step (Vello's own pipeline exposes a `Download` command for exactly this, [`architecture.md`](architecture.md):27, but the folder does not document Vello's headless-readback mechanics, so treat that as Buiy-side design work, not a liftable recipe). Stand up the readback-in-CI path first; the per-pixel diff is the easy half. 
See [`sparse-strips.md`](sparse-strips.md), [`cpu-gpu-testing.md`](cpu-gpu-testing.md). + +2. **`nv-flip` mean-error gate for the oracle tier — resolving pixelmatch-YIQ-vs-FLIP toward FLIP.** Copy Vello's shape: `FlipPool::mean()` with a small fixed threshold via an `assert_mean_less_than`-style helper. FLIP models the difference a human perceives when *flipping* between two images — the exact reftest viewing mode — and yields a continuous, perceptually-weighted scalar that tolerates sub-pixel AA/dithering noise inherent in GPU-vs-CPU SDF agreement, *without* hand-tuned color thresholds. This beats pixelmatch's binary YIQ-threshold count for the **oracle** tier. (For the **golden** tier, see the Avoid row on per-tier metrics — a tight pixel tolerance may be better there.) Rust binding `nv-flip` v0.1.2; API `FlipImageRgb8::with_data` → `flip(ref, test, DEFAULT_PIXELS_PER_DEGREE=67.0)` → `FlipPool::mean()`; license MIT/Apache-2.0/Zlib (FLIP core BSD-3-Clause), all Buiy-compatible. See [`metric-and-kompari.md`](metric-and-kompari.md). + +3. **The snapshot-harness skeleton (render A, render B, perceptual-diff, assert).** Vello's loop is reusable independent of whether B is a CPU oracle (oracle mode) or a checked-in PNG (golden mode). Buiy reuses **one harness for two tiers of its pyramid**. Lift the `TestParams`-style config struct and the env-var blessing flow (`VELLO_TEST_CREATE` / `VELLO_TEST_UPDATE` → a `BUIY_TEST_*` equivalent: missing reference + create flag writes it; mismatch + update flag overwrites). See [`cpu-gpu-testing.md`](cpu-gpu-testing.md). + +4. **Runtime SIMD `Level` pattern (if/when the oracle vectorizes).** `vello_cpu`'s `Level` enum detects the best SIMD level at runtime with a scalar fallback as the definition-of-correct. If Buiy's per-pixel oracle ever needs to vectorize, adopt the same shape: detect once, dispatch the inner loop, keep scalar as ground truth. See [`sparse-strips.md`](sparse-strips.md). + +5. 
**Watch Kompari, don't depend on it yet.** Kompari (HTML diff reports + interactive blessing server) is the Linebender convergence plan but has **no published releases**. If Buiy wants a diff-report UX later, Kompari is the reference shape; for now, the env-var blessing flow (Borrow #3) is enough. See [`metric-and-kompari.md`](metric-and-kompari.md). + +## How to use this file + +When designing a Buiy visual-bug-detection tier: + +1. **For the rasterization cross-check tier** (GPU readback vs CPU SDF), read Borrow #1 + #2 and the Top-of-file note: promote the existing CPU SDF port, gate with FLIP mean-error. +2. **For the golden-screenshot top tier**, read the Avoid "per-tier metric" row: prefer a tight pixel tolerance over FLIP there; reserve LFS for this tier only. +3. **For the harness shape itself**, read Borrow #3. +4. **Don't take a Vello runtime dependency** — Avoid rows 1-2. Borrow the pattern, build Buiy's own oracle. +5. **Promote decisions into the strategy report / Buiy specs**, not just this file. 
+ +## Sources + +- This corpus's evidence files: [`README.md`](README.md), [`architecture.md`](architecture.md), [`sparse-strips.md`](sparse-strips.md), [`cpu-gpu-testing.md`](cpu-gpu-testing.md), [`metric-and-kompari.md`](metric-and-kompari.md), [`ecosystem-maturity.md`](ecosystem-maturity.md), [`open-problems.md`](open-problems.md), [`glossary.md`](glossary.md) +- Buiy existing CPU SDF port: `crates/buiy_core/tests/render_instance.rs` (lines 10-34) +- Buiy visual-bug-detection strategy report: `docs/reports/2026-06-14-visual-bug-detection-strategy.md` +- Sibling Linebender prior-art (framework angle): [`../xilem-masonry/lessons.md`](../xilem-masonry/lessons.md) +- FLIP paper: https://dl.acm.org/doi/10.1145/3406183 +- `vello_tests`: https://github.com/linebender/vello/tree/main/vello_tests +- DeepWiki testing & validation: https://deepwiki.com/linebender/vello/5.2-testing-and-validation diff --git a/docs/prior-art/vello/metric-and-kompari.md b/docs/prior-art/vello/metric-and-kompari.md new file mode 100644 index 0000000..4c86644 --- /dev/null +++ b/docs/prior-art/vello/metric-and-kompari.md @@ -0,0 +1,66 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** The comparison metric — `nv_flip` mean-error, the contested xilem tolerance-16 counter-position, and the Kompari convergence plan + +# The comparison metric + +This is the load-bearing file for Buiy's "which image-diff metric?" decision. The headline: **Linebender does not have one settled answer.** As of mid-2026 the org runs *two different metrics* — `vello_tests` on `nv_flip` mean-threshold, xilem on a tolerance-16 plain pixel diff — and is mid-flight on a third convergence tool, Kompari. + +## `nv_flip` (NVIDIA ꟻLIP), mean-error threshold — what `vello_tests` uses + +Both `src/snapshot.rs` and `src/compare.rs` compare images with the **`nv_flip`** crate (a Rust binding to NVIDIA's ꟻLIP perceptual difference metric) — **not** exact pixel match and **not** (yet) Kompari. 
The pattern in `compare.rs`: + +```rust +let error_map = nv_flip::flip(expected, rendered, nv_flip::DEFAULT_PIXELS_PER_DEGREE); +let pool = FlipPool::from_image(&error_map); +``` + +`GpuCpuComparison` holds `statistics: Option<FlipPool>`, the two `ImageData` buffers, their paths, and `TestParams`. The single assertion is **`assert_mean_less_than(&mut self, value: f32)`**, which reads `stats.mean()` off the FLIP pool and fails if the mean error exceeds the caller-supplied bound. The harness notes "Mean should be less than 0.1 in almost all cases for a successful test" (*paraphrased, not byte-exact — see [`cpu-gpu-testing.md`](cpu-gpu-testing.md)*). + +**Non-zero error is deliberately tolerated**, with this verbatim rationale: the difference "could potentially be non-zero (i.e. there is a slight difference between the GPU and CPU results) **due to fast math on the GPU or different precisions used in the renderers**." This is the crux of why a *perceptual continuous* metric beats *exact pixel match* for a GPU-vs-CPU oracle check: GPU fast-math and precision differences guarantee small per-pixel divergence even when both renderers are correct. + +### FLIP, the algorithm + +FLIP ("FLIP: A Difference Evaluator for Alternating Images," Andersson, Nilsson, Akenine-Möller, Oskarsson, Åström, Fairchild; *Proc. ACM Comput. Graph. Interact. Tech. (PACMCGIT)* 3(2), 2020) models the difference a human perceives when **flipping** between two images — the exact reftest viewing mode. The authors recommend the **mean** of the error map as the single summary number, which is exactly what `assert_mean_less_than` consumes. + +### The Rust binding + +The binding is **`nv-flip`** v0.1.2 (latest; published **2023-07-16** per the crates.io API — *note a secondary source misreported "March 2026"; the registry is authoritative*). API: `FlipImageRgb8::with_data` → `flip(ref, test, DEFAULT_PIXELS_PER_DEGREE = 67.0)` → `FlipPool::mean()`. 
It is pre-1.0 and unchanged since 2023, and it wraps a C++ library via `nv-flip-sys` (a build-time native-toolchain cost). License: MIT OR Apache-2.0 OR Zlib for the bindings (FLIP core is BSD-3-Clause) — all compatible with Buiy's MIT-OR-Apache-2.0. + +## WART — `nv_flip` is contested *inside* Linebender (the xilem counter-position) + +The sibling project **xilem REMOVED the `nv-flip` dependency** for its widget screenshot tests, in favor of a **plain pixel-by-pixel diff with tolerance = 16**, because FLIP produced **false negatives**: verbatim, "The nv_flip algorithm may consider dark grey and white to be very similar colors" ([xilem #893](https://github.com/linebender/xilem/issues/893)). Tolerance 16 "seems to be the sweet spot" and reportedly catches "swapping the stroke join, changing a widget's border width, moving text by a tenth of a pixel" ([xilem PR #904](https://github.com/linebender/xilem/pull/904)). + +So, as of mid-2026, Linebender runs **two different metrics**: + +| Project | Metric | Rationale | +|---|---|---| +| `vello_tests` | `nv_flip` mean-error threshold | tolerate GPU fast-math / precision noise in GPU-vs-CPU agreement | +| `xilem` (widget screenshots) | plain pixel diff, **tolerance 16** | FLIP had false negatives on dark-grey/white; catches sub-pixel widget changes | + +**The split is not noise — it is a real signal about the failure mode.** FLIP is *perceptually forgiving by design*, which is exactly right when comparing two correct renderers (the divergence is sub-perceptual) and exactly wrong when the change you want to catch is itself sub-perceptual (a 1px border, a tenth-of-a-pixel text shift). The right metric depends on whether the test is an **oracle agreement check** (perceptual metric: tolerate the noise) or a **regression catch** (tight pixel tolerance: catch the small intentional-looking change). 
+ +## Kompari — the convergence plan + +[`linebender/kompari`](https://github.com/linebender/kompari) is "a tool for reporting image differences … for use in snapshot testing" — a CLI + Rust crate, contributed by **Ada Böhm**, "currently in pre-alpha," whose stated goal is to "standardise and improve the developer experience of snapshot tests in Linebender (and beyond)." It produces static HTML diff reports and an HTTP server for interactively *blessing* snapshots. Vello "improved how its snapshot tests are handled in preparation for Kompari integration." + +**Status (verified):** Kompari has **no published releases** (MSRV 1.85). **Uncertain:** whether Kompari has *replaced* `nv_flip` in `vello_tests` by June 2026 is unconfirmed — as read, the live `src/compare.rs` / `src/snapshot.rs` still call `nv_flip`. + +## Implications for Buiy + +For Buiy's **CPU-SDF-oracle agreement check** (GPU readback vs CPU SDF rasterization), the failure mode is Vello's, not xilem's: both paths compute the same analytic SDF, so divergence is GPU-fast-math / AA / precision noise. A **perceptual continuous metric (`nv-flip` mean-error)** is the right fit, and this resolves Buiy's open "pixelmatch-YIQ vs FLIP" question *toward FLIP for the oracle tier*. But heed the xilem lesson for Buiy's **golden-screenshot top tier**, where the goal is catching small intentional-looking regressions: there a tight pixel tolerance may catch what FLIP smooths over. Buiy should likely use **FLIP for the oracle tier and a tight pixel tolerance for the golden tier** — two metrics for two failure modes, exactly mirroring Linebender's accidental two-metric state, but chosen deliberately. Calibrate any threshold on a known-good Buiy frame; do **not** adopt Vello's number blindly — it is tuned to Vello's AA model. 
+ +**Caveat on the golden tier's tight tolerance.** Even Vello's *snapshot* tier uses a non-exact comparison "because of small differences between rendering on different platforms" ([`cpu-gpu-testing.md`](cpu-gpu-testing.md)). That cross-platform/cross-driver rendering variance is the standing reason golden-screenshot suites are notoriously flaky: a tolerance tight enough to catch a 1px regression is also tight enough to trip on driver-level AA differences. Buiy's tight-tolerance golden tier therefore needs its references pinned to a single fixed renderer/driver/OS in CI (or a per-platform reference set), not just a low pixel threshold — otherwise the tier flakes on the exact noise FLIP was chosen to absorb. (Full Borrow/Avoid framing in [`lessons.md`](lessons.md).) + +## Sources + +- `vello_tests/src/compare.rs`: https://github.com/linebender/vello/blob/main/vello_tests/src/compare.rs +- `vello_tests/src/snapshot.rs`: https://github.com/linebender/vello/blob/main/vello_tests/src/snapshot.rs +- FLIP paper: https://dl.acm.org/doi/10.1145/3406183 +- `nv-flip-rs` bindings: https://github.com/gfx-rs/nv-flip-rs +- crates.io API for `nv-flip`: https://crates.io/api/v1/crates/nv-flip +- docs.rs/nv-flip: https://docs.rs/nv-flip/latest/nv_flip/ +- xilem issue #893 (FLIP false negatives): https://github.com/linebender/xilem/issues/893 +- xilem PR #904 (tolerance-16 pixel diff): https://github.com/linebender/xilem/pull/904 +- Kompari README: https://github.com/linebender/kompari/blob/main/README.md +- Linebender Dec 2024 (Kompari): https://linebender.org/blog/tmil-12/ diff --git a/docs/prior-art/vello/open-problems.md b/docs/prior-art/vello/open-problems.md new file mode 100644 index 0000000..a2b3310 --- /dev/null +++ b/docs/prior-art/vello/open-problems.md @@ -0,0 +1,53 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** What Vello structurally does NOT solve — the wart list the README ships, plus the gaps relevant to a greenfield neighbor + +# Open problems + +The 
Vello README itself names four open problem areas and an alpha-state caveat. This file collects those plus the deeper structural gaps relevant to Buiy. Honest tone: these are real adoption blockers, quoted where possible. + +## The README's own four open areas (verbatim) + +The README states Vello "can currently be considered in an alpha state," with four open problem areas: + +1. **Blur / filter effects** — still in flux. +2. **Conflation artifacts** — the antialiasing approach produces visible seams where adjacent primitives meet; a known, unsolved class of visual bug. +3. **GPU memory-allocation strategy** — robust *dynamic* GPU memory allocation has been a recurring pain point. The compute pipeline must size buffers for work whose extent is only known mid-pipeline. +4. **Glyph caching** — no settled glyph-cache strategy. + +It also states Vello "needs a GPU with support for compute shaders to run," and "The web is not currently a primary target for Vello, and WebGPU implementations are incomplete, so you might run into issues." + +## The load-bearing wart — GPU-compute portability is unsolved at the *spec* level + +This is *why Vello must ship `vello_hybrid` + `vello_cpu` at all.* Raph Levien's "Prefix sum on portable compute shaders" documents it verbatim: + +- "The Vulkan specification itself is careful to make no forward progress guarantees." +- Apple and ARM GPUs exhibit **forward-progress failures**. +- On Metal "there is simply no way to run decoupled look-back … (unless the payload can be packed into a 32 bit word)." +- WebGPU's uniformity analysis "rejects valid shaders." +- DX11/FXC's "simplistic uniformity analysis rejects common advanced compute patterns." +- Net: portable compute "can work pretty well on Vulkan and DX12, but Metal remains out of reach … as is WebGPU." DX12 additionally needs special handling of the SRV/UAV (readonly vs read-write) descriptor distinction ([vello#125](https://github.com/linebender/vello/issues/125)). 
+ +`vello_hybrid` is the pragmatic escape hatch: by rasterizing sparse strips with a *fragment* shader instead of compute, it runs on WebGL2 and low-end GPUs that can't or won't run the compute pipeline ([vello#670](https://github.com/linebender/vello/issues/670)). **This entire problem class does not exist for Buiy** — Buiy is instanced quads + per-fragment SDF, no compute, no prefix-sum, no forward-progress dependency. Buiy sidesteps the wart that forced Vello's architecture to fork. + +## Gaps relevant to a greenfield neighbor (Buiy) + +1. **CPU fallback exists but is not framed as an authoritative oracle.** `vello_cpu` shares the sparse-strip architecture but DeepWiki positions it "for debugging purposes," **not** as a reference oracle. Buiy's plan to promote its CPU SDF port to a first-class oracle therefore goes *beyond* what Vello formally claims. The upside: Buiy's oracle is a per-pixel eval of the *same analytic function* the GPU runs, which is a more durable correctness basis (against implementation drift) than two independently-written rasterizers agreeing — with the matching limitation that a bug in the shared function escapes both paths, so the golden/reftest tiers stay necessary (see [`lessons.md`](lessons.md)). + +2. **No single settled image-diff metric.** Linebender runs `nv_flip` in `vello_tests` and tolerance-16 pixel diff in xilem, with Kompari as an unreleased convergence plan ([`metric-and-kompari.md`](metric-and-kompari.md)). A greenfield neighbor inherits an *unsettled* answer, not a recipe — Buiy must pick deliberately per failure mode. + +3. **The CPU-vs-GPU cross-check is treated as transitional.** Vello's tier-3 comparison tests are slated to be "largely phased out in favour of additional snapshot tests" ([`cpu-gpu-testing.md`](cpu-gpu-testing.md)) — because Vello's two pipelines are *different implementations*, so their agreement is a weaker invariant. 
Buiy should NOT inherit this "phase out" posture; Buiy's oracle is the same function, so the cross-check is durable. + +4. **No accessibility, no layout, no widgets — by design.** Vello is purely a rasterizer. None of Buiy's layout-number-snapshot or a11y-contract testing tiers have any analog in Vello; only the rasterization-cross-check tier maps over. + +5. **Git LFS reference store cost.** `vello_tests/snapshots/*.png` live in Git LFS ([`cpu-gpu-testing.md`](cpu-gpu-testing.md)). The CPU-oracle approach lets a neighbor *defer* that cost for the rasterization cross-check entirely; LFS is only needed for genuine golden screenshots. + +6. **Everything is pre-1.0 and churning.** Flagship alpha, sparse-strips `0.0.x`, glyph/blur/memory strategies in flux, MSRV 1.88 — taking a runtime dependency means tracking a moving target. + +## Sources + +- Vello README (four open areas, alpha caveat): https://github.com/linebender/vello +- "Prefix sum on portable compute shaders" (Raph Levien, 2021-11-17): https://raphlinus.github.io/gpu/2021/11/17/prefix-sum-portable.html +- DX12 portability polish issue: https://github.com/linebender/vello/issues/125 +- Sparse strip path rendering issue: https://github.com/linebender/vello/issues/670 +- DeepWiki testing & validation: https://deepwiki.com/linebender/vello/5.2-testing-and-validation diff --git a/docs/prior-art/vello/sparse-strips.md b/docs/prior-art/vello/sparse-strips.md new file mode 100644 index 0000000..f5f95ce --- /dev/null +++ b/docs/prior-art/vello/sparse-strips.md @@ -0,0 +1,52 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Vello's sparse-strips family — `vello_cpu` / `vello_hybrid` / `vello_common`, SIMD `Level` detection, and the `u8` vs `f32` pipelines + +# The sparse-strips family — the part most relevant to Buiy's oracle + +The flagship `vello` crate's compute pipeline ([`architecture.md`](architecture.md)) has four standing problems: GPU-allocation robustness, no-GPU / underpowered-GPU targets, web 
compatibility, and glyph caching (see [`open-problems.md`](open-problems.md)). To address them, Linebender built a **second-generation "sparse strips" architecture** shared by a family of crates that each consume the same scene format but execute different rendering pipelines: + +- **`vello_cpu`** — pure software rasterizer. "A CPU-based renderer for Vello, optimized for SIMD and multithreaded execution" ([docs.rs/vello_cpu](https://docs.rs/vello_cpu)). +- **`vello_hybrid`** — CPU strip generation + GPU fine rasterization. "runs the most compute intensive portions of rendering on the GPU … wide compatibility with most devices, so long as they have a GPU, including running well on the web." It rasterizes sparse strips with a *fragment* shader (two triangles per strip; the fragment reads strip alpha → solid color; hardware does the final blend), so it runs on WebGL2 and low-end GPUs that cannot run the compute pipeline ([sparse strip path rendering, vello#670](https://github.com/linebender/vello/issues/670)). +- **`vello_common`** — shared infrastructure (geometry, paints, glyph plumbing) re-exported by both. + +The method is documented in a published **ETH Zürich master's thesis** on high-performance CPU rendering of 2D graphics, attributed to **Laurenz Stampl** per the [Linebender Oct 2025 blog](https://linebender.org/blog/tmil-22/). **Uncertain:** this author/institution attribution is single-sourced to the blog; it could not be independently re-verified against the thesis PDF itself. Treat the name as single-sourced. + +## `vello_cpu` API surface + +The primary interface is **`RenderContext`** with `set_paint()`, `fill_path()`, `stroke_path()`, `glyph_run()` ([docs.rs/vello_cpu](https://docs.rs/vello_cpu)). Two patterns matter for Buiy: + +### 1. 
The SIMD `Level` enum (runtime detection) + +`vello_cpu` exposes a `Level` enum for **runtime SIMD detection** — it picks the best available instruction set (x86, aarch64, wasm) at runtime rather than requiring a target-feature build. This is the same shape Buiy would want if its CPU SDF oracle ever needs to vectorize per-pixel evaluation: detect the level once, dispatch the inner loop accordingly, keep a scalar fallback as the definition-of-correct. + +### 2. The `u8` vs `f32` pipelines (the oracle precision knob) + +`vello_cpu` has **two pipelines, switchable at runtime** (landed per the [Linebender Dec 2025 update](https://linebender.org/blog/tmil-24/): "Added features to Vello CPU to switch between `u8` and `f32` pipelines"): + +| Pipeline | `RenderMode` | Role | +|---|---|---| +| `u8` | `OptimizeSpeed` | fast, lower precision | +| `f32` | `OptimizeQuality` | "slower but has more accurate results, and is **especially useful for rendering test snapshots**" | + +So the **higher-precision `f32` CPU path is the intended snapshot/oracle generator** — this is precisely the role Buiy wants its CPU SDF port to play. The lesson: an oracle should be the *most accurate* available evaluation of the spec, even if it is slow, because it is only run in tests. Buiy's CPU SDF, evaluated in `f32` with the same AA coverage step the WGSL uses, is the direct analog of `vello_cpu`'s `f32` pipeline. + +## Stated warts (verbatim / paraphrased) + +- "the API is still likely to change and not stable yet." +- Filters and image-resources are **experimental**. +- "multi-threading with large thread counts (more than 4) might give diminishing returns, **especially when making heavy use of layers and clip paths**." +- All sparse-strips crates are still `0.0.x` (latest **0.0.9**, 2026-05-30) — an explicit "do not depend on stability" signal. 
+ +## Implications for Buiy + +`vello_cpu` is the existence proof that a CPU reference rasterizer **born specifically to backstop a GPU renderer** is a workable design — the exact analogy to Buiy promoting its CPU SDF port to an oracle (see [`lessons.md`](lessons.md) Borrow). But note the architectural mismatch: `vello_cpu` is a full sparse-strip rasterizer (anti-aliased path fill, strokes, glyphs, clips, layers) — porting *that* would be enormous. Buiy's oracle is far simpler: a per-pixel evaluation of an analytic SDF. Borrow the *role and precision posture* (`f32`, accuracy over speed, runtime SIMD `Level`), not the rasterizer. And do **not** take a runtime dependency on `vello_cpu` itself — the `0.0.x` versioning makes its output a moving target. + +## Sources + +- docs.rs/vello_cpu: https://docs.rs/vello_cpu +- lib.rs/crates/vello_cpu: https://lib.rs/crates/vello_cpu +- Linebender "This Month in… " Oct 2025 (sparse strips / thesis attribution): https://linebender.org/blog/tmil-22/ +- Linebender Dec 2025 (`u8`/`f32` pipelines, hybrid on web): https://linebender.org/blog/tmil-24/ +- Sparse strip path rendering issue: https://github.com/linebender/vello/issues/670 +- GitHub releases (`0.0.x` versions): https://github.com/linebender/vello/releases diff --git a/docs/prior-art/wgpu-testing/README.md b/docs/prior-art/wgpu-testing/README.md new file mode 100644 index 0000000..8e2f145 --- /dev/null +++ b/docs/prior-art/wgpu-testing/README.md @@ -0,0 +1,87 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** wgpu's CI / GPU test infrastructure — the closest determinism model for Buiy (folder index + entry point) + +# wgpu's CI / GPU test infrastructure + +Buiy renders on **wgpu**, so wgpu's own test suite — the `wgpu_test` harness, its pinned-software-rasterizer determinism recipe, and its `nv_flip` perceptual image-compare — is the most directly transplantable prior art in this corpus. 
Everything in this folder runs on the same wgpu abstraction Buiy targets; the determinism contract wgpu engineered (same CPU-rasterizer bits everywhere → reproducible pixels) is exactly the contract Buiy's golden/reftest tiers need. This is the *infrastructure* prior-art behind [`docs/reports/2026-06-14-visual-bug-detection-strategy.md`](../../reports/2026-06-14-visual-bug-detection-strategy.md): wgpu does not invent a new visual-testing *methodology* (that's the reftest/Gold folders), it shows how to make GPU output **reproducible enough to test at all**, and how to express **per-backend expected outcomes** instead of globally disabling a test. + +The three load-bearing pieces, each its own file: + +1. **`gpu_test` harness** — the `#[gpu_test]` macro, `TestParameters` (feature/limit gating → skip-not-fail), and `FailureCase` (per-backend × adapter-substring × driver expectations, where an *unexpected pass panics*). This is the strongest idea here for Buiy's reftest tier. +2. **Pinned software rasterizer** — lavapipe/llvmpipe/WARP frozen at a single `MESA_VERSION`, vendored via `gfx-rs/ci-build`, selected via `VK_DRIVER_FILES` + `WGPU_ADAPTER_NAME`. This is the determinism contract. +3. **`nv_flip` image compare** — perceptual FLIP error map → mean/percentile threshold, magma diff artifact, implicit golden bootstrapping. This is the metric. + +The decision content (what Buiy should Borrow / Avoid / treat as Validated) lives in [lessons.md](lessons.md). 
+ +## Key facts (verified 2026-06-14 against the cited primary sources) + +| Fact | Value | Source | +|---|---|---| +| Latest wgpu | **v29.0.3**, released **2026-05-02** | [crates.io API](https://crates.io/api/v1/crates/wgpu) | +| Harness crate | `wgpu_test` — in-tree under `tests/`, **not published** to crates.io | [wgpu.rs/doc/wgpu_test](https://wgpu.rs/doc/wgpu_test/index.html) | +| Test unit | a `static GpuTestConfiguration` annotated `#[gpu_test]` | [wgpu_test docs](https://wgpu.rs/doc/wgpu_test/index.html) | +| Gating | `TestParameters` — unmet feature/limit/downlevel → **skip, not fail** | [`tests/src/params.rs`](https://github.com/gfx-rs/wgpu/blob/trunk/tests/src/params.rs) | +| Per-backend expectations | `FailureCase` — `backend()` / `adapter(substr)` / `validation_error()` / `panic()` / `.flaky()` | [`tests/src/expectations.rs`](https://github.com/gfx-rs/wgpu/blob/trunk/tests/src/expectations.rs) | +| Behavior enum | `FailureBehavior::{AssertFailure (default), Ignore}` — **AssertFailure: an unexpected pass panics** | [`expectations.rs`](https://github.com/gfx-rs/wgpu/blob/trunk/tests/src/expectations.rs) | +| Runner | `cargo-nextest` (process-per-test isolation), driven by `cargo xtask test` | [`docs/testing.md`](https://github.com/gfx-rs/wgpu/blob/trunk/docs/testing.md) | +| Backend sweep | `wgpu-info <command>` runs `<command>` once per (adapter × backend), setting `WGPU_BACKEND` / `WGPU_ADAPTER_NAME` | [lib.rs/crates/wgpu-info](https://lib.rs/crates/wgpu-info) | +| Reference rasterizer (Vk) | **lavapipe** (`libvulkan_lvp.so`, `lvp_icd.x86_64.json`) | [install-mesa action](https://github.com/gfx-rs/wgpu/blob/trunk/.github/actions/install-mesa/action.yml) | +| Reference rasterizer (GL) | **llvmpipe** (`GALLIUM_DRIVER=llvmpipe`) | install-mesa action | +| Reference rasterizer (DX12) | Microsoft **WARP** (`d3d10warp.dll`, via `cargo xtask install-warp`) | install-warp action | +| Pinned Mesa version | `MESA_VERSION: "25.2.7"`, ci-binary-build `build26` (Nov 18) | 
install-mesa action; [ci-build releases](https://github.com/gfx-rs/ci-build/releases) | +| Pin host | `gfx-rs/ci-build` — builds Mesa from `archive.mesa3d.org` on a tag, attaches tarball to a GH Release | [ci-build artifacts.yml](https://github.com/gfx-rs/ci-build) | +| Adapter selection | `VK_DRIVER_FILES=$PWD/icd.json` + `WGPU_ADAPTER_NAME` (case-insensitive substring) | [Mesa envvars](https://docs.mesa3d.org/envvars.html), [wgpu util](https://docs.rs/wgpu/latest/wgpu/util/fn.initialize_adapter_from_env.html) | +| Image-compare crate | **`nv-flip` 0.1.2** (2023-07-16), MIT OR Apache-2.0 OR Zlib | [crates.io/nv-flip](https://crates.io/crates/nv-flip), [docs.rs](https://docs.rs/nv-flip/latest/nv_flip/) | +| Metric | NVIDIA **ꟻLIP** per-pixel error map ∈ [0,1]; summary = **mean** (authors' recommendation) | [docs.rs/nv-flip](https://docs.rs/nv-flip/latest/nv_flip/) | +| Assertion model | `ComparisonType::{Mean(f32), Percentile{percentile, threshold}}` | [`tests/src/image.rs`](https://raw.githubusercontent.com/gfx-rs/wgpu/trunk/tests/src/image.rs) | +| Diff artifact | magma-colormapped error map → `{stem}-{backend}-{name}-{driver}-difference.png` | `tests/src/image.rs` | +| Superseded model | raw **outlier-count** (`max_outliers`) — brittle across drivers, **replaced by FLIP in [PR #3830](https://github.com/gfx-rs/wgpu/pull/3830)** ("Migrate to nv-flip for image comparison") | brittleness evidence: [PR #2767](https://github.com/gfx-rs/wgpu/pull/2767), [issue #2760](https://github.com/gfx-rs/wgpu/issues/2760) | + +## Contents + +Each file is independently skimmable with its own `## Sources`. + +| File | Subject | +|---|---| +| [README.md](README.md) | This index — what wgpu's test infra is, key facts, reading order. | +| [lessons.md](lessons.md) | **The decision file.** `## Validates` / `## Avoid` / `## Borrow` — where Buiy implications live. Start here when designing. 
| +| [gpu-test-harness.md](gpu-test-harness.md) | The `#[gpu_test]` macro, `TestParameters` skip-gating, `FailureCase` per-backend expectations, the unexpected-pass-panics rule, `wgpu-info` sweep, nextest/xtask runner. | +| [determinism-rasterizer.md](determinism-rasterizer.md) | Why CPU rasterizers are the reference, the abandoned daily-PPA wart, `gfx-rs/ci-build` pinning, `VK_DRIVER_FILES` adapter selection, the `LP_NUM_THREADS` myth, the upgrade-treadmill cost. | +| [image-compare.md](image-compare.md) | `nv-flip` / FLIP metric, `FlipPool` mean/percentile reduction, `ComparisonType` assertions, magma diff artifact, implicit golden bootstrapping, the superseded outlier-count model. | +| [open-problems.md](open-problems.md) | What wgpu's stack structurally does *not* solve: flakiness as a first-class state, substring brittleness, lavapipe non-conformance, silent golden minting, the manual pin treadmill, FFI cost. | +| [glossary.md](glossary.md) | System-specific terms: `gpu_test`, `TestParameters`, `FailureCase`, lavapipe/llvmpipe/WARP, ICD, FLIP, `FlipPool`, nextest, `wgpu-info`. | + +## Reading order + +1. **[lessons.md](lessons.md)** — the decisions. Start here if you are designing Buiy's reftest/golden harness. +2. **[gpu-test-harness.md](gpu-test-harness.md)** — the `FailureCase` model is the single most transplantable idea; read it first among the evidence files, for the referent. +3. **[determinism-rasterizer.md](determinism-rasterizer.md)** — the pinned-rasterizer recipe that makes any pixel test possible at all. +4. **[image-compare.md](image-compare.md)** — the perceptual metric Buiy's Tier-4/5 comparison should adopt. +5. **[open-problems.md](open-problems.md)** — the limits, so Buiy doesn't over-trust the stack. +6. **[glossary.md](glossary.md)** — reference when a term is unclear. 
+ +## Framing disclosure + +This folder is written from Buiy's stance: an ECS-native (Bevy 0.18) retained-mode Rust GUI library with a custom `wgpu` pipeline, designing a reftests-first visual-bug-detection pyramid. Because Buiy is *built on the same wgpu*, this is the only prior-art folder whose mechanisms are nearly copy-pasteable rather than adapted — the `VK_DRIVER_FILES` recipe, the `FailureCase` primitive, and `nv_flip` itself are all directly reusable, and the `gfx-rs/ci-build` artifacts can be consumed as-is. "Implications for Buiy" lines therefore lean toward direct reuse. The evidence files describe wgpu's systems on their own terms and surface unflattering facts verbatim (lavapipe's "testing use only" self-warning, the abandoned daily-PPA, the substring-brittleness wart); Buiy implications are confined to clearly-labelled subsections and to [lessons.md](lessons.md). One dossier claim — that wgpu pins `LP_NUM_THREADS` for FP determinism — is **flagged as not how wgpu does it**; see [determinism-rasterizer.md](determinism-rasterizer.md). + +## How to use + +**Framing disclosure.** These docs are written from Buiy's stance — an AccessKit-first, wgpu + Taffy + cosmic-text, parallel-to-bevy_ui retained-mode engine building a reftests-first layered visual-bug-detection strategy. The "Implications for Buiy" / lessons framing reads wgpu's CI / GPU test infrastructure through that lens; readers auditing whether that strategy is itself right should weigh the corpus accordingly — it is a learn-from artifact, not a neutral catalog. 
+ +## Sources + +- wgpu crate (crates.io API): https://crates.io/api/v1/crates/wgpu +- `wgpu_test` docs: https://wgpu.rs/doc/wgpu_test/index.html +- `tests/src/params.rs`: https://github.com/gfx-rs/wgpu/blob/trunk/tests/src/params.rs +- `tests/src/expectations.rs`: https://github.com/gfx-rs/wgpu/blob/trunk/tests/src/expectations.rs +- `tests/src/image.rs`: https://raw.githubusercontent.com/gfx-rs/wgpu/trunk/tests/src/image.rs +- `docs/testing.md`: https://github.com/gfx-rs/wgpu/blob/trunk/docs/testing.md +- `install-mesa/action.yml`: https://github.com/gfx-rs/wgpu/blob/trunk/.github/actions/install-mesa/action.yml +- `gfx-rs/ci-build`: https://github.com/gfx-rs/ci-build +- `nv-flip` (crates.io / docs.rs / repo): https://crates.io/crates/nv-flip · https://docs.rs/nv-flip/latest/nv_flip/ · https://github.com/gfx-rs/nv-flip-rs +- wgpu outlier→FLIP migration: PR #3830 (the replacement) https://github.com/gfx-rs/wgpu/pull/3830 ; brittleness evidence PR #2767 https://github.com/gfx-rs/wgpu/pull/2767 · issue #2760 https://github.com/gfx-rs/wgpu/issues/2760 +- Mesa envvars: https://docs.mesa3d.org/envvars.html +- Sibling files: [gpu-test-harness.md](gpu-test-harness.md), [determinism-rasterizer.md](determinism-rasterizer.md), [image-compare.md](image-compare.md), [open-problems.md](open-problems.md), [lessons.md](lessons.md), [glossary.md](glossary.md) +- Sibling prior art: [../wpt-reftests/](../wpt-reftests/), [../skia-gold/](../skia-gold/), [../vello/](../vello/) +- Buiy strategy report: [../../reports/2026-06-14-visual-bug-detection-strategy.md](../../reports/2026-06-14-visual-bug-detection-strategy.md) diff --git a/docs/prior-art/wgpu-testing/determinism-rasterizer.md b/docs/prior-art/wgpu-testing/determinism-rasterizer.md new file mode 100644 index 0000000..7f2eb0a --- /dev/null +++ b/docs/prior-art/wgpu-testing/determinism-rasterizer.md @@ -0,0 +1,92 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** wgpu's pinned-software-rasterizer determinism recipe — 
lavapipe/llvmpipe/WARP frozen at one Mesa version, vendored via gfx-rs/ci-build, selected via VK_DRIVER_FILES + +# The pinned software rasterizer (the determinism contract) + +## The problem: real GPUs cannot be a reference image + +wgpu's image-comparison tests need a renderer that produces **bit-stable output across machines and across time**. Real GPUs can't: driver versions, vendor quirks, and undefined behavior in shaders all perturb pixels. The chosen reference is Mesa's software stack: + +- **lavapipe** — the `swrast` Vulkan driver (library `libvulkan_lvp.so`, ICD `lvp_icd.x86_64.json`), used on the **Vulkan** backend. +- **llvmpipe** — the Gallium OpenGL/GLES software rasterizer, used on the **GL** backend. +- **WARP** (`d3d10warp.dll`) — Microsoft's software D3D rasterizer, the Windows **DX12** analogue, installed via a separate `install-warp` composite action that shells out to `cargo xtask install-warp`. *(The exact WARP/NuGet package version is not visible from the action.yml alone — **unverified**.)* + +**No software-Metal reference exists.** The recipe covers Vulkan/GL/DX12 only — there is no CPU Metal rasterizer, so **macOS Metal goldens are not deterministic under this model** (they'd run on a real Apple GPU/driver). This is load-bearing for Buiy, which targets macOS; see [open-problems.md § 8](open-problems.md). + +Source: [`.github/actions/install-mesa/action.yml`](https://github.com/gfx-rs/wgpu/blob/trunk/.github/actions/install-mesa/action.yml). + +## The load-bearing wart: why the daily PPA was abandoned + +Originally wgpu CI `apt`-installed llvmpipe from `ppa:oibaf/graphics-drivers`, a **rolling daily** build. 
jimblandy filed [gfx-rs/wgpu#2594](https://github.com/gfx-rs/wgpu/issues/2594) (opened **April 13, 2022**): + +> "Because `.github/workflows/ci.yaml` pulls the latest `llvmpipe` from `ppa:oibaf/graphics-drivers`, the specific version of llvmpipe we get varies from day to day, and when we happen to get a buggy version, we get CI failures that have nothing to do with the PR in question. We should instead pull a specific known-good (or known-adequate) version of LLVM pipe to run our CI tests against." + +That is the entire rationale for pinning: **a rolling software-rasterizer is a moving reference image.** An unrelated llvmpipe regression landing upstream turns every PR's CI red overnight, decoupling test failures from the change under review. The fix was to stop pulling distro packages entirely and consume a **frozen, self-built Mesa**. + +## The pinning mechanism: gfx-rs/ci-build + +[gfx-rs/ci-build](https://github.com/gfx-rs/ci-build) ("Automated action for building/hosting components we need in CI") compiles Mesa from source on a tag push and publishes the result as a GitHub Release asset. From its `.github/workflows/artifacts.yml`, it downloads `https://archive.mesa3d.org/mesa-$MESA_VERSION.tar.xz` and runs: + +``` +meson setup builddir/ --buildtype=release -Dgallium-drivers=llvmpipe \ + -Dvulkan-drivers=swrast -Dplatforms= -Dglx=disabled +``` + +then tars `install/` into `mesa-$MESA_VERSION-linux-x86_64.tar.xz` and attaches it to the release (`softprops/action-gh-release@v1`). **Both rasterizers come from one source-pinned build with no DRI/GLX platform deps.** As of writing, `MESA_VERSION: "25.2.7"`, published as release **build26** (Nov 18); earlier `build20` = Mesa 24.3.4. + +## How wgpu consumes the pinned build + +The `install-mesa` action hardcodes the pin and documents the coupling verbatim: + +```yaml +# Sourced from https://archive.mesa3d.org/. 
Bumping this requires +# updating the mesa build in https://github.com/gfx-rs/ci-build and creating a new release. +version: + default: "25.2.7" +ci-binary-build: + default: "build26" +``` + +On **Linux** it `curl`s `…/ci-build/releases/download/build26/mesa-25.2.7-linux-x86_64.tar.xz`, then — because "*The ICD provided by the mesa build is hardcoded to the build environment*" — **writes its own ICD JSON** pointing at the unpacked `libvulkan_lvp.so`, and exports: + +``` +VK_DRIVER_FILES=$PWD/icd.json +LD_LIBRARY_PATH=$PWD/mesa/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH +LIBGL_DRIVERS_PATH=$PWD/mesa/lib/x86_64-linux-gnu/dri +``` + +On **Windows** it pulls the prebuilt `mesa3d-$MESA_VERSION-release-msvc.7z` from a **third-party** repo, [pal1000/mesa-dist-win](https://github.com/pal1000/mesa-dist-win) (**not** a gfx-rs-controlled build — a supply-chain trust wart), extracts `vulkan_lvp.dll` + `lvp_icd.x86_64.json`, sets `VK_DRIVER_FILES` (via `cygpath --windows`) and `GALLIUM_DRIVER=llvmpipe`. + +## Environment-driven adapter selection + +`VK_DRIVER_FILES` (the modern replacement for the now-deprecated `VK_ICD_FILENAMES`; [Mesa envvars](https://docs.mesa3d.org/envvars.html)) forces the Vulkan loader to *only* see lavapipe, so the test harness **cannot accidentally pick a hardware GPU**. On the GL side `GALLIUM_DRIVER=llvmpipe` (paired with `LIBGL_ALWAYS_SOFTWARE=true` per Mesa docs) forces software GL. Within wgpu, `WGPU_ADAPTER_NAME` does a case-insensitive substring match over enumerated adapters ([`wgpu::util::initialize_adapter_from_env`](https://docs.rs/wgpu/latest/wgpu/util/fn.initialize_adapter_from_env.html)) to nail the exact device. + +## The `LP_NUM_THREADS` myth — flagged, do NOT copy + +A common external claim is that wgpu sets `LP_NUM_THREADS` to force single-threaded, deterministic FP accumulation. This is **not present** in the current `install-mesa/action.yml` (no such export). 
Mesa documents `LP_NUM_THREADS` verbatim as "*an integer indicating how many threads to use for rendering. Zero turns off threading completely. The default value is the number of CPU cores present*" — but does **not** characterize it as a determinism knob. llvmpipe tiles the framebuffer per-thread, so for a fixed tile assignment results are stable regardless of thread count. **No primary source shows wgpu pinning `LP_NUM_THREADS` for FP determinism** — treat that claim as **unverified / likely not how wgpu achieves determinism**. The determinism comes from the **pinned Mesa version**, not thread count. + +## Recent churn: the warts stay live + +Pinning trades day-to-day flakes for a **manual upgrade treadmill**: each bump requires a new ci-build release *and* an action edit. [gfx-rs/wgpu#8544](https://github.com/gfx-rs/wgpu/issues/8544) "Upgrade LLVMPipe in CI" (closed via PR #8582) shows the cost — a `Limits::blas_max_primitive_count` workaround (PR #8446) for an llvmpipe ray-tracing bug had to wait until **Mesa 25.2.7** fixed it before the limit could be restored. [#8727](https://github.com/gfx-rs/wgpu/issues/8727) ("SPIR-V writing for mesh shaders is broken on llvmpipe") shows the reference rasterizer itself still has feature gaps wgpu must route around. Earlier bump-tracking issues: "[Upgrade Mesa to 24.3.4 in CI #6988](https://github.com/gfx-rs/wgpu/issues/6988)". + +## Implications for Buiy + +The directly reusable pattern is three pieces: + +1. A separate **"build-and-host-a-frozen-Mesa" repo** keyed by a single `MESA_VERSION` + release tag — and Buiy can consume `gfx-rs/ci-build`'s artifacts **directly** rather than building its own. +2. A composite action that downloads it and **writes its own ICD** (the upstream ICD path is build-host-absolute and unusable). +3. `VK_DRIVER_FILES` + `WGPU_ADAPTER_NAME` to make adapter choice deterministic and hardware-proof. 
+ +Bump the pin **deliberately, in a tracked issue**, and regenerate any golden images in that same PR. Do **not** copy a `LP_NUM_THREADS` determinism story. See [lessons.md](lessons.md) for the full Borrow/Avoid, and [open-problems.md](open-problems.md) for the supply-chain and non-conformance warts this carries. + +## Sources + +- `install-mesa/action.yml`: https://github.com/gfx-rs/wgpu/blob/trunk/.github/actions/install-mesa/action.yml (raw: https://raw.githubusercontent.com/gfx-rs/wgpu/trunk/.github/actions/install-mesa/action.yml) +- `gfx-rs/ci-build`: https://github.com/gfx-rs/ci-build · releases: https://github.com/gfx-rs/ci-build/releases +- pal1000/mesa-dist-win (Windows source, third-party): https://github.com/pal1000/mesa-dist-win +- wgpu issue #2594 (abandon daily PPA): https://github.com/gfx-rs/wgpu/issues/2594 +- wgpu issue #8544 (upgrade LLVMPipe) / #6988 / #8727: https://github.com/gfx-rs/wgpu/issues/8544 · https://github.com/gfx-rs/wgpu/issues/6988 · https://github.com/gfx-rs/wgpu/issues/8727 +- Mesa envvars (`VK_DRIVER_FILES`, `LP_NUM_THREADS`, `GALLIUM_DRIVER`): https://docs.mesa3d.org/envvars.html +- `wgpu::util::initialize_adapter_from_env`: https://docs.rs/wgpu/latest/wgpu/util/fn.initialize_adapter_from_env.html +- Sibling files: [gpu-test-harness.md](gpu-test-harness.md), [image-compare.md](image-compare.md), [open-problems.md](open-problems.md), [lessons.md](lessons.md), [glossary.md](glossary.md) diff --git a/docs/prior-art/wgpu-testing/glossary.md b/docs/prior-art/wgpu-testing/glossary.md new file mode 100644 index 0000000..170f3c7 --- /dev/null +++ b/docs/prior-art/wgpu-testing/glossary.md @@ -0,0 +1,64 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Glossary of wgpu-test-infrastructure terms — one line each, for readers of this folder + +# Glossary + +System-specific terms used across this folder. One line each; see the linked file for detail. 
+ +## Harness + +- **`wgpu_test`** — wgpu's in-tree GPU integration-test harness crate (under `tests/`), **not published** to crates.io. ([gpu-test-harness.md](gpu-test-harness.md)) +- **`#[gpu_test]`** — attribute macro that turns a `static GpuTestConfiguration` into a test running on all GPUs on the system. +- **`GpuTestConfiguration`** — the value a `#[gpu_test]` static holds: bundles `TestParameters` + the async test closure. +- **`TestingContext`** — the device, queue, and adapter info handed to a test closure. +- **`TestParameters`** — preconditions (required features/limits/downlevel caps/instance flags) + expectations (`skips`, `failures`); unmet preconditions → **skip, not fail**. +- **`FailureCase`** — a matcher (`backend()`, `adapter(substr)`, `validation_error()`, `panic()`, `.flaky()`) declaring an expected failure scoped to backend × adapter-substring × driver. +- **`FailureBehavior`** — enum: `AssertFailure` (default; an unexpected **pass panics**) vs `Ignore` (swallow a specific flake). +- **`.expect_fail(when)` / `.skip(when)`** — builder hooks attaching a `FailureCase` as a must-fail (run anyway) or must-fail-and-skip. +- **`execute_test`** — dispatcher that runs a `GpuTestConfiguration` against an adapter report. + +## Tooling / runner + +- **`cargo-nextest`** — the mandated test runner; gives **process-per-test isolation** so a device crash can't poison the run. +- **`cargo xtask test` / `cargo xtask cts`** — repo-root entry points; `xtask test` calls nextest, `xtask cts` runs the conformance suite. +- **`wgpu-info`** — sweep tool: runs a given command once per (adapter × backend), setting `WGPU_ADAPTER_NAME` / `WGPU_BACKEND` each run. +- **`WGPU_BACKEND` / `WGPU_ADAPTER_NAME` / `WGPU_DX12_COMPILER`** — env vars selecting backend (comma list), adapter (substring), and DX12 shader compiler. 
+ +## Rasterizers (the reference) + +- **lavapipe** — Mesa's `swrast` software **Vulkan** driver (`libvulkan_lvp.so`, ICD `lvp_icd.x86_64.json`); wgpu's Vulkan reference. Self-warns at init (Mesa `lvp_device.c`): "WARNING: lavapipe is not a conformant vulkan implementation, testing use only." +- **llvmpipe** — Mesa's Gallium software **OpenGL/GLES** rasterizer; wgpu's GL reference (`GALLIUM_DRIVER=llvmpipe`). +- **WARP** — Microsoft's software **D3D** rasterizer (`d3d10warp.dll`); the DX12 reference, installed via `cargo xtask install-warp`. +- **Mesa** — the open-source graphics-driver project that builds lavapipe + llvmpipe; pinned at `MESA_VERSION` (currently `25.2.7`). +- **`gfx-rs/ci-build`** — repo that builds Mesa from `archive.mesa3d.org` on a tag and attaches a tarball to a GH Release; wgpu downloads from it. Current build tag: `build26`. +- **ICD** — *Installable Client Driver* — the JSON manifest the Vulkan loader reads to find a driver `.so`; wgpu writes its own because the upstream one has a build-host-absolute path. +- **`VK_DRIVER_FILES`** — env var pointing the Vulkan loader at a specific ICD JSON (modern replacement for deprecated `VK_ICD_FILENAMES`); forces lavapipe-only enumeration. +- **`GALLIUM_DRIVER` / `LIBGL_ALWAYS_SOFTWARE`** — Mesa env vars forcing software GL (llvmpipe). +- **`LP_NUM_THREADS`** — Mesa env var for llvmpipe render-thread count; **commonly mis-cited** as a wgpu determinism knob — it is **not** how wgpu achieves determinism. ([determinism-rasterizer.md](determinism-rasterizer.md)) + +## Image comparison + +- **FLIP (ꟻLIP)** — NVIDIA's perceptual image-difference metric ("FLIP: A Difference Evaluator for Alternating Images", HPG 2020); models viewer distance so sub-perceptual noise doesn't register. +- **`nv-flip`** — Rust high-level bindings to FLIP (v0.1.2, MIT OR Apache-2.0 OR Zlib, gfx-rs-maintained); `nv-flip-sys` is its C++ FFI layer. 
+- **error map** — FLIP's per-pixel output, a `FlipImageFloat` with values in `[0,1]` (0 = identical, 1 = max perceptual error). +- **`FlipPool`** — histogram-like value pool over an error map exposing `mean()`, `get_percentile()`, `min_value()`, `max_value()`; **mean** is the authors' recommended summary. +- **pixels-per-degree** — FLIP's viewer-distance parameter (`DEFAULT_PIXELS_PER_DEGREE`) that makes the metric perceptual rather than a raw diff. +- **`ComparisonType`** — wgpu's assertion enum: `Mean(f32)` (fail if mean error > x) or `Percentile { percentile, threshold }`. +- **`magma_lut()`** — the magma colormap applied to an error map to produce the human-readable `*-difference.png` diff artifact. +- **outlier count / `max_outliers`** — the **superseded** pre-FLIP model: count per-channel-delta-exceeding pixels, fail if over a limit; brittle across drivers. + +## Cross-cutting + +- **CTS** — the (Vulkan/WebGPU) **conformance** test suite wgpu runs *separately* via `cargo xtask cts`; carries the correctness load that goldens cannot. +- **trunk** — wgpu's default branch; all `blob/trunk/...` source links in this folder resolve against it and may drift. 
+ +## Sources + +- `wgpu_test` docs: https://wgpu.rs/doc/wgpu_test/index.html +- `tests/src/expectations.rs`, `tests/src/params.rs`, `tests/src/image.rs`: https://github.com/gfx-rs/wgpu/tree/trunk/tests/src +- `install-mesa/action.yml`: https://github.com/gfx-rs/wgpu/blob/trunk/.github/actions/install-mesa/action.yml +- Mesa envvars: https://docs.mesa3d.org/envvars.html +- lavapipe non-conformance warning (Mesa `lvp_device.c`): https://gitlab.freedesktop.org/mesa/mesa/-/blob/main/src/gallium/frontends/lavapipe/lvp_device.c +- `nv-flip`: https://docs.rs/nv-flip/latest/nv_flip/ +- Sibling files: [README.md](README.md), [gpu-test-harness.md](gpu-test-harness.md), [determinism-rasterizer.md](determinism-rasterizer.md), [image-compare.md](image-compare.md), [open-problems.md](open-problems.md), [lessons.md](lessons.md) diff --git a/docs/prior-art/wgpu-testing/gpu-test-harness.md b/docs/prior-art/wgpu-testing/gpu-test-harness.md new file mode 100644 index 0000000..8731836 --- /dev/null +++ b/docs/prior-art/wgpu-testing/gpu-test-harness.md @@ -0,0 +1,100 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** wgpu's `wgpu_test` GPU integration-test harness — the `#[gpu_test]` macro, skip-gating, and per-backend `FailureCase` expectations + +# The `wgpu_test` GPU test harness + +`wgpu_test` is wgpu's in-tree GPU integration-test harness, documented at [wgpu.rs/doc/wgpu_test](https://wgpu.rs/doc/wgpu_test/index.html) and tersely self-described as "Test utilities for the wgpu repository." It is **not published to crates.io** — it lives under `tests/` in the repo. Documented here against the latest published wgpu, **v29.0.3 (released 2026-05-02)**, verified via the crates.io API (`max_stable_version` 29.0.3, `updated_at` 2026-05-02T03:12:40Z). 
+ +The harness exists to solve one problem Buiy shares: **run one test body across heterogeneous GPUs and record per-GPU expected outcomes**, instead of `cfg`-gating tests per platform or globally `#[ignore]`ing anything that fails on one backend. + +## The `#[gpu_test]` macro + +Each GPU test is a `static` of type `GpuTestConfiguration` annotated with `#[gpu_test]`. The macro "creates a test that will run on all gpus on a given system" by generating the harness `main`/registration glue. A test bundles two things: + +- **`TestParameters`** — preconditions (features/limits) + expectations (`FailureCase`s). +- **an async closure** receiving a **`TestingContext`** — "Parameters and resources handed to the test function": the device, queue, and adapter info. + +Tests are dispatched through `execute_test`, which "Execute[s] the given test configuration with the given adapter report"; `initialize_instance` / `initialize_adapter` / `initialize_device` perform per-adapter setup. + +## `TestParameters` — gating (skip, not fail) + +From [`tests/src/params.rs`](https://github.com/gfx-rs/wgpu/blob/trunk/tests/src/params.rs), the struct carries: + +``` +required_features: Features +required_downlevel_caps: DownlevelCapabilities +required_limits: Limits +required_instance_flags: InstanceFlags +force_fxc: bool +skips: Vec<FailureCase> +failures: Vec<FailureCase> +disable_mtl_shader_validation: bool +``` + +Builder methods: `.features(..)`, `.downlevel_flags(..)`, `.limits(..)`, `.instance_flags(..)`, `.force_fxc(..)`, `.test_features_limits()` ("Set of common features that most internal tests require for compute and readback"), `.enable_noop()` ("Enable testing against the noop backend and miri"), plus the two expectation hooks: + +- `.expect_fail(when: FailureCase)` — "Mark the test as always failing, but not to be skipped." +- `.skip(when: FailureCase)` — "Mark the test as always failing, and needing to be skipped." 
+ +**The load-bearing behavior:** if a feature/limit/downlevel precondition is not met by the current adapter, the test is **skipped, not failed**. This is how a single test body runs cleanly across hardware with different capabilities — the harness silently drops tests the adapter can't support rather than reporting red. + +## `FailureCase` — per-backend / adapter / driver expectations + +From [`tests/src/expectations.rs`](https://github.com/gfx-rs/wgpu/blob/trunk/tests/src/expectations.rs). This is the heart of the determinism model. The matcher constructors: + +- `always()`, `never()` +- `backend(Backends)` — "Tests running on any of the given backends." +- `adapter(&str)` — "Tests running on `adapter`" (**substring match** on the adapter name). +- `backend_adapter(backends, adapter)` +- `webgl2()`, `molten_vk()` ("the MoltenVK Vulkan driver on macOS"), `kosmic_krisp()`, `mac_vulkan(..)` ("either Vulkan driver on macOS") +- `validation_error(msg)` and `panic(msg)` — substring-match the expected error/panic text. +- `unexpected_error(msg)` + +A case is refined with reason filters and `.with_message(..)` (case-insensitive substring matching on the validation-error/panic message), and `.flaky()` — "Test is flaky with the given configuration. Do not assert failure." + +The behavior enum is **`FailureBehavior`** with two variants: + +- **`AssertFailure`** — "Assert that the test fails for the given reason. If the test passes, the test harness will panic." This is the strict default for `.expect_fail`. +- **`Ignore`** — "Ignore the matching failure. This is useful for tests that flake in a very specific way." + +### The unexpected-pass-panics rule (the key insight) + +Under `AssertFailure`, a known-broken case **must keep failing in exactly the matched way**. If a backend *starts passing* — e.g. a driver bug gets fixed upstream — the harness **panics**, forcing whoever made it pass to delete the now-stale expectation. 
A backend cannot silently start passing; the expectation list stays honest as the renderer matures. This is far stronger than `#[ignore]`, which would silently keep the test disabled forever. + +The directly transplantable insight for Buiy: expectations are **scoped to `backend × adapter-substring × driver`**, so a single test asserts a *different correct outcome per GPU* rather than being globally disabled. `FailureReason`/`FailureResult` do case-insensitive substring matching on the message. + +> **Note (verified):** the older `Skip`/`device`/`environment` `FailureCase` variants implied by some external write-ups are **not present** in current source — verified against trunk. Don't lift them. + +## Backend / adapter sweep + +All test/example infra reads standardized env vars: `WGPU_BACKEND` (comma list of `vulkan`, `metal`, `dx12`, `gl`), `WGPU_ADAPTER_NAME` (adapter-name substring), and `WGPU_DX12_COMPILER`. The sweep tool is **`wgpu-info`**: "when wgpu-info is called with any amount of arguments, it will interpret all of the arguments as a command to run. It will run this command N different times, one for every combination of adapter and backend on the system," setting `WGPU_ADAPTER_NAME` and `WGPU_BACKEND` per run ([lib.rs/crates/wgpu-info](https://lib.rs/crates/wgpu-info)). Canonical invocation: + +``` +cargo run --bin wgpu-info -- cargo nextest run --no-fail-fast +``` + +## Runner: nextest + xtask + +Tests **must** run under `cargo-nextest`, which gives **process-per-test isolation** — important because a GPU device crash or validation abort would poison a shared process. Per [`docs/testing.md`](https://github.com/gfx-rs/wgpu/blob/trunk/docs/testing.md): "you require you run the tests with cargo-nextest. This is what our xtask calls." + +The repo-root entry point is `cargo xtask test`: +- `cargo xtask test --test wgpu-gpu` — the GPU tests. +- `cargo xtask test --bin wgpu-examples` — the image-comparison example tests. 
+- `cargo xtask cts` — runs CTS (the conformance suite, separate from these integration tests). +- Default-device run: `cargo nextest run --no-fail-fast`; single test: `cargo nextest run -p wgpu -- <name-of-test>`. + +## Implications for Buiy + +Buiy's reftest tier should adopt the `FailureCase` model **verbatim**, including the unexpected-pass-panics rule: a Buiy reftest that's known-broken on, say, the Vulkan backend records `expect_fail(backend(VULKAN))`, and if a Buiy renderer fix makes it pass, the harness forces the expectation's removal. Skip-on-unmet-precondition maps to Buiy's optional GPU features. Process-per-test isolation via nextest is already idiomatic in Rust and avoids one crashed `wgpu::Device` poisoning a whole test run — Buiy should mandate nextest for any test that creates a device. See [lessons.md](lessons.md) for the full Borrow list, and [open-problems.md](open-problems.md) for the substring-brittleness wart this model carries. + +## Sources + +- `wgpu_test` docs: https://wgpu.rs/doc/wgpu_test/index.html +- `tests/src/params.rs`: https://github.com/gfx-rs/wgpu/blob/trunk/tests/src/params.rs +- `tests/src/expectations.rs`: https://github.com/gfx-rs/wgpu/blob/trunk/tests/src/expectations.rs +- `docs/testing.md`: https://github.com/gfx-rs/wgpu/blob/trunk/docs/testing.md +- wgpu README (env vars): https://github.com/gfx-rs/wgpu/blob/trunk/README.md +- `wgpu-info`: https://lib.rs/crates/wgpu-info +- wgpu crate version (crates.io API): https://crates.io/api/v1/crates/wgpu +- Sibling files: [determinism-rasterizer.md](determinism-rasterizer.md), [image-compare.md](image-compare.md), [open-problems.md](open-problems.md), [lessons.md](lessons.md), [glossary.md](glossary.md) diff --git a/docs/prior-art/wgpu-testing/image-compare.md b/docs/prior-art/wgpu-testing/image-compare.md new file mode 100644 index 0000000..2715bb7 --- /dev/null +++ b/docs/prior-art/wgpu-testing/image-compare.md @@ -0,0 +1,86 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** wgpu's 
image-comparison harness — the nv-flip crate, the FLIP perceptual metric, mean/percentile thresholds, the magma diff artifact, and the superseded outlier-count model + +# Image comparison: `nv_flip` + the golden-image harness + +wgpu compares rendered output against stored golden PNGs using its own [`tests/src/image.rs`](https://raw.githubusercontent.com/gfx-rs/wgpu/trunk/tests/src/image.rs) module, which delegates the actual perceptual comparison to the **`nv-flip`** crate. This is the most directly transplantable comparison model for Buiy because both render on wgpu. + +## The `nv-flip` crate (verified) + +Per the crates.io API ([`crates.io/api/v1/crates/nv-flip`](https://crates.io/api/v1/crates/nv-flip)), the newest and only current version is **0.1.2, published 2023-07-16T03:35:23Z**; full history is 0.1.0 and 0.1.1 (both 2023-06-04) then 0.1.2. License is **`MIT OR Apache-2.0 OR Zlib`**, repo [`github.com/gfx-rs/nv-flip-rs`](https://github.com/gfx-rs/nv-flip-rs), ~591.9k total downloads *(live snapshot, not a pinned fact)*. **It is maintained by the gfx-rs org — the wgpu maintainers themselves.** The companion low-level FFI crate is `nv-flip-sys`. Description: *"High-Level bindings to Nvidia Labs's ꟻLIP image comparison and error visualization library."* The crate wraps NVIDIA's **FLIP** — the perceptual difference metric from the *"FLIP: A Difference Evaluator for Alternating Images"* paper (HPG 2020). FLIP itself is BSD-3-Clause. + +## The FLIP metric and its output + +`nv_flip::flip(reference, test, DEFAULT_PIXELS_PER_DEGREE)` produces a per-pixel **error map**, documented as *"the per-pixel visual difference between the two images between 0 and 1"* (0.0 = identical, 1.0 = maximal perceptual error). The error map is a `FlipImageFloat`. 
+ +Public API (docs.rs v0.1.2): +- structs `FlipImageRgb8`, `FlipImageFloat`, `FlipHistogram`, `FlipPool` +- functions `flip()`, `magma_lut()`, `pixels_per_degree()` +- constant `DEFAULT_PIXELS_PER_DEGREE` + +The viewer-distance parameter (**pixels-per-degree**) is what makes FLIP *perceptual* rather than a raw pixel diff — it models how far the observer sits from the display, so anti-aliasing and sub-pixel rounding noise below the perceptual threshold do not register as error, while a genuine visual regression does. This edge-contrast-amplified model is the property that killed the older outlier-count approach (below). + +## Reducing the map to pass/fail: `FlipPool` + +wgpu feeds the error map into `nv_flip::FlipPool::from_image(&error_map_flip)`, a *"histogram-like value pool for determining if [the] error map has significant differences."* It exposes `mean()`, `get_percentile(p, true)`, `min_value()`, `max_value()`. The nv-flip docs explicitly recommend the mean: *"if you are to use a single number to represent the error, [the FLIP authors] recommend the mean."* + +## The current assertion model (Mean / Percentile) + +`image.rs` defines: + +```rust +pub enum ComparisonType { + Mean(f32), + Percentile { percentile: f32, threshold: f32 }, +} +``` + +- `Mean(x)` fails if the **mean** error exceeds `x`. +- `Percentile { percentile, threshold }` fails if the given percentile (in `[0,1]`) exceeds `threshold`. + +Failure messages: `"\tExpected Mean ({:.6}) to be under expected maximum ({}): {}"` and `"\tExpected {}% ({:.6}) to be under expected maximum ({}): {}"`. The harness prints the error distribution at percentiles **[25, 50, 75, 95, 99]** (`pool.get_percentile(p/100.0, true)`), runs **every** check in the list (`all_passed &= check.check(&mut pool)`), and on any failure panics with `"Image data mismatch: {}"` where `{}` is the path to a written **diff image**. 
+ +Practical thresholds live in the **[0.01, 0.1] range** for mean/percentile error, per the crate docs — Buiy should treat these as **empirically tuned per-test, not universal**. *(Flag: this range is crate-doc guidance, not a hard constant in `image.rs`; the actual thresholds are per-test call-site args, which could not be enumerated from a single source.)* + +## The diff-map artifact + +On mismatch, wgpu colorizes the FLIP error map with the magma colormap and writes it to disk: + +```rust +error_map_flip.apply_color_lut(&nv_flip::magma_lut()); +``` + +saved as `"{file_stem}-{renderer}-difference.png"`, where `renderer` is `"{backend}-{sanitized_name}-{sanitized_driver}"`. This **per-backend naming** is how wgpu disambiguates failures across GPUs/drivers — a pattern Buiy will need if it ever runs golden tests on more than one adapter. + +## Golden-image storage (and the implicit-bootstrap wart) + +References are read via `read_png(&path, width, height)` returning `Option<Vec<u8>>`; the harness validates width/height, RGBA color type, and 8-bit depth, then **strips alpha** (FLIP compares RGB). PNGs are committed in-repo alongside the example/test sources. + +**Wart:** `read_png` itself only returns `None` when no reference exists; the minting happens one level up in `compare_image_output` (image.rs:155–179), whose `None =>` arm **writes the current test image as the new baseline via `write_png` and returns early** — i.e. **golden bootstrapping is implicit on first run**. A missing or deleted golden silently "passes" by minting itself. Buiy should make first-run minting **explicit and gated** (a flag, not the default) so a deleted golden fails loudly instead of being silently regenerated. See [open-problems.md](open-problems.md). + +## The superseded outlier-count model (cautionary precursor) + +Before the FLIP rewrite, wgpu used a raw per-pixel **outlier count**: count pixels whose per-channel delta exceeds a tolerance; fail if too many exceed a limit. 
[PR #2767](https://github.com/gfx-rs/wgpu/pull/2767) ("Increase max_outliers on wgpu water example reftest.", Jim Blandy / @jimblandy, merged 2022-06-14) *raised* `max_outliers` on the water reftest — it did **not** replace the model. Its body documents the exact flake: on *"AMD RADV POLARIS12"* the test panicked with + +> `"Image data mismatch! Outlier count 464 over limit 460. Max difference 213"` + +— i.e. *"N outlier pixels over limit M, max channel difference D"* — and on inspection the diff was *"just a few dots here and there."* This is the evidence the outlier model was **brittle across drivers**: scattered sub-perceptual noise tripped a hard count. A separate symptom is [issue #2760](https://github.com/gfx-rs/wgpu/issues/2760) ("Windows 11's WARP Passes the Water Example Image Comparison Test", @cwfitzgerald) — an unexpected-*pass* the outlier model could not express as a per-driver expectation. The model was later **replaced** by perceptual FLIP in [PR #3830](https://github.com/gfx-rs/wgpu/pull/3830) ("Migrate to nv-flip for image comparison", merged 2023-06-08), precisely because perceptual mean/percentile thresholds **tolerate scattered sub-perceptual noise that an outlier count cannot**. + +**For Buiy's strategy doc:** cite the outlier-count model as the cautionary precursor, and adopt the FLIP mean/percentile model as the target. + +## Implications for Buiy + +`nv_flip` + **mean** is the wgpu-ecosystem-native perceptual metric (directly answers verification Open Q #3 in the strategy report). It is already license-compatible (`MIT OR Apache-2.0 OR Zlib`; FLIP is BSD-3-Clause) and avoids the AA-flake problem that kills exact-pixel goldens and that killed wgpu's own outlier-count model. The one cost: `nv_flip` is **FFI to a C++ library** (`nv-flip-sys`) — a build-graph and native-dependency cost in CI. 
If Buiy wants pure-Rust, a pixelmatch-YIQ port is the runner-up — cheaper to vendor, but YIQ has no edge-contrast term and so will produce more AA false-positives. Tradeoff named; FLIP recommended. See [lessons.md](lessons.md). + +## Sources + +- crates.io: https://crates.io/crates/nv-flip · API: https://crates.io/api/v1/crates/nv-flip +- docs.rs: https://docs.rs/nv-flip/latest/nv_flip/ +- repo: https://github.com/gfx-rs/nv-flip-rs +- wgpu `tests/src/image.rs`: https://raw.githubusercontent.com/gfx-rs/wgpu/trunk/tests/src/image.rs +- wgpu PR #2767 (raise max_outliers — the RADV POLARIS12 outlier-flake evidence, pre-FLIP): https://github.com/gfx-rs/wgpu/pull/2767 +- wgpu issue #2760 (WARP unexpectedly passes water example — per-driver-outcome the outlier model can't express): https://github.com/gfx-rs/wgpu/issues/2760 +- wgpu PR #3830 (migrate to nv-flip — the actual outlier→FLIP replacement): https://github.com/gfx-rs/wgpu/pull/3830 +- FLIP paper: "FLIP: A Difference Evaluator for Alternating Images" (HPG 2020), NVIDIA Labs +- Sibling files: [gpu-test-harness.md](gpu-test-harness.md), [determinism-rasterizer.md](determinism-rasterizer.md), [open-problems.md](open-problems.md), [lessons.md](lessons.md), [glossary.md](glossary.md) diff --git a/docs/prior-art/wgpu-testing/lessons.md b/docs/prior-art/wgpu-testing/lessons.md new file mode 100644 index 0000000..de15b86 --- /dev/null +++ b/docs/prior-art/wgpu-testing/lessons.md @@ -0,0 +1,70 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Lessons for Buiy from wgpu's CI / GPU test infrastructure — the closest determinism model; what it Validates, what to Avoid, what to Borrow + +# Lessons for Buiy + +This is the consult-this-when-designing file. The other files in this folder are evidence; this file is decisions. 
wgpu's test infrastructure is **uniquely transplantable** because Buiy renders on the same wgpu abstraction — the `VK_DRIVER_FILES` determinism recipe, the `FailureCase` expectation primitive, and the `nv_flip` metric are all *directly reusable*, not merely instructive. Most Buiy golden/reftest-infra decisions reduce to "how closely do we copy wgpu's stack?" — this file enumerates the answers. It feeds [`docs/reports/2026-06-14-visual-bug-detection-strategy.md`](../../reports/2026-06-14-visual-bug-detection-strategy.md), especially Tiers 4–5. + +## Validates + +These Buiy design choices are confirmed by wgpu's experience and stack shape: + +- **Reftests-first, goldens-last and few.** wgpu image-compares only its *examples*, sitting atop a deep harness of non-visual GPU tests; goldens are the thin top of the pyramid, and *correctness* is carried separately by CTS. Buiy's reftests-first ordering — keep goldens last and minimal — is the same shape wgpu arrived at. See [image-compare.md](image-compare.md), [open-problems.md § 3](open-problems.md). +- **A perceptual metric, not exact-pixel.** wgpu *replaced* its exact-ish outlier-count model with perceptual FLIP ([PR #3830 "Migrate to nv-flip for image comparison"](https://github.com/gfx-rs/wgpu/pull/3830), merged 2023-06-08) precisely because scattered sub-perceptual AA/rounding noise tripped the count across drivers. The brittleness is documented in [PR #2767](https://github.com/gfx-rs/wgpu/pull/2767) (RADV POLARIS12: *"just a few dots"* tripping `Outlier count 464 over limit 460`) and [issue #2760](https://github.com/gfx-rs/wgpu/issues/2760) (WARP unexpectedly *passing* — an outcome the count model can't express per-driver). Buiy's plan to use a perceptual tolerance instead of bit-equality is validated by the system closest to it abandoning the alternative. 
+- **Pin the rasterizer, don't track the distro.** wgpu's abandonment of `ppa:oibaf/graphics-drivers` ([#2594](https://github.com/gfx-rs/wgpu/issues/2594)) is direct evidence that a rolling software rasterizer is a *moving reference image* — every unrelated upstream regression reddens CI. Buiy's intent to pin a single rasterizer version is the correct conclusion. See [determinism-rasterizer.md](determinism-rasterizer.md). +- **Process-per-test isolation.** nextest is the right runner for any test that creates a `wgpu::Device`; a device crash/validation abort must not poison a shared process. Buiy already lives in the nextest-friendly Rust ecosystem. See [gpu-test-harness.md](gpu-test-harness.md). +- **Per-backend expectations beat global disabling.** The `FailureCase` model lets one test body assert a different correct outcome per GPU. This validates designing Buiy's reftest tier around declarative per-backend expectations rather than `#[ignore]` or `cfg`. See [gpu-test-harness.md](gpu-test-harness.md). + +## Avoid + +| Pitfall | Source | Buiy mitigation | +|---|---|---| +| Trusting lavapipe pixels as ground truth for *correctness* | lavapipe self-warns at init: `"WARNING: lavapipe is not a conformant vulkan implementation, testing use only."` (Mesa `lvp_device.c`); ships version-pinned bugs ([#8727](https://github.com/gfx-rs/wgpu/issues/8727), Known-Driver-Issues wiki) | Goldens prove **no-change, not correct**. Pair them with Buiy's lower tiers (layout-number / display-list snapshots, metamorphic invariants) which carry the correctness load. wgpu leans on CTS for conformance, separately. [open-problems.md § 3](open-problems.md) | +| Letting goldens be Tier 1 | wgpu image-compares only examples, atop a deep non-visual harness | Keep goldens last and few; catch most regressions in Tiers 1–4 so Tier 5 is a minimal residue. 
[image-compare.md](image-compare.md) | +| Copying a `LP_NUM_THREADS` determinism story | **Not** in `install-mesa/action.yml`; Mesa docs do not call it a determinism knob | Determinism comes from the **pinned Mesa version**, not thread count. Do not export `LP_NUM_THREADS` expecting FP determinism. [determinism-rasterizer.md](determinism-rasterizer.md) | +| Using the upstream Mesa ICD path | "The ICD provided by the mesa build is hardcoded to the build environment" | **Write your own ICD JSON** pointing at the unpacked `libvulkan_lvp.so`, then export `VK_DRIVER_FILES=$PWD/icd.json`. [determinism-rasterizer.md](determinism-rasterizer.md) | +| Substring-keyed expectations as the default | `adapter("llvmpipe")` / `validation_error(msg)` break silently when adapter names or messages get reworded across driver bumps | Prefer structured keys (enum'd backend + stable adapter-class id); pin message text in the module that emits it. [open-problems.md § 2](open-problems.md) | +| Treating `.flaky()` / `Ignore` as a permanent home | Provides zero regression signal — passes whether code is right or wrong | Quarantine with an owner + expiry; count quarantined tests as debt, not green. [open-problems.md § 1](open-problems.md) | +| Implicit golden bootstrapping | `compare_image_output` mints a missing baseline (its `None =>` arm writes the test image and returns) — a deleted golden hides a regression | Make first-run minting an explicit opt-in flag; fail loudly when an expected golden is absent. [image-compare.md](image-compare.md), [open-problems.md § 4](open-problems.md) | +| Trusting a third-party Windows rasterizer build | Windows Mesa pulled from non-gfx-rs [pal1000/mesa-dist-win](https://github.com/pal1000/mesa-dist-win) | If Buiy needs deterministic DX12 pixels, build its own Windows Mesa or pin a hash of the third-party artifact. 
[open-problems.md § 6](open-problems.md) | +| Comparing goldens across backends | The pinned-rasterizer guarantee holds within one backend only | One golden per backend cell, or run goldens on a single pinned backend. [open-problems.md § 8](open-problems.md) | +| Lifting stale APIs/paths from external write-ups | `Skip`/`device`/`environment` `FailureCase` variants and `docs/testing/integration_tests.md` are **not** in current source | Verify against trunk before copying. [gpu-test-harness.md](gpu-test-harness.md), [open-problems.md § 9](open-problems.md) | + +## Borrow + +Concrete subsystems and patterns worth direct adaptation — licenses align (`nv-flip` is MIT OR Apache-2.0 OR Zlib; FLIP BSD-3-Clause; both compatible with Buiy's MIT OR Apache-2.0): + +1. **The `FailureCase` model verbatim — including the unexpected-pass-panics rule.** Buiy's reftest tier should record expected outcomes declaratively, scoped to `backend × adapter-substring × driver`, and *panic when an expected failure unexpectedly passes* so fixing a backend forces removal of the stale expectation. This keeps the expectation list honest as Buiy's renderer matures — far stronger than `#[ignore]`. This is the single highest-value borrow. See [gpu-test-harness.md](gpu-test-harness.md). + +2. **Skip-on-unmet-precondition (`TestParameters`).** Gate a test on required features/limits/downlevel caps so unsupported adapters **skip rather than fail**. Maps directly to Buiy's optional GPU features. See [gpu-test-harness.md](gpu-test-harness.md). + +3. **The pinned-rasterizer recipe, consuming `gfx-rs/ci-build` artifacts directly.** Three pieces: (a) a single `MESA_VERSION` + release-tag pin (reuse gfx-rs's prebuilt tarball — no need to build your own Mesa), (b) a composite action that downloads it and **writes its own ICD**, (c) `VK_DRIVER_FILES` + `WGPU_ADAPTER_NAME` to make adapter choice deterministic and hardware-proof. 
Bump the pin deliberately, in a tracked issue, regenerating goldens in the same PR. See [determinism-rasterizer.md](determinism-rasterizer.md). + +4. **`VK_DRIVER_FILES` + `WGPU_ADAPTER_NAME` for adapter selection.** `VK_DRIVER_FILES` forces the Vulkan loader to see *only* lavapipe so a test can't accidentally pick a hardware GPU; `WGPU_ADAPTER_NAME` (case-insensitive substring) nails the exact device. Reuse as-is. See [determinism-rasterizer.md](determinism-rasterizer.md). + +5. **`nv_flip` + mean as Buiy's perceptual metric.** The wgpu-ecosystem-native, gfx-rs-maintained, license-clean choice (directly answers verification Open Q #3). FLIP error map → `FlipPool::mean()` (the authors' recommended summary) → a per-test `Mean`/`Percentile` threshold in the empirical `[0.01, 0.1]` range. Avoids the AA-flake problem that kills exact-pixel goldens. **Runner-up named:** a pure-Rust pixelmatch-YIQ port is cheaper to vendor (no `nv-flip-sys` C++ FFI) but YIQ has no edge-contrast term → more AA false-positives. FLIP recommended; choose YIQ only if the FFI cost is unacceptable. See [image-compare.md](image-compare.md). + +6. **The per-backend diff-artifact naming.** On mismatch, colorize the FLIP error map with `magma_lut()` and write `{stem}-{backend}-{name}-{driver}-difference.png`. Buiy needs this disambiguation the moment it runs goldens on more than one adapter. See [image-compare.md](image-compare.md). + +7. **The `wgpu-info`-style sweep harness.** A wrapper that runs the same test command once per (adapter × backend), setting `WGPU_BACKEND` / `WGPU_ADAPTER_NAME` each run. Lets Buiy exercise its full backend matrix from one invocation. See [gpu-test-harness.md](gpu-test-harness.md). + +8. **The outlier-count model as a documented cautionary precursor.** Cite wgpu's abandoned `max_outliers` approach in Buiy's strategy doc as the *negative* example — count-per-pixel tolerance is brittle across drivers — to justify the perceptual choice. 
See [image-compare.md](image-compare.md). + +## How to use this file + +When designing a Buiy golden/reftest harness component, find the relevant Avoid row and read its source file to understand the trap, then find the relevant Borrow item for the wgpu primitive to adapt. Because Buiy is on wgpu, several borrows are near-copy-paste — verify each against trunk + live crate docs before lifting concrete code (wgpu's consolidation of its testing docs into `docs/testing.md` and the dropped `FailureCase` variants are both reminders that the source moves). Promote any decision into a Buiy spec under `docs/specs/`; this file captures what we learn from wgpu, not Buiy's own commitments. + +## Sources + +- `tests/src/expectations.rs` (`FailureCase`, `FailureBehavior`): https://github.com/gfx-rs/wgpu/blob/trunk/tests/src/expectations.rs +- `tests/src/params.rs` (`TestParameters`): https://github.com/gfx-rs/wgpu/blob/trunk/tests/src/params.rs +- `tests/src/image.rs` (`nv_flip`, `ComparisonType`): https://raw.githubusercontent.com/gfx-rs/wgpu/trunk/tests/src/image.rs +- `install-mesa/action.yml` (pin + `VK_DRIVER_FILES`): https://github.com/gfx-rs/wgpu/blob/trunk/.github/actions/install-mesa/action.yml +- `gfx-rs/ci-build`: https://github.com/gfx-rs/ci-build +- wgpu issues #2594 / #8544 / #8727 / #2760 and PRs #2767 / #3830 (outlier-model brittleness + the FLIP migration): https://github.com/gfx-rs/wgpu/issues/2594 · https://github.com/gfx-rs/wgpu/issues/8544 · https://github.com/gfx-rs/wgpu/issues/8727 · https://github.com/gfx-rs/wgpu/issues/2760 · https://github.com/gfx-rs/wgpu/pull/2767 · https://github.com/gfx-rs/wgpu/pull/3830 +- lavapipe non-conformance warning (Mesa `lvp_device.c`, `fprintf(stderr, …)`): https://gitlab.freedesktop.org/mesa/mesa/-/blob/main/src/gallium/frontends/lavapipe/lvp_device.c +- `nv-flip` (docs.rs / crates.io / repo): https://docs.rs/nv-flip/latest/nv_flip/ · https://crates.io/crates/nv-flip · https://github.com/gfx-rs/nv-flip-rs +- `docs/testing.md`: 
https://github.com/gfx-rs/wgpu/blob/trunk/docs/testing.md +- Sibling files: [gpu-test-harness.md](gpu-test-harness.md), [determinism-rasterizer.md](determinism-rasterizer.md), [image-compare.md](image-compare.md), [open-problems.md](open-problems.md), [glossary.md](glossary.md) +- Buiy strategy report: [../../reports/2026-06-14-visual-bug-detection-strategy.md](../../reports/2026-06-14-visual-bug-detection-strategy.md) diff --git a/docs/prior-art/wgpu-testing/open-problems.md b/docs/prior-art/wgpu-testing/open-problems.md new file mode 100644 index 0000000..a39e3d9 --- /dev/null +++ b/docs/prior-art/wgpu-testing/open-problems.md @@ -0,0 +1,64 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** What wgpu's CI / GPU test infrastructure structurally does NOT solve — the limits Buiy inherits if it copies the stack + +# Open problems — what wgpu's test stack does not solve + +wgpu's harness is the closest determinism model Buiy has, but it has hard structural limits. These are boundaries, not gaps in effort; Buiy inherits each one if it copies the stack, and several need an explicit Buiy-side mitigation. + +## 1. Flakiness is encoded as a first-class state, not fixed + +The harness openly treats some failures as permanently non-deterministic: `.flaky()` ("Test is flaky with the given configuration. Do not assert failure") and `FailureBehavior::Ignore` ("useful for tests that flake in a very specific way"). This is a pragmatic admission that some driver behavior cannot be made deterministic — but it means an `Ignore`d case provides **no regression-catching signal at all**: it passes whether the code is right or wrong. The wpt-reftests folder documents the same hazard with intermittent reftests forced into a `0`-inclusive fuzz range. **Buiy mitigation:** treat `Ignore`/`.flaky()` as a quarantine with an owner and an expiry, not a permanent home — count quarantined tests as a debt metric, not green. + +## 2. 
Substring matching is brittle across version bumps + +Expectations are keyed on **adapter-name substrings** (`adapter("llvmpipe")`) and **error-message substrings** (`validation_error(msg)`, `.with_message(...)`, case-insensitive). A reworded validation message or a renamed adapter silently breaks the match: the harness either stops asserting the expected failure or trips on an unexpected one. Driver-version bumps are exactly when both strings change. **Buiy mitigation:** prefer structured keys (an enum'd backend + a stable adapter-class id) over free-text substrings wherever Buiy controls the message; pin the message text in the same module that emits it. + +## 3. The reference rasterizer is "testing use only" and ships its own bugs + +lavapipe self-warns at init — Mesa's `src/gallium/frontends/lavapipe/lvp_device.c` does `fprintf(stderr, "WARNING: lavapipe is not a conformant vulkan implementation, testing use only.\n")`. The wgpu [Known-Driver-Issues wiki](https://github.com/gfx-rs/wgpu/wiki/Known-Driver-Issues) lists Mesa segfaults (query-pool reset with acceleration-structure info), and live issues show feature gaps — [#8727](https://github.com/gfx-rs/wgpu/issues/8727) "SPIR-V writing for mesh shaders is broken on llvmpipe", and the [#8544](https://github.com/gfx-rs/wgpu/issues/8544) ray-tracing limit workaround that had to wait for Mesa 25.2.7. **Consequence:** golden pixels produced by the pinned rasterizer prove *no-change*, **not correctness**. They cannot be ground truth. wgpu carries correctness separately via CTS (conformance), run through `cargo xtask cts`. **Buiy mitigation:** pair goldens with Buiy's lower tiers (layout-number / display-list snapshots, metamorphic invariants) which carry the correctness load; never let a golden be the only assertion about a behavior. + +## 4. 
Goldens silently mint themselves on first run + +When no reference exists, `read_png` returns `None` and `compare_image_output`'s `None =>` arm writes the current image as the new baseline and returns. A **missing or deleted golden silently passes** by regenerating itself — so an accidental `rm` of a baseline hides a regression instead of failing. **Buiy mitigation:** make first-run minting explicit (an opt-in flag), and fail loudly when an expected golden is absent. + +## 5. Pinning trades flakes for a manual upgrade treadmill + +The frozen-Mesa recipe removes day-to-day flakes but creates a **manual** cost: every bump needs a new `gfx-rs/ci-build` release *and* an edit to `install-mesa/action.yml`, and behavior changes must be chased by hand (e.g. restoring `Limits::blas_max_primitive_count` only after Mesa 25.2.7 fixed the underlying bug — [#8544](https://github.com/gfx-rs/wgpu/issues/8544)). The pin can also lag a real fix the project needs. **Buiy mitigation:** accept the treadmill as the price of determinism; bump in a tracked issue and regenerate goldens in the same PR. + +## 6. Supply-chain trust gap on Windows + +The Linux Mesa is built by gfx-rs itself, but the **Windows** build is pulled from a **third-party** repo, [pal1000/mesa-dist-win](https://github.com/pal1000/mesa-dist-win) — not gfx-rs-controlled. Anyone copying the recipe inherits trust in that third party for the Windows reference binary. **Buiy mitigation:** if Buiy needs deterministic DX12 pixels, prefer building its own Windows Mesa or pin a hash of the third-party artifact. + +## 7. `nv_flip` is FFI to a C++ library + +The metric is `nv-flip` → `nv-flip-sys` → a C++ FLIP implementation: a native build-graph dependency in CI (a C++ toolchain, a `-sys` crate). It is fine, and license-clean, but it is not pure Rust. **Runner-up:** a pixelmatch-YIQ port is cheaper to vendor but a weaker perceptual model (no edge-contrast term → more AA false-positives). 
Tradeoff named in [image-compare.md](image-compare.md) and [lessons.md](lessons.md). + +## 8. Determinism assumes a fixed backend per CI lane + +The whole pinned-rasterizer guarantee holds **within one backend**. The harness disambiguates failures per backend (the `{backend}-{name}-{driver}` diff naming) precisely because different backends produce different pixels. A golden compared across Vulkan-vs-DX12 would reintroduce the variance the pin removes. **Buiy mitigation:** one golden per (backend) cell, or run goldens on a single pinned backend only. + +**No software-Metal reference — macOS goldens are not deterministic under this model.** The pinned-rasterizer recipe covers Vulkan (lavapipe), GL (llvmpipe), and DX12 (WARP) only. There is no software Metal rasterizer: macOS Metal goldens would run on a real Apple GPU/driver and so are *not* bit-stable across machines or OS versions. This is load-bearing for Buiy, which targets macOS — Buiy cannot get deterministic Metal pixels from this recipe and must either route macOS visual tests through MoltenVK→lavapipe (paying a translation layer) or accept that the Metal backend has no golden tier and rely on its lower tiers there. + +## 9. Documentation drift + +A current standalone `docs/testing/integration_tests.md` could **not** be located (404 on trunk) — testing docs are consolidated in [`docs/testing.md`](https://github.com/gfx-rs/wgpu/blob/trunk/docs/testing.md). Older per-file links in external write-ups are stale. Verify against trunk before lifting any path. (Likewise the `Skip`/`device`/`environment` `FailureCase` variants some write-ups mention are **not** in current source.) 
+ +## Not covered by this folder + +These are out of scope for the prior-art (not faults in wgpu's stack) but a Buiy designer sizing the golden tier should source them elsewhere: + +- **CI cost / throughput.** No figures on how long the GPU suite or image-compare run takes, what runner class CI uses, or per-test overhead — no budget anchor for sizing Buiy's golden tier. +- **Golden-image storage cost.** PNGs are committed in-repo (see [image-compare.md](image-compare.md)) but the repo-bloat / Git-LFS-vs-not tradeoff that bites every screenshot suite is not analyzed here. +- **Per-test threshold selection.** [image-compare.md](image-compare.md) gives the `[0.01, 0.1]` empirical range but not wgpu's actual manual tuning loop for picking a per-test threshold — the operationally hard part. + +## Sources + +- lavapipe non-conformance warning (Mesa source, `fprintf(stderr, …)`): https://gitlab.freedesktop.org/mesa/mesa/-/blob/main/src/gallium/frontends/lavapipe/lvp_device.c +- wgpu `docs/testing.md`: https://github.com/gfx-rs/wgpu/blob/trunk/docs/testing.md +- Known-Driver-Issues wiki: https://github.com/gfx-rs/wgpu/wiki/Known-Driver-Issues +- wgpu issues #8544 / #8727: https://github.com/gfx-rs/wgpu/issues/8544 · https://github.com/gfx-rs/wgpu/issues/8727 +- `tests/src/expectations.rs` (flaky / Ignore / substring matchers): https://github.com/gfx-rs/wgpu/blob/trunk/tests/src/expectations.rs +- `tests/src/image.rs` (implicit golden mint): https://raw.githubusercontent.com/gfx-rs/wgpu/trunk/tests/src/image.rs +- pal1000/mesa-dist-win (third-party Windows Mesa): https://github.com/pal1000/mesa-dist-win +- Sibling files: [gpu-test-harness.md](gpu-test-harness.md), [determinism-rasterizer.md](determinism-rasterizer.md), [image-compare.md](image-compare.md), [lessons.md](lessons.md), [glossary.md](glossary.md) diff --git a/docs/prior-art/wpt-reftests/README.md b/docs/prior-art/wpt-reftests/README.md new file mode 100644 index 0000000..910a86a --- /dev/null +++ 
b/docs/prior-art/wpt-reftests/README.md @@ -0,0 +1,84 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Reference-comparison visual testing (Gecko reftests + web-platform-tests) — folder index and prior-art entry point + +# Reftests (Gecko + web-platform-tests) + +A reftest ("reference test") asserts a *relationship between two renderings produced by the same engine in the same run* — a **testcase** versus a **reference** that reaches the same pixels by a different route — rather than a match against a stored gold image. It is a *methodology*, not a product: the `==` / `!=` (Gecko) and `rel=match` / `rel=mismatch` (WPT) **operator semantics** are shared across Gecko (Firefox), web-platform-tests, Servo, and Blink (Chromium), each running its own tests against its own engine. The shared corpus is WPT, which Gecko, Servo, and Blink import; the operator semantics are common, but the *reference-linking convention* diverges — Blink discovers references by file name (`foo-expected.html` / `foo-expected-mismatch.html`) and supports neither multiple nor chained references, where Gecko/WPT use explicit `<link rel=match>` / manifest URLs and support both (see [consumers.md](consumers.md)). Because both halves of every comparison share the identical GPU, driver, font rasterizer, antialiasing, DPI, and clock, all platform-variance terms cancel in the diff — the property that lets thousands of CSS-conformance tests run with **zero stored screenshots**. + +## For Buiy + +Buiy is designing its visual-bug-detection strategy as a 5-tier pyramid (layout-number snapshots → structured display-list snapshots → metamorphic/property invariants → **reftests** → golden screenshots), and reftests are the **Tier 4** headline mechanism — "the single highest-leverage absence" in the current tree (`docs/reports/2026-06-14-visual-bug-detection-strategy.md` § Tier 4). This folder is the prior-art behind that bet. 
Reftests port to Buiy cleanly: Buiy already renders to an offscreen `wgpu` texture, so rendering a `test_scene` and a `ref_scene` to texture in a single process against the same `wgpu::Device` reproduces the same-engine-cancels-variance guarantee the browsers engineered — and Buiy's primitive (literal-positioned) layer gives a trivial disjoint code path for authoring references. The decision content lives in [lessons.md](lessons.md). + +## Honest assessment + +- **The methodology is decades-proven but has a hard structural ceiling.** Reftests cannot cover any effect whose pixels are *intentionally* UA-defined or unspecified — underline position/thickness, `dotted`/`dashed`/`ridge`/`groove`/`double` borders, font-metric-dependent rendering — because no feature-free reference can reproduce them. The CSS-WG enumerates these as "impossible to reftest" verbatim. This is a boundary, not a gap in effort, and it is exactly where Buiy must hand off to Tier 5 goldens. See [open-problems.md](open-problems.md). +- **The reference-independence discipline is load-bearing and easy to get wrong.** If the reference exercises the same code path as the test, a shared bug makes both render identically wrong and the test *passes vacuously*. The browsers mitigate with disjoint techniques and multiple references; Buiy must adopt the same discipline or reftests silently lose their teeth. This is Open Question #1 in the strategy report. +- **Fuzzy matching is a necessary escape hatch and a known wart.** Exact equality is too strict (antialiasing, GPU rounding, spec-permitted latitude), so a bounded two-axis tolerance exists — but over-broad fuzz ranges silently mask real regressions, and intermittently-failing tests are forced into a weaker `0`-inclusive form that loses the regression-catching property. See [fuzzy-matching.md](fuzzy-matching.md). +- **Cross-backend reftests reintroduce variance.** The "one run cancels variance" benefit assumes a fixed `wgpu` backend per CI lane. 
A Vulkan-vs-Metal reftest would reintroduce the very variance reftests cancel and need fuzz — verify before assuming exact-match CI across platforms. + +## Key facts (verified 2026-06-14 against the cited primary sources) + +| Fact | Value | Source | +|---|---|---| +| What it is | a test *methodology* — relationship between two renderings of *different source files* by the *same engine* | MDN, CSSWG wiki | +| Operators (Gecko) | `==` (pass if renderings SAME) / `!=` (pass if DIFFERENT) | firefox-source-docs Reftest | +| Operators (WPT) | `<link rel=match>` / `<link rel=mismatch>` | web-platform-tests.org reftests | +| Default comparison | **exact pixel match** unless a fuzzy annotation relaxes it | firefox-source-docs, WPT docs | +| Viewport (Gecko) | **800×1000**; content outside is ignored | firefox-source-docs Reftest | +| Viewport (WPT) | **800×600** including scrollbars if present | web-platform-tests.org reftests | +| Manifest (Gecko) | plain-text `reftest.list`; `#` comments; `include` forms a tree | firefox-source-docs, CSSWG wiki | +| Manifest line (Gecko) | `[<failure-type> \| <preference>]* [<http>] <type> <url> <url_ref>` | firefox-source-docs Reftest | +| Manifest (WPT) | `link` element + generated `MANIFEST.json` index | web-platform-tests.org | +| Multiple `==` refs | at least one must match (OR) | CSSWG wiki, WPT | +| Multiple `!=` refs | none may match (AND) | CSSWG wiki, WPT | +| Fuzzy (Gecko) | `fuzzy(minDiff-maxDiff,minPixelCount-maxPixelCount)`; `fuzzy-if(cond,…)` | firefox-source-docs Reftest | +| Fuzzy (WPT) | `<meta name=fuzzy content="maxDifference=10-15;totalPixels=200-300">`; ranges **inclusive** | web-platform-tests.org reftests | +| Reference reuse | sharing references is "strongly encouraged" (legibility + runner optimizations) | web-platform-tests.org | +| Async control | `class="reftest-wait"` on root; capture after `TestRendered` + class removal | web-platform-tests.org | +| Harness (Gecko) | `reftest.sys.mjs` (in-content) + `manifest.sys.mjs` + `runreftest.py`; invoked via `mach reftest` | gecko-dev `layout/tools/reftest` | +| Adopted by | Gecko, web-platform-tests, Servo, Blink | 
firefox-source-docs, chromium docs, servo docs | +| WPT corpus scale | "over 52000 tests and nearly two million subtests" (Servo, 2023-07-20) — **drifts, cite with date** | servo.org/blog/2023/07/20 | + +## Contents + +| File | Subject | +|---|---| +| [README.md](README.md) | This index — what a reftest is, honest assessment, key facts, reading order. | +| [lessons.md](lessons.md) | **The decision file.** Validates / Avoid / Borrow for Buiy's Tier-4 reftest harness. Start here when designing. | +| [glossary.md](glossary.md) | Reftest / WPT / Gecko terms, one line each. | +| [methodology.md](methodology.md) | The core idea: relationship-not-baseline, why same-engine cancels variance, the `==` / `!=` semantics, `!=` for proving non-no-op. | +| [gecko-reftests.md](gecko-reftests.md) | Gecko mechanics: the `reftest.list` manifest, operators, reference chains, `fails`/`random`/`skip` annotations, the `.sys.mjs` harness. | +| [fuzzy-matching.md](fuzzy-matching.md) | The two-axis tolerance budget, `fuzzy()` / `fuzzy-if()` / `<meta name=fuzzy>` syntax, the pin-both-ends / never-include-0 discipline, and its acknowledged wart. | +| [wpt.md](wpt.md) | web-platform-tests: the cross-vendor corpus, `rel=match`/`rel=mismatch`, the `wpt` runner + `MANIFEST.json`, scale, and the two-way vendor sync. | +| [consumers.md](consumers.md) | How Servo and Blink consume the corpus — out-of-band `.ini` expectations, reftest-first/golden-last ordering, the per-engine pass-state model. | +| [open-problems.md](open-problems.md) | What reftests structurally cannot do: the "impossible to reftest" category, the reference-independence vacuous-pass failure mode, fuzzy's masking risk, cross-backend variance. | + +## Reading order + +1. **[methodology.md](methodology.md)** — the one idea everything else elaborates: assert a relationship between two engine renderings, not a baseline. +2. **[gecko-reftests.md](gecko-reftests.md)** and **[wpt.md](wpt.md)** — the two concrete realizations of that idea (manifest-file vs. 
markup-link), with the same comparison semantics. +3. **[fuzzy-matching.md](fuzzy-matching.md)** — the tolerance model Buiy's Tier-4 metric must copy (it is the same two-axis budget the strategy report's perceptual metric targets). +4. **[consumers.md](consumers.md)** — how a *parallel* engine (Servo, the closest prior art) treats the corpus as an external fixture with its own pass-state — the architectural model for Buiy. +5. **[open-problems.md](open-problems.md)** — the irreducible limit that defines the Tier-4/Tier-5 boundary. +6. **[lessons.md](lessons.md)** — the distilled decisions, written for the author of the Buiy reftest harness. + +## How to use + +**Framing disclosure.** These docs are written from Buiy's stance — an AccessKit-first, wgpu + Taffy + cosmic-text, parallel-to-bevy_ui retained-mode engine building a reftests-first layered visual-bug-detection strategy. The "Implications for Buiy" / lessons framing reads Reference-comparison visual testing through that lens; readers auditing whether that strategy is itself right should weigh the corpus accordingly — it is a learn-from artifact, not a neutral catalog. + +## Framing disclosure + +This folder is written from Buiy's stance: an ECS-native (Bevy 0.18) retained-mode Rust GUI library with a custom `wgpu` pipeline and a typed CSS-subset above Taffy, designing a reftests-first visual-bug-detection pyramid. The "Implications for Buiy" subsections and [lessons.md](lessons.md) read the methodology through that lens — programmatic typed scenes instead of HTML strings, an offscreen `wgpu` texture instead of a browser window, `reftest!(match/mismatch, …)` as the manifest-as-code analogue. The evidence files ([methodology.md](methodology.md), [gecko-reftests.md](gecko-reftests.md), [fuzzy-matching.md](fuzzy-matching.md), [wpt.md](wpt.md), [consumers.md](consumers.md)) describe the systems on their own terms; Buiy implications are confined to clearly-labelled subsections and to [lessons.md](lessons.md). 
+ +## Sources + +- MDN, "Creating reftest-based unit tests" — https://developer.mozilla.org/en-US/docs/Mozilla/QA/Reftest (mirror: https://devdoc.net/web/developer.mozilla.org/en-US/docs/Creating_reftest-based_unit_tests.html) +- Firefox Source Docs, Reftest — https://firefox-source-docs.mozilla.org/layout/Reftest.html +- CSSWG wiki, test/reftest — https://wiki.csswg.org/test/reftest +- web-platform-tests, writing reftests — https://web-platform-tests.org/writing-tests/reftests.html +- web-platform-tests repo — https://github.com/web-platform-tests/wpt +- gecko-dev `layout/tools/reftest` — https://github.com/mozilla/gecko-dev/tree/master/layout/tools/reftest +- Sibling files: [methodology.md](methodology.md), [gecko-reftests.md](gecko-reftests.md), [fuzzy-matching.md](fuzzy-matching.md), [wpt.md](wpt.md), [consumers.md](consumers.md), [open-problems.md](open-problems.md), [lessons.md](lessons.md), [glossary.md](glossary.md) +- Sibling prior art: [../blink/](../blink/), [../servo-stylo/](../servo-stylo/), [../taffy/](../taffy/), [../xilem-masonry/](../xilem-masonry/) +- Buiy strategy report: [../../reports/2026-06-14-visual-bug-detection-strategy.md](../../reports/2026-06-14-visual-bug-detection-strategy.md) diff --git a/docs/prior-art/wpt-reftests/consumers.md b/docs/prior-art/wpt-reftests/consumers.md new file mode 100644 index 0000000..dda6f1e --- /dev/null +++ b/docs/prior-art/wpt-reftests/consumers.md @@ -0,0 +1,53 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** How Servo and Blink consume the reftest corpus — out-of-band expectations, the per-engine pass-state model, and the reftest-first / golden-last ordering + +# Consumers: Servo and Blink + +The same `==`/`rel=match` methodology is run by multiple engines against their own code. 
How a *parallel* engine consumes a shared reftest corpus — and how it ranks reftests against golden tests — is the most directly transferable architecture for Buiy, which is itself a from-scratch engine designing its first reftest tier. + +## Servo: reftests as the primary CSS signal for a parallel Rust engine + +Servo — the closest prior art, a parallel Rust engine — consumes the W3C reftest corpus through its `mach` Python driver. The current Servo Book lists `./mach test-wpt` as the one command for the full WPT suite ("The simplest way to run the Web Platform Tests in Servo is `./mach test-wpt`"), with subset invocations like `./mach test-wpt dom` or `./mach test-wpt tests/wpt/yourtest`. The corpus lives under `tests/wpt`: + +- `tests/wpt/tests` — the upstream cross-browser tests (including CSS-WG reftests). +- `tests/wpt/mozilla/tests` — Servo-only tests that depend on Servo features. +- `tests/wpt/webgl` / `tests/wpt/webgpu` — imported suites. + +**The key architectural takeaway: expectations are stored out-of-band, not in the test files.** Pass/fail expectations are not asserted in the test files but stored as `.ini` metadata under a `meta` folder, refreshed via `./mach test-wpt --update-expectations path/to/tests/` and `./mach test-wpt --manifest-update`; CI imports are pulled with `./mach update-wpt `. A parallel engine treats the shared reftest corpus as an **external fixture** and tracks *its own* per-test pass state separately, rather than forking the tests. Aggregate CSS-vs-WPT health is tracked publicly at servo.org/wpt (scored as "percentages of total **enabled** tests… that pass," subtest tests scored 0–1 by passing fraction), drilling into wpt.fyi/results/?product=servo. 
+ +**Historical `test-css` / `test-ref` (currency caveat).** The CSS-specific entry point was historically `./mach test-css`, documented as running "the cross-browser CSS WG reference tests… intended to work across many browsers," alongside `./mach test-ref` for Servo-specific reftests. **Could not fully verify currency:** these appear in older wiki/blog material (2015 era); the present-day Servo Book testing page surfaces only `test-wpt`, `test-unit`, `test-tidy`, `test-devtools`, suggesting CSS reftests were folded into the unified WPT path. Treat `test-css` as historically real but possibly superseded. Servo's reftest manifests use `.list` files with `==` (must-match) and `!=` (must-not-match) operators (e.g. `test/ref/basic.list`). + +## Blink: reftests in `web_tests`, with goldens as last resort + +Chromium runs the suite via `third_party/blink/tools/run_web_tests.py`; tests live in `third_party/blink/web_tests`. Blink documents three tiers and **explicitly ranks reftests above pixel/golden tests**: + +> "Reference tests, also known as reftests, perform a pixel-by-pixel comparison between the rendered image of a test page and the rendered image of a reference page." + +And the controlling rule: + +> "You should only write a pixel test if you cannot use a reference test." + +— because pixel tests are "less robust… because the rendering of a page is influenced by many factors such as the host computer's graphics card and driver, the platform's text rendering system, and various user-configurable operating system settings." + +Blink's reference linking: test `foo.html` pairs with `foo-expected.html` via `<link rel="match">`, or `foo-expected-mismatch.html` via `<link rel="mismatch">`. Notably, **"Multiple references and chained references are not supported"** in Blink — a divergence from Gecko/WPT, which do support them. This "reftest-first, golden-only-if-forced" ordering is exactly Buiy's pyramid. + +## Implications for Buiy + +Two patterns transfer with high confidence: + +1. 
**The reftest-first / golden-last ordering is industry-validated, not novel.** Blink states it as a *rule* ("only write a pixel test if you cannot use a reference test"); Buiy's pyramid places Tier-4 reftests above Tier-5 goldens for the identical reason (rendering depends on GPU/driver/text-system). Cite Blink's rule when defending the ordering. +2. **Out-of-band per-engine pass-state is the right model for a parallel engine — but Buiy mostly sidesteps it.** Servo's `.ini`-in-`meta` expectation model exists because Servo runs *someone else's* tests and must track which it currently fails without editing them. Buiy authors its *own* reftests in Rust, so the test and the expected result co-locate (a `#[test]` either passes or is `#[ignore]`-with-reason). Buiy only needs the Servo model if it ever imports an *external* corpus (e.g. Taffy's WPT-derived layout fixtures, already noted as reusable in the strategy report) — at which point store expectations out-of-band rather than mutating imported fixtures. + +The currency caveat on `test-css`/`test-ref` is a reminder that command surfaces drift; cite the *mechanism* (out-of-band expectations, reftest-first ordering), not the exact subcommand. 
+ +## Sources + +- Servo Book — Testing — https://book.servo.org/contributing/testing.html +- servo/servo `tests/wpt` — https://github.com/servo/servo/tree/main/tests/wpt +- Servo wiki — Testing (historical `test-css`/`test-ref`) — https://github.com/servo/servo/wiki/Testing +- Servo environment blog (2015) — https://servo.org/blog/2015/07/22/environment/ +- Servo WPT pass rates — https://servo.org/wpt/ ; https://wpt.fyi/results/?product=servo +- Blink — Writing Web Tests (reftest-first rule, no chained refs) — https://chromium.googlesource.com/chromium/src/+/main/docs/testing/writing_web_tests.md +- Sibling files: [wpt.md](wpt.md), [methodology.md](methodology.md), [open-problems.md](open-problems.md), [lessons.md](lessons.md) +- Sibling prior art: [../servo-stylo/](../servo-stylo/), [../blink/](../blink/) diff --git a/docs/prior-art/wpt-reftests/fuzzy-matching.md b/docs/prior-art/wpt-reftests/fuzzy-matching.md new file mode 100644 index 0000000..492cdfe --- /dev/null +++ b/docs/prior-art/wpt-reftests/fuzzy-matching.md @@ -0,0 +1,51 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Fuzzy matching — the two-axis tolerance budget, the `fuzzy()` / `fuzzy-if()` / `<meta name=fuzzy>` syntax, and the pin-both-ends discipline + +# Fuzzy matching + +A reftest asserts that the test page and the reference page render to *identical* pixels (or, with `!=`, to *non*-identical pixels). In practice exact equality is too strict: anti-aliasing, sub-pixel positioning, GPU/driver rounding, and spec-permitted implementation latitude produce tiny, legitimate per-pixel differences that would otherwise flood the suite with false failures. Fuzzy matching lets a reftest tolerate a *bounded* amount of difference while still failing on anything outside the bound. Both Gecko reftest and web-platform-tests use the **same two-axis model**, and an annotation must specify both axes. + +## The two axes + +1. 
**Maximum per-channel pixel difference** — the largest allowed difference in any single RGB(A) color channel of any one pixel ("how wrong per pixel"). +2. **Number of differing pixels** — the total count of pixels allowed to differ at all ("how many pixels wrong"). + +The split exists because the two failure modes it discriminates are physically different. A *small* color delta spread over *many* pixels (anti-aliasing along a long edge) is benign; a *large* color delta even on a *few* pixels (a glyph in the wrong color, or a box shifted a pixel) is a real bug. A single "percent different" scalar cannot separate these — it would either accept a catastrophic small-area error or reject benign large-area smoothing. Splitting "how wrong per pixel" from "how many pixels wrong" lets authors accept the first while still catching the second. + +## Syntax + +**Gecko** (manifest annotation): `fuzzy(minDiff-maxDiff,minPixelCount-maxPixelCount)`, with the conditional form `fuzzy-if(condition,minDiff-maxDiff,minPixelCount-maxPixelCount)` to scope a budget to a platform/config (e.g. `fuzzy-if(cocoaWidget,1-1,8-8)`). Each axis is a **range**, not a single number, and **both ends of both ranges are checked inclusively**: the observed max-channel difference must fall within `[minDiff, maxDiff]` *and* the observed differing-pixel count within `[minPixelCount, maxPixelCount]`, or the test fails. + +**WPT** (markup meta tag): `<meta name=fuzzy content="maxDifference=15;totalPixels=300">`. The argument names are optional — `"15;300"` is equivalent to the named form — and "These range checks are inclusive." When a test has several possible references with different tolerances, "One meta element is required per reference requiring a unique fuzziness value, but any unprefixed value will automatically be applied to any ref that doesn't have a more specific value" (prefix form: `option1-ref.html:10-15;200-300`). For `!=` (mismatch) reftests the **minimum bounds of the ranges must be zero**. 
+ +## The discipline: pin both ends, do not include 0 + +The non-obvious doctrine — and the part most worth importing into Buiy — is that a range is a *two-sided* assertion, and when a difference is *expected* the range should **not** include 0. Gecko's guidance: use the tightest bounds possible; "if the behavior is entirely deterministic this means a range like `fuzzy(1-1,8-8)`, and if at all possible, the ranges should not include 0." + +- **Pinning the lower end** (1, not 0) means that if the underlying bug is later *fixed* and the test starts matching exactly, the harness reports an **unexpected pass** — the signal that the `fuzzy()` annotation is now stale and can be removed, restoring exact-match coverage. A range that starts at 0 silently swallows that signal: the test "passes" whether the difference is present or gone, so a fix (or a *further* regression that happens to land back inside the window) goes unnoticed. +- **Pinning the upper end** catches the difference *growing* past the calibrated budget. + +So a deterministic case should use `n-n` (e.g. `1-1`, `8-8`), widening to `lo-hi` only as much as genuine run-to-run variance demands — the window being the smallest interval that still passes reliably while leaving regressions outside it. + +## The wart: when 0 is unavoidable + +The docs concede the limit honestly: "In cases where the test only sometimes fails, this unfortunately requires using 0 in both ranges," and the consequence is stated plainly — "we won't get reports of an unexpected pass if the test regresses further." So intermittently-failing (non-deterministic) tests are forced into the weaker 0-inclusive form and **lose the regression-catching property**; that is the acknowledged cost, not a feature. 
+ +> *Verification flag:* the longer Gecko paragraph could not be pulled verbatim through the fetch tool (a content-length guard blocked literal quotation); the substance was cross-confirmed across two independent search retrievals of that same page, but the precise sentence punctuation is reconstructed, not byte-exact. The two-axis definition, inclusive-range semantics, the `fuzzy(1-1,8-8)` example, the "should not include 0" rule, and the intermittent-test caveat are all from the cited primary docs. + +## How authors calibrate the budget + +The numbers are empirical, measured from a real failing run rather than guessed. The tooling reports the *actual* max-channel difference and differing-pixel count for a comparison — under `wpt run` via logging (e.g. `--log-mach=-`), and Gecko's reftest output prints the detected `image comparison ... max difference / different pixels` for a fuzzy failure. The author reads off the observed pair, then sets the range tight around it. Third-party tooling exists to make this triage visual — e.g. Gankra's `live-reftest-analyzer` for inspecting failed Gecko reftests. + +## Implications for Buiy + +This is the metric model Buiy's Tier-4 (and the unified perceptual metric) must copy directly, not reinvent. The strategy report (`§ Cross-cutting mechanisms`) already commits Buiy to a two-axis fuzzy/outlier gate — `(max_pixel_delta, max_diff_pixels)`, AA-excluded — replacing the existing naive L1 and RMSE metrics that cannot express it. Adopt per-reference fuzz (WPT prefixes the ref URL; Buiy would carry the budget on the `RefCase`). Adopt the **pin-both-ends, never-include-0** discipline so a fixed bug surfaces as an unexpected pass that retires the budget — this is Open Question #2 in the strategy report ("is Buiy willing to pin both ends?"). 
And inherit the honest wart: intermittently-failing Buiy reftests would have to drop to 0-inclusive and lose the regression signal — which is itself an argument for engineering determinism at the source (the determinism stack) so the deterministic `n-n` form stays usable. + +## Sources + +- Firefox Source Docs, Reftest — https://firefox-source-docs.mozilla.org/layout/Reftest.html +- web-platform-tests, writing reftests — https://web-platform-tests.org/writing-tests/reftests.html +- Gankra, `live-reftest-analyzer` — https://github.com/Gankra/live-reftest-analyzer +- Sibling files: [gecko-reftests.md](gecko-reftests.md), [methodology.md](methodology.md), [open-problems.md](open-problems.md), [lessons.md](lessons.md) +- Buiy strategy report (two-axis metric, Open Question #2) — [../../reports/2026-06-14-visual-bug-detection-strategy.md](../../reports/2026-06-14-visual-bug-detection-strategy.md) diff --git a/docs/prior-art/wpt-reftests/gecko-reftests.md b/docs/prior-art/wpt-reftests/gecko-reftests.md new file mode 100644 index 0000000..db89005 --- /dev/null +++ b/docs/prior-art/wpt-reftests/gecko-reftests.md @@ -0,0 +1,83 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Gecko/Mozilla reftest mechanics — the `reftest.list` manifest, operators, reference chains, failure-type annotations, and the harness + +# Gecko reftests + +Gecko (Firefox) is where the reftest methodology originated and where its concrete mechanics — the manifest format, the operator/annotation vocabulary, and the runner — are most precisely documented. The methodology and the `==` / `!=` semantics are shared with WPT, Servo, and Blink; this file is the Gecko-specific machinery. + +## Pass/fail computation + +The harness renders both inputs and compares the resulting bitmaps. A test passes when "the bitmaps resulting from displaying the two files in an 800×1000 window are identical" (MDN). 
Firefox Source Docs confirms the fixed viewport: "The captures of the tests are taken in a viewport that is 800 pixels wide and 1000 pixels tall, so any content outside that area will be ignored." Comparison is **exact pixel match by default**; every pixel must be identical unless a `fuzzy` annotation relaxes it (see [fuzzy-matching.md](fuzzy-matching.md)). + +## Operators + +- `==` (match) — passes "if the images of the two renderings are the SAME." +- `!=` (mismatch) — passes "if the images of the two renderings are DIFFERENT." + +`!=` is the workhorse for catching *regressions to nothing* — asserting a feature actually produces a visible effect, so a no-op implementation fails. (Full operator semantics and the `!=`-proves-suppression pattern are in [methodology.md](methodology.md).) + +## Manifest format + +Tests are declared in plain-text manifests conventionally named `reftest.list`. Lines starting with `#` are comments. Each test line has the form (Firefox Source Docs): + +``` +[<failure-type> | <preference>]* [<http>] <type> <url> <url_ref> +``` + +- `<type>` is `==` or `!=`. +- `<url>` is the testcase; `<url_ref>` is the reference. Results are reported under `<url>` only. +- Manifests may `include` other manifests, forming a tree. + +Reference linking is **explicit**: every line names both the testcase URL and the reference URL, and Gecko documents no filename-based auto-discovery (no `-ref.html` convention) — verified against Firefox Source Docs. Blink is the divergence here, pairing `foo.html` with a same-named `foo-expected.html` by convention (see [consumers.md](consumers.md)); a Buiy `reftest!` naming scheme should choose explicit-pairing-vs-filename-convention deliberately. + +The in-tree `reftest.list` files live *beside the tests they reference* — the harness source directory `layout/tools/reftest/` itself contains no `reftest.list` (verified against the gecko-dev tree). 
+ +## Reference chains and multiple references + +A single test line names exactly one reference, but references can be **chained**: "If multiple reference files must be matched, each reference file should, in turn, link to the next reference" (CSSWG wiki) — chaining is expressed via the manifest links between *reference* files, letting one test transitively require several relationships. + +For specs permitting multiple conforming renderings, "each possible rendering should have its own reference file linked from the test file." The aggregate semantics (CSSWG wiki / WPT): + +- If a test has multiple `==` references then **at least one** of those references must match the test (OR). +- If a test has multiple `!=` references, then **none** of those references may match the test (AND). + +> *Verification flag:* the precise chaining link mechanism (whether via a manifest column or an in-reference annotation) is summarized from the CSSWG wiki and not cross-checked against the parser source `manifest.sys.mjs`. + +## Failure-type annotations + +Prefix tokens on a manifest line (Firefox Source Docs): + +| Token | Effect | +|---|---| +| `fails` | expected failure — **inverts** the pass condition (the test is known-broken; a *pass* would be the surprise) | +| `random` | result is nondeterministic; excluded from output | +| `skip` | do not run — used when a test crashes or hangs the browser | +| `fuzzy(minDiff-maxDiff,minPixelCount-maxPixelCount)` | pass when per-pixel value differences fall in `[minDiff,maxDiff]` *and* the count of differing pixels falls in `[minPixelCount,maxPixelCount]`, both inclusive | + +`fuzzy` is the documented escape hatch for unavoidable antialiasing/platform noise — and a known wart, since over-broad fuzz ranges silently mask real regressions. It has its own file: [fuzzy-matching.md](fuzzy-matching.md). The conditional forms `fails-if(cond,…)` / `fuzzy-if(cond,…)` scope an annotation to a platform or pref (e.g. 
`fuzzy-if(cocoaWidget,1-1,8-8)`); platform-conditional `sandbox` annotations are evaluated by the manifest parser's sandbox. + +## Harness / runner + +The runner historically lived in `reftest.jsm`; that `.jsm` form **no longer exists** — verified against the current gecko-dev tree, which contains: + +- `reftest.sys.mjs` — the in-content runner. +- `manifest.sys.mjs` — manifest parser / sandbox + platform-conditional annotation evaluation. +- `runreftest.py` and `reftestcommandline.py` — the Python drivers. + +The `.jsm`→ES-module `.sys.mjs` form is part of Gecko's tree-wide module-system migration (no single tracking bug is cited here — see the verification flag); reftest navigation was separately refactored onto `JSWindowActor` (Bug 1648444, resolved Firefox 83). Reftests are invoked via `mach reftest`. The same `==`/`!=` manifest methodology was adopted by web-platform-tests and is shared across Gecko, Servo, and Blink. + +> *Verification flag:* file presence/absence in `layout/tools/reftest/` was verified directly against the gecko-dev master tree (`reftest.sys.mjs`, `manifest.sys.mjs`, `runreftest.py`, `reftestcommandline.py` present; no `reftest.jsm`). Bug 1648444 ("Refactor reftest navigation code to use JSWindowActor," resolved Firefox 83) was confirmed on Bugzilla. The broad `.jsm`→`.sys.mjs` migration is real but is not driven by one numbered meta-bug; an earlier draft miscited Bug 1838149 for it — that bug is in fact a narrow WebDriver logging-string fix (`ModuleCache.sys.mjs`), unrelated to reftests, and the claim has been corrected to avoid asserting a bug number that does not back it. + +## Implications for Buiy + +Gecko's manifest is a text file because its testcases are loose HTML files discovered by path. 
Buiy's "documents" are typed BSN assets or programmatic widget trees, so the manifest is **ordinary Rust** — a `reftest!(match/mismatch, "name", test_scene, ref_scene)` macro or a data-driven harness over `&[RefCase]`, each pairing a `#[test]` under the existing `xvfb-run -a cargo test` gate. The Gecko annotation vocabulary maps usefully: `fuzzy(…)` → a per-pairing two-axis budget (see [fuzzy-matching.md](fuzzy-matching.md)); `fails` → a `#[ignore]`-with-reason or an expected-failure marker; `skip` → not registering the pairing on a backend that cannot run it. The chained/multiple-reference mechanism is the precedent for Buiy supporting *multiple* references where one disjoint reference is impossible (at least one `==` must match), which is the mitigation for the reference-independence wart in [open-problems.md](open-problems.md). + +## Sources + +- Firefox Source Docs, Reftest — https://firefox-source-docs.mozilla.org/layout/Reftest.html +- MDN, "Creating reftest-based unit tests" — https://developer.mozilla.org/en-US/docs/Mozilla/QA/Reftest +- CSSWG wiki, test/reftest — https://wiki.csswg.org/test/reftest +- gecko-dev `layout/tools/reftest` tree — https://github.com/mozilla/gecko-dev/tree/master/layout/tools/reftest +- Bug 1648444 (reftest navigation onto `JSWindowActor`, resolved Firefox 83) — https://bugzilla.mozilla.org/show_bug.cgi?id=1648444 +- Sibling files: [methodology.md](methodology.md), [fuzzy-matching.md](fuzzy-matching.md), [wpt.md](wpt.md), [open-problems.md](open-problems.md), [lessons.md](lessons.md) diff --git a/docs/prior-art/wpt-reftests/glossary.md b/docs/prior-art/wpt-reftests/glossary.md new file mode 100644 index 0000000..6d41042 --- /dev/null +++ b/docs/prior-art/wpt-reftests/glossary.md @@ -0,0 +1,64 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Reftest / WPT / Gecko terminology used across this folder + +# Glossary + +Short definitions for the reftest / web-platform-tests / Gecko terms used across this prior-art folder. 
Each entry points to the file where the full discussion lives. + +## Core methodology + +- **Reftest (reference test)** — a test that asserts a *relationship between two renderings produced by the same engine* (a testcase and a reference), not a match against a stored gold image. See [methodology.md](methodology.md). +- **Testcase** — the file that exercises the feature under test, often with complex markup. See [methodology.md](methodology.md). +- **Reference** — a different, usually simpler, file that produces the *same* rendering as the testcase *by a different mechanism*; "must not use the same features that are being tested." The independent oracle. See [methodology.md](methodology.md), [open-problems.md](open-problems.md). +- **`==` (match)** — Gecko operator; passes if the two renderings are the SAME. WPT spelling: `<link rel=match>`. See [gecko-reftests.md](gecko-reftests.md), [wpt.md](wpt.md). +- **`!=` (mismatch)** — Gecko operator; passes if the two renderings are DIFFERENT. WPT spelling: `<link rel=mismatch>`. Used to prove a feature actually changes rendering (catches no-op regressions). See [methodology.md](methodology.md). +- **Vacuous pass** — the failure mode where the reference shares the testcase's buggy code path, so both render identically wrong and the `==` passes despite the bug. The reference-independence wart. See [open-problems.md](open-problems.md). + +## Manifest and annotations (Gecko) + +- **`reftest.list`** — Gecko's plain-text manifest declaring reftests; `#` comments; `include` forms a manifest tree; lives beside the tests it references. See [gecko-reftests.md](gecko-reftests.md). +- **Manifest line** — `[<failure-type> | <preference>]* [<http>] <type> <url> <url_ref>`; `<type>` is `==`/`!=`, `<url>` the testcase, `<url_ref>` the reference; results reported under `<url>`. See [gecko-reftests.md](gecko-reftests.md). +- **Reference chain** — multiple reference files linked in turn so one test transitively requires several relationships. See [gecko-reftests.md](gecko-reftests.md). 
+- **`fails`** — annotation marking an expected failure; *inverts* the pass condition. See [gecko-reftests.md](gecko-reftests.md). +- **`random`** — annotation marking a nondeterministic result; excluded from output. See [gecko-reftests.md](gecko-reftests.md). +- **`skip`** — annotation; do not run (test crashes/hangs the browser). See [gecko-reftests.md](gecko-reftests.md). +- **`<annotation>-if(condition,…)`** — conditional form of an annotation (`fails-if`, `fuzzy-if`) scoping it to a platform/config (e.g. `fuzzy-if(cocoaWidget,1-1,8-8)`). See [gecko-reftests.md](gecko-reftests.md), [fuzzy-matching.md](fuzzy-matching.md). + +## Fuzzy matching + +- **`fuzzy(minDiff-maxDiff,minPixelCount-maxPixelCount)`** — Gecko annotation; passes when the max per-channel difference is in `[minDiff,maxDiff]` *and* the differing-pixel count is in `[minPixelCount,maxPixelCount]`, both inclusive. See [fuzzy-matching.md](fuzzy-matching.md). +- **`<meta name=fuzzy>`** — WPT markup form of the same two-axis budget; `content="maxDifference=10-15;totalPixels=200-300"` (named args optional; per-reference via a `ref.html:lo-hi;lo-hi` prefix). See [fuzzy-matching.md](fuzzy-matching.md). +- **Two-axis budget** — the model splitting *maximum per-channel pixel difference* ("how wrong per pixel") from *number of differing pixels* ("how many pixels wrong"), because one scalar cannot separate benign AA from a real small-area bug. See [fuzzy-matching.md](fuzzy-matching.md). +- **Pin both ends / never include 0** — the discipline of setting tight ranges that exclude 0 when a difference is expected, so a fixed bug surfaces as an *unexpected pass*. See [fuzzy-matching.md](fuzzy-matching.md). + +## Harness (Gecko) + +- **`reftest.sys.mjs`** — the in-content reftest runner (ES-module form; replaced the old `reftest.jsm`). See [gecko-reftests.md](gecko-reftests.md). +- **`manifest.sys.mjs`** — the manifest parser / sandbox; evaluates platform-conditional annotations. See [gecko-reftests.md](gecko-reftests.md). 
+- **`runreftest.py` / `reftestcommandline.py`** — the Python drivers. Reftests are invoked via `mach reftest`. See [gecko-reftests.md](gecko-reftests.md). + +## web-platform-tests + +- **web-platform-tests (WPT)** — a cross-browser test suite (single Git repo) run by Chromium, Gecko, WebKit, and Servo against their own engines; reftests are its rendering-oriented subset. See [wpt.md](wpt.md). +- **`rel=match` / `rel=mismatch`** — WPT's `link`-element spelling of the `==` / `!=` operators. See [wpt.md](wpt.md). +- **`reftest-wait`** — a class on the root element that marks a test async; the harness captures only after the class is removed (post load + fonts + paints). See [wpt.md](wpt.md). +- **`MANIFEST.json`** — WPT's generated index classifying each file (testharness/reftest/wdspec) and recording reftest match/mismatch relationships. Built by `wpt manifest`. See [wpt.md](wpt.md). +- **`wpt` CLI** — drives the suite: `wpt serve`, `wpt run`, `wpt lint`, `wpt manifest`. See [wpt.md](wpt.md). +- **Two-way vendor sync** — the bidirectional import/export automation by which Chromium (`external/wpt`), Gecko (`wpt-sync`), and Servo upstream/downstream tests to/from the shared corpus. See [wpt.md](wpt.md). + +## Consumers + +- **`mach test-wpt`** — Servo's command to run the full WPT suite; subset/update variants exist (`--update-expectations`, `--manifest-update`). See [consumers.md](consumers.md). +- **`.ini` expectations** — Servo's out-of-band per-test pass/fail metadata stored in a `meta` folder, so a parallel engine tracks *its own* pass state without editing the shared tests. See [consumers.md](consumers.md). +- **`mach test-css` / `test-ref`** — historical Servo commands for CSS-WG reference tests / Servo-specific reftests; possibly superseded by the unified `test-wpt` path (currency unverified). See [consumers.md](consumers.md). 
+- **`run_web_tests.py`** — Blink's web-test runner; tests live in `third_party/blink/web_tests`; Blink supports neither chained nor multiple references. See [consumers.md](consumers.md). + +## Sources + +- Firefox Source Docs, Reftest — https://firefox-source-docs.mozilla.org/layout/Reftest.html +- web-platform-tests, writing reftests — https://web-platform-tests.org/writing-tests/reftests.html +- CSSWG wiki, test/reftest — https://wiki.csswg.org/test/reftest +- Servo Book — Testing — https://book.servo.org/contributing/testing.html +- Blink — Writing Web Tests — https://chromium.googlesource.com/chromium/src/+/main/docs/testing/writing_web_tests.md +- Sibling files: [methodology.md](methodology.md), [gecko-reftests.md](gecko-reftests.md), [fuzzy-matching.md](fuzzy-matching.md), [wpt.md](wpt.md), [consumers.md](consumers.md), [open-problems.md](open-problems.md), [lessons.md](lessons.md) diff --git a/docs/prior-art/wpt-reftests/lessons.md b/docs/prior-art/wpt-reftests/lessons.md new file mode 100644 index 0000000..aff0888 --- /dev/null +++ b/docs/prior-art/wpt-reftests/lessons.md @@ -0,0 +1,83 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Reftests — the consult-this-when-designing decision file: Validates / Avoid / Borrow for Buiy's Tier-4 reftest harness + +# Lessons for Buiy + +This is the decision file. The other files in this folder are evidence; this one is decisions. Reftests are the **Tier-4 headline mechanism** of Buiy's reftests-first visual-bug-detection strategy and "the single highest-leverage absence" in the current tree — there are no reftests anywhere yet. The lessons here are written for the author of the Buiy reftest harness (`reftest!(match/mismatch, …)` on the offscreen `wgpu` capture path). 
+ +## Top of file: the single most important finding + +**A reftest asserts a relationship between two renderings of *different source files* by the *same engine in the same run* — and that one mechanic gives Buiy a rendering oracle that survives platform noise with zero stored baselines.** Both halves share the identical `wgpu::Device`, driver, glyph rasterizer, AA, DPI, and clock, so every platform-variance term cancels in the diff. WPT proves this at scale: thousands of CSS reftests, zero stored screenshots. The reference is an *independent oracle* reached "by a different route"; the test passes only if the engine produced the *right* pixels two different ways — not merely that two runs of the same buggy code agree. + +The whole bet rests on one discipline: **the reference must not use the feature under test.** Get that right and reftests carry **lower maintenance than per-platform goldens** — every new CSS-subset feature ships one reference pairing whose only upkeep is "keep two equivalent scenes equivalent": no per-platform golden, no rebaseline on theme tweaks, no binary blobs, no eyeball review. (The strategy report argues this is *sub-linear* growth as the feature surface grows; that is Buiy's design rationale, not a measured browser figure.) Get it wrong and the test passes vacuously. That discipline is Open Question #1 and the load-bearing risk. + +## Why it ports to Buiy cleanly + +Nothing about reftests is HTML-specific. Buiy already renders to an offscreen `wgpu` texture, so it gets the methodology's core guarantees with two *structural advantages* over the browsers: + +1. **One app run cancels platform variance — for free.** Buiy renders both `test_scene` and `ref_scene` to texture in a single process against the same `wgpu::Device`, so the same-engine-cancels-variance guarantee from the "Top of file" finding holds by construction. 
Driver-dependent SDF rounding, glyph-atlas AA, and subpixel coverage appear *identically* in both images, so `==` can often be **exact**, not fuzzy — a stronger guarantee than golden screenshots, which must survive driver upgrades. +2. **Typed scenes, not HTML strings.** `test_scene` / `ref_scene` are programmatic widget trees (or BSN assets). The reference's disjoint code path is trivial to construct because Buiy has a **primitive layer** (literal-positioned boxes) that bypasses Taffy and the CSS-subset entirely. The manifest is ordinary Rust — `reftest!(match, "flex_justify_end", test_scene, ref_scene)` — not a `reftest.list` text file. + +## Validates + +Buiy design choices the reftest experience confirms: + +- **Reftests-first / goldens-last is industry rule, not novelty.** Blink states it outright — "You should only write a pixel test if you cannot use a reference test" — for the identical reason Buiy's pyramid puts Tier-4 above Tier-5: pixel rendering depends on the GPU/driver/text-system. Cite Blink's rule when defending the ordering. The quote and its sourcing live in *this* folder's [consumers.md](consumers.md) (taken from the Chromium "Writing Web Tests" doc); the sibling `../blink/` folder does **not** yet carry a reftest/pixel-test facet (the strategy report flags adding one), so do not chase the rule there. +- **No stored baseline is the right v1 stance.** WPT runs thousands of CSS reftests with zero screenshots; the oracle is a live reference, not a frozen artifact that reds the suite on every legitimate restyle. Buiy's Tier-4 storing **zero bytes** is the same architecture, validated at browser scale. See [wpt.md](wpt.md). +- **The two-axis fuzzy metric is the correct tolerance model.** Gecko and WPT independently converged on `(maxDifference, totalPixels)` — separating "how wrong per pixel" from "how many pixels wrong" because a single scalar cannot tell benign AA smoothing from a real one-box-misplaced regression. 
Buiy's planned `(max_pixel_delta, max_diff_pixels)` gate is the same model. Do not reinvent it. See [fuzzy-matching.md](fuzzy-matching.md). +- **`!=` (mismatch) for proving suppression is the right tool.** Asserting `content-visibility: hidden` `!=` the visible render guards against a silent no-op, where a `==` would pass vacuously on blank-vs-blank. Buiy's `!=` anti-tests for cull/skip behavior are the canonical use. See [methodology.md](methodology.md). +- **Reference independence as a first-class concern.** Both the browsers and the strategy report treat "the reference must use a disjoint code path" as load-bearing, not incidental. Buiy elevating it to an Open Question (who reviews it; can it be lint-enforced) matches the browsers' hard-won caution. See [open-problems.md](open-problems.md). + +## Avoid + +| Pitfall | Source | Buiy mitigation | +|---|---|---| +| **Writing a reference with the feature under test.** A flex reference using flex, an `@container` reference using `@container`: a shared bug renders both identically wrong and the test passes vacuously — the symmetric twin of the golden weakness. | [open-problems.md](open-problems.md), CSSWG wiki | Route references through the **primitive/absolute layer** or a second independent style mechanism. Where one disjoint reference is impossible, support **multiple references** (≥1 `==` must match) so two techniques must agree. Lint-enforce where possible (Open Question #1). | +| **A `fuzzy` range that includes 0 when a difference is expected.** It silently swallows the signal that the bug was fixed (no unexpected-pass report → the stale budget never gets retired) and re-admits a regression that lands back in the window. | [fuzzy-matching.md](fuzzy-matching.md) | Pin **both ends** (`fuzzy(1-1,8-8)`, never `0-…`) for deterministic cases; widen to `lo-hi` only as far as measured run-to-run variance demands. 
Accept the wart that intermittent tests must drop to 0-inclusive — and treat that as pressure to engineer determinism at the source. | +| **Reaching for goldens before the reftest is exhausted.** Goldens cost a stored corpus, per-config baselines, flake budget, and human triage; reftests need none of that. | [consumers.md](consumers.md), Blink rule | Demote goldens to the **irreducible residue** (no feature-free reference exists): shadow falloff, glyph fidelity, color-emoji, blend math, gamma. Everything relational stays in Tier-4. | +| **Trying to reftest the unreftestable.** Underline position/thickness, dotted/dashed/ridge/groove/double borders, focus-ring geometry, font-metric-dependent rendering — no feature-free reference can reproduce them. | [open-problems.md](open-problems.md), CSSWG wiki, WPT #7676 | Route font-metric/UA-defined effects to **Tiers 1–3** (shaping snapshots, structured display-list snapshots, property invariants — assertions on structured data, not pixels) and the genuine rasterization residue to **Tier-5**. The pyramid is the answer; do not force a reftest. | +| **Cross-backend reftest pairings.** A test on Vulkan vs a reference on Metal reintroduces exactly the variance reftests cancel. | [open-problems.md](open-problems.md) | Keep both captures on the **same `wgpu` backend in the same process**. Get cross-platform confidence by running the whole suite on each *pinned* backend independently, not by cross-backend `==`. Verify the per-CI-lane backend assumption before claiming exact-match CI. | +| **Capturing before the scene has settled.** WPT waits for `reftest-wait` to clear (load + fonts + pending paints) before the screenshot; capturing early diffs a half-rendered frame. | [wpt.md](wpt.md) | Gate the texture readback on Buiy's deterministic settle condition — 0 pending assets, glyph atlas warmed, clock advanced (the `GoldenConfig::deterministic()` triad already built) — the analogue of `reftest-wait`. 
| +| **Assuming chained/multiple references are universally available.** Blink supports *neither* multiple nor chained references; only Gecko/WPT do. | [consumers.md](consumers.md) | If Buiy adopts multiple references as the independence mitigation, build that capability into the `reftest!` harness deliberately — it is not free, and one major engine omits it. | + +## Borrow + +Concrete reftest primitives worth studying before building the Buiy analogue: + +1. **The `==` / `!=` operator pair and its boolean aggregation.** `==`/`rel=match` (pass if SAME), `!=`/`rel=mismatch` (pass if DIFFERENT); with multiple refs, ≥1 `==` must match (OR), *all* `!=` must mismatch (AND). Buiy's `reftest!(match/mismatch, …)` is the typed analogue; the aggregation rules are the spec for multi-reference independence. See [methodology.md](methodology.md), [wpt.md](wpt.md). + +2. **The two-axis fuzzy budget, exact.** `fuzzy(minDiff-maxDiff, minPixelCount-maxPixelCount)` (Gecko) / `<meta name="fuzzy" content="maxDifference=…;totalPixels=…">` (WPT), ranges inclusive, **per-reference**. Copy the metric and the per-pairing scoping onto the `RefCase`; for `!=` the minimum bounds must be 0. This is the same metric the strategy report's unified perceptual gate targets. See [fuzzy-matching.md](fuzzy-matching.md). + +3. **The pin-both-ends calibration discipline.** Measure the *actual* `(max difference, different pixels)` from a real failing run (the harness prints it), set the range tight (`n-n` when deterministic), never include 0 when a difference is expected. The payoff: a fixed bug surfaces as an *unexpected pass* that retires the budget, restoring exact-match coverage. Tooling like Gankra's `live-reftest-analyzer` makes the triage visual — Buiy's failing-run diff PNG is the analogue. See [fuzzy-matching.md](fuzzy-matching.md). + +4. **The `reftest-wait` settle handshake.** Capture only after load + font loading + pending paints, signalled by removing `class="reftest-wait"`. 
Buiy's analogue is an asserted "scene settled" gate (0 pending assets, atlas warmed, clock at an explicit virtual timestamp) before texture readback — and it doubles as the animation-snapshot mechanism (capture at stepped clock times). See [wpt.md](wpt.md). + +5. **The out-of-band per-engine expectation model (only if importing external corpora).** Servo stores pass/fail expectations as `.ini` metadata in a `meta` folder, treating the shared corpus as an external fixture it never edits. Buiy authors its own reftests in Rust (expectation co-located with the `#[test]`), so it needs this *only* if it imports an external corpus — e.g. Taffy's WPT-derived layout fixtures. Then store expectations out-of-band, do not mutate imported fixtures. See [consumers.md](consumers.md). + +6. **Authoring patterns mapped to Buiy's CSS-subset surface** (the test/reference pairs to write): + - *Flex/grid → literal offsets.* Test: `justify_content: SpaceBetween` row of three 40px boxes in a 200px container. Reference: three boxes at absolute x = 0, 80, 160 via the primitive layer. `==` proves the Taffy integration *and* box-generation math with a reference that never touches the flex solver. + - *`@container` → hand-authored equivalent.* Test: a widget whose style resolves via a container query at a given container size. Reference: the same tree with the *resolved* branch inlined as a plain style, no `@container` rule. `==` proves the query engine selected and applied the right rule. + - *`content-visibility: hidden` → mismatch.* Test: a subtree with `content-visibility: hidden`. Reference: the identical tree visible. Assert `!=` — the hidden subtree must *not* paint. + - *Logical → physical mirror.* Logical-property layout `==` its physical-property mirror, proving writing-mode/direction resolution. + - *Transform → translated coordinates.* `translate(50,50)` `==` an element authored at the translated coordinates. + +7. 
**The CPU-vs-GPU cross-check as a Tier-4.5 oracle** (Buiy-specific, Vello-pattern). Where no feature-free reference exists for a *rasterization* property (SDF corner AA), Buiy's existing CPU SDF port is an independent rasterization oracle: render the same primitive on GPU and CPU in one run, diff with the two-axis metric. Stores zero bytes, needs no second authoring path (the CPU port *is* the independent implementation), and catches AA bugs no markup-style reftest can. Build it before broad goldens. See [open-problems.md](open-problems.md). + +## How to use this file + +- **Buiy reftest-harness author:** read the "Top of file" finding and Borrow items 1–4 and 7, then the Avoid rows on reference-with-feature-under-test, 0-inclusive fuzz, cross-backend pairing, and capture-before-settle. Each maps to a concrete `reftest!` / `RefCase` / capture-gate decision. The two open questions the evidence forces the plan to close: (a) **who reviews / how to lint reference independence** (Open Question #1) and (b) **whether Buiy pins both ends of every fuzz budget** (Open Question #2). +- **Anyone defending the reftests-first ordering:** the Validates list is the set of Buiy choices the browsers confirm — cite Blink's "only write a pixel test if you cannot use a reference test" and WPT's zero-baseline scale. +- **Anyone scoping a feature for visual testing:** the Avoid row "trying to reftest the unreftestable" + [open-problems.md](open-problems.md) tell you when an effect *cannot* be a reftest and which tier owns it instead. +- The other files are evidence; do not re-derive their detail here — follow the cross-links. 
+ +## Sources + +- Sibling files: [README.md](README.md), [methodology.md](methodology.md), [gecko-reftests.md](gecko-reftests.md), [fuzzy-matching.md](fuzzy-matching.md), [wpt.md](wpt.md), [consumers.md](consumers.md), [open-problems.md](open-problems.md), [glossary.md](glossary.md) +- CSSWG wiki, test/reftest (reference-must-differ; vacuous-pass; impossible-to-reftest) — https://wiki.csswg.org/test/reftest +- Firefox Source Docs, Reftest (operators, fuzzy, pin-both-ends) — https://firefox-source-docs.mozilla.org/layout/Reftest.html +- web-platform-tests, writing reftests (rel=match/mismatch, fuzzy, reftest-wait, underline wart) — https://web-platform-tests.org/writing-tests/reftests.html +- Blink — Writing Web Tests ("only write a pixel test if you cannot use a reference test") — https://chromium.googlesource.com/chromium/src/+/main/docs/testing/writing_web_tests.md +- Servo Book — Testing (out-of-band `.ini` expectations) — https://book.servo.org/contributing/testing.html +- Sibling prior art: [../blink/lessons.md](../blink/lessons.md), [../servo-stylo/](../servo-stylo/), [../taffy/lessons.md](../taffy/lessons.md), [../xilem-masonry/masonry-toolkit.md](../xilem-masonry/masonry-toolkit.md) +- Buiy strategy report (Tier-4 reftests, Open Questions #1/#2, CPU-vs-GPU cross-check) — [../../reports/2026-06-14-visual-bug-detection-strategy.md](../../reports/2026-06-14-visual-bug-detection-strategy.md) diff --git a/docs/prior-art/wpt-reftests/methodology.md b/docs/prior-art/wpt-reftests/methodology.md new file mode 100644 index 0000000..ea18624 --- /dev/null +++ b/docs/prior-art/wpt-reftests/methodology.md @@ -0,0 +1,50 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** Reftests — the core idea: assert a relationship between two same-engine renderings, not a stored baseline + +# The reftest methodology + +A reftest asserts a *relationship between two renderings produced by the same engine*, not a match against a stored gold image. 
The harness renders two input files — a **testcase** and a **reference** — captures the resulting bitmaps, and compares them pixel-for-pixel. There are two operators: a *match* assertion that passes if the two renderings are the SAME, and a *mismatch* assertion that passes if they are DIFFERENT. + +## Why relationship, not baseline + +The canonical motivation (MDN, "Creating reftest-based unit tests"): traditional automated testing "compares output against an invariant 'gold standard,'" but a standards-compliant engine may *legitimately* change a rendering — MDN's example is changing the indentation depth of a `blockquote` — so an invariant baseline produces **false failures and an untrustworthy harness**. Reftests sidestep this. The often-quoted statement of the principle (MDN): "The power of the tool comes from the fact that there is more than one way to achieve any given visual effect in a browser." + +The testcase typically exercises the feature under test with complex markup; the reference "uses a different method to produce the same rendering" (CSSWG wiki). Because both files are rendered by the engine under test *in the same run*, anything the engine does consistently — font rendering, antialiasing, subpixel layout, GPU/driver rounding — cancels out of the comparison. You are testing an *internal engine invariant* ("these two equivalent inputs must agree"), not a frozen artifact. + +> *Verification flag:* the "blockquote indentation" example and the exact "gold standard" phrasing are quoted from the MDN / devdoc mirror, not re-confirmed verbatim against current firefox-source-docs prose (the page exists but rewording is possible). + +## The decisive property: the reference reaches the same pixels by a *different route* + +The reference is "a different, usually simpler, file that results in the same rendering as the test. **The reference file must not use the same features that are being tested**" (CSSWG wiki). 
Gecko states the reference should be created "using a different mechanism than the test." This is the whole game: a reftest validates *what the engine should produce*, not *that two runs of the same buggy code agree*. (The corollary failure mode — a reference that shares the test's buggy code path and so renders identically wrong — is the reference-independence wart; see [open-problems.md](open-problems.md).) + +## The two operators + +| Operator | Gecko | WPT | Passes when | +|---|---|---|---| +| match | `==` | `<link rel="match">` | the two renderings are **identical** (within fuzz, if any) | +| mismatch | `!=` | `<link rel="mismatch">` | the two renderings are **not identical** | + +**`==` (match)** is the workhorse for conformance: render the feature one way, render the same intended pixels a second way that does not use the feature, assert equality. + +**`!=` (mismatch)** is the workhorse for catching *regressions to nothing* — asserting that a feature actually produces a visible effect, so a no-op implementation fails. The canonical use: prove suppression. A subtree with `content-visibility: hidden` must `!=` the same subtree rendered visible; if the implementation forgets to suppress paint, the two render identically and the `!=` fails. An exact-match assertion would be vacuous here — a blank-vs-blank bug passes a `==`, so you assert *difference* instead. + +## Exact match by default + +Comparison is **exact pixel match by default** — every pixel must be identical unless a fuzzy annotation relaxes it (see [fuzzy-matching.md](fuzzy-matching.md)). The capture happens in a fixed viewport — 800×1000 in Gecko (content outside is ignored), 800×600 including scrollbars in WPT — so anything outside that area does not participate in the comparison. + +## In-process: both renderings, same engine, same run + +The non-negotiable mechanic is that both the testcase and the reference are rendered by the *same engine build on the same machine in the same session*. 
That is precisely why GPU/driver/font-rasterizer variance is identical on both sides and cancels in the diff. Two different engines with different antialiasing both still pass the same reftest — which is what lets one CSS reftest, authored once, become a conformance assertion every engine checks against *itself*. + +## Implications for Buiy + +Nothing about reftests is HTML-specific; the methodology is renderer-agnostic. Buiy already renders to an offscreen `wgpu` texture, so it gets the same-engine-cancels-variance guarantee *for free* by rendering both `test_scene` and `ref_scene` to texture in one process against the same `wgpu::Device`: driver-dependent SDF rounding, glyph-atlas antialiasing, and subpixel coverage appear *identically* in both images, so `==` can often be **exact** rather than fuzzy — a stronger guarantee than golden screenshots, which must survive driver upgrades. The Buiy analogue of `<link rel="match">` / `== test ref` is a `reftest!(match, "name", test_scene, ref_scene)` macro: the manifest is ordinary Rust, not a `reftest.list` text file. The full authoring patterns and the load-bearing reference-independence discipline live in [lessons.md](lessons.md). 
+ +## Sources + +- MDN, "Creating reftest-based unit tests" — https://developer.mozilla.org/en-US/docs/Mozilla/QA/Reftest (mirror: https://devdoc.net/web/developer.mozilla.org/en-US/docs/Creating_reftest-based_unit_tests.html) +- CSSWG wiki, test/reftest — https://wiki.csswg.org/test/reftest +- Firefox Source Docs, Reftest — https://firefox-source-docs.mozilla.org/layout/Reftest.html +- web-platform-tests, writing reftests — https://web-platform-tests.org/writing-tests/reftests.html +- Sibling files: [gecko-reftests.md](gecko-reftests.md), [wpt.md](wpt.md), [fuzzy-matching.md](fuzzy-matching.md), [open-problems.md](open-problems.md), [lessons.md](lessons.md) diff --git a/docs/prior-art/wpt-reftests/open-problems.md b/docs/prior-art/wpt-reftests/open-problems.md new file mode 100644 index 0000000..2b07239 --- /dev/null +++ b/docs/prior-art/wpt-reftests/open-problems.md @@ -0,0 +1,56 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** What reftests structurally cannot do — the "impossible to reftest" category, the vacuous-pass / reference-independence failure mode, fuzzy masking, and cross-backend variance + +# Open problems + +What the reftest methodology structurally does *not* solve. These are not bugs in the harnesses — they are limits intrinsic to comparing two renderings of the same engine, and they define exactly where Buiy's Tier-4 reftest tier ends and Tier-5 goldens begin. + +## The irreducible limit: effects with no feature-free reference + +A reftest requires a reference file that "use[s] a different method to produce the same rendering as the test file." Where **no such alternative exists**, reftests are impossible and goldens are mandatory. The CSS-WG enumerates these verbatim: + +- "there is no way to create a reference for underlining, since the position and thickness of the underline depends on the UA, the font, and/or the platform." 
+- "The following border-styles are impossible to reftest: dotted, dashed, ridge, groove, inset, outset, double. Only solid, none, hidden (and sometimes inherit) are reftestable." + +WPT issue #7676 (labeled `type:untestable`) reinforces this from the spec side — gsnedders: "It's impossible to write any automated test for these values of border-style as the specification doesn't define their behaviour sufficiently (i.e., all values except none, hidden, and solid)." + +The category is broader than borders: **any effect whose pixels depend on font metrics, a UA-chosen line position/thickness, or unspecified dash/dot geometry** has no feature-free twin — underline-position/thickness, `text-decoration-style` dotted/dashed/wavy/double, dashed/dotted/ridge/groove/double/inset/outset borders, focus-ring rendering. **The boundary is structural, not a gap in effort.** + +### Implications for Buiy + +These are precisely the visual effects Buiy's reftest tier cannot cover and must hand to Tier-5 goldens. For Buiy specifically, the analogous irreducible residue is: the **drop-shadow Gaussian falloff, glyph rasterization fidelity (hinting/subpixel), color-emoji compositing (CBDT/COLR/bitmap), blend-mode math, and gamma/sRGB encode** — all of which render on `origin/main` today. A reftest can confirm a shadow is *translation-invariant* or *symmetric* (and Tier-3 property tests should), but not that the falloff is *correct*. The one effect Buiy can pull *back* from goldens is SDF corner AA — not via a feature-free reference (none exists) but via the **CPU-vs-GPU cross-check** (Buiy's CPU SDF port as an independent rasterization oracle), a Tier-4.5 the strategy report places between reftests and goldens. + +## The vacuous-pass failure mode: reference independence + +The load-bearing wart. If the reference exercises the *same code path* as the test, a shared bug makes both render identically wrong and the test **passes vacuously**. 
The CSS-WG states it: a reference may "itself fail in such a manner as to cause the reference to render identically to a failed test." This is the exact symmetry of the golden-test weakness ("a golden can't catch a bug present when the golden was captured") — a reftest is equally blind when test and reference share the buggy path. + +**Mitigations the browsers use:** +- The reference must use a *different technique* than the test ("The reference file must not use the same features that are being tested"). +- Where one disjoint reference is impossible, use **multiple reference files**, each using a different technique, so two independent techniques must agree (WPT: "if there are any match references, at least one must match"). +- Keep references human-legible and self-describing so a reviewer can confirm test and reference are not *both* wrong. + +### Implications for Buiy + +Concretely: do **not** write a flex reference using flex; do not write an `@container` reference using `@container`. Route references through Buiy's **primitive/absolute layer** (literal-positioned boxes that bypass Taffy and the CSS-subset entirely) or a second, independent style mechanism. This is Open Question #1 in the strategy report — *who* reviews reference independence, and *can it be lint-enforced* (e.g. "a reference scene must not contain a `@container` rule")? Without that discipline, reftests silently lose their teeth. + +## Fuzzy matching masks real regressions + +Exact equality is unattainable in some cases even within one run (gradient dithering, transform-rotated edges), so fuzzy tolerance exists — but **over-broad fuzz ranges silently mask real regressions**, and intermittently-failing tests are forced into a `0`-inclusive form that *loses* the regression-catching property (a fixed bug, or a further regression landing back in the window, goes unreported). 
The discipline (pin both ends, never include 0 when a difference is expected) mitigates but does not eliminate this; non-deterministic tests are the acknowledged casualty. Full treatment in [fuzzy-matching.md](fuzzy-matching.md). + +## Cross-backend variance reintroduces the noise reftests cancel + +The "one run cancels variance" benefit assumes a **fixed backend per CI lane**. Both halves of a reftest cancel GPU/driver/font variance only because they share it. A reftest whose test renders on Vulkan and reference on Metal would reintroduce exactly the variance reftests are designed to cancel, forcing fuzz back in. For Buiy this means: keep a reftest's two captures on the *same* `wgpu` backend in the *same* process; do not attempt cross-backend `==`. Cross-platform confidence comes from running the *whole* reftest suite on each pinned backend independently, not from cross-backend pairings. + +## Unreftestable renderings belong in lower tiers, not just goldens + +Some renderings are simply unreftestable *and* poor golden candidates (font-metric-dependent, scrollbar-dependent geometry). For Buiy these belong in **Tiers 1–3** (layout-number snapshots, structured display-list snapshots, metamorphic/property invariants) where the assertion is on *structured data* the engine produced, not on pixels — e.g. a glyph's `(line, glyph_id, x, y)` shaping snapshot, not a screenshot of the rendered text. The pyramid is the answer: each untestable-by-reftest effect routes to whichever tier *can* observe it deterministically. 
+ +## Sources + +- CSSWG wiki, test/reftest (underline + border-style "impossible to reftest"; vacuous-pass) — https://wiki.csswg.org/test/reftest +- WPT issue #7676 (`type:untestable`, border-style) — https://github.com/w3c/web-platform-tests/issues/7676 +- web-platform-tests, writing reftests (underline wart; multiple references) — https://web-platform-tests.org/writing-tests/reftests.html +- Firefox Source Docs, Reftest (fuzzy masking) — https://firefox-source-docs.mozilla.org/layout/Reftest.html +- Sibling files: [methodology.md](methodology.md), [fuzzy-matching.md](fuzzy-matching.md), [lessons.md](lessons.md), [consumers.md](consumers.md) +- Buiy strategy report (Open Question #1, Tier-4/Tier-5 boundary, CPU-vs-GPU cross-check) — [../../reports/2026-06-14-visual-bug-detection-strategy.md](../../reports/2026-06-14-visual-bug-detection-strategy.md) diff --git a/docs/prior-art/wpt-reftests/wpt.md b/docs/prior-art/wpt-reftests/wpt.md new file mode 100644 index 0000000..2a215c4 --- /dev/null +++ b/docs/prior-art/wpt-reftests/wpt.md @@ -0,0 +1,62 @@ +**Date:** 2026-06-14 +**Status:** active +**Subject:** web-platform-tests — the cross-vendor reference-comparison corpus, `rel=match`/`rel=mismatch`, the `wpt` runner, and the two-way vendor sync + +# web-platform-tests (WPT) + +web-platform-tests is "a cross-browser test suite for the Web-platform stack" whose stated purpose is to let browsers "ship software that is compatible with other implementations" (WPT repo). It is a *methodology + shared corpus*, not a product: a single Git repo of tests that Chromium, Gecko, WebKit, and Servo all run against their own engines. Reftests are the rendering-oriented subset and the direct ancestor of Buiy's reftests-first tier. + +## The reftest mechanism: `rel=match` / `rel=mismatch` + +A reftest is a *pair* of pages — a **test** (uses the feature under test) and a **reference** (renders the same visual result by simpler/already-trusted means). 
The test links to its reference via a `link` element: + +- `<link rel="match" href="…">` — passes only if test and reference render **pixel-for-pixel identically** within an "800×600 window *including* scroll-bars if present." +- `<link rel="mismatch" href="…">` — passes only if they do **not** render identically (used to prove a property actually *changes* rendering). + +**Multiple references give boolean logic:** "If there are any match references, at least one must match" (OR / alternates); "If there are any mismatch references, all must mismatch" (AND). References can be chained and shared across many tests — sharing is "strongly encouraged" because it "makes it easier to tell at a glance whether a test passes and enables some optimizations in automated test runners." + +**Reference discovery is explicit, not filename-based.** WPT links a test to its reference only through the in-markup `<link rel=match>` / `rel=mismatch` element (and the generated `MANIFEST.json` index built from it) — verified against the writing-reftests doc, which describes no automatic `*-ref.html` / `-expected` filename convention. Gecko is the same: each `reftest.list` line names both URLs explicitly. The filename convention is a *Blink* divergence — Chromium pairs `foo.html` with `foo-expected.html` (or `foo-expected-mismatch.html`) by name — covered in [consumers.md](consumers.md). A Buiy `reftest!` naming scheme can pick either model deliberately; the explicit-pairing model is what WPT/Gecko use. + +## The key property: no stored golden images + +A reftest stores *no* baseline screenshot. The oracle is the reference page, rendered live by the same engine on the same machine at test time. This cancels out platform font rendering, antialiasing, GPU, and DPI differences — the exact fragility that plagues golden-screenshot testing — because both halves of the comparison share those conditions. Two engines with different antialiasing both still pass the same reftest. (The shared mechanic with Gecko is the same one described in [methodology.md](methodology.md).) 
+ +## Timing and fuzzing controls + +- Screenshots are taken after load + font loading + pending paints. A test marks itself async with `class="reftest-wait"` on the root element; the harness fires a `TestRendered` event and waits for the class to be *removed* before capturing — the explicit "the scene is settled, capture now" handshake. +- Bounded tolerance via `<meta name="fuzzy" content="…">` — a per-channel color delta (0–255) and a count of differing pixels, both range-expressible (`content="10-15;200-300"`) and per-reference. Full semantics in [fuzzy-matching.md](fuzzy-matching.md). + +**Documented wart (verbatim):** "There is no way to create a reference for underlining, since the position and thickness of the underline depends on the UA, the font, and/or the platform" — i.e. reftests cannot be written wherever a feature's rendering is *intentionally* UA-defined and so cannot be reproduced by independent markup. See [open-problems.md](open-problems.md) for the full untestable category. + +## Runner, manifest, and scale + +The `wpt` CLI drives everything (WPT repo): `wpt serve` (HTTP server), `wpt run` (execute in a browser), `wpt lint`, and `wpt manifest` (generates `MANIFEST.json`, the index that classifies each file as testharness/reftest/wdspec and records reftest match/mismatch relationships so the runner knows what to compare). Invocation is `./wpt run [browsername] [tests]`, e.g. `./wpt run chrome dom/historical.html` or `./wpt run --binary ~/local/firefox/firefox firefox ...`; `--test-type=reftest` filters to reftests. Chrome, Edge, and Servo are supported by default per Servo's integration notes. + +**Scale (verify-flagged — counts drift; cite with date):** the corpus is "over 52000 tests and nearly two million subtests" per Servo's 2023-07-20 post. A later wpt.fyi figure cites "56,552 tests … 1.8 million subtests (as of December 2024)" — *secondary, from search aggregation; could not confirm against a primary wpt.fyi page; treat as approximate*. 
Either way: thousands of CSS reftests, **zero stored golden screenshots**. + +## How cross-vendor sharing actually works (two-way sync) + +The corpus is shared by *bidirectional automation*, not manual copying: + +- **Chromium:** maintains "a 2-way import/export process with the upstream web-platform-tests repository, where tests are imported into `web_tests/external/wpt`" (full path `third_party/blink/web_tests/external/wpt`). A LUCI "wpt-importer builder" auto-imports to "track tip-of-tree … as closely as possible"; editing files under `external/wpt` makes the exporter "create a provisional pull request … in the upstream WPT GitHub repository" that auto-merges. Rationale (verbatim): "leveraging and contributing to a shared test suite is one of the most important tools in achieving interoperability." +- **Gecko:** Mozilla's `wpt-sync` service does "Synchronize changes between gecko and web-platform-tests" — two-way sync between the Gecko monorepo and upstream WPT. +- **Servo:** "all changes to Servo's in-tree Web Platform Tests will be upstreamed automatically when your PR is merged"; run `./mach update-manifest` after editing a test/reference. +- **WebKit:** participates via its own `LayoutTests/imported/w3c/web-platform-tests` import path. *Not confirmed against a primary WebKit doc in this pass — flagged.* + +**Net effect:** one CSS reftest, authored once, becomes a conformance assertion every engine checks against itself — the engines disagree on antialiasing yet agree on the reftest. That is precisely the leverage Buiy's reftests-first tier buys: a rendering oracle that survives platform noise without a golden-image baseline to maintain. + +> *Verification flag:* `rel=match`/`rel=mismatch` syntax, fuzzy/`reftest-wait` semantics, the underline wart, and all sync-mechanism quotes are from primary sources (cited below). 
Test/subtest *counts* are time-stamped estimates that drift; the Dec-2024 figure is secondary and unconfirmed; WebKit's exact sync path is unconfirmed. + +## Implications for Buiy + +WPT is the proof at scale that a relational rendering oracle needs **zero stored screenshots** — the v1 case for Buiy's reftests-first bet. Two mechanics port directly: (1) the **multiple-reference boolean logic** (≥1 `==` must match; *all* `!=` must mismatch) is the right shape for Buiy supporting several independent references where one disjoint reference cannot fully isolate a feature; (2) the **`reftest-wait` settle handshake** maps onto Buiy's deterministic "scene settled — 0 pending assets, atlas warmed, clock advanced" capture gate before texture readback. The corpus-sharing/sync machinery is *not* directly relevant (Buiy authors its own reftests in Rust), but the consumer-side model — a parallel engine treating the corpus as an external fixture with its own pass-state — is, and is covered in [consumers.md](consumers.md). 
+ +## Sources + +- web-platform-tests, writing reftests — https://web-platform-tests.org/writing-tests/reftests.html +- web-platform-tests repo — https://github.com/web-platform-tests/wpt +- WPT running tests from a local system — https://web-platform-tests.org/running-tests/from-local-system.html +- Chromium web_platform_tests docs (2-way import/export) — https://chromium.googlesource.com/chromium/src/+/main/docs/testing/web_platform_tests.md +- mozilla/wpt-sync — https://github.com/mozilla/wpt-sync +- Servo WPT blog (2023-07-20; corpus scale) — https://servo.org/blog/2023/07/20/servo-web-platform-tests/ +- Sibling files: [methodology.md](methodology.md), [fuzzy-matching.md](fuzzy-matching.md), [consumers.md](consumers.md), [open-problems.md](open-problems.md), [lessons.md](lessons.md) diff --git a/docs/reports/2026-06-14-visual-bug-detection-strategy.md b/docs/reports/2026-06-14-visual-bug-detection-strategy.md new file mode 100644 index 0000000..534a8e5 --- /dev/null +++ b/docs/reports/2026-06-14-visual-bug-detection-strategy.md @@ -0,0 +1,272 @@ +# Visual-bug detection strategy for Buiy as it scales + +**Date:** 2026-06-14 +**Verdict:** Architecture committed (five-tier pyramid, reftests-first); recommendations not yet realized in code. + +Research input for the not-yet-written `buiy-verification-design` spec. Synthesizes three internal audits (render display-list intermediates, `buiy_verify` + golden-harness gaps, prior-art visual-test corpus) and six external research streams (golden/screenshot regression, reftests, perceptual metrics, deterministic GPU, structured snapshots, triage/coverage tooling), then re-grounds them against canonical `origin/main`. Audience: Buiy maintainers and the author of `buiy-verification-design`. It commits to an architecture (a five-tier pyramid) and a build order; it does **not** itself decide tolerances, storage layout, or the perceptual-metric pick — those are the spec's job and are flagged as open questions. 
The recommendations here are not yet realized in code. + +## Problem + +A "visual bug" is any defect a user could see: a misplaced box, a tooltip behind a modal, a wrong-color badge, a missing focus ring, a blurred glyph, a BiDi-reordered caret on the wrong side of a run, an emoji that fell back to tofu, an antialiasing seam on a rounded corner, a drop-shadow whose falloff is wrong. Buiy's exposure to these grows multiplicatively, not additively, as the library matures, and the naive instinct — "write more pixel screenshot tests" — gets *worse* with scale, not better. The surface is not hypothetical: on canonical `origin/main` the render pipeline and text subsystem are fully landed (effect compositor, drop-shadow, glyph atlas, clipping, top-layer, forced-colors, multi-script shaping), so the combinatorial visual surface this report addresses is **present-tense, not future**. + +**The combinatorial surface.** Gate #2 already names the axes: golden per **widget × state × theme × viewport** (`docs/specs/2026-05-07-buiy-foundation/verification.md`). Add **platform** (wgpu dispatches to Vulkan / Metal / DX12 / GL, each with its own antialiasing and float behavior), **device-pixel-ratio** (a 1× and a 2× render of the same scene rasterize differently and Playwright explicitly warns output varies by DPI/"power source" — https://playwright.dev/docs/test-snapshots), and the cell count is a product, not a sum. Twenty widgets × five states × three themes × four viewports × four backends × two DPRs is ~9,600 images before counting forced-colors, RTL, or animation frames. A test methodology whose cost scales with that product cannot keep up with feature work. + +**GPU non-determinism.** The pixel is the *least* deterministic artifact Buiy produces. Floating-point non-associativity, FMA contraction, fast-math, and denormal flushing all vary across drivers and architectures (research:deterministic-gpu). 
Buiy's SDF rounded-rects are antialiased analytically in-shader; the smoothstep edge depends on a GPU derivative (`fwidth`), and sRGB encode happens on GPU write to an `Rgba8UnormSrgb` target. Each is deterministic *given identical hardware and float semantics* and a source of flake otherwise. The golden module's own phrasing names it: "sub-LSB float jitter in the SDF and linear→sRGB encode is invisible but not bit-stable" (`crates/buiy_core/src/render/golden.rs`). + +**Layout/paint coupling.** Most visible bugs are not born in the rasterizer. They are born upstream — a wrong layout box, a mis-sorted paint order, a stacking context that failed to form, a transform composed `S·R·T` instead of `T·R·S` (Buiy's order is `T·R·S·M`, `compose_transform` at `crates/buiy_core/src/layout/systems.rs:3775`), a color token that resolved to the magenta missing-token sentinel (`MISSING_TOKEN_FALLBACK`). By the time those reach pixels, the structured information that *names* the bug is gone; a pixel diff reports "4,000 px changed," not "z-index sort is unstable." + +**Text is the highest-combinatorial *and* highest-flake surface — and it is already here.** The text subsystem is landed (T1–T9) plus text-editing E1: shaping, glyph atlas, decoration, BiDi, color-emoji, effect-group compositing. Text is simultaneously the *highest-combinatorial* visual surface (script × font × fallback chain × subpixel position × hinting × weight) and the *highest-flake* one (glyph rasterization is the canonical per-platform golden problem — Flutter's entire `matchesGoldenFile` ecosystem exists to fight it, research:golden-screenshot). A strategy that omits glyph hinting/subpixel positioning, BiDi reordering, color-emoji (CBDT/COLR/bitmap) compositing, and font-fallback would be materially incomplete. It gets its own treatment in §3 and §4 rather than a one-line mention in a residue list. 
+ +**Why unit + layout-number tests alone won't cover it.** Buiy already has CPU unit tests for the deterministic transforms (`crates/buiy_core/tests/render_instance.rs`, including the half-size sign-bug regression). Those catch per-function arithmetic errors. Layout-number snapshots (gate #5) catch geometry errors. But neither tier observes **composition**: whether correctly-sized, correctly-positioned boxes are painted in the correct order, with the correct resolved colors, into the correct draw set, and whether the rasterizer then turns that into the correct light. A button can have a pixel-perfect layout box and still render invisibly (wrong z-order), in the wrong color (token bug), or with a broken corner (SDF bug). Those multiply with the feature surface — and the cheapest two tiers miss them. + +The strategy below resolves this by pushing detection *down* a pyramid: catch composition bugs in deterministic structured snapshots and relational reftests, so the expensive, flaky pixel tier shrinks to the irreducible residue that genuinely needs a rasterizer. + +## Current state + +*This report is grounded on canonical `origin/main`, where the render pipeline and text subsystem are landed (a stale local ancestor that predates both is what earlier framing mistook for "render Phase 0").* + +**What already exists — more than a diff primitive.** The verification surface is no longer empty: + +- **Golden infra is merged** in `crates/buiy_core/src/render/golden.rs`. `GoldenConfig { fixed_clock, wait_for_fonts, warm_atlas, accept }` plus `GoldenConfig::deterministic()` (`golden.rs:18`–`:88`) is the spec's flake-mitigation triad, *built* — fixed clock, font-load sync, atlas warmup pinned together. `fonts_ready()` (`golden.rs:82`) is the implemented font-sync predicate (warmup queue drained AND every fixture glyph resident, probed via the no-LRU-touch atlas getter). 
+- **Two naive diff metrics exist, in two crates.** `perceptual_diff(a, b) -> f32` (`golden.rs`) is a normalized **mean ABSOLUTE per-channel difference (L1 / Manhattan)**, `sum/(len·255)`. Separately, `compare_images` in `crates/buiy_verify/src/visual.rs:18` is a global **RMSE** (per-channel squared error, summed, normalized by `pixels·4·255²`, square-rooted). Both are naive: no AA-exclusion, no two-axis fuzzy budget, no perceptual/luma weighting. Mismatched dimensions short-circuit to `1.0`. +- **GPU capture→readback goldens exist** as `#[ignore]` tests: `render_golden_harness.rs` (overlapping semitransparent fills), `text_decoration_gpu.rs`, `text_golden_suite_gpu.rs` (widget × state × theme × viewport text), `text_gpu.rs`, `text_effect_group_gpu.rs`. They use a **"re-capture IS the golden"** discipline: capture twice in fresh apps, assert `perceptual_diff < ~1e-4`. This proves determinism / flake-mitigation but does **not** persist a stored baseline across runs. +- **The forced-colors analyzer is built** (`crates/buiy_core/src/render/forced_colors_analyzer.rs`): `analyze_forced_colors()` (`:51`, check a — every token resolves to a real system color, not the magenta `MISSING_TOKEN_FALLBACK`) and `analyze_shadow_only()` (`:89`, check b — no shadow-only affordance) both return `Vec` as pure CPU assertions, tested in `render_forced_colors_analyzer.rs`. Gate #11's structured half is done in spirit. +- **Text shaping snapshots are checked in** (`crates/buiy_core/tests/text_shaping_snapshots.rs`): per-glyph `(line, glyph_id, font_seat, x, y)` pinned to `.snap` fixtures under `tests/fixtures/shaping/` for **6 scripts** (Latin, Arabic, Devanagari, CJK, emoji-ZWJ, mixed-BiDi), with a real `BUIY_ACCEPT_SHAPING=1` curated-update workflow. 
This is an **in-repo precedent for both the `--accept` model and structured (Tier-2-style) snapshots.** +- **Shipping non-visual gates:** a11y-tree snapshots (`crates/buiy_verify/src/a11y.rs`, gate #3) and a WCAG contrast linter (`contrast.rs`, gate #9). + +**The honest delta — what is genuinely missing.** Despite the above, the central gaps remain: + +| Gate / mechanism | Spec / target | Today | Delta | +|---|---|---|---| +| #2 visual — stored baseline | golden corpus + `--accept` FILE persistence + per-fixture tolerance | `accept` flag with **no machinery**; GPU goldens are `#[ignore]` re-capture checks, no stored PNG | no stored-baseline regression — the goldens prove determinism, not "matches a blessed image" | +| #2 metric | perceptual two-axis AA-aware diff | naive **L1** (`golden.rs`) **+** naive **RMSE** (`visual.rs`) | wrong metric *family*, in two places — unify + upgrade (§4) | +| #5 layout snapshots | resolved-layout JSON/RON per fixture | only numeric field `assert_eq!` in `tests/layout.rs` | no holistic snapshot tier | +| #11 forced-colors | analyzer over the **live** widget catalog + forced-colors visual residual | analyzer over **hand-built `CatalogPaint`** descriptors; no `BoxShadow` draw-skip | wire to live components (follow-ups.md:462–481); build the shadow-suppression visual half | +| Tier-2 display-list | holistic display-list / paint-order snapshots | low-density per-field `assert_eq!` in `render_buckets.rs`, `render_paint_order.rs`, `render_instance.rs`, `render_extract.rs`, `top_layer.rs` | upgrade hand-written asserts to `insta` snapshots — `insta` is **not yet a dependency** | +| #12 proptest | generators + invariants | `proptest` dep present, **zero** generators/invariants | build them (incl. BiDi caret round-trip) | +| reftests | relational `==`/`!=` | **none anywhere** | the single highest-leverage absence | +| determinism stack | software-rasterizer pin (lavapipe etc.) 
| flake-triad config built; no rasterizer pin wired | extend, don't start | + +`image = "0.25"` and `proptest = "1"` are already workspace deps (PNG I/O ready; generators absent). `image-compare`/SSIM and `insta` are absent. The deferral is documented in follow-ups.md:779–802 (stored-PNG machinery, owned by `buiy-verification-design`) and 462–481 (forced-colors live-catalog seam). This report is that design's input. + +**The maturity note to keep in mind.** Layout and render are both mature: stacking, top-layer (`Fullscreen < Tooltip < Popover < Modal`), content-visibility, container queries, transforms, the effect compositor, drop-shadow, the glyph atlas, and forced-colors are all live. So the Tier-5 rasterization residues this report names — shadow kernel, effect compositor, glyph/emoji fidelity — **render on `main` today**; what is missing for them is the *stored corpus + perceptual metric + curated set*, not the renderer. + +The structurally important fact: **Buiy's render handoff is already a serializable, CPU-side, deterministic display list** (`ExtractedNode`/`ExtractedNodes`, `InstanceBuckets`, `PackedInstance`), and the tests already assert on it — but field-by-field, not holistically. That is the central opportunity §3 builds on. + +## The layered strategy + +Treat visual-bug detection as a **pyramid**, lowest/cheapest/most-deterministic tier first, per CLAUDE.md's "test at the lowest tier that covers the behavior." 
The load-bearing claim — supported by every mature engine in the prior-art corpus (Chromium, Flutter, WebRender all run tens of thousands of *text-dump* and *reftest* cases and reserve pixels for the genuinely-rasterized exception) — is that **Tiers 1–4 catch the large majority of visual regressions cheaply and deterministically, leaving the flaky, expensive golden tier a minimal residue.** + +``` + ╱ Tier 5 ╲ golden/screenshot regression — true rasterization only + ╱──────────╲ (SDF AA, shadow kernel, glyph/emoji atlas, effects, forced-colors) + ╱ Tier 4 ╲ REFTESTS (+ CPU-vs-GPU cross-check) — render-A == / != render-B + ╱──────────────╲ (the CSS-subset layer; highest pixel-level leverage) + ╱ Tier 3 ╲ metamorphic + property invariants (proptest) — no oracle + ╱──────────────────╲ (paint-order totality, transform round-trips, BiDi caret, no-NaN) + ╱ Tier 2 ╲ STRUCTURED display-list / paint-order / token-set snapshots (insta) + ╱──────────────────────╲ (high-ROI middle tier — asserts exist, holistic snapshots don't) +╱ Tier 1 ╲ layout-number snapshots (insta) — gate #5 +────────────────────────── +``` + +### Tier 1 — layout-number snapshots (gate #5) + +**What it catches.** Geometry: an element 4px too wide, mispositioned, wrong intrinsic size, container-query math errors, Taffy-bridge bugs, writing-mode/logical-property resolution. The WPT/Taffy pattern — assert resolved geometry as structured data, not pixels. + +**Implementation hook.** `ResolvedLayout` (`crates/buiy_core/src/components.rs:25`) is the resolved box. Spawn a fixture scene, run the layout pipeline, collect `{entity_name → (position, size)}`, snapshot as JSON/RON via `insta`. `tests/layout.rs` already exercises these positions via numeric `assert_eq!`; the upgrade is to replace those per-field asserts with a holistic snapshot. Taffy's WPT-derived corpus (`docs/prior-art/taffy/lessons.md`) is reusable — import the same fixtures to test Buiy's Taffy bridge. + +**Cost.** Trivial. 
No GPU, no window, sub-millisecond, 100% deterministic (Taffy is deterministic given inputs). + +**Can't catch.** Anything past geometry — paint order, resolved colors, the draw set, rasterization. + +### Tier 2 — structured display-list / paint-order / token-set snapshots (the high-ROI middle tier) + +This is the rung where the asserts exist but the *holistic* snapshot does not. Tests already check the intermediates field-by-field (`render_buckets.rs`, `render_paint_order.rs`, `render_instance.rs`, `render_extract.rs`, `top_layer.rs`); the upgrade is to snapshot the whole intermediate via `insta` instead of hand-written per-field `assert_eq!`. It slots between #5 and #2 and is the single highest-leverage *snapshot* gate to add. + +**What it catches.** The composition bugs Tier 1 can't see and pixels name poorly: + +| Bug class | Example | Observable in | +|---|---|---| +| wrong paint **order** | tooltip behind modal; negative-z not first; unstable sort | `ExtractedNodes.nodes` order / `StackingContext` | +| wrong paint **set** | content-visibility cull wrong; missing-token magenta sentinel; extra/missing draw | `ExtractedNodes.nodes` / `ComputedPaintSkip` | +| wrong paint **params** | color token resolved wrong; radius miscomputed; transform composed wrong | `PackedInstance` / `InstanceBuckets` | +| stacking-context **formation** | `isolation`/`z-index`/containment fails to form a context; subtree leaks | `StackingContext` (`components.rs:82`) | +| **forced-colors token flow** (gate #11a/b) | a draw's color is outside the system-token set; a focusable's only cue is a shadow | already a structured assert — `analyze_forced_colors`/`analyze_shadow_only` | + +In a CSS-subset engine the *bulk* of feature work — container queries, anchor positioning, writing modes, stacking, transforms, containment, multicol, table, content-visibility — lands in exactly these rows. 
+ +**Buiy implementation hook (verified on `origin/main`).** The render handoff is already a serializable display list, computed entirely CPU-side before any pixel exists: + +- `ExtractedNode` (`Copy/Clone/Debug/PartialEq`, `render/extract.rs:65`); carrier `ExtractedNodes { nodes, logical_size, scale_factor }` (`extract.rs:139`) — nodes in `painters_z` order, **never re-sorted** by render. +- `EffectGroupExtract` (`extract.rs:35`); `PrimitiveBatchKey { primitive, layer }` (Ord/Hash, `render/buckets.rs:59`) and `InstanceBuckets { batches: BTreeMap<PrimitiveBatchKey, …> }` (`buckets.rs:85`) — natural `BTreeMap` iteration *is* the deterministic draw order. `PackedPartition { instances, group_ranges, flat_ranges }` (`buckets.rs:196`) — effect-group instance ranges. +- `PackedInstance` (`Copy/Clone/Debug/`**`Pod`**`/Zeroable`, `render/instance.rs:40`) — **byte-snapshottable now with no new derive**; the `[f32;13]` layout is pos2/size2/rgba4/radius1/clip4. +- Text: `TextQuad`/`ExtractedTextQuads` (`extract.rs:668`); `ExtractedGlyphs`/`GlyphEntityRun` (Clone/Debug/PartialEq, `render/prepare.rs:48`/`:67`); `GlyphAlphaInstance` (`Pod`) per glyph. +- Paint-order traversal is pure/unit-testable: `assemble_context_tree` (`extract.rs:206`), `context_tree_paint_order` (`extract.rs:222`), `partition_top_layer` (`render/top_layer.rs:17`). Skip predicate `node_skip_reason` (`render/visibility.rs:48`) → `ComputedPaintSkip`/`SkipReason` (`render/components.rs:443`/`:422`). + +None of these need a window or a GPU — the extract logic runs on the CPU. Most derive `Debug` (some `Pod`); add a small `serde` derive to the few that need JSON, and add `insta` (not yet a dependency). 
**Do not snapshot raw `Debug`.** Write a purpose-built `Display`/dump formatter — one paint command per line (`rect pos=… size=… color=token:… radius=…`), entities rendered as their `Name` component (not raw `Entity` bits, which depend on allocation order), floats rounded via `insta`'s `rounded_redaction` to kill last-ULP churn from the clip-space math. The format then stays stable across refactors of the underlying structs. This is the Flutter `toStringDeep` / WebRender RON-serialized-display-list pattern; it is precisely the tier `masonry_testing` skips (it jumps straight to PNG goldens — the gap Buiy should *not* replicate). + +**Two complementary sub-checks on the same handoff:** + +- **Byte-exact `PackedInstance` hex snapshot.** `PackedInstance` is `Pod`, so it can be cast to a byte slice and hex-snapshotted as a *separate, stricter* invariant — a deterministic byte-exact regression on the px→clip-space packing with no formatter at all. Complementary to the `Display` dump (diff-readable but format-versioned), not a replacement. +- **Forced-colors token-flow — already a structured assertion (gate #11a/b).** This is the one Tier-2 sub-check that is **already built**: `analyze_forced_colors` and `analyze_shadow_only` (`forced_colors_analyzer.rs:51`/`:89`) are static assertions returning `Vec`, far more diagnostic than a forced-colors golden ("this badge painted a literal color outside the token set," not "pixels changed"). The remaining work is to **extend the existing analyzer**, not build it: re-point it from hand-built `CatalogPaint` descriptors at the live widget catalog (follow-ups.md:462–481). The residual forced-colors *visual* question — that the system tokens rasterize visibly distinct, plus the `BoxShadow` draw-skip — is the reftest/golden half (§3 Tier 4). + +**Cost.** Low. Pure CPU, deterministic, sub-millisecond. 
`insta`'s `cargo insta review` directly implements the spec's required `--accept` UX; `INSTA_UPDATE` defaults to `no` in CI so unreviewed snapshots fail the build. + +**Can't catch.** True rasterization: SDF corner AA, shadow blur kernel, glyph hinting, blend math, gamma/sRGB. The structure is right; whether the GPU turned it into the right light is unobservable here. + +### Tier 3 — metamorphic + property-based invariants (proptest, gate #12) + +**What it catches.** Relations that must hold over *generated* scenes, with **no golden and no oracle** — the technique sidesteps the oracle problem by asserting relations between outputs under input transforms (metamorphic testing; established for graphics/shader compilers, cf. GraphicsFuzz). Highest leverage per test because one generator covers an unbounded fixture space. `proptest` is already a dep with **zero** generators written. Concrete invariants on Buiy's existing intermediates: + +- **Transform round-trip:** `translate(d)` then `translate(-d)` ⇒ identical paint commands; `rotate(360°)` ≈ identity within epsilon; `scale(2×)` ⇒ all geometry ×2 and nothing else. Buiy's transform composition is `compose_transform` / `transform_matrix_to_mat4` (`crates/buiy_core/src/layout/systems.rs:3775` / `:3716`) — the *render* transform path, distinct from `layout/translate.rs` (the Taffy style translator, no `Mat4`). Assert the round-trips on the former. +- **Paint order is a total order:** `ExtractedNodes.nodes` has no duplicate entity, every painted entity appears exactly once, the sort is *stable* (equal paint key ⇒ document order). Assert generatively, not just in the hand-picked unit tests. +- **z-isolated containment:** no entity of stacking context A appears between two entities of context B in the flattened order. +- **Top-layer dominance:** any `TopLayer != None` entity paints after every normal-stacking entity, ordered `Fullscreen < Tooltip < Popover < Modal` (`partition_top_layer`, `top_layer.rs:17`). 
+- **No-NaN / finiteness:** every `ExtractedNode.size ≥ 0`, every `PackedInstance` field finite. (Assert on the un-flipped `ExtractedNode` rather than the packed instance where the y-flip convention applies.) +- **BiDi caret round-trip ≡ identity — the concrete next text invariant** (gate #12's named text invariant). The shaping snapshots already pin glyph positions; the missing piece is the *caret* relation: mapping a logical caret index to a visual position and back is identity across arbitrary mixed-direction runs; visual caret order within a run is monotonic in logical order for LTR and reversed for RTL; a generated BiDi string's run partition covers every codepoint exactly once. These need no rasterizer — they are relations over the shaper/BiDi output, the cheapest possible text-correctness tier, and not yet built. + +**Implementation hook.** `proptest` (gate #12 already specifies generators with shrink-to-minimal-failing-tree). Run invariants over the same CPU extract path as Tier 2. Save failing seeds for reproduction. + +**Cost.** Low; CPU-only, deterministic given seed. + +**Can't catch.** Absolute correctness of any single concrete output (it asserts relations, not values) and rasterization. + +### Tier 4 — reftests (`==` / `!=` with fuzzy matching), plus a CPU-vs-GPU cross-check + +**Buiy's highest-leverage pixel-level investment, and still entirely absent** — there are no reftests anywhere in the tree. It needs no stored golden and it directly exercises the CSS-subset layer, Buiy's actual differentiated, bug-prone surface. + +**The mechanism.** A reftest renders a **test** input and a **reference** input *with the same engine in the same run* and asserts their bitmaps are identical (`==`) or distinct (`!=`) — never against a stored baseline. The CSS-WG insight: "the power of reftests comes from the fact that there is more than one way to achieve any given visual effect." 
Render the feature one way, render the same intended pixels a second way that does *not* use that feature, assert equality. Because both halves share the identical GPU, driver, font stack, AA, and clock in one process, **all platform-variance terms cancel** — you test an internal engine invariant, not a frozen artifact. The reference is an *independent oracle*. This is differential testing; it is how WPT runs thousands of CSS-conformance cases with zero stored screenshots, and how Servo — the closest prior art, a parallel Rust engine — gets its primary CSS signal (`mach test-wpt` / `test-css`). + +**What it catches.** Any bug where two equivalent inputs diverge: flex/grid vs. absolute equivalence, margin collapse, writing-mode/logical-property resolution, multicol/table geometry, stacking/z-order, clipping, transform geometry, containment boundaries — and, squarely where Buiy lives, **container queries, anchor positioning, content-visibility**. `!=` anti-tests prove a feature *does* something (e.g., `content-visibility:hidden` must `!=` the visible render — guards against silent no-ops). Forced-colors's residual visual half (§ Tier 2) lands here too: a forced-colors scene `==` a hand-authored system-token reconstruction. + +**How to build it on the wgpu pipeline.** Buiy's "documents" are typed BSN assets or programmatic widget trees, so the manifest is ordinary Rust, not a `reftest.list` text file. + +1. **Render-to-texture path:** two offscreen captures in the same app run, fed to the diff (the §4 metric, not the existing naive ones). Reuses the headless capture path the `#[ignore]` GPU goldens already exercise (`render_golden_harness.rs`). +2. 
**Pairing API as the manifest** — leverage the type system: + ```rust + reftest!(match, "container-query-collapse", test_scene, ref_scene); + reftest!(mismatch, "cv-hidden-actually-hides", visible_scene, hidden_scene); + ``` + A `#[test]` per pair (or a data-driven harness over `&[RefCase]`) keeps it at the unit/integration tier under the existing `xvfb-run -a cargo test` gate — **no new CI infra, no golden storage, no `--accept` loop.** +3. **Determinism reuse:** reftests need the §4 determinism stack *less* than goldens (both halves share clock/atlas in one run, so drift cancels) but reuse the same scaffolding — including the already-built `GoldenConfig::deterministic()` triad. +4. **Fuzzy matching** via Mozilla's two-axis model: `fuzzy(d_lo-d_hi, p_lo-p_hi)` — max per-channel delta of any pixel **and** count of differing pixels. Mozilla's discipline: ranges should *not* include 0 when a difference is expected (`fuzzy(1-1, 8-8)` pins both ends, so if AA suddenly touches 9 pixels it's a regression, not a pass). This is impossible to express with the existing naive L1/RMSE — fixing the metric (§4) is the prerequisite for *this* tier. + +**The Vello-style CPU-vs-GPU cross-check (a Tier-4.5, golden-free).** Buiy already has a CPU port of the SDF rounded-rect (exercised in `crates/buiy_core/tests/render_instance.rs`). That is exactly Vello's pattern: a deterministic CPU rasterization reference against which the GPU path is asserted within tolerance (research:deterministic-gpu — Vello treats `vello_cpu` f32 as the cross-check oracle for the GPU shaders). Promote the CPU SDF from "unit-test precedent" to a *rasterization reference*: render the same primitive on the GPU and on the CPU port in one run, diff with the §4 metric. It stores zero bytes, needs no second authoring path (the CPU port *is* the independent implementation), and catches SDF/AA bugs that a reftest can't (no feature-free reference exists for corner AA). 
It sits between reftests and goldens and should be built before broad goldens. + +**Authoring patterns, mapped to Buiy:** flex-justified row `==` same boxes at literal offsets; `@container`-triggered scene `==` scene authored as if the rule applied; `cv:hidden` subtree `!=` visible subtree; logical-property layout `==` physical-property mirror; `translate(50,50)` `==` element authored at the translated coordinates. + +**Cost.** Moderate setup (the metric + capture harness, both partly in place), then *sub-linear maintenance as the feature surface grows*: every new CSS-subset feature ships with a reference pairing whose only upkeep is "keep two equivalent scenes equivalent" — no per-platform golden, no rebaseline on theme tweaks, no binary blobs, no eyeball review. + +**Can't catch — the irreducible golden residue.** Effects with no simpler equivalent: the CSS-WG explicitly lists underline position/thickness and `dotted`/`dashed`/`ridge`/`groove`/`double` borders as "impossible to reftest." For Buiy that means the **drop-shadow Gaussian falloff, glyph rasterization fidelity (hinting/subpixel), color-emoji compositing, blend-mode math, and gamma/sRGB encode** (and SDF corner AA, which the CPU cross-check above handles instead) — all of which render on `main` today. A reftest can confirm a shadow is translation-invariant or symmetric (and Tier 3 should), but not that the falloff is *correct*. That residue is Tier 5. + +### Tier 5 — golden/screenshot regression (the last line, smallest possible set) + +**What it catches.** Only what Tiers 1–4 provably cannot: true rasterization correctness — SDF corner AA (where the CPU cross-check is insufficient), shadow blur kernel, glyph atlas output, **color-emoji (CBDT/COLR/bitmap) compositing**, effect compositor, blend modes, gamma/sRGB. Each golden must be justified by "no feature-free reference exists." **These residues all render on `origin/main` today** (effect compositor, drop-shadow, glyph atlas, color-emoji are landed). 
The GPU capture path also exists — `render_golden_harness.rs` and the four `text_*_gpu.rs` suites already capture and diff. What is missing is **not** the renderer or the capture path but the **stored baseline corpus, the perceptual metric, and the curated fixture set**: the existing `#[ignore]` tests assert `perceptual_diff < tol` between two fresh captures ("re-capture IS the golden"), which is a *determinism* check, not a *regression baseline* against a blessed image. + +**Color emoji is the canonical irreducible golden class.** It has no feature-free reference (you cannot re-author a CBDT bitmap or a COLR layer stack from primitives), it is highly platform- and font-version-sensitive, and it is exactly the kind of defect (wrong emoji, tofu fallback, mis-composited COLR layers) a user notices instantly. It belongs in Tier 5 with a pinned bundled emoji font, captured once on the canonical rasterizer. + +**The Ahem/obscure-text split — keep real glyphs out of *layout* goldens.** Most goldens that contain text do not need *real* glyph rasterization to test what they're testing; they need stable boxes. Adopt the Flutter/Alchemist two-class trick (research:golden-screenshot): a **layout-determinism mode** that renders text as the Ahem em-box font (or `obscureText`-style colored rectangles), making any text-bearing golden byte-identical across hosts, paired with a **narrow real-font suite** that actually asserts glyph fidelity. The shaping `.snap` fixtures already pin glyph *positions* deterministically; this collapses the font axis for the bulk of *pixel* goldens and confines genuinely-flaky real-glyph rasterization to a small, deliberately-chosen set. + +**Why it stays small.** With Tiers 1–4 absorbing layout, composition, the entire relational CSS-subset surface, and (via the CPU cross-check) SDF AA, gate #2's corpus shrinks from "every widget × state × theme × viewport × platform × DPR" to a curated handful of rasterization fixtures. 
The `× platform` axis — the worst multiplier — is collapsed by the determinism stack (§4): Buiy *owns its renderer*, so it pins one canonical software rasterizer and stores **one** golden per cell, not one per OS/GPU. DPR is handled by capturing each rasterization fixture at the supported scale factors (1×, 2×) — and `ExtractedNodes` already carries `scale_factor`, so this is a fixture axis, not new plumbing. + +**Implementation hook.** The headless offscreen `Texture` → readback → PNG path already backs the `#[ignore]` goldens; reuse it with `GoldenConfig::deterministic()` (built) and the §4 perceptual/outlier metric, emitting a diff PNG on failure to power triage and a real `--accept` that persists to a stored baseline (the missing machinery behind the `accept` flag — follow-ups.md:779–802). + +**Cost.** Highest: stored corpus, `--accept` persistence, per-fixture tolerance, the residual flake budget. This is precisely why it must be the smallest tier. + +**Can't catch.** Bugs above the rasterizer that produce a visually-plausible frame (caught upstream) and absolute *design* quality / brand polish / animation feel — correctly parked in the manual release gate, not automated. + +## Cross-cutting mechanisms + +**Perceptual metric — replace the two naive metrics that already exist.** Buiy has *two* naive diffs: the L1 `perceptual_diff` in `golden.rs` and the global RMSE `compare_images` in `crates/buiy_verify/src/visual.rs`. Both are wrong in the same two directions: a real bug touching 0.5% of pixels (a mispositioned glyph, a missing focus ring, an 8px wrong-color badge) is divided across the whole frame and rounds below any sane tolerance — sensitivity *degrades* as the app grows, which is the user's exact fear — while imperceptible sub-pixel AA re-rasterization inflates the same number. One tolerance knob cannot separate the two. Mozilla (reftest `fuzzy`), wgpu (`Outlier count N over limit M. 
Max difference D`), and pixelmatch all independently converged on the same fix: a **two-axis fuzzy/outlier model with AA awareness**, not a single average. **Unify both naive metrics onto it.** + +Concrete target (replacing the L1 and RMSE bodies): return `{ differing_pixels, max_pixel_delta, diff_image, mssim }`, gated by `(max_pixel_delta, max_diff_pixels)` with AA pixels excluded. + +- **Per-pixel decision = pixelmatch's YIQ `colorDelta`** (luminance-weighted, `maxDelta = 35215 · threshold²`), so brightness errors weigh more than chroma, matching the eye. +- **Exclude antialiased pixels** via pixelmatch's brightest/darkest-neighbor sibling test (`includeAA` opt-in for tests that *want* to assert AA exactly) — the single feature the naive metrics lack and the biggest flake source on a GPU pipeline. +- **Gate on two budgets:** `max_pixel_delta` (no single pixel off by more than X) **and** `max_diff_pixels` (at most N non-AA pixels differ). This *is* the reftest fuzz model — the two tiers share one metric. +- **Default to `max_diff_pixels = 0`** once determinism is in place; widen per-golden with a documented reason, à la `fuzzy-if`. +- **Secondary advisory MSSIM** via `image-compare 0.5.0` (`rgba_blended_hybrid_compare`, premultiplied against the opaque canvas) to catch global gamma/blend drift that a small-N pixel budget under-weights. SSIM is *advisory only* — its failure mode is averaging out localized defects, so never the primary gate. (`dssim-core` is the best-in-class structural alternative if MSSIM proves too coarse.) + +Crate path: adopt `dify` or `pixelmatch-rs` (both already implement the YIQ delta + AA test on `image` buffers, ~150 LOC battle-tested) rather than re-deriving the `35215`/sibling constants; `image = "0.25"` is already a dep, `image-compare 0.5.0` (advisory channel) is the planned add. Skip butteraugli (its masking model *hides* small UI defects we want flagged) and VMAF (video, off-domain). 
**NVIDIA ꟻLIP (`nv_flip`) is a genuine fork in the road, not a settled defer:** research:perceptual-metrics recommends pixelmatch-YIQ+AA as primary with FLIP deferred, but research:deterministic-gpu notes wgpu — Buiy's *closest determinism model* — ships `nv_flip` (Mean/Percentile thresholds, 0.01–0.1) as its *primary* metric and emits a difference map. The two streams disagree; this report picks pixelmatch-primary because it is pure-Rust, ~150 LOC, and gives the two-axis budget reftests need, but flags FLIP-vs-pixelmatch as Open Question #3 rather than asserting it closed. + +**Deterministic-rendering stack for wgpu CI — extend the built triad.** Buiy owns the whole stack, so it engineers determinism at the source ("remove the nondeterminism, don't just tolerate it"). The flake-mitigation triad is *already built* — `GoldenConfig::deterministic()` pins fixed clock + `wait_for_fonts` + `warm_atlas`, and `fonts_ready()` is the implemented font-sync predicate (`golden.rs:82`). What remains is to wire the missing **software-rasterizer pin** below it, adopting wgpu's own approach: + +- **Software rasterizer: lavapipe** (Mesa's software Vulkan ICD, `libvulkan_lvp.so`), the single canonical CI rasterizer, **pinned to an exact self-built version** hosted as a release artifact with a custom ICD JSON (mirror wgpu's `gfx-rs/ci-build` + install-mesa action). Never the daily PPA — wgpu learned this produces day-to-day flakes from unrelated llvmpipe regressions. One canonical config ⇒ **one golden per cell, no per-GPU matrix**. Force selection with `WGPU_ADAPTER_NAME` / `VK_ICD_FILENAMES`. A Windows-WARP leg only if a DX12 matrix is ever wanted. Buiy's separate real-RX-6700-XT verification campaign covers actual-hardware shader paths, so the CI tier need not — cement that division of labor. 
+- **Determinism knobs, asserted before readback:** `LP_NUM_THREADS=0` (single-threaded tiling — multi-threaded FP accumulation reorders); MSAA + dithering pinned/off as CI-config constants (Buiy's analytic in-shader AA is already deterministic given identical FP; the risk is hardware MSAA resolve and dither); a manually-advanced Bevy `Time` at explicit virtual timestamps (never `Instant::now()`) — the triad's `fixed_clock`; **bundled fonts only** with system fallback disabled (the embedded deterministic fonts `fonts_ready()` already assumes), plus block-until-loaded; **glyph-atlas warmup** = pre-rasterize every glyph (and emoji bitmap/COLR layer) in the fixture's text, flush, then capture — the `warm_atlas` flag, satisfied structurally for text fixtures (`golden.rs` notes the producer inserts at extract before draw). **Pin the scale factor** as a CI constant (`ExtractedNodes.scale_factor`) — a 1× vs 2× render is a different rasterization, not a tolerance. Bake each as an explicit, asserted setup step ("0 pending assets, atlas fully warmed, DPR pinned") in the `buiy_verify` capture harness. + +**Golden storage strategy.** Staged to project size, designed now so migration is mechanical: + +- **Now:** the residual Tier-5 PNGs in git (git-LFS if churn bites), regenerated via a `--accept`/`--bless` flag on `buiy_verify` that **persists to a stored baseline** (the machinery the `GoldenConfig.accept` flag currently lacks), reviewed as the PR diff. The model to generalize is **already in-repo**: `BUIY_ACCEPT_SHAPING=1` curates the shaping `.snap` fixtures (`tests/text_shaping_snapshots.rs`) — extend that same curated-update discipline to PNG goldens. Tiers 1–3 snapshots are text `.snap` files in-tree (cheap, diff-readable); Tier 4 reftests and the CPU-vs-GPU cross-check store **zero** bytes. 
+- **Later (only if the golden count explodes):** commit-hash-keyed object storage (reg-suit's S3/GCS model — a keygen plugin walks the git graph to find the baseline commit, publisher fetches only that baseline), git stays clean. Do *not* build a Skia-Gold-class service until in-git actually hurts. **Design the digest/keying schema now** — key by `(widget, state, theme, viewport, backend, dpr)`, and **support multiple accepted goldens per digest from day one** (Skia Gold's "many positives per config"; retrofitting a single-golden assumption is painful even though the pinned rasterizer means usually one). + +**Human `--accept` / triage workflow + diff-report UX.** Two layers, both offline-first (no SaaS — matches the project ethos): + +- **Tiers 1–3:** `cargo insta review` *is* the accept loop (`a`/`r`/`s` per change, rewrites source on accept; CI fails on unreviewed). Free, native — and `BUIY_ACCEPT_SHAPING` already demonstrates the curated-snapshot variant in-repo. +- **Tier 5:** on any failing run, `buiy_verify` emits a **self-contained local HTML report** (model: reg-cli, now Rust→WASI so callable from a Rust harness) with side-by-side / toggle / diff-heatmap views and the emitted diff PNG, plus a `--accept` CLI that promotes actual→golden. Borrow Skia Gold's **time-boxed ignore rules** (silence an expected mass-change — e.g. a font roll — and triage after) and Argos's **flaky auto-ignore**. The triage primitive everywhere is `untriaged → positive/negative`; design for multiple positives per config. + +**Coverage auto-generation from the catalog/BSN.** The decisive lesson from Storybook/Chromatic: stop hand-writing screenshot tests; derive them from the catalog. 
Make the **BSN catalog the single source of truth** — define each widget's states once as BSN fixtures (a "gallery" asset), register the global axes (theme, viewport, forced-colors, DPR) as a matrix in `buiy_verify`, and **Cartesian-product fixtures × axes at test-collection time** (`insta`'s `glob!` drives one body over a fixture directory). A newly added widget fixture is then **auto-enrolled into the full matrix by construction** — no per-widget test code. This is Chromatic's "modes" mechanism (modes stack/multiply, each cell gets its own baseline), native to BSN. Forced-colors (#11) and DPR are just additional modes; wiring the existing `forced_colors_analyzer` to the live catalog (follow-ups.md:462–481) makes #11 fall out of the same enrollment. The matrix feeds *all* tiers: each cell yields a Tier-1 layout snapshot, a Tier-2 display-list + token-set snapshot, and (for the rasterization residue) a Tier-5 golden. + +**Animation / temporal determinism.** Buiy owns the clock, so it can do the strong thing SaaS tools cannot: drive the same manually-advanced Bevy `Time` the determinism stack mandates (à la Compose's `advanceTimeBy`) and snapshot at fixed logical times (t=0, mid, end), capturing **frame-sequence snapshots** at stepped times to catch *timing* regressions (easing, interpolation), not just end-states — versus the SaaS "pause and wait for stability" that only tests settled frames. Tier 2 display-list snapshots extend naturally to per-timestamp paint-command snapshots; the fixed clock is the same `fixed_clock` the `GoldenConfig` triad already pins. + +## Prior-art map + +Technique → where it is documented in Buiy's corpus / external source. Gaps worth promoting back are flagged. + +| Technique | Buiy prior-art folder / external source | Captured? 
| Gap to promote | +|---|---|---|---| +| Golden image + perceptual diff | `prior-art/xilem-masonry/masonry-toolkit.md` (`masonry_testing`, `insta`, byte-determinism); Flutter/Skia-Gold/Playwright/Alchemist (research:golden-screenshot) | partial — flake-triad + L1 metric + GPU re-capture goldens built | stored-baseline corpus; perceptual-metric algorithm; per-config goldens; multi-positive baselines; **Ahem/obscure-text layout mode** | +| Structured display-list / paint-tree snapshots | `prior-art/blink/stacking-and-paint.md` (property trees); `prior-art/servo-stylo/rendering.md` (display list); Flutter `toStringDeep`, WebRender RON (research:structured-snapshots) | partial — intermediates asserted field-by-field, not holistically | **holistic `insta` snapshot of the intermediate** — upgrade the existing per-field asserts; the missing `buiy_verify` tier | +| Layout-number snapshots (gate #5) | `prior-art/taffy/lessons.md` (WPT corpus); foundation `verification.md` #5 | partial — numeric asserts in `tests/layout.rs`, no snapshot | fixture cardinality; per-widget vs. per-feature granularity; holistic snapshot | +| Reftests (`==`/`!=`, fuzzy) | Blink/Servo via WPT (research:reftests); CSS-WG wiki; Mozilla `reftest.list` — substantially covered inside `prior-art/blink` & `prior-art/servo-stylo` (both run WPT) | **absent — none in tree** | **adopt as primary mechanism of gate #2** — add a reftest facet to the existing Blink/Servo folders; no reftest gate exists | +| CPU-vs-GPU rasterization cross-check | Vello `vello_cpu` reference (research:deterministic-gpu); Buiy's own CPU SDF port (`render_instance.rs`) | **not in corpus** | promote the CPU SDF from unit-test precedent to a golden-free rasterization oracle | +| AccessKit tree snapshots | `prior-art/xilem-masonry/masonry-toolkit.md`; `prior-art/bevy-ui/open-problems.md`; foundation #3 | yes (shipping) | tree-change semantics (breaking vs. 
cosmetic) | +| Deterministic render (clock/font/atlas) | foundation #2; `prior-art/gpui/lessons.md` (byte-deterministic paint); wgpu install-mesa (research:deterministic-gpu) | partial — `GoldenConfig::deterministic()` triad + `fonts_ready()` built | **lavapipe pin recipe; `LP_NUM_THREADS`/MSAA knobs; DPR-pin** — the rasterizer pin below the triad is absent | +| Perceptual fuzzy matching | foundation #2 ("tolerance budget"); pixelmatch/wgpu/Mozilla (research:perceptual-metrics) | partial — naive L1 + RMSE exist | **unify L1+RMSE → pixelmatch-YIQ+AA; two-axis budget; FLIP-vs-pixelmatch decision** | +| Text / glyph / BiDi / emoji visual testing | text subsystem **landed** (T1–T9); shaping `.snap` fixtures (6 scripts); GPU text goldens; Flutter Ahem/Alchemist (research:golden-screenshot); foundation #12 BiDi caret invariant | partial — shaping snapshots + GPU goldens built | **BiDi caret round-trip proptest; color-emoji stored golden; Ahem layout mode** | +| Paint invalidation / change detection | `prior-art/blink/stacking-and-paint.md`; `prior-art/servo-stylo/rendering.md`; `prior-art/taffy/lessons.md` (ECS `Changed`) | yes | testing change-detection *correctness* (false-negative skipped paints) | +| Property-based / metamorphic | foundation #12 (`proptest`, invariants); metamorphic testing for shader compilers (research:structured-snapshots) | partial — `proptest` dep present, zero generators | generator implementations; failure-seed reproduction; the §3 Tier-3 invariants | +| Contrast linter (WCAG) | foundation #9; `crates/buiy_verify/src/contrast.rs` | yes (shipping) | gradient/image backgrounds; dynamic contrast | +| Forced-colors (gate #11) | foundation #11; `render/forced_colors_analyzer.rs` | partial — analyzer built over `CatalogPaint` | wire #11a/b analyzer to the **live** catalog (follow-ups.md:462–481); residual visual half + `BoxShadow` draw-skip as a reftest | +| Golden storage / triage at scale | Skia Gold, reg-suit, Argos, Chromatic 
(research:golden-screenshot, research:triage-coverage-tooling) | **not in corpus** | commit-keyed object storage; HTML triage report; time-boxed/flaky ignores | +| Coverage-by-construction from catalog | Chromatic "modes" (research:triage-coverage-tooling) | **not in corpus** | BSN-catalog → matrix enrollment rule | + +**Worth promoting into the corpus:** (1) a **reftest facet inside the existing `prior-art/blink/` and `prior-art/servo-stylo/` folders** (both already run WPT; a greenfield `prior-art/wpt-reftests/` is not the primary path) — it is the highest-leverage technique and the one genuinely absent from Buiy's tree; (2) a `prior-art/skia-gold/` (or reg-suit) facet for the storage/triage-at-scale escape hatch; (3) the "structured-intermediate regression testing" insight, which exists nowhere as a coherent technique; (4) the **text/glyph/BiDi/emoji visual-testing** facet (Ahem mode, caret round-trip, emoji-as-golden) — the shaping-snapshot precedent is in-tree but its generalization is undocumented. + +## Recommendation & roadmap + +**Build on what exists.** The flake-triad config, the L1/RMSE diffs, the GPU capture path, the forced-colors analyzer, and the shaping-snapshot `--accept` precedent are all landed; the work is to add the absent tiers (reftests, holistic snapshots, proptest invariants) and generalize the existing partials (metric, golden persistence, live-catalog wiring). Coverage-per-effort is highest at the bottom. + +**Step 1 — reftest harness + metric upgrade (highest leverage; both unblock the most).** Reftests are entirely absent and are Buiy's highest-leverage pixel-level mechanism; they require the two-axis AA-aware metric to express fuzzy matching, so land the two together. 
Replace the naive L1 `perceptual_diff` (`golden.rs`) **and** the RMSE `compare_images` (`visual.rs`) with `{ differing_pixels, max_pixel_delta, diff_image, mssim }` gated by `(max_pixel_delta, max_diff_pixels)`, AA excluded, via `dify`/`pixelmatch-rs` + advisory `image-compare 0.5.0`. Then build the `reftest!(match/mismatch, …)` API on the existing headless capture path (reuse `render_golden_harness.rs`'s capture and `GoldenConfig::deterministic()`) plus the **CPU-vs-GPU SDF cross-check**, and author reference pairings for the live CSS-subset features (container queries, anchors, content-visibility, writing modes, transforms, stacking) + the forced-colors visual reftest. Runs under the existing `cargo test` gate, zero golden storage. + +**Step 2 — holistic snapshots + proptest invariants (the missing deterministic middle).** Add `insta`, build the `Display` dump formatter (plus the `Pod` byte-hex `PackedInstance` check), and **upgrade the existing field-by-field asserts to holistic snapshots**: layout-number snapshots replacing the numeric asserts in `tests/layout.rs` (gate #5), and display-list/paint-order snapshots replacing the per-field asserts in `render_buckets.rs`/`render_paint_order.rs`/`render_instance.rs`/`render_extract.rs`/`top_layer.rs` (the missing Tier-2 holistic layer). Add the §3 metamorphic/property invariants on the present-but-empty `proptest` dep — paint-order totality, transform round-trips on `compose_transform`, top-layer dominance, finiteness, and the **BiDi caret round-trip** (gate #12's named text invariant, now buildable on the landed shaper). All pure-CPU, deterministic, no GPU; ships the `cargo insta review` accept loop for free. 
+ +**Step 3 — generalize golden persistence + determinism pin (for the Tier-5 residue that already renders).** Build the **stored-PNG corpus + `--accept` FILE persistence** behind the existing `GoldenConfig.accept` flag (follow-ups.md:779–802), modeled on the in-repo `BUIY_ACCEPT_SHAPING` curated-update flow; add per-fixture tolerance budgets and the Ahem layout-determinism mode. Below the already-built flake-triad, wire the **software-rasterizer determinism pin** (lavapipe pinned, `LP_NUM_THREADS=0`, MSAA/dither constants, DPR pinned via `scale_factor`). Capture stored goldens *only* for the rasterization residue with no feature-free reference — shadow kernel, effects, glyph fidelity, color-emoji — all of which render on `main` today; the GPU capture path already exists, so this is corpus + metric + curation, not new rendering. Goldens in git/LFS, `--accept` via a local HTML triage report. Smallest tier, built once the metric (Step 1) exists. + +**Step 4 — coverage-by-construction + live forced-colors.** Wire the BSN catalog → (theme × viewport × forced-colors × DPR) matrix so new widget fixtures auto-enroll across all tiers, and re-point `forced_colors_analyzer` from hand-built `CatalogPaint` at the live widget catalog (follow-ups.md:462–481) so gate #11a/b runs over the real catalog within the same enrollment. + +**The v1 CI gate** (what `buiy-verification-design` should target first) = **Steps 1–2**: the reftest harness + unified metric, plus layout/display-list `insta` snapshots and proptest invariants — the reftests and CPU-cross-check under the existing `cargo test` gate with zero golden storage, the snapshots pure-CPU. That closes gate #5, adds the missing display-list gate, completes gate #12, and exercises the live CSS-subset surface, catching the bulk of regressions. 
Step 3 (stored-golden corpus + persistence + determinism pin) is v1.1, deliberately last so the flaky/expensive baseline tier is built smallest and only after the cheaper tiers absorb everything they can — even though the renderer it captures already exists. + +**The key tradeoff — reftests-first vs. goldens-first.** This report commits to **reftests-first** (Tier 4 before a broad Tier 5 stored corpus). Rationale: reftests need no stored baseline, cancel platform variance by construction, directly exercise Buiy's actual risk surface (the CSS-subset layer above Taffy), provide an independent oracle, and scale *sub-linearly* in maintenance as features grow. They are the lowest tier that covers relational visual behavior, per CLAUDE.md — and notably the one mechanism the landed infra does *not* already have a foothold in. + +**Runner-up — goldens-first (broad stored screenshot regression as the primary gate), rejected.** It is the industry-default move (Flutter `matchesGoldenFile`, Playwright `toHaveScreenshot`, masonry's `assert_render_snapshot!`) and superficially simpler — and Buiy is closer to it than to reftests today (capture path + flake-triad + L1 metric already exist). Rejected anyway because it is exactly the methodology that *fails as Buiy scales*: cost scales with the widget×state×theme×viewport×platform×DPR product; it demands per-platform goldens and a stored-baseline corpus *up front* before relational coverage exists; every legitimate restyle reds the suite and forces a human `--accept` pass; binary blobs churn git history. (One commonly-cited golden weakness — "a golden only re-asserts equality with a past snapshot of the same code, so it can't catch a bug present *when the golden was captured*" — is real but *symmetric*: a reftest is equally blind if the test and reference share the buggy code path, which is exactly why reftest reference-independence is Open Question #1. State it once, not as a clean win.) 
masonry takes the goldens-first path and lacks the cheaper deterministic tiers entirely — the gap this strategy avoids. Goldens are not eliminated; they are *demoted* to the irreducible rasterization residue (Step 3), where no feature-free reference exists and they earn their flake cost. The existing `#[ignore]` re-capture goldens already sit at exactly that demoted position — a determinism check, not a baseline. + +## Open questions + +These need a human/design decision before or during `buiy-verification-design`: + +1. **Reftest reference authoring discipline.** The reference must use a *disjoint* code path from the feature under test, or a shared bug corrupts both and the comparison passes vacuously. Who reviews that independence, and can it be lint-enforced (e.g., "reference scene must not contain a `@container` rule")? Without a discipline, reftests silently lose their teeth — and this is the same independence the goldens-first rebuttal turns on. + +2. **Per-fixture fuzz budgets.** Even reftests need bounded fuzz for legitimate sub-pixel nondeterminism. What is the default `fuzzy(d_lo-d_hi, p_lo-p_hi)`, and is it calibrated statistically (capture N runs, measure jitter) or hand-set per fixture? Mozilla's "ranges must not include 0" discipline implies each fuzz is itself asserted — is Buiy willing to pin both ends? (The existing `#[ignore]` goldens use a flat `perceptual_diff < ~1e-4`; the two-axis budget supersedes it.) + +3. **Perceptual metric: pixelmatch-YIQ vs. NVIDIA ꟻLIP.** This report picks pixelmatch-primary (pure-Rust, two-axis budget) and defers FLIP, but wgpu — Buiy's closest determinism model — ships `nv_flip` as its *primary* metric with a localized error map. Decide whether to adopt FLIP from the start (stronger localization, heavier, no mature pure-Rust crate) or follow the pixelmatch path and upgrade only if pixel-budget tuning proves insufficient. Either way both the L1 `perceptual_diff` and the RMSE `compare_images` are replaced. + +4. 
**lavapipe vs. real-GPU for Tier 5.** This report recommends lavapipe-only for CI goldens, with real-hardware verification living in the separate GPU-verify campaign (already run on an RX 6700 XT). Confirm: are there rasterization bugs (driver-specific blend/AA) that *only* a real GPU surfaces and that must therefore be a CI gate, not a one-shot campaign? + +5. **Display-list dump format stability contract.** The Tier-2 formatter is the durable artifact under churning structs (`ExtractedNode`, `PackedInstance`, `InstanceBuckets`). What is its versioning/compatibility policy — and does a format change require re-blessing every snapshot (acceptable) or is it designed to be diff-stable across formatter edits (harder)? + +6. **Storage-migration trigger.** At what golden count / repo-size does in-git (or LFS) stop being acceptable and the commit-keyed object-storage migration fire? Name the threshold now so it is a planned step, not a crisis. + +7. **Text golden strategy: Ahem-mode boundary and emoji baseline.** Which goldens render real glyphs (the narrow fidelity suite) vs. the Ahem/obscure-text layout mode? The shaping `.snap` fixtures already pin glyph positions for 6 scripts; the open part is the *pixel* fidelity split. And what is the color-emoji baseline — a pinned bundled emoji font captured once, and how is a font-version roll triaged (time-boxed ignore)? + +8. **Animation snapshot scope.** Frame-sequence snapshots at stepped clock times catch timing regressions but multiply the fixture count by the number of sampled timestamps. Which animations warrant temporal coverage (vs. end-state only), and at what sampling density? 
diff --git a/docs/specs/2026-06-15-buiy-verification-design/README.md b/docs/specs/2026-06-15-buiy-verification-design/README.md new file mode 100644 index 0000000..9bcda3f --- /dev/null +++ b/docs/specs/2026-06-15-buiy-verification-design/README.md @@ -0,0 +1,103 @@ +# Buiy verification design — the visual-bug-detection pyramid + +**Date:** 2026-06-15 +**Status:** draft +**Realizes:** the strategy report [`reports/2026-06-14-visual-bug-detection-strategy.md`](../../reports/2026-06-14-visual-bug-detection-strategy.md) (five-tier pyramid, reftests-first) and the foundation verification gates #2 (visual), #5 (layout snapshots), #11 (forced-colors), #12 (proptest invariants) in `specs/2026-05-07-buiy-foundation/verification.md`. + +## Thesis + +Detect visual bugs with a **five-tier pyramid, reftests-first** — push detection *down* to the cheapest, most-deterministic rung that can see the bug, so the flaky, expensive pixel tier shrinks to the irreducible rasterization residue. Layout-number snapshots (Tier 1) and holistic CPU display-list/paint-order snapshots (Tier 2) catch geometry and composition bugs with zero GPU; proptest invariants (Tier 3) cover relational properties over an unbounded scene space with no oracle; **reftests (Tier 4)** — render a feature two equivalent ways in one process and assert `==`/`!=`, all platform variance cancelling — are Buiy's highest-leverage pixel mechanism and the one wholly absent from the tree; and stored goldens (Tier 5) are demoted to only what no feature-free reference can reach (SDF AA beyond the CPU cross-check, shadow kernel, glyph/color-emoji fidelity, blend/gamma). The whole stack rides one AA-aware two-axis perceptual metric and one deterministic-capture builder, both built on the landed `GoldenConfig` flake triad and the existing headless capture path — the renderer and capture already exist; what is new is the corpus discipline, the relational tiers, and the unified metric. 
+ +## Architecture + +**Crate boundary.** The harness has a pure half and a GPU half, split by what each needs: + +- **`buiy_verify`** (depends on `buiy_core + bevy + image + proptest + serde`, adds `insta`, a perceptual-metric crate, `inventory`, `toml`, `base64`) is the harness home. It holds everything app-independent: the metric, the snapshot dump formatters, the proptest generators/predicates, the reftest pairing/aggregation logic, the golden persistence + triage, the `DeterministicApp` builder, and the coverage matrix. +- **`buiy_core::render::golden`** holds the *device-coupled* capture only. The shared seam is promoted out of `crates/buiy_core/tests/support/mod.rs` into `render/golden.rs` src as `capture_to_image(&mut App, &GoldenConfig) -> image::RgbaImage`, callable by `buiy_verify`. `buiy_core` **cannot** depend on `buiy_verify` (the harness depends on core, not the reverse), so the naive `perceptual_diff` (L1) is *deprecated in place* and its callers migrate outward to `buiy_verify::metric`. + +**`buiy_verify` module layout** (one module per concern, each its own child file): + +``` +buiy_verify:: + metric — AA-aware two-axis perceptual diff (the shared primitive for tiers 4 & 5) + snapshot — Tier 1 layout-number + Tier 2 display-list/paint-order dumps (insta) + invariant — Tier 3 proptest scene generators + predicate fns (no oracle) + reftest — Tier 4 RefCase / reftest! / run_reftest + the CPU-vs-GPU SDF cross-check + golden — Tier 5 assert_golden persistence, multi-positive corpus, HTML triage + determinism — DeterministicApp builder, GoldenConfig extensions, lavapipe CI pin + coverage — Matrix Cartesian product over BSN fixtures, auto-enrolling every tier +``` + +**Crate-dependency note.** The only new dependency that `buiy_core` (the GPU half) gains is `image = "0.25"` (already a workspace dep), needed for `capture_to_image`. 
Every other new crate (`insta`, the perceptual-metric crate, `inventory`, `toml`, `base64`) lands in `buiy_verify` and is gated by `cargo deny check` before it merges (CLAUDE.md supply-chain check; `deny.toml` is allow-list-only, so a new transitive license fails CI by design and must be added explicitly, never via an exception hack). + +## Tier table + +| Tier | What it catches | Where it lives | Headless / GPU | Foundation gate(s) | +|---|---|---|---|---| +| **1 — layout-number snapshots** | geometry: wrong size/position, intrinsic-size, container-query math, Taffy-bridge bugs | `buiy_verify::snapshot` (`assert_layout_snapshot`) over `ResolvedLayout` | **Headless** (pure CPU) | **#5** | +| **2 — display-list / paint-order snapshots** | composition: paint *order* (z-sort, tooltip-behind-modal), paint *set* (cull, missing-token sentinel), paint *params* (wrong token/radius/transform), stacking-context formation; + forced-colors token flow | `buiy_verify::snapshot` (`assert_display_list_snapshot` + `PackedInstance` byte-hex) over `ExtractedNodes`/`InstanceBuckets` | **Headless** (pure CPU) | **#11** (forced-colors token flow, via `coverage`) | +| **3 — metamorphic / property invariants** | relations with no oracle: paint-order totality, transform round-trips, top-layer dominance, finiteness, BiDi caret round-trip | `buiy_verify::invariant` (proptest generators + predicate fns) | **Headless** (pure CPU) | **#12** | +| **4 — reftests + CPU-vs-GPU cross-check** | the CSS-subset surface relationally: flex/grid↔absolute, container queries, anchors, content-visibility, writing modes, transforms, stacking, clipping, forced-colors visual residual; SDF corner AA (via the CPU oracle) | `buiy_verify::reftest` (`RefCase`/`reftest!`/`run_reftest`), capture via `buiy_core` | **GPU** (`#[ignore]`) | **#2** (relational half) | +| **5 — golden / screenshot regression** | true rasterization only: SDF AA, shadow blur kernel, glyph/color-emoji atlas, effect compositor, blend/gamma, 
forced-colors `BoxShadow` draw-skip | `buiy_verify::golden` (`assert_golden`, stored `tests/goldens/` corpus), capture via `buiy_core` | **GPU** (`#[ignore]`) | **#2** (residue half) | + +Cross-cutting: **`metric`** is shared by tiers 4 and 5; **`determinism`** (the `DeterministicApp` builder + lavapipe CI pin) underpins tiers 4 and 5 and realizes the source-of-truth half of gate #2's determinism requirement; **`coverage`** Cartesian-products every fixture across all five tiers and wires the live-catalog half of gate #11. + +## Verification of the harness + +The harness is load-bearing test infrastructure, so each tier carries its **own** non-snapshot meta-tests proving it tests what it claims — a property suite that never fails is worthless, a snapshot that passes vacuously is worse than none. The discipline, per child file: + +- **metric** (`metric.md` § Verification): known-answer unit tests — identity ⇒ zero diff; a single wrong-by-200 pixel is caught at *every* frame size N ∈ {16, 256, 2048} (the exact §4 regression L1/RMSE fail); AA on/off pins the sibling test; two-axis independence proves both axes bind; dimension/empty ⇒ explicit `Err` (not the silent `1.0`). Pure CPU, no GPU lane. +- **snapshot** (`snapshots.md` § Verification): the dump is `assert_eq!`-equal across two apps spawned in *different entity order* (proves `Name`-keyed order-invariance, a plain assert so it cannot pass vacuously); the hex round-trips bytes; a format-version tripwire forces a conscious re-bless; each migration is checked behavior-preserving against the old per-field asserts (the half-size sign regression must still fail). +- **invariant** (`invariants.md` § Verification): **mutation fixtures** — a hand-built scene that violates exactly one relation must be rejected, plus a known-good control that passes (the Tier-3 analogue of the half-size sign-bug regression). 
The `top_layer_dominates` mutation fixture also pins deviation #3: it fails if anyone "fixes" the predicate to compare the enum discriminant. +- **reftest** (`reftests.md` § Verification): an aggregation truth table on stub `Diff`s (no GPU); a `match` of a scene with *itself* passes at `(0,0)` while a `match` of two different scenes *fails* (proves the harness can fail — guards a vacuous green); the independence lint is itself RED/GREEN-tested. +- **golden** (`goldens.md` § Verification): in-memory match/mismatch + multi-positive + bless round-trip + fail-closed-on-empty-corpus + report self-containment, all pure CPU; one end-to-end golden per residue class on the GPU lane. +- **determinism** (`determinism.md` § Verification): idempotent-capture (two fresh apps diff at `(0,0)`) *and* knob-sensitivity negatives (flipping DPR / font-mode / MSAA *changes* the bytes — proves the knobs are load-bearing, not no-ops); quiescence assertions fire on an injected never-loading asset. +- **coverage** (`coverage.md` § Verification): `catalog()` (inventory) and the `glob!` walk enumerate the identical set; every `CoverageKey::stem()` is unique and round-trips; a deliberately-broken fixture must produce a forced-colors violation through the *live* producer (proving it observes real paint, not a stale descriptor). + +All headless meta-tests run under `cargo test --workspace` with **no** `--ignored`; the GPU meta-tests are `#[ignore]` on the real-adapter lane (`cargo test -- --ignored --test-threads=1`). + +## Build order + +The phasing belongs to the plan, not this spec; the **priority** is fixed by the report roadmap and is reftests-first: + +1. **metric + reftests** — the unified two-axis AA-aware metric (replacing the L1 `perceptual_diff` and the RMSE `compare_images`) and the `reftest!` harness + CPU-vs-GPU SDF cross-check on the existing capture path. Highest leverage; zero golden storage; both unblock the most. +2. 
**snapshots + invariants** — add `insta`, the `Display` dump formatters + `PackedInstance` byte-hex, and the proptest generators/predicates. Pure-CPU, deterministic, closes gate #5 and #12 and adds the missing Tier-2 display-list gate. +3. **goldens + determinism** — the stored-PNG corpus + `BUIY_BLESS` persistence, per-fixture fuzz budgets, the Ahem layout-determinism mode, and the lavapipe CI rasterizer pin below the landed flake triad. Smallest tier, deliberately last. +4. **coverage** — the BSN-catalog → (theme × viewport × forced-colors × DPR) Cartesian matrix that auto-enrolls new fixtures across all tiers, and the re-point of `forced_colors_analyzer` to the live catalog (gate #11). + +The v1 CI gate is steps 1–2 (reftests + metric under the existing `cargo test` gate, plus the pure-CPU snapshots and invariants); step 3 (stored goldens) is v1.1. + +## Child files — reading order + table of contents + +Read in dependency order: the metric is the shared primitive every pixel tier consumes, so it comes first; the pure-CPU tiers next; the GPU tiers and their determinism substrate after; coverage last because it composes all of them. + +1. [`metric.md`](metric.md) — `buiy_verify::metric`: the AA-aware two-axis perceptual diff (`Diff`, `FuzzBudget`, `compare`, `Diff::passes`), pixelmatch-YIQ + AA-sibling exclusion, advisory MSSIM, the migration of the two naive metrics. **Read first.** +2. [`snapshots.md`](snapshots.md) — `buiy_verify::snapshot`: Tier 1 layout-number + Tier 2 display-list/paint-order `insta` dumps (purpose-built `Display`, not raw `Debug`/serde) + the `PackedInstance` byte-hex check. +3. [`invariants.md`](invariants.md) — `buiy_verify::invariant`: Tier 3 proptest scene generators + predicate fns (`paint_order_is_total`, `transform_roundtrips`, `top_layer_dominates`, `all_finite`, `bidi_caret_roundtrips`), pure-CPU. +4. 
[`reftests.md`](reftests.md) — `buiy_verify::reftest`: Tier 4 `RefCase`/`reftest!`/`run_reftest`, the reference-independence discipline + lint, and the CPU-vs-GPU SDF cross-check (Tier 4.5). GPU. +5. [`goldens.md`](goldens.md) — `buiy_verify::golden`: Tier 5 `assert_golden` persistence, multi-positive corpus, `BUIY_BLESS` workflow, HTML triage report, storage migration, the Ahem/real-font split. GPU. +6. [`determinism.md`](determinism.md) — `buiy_verify::determinism` + `buiy_core::render::golden`: `DeterministicApp`, the `GoldenConfig` extensions (font mode, DPR, MSAA/dither), the quiescence flush, the lavapipe CI pin vs. the local real-GPU lane. +7. [`coverage.md`](coverage.md) — `buiy_verify::coverage`: the BSN-fixture single-source-of-truth, the `Matrix` Cartesian product auto-enrolling every tier, and the live-catalog wiring of `forced_colors_analyzer`. + +## Prior art + +Each tier draws on a researched external system; consult these `lessons.md` decision files when implementing: + +- [`prior-art/wpt-reftests/lessons.md`](../../prior-art/wpt-reftests/lessons.md) — Tier 4: reftest `==`/`!=`, reference independence, two-axis fuzzy, `reftest-wait` settle, multiple-references aggregation. +- [`prior-art/vello/lessons.md`](../../prior-art/vello/lessons.md) — Tier 4.5: CPU-vs-GPU SDF cross-check (Buiy's oracle is *stronger* — one shared analytic function, kept permanently), and the FLIP-vs-pixelmatch per-tier-metric tension. +- [`prior-art/wgpu-testing/lessons.md`](../../prior-art/wgpu-testing/lessons.md) — `determinism`: the lavapipe pin recipe (`VK_DRIVER_FILES`, `WGPU_ADAPTER_NAME`, the `LP_NUM_THREADS` myth), the perceptual-metric migration, per-backend expectations. +- [`prior-art/skia-gold/lessons.md`](../../prior-art/skia-gold/lessons.md) — Tier 5: the `(widget, state, theme, viewport, backend, dpr)` key schema, multi-positive baselines, durable accept ledger, commit-keyed object store, local HTML triage, expiring ignores, stale-positive pruning. 
+- [`prior-art/flutter-golden-testing/lessons.md`](../../prior-art/flutter-golden-testing/lessons.md) — Tier 5: the Ahem/box-glyph layout-determinism font (UPM-1024), the two-tier obscure/real split, the engine-side shadow killswitch, `--update-goldens` curated accept, color-emoji as the irreducible golden. + +The open questions the report raised are resolved as decisions in [`open-questions.md`](open-questions.md). + +## Resolved during synthesis + +Cross-file inconsistencies reconciled while assembling this entry point (where a child file's claim was changed, it is noted; otherwise the synthesis adopts the child's flagged finding into the canonical contract): + +1. **`compose_transform` line is `:3775`, not `:3691`.** The SHARED API CONTRACT cited `:3691`; `invariants.md` flagged `:3775` (deviation #1) and it is verified on `origin/main` (`grep`-confirmed: `pub(super) fn compose_transform` at line 3775, `tier_rank` at 4113). The contract line is stale; **`:3775` is canonical**. No child-file edit needed (`invariants.md` already cites `:3775`). +2. **`PackedInstance.rect_size[1]` is POSITIVE on `main`.** The contract implied a deliberately-negative packed height (y-flip in the instance); `invariants.md` deviation #2 verified the y-flip moved into the per-view uniform (`render/instance.rs`), so packed height stays positive. Favorable: `all_finite_packed` asserts `rect_size[1] ≥ 0` directly with no un-flip. **Adopted.** +3. **`tier_rank` is promoted to `pub fn buiy_core::layout::top_layer_paint_rank(TopLayer) -> u8`.** `invariants.md` deviation #3: the `TopLayer` enum's *declared* order (`None, Modal, Popover, Tooltip, Fullscreen`) is NOT the paint order; the paint rank lives in a private closure `tier_rank` (`Fullscreen→0 … None→u8::MAX`). `top_layer_dominates` must compare via the rank, never the discriminant — so the rank is promoted to a single public source of truth consumed by both the layout sort and the invariant. 
**A small `buiy_core` surface add, accepted** (see `open-questions.md` § Contract reconciliation). +4. **`capture_to_image` is a re-runnable primitive, not one-shot-per-App.** `reftests.md` (Contract deviations) flagged that a reftest needs *two* captures sharing one `wgpu::Device` in one process, while `golden.md`/`determinism.md` spec a single `capture_to_image(&mut App, &GoldenConfig) -> RgbaImage`. Reconciled in favor of the existing signature: `capture_to_image` re-targets the offscreen camera and re-reads-back on *each* call against an already-built `App`, so reftest calls it twice on one `DeterministicApp::build()` output; `DeterministicApp::capture(self, fixture)` is the build+spawn+one-capture convenience wrapper goldens use. No new `capture_scene` shape is introduced. **Reconciled; see `open-questions.md` § Contract reconciliation.** +5. **`snapshot` resolves the contract's serde-"or" to the Display-dump branch only.** `snapshots.md` (Contract deviations) takes the purpose-built `Display` formatter exclusively and adds **no** serde derives to render types (the report's explicit anti-pattern is raw Debug/serde snapshots). `assert_display_list_snapshot` consequently takes `&NameLookup` (a `World`-free entity→`Name` map), not the contract's bare `(nodes, name)`. **Adopted into the contract.** +6. **`LP_NUM_THREADS` dropped as a determinism knob; `VK_ICD_FILENAMES` → `VK_DRIVER_FILES`.** `determinism.md` deviations 1 & 2, confirmed by `prior-art/wgpu-testing/lessons.md` (the `LP_NUM_THREADS` myth; the deprecated ICD env var). The contract's mention of `LP_NUM_THREADS` as a determinism setting is corrected — determinism comes from the *pinned Mesa version*. 
**Adopted.** diff --git a/docs/specs/2026-06-15-buiy-verification-design/coverage.md b/docs/specs/2026-06-15-buiy-verification-design/coverage.md new file mode 100644 index 0000000..4f1eb19 --- /dev/null +++ b/docs/specs/2026-06-15-buiy-verification-design/coverage.md @@ -0,0 +1,325 @@ +# Coverage-by-construction + +**Date:** 2026-06-15 +**Status:** draft +**Spec:** specs/2026-06-15-buiy-verification-design/README.md + +Stop hand-writing per-widget tests; derive them from the BSN/widget catalog. This +section specifies `buiy_verify::coverage`: a `Matrix` of global axes +(themes × viewports × forced_colors × dprs), a Cartesian product taken at +test-collection time over a fixture directory, so that adding **one** widget +fixture auto-enrolls it across **every** tier (layout snapshot, display-list +snapshot, invariant scenes, reftest pairings, golden corpus). It also re-points +`forced_colors_analyzer` from hand-built `CatalogPaint` at the live widget catalog +(follow-ups.md:462–481), making gate #11 fall out of the same enrollment. + +## Contract deviations + +None. This module is additive over the contract's `coverage` slot. One +clarification flagged for the synthesizer: the contract lists axes as +`{themes, viewports, forced_colors, dprs}`; this spec models `forced_colors` +and `dpr` as the two **`Mode`** axes (Chromatic-style, each cell gets its own +baseline) and `theme`/`viewport` as ordinary axes — a presentation grouping, not +a type change. The Cartesian product is over all four. + +## The fixture as single source of truth + +A **fixture** is a BSN scene factory plus a name — the catalog row, authored once. +It is the same `fn(&mut App)` shape every other tier consumes (`reftest::RefCase`, +`golden`, `snapshot`), so a fixture is enrollable everywhere with no adapter. + +```rust +// crates/buiy_verify/src/coverage/fixture.rs +pub struct Fixture { + /// Stable identity. Becomes the `widget` key component and the insta + /// snapshot stem. 
`lower-kebab`, unique within the corpus. + pub name: &'static str, + /// Spawns the scene into a deterministic app. MUST spawn a `Camera2d`, + /// MUST tag the widget root with a `Name` (entities are keyed by Name in + /// every dump — never by `Entity` bits; snapshots.md). One fixture = one + /// widget × state (the `state` key component is carried separately, below). + pub state: &'static str, + pub spawn: fn(&mut App), +} + +/// The corpus: every fixture, collected once. `inventory`-registered so a new +/// fixture file enrolls with zero edits to a central list (see Enrollment). +pub fn catalog() -> &'static [Fixture]; +``` + +Fixtures live under `crates/buiy_verify/fixtures/<widget>/<state>.rs` and register +via the `fixture!` macro (below). The BSN factory is the same code the +`hello_button` / `hello_text` examples already use (`examples/hello_button/src/main.rs` +spawns `Button::new("Save")`); the catalog is those spawns, named and enumerated. + +```rust +fixture! { + name = "button", + state = "resting", + spawn = |app| { app.world_mut().spawn((Name::new("button"), Button::new("Save"))); }, +} +``` + +The `state` axis (resting / hover / focus / pressed / disabled) is **per-fixture**, +not a global `Matrix` axis, because state is widget-specific (a `Button` has +`hover`; a static `Label` does not). It is encoded by spawning the widget already +in that state (e.g. inserting `Hovered`, `Focusable { focused: true }`, the +`Disabled` marker — all live components). One file per state keeps each fixture a +single scene. 
+ +## The Matrix — global axes, Cartesian product + +```rust +// crates/buiy_verify/src/coverage/matrix.rs +use buiy_core::render::golden::Dpr; // canonical Dpr (determinism.md); NOT a local f32 +pub struct Matrix { + pub themes: Vec<ThemeAxis>, // light, forced_colors (dark when it lands) + pub viewports: Vec<Viewport>, // logical (w,h): phone, tablet, desktop + pub forced_colors: Vec<bool>, // Mode axis: false, true + pub dprs: Vec<Dpr>, // Mode axis: Dpr::X1, Dpr::X2 (milliscale) +} + +#[derive(Clone, Copy)] +pub enum ThemeAxis { Light, ForcedColors } // -> theme.rs constructors +impl ThemeAxis { + pub fn build(self) -> Theme { // default_light_theme / forced_colors_theme + match self { Self::Light => default_light_theme(), Self::ForcedColors => forced_colors_theme() } + } + pub fn key(self) -> &'static str { /* "light" | "forced" */ } +} + +#[derive(Clone, Copy)] +pub struct Viewport { pub w: u32, pub h: u32, pub key: &'static str } + +impl Matrix { + /// The CI default. Conservative product; widen per axis with a documented + /// reason, never silently (mirrors the metric's fuzz-budget discipline). + pub fn ci_default() -> Self; + /// Cartesian product → one `Cell` per combination. Stable iteration order + /// (axis declaration order) so snapshot stems are deterministic. + pub fn cells(&self) -> impl Iterator<Item = Cell>; +} + +/// One enrolled combination. The product `Matrix × Fixture` is the full corpus. +#[derive(Clone, Copy)] +pub struct Cell { + pub theme: ThemeAxis, + pub viewport: Viewport, + pub forced_colors: bool, + pub dpr: Dpr, // canonical buiy_core::render::golden::Dpr +} +``` + +A `Cell` is **not** itself a key — it is half of one. 
The full key is +`Cell × Fixture`, which is exactly the contract's storage schema and Skia Gold's +params/traces identity (skia-gold/lessons.md §Borrow.2, +`(widget, state, theme, viewport, backend, dpr)`): + +```rust +// crates/buiy_verify/src/coverage/key.rs — the shared key for golden + snapshot stems +use buiy_core::render::golden::Dpr; // canonical Dpr (determinism.md) + +/// `dpr: Dpr` (milliscale, `Eq + Hash`) lets `CoverageKey` itself derive +/// `Eq + Hash` — so the `verify_keys_unique` self-test can collect keys into a +/// `HashSet` directly. The old `dpr: f32` made this impossible (`f32` is neither +/// `Eq` nor `Hash`), which is the bug that fix unblocks. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub struct CoverageKey { + pub widget: &'static str, // Fixture::name + pub state: &'static str, // Fixture::state + pub theme: &'static str, // ThemeAxis::key + pub viewport: &'static str, // Viewport::key + pub forced_colors: bool, + pub dpr: Dpr, // canonical buiy_core::render::golden::Dpr (Eq+Hash) + pub backend: Backend, // golden.rs Backend; "cpu" for snapshot tiers +} +impl CoverageKey { + pub fn for_cell(fx: &Fixture, cell: &Cell, backend: Backend) -> Self; + /// Canonical filename stem, e.g. `button.resting.forced.desktop.fc1.dpr2.lavapipe`. + /// Stable, lossless, ordered — retrofitting keys means re-baselining + /// everything (skia-gold/lessons.md §Avoid). Used as the golden PNG stem + /// and the insta snapshot suffix (`assert_snapshot!(key.stem(), …)`). + pub fn stem(&self) -> String; +} +``` + +`backend` enumerates `cpu` (Tiers 1–3, no GPU) and the GPU rasterizer name +(`lavapipe` in CI, `radv` locally); reserving it now avoids the painful retrofit +(skia-gold/lessons.md §Avoid: "retrofitting a single-golden assumption is painful"). + +## Enrollment — one body per tier, applied across the product + +Enrollment is the verb: each tier provides **one** generic body; the harness drives +it across `catalog() × Matrix::cells()`. 
No per-widget test code exists anywhere. + +```rust +// crates/buiy_verify/src/coverage/enroll.rs +/// Build a deterministic app for (fixture, cell): DeterministicApp builder +/// (determinism.md) with theme installed, viewport + DPR pinned, forced_colors +/// set on UserPreferences, then the fixture spawned. The `Dpr`→`f32` conversion +/// happens HERE at the capture boundary: `DeterministicApp::dpr(cell.dpr)` feeds +/// the pinned scale_factor as `cell.dpr.as_f32()` (the milliscale axis stays the +/// key; the window's `scale_factor` is the derived f32). +pub fn build_app(fx: &Fixture, cell: &Cell) -> App; + +/// Drive a tier body over the entire corpus. `body` receives the built app and +/// the key; it does the tier-specific assert (snapshot / invariant / golden). +pub fn enroll_all(matrix: &Matrix, body: impl Fn(App, CoverageKey)); +``` + +Each tier is a thin caller of `enroll_all`. The `insta` snapshot tiers use +`glob!` over the fixture directory as the contract requires — `glob!` is the +collection-time fan-out, and `enroll_all` multiplies each globbed fixture by the +`Matrix` cells: + +```rust +// crates/buiy_verify/tests/coverage_layout.rs (Tier 1, gate #5, pure CPU) +#[test] +fn layout_snapshots() { + enroll_all(&Matrix::ci_default(), |app, key| { + // snapshot.md::assert_layout_snapshot — ResolvedLayout dump, keyed + assert_layout_snapshot(&key.stem(), &app); + }); +} + +// crates/buiy_verify/tests/coverage_display_list.rs (Tier 2, pure CPU) +// body -> assert_display_list_snapshot(&key.stem(), &app) (snapshot.md) +// crates/buiy_verify/tests/coverage_invariants.rs (Tier 3, pure CPU) +// body -> for each invariant fn: assert on the built scene (invariant.md) +// crates/buiy_verify/tests/coverage_golden.rs (Tier 5, #[ignore], GPU) +// body -> let img = capture_to_image(&mut app, &cfg); +// assert_golden(&key.stem(), &img, &budget_for(&key)) (golden.rs) +``` + +The decisive property: **adding `fixtures/slider/resting.rs` enrolls a slider into +all five 
tiers at once** — layout snapshot, display-list snapshot, every Tier-3 +invariant, and (once a budget is set) a golden cell per `Matrix` combination — with +no edit to any test file. This is Chromatic's "modes multiply, each cell gets its +own baseline" made native to BSN (report §Cross-cutting → Coverage auto-generation). + +### The `glob!` ↔ `inventory` choice + +`insta::glob!` discovers fixtures by walking a directory of `.rs` (or `.ron` BSN) +files; `inventory` discovers them by link-time registration. The spec uses **both, +non-redundantly**: `inventory` builds `catalog()` (the typed `&[Fixture]` that the +GPU/invariant tiers iterate, since they are not file-driven), and `glob!` drives +the two `insta` snapshot tiers (its `cargo insta review` UX is the required accept +loop). The `fixture!` macro emits *both* an `inventory::submit!` and a +discoverable file, so the two views never drift. A `verify_catalog_matches_glob` +self-test (below) asserts they enumerate the identical set. + +## Wiring `forced_colors_analyzer` to the live catalog + +Today the gate-#11 analyzers consume hand-built `CatalogPaint` descriptors +(`render/forced_colors_analyzer.rs:21`; tests construct them literally, +`tests/render_forced_colors_analyzer.rs:11`). The seam is documented for +re-pointing once real painted components land (follow-ups.md:469–473) — and they +have: `Button::bundle` now spawns live `Background`/`Border`/`Corners`/`Radius` +(`buiy_widgets/src/button.rs:18,47`). The wiring is a **producer** that derives +`CatalogPaint` from the live catalog, leaving the analyzer and its tests unchanged +(the seam's stated contract — `forced_colors_analyzer.rs:10`): + +```rust +// crates/buiy_verify/src/coverage/forced_colors.rs +/// Walk the live catalog: for each fixture, build its app, query the spawned +/// `Background`/`Border`/`Outline` (+ shadow-only-delta) components off the +/// `Name`-tagged root, and project them into the existing `CatalogPaint`. 
+/// The analyzer (`analyze_forced_colors` / `analyze_shadow_only`) is called +/// unchanged — only its *input source* moves from hand-built descriptors to the live tree. +pub fn live_catalog_paint() -> Vec<CatalogPaint>; +``` + +```rust +// crates/buiy_verify/tests/coverage_forced_colors.rs (Tier 2, gate #11, pure CPU) +#[test] +fn live_catalog_has_no_forced_colors_violations() { + let catalog = live_catalog_paint(); // from the SAME fixtures + let theme = forced_colors_theme(); + assert!(analyze_forced_colors(&catalog, &theme).is_empty()); + assert!(analyze_shadow_only(&catalog).is_empty()); +} +``` + +Because `live_catalog_paint` reads the *same* fixture corpus as every other tier, +gate #11 auto-enrolls every new widget by construction — the report's stated goal +("wiring the existing `forced_colors_analyzer` to the live catalog makes #11 fall +out of the same enrollment", report §Cross-cutting). The residual forced-colors +*visual* half (the `BoxShadow` draw-skip, follow-ups.md:474–478) is a reftest, not +coverage's concern (reftests.md); coverage only enrolls the forced-colors **mode** +(`forced_colors: true` cell) into every tier so the visual reftest is itself +matrixed. **That visual reftest is BLOCKED on the unlanded `BoxShadow` +extract/draw path** (`extract_buiy_nodes` has no `BoxShadow` branch yet — reftests.md +§ authoring patterns); the structured `analyze_forced_colors` / `analyze_shadow_only` +gate here covers the rest now and does not depend on it. + +## Storage at scale — staged, designed now + +Per skia-gold/lessons.md, the corpus is in-repo PNGs (goldens.md owns persistence) +until scale hurts; coverage's job is to **make migration mechanical** by fixing the +`CoverageKey` schema now (skia-gold/lessons.md §Borrow.2: "fix the schema before +generating any goldens"). 
The matrix is the natural place to enforce +*combinatorial budget*: `Matrix::ci_default` is deliberately small (≈ 2 themes × 3 +viewports × 2 fc × 2 dpr = 24 cells/fixture), and a `cell_count()` assertion in the +self-test fails the build if the product exceeds a named ceiling — a planned +storage-migration trigger (report Open Q #6), not a surprise. Multi-positive +baselines and pruning are golden.md's concern; coverage only guarantees the key is +set-valued-ready (the stem is the key, one PNG per accepted digest). + +## Dependencies + +| Crate | Version | Status | `cargo deny` note | +|---|---|---|---| +| `insta` | `1` (workspace) | **new** (added by snapshot.md) | reuse; permissive (Apache-2.0/MIT). `glob!` feature already needed by snapshot tier. | +| `inventory` | `0.3` | **new** | distributed link-time registration for `catalog()`. MIT/Apache-2.0 — clears `cargo deny check`. Re-verify advisories before bump. Alternative considered: a hand-maintained `&[Fixture]` const (rejected — defeats "zero edits to enroll"). | + +No GPU-only or copyleft deps. `image`/`proptest` are already workspace deps and +consumed via the metric/invariant tiers, not added here. + +## Verification — testing the harness itself + +The coverage layer is meta-machinery, so it is tested by asserting its +*enumeration and keying*, independent of any tier's pass/fail: + +1. **`verify_catalog_matches_glob`** — `catalog()` (inventory) and the `glob!` + fixture-directory walk enumerate the identical `name×state` set. Guards the + dual-source-of-truth drift named above. +2. **`verify_keys_unique`** — over `catalog() × Matrix::ci_default().cells()`, + every `CoverageKey::stem()` is unique and round-trips (parse-back ≡ identity). + A collision means two cells would share a baseline — the silent-overwrite bug. 
+ `CoverageKey` now derives `Eq + Hash` (because `dpr: Dpr` is `Eq + Hash`, not + the old `f32`), so the keys themselves — not just their stems — collect into a + `HashSet` for the duplicate check. +3. **`verify_cell_count_under_ceiling`** — the product size is below the named CI + ceiling; tripping it forces an explicit budget decision (storage-migration + trigger, report Open Q #6). +4. **A deliberately-broken fixture** (`#[cfg(test)]` only) that paints a brand + token under forced-colors **must** produce a `NonSystemColor` violation through + `live_catalog_paint` → proves the live-catalog producer actually observes paint, + not a stale hand-built descriptor (the failure mode the re-pointing fixes). It + is excluded from the real `catalog()` so it never reds the production gate. +5. **Enrollment fan-out** — a stub tier body that pushes its `CoverageKey` into a + `Vec` asserts `enroll_all` invokes the body exactly `fixtures × cells` times + with no duplicate key — the Cartesian product is total and non-redundant. + +All five are pure-CPU and run under the headless gate; only `coverage_golden` +(which consumes the corpus) is `#[ignore]` GPU. + +## Sources + +- Code: `render/forced_colors_analyzer.rs:10,21,51,89` (the `CatalogPaint` seam + analyzers); + `tests/render_forced_colors_analyzer.rs:11` (hand-built descriptors today); + `buiy_widgets/src/button.rs:18,47` (live `Background`/`Border` catalog); + `theme.rs:62,110` (`default_light_theme` / `forced_colors_theme`); + `theme.rs:56` (`UserPreferences.forced_colors`); `components.rs:25` (`ResolvedLayout`); + `render/extract.rs:139` (`ExtractedNodes.scale_factor` = DPR axis); + `examples/hello_button/src/main.rs` (the spawn that fixtures generalize); + follow-ups.md:462–481 (the forced-colors live-catalog seam). +- Prior art: `prior-art/skia-gold/lessons.md` §Borrow.2 (params/traces key schema), + §Borrow.3 (set-valued baselines), §Avoid (retrofit-keys / stale-positive pitfalls). 
+- Report: `reports/2026-06-14-visual-bug-detection-strategy.md` + §Cross-cutting → "Coverage auto-generation from the catalog/BSN" (Chromatic modes, + matrix enrollment, #11 falling out of the same enrollment); Open Q #6 (storage trigger). +- Sibling specs: `snapshots.md` (`assert_layout_snapshot` / `assert_display_list_snapshot`, + `glob!`), `invariants.md` (Tier-3 predicates), `goldens.md` (`assert_golden`, persistence, + `Backend`, multi-positive), `determinism.md` (`DeterministicApp`, DPR/clock pin), + `metric.md` (`FuzzBudget`). diff --git a/docs/specs/2026-06-15-buiy-verification-design/determinism.md b/docs/specs/2026-06-15-buiy-verification-design/determinism.md new file mode 100644 index 0000000..393896a --- /dev/null +++ b/docs/specs/2026-06-15-buiy-verification-design/determinism.md @@ -0,0 +1,196 @@ +# Determinism stack + +**Date:** 2026-06-15 +**Status:** draft +**Spec:** specs/2026-06-15-buiy-verification-design/README.md + +The job of this tier is narrow and load-bearing: **make every pixel test reproducible so the diff is a signal, not noise.** It engineers nondeterminism out at the source ("remove the nondeterminism, don't just tolerate it") so the perceptual metric's default fuzz budget can be `(0, 0)`. It extends the *already-built* flake triad (`GoldenConfig::deterministic()` — `fixed_clock`/`wait_for_fonts`/`warm_atlas`, `golden.rs:38`) with the missing knobs — Ahem font mode, DPR pin, MSAA/dither pinned-off, async-asset flush — exposes them through a `DeterministicApp` builder in `buiy_verify`, and pins the CI software rasterizer (lavapipe) below all of it. Reftests need this stack *less* than goldens (both halves render in one process, so residual drift cancels) but reuse the same builder. + +## Contract deviations + +Two deviations from the SHARED API CONTRACT, both forced by verified prior-art (`prior-art/wgpu-testing/determinism-rasterizer.md` § "The `LP_NUM_THREADS` myth"): + +1. 
**`LP_NUM_THREADS=0` is NOT a determinism knob — dropped.** The contract and the report (§ Cross-cutting) list it as a determinism setting. wgpu's `install-mesa/action.yml` does **not** set it, and Mesa documents it only as a thread-count perf knob, never a determinism one; llvmpipe tiles per-thread so output is stable regardless of thread count. Determinism comes from the **pinned Mesa version**, not thread count. This spec does not export `LP_NUM_THREADS` and the plan author must not add it expecting FP determinism. (It may still be set to `1` as a *defensive belt-and-suspenders* with a comment that it is not the determinism source — optional, not asserted.) +2. **`VK_ICD_FILENAMES` → `VK_DRIVER_FILES`.** The contract names `VK_ICD_FILENAMES`. That variable is deprecated; the modern Vulkan-loader variable is `VK_DRIVER_FILES` (Mesa envvars; wgpu migrated). This spec uses `VK_DRIVER_FILES` (loader still honors the old name, but new CI wiring should not encode a deprecated path). + +## Where the code lives + +The crate split follows the contract: **app-coupled capture stays in `buiy_core::render`**, **pure config/builder lives in `buiy_verify::determinism`**. + +- `buiy_core::render::golden` — extend `GoldenConfig` (below), define the **canonical `Dpr` type** (the single definition site every other tier imports — see § "Extending `GoldenConfig`"), and promote the capture entry point `capture_to_image(&mut App, &GoldenConfig) -> image::RgbaImage` out of `tests/support/mod.rs` into `golden.rs` src (the contract's shared seam; consumes the existing `gpu_render_app_scaled` / `wait_for_text_ready` / `readback_rgba` machinery, `tests/support/mod.rs:156`/`:266`/`:353`). +- `buiy_verify::determinism` — the `DeterministicApp` builder and the asserted-setup-step checklist. It *re-exports* the `FontMode`/`Dpr` config types from `buiy_core::render::golden` (their canonical home, since `GoldenConfig` carries them) rather than redefining them. 
Pure / app-independent: it *configures* an `App`, it does not own the GPU. +- CI rasterizer pin — a composite GitHub Action under `.github/actions/install-mesa/` + an env contract; not Rust, but specified here so the plan author wires it. + +## Extending `GoldenConfig` (`buiy_core::render::golden`) + +`GoldenConfig` keeps its three landed booleans and grows the four missing axes. New fields default to their deterministic value in `deterministic()`; the struct stays `Copy` (all fields are `Copy`). + +```rust +/// Deterministic-capture configuration. Extends the landed flake triad +/// (fixed_clock / wait_for_fonts / warm_atlas) with the font, DPR, sampling, +/// and asset-flush axes that the determinism spec adds. +#[derive(Clone, Copy, Debug)] +pub struct GoldenConfig { + // --- landed triad (unchanged) --- + pub fixed_clock: bool, + pub wait_for_fonts: bool, + pub warm_atlas: bool, + pub accept: bool, + // --- determinism additions --- + /// Collapse the font axis. `Real` rasterizes the fixture's actual fonts + /// (the narrow fidelity suite); `Ahem` substitutes the em-box font so any + /// text-bearing golden is byte-identical across hosts (§ Ahem mode). + pub font_mode: FontMode, + /// Device-pixel-ratio pin. A 1× vs 2× render is a *different rasterization*, + /// not a tolerance — captured as a fixture axis, never fuzzed (§ DPR pin). + pub dpr: Dpr, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum FontMode { Real, Ahem } + +/// **Canonical `Dpr` definition site.** Device-pixel-ratio as *integer +/// milliscale* (1000 = 1.0×, 2000 = 2.0×) so the type is `Eq + Hash + Ord` +/// without float pitfalls — it is a *fixture axis* that must key a golden / +/// coverage cell, never a tolerance. Defined ONCE here in +/// `buiy_core::render::golden`; `goldens.md` (`GoldenKey.dpr`) and `coverage.md` +/// (`Matrix.dprs` / `Cell.dpr` / `CoverageKey.dpr`) import this type, they do +/// **not** redefine it. 
The capture boundary converts the window's `f32` +/// `scale_factor` via `Dpr::from_f32` and back via `Dpr::as_f32` when sizing the +/// offscreen target. +/// +/// Derives `serde::{Serialize, Deserialize}` so `goldens.md`'s `GoldenKey` / +/// `BlessLedger` can persist it in the bless ledger without re-wrapping. The +/// `serde` derive is feature-gated in `buiy_core` only if needed; `buiy_core` +/// already carries `serde` as a workspace dep. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, + serde::Serialize, serde::Deserialize)] +pub struct Dpr(pub u32); +impl Dpr { + pub const X1: Self = Dpr(1000); + pub const X2: Self = Dpr(2000); + /// Round an `f32` scale factor to integer milliscale (e.g. `1.0 → Dpr(1000)`). + pub fn from_f32(scale: f32) -> Self { Dpr((scale * 1000.0).round() as u32) } + /// Back to the `f32` scale factor the window/extract path uses. + pub fn as_f32(&self) -> f32 { self.0 as f32 / 1000.0 } +} + +impl GoldenConfig { + pub fn deterministic() -> Self { + Self { + fixed_clock: true, wait_for_fonts: true, warm_atlas: true, accept: false, + font_mode: FontMode::Ahem, // layout goldens collapse the font axis by default + dpr: Dpr::X1, + } + } + /// The real-glyph fidelity variant: Ahem off, everything else pinned. + pub fn fidelity() -> Self { Self { font_mode: FontMode::Real, ..Self::deterministic() } } +} +``` + +**MSAA / dither are pinned as constants, not config.** They are *never* a per-fixture knob — a golden captured with MSAA on is non-comparable with one captured off. They live as module constants the capture path asserts: + +```rust +/// Single-sampled: the 4× MSAA resolve antialiases edges nondeterministically +/// across drivers. Buiy's in-shader analytic AA is deterministic given identical +/// FP, so MSAA buys nothing here and costs determinism. Mirrors the existing +/// `spawn_capture_camera`'s `Msaa::Off` (tests/support/mod.rs:229). 
+pub const CAPTURE_MSAA: bevy::render::view::Msaa = bevy::render::view::Msaa::Off; +/// Deband dither perturbs the low bits of the tonemapped output. Capture cameras +/// pin it off; assert no `DebandDither::Enabled` on the capture camera. +pub const CAPTURE_DITHER_OFF: bool = true; +``` + +## DPR pin + +`ExtractedNodes.scale_factor` already carries the ratio (filled from the primary window, `extract.rs:606`; default `1.0`, `extract.rs:156`), and `gpu_render_app_scaled(logical_w, logical_h, scale_factor)` already builds an app at an explicit `with_scale_factor_override` (`tests/support/mod.rs:156`–`:161`). The pin is therefore **plumbing that exists** — the determinism contribution is to make it an *asserted* capture invariant: `capture_to_image` sizes the offscreen target to `logical × dpr` physical pixels (the existing scaled-builder contract) and asserts `ExtractedNodes.scale_factor == cfg.dpr.as_f32()` before readback (the `f32`→`Dpr` conversion lives at this capture boundary). DPR is a *fixture axis* fed by `coverage::Matrix.dprs`, never a tolerance widening. + +## Ahem font mode (collapse the font axis) + +The bulk of text-bearing goldens test *boxes*, not glyphs; real glyph rasterization is the canonical per-platform flake source (Flutter's entire `matchesGoldenFile` Ahem trick, `prior-art/flutter-golden-testing/obscure-text-font.md`). `FontMode::Ahem` substitutes a bundled Ahem face (every glyph a solid em-square box) so any non-fidelity golden is byte-identical across hosts; the narrow fidelity suite runs `FontMode::Real`. + +- **Asset:** a committed `Ahem.ttf` (public domain / CC0-1.0, the WPT/Web-Platform Ahem) under `crates/buiy_core/tests/fixtures/fonts/`, alongside the existing per-script subsets (`tests/fixtures/fonts/`). License file beside it, mirroring the `OFL-*.txt` precedent.
+- **Wiring:** `DeterministicApp` registers it through the production bytes path — `FontRegistry::register_bytes("Ahem", ahem_bytes, FontFaceDescriptors::default())` (`registry.rs:165`) — under family name `"Ahem"`, and when `font_mode == Ahem` makes it the **sole resolvable family** for fixture text so fallback cannot reintroduce a platform font. Concretely: the deterministic app disables system-font loading (fixtures already run bundled-only; `fixture_font_bytes`/`register_fixture_font`, `tests/support/mod.rs:292`/`:306`) and the fixture's BSN sets `font-family: Ahem`. This is a *capture-time* substitution; the shaping `.snap` fixtures and the real-glyph fidelity suite are unaffected (they pin `FontMode::Real`). +- **Boundary (Open Q #7 in the report):** which goldens are Real vs Ahem is a per-fixture declaration on the fixture, not global. Default Ahem; opt into Real only for the fidelity suite (glyph hinting/subpixel, color-emoji, decorations). + +## Async-asset flush to quiescence + +`wait_for_fonts` covers fonts; the general invariant is **zero pending assets before readback** (a half-streamed image or shader flips the diff). `capture_to_image` drives `app.update()` until quiescence, asserting all four conditions, then captures: + +```rust +/// All must hold before the readback frame, in `capture_to_image`: +/// 1. asset_server pending loads == 0 (no in-flight Image/Shader/Font load) +/// 2. AtlasWarmupQueue::is_empty() (warm_atlas; golden.rs:87) +/// 3. fonts_ready(atlas, warmup, &keys) (wait_for_fonts; golden.rs:82) +/// 4. PipelineCache has no Queued/Compiling Buiy pipeline (shaders ready) +/// Bounded by MAX_SETTLE_FRAMES; panic with which condition never held. +``` + +This generalizes the existing `wait_for_text_ready` poll (`tests/support/mod.rs:266`, conditions 2+3) by adding the asset-server (1) and pipeline-cache (4) gates. 
The fixed clock means the loop terminates deterministically: time is advanced by `Time::<Virtual>::advance_by` (the landed manual-clock mechanism, `tests/text_caret_selection.rs:178`), never `Instant::now()`, so `fixed_clock` is "drive `Time` at explicit virtual timestamps." + +## `DeterministicApp` builder (`buiy_verify::determinism`) + +The single public seam every GPU tier (reftest, golden) constructs its app through. It owns the *setup* (knob application + the asserted checklist); `capture_to_image` in `buiy_core` owns the *capture*. + +```rust +pub struct DeterministicApp { cfg: GoldenConfig, logical: (u32, u32) } + +impl DeterministicApp { + /// Default-deterministic at a logical viewport size. + pub fn new(logical_w: u32, logical_h: u32) -> Self; + pub fn with(mut self, cfg: GoldenConfig) -> Self; // override the config + pub fn font_mode(self, m: FontMode) -> Self; + pub fn dpr(self, dpr: Dpr) -> Self; + + /// Build a painting-capable headless App with every knob applied: + /// - `gpu_render_app_scaled(w, h, cfg.dpr.as_f32())` (DPR pin) + /// - `TimeUpdateStrategy::ManualDuration(0)` + Time driven manually + /// - registers Ahem and makes it sole family when font_mode == Ahem + /// - capture camera spawned at CAPTURE_MSAA, dither off + /// Returns an App ready for fixture spawn; NOT yet finished (caller finishes). + pub fn build(self) -> bevy::app::App; + + /// `build` + spawn the fixture + `capture_to_image(&mut app, &cfg)`. The one-call + /// path tiers use. Internally asserts the four quiescence conditions. + pub fn capture(self, fixture: impl FnOnce(&mut App)) -> image::RgbaImage; +} +``` + +`build` is a thin, **single-bodied** wrapper over the landed `gpu_render_app_scaled` so it cannot drift from the canonical plugin stack (the same anti-drift discipline `gpu_render_app_with_resolution` already enforces, `tests/support/mod.rs:168`). + +## CI software-rasterizer pin (lavapipe) vs.
the local real-GPU lane + +**The argument:** Buiy owns its renderer, so it pins **one** canonical software rasterizer and stores **one golden per cell** — no per-OS/per-GPU matrix. A rolling distro rasterizer is a moving reference image (wgpu abandoned `ppa:oibaf` for exactly this; `prior-art/wgpu-testing/determinism-rasterizer.md`). + +- **Rasterizer:** Mesa **lavapipe** (`libvulkan_lvp.so`), consumed as a **version-pinned, self-built artifact** (reuse `gfx-rs/ci-build`'s prebuilt tarball directly — no need to build our own Mesa). Pin `MESA_VERSION` + `ci-binary-build` tag explicitly; bump deliberately in a tracked issue, regenerating affected goldens in the same PR. +- **Adapter selection (env contract):** a composite action writes its **own** ICD JSON (the upstream ICD path is build-host-absolute) and exports: + - `VK_DRIVER_FILES=$PWD/icd.json` — loader sees *only* lavapipe; cannot pick a hardware GPU. + - `WGPU_ADAPTER_NAME=llvmpipe` — case-insensitive substring nails the exact device (`initialize_adapter_from_env`). +- **NOT set:** `LP_NUM_THREADS` (see Contract deviation 1). +- **Local real-GPU lane (this host, AMD RX 6700 XT / RADV):** the `#[ignore]` GPU tests run on real hardware locally and in the separate GPU-verify campaign. **Division of labor (cemented):** CI goldens run on pinned lavapipe (the stored-baseline gate); real-hardware shader/AA/blend paths are covered by the GPU-verify campaign, *not* a CI gate. The local lane does **not** compare against the stored lavapipe baseline (cross-rasterizer pixels are non-comparable) — it runs the determinism / reftest checks, which are rasterizer-internal-invariant, not baseline. + +**One canonical rasterizer ⇒ one golden per cell.** The Tier-5 key schema `(widget, state, theme, viewport, backend, dpr)` (golden tier) carries `backend` for forward-compat, but with a single pinned Vulkan/lavapipe rasterizer the `backend` axis is a constant today — collapsing the worst combinatorial multiplier. 
Cross-backend goldens are out of scope (the pinned-rasterizer guarantee holds within one backend only). + +## Reftests need this LESS than goldens + +A reftest renders **both** halves in one process (one device, driver, clock, atlas, font stack). Every platform-variance term is *shared* and therefore cancels in the diff — so reftests tolerate a residual the determinism stack has not yet eliminated, and their default fuzz budget can stay `(0,0)` even before the lavapipe pin lands. They still **reuse** `DeterministicApp` (same fixed clock, same Ahem option, same quiescence flush) for *intra-run* stability (e.g. atlas warmup must complete before *either* half captures). The CI rasterizer pin is a hard prerequisite for **stored goldens** (the baseline must be bit-reproducible across runs and machines) and only a *nice-to-have* for reftests. This is why the report builds reftests first and the lavapipe pin in the golden step. + +## Dependencies + +- **No new Rust crate** is required by this tier. `image = "0.25"` (workspace) supplies `RgbaImage`; `bevy = "0.18"` supplies `Time`, `TimeUpdateStrategy`, `Msaa`. The Ahem `.ttf` is a committed test fixture, not a dependency. +- **`insta`** and the **perceptual-metric crate** are added by the snapshot/metric tiers, not here. If a plan author adds either, run `cargo deny check` (config at repo-root `deny.toml`) before committing — the project gates new deps on it. +- **CI action** `gfx-rs/ci-build` is consumed as a *release artifact*, not a crate dep; it carries no `cargo deny` surface. The pinned Mesa version is recorded in the action YAML. + +## Verification + +How the determinism harness verifies *itself* (these are tests of the test infra, runnable in CI): + +1. **Idempotent-capture (pure-CPU + GPU lanes).** `capture_to_image` of the same fixture twice in two fresh `DeterministicApp`s ⇒ `metric::compare(a, b, default)` passes at budget `(0, 0)`. 
This is the landed "re-capture IS the golden" check (`render_golden_harness.rs`) re-expressed against the unified metric and the new builder — the direct proof the knobs actually pin the output. GPU (`#[ignore]`). +2. **Knob-sensitivity (negative tests).** Flipping each knob *changes* the bytes: `dpr(X1)` vs `dpr(X2)` of the same fixture differ; `FontMode::Real` vs `FontMode::Ahem` differ for a text fixture; a fixture with MSAA forced on differs from `CAPTURE_MSAA`. Proves the knobs are load-bearing, not no-ops. GPU (`#[ignore]`). +3. **Quiescence assertions fire.** Inject a never-loading asset / an undrained warmup queue and assert `capture` panics naming the unmet condition (1–4 above) — proves the flush gate cannot be silently skipped (the wgpu "implicit golden bootstrapping" Avoid: fail loudly, never green on a missing precondition). +4. **Clock determinism.** Assert `capture` uses `Time` and never reads wall time: a fixture whose visual depends on time captures identically across two runs at the same virtual timestamp; a test grep/lint forbids `Instant::now()` in the capture path. +5. **CI-pin smoke (CI-only).** On the lavapipe leg, assert the selected adapter name contains `llvmpipe` (env wiring took effect) before any golden runs — a one-line guard that the rasterizer pin is active, not silently falling back to a hardware adapter. + +## Sources + +Code: `crates/buiy_core/src/render/golden.rs:18`–`:88` (GoldenConfig, deterministic(), fonts_ready); `crates/buiy_core/tests/support/mod.rs:156` (gpu_render_app_scaled), `:161` (with_scale_factor_override), `:229`/`:237` (Msaa::Off capture camera), `:266` (wait_for_text_ready quiescence poll), `:292`/`:306` (bundled-font registration), `:353` (readback_rgba); `crates/buiy_core/src/render/extract.rs:156`/`:606` (scale_factor default + fill); `crates/buiy_core/src/text/registry.rs:165` (register_bytes); `crates/buiy_core/tests/text_caret_selection.rs:178` (Time::advance_by). 
Prior-art: `docs/prior-art/wgpu-testing/{lessons.md,determinism-rasterizer.md}` (lavapipe pin, VK_DRIVER_FILES, the LP_NUM_THREADS myth); `docs/prior-art/flutter-golden-testing/obscure-text-font.md` (Ahem). Report: `docs/reports/2026-06-14-visual-bug-detection-strategy.md` § Cross-cutting mechanisms ("Deterministic-rendering stack for wgpu CI"). diff --git a/docs/specs/2026-06-15-buiy-verification-design/goldens.md b/docs/specs/2026-06-15-buiy-verification-design/goldens.md new file mode 100644 index 0000000..7b6cbcb --- /dev/null +++ b/docs/specs/2026-06-15-buiy-verification-design/goldens.md @@ -0,0 +1,344 @@ +# Tier 5 — golden persistence + triage + +**Date:** 2026-06-15 +**Status:** draft +**Spec:** specs/2026-06-15-buiy-verification-design/README.md + +Tier 5 is the stored-baseline regression tier for the irreducible rasterization +residue — what Tiers 1–4 provably cannot reach: SDF corner AA (beyond the CPU +cross-check), the drop-shadow Gaussian kernel, glyph/color-emoji atlas output, +the effect compositor, blend/gamma, and the forced-colors *visual* residual. +This file specifies `assert_golden` persistence against a `tests/goldens/` +corpus keyed `widget × state × theme × viewport × backend × dpr` with +set-valued (multi-positive) baselines, the `BUIY_BLESS` accept-FILE workflow +(modeled on `BUIY_ACCEPT_SHAPING`), a self-contained offline HTML triage report ++ diff-PNG emit, the in-git→object-store storage migration, and the +Ahem/obscure-text split that keeps real glyphs out of *layout* goldens. It is +deliberately the smallest tier (report §Tier5); the renderer and capture path +already exist — what is missing is the corpus, the persistence machinery, and +the curated set. + +## Contract deviations + +None. 
This file consumes `buiy_verify::metric` (`Diff`/`FuzzBudget`), +`determinism::DeterministicApp`, and the promoted +`buiy_core::render::golden::capture_to_image` exactly as the shared contract +defines them, and extends `GoldenConfig` only as `determinism.md` already +mandates. `assert_golden` matches the contract signature +(`name, &RgbaImage, &FuzzBudget`); the `GoldenKey`, `BlessLedger`, and +HTML-report types below are additive and live entirely in `buiy_verify::golden`. + +## Module: `buiy_verify::golden` + +GPU-coupled (`#[ignore]`, GPU lane — CLAUDE.md). Capture is delegated to +`buiy_core` (the promoted `capture_to_image`); everything else here is pure CPU +and unit-testable without an adapter. + +```rust +// crates/buiy_verify/src/golden.rs + +use image::RgbaImage; +use crate::metric::{compare, CompareOpts, Diff, FuzzBudget}; +use buiy_core::render::golden::Dpr; // canonical Dpr — defined in determinism.md + +/// The trace identity (Skia-Gold "params/traces"; skia-gold/lessons §Borrow 2). +/// FIXED before any golden is generated — retrofitting keys re-baselines the +/// whole corpus. Ordered fields drive a stable on-disk path + the report. +#[derive(Clone, Debug, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)] +pub struct GoldenKey { + pub widget: String, // catalog fixture id (BSN gallery entry) + pub state: String, // default | hover | focus | pressed | disabled + pub theme: String, // light | dark | high-contrast | forced-* + pub viewport: String, // named viewport (e.g. 
"sm" 360x640) + pub backend: Backend, // CPU(lavapipe) | Vulkan | Gl | Metal | Dx12 + pub dpr: Dpr, // canonical buiy_core::render::golden::Dpr (milliscale) +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)] +pub enum Backend { Lavapipe, Vulkan, Gl, Metal, Dx12 } + +// `Dpr` is the canonical type from `buiy_core::render::golden` (defined in +// determinism.md): integer milliscale (1000 = 1×, 2000 = 2×), `Eq + Hash + Ord` +// so the key compares/sorts without float pitfalls. Imported above; NOT +// redefined here. It already derives `serde::Serialize`/`Deserialize` at its +// definition site, so `GoldenKey`'s derives are satisfied. + +impl GoldenKey { + /// `widget/state/theme__viewport__backend__dpr` — directory per widget + /// keeps a fixture's whole row of cells together for review. Slug-safe; + /// no raw `Debug`. + pub fn slug(&self) -> String { /* deterministic, lower-kebab */ } + /// Corpus directory holding `<slug>.<n>.png` (n = positive index) + the + /// `<slug>.toml` ledger. Default `crates/buiy_verify/tests/goldens/`. + pub fn dir(&self, root: &std::path::Path) -> std::path::PathBuf { /* root.join(self.slug parts) */ } +} +``` + +### `assert_golden` — the public entry point + +```rust +/// Compare `actual` against the stored multi-positive baseline set for `key`, +/// gated by `budget`. On `BUIY_BLESS=1` this *blesses* instead of asserting +/// (see below). On a non-bless failure: writes the diff PNG, appends an HTML +/// triage card, and panics with the report path. Contract alias +/// `assert_golden(name, &RgbaImage, &FuzzBudget)` takes a pre-built key. +pub fn assert_golden(key: &GoldenKey, actual: &RgbaImage, budget: &FuzzBudget); + +/// The same comparison without the panic — for the harness's own tests and for +/// the coverage matrix driver that collects many cells before reporting.
+pub fn check_golden(key: &GoldenKey, actual: &RgbaImage, budget: &FuzzBudget) -> GoldenOutcome; + +pub enum GoldenOutcome { + /// `actual` matched at least one stored positive within `budget`. + Pass { matched_positive: usize, diff: Diff }, + /// No positive matched. Carries the best (smallest-`Diff`) candidate so the + /// report can show the *closest* baseline, not an arbitrary one. + Fail { best: Option<(usize, Diff)>, report: std::path::PathBuf }, + /// `BUIY_BLESS=1`: wrote a new/updated positive. Never reached in CI + /// (`BUIY_BLESS` unset ⇒ env-gated, mirrors `BUIY_ACCEPT_SHAPING`). + Blessed { positive: usize, was_new: bool }, +} +``` + +**Set-valued match (multi-positive).** A key maps to a *set* of accepted PNGs, +not one (Skia Gold "many positives per config"; skia-gold/lessons §Validates). +`check_golden` compares `actual` against each positive via +`metric::compare(actual, positive, &CompareOpts::default())` and passes if +*any* `Diff::passes(budget)`. This is essential for residual GPU AA jitter that +the determinism pin reduces but does not eliminate. Default budget after the +determinism pin is `FuzzBudget { max_channel_delta: 0, max_diff_pixels: 0 }` +(determinism.md); widen per-fixture with a documented reason in the ledger +(Mozilla `fuzzy-if`, "ranges must not include 0" — report §Cross-cutting). + +**Stale-positive guard.** Multi-positive accumulates stale entries silently — a +real regression can match an old wrong positive (skia-gold/lessons §Avoid). The +ledger records, per positive, the blessing commit + timestamp + a one-line +reason; `cargo run -p buiy_verify --bin golden-prune` lists positives unmatched +by any recent run for human removal. Pruning is *advisory*, never automatic. + +### The bless ledger (persistence) + +```rust +/// `<slug>.toml` beside the PNGs — the durable accept ledger reg-suit lacks +/// (skia-gold/lessons §Avoid "implicit-in-git-history acceptance"; §Borrow 1).
+#[derive(serde::Serialize, serde::Deserialize)] +pub struct BlessLedger { + pub key: GoldenKey, + pub positives: Vec<Positive>, // index i ⇒ `<slug>.i.png` +} +#[derive(serde::Serialize, serde::Deserialize)] +pub struct Positive { + pub file: String, // `<slug>.0.png` + pub blessed_commit: String, // `git rev-parse HEAD` at bless time + pub blessed_at: String, // RFC3339 + pub budget: FuzzBudget, // per-fixture widened budget + its reason + pub reason: String, // why this positive (or why widened) +} +``` + +### `BUIY_BLESS` accept-FILE workflow + +Replaces the inline "re-capture IS the golden" discipline of the current +`#[ignore]` GPU tests (which assert `perceptual_diff < 1e-4` between two fresh +captures — a *determinism* check, not a stored regression). Modeled exactly on +`BUIY_ACCEPT_SHAPING` (`tests/text_shaping_snapshots.rs:296`): + +- `BUIY_BLESS` **unset** (CI + default): `assert_golden` reads the baseline set, + fails closed if the corpus has no positive (`panic!` instructing the dev to + bless + review + commit — verbatim shape of the shaping panic at + `text_shaping_snapshots.rs:301`). +- `BUIY_BLESS=1`: `assert_golden` writes `actual` as a new positive (or replaces + positive 0 when `BUIY_BLESS_REPLACE=`), updates the ledger, and returns + `Blessed`. **Then the human reviews the PNG diff in the PR and commits it** — + blessing is an explicit, reviewable, diffable act, never a silent overwrite + (Flutter `--update-goldens` + pre-submit triage; flutter-golden/lessons + §Borrow 4). One canonical invocation, documented in the module header: + + ```sh + BUIY_BLESS=1 cargo test -p buiy_verify --test goldens -- --ignored --test-threads=1 + ``` + +### Diff-PNG + self-contained HTML triage report + +Offline-first, no SaaS (project ethos; skia-gold/lessons §Borrow 6 +reg-cli/x-img-diff-js). On any `Fail`, the harness: + +1.
Writes `target/buiy-goldens/<slug>.diff.png` — the `Diff::diff_image` + heatmap from `metric` (already produced by `compare` when + `CompareOpts::default()` requests it). +2. Appends a card to a single self-contained `target/buiy-goldens/report.html` + (one file per `cargo test` run, all failing cells accumulated): + +```rust +pub struct TriageReport { path: std::path::PathBuf, cards: Vec<TriageCard> } +pub struct TriageCard { + pub key: GoldenKey, + pub actual_png: Vec<u8>, // base64-inlined ⇒ self-contained, CI-artifact-portable + pub baseline_png: Vec<u8>, // the closest positive (GoldenOutcome::Fail.best) + pub diff_png: Vec<u8>, + pub diff: Diff, // differing_pixels / max_channel_delta / mssim + pub budget: FuzzBudget, +} +impl TriageReport { + pub fn open_or_create(path: &std::path::Path) -> Self; + pub fn push(&mut self, card: TriageCard); + /// Emit one HTML file: side-by-side, toggle-overlay (JS opacity slider), + /// and diff-heatmap views per card, all PNGs base64-inlined. No external + /// assets, no network — openable straight from CI artifacts. + pub fn write(&self) -> std::io::Result<()>; +} +``` + +The HTML embeds three views per card (skia-gold/lessons §Borrow 6): +side-by-side expected|actual, a slider/toggle overlay, and the diff heatmap. +Triage = human eyeballs it, then runs the `BUIY_BLESS=1` command to promote +actual→positive. **Borrowed primitives, deferred to follow-ups** (not v1): +Skia-Gold time-boxed ignore rules (a `[[ignore]]` block in the ledger with an +RFC3339 `expires`, for an expected mass change like a font roll) and Argos-style +flaky auto-ignore (min-occurrences heuristic) — design hooks named, machinery +deferred (skia-gold/lessons §Borrow 5, 8). + +## Capture: promote `capture_to_image` into `buiy_core` + +The pure/GPU split (shared contract): the device-coupled capture lives in +`buiy_core::render::golden`, callable by `buiy_verify`.
Promote the +`render_to_image`/`readback_rgba`/`spawn_capture_camera` triad from +`tests/support/mod.rs:204,353,229` into a library fn: + +```rust +// crates/buiy_core/src/render/golden.rs (new, src — not tests) +/// Render `app` to an offscreen Rgba8UnormSrgb target sized to the window's +/// physical pixels and read it back as an `image::RgbaImage`. Honors +/// `cfg.wait_for_fonts` (drives frames until `fonts_ready`, support/mod.rs:266) +/// before capture. The single GPU-coupled primitive every Tier-4/5 test shares. +pub fn capture_to_image(app: &mut bevy::app::App, cfg: &GoldenConfig) -> image::RgbaImage; +``` + +This adds `image = "0.25"` (already a workspace dep) to `buiy_core`; no new +crate. The existing naive `perceptual_diff` (`golden.rs:56`) is deprecated — +its callers move to `buiy_verify::metric::compare` (shared contract). + +## The Ahem / obscure-text split — keep real glyphs out of *layout* goldens + +Two classes of golden, per the Flutter/Alchemist two-class trick +(flutter-golden/lessons §Validates, §Borrow 1, 3): + +- **Layout-determinism class (the bulk).** Text-bearing goldens that test + *boxes*, not glyph fidelity, render under `BUIY_TEST_FONT` — a clean-room + box-glyph font (UPM **1024**, pinned ascent/descent 0.75/0.25 em, line-gap 0, + every glyph a solid em-box). Power-of-2 UPM makes metrics integer-exact and + font-engine-agnostic — boxes alone are not enough (flutter-golden/lessons + §Avoid "boxes instead of curves"). This collapses the font axis: any + layout-class golden is byte-identical across hosts. Wired through the same + `FontRegistry::register_bytes` path the shaping fixtures use + (`support/mod.rs:306`); selected by `DeterministicApp::test_font()` + (determinism.md). Shadows in this class swap to a flat fill via + `BUIY_DISABLE_SHADOWS` (engine-side, release-safe — flutter-golden/lessons + §Avoid "debug-build-only killswitch"; spec'd in determinism.md). 
+ +- **Real-font fidelity class (deliberately narrow).** Only goldens that *assert* + glyph rasterization — hinting/subpixel, decoration position, color-emoji — + render real `cosmic-text`/`harfrust` glyphs from one pinned bundled OFL font + per script (the committed fixture fonts, `tests/fixtures/fonts/`), on the + pinned lavapipe rasterizer, with a documented widened budget. The shaping + `.snap` fixtures already pin glyph *positions* deterministically for 6 + scripts; this class adds the *pixel* fidelity check the snapshots can't. + +**Color emoji is the canonical irreducible golden** (report §Tier5; +flutter-golden/lessons §Avoid "trying to make color emoji deterministic"). It +has no feature-free reference (you cannot re-author a CBDT bitmap or a COLR +layer stack from primitives), is highly font-version-sensitive, and a user +notices tofu/wrong-emoji instantly. It belongs in the real-font class with a +pinned bundled emoji font, captured once on the canonical rasterizer, and a +generous per-fixture budget — never fought with determinism knobs. A font-version +roll is triaged via the time-boxed ignore (deferred primitive above). + +## Storage staging + migration trigger + +Designed now so migration is mechanical (report §Cross-cutting; skia-gold/lessons +§Borrow 1, 2): + +- **Now:** positives live in-git under `crates/buiy_verify/tests/goldens/`, + reviewed as the PR diff. The box-font layout class produces *tiny* PNGs + (solid rectangles compress hard), so churn is bounded; git-LFS only if the + real-font class churn bites. `*.png` under `tests/goldens/` gets `-text` in + `.gitattributes` (mirrors the `*.snap` pin already present). +- **Later (only if the count explodes):** commit-hash-keyed object storage + (reg-suit's keygen+publisher split) — a content-addressed bucket + (local dir → optional S3/GCS) with the baseline fetched as the parent + commit's snapshot, git stays clean. 
The `GoldenKey` schema + `BlessLedger` + are the durable accept ledger that reg-suit lacks; the object store only + changes *where bytes live*, not the key or the bless contract. Design the + rebase/squash/merge commit-key edge cases up front (skia-gold/lessons §Avoid + "naive commit-key resolution"). +- **Migration trigger (Open Q for the synthesizer — report §OQ6):** propose + **total in-git golden bytes > 50 MB OR positive count > 500** as the + planned threshold. Name it now so migration is a step, not a crisis. Do *not* + build a Skia-Gold-class service (skia-gold/lessons §Avoid). + +## Dependencies + +- `image = "0.25"` — already a workspace dep (PNG I/O); now also used in + `buiy_core`. No add. +- `serde`/`serde_json` — already deps (ledger TOML/JSON). The ledger uses + `toml` for human-diffable review; **add `toml = "0.8"`** to + `[workspace.dependencies]` and `buiy_verify`. New dep ⇒ run + `cargo deny check` before committing (CLAUDE.md); `toml` (MIT/Apache-2.0) is + license-clean and already transitively present via `cargo` tooling. +- HTML report: hand-written `String` templating + `base64` inlining. **Add + `base64 = "0.22"`** (MIT/Apache-2.0) to inline PNGs; gate on `cargo deny + check`. No templating/WASM crate — the report is a static string, offline by + construction. +- No perceptual-metric crate is added *here* — `metric.md` owns that. +- No object-store/S3 crate now — deferred until the migration trigger fires. + +## Verification (how the Tier-5 harness tests itself) + +The harness is mostly pure CPU; only capture needs the GPU lane. + +1. **Match/mismatch unit tests (no GPU).** Synthesize two `RgbaImage`s in + memory, write one as a positive via the bless path, assert `check_golden` + returns `Pass` on an identical image and `Fail` on a one-pixel-over-budget + image. Proves the set-valued comparison + budget gate without a renderer. +2. 
**Multi-positive.** Bless two near-identical positives; assert an image + matching the *second* returns `Pass { matched_positive: 1 }`. Proves the + any-positive-matches semantics. +3. **Bless round-trip.** With `BUIY_BLESS=1`, bless an image to a temp corpus + root, re-run without the env, assert it now passes and the ledger records + commit/timestamp/reason. Mirrors the shaping accept-then-assert test shape. +4. **Fail-closed.** Empty corpus + `BUIY_BLESS` unset ⇒ `assert_golden` panics + with the bless instruction (assert on the panic message, à la + `text_shaping_snapshots.rs:301`). +5. **Report self-containment.** Generate a `TriageReport` with one card, assert + the emitted HTML contains the base64 PNGs and references no external URL + (grep the string for `http`/`src="./"`). Proves offline-first. +6. **Key/slug stability.** Property test (`proptest`, already a dep): a + `GoldenKey` round-trips through `slug()`→parse and two distinct keys never + collide on a slug. +7. **GPU lane (`#[ignore]`).** One end-to-end golden per residue class (SDF + corner, shadow kernel, real-font glyph, color-emoji) captured via + `capture_to_image` under `DeterministicApp`, blessed once, asserted on the + pinned rasterizer. The Ahem layout-class golden additionally asserts + byte-identity across two fresh captures (re-capture determinism) *and* + equality to the stored positive — proving the box-font collapse holds. + +## Sources + +- Code: `crates/buiy_core/src/render/golden.rs:18,38,56,82` (GoldenConfig, + deterministic(), perceptual_diff, fonts_ready); + `crates/buiy_core/tests/support/mod.rs:204,229,266,306,353` (render_to_image, + spawn_capture_camera, wait_for_text_ready, register_fixture_font, + readback_rgba); `crates/buiy_core/tests/text_shaping_snapshots.rs:296,301` + (BUIY_ACCEPT_SHAPING accept-FILE precedent + fail-closed panic); + `crates/buiy_verify/src/visual.rs` (naive RMSE, superseded by `metric`); + `.gitattributes` (`*.snap -text` pin to mirror for `*.png`). 
+- Prior-art: `docs/prior-art/skia-gold/lessons.md` (key schema, multi-positive, + durable accept ledger, commit-keyed store, local HTML report, expiring + ignores, stale-positive pruning); `docs/prior-art/flutter-golden-testing/lessons.md` + (box-glyph UPM-1024 font, engine-side shadow killswitch, two-tier + obscure/real split, `--update-goldens` curated accept, color-emoji as + irreducible golden). +- Report: `docs/reports/2026-06-14-visual-bug-detection-strategy.md` §Tier 5, + §Cross-cutting (golden storage strategy, `--accept`/triage UX, Ahem split), + §Open questions 6 (storage-migration trigger), 7 (Ahem boundary + emoji + baseline). diff --git a/docs/specs/2026-06-15-buiy-verification-design/invariants.md b/docs/specs/2026-06-15-buiy-verification-design/invariants.md new file mode 100644 index 0000000..d2b4003 --- /dev/null +++ b/docs/specs/2026-06-15-buiy-verification-design/invariants.md @@ -0,0 +1,273 @@ +# Tier 3 — metamorphic & property invariants (`buiy_verify::invariant`) + +**Date:** 2026-06-15 +**Status:** draft +**Spec:** specs/2026-06-15-buiy-verification-design/README.md + +The `proptest`-driven middle tier (gate #12): generated scene strategies plus a +fixed set of predicate functions asserting *relations* over the CPU display-list +and shaper output — no golden, no oracle (report §3 Tier 3). It catches +paint-order/transform/top-layer/finiteness/BiDi-caret regressions over an +unbounded fixture space, pure-CPU and deterministic given a seed. This file +specifies the generators, the predicate signatures they feed, and how the +harness itself is verified. + +## Contract deviations + +Flagged for the synthesizer to reconcile — the shared contract cited stale +`origin/main` facts; the canonical code says otherwise: + +1. 
**`compose_transform` is at `layout/systems.rs:3775`, not `:3691`.** The + contract and task both cite `:3691`; verified line is `:3775` (signature + `(&UiTransform, Option<&Translate>, Option<&Rotate>, Option<&Scale>) -> Mat4`, + compose `T·R·S·M`). Plan author: cite `:3775`. +2. **`PackedInstance.rect_size[1]` is POSITIVE on `main`, not negative.** The + y-flip moved out of the instance into the per-view uniform + (`render/instance.rs:35`–`:47`: "height is POSITIVE — the y-flip lives in the + view uniform now"). The contract's "rect_size[1] deliberately negative by + y-flip" is stale. Consequence is *favorable*: `all_finite` can assert + `rect_size[1] ≥ 0` directly on `PackedInstance` — no un-flip needed. We keep + the `DrawData`/`ExtractedNode.size ≥ 0` assertion as the primary + non-negativity check and add the packed check as a stricter sibling. +3. **`tier_rank` is a private closure, and the `TopLayer` enum's `derive`d order + is NOT the paint order.** `tier_rank` lives inside a layout system as a local + `fn` (`systems.rs:4113`: `Fullscreen→0, Tooltip→1, Popover→2, Modal→3, + None→u8::MAX`), while `enum TopLayer` declares `None, Modal, Popover, Tooltip, + Fullscreen` (`layout/types.rs:1265`) — so `#[derive(Ord)]` would give the + WRONG dominance. `top_layer_dominates` must compare via the documented tier + rank, not enum discriminant. This spec requires promoting `tier_rank` to a + `pub fn buiy_core::layout::top_layer_paint_rank(TopLayer) -> u8` (single + source of truth, consumed by both the layout sort and this invariant); + flag the small `buiy_core` surface add. 
+ +## Module shape + +`crates/buiy_verify/src/invariant/` — pure-CPU, no GPU, no window: + +``` +invariant/ + mod.rs // re-exports; the `#[cfg(test)]` proptest harness lives here + scene.rs // Scene model + proptest Strategy generators (shrinkable) + predicates.rs // the predicate fns — each is `pub`, takes borrowed data, returns Result + bidi.rs // BiDi caret round-trip generators + predicates (shaper-coupled) +``` + +No new dependency. `proptest = "1"` is already a workspace dep +(`Cargo.toml:51`) and `buiy_verify` already pulls it +(`crates/buiy_verify/Cargo.toml:13`), alongside `buiy_core + bevy + serde`. Tier +3 adds **zero** crates, so no `cargo deny check` entry is needed; the determinism +font mode and GPU pieces that *do* add deps live in the `determinism`/`metric` +files, not here. + +## Scene generators (`scene.rs`) + +A generator produces a *headless* scene description that the same CPU extract +path Tier 2 uses can turn into an `ExtractedNodes` list, with no GPU. We +generate an abstract `Scene` (not raw Bevy `World`s) so shrinking yields a +minimal, printable counterexample and the predicates stay world-agnostic. + +```rust +/// A generated node in a bounded hierarchy. `name` is the stable identity used +/// in diagnostics (mirrors Tier 2's `Name`-based dump — never raw `Entity` bits). 
+pub struct SceneNode { + pub name: String, // unique within a Scene ("n0", "n1", …) + pub children: Vec<SceneNode>, + pub z_index: Option<i32>, // positioned z; drives stacking + paint order + pub isolation: bool, // forces a stacking context + pub top_layer: TopLayer, // None for the bulk; non-None ⇒ escapes + pub transform: GenTransform, // the Translate/Rotate/Scale/Matrix inputs + pub size: (f32, f32), // logical-px box (always finite, ≥ 0 by gen) + pub background: Option, // resolved color token (never the magenta sentinel) +} + +pub struct Scene { pub roots: Vec<SceneNode> } + +/// Realize a `Scene` through the production CPU paint-order assembly +/// (`assemble_context_tree` / `partition_top_layer`) into the flat paint-ordered +/// node list the predicates assert on. No GPU, no readback. +pub fn realize(scene: &Scene) -> ExtractedNodes; +``` + +**Strategy budget (bounded, shrink-to-minimal).** Generators are explicitly +bounded so the property space is finite-depth and shrinking terminates fast: + +```rust +pub struct SceneParams { + pub max_depth: u32, // default 4 — hierarchy depth cap + pub max_breadth: u32, // default 4 — children per node cap + pub max_nodes: u32, // default 24 — total-node guard (prevents blow-up) + pub p_stacking: f64, // default 0.3 — P(node forms a context via z/isolation) + pub p_top_layer: f64, // default 0.1 — P(node escapes to the top layer) +} +pub fn arb_scene(p: SceneParams) -> impl Strategy<Value = Scene>; +``` + +- Depth/breadth use `proptest::collection::vec(inner, 0..=breadth)` recursed via + `Strategy::prop_recursive(depth, max_nodes, breadth, …)` so proptest's built-in + recursion guard bounds the tree and shrinks toward the empty/shallow scene. +- `z_index` is drawn from a small set `{-1, 0, 1, 2}` (the interesting + negative/zero/positive partition) rather than full `i32`, keeping shrinks + legible while still exercising the negative-z-first rule. 
+- `GenTransform` draws from the `compose_transform` input space: a `Translate` + with px components in `-512.0..512.0`, a `Rotate` quaternion built from an + axis-angle (angle in `0..2π`), and a `Scale` in `0.1..8.0` per axis — values + chosen finite and away from `0` so round-trips are well-conditioned. The + identity case (all `None`) is always reachable for shrinking. +- `top_layer` is drawn from all five `TopLayer` variants (skewed to `None`); + every variant must be reachable so `top_layer_dominates` exercises the full + tier rank, not just `Modal`. +- Uniqueness of `name` is enforced by a post-generation pass that renames in + pre-order (`n0..nK`), so a shrunk counterexample is reproducible and printable. + +## Predicate functions (`predicates.rs`) + +Each is a free `pub fn` taking borrowed data and returning +`Result<(), Violation>` (not a bare `bool`) so a failing property prints *which* +relation broke and the offending names/indices — the seed plus this message +reproduces it. `Violation` is a `thiserror`-free plain struct +(`{ rule: &'static str, detail: String }`) to keep the dep surface at zero. + +```rust +/// #1 — paint order is a TOTAL order over painted entities. +/// No entity appears twice; the sort is stable: two nodes with an equal paint +/// key keep document (generation) order. Mirrors the non-re-sorting contract of +/// `ExtractedNodes.nodes` (render/extract.rs:139 "Never re-sorted by render"). +pub fn paint_order_is_total(nodes: &ExtractedNodes) -> Result<(), Violation>; + +/// #2 — transform round-trips on `compose_transform` (systems.rs:3775). +/// Asserts three metamorphic relations on the COMPOSED Mat4, within `EPS`: +/// • translate(d) · translate(-d) ≈ identity +/// • rotate(2π) ≈ identity +/// • scale(k) applied to a unit box scales every geometry component by k and +/// nothing else (off-diagonals stay 0). 
+/// Operates on `compose_transform` outputs, NOT `layout/translate.rs` (the Taffy +/// style translator, which has no Mat4 — report §3 Tier 3). +pub fn transform_roundtrips(t: &GenTransform) -> Result<(), Violation>; + +/// #3 — top-layer dominance. Every `top_layer != None` node paints AFTER every +/// normal-stacking node, and the escaped tail is ordered by paint rank +/// Fullscreen < Tooltip < Popover < Modal — compared via the promoted +/// `buiy_core::layout::top_layer_paint_rank` (systems.rs:4113 tier_rank), never +/// the enum discriminant (see deviation #3). +pub fn top_layer_dominates(nodes: &ExtractedNodes) -> Result<(), Violation>; + +/// #4 — finiteness / non-negativity. Every `ExtractedNode.size.{x,y} ≥ 0` and +/// finite (the un-flipped logical box, render/extract.rs:73). The companion +/// `all_finite_packed` asserts every `PackedInstance` field is finite and +/// `rect_size[1] ≥ 0` directly — valid because the y-flip now lives in the view +/// uniform, so packed height stays positive (render/instance.rs:46, deviation #2). +pub fn all_finite(nodes: &ExtractedNodes) -> Result<(), Violation>; +pub fn all_finite_packed(packed: &[PackedInstance]) -> Result<(), Violation>; + +/// #5 — z-isolated containment (report §3): no entity of stacking context A +/// appears between two entities of context B in the flattened order. Asserted on +/// the same realized list, using the per-node context membership the generator +/// recorded. Guards against subtree leakage across an `isolation`/z boundary. +pub fn contexts_do_not_interleave(nodes: &ExtractedNodes, scene: &Scene) + -> Result<(), Violation>; +``` + +### BiDi caret round-trip (`bidi.rs`, predicate #6) + +Gate #12's named text invariant, on the **landed shaper** (`cosmic_text::Buffer` +laid out through the production text stack — same path as +`tests/text_shaping_snapshots.rs`). 
Relations over the shaper output, no +rasterizer: + +```rust +/// Generate a mixed-direction string: alternating LTR (Latin) and RTL +/// (Arabic/Hebrew) runs of bounded length, plus neutrals — the BiDi stress space +/// the shaping `.snap` fixtures pin positions for, now exercised generatively. +pub fn arb_bidi_text(max_runs: u32, max_run_len: u32) -> impl Strategy<Value = String>; + +/// #6a — logical↔visual caret round-trip is identity. For every grapheme +/// boundary, mapping the logical `Cursor { line, index }` to a visual x via the +/// run's glyph `start..end` (LayoutGlyph logical byte span) and `x` (visual +/// position), then hit-testing that x back, recovers the SAME logical cursor. +/// #6b — within one `LayoutRun`, visual caret order is MONOTONIC in logical +/// order for an LTR run (`run.rtl == false`) and strictly REVERSED for an RTL +/// run (`run.rtl == true`). +/// #6c — the run partition COVERS every codepoint exactly once (no gap, no +/// overlap across `Buffer::layout_runs()`). +pub fn bidi_caret_roundtrips(text: &str, metrics: Metrics) -> Result<(), Violation>; +``` + +The mapping uses cosmic-text's own `Buffer::layout_runs()` → +`LayoutRun { line_i, rtl, glyphs: [LayoutGlyph { start, end, x, … }] }` (the +exact structure `text/extract.rs:429`/`:797` consumes) and `cosmic_text::Cursor +{ line, index }` (re-exported at `text/components.rs:10`, not a Buiy struct — it +is cosmic-text's own type), so the invariant tests Buiy's *integration* of the +shaper, not a re-implementation of BiDi. + +## The proptest harness (`mod.rs`) + +One `proptest! { … }` block per predicate, each a `#[test]` so failures are +isolated and `cargo test -p buiy_verify` reports them individually. Default +config: `ProptestConfig { cases: 256, max_shrink_iters: 4096, .. }`, raised for +the cheap pure-CPU predicates. + +```rust +proptest! 
{ + #[test] + fn prop_paint_order_total(scene in arb_scene(SceneParams::default())) { + let nodes = realize(&scene); + prop_assert!(paint_order_is_total(&nodes).is_ok(), + "{:?}", paint_order_is_total(&nodes).unwrap_err()); + } + // … one per predicate #1–#6 … +} +``` + +**Failing-seed reproduction.** Rely on proptest's built-in persistence: +`proptest-regressions/invariant/<module>.txt` is committed, so any minimized +counterexample re-runs deterministically on the next `cargo test` (the +project's existing `cargo test` gate picks it up; no extra wiring). The +`Violation` message names the rule and the offending node names, and the shrunk +`Scene`/`String` prints via `Debug` — together these reproduce a failure from +the seed line alone. Document the persistence dir in the plan so it is committed, +not gitignored. + +## Verification (testing the harness itself) + +A property suite that never fails is worthless; we verify each predicate has +teeth with **mutation fixtures** — hand-built scenes that VIOLATE exactly one +relation, asserted to be rejected, plus a known-good control asserted to pass: + +- `paint_order_is_total`: a fixture whose realized list duplicates one entity ⇒ + `Err`; a stable-but-equal-key pair in reversed document order ⇒ `Err`; the + generator's output on a fixed seed ⇒ `Ok`. +- `transform_roundtrips`: feed a deliberately mis-composed matrix (e.g. `S·R·T` + instead of `T·R·S·M`) ⇒ `Err`; identity inputs ⇒ `Ok`. Pin `EPS` and add a + boundary fixture at `EPS ± 1 ULP`. +- `top_layer_dominates`: a fixture with a `Modal` painted before a `Fullscreen` + (rank 3 before rank 0) ⇒ `Err`; a normal node after a top-layer node ⇒ `Err`. + This fixture also pins deviation #3 — it FAILS if anyone "fixes" + `top_layer_dominates` to use the enum discriminant. +- `all_finite` / `all_finite_packed`: inject a `NaN` size and a negative + `size.y` ⇒ `Err`; a positive packed `rect_size[1]` ⇒ `Ok` (regression-pins + deviation #2). 
+- `contexts_do_not_interleave`: a hand-built interleaved list ⇒ `Err`. +- `bidi_caret_roundtrips`: the six shaping-snapshot scripts (Latin, Arabic, + Devanagari, CJK, emoji-ZWJ, mixed-BiDi) as known-good controls ⇒ `Ok`; an + off-by-one caret-map fixture ⇒ `Err`. + +These mutation fixtures are ordinary `#[test]`s alongside the `proptest!` blocks, +so the harness's own correctness rides the same `cargo test -p buiy_verify` gate +(no GPU, no `#[ignore]`). They are the Tier-3 analogue of the half-size sign-bug +regression in `render_instance.rs` — the predicate must reject the known bug. + +## Sources + +- Report: `docs/reports/2026-06-14-visual-bug-detection-strategy.md` §3 Tier 3 + (lines 118–133), cross-cutting §"Animation/temporal determinism". +- Code: `compose_transform` (`crates/buiy_core/src/layout/systems.rs:3775`), + `transform_matrix_to_mat4` (`:3716`), `tier_rank` (`:4113`); `ExtractedNode` + (`render/extract.rs:65`), `ExtractedNodes` (`:139`), `assemble_context_tree` + (`:206`), `partition_top_layer` (`render/top_layer.rs:17`); `PackedInstance` + (`render/instance.rs:40`), `packed_to_raw` (`render/buckets.rs:121`); + `enum TopLayer` (`layout/types.rs:1265`); shaper structures consumed at + `text/extract.rs:429`/`:797`, `cosmic_text::Cursor` (re-exported at + `text/components.rs:10`); precedent + `tests/text_shaping_snapshots.rs`, `tests/render_instance.rs`. +- Prior-art: `docs/prior-art/wgpu-testing/lessons.md` (lower tiers carry the + correctness load; goldens prove "no-change, not correct"). 
diff --git a/docs/specs/2026-06-15-buiy-verification-design/metric.md b/docs/specs/2026-06-15-buiy-verification-design/metric.md new file mode 100644 index 0000000..36c740b --- /dev/null +++ b/docs/specs/2026-06-15-buiy-verification-design/metric.md @@ -0,0 +1,341 @@ +# Perceptual metric — `buiy_verify::metric` + +**Date:** 2026-06-15 +**Status:** draft +**Spec:** specs/2026-06-15-buiy-verification-design/README.md + +The one image-comparison metric for the whole pyramid: an AA-aware, two-axis +fuzzy diff that replaces the two naive metrics on `main` — the L1 +`perceptual_diff` (`render/golden.rs:56`) and the global RMSE `compare_images` +(`buiy_verify/src/visual.rs:18`). It is the shared primitive consumed by +**tier-4 reftests** (fuzzy `==`/`!=` in one process) and **tier-5 goldens** +(stored-baseline regression), so both tiers express tolerance the same way. The +per-pixel decision is pixelmatch's luminance-weighted YIQ `colorDelta` with an +antialias-sibling exclusion; an advisory MSSIM channel catches global drift a +small pixel budget under-weights. + +## Contract deviations + +None. Signatures below match the SHARED API CONTRACT. Two clarifications (not +deviations): the gate uses the YIQ-weighted per-pixel delta while `max_channel_delta` +is the raw L∞ kept for diagnostics; `mssim` is `Option` so it is skipped (`None`) +on empty/disabled input, never silently `0.0`. + +## Why the naive metrics fail (report §4) + +Both average one global scalar: L1 = `Σ|Δ|/(len·255)`, RMSE = `√(Σ Δ²/(px·4·255²))`. +A defect touching 0.5% of pixels (a mispositioned glyph, a missing focus ring, an +8px wrong-color badge) divides across the whole frame and rounds below any sane +tolerance — **sensitivity degrades as the app grows** — while imperceptible sub-pixel +AA re-rasterization inflates the same number. One knob cannot separate the two. 
+Mozilla `reftest` (`fuzzy`), wgpu (abandoned `Outlier count N over M`, then FLIP), +and pixelmatch all converged on the same fix: a **two-axis budget with AA awareness**, +not an average (report §4; `prior-art/wgpu-testing/lessons.md` — wgpu PR #3830 / +issue #2760). + +## Module layout + +`crates/buiy_verify/src/metric.rs` (pure CPU, no GPU, no `bevy`). Operates on +`image::RgbaImage` (`image = "0.25"`, already a workspace dep). Re-exported as +`buiy_verify::metric`. + +```rust +//! Perceptual image diff — the shared metric for reftests (tier 4) and goldens +//! (tier 5). pixelmatch-YIQ colorDelta + antialias-sibling exclusion, gated on a +//! two-axis FuzzBudget. Supersedes render::golden::perceptual_diff (L1) and +//! visual::compare_images (RMSE). +``` + +### Types + +```rust +/// Outcome of one comparison. All counts are over the diffed (overlapping) +/// pixel set. `diff_image` is emitted only when `CompareOpts::emit_diff_image`. +#[derive(Clone, Debug)] +pub struct Diff { + /// Non-AA pixels whose YIQ colorDelta exceeded the per-pixel threshold. + pub differing_pixels: u32, + /// Largest single-channel L∞ delta over all pixels (diagnostic; 0..=255). + pub max_channel_delta: u8, + /// Total pixels compared (== w*h; 0 only for empty/degenerate input). + pub total_pixels: u32, + /// Advisory MSSIM in [0,1] (1 == identical). `None` when skipped. + pub mssim: Option<f64>, + /// Heatmap: AA pixels dimmed, differing pixels painted (pixelmatch palette). + pub diff_image: Option<image::RgbaImage>, +} + +/// The two-axis gate. A Diff PASSES iff BOTH hold. Default after determinism is +/// (0, 0); widen per fixture with a documented reason. Per Mozilla's +/// `fuzzy-if` discipline a *widened* budget should pin BOTH ends (a separate +/// min-budget assertion, below) so a shrinking diff is itself a regression. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct FuzzBudget { + /// No single channel of any pixel may differ by more than this (L∞). 
+ pub max_channel_delta: u8, + /// At most this many non-AA pixels may exceed the per-pixel YIQ threshold. + pub max_diff_pixels: u32, +} + +impl FuzzBudget { + /// The post-determinism default: bit-exact within one pinned rasterizer. + pub const EXACT: FuzzBudget = FuzzBudget { max_channel_delta: 0, max_diff_pixels: 0 }; +} + +/// Per-pixel and AA-detection knobs. `threshold` feeds the pixelmatch +/// `maxDelta = 35215 · threshold²` luminance model; `include_aa = true` makes +/// AA pixels COUNT (for the few tests that assert AA exactly). +#[derive(Clone, Copy, Debug)] +pub struct CompareOpts { + /// Matching sensitivity in [0,1]; pixelmatch default 0.1. Smaller = stricter. + pub threshold: f64, + /// Treat antialiased pixels as differences instead of excluding them. + pub include_aa: bool, + /// Also compute the advisory MSSIM channel (image-compare). Default true. + pub mssim: bool, + /// Allocate and fill `Diff::diff_image`. Off in the hot reftest path. + pub emit_diff_image: bool, +} + +impl Default for CompareOpts { + fn default() -> Self { + Self { threshold: 0.1, include_aa: false, mssim: true, emit_diff_image: false } + } +} +``` + +### Functions + +```rust +/// Compare two RGBA images. **Infallible** — returns a `Diff`, never a +/// `Result`, so callers (`assert_golden`, `run_reftest`) need no error arm and +/// the crate stays `thiserror`-free (`thiserror` is project-deferred). +/// +/// A **dimension mismatch** folds into a *saturated* `Diff` that FAILS any +/// budget: `differing_pixels == total_pixels`, `max_channel_delta == 255`, +/// `mssim == Some(0.0)`. This mirrors the existing `compare_images` / +/// `perceptual_diff`, which return a maximal-difference sentinel on a size +/// mismatch (`visual.rs:19`, `golden.rs:58`) — but, crucially, it is the +/// *fail* direction, not the silent-pass bug §4 removes: the naive `1.0` was a +/// problem only because a separate code path let a `1.0` score satisfy a max +/// budget. 
Here the saturated `Diff` makes `passes(&_)` false for **every** +/// budget, so a mis-sized capture reds the gate loudly instead of squeaking +/// through. (`total_pixels` is set to `max(a.area, b.area)` so the saturation +/// count is well-defined.) +/// +/// An **empty image** (zero pixels) yields `Diff { differing_pixels: 0, +/// max_channel_delta: 0, total_pixels: 0, mssim: None, .. }` — there is no +/// difference to observe in an empty set, matching `compare_images`'s `0.0` +/// for the empty case. A harness that wants to forbid empty captures asserts +/// `total_pixels > 0` itself (the determinism quiescence gate already does). +pub fn compare(a: &image::RgbaImage, b: &image::RgbaImage, opts: &CompareOpts) -> Diff; + +impl Diff { + /// PASS iff `max_channel_delta <= budget.max_channel_delta` + /// AND `differing_pixels <= budget.max_diff_pixels`. (MSSIM is advisory and + /// never gates here — see below.) + pub fn passes(&self, budget: &FuzzBudget) -> bool; + + /// Mozilla `fuzzy-if` "ranges must not include 0" discipline: when a fixture + /// widens its budget because a difference is EXPECTED, assert the diff also + /// meets a floor, so a suddenly-clean render is flagged as a regression. + pub fn within(&self, min: &FuzzBudget, max: &FuzzBudget) -> bool; +} +``` + +## The per-pixel decision — pixelmatch YIQ `colorDelta` + +For each overlapping pixel, convert both samples to YIQ and weight the squared +delta `0.5053·ΔY² + 0.299·ΔI² + 0.1957·ΔQ²` (luminance dominates, matching the +eye). The acceptance bound is pixelmatch's `maxDelta = 35215 · threshold²`; a +pixel is *differing* iff its weighted delta exceeds it. Adopting the reference +algorithm (not re-deriving the `35215`/YIQ constants) is the point — brightness +errors then outweigh chroma, unlike L1/RMSE's equal channel weighting (report §4). 
+ +### Antialias exclusion — the brightest/darkest-neighbor sibling test + +The single feature both naive metrics lack and the biggest GPU-pipeline flake +source (SDF `smoothstep` edge + linear→sRGB encode jitter sub-LSB, `golden.rs:52`). +pixelmatch's `antialiased(img, x, y, …)` predicate: a pixel is AA iff it has a +neighbor that is the **brightest** and one the **darkest** relative to it (by YIQ +luminance) and is not a hard edge in *both* images. A differing pixel that is AA in +*either* image is excluded from `differing_pixels` unless `include_aa`. This lets +`FuzzBudget::EXACT` (0,0) hold across the pinned rasterizer's residual AA jitter +while still catching a one-pixel real defect (a glyph shifted off the AA band). + +## Crate choice — vendor pixelmatch, don't hand-roll + +| Option | Verdict | +|---|---| +| **Hand-roll** the YIQ delta + sibling test | rejected — re-deriving battle-tested constants is the anti-pattern §4 warns against | +| **`dify = "0.8.0"`** | rejected — packaged as a CLI binary; its diff core is not a clean library surface and pulls extra deps | +| **`pixelmatch = "0.1.0"`** | **selected** — pure-Rust port of the canonical JS pixelmatch (YIQ `colorDelta` + AA sibling test) over `image` buffers, ~150 LOC, zero native/FFI cost | + +Primary dep (new): **`pixelmatch = "0.1.0"`** in `buiy_verify` — pure Rust, no +build script, MIT-licensed (compatible). It exposes the `colorDelta`/`antialiased` +primitives `compare` wraps; the `FuzzBudget` two-axis gate, `Diff` shape, MSSIM +channel, and the saturated-`Diff` mismatch handling are Buiy's layer on top +(pixelmatch returns only a flat changed-pixel count). + +> **`cargo deny check` note.** `pixelmatch = "0.1.0"` and `image-compare = +> "0.5.0"` are both new workspace deps; run `cargo deny check` before adding +> either (CLAUDE.md "supply-chain check"). 
pixelmatch is a thin, dependency-light +> port; `image-compare` pulls `nalgebra` — confirm the license set +> (MIT/Apache/BSD) and no `RUSTSEC` advisories in the same audit. Both ride the +> existing `image = "0.25"`; no second image-decode stack enters the tree. Pin +> exact patch versions (`=0.1.0`, `=0.5.0`) so a rasterizer-independent metric +> bump cannot silently shift baselines. + +## Advisory MSSIM — `image-compare` + +Secondary, **advisory-only** channel via **`image-compare = "0.5.0"`** +(`rgba_blended_hybrid_compare`, premultiplied against the opaque capture canvas), +surfaced as `Diff::mssim: Option`. It catches global gamma/blend drift a +small-N pixel budget under-weights (a uniform 1-LSB gamma shift is zero differing +pixels but a visible wash). It is **never the primary gate** — its failure mode is +averaging out localized defects, exactly the L1/RMSE weakness — so `Diff::passes` +ignores it; harnesses log it or assert it as a soft secondary in goldens. +(`dssim-core` is the structural fallback if MSSIM proves too coarse; not adopted.) + +## FLIP — the deferred fork (report Open Question #3 / prior-art) + +`prior-art/wgpu-testing/lessons.md` and `prior-art/vello/lessons.md` both +recommend NVIDIA ꟻLIP (`nv-flip`) as *primary*: wgpu migrated to it (PR #3830) +for AA tolerance, Vello gates `vello_tests` on `FlipPool::mean()`. This spec picks +**pixelmatch-primary** because it is pure Rust (no `nv-flip-sys` C++ FFI build cost +— a CI burden Vello's lesson flags), and it natively yields the **two-axis budget +reftests need** (FLIP yields one mean scalar, not a count + max-delta). If +pixel-budget tuning proves insufficient for the oracle/golden tiers, `metric` gains +a `flip` feature adding an `nv-flip` dev-dependency behind the same `Diff`/`FuzzBudget` +surface (its mean → a single-axis budget) — designed as an additive swap, not a +rewrite. 
Per Vello, the metric may legitimately differ per failure mode; the shared +`compare` + `CompareOpts.threshold` already expresses that spread. + +## How the two consuming tiers share this metric + +- **tier-4 reftests** (`buiy_verify::reftest`): `run_reftest` renders the test + and reference scenes in **one process** and calls `metric::compare`; platform + variance cancels because both halves share the GPU/driver/clock, so + `FuzzBudget` near `EXACT` holds. `RefCase.kind = Match` asserts `passes`; + `Mismatch` asserts `!passes` (the feature must *do* something). Same call backs + the CPU-vs-GPU SDF cross-check (CPU `sdf_rounded_rect` oracle vs GPU readback). +- **tier-5 goldens** (`buiy_verify::golden`): `assert_golden(name, &img, + &budget)` loads `tests/goldens/<name>.png` and calls the *same* `compare`; + `emit_diff_image` is on so the triage HTML embeds the heatmap. One metric, one + budget vocabulary across both tiers — the §4 unification. + +## Migration of the two naive metrics + +1. **`buiy_verify::visual::compare_images` (RMSE)** — deleted. Its 4 callers in + `crates/buiy_verify/tests/visual.rs` migrate to `metric::compare` + + `Diff::passes(&budget)`. **A 5th reference** — the symbol-existence smoke test + `crates/buiy_verify/tests/smoke.rs:4` (`let _ = visual::compare_images;`) — + must also be deleted (or re-pointed at `metric::compare`) when the symbol goes, + or the smoke test stops compiling. `DiffResult{score}`/`passed(tol)` removed. +2. **`buiy_core::render::golden::perceptual_diff` (L1)** — `buiy_core` cannot + depend on `buiy_verify` in its *normal* (`[dependencies]`) graph — the harness + depends on core, not the reverse — so the production `perceptual_diff` is + **deprecated in place** (`#[deprecated(note = "use + buiy_verify::metric::compare")]`), its L1 body kept only for the existing + `#[ignore]` GPU re-capture tests until they migrate. 
To make + `buiy_verify::metric` reachable from those tests, the plan **adds `buiy_verify` + as a dev-dependency of `buiy_core`**: `buiy_verify = { path = + "../buiy_verify" }` under `[dev-dependencies]` in `crates/buiy_core/Cargo.toml` + (which today lists only `naga` there). This is a **dev-only dependency cycle** + (`buiy_core` → `buiy_verify` → `buiy_core`), which Cargo permits — a + dev-dependency edge does not participate in the normal build graph, so it + creates no real cycle and does not affect `cargo deny`. The cycle is + intentional and confined to `#[cfg(test)]`. With that edge in place, the ~20 + call sites in `tests/text_*_gpu.rs` (e.g. `text_gpu.rs:114`, + `text_golden_suite_gpu.rs:260`) move to `buiy_verify::metric::compare` when + those re-capture checks become stored goldens (tier-5, a later plan step). + Net: one metric, with a deprecation gravestone, not a duplicate. + +### Re-capture determinism / anti-tests — `compare`, not `assert_golden` + +Not every `perceptual_diff` site is a latent golden. The `text_*_gpu.rs` suite +has two *non-golden* shapes that compare **two in-process captures** against each +other (never a stored baseline), and both migrate onto `metric::compare` while +**staying as in-test assertions** — no PNG is stored, no `assert_golden` is +involved. They are determinism / behavior checks, the reftest pattern expressed +without a markup reference: + +- **"must be stable within budget"** — the re-capture determinism sites that today + assert `perceptual_diff(a, b) < tol` (e.g. `text_gpu.rs:114`, `:216`, `:359`, + `:452`; `text_gpu.rs:544`). These become + `compare(&a, &b, &CompareOpts::default()).within(&min, &max)` where the budget + is `FuzzBudget::EXACT` (`(0,0)`) once the determinism stack lands — i.e. *plain* + `passes(&EXACT)` for the bit-exact case. They assert two fresh captures of the + same scene agree; this is the `RefKind::Match`-of-a-scene-with-itself property, + inlined in the text suite. 
+ +- **"must differ"** — the mismatch/anti-tests that today assert + `perceptual_diff(a, b) > tol` (`text_gpu.rs:152`, `:271`): proof that flipping + an input (a different glyph, a moved caret) actually *changes the pixels*, the + silent-no-op guard. These become `!compare(&a, &b, &CompareOpts::default()) + .passes(&FuzzBudget::EXACT)` — i.e. the captures must **not** match at the exact + budget. This is exactly `RefKind::Mismatch`'s `!passes` with a forced `(0,0)` + floor (reftests.md); a convenience `assert_differs(&a, &b)` wrapper in the test + module reads cleaner than the negation and is the recommended spelling. A budget + that tolerated difference would make the anti-test vacuous, so the floor is + pinned at `EXACT`. + +Both shapes are **in-test assertions on a live pair**, NOT stored goldens — they +diff two captures from the same run, so no baseline corpus, no bless ledger, no +`tests/goldens/` entry. Only the sites that compare a capture against a *stored* +reference (the `text_golden_suite_gpu.rs` baselines) become tier-5 +`assert_golden`. The migration is therefore three-way: stored-baseline → +`assert_golden`; same-run stability → `compare(..).passes(&EXACT)`; same-run +mismatch → `!compare(..).passes(&EXACT)` / `assert_differs`. + +## Verification — testing the metric itself + +The harness's own correctness is asserted with pure-CPU unit tests in +`crates/buiy_verify/tests/metric.rs` (no GPU), each a known-answer case: + +- **Identity:** `compare(img, img, default)` ⇒ `differing_pixels == 0`, + `max_channel_delta == 0`, `mssim == Some(1.0)`, `passes(&EXACT)`. +- **Single-pixel defect survives scale (the §4 regression):** an N×N image with + exactly one wrong-by-200 pixel yields `differing_pixels == 1` and + `!passes(&EXACT)` for *every* N — proving sensitivity does NOT dilute with + frame size (the exact failure of L1/RMSE; assert across N ∈ {16, 256, 2048}). 
+- **AA exclusion vs `include_aa`:** a synthetic edge AA'd one pixel-band wide + reads `differing_pixels == 0` with default opts and `> 0` with + `include_aa = true` — pins the sibling test on/off. +- **Two-axis independence:** a case that trips `max_channel_delta` but not + `max_diff_pixels` (one pixel off by 255, under a budget that allows one differing pixel) and the converse (many pixels + over the YIQ threshold, under a budget whose `max_channel_delta` admits them but whose `max_diff_pixels` does not) — each must fail the gate, proving BOTH axes + bind. +- **`within` floor (fuzzy-if):** a diff below a widened `min` budget fails + `within(min,max)` — proving an unexpectedly-clean render is caught. +- **Dimension mismatch** ⇒ a saturated `Diff` (`differing_pixels == + total_pixels`, `max_channel_delta == 255`) that `!passes(&_)` for *every* + budget, including a hypothetical maximal one — pins the fail direction (the + loud-red replacement for the naive `1.0` silent-pass). **Empty** ⇒ a zero + `Diff` with `total_pixels == 0`; a separate assertion (`total_pixels > 0`) + forbids empty captures where that matters. +- **YIQ luminance weighting:** an equal-L∞ luma-channel change scores a larger + YIQ delta than a chroma-only change — pins that brightness outweighs chroma. +- **Advisory isolation:** a failing-MSSIM, zero-pixel diff still `passes` — MSSIM + never gates. A checked-in 8×8 PNG pair + its expected `Diff` (an `insta` + snapshot, floats redacted) guards the constants against a pixelmatch bump. + +All run under the headless `cargo test --workspace` gate (no `#[ignore]`, no +adapter) — the metric is pure CPU, so its self-test needs no GPU lane. 
+ +## Sources + +Code: `crates/buiy_core/src/render/golden.rs:48-66` (L1 `perceptual_diff`), +`crates/buiy_verify/src/visual.rs:18-45` (RMSE `compare_images`), +`crates/buiy_core/src/render/instance.rs:40-58` (`PackedInstance`, the +byte-snapshot sibling primitive), `crates/buiy_core/tests/text_gpu.rs:114`/`:152`/`:271` +(re-capture `perceptual_diff` call sites to migrate — `:114` stable, `:152`/`:271` +mismatch anti-tests), +`crates/buiy_verify/tests/visual.rs` + `crates/buiy_verify/tests/smoke.rs:4` +(RMSE `compare_images` callers to migrate, incl. the symbol-existence smoke test). +Prior-art: +`docs/prior-art/wgpu-testing/lessons.md` (outlier-count brittleness → FLIP, the +pixelmatch-vs-FLIP runner-up), `docs/prior-art/vello/lessons.md` (per-tier metric +choice, `nv-flip` FFI cost, FLIP-mean oracle gate). Report: +`docs/reports/2026-06-14-visual-bug-detection-strategy.md` §4 "Perceptual metric +— replace the two naive metrics" and Open Question #3. diff --git a/docs/specs/2026-06-15-buiy-verification-design/open-questions.md b/docs/specs/2026-06-15-buiy-verification-design/open-questions.md new file mode 100644 index 0000000..7026ca5 --- /dev/null +++ b/docs/specs/2026-06-15-buiy-verification-design/open-questions.md @@ -0,0 +1,139 @@ +# Open questions — resolved as decisions + +**Date:** 2026-06-15 +**Status:** draft +**Spec:** [`README.md`](README.md) + +Each of the report's eight open questions ([`reports/2026-06-14-visual-bug-detection-strategy.md`](../../reports/2026-06-14-visual-bug-detection-strategy.md) § Open questions) is resolved here as a **decision** with a named runner-up and why it was rejected. The brief was to work autonomously and adopt the report's recommendations as defaults; each decision does so unless a child file's verified prior-art forced a sharper answer. The crate-boundary and the contract deviations the drafters flagged are reconciled at the end into one coherent contract. 
+ +--- + +## Decision 1 — Reftest reference-independence enforcement + +**Question (report OQ#1).** The reference must use a code path disjoint from the feature under test, or a shared bug corrupts both and the comparison passes vacuously. Who reviews that independence, and can it be lint-enforced? + +**Decision.** Enforce independence by a **three-mechanism stack, in priority order, with a structural CI lint as the backstop** (`reftests.md` § "Reference independence"): + +1. **Route references through the primitive layer by default.** Buiy has a layer below Taffy/CSS-subset — `DrawData::new(position, size, color, radius)` and literal-positioned `Node` boxes that bypass the flex/grid/container-query solver entirely. A reference authored with literal offsets *cannot* share a layout-solver bug. This covers the bulk of pairings. +2. **Structural CI lint (`assert_reference_independent`).** A `#[test]` (pure CPU, *not* `#[ignore]`) builds each `RefCase`'s reference into a headless no-GPU `App` and asserts the reference subtree carries **none** of the components the feature exercises, via a declarative `IndependenceRule` map (`feature → forbidden_in_reference: &[ComponentMarker]`). The check queries the built ECS world for the forbidden component, so a textual rename cannot fool it. A pairing whose feature has **no registered rule fails the lint** until a rule is added — independence is opt-out-impossible by construction. +3. **Multiple references** where one disjoint path is impossible (logical↔physical, transform↔literal), with WPT/Gecko aggregation: `Match` passes if ≥1 reference matches (OR), `Mismatch` passes only if **all** mismatch (AND). + +A PR-time review checklist complements the lint for the *semantic* cases the marker map misses (does the reference invoke the feature in spirit; is a `Mismatch` floor `(0,0)`; does a widened `Match` fuzz cite measured jitter). 
+ +**Runner-up — pure human review (PR checklist only), rejected.** It is what the browsers largely do and needs no harness code. Rejected because it is the load-bearing risk of the whole tier (`wpt-reftests/lessons.md` "the whole bet rests on one discipline") and human review silently rots: a reviewer who misses one feature-using reference ships a permanently-vacuous test that reports green forever. The structural lint makes the common case (a known feature component appearing in a reference) a hard CI failure, reserving human judgment for the residue. The cost — Blink supports neither multiple nor chained references, so mechanism 3 is not free (`wpt-reftests/lessons.md` Avoid row) — is accepted deliberately because the primitive layer (mechanism 1) makes multiple-references the exception, not the rule. + +--- + +## Decision 2 — Per-fixture fuzz budgets: default + calibration + +**Question (report OQ#2).** What is the default `fuzzy(d_lo-d_hi, p_lo-p_hi)`, and is it calibrated statistically or hand-set? Is Buiy willing to pin both ends (Mozilla "ranges must not include 0")? + +**Decision.** Default budget is **`FuzzBudget::EXACT` = `(max_channel_delta: 0, max_diff_pixels: 0)`** once the determinism stack is in place (`metric.md` § Types; `determinism.md` engineers nondeterminism out at the source so `(0,0)` is reachable). Calibration is **measurement-driven, per fixture, only when widening**: a fixture widens its budget *only* with a documented reason recorded in the bless ledger (`goldens.md` `Positive.reason`), and the widened value is taken from the harness's printed `(max_channel_delta, differing_pixels)` on a real failing run — never a guessed round number. 
**Yes, Buiy pins both ends:** a widened budget asserts a floor too, via `Diff::within(min, max)`, so a suddenly-clean render (the bug got fixed, or the AA stopped touching those pixels) is itself flagged as a regression that retires the budget (Mozilla `fuzzy-if` discipline; `wpt-reftests/lessons.md` Borrow #3). For reftests the floor on a `Mismatch` is *forced* to `(0,0)` at macro-expansion time — a `!=` that tolerates difference is vacuous. + +**Runner-up — a single global tolerance (the existing `perceptual_diff < ~1e-4` flat threshold), rejected.** It is what the current `#[ignore]` re-capture goldens use and needs no per-fixture bookkeeping. Rejected because Skia Gold ships `determine_gold_inexact_parameters.py` precisely because global thresholds fail (`skia-gold/lessons.md` Avoid: "hand-picked global pixel thresholds"): one knob cannot separate a benign sub-pixel AA shift in one fixture from a real one-box regression in another, and a global average dilutes a localized defect below tolerance as the frame grows — the exact §4 failure the new metric exists to fix. Per-fixture, two-axis, pinned-both-ends is the only model that expresses "this fixture legitimately jitters by N AA pixels, and a regression is anything outside that band." + +--- + +## Decision 3 — Perceptual metric: pixelmatch-YIQ vs. NVIDIA ꟻLIP + +**Question (report OQ#3).** Adopt FLIP from the start, or pixelmatch-primary with FLIP deferred? + +**Decision. pixelmatch-YIQ + AA-sibling exclusion is primary; FLIP is deferred behind an additive feature** (`metric.md` § "FLIP — the deferred fork"). Vendor `pixelmatch = "0.1.0"` (pure-Rust port of canonical JS pixelmatch: luminance-weighted YIQ `colorDelta` + the brightest/darkest-neighbor AA sibling test, ~150 LOC, no FFI). `image-compare = "0.5.0"` supplies an **advisory-only** MSSIM channel (`Diff::mssim`) that never gates. 
If pixel-budget tuning proves insufficient for the oracle/golden tiers, `metric` gains a `flip` feature adding an `nv-flip` dev-dependency behind the *same* `Diff`/`FuzzBudget` surface (its mean → a single-axis budget) — designed as an additive swap, not a rewrite. + +**Runner-up — NVIDIA ꟻLIP as primary, rejected (for now).** This is what wgpu — Buiy's closest determinism model — migrated to (PR #3830) and what Vello gates on (`FlipPool::mean()`); both `wgpu-testing/lessons.md` (Borrow #5) and `vello/lessons.md` (Borrow #2) recommend it, and FLIP's edge-contrast term yields fewer AA false-positives than YIQ. Rejected as the *starting* primary on three grounds: (a) `nv-flip` wraps a C++ library via `nv-flip-sys`, a build-time native cost on CI that Vello's own lesson flags as a burden (`vello/lessons.md` Avoid row); (b) FLIP yields **one mean scalar**, not the count-plus-max-delta two-axis budget reftests *require* to express Mozilla-style fuzzy matching (`wpt-reftests/lessons.md` Borrow #2) — the single most important metric property for the headline tier; (c) pure-Rust pixelmatch is ~150 LOC with no FFI and rides the existing `image = "0.25"` dep. The disagreement between the wgpu/Vello streams and the pixelmatch stream is real and is why this is an Open Question, not a clean win — the deferred-feature design keeps the FLIP door open at zero rewrite cost, and per Vello the metric may legitimately differ per failure mode (the shared `CompareOpts.threshold` already expresses that spread). + +--- + +## Decision 4 — lavapipe vs. real-GPU for CI goldens + +**Question (report OQ#4).** lavapipe-only for CI goldens, with real-hardware in the separate GPU-verify campaign? Or are there driver-specific rasterization bugs that *only* a real GPU surfaces and must be a CI gate? + +**Decision. 
Pinned lavapipe is the single canonical CI golden rasterizer; the local real-GPU lane (this host's AMD RX 6700 XT / RADV) runs only rasterizer-internal-invariant checks, never the stored-baseline comparison** (`determinism.md` § "CI software-rasterizer pin"). One canonical config ⇒ **one golden per cell, no per-OS/per-GPU matrix** — collapsing the worst combinatorial multiplier. Mesa lavapipe is consumed as a **version-pinned, self-built artifact** (reuse `gfx-rs/ci-build`'s prebuilt tarball directly), with a composite action writing its own ICD JSON and exporting `VK_DRIVER_FILES` (loader sees only lavapipe) + `WGPU_ADAPTER_NAME=llvmpipe`. The cemented division of labor: **CI goldens run on pinned lavapipe (the stored-baseline gate); real-hardware shader/AA/blend paths are covered by the separate GPU-verify campaign, NOT a CI gate.** The local lane does *not* compare against the lavapipe baseline (cross-rasterizer pixels are non-comparable) — it runs the determinism / reftest checks, which are rasterizer-internal invariants. The `GoldenKey.backend` axis is reserved for forward-compat but is a *constant* (`Lavapipe`) today. + +**Runner-up — a real-GPU CI matrix (goldens on actual Vulkan/Metal/DX12 hardware), rejected.** It is the only way to catch a driver-specific blend/AA bug *as a gate* rather than a one-shot campaign, and is what a hardware-coverage purist would demand. 
Rejected because: (a) a rolling distro rasterizer is a *moving reference image* — wgpu abandoned `ppa:oibaf` for exactly this, every unrelated upstream regression reddening CI (`wgpu-testing/lessons.md` Validates: "pin the rasterizer, don't track the distro"); (b) lavapipe self-warns it is "not a conformant vulkan implementation, testing use only," so goldens prove **no-change, not correct** regardless — correctness is carried by the lower deterministic tiers and the GPU-verify campaign (`wgpu-testing/lessons.md` Avoid row); (c) a per-GPU matrix re-multiplies the cell count the whole pyramid exists to shrink. The residual risk — a real-hardware-only rasterization bug — is consciously assigned to the existing RX 6700 XT GPU-verify campaign, which already runs the `#[ignore]` tests on real hardware. If a class of driver-only visual bug ever escapes the campaign into a release, that is the trigger to reconsider a narrow real-GPU CI leg, named now so it is a planned escalation, not a surprise. + +--- + +## Decision 5 — Display-list dump format stability contract + +**Question (report OQ#5).** The Tier-2 `Display` formatter is the durable artifact under churning structs. What is its versioning/compatibility policy — does a format change re-bless every snapshot (acceptable) or must it be diff-stable across formatter edits (harder)? + +**Decision. Explicit version header, conscious re-bless on format change — `re-bless-on-bump`, not diff-stable-across-edits** (`snapshots.md` § "Why a Display dump"). Every dump opens with a `# buiy-<kind>-dump vN` header line; a format change bumps `N` and re-blesses every affected `.snap` in one reviewed PR. A `format-version tripwire` test asserts the dump's first line equals the current `vN` constant, so a formatter edit that *should* bump the version but didn't fails loudly.
The structural guarantee that keeps re-bless rare and the day-to-day diffs clean: the dump is **decoupled from the structs underneath** — entities render by `Name` (never `Entity` allocation bits), floats round to `ROUND_DP`, one paint command per line — so a struct refactor or an unrelated spawn does *not* churn the snapshot. Only a deliberate formatter change bumps the version. + +**Runner-up — a format-stable-across-edits contract (the formatter guarantees byte-compatible output across its own edits, so no re-bless is ever needed), rejected.** It would spare the bulk re-bless. Rejected because guaranteeing output stability across formatter evolution is materially harder (every future field addition must be append-only and back-compatible, an open-ended constraint), buys little (a bulk re-bless is a single reviewed mechanical PR — the `BUIY_ACCEPT_SHAPING` precedent already does exactly this for shaping `.snap`s), and the report explicitly rates re-bless "acceptable." The version header makes a format change a *visible, single-line, conscious* act — which is the actual goal — rather than an invisible silent drift. + +--- + +## Decision 6 — Golden storage-migration trigger + +**Question (report OQ#6).** At what golden count / repo size does in-git (or LFS) stop being acceptable and the commit-keyed object-storage migration fire? Name the threshold now. + +**Decision. Migrate when total in-git golden bytes exceed 50 MB OR the positive count exceeds 500** (`goldens.md` § "Storage staging"; `coverage.md` enforces the count half via a `cell_count()` self-test that fails the build past a named ceiling). Until then, positives live in-git under `crates/buiy_verify/tests/goldens/` (`*.png` pinned `-text` in `.gitattributes`, mirroring the `*.snap` precedent), reviewed as the PR diff; git-LFS is adopted only if the *real-font* class churn bites before either threshold. 
The `GoldenKey` schema and `BlessLedger` are designed now so the migration only changes *where bytes live*, not the key or the bless contract — the later target is reg-suit's commit-hash-keyed object store (local dir → optional S3/GCS), with the rebase/squash/merge commit-key edge cases designed up front. **Do not** build a Skia-Gold-class database-backed service (`skia-gold/lessons.md` Avoid) — if the pyramid holds, the count never explodes. + +**Runner-up — "migrate when it hurts," no named number, rejected.** It avoids committing to a threshold that may prove wrong. Rejected because the report explicitly asks for the threshold to be named *now* so migration is a planned step, not a crisis, and an unnamed trigger reliably fires late — by the time in-git "hurts" (slow clones, churned history), the `O(configs × commits)` LFS pathology Screenshotbot warns of has already entered the history irreversibly (`skia-gold/lessons.md` Validates: "store goldens out-of-repo, keyed"). 50 MB / 500 positives is a deliberately conservative pair (the Ahem box-font layout class compresses hard, so the count, not the bytes, will bind first) that turns migration into a tracked issue with headroom. + +--- + +## Decision 7 — Text golden Ahem boundary + emoji baseline + +**Question (report OQ#7).** Which goldens render real glyphs (the narrow fidelity suite) vs. the Ahem/obscure-text layout mode? What is the color-emoji baseline and how is a font-version roll triaged? + +**Decision. Default Ahem; opt into Real only for the fidelity suite — a per-fixture declaration, not a global switch** (`goldens.md` § "Ahem / obscure-text split"; `determinism.md` § "Ahem font mode"): + +- **Layout-determinism class (the bulk).** Text-bearing goldens that test *boxes*, not glyph fidelity, render under `FontMode::Ahem` — a clean-room box-glyph font (UPM **1024** so metrics are integer-exact and font-engine-agnostic; pinned ascent/descent 0.75/0.25 em, line-gap 0; every glyph a solid em-box). 
This collapses the font axis: any layout-class golden is byte-identical across hosts. Wired through the production `FontRegistry::register_bytes` path; Ahem is made the *sole resolvable family* so fallback cannot reintroduce a platform font. (Boxes alone are not enough — power-of-2 UPM is what makes them deterministic; `flutter-golden-testing/lessons.md` Avoid "boxes instead of curves".) +- **Real-font fidelity class (deliberately narrow).** Only goldens that *assert* glyph rasterization — hinting/subpixel, decoration position, color-emoji — render real `cosmic-text`/`harfrust` glyphs from one pinned bundled OFL font per script, on pinned lavapipe, with a documented widened budget. The shaping `.snap` fixtures already pin glyph *positions* for 6 scripts; this class adds the *pixel* fidelity the snapshots cannot. +- **Color emoji** is the canonical irreducible golden: no feature-free reference (you cannot re-author a CBDT bitmap or COLR layer stack from primitives), highly font-version-sensitive, instantly noticed when wrong (tofu/wrong-emoji). It lives in the real-font class with a **pinned bundled emoji font captured once** on the canonical rasterizer and a generous per-fixture budget — never fought with determinism knobs. A **font-version roll is triaged via the time-boxed ignore** (a `[[ignore]]` block in the ledger with an RFC3339 `expires`), a primitive designed now but deferred to follow-ups, not v1 (`skia-gold/lessons.md` Borrow #5). + +**Runner-up — real glyphs everywhere with per-fixture fuzz to absorb the flake, rejected.** It is simpler (one font path, no Ahem asset) and tests "what the user sees." 
Rejected because real glyph rasterization is *the* canonical per-platform golden flake source — Flutter's entire `matchesGoldenFile` Ahem trick exists to fight exactly this (`flutter-golden-testing/lessons.md`) — and absorbing it with fuzz everywhere either widens budgets until real regressions slip through, or pins a single host's glyph output as the cross-host baseline (defeating the lavapipe-portability win). The split confines genuinely-flaky real-glyph rasterization to a small, deliberately-chosen set and makes the bulk byte-identical. + +--- + +## Decision 8 — Animation snapshot scope + +**Question (report OQ#8).** Frame-sequence snapshots at stepped clock times catch timing regressions but multiply the fixture count by the number of sampled timestamps. Which animations warrant temporal coverage, and at what sampling density? + +**Decision. Temporal coverage is opt-in per fixture, at three logical timestamps (t=0, mid, end), and lives in Tier 2 (structured per-timestamp paint-command snapshots), not Tier 5.** Buiy owns the clock, so animations are sampled by driving the manually-advanced `Time` the determinism stack already mandates (`determinism.md` § "Async-asset flush"; the `fixed_clock` mechanism) to explicit virtual timestamps and snapshotting the *display-list dump* at each — catching easing/interpolation timing regressions, not just settled end-states, at **near-zero cost** because the snapshot is a pure-CPU text dump (`snapshots.md`), not a pixel capture. **Default: end-state only** (most fixtures animate to a resting frame whose correctness the static golden already covers). A fixture **opts into** the three-sample sequence only when its *timing curve* is the behavior under test (a custom easing, a staged reveal, the caret blink — already a fixed-clock pair). Sampling density stays at three points unless a specific curve demands more, named per fixture. + +**Runner-up — pixel frame-sequence goldens at a fixed dense sampling (e.g. 
every animation, N≥5 stepped pixel captures), rejected.** It is the most thorough and catches rasterization-level timing artifacts. Rejected because it multiplies the *most expensive* tier's cell count by the sample count — the exact combinatorial blow-up the pyramid exists to prevent — to test a property (timing/interpolation) that is fully observable one tier down in the deterministic CPU display-list. Pixel-level temporal coverage is reserved for the rare fixture whose *rasterized* output genuinely changes per-frame in a way the display list cannot express (and even then, end-state-plus-one-mid, not a dense sequence). The structured per-timestamp snapshot is the right altitude; goldens stay end-state. + +--- + +## Contract reconciliation — crate boundary + flagged deviations + +The drafters flagged contract deviations in `metric.md`, `snapshots.md`, `invariants.md`, `reftests.md`, `goldens.md`, `determinism.md`, and `coverage.md`. Reconciled into one coherent contract (also summarized in `README.md` § "Resolved during synthesis"): + +**Crate boundary (confirmed, no deviation).** Pure/app-independent pieces live in `buiy_verify` (`metric`, `snapshot`, `invariant`, `reftest` pairing logic, `golden` persistence, `determinism` builder, `coverage`); the device-coupled capture lives in `buiy_core::render::golden` as the promoted `capture_to_image(&mut App, &GoldenConfig) -> image::RgbaImage`. `buiy_core` does not depend on `buiy_verify`; the naive `perceptual_diff` (L1) is deprecated in place, its `#[ignore]` re-capture callers migrating to `buiy_verify::metric` when they become stored goldens. This adds only `image = "0.25"` (already a workspace dep) to `buiy_core`. + +**Reconciled deviations:** + +1. **`compose_transform` at `:3775`** (was `:3691` in the contract) — `invariants.md` deviation #1, grep-verified on `origin/main`. Canonical: **`:3775`**. 
The transform round-trip invariant operates on `compose_transform`'s composed `Mat4` (`T·R·S·M`), not `layout/translate.rs`. + +2. **`PackedInstance.rect_size[1]` is POSITIVE** (was "deliberately negative by y-flip" in the contract) — `invariants.md` deviation #2; the y-flip moved into the per-view uniform. Canonical: **positive**. `all_finite_packed` asserts `rect_size[1] ≥ 0` directly, no un-flip; `ExtractedNode.size ≥ 0` stays the primary non-negativity check with the packed check as a stricter sibling. + +3. **Promote `tier_rank` → `pub fn buiy_core::layout::top_layer_paint_rank(TopLayer) -> u8`** — `invariants.md` deviation #3. The `TopLayer` enum's derived order is NOT the paint order (it declares `None, Modal, Popover, Tooltip, Fullscreen`; the paint rank is `Fullscreen→0, Tooltip→1, Popover→2, Modal→3, None→u8::MAX`, a private closure at `systems.rs:4113`). `top_layer_dominates` MUST compare via the rank, never the discriminant. **Accepted small `buiy_core` surface add:** one public fn, the single source of truth consumed by both the existing layout sort and the new invariant. This is the only production-code change the spec mandates outside the harness crates (besides promoting `capture_to_image` and deprecating `perceptual_diff`). + +4. **`capture_to_image` is a re-runnable primitive, not one-shot-per-App** — `reftests.md` Contract deviation. A reftest captures *two* scenes sharing one `wgpu::Device`/driver/clock/atlas in one process (so platform variance cancels). Reconciled in favor of the existing `capture_to_image(&mut App, &GoldenConfig) -> RgbaImage` signature: it re-targets the offscreen camera and re-reads-back on *each* call against an already-built `App`. Reftest calls it **twice** on one `DeterministicApp::build()` output; `DeterministicApp::capture(self, fixture)` is the build+spawn+single-capture convenience wrapper goldens use. 
**No `capture_scene(&mut App, FnOnce, &GoldenConfig)` shape is introduced** — the re-runnable primitive subsumes it. + +5. **`snapshot` resolves the contract's serde-"or" to Display-dump-only** — `snapshots.md` Contract deviation. No serde derives are added to render types (raw Debug/serde snapshots are the report's anti-pattern). Consequently `assert_display_list_snapshot` takes `&NameLookup` (a `World`-free entity→`Name` map built once via `NameLookup::from_world`), not the contract's bare `(nodes, name)` — required because `ExtractedNode` carries only an `Entity`, and the dump renders entities by `Name`. **Adopted.** + +6. **`LP_NUM_THREADS` dropped as a determinism knob; `VK_ICD_FILENAMES` → `VK_DRIVER_FILES`** — `determinism.md` deviations 1 & 2, confirmed by `wgpu-testing/lessons.md` (the `LP_NUM_THREADS` myth; the deprecated ICD env var). Determinism comes from the *pinned Mesa version*, not thread count. `LP_NUM_THREADS` may optionally be set to `1` as a commented belt-and-suspenders, never asserted as the determinism source. **Adopted.** + +7. **`coverage` models `forced_colors` and `dpr` as `Mode` axes** (each cell gets its own baseline) and `theme`/`viewport` as ordinary axes — `coverage.md` clarification. A presentation grouping, not a type change; the Cartesian product is over all four. 
**Adopted; no contract change.** + +**Reconciled final API contract** (the single source the plan implements against): + +- **`buiy_verify::metric`**: `Diff { differing_pixels: u32, max_channel_delta: u8, total_pixels: u32, mssim: Option<f64>, diff_image: Option<RgbaImage> }`; `FuzzBudget { max_channel_delta: u8, max_diff_pixels: u32 }` with `EXACT = (0,0)`; `CompareOpts { threshold, include_aa, mssim, emit_diff_image }`; **infallible** `fn compare(&RgbaImage, &RgbaImage, &CompareOpts) -> Diff` (no `Result`, no `thiserror` — a dimension mismatch folds into a *saturated* `Diff` that fails any budget, mirroring the existing naive metrics' fail-sentinel without their silent-pass bug); `Diff::passes(&FuzzBudget) -> bool`, `Diff::within(&FuzzBudget, &FuzzBudget) -> bool`. Primary = pixelmatch-YIQ + AA-sibling (`pixelmatch = "0.1.0"`); advisory MSSIM (`image-compare = "0.5.0"`, never gates); FLIP deferred behind a `flip` feature. +- **`buiy_verify::snapshot`**: `assert_layout_snapshot(&mut App, &str)` + `layout_dump(&World) -> String`; `assert_display_list_snapshot(&ExtractedNodes, &str, &NameLookup)` + `display_list_dump(&ExtractedNodes, &NameLookup) -> String`; `NameLookup::from_world(&World)`; `instance_hex(&PackedInstance) -> String` + `assert_instance_hex_snapshot(&PackedInstance, &str)`. Purpose-built `Display` dumps, version-headered; `insta` (workspace, `glob` feature); no serde derives on render types. +- **`buiy_verify::invariant`**: proptest `arb_scene(SceneParams) -> impl Strategy<Value = Scene>` + `realize(&Scene) -> ExtractedNodes`; predicates `paint_order_is_total`, `transform_roundtrips` (on `compose_transform`, `:3775`), `top_layer_dominates` (via `top_layer_paint_rank`), `all_finite` + `all_finite_packed` (`rect_size[1] ≥ 0`), `contexts_do_not_interleave`, `bidi_caret_roundtrips`; each returns `Result<(), Violation>`. No new dep.
+- **`buiy_verify::reftest`**: `RefCase { name, kind: Match|Mismatch, test: fn(&mut App), reference: fn(&mut App), fuzz: FuzzBudget }`; `reftest!` macro; `run_reftest(&RefCase) -> RefOutcome` (two captures, one `App`, diff via `metric`); `assert_reference_independent` lint + `IndependenceRule`; multi-reference via `RefCase::multi`; CPU-vs-GPU `rasterize_sdf_rect` + `run_sdf_cross_check`. GPU (`#[ignore]`). +- **`buiy_verify::golden`**: `GoldenKey { widget, state, theme, viewport, backend, dpr }` (multi-positive, set-valued); `assert_golden(&GoldenKey, &RgbaImage, &FuzzBudget)` + `check_golden -> GoldenOutcome`; `BUIY_BLESS=1` accept-FILE workflow; `BlessLedger`/`Positive` (TOML); self-contained HTML `TriageReport` (base64-inlined). New deps `toml = "0.8"`, `base64 = "0.22"`. GPU (`#[ignore]`). +- **`buiy_verify::determinism`**: `DeterministicApp::{new, with, font_mode, dpr, build, capture}`; `GoldenConfig` extended with `font_mode: FontMode {Real|Ahem}`, `dpr: Dpr`; `CAPTURE_MSAA`/`CAPTURE_DITHER_OFF` constants; quiescence flush (4 conditions); lavapipe pin (`VK_DRIVER_FILES`, `WGPU_ADAPTER_NAME`, no `LP_NUM_THREADS`). No new Rust dep. +- **`buiy_core::render::golden`**: promote `capture_to_image(&mut App, &GoldenConfig) -> image::RgbaImage` (re-runnable); deprecate `perceptual_diff`. Adds `image` to `buiy_core`. +- **`buiy_verify::coverage`**: `Fixture { name, state, spawn }` + `fixture!` macro (`inventory` + `glob!`); `Matrix { themes, viewports, forced_colors, dprs }` + `cells()`; `CoverageKey` + `stem()`; `enroll_all(matrix, body)`; `live_catalog_paint() -> Vec` wiring `forced_colors_analyzer`. New dep `inventory = "0.3"`. + +## Unresolved for the reviewer + +None blocking. Two items are deliberately deferred (not unresolved) and flagged so the reviewer can object: + +- **FLIP-as-primary** (Decision 3) is deferred behind a feature, not closed — the wgpu/Vello prior art genuinely disagrees with the pixelmatch stream. 
If the reviewer judges the oracle/golden tiers will need FLIP's localization from day one, the additive-feature design makes adopting it early a small change, but the *default* this spec ships is pixelmatch-primary. +- **The `top_layer_paint_rank` promotion** (reconciliation #3) is the one production-code change outside the harness crates beyond `capture_to_image`/`perceptual_diff`. It is minimal (one public fn, no behavior change) but touches `buiy_core::layout` — flagged for the reviewer to confirm the surface add is acceptable rather than threading the private `tier_rank` through a test-only accessor. diff --git a/docs/specs/2026-06-15-buiy-verification-design/reftests.md b/docs/specs/2026-06-15-buiy-verification-design/reftests.md new file mode 100644 index 0000000..a772a80 --- /dev/null +++ b/docs/specs/2026-06-15-buiy-verification-design/reftests.md @@ -0,0 +1,201 @@ +# Tier 4 — reftests + CPU/GPU cross-check + +**Date:** 2026-06-15 +**Status:** draft +**Spec:** specs/2026-06-15-buiy-verification-design/README.md + +The reftest harness — Buiy's highest-leverage pixel investment and the one mechanism wholly absent from the tree. A reftest renders a **test** scene and a **reference** scene with the *same engine in one process* and asserts their bitmaps match (`==`) or differ (`!=`), never against a stored baseline — so every platform-variance term (driver SDF rounding, glyph-atlas AA, sRGB encode, clock) cancels in the diff. This file specifies `RefCase`, the `reftest!` macro, `run_reftest`, the reference-independence discipline + its lint/review enforcement, the CSS-subset authoring patterns, and the Vello-style CPU-vs-GPU SDF rasterization cross-check (Tier 4.5). It is GPU-coupled (`#[ignore]`, runs under `cargo test -- --ignored` on a real adapter here and pinned lavapipe in CI). + +## Contract deviations + +None. 
This file consumes `buiy_verify::metric` (`Diff`, `FuzzBudget`, `compare`, `Diff::passes`) and `CompareOpts` exactly as the contract defines them, and the promoted `buiy_core::render::golden::capture_to_image(&mut App, &GoldenConfig) -> image::RgbaImage` exactly as the crate-boundary clause defines it. One additive note flagged for the synthesizer, not a deviation: this tier needs `capture_to_image` to support **two captures in one `App`** (re-target the camera, re-readback) without rebuilding the device — see `run_reftest` below. If `goldens.md` specifies `capture_to_image` as one-shot-per-App, reconcile toward a `capture_scene(&mut App, scene: impl FnOnce(&mut App), &GoldenConfig) -> RgbaImage` shape that both tiers share. + +## Module & public API + +Lives in `buiy_verify::reftest` (pure pairing/aggregation logic + the macro) and calls into `buiy_core::render::golden` for capture. The harness itself stores **zero bytes**. + +```rust +// buiy_verify::reftest + +use buiy_verify::metric::{compare, CompareOpts, Diff, FuzzBudget}; +use bevy::app::App; + +/// One reftest pairing. `test` and `reference` each build a scene into a fresh, +/// deterministic `App` (spawn entities; do NOT drive frames — `run_reftest` owns +/// the capture loop). Co-locate the expectation with the `#[test]`. +pub struct RefCase { + pub name: &'static str, + pub kind: RefKind, + /// Builds the scene exercising the feature under test. + pub test: fn(&mut App), + /// Builds the independent-oracle scene (see "Reference independence"). + pub reference: fn(&mut App), + /// Per-pairing fuzz, à la Mozilla `fuzzy-if`. Default `(0,0)` once the + /// determinism stack is in (determinism.md); widen with a documented reason. + pub fuzz: FuzzBudget, +} + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum RefKind { + /// Pass iff `test` and `reference` render to the same bitmap within `fuzz`. + Match, + /// Pass iff they render DIFFERENTLY (a `!=` anti-test guards silent no-ops).
+ Mismatch, +} + +#[derive(Debug)] +pub struct RefOutcome { + pub passed: bool, + pub diff: Diff, + /// On failure, a self-contained local HTML triage report (test | ref | diff), + /// reusing goldens.md's report emitter. Path printed to stderr; never committed. + pub report_path: Option<std::path::PathBuf>, +} + +/// Render BOTH scenes via buiy_core capture in ONE app run and diff with +/// `metric::compare`. Platform variance cancels because both halves share one +/// `wgpu::Device`, driver, atlas, and virtual clock. +pub fn run_reftest(case: &RefCase) -> RefOutcome; +``` + +`run_reftest` is the whole engine: + +1. Build a `DeterministicApp` (determinism.md) — one device, fixed virtual clock, fonts/atlas warmed, DPR + MSAA-off pinned. Both captures share it. +2. Capture `test` → `RgbaImage` via `golden::capture_to_image` (Ahem/obscure-text layout-font mode on by default per determinism.md, so text-bearing reftests assert boxes, not glyph fidelity — glyph fidelity is Tier 5). +3. Capture `reference` → `RgbaImage` in the **same** `App` (re-target the offscreen camera, re-readback — see Contract deviations). +4. `let diff = compare(&test_img, &ref_img, &CompareOpts::reftest_default());` — AA-aware (pixelmatch YIQ `colorDelta` + the antialias sibling test), since two CSS-subset code paths can legitimately differ by one AA pixel on a shared corner. +5. `Match`: `diff.passes(&case.fuzz)`. `Mismatch`: `!diff.passes(&case.fuzz)` **and** the fuzz floor is `(0,0)` (a `!=` whose budget tolerates difference is meaningless — assert this at macro-expansion time). + +`CompareOpts::reftest_default()` enables AA exclusion and the YIQ per-pixel decision; the secondary MSSIM (`image-compare`) channel is advisory and never gates a reftest. + +### The `reftest!` macro + +Generates one `#[test] #[ignore]` per pairing — keeps each case at the unit/integration tier under the existing `cargo test -- --ignored` GPU lane, no new CI infra, no manifest file (the type system *is* the manifest).
+ +```rust +reftest!(match, "container_query_collapse", cq_test, cq_reference); +reftest!(mismatch, "cv_hidden_actually_hides", cv_visible, cv_hidden); +reftest!(match, "flex_justify_end", flex_test, literal_offsets_ref, fuzz = (1, 8)); +``` + +Expansion (sketch): + +```rust +macro_rules! reftest { + ($kind:ident, $name:literal, $test:path, $reference:path $(, fuzz = ($d:literal, $p:literal))?) => { + #[test] + #[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] + fn $name() { + let case = $crate::reftest::RefCase { + name: $name, + kind: $crate::reftest::RefKind::reftest_kind(stringify!($kind)), + test: $test, + reference: $reference, + fuzz: reftest!(@fuzz $kind $(($d, $p))?), + }; + let outcome = $crate::reftest::run_reftest(&case); + assert!(outcome.passed, "reftest {} failed: {:?} (report: {:?})", + $name, outcome.diff, outcome.report_path); + } + }; + // mismatch with no explicit fuzz → (0,0); match with none → (0,0); macro + // rejects a non-zero floor on `mismatch` at compile time. +} +``` + +## Reference independence — the load-bearing discipline + +The whole bet (wpt-reftests/lessons.md, "Top of file"): **the reference must not use the feature under test.** A flex reference built with flex, or an `@container` reference built with `@container`, shares any bug and the comparison passes vacuously — the symmetric twin of the golden weakness named in the report's runner-up rejection. This is the report's Open Question #1; this spec closes it with three mechanisms, in priority order: + +1. **Route references through the primitive layer.** Buiy has a layer below Taffy/CSS-subset: `DrawData::new(position, size, color, radius)` (`render/mod.rs:78`/`:85`) and literal-positioned `Node` boxes that bypass the flex/grid/container-query solver entirely. A reference authored with literal offsets *cannot* share a layout-solver bug. This is the default and covers the bulk of pairings. + +2. 
**Lint-enforce disjointness (CI gate).** A `buiy_verify::reftest::lint` check, run as a `#[test]` (not `#[ignore]` — pure CPU), introspects each `RefCase`'s `reference` scene after building it into a headless no-GPU `App` and asserts the reference subtree carries **none** of the components the `test` exercises. Concretely, a declarative map keyed by feature: + + ```rust + /// What a reference scene is FORBIDDEN to contain, per feature under test. + /// Checked by component presence in the built ECS world — structural, not textual. + pub struct IndependenceRule { + pub feature: &'static str, + pub forbidden_in_reference: &'static [ComponentMarker], // e.g. ContainerQuery, ContentVisibility + } + pub fn assert_reference_independent(case: &RefCase, rules: &[IndependenceRule]); + ``` + + E.g. a `@container` pairing's reference must contain **zero** `ContainerQuery` components; a `content-visibility` pairing's reference must contain zero `ContentVisibility::Hidden`. The check is structural (query the built world for the forbidden component), so it cannot be fooled by a textual rename. Pairings whose feature has no registered rule fail the lint until a rule is added — independence is opt-out-impossible by construction. + + **Limit — value-encoded features fall to human review (and may be the majority).** The lint queries for forbidden *components*, but many CSS-subset features are not their own component: `justify-content`, `align-items`, `direction`, `writing-mode`, `gap`, and the like are *field values* on a shared `Style`/`Node` component that every flex/grid scene carries — including a legitimately-disjoint reference. Component-presence cannot distinguish "reference uses flex *via* `justify-content`" from "reference is a plain literal-offset box that happens to carry a default `Style`," so these features have **no usable structural rule** and fall to the PR-time review checklist below. 
This is not the residue — for a CSS-subset engine where layout is value-encoded on one `Style` component, value-encoded features may be the *majority* of pairings, and mechanism 1 (route references through the primitive `DrawData`/literal-`Node` layer, which carries no `Style` at all) is what keeps them independent; the lint backstops only the features that *do* have a distinct marker component (`ContainerQuery`, `ContentVisibility`, `TopLayer`, transforms). Where a value-encoded feature cannot route through the primitive layer, human review (checklist item a) is the only enforcement, and the reviewer must treat it as load-bearing, not a formality. + +3. **Multiple references where one disjoint path is impossible** (logical↔physical, transform↔literal where the literal still routes through one shared packer). Support `reference: &[fn(&mut App)]` semantics via a `RefCase::multi` constructor: for `Match`, **≥1** reference must match (OR); for `Mismatch`, **all** must mismatch (AND) — the WPT/Gecko aggregation (wpt-reftests/lessons.md Borrow #1). Build this into the harness deliberately; Blink supports neither multiple nor chained references, so it is not free (wpt-reftests/lessons.md Avoid row). + +**Review checklist** (PR-time, complements the lint): (a) does the reference invoke the feature under test? (lint catches the structural cases; reviewer catches semantic ones the marker map misses); (b) is the fuzz floor `(0,0)` for a `Mismatch`?; (c) does a `Match` with non-zero fuzz cite a measured run-to-run jitter reason, ranges not including 0 (Mozilla discipline, wpt-reftests/lessons.md Avoid)? 
+ +## Authoring patterns — mapped to Buiy's CSS-subset + +Each row is a `reftest!` pairing; the reference column is the disjoint oracle (wpt-reftests/lessons.md Borrow #6): + +| Feature under test | `test` scene | `reference` scene (disjoint) | kind | +|---|---|---|---| +| flex `justify-content: SpaceBetween` | three 40px boxes in a 200px flex row | three boxes at literal x = 0, 80, 160 via primitive layer | `match` | +| `@container` query resolution | widget whose style resolves via a container query | same tree with the resolved branch inlined as a plain `Style`, no `ContainerQuery` | `match` | +| `content-visibility: hidden` | subtree with `ContentVisibility::Hidden` | identical subtree, visible | `mismatch` | +| logical → physical mirror | logical-property layout (writing-mode/`direction`) | hand-authored physical-property mirror | `match` | +| `translate(50,50)` | element with `Translate(50,50,0)` | element authored at the translated literal coordinates | `match` | +| forced-colors visual residual | a scene under forced-colors mode | hand-authored reconstruction using only system tokens (coverage.md catalog) | `match` | + +**The forced-colors visual reftest is BLOCKED until the `BoxShadow` extract/draw path lands.** That row exercises the forced-colors `BoxShadow` draw-skip (shadows suppressed under forced colors), but `extract_buiy_nodes` has no `BoxShadow` branch today — it gains one only when `BoxShadow` gets a real extract/draw path (follow-ups.md:474–478, "extract_buiy_nodes has no such branch"). So this specific pairing is **specified now but not runnable** until that unlanded path exists; it must not be authored as a green test before then. The *structured* forced-colors checks cover the rest in the meantime: `analyze_forced_colors` / `analyze_shadow_only` over the live catalog (coverage.md § "Wiring `forced_colors_analyzer`", gate #11) gate the non-shadow forced-colors paint today, pure-CPU, with no dependency on the BoxShadow draw path. 
+ +`!=` anti-tests (the `mismatch` rows) prove a feature *does something* — guarding silent no-ops a `==` would pass vacuously on blank-vs-blank. + +**Do not reftest the unreftestable** (wpt-reftests/lessons.md Avoid): underline position/thickness, dotted/dashed/ridge/groove/double borders, focus-ring geometry, font-metric-dependent rendering — no feature-free reference reproduces them. Route those to Tiers 1–3 (snapshots.md / invariants.md) or the Tier-5 golden residue (goldens.md). The pyramid is the answer; do not force a reftest. + +## CPU-vs-GPU SDF cross-check (Tier 4.5) + +The golden-free rasterization oracle for the one property no markup reference can reach: **SDF corner AA**. Vello's pattern (vello/lessons.md "Top of file"), but *stronger* — Buiy's CPU oracle and GPU shader evaluate the **same closed-form `sdf_rounded_rect`** (`render/shader.wgsl:60`; CPU port at `tests/render_instance.rs:12`), so their agreement-to-tolerance is a *durable* invariant whose divergence localizes a real shader bug (wrong half-extent, radius clamp, premultiply, AA step). Keep it **permanently** — do not inherit Vello's "phase out the cross-check" posture (vello/lessons.md Avoid), which applies only to their two-independent-implementation case. + +Promote the CPU port from three scalar point-probes to a **full-tile rasterizer** (vello/lessons.md Borrow #1): + +```rust +// buiy_verify::reftest::sdf_oracle + +/// Pure-CPU per-pixel evaluation of the WGSL SDF + AA coverage step, mirroring +/// shader.wgsl:60/:76-:79 (fwidth → smoothstep(-aa, aa, d)) at the same logical-px +/// scale. The single source of the SDF formula is shared with the shader via a +/// doc-pinned port (the port and shader.wgsl must stay 1:1 — checked by a unit +/// test that re-derives the few sample points the existing render_instance.rs uses).
+pub fn rasterize_sdf_rect(draw: &buiy_core::render::DrawData, w: u32, h: u32) -> image::RgbaImage; + +/// Render the same single primitive on the GPU (one-instance capture) and on the +/// CPU oracle, diff with metric. Tolerates sub-pixel AA noise via `fuzz`; zero +/// stored bytes. Catches AA/implementation drift no reftest can. +pub fn run_sdf_cross_check(draw: &buiy_core::render::DrawData, fuzz: &FuzzBudget) -> RefOutcome; +``` + +**Boundary, stated once (vello/lessons.md):** the shared SDF catches *implementation* drift, not a *spec* error in the SDF itself — if `sdf_rounded_rect` is wrong, both paths are wrong identically and the buffer matches. That residual ("is the shape the *intended* shape") is exactly Tier 5's job; the oracle does not subsume goldens. Use the **same** AA-aware metric as reftests (the report's pixelmatch-YIQ+AA primary); FLIP-for-the-oracle-tier (vello/lessons.md Borrow #2) is deferred to metric.md's Open Question, not adopted here. + +## Determinism & the capture gate + +Reftests need the determinism stack *less* than goldens (both halves share clock/atlas/DPR in one run, so drift cancels) but reuse it (wpt-reftests/lessons.md Borrow #4 — the `reftest-wait` settle handshake). Before each readback, `run_reftest` asserts the settle condition determinism.md owns: **0 pending assets, glyph atlas warmed, virtual clock at an explicit timestamp, DPR pinned, MSAA off**. This is `wait_for_text_ready` (`tests/support/mod.rs:266`) + `fonts_ready` (`golden.rs:82`) generalized into `DeterministicApp`. Capturing a half-settled frame diffs a half-rendered scene — the WPT capture-before-settle pitfall. + +Both captures stay on the **same wgpu backend in the same process** — never a Vulkan-test-vs-Metal-reference pairing (wpt-reftests/lessons.md Avoid). Cross-platform confidence comes from running the whole suite on each *pinned* backend independently (lavapipe in CI, RADV here), not from cross-backend `==`. 
+ +## Dependencies + +- **No new external crate for the harness itself.** `buiy_verify` already depends on `buiy_core`, `bevy`, `image` 0.25, `proptest`, `serde` (`crates/buiy_verify/Cargo.toml`). `RgbaImage` is `image::RgbaImage`. +- The AA-aware metric (`metric.md`) owns the only new deps (pixelmatch-YIQ port + advisory `image-compare`); reftest consumes them transitively. The SDF oracle is pure `glam`/`image` arithmetic — no new dep. +- **`cargo deny check` note:** no dep is added *by this file*. Any new transitive crate is introduced and license-cleared in metric.md; reftest adds nothing to clear. + +## Verification — how the harness tests itself + +The harness is test infrastructure, so its own correctness needs meta-tests (pure CPU, **not** `#[ignore]`, run in the headless gate): + +1. **Aggregation truth table.** Unit-test `RefKind` + multi-reference OR/AND aggregation against a stub `compare` returning canned `Diff`s — `Match` passes iff within fuzz; `Mismatch` passes iff outside; multi `Match` is OR, multi `Mismatch` is AND. No GPU. +2. **Mismatch-floor guard.** Assert the macro/`run_reftest` rejects a `Mismatch` with a non-`(0,0)` fuzz floor (a `!=` that tolerates difference is vacuous). +3. **Known-good / known-bad pairs (GPU, `#[ignore]`).** A `match` pairing of a scene with *itself* must pass with `(0,0)` (proves capture determinism — the existing `render_golden_harness.rs` re-capture discipline). A `match` pairing of two deliberately-different scenes must **fail** (proves the harness can fail — guards a vacuous green). A `mismatch` of a scene with itself must fail. +4. **Independence lint self-test.** A reference scene that *illegally* contains the forbidden component must trip `assert_reference_independent` (RED), and the canonical disjoint reference must pass (GREEN) — the lint is itself tested, not trusted. +5. **SDF oracle vs. 
point-probes.** `rasterize_sdf_rect` must reproduce the scalar `d` values the existing `tests/render_instance.rs` point-probes assert (center inside, 2× half-extent outside) — pins the full-tile port to the unit-tested formula, pure CPU. + +## Sources + +- Code: `crates/buiy_core/tests/support/mod.rs` (capture/readback: `gpu_render_app`/`render_to_image`/`readback_rgba`/`wait_for_text_ready` :134/:204/:353/:266), `crates/buiy_core/src/render/golden.rs` (`GoldenConfig::deterministic`/`fonts_ready` :38/:82), `crates/buiy_core/src/render/shader.wgsl:60` + `tests/render_instance.rs:12` (the shared SDF + its CPU port), `crates/buiy_core/src/render/mod.rs:78` (`DrawData`), `crates/buiy_core/tests/render_golden_harness.rs` (re-capture discipline), `crates/buiy_core/src/layout/systems.rs:3775` (`compose_transform`, the transform-reftest oracle), `crates/buiy_verify/src/visual.rs` (superseded RMSE). +- Prior art: `docs/prior-art/wpt-reftests/lessons.md` (Top-of-file oracle finding, Validates, Avoid, Borrow #1/#4/#6, Open Questions #1/#2), `docs/prior-art/vello/lessons.md` (CPU-vs-GPU cross-check, Top-of-file "stronger oracle", Borrow #1/#2, the spec-error boundary). +- Report: `docs/reports/2026-06-14-visual-bug-detection-strategy.md` § "Tier 4 — reftests (`==` / `!=`)" + "The Vello-style CPU-vs-GPU cross-check" + Open Questions #1/#2. diff --git a/docs/specs/2026-06-15-buiy-verification-design/snapshots.md b/docs/specs/2026-06-15-buiy-verification-design/snapshots.md new file mode 100644 index 0000000..df4d109 --- /dev/null +++ b/docs/specs/2026-06-15-buiy-verification-design/snapshots.md @@ -0,0 +1,277 @@ +# Tiers 1–2 — structured snapshots + +**Date:** 2026-06-15 +**Status:** draft +**Spec:** specs/2026-06-15-buiy-verification-design/README.md + +The two cheapest, most deterministic rungs of the pyramid: **Tier 1** snapshots resolved +layout numbers per fixture (gate #5), **Tier 2** snapshots the whole CPU display-list / +paint-order / instance handoff holistically. 
Both replace today's low-density, +field-by-field `assert_eq!` in `render_*.rs` / `layout.rs` with one `insta` snapshot per +fixture, plus a byte-exact `Pod` hex check on `PackedInstance`. Pure-CPU, headless, in the +standard `cargo test` gate — no GPU, no window, sub-millisecond, 100% deterministic. + +## Why a Display dump, not serde `Debug`/JSON + +The report (§ Tier 2) is explicit: **do not snapshot raw `Debug`, and do not snapshot a +serde dump of the structs.** A serde/`Debug` snapshot couples the artifact to private field +names and `Entity` allocation bits (which vary with spawn order), so every struct refactor +re-blesses every snapshot and every unrelated spawn churns the diff. We instead emit a +purpose-built `Display` dump: one paint command per line, entities rendered by `Name`, floats +rounded, with a format-version header. The dump is the durable contract; the structs underneath +are free to churn. This is the Flutter `toStringDeep` / WebRender RON-display-list pattern, and +it is the one tier `masonry_testing` skips (it jumps straight to PNG goldens — the gap Buiy must +not replicate). **Consequence: no new `serde` derives are added to render types.** `serde` / +`serde_json` are already workspace deps (`crates/buiy_verify/Cargo.toml`) and stay unused by this +tier; the only new dependency is `insta`. + +## Dependency: `insta` + +Add to the workspace (`Cargo.toml [workspace.dependencies]`): + +```toml +insta = { version = "1", features = ["glob"] } # "glob" drives fixture-dir enrollment +``` + +`buiy_verify` and `buiy_core` (dev-dependency) consume it via `insta.workspace = true`. `insta` +is MIT/Apache-2.0 (already in the allow-list) and pulls `similar`, `console`, `linked-hash-map` +— all permissive. **`cargo deny check` is a required gate before this lands**; if any transitive +license is new it fails CI by design (deny.toml `[licenses]` is allow-list-only) and must be added +explicitly with its SPDX id, never an exception hack. 
The `cargo-insta` CLI (the review tool) is a +developer tool installed via `cargo install cargo-insta`, **not** a dependency — CI never needs it +(`INSTA_UPDATE=no` is the CI default, so an unreviewed `.snap.new` fails the build). + +## Tier 1 — layout-number snapshots (gate #5) + +### Public API (`buiy_verify::snapshot`) + +```rust +/// Run the layout pipeline on `app`, then snapshot every entity's resolved box +/// as a stable Display dump. Asserts via `insta::assert_snapshot!` under `name`. +/// Pure-CPU: MinimalPlugins + CorePlugin + LayoutPlugin, no RenderApp, one `update()`. +pub fn assert_layout_snapshot(app: &mut App, name: &str); + +/// The format-versioned Display dump backing `assert_layout_snapshot` — `(name, position, size)` +/// per entity, sorted by `Name` then `Entity` index, floats rounded to `ROUND_DP`. +pub fn layout_dump(world: &World) -> String; +``` + +Dump format (`layout_dump`), version-headered so a format change is a single visible line: + +``` +# buiy-layout-dump v1 +root pos=0,0 size=200,100 + row.item[0] pos=0,0 size=50,50 + row.item[1] pos=50,0 size=50,50 +``` + +- Entities are named by their `Name` component (`bevy::prelude::Name`); fixtures **must** set + one (`Name::new("root")`). An unnamed entity falls back to `entity#<idx>` — flagged, because + an unnamed fixture is non-diff-stable across refactors. (`Name` is not currently spawned in + `buiy_core`; fixtures opt in. The dump never prints raw `Entity` bits.) +- Tree indentation follows `ChildOf`; siblings ordered by `Name` (document order is unstable + under ECS archetype moves, so `Name` is the sort key). +- Floats rounded to `ROUND_DP = 2` decimals (`const ROUND_DP: usize`) via a shared + `round(f32) -> String` helper (Tier 1 + Tier 2 share it) — kills last-ULP churn from the + Taffy/clip-space math while staying diff-readable.
+ +### What it replaces + +`crates/buiy_core/tests/layout.rs:33` — the `assert!((layout.size.x - 50.0).abs() < 0.5)` +pair in `layout_resolves_a_simple_flex_row` becomes one `assert_layout_snapshot(&mut app, +"flex_row_basic")`. The two GC tests (`layout_tree_garbage_collects_*`) assert `LayoutTree` +*cardinality*, not geometry — they stay as plain `assert_eq!` (snapshotting a length is +lower-density than the assert). Tier 1 replaces only the *geometry* asserts. + +Taffy's WPT-derived corpus (`docs/prior-art/taffy/lessons.md`) is importable as fixtures here +to exercise Buiy's Taffy bridge — the coverage matrix (`coverage.md`) auto-enrolls them. + +## Tier 2 — display-list / paint-order / instance snapshots + +### Public API (`buiy_verify::snapshot`) + +```rust +/// Snapshot the CPU display-list handoff holistically: ExtractedNodes order + +/// InstanceBuckets draw order + per-instance paint params, as one Display dump. +/// Pure-CPU — runs the extract/pack path, never a GPU. `name` keys the `.snap`. +pub fn assert_display_list_snapshot(nodes: &ExtractedNodes, name: &str, names: &NameLookup); + +/// Display dump of an ExtractedNodes set: nodes in `painters_z` order, then the +/// pack_view() InstanceBuckets in BTreeMap (draw) order. Entities by Name. +pub fn display_list_dump(nodes: &ExtractedNodes, names: &NameLookup) -> String; + +/// Resolve Entity -> human name for the dump (Name component, else `entity#idx`). +/// Built from the world once; passed in so the dump fn stays World-free/pure. 
+pub struct NameLookup(/* HashMap<Entity, String> */); +impl NameLookup { pub fn from_world(world: &World) -> Self; } +``` + +Dump format (`display_list_dump`), version-headered: + +``` +# buiy-display-list-dump v1 +[nodes painters_z] +0 modal rect pos=10,20 size=100,40 color=token:Surface clip=none group=none +1 tooltip rect pos=0,0 size=80,24 color=#ffffffff clip=0,0..80,24 group=0 +[buckets draw-order] +(Quad,layer=0) x2 +(Glyph,layer=1) x5 +``` + +- **One paint command per line.** `ExtractedNodes.nodes` is emitted in stored order (it is + *never* re-sorted by render — `extract.rs:141` — so the snapshot is the paint order, and a + z-sort regression shows as a line reorder, the exact bug class pixels name poorly). +- **Color rendered as a token when resolvable, else `#rrggbbaa`.** `ExtractedNode.color` is + already theme-resolved (`extract.rs:77`), so a literal hex in a snapshot that should show a + token is itself a regression signal (the magenta `MISSING_TOKEN_FALLBACK` sentinel surfaces + as `#ff00ffff`). +- **`InstanceBuckets` appended in `BTreeMap` key order** (`buckets.rs:113`, `(layer, primitive + paint-order)`) — the natural iteration *is* the deterministic draw order, so the dump pins + both the per-node set and the batched draw order in one artifact. Per-batch instance *counts* + go in the dump; the exact `[f32;13]` payload is pinned by the byte-hex check below (counts in + the readable dump, bytes in the strict one — complementary, per report § Tier 2). +- Floats rounded to `ROUND_DP` via the shared helper; `clip=none` for the `None` full-view + sentinel (`extract.rs:83`), else `min..max`; `group=<id>|none` for `ExtractedNode.group`. + +### The byte-exact `PackedInstance` hex check + +`PackedInstance` is `#[repr(C)] Pod/Zeroable` (`render/instance.rs:41`), 52 bytes = `[f32;13]` +(pos2/size2/rgba4/radius1/clip_min2/clip_max2 — confirmed `instance.rs:42`–`:58`, +`PACKED_INSTANCE_STRIDE_BYTES = 52`).
It is byte-snapshottable *now* with no new derive — a +deterministic, stricter, formatter-free regression on the px→logical packing: + +```rust +/// Hex-dump a packed instance as `bytemuck::bytes_of(p)` — a byte-exact snapshot +/// of the GPU upload payload, independent of the Display dump's format version. +pub fn instance_hex(p: &PackedInstance) -> String; // 104 hex chars +pub fn assert_instance_hex_snapshot(p: &PackedInstance, name: &str); +``` + +This is the complement the report mandates: the `Display` dump is diff-readable but +format-versioned; the hex dump is opaque but byte-exact and format-free. A packing arithmetic +change (e.g. the half-size sign bug `render_instance.rs` already regression-tests) flips the +hex even if the rounded Display dump rounds it away. **Endianness note:** `bytes_of` is +host-endian; CI and dev are both little-endian x86-64, and the hex is a within-repo regression +artifact (not a cross-host wire format), so this is acceptable — documented in the fn so a +big-endian CI host would be a conscious change. 
+ +### What it replaces + +The low-density per-field `assert_eq!` named in the report become holistic snapshots: + +| Test file | Today | After | +|---|---|---| +| `tests/render_extract.rs` (459 L) | `assert_eq!(node.position, …)`, `node.size`, `node.color`, `node.clip`, the `assemble_context_tree` order `assert_eq!(got, vec![root,a,nested,c,d,b])` (`:423`) | `assert_display_list_snapshot` over the assembled `ExtractedNodes` | +| `tests/render_buckets.rs` (385 L) | `b.len(q0)`, `total_instances`, `batch[0] == expect`, the `PackedPartition` field asserts (`:239`) | display-list dump (counts + draw order) + `assert_instance_hex_snapshot` for the exact payload | +| `tests/render_paint_order.rs` (135 L) | `assert_eq!(tail, vec![fullscreen,tooltip,popover,modal])` (`:64`) | display-list dump of the assembled order (the tail ordering reads off the node lines) | +| `tests/render_instance.rs` (168 L) | per-field `PackedInstance` asserts incl. the half-size sign regression | `assert_instance_hex_snapshot` (byte-exact; the sign bug flips the hex) | +| `tests/top_layer.rs` | `partition_top_layer` order asserts | display-list dump of `partition_top_layer` output | +| `tests/layout.rs:33` | geometry `assert!` | `assert_layout_snapshot` (Tier 1) | + +**Replace, don't duplicate.** Each migrated test keeps its *scene construction* and *intent +comment*; only the trailing assert block collapses into one snapshot line. Asserts that pin a +**single named invariant** (e.g. `render_buckets.rs:9` `Shadow.paint_order() < Quad…`, or the +GC cardinality checks) stay as `assert!`/`assert_eq!` — a snapshot of one boolean is *lower* +density, which the report's "lowest tier that covers the behavior" rule rejects. The migration +is "holistic state → snapshot; single named property → keep the assert." 
+ +### Per-timestamp animation snapshots (Tier 2, opt-in — Decision 8) + +Animation timing lives one tier down from pixels: the easing/interpolation curve is +fully observable in the deterministic CPU display-list, so temporal coverage is a +**display-list snapshot sampled at stepped virtual timestamps**, not a pixel sequence +(open-questions Decision 8). It is **opt-in per fixture** — default is end-state only +(the static golden covers the resting frame) — and a fixture enrolls only when its +*timing curve* is the behavior under test (a custom easing, a staged reveal, the caret +blink). When it does, the default sampling is **three logical timestamps** (`t=0`, mid, +end); a curve that demands more names them per fixture. + +The entry point drives the same manually-advanced `Time` clock the determinism +stack mandates (`determinism.md` § "Async-asset flush" — `fixed_clock` is "drive +`Time` at explicit virtual timestamps"), snapshotting the display-list dump at +each step: + +```rust +/// Snapshot the display-list dump at each virtual timestamp in `steps`, advancing +/// `Time` to each absolute logical time (NOT wall-clock) between captures. +/// One `.snap` per step, keyed `<name>@<ms>` (e.g. `caret_blink@0`, +/// `caret_blink@250`, `caret_blink@500`), so a timing regression shows as a diff in +/// exactly the frame whose curve drifted. Pure-CPU, no GPU — the dump is a text +/// artifact (snapshots.md), so a 3-sample sequence costs ~3× a single dump, not a +/// pixel capture. Each step runs the extract path and emits `display_list_dump`. +pub fn assert_display_list_snapshot_at( + app: &mut App, + name: &str, + steps: &[std::time::Duration], // e.g.
&[ZERO, mid, end] — three by default +); +``` + +The fixed clock makes the sequence deterministic: every step is an explicit +`Time::advance_to`/`advance_by` (the landed manual-clock mechanism, +`tests/text_caret_selection.rs:178`), so the same timestamps reproduce byte-identical +dumps across hosts and runs — the determinism stack's clock guarantee is exactly what +makes per-timestamp snapshots cheap and stable. Pixel-level temporal coverage stays +reserved for the rare fixture whose *rasterized* output changes per-frame in a way the +display list cannot express (Decision 8 runner-up rejection); the default temporal +altitude is this structured per-timestamp dump. + +## The `cargo insta review` accept loop + +`cargo insta review` *is* the `--accept` UX the report requires (`a`/`r`/`s` per change, +rewrites the `.snap` on accept). No bespoke env flag is added — this is the native analogue of +the in-repo `BUIY_ACCEPT_SHAPING` curated flow (`tests/text_shaping_snapshots.rs`), and the +discipline is identical: **a snapshot change is a behavior change — review the diff before +accepting.** `INSTA_UPDATE` defaults to `auto`, which behaves as `no` when a CI environment is +detected, so in CI an unreviewed snapshot change fails the +build. `.snap` files live beside their tests (`crates/buiy_core/tests/snapshots/`, +`crates/buiy_verify/tests/snapshots/`) — text, diff-readable, in git, zero binary blobs. + +## Contract deviations + +- **`serde` additions explicitly NOT taken.** The contract's `snapshot` bullet lists "the serde + additions needed (ResolvedLayout, ExtractedNode, DrawData/InstanceData) **or** a Display dump + formatter approach (preferred per report)". This spec takes the Display-dump branch + exclusively and adds **no** serde derives — the report (§ Tier 2) is explicit that raw + Debug/serde snapshots are the anti-pattern. Flagged for the synthesizer only because it + resolves the contract's "or" to one branch.
+- **`assert_display_list_snapshot` signature** takes `&NameLookup` (a `World`-free entity→name + map) rather than the contract's bare `(nodes, name)`. Required because the dump renders + entities by `Name`, and `ExtractedNode` carries only an `Entity`, not its `Name`. Pure-fn + hygiene: the dump stays `World`-free; the lookup is built once via `NameLookup::from_world`. + +## Verification + +How the harness verifies *itself* (the snapshot tooling is load-bearing, so it gets its own +non-snapshot tests): + +1. **Determinism of the dump.** A unit test builds one fixture, calls `layout_dump` / + `display_list_dump` twice on independent `App`s spawned in **different entity order**, and + `assert_eq!`s the two strings — proving the dump is invariant to `Entity` allocation order + (the property the `Name`-keyed sort exists to guarantee). This is a plain `assert_eq!`, not a + snapshot, so the meta-test cannot pass vacuously. +2. **Float rounding.** `round(1.005) == "1.0"`-class table tests on the shared helper pin the + `ROUND_DP` behavior, including negative and sub-ULP inputs. +3. **Hex round-trips bytes.** `instance_hex` then `hex → bytes → bytemuck::pod_read_unaligned` + reconstructs the original `PackedInstance` (`assert_eq!`), proving the hex is lossless and + matches the GPU upload payload. +4. **Format-version tripwire.** A test asserts the dump's first line equals the current + `vN` header constant — so a formatter edit that should bump the version but didn't fails + here (answering Open Q #5: format changes are a conscious, version-gated re-bless). +5. **Migration is behavior-preserving.** Each migrated test's first run blesses the snapshot; + reviewers diff the new `.snap` against the *old* per-field asserts to confirm the snapshot + encodes the same facts (the half-size sign regression in `render_instance.rs` must still + fail when re-introduced — verified by a mutation check during the migration plan). +6. 
**Standard gate.** Everything runs under `xvfb-run -a cargo test --workspace` with **no** + `--ignored` and no GPU adapter — the headless gate stays green on a CI host with no GPU. + +## Sources + +Code: `render/extract.rs:65`/`:139`/`:141`/`:77`/`:83` (ExtractedNode/ExtractedNodes, never +re-sorted), `render/instance.rs:41`/`:42` (PackedInstance Pod, `[f32;13]`/52 B), +`render/buckets.rs:86`/`:113`/`:196` (InstanceBuckets BTreeMap draw order, PackedPartition), +`components.rs:25`/`:82` (ResolvedLayout, StackingContext), `render/golden.rs` (GoldenConfig / +the `BUIY_ACCEPT_SHAPING` accept-flow analogue). Tests replaced: +`tests/{layout,render_extract,render_buckets,render_paint_order,render_instance,top_layer}.rs`, +`tests/text_shaping_snapshots.rs` (the in-repo structured-snapshot + curated-accept precedent). +Prior-art: `docs/prior-art/taffy/lessons.md` (WPT layout corpus), +`docs/prior-art/xilem-masonry/lessons.md` (`insta`, the skipped structured tier). Report: +`docs/reports/2026-06-14-visual-bug-detection-strategy.md` §§ Tier 1, Tier 2, Open Q #5. From 325b1e804f4dc79ebe3111f5df59cc4b18036d26 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 04:42:37 -0700 Subject: [PATCH 02/70] build(verify): add image-compare + insta deps (deny-gated) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 0.1 of the verification pyramid: the advisory MSSIM channel (image-compare) and the tier-1/2 snapshot driver (insta, glob feature) land in buiy_verify with exact patch pins. cargo deny check passes; any new transitive license is added explicitly to deny.toml's allow list. pixelmatch is NOT added here — Phase 1a vendors its algorithm. No code consumes them yet — the metric/snapshot modules land in Phase 1/2. insta pinned to =1.48.0 (latest 1.x patch at impl time, not the plan's =1.43.2 placeholder, per the plan's 'pin the exact latest 1.x' directive). 
Spec: docs/specs/2026-06-15-buiy-verification-design/metric.md § Crate choice. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/Cargo.toml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/crates/buiy_verify/Cargo.toml b/crates/buiy_verify/Cargo.toml index 93b0913..dc0aa0a 100644 --- a/crates/buiy_verify/Cargo.toml +++ b/crates/buiy_verify/Cargo.toml @@ -11,3 +11,13 @@ serde.workspace = true serde_json.workspace = true image.workspace = true proptest.workspace = true +# Advisory MSSIM channel (metric.md § "Advisory MSSIM"): catches global +# gamma/blend drift a small pixel budget under-weights. NEVER the primary +# gate — surfaced as `Diff::mssim: Option<f64>`. The `cargo deny check` below +# confirms its license set + no RUSTSEC advisories. +image-compare = "=0.5.0" +# Tier-1/2 snapshot assertions (snapshots.md): insta drives the layout-number +# and display-list `Display` dumps. Dev-time crate, but lives in `[dependencies]` +# because the harness re-exports snapshot helpers from `src/`. The `glob` feature +# drives the coverage fixture-dir fan-out (Phase 4). +insta = { version = "=1.48.0", features = ["glob"] } From a15dbc032af7a164da3a362f022ea9cf77bee812 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 04:53:11 -0700 Subject: [PATCH 03/70] build(core): add buiy_verify as a dev-dependency (dev-only cycle) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 0.2 of the verification pyramid: the #[ignore] GPU re-capture tests in tests/text_*_gpu.rs migrate (Phase 1a) off the deprecated L1 perceptual_diff onto buiy_verify::metric::compare, so buiy_core's tests need to name buiy_verify. Added under [dev-dependencies] only — this forms a DEV-ONLY cycle (core → verify → core) that Cargo permits because dev-dep edges are excluded from the normal build graph. Confined to #[cfg(test)]. Spec: docs/specs/2026-06-15-buiy-verification-design/metric.md § Migration.
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_core/Cargo.toml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/crates/buiy_core/Cargo.toml b/crates/buiy_core/Cargo.toml index b3bb8a9..7bf2b3f 100644 --- a/crates/buiy_core/Cargo.toml +++ b/crates/buiy_core/Cargo.toml @@ -47,3 +47,11 @@ default_font = [] [dev-dependencies] naga = "27" +# Dev-only dependency edge for the #[ignore] GPU re-capture tests, which +# migrate off the deprecated `render::golden::perceptual_diff` (L1) onto +# `buiy_verify::metric::compare` (metric.md § Migration). This forms a +# DEV-ONLY cycle (buiy_core → buiy_verify → buiy_core): a [dev-dependencies] +# edge is excluded from the normal build graph, so Cargo permits it, the +# production `cargo build -p buiy_core` is unaffected, and it adds no +# `cargo deny` surface. Confined to #[cfg(test)]. +buiy_verify = { path = "../buiy_verify" } From a90b599aa0758ddeb4a4a37712ae5ee911a25b59 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 04:56:14 -0700 Subject: [PATCH 04/70] feat(core): canonical Dpr milliscale type in render::golden MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 0.3 of the verification pyramid: Dpr is device-pixel-ratio as integer milliscale (1000 = 1×, 2000 = 2×) so it is Eq+Hash+Ord — a fixture axis that keys goldens/coverage cells, never a tolerance. Defined ONCE here; goldens and coverage import it. from_f32/as_f32 round-trip the window's f32 scale_factor at the capture boundary; serde-derived for the bless ledger. Added serde.workspace = true to buiy_core [dependencies]: the plan made this conditional on 'if serde isn't already a direct dep'. Verified it was NOT (buiy_core's src had no serde use and the manifest no serde line), and the derive emits ::serde:: paths that bevy's re-export does not satisfy, so the direct dep is required. Rides the workspace serde pin — no new crate. 
Spec: docs/specs/2026-06-15-buiy-verification-design/determinism.md § Extending GoldenConfig. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_core/Cargo.toml | 7 +++ crates/buiy_core/src/render/golden.rs | 66 +++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/crates/buiy_core/Cargo.toml b/crates/buiy_core/Cargo.toml index 7bf2b3f..73ae1ec 100644 --- a/crates/buiy_core/Cargo.toml +++ b/crates/buiy_core/Cargo.toml @@ -36,6 +36,13 @@ sys-locale = "0.3" # Version-synced to cosmic-text 0.19's pin (0.5.8); an upstream bump to 0.6 # surfaces here as a loud type-mismatch compile error, by design. unicode-script = "0.5" +# The canonical `render::golden::Dpr` derives `serde::{Serialize, Deserialize}` +# so the verification golden bless ledger (`buiy_verify`) can persist it +# directly. `serde` is a workspace dep already; buiy_core names it directly +# because the derive emits `::serde::…` paths (bevy's re-export does not satisfy +# a bare `serde::Serialize` derive). Rides the workspace `serde` pin — no new +# crate enters the tree. +serde.workspace = true [features] default = ["default_font"] diff --git a/crates/buiy_core/src/render/golden.rs b/crates/buiy_core/src/render/golden.rs index 992cc60..33eabac 100644 --- a/crates/buiy_core/src/render/golden.rs +++ b/crates/buiy_core/src/render/golden.rs @@ -45,6 +45,39 @@ impl GoldenConfig { } } +/// **Canonical device-pixel-ratio type.** Integer *milliscale* (1000 = 1.0×, +/// 2000 = 2.0×) so it is `Eq + Hash + Ord` without float pitfalls — it is a +/// *fixture axis* that keys a golden / coverage cell, **never** a tolerance. +/// +/// Defined ONCE here; `buiy_verify::golden::GoldenKey.dpr` and +/// `buiy_verify::coverage::{Matrix.dprs, CoverageKey.dpr}` import this type, +/// they do **not** redefine it (verification-design `determinism.md`). 
The +/// capture boundary converts the window's `f32` `scale_factor` via +/// [`Dpr::from_f32`] and back via [`Dpr::as_f32`] when sizing the offscreen +/// target. Derives `serde` so the golden bless ledger can persist it directly. +#[derive( + Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, serde::Serialize, serde::Deserialize, +)] +pub struct Dpr(pub u32); + +impl Dpr { + /// 1.0× device-pixel-ratio (the headless capture default). + pub const X1: Self = Dpr(1000); + /// 2.0× device-pixel-ratio (the HiDPI fixture axis). + pub const X2: Self = Dpr(2000); + + /// Round an `f32` scale factor to integer milliscale (`1.0 → Dpr(1000)`). + /// Rounds to nearest so a `1.5×` window maps to `Dpr(1500)` exactly. + pub fn from_f32(scale: f32) -> Self { + Dpr((scale * 1000.0).round() as u32) + } + + /// Back to the `f32` scale factor the window / extract path consumes. + pub fn as_f32(&self) -> f32 { + self.0 as f32 / 1000.0 + } +} + /// Perceptual difference between two RGBA8 frames, as a normalized mean /// per-channel difference in `[0.0, 1.0]` (0 == identical). Comparison is /// *perceptual*, not exact byte equality (§ 4.2): sub-LSB float jitter in the @@ -86,3 +119,36 @@ pub fn fonts_ready( ) -> bool { warmup.is_empty() && visible_keys.iter().all(|key| atlas.get(key).is_some()) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn dpr_milliscale_round_trips_f32() { + // The canonical fixture axis: integer milliscale so it is Eq+Hash+Ord, + // but it must convert losslessly to/from the f32 scale_factor the + // window/extract path carries (determinism.md § Extending GoldenConfig). + assert_eq!(Dpr::from_f32(1.0), Dpr::X1); + assert_eq!(Dpr::from_f32(2.0), Dpr::X2); + assert_eq!(Dpr::X1.as_f32(), 1.0); + assert_eq!(Dpr::X2.as_f32(), 2.0); + // Round-trip through both directions for a fractional ratio (1.5×). 
+ assert_eq!(Dpr::from_f32(1.5), Dpr(1500)); + assert_eq!(Dpr(1500).as_f32(), 1.5); + // from_f32 rounds to nearest milliscale (no truncation drift). + assert_eq!(Dpr::from_f32(1.2345), Dpr(1235)); + } + + #[test] + fn dpr_is_ord_and_hashable() { + // It keys a golden/coverage cell, so Ord + Hash must hold (the reason + // for milliscale over f32). A plain compile-and-run proof. + use std::collections::HashSet; + assert!(Dpr::X1 < Dpr::X2); + let mut set = HashSet::new(); + assert!(set.insert(Dpr::X1)); + assert!(!set.insert(Dpr::X1)); // already present — Hash + Eq agree + assert!(set.insert(Dpr::X2)); + } +} From 7e9d0504748d0ec035bd38189c0a004d1ca85156 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 05:07:40 -0700 Subject: [PATCH 05/70] feat(core): promote capture_to_image into render::golden src MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 0.4 of the verification pyramid: the shared GPU capture seam moves out of tests/support into render::golden src as capture_to_image(&mut App, &GoldenConfig) -> image::RgbaImage, so buiy_verify's reftest + golden tiers can call it. Sizes the offscreen target to the window's physical pixel grid, paints under CAPTURE_MSAA (single- sampled, dither off), and reads back into an RgbaImage. buiy_core gains image as a direct dep (README § Crate-dependency note: the only new GPU dep). #[ignore] GPU meta-test asserts physical dimensions + non-vacuous paint. readback_rgba_into is promoted to pub alongside capture_to_image; the tests/support readback_rgba now delegates to it so the readback poll + the 256-byte row-padding strip live in exactly one place (anti-drift). The dead CapturedBytes resource + Readback/ReadbackComplete/Mutex imports drop from tests/support as a result. Phase-0 scope is the capture mechanics; the four-condition quiescence flush and the scale_factor==dpr assertion are Phase 3.3's hardening. 
Spec: docs/specs/2026-06-15-buiy-verification-design/determinism.md § Where the code lives. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_core/Cargo.toml | 5 + crates/buiy_core/src/render/golden.rs | 173 ++++++++++++++++++ .../buiy_core/tests/render_golden_harness.rs | 65 +++++++ crates/buiy_core/tests/support/mod.rs | 82 +-------- 4 files changed, 251 insertions(+), 74 deletions(-) diff --git a/crates/buiy_core/Cargo.toml b/crates/buiy_core/Cargo.toml index 73ae1ec..016b096 100644 --- a/crates/buiy_core/Cargo.toml +++ b/crates/buiy_core/Cargo.toml @@ -43,6 +43,11 @@ unicode-script = "0.5" # a bare `serde::Serialize` derive). Rides the workspace `serde` pin — no new # crate enters the tree. serde.workspace = true +# The promoted `render::golden::capture_to_image` returns an +# `image::RgbaImage` (verification-design README § Crate-dependency note: the +# ONLY new GPU dep buiy_core gains). Rides the existing workspace `image` +# pin — no second image-decode stack enters the tree. +image.workspace = true [features] default = ["default_font"] diff --git a/crates/buiy_core/src/render/golden.rs b/crates/buiy_core/src/render/golden.rs index 33eabac..d063178 100644 --- a/crates/buiy_core/src/render/golden.rs +++ b/crates/buiy_core/src/render/golden.rs @@ -78,6 +78,179 @@ impl Dpr { } } +/// Single-sampled capture: a 4× MSAA resolve antialiases edges +/// nondeterministically across drivers, while Buiy's in-shader analytic AA is +/// deterministic given identical FP — so MSAA buys nothing here and costs +/// determinism. Mirrors the capture camera's landed `Msaa::Off` +/// (verification-design `determinism.md`). +pub const CAPTURE_MSAA: bevy::render::view::Msaa = bevy::render::view::Msaa::Off; + +/// Deband dither perturbs the low bits of the tonemapped output; the capture +/// camera pins it off. A `true` sentinel the capture path documents (the +/// camera spawns with no `DebandDither::Enabled`). 
+pub const CAPTURE_DITHER_OFF: bool = true; + +/// **The shared capture seam** (verification-design README § Architecture): +/// render the already-built, fixture-populated `app` into an offscreen target +/// sized to the window's PHYSICAL pixel grid and read it back as an +/// `image::RgbaImage`. Re-runnable against one `App` (a reftest calls it twice +/// on one device; spec § "Resolved during synthesis" #4). +/// +/// Phase-0 scope: the capture mechanics (size-to-physical, paint, readback, +/// assemble). The four-condition quiescence flush and the +/// `scale_factor == cfg.dpr` assertion are Phase 3.3's hardening of this same +/// function (`determinism.md` § Async-asset flush). +/// +/// Drives `MAX_CAPTURE_FRAMES` update frames after finishing the app (pipeline +/// async-compile + extract + prepare + paint settle), then reads back the +/// offscreen target's un-padded RGBA8 bytes. +pub fn capture_to_image(app: &mut bevy::app::App, _cfg: &GoldenConfig) -> image::RgbaImage { + use bevy::asset::RenderAssetUsages; + use bevy::camera::RenderTarget; + use bevy::image::Image; + use bevy::prelude::*; + use bevy::render::render_resource::{TextureFormat, TextureUsages}; + + // Physical pixel grid the offscreen target must match: the primary + // window's physical size (logical × scale_factor), which the view uniform + // is built from (extract fills `logical_size` from the primary window). + let (phys_w, phys_h) = { + let window = app + .world_mut() + .query::<&bevy::window::Window>() + .single(app.world()) + .expect("primary window for capture sizing"); + let r = window.resolution.physical_size(); + (r.x, r.y) + }; + + // Offscreen Rgba8UnormSrgb target with COPY_SRC for the readback copy and + // RenderAssetUsages::all() so the GpuImage exists in the render world. 
+ let target = { + let mut image = + Image::new_target_texture(phys_w, phys_h, TextureFormat::Rgba8UnormSrgb, None); + image.texture_descriptor.usage |= TextureUsages::COPY_SRC; + image.asset_usage = RenderAssetUsages::all(); + app.world_mut().resource_mut::>().add(image) + }; + + // Capture camera: opaque-black clear, CAPTURE_MSAA (single-sampled), + // dither off (bare Camera2d at Msaa::Off carries no DebandDither::Enabled). + app.world_mut().spawn(( + Camera2d, + RenderTarget::from(target.clone()), + CAPTURE_MSAA, + Camera { + clear_color: ClearColorConfig::Custom(Color::BLACK), + ..default() + }, + )); + + // Finish materializes the device + pipelines; drive frames so layout → + // extract → prepare → paint settle before the readback poll. + const MAX_CAPTURE_FRAMES: usize = 3; + app.finish(); + app.cleanup(); + for _ in 0..MAX_CAPTURE_FRAMES { + app.update(); + } + + let bytes = readback_rgba_into(app, &target, phys_w, phys_h); + image::RgbaImage::from_raw(phys_w, phys_h, bytes) + .expect("readback byte count matches phys_w * phys_h * 4") +} + +/// Resource cell the `ReadbackComplete` observer writes the captured bytes +/// into. `Arc>` so the observer (which `move`s its capture) and the +/// poll loop share one slot. The src twin of the test-support `CapturedBytes`. +#[derive(bevy::ecs::resource::Resource, Clone, Default)] +struct CapturedBytes(std::sync::Arc>>>); + +/// Spawn `Readback::texture(target)`, observe its `ReadbackComplete`, and POLL +/// `app.update()` until the bytes arrive — condition-based, NOT a fixed frame +/// count: the pipeline async-compiles, prepares, paints, copies, and maps +/// across several frames, so the number of frames is not knowable up front. +/// Bounded by `MAX_FRAMES`; panics with a clear message if the readback never +/// fires. +/// +/// Returns the un-padded `w*h*4` RGBA8 bytes. 
The raw readback buffer keeps +/// wgpu's 256-byte ROW PADDING whenever `w * 4` is not already 256-aligned; +/// the padding is stripped HERE so callers can index `chunks_exact(4)` safely. +/// The src twin of `tests/support/mod.rs`'s `readback_rgba`; the support +/// helper delegates here so the readback body lives in exactly one place. +pub fn readback_rgba_into( + app: &mut bevy::app::App, + target: &bevy::asset::Handle, + w: u32, + h: u32, +) -> Vec { + use bevy::prelude::*; + use bevy::render::gpu_readback::{Readback, ReadbackComplete}; + + const MAX_FRAMES: usize = 60; + let (width, height) = (w as usize, h as usize); + + let cell = CapturedBytes::default(); + app.insert_resource(cell.clone()); + + let sink = cell.0.clone(); + app.world_mut() + .spawn(Readback::texture(target.clone())) + .observe(move |trigger: On| { + // `ReadbackComplete` derefs to its `data: Vec`; clone the raw + // RGBA8 into the shared slot. First completion wins (the readback + // re-fires every frame until its entity is despawned, but the poll + // loop stops at the first non-empty slot). + let mut slot = sink.lock().expect("readback sink mutex"); + if slot.is_none() { + slot.replace(trigger.event().data.clone()); + } + }); + + for _ in 0..MAX_FRAMES { + app.update(); + if cell.0.lock().expect("readback sink mutex").is_some() { + break; + } + } + + let data = cell + .0 + .lock() + .expect("readback sink mutex") + .take() + .unwrap_or_else(|| { + panic!( + "GPU readback never delivered bytes within {MAX_FRAMES} frames — \ + the texture→buffer copy or buffer map never completed (check that \ + the image carries COPY_SRC + RenderAssetUsages::all() and that a \ + capture camera targets it)" + ) + }); + + // Strip wgpu's 256-byte row padding if present (see the doc comment). 
+ let unpadded_row = width * 4; + let padded_row = unpadded_row.div_ceil(256) * 256; + if data.len() == unpadded_row * height { + data + } else if data.len() == padded_row * height { + let mut out = Vec::with_capacity(unpadded_row * height); + for row in 0..height { + let start = row * padded_row; + out.extend_from_slice(&data[start..start + unpadded_row]); + } + out + } else { + panic!( + "readback returned {} bytes for a {width}x{height} RGBA8 target — \ + expected {} (unpadded) or {} (256-byte-padded rows)", + data.len(), + unpadded_row * height, + padded_row * height, + ); + } +} + /// Perceptual difference between two RGBA8 frames, as a normalized mean /// per-channel difference in `[0.0, 1.0]` (0 == identical). Comparison is /// *perceptual*, not exact byte equality (§ 4.2): sub-LSB float jitter in the diff --git a/crates/buiy_core/tests/render_golden_harness.rs b/crates/buiy_core/tests/render_golden_harness.rs index 06aa324..f49f9d8 100644 --- a/crates/buiy_core/tests/render_golden_harness.rs +++ b/crates/buiy_core/tests/render_golden_harness.rs @@ -269,3 +269,68 @@ fn fonts_ready_requires_drained_queue_and_resident_keys() { atlas.drain_warmup(&mut queue); assert!(fonts_ready(&atlas, &queue, std::slice::from_ref(&key))); } + +// Needs a wgpu adapter (real GPU or lavapipe). Proves the promoted +// `capture_to_image` seam paints a fixture and returns an `image::RgbaImage` +// of the expected PHYSICAL dimensions (logical × dpr). 
Run with: +// cargo test -p buiy_core --test render_golden_harness -- --ignored --nocapture +#[test] +#[ignore = "needs a wgpu adapter (real GPU or lavapipe); run with --ignored"] +fn capture_to_image_returns_physical_dimensions() { + use bevy::prelude::*; + use buiy_core::Node; + use buiy_core::layout::{Inset, Length, Sizing, Style}; + use buiy_core::render::color::ColorToken; + use buiy_core::render::components::Background; + use buiy_core::render::golden::{GoldenConfig, capture_to_image}; + use std::borrow::Cow; + + const LOGICAL_W: u32 = 48; + const LOGICAL_H: u32 = 32; + + // 1.0× capture: physical == logical. (Phase 0.4 sizes via the literal 1.0 + // path; GoldenConfig has no `dpr` field until Phase 3.1.) + let cfg = GoldenConfig::deterministic(); + let mut app = support::gpu_render_app_scaled(LOGICAL_W, LOGICAL_H, 1.0); + + // A known opaque fill so the capture is non-trivial (a blank frame would + // pass the dimension check vacuously; this proves real paint flows through). + { + let mut theme = app.world_mut().resource_mut::(); + theme + .colors + .insert("cap.fill".into(), Color::srgb(0.2, 0.6, 0.9)); + } + let fill = app + .world_mut() + .spawn(( + Node, + Style::default() + .absolute() + .inset(Inset { + top: Sizing::Length(Length::px(4.0)), + left: Sizing::Length(Length::px(4.0)), + ..default() + }) + .width_px(16.0) + .height_px(16.0), + Background { + color: ColorToken::Token(Cow::Borrowed("cap.fill")), + }, + )) + .id(); + app.world_mut() + .spawn((Node, Style::default())) + .add_children(&[fill]); + + let img = capture_to_image(&mut app, &cfg); + + assert_eq!( + (img.width(), img.height()), + (LOGICAL_W, LOGICAL_H), + "1× capture is logical-sized in physical pixels" + ); + // Non-vacuous: at least one pixel differs from the opaque-black clear. 
+ let any_painted = img.pixels().any(|p| p.0 != [0, 0, 0, 255]); + assert!(any_painted, "capture produced non-clear pixels"); +} diff --git a/crates/buiy_core/tests/support/mod.rs b/crates/buiy_core/tests/support/mod.rs index d126a8b..5fac5ef 100644 --- a/crates/buiy_core/tests/support/mod.rs +++ b/crates/buiy_core/tests/support/mod.rs @@ -32,10 +32,9 @@ use bevy::asset::{AssetApp, RenderAssetUsages}; use bevy::camera::RenderTarget; use bevy::image::Image; use bevy::prelude::*; -use bevy::render::gpu_readback::{Readback, ReadbackComplete}; use bevy::render::render_resource::{TextureFormat, TextureUsages}; use buiy_core::{CorePlugin, render::BuiyRenderPlugin}; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; /// Build the canonical headless-GPU Buiy app. The returned [`App`] is **not yet /// finished** — the caller must `finish()` it (or use [`finish_and_run`]) before @@ -252,13 +251,6 @@ pub fn spawn_capture_camera_with_msaa( )); } -/// Resource cell the `ReadbackComplete` observer writes the captured bytes into. -/// `Arc>` so the observer (which `move`s its capture) and the test poll -/// loop share one slot; an ECS resource would also work but the shared cell keeps -/// the observer a small closure. -#[derive(Resource, Clone, Default)] -struct CapturedBytes(Arc>>>); - /// Drive frames until the text fixture's `wait_for_fonts` predicate holds /// (verification § 3.2): the producer has emitted (`ResidentTextKeys` /// non-empty), the warmup queue is drained, and every emitted key is @@ -351,75 +343,17 @@ pub fn expected_full_coverage_srgb(color: [f32; 4]) -> [u8; 4] { /// padding bytes are `[0,0,0,0]`, which would otherwise satisfy a /// `px != clear` probe and false-green a "something painted" assertion. pub fn readback_rgba(app: &mut App, target: Handle) -> Vec { - const MAX_FRAMES: usize = 60; - - // The target's true extent — needed to detect + strip row padding below. 
+ // The target's true extent — the promoted readback needs it to detect + + // strip wgpu's 256-byte row padding. let (width, height) = { let images = app.world().resource::>(); let image = images.get(&target).expect("readback target Image exists"); ( - image.texture_descriptor.size.width as usize, - image.texture_descriptor.size.height as usize, + image.texture_descriptor.size.width, + image.texture_descriptor.size.height, ) }; - - let cell = CapturedBytes::default(); - app.insert_resource(cell.clone()); - - let sink = cell.0.clone(); - app.world_mut().spawn(Readback::texture(target)).observe( - move |trigger: On| { - // `ReadbackComplete` derefs to its `data: Vec`; clone the raw - // RGBA8 into the shared slot. First completion wins (the readback - // re-fires every frame until its entity is despawned, but the poll - // loop stops at the first non-empty slot). - let mut slot = sink.lock().expect("readback sink mutex"); - if slot.is_none() { - slot.replace(trigger.event().data.clone()); - } - }, - ); - - for _ in 0..MAX_FRAMES { - app.update(); - if cell.0.lock().expect("readback sink mutex").is_some() { - break; - } - } - - let data = cell - .0 - .lock() - .expect("readback sink mutex") - .take() - .unwrap_or_else(|| { - panic!( - "GPU readback never delivered bytes within {MAX_FRAMES} frames — \ - the texture→buffer copy or buffer map never completed (check that \ - the image carries COPY_SRC + RenderAssetUsages::all() and that a \ - capture camera targets it)" - ) - }); - - // Strip wgpu's 256-byte row padding if present (see the doc comment). 
- let unpadded_row = width * 4; - let padded_row = unpadded_row.div_ceil(256) * 256; - if data.len() == unpadded_row * height { - data - } else if data.len() == padded_row * height { - let mut out = Vec::with_capacity(unpadded_row * height); - for row in 0..height { - let start = row * padded_row; - out.extend_from_slice(&data[start..start + unpadded_row]); - } - out - } else { - panic!( - "readback returned {} bytes for a {width}x{height} RGBA8 target — \ - expected {} (unpadded) or {} (256-byte-padded rows)", - data.len(), - unpadded_row * height, - padded_row * height, - ); - } + // Delegate to the promoted src twin so the readback poll + row-padding + // strip live in exactly one place (Phase 0.4 anti-drift). + buiy_core::render::golden::readback_rgba_into(app, &target, width, height) } From 9525cca3070707db1ffb26f8c10db7f371ae03ca Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 05:15:08 -0700 Subject: [PATCH 06/70] =?UTF-8?q?feat(verify):=20metric=20module=20skeleto?= =?UTF-8?q?n=20=E2=80=94=20Diff/FuzzBudget/CompareOpts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Type shapes + empty-case compare stub, wired into lib.rs. Algorithm lands next. Realizes metric.md § Types. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/lib.rs | 1 + crates/buiy_verify/src/metric.rs | 119 +++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 crates/buiy_verify/src/metric.rs diff --git a/crates/buiy_verify/src/lib.rs b/crates/buiy_verify/src/lib.rs index 91dd46d..226a01b 100644 --- a/crates/buiy_verify/src/lib.rs +++ b/crates/buiy_verify/src/lib.rs @@ -6,4 +6,5 @@ pub mod a11y; pub mod contrast; +pub mod metric; pub mod visual; diff --git a/crates/buiy_verify/src/metric.rs b/crates/buiy_verify/src/metric.rs new file mode 100644 index 0000000..21a655f --- /dev/null +++ b/crates/buiy_verify/src/metric.rs @@ -0,0 +1,119 @@ +//! 
Perceptual image diff — the shared metric for reftests (tier 4) and goldens +//! (tier 5). Luminance-weighted YIQ colorDelta + antialias-sibling exclusion, +//! gated on a two-axis FuzzBudget. Supersedes render::golden::perceptual_diff +//! (L1) and visual::compare_images (RMSE). +//! +//! The per-pixel YIQ `color_delta`, the `antialiased` brightest/darkest-sibling +//! test, and `has_many_siblings` are ported verbatim from the canonical +//! pixelmatch reference (MIT; mapbox/pixelmatch, the Rust `pixelmatch` 0.1.0 +//! crate). They are vendored, not depended on: the published crate consumes +//! PNG byte streams, returns only a flat count, keeps these primitives private, +//! and is image-0.24-bound — none of which fits `Diff`'s two-axis shape on +//! image 0.25. Vendoring is metric.md's "adopt the reference algorithm, don't +//! re-derive the 35215/YIQ constants" applied exactly. + +use image::RgbaImage; + +/// Outcome of one comparison. All counts are over the diffed (overlapping) +/// pixel set. `diff_image` is emitted only when `CompareOpts::emit_diff_image`. +#[derive(Clone, Debug)] +pub struct Diff { + /// Non-AA pixels whose YIQ colorDelta exceeded the per-pixel threshold. + pub differing_pixels: u32, + /// Largest single-channel L∞ delta over all pixels (diagnostic; 0..=255). + pub max_channel_delta: u8, + /// Total pixels compared (== w*h; 0 only for empty/degenerate input). + pub total_pixels: u32, + /// Advisory MSSIM in [0,1] (1 == identical). `None` when skipped. + pub mssim: Option, + /// Heatmap: AA pixels dimmed, differing pixels painted (pixelmatch palette). + pub diff_image: Option, +} + +/// The two-axis gate. A Diff PASSES iff BOTH hold. Default after determinism is +/// (0, 0); widen per fixture with a documented reason. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct FuzzBudget { + /// No single channel of any pixel may differ by more than this (L∞). 
+ pub max_channel_delta: u8, + /// At most this many non-AA pixels may exceed the per-pixel YIQ threshold. + pub max_diff_pixels: u32, +} + +impl FuzzBudget { + /// The post-determinism default: bit-exact within one pinned rasterizer. + pub const EXACT: FuzzBudget = FuzzBudget { + max_channel_delta: 0, + max_diff_pixels: 0, + }; +} + +/// Per-pixel and AA-detection knobs. `threshold` feeds the +/// `max_delta = 35215 · threshold²` luminance model; `include_aa = true` makes +/// AA pixels COUNT (for the few tests that assert AA exactly). +#[derive(Clone, Copy, Debug)] +pub struct CompareOpts { + /// Matching sensitivity in [0,1]; default 0.1. Smaller = stricter. + pub threshold: f64, + /// Treat antialiased pixels as differences instead of excluding them. + pub include_aa: bool, + /// Also compute the advisory MSSIM channel (image-compare). Default true. + pub mssim: bool, + /// Allocate and fill `Diff::diff_image`. Off in the hot reftest path. + pub emit_diff_image: bool, +} + +impl Default for CompareOpts { + fn default() -> Self { + Self { + threshold: 0.1, + include_aa: false, + mssim: true, + emit_diff_image: false, + } + } +} + +/// Compare two RGBA images. **Infallible** — returns a `Diff`, never a +/// `Result`. (Stub: only the empty case is correct until 1a.2/1a.3 land.) 
+pub fn compare(a: &RgbaImage, b: &RgbaImage, _opts: &CompareOpts) -> Diff { + let _ = (a, b); + Diff { + differing_pixels: 0, + max_channel_delta: 0, + total_pixels: 0, + mssim: None, + diff_image: None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn exact_budget_is_zero_zero() { + assert_eq!(FuzzBudget::EXACT.max_channel_delta, 0); + assert_eq!(FuzzBudget::EXACT.max_diff_pixels, 0); + } + + #[test] + fn default_opts_are_lenient_aware() { + let o = CompareOpts::default(); + assert_eq!(o.threshold, 0.1); + assert!(!o.include_aa); + assert!(o.mssim); + assert!(!o.emit_diff_image); + } + + #[test] + fn empty_vs_empty_is_zero_diff() { + let e = image::RgbaImage::new(0, 0); + let d = compare(&e, &e, &CompareOpts::default()); + assert_eq!(d.differing_pixels, 0); + assert_eq!(d.max_channel_delta, 0); + assert_eq!(d.total_pixels, 0); + assert_eq!(d.mssim, None); + assert!(d.diff_image.is_none()); + } +} From 3432c27e96d3af85d8e077a6b943aa419fa86258 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 05:17:02 -0700 Subject: [PATCH 07/70] feat(verify): vendored YIQ color_delta + two-axis pixel scan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ports pixelmatch's luminance-weighted YIQ delta (verbatim constants) and adds the raw L∞ max_channel_delta scan. Single-wrong-pixel is now caught at N in {16,256,2048} — the §4 dilution regression. AA exclusion and MSSIM follow. The yiq_luminance_outweighs_chroma fixture is corrected from the plan's [180,120,60]@0.05 (which does not separate luma from chroma — both exceed max_delta=88) to an equal-L∞ pure-luma (+30 all) vs chroma-leaning (+30R/-30B) pair @0.1, where the YIQ weighting (luma 455 vs chroma 244, max_delta=352) is what separates them. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/metric.rs | 166 +++++++++++++++++++++++++++++-- 1 file changed, 158 insertions(+), 8 deletions(-) diff --git a/crates/buiy_verify/src/metric.rs b/crates/buiy_verify/src/metric.rs index 21a655f..44a9aa8 100644 --- a/crates/buiy_verify/src/metric.rs +++ b/crates/buiy_verify/src/metric.rs @@ -75,22 +75,172 @@ impl Default for CompareOpts { } /// Compare two RGBA images. **Infallible** — returns a `Diff`, never a -/// `Result`. (Stub: only the empty case is correct until 1a.2/1a.3 land.) -pub fn compare(a: &RgbaImage, b: &RgbaImage, _opts: &CompareOpts) -> Diff { - let _ = (a, b); +/// `Result`. (AA exclusion is layered in 1a.3; here every over-threshold pixel +/// counts.) +pub fn compare(a: &RgbaImage, b: &RgbaImage, opts: &CompareOpts) -> Diff { + // Empty: nothing to observe (matches compare_images's 0.0 empty case). + if a.width() == 0 || a.height() == 0 { + return Diff { + differing_pixels: 0, + max_channel_delta: 0, + total_pixels: 0, + mssim: None, + diff_image: None, + }; + } + // Dimension mismatch handled in 1a.4 (saturated Diff). For now assume equal. + let (w, h) = a.dimensions(); + let total_pixels = w * h; + let max_delta = 35_215_f64 * opts.threshold * opts.threshold; + + let mut differing_pixels = 0u32; + let mut max_channel_delta = 0u8; + for (pa, pb) in a.pixels().zip(b.pixels()) { + for ch in 0..4 { + let d = (pa[ch] as i16 - pb[ch] as i16).unsigned_abs() as u8; + max_channel_delta = max_channel_delta.max(d); + } + let delta = color_delta(pa, pb, false); + if delta.abs() > max_delta { + // AA exclusion is layered in 1a.3; here every over-threshold pixel counts. + differing_pixels += 1; + } + } + Diff { - differing_pixels: 0, - max_channel_delta: 0, - total_pixels: 0, - mssim: None, - diff_image: None, + differing_pixels, + max_channel_delta, + total_pixels, + mssim: None, // wired in 1a.5 + diff_image: None, // wired in 1a.6 } } +// ---- Vendored from pixelmatch (MIT). 
Verbatim constants; ported to image 0.25. +// "Measuring perceived color difference using YIQ NTSC transmission color space" +// (Kotsarenko & Ramos). `y_only` returns the signed luminance delta (used by the +// AA sibling test); otherwise the luminance-weighted YIQ squared delta, signed +// by which pixel is brighter. +fn color_delta(p1: &image::Rgba<u8>, p2: &image::Rgba<u8>, y_only: bool) -> f64 { + let (mut r1, mut g1, mut b1, mut a1) = (p1[0] as f64, p1[1] as f64, p1[2] as f64, p1[3] as f64); + let (mut r2, mut g2, mut b2, mut a2) = (p2[0] as f64, p2[1] as f64, p2[2] as f64, p2[3] as f64); + + if (a1 - a2).abs() < f64::EPSILON + && (r1 - r2).abs() < f64::EPSILON + && (g1 - g2).abs() < f64::EPSILON + && (b1 - b2).abs() < f64::EPSILON + { + return 0.0; + } + if a1 < 255.0 { + a1 /= 255.0; + r1 = blend(r1, a1); + g1 = blend(g1, a1); + b1 = blend(b1, a1); + } + if a2 < 255.0 { + a2 /= 255.0; + r2 = blend(r2, a2); + g2 = blend(g2, a2); + b2 = blend(b2, a2); + } + let y1 = rgb2y(r1, g1, b1); + let y2 = rgb2y(r2, g2, b2); + let y = y1 - y2; + if y_only { + return y; + } + let i = rgb2i(r1, g1, b1) - rgb2i(r2, g2, b2); + let q = rgb2q(r1, g1, b1) - rgb2q(r2, g2, b2); + let delta = 0.5053 * y * y + 0.299 * i * i + 0.1957 * q * q; + if y1 > y2 { -delta } else { delta } +} + +// blend semi-transparent color with white +fn blend(c: f64, a: f64) -> f64 { + 255.0 + (c - 255.0) * a +} +fn rgb2y(r: f64, g: f64, b: f64) -> f64 { + r * 0.298_895_31 + g * 0.586_622_47 + b * 0.114_482_23 +} +fn rgb2i(r: f64, g: f64, b: f64) -> f64 { + r * 0.595_977_99 - g * 0.274_176_10 - b * 0.321_801_89 +} +fn rgb2q(r: f64, g: f64, b: f64) -> f64 { + r * 0.211_470_17 - g * 0.522_617_11 + b * 0.311_146_94 +} + #[cfg(test)] mod tests { use super::*; + /// Solid w×h image of one color. 
+ fn solid(w: u32, h: u32, px: [u8; 4]) -> image::RgbaImage { + image::RgbaImage::from_pixel(w, h, image::Rgba(px)) + } + + #[test] + fn identity_is_zero_diff() { + let img = solid(8, 8, [10, 200, 30, 255]); + let d = compare(&img, &img, &CompareOpts::default()); + assert_eq!(d.differing_pixels, 0); + assert_eq!(d.max_channel_delta, 0); + assert_eq!(d.total_pixels, 64); + } + + #[test] + fn single_wrong_pixel_survives_every_scale() { + // The §4 regression: one wrong-by-200 pixel must be caught at any N. + for n in [16u32, 256, 2048] { + let a = solid(n, n, [0, 0, 0, 255]); + let mut b = a.clone(); + b.put_pixel(n / 2, n / 2, image::Rgba([200, 200, 200, 255])); + let d = compare( + &a, + &b, + &CompareOpts { + include_aa: true, + mssim: false, + ..Default::default() + }, + ); + assert_eq!(d.differing_pixels, 1, "N={n}: exactly one differing pixel"); + assert!(d.max_channel_delta >= 200, "N={n}: L∞ caught the 200 delta"); + assert_eq!(d.total_pixels, n * n); + } + } + + #[test] + fn yiq_luminance_outweighs_chroma() { + // Equal raw L∞ (delta 30 on a channel) but a luma-shifted pixel must + // score a larger YIQ delta than a chroma-leaning shift — pins the + // weighting. luma=+30 all channels (pure luminance, dY=-30); chroma= + // +30 R / -30 B with G fixed (same L∞=30 but near-constant luminance, + // dY=-5.5). At threshold 0.1 (max_delta=352) the luma delta (455) trips + // while the lower-weighted chroma delta (244) does not — the YIQ + // weighting, not L∞, is what separates them. 
+ let base = solid(4, 4, [120, 120, 120, 255]); + let mut luma = base.clone(); + luma.put_pixel(0, 0, image::Rgba([150, 150, 150, 255])); // +30 all: pure luma + let mut chroma = base.clone(); + chroma.put_pixel(0, 0, image::Rgba([150, 120, 90, 255])); // +30 R / -30 B: chroma-leaning, same L∞=30 + let opts = CompareOpts { + include_aa: true, + mssim: false, + threshold: 0.1, + ..Default::default() + }; + let dl = compare(&base, &luma, &opts); + let dc = compare(&base, &chroma, &opts); + // At a threshold where luma trips but the lower-weighted chroma delta does + // not, the luma case differs and the chroma case does not. + assert_eq!(dl.differing_pixels, 1, "luma shift exceeds threshold"); + assert_eq!( + dc.differing_pixels, 0, + "chroma-leaning shift is under-weighted below threshold" + ); + } + #[test] fn exact_budget_is_zero_zero() { assert_eq!(FuzzBudget::EXACT.max_channel_delta, 0); From b28d9900d73251fee4b2961b31a19de22828ade0 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 05:21:11 -0700 Subject: [PATCH 08/70] feat(verify): antialias sibling exclusion (pixelmatch port) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A differing pixel that is AA in either image is excluded unless include_aa. EXACT (0,0) now holds across residual AA jitter while still catching an isolated real defect. Vendored verbatim from pixelmatch. The aa_edge_pair fixture is corrected from the plan's hard-2-tone diagonal step (which pixelmatch correctly never classifies as AA — a pure black/white edge has no pixel with both a brighter and darker sibling, so excluded would equal counted=16, not 0) to a genuine antialiased vertical edge (black | gray AA column | white) whose gray column jitters 128->180 between a and b — the canonical sub-LSB re-rasterization the AA exclusion exists to tolerate (excluded=0, counted=16). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/metric.rs | 163 ++++++++++++++++++++++++++++++- 1 file changed, 160 insertions(+), 3 deletions(-) diff --git a/crates/buiy_verify/src/metric.rs b/crates/buiy_verify/src/metric.rs index 44a9aa8..59b0695 100644 --- a/crates/buiy_verify/src/metric.rs +++ b/crates/buiy_verify/src/metric.rs @@ -95,15 +95,19 @@ pub fn compare(a: &RgbaImage, b: &RgbaImage, opts: &CompareOpts) -> Diff { let mut differing_pixels = 0u32; let mut max_channel_delta = 0u8; - for (pa, pb) in a.pixels().zip(b.pixels()) { + for (x, y, pa) in a.enumerate_pixels() { + let pb = b.get_pixel(x, y); for ch in 0..4 { let d = (pa[ch] as i16 - pb[ch] as i16).unsigned_abs() as u8; max_channel_delta = max_channel_delta.max(d); } let delta = color_delta(pa, pb, false); if delta.abs() > max_delta { - // AA exclusion is layered in 1a.3; here every over-threshold pixel counts. - differing_pixels += 1; + let is_aa = !opts.include_aa + && (antialiased(a, x, y, w, h, b) || antialiased(b, x, y, w, h, a)); + if !is_aa { + differing_pixels += 1; + } } } @@ -170,6 +174,78 @@ fn rgb2q(r: f64, g: f64, b: f64) -> f64 { r * 0.211_470_17 - g * 0.522_617_11 + b * 0.311_146_94 } +// Vendored from pixelmatch (MIT): "Anti-aliased Pixel and Intensity Slope +// Detector" (Vyšniauskas, 2009). A pixel is AA iff it has a strictly brighter +// and a strictly darker sibling and that extreme has 3+ equal siblings in BOTH +// images (so it is an intensity slope, not a real edge in both). 
+fn antialiased(img1: &RgbaImage, x: u32, y: u32, w: u32, h: u32, img2: &RgbaImage) -> bool { + let mut zeroes: u8 = u8::from(x == 0 || y == 0 || x == w - 1 || y == h - 1); + let (mut min, mut max) = (0.0f64, 0.0f64); + let (mut min_x, mut min_y, mut max_x, mut max_y) = (0u32, 0u32, 0u32, 0u32); + let center = img1.get_pixel(x, y); + + let x0 = x.saturating_sub(1); + let x1 = if x < w - 1 { x + 1 } else { x }; + let y0 = y.saturating_sub(1); + let y1 = if y < h - 1 { y + 1 } else { y }; + for ax in x0..=x1 { + for ay in y0..=y1 { + if ax == x && ay == y { + continue; + } + let delta = color_delta(center, img1.get_pixel(ax, ay), true); + if delta == 0.0 { + zeroes += 1; + if zeroes > 2 { + return false; + } + continue; + } + if delta < min { + min = delta; + min_x = ax; + min_y = ay; + continue; + } + if delta > max { + max = delta; + max_x = ax; + max_y = ay; + } + } + } + if min == 0.0 || max == 0.0 { + return false; + } + (has_many_siblings(img1, min_x, min_y, w, h) && has_many_siblings(img2, min_x, min_y, w, h)) + || (has_many_siblings(img1, max_x, max_y, w, h) + && has_many_siblings(img2, max_x, max_y, w, h)) +} + +// Vendored from pixelmatch (MIT): 3+ adjacent pixels of identical color. 
+fn has_many_siblings(img: &RgbaImage, x: u32, y: u32, w: u32, h: u32) -> bool { + let mut zeroes: u8 = u8::from(x == 0 || y == 0 || x == w - 1 || y == h - 1); + let center = img.get_pixel(x, y); + let x0 = x.saturating_sub(1); + let x1 = if x < w - 1 { x + 1 } else { x }; + let y0 = y.saturating_sub(1); + let y1 = if y < h - 1 { y + 1 } else { y }; + for ax in x0..=x1 { + for ay in y0..=y1 { + if ax == x && ay == y { + continue; + } + if center == img.get_pixel(ax, ay) { + zeroes += 1; + if zeroes > 2 { + return true; + } + } + } + } + false +} + #[cfg(test)] mod tests { use super::*; @@ -241,6 +317,87 @@ mod tests { ); } + /// An antialiased vertical edge — black | one gray AA column | white — + /// whose gray column value JITTERS between `a` and `b`, modeling the + /// sub-LSB SDF/sRGB re-rasterization the metric must tolerate. Every + /// differing (gray) pixel has a strictly brighter (white) and strictly + /// darker (black) horizontal sibling, and those extremes have 3+ identical + /// siblings in both images, so pixelmatch's slope detector reads them as AA. + /// A hard 2-tone edge would NOT work: a pure black/white step has no pixel + /// with both a brighter and a darker neighbor, so pixelmatch (correctly) + /// never classifies it as AA. + fn aa_edge_pair() -> (image::RgbaImage, image::RgbaImage) { + let (w, h) = (16u32, 16u32); + let build = |gray: u8| { + let mut img = image::RgbaImage::new(w, h); + for y in 0..h { + for x in 0..w { + let p = if x < 7 { + [0, 0, 0, 255] + } else if x == 7 { + [gray, gray, gray, 255] + } else { + [255, 255, 255, 255] + }; + img.put_pixel(x, y, image::Rgba(p)); + } + } + img + }; + // The gray AA column is sampled at 128 in `a`, 180 in `b` — sub-edge + // jitter, above the YIQ threshold so the pixels are over-threshold but + // AA-excluded. 
+ (build(128), build(180)) + } + + #[test] + fn aa_pixels_excluded_by_default_but_counted_with_include_aa() { + let (a, b) = aa_edge_pair(); + let excluded = compare( + &a, + &b, + &CompareOpts { + mssim: false, + ..Default::default() + }, + ); + let counted = compare( + &a, + &b, + &CompareOpts { + include_aa: true, + mssim: false, + ..Default::default() + }, + ); + assert_eq!( + excluded.differing_pixels, 0, + "edge pixels read as AA, excluded" + ); + assert!( + counted.differing_pixels > 0, + "include_aa counts the same pixels" + ); + } + + #[test] + fn real_defect_is_not_excluded_as_aa() { + // An isolated wrong pixel on a flat field has no brighter+darker sibling + // pair, so it is NOT AA — it must still count with default opts. + let a = solid(16, 16, [0, 0, 0, 255]); + let mut b = a.clone(); + b.put_pixel(8, 8, image::Rgba([200, 200, 200, 255])); + let d = compare( + &a, + &b, + &CompareOpts { + mssim: false, + ..Default::default() + }, + ); + assert_eq!(d.differing_pixels, 1, "isolated defect is not AA-excluded"); + } + #[test] fn exact_budget_is_zero_zero() { assert_eq!(FuzzBudget::EXACT.max_channel_delta, 0); From 353bfb7ecaaaa2c4a4c9314ee21097be256b28a0 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 05:23:04 -0700 Subject: [PATCH 09/70] feat(verify): Diff::passes/within + saturated dim-mismatch Diff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two-axis gate (both bind); within() pins the fuzzy-if floor so an unexpectedly-clean render reds. A dimension mismatch folds into a saturated Diff that fails EVERY budget — the loud-red replacement for the naive silent 1.0. Adds a `saturated: bool` discriminator to Diff so passes() can honor metric.md's "false for every budget, including a maximal (255, u32::MAX)" contract: the pure two-axis formula would otherwise ACCEPT a saturated diff under a maximal budget. 
The flag also keeps a saturated mismatch categorically distinct from an in-bounds all-different frame (which a wide budget may legitimately accept). Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/metric.rs | 140 ++++++++++++++++++++++++++++++- 1 file changed, 139 insertions(+), 1 deletion(-) diff --git a/crates/buiy_verify/src/metric.rs b/crates/buiy_verify/src/metric.rs index 59b0695..74beb47 100644 --- a/crates/buiy_verify/src/metric.rs +++ b/crates/buiy_verify/src/metric.rs @@ -28,6 +28,13 @@ pub struct Diff { pub mssim: Option<f64>, /// Heatmap: AA pixels dimmed, differing pixels painted (pixelmatch palette). pub diff_image: Option<RgbaImage>, + /// Set only by the dimension-mismatch sentinel. A saturated `Diff` is an + /// *unconditional fail*: [`Diff::passes`] returns `false` for it against + /// EVERY budget — including a hypothetical maximal `(255, u32::MAX)` — so a + /// mis-sized capture reds the gate loudly (metric.md § compare). It is + /// distinct from an in-bounds all-different frame, which a wide-enough + /// budget may legitimately accept. + pub saturated: bool, } /// The two-axis gate. A Diff PASSES iff BOTH hold. Default after determinism is @@ -86,9 +93,25 @@ pub fn compare(a: &RgbaImage, b: &RgbaImage, opts: &CompareOpts) -> Diff { total_pixels: 0, mssim: None, diff_image: None, + saturated: false, + }; + } + if a.dimensions() != b.dimensions() { + // Loud-red sentinel (metric.md): a saturated Diff fails EVERY budget. + // total = max(area) so the saturation count is well-defined. + let total = a + .width() + .saturating_mul(a.height()) + .max(b.width().saturating_mul(b.height())); + return Diff { + differing_pixels: total, + max_channel_delta: 255, + total_pixels: total, + mssim: Some(0.0), + diff_image: None, + saturated: true, + }; } - // Dimension mismatch handled in 1a.4 (saturated Diff). For now assume equal. 
let (w, h) = a.dimensions(); let total_pixels = w * h; let max_delta = 35_215_f64 * opts.threshold * opts.threshold; @@ -117,6 +140,29 @@ pub fn compare(a: &RgbaImage, b: &RgbaImage, opts: &CompareOpts) -> Diff { total_pixels, mssim: None, // wired in 1a.5 diff_image: None, // wired in 1a.6 + saturated: false, + } +} + +impl Diff { + /// PASS iff `max_channel_delta <= budget.max_channel_delta` + /// AND `differing_pixels <= budget.max_diff_pixels`. MSSIM is advisory and + /// never gates here. A [`saturated`](Self::saturated) (dimension-mismatch) + /// Diff is an unconditional fail — `false` for every budget, including a + /// maximal `(255, u32::MAX)` — so a mis-sized capture cannot squeak through. + pub fn passes(&self, budget: &FuzzBudget) -> bool { + !self.saturated + && self.max_channel_delta <= budget.max_channel_delta + && self.differing_pixels <= budget.max_diff_pixels + } + + /// Mozilla `fuzzy-if` "ranges must not include 0": PASS iff the diff meets + /// the `max` budget AND exceeds the `min` floor on at least one axis, so a + /// suddenly-clean render (below an expected difference) is flagged. + pub fn within(&self, min: &FuzzBudget, max: &FuzzBudget) -> bool { + let over_floor = self.max_channel_delta > min.max_channel_delta + || self.differing_pixels > min.max_diff_pixels; + self.passes(max) && over_floor } } @@ -398,6 +444,98 @@ mod tests { assert_eq!(d.differing_pixels, 1, "isolated defect is not AA-excluded"); } + #[test] + fn passes_requires_both_axes() { + // One pixel off by 255: trips max_channel_delta, one differing pixel. 
+ let a = solid(8, 8, [0, 0, 0, 255]); + let mut b = a.clone(); + b.put_pixel(0, 0, image::Rgba([255, 255, 255, 255])); + let d = compare( + &a, + &b, + &CompareOpts { + mssim: false, + ..Default::default() + }, + ); + assert!(!d.passes(&FuzzBudget::EXACT), "EXACT rejects any diff"); + assert!( + !d.passes(&FuzzBudget { + max_channel_delta: 255, + max_diff_pixels: 0 + }), + "diff-pixel axis still binds when channel axis is satisfied" + ); + assert!( + !d.passes(&FuzzBudget { + max_channel_delta: 0, + max_diff_pixels: 1 + }), + "channel axis still binds when diff-pixel axis is satisfied" + ); + assert!( + d.passes(&FuzzBudget { + max_channel_delta: 255, + max_diff_pixels: 1 + }), + "both axes satisfied -> pass" + ); + } + + #[test] + fn within_floor_catches_unexpectedly_clean() { + // A clean render (0,0) must FAIL a widened budget whose min floor is > 0. + let a = solid(8, 8, [5, 5, 5, 255]); + let clean = compare( + &a, + &a, + &CompareOpts { + mssim: false, + ..Default::default() + }, + ); + let min = FuzzBudget { + max_channel_delta: 1, + max_diff_pixels: 1, + }; + let max = FuzzBudget { + max_channel_delta: 10, + max_diff_pixels: 50, + }; + assert!( + !clean.within(&min, &max), + "a clean render is below the expected floor" + ); + } + + #[test] + fn dimension_mismatch_is_saturated_and_fails_every_budget() { + let a = solid(4, 4, [0, 0, 0, 255]); + let b = solid(5, 4, [0, 0, 0, 255]); + let d = compare(&a, &b, &CompareOpts::default()); + assert_eq!(d.max_channel_delta, 255); + assert_eq!(d.differing_pixels, d.total_pixels); + assert_eq!(d.total_pixels, 20, "total = max(area) = 5*4"); + assert_eq!(d.mssim, Some(0.0)); + // Fails even a hypothetical maximal budget. 
+ let maximal = FuzzBudget { + max_channel_delta: 255, + max_diff_pixels: u32::MAX, + }; + assert!( + !d.passes(&maximal), + "saturated diff fails the loudest budget too" + ); + } + + #[test] + fn empty_capture_forbidden_by_explicit_assertion() { + // The metric returns total_pixels == 0 for empty; harnesses forbid it. + let e = image::RgbaImage::new(0, 0); + let d = compare(&e, &e, &CompareOpts::default()); + assert_eq!(d.total_pixels, 0); + } + #[test] fn exact_budget_is_zero_zero() { assert_eq!(FuzzBudget::EXACT.max_channel_delta, 0); assert_eq!(FuzzBudget::EXACT.max_diff_pixels, 0); From f849da0450875b8d24f6b258e393e31186715cc3 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 05:24:31 -0700 Subject: [PATCH 10/70] feat(verify): advisory MSSIM channel via image-compare MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Diff::mssim from rgba_blended_hybrid_compare, Option<f64> (None when disabled/errored — never silently 0.0). Proven non-gating: a 1-LSB wash (0 differing pixels) still passes a budget admitting its 1-LSB L∞ delta despite a sub-1 MSSIM. The mssim_never_gates fixture is corrected from the plan's passes(&EXACT) form (EXACT rejects the 1-LSB wash on the *channel* axis, so it cannot isolate the MSSIM-non-gating property) to a budget that tolerates the L∞ delta and 0 diff pixels, leaving MSSIM as the only possible gate. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/metric.rs | 69 +++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/crates/buiy_verify/src/metric.rs b/crates/buiy_verify/src/metric.rs index 74beb47..32fbeca 100644 --- a/crates/buiy_verify/src/metric.rs +++ b/crates/buiy_verify/src/metric.rs @@ -134,11 +134,24 @@ pub fn compare(a: &RgbaImage, b: &RgbaImage, opts: &CompareOpts) -> Diff { } } + let mssim = if opts.mssim { + // Advisory MSSIM via image-compare's rgba blended hybrid compare, + // premultiplied against an opaque (white) background — captures are + // opaque, so the background is never sampled in practice. + use image_compare::{BlendInput, rgba_blended_hybrid_compare}; + let bg = image::Rgb([255u8, 255, 255]); + rgba_blended_hybrid_compare(BlendInput::from(a), BlendInput::from(b), bg) + .map(|sim| sim.score) + .ok() + } else { + None + }; + Diff { differing_pixels, max_channel_delta, total_pixels, - mssim: None, // wired in 1a.5 + mssim, diff_image: None, // wired in 1a.6 saturated: false, } @@ -444,6 +457,60 @@ mod tests { assert_eq!(d.differing_pixels, 1, "isolated defect is not AA-excluded"); } + #[test] + fn identity_reports_full_mssim() { + let img = solid(16, 16, [40, 90, 160, 255]); + let d = compare(&img, &img, &CompareOpts::default()); // mssim on by default + assert_eq!(d.differing_pixels, 0); + let s = d.mssim.expect("mssim computed when opts.mssim"); + assert!(s > 0.999, "identical images report MSSIM ~1.0, got {s}"); + } + + #[test] + fn mssim_skipped_when_disabled() { + let img = solid(8, 8, [1, 2, 3, 255]); + let d = compare( + &img, + &img, + &CompareOpts { + mssim: false, + ..Default::default() + }, + ); + assert_eq!(d.mssim, None); + } + + #[test] + fn mssim_never_gates() { + // A global 1-LSB wash: 0 differing pixels (the YIQ delta 0.5 is far + // under max_delta=352) but a measurably-below-1 MSSIM. 
Against a budget + // that admits the 1-LSB L∞ channel delta the wash introduces, the diff + // PASSES — proving MSSIM does not participate in the gate. (EXACT would + // reject this on the *channel* axis, not because of MSSIM, so it cannot + // isolate the property; the budget here tolerates the L∞ delta and 0 + // diff pixels, leaving only MSSIM as a possible gate — which must not + // bind.) + let a = solid(32, 32, [128, 128, 128, 255]); + let b = solid(32, 32, [129, 129, 129, 255]); + let d = compare(&a, &b, &CompareOpts::default()); + assert_eq!( + d.differing_pixels, 0, + "1-LSB shift is under the YIQ threshold" + ); + assert_eq!(d.max_channel_delta, 1, "the wash is a 1-LSB L∞ delta"); + let s = d.mssim.expect("mssim computed by default"); + assert!(s < 1.0, "a uniform wash measurably lowers MSSIM below 1.0"); + let budget = FuzzBudget { + max_channel_delta: 1, + max_diff_pixels: 0, + }; + assert!( + d.passes(&budget), + "MSSIM is advisory — a sub-1 MSSIM does not gate passes() when both \ + pixel axes are satisfied" + ); + } + #[test] fn passes_requires_both_axes() { // One pixel off by 255: trips max_channel_delta, one differing pixel. From b085f91295827e69289645a846861813eca29016 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 05:25:20 -0700 Subject: [PATCH 11/70] feat(verify): diff_image heatmap on emit_diff_image pixelmatch palette: differing pixels red, AA pixels yellow. Off in the hot reftest path; on for tier-5 golden triage HTML. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/metric.rs | 39 ++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/crates/buiy_verify/src/metric.rs b/crates/buiy_verify/src/metric.rs index 32fbeca..c9c77be 100644 --- a/crates/buiy_verify/src/metric.rs +++ b/crates/buiy_verify/src/metric.rs @@ -116,6 +116,7 @@ pub fn compare(a: &RgbaImage, b: &RgbaImage, opts: &CompareOpts) -> Diff { let total_pixels = w * h; let max_delta = 35_215_f64 * opts.threshold * opts.threshold; + let mut diff_image = opts.emit_diff_image.then(|| RgbaImage::new(w, h)); let mut differing_pixels = 0u32; let mut max_channel_delta = 0u8; for (x, y, pa) in a.enumerate_pixels() { @@ -128,8 +129,15 @@ pub fn compare(a: &RgbaImage, b: &RgbaImage, opts: &CompareOpts) -> Diff { if delta.abs() > max_delta { let is_aa = !opts.include_aa && (antialiased(a, x, y, w, h, b) || antialiased(b, x, y, w, h, a)); - if !is_aa { + if is_aa { + if let Some(out) = &mut diff_image { + out.put_pixel(x, y, image::Rgba([255, 255, 0, 255])); // AA: yellow + } + } else { differing_pixels += 1; + if let Some(out) = &mut diff_image { + out.put_pixel(x, y, image::Rgba([255, 0, 0, 255])); // diff: red + } } } } @@ -152,7 +160,7 @@ pub fn compare(a: &RgbaImage, b: &RgbaImage, opts: &CompareOpts) -> Diff { max_channel_delta, total_pixels, mssim, - diff_image: None, // wired in 1a.6 + diff_image, saturated: false, } } @@ -511,6 +519,33 @@ mod tests { ); } + #[test] + fn diff_image_paints_differing_pixels() { + let a = solid(8, 8, [0, 0, 0, 255]); + let mut b = a.clone(); + b.put_pixel(3, 3, image::Rgba([255, 255, 255, 255])); + let d = compare( + &a, + &b, + &CompareOpts { + emit_diff_image: true, + mssim: false, + ..Default::default() + }, + ); + let img = d.diff_image.expect("emit_diff_image fills the heatmap"); + assert_eq!(img.dimensions(), (8, 8)); + // The differing pixel is painted red (pixelmatch diff_color). 
+ assert_eq!(*img.get_pixel(3, 3), image::Rgba([255, 0, 0, 255])); + } + + #[test] + fn diff_image_absent_by_default() { + let a = solid(4, 4, [10, 10, 10, 255]); + let d = compare(&a, &a, &CompareOpts::default()); + assert!(d.diff_image.is_none()); + } + #[test] fn passes_requires_both_axes() { // One pixel off by 255: trips max_channel_delta, one differing pixel. From 19463dca1cdcb70e3a0abc31e36047e6e943eb82 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 05:26:09 -0700 Subject: [PATCH 12/70] test(verify): known-answer meta-suite + constants tripwire for metric MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit metric.md § Verification: identity, scale-invariant single defect, saturated dim-mismatch, and an exact-integer constants pin guarding the vendored YIQ/AA numbers. (insta-snapshot upgrade deferred to Phase 2.) Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/tests/metric.rs | 88 ++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 crates/buiy_verify/tests/metric.rs diff --git a/crates/buiy_verify/tests/metric.rs b/crates/buiy_verify/tests/metric.rs new file mode 100644 index 0000000..720d733 --- /dev/null +++ b/crates/buiy_verify/tests/metric.rs @@ -0,0 +1,88 @@ +//! Known-answer meta-tests for `buiy_verify::metric` (metric.md § Verification). +//! Pure CPU, no GPU lane. 
+ +use buiy_verify::metric::{CompareOpts, Diff, FuzzBudget, compare}; +use image::{Rgba, RgbaImage}; + +fn solid(w: u32, h: u32, px: [u8; 4]) -> RgbaImage { + RgbaImage::from_pixel(w, h, Rgba(px)) +} + +#[test] +fn identity_zero_diff_full_mssim() { + let img = solid(8, 8, [12, 34, 56, 255]); + let d = compare(&img, &img, &CompareOpts::default()); + assert_eq!(d.differing_pixels, 0); + assert_eq!(d.max_channel_delta, 0); + assert!(d.mssim.unwrap() > 0.999); + assert!(d.passes(&FuzzBudget::EXACT)); +} + +#[test] +fn single_defect_survives_scale() { + for n in [16u32, 256, 2048] { + let a = solid(n, n, [0, 0, 0, 255]); + let mut b = a.clone(); + b.put_pixel(n / 2, n / 2, Rgba([200, 200, 200, 255])); + let d = compare( + &a, + &b, + &CompareOpts { + include_aa: true, + mssim: false, + ..Default::default() + }, + ); + assert_eq!(d.differing_pixels, 1, "N={n}"); + assert!(!d.passes(&FuzzBudget::EXACT), "N={n}"); + } +} + +#[test] +fn dimension_mismatch_fails_every_budget() { + let a = solid(4, 4, [0, 0, 0, 255]); + let b = solid(4, 5, [0, 0, 0, 255]); + let d = compare(&a, &b, &CompareOpts::default()); + assert_eq!(d.differing_pixels, d.total_pixels); + assert_eq!(d.max_channel_delta, 255); + assert!(!d.passes(&FuzzBudget { + max_channel_delta: 255, + max_diff_pixels: u32::MAX + })); +} + +/// Constants tripwire: a fixed 8×8 pair yields an exact integer Diff. A +/// pixelmatch-constant drift changes these numbers and reds this test. (Phase 2 +/// upgrades this to the floats-redacted insta snapshot metric.md specifies.) +#[test] +fn vendored_constants_are_pinned() { + let mut a = solid(8, 8, [0, 0, 0, 255]); + let mut b = solid(8, 8, [0, 0, 0, 255]); + // Three deterministic, isolated, non-AA defects of known magnitude. 
+ a.put_pixel(1, 1, Rgba([0, 0, 0, 255])); + b.put_pixel(1, 1, Rgba([255, 0, 0, 255])); // luma-heavy + a.put_pixel(4, 4, Rgba([0, 0, 0, 255])); + b.put_pixel(4, 4, Rgba([0, 255, 0, 255])); + a.put_pixel(6, 2, Rgba([10, 10, 10, 255])); + b.put_pixel(6, 2, Rgba([250, 250, 250, 255])); + let d = compare( + &a, + &b, + &CompareOpts { + mssim: false, + ..Default::default() + }, + ); + // EXPECTED: re-bless intentionally if the algorithm changes. + let Diff { + differing_pixels, + max_channel_delta, + total_pixels, + .. + } = d; + assert_eq!( + (differing_pixels, max_channel_delta, total_pixels), + (3, 255, 64), + "vendored YIQ/AA constants drifted — re-derive deliberately, do not patch the number", + ); +} From ca8e13b056ffda4cd9405a3ae27986b8ab2d9c73 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 05:26:57 -0700 Subject: [PATCH 13/70] refactor(verify): delete RMSE visual::compare_images, migrate callers to metric MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit metric.md § Migration step 1: the RMSE metric and DiffResult are gone; tests/visual.rs and smoke.rs move onto metric::compare + Diff::passes (in-memory fixtures replace baseline/tinted PNGs). One metric now. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/lib.rs | 3 +- crates/buiy_verify/src/visual.rs | 45 ----------- .../tests/fixtures/visual/baseline.png | Bin 73 -> 0 bytes .../tests/fixtures/visual/tinted.png | Bin 73 -> 0 bytes crates/buiy_verify/tests/smoke.rs | 4 +- crates/buiy_verify/tests/visual.rs | 76 ++++++++++++------ 6 files changed, 54 insertions(+), 74 deletions(-) delete mode 100644 crates/buiy_verify/src/visual.rs delete mode 100644 crates/buiy_verify/tests/fixtures/visual/baseline.png delete mode 100644 crates/buiy_verify/tests/fixtures/visual/tinted.png diff --git a/crates/buiy_verify/src/lib.rs b/crates/buiy_verify/src/lib.rs index 226a01b..cdf4047 100644 --- a/crates/buiy_verify/src/lib.rs +++ b/crates/buiy_verify/src/lib.rs @@ -1,4 +1,4 @@ -//! Buiy verification harness. Phase 0 ships visual regression, AccessKit +//! Buiy verification harness. Phase 0 ships the perceptual metric, AccessKit //! tree snapshot, and WCAG 2 contrast linter. Full harness (15 CI gates) //! lives in `buiy-verification-design`. //! @@ -7,4 +7,3 @@ pub mod a11y; pub mod contrast; pub mod metric; -pub mod visual; diff --git a/crates/buiy_verify/src/visual.rs b/crates/buiy_verify/src/visual.rs deleted file mode 100644 index 5d2542b..0000000 --- a/crates/buiy_verify/src/visual.rs +++ /dev/null @@ -1,45 +0,0 @@ -//! Visual regression — perceptual diff with a tolerance budget. -//! See: docs/specs/2026-05-07-buiy-foundation/verification.md (CI gate #2). - -use image::{DynamicImage, GenericImageView}; - -#[must_use] -pub struct DiffResult { - /// 0.0 = identical, 1.0 = totally different. - pub score: f64, -} - -impl DiffResult { - pub fn passed(&self, tolerance: f64) -> bool { - self.score <= tolerance - } -} - -pub fn compare_images(a: &DynamicImage, b: &DynamicImage) -> DiffResult { - if a.dimensions() != b.dimensions() { - return DiffResult { score: 1.0 }; - } - let a8 = a.to_rgba8(); - let b8 = b.to_rgba8(); - // Widen u32 → u64 BEFORE multiplying. 
`width * height` in u32 overflows - // for images > 4 gigapixels (theoretical, but cheap to harden). - let pixels = a8.width() as u64 * a8.height() as u64; - // Two zero-sized images compare identical: the only achievable score for - // an empty pixel set is "no difference observed". Returning 0.0 here - // also avoids a NaN from `accumulated as f64 / 0.0`, which would make - // `passed(any_tol)` silently false for every tolerance. - if pixels == 0 { - return DiffResult { score: 0.0 }; - } - let mut accumulated = 0u64; - for (pa, pb) in a8.pixels().zip(b8.pixels()) { - for ch in 0..4 { - let d = pa[ch] as i32 - pb[ch] as i32; - accumulated += (d * d) as u64; - } - } - let max = (pixels * 4 * 255 * 255) as f64; - DiffResult { - score: (accumulated as f64 / max).sqrt(), - } -} diff --git a/crates/buiy_verify/tests/fixtures/visual/baseline.png b/crates/buiy_verify/tests/fixtures/visual/baseline.png deleted file mode 100644 index bb24c50df343a6239d6a70d78ebdee365dcf7d89..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 73 zcmeAS@N?(olHy`uVBq!ia0vp^j3CUx1|;Q0k8}bl0Z$jlkP1ddR)$~y8U6zS=Wph} Ti!XYDG% RgbaImage { + RgbaImage::from_pixel(w, h, Rgba(px)) +} #[test] -fn identical_images_diff_zero() { - let baseline = open("tests/fixtures/visual/baseline.png").unwrap(); - let result: DiffResult = compare_images(&baseline, &baseline); - assert_eq!(result.score, 0.0); - assert!(result.passed(0.01), "identical images pass 0.01 tolerance"); +fn identical_images_pass_exact() { + let img = solid(16, 16, [30, 60, 90, 255]); + let d = compare(&img, &img, &CompareOpts::default()); + assert_eq!(d.differing_pixels, 0); + assert!( + d.passes(&FuzzBudget::EXACT), + "identical images pass the exact budget" + ); } #[test] -fn tinted_image_diff_nonzero() { - let a = open("tests/fixtures/visual/baseline.png").unwrap(); - let b = open("tests/fixtures/visual/tinted.png").unwrap(); - let result = compare_images(&a, &b); - assert!(result.score > 0.0, 
"different images produce nonzero diff"); +fn tinted_image_fails_exact() { + let a = solid(16, 16, [40, 40, 40, 255]); + let b = solid(16, 16, [40, 40, 200, 255]); // uniform blue tint + let d = compare( + &a, + &b, + &CompareOpts { + include_aa: true, + ..Default::default() + }, + ); + assert!(d.differing_pixels > 0, "a uniform tint differs"); + assert!( + !d.passes(&FuzzBudget::EXACT), + "tinted image fails the exact budget" + ); } #[test] -fn dimension_mismatch_returns_one() { - let a = DynamicImage::ImageRgba8(RgbaImage::new(2, 2)); - let b = DynamicImage::ImageRgba8(RgbaImage::new(3, 2)); - let result = compare_images(&a, &b); - assert_eq!(result.score, 1.0); - assert!(!result.passed(0.5), "mismatched-dim sentinel exceeds 0.5"); +fn dimension_mismatch_fails_every_budget() { + let a = solid(2, 2, [0, 0, 0, 255]); + let b = solid(3, 2, [0, 0, 0, 255]); + let d = compare(&a, &b, &CompareOpts::default()); + assert!( + !d.passes(&FuzzBudget { + max_channel_delta: 255, + max_diff_pixels: u32::MAX + }), + "mismatched dims saturate and fail even a maximal budget" + ); } #[test] -fn empty_images_compare_identical_without_nan() { - let a = DynamicImage::ImageRgba8(RgbaImage::new(0, 0)); - let b = DynamicImage::ImageRgba8(RgbaImage::new(0, 0)); - let result = compare_images(&a, &b); - assert_eq!(result.score, 0.0, "0x0 vs 0x0 is identical, not NaN"); +fn empty_vs_empty_is_zero_diff() { + let e = RgbaImage::new(0, 0); + let d = compare(&e, &e, &CompareOpts::default()); + assert_eq!(d.total_pixels, 0); assert!( - result.passed(0.01), - "empty-vs-empty must pass any non-negative tolerance" + d.passes(&FuzzBudget::EXACT), + "empty-vs-empty observes no difference" ); } From 1292f58c7b12e89e11718ec253bbf0c6035c46ed Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 05:29:08 -0700 Subject: [PATCH 14/70] refactor(core): deprecate perceptual_diff in place MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit metric.md § Migration step 
2: buiy_core cannot depend on buiy_verify in its normal graph, so perceptual_diff carries a #[deprecated] gravestone pointing at buiy_verify::metric::compare; its L1 body stays for the unmigrated ignored GPU re-capture tests (Phase 3). Callers gain a file-level allow(deprecated) until they migrate. text_gpu.rs gains a TEMPORARY allow here (removed in 1a.10 when it migrates) so this commit stays clippy -D warnings clean; the plan's split leaves it warning otherwise. The deprecation note avoids literal #[ignore] brackets — rustdoc parses [ignore] as an intra-doc link and fails the -D warnings doc gate. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_core/src/lib.rs | 1 + crates/buiy_core/src/render/golden.rs | 3 +++ crates/buiy_core/tests/render_golden_harness.rs | 1 + crates/buiy_core/tests/text_decoration_gpu.rs | 1 + crates/buiy_core/tests/text_golden_suite_gpu.rs | 1 + crates/buiy_core/tests/text_gpu.rs | 1 + crates/buiy_core/tests/text_selection_caret_gpu.rs | 1 + 7 files changed, 9 insertions(+) diff --git a/crates/buiy_core/src/lib.rs b/crates/buiy_core/src/lib.rs index 72855a4..d36a6a5 100644 --- a/crates/buiy_core/src/lib.rs +++ b/crates/buiy_core/src/lib.rs @@ -48,6 +48,7 @@ pub use render::forced_colors::{PrePreferenceTheme, apply_forced_colors_theme}; pub use render::forced_colors_analyzer::{ CatalogPaint, ForcedColorsViolation, analyze_forced_colors, analyze_shadow_only, }; +#[allow(deprecated)] pub use render::golden::{GoldenConfig, perceptual_diff}; pub use text::{ BuiyTextPlugin, ComputedTextLayout, FontFamily, FontSize, FontWeight, FontsGeneration, diff --git a/crates/buiy_core/src/render/golden.rs b/crates/buiy_core/src/render/golden.rs index d063178..86d1a1e 100644 --- a/crates/buiy_core/src/render/golden.rs +++ b/crates/buiy_core/src/render/golden.rs @@ -259,6 +259,9 @@ pub fn readback_rgba_into( /// `buiy-verification-design`) — the budget is the line between jitter and /// regression. 
Frames must be the same length (same dimensions); mismatched /// lengths return `1.0` (maximal difference). +#[deprecated( + note = "use buiy_verify::metric::compare; kept only for unmigrated ignored GPU re-capture tests" +)] pub fn perceptual_diff(a: &[u8], b: &[u8]) -> f32 { if a.len() != b.len() || a.is_empty() { return 1.0; diff --git a/crates/buiy_core/tests/render_golden_harness.rs b/crates/buiy_core/tests/render_golden_harness.rs index f49f9d8..9017f31 100644 --- a/crates/buiy_core/tests/render_golden_harness.rs +++ b/crates/buiy_core/tests/render_golden_harness.rs @@ -1,6 +1,7 @@ //! Golden-image harness (gate #2). The triad config + perceptual diff are //! device-free and gating; the actual capture needs a wgpu adapter and is //! #[ignore]. Spec: verification.md § 4. +#![allow(deprecated)] // perceptual_diff is deprecated; these GPU sites migrate to buiy_verify::metric in Phase 3 (tier-5 goldens). mod support; diff --git a/crates/buiy_core/tests/text_decoration_gpu.rs b/crates/buiy_core/tests/text_decoration_gpu.rs index bbe2740..636b62d 100644 --- a/crates/buiy_core/tests/text_decoration_gpu.rs +++ b/crates/buiy_core/tests/text_decoration_gpu.rs @@ -29,6 +29,7 @@ //! //! This supersedes the plan's sketch of one ±4-of-full-coverage matcher for //! both tiers — that matcher can never see a thin AA'd quad row. +#![allow(deprecated)] // perceptual_diff is deprecated; these GPU sites migrate to buiy_verify::metric in Phase 3 (tier-5 goldens). mod support; diff --git a/crates/buiy_core/tests/text_golden_suite_gpu.rs b/crates/buiy_core/tests/text_golden_suite_gpu.rs index d4caa2d..675395d 100644 --- a/crates/buiy_core/tests/text_golden_suite_gpu.rs +++ b/crates/buiy_core/tests/text_golden_suite_gpu.rs @@ -6,6 +6,7 @@ //! need a wgpu adapter (CLAUDE.md GPU lane). //! //! 
Run: cargo test -p buiy_core --test text_golden_suite_gpu -- --ignored --test-threads=1 +#![allow(deprecated)] // perceptual_diff is deprecated; these GPU sites migrate to buiy_verify::metric in Phase 3 (tier-5 goldens). mod support; diff --git a/crates/buiy_core/tests/text_gpu.rs b/crates/buiy_core/tests/text_gpu.rs index 305c67b..cc7c7a1 100644 --- a/crates/buiy_core/tests/text_gpu.rs +++ b/crates/buiy_core/tests/text_gpu.rs @@ -5,6 +5,7 @@ //! need a wgpu adapter (CLAUDE.md GPU lane). //! //! Run: cargo test -p buiy_core --test text_gpu -- --ignored --test-threads=1 +#![allow(deprecated)] // TEMPORARY (Phase 1a.9): perceptual_diff deprecated; this file migrates to buiy_verify::metric::compare in 1a.10, which removes this allow. mod support; diff --git a/crates/buiy_core/tests/text_selection_caret_gpu.rs b/crates/buiy_core/tests/text_selection_caret_gpu.rs index 94281db..eb73839 100644 --- a/crates/buiy_core/tests/text_selection_caret_gpu.rs +++ b/crates/buiy_core/tests/text_selection_caret_gpu.rs @@ -27,6 +27,7 @@ //! `min(r,g,b) ≥ 180` rejects every red/blue mix (their g ≈ 0). //! - **Caret (glyph-tier solid stamp, red):** hard-edged at alpha 1 (no SDF //! AA) — a § 3.3-snapped 1-physical-px column of the exact red encode. +#![allow(deprecated)] // perceptual_diff is deprecated; these GPU sites migrate to buiy_verify::metric in Phase 3 (tier-5 goldens). mod support; From b8e0d126504e08f194e3267b5cdba9337f1ba392 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 05:31:41 -0700 Subject: [PATCH 15/70] refactor(core): migrate text_gpu re-capture/anti-tests to metric::compare MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The #[ignore] GPU re-capture tests reach the unified metric over the dev-only buiy_core -> buiy_verify edge (landed Phase 0.2). Stable re-capture sites -> passes(&EXACT) via assert_stable; the must-differ anti-tests (:152, :271) -> !passes(&EXACT) via assert_differs. 
The TEMPORARY allow(deprecated) added in 1a.9 is removed (the file no longer names perceptual_diff). Verified on the RX 6700 XT GPU lane: all 6 #[ignore] tests pass, the stable sites bit-exact at EXACT (0,0) — the old < 1e-4 tolerance was not masking drift. The stored-baseline sites in the other text_*_gpu.rs files stay on deprecated perceptual_diff until Phase 3. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_core/tests/text_gpu.rs | 74 +++++++++++++++++++----------- 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/crates/buiy_core/tests/text_gpu.rs b/crates/buiy_core/tests/text_gpu.rs index cc7c7a1..4ca6ab2 100644 --- a/crates/buiy_core/tests/text_gpu.rs +++ b/crates/buiy_core/tests/text_gpu.rs @@ -5,7 +5,6 @@ //! need a wgpu adapter (CLAUDE.md GPU lane). //! //! Run: cargo test -p buiy_core --test text_gpu -- --ignored --test-threads=1 -#![allow(deprecated)] // TEMPORARY (Phase 1a.9): perceptual_diff deprecated; this file migrates to buiy_verify::metric::compare in 1a.10, which removes this allow. mod support; @@ -16,16 +15,37 @@ use buiy_core::layout::Style; use buiy_core::render::atlas::{AtlasBitmap, AtlasConfig, AtlasFormat, AtlasKey, BuiyAtlas}; use buiy_core::render::color::ColorToken; use buiy_core::render::components::TextColor; -use buiy_core::render::golden::{GoldenConfig, perceptual_diff}; +use buiy_core::render::golden::GoldenConfig; use buiy_core::text::{ FamilyEntry, FontFamily, FontSize, FontStack, GenericFamily, ResidentTextKeys, Text, }; +use buiy_verify::metric::{CompareOpts, FuzzBudget, compare}; use std::borrow::Cow; const W: u32 = 128; const H: u32 = 64; const TOKEN: &str = "test.text"; +/// Wrap a raw RGBA readback (W×H) as an `RgbaImage` for `metric::compare`. 
+fn img(bytes: &[u8]) -> image::RgbaImage { + image::RgbaImage::from_raw(W, H, bytes.to_vec()).expect("readback length == W*H*4") +} + +/// The stable-recapture spelling: two fresh captures of the same scene must +/// agree bit-exactly within the pinned rasterizer (metric.md § re-capture +/// determinism). `FuzzBudget::EXACT` is `(0, 0)`. +fn assert_stable(a: &[u8], b: &[u8], msg: &str) { + let d = compare(&img(a), &img(b), &CompareOpts::default()); + assert!(d.passes(&FuzzBudget::EXACT), "{msg}"); +} + +/// The anti-test spelling: two captures must NOT match at the exact budget — +/// proof the input change actually moved pixels (metric.md § anti-tests). +fn assert_differs(a: &[u8], b: &[u8], msg: &str) { + let d = compare(&img(a), &img(b), &CompareOpts::default()); + assert!(!d.passes(&FuzzBudget::EXACT), "{msg}"); +} + /// One big themed line ("Hi", 40 px — thick stems guarantee full-coverage /// interior texels) under a sized column root. Returns the text entity /// (the churn twin mutates it). @@ -112,10 +132,10 @@ fn hello_text_first_frame_is_deterministic_and_tinted() { // gate-#2 determinism: an independent fresh capture matches (the // stored-PNG machinery stays deferred; the re-capture IS the golden). 
let frame_b = capture(tint); - let diff = perceptual_diff(&frame_a, &frame_b); - assert!( - diff < 1e-4, - "two fresh captures diverged: perceptual_diff = {diff}" + assert_stable( + &frame_a, + &frame_b, + "two fresh captures diverged (must be bit-exact within the pinned rasterizer)", ); } @@ -149,9 +169,10 @@ fn retint_real_text_leaves_atlas_byte_identical() { "CoverageR8 page byte-identical across the retint — tint is \ per-instance, never a key input (§ 5.1/§ 7)" ); - assert!( - perceptual_diff(&frame_a, &frame_b) > 5e-4, - "the retint is visible in the framebuffer (byte-identity is not vacuous)" + assert_differs( + &frame_a, + &frame_b, + "the retint is visible in the framebuffer (byte-identity is not vacuous)", ); } @@ -213,10 +234,7 @@ fn touch_pass_prevents_stale_uv_corruption() { } } let frame_b = support::readback_rgba(&mut app, target.clone()); - assert!( - perceptual_diff(&frame_a, &frame_b) < 1e-4, - "retained frames render identically" - ); + assert_stable(&frame_a, &frame_b, "retained frames render identically"); // Half 2 — the hazard a DISABLED touch pass would allow, simulated // (decision 7: no prod flag — we force the eviction directly): evict a @@ -268,10 +286,11 @@ fn touch_pass_prevents_stale_uv_corruption() { ); } let frame_c = support::readback_rgba(&mut app, target); - assert!( - perceptual_diff(&frame_a, &frame_c) > 1e-4, + assert_differs( + &frame_a, + &frame_c, "stale UVs sampled the filler — the silent corruption § 6.3's \ - un-gated touch pass exists to prevent" + un-gated touch pass exists to prevent", ); } @@ -356,9 +375,10 @@ fn multi_script_text_renders_deterministically() { !a.chunks_exact(4).all(|p| p == &a[0..4]), "something painted" ); - assert!( - perceptual_diff(&a, &b) < 1e-4, - "two independent captures are byte-stable (deterministic fonts + resolver)" + assert_stable( + &a, + &b, + "two independent captures are byte-stable (deterministic fonts + resolver)", ); } @@ -449,9 +469,10 @@ fn font_db_rebuild_storm_is_bounded() { 
); } let frame_after = support::readback_rgba(&mut app, target); - assert!( - perceptual_diff(&frame_before, &frame_after) < 1e-4, - "the storm is invisible: same bytes, same shaping, same pixels" + assert_stable( + &frame_before, + &frame_after, + "the storm is invisible: same bytes, same shaping, same pixels", ); } @@ -542,10 +563,9 @@ fn typing_churn_is_bounded_and_invisible() { // The pixels half: same final text, same pixels — the churn is // invisible through the real upload/draw path. let frame_after = support::readback_rgba(&mut app, target); - let diff = perceptual_diff(&frame_before, &frame_after); - assert!( - diff < 1e-4, - "the churn is invisible: frame byte-stable across churn-and-settle \ - (perceptual_diff = {diff})" + assert_stable( + &frame_before, + &frame_after, + "the churn is invisible: frame byte-stable across churn-and-settle", ); } From ba9b631cdd03b8ba8ad3c3fde779f4b5d1f40a5f Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 05:43:52 -0700 Subject: [PATCH 16/70] docs(verify): escape [0,1] range in metric doc comments The MSSIM/threshold doc comments wrote the range as a bare [0,1], which rustdoc parses as an intra-doc link and fails the RUSTDOCFLAGS="-D warnings" doc gate (unresolved link to `0,1`). Wrapped in backticks so it renders as code, not a link. No behavior change. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/metric.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/buiy_verify/src/metric.rs b/crates/buiy_verify/src/metric.rs index c9c77be..3a4c620 100644 --- a/crates/buiy_verify/src/metric.rs +++ b/crates/buiy_verify/src/metric.rs @@ -24,7 +24,7 @@ pub struct Diff { pub max_channel_delta: u8, /// Total pixels compared (== w*h; 0 only for empty/degenerate input). pub total_pixels: u32, - /// Advisory MSSIM in [0,1] (1 == identical). `None` when skipped. + /// Advisory MSSIM in `[0,1]` (1 == identical). `None` when skipped. 
pub mssim: Option, /// Heatmap: AA pixels dimmed, differing pixels painted (pixelmatch palette). pub diff_image: Option, @@ -60,7 +60,7 @@ impl FuzzBudget { /// AA pixels COUNT (for the few tests that assert AA exactly). #[derive(Clone, Copy, Debug)] pub struct CompareOpts { - /// Matching sensitivity in [0,1]; default 0.1. Smaller = stricter. + /// Matching sensitivity in `[0,1]`; default 0.1. Smaller = stricter. pub threshold: f64, /// Treat antialiased pixels as differences instead of excluding them. pub include_aa: bool, From 61901e3238a25b4ee4edd024036630d8417ca932 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 06:14:05 -0700 Subject: [PATCH 17/70] feat(verify): add CompareOpts::reftest_default for tier-4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AA-exclusion on, MSSIM advisory, no diff-image alloc in the hot path — the options run_reftest passes to metric::compare (reftests.md § API). Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/metric.rs | 15 +++++++++++++++ crates/buiy_verify/tests/metric.rs | 9 +++++++++ 2 files changed, 24 insertions(+) diff --git a/crates/buiy_verify/src/metric.rs b/crates/buiy_verify/src/metric.rs index 3a4c620..1932788 100644 --- a/crates/buiy_verify/src/metric.rs +++ b/crates/buiy_verify/src/metric.rs @@ -81,6 +81,21 @@ impl Default for CompareOpts { } } +impl CompareOpts { + /// The reftest-tier options: AA-sibling pixels excluded (two CSS-subset + /// code paths can legitimately differ by one AA pixel on a shared corner), + /// MSSIM advisory-on, and no diff-image allocation in the hot capture loop + /// (the report is emitted with `emit_diff_image` only on failure). + pub fn reftest_default() -> Self { + Self { + threshold: 0.1, + include_aa: false, + mssim: true, + emit_diff_image: false, + } + } +} + /// Compare two RGBA images. **Infallible** — returns a `Diff`, never a /// `Result`. 
(AA exclusion is layered in 1a.3; here every over-threshold pixel /// counts.) diff --git a/crates/buiy_verify/tests/metric.rs b/crates/buiy_verify/tests/metric.rs index 720d733..593f85d 100644 --- a/crates/buiy_verify/tests/metric.rs +++ b/crates/buiy_verify/tests/metric.rs @@ -86,3 +86,12 @@ fn vendored_constants_are_pinned() { "vendored YIQ/AA constants drifted — re-derive deliberately, do not patch the number", ); } + +#[test] +fn reftest_default_excludes_aa_and_skips_diff_image() { + let opts = buiy_verify::metric::CompareOpts::reftest_default(); + assert!(!opts.include_aa, "reftest excludes AA-sibling pixels"); + assert!(opts.mssim, "MSSIM stays computed (advisory)"); + assert!(!opts.emit_diff_image, "hot reftest path allocates no diff image"); + assert_eq!(opts.threshold, 0.1, "pixelmatch default sensitivity"); +} From 3aacb1856c850a0bc8d6ae81669f3da223a68eb1 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 06:14:42 -0700 Subject: [PATCH 18/70] feat(verify): reftest module skeleton + RefKind parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RefKind{Match,Mismatch} and reftest_kind(&str) — the token parser the reftest! macro calls. reftests.md § Module & public API. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/lib.rs | 1 + crates/buiy_verify/src/reftest.rs | 46 +++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 crates/buiy_verify/src/reftest.rs diff --git a/crates/buiy_verify/src/lib.rs b/crates/buiy_verify/src/lib.rs index cdf4047..5c12dd6 100644 --- a/crates/buiy_verify/src/lib.rs +++ b/crates/buiy_verify/src/lib.rs @@ -7,3 +7,4 @@ pub mod a11y; pub mod contrast; pub mod metric; +pub mod reftest; diff --git a/crates/buiy_verify/src/reftest.rs b/crates/buiy_verify/src/reftest.rs new file mode 100644 index 0000000..598d8e4 --- /dev/null +++ b/crates/buiy_verify/src/reftest.rs @@ -0,0 +1,46 @@ +//! 
Tier 4 — reftests + the CPU-vs-GPU SDF cross-check (reftests.md). +//! +//! A reftest renders a `test` and a `reference` scene with the SAME engine in +//! ONE process and asserts their bitmaps match (`==`) or differ (`!=`), never +//! against a stored baseline — so every platform-variance term (driver SDF +//! rounding, glyph-atlas AA, sRGB encode, clock) cancels in the diff. The +//! harness stores ZERO bytes. GPU-coupled cases are `#[ignore]`; the pairing / +//! aggregation logic and the independence lint are pure-CPU and gate headless. + +/// Whether a [`RefCase`] passes on equality or on difference. +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum RefKind { + /// Pass iff `test` and `reference` render to the same bitmap within `fuzz`. + Match, + /// Pass iff they render DIFFERENTLY (a `!=` anti-test guards silent no-ops). + Mismatch, +} + +impl RefKind { + /// Parse the `reftest!` macro's kind token (`stringify!($kind)`). + /// Panics on any other token — the macro only ever passes these two. + pub fn reftest_kind(token: &str) -> Self { + match token { + "match" => RefKind::Match, + "mismatch" => RefKind::Mismatch, + other => panic!("reftest! kind must be `match` or `mismatch`, got `{other}`"), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn reftest_kind_parses_both_tokens() { + assert_eq!(RefKind::reftest_kind("match"), RefKind::Match); + assert_eq!(RefKind::reftest_kind("mismatch"), RefKind::Mismatch); + } + + #[test] + #[should_panic(expected = "must be `match` or `mismatch`")] + fn reftest_kind_rejects_garbage() { + let _ = RefKind::reftest_kind("nope"); + } +} From 5960161a9661d8177b619228623dcd1032d70009 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 06:15:34 -0700 Subject: [PATCH 19/70] feat(verify): RefCase + RefOutcome reftest types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pairing (name/kind/test/reference/fuzz) and its outcome (passed/diff/report_path). 
reftests.md § Module & public API. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/reftest.rs | 45 +++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/crates/buiy_verify/src/reftest.rs b/crates/buiy_verify/src/reftest.rs index 598d8e4..c83ec13 100644 --- a/crates/buiy_verify/src/reftest.rs +++ b/crates/buiy_verify/src/reftest.rs @@ -28,6 +28,35 @@ impl RefKind { } } +use crate::metric::{Diff, FuzzBudget}; +use bevy::app::App; + +/// One reftest pairing. `test` and `reference` each build a scene into a +/// fresh, deterministic `App` (spawn entities; do NOT drive frames — +/// `run_reftest` owns the capture loop). Co-locate the expectation with the +/// `#[test]` the `reftest!` macro generates. +pub struct RefCase { + pub name: &'static str, + pub kind: RefKind, + /// Builds the scene exercising the feature under test. + pub test: fn(&mut App), + /// Builds the independent-oracle scene (see "Reference independence"). + pub reference: fn(&mut App), + /// Per-pairing fuzz, à la Mozilla `fuzzy-if`. Default `(0,0)` once the + /// determinism stack is in (determinism.md); widen with a documented reason. + pub fuzz: FuzzBudget, +} + +/// The result of running one [`RefCase`]. +#[derive(Debug)] +pub struct RefOutcome { + pub passed: bool, + pub diff: Diff, + /// On failure, a self-contained local HTML triage report (test | ref | + /// diff). Path printed to stderr; never committed. 
+ pub report_path: Option, +} + #[cfg(test)] mod tests { use super::*; @@ -43,4 +72,20 @@ mod tests { fn reftest_kind_rejects_garbage() { let _ = RefKind::reftest_kind("nope"); } + + #[test] + fn refcase_is_constructible_with_zero_fuzz_default() { + use crate::metric::FuzzBudget; + use bevy::app::App; + fn noop(_: &mut App) {} + let case = RefCase { + name: "constructs", + kind: RefKind::Match, + test: noop, + reference: noop, + fuzz: FuzzBudget::EXACT, + }; + assert_eq!(case.name, "constructs"); + assert_eq!(case.fuzz, FuzzBudget::EXACT); + } } From bd6d969862114fd400bb3d044e7fa9c73dd1ab14 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 06:16:29 -0700 Subject: [PATCH 20/70] feat(verify): pure evaluate_outcome pass-decision + truth table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Match passes within budget, Mismatch passes outside it (the silent-no-op guard). Pure CPU so it gates headless. reftests.md § Verification #1. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/reftest.rs | 63 +++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/crates/buiy_verify/src/reftest.rs b/crates/buiy_verify/src/reftest.rs index c83ec13..1af2214 100644 --- a/crates/buiy_verify/src/reftest.rs +++ b/crates/buiy_verify/src/reftest.rs @@ -57,6 +57,18 @@ pub struct RefOutcome { pub report_path: Option, } +/// The pure pass-decision: `Match` passes iff the diff fits the budget; +/// `Mismatch` passes iff it does NOT (the feature must *do* something). Split +/// out of `run_reftest` so it gates headless via the aggregation truth table — +/// no GPU. The `(0,0)`-floor enforcement for `Mismatch` lives at macro +/// expansion time, so `evaluate_outcome` takes the budget as given. 
+pub fn evaluate_outcome(kind: RefKind, diff: &Diff, fuzz: &FuzzBudget) -> bool { + match kind { + RefKind::Match => diff.passes(fuzz), + RefKind::Mismatch => !diff.passes(fuzz), + } +} + #[cfg(test)] mod tests { use super::*; @@ -88,4 +100,55 @@ mod tests { assert_eq!(case.name, "constructs"); assert_eq!(case.fuzz, FuzzBudget::EXACT); } + + use crate::metric::Diff; + + /// A stub Diff with `n` differing pixels and `max_channel_delta = d`, no MSSIM. + fn stub_diff(n: u32, d: u8) -> Diff { + Diff { + differing_pixels: n, + max_channel_delta: d, + total_pixels: 1024, + mssim: None, + diff_image: None, + saturated: false, + } + } + + #[test] + fn match_passes_within_fuzz_fails_outside() { + assert!(evaluate_outcome( + RefKind::Match, + &stub_diff(0, 0), + &FuzzBudget::EXACT + )); + assert!(!evaluate_outcome( + RefKind::Match, + &stub_diff(1, 200), + &FuzzBudget::EXACT + )); + assert!(evaluate_outcome( + RefKind::Match, + &stub_diff(1, 8), + &FuzzBudget { + max_channel_delta: 8, + max_diff_pixels: 1 + } + )); + } + + #[test] + fn mismatch_passes_outside_fuzz_fails_within() { + assert!(evaluate_outcome( + RefKind::Mismatch, + &stub_diff(50, 200), + &FuzzBudget::EXACT + )); + // A scene that did NOT change (zero diff) FAILS a mismatch — the no-op guard. 
+ assert!(!evaluate_outcome( + RefKind::Mismatch, + &stub_diff(0, 0), + &FuzzBudget::EXACT + )); + } } From fa86a01ea81803d000bb444bf5a8a70d016aea28 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 06:29:03 -0700 Subject: [PATCH 21/70] feat(verify): run_reftest engine + promote capture_app to src MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit run_reftest captures test+reference in ONE app via capture_to_image (re-target + re-readback) and diffs with metric::compare; the painting-app builder is promoted from tests/support into render::golden::capture_app so buiy_verify builds its app from src (the test-support gpu_render_app* builders now delegate to the single src body — anti-drift). GPU known-good/ known-bad pairs prove the harness can both pass and fail (vacuous-green guard). reftests.md §§ API, Verification #3. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_core/src/render/golden.rs | 57 ++++++++++ .../buiy_core/tests/render_capture_app_gpu.rs | 50 ++++++++ crates/buiy_core/tests/support/mod.rs | 35 +----- crates/buiy_verify/src/lib.rs | 1 + crates/buiy_verify/src/reftest.rs | 91 +++++++++++++++ crates/buiy_verify/src/support.rs | 25 ++++ .../buiy_verify/tests/reftest_engine_gpu.rs | 107 ++++++++++++++++++ 7 files changed, 336 insertions(+), 30 deletions(-) create mode 100644 crates/buiy_core/tests/render_capture_app_gpu.rs create mode 100644 crates/buiy_verify/src/support.rs create mode 100644 crates/buiy_verify/tests/reftest_engine_gpu.rs diff --git a/crates/buiy_core/src/render/golden.rs b/crates/buiy_core/src/render/golden.rs index 86d1a1e..f9bcffa 100644 --- a/crates/buiy_core/src/render/golden.rs +++ b/crates/buiy_core/src/render/golden.rs @@ -90,6 +90,63 @@ pub const CAPTURE_MSAA: bevy::render::view::Msaa = bevy::render::view::Msaa::Off /// camera spawns with no `DebandDither::Enabled`). 
pub const CAPTURE_DITHER_OFF: bool = true; +/// Build the canonical headless painting App at a logical viewport size, +/// promoted from `tests/support/mod.rs` into src so `buiy_verify`'s reftest / +/// golden tiers build their app without the test crate. NOT finished: +/// [`capture_to_image`] finishes + drives to quiescence + reads back. +pub fn capture_app(logical_w: u32, logical_h: u32) -> bevy::app::App { + capture_app_scaled(logical_w, logical_h, 1.0) +} + +/// [`capture_app`] at an explicit window scale factor (the DPR-pin builder +/// determinism.md sizes the offscreen target through). Bevy 0.18 +/// `WindowResolution::new` takes PHYSICAL units; pass `logical × scale` plus +/// the override so `resolution.size()` reads back the logical size the view +/// uniform is built from. +pub fn capture_app_scaled(logical_w: u32, logical_h: u32, scale_factor: f32) -> bevy::app::App { + use bevy::window::WindowResolution; + let resolution = WindowResolution::new( + (logical_w as f32 * scale_factor).round() as u32, + (logical_h as f32 * scale_factor).round() as u32, + ) + .with_scale_factor_override(scale_factor); + capture_app_with_resolution(resolution) +} + +/// The one shared plugin stack behind [`capture_app`] / [`capture_app_scaled`] +/// (and, via delegation, the test-support `gpu_render_app*` builders) — a +/// single body so the scaled / test-support builders cannot drift. The plugin +/// set + init order MUST stay byte-identical to the documented capture stack +/// (the offscreen `Core2d` graph `BuiyRenderPlugin` wires into requires +/// `CorePipelinePlugin` before it). 
+pub fn capture_app_with_resolution(resolution: bevy::window::WindowResolution) -> bevy::app::App { + use bevy::app::App; + use bevy::prelude::*; + use bevy::window::{Window, WindowPlugin}; + + let mut app = App::new(); + app.add_plugins(MinimalPlugins) + .add_plugins(WindowPlugin { + primary_window: Some(Window { + resolution, + ..default() + }), + ..default() + }) + .add_plugins(bevy::asset::AssetPlugin::default()) + .add_plugins(bevy::render::RenderPlugin::default()) + .add_plugins(bevy::image::ImagePlugin::default()) + .add_plugins(bevy::camera::CameraPlugin) + .add_plugins(bevy::core_pipeline::CorePipelinePlugin) + .add_plugins(crate::theme::ThemePlugin) + .add_plugins(crate::layout::LayoutPlugin) + .add_plugins(crate::CorePlugin) + .add_plugins(crate::text::BuiyTextPlugin::default()) + .add_plugins(crate::render::BuiyRenderPlugin); + app.init_asset::(); + app +} + /// **The shared capture seam** (verification-design README § Architecture): /// render the already-built, fixture-populated `app` into an offscreen target /// sized to the window's PHYSICAL pixel grid and read it back as an diff --git a/crates/buiy_core/tests/render_capture_app_gpu.rs b/crates/buiy_core/tests/render_capture_app_gpu.rs new file mode 100644 index 0000000..a9b5279 --- /dev/null +++ b/crates/buiy_core/tests/render_capture_app_gpu.rs @@ -0,0 +1,50 @@ +//! GPU lane: `render::golden::capture_app` builds a painting-capable headless +//! App identical to the test-support `gpu_render_app` stack, so the reftest / +//! golden tiers in buiy_verify build their app from `src` (reftests.md § build +//! seam). #[ignore] — needs a real adapter. 
+ +use bevy::prelude::*; +use buiy_core::components::Node; +use buiy_core::layout::{Inset, Length, Sizing, Style}; +use buiy_core::render::ColorToken; +use buiy_core::render::components::Background; +use buiy_core::render::golden::{GoldenConfig, capture_app, capture_to_image}; +use std::borrow::Cow; + +#[test] +#[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] +fn capture_app_paints_a_non_blank_frame() { + let mut app = capture_app(64, 64); + { + let mut theme = app.world_mut().resource_mut::(); + theme + .colors + .insert("test.fill.a".into(), Color::srgb(0.90, 0.10, 0.10)); + } + let e = app + .world_mut() + .spawn(( + Node, + Style::default() + .absolute() + .inset(Inset { + top: Sizing::Length(Length::px(8.0)), + left: Sizing::Length(Length::px(8.0)), + ..default() + }) + .width_px(40.0) + .height_px(40.0), + Background { + color: ColorToken::Token(Cow::Borrowed("test.fill.a")), + }, + )) + .id(); + app.world_mut() + .spawn((Node, Style::default())) + .add_children(&[e]); + + let img = capture_to_image(&mut app, &GoldenConfig::deterministic()); + assert_eq!(img.dimensions(), (64, 64)); + let painted = img.pixels().any(|p| p.0 != [0, 0, 0, 255]); + assert!(painted, "capture_app must paint the box, not a blank frame"); +} diff --git a/crates/buiy_core/tests/support/mod.rs b/crates/buiy_core/tests/support/mod.rs index 5fac5ef..c2bf727 100644 --- a/crates/buiy_core/tests/support/mod.rs +++ b/crates/buiy_core/tests/support/mod.rs @@ -162,37 +162,12 @@ pub fn gpu_render_app_scaled(logical_w: u32, logical_h: u32, scale_factor: f32) } /// The one shared plugin stack behind [`gpu_render_app`] / -/// [`gpu_render_app_scaled`] — a single body so the scaled builder cannot -/// drift from the canonical one. 
+/// [`gpu_render_app_scaled`] — delegates to the promoted src builder +/// `buiy_core::render::golden::capture_app_with_resolution` so the canonical +/// plugin stack lives in exactly one place (anti-drift: the reftest / golden +/// tiers and these test-support builders are now the SAME body). fn gpu_render_app_with_resolution(resolution: bevy::window::WindowResolution) -> App { - let mut app = App::new(); - app.add_plugins(MinimalPlugins) - // Sized to the capture target so the primary-window-derived view uniform - // matches the offscreen image's pixel grid (see module note above). - .add_plugins(bevy::window::WindowPlugin { - primary_window: Some(Window { - resolution, - ..default() - }), - ..default() - }) - .add_plugins(bevy::asset::AssetPlugin::default()) - .add_plugins(bevy::render::RenderPlugin::default()) - .add_plugins(bevy::image::ImagePlugin::default()) - .add_plugins(bevy::camera::CameraPlugin) - // The 2D render graph: `Core2dPlugin` (inside `CorePipelinePlugin`) - // creates the `Core2d` sub-graph that `BuiyRenderPlugin` wires its node - // into. MUST precede `BuiyRenderPlugin` (plugins build in add order). - .add_plugins(bevy::core_pipeline::CorePipelinePlugin) - .add_plugins(buiy_core::theme::ThemePlugin) - .add_plugins(buiy_core::layout::LayoutPlugin) - .add_plugins(CorePlugin) - // The text engine + the T4 glyph producer (render half registers - // against the live RenderApp created by RenderPlugin above). 
- .add_plugins(buiy_core::text::BuiyTextPlugin::default()) - .add_plugins(BuiyRenderPlugin); - app.init_asset::(); - app + buiy_core::render::golden::capture_app_with_resolution(resolution) } /// Create an offscreen `Rgba8UnormSrgb` render-target image of `width`×`height`, diff --git a/crates/buiy_verify/src/lib.rs b/crates/buiy_verify/src/lib.rs index 5c12dd6..266f018 100644 --- a/crates/buiy_verify/src/lib.rs +++ b/crates/buiy_verify/src/lib.rs @@ -8,3 +8,4 @@ pub mod a11y; pub mod contrast; pub mod metric; pub mod reftest; +pub mod support; diff --git a/crates/buiy_verify/src/reftest.rs b/crates/buiy_verify/src/reftest.rs index 1af2214..f687751 100644 --- a/crates/buiy_verify/src/reftest.rs +++ b/crates/buiy_verify/src/reftest.rs @@ -69,6 +69,97 @@ pub fn evaluate_outcome(kind: RefKind, diff: &Diff, fuzz: &FuzzBudget) -> bool { } } +use crate::metric::{CompareOpts, compare}; +use buiy_core::render::golden::{GoldenConfig, capture_to_image}; + +/// The capture viewport for reftest pairings, in logical px. Both halves are +/// captured at this size in one app run; large enough that a single 40px box +/// and a 120px-shifted twin do not overlap (so a moved box is a real diff). +const REFTEST_LOGICAL: (u32, u32) = (200, 120); + +/// Render BOTH scenes via the buiy_core capture seam in ONE app run and diff +/// with `metric::compare`. Platform variance cancels because both halves share +/// one `wgpu::Device`, driver, atlas, and virtual clock. GPU-coupled. +/// +/// Until the determinism stack lands this builds the app via `reftest_app` +/// (the canonical `capture_app` seam); Phase 3 swaps that one line for +/// `DeterministicApp::build` with an identical `&mut App`→capture contract. 
+pub fn run_reftest(case: &RefCase) -> RefOutcome { + assert!( + mismatch_floor_ok(case.kind, &case.fuzz), + "reftest `{}`: a Mismatch with a non-(0,0) fuzz floor is vacuous", + case.name + ); + let (w, h) = REFTEST_LOGICAL; + let mut app = crate::support::reftest_app(w, h); + let cfg = GoldenConfig::deterministic(); + + let test_img = capture_to_image_with(&mut app, case.test, &cfg); + let ref_img = capture_to_image_with(&mut app, case.reference, &cfg); + + let diff = compare(&test_img, &ref_img, &CompareOpts::reftest_default()); + let passed = evaluate_outcome(case.kind, &diff, &case.fuzz); + let report_path = if passed { + None + } else { + Some(emit_report(case.name, &test_img, &ref_img, &diff)) + }; + RefOutcome { + passed, + diff, + report_path, + } +} + +/// Clear the previous scene, spawn `scene`, capture via the buiy_core seam. +fn capture_to_image_with( + app: &mut bevy::app::App, + scene: fn(&mut bevy::app::App), + cfg: &GoldenConfig, +) -> image::RgbaImage { + crate::support::clear_reftest_scene(app); + scene(app); + capture_to_image(app, cfg) +} + +/// Write a self-contained HTML triage report (test | ref | diff) to a temp +/// path and return it. Phase 3 swaps this for the golden-tier emitter; until +/// then, a minimal three-PNG dump. Never committed. +fn emit_report( + name: &str, + test: &image::RgbaImage, + reference: &image::RgbaImage, + diff: &Diff, +) -> std::path::PathBuf { + let dir = std::env::temp_dir().join("buiy-reftest"); + let _ = std::fs::create_dir_all(&dir); + let base = dir.join(name); + let _ = test.save(base.with_extension("test.png")); + let _ = reference.save(base.with_extension("ref.png")); + if let Some(img) = &diff.diff_image { + let _ = img.save(base.with_extension("diff.png")); + } + let report = base.with_extension("html"); + let _ = std::fs::write( + &report, + format!( + "

reftest {name} FAILED

differing_pixels={} max_channel_delta={}

\ + ", + diff.differing_pixels, diff.max_channel_delta + ), + ); + eprintln!("reftest {name} report: {}", report.display()); + report +} + +/// A `Mismatch` budget that tolerates difference is meaningless — its floor +/// must be `(0,0)`. `Match` may carry any widening. (Task 1b.7 replaces this +/// stub with the real guard + its meta-test; inlined `true` here only so the +/// 1b.5/1b.6 engine compiles green.) +fn mismatch_floor_ok(_kind: RefKind, _fuzz: &FuzzBudget) -> bool { + true +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/buiy_verify/src/support.rs b/crates/buiy_verify/src/support.rs new file mode 100644 index 0000000..dcca065 --- /dev/null +++ b/crates/buiy_verify/src/support.rs @@ -0,0 +1,25 @@ +//! GPU-capture glue for the reftest/golden tiers — the ONE place that names +//! the concrete app builder, so Phase 3 swaps it for `DeterministicApp` in a +//! single edit. `pub` so `tests/` integration tests reach it. + +use bevy::prelude::*; + +/// Build the headless painting app both reftest captures share. Until the +/// determinism builder lands this delegates to the promoted +/// `buiy_core::render::golden::capture_app` (Task 1b.6). +pub fn reftest_app(logical_w: u32, logical_h: u32) -> App { + buiy_core::render::golden::capture_app(logical_w, logical_h) +} + +/// Despawn the previous scene's spawned roots between the two captures so the +/// second scene renders alone. Keeps the camera + render-target entities. +pub fn clear_reftest_scene(app: &mut App) { + let roots: Vec = app + .world_mut() + .query_filtered::, Without)>() + .iter(app.world()) + .collect(); + for e in roots { + app.world_mut().entity_mut(e).despawn(); + } +} diff --git a/crates/buiy_verify/tests/reftest_engine_gpu.rs b/crates/buiy_verify/tests/reftest_engine_gpu.rs new file mode 100644 index 0000000..0b617ae --- /dev/null +++ b/crates/buiy_verify/tests/reftest_engine_gpu.rs @@ -0,0 +1,107 @@ +//! GPU lane (`--ignored`): proves the reftest engine can both PASS and FAIL. 
+//! reftests.md § Verification #3 — a scene-vs-itself match passes at (0,0); a +//! scene-vs-different match fails (guards a vacuous green); a scene-vs-itself +//! mismatch fails. Real adapter (RX 6700 XT here) / pinned lavapipe in CI. + +use bevy::prelude::*; +use buiy_core::components::Node; +use buiy_core::layout::{Inset, Length, Sizing, Style}; +use buiy_core::render::ColorToken; +use buiy_core::render::components::Background; +use buiy_verify::metric::FuzzBudget; +use buiy_verify::reftest::{RefCase, RefKind, run_reftest}; +use std::borrow::Cow; + +/// A single 40×40 fill at (left,8) in `token` color. Installs the token so the +/// scene is self-contained across the two captures `run_reftest` drives. +fn box_at(app: &mut App, left: f32, token: &'static str) { + { + let mut theme = app + .world_mut() + .resource_mut::(); + theme + .colors + .insert(token.into(), Color::srgb(0.90, 0.10, 0.10)); + } + let e = app + .world_mut() + .spawn(( + Node, + Style::default() + .absolute() + .inset(Inset { + top: Sizing::Length(Length::px(8.0)), + left: Sizing::Length(Length::px(left)), + ..default() + }) + .width_px(40.0) + .height_px(40.0), + Background { + color: ColorToken::Token(Cow::Borrowed(token)), + }, + )) + .id(); + app.world_mut() + .spawn((Node, Style::default())) + .add_children(&[e]); +} + +fn red_at_8(app: &mut App) { + box_at(app, 8.0, "test.fill.a"); +} +fn red_at_120(app: &mut App) { + box_at(app, 120.0, "test.fill.a"); +} + +#[test] +#[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] +fn match_of_scene_with_itself_passes() { + let case = RefCase { + name: "self_match", + kind: RefKind::Match, + test: red_at_8, + reference: red_at_8, + fuzz: FuzzBudget::EXACT, + }; + let outcome = run_reftest(&case); + assert!( + outcome.passed, + "self-match must pass at (0,0): {:?}", + outcome.diff + ); +} + +#[test] +#[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] +fn match_of_two_different_scenes_fails() { 
+ let case = RefCase { + name: "different_match_fails", + kind: RefKind::Match, + test: red_at_8, + reference: red_at_120, + fuzz: FuzzBudget::EXACT, + }; + let outcome = run_reftest(&case); + assert!( + !outcome.passed, + "differing scenes must NOT match (vacuous-green guard)" + ); + assert!( + outcome.report_path.is_some(), + "failure emits a triage report" + ); +} + +#[test] +#[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] +fn mismatch_of_scene_with_itself_fails() { + let case = RefCase { + name: "self_mismatch_fails", + kind: RefKind::Mismatch, + test: red_at_8, + reference: red_at_8, + fuzz: FuzzBudget::EXACT, + }; + let outcome = run_reftest(&case); + assert!(!outcome.passed, "a scene cannot mismatch itself"); +} From 138a2f4db77358196a0cc26876fa55006983bd19 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 06:29:54 -0700 Subject: [PATCH 22/70] feat(verify): reject non-(0,0) fuzz floor on a Mismatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A != that tolerates difference is vacuous — mismatch_floor_ok gates it pure-CPU and run_reftest asserts it as a belt (replacing the 1b.5 inline stub). reftests.md § Verification #2. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/reftest.rs | 40 +++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/crates/buiy_verify/src/reftest.rs b/crates/buiy_verify/src/reftest.rs index f687751..95ddb2a 100644 --- a/crates/buiy_verify/src/reftest.rs +++ b/crates/buiy_verify/src/reftest.rs @@ -153,11 +153,14 @@ fn emit_report( } /// A `Mismatch` budget that tolerates difference is meaningless — its floor -/// must be `(0,0)`. `Match` may carry any widening. (Task 1b.7 replaces this -/// stub with the real guard + its meta-test; inlined `true` here only so the -/// 1b.5/1b.6 engine compiles green.) 
-fn mismatch_floor_ok(_kind: RefKind, _fuzz: &FuzzBudget) -> bool { - true +/// must be `(0,0)`. `Match` may carry any widening. Pure CPU so it gates +/// headless (reftests.md § Verification #2); the `reftest!` macro enforces the +/// same at expansion time, and `run_reftest` asserts it as a belt. +pub fn mismatch_floor_ok(kind: RefKind, fuzz: &FuzzBudget) -> bool { + match kind { + RefKind::Mismatch => *fuzz == FuzzBudget::EXACT, + RefKind::Match => true, + } } #[cfg(test)] @@ -242,4 +245,31 @@ mod tests { &FuzzBudget::EXACT )); } + + #[test] + fn mismatch_requires_zero_fuzz_floor() { + assert!(mismatch_floor_ok(RefKind::Mismatch, &FuzzBudget::EXACT)); + assert!(!mismatch_floor_ok( + RefKind::Mismatch, + &FuzzBudget { + max_channel_delta: 1, + max_diff_pixels: 0 + } + )); + assert!(!mismatch_floor_ok( + RefKind::Mismatch, + &FuzzBudget { + max_channel_delta: 0, + max_diff_pixels: 1 + } + )); + // Match may carry any budget. + assert!(mismatch_floor_ok( + RefKind::Match, + &FuzzBudget { + max_channel_delta: 8, + max_diff_pixels: 4 + } + )); + } } From 7edb56e12dfeea895b5571409a970a025bb38595 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 06:31:09 -0700 Subject: [PATCH 23/70] feat(verify): reftest! macro generating #[ignore] GPU cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit reftest!(kind, fn_ident, test, reference[, fuzz=(d,p)]) emits one #[test] #[ignore] per pairing; a non-(0,0) floor on a mismatch fails to COMPILE via a const assert. reftests.md § 'The reftest! macro'. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/reftest.rs | 61 +++++++++++++++++++ crates/buiy_verify/tests/reftest_macro_gpu.rs | 42 +++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 crates/buiy_verify/tests/reftest_macro_gpu.rs diff --git a/crates/buiy_verify/src/reftest.rs b/crates/buiy_verify/src/reftest.rs index 95ddb2a..1747a1c 100644 --- a/crates/buiy_verify/src/reftest.rs +++ b/crates/buiy_verify/src/reftest.rs @@ -163,6 +163,67 @@ pub fn mismatch_floor_ok(kind: RefKind, fuzz: &FuzzBudget) -> bool { } } +/// Generate one `#[test] #[ignore]` per reftest pairing — keeps each case at +/// the unit/integration tier under the existing `cargo test -- --ignored` GPU +/// lane, no new CI infra, no manifest file (the type system IS the manifest). +/// +/// ```ignore +/// reftest!(match, flex_justify_end, flex_test, literal_offsets_ref); +/// reftest!(mismatch, cv_hidden_hides, cv_visible, cv_hidden); +/// reftest!(match, transform_xy, xfm_test, literal_ref, fuzz = (1, 8)); +/// ``` +/// +/// A non-`(0,0)` fuzz floor on a `mismatch` fails to COMPILE (a `const` +/// assertion), not at runtime — reftests.md § Verification #2. +#[macro_export] +macro_rules! reftest { + // mismatch with explicit fuzz → compile-time reject of a non-zero floor. + (mismatch, $fn:ident, $test:path, $reference:path, fuzz = ($d:literal, $p:literal)) => { + const _: () = assert!( + $d == 0 && $p == 0, + concat!( + "reftest mismatch `", + stringify!($fn), + "`: a non-(0,0) fuzz floor is vacuous" + ), + ); + $crate::reftest!(@gen mismatch, $fn, $test, $reference, ($d, $p)); + }; + // match with explicit fuzz. + (match, $fn:ident, $test:path, $reference:path, fuzz = ($d:literal, $p:literal)) => { + $crate::reftest!(@gen match, $fn, $test, $reference, ($d, $p)); + }; + // no explicit fuzz → (0,0) for either kind. 
+ ($kind:ident, $fn:ident, $test:path, $reference:path) => { + $crate::reftest!(@gen $kind, $fn, $test, $reference, (0, 0)); + }; + // internal: emit the #[ignore] test named $fn. + (@gen $kind:ident, $fn:ident, $test:path, $reference:path, ($d:literal, $p:literal)) => { + #[test] + #[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] + fn $fn() { + let case = $crate::reftest::RefCase { + name: stringify!($fn), + kind: $crate::reftest::RefKind::reftest_kind(stringify!($kind)), + test: $test, + reference: $reference, + fuzz: $crate::metric::FuzzBudget { + max_channel_delta: $d, + max_diff_pixels: $p, + }, + }; + let outcome = $crate::reftest::run_reftest(&case); + assert!( + outcome.passed, + "reftest {} failed: {:?} (report: {:?})", + stringify!($fn), + outcome.diff, + outcome.report_path + ); + } + }; +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/buiy_verify/tests/reftest_macro_gpu.rs b/crates/buiy_verify/tests/reftest_macro_gpu.rs new file mode 100644 index 0000000..c80bb15 --- /dev/null +++ b/crates/buiy_verify/tests/reftest_macro_gpu.rs @@ -0,0 +1,42 @@ +//! GPU lane: the `reftest!` macro generates an `#[ignore]` test per pairing. +//! Uses the same self-match scene as the engine test to prove the macro wires +//! through to a passing run. reftests.md § "The reftest! macro". 
+ +use bevy::prelude::*; +use buiy_core::components::Node; +use buiy_core::layout::{Inset, Length, Sizing, Style}; +use buiy_core::render::ColorToken; +use buiy_core::render::components::Background; +use std::borrow::Cow; + +fn solid_box(app: &mut App) { + { + let mut theme = app.world_mut().resource_mut::(); + theme + .colors + .insert("test.fill.a".into(), Color::srgb(0.90, 0.10, 0.10)); + } + let e = app + .world_mut() + .spawn(( + Node, + Style::default() + .absolute() + .inset(Inset { + top: Sizing::Length(Length::px(8.0)), + left: Sizing::Length(Length::px(8.0)), + ..default() + }) + .width_px(40.0) + .height_px(40.0), + Background { + color: ColorToken::Token(Cow::Borrowed("test.fill.a")), + }, + )) + .id(); + app.world_mut() + .spawn((Node, Style::default())) + .add_children(&[e]); +} + +buiy_verify::reftest!(match, macro_self_match, solid_box, solid_box); From 0acbd1756d215129c55f6d324f76ac199d67cf32 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 06:33:14 -0700 Subject: [PATCH 24/70] feat(verify): reference-independence structural lint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit assert_reference_independent builds the reference into a no-GPU App and rejects any forbidden marker (ContentVisibility/ContainerQuery/TopLayer/ Translate). Value-encoded features fall to human review (documented). The lint is itself RED/GREEN-tested. reftests.md §§ Reference independence, Verification #4. Two deviations forced by the live API (both keep the lint structural): - TopLayer is a FIELD on the Stacking component, not a component of its own, so the marker queries Stacking and checks top_layer != None — structurally equivalent to the Containment/content_visibility routing. 
- Style is a Bundle that already supplies Containment + Stacking; the self-test sets content_visibility via Style::containment() (spawning a second Containment alongside is a duplicate-component panic, not a lint trip), and the markers check the FIELD VALUE so a default-Visible Containment on a disjoint reference does not trip the lint. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/reftest.rs | 111 ++++++++++++++++++ .../buiy_verify/tests/reftest_independence.rs | 66 +++++++++++ 2 files changed, 177 insertions(+) create mode 100644 crates/buiy_verify/tests/reftest_independence.rs diff --git a/crates/buiy_verify/src/reftest.rs b/crates/buiy_verify/src/reftest.rs index 1747a1c..4f4f53e 100644 --- a/crates/buiy_verify/src/reftest.rs +++ b/crates/buiy_verify/src/reftest.rs @@ -163,6 +163,117 @@ pub fn mismatch_floor_ok(kind: RefKind, fuzz: &FuzzBudget) -> bool { } } +use bevy::prelude::World; + +/// A structural marker the independence lint can query for in a built world. +/// Each variant maps to a `buiy_core` component (or a distinguishing field on +/// one) whose *presence* proves a reference re-used the feature under test. +/// Value-encoded features (`justify-content`, `direction`, `gap` — fields on a +/// shared `Style`) have NO marker here and fall to human review (see +/// [`assert_reference_independent`]). +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum ComponentMarker { + /// A `Containment` whose `content_visibility` is `Hidden`. + ContentVisibilityHidden, + /// Any `ContainerQuery` component. + ContainerQuery, + /// A `Stacking` whose `top_layer` is non-`None` (top-layer participation). + /// `TopLayer` is a field on the `Stacking` component, not a component of its + /// own, so the lint queries `Stacking` and checks the field — structurally + /// equivalent to the `ContentVisibilityHidden`/`Containment` routing. + TopLayer, + /// Any `Translate` component. 
+ Translate, +} + +impl ComponentMarker { + /// True iff ANY entity in `world` carries this marker. + fn present_in(self, world: &mut World) -> bool { + use buiy_core::layout::{ + Containment, ContainerQuery, ContentVisibility, Stacking, TopLayer, Translate, + }; + match self { + ComponentMarker::ContentVisibilityHidden => world + .query::<&Containment>() + .iter(world) + .any(|c| c.content_visibility == ContentVisibility::Hidden), + ComponentMarker::ContainerQuery => { + world.query::<&ContainerQuery>().iter(world).next().is_some() + } + ComponentMarker::TopLayer => world + .query::<&Stacking>() + .iter(world) + .any(|s| s.top_layer != TopLayer::None), + ComponentMarker::Translate => { + world.query::<&Translate>().iter(world).next().is_some() + } + } + } +} + +/// What a reference scene is FORBIDDEN to contain, per feature under test. +pub struct IndependenceRule { + pub feature: &'static str, + pub forbidden_in_reference: &'static [ComponentMarker], +} + +/// The registered marker rules for marker-bearing features. Value-encoded +/// features (flex `justify-content`, `direction`, `gap`) are deliberately +/// ABSENT — component-presence cannot distinguish them, so they fall to the +/// PR-time review checklist. A pairing whose feature has no rule here fails the +/// lint until a rule (or documented waiver) is added — independence is +/// opt-out-impossible by construction for marker features. 
+pub fn default_rules() -> Vec { + vec![ + IndependenceRule { + feature: "content-visibility", + forbidden_in_reference: &[ComponentMarker::ContentVisibilityHidden], + }, + IndependenceRule { + feature: "@container", + forbidden_in_reference: &[ComponentMarker::ContainerQuery], + }, + IndependenceRule { + feature: "top-layer", + forbidden_in_reference: &[ComponentMarker::TopLayer], + }, + IndependenceRule { + feature: "translate", + forbidden_in_reference: &[ComponentMarker::Translate], + }, + ] +} + +/// Assert the case's `reference` scene carries NONE of the marker components a +/// rule forbids. Builds the reference into a headless **no-GPU** `App` (layout +/// types registered, no render plugins) and queries the built world. Panics +/// naming the feature + marker on violation. +/// +/// **Limit — value-encoded features fall to human review.** Features that are +/// field *values* on a shared `Style`/`Node` (`justify-content`, `direction`, +/// `gap`) have no distinct marker, so this lint cannot see them; mechanism 1 +/// (route the reference through the primitive literal-`Node` layer) keeps THOSE +/// independent, and the PR-time checklist enforces it. This backstops only +/// marker-bearing features. 
+pub fn assert_reference_independent(case: &RefCase, rules: &[IndependenceRule]) { + let mut app = bevy::app::App::new(); + app.add_plugins(buiy_core::layout::LayoutPlugin); + (case.reference)(&mut app); + let world = app.world_mut(); + for rule in rules { + for &marker in rule.forbidden_in_reference { + assert!( + !marker.present_in(world), + "reference for `{}` illegally contains {:?} — it re-uses the \ + feature under test, so the comparison would pass vacuously \ + (reftests.md § Reference independence)", + rule.feature, + marker + ); + } + } +} + /// Generate one `#[test] #[ignore]` per reftest pairing — keeps each case at /// the unit/integration tier under the existing `cargo test -- --ignored` GPU /// lane, no new CI infra, no manifest file (the type system IS the manifest). diff --git a/crates/buiy_verify/tests/reftest_independence.rs b/crates/buiy_verify/tests/reftest_independence.rs new file mode 100644 index 0000000..c32160b --- /dev/null +++ b/crates/buiy_verify/tests/reftest_independence.rs @@ -0,0 +1,66 @@ +//! Pure-CPU lint self-test (NOT #[ignore]): a reference that ILLEGALLY carries +//! the forbidden marker trips assert_reference_independent (RED); the canonical +//! disjoint reference passes (GREEN). reftests.md § Verification #4. The lint +//! is itself tested, not trusted. + +use bevy::prelude::*; +use buiy_core::components::Node; +use buiy_core::layout::{Containment, ContentVisibility, Style}; +use buiy_verify::metric::FuzzBudget; +use buiy_verify::reftest::{ + ComponentMarker, IndependenceRule, RefCase, RefKind, assert_reference_independent, + default_rules, +}; + +fn empty(_: &mut App) {} + +fn visible_box(app: &mut App) { + // A plain `Style` carries a default `Containment` (content_visibility: + // Visible) — the lint's check is on the FIELD VALUE (Hidden), so this + // legitimately-disjoint reference does not trip it. 
+ app.world_mut().spawn((Node, Style::default())); +} + +fn hidden_box(app: &mut App) { + // `Style` is a Bundle that already supplies `Containment`; set the field + // via the builder (spawning a second `Containment` alongside would be a + // duplicate-component panic, NOT a lint trip). + app.world_mut().spawn(( + Node, + Style::default().containment(Containment { + content_visibility: ContentVisibility::Hidden, + ..default() + }), + )); +} + +#[test] +fn legal_reference_passes_the_lint() { + let case = RefCase { + name: "cv_green", + kind: RefKind::Mismatch, + test: empty, + reference: visible_box, + fuzz: FuzzBudget::EXACT, + }; + assert_reference_independent(&case, &default_rules()); +} + +#[test] +#[should_panic(expected = "reference for `content-visibility` illegally contains")] +fn illegal_reference_trips_the_lint() { + let case = RefCase { + name: "cv_red", + kind: RefKind::Mismatch, + test: empty, + reference: hidden_box, + fuzz: FuzzBudget::EXACT, + }; + assert_reference_independent( + &case, + &[IndependenceRule { + feature: "content-visibility", + forbidden_in_reference: &[ComponentMarker::ContentVisibilityHidden], + }], + ); +} From 836b78cae0715c5f34c3a89934aaa88e14706a08 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 06:34:20 -0700 Subject: [PATCH 25/70] feat(verify): full-tile CPU SDF oracle (rasterize_sdf_rect) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Promotes the CPU SDF port from scalar probes to a full-tile rasterizer mirroring shader.wgsl:60/:76-:79 (sdf_rounded_rect + fwidth->smoothstep). Pinned to the render_instance.rs point-probes. reftests.md §§ CPU-vs-GPU cross-check, Verification #5. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/reftest.rs | 61 ++++++++++++++++++++++++++ crates/buiy_verify/tests/sdf_oracle.rs | 38 ++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 crates/buiy_verify/tests/sdf_oracle.rs diff --git a/crates/buiy_verify/src/reftest.rs b/crates/buiy_verify/src/reftest.rs index 4f4f53e..ae4bd8b 100644 --- a/crates/buiy_verify/src/reftest.rs +++ b/crates/buiy_verify/src/reftest.rs @@ -163,6 +163,67 @@ pub fn mismatch_floor_ok(kind: RefKind, fuzz: &FuzzBudget) -> bool { } } +/// Pure-CPU per-pixel evaluation of the WGSL SDF + AA coverage step, the +/// golden-free oracle for SDF corner AA (Tier 4.5). The SDF formula is shared +/// 1:1 with `shader.wgsl:60` / `:76-:79` — the port and the shader must stay +/// identical, pinned by the point-probe test that re-derives the values +/// `tests/render_instance.rs:12` already asserts. +pub mod sdf_oracle { + use bevy::math::Vec2; + use buiy_core::render::DrawData; + + /// 1:1 CPU port of `shader.wgsl::sdf_rounded_rect`. + pub fn sdf_rounded_rect(p: Vec2, half_size: Vec2, r: f32) -> f32 { + let q = p.abs() - half_size + Vec2::splat(r); + q.max(Vec2::ZERO).length() + q.x.max(q.y).min(0.0) - r + } + + /// Rasterize one `DrawData` rounded-rect into a `w×h` RGBA tile, mirroring + /// the fragment shader: SDF in logical px, AA via a `fwidth` estimate (the + /// per-pixel SDF gradient via central difference) fed to + /// `smoothstep(-aa, aa, d)`. 
+ pub fn rasterize_sdf_rect(draw: &DrawData, w: u32, h: u32) -> image::RgbaImage { + let half = draw.size * 0.5; + let center = draw.position + half; + let r = draw.radius; + let lin = bevy::color::LinearRgba::from(draw.color); + let srgba = bevy::color::Srgba::from(lin); + let (rr, gg, bb) = ( + (srgba.red * 255.0).round() as u8, + (srgba.green * 255.0).round() as u8, + (srgba.blue * 255.0).round() as u8, + ); + let base_a = srgba.alpha; + + let mut img = image::RgbaImage::new(w, h); + for y in 0..h { + for x in 0..w { + let p = Vec2::new(x as f32 + 0.5, y as f32 + 0.5) - center; + let d = sdf_rounded_rect(p, half, r); + let dx = (sdf_rounded_rect(p + Vec2::X, half, r) + - sdf_rounded_rect(p - Vec2::X, half, r)) + .abs() + * 0.5; + let dy = (sdf_rounded_rect(p + Vec2::Y, half, r) + - sdf_rounded_rect(p - Vec2::Y, half, r)) + .abs() + * 0.5; + let aa = (dx + dy).max(1e-4); + let coverage = 1.0 - smoothstep(-aa, aa, d); + let a = (base_a * coverage * 255.0).round().clamp(0.0, 255.0) as u8; + img.put_pixel(x, y, image::Rgba([rr, gg, bb, a])); + } + } + img + } + + /// `smoothstep` matching WGSL `smoothstep(edge0, edge1, x)`. + fn smoothstep(edge0: f32, edge1: f32, x: f32) -> f32 { + let t = ((x - edge0) / (edge1 - edge0)).clamp(0.0, 1.0); + t * t * (3.0 - 2.0 * t) + } +} + use bevy::prelude::World; /// A structural marker the independence lint can query for in a built world. diff --git a/crates/buiy_verify/tests/sdf_oracle.rs b/crates/buiy_verify/tests/sdf_oracle.rs new file mode 100644 index 0000000..148d431 --- /dev/null +++ b/crates/buiy_verify/tests/sdf_oracle.rs @@ -0,0 +1,38 @@ +//! Pure-CPU (NOT #[ignore]): the full-tile CPU SDF oracle must reproduce the +//! scalar `d` the existing render_instance.rs point-probes assert — center +//! inside (filled), 2× half-extent outside (empty). Pins the full-tile port to +//! the unit-tested shader formula. reftests.md § Verification #5. 
+ +use bevy::prelude::*; +use buiy_core::render::DrawData; +use buiy_verify::reftest::sdf_oracle::rasterize_sdf_rect; + +#[test] +fn oracle_fills_center_and_clears_far_outside() { + let inset = DrawData::new(Vec2::new(50.0, 25.0), Vec2::new(40.0, 20.0), Color::WHITE, 0.0); + let img = rasterize_sdf_rect(&inset, 200, 100); + assert_eq!(img.dimensions(), (200, 100)); + assert_eq!(img.get_pixel(5, 5).0[3], 0, "far outside the box is empty"); + assert_eq!( + img.get_pixel(70, 35).0[3], + 255, + "inside the inset box is filled" + ); +} + +#[test] +fn oracle_edge_band_is_partial_alpha() { + // The AA band must be neither fully 0 nor fully 255 for at least one pixel + // (proves the smoothstep coverage step is live) — the property the GPU + // shader's fwidth→smoothstep produces. + let draw = DrawData::new(Vec2::new(50.0, 25.0), Vec2::new(40.0, 20.0), Color::WHITE, 8.0); + let img = rasterize_sdf_rect(&draw, 200, 100); + let has_partial = img.pixels().any(|p| { + let a = p.0[3]; + a > 0 && a < 255 + }); + assert!( + has_partial, + "a rounded-rect edge must produce AA partial-alpha pixels" + ); +} From 20a25eb7f6385dce6d4c33d868f300f69442237c Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 06:41:20 -0700 Subject: [PATCH 26/70] feat(verify): CPU-vs-GPU SDF cross-check (run_sdf_cross_check) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Renders one rounded-rect on the GPU and via the CPU oracle, diffs within a measured AA fuzz budget. Zero stored bytes; kept permanently (one shared analytic SDF). reftests.md § CPU-vs-GPU SDF cross-check. Two corrections forced by root-causing a 60%-of-frame divergence to green: - The corner radius for a Background fill is carried on Border.radius (Corners::all(Radius::circular(..))) — the component draw_for_node reads (render/mod.rs:373) — NOT a bare Radius component (which the fill path ignores). spawn_single_primitive now uses a zero-width Border. 
- The CPU oracle must match the full CAPTURE chain, not just the fragment shader: the capture camera clears to OPAQUE BLACK and the pipeline blends linear-space SrcOver into an Rgba8UnormSrgb target. The oracle now composites coverage over opaque black in linear space then sRGB-encodes, so interior + exterior agree and only the ~1px AA rim differs (measured 87/24000 px on RX 6700 XT; budget bounds it at 200). The 1b.10 oracle point-probe test moves to the same capture-matched convention (filled = opaque white, empty = opaque black) — same geometry, composited. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/reftest.rs | 137 ++++++++++++++++-- .../buiy_verify/tests/sdf_cross_check_gpu.rs | 44 ++++++ crates/buiy_verify/tests/sdf_oracle.rs | 38 +++-- 3 files changed, 193 insertions(+), 26 deletions(-) create mode 100644 crates/buiy_verify/tests/sdf_cross_check_gpu.rs diff --git a/crates/buiy_verify/src/reftest.rs b/crates/buiy_verify/src/reftest.rs index ae4bd8b..8b6c5e3 100644 --- a/crates/buiy_verify/src/reftest.rs +++ b/crates/buiy_verify/src/reftest.rs @@ -152,6 +152,84 @@ fn emit_report( report } +/// Render the same single primitive on the GPU (one-instance capture) and on +/// the CPU oracle, diff with the AA-aware metric. Tolerates sub-pixel AA noise +/// via `fuzz`; zero stored bytes. Catches SDF AA / implementation drift no +/// markup reftest can, and is kept PERMANENTLY (one shared analytic +/// `sdf_rounded_rect`). A *spec* error in `sdf_rounded_rect` is invisible here +/// (both paths share it) — that is Tier 5's job. 
+pub fn run_sdf_cross_check(draw: &buiy_core::render::DrawData, fuzz: &FuzzBudget) -> RefOutcome { + let (w, h) = REFTEST_LOGICAL; + let cfg = GoldenConfig::deterministic(); + + let mut app = crate::support::reftest_app(w, h); + crate::support::clear_reftest_scene(&mut app); + spawn_single_primitive(&mut app, draw); + let gpu = capture_to_image(&mut app, &cfg); + + let cpu = sdf_oracle::rasterize_sdf_rect(draw, w, h); + + let diff = compare(&gpu, &cpu, &CompareOpts::reftest_default()); + let passed = diff.passes(fuzz); + let report_path = if passed { + None + } else { + Some(emit_report("sdf_cross_check", &gpu, &cpu, &diff)) + }; + RefOutcome { + passed, + diff, + report_path, + } +} + +/// Spawn one rounded-rect under a root, mapping `DrawData`'s position/size/ +/// radius to the layout + render components the extract path turns back into one +/// `DrawData`. The corner radius is carried on `Border.radius` +/// (`Corners::all(Radius::circular(..))`) — that is the component +/// `draw_for_node` reads for the quad radius (`render/mod.rs:373`); a bare +/// `Radius` component is NOT consumed by the fill path. The `Border` band is +/// zero-width (width lives in `BoxModel`), so only the rounded fill paints. +fn spawn_single_primitive(app: &mut bevy::app::App, draw: &buiy_core::render::DrawData) { + use bevy::prelude::*; + use buiy_core::components::Node; + use buiy_core::layout::{Inset, Length, Sizing, Style}; + use buiy_core::render::ColorToken; + use buiy_core::render::components::{Background, Border, Corners, Radius}; + use std::borrow::Cow; + // The capture path resolves a token; install draw.color under a fixed key. 
+ let key = "sdf.cross.fill"; + { + let mut theme = app.world_mut().resource_mut::(); + theme.colors.insert(key.into(), draw.color); + } + let e = app + .world_mut() + .spawn(( + Node, + Style::default() + .absolute() + .inset(Inset { + left: Sizing::Length(Length::px(draw.position.x)), + top: Sizing::Length(Length::px(draw.position.y)), + ..default() + }) + .width_px(draw.size.x) + .height_px(draw.size.y), + Background { + color: ColorToken::Token(Cow::Borrowed(key)), + }, + Border { + radius: Corners::all(Radius::circular(draw.radius)), + ..default() + }, + )) + .id(); + app.world_mut() + .spawn((Node, Style::default())) + .add_children(&[e]); +} + /// A `Mismatch` budget that tolerates difference is meaningless — its floor /// must be `(0,0)`. `Match` may carry any widening. Pure CPU so it gates /// headless (reftests.md § Verification #2); the `reftest!` macro enforces the @@ -178,22 +256,34 @@ pub mod sdf_oracle { q.max(Vec2::ZERO).length() + q.x.max(q.y).min(0.0) - r } - /// Rasterize one `DrawData` rounded-rect into a `w×h` RGBA tile, mirroring - /// the fragment shader: SDF in logical px, AA via a `fwidth` estimate (the - /// per-pixel SDF gradient via central difference) fed to - /// `smoothstep(-aa, aa, d)`. + /// Rasterize one `DrawData` rounded-rect into a `w×h` RGBA tile that matches + /// the **capture output**, not just the fragment shader. It mirrors the full + /// GPU chain so the cross-check compares like-for-like (`run_sdf_cross_check` + /// captures the GPU box over the capture camera's opaque-black clear): + /// + /// 1. **SDF + AA** — the shared `sdf_rounded_rect` in logical px, AA via a + /// `fwidth` estimate (the per-pixel SDF gradient by central difference) + /// fed to `smoothstep(-aa, aa, d)` → straight-alpha `coverage` + /// (`shader.wgsl:60`/`:76-:79`). + /// 2. 
**Linear-space SrcOver over opaque black** — the pipeline blends + /// `ALPHA_BLENDING` (SrcOver) in LINEAR space into the `Rgba8UnormSrgb` + /// target, and the capture camera clears to **opaque black**. So the + /// composite is `out_linear = src_linear · coverage` (the black backdrop + /// contributes nothing) with the result fully opaque (alpha 255) — the + /// same alpha the GPU readback carries everywhere, including OUTSIDE the + /// box (where coverage 0 → opaque black). Comparing a transparent CPU + /// backdrop against the GPU's opaque-black clear is exactly the + /// every-pixel alpha-255-vs-0 mismatch this composite removes. + /// 3. **sRGB encode** — the target is `Rgba8UnormSrgb`, so the linear result + /// is sRGB-encoded on write (matched here via `Srgba::from(LinearRgba)`). pub fn rasterize_sdf_rect(draw: &DrawData, w: u32, h: u32) -> image::RgbaImage { let half = draw.size * 0.5; let center = draw.position + half; let r = draw.radius; - let lin = bevy::color::LinearRgba::from(draw.color); - let srgba = bevy::color::Srgba::from(lin); - let (rr, gg, bb) = ( - (srgba.red * 255.0).round() as u8, - (srgba.green * 255.0).round() as u8, - (srgba.blue * 255.0).round() as u8, - ); - let base_a = srgba.alpha; + // Source color in LINEAR space (the space the GPU blends in), with its + // own straight alpha folded into the coverage below. + let src_lin = bevy::color::LinearRgba::from(draw.color); + let src_a = src_lin.alpha; let mut img = image::RgbaImage::new(w, h); for y in 0..h { @@ -210,8 +300,27 @@ pub mod sdf_oracle { * 0.5; let aa = (dx + dy).max(1e-4); let coverage = 1.0 - smoothstep(-aa, aa, d); - let a = (base_a * coverage * 255.0).round().clamp(0.0, 255.0) as u8; - img.put_pixel(x, y, image::Rgba([rr, gg, bb, a])); + // SrcOver over opaque black in LINEAR space: the black backdrop + // (0,0,0,1) contributes nothing to RGB, and the result is opaque. 
+ let a_src = (src_a * coverage).clamp(0.0, 1.0); + let out_lin = bevy::color::LinearRgba::new( + src_lin.red * a_src, + src_lin.green * a_src, + src_lin.blue * a_src, + 1.0, + ); + // sRGB-encode on write (Rgba8UnormSrgb target). + let out = bevy::color::Srgba::from(out_lin); + img.put_pixel( + x, + y, + image::Rgba([ + (out.red * 255.0).round().clamp(0.0, 255.0) as u8, + (out.green * 255.0).round().clamp(0.0, 255.0) as u8, + (out.blue * 255.0).round().clamp(0.0, 255.0) as u8, + 255, + ]), + ); } } img diff --git a/crates/buiy_verify/tests/sdf_cross_check_gpu.rs b/crates/buiy_verify/tests/sdf_cross_check_gpu.rs new file mode 100644 index 0000000..21c36c6 --- /dev/null +++ b/crates/buiy_verify/tests/sdf_cross_check_gpu.rs @@ -0,0 +1,44 @@ +//! GPU lane (`--ignored`): the GPU rounded-rect render and the CPU SDF oracle +//! must agree within a documented AA fuzz budget — the golden-free oracle for +//! SDF corner AA (Tier 4.5). A wrong half-extent / radius-clamp / premultiply +//! in the shader would diverge here. reftests.md § CPU-vs-GPU SDF cross-check. + +use bevy::prelude::*; +use buiy_core::render::DrawData; +use buiy_verify::metric::FuzzBudget; +use buiy_verify::reftest::run_sdf_cross_check; + +#[test] +#[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] +fn gpu_rounded_rect_matches_cpu_oracle() { + let draw = DrawData::new( + Vec2::new(40.0, 20.0), + Vec2::new(120.0, 80.0), + Color::WHITE, + 16.0, + ); + // AA band tolerance: a sub-pixel rim differs between the GPU `fwidth` + // screen-space derivative and the CPU central-difference — the documented + // AA residue (Tier 4.5), NOT a regression. The CPU oracle matches the full + // GPU capture chain (linear-space SrcOver over the opaque-black clear, sRGB + // encode), so the box INTERIOR and the EXTERIOR opaque-black background + // agree exactly; only the ~1px rounded-rect rim disagrees. 
+ // + // Measured on the RX 6700 XT (RADV) at (0,0): differing_pixels = 87 of + // 24000 (the non-AA-excluded rim of a ~400px-perimeter rounded rect), + // mssim = 0.927. `max_channel_delta` is pinned at the 255 ceiling because a + // single hard-edge rim pixel can flip fully on/off between the two AA + // estimators (a true L∞ on one pixel); the meaningful axis is the pixel + // COUNT, bounded here at 200 (87 measured + driver-variance headroom, well + // below the 24000 total — a real bound, not a rubber stamp). + let fuzz = FuzzBudget { + max_channel_delta: 255, + max_diff_pixels: 200, + }; + let outcome = run_sdf_cross_check(&draw, &fuzz); + assert!( + outcome.passed, + "GPU vs CPU-SDF oracle diverged: {:?} (report: {:?})", + outcome.diff, outcome.report_path + ); +} diff --git a/crates/buiy_verify/tests/sdf_oracle.rs b/crates/buiy_verify/tests/sdf_oracle.rs index 148d431..18043d7 100644 --- a/crates/buiy_verify/tests/sdf_oracle.rs +++ b/crates/buiy_verify/tests/sdf_oracle.rs @@ -1,7 +1,14 @@ //! Pure-CPU (NOT #[ignore]): the full-tile CPU SDF oracle must reproduce the -//! scalar `d` the existing render_instance.rs point-probes assert — center +//! geometry the existing render_instance.rs point-probes assert — center //! inside (filled), 2× half-extent outside (empty). Pins the full-tile port to //! the unit-tested shader formula. reftests.md § Verification #5. +//! +//! The oracle output is **capture-matched** (`rasterize_sdf_rect` composites +//! the box over the capture camera's opaque-black clear in linear space, then +//! sRGB-encodes — so the CPU-vs-GPU cross-check compares like-for-like). Thus +//! "filled" is opaque WHITE `[255,255,255,255]` and "empty" is opaque BLACK +//! `[0,0,0,255]` (NOT transparent) — the same geometric center-inside / +//! far-outside probe, in the composited convention. 
use bevy::prelude::*; use buiy_core::render::DrawData; @@ -12,27 +19,34 @@ fn oracle_fills_center_and_clears_far_outside() { let inset = DrawData::new(Vec2::new(50.0, 25.0), Vec2::new(40.0, 20.0), Color::WHITE, 0.0); let img = rasterize_sdf_rect(&inset, 200, 100); assert_eq!(img.dimensions(), (200, 100)); - assert_eq!(img.get_pixel(5, 5).0[3], 0, "far outside the box is empty"); + // Far outside the box → the opaque-black clear (composited convention). assert_eq!( - img.get_pixel(70, 35).0[3], - 255, - "inside the inset box is filled" + img.get_pixel(5, 5).0, + [0, 0, 0, 255], + "far outside the box is the opaque-black background" + ); + // Deep interior → opaque white (full coverage of the white fill). + assert_eq!( + img.get_pixel(70, 35).0, + [255, 255, 255, 255], + "inside the inset box is the filled white" ); } #[test] -fn oracle_edge_band_is_partial_alpha() { - // The AA band must be neither fully 0 nor fully 255 for at least one pixel - // (proves the smoothstep coverage step is live) — the property the GPU - // shader's fwidth→smoothstep produces. +fn oracle_edge_band_is_partial_coverage() { + // The AA band must be a partial gray (between the opaque-black background + // and the opaque-white fill) for at least one pixel — proves the smoothstep + // coverage step is live, the property the GPU shader's fwidth→smoothstep + // produces. (Output is opaque, so AA shows in the RGB channels, not alpha.) 
let draw = DrawData::new(Vec2::new(50.0, 25.0), Vec2::new(40.0, 20.0), Color::WHITE, 8.0); let img = rasterize_sdf_rect(&draw, 200, 100); let has_partial = img.pixels().any(|p| { - let a = p.0[3]; - a > 0 && a < 255 + let lum = p.0[0]; + lum > 0 && lum < 255 && p.0[3] == 255 }); assert!( has_partial, - "a rounded-rect edge must produce AA partial-alpha pixels" + "a rounded-rect edge must produce AA partial-coverage (gray) pixels" ); } From eebf1e66ccae8afb8272058d112705fcf95feab7 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 06:44:41 -0700 Subject: [PATCH 27/70] feat(verify): two real Tier-4 reftest cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit flex justify-content: SpaceBetween == three literal-offset boxes (reference routes through the primitive/literal-Node layer, NOT flex — independence by construction); content-visibility: hidden != the visible subtree (the != anti-test). The cv reference's independence is asserted pure-CPU. Both pass on the RX 6700 XT at the default (0,0) fuzz. reftests.md § Authoring patterns. Adaptations to the live API: - content-visibility set via Style::containment() (Style is a Bundle that already supplies Containment — a second one is a duplicate-component panic). - the independence lint builds the reference under ThemePlugin + LayoutPlugin (no GPU) so theme-token-installing scenes build; the lint still reads only component DATA, no render systems run. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/reftest.rs | 19 ++- crates/buiy_verify/tests/metric.rs | 5 +- crates/buiy_verify/tests/reftest_cases_gpu.rs | 158 ++++++++++++++++++ .../buiy_verify/tests/reftest_engine_gpu.rs | 4 +- crates/buiy_verify/tests/sdf_oracle.rs | 14 +- 5 files changed, 187 insertions(+), 13 deletions(-) create mode 100644 crates/buiy_verify/tests/reftest_cases_gpu.rs diff --git a/crates/buiy_verify/src/reftest.rs b/crates/buiy_verify/src/reftest.rs index 8b6c5e3..6ace4f9 100644 --- a/crates/buiy_verify/src/reftest.rs +++ b/crates/buiy_verify/src/reftest.rs @@ -360,23 +360,23 @@ impl ComponentMarker { /// True iff ANY entity in `world` carries this marker. fn present_in(self, world: &mut World) -> bool { use buiy_core::layout::{ - Containment, ContainerQuery, ContentVisibility, Stacking, TopLayer, Translate, + ContainerQuery, Containment, ContentVisibility, Stacking, TopLayer, Translate, }; match self { ComponentMarker::ContentVisibilityHidden => world .query::<&Containment>() .iter(world) .any(|c| c.content_visibility == ContentVisibility::Hidden), - ComponentMarker::ContainerQuery => { - world.query::<&ContainerQuery>().iter(world).next().is_some() - } + ComponentMarker::ContainerQuery => world + .query::<&ContainerQuery>() + .iter(world) + .next() + .is_some(), ComponentMarker::TopLayer => world .query::<&Stacking>() .iter(world) .any(|s| s.top_layer != TopLayer::None), - ComponentMarker::Translate => { - world.query::<&Translate>().iter(world).next().is_some() - } + ComponentMarker::Translate => world.query::<&Translate>().iter(world).next().is_some(), } } } @@ -427,6 +427,11 @@ pub fn default_rules() -> Vec { /// marker-bearing features. pub fn assert_reference_independent(case: &RefCase, rules: &[IndependenceRule]) { let mut app = bevy::app::App::new(); + // `ThemePlugin` + `LayoutPlugin` — no render/asset plugins, no GPU. 
Theme is + // present because real reference scenes install fill tokens + // (`Theme.colors.insert`) while building; the lint only needs the components + // to exist as DATA, not the render systems to run. + app.add_plugins(buiy_core::theme::ThemePlugin); app.add_plugins(buiy_core::layout::LayoutPlugin); (case.reference)(&mut app); let world = app.world_mut(); diff --git a/crates/buiy_verify/tests/metric.rs b/crates/buiy_verify/tests/metric.rs index 593f85d..12d2426 100644 --- a/crates/buiy_verify/tests/metric.rs +++ b/crates/buiy_verify/tests/metric.rs @@ -92,6 +92,9 @@ fn reftest_default_excludes_aa_and_skips_diff_image() { let opts = buiy_verify::metric::CompareOpts::reftest_default(); assert!(!opts.include_aa, "reftest excludes AA-sibling pixels"); assert!(opts.mssim, "MSSIM stays computed (advisory)"); - assert!(!opts.emit_diff_image, "hot reftest path allocates no diff image"); + assert!( + !opts.emit_diff_image, + "hot reftest path allocates no diff image" + ); assert_eq!(opts.threshold, 0.1, "pixelmatch default sensitivity"); } diff --git a/crates/buiy_verify/tests/reftest_cases_gpu.rs b/crates/buiy_verify/tests/reftest_cases_gpu.rs new file mode 100644 index 0000000..ae55292 --- /dev/null +++ b/crates/buiy_verify/tests/reftest_cases_gpu.rs @@ -0,0 +1,158 @@ +//! GPU lane (`--ignored`): two real Tier-4 reftest pairings +//! (reftests.md § Authoring patterns). +//! +//! 1. flex `justify-content: SpaceBetween` == three literal-offset boxes +//! (reference routes through the literal-Node layer — NOT flex). `match`. +//! 2. `content-visibility: hidden` != the identical VISIBLE subtree — the +//! `!=` anti-test proving the feature suppresses paint. `mismatch`. 
+ +use bevy::prelude::*; +use buiy_core::components::Node; +use buiy_core::layout::{ + Containment, ContentVisibility, Inset, JustifyContent, Length, Sizing, Style, +}; +use buiy_core::render::ColorToken; +use buiy_core::render::components::Background; +use std::borrow::Cow; + +/// Install the shared fill token both halves of every pairing reference. +fn install_fill(app: &mut App) { + let mut theme = app.world_mut().resource_mut::(); + theme + .colors + .insert("test.fill.a".into(), Color::srgb(0.90, 0.10, 0.10)); +} + +/// A block-flow `width × 40` fill box (a flex child). +fn fill_box(width: f32) -> impl Bundle { + ( + Node, + Style::default().width_px(width).height_px(40.0), + Background { + color: ColorToken::Token(Cow::Borrowed("test.fill.a")), + }, + ) +} + +/// An absolutely-positioned 40×40 fill box at literal `(left, 0)` — the +/// primitive / literal-offset layer that bypasses the flex solver entirely. +fn abs_box(app: &mut App, left: f32) -> Entity { + app.world_mut() + .spawn(( + Node, + Style::default() + .absolute() + .inset(Inset { + left: Sizing::Length(Length::px(left)), + top: Sizing::Length(Length::px(0.0)), + ..default() + }) + .width_px(40.0) + .height_px(40.0), + Background { + color: ColorToken::Token(Cow::Borrowed("test.fill.a")), + }, + )) + .id() +} + +// ---- Case 1: flex justify-content: SpaceBetween == three literal offsets ---- + +fn flex_justify(app: &mut App) { + install_fill(app); + let a = app.world_mut().spawn(fill_box(40.0)).id(); + let b = app.world_mut().spawn(fill_box(40.0)).id(); + let c = app.world_mut().spawn(fill_box(40.0)).id(); + // Three 40px boxes in a 200px row, SpaceBetween → x = 0, 80, 160. 
+ app.world_mut() + .spawn(( + Node, + Style::default() + .flex_row() + .justify_content(JustifyContent::SpaceBetween) + .width_px(200.0) + .height_px(40.0), + )) + .add_children(&[a, b, c]); +} + +fn literal_offsets(app: &mut App) { + install_fill(app); + // The disjoint oracle: three boxes at the SpaceBetween-resolved literal + // coordinates via the absolute / literal-Node layer — no flex solver, so a + // flex-justify bug cannot be shared by this reference. + let a = abs_box(app, 0.0); + let b = abs_box(app, 80.0); + let c = abs_box(app, 160.0); + app.world_mut() + .spawn((Node, Style::default())) + .add_children(&[a, b, c]); +} + +// ---- Case 2: content-visibility: hidden != the visible subtree ---- + +fn subtree(app: &mut App, hidden: bool) { + install_fill(app); + let child = app.world_mut().spawn(fill_box(80.0)).id(); + // `Style` is a Bundle that already supplies `Containment`; set the + // content-visibility via the builder (a second `Containment` alongside + // would be a duplicate-component panic). 
+ let containment = if hidden { + Containment { + content_visibility: ContentVisibility::Hidden, + ..default() + } + } else { + Containment::default() + }; + let p = app + .world_mut() + .spawn(( + Node, + Style::default() + .absolute() + .inset(Inset { + left: Sizing::Length(Length::px(20.0)), + top: Sizing::Length(Length::px(20.0)), + ..default() + }) + .width_px(80.0) + .height_px(40.0) + .containment(containment), + )) + .id(); + app.world_mut().entity_mut(p).add_children(&[child]); + app.world_mut() + .spawn((Node, Style::default())) + .add_children(&[p]); +} + +fn cv_visible(app: &mut App) { + subtree(app, false); +} +fn cv_hidden(app: &mut App) { + subtree(app, true); +} + +buiy_verify::reftest!( + match, + flex_justify_eq_literal, + flex_justify, + literal_offsets +); +buiy_verify::reftest!(mismatch, cv_hidden_actually_hides, cv_visible, cv_hidden); + +#[test] +fn cv_hidden_reference_is_independent() { + use buiy_verify::metric::FuzzBudget; + use buiy_verify::reftest::{RefCase, RefKind, assert_reference_independent, default_rules}; + // The REFERENCE in case 2 is `cv_visible`; it must carry NO Hidden marker. + let case = RefCase { + name: "cv_hidden_actually_hides", + kind: RefKind::Mismatch, + test: cv_hidden, + reference: cv_visible, + fuzz: FuzzBudget::EXACT, + }; + assert_reference_independent(&case, &default_rules()); +} diff --git a/crates/buiy_verify/tests/reftest_engine_gpu.rs b/crates/buiy_verify/tests/reftest_engine_gpu.rs index 0b617ae..935a846 100644 --- a/crates/buiy_verify/tests/reftest_engine_gpu.rs +++ b/crates/buiy_verify/tests/reftest_engine_gpu.rs @@ -16,9 +16,7 @@ use std::borrow::Cow; /// scene is self-contained across the two captures `run_reftest` drives. 
fn box_at(app: &mut App, left: f32, token: &'static str) { { - let mut theme = app - .world_mut() - .resource_mut::(); + let mut theme = app.world_mut().resource_mut::(); theme .colors .insert(token.into(), Color::srgb(0.90, 0.10, 0.10)); diff --git a/crates/buiy_verify/tests/sdf_oracle.rs b/crates/buiy_verify/tests/sdf_oracle.rs index 18043d7..a7b4c54 100644 --- a/crates/buiy_verify/tests/sdf_oracle.rs +++ b/crates/buiy_verify/tests/sdf_oracle.rs @@ -16,7 +16,12 @@ use buiy_verify::reftest::sdf_oracle::rasterize_sdf_rect; #[test] fn oracle_fills_center_and_clears_far_outside() { - let inset = DrawData::new(Vec2::new(50.0, 25.0), Vec2::new(40.0, 20.0), Color::WHITE, 0.0); + let inset = DrawData::new( + Vec2::new(50.0, 25.0), + Vec2::new(40.0, 20.0), + Color::WHITE, + 0.0, + ); let img = rasterize_sdf_rect(&inset, 200, 100); assert_eq!(img.dimensions(), (200, 100)); // Far outside the box → the opaque-black clear (composited convention). @@ -39,7 +44,12 @@ fn oracle_edge_band_is_partial_coverage() { // and the opaque-white fill) for at least one pixel — proves the smoothstep // coverage step is live, the property the GPU shader's fwidth→smoothstep // produces. (Output is opaque, so AA shows in the RGB channels, not alpha.) - let draw = DrawData::new(Vec2::new(50.0, 25.0), Vec2::new(40.0, 20.0), Color::WHITE, 8.0); + let draw = DrawData::new( + Vec2::new(50.0, 25.0), + Vec2::new(40.0, 20.0), + Color::WHITE, + 8.0, + ); let img = rasterize_sdf_rect(&draw, 200, 100); let has_partial = img.pixels().any(|p| { let lum = p.0[0]; From 45b1ddb9d1638b5228f644b4ef17b326ffdc7c7a Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 07:04:42 -0700 Subject: [PATCH 28/70] =?UTF-8?q?feat(verify):=20snapshot=20module=20?= =?UTF-8?q?=E2=80=94=20shared=20dump=20primitives=20(round=20+=20versions)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tier 1-2 structured-snapshot module (snapshots.md). 
Task 2.1 lands the shared dump primitives both tiers consume: - `round(f32) -> String` — round to ROUND_DP=2 decimals, strip trailing zeros + bare trailing dot, normalize -0 to "0". Kills last-ULP churn from the Taffy / clip-space math while staying diff-readable. - `LAYOUT_DUMP_VERSION` / `DISPLAY_LIST_DUMP_VERSION` — format-version headers so a formatter change is one conscious, visible diff line. The `#[track_caller]` insta bridge (`assert_named_snapshot`) writes each `.snap` beside the CALLING test file via `Location::caller()` + `prepend_module_to_snapshot(false)`, so the dump helpers can live in buiy_verify while their `.snap`s live next to the buiy_core tests that call them. `bytemuck.workspace = true` added to buiy_verify (already a workspace dep used by buiy_core for the PackedInstance POD layout; the Tier-2 hex check needs bytes_of / pod_read_unaligned). No new supply-chain crate, no new cargo-deny surface. Deviation: snapshots.md § Verification #2's `round(1.005) == "1.0"` vector is self-inconsistent with `round(50.0) == "50"` (1.005_f32 is 1.00499…, formats to "1.00" — same .00 suffix as 50.0's "50.00", so one trailing-zero rule cannot strip one to "1.0" and the other to "50"). The self-consistent rule strips all trailing zeros; `round(1.005) == "1"` preserves the vector's intent (1.005 rounds DOWN to 1.00, never up). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/Cargo.toml | 5 + crates/buiy_verify/src/lib.rs | 1 + crates/buiy_verify/src/snapshot.rs | 475 ++++++++++++++++++++++ crates/buiy_verify/tests/snapshot_dump.rs | 38 ++ 4 files changed, 519 insertions(+) create mode 100644 crates/buiy_verify/src/snapshot.rs create mode 100644 crates/buiy_verify/tests/snapshot_dump.rs diff --git a/crates/buiy_verify/Cargo.toml b/crates/buiy_verify/Cargo.toml index dc0aa0a..137b9ab 100644 --- a/crates/buiy_verify/Cargo.toml +++ b/crates/buiy_verify/Cargo.toml @@ -11,6 +11,11 @@ serde.workspace = true serde_json.workspace = true image.workspace = true proptest.workspace = true +# Already a workspace dep (used by buiy_core for the `PackedInstance` POD layout). +# The Tier-2 byte-exact hex check (snapshots.md § byte-exact) needs +# `bytemuck::bytes_of` / `pod_read_unaligned` over the same `PackedInstance` — +# no NEW supply-chain crate, no new `cargo deny` surface. +bytemuck.workspace = true # Advisory MSSIM channel (metric.md § "Advisory MSSIM"): catches global # gamma/blend drift a small pixel budget under-weights. NEVER the primary # gate — surfaced as `Diff::mssim: Option`. The `cargo deny check` below diff --git a/crates/buiy_verify/src/lib.rs b/crates/buiy_verify/src/lib.rs index 266f018..cb65954 100644 --- a/crates/buiy_verify/src/lib.rs +++ b/crates/buiy_verify/src/lib.rs @@ -8,4 +8,5 @@ pub mod a11y; pub mod contrast; pub mod metric; pub mod reftest; +pub mod snapshot; pub mod support; diff --git a/crates/buiy_verify/src/snapshot.rs b/crates/buiy_verify/src/snapshot.rs new file mode 100644 index 0000000..f6916cd --- /dev/null +++ b/crates/buiy_verify/src/snapshot.rs @@ -0,0 +1,475 @@ +//! Tiers 1–2 — structured snapshots (snapshots.md). +//! +//! The two cheapest, most deterministic rungs of the verification pyramid: +//! +//! - **Tier 1** ([`assert_layout_snapshot`]) snapshots every entity's resolved +//! 
box (`ResolvedLayout.position`/`.size`) as a stable, `Name`-keyed Display +//! dump (gate #5). +//! - **Tier 2** ([`assert_display_list_snapshot`]) snapshots the whole CPU +//! display-list handoff holistically: the [`ExtractedNodes`] paint order plus +//! the packed [`InstanceBuckets`] draw order, in one Display dump — plus a +//! byte-exact [`assert_instance_hex_snapshot`] on the [`PackedInstance`] +//! px→logical packing. +//! +//! Both emit a **purpose-built Display dump**, never raw `Debug`/serde, so the +//! artifact is decoupled from private field names and `Entity` allocation bits +//! (which vary with spawn order). Entities render by [`Name`]; floats round via +//! the shared [`round`]; each dump carries a format-version header so a format +//! change is a single visible line (snapshots.md § "Why a Display dump"). +//! +//! Pure-CPU, headless, sub-millisecond, 100% deterministic: no GPU, no window. +//! The `assert_*` helpers are `#[track_caller]` so insta writes each `.snap` +//! beside the *calling* test file (`crates/<crate>/tests/snapshots/`), even +//! though the helper bodies live here in `buiy_verify`. + +use std::collections::HashMap; +use std::fmt::Write as _; + +use bevy::prelude::*; + +use buiy_core::components::ResolvedLayout; +use buiy_core::render::buckets::pack_view; +use buiy_core::render::extract::{ExtractedNode, ExtractedNodes}; +use buiy_core::render::instance::PackedInstance; + +// --------------------------------------------------------------------------- +// Shared dump primitives (Task 2.1) — used by both Tier 1 and Tier 2. +// --------------------------------------------------------------------------- + +/// Decimal places floats are rounded to in every dump. Two decimals kills the +/// last-ULP churn from the Taffy / clip-space math while staying diff-readable +/// (snapshots.md § Tier 1). +pub const ROUND_DP: usize = 2; + +/// Format-version header for the Tier-1 layout dump.
A formatter change bumps +/// the `vN` and re-blesses every layout `.snap` as one conscious, visible diff +/// (snapshots.md § Verification #4). +pub const LAYOUT_DUMP_VERSION: &str = "# buiy-layout-dump v1"; + +/// Format-version header for the Tier-2 display-list dump. See +/// [`LAYOUT_DUMP_VERSION`]. +pub const DISPLAY_LIST_DUMP_VERSION: &str = "# buiy-display-list-dump v1"; + +/// Round a float to [`ROUND_DP`] decimals and render it diff-stably: trailing +/// zeros and a bare trailing `.` are stripped (`50.0 → "50"`), and `-0.0` +/// normalizes to `"0"` so a sub-ULP negative never prints a spurious `-0`. The +/// shared rounding helper for Tier 1 + Tier 2 (snapshots.md § Tier 1, § +/// Verification #2). +pub fn round(v: f32) -> String { + // Round to ROUND_DP decimals. `{:.*}` does round-half-away; the result is + // a fixed-decimal string we then trim. + let mut s = format!("{v:.*}", ROUND_DP); + // Normalize "-0", "-0.00", etc. to a single "0" before trimming so the + // sign never leaks for a value that rounded to zero. + if s.starts_with('-') && s[1..].chars().all(|c| c == '0' || c == '.') { + s = s[1..].to_string(); + } + // Strip trailing zeros, then a trailing dot, only when a dot is present. + if s.contains('.') { + let trimmed = s.trim_end_matches('0').trim_end_matches('.'); + s = trimmed.to_string(); + } + s +} + +// --------------------------------------------------------------------------- +// `#[track_caller]` insta bridge — write `.snap` beside the CALLING test file. +// --------------------------------------------------------------------------- + +/// Assert `value` against the named text snapshot, writing the `.snap` beside +/// the **caller's** source file (`<caller-dir>/snapshots/<name>.snap`) rather +/// than beside this `buiy_verify` module. This is the seam that lets the dump +/// helpers live in `buiy_verify` while their `.snap`s live next to the +/// `buiy_core` tests that call them.
+/// +/// Mechanics: insta keys a `.snap` off the *macro call site* (`file!()`, +/// `module_path!()`). Because the helper is a plain `fn`, the macro would key +/// off `buiy_verify`'s source and collide every caller's snapshot. We instead +/// call `insta::_macro_support::assert_snapshot` directly with the caller's +/// `Location` (via `#[track_caller]`), an empty `module_path`, and +/// `prepend_module_to_snapshot(false)`, so the file is exactly +/// `<caller-dir>/snapshots/<name>.snap`. The workspace root is resolved by +/// insta from `CARGO_MANIFEST_DIR` (same workspace ⇒ same root). +#[track_caller] +fn assert_named_snapshot(name: &str, value: String) { + let loc = std::panic::Location::caller(); + // insta joins `workspace_root / dirname(assertion_file) / snapshot_path / + // <name>.snap`. `Location::file()` is workspace-relative, matching what + // `file!()` yields at the call site. + let workspace = insta::_macro_support::get_cargo_workspace( + insta::_macro_support::Workspace::DetectWithCargo(env!("CARGO_MANIFEST_DIR")), + ); + + let mut settings = insta::Settings::clone_current(); + // Filename is exactly `<name>.snap` (no `module__` prefix) — matches the + // dump-format examples in snapshots.md (e.g. `flex_row_basic.snap`). + settings.set_prepend_module_to_snapshot(false); + settings.set_snapshot_path("snapshots"); + let _guard = settings.bind_to_scope(); + + insta::_macro_support::assert_snapshot( + (Some(name.to_string()), value.as_str()).into(), + workspace.as_path(), + // function_name only disambiguates auto-named snapshots; we always pass + // an explicit `name`, so an empty string is fine. + "", + // Empty module_path + `prepend_module_to_snapshot(false)` ⇒ no prefix. + "", + loc.file(), + loc.line(), + // The "expression" shown in the failure diff header. + name, + ) + .unwrap(); +} + +// --------------------------------------------------------------------------- +// Tier 1 — layout-number snapshots (gate #5).
+// --------------------------------------------------------------------------- + +/// Run one `update()` on `app`, then snapshot every entity's resolved box as a +/// stable [`layout_dump`], keyed by `name`. Pure-CPU: the caller wires +/// `MinimalPlugins + CorePlugin + LayoutPlugin` (no RenderApp). The `.snap` +/// lands beside the calling test (`<caller-dir>/snapshots/<name>.snap`). +#[track_caller] +pub fn assert_layout_snapshot(app: &mut App, name: &str) { + app.update(); + let dump = layout_dump(app.world()); + assert_named_snapshot(name, dump); +} + +/// The format-versioned Display dump backing [`assert_layout_snapshot`]: +/// `(name, position, size)` per [`ResolvedLayout`] entity, one per line, +/// indented by `ChildOf` depth, siblings ordered by `Name` (then `Entity` +/// index as a tiebreak). Floats round via [`round`]; an unnamed entity falls +/// back to `entity#<index>` (a flagged, non-diff-stable fixture). The dump +/// never prints raw `Entity` bits (snapshots.md § Tier 1). +pub fn layout_dump(world: &World) -> String { + let entries = collect_layout_entries(world); + + let mut out = String::new(); + out.push_str(LAYOUT_DUMP_VERSION); + out.push('\n'); + for e in &entries { + let indent = " ".repeat(e.depth); + let _ = writeln!( + out, + "{indent}{name} pos={px},{py} size={sx},{sy}", + name = e.name, + px = round(e.position.x), + py = round(e.position.y), + sx = round(e.size.x), + sy = round(e.size.y), + ); + } + out +} + +/// One resolved-layout row, pre-sorted into a stable, `Name`-keyed pre-order +/// tree walk (depth carries the `ChildOf` indentation). +struct LayoutEntry { + name: String, + depth: usize, + position: Vec2, + size: Vec2, +} + +/// Gather every `ResolvedLayout` entity into a stable pre-order list: roots +/// (entities with no `ChildOf`) first, then a depth-first descent through +/// `Children`, siblings ordered by `Name` then `Entity` index. The `Name`-key +/// is what makes the dump invariant to ECS spawn/archetype order.
+fn collect_layout_entries(world: &World) -> Vec<LayoutEntry> { + // entity -> (name, position, size) for every laid-out entity. + let mut boxes: HashMap<Entity, (String, Vec2, Vec2)> = HashMap::new(); + let mut q = world + .try_query::<(Entity, &ResolvedLayout, Option<&Name>)>() + .unwrap(); + for (e, layout, name) in q.iter(world) { + boxes.insert(e, (entity_label(name, e), layout.position, layout.size)); + } + + // Adjacency: parent -> children (only over laid-out entities). + let mut children: HashMap<Entity, Vec<Entity>> = HashMap::new(); + let mut has_parent: HashMap<Entity, bool> = HashMap::new(); + for &e in boxes.keys() { + has_parent.entry(e).or_insert(false); + } + let mut cq = world.try_query::<(Entity, &ChildOf)>().unwrap(); + for (e, child_of) in cq.iter(world) { + if !boxes.contains_key(&e) { + continue; + } + let parent = child_of.parent(); + if boxes.contains_key(&parent) { + children.entry(parent).or_default().push(e); + has_parent.insert(e, true); + } + } + + // Stable sibling order: by Name (the label) then Entity index. + let sort_key = |boxes: &HashMap<Entity, (String, Vec2, Vec2)>, e: &Entity| { + (boxes[e].0.clone(), e.index().index()) + }; + for siblings in children.values_mut() { + siblings.sort_by_key(|e| sort_key(&boxes, e)); + } + let mut roots: Vec<Entity> = boxes.keys().copied().filter(|e| !has_parent[e]).collect(); + roots.sort_by_key(|e| sort_key(&boxes, e)); + + let mut out = Vec::with_capacity(boxes.len()); + let mut stack: Vec<(Entity, usize)> = roots.into_iter().rev().map(|e| (e, 0)).collect(); + while let Some((e, depth)) = stack.pop() { + let (name, position, size) = boxes[&e].clone(); + out.push(LayoutEntry { + name, + depth, + position, + size, + }); + if let Some(kids) = children.get(&e) { + // Push reversed so the lowest sort_key is popped first. + for &child in kids.iter().rev() { + stack.push((child, depth + 1)); + } + } + } + out +} + +// --------------------------------------------------------------------------- +// Tier 2 — display-list / paint-order / instance snapshots.
+// ---------------------------------------------------------------------------
+
+/// Resolve an [`Entity`] to its human name for a dump: the [`Name`] component
+/// when present, else `entity#<index>`. Built from the `World` ONCE and passed
+/// into [`display_list_dump`] so that dump fn stays `World`-free and pure
+/// (snapshots.md § Tier 2 / README § Resolved #5).
+#[derive(Debug, Clone, Default)]
+pub struct NameLookup(HashMap<Entity, String>);
+
+impl NameLookup {
+    /// Build the entity→name map from every named entity in `world`. An entity
+    /// absent from the map renders as `entity#<index>` (the unnamed fallback).
+    pub fn from_world(world: &World) -> Self {
+        let mut map = HashMap::new();
+        let mut q = world.try_query::<(Entity, &Name)>().unwrap();
+        for (e, name) in q.iter(world) {
+            map.insert(e, name.as_str().to_string());
+        }
+        Self(map)
+    }
+
+    /// The label for `e`: its stored `Name`, else `entity#<index>`.
+    fn label(&self, e: Entity) -> String {
+        self.0
+            .get(&e)
+            .cloned()
+            .unwrap_or_else(|| format!("entity#{}", e.index().index()))
+    }
+}
+
+/// The label for an entity given its (optional) [`Name`] — the shared
+/// unnamed-fallback rule, so Tier 1 and Tier 2 agree.
+fn entity_label(name: Option<&Name>, e: Entity) -> String {
+    match name {
+        Some(n) => n.as_str().to_string(),
+        None => format!("entity#{}", e.index().index()),
+    }
+}
+
+/// `#rrggbbaa` for a color, in sRGB (the authoring space): the `ExtractedNode`
+/// color is already theme-resolved, so the magenta `MISSING_TOKEN_FALLBACK`
+/// sentinel surfaces here as `#ff00ffff` — a literal that flags an unresolved
+/// token (snapshots.md § Tier 2).
+fn color_hex(color: Color) -> String {
+    let s = Srgba::from(color);
+    let to_u8 = |c: f32| (c.clamp(0.0, 1.0) * 255.0).round() as u8;
+    format!(
+        "#{:02x}{:02x}{:02x}{:02x}",
+        to_u8(s.red),
+        to_u8(s.green),
+        to_u8(s.blue),
+        to_u8(s.alpha),
+    )
+}
+
+/// Render one node's clip field: `none` for the full-view sentinel, else
+/// `minx,miny..maxx,maxy` (rounded).
+fn clip_str(node: &ExtractedNode) -> String {
+    match node.clip {
+        None => "none".to_string(),
+        Some(c) => format!(
+            "{},{}..{},{}",
+            round(c.min.x),
+            round(c.min.y),
+            round(c.max.x),
+            round(c.max.y),
+        ),
+    }
+}
+
+/// Snapshot the CPU display-list handoff holistically (nodes in paint order +
+/// packed buckets in draw order), keyed by `name`, beside the calling test.
+/// See [`display_list_dump`].
+#[track_caller]
+pub fn assert_display_list_snapshot(nodes: &ExtractedNodes, name: &str, names: &NameLookup) {
+    let dump = display_list_dump(nodes, names);
+    assert_named_snapshot(name, dump);
+}
+
+/// Display dump of an [`ExtractedNodes`] set: every node in `painters_z` stored
+/// order (NEVER re-sorted by render — `extract.rs:141` — so a z-sort regression
+/// shows as a line reorder), then the [`pack_view`] [`InstanceBuckets`] in
+/// `BTreeMap` (draw) order with per-batch `xN` counts. Entities by `Name`;
+/// floats via [`round`]; format-version-headered (snapshots.md § Tier 2).
+///
+/// Color renders as `#rrggbbaa` (sRGB). Token-name rendering (`token:<name>`)
+/// is intentionally NOT done here: the pinned signature carries no `Theme`, and
+/// `ExtractedNode.color` is already resolved — so the hex IS the artifact, and
+/// the magenta sentinel surfaces as `#ff00ffff` (the unresolved-token signal).
+pub fn display_list_dump(nodes: &ExtractedNodes, names: &NameLookup) -> String {
+    let mut out = String::new();
+    out.push_str(DISPLAY_LIST_DUMP_VERSION);
+    out.push('\n');
+
+    out.push_str("[nodes painters_z]\n");
+    for (i, node) in nodes.nodes.iter().enumerate() {
+        let group = match node.group {
+            Some(g) => g.to_string(),
+            None => "none".to_string(),
+        };
+        let _ = writeln!(
+            out,
+            "{i} {name} rect pos={px},{py} size={sx},{sy} color={color} clip={clip} group={group}",
+            name = names.label(node.entity),
+            px = round(node.position.x),
+            py = round(node.position.y),
+            sx = round(node.size.x),
+            sy = round(node.size.y),
+            color = color_hex(node.color),
+            clip = clip_str(node),
+        );
+    }
+
+    out.push_str("[buckets draw-order]\n");
+    let buckets = pack_view(&nodes.nodes);
+    for (key, batch) in buckets.batches() {
+        let _ = writeln!(
+            out,
+            "({:?},layer={}) x{}",
+            key.primitive,
+            key.layer,
+            batch.len(),
+        );
+    }
+    out
+}
+
+// ---------------------------------------------------------------------------
+// The byte-exact `PackedInstance` hex check.
+// ---------------------------------------------------------------------------
+
+/// Hex-dump a [`PackedInstance`] as `bytemuck::bytes_of(p)` — a byte-exact
+/// snapshot of the GPU upload payload (52 B → 104 hex chars), independent of
+/// the Display dump's format version. A packing arithmetic change (e.g. the
+/// half-size sign bug `render_instance.rs` regression-tests) flips the hex even
+/// when the rounded Display dump rounds it away (snapshots.md § byte-exact).
+///
+/// **Endianness:** `bytes_of` is host-endian. CI and dev are both
+/// little-endian x86-64, and the hex is a within-repo regression artifact (not
+/// a cross-host wire format), so this is acceptable. A big-endian CI host would
+/// be a conscious change.
+pub fn instance_hex(p: &PackedInstance) -> String {
+    let bytes = bytemuck::bytes_of(p);
+    let mut s = String::with_capacity(bytes.len() * 2);
+    for b in bytes {
+        let _ = write!(s, "{b:02x}");
+    }
+    s
+}
+
+/// Assert one [`PackedInstance`]'s [`instance_hex`] against the named snapshot,
+/// beside the calling test. The byte-exact complement to the Display dump.
+#[track_caller]
+pub fn assert_instance_hex_snapshot(p: &PackedInstance, name: &str) {
+    assert_named_snapshot(name, instance_hex(p));
+}
+
+// ---------------------------------------------------------------------------
+// Per-timestamp animation snapshots (Tier 2, opt-in — Decision 8).
+// ---------------------------------------------------------------------------
+
+/// Snapshot the display-list dump at each virtual timestamp in `steps`,
+/// advancing `Time<Virtual>` to each **absolute** logical time (not wall-clock)
+/// between captures. One `.snap` per step, keyed `<name>@<ms>` (e.g.
+/// `caret_blink@0`, `caret_blink@250`), so a timing regression shows as a diff
+/// in exactly the frame whose curve drifted. Pure-CPU — the dump is a text
+/// artifact, so a 3-sample sequence costs ~3× a single dump, not a pixel
+/// capture (snapshots.md § Per-timestamp).
+///
+/// Opt-in per fixture: enroll a fixture only when its *timing curve* is the
+/// behavior under test (a custom easing, a staged reveal, the caret blink).
+/// Default sampling is three logical timestamps named by the caller.
+#[track_caller]
+pub fn assert_display_list_snapshot_at(app: &mut App, name: &str, steps: &[std::time::Duration]) {
+    for &t in steps {
+        // Drive the manual virtual clock to the ABSOLUTE logical time `t` (the
+        // landed `Time<Virtual>::advance_by` mechanism, text_caret_selection.rs),
+        // then run one update so the animation systems observe the new clock —
+        // Bevy's `TimePlugin` syncs `Time<Virtual>` into the generic `Time` at
+        // the head of each update, so no manual clock mirroring is needed.
+        advance_virtual_to(app, t);
+        app.update();
+
+        let names = NameLookup::from_world(app.world());
+        let nodes = extract_nodes_from_world(app.world());
+        let dump = display_list_dump(&nodes, &names);
+        let keyed = format!("{name}@{}", t.as_millis());
+        assert_named_snapshot(&keyed, dump);
+    }
+}
+
+/// Advance `Time<Virtual>` to an absolute logical time `t` (since clock start)
+/// by stepping the remaining delta. Steps are expected monotonic; a backwards
+/// `t` is a no-op (`advance_by` cannot rewind) — the determinism guarantee that
+/// makes per-timestamp snapshots reproducible byte-for-byte.
+fn advance_virtual_to(app: &mut App, t: std::time::Duration) {
+    let mut virt = app.world_mut().resource_mut::<Time<Virtual>>();
+    let elapsed = virt.elapsed();
+    let delta = t.checked_sub(elapsed).unwrap_or(std::time::Duration::ZERO);
+    virt.advance_by(delta);
+}
+
+/// Build an `ExtractedNodes` from a laid-out world by reading each entity's
+/// resolved box + background through the production `extracted_node_for`,
+/// ordered by `Name` then `Entity` index for determinism. Pure-CPU: this is the
+/// same single record builder the RenderApp's extract uses, with no GPU.
+fn extract_nodes_from_world(world: &World) -> ExtractedNodes {
+    use buiy_core::render::components::Background;
+    use buiy_core::render::extract::extracted_node_for;
+    use buiy_core::theme::Theme;
+
+    let theme = world.get_resource::<Theme>().cloned().unwrap_or_default();
+
+    let mut rows: Vec<(String, u32, ExtractedNode)> = Vec::new();
+    let mut q = world
+        .try_query::<(
+            Entity,
+            &ResolvedLayout,
+            Option<&GlobalTransform>,
+            Option<&Background>,
+            Option<&Name>,
+        )>()
+        .unwrap();
+    for (e, layout, gt, bg, name) in q.iter(world) {
+        let gt = gt.copied().unwrap_or(GlobalTransform::IDENTITY);
+        let node = extracted_node_for(e, &gt, layout, bg, None, &theme);
+        rows.push((entity_label(name, e), e.index().index(), node));
+    }
+    rows.sort_by(|a, b| a.0.cmp(&b.0).then(a.1.cmp(&b.1)));
+
+    ExtractedNodes {
+        nodes: rows.into_iter().map(|(_, _, n)| n).collect(),
+        ..Default::default()
+    }
+}
diff --git a/crates/buiy_verify/tests/snapshot_dump.rs b/crates/buiy_verify/tests/snapshot_dump.rs
new file mode 100644
index 0000000..3e051ca
--- /dev/null
+++ b/crates/buiy_verify/tests/snapshot_dump.rs
@@ -0,0 +1,38 @@
+//! Task 2.1 — shared dump primitives: `round` + format-version headers.
+//! Plain `assert_eq!` (NOT a snapshot) so this meta-test of the snapshot tooling
+//! cannot pass vacuously (snapshots.md § Verification #2).
+
+use buiy_verify::snapshot::{DISPLAY_LIST_DUMP_VERSION, LAYOUT_DUMP_VERSION, ROUND_DP, round};
+
+#[test]
+fn round_table() {
+    // ROUND_DP = 2: round to 2 decimals, then strip trailing zeros / the
+    // trailing dot so the dump stays diff-readable and last-ULP-stable.
+    // sub-ULP + negative inputs (snapshots.md § Verification #2).
+    assert_eq!(ROUND_DP, 2);
+    // snapshots.md § Verification #2 lists `round(1.005) == "1.0"`, but that
+    // vector is self-inconsistent with `round(50.0) == "50"`: `1.005_f32` is
+    // `1.00499…`, which formats to `"1.00"` at 2 dp — byte-identical suffix to
+    // `50.0`'s `"50.00"`, so ONE trailing-zero rule cannot strip one to `"1.0"`
+    // and the other to `"50"`. We strip ALL trailing zeros (the only
+    // self-consistent rule). The vector's INTENT — proving `1.005` rounds DOWN
+    // to 1.00, never up to 1.01 — is fully preserved by `"1"`.
+    assert_eq!(round(1.005), "1"); // rounds to 1.00 (NOT 1.01), then strips
+    assert_eq!(round(50.0), "50"); // integral value drops the ".0"
+    assert_eq!(round(-0.001), "0"); // sub-ULP negative collapses to "0" (no "-0")
+    assert_eq!(round(0.0), "0");
+    assert_eq!(round(-0.0), "0"); // negative zero normalizes to "0"
+    assert_eq!(round(50.5), "50.5");
+    assert_eq!(round(50.567), "50.57"); // rounds at the 2nd decimal
+    assert_eq!(round(-12.34), "-12.34");
+    assert_eq!(round(100.0), "100");
+}
+
+#[test]
+fn version_headers_are_stable_constants() {
+    // The format-version tripwire (snapshots.md § Verification #4): a formatter
+    // edit that should bump the version but didn't fails the dump header tests
+    // (2.2/2.4); these pin the literal strings the dumps emit as line 1.
+    assert_eq!(LAYOUT_DUMP_VERSION, "# buiy-layout-dump v1");
+    assert_eq!(DISPLAY_LIST_DUMP_VERSION, "# buiy-display-list-dump v1");
+}
From 80c7b6032e081b26a068d5a23a65d57e1d0ce1cc Mon Sep 17 00:00:00 2001
From: Noah
Date: Mon, 15 Jun 2026 07:07:50 -0700
Subject: [PATCH 29/70] feat(verify): Tier-1 layout_dump + assert_layout_snapshot (gate #5)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`layout_dump(world)` emits one `(name, pos, size)` line per ResolvedLayout
entity, indented by ChildOf depth, siblings ordered by Name then Entity
index — the Name-key is what makes the dump invariant to ECS spawn /
archetype order (proved by the entity-order-invariant self-test). Floats via
the shared `round`; unnamed entities fall back to `entity#<index>`;
version-headered.

`assert_layout_snapshot(app, name)` runs one update() then snapshots the dump
via the #[track_caller] insta bridge, so the `.snap` lands beside the CALLING
test (verified: buiy_core's flex_row_basic.snap landed under
crates/buiy_core/tests/snapshots/, not buiy_verify's tree).

Self-tests (plain assert_eq!, non-vacuous): entity-order invariance,
version-header tripwire, unnamed-fallback.

Migration (layout.rs:33): the child-only `(size - 50).abs() < 0.5` pair
becomes one `assert_layout_snapshot(&mut app, "flex_row_basic")` over a
Name-tagged root + TWO 50x50 children — the snapshot pins every box's
position+size (strictly more than the old tolerance assert) and exercises
sibling ordering. The two layout_tree_garbage_collects_* tests STAY plain
assert_eq! (LayoutTree cardinality, not geometry — a length snapshot is
lower-density).

Robustness: collect_layout_entries / NameLookup::from_world /
extract_nodes_from_world look up Name/ChildOf/Background per-entity via
world.get and tolerate try_query returning None for an unregistered component
(a fixture that tags none) — fixes a panic on a nameless, childless fixture.
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_core/tests/layout.rs | 37 ++++-- .../tests/snapshots/flex_row_basic.snap | 8 ++ crates/buiy_verify/src/snapshot.rs | 75 +++++++----- crates/buiy_verify/tests/snapshot_layout.rs | 113 ++++++++++++++++++ .../snapshots/flex_row_basic_selftest.snap | 8 ++ 5 files changed, 201 insertions(+), 40 deletions(-) create mode 100644 crates/buiy_core/tests/snapshots/flex_row_basic.snap create mode 100644 crates/buiy_verify/tests/snapshot_layout.rs create mode 100644 crates/buiy_verify/tests/snapshots/flex_row_basic_selftest.snap diff --git a/crates/buiy_core/tests/layout.rs b/crates/buiy_core/tests/layout.rs index b4fbae6..e98f3fc 100644 --- a/crates/buiy_core/tests/layout.rs +++ b/crates/buiy_core/tests/layout.rs @@ -4,6 +4,7 @@ use buiy_core::{ components::{Node, ResolvedLayout}, layout::{LayoutPlugin, LayoutTree, Style}, }; +use buiy_verify::snapshot::assert_layout_snapshot; #[test] fn layout_resolves_a_simple_flex_row() { @@ -12,29 +13,43 @@ fn layout_resolves_a_simple_flex_row() { app.add_plugins(CorePlugin); app.add_plugins(LayoutPlugin); + // A 200x100 flex-row root with two 50x50 children. `Name`-tagging is what + // makes the Tier-1 layout snapshot diff-stable (entity-by-Name, never raw + // Entity bits). The trailing per-field `(size.x - 50.0).abs() < 0.5` pair + // is now one holistic `assert_layout_snapshot` — the .snap pins EVERY box's + // position+size (root + both children), strictly more than the old child- + // only width/height tolerance asserts (snapshots.md § Tier 1). 
let parent = app .world_mut() .spawn(( Node, + Name::new("root"), Style::default().flex_row().width_px(200.0).height_px(100.0), )) .id(); - let child = app + let child0 = app .world_mut() - .spawn((Node, Style::default().width_px(50.0).height_px(50.0))) + .spawn(( + Node, + Name::new("row.item[0]"), + Style::default().width_px(50.0).height_px(50.0), + )) + .id(); + let child1 = app + .world_mut() + .spawn(( + Node, + Name::new("row.item[1]"), + Style::default().width_px(50.0).height_px(50.0), + )) .id(); - app.world_mut().entity_mut(parent).add_child(child); - - app.update(); + app.world_mut() + .entity_mut(parent) + .add_children(&[child0, child1]); - let layout = app - .world() - .get::(child) - .expect("child has ResolvedLayout after Update"); - assert!((layout.size.x - 50.0).abs() < 0.5, "child width ~ 50"); - assert!((layout.size.y - 50.0).abs() < 0.5, "child height ~ 50"); + assert_layout_snapshot(&mut app, "flex_row_basic"); } #[test] diff --git a/crates/buiy_core/tests/snapshots/flex_row_basic.snap b/crates/buiy_core/tests/snapshots/flex_row_basic.snap new file mode 100644 index 0000000..507ca52 --- /dev/null +++ b/crates/buiy_core/tests/snapshots/flex_row_basic.snap @@ -0,0 +1,8 @@ +--- +source: crates/buiy_core/tests/layout.rs +expression: flex_row_basic +--- +# buiy-layout-dump v1 +root pos=0,0 size=200,100 + row.item[0] pos=0,0 size=50,50 + row.item[1] pos=50,0 size=50,50 diff --git a/crates/buiy_verify/src/snapshot.rs b/crates/buiy_verify/src/snapshot.rs index f6916cd..10b2f59 100644 --- a/crates/buiy_verify/src/snapshot.rs +++ b/crates/buiy_verify/src/snapshot.rs @@ -179,30 +179,38 @@ struct LayoutEntry { /// `Children`, siblings ordered by `Name` then `Entity` index. The `Name`-key /// is what makes the dump invariant to ECS spawn/archetype order. fn collect_layout_entries(world: &World) -> Vec { - // entity -> (name, position, size) for every laid-out entity. + // entity -> (name, position, size) for every laid-out entity. 
`Name` is + // looked up per-entity via `world.get` (not in the query) because `Name` + // may be UNREGISTERED in a fixture that tags no entity — `try_query` over + // an unregistered component returns `None`. `ResolvedLayout` is always + // registered by `LayoutPlugin`, so its query never fails. let mut boxes: HashMap = HashMap::new(); let mut q = world - .try_query::<(Entity, &ResolvedLayout, Option<&Name>)>() - .unwrap(); - for (e, layout, name) in q.iter(world) { - boxes.insert(e, (entity_label(name, e), layout.position, layout.size)); + .try_query::<(Entity, &ResolvedLayout)>() + .expect("ResolvedLayout is registered by LayoutPlugin"); + for (e, layout) in q.iter(world) { + let label = entity_label(world.get::(e), e); + boxes.insert(e, (label, layout.position, layout.size)); } - // Adjacency: parent -> children (only over laid-out entities). + // Adjacency: parent -> children (only over laid-out entities). `ChildOf` + // may be unregistered (a flat fixture with no children) — then every + // entity is a root. let mut children: HashMap> = HashMap::new(); let mut has_parent: HashMap = HashMap::new(); for &e in boxes.keys() { has_parent.entry(e).or_insert(false); } - let mut cq = world.try_query::<(Entity, &ChildOf)>().unwrap(); - for (e, child_of) in cq.iter(world) { - if !boxes.contains_key(&e) { - continue; - } - let parent = child_of.parent(); - if boxes.contains_key(&parent) { - children.entry(parent).or_default().push(e); - has_parent.insert(e, true); + if let Some(mut cq) = world.try_query::<(Entity, &ChildOf)>() { + for (e, child_of) in cq.iter(world) { + if !boxes.contains_key(&e) { + continue; + } + let parent = child_of.parent(); + if boxes.contains_key(&parent) { + children.entry(parent).or_default().push(e); + has_parent.insert(e, true); + } } } @@ -252,9 +260,12 @@ impl NameLookup { /// absent from the map renders as `entity#` (the unnamed fallback). 
pub fn from_world(world: &World) -> Self { let mut map = HashMap::new(); - let mut q = world.try_query::<(Entity, &Name)>().unwrap(); - for (e, name) in q.iter(world) { - map.insert(e, name.as_str().to_string()); + // `Name` may be unregistered (no entity is named) — then the map is + // empty and every entity falls back to `entity#`. + if let Some(mut q) = world.try_query::<(Entity, &Name)>() { + for (e, name) in q.iter(world) { + map.insert(e, name.as_str().to_string()); + } } Self(map) } @@ -452,19 +463,25 @@ fn extract_nodes_from_world(world: &World) -> ExtractedNodes { let theme = world.get_resource::().cloned().unwrap_or_default(); let mut rows: Vec<(String, u32, ExtractedNode)> = Vec::new(); + // Query only the always-registered `ResolvedLayout`; the optional paint + // inputs (`GlobalTransform`/`Background`/`Name`) are looked up per-entity + // via `world.get`, which tolerates an unregistered component (a fixture + // that tags none) where `try_query` would return `None`. let mut q = world - .try_query::<( - Entity, - &ResolvedLayout, - Option<&GlobalTransform>, - Option<&Background>, - Option<&Name>, - )>() - .unwrap(); - for (e, layout, gt, bg, name) in q.iter(world) { - let gt = gt.copied().unwrap_or(GlobalTransform::IDENTITY); + .try_query::<(Entity, &ResolvedLayout)>() + .expect("ResolvedLayout is registered by LayoutPlugin"); + for (e, layout) in q.iter(world) { + let gt = world + .get::(e) + .copied() + .unwrap_or(GlobalTransform::IDENTITY); + let bg = world.get::(e); let node = extracted_node_for(e, >, layout, bg, None, &theme); - rows.push((entity_label(name, e), e.index().index(), node)); + rows.push(( + entity_label(world.get::(e), e), + e.index().index(), + node, + )); } rows.sort_by(|a, b| a.0.cmp(&b.0).then(a.1.cmp(&b.1))); diff --git a/crates/buiy_verify/tests/snapshot_layout.rs b/crates/buiy_verify/tests/snapshot_layout.rs new file mode 100644 index 0000000..38ed7ba --- /dev/null +++ b/crates/buiy_verify/tests/snapshot_layout.rs @@ -0,0 
+1,113 @@
+//! Task 2.2 self-tests for the Tier-1 layout dump. These are PLAIN `assert_eq!`
+//! (not snapshots) so the meta-tests of the snapshot tooling cannot pass
+//! vacuously (snapshots.md § Verification #1, #4).
+
+use bevy::prelude::*;
+use buiy_core::CorePlugin;
+use buiy_core::components::Node;
+use buiy_core::layout::{LayoutPlugin, Style};
+use buiy_verify::snapshot::{LAYOUT_DUMP_VERSION, assert_layout_snapshot, layout_dump};
+
+/// Build a minimal pure-CPU layout app: a 200x100 flex-row root with two 50x50
+/// children, every entity `Name`-tagged. `reversed` flips the order the two
+/// children are spawned so the determinism test can prove `Name`-keyed output
+/// is invariant to ECS spawn / archetype order.
+fn flex_row_app(reversed: bool) -> App {
+    let mut app = App::new();
+    app.add_plugins(MinimalPlugins);
+    app.add_plugins(CorePlugin);
+    app.add_plugins(LayoutPlugin);
+
+    let spawn_child = |app: &mut App, label: &str, w: f32| {
+        app.world_mut()
+            .spawn((
+                Node,
+                Name::new(label.to_string()),
+                Style::default().width_px(w).height_px(50.0),
+            ))
+            .id()
+    };
+
+    // Flip spawn order to perturb Entity allocation; the dump must not change.
+    let (a, b) = if reversed {
+        let b = spawn_child(&mut app, "row.item[1]", 50.0);
+        let a = spawn_child(&mut app, "row.item[0]", 50.0);
+        (a, b)
+    } else {
+        let a = spawn_child(&mut app, "row.item[0]", 50.0);
+        let b = spawn_child(&mut app, "row.item[1]", 50.0);
+        (a, b)
+    };
+
+    let root = app
+        .world_mut()
+        .spawn((
+            Node,
+            Name::new("root"),
+            Style::default().flex_row().width_px(200.0).height_px(100.0),
+        ))
+        .id();
+    app.world_mut().entity_mut(root).add_children(&[a, b]);
+    app
+}
+
+#[test]
+fn dump_is_entity_order_invariant() {
+    // snapshots.md § Verification #1: the same fixture, spawned in two different
+    // entity orders, must produce a BYTE-IDENTICAL dump — the property the
+    // Name-keyed sibling sort exists to guarantee.
+    let mut a = flex_row_app(false);
+    let mut b = flex_row_app(true);
+    a.update();
+    b.update();
+    let da = layout_dump(a.world());
+    let db = layout_dump(b.world());
+    assert_eq!(
+        da, db,
+        "layout dump must be invariant to entity spawn order"
+    );
+    // And it is non-empty / structured (guards a vacuous "" == "" pass).
+    assert!(da.contains("root pos="), "dump names the root by Name");
+    assert!(da.contains("row.item[0]"), "dump names the first child");
+}
+
+#[test]
+fn layout_dump_has_version_header() {
+    // snapshots.md § Verification #4: line 1 is the format-version constant, so
+    // a formatter edit that should bump the version but didn't fails here.
+    let mut app = flex_row_app(false);
+    app.update();
+    let dump = layout_dump(app.world());
+    assert_eq!(
+        dump.lines().next(),
+        Some(LAYOUT_DUMP_VERSION),
+        "first line must be the layout dump version header"
+    );
+}
+
+#[test]
+fn unnamed_entity_falls_back_to_entity_index() {
+    // An entity with no `Name` renders as `entity#<index>` (flagged, since an
+    // unnamed fixture is non-diff-stable). Proves the fallback path.
+    let mut app = App::new();
+    app.add_plugins(MinimalPlugins);
+    app.add_plugins(CorePlugin);
+    app.add_plugins(LayoutPlugin);
+    app.world_mut()
+        .spawn((Node, Style::default().width_px(10.0).height_px(10.0)));
+    app.update();
+    let dump = layout_dump(app.world());
+    assert!(
+        dump.contains("entity#"),
+        "unnamed entity uses the entity#<index> fallback, got:\n{dump}"
+    );
+}
+
+#[test]
+fn flex_row_basic_layout_snapshot() {
+    // The migration target from buiy_core's layout.rs:33 also runs here as a
+    // buiy_verify self-test of the full `assert_layout_snapshot` path (insta
+    // bridge + dump). `.snap` lands beside THIS file.
+    let mut app = flex_row_app(false);
+    assert_layout_snapshot(&mut app, "flex_row_basic_selftest");
+}
diff --git a/crates/buiy_verify/tests/snapshots/flex_row_basic_selftest.snap b/crates/buiy_verify/tests/snapshots/flex_row_basic_selftest.snap
new file mode 100644
index 0000000..bc83d3c
--- /dev/null
+++ b/crates/buiy_verify/tests/snapshots/flex_row_basic_selftest.snap
@@ -0,0 +1,8 @@
+---
+source: crates/buiy_verify/tests/snapshot_layout.rs
+expression: flex_row_basic_selftest
+---
+# buiy-layout-dump v1
+root pos=0,0 size=200,100
+ row.item[0] pos=0,0 size=50,50
+ row.item[1] pos=50,0 size=50,50
From 990bca94afad88c37678e6e8fe6976cc4511085a Mon Sep 17 00:00:00 2001
From: Noah
Date: Mon, 15 Jun 2026 07:08:29 -0700
Subject: [PATCH 30/70] feat(verify): NameLookup + byte-exact instance_hex check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`instance_hex(p)` hex-dumps `bytemuck::bytes_of(&PackedInstance)` (52 B → 104
hex chars) — a byte-exact, format-version-free snapshot of the GPU upload
payload, the complement to the diff-readable Display dump: a packing
arithmetic change flips the hex even when the rounded dump rounds it away.
`NameLookup` (entity→name, World-built once) keeps the display-list dump
World-free.

Self-tests (plain assert_eq!, non-vacuous):
- hex_round_trips_bytes: hex → parse → pod_read_unaligned reconstructs the
  exact instance bytes (lossless, matches the GPU payload).
- hex_flips_on_a_packing_change: a negated height (the half-size sign bug
  render_instance.rs regression-tests) flips the hex — proves teeth.

Endianness: bytes_of is host-endian; CI + dev are little-endian x86-64 and the
hex is a within-repo regression artifact, documented in the fn.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 .../tests/snapshot_instance_hex.rs | 62 +++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 crates/buiy_verify/tests/snapshot_instance_hex.rs

diff --git a/crates/buiy_verify/tests/snapshot_instance_hex.rs b/crates/buiy_verify/tests/snapshot_instance_hex.rs
new file mode 100644
index 0000000..d1917da
--- /dev/null
+++ b/crates/buiy_verify/tests/snapshot_instance_hex.rs
@@ -0,0 +1,62 @@
+//! Task 2.3 self-test for the byte-exact `PackedInstance` hex check. Plain
+//! `assert_eq!` (NOT a snapshot) so the round-trip cannot pass vacuously
+//! (snapshots.md § Verification #3).
+
+use buiy_core::render::instance::PackedInstance;
+use buiy_verify::snapshot::instance_hex;
+
+#[test]
+fn hex_round_trips_bytes() {
+    // `instance_hex(p)` → parse hex → `bytemuck::pod_read_unaligned` must
+    // reconstruct the ORIGINAL `PackedInstance` bit-for-bit, proving the hex is
+    // lossless and matches the GPU upload payload (52 B → 104 hex chars).
+    let p = PackedInstance {
+        rect_pos: [10.0, 20.0],
+        rect_size: [100.0, 40.0],
+        color: [0.25, 0.5, 0.75, 1.0],
+        radius: 8.0,
+        clip_min: [0.0, 0.0],
+        clip_max: [200.0, 100.0],
+    };
+
+    let hex = instance_hex(&p);
+    assert_eq!(hex.len(), 104, "52 bytes → 104 hex chars");
+
+    // Parse the hex back into the 52 bytes.
+    let bytes: Vec<u8> = (0..hex.len())
+        .step_by(2)
+        .map(|i| u8::from_str_radix(&hex[i..i + 2], 16).unwrap())
+        .collect();
+    assert_eq!(bytes.len(), std::mem::size_of::<PackedInstance>());
+
+    let round: PackedInstance = bytemuck::pod_read_unaligned(&bytes);
+    // PackedInstance has no PartialEq; compare its raw bytes (the GPU payload
+    // identity that matters).
+    assert_eq!(
+        bytemuck::bytes_of(&round),
+        bytemuck::bytes_of(&p),
+        "hex round-trip must reconstruct the exact instance bytes"
+    );
+}
+
+#[test]
+fn hex_flips_on_a_packing_change() {
+    // Teeth: a single-field change MUST flip the hex (so the snapshot has bite).
+    let base = PackedInstance {
+        rect_pos: [10.0, 20.0],
+        rect_size: [100.0, 40.0],
+        color: [1.0, 1.0, 1.0, 1.0],
+        radius: 0.0,
+        clip_min: [f32::NEG_INFINITY, f32::NEG_INFINITY],
+        clip_max: [f32::INFINITY, f32::INFINITY],
+    };
+    let mut flipped = base;
+    // The half-size sign bug `render_instance.rs` regression-tests: a negated
+    // height must change the bytes.
+    flipped.rect_size[1] = -flipped.rect_size[1];
+    assert_ne!(
+        instance_hex(&base),
+        instance_hex(&flipped),
+        "a negated height (the half-size sign bug) must flip the hex"
+    );
+}
From b14e2a0160cbdf0cd34e6f198d6d8e8e732cfc56 Mon Sep 17 00:00:00 2001
From: Noah
Date: Mon, 15 Jun 2026 07:09:51 -0700
Subject: [PATCH 31/70] feat(verify): Tier-2 display_list_dump + assert_display_list_snapshot
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`display_list_dump(nodes, names)` emits the CPU display-list handoff
holistically in one artifact:
- `[nodes painters_z]`: every ExtractedNode in STORED order (never re-sorted —
  extract.rs:141 — so a z-sort regression shows as a line reorder), one paint
  command per line: order index, Name, kind, pos, size, color (#rrggbbaa
  sRGB), clip (none | min..max), group (idx|none).
- `[buckets draw-order]`: the pack_view() InstanceBuckets in BTreeMap (draw)
  order with per-batch `xN` counts.
Version-headered; floats via the shared round; entities by Name.

Self-tests (plain assert!, non-vacuous): Name-keyed entity-order invariance
(two scenes whose names map to different Entity ids dump identically), version
header, stored paint order + clip/group rendering, magenta
MISSING_TOKEN_FALLBACK surfaces as `#ff00ffff` (the unresolved-token signal),
bucket draw-order counts.

Deviation (documented in the fn + report): color renders as #rrggbbaa, NOT
`token:<name>`.
The pinned signature carries no Theme and ExtractedNode.color is already theme-resolved, so reverse-resolving a Color to a token name is impossible here — the hex IS the artifact, and the sentinel surfaces as #ff00ffff exactly as the self-test requires. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../tests/snapshot_display_list.rs | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 crates/buiy_verify/tests/snapshot_display_list.rs diff --git a/crates/buiy_verify/tests/snapshot_display_list.rs b/crates/buiy_verify/tests/snapshot_display_list.rs new file mode 100644 index 0000000..bed7509 --- /dev/null +++ b/crates/buiy_verify/tests/snapshot_display_list.rs @@ -0,0 +1,148 @@ +//! Task 2.4 self-tests for the Tier-2 display-list dump. Plain `assert_eq!` / +//! `assert!` (NOT snapshots) so these meta-tests cannot pass vacuously +//! (snapshots.md § Verification #1, #4 + the magenta-sentinel signal). + +use bevy::prelude::*; +use buiy_core::render::color::MISSING_TOKEN_FALLBACK; +use buiy_core::render::components::ClipRect; +use buiy_core::render::extract::{ExtractedNode, ExtractedNodes}; +use buiy_verify::snapshot::{DISPLAY_LIST_DUMP_VERSION, NameLookup, display_list_dump}; + +/// Build an `ExtractedNodes` set with two named nodes (a clipped tooltip over a +/// modal) plus the matching `NameLookup`, both derived from REAL entities in a +/// fresh `World`. `swap` flips the order the two entities are spawned, which +/// perturbs their Entity ids — so the determinism self-test proves the dump is +/// Name-keyed (id-invariant), end-to-end through `NameLookup::from_world`. 
+fn two_node_scene(swap: bool) -> (ExtractedNodes, NameLookup) { + let mut world = World::new(); + let (modal_e, tooltip_e) = if swap { + let t = world.spawn(Name::new("tooltip")).id(); + let m = world.spawn(Name::new("modal")).id(); + (m, t) + } else { + let m = world.spawn(Name::new("modal")).id(); + let t = world.spawn(Name::new("tooltip")).id(); + (m, t) + }; + + let modal = ExtractedNode { + entity: modal_e, + position: Vec2::new(10.0, 20.0), + size: Vec2::new(100.0, 40.0), + color: Color::srgba(0.1, 0.2, 0.3, 1.0), + clip: None, + group: None, + }; + let tooltip = ExtractedNode { + entity: tooltip_e, + position: Vec2::new(0.0, 0.0), + size: Vec2::new(80.0, 24.0), + color: Color::WHITE, + clip: Some(ClipRect { + min: Vec2::new(0.0, 0.0), + max: Vec2::new(80.0, 24.0), + }), + group: Some(0), + }; + let nodes = ExtractedNodes { + // Stored paint order is modal (bottom) then tooltip (top); the dump + // emits this verbatim regardless of the entities' raw ids. + nodes: vec![modal, tooltip], + ..Default::default() + }; + (nodes, NameLookup::from_world(&world)) +} + +#[test] +fn display_dump_is_entity_order_invariant() { + // snapshots.md § Verification #1: the dump renders entities by Name, so two + // scenes whose names map to DIFFERENT Entity ids produce a byte-identical + // dump (the node ORDER is the same; only the underlying ids differ). + let (na, la) = two_node_scene(false); + let (nb, lb) = two_node_scene(true); + let da = display_list_dump(&na, &la); + let db = display_list_dump(&nb, &lb); + assert_eq!(da, db, "display-list dump must be invariant to Entity ids"); + assert!(da.contains("modal rect"), "names the modal node"); + assert!(da.contains("tooltip rect"), "names the tooltip node"); +} + +#[test] +fn display_dump_has_version_header() { + // snapshots.md § Verification #4. 
+ let (nodes, names) = two_node_scene(false); + let dump = display_list_dump(&nodes, &names); + assert_eq!( + dump.lines().next(), + Some(DISPLAY_LIST_DUMP_VERSION), + "first line is the display-list dump version header" + ); +} + +#[test] +fn nodes_render_in_stored_paint_order() { + // The dump emits `ExtractedNodes.nodes` in STORED order (never re-sorted — + // extract.rs:141), so a z-sort regression shows as a line reorder. The + // modal is index 0, the tooltip index 1. + let (nodes, names) = two_node_scene(false); + let dump = display_list_dump(&nodes, &names); + let node_lines: Vec<&str> = dump + .lines() + .skip_while(|l| !l.starts_with("[nodes")) + .skip(1) + .take_while(|l| !l.starts_with("[buckets")) + .collect(); + assert_eq!(node_lines.len(), 2); + assert!(node_lines[0].starts_with("0 modal rect"), "index 0 = modal"); + assert!( + node_lines[1].starts_with("1 tooltip rect"), + "index 1 = tooltip" + ); + // The clipped tooltip renders its clip AABB; the unclipped modal is `none`. + assert!(node_lines[0].contains("clip=none")); + assert!(node_lines[1].contains("clip=0,0..80,24")); + assert!(node_lines[1].contains("group=0")); + assert!(node_lines[0].contains("group=none")); +} + +#[test] +fn missing_token_surfaces_as_magenta() { + // snapshots.md § Tier 2: `ExtractedNode.color` is already theme-resolved, so + // a node carrying the magenta MISSING_TOKEN_FALLBACK sentinel dumps as the + // literal `#ff00ffff` — an unresolved-token regression signal in the diff. + let node = ExtractedNode { + entity: Entity::from_raw_u32(1).unwrap(), + position: Vec2::ZERO, + size: Vec2::splat(10.0), + color: MISSING_TOKEN_FALLBACK, + clip: None, + group: None, + }; + let nodes = ExtractedNodes { + nodes: vec![node], + ..Default::default() + }; + let dump = display_list_dump(&nodes, &NameLookup::default()); + assert!( + dump.contains("color=#ff00ffff"), + "magenta sentinel must surface as #ff00ffff, got:\n{dump}" + ); + // Unnamed entity falls back to entity#<index>. 
+ assert!(dump.contains("entity#1 rect"), "unnamed fallback in dump"); +} + +#[test] +fn buckets_appear_in_draw_order_with_counts() { + // The dump appends the pack_view() InstanceBuckets in BTreeMap (draw) order + // with per-batch `xN` counts. Two opaque nodes → one (Quad,layer=0) x2. + let (nodes, names) = two_node_scene(false); + let dump = display_list_dump(&nodes, &names); + assert!( + dump.contains("[buckets draw-order]"), + "dump has a buckets section" + ); + assert!( + dump.contains("(Quad,layer=0) x2"), + "two opaque nodes pack to one Quad batch of 2, got:\n{dump}" + ); +} From 2c534f5211778ac0ff4a1b09a7a2e60442dc57b2 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 07:18:42 -0700 Subject: [PATCH 32/70] feat(verify): migrate render/layout assert_eq! to Tier-1/2 snapshots MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace (don't duplicate): keep each test's scene construction + intent comment + single-named-invariant asserts; collapse only holistic field-by-field / order assert blocks into one snapshot. render_instance.rs: per-field PackedInstance asserts (pos/size/radius, clip_min/max, sentinel) → 3 `assert_instance_hex_snapshot`s. Kept: the stride asserts (single named invariant) + the CPU-pre-linearization oracle check (cross-checks LinearRgba::from, which a self-snapshot can't replicate). Mutation check: re-introduced the half-size sign bug — the hex flipped 0000a042 (80.0) → 0000a0c2 (-80.0); reverted. render_buckets.rs: the `batch[0] == packed_to_raw(...)` payload check → `assert_instance_hex_snapshot` (the preserved oracle assert stays; the hex pins the exact bytes). Kept: Shadow.paint_order() < Quad, the batch key-sort, and the InstanceBuckets group/iterate mechanics (unit tests on synthetic data, not display-list state). render_extract.rs: the `assemble_context_tree` order `assert_eq!(got, vec![root,a,nested,c,d,b])` → a Name-keyed `assert_display_list_snapshot`. 
Mutation check: re-introduced the flat-concat bug — the node lines reordered (C,D after B); reverted. Kept: the single-node box/color-resolution oracle checks. render_paint_order.rs (the spec's "top_layer.rs" — partition_top_layer order asserts live here): the tail-order `assert_eq!(tail, vec![fullscreen,tooltip,popover,modal])` → a Name-keyed display-list snapshot. Kept: the modal-first-hit-candidate ordering identity (a named invariant). NameLookup gains `from_pairs` — a World-free constructor for pure-CPU tests that assemble synthetic ExtractedNodes (mirrors from_world). Every .snap reviewed (decoded the hex payloads; confirmed paint orders) before accept. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_core/tests/render_buckets.rs | 15 ++++- crates/buiy_core/tests/render_extract.rs | 27 +++++++-- crates/buiy_core/tests/render_instance.rs | 21 ++++--- crates/buiy_core/tests/render_paint_order.rs | 59 +++++++++++++++---- .../snapshots/nested_context_paint_order.snap | 14 +++++ .../snapshots/pack_extracted_finite_clip.snap | 5 ++ .../pack_extracted_sentinel_clip.snap | 5 ++ .../snapshots/pack_instance_logical_px.snap | 5 ++ .../snapshots/pack_view_node_payload.snap | 5 ++ .../snapshots/top_layer_tail_tier_order.snap | 12 ++++ crates/buiy_verify/src/snapshot.rs | 12 ++++ 11 files changed, 153 insertions(+), 27 deletions(-) create mode 100644 crates/buiy_core/tests/snapshots/nested_context_paint_order.snap create mode 100644 crates/buiy_core/tests/snapshots/pack_extracted_finite_clip.snap create mode 100644 crates/buiy_core/tests/snapshots/pack_extracted_sentinel_clip.snap create mode 100644 crates/buiy_core/tests/snapshots/pack_instance_logical_px.snap create mode 100644 crates/buiy_core/tests/snapshots/pack_view_node_payload.snap create mode 100644 crates/buiy_core/tests/snapshots/top_layer_tail_tier_order.snap diff --git a/crates/buiy_core/tests/render_buckets.rs b/crates/buiy_core/tests/render_buckets.rs index a3c25a7..eb54914 100644 --- 
a/crates/buiy_core/tests/render_buckets.rs +++ b/crates/buiy_core/tests/render_buckets.rs @@ -127,6 +127,7 @@ use bevy::prelude::*; use buiy_core::render::buckets::pack_view; use buiy_core::render::extract::ExtractedNode; use buiy_core::render::instance::{pack_extracted, packed_raw_stride_agrees}; +use buiy_verify::snapshot::assert_instance_hex_snapshot; // pack_view consumes R5's ExtractedNode records (the prepare seam, Task 6) — the // bucketing assertions below are unchanged from the DrawData era; only the input @@ -168,6 +169,13 @@ fn pack_view_routes_every_draw_to_quad_layer_0() { #[test] fn pack_view_preserves_packed_values_in_order() { + // pack_view's single batch holds each node packed verbatim. The old + // `batch[0] == packed_to_raw(pack_extracted(node))` oracle cross-check + // becomes a byte-exact hex snapshot of the packed payload: it pins the + // EXACT instance bytes pack_view emits (snapshots.md § Tier 2 — the bucket + // dump pins counts, the hex pins the payload). The asserts below still + // prove the batch's bytes equal the packing-fn output (the preserved + // oracle), and the hex pins what those bytes ARE. let nodes = vec![node( 1, Vec2::new(7.0, 9.0), @@ -176,8 +184,11 @@ fn pack_view_preserves_packed_values_in_order() { )]; let buckets = pack_view(&nodes); let (_, batch) = buckets.batches().next().expect("one batch"); - let expect = buiy_core::render::buckets::packed_to_raw(&pack_extracted(&nodes[0])); - assert_eq!(batch[0], expect); + let packed = pack_extracted(&nodes[0]); + // Preserved oracle: the batch's raw row equals the packing fn's output. + assert_eq!(batch[0], buiy_core::render::buckets::packed_to_raw(&packed)); + // Pinned payload: snapshot the exact bytes pack_view emits for this node. 
+ assert_instance_hex_snapshot(&packed, "pack_view_node_payload"); } #[test] diff --git a/crates/buiy_core/tests/render_extract.rs b/crates/buiy_core/tests/render_extract.rs index dc3b2da..ee5a298 100644 --- a/crates/buiy_core/tests/render_extract.rs +++ b/crates/buiy_core/tests/render_extract.rs @@ -269,6 +269,7 @@ fn extracted_node_position_follows_global_transform() { use buiy_core::render::extract::{ ExtractedNode, ExtractedNodes, assemble_context_tree, assemble_in_paint_order, }; +use buiy_verify::snapshot::{NameLookup, assert_display_list_snapshot}; #[test] fn extracted_nodes_default_is_empty_with_unit_scale() { @@ -395,6 +396,12 @@ fn nested_context_is_entered_atomically_at_its_parent_position() { // guards: flat-concatenating each context's painters_z paints the nested // descendants [C, D] at the END of their own list instead of between the // parent's A and B. Tree: root R = [A, NESTED, B]; NESTED = [C, D]. + // + // The `assert_eq!(got, vec![root, a, nested, c, d, b])` order check becomes + // a Name-keyed display-list snapshot: the assembled paint order IS the node + // line order in the dump, so the flat-concat regression shows as a line + // reorder (snapshots.md § Tier 2 — "a z-sort regression shows as a line + // reorder, the exact bug class pixels name poorly"). let (root, a, nested, b, c, d) = (e(1), e(2), e(3), e(4), e(5), e(6)); let mut map: std::collections::HashMap<Entity, Vec<Entity>> = std::collections::HashMap::new(); map.insert(root, vec![a, nested, b]); @@ -417,10 +424,22 @@ fn nested_context_is_entered_atomically_at_its_parent_position() { }, &mut out, ); - let got: Vec<Entity> = out.iter().map(|n| n.entity).collect(); - // Root's OWN box paints first, then A, then the whole nested unit (its own - // box NESTED, then C, D), then B — never A, NESTED, B, C, D. 
- assert_eq!(got, vec![root, a, nested, c, d, b]); + let nodes = ExtractedNodes { + nodes: out, + ..Default::default() + }; + // Name the synthetic entities so the dump is diff-stable by Name (not raw + // Entity bits). The dump's node lines read root, a, nested, c, d, b — the + // expected atomic-descent order. + let names = NameLookup::from_pairs([ + (root, "root"), + (a, "a"), + (nested, "nested"), + (b, "b"), + (c, "c"), + (d, "d"), + ]); + assert_display_list_snapshot(&nodes, "nested_context_paint_order", &names); } #[test] diff --git a/crates/buiy_core/tests/render_instance.rs b/crates/buiy_core/tests/render_instance.rs index 29de4d6..90a7b5b 100644 --- a/crates/buiy_core/tests/render_instance.rs +++ b/crates/buiy_core/tests/render_instance.rs @@ -6,6 +6,7 @@ use bevy::prelude::*; use buiy_core::render::DrawData; +use buiy_verify::snapshot::assert_instance_hex_snapshot; // Pure-CPU port of `shader.wgsl::sdf_rounded_rect` (logical px). The view-uniform // path keeps the SDF in logical px with a POSITIVE half_size — no abs() hack. @@ -54,6 +55,10 @@ fn packed_instance_stride_matches_logical_pipeline_descriptor() { fn pack_instance_keeps_position_and_size_in_logical_px() { // No clip conversion, no y-flip baked into the size. The raw logical box // is forwarded; the GPU view uniform (Task 1) does the clip transform. + // The per-field pos/size/radius asserts collapse into one byte-exact hex + // snapshot — it pins every f32 of the packed payload (positive height = NO + // y-flip, radius in logical px = NO 2/min(w,h)), so the half-size sign bug + // or a radius approximation flips the hex (snapshots.md § byte-exact). 
let draw = DrawData::new( Vec2::new(100.0, 50.0), Vec2::new(200.0, 80.0), @@ -61,9 +66,7 @@ fn pack_instance_keeps_position_and_size_in_logical_px() { 12.0, ); let p = pack_instance(&draw); - assert_eq!(p.rect_pos, [100.0, 50.0]); - assert_eq!(p.rect_size, [200.0, 80.0]); // positive height — NO y-flip here - assert_eq!(p.radius, 12.0); // logical px — NO 2/min(w,h) + assert_instance_hex_snapshot(&p, "pack_instance_logical_px"); } #[test] @@ -118,23 +121,25 @@ fn packed_instance_stride_is_52() { #[test] fn pack_extracted_sets_clip_min_max_from_node_clip() { // A node carrying a finite ClipRect packs that box verbatim into - // clip_min/clip_max (the same logical-px space as ClipRect.min/.max). + // clip_min/clip_max (the same logical-px space as ClipRect.min/.max). The + // per-field clip_min/clip_max asserts become one byte-exact hex snapshot + // (it pins the whole packed payload, clip bytes included). let clip = ClipRect { min: Vec2::new(5.0, 6.0), max: Vec2::new(105.0, 206.0), }; let p = pack_extracted(&node_with_clip(Some(clip))); - assert_eq!(p.clip_min, [5.0, 6.0]); - assert_eq!(p.clip_max, [105.0, 206.0]); + assert_instance_hex_snapshot(&p, "pack_extracted_finite_clip"); } #[test] fn pack_extracted_uses_full_view_sentinel_when_clip_absent() { // clip == None packs to clip_min = [-INF; 2], clip_max = [+INF; 2] — for any // finite frag_pos the discard never fires, so the node paints unclipped. + // The hex snapshot pins the ±INFINITY sentinel bytes exactly (so a regression + // to a finite default flips the hex). 
let p = pack_extracted(&node_with_clip(None)); - assert_eq!(p.clip_min, [f32::NEG_INFINITY, f32::NEG_INFINITY]); - assert_eq!(p.clip_max, [f32::INFINITY, f32::INFINITY]); + assert_instance_hex_snapshot(&p, "pack_extracted_sentinel_clip"); } #[test] diff --git a/crates/buiy_core/tests/render_paint_order.rs b/crates/buiy_core/tests/render_paint_order.rs index a5ef587..019961a 100644 --- a/crates/buiy_core/tests/render_paint_order.rs +++ b/crates/buiy_core/tests/render_paint_order.rs @@ -7,9 +7,10 @@ use bevy::prelude::*; use buiy_core::components::StackingContext; use buiy_core::layout::{LayoutPlugin, Stacking, Style, TopLayer}; -use buiy_core::render::extract::{ExtractedNode, assemble_context_tree}; +use buiy_core::render::extract::{ExtractedNode, ExtractedNodes, assemble_context_tree}; use buiy_core::render::top_layer::partition_top_layer; use buiy_core::{CorePlugin, Node}; +use buiy_verify::snapshot::{NameLookup, assert_display_list_snapshot}; fn app() -> App { let mut app = App::new(); @@ -30,26 +31,43 @@ fn top_layer_of(world: &World, e: Entity) -> TopLayer { fn top_layer_tail_is_tier_ordered_fullscreen_to_modal() { let mut app = app(); // Spawn one of each non-None tier as children of a single root. Layout 6f - // escapes them to the root context's tail, tier-sorted. + // escapes them to the root context's tail, tier-sorted. Name-tagged so the + // display-list snapshot is diff-stable by Name (not raw Entity bits). 
let modal = app .world_mut() - .spawn((Node, Style::default().top_layer(TopLayer::Modal))) + .spawn(( + Node, + Name::new("modal"), + Style::default().top_layer(TopLayer::Modal), + )) .id(); let tooltip = app .world_mut() - .spawn((Node, Style::default().top_layer(TopLayer::Tooltip))) + .spawn(( + Node, + Name::new("tooltip"), + Style::default().top_layer(TopLayer::Tooltip), + )) .id(); let popover = app .world_mut() - .spawn((Node, Style::default().top_layer(TopLayer::Popover))) + .spawn(( + Node, + Name::new("popover"), + Style::default().top_layer(TopLayer::Popover), + )) .id(); let fullscreen = app .world_mut() - .spawn((Node, Style::default().top_layer(TopLayer::Fullscreen))) + .spawn(( + Node, + Name::new("fullscreen"), + Style::default().top_layer(TopLayer::Fullscreen), + )) .id(); let root = app .world_mut() - .spawn((Node, Style::default())) + .spawn((Node, Name::new("root"), Style::default())) .add_children(&[modal, tooltip, popover, fullscreen]) .id(); app.update(); @@ -62,12 +80,27 @@ fn top_layer_tail_is_tier_ordered_fullscreen_to_modal() { let world = app.world(); let (_in_flow, tail) = partition_top_layer(&sc.painters_z, |e| top_layer_of(world, e)); - // Render reads the tail verbatim; layout pinned the tier order. Assert it. - assert_eq!( - tail, - vec![fullscreen, tooltip, popover, modal], - "top-layer tail paints Fullscreen < Tooltip < Popover < Modal (paint-order § 3.1)" - ); + // Render reads the tail verbatim; layout pinned the tier order. The + // `assert_eq!(tail, vec![fullscreen, tooltip, popover, modal])` order check + // becomes a Name-keyed display-list snapshot: the tail's paint order reads + // off the node line order (Fullscreen < Tooltip < Popover < Modal, + // paint-order § 3.1), so a tier-sort regression shows as a line reorder. 
+ let nodes = ExtractedNodes { + nodes: tail + .iter() + .map(|&e| ExtractedNode { + entity: e, + position: Vec2::ZERO, + size: Vec2::ONE, + color: Color::WHITE, + clip: None, + group: None, + }) + .collect(), + ..Default::default() + }; + let names = NameLookup::from_world(world); + assert_display_list_snapshot(&nodes, "top_layer_tail_tier_order", &names); } #[test] diff --git a/crates/buiy_core/tests/snapshots/nested_context_paint_order.snap b/crates/buiy_core/tests/snapshots/nested_context_paint_order.snap new file mode 100644 index 0000000..b8fd05c --- /dev/null +++ b/crates/buiy_core/tests/snapshots/nested_context_paint_order.snap @@ -0,0 +1,14 @@ +--- +source: crates/buiy_core/tests/render_extract.rs +expression: nested_context_paint_order +--- +# buiy-display-list-dump v1 +[nodes painters_z] +0 root rect pos=0,0 size=1,1 color=#ffffffff clip=none group=none +1 a rect pos=0,0 size=1,1 color=#ffffffff clip=none group=none +2 nested rect pos=0,0 size=1,1 color=#ffffffff clip=none group=none +3 c rect pos=0,0 size=1,1 color=#ffffffff clip=none group=none +4 d rect pos=0,0 size=1,1 color=#ffffffff clip=none group=none +5 b rect pos=0,0 size=1,1 color=#ffffffff clip=none group=none +[buckets draw-order] +(Quad,layer=0) x6 diff --git a/crates/buiy_core/tests/snapshots/pack_extracted_finite_clip.snap b/crates/buiy_core/tests/snapshots/pack_extracted_finite_clip.snap new file mode 100644 index 0000000..949ce83 --- /dev/null +++ b/crates/buiy_core/tests/snapshots/pack_extracted_finite_clip.snap @@ -0,0 +1,5 @@ +--- +source: crates/buiy_core/tests/render_instance.rs +expression: pack_extracted_finite_clip +--- +000020410000a0410000f041000020420000803f0000803f0000803f0000803f000000000000a0400000c0400000d24200004e43 diff --git a/crates/buiy_core/tests/snapshots/pack_extracted_sentinel_clip.snap b/crates/buiy_core/tests/snapshots/pack_extracted_sentinel_clip.snap new file mode 100644 index 0000000..15858d1 --- /dev/null +++ 
b/crates/buiy_core/tests/snapshots/pack_extracted_sentinel_clip.snap @@ -0,0 +1,5 @@ +--- +source: crates/buiy_core/tests/render_instance.rs +expression: pack_extracted_sentinel_clip +--- +000020410000a0410000f041000020420000803f0000803f0000803f0000803f00000000000080ff000080ff0000807f0000807f diff --git a/crates/buiy_core/tests/snapshots/pack_instance_logical_px.snap b/crates/buiy_core/tests/snapshots/pack_instance_logical_px.snap new file mode 100644 index 0000000..bbdce65 --- /dev/null +++ b/crates/buiy_core/tests/snapshots/pack_instance_logical_px.snap @@ -0,0 +1,5 @@ +--- +source: crates/buiy_core/tests/render_instance.rs +expression: pack_instance_logical_px +--- +0000c84200004842000048430000a0420000803f0000803f0000803f0000803f00004041000080ff000080ff0000807f0000807f diff --git a/crates/buiy_core/tests/snapshots/pack_view_node_payload.snap b/crates/buiy_core/tests/snapshots/pack_view_node_payload.snap new file mode 100644 index 0000000..d98c5d0 --- /dev/null +++ b/crates/buiy_core/tests/snapshots/pack_view_node_payload.snap @@ -0,0 +1,5 @@ +--- +source: crates/buiy_core/tests/render_buckets.rs +expression: pack_view_node_payload +--- +0000e0400000104100004040000080400000803f0000803f0000803f0000803f00000000000080ff000080ff0000807f0000807f diff --git a/crates/buiy_core/tests/snapshots/top_layer_tail_tier_order.snap b/crates/buiy_core/tests/snapshots/top_layer_tail_tier_order.snap new file mode 100644 index 0000000..35e2f6c --- /dev/null +++ b/crates/buiy_core/tests/snapshots/top_layer_tail_tier_order.snap @@ -0,0 +1,12 @@ +--- +source: crates/buiy_core/tests/render_paint_order.rs +expression: top_layer_tail_tier_order +--- +# buiy-display-list-dump v1 +[nodes painters_z] +0 fullscreen rect pos=0,0 size=1,1 color=#ffffffff clip=none group=none +1 tooltip rect pos=0,0 size=1,1 color=#ffffffff clip=none group=none +2 popover rect pos=0,0 size=1,1 color=#ffffffff clip=none group=none +3 modal rect pos=0,0 size=1,1 color=#ffffffff clip=none group=none +[buckets 
draw-order] +(Quad,layer=0) x4 diff --git a/crates/buiy_verify/src/snapshot.rs b/crates/buiy_verify/src/snapshot.rs index 10b2f59..12025a4 100644 --- a/crates/buiy_verify/src/snapshot.rs +++ b/crates/buiy_verify/src/snapshot.rs @@ -270,6 +270,18 @@ impl NameLookup { Self(map) } + /// Build the lookup from explicit `(entity, name)` pairs — the World-free + /// constructor for pure-CPU tests that assemble synthetic `ExtractedNode`s + /// (no spawned `Name` component). Mirrors [`from_world`](Self::from_world); + /// an entity absent from the pairs renders as `entity#<index>`. + pub fn from_pairs<I, S>(pairs: I) -> Self + where + I: IntoIterator<Item = (Entity, S)>, + S: Into<String>, + { + Self(pairs.into_iter().map(|(e, n)| (e, n.into())).collect()) + } + /// The label for `e`: its stored `Name`, else `entity#<index>`. fn label(&self, e: Entity) -> String { self.0 From cef6d0c99de1eb22507a814a55dc49e50f520d2f Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 07:20:13 -0700 Subject: [PATCH 33/70] feat(verify): per-timestamp animation snapshots (Tier-2 opt-in, Decision 8) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `assert_display_list_snapshot_at(app, name, steps)` advances Time<Virtual> to each ABSOLUTE logical timestamp (the landed advance_by manual-clock mechanism — Bevy's TimePlugin syncs Virtual into the generic clock at the head of each update, so no manual mirroring), emits display_list_dump per step, and keys one `.snap` per step `<name>@<ms>` — a timing regression shows in exactly the drifted frame. Pure-CPU: the dump is text, so a 3-sample sequence costs ~3× a single dump, not a pixel capture. Opt-in per fixture (default is end-state only); a fixture enrolls when its TIMING CURVE is the behavior under test. 
Self-tests: - per_timestamp_is_deterministic (plain assert_eq!, non-vacuous): a width-ramp fixture (size.x = 10 + elapsed_ms/10, clock paused so the only progression is the explicit steps) driven through [0,250,500] twice on fresh apps yields byte-identical per-step dumps; width grows 10→35→60 (guards a vacuous all-identical pass). - assert_display_list_snapshot_at_keys_per_step: the public entry point; blesses width_ramp@0/@250/@500. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_core/tests/snapshot_animation.rs | 125 ++++++++++++++++++ .../tests/snapshots/width_ramp@0.snap | 8 ++ .../tests/snapshots/width_ramp@250.snap | 8 ++ .../tests/snapshots/width_ramp@500.snap | 8 ++ 4 files changed, 149 insertions(+) create mode 100644 crates/buiy_core/tests/snapshot_animation.rs create mode 100644 crates/buiy_core/tests/snapshots/width_ramp@0.snap create mode 100644 crates/buiy_core/tests/snapshots/width_ramp@250.snap create mode 100644 crates/buiy_core/tests/snapshots/width_ramp@500.snap diff --git a/crates/buiy_core/tests/snapshot_animation.rs b/crates/buiy_core/tests/snapshot_animation.rs new file mode 100644 index 0000000..0238cb2 --- /dev/null +++ b/crates/buiy_core/tests/snapshot_animation.rs @@ -0,0 +1,125 @@ +//! Task 2.6 self-test for per-timestamp animation snapshots +//! (`assert_display_list_snapshot_at`). The determinism check is a PLAIN +//! `assert_eq!` over the per-step dumps captured on two fresh apps — so the +//! meta-test of the temporal-snapshot tooling cannot pass vacuously +//! (snapshots.md § Per-timestamp, Decision 8). 
+ +use std::time::Duration; + +use bevy::prelude::*; +use buiy_core::CorePlugin; +use buiy_core::components::{Node, ResolvedLayout}; +use buiy_core::layout::{LayoutPlugin, Style}; +use buiy_verify::snapshot::{NameLookup, assert_display_list_snapshot_at}; + +/// A pure-CPU "animation": a system that drives the box `size.x` from the +/// virtual clock (`10 + elapsed_ms/10`), so the display-list dump changes per +/// virtual timestamp — the temporal behavior under test. Deterministic: the +/// size is a pure function of `Time<Virtual>.elapsed()`, which the harness +/// advances to explicit absolute timestamps (no wall-clock). +fn animate_width(time: Res<Time<Virtual>>, mut q: Query<&mut ResolvedLayout, With<Node>>) { + let ms = time.elapsed().as_millis() as f32; + for mut layout in &mut q { + layout.size.x = 10.0 + ms / 10.0; + } +} + +fn anim_app() -> App { + let mut app = App::new(); + app.add_plugins(MinimalPlugins); + app.add_plugins(CorePlugin); + app.add_plugins(LayoutPlugin); + // Pause the virtual clock so the ONLY time progression is the harness's + // explicit advance_to steps (the determinism guarantee). + app.world_mut().resource_mut::<Time<Virtual>>().pause(); + app.add_systems(Update, animate_width.after(buiy_core::BuiySet::Layout)); + app.world_mut().spawn(( + Node, + Name::new("animated"), + Style::default().width_px(10.0).height_px(10.0), + )); + app +} + +/// Snapshot the three logical timestamps on a throwaway app — used by the +/// determinism test to capture dumps WITHOUT going through insta (so the test +/// can `assert_eq!` two runs directly). Mirrors `assert_display_list_snapshot_at` +/// step-driving, but returns the dumps instead of asserting them. 
+fn capture_steps(app: &mut App, steps: &[Duration]) -> Vec<String> { + use buiy_core::render::components::Background; + use buiy_core::render::extract::{ExtractedNode, ExtractedNodes, extracted_node_for}; + use buiy_core::theme::Theme; + use buiy_verify::snapshot::display_list_dump; + + let mut out = Vec::new(); + for &t in steps { + // Advance the virtual clock to the ABSOLUTE timestamp, then update. + let mut virt = app.world_mut().resource_mut::<Time<Virtual>>(); + let elapsed = virt.elapsed(); + virt.advance_by(t.checked_sub(elapsed).unwrap_or(Duration::ZERO)); + app.update(); + + let world = app.world(); + let names = NameLookup::from_world(world); + let theme = world.get_resource::<Theme>().cloned().unwrap_or_default(); + let mut rows: Vec<(String, ExtractedNode)> = Vec::new(); + let mut q = world + .try_query::<(Entity, &ResolvedLayout, Option<&Name>)>() + .unwrap(); + for (e, layout, name) in q.iter(world) { + let gt = world + .get::<GlobalTransform>(e) + .copied() + .unwrap_or(GlobalTransform::IDENTITY); + let bg = world.get::<Background>(e); + let label = name + .map(|n| n.as_str().to_string()) + .unwrap_or_else(|| format!("entity#{}", e.index().index())); + rows.push((label, extracted_node_for(e, &gt, layout, bg, None, &theme))); + } + rows.sort_by(|a, b| a.0.cmp(&b.0)); + let nodes = ExtractedNodes { + nodes: rows.into_iter().map(|(_, n)| n).collect(), + ..Default::default() + }; + out.push(display_list_dump(&nodes, &names)); + } + out +} + +#[test] +fn per_timestamp_is_deterministic() { + // snapshots.md § Per-timestamp: the same timestamps reproduce byte-identical + // dumps across runs — the determinism the fixed virtual clock guarantees. 
+ let steps = [ + Duration::ZERO, + Duration::from_millis(250), + Duration::from_millis(500), + ]; + let a = capture_steps(&mut anim_app(), &steps); + let b = capture_steps(&mut anim_app(), &steps); + assert_eq!(a.len(), 3); + assert_eq!( + a, b, + "per-timestamp dumps must be deterministic across runs" + ); + // And the animation actually MOVES (guards a vacuous all-identical pass): + // width grows 10 → 35 → 60 across t=0/250/500. + assert!(a[0].contains("size=10,"), "t=0 width 10, got:\n{}", a[0]); + assert!(a[1].contains("size=35,"), "t=250 width 35, got:\n{}", a[1]); + assert!(a[2].contains("size=60,"), "t=500 width 60, got:\n{}", a[2]); +} + +#[test] +fn assert_display_list_snapshot_at_keys_per_step() { + // The public entry point: one `.snap` per step keyed `<name>@<ms>`, so a + // timing regression shows in exactly the drifted frame. Opt-in: this fixture + // enrolls BECAUSE its timing curve (the width ramp) is the behavior tested. + let mut app = anim_app(); + let steps = [ + Duration::ZERO, + Duration::from_millis(250), + Duration::from_millis(500), + ]; + assert_display_list_snapshot_at(&mut app, "width_ramp", &steps); +} diff --git a/crates/buiy_core/tests/snapshots/width_ramp@0.snap b/crates/buiy_core/tests/snapshots/width_ramp@0.snap new file mode 100644 index 0000000..741c8bb --- /dev/null +++ b/crates/buiy_core/tests/snapshots/width_ramp@0.snap @@ -0,0 +1,8 @@ +--- +source: crates/buiy_core/tests/snapshot_animation.rs +expression: width_ramp@0 +--- +# buiy-display-list-dump v1 +[nodes painters_z] +0 animated rect pos=0,0 size=10,10 color=#00000000 clip=none group=none +[buckets draw-order] diff --git a/crates/buiy_core/tests/snapshots/width_ramp@250.snap b/crates/buiy_core/tests/snapshots/width_ramp@250.snap new file mode 100644 index 0000000..0a71203 --- /dev/null +++ b/crates/buiy_core/tests/snapshots/width_ramp@250.snap @@ -0,0 +1,8 @@ +--- +source: crates/buiy_core/tests/snapshot_animation.rs +expression: width_ramp@250 +--- +# buiy-display-list-dump v1 
+[nodes painters_z] +0 animated rect pos=0,0 size=35,10 color=#00000000 clip=none group=none +[buckets draw-order] diff --git a/crates/buiy_core/tests/snapshots/width_ramp@500.snap b/crates/buiy_core/tests/snapshots/width_ramp@500.snap new file mode 100644 index 0000000..87ce032 --- /dev/null +++ b/crates/buiy_core/tests/snapshots/width_ramp@500.snap @@ -0,0 +1,8 @@ +--- +source: crates/buiy_core/tests/snapshot_animation.rs +expression: width_ramp@500 +--- +# buiy-display-list-dump v1 +[nodes painters_z] +0 animated rect pos=0,0 size=60,10 color=#00000000 clip=none group=none +[buckets draw-order] From 551eff024a223163422b4394787f43d713a6cd23 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 07:21:33 -0700 Subject: [PATCH 34/70] docs(verify): resolve InstanceBuckets intra-doc link in snapshot `RUSTDOCFLAGS="-D warnings" cargo doc -p buiy_verify` flagged the [`InstanceBuckets`] doc link as unresolved (only `pack_view` was in scope). Add a `#[cfg(doc)]` import so the link resolves without a runtime-unused import. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/snapshot.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/buiy_verify/src/snapshot.rs b/crates/buiy_verify/src/snapshot.rs index 12025a4..73035a7 100644 --- a/crates/buiy_verify/src/snapshot.rs +++ b/crates/buiy_verify/src/snapshot.rs @@ -28,6 +28,8 @@ use std::fmt::Write as _; use bevy::prelude::*; use buiy_core::components::ResolvedLayout; +#[cfg(doc)] +use buiy_core::render::buckets::InstanceBuckets; use buiy_core::render::buckets::pack_view; use buiy_core::render::extract::{ExtractedNode, ExtractedNodes}; use buiy_core::render::instance::PackedInstance; From 70cdd409871a04d309061d12185939ad68d82dd8 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 07:39:37 -0700 Subject: [PATCH 35/70] =?UTF-8?q?feat(layout):=20promote=20tier=5Frank=20?= =?UTF-8?q?=E2=86=92=20pub=20top=5Flayer=5Fpaint=5Frank?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract the private `tier_rank` closure (the top-layer escape sort in sub-pass 6f) into a module-level `pub fn top_layer_paint_rank(TopLayer) -> u8` and re-export it from `layout`. The sort now calls it, so the rank is a SINGLE source of truth shared with the Tier-3 verification harness's `top_layer_dominates` invariant (Task 2.9). The rank (`Fullscreen→0, Tooltip→1, Popover→2, Modal→3, None→u8::MAX`) is deliberately NOT the `TopLayer` enum's declared discriminant order (`None, Modal, Popover, Tooltip, Fullscreen`), so `#[derive(Ord)]` would give the WRONG dominance — callers must compare via this rank. The doc comment pins that, and the new `paint_rank_matches_documented_order` test guards it. Behavior-preserving: all 14 `layout_stacking` integration tests stay green (the escape sort produces the identical tail order). A small, accepted `buiy_core` public-surface add (invariants.md deviation #3 / plan Task 2.8). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_core/src/layout/mod.rs | 2 +- crates/buiy_core/src/layout/systems.rs | 37 +++++++++++++++++------ crates/buiy_core/tests/layout_stacking.rs | 36 ++++++++++++++++++++++ 3 files changed, 64 insertions(+), 11 deletions(-) diff --git a/crates/buiy_core/src/layout/mod.rs b/crates/buiy_core/src/layout/mod.rs index 7cef704..c625de3 100644 --- a/crates/buiy_core/src/layout/mod.rs +++ b/crates/buiy_core/src/layout/mod.rs @@ -21,7 +21,7 @@ pub use style::{LogicalBoxModel, LogicalInset, Style}; pub use systems::{ AnchorNameRegistry, ContentVisibilityMargin, LayoutAnchorWarnedThisFrame, LayoutTaffyComputeCount, LayoutWarnedOnceSession, PostTaffyPositionOverrides, - SyncStylesIterCount, TopLayerActivation, + SyncStylesIterCount, TopLayerActivation, top_layer_paint_rank, }; pub use tree::LayoutTree; pub use types::{ diff --git a/crates/buiy_core/src/layout/systems.rs b/crates/buiy_core/src/layout/systems.rs index ba84f0b..7c43256 100644 --- a/crates/buiy_core/src/layout/systems.rs +++ b/crates/buiy_core/src/layout/systems.rs @@ -3796,6 +3796,30 @@ pub(super) fn compose_transform( t_mat * r_mat * s_mat * m_transform } +/// The top-layer **paint rank**: a total order over [`TopLayer`] variants where +/// a SMALLER rank paints lower (earlier) and a larger rank paints higher +/// (later). Fullscreen sits at the bottom of the top layer (`0`), Modal at the +/// top (`3`); `None` (in-flow, not in the top layer) is the sentinel `u8::MAX`, +/// so any escaping variant outranks (paints below) an in-flow node. +/// +/// This is the SINGLE source of truth for top-layer dominance, shared by the +/// layout escape sort (sub-pass 6f) and the verification harness's +/// `top_layer_dominates` invariant. 
It is deliberately NOT the `TopLayer` +/// enum's declared discriminant order (`None, Modal, Popover, Tooltip, +/// Fullscreen`), so `#[derive(Ord)]` on `TopLayer` would give the WRONG +/// dominance — callers must compare via this rank, never the discriminant. +/// +/// Spec: docs/specs/2026-05-08-buiy-layout-design/stacking-and-top-layer.md § 4. +pub fn top_layer_paint_rank(t: TopLayer) -> u8 { + match t { + TopLayer::Fullscreen => 0, + TopLayer::Tooltip => 1, + TopLayer::Popover => 2, + TopLayer::Modal => 3, + TopLayer::None => u8::MAX, + } +} + /// The spec § 2 union of stacking-context-formation triggers: /// (1) positioned with explicit `z_index`, (2) `Isolation::Isolate`, /// (3) non-identity transform, (4) `Containment.contain ⊇ PAINT/STRICT`, @@ -4110,15 +4134,8 @@ pub(super) fn stacking_context( // An entity that is itself a root does NOT escape (it has no parent // context to escape from) — it forms its own root context, so it is // excluded here to avoid a self-reference in its own `painters_z`. - fn tier_rank(t: TopLayer) -> u8 { - match t { - TopLayer::Fullscreen => 0, - TopLayer::Tooltip => 1, - TopLayer::Popover => 2, - TopLayer::Modal => 3, - TopLayer::None => u8::MAX, - } - } + // The tier rank is the SINGLE source of truth shared with the verification + // harness — see [`top_layer_paint_rank`]. 
let root_ancestor = |start: Entity| -> Entity { let mut cur = start; while let Ok(parent) = parent_chain.get(cur) { @@ -4132,7 +4149,7 @@ pub(super) fn stacking_context( cur }; let mut top_sorted: Vec = activation.order.iter().copied().collect(); - top_sorted.sort_by_cached_key(|&e| tier_rank(top_layer_of(e))); + top_sorted.sort_by_cached_key(|&e| top_layer_paint_rank(top_layer_of(e))); let mut escaped_by_root: std::collections::HashMap> = std::collections::HashMap::new(); for &e in &top_sorted { diff --git a/crates/buiy_core/tests/layout_stacking.rs b/crates/buiy_core/tests/layout_stacking.rs index b36b1c5..34945ea 100644 --- a/crates/buiy_core/tests/layout_stacking.rs +++ b/crates/buiy_core/tests/layout_stacking.rs @@ -417,3 +417,39 @@ fn mixed_top_layer_tiers_order_tooltip_below_modal() { "tooltip paints below modal (earlier in painters_z) regardless of activation" ); } + +#[test] +fn paint_rank_matches_documented_order() { + use buiy_core::layout::top_layer_paint_rank; + + // The single source of truth for top-layer dominance — Fullscreen paints + // BOTTOM (rank 0), Modal paints TOP (rank 3), `None` is the in-flow + // sentinel (`u8::MAX`). The *declared* enum order + // (`None, Modal, Popover, Tooltip, Fullscreen`) is deliberately NOT this + // order, so `#[derive(Ord)]` on `TopLayer` would give the WRONG dominance; + // the rank fn is what callers compare on (spec stacking-and-top-layer.md + // § 4 / verification invariants.md deviation #3). + assert_eq!(top_layer_paint_rank(TopLayer::Fullscreen), 0); + assert_eq!(top_layer_paint_rank(TopLayer::Tooltip), 1); + assert_eq!(top_layer_paint_rank(TopLayer::Popover), 2); + assert_eq!(top_layer_paint_rank(TopLayer::Modal), 3); + assert_eq!(top_layer_paint_rank(TopLayer::None), u8::MAX); + + // The rank is strictly increasing along the documented dominance chain, + // and every escaping variant outranks (paints below) the in-flow sentinel. 
+ let chain = [ + TopLayer::Fullscreen, + TopLayer::Tooltip, + TopLayer::Popover, + TopLayer::Modal, + ]; + for pair in chain.windows(2) { + assert!( + top_layer_paint_rank(pair[0]) < top_layer_paint_rank(pair[1]), + "{:?} must paint below {:?}", + pair[0], + pair[1], + ); + assert!(top_layer_paint_rank(pair[0]) < top_layer_paint_rank(TopLayer::None)); + } +} From c65df829420f3771acd2671114fee962a7c9ace2 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 08:00:59 -0700 Subject: [PATCH 36/70] =?UTF-8?q?feat(verify):=20Tier-3=20invariant=20modu?= =?UTF-8?q?le=20=E2=80=94=20Scene=20model=20+=20generators=20+=20realize?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `buiy_verify::invariant` (gate #12 scaffolding) and its `scene` submodule: the abstract `Scene`/`SceneNode`/`GenTransform` model, the bounded `proptest` generators (`arb_scene`, `prop_recursive` depth/breadth caps, all-five-`TopLayer` reachable, identity transform reachable for shrinking), and `realize` — the bridge that threads a generated `Scene` through the PRODUCTION CPU paint assembly (`context_tree_paint_order`, mirroring sub-pass 6f's painters_z build + top-layer escape via the promoted `top_layer_paint_rank`) into the flat `ExtractedNodes` the predicates assert on. No GPU, no `World`. Two real `realize` bugs found + fixed under the smoke proptest (their shrunk seeds are committed in scene_generator_smoke.proptest-regressions so they re-run forever): - a top-layer node that is itself a forest ROOT listed itself in its own painters_z (it must not escape — no parent context), which made the production walk recurse to a stack overflow; - only the FIRST forest root was marked is_root, so a plain second root formed no context and silently dropped its whole subtree. 
Both are now regression-pinned by unit tests (`two_root_forest_realizes_all`, `top_layer_root_does_not_self_reference`) plus the bounds + round-trip proptests (`arb_scene_respects_bounds`, `realize_round_trips_every_node`, 256 cases each). A debug-gated structural guard asserts the bridge never hands the production walk a self-referential or double-listed context (a `realize`-correctness invariant, not the code under test). Deviation: `realize` reuses `context_tree_paint_order` verbatim and MIRRORS sub-pass 6f's `partition_top_layer` split inline (the escaped tail is merged into each root context's painters_z exactly as 6f does), rather than calling `partition_top_layer` on the flattened global order — that fn operates on one root context's list, and a top-layer ROOT legitimately paints first, so feeding it the global order would wrongly reorder. Global top-layer dominance is the `top_layer_dominates` predicate's job (Task 2.9). Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/invariant.rs | 29 + crates/buiy_verify/src/invariant/scene.rs | 646 ++++++++++++++++++ crates/buiy_verify/src/lib.rs | 1 + ...scene_generator_smoke.proptest-regressions | 8 + .../tests/scene_generator_smoke.rs | 101 +++ 5 files changed, 785 insertions(+) create mode 100644 crates/buiy_verify/src/invariant.rs create mode 100644 crates/buiy_verify/src/invariant/scene.rs create mode 100644 crates/buiy_verify/tests/scene_generator_smoke.proptest-regressions create mode 100644 crates/buiy_verify/tests/scene_generator_smoke.rs diff --git a/crates/buiy_verify/src/invariant.rs b/crates/buiy_verify/src/invariant.rs new file mode 100644 index 0000000..253d624 --- /dev/null +++ b/crates/buiy_verify/src/invariant.rs @@ -0,0 +1,29 @@ +//! Tier 3 — metamorphic & property invariants (invariants.md). +//! +//! The `proptest`-driven middle rung of the verification pyramid: generated +//! scene strategies plus a fixed set of predicate functions asserting +//! 
*relations* over the CPU display-list and shaper output — no golden, no +//! oracle. It catches paint-order / transform / top-layer / finiteness / +//! BiDi-caret regressions over an unbounded fixture space, pure-CPU and +//! deterministic given a seed (gate #12). +//! +//! The [`scene`] module holds the abstract [`Scene`] model + the `proptest` +//! generators ([`arb_scene`]), plus [`realize`], which threads a `Scene` +//! through the PRODUCTION CPU paint-order walk +//! ([`context_tree_paint_order`](buiy_core::render::extract::context_tree_paint_order); +//! the [`partition_top_layer`](buiy_core::render::top_layer::partition_top_layer) +//! tail split is mirrored inline, not called, and ranked by the promoted +//! [`top_layer_paint_rank`](buiy_core::layout::top_layer_paint_rank)) into the +//! flat paint-ordered node list the predicates assert on — no GPU, no `World`. +//! +//! The predicate functions, their `proptest!` harness, and the mutation +//! meta-tests land in their own tasks (2.9, 2.10); each predicate is a free +//! `pub fn` taking borrowed data and returning `Result<(), Violation>` so a +//! failing property prints *which* relation broke and the offending +//! names/indices. The harness + meta-tests live in the test crate +//! (`crates/buiy_verify/tests/invariant_*.rs`), not here, so a property failure +//! re-runs from its committed `proptest-regressions/` seed under the ordinary +//! `cargo test` gate. + +pub mod scene; +pub use scene::{GenTransform, Realized, Scene, SceneNode, SceneParams, arb_scene, realize}; diff --git a/crates/buiy_verify/src/invariant/scene.rs b/crates/buiy_verify/src/invariant/scene.rs new file mode 100644 index 0000000..0d1938d --- /dev/null +++ b/crates/buiy_verify/src/invariant/scene.rs @@ -0,0 +1,646 @@ +//! The abstract [`Scene`] model + `proptest` generators, and [`realize`] — +//! the bridge that threads a generated `Scene` through the PRODUCTION CPU +//! paint-order assembly into the flat [`ExtractedNodes`] list the predicates +//! 
assert on (invariants.md § "Scene generators"). +//! +//! We generate an abstract `Scene` (not raw Bevy `World`s) so shrinking yields +//! a minimal, printable counterexample and the predicates stay world-agnostic. +//! `realize` does the heavy lifting: it assigns each node a synthetic `Entity`, +//! decides stacking-context formation, builds each forming node's `painters_z` +//! exactly as layout sub-pass 6f does (document order, stop-at-nested-context, +//! stable z-tier sort, top-layer escape), then runs the *production* +//! [`context_tree_paint_order`] over a tree whose tails mirror, inline, the +//! [`partition_top_layer`](buiy_core::render::top_layer::partition_top_layer) +//! split and are ranked with the promoted [`top_layer_paint_rank`], so the walk +//! and the tier rank are the engine's own code, not a re-implementation. + +use bevy::prelude::*; +use proptest::prelude::*; + +use buiy_core::layout::{TopLayer, top_layer_paint_rank}; +use buiy_core::render::components::ClipRect; +use buiy_core::render::extract::{ExtractedNode, ExtractedNodes, context_tree_paint_order}; + +// --------------------------------------------------------------------------- +// The abstract scene model. +// --------------------------------------------------------------------------- + +/// A generated node in a bounded hierarchy. `name` is the stable identity used +/// in diagnostics (mirrors Tier 2's `Name`-based dump — never raw `Entity` +/// bits). A shrunk counterexample prints via `Debug` and reproduces from the +/// committed seed alone. +#[derive(Clone, Debug, PartialEq)] +pub struct SceneNode { + /// Unique within a `Scene` (`n0`, `n1`, …), assigned by a post-generation + /// pre-order rename so the tree is reproducible and printable. + pub name: String, + /// Child subtrees, in document order. + pub children: Vec, + /// Positioned `z-index`; drives stacking-context formation + the paint + /// tier. `None` == auto/static (in-flow document order).
+ pub z_index: Option, + /// `Isolation::Isolate` — forces a stacking context even with no z/transform. + pub isolation: bool, + /// Top-layer participation. `None` for the bulk; a non-`None` variant + /// escapes its parent context to the root top layer (ordered by + /// [`top_layer_paint_rank`]). + pub top_layer: TopLayer, + /// The `compose_transform` inputs (a non-identity transform forms a context). + pub transform: GenTransform, + /// Logical-px box (always finite, `≥ 0` by construction). + pub size: (f32, f32), + /// Resolved background color (never the magenta missing-token sentinel). + pub background: Option<[f32; 4]>, +} + +/// A generated scene: a forest of root subtrees (typically one root). +#[derive(Clone, Debug, PartialEq)] +pub struct Scene { + /// Root subtrees, in document order. + pub roots: Vec, +} + +/// The `compose_transform` input space (invariants.md § "Scene generators"): +/// the longhand `Translate` (px), `Rotate` (axis-angle), `Scale` (per-axis), +/// all finite and away from the degenerate `0`. The identity (all-default) +/// case is always reachable for shrinking. This is the generator-side mirror +/// of `buiy_core`'s `Translate`/`Rotate`/`Scale` longhands; `transform_roundtrips` +/// feeds it straight through `compose_transform`. +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct GenTransform { + /// Translation in logical px (`x`, `y`, `z`). + pub translate: [f32; 3], + /// Rotation as an axis-angle: unit-ish axis (`x`, `y`, `z`) + angle (rad). + pub rotate_axis: [f32; 3], + pub rotate_angle: f32, + /// Per-axis scale (away from `0`). + pub scale: [f32; 3], +} + +impl GenTransform { + /// The identity transform (all factors neutral). The shrink target. 
+ pub const IDENTITY: GenTransform = GenTransform { + translate: [0.0, 0.0, 0.0], + rotate_axis: [0.0, 0.0, 1.0], + rotate_angle: 0.0, + scale: [1.0, 1.0, 1.0], + }; + + /// `true` when this is (numerically) the identity — the formation trigger + /// "non-identity transform" (a forming context). Uses an exact compare + /// against the neutral factors; the generator only ever emits the exact + /// `IDENTITY` or a deliberately non-trivial transform, so no epsilon is + /// needed here. + pub fn is_identity(&self) -> bool { + self.translate == [0.0, 0.0, 0.0] + && self.rotate_angle == 0.0 + && self.scale == [1.0, 1.0, 1.0] + } +} + +// --------------------------------------------------------------------------- +// Generator budget + strategies. +// --------------------------------------------------------------------------- + +/// Bounded generator budget so the property space is finite-depth and shrinking +/// terminates fast (invariants.md § "Strategy budget"). +#[derive(Clone, Copy, Debug)] +pub struct SceneParams { + /// Hierarchy depth cap. + pub max_depth: u32, + /// Children-per-node cap. + pub max_breadth: u32, + /// Total-node guard (prevents blow-up; `prop_recursive`'s `desired_size`). + pub max_nodes: u32, + /// P(a node forms a context via z/isolation). + pub p_stacking: f64, + /// P(a node escapes to the top layer). + pub p_top_layer: f64, +} + +impl Default for SceneParams { + fn default() -> Self { + Self { + max_depth: 4, + max_breadth: 4, + max_nodes: 24, + p_stacking: 0.3, + p_top_layer: 0.1, + } + } +} + +/// Strategy for a single `GenTransform`. Skewed to the identity (the common + +/// shrink case) but reaches a finite, well-conditioned non-identity transform: +/// translate in `-512..512`, rotate angle in `0..2π` about an axis with a +/// non-zero component, scale in `0.1..8.0` per axis (away from `0`). +fn arb_transform() -> impl Strategy { + prop_oneof![ + // Weighted heavily toward identity so most generated nodes are in-flow. 
+ 3 => Just(GenTransform::IDENTITY), + 1 => ( + // translate + (-512.0f32..512.0, -512.0f32..512.0, -512.0f32..512.0), + // rotate axis (kept non-degenerate by forcing z away from 0) + angle + (-1.0f32..1.0, -1.0f32..1.0, 0.1f32..1.0), + 0.0f32..std::f32::consts::TAU, + // scale away from 0 + (0.1f32..8.0, 0.1f32..8.0, 0.1f32..8.0), + ) + .prop_map(|(t, axis, angle, s)| GenTransform { + translate: [t.0, t.1, t.2], + rotate_axis: [axis.0, axis.1, axis.2], + rotate_angle: angle, + scale: [s.0, s.1, s.2], + }), + ] +} + +/// Strategy for one node's leaf attributes (everything but `children`/`name`). +/// `z_index` is drawn from the interesting `{-1, 0, 1, 2}` partition +/// (negative/zero/positive), gated by `p_stacking`; `top_layer` from all five +/// variants skewed to `None`, gated by `p_top_layer`. +fn arb_leaf(p: SceneParams) -> impl Strategy { + let z_strategy = prop::option::weighted( + p.p_stacking, + prop_oneof![Just(-1i32), Just(0), Just(1), Just(2)], + ); + let isolation = prop::bool::weighted(p.p_stacking); + let top_layer = arb_top_layer(p.p_top_layer); + let size = (0.0f32..512.0, 0.0f32..512.0); + let background = prop::option::of((0.0f32..1.0, 0.0f32..1.0, 0.0f32..1.0, 0.0f32..1.0)); + + ( + z_strategy, + isolation, + top_layer, + arb_transform(), + size, + background, + ) + .prop_map(|(z, iso, tl, transform, size, bg)| SceneNode { + // Placeholder name; `realize`/`arb_scene` rename pre-order. + name: String::new(), + children: Vec::new(), + z_index: z, + isolation: iso, + top_layer: tl, + transform, + size: (size.0, size.1), + background: bg.map(|(r, g, b, a)| [r, g, b, a]), + }) +} + +/// Strategy for `TopLayer`, all five variants reachable but heavily skewed to +/// `None` (the common in-flow case). Every escaping variant MUST be reachable +/// so `top_layer_dominates` exercises the full tier rank, not just `Modal`. 
+fn arb_top_layer(p_top: f64) -> impl Strategy { + let escape = prop_oneof![ + Just(TopLayer::Fullscreen), + Just(TopLayer::Tooltip), + Just(TopLayer::Popover), + Just(TopLayer::Modal), + ]; + prop::option::weighted(p_top, escape).prop_map(|opt| opt.unwrap_or(TopLayer::None)) +} + +/// Generate a bounded, shrinkable [`Scene`]. `prop_recursive` bounds depth + +/// node count so the tree is finite and shrinks toward the empty/shallow scene +/// (invariants.md § "Strategy budget"). Names are assigned by a final pre-order +/// rename (`n0..nK`) so a shrunk counterexample is reproducible and printable. +pub fn arb_scene(p: SceneParams) -> impl Strategy { + let leaf = arb_leaf(p); + let tree = leaf.prop_recursive(p.max_depth, p.max_nodes, p.max_breadth, move |inner| { + ( + arb_leaf(p), + prop::collection::vec(inner, 0..=p.max_breadth as usize), + ) + .prop_map(|(mut node, children)| { + node.children = children; + node + }) + }); + // A scene is a small forest (1..=2 roots) so the multi-root cross-tree + // case is reachable, but most scenes are single-rooted (the common case). + prop::collection::vec(tree, 1..=2).prop_map(|mut roots| { + let mut counter = 0u32; + for root in &mut roots { + rename_preorder(root, &mut counter); + } + Scene { roots } + }) +} + +/// Pre-order rename so every node gets a unique, stable `nK` name. +fn rename_preorder(node: &mut SceneNode, counter: &mut u32) { + node.name = format!("n{counter}"); + *counter += 1; + for child in &mut node.children { + rename_preorder(child, counter); + } +} + +// --------------------------------------------------------------------------- +// `realize` — Scene → ExtractedNodes through the production paint path. +// --------------------------------------------------------------------------- + +/// A realized scene: the flat paint-ordered [`ExtractedNodes`] PLUS the +/// per-node stacking-context membership the generator recorded (consumed by +/// `contexts_do_not_interleave`). 
Kept together so the predicate sees the same +/// context assignment `realize` used. +#[derive(Debug, Clone)] +pub struct Realized { + /// The flat paint-ordered node list (the production order). + pub nodes: ExtractedNodes, + /// `entity → owning stacking-context root entity`, for every painted node. + pub context_of: std::collections::HashMap, + /// `entity → node name`, for diagnostics. + pub name_of: std::collections::HashMap, +} + +/// Realize a [`Scene`] into the flat paint-ordered [`ExtractedNodes`] the +/// predicates assert on, through the PRODUCTION CPU paint assembly. No GPU, no +/// `World`: every node maps to a synthetic `Entity` (pre-order index), each +/// forming context's `painters_z` is built exactly as layout sub-pass 6f does, +/// and the global order comes from the production [`context_tree_paint_order`] +/// over tails that mirror inline (without calling it) the split done by +/// [`partition_top_layer`](buiy_core::render::top_layer::partition_top_layer), +/// with the escaped top-layer members ordered by [`top_layer_paint_rank`]. +pub fn realize(scene: &Scene) -> ExtractedNodes { + realize_full(scene).nodes +} + +/// [`realize`] plus the context-membership map (`contexts_do_not_interleave` +/// needs it). Pure-CPU. +pub fn realize_full(scene: &Scene) -> Realized { + let mut flat: Vec = Vec::new(); + // Index every node in pre-order; record parent + the synthetic entity. + for (root_i, root) in scene.roots.iter().enumerate() { + // EVERY forest root forms its own root stacking context (not just the + // first) — each is a context tree the production walk runs from. + let _ = root_i; + flatten(root, None, true, &mut flat); + } + + // entity-keyed views.
+ let entity_of: std::collections::HashMap = flat + .iter() + .map(|n| { + ( + n.idx, + Entity::from_raw_u32(n.idx as u32 + 1).expect("nonzero index"), + ) + }) + .collect(); + let name_of: std::collections::HashMap = flat + .iter() + .map(|n| (entity_of[&n.idx], n.name.clone())) + .collect(); + + // Which nodes FORM a stacking context (root | isolation | z | transform). + let forms: std::collections::HashSet = flat + .iter() + .filter(|n| n.forms_context()) + .map(|n| n.idx) + .collect(); + + // children-by-parent, in document order. + let mut children_of: std::collections::HashMap> = + std::collections::HashMap::new(); + for n in &flat { + if let Some(p) = n.parent { + children_of.entry(p).or_default().push(n.idx); + } + } + + // The root context each node belongs to: the nearest forming ancestor + // (inclusive of self iff self forms). Used for context membership + escape. + let by_idx: std::collections::HashMap = + flat.iter().map(|n| (n.idx, n)).collect(); + let root_context = |mut idx: usize| -> usize { + loop { + if forms.contains(&idx) { + return idx; + } + match by_idx[&idx].parent { + Some(p) => idx = p, + None => return idx, // a root always forms; defensive + } + } + }; + // The OUTERMOST (tree-root) ancestor of a node — the context an escaped + // top-layer member attaches to (mirrors sub-pass 6f's `root_ancestor`, + // systems.rs § 4). Distinct from `root_context`: escape always goes to the + // top of the tree so a top-layer node paints after EVERY normal node, not + // just after the normal nodes of a nested context. + let tree_root = |mut idx: usize| -> usize { + while let Some(p) = by_idx[&idx].parent { + idx = p; + } + idx + }; + + // Build each forming context's `painters_z` (sub-pass 6f mirror): + // descendants in document order, STOP descending at a nested context + // (it appears as an atomic entry), EXCLUDE top-layer members (they + // escape), then a STABLE sort by the (tier, z) paint key. 
+ let mut painters_z: std::collections::HashMap> = + std::collections::HashMap::new(); + for &ctx in &forms { + let mut painters = Vec::new(); + collect_painters(ctx, &children_of, &forms, &by_idx, &mut painters); + // Stable sort by the document-tier paint key (negative-z first, then + // in-flow, then auto-positioned, then positive-z ascending). The Vec is + // already in document order so equal-key entries keep it (spec § 2.1). + painters.sort_by_key(|&i| paint_key(by_idx[&i])); + painters_z.insert(ctx, painters); + } + + // Escaped top-layer members attach to their root-ancestor context's tail, + // ordered by `top_layer_paint_rank` (Fullscreen bottom < … < Modal top), + // stable within a tier (activation = document order here). + let mut escaped_by_ctx: std::collections::HashMap> = + std::collections::HashMap::new(); + for n in &flat { + if n.top_layer != TopLayer::None { + // A node that is itself a ROOT does NOT escape — it has no parent + // context to escape from, so it forms its own root context normally + // (mirrors sub-pass 6f's `if r != e` guard, systems.rs § 4). Only a + // top-layer node WITH a parent escapes, attaching to the OUTERMOST + // (tree-root) context so it paints after EVERY normal node. + if n.parent.is_some() { + let host = tree_root(n.idx); + escaped_by_ctx.entry(host).or_default().push(n.idx); + } + } + } + for tail in escaped_by_ctx.values_mut() { + tail.sort_by_key(|&i| top_layer_paint_rank(by_idx[&i].top_layer)); + } + + // Resolve a node index → its `painters_z` slice (or `None` for a + // non-context painter), the exact contract `context_tree_paint_order` wants. + // We thread by ENTITY so we can reuse the production fn verbatim. + let idx_of_entity: std::collections::HashMap = + entity_of.iter().map(|(i, e)| (*e, *i)).collect(); + // Build entity-keyed painters_z (in-flow only; the escaped tail is appended + // per-root below, mirroring sub-pass 6f's `painters_z.extend(escaped)`). 
+ let painters_z_entities: std::collections::HashMap> = painters_z + .iter() + .map(|(&ctx, painters)| { + let mut list: Vec = painters.iter().map(|i| entity_of[i]).collect(); + if let Some(escaped) = escaped_by_ctx.get(&ctx) { + list.extend(escaped.iter().map(|i| entity_of[i])); + } + (entity_of[&ctx], list) + }) + .collect(); + + let painters_z_of = + |e: Entity| -> Option<&[Entity]> { painters_z_entities.get(&e).map(|v| v.as_slice()) }; + + // Structural invariant (debug-gated): the context tree we hand the + // production walk must be well-formed — no entity appears in two + // `painters_z` lists and no context lists itself — otherwise + // `context_tree_paint_order` would recurse forever. This guards `realize` + // against future regressions in the escape / collection logic; it is a + // property of the BRIDGE, not of the code under test, so it is a + // `debug_assert` (off in release proptest runs). + #[cfg(debug_assertions)] + { + let mut seen: std::collections::HashSet = std::collections::HashSet::new(); + for (&ctx, list) in &painters_z_entities { + for &p in list { + debug_assert_ne!(p, ctx, "realize produced a self-referential context"); + debug_assert!( + seen.insert(p), + "realize listed entity {p:?} in two painters_z lists" + ); + } + } + } + + // Walk the production context-tree paint order from each forest root. + let mut order: Vec = Vec::new(); + for (root_i, _root) in scene.roots.iter().enumerate() { + let root_idx = root_preorder_index(scene, root_i); + context_tree_paint_order(entity_of[&root_idx], &painters_z_of, &mut order); + } + + // The escaped top-layer members were merged into each ROOT context's + // `painters_z` tail via the production split — layout sub-pass 6f computes + // that tail with `partition_top_layer` and appends it + // (`painters_z.extend(escaped)`), exactly what `realize` mirrors above — so + // the production walk placed the tail after the in-flow painters and `order` + // IS the paint order. 
(`partition_top_layer` operates on ONE root context's + // list, not the flattened multi-context order: a top-layer ROOT legitimately + // paints first as its own tree's root, so feeding the global `order` through + // it would wrongly reorder. Global top-layer DOMINANCE is the job of the + // `top_layer_dominates` predicate, not of this bridge.) + + // Build the ExtractedNode for each entity in paint order. + let nodes: Vec = order + .iter() + .map(|&e| { + let n = by_idx[&idx_of_entity[&e]]; + extracted_node(e, n) + }) + .collect(); + + // context membership map (entity → owning context root entity). + let context_of: std::collections::HashMap = order + .iter() + .map(|&e| { + let idx = idx_of_entity[&e]; + (e, entity_of[&root_context(idx)]) + }) + .collect(); + + Realized { + nodes: ExtractedNodes { + nodes, + ..Default::default() + }, + context_of, + name_of, + } +} + +/// One flattened node with its pre-order index + parent link. +struct FlatNode { + idx: usize, + parent: Option, + is_root: bool, + name: String, + z_index: Option, + isolation: bool, + top_layer: TopLayer, + transform: GenTransform, + size: (f32, f32), + background: Option<[f32; 4]>, +} + +impl FlatNode { + /// The stacking-context formation triggers we model (invariants.md): root, + /// `Isolation::Isolate`, positioned `z-index`, non-identity transform, and + /// — so it hosts its own escaped subtree — any top-layer member (a top-layer + /// node always escapes as a context root, paint-order § 4.1). + fn forms_context(&self) -> bool { + self.is_root + || self.isolation + || self.z_index.is_some() + || !self.transform.is_identity() + || self.top_layer != TopLayer::None + } +} + +/// Flatten the tree pre-order, assigning monotonic indices. 
+fn flatten(node: &SceneNode, parent: Option, is_root: bool, out: &mut Vec) { + let idx = out.len(); + out.push(FlatNode { + idx, + parent, + is_root, + name: node.name.clone(), + z_index: node.z_index, + isolation: node.isolation, + top_layer: node.top_layer, + transform: node.transform, + size: node.size, + background: node.background, + }); + for child in &node.children { + flatten(child, Some(idx), false, out); + } +} + +/// The pre-order index of root `root_i` in the flattened forest. +fn root_preorder_index(scene: &Scene, root_i: usize) -> usize { + let mut count = 0usize; + for r in &scene.roots[..root_i] { + count += subtree_size(r); + } + count +} + +fn subtree_size(node: &SceneNode) -> usize { + 1 + node.children.iter().map(subtree_size).sum::() +} + +/// Collect a context's in-flow painters (sub-pass 6f mirror) by descending +/// from `cur`: walk descendants in document order, STOP at a nested forming +/// context (which appears as an atomic entry), EXCLUDE top-layer members (they +/// escape elsewhere). +fn collect_painters( + cur: usize, + children_of: &std::collections::HashMap>, + forms: &std::collections::HashSet, + by_idx: &std::collections::HashMap, + out: &mut Vec, +) { + let Some(kids) = children_of.get(&cur) else { + return; + }; + for &child in kids { + if by_idx[&child].top_layer != TopLayer::None { + // Top-layer member escapes — not in any in-flow painters list. + continue; + } + out.push(child); + // Descend only if the child does NOT itself form a context (a nested + // context root appears as a single atomic entry; its descendants live + // in its own painters_z). + if !forms.contains(&child) { + collect_painters(child, children_of, forms, by_idx, out); + } + } +} + +/// The (tier, z) paint key — the generator-side mirror of `buiy_core`'s +/// `paint_key` (which is `pub(super)`): negative-z first (tier 0), in-flow +/// non-positioned (tier 1), auto-positioned (tier 2), positive-z ascending +/// (tier 3). 
A node is "positioned" here iff it has an explicit `z_index`. +fn paint_key(n: &FlatNode) -> (u8, i32) { + match n.z_index { + Some(z) if z < 0 => (0, z), + None => (1, 0), + Some(0) => (3, 0), + Some(z) => (3, z), + } +} + +/// Build the `ExtractedNode` for one realized node. Position is a deterministic +/// per-index offset (the geometry the predicates assert on is `size`, which +/// comes straight from the generated box); `clip` mirrors the production +/// full-view sentinel (`None`) for top-layer members and `Some(box)` otherwise. +fn extracted_node(entity: Entity, n: &FlatNode) -> ExtractedNode { + let position = Vec2::new((n.idx as f32) * 8.0, (n.idx as f32) * 8.0); + let size = Vec2::new(n.size.0, n.size.1); + let color = match n.background { + Some([r, g, b, a]) => Color::srgba(r, g, b, a), + None => Color::NONE, + }; + let clip = if n.top_layer != TopLayer::None { + // Top-layer members are unclipped (full-view sentinel, § 3.2). + None + } else { + Some(ClipRect { + min: position, + max: position + size, + }) + }; + ExtractedNode { + entity, + position, + size, + color, + clip, + group: None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn plain(name: &str, children: Vec) -> SceneNode { + SceneNode { + name: name.to_string(), + children, + z_index: None, + isolation: false, + top_layer: TopLayer::None, + transform: GenTransform::IDENTITY, + size: (10.0, 10.0), + background: None, + } + } + + /// Regression: a MULTI-root forest realizes EVERY root's subtree, not just + /// the first. (The first cut marked only `roots[0]` as `is_root`, so a + /// plain second root formed no context and silently dropped its children.) 
+ #[test] + fn two_root_forest_realizes_all() { + let scene = Scene { + roots: vec![plain("n0", vec![]), plain("n1", vec![plain("n2", vec![])])], + }; + let nodes = realize(&scene); + assert_eq!( + nodes.nodes.len(), + 3, + "all 3 nodes across both roots realized" + ); + } + + /// A top-layer node that is itself a forest ROOT does NOT escape (no parent + /// context to escape to) — it must still realize exactly once, never list + /// itself in its own `painters_z`. + #[test] + fn top_layer_root_does_not_self_reference() { + let mut root = plain("n0", vec![plain("n1", vec![])]); + root.top_layer = TopLayer::Modal; + let scene = Scene { roots: vec![root] }; + let nodes = realize(&scene); + assert_eq!(nodes.nodes.len(), 2, "the top-layer root + its child"); + } +} diff --git a/crates/buiy_verify/src/lib.rs b/crates/buiy_verify/src/lib.rs index cb65954..f1c84ee 100644 --- a/crates/buiy_verify/src/lib.rs +++ b/crates/buiy_verify/src/lib.rs @@ -6,6 +6,7 @@ pub mod a11y; pub mod contrast; +pub mod invariant; pub mod metric; pub mod reftest; pub mod snapshot; diff --git a/crates/buiy_verify/tests/scene_generator_smoke.proptest-regressions b/crates/buiy_verify/tests/scene_generator_smoke.proptest-regressions new file mode 100644 index 0000000..13ee74e --- /dev/null +++ b/crates/buiy_verify/tests/scene_generator_smoke.proptest-regressions @@ -0,0 +1,8 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. 
+cc c1e781fc4b74a574e5ba2a3d28068313f898c678711564d2619b363457dac0f2 # shrinks to scene = Scene { roots: [SceneNode { name: "n0", children: [], z_index: None, isolation: false, top_layer: Fullscreen, transform: GenTransform { translate: [0.0, 0.0, 0.0], rotate_axis: [0.0, 0.0, 1.0], rotate_angle: 0.0, scale: [1.0, 1.0, 1.0] }, size: (0.0, 0.0), background: None }] } +cc f647226cb04c6d570ae3403a26e7abf6351f914a9bd841cfc07dc620945290ee # shrinks to scene = Scene { roots: [SceneNode { name: "n0", children: [], z_index: None, isolation: false, top_layer: None, transform: GenTransform { translate: [0.0, 0.0, 0.0], rotate_axis: [0.0, 0.0, 1.0], rotate_angle: 0.0, scale: [1.0, 1.0, 1.0] }, size: (0.0, 0.0), background: None }, SceneNode { name: "n1", children: [SceneNode { name: "n2", children: [], z_index: None, isolation: false, top_layer: None, transform: GenTransform { translate: [0.0, 0.0, 0.0], rotate_axis: [0.0, 0.0, 1.0], rotate_angle: 0.0, scale: [1.0, 1.0, 1.0] }, size: (0.0, 0.0), background: None }], z_index: None, isolation: false, top_layer: None, transform: GenTransform { translate: [0.0, 0.0, 0.0], rotate_axis: [0.0, 0.0, 1.0], rotate_angle: 0.0, scale: [1.0, 1.0, 1.0] }, size: (0.0, 0.0), background: None }] } diff --git a/crates/buiy_verify/tests/scene_generator_smoke.rs b/crates/buiy_verify/tests/scene_generator_smoke.rs new file mode 100644 index 0000000..3854794 --- /dev/null +++ b/crates/buiy_verify/tests/scene_generator_smoke.rs @@ -0,0 +1,101 @@ +//! Task 2.7 RED — the `arb_scene` generator's structural bounds + the `realize` +//! round-trip. Plain `proptest!`/`assert!` (NOT a snapshot) so it cannot pass +//! vacuously: it pins that the generator terminates within its depth budget and +//! that `realize` threads every node through the production paint path exactly +//! once (invariants.md § "Scene generators"). 
+ +use std::collections::HashSet; + +use buiy_verify::invariant::{Scene, SceneNode, SceneParams, arb_scene, realize}; +use proptest::prelude::*; + +/// Max nesting depth of a scene forest (a single root is depth 1). +fn scene_depth(scene: &Scene) -> u32 { + scene.roots.iter().map(node_depth).max().unwrap_or(0) +} + +fn node_depth(node: &SceneNode) -> u32 { + 1 + node.children.iter().map(node_depth).max().unwrap_or(0) +} + +/// Total node count of a scene forest. +fn scene_node_count(scene: &Scene) -> usize { + scene.roots.iter().map(subtree_count).sum() +} + +fn subtree_count(node: &SceneNode) -> usize { + 1 + node.children.iter().map(subtree_count).sum::() +} + +/// Collect every node name in the scene (the generator renames pre-order). +fn scene_names(scene: &Scene) -> Vec { + let mut out = Vec::new(); + for root in &scene.roots { + collect_names(root, &mut out); + } + out +} + +fn collect_names(node: &SceneNode, out: &mut Vec) { + out.push(node.name.clone()); + for child in &node.children { + collect_names(child, out); + } +} + +proptest! { + #![proptest_config(ProptestConfig { cases: 256, ..ProptestConfig::default() })] + + /// `prop_recursive` HARD-caps recursion depth at `max_depth` (proptest + /// guarantee), so a generated scene never nests deeper than the budget. The + /// node count is a soft statistical target, so we bound it only by the true + /// structural maximum (full `max_breadth`-ary tree of `max_depth` levels per + /// root), never the soft `max_nodes`. + #[test] + fn arb_scene_respects_bounds(scene in arb_scene(SceneParams::default())) { + let p = SceneParams::default(); + prop_assert!( + scene_depth(&scene) <= p.max_depth, + "depth {} exceeds max_depth {}", + scene_depth(&scene), + p.max_depth, + ); + + // Structural hard cap: at most `Σ breadth^level` nodes per root tree. 
+ let per_root: usize = (0..p.max_depth) + .map(|l| (p.max_breadth as usize).pow(l)) + .sum(); + let forest_cap = per_root.saturating_mul(2); // up to 2 roots + prop_assert!( + scene_node_count(&scene) <= forest_cap, + "node count {} exceeds structural cap {}", + scene_node_count(&scene), + forest_cap, + ); + + // Names are unique (the pre-order rename) and cover `n0..nK`. + let names = scene_names(&scene); + let unique: HashSet<&String> = names.iter().collect(); + prop_assert_eq!(unique.len(), names.len(), "node names must be unique"); + } + + /// `realize` threads a scene through the production CPU paint assembly into + /// a flat node list whose entities are EXACTLY the scene's nodes, each once + /// (the round-trip: no node dropped, none duplicated). + #[test] + fn realize_round_trips_every_node(scene in arb_scene(SceneParams::default())) { + let nodes = realize(&scene); + let painted: HashSet = + nodes.nodes.iter().map(|n| n.entity).collect(); + prop_assert_eq!( + painted.len(), + nodes.nodes.len(), + "no entity is painted twice" + ); + prop_assert_eq!( + nodes.nodes.len(), + scene_node_count(&scene), + "every scene node is realized exactly once" + ); + } +} From 5a11013fd05c36b8ffe15226208edca220b8bcb8 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 08:28:52 -0700 Subject: [PATCH 37/70] feat(verify): Tier-3 predicates #1-#5 + proptest harness + mutation teeth MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `invariant::predicates` (gate #12, the metamorphic + property tier) and its proptest harness + MANDATORY mutation meta-tests: - paint_order_is_total — no entity painted twice - transform_roundtrips — on the production `compose_transform`: translate∘-translate ≈ I, rotate(2π) ≈ I, scale(k) is a pure diagonal scale (within EPS) - top_layer_dominates — top-layer paints after every normal node, tail ranked Fullscreen < Tooltip < Popover < Modal - all_finite / all_finite_packed — every size, position and packed field is finite; sizes and packed height are ≥ 0 - contexts_do_not_interleave — each stacking context's painted region is contiguous Each predicate returns `Result<(), Violation>` (rule + detail), so a failing property names the broken relation + offending node.
5 proptest blocks at 256 cases; 13 mutation meta-tests prove teeth — a duplicate, an S·R·T miscomposition, an EPS-boundary perturbation, a Modal-before-Fullscreen tail (pins the rank-vs-discriminant deviation #3), a NaN/negative size, a negative packed height, and an interleaved region are each REJECTED, with passing controls. buiy_core surface add: promote `compose_transform` to `pub` (the metamorphic relations assert on the PRODUCTION composed Mat4, never a re-implementation — invariants.md deviation #1). 306 buiy_core lib tests stay green. Two predicate-design bugs found + fixed via the proptests (real-invariant discipline — STOP, fix, re-run): - a descendant of an escaped top-layer node is itself "in the top layer" (it paints inside the escaped region); `Realized::top_layer_of` now stores the EFFECTIVE membership (nearest top-layer ancestor), so a normal child of a Modal is not mistaken for an in-flow node after the top layer; - `contexts_do_not_interleave` now checks each context's full painted REGION (root + nested contexts, via `context_tree_paint_order`) is contiguous — a nested context sitting among its parent's painters is the correct descend-as-a-unit rule, not interleaving. Generator now emits a SINGLE root tree (the Buiy one-root-per-window model); multi-root forests only added a cross-tree paint order `painters_z` leaves unspecified, forcing every predicate to special-case it. `realize` stays multi-root-correct (unit-tested) as a robustness property. The stale 2.7 proptest-regressions seed (generated against the old multi-root generator + looser depth bound) is removed; its two real `realize` bugs are now pinned by explicit unit tests instead. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_core/src/layout/mod.rs | 2 +- crates/buiy_core/src/layout/systems.rs | 7 +- crates/buiy_verify/src/invariant.rs | 11 +- .../buiy_verify/src/invariant/predicates.rs | 390 ++++++++++++++++++ crates/buiy_verify/src/invariant/scene.rs | 158 ++++++- .../buiy_verify/tests/invariant_mutations.rs | 303 ++++++++++++++ .../buiy_verify/tests/invariant_predicates.rs | 76 ++++ ...scene_generator_smoke.proptest-regressions | 8 - .../tests/scene_generator_smoke.rs | 22 +- 9 files changed, 938 insertions(+), 39 deletions(-) create mode 100644 crates/buiy_verify/src/invariant/predicates.rs create mode 100644 crates/buiy_verify/tests/invariant_mutations.rs create mode 100644 crates/buiy_verify/tests/invariant_predicates.rs delete mode 100644 crates/buiy_verify/tests/scene_generator_smoke.proptest-regressions diff --git a/crates/buiy_core/src/layout/mod.rs b/crates/buiy_core/src/layout/mod.rs index c625de3..38bb7db 100644 --- a/crates/buiy_core/src/layout/mod.rs +++ b/crates/buiy_core/src/layout/mod.rs @@ -21,7 +21,7 @@ pub use style::{LogicalBoxModel, LogicalInset, Style}; pub use systems::{ AnchorNameRegistry, ContentVisibilityMargin, LayoutAnchorWarnedThisFrame, LayoutTaffyComputeCount, LayoutWarnedOnceSession, PostTaffyPositionOverrides, - SyncStylesIterCount, TopLayerActivation, top_layer_paint_rank, + SyncStylesIterCount, TopLayerActivation, compose_transform, top_layer_paint_rank, }; pub use tree::LayoutTree; pub use types::{ diff --git a/crates/buiy_core/src/layout/systems.rs b/crates/buiy_core/src/layout/systems.rs index 7c43256..8da3386 100644 --- a/crates/buiy_core/src/layout/systems.rs +++ b/crates/buiy_core/src/layout/systems.rs @@ -3769,10 +3769,13 @@ pub(super) fn multicol_length_px(l: Option, fallback: f32) -> f32 { /// innermost. A child point `p` is transformed as `M · p`, so it /// feels the rightmost (innermost) factor first. /// -/// Pure function — no Bevy queries, no Taffy reads. Easy to unit test. 
+/// Pure function — no Bevy queries, no Taffy reads. Easy to unit test, and +/// consumed by the Tier-3 `transform_roundtrips` invariant (the metamorphic +/// `translate∘-translate ≈ I`, `rotate(2π) ≈ I`, `scale(k)` checks assert on +/// THIS composed matrix, never a re-implementation), hence `pub`. /// /// Spec: docs/specs/2026-05-08-buiy-layout-design/transforms-and-containment.md § 1, § 1.1. -pub(super) fn compose_transform( +pub fn compose_transform( ui: &UiTransform, t: Option<&Translate>, r: Option<&Rotate>, diff --git a/crates/buiy_verify/src/invariant.rs b/crates/buiy_verify/src/invariant.rs index 253d624..e291e5d 100644 --- a/crates/buiy_verify/src/invariant.rs +++ b/crates/buiy_verify/src/invariant.rs @@ -26,4 +26,13 @@ //! `cargo test` gate. pub mod scene; -pub use scene::{GenTransform, Realized, Scene, SceneNode, SceneParams, arb_scene, realize}; +pub use scene::{ + GenTransform, Realized, Scene, SceneNode, SceneParams, arb_scene, arb_transform, realize, + realize_full, +}; + +pub mod predicates; +pub use predicates::{ + EPS, Violation, all_finite, all_finite_packed, contexts_do_not_interleave, mat4_is_identity, + mat4_is_pure_scale, paint_order_is_total, top_layer_dominates, transform_roundtrips, +}; diff --git a/crates/buiy_verify/src/invariant/predicates.rs b/crates/buiy_verify/src/invariant/predicates.rs new file mode 100644 index 0000000..6fd8666 --- /dev/null +++ b/crates/buiy_verify/src/invariant/predicates.rs @@ -0,0 +1,390 @@ +//! The Tier-3 predicate functions (invariants.md § "Predicate functions"). +//! +//! Each is a free `pub fn` taking borrowed data and returning +//! `Result<(), Violation>` — NOT a bare `bool` — so a failing property prints +//! *which* relation broke and the offending names/indices. The `proptest!` +//! harness in `tests/invariant_predicates.rs` feeds them generated scenes; the +//! mutation meta-tests in `tests/invariant_mutations.rs` feed them hand-built +//! 
VIOLATING fixtures to prove each predicate has teeth (a predicate that never +//! fails is worthless). + +use std::fmt; + +use bevy::prelude::*; + +use buiy_core::layout::{ + Length, Rotate, Scale, TopLayer, Translate, UiTransform, compose_transform, + top_layer_paint_rank, +}; +use buiy_core::render::extract::ExtractedNodes; +use buiy_core::render::instance::PackedInstance; + +use super::scene::{GenTransform, Realized}; + +/// A broken invariant relation. Plain struct (no `thiserror`) to keep the dep +/// surface at zero: the `rule` names the predicate, the `detail` carries the +/// offending entity names / indices so the seed + this message reproduce the +/// failure. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Violation { + /// The invariant that broke (a stable `&'static str` id). + pub rule: &'static str, + /// Human-readable specifics (which entity, which index, the bad value). + pub detail: String, +} + +impl Violation { + fn new(rule: &'static str, detail: impl Into) -> Self { + Self { + rule, + detail: detail.into(), + } + } +} + +impl fmt::Display for Violation { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "[{}] {}", self.rule, self.detail) + } +} + +/// Tolerance for the metamorphic transform relations. A composed `Mat4` of +/// rotations + scales in `0.1..8.0` accumulates a few ULPs of f32 error; `1e-3` +/// is comfortably above that round-off yet far below any real composition bug +/// (a transposed factor, a dropped term) which shifts entries by `O(1)`. +pub const EPS: f32 = 1e-3; + +// --------------------------------------------------------------------------- +// #1 — paint order is a TOTAL order over painted entities. +// --------------------------------------------------------------------------- + +/// Paint order is a total order: no entity appears twice in +/// [`ExtractedNodes::nodes`]. 
Mirrors the non-re-sorting contract of the +/// stored paint order (`extract.rs` "Never re-sorted by render") — a duplicate +/// would mean the same box painted twice, a partial-re-extract or +/// context-walk bug. +/// +/// (Stable equal-key order is a property of the *generator's* document order + +/// the production stable sort, exercised by `realize`; the observable invariant +/// here is no-duplicates over the realized list.) +pub fn paint_order_is_total(nodes: &ExtractedNodes) -> Result<(), Violation> { + let mut seen = std::collections::HashSet::new(); + for (i, node) in nodes.nodes.iter().enumerate() { + if !seen.insert(node.entity) { + return Err(Violation::new( + "paint_order_is_total", + format!( + "entity {:?} appears more than once in painters_z (at index {i})", + node.entity + ), + )); + } + } + Ok(()) +} + +// --------------------------------------------------------------------------- +// #2 — transform round-trips on the production `compose_transform`. +// --------------------------------------------------------------------------- + +/// Three metamorphic relations on the COMPOSED `Mat4` from the production +/// [`compose_transform`] (`systems.rs`, compose `T·R·S·M`), within [`EPS`]: +/// +/// - `translate(d) · translate(-d) ≈ I` (translation is invertible), +/// - `rotate(2π) ≈ I` (a full turn is the identity), +/// - `scale(k)` scales every basis vector by its axis factor and leaves the +/// off-diagonals zero (a pure diagonal scale touches nothing else). +/// +/// Operates on `compose_transform` OUTPUTS, never a re-implementation — a +/// transposed/ dropped factor in the production composition reds this. +pub fn transform_roundtrips(t: &GenTransform) -> Result<(), Violation> { + // (a) translate(d) · translate(-d) ≈ I. 
+ let d = Vec3::from_array(t.translate); + let fwd = compose_transform(&UiTransform::default(), Some(&translate_of(d)), None, None); + let back = compose_transform(&UiTransform::default(), Some(&translate_of(-d)), None, None); + mat4_is_identity("transform_roundtrips/translate", fwd * back)?; + + // (b) rotate(2π) ≈ I. A full turn about the generated axis. + let axis = Vec3::from_array(t.rotate_axis); + let axis = if axis.length_squared() > 1e-6 { + axis.normalize() + } else { + Vec3::Z + }; + let full_turn = Quat::from_axis_angle(axis, std::f32::consts::TAU); + let rot = compose_transform( + &UiTransform::default(), + None, + Some(&Rotate(full_turn)), + None, + ); + mat4_is_identity("transform_roundtrips/rotate2pi", rot)?; + + // (c) scale(k) is a pure diagonal scale: diagonal == k, off-diagonals == 0. + let k = t.scale; + let s = compose_transform( + &UiTransform::default(), + None, + None, + Some(&Scale(k[0], k[1], k[2])), + ); + mat4_is_pure_scale("transform_roundtrips/scale", s, k)?; + Ok(()) +} + +fn translate_of(d: Vec3) -> Translate { + Translate(Length::Px(d.x), Length::Px(d.y), Length::Px(d.z)) +} + +/// Assert a `Mat4` is the identity within [`EPS`] (every entry matches `I`). The +/// relation-check half of [`transform_roundtrips`], exposed so the mutation +/// meta-tests can feed it a deliberately mis-composed matrix and confirm it +/// REJECTS (the predicate's teeth, invariants.md § Verification). +pub fn mat4_is_identity(rule: &'static str, m: Mat4) -> Result<(), Violation> { + check_diagonal( + rule, + m, + [1.0, 1.0, 1.0, 1.0], + "composition is not the identity", + ) +} + +/// Assert a `Mat4` is a pure diagonal scale by `k`: diagonal == `[k.x,k.y,k.z,1]` +/// and every off-diagonal == 0 (within [`EPS`]). A mis-composed matrix +/// (`S·R·T` instead of the pure `S`) leaks an off-diagonal and is rejected — the +/// teeth the mutation meta-test exploits. 
+pub fn mat4_is_pure_scale(rule: &'static str, m: Mat4, k: [f32; 3]) -> Result<(), Violation> { + check_diagonal( + rule, + m, + [k[0], k[1], k[2], 1.0], + "off-diagonal leaked or wrong factor", + ) +} + +/// Assert `m` is a diagonal matrix with the given `diag` (within [`EPS`]): +/// every diagonal entry matches `diag[i]` and every off-diagonal is `0`. The +/// shared kernel of [`mat4_is_identity`] and [`mat4_is_pure_scale`]. +fn check_diagonal(rule: &'static str, m: Mat4, diag: [f32; 4], why: &str) -> Result<(), Violation> { + for (c, col) in m.to_cols_array_2d().iter().enumerate() { + for (r, &value) in col.iter().enumerate() { + let expected = if c == r { diag[c] } else { 0.0 }; + if (value - expected).abs() > EPS { + return Err(Violation::new( + rule, + format!("M[{r}][{c}] = {value} ≠ {expected} ({why})"), + )); + } + } + } + Ok(()) +} + +// --------------------------------------------------------------------------- +// #3 — top-layer dominance. +// --------------------------------------------------------------------------- + +/// Every `top_layer != None` node paints AFTER every normal-stacking node, and +/// the escaped tail is ordered by paint rank Fullscreen < Tooltip < Popover < +/// Modal — compared via the promoted [`top_layer_paint_rank`], NEVER the enum +/// discriminant (invariants.md deviation #3: the declared enum order is NOT the +/// paint order, so `#[derive(Ord)]` would dominate wrongly). +/// +/// Takes the [`Realized`] (not bare `ExtractedNodes`) because `ExtractedNode` +/// carries no top-layer field — membership lives in +/// [`Realized::top_layer_of`]. +pub fn top_layer_dominates(r: &Realized) -> Result<(), Violation> { + let order = &r.nodes.nodes; + let top_of = |e: Entity| r.top_layer_of.get(&e).copied().unwrap_or(TopLayer::None); + let name = |e: Entity| { + r.name_of + .get(&e) + .cloned() + .unwrap_or_else(|| format!("{e:?}")) + }; + + // (a) once a top-layer node has painted, no NORMAL node may paint after it. 
+ let mut first_top: Option = None; + for (i, node) in order.iter().enumerate() { + let is_top = top_of(node.entity) != TopLayer::None; + if is_top && first_top.is_none() { + first_top = Some(i); + } + if !is_top && let Some(t) = first_top { + return Err(Violation::new( + "top_layer_dominates/normal_after_top", + format!( + "normal node {} (index {i}) paints AFTER top-layer node at index {t}", + name(node.entity) + ), + )); + } + } + + // (b) the escaped tail is non-decreasing in paint rank. + let mut prev_rank: Option = None; + let mut prev_name = String::new(); + for node in order.iter() { + let tl = top_of(node.entity); + if tl == TopLayer::None { + continue; + } + let rank = top_layer_paint_rank(tl); + if let Some(p) = prev_rank + && rank < p + { + return Err(Violation::new( + "top_layer_dominates/tail_misordered", + format!( + "top-layer {} (rank {rank}) paints after {prev_name} (rank {p}) — \ + tail not Fullscreen Result<(), Violation> { + for (i, node) in nodes.nodes.iter().enumerate() { + for (axis, v) in [("x", node.size.x), ("y", node.size.y)] { + if !v.is_finite() || v < 0.0 { + return Err(Violation::new( + "all_finite", + format!("node index {i} size.{axis} = {v} (must be finite and ≥ 0)"), + )); + } + } + for (axis, v) in [("x", node.position.x), ("y", node.position.y)] { + if !v.is_finite() { + return Err(Violation::new( + "all_finite", + format!("node index {i} position.{axis} = {v} (must be finite)"), + )); + } + } + } + Ok(()) +} + +/// Every [`PackedInstance`] field is finite and `rect_size[1] ≥ 0` DIRECTLY +/// (the y-flip lives in the view uniform now, so packed height stays positive — +/// `instance.rs`, invariants.md deviation #2: no un-flip needed). The clip +/// sentinels (`±INFINITY`) are the one allowed non-finite — they encode "no +/// clip" and are checked separately. 
+pub fn all_finite_packed(packed: &[PackedInstance]) -> Result<(), Violation> { + for (i, p) in packed.iter().enumerate() { + let finite_fields: [(&str, f32); 9] = [ + ("rect_pos.x", p.rect_pos[0]), + ("rect_pos.y", p.rect_pos[1]), + ("rect_size.x", p.rect_size[0]), + ("rect_size.y", p.rect_size[1]), + ("color.r", p.color[0]), + ("color.g", p.color[1]), + ("color.b", p.color[2]), + ("color.a", p.color[3]), + ("radius", p.radius), + ]; + for (field, v) in finite_fields { + if !v.is_finite() { + return Err(Violation::new( + "all_finite_packed", + format!("instance {i} {field} = {v} (must be finite)"), + )); + } + } + // Packed height is POSITIVE (deviation #2) — the y-flip is in the view + // uniform, so a negative packed height is a real packing bug. + if p.rect_size[1] < 0.0 { + return Err(Violation::new( + "all_finite_packed", + format!( + "instance {i} rect_size[1] = {} < 0 (height must stay positive; \ + the y-flip lives in the view uniform)", + p.rect_size[1] + ), + )); + } + // The clip AABB must be finite OR the full-view sentinel (both + // components ±INFINITY). A mixed finite/infinite clip is a packing bug. + for (field, lo, hi) in [ + ("clip_min", p.clip_min[0], p.clip_min[1]), + ("clip_max", p.clip_max[0], p.clip_max[1]), + ] { + let both_finite = lo.is_finite() && hi.is_finite(); + let both_inf = lo.is_infinite() && hi.is_infinite(); + if !(both_finite || both_inf) || lo.is_nan() || hi.is_nan() { + return Err(Violation::new( + "all_finite_packed", + format!("instance {i} {field} = [{lo}, {hi}] (NaN or mixed finite/sentinel)"), + )); + } + } + } + Ok(()) +} + +// --------------------------------------------------------------------------- +// #5 — z-isolated containment (no context interleaving). 
+// --------------------------------------------------------------------------- + +/// No stacking context interleaves another: a stacking context paints as a +/// UNIT, so every entity in a context's painted region (the context root + all +/// nested contexts' regions, [`Realized::context_members`]) forms a CONTIGUOUS +/// run in the flattened order — no foreign entity sits between two of them. +/// Guards against subtree leakage across an `isolation` / z boundary (a +/// context-walk that flattened instead of descending as a unit would +/// interleave). A nested context legitimately sits AMONG its parent's direct +/// painters — that is the "descend as a unit at this position" rule, and it is +/// NOT interleaving: the nested region is itself one contiguous block. +pub fn contexts_do_not_interleave(r: &Realized) -> Result<(), Violation> { + // Index of each entity in the flattened paint order. + let index_of: std::collections::HashMap = r + .nodes + .nodes + .iter() + .enumerate() + .map(|(i, n)| (n.entity, i)) + .collect(); + + for (&ctx, members) in &r.context_members { + let mut indices: Vec = members + .iter() + .filter_map(|e| index_of.get(e).copied()) + .collect(); + if indices.is_empty() { + continue; + } + indices.sort_unstable(); + let span = indices[indices.len() - 1] - indices[0] + 1; + if span != indices.len() { + let name = r + .name_of + .get(&ctx) + .cloned() + .unwrap_or_else(|| format!("{ctx:?}")); + return Err(Violation::new( + "contexts_do_not_interleave", + format!( + "context {name}'s painted region spans indices {}..={} ({span} slots) but \ + has {} members — a foreign entity interleaves it", + indices[0], + indices[indices.len() - 1], + indices.len(), + ), + )); + } + } + Ok(()) +} diff --git a/crates/buiy_verify/src/invariant/scene.rs b/crates/buiy_verify/src/invariant/scene.rs index 0d1938d..c4c984d 100644 --- a/crates/buiy_verify/src/invariant/scene.rs +++ b/crates/buiy_verify/src/invariant/scene.rs @@ -130,11 +130,12 @@ impl Default for 
SceneParams { } } -/// Strategy for a single `GenTransform`. Skewed to the identity (the common + +/// Strategy for a single [`GenTransform`]. Skewed to the identity (the common + /// shrink case) but reaches a finite, well-conditioned non-identity transform: /// translate in `-512..512`, rotate angle in `0..2π` about an axis with a -/// non-zero component, scale in `0.1..8.0` per axis (away from `0`). -fn arb_transform() -> impl Strategy { +/// non-zero component, scale in `0.1..8.0` per axis (away from `0`). Public so +/// the `transform_roundtrips` proptest can draw inputs directly. +pub fn arb_transform() -> impl Strategy { prop_oneof![ // Weighted heavily toward identity so most generated nodes are in-flow. 3 => Just(GenTransform::IDENTITY), @@ -204,10 +205,11 @@ fn arb_top_layer(p_top: f64) -> impl Strategy { prop::option::weighted(p_top, escape).prop_map(|opt| opt.unwrap_or(TopLayer::None)) } -/// Generate a bounded, shrinkable [`Scene`]. `prop_recursive` bounds depth + -/// node count so the tree is finite and shrinks toward the empty/shallow scene -/// (invariants.md § "Strategy budget"). Names are assigned by a final pre-order -/// rename (`n0..nK`) so a shrunk counterexample is reproducible and printable. +/// Generate a bounded, shrinkable single-root [`Scene`]. `prop_recursive` bounds +/// depth + node count so the tree is finite and shrinks toward the shallow +/// scene (invariants.md § "Strategy budget"). Names are assigned by a final +/// pre-order rename (`n0..nK`) so a shrunk counterexample is reproducible and +/// printable. pub fn arb_scene(p: SceneParams) -> impl Strategy { let leaf = arb_leaf(p); let tree = leaf.prop_recursive(p.max_depth, p.max_nodes, p.max_breadth, move |inner| { @@ -220,14 +222,22 @@ pub fn arb_scene(p: SceneParams) -> impl Strategy { node }) }); - // A scene is a small forest (1..=2 roots) so the multi-root cross-tree - // case is reachable, but most scenes are single-rooted (the common case). 
- prop::collection::vec(tree, 1..=2).prop_map(|mut roots| { + // A scene is a SINGLE root tree — the Buiy model is one root context per + // window (cross-window scoping is a deferred follow-up, per the layout + // code). One root fully exercises every invariant (nesting, z-order, + // top-layer escape, context isolation); a multi-root forest would only add + // a cross-tree paint order that `painters_z` leaves unspecified, forcing + // every predicate to special-case it without testing anything new. + tree.prop_map(|mut root| { + // The ROOT is never a top-layer member: the top layer is an ESCAPE + // mechanism (a node leaves its parent context to paint at the root), so a + // node with no parent has nothing to escape. Forcing the root to `None` + // keeps the model faithful — every top-layer node has a parent to escape + // from — and `top_layer_dominates` well-defined. + root.top_layer = TopLayer::None; let mut counter = 0u32; - for root in &mut roots { - rename_preorder(root, &mut counter); - } - Scene { roots } + rename_preorder(&mut root, &mut counter); + Scene { roots: vec![root] } }) } @@ -254,6 +264,19 @@ pub struct Realized { pub nodes: ExtractedNodes, /// `entity → owning stacking-context root entity`, for every painted node. pub context_of: std::collections::HashMap, + /// `context-root entity → every entity painted WITHIN that context's + /// subtree` (the root + all transitive descendants, including nested + /// contexts). A stacking context paints as a UNIT, so each such set must be + /// a contiguous run in the paint order — the property + /// `contexts_do_not_interleave` checks. + pub context_members: std::collections::HashMap>, + /// `entity → EFFECTIVE top-layer membership`: the nearest top-layer ancestor's + /// [`TopLayer`] (inclusive of self), or `None` for a purely in-flow node. A + /// descendant of an escaped node paints INSIDE that escaped context, so it + /// is part of the top layer and inherits its rank. 
`ExtractedNode` carries no + /// top-layer field (a render-only signal), so the dominance predicate + /// recovers membership from here. + pub top_layer_of: std::collections::HashMap, /// `entity → node name`, for diagnostics. pub name_of: std::collections::HashMap, } @@ -447,7 +470,8 @@ pub fn realize_full(scene: &Scene) -> Realized { }) .collect(); - // context membership map (entity → owning context root entity). + // context membership map (entity → owning context root entity) + the + // top-layer membership map, both over the painted entities. let context_of: std::collections::HashMap = order .iter() .map(|&e| { @@ -455,6 +479,45 @@ pub fn realize_full(scene: &Scene) -> Realized { (e, entity_of[&root_context(idx)]) }) .collect(); + // Effective top-layer membership: a node is "in the top layer" iff it OR a + // document ancestor escaped (a descendant of an escaped node paints INSIDE + // that escaped context, so it is part of the top layer). The value is the + // NEAREST top-layer ancestor's variant (inclusive of self) — the rank source + // for the dominance tail — or `None` for a purely in-flow node. The + // dominance predicate reads this, not the per-node own membership, so a + // normal child of a top-layer node is not mistaken for an in-flow node that + // "paints after the top layer". + let effective_top_layer = |mut idx: usize| -> TopLayer { + loop { + let tl = by_idx[&idx].top_layer; + if tl != TopLayer::None { + return tl; + } + match by_idx[&idx].parent { + Some(p) => idx = p, + None => return TopLayer::None, + } + } + }; + let top_layer_of: std::collections::HashMap = order + .iter() + .map(|&e| (e, effective_top_layer(idx_of_entity[&e]))) + .collect(); + + // Each forming context's full PAINTED region — exactly what the production + // `context_tree_paint_order` emits for that context root (root + every + // nested context's region as a unit; for the tree root, including the + // escaped top-layer tail). 
Because the global `order` is the concatenation + // of these walks descending as units, each region is a contiguous run — the + // property `contexts_do_not_interleave` checks. + let context_members: std::collections::HashMap> = forms + .iter() + .map(|&ctx| { + let mut region = Vec::new(); + context_tree_paint_order(entity_of[&ctx], &painters_z_of, &mut region); + (entity_of[&ctx], region) + }) + .collect(); Realized { nodes: ExtractedNodes { @@ -462,6 +525,8 @@ pub fn realize_full(scene: &Scene) -> Realized { ..Default::default() }, context_of, + context_members, + top_layer_of, name_of, } } @@ -616,11 +681,44 @@ mod tests { } } - /// Regression: a MULTI-root forest realizes EVERY root's subtree, not just - /// the first. (The first cut marked only `roots[0]` as `is_root`, so a - /// plain second root formed no context and silently dropped its children.) + /// A normal CHILD of an escaped top-layer node is itself "in the top layer" + /// (it paints inside the escaped context), so it inherits the top-layer + /// membership — it must NOT be treated as an in-flow node that "paints after + /// the top layer". Scene `n0 > n1 > {n2(Fullscreen) > {n3}}`. #[test] - fn two_root_forest_realizes_all() { + fn descendant_of_escaped_node_is_in_top_layer() { + let mut n2 = plain("n2", vec![plain("n3", vec![])]); + n2.top_layer = TopLayer::Fullscreen; + let n1 = plain("n1", vec![n2]); + let scene = Scene { + roots: vec![plain("n0", vec![n1])], + }; + let r = realize_full(&scene); + // n3's effective membership is Fullscreen (via its escaped parent n2). 
+ let n3 = r + .nodes + .nodes + .iter() + .find(|n| r.name_of[&n.entity] == "n3") + .expect("n3 realized") + .entity; + assert_eq!( + r.top_layer_of[&n3], + TopLayer::Fullscreen, + "a descendant of an escaped node inherits its top-layer membership" + ); + assert!( + crate::invariant::top_layer_dominates(&r).is_ok(), + "n3 painting inside n2's escaped region is not a dominance violation" + ); + } + + /// Regression: `realize` handles a multi-root forest (every root forms its + /// own context — the early cut marked only `roots[0]` as `is_root`, dropping + /// later roots' subtrees). The GENERATOR only emits single-root scenes, but + /// `realize` stays multi-root-correct as a robustness property. + #[test] + fn multi_root_forest_realizes_all() { let scene = Scene { roots: vec![plain("n0", vec![]), plain("n1", vec![plain("n2", vec![])])], }; @@ -632,6 +730,28 @@ mod tests { ); } + /// A nested isolated context paints AS A UNIT at its document position + /// among its parent's painters — its region is one contiguous block and the + /// parent's region (which INCLUDES the nested block) is also contiguous. + /// `n0 > n1(plain) > {n2(isolation), n3(plain)}`: the order is + /// `[n0, n1, n2, n3]`, n2 forms its own context spanning just `[2..=2]`, and + /// n0's region is the whole `[0..=3]` — neither interleaves. + #[test] + fn nested_isolated_context_is_a_contiguous_unit() { + let mut n2 = plain("n2", vec![]); + n2.isolation = true; + let n1 = plain("n1", vec![n2, plain("n3", vec![])]); + let scene = Scene { + roots: vec![plain("n0", vec![n1])], + }; + let r = realize_full(&scene); + assert_eq!(r.nodes.nodes.len(), 4); + assert!( + crate::invariant::contexts_do_not_interleave(&r).is_ok(), + "a nested isolated context is a contiguous unit, not interleaving" + ); + } + /// A top-layer node that is itself a forest ROOT does NOT escape (no parent /// context to escape to) — it must still realize exactly once, never list /// itself in its own `painters_z`. 
diff --git a/crates/buiy_verify/tests/invariant_mutations.rs b/crates/buiy_verify/tests/invariant_mutations.rs new file mode 100644 index 0000000..5c83917 --- /dev/null +++ b/crates/buiy_verify/tests/invariant_mutations.rs @@ -0,0 +1,303 @@ +//! Task 2.9 — MANDATORY mutation meta-tests: prove each Tier-3 predicate has +//! teeth. A property suite that never fails is worthless, so for every predicate +//! we hand-build a fixture that VIOLATES exactly one relation and assert the +//! predicate REJECTS it (`Err`), plus a known-good control that PASSES (`Ok`). +//! These are the Tier-3 analogue of the half-size sign-bug regression in +//! `render_instance.rs` (invariants.md § Verification). +//! +//! Plain `#[test]`s (no proptest, no GPU) so the harness's own correctness rides +//! the same `cargo test -p buiy_verify` gate. + +use std::collections::HashMap; + +use bevy::prelude::*; +use buiy_core::layout::TopLayer; +use buiy_core::render::components::ClipRect; +use buiy_core::render::extract::{ExtractedNode, ExtractedNodes}; +use buiy_core::render::instance::PackedInstance; +use buiy_verify::invariant::{ + EPS, Realized, all_finite, all_finite_packed, contexts_do_not_interleave, mat4_is_identity, + mat4_is_pure_scale, paint_order_is_total, top_layer_dominates, +}; + +// --- fixture builders ------------------------------------------------------- + +fn e(i: u32) -> Entity { + Entity::from_raw_u32(i).expect("valid entity index") +} + +/// A finite, well-formed node at a deterministic position with a given size. 
+fn node(entity: Entity, size: Vec2) -> ExtractedNode { + ExtractedNode { + entity, + position: Vec2::new(1.0, 2.0), + size, + color: Color::WHITE, + clip: Some(ClipRect { + min: Vec2::ZERO, + max: size, + }), + group: None, + } +} + +fn nodes(list: Vec) -> ExtractedNodes { + ExtractedNodes { + nodes: list, + ..Default::default() + } +} + +/// Build a `Realized` from an explicit paint-ordered entity list plus per-entity +/// top-layer assignments and the per-context painted-region map, so the +/// top-layer / interleave fixtures can inject a precise violation. +fn realized( + order: &[Entity], + top_layer_of: &[(Entity, TopLayer)], + context_members: &[(Entity, Vec)], +) -> Realized { + let tl: HashMap = top_layer_of.iter().copied().collect(); + let members: HashMap> = context_members.iter().cloned().collect(); + // `context_of` is the nearest-context map; for these flat fixtures each + // entity is its own context unless a region lists it, but the predicates + // under test read `top_layer_of` / `context_members`, so a self-map is fine. + let cx: HashMap = order.iter().map(|&en| (en, en)).collect(); + let name: HashMap = order.iter().map(|&en| (en, format!("{en:?}"))).collect(); + Realized { + nodes: nodes( + order + .iter() + .map(|&en| node(en, Vec2::splat(10.0))) + .collect(), + ), + context_of: cx, + context_members: members, + top_layer_of: tl, + name_of: name, + } +} + +// --- #1 paint_order_is_total ------------------------------------------------ + +#[test] +fn paint_order_rejects_a_duplicate_entity() { + // Same entity painted twice — a partial-re-extract / walk bug. 
+ let dup = nodes(vec![ + node(e(1), Vec2::splat(10.0)), + node(e(1), Vec2::splat(20.0)), + ]); + assert!( + paint_order_is_total(&dup).is_err(), + "a duplicated entity must be rejected" + ); +} + +#[test] +fn paint_order_accepts_distinct_entities() { + let ok = nodes(vec![ + node(e(1), Vec2::splat(10.0)), + node(e(2), Vec2::splat(10.0)), + ]); + assert!(paint_order_is_total(&ok).is_ok(), "distinct entities pass"); +} + +// --- #2 transform_roundtrips (relation-check teeth) ------------------------- + +#[test] +fn identity_check_rejects_a_miscomposed_matrix() { + // A deliberately non-identity matrix (a leaked translation) must NOT pass + // the "≈ identity" relation — this is what catches a mis-composed + // translate∘-translate or a rotate(2π) that did not return to I. + let bad = Mat4::from_translation(Vec3::new(5.0, 0.0, 0.0)); + assert!(mat4_is_identity("test", bad).is_err()); + assert!( + mat4_is_identity("test", Mat4::IDENTITY).is_ok(), + "the true identity passes" + ); +} + +#[test] +fn pure_scale_check_rejects_an_s_r_t_miscomposition() { + // The spec's mutation: feed `S·R·T` instead of the pure diagonal `S`. The + // rotation leaks off-diagonals, so the pure-scale relation rejects it. + let k = [2.0f32, 3.0, 4.0]; + let s = Mat4::from_scale(Vec3::from_array(k)); + let r = Mat4::from_rotation_z(std::f32::consts::FRAC_PI_4); + let tr = Mat4::from_translation(Vec3::new(7.0, 8.0, 0.0)); + let miscomposed = s * r * tr; + assert!( + mat4_is_pure_scale("test", miscomposed, k).is_err(), + "S·R·T must be rejected as not a pure scale" + ); + // The genuine pure scale passes. + assert!(mat4_is_pure_scale("test", s, k).is_ok(), "pure S passes"); +} + +#[test] +fn identity_check_eps_boundary() { + // A perturbation just OVER EPS is rejected; just UNDER is accepted — the + // tolerance is real, not vacuous. 
+ let mut over = Mat4::IDENTITY; + over.x_axis.y = EPS * 2.0; + assert!(mat4_is_identity("test", over).is_err(), "> EPS rejected"); + + let mut under = Mat4::IDENTITY; + under.x_axis.y = EPS * 0.5; + assert!(mat4_is_identity("test", under).is_ok(), "< EPS accepted"); +} + +// --- #3 top_layer_dominates ------------------------------------------------- + +#[test] +fn top_layer_rejects_a_normal_node_after_a_top_layer_node() { + // Order: [top-layer modal, normal] — the normal node paints AFTER the top + // layer, which violates dominance. + let (modal, normal) = (e(1), e(2)); + let r = realized( + &[modal, normal], + &[(modal, TopLayer::Modal), (normal, TopLayer::None)], + &[], + ); + assert!( + top_layer_dominates(&r).is_err(), + "a normal node after a top-layer node must be rejected" + ); +} + +#[test] +fn top_layer_rejects_modal_painted_before_fullscreen() { + // The deviation-#3 PIN: a tail emitted [Modal(rank 3), Fullscreen(rank 0)] + // is misordered (rank must be NON-DECREASING). This test FAILS if anyone + // "fixes" the predicate to compare the ENUM DISCRIMINANT + // (None,Modal,Popover,Tooltip,Fullscreen) — under the discriminant Modal(1) + // would sort before Fullscreen(4) and look correct, so the predicate would + // wrongly return Ok and this assert would fail. + let (modal, full) = (e(1), e(2)); + let r = realized( + &[modal, full], + &[(modal, TopLayer::Modal), (full, TopLayer::Fullscreen)], + &[], + ); + assert!( + top_layer_dominates(&r).is_err(), + "Modal (rank 3) before Fullscreen (rank 0) must be rejected — \ + pins the paint-rank vs enum-discriminant deviation" + ); +} + +#[test] +fn top_layer_accepts_well_ordered_tail() { + // [normal, Fullscreen(0), Tooltip(1), Popover(2), Modal(3)] — the canonical + // dominant order. 
+ let (n, fs, tt, pv, md) = (e(1), e(2), e(3), e(4), e(5)); + let r = realized( + &[n, fs, tt, pv, md], + &[ + (n, TopLayer::None), + (fs, TopLayer::Fullscreen), + (tt, TopLayer::Tooltip), + (pv, TopLayer::Popover), + (md, TopLayer::Modal), + ], + &[], + ); + assert!( + top_layer_dominates(&r).is_ok(), + "the canonical dominant order passes" + ); +} + +// --- #4 all_finite / all_finite_packed -------------------------------------- + +#[test] +fn all_finite_rejects_nan_and_negative_size() { + let nan = nodes(vec![node(e(1), Vec2::new(f32::NAN, 10.0))]); + assert!(all_finite(&nan).is_err(), "NaN size rejected"); + + let neg = nodes(vec![node(e(1), Vec2::new(10.0, -5.0))]); + assert!(all_finite(&neg).is_err(), "negative size.y rejected"); + + let ok = nodes(vec![node(e(1), Vec2::new(10.0, 20.0))]); + assert!(all_finite(&ok).is_ok(), "finite non-negative size passes"); +} + +/// A finite packed instance with the full-view clip sentinel and POSITIVE +/// height (deviation #2: the y-flip lives in the view uniform). +fn packed(rect_size: [f32; 2]) -> PackedInstance { + PackedInstance { + rect_pos: [0.0, 0.0], + rect_size, + color: [1.0, 1.0, 1.0, 1.0], + radius: 0.0, + clip_min: [f32::NEG_INFINITY, f32::NEG_INFINITY], + clip_max: [f32::INFINITY, f32::INFINITY], + } +} + +#[test] +fn all_finite_packed_rejects_nan_and_negative_height() { + let mut nan = packed([10.0, 10.0]); + nan.color[0] = f32::NAN; + assert!(all_finite_packed(&[nan]).is_err(), "NaN color rejected"); + + // Negative packed height is a real packing bug (height stays POSITIVE). + let neg = packed([10.0, -10.0]); + assert!( + all_finite_packed(&[neg]).is_err(), + "negative rect_size[1] rejected (deviation #2)" + ); +} + +#[test] +fn all_finite_packed_accepts_positive_height_and_sentinel_clip() { + // Positive height + the ±INFINITY full-view sentinel is VALID (regression- + // pins deviation #2: the sentinel is the one allowed non-finite). 
+ let ok = packed([10.0, 10.0]); + assert!( + all_finite_packed(&[ok]).is_ok(), + "positive height + sentinel clip passes" + ); +} + +// --- #5 contexts_do_not_interleave ------------------------------------------ + +#[test] +fn contexts_rejects_an_interleaved_list() { + // Order [a0, b0, a1]: context A's painted region {a0, a1} is SPLIT by b0 + // (a foreign entity) — its members do not form a contiguous run. + let (a0, b0, a1) = (e(1), e(2), e(3)); + let r = realized( + &[a0, b0, a1], + &[ + (a0, TopLayer::None), + (b0, TopLayer::None), + (a1, TopLayer::None), + ], + // Context A's region is {a0, a1}; with b0 between them it spans 3 slots + // for 2 members → interleaved. + &[(a0, vec![a0, a1]), (b0, vec![b0])], + ); + assert!( + contexts_do_not_interleave(&r).is_err(), + "an interleaved context region must be rejected" + ); +} + +#[test] +fn contexts_accepts_contiguous_runs() { + // Order [a0, a1, b0]: each context's region is a contiguous block. + let (a0, a1, b0) = (e(1), e(2), e(3)); + let r = realized( + &[a0, a1, b0], + &[ + (a0, TopLayer::None), + (a1, TopLayer::None), + (b0, TopLayer::None), + ], + &[(a0, vec![a0, a1]), (b0, vec![b0])], + ); + assert!( + contexts_do_not_interleave(&r).is_ok(), + "contiguous context regions pass" + ); +} diff --git a/crates/buiy_verify/tests/invariant_predicates.rs b/crates/buiy_verify/tests/invariant_predicates.rs new file mode 100644 index 0000000..2b6812e --- /dev/null +++ b/crates/buiy_verify/tests/invariant_predicates.rs @@ -0,0 +1,76 @@ +//! Task 2.9 — the Tier-3 `proptest!` harness for predicates #1–#5. One block +//! per predicate, each a `#[test]` so failures are isolated and report +//! individually. A failing case's MINIMIZED counterexample is persisted in +//! `invariant_predicates.proptest-regressions` (committed, not gitignored) so it +//! re-runs deterministically on the next `cargo test`. +//! +//! These exercise the predicates over the UNBOUNDED generated scene space; the +//! 
teeth (that each predicate actually REJECTS a known break) are proven by the +//! hand-built mutation fixtures in `invariant_mutations.rs`. + +use buiy_core::render::instance::pack_extracted; +use buiy_verify::invariant::{ + SceneParams, all_finite, all_finite_packed, arb_scene, arb_transform, + contexts_do_not_interleave, paint_order_is_total, realize, realize_full, top_layer_dominates, + transform_roundtrips, +}; +use proptest::prelude::*; + +proptest! { + #![proptest_config(ProptestConfig { cases: 256, max_shrink_iters: 4096, ..ProptestConfig::default() })] + + /// #1 — the realized paint order never lists an entity twice. + #[test] + fn prop_paint_order_total(scene in arb_scene(SceneParams::default())) { + let nodes = realize(&scene); + prop_assert!( + paint_order_is_total(&nodes).is_ok(), + "{}", paint_order_is_total(&nodes).unwrap_err() + ); + } + + /// #2 — the production `compose_transform` round-trips on every generated + /// transform (translate∘-translate ≈ I, rotate(2π) ≈ I, pure diagonal scale). + #[test] + fn prop_transform_roundtrips(t in arb_transform()) { + prop_assert!( + transform_roundtrips(&t).is_ok(), + "{}", transform_roundtrips(&t).unwrap_err() + ); + } + + /// #3 — top-layer nodes paint after every normal node, tail ranked + /// Fullscreen = nodes.nodes.iter().map(pack_extracted).collect(); + prop_assert!( + all_finite_packed(&packed).is_ok(), + "{}", all_finite_packed(&packed).unwrap_err() + ); + } + + /// #5 — no stacking context interleaves another in the flattened order. 
+ #[test] + fn prop_contexts_no_interleave(scene in arb_scene(SceneParams::default())) { + let r = realize_full(&scene); + prop_assert!( + contexts_do_not_interleave(&r).is_ok(), + "{}", contexts_do_not_interleave(&r).unwrap_err() + ); + } +} diff --git a/crates/buiy_verify/tests/scene_generator_smoke.proptest-regressions b/crates/buiy_verify/tests/scene_generator_smoke.proptest-regressions deleted file mode 100644 index 13ee74e..0000000 --- a/crates/buiy_verify/tests/scene_generator_smoke.proptest-regressions +++ /dev/null @@ -1,8 +0,0 @@ -# Seeds for failure cases proptest has generated in the past. It is -# automatically read and these particular cases re-run before any -# novel cases are generated. -# -# It is recommended to check this file in to source control so that -# everyone who runs the test benefits from these saved cases. -cc c1e781fc4b74a574e5ba2a3d28068313f898c678711564d2619b363457dac0f2 # shrinks to scene = Scene { roots: [SceneNode { name: "n0", children: [], z_index: None, isolation: false, top_layer: Fullscreen, transform: GenTransform { translate: [0.0, 0.0, 0.0], rotate_axis: [0.0, 0.0, 1.0], rotate_angle: 0.0, scale: [1.0, 1.0, 1.0] }, size: (0.0, 0.0), background: None }] } -cc f647226cb04c6d570ae3403a26e7abf6351f914a9bd841cfc07dc620945290ee # shrinks to scene = Scene { roots: [SceneNode { name: "n0", children: [], z_index: None, isolation: false, top_layer: None, transform: GenTransform { translate: [0.0, 0.0, 0.0], rotate_axis: [0.0, 0.0, 1.0], rotate_angle: 0.0, scale: [1.0, 1.0, 1.0] }, size: (0.0, 0.0), background: None }, SceneNode { name: "n1", children: [SceneNode { name: "n2", children: [], z_index: None, isolation: false, top_layer: None, transform: GenTransform { translate: [0.0, 0.0, 0.0], rotate_axis: [0.0, 0.0, 1.0], rotate_angle: 0.0, scale: [1.0, 1.0, 1.0] }, size: (0.0, 0.0), background: None }], z_index: None, isolation: false, top_layer: None, transform: GenTransform { translate: [0.0, 0.0, 0.0], rotate_axis: [0.0, 0.0, 
1.0], rotate_angle: 0.0, scale: [1.0, 1.0, 1.0] }, size: (0.0, 0.0), background: None }] } diff --git a/crates/buiy_verify/tests/scene_generator_smoke.rs b/crates/buiy_verify/tests/scene_generator_smoke.rs index 3854794..dc7f6a0 100644 --- a/crates/buiy_verify/tests/scene_generator_smoke.rs +++ b/crates/buiy_verify/tests/scene_generator_smoke.rs @@ -54,23 +54,29 @@ proptest! { #[test] fn arb_scene_respects_bounds(scene in arb_scene(SceneParams::default())) { let p = SceneParams::default(); + // `prop_recursive(depth, …)` HARD-caps the number of recursive + // combinator LEVELS at `depth`; the base (non-recursive) leaf adds the + // final level, so a realized tree nests at most `max_depth + 1` deep (a + // single-node scene is depth 1). This is still a hard bound, never the + // soft node count. + let max_levels = p.max_depth + 1; prop_assert!( - scene_depth(&scene) <= p.max_depth, - "depth {} exceeds max_depth {}", + scene_depth(&scene) <= max_levels, + "depth {} exceeds max_depth+1 = {}", scene_depth(&scene), - p.max_depth, + max_levels, ); - // Structural hard cap: at most `Σ breadth^level` nodes per root tree. - let per_root: usize = (0..p.max_depth) + // Structural hard cap: at most `Σ breadth^level` nodes over the + // `max_depth + 1` levels of the single root tree. + let cap: usize = (0..max_levels) .map(|l| (p.max_breadth as usize).pow(l)) .sum(); - let forest_cap = per_root.saturating_mul(2); // up to 2 roots prop_assert!( - scene_node_count(&scene) <= forest_cap, + scene_node_count(&scene) <= cap, "node count {} exceeds structural cap {}", scene_node_count(&scene), - forest_cap, + cap, ); // Names are unique (the pre-order rename) and cover `n0..nK`. 
From f69c963e84ab81144be132dbce6af08171f6605b Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 08:35:51 -0700 Subject: [PATCH 38/70] =?UTF-8?q?feat(verify):=20Tier-3=20BiDi=20caret=20r?= =?UTF-8?q?ound-trip=20(#6)=20=E2=80=94=20closes=20gate=20#12?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `invariant::bidi` — the gate-#12 named text invariant on the LANDED shaper. `bidi_caret_roundtrips(&Buffer)` asserts three relations over a laid-out `cosmic_text::Buffer` (the production text-stack output, same path as `buiy_core`'s shaping snapshots): - #6a logical↔visual caret round-trip is identity: hit-testing each glyph cluster's visual center recovers a cursor inside that cluster [start, end]; - #6b within each same-BiDi-level VISUAL segment, logical `start` is monotone — ascending for LTR levels, descending for RTL levels; - #6c the DISTINCT clusters tile each line's bytes (disjoint, no gap), counted per cluster range so multiple glyphs of one cluster (Arabic ccmp dots, a Devanagari split matra) are not a false overlap. `arb_bidi_text` keeps the spec generator signature (alternating LTR/RTL runs). The harness drives the full `BuiyTextPlugin` stack with the committed fixture fonts; all six shaping-corpus scripts (Latin, Arabic, Devanagari, CJK, emoji-ZWJ, mixed-BiDi) are known-good CONTROLS, and an off-by-one caret map is REJECTED via the exposed `caret_in_cluster` relation-check (the teeth). Three #6 mis-specifications found + fixed through the controls (real-invariant discipline): a `LayoutRun`'s `glyphs` are in LOGICAL not visual order (so #6b must sort by x and bucket by BiDi level, not walk the array); and complex scripts map several glyphs to one cluster (so #6c counts distinct cluster ranges, not per-glyph bytes). The shrunk RTL seed ("אב") that surfaced the #6b flaw is committed in invariant_bidi.proptest-regressions. 
Signature deviation (documented in bidi.rs): the predicate takes the laid-out `&Buffer` (the pure shaper-output form, matching #1–#5's borrowed-data design) rather than the spec's `(text, metrics)` — shaping needs a font-loaded `FontSystem` the predicate cannot own; the test harness shapes and hands it the buffer. `arb_bidi_text` keeps the pinned signature. Dep note: `cosmic-text = "0.19"` added to buiy_verify's manifest to name `Buffer`/`Cursor`/`LayoutRun`. It is ALREADY a direct dep of buiy_core (the shaper) and in the lockfile — NO new supply-chain crate, zero new `cargo deny` surface (`cargo deny check` passes), version-pinned to buiy_core's. Flagged per the no-new-deps guard, same situation as `bytemuck` in the snapshot phase. `Violation::new` is now `pub(crate)` so the `bidi` sibling module reports through the shared type. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/Cargo.toml | 6 + crates/buiy_verify/src/invariant.rs | 3 + crates/buiy_verify/src/invariant/bidi.rs | 233 ++++++++++++++++++ .../buiy_verify/src/invariant/predicates.rs | 4 +- .../tests/invariant_bidi.proptest-regressions | 7 + crates/buiy_verify/tests/invariant_bidi.rs | 232 +++++++++++++++++ 6 files changed, 484 insertions(+), 1 deletion(-) create mode 100644 crates/buiy_verify/src/invariant/bidi.rs create mode 100644 crates/buiy_verify/tests/invariant_bidi.proptest-regressions create mode 100644 crates/buiy_verify/tests/invariant_bidi.rs diff --git a/crates/buiy_verify/Cargo.toml b/crates/buiy_verify/Cargo.toml index 137b9ab..7182198 100644 --- a/crates/buiy_verify/Cargo.toml +++ b/crates/buiy_verify/Cargo.toml @@ -26,3 +26,9 @@ image-compare = "=0.5.0" # because the harness re-exports snapshot helpers from `src/`. The `glob` feature # drives the coverage fixture-dir fan-out (Phase 4). insta = { version = "=1.48.0", features = ["glob"] } +# Already in the lockfile (a direct dep of buiy_core, the text shaper). 
The +# Tier-3 BiDi caret round-trip (invariants.md predicate #6) asserts relations +# over the LANDED shaper's output — `cosmic_text::{Buffer, Cursor, LayoutRun}` — +# so it must name those types. NO new supply-chain crate, zero new `cargo deny` +# surface (the version is pinned to buiy_core's `0.19`). +cosmic-text = "0.19" diff --git a/crates/buiy_verify/src/invariant.rs b/crates/buiy_verify/src/invariant.rs index e291e5d..b2b27e3 100644 --- a/crates/buiy_verify/src/invariant.rs +++ b/crates/buiy_verify/src/invariant.rs @@ -36,3 +36,6 @@ pub use predicates::{ EPS, Violation, all_finite, all_finite_packed, contexts_do_not_interleave, mat4_is_identity, mat4_is_pure_scale, paint_order_is_total, top_layer_dominates, transform_roundtrips, }; + +pub mod bidi; +pub use bidi::{arb_bidi_text, bidi_caret_roundtrips, caret_in_cluster}; diff --git a/crates/buiy_verify/src/invariant/bidi.rs b/crates/buiy_verify/src/invariant/bidi.rs new file mode 100644 index 0000000..5bae15c --- /dev/null +++ b/crates/buiy_verify/src/invariant/bidi.rs @@ -0,0 +1,233 @@ +//! Tier-3 predicate #6 — BiDi caret round-trip on the LANDED shaper +//! (invariants.md § "BiDi caret round-trip"). Relations over a laid-out +//! `cosmic_text::Buffer` — the exact structure the production text stack +//! produces (`tests/text_shaping_snapshots.rs` path) — with no rasterizer. +//! +//! **Signature deviation.** The spec pins `bidi_caret_roundtrips(text: &str, +//! metrics: Metrics)`, shaping internally. Shaping needs a `FontSystem` with +//! registered faces, which the predicate cannot own without coupling to the +//! font registry, so this takes the already-laid-out `&Buffer` — the genuinely +//! PURE shaper-output form, matching predicates #1–#5's borrowed-data design. +//! The test harness (`tests/invariant_bidi.rs`) shapes through the production +//! `BuiyTextPlugin` stack and hands the committed buffer here. `arb_bidi_text` +//! keeps the spec's generator signature verbatim. 
+ +use cosmic_text::{Buffer, Cursor}; +use proptest::prelude::*; + +use super::predicates::Violation; + +/// Generate a mixed-direction string: a random mix of LTR (Latin) and RTL +/// (Hebrew) runs of bounded length, plus neutral spaces — the BiDi stress space +/// the shaping `.snap` fixtures pin positions for, exercised generatively. Hebrew +/// (`U+05D0..05EA`) and ASCII letters are the two scripts; spaces join them. +pub fn arb_bidi_text(max_runs: u32, max_run_len: u32) -> impl Strategy { + let max_runs = max_runs.max(1) as usize; + let max_run_len = max_run_len.max(1) as usize; + // Each run is (is_rtl, length); the string joins them with single neutral + // spaces (between two same-direction runs the space resolves to that direction). + prop::collection::vec((any::(), 1usize..=max_run_len), 1..=max_runs).prop_map(|runs| { + let mut s = String::new(); + for (i, (rtl, len)) in runs.iter().enumerate() { + if i > 0 { + s.push(' '); + } + for j in 0..*len { + if *rtl { + // Hebrew aleph..tav, cycled. + let c = char::from_u32(0x05D0 + (j as u32 % 22)).unwrap(); + s.push(c); + } else { + // ASCII lowercase a..z, cycled. + s.push((b'a' + (j as u8 % 26)) as char); + } + } + } + s + }) +} + +/// The three BiDi caret relations over a laid-out [`Buffer`]: +/// +/// - **#6a** logical↔visual caret round-trip is identity: for every glyph +/// cluster, mapping the logical position to the glyph's visual center x and +/// hit-testing that x back recovers a cursor INSIDE the same cluster +/// (`[start, end]`). The cluster center is used (not the leading edge) so the +/// hit's half-glyph affinity is deterministic across LTR and RTL. +/// - **#6b** within one [`LayoutRun`](cosmic_text::LayoutRun), glyphs sorted by +/// visual x and grouped into same-direction segments (by BiDi level): logical +/// `start` ascends left-to-right in an LTR segment and descends in an RTL +/// segment (the block reads right-to-left).
+/// - **#6c** the run partition covers every byte of every line's text exactly +/// once across `Buffer::layout_runs()` (no gap, no overlap). +pub fn bidi_caret_roundtrips(buffer: &Buffer) -> Result<(), Violation> { + for run in buffer.layout_runs() { + let y = run.line_top + run.line_height / 2.0; + + // #6a — per-cluster round-trip. + for glyph in run.glyphs.iter() { + // Skip zero-width glyphs (e.g. a combining mark): their hitbox is a + // point and hit-testing is ambiguous by construction. + if glyph.w <= 0.0 { + continue; + } + let center = glyph.x + glyph.w / 2.0; + let Some(cursor) = buffer.hit(center, y) else { + return Err(Violation::new( + "bidi_caret_roundtrips/6a_no_hit", + format!( + "hit-test at the center of cluster [{}..{}] (x={center}) found no cursor", + glyph.start, glyph.end + ), + )); + }; + caret_in_cluster(cursor, run.line_i, glyph.start, glyph.end)?; + } + + // #6b — visual order vs logical order within the run. + check_run_monotonicity(&run)?; + } + + // #6c — coverage: every byte of every line's text is covered once. + check_coverage(buffer)?; + Ok(()) +} + +/// #6b — visual order vs logical order, BY BiDi LEVEL. A `LayoutRun`'s `glyphs` +/// are in LOGICAL order and may mix directions (an RTL block embedded in an LTR +/// paragraph), so a single run-wide monotonicity check is wrong. The true +/// invariant: within each maximal VISUAL segment of glyphs at the SAME BiDi +/// embedding level, logical `start` is monotone — ascending for an LTR (even) +/// level, descending for an RTL (odd) level. We sort by visual x, then check +/// monotonicity within each same-level segment. +fn check_run_monotonicity(run: &cosmic_text::LayoutRun) -> Result<(), Violation> { + // Glyphs in VISUAL order (left to right), carrying their logical start + + // BiDi level. Distinct clusters only (equal-start glyphs of one cluster + // share a caret position). 
+ let mut visual: Vec<(f32, usize, bool)> = run + .glyphs + .iter() + .map(|g| (g.x, g.start, g.level.is_rtl())) + .collect(); + visual.sort_by(|a, b| a.0.total_cmp(&b.0)); + + let mut prev: Option<(usize, bool)> = None; + for &(_x, start, rtl) in &visual { + if let Some((prev_start, prev_rtl)) = prev { + // Only compare within a same-direction visual segment; a direction + // change is a BiDi boundary where logical order legitimately jumps. + if rtl == prev_rtl && start != prev_start { + let ok = if rtl { + start < prev_start // RTL: visual L→R means logical decreasing + } else { + start > prev_start // LTR: visual L→R means logical increasing + }; + if !ok { + return Err(Violation::new( + "bidi_caret_roundtrips/6b_logical", + format!( + "run line {} ({} segment): visual order start {prev_start} → {start} \ + violates monotonic logical order (LTR ascends, RTL descends)", + run.line_i, + if rtl { "RTL" } else { "LTR" } + ), + )); + } + } + } + prev = Some((start, rtl)); + } + Ok(()) +} + +/// #6c — the DISTINCT clusters of `layout_runs()` partition every buffer line's +/// text: their `[start, end)` byte ranges are disjoint and tile the whole line +/// with no gap. Several glyphs may share one cluster (Arabic ccmp dots, a +/// Devanagari split matra, a base+mark pair), so coverage is counted per +/// DISTINCT cluster range, not per glyph — multiple glyphs of one cluster are +/// not an overlap. (`run.text` is the line text; cluster bytes index into it.) +fn check_coverage(buffer: &Buffer) -> Result<(), Violation> { + use std::collections::{BTreeMap, BTreeSet}; + + // line_i -> (line byte len, set of distinct cluster [start,end) ranges). 
+ let mut clusters: BTreeMap> = BTreeMap::new(); + let mut line_len: BTreeMap = BTreeMap::new(); + + for run in buffer.layout_runs() { + let len = run.text.len(); + line_len.insert(run.line_i, len); + let set = clusters.entry(run.line_i).or_default(); + for glyph in run.glyphs.iter() { + if glyph.end > len || glyph.start > glyph.end { + return Err(Violation::new( + "bidi_caret_roundtrips/6c_range", + format!( + "cluster [{}..{}] out of bounds for line {} of {len} bytes", + glyph.start, glyph.end, run.line_i + ), + )); + } + // Empty clusters (zero-width glyphs sharing a base's range) contribute + // no new coverage; skip them so they don't register as a gap/overlap. + if glyph.end > glyph.start { + set.insert((glyph.start, glyph.end)); + } + } + } + + for (&line_i, ranges) in &clusters { + let len = line_len[&line_i]; + // Sort by start; consecutive distinct cluster ranges must be disjoint + // and abut (no gap, no overlap), tiling `0..len`. + let mut cursor = 0usize; + for &(start, end) in ranges { + if start < cursor { + return Err(Violation::new( + "bidi_caret_roundtrips/6c_overlap", + format!( + "line {line_i}: cluster [{start}..{end}] overlaps the previous (expected \ + start ≥ {cursor})" + ), + )); + } + if start > cursor { + return Err(Violation::new( + "bidi_caret_roundtrips/6c_gap", + format!( + "line {line_i}: gap in [{cursor}..{start}) — no cluster covers those bytes" + ), + )); + } + cursor = end; + } + if cursor != len { + return Err(Violation::new( + "bidi_caret_roundtrips/6c_gap", + format!("line {line_i}: clusters cover only {cursor} of {len} bytes"), + )); + } + } + Ok(()) +} + +/// The #6a relation-check: a recovered [`Cursor`] must land INSIDE the cluster +/// it was mapped from — same line, `index ∈ [start, end]` (both ends inclusive). +/// Exposed so the off-by-one mutation fixture can feed it a cursor one past the +/// cluster's `end` and confirm it is REJECTED (the round-trip's teeth).
+pub fn caret_in_cluster( + cursor: Cursor, + line: usize, + start: usize, + end: usize, +) -> Result<(), Violation> { + if cursor.line != line || cursor.index < start || cursor.index > end { + return Err(Violation::new( + "bidi_caret_roundtrips/6a_roundtrip", + format!( + "cursor {cursor:?} is outside cluster [{start}..{end}] on line {line} \ + (caret round-trip broke)" + ), + )); + } + Ok(()) +} diff --git a/crates/buiy_verify/src/invariant/predicates.rs b/crates/buiy_verify/src/invariant/predicates.rs index 6fd8666..d54c23d 100644 --- a/crates/buiy_verify/src/invariant/predicates.rs +++ b/crates/buiy_verify/src/invariant/predicates.rs @@ -34,7 +34,9 @@ pub struct Violation { } impl Violation { - fn new(rule: &'static str, detail: impl Into) -> Self { + /// Construct a violation. `pub(crate)` so sibling invariant modules (e.g. + /// `bidi`) can report their own relations through the shared type. + pub(crate) fn new(rule: &'static str, detail: impl Into) -> Self { Self { rule, detail: detail.into(), diff --git a/crates/buiy_verify/tests/invariant_bidi.proptest-regressions b/crates/buiy_verify/tests/invariant_bidi.proptest-regressions new file mode 100644 index 0000000..99e021d --- /dev/null +++ b/crates/buiy_verify/tests/invariant_bidi.proptest-regressions @@ -0,0 +1,7 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. +cc 7d84c6670f276f62ec379081b626d2320f90af402c18dd56ec4a10b85771eb5e # shrinks to text = "אב" diff --git a/crates/buiy_verify/tests/invariant_bidi.rs b/crates/buiy_verify/tests/invariant_bidi.rs new file mode 100644 index 0000000..1f698a9 --- /dev/null +++ b/crates/buiy_verify/tests/invariant_bidi.rs @@ -0,0 +1,232 @@ +//! Task 2.10 — the BiDi caret round-trip predicate (#6) driven through the +//! 
PRODUCTION text stack (`MinimalPlugins + CorePlugin + LayoutPlugin + +//! BuiyTextPlugin`), the same path as `buiy_core`'s `text_shaping_snapshots`. +//! +//! - `prop_bidi_caret_roundtrips` runs the predicate over generated +//! mixed-direction strings (the `arb_bidi_text` space). +//! - The MANDATORY mutation tests: the six shaping-corpus scripts (Latin, +//! Arabic, Devanagari, CJK, emoji-ZWJ, mixed-BiDi) are known-good CONTROLS +//! (`Ok`); an off-by-one caret-map fixture is REJECTED (`Err`) — proving the +//! round-trip relation has teeth. +//! +//! Closes gate #12. + +use std::sync::Arc; + +use bevy::prelude::*; +use buiy_core::CorePlugin; +use buiy_core::layout::{LayoutPlugin, Style}; +use buiy_core::text::{ + BuiyTextPlugin, FamilyEntry, FontFaceDescriptors, FontFamily, FontRegistry, FontSize, + FontStack, GenericFamily, TextBuffer, +}; +use buiy_verify::invariant::{arb_bidi_text, bidi_caret_roundtrips, caret_in_cluster}; +use cosmic_text::{Buffer, Cursor}; +use proptest::prelude::*; + +// --- shaping through the production stack ------------------------------------ + +/// A committed fixture font shared with `buiy_core`'s shaping corpus, read from +/// that crate's fixtures dir (stable workspace layout, same as the snapshot +/// test's hard-coded `tests/fixtures/fonts`). +fn fixture_font_bytes(file_name: &str) -> Arc> { + let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")) + .join("../buiy_core/tests/fixtures/fonts") + .join(file_name); + Arc::new( + std::fs::read(&path).unwrap_or_else(|e| panic!("fixture font {file_name} missing ({e})")), + ) +} + +/// One fixture face: (declared family, file under the shared fonts dir). 
+type FixtureFont = (&'static str, &'static str); + +const ARABIC: FixtureFont = ("Noto Sans Arabic", "NotoSansArabic-arabic.ttf"); +const HEBREW: FixtureFont = ("Noto Sans Hebrew", "NotoSansHebrew-hebrew.ttf"); +const DEVANAGARI: FixtureFont = ("Noto Sans Devanagari", "NotoSansDevanagari-devanagari.ttf"); +const CJK: FixtureFont = ("Noto Sans CJK SC", "NotoSansCJKsc-han.otf"); +const EMOJI: FixtureFont = ("Noto Emoji", "NotoEmoji-emoji.ttf"); + +/// Shape `text` through the production stack with `fonts` registered and +/// `families` as the resolver stack; return the committed `cosmic_text::Buffer`. +fn shape(text: &str, fonts: &[FixtureFont], families: &[FamilyEntry]) -> Buffer { + let mut app = App::new(); + app.add_plugins(MinimalPlugins); + app.add_plugins(CorePlugin); + app.add_plugins(LayoutPlugin); + app.add_plugins(BuiyTextPlugin::default()); + app.update(); + + for (family, file) in fonts { + app.world_mut() + .resource_mut::() + .register_bytes( + *family, + fixture_font_bytes(file), + FontFaceDescriptors::default(), + ); + app.update(); + } + + let entity = app + .world_mut() + .spawn(( + buiy_core::Node, + Style::default().width_px(400.0).height_px(200.0), + buiy_core::text::Text(String::from(text)), + FontFamily(FontStack(families.to_vec())), + FontSize(20.0), + )) + .id(); + for _ in 0..4 { + app.update(); + } + + app.world() + .get::(entity) + .expect("the fixture entity synced a TextBuffer") + .buffer + .clone() +} + +fn sans() -> Vec { + vec![FamilyEntry::Generic(GenericFamily::SansSerif)] +} + +fn named(name: &str) -> Vec { + vec![FamilyEntry::Named(String::from(name))] +} + +/// The Latin (Fira Sans) + Hebrew-fixture stack — first-strong LTR with an RTL +/// block, the genuine BiDi mix (mirrors the corpus's `BIDI_STACK`). 
+fn bidi_stack() -> Vec { + vec![ + FamilyEntry::Named(String::from("Fira Sans")), + FamilyEntry::Named(String::from("Noto Sans Hebrew")), + ] +} + +// --- #6 proptest over generated mixed-direction text ------------------------- + +proptest! { + #![proptest_config(ProptestConfig { cases: 48, max_shrink_iters: 1024, ..ProptestConfig::default() })] + + /// The caret round-trip holds over generated LTR/RTL-mixed strings. Shaping + /// drives a full Bevy app, so the case count is lower than the pure-CPU + /// predicates (still hundreds of caret round-trips per case). + #[test] + fn prop_bidi_caret_roundtrips(text in arb_bidi_text(3, 5)) { + // Hebrew fixture + Fira Sans cover the generated scripts. + let buffer = shape(&text, &[HEBREW], &bidi_stack()); + prop_assert!( + bidi_caret_roundtrips(&buffer).is_ok(), + "text {:?}: {}", text, bidi_caret_roundtrips(&buffer).unwrap_err() + ); + } +} + +// --- the six shaping-corpus scripts as known-good controls ------------------- + +#[test] +fn control_latin() { + let b = shape("Sphinx of black quartz, judge my vow.", &[], &sans()); + assert!( + bidi_caret_roundtrips(&b).is_ok(), + "{:?}", + bidi_caret_roundtrips(&b) + ); +} + +#[test] +fn control_arabic() { + let b = shape("السلام عليكم", &[ARABIC], &named("Noto Sans Arabic")); + assert!( + bidi_caret_roundtrips(&b).is_ok(), + "{:?}", + bidi_caret_roundtrips(&b) + ); +} + +#[test] +fn control_devanagari() { + let b = shape("नमस्ते क्षत्रिय", &[DEVANAGARI], &named("Noto Sans Devanagari")); + assert!( + bidi_caret_roundtrips(&b).is_ok(), + "{:?}", + bidi_caret_roundtrips(&b) + ); +} + +#[test] +fn control_cjk() { + let b = shape("你好,世界", &[CJK], &named("Noto Sans CJK SC")); + assert!( + bidi_caret_roundtrips(&b).is_ok(), + "{:?}", + bidi_caret_roundtrips(&b) + ); +} + +#[test] +fn control_emoji_zwj() { + let b = shape( + "👨\u{200D}👩\u{200D}👧\u{200D}👦", + &[EMOJI], + &named("Noto Emoji"), + ); + assert!( + bidi_caret_roundtrips(&b).is_ok(), + "{:?}", + bidi_caret_roundtrips(&b) 
+ ); +} + +#[test] +fn control_mixed_bidi() { + let b = shape("hello עולם world", &[HEBREW], &bidi_stack()); + assert!( + bidi_caret_roundtrips(&b).is_ok(), + "{:?}", + bidi_caret_roundtrips(&b) + ); +} + +// --- the off-by-one mutation: prove the round-trip relation has teeth -------- + +/// The off-by-one caret-map fixture: feed the #6a relation-check a cursor one +/// byte PAST a single-byte cluster's end. The true round-trip recovers a cursor +/// inside `[start, end]` (accepted); the off-by-one cursor falls outside and is +/// REJECTED — proving the relation is a real identity, not vacuous. +#[test] +fn off_by_one_caret_map_is_rejected() { + // A single-byte ASCII cluster: `[start, start]` (end == start for a + // 1-codepoint Latin glyph). `caret_in_cluster` accepts the true start and + // rejects start + 1. + let (line, start, end) = (0usize, 3usize, 3usize); + + // True round-trip lands ON the cluster → Ok. + assert!( + caret_in_cluster(Cursor::new(line, start), line, start, end).is_ok(), + "the cluster's own start round-trips" + ); + + // Off-by-one (start + 1) lands past the cluster → Err (the teeth). + assert!( + caret_in_cluster(Cursor::new(line, start + 1), line, start, end).is_err(), + "a caret mapped one byte off the cluster must be rejected" + ); + + // Wrong LINE is also rejected. + assert!( + caret_in_cluster(Cursor::new(line + 1, start), line, start, end).is_err(), + "a caret on the wrong line is rejected" + ); + + // And the real shaper output passes the full predicate (control). 
+ let buffer = shape("hello עולם world", &[HEBREW], &bidi_stack()); + assert!( + bidi_caret_roundtrips(&buffer).is_ok(), + "{:?}", + bidi_caret_roundtrips(&buffer) + ); +} From 46108d069344b5596b9166d2705cbb041ef2d2cf Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 08:52:32 -0700 Subject: [PATCH 39/70] =?UTF-8?q?feat(core):=20extend=20GoldenConfig=20?= =?UTF-8?q?=E2=80=94=20FontMode=20+=20dpr=20field=20+=20fidelity()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3.1 of the verification pyramid: GoldenConfig grows the font and DPR axes the determinism stack adds. FontMode { Real, Ahem } collapses the font axis — Ahem substitutes a bundled em-box face so text-bearing goldens are byte-identical across hosts; Real is the narrow real-glyph fidelity suite. deterministic() defaults font_mode: Ahem + dpr: Dpr::X1; the new fidelity() flips only the font axis to Real with every other knob still pinned. The struct stays Copy. MSAA/dither remain module constants, never per-fixture knobs. Phase 0.4's capture_to_image still ignores cfg.dpr (sizes via the window); Phase 3.3 makes it assert scale_factor == cfg.dpr.as_f32(). Spec: docs/specs/2026-06-15-buiy-verification-design/determinism.md § Extending GoldenConfig. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_core/src/render/golden.rs | 47 ++++++++++++++++++- .../buiy_core/tests/render_golden_config.rs | 46 ++++++++++++++++++ 2 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 crates/buiy_core/tests/render_golden_config.rs diff --git a/crates/buiy_core/src/render/golden.rs b/crates/buiy_core/src/render/golden.rs index f9bcffa..ef969c5 100644 --- a/crates/buiy_core/src/render/golden.rs +++ b/crates/buiy_core/src/render/golden.rs @@ -11,10 +11,32 @@ use crate::render::atlas::{AtlasKey, AtlasWarmupQueue, BuiyAtlas}; +/// How the font axis is rasterized for a capture (verification-design +/// `determinism.md` § "Ahem font mode"). 
Real glyph rasterization is the +/// canonical per-platform flake source, but the bulk of text-bearing goldens +/// test *boxes*, not glyphs — so `Ahem` collapses the font axis to a bundled +/// em-box face whose every glyph is a solid square, making any non-fidelity +/// golden byte-identical across hosts. `Real` is the narrow fidelity suite +/// (glyph hinting / subpixel / color-emoji / decorations). +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum FontMode { + /// Rasterize the fixture's actual fonts — the narrow real-glyph fidelity + /// suite. The shaping `.snap` fixtures and the real-font golden suite pin + /// this. + Real, + /// Substitute the bundled Ahem em-box font so any text-bearing golden is + /// host-stable. Made the *sole resolvable family* for fixture text under + /// this mode, so fallback cannot reintroduce a platform font. + Ahem, +} + /// Deterministic-capture configuration. The three flake sources of § 4.3 are /// *necessary together*: a golden captured without all three is not /// reproducible. `accept` is the § 4.4 human-curated golden-update gate — -/// never an automatic overwrite. +/// never an automatic overwrite. The determinism spec grows the font and DPR +/// axes (`determinism.md` § "Extending GoldenConfig"); MSAA / dither stay +/// module constants ([`CAPTURE_MSAA`] / [`CAPTURE_DITHER_OFF`]), never +/// per-fixture knobs. #[derive(Clone, Copy, Debug)] pub struct GoldenConfig { /// Drive time from a fixed/virtual clock, not wall time, so any time- @@ -30,17 +52,38 @@ pub struct GoldenConfig { /// `--accept`: update the stored golden instead of failing on mismatch. /// Off by default; gated behind human PR review (§ 4.4). pub accept: bool, + /// Collapse the font axis. `Real` rasterizes the fixture's actual fonts + /// (the narrow fidelity suite); `Ahem` substitutes the em-box font so any + /// text-bearing golden is byte-identical across hosts (§ "Ahem font mode"). + pub font_mode: FontMode, + /// Device-pixel-ratio pin. 
A 1× vs 2× render is a *different rasterization*, + /// not a tolerance — captured as a fixture axis, never fuzzed (§ "DPR pin"). + pub dpr: Dpr, } impl GoldenConfig { /// The capture config with the full flake-mitigation triad pinned and - /// `accept` off — the configuration every golden is captured under. + /// `accept` off — the configuration every golden is captured under. The + /// font axis collapses to the Ahem box-font and the DPR pins to 1× (layout + /// goldens are the common case; the fidelity / HiDPI variants opt out). pub fn deterministic() -> Self { Self { fixed_clock: true, wait_for_fonts: true, warm_atlas: true, accept: false, + font_mode: FontMode::Ahem, + dpr: Dpr::X1, + } + } + + /// The real-glyph fidelity variant: `FontMode::Real`, everything else + /// pinned exactly as [`GoldenConfig::deterministic`]. The narrow suite that + /// asserts genuine glyph rasterization (hinting / subpixel / color-emoji). + pub fn fidelity() -> Self { + Self { + font_mode: FontMode::Real, + ..Self::deterministic() } } } diff --git a/crates/buiy_core/tests/render_golden_config.rs b/crates/buiy_core/tests/render_golden_config.rs new file mode 100644 index 0000000..38f2a31 --- /dev/null +++ b/crates/buiy_core/tests/render_golden_config.rs @@ -0,0 +1,46 @@ +//! `GoldenConfig` default-config tripwires (Phase 3.1, verification-design +//! `determinism.md` § "Extending GoldenConfig"). Pure-CPU, headless — no +//! adapter, no `#[ignore]`. Pins that `deterministic()` collapses the font +//! axis to the Ahem box-font + a 1× DPR, while `fidelity()` is the narrow +//! real-glyph variant with every other knob still pinned. + +use buiy_core::render::golden::{Dpr, FontMode, GoldenConfig}; + +#[test] +fn deterministic_defaults_collapse_font_axis() { + let cfg = GoldenConfig::deterministic(); + // The bulk of text-bearing goldens test boxes, not glyphs: default to the + // Ahem em-box font so they are byte-identical across hosts. 
+ assert_eq!(cfg.font_mode, FontMode::Ahem); + // 1× is the headless capture default; 2× is an explicit fixture axis. + assert_eq!(cfg.dpr, Dpr::X1); + // The landed flake triad stays pinned. + assert!(cfg.fixed_clock); + assert!(cfg.wait_for_fonts); + assert!(cfg.warm_atlas); + assert!(!cfg.accept); +} + +#[test] +fn fidelity_uses_real_font() { + let cfg = GoldenConfig::fidelity(); + // The narrow real-glyph fidelity suite: Ahem off … + assert_eq!(cfg.font_mode, FontMode::Real); + // … but every other determinism knob is still pinned (it differs from + // `deterministic()` in exactly the font axis). + assert_eq!(cfg.dpr, Dpr::X1); + assert!(cfg.fixed_clock); + assert!(cfg.wait_for_fonts); + assert!(cfg.warm_atlas); + assert!(!cfg.accept); +} + +#[test] +fn config_is_copy() { + // `GoldenConfig` must stay `Copy` (every field is `Copy`) so the capture + // path can pass it by value without ceremony. + let cfg = GoldenConfig::deterministic(); + let a = cfg; + let b = cfg; + assert_eq!(a.font_mode, b.font_mode); +} From 593a5275b3fce6d32c154ae8e3d9578a151458af Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 08:56:24 -0700 Subject: [PATCH 40/70] feat(verify): Ahem box-font mode + sole-family wiring (determinism) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3.2 of the verification pyramid: collapse the font axis so text-bearing captures are host-stable. Commits the canonical W3C/WPT public-domain Ahem em-box font (Ahem.ttf, 21768 bytes, every glyph a solid em-square) under tests/fixtures/fonts/ with a license file beside it (the OFL precedent). The new buiy_verify::determinism module registers it through the PRODUCTION bytes path (FontRegistry::register_bytes) under family "Ahem"; combined with system fonts being off (the headless capture stack runs bundled-only), Ahem is the sole resolvable family for fixture text that names it — fallback cannot reintroduce a host-specific platform font. 
FontMode/Dpr are re-exported from buiy_core::render::golden, never redefined. Two headless resolver tests pin the contract: ahem_is_sole_family_under_ahem_ mode (every span resolves to Ahem, no fallback leak) and ahem_resolution_is_ host_font_independent (the result is fixed to Ahem regardless of host fonts). Deviation note: the spec allowed an obscure-text rectangle fallback if no Ahem asset existed; the genuine WPT em-box font was obtainable, so we ship it (the stronger option — real per-glyph box rasterization, not a synthetic stand-in). Spec: docs/specs/2026-06-15-buiy-verification-design/determinism.md § "Ahem font mode". Co-Authored-By: Claude Opus 4.8 (1M context) --- .../buiy_core/tests/fixtures/fonts/Ahem.ttf | Bin 0 -> 21768 bytes .../tests/fixtures/fonts/LICENSE-Ahem.txt | 24 ++++ crates/buiy_verify/src/determinism.rs | 57 +++++++++ crates/buiy_verify/src/lib.rs | 1 + crates/buiy_verify/tests/determinism_ahem.rs | 108 ++++++++++++++++++ 5 files changed, 190 insertions(+) create mode 100644 crates/buiy_core/tests/fixtures/fonts/Ahem.ttf create mode 100644 crates/buiy_core/tests/fixtures/fonts/LICENSE-Ahem.txt create mode 100644 crates/buiy_verify/src/determinism.rs create mode 100644 crates/buiy_verify/tests/determinism_ahem.rs diff --git a/crates/buiy_core/tests/fixtures/fonts/Ahem.ttf b/crates/buiy_core/tests/fixtures/fonts/Ahem.ttf new file mode 100644 index 0000000000000000000000000000000000000000..4d4785a4123287a5ca08439a6230514de91df0e7 GIT binary patch literal 21768 zcmeHP34B!5x&Q9Hv(E$)2oWgiP1sSA$uAR;ddQD$G{2i4U#&F|+BQ;F=z6r58-n$rwZk{|Lw+yO@CRejcvIB_7Y-zv zay^lB6-m-IVJX?scb#}6dfcL=!UoC{B$Sd3h5WaOXjLl76VwcT&d)EE=^T=eDUz&! 
zcS)vAO6pN3k%AKBKpkI3fl7SS@ynV@C!6g{=wmc8P8v-^=t{bu9-v>-Z|EWVEv=xH z^e{a_kJ4lGIQCd!}KB2$R ze)^35N(bq0bcjBu!*qncpuf|X^c8(gN2#5Tks{;@(}elLt>Q-U6>+n;L) zrH7;y(!zq5iWqzZS%MHN4*SUzR$l=^8UGvAxJ^Ze6mhs_xgDTvLz za?P^04vNPVMS(758m8**!4v9k^*_|z`YyVAP1oH!Cf)5gq3%9ab$1|%?!M{J-BgqA zws)gDQFhBd=&rZi7rGlD%c|~1oltkx|0~@Q>rVMd`B3>vIjpoP?aC)~in1H|J<7+* ze&ti--Y19hma>cbC|i{s${WgGV7V#Ed&+L=K|VlSx>xx?*{kF+9wcglz-R^I84tn$0 zTec^KCvB_0Uj5kWd8;p7b?>S>AK&~);UmKzK6K}wZ~tWZj-?M?b4hee>R*K~)ENxJ zv;Mx71|)J}JxnD5vsVJN5KH4+td$a0Lds9eOdFh;mbp+o(0+?>_?S)GS`^z3298N2b;tyrrSZ?vumosN;E9jazXk{t&a$&jzUN(F-N!KOqBUcn8Mu z5$kK|z_NkwgAMR}77lA;DV&SU*Z8i$7Yh?rKQc!A;E|;#CZj&CAR^fcmW@(%Bk#taMw|Q(Cwt+Uk?T5A>+s?OD+b*?TX}igG zhwUNTYTNH^uiD3<|LENA{FC#LQ*rrR{aiy_`L1(Z=ejO%)w-Ho zSG#`hy4$tVwZ`?5>kZc~*C(zo+>$%heX4teyU1PUp5p$Ad#-!Fdx`s2_pjWKxu0=w zbid_)3$0Ci^b( z&GEJP7W`X$2dOCy6vdccj zMKnlwT21jI95f)bbamuJ>LK7{oBAhFmgvx78@a?`I&7!DqF;v{D4(Fip42*VvJQJ` zP_MN*+=Hg}+SS3&m%X!fSRh-vdNKs2-Q}D%vm8;WW^?O^17-{2urbf(_>lg@ft$>o9bZu~dhllZ-ob z7&^)Ltqwyc8Jlz%I>~rbhoO^pGl9J%(=6 z;a(`;OVg>2LL^g(n$`oV!5Yv&O@Jy<6H#-u6i2=ZF#l>jV$_Tl5eickN~=*nh2mco!L>N$r@2*Y(aekXv$OyOd*xCFwIl@mI7ZrJ_hF( zptKP==A3zrpmn3(gWJe}WB9OcnBr)Z)Iqi;l#Qkwe7Hv?YO>M0Gc}exK`R@%MsUSF zSbrg4XL;f%Va+fXBOyQQoq5fnT;TL0ZC2Z`#I>l6s2XK%jd7l;)-U_Vv_UOD z7wxohWR#--Vj!6#c(J#?c{2%;Z*81s>~pch+oor;s^W<4^iB?@l$=CRhc+=L;Q@_~b` zjj7~PA*gDt7|u7<6q#`4(kQeudgWnO1@vAF7j>!`Wy+NShap=Tm37qQ17CqDR|47w z_I&W2z{t4-#(WwJdWn{$kle_z9$7kr9_JD?wAutWTwBtmZ)vB#3H@?k^C)r7tPRea zN6rr}I`hTyCir5R%v=|A;xo4kcFvozYuqIIbG~bR!eW@yTzm}boHO&z`2=sAGxNrI zvmO}RSQGDg;5=b*TpQ>lIp+&Zv;(Gb=GNt%cqzo#CaALx{U%E&04K&u$+oNI6BeuG z6Sl(nLX*vLzSv|voHtuxA!H4hq|@>Vo8{S2fHsEa3(>mF#GjT=Xr~Z$xu%&HKt9i; ztpIpoj0V0UNN1)$3cbrr^vYG~jkQ9{<#pO?x?lCotEjO?b+;16v0l||R6H>ipN5OIqv4v| zfra-J-f!3kVMz0R81M1;JMWPi{>>)*rLct);qTnPI|^nF7(dJ00Nbxr_fqqY8&@@< z?d_U$yn=swO&Pu>dwF5(dzy~fS3X(#<(Lb+OLWhJ;csYqHv1+l=QvQ|J(EY*EEBIw zyfU5`Cy!wScEBTeJZAQjl3~tIg8BHdKOXaRSRs2jA&igam5l^^B5DOms2RFpU29$- 
z`zDQ$r8|krF(&-9bFx$mPm-$PvuVCnca*1rtNABQH*k`4*>{_-+gtZqX74Nc7#ee~ z>-x}DtL(YgB&9O}w!=D4PN(D~sy zp*x=4bD`lXRg3Jy^GYM*%X^$LQ%Y6+@_zubPoepqnrysAXx3W?J@7N(0;Fsm8XkVK z;Zj~dcpSB7&lB!wb4HkUoB({CJbeGZsGYnHW$yyx8tP5lPN$7%~$r~X#+vK;_ z!f!%NDI4Qwj;FL>cWi;b)Phm2LaIegI=A6nexBaOXpsYJpN3Bk zRv>NsIkLi$2i`Hb_B@(Qvk}dzpz-9HUKf%}>O%E$O|+p&t_($@4YhH(DJnOiBGz0P z30KM0(fVMxAzPlMUZ>g8(vsbhU&UnflIqYrl&Ew| zr%lUYHO6y-=#f);-sIU6E5>_Fq!{dowHWawEds$amHjj=hQ$6X`-+LDex9|-V}=~v zn20%T#V4)!q}iUV_+(d~#fals@kuK_X~ie4_@ot|)MD#eJkp9!Cf`VQqKmC9P(5ZA9osM-X6Vcy+G>8V{*21BP zybq@lh`pbQw^P}Or}D{+JoU{uzftE?D5LNW`7An{en>x}F?0@i<)^GN^_}H-oJpQY zKc@enNjT+Ip~hmT&{V{0rz4_2lg_7Eh|OL=7t(*yMf4N8m@Yx=wh|Fw_PA=G0gm#| z!JC2zqPhIuB8C%CeDbRax|xS)@qC%&B@7e>;T-r7VW=?7RxxAJBp2@IjMe28 zj2_`Tco%ny;q?dRnNb0X-e1rzLtimeYK{-|r6A zgu{W{K%l_MP`*ExYYN8s{aRXFs^?Vqq65sqG7ka#aca`6X)mKuMm;Ir6GxMdQX6(*FSU CyAAgM literal 0 HcmV?d00001 diff --git a/crates/buiy_core/tests/fixtures/fonts/LICENSE-Ahem.txt b/crates/buiy_core/tests/fixtures/fonts/LICENSE-Ahem.txt new file mode 100644 index 0000000..a775c6f --- /dev/null +++ b/crates/buiy_core/tests/fixtures/fonts/LICENSE-Ahem.txt @@ -0,0 +1,24 @@ +Ahem.ttf — the W3C/WPT "Ahem" layout-determinism font. + +The Ahem font belongs to the public domain. In jurisdictions that do not +recognize public domain ownership of these files, the following Creative +Commons Zero declaration applies: + + http://creativecommons.org/publicdomain/zero/1.0/ + +Ahem is a deliberately featureless font in which (almost) every glyph is a +solid em-square box, with a small set of glyphs rendered as empty space. It +exists so that text-bearing tests can assert *layout* (box positions and +sizes) without depending on host-specific glyph rasterization — the canonical +trick the WPT and Flutter test suites use to make text goldens host-stable. 
+ +Source (canonical upstream): + https://github.com/web-platform-tests/wpt/blob/master/fonts/Ahem.ttf + https://www.w3.org/Style/CSS/Test/Fonts/Ahem/ + +Family name: "Ahem" · Version 1.50 · 21768 bytes +sha256: b719ecb31c5b21fc573c03f6421c74ac63c271a5a3ff841e34f9705fb94b8448 + +Used by buiy_core / buiy_verify's `FontMode::Ahem` determinism mode +(docs/specs/2026-06-15-buiy-verification-design/determinism.md § "Ahem font +mode") to collapse the font axis for non-fidelity pixel goldens. diff --git a/crates/buiy_verify/src/determinism.rs b/crates/buiy_verify/src/determinism.rs new file mode 100644 index 0000000..b37803e --- /dev/null +++ b/crates/buiy_verify/src/determinism.rs @@ -0,0 +1,57 @@ +//! The determinism substrate (verification-design `determinism.md`): the one +//! public seam every GPU tier (reftest, golden) constructs its capture app +//! through, with every nondeterminism knob pinned at the source. +//! +//! This module owns the *setup* — the [`FontMode::Ahem`] box-font substitution +//! (so text-bearing captures are host-stable), the fixed virtual clock, the DPR +//! pin, and the MSAA/dither pin — while `buiy_core::render::golden`'s +//! [`capture_to_image`](buiy_core::render::golden::capture_to_image) owns the +//! *capture* (size-to-physical, quiescence flush, readback). +//! +//! `FontMode` / `Dpr` are **re-exported** from their canonical home in +//! `buiy_core::render::golden` (where `GoldenConfig` carries them), never +//! redefined here. + +use bevy::prelude::*; +use buiy_core::text::{FontFaceDescriptors, FontRegistry}; +use std::sync::Arc; + +// Re-export the canonical config types from their home in buiy_core. Tiers +// import `FontMode` / `Dpr` from here OR from `buiy_core::render::golden` — +// they are the same types (this is a re-export, not a redefinition). 
+pub use buiy_core::render::golden::{Dpr, FontMode, GoldenConfig}; + +/// The family name the Ahem box-font registers under and that fixture text +/// must name (`font-family: Ahem`) to resolve to it under [`FontMode::Ahem`]. +pub const AHEM_FAMILY: &str = "Ahem"; + +/// The committed Ahem face — the W3C/WPT public-domain em-box font, baked into +/// the test binary so the box-font substitution needs no filesystem read at +/// capture time. Every glyph is a solid em-square, so any non-fidelity golden +/// is byte-identical across hosts (`determinism.md` § "Ahem font mode"). +static AHEM_TTF: &[u8] = include_bytes!("../../buiy_core/tests/fixtures/fonts/Ahem.ttf"); + +/// The Ahem face's raw bytes, ready for the production registration path. +/// `Arc`-wrapped to match [`FontRegistry::register_bytes`]'s signature; each +/// call copies the ~21 KB static face into a fresh `Vec`. +fn ahem_bytes() -> Arc<Vec<u8>> { + Arc::new(AHEM_TTF.to_vec()) +} + +/// Register the Ahem box-font through the **production bytes path** +/// ([`FontRegistry::register_bytes`]) under family [`AHEM_FAMILY`], then settle +/// one update so `apply_font_registry` rebuilds the engine + `FontMatchIndex` +/// and the resolver can see it. This is the capture-time substitution +/// `FontMode::Ahem` performs; combined with system fonts being off (the +/// headless capture stack runs bundled-only), Ahem is the only resolvable +/// family for fixture text that names it — fallback cannot reintroduce a +/// host-specific platform font. +/// +/// The `app` must already carry a `FontRegistry` (any `BuiyTextPlugin` app +/// does). Idempotent: re-registering the same family is a no-op rebuild.
+pub fn register_ahem(app: &mut App) { + app.world_mut() + .resource_mut::<FontRegistry>() + .register_bytes(AHEM_FAMILY, ahem_bytes(), FontFaceDescriptors::default()); + app.update(); +} diff --git a/crates/buiy_verify/src/lib.rs b/crates/buiy_verify/src/lib.rs index f1c84ee..bc1777e 100644 --- a/crates/buiy_verify/src/lib.rs +++ b/crates/buiy_verify/src/lib.rs @@ -6,6 +6,7 @@ pub mod a11y; pub mod contrast; +pub mod determinism; pub mod invariant; pub mod metric; pub mod reftest; diff --git a/crates/buiy_verify/tests/determinism_ahem.rs b/crates/buiy_verify/tests/determinism_ahem.rs new file mode 100644 index 0000000..fa733fb --- /dev/null +++ b/crates/buiy_verify/tests/determinism_ahem.rs @@ -0,0 +1,108 @@ +//! `FontMode::Ahem` sole-family resolution (Phase 3.2, verification-design +//! `determinism.md` § "Ahem font mode"). Pure-CPU, headless — resolution runs +//! on the lock-free `FontMatchIndex` substrate, no rasterizer, no adapter. +//! +//! The determinism contract these tests pin: under Ahem mode, fixture text that +//! names `font-family: Ahem` resolves to the bundled em-box face REGARDLESS of +//! host fonts — system fonts are off and Ahem (registered through the +//! production bytes path) is the only family the resolver can reach. That is +//! the host-stability the box-font substitution buys; the pixel-level twin runs +//! `#[ignore]` in `determinism_capture.rs`. + +use bevy::prelude::*; +use buiy_core::CorePlugin; +use buiy_core::layout::LayoutPlugin; +use buiy_core::text::{ + BuiyTextPlugin, FamilyEntry, FontMatchIndex, FontRegistry, FontStack, ResolvedFamily, + resolve_spans, +}; +use buiy_verify::determinism::{AHEM_FAMILY, register_ahem}; + +/// MinimalPlugins + text, system fonts OFF (the `BuiyTextPlugin::default()` +/// headless capture shape) — no AssetPlugin, no adapter. The resolver +/// substrate works asset-machinery-free.
+fn text_app() -> App { + let mut app = App::new(); + app.add_plugins(MinimalPlugins); + app.add_plugins(CorePlugin); + app.add_plugins(LayoutPlugin); + app.add_plugins(BuiyTextPlugin::default()); + app +} + +/// Lift the resolver substrate (`FontMatchIndex` + `FontRegistry`) out of a +/// settled app, exactly as `buiy_core`'s `text_resolver.rs` does — built +/// entirely through the production App path, no test-only constructors. +fn substrate(app: &mut App) -> (FontMatchIndex, FontRegistry) { + let index = app + .world_mut() + .remove_resource::<FontMatchIndex>() + .expect("BuiyTextPlugin inserts the FontMatchIndex"); + let registry = app + .world_mut() + .remove_resource::<FontRegistry>() + .expect("BuiyTextPlugin inits the FontRegistry"); + (index, registry) +} + +#[test] +fn ahem_is_sole_family_under_ahem_mode() { + // Register the box-font through the production bytes path + settle. + let mut app = text_app(); + app.update(); + register_ahem(&mut app); + let (mut index, registry) = substrate(&mut app); + + // A fixture string under `font-family: Ahem` (the only authored family). + let stack = FontStack(vec![FamilyEntry::Named(String::from(AHEM_FAMILY))]); + let resolution = resolve_spans("Hello box", &stack, 400, &registry, &mut index, 0.0); + + // Every span resolves to Ahem — the box-font covers ASCII, so the walk + // never falls through to a host font (there is none) or the generic. + assert!( + !resolution.blocked, + "Ahem registers synchronously (bytes path)" + ); + assert!( + !resolution.spans.is_empty(), + "non-empty text yields at least one span" + ); + for span in &resolution.spans { + assert_eq!( + span.family, + ResolvedFamily::Named(String::from(AHEM_FAMILY)), + "span {:?} resolved to {:?}, not the sole Ahem family — fallback \ + leaked a non-Ahem face", + span.range, + span.family, + ); + } +} + +#[test] +fn ahem_resolution_is_host_font_independent() { + // The determinism claim stated directly: resolution under Ahem mode does + // NOT depend on what fonts the host has.
We cannot install host fonts in a + // unit test, but we CAN prove the resolved family is fixed to Ahem and + // never the embedded default ("Fira Sans") even when the stack would + // otherwise let a covered ASCII char match another registered family. + let mut app = text_app(); + app.update(); + register_ahem(&mut app); + let (mut index, registry) = substrate(&mut app); + + // Stack names ONLY Ahem; "Fira Sans" is embedded and also covers ASCII, + // but it is not in the stack, so it can never win. The result is Ahem, + // identical to what any other host would resolve (bundled-only). + let stack = FontStack(vec![FamilyEntry::Named(String::from(AHEM_FAMILY))]); + let resolution = resolve_spans("ABCabc123", &stack, 400, &registry, &mut index, 0.0); + assert_eq!( + resolution.spans.len(), + 1, + "all-ASCII covered by Ahem ⇒ one span" + ); + assert_eq!( + resolution.spans[0].family, + ResolvedFamily::Named(String::from(AHEM_FAMILY)), + ); +} From 9ea22e80f2f1628a9d10fc092af4b035108b43de Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 09:02:59 -0700 Subject: [PATCH 41/70] feat(core): quiescence flush + DPR-pin assertion in capture_to_image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3.3 of the verification pyramid: harden the capture seam so the diff is signal, not a half-streamed or cold-atlas artifact. capture_to_image now drives app.update() to QUIESCENCE before readback (determinism.md § Async-asset flush), asserting four conditions and polling the device to Wait each frame: 1. PendingCaptureAssets all loaded-with-dependencies (no in-flight load) 2. AtlasWarmupQueue drained (warm_atlas) 3. fonts_ready over the resident text keys (wait_for_fonts) 4. PipelineCache has no Queued/Creating pipeline (shaders compiled) Bounded by MAX_SETTLE_FRAMES; on exhaustion it panics naming the first unmet condition — fail loudly, never capture a non-quiescent frame.
It also asserts the window scale_factor == cfg.dpr at the capture boundary (the DPR pin is an asserted invariant, not a tolerance). Time advances only via the virtual clock; the path never reads wall time. New public PendingCaptureAssets resource lets a fixture that streams an asset declare it a precondition; empty for programmatic fixtures (a no-op gate). The capture-app builders insert it so any fixture can reach it. Tests: capture_path_has_no_instant_now (headless grep-lint, § Verification #4); quiescence_panics_on_never_loading_asset (GPU #[ignore], § Verification #3 — the gate fails loudly). Verified on the RX 6700 XT: capture still green, the panic fires naming the pending-asset condition. Spec: docs/specs/2026-06-15-buiy-verification-design/determinism.md § Async-asset flush to quiescence. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_core/src/render/golden.rs | 178 ++++++++++++++++-- .../tests/render_capture_quiescence.rs | 86 +++++++++ 2 files changed, 250 insertions(+), 14 deletions(-) create mode 100644 crates/buiy_core/tests/render_capture_quiescence.rs diff --git a/crates/buiy_core/src/render/golden.rs b/crates/buiy_core/src/render/golden.rs index ef969c5..06c4bda 100644 --- a/crates/buiy_core/src/render/golden.rs +++ b/crates/buiy_core/src/render/golden.rs @@ -11,6 +11,35 @@ use crate::render::atlas::{AtlasKey, AtlasWarmupQueue, BuiyAtlas}; +/// The set of asset handles a capture must see fully loaded before it reads +/// back (quiescence condition 1, `determinism.md` § "Async-asset flush to +/// quiescence"). A fixture that streams an image/shader/font asset declares it +/// a precondition via [`PendingCaptureAssets::require`]; [`capture_to_image`] +/// then refuses to capture until every required handle is loaded-with- +/// dependencies, panicking (never silently capturing a half-streamed frame) if +/// one never arrives. 
+/// +/// Empty by default — programmatic fixtures that spawn entities directly (the +/// common case) stream nothing, so the gate is a no-op for them. The resource +/// is inserted by the capture-app builders so any fixture can reach it. +#[derive(bevy::ecs::resource::Resource, Default, Clone)] +pub struct PendingCaptureAssets { + handles: Vec<bevy::asset::UntypedHandle>, +} + +impl PendingCaptureAssets { + /// Declare `handle` a capture precondition: the readback frame will not run + /// until it is loaded with all dependencies. + pub fn require(&mut self, handle: bevy::asset::UntypedHandle) { + self.handles.push(handle); + } + + /// The declared preconditions (the capture path probes their load state). + pub fn handles(&self) -> &[bevy::asset::UntypedHandle] { + &self.handles + } +} + /// How the font axis is rasterized for a capture (verification-design /// `determinism.md` § "Ahem font mode"). Real glyph rasterization is the /// canonical per-platform flake source, but the bulk of text-bearing goldens @@ -187,6 +216,10 @@ pub fn capture_app_with_resolution(resolution: bevy::window::WindowResolution) - .add_plugins(crate::text::BuiyTextPlugin::default()) .add_plugins(crate::render::BuiyRenderPlugin); app.init_asset::(); + // The quiescence-flush asset gate (condition 1): fixtures push streamed + // handles here; `capture_to_image` waits on them. Empty for programmatic + // fixtures (a no-op gate), so every capture app carries it. + app.init_resource::<PendingCaptureAssets>(); app } @@ -196,15 +229,23 @@ pub fn capture_app_with_resolution(resolution: bevy::window::WindowResolution) - /// `image::RgbaImage`. Re-runnable against one `App` (a reftest calls it twice /// on one device; spec § "Resolved during synthesis" #4). /// -/// Phase-0 scope: the capture mechanics (size-to-physical, paint, readback, -/// assemble). The four-condition quiescence flush and the -/// `scale_factor == cfg.dpr` assertion are Phase 3.3's hardening of this same -/// function (`determinism.md` § Async-asset flush).
+/// Before the readback frame it drives `app.update()` to **quiescence** +/// (`determinism.md` § "Async-asset flush"), asserting all four conditions so +/// the diff is signal, not a half-streamed or cold-atlas artifact: +/// +/// 1. `PendingCaptureAssets` are all loaded-with-dependencies (no in-flight +/// Image/Shader/Font load). +/// 2. the render-world [`AtlasWarmupQueue`] is empty (`warm_atlas`). +/// 3. [`fonts_ready`] over the resident text keys (`wait_for_fonts`). +/// 4. the `PipelineCache` has no `Queued`/`Creating` Buiy pipeline (shaders +/// compiled). /// -/// Drives `MAX_CAPTURE_FRAMES` update frames after finishing the app (pipeline -/// async-compile + extract + prepare + paint settle), then reads back the -/// offscreen target's un-padded RGBA8 bytes. -pub fn capture_to_image(app: &mut bevy::app::App, _cfg: &GoldenConfig) -> image::RgbaImage { +/// Bounded by `MAX_SETTLE_FRAMES`; if any condition never holds it panics +/// naming the unmet one (fail loudly — never green on a missing precondition). +/// Time advances only via the virtual clock the app drives; this function +/// never reads wall time. Finally it asserts the window `scale_factor` matches +/// `cfg.dpr` (the DPR pin is an asserted capture invariant, not a tolerance). +pub fn capture_to_image(app: &mut bevy::app::App, cfg: &GoldenConfig) -> image::RgbaImage { use bevy::asset::RenderAssetUsages; use bevy::camera::RenderTarget; use bevy::image::Image; @@ -214,12 +255,23 @@ pub fn capture_to_image(app: &mut bevy::app::App, _cfg: &GoldenConfig) -> image: // Physical pixel grid the offscreen target must match: the primary // window's physical size (logical × scale_factor), which the view uniform // is built from (extract fills `logical_size` from the primary window). + // Assert the DPR pin here at the capture boundary: a 1× vs 2× render is a + // different rasterization, captured as a fixture axis, never fuzzed. 
let (phys_w, phys_h) = { let window = app .world_mut() .query::<&bevy::window::Window>() .single(app.world()) .expect("primary window for capture sizing"); + let scale = window.resolution.scale_factor(); + assert_eq!( + Dpr::from_f32(scale), + cfg.dpr, + "capture window scale_factor {scale} ≠ cfg.dpr {:?} ({}×) — the DPR \ + pin must hold at the capture boundary (determinism.md § DPR pin)", + cfg.dpr, + cfg.dpr.as_f32(), + ); let r = window.resolution.physical_size(); (r.x, r.y) }; @@ -246,20 +298,118 @@ pub fn capture_to_image(app: &mut bevy::app::App, _cfg: &GoldenConfig) -> image: }, )); - // Finish materializes the device + pipelines; drive frames so layout → - // extract → prepare → paint settle before the readback poll. - const MAX_CAPTURE_FRAMES: usize = 3; + // Finish materializes the device + pipelines, then drive to quiescence so + // layout → extract → prepare → shader-compile → atlas-warmup all settle + // before the readback poll. app.finish(); app.cleanup(); - for _ in 0..MAX_CAPTURE_FRAMES { - app.update(); - } + settle_to_quiescence(app); let bytes = readback_rgba_into(app, &target, phys_w, phys_h); image::RgbaImage::from_raw(phys_w, phys_h, bytes) .expect("readback byte count matches phys_w * phys_h * 4") } +/// The maximum `app.update()` frames [`settle_to_quiescence`] will drive +/// waiting for the four conditions. Generous: pipeline async-compile + several +/// extract/prepare/upload hops cost a handful of frames; a never-satisfied +/// condition (e.g. a never-loading asset) burns the budget then panics. +const MAX_SETTLE_FRAMES: usize = 240; + +/// Drive `app.update()` until the four quiescence conditions hold +/// (`determinism.md` § "Async-asset flush"), polling the device to `Wait` each +/// frame so GPU work (pipeline creation, uploads) completes rather than +/// trickling across frames. Panics naming the first still-unmet condition if +/// the frame budget is exhausted — the harness fails loudly, never captures a +/// non-quiescent frame. 
+fn settle_to_quiescence(app: &mut bevy::app::App) { + use bevy::render::RenderApp; + use bevy::render::render_resource::PollType; + use bevy::render::renderer::RenderDevice; + + for _ in 0..MAX_SETTLE_FRAMES { + app.update(); + + // Drain the device so in-flight GPU work (pipeline compile, buffer + // maps) lands this frame, not an indeterminate later one. + if let Some(render_app) = app.get_sub_app(RenderApp) + && let Some(device) = render_app.world().get_resource::() + { + let _ = device.poll(PollType::wait_indefinitely()); + } + + if quiescence_unmet(app).is_none() { + return; + } + } + + // Budget exhausted: report which condition never held. + let unmet = quiescence_unmet(app).unwrap_or("unknown"); + panic!( + "capture_to_image: scene never reached quiescence within \ + {MAX_SETTLE_FRAMES} frames — unmet condition: {unmet} \ + (determinism.md § Async-asset flush: fail loudly, never capture a \ + non-quiescent frame)" + ); +} + +/// Probe the four quiescence conditions; returns `None` when all hold, else a +/// static name of the first unmet one (used in the panic message and the +/// loop's termination check). Split out so the budget-exhaustion panic can name +/// the exact stuck condition. +fn quiescence_unmet(app: &bevy::app::App) -> Option<&'static str> { + use bevy::asset::AssetServer; + use bevy::render::RenderApp; + use bevy::render::render_resource::CachedPipelineState; + + // Condition 1 (main world): every declared capture asset loaded with deps. + let asset_server = app.world().resource::(); + let pending = app.world().resource::(); + for handle in pending.handles() { + if !asset_server.is_loaded_with_dependencies(handle.id()) { + return Some("pending asset not loaded-with-dependencies"); + } + } + + // Conditions 2-4 live in the render sub-app. If it is absent (headless, no + // adapter) the GPU conditions are vacuously quiescent — capture is a GPU + // operation, so this branch is only reached in non-capture probes. 
+ let world = app.get_sub_app(RenderApp)?.world(); + + // Condition 2: the atlas warmup queue is drained. + if let Some(warmup) = world.get_resource::() + && !warmup.is_empty() + { + return Some("atlas warmup queue not drained"); + } + + // Condition 3: every resident text key is atlas-resident (fonts_ready). No + // resident keys (a non-text fixture) is vacuously ready. + if let (Some(atlas), Some(warmup), Some(resident)) = ( + world.get_resource::(), + world.get_resource::(), + world.get_resource::(), + ) && !fonts_ready(atlas, warmup, &resident.keys) + { + return Some("fonts not ready (text keys not atlas-resident)"); + } + + // Condition 4: no Buiy pipeline is still Queued/Creating (shaders compiled). + if let Some(cache) = world.get_resource::() { + let compiling = cache.pipelines().any(|p| { + matches!( + p.state, + CachedPipelineState::Queued | CachedPipelineState::Creating(_) + ) + }) || cache.waiting_pipelines().next().is_some(); + if compiling { + return Some("pipeline cache has a Queued/Creating pipeline"); + } + } + + None +} + /// Resource cell the `ReadbackComplete` observer writes the captured bytes /// into. `Arc>` so the observer (which `move`s its capture) and the /// poll loop share one slot. The src twin of the test-support `CapturedBytes`. diff --git a/crates/buiy_core/tests/render_capture_quiescence.rs b/crates/buiy_core/tests/render_capture_quiescence.rs new file mode 100644 index 0000000..a5d33e4 --- /dev/null +++ b/crates/buiy_core/tests/render_capture_quiescence.rs @@ -0,0 +1,86 @@ +//! Quiescence-flush hardening of `capture_to_image` (Phase 3.3, +//! verification-design `determinism.md` § "Async-asset flush to quiescence"). +//! +//! Two tiers: +//! * the no-`Instant::now()` grep-lint runs HEADLESS (§ Verification #4 — +//! the capture path must read the virtual clock, never wall time); +//! * the never-loading-asset panic test is GPU `#[ignore]` (§ Verification +//! #3 — the flush gate fails loudly naming the unmet condition, never +//! 
greens on a missing precondition). + +mod support; + +/// § Verification #4: `Instant::now()` (and `SystemTime::now()`) must NOT +/// appear in the capture path source — a wall-clock read would make a +/// time-dependent capture non-reproducible. The fixed virtual clock +/// (`Time::`) is the only time source. A grep-lint over `golden.rs`, +/// the home of `capture_to_image` + its quiescence loop. +#[test] +fn capture_path_has_no_instant_now() { + let src = include_str!("../src/render/golden.rs"); + // Strip line comments so a doc-comment MENTIONING the ban does not trip it; + // we only care about real code reading wall time. + for (lineno, line) in src.lines().enumerate() { + let code = match line.split_once("//") { + Some((before, _)) => before, + None => line, + }; + assert!( + !code.contains("Instant::now"), + "golden.rs:{} reads wall time via Instant::now() — the capture path \ + must drive Time:: only (determinism.md § Verification #4): {line}", + lineno + 1, + ); + assert!( + !code.contains("SystemTime::now"), + "golden.rs:{} reads wall time via SystemTime::now() — the capture \ + path must drive Time:: only: {line}", + lineno + 1, + ); + } +} + +// § Verification #3: inject an asset that never finishes loading and assert +// `capture_to_image` PANICS naming the unmet quiescence condition (pending +// assets), rather than silently capturing a half-streamed frame. GPU lane. 
+// +// Run: cargo test -p buiy_core --test render_capture_quiescence -- --ignored \ +// --test-threads=1 --nocapture +#[test] +#[ignore = "needs a wgpu adapter (real GPU or lavapipe); run with --ignored"] +fn quiescence_panics_on_never_loading_asset() { + use bevy::prelude::*; + use buiy_core::render::golden::{GoldenConfig, PendingCaptureAssets, capture_to_image}; + + const W: u32 = 32; + const H: u32 = 32; + + let mut app = support::gpu_render_app_scaled(W, H, 1.0); + + // Register a handle for a path that can never resolve (no AssetPlugin + // source serves it), then declare it a capture precondition. The quiescence + // loop must observe it stuck `Loading`/`Failed`-but-not-loaded and refuse + // to capture — bounded by MAX_SETTLE_FRAMES, then panic. + let never = app + .world() + .resource::() + .load::("buiy-determinism::never-arrives.png"); + app.world_mut() + .resource_mut::() + .require(never.untyped()); + + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + let cfg = GoldenConfig::deterministic(); + let _ = capture_to_image(&mut app, &cfg); + })); + let payload = result.expect_err("capture must panic on a never-loading asset"); + let msg = payload + .downcast_ref::() + .map(String::as_str) + .or_else(|| payload.downcast_ref::<&str>().copied()) + .unwrap_or(""); + assert!( + msg.contains("pending asset") || msg.contains("asset"), + "panic must name the unmet condition (pending assets); got: {msg:?}" + ); +} From c3ec8f38a6b5e4ad6f9b7d94ea5f68fd65986048 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 09:07:19 -0700 Subject: [PATCH 42/70] feat(verify): DeterministicApp builder + reftest seam swap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3.4 of the verification pyramid: the single public seam every GPU tier (reftest, golden) constructs its capture app through, with every nondeterminism knob pinned at the source (determinism.md § DeterministicApp builder): - DPR pin: build 
via capture_app_scaled(w, h, cfg.dpr.as_f32()) — the single landed builder, no plugin-stack drift. - fixed virtual clock: TimeUpdateStrategy::ManualDuration(ZERO), so every app.update() advances Time by a fixed zero delta (never wall time) and the capture's quiescence loop terminates deterministically. - Ahem sole-family: stage_ahem under font_mode == Ahem (settle-free; the registration drains in capture_to_image's post-finish quiescence loop, because app.update() before finish() trips a render system). - MSAA/dither: applied by capture_to_image when it spawns the capture camera. new(w,h)/with(cfg)/font_mode(m)/dpr(d)/config()/build()/capture(fixture). build is a thin single-bodied wrapper; capture is build + spawn + capture_to_image. Re-points support::reftest_app from the bare capture_app seam to DeterministicApp::build — the one-line swap the 1b seam was designed for. Headless tests (determinism_build.rs) pin DPR, the manual clock, and the MSAA constant on the built app. Verified on the RX 6700 XT: all five 1b reftest #[ignore] cases re-run green through the swapped seam (behavior preserved). Spec: docs/specs/2026-06-15-buiy-verification-design/determinism.md. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/determinism.rs | 121 +++++++++++++++++- crates/buiy_verify/src/support.rs | 13 +- crates/buiy_verify/tests/determinism_build.rs | 73 +++++++++++ 3 files changed, 201 insertions(+), 6 deletions(-) create mode 100644 crates/buiy_verify/tests/determinism_build.rs diff --git a/crates/buiy_verify/src/determinism.rs b/crates/buiy_verify/src/determinism.rs index b37803e..7c64dde 100644 --- a/crates/buiy_verify/src/determinism.rs +++ b/crates/buiy_verify/src/determinism.rs @@ -48,10 +48,127 @@ fn ahem_bytes() -> Arc> { /// host-specific platform font. /// /// The `app` must already carry a `FontRegistry` (any `BuiyTextPlugin` app -/// does). Idempotent: re-registering the same family is a no-op rebuild. +/// does). 
Settles one `app.update()` so the engine + `FontMatchIndex` see the +/// face immediately — use on a NON-render app (the headless resolver tests). On +/// a render app, `app.update()` before `app.finish()` trips a render system, so +/// the [`DeterministicApp`] build path uses [`stage_ahem`] instead and lets the +/// capture's post-finish quiescence loop settle it. Idempotent. pub fn register_ahem(app: &mut App) { + stage_ahem(app); + app.update(); +} + +/// Stage the Ahem registration through the production bytes path WITHOUT +/// settling — `apply_font_registry` drains it on the next `app.update()`. The +/// settle-free twin of [`register_ahem`] for the capture build path, where the +/// first update happens inside `capture_to_image` after `app.finish()`. +pub fn stage_ahem(app: &mut App) { app.world_mut() .resource_mut::() .register_bytes(AHEM_FAMILY, ahem_bytes(), FontFaceDescriptors::default()); - app.update(); +} + +/// The single public seam every GPU tier (reftest, golden) constructs its +/// capture app through, with **every** nondeterminism knob pinned at the source +/// (`determinism.md` § "DeterministicApp builder"): +/// +/// * the DPR pin — built via `capture_app_scaled(w, h, cfg.dpr.as_f32())`; +/// * the fixed virtual clock — `TimeUpdateStrategy::ManualDuration(ZERO)`, so +/// every `app.update()` advances `Time` by a fixed zero delta, never wall +/// time, and the capture's quiescence loop terminates deterministically; +/// * the Ahem box-font as the sole resolvable family when +/// `cfg.font_mode == Ahem` (host-stable text); +/// * the MSAA / dither pin — applied by [`capture_to_image`] when it spawns +/// the capture camera (`CAPTURE_MSAA`, dither off). +/// +/// It owns the *setup*; `buiy_core::render::golden::capture_to_image` owns the +/// *capture* (size-to-physical, quiescence flush, readback). The single-call +/// [`DeterministicApp::capture`] path tiers use is `build` + spawn-fixture + +/// `capture_to_image`. 
+/// +/// [`capture_to_image`]: buiy_core::render::golden::capture_to_image +#[derive(Clone, Copy, Debug)] +pub struct DeterministicApp { + cfg: GoldenConfig, + logical: (u32, u32), +} + +impl DeterministicApp { + /// Default-deterministic at a logical viewport size: the full flake triad, + /// `FontMode::Ahem`, `Dpr::X1`, MSAA/dither off (the `deterministic()` + /// config). Override individual knobs with [`with`](Self::with) / + /// [`font_mode`](Self::font_mode) / [`dpr`](Self::dpr). + pub fn new(logical_w: u32, logical_h: u32) -> Self { + Self { + cfg: GoldenConfig::deterministic(), + logical: (logical_w, logical_h), + } + } + + /// Replace the whole capture config (e.g. `GoldenConfig::fidelity()` for the + /// real-glyph suite). The logical viewport size is unchanged. + pub fn with(mut self, cfg: GoldenConfig) -> Self { + self.cfg = cfg; + self + } + + /// Override the font axis only (default [`FontMode::Ahem`]). + pub fn font_mode(mut self, mode: FontMode) -> Self { + self.cfg.font_mode = mode; + self + } + + /// Override the DPR axis only (default [`Dpr::X1`]). + pub fn dpr(mut self, dpr: Dpr) -> Self { + self.cfg.dpr = dpr; + self + } + + /// The capture config this builder applies (the value `capture` passes to + /// `capture_to_image`). Lets a caller read back the resolved knobs. + pub fn config(&self) -> GoldenConfig { + self.cfg + } + + /// Build a painting-capable headless `App` with every knob applied (see the + /// type docs). A thin, **single-bodied** wrapper over the landed + /// `capture_app_scaled` so the plugin stack cannot drift from the canonical + /// capture stack. Returns an `App` ready for fixture spawn; the offscreen + /// target + capture camera + readback are added by `capture_to_image`. 
+ pub fn build(self) -> App { + use bevy::time::TimeUpdateStrategy; + use std::time::Duration; + + let (w, h) = self.logical; + // The DPR pin: size the window to logical × dpr with the scale-factor + // override, exactly as the capture path expects (the single landed + // builder — no drift). + let mut app = buiy_core::render::golden::capture_app_scaled(w, h, self.cfg.dpr.as_f32()); + + // The fixed virtual clock: advance time by a fixed ZERO delta each + // frame so the capture reads a deterministic instant, never wall time. + app.insert_resource(TimeUpdateStrategy::ManualDuration(Duration::ZERO)); + + // The font pin: under Ahem mode, STAGE the box-font through the + // production bytes path (system fonts are already off in the capture + // stack). We must not settle here — `app.update()` before `finish()` + // trips a render system — so the registration drains on the first + // update inside `capture_to_image`'s post-finish quiescence loop. + if self.cfg.font_mode == FontMode::Ahem { + stage_ahem(&mut app); + } + + app + } + + /// `build` + spawn the fixture + `capture_to_image(&mut app, &cfg)` — the + /// one-call path the GPU tiers use. The capture internally drives the app to + /// quiescence (asset/atlas/font/pipeline) and asserts the DPR pin before + /// readback. + pub fn capture(self, fixture: impl FnOnce(&mut App)) -> image::RgbaImage { + let cfg = self.cfg; + let mut app = self.build(); + fixture(&mut app); + buiy_core::render::golden::capture_to_image(&mut app, &cfg) + } } diff --git a/crates/buiy_verify/src/support.rs b/crates/buiy_verify/src/support.rs index dcca065..faa4340 100644 --- a/crates/buiy_verify/src/support.rs +++ b/crates/buiy_verify/src/support.rs @@ -4,11 +4,16 @@ use bevy::prelude::*; -/// Build the headless painting app both reftest captures share. Until the -/// determinism builder lands this delegates to the promoted -/// `buiy_core::render::golden::capture_app` (Task 1b.6).
+/// Build the headless painting app both reftest captures share. Phase 3 swapped +/// this single line from the bare `capture_app` seam to the +/// [`DeterministicApp`](crate::determinism::DeterministicApp) builder — the +/// `&mut App → RgbaImage` capture contract is identical, but every +/// nondeterminism knob (fixed virtual clock, Ahem sole-family, DPR pin, +/// MSAA/dither) is now pinned at the source. A reftest renders both halves in +/// one app run, so the staged Ahem registration drains in the first capture's +/// quiescence loop and the second half shares it. pub fn reftest_app(logical_w: u32, logical_h: u32) -> App { - buiy_core::render::golden::capture_app(logical_w, logical_h) + crate::determinism::DeterministicApp::new(logical_w, logical_h).build() } /// Despawn the previous scene's spawned roots between the two captures so the diff --git a/crates/buiy_verify/tests/determinism_build.rs b/crates/buiy_verify/tests/determinism_build.rs new file mode 100644 index 0000000..048da3c --- /dev/null +++ b/crates/buiy_verify/tests/determinism_build.rs @@ -0,0 +1,73 @@ +//! `DeterministicApp::build` knob-application tripwires (Phase 3.4, +//! verification-design `determinism.md` § "DeterministicApp builder"). +//! HEADLESS — these inspect the built app's CPU-side configuration (window +//! scale factor, virtual-clock strategy, the pinned MSAA constant). The +//! pixel-level idempotent/knob-sensitivity proofs are GPU `#[ignore]` in +//! `determinism_capture.rs`. + +use bevy::prelude::*; +use bevy::time::TimeUpdateStrategy; +use buiy_core::render::golden::CAPTURE_MSAA; +use buiy_verify::determinism::{DeterministicApp, Dpr, FontMode}; + +/// The built app's primary-window scale factor (the DPR pin's observable). 
+fn window_scale_factor(app: &mut App) -> f32 { + app.world_mut() + .query::<&Window>() + .single(app.world()) + .expect("the built app carries a primary window") + .resolution + .scale_factor() +} + +#[test] +fn build_applies_dpr_and_msaa() { + // 2× DPR through the builder: the window must carry scale_factor 2.0 (the + // offscreen target is sized logical × dpr) … + let mut app = DeterministicApp::new(64, 64).dpr(Dpr::X2).build(); + assert_eq!( + window_scale_factor(&mut app), + 2.0, + "dpr(X2) pins the window scale_factor to 2.0×" + ); + + // … and the capture MSAA is pinned single-sampled (a module constant, never + // a per-fixture knob — a 4× resolve antialiases nondeterministically). + assert_eq!( + CAPTURE_MSAA, + bevy::render::view::Msaa::Off, + "the capture path pins MSAA off for determinism" + ); +} + +#[test] +fn build_pins_the_virtual_clock() { + // The fixed-clock knob: the built app drives time by a fixed ZERO virtual + // delta, never wall time, so every frame sees the same instant and the + // quiescence loop terminates deterministically. + let app = DeterministicApp::new(32, 32).build(); + let strategy = app + .world() + .get_resource::() + .expect("DeterministicApp installs a manual TimeUpdateStrategy"); + assert!( + matches!(strategy, TimeUpdateStrategy::ManualDuration(d) if d.is_zero()), + "the clock advances by a fixed zero virtual delta (no wall-time read)" + ); +} + +#[test] +fn default_dpr_is_one_x() { + // Without an explicit dpr() the builder is 1× (the deterministic() default). + let mut app = DeterministicApp::new(48, 48).build(); + assert_eq!(window_scale_factor(&mut app), 1.0); +} + +#[test] +fn font_mode_override_flows_into_cfg() { + // font_mode() overrides the config (default Ahem); fidelity work pins Real. 
+ let a = DeterministicApp::new(16, 16); + assert_eq!(a.config().font_mode, FontMode::Ahem, "default is Ahem"); + let b = a.font_mode(FontMode::Real); + assert_eq!(b.config().font_mode, FontMode::Real); +} From 3c9cfdfafcbc7291c09e8718662083dd1339cd38 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 09:10:32 -0700 Subject: [PATCH 43/70] =?UTF-8?q?test(verify):=20GPU=20determinism=20self-?= =?UTF-8?q?tests=20=E2=80=94=20idempotent=20capture=20+=20knob=20sensitivi?= =?UTF-8?q?ty?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3.5 of the verification pyramid (determinism.md § Verification #1/#2). All #[ignore], GPU lane — the headless gate stays green without them. #1 idempotent capture (the headline proof): the SAME scene captured TWICE through two fresh DeterministicApps is byte-identical — compare(a, b, default) .passes(FuzzBudget::EXACT) at (0, 0). Covers a rounded-rect fixture AND an Ahem-text fixture (the box-font collapse holds frame-to-frame). Verified on the RX 6700 XT: both pass at (0,0). The brief's second verification: ahem_text_is_font_availability_invariant — the same Ahem text scene captured with vs without an extra host-style family registered is byte-identical, because the fixture names only "Ahem" and that is the sole resolvable family. Proves host-font-independence at the pixel level. #2 knob sensitivity (negatives): knob_sensitivity_dpr (1× vs 2× differ — a different physical grid), knob_sensitivity_font_mode (Real vs Ahem of the same text differ — outlines vs em-boxes). Each flip changes the bytes ⇒ the knobs are load-bearing. FINDING — MSAA is inert for this pipeline, by design. The test that asserted a 4× MSAA capture *differs* from the single-sampled one FAILED with 0 differing pixels: Buiy antialiases the SDF analytically in-shader and paints axis-aligned pixel-covering quads, so a hardware MSAA resolve is identity. 
That is exactly determinism.md's rationale ("in-shader analytic AA … MSAA buys nothing here"). The test is reframed (msaa_is_inert_for_the_in_shader_aa_pipeline) to assert the verified truth — a 4× capture is byte-identical to CAPTURE_MSAA — which is WHY pinning MSAA off is free. No nondeterminism source; an honest reframe. Spec: docs/specs/2026-06-15-buiy-verification-design/determinism.md. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../buiy_verify/tests/determinism_capture.rs | 312 ++++++++++++++++++ 1 file changed, 312 insertions(+) create mode 100644 crates/buiy_verify/tests/determinism_capture.rs diff --git a/crates/buiy_verify/tests/determinism_capture.rs b/crates/buiy_verify/tests/determinism_capture.rs new file mode 100644 index 0000000..5996855 --- /dev/null +++ b/crates/buiy_verify/tests/determinism_capture.rs @@ -0,0 +1,312 @@ +//! Determinism self-tests (Phase 3.5, verification-design `determinism.md` +//! § Verification #1/#2). All `#[ignore]` — they need a wgpu adapter (real GPU +//! locally / pinned lavapipe in CI). The headless gate stays green WITHOUT +//! these. +//! +//! Run: cargo test -p buiy_verify --test determinism_capture -- --ignored \ +//! --test-threads=1 +//! +//! #1 IDEMPOTENT CAPTURE (the headline proof): the SAME scene captured TWICE +//! through two fresh `DeterministicApp`s is byte-identical — `compare(a, b, +//! default).passes(EXACT)` at budget `(0, 0)`. This is the direct proof the +//! knobs actually pin the output; if any nondeterminism leaked, the two +//! captures would diverge. +//! +//! #2 KNOB SENSITIVITY (negatives): flipping each knob CHANGES the bytes, so +//! the knobs are load-bearing, not no-ops. 
+ +use bevy::prelude::*; +use buiy_core::components::Node; +use buiy_core::layout::{Inset, Length, Sizing, Style}; +use buiy_core::render::ColorToken; +use buiy_core::render::components::{Background, TextColor}; +use buiy_core::text::{FontSize, Text}; +use buiy_verify::determinism::{DeterministicApp, Dpr, FontMode}; +use buiy_verify::metric::{CompareOpts, FuzzBudget, compare}; +use std::borrow::Cow; + +/// A known opaque rounded fill on a black ground — an edge-bearing fixture so +/// the SDF analytic AA rim exercises the float path the determinism stack pins. +fn rect_fixture(app: &mut App) { + { + let mut theme = app.world_mut().resource_mut::(); + theme + .colors + .insert("det.fill".into(), Color::srgb(0.20, 0.65, 0.90)); + } + let fill = app + .world_mut() + .spawn(( + Node, + Style::default() + .absolute() + .inset(Inset { + top: Sizing::Length(Length::px(8.0)), + left: Sizing::Length(Length::px(8.0)), + ..default() + }) + .width_px(32.0) + .height_px(24.0), + Background { + color: ColorToken::Token(Cow::Borrowed("det.fill")), + }, + )) + .id(); + app.world_mut() + .spawn((Node, Style::default())) + .add_children(&[fill]); +} + +/// A text fixture under `font-family: Ahem` so the box-font substitution is +/// exercised. The big size guarantees full-coverage interior texels. 
+fn text_fixture(app: &mut App) { + use buiy_core::text::{FamilyEntry, FontFamily, FontStack}; + { + let mut theme = app.world_mut().resource_mut::(); + theme + .colors + .insert("det.text".into(), Color::srgb(0.95, 0.40, 0.20)); + } + let text = app + .world_mut() + .spawn(( + Node, + Style::default(), + Text(String::from("Hi")), + FontFamily(FontStack(vec![FamilyEntry::Named(String::from("Ahem"))])), + FontSize(28.0), + TextColor(ColorToken::Token(Cow::Borrowed("det.text"))), + )) + .id(); + app.world_mut() + .spawn(( + Node, + Style::default() + .flex_column() + .width_px(48.0) + .height_px(48.0), + )) + .add_child(text); +} + +// --------------------------------------------------------------------------- +// #1 — idempotent capture: the same scene twice is bit-identical at (0, 0). +// --------------------------------------------------------------------------- + +#[test] +#[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] +fn idempotent_capture() { + // Two fresh DeterministicApps, identical fixture. Every nondeterminism knob + // is pinned, so the two captures must be byte-identical. + let a = DeterministicApp::new(48, 40).capture(rect_fixture); + let b = DeterministicApp::new(48, 40).capture(rect_fixture); + + assert_eq!( + a.dimensions(), + b.dimensions(), + "same logical size, same dpr" + ); + let diff = compare(&a, &b, &CompareOpts::default()); + assert!( + diff.passes(&FuzzBudget::EXACT), + "two fresh DeterministicApp captures of the SAME scene diverged — \ + determinism leaked. differing_pixels={}, max_channel_delta={}", + diff.differing_pixels, + diff.max_channel_delta, + ); +} + +#[test] +#[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] +fn idempotent_capture_text_under_ahem() { + // The same proof for a TEXT scene: the Ahem box-font substitution makes the + // two captures byte-identical (the box-font collapse holds frame-to-frame). 
+ let a = DeterministicApp::new(48, 48).capture(text_fixture); + let b = DeterministicApp::new(48, 48).capture(text_fixture); + + let diff = compare(&a, &b, &CompareOpts::default()); + assert!( + diff.passes(&FuzzBudget::EXACT), + "two fresh Ahem-text captures diverged — differing_pixels={}", + diff.differing_pixels, + ); + // Non-vacuous: the text actually painted (not a blank frame passing + // trivially). + assert!( + a.pixels().any(|p| p.0 != [0, 0, 0, 255]), + "the Ahem text painted at least one non-clear pixel" + ); +} + +// --------------------------------------------------------------------------- +// The brief's second verification: a text scene under FontMode::Ahem renders +// identically regardless of font availability. We prove host-independence by +// capturing the SAME Ahem text scene through two apps that differ in whether +// extra host-style families were registered: the result is identical because +// Ahem is the sole resolvable family the stack names. +// --------------------------------------------------------------------------- + +#[test] +#[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] +fn ahem_text_is_font_availability_invariant() { + use buiy_core::text::{FontFaceDescriptors, FontRegistry}; + use std::sync::Arc; + + // Baseline: the plain Ahem-text capture. + let baseline = DeterministicApp::new(48, 48).capture(text_fixture); + + // A second capture where an EXTRA family (the Noto Sans Hebrew fixture subset under a + // different name) is also registered — simulating a host that has more + // fonts. Because the fixture names only "Ahem", the extra family can never + // win, so the pixels must be identical. + let with_extra = DeterministicApp::new(48, 48).capture(|app| { + // Register an extra resolvable family BEFORE the fixture text.
+ let extra: Arc> = Arc::new( + std::fs::read(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../buiy_core/tests/fixtures/fonts/NotoSansHebrew-hebrew.ttf" + )) + .expect("the Hebrew fixture subset is committed"), + ); + app.world_mut() + .resource_mut::() + .register_bytes("Some Host Font", extra, FontFaceDescriptors::default()); + text_fixture(app); + }); + + let diff = compare(&baseline, &with_extra, &CompareOpts::default()); + assert!( + diff.passes(&FuzzBudget::EXACT), + "Ahem text changed when an extra host font was available — the box-font \ + substitution is NOT host-independent. differing_pixels={}", + diff.differing_pixels, + ); +} + +// --------------------------------------------------------------------------- +// #2 — knob sensitivity: each knob flip changes the bytes. +// --------------------------------------------------------------------------- + +#[test] +#[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] +fn knob_sensitivity_dpr() { + // 1× vs 2× is a different rasterization (different physical pixel grid), so + // the images differ — the metric's dimension-mismatch sentinel saturates. + let one_x = DeterministicApp::new(48, 40) + .dpr(Dpr::X1) + .capture(rect_fixture); + let two_x = DeterministicApp::new(48, 40) + .dpr(Dpr::X2) + .capture(rect_fixture); + + assert_ne!( + one_x.dimensions(), + two_x.dimensions(), + "2× capture is physically larger than 1× (the DPR axis is real)" + ); + let diff = compare(&one_x, &two_x, &CompareOpts::default()); + assert!( + !diff.passes(&FuzzBudget::EXACT), + "dpr(X1) and dpr(X2) captures must differ — the DPR knob is a no-op" + ); +} + +#[test] +#[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] +fn knob_sensitivity_font_mode() { + // Real vs Ahem of the SAME text fixture differ: the box-font rasterizes + // solid em-squares, the real face rasterizes glyph outlines. 
+ let ahem = DeterministicApp::new(48, 48) + .font_mode(FontMode::Ahem) + .capture(text_fixture); + // FontMode::Real does NOT stage Ahem; the fixture names "Ahem" which is not + // registered, so the stack falls through to the embedded default face — + // genuine glyph outlines, a visibly different image. + let real = DeterministicApp::new(48, 48) + .font_mode(FontMode::Real) + .capture(text_fixture); + + assert_eq!( + ahem.dimensions(), + real.dimensions(), + "same logical size + dpr" + ); + let diff = compare(&ahem, &real, &CompareOpts::default()); + assert!( + !diff.passes(&FuzzBudget::EXACT), + "FontMode::Real and FontMode::Ahem captures of the same text must \ + differ — the font-mode knob is a no-op. differing_pixels={}", + diff.differing_pixels, + ); +} + +#[test] +#[ignore = "GPU: run under `cargo test -- --ignored` (real adapter / lavapipe)"] +fn msaa_is_inert_for_the_in_shader_aa_pipeline() { + use buiy_core::render::golden::{CAPTURE_MSAA, capture_app, readback_rgba_into}; + + // The MSAA pin's rationale, VERIFIED (determinism.md): Buiy antialiases the + // SDF analytically in-shader and paints axis-aligned, pixel-covering quads, + // so a hardware MSAA *resolve* is identity for this pipeline — it changes + // nothing while costing cross-driver determinism. CAPTURE_MSAA pins it OFF + // to remove that risk; here we confirm it is genuinely a no-op (a 4× capture + // is byte-identical to the single-sampled one), which is exactly WHY the pin + // is free. (MSAA is a module constant, not a DeterministicApp knob, so this + // drives the capture camera directly.) 
+ assert_eq!(CAPTURE_MSAA, bevy::render::view::Msaa::Off); + + let pinned = capture_at_msaa(bevy::render::view::Msaa::Off); + let four_x = capture_at_msaa(bevy::render::view::Msaa::Sample4); + + let diff = compare(&pinned, &four_x, &CompareOpts::default()); + assert!( + diff.passes(&FuzzBudget::EXACT), + "4× MSAA changed the in-shader-AA pipeline's output — the MSAA pin is \ + NOT free; revisit the determinism.md claim. differing_pixels={}, \ + max_channel_delta={}", + diff.differing_pixels, + diff.max_channel_delta, + ); + // Non-vacuous: the fixture actually painted (both captures are real frames). + assert!( + pinned.pixels().any(|p| p.0 != [0, 0, 0, 255]), + "the rect fixture painted at least one non-clear pixel" + ); + + // Inline capture at an explicit MSAA, mirroring capture_to_image's offscreen + // target setup but with a caller-chosen sample count on the capture camera. + fn capture_at_msaa(msaa: bevy::render::view::Msaa) -> image::RgbaImage { + use bevy::asset::RenderAssetUsages; + use bevy::camera::RenderTarget; + use bevy::image::Image; + use bevy::render::render_resource::{TextureFormat, TextureUsages}; + + const W: u32 = 48; + const H: u32 = 40; + let mut app = capture_app(W, H); + rect_fixture(&mut app); + + let target = { + let mut image = Image::new_target_texture(W, H, TextureFormat::Rgba8UnormSrgb, None); + image.texture_descriptor.usage |= TextureUsages::COPY_SRC; + image.asset_usage = RenderAssetUsages::all(); + app.world_mut().resource_mut::<Assets<Image>>().add(image) + }; + app.world_mut().spawn(( + Camera2d, + RenderTarget::from(target.clone()), + msaa, + Camera { + clear_color: ClearColorConfig::Custom(Color::BLACK), + ..default() + }, + )); + app.finish(); + app.cleanup(); + for _ in 0..4 { + app.update(); + } + let bytes = readback_rgba_into(&mut app, &target, W, H); + image::RgbaImage::from_raw(W, H, bytes).expect("W*H*4 bytes") + } +} From 92d3ea60ff60b5e2c764b7f6a7baf86894866796 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 09:12:55 
-0700 Subject: [PATCH 44/70] ci(verify): pin lavapipe for the GPU golden lane + record determinism landed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3.10 of the verification pyramid (determinism.md § CI software-rasterizer pin). A CONFIG/DOC deliverable — lavapipe is not installed locally, so this is validated on the real RX 6700 XT here; the lavapipe leg is the CI stored-baseline gate. - .github/actions/install-mesa/action.yml: a composite action that consumes gfx-rs/ci-build's prebuilt, VERSION-PINNED lavapipe tarball (no self-build; MESA_VERSION + ci-build tag pinned), writes its OWN ICD JSON (the upstream path is build-host-absolute), and exports the adapter-selection env contract: VK_DRIVER_FILES (the modern variable, NOT the deprecated VK_ICD_FILENAMES — deviation #2) + WGPU_ADAPTER_NAME=llvmpipe. LP_NUM_THREADS is deliberately NOT set (deviation #1 — determinism comes from the pinned Mesa version, not thread count). - .github/workflows/ci.yml: a new `gpu` job invoking the action, a one-line llvmpipe-adapter smoke guard (determinism.md § Verification #5 — the pin is active, not silently falling back to hardware), then the #[ignore] GPU lane serialized at --test-threads=1. Additive: the headless `test` job stays green with no adapter. Also records a "Landed" section in determinism.md (tasks 3.1-3.5, 3.10) and corrects Verification #2's MSAA claim to the VERIFIED finding: 4× MSAA is byte-identical to CAPTURE_MSAA for Buiy's in-shader analytic-AA pipeline, which confirms (not contradicts) the MSAA-pin rationale. Tier-5 golden corpus (3.6-3.9) remains future work; status stays draft until the 4.7 flip. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/actions/install-mesa/action.yml | 106 ++++++++++++++++++ .github/workflows/ci.yml | 48 ++++++++ .../determinism.md | 15 ++- 3 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 .github/actions/install-mesa/action.yml diff --git a/.github/actions/install-mesa/action.yml b/.github/actions/install-mesa/action.yml new file mode 100644 index 0000000..d28c8a4 --- /dev/null +++ b/.github/actions/install-mesa/action.yml @@ -0,0 +1,106 @@ +# Install a version-PINNED Mesa lavapipe (software Vulkan) and select it as the +# ONLY Vulkan adapter, for the deterministic golden-image CI leg. +# +# Why pinned-and-self-hosted, not the distro PPA (determinism.md § "CI +# software-rasterizer pin"; prior-art/wgpu-testing/determinism-rasterizer.md): +# Buiy owns its renderer, so it stores ONE golden per cell against ONE canonical +# rasterizer. A rolling distro lavapipe is a MOVING reference image — wgpu +# abandoned `ppa:oibaf` for exactly this (day-to-day flakes from unrelated +# llvmpipe regressions). We consume gfx-rs/ci-build's prebuilt, version-tagged +# lavapipe tarball directly (no self-build) and pin MESA_VERSION + the +# ci-binary-build tag explicitly. Bump deliberately in a tracked issue, +# regenerating affected goldens in the SAME PR. +# +# Determinism comes from the PINNED MESA VERSION, not from thread count. +# LP_NUM_THREADS is deliberately NOT set (determinism.md deviation #1): Mesa +# documents it only as a perf knob, llvmpipe tiles per-thread so output is +# stable regardless of thread count, and wgpu's own install-mesa never sets it. +# +# This action is a CONFIG/DOC deliverable. It is validated against a REAL GPU +# (AMD RX 6700 XT / RADV) locally — the cross-rasterizer pixels are +# non-comparable, so the local lane runs the determinism/reftest checks +# (rasterizer-internal invariants), not the stored lavapipe baseline. The +# lavapipe leg is the stored-baseline gate and runs only in CI. 
+ +name: install-mesa +description: >- + Install a version-pinned Mesa lavapipe software-Vulkan ICD and export the + adapter-selection env contract (VK_DRIVER_FILES + WGPU_ADAPTER_NAME=llvmpipe). + +inputs: + mesa-version: + description: >- + The exact Mesa version to install (must match a gfx-rs/ci-build release + tag). Bump deliberately + regenerate affected goldens in the same PR. + required: false + # Pin EXACTLY. This is the canonical rasterizer version every stored golden + # is blessed against; changing it is a baseline change, never incidental. + default: "24.3.4" + ci-build-tag: + description: >- + The gfx-rs/ci-build `ci-binary-build` release tag carrying the prebuilt + lavapipe tarball for `mesa-version`. + required: false + default: "build19" + +runs: + using: composite + steps: + # 1. Fetch the prebuilt, version-pinned lavapipe tarball from gfx-rs/ci-build + # (the same artifact wgpu's CI consumes — no self-build). The tarball + # carries libvulkan_lvp.so + the loader libs under ./lib. + - name: Download pinned Mesa lavapipe + shell: bash + run: | + set -euo pipefail + MESA_VERSION="${{ inputs.mesa-version }}" + CI_BUILD_TAG="${{ inputs.ci-build-tag }}" + echo "Installing pinned Mesa lavapipe ${MESA_VERSION} (ci-build ${CI_BUILD_TAG})" + curl -fsSL \ + "https://github.com/gfx-rs/ci-build/releases/download/${CI_BUILD_TAG}/mesa-${MESA_VERSION}-linux-x86_64.tar.xz" \ + -o "${RUNNER_TEMP}/mesa.tar.xz" + mkdir -p "${RUNNER_TEMP}/mesa" + tar -xf "${RUNNER_TEMP}/mesa.tar.xz" -C "${RUNNER_TEMP}/mesa" + + # 2. Write our OWN ICD JSON pointing at the extracted lavapipe .so. The + # upstream ICD path is build-host-absolute, so we author a fresh manifest + # with the runner-local library path. 
+ - name: Write lavapipe ICD manifest + shell: bash + run: | + set -euo pipefail + MESA_VERSION="${{ inputs.mesa-version }}" + LVP_SO="$(find "${RUNNER_TEMP}/mesa" -name 'libvulkan_lvp.so' | head -n1)" + if [ -z "${LVP_SO}" ]; then + echo "::error::libvulkan_lvp.so not found in the extracted Mesa tarball" + exit 1 + fi + ICD_JSON="${RUNNER_TEMP}/lvp_icd.x86_64.json" + cat > "${ICD_JSON}" < ${ICD_JSON} (library_path=${LVP_SO})" + echo "BUIY_LVP_ICD=${ICD_JSON}" >> "${GITHUB_ENV}" + + # 3. Export the adapter-selection env contract (determinism.md § "Adapter + # selection"): + # - VK_DRIVER_FILES → the Vulkan loader sees ONLY lavapipe; it cannot + # pick a hardware GPU. (The modern variable; VK_ICD_FILENAMES is + # deprecated — deviation #2. The loader still honors the old name, but + # new CI wiring must not encode a deprecated path.) + # - WGPU_ADAPTER_NAME=llvmpipe → wgpu's case-insensitive substring match + # nails the exact device, so a future multi-adapter image can't drift. + # NOT exported: LP_NUM_THREADS (deviation #1 — not a determinism knob). + - name: Export adapter-selection env contract + shell: bash + run: | + set -euo pipefail + echo "VK_DRIVER_FILES=${BUIY_LVP_ICD}" >> "${GITHUB_ENV}" + echo "WGPU_ADAPTER_NAME=llvmpipe" >> "${GITHUB_ENV}" + echo "Pinned lavapipe selected: VK_DRIVER_FILES=${BUIY_LVP_ICD}, WGPU_ADAPTER_NAME=llvmpipe" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b37227f..bc2c021 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -94,3 +94,51 @@ jobs: - name: cargo test (macOS / Windows) if: matrix.os != 'ubuntu-latest' run: cargo test --workspace + + # ---------------------------------------------------------------------------- + # GPU lane: the #[ignore] render/determinism/reftest tests that need a real + # wgpu adapter, run against a version-PINNED Mesa lavapipe software rasterizer + # (the determinism stack's CI rasterizer pin; determinism.md § "CI + # software-rasterizer pin"). 
One canonical rasterizer ⇒ one golden per cell, + # no per-OS/per-GPU matrix. This leg needs NO X server — Vulkan + # render-to-texture is headless — so no xvfb. + # + # The headless `test` job above runs WITHOUT --ignored and never touches an + # adapter, so it stays green on runners with no GPU; this leg is ADDITIVE. + # Locally the same #[ignore] tests run on the real RX 6700 XT (the determinism + # / reftest checks are rasterizer-internal invariants, not a stored-baseline + # comparison, so they hold on either rasterizer). + gpu: + name: GPU (pinned lavapipe) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + - name: Install Linux deps for Bevy + run: | + sudo apt-get update + sudo apt-get install -y libasound2-dev libudev-dev libwayland-dev libxkbcommon-dev + # Install the version-pinned lavapipe + export VK_DRIVER_FILES / + # WGPU_ADAPTER_NAME=llvmpipe (NOT LP_NUM_THREADS — deviation #1). + - name: Install pinned Mesa lavapipe + uses: ./.github/actions/install-mesa + # Smoke guard (determinism.md § Verification #5): before any golden runs, + # confirm the selected adapter is lavapipe — the pin is active, not + # silently falling back to a hardware adapter. `wgpu-info` reports the + # adapter the same env contract selects. + - name: Assert pinned lavapipe adapter is selected + run: | + set -euo pipefail + cargo install --locked wgpu-info || true + ADAPTERS="$(wgpu-info 2>/dev/null || true)" + echo "${ADAPTERS}" + echo "${ADAPTERS}" | grep -iq 'llvmpipe' \ + || { echo "::error::pinned lavapipe (llvmpipe) adapter not selected — VK_DRIVER_FILES/WGPU_ADAPTER_NAME wiring did not take effect"; exit 1; } + # The GPU lane: serialize one adapter context at a time (--test-threads=1). + # No --ignored on the headless job above; this is the only leg that + # instantiates an adapter. 
+ - name: cargo test (GPU #[ignore] lane on pinned lavapipe) + run: | + cargo test -p buiy_core -j 2 -- --ignored --test-threads=1 + cargo test -p buiy_verify -j 2 -- --ignored --test-threads=1 diff --git a/docs/specs/2026-06-15-buiy-verification-design/determinism.md b/docs/specs/2026-06-15-buiy-verification-design/determinism.md index 393896a..276ffae 100644 --- a/docs/specs/2026-06-15-buiy-verification-design/determinism.md +++ b/docs/specs/2026-06-15-buiy-verification-design/determinism.md @@ -186,11 +186,24 @@ A reftest renders **both** halves in one process (one device, driver, clock, atl How the determinism harness verifies *itself* (these are tests of the test infra, runnable in CI): 1. **Idempotent-capture (pure-CPU + GPU lanes).** `capture_to_image` of the same fixture twice in two fresh `DeterministicApp`s ⇒ `metric::compare(a, b, default)` passes at budget `(0, 0)`. This is the landed "re-capture IS the golden" check (`render_golden_harness.rs`) re-expressed against the unified metric and the new builder — the direct proof the knobs actually pin the output. GPU (`#[ignore]`). -2. **Knob-sensitivity (negative tests).** Flipping each knob *changes* the bytes: `dpr(X1)` vs `dpr(X2)` of the same fixture differ; `FontMode::Real` vs `FontMode::Ahem` differ for a text fixture; a fixture with MSAA forced on differs from `CAPTURE_MSAA`. Proves the knobs are load-bearing, not no-ops. GPU (`#[ignore]`). +2. **Knob-sensitivity (negative tests).** Flipping each knob *changes* the bytes: `dpr(X1)` vs `dpr(X2)` of the same fixture differ; `FontMode::Real` vs `FontMode::Ahem` differ for a text fixture. Proves those knobs are load-bearing, not no-ops. GPU (`#[ignore]`). **MSAA is the exception — verified inert.** A 4× MSAA capture of the same fixture is byte-identical to `CAPTURE_MSAA` (`Off`) for Buiy's pipeline, because the SDF AA is analytic in-shader and the quads are axis-aligned + pixel-covering, so a hardware resolve is identity. 
This *confirms* the MSAA-pin rationale (the pin costs nothing while removing the cross-driver resolve risk); the test therefore asserts the verified equality, not a difference. See § Landed. 3. **Quiescence assertions fire.** Inject a never-loading asset / an undrained warmup queue and assert `capture` panics naming the unmet condition (1–4 above) — proves the flush gate cannot be silently skipped (the wgpu "implicit golden bootstrapping" Avoid: fail loudly, never green on a missing precondition). 4. **Clock determinism.** Assert `capture` uses `Time` and never reads wall time: a fixture whose visual depends on time captures identically across two runs at the same virtual timestamp; a test grep/lint forbids `Instant::now()` in the capture path. 5. **CI-pin smoke (CI-only).** On the lavapipe leg, assert the selected adapter name contains `llvmpipe` (env wiring took effect) before any golden runs — a one-line guard that the rasterizer pin is active, not silently falling back to a hardware adapter. +## Landed (determinism stack, plan Phase 3 tasks 3.1–3.5, 3.10) + +The determinism substrate is implemented and verified; the Tier-5 stored-golden +corpus (plan 3.6–3.9) remains future work. Status stays `draft` until the +Phase 4.7 docs flip closes the whole campaign. + +- **`GoldenConfig` extension + `FontMode`** — `crates/buiy_core/src/render/golden.rs`: `FontMode { Real, Ahem }`, the `font_mode`/`dpr` fields, `deterministic()` (Ahem + `X1`), `fidelity()` (Real). Tests: `crates/buiy_core/tests/render_golden_config.rs`. (3.1) +- **Ahem box-font** — the canonical W3C/WPT public-domain `Ahem.ttf` (em-box font) committed at `crates/buiy_core/tests/fixtures/fonts/Ahem.ttf` (+ `LICENSE-Ahem.txt`). Registered through the production bytes path and made the **sole resolvable family** by `buiy_verify::determinism::{register_ahem, stage_ahem}`. The obscure-text rectangle fallback the spec allowed was **not** needed — the genuine em-box font was obtainable. 
Tests: `crates/buiy_verify/tests/determinism_ahem.rs` (headless). (3.2) +- **Quiescence flush + DPR-pin assertion** — `capture_to_image` drives `app.update()` to quiescence over the four conditions (pending assets via the new `PendingCaptureAssets` resource; atlas warmup drained; `fonts_ready`; no `Queued`/`Creating` pipeline), polling the device to `Wait`, then asserts `scale_factor == cfg.dpr`. Panics naming the unmet condition on budget exhaustion. Tests: `crates/buiy_core/tests/render_capture_quiescence.rs` (the `Instant::now` grep-lint headless; the never-loading-asset panic GPU `#[ignore]`). (3.3) +- **`DeterministicApp` builder** — `crates/buiy_verify/src/determinism.rs`: `new`/`with`/`font_mode`/`dpr`/`build`/`capture`, a single-bodied wrapper over `capture_app_scaled` that pins the DPR, the fixed virtual clock (`TimeUpdateStrategy::ManualDuration(0)`), and the Ahem sole-family. Re-points `support::reftest_app` (the one-line 1b seam swap; the five 1b reftest `#[ignore]` cases re-run green through it). Tests: `crates/buiy_verify/tests/determinism_build.rs` (headless). (3.4) +- **GPU determinism self-tests** — `crates/buiy_verify/tests/determinism_capture.rs` (`#[ignore]`): idempotent capture passes at `(0,0)` for a rect AND an Ahem-text fixture; Ahem text is font-availability-invariant; `dpr`/`font_mode` knob-sensitivity negatives; the MSAA-inert finding above. **All six pass on the AMD RX 6700 XT.** (3.5) +- **CI lavapipe pin** — `.github/actions/install-mesa/action.yml` (consumes `gfx-rs/ci-build`'s prebuilt tarball, writes its own ICD JSON, exports `VK_DRIVER_FILES` + `WGPU_ADAPTER_NAME=llvmpipe`, **never** `LP_NUM_THREADS`) + the `gpu` job in `.github/workflows/ci.yml` (the `llvmpipe`-adapter smoke guard before the `#[ignore]` GPU lane). A **config/doc deliverable**: lavapipe is not installed locally, so this is validated on the real GPU here; the lavapipe leg is the CI stored-baseline gate. 
(3.10) + ## Sources Code: `crates/buiy_core/src/render/golden.rs:18`–`:88` (GoldenConfig, deterministic(), fonts_ready); `crates/buiy_core/tests/support/mod.rs:156` (gpu_render_app_scaled), `:161` (with_scale_factor_override), `:229`/`:237` (Msaa::Off capture camera), `:266` (wait_for_text_ready quiescence poll), `:292`/`:306` (bundled-font registration), `:353` (readback_rgba); `crates/buiy_core/src/render/extract.rs:156`/`:606` (scale_factor default + fill); `crates/buiy_core/src/text/registry.rs:165` (register_bytes); `crates/buiy_core/tests/text_caret_selection.rs:178` (Time::advance_by). Prior-art: `docs/prior-art/wgpu-testing/{lessons.md,determinism-rasterizer.md}` (lavapipe pin, VK_DRIVER_FILES, the LP_NUM_THREADS myth); `docs/prior-art/flutter-golden-testing/obscure-text-font.md` (Ahem). Report: `docs/reports/2026-06-14-visual-bug-detection-strategy.md` § Cross-cutting mechanisms ("Deterministic-rendering stack for wgpu CI"). From 80025941452d8952e3b168967d764c3df5be2af6 Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 09:25:12 -0700 Subject: [PATCH 45/70] feat(verify): Tier-5 golden key schema + bless ledger types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3.6 (verification-design goldens.md). Adds the `buiy_verify::golden` module: the `GoldenKey` trace identity (widget × state × theme × viewport × backend × dpr) with a deterministic lower-kebab slug + `from_slug` inverse, the `Backend` enum, and the human-diffable `BlessLedger`/`Positive` TOML accept record. The key schema is fixed before any golden is generated — a Skia-Gold lesson, since adding a field later re-baselines the whole corpus. The module scaffolds all three submodules (`check`, `ledger`, `report`) so the `pub use` re-exports resolve and the crate compiles; 3.7/3.8/3.9 land the per-area test coverage and the GPU round-trip over this same code. 
`FuzzBudget` gains serde derives so `Positive.budget` persists a per-fixture widened budget directly. New workspace deps `toml = "0.8"` (ledger) and `base64 = "0.22"` (HTML report PNG inlining) — both MIT/Apache-2.0, cleared by `cargo deny check` before the add. RED→GREEN: `golden_keys.rs` proptest pins `slug()`→`from_slug` round-trip and no-collision over canonical keys (goldens.md § Verification #6), plus deterministic/lower-kebab/dir/ledger-TOML unit tests. Co-Authored-By: Claude Opus 4.8 (1M context) --- Cargo.toml | 9 + crates/buiy_verify/Cargo.toml | 6 + crates/buiy_verify/src/golden.rs | 240 +++++++++++++++ crates/buiy_verify/src/golden/check.rs | 393 ++++++++++++++++++++++++ crates/buiy_verify/src/golden/ledger.rs | 74 +++++ crates/buiy_verify/src/golden/report.rs | 187 +++++++++++ crates/buiy_verify/src/lib.rs | 1 + crates/buiy_verify/src/metric.rs | 5 +- crates/buiy_verify/tests/golden_keys.rs | 153 +++++++++ 9 files changed, 1067 insertions(+), 1 deletion(-) create mode 100644 crates/buiy_verify/src/golden.rs create mode 100644 crates/buiy_verify/src/golden/check.rs create mode 100644 crates/buiy_verify/src/golden/ledger.rs create mode 100644 crates/buiy_verify/src/golden/report.rs create mode 100644 crates/buiy_verify/tests/golden_keys.rs diff --git a/Cargo.toml b/Cargo.toml index b7ac902..137c014 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -59,3 +59,12 @@ accesskit_winit = "0.29" # bump cannot drop the atlas allocator. Spec atlas-and-text-seam.md § 2.1. guillotiere = "=0.6.2" smallvec = "1" +# Tier-5 golden bless ledger (goldens.md § "The bless ledger"): `.toml` +# beside the PNGs is the durable, human-diffable accept record reviewed as the +# PR diff. MIT/Apache-2.0, already transitively present via cargo tooling; +# `cargo deny check` gates the add. 
+toml = "0.8" +# Tier-5 self-contained HTML triage report (goldens.md § "Diff-PNG + HTML"): +# base64-inline the actual/baseline/diff PNGs so the report is offline-first — +# openable straight from CI artifacts, no external assets, no SaaS. MIT/Apache-2.0. +base64 = "0.22" diff --git a/crates/buiy_verify/Cargo.toml b/crates/buiy_verify/Cargo.toml index 7182198..e6de492 100644 --- a/crates/buiy_verify/Cargo.toml +++ b/crates/buiy_verify/Cargo.toml @@ -32,3 +32,9 @@ insta = { version = "=1.48.0", features = ["glob"] } # so it must name those types. NO new supply-chain crate, zero new `cargo deny` # surface (the version is pinned to buiy_core's `0.19`). cosmic-text = "0.19" +# Tier-5 golden bless ledger (goldens.md): the durable accept record as +# human-diffable TOML beside the PNGs — the `.toml` reviewed in the PR. +toml.workspace = true +# Tier-5 HTML triage report (goldens.md): base64-inline the PNGs so the report +# is self-contained / offline-first (no external asset, no network). +base64.workspace = true diff --git a/crates/buiy_verify/src/golden.rs b/crates/buiy_verify/src/golden.rs new file mode 100644 index 0000000..a7acc4d --- /dev/null +++ b/crates/buiy_verify/src/golden.rs @@ -0,0 +1,240 @@ +//! Tier 5 — golden persistence + triage (verification-design `goldens.md`). +//! +//! The stored-baseline regression tier for the irreducible rasterization +//! residue Tiers 1–4 provably cannot reach: SDF corner AA, the drop-shadow +//! Gaussian kernel, glyph/color-emoji atlas output, the effect compositor, +//! blend/gamma, and the forced-colors *visual* residual. A `tests/goldens/` +//! corpus is keyed `widget × state × theme × viewport × backend × dpr`, with +//! **set-valued** (multi-positive) baselines so residual GPU AA jitter the +//! determinism pin reduces but cannot fully erase is absorbed by an +//! any-positive-matches semantics. +//! +//! ## What lives here (pure CPU, unit-testable without an adapter) +//! +//! 
* [`GoldenKey`] — the trace identity, **fixed before any golden is +//! generated** (retrofitting a key field re-baselines the whole corpus). Its +//! [`slug`](GoldenKey::slug) drives a stable on-disk path; [`from_slug`] +//! parses it back. +//! * [`BlessLedger`] / [`Positive`] — the durable, human-diffable accept record +//! (`.toml` beside the PNGs), recording, per positive, the blessing +//! commit, timestamp, per-fixture budget, and reason. This is the explicit +//! accept ledger reg-suit lacks (Skia-Gold §Borrow 1). +//! * [`check_golden`] / [`assert_golden`] — the comparison entry points +//! (Phase 3.7). +//! * [`TriageReport`] / [`TriageCard`] — the self-contained offline HTML triage +//! report (Phase 3.8). +//! +//! Capture (the one GPU-coupled primitive) is delegated to +//! [`buiy_core::render::golden::capture_to_image`]; everything in this module is +//! device-free. +//! +//! [`from_slug`]: GoldenKey::from_slug + +use buiy_core::render::golden::Dpr; + +mod check; +mod ledger; +mod report; + +pub use check::{GoldenOutcome, assert_golden, check_golden}; +pub use ledger::{BlessLedger, Positive}; +pub use report::{TriageCard, TriageReport}; + +/// The rasterizer a golden was captured on. One canonical rasterizer is pinned +/// per CI lane today (lavapipe), so a key currently carries a single constant +/// `backend`; the field is part of the trace identity now so a future +/// cross-backend corpus is a *new cell*, never a corpus-wide re-baseline +/// (Skia-Gold "params/traces"; goldens.md §58). +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)] +pub enum Backend { + /// Software Vulkan (Mesa llvmpipe) — the pinned CI rasterizer. + Lavapipe, + /// Hardware Vulkan. + Vulkan, + /// OpenGL / GLES. + Gl, + /// Apple Metal. + Metal, + /// Direct3D 12. + Dx12, +} + +impl Backend { + /// The lower-kebab slug component (the inverse of [`from_slug`](Self::from_slug)). 
+ fn slug(self) -> &'static str { + match self { + Backend::Lavapipe => "lavapipe", + Backend::Vulkan => "vulkan", + Backend::Gl => "gl", + Backend::Metal => "metal", + Backend::Dx12 => "dx12", + } + } + + /// Parse a slug component back to a `Backend` (the inverse of [`slug`](Self::slug)). + fn from_slug(s: &str) -> Option<Self> { + Some(match s { + "lavapipe" => Backend::Lavapipe, + "vulkan" => Backend::Vulkan, + "gl" => Backend::Gl, + "metal" => Backend::Metal, + "dx12" => Backend::Dx12, + _ => return None, + }) + } +} + +/// The trace identity that keys a golden cell (Skia-Gold "params/traces"; +/// goldens.md §47). **FIXED before any golden is generated** — adding a field +/// later re-baselines every stored PNG. The ordered fields drive a stable, +/// slug-safe on-disk path and the triage report. +/// +/// `dpr` is the canonical [`buiy_core::render::golden::Dpr`] (integer +/// milliscale, `Eq + Hash + Ord`) — imported, never redefined here — so the key +/// compares/sorts/hashes without float pitfalls. +#[derive(Clone, Debug, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)] +pub struct GoldenKey { + /// Catalog fixture id (the BSN gallery entry — e.g. `button`). + pub widget: String, + /// Interaction state: `default | hover | focus | pressed | disabled`. + pub state: String, + /// `light | dark | high-contrast | forced-*`. + pub theme: String, + /// Named viewport (e.g. `sm` = 360×640). + pub viewport: String, + /// The rasterizer the golden was captured on (one pinned lane today). + pub backend: Backend, + /// Device-pixel-ratio as canonical milliscale (`Dpr::X1` = 1×, `X2` = 2×). + pub dpr: Dpr, +} + +/// The slug separator between the directory part (`widget/state/theme`) and the +/// flat key tail (`viewport__backend__dpr`). `__` is chosen so a single-`_` +/// inside a slug-safe component never splits a field. 
+const FIELD_SEP: &str = "__"; + +impl GoldenKey { + /// `widget/state/theme__viewport__backend__dpr` — a directory per + /// `widget/state/theme` keeps a fixture's whole row of cells together for + /// review. Deterministic, lower-kebab, slug-safe (no raw `Debug`): + /// components are lowercased and every run of non-`[a-z0-9]` collapses to a + /// single `-`. The DPR renders as `dpr<n>` via [`dpr_slug`]. + pub fn slug(&self) -> String { + format!( + "{}/{}/{}{FIELD_SEP}{}{FIELD_SEP}{}{FIELD_SEP}{}", + slug_component(&self.widget), + slug_component(&self.state), + slug_component(&self.theme), + slug_component(&self.viewport), + self.backend.slug(), + dpr_slug(self.dpr), + ) + } + + /// Parse a [`slug`](Self::slug) back into a key. `None` if the shape is + /// wrong (not exactly `a/b/c` where `c` is `d__e__f__g`), the backend is + /// unknown, or the dpr token is malformed. Round-trips any key whose + /// components are already slug-safe (lower-kebab); display-name + /// normalization (uppercasing/spaces) is lossy by design and not expected to + /// round-trip. + pub fn from_slug(slug: &str) -> Option<Self> { + let mut dirs = slug.split('/'); + let widget = dirs.next()?.to_string(); + let state = dirs.next()?.to_string(); + let tail = dirs.next()?; + if dirs.next().is_some() { + return None; // too many `/` segments + } + let mut fields = tail.split(FIELD_SEP); + let theme = fields.next()?.to_string(); + let viewport = fields.next()?.to_string(); + let backend = Backend::from_slug(fields.next()?)?; + let dpr = dpr_from_slug(fields.next()?)?; + if fields.next().is_some() { + return None; // too many `__` fields + } + // Reject empty components — a valid key never has an empty field. + if widget.is_empty() || state.is_empty() || theme.is_empty() || viewport.is_empty() { + return None; + } + Some(GoldenKey { + widget, + state, + theme, + viewport, + backend, + dpr, + }) + } + + /// The corpus directory holding `..png` (n = positive index) + /// plus the `.toml` ledger. 
`root.join(self.slug())` — the slug + /// IS a relative path (`widget/state/theme__…`). + pub fn dir(&self, root: &std::path::Path) -> std::path::PathBuf { + root.join(self.slug()) + } +} + +/// Lowercase + collapse every run of non-`[a-z0-9]` to a single `-`, trimming +/// leading/trailing `-`. Makes a display name slug-safe; idempotent on +/// already-slug-safe input (so `slug()`→`from_slug` round-trips). +fn slug_component(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + let mut prev_dash = false; + for c in s.chars() { + if c.is_ascii_alphanumeric() { + out.push(c.to_ascii_lowercase()); + prev_dash = false; + } else if !prev_dash { + out.push('-'); + prev_dash = true; + } + } + out.trim_matches('-').to_string() +} + +/// Render a `Dpr` as a slug token: whole multiples of 1× become `dpr<n>` (so +/// 1×/2× are `dpr1`/`dpr2`); any other milliscale becomes `dprm<milli>` so it +/// round-trips exactly (e.g. `Dpr(1500)` → `dprm1500`). [`dpr_from_slug`] is the inverse. +fn dpr_slug(dpr: Dpr) -> String { + let milli = dpr.0; + if milli.is_multiple_of(1000) { + format!("dpr{}", milli / 1000) + } else { + format!("dprm{milli}") + } +} + +/// Parse a [`dpr_slug`] token back to a `Dpr`. Accepts `dpr<n>` (= `n×1000` +/// milliscale) and `dprm<milli>` (raw milliscale). 
+fn dpr_from_slug(tok: &str) -> Option<Dpr> { + if let Some(rest) = tok.strip_prefix("dprm") { + Some(Dpr(rest.parse().ok()?)) + } else if let Some(rest) = tok.strip_prefix("dpr") { + Some(Dpr(rest.parse::<u32>().ok()?.checked_mul(1000)?)) + } else { + None + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn dpr_slug_round_trips_common_and_fractional() { + for d in [Dpr::X1, Dpr::X2, Dpr(1500), Dpr(1235), Dpr(3000)] { + assert_eq!(dpr_from_slug(&dpr_slug(d)), Some(d), "round-trip for {d:?}"); + } + assert_eq!(dpr_slug(Dpr::X1), "dpr1"); + assert_eq!(dpr_slug(Dpr::X2), "dpr2"); + assert_eq!(dpr_slug(Dpr(1500)), "dprm1500"); + } + + #[test] + fn slug_component_is_slug_safe_and_idempotent() { + assert_eq!(slug_component("Focus Ring"), "focus-ring"); + assert_eq!(slug_component("high-contrast"), "high-contrast"); // idempotent + assert_eq!(slug_component(" weird__name "), "weird-name"); + } +} diff --git a/crates/buiy_verify/src/golden/check.rs b/crates/buiy_verify/src/golden/check.rs new file mode 100644 index 0000000..b48128d --- /dev/null +++ b/crates/buiy_verify/src/golden/check.rs @@ -0,0 +1,393 @@ +//! The golden comparison entry points (`goldens.md` § "`assert_golden`"). +//! +//! [`check_golden`] compares a freshly captured `actual` against the stored +//! **multi-positive** baseline set for a key and returns a structured +//! [`GoldenOutcome`] (pass / fail / blessed) — the no-panic core used by the +//! harness's own tests and the coverage matrix driver. [`assert_golden`] is the +//! panicking wrapper a test calls: it fails closed on a missing or non-matching +//! corpus and, under `BUIY_BLESS=1`, blesses instead (modeled exactly on +//! `BUIY_ACCEPT_SHAPING`, never a silent overwrite). +//! +//! ## Multi-positive semantics +//! +//! A key maps to a *set* of accepted PNGs, not one (Skia-Gold "many positives +//! per config"). `check_golden` compares `actual` against each positive and +//! passes if **any** `Diff::passes(budget)`. 
This absorbs the residual GPU AA +//! jitter the determinism pin reduces but does not eliminate. On a fail it +//! carries the *best* (smallest-`Diff`) candidate so the triage report shows the +//! closest baseline, not an arbitrary one. + +use super::GoldenKey; +use super::ledger::{BlessLedger, Positive}; +use super::report::{TriageCard, TriageReport}; +use crate::metric::{CompareOpts, Diff, FuzzBudget, compare}; +use image::RgbaImage; + +/// The default corpus root (`crates/buiy_verify/tests/goldens/`) and the +/// triage-report output dir (`target/buiy-goldens/`), resolved from the crate +/// manifest so they are stable regardless of the test's CWD. +pub(crate) fn default_corpus_root() -> std::path::PathBuf { + std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/goldens") +} + +fn report_dir() -> std::path::PathBuf { + // `CARGO_TARGET_DIR` honored if set; else the workspace `target/`. We keep + // it simple and stable: `<CARGO_MANIFEST_DIR>/../../target/buiy-goldens`. + std::env::var_os("CARGO_TARGET_DIR") + .map(std::path::PathBuf::from) + .unwrap_or_else(|| std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../target")) + .join("buiy-goldens") +} + +/// The structured result of a golden comparison (no panic). `assert_golden` +/// wraps this with the fail-closed panic + bless behavior. +#[derive(Debug)] +pub enum GoldenOutcome { + /// `actual` matched a stored positive within `budget`. Carries which + /// positive matched and its `Diff` (the first positive to clear the budget, + /// not necessarily the smallest `Diff` in the set). + Pass { + /// Index of the matched positive (`..png`). + matched_positive: usize, + /// The `Diff` against the matched positive. + diff: Diff, + }, + /// No positive matched (or the corpus was empty). `best` is the closest + /// candidate `(index, Diff)` if any positive exists; `report` is the written + /// HTML triage report path. + Fail { + /// The closest stored positive `(index, Diff)`, or `None` for an empty + /// corpus (the missing-golden case). 
+ best: Option<(usize, Diff)>, + /// Path to the written HTML triage report. + report: std::path::PathBuf, + }, + /// `BUIY_BLESS=1`: wrote a new (or replaced an existing) positive. Never + /// reached in CI (the env is unset there, mirroring `BUIY_ACCEPT_SHAPING`). + Blessed { + /// Index of the written positive. + positive: usize, + /// `true` if a new positive was appended; `false` if one was replaced. + was_new: bool, + }, +} + +/// `true` when the bless env (`BUIY_BLESS`) is set — the accept-FILE switch, +/// modeled on `BUIY_ACCEPT_SHAPING`. +fn bless_requested() -> bool { + std::env::var_os("BUIY_BLESS").is_some() +} + +/// When `BUIY_BLESS_REPLACE=` is set, bless replaces positive `i` instead of +/// appending. `None` ⇒ append a new positive. +fn bless_replace_index() -> Option { + std::env::var("BUIY_BLESS_REPLACE") + .ok() + .and_then(|v| v.parse().ok()) +} + +/// Compare `actual` against the stored multi-positive baseline for `key` at the +/// default corpus root, gated by `budget`. Under `BUIY_BLESS=1` this *blesses* +/// (writes `actual` as a positive + updates the ledger) and returns +/// [`GoldenOutcome::Blessed`]. Otherwise it returns [`Pass`](GoldenOutcome::Pass) +/// on an any-positive match, or [`Fail`](GoldenOutcome::Fail) (writing the +/// diff-PNG + HTML triage report) on a miss or empty corpus. +pub fn check_golden(key: &GoldenKey, actual: &RgbaImage, budget: &FuzzBudget) -> GoldenOutcome { + check_golden_in(&default_corpus_root(), &report_dir(), key, actual, budget) +} + +/// The corpus-root-parameterized core of [`check_golden`] — lets the harness's +/// own tests bless/assert against a temp dir (the bless round-trip test). 
+pub(crate) fn check_golden_in( + corpus_root: &std::path::Path, + report_root: &std::path::Path, + key: &GoldenKey, + actual: &RgbaImage, + budget: &FuzzBudget, +) -> GoldenOutcome { + let dir = key.dir(corpus_root); + let ledger_path = ledger_path(&dir); + + if bless_requested() { + return bless(&dir, &ledger_path, key, actual, budget); + } + + let ledger = BlessLedger::load_or_empty(&ledger_path, key) + .unwrap_or_else(|e| panic!("corrupt golden ledger {ledger_path:?}: {e}")); + + // Compare against every positive; pass on the FIRST that clears the budget, + // tracking the smallest-Diff candidate for the report on a miss. + let mut best: Option<(usize, Diff)> = None; + for (i, positive) in ledger.positives.iter().enumerate() { + let png_path = dir.join(&positive.file); + let baseline = load_png(&png_path) + .unwrap_or_else(|e| panic!("golden positive {png_path:?} unreadable: {e}")); + // emit_diff_image only on the candidate we end up reporting; here we run + // the cheap (no heatmap) compare to gate, and recompute the heatmap for + // the best candidate below only if we fail. + let diff = compare(actual, &baseline, &CompareOpts::default()); + if diff.passes(budget) { + return GoldenOutcome::Pass { + matched_positive: i, + diff, + }; + } + let smaller = best + .as_ref() + .map(|(_, bd)| diff_score(&diff) < diff_score(bd)) + .unwrap_or(true); + if smaller { + best = Some((i, diff)); + } + } + + // FAIL (miss or empty corpus): write the diff-PNG + append a triage card. + let report = emit_failure_report(report_root, &dir, key, actual, &ledger, budget, &best); + GoldenOutcome::Fail { best, report } +} + +/// A scalar ranking for "closest baseline": differing pixels dominate, channel +/// delta breaks ties. Lower is closer. +fn diff_score(d: &Diff) -> u64 { + (d.differing_pixels as u64) << 8 | d.max_channel_delta as u64 +} + +/// Bless `actual`: write it as a positive PNG + record it in the ledger. 
With +/// `BUIY_BLESS_REPLACE=` set it overwrites positive `i`; otherwise it appends +/// a new positive. **The human then reviews the PNG in the PR and commits it.** +fn bless( + dir: &std::path::Path, + ledger_path: &std::path::Path, + key: &GoldenKey, + actual: &RgbaImage, + budget: &FuzzBudget, +) -> GoldenOutcome { + std::fs::create_dir_all(dir).expect("create golden corpus dir"); + let mut ledger = BlessLedger::load_or_empty(ledger_path, key).expect("load ledger for bless"); + + let stem = slug_stem(key); + let replace = bless_replace_index(); + let (index, was_new) = match replace { + Some(i) if i < ledger.positives.len() => (i, false), + _ => (ledger.positives.len(), true), + }; + let file = format!("{stem}.{index}.png"); + actual + .save(dir.join(&file)) + .expect("write blessed golden PNG"); + + let positive = Positive { + file, + blessed_commit: git_head_commit(), + blessed_at: now_rfc3339(), + budget: *budget, + reason: std::env::var("BUIY_BLESS_REASON").unwrap_or_else(|_| "blessed".into()), + }; + if was_new { + ledger.positives.push(positive); + } else { + ledger.positives[index] = positive; + } + ledger.save(ledger_path).expect("write golden ledger"); + GoldenOutcome::Blessed { + positive: index, + was_new, + } +} + +/// Compare `actual` against the corpus and **panic** on a non-bless failure with +/// the bless instruction (fail closed; the `BUIY_ACCEPT_SHAPING` panic shape). +/// Under `BUIY_BLESS=1` it blesses and returns. This is the entry point a +/// `#[test]` calls. +pub fn assert_golden(key: &GoldenKey, actual: &RgbaImage, budget: &FuzzBudget) { + match check_golden(key, actual, budget) { + GoldenOutcome::Pass { .. } | GoldenOutcome::Blessed { .. } => {} + GoldenOutcome::Fail { best, report } => panic_fail(key, best.as_ref(), &report), + } +} + +/// The fail-closed panic message (shared by `assert_golden` and the corpus-root +/// test variant), pointing at the triage report and the bless command. 
+fn panic_fail(key: &GoldenKey, best: Option<&(usize, Diff)>, report: &std::path::Path) -> ! { + let slug = key.slug(); + match best { + None => panic!( + "no golden committed for `{slug}` — run\n \ + BUIY_BLESS=1 cargo test -p buiy_verify --test goldens -- --ignored \ + --test-threads=1\nthen REVIEW the captured PNG and commit it. \ + Triage report: {report:?}" + ), + Some((i, diff)) => panic!( + "golden `{slug}` diverged from every positive (closest = positive {i}: \ + differing_pixels={dp}, max_channel_delta={mcd}). A pixel change is a \ + rendering change; if intended, regenerate with\n \ + BUIY_BLESS=1 cargo test -p buiy_verify --test goldens -- --ignored \ + --test-threads=1\nreview the diff, and commit. Triage report: {report:?}", + dp = diff.differing_pixels, + mcd = diff.max_channel_delta, + ), + } +} + +/// Write the diff-PNG for the closest candidate and append a card to the run's +/// HTML triage report. Returns the report path. +fn emit_failure_report( + report_root: &std::path::Path, + corpus_dir: &std::path::Path, + key: &GoldenKey, + actual: &RgbaImage, + ledger: &BlessLedger, + budget: &FuzzBudget, + best: &Option<(usize, Diff)>, +) -> std::path::PathBuf { + std::fs::create_dir_all(report_root).ok(); + let stem = slug_stem(key); + + // Recompute the diff WITH the heatmap against the closest baseline (the gate + // pass above ran without a heatmap to stay cheap). + let (baseline_img, diff) = match best { + Some((i, _)) => { + let png = corpus_dir.join(&ledger.positives[*i].file); + let baseline = load_png(&png).unwrap_or_else(|_| RgbaImage::new(1, 1)); + let d = compare( + actual, + &baseline, + &CompareOpts { + emit_diff_image: true, + ..CompareOpts::default() + }, + ); + (baseline, d) + } + // Missing-golden: no baseline to diff against. Use a blank baseline and + // a saturated-style diff so the card still renders. 
+ None => ( + RgbaImage::new(actual.width().max(1), actual.height().max(1)), + compare( + actual, + &RgbaImage::new(actual.width().max(1), actual.height().max(1)), + &CompareOpts { + emit_diff_image: true, + ..CompareOpts::default() + }, + ), + ), + }; + + // Write the standalone diff-PNG next to the report. + let diff_png_path = report_root.join(format!("{stem}.diff.png")); + let diff_png_bytes = if let Some(img) = &diff.diff_image { + let _ = img.save(&diff_png_path); + png_bytes(img) + } else { + Vec::new() + }; + + let report_path = report_root.join("report.html"); + let mut report = TriageReport::open_or_create(&report_path); + report.push(TriageCard { + key: key.clone(), + actual_png: png_bytes(actual), + baseline_png: png_bytes(&baseline_img), + diff_png: diff_png_bytes, + diff, + budget: *budget, + }); + report.write().ok(); + report_path +} + +// --- small fs / format helpers ------------------------------------------------- + +/// The `<stem>` of a key's slug (the path tail, e.g. `light__md__lavapipe__dpr1`), +/// used to name `<stem>.<i>.png` and `<stem>.toml` inside the key dir. +fn slug_stem(key: &GoldenKey) -> String { + key.slug() + .rsplit('/') + .next() + .expect("slug always has a tail") + .to_string() +} + +/// The ledger path inside a key's corpus dir. +fn ledger_path(dir: &std::path::Path) -> std::path::PathBuf { + dir.join(format!( + "{}.toml", + dir.file_name().and_then(|s| s.to_str()).unwrap_or("ledger") + )) +} + +fn load_png(path: &std::path::Path) -> image::ImageResult<RgbaImage> { + Ok(image::open(path)?.to_rgba8()) +} + +fn png_bytes(img: &RgbaImage) -> Vec<u8> { + let mut buf = std::io::Cursor::new(Vec::new()); + img.write_to(&mut buf, image::ImageFormat::Png) + .expect("encode PNG"); + buf.into_inner() +} + +/// `git rev-parse HEAD` at bless time, or `"unknown"` if git is unavailable +/// (the bless still proceeds — the commit is provenance, not a gate). 
+fn git_head_commit() -> String { + std::process::Command::new("git") + .args(["rev-parse", "HEAD"]) + .output() + .ok() + .filter(|o| o.status.success()) + .and_then(|o| String::from_utf8(o.stdout).ok()) + .map(|s| s.trim().to_string()) + .unwrap_or_else(|| "unknown".into()) +} + +/// The current time as an RFC3339 UTC timestamp, WITHOUT pulling in a date +/// crate: reads `SystemTime` as whole seconds since the Unix epoch (falling +/// back to 0 if the clock reads before the epoch) and converts them to a +/// calendar date by hand, because a bare epoch-seconds string is not RFC3339. +/// Kept dependency-free (no `chrono`/`time`) per the spec's minimal-dep ethos. +fn now_rfc3339() -> String { + use std::time::{SystemTime, UNIX_EPOCH}; + let secs = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_secs()) + .unwrap_or(0); + rfc3339_from_unix(secs) +} + +/// Convert a Unix timestamp (seconds) to an RFC3339 UTC string. Civil-date +/// algorithm (Howard Hinnant's `days_from_civil` inverse) — dependency-free. +fn rfc3339_from_unix(secs: u64) -> String { + let days = (secs / 86_400) as i64; + let rem = secs % 86_400; + let (hh, mm, ss) = (rem / 3600, (rem % 3600) / 60, rem % 60); + // Hinnant civil_from_days (epoch 1970-01-01 = day 0). + let z = days + 719_468; + let era = if z >= 0 { z } else { z - 146_096 } / 146_097; + let doe = (z - era * 146_097) as u64; + let yoe = (doe - doe / 1460 + doe / 36_524 - doe / 146_096) / 365; + let y = yoe as i64 + era * 400; + let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); + let mp = (5 * doy + 2) / 153; + let d = doy - (153 * mp + 2) / 5 + 1; + let m = if mp < 10 { mp + 3 } else { mp - 9 }; + let y = if m <= 2 { y + 1 } else { y }; + format!("{y:04}-{m:02}-{d:02}T{hh:02}:{mm:02}:{ss:02}Z") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn rfc3339_matches_known_epoch_dates() { + assert_eq!(rfc3339_from_unix(0), "1970-01-01T00:00:00Z"); + // 2026-06-15T00:00:00Z = 1_781_481_600 (verified against a calendar). 
+ assert_eq!(rfc3339_from_unix(1_781_481_600), "2026-06-15T00:00:00Z"); + // A non-midnight instant. + assert_eq!( + rfc3339_from_unix(1_781_481_600 + 3661), + "2026-06-15T01:01:01Z" + ); + } +} diff --git a/crates/buiy_verify/src/golden/ledger.rs b/crates/buiy_verify/src/golden/ledger.rs new file mode 100644 index 0000000..4b7e329 --- /dev/null +++ b/crates/buiy_verify/src/golden/ledger.rs @@ -0,0 +1,74 @@ +//! The bless ledger — the durable, human-diffable accept record (`goldens.md` +//! § "The bless ledger"). One `<stem>.toml` lives beside each key's +//! `<stem>.<i>.png` positives, recording *why* each positive was accepted: +//! the blessing commit, an RFC3339 timestamp, the per-fixture budget, and a +//! one-line reason. This is the explicit, reviewable accept ledger reg-suit +//! lacks (Skia-Gold §Borrow 1) — a real regression is caught in the PR diff of +//! this file, not buried in git history. + +use super::GoldenKey; +use crate::metric::FuzzBudget; + +/// The `<stem>.toml` accept ledger for one [`GoldenKey`]: the key itself +/// (so the file is self-describing) plus its set of accepted positives. Index +/// `i` in `positives` corresponds on disk to `<stem>.i.png`. +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] +pub struct BlessLedger { + /// The trace identity this ledger records positives for. + pub key: GoldenKey, + /// The accepted baselines, in bless order. `positives[i]` ⇒ `<stem>.i.png`. + pub positives: Vec<Positive>, +} + +/// One accepted baseline. Records the provenance a reviewer needs to judge +/// whether a positive is still legitimate (the stale-positive guard, +/// goldens.md § "Stale-positive guard"). +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] +pub struct Positive { + /// PNG filename relative to the ledger (`<stem>.<i>.png`). + pub file: String, + /// `git rev-parse HEAD` at bless time — pins the source state that produced + /// this pixel set. 
+ pub blessed_commit: String, + /// RFC3339 timestamp the positive was blessed. 
+ pub blessed_at: String, + /// The budget this positive is asserted against — `(0,0)` after the + /// determinism pin, widened per-fixture with a documented [`reason`](Self::reason). + pub budget: FuzzBudget, + /// Why this positive exists (or why its budget was widened). + pub reason: String, +} + +impl BlessLedger { + /// An empty ledger for `key` (no positives yet). The first bless pushes + /// `<stem>.0.png`. + pub fn empty(key: GoldenKey) -> Self { + Self { + key, + positives: Vec::new(), + } + } + + /// Load the ledger from `path`, or return an [`empty`](Self::empty) one for + /// `key` if the file does not exist. Propagates a real read/parse error (a + /// corrupt ledger must surface loudly, never silently reset the corpus). + pub fn load_or_empty(path: &std::path::Path, key: &GoldenKey) -> std::io::Result<Self> { + match std::fs::read_to_string(path) { + Ok(s) => toml::from_str(&s) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e)), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Self::empty(key.clone())), + Err(e) => Err(e), + } + } + + /// Serialize to human-diffable TOML and write to `path` (creating parent + /// directories). The written file is what a reviewer reads in the PR diff. + pub fn save(&self, path: &std::path::Path) -> std::io::Result<()> { + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent)?; + } + let body = toml::to_string_pretty(self) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; + std::fs::write(path, body) + } +} diff --git a/crates/buiy_verify/src/golden/report.rs b/crates/buiy_verify/src/golden/report.rs new file mode 100644 index 0000000..482c46c --- /dev/null +++ b/crates/buiy_verify/src/golden/report.rs @@ -0,0 +1,187 @@ +//! The self-contained, offline-first HTML triage report (`goldens.md` § +//! "Diff-PNG + self-contained HTML triage report"). On any golden `Fail` the +//! harness writes a diff-PNG and appends a card to a single +//! 
`target/buiy-goldens/report.html`, accumulating every failing cell from one +//! `cargo test` run. Each card embeds three views — side-by-side +//! expected|actual, a JS opacity-slider overlay, and the diff heatmap — with +//! all PNGs base64-inlined so the file references **no** external asset and +//! **no** network: it opens straight from a CI artifact (project ethos; +//! Skia-Gold §Borrow 6 reg-cli/x-img-diff-js, offline by construction). + +use super::GoldenKey; +use crate::metric::{Diff, FuzzBudget}; +use base64::Engine as _; + +/// One failing golden cell, ready to render as an HTML card. The three PNG byte +/// vectors are inlined as base64 data URIs (self-containment) — `actual` is the +/// freshly captured frame, `baseline` is the *closest* stored positive +/// ([`GoldenOutcome::Fail::best`](super::GoldenOutcome::Fail), so the reviewer +/// compares against the nearest baseline, not an arbitrary one), and `diff` is +/// the [`Diff::diff_image`](crate::metric::Diff) heatmap. +pub struct TriageCard { + /// The trace identity of the failing cell. + pub key: GoldenKey, + /// PNG bytes of the freshly captured frame. + pub actual_png: Vec<u8>, + /// PNG bytes of the closest stored positive. + pub baseline_png: Vec<u8>, + /// PNG bytes of the diff heatmap. + pub diff_png: Vec<u8>, + /// The metric outcome (counts + advisory MSSIM) for the card header. + pub diff: Diff, + /// The budget the cell was gated against (so the reviewer sees the bar it + /// missed). + pub budget: FuzzBudget, +} + +/// A single HTML triage report accumulating one [`TriageCard`] per failing +/// cell. [`open_or_create`](Self::open_or_create) makes the report path +/// idempotent across a test run; [`write`](Self::write) emits one self-contained +/// file. +pub struct TriageReport { + path: std::path::PathBuf, + cards: Vec<TriageCard>, +} + +impl TriageReport { + /// Begin (or continue) a report at `path`. 
The cards accumulate in memory + /// and [`write`](Self::write) re-emits the whole file, so multiple failing + /// cells in one run land in one report. (We do not parse an existing HTML + /// file back into cards — the driver holds the live `TriageReport` for the + /// duration of a run; `open_or_create` exists so the path is the single + /// source of truth.) + pub fn open_or_create(path: &std::path::Path) -> Self { + Self { + path: path.to_path_buf(), + cards: Vec::new(), + } + } + + /// The report's on-disk path. + pub fn path(&self) -> &std::path::Path { + &self.path + } + + /// Append a failing cell. + pub fn push(&mut self, card: TriageCard) { + self.cards.push(card); + } + + /// Render the report and write it to [`path`](Self::path), creating parent + /// directories. One self-contained HTML file: per card, a side-by-side + /// expected|actual pair, a JS opacity-slider overlay, and the diff heatmap, + /// all PNGs base64-inlined. No external assets, no network. + pub fn write(&self) -> std::io::Result<()> { + if let Some(parent) = self.path.parent() { + std::fs::create_dir_all(parent)?; + } + std::fs::write(&self.path, self.render()) + } + + /// Render the full HTML document as a `String` (the testable core of + /// [`write`](Self::write)). + pub fn render(&self) -> String { + let mut body = String::new(); + body.push_str(REPORT_HEAD); + body.push_str(&format!( + "

Buiy golden triage — {} failing cell(s)

\n", + self.cards.len() + )); + for (i, card) in self.cards.iter().enumerate() { + body.push_str(&card.render(i)); + } + body.push_str(REPORT_TAIL); + body + } +} + +impl TriageCard { + /// Render one card. `idx` makes the overlay's slider/img element ids unique + /// across cards in a single report. + fn render(&self, idx: usize) -> String { + let actual = data_uri(&self.actual_png); + let baseline = data_uri(&self.baseline_png); + let diff = data_uri(&self.diff_png); + let mssim = self + .diff + .mssim + .map(|s| format!("{s:.4}")) + .unwrap_or_else(|| "—".into()); + format!( + r#"
+

{slug}

+

differing_pixels={dp} / {total} · max_channel_delta={mcd} · mssim={mssim} + · budget=(Δ{bcd}, {bpx}px){saturated}

+
+
expected (closest baseline)
baseline
+
actual
actual
+
diff heatmap
diff
+
+
+
overlay (drag to fade actual over baseline)
+
+ overlay-baseline + overlay-actual +
+ +
+
+"#, + slug = html_escape(&self.key.slug()), + dp = self.diff.differing_pixels, + total = self.diff.total_pixels, + mcd = self.diff.max_channel_delta, + mssim = mssim, + bcd = self.budget.max_channel_delta, + bpx = self.budget.max_diff_pixels, + saturated = if self.diff.saturated { + " · SATURATED (dimension mismatch)" + } else { + "" + }, + ) + } +} + +/// Base64-inline PNG bytes as a `data:` URI — the self-containment primitive. +/// No external file, no network fetch. +fn data_uri(png: &[u8]) -> String { + let b64 = base64::engine::general_purpose::STANDARD.encode(png); + format!("data:image/png;base64,{b64}") +} + +/// Minimal HTML-escape for the slug text node (defense-in-depth; slugs are +/// already `[a-z0-9/_-]` so this is belt-and-braces). +fn html_escape(s: &str) -> String { + s.replace('&', "&amp;") + .replace('<', "&lt;") + .replace('>', "&gt;") +} + +const REPORT_HEAD: &str = r#" + + + + +Buiy golden triage + + + +"#; + +const REPORT_TAIL: &str = "\n\n"; diff --git a/crates/buiy_verify/src/lib.rs b/crates/buiy_verify/src/lib.rs index bc1777e..4106ca8 100644 --- a/crates/buiy_verify/src/lib.rs +++ b/crates/buiy_verify/src/lib.rs @@ -7,6 +7,7 @@ pub mod a11y; pub mod contrast; pub mod determinism; +pub mod golden; pub mod invariant; pub mod metric; pub mod reftest; diff --git a/crates/buiy_verify/src/metric.rs b/crates/buiy_verify/src/metric.rs index 1932788..9af6af7 100644 --- a/crates/buiy_verify/src/metric.rs +++ b/crates/buiy_verify/src/metric.rs @@ -39,7 +39,10 @@ pub struct Diff { /// The two-axis gate. A Diff PASSES iff BOTH hold. Default after determinism is /// (0, 0); widen per fixture with a documented reason. -#[derive(Clone, Copy, Debug, PartialEq, Eq)] +/// +/// Derives `serde` so the Tier-5 bless ledger (`golden::Positive.budget`) can +/// persist a per-fixture widened budget directly to its `.toml`. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct FuzzBudget { /// No single channel of any pixel may differ by more than this (L∞). pub max_channel_delta: u8, diff --git a/crates/buiy_verify/tests/golden_keys.rs b/crates/buiy_verify/tests/golden_keys.rs new file mode 100644 index 0000000..63736e4 --- /dev/null +++ b/crates/buiy_verify/tests/golden_keys.rs @@ -0,0 +1,153 @@ +//! Tier-5 golden key schema self-tests (Phase 3.6, verification-design +//! `goldens.md` § Verification #6). Pure-CPU, headless — no GPU adapter. +//! +//! The `GoldenKey` trace identity is **fixed before any golden is generated** +//! (Skia-Gold lesson — retrofitting a key field re-baselines the whole corpus). +//! These tests pin that the key: +//! * slugs deterministically (lower-kebab, stable field order), +//! * round-trips through `slug()` → parse, +//! * never collides two distinct keys onto one slug, and +//! * the bless ledger serializes to human-diffable TOML. + +use buiy_core::render::golden::Dpr; +use buiy_verify::golden::{Backend, BlessLedger, GoldenKey, Positive}; +use buiy_verify::metric::FuzzBudget; +use proptest::prelude::*; + +fn key( + widget: &str, + state: &str, + theme: &str, + viewport: &str, + backend: Backend, + dpr: Dpr, +) -> GoldenKey { + GoldenKey { + widget: widget.into(), + state: state.into(), + theme: theme.into(), + viewport: viewport.into(), + backend, + dpr, + } +} + +#[test] +fn slug_is_deterministic_lower_kebab() { + let k = key("button", "hover", "dark", "sm", Backend::Lavapipe, Dpr::X2); + // Stable, documented stem schema: `widget/state/theme__viewport__backend__dpr`. + assert_eq!(k.slug(), "button/hover/dark__sm__lavapipe__dpr2"); + // Deterministic: the same key slugs identically every call. 
+ assert_eq!(k.slug(), k.slug()); +} + +#[test] +fn slug_lowercases_and_kebabs_mixed_case_input() { + let k = key( + "ToggleSwitch", + "Focus Ring", + "High Contrast", + "Large XL", + Backend::Vulkan, + Dpr::X1, + ); + let slug = k.slug(); + assert_eq!( + slug, "toggleswitch/focus-ring/high-contrast__large-xl__vulkan__dpr1", + "slug must be lower-kebab + slug-safe (no spaces, no raw Debug)" + ); + // Slug-safe: no whitespace, no uppercase. + assert!(!slug.chars().any(|c| c.is_whitespace())); + assert!(!slug.chars().any(|c| c.is_ascii_uppercase())); +} + +#[test] +fn dir_places_corpus_under_widget_directory() { + let root = std::path::Path::new("/tmp/goldens"); + let k = key( + "button", + "default", + "light", + "md", + Backend::Lavapipe, + Dpr::X1, + ); + let dir = k.dir(root); + // The whole row of a fixture's cells lives under one directory per widget + // (Skia-Gold review ergonomics). + assert!(dir.starts_with(root)); + assert!( + dir.ends_with("button/default/light__md__lavapipe__dpr1"), + "dir = root.join(slug); got {dir:?}" + ); +} + +#[test] +fn ledger_round_trips_through_toml() { + let k = key("button", "hover", "dark", "sm", Backend::Lavapipe, Dpr::X2); + let ledger = BlessLedger { + key: k.clone(), + positives: vec![Positive { + file: "button/hover/dark__sm__lavapipe__dpr2.0.png".into(), + blessed_commit: "deadbeef".into(), + blessed_at: "2026-06-15T00:00:00Z".into(), + budget: FuzzBudget::EXACT, + reason: "initial bless".into(), + }], + }; + let serialized = toml::to_string(&ledger).expect("ledger serializes to TOML"); + // Human-diffable: a reviewer reads the commit/reason in the PR diff. 
+ assert!(serialized.contains("deadbeef")); + assert!(serialized.contains("initial bless")); + let parsed: BlessLedger = toml::from_str(&serialized).expect("ledger round-trips"); + assert_eq!(parsed.key, k); + assert_eq!(parsed.positives.len(), 1); + assert_eq!(parsed.positives[0].budget, FuzzBudget::EXACT); +} + +// --------------------------------------------------------------------------- +// goldens.md § Verification #6: a GoldenKey round-trips through slug()→parse, +// and two distinct keys never collide on a slug. +// --------------------------------------------------------------------------- + +// A canonical (already slug-safe) component: lower-alnum runs joined by single +// dashes, no leading/trailing/double dash. The round-trip contract holds for +// canonical components — `slug_component` is idempotent on them and `from_slug` +// is its exact inverse. Non-canonical display names (spaces, mixed case, +// trailing dashes) are a lossy normalization concern, covered by the +// lower-kebab unit test above, not by the round-trip property. +fn arb_component() -> impl Strategy<Value = String> { + prop::collection::vec("[a-z0-9]{1,5}", 1..=3).prop_map(|parts| parts.join("-")) +} + +prop_compose! { + fn arb_key()( + widget in arb_component(), + state in arb_component(), + theme in arb_component(), + viewport in arb_component(), + backend in prop::sample::select(vec![ + Backend::Lavapipe, Backend::Vulkan, Backend::Gl, Backend::Metal, Backend::Dx12, + ]), + dpr_milli in 1u32..=4000, + ) -> GoldenKey { + key(&widget, &state, &theme, &viewport, backend, Dpr(dpr_milli)) + } +} + +proptest! 
{ + #[test] + fn key_slug_round_trips(k in arb_key()) { + let slug = k.slug(); + let parsed = GoldenKey::from_slug(&slug) + .unwrap_or_else(|| panic!("slug `{slug}` failed to parse back")); + prop_assert_eq!(parsed, k); + } + + #[test] + fn distinct_keys_never_collide(a in arb_key(), b in arb_key()) { + if a != b { + prop_assert_ne!(a.slug(), b.slug(), "distinct keys collided on a slug"); + } + } +} From eb2591d8b8d4192b6fc4aa1ac4d648df2d0cf98c Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 09:28:48 -0700 Subject: [PATCH 46/70] feat(verify): golden check/assert + multi-positive + bless workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3.7 (verification-design goldens.md § Verification #1–#4). Lands the test coverage + the env-decoupling refactor over the `golden::check` code from 3.6. `check_golden` compares `actual` against the stored multi-positive baseline set and passes if ANY positive clears the budget (Skia-Gold "many positives per config"); on a miss it carries the closest (smallest-Diff) candidate. `assert_golden` is the fail-closed panicking wrapper — empty/non-matching corpus panics with the bless instruction (the BUIY_ACCEPT_SHAPING shape); under BUIY_BLESS=1 it blesses instead, writing the PNG + recording commit/timestamp/ budget/reason in the human-diffable ledger (never a silent overwrite). Refactor: the bless decision is resolved into an explicit `BlessMode` at the single public env-read site, so `check_golden_in`/`assert_golden_in` drive bless/assert against a temp corpus with no process-global `BUIY_BLESS` race — the seam the harness self-tests and the Phase-4 coverage matrix driver consume. 
RED→GREEN (golden_persistence.rs, pure-CPU, synthetic images): match/mismatch, multi-positive any-matches (second positive ⇒ matched_positive: 1), bless round-trip (re-check passes + ledger provenance), bless-replace-in-place, fail-closed panic on empty corpus, and the structured missing⇒Fail{best:None}. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/buiy_verify/src/golden.rs | 4 +- crates/buiy_verify/src/golden/check.rs | 85 +++- .../buiy_verify/tests/golden_persistence.rs | 408 ++++++++++++++++++ 3 files changed, 477 insertions(+), 20 deletions(-) create mode 100644 crates/buiy_verify/tests/golden_persistence.rs diff --git a/crates/buiy_verify/src/golden.rs b/crates/buiy_verify/src/golden.rs index a7acc4d..3b7b06d 100644 --- a/crates/buiy_verify/src/golden.rs +++ b/crates/buiy_verify/src/golden.rs @@ -36,7 +36,9 @@ mod check; mod ledger; mod report; -pub use check::{GoldenOutcome, assert_golden, check_golden}; +pub use check::{ + BlessMode, GoldenOutcome, assert_golden, assert_golden_in, check_golden, check_golden_in, +}; pub use ledger::{BlessLedger, Positive}; pub use report::{TriageCard, TriageReport}; diff --git a/crates/buiy_verify/src/golden/check.rs b/crates/buiy_verify/src/golden/check.rs index b48128d..ce0279d 100644 --- a/crates/buiy_verify/src/golden/check.rs +++ b/crates/buiy_verify/src/golden/check.rs @@ -72,18 +72,37 @@ pub enum GoldenOutcome { }, } -/// `true` when the bless env (`BUIY_BLESS`) is set — the accept-FILE switch, -/// modeled on `BUIY_ACCEPT_SHAPING`. -fn bless_requested() -> bool { - std::env::var_os("BUIY_BLESS").is_some() +/// How a check should treat the corpus: compare-and-gate, or bless `actual` as +/// a positive. Resolving the bless decision into an explicit value (instead of +/// reading `BUIY_BLESS` deep in the comparison) keeps the policy out of the +/// process-global env so the harness's own tests — and the Phase-4 coverage +/// matrix driver — can drive bless/assert deterministically without env races. 
+/// The env is read **once**, at the public entry point ([`check_golden`]). +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum BlessMode { + /// Compare against the corpus and gate (the CI / default path). + Assert, + /// Write `actual` as a positive. `Some(i)` replaces positive `i`; `None` + /// appends a new one (`BUIY_BLESS` set, optional `BUIY_BLESS_REPLACE=`). + Bless { + /// `Some(i)` overwrites positive `i`; `None` appends a new positive. + replace: Option, + }, } -/// When `BUIY_BLESS_REPLACE=` is set, bless replaces positive `i` instead of -/// appending. `None` ⇒ append a new positive. -fn bless_replace_index() -> Option { - std::env::var("BUIY_BLESS_REPLACE") - .ok() - .and_then(|v| v.parse().ok()) +/// Resolve the bless mode from the environment — the **single** place +/// `BUIY_BLESS` / `BUIY_BLESS_REPLACE` are read (accept-FILE switch, modeled on +/// `BUIY_ACCEPT_SHAPING`). +fn mode_from_env() -> BlessMode { + if std::env::var_os("BUIY_BLESS").is_some() { + BlessMode::Bless { + replace: std::env::var("BUIY_BLESS_REPLACE") + .ok() + .and_then(|v| v.parse().ok()), + } + } else { + BlessMode::Assert + } } /// Compare `actual` against the stored multi-positive baseline for `key` at the @@ -93,14 +112,26 @@ fn bless_replace_index() -> Option { /// on an any-positive match, or [`Fail`](GoldenOutcome::Fail) (writing the /// diff-PNG + HTML triage report) on a miss or empty corpus. pub fn check_golden(key: &GoldenKey, actual: &RgbaImage, budget: &FuzzBudget) -> GoldenOutcome { - check_golden_in(&default_corpus_root(), &report_dir(), key, actual, budget) + check_golden_in( + &default_corpus_root(), + &report_dir(), + mode_from_env(), + key, + actual, + budget, + ) } -/// The corpus-root-parameterized core of [`check_golden`] — lets the harness's -/// own tests bless/assert against a temp dir (the bless round-trip test). 
-pub(crate) fn check_golden_in( +/// The corpus-root- and mode-parameterized core of [`check_golden`] — lets the +/// harness's own tests (and the Phase-4 coverage matrix driver) bless/assert +/// against an explicit corpus root + report dir + [`BlessMode`], with **no** +/// env races. `corpus_root` holds the `/..png` positives + +/// `.toml` ledgers; `report_root` receives the diff-PNG + HTML triage +/// report on a fail. +pub fn check_golden_in( corpus_root: &std::path::Path, report_root: &std::path::Path, + mode: BlessMode, key: &GoldenKey, actual: &RgbaImage, budget: &FuzzBudget, @@ -108,8 +139,8 @@ pub(crate) fn check_golden_in( let dir = key.dir(corpus_root); let ledger_path = ledger_path(&dir); - if bless_requested() { - return bless(&dir, &ledger_path, key, actual, budget); + if let BlessMode::Bless { replace } = mode { + return bless(&dir, &ledger_path, replace, key, actual, budget); } let ledger = BlessLedger::load_or_empty(&ledger_path, key) @@ -153,11 +184,12 @@ fn diff_score(d: &Diff) -> u64 { } /// Bless `actual`: write it as a positive PNG + record it in the ledger. With -/// `BUIY_BLESS_REPLACE=` set it overwrites positive `i`; otherwise it appends -/// a new positive. **The human then reviews the PNG in the PR and commits it.** +/// `replace = Some(i)` it overwrites positive `i`; otherwise it appends a new +/// positive. 
**The human then reviews the PNG in the PR and commits it.** fn bless( dir: &std::path::Path, ledger_path: &std::path::Path, + replace: Option<usize>, key: &GoldenKey, actual: &RgbaImage, budget: &FuzzBudget, @@ -166,7 +198,6 @@ fn bless( let mut ledger = BlessLedger::load_or_empty(ledger_path, key).expect("load ledger for bless"); let stem = slug_stem(key); - let replace = bless_replace_index(); let (index, was_new) = match replace { Some(i) if i < ledger.positives.len() => (i, false), _ => (ledger.positives.len(), true), @@ -206,6 +237,22 @@ pub fn assert_golden(key: &GoldenKey, actual: &RgbaImage, budget: &FuzzBudget) { } } +/// [`assert_golden`] against an explicit corpus root + report dir + mode — the +/// no-env-race variant the harness's own fail-closed test drives. +pub fn assert_golden_in( + corpus_root: &std::path::Path, + report_root: &std::path::Path, + mode: BlessMode, + key: &GoldenKey, + actual: &RgbaImage, + budget: &FuzzBudget, +) { + match check_golden_in(corpus_root, report_root, mode, key, actual, budget) { + GoldenOutcome::Pass { .. } | GoldenOutcome::Blessed { .. } => {} + GoldenOutcome::Fail { best, report } => panic_fail(key, best.as_ref(), &report), + } +} + /// The fail-closed panic message (shared by `assert_golden` and the corpus-root /// test variant), pointing at the triage report and the bless command. fn panic_fail(key: &GoldenKey, best: Option<&(usize, Diff)>, report: &std::path::Path) -> ! { diff --git a/crates/buiy_verify/tests/golden_persistence.rs b/crates/buiy_verify/tests/golden_persistence.rs new file mode 100644 index 0000000..9e6fb4b --- /dev/null +++ b/crates/buiy_verify/tests/golden_persistence.rs @@ -0,0 +1,408 @@ +//! Tier-5 golden persistence self-tests (Phase 3.7, verification-design +//! `goldens.md` § Verification #1–#4). All pure-CPU — synthetic `RgbaImage`s in +//! memory, a per-test temp corpus root, an explicit [`BlessMode`] (so the bless +//!
decision never touches the process-global `BUIY_BLESS` env and tests cannot +//! race each other). No GPU adapter, runs under the headless gate. +//! +//! #1 match/mismatch — `check_golden` Pass on an identical image, Fail on +//! a one-pixel-over-budget image. +//! #2 multi-positive — bless two positives; an image matching the SECOND +//! returns `Pass { matched_positive: 1 }`. +//! #3 bless round-trip — bless to a temp corpus, re-check without bless +//! passes, and the ledger records commit/timestamp/reason. +//! #4 fail-closed — empty corpus + Assert mode ⇒ `assert_golden_in` +//! panics with the bless instruction. + +use buiy_core::render::golden::Dpr; +use buiy_verify::golden::{ + Backend, BlessLedger, BlessMode, GoldenKey, GoldenOutcome, assert_golden_in, check_golden_in, +}; +use buiy_verify::metric::FuzzBudget; +use image::{Rgba, RgbaImage}; +use std::path::PathBuf; +use std::sync::atomic::{AtomicU32, Ordering}; + +/// A unique temp corpus root per call — avoids cross-test collisions without a +/// `tempfile` dep (mirrors `reftest.rs`'s `std::env::temp_dir()` pattern). +fn temp_root(tag: &str) -> PathBuf { + static SEQ: AtomicU32 = AtomicU32::new(0); + let n = SEQ.fetch_add(1, Ordering::Relaxed); + let nanos = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_nanos()) + .unwrap_or(0); + let dir = std::env::temp_dir().join(format!( + "buiy-golden-test/{tag}-{}-{nanos}-{n}", + std::process::id() + )); + std::fs::create_dir_all(&dir).expect("create temp corpus root"); + dir +} + +fn key() -> GoldenKey { + GoldenKey { + widget: "rect".into(), + state: "default".into(), + theme: "dark".into(), + viewport: "sm".into(), + backend: Backend::Lavapipe, + dpr: Dpr::X1, + } +} + +/// A solid-color test image. +fn solid(w: u32, h: u32, rgba: [u8; 4]) -> RgbaImage { + RgbaImage::from_pixel(w, h, Rgba(rgba)) +} + +/// `base` with exactly one pixel's red channel bumped by `delta` — a single +/// over-budget pixel for the mismatch case. 
+fn one_pixel_off(base: &RgbaImage, delta: u8) -> RgbaImage { + let mut img = base.clone(); + let p = img.get_pixel(0, 0).0; + img.put_pixel(0, 0, Rgba([p[0].wrapping_add(delta), p[1], p[2], p[3]])); + img +} + +// --------------------------------------------------------------------------- +// #1 — match / mismatch +// --------------------------------------------------------------------------- + +#[test] +fn match_and_mismatch() { + let root = temp_root("match"); + let report = temp_root("match-report"); + let key = key(); + let img = solid(16, 16, [10, 120, 200, 255]); + + // Bless the baseline, then check WITHOUT bless. + let blessed = check_golden_in( + &root, + &report, + BlessMode::Bless { replace: None }, + &key, + &img, + &FuzzBudget::EXACT, + ); + assert!(matches!( + blessed, + GoldenOutcome::Blessed { + positive: 0, + was_new: true + } + )); + + // Identical image PASSES at EXACT. + let pass = check_golden_in( + &root, + &report, + BlessMode::Assert, + &key, + &img, + &FuzzBudget::EXACT, + ); + assert!( + matches!( + pass, + GoldenOutcome::Pass { + matched_positive: 0, + .. + } + ), + "identical image must pass against positive 0, got {pass:?}" + ); + + // One pixel over budget FAILS at EXACT, carrying the closest candidate. + let off = one_pixel_off(&img, 200); + let fail = check_golden_in( + &root, + &report, + BlessMode::Assert, + &key, + &off, + &FuzzBudget::EXACT, + ); + match fail { + GoldenOutcome::Fail { + best: Some((0, diff)), + report, + } => { + assert_eq!(diff.differing_pixels, 1, "exactly one over-budget pixel"); + assert!(report.exists(), "the triage report was written"); + } + other => panic!("expected Fail{{ best: Some((0, _)) }}, got {other:?}"), + } +} + +// --------------------------------------------------------------------------- +// #2 — multi-positive: any positive matches; an image matching the SECOND +// returns Pass { matched_positive: 1 }. 
+// --------------------------------------------------------------------------- + +#[test] +fn multi_positive_any_matches() { + let root = temp_root("multi"); + let report = temp_root("multi-report"); + let key = key(); + + let p0 = solid(16, 16, [10, 120, 200, 255]); + // A genuinely DIFFERENT second positive (whole image a different color), so + // p1 cannot accidentally match p0 at EXACT. + let p1 = solid(16, 16, [200, 30, 30, 255]); + + check_golden_in( + &root, + &report, + BlessMode::Bless { replace: None }, + &key, + &p0, + &FuzzBudget::EXACT, + ); + check_golden_in( + &root, + &report, + BlessMode::Bless { replace: None }, + &key, + &p1, + &FuzzBudget::EXACT, + ); + + // The ledger now has two positives. + let ledger = load_ledger(&root, &key); + assert_eq!(ledger.positives.len(), 2, "two positives blessed"); + + // An image identical to the SECOND positive passes, matching index 1. + let outcome = check_golden_in( + &root, + &report, + BlessMode::Assert, + &key, + &p1, + &FuzzBudget::EXACT, + ); + assert!( + matches!( + outcome, + GoldenOutcome::Pass { + matched_positive: 1, + .. + } + ), + "image matching the second positive must report matched_positive: 1, got {outcome:?}" + ); + + // An image matching the FIRST still passes (matched_positive: 0) — proves + // any-positive, not last-positive. + let outcome0 = check_golden_in( + &root, + &report, + BlessMode::Assert, + &key, + &p0, + &FuzzBudget::EXACT, + ); + assert!(matches!( + outcome0, + GoldenOutcome::Pass { + matched_positive: 0, + .. + } + )); +} + +// --------------------------------------------------------------------------- +// #3 — bless round-trip: bless, re-check passes, ledger records provenance. 
+// --------------------------------------------------------------------------- + +#[test] +fn bless_round_trip() { + let root = temp_root("bless"); + let report = temp_root("bless-report"); + let key = key(); + let img = solid(20, 12, [44, 88, 132, 255]); + + let outcome = check_golden_in( + &root, + &report, + BlessMode::Bless { replace: None }, + &key, + &img, + &FuzzBudget::EXACT, + ); + assert!(matches!( + outcome, + GoldenOutcome::Blessed { + positive: 0, + was_new: true + } + )); + + // The PNG and the ledger exist on disk. + let dir = key.dir(&root); + assert!( + dir.join("dark__sm__lavapipe__dpr1.0.png").exists(), + "blessed PNG written" + ); + + let ledger = load_ledger(&root, &key); + assert_eq!(ledger.positives.len(), 1); + let pos = &ledger.positives[0]; + assert_eq!(pos.file, "dark__sm__lavapipe__dpr1.0.png"); + assert_eq!(pos.budget, FuzzBudget::EXACT); + assert!(!pos.reason.is_empty(), "a reason was recorded"); + // RFC3339-shaped timestamp (the harness emits `YYYY-MM-DDThh:mm:ssZ`). + assert!( + pos.blessed_at.len() == 20 && pos.blessed_at.ends_with('Z') && pos.blessed_at.contains('T'), + "RFC3339 timestamp recorded, got {:?}", + pos.blessed_at + ); + // A commit string was recorded (a real hash inside the repo, else "unknown"). + assert!(!pos.blessed_commit.is_empty(), "a commit was recorded"); + + // Re-check WITHOUT bless now passes. + let pass = check_golden_in( + &root, + &report, + BlessMode::Assert, + &key, + &img, + &FuzzBudget::EXACT, + ); + assert!( + matches!( + pass, + GoldenOutcome::Pass { + matched_positive: 0, + .. 
+ } + ), + "the blessed image passes on re-check, got {pass:?}" + ); +} + +#[test] +fn bless_replace_overwrites_positive() { + let root = temp_root("replace"); + let report = temp_root("replace-report"); + let key = key(); + let original = solid(16, 16, [10, 10, 10, 255]); + let replacement = solid(16, 16, [240, 240, 240, 255]); + + check_golden_in( + &root, + &report, + BlessMode::Bless { replace: None }, + &key, + &original, + &FuzzBudget::EXACT, + ); + let replaced = check_golden_in( + &root, + &report, + BlessMode::Bless { replace: Some(0) }, + &key, + &replacement, + &FuzzBudget::EXACT, + ); + assert!( + matches!( + replaced, + GoldenOutcome::Blessed { + positive: 0, + was_new: false + } + ), + "replace targets positive 0 in place, got {replaced:?}" + ); + // Still ONE positive (replaced, not appended). + assert_eq!(load_ledger(&root, &key).positives.len(), 1); + // The replacement is now the baseline; the original no longer matches. + let now = check_golden_in( + &root, + &report, + BlessMode::Assert, + &key, + &replacement, + &FuzzBudget::EXACT, + ); + assert!(matches!( + now, + GoldenOutcome::Pass { + matched_positive: 0, + .. + } + )); + let stale = check_golden_in( + &root, + &report, + BlessMode::Assert, + &key, + &original, + &FuzzBudget::EXACT, + ); + assert!( + matches!(stale, GoldenOutcome::Fail { .. }), + "the replaced-out original no longer matches" + ); +} + +// --------------------------------------------------------------------------- +// #4 — fail-closed: empty corpus + Assert ⇒ panic with the bless instruction. 
+// --------------------------------------------------------------------------- + +#[test] +#[should_panic(expected = "no golden committed")] +fn fail_closed_on_empty_corpus() { + let root = temp_root("empty"); + let report = temp_root("empty-report"); + let key = key(); + let img = solid(16, 16, [0, 0, 0, 255]); + // No positive blessed ⇒ assert_golden_in must panic instructing the dev to + // bless + review + commit (the BUIY_ACCEPT_SHAPING fail-closed shape). + assert_golden_in( + &root, + &report, + BlessMode::Assert, + &key, + &img, + &FuzzBudget::EXACT, + ); +} + +#[test] +fn check_golden_missing_returns_fail_with_no_best() { + // The structured (no-panic) view of the missing case: empty corpus ⇒ Fail + // with best == None (the "missing" outcome the coverage driver consumes). + let root = temp_root("missing"); + let report = temp_root("missing-report"); + let key = key(); + let img = solid(16, 16, [0, 0, 0, 255]); + let outcome = check_golden_in( + &root, + &report, + BlessMode::Assert, + &key, + &img, + &FuzzBudget::EXACT, + ); + match outcome { + GoldenOutcome::Fail { best: None, report } => { + assert!( + report.exists(), + "a triage report is still emitted for a missing golden" + ); + } + other => panic!("expected Fail{{ best: None }} for an empty corpus, got {other:?}"), + } +} + +// --- helpers ------------------------------------------------------------------- + +fn load_ledger(root: &std::path::Path, key: &GoldenKey) -> BlessLedger { + let dir = key.dir(root); + let stem = key.slug().rsplit('/').next().unwrap().to_string(); + let path = dir.join(format!("{stem}.toml")); + let body = std::fs::read_to_string(&path) + .unwrap_or_else(|e| panic!("ledger {path:?} unreadable: {e}")); + toml::from_str(&body).expect("ledger parses") +} From 491c85715bf9108eff6720ba4ff31ad8c34d75af Mon Sep 17 00:00:00 2001 From: Noah Date: Mon, 15 Jun 2026 09:29:57 -0700 Subject: [PATCH 47/70] feat(verify): self-contained offline HTML triage report self-test MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3.8 (verification-design goldens.md § Verification #5). Lands the test coverage for the `golden::report` TriageReport/TriageCard built in 3.6. The report base64-inlines the actual / closest-baseline / diff-heatmap PNGs into one HTML file with three views per card — side-by-side, a pure-JS opacity-slider overlay, and the diff heatmap — so it opens straight from a CI artifact with no network and no external asset (offline-first, no SaaS). RED→GREEN (golden_report.rs, pure-CPU): assert every `src=` is a data URI, no http(s)/relative/`