From a78a69533ee9b9f7326f2e2792284760dda35915 Mon Sep 17 00:00:00 2001
From: Mike Grier <micgrier@microsoft.com>
Date: Tue, 9 Jun 2026 15:14:17 -0400
Subject: [PATCH] fp-hw-survey: add cross-hardware FP behavior survey crate
 (capture/merge), workspace wiring, and rpres/afp feature detection

---
 Cargo.toml                                  |   1 +
 crates/fp-hw-survey/.gitignore              |   5 +
 crates/fp-hw-survey/Cargo.toml              |  22 +
 crates/fp-hw-survey/README.md               | 296 +++++++++++
 crates/fp-hw-survey/src/arch/aarch64.rs     | 537 ++++++++++++++++++++
 crates/fp-hw-survey/src/arch/mod.rs         |  20 +
 crates/fp-hw-survey/src/arch/unsupported.rs |  20 +
 crates/fp-hw-survey/src/arch/x86_64.rs      | 321 ++++++++++++
 crates/fp-hw-survey/src/capture.rs          | 228 +++++++++
 crates/fp-hw-survey/src/corpus.rs           | 215 ++++++++
 crates/fp-hw-survey/src/host.rs             | 375 ++++++++++++++
 crates/fp-hw-survey/src/jsonio.rs           | 361 +++++++++++++
 crates/fp-hw-survey/src/main.rs             | 199 ++++++++
 crates/fp-hw-survey/src/merge.rs            | 181 +++++++
 crates/fp-hw-survey/src/mode.rs             | 130 +++++
 crates/fp-hw-survey/src/normflags.rs        |  81 +++
 crates/fp-hw-survey/src/ops.rs              | 185 +++++++
 crates/fp-hw-survey/src/selftest.rs         | 163 ++++++
 spellcheck.dic                              |  69 ++-
 19 files changed, 3408 insertions(+), 1 deletion(-)
 create mode 100644 crates/fp-hw-survey/.gitignore
 create mode 100644 crates/fp-hw-survey/Cargo.toml
 create mode 100644 crates/fp-hw-survey/README.md
 create mode 100644 crates/fp-hw-survey/src/arch/aarch64.rs
 create mode 100644 crates/fp-hw-survey/src/arch/mod.rs
 create mode 100644 crates/fp-hw-survey/src/arch/unsupported.rs
 create mode 100644 crates/fp-hw-survey/src/arch/x86_64.rs
 create mode 100644 crates/fp-hw-survey/src/capture.rs
 create mode 100644 crates/fp-hw-survey/src/corpus.rs
 create mode 100644 crates/fp-hw-survey/src/host.rs
 create mode 100644 crates/fp-hw-survey/src/jsonio.rs
 create mode 100644 crates/fp-hw-survey/src/main.rs
 create mode 100644 crates/fp-hw-survey/src/merge.rs
 create mode 100644 crates/fp-hw-survey/src/mode.rs
 create mode 100644 crates/fp-hw-survey/src/normflags.rs
 create mode 100644 crates/fp-hw-survey/src/ops.rs
 create mode 100644 crates/fp-hw-survey/src/selftest.rs

diff --git a/Cargo.toml b/Cargo.toml
index c7c0758e..1fab6ad3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,6 +2,7 @@
 resolver = "2"
 members = [
   "m",
+  "crates/fp-hw-survey",
 ]
 
 [workspace.metadata.spellcheck]
diff --git a/crates/fp-hw-survey/.gitignore b/crates/fp-hw-survey/.gitignore
new file mode 100644
index 00000000..334cfae5
--- /dev/null
+++ b/crates/fp-hw-survey/.gitignore
@@ -0,0 +1,5 @@
+/target
+*.ndjson
+*.ndjson.gz
+captures/
+.scratch/
diff --git a/crates/fp-hw-survey/Cargo.toml b/crates/fp-hw-survey/Cargo.toml
new file mode 100644
index 00000000..afb1e24a
--- /dev/null
+++ b/crates/fp-hw-survey/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name = "fp-hw-survey"
+version = "0.1.0"
+edition = "2021"
+rust-version = "1.86"
+authors = ["Michael Grier <micgrier@microsoft.com>"]
+license = "MIT"
+description = "Cross-hardware IEEE-754 floating-point behavior survey: capture native FP results + exception flags on x86-64 and AArch64, then merge captures from many machines to find where hardware actually disagrees."
+repository = "https://github.com/azure/m"
+homepage = "https://github.com/azure/m"
+readme = "README.md"
+
+# No external dependencies on purpose: this must build on stable Rust with std
+# only, on macOS arm64, Windows arm64, and x86-64 Linux/Windows, with zero
+# friction.
+
+[lints]
+workspace = true
+
+[[bin]]
+name = "fp-hw-survey"
+path = "src/main.rs"
diff --git a/crates/fp-hw-survey/README.md b/crates/fp-hw-survey/README.md
new file mode 100644
index 00000000..29d9cf9e
--- /dev/null
+++ b/crates/fp-hw-survey/README.md
@@ -0,0 +1,296 @@
+# fp-hw-survey
+
+A small, dependency-free Rust program that captures **native floating-point
+hardware behavior** across many machines and architectures, then merges those
+captures to find the rows where the hardware actually *disagrees*.
+
+It exists to answer empirical questions like:
+
+- Do different AArch64 vendors (Qualcomm Snapdragon, Apple M-series, Ampere)
+  produce different bit patterns for the estimate instructions `frecpe` /
+  `frsqrte` / `frecpx` / `fmulx`? (Spoiler: the architecture says they
+  *shouldn't* — see [What we expect to find](#what-we-expect-to-find) — and the
+  survey's job is to confirm that empirically.)
+- Where exactly do x86-64 and AArch64 differ — `fmax`/`fmin` semantics,
+  float→int out-of-range results, NaN propagation, flush-to-zero edges?
+
+Each machine runs a **deterministic** operand × rounding-mode corpus through the
+real scalar instructions (via per-architecture inline asm — *not* Rust libcore
+math, which would hide the divergence), and records the native
+`(result_bits, exception_flags)`. Because the corpus is identical on every
+machine, captures align row-for-row and the merge step can emit just the
+disagreements.
+
+## What we expect to find
+
+A late but important correction to the tool's original premise: on **AArch64**,
+the scalar estimate instructions are **not** loosely "implementation-defined."
+The Arm Architecture Reference Manual defines `frecpe`/`frsqrte` via exact
+shared pseudocode (`FPRecipEstimate` → `RecipEstimate`), a deterministic
+fixed-point integer computation; `frecpx` and `fmulx` are likewise fully
+specified. The result bits are reproducible from the spec alone — e.g.
+`vrecpe(1,2,3,4)` is architecturally required to yield
+`0.99805, 0.49902, 0.33301, 0.24951` on every conforming ARMv8 part. So:
+
+- **Intra-ARMv8 agreement is the predicted result.** Apple vs Snapdragon vs
+  Ampere *should* produce bit-identical estimates. A merge that reports **zero**
+  divergences across AArch64 machines is therefore the *informative,* expected
+  outcome — a positive confirmation of conformance, not a wasted run.
+- **One legitimate intra-ARMv8 estimate divergence exists — and it is a
+  *feature* difference, not a conformance bug.** `FEAT_RPRES` selects a 12-bit
+  reciprocal / reciprocal-sqrt estimate table (when `FPCR.AH==1`, single
+  precision) where a non-RPRES part returns 8 bits. So a `frecpe.s` /
+  `frsqrte.s` disagreement between two machines is expected **only** when their
+  `rpres`/`AH` context differs; both answers are deterministic, just from
+  different mandated tables. The capture records `rpres`/`afp` in the header
+  `features` precisely so the merge can attribute such a row to the feature gap
+  rather than flag it as an erratum.
+- **The genuinely divergent axes lie elsewhere:** ARMv7 NEON `VRECPE`/`VRSQRTE`
+  (pre-v8, genuinely looser — "read from a ROM with limited bins"), and
+  **cross-architecture** x86-64 ↔ AArch64 semantics (`fmax`/`fmin` NaN handling,
+  float→int saturation, flush-to-zero edges).
+- **Spec ≠ proof of correct silicon.** The survey still earns its keep by
+  catching a part that *deviates* from the pseudocode (an erratum, a botched
+  subnormal/`FEAT_FP16`/sign-of-zero edge). "Should agree per spec" becomes
+  "does agree, measured."
+
+### Consequence for the downstream reference set
+
+This sharply shrinks the golden/reference data a consumer like `rook::fp` must
+carry. If AArch64 estimates are architecturally fixed, the consumer needs **one
+architecturally-derived table per architecture**, not a per-vendor / per-SKU
+golden captured from every machine. The survey does **not** ship a multi-SKU ×
+full-corpus blob (which is where the tens-to-hundreds-of-MB figures came from);
+it ships only the **divergence set** — the handful of keys that are genuinely
+not unanimous (cross-arch corners, any conformance outlier). In the expected
+case that intra-ARMv8 is unanimous, the AArch64 contribution to that reference
+set is **empty**, and the per-machine captures are intermediate verification
+artifacts, not data the consumer retains.
+
+## Build
+
+Stable Rust, `std` only, **no external crates**. Builds out of the box on:
+
+- macOS arm64 (Apple Silicon)
+- Windows on ARM (Snapdragon, e.g. Surface / Volterra / Lenovo X13s)
+- Linux/Windows x86-64
+
+```sh
+cargo build --release
+```
+
+The binary is `target/release/fp-hw-survey` (`.exe` on Windows).
+
+## Supported architectures
+
+| Arch | Backend | Notes |
+|------|---------|-------|
+| `aarch64` | full scalar oracle | All 77 catalogued ops; half-precision (`.h`) ops require `FEAT_FP16`. |
+| `x86_64`  | SSE/SSE2 (+FMA3) | The SSE-mappable subset only: arithmetic, `fmax`/`fmin`, `fsqrt`, `fma`, f32↔f64, truncating signed float→int, signed int→float. Ops with no scalar SSE form (`fmaxnm`, `fmulx`, `fabd`, the estimate family, directed-rounding/unsigned conversions, all half ops) are skipped. |
+| other | none | Produces only a header line. |
+
+> **x86-64 validation caveat:** the x86-64 inline asm was authored on an arm64
+> host and could not be executed there during development. Every `capture` run
+> first executes a **known-answer self-test** for the local architecture and
+> **aborts** if any check fails, so a broken oracle never emits untrustworthy
+> data. Still, the first time you run this on real x64 hardware, eyeball the
+> `selftest` output.
+
+## Usage
+
+### Generating a capture (run on each machine)
+
+A capture is a single self-describing NDJSON file. Its **header line records the
+hardware identity and the capture date/time automatically** — you do not have to
+supply them — so a capture file is always traceable back to the machine and the
+moment it was produced.
+
+**Step 1 — build the release binary on the target machine.**
+
+```sh
+# from the workspace root (c:\github\m or your clone)
+cargo build --release -p fp-hw-survey
+```
+
+Use `--release`: a capture runs a large operand × mode sweep, and the debug
+build is much slower. Optimization level does **not** change the captured
+results (each op runs inside its own inline-asm block), only the wall-clock
+time. The binary lands at:
+
+- Linux/macOS: `target/release/fp-hw-survey`
+- Windows: `target\release\fp-hw-survey.exe`
+
+**Step 2 — sanity-check the host (optional but recommended).**
+
+```sh
+fp-hw-survey info        # arch, OS, CPU brand, detected features, supported-op count
+fp-hw-survey selftest    # known-answer checks for this machine's oracle
+```
+
+`info` is also the quickest way to confirm the tool detected the CPU and
+features (e.g. `fp16` on AArch64) correctly before you commit to a full run.
+
+**Step 3 — run the capture.**
+
+```sh
+fp-hw-survey capture --label "snapdragon-x-elite-win"
+```
+
+`capture` first runs the self-test and **aborts without writing if any
+known-answer check fails**, so a broken oracle never produces untrustworthy
+data. On success it writes `capture-<label>.ndjson` in the current directory and
+prints the op/row counts and output path to stderr.
+
+**Step 4 — keep / send the file.** The resulting `capture-<label>.ndjson` is the
+artifact. It is self-contained: machine identity and timestamp are in the header
+(see [Record format](#record-format)).
+
+#### Capture options
+
+| Flag | Default | Meaning |
+|------|---------|---------|
+| `--label` | *(required)* | Human name for this machine; goes in the header and the default filename. Pick something that identifies the CPU **and** OS, e.g. `m2-macbook-air`, `ampere-altra-linux`, `snapdragon-x-elite-win`. |
+| `--out` | `capture-<label>.ndjson` | Output file path. |
+| `--pairs` | `2000` | Random operand draws per op (on top of the curated edge cases). Higher = denser coverage. |
+| `--budget-mb` | `150` | Hard output-size cap in MB; capture stops once exceeded. |
+| `--ops` | *(all)* | Comma-separated op labels to restrict the capture to, e.g. `frecpe.s,frsqrte.s`. |
+
+The defaults target the ~100–200 MB per-machine budget the survey was designed
+around. Increase `--pairs` for denser random coverage (bounded by
+`--budget-mb`); narrow with `--ops` when you only care about specific
+instructions.
+
+Examples:
+
+```sh
+# Full default capture, explicit output path
+fp-hw-survey capture --label ampere-altra-linux --out /tmp/altra.ndjson
+
+# Dense capture of just the estimate family, larger budget
+fp-hw-survey capture --label m2-mac --ops frecpe.s,frsqrte.s,frecpx.s,fmulx.s \
+                     --pairs 50000 --budget-mb 300
+```
+
+### Merge (offline, on one machine)
+
+```sh
+fp-hw-survey merge --out divergences.ndjson capture-*.ndjson
+```
+
+Aligns every capture on the logical key `(op, a, b, c, mode, flush)` and writes
+only the keys where the `(result, flags)` pair is **not unanimous** across the
+machines that produced it. Each divergence row lists every machine's label,
+arch, result bits, and decoded flags. A summary (machine count, aligned-key
+count, divergences, per-op breakdown) prints to stderr.
+
+> Merge currently holds all aligned rows in memory; run it on a box with enough
+> RAM for the combined capture set.
+
+### Other subcommands
+
+```sh
+fp-hw-survey selftest   # run known-answer checks for this host
+fp-hw-survey info       # arch, OS, CPU brand, features, supported-op count
+```
+
+## Collecting captures across the fleet
+
+Captures are gathered **manually** — each machine owner runs the tool and
+submits one NDJSON file. There is intentionally no CI capture job: the
+interesting SKUs (specific Snapdragon / Apple / Ampere / Graviton parts) are
+physical hardware a hosted runner does not represent, and a capture is a
+one-shot artifact, not something that needs to run on every push.
+
+Coordination and provenance live in **GitHub issues on `azure/m`**, while the
+*data* lives in a small committed artifact (issues are not a data store):
+
+1. **Tracking issue** — one epic, *"FP hardware survey — fleet capture
+   campaign"*, carries a checklist of target SKUs (Apple M-series, Snapdragon
+   X-series, Ampere Altra, AWS Graviton, x86-64 Intel/AMD; Microsoft Cobalt 100
+   is explicitly **deferred**). Each SKU is a per-platform capture issue.
+2. **Submitting a capture** — on the target machine, build `--release`, then:
+
+   ```sh
+   fp-hw-survey info        # confirm CPU brand + features (fp16/rpres/afp) detected
+   fp-hw-survey selftest    # must pass; capture refuses to write otherwise
+   fp-hw-survey capture --label <cpu-and-os>
+   ```
+
+   Paste the `info` and `selftest` output **and the one-line NDJSON header**
+   into the platform issue (the header is small, human-readable, and records the
+   CPU / OS / features / date — durable provenance even after the file itself is
+   gone), then attach or link the `capture-<label>.ndjson`.
+3. **Ingest (offline, on one machine)** — collect the submitted captures and
+   merge:
+
+   ```sh
+   fp-hw-survey merge --out divergences.ndjson capture-*.ndjson
+   ```
+
+   Commit **only** two things, never the raw multi-hundred-MB captures:
+
+   - **`divergences.ndjson`** — the merge output. For the intra-ARMv8 estimate
+     ops this file is *expected to be empty*; an empty file is the positive
+     conformance result, not a missing one. A non-empty AArch64 row is a real
+     finding **unless** the contributing machines differ in `rpres`/`afp`/`AH`
+     context (see [What we expect to find](#what-we-expect-to-find)).
+   - a **provenance table** — one row per contributing capture, copied from each
+     header: `label`, `cpu`, `os`, `features`, `captured_utc`, row count,
+     `tool_version`. This is what lets a future reader know exactly which silicon
+     backs the divergence set.
+
+   Raw captures are retained out-of-band (issue attachments / artifact storage),
+   not in git.
+4. **Close-out** — paste the `merge` stderr summary (machine count, aligned-key
+   count, divergence count, per-op breakdown) into the tracking issue and check
+   off the contributing SKUs.
+
+## Record format
+
+NDJSON, one object per line. The first line is a header that **identifies the
+hardware and stamps the capture time automatically**:
+
+```json
+{"kind":"header","schema":1,"arch":"aarch64","arch_tag":"aarch64","os":"macos","cpu":"Apple M2","label":"m2-macbook-air","features":["fp16"],"captured_unix":1780995871,"captured_utc":"2026-06-09T09:04:31Z","tool_version":"0.1.0"}
+```
+
+Header fields:
+
+| Field | Meaning |
+|-------|---------|
+| `arch` / `arch_tag` | Target architecture (`aarch64`, `x86_64`, …). |
+| `os` | Operating system (`macos`, `windows`, `linux`, …). |
+| `cpu` | Best-effort CPU brand string, detected natively per platform: CPUID on x86; the registry `ProcessorNameString` on Windows (incl. Windows-on-ARM); `sysctl machdep.cpu.brand_string` on macOS/iOS; `/proc/cpuinfo` on Linux. `"unknown"` only if none apply — the `--label` always disambiguates. |
+| `features` | Detected FP-relevant features (e.g. `fp16`, `rpres`, `afp` on AArch64; `avx`, `fma` on x86-64). `rpres`/`afp` are the IMPLEMENTATION_DEFINED knobs that can change an estimate *value*, so a `frecpe.s`/`frsqrte.s` divergence is expected only when they differ between machines. Best-effort on Windows-on-ARM (no `PF_*` flag exposes them). |
+| `label` | The `--label` you supplied. |
+| `captured_unix` | Capture time, seconds since the Unix epoch (UTC). |
+| `captured_utc` | Capture time as ISO-8601 UTC, e.g. `2026-06-09T09:04:31Z`. |
+| `tool_version` | `fp-hw-survey` version that produced the file. |
+
+Each data row stores operands and results as raw bit patterns (`u64`; the low
+bits hold f32/f16/i32 values), the logical rounding mode, the flush flag, and
+the **normalized** exception flags:
+
+```json
+{"op":"frecpe.s","a":0,"b":0,"c":0,"mode":"RN","flush":false,"res":2139095040,"flags":2}
+```
+
+Exception flags use the AArch64 `FPSR` cumulative layout
+(`IOC=1, DZC=2, OFC=4, UFC=8, IXC=16, IDC=128`); x86-64 `MXCSR` status bits are
+translated into this layout so flags compare directly across architectures.
+
+## Contributing captures
+
+1. `cargo build --release -p fp-hw-survey` on the target machine.
+2. `fp-hw-survey capture --label "<distinctive-name>"` (use a name that
+   identifies the CPU and OS, e.g. `snapdragon-x-elite-win`,
+   `ampere-altra-linux`).
+3. Send back the `capture-<label>.ndjson` file. It already carries the machine
+   identity and capture timestamp in its header — nothing else to record.
+
+Captures from the same CPU model on different OSes are still useful — OS-level
+defaults (e.g. denormal handling) can differ.
+
+## License
+
+MIT. See the repository root [LICENSE](../../LICENSE). Copyright (c) Microsoft Corporation.
diff --git a/crates/fp-hw-survey/src/arch/aarch64.rs b/crates/fp-hw-survey/src/arch/aarch64.rs
new file mode 100644
index 00000000..1536611e
--- /dev/null
+++ b/crates/fp-hw-survey/src/arch/aarch64.rs
@@ -0,0 +1,537 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: MIT
+#![cfg(target_arch = "aarch64")]
+
+//! AArch64 hardware oracle: executes each catalogued FP operation as the real
+//! scalar instruction inside an inline-asm block, under a caller-chosen `FPCR`
+//! state, and reads back the cumulative `FPSR` exception flags.
+//!
+//! `FPSR`'s low cumulative bits already use the normalized flag layout
+//! (`IOC..IXC` at bits 0..4, `IDC` at bit 7), so no flag translation is needed
+//! here — only masking.
+
+use crate::mode::{Mode, Round};
+use crate::normflags;
+
+/// Translate a logical `Mode` into the raw `FPCR` word the oracle installs.
+/// The rounding selector lands in `RMode` (bits [23:22]); a flush request raises
+/// `FZ` (bit 24, single/double) together with `FZ16` (bit 19, half) so one word
+/// serves every format. `DN`/`AHP` stay clear so NaN propagation remains visible.
+fn fpcr_for(m: Mode) -> u64 {
+    const RMODE_SHIFT: u32 = 22;
+    const FZ: u64 = 1 << 24;
+    const FZ16: u64 = 1 << 19;
+    // Two-bit `RMode` field value for each logical rounding direction.
+    let rounding_field: u64 = match m.round {
+        Round::Ne => 0b00,
+        Round::Up => 0b01,
+        Round::Down => 0b10,
+        Round::Zero => 0b11,
+    };
+    let flush_bits = if m.flush { FZ | FZ16 } else { 0 };
+    (rounding_field << RMODE_SHIFT) | flush_bits
+}
+
+macro_rules! abin { // two-source op: emits `unsafe fn $name(a, b, fpcr) -> (result bits, raw FPSR)`
+    ($(#[$a:meta])* $name:ident, $insn:literal, $r:literal, $m:literal) => { // $r: FP reg prefix, $m: GPR modifier
+        $(#[$a])*
+        #[inline(never)]
+        unsafe fn $name(a: u64, b: u64, fpcr: u64) -> (u64, u64) {
+            let res: u64;
+            let fpsr: u64;
+            core::arch::asm!(
+                "mrs {s}, fpcr", // save the incoming FPCR
+                "msr fpcr, {c}", // install the requested FPCR
+                "isb",
+                "msr fpsr, xzr", // zero FPSR before the op so the read-back starts from a clean slate
+                concat!("fmov ", $r, "0, {a:", $m, "}"), // operand `a` bits: GPR -> FP reg 0
+                concat!("fmov ", $r, "1, {b:", $m, "}"), // operand `b` bits: GPR -> FP reg 1
+                concat!($insn, " ", $r, "0, ", $r, "0, ", $r, "1"), // reg0 = insn(reg0, reg1)
+                concat!("fmov {res:", $m, "}, ", $r, "0"), // result bits back to a GPR
+                "mrs {fpsr}, fpsr", // read the exception flags accumulated since the clear
+                "msr fpcr, {s}", // restore the saved FPCR
+                "isb",
+                s = out(reg) _, // scratch GPR holding the saved FPCR; value discarded afterwards
+                c = in(reg) fpcr,
+                a = in(reg) a,
+                b = in(reg) b,
+                res = out(reg) res,
+                fpsr = out(reg) fpsr,
+                out("v0") _, // FP registers overwritten by the template
+                out("v1") _,
+            );
+            (res, fpsr)
+        }
+    };
+}
+
+macro_rules! aun { // one-source op: emits `unsafe fn $name(a, fpcr) -> (result bits, raw FPSR)`
+    ($(#[$a:meta])* $name:ident, $insn:literal, $r:literal, $m:literal) => { // $r: FP reg prefix, $m: GPR modifier
+        $(#[$a])*
+        #[inline(never)]
+        unsafe fn $name(a: u64, fpcr: u64) -> (u64, u64) {
+            let res: u64;
+            let fpsr: u64;
+            core::arch::asm!(
+                "mrs {s}, fpcr", // save the incoming FPCR
+                "msr fpcr, {c}", // install the requested FPCR
+                "isb",
+                "msr fpsr, xzr", // zero FPSR before the op so the read-back starts from a clean slate
+                concat!("fmov ", $r, "0, {a:", $m, "}"), // operand bits: GPR -> FP reg 0
+                concat!($insn, " ", $r, "0, ", $r, "0"), // reg0 = insn(reg0)
+                concat!("fmov {res:", $m, "}, ", $r, "0"), // result bits back to a GPR
+                "mrs {fpsr}, fpsr", // read the exception flags accumulated since the clear
+                "msr fpcr, {s}", // restore the saved FPCR
+                "isb",
+                s = out(reg) _, // scratch GPR holding the saved FPCR; value discarded afterwards
+                c = in(reg) fpcr,
+                a = in(reg) a,
+                res = out(reg) res,
+                fpsr = out(reg) fpsr,
+                out("v0") _, // FP register overwritten by the template
+            );
+            (res, fpsr)
+        }
+    };
+}
+
+macro_rules! afma { // three-source fused op: emits `unsafe fn $name(a, b, c, fpcr) -> (result bits, raw FPSR)`
+    ($name:ident, $insn:literal, $r:literal, $m:literal) => { // $r: FP reg prefix, $m: GPR modifier; no attribute slot
+        #[inline(never)]
+        unsafe fn $name(a: u64, b: u64, c: u64, fpcr: u64) -> (u64, u64) {
+            // Compute a*b + c. `fmadd Sd, Sn, Sm, Sa` = Sn*Sm + Sa, so map
+            // Sn=a, Sm=b, Sa=c.
+            let res: u64;
+            let fpsr: u64;
+            core::arch::asm!(
+                "mrs {s}, fpcr", // save the incoming FPCR
+                "msr fpcr, {f}", // install the requested FPCR (operand is `f` here because `c` names the addend)
+                "isb",
+                "msr fpsr, xzr", // zero FPSR before the op so the read-back starts from a clean slate
+                concat!("fmov ", $r, "1, {a:", $m, "}"), // a -> FP reg 1
+                concat!("fmov ", $r, "2, {b:", $m, "}"), // b -> FP reg 2
+                concat!("fmov ", $r, "3, {c:", $m, "}"), // c -> FP reg 3
+                concat!($insn, " ", $r, "0, ", $r, "1, ", $r, "2, ", $r, "3"), // reg0 = insn(reg1, reg2, reg3)
+                concat!("fmov {res:", $m, "}, ", $r, "0"), // result bits back to a GPR
+                "mrs {fpsr}, fpsr", // read the exception flags accumulated since the clear
+                "msr fpcr, {s}", // restore the saved FPCR
+                "isb",
+                s = out(reg) _, // scratch GPR holding the saved FPCR; value discarded afterwards
+                f = in(reg) fpcr,
+                a = in(reg) a,
+                b = in(reg) b,
+                c = in(reg) c,
+                res = out(reg) res,
+                fpsr = out(reg) fpsr,
+                out("v0") _, // FP registers overwritten by the template
+                out("v1") _,
+                out("v2") _,
+                out("v3") _,
+            );
+            (res, fpsr)
+        }
+    };
+}
+
+macro_rules! acvtff { // float->float `fcvt`: emits `unsafe fn $name(a, fpcr) -> (result bits, raw FPSR)`
+    ($(#[$a:meta])* $name:ident, $sr:literal, $sm:literal, $dr:literal, $dm:literal) => { // src reg/modifier, dst reg/modifier
+        $(#[$a])*
+        #[inline(never)]
+        unsafe fn $name(a: u64, fpcr: u64) -> (u64, u64) {
+            let res: u64;
+            let fpsr: u64;
+            core::arch::asm!(
+                "mrs {s}, fpcr", // save the incoming FPCR
+                "msr fpcr, {c}", // install the requested FPCR
+                "isb",
+                "msr fpsr, xzr", // zero FPSR before the op so the read-back starts from a clean slate
+                concat!("fmov ", $sr, "0, {a:", $sm, "}"), // operand bits into source-format reg 0
+                concat!("fcvt ", $dr, "0, ", $sr, "0"), // convert in place: destination-format reg 0
+                concat!("fmov {res:", $dm, "}, ", $dr, "0"), // result bits back to a GPR
+                "mrs {fpsr}, fpsr", // read the exception flags accumulated since the clear
+                "msr fpcr, {s}", // restore the saved FPCR
+                "isb",
+                s = out(reg) _, // scratch GPR holding the saved FPCR; value discarded afterwards
+                c = in(reg) fpcr,
+                a = in(reg) a,
+                res = out(reg) res,
+                fpsr = out(reg) fpsr,
+                out("v0") _, // FP register overwritten by the template
+            );
+            (res, fpsr)
+        }
+    };
+}
+
+macro_rules! acvtf2i { // float->int convert: emits `unsafe fn $name(a, fpcr) -> (result bits, raw FPSR)`
+    ($name:ident, $insn:literal, $im:literal, $fr:literal, $fm:literal) => { // $im: int dest modifier; $fr/$fm: FP src reg/modifier
+        #[inline(never)]
+        unsafe fn $name(a: u64, fpcr: u64) -> (u64, u64) {
+            let res: u64;
+            let fpsr: u64;
+            core::arch::asm!(
+                "mrs {s}, fpcr", // save the incoming FPCR
+                "msr fpcr, {c}", // install the requested FPCR
+                "isb",
+                "msr fpsr, xzr", // zero FPSR before the op so the read-back starts from a clean slate
+                concat!("fmov ", $fr, "0, {a:", $fm, "}"), // operand bits: GPR -> FP reg 0
+                concat!($insn, " {res:", $im, "}, ", $fr, "0"), // convert straight into the result GPR
+                "mrs {fpsr}, fpsr", // read the exception flags accumulated since the clear
+                "msr fpcr, {s}", // restore the saved FPCR
+                "isb",
+                s = out(reg) _, // scratch GPR holding the saved FPCR; value discarded afterwards
+                c = in(reg) fpcr,
+                a = in(reg) a,
+                res = out(reg) res,
+                fpsr = out(reg) fpsr,
+                out("v0") _, // FP register overwritten by the template
+            );
+            (res, fpsr)
+        }
+    };
+}
+
+macro_rules! acvti2f { // int->float convert: emits `unsafe fn $name(a, fpcr) -> (result bits, raw FPSR)`
+    ($name:ident, $insn:literal, $fr:literal, $fm:literal, $im:literal) => { // $fr/$fm: FP dest reg/modifier; $im: int src modifier
+        #[inline(never)]
+        unsafe fn $name(a: u64, fpcr: u64) -> (u64, u64) {
+            let res: u64;
+            let fpsr: u64;
+            core::arch::asm!(
+                "mrs {s}, fpcr", // save the incoming FPCR
+                "msr fpcr, {c}", // install the requested FPCR
+                "isb",
+                "msr fpsr, xzr", // zero FPSR before the op so the read-back starts from a clean slate
+                concat!($insn, " ", $fr, "0, {a:", $im, "}"), // convert straight from the integer GPR into FP reg 0
+                concat!("fmov {res:", $fm, "}, ", $fr, "0"), // result bits back to a GPR
+                "mrs {fpsr}, fpsr", // read the exception flags accumulated since the clear
+                "msr fpcr, {s}", // restore the saved FPCR
+                "isb",
+                s = out(reg) _, // scratch GPR holding the saved FPCR; value discarded afterwards
+                c = in(reg) fpcr,
+                a = in(reg) a,
+                res = out(reg) res,
+                fpsr = out(reg) fpsr,
+                out("v0") _, // FP register overwritten by the template
+            );
+            (res, fpsr)
+        }
+    };
+}
+
+// ── Two-source single: abin!(fn name, mnemonic, FP register prefix, GPR width modifier) ──
+abin!(fadd_s, "fadd", "s", "w");
+abin!(fsub_s, "fsub", "s", "w");
+abin!(fmul_s, "fmul", "s", "w");
+abin!(fdiv_s, "fdiv", "s", "w");
+abin!(fmax_s, "fmax", "s", "w");
+abin!(fmin_s, "fmin", "s", "w");
+abin!(fmaxnm_s, "fmaxnm", "s", "w");
+abin!(fminnm_s, "fminnm", "s", "w");
+abin!(fmulx_s, "fmulx", "s", "w");
+abin!(fabd_s, "fabd", "s", "w");
+abin!(frecps_s, "frecps", "s", "w");
+abin!(frsqrts_s, "frsqrts", "s", "w");
+
+// ── Two-source double: abin!(fn name, mnemonic, FP register prefix, GPR width modifier) ──
+abin!(fadd_d, "fadd", "d", "x");
+abin!(fsub_d, "fsub", "d", "x");
+abin!(fmul_d, "fmul", "d", "x");
+abin!(fdiv_d, "fdiv", "d", "x");
+abin!(fmax_d, "fmax", "d", "x");
+abin!(fmin_d, "fmin", "d", "x");
+abin!(fmaxnm_d, "fmaxnm", "d", "x");
+abin!(fminnm_d, "fminnm", "d", "x");
+abin!(fmulx_d, "fmulx", "d", "x");
+abin!(fabd_d, "fabd", "d", "x");
+abin!(frecps_d, "frecps", "d", "x");
+abin!(frsqrts_d, "frsqrts", "d", "x");
+
+// ── Two-source half (FEAT_FP16): the attribute is forwarded onto each generated fn ──
+abin!(
+    #[target_feature(enable = "fp16")]
+    fadd_h,
+    "fadd",
+    "h",
+    "w"
+);
+abin!(
+    #[target_feature(enable = "fp16")]
+    fsub_h,
+    "fsub",
+    "h",
+    "w"
+);
+abin!(
+    #[target_feature(enable = "fp16")]
+    fmul_h,
+    "fmul",
+    "h",
+    "w"
+);
+abin!(
+    #[target_feature(enable = "fp16")]
+    fdiv_h,
+    "fdiv",
+    "h",
+    "w"
+);
+abin!(
+    #[target_feature(enable = "fp16")]
+    fmax_h,
+    "fmax",
+    "h",
+    "w"
+);
+abin!(
+    #[target_feature(enable = "fp16")]
+    fmin_h,
+    "fmin",
+    "h",
+    "w"
+);
+abin!(
+    #[target_feature(enable = "fp16")]
+    fmaxnm_h,
+    "fmaxnm",
+    "h",
+    "w"
+);
+abin!(
+    #[target_feature(enable = "fp16")]
+    fminnm_h,
+    "fminnm",
+    "h",
+    "w"
+);
+abin!(
+    #[target_feature(enable = "fp16")]
+    fmulx_h,
+    "fmulx",
+    "h",
+    "w"
+);
+abin!(
+    #[target_feature(enable = "fp16")]
+    fabd_h,
+    "fabd",
+    "h",
+    "w"
+);
+
+// ── One-source single/double, plus half `fsqrt` (FEAT_FP16): aun!(fn name, mnemonic, FP prefix, GPR modifier) ──
+aun!(fsqrt_s, "fsqrt", "s", "w");
+aun!(frecpe_s, "frecpe", "s", "w");
+aun!(frsqrte_s, "frsqrte", "s", "w");
+aun!(frecpx_s, "frecpx", "s", "w");
+aun!(frintn_s, "frintn", "s", "w");
+aun!(frinta_s, "frinta", "s", "w");
+aun!(frintp_s, "frintp", "s", "w");
+aun!(frintm_s, "frintm", "s", "w");
+aun!(frintz_s, "frintz", "s", "w");
+aun!(fsqrt_d, "fsqrt", "d", "x");
+aun!(frecpe_d, "frecpe", "d", "x");
+aun!(frsqrte_d, "frsqrte", "d", "x");
+aun!(frecpx_d, "frecpx", "d", "x");
+aun!(frintn_d, "frintn", "d", "x");
+aun!(frinta_d, "frinta", "d", "x");
+aun!(frintp_d, "frintp", "d", "x");
+aun!(frintm_d, "frintm", "d", "x");
+aun!(frintz_d, "frintz", "d", "x");
+aun!(
+    #[target_feature(enable = "fp16")]
+    fsqrt_h,
+    "fsqrt",
+    "h",
+    "w"
+);
+
+// ── Fused multiply-add: afma!(fn name, mnemonic, FP register prefix, GPR width modifier) ──
+afma!(fmadd_s, "fmadd", "s", "w");
+afma!(fmadd_d, "fmadd", "d", "x");
+
+// ── Float→float convert: acvtff!(fn name, src FP prefix, src GPR modifier, dst FP prefix, dst GPR modifier) ──
+acvtff!(fcvt_s2d, "s", "w", "d", "x");
+acvtff!(fcvt_d2s, "d", "x", "s", "w");
+acvtff!(
+    #[target_feature(enable = "fp16")]
+    fcvt_s2h,
+    "s",
+    "w",
+    "h",
+    "w"
+);
+acvtff!(
+    #[target_feature(enable = "fp16")]
+    fcvt_d2h,
+    "d",
+    "x",
+    "h",
+    "w"
+);
+acvtff!(
+    #[target_feature(enable = "fp16")]
+    fcvt_h2s,
+    "h",
+    "w",
+    "s",
+    "w"
+);
+acvtff!(
+    #[target_feature(enable = "fp16")]
+    fcvt_h2d,
+    "h",
+    "w",
+    "d",
+    "x"
+);
+
+// ── Float→int (round toward zero): acvtf2i!(fn name, mnemonic, int dest modifier, src FP prefix, src GPR modifier) ──
+acvtf2i!(fcvtzs_s_w, "fcvtzs", "w", "s", "w");
+acvtf2i!(fcvtzs_s_x, "fcvtzs", "x", "s", "w");
+acvtf2i!(fcvtzs_d_w, "fcvtzs", "w", "d", "x");
+acvtf2i!(fcvtzs_d_x, "fcvtzs", "x", "d", "x");
+acvtf2i!(fcvtzu_s_w, "fcvtzu", "w", "s", "w");
+acvtf2i!(fcvtzu_s_x, "fcvtzu", "x", "s", "w");
+acvtf2i!(fcvtzu_d_w, "fcvtzu", "w", "d", "x");
+acvtf2i!(fcvtzu_d_x, "fcvtzu", "x", "d", "x");
+
+// ── Int→float: acvti2f!(fn name, mnemonic, dst FP prefix, dst GPR modifier, int source modifier) ──
+acvti2f!(scvtf_w_s, "scvtf", "s", "w", "w");
+acvti2f!(scvtf_x_s, "scvtf", "s", "w", "x");
+acvti2f!(scvtf_w_d, "scvtf", "d", "x", "w");
+acvti2f!(scvtf_x_d, "scvtf", "d", "x", "x");
+acvti2f!(ucvtf_w_s, "ucvtf", "s", "w", "w");
+acvti2f!(ucvtf_x_s, "ucvtf", "s", "w", "x");
+acvti2f!(ucvtf_w_d, "ucvtf", "d", "x", "w");
+acvti2f!(ucvtf_x_d, "ucvtf", "d", "x", "x");
+
+/// 32-bit result mask (low 32 bits set) for `.w` integer destinations.
+const W: u64 = 0xFFFF_FFFF; // NOTE(review): no use is visible in this part of the file — presumably applied in `eval`; confirm
+
+pub fn arch_tag() -> &'static str { // fixed identifier string for this backend
+    "aarch64"
+}
+
+fn has_fp16() -> bool { // thin wrapper over the host probe; `supports` and `eval` use it to gate half-precision ops
+    crate::host::aarch64_fp16()
+}
+
+/// Whether this host can execute the op: the label must be in the catalogue,
+/// and half-precision ops additionally require `FEAT_FP16`.
+pub fn supports(label: &str) -> bool {
+    // Same half-op test as `eval`, so the two agree on which labels need FP16.
+    let half =
+        label.ends_with(".h") || matches!(label, "fcvt.s2h" | "fcvt.d2h" | "fcvt.h2s" | "fcvt.h2d");
+    if half && !has_fp16() {
+        return false;
+    }
+    // Half labels are validated against the catalogue too, so an unknown `*.h`
+    // label is rejected instead of being reported as supported on FP16 hosts.
+    crate::ops::catalogue().iter().any(|o| o.label == label)
+}
+
+/// Execute `label` with the given operand bits and logical mode, returning
+/// `(result_bits, normalized_flags)`. Returns `None` if unsupported on this
+/// host (e.g. half op without `FEAT_FP16`) or if `label` matches no arm below.
+pub fn eval(label: &str, a: u64, b: u64, c: u64, mode: Mode) -> Option<(u64, u32)> {
+    let f = fpcr_for(mode); // mode-derived control value handed to every helper
+    let half =
+        label.ends_with(".h") || matches!(label, "fcvt.s2h" | "fcvt.d2h" | "fcvt.h2s" | "fcvt.h2d");
+    if half && !has_fp16() {
+        return None;
+    }
+    let (res, fpsr): (u64, u64) = unsafe { // NOTE(review): helper contracts live above this chunk
+        match label {
+            // two-source single
+            "fadd.s" => fadd_s(a, b, f),
+            "fsub.s" => fsub_s(a, b, f),
+            "fmul.s" => fmul_s(a, b, f),
+            "fdiv.s" => fdiv_s(a, b, f),
+            "fmax.s" => fmax_s(a, b, f),
+            "fmin.s" => fmin_s(a, b, f),
+            "fmaxnm.s" => fmaxnm_s(a, b, f),
+            "fminnm.s" => fminnm_s(a, b, f),
+            "fmulx.s" => fmulx_s(a, b, f),
+            "fabd.s" => fabd_s(a, b, f),
+            "frecps.s" => frecps_s(a, b, f),
+            "frsqrts.s" => frsqrts_s(a, b, f),
+            // two-source double
+            "fadd.d" => fadd_d(a, b, f),
+            "fsub.d" => fsub_d(a, b, f),
+            "fmul.d" => fmul_d(a, b, f),
+            "fdiv.d" => fdiv_d(a, b, f),
+            "fmax.d" => fmax_d(a, b, f),
+            "fmin.d" => fmin_d(a, b, f),
+            "fmaxnm.d" => fmaxnm_d(a, b, f),
+            "fminnm.d" => fminnm_d(a, b, f),
+            "fmulx.d" => fmulx_d(a, b, f),
+            "fabd.d" => fabd_d(a, b, f),
+            "frecps.d" => frecps_d(a, b, f),
+            "frsqrts.d" => frsqrts_d(a, b, f),
+            // two-source half
+            "fadd.h" => fadd_h(a, b, f),
+            "fsub.h" => fsub_h(a, b, f),
+            "fmul.h" => fmul_h(a, b, f),
+            "fdiv.h" => fdiv_h(a, b, f),
+            "fmax.h" => fmax_h(a, b, f),
+            "fmin.h" => fmin_h(a, b, f),
+            "fmaxnm.h" => fmaxnm_h(a, b, f),
+            "fminnm.h" => fminnm_h(a, b, f),
+            "fmulx.h" => fmulx_h(a, b, f),
+            "fabd.h" => fabd_h(a, b, f),
+            // one-source
+            "fsqrt.s" => fsqrt_s(a, f),
+            "frecpe.s" => frecpe_s(a, f),
+            "frsqrte.s" => frsqrte_s(a, f),
+            "frecpx.s" => frecpx_s(a, f),
+            "frintn.s" => frintn_s(a, f),
+            "frinta.s" => frinta_s(a, f),
+            "frintp.s" => frintp_s(a, f),
+            "frintm.s" => frintm_s(a, f),
+            "frintz.s" => frintz_s(a, f),
+            "fsqrt.d" => fsqrt_d(a, f),
+            "frecpe.d" => frecpe_d(a, f),
+            "frsqrte.d" => frsqrte_d(a, f),
+            "frecpx.d" => frecpx_d(a, f),
+            "frintn.d" => frintn_d(a, f),
+            "frinta.d" => frinta_d(a, f),
+            "frintp.d" => frintp_d(a, f),
+            "frintm.d" => frintm_d(a, f),
+            "frintz.d" => frintz_d(a, f),
+            "fsqrt.h" => fsqrt_h(a, f),
+            // fma
+            "fmadd.s" => fmadd_s(a, b, c, f),
+            "fmadd.d" => fmadd_d(a, b, c, f),
+            // float→float
+            "fcvt.s2d" => fcvt_s2d(a, f),
+            "fcvt.d2s" => fcvt_d2s(a, f),
+            "fcvt.s2h" => fcvt_s2h(a, f),
+            "fcvt.d2h" => fcvt_d2h(a, f),
+            "fcvt.h2s" => fcvt_h2s(a, f),
+            "fcvt.h2d" => fcvt_h2d(a, f),
+            // float→int
+            "fcvtzs.s.w" => fcvtzs_s_w(a, f),
+            "fcvtzs.s.x" => fcvtzs_s_x(a, f),
+            "fcvtzs.d.w" => fcvtzs_d_w(a, f),
+            "fcvtzs.d.x" => fcvtzs_d_x(a, f),
+            "fcvtzu.s.w" => fcvtzu_s_w(a, f),
+            "fcvtzu.s.x" => fcvtzu_s_x(a, f),
+            "fcvtzu.d.w" => fcvtzu_d_w(a, f),
+            "fcvtzu.d.x" => fcvtzu_d_x(a, f),
+            // int→float
+            "scvtf.w.s" => scvtf_w_s(a, f),
+            "scvtf.x.s" => scvtf_x_s(a, f),
+            "scvtf.w.d" => scvtf_w_d(a, f),
+            "scvtf.x.d" => scvtf_x_d(a, f),
+            "ucvtf.w.s" => ucvtf_w_s(a, f),
+            "ucvtf.x.s" => ucvtf_x_s(a, f),
+            "ucvtf.w.d" => ucvtf_w_d(a, f),
+            "ucvtf.x.d" => ucvtf_x_d(a, f),
+            _ => return None,
+        }
+    };
+    // `.w` integer destinations: compare only the low 32 bits.
+    let res = if label.ends_with(".w") { res & W } else { res };
+    Some((res, (fpsr as u32) & normflags::MASK)) // keep only the normalized flag bits
+}
diff --git a/crates/fp-hw-survey/src/arch/mod.rs b/crates/fp-hw-survey/src/arch/mod.rs
new file mode 100644
index 00000000..26c0db01
--- /dev/null
+++ b/crates/fp-hw-survey/src/arch/mod.rs
@@ -0,0 +1,20 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: MIT
+
+//! Architecture dispatch. Exactly one backend is compiled in per target; all
+//! expose the same `arch_tag` / `supports` / `eval` surface so the rest of the
+//! tool is architecture-agnostic.
+
+#[cfg(target_arch = "aarch64")]
+mod aarch64;
+#[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
+mod unsupported;
+#[cfg(target_arch = "x86_64")]
+mod x86_64;
+
+#[cfg(target_arch = "aarch64")]
+pub use aarch64::{arch_tag, eval, supports};
+#[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
+pub use unsupported::{arch_tag, eval, supports};
+#[cfg(target_arch = "x86_64")]
+pub use x86_64::{arch_tag, eval, supports};
diff --git a/crates/fp-hw-survey/src/arch/unsupported.rs b/crates/fp-hw-survey/src/arch/unsupported.rs
new file mode 100644
index 00000000..0f560100
--- /dev/null
+++ b/crates/fp-hw-survey/src/arch/unsupported.rs
@@ -0,0 +1,20 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: MIT
+#![cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
+
+//! Fallback oracle for architectures this tool has no hand-written asm for.
+//! It supports nothing; `capture` on such a host produces only a header line.
+
+use crate::mode::Mode;
+
+pub fn arch_tag() -> &'static str {
+    "unsupported" // tag reported when no asm backend exists for the target
+}
+
+pub fn supports(_label: &str) -> bool {
+    false // nothing is executable here, so `capture` skips every catalogued op
+}
+
+pub fn eval(_label: &str, _a: u64, _b: u64, _c: u64, _mode: Mode) -> Option<(u64, u32)> {
+    None // same signature as the real backends; never produces a row
+}
diff --git a/crates/fp-hw-survey/src/arch/x86_64.rs b/crates/fp-hw-survey/src/arch/x86_64.rs
new file mode 100644
index 00000000..c4f7cc23
--- /dev/null
+++ b/crates/fp-hw-survey/src/arch/x86_64.rs
@@ -0,0 +1,321 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: MIT
+#![cfg(target_arch = "x86_64")]
+
+//! x86-64 SSE hardware oracle. Executes each supported catalogued op as the
+//! real scalar SSE/SSE2 (and, for FMA, FMA3) instruction under a caller-chosen
+//! `MXCSR` state, reading back the cumulative exception flags and translating
+//! them into the normalized layout.
+//!
+//! **This module cannot be exercised on the author's development host (which is
+//! aarch64).** Its correctness on real x64 hardware is gated at runtime by the
+//! `selftest` known-answer checks, which `capture` runs and which abort the run
+//! on any mismatch.
+
+use crate::mode::{Mode, Round};
+use crate::normflags;
+
+/// Compose the `MXCSR` value for logical mode `m`. Bits 7..12 (`0x1F80`) mask
+/// all six exceptions so nothing traps. **Rounding-control encoding differs
+/// from AArch64**: RC bits [14:13] are 00=nearest, 01=down(−inf), 10=up(+inf),
+/// 11=zero. Flush mode adds `FTZ` (bit 15) and `DAZ` (bit 6).
+fn mxcsr_for(m: Mode) -> u32 {
+    const MASK_ALL: u32 = 0x1F80;
+    const FTZ: u32 = 1 << 15;
+    const DAZ: u32 = 1 << 6;
+    let rounding = match m.round {
+        Round::Ne => 0b00u32,
+        Round::Down => 0b01,
+        Round::Up => 0b10,
+        Round::Zero => 0b11,
+    };
+    // Flush-to-zero and denormals-are-zero are always toggled together.
+    let flush_bits = if m.flush { FTZ | DAZ } else { 0 };
+    MASK_ALL | (rounding << 13) | flush_bits
+}
+
+/// Translate the six `MXCSR` exception-status bits into the normalized layout.
+/// x86 keeps them in bits 0..=5 in the order IE,DE,ZE,OE,UE,PE; all other bits
+/// of `mxcsr` (exception masks, rounding control, FTZ/DAZ) are ignored.
+/// Flags that are not set in `mxcsr` are left clear in the result.
+fn translate(mxcsr: u32) -> u32 {
+    // Normalized flag for each x86 status bit, indexed by bit position.
+    let by_bit: [u32; 6] = [
+        normflags::IOC, // bit 0: IE  → invalid
+        normflags::IDC, // bit 1: DE  → input denormal
+        normflags::DZC, // bit 2: ZE  → divide by zero
+        normflags::OFC, // bit 3: OE  → overflow
+        normflags::UFC, // bit 4: UE  → underflow
+        normflags::IXC, // bit 5: PE  → inexact
+    ];
+    let status = mxcsr & 0x3F;
+
+    // OR together the normalized flag of every status bit that is set.
+    let mut normalized = 0u32;
+    for (bit, &flag) in by_bit.iter().enumerate() {
+        if (status >> bit) & 1 != 0 {
+            normalized |= flag;
+        }
+    }
+    normalized
+}
+
+// Two-operand scalar op `$op xmm0, xmm1`, executed under the caller's MXCSR value.
+// The previous MXCSR is saved first and restored before the asm block ends.
+macro_rules! xbin {
+    ($name:ident, $op:literal, $ld:literal, $rm:literal) => {
+        #[inline(never)]
+        unsafe fn $name(a: u64, b: u64, mxcsr: u32) -> (u64, u32) {
+            let mut st = [0u32; 2]; // st[0]: MXCSR after the op, st[1]: caller's MXCSR
+            let res: u64;
+            core::arch::asm!(
+                "stmxcsr [{o} + 4]", // save the caller's MXCSR
+                "ldmxcsr [{c}]", // install the requested MXCSR
+                concat!($ld, " xmm0, {a:", $rm, "}"),
+                concat!($ld, " xmm1, {b:", $rm, "}"),
+                concat!($op, " xmm0, xmm1"),
+                concat!($ld, " {res:", $rm, "}, xmm0"),
+                "stmxcsr [{o}]", // read back the exception flags raised by the op
+                "ldmxcsr [{o} + 4]", // never leak RC/FTZ/DAZ or flags into Rust code
+                c = in(reg) &mxcsr, o = in(reg) &mut st,
+                a = in(reg) a, b = in(reg) b,
+                res = out(reg) res,
+                out("xmm0") _, out("xmm1") _,
+            );
+            (res, translate(st[0]))
+        }
+    };
+}
+
+// One-operand scalar op under the caller's MXCSR; the previous MXCSR is restored on exit.
+macro_rules! xun {
+    ($name:ident, $op:literal, $ld:literal, $rm:literal) => {
+        #[inline(never)]
+        unsafe fn $name(a: u64, mxcsr: u32) -> (u64, u32) {
+            let mut st = [0u32; 2]; // st[0]: MXCSR after the op, st[1]: caller's MXCSR
+            let res: u64;
+            core::arch::asm!(
+                "stmxcsr [{o} + 4]", // save the caller's MXCSR
+                "ldmxcsr [{c}]", // install the requested MXCSR
+                concat!($ld, " xmm0, {a:", $rm, "}"),
+                concat!($op, " xmm0, xmm0"),
+                concat!($ld, " {res:", $rm, "}, xmm0"),
+                "stmxcsr [{o}]", // read back the exception flags raised by the op
+                "ldmxcsr [{o} + 4]", // never leak RC/FTZ/DAZ or flags into Rust code
+                c = in(reg) &mxcsr, o = in(reg) &mut st,
+                a = in(reg) a, res = out(reg) res,
+                out("xmm0") _,
+            );
+            (res, translate(st[0]))
+        }
+    };
+}
+
+// Scalar FMA3 op under the caller's MXCSR. The previous MXCSR is saved first
+// and restored before the asm block ends, so the altered rounding/flush
+// controls never reach ordinary Rust code.
+macro_rules! xfma {
+    ($name:ident, $op:literal, $ld:literal, $rm:literal) => {
+        #[target_feature(enable = "avx,fma")]
+        #[inline(never)]
+        unsafe fn $name(a: u64, b: u64, c: u64, mxcsr: u32) -> (u64, u32) {
+            // vfmadd213ss xmm0, xmm1, xmm2  =>  xmm0 = xmm1*xmm0 + xmm2.
+            // Load xmm0=a, xmm1=b, xmm2=c  =>  b*a + c = a*b + c.
+            let mut st = [0u32; 2]; // st[0]: MXCSR after the op, st[1]: caller's MXCSR
+            let res: u64;
+            core::arch::asm!(
+                "stmxcsr [{o} + 4]", // save the caller's MXCSR
+                "ldmxcsr [{c}]", // install the requested MXCSR
+                concat!($ld, " xmm0, {a:", $rm, "}"),
+                concat!($ld, " xmm1, {b:", $rm, "}"),
+                concat!($ld, " xmm2, {cc:", $rm, "}"),
+                concat!($op, " xmm0, xmm1, xmm2"),
+                concat!($ld, " {res:", $rm, "}, xmm0"),
+                "stmxcsr [{o}]", // read back the exception flags raised by the op
+                "ldmxcsr [{o} + 4]", // restore the caller's MXCSR
+                c = in(reg) &mxcsr,
+                o = in(reg) &mut st,
+                a = in(reg) a, b = in(reg) b, cc = in(reg) c,
+                res = out(reg) res,
+                out("xmm0") _, out("xmm1") _, out("xmm2") _,
+            );
+            (res, translate(st[0]))
+        }
+    };
+}
+
+// Float→float conversion under the caller's MXCSR; the previous MXCSR is restored on exit.
+macro_rules! xcvtff {
+    ($name:ident, $op:literal, $ldi:literal, $rmi:literal, $ldo:literal, $rmo:literal) => {
+        #[inline(never)]
+        unsafe fn $name(a: u64, mxcsr: u32) -> (u64, u32) {
+            let mut st = [0u32; 2]; // st[0]: MXCSR after the op, st[1]: caller's MXCSR
+            let res: u64;
+            core::arch::asm!(
+                "stmxcsr [{o} + 4]", // save the caller's MXCSR
+                "ldmxcsr [{c}]", // install the requested MXCSR
+                concat!($ldi, " xmm0, {a:", $rmi, "}"),
+                concat!($op, " xmm0, xmm0"),
+                concat!($ldo, " {res:", $rmo, "}, xmm0"),
+                "stmxcsr [{o}]", // read back the exception flags raised by the op
+                "ldmxcsr [{o} + 4]", // never leak RC/FTZ/DAZ or flags into Rust code
+                c = in(reg) &mxcsr, o = in(reg) &mut st,
+                a = in(reg) a, res = out(reg) res,
+                out("xmm0") _,
+            );
+            (res, translate(st[0]))
+        }
+    };
+}
+
+// Float→int conversion under the caller's MXCSR; the previous MXCSR is restored on exit.
+macro_rules! xcvtf2i {
+    ($name:ident, $op:literal, $ldi:literal, $rmi:literal, $rmo:literal) => {
+        #[inline(never)]
+        unsafe fn $name(a: u64, mxcsr: u32) -> (u64, u32) {
+            let mut st = [0u32; 2]; // st[0]: MXCSR after the op, st[1]: caller's MXCSR
+            let res: u64;
+            core::arch::asm!(
+                "stmxcsr [{o} + 4]", // save the caller's MXCSR
+                "ldmxcsr [{c}]", // install the requested MXCSR
+                concat!($ldi, " xmm0, {a:", $rmi, "}"),
+                concat!($op, " {res:", $rmo, "}, xmm0"),
+                "stmxcsr [{o}]", // read back the exception flags raised by the op
+                "ldmxcsr [{o} + 4]", // never leak RC/FTZ/DAZ or flags into Rust code
+                c = in(reg) &mxcsr, o = in(reg) &mut st,
+                a = in(reg) a, res = out(reg) res,
+                out("xmm0") _,
+            );
+            (res, translate(st[0]))
+        }
+    };
+}
+
+// Int→float conversion under the caller's MXCSR; the previous MXCSR is restored on exit.
+macro_rules! xcvti2f {
+    ($name:ident, $op:literal, $rmi:literal, $ldo:literal, $rmo:literal) => {
+        #[inline(never)]
+        unsafe fn $name(a: u64, mxcsr: u32) -> (u64, u32) {
+            let mut st = [0u32; 2]; // st[0]: MXCSR after the op, st[1]: caller's MXCSR
+            let res: u64;
+            core::arch::asm!(
+                "stmxcsr [{o} + 4]", // save the caller's MXCSR
+                "ldmxcsr [{c}]", // install the requested MXCSR
+                concat!($op, " xmm0, {a:", $rmi, "}"),
+                concat!($ldo, " {res:", $rmo, "}, xmm0"),
+                "stmxcsr [{o}]", // read back the exception flags raised by the op
+                "ldmxcsr [{o} + 4]", // never leak RC/FTZ/DAZ or flags into Rust code
+                c = in(reg) &mxcsr, o = in(reg) &mut st,
+                a = in(reg) a, res = out(reg) res,
+                out("xmm0") _,
+            );
+            (res, translate(st[0]))
+        }
+    };
+}
+
+// ── Arithmetic single (args: fn name, op, move insn, GPR modifier) ────────
+xbin!(addss, "addss", "movd", "e");
+xbin!(subss, "subss", "movd", "e");
+xbin!(mulss, "mulss", "movd", "e");
+xbin!(divss, "divss", "movd", "e");
+xbin!(maxss, "maxss", "movd", "e");
+xbin!(minss, "minss", "movd", "e");
+// ── Arithmetic double (same argument order, `movq` with `r` registers) ────
+xbin!(addsd, "addsd", "movq", "r");
+xbin!(subsd, "subsd", "movq", "r");
+xbin!(mulsd, "mulsd", "movq", "r");
+xbin!(divsd, "divsd", "movq", "r");
+xbin!(maxsd, "maxsd", "movq", "r");
+xbin!(minsd, "minsd", "movq", "r");
+// ── Square root ───────────────────────────────────────────────────────────
+xun!(sqrtss, "sqrtss", "movd", "e");
+xun!(sqrtsd, "sqrtsd", "movq", "r");
+// ── FMA (FMA3; generated fns carry `#[target_feature(enable = "avx,fma")]`) ─
+xfma!(fmadd_ss, "vfmadd213ss", "movd", "e");
+xfma!(fmadd_sd, "vfmadd213sd", "movq", "r");
+// ── Float→float (input move + modifier, then output move + modifier) ──────
+xcvtff!(cvtss2sd, "cvtss2sd", "movd", "e", "movq", "r");
+xcvtff!(cvtsd2ss, "cvtsd2ss", "movq", "r", "movd", "e");
+// ── Float→int (truncating; signed only): input move + modifier, result modifier
+xcvtf2i!(cvttss2si_w, "cvttss2si", "movd", "e", "e");
+xcvtf2i!(cvttss2si_x, "cvttss2si", "movd", "e", "r");
+xcvtf2i!(cvttsd2si_w, "cvttsd2si", "movq", "r", "e");
+xcvtf2i!(cvttsd2si_x, "cvttsd2si", "movq", "r", "r");
+// ── Int→float (signed only): source modifier, then output move + modifier ─
+xcvti2f!(cvtsi2ss_w, "cvtsi2ss", "e", "movd", "e");
+xcvti2f!(cvtsi2ss_x, "cvtsi2ss", "r", "movd", "e");
+xcvti2f!(cvtsi2sd_w, "cvtsi2sd", "e", "movq", "r");
+xcvti2f!(cvtsi2sd_x, "cvtsi2sd", "r", "movq", "r");
+
+const W: u64 = 0xFFFF_FFFF; // low-32-bit mask `eval` applies to `.w` integer destinations
+
+pub fn arch_tag() -> &'static str {
+    "x86_64" // re-exported by `arch/mod.rs`; `capture` records it as the header's `arch_tag`
+}
+
+fn has_fma() -> bool { // both features are enabled on the `xfma!`-generated helpers
+    std::arch::is_x86_feature_detected!("fma") && std::arch::is_x86_feature_detected!("avx")
+}
+
+/// Ops x86-64 executes faithfully as one scalar instruction. Everything without
+/// a scalar SSE counterpart (`fmaxnm`, `fmulx`, the estimate family, directed-
+/// rounding float→int, unsigned conversions, all half ops) is reported `false`.
+pub fn supports(label: &str) -> bool {
+    let fused = matches!(label, "fmadd.s" | "fmadd.d");
+    let plain = matches!(
+        label,
+        "fadd.s" | "fsub.s" | "fmul.s" | "fdiv.s" | "fmax.s" | "fmin.s" | "fadd.d" | "fsub.d"
+            | "fmul.d" | "fdiv.d" | "fmax.d" | "fmin.d" | "fsqrt.s" | "fsqrt.d" | "fcvt.s2d"
+            | "fcvt.d2s" | "fcvtzs.s.w" | "fcvtzs.s.x" | "fcvtzs.d.w" | "fcvtzs.d.x"
+            | "scvtf.w.s" | "scvtf.x.s" | "scvtf.w.d" | "scvtf.x.d"
+    );
+    (fused && has_fma()) || plain
+}
+
+/// Execute `label` on the local x86-64 hardware under the `MXCSR` state for
+/// `mode`, returning `(result_bits, normalized_flags)`. Returns `None` for any
+/// label this backend does not implement, and for `fmadd.*` on hosts without
+/// FMA. Operands an op does not take are ignored.
+pub fn eval(label: &str, a: u64, b: u64, c: u64, mode: Mode) -> Option<(u64, u32)> {
+    let ctl = mxcsr_for(mode);
+    // The helpers are the macro-generated asm wrappers above; the FMA ones are
+    // only reached behind the `has_fma()` guard.
+    let (raw, flags) = unsafe {
+        match label {
+            // arithmetic
+            "fadd.s" => addss(a, b, ctl),
+            "fsub.s" => subss(a, b, ctl),
+            "fmul.s" => mulss(a, b, ctl),
+            "fdiv.s" => divss(a, b, ctl),
+            "fmax.s" => maxss(a, b, ctl),
+            "fmin.s" => minss(a, b, ctl),
+            "fadd.d" => addsd(a, b, ctl),
+            "fsub.d" => subsd(a, b, ctl),
+            "fmul.d" => mulsd(a, b, ctl),
+            "fdiv.d" => divsd(a, b, ctl),
+            "fmax.d" => maxsd(a, b, ctl),
+            "fmin.d" => minsd(a, b, ctl),
+            "fsqrt.s" => sqrtss(a, ctl),
+            "fsqrt.d" => sqrtsd(a, ctl),
+            // FMA: when the guard fails the label falls through to `_`.
+            "fmadd.s" if has_fma() => fmadd_ss(a, b, c, ctl),
+            "fmadd.d" if has_fma() => fmadd_sd(a, b, c, ctl),
+            // conversions
+            "fcvt.s2d" => cvtss2sd(a, ctl),
+            "fcvt.d2s" => cvtsd2ss(a, ctl),
+            "fcvtzs.s.w" => cvttss2si_w(a, ctl),
+            "fcvtzs.s.x" => cvttss2si_x(a, ctl),
+            "fcvtzs.d.w" => cvttsd2si_w(a, ctl),
+            "fcvtzs.d.x" => cvttsd2si_x(a, ctl),
+            "scvtf.w.s" => cvtsi2ss_w(a, ctl),
+            "scvtf.x.s" => cvtsi2ss_x(a, ctl),
+            "scvtf.w.d" => cvtsi2sd_w(a, ctl),
+            "scvtf.x.d" => cvtsi2sd_x(a, ctl),
+            _ => return None,
+        }
+    };
+    // `.w` integer destinations: keep only the low 32 bits.
+    let bits = if label.ends_with(".w") { raw & W } else { raw };
+    Some((bits, flags & normflags::MASK))
+}
diff --git a/crates/fp-hw-survey/src/capture.rs b/crates/fp-hw-survey/src/capture.rs
new file mode 100644
index 00000000..acd17e54
--- /dev/null
+++ b/crates/fp-hw-survey/src/capture.rs
@@ -0,0 +1,228 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: MIT
+
+//! Phase 1: drive every supported catalogued op over a deterministic operand ×
+//! mode corpus on the local hardware and stream native `(result, flags)` rows
+//! to an NDJSON file.
+
+use std::fs::File;
+use std::io::{BufWriter, Write};
+
+use crate::arch;
+use crate::corpus;
+use crate::host;
+use crate::jsonio::ObjectWriter;
+use crate::mode;
+use crate::ops::{self, Fmt, Kind};
+
+/// Capture configuration assembled from CLI args.
+pub struct Config {
+    pub label: String, // written verbatim into the header's `label` field
+    pub out: String, // path of the NDJSON file that `run` creates
+    /// Number of random operand draws per op (on top of the curated edge cases).
+    pub pairs: usize,
+    /// Output budget in bytes; `run` stops after the row that reaches or crosses it.
+    pub budget_bytes: u64,
+    /// If non-empty, only ops whose label is in this list are captured.
+    pub only_ops: Vec<String>,
+}
+
+/// Curated edge-case bit patterns for `fmt`, taken from the matching `corpus`
+/// table and widened to the `u64` operand carrier. Order follows the table, so
+/// it is the same on every machine.
+fn float_edges(fmt: Fmt) -> Vec<u64> {
+    match fmt {
+        Fmt::Single => corpus::SINGLE.iter().copied().map(u64::from).collect(),
+        Fmt::Double => corpus::DOUBLE.iter().copied().collect(),
+        Fmt::Half => corpus::HALF.iter().copied().map(u64::from).collect(),
+    }
+}
+
+/// `n` reproducible random bit patterns of `fmt`'s width drawn from fill stream
+/// `stream`, widened to `u64`. The same `(fmt, stream, n)` always yields the
+/// same sequence.
+fn float_fill(fmt: Fmt, stream: u64, n: usize) -> Vec<u64> {
+    match fmt {
+        Fmt::Single => corpus::fill_u32(stream, n).into_iter().map(u64::from).collect(),
+        Fmt::Double => corpus::fill_u64(stream, n),
+        Fmt::Half => corpus::fill_u16(stream, n).into_iter().map(u64::from).collect(),
+    }
+}
+
+/// Curated integer operands for an int→float op of the given `width`. For
+/// 32-bit ops each entry is truncated to its low 32 bits; otherwise the table
+/// is used unchanged.
+fn int_edges(width: u8) -> Vec<u64> {
+    match width {
+        32 => corpus::INTS.iter().map(|&v| v & 0xFFFF_FFFF).collect(),
+        _ => corpus::INTS.to_vec(),
+    }
+}
+
+/// `n` reproducible integer operands from fill stream `stream`: 32-bit draws
+/// widened to `u64` when `width == 32`, full 64-bit draws otherwise.
+fn int_fill(width: u8, stream: u64, n: usize) -> Vec<u64> {
+    match width {
+        32 => corpus::fill_u32(stream, n).into_iter().map(u64::from).collect(),
+        _ => corpus::fill_u64(stream, n),
+    }
+}
+
+/// Build the `(a, b, c)` operand triples for one op. Operands the op does not
+/// take are `0`, so `c` is `0` for every non-FMA op.
+///
+/// The curated edge cases always come first (as a full cross product for
+/// two- and three-operand ops), followed by `pairs` seeded random draws.
+///
+/// Fill streams are derived from the op's catalogue index (`op_idx * 8` for
+/// `a`, `+ 1` for `b`, `+ 2` for `c`), so an op's random operands do not depend
+/// on which other ops are captured.
+fn operands(spec: &ops::OpSpec, op_idx: usize, pairs: usize) -> Vec<(u64, u64, u64)> {
+    let base = (op_idx as u64) * 8;
+    let (stream_a, stream_b, stream_c) = (base, base + 1, base + 2);
+
+    // Shape shared by every one-operand kind: each edge value, then each fill
+    // value, with `b` and `c` left at zero.
+    let single = |edges: Vec<u64>, fill: Vec<u64>| -> Vec<(u64, u64, u64)> {
+        edges.into_iter().chain(fill).map(|a| (a, 0, 0)).collect()
+    };
+
+    match spec.kind {
+        // Two-operand arithmetic.
+        Kind::Bin(fmt) => {
+            let edges = float_edges(fmt);
+            let mut triples = Vec::new();
+            // Edge × edge cross product, `a` varying slowest.
+            for &a in &edges {
+                triples.extend(edges.iter().map(|&b| (a, b, 0)));
+            }
+            // Then `pairs` random draws, one fill stream per operand.
+            let fill_a = float_fill(fmt, stream_a, pairs);
+            let fill_b = float_fill(fmt, stream_b, pairs);
+            triples.extend(fill_a.into_iter().zip(fill_b).map(|(a, b)| (a, b, 0)));
+            triples
+        }
+        // One-operand arithmetic.
+        Kind::Un(fmt) => {
+            single(float_edges(fmt), float_fill(fmt, stream_a, pairs))
+        }
+        // Fused multiply-add: three operands.
+        Kind::Fma(fmt) => {
+            let edges = float_edges(fmt);
+            // Curated coverage: edge a × edge b with c in {0.0, 1.0, -1.0}.
+            let addends: [u64; 3] = match fmt {
+                Fmt::Single => [0x0000_0000, 0x3f80_0000, 0xbf80_0000],
+                Fmt::Double => [
+                    0x0000_0000_0000_0000,
+                    0x3ff0_0000_0000_0000,
+                    0xbff0_0000_0000_0000,
+                ],
+                Fmt::Half => [0x0000, 0x3c00, 0xbc00],
+            };
+            let mut triples = Vec::new();
+            // Cross product with `a` varying slowest and `c` fastest.
+            for &a in &edges {
+                for &b in &edges {
+                    triples.extend(addends.iter().map(|&c| (a, b, c)));
+                }
+            }
+            // Then `pairs` random draws, one fill stream per operand.
+            let fill_a = float_fill(fmt, stream_a, pairs);
+            let fill_b = float_fill(fmt, stream_b, pairs);
+            let fill_c = float_fill(fmt, stream_c, pairs);
+            let abc = fill_a.into_iter().zip(fill_b).zip(fill_c);
+            triples.extend(abc.map(|((a, b), c)| (a, b, c)));
+            triples
+        }
+        // Float→float conversion: operand drawn from the source format.
+        Kind::CvtF2F { from, .. } => {
+            single(float_edges(from), float_fill(from, stream_a, pairs))
+        }
+        // Float→int conversion: operand drawn from the source format.
+        Kind::CvtF2I { from, .. } => {
+            single(float_edges(from), float_fill(from, stream_a, pairs))
+        }
+        // Int→float conversion: operand drawn from the integer tables.
+        Kind::CvtI2F { width, .. } => {
+            single(int_edges(width), int_fill(width, stream_a, pairs))
+        }
+    }
+}
+
+/// Run a capture: write the header line, then one NDJSON row for every
+/// (op, operand triple, mode) the local backend evaluates. Returns
+/// `(ops_captured, rows_written)`; an op counts once it has produced a row.
+/// Stops right after the row that brings the output to `cfg.budget_bytes`.
+pub fn run(cfg: &Config) -> std::io::Result<(usize, u64)> {
+    let mut sink = BufWriter::new(File::create(&cfg.out)?);
+
+    // Header line.
+    let stamp = host::now_unix();
+    let mut hdr = ObjectWriter::new();
+    hdr.str_field("kind", "header")
+        .u64_field("schema", 1)
+        .str_field("arch", host::arch_name())
+        .str_field("arch_tag", arch::arch_tag())
+        .str_field("os", host::os_name())
+        .str_field("cpu", &host::cpu_brand())
+        .str_field("label", &cfg.label)
+        .str_array_field("features", &host::features())
+        .u64_field("captured_unix", stamp)
+        .str_field("captured_utc", &host::unix_to_iso_utc(stamp))
+        .str_field("tool_version", env!("CARGO_PKG_VERSION"));
+    let header = hdr.finish();
+    writeln!(sink, "{header}")?;
+    let mut written: u64 = header.len() as u64 + 1;
+
+    let modes = mode::all_modes();
+    let wanted = |l: &str| cfg.only_ops.is_empty() || cfg.only_ops.iter().any(|o| o == l);
+    let mut ops_captured = 0usize;
+    let mut rows: u64 = 0;
+    let mut over_budget = false;
+
+    for (op_idx, spec) in ops::catalogue().iter().enumerate() {
+        if !wanted(spec.label) || !arch::supports(spec.label) {
+            continue;
+        }
+        let mut produced = false;
+        'rows: for &(a, b, c) in &operands(spec, op_idx, cfg.pairs) {
+            for &m in &modes {
+                let Some((res, flags)) = arch::eval(spec.label, a, b, c, m) else {
+                    continue;
+                };
+                produced = true;
+                let mut row = ObjectWriter::new();
+                row.str_field("op", spec.label)
+                    .u64_field("a", a)
+                    .u64_field("b", b)
+                    .u64_field("c", c)
+                    .str_field("mode", m.round.key())
+                    .bool_field("flush", m.flush)
+                    .u64_field("res", res)
+                    .u64_field("flags", flags as u64);
+                let line = row.finish();
+                writeln!(sink, "{line}")?;
+                written += line.len() as u64 + 1;
+                rows += 1;
+                if written >= cfg.budget_bytes {
+                    eprintln!(
+                        "budget of {} bytes reached after {} rows; stopping early",
+                        cfg.budget_bytes, rows
+                    );
+                    over_budget = true;
+                    break 'rows;
+                }
+            }
+        }
+        if produced {
+            ops_captured += 1;
+        }
+        if over_budget {
+            break;
+        }
+    }
+
+    sink.flush()?;
+    Ok((ops_captured, rows))
+}
diff --git a/crates/fp-hw-survey/src/corpus.rs b/crates/fp-hw-survey/src/corpus.rs
new file mode 100644
index 00000000..6c517c5c
--- /dev/null
+++ b/crates/fp-hw-survey/src/corpus.rs
@@ -0,0 +1,215 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: MIT
+
+//! Deterministic operand corpora.
+//!
+//! Every machine must generate **identical** operand sequences so that captures
+//! align row-for-row during merge. All sets are therefore fixed consts plus a
+//! seeded SplitMix64 fill — never wall-clock or thread-randomized.
+
+/// Curated single-precision edge bit patterns: signed zeros/infinities, quiet
+/// and signalling NaNs, the subnormal/normal boundaries, powers near the
+/// integer-precision limit, rounding boundaries, and a few ordinary values.
+pub const SINGLE: &[u32] = &[
+    0x0000_0000, // +0
+    0x8000_0000, // -0
+    0x0000_0001, // smallest +subnormal
+    0x8000_0001, // smallest -subnormal
+    0x007f_ffff, // largest +subnormal
+    0x0080_0000, // smallest +normal
+    0x3f80_0000, // 1.0
+    0xbf80_0000, // -1.0
+    0x4000_0000, // 2.0
+    0x3f00_0000, // 0.5
+    0x4049_0fdb, // pi
+    0x3eaa_aaab, // 1/3 rounded
+    0x4b80_0000, // 2^24 (integer-precision limit: 2^24 + 1 is not representable)
+    0x4b7f_ffff, // 2^24 - 1 (largest value below 2^24)
+    0x7f7f_ffff, // largest +normal
+    0xff7f_ffff, // largest -normal
+    0x7f80_0000, // +inf
+    0xff80_0000, // -inf
+    0x7fc0_0000, // +qNaN
+    0xffc0_0000, // -qNaN
+    0x7f80_0001, // +sNaN
+    0x7fbf_ffff, // +sNaN (max payload)
+    0x3f7f_ffff, // just below 1.0
+    0x3f80_0001, // just above 1.0
+    0x4170_0000, // 15.0
+    0xc170_0000, // -15.0
+    0x0080_0001, // smallest normal + 1 ulp
+    0x7e80_0000, // large finite (2^126, ~8.5e37)
+    0x0040_0000, // mid subnormal
+    0x4f00_0000, // 2^31 (overflow boundary for i32)
+    0x5f00_0000, // 2^63 (overflow boundary for i64)
+    0xcf00_0000, // -2^31 (exactly i32::MIN)
+];
+
+/// Curated double-precision edge bit patterns.
+pub const DOUBLE: &[u64] = &[
+    0x0000_0000_0000_0000, // +0
+    0x8000_0000_0000_0000, // -0
+    0x0000_0000_0000_0001, // smallest +subnormal
+    0x8000_0000_0000_0001, // smallest -subnormal
+    0x000f_ffff_ffff_ffff, // largest +subnormal
+    0x0010_0000_0000_0000, // smallest +normal
+    0x3ff0_0000_0000_0000, // 1.0
+    0xbff0_0000_0000_0000, // -1.0
+    0x4000_0000_0000_0000, // 2.0
+    0x3fe0_0000_0000_0000, // 0.5
+    0x4009_21fb_5444_2d18, // pi
+    0x3fd5_5555_5555_5555, // 1/3 rounded
+    0x4330_0000_0000_0000, // 2^52 (spacing becomes 1.0: no fractional bits from here up)
+    0x432f_ffff_ffff_ffff, // 2^52 - 0.5 (largest value below 2^52)
+    0x7fef_ffff_ffff_ffff, // largest +normal
+    0xffef_ffff_ffff_ffff, // largest -normal
+    0x7ff0_0000_0000_0000, // +inf
+    0xfff0_0000_0000_0000, // -inf
+    0x7ff8_0000_0000_0000, // +qNaN
+    0xfff8_0000_0000_0000, // -qNaN
+    0x7ff0_0000_0000_0001, // +sNaN
+    0x7ff7_ffff_ffff_ffff, // +sNaN (max payload)
+    0x3fef_ffff_ffff_ffff, // just below 1.0
+    0x3ff0_0000_0000_0001, // just above 1.0
+    0x41df_ffff_ffc0_0000, // 2^31 - 1 (exactly i32::MAX)
+    0x43e0_0000_0000_0000, // 2^63 (overflow boundary for i64)
+    0xc3e0_0000_0000_0000, // -2^63
+    0x4079_0000_0000_0000, // 400.0
+    0x0008_0000_0000_0000, // mid subnormal
+    0x7fe0_0000_0000_0000, // large finite
+    0x000f_ffff_ffff_fffe, // near subnormal top
+];
+
+/// Curated half-precision (IEEE binary16) edge bit patterns.
+pub const HALF: &[u16] = &[
+    0x0000, // +0
+    0x8000, // -0
+    0x0001, // smallest +subnormal
+    0x8001, // smallest -subnormal
+    0x03ff, // largest +subnormal
+    0x0400, // smallest +normal
+    0x3c00, // 1.0
+    0xbc00, // -1.0
+    0x4000, // 2.0
+    0x3800, // 0.5
+    0x3555, // 1/3 rounded
+    0x6400, // 1024 = 2^10 (spacing becomes 1.0 from here up)
+    0x63ff, // 1023.5 (largest value below 1024)
+    0x7bff, // largest +normal (65504)
+    0xfbff, // largest -normal
+    0x7c00, // +inf
+    0xfc00, // -inf
+    0x7e00, // +qNaN
+    0xfe00, // -qNaN
+    0x7c01, // +sNaN
+    0x7dff, // +sNaN (max payload)
+    0x3bff, // just below 1.0
+    0x3c01, // just above 1.0
+    0x4900, // 10.0
+    0x0200, // mid subnormal
+];
+
+/// Curated integer operands (used for both 32- and 64-bit int→float, with the
+/// stored operand width fixing the value): zeros, ±1, powers of two, the
+/// signed/unsigned extremes, and the precision-limit boundaries.
+pub const INTS: &[u64] = &[
+    0,
+    1,
+    0xffff_ffff_ffff_ffff, // -1 / u64::MAX
+    2,
+    0x7fff_ffff,           // i32::MAX
+    0x8000_0000,           // i32::MIN as u32 / 2^31
+    0xffff_ffff,           // u32::MAX / -1 as u32
+    0x0100_0000,           // 2^24
+    0x0100_0001,           // 2^24 + 1 (f32 rounding boundary)
+    0x0020_0000_0000_0000, // 2^53
+    0x0020_0000_0000_0001, // 2^53 + 1 (f64 rounding boundary)
+    0x7fff_ffff_ffff_ffff, // i64::MAX
+    0x8000_0000_0000_0000, // i64::MIN / 2^63
+    1000,
+    0x0000_0000_dead_beef, // arbitrary pattern that fits in 32 bits
+    0x0000_0001_0000_0000, // 2^32
+    0xffff_ffff_0000_0000, // only the high 32 bits set (low 32 bits are zero)
+    42,
+];
+
+/// Seeded SplitMix64 generator: the same seed always yields the same 64-bit words.
+pub struct SplitMix64 {
+    state: u64,
+}
+
+impl SplitMix64 {
+    pub fn new(seed: u64) -> Self {
+        Self { state: seed }
+    }
+
+    /// Advance the state by the fixed increment and return the mixed word.
+    pub fn next_u64(&mut self) -> u64 {
+        self.state = self.state.wrapping_add(0x9e37_79b9_7f4a_7c15);
+        let a = (self.state ^ (self.state >> 30)).wrapping_mul(0xbf58_476d_1ce4_e5b9);
+        let b = (a ^ (a >> 27)).wrapping_mul(0x94d0_49bb_1331_11eb);
+        b ^ (b >> 31)
+    }
+}
+
+/// Fixed base seed so every machine's random fill is identical. Each `fill_*`
+/// XORs it with `stream * 0x1000_0001` (wrapping) so that independent draws
+/// (e.g. operand `a` vs operand `b`) start from different generator states.
+pub const FILL_SEED: u64 = 0x4650_4857_5355_5256; // "FPHWSURV"
+
+/// `n` reproducible 32-bit words for `stream`: the low 32 bits of each SplitMix64 draw.
+pub fn fill_u32(stream: u64, n: usize) -> Vec<u32> {
+    let mut rng = SplitMix64::new(FILL_SEED ^ stream.wrapping_mul(0x1000_0001));
+    std::iter::repeat_with(|| rng.next_u64() as u32).take(n).collect()
+}
+
+/// `n` reproducible 64-bit words for `stream`: successive SplitMix64 draws, unmodified.
+pub fn fill_u64(stream: u64, n: usize) -> Vec<u64> {
+    let mut rng = SplitMix64::new(FILL_SEED ^ stream.wrapping_mul(0x1000_0001));
+    std::iter::repeat_with(|| rng.next_u64()).take(n).collect()
+}
+
+/// `n` reproducible 16-bit words for `stream`: the low 16 bits of each SplitMix64 draw.
+pub fn fill_u16(stream: u64, n: usize) -> Vec<u16> {
+    let mut rng = SplitMix64::new(FILL_SEED ^ stream.wrapping_mul(0x1000_0001));
+    std::iter::repeat_with(|| rng.next_u64() as u16).take(n).collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn splitmix_is_deterministic() {
+        let (mut a, mut b) = (SplitMix64::new(42), SplitMix64::new(42));
+        assert!((0..8).all(|_| a.next_u64() == b.next_u64()));
+    }
+
+    #[test]
+    fn fills_are_reproducible_and_sized() {
+        assert_eq!(fill_u32(0, 16), fill_u32(0, 16));
+        assert_eq!(fill_u64(3, 16), fill_u64(3, 16));
+        assert_eq!(fill_u16(7, 16).len(), 16);
+    }
+
+    #[test]
+    fn streams_are_decorrelated() {
+        assert_ne!(fill_u64(0, 8), fill_u64(1, 8));
+    }
+
+    #[test]
+    fn edge_tables_nonempty_and_distinct() {
+        // True when no value occurs twice in `table`.
+        fn distinct<T: Ord + Copy>(table: &[T]) -> bool {
+            let mut sorted = table.to_vec();
+            sorted.sort_unstable();
+            sorted.windows(2).all(|w| w[0] != w[1])
+        }
+        assert!(SINGLE.len() >= 24);
+        assert!(DOUBLE.len() >= 24);
+        assert!(HALF.len() >= 20);
+        assert!(distinct(SINGLE), "SINGLE has duplicate entries");
+        assert!(distinct(DOUBLE), "DOUBLE has duplicate entries");
+        assert!(distinct(HALF), "HALF has duplicate entries");
+    }
+}
diff --git a/crates/fp-hw-survey/src/host.rs b/crates/fp-hw-survey/src/host.rs
new file mode 100644
index 00000000..3b996ccc
--- /dev/null
+++ b/crates/fp-hw-survey/src/host.rs
@@ -0,0 +1,375 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: MIT
+
+//! Best-effort host identification recorded in each capture's header line.
+
+/// Target architecture string: `"aarch64"`, `"x86_64"`, or the raw target arch
+/// (for example `"riscv64"`) on any other target.
+///
+/// Delegates to `std::env::consts::ARCH`, which spells the two surveyed
+/// architectures exactly as above. The former fallback returned the literal
+/// `"unknown"`, contradicting this contract and hiding which host it was.
+/// The value is fixed at compile time for the build target.
+pub fn arch_name() -> &'static str {
+    std::env::consts::ARCH
+}
+
+/// Operating-system string: the standard library's `std::env::consts::OS` constant.
+pub fn os_name() -> &'static str {
+    std::env::consts::OS // a constant of the build target, not a runtime probe
+}
+
+/// Runtime detection of AArch64 half-precision (`FEAT_FP16`) support.
+///
+/// Windows asks the OS via `IsProcessorFeaturePresent`; other systems use the std probe.
+#[cfg(target_arch = "aarch64")]
+pub fn aarch64_fp16() -> bool {
+    #[cfg(target_os = "windows")]
+    {
+        // winnt.h: PF_ARM_V82_FP16_INSTRUCTIONS_AVAILABLE = 67. The previous value, 44,
+        // is PF_ARM_V83_JSCVT_INSTRUCTIONS_AVAILABLE and so reported JSCVT as `fp16`.
+        const PF_ARM_V82_FP16: u32 = 67;
+        extern "system" {
+            fn IsProcessorFeaturePresent(feature: u32) -> i32;
+        }
+        // SAFETY: by-value DWORD in, BOOL out; the call touches no caller memory.
+        unsafe { IsProcessorFeaturePresent(PF_ARM_V82_FP16) != 0 }
+    }
+    #[cfg(not(target_os = "windows"))]
+    {
+        std::arch::is_aarch64_feature_detected!("fp16")
+    }
+}
+
+/// FP-relevant CPU features detected on the current host, as name strings.
+///
+/// They are recorded so that, e.g., a half-precision gap on one machine is
+/// explained by the absence of `fp16` rather than mistaken for a divergence.
+///
+/// On AArch64 the two IMPLEMENTATION_DEFINED knobs that can change a *result
+/// value* are reported as well: `rpres` (`FEAT_RPRES`, which selects a 12-bit
+/// reciprocal / reciprocal-sqrt estimate when `FPCR.AH==1` where other parts
+/// return 8 bits) and `afp` (`FEAT_AFP`, which gates the `FPCR.AH`/`FIZ`/`NEP`
+/// alternate behaviors). With them an `frecpe.s` / `frsqrte.s` divergence can be
+/// attributed to a different RPRES/`AH` context instead of a conformance bug.
+/// Windows-on-ARM exposes no `PF_*` flag for either, so both are best-effort
+/// and only probed where the std runtime detector works.
+pub fn features() -> Vec<String> {
+    let mut f = Vec::new();
+    #[cfg(target_arch = "aarch64")]
+    {
+        if aarch64_fp16() {
+            f.push("fp16".to_string());
+        }
+        #[cfg(not(target_os = "windows"))]
+        {
+            let extras = [
+                ("afp", std::arch::is_aarch64_feature_detected!("afp")),
+                ("rpres", std::arch::is_aarch64_feature_detected!("rpres")),
+            ];
+            f.extend(extras.iter().filter(|(_, on)| *on).map(|(name, _)| name.to_string()));
+        }
+    }
+    #[cfg(target_arch = "x86_64")]
+    {
+        // The detection macro needs a literal, so each probe is written out in report order.
+        let probes = [
+            ("sse2", std::arch::is_x86_feature_detected!("sse2")),
+            ("sse4.1", std::arch::is_x86_feature_detected!("sse4.1")),
+            ("avx", std::arch::is_x86_feature_detected!("avx")),
+            ("fma", std::arch::is_x86_feature_detected!("fma")),
+            ("avx512f", std::arch::is_x86_feature_detected!("avx512f")),
+        ];
+        for (name, present) in probes {
+            if present {
+                f.push(name.to_string());
+            }
+        }
+    }
+    f
+}
+
+/// Best-effort human-readable CPU brand string.
+///
+/// Sources are tried in this order, each only where it is compiled in: CPUID
+/// (x86_64), sysctl (macOS / iOS), the registry (Windows), `/proc/cpuinfo`.
+/// When none yields a name the result is `"unknown"`, so callers should always
+/// pass an explicit `--label` to keep a capture identifiable regardless.
+pub fn cpu_brand() -> String {
+    #[cfg(target_arch = "x86_64")]
+    {
+        if let Some(brand) = x86_brand() {
+            return brand;
+        }
+    }
+    // macOS / iOS: sysctl is the authoritative source (e.g. "Apple M2"), and
+    // the only one on Apple Silicon, which has no CPUID.
+    #[cfg(any(target_os = "macos", target_os = "ios"))]
+    {
+        if let Some(brand) = macos_brand() {
+            return brand;
+        }
+    }
+    // Windows (ARM and x64): the registry holds the brand; on Windows-on-ARM it
+    // is the only native source (e.g. "Snapdragon(R) X Elite ...").
+    #[cfg(target_os = "windows")]
+    {
+        if let Some(brand) = windows_brand() {
+            return brand;
+        }
+    }
+    // Linux: scan /proc/cpuinfo top to bottom; the first line that starts with
+    // one of these tags and carries a non-empty value after its ':' wins.
+    // Within a line the tags are tried in the order listed.
+    const TAGS: [&str; 4] = ["model name", "Model", "CPU part", "Hardware"];
+    let value_of = |line: &str| -> Option<String> {
+        TAGS.iter().find_map(|tag| {
+            let (_, value) = line.strip_prefix(*tag)?.split_once(':')?;
+            let value = value.trim();
+            (!value.is_empty()).then(|| value.to_string())
+        })
+    };
+    std::fs::read_to_string("/proc/cpuinfo")
+        .ok()
+        .and_then(|txt| txt.lines().find_map(value_of))
+        .unwrap_or_else(|| "unknown".to_string())
+}
+
+#[cfg(target_arch = "x86_64")]
+fn x86_brand() -> Option<String> {
+    use std::arch::x86_64::__cpuid;
+    // The 48-byte brand string spans extended leaves 0x80000002..=0x80000004;
+    // leaf 0x80000000 reports in EAX the highest extended leaf implemented.
+    if unsafe { __cpuid(0x8000_0000) }.eax < 0x8000_0004 {
+        return None;
+    }
+    // Each leaf contributes EAX, EBX, ECX, EDX in that order, little-endian.
+    let raw: Vec<u8> = (0x8000_0002u32..=0x8000_0004)
+        .flat_map(|leaf| {
+            let regs = unsafe { __cpuid(leaf) };
+            [regs.eax, regs.ebx, regs.ecx, regs.edx]
+        })
+        .flat_map(u32::to_le_bytes)
+        .collect();
+    let text = String::from_utf8_lossy(&raw);
+    // Drop the NUL padding at both ends first, then any surrounding blanks.
+    match text.trim_matches('\0').trim() {
+        "" => None,
+        brand => Some(brand.to_string()),
+    }
+}
+
+/// Apple-platform CPU brand via `sysctlbyname("machdep.cpu.brand_string")`.
+///
+/// Intended for Apple Silicon and Intel Macs alike, and for iOS, where there is
+/// no CPUID and `/proc/cpuinfo` does not exist. The two-call pattern first
+/// queries the buffer length, then fills it; any failure yields `None`.
+#[cfg(any(target_os = "macos", target_os = "ios"))]
+fn macos_brand() -> Option<String> {
+    use std::os::raw::{c_char, c_int, c_void};
+    extern "C" {
+        fn sysctlbyname(
+            name: *const c_char,
+            oldp: *mut c_void,
+            oldlenp: *mut usize,
+            newp: *mut c_void,
+            newlen: usize,
+        ) -> c_int;
+    }
+    let key = b"machdep.cpu.brand_string\0"; // trailing NUL makes it a C string
+    let mut len: usize = 0;
+    // First call: null output buffer, so only the required size comes back in `len`.
+    let rc = unsafe {
+        sysctlbyname(
+            key.as_ptr().cast(),
+            std::ptr::null_mut(),
+            &mut len,
+            std::ptr::null_mut(),
+            0,
+        )
+    };
+    if rc != 0 || len == 0 {
+        return None;
+    }
+    let mut buf = vec![0u8; len];
+    let rc = unsafe {
+        sysctlbyname(
+            key.as_ptr().cast(),
+            buf.as_mut_ptr().cast(), // destination; `len` below carries its capacity
+            &mut len,
+            std::ptr::null_mut(),
+            0,
+        )
+    };
+    if rc != 0 {
+        return None;
+    }
+    // sysctl returns a NUL-terminated C string; keep only what precedes the first NUL.
+    if let Some(pos) = buf.iter().position(|&b| b == 0) {
+        buf.truncate(pos);
+    }
+    let s = String::from_utf8_lossy(&buf).trim().to_string(); // lossy decode, then trim blanks
+    if s.is_empty() {
+        None
+    } else {
+        Some(s)
+    }
+}
+
+/// Windows CPU brand via the registry value
+/// `HKLM\HARDWARE\DESCRIPTION\System\CentralProcessor\0\ProcessorNameString`.
+///
+/// This is the only native brand source on Windows-on-ARM (no CPUID); it also
+/// works on Windows-x64. Uses `RegGetValueW` directly to stay dependency-free.
+/// Returns `None` if either registry call fails or the value is empty.
+#[cfg(target_os = "windows")]
+fn windows_brand() -> Option<String> {
+    use std::os::raw::{c_ulong, c_void};
+
+    type Hkey = *mut c_void;
+    // winreg.h defines HKEY_LOCAL_MACHINE as `(HKEY)(ULONG_PTR)((LONG)0x80000002)`:
+    // the constant is sign-extended to pointer width, hence the trip through `i32`.
+    const HKEY_LOCAL_MACHINE: Hkey = 0x8000_0002u32 as i32 as usize as Hkey;
+    // RRF_RT_REG_SZ: restrict the result type to REG_SZ.
+    const RRF_RT_REG_SZ: c_ulong = 0x0000_0002;
+    const ERROR_SUCCESS: c_ulong = 0;
+
+    #[link(name = "advapi32")]
+    extern "system" {
+        fn RegGetValueW(
+            hkey: Hkey,
+            lpsubkey: *const u16,
+            lpvalue: *const u16,
+            dwflags: c_ulong,
+            pdwtype: *mut c_ulong,
+            pvdata: *mut c_void,
+            pcbdata: *mut c_ulong,
+        ) -> c_ulong;
+    }
+
+    fn wide(s: &str) -> Vec<u16> {
+        s.encode_utf16().chain(std::iter::once(0)).collect()
+    }
+
+    let subkey = wide(r"HARDWARE\DESCRIPTION\System\CentralProcessor\0");
+    let value = wide("ProcessorNameString");
+
+    // First call: query the size in bytes.
+    let mut cb: c_ulong = 0;
+    // SAFETY: NUL-terminated names outlive the call; null data pointer = size query.
+    let rc = unsafe {
+        RegGetValueW(
+            HKEY_LOCAL_MACHINE,
+            subkey.as_ptr(),
+            value.as_ptr(),
+            RRF_RT_REG_SZ,
+            std::ptr::null_mut(),
+            std::ptr::null_mut(),
+            &mut cb,
+        )
+    };
+    if rc != ERROR_SUCCESS || cb == 0 {
+        return None;
+    }
+    // cb is a byte count for a UTF-16 string; round up to u16 units.
+    let mut buf: Vec<u16> = vec![0u16; (cb as usize).div_ceil(2)];
+    let mut cb2: c_ulong = (buf.len() * 2) as c_ulong;
+    // SAFETY: `buf` is writable for `cb2` bytes, the capacity passed in.
+    let rc = unsafe {
+        RegGetValueW(
+            HKEY_LOCAL_MACHINE,
+            subkey.as_ptr(),
+            value.as_ptr(),
+            RRF_RT_REG_SZ,
+            std::ptr::null_mut(),
+            buf.as_mut_ptr().cast(),
+            &mut cb2,
+        )
+    };
+    if rc != ERROR_SUCCESS {
+        return None;
+    }
+    // Trim to the returned length and drop any trailing NUL(s).
+    let units = (cb2 as usize) / 2;
+    buf.truncate(units);
+    while let Some(&0) = buf.last() {
+        buf.pop();
+    }
+    let s = String::from_utf16_lossy(&buf).trim().to_string();
+    (!s.is_empty()).then_some(s)
+}
+
+/// Current time as whole seconds since the Unix epoch (UTC). A system clock set
+/// before 1970 (it never legitimately is) yields `0` instead of an error.
+pub fn now_unix() -> u64 {
+    use std::time::{SystemTime, UNIX_EPOCH};
+    match SystemTime::now().duration_since(UNIX_EPOCH) {
+        Ok(elapsed) => elapsed.as_secs(),
+        Err(_) => 0,
+    }
+}
+
+/// Render a Unix timestamp (UTC) in ISO-8601 form, e.g.
+/// `"2026-06-09T09:04:31Z"`. Hand-rolled so the crate needs no date dependency;
+/// the calendar date comes from the proleptic-Gregorian civil-date algorithm.
+pub fn unix_to_iso_utc(secs: u64) -> String {
+    const DAY: u64 = 86_400;
+    let (whole_days, rem) = ((secs / DAY) as i64, secs % DAY);
+    let (hh, mm, ss) = (rem / 3600, rem / 60 % 60, rem % 60);
+    let (y, mo, d) = civil_from_days(whole_days);
+    format!("{y:04}-{mo:02}-{d:02}T{hh:02}:{mm:02}:{ss:02}Z")
+}
+
+/// Map a day count relative to 1970-01-01 onto a proleptic-Gregorian
+/// `(year, month, day)` triple, following Howard Hinnant's `civil_from_days`.
+/// Internally the count is rebased so that years begin in March, which puts
+/// a leap day last; one 400-year era is 146 097 days long.
+fn civil_from_days(z: i64) -> (i64, u32, u32) {
+    let shifted = z + 719_468;
+    let era = shifted.div_euclid(146_097); // floor division, also for negatives
+    let doe = shifted.rem_euclid(146_097); // day of era, [0, 146096]
+    let yoe = (doe - doe / 1460 + doe / 36_524 - doe / 146_096) / 365; // [0, 399]
+    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // [0, 365]
+    let mp = (5 * doy + 2) / 153; // March-based month index, [0, 11]
+    let day = (doy - (153 * mp + 2) / 5 + 1) as u32; // [1, 31]
+    let month = (if mp < 10 { mp + 3 } else { mp - 9 }) as u32; // [1, 12]
+    (yoe + era * 400 + i64::from(month <= 2), month, day)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn iso_format_known_epochs() {
+        // The epoch itself, Y2K, an arbitrary 2026 instant, and a leap day.
+        let cases = [
+            (0, "1970-01-01T00:00:00Z"),
+            (946_684_800, "2000-01-01T00:00:00Z"),
+            (1_780_995_871, "2026-06-09T09:04:31Z"),
+            (1_709_208_000, "2024-02-29T12:00:00Z"),
+        ];
+        for (secs, want) in cases {
+            assert_eq!(unix_to_iso_utc(secs), want);
+        }
+    }
+
+    #[test]
+    fn now_is_after_2024() {
+        assert!(now_unix() > 1_704_067_200); // 2024-01-01T00:00:00Z
+    }
+
+    // Windows and Apple platforms resolve the brand natively (registry / sysctl),
+    // but a sandboxed CI may block that, so only a present result is checked.
+    #[cfg(any(target_os = "windows", target_os = "macos", target_os = "ios"))]
+    #[test]
+    fn native_brand_lookup_is_nonempty_when_present() {
+        #[cfg(any(target_os = "macos", target_os = "ios"))]
+        let found = macos_brand();
+        #[cfg(target_os = "windows")]
+        let found = windows_brand();
+        if let Some(brand) = found {
+            assert!(!brand.is_empty());
+        }
+    }
+}
diff --git a/crates/fp-hw-survey/src/jsonio.rs b/crates/fp-hw-survey/src/jsonio.rs
new file mode 100644
index 00000000..4e5b3bc3
--- /dev/null
+++ b/crates/fp-hw-survey/src/jsonio.rs
@@ -0,0 +1,361 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: MIT
+
+//! Minimal, dependency-free NDJSON I/O for flat records.
+//!
+//! Records are one JSON object per line. We deliberately avoid a serialization
+//! crate so the tool builds with `std` only on any platform. The parser handles
+//! exactly the shapes this tool emits — flat objects whose values are strings,
+//! unsigned integers, booleans, or arrays of strings — and nothing more.
+
+use std::collections::BTreeMap;
+use std::fmt::Write as _;
+
+/// A builder for one compact JSON object line.
+pub struct ObjectWriter {
+    buf: String, // text emitted so far, beginning with the opening `{`
+    first: bool, // true until the first field has been written
+}
+
+impl ObjectWriter {
+    /// Start an empty object; add fields, then call [`ObjectWriter::finish`].
+    pub fn new() -> Self {
+        ObjectWriter {
+            buf: String::from("{"),
+            first: true,
+        }
+    }
+
+    /// Write the `,` separator (omitted before the first field), then the quoted
+    /// key and `:`. Keys are escaped like values so no key can break the line.
+    fn key(&mut self, key: &str) {
+        if !std::mem::replace(&mut self.first, false) {
+            self.buf.push(',');
+        }
+        let _ = write!(self.buf, "\"{}\":", escape(key));
+    }
+
+    /// Append `"key":"val"`, with `val` JSON-escaped.
+    pub fn str_field(&mut self, key: &str, val: &str) -> &mut Self {
+        self.key(key);
+        let _ = write!(self.buf, "\"{}\"", escape(val));
+        self
+    }
+
+    /// Append `"key":val` as a bare decimal integer.
+    pub fn u64_field(&mut self, key: &str, val: u64) -> &mut Self {
+        self.key(key);
+        let _ = write!(self.buf, "{val}");
+        self
+    }
+
+    /// Append `"key":true` or `"key":false`.
+    pub fn bool_field(&mut self, key: &str, val: bool) -> &mut Self {
+        self.key(key);
+        let _ = write!(self.buf, "{val}");
+        self
+    }
+
+    /// Append `"key":["a","b",...]`, each element JSON-escaped.
+    pub fn str_array_field(&mut self, key: &str, vals: &[String]) -> &mut Self {
+        self.key(key);
+        let items: Vec<String> = vals.iter().map(|v| format!("\"{}\"", escape(v))).collect();
+        let _ = write!(self.buf, "[{}]", items.join(","));
+        self
+    }
+
+    /// Finish the object, returning the JSON line (without a trailing newline).
+    pub fn finish(mut self) -> String {
+        self.buf.push('}');
+        self.buf
+    }
+}
+
+impl Default for ObjectWriter {
+    fn default() -> Self {
+        Self::new() // same starting state as `new()`: an open object with no fields yet
+    }
+}
+
+/// JSON-escape `s` for use between double quotes; other characters pass through.
+fn escape(s: &str) -> String {
+    s.chars().fold(String::with_capacity(s.len()), |mut out, ch| {
+        match ch {
+            '"' => out.push_str("\\\""),
+            '\\' => out.push_str("\\\\"),
+            '\n' => out.push_str("\\n"),
+            '\r' => out.push_str("\\r"),
+            '\t' => out.push_str("\\t"),
+            '\0'..='\u{1f}' => {
+                let _ = write!(out, "\\u{:04x}", u32::from(ch));
+            }
+            other => out.push(other),
+        }
+        out
+    })
+}
+
+/// A parsed JSON value, limited to the shapes this tool emits.
+///
+/// All four shapes are kept, together with their accessors, even though the
+/// current readers do not consume every one: the parser can then accept any
+/// line this tool writes, header arrays included.
+#[derive(Debug, Clone)]
+#[allow(dead_code)]
+pub enum Value {
+    Str(String),
+    /// A number, stored as its source text so that full 64-bit bit patterns
+    /// survive without precision loss; decode it with [`Value::as_u64`].
+    Num(String),
+    Bool(bool),
+    Arr(Vec<String>),
+}
+
+impl Value {
+    /// The string contents when this is a [`Value::Str`];
+    /// `None` for every other shape.
+    pub fn as_str(&self) -> Option<&str> {
+        let Value::Str(text) = self else { return None };
+        Some(text)
+    }
+    /// The number decoded as `u64` when this is a [`Value::Num`] whose text
+    /// parses as one; `None` otherwise (including other shapes).
+    pub fn as_u64(&self) -> Option<u64> {
+        let Value::Num(digits) = self else { return None };
+        digits.parse().ok()
+    }
+    /// The flag when this is a [`Value::Bool`];
+    /// `None` for every other shape.
+    pub fn as_bool(&self) -> Option<bool> {
+        let Value::Bool(flag) = self else { return None };
+        Some(*flag)
+    }
+    /// The elements when this is a [`Value::Arr`];
+    /// `None` for every other shape.
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub fn as_arr(&self) -> Option<&[String]> {
+        let Value::Arr(items) = self else { return None };
+        Some(items)
+    }
+}
+
+/// Parse one flat JSON object line into a key→value map. Returns `None` if the
+/// line is not a well-formed flat object in the restricted grammar.
+///
+/// A key that appears more than once keeps the value of its last occurrence.
+pub fn parse_object(line: &str) -> Option<BTreeMap<String, Value>> {
+    let mut p = Parser {
+        b: line.trim().as_bytes(),
+        i: 0,
+    };
+    let mut fields = BTreeMap::new();
+    p.skip_ws();
+    if p.peek()? != b'{' {
+        return None;
+    }
+    p.i += 1;
+    p.skip_ws();
+    // An immediate `}` means an object with no fields.
+    let mut more = p.peek()? != b'}';
+    while more {
+        p.skip_ws();
+        let key = p.parse_string()?;
+        p.skip_ws();
+        if p.peek()? != b':' {
+            return None;
+        }
+        p.i += 1;
+        p.skip_ws();
+        let val = p.parse_value()?;
+        fields.insert(key, val);
+        p.skip_ws();
+        // After a value only `,` (another field follows) or `}` (done) is legal.
+        more = match p.peek()? {
+            b',' => true,
+            b'}' => false,
+            _ => return None,
+        };
+        p.i += 1; // step over the `,` or `}` just examined
+    }
+    Some(fields)
+}
+
+/// Byte cursor over one input line.
+struct Parser<'a> {
+    b: &'a [u8], // the bytes being parsed
+    i: usize,    // index of the next unread byte in `b`
+}
+
+impl<'a> Parser<'a> {
+    /// The next unread byte, or `None` at end of input. Does not advance.
+    fn peek(&self) -> Option<u8> {
+        self.b.get(self.i).copied()
+    }
+
+    /// Advance past any run of space, tab, LF or CR.
+    fn skip_ws(&mut self) {
+        while matches!(self.peek(), Some(b' ' | b'\t' | b'\n' | b'\r')) {
+            self.i += 1;
+        }
+    }
+
+    /// Parse a double-quoted JSON string at the cursor and return its decoded
+    /// contents, leaving the cursor just past the closing quote.
+    ///
+    /// Returns `None` for a missing opening quote, an unterminated string, an
+    /// unknown escape, a malformed `\uXXXX`, or contents that are not UTF-8.
+    ///
+    /// Unescaped bytes are copied through unchanged and validated as UTF-8 once
+    /// at the closing quote. Widening every byte to a `char`, as was done
+    /// before, decoded multi-byte sequences as Latin-1, so a non-ASCII label or
+    /// CPU brand came back garbled although the writer emits it verbatim.
+    fn parse_string(&mut self) -> Option<String> {
+        if self.peek()? != b'"' {
+            return None;
+        }
+        self.i += 1;
+        let mut raw: Vec<u8> = Vec::new();
+        let mut utf8 = [0u8; 4];
+        loop {
+            let c = self.peek()?;
+            self.i += 1;
+            match c {
+                b'"' => return String::from_utf8(raw).ok(),
+                b'\\' => {
+                    let e = self.peek()?;
+                    self.i += 1;
+                    let decoded = match e {
+                        b'"' => '"',
+                        b'\\' => '\\',
+                        b'/' => '/',
+                        b'b' => '\u{8}',
+                        b'f' => '\u{c}',
+                        b'n' => '\n',
+                        b'r' => '\r',
+                        b't' => '\t',
+                        b'u' => {
+                            let mut code = 0u32;
+                            for _ in 0..4 {
+                                let h = self.peek()?;
+                                self.i += 1;
+                                code = code * 16 + (h as char).to_digit(16)?;
+                            }
+                            // A lone surrogate is not a `char`; substitute U+FFFD.
+                            char::from_u32(code).unwrap_or('\u{fffd}')
+                        }
+                        _ => return None,
+                    };
+                    raw.extend_from_slice(decoded.encode_utf8(&mut utf8).as_bytes());
+                }
+                other => raw.push(other),
+            }
+        }
+    }
+
+    /// Parse one value at the cursor: a string, `true` / `false`, an array of
+    /// strings, or (for anything else) a number kept as text.
+    fn parse_value(&mut self) -> Option<Value> {
+        match self.peek()? {
+            b'"' => self.parse_string().map(Value::Str),
+            b't' => self.literal(b"true").then_some(Value::Bool(true)),
+            b'f' => self.literal(b"false").then_some(Value::Bool(false)),
+            b'[' => self.parse_string_array().map(Value::Arr),
+            _ => self.parse_number().map(Value::Num),
+        }
+    }
+
+    /// Consume `word` if the input continues with it; report whether it did.
+    fn literal(&mut self, word: &[u8]) -> bool {
+        let hit = self.b[self.i..].starts_with(word);
+        if hit {
+            self.i += word.len();
+        }
+        hit
+    }
+
+    /// Parse `[`, comma-separated strings, `]`; the cursor must be on the `[`.
+    fn parse_string_array(&mut self) -> Option<Vec<String>> {
+        self.i += 1;
+        let mut arr = Vec::new();
+        self.skip_ws();
+        if self.peek()? == b']' {
+            self.i += 1;
+            return Some(arr);
+        }
+        loop {
+            self.skip_ws();
+            arr.push(self.parse_string()?);
+            self.skip_ws();
+            match self.peek()? {
+                b',' => self.i += 1,
+                b']' => {
+                    self.i += 1;
+                    return Some(arr);
+                }
+                _ => return None,
+            }
+        }
+    }
+
+    /// Consume a run of number characters (digits, sign, `.`, `e` / `E`) and
+    /// keep it as text; `None` if the cursor is not on such a character. The
+    /// run is not grammar-checked: [`Value::as_u64`] decides if it is usable.
+    fn parse_number(&mut self) -> Option<String> {
+        let start = self.i;
+        while matches!(self.peek(), Some(b'0'..=b'9' | b'-' | b'+' | b'.' | b'e' | b'E')) {
+            self.i += 1;
+        }
+        if self.i == start {
+            return None;
+        }
+        std::str::from_utf8(&self.b[start..self.i]).ok().map(str::to_string)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn writer_then_parse_roundtrip() {
+        // One field of every shape the writer can emit, read back through the
+        // matching accessor. `u64::MAX` checks that all 64 bits survive.
+        let features = vec!["fp16".to_string(), "neon".to_string()];
+        let mut writer = ObjectWriter::new();
+        writer.str_field("op", "fadd.s");
+        writer.u64_field("a", u64::MAX);
+        writer.bool_field("flush", true);
+        writer.str_array_field("features", &features);
+        let line = writer.finish();
+        let parsed = parse_object(&line).expect("parse");
+        assert_eq!(parsed["op"].as_str(), Some("fadd.s"));
+        assert_eq!(parsed["a"].as_u64(), Some(u64::MAX));
+        assert_eq!(parsed["flush"].as_bool(), Some(true));
+        assert_eq!(parsed["features"].as_arr(), Some(features.as_slice()));
+    }
+
+    #[test]
+    fn parses_empty_object() {
+        let parsed = parse_object("{}").expect("parse");
+        assert_eq!(parsed.len(), 0);
+    }
+
+    #[test]
+    fn rejects_malformed() {
+        for bad in ["not json", "{\"k\":}", "{\"k\" 1}"] {
+            assert!(parse_object(bad).is_none(), "accepted {bad:?}");
+        }
+    }
+
+    #[test]
+    fn escapes_survive_roundtrip() {
+        // Quote, backslash and tab are each escaped by the writer and must be
+        // decoded back to the very same text by the parser.
+        let tricky = "weird \"quote\" \\ and\ttab";
+        let mut writer = ObjectWriter::new();
+        writer.str_field("cpu", tricky);
+        let line = writer.finish();
+        let parsed = parse_object(&line).expect("parse");
+        assert_eq!(parsed["cpu"].as_str(), Some(tricky));
+    }
+}
diff --git a/crates/fp-hw-survey/src/main.rs b/crates/fp-hw-survey/src/main.rs
new file mode 100644
index 00000000..cd77dd5c
--- /dev/null
+++ b/crates/fp-hw-survey/src/main.rs
@@ -0,0 +1,199 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: MIT
+
+//! `fp-hw-survey` — capture native floating-point hardware behavior across many
+//! machines and architectures, then merge captures to find where the hardware
+//! actually disagrees.
+//!
+//! Subcommands:
+//!   capture   Run the corpus on this machine, write an NDJSON capture file.
+//!   merge     Combine capture files and emit only the divergent rows.
+//!   selftest  Run known-answer checks for this machine's oracle.
+//!   info      Print host identity and supported-op count.
+
+mod arch;
+mod capture;
+mod corpus;
+mod host;
+mod jsonio;
+mod merge;
+mod mode;
+mod normflags;
+mod ops;
+mod selftest;
+
+use std::process::ExitCode;
+
+fn main() -> ExitCode {
+    let args: Vec<String> = std::env::args().skip(1).collect();
+    let Some((subcommand, rest)) = args.split_first() else {
+        usage();
+        return ExitCode::from(2); // no subcommand given: usage error
+    };
+    match subcommand.as_str() {
+        "capture" => cmd_capture(rest),
+        "merge" => cmd_merge(rest),
+        "selftest" => cmd_selftest(),
+        "info" => cmd_info(),
+        "-h" | "--help" | "help" => {
+            usage();
+            ExitCode::SUCCESS // asking for help is not an error
+        }
+        other => {
+            eprintln!("unknown subcommand: {other}");
+            usage();
+            ExitCode::from(2)
+        }
+    }
+}
+
+fn usage() { // prints the version banner and the command summary to stderr
+    eprintln!( // `\` at a line end skips the next line's indentation; `\x20` restores one space
+        "fp-hw-survey {}\n\
+         \n\
+         USAGE:\n\
+         \x20 fp-hw-survey capture --label <name> [--out <file>] [--pairs <N>] \\\n\
+         \x20                      [--budget-mb <N>] [--ops <a,b,...>]\n\
+         \x20 fp-hw-survey merge --out <file> <capture.ndjson>...\n\
+         \x20 fp-hw-survey selftest\n\
+         \x20 fp-hw-survey info\n\
+         \n\
+         capture options:\n\
+         \x20 --label      Required. Human name for this machine (e.g. \"m2-macbook\").\n\
+         \x20 --out        Output file (default: capture-<label>.ndjson).\n\
+         \x20 --pairs      Random operand draws per op (default 2000).\n\
+         \x20 --budget-mb  Hard output-size cap in MB (default 150).\n\
+         \x20 --ops        Comma-separated op labels to restrict capture to.",
+        env!("CARGO_PKG_VERSION") // fills the `{}` in the banner line at compile time
+    );
+}
+
+/// Return the argument that immediately follows the first occurrence of `flag`
+/// in `args`; `None` if `flag` is absent or is the final argument.
+fn flag_value<'a>(args: &'a [String], flag: &str) -> Option<&'a str> {
+    let at = args.iter().position(|arg| arg == flag)?;
+    let value = args.get(at + 1)?;
+    Some(value.as_str())
+}
+
+/// Run the `capture` subcommand: read its flags, gate on the self-test, write
+/// the capture file. Exit code 2 is a usage error; `FAILURE` a failed run.
+fn cmd_capture(args: &[String]) -> ExitCode {
+    let Some(label) = flag_value(args, "--label") else {
+        eprintln!("error: --label is required");
+        return ExitCode::from(2);
+    };
+    let out = flag_value(args, "--out")
+        .map(|s| s.to_string())
+        .unwrap_or_else(|| format!("capture-{}.ndjson", sanitize(label)));
+    // An absent numeric flag takes its default; one that is present but does not
+    // parse is a usage error, never a silent fall-back to the default size.
+    let pairs = flag_value(args, "--pairs").map_or(Ok(2000), str::parse::<usize>);
+    let budget_mb = flag_value(args, "--budget-mb").map_or(Ok(150), str::parse::<u64>);
+    let (Ok(pairs), Ok(budget_mb)) = (pairs, budget_mb) else {
+        eprintln!("error: --pairs and --budget-mb each require a non-negative integer");
+        return ExitCode::from(2);
+    };
+    let only_ops: Vec<String> = flag_value(args, "--ops")
+        .map(|s| s.split(',').map(str::trim).filter(|x| !x.is_empty()))
+        .map(|names| names.map(String::from).collect())
+        .unwrap_or_default();
+
+    // Self-test gate: never emit data from an oracle that fails known answers.
+    match selftest::run() {
+        Ok(n) => eprintln!("selftest passed ({n} checks) on arch {}", arch::arch_tag()),
+        Err(fails) => {
+            eprintln!("SELF-TEST FAILED — refusing to capture:");
+            for f in &fails {
+                eprintln!("  {f}");
+            }
+            return ExitCode::FAILURE;
+        }
+    }
+
+    let cfg = capture::Config {
+        label: label.to_string(),
+        out: out.clone(),
+        pairs,
+        budget_bytes: budget_mb.saturating_mul(1024 * 1024),
+        only_ops,
+    };
+
+    match capture::run(&cfg) {
+        Ok((ops_n, rows)) => {
+            eprintln!("captured {ops_n} ops, {rows} rows -> {out}");
+            ExitCode::SUCCESS
+        }
+        Err(e) => {
+            eprintln!("capture error: {e}");
+            ExitCode::FAILURE
+        }
+    }
+}
+
+/// Run the `merge` subcommand: combine two or more capture files into `--out`.
+/// Exit code 2 signals a usage error, `FAILURE` an error reported by the merge.
+fn cmd_merge(args: &[String]) -> ExitCode {
+    let Some(out) = flag_value(args, "--out") else {
+        eprintln!("error: merge requires --out <file>");
+        return ExitCode::from(2);
+    };
+    // Inputs are the arguments that are neither a `--flag` nor textually equal to
+    // the output path; the latter also removes the value that follows `--out`.
+    // Argument order is preserved.
+    let is_input = |a: &&String| !(a.starts_with("--") || a.as_str() == out);
+    let inputs: Vec<String> = args.iter().filter(is_input).cloned().collect();
+    if inputs.len() < 2 {
+        eprintln!("error: merge needs at least two capture files");
+        return ExitCode::from(2);
+    }
+    if let Err(e) = merge::run(out, &inputs) {
+        eprintln!("merge error: {e}");
+        return ExitCode::FAILURE;
+    }
+    ExitCode::SUCCESS
+}
+
+/// Run the `selftest` subcommand, reporting the outcome on stdout.
+fn cmd_selftest() -> ExitCode {
+    let fails = match selftest::run() {
+        Ok(n) => {
+            println!("selftest passed: {n} checks on arch {}", arch::arch_tag());
+            return ExitCode::SUCCESS;
+        }
+        Err(fails) => fails,
+    };
+    println!("selftest FAILED:");
+    for f in &fails {
+        println!("  {f}");
+    }
+    ExitCode::FAILURE
+}
+
+/// Run the `info` subcommand: print host identity and op counts to stdout.
+/// Always succeeds.
+fn cmd_info() -> ExitCode {
+    // An op counts as supported when the arch backend accepts its label.
+    let supported = ops::catalogue().iter().filter(|op| arch::supports(op.label)).count();
+    println!("arch:      {}", host::arch_name());
+    println!("arch_tag:  {}", arch::arch_tag());
+    println!("os:        {}", host::os_name());
+    println!("cpu:       {}", host::cpu_brand());
+    println!("features:  {}", host::features().join(", "));
+    println!("catalogue: {} ops", ops::catalogue().len());
+    println!("supported: {supported} ops on this host");
+    ExitCode::SUCCESS
+}
+
+/// Make a label safe for use in a default filename.
+///
+/// ASCII letters, ASCII digits, `-` and `_` pass through unchanged; every
+/// other character (non-ASCII ones included) is replaced by a single `_`.
+fn sanitize(s: &str) -> String {
+    let mut safe = String::with_capacity(s.len());
+    for ch in s.chars() {
+        let keep = matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_');
+        safe.push(if keep { ch } else { '_' });
+    }
+    safe
+}
diff --git a/crates/fp-hw-survey/src/merge.rs b/crates/fp-hw-survey/src/merge.rs
new file mode 100644
index 00000000..61dcdfd5
--- /dev/null
+++ b/crates/fp-hw-survey/src/merge.rs
@@ -0,0 +1,181 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: MIT
+
+//! Phase 2: align capture files row-by-row and emit only the rows where the
+//! machines actually disagree. Because every machine generates an identical
+//! deterministic corpus, rows align on the logical key
+//! `(op, a, b, c, mode, flush)`. A key whose `(res, flags)` pair is not unanimous
+//! across the machines that produced it is a divergence — the small, valuable
+//! artifact this whole tool exists to find.
+
+use std::collections::BTreeMap;
+use std::fs::File;
+use std::io::{BufRead, BufReader, BufWriter, Write};
+
+use crate::jsonio::{self, ObjectWriter};
+use crate::normflags;
+
+#[derive(Clone)]
+struct Entry {
+    machine: String, // machine label taken from the capture file's header line
+    arch: String, // header `arch_tag`, falling back to `arch`
+    res: u64, // the row's `res` field
+    flags: u32, // the row's `flags` field, narrowed to 32 bits
+}
+
+struct RowKey {
+    op: String, // the row's `op` field
+    a: u64, // first operand field
+    b: u64, // second operand field
+    c: u64, // third operand field
+    mode: String, // the row's `mode` field
+    flush: bool, // rendered as `0`/`1` by `key_string`
+}
+
+/// Flatten a [`RowKey`] into the `op|a|b|c|mode|flush` string used as the map key.
+fn key_string(k: &RowKey) -> String {
+    let (op, mode) = (&k.op, &k.mode);
+    let (a, b, c, flush) = (k.a, k.b, k.c, u8::from(k.flush));
+    format!("{op}|{a}|{b}|{c}|{mode}|{flush}")
+}
+
+/// Load one capture file, pushing its rows into `map`. Returns the machine label.
+///
+/// Rows carry the header's `label` and `arch_tag`/`arch`; the file path stands in
+/// for a missing or empty label, so header-less rows are never tagged with "".
+/// Blank, unparseable, and incomplete lines are skipped without error.
+fn load(path: &str, map: &mut BTreeMap<String, Vec<Entry>>) -> std::io::Result<String> {
+    let r = BufReader::new(File::open(path)?);
+    let mut machine = path.to_string();
+    let mut arch = String::from("?");
+    for line in r.lines() {
+        let line = line?;
+        if line.trim().is_empty() {
+            continue;
+        }
+        let Some(obj) = jsonio::parse_object(&line) else {
+            continue;
+        };
+        if obj.get("kind").and_then(|v| v.as_str()) == Some("header") {
+            let label = obj.get("label").and_then(|v| v.as_str());
+            machine = label.filter(|l| !l.is_empty()).unwrap_or(path).to_string();
+            arch = obj
+                .get("arch_tag")
+                .or_else(|| obj.get("arch"))
+                .and_then(|v| v.as_str())
+                .unwrap_or("?")
+                .to_string();
+            continue;
+        }
+        let (Some(op), Some(a), Some(b), Some(c), Some(mode), Some(flush), Some(res), Some(flags)) = (
+            obj.get("op").and_then(|v| v.as_str()),
+            obj.get("a").and_then(|v| v.as_u64()),
+            obj.get("b").and_then(|v| v.as_u64()),
+            obj.get("c").and_then(|v| v.as_u64()),
+            obj.get("mode").and_then(|v| v.as_str()),
+            obj.get("flush").and_then(|v| v.as_bool()),
+            obj.get("res").and_then(|v| v.as_u64()),
+            obj.get("flags").and_then(|v| v.as_u64()),
+        ) else {
+            continue;
+        };
+        // A flag word wider than 32 bits is malformed; truncating it could make
+        // two different words compare equal and hide a divergence.
+        let Ok(flags) = u32::try_from(flags) else {
+            continue;
+        };
+        let k = RowKey {
+            op: op.to_string(),
+            a,
+            b,
+            c,
+            mode: mode.to_string(),
+            flush,
+        };
+        map.entry(key_string(&k)).or_default().push(Entry {
+            machine: machine.clone(),
+            arch: arch.clone(),
+            res,
+            flags,
+        });
+    }
+    Ok(machine)
+}
+
+/// Run the merge. Writes a divergence NDJSON file and prints a summary.
+///
+/// A key is reported when it has two or more entries whose `(res, flags)` differ.
+pub fn run(out_path: &str, inputs: &[String]) -> std::io::Result<()> {
+    let mut map: BTreeMap<String, Vec<Entry>> = BTreeMap::new();
+    let mut machines = Vec::with_capacity(inputs.len());
+    for path in inputs {
+        let label = load(path, &mut map)?;
+        eprintln!("loaded {path} (machine: {label})");
+        machines.push(label);
+    }
+
+    let mut w = BufWriter::new(File::create(out_path)?);
+
+    // The first output line describes the set of merged machines.
+    let mut hdr = ObjectWriter::new();
+    hdr.str_field("kind", "merge-header")
+        .u64_field("machines", machines.len() as u64)
+        .str_array_field("machine_labels", &machines)
+        .str_field("tool_version", env!("CARGO_PKG_VERSION"));
+    writeln!(w, "{}", hdr.finish())?;
+
+    let total_keys = map.len() as u64;
+    let mut divergences: u64 = 0;
+    let mut per_op: BTreeMap<String, u64> = BTreeMap::new();
+
+    for (key, entries) in &map {
+        // One entry has nothing to disagree with; unanimous keys are not reported.
+        let Some((head, rest)) = entries.split_first() else {
+            continue;
+        };
+        let same = |e: &Entry| (e.res, e.flags) == (head.res, head.flags);
+        if rest.is_empty() || rest.iter().all(same) {
+            continue;
+        }
+        divergences += 1;
+
+        // Split "op|a|b|c|mode|flush" back into its fields.
+        let parts: Vec<&str> = key.split('|').collect();
+        let number = |i: usize| parts[i].parse::<u64>().unwrap_or(0);
+        let op = parts[0];
+        *per_op.entry(op.to_string()).or_default() += 1;
+
+        let mut row = ObjectWriter::new();
+        row.str_field("op", op)
+            .u64_field("a", number(1))
+            .u64_field("b", number(2))
+            .u64_field("c", number(3))
+            .str_field("mode", parts[4])
+            .bool_field("flush", parts[5] == "1");
+        // Per-machine results as four parallel arrays, one slot per entry.
+        let names: Vec<String> = entries.iter().map(|e| e.machine.clone()).collect();
+        let arches: Vec<String> = entries.iter().map(|e| e.arch.clone()).collect();
+        let results: Vec<String> = entries.iter().map(|e| e.res.to_string()).collect();
+        let flag_text: Vec<String> = entries.iter().map(|e| normflags::render(e.flags)).collect();
+        row.str_array_field("machines", &names)
+            .str_array_field("arches", &arches)
+            .str_array_field("res", &results)
+            .str_array_field("flags", &flag_text);
+        writeln!(w, "{}", row.finish())?;
+    }
+
+    w.flush()?;
+
+    eprintln!("---- merge summary ----");
+    eprintln!("machines:     {}", machines.len());
+    eprintln!("aligned keys: {total_keys}");
+    eprintln!("divergences:  {divergences}");
+    if !per_op.is_empty() {
+        eprintln!("by op:");
+        for (op, n) in &per_op {
+            eprintln!("  {op:<14} {n}");
+        }
+    }
+    eprintln!("written to:   {out_path}");
+    Ok(())
+}
diff --git a/crates/fp-hw-survey/src/mode.rs b/crates/fp-hw-survey/src/mode.rs
new file mode 100644
index 00000000..985b6079
--- /dev/null
+++ b/crates/fp-hw-survey/src/mode.rs
@@ -0,0 +1,130 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: MIT
+
+//! Portable, architecture-neutral FP control state.
+//!
+//! Each architecture maps these logical modes onto its native control register
+//! (`FPCR` on AArch64, `MXCSR` on x86-64). The *logical* mode — not the raw
+//! control bits — is what a capture record stores, so that a row produced on an
+//! arm64 box and a row produced on an x64 box for "round-to-nearest, no flush"
+//! line up under the same key during merge.
+//!
+//! `Default NaN` (DN) is deliberately left disabled and is not part of the
+//! logical mode: forcing default-NaN propagation would *hide* exactly the
+//! cross-vendor NaN-propagation differences this survey exists to find.
+
+/// IEEE rounding direction.
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub enum Round {
+    /// Round to nearest, ties to even.
+    Ne,
+    /// Round toward +infinity.
+    Up,
+    /// Round toward -infinity.
+    Down,
+    /// Round toward zero (truncate).
+    Zero,
+}
+
+impl Round {
+    /// Stable two-character key used in capture records and merge keys.
+    ///
+    /// `RN` nearest-even, `RP` toward +infinity, `RM` toward -infinity, `RZ` toward zero.
+    pub fn key(self) -> &'static str {
+        match self {
+            Self::Ne => "RN",
+            Self::Up => "RP",
+            Self::Down => "RM",
+            Self::Zero => "RZ",
+        }
+    }
+
+    /// Parse a key produced by [`Round::key`].
+    ///
+    /// Matching is exact; any other string yields `None`.
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub fn parse(s: &str) -> Option<Round> {
+        const ALL: [Round; 4] = [Round::Ne, Round::Up, Round::Down, Round::Zero];
+        // Derive the inverse from `key` so the two mappings cannot drift apart.
+        ALL.iter().copied().find(|round| round.key() == s)
+    }
+}
+
+/// A complete logical control state: rounding direction plus flush-to-zero.
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub struct Mode {
+    /// Rounding direction.
+    pub round: Round,
+    /// Flush subnormals to zero. On AArch64 this drives `FZ` (and `FZ16` for
+    /// half-precision ops); on x86-64 it drives `FTZ` and `DAZ`.
+    pub flush: bool,
+}
+
+impl Mode {
+    /// Stable key, e.g. `"RN.f0"` or `"RZ.f1"`.
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub fn key(self) -> String {
+        format!("{}.f{}", self.round.key(), u8::from(self.flush))
+    }
+
+    /// Parse a key produced by [`Mode::key`].
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub fn parse(s: &str) -> Option<Mode> {
+        let (round_key, flush_key) = s.split_once(".f")?;
+        let flush = match flush_key {
+            "0" => false,
+            "1" => true,
+            _ => return None,
+        };
+        Round::parse(round_key).map(|round| Mode { round, flush })
+    }
+}
+
+/// The fixed sweep of logical modes every capture exercises: all four rounding
+/// directions crossed with flush off/on (8 modes).
+///
+/// Order is round-major (`Ne`, `Up`, `Down`, `Zero`), flush off before flush on.
+pub fn all_modes() -> Vec<Mode> {
+    const ROUNDS: [Round; 4] = [Round::Ne, Round::Up, Round::Down, Round::Zero];
+    ROUNDS
+        .iter()
+        .flat_map(|&round| [false, true].map(|flush| Mode { round, flush }))
+        .collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn round_key_roundtrips() {
+        for r in [Round::Ne, Round::Up, Round::Down, Round::Zero] {
+            assert_eq!(Round::parse(r.key()), Some(r)); // key() then parse() is the identity
+        }
+    }
+
+    #[test]
+    fn mode_key_roundtrips() {
+        for m in all_modes() {
+            assert_eq!(Mode::parse(&m.key()), Some(m)); // holds for every swept mode
+        }
+    }
+
+    #[test]
+    fn all_modes_is_eight_distinct() {
+        let v = all_modes();
+        assert_eq!(v.len(), 8);
+        let mut keys: Vec<String> = v.iter().map(|m| m.key()).collect();
+        keys.sort();
+        keys.dedup(); // removes adjacent duplicates only, hence the sort above
+        assert_eq!(keys.len(), 8);
+    }
+
+    #[test]
+    fn bad_keys_rejected() {
+        assert_eq!(Round::parse("XX"), None); // unknown rounding key
+        assert_eq!(Mode::parse("RN"), None); // no ".f" separator
+        assert_eq!(Mode::parse("RN.f2"), None); // flush digit must be 0 or 1
+        assert_eq!(Mode::parse("ZZ.f0"), None); // well-formed, but unknown rounding key
+    }
+}
diff --git a/crates/fp-hw-survey/src/normflags.rs b/crates/fp-hw-survey/src/normflags.rs
new file mode 100644
index 00000000..a314f1fc
--- /dev/null
+++ b/crates/fp-hw-survey/src/normflags.rs
@@ -0,0 +1,81 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: MIT
+
+//! Normalized FP exception-flag layout, shared across architectures.
+//!
+//! AArch64 `FPSR` and x86-64 `MXCSR` expose the same six IEEE sticky exception
+//! flags but in different bit positions. To make captures from different
+//! architectures directly comparable, every capture record stores flags in
+//! this single normalized layout (which happens to match the AArch64 `FPSR`
+//! cumulative-flag positions). Each architecture's oracle translates its native
+//! flag word into this layout before recording.
+
+/// Invalid Operation (bit 0).
+pub const IOC: u32 = 1 << 0;
+/// Divide by Zero (bit 1).
+pub const DZC: u32 = 1 << 1;
+/// Overflow (bit 2).
+pub const OFC: u32 = 1 << 2;
+/// Underflow (bit 3).
+pub const UFC: u32 = 1 << 3;
+/// Inexact (bit 4).
+pub const IXC: u32 = 1 << 4;
+/// Input Denormal (bit 7; bits 5 and 6 are not used by this layout).
+pub const IDC: u32 = 1 << 7;
+
+/// Mask of all six normalized exception bits (`0x9f`).
+pub const MASK: u32 = IOC | DZC | OFC | UFC | IXC | IDC;
+
+/// Render a normalized flag word as a short stable string, e.g. `"IOC|IXC"` or
+/// `"-"` when no flags are set. Used only for human-facing summaries; the
+/// machine record stores the raw `u32`.
+///
+/// Names appear in ascending bit order (`IOC`, `DZC`, `OFC`, `UFC`, `IXC`,
+/// `IDC`); bits outside [`MASK`] are ignored.
+pub fn render(flags: u32) -> String {
+    // Listed in ascending bit order, which fixes the order of the output.
+    const NAMES: [(u32, &str); 6] = [
+        (IOC, "IOC"),
+        (DZC, "DZC"),
+        (OFC, "OFC"),
+        (UFC, "UFC"),
+        (IXC, "IXC"),
+        (IDC, "IDC"),
+    ];
+
+    let set: Vec<&str> = NAMES
+        .iter()
+        .filter(|&&(bit, _)| flags & bit != 0)
+        .map(|&(_, name)| name)
+        .collect();
+    // An empty set is shown as a dash rather than as an empty string.
+    if set.is_empty() {
+        "-".to_string()
+    } else {
+        set.join("|")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn render_none() {
+        assert_eq!(render(0), "-"); // no flags set renders as a dash
+    }
+
+    #[test]
+    fn render_single_and_multi() {
+        assert_eq!(render(IOC), "IOC");
+        assert_eq!(render(IDC), "IDC"); // bit 7, not contiguous with the other five
+        assert_eq!(render(IOC | IXC), "IOC|IXC");
+        assert_eq!(render(DZC | OFC | UFC), "DZC|OFC|UFC");
+        assert_eq!(render(MASK), "IOC|DZC|OFC|UFC|IXC|IDC"); // names follow ascending bit order
+    }
+
+    #[test]
+    fn mask_covers_all_named_bits() {
+        assert_eq!(MASK, IOC | DZC | OFC | UFC | IXC | IDC);
+    }
+}
diff --git a/crates/fp-hw-survey/src/ops.rs b/crates/fp-hw-survey/src/ops.rs
new file mode 100644
index 00000000..6876c1db
--- /dev/null
+++ b/crates/fp-hw-survey/src/ops.rs
@@ -0,0 +1,185 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: MIT
+
+//! The operation catalogue: every FP operation the survey knows how to drive,
+//! described in an architecture-neutral way. Each architecture's oracle decides
+//! which of these it can actually execute (see `arch::supports`).
+
+/// Floating-point storage format.
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub enum Fmt {
+    Half, // suffix `h` in catalogue labels
+    Single, // suffix `s` in catalogue labels
+    Double, // suffix `d` in catalogue labels
+}
+
+/// Operand layout / computation shape for an operation. This tells `capture`
+/// where to draw operands from and how many there are.
+///
+/// The struct-variant fields are a *complete* descriptor of each conversion's
+/// shape (source/destination format, signedness, integer width). Operand
+/// generation reads `from`/`width`; the remaining fields document the op for
+/// readers and keep the catalogue self-describing even though runtime dispatch
+/// is keyed on the op label string rather than on `Kind`.
+#[derive(Copy, Clone, Debug)]
+#[allow(dead_code)] // some fields are descriptive only and are never read
+pub enum Kind {
+    /// `a OP b`, both of the given float format.
+    Bin(Fmt),
+    /// `OP a`, one operand of the given float format.
+    Un(Fmt),
+    /// `a*b + c`, all of the given float format (single-rounded fused MAC).
+    Fma(Fmt),
+    /// Float→float conversion of one operand from `from` to `to`.
+    CvtF2F { from: Fmt, to: Fmt },
+    /// Float→integer conversion (round toward zero) of one float operand.
+    CvtF2I { from: Fmt, signed: bool, width: u8 },
+    /// Integer→float conversion of one integer operand of the given bit width.
+    CvtI2F { to: Fmt, signed: bool, width: u8 },
+}
+
+/// One catalogued operation: a stable label plus its operand shape.
+#[derive(Copy, Clone, Debug)]
+pub struct OpSpec {
+    pub label: &'static str, // e.g. "fadd.s"; the string `arch::supports` is queried with
+    pub kind: Kind, // operand shape; see `Kind`
+}
+
+/// The full catalogue (the union across architectures). Labels match the scheme
+/// used by the `rook` golden file so a merged divergence set drops straight in.
+///
+/// The list is built once per process and each caller receives a clone of it.
+/// Labels assembled at runtime are leaked to obtain `&'static str`, so building
+/// the list on every call would leak a fresh set of label strings each time.
+pub fn catalogue() -> Vec<OpSpec> {
+    static CATALOGUE: std::sync::OnceLock<Vec<OpSpec>> = std::sync::OnceLock::new();
+    CATALOGUE.get_or_init(build_catalogue).clone()
+}
+
+/// Promote a runtime-built label to `&'static str` by leaking its allocation.
+fn leak_label(label: String) -> &'static str {
+    Box::leak(label.into_boxed_str())
+}
+
+/// Build the catalogue from scratch. Entry order is fixed by the code below:
+/// two-source, one-source, fused multiply-add, then the conversions.
+fn build_catalogue() -> Vec<OpSpec> {
+    use Fmt::*;
+    use Kind::*;
+    let mut v: Vec<OpSpec> = Vec::new();
+    macro_rules! op {
+        ($l:expr, $k:expr) => {
+            v.push(OpSpec {
+                label: $l,
+                kind: $k,
+            })
+        };
+    }
+
+    // ── Two-source arithmetic ────────────────────────────────────────────
+    for &(suf, fmt) in &[("s", Single), ("d", Double), ("h", Half)] {
+        for base in [
+            "fadd", "fsub", "fmul", "fdiv", "fmax", "fmin", "fmaxnm", "fminnm", "fmulx", "fabd",
+        ] {
+            op!(leak_label(format!("{base}.{suf}")), Bin(fmt));
+        }
+    }
+    // frecps / frsqrts are single/double only.
+    for &(suf, fmt) in &[("s", Single), ("d", Double)] {
+        for base in ["frecps", "frsqrts"] {
+            op!(leak_label(format!("{base}.{suf}")), Bin(fmt));
+        }
+    }
+
+    // ── One-source ───────────────────────────────────────────────────────
+    for &(suf, fmt) in &[("s", Single), ("d", Double), ("h", Half)] {
+        op!(leak_label(format!("fsqrt.{suf}")), Un(fmt));
+    }
+    for &(suf, fmt) in &[("s", Single), ("d", Double)] {
+        for base in [
+            "frecpe", "frsqrte", "frecpx", "frintn", "frinta", "frintp", "frintm", "frintz",
+        ] {
+            op!(leak_label(format!("{base}.{suf}")), Un(fmt));
+        }
+    }
+
+    // ── Fused multiply-add ───────────────────────────────────────────────
+    op!("fmadd.s", Fma(Single));
+    op!("fmadd.d", Fma(Double));
+
+    // ── Float→float convert ──────────────────────────────────────────────
+    op!(
+        "fcvt.s2d",
+        CvtF2F {
+            from: Single,
+            to: Double
+        }
+    );
+    op!(
+        "fcvt.d2s",
+        CvtF2F {
+            from: Double,
+            to: Single
+        }
+    );
+    op!(
+        "fcvt.s2h",
+        CvtF2F {
+            from: Single,
+            to: Half
+        }
+    );
+    op!(
+        "fcvt.d2h",
+        CvtF2F {
+            from: Double,
+            to: Half
+        }
+    );
+    op!(
+        "fcvt.h2s",
+        CvtF2F {
+            from: Half,
+            to: Single
+        }
+    );
+    op!(
+        "fcvt.h2d",
+        CvtF2F {
+            from: Half,
+            to: Double
+        }
+    );
+
+    // ── Float→int, round toward zero ─────────────────────────────────────
+    for (&from, fs) in [Single, Double].iter().zip(["s", "d"]) {
+        for (signed, ss) in [(true, "fcvtzs"), (false, "fcvtzu")] {
+            for &(width, ws) in &[(32u8, "w"), (64u8, "x")] {
+                let label = leak_label(format!("{ss}.{fs}.{ws}"));
+                v.push(OpSpec {
+                    label,
+                    kind: CvtF2I {
+                        from,
+                        signed,
+                        width,
+                    },
+                });
+            }
+        }
+    }
+
+    // ── Int→float ────────────────────────────────────────────────────────
+    for (&to, fs) in [Single, Double].iter().zip(["s", "d"]) {
+        for (signed, ss) in [(true, "scvtf"), (false, "ucvtf")] {
+            for &(width, ws) in &[(32u8, "w"), (64u8, "x")] {
+                let label = leak_label(format!("{ss}.{ws}.{fs}"));
+                v.push(OpSpec {
+                    label,
+                    kind: CvtI2F { to, signed, width },
+                });
+            }
+        }
+    }
+
+    v
+}
diff --git a/crates/fp-hw-survey/src/selftest.rs b/crates/fp-hw-survey/src/selftest.rs
new file mode 100644
index 00000000..7e069d1e
--- /dev/null
+++ b/crates/fp-hw-survey/src/selftest.rs
@@ -0,0 +1,163 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: MIT
+
+//! Known-answer self-test. Because the x86-64 asm cannot be exercised on the
+//! author's aarch64 development host, `capture` runs this gate first and aborts
+//! if any check fails — preventing a machine from emitting untrustworthy data
+//! from broken inline asm.
+
+use crate::arch;
+use crate::mode::{Mode, Round};
+
+struct Case {
+    label: &'static str, // op label passed to `arch::supports` / `arch::eval`
+    a: u64, // first operand
+    b: u64, // second operand (0 when unused)
+    c: u64, // third operand (0 when unused)
+    expect: u64, // result value the oracle must return
+    desc: &'static str, // human-readable statement included in the failure message
+}
+
+/// Round-to-nearest-even with flushing off: the mode every known answer is run in.
+fn rn() -> Mode {
+    let round = Round::Ne;
+    let flush = false;
+    Mode { round, flush }
+}
+
+fn cases() -> Vec<Case> {
+    // Bit patterns: f32 1.0=0x3f800000, 2.0=0x40000000, 3.0=0x40400000,
+    // 4.0=0x40800000, 6.0=0x40c00000, 2.5=0x40200000.
+    // f64 1.0=0x3ff0..., 2.0=0x4000..., 4.0=0x4010..., 6.0=0x4018...
+    vec![
+        Case {
+            label: "fadd.s",
+            a: 0x3f80_0000,
+            b: 0x3f80_0000,
+            c: 0,
+            expect: 0x4000_0000,
+            desc: "1.0 + 1.0 == 2.0 (f32)",
+        },
+        Case {
+            label: "fmul.s",
+            a: 0x4000_0000,
+            b: 0x4040_0000,
+            c: 0,
+            expect: 0x40c0_0000,
+            desc: "2.0 * 3.0 == 6.0 (f32)",
+        },
+        Case {
+            label: "fsub.s",
+            a: 0x4040_0000,
+            b: 0x3f80_0000,
+            c: 0,
+            expect: 0x4000_0000,
+            desc: "3.0 - 1.0 == 2.0 (f32)",
+        },
+        Case {
+            label: "fdiv.s",
+            a: 0x40c0_0000,
+            b: 0x4040_0000,
+            c: 0,
+            expect: 0x4000_0000,
+            desc: "6.0 / 3.0 == 2.0 (f32)",
+        },
+        Case {
+            label: "fsqrt.s",
+            a: 0x4080_0000,
+            b: 0,
+            c: 0,
+            expect: 0x4000_0000,
+            desc: "sqrt(4.0) == 2.0 (f32)",
+        },
+        Case {
+            label: "fadd.d",
+            a: 0x3ff0_0000_0000_0000,
+            b: 0x3ff0_0000_0000_0000,
+            c: 0,
+            expect: 0x4000_0000_0000_0000,
+            desc: "1.0 + 1.0 == 2.0 (f64)",
+        },
+        Case {
+            label: "fsqrt.d",
+            a: 0x4010_0000_0000_0000,
+            b: 0,
+            c: 0,
+            expect: 0x4000_0000_0000_0000,
+            desc: "sqrt(4.0) == 2.0 (f64)",
+        },
+        Case {
+            label: "fcvt.s2d",
+            a: 0x3f80_0000,
+            b: 0,
+            c: 0,
+            expect: 0x3ff0_0000_0000_0000,
+            desc: "(f64)1.0f == 1.0 (f32->f64)",
+        },
+        Case {
+            label: "fcvt.d2s",
+            a: 0x4000_0000_0000_0000,
+            b: 0,
+            c: 0,
+            expect: 0x4000_0000,
+            desc: "(f32)2.0 == 2.0 (f64->f32)",
+        },
+        Case {
+            label: "fcvtzs.s.w",
+            a: 0x4020_0000,
+            b: 0,
+            c: 0,
+            expect: 2, // integer result, not a float bit pattern
+            desc: "(i32)2.5f == 2 (truncate)",
+        },
+        Case {
+            label: "scvtf.w.s",
+            a: 3, // integer operand, not a float bit pattern
+            b: 0,
+            c: 0,
+            expect: 0x4040_0000,
+            desc: "(f32)3 == 3.0",
+        },
+    ]
+}
+
+/// Run all known-answer checks for the current host. Returns `Ok(n)` with the
+/// number of checks that ran, or `Err(failures)` listing every mismatch.
+///
+/// Only the result value is compared; the flags the oracle returns are ignored.
+pub fn run() -> Result<usize, Vec<String>> {
+    let mut executed = 0usize;
+    let mut failures = Vec::new();
+    // An op the architecture does not list is not a failure; filter it out.
+    for case in cases().into_iter().filter(|c| arch::supports(c.label)) {
+        // `None` => unsupported at runtime; skip silently.
+        let Some((res, _flags)) = arch::eval(case.label, case.a, case.b, case.c, rn()) else {
+            continue;
+        };
+        executed += 1;
+        if res == case.expect {
+            continue;
+        }
+        let (label, desc, expect) = (case.label, case.desc, case.expect);
+        failures.push(format!("FAIL {label}: {desc} -> got 0x{res:x}, expected 0x{expect:x}"));
+    }
+    if failures.is_empty() {
+        Ok(executed)
+    } else {
+        Err(failures)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn known_answers_hold_on_this_host() {
+        // Runs every known-answer case through `arch::eval` on the machine running the tests.
+        match run() {
+            Ok(n) => assert!(n > 0, "no self-test cases ran on this arch"), // Ok(0): all skipped
+            Err(fails) => panic!("self-test failed: {fails:?}"),
+        }
+    }
+}
diff --git a/spellcheck.dic b/spellcheck.dic
index 9574f35f..c4a1d682 100644
--- a/spellcheck.dic
+++ b/spellcheck.dic
@@ -304,4 +304,71 @@ wakers
 Wakers
 wakeup
 wakeups
-workstealing
\ No newline at end of file
+workstealing
+AArch64
+aarch64
+Altra
+Ampere
+bitpattern
+denormal
+denormals
+DAZ
+DZC
+fabd
+fadd
+fcvt
+fcvtxn
+fcvtzs
+fcvtzu
+fdiv
+fmadd
+fmax
+fmaxnm
+fmin
+fminnm
+fmul
+fmulx
+FPCR
+FPSR
+frecpe
+frecps
+frecpx
+frinta
+frintm
+frintn
+frintp
+frintz
+frsqrte
+frsqrts
+fsqrt
+fsub
+FTZ
+IDC
+IEEE
+IOC
+IXC
+libcore
+MXCSR
+NaN
+NaNs
+NDJSON
+OFC
+Qualcomm
+RMode
+roundings
+scvtf
+Snapdragon
+SplitMix
+subnormal
+subnormals
+ucvtf
+UFC
+urecpe
+ursqrte
+Volterra
+CPUID
+cpuinfo
+Gregorian
+Hinnant
+ISO
+proleptic