From a78a69533ee9b9f7326f2e2792284760dda35915 Mon Sep 17 00:00:00 2001 From: Mike Grier Date: Tue, 9 Jun 2026 15:14:17 -0400 Subject: [PATCH] fp-hw-survey: add cross-hardware FP behavior survey crate (capture/merge), workspace wiring, and rpres/afp feature detection --- Cargo.toml | 1 + crates/fp-hw-survey/.gitignore | 5 + crates/fp-hw-survey/Cargo.toml | 22 + crates/fp-hw-survey/README.md | 296 +++++++++++ crates/fp-hw-survey/src/arch/aarch64.rs | 537 ++++++++++++++++++++ crates/fp-hw-survey/src/arch/mod.rs | 20 + crates/fp-hw-survey/src/arch/unsupported.rs | 20 + crates/fp-hw-survey/src/arch/x86_64.rs | 321 ++++++++++++ crates/fp-hw-survey/src/capture.rs | 228 +++++++++ crates/fp-hw-survey/src/corpus.rs | 215 ++++++++ crates/fp-hw-survey/src/host.rs | 375 ++++++++++++++ crates/fp-hw-survey/src/jsonio.rs | 361 +++++++++++++ crates/fp-hw-survey/src/main.rs | 199 ++++++++ crates/fp-hw-survey/src/merge.rs | 181 +++++++ crates/fp-hw-survey/src/mode.rs | 130 +++++ crates/fp-hw-survey/src/normflags.rs | 81 +++ crates/fp-hw-survey/src/ops.rs | 185 +++++++ crates/fp-hw-survey/src/selftest.rs | 163 ++++++ spellcheck.dic | 69 ++- 19 files changed, 3408 insertions(+), 1 deletion(-) create mode 100644 crates/fp-hw-survey/.gitignore create mode 100644 crates/fp-hw-survey/Cargo.toml create mode 100644 crates/fp-hw-survey/README.md create mode 100644 crates/fp-hw-survey/src/arch/aarch64.rs create mode 100644 crates/fp-hw-survey/src/arch/mod.rs create mode 100644 crates/fp-hw-survey/src/arch/unsupported.rs create mode 100644 crates/fp-hw-survey/src/arch/x86_64.rs create mode 100644 crates/fp-hw-survey/src/capture.rs create mode 100644 crates/fp-hw-survey/src/corpus.rs create mode 100644 crates/fp-hw-survey/src/host.rs create mode 100644 crates/fp-hw-survey/src/jsonio.rs create mode 100644 crates/fp-hw-survey/src/main.rs create mode 100644 crates/fp-hw-survey/src/merge.rs create mode 100644 crates/fp-hw-survey/src/mode.rs create mode 100644 crates/fp-hw-survey/src/normflags.rs 
create mode 100644 crates/fp-hw-survey/src/ops.rs create mode 100644 crates/fp-hw-survey/src/selftest.rs diff --git a/Cargo.toml b/Cargo.toml index c7c0758e..1fab6ad3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ resolver = "2" members = [ "m", + "crates/fp-hw-survey", ] [workspace.metadata.spellcheck] diff --git a/crates/fp-hw-survey/.gitignore b/crates/fp-hw-survey/.gitignore new file mode 100644 index 00000000..334cfae5 --- /dev/null +++ b/crates/fp-hw-survey/.gitignore @@ -0,0 +1,5 @@ +/target +*.ndjson +*.ndjson.gz +captures/ +.scratch/ diff --git a/crates/fp-hw-survey/Cargo.toml b/crates/fp-hw-survey/Cargo.toml new file mode 100644 index 00000000..afb1e24a --- /dev/null +++ b/crates/fp-hw-survey/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "fp-hw-survey" +version = "0.1.0" +edition = "2021" +rust-version = "1.86" +authors = ["Michael Grier "] +license = "MIT" +description = "Cross-hardware IEEE-754 floating-point behavior survey: capture native FP results + exception flags on x86-64 and AArch64, then merge captures from many machines to find where hardware actually disagrees." +repository = "https://github.com/azure/m" +homepage = "https://github.com/azure/m" +readme = "README.md" + +# No external dependencies on purpose: this must build on stable Rust with std +# only, on macOS arm64, Windows arm64, and x86-64 Linux/Windows, with zero +# friction. + +[lints] +workspace = true + +[[bin]] +name = "fp-hw-survey" +path = "src/main.rs" diff --git a/crates/fp-hw-survey/README.md b/crates/fp-hw-survey/README.md new file mode 100644 index 00000000..29d9cf9e --- /dev/null +++ b/crates/fp-hw-survey/README.md @@ -0,0 +1,296 @@ +# fp-hw-survey + +A small, dependency-free Rust program that captures **native floating-point +hardware behavior** across many machines and architectures, then merges those +captures to find the rows where the hardware actually *disagrees*. 
+ +It exists to answer empirical questions like: + +- Do different AArch64 vendors (Qualcomm Snapdragon, Apple M-series, Ampere) + produce different bit patterns for the estimate instructions `frecpe` / + `frsqrte` and the related `frecpx` / `fmulx`? (Spoiler: the architecture says they + *shouldn't* — see [What we expect to find](#what-we-expect-to-find) — and the + survey's job is to confirm that empirically.) +- Where exactly do x86-64 and AArch64 differ — `fmax`/`fmin` semantics, + float→int out-of-range results, NaN propagation, flush-to-zero edges? + +Each machine runs a **deterministic** operand × rounding-mode corpus through the +real scalar instructions (via per-architecture inline asm — *not* Rust libcore +math, which would hide the divergence), and records the native +`(result_bits, exception_flags)`. Because the corpus is identical on every +machine, captures align row-for-row and the merge step can emit just the +disagreements. + +## What we expect to find + +A late but important correction to the tool's original premise: on **AArch64**, +the scalar estimate instructions are **not** loosely "implementation-defined." +The Arm Architecture Reference Manual defines `frecpe`/`frsqrte` via exact +shared pseudocode (`FPRecipEstimate` → `RecipEstimate`, `FPRSqrtEstimate` → `RecipSqrtEstimate`), a deterministic +fixed-point integer computation; `frecpx` and `fmulx` are likewise fully +specified. The result bits are reproducible from the spec alone — e.g. +`vrecpe(1,2,3,4)` is architecturally required to yield +`0.99805, 0.49902, 0.33301, 0.24951` on every conforming ARMv8 part. So: + +- **Intra-ARMv8 agreement is the predicted result.** Apple vs Snapdragon vs + Ampere *should* produce bit-identical estimates. A merge that reports **zero** + divergences across AArch64 machines is therefore the *informative*, expected + outcome — a positive confirmation of conformance, not a wasted run. 
+- **One legitimate intra-ARMv8 estimate divergence exists — and it is a + *feature* difference, not a conformance bug.** `FEAT_RPRES` selects a 12-bit + reciprocal / reciprocal-sqrt estimate table (when `FPCR.AH==1`, single + precision) where a non-RPRES part returns 8 bits. So a `frecpe.s` / + `frsqrte.s` disagreement between two machines is expected **only** when their + `rpres`/`AH` context differs; both answers are deterministic, just from + different mandated tables. The capture records `rpres`/`afp` in the header + `features` precisely so the merge can attribute such a row to the feature gap + rather than flag it as an erratum. +- **The genuinely divergent axes lie elsewhere:** ARMv7 NEON `VRECPE`/`VRSQRTE` + (pre-v8, genuinely looser — "read from a ROM with limited bins"), and + **cross-architecture** x86-64 ↔ AArch64 semantics (`fmax`/`fmin` NaN handling, + float→int saturation, flush-to-zero edges). +- **Spec ≠ proof of correct silicon.** The survey still earns its keep by + catching a part that *deviates* from the pseudocode (an erratum, a botched + subnormal/`FEAT_FP16`/sign-of-zero edge). "Should agree per spec" becomes + "does agree, measured." + +### Consequence for the downstream reference set + +This sharply shrinks the golden/reference data a consumer like `rook::fp` must +carry. If AArch64 estimates are architecturally fixed, the consumer needs **one +architecturally-derived table per architecture**, not a per-vendor / per-SKU +golden captured from every machine. The survey does **not** ship a multi-SKU × +full-corpus blob (which is where the tens-to-hundreds-of-MB figures came from); +it ships only the **divergence set** — the handful of keys that are genuinely +not unanimous (cross-arch corners, any conformance outlier). In the expected +case that intra-ARMv8 is unanimous, the AArch64 contribution to that reference +set is **empty**, and the per-machine captures are intermediate verification +artifacts, not data the consumer retains. 
+ +## Build + +Stable Rust, `std` only, **no external crates**. Builds out of the box on: + +- macOS arm64 (Apple Silicon) +- Windows on ARM (Snapdragon, e.g. Surface / Volterra / Lenovo X13s) +- Linux/Windows x86-64 + +```sh +cargo build --release +``` + +The binary is `target/release/fp-hw-survey` (`.exe` on Windows). + +## Supported architectures + +| Arch | Backend | Notes | +|------|---------|-------| +| `aarch64` | full scalar oracle | All 77 catalogued ops; half-precision (`.h`) ops require `FEAT_FP16`. | +| `x86_64` | SSE/SSE2 (+FMA3) | The SSE-mappable subset only: arithmetic, `fmax`/`fmin`, `fsqrt`, `fma`, f32↔f64, truncating signed float→int, signed int→float. Ops with no scalar SSE form (`fmaxnm`, `fmulx`, `fabd`, the estimate family, directed-rounding/unsigned conversions, all half ops) are skipped. | +| other | none | Produces only a header line. | + +> **x86-64 validation caveat:** the x86-64 inline asm was authored on an arm64 +> host and could not be executed there during development. Every `capture` run +> first executes a **known-answer self-test** for the local architecture and +> **aborts** if any check fails, so a broken oracle never emits untrustworthy +> data. Still, the first time you run this on real x86-64 hardware, eyeball the +> `selftest` output. + +## Usage + +### Generating a capture (run on each machine) + +A capture is a single self-describing NDJSON file. Its **header line records the +hardware identity and the capture date/time automatically** — you do not have to +supply them — so a capture file is always traceable back to the machine and the +moment it was produced. + +**Step 1 — build the release binary on the target machine.** + +```sh +# from the workspace root (c:\github\m or your clone) +cargo build --release -p fp-hw-survey +``` + +Use `--release`: a capture runs a large operand × mode sweep, and the debug +build is much slower. 
Optimization level does **not** change the captured +results (each op runs inside its own inline-asm block), only the wall-clock +time. The binary lands at: + +- Linux/macOS: `target/release/fp-hw-survey` +- Windows: `target\release\fp-hw-survey.exe` + +**Step 2 — sanity-check the host (optional but recommended).** + +```sh +fp-hw-survey info # arch, OS, CPU brand, detected features, supported-op count +fp-hw-survey selftest # known-answer checks for this machine's oracle +``` + +`info` is also the quickest way to confirm the tool detected the CPU and +features (e.g. `fp16` on AArch64) correctly before you commit to a full run. + +**Step 3 — run the capture.** + +```sh +fp-hw-survey capture --label "snapdragon-x-elite-win" +``` + +`capture` first runs the self-test and **aborts without writing if any +known-answer check fails**, so a broken oracle never produces untrustworthy +data. On success it writes `capture-