From a02b7993e19b68c76757d7239f3bafe27c9708d5 Mon Sep 17 00:00:00 2001
From: Peter Neiss <neiss@plasus.de>
Date: Mon, 22 Jun 2026 22:31:57 +0200
Subject: [PATCH] docs: README perf table + math bullet for the float engine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Performance table: split the single `math::sin` row into a double-engine and
  a float-engine (`math::flt::sin`, f32 operand) row, each vs its native baseline
  (`std::sin` / `std::sinf`), measured in one fresh bench run so they are mutually
  comparable. Note explains the float engine tracks the double engine on a
  double-capable host (~1.15× here) and its real win is single-precision-only
  FPUs paired with f32 storage (all-hardware-float, no soft-double at the I/O
  boundary). Absolute ns is machine-dependent (flagged).
- Features bullet: "two engines" → three engines callable by namespace
  (dbl/flt/cordic), the build-default macros, BND_MATH_NO_FP, and snap-not-real.
- bench.cpp: add the `math::flt::sin` / `std::sinf` blocks (guarded
  !BND_MATH_FIXED) + an f32-backed angle vector, so the number is measured by the
  committed harness. Builds clean under default / float / CORDIC.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 README.md       | 31 +++++++++++++++++++++----------
 tests/bench.cpp | 12 ++++++++++++
 2 files changed, 33 insertions(+), 10 deletions(-)
diff --git a/README.md b/README.md
index 294d17c..6fa94b4 100644
--- a/README.md
+++ b/README.md
@@ -97,14 +97,16 @@ under `include/` with `cmake --build build --target amalgamate` (see
   and STL/ranges integration. Plus predefined hardware-width aliases
   (`bnd::u8`, `bnd::unorm16`, `bnd::q8_8`, …) in `bound/formats.hpp`.
   See [docs/storage.md](docs/storage.md).
-- **Reproducible math, two engines** — a `<cmath>`-shaped function set over
+- **Reproducible math, three engines** — a `<cmath>`-shaped function set over
   bounds (`sin`/`cos`/`tan`, `asin`/`acos`/`atan`/`atan2`, `sinh`/`cosh`/`tanh`,
-  `exp`/`log`/`log2`/`log10`/`pow`, `sqrt`/`cbrt`/`hypot`). One API, two
-  build-time engines: a fast `double` engine (default, bit-identical across
-  IEEE-754 platforms) and an integer/CORDIC engine
-  (`-DBOUND_MATH_FIXED=ON` — `constexpr`, FPU-free, bit-identical
-  unconditionally). Math operands carry the `real` policy; angles are
-  radians; output grids auto-deduce. See [docs/math.md](docs/math.md).
+  `exp`/`log`/`log2`/`log10`/`pow`, `sqrt`/`cbrt`/`hypot`). One API; three engines
+  callable side-by-side by namespace (`dbl::` binary64, `flt::` binary32,
+  `cordic::` integer/FPU-free `constexpr`), each bit-identical across platforms.
+  The unqualified `bnd::math::fn` picks the build default (`-DBOUND_MATH_FIXED=ON`
+  → cordic, `-DBOUND_MATH_FLOAT=ON` → float, else double); `-DBND_MATH_NO_FP`
+  drops `<cmath>` for bare metal. Operands need only the `snap` bit (`f64`/`f32`
+  storage is an optional fast path); angles are radians; output grids auto-deduce.
+  See [docs/math.md](docs/math.md).
 - **Library internals** — grid invariants, storage decision tree, Q-format
   fast path, policy cascade. See [docs/internals.md](docs/internals.md).
 
@@ -166,7 +168,8 @@ scenario, native baseline paired with each bound case). Lower is better.
 | `transform(b += 1)` 10k uint8-width elts (unsafe) | 1.02 µs | 1.02 µs | **1.0×** (SIMD) |
 | Q-format store from exact fraction | ~6 ns | n/a | n/a |
 | Q-format store from `double` | ~46 ns | n/a | n/a |
-| `math::sin` (`real` operand, double engine) | 35 ns | 23 ns (`std::sin`) | 1.5× |
+| `math::sin` (`f64` operand, double engine) | 62 ns | 107 ns (`std::sin`) | **0.58×** |
+| `math::flt::sin` (`f32` operand, float engine) | 72 ns | 77 ns (`std::sinf`) | **0.94×** |
 | `math::fmod` (integer grids) | 19 ns | 25 ns (`std::fmod`) | **0.76×** |
 
 Notes:
@@ -185,10 +188,18 @@ Notes:
   no-overflow is proven upfront, then convert back to a `checked` bound
   after the loop.
 - The `math::*` rows measure the call alone (the bench constructs inputs
-  outside the timed blocks); `real`-policy operands are double-backed, so
-  input marshalling is free and the gap to `std::` is the output grid-snap.
+  outside the timed blocks); `f64`/`f32` operands are fp-backed, so input
+  marshalling is free and any overhead versus `std::` is the output grid-snap.
   `math::fmod` on commensurable integer grids beats `std::fmod` — it is a
   single integer remainder.
+- The two `math::sin` rows come from one fresh bench run, so they are directly
+  comparable to each other; absolute ns is machine-dependent (here the double
+  engine's own polynomial even outruns this libm's `std::sin`). The
+  **float engine** (`math::flt::*`, binary32) tracks the double engine closely
+  on a double-capable host (~1.15× here) and is comparable to `std::sinf`. Its
+  real win is not throughput on this box but **single-precision-only FPUs**
+  (e.g. Cortex-M4F), where pairing it with `f32` storage keeps the whole path
+  in hardware `float` — no soft-`double` at the I/O boundary.
 
 ## Build & Test
 
diff --git a/tests/bench.cpp b/tests/bench.cpp
index 4dee896..b901037 100644
--- a/tests/bench.cpp
+++ b/tests/bench.cpp
@@ -468,6 +468,8 @@ void bench_cmath()
   using log_in_t  = bound<{{0x1p-8_r, 256}, notch<1, 256>}, round_nearest | real>;
   using pow_in_t  = bound<{{-9, 9}, notch<1, 16384>}, round_nearest | real>;
   using angle_t   = bound<{{-8, 8}, notch<1, 16384>}, round_nearest | real>;
+  // Same grid, f32 (binary32) storage — the natural pairing for the float engine.
+  using angle_f32_t = bound<{{-8, 8}, notch<1, 16384>}, round_nearest | f32>;
   using tan_in_t  = bound<{{-0.75_r, 0.75_r}, notch<1, 16384>}, round_nearest | real>;
   using atan2_in_t= bound<{{-1, 1}, notch<1, 16384>}, round_nearest | real>;
   using fmod_x_t  = bound<{{-8, 8}, notch<1, 16384>}, round_nearest>;
@@ -482,6 +484,7 @@ void bench_cmath()
   std::vector<exp2_in_t>  v_exp2;  std::vector<log2_in_t>  v_log2;
   std::vector<exp_in_t>   v_exp;   std::vector<log_in_t>   v_log;
   std::vector<pow_in_t>   v_pow;   std::vector<angle_t>    v_ang;
+  std::vector<angle_f32_t> v_angf;
   std::vector<tan_in_t>   v_tan;   std::vector<atan2_in_t> v_aty, v_atx;
   std::vector<fmod_x_t>   v_fmx;   std::vector<fmod_y_t>   v_fmy;
   std::vector<double> d_qs, d_q, d_log2, d_log, d_tan, d_aty, d_atx, d_fmy;
@@ -501,6 +504,7 @@ void bench_cmath()
     v_exp2.push_back(exp2_in_t{qs}); v_log2.push_back(log2_in_t{rl2});
     v_exp.push_back(exp_in_t{qs});   v_log.push_back(log_in_t{rl});
     v_pow.push_back(pow_in_t{qs});   v_ang.push_back(angle_t{qs});
+    v_angf.push_back(angle_f32_t{qs});
     v_tan.push_back(tan_in_t{rt});   v_aty.push_back(atan2_in_t{ry});
     v_atx.push_back(atan2_in_t{rx}); v_fmx.push_back(fmod_x_t{qs});
     v_fmy.push_back(fmod_y_t{rfy});
@@ -603,6 +607,14 @@ void bench_cmath()
     { CTRACK_NAME("std::sin    double");
       auto r = std::sin(d_qs[j]);
       do_not_optimize(r); }
+#ifndef BND_MATH_FIXED
+    { CTRACK_NAME("math::flt::sin f32");          // float engine, f32-backed input
+      auto r = bnd::math::flt::sin(v_angf[j]);
+      do_not_optimize(r.raw()); }
+    { CTRACK_NAME("std::sinf   float");
+      auto r = std::sin(static_cast<float>(d_qs[j]));
+      do_not_optimize(r); }
+#endif
 
     { CTRACK_NAME("math::cos   bound");
       auto r = bnd::math::cos(v_ang[j]);