From a02b7993e19b68c76757d7239f3bafe27c9708d5 Mon Sep 17 00:00:00 2001 From: Peter Neiss Date: Mon, 22 Jun 2026 22:31:57 +0200 Subject: [PATCH] docs: README perf table + math bullet for the float engine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Performance table: split the single `math::sin` row into a double-engine and a float-engine (`math::flt::sin`, f32 operand) row, each vs its native baseline (`std::sin` / `std::sinf`), measured in one fresh bench run so they are mutually comparable. Note explains the float engine tracks the double engine on a double-capable host (~1.15× here) and its real win is single-precision-only FPUs paired with f32 storage (all-hardware-float, no soft-double at the I/O boundary). Absolute ns is machine-dependent (flagged). - Features bullet: "two engines" → three engines callable by namespace (dbl/flt/cordic), the build-default macros, BND_MATH_NO_FP, and snap-not-real. - bench.cpp: add the `math::flt::sin` / `std::sinf` blocks (guarded !BND_MATH_FIXED) + an f32-backed angle vector, so the number is measured by the committed harness. Builds clean under default / float / CORDIC. Co-Authored-By: Claude Opus 4.8 --- README.md | 31 +++++++++++++++++++++---------- tests/bench.cpp | 12 ++++++++++++ 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 294d17c..6fa94b4 100644 --- a/README.md +++ b/README.md @@ -97,14 +97,16 @@ under `include/` with `cmake --build build --target amalgamate` (see and STL/ranges integration. Plus predefined hardware-width aliases (`bnd::u8`, `bnd::unorm16`, `bnd::q8_8`, …) in `bound/formats.hpp`. See [docs/storage.md](docs/storage.md). -- **Reproducible math, two engines** — a `<cmath>`-shaped function set over +- **Reproducible math, three engines** — a `<cmath>`-shaped function set over bounds (`sin`/`cos`/`tan`, `asin`/`acos`/`atan`/`atan2`, `sinh`/`cosh`/`tanh`, - `exp`/`log`/`log2`/`log10`/`pow`, `sqrt`/`cbrt`/`hypot`). 
One API, two - build-time engines: a fast `double` engine (default, bit-identical across - IEEE-754 platforms) and an integer/CORDIC engine - (`-DBOUND_MATH_FIXED=ON` — `constexpr`, FPU-free, bit-identical - unconditionally). Math operands carry the `real` policy; angles are - radians; output grids auto-deduce. See [docs/math.md](docs/math.md). + `exp`/`log`/`log2`/`log10`/`pow`, `sqrt`/`cbrt`/`hypot`). One API; three engines + callable side-by-side by namespace (`dbl::` binary64, `flt::` binary32, + `cordic::` integer/FPU-free `constexpr`), each bit-identical across platforms. + The unqualified `bnd::math::fn` picks the build default (`-DBOUND_MATH_FIXED=ON` + → cordic, `-DBOUND_MATH_FLOAT=ON` → float, else double); `-DBND_MATH_NO_FP` + drops `<cmath>` for bare metal. Operands need only the `snap` bit (`f64`/`f32` + storage is an optional fast path); angles are radians; output grids auto-deduce. + See [docs/math.md](docs/math.md). - **Library internals** — grid invariants, storage decision tree, Q-format fast path, policy cascade. See [docs/internals.md](docs/internals.md). @@ -166,7 +168,8 @@ scenario, native baseline paired with each bound case). Lower is better. | `transform(b += 1)` 10k uint8-width elts (unsafe) | 1.02 µs | 1.02 µs | **1.0×** (SIMD) | | Q-format store from exact fraction | ~6 ns | n/a | n/a | | Q-format store from `double` | ~46 ns | n/a | n/a | -| `math::sin` (`real` operand, double engine) | 35 ns | 23 ns (`std::sin`) | 1.5× | +| `math::sin` (`f64` operand, double engine) | 62 ns | 107 ns (`std::sin`) | **0.58×** | +| `math::flt::sin` (`f32` operand, float engine) | 72 ns | 77 ns (`std::sinf`) | **0.94×** | | `math::fmod` (integer grids) | 19 ns | 25 ns (`std::fmod`) | **0.76×** | Notes: @@ -185,10 +188,18 @@ Notes: no-overflow is proven upfront, then convert back to a `checked` bound after the loop. 
- The `math::*` rows measure the call alone (the bench constructs inputs - outside the timed blocks); `real`-policy operands are double-backed, so - input marshalling is free and the gap to `std::` is the output grid-snap. + outside the timed blocks); `f64`/`f32` operands are fp-backed, so input + marshalling is free and the gap to `std::` is the output grid-snap. `math::fmod` on commensurable integer grids beats `std::fmod` — it is a single integer remainder. +- The two `math::sin` rows are a fresh same-run measurement (so they are + directly comparable to each other; absolute ns is machine-dependent — here + the own-polynomial double engine even edges out this libm's `std::sin`). The + **float engine** (`math::flt::*`, binary32) tracks the double engine closely + on a double-capable host (~1.15× here) and is comparable to `std::sinf`; its + real win is on **single-precision-only FPUs** (Cortex-M4F), where pairing it + with `f32` storage keeps the whole path in hardware `float` — no soft-`double` + at the I/O boundary — rather than throughput on this box. ## Build & Test diff --git a/tests/bench.cpp b/tests/bench.cpp index 4dee896..b901037 100644 --- a/tests/bench.cpp +++ b/tests/bench.cpp @@ -468,6 +468,8 @@ void bench_cmath() using log_in_t = bound<{{0x1p-8_r, 256}, notch<1, 256>}, round_nearest | real>; using pow_in_t = bound<{{-9, 9}, notch<1, 16384>}, round_nearest | real>; using angle_t = bound<{{-8, 8}, notch<1, 16384>}, round_nearest | real>; + // Same grid, f32 (binary32) storage — the natural pairing for the float engine. 
+ using angle_f32_t = bound<{{-8, 8}, notch<1, 16384>}, round_nearest | f32>; using tan_in_t = bound<{{-0.75_r, 0.75_r}, notch<1, 16384>}, round_nearest | real>; using atan2_in_t= bound<{{-1, 1}, notch<1, 16384>}, round_nearest | real>; using fmod_x_t = bound<{{-8, 8}, notch<1, 16384>}, round_nearest>; @@ -482,6 +484,7 @@ void bench_cmath() std::vector v_exp2; std::vector v_log2; std::vector v_exp; std::vector v_log; std::vector v_pow; std::vector v_ang; + std::vector v_angf; std::vector v_tan; std::vector v_aty, v_atx; std::vector v_fmx; std::vector v_fmy; std::vector d_qs, d_q, d_log2, d_log, d_tan, d_aty, d_atx, d_fmy; @@ -501,6 +504,7 @@ void bench_cmath() v_exp2.push_back(exp2_in_t{qs}); v_log2.push_back(log2_in_t{rl2}); v_exp.push_back(exp_in_t{qs}); v_log.push_back(log_in_t{rl}); v_pow.push_back(pow_in_t{qs}); v_ang.push_back(angle_t{qs}); + v_angf.push_back(angle_f32_t{qs}); v_tan.push_back(tan_in_t{rt}); v_aty.push_back(atan2_in_t{ry}); v_atx.push_back(atan2_in_t{rx}); v_fmx.push_back(fmod_x_t{qs}); v_fmy.push_back(fmod_y_t{rfy}); @@ -603,6 +607,14 @@ void bench_cmath() { CTRACK_NAME("std::sin double"); auto r = std::sin(d_qs[j]); do_not_optimize(r); } +#ifndef BND_MATH_FIXED + { CTRACK_NAME("math::flt::sin f32"); // float engine, f32-backed input + auto r = bnd::math::flt::sin(v_angf[j]); + do_not_optimize(r.raw()); } + { CTRACK_NAME("std::sinf float"); + auto r = std::sin(static_cast(d_qs[j])); + do_not_optimize(r); } +#endif { CTRACK_NAME("math::cos bound"); auto r = bnd::math::cos(v_ang[j]);