From 4a08f868ca840c1f297967ae63a12f14988d1e0a Mon Sep 17 00:00:00 2001 From: Peter Neiss Date: Mon, 22 Jun 2026 20:38:04 +0200 Subject: [PATCH] math: add the flt (binary32) engine, callable as bnd::math::flt:: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A third math engine alongside cordic (integer) and dbl (binary64): a small reproducible libm in `float`, for single-precision-only FPUs (Cortex-M4F etc.) and size/speed where double-grade precision isn't needed. bnd::math::flt::fn binary32 compute, present unless BND_MATH_NO_FP Same recipe as the double engine but in single precision (new cmath_float.hpp): own fixed polynomials evaluated with the correctly-rounded std::fma(float), and Cody-Waite range-reduction constants DERIVED AT COMPILE TIME (mask the low mantissa bits of the float-rounded constant — no external codegen), plus the correctly-rounded std::sqrt(float). So flt is bit-identical on every IEEE-754 binary32 platform — the same determinism contract as dbl. Design notes: - Empirically validated the float cores against libm: trig ~0.6 ULP, atan ~0.8, log ~1, exp a few ULP, holding across the SHARED +-2^20 domain (the constexpr split keeps float reduction float-grade even at the edge). So flt reuses the same input envelopes — the same programs compile on every engine. (cos uses proper quadrant reduction, not sin(x+pi/2): shifting the float input loses bits.) - A third value set: float != double != cordic; snapped results differ by up to a few notches on fine grids, coincide on coarse grids and exact inputs. - flt:: has the full public-shaped API mirroring dbl:: (auto-deduced grids, domain static_asserts, expected-returning tan/sqrt-signed/pow). Gated behind !BND_MATH_NO_FP; a flt:: call under the FP-free build is a compile error. - flt::store widens float->double for `real` (f64-backed) bounds or snaps via the rational path otherwise (an f32-backed storage fast path comes with Phase 4b). Tests: test_math_engines.cpp now instantiates cordic+dbl+flt in one TU, checks they agree on exact inputs, and pins 10 float-engine golden values (determinism). Docs updated (three-engine table + flt section). Single header regenerated. Deferred to Phase 4b: BND_MATH_FLOAT to make flt the unqualified default, and the f32 storage flag + real->f64 rename. Verified: default 404/404, CORDIC 442/442, all single-header smokes build. Co-Authored-By: Claude Opus 4.8 --- docs/math.md | 41 +++- include/bound/cmath.hpp | 145 +++++++++++- include/bound/cmath_float.hpp | 277 ++++++++++++++++++++++ single_include/bound/bound.hpp | 418 +++++++++++++++++++++++++++++++++ tests/test_math_engines.cpp | 53 +++++ 5 files changed, 924 insertions(+), 10 deletions(-) create mode 100644 include/bound/cmath_float.hpp diff --git a/docs/math.md b/docs/math.md index 39e6b1b..87dd032 100644 --- a/docs/math.md +++ b/docs/math.md @@ -215,16 +215,17 @@ FPU-free build. The default double engine is the right choice everywhere else: it carries the same grid guarantees and is reproducible across IEEE-754 platforms compiled without `-ffast-math`. -## Choosing an engine per call (`cordic::` / `dbl::`) +## Choosing an engine per call (`cordic::` / `dbl::` / `flt::`) -The unqualified `bnd::math::fn` uses the build's default engine. Both engines are -also reachable by name, **callable side-by-side in the same binary**: +The unqualified `bnd::math::fn` uses the build's default engine. All three engines +are also reachable by name, **callable side-by-side in the same binary**: | Namespace | Engine | Availability | |---|---|---| | `bnd::math::cordic::fn` | integer / CORDIC | **always** (constexpr, FPU-free) | -| `bnd::math::dbl::fn` | `double` | unless `BND_MATH_NO_FP` | -| `bnd::math::fn` | the default | alias of `cordic` under `BND_MATH_FIXED`/`BND_MATH_NO_FP`, else `dbl` | +| `bnd::math::dbl::fn` | `double` (binary64) | unless `BND_MATH_NO_FP` | +| `bnd::math::flt::fn` | `float` (binary32) | unless `BND_MATH_NO_FP` | +| `bnd::math::fn` | the default | `cordic` under `BND_MATH_FIXED`/`BND_MATH_NO_FP`, else `dbl` | The qualified entry points have the **same signatures, domains, auto-deduced output grids, and domain `static_assert`s** as the unqualified one — only the @@ -235,14 +236,36 @@ using A = bound<{{-8, 8}, notch<1, 16384>}, round_nearest | real>; auto a = math::cordic::sin(A{1}); // bit-exact across every target — replay/sim auto b = math::dbl::sin(A{1}); // ~2× faster — hot, accuracy-insensitive path +auto f = math::flt::sin(A{1}); // binary32 — single-precision FPUs (Cortex-M4F) auto c = math::sin(A{1}); // whichever the build selected ``` -Because the engines are independent approximations, `cordic::fn` and `dbl::fn` -can disagree by up to one notch on rounding ties (the table-maker's dilemma — see +Because the engines are independent approximations, they can disagree by a notch +or two on rounding ties (the table-maker's dilemma — see [determinism.md](determinism.md)); algebraically-exact inputs (e.g. `sqrt(4)`, -`pow(2,4)`) land identically. Under `BND_MATH_NO_FP` the `dbl::` namespace is not -defined, so a `dbl::` call there is a compile error; `cordic::` always works. +`pow(2,4)`) land identically on all three. Under `BND_MATH_NO_FP` neither `dbl::` +nor `flt::` is defined, so a call to either there is a compile error; `cordic::` +always works. + +### The `flt` (binary32) engine + +`flt::` evaluates the same fixed polynomials as `dbl::` but in single precision, +with its own compile-time-derived Cody-Waite range-reduction constants and the +correctly-rounded `std::fma(float)`/`std::sqrt(float)` — so it is **bit-identical +on every IEEE-754 binary32 platform** (same determinism contract as `dbl`). It +exists for **single-precision-only FPUs** (Cortex-M4F and similar) and for +size/speed where double-grade precision isn't needed. + +- It is a **third value set**: `float ≠ double ≠ cordic`. Snapped results differ + from the double engine by up to a few notches on fine grids; on coarse grids + (notch ≫ binary32 ULP) they typically coincide. +- It keeps the **shared input domain** (e.g. `sin`/`cos` over `|x| ≤ 2²⁰`): the + constexpr split holds float reduction across that range (precision degrades + toward the edge but stays float-grade), so the same programs compile on every + engine. +- Precision: trig ≈ 1 ULP of `float`, `exp`/compositions a handful of ULP, then + quantized onto the output grid. Ships its own golden pins + (`tests/test_math_engines.cpp`). ## Compiling without floating point (`BND_MATH_NO_FP`) diff --git a/include/bound/cmath.hpp b/include/bound/cmath.hpp index 20f3dce..8d1b2d0 100644 --- a/include/bound/cmath.hpp +++ b/include/bound/cmath.hpp @@ -5,7 +5,8 @@ #define BNDcmathHPP #include "bound/bound.hpp" -#include "bound/cmath_double.hpp" // the default (double) math engine +#include "bound/cmath_double.hpp" // the double (binary64) math engine +#include "bound/cmath_float.hpp" // the float (binary32) math engine #include "slim/expected.hpp" // slim::expected, slim::unexpected @@ -2232,6 +2233,148 @@ namespace bnd::math return slim::expected{store(r)}; } } // namespace dbl + + namespace flt + { + // Public-shaped float-engine entry points — binary32 compute, same shapes as + // dbl:: (qualify shared helpers as bnd::math::detail::; *_core/store/d_* are + // this namespace's own). A third value set (float ≠ double ≠ cordic). + template + requires (Lower == bnd::detail::rational{0}) + [[nodiscard]] BND_DBL_FN auto sqrt(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return sqrt_core>(x); } + + template + requires (Lower < bnd::detail::rational{0}) + [[nodiscard]] BND_DBL_FN auto sqrt(In x) noexcept + { + static_assert(bnd::math::detail::require_snap()); + using Out = bnd::math::detail::sqrt_signed_auto_t; + float v = static_cast(x); + if (v < 0.0f) + return slim::expected{slim::unexpected(errc::domain_error)}; + return slim::expected{store(detail::d_sqrt(v))}; + } + + template + [[nodiscard]] BND_DBL_FN auto exp2(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return exp2_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto log2(In x) noexcept + { + static_assert(bnd::math::detail::require_snap()); + static_assert(Lower > 0, "bnd::math::flt::log2: input must be strictly positive"); + return log2_core>(x); + } + + template + [[nodiscard]] BND_DBL_FN auto exp(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return exp_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto log(In x) noexcept + { + static_assert(bnd::math::detail::require_snap()); + static_assert(Lower > 0, "bnd::math::flt::log: input must be strictly positive"); + return log_core>(x); + } + + template + [[nodiscard]] BND_DBL_FN auto pow_base(In x) noexcept + { + static_assert(bnd::math::detail::require_snap()); + using Out = bnd::math::detail::pow_base_auto_t; + return store(detail::d_pow(static_cast(Base), static_cast(x))); + } + + template + [[nodiscard]] BND_DBL_FN auto sin(In angle) noexcept + { static_assert(bnd::math::detail::require_snap()); return sin_core>(angle); } + + template + [[nodiscard]] BND_DBL_FN auto cos(In angle) noexcept + { static_assert(bnd::math::detail::require_snap()); return cos_core>(angle); } + + template + [[nodiscard]] BND_DBL_FN auto tan(In angle) noexcept + { + static_assert(bnd::math::detail::require_snap()); + using Out = bnd::math::detail::tan_auto_t; + float x = static_cast(angle); + float c = detail::d_cos(x); + if (c == 0.0f) + return slim::expected{slim::unexpected(errc::division_by_zero)}; + float t = detail::d_sin(x) / c; + if constexpr (!has_flag(BoundPolicy, clamp)) // clamp Out: saturate below + if (t < static_cast(Lower) || t > static_cast(Upper)) + return slim::expected{slim::unexpected(errc::overflow)}; + return slim::expected{store(t)}; + } + + template + [[nodiscard]] BND_DBL_FN auto atan2(In y, In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return atan2_core>(y, x); } + + template + [[nodiscard]] BND_DBL_FN auto atan(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return atan_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto asin(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return asin_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto acos(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return acos_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto sinh(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return sinh_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto cosh(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return cosh_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto tanh(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return tanh_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto log10(In x) noexcept + { + static_assert(bnd::math::detail::require_snap()); + static_assert(Lower > 0, "bnd::math::flt::log10: input must be strictly positive"); + return log10_core>(x); + } + + template + [[nodiscard]] BND_DBL_FN auto cbrt(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return cbrt_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto hypot(InX x, InY y) noexcept + { + static_assert(bnd::math::detail::require_snap() && bnd::math::detail::require_snap()); + return hypot_core>(x, y); + } + + template + requires (Lower > bnd::detail::rational{0}) + [[nodiscard]] BND_DBL_FN auto pow(InB base, InE exp) noexcept + { + static_assert(bnd::math::detail::require_snap() && bnd::math::detail::require_snap()); + using Out = bnd::math::detail::pow_auto_t; + float b = static_cast(base); + if (b <= 0.0f) + return slim::expected{slim::unexpected(errc::domain_error)}; + float r = detail::d_pow(b, static_cast(exp)); + if constexpr (!has_flag(BoundPolicy, clamp)) // clamp Out: saturate below + if (r < static_cast(Lower) || r > static_cast(Upper)) + return slim::expected{slim::unexpected(errc::overflow)}; + return slim::expected{store(r)}; + } + } // namespace flt #endif // !BND_MATH_NO_FP } diff --git a/include/bound/cmath_float.hpp b/include/bound/cmath_float.hpp new file mode 100644 index 0000000..abb77e5 --- /dev/null +++ b/include/bound/cmath_float.hpp @@ -0,0 +1,277 @@ +//--------------------------------------------------------------------------- +// Copyright (C) 2026 Peter Neiss +//--------------------------------------------------------------------------- +// bnd::math float engine — a small, reproducible libm in `float` (binary32). +// Bit-identical on every IEEE-754 binary32 platform compiled without +// `-ffast-math`, via the same recipe as the double engine but in single +// precision, for single-precision-only FPUs (Cortex-M4F etc.) and size/speed: +// * NO transcendentals — own fixed polynomials evaluated with the +// correctly-rounded std::fma(float) (immune to FMA-contraction). +// * Cody-Waite range reduction whose hi/lo split constants are derived at +// compile time (mask the low mantissa bits of the float-rounded constant), +// no external codegen — honoring the bit-exact contract. +// * Correctly-rounded std::sqrt(float). +// Float is a THIRD value set: float ≠ double ≠ cordic, each ≤ a few notches of +// truth where the grid permits (table-maker's dilemma — see determinism.md). +// Present only when FP is available; compiled out under BND_MATH_NO_FP. +//--------------------------------------------------------------------------- +#ifndef BNDcmathfloatHPP +#define BNDcmathfloatHPP + +#include "bound/cmath_double.hpp" // resolves BND_MATH_NO_FP; BND_DBL_FN; store base + +#ifndef BND_MATH_NO_FP + +#include // std::bit_cast (constexpr Cody-Waite split derivation) +#include // std::fma, std::sqrt, std::nearbyint, std::ldexp, std::frexp (float) + +namespace bnd::math::flt::detail +{ + using std::fma; + + // Constexpr Cody-Waite split of a high-precision (double) reference into a + // float `hi` with `keep` significant mantissa bits (low bits zeroed, so k·hi + // stays exact for modest k) plus a float `lo` carrying the remainder. Pure + // bit manipulation — identical on every IEEE-754 target. + inline constexpr float split_hi(double full, int keep) noexcept + { + float f = static_cast(full); + std::uint32_t b = std::bit_cast(f); + b &= ~((std::uint32_t{1} << (23 - keep)) - 1); + return std::bit_cast(b); + } + inline constexpr float split_lo(double full, int keep) noexcept + { + return static_cast(full - static_cast(split_hi(full, keep))); + } + + // High-precision references (double literals; only their float projections are + // used at runtime). 12 kept bits hold accuracy across the shared ±2^20 domain. + inline constexpr double kPiD = 3.14159265358979323846; + inline constexpr double kPiO2D = kPiD / 2; + inline constexpr double kLn2D = 0.69314718055994530942; + + inline constexpr float kPio2Hi = split_hi(kPiO2D, 12); + inline constexpr float kPio2Lo = split_lo(kPiO2D, 12); + inline constexpr float kTwoOverPi = static_cast(2.0 / kPiD); + inline constexpr float kLn2Hi = split_hi(kLn2D, 12); + inline constexpr float kLn2Lo = split_lo(kLn2D, 12); + inline constexpr float kLn2Full = static_cast(kLn2D); + inline constexpr float kLog2e = static_cast(1.44269504088896340736); + inline constexpr float kLog10e = static_cast(0.43429448190325182765); + inline constexpr float kSqrtHalf = 0x1.6a09e6p-1f; // √½ + inline constexpr float kPi = static_cast(kPiD); + inline constexpr float kPiHalf = static_cast(kPiO2D); + inline constexpr float kPiSixth = static_cast(kPiD / 6); + inline constexpr float kInvSqrt3 = 0x1.279a74p-1f; // 1/√3 = tan(π/6) + inline constexpr float kTanPi12 = 0x1.126146p-2f; // tan(π/12) + + // sin(r), r ∈ [−π/4, π/4]: r·P(r²) to r¹¹ (float-sufficient). + inline BND_DBL_FN float sin_poly(float r) + { + float z = r * r; + float p = -1.0f / 39916800.0f; // −1/11! + p = fma(p, z, 1.0f / 362880.0f); // 1/9! + p = fma(p, z, -1.0f / 5040.0f); // −1/7! + p = fma(p, z, 1.0f / 120.0f); // 1/5! + p = fma(p, z, -1.0f / 6.0f); // −1/3! + p = fma(p, z, 1.0f); // 1 + return r * p; + } + + // cos(r), r ∈ [−π/4, π/4]: Q(r²) to r¹⁰. + inline BND_DBL_FN float cos_poly(float r) + { + float z = r * r; + float p = -1.0f / 3628800.0f; // −1/10! + p = fma(p, z, 1.0f / 40320.0f); // 1/8! + p = fma(p, z, -1.0f / 720.0f); // −1/6! + p = fma(p, z, 1.0f / 24.0f); // 1/4! + p = fma(p, z, -1.0f / 2.0f); // −1/2! + p = fma(p, z, 1.0f); // 1 + return p; + } + + // e^r, r ∈ [−ln2/2, ln2/2]: Σ rᵏ/k! to r⁷. + inline BND_DBL_FN float exp_poly(float r) + { + float p = 1.0f / 5040.0f; // 1/7! + p = fma(p, r, 1.0f / 720.0f); // 1/6! + p = fma(p, r, 1.0f / 120.0f); // 1/5! + p = fma(p, r, 1.0f / 24.0f); // 1/4! + p = fma(p, r, 1.0f / 6.0f); // 1/3! + p = fma(p, r, 1.0f / 2.0f); // 1/2! + p = fma(p, r, 1.0f); // 1/1! + p = fma(p, r, 1.0f); // 1 + return p; + } + + // Shared quadrant reduction: x → (r ∈ [−π/4,π/4], q = quadrant mod 4). + inline BND_DBL_FN float reduce_quadrant(float x, long& q) + { + float k = std::nearbyint(x * kTwoOverPi); + float r = fma(-k, kPio2Hi, x); + r = fma(-k, kPio2Lo, r); + q = static_cast(k) & 3; + return r; + } + + inline BND_DBL_FN float d_sin(float x) + { + long q; float r = reduce_quadrant(x, q); + switch (q) { + case 0: return sin_poly(r); + case 1: return cos_poly(r); + case 2: return -sin_poly(r); + default: return -cos_poly(r); + } + } + + // cos via quadrant reduction (NOT sin(x+π/2): shifting the float input would + // lose bits for large x — the double engine can afford that, float cannot). + inline BND_DBL_FN float d_cos(float x) + { + long q; float r = reduce_quadrant(x, q); + switch (q) { + case 0: return cos_poly(r); + case 1: return -sin_poly(r); + case 2: return -cos_poly(r); + default: return sin_poly(r); + } + } + + // e^x = 2^k · e^r, x = k·ln2 + r, r ∈ [−ln2/2, ln2/2]. + inline BND_DBL_FN float d_exp(float x) + { + float k = std::nearbyint(x * kLog2e); + float r = fma(-k, kLn2Hi, x); + r = fma(-k, kLn2Lo, r); + return std::ldexp(exp_poly(r), static_cast(k)); + } + + inline BND_DBL_FN float d_sqrt(float x) { return std::sqrt(x); } // correctly rounded + + // ln(x): frexp to m∈[½,1), rebalance to [√½,√2); ln = e·ln2 + 2·atanh(f), + // f = (m−1)/(m+1). Pre: x > 0. + inline BND_DBL_FN float d_log(float x) + { + int e; + float m = std::frexp(x, &e); + if (m < kSqrtHalf) { m += m; --e; } + float f = (m - 1.0f) / (m + 1.0f); + float f2 = f * f; + float p = 1.0f / 9.0f; + p = fma(p, f2, 1.0f / 7.0f); + p = fma(p, f2, 1.0f / 5.0f); + p = fma(p, f2, 1.0f / 3.0f); + p = fma(p, f2, 1.0f); + float logm = 2.0f * f * p; + float r = fma(static_cast(e), kLn2Hi, logm); + return fma(static_cast(e), kLn2Lo, r); + } + + // Compositions on the validated primitives (same shapes as the double engine). + inline BND_DBL_FN float d_exp2(float x) { return d_exp(x * kLn2Full); } + inline BND_DBL_FN float d_log2(float x) { return d_log(x) * kLog2e; } + inline BND_DBL_FN float d_log10(float x) { return d_log(x) * kLog10e; } + inline BND_DBL_FN float d_pow(float b, float e) { return d_exp(e * d_log(b)); } + inline BND_DBL_FN float d_cbrt(float x) + { + if (x == 0.0f) return 0.0f; + float m = d_exp(d_log(x < 0 ? -x : x) * (1.0f / 3.0f)); + return x < 0 ? -m : m; + } + inline BND_DBL_FN float d_sinh(float x) { float e = d_exp(x); return (e - 1.0f / e) * 0.5f; } + inline BND_DBL_FN float d_cosh(float x) { float e = d_exp(x); return (e + 1.0f / e) * 0.5f; } + inline BND_DBL_FN float d_tanh(float x) + { + float e = d_exp(x + x); // e^{2x} + return (e - 1.0f) / (e + 1.0f); + } + inline BND_DBL_FN float d_hypot(float x, float y) { return d_sqrt(x * x + y * y); } + + // atan(x): reduce |x|>1 via reciprocal; |a|>tan(π/12) via the π/6 addition + // formula → |t| ≤ tan(π/12); atan(t) = t·P(t²) Taylor. + inline BND_DBL_FN float d_atan(float x) + { + bool neg = x < 0; float a = neg ? -x : x; + bool inv = a > 1.0f; if (inv) a = 1.0f / a; + float off = 0.0f; + if (a > kTanPi12) { a = (a - kInvSqrt3) / fma(a, kInvSqrt3, 1.0f); off = kPiSixth; } + float z = a * a; + float p = 1.0f / 13.0f; + p = fma(p, z, -1.0f / 11.0f); p = fma(p, z, 1.0f / 9.0f); p = fma(p, z, -1.0f / 7.0f); + p = fma(p, z, 1.0f / 5.0f); p = fma(p, z, -1.0f / 3.0f); p = fma(p, z, 1.0f); + float r = off + a * p; + if (inv) r = kPiHalf - r; + return neg ? -r : r; + } + + inline BND_DBL_FN float d_atan2(float y, float x) + { + if (x > 0.0f) return d_atan(y / x); + if (x < 0.0f) return d_atan(y / x) + (y >= 0.0f ? kPi : -kPi); + if (y > 0.0f) return kPiHalf; + if (y < 0.0f) return -kPiHalf; + return 0.0f; + } + + inline BND_DBL_FN float d_asin(float x) { return d_atan(x / d_sqrt((1.0f - x) * (1.0f + x))); } + inline BND_DBL_FN float d_acos(float x) { return kPiHalf - d_asin(x); } +} // namespace bnd::math::flt::detail + +namespace bnd::math::flt +{ + // Engine cores: bound in → `float` math → bound out. Storing the float result: + // a `real` (double-backed) bound takes Out{double(f)} (widening is exact); a + // non-`real` snap grid assigns through the rational path, snapping via Out's + // round policy. (An f32-backed bound — Phase 4b — will store the float raw + // directly; until then it routes through the same paths.) + template + [[nodiscard]] BND_DBL_FN Out store(float f) + { + if constexpr (has_flag(BoundPolicy, real)) return Out{static_cast(f)}; + else { Out o{}; o = bnd::detail::rational{static_cast(f)}; return o; } + } + + template + [[nodiscard]] BND_DBL_FN Out sin_core(In x) { return store(detail::d_sin(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out cos_core(In x) { return store(detail::d_cos(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out exp_core(In x) { return store(detail::d_exp(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out sqrt_core(In x) { return store(detail::d_sqrt(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out log_core(In x) { return store(detail::d_log(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out exp2_core(In x) { return store(detail::d_exp2(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out log2_core(In x) { return store(detail::d_log2(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out log10_core(In x){ return store(detail::d_log10(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out cbrt_core(In x) { return store(detail::d_cbrt(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out sinh_core(In x) { return store(detail::d_sinh(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out cosh_core(In x) { return store(detail::d_cosh(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out tanh_core(In x) { return store(detail::d_tanh(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out atan_core(In x) { return store(detail::d_atan(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out asin_core(In x) { return store(detail::d_asin(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out acos_core(In x) { return store(detail::d_acos(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out atan2_core(In y, In x) + { return store(detail::d_atan2(static_cast(y), static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out hypot_core(InX x, InY y) + { return store(detail::d_hypot(static_cast(x), static_cast(y))); } +} // namespace bnd::math::flt + +#endif // !BND_MATH_NO_FP + +#endif // BNDcmathfloatHPP diff --git a/single_include/bound/bound.hpp b/single_include/bound/bound.hpp index e0f392d..aa4c8f3 100644 --- a/single_include/bound/bound.hpp +++ b/single_include/bound/bound.hpp @@ -7965,6 +7965,282 @@ namespace bnd::math::dbl #endif // !BND_MATH_NO_FP +// ====================================================================== +// bound/cmath_float.hpp +// ====================================================================== +//--------------------------------------------------------------------------- +//--------------------------------------------------------------------------- +// bnd::math float engine — a small, reproducible libm in `float` (binary32). +// Bit-identical on every IEEE-754 binary32 platform compiled without +// `-ffast-math`, via the same recipe as the double engine but in single +// precision, for single-precision-only FPUs (Cortex-M4F etc.) and size/speed: +// * NO transcendentals — own fixed polynomials evaluated with the +// correctly-rounded std::fma(float) (immune to FMA-contraction). +// * Cody-Waite range reduction whose hi/lo split constants are derived at +// compile time (mask the low mantissa bits of the float-rounded constant), +// no external codegen — honoring the bit-exact contract. +// * Correctly-rounded std::sqrt(float). +// Float is a THIRD value set: float ≠ double ≠ cordic, each ≤ a few notches of +// truth where the grid permits (table-maker's dilemma — see determinism.md). +// Present only when FP is available; compiled out under BND_MATH_NO_FP. +//--------------------------------------------------------------------------- + + +#ifndef BND_MATH_NO_FP + +#include // std::bit_cast (constexpr Cody-Waite split derivation) +#include // std::fma, std::sqrt, std::nearbyint, std::ldexp, std::frexp (float) + +namespace bnd::math::flt::detail +{ + using std::fma; + + // Constexpr Cody-Waite split of a high-precision (double) reference into a + // float `hi` with `keep` significant mantissa bits (low bits zeroed, so k·hi + // stays exact for modest k) plus a float `lo` carrying the remainder. Pure + // bit manipulation — identical on every IEEE-754 target. + inline constexpr float split_hi(double full, int keep) noexcept + { + float f = static_cast(full); + std::uint32_t b = std::bit_cast(f); + b &= ~((std::uint32_t{1} << (23 - keep)) - 1); + return std::bit_cast(b); + } + inline constexpr float split_lo(double full, int keep) noexcept + { + return static_cast(full - static_cast(split_hi(full, keep))); + } + + // High-precision references (double literals; only their float projections are + // used at runtime). 12 kept bits hold accuracy across the shared ±2^20 domain. + inline constexpr double kPiD = 3.14159265358979323846; + inline constexpr double kPiO2D = kPiD / 2; + inline constexpr double kLn2D = 0.69314718055994530942; + + inline constexpr float kPio2Hi = split_hi(kPiO2D, 12); + inline constexpr float kPio2Lo = split_lo(kPiO2D, 12); + inline constexpr float kTwoOverPi = static_cast(2.0 / kPiD); + inline constexpr float kLn2Hi = split_hi(kLn2D, 12); + inline constexpr float kLn2Lo = split_lo(kLn2D, 12); + inline constexpr float kLn2Full = static_cast(kLn2D); + inline constexpr float kLog2e = static_cast(1.44269504088896340736); + inline constexpr float kLog10e = static_cast(0.43429448190325182765); + inline constexpr float kSqrtHalf = 0x1.6a09e6p-1f; // √½ + inline constexpr float kPi = static_cast(kPiD); + inline constexpr float kPiHalf = static_cast(kPiO2D); + inline constexpr float kPiSixth = static_cast(kPiD / 6); + inline constexpr float kInvSqrt3 = 0x1.279a74p-1f; // 1/√3 = tan(π/6) + inline constexpr float kTanPi12 = 0x1.126146p-2f; // tan(π/12) + + // sin(r), r ∈ [−π/4, π/4]: r·P(r²) to r¹¹ (float-sufficient). + inline BND_DBL_FN float sin_poly(float r) + { + float z = r * r; + float p = -1.0f / 39916800.0f; // −1/11! + p = fma(p, z, 1.0f / 362880.0f); // 1/9! + p = fma(p, z, -1.0f / 5040.0f); // −1/7! + p = fma(p, z, 1.0f / 120.0f); // 1/5! + p = fma(p, z, -1.0f / 6.0f); // −1/3! + p = fma(p, z, 1.0f); // 1 + return r * p; + } + + // cos(r), r ∈ [−π/4, π/4]: Q(r²) to r¹⁰. + inline BND_DBL_FN float cos_poly(float r) + { + float z = r * r; + float p = -1.0f / 3628800.0f; // −1/10! + p = fma(p, z, 1.0f / 40320.0f); // 1/8! + p = fma(p, z, -1.0f / 720.0f); // −1/6! + p = fma(p, z, 1.0f / 24.0f); // 1/4! + p = fma(p, z, -1.0f / 2.0f); // −1/2! + p = fma(p, z, 1.0f); // 1 + return p; + } + + // e^r, r ∈ [−ln2/2, ln2/2]: Σ rᵏ/k! to r⁷. + inline BND_DBL_FN float exp_poly(float r) + { + float p = 1.0f / 5040.0f; // 1/7! + p = fma(p, r, 1.0f / 720.0f); // 1/6! + p = fma(p, r, 1.0f / 120.0f); // 1/5! + p = fma(p, r, 1.0f / 24.0f); // 1/4! + p = fma(p, r, 1.0f / 6.0f); // 1/3! + p = fma(p, r, 1.0f / 2.0f); // 1/2! + p = fma(p, r, 1.0f); // 1/1! + p = fma(p, r, 1.0f); // 1 + return p; + } + + // Shared quadrant reduction: x → (r ∈ [−π/4,π/4], q = quadrant mod 4). + inline BND_DBL_FN float reduce_quadrant(float x, long& q) + { + float k = std::nearbyint(x * kTwoOverPi); + float r = fma(-k, kPio2Hi, x); + r = fma(-k, kPio2Lo, r); + q = static_cast(k) & 3; + return r; + } + + inline BND_DBL_FN float d_sin(float x) + { + long q; float r = reduce_quadrant(x, q); + switch (q) { + case 0: return sin_poly(r); + case 1: return cos_poly(r); + case 2: return -sin_poly(r); + default: return -cos_poly(r); + } + } + + // cos via quadrant reduction (NOT sin(x+π/2): shifting the float input would + // lose bits for large x — the double engine can afford that, float cannot). + inline BND_DBL_FN float d_cos(float x) + { + long q; float r = reduce_quadrant(x, q); + switch (q) { + case 0: return cos_poly(r); + case 1: return -sin_poly(r); + case 2: return -cos_poly(r); + default: return sin_poly(r); + } + } + + // e^x = 2^k · e^r, x = k·ln2 + r, r ∈ [−ln2/2, ln2/2]. + inline BND_DBL_FN float d_exp(float x) + { + float k = std::nearbyint(x * kLog2e); + float r = fma(-k, kLn2Hi, x); + r = fma(-k, kLn2Lo, r); + return std::ldexp(exp_poly(r), static_cast(k)); + } + + inline BND_DBL_FN float d_sqrt(float x) { return std::sqrt(x); } // correctly rounded + + // ln(x): frexp to m∈[½,1), rebalance to [√½,√2); ln = e·ln2 + 2·atanh(f), + // f = (m−1)/(m+1). Pre: x > 0. + inline BND_DBL_FN float d_log(float x) + { + int e; + float m = std::frexp(x, &e); + if (m < kSqrtHalf) { m += m; --e; } + float f = (m - 1.0f) / (m + 1.0f); + float f2 = f * f; + float p = 1.0f / 9.0f; + p = fma(p, f2, 1.0f / 7.0f); + p = fma(p, f2, 1.0f / 5.0f); + p = fma(p, f2, 1.0f / 3.0f); + p = fma(p, f2, 1.0f); + float logm = 2.0f * f * p; + float r = fma(static_cast(e), kLn2Hi, logm); + return fma(static_cast(e), kLn2Lo, r); + } + + // Compositions on the validated primitives (same shapes as the double engine). + inline BND_DBL_FN float d_exp2(float x) { return d_exp(x * kLn2Full); } + inline BND_DBL_FN float d_log2(float x) { return d_log(x) * kLog2e; } + inline BND_DBL_FN float d_log10(float x) { return d_log(x) * kLog10e; } + inline BND_DBL_FN float d_pow(float b, float e) { return d_exp(e * d_log(b)); } + inline BND_DBL_FN float d_cbrt(float x) + { + if (x == 0.0f) return 0.0f; + float m = d_exp(d_log(x < 0 ? -x : x) * (1.0f / 3.0f)); + return x < 0 ? -m : m; + } + inline BND_DBL_FN float d_sinh(float x) { float e = d_exp(x); return (e - 1.0f / e) * 0.5f; } + inline BND_DBL_FN float d_cosh(float x) { float e = d_exp(x); return (e + 1.0f / e) * 0.5f; } + inline BND_DBL_FN float d_tanh(float x) + { + float e = d_exp(x + x); // e^{2x} + return (e - 1.0f) / (e + 1.0f); + } + inline BND_DBL_FN float d_hypot(float x, float y) { return d_sqrt(x * x + y * y); } + + // atan(x): reduce |x|>1 via reciprocal; |a|>tan(π/12) via the π/6 addition + // formula → |t| ≤ tan(π/12); atan(t) = t·P(t²) Taylor. + inline BND_DBL_FN float d_atan(float x) + { + bool neg = x < 0; float a = neg ? -x : x; + bool inv = a > 1.0f; if (inv) a = 1.0f / a; + float off = 0.0f; + if (a > kTanPi12) { a = (a - kInvSqrt3) / fma(a, kInvSqrt3, 1.0f); off = kPiSixth; } + float z = a * a; + float p = 1.0f / 13.0f; + p = fma(p, z, -1.0f / 11.0f); p = fma(p, z, 1.0f / 9.0f); p = fma(p, z, -1.0f / 7.0f); + p = fma(p, z, 1.0f / 5.0f); p = fma(p, z, -1.0f / 3.0f); p = fma(p, z, 1.0f); + float r = off + a * p; + if (inv) r = kPiHalf - r; + return neg ? -r : r; + } + + inline BND_DBL_FN float d_atan2(float y, float x) + { + if (x > 0.0f) return d_atan(y / x); + if (x < 0.0f) return d_atan(y / x) + (y >= 0.0f ? kPi : -kPi); + if (y > 0.0f) return kPiHalf; + if (y < 0.0f) return -kPiHalf; + return 0.0f; + } + + inline BND_DBL_FN float d_asin(float x) { return d_atan(x / d_sqrt((1.0f - x) * (1.0f + x))); } + inline BND_DBL_FN float d_acos(float x) { return kPiHalf - d_asin(x); } +} // namespace bnd::math::flt::detail + +namespace bnd::math::flt +{ + // Engine cores: bound in → `float` math → bound out. Storing the float result: + // a `real` (double-backed) bound takes Out{double(f)} (widening is exact); a + // non-`real` snap grid assigns through the rational path, snapping via Out's + // round policy. (An f32-backed bound — Phase 4b — will store the float raw + // directly; until then it routes through the same paths.) + template + [[nodiscard]] BND_DBL_FN Out store(float f) + { + if constexpr (has_flag(BoundPolicy, real)) return Out{static_cast(f)}; + else { Out o{}; o = bnd::detail::rational{static_cast(f)}; return o; } + } + + template + [[nodiscard]] BND_DBL_FN Out sin_core(In x) { return store(detail::d_sin(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out cos_core(In x) { return store(detail::d_cos(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out exp_core(In x) { return store(detail::d_exp(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out sqrt_core(In x) { return store(detail::d_sqrt(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out log_core(In x) { return store(detail::d_log(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out exp2_core(In x) { return store(detail::d_exp2(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out log2_core(In x) { return store(detail::d_log2(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out log10_core(In x){ return store(detail::d_log10(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out cbrt_core(In x) { return store(detail::d_cbrt(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out sinh_core(In x) { return store(detail::d_sinh(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out cosh_core(In x) { return store(detail::d_cosh(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out tanh_core(In x) { return store(detail::d_tanh(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out atan_core(In x) { return store(detail::d_atan(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out asin_core(In x) { return store(detail::d_asin(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out acos_core(In x) { return store(detail::d_acos(static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out atan2_core(In y, In x) + { return store(detail::d_atan2(static_cast(y), static_cast(x))); } + template + [[nodiscard]] BND_DBL_FN Out hypot_core(InX x, InY y) + { return store(detail::d_hypot(static_cast(x), static_cast(y))); } +} // namespace bnd::math::flt + +#endif // !BND_MATH_NO_FP + + // The public bnd::math::* functions dispatch to the double engine (default) or @@ -10187,6 +10463,148 @@ namespace bnd::math return slim::expected{store(r)}; } } // namespace dbl + + namespace flt + { + // Public-shaped float-engine entry points — binary32 compute, same shapes as + // dbl:: (qualify shared helpers as bnd::math::detail::; *_core/store/d_* are + // this namespace's own). A third value set (float ≠ double ≠ cordic). + template + requires (Lower == bnd::detail::rational{0}) + [[nodiscard]] BND_DBL_FN auto sqrt(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return sqrt_core>(x); } + + template + requires (Lower < bnd::detail::rational{0}) + [[nodiscard]] BND_DBL_FN auto sqrt(In x) noexcept + { + static_assert(bnd::math::detail::require_snap()); + using Out = bnd::math::detail::sqrt_signed_auto_t; + float v = static_cast(x); + if (v < 0.0f) + return slim::expected{slim::unexpected(errc::domain_error)}; + return slim::expected{store(detail::d_sqrt(v))}; + } + + template + [[nodiscard]] BND_DBL_FN auto exp2(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return exp2_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto log2(In x) noexcept + { + static_assert(bnd::math::detail::require_snap()); + static_assert(Lower > 0, "bnd::math::flt::log2: input must be strictly positive"); + return log2_core>(x); + } + + template + [[nodiscard]] BND_DBL_FN auto exp(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return exp_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto log(In x) noexcept + { + static_assert(bnd::math::detail::require_snap()); + static_assert(Lower > 0, "bnd::math::flt::log: input must be strictly positive"); + return log_core>(x); + } + + template + [[nodiscard]] BND_DBL_FN auto pow_base(In x) noexcept + { + static_assert(bnd::math::detail::require_snap()); + using Out = bnd::math::detail::pow_base_auto_t; + return store(detail::d_pow(static_cast(Base), static_cast(x))); + } + + template + [[nodiscard]] BND_DBL_FN auto sin(In angle) noexcept + { static_assert(bnd::math::detail::require_snap()); return sin_core>(angle); } + + template + [[nodiscard]] BND_DBL_FN auto cos(In angle) noexcept + { static_assert(bnd::math::detail::require_snap()); return cos_core>(angle); } + + template + [[nodiscard]] BND_DBL_FN auto tan(In angle) noexcept + { + static_assert(bnd::math::detail::require_snap()); + using Out = bnd::math::detail::tan_auto_t; + float x = static_cast(angle); + float c = detail::d_cos(x); + if (c == 0.0f) + return slim::expected{slim::unexpected(errc::division_by_zero)}; + float t = detail::d_sin(x) / c; + if constexpr (!has_flag(BoundPolicy, clamp)) // clamp Out: saturate below + if (t < static_cast(Lower) || t > static_cast(Upper)) + return slim::expected{slim::unexpected(errc::overflow)}; + return slim::expected{store(t)}; + } + + template + [[nodiscard]] BND_DBL_FN auto atan2(In y, In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return atan2_core>(y, x); } + + template + [[nodiscard]] BND_DBL_FN auto atan(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return atan_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto asin(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return asin_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto acos(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return acos_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto sinh(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return sinh_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto cosh(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return cosh_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto tanh(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return tanh_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto log10(In x) noexcept + { + static_assert(bnd::math::detail::require_snap()); + static_assert(Lower > 0, "bnd::math::flt::log10: input must be strictly positive"); + return log10_core>(x); + } + + template + [[nodiscard]] BND_DBL_FN auto cbrt(In x) noexcept + { static_assert(bnd::math::detail::require_snap()); return cbrt_core>(x); } + + template + [[nodiscard]] BND_DBL_FN auto hypot(InX x, InY y) noexcept + { + static_assert(bnd::math::detail::require_snap() && bnd::math::detail::require_snap()); + return hypot_core>(x, y); + } + + template + requires (Lower > bnd::detail::rational{0}) + [[nodiscard]] BND_DBL_FN auto pow(InB base, InE exp) noexcept + { + static_assert(bnd::math::detail::require_snap() && bnd::math::detail::require_snap()); + using Out = bnd::math::detail::pow_auto_t; + float b = static_cast(base); + if (b <= 0.0f) + return slim::expected{slim::unexpected(errc::domain_error)}; + float r = detail::d_pow(b, static_cast(exp)); + if constexpr (!has_flag(BoundPolicy, clamp)) // clamp Out: saturate below + if (r < static_cast(Lower) || r > static_cast(Upper)) + return slim::expected{slim::unexpected(errc::overflow)}; + return slim::expected{store(r)}; + } + } // namespace flt #endif // !BND_MATH_NO_FP } diff --git a/tests/test_math_engines.cpp b/tests/test_math_engines.cpp index dbedb50..6fec970 100644 --- a/tests/test_math_engines.cpp +++ b/tests/test_math_engines.cpp @@ -65,6 +65,59 @@ TEST_CASE("double engine is callable side-by-side and agrees on special values", REQUIRE(rational{*p} == 16); } +TEST_CASE("float engine is callable side-by-side and agrees on special values", + "[cmath][engines][flt]") +{ + REQUIRE(rational{math::flt::sin(Ang{0})} == 0); + REQUIRE(rational{math::flt::cos(Ang{0})} == 1); + REQUIRE(rational{math::flt::atan(Ang{0})} == 0); + REQUIRE(rational{math::flt::sinh(Ang{0})} == 0); + REQUIRE(rational{math::flt::cosh(Ang{0})} == 1); + REQUIRE(rational{math::flt::tanh(Ang{0})} == 0); + REQUIRE(rational{math::flt::sqrt(Sq{4})} == 2); + REQUIRE(rational{math::flt::cbrt(Ang{8})} == 2); + REQUIRE(rational{math::flt::log10(Pos{100})} == 2); + REQUIRE(rational{math::flt::exp(Ang{0})} == 1); + + auto p = math::flt::pow(Pos{2}, Ang{4}); + REQUIRE(p.has_value()); + REQUIRE(rational{*p} == 16); +} + +// Golden pins for the float engine: the EXACT grid-snapped rational the binary32 +// engine must produce, bit-for-bit, on every IEEE-754 binary32 platform. These +// are a THIRD value set (float ≠ double ≠ cordic); regenerate only on a +// deliberate engine change. Grid: notch 1/16384 real (Ang/Pos/Sq above). +#define EXACT_FLT(expr, N, D) REQUIRE(rational{(expr)} == rational{N, D}) + +TEST_CASE("float engine golden pins are bit-exact (determinism)", + "[cmath][engines][flt][determinism]") +{ + EXACT_FLT(math::flt::sin(Ang{1}), 13787, 16384); + EXACT_FLT(math::flt::cos(Ang{1}), 2213, 4096); + EXACT_FLT(math::flt::atan(Ang{1}), 3217, 4096); + EXACT_FLT(math::flt::exp(Ang{2}), 60531, 8192); + EXACT_FLT(math::flt::log(Pos{10}), 18863, 8192); + EXACT_FLT(math::flt::log10(Pos{50}), 6959, 4096); + EXACT_FLT(math::flt::sqrt(Sq{2}), 11585, 8192); + EXACT_FLT(math::flt::cbrt(Ang{2}), 20643, 16384); + EXACT_FLT(math::flt::sinh(Ang{2}), 29711, 8192); + EXACT_FLT(math::flt::tanh(Ang{1}), 6239, 8192); +} + +TEST_CASE("all three engines coexist in one binary and meet at exact points", + "[cmath][engines][mix]") +{ + // Phase-4 property: cordic + dbl + flt all instantiated in the same TU. + REQUIRE(rational{math::flt::sqrt(Sq{4})} == rational{math::dbl::sqrt(Sq{4})}); + REQUIRE(rational{math::flt::cos(Ang{0})} == rational{math::cordic::cos(Ang{0})}); + // A pole errors through the expected channel under the float engine too. + using TanAng = bound<{{-2, 2}, notch<1, 4096>}, round_nearest | real>; + auto tf = math::flt::tan(TanAng{0}); + REQUIRE(tf.has_value()); + REQUIRE(rational{*tf} == 0); +} + TEST_CASE("both engines coexist in one binary and meet at exact points", "[cmath][engines][mix]") {