From 4a08f868ca840c1f297967ae63a12f14988d1e0a Mon Sep 17 00:00:00 2001
From: Peter Neiss <neiss@plasus.de>
Date: Mon, 22 Jun 2026 20:38:04 +0200
Subject: [PATCH] math: add the flt (binary32) engine, callable as
 bnd::math::flt::
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A third math engine alongside cordic (integer) and dbl (binary64): a small
reproducible libm in `float`, for single-precision-only FPUs (Cortex-M4F etc.)
and size/speed where double-grade precision isn't needed.

  bnd::math::flt::fn   binary32 compute, present unless BND_MATH_NO_FP

Same recipe as the double engine but in single precision (new cmath_float.hpp):
own fixed polynomials evaluated with the correctly-rounded std::fma(float), and
Cody-Waite range-reduction constants DERIVED AT COMPILE TIME (mask the low
mantissa bits of the float-rounded constant — no external codegen), plus the
correctly-rounded std::sqrt(float). So flt is bit-identical on every IEEE-754
binary32 platform — the same determinism contract as dbl.

Design notes:
- Empirically validated the float cores against libm: trig ~0.6 ULP, atan ~0.8,
  log ~1, exp a few ULP, holding across the SHARED +-2^20 domain (the constexpr
  split keeps float reduction float-grade even at the edge). So flt reuses the
  same input envelopes — the same programs compile on every engine. (cos uses
  proper quadrant reduction, not sin(x+pi/2): shifting the float input loses bits.)
- A third value set: float != double != cordic; snapped results differ by up to
  a few notches on fine grids, coincide on coarse grids and exact inputs.
- flt:: has the full public-shaped API mirroring dbl:: (auto-deduced grids,
  domain static_asserts, expected-returning tan/sqrt-signed/pow). Gated behind
  !BND_MATH_NO_FP; a flt:: call under the FP-free build is a compile error.
- flt::store widens float->double for `real` (f64-backed) bounds or snaps via the
  rational path otherwise (an f32-backed storage fast path comes with Phase 4b).

Tests: test_math_engines.cpp now instantiates cordic+dbl+flt in one TU, checks
they agree on exact inputs, and pins 10 float-engine golden values (determinism).
Docs updated (three-engine table + flt section). Single header regenerated.

Deferred to Phase 4b: BND_MATH_FLOAT to make flt the unqualified default, and
the f32 storage flag + real->f64 rename.

Verified: default 404/404, CORDIC 442/442, all single-header smokes build.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docs/math.md                   |  41 +++-
 include/bound/cmath.hpp        | 145 +++++++++++-
 include/bound/cmath_float.hpp  | 277 ++++++++++++++++++++++
 single_include/bound/bound.hpp | 418 +++++++++++++++++++++++++++++++++
 tests/test_math_engines.cpp    |  53 +++++
 5 files changed, 924 insertions(+), 10 deletions(-)
 create mode 100644 include/bound/cmath_float.hpp

diff --git a/docs/math.md b/docs/math.md
index 39e6b1b..87dd032 100644
--- a/docs/math.md
+++ b/docs/math.md
@@ -215,16 +215,17 @@ FPU-free build. The default double engine is the right choice everywhere
 else: it carries the same grid guarantees and is reproducible across IEEE-754
 platforms compiled without `-ffast-math`.
 
-## Choosing an engine per call (`cordic::` / `dbl::`)
+## Choosing an engine per call (`cordic::` / `dbl::` / `flt::`)
 
-The unqualified `bnd::math::fn` uses the build's default engine. Both engines are
-also reachable by name, **callable side-by-side in the same binary**:
+The unqualified `bnd::math::fn` uses the build's default engine. All three engines
+are also reachable by name, **callable side-by-side in the same binary**:
 
 | Namespace | Engine | Availability |
 |---|---|---|
 | `bnd::math::cordic::fn` | integer / CORDIC | **always** (constexpr, FPU-free) |
-| `bnd::math::dbl::fn` | `double` | unless `BND_MATH_NO_FP` |
-| `bnd::math::fn` | the default | alias of `cordic` under `BND_MATH_FIXED`/`BND_MATH_NO_FP`, else `dbl` |
+| `bnd::math::dbl::fn` | `double` (binary64) | unless `BND_MATH_NO_FP` |
+| `bnd::math::flt::fn` | `float` (binary32) | unless `BND_MATH_NO_FP` |
+| `bnd::math::fn` | the default | `cordic` under `BND_MATH_FIXED`/`BND_MATH_NO_FP`, else `dbl` |
 
 The qualified entry points have the **same signatures, domains, auto-deduced
 output grids, and domain `static_assert`s** as the unqualified one — only the
@@ -235,14 +236,36 @@ using A = bound<{{-8, 8}, notch<1, 16384>}, round_nearest | real>;
 
 auto a = math::cordic::sin(A{1});   // bit-exact across every target — replay/sim
 auto b = math::dbl::sin(A{1});      // ~2× faster — hot, accuracy-insensitive path
+auto f = math::flt::sin(A{1});      // binary32 — single-precision FPUs (Cortex-M4F)
 auto c = math::sin(A{1});           // whichever the build selected
 ```
 
-Because the engines are independent approximations, `cordic::fn` and `dbl::fn`
-can disagree by up to one notch on rounding ties (the table-maker's dilemma — see
+Because the engines are independent approximations, they can disagree by a notch
+or two on rounding ties (the table-maker's dilemma — see
 [determinism.md](determinism.md)); algebraically-exact inputs (e.g. `sqrt(4)`,
-`pow(2,4)`) land identically. Under `BND_MATH_NO_FP` the `dbl::` namespace is not
-defined, so a `dbl::` call there is a compile error; `cordic::` always works.
+`pow(2,4)`) land identically on all three. Under `BND_MATH_NO_FP` neither `dbl::`
+nor `flt::` is defined, so a call to either there is a compile error; `cordic::`
+always works.
+
+### The `flt` (binary32) engine
+
+`flt::` evaluates the same fixed polynomials as `dbl::` but in single precision,
+with its own compile-time-derived Cody-Waite range-reduction constants and the
+correctly-rounded `std::fma(float)`/`std::sqrt(float)` — so it is **bit-identical
+on every IEEE-754 binary32 platform** (same determinism contract as `dbl`). It
+exists for **single-precision-only FPUs** (Cortex-M4F and similar) and for
+size/speed where double-grade precision isn't needed.
+
+- It is a **third value set**: `float ≠ double ≠ cordic`. Snapped results differ
+  from the double engine by up to a few notches on fine grids; on coarse grids
+  (notch ≫ binary32 ULP) they typically coincide.
+- It keeps the **shared input domain** (e.g. `sin`/`cos` over `|x| ≤ 2²⁰`): the
+  constexpr split holds float reduction across that range (precision degrades
+  toward the edge but stays float-grade), so the same programs compile on every
+  engine.
+- Precision: trig ≈ 1 ULP of `float`, `exp`/compositions a handful of ULP, then
+  quantized onto the output grid. Ships its own golden pins
+  (`tests/test_math_engines.cpp`).
 
 ## Compiling without floating point (`BND_MATH_NO_FP`)
 
diff --git a/include/bound/cmath.hpp b/include/bound/cmath.hpp
index 20f3dce..8d1b2d0 100644
--- a/include/bound/cmath.hpp
+++ b/include/bound/cmath.hpp
@@ -5,7 +5,8 @@
 #define BNDcmathHPP
 
 #include "bound/bound.hpp"
-#include "bound/cmath_double.hpp"   // the default (double) math engine
+#include "bound/cmath_double.hpp"   // the double (binary64) math engine
+#include "bound/cmath_float.hpp"    // the float  (binary32) math engine
 
 #include "slim/expected.hpp"     // slim::expected, slim::unexpected
 
@@ -2232,6 +2233,148 @@ namespace bnd::math
       return slim::expected<Out, errc>{store<Out>(r)};
     }
   } // namespace dbl
+
+  namespace flt
+  {
+    // Public-shaped float-engine entry points — binary32 compute, same shapes as
+    // dbl:: (qualify shared helpers as bnd::math::detail::; *_core/store/d_* are
+    // this namespace's own). A third value set (float ≠ double ≠ cordic).
+    template <boundable In>
+      requires (Lower<In> == bnd::detail::rational{0})
+    [[nodiscard]] BND_DBL_FN auto sqrt(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return sqrt_core<bnd::math::detail::sqrt_auto_t<In>>(x); }
+
+    template <boundable In>
+      requires (Lower<In> < bnd::detail::rational{0})
+    [[nodiscard]] BND_DBL_FN auto sqrt(In x) noexcept
+    {
+      static_assert(bnd::math::detail::require_snap<In>());
+      using Out = bnd::math::detail::sqrt_signed_auto_t<In>;
+      float v = static_cast<float>(x);
+      if (v < 0.0f)
+        return slim::expected<Out, errc>{slim::unexpected(errc::domain_error)};
+      return slim::expected<Out, errc>{store<Out>(detail::d_sqrt(v))};
+    }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto exp2(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return exp2_core<bnd::math::detail::exp2_auto_t<In>>(x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto log2(In x) noexcept
+    {
+      static_assert(bnd::math::detail::require_snap<In>());
+      static_assert(Lower<In> > 0, "bnd::math::flt::log2: input must be strictly positive");
+      return log2_core<bnd::math::detail::log2_auto_t<In>>(x);
+    }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto exp(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return exp_core<bnd::math::detail::exp_auto_t<In>>(x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto log(In x) noexcept
+    {
+      static_assert(bnd::math::detail::require_snap<In>());
+      static_assert(Lower<In> > 0, "bnd::math::flt::log: input must be strictly positive");
+      return log_core<bnd::math::detail::log_auto_t<In>>(x);
+    }
+
+    template <imax Base, boundable In>
+    [[nodiscard]] BND_DBL_FN auto pow_base(In x) noexcept
+    {
+      static_assert(bnd::math::detail::require_snap<In>());
+      using Out = bnd::math::detail::pow_base_auto_t<Base, In>;
+      return store<Out>(detail::d_pow(static_cast<float>(Base), static_cast<float>(x)));
+    }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto sin(In angle) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return sin_core<bnd::math::detail::sin_auto_t<In>>(angle); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto cos(In angle) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return cos_core<bnd::math::detail::cos_auto_t<In>>(angle); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto tan(In angle) noexcept
+    {
+      static_assert(bnd::math::detail::require_snap<In>());
+      using Out = bnd::math::detail::tan_auto_t<In>;
+      float x = static_cast<float>(angle);
+      float c = detail::d_cos(x);
+      if (c == 0.0f)
+        return slim::expected<Out, errc>{slim::unexpected(errc::division_by_zero)};
+      float t = detail::d_sin(x) / c;
+      if constexpr (!has_flag(BoundPolicy<Out>, clamp))   // clamp Out: saturate below
+        if (t < static_cast<float>(Lower<Out>) || t > static_cast<float>(Upper<Out>))
+          return slim::expected<Out, errc>{slim::unexpected(errc::overflow)};
+      return slim::expected<Out, errc>{store<Out>(t)};
+    }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto atan2(In y, In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return atan2_core<bnd::math::detail::atan2_auto_t<In>>(y, x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto atan(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return atan_core<bnd::math::detail::atan_auto_t<In>>(x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto asin(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return asin_core<bnd::math::detail::asin_auto_t<In>>(x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto acos(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return acos_core<bnd::math::detail::acos_auto_t<In>>(x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto sinh(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return sinh_core<bnd::math::detail::sinh_auto_t<In>>(x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto cosh(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return cosh_core<bnd::math::detail::cosh_auto_t<In>>(x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto tanh(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return tanh_core<bnd::math::detail::tanh_auto_t<In>>(x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto log10(In x) noexcept
+    {
+      static_assert(bnd::math::detail::require_snap<In>());
+      static_assert(Lower<In> > 0, "bnd::math::flt::log10: input must be strictly positive");
+      return log10_core<bnd::math::detail::log10_auto_t<In>>(x);
+    }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto cbrt(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return cbrt_core<bnd::math::detail::cbrt_auto_t<In>>(x); }
+
+    template <boundable InX, boundable InY>
+    [[nodiscard]] BND_DBL_FN auto hypot(InX x, InY y) noexcept
+    {
+      static_assert(bnd::math::detail::require_snap<InX>() && bnd::math::detail::require_snap<InY>());
+      return hypot_core<bnd::math::detail::hypot_auto_t<InX, InY>>(x, y);
+    }
+
+    template <boundable InB, boundable InE>
+      requires (Lower<InB> > bnd::detail::rational{0})
+    [[nodiscard]] BND_DBL_FN auto pow(InB base, InE exp) noexcept
+    {
+      static_assert(bnd::math::detail::require_snap<InB>() && bnd::math::detail::require_snap<InE>());
+      using Out = bnd::math::detail::pow_auto_t<InB, InE>;
+      float b = static_cast<float>(base);
+      if (b <= 0.0f)
+        return slim::expected<Out, errc>{slim::unexpected(errc::domain_error)};
+      float r = detail::d_pow(b, static_cast<float>(exp));
+      if constexpr (!has_flag(BoundPolicy<Out>, clamp))   // clamp Out: saturate below
+        if (r < static_cast<float>(Lower<Out>) || r > static_cast<float>(Upper<Out>))
+          return slim::expected<Out, errc>{slim::unexpected(errc::overflow)};
+      return slim::expected<Out, errc>{store<Out>(r)};
+    }
+  } // namespace flt
 #endif // !BND_MATH_NO_FP
 }
 
diff --git a/include/bound/cmath_float.hpp b/include/bound/cmath_float.hpp
new file mode 100644
index 0000000..abb77e5
--- /dev/null
+++ b/include/bound/cmath_float.hpp
@@ -0,0 +1,277 @@
+//---------------------------------------------------------------------------
+// Copyright (C) 2026 Peter Neiss
+//---------------------------------------------------------------------------
+// bnd::math float engine — a small, reproducible libm in `float` (binary32).
+// Bit-identical on every IEEE-754 binary32 platform compiled without
+// `-ffast-math`, via the same recipe as the double engine but in single
+// precision, for single-precision-only FPUs (Cortex-M4F etc.) and size/speed:
+//   * NO <cmath> transcendentals — own fixed polynomials evaluated with the
+//     correctly-rounded std::fma(float) (immune to FMA-contraction).
+//   * Cody-Waite range reduction whose hi/lo split constants are derived at
+//     compile time (mask the low mantissa bits of the float-rounded constant),
+//     no external codegen — honoring the bit-exact contract.
+//   * Correctly-rounded std::sqrt(float).
+// Float is a THIRD value set: float ≠ double ≠ cordic, each ≤ a few notches of
+// truth where the grid permits (table-maker's dilemma — see determinism.md).
+// Present only when FP is available; compiled out under BND_MATH_NO_FP.
+//---------------------------------------------------------------------------
+#ifndef BNDcmathfloatHPP
+#define BNDcmathfloatHPP
+
+#include "bound/cmath_double.hpp"  // resolves BND_MATH_NO_FP; BND_DBL_FN; store base
+
+#ifndef BND_MATH_NO_FP
+
+#include <bit>     // std::bit_cast (constexpr Cody-Waite split derivation)
+#include <cmath>   // std::fma, std::sqrt, std::nearbyint, std::ldexp, std::frexp (float)
+
+namespace bnd::math::flt::detail
+{
+  using std::fma;
+
+  // Constexpr Cody-Waite split of a high-precision (double) reference into a
+  // float `hi` with `keep` significant mantissa bits (low bits zeroed, so k·hi
+  // stays exact for modest k) plus a float `lo` carrying the remainder. Pure
+  // bit manipulation — identical on every IEEE-754 target.
+  inline constexpr float split_hi(double full, int keep) noexcept
+  {
+    float f = static_cast<float>(full);
+    std::uint32_t b = std::bit_cast<std::uint32_t>(f);
+    b &= ~((std::uint32_t{1} << (23 - keep)) - 1);
+    return std::bit_cast<float>(b);
+  }
+  inline constexpr float split_lo(double full, int keep) noexcept
+  {
+    return static_cast<float>(full - static_cast<double>(split_hi(full, keep)));
+  }
+
+  // High-precision references (double literals; only their float projections are
+  // used at runtime). 12 kept bits hold accuracy across the shared ±2^20 domain.
+  inline constexpr double kPiD    = 3.14159265358979323846;
+  inline constexpr double kPiO2D  = kPiD / 2;
+  inline constexpr double kLn2D   = 0.69314718055994530942;
+
+  inline constexpr float kPio2Hi    = split_hi(kPiO2D, 12);
+  inline constexpr float kPio2Lo    = split_lo(kPiO2D, 12);
+  inline constexpr float kTwoOverPi  = static_cast<float>(2.0 / kPiD);
+  inline constexpr float kLn2Hi     = split_hi(kLn2D, 12);
+  inline constexpr float kLn2Lo     = split_lo(kLn2D, 12);
+  inline constexpr float kLn2Full   = static_cast<float>(kLn2D);
+  inline constexpr float kLog2e     = static_cast<float>(1.44269504088896340736);
+  inline constexpr float kLog10e    = static_cast<float>(0.43429448190325182765);
+  inline constexpr float kSqrtHalf  = 0x1.6a09e6p-1f;             // √½
+  inline constexpr float kPi        = static_cast<float>(kPiD);
+  inline constexpr float kPiHalf    = static_cast<float>(kPiO2D);
+  inline constexpr float kPiSixth   = static_cast<float>(kPiD / 6);
+  inline constexpr float kInvSqrt3  = 0x1.279a74p-1f;             // 1/√3 = tan(π/6)
+  inline constexpr float kTanPi12   = 0x1.126146p-2f;             // tan(π/12)
+
+  // sin(r), r ∈ [−π/4, π/4]: r·P(r²) to r¹¹ (float-sufficient).
+  inline BND_DBL_FN float sin_poly(float r)
+  {
+    float z = r * r;
+    float p = -1.0f / 39916800.0f;       // −1/11!
+    p = fma(p, z,  1.0f / 362880.0f);    //  1/9!
+    p = fma(p, z, -1.0f / 5040.0f);      // −1/7!
+    p = fma(p, z,  1.0f / 120.0f);       //  1/5!
+    p = fma(p, z, -1.0f / 6.0f);         // −1/3!
+    p = fma(p, z,  1.0f);                //  1
+    return r * p;
+  }
+
+  // cos(r), r ∈ [−π/4, π/4]: Q(r²) to r¹⁰.
+  inline BND_DBL_FN float cos_poly(float r)
+  {
+    float z = r * r;
+    float p = -1.0f / 3628800.0f;        // −1/10!
+    p = fma(p, z,  1.0f / 40320.0f);     //  1/8!
+    p = fma(p, z, -1.0f / 720.0f);       // −1/6!
+    p = fma(p, z,  1.0f / 24.0f);        //  1/4!
+    p = fma(p, z, -1.0f / 2.0f);         // −1/2!
+    p = fma(p, z,  1.0f);                //  1
+    return p;
+  }
+
+  // e^r, r ∈ [−ln2/2, ln2/2]: Σ rᵏ/k! to r⁷.
+  inline BND_DBL_FN float exp_poly(float r)
+  {
+    float p = 1.0f / 5040.0f;            // 1/7!
+    p = fma(p, r, 1.0f / 720.0f);        // 1/6!
+    p = fma(p, r, 1.0f / 120.0f);        // 1/5!
+    p = fma(p, r, 1.0f / 24.0f);         // 1/4!
+    p = fma(p, r, 1.0f / 6.0f);          // 1/3!
+    p = fma(p, r, 1.0f / 2.0f);          // 1/2!
+    p = fma(p, r, 1.0f);                 // 1/1!
+    p = fma(p, r, 1.0f);                 // 1
+    return p;
+  }
+
+  // Shared quadrant reduction: x → (r ∈ [−π/4,π/4], q = quadrant mod 4).
+  inline BND_DBL_FN float reduce_quadrant(float x, long& q)
+  {
+    float k = std::nearbyint(x * kTwoOverPi);
+    float r = fma(-k, kPio2Hi, x);
+    r = fma(-k, kPio2Lo, r);
+    q = static_cast<long>(k) & 3;
+    return r;
+  }
+
+  inline BND_DBL_FN float d_sin(float x)
+  {
+    long q; float r = reduce_quadrant(x, q);
+    switch (q) {
+      case 0:  return sin_poly(r);
+      case 1:  return cos_poly(r);
+      case 2:  return -sin_poly(r);
+      default: return -cos_poly(r);
+    }
+  }
+
+  // cos via quadrant reduction (NOT sin(x+π/2): shifting the float input would
+  // lose bits for large x — the double engine can afford that, float cannot).
+  inline BND_DBL_FN float d_cos(float x)
+  {
+    long q; float r = reduce_quadrant(x, q);
+    switch (q) {
+      case 0:  return cos_poly(r);
+      case 1:  return -sin_poly(r);
+      case 2:  return -cos_poly(r);
+      default: return sin_poly(r);
+    }
+  }
+
+  // e^x = 2^k · e^r, x = k·ln2 + r, r ∈ [−ln2/2, ln2/2].
+  inline BND_DBL_FN float d_exp(float x)
+  {
+    float k = std::nearbyint(x * kLog2e);
+    float r = fma(-k, kLn2Hi, x);
+    r = fma(-k, kLn2Lo, r);
+    return std::ldexp(exp_poly(r), static_cast<int>(k));
+  }
+
+  inline BND_DBL_FN float d_sqrt(float x) { return std::sqrt(x); }   // correctly rounded
+
+  // ln(x): frexp to m∈[½,1), rebalance to [√½,√2); ln = e·ln2 + 2·atanh(f),
+  // f = (m−1)/(m+1). Pre: x > 0.
+  inline BND_DBL_FN float d_log(float x)
+  {
+    int e;
+    float m = std::frexp(x, &e);
+    if (m < kSqrtHalf) { m += m; --e; }
+    float f  = (m - 1.0f) / (m + 1.0f);
+    float f2 = f * f;
+    float p = 1.0f / 9.0f;
+    p = fma(p, f2, 1.0f / 7.0f);
+    p = fma(p, f2, 1.0f / 5.0f);
+    p = fma(p, f2, 1.0f / 3.0f);
+    p = fma(p, f2, 1.0f);
+    float logm = 2.0f * f * p;
+    float r = fma(static_cast<float>(e), kLn2Hi, logm);
+    return fma(static_cast<float>(e), kLn2Lo, r);
+  }
+
+  // Compositions on the validated primitives (same shapes as the double engine).
+  inline BND_DBL_FN float d_exp2(float x)  { return d_exp(x * kLn2Full); }
+  inline BND_DBL_FN float d_log2(float x)  { return d_log(x) * kLog2e; }
+  inline BND_DBL_FN float d_log10(float x) { return d_log(x) * kLog10e; }
+  inline BND_DBL_FN float d_pow(float b, float e) { return d_exp(e * d_log(b)); }
+  inline BND_DBL_FN float d_cbrt(float x)
+  {
+    if (x == 0.0f) return 0.0f;
+    float m = d_exp(d_log(x < 0 ? -x : x) * (1.0f / 3.0f));
+    return x < 0 ? -m : m;
+  }
+  inline BND_DBL_FN float d_sinh(float x) { float e = d_exp(x); return (e - 1.0f / e) * 0.5f; }
+  inline BND_DBL_FN float d_cosh(float x) { float e = d_exp(x); return (e + 1.0f / e) * 0.5f; }
+  inline BND_DBL_FN float d_tanh(float x)
+  {
+    float e = d_exp(x + x);             // e^{2x}
+    return (e - 1.0f) / (e + 1.0f);
+  }
+  inline BND_DBL_FN float d_hypot(float x, float y) { return d_sqrt(x * x + y * y); }
+
+  // atan(x): reduce |x|>1 via reciprocal; |a|>tan(π/12) via the π/6 addition
+  // formula → |t| ≤ tan(π/12); atan(t) = t·P(t²) Taylor.
+  inline BND_DBL_FN float d_atan(float x)
+  {
+    bool neg = x < 0; float a = neg ? -x : x;
+    bool inv = a > 1.0f; if (inv) a = 1.0f / a;
+    float off = 0.0f;
+    if (a > kTanPi12) { a = (a - kInvSqrt3) / fma(a, kInvSqrt3, 1.0f); off = kPiSixth; }
+    float z = a * a;
+    float p = 1.0f / 13.0f;
+    p = fma(p, z, -1.0f / 11.0f); p = fma(p, z,  1.0f / 9.0f); p = fma(p, z, -1.0f / 7.0f);
+    p = fma(p, z,  1.0f / 5.0f);  p = fma(p, z, -1.0f / 3.0f); p = fma(p, z,  1.0f);
+    float r = off + a * p;
+    if (inv) r = kPiHalf - r;
+    return neg ? -r : r;
+  }
+
+  inline BND_DBL_FN float d_atan2(float y, float x)
+  {
+    if (x > 0.0f) return d_atan(y / x);
+    if (x < 0.0f) return d_atan(y / x) + (y >= 0.0f ? kPi : -kPi);
+    if (y > 0.0f) return kPiHalf;
+    if (y < 0.0f) return -kPiHalf;
+    return 0.0f;
+  }
+
+  inline BND_DBL_FN float d_asin(float x) { return d_atan(x / d_sqrt((1.0f - x) * (1.0f + x))); }
+  inline BND_DBL_FN float d_acos(float x) { return kPiHalf - d_asin(x); }
+} // namespace bnd::math::flt::detail
+
+namespace bnd::math::flt
+{
+  // Engine cores: bound in → `float` math → bound out. Storing the float result:
+  // a `real` (double-backed) bound takes Out{double(f)} (widening is exact); a
+  // non-`real` snap grid assigns through the rational path, snapping via Out's
+  // round policy. (An f32-backed bound — Phase 4b — will store the float raw
+  // directly; until then it routes through the same paths.)
+  template <typename Out>
+  [[nodiscard]] BND_DBL_FN Out store(float f)
+  {
+    if constexpr (has_flag(BoundPolicy<Out>, real)) return Out{static_cast<double>(f)};
+    else { Out o{}; o = bnd::detail::rational{static_cast<double>(f)}; return o; }
+  }
+
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out sin_core(In x)  { return store<Out>(detail::d_sin(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out cos_core(In x)  { return store<Out>(detail::d_cos(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out exp_core(In x)  { return store<Out>(detail::d_exp(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out sqrt_core(In x) { return store<Out>(detail::d_sqrt(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out log_core(In x)  { return store<Out>(detail::d_log(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out exp2_core(In x) { return store<Out>(detail::d_exp2(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out log2_core(In x) { return store<Out>(detail::d_log2(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out log10_core(In x){ return store<Out>(detail::d_log10(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out cbrt_core(In x) { return store<Out>(detail::d_cbrt(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out sinh_core(In x) { return store<Out>(detail::d_sinh(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out cosh_core(In x) { return store<Out>(detail::d_cosh(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out tanh_core(In x) { return store<Out>(detail::d_tanh(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out atan_core(In x) { return store<Out>(detail::d_atan(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out asin_core(In x) { return store<Out>(detail::d_asin(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out acos_core(In x) { return store<Out>(detail::d_acos(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out atan2_core(In y, In x)
+  { return store<Out>(detail::d_atan2(static_cast<float>(y), static_cast<float>(x))); }
+  template <typename Out, typename InX, typename InY>
+  [[nodiscard]] BND_DBL_FN Out hypot_core(InX x, InY y)
+  { return store<Out>(detail::d_hypot(static_cast<float>(x), static_cast<float>(y))); }
+} // namespace bnd::math::flt
+
+#endif // !BND_MATH_NO_FP
+
+#endif // BNDcmathfloatHPP
diff --git a/single_include/bound/bound.hpp b/single_include/bound/bound.hpp
index e0f392d..aa4c8f3 100644
--- a/single_include/bound/bound.hpp
+++ b/single_include/bound/bound.hpp
@@ -7965,6 +7965,282 @@ namespace bnd::math::dbl
 #endif // !BND_MATH_NO_FP
 
 
+// ======================================================================
+//  bound/cmath_float.hpp
+// ======================================================================
+//---------------------------------------------------------------------------
+//---------------------------------------------------------------------------
+// bnd::math float engine — a small, reproducible libm in `float` (binary32).
+// Bit-identical on every IEEE-754 binary32 platform compiled without
+// `-ffast-math`, via the same recipe as the double engine but in single
+// precision, for single-precision-only FPUs (Cortex-M4F etc.) and size/speed:
+//   * NO <cmath> transcendentals — own fixed polynomials evaluated with the
+//     correctly-rounded std::fma(float) (immune to FMA-contraction).
+//   * Cody-Waite range reduction whose hi/lo split constants are derived at
+//     compile time (mask the low mantissa bits of the float-rounded constant),
+//     no external codegen — honoring the bit-exact contract.
+//   * Correctly-rounded std::sqrt(float).
+// Float is a THIRD value set: float ≠ double ≠ cordic, each ≤ a few notches of
+// truth where the grid permits (table-maker's dilemma — see determinism.md).
+// Present only when FP is available; compiled out under BND_MATH_NO_FP.
+//---------------------------------------------------------------------------
+
+
+#ifndef BND_MATH_NO_FP
+
+#include <bit>     // std::bit_cast (constexpr Cody-Waite split derivation)
+#include <cmath>   // std::fma, std::sqrt, std::nearbyint, std::ldexp, std::frexp (float)
+
+namespace bnd::math::flt::detail
+{
+  using std::fma;
+
+  // Constexpr Cody-Waite split of a high-precision (double) reference into a
+  // float `hi` with `keep` significant mantissa bits (low bits zeroed, so k·hi
+  // stays exact for modest k) plus a float `lo` carrying the remainder. Pure
+  // bit manipulation — identical on every IEEE-754 target.
+  inline constexpr float split_hi(double full, int keep) noexcept
+  {
+    float f = static_cast<float>(full);
+    std::uint32_t b = std::bit_cast<std::uint32_t>(f);
+    b &= ~((std::uint32_t{1} << (23 - keep)) - 1);
+    return std::bit_cast<float>(b);
+  }
+  inline constexpr float split_lo(double full, int keep) noexcept
+  {
+    return static_cast<float>(full - static_cast<double>(split_hi(full, keep)));
+  }
+
+  // High-precision references (double literals; only their float projections are
+  // used at runtime). 12 kept bits hold accuracy across the shared ±2^20 domain.
+  inline constexpr double kPiD    = 3.14159265358979323846;
+  inline constexpr double kPiO2D  = kPiD / 2;
+  inline constexpr double kLn2D   = 0.69314718055994530942;
+
+  inline constexpr float kPio2Hi    = split_hi(kPiO2D, 12);
+  inline constexpr float kPio2Lo    = split_lo(kPiO2D, 12);
+  inline constexpr float kTwoOverPi  = static_cast<float>(2.0 / kPiD);
+  inline constexpr float kLn2Hi     = split_hi(kLn2D, 12);
+  inline constexpr float kLn2Lo     = split_lo(kLn2D, 12);
+  inline constexpr float kLn2Full   = static_cast<float>(kLn2D);
+  inline constexpr float kLog2e     = static_cast<float>(1.44269504088896340736);
+  inline constexpr float kLog10e    = static_cast<float>(0.43429448190325182765);
+  inline constexpr float kSqrtHalf  = 0x1.6a09e6p-1f;             // √½
+  inline constexpr float kPi        = static_cast<float>(kPiD);
+  inline constexpr float kPiHalf    = static_cast<float>(kPiO2D);
+  inline constexpr float kPiSixth   = static_cast<float>(kPiD / 6);
+  inline constexpr float kInvSqrt3  = 0x1.279a74p-1f;             // 1/√3 = tan(π/6)
+  inline constexpr float kTanPi12   = 0x1.126146p-2f;             // tan(π/12)
+
+  // sin(r), r ∈ [−π/4, π/4]: r·P(r²) to r¹¹ (float-sufficient).
+  inline BND_DBL_FN float sin_poly(float r)
+  {
+    float z = r * r;
+    float p = -1.0f / 39916800.0f;       // −1/11!
+    p = fma(p, z,  1.0f / 362880.0f);    //  1/9!
+    p = fma(p, z, -1.0f / 5040.0f);      // −1/7!
+    p = fma(p, z,  1.0f / 120.0f);       //  1/5!
+    p = fma(p, z, -1.0f / 6.0f);         // −1/3!
+    p = fma(p, z,  1.0f);                //  1
+    return r * p;
+  }
+
+  // cos(r), r ∈ [−π/4, π/4]: Q(r²) to r¹⁰.
+  inline BND_DBL_FN float cos_poly(float r)
+  {
+    float z = r * r;
+    float p = -1.0f / 3628800.0f;        // −1/10!
+    p = fma(p, z,  1.0f / 40320.0f);     //  1/8!
+    p = fma(p, z, -1.0f / 720.0f);       // −1/6!
+    p = fma(p, z,  1.0f / 24.0f);        //  1/4!
+    p = fma(p, z, -1.0f / 2.0f);         // −1/2!
+    p = fma(p, z,  1.0f);                //  1
+    return p;
+  }
+
+  // e^r, r ∈ [−ln2/2, ln2/2]: Σ rᵏ/k! to r⁷.
+  inline BND_DBL_FN float exp_poly(float r)
+  {
+    float p = 1.0f / 5040.0f;            // 1/7!
+    p = fma(p, r, 1.0f / 720.0f);        // 1/6!
+    p = fma(p, r, 1.0f / 120.0f);        // 1/5!
+    p = fma(p, r, 1.0f / 24.0f);         // 1/4!
+    p = fma(p, r, 1.0f / 6.0f);          // 1/3!
+    p = fma(p, r, 1.0f / 2.0f);          // 1/2!
+    p = fma(p, r, 1.0f);                 // 1/1!
+    p = fma(p, r, 1.0f);                 // 1
+    return p;
+  }
+
+  // Shared quadrant reduction: x → (r ∈ [−π/4,π/4], q = quadrant mod 4).
+  inline BND_DBL_FN float reduce_quadrant(float x, long& q)
+  {
+    float k = std::nearbyint(x * kTwoOverPi);
+    float r = fma(-k, kPio2Hi, x);
+    r = fma(-k, kPio2Lo, r);
+    q = static_cast<long>(k) & 3;
+    return r;
+  }
+
+  inline BND_DBL_FN float d_sin(float x)
+  {
+    long q; float r = reduce_quadrant(x, q);
+    switch (q) {
+      case 0:  return sin_poly(r);
+      case 1:  return cos_poly(r);
+      case 2:  return -sin_poly(r);
+      default: return -cos_poly(r);
+    }
+  }
+
+  // cos via quadrant reduction (NOT sin(x+π/2): shifting the float input would
+  // lose bits for large x — the double engine can afford that, float cannot).
+  inline BND_DBL_FN float d_cos(float x)
+  {
+    long q; float r = reduce_quadrant(x, q);
+    switch (q) {
+      case 0:  return cos_poly(r);
+      case 1:  return -sin_poly(r);
+      case 2:  return -cos_poly(r);
+      default: return sin_poly(r);
+    }
+  }
+
+  // e^x = 2^k · e^r, x = k·ln2 + r, r ∈ [−ln2/2, ln2/2].
+  inline BND_DBL_FN float d_exp(float x)
+  {
+    float k = std::nearbyint(x * kLog2e);
+    float r = fma(-k, kLn2Hi, x);
+    r = fma(-k, kLn2Lo, r);
+    return std::ldexp(exp_poly(r), static_cast<int>(k));
+  }
+
+  inline BND_DBL_FN float d_sqrt(float x) { return std::sqrt(x); }   // correctly rounded
+
+  // ln(x): frexp to m∈[½,1), rebalance to [√½,√2); ln = e·ln2 + 2·atanh(f),
+  // f = (m−1)/(m+1). Pre: x > 0.
+  inline BND_DBL_FN float d_log(float x)
+  {
+    int e;
+    float m = std::frexp(x, &e);
+    if (m < kSqrtHalf) { m += m; --e; }
+    float f  = (m - 1.0f) / (m + 1.0f);
+    float f2 = f * f;
+    float p = 1.0f / 9.0f;
+    p = fma(p, f2, 1.0f / 7.0f);
+    p = fma(p, f2, 1.0f / 5.0f);
+    p = fma(p, f2, 1.0f / 3.0f);
+    p = fma(p, f2, 1.0f);
+    float logm = 2.0f * f * p;
+    float r = fma(static_cast<float>(e), kLn2Hi, logm);
+    return fma(static_cast<float>(e), kLn2Lo, r);
+  }
+
+  // Compositions on the validated primitives (same shapes as the double engine).
+  inline BND_DBL_FN float d_exp2(float x)  { return d_exp(x * kLn2Full); }
+  inline BND_DBL_FN float d_log2(float x)  { return d_log(x) * kLog2e; }
+  inline BND_DBL_FN float d_log10(float x) { return d_log(x) * kLog10e; }
+  inline BND_DBL_FN float d_pow(float b, float e) { return d_exp(e * d_log(b)); }
+  inline BND_DBL_FN float d_cbrt(float x)
+  {
+    if (x == 0.0f) return 0.0f;
+    float m = d_exp(d_log(x < 0 ? -x : x) * (1.0f / 3.0f));
+    return x < 0 ? -m : m;
+  }
+  inline BND_DBL_FN float d_sinh(float x) { float e = d_exp(x); return (e - 1.0f / e) * 0.5f; }
+  inline BND_DBL_FN float d_cosh(float x) { float e = d_exp(x); return (e + 1.0f / e) * 0.5f; }
+  inline BND_DBL_FN float d_tanh(float x)
+  {
+    float e = d_exp(x + x);             // e^{2x}
+    return (e - 1.0f) / (e + 1.0f);
+  }
+  inline BND_DBL_FN float d_hypot(float x, float y) { return d_sqrt(x * x + y * y); }
+
+  // atan(x): reduce |x|>1 via reciprocal; |a|>tan(π/12) via the π/6 addition
+  // formula → |t| ≤ tan(π/12); atan(t) = t·P(t²) Taylor.
+  inline BND_DBL_FN float d_atan(float x)
+  {
+    bool neg = x < 0; float a = neg ? -x : x;
+    bool inv = a > 1.0f; if (inv) a = 1.0f / a;
+    float off = 0.0f;
+    if (a > kTanPi12) { a = (a - kInvSqrt3) / fma(a, kInvSqrt3, 1.0f); off = kPiSixth; }
+    float z = a * a;
+    float p = 1.0f / 13.0f;
+    p = fma(p, z, -1.0f / 11.0f); p = fma(p, z,  1.0f / 9.0f); p = fma(p, z, -1.0f / 7.0f);
+    p = fma(p, z,  1.0f / 5.0f);  p = fma(p, z, -1.0f / 3.0f); p = fma(p, z,  1.0f);
+    float r = off + a * p;
+    if (inv) r = kPiHalf - r;
+    return neg ? -r : r;
+  }
+
+  inline BND_DBL_FN float d_atan2(float y, float x)
+  {
+    if (x > 0.0f) return d_atan(y / x);
+    if (x < 0.0f) return d_atan(y / x) + (y >= 0.0f ? kPi : -kPi);
+    if (y > 0.0f) return kPiHalf;
+    if (y < 0.0f) return -kPiHalf;
+    return 0.0f;
+  }
+
+  inline BND_DBL_FN float d_asin(float x) { return d_atan(x / d_sqrt((1.0f - x) * (1.0f + x))); }
+  inline BND_DBL_FN float d_acos(float x) { return kPiHalf - d_asin(x); }
+} // namespace bnd::math::flt::detail
+
+namespace bnd::math::flt
+{
+  // Engine cores: bound in → `float` math → bound out. Storing the float result:
+  // a `real` (double-backed) bound takes Out{double(f)} (widening is exact); a
+  // non-`real` snap grid assigns through the rational path, snapping via Out's
+  // round policy. (An f32-backed bound — Phase 4b — will store the float raw
+  // directly; until then it routes through the same paths.)
+  template <typename Out>
+  [[nodiscard]] BND_DBL_FN Out store(float f)
+  {
+    if constexpr (has_flag(BoundPolicy<Out>, real)) return Out{static_cast<double>(f)};
+    else { Out o{}; o = bnd::detail::rational{static_cast<double>(f)}; return o; }
+  }
+
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out sin_core(In x)  { return store<Out>(detail::d_sin(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out cos_core(In x)  { return store<Out>(detail::d_cos(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out exp_core(In x)  { return store<Out>(detail::d_exp(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out sqrt_core(In x) { return store<Out>(detail::d_sqrt(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out log_core(In x)  { return store<Out>(detail::d_log(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out exp2_core(In x) { return store<Out>(detail::d_exp2(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out log2_core(In x) { return store<Out>(detail::d_log2(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out log10_core(In x){ return store<Out>(detail::d_log10(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out cbrt_core(In x) { return store<Out>(detail::d_cbrt(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out sinh_core(In x) { return store<Out>(detail::d_sinh(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out cosh_core(In x) { return store<Out>(detail::d_cosh(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out tanh_core(In x) { return store<Out>(detail::d_tanh(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out atan_core(In x) { return store<Out>(detail::d_atan(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out asin_core(In x) { return store<Out>(detail::d_asin(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out acos_core(In x) { return store<Out>(detail::d_acos(static_cast<float>(x))); }
+  template <typename Out, typename In>
+  [[nodiscard]] BND_DBL_FN Out atan2_core(In y, In x)
+  { return store<Out>(detail::d_atan2(static_cast<float>(y), static_cast<float>(x))); }
+  template <typename Out, typename InX, typename InY>
+  [[nodiscard]] BND_DBL_FN Out hypot_core(InX x, InY y)
+  { return store<Out>(detail::d_hypot(static_cast<float>(x), static_cast<float>(y))); }
+} // namespace bnd::math::flt
+
+#endif // !BND_MATH_NO_FP
+
+
 
 
 // The public bnd::math::* functions dispatch to the double engine (default) or
@@ -10187,6 +10463,148 @@ namespace bnd::math
       return slim::expected<Out, errc>{store<Out>(r)};
     }
   } // namespace dbl
+
+  namespace flt
+  {
+    // Public-shaped float-engine entry points — binary32 compute, same shapes as
+    // dbl:: (qualify shared helpers as bnd::math::detail::; *_core/store/d_* are
+    // this namespace's own). A third value set (float ≠ double ≠ cordic).
+    template <boundable In>
+      requires (Lower<In> == bnd::detail::rational{0})
+    [[nodiscard]] BND_DBL_FN auto sqrt(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return sqrt_core<bnd::math::detail::sqrt_auto_t<In>>(x); }
+
+    template <boundable In>
+      requires (Lower<In> < bnd::detail::rational{0})
+    [[nodiscard]] BND_DBL_FN auto sqrt(In x) noexcept
+    {
+      static_assert(bnd::math::detail::require_snap<In>());
+      using Out = bnd::math::detail::sqrt_signed_auto_t<In>;
+      float v = static_cast<float>(x);
+      if (v < 0.0f)
+        return slim::expected<Out, errc>{slim::unexpected(errc::domain_error)};
+      return slim::expected<Out, errc>{store<Out>(detail::d_sqrt(v))};
+    }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto exp2(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return exp2_core<bnd::math::detail::exp2_auto_t<In>>(x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto log2(In x) noexcept
+    {
+      static_assert(bnd::math::detail::require_snap<In>());
+      static_assert(Lower<In> > 0, "bnd::math::flt::log2: input must be strictly positive");
+      return log2_core<bnd::math::detail::log2_auto_t<In>>(x);
+    }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto exp(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return exp_core<bnd::math::detail::exp_auto_t<In>>(x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto log(In x) noexcept
+    {
+      static_assert(bnd::math::detail::require_snap<In>());
+      static_assert(Lower<In> > 0, "bnd::math::flt::log: input must be strictly positive");
+      return log_core<bnd::math::detail::log_auto_t<In>>(x);
+    }
+
+    template <imax Base, boundable In>
+    [[nodiscard]] BND_DBL_FN auto pow_base(In x) noexcept
+    {
+      static_assert(bnd::math::detail::require_snap<In>());
+      using Out = bnd::math::detail::pow_base_auto_t<Base, In>;
+      return store<Out>(detail::d_pow(static_cast<float>(Base), static_cast<float>(x)));
+    }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto sin(In angle) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return sin_core<bnd::math::detail::sin_auto_t<In>>(angle); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto cos(In angle) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return cos_core<bnd::math::detail::cos_auto_t<In>>(angle); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto tan(In angle) noexcept
+    {
+      static_assert(bnd::math::detail::require_snap<In>());
+      using Out = bnd::math::detail::tan_auto_t<In>;
+      float x = static_cast<float>(angle);
+      float c = detail::d_cos(x);
+      if (c == 0.0f)
+        return slim::expected<Out, errc>{slim::unexpected(errc::division_by_zero)};
+      float t = detail::d_sin(x) / c;
+      if constexpr (!has_flag(BoundPolicy<Out>, clamp))   // clamp Out: saturate below
+        if (t < static_cast<float>(Lower<Out>) || t > static_cast<float>(Upper<Out>))
+          return slim::expected<Out, errc>{slim::unexpected(errc::overflow)};
+      return slim::expected<Out, errc>{store<Out>(t)};
+    }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto atan2(In y, In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return atan2_core<bnd::math::detail::atan2_auto_t<In>>(y, x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto atan(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return atan_core<bnd::math::detail::atan_auto_t<In>>(x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto asin(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return asin_core<bnd::math::detail::asin_auto_t<In>>(x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto acos(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return acos_core<bnd::math::detail::acos_auto_t<In>>(x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto sinh(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return sinh_core<bnd::math::detail::sinh_auto_t<In>>(x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto cosh(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return cosh_core<bnd::math::detail::cosh_auto_t<In>>(x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto tanh(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return tanh_core<bnd::math::detail::tanh_auto_t<In>>(x); }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto log10(In x) noexcept
+    {
+      static_assert(bnd::math::detail::require_snap<In>());
+      static_assert(Lower<In> > 0, "bnd::math::flt::log10: input must be strictly positive");
+      return log10_core<bnd::math::detail::log10_auto_t<In>>(x);
+    }
+
+    template <boundable In>
+    [[nodiscard]] BND_DBL_FN auto cbrt(In x) noexcept
+    { static_assert(bnd::math::detail::require_snap<In>()); return cbrt_core<bnd::math::detail::cbrt_auto_t<In>>(x); }
+
+    template <boundable InX, boundable InY>
+    [[nodiscard]] BND_DBL_FN auto hypot(InX x, InY y) noexcept
+    {
+      static_assert(bnd::math::detail::require_snap<InX>() && bnd::math::detail::require_snap<InY>());
+      return hypot_core<bnd::math::detail::hypot_auto_t<InX, InY>>(x, y);
+    }
+
+    template <boundable InB, boundable InE>
+      requires (Lower<InB> > bnd::detail::rational{0})
+    [[nodiscard]] BND_DBL_FN auto pow(InB base, InE exp) noexcept
+    {
+      static_assert(bnd::math::detail::require_snap<InB>() && bnd::math::detail::require_snap<InE>());
+      using Out = bnd::math::detail::pow_auto_t<InB, InE>;
+      float b = static_cast<float>(base);
+      if (b <= 0.0f)
+        return slim::expected<Out, errc>{slim::unexpected(errc::domain_error)};
+      float r = detail::d_pow(b, static_cast<float>(exp));
+      if constexpr (!has_flag(BoundPolicy<Out>, clamp))   // clamp Out: saturate below
+        if (r < static_cast<float>(Lower<Out>) || r > static_cast<float>(Upper<Out>))
+          return slim::expected<Out, errc>{slim::unexpected(errc::overflow)};
+      return slim::expected<Out, errc>{store<Out>(r)};
+    }
+  } // namespace flt
 #endif // !BND_MATH_NO_FP
 }
 
diff --git a/tests/test_math_engines.cpp b/tests/test_math_engines.cpp
index dbedb50..6fec970 100644
--- a/tests/test_math_engines.cpp
+++ b/tests/test_math_engines.cpp
@@ -65,6 +65,59 @@ TEST_CASE("double engine is callable side-by-side and agrees on special values",
   REQUIRE(rational{*p} == 16);
 }
 
+TEST_CASE("float engine is callable side-by-side and agrees on special values",
+          "[cmath][engines][flt]")
+{
+  REQUIRE(rational{math::flt::sin(Ang{0})}    == 0);
+  REQUIRE(rational{math::flt::cos(Ang{0})}    == 1);
+  REQUIRE(rational{math::flt::atan(Ang{0})}   == 0);
+  REQUIRE(rational{math::flt::sinh(Ang{0})}   == 0);
+  REQUIRE(rational{math::flt::cosh(Ang{0})}   == 1);
+  REQUIRE(rational{math::flt::tanh(Ang{0})}   == 0);
+  REQUIRE(rational{math::flt::sqrt(Sq{4})}    == 2);
+  REQUIRE(rational{math::flt::cbrt(Ang{8})}   == 2);
+  REQUIRE(rational{math::flt::log10(Pos{100})} == 2);
+  REQUIRE(rational{math::flt::exp(Ang{0})}    == 1);
+
+  auto p = math::flt::pow(Pos{2}, Ang{4});
+  REQUIRE(p.has_value());
+  REQUIRE(rational{*p} == 16);
+}
+
+// Golden pins for the float engine: the EXACT grid-snapped rational the binary32
+// engine must produce, bit-for-bit, on every IEEE-754 binary32 platform. These
+// are a THIRD value set (float ≠ double ≠ cordic); regenerate only on a
+// deliberate engine change. Grid: notch 1/16384 real (Ang/Pos/Sq above).
+#define EXACT_FLT(expr, N, D) REQUIRE(rational{(expr)} == rational{N, D})
+
+TEST_CASE("float engine golden pins are bit-exact (determinism)",
+          "[cmath][engines][flt][determinism]")
+{
+  EXACT_FLT(math::flt::sin(Ang{1}),    13787, 16384);
+  EXACT_FLT(math::flt::cos(Ang{1}),     2213,  4096);
+  EXACT_FLT(math::flt::atan(Ang{1}),    3217,  4096);
+  EXACT_FLT(math::flt::exp(Ang{2}),    60531,  8192);
+  EXACT_FLT(math::flt::log(Pos{10}),   18863,  8192);
+  EXACT_FLT(math::flt::log10(Pos{50}),  6959,  4096);
+  EXACT_FLT(math::flt::sqrt(Sq{2}),    11585,  8192);
+  EXACT_FLT(math::flt::cbrt(Ang{2}),   20643, 16384);
+  EXACT_FLT(math::flt::sinh(Ang{2}),   29711,  8192);
+  EXACT_FLT(math::flt::tanh(Ang{1}),    6239,  8192);
+}
+
+TEST_CASE("all three engines coexist in one binary and meet at exact points",
+          "[cmath][engines][mix]")
+{
+  // Phase-4 property: cordic + dbl + flt all instantiated in the same TU.
+  REQUIRE(rational{math::flt::sqrt(Sq{4})}    == rational{math::dbl::sqrt(Sq{4})});
+  REQUIRE(rational{math::flt::cos(Ang{0})}    == rational{math::cordic::cos(Ang{0})});
+  // A pole errors through the expected channel under the float engine too.
+  using TanAng = bound<{{-2, 2}, notch<1, 4096>}, round_nearest | real>;
+  auto tf = math::flt::tan(TanAng{0});
+  REQUIRE(tf.has_value());
+  REQUIRE(rational{*tf} == 0);
+}
+
 TEST_CASE("both engines coexist in one binary and meet at exact points",
           "[cmath][engines][mix]")
 {