diff --git a/crates/compile/src/kernel_egress.rs b/crates/compile/src/kernel_egress.rs
index 743f97db..ea1299c1 100644
--- a/crates/compile/src/kernel_egress.rs
+++ b/crates/compile/src/kernel_egress.rs
@@ -384,9 +384,9 @@ struct EgressCtx {
   /// Memoized expression conversion. Keyed by `KExpr::addr()` (content
   /// hash); same hash → same Ixon expression (within a single block's
   /// tables).
-  expr_cache: FxHashMap<blake3::Hash, Arc<IxonExpr>>,
+  expr_cache: FxHashMap<ix_kernel::env::Addr, Arc<IxonExpr>>,
   /// Memoized universe conversion.
-  univ_cache: FxHashMap<blake3::Hash, Arc<IxonUniv>>,
+  univ_cache: FxHashMap<ix_kernel::env::Addr, Arc<IxonUniv>>,
 }
 
 impl EgressCtx {
diff --git a/crates/ixon/src/lazy.rs b/crates/ixon/src/lazy.rs
index f08330e1..612e67e9 100644
--- a/crates/ixon/src/lazy.rs
+++ b/crates/ixon/src/lazy.rs
@@ -104,6 +104,15 @@ pub struct LazyConstant {
   /// this `None`; their `get()` parses fresh and does not store —
   /// see the module-level "Cache policy" docs.
   cache: Option<Arc<Constant>>,
+  /// Deferred address verification: when set (the `.ixe` load paths),
+  /// `get()` checks `Address::hash(bytes) == pending_addr` before
+  /// parsing. This moves the per-constant content-hash from env parse
+  /// time to first materialization, so constants that are shipped in a
+  /// closure but never forced by the typechecker are never hashed —
+  /// while everything the kernel CERTIFIES is still verified (checking
+  /// a constant forces its materialization). `None` for the
+  /// compile-side path, which owns the parsed value already.
+  pending_addr: Option<Address>,
 }
 
 impl LazyConstant {
@@ -114,7 +123,22 @@ impl LazyConstant {
   /// `Constant::put` produced for the address this entry is stored
   /// under. Use [`Self::verify_address`] for an explicit check.
   pub fn from_bytes(bytes: Arc<[u8]>) -> Self {
-    LazyConstant { bytes: BytesSource::Heap(bytes), cache: None }
+    LazyConstant {
+      bytes: BytesSource::Heap(bytes),
+      cache: None,
+      pending_addr: None,
+    }
+  }
+
+  /// Like [`Self::from_bytes`], but with the address-binding check
+  /// deferred to first [`Self::get`]. Used by `Env::get_anon` so env
+  /// parse time does not pay one content hash per constant.
+  pub fn from_bytes_deferred(bytes: Arc<[u8]>, addr: Address) -> Self {
+    LazyConstant {
+      bytes: BytesSource::Heap(bytes),
+      cache: None,
+      pending_addr: Some(addr),
+    }
   }
 
   /// Construct from a memory-mapped window. The `Arc<Mmap>` keeps
@@ -124,7 +148,11 @@ impl LazyConstant {
   /// Used by `Env::get_anon_mmap` to avoid heap-copying the on-disk
   /// byte stream — the OS page cache is the source of truth.
   pub fn from_mmap_slice(mmap: Arc<Mmap>, offset: usize, len: usize) -> Self {
-    LazyConstant { bytes: BytesSource::Mmap { mmap, offset, len }, cache: None }
+    LazyConstant {
+      bytes: BytesSource::Mmap { mmap, offset, len },
+      cache: None,
+      pending_addr: None,
+    }
   }
 
   /// Construct from a structured `Constant` (the in-memory build path,
@@ -137,6 +165,7 @@ impl LazyConstant {
     LazyConstant {
       bytes: BytesSource::Heap(buf.into()),
       cache: Some(Arc::new(c)),
+      pending_addr: None,
     }
   }
 
@@ -151,6 +180,16 @@ impl LazyConstant {
     if let Some(c) = &self.cache {
       return Ok(c.clone());
     }
+    if let Some(expected) = &self.pending_addr {
+      let computed = Address::hash(self.bytes.as_slice());
+      if computed != *expected {
+        return Err(format!(
+          "LazyConstant::get: bytes hash to {} but stored under {}",
+          computed.hex(),
+          expected.hex()
+        ));
+      }
+    }
     let mut slice: &[u8] = self.bytes.as_slice();
     let parsed = Constant::get(&mut slice)
       .map_err(|e| format!("LazyConstant::get: {e}"))?;
diff --git a/crates/ixon/src/serialize.rs b/crates/ixon/src/serialize.rs
index 4233ea66..cdbee2a9 100644
--- a/crates/ixon/src/serialize.rs
+++ b/crates/ixon/src/serialize.rs
@@ -1419,6 +1419,20 @@ impl Env {
       }
       let (bytes, rest) = buf.split_at(len);
       *buf = rest;
+      // Blob bytes are bound to their address HERE, eagerly: literal
+      // identity downstream (kernel `ExprKey::Nat`/`Str` interning,
+      // `Constant.refs` blob entries, assumption filtering) keys on the
+      // address, so an unverified blob would let an adversarial env bind
+      // wrong bytes to a content address. Blobs are a small fraction of
+      // env bytes; constants stay lazily verified (see LazyConstant).
+      let computed = Address::hash(bytes);
+      if computed != addr {
+        return Err(format!(
+          "Env::get: blob bytes hash to {} but stored under {}",
+          computed.hex(),
+          addr.hex()
+        ));
+      }
       env.blobs.insert(addr, bytes.to_vec());
     }
 
@@ -1568,6 +1582,20 @@ impl Env {
       }
       let (bytes, rest) = buf.split_at(len);
       *buf = rest;
+      // Blob bytes are bound to their address HERE, eagerly: literal
+      // identity downstream (kernel `ExprKey::Nat`/`Str` interning,
+      // `Constant.refs` blob entries, assumption filtering) keys on the
+      // address, so an unverified blob would let an adversarial env bind
+      // wrong bytes to a content address. Blobs are a small fraction of
+      // env bytes; constants stay lazily verified (see LazyConstant).
+      let computed = Address::hash(bytes);
+      if computed != addr {
+        return Err(format!(
+          "Env::get_anon: blob bytes hash to {} but stored under {}",
+          computed.hex(),
+          addr.hex()
+        ));
+      }
       env.blobs.insert(addr, bytes.to_vec());
     }
 
@@ -1585,17 +1613,16 @@ impl Env {
       }
       let (bytes, rest) = buf.split_at(len);
       *buf = rest;
-      let computed = Address::hash(bytes);
-      if computed != addr {
-        return Err(format!(
-          "Env::get_anon: const at idx {i} bytes hash to {} but stored under {}",
-          computed.hex(),
-          addr.hex()
-        ));
-      }
-      env
-        .consts
-        .insert(addr, crate::lazy::LazyConstant::from_bytes(bytes.into()));
+      let _ = i;
+      // Address binding is verified lazily at first materialization
+      // (`LazyConstant::get`) instead of one content hash per constant
+      // here — constants shipped in a closure but never forced by the
+      // typechecker are never hashed, while everything the kernel
+      // certifies still gets verified on the way in.
+      env.consts.insert(
+        addr.clone(),
+        crate::lazy::LazyConstant::from_bytes_deferred(bytes.into(), addr),
+      );
     }
 
     // Section 3: Names — parse and DISCARD. We still need a populated
@@ -1769,6 +1796,20 @@ impl Env {
       }
       let (bytes, rest) = buf.split_at(len);
       buf = rest;
+      // Blob bytes are bound to their address HERE, eagerly: literal
+      // identity downstream (kernel `ExprKey::Nat`/`Str` interning,
+      // `Constant.refs` blob entries, assumption filtering) keys on the
+      // address, so an unverified blob would let an adversarial env bind
+      // wrong bytes to a content address. Blobs are a small fraction of
+      // env bytes; constants stay lazily verified (see LazyConstant).
+      let computed = Address::hash(bytes);
+      if computed != addr {
+        return Err(format!(
+          "Env::get_anon_mmap: blob bytes hash to {} but stored under {}",
+          computed.hex(),
+          addr.hex()
+        ));
+      }
       env.blobs.insert(addr, bytes.to_vec());
     }
 
@@ -2452,8 +2493,16 @@ mod tests {
     let off = 68 + 3;
     assert!(off < buf.len());
     buf[off] ^= 0xFF;
-    let res = Env::get_anon(&mut buf.as_slice());
-    let err = res.expect_err("tampered const bytes should be rejected");
+    // Address binding is verified lazily: parse succeeds, but the
+    // tampered constant is rejected at first materialization (which is
+    // what gates anything the kernel certifies).
+    let parsed = Env::get_anon(&mut buf.as_slice())
+      .expect("get_anon defers per-const verification");
+    let entry = parsed.consts.iter().next().expect("one const");
+    let err = entry
+      .value()
+      .get()
+      .expect_err("tampered const bytes should be rejected at get()");
     assert!(
       err.contains("bytes hash to") && err.contains("but stored under"),
       "expected per-entry verify error, got: {err}"
diff --git a/crates/kernel/Cargo.toml b/crates/kernel/Cargo.toml
index 996eab3e..ebb91120 100644
--- a/crates/kernel/Cargo.toml
+++ b/crates/kernel/Cargo.toml
@@ -36,5 +36,12 @@ quickcheck_macros = { workspace = true }
 name = "perf_check"
 path = "examples/perf_check.rs"
 
+# Native single-constant check (one work item via the lazy-anon path the
+# Zisk guest uses), so the `IX_*` instrumentation can target one expensive
+# constant without re-checking its whole env.
+[[example]]
+name = "check_one"
+path = "examples/check_one.rs"
+
 [lints]
 workspace = true
diff --git a/crates/kernel/examples/check_one.rs b/crates/kernel/examples/check_one.rs
new file mode 100644
index 00000000..4ae4d946
--- /dev/null
+++ b/crates/kernel/examples/check_one.rs
@@ -0,0 +1,72 @@
+//! Native single-constant check, mirroring the Zisk guest's reuse-mode path
+//! (`zisk/guest/src/main.rs`): resolve a Lean NAME through the env's `named`
+//! metadata to its ingress block, then `check_const` exactly that work item
+//! with the lazy-anon TypeChecker. Exists so the kernel's `IX_*`
+//! instrumentation (inert inside zkVM guests — see `src/lib.rs::env_var_os`)
+//! can be pointed at one expensive constant without re-checking a whole env.
+//!
+//!   IX_PERF_COUNTERS=1 cargo run --release --example check_one -- \
+//!     ~/ix/init.ixe '_private.Init.Data.Range.Polymorphic.SInt.0.Int16.instRxcHasSize_eq'
+
+use std::env;
+use std::fs;
+use std::time::Instant;
+
+use ix_kernel::anon_work::build_anon_work;
+use ix_kernel::env::KEnv;
+use ix_kernel::id::KId;
+use ix_kernel::mode::Anon;
+use ix_kernel::tc::TypeChecker;
+use ixon::constant::ConstantInfo as CI;
+use ixon::env::Env as IxonEnv;
+
+fn main() {
+  env_logger::builder()
+    .filter_level(log::LevelFilter::Info)
+    .format_timestamp(None)
+    .init();
+  let path = env::args().nth(1).expect("usage: check_one <ixe> <name>");
+  let name = env::args().nth(2).expect("usage: check_one <ixe> <name>");
+  let bytes = fs::read(&path).expect("read ixe");
+
+  let t_parse = Instant::now();
+  let env = IxonEnv::get(&mut &bytes[..]).expect("decode env");
+  eprintln!("[check_one] parse:   {:>8.2?}", t_parse.elapsed());
+
+  // Name → constant address → ingress block (mirrors zisk-host
+  // `block_of_addr`): a projection's `block`, else the address itself.
+  let addr = env
+    .named
+    .iter()
+    .find(|e| e.key().pretty() == name || e.key().to_string() == name)
+    .map_or_else(
+      || panic!("no constant named {name:?} in {path}"),
+      |e| e.value().addr.clone(),
+    );
+  let block = match env.get_const(&addr).map(|c| c.info.clone()) {
+    Some(CI::IPrj(p)) => p.block,
+    Some(CI::CPrj(p)) => p.block,
+    Some(CI::RPrj(p)) => p.block,
+    Some(CI::DPrj(p)) => p.block,
+    _ => addr.clone(),
+  };
+
+  let work = build_anon_work(&env).expect("build_anon_work");
+  let item =
+    work.iter().find(|it| *it.primary() == block).expect("work item for block");
+
+  let mut kenv = KEnv::<Anon>::new();
+  let mut tc = TypeChecker::<Anon>::new_with_lazy_anon(&mut kenv, &env);
+  let kid = KId::<Anon>::new(item.primary().clone(), ());
+  let t_check = Instant::now();
+  let result = tc.check_const(&kid);
+  eprintln!(
+    "[check_one] check:   {:>8.2?}  result: {:?}",
+    t_check.elapsed(),
+    result.is_ok()
+  );
+  if let Err(e) = result {
+    eprintln!("[check_one] error: {e:?}");
+    std::process::exit(1);
+  }
+}
diff --git a/crates/kernel/src/def_eq.rs b/crates/kernel/src/def_eq.rs
index c0868636..fb873cfc 100644
--- a/crates/kernel/src/def_eq.rs
+++ b/crates/kernel/src/def_eq.rs
@@ -21,6 +21,7 @@ use super::subst::{instantiate_rev, lift};
 use super::tc::{
   MAX_DEF_EQ_DEPTH, MAX_WHNF_FUEL, TypeChecker, collect_app_spine,
 };
+use super::whnf::PrimFamily;
 
 /// When set, trace every `is_def_eq` call where one side's head constant
 /// starts with the prefix in `IX_DEF_EQ_TRACE` (e.g. `IX_DEF_EQ_TRACE=bmod`
@@ -349,11 +350,17 @@ impl<M: KernelMode> TypeChecker<'_, M> {
       // primitives entirely when either side has a free variable, unless
       // eagerReduce is active.
       let nat_ok = (!wa.has_fvars() && !wb.has_fvars()) || self.eager_reduce;
+      let fam_a = self.head_prim_family(&wa);
+      let fam_b = self.head_prim_family(&wb);
       if nat_ok {
-        if let Some(wa2) = self.try_reduce_nat(&wa)? {
+        if fam_a == PrimFamily::Nat
+          && let Some(wa2) = self.try_reduce_nat(&wa)?
+        {
           return self.is_def_eq(&wa2, &wb);
         }
-        if let Some(wb2) = self.try_reduce_nat(&wb)? {
+        if fam_b == PrimFamily::Nat
+          && let Some(wb2) = self.try_reduce_nat(&wb)?
+        {
           return self.is_def_eq(&wa, &wb2);
         }
       }
@@ -363,17 +370,25 @@ impl<M: KernelMode> TypeChecker<'_, M> {
       // (lean4 `type_checker.cpp:986-991`, lean4lean `TypeChecker.lean:625-628`).
       // Ix-specific `try_reduce_decidable` runs after native to keep the
       // reference-aligned segment tight.
-      if let Some(wa2) = self.try_reduce_native(&wa)? {
+      if fam_a == PrimFamily::Native
+        && let Some(wa2) = self.try_reduce_native(&wa)?
+      {
         return self.is_def_eq(&wa2, &wb);
       }
-      if let Some(wb2) = self.try_reduce_native(&wb)? {
+      if fam_b == PrimFamily::Native
+        && let Some(wb2) = self.try_reduce_native(&wb)?
+      {
         return self.is_def_eq(&wa, &wb2);
       }
 
-      if let Some(wa2) = self.try_reduce_decidable(&wa)? {
+      if fam_a == PrimFamily::Decidable
+        && let Some(wa2) = self.try_reduce_decidable(&wa)?
+      {
         return self.is_def_eq(&wa2, &wb);
       }
-      if let Some(wb2) = self.try_reduce_decidable(&wb)? {
+      if fam_b == PrimFamily::Decidable
+        && let Some(wb2) = self.try_reduce_decidable(&wb)?
+      {
         return self.is_def_eq(&wa, &wb2);
       }
 
@@ -920,6 +935,26 @@ impl<M: KernelMode> TypeChecker<'_, M> {
     }
   }
 
+  /// Allocation-free check that `e` could decompose to `base + offset`:
+  /// a Nat literal, `Nat.zero`/`Nat.succ`, or an app whose head constant is
+  /// `Nat.succ`/`Nat.add`. Walks the app chain by reference — no spine Vec.
+  fn nat_offset_candidate(&self, e: &KExpr<M>) -> bool {
+    let mut cur = e;
+    loop {
+      match cur.data() {
+        ExprData::Nat(..) => return true,
+        ExprData::Const(id, _, _) => {
+          let p = &self.prims;
+          return id.addr == p.nat_zero.addr
+            || id.addr == p.nat_succ.addr
+            || id.addr == p.nat_add.addr;
+        },
+        ExprData::App(f, _, _) => cur = f,
+        _ => return false,
+      }
+    }
+  }
+
   /// If expression is nat-succ, return the predecessor.
   /// Matches both `Nat(n+1)` → `Nat(n)` and `Nat.succ e` → `e`.
   fn nat_succ_of(&mut self, e: &KExpr<M>) -> Option<KExpr<M>> {
@@ -965,8 +1000,16 @@ impl<M: KernelMode> TypeChecker<'_, M> {
     }
   }
 
-  /// M2: Nat offset reduction for lazy delta loop (lean4lean isDefEqOffset).
-  /// Returns Some(true/false) if both are nat-zero or nat-succ, None otherwise.
+  /// M2: Nat offset reduction for lazy delta loop (lean4lean isDefEqOffset),
+  /// generalized to offset form: each side decomposes to `base + offset`
+  /// (`Lit n`, `succ` layers, and `Nat.add base (Lit m)` — the compact stuck
+  /// form WHNF now leaves — all read in O(1) per layer), the shared offset
+  /// is stripped in ONE step, and the remainders compare through full
+  /// def-eq. This collapses `succ^k(x) ≟ succ^k(x)` from k `is_def_eq`
+  /// recursion levels (which blew `MAX_DEF_EQ_DEPTH` for large k) to one.
+  /// Stripping is verdict-preserving: `+k` is definitionally injective, the
+  /// same semantics the previous one-succ-peel already relied on.
+  /// Non-offset shapes fall back (`None`) to the generic path unchanged.
   fn try_def_eq_offset(
     &mut self,
     a: &KExpr<M>,
@@ -981,12 +1024,27 @@ impl<M: KernelMode> TypeChecker<'_, M> {
     if self.is_nat_zero(a) && self.is_nat_zero(b) {
       return Ok(Some(true));
     }
-    match (self.nat_succ_of(a), self.nat_succ_of(b)) {
-      (Some(a_pred), Some(b_pred)) => {
-        Ok(Some(self.is_def_eq(&a_pred, &b_pred)?))
-      },
-      _ => Ok(None),
+    // Allocation-free quick reject: decompose walks app spines, so only run
+    // it when both heads are plausibly offset-shaped (the old one-succ-peel
+    // rejected non-Nat shapes in O(1) off `e.data()` — keep that property).
+    if !self.nat_offset_candidate(a) || !self.nat_offset_candidate(b) {
+      return Ok(None);
+    }
+    let Some((base_a, ka)) = self.nat_offset_decompose(a)? else {
+      return Ok(None);
+    };
+    let Some((base_b, kb)) = self.nat_offset_decompose(b)? else {
+      return Ok(None);
+    };
+    let k = ka.0.clone().min(kb.0.clone());
+    if k == num_bigint::BigUint::ZERO {
+      // No shared offset to strip (e.g. literal 0 vs offset-shaped): defer
+      // to the generic path, matching the previous conservative behavior.
+      return Ok(None);
     }
+    let ra = self.nat_offset_rebuild(base_a, bignat::Nat(ka.0 - &k));
+    let rb = self.nat_offset_rebuild(base_b, bignat::Nat(kb.0 - &k));
+    Ok(Some(self.is_def_eq(&ra, &rb)?))
   }
 
   // -----------------------------------------------------------------------
@@ -1630,12 +1688,12 @@ fn compact_def_eq_head<M: KernelMode>(e: &KExpr<M>) -> String {
 }
 
 fn short_def_eq_addr<M: KernelMode>(e: &KExpr<M>) -> String {
-  e.addr().to_hex().chars().take(12).collect()
+  format!("uid{}", e.addr())
 }
 
-/// Canonical ordering for cache keys: (min, max) by hash bytes.
+/// Canonical ordering for cache keys: (min, max) by uid.
 fn canonical_pair(a: Addr, b: Addr) -> (Addr, Addr) {
-  if a.as_bytes() <= b.as_bytes() { (a, b) } else { (b, a) }
+  if a <= b { (a, b) } else { (b, a) }
 }
 
 /// Extract head constant KId from expression or app spine.
@@ -1900,6 +1958,10 @@ mod tests {
     );
     let plain = ME::cnst(id, Box::new([]));
 
+    // Interning ignores mdata (the shallow Const key carries only the
+    // address + universe uids), so both collapse to one canonical node.
+    let tagged = tc.intern(tagged);
+    let plain = tc.intern(plain);
     assert_eq!(tagged.addr(), plain.addr());
     assert!(tc.is_def_eq(&tagged, &plain).unwrap());
   }
diff --git a/crates/kernel/src/env.rs b/crates/kernel/src/env.rs
index f93a372e..a1e3d5a8 100644
--- a/crates/kernel/src/env.rs
+++ b/crates/kernel/src/env.rs
@@ -22,27 +22,69 @@ use super::mode::KernelMode;
 use super::perf::PerfCounters;
 use super::primitive::Primitives;
 
-/// Content-addressed Merkle hash. 32 bytes, `Copy`, no allocation.
+/// Canonical identity of an expression or universe node: the
+/// intern-assigned uid. Plain `u64`, allocated from a process-global
+/// counter (`expr.rs::fresh_uid`) and NEVER reused, so uid equality
+/// implies structural equality and cache keys built from uids cannot
+/// alias across intern-table clears (a stale key can only miss).
 ///
-/// Earlier revisions stored `Addr = Arc<blake3::Hash>` and threaded all
-/// constructions through a process-global `DashMap` intern table to dedup
-/// the inner allocation. On full-mathlib kernel-check runs that table grew
-/// to 100M+ entries (≈8+ GiB) and dominated RSS, even though the per-worker
-/// `KEnv` caches were correctly cleared per scheduled block. Switching to a
-/// `Copy` value drops the global intern, eliminates one allocation per
-/// `KExpr`/`KUniv` construction, and reduces per-`ExprData` overhead
-/// from `Arc<Hash>` (8-byte pointer + 16-byte heap header + 32-byte
-/// Hash) to a single in-place 32-byte field. Identity comparison falls
-/// back from `Arc::ptr_eq` (single pointer compare) to a 32-byte memcmp,
-/// which is a single AVX2 cycle on modern x86 and dominated by the
-/// surrounding kernel work.
-pub type Addr = blake3::Hash;
+/// This is KERNEL-INTERNAL identity for ephemeral in-memory nodes —
+/// distinct from the Ixon `Address` layer (blake3 over serialized
+/// content) that constants/blobs carry into claims, Merkle roots, and
+/// the proof store. Uids must never escape into a serialized artifact;
+/// see `docs/kernel_identity.md` for the boundary rule and the
+/// proof-carrying-code implications.
+///
+/// Earlier revisions stored the blake3 content hash of every node here,
+/// computed at construction: profiling on the Zisk guest put that hashing
+/// (`app_hash` + the blake3 wrapper) at ~20% of guest cycles on
+/// reduction-heavy constants. Identity is now assigned by the intern
+/// table from shallow structural keys ([`ExprKey`]/[`UnivKey`]) instead
+/// of computed from content.
+pub type Addr = u64;
+
+/// Key type for local-context hashing (`tc.rs::ctx_addr_for_lbr`) and
+/// other places that still need a collision-resistant digest over
+/// variable-length content. Per-node expression identity uses [`Addr`].
+pub type CtxAddr = blake3::Hash;
+
+/// Shallow structural key of an expression node: the variant tag plus the
+/// uids of its children and its semantic payload. Mirrors EXACTLY what the
+/// historical per-node content hash covered — display names, binder info,
+/// and mdata are excluded — so interning semantics are unchanged. Children
+/// are identified by uid: within one table, equal keys ⇔ structurally
+/// equal nodes (children canonical by induction).
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub enum ExprKey {
+  Var(u64),
+  FVar(u64),
+  Sort(Addr),
+  Const(Address, Box<[Addr]>),
+  App(Addr, Addr),
+  Lam(Addr, Addr),
+  All(Addr, Addr),
+  Let(Addr, Addr, Addr, bool),
+  Prj(Address, u64, Addr),
+  Nat(Address),
+  Str(Address),
+}
+
+/// Shallow structural key of a universe node. `Param` display names are
+/// excluded, mirroring the historical hash semantics.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub enum UnivKey {
+  Zero,
+  Succ(Addr),
+  Max(Addr, Addr),
+  IMax(Addr, Addr),
+  Param(u64),
+}
 
 /// Hash-consing intern table for expressions and universes.
 ///
-/// Single-threaded and owned by one `KEnv`. Guarantees pointer uniqueness
-/// by blake3 hash within that environment: `ptr(a) == ptr(b)` iff
-/// `hash(a) == hash(b)`.
+/// Single-threaded and owned by one `KEnv`. Guarantees uid/pointer
+/// uniqueness per shallow structural key within that environment:
+/// `ptr(a) == ptr(b)` iff `key(a) == key(b)` for interned values.
 ///
 /// Also owns reusable scratch buffers used by `subst`, `simul_subst`, and
 /// `lift` to memoize content-addressed sub-traversals within a single
@@ -52,8 +94,14 @@ pub type Addr = blake3::Hash;
 /// already passed for hash-consing eliminates the malloc/free churn while
 /// keeping the per-call invariant (caches are cleared on entry).
 pub struct InternTable<M: KernelMode> {
-  pub(crate) univs: FxHashMap<blake3::Hash, KUniv<M>>,
-  pub(crate) exprs: FxHashMap<blake3::Hash, KExpr<M>>,
+  pub(crate) univs: FxHashMap<UnivKey, KUniv<M>>,
+  pub(crate) exprs: FxHashMap<ExprKey, KExpr<M>>,
+  /// Uids of canonical (interned) nodes. `intern_expr`/`intern_univ`
+  /// short-circuit on members, and use it to detect non-canonical
+  /// children that must be canonicalized before the shallow key is
+  /// meaningful.
+  pub(crate) canon_exprs: FxHashSet<Addr>,
+  pub(crate) canon_univs: FxHashSet<Addr>,
   /// Scratch buffer for `subst` / `simul_subst` per-call memoization,
   /// keyed by `(addr, depth)`. Cleared on entry. Owned here so the
   /// allocation persists across calls.
@@ -63,6 +111,14 @@ pub struct InternTable<M: KernelMode> {
   /// because `lift` is invoked from inside `subst_cached`, and the two
   /// caches have different semantics, so they must not share entries.
   pub(crate) lift_scratch: FxHashMap<(Addr, u64), KExpr<M>>,
+  /// Pool of scratch maps for `clo_subst` per-call memoization, keyed by
+  /// `(addr, depth)`. A pool rather than a single buffer because
+  /// `clo_subst` re-enters itself through `clo_readback` of environment
+  /// entries (each readback substitutes under a *different* environment,
+  /// so the memo must not be shared between the nesting levels). Maps are
+  /// cleared and returned to the pool on exit, so allocations persist
+  /// across calls like the other scratches.
+  pub(crate) clo_scratch_pool: Vec<FxHashMap<(Addr, u64), KExpr<M>>>,
 }
 
 impl<M: KernelMode> Default for InternTable<M> {
@@ -71,50 +127,201 @@ impl<M: KernelMode> Default for InternTable<M> {
   }
 }
 
+/// Shallow structural key of an expression whose children are canonical.
+pub fn expr_key<M: KernelMode>(e: &KExpr<M>) -> ExprKey {
+  use super::expr::ExprData;
+  match e.data() {
+    ExprData::Var(i, _, _) => ExprKey::Var(*i),
+    ExprData::FVar(id, _, _) => ExprKey::FVar(id.0),
+    ExprData::Sort(u, _) => ExprKey::Sort(*u.addr()),
+    ExprData::Const(id, us, _) => {
+      ExprKey::Const(id.addr.clone(), us.iter().map(|u| *u.addr()).collect())
+    },
+    ExprData::App(f, a, _) => ExprKey::App(*f.addr(), *a.addr()),
+    ExprData::Lam(_, _, t, b, _) => ExprKey::Lam(*t.addr(), *b.addr()),
+    ExprData::All(_, _, t, b, _) => ExprKey::All(*t.addr(), *b.addr()),
+    ExprData::Let(_, t, v, b, nd, _) => {
+      ExprKey::Let(*t.addr(), *v.addr(), *b.addr(), *nd)
+    },
+    ExprData::Prj(id, f, v, _) => ExprKey::Prj(id.addr.clone(), *f, *v.addr()),
+    ExprData::Nat(_, ba, _) => ExprKey::Nat(ba.clone()),
+    ExprData::Str(_, ba, _) => ExprKey::Str(ba.clone()),
+  }
+}
+
+/// Shallow structural key of a universe whose children are canonical.
+pub fn univ_key<M: KernelMode>(u: &KUniv<M>) -> UnivKey {
+  use super::level::UnivData;
+  match u.data() {
+    UnivData::Zero(_) => UnivKey::Zero,
+    UnivData::Succ(inner, _) => UnivKey::Succ(*inner.addr()),
+    UnivData::Max(a, b, _) => UnivKey::Max(*a.addr(), *b.addr()),
+    UnivData::IMax(a, b, _) => UnivKey::IMax(*a.addr(), *b.addr()),
+    UnivData::Param(idx, _, _) => UnivKey::Param(*idx),
+  }
+}
+
 impl<M: KernelMode> InternTable<M> {
   pub fn new() -> Self {
     InternTable {
       univs: FxHashMap::default(),
       exprs: FxHashMap::default(),
+      canon_exprs: FxHashSet::default(),
+      canon_univs: FxHashSet::default(),
       subst_scratch: FxHashMap::default(),
       lift_scratch: FxHashMap::default(),
+      clo_scratch_pool: Vec::new(),
     }
   }
 
   /// Read-only fast path: return the canonical interned universe for
-  /// `hash` if already present. Used by instrumented callers that want
-  /// to record hit/miss separately; plain callers should use
-  /// `intern_univ`.
+  /// `key` if already present. The key must be built from CANONICAL
+  /// children (their uids). Plain callers should use `intern_univ`.
   #[inline]
-  pub fn try_get_univ(&self, hash: &blake3::Hash) -> Option<KUniv<M>> {
-    self.univs.get(hash).cloned()
+  pub fn try_get_univ(&self, key: &UnivKey) -> Option<KUniv<M>> {
+    self.univs.get(key).cloned()
   }
 
   /// Read-only fast path counterpart of `try_get_univ` for expressions.
   #[inline]
-  pub fn try_get_expr(&self, hash: &blake3::Hash) -> Option<KExpr<M>> {
-    self.exprs.get(hash).cloned()
+  pub fn try_get_expr(&self, key: &ExprKey) -> Option<KExpr<M>> {
+    self.exprs.get(key).cloned()
   }
 
-  /// Intern a universe: if one with the same hash exists, return the
-  /// existing Arc (ensuring pointer uniqueness). Otherwise insert and
-  /// return.
+  /// Intern a universe: returns the canonical value for its structural
+  /// identity, recursively canonicalizing children as needed so the
+  /// shallow key is meaningful.
   pub fn intern_univ(&mut self, u: KUniv<M>) -> KUniv<M> {
-    let key = *u.addr();
+    use super::level::UnivData;
+    if self.canon_univs.contains(u.addr()) {
+      return u;
+    }
+    // Canonicalize children first; rebuild only if any child changed.
+    let u = match u.data() {
+      UnivData::Succ(inner, _) => {
+        let ci = self.intern_univ(inner.clone());
+        if ci.ptr_eq(inner) {
+          u
+        } else {
+          KUniv::new(UnivData::Succ(ci, super::expr::fresh_uid()))
+        }
+      },
+      UnivData::Max(a, b, _) => {
+        let ca = self.intern_univ(a.clone());
+        let cb = self.intern_univ(b.clone());
+        if ca.ptr_eq(a) && cb.ptr_eq(b) {
+          u
+        } else {
+          KUniv::new(UnivData::Max(ca, cb, super::expr::fresh_uid()))
+        }
+      },
+      UnivData::IMax(a, b, _) => {
+        let ca = self.intern_univ(a.clone());
+        let cb = self.intern_univ(b.clone());
+        if ca.ptr_eq(a) && cb.ptr_eq(b) {
+          u
+        } else {
+          KUniv::new(UnivData::IMax(ca, cb, super::expr::fresh_uid()))
+        }
+      },
+      UnivData::Zero(_) | UnivData::Param(..) => u,
+    };
+    let key = univ_key(&u);
     if let Some(existing) = self.univs.get(&key) {
       return existing.clone();
     }
-    self.univs.entry(key).or_insert(u).clone()
+    self.canon_univs.insert(*u.addr());
+    self.univs.insert(key, u.clone());
+    u
   }
 
-  /// Intern an expression: same pointer-uniqueness guarantee as
-  /// `intern_univ`.
+  /// Intern an expression: returns the canonical value for its structural
+  /// identity. Children are canonicalized recursively when needed (a node
+  /// built outside the table has non-canonical children whose uids would
+  /// make the shallow key meaningless), preserving the historical
+  /// content-hash interning semantics.
   pub fn intern_expr(&mut self, e: KExpr<M>) -> KExpr<M> {
-    let key = *e.addr();
+    use super::expr::ExprData;
+    if self.canon_exprs.contains(e.addr()) {
+      return e;
+    }
+    let e = match e.data() {
+      ExprData::Sort(un, _) => {
+        let cu = self.intern_univ(un.clone());
+        if cu.ptr_eq(un) { e } else { KExpr::sort_mdata(cu, e.mdata().clone()) }
+      },
+      ExprData::Const(id, us, _) => {
+        let cus: Box<[KUniv<M>]> =
+          us.iter().map(|un| self.intern_univ(un.clone())).collect();
+        if cus.iter().zip(us.iter()).all(|(a, b)| a.ptr_eq(b)) {
+          e
+        } else {
+          KExpr::cnst_mdata(id.clone(), cus, e.mdata().clone())
+        }
+      },
+      ExprData::App(f, a, _) => {
+        let cf = self.intern_expr(f.clone());
+        let ca = self.intern_expr(a.clone());
+        if cf.ptr_eq(f) && ca.ptr_eq(a) {
+          e
+        } else {
+          KExpr::app_mdata(cf, ca, e.mdata().clone())
+        }
+      },
+      ExprData::Lam(n, bi, t, b, _) => {
+        let ct = self.intern_expr(t.clone());
+        let cb = self.intern_expr(b.clone());
+        if ct.ptr_eq(t) && cb.ptr_eq(b) {
+          e
+        } else {
+          KExpr::lam_mdata(n.clone(), bi.clone(), ct, cb, e.mdata().clone())
+        }
+      },
+      ExprData::All(n, bi, t, b, _) => {
+        let ct = self.intern_expr(t.clone());
+        let cb = self.intern_expr(b.clone());
+        if ct.ptr_eq(t) && cb.ptr_eq(b) {
+          e
+        } else {
+          KExpr::all_mdata(n.clone(), bi.clone(), ct, cb, e.mdata().clone())
+        }
+      },
+      ExprData::Let(n, t, v, b, nd, _) => {
+        let ct = self.intern_expr(t.clone());
+        let cv = self.intern_expr(v.clone());
+        let cb = self.intern_expr(b.clone());
+        if ct.ptr_eq(t) && cv.ptr_eq(v) && cb.ptr_eq(b) {
+          e
+        } else {
+          KExpr::let_mdata(n.clone(), ct, cv, cb, *nd, e.mdata().clone())
+        }
+      },
+      ExprData::Prj(id, f, v, _) => {
+        let cv = self.intern_expr(v.clone());
+        if cv.ptr_eq(v) {
+          e
+        } else {
+          KExpr::prj_mdata(id.clone(), *f, cv, e.mdata().clone())
+        }
+      },
+      ExprData::Var(..)
+      | ExprData::FVar(..)
+      | ExprData::Nat(..)
+      | ExprData::Str(..) => e,
+    };
+    let key = expr_key(&e);
     if let Some(existing) = self.exprs.get(&key) {
+      // The shallow key (exact structural Eq over variant tag + child
+      // uids + payload — never a truncated or content-hashed key) plus
+      // canonical children make this hit structurally exact. Checked in
+      // debug builds; a violation here would be an interning bug, not
+      // an input an adversary can craft (uids are assigned, not hashed).
+      debug_assert!(existing == &e, "intern hit is not structurally equal");
       return existing.clone();
     }
-    self.exprs.entry(key).or_insert(e).clone()
+    self.canon_exprs.insert(*e.addr());
+    self.exprs.insert(key, e.clone());
+    e
   }
 }
 
@@ -254,9 +461,9 @@ pub struct KEnv<M: KernelMode> {
   // than `Arc::as_ptr` pointers, avoiding the ABA problem where deallocated
   // pointers are reused by the allocator for semantically different expressions.
   /// WHNF cache (full, with delta): (expr_hash, ctx_hash)-keyed.
-  pub whnf_cache: FxHashMap<(Addr, Addr), KExpr<M>>,
+  pub whnf_cache: FxHashMap<(Addr, CtxAddr), KExpr<M>>,
   /// WHNF cache (no delta): (expr_hash, ctx_hash)-keyed.
-  pub whnf_no_delta_cache: FxHashMap<(Addr, Addr), KExpr<M>>,
+  pub whnf_no_delta_cache: FxHashMap<(Addr, CtxAddr), KExpr<M>>,
   /// Cheap-mode WHNF cache (no delta, DEF_EQ_CORE flags): same key shape as
   /// `whnf_no_delta_cache`, but populated by cheap-projection callers in the
   /// def-eq lazy-delta loop. Cheap output is NOT shared with full callers
@@ -265,41 +472,41 @@ pub struct KEnv<M: KernelMode> {
   /// gated to cheap-mode callers only — mirrors the `def_eq_cheap_cache`
   /// pattern. Without this, every iteration of the lazy-delta loop redoes
   /// `whnf_no_delta_for_def_eq` from scratch (mathlib hot path).
-  pub whnf_no_delta_cheap_cache: FxHashMap<(Addr, Addr), KExpr<M>>,
+  pub whnf_no_delta_cheap_cache: FxHashMap<(Addr, CtxAddr), KExpr<M>>,
   /// WHNF core cache: structural-only reduction (beta/iota/zeta/proj),
   /// no native primitives, no delta. Mirrors lean4lean's `whnfCoreCache`
   /// (refs/lean4lean/Lean4Lean/TypeChecker.lean:19) and lean4 C++'s
   /// `m_whnf_core`. Populated only when flags are FULL — cheap-projection
   /// results are not safe to share with full callers.
-  pub whnf_core_cache: FxHashMap<(Addr, Addr), KExpr<M>>,
+  pub whnf_core_cache: FxHashMap<(Addr, CtxAddr), KExpr<M>>,
   /// Cheap-mode WHNF core cache: same key shape as `whnf_core_cache`, but
   /// populated by cheap-projection callers (DEF_EQ_CORE flags) inside the
   /// def-eq lazy-delta loop. Same soundness reasoning as
   /// `whnf_no_delta_cheap_cache` — cheap output stays in its own pool so
   /// full callers always see a properly-reduced result.
-  pub whnf_core_cheap_cache: FxHashMap<(Addr, Addr), KExpr<M>>,
+  pub whnf_core_cheap_cache: FxHashMap<(Addr, CtxAddr), KExpr<M>>,
   /// Infer cache: keyed by (expr_hash, ctx_hash). Context-dependent.
   /// Populated only from full-mode `infer` (i.e. not from `with_infer_only`),
   /// so every cached result has passed the validation `infer_only` skips.
   /// Both modes read from this same cache — an `infer_only` lookup happily
   /// consumes a full-mode result since it's strictly stronger.
-  pub infer_cache: FxHashMap<(Addr, Addr), KExpr<M>>,
+  pub infer_cache: FxHashMap<(Addr, CtxAddr), KExpr<M>>,
   /// Infer-only cache: keyed like `infer_cache`, but populated only by
   /// `with_infer_only` synthesis and read only while infer-only is active.
   /// This keeps unchecked results out of the validated full-mode cache while
   /// still sharing repeated proof-irrelevance/projection probes.
-  pub infer_only_cache: FxHashMap<(Addr, Addr), KExpr<M>>,
+  pub infer_only_cache: FxHashMap<(Addr, CtxAddr), KExpr<M>>,
   /// Full def-eq cache: keyed by (expr_hash, expr_hash, ctx_hash).
   /// Context-dependent. Entries in this cache are valid for both full and
   /// cheap def-eq callers.
-  pub def_eq_cache: FxHashMap<(Addr, Addr, Addr), bool>,
+  pub def_eq_cache: FxHashMap<(Addr, Addr, CtxAddr), bool>,
   /// Cheap def-eq cache: same key as `def_eq_cache`, but only for comparisons
   /// performed inside cheap projection reductions. Cheap `false` can be a
   /// full-mode false negative, so those entries must not be visible to full
   /// callers.
-  pub def_eq_cheap_cache: FxHashMap<(Addr, Addr, Addr), bool>,
+  pub def_eq_cheap_cache: FxHashMap<(Addr, Addr, CtxAddr), bool>,
   /// Failed def-eq pairs in lazy delta: canonical ordering by hash.
-  pub def_eq_failure: FxHashSet<(Addr, Addr, Addr)>,
+  pub def_eq_failure: FxHashSet<(Addr, Addr, CtxAddr)>,
   /// Constant-instantiation cache: caches the result of
   /// `instantiate_univ_params(val, us)` for each `Const(id, us)` head encountered
   /// during delta unfolding. Keyed by the head expression's content hash, which
@@ -314,11 +521,11 @@ pub struct KEnv<M: KernelMode> {
   /// this memo a stuck `Nat.succ^k(x)` chain is re-peeled from every
   /// depth it is encountered at — O(k²) fuel for symbolic-plus-literal
   /// Nat arithmetic (e.g. `x + 0xC0` in the UTF-8 codec proofs).
-  pub nat_succ_stuck: FxHashSet<(Addr, Addr)>,
+  pub nat_succ_stuck: FxHashSet<(Addr, CtxAddr)>,
   /// Ingress cache: LeanExpr → KExpr conversion results.
   /// Keyed by (expr_hash, param_names_hash) to account for different
   /// level param bindings producing different KExprs from the same LeanExpr.
-  pub ingress_cache: FxHashMap<(Addr, Addr), KExpr<M>>,
+  pub ingress_cache: FxHashMap<(CtxAddr, CtxAddr), KExpr<M>>,
   /// "Is this type Prop?" cache, keyed by (type_hash, ctx_hash).
   ///
   /// `try_proof_irrel` is called on essentially every `is_def_eq`
@@ -330,7 +537,7 @@ pub struct KEnv<M: KernelMode> {
   /// proof-irrelevance probe skip those three calls. Empirically this
   /// is the dominant cost on mathlib proof-heavy blocks, where the same
   /// propositions are tested for equality thousands of times.
-  pub is_prop_cache: FxHashMap<(Addr, Addr), bool>,
+  pub is_prop_cache: FxHashMap<(Addr, CtxAddr), bool>,
   /// Generated recursors, keyed by inductive Muts block id.
   pub recursor_cache: FxHashMap<KId<M>, Vec<GeneratedRecursor<M>>>,
   /// Nested-auxiliary order expected by stored recursors in this environment.
@@ -347,6 +554,12 @@ pub struct KEnv<M: KernelMode> {
   /// so every member of a bad block reports the same structured failure.
   pub block_check_results: FxHashMap<KId<M>, Result<(), TcError<M>>>,
 
+  /// Primitive-reducer family per head-constant address (see
+  /// `whnf.rs::PrimFamily`). Pure function of the address; memoized so
+  /// the WHNF loops classify each head with one map probe instead of a
+  /// ~30-address compare gauntlet across five recognizers.
+  pub prim_family_cache: FxHashMap<Address, super::whnf::PrimFamily>,
+
   /// Next free-variable id for checker-local binder openings.
   ///
   /// Type-checking caches live on `KEnv`, not on one `TypeChecker`, so FVar
@@ -421,6 +634,7 @@ impl<M: KernelMode> KEnv<M> {
       rec_majors_cache: FxHashMap::default(),
       block_peer_agreement_cache: FxHashSet::default(),
       block_check_results: FxHashMap::default(),
+      prim_family_cache: FxHashMap::default(),
       next_fvar_id: 0,
       perf: PerfCounters::default(),
       profile_sink: None,
@@ -511,11 +725,14 @@ impl<M: KernelMode> KEnv<M> {
     self.blocks.clear();
     self.intern.univs.clear();
     self.intern.exprs.clear();
+    self.intern.canon_exprs.clear();
+    self.intern.canon_univs.clear();
     // Scratch buffers retain entries from the most recent subst/lift call;
     // emptying them releases the KExpr Arc references they hold so the
     // intern.exprs cleanup above can actually drop ExprData allocations.
     self.intern.subst_scratch.clear();
     self.intern.lift_scratch.clear();
+    self.intern.clo_scratch_pool.clear();
     let _ = self.prims.take();
     self.whnf_cache.clear();
     self.whnf_no_delta_cache.clear();
@@ -535,6 +752,7 @@ impl<M: KernelMode> KEnv<M> {
     self.rec_majors_cache.clear();
     self.block_peer_agreement_cache.clear();
     self.block_check_results.clear();
+    self.prim_family_cache.clear();
     self.next_fvar_id = 0;
   }
 
@@ -597,6 +815,7 @@ impl<M: KernelMode> KEnv<M> {
     self.rec_majors_cache = FxHashMap::default();
     self.block_peer_agreement_cache = FxHashSet::default();
     self.block_check_results = FxHashMap::default();
+    self.prim_family_cache = FxHashMap::default();
     self.next_fvar_id = 0;
   }
 
diff --git a/crates/kernel/src/equiv.rs b/crates/kernel/src/equiv.rs
index aeec2727..2bde4d1c 100644
--- a/crates/kernel/src/equiv.rs
+++ b/crates/kernel/src/equiv.rs
@@ -6,10 +6,10 @@
 
 use rustc_hash::FxHashMap;
 
-use super::env::Addr;
+use super::env::{Addr, CtxAddr};
 
 /// Composite key: (expression content hash, context content hash).
-pub type EqKey = (Addr, Addr);
+pub type EqKey = (Addr, CtxAddr);
 
 /// Union-find structure for tracking definitional equality between
 /// (expr_hash, ctx_hash) pairs.
@@ -141,13 +141,17 @@ mod tests {
   use super::*;
 
   fn addr(n: u64) -> Addr {
+    n
+  }
+
+  fn ctx(n: u64) -> CtxAddr {
     blake3::hash(&n.to_le_bytes())
   }
 
   #[test]
   fn test_basic_equiv() {
     let mut em = EquivManager::new();
-    let zero = addr(0);
+    let zero = ctx(0);
     assert!(!em.is_equiv(&(addr(100), zero), &(addr(200), zero)));
     em.add_equiv((addr(100), zero), (addr(200), zero));
     assert!(em.is_equiv(&(addr(100), zero), &(addr(200), zero)));
@@ -157,7 +161,7 @@ mod tests {
   #[test]
   fn test_transitivity() {
     let mut em = EquivManager::new();
-    let zero = addr(0);
+    let zero = ctx(0);
     em.add_equiv((addr(100), zero), (addr(200), zero));
     em.add_equiv((addr(200), zero), (addr(300), zero));
     assert!(em.is_equiv(&(addr(100), zero), &(addr(300), zero)));
@@ -166,8 +170,8 @@ mod tests {
   #[test]
   fn test_context_isolation() {
     let mut em = EquivManager::new();
-    let ctx1 = addr(1);
-    let ctx2 = addr(2);
+    let ctx1 = ctx(1);
+    let ctx2 = ctx(2);
     em.add_equiv((addr(100), ctx1), (addr(200), ctx1));
     assert!(em.is_equiv(&(addr(100), ctx1), &(addr(200), ctx1)));
     assert!(!em.is_equiv(&(addr(100), ctx2), &(addr(200), ctx2)));
diff --git a/crates/kernel/src/expr.rs b/crates/kernel/src/expr.rs
index fe5ce40c..33a59916 100644
--- a/crates/kernel/src/expr.rs
+++ b/crates/kernel/src/expr.rs
@@ -8,10 +8,7 @@ use std::sync::Arc;
 
 use bignat::Nat;
 use ix_common::address::Address;
-use ix_common::env::{
-  BinderInfo, DataValue, EALL, EAPP, EFVAR, ELAM, ELET, ENAT, EPRJ, EREF,
-  ESORT, ESTR, EVAR, Name,
-};
+use ix_common::env::{BinderInfo, DataValue, Name};
 
 use super::env::Addr;
 use super::id::KId;
@@ -138,8 +135,10 @@ impl<M: KernelMode> KExpr<M> {
     &self.info().mdata
   }
 
-  /// Content-addressed key for cache lookups. Returns the blake3 hash
-  /// by value — `Addr` is `Copy`, so this is a 32-byte memcpy.
+  /// Canonical identity key for cache lookups: the intern-assigned uid.
+  /// `Addr` is a plain `u64`, allocated process-globally and never reused,
+  /// so uid equality implies "same construction event or same intern-table
+  /// canonical value" — strictly, structural equality.
   pub fn hash_key(&self) -> Addr {
     *self.addr()
   }
@@ -148,25 +147,73 @@ impl<M: KernelMode> KExpr<M> {
     Arc::ptr_eq(&self.0, &other.0)
   }
 
-  /// Content-addressed equality with a layered fast path.
-  ///
-  /// 1. `ptr_eq` on the outer `KExpr` Arc — fires when both sides
-  ///    came through the [`InternTable`](super::env::InternTable).
-  /// 2. 32-byte Blake3 hash compare — sound on its own (collisions
-  ///    require an adversarial preimage attack), and a single AVX2
-  ///    cycle on modern x86. Earlier revisions interposed an
-  ///    `Arc::ptr_eq` fast path on a process-globally-interned `Addr`,
-  ///    but that intern table dominated RSS at mathlib scale; the
-  ///    pure-content compare keeps the same correctness with no
-  ///    process-global state.
+  /// Canonical-identity equality: `ptr_eq` or equal intern uid. Sound in
+  /// the affirmative (equal uid ⇒ structurally equal); INCOMPLETE in the
+  /// negative — two structurally equal expressions built without sharing
+  /// an [`InternTable`](super::env::InternTable) carry distinct uids.
+  /// Callers that need a definitive verdict use `==` (structural with this
+  /// as the fast path); callers where a false negative merely costs a
+  /// cache miss or a def-eq fallback use this directly.
   pub fn hash_eq(&self, other: &KExpr<M>) -> bool {
     self.ptr_eq(other) || self.addr() == other.addr()
   }
+
+  /// Structural equality mirroring exactly what the intern identity
+  /// covers: display names, binder info, and mdata are NOT compared
+  /// (matching the historical content-hash semantics). Equal interned
+  /// subtrees prune at the uid fast path, so canonical-vs-canonical
+  /// comparison is O(1).
+  fn structural_eq(&self, other: &Self) -> bool {
+    if self.hash_eq(other) {
+      return true;
+    }
+    match (self.data(), other.data()) {
+      (ExprData::Var(i, _, _), ExprData::Var(j, _, _)) => i == j,
+      (ExprData::FVar(x, _, _), ExprData::FVar(y, _, _)) => x == y,
+      (ExprData::Sort(u, _), ExprData::Sort(v, _)) => u == v,
+      (ExprData::Const(id1, us1, _), ExprData::Const(id2, us2, _)) => {
+        id1.addr == id2.addr
+          && us1.len() == us2.len()
+          && us1.iter().zip(us2.iter()).all(|(a, b)| a == b)
+      },
+      (ExprData::App(f1, a1, _), ExprData::App(f2, a2, _)) => {
+        f1.structural_eq(f2) && a1.structural_eq(a2)
+      },
+      (ExprData::Lam(_, _, t1, b1, _), ExprData::Lam(_, _, t2, b2, _))
+      | (ExprData::All(_, _, t1, b1, _), ExprData::All(_, _, t2, b2, _)) => {
+        t1.structural_eq(t2) && b1.structural_eq(b2)
+      },
+      (
+        ExprData::Let(_, t1, v1, b1, nd1, _),
+        ExprData::Let(_, t2, v2, b2, nd2, _),
+      ) => {
+        nd1 == nd2
+          && t1.structural_eq(t2)
+          && v1.structural_eq(v2)
+          && b1.structural_eq(b2)
+      },
+      (ExprData::Prj(id1, f1, v1, _), ExprData::Prj(id2, f2, v2, _)) => {
+        id1.addr == id2.addr && f1 == f2 && v1.structural_eq(v2)
+      },
+      // Literals: the blob address IS the identity (mirroring the old
+      // content hash, which hashed only the address). The value conjunct
+      // is defense in depth: load-time blob verification makes it
+      // redundant, and if that invariant were ever violated it degrades
+      // to inequality — the conservative direction.
+      (ExprData::Nat(v1, ba1, _), ExprData::Nat(v2, ba2, _)) => {
+        ba1 == ba2 && v1 == v2
+      },
+      (ExprData::Str(v1, ba1, _), ExprData::Str(v2, ba2, _)) => {
+        ba1 == ba2 && v1 == v2
+      },
+      _ => false,
+    }
+  }
 }
 
 impl<M: KernelMode> PartialEq for KExpr<M> {
   fn eq(&self, other: &Self) -> bool {
-    self.hash_eq(other)
+    self.structural_eq(other)
   }
 }
 
@@ -186,18 +233,40 @@ fn mk_info<M: KernelMode>(
   ExprInfo { addr, lbr, count_0, has_fvars, mdata }
 }
 
+/// Process-global uid allocator for expression/universe identity.
+///
+/// Every constructed node takes a fresh uid, so uid equality means "same
+/// construction event" — or, after [`InternTable`](super::env::InternTable)
+/// hash-consing, "same canonical value". Uids are NEVER reused (the counter
+/// is global, not per-table), so cache keys built from uids stay sound
+/// across intern-table clears: a stale key can only miss, never alias.
+///
+/// This replaces the per-node blake3 content hash: profiling on the Zisk
+/// guest put `app_hash` + the blake3 wrapper at ~20% of cycles on
+/// reduction-heavy constants, all of it spent computing identity that the
+/// intern table can assign in one atomic increment.
+static NEXT_UID: std::sync::atomic::AtomicU64 =
+  std::sync::atomic::AtomicU64::new(1);
+
+#[inline]
+pub(crate) fn fresh_uid() -> Addr {
+  let uid = NEXT_UID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+  // Uid exhaustion guard. Uids are ASSIGNED (sequential), never computed
+  // from content, so two distinct terms can only alias if the counter
+  // wraps — which would take centuries at nanosecond allocation rates
+  // (and >2^67 guest cycles in-circuit). Abort rather than reason about
+  // it: identity soundness must not rest on "probably won't happen".
+  assert!(uid < u64::MAX, "kernel uid counter exhausted");
+  uid
+}
+
 // =============================================================================
-// Hash-first interning: each `*_mdata` constructor is split into a
-// hash-only function (no allocation) and a `*_mdata_with_addr` builder
-// that takes a precomputed canonical [`Addr`]. The plain `*_mdata` form is
-// kept as a convenience wrapper for callers that don't pre-check the
-// intern table.
-//
-// Hot-path callers in `ingress.rs` use the split form so they can ask
-// `InternTable::try_get_expr(&hash)` *before* paying the
-// blake3-hash + `intern_addr` + `Arc<ExprData>` allocation cost — a
-// significant win because >60% of constructed values are immediately
-// discarded for an existing canonical Arc on the intern table.
+// Constructors: each `*_mdata` form allocates a fresh uid; the
+// `*_mdata_with_addr` builders remain for the intern table and callers
+// that already hold a fresh uid. Structural identity (what used to be the
+// content hash) is decided by `InternTable`'s shallow keys — see
+// `env.rs::ExprKey` — which mirror what the historical hash functions
+// covered: names, binder info, and mdata are excluded.
 // =============================================================================
 
 impl<M: KernelMode> KExpr<M> {
@@ -207,21 +276,7 @@ impl<M: KernelMode> KExpr<M> {
 
   /// Compute the content hash for [`KExpr::var_mdata`] without allocating.
   ///
-  /// `name` is descriptive metadata only and intentionally NOT hashed —
-  /// two `Var(i)` nodes with different display names are content-equal,
-  /// keeping hash equality alpha-invariant even in `Meta` mode.
-  pub fn var_hash(
-    idx: u64,
-    _name: &M::MField<Name>,
-    _mdata: &M::MField<Vec<MData>>,
-  ) -> blake3::Hash {
-    let mut h = blake3::Hasher::new();
-    h.update(&[EVAR]);
-    h.update(&idx.to_le_bytes());
-    h.finalize()
-  }
-
-  pub fn var_mdata_with_addr(
+  fn var_mdata_with_addr(
     idx: u64,
     name: M::MField<Name>,
     mdata: M::MField<Vec<MData>>,
@@ -236,22 +291,7 @@ impl<M: KernelMode> KExpr<M> {
     Self::fvar_mdata(id, name, no_mdata::<M>())
   }
 
-  /// Compute the content hash for [`KExpr::fvar_mdata`] without allocating.
-  /// Includes the [`FVarId`] so distinct fvars produce distinct hashes — the
-  /// soundness lever for keying caches by expression alone. `name` is
-  /// descriptive only and intentionally NOT hashed.
-  pub fn fvar_hash(
-    id: FVarId,
-    _name: &M::MField<Name>,
-    _mdata: &M::MField<Vec<MData>>,
-  ) -> blake3::Hash {
-    let mut h = blake3::Hasher::new();
-    h.update(&[EFVAR]);
-    h.update(&id.0.to_le_bytes());
-    h.finalize()
-  }
-
-  pub fn fvar_mdata_with_addr(
+  fn fvar_mdata_with_addr(
     id: FVarId,
     name: M::MField<Name>,
     mdata: M::MField<Vec<MData>>,
@@ -268,7 +308,7 @@ impl<M: KernelMode> KExpr<M> {
     name: M::MField<Name>,
     mdata: M::MField<Vec<MData>>,
   ) -> Self {
-    let addr = Self::fvar_hash(id, &name, &mdata);
+    let addr = fresh_uid();
     Self::fvar_mdata_with_addr(id, name, mdata, addr)
   }
 
@@ -277,7 +317,7 @@ impl<M: KernelMode> KExpr<M> {
     name: M::MField<Name>,
     mdata: M::MField<Vec<MData>>,
   ) -> Self {
-    let addr = Self::var_hash(idx, &name, &mdata);
+    let addr = fresh_uid();
     Self::var_mdata_with_addr(idx, name, mdata, addr)
   }
 
@@ -285,17 +325,7 @@ impl<M: KernelMode> KExpr<M> {
     Self::sort_mdata(u, no_mdata::<M>())
   }
 
-  pub fn sort_hash(
-    u: &KUniv<M>,
-    _mdata: &M::MField<Vec<MData>>,
-  ) -> blake3::Hash {
-    let mut h = blake3::Hasher::new();
-    h.update(&[ESORT]);
-    h.update(u.addr().as_bytes());
-    h.finalize()
-  }
-
-  pub fn sort_mdata_with_addr(
+  fn sort_mdata_with_addr(
     u: KUniv<M>,
     mdata: M::MField<Vec<MData>>,
     addr: Addr,
@@ -304,7 +334,7 @@ impl<M: KernelMode> KExpr<M> {
   }
 
   pub fn sort_mdata(u: KUniv<M>, mdata: M::MField<Vec<MData>>) -> Self {
-    let addr = Self::sort_hash(&u, &mdata);
+    let addr = fresh_uid();
     Self::sort_mdata_with_addr(u, mdata, addr)
   }
 
@@ -312,25 +342,7 @@ impl<M: KernelMode> KExpr<M> {
     Self::cnst_mdata(id, univs, no_mdata::<M>())
   }
 
-  /// `id.addr` is the constant's content-address — its identity. The
-  /// `id.name` field is display-only metadata, intentionally NOT hashed,
-  /// so two references to the same address with different display names
-  /// remain content-equal.
-  pub fn cnst_hash(
-    id: &KId<M>,
-    univs: &[KUniv<M>],
-    _mdata: &M::MField<Vec<MData>>,
-  ) -> blake3::Hash {
-    let mut h = blake3::Hasher::new();
-    h.update(&[EREF]);
-    h.update(id.addr.as_bytes());
-    for u in univs.iter() {
-      h.update(u.addr().as_bytes());
-    }
-    h.finalize()
-  }
-
-  pub fn cnst_mdata_with_addr(
+  fn cnst_mdata_with_addr(
     id: KId<M>,
     univs: Box<[KUniv<M>]>,
     mdata: M::MField<Vec<MData>>,
@@ -348,7 +360,7 @@ impl<M: KernelMode> KExpr<M> {
     univs: Box<[KUniv<M>]>,
     mdata: M::MField<Vec<MData>>,
   ) -> Self {
-    let addr = Self::cnst_hash(&id, &univs, &mdata);
+    let addr = fresh_uid();
     Self::cnst_mdata_with_addr(id, univs, mdata, addr)
   }
 
@@ -356,19 +368,7 @@ impl<M: KernelMode> KExpr<M> {
     Self::app_mdata(f, a, no_mdata::<M>())
   }
 
-  pub fn app_hash(
-    f: &KExpr<M>,
-    a: &KExpr<M>,
-    _mdata: &M::MField<Vec<MData>>,
-  ) -> blake3::Hash {
-    let mut h = blake3::Hasher::new();
-    h.update(&[EAPP]);
-    h.update(f.addr().as_bytes());
-    h.update(a.addr().as_bytes());
-    h.finalize()
-  }
-
-  pub fn app_mdata_with_addr(
+  fn app_mdata_with_addr(
     f: KExpr<M>,
     a: KExpr<M>,
     mdata: M::MField<Vec<MData>>,
@@ -389,7 +389,7 @@ impl<M: KernelMode> KExpr<M> {
     a: KExpr<M>,
     mdata: M::MField<Vec<MData>>,
   ) -> Self {
-    let addr = Self::app_hash(&f, &a, &mdata);
+    let addr = fresh_uid();
     Self::app_mdata_with_addr(f, a, mdata, addr)
   }
 
@@ -404,26 +404,7 @@ impl<M: KernelMode> KExpr<M> {
 
   /// Compute the content hash for [`KExpr::lam_mdata`].
   ///
-  /// Binder `name` and `bi` are display/elaboration metadata only and are
-  /// intentionally NOT hashed. The kernel does not distinguish lambdas
-  /// that differ only in binder name or binder info; this keeps hash
-  /// equality structural and alpha-invariant in `Meta` mode (matching
-  /// `Anon` mode where these fields are erased).
-  pub fn lam_hash(
-    _name: &M::MField<Name>,
-    _bi: &M::MField<BinderInfo>,
-    ty: &KExpr<M>,
-    body: &KExpr<M>,
-    _mdata: &M::MField<Vec<MData>>,
-  ) -> blake3::Hash {
-    let mut h = blake3::Hasher::new();
-    h.update(&[ELAM]);
-    h.update(ty.addr().as_bytes());
-    h.update(body.addr().as_bytes());
-    h.finalize()
-  }
-
-  pub fn lam_mdata_with_addr(
+  fn lam_mdata_with_addr(
     name: M::MField<Name>,
     bi: M::MField<BinderInfo>,
     ty: KExpr<M>,
@@ -448,7 +429,7 @@ impl<M: KernelMode> KExpr<M> {
     body: KExpr<M>,
     mdata: M::MField<Vec<MData>>,
   ) -> Self {
-    let addr = Self::lam_hash(&name, &bi, &ty, &body, &mdata);
+    let addr = fresh_uid();
     Self::lam_mdata_with_addr(name, bi, ty, body, mdata, addr)
   }
 
@@ -461,22 +442,7 @@ impl<M: KernelMode> KExpr<M> {
     Self::all_mdata(name, bi, ty, body, no_mdata::<M>())
   }
 
-  /// See [`KExpr::lam_hash`] — binder `name`/`bi` intentionally not hashed.
-  pub fn all_hash(
-    _name: &M::MField<Name>,
-    _bi: &M::MField<BinderInfo>,
-    ty: &KExpr<M>,
-    body: &KExpr<M>,
-    _mdata: &M::MField<Vec<MData>>,
-  ) -> blake3::Hash {
-    let mut h = blake3::Hasher::new();
-    h.update(&[EALL]);
-    h.update(ty.addr().as_bytes());
-    h.update(body.addr().as_bytes());
-    h.finalize()
-  }
-
-  pub fn all_mdata_with_addr(
+  fn all_mdata_with_addr(
     name: M::MField<Name>,
     bi: M::MField<BinderInfo>,
     ty: KExpr<M>,
@@ -501,7 +467,7 @@ impl<M: KernelMode> KExpr<M> {
     body: KExpr<M>,
     mdata: M::MField<Vec<MData>>,
   ) -> Self {
-    let addr = Self::all_hash(&name, &bi, &ty, &body, &mdata);
+    let addr = fresh_uid();
     Self::all_mdata_with_addr(name, bi, ty, body, mdata, addr)
   }
 
@@ -515,28 +481,7 @@ impl<M: KernelMode> KExpr<M> {
     Self::let_mdata(name, ty, val, body, non_dep, no_mdata::<M>())
   }
 
-  /// See [`KExpr::lam_hash`] — binder `name` is intentionally not hashed.
-  /// `non_dep` IS hashed: dropping it would intern two letEs that differ only
-  /// in `non_dep` to the same KExpr, and egress would then return whichever
-  /// `non_dep` was interned first, breaking Ixon roundtrip fidelity.
-  pub fn let_hash(
-    _name: &M::MField<Name>,
-    ty: &KExpr<M>,
-    val: &KExpr<M>,
-    body: &KExpr<M>,
-    non_dep: bool,
-    _mdata: &M::MField<Vec<MData>>,
-  ) -> blake3::Hash {
-    let mut h = blake3::Hasher::new();
-    h.update(&[ELET]);
-    h.update(ty.addr().as_bytes());
-    h.update(val.addr().as_bytes());
-    h.update(body.addr().as_bytes());
-    h.update(&[u8::from(non_dep)]);
-    h.finalize()
-  }
-
-  pub fn let_mdata_with_addr(
+  fn let_mdata_with_addr(
     name: M::MField<Name>,
     ty: KExpr<M>,
     val: KExpr<M>,
@@ -563,7 +508,7 @@ impl<M: KernelMode> KExpr<M> {
     non_dep: bool,
     mdata: M::MField<Vec<MData>>,
   ) -> Self {
-    let addr = Self::let_hash(&name, &ty, &val, &body, non_dep, &mdata);
+    let addr = fresh_uid();
     Self::let_mdata_with_addr(name, ty, val, body, non_dep, mdata, addr)
   }
 
@@ -571,22 +516,7 @@ impl<M: KernelMode> KExpr<M> {
     Self::prj_mdata(id, field, val, no_mdata::<M>())
   }
 
-  /// `id.name` is display-only metadata, intentionally NOT hashed.
-  pub fn prj_hash(
-    id: &KId<M>,
-    field: u64,
-    val: &KExpr<M>,
-    _mdata: &M::MField<Vec<MData>>,
-  ) -> blake3::Hash {
-    let mut h = blake3::Hasher::new();
-    h.update(&[EPRJ]);
-    h.update(id.addr.as_bytes());
-    h.update(&field.to_le_bytes());
-    h.update(val.addr().as_bytes());
-    h.finalize()
-  }
-
-  pub fn prj_mdata_with_addr(
+  fn prj_mdata_with_addr(
     id: KId<M>,
     field: u64,
     val: KExpr<M>,
@@ -604,7 +534,7 @@ impl<M: KernelMode> KExpr<M> {
     val: KExpr<M>,
     mdata: M::MField<Vec<MData>>,
   ) -> Self {
-    let addr = Self::prj_hash(&id, field, &val, &mdata);
+    let addr = fresh_uid();
     Self::prj_mdata_with_addr(id, field, val, mdata, addr)
   }
 
@@ -612,17 +542,7 @@ impl<M: KernelMode> KExpr<M> {
     Self::nat_mdata(val, blob_addr, no_mdata::<M>())
   }
 
-  pub fn nat_hash(
-    blob_addr: &Address,
-    _mdata: &M::MField<Vec<MData>>,
-  ) -> blake3::Hash {
-    let mut h = blake3::Hasher::new();
-    h.update(&[ENAT]);
-    h.update(blob_addr.as_bytes());
-    h.finalize()
-  }
-
-  pub fn nat_mdata_with_addr(
+  fn nat_mdata_with_addr(
     val: Nat,
     blob_addr: Address,
     mdata: M::MField<Vec<MData>>,
@@ -640,7 +560,7 @@ impl<M: KernelMode> KExpr<M> {
     blob_addr: Address,
     mdata: M::MField<Vec<MData>>,
   ) -> Self {
-    let addr = Self::nat_hash(&blob_addr, &mdata);
+    let addr = fresh_uid();
     Self::nat_mdata_with_addr(val, blob_addr, mdata, addr)
   }
 
@@ -648,17 +568,7 @@ impl<M: KernelMode> KExpr<M> {
     Self::str_mdata(val, blob_addr, no_mdata::<M>())
   }
 
-  pub fn str_hash(
-    blob_addr: &Address,
-    _mdata: &M::MField<Vec<MData>>,
-  ) -> blake3::Hash {
-    let mut h = blake3::Hasher::new();
-    h.update(&[ESTR]);
-    h.update(blob_addr.as_bytes());
-    h.finalize()
-  }
-
-  pub fn str_mdata_with_addr(
+  fn str_mdata_with_addr(
     val: String,
     blob_addr: Address,
     mdata: M::MField<Vec<MData>>,
@@ -676,7 +586,7 @@ impl<M: KernelMode> KExpr<M> {
     blob_addr: Address,
     mdata: M::MField<Vec<MData>>,
   ) -> Self {
-    let addr = Self::str_hash(&blob_addr, &mdata);
+    let addr = fresh_uid();
     Self::str_mdata_with_addr(val, blob_addr, mdata, addr)
   }
 }
@@ -826,7 +736,9 @@ mod tests {
 
   #[test]
   fn var_hash_deterministic() {
-    assert_eq!(AE::var(0, ()).addr(), AE::var(0, ()).addr());
+    // Identity is intern-assigned now: independently built nodes carry
+    // distinct uids but remain structurally equal (`==`).
+    assert_eq!(AE::var(0, ()), AE::var(0, ()));
   }
 
   #[test]
@@ -839,10 +751,7 @@ mod tests {
     // Binder names are descriptive metadata only — they do NOT contribute
     // to the content hash, so two `Var(0)` nodes with different display
     // names are content-equal. Keeps hash equality alpha-invariant.
-    assert_eq!(
-      ME::var(0, mk_name("x")).addr(),
-      ME::var(0, mk_name("y")).addr()
-    );
+    assert_eq!(ME::var(0, mk_name("x")), ME::var(0, mk_name("y")));
   }
 
   #[test]
@@ -867,7 +776,7 @@ mod tests {
     // their display names.
     let a = ME::cnst(KId::new(mk_addr("Nat"), mk_name("Nat")), Box::new([]));
     let b = ME::cnst(KId::new(mk_addr("Nat"), mk_name("Int")), Box::new([]));
-    assert_eq!(a.addr(), b.addr());
+    assert_eq!(a, b);
   }
 
   #[test]
@@ -894,7 +803,7 @@ mod tests {
     let a =
       ME::lam(mk_name("x"), BinderInfo::Default, ty.clone(), body.clone());
     let b = ME::lam(mk_name("y"), BinderInfo::Default, ty, body);
-    assert_eq!(a.addr(), b.addr());
+    assert_eq!(a, b);
   }
 
   #[test]
@@ -907,7 +816,7 @@ mod tests {
     let a =
       ME::lam(mk_name("x"), BinderInfo::Default, ty.clone(), body.clone());
     let b = ME::lam(mk_name("x"), BinderInfo::Implicit, ty, body);
-    assert_eq!(a.addr(), b.addr());
+    assert_eq!(a, b);
   }
 
   #[test]
diff --git a/crates/kernel/src/inductive.rs b/crates/kernel/src/inductive.rs
index 52e81028..9773d554 100644
--- a/crates/kernel/src/inductive.rs
+++ b/crates/kernel/src/inductive.rs
@@ -501,7 +501,7 @@ impl<M: KernelMode> TypeChecker<'_, M> {
     let mut flat: Vec<FlatBlockMember<M>> = Vec::new();
     // (ext_ind_addr, spec_params content hashes) for dedup.
     // Uses [u8; 32] blake3 digest for structural equality.
-    let mut aux_seen: Vec<(Address, Vec<[u8; 32]>)> = Vec::new();
+    let mut aux_seen: Vec<(Address, Vec<KExpr<M>>)> = Vec::new();
 
     // Seed with original block inductives.
     for ind_id in block_inds {
@@ -622,7 +622,7 @@ impl<M: KernelMode> TypeChecker<'_, M> {
     dom: &KExpr<M>,
     block_addrs: &[Address],
     flat: &mut Vec<FlatBlockMember<M>>,
-    aux_seen: &mut Vec<(Address, Vec<[u8; 32]>)>,
+    aux_seen: &mut Vec<(Address, Vec<KExpr<M>>)>,
     univ_offset: u64,
     param_depth: usize, // depth at the param context (before field locals)
     n_rec_params: u64, // number of inductive parameters (valid Var refs in spec_params)
@@ -713,9 +713,9 @@ impl<M: KernelMode> TypeChecker<'_, M> {
       }
 
       // Dedup: check if we've already seen this (ext_ind, spec_params) pair.
-      // Use blake3 content hash (addr) for structural dedup.
-      let spec_hashes: Vec<[u8; 32]> =
-        spec_params.iter().map(|e| *e.addr().as_bytes()).collect();
+      // Structural comparison (uid fast path + recursive fallback) so
+      // equal-but-separately-built spec params still collapse.
+      let spec_hashes: Vec<KExpr<M>> = spec_params.clone();
       if aux_seen.iter().any(|(a, s)| {
         *a == head_id.addr
           && s.len() == spec_hashes.len()
@@ -1106,10 +1106,10 @@ impl<M: KernelMode> TypeChecker<'_, M> {
       h.update(&(source_idx as u64).to_le_bytes());
       h.update(member.id.addr.as_bytes());
       for sp in &member.spec_params {
-        h.update(sp.addr().as_bytes());
+        h.update(&sp.addr().to_le_bytes());
       }
       for u in member.occurrence_us.iter() {
-        h.update(u.addr().as_bytes());
+        h.update(&u.addr().to_le_bytes());
       }
       let aux_addr = Address::from_blake3_hash(h.finalize());
       let aux_id = KId::new(aux_addr.clone(), M::meta_field(seed_name.clone()));
diff --git a/crates/kernel/src/infer.rs b/crates/kernel/src/infer.rs
index e65748eb..cbe061c0 100644
--- a/crates/kernel/src/infer.rs
+++ b/crates/kernel/src/infer.rs
@@ -598,7 +598,7 @@ fn compact_head<M: KernelMode>(e: &KExpr<M>) -> String {
 }
 
 fn short_addr<M: KernelMode>(e: &KExpr<M>) -> String {
-  e.addr().to_hex().chars().take(12).collect()
+  format!("uid{}", e.addr())
 }
 
 #[cfg(test)]
diff --git a/crates/kernel/src/ingress.rs b/crates/kernel/src/ingress.rs
index 0b2a4e83..e3c16b63 100644
--- a/crates/kernel/src/ingress.rs
+++ b/crates/kernel/src/ingress.rs
@@ -19,7 +19,6 @@ use rustc_hash::{FxHashMap, FxHashSet};
 
 use dashmap::DashMap;
 
-use crate::env::Addr;
 use bignat::Nat;
 use ix_common::address::Address;
 #[cfg(not(target_arch = "riscv64"))]
@@ -42,7 +41,7 @@ use ixon::metadata::{
 use ixon::univ::Univ as IxonUniv;
 
 use super::constant::{KConst, RecRule};
-use super::env::{InternTable, KEnv};
+use super::env::{CtxAddr, InternTable, KEnv};
 use super::expr::{KExpr, MData};
 use super::id::KId;
 use super::level::KUniv;
@@ -257,7 +256,7 @@ fn timed_intern_univ<M: KernelMode>(
     return intern.intern_univ(u);
   }
   let t0 = Instant::now();
-  let key = *u.addr();
+  let key = super::env::univ_key(&u);
   let result = if let Some(existing) = intern.try_get_univ(&key) {
     stats.intern_univ_get_hits += 1;
     existing
@@ -269,56 +268,45 @@ fn timed_intern_univ<M: KernelMode>(
   result
 }
 
-/// Hash-first interning. Precomputes the content hash, asks the intern
-/// table for an existing canonical KExpr; only on a miss does it call
-/// `build(addr)` to allocate a new KExpr.
+/// Key-first interning. Builds the shallow structural key from already
+/// canonical children, asks the intern table for an existing canonical
+/// KExpr; only on a miss does it call `build()` to allocate a new KExpr
+/// (which takes a fresh uid).
 ///
-/// Why this exists: profiling on Mathlib shows `kexpr_construct` (the
-/// blake3 hash + `Arc<ExprData>` allocation pair) is ~45% of `convert`
-/// worker-sum, of which ~62% is wasted because the intern table
-/// already has the same canonical value. By computing just the hash up
-/// front and skipping construction entirely on a hit, we avoid the
-/// allocation in the majority case.
-///
-/// The `build` closure receives the precomputed `Addr` (a `blake3::Hash`
-/// by value) and is expected to call one of the
-/// `KExpr::*_mdata_with_addr` constructors so it can plug the hash into
-/// `ExprInfo` without re-hashing.
-///
-/// Stats accounting (when enabled): the hit path bumps
-/// `intern_expr_get_hits`. The miss path also bumps `kexpr_construct_*`
-/// for the cost of the closure body. `intern_expr_ns` covers the
-/// surrounding DashMap traffic on both paths but excludes the
-/// closure-internal time.
+/// Why this exists: profiling on Mathlib showed the construction +
+/// interning pair dominates `convert` worker-sum, and most constructed
+/// values are immediately discarded for an existing canonical Arc. By
+/// probing with just the shallow key we skip construction entirely on a
+/// hit. (Historically the probe key was a precomputed blake3 content
+/// hash; the shallow key preserves the skip while removing the hash.)
 #[inline]
 fn timed_intern_or_build<M: KernelMode>(
   intern: &mut InternTable<M>,
-  hash: blake3::Hash,
-  build: impl FnOnce(Addr) -> KExpr<M>,
+  key: &super::env::ExprKey,
+  build: impl FnOnce() -> KExpr<M>,
   stats: &mut ConvertStats,
 ) -> KExpr<M> {
   if !stats.enabled {
-    if let Some(existing) = intern.try_get_expr(&hash) {
+    if let Some(existing) = intern.try_get_expr(key) {
       return existing;
     }
-    return intern.intern_expr(build(hash));
+    return intern.intern_expr(build());
   }
   let t0 = Instant::now();
-  if let Some(existing) = intern.try_get_expr(&hash) {
+  if let Some(existing) = intern.try_get_expr(key) {
     stats.intern_expr_get_hits += 1;
     stats.intern_expr_calls += 1;
     stats.intern_expr_ns += elapsed_ns(t0);
     return existing;
   }
-  let addr = hash;
   let kc_t0 = Instant::now();
-  let new = build(addr);
+  let new = build();
   let kc_elapsed = elapsed_ns(kc_t0);
   stats.kexpr_construct_ns += kc_elapsed;
   stats.kexpr_construct_calls += 1;
   let interned = intern.intern_expr(new);
   let total = elapsed_ns(t0);
-  // Account for the DashMap traffic only — the closure body's time is
+  // Account for the map traffic only — the closure body's time is
   // already in `kexpr_construct_ns`.
   stats.intern_expr_ns += total.saturating_sub(kc_elapsed);
   stats.intern_expr_calls += 1;
@@ -711,25 +699,21 @@ fn ingress_expr<M: KernelMode>(
           if mdata_layers.is_empty() {
             let name_field = M::meta_field(name);
             let mdata_field: M::MField<Vec<MData>> = M::meta_field(vec![]);
-            let hash = KExpr::<M>::var_hash(*idx, &name_field, &mdata_field);
+            let key = super::env::ExprKey::Var(*idx);
             values.push(timed_intern_or_build(
               intern,
-              hash,
-              |addr| {
-                KExpr::var_mdata_with_addr(*idx, name_field, mdata_field, addr)
-              },
+              &key,
+              || KExpr::var_mdata(*idx, name_field, mdata_field),
               stats,
             ));
           } else {
             let name_field = M::meta_field(name);
             let mdata_field = M::meta_field(mdata_layers);
-            let hash = KExpr::<M>::var_hash(*idx, &name_field, &mdata_field);
+            let key = super::env::ExprKey::Var(*idx);
             values.push(timed_intern_or_build(
               intern,
-              hash,
-              |addr| {
-                KExpr::var_mdata_with_addr(*idx, name_field, mdata_field, addr)
-              },
+              &key,
+              || KExpr::var_mdata(*idx, name_field, mdata_field),
               stats,
             ));
           }
@@ -762,11 +746,11 @@ fn ingress_expr<M: KernelMode>(
                 })?)
                 .ok_or_else(|| format!("invalid Sort univ index {idx}"))?;
             let zu = ingress_univ(u, ctx, intern, univ_cache, stats);
-            let hash = KExpr::<M>::sort_hash(&zu, &mdata);
+            let key = super::env::ExprKey::Sort(*zu.addr());
             values.push(timed_intern_or_build(
               intern,
-              hash,
-              |addr| KExpr::sort_mdata_with_addr(zu, mdata, addr),
+              &key,
+              || KExpr::sort_mdata(zu, mdata),
               stats,
             ));
           },
@@ -802,11 +786,14 @@ fn ingress_expr<M: KernelMode>(
             let univs =
               ingress_univ_args(univ_idxs, ctx, intern, univ_cache, stats)?;
             let id = KId::new(addr, name_field);
-            let hash = KExpr::<M>::cnst_hash(&id, &univs, &mdata);
+            let key = super::env::ExprKey::Const(
+              id.addr.clone(),
+              univs.iter().map(|u| *u.addr()).collect(),
+            );
             values.push(timed_intern_or_build(
               intern,
-              hash,
-              |a| KExpr::cnst_mdata_with_addr(id, univs, mdata, a),
+              &key,
+              || KExpr::cnst_mdata(id, univs, mdata),
               stats,
             ));
           },
@@ -823,11 +810,14 @@ fn ingress_expr<M: KernelMode>(
               .clone();
             let univs =
               ingress_univ_args(univ_idxs, ctx, intern, univ_cache, stats)?;
-            let hash = KExpr::<M>::cnst_hash(&mid, &univs, &mdata);
+            let key = super::env::ExprKey::Const(
+              mid.addr.clone(),
+              univs.iter().map(|u| *u.addr()).collect(),
+            );
             values.push(timed_intern_or_build(
               intern,
-              hash,
-              |a| KExpr::cnst_mdata_with_addr(mid, univs, mdata, a),
+              &key,
+              || KExpr::cnst_mdata(mid, univs, mdata),
               stats,
             ));
           },
@@ -930,11 +920,14 @@ fn ingress_expr<M: KernelMode>(
                   let id = KId::new(addr, M::meta_field(name));
                   let mdata_field: M::MField<Vec<MData>> =
                     M::meta_field(vec![]);
-                  let hash = KExpr::<M>::cnst_hash(&id, &univs, &mdata_field);
+                  let key = super::env::ExprKey::Const(
+                    id.addr.clone(),
+                    univs.iter().map(|u| *u.addr()).collect(),
+                  );
                   timed_intern_or_build(
                     intern,
-                    hash,
-                    |a| KExpr::cnst_mdata_with_addr(id, univs, mdata_field, a),
+                    &key,
+                    || KExpr::cnst_mdata(id, univs, mdata_field),
                     stats,
                   )
                 },
@@ -957,11 +950,14 @@ fn ingress_expr<M: KernelMode>(
                   )?;
                   let mdata_field: M::MField<Vec<MData>> =
                     M::meta_field(vec![]);
-                  let hash = KExpr::<M>::cnst_hash(&mid, &univs, &mdata_field);
+                  let key = super::env::ExprKey::Const(
+                    mid.addr.clone(),
+                    univs.iter().map(|u| *u.addr()).collect(),
+                  );
                   timed_intern_or_build(
                     intern,
-                    hash,
-                    |a| KExpr::cnst_mdata_with_addr(mid, univs, mdata_field, a),
+                    &key,
+                    || KExpr::cnst_mdata(mid, univs, mdata_field),
                     stats,
                   )
                 },
@@ -1188,11 +1184,11 @@ fn ingress_expr<M: KernelMode>(
               format!("invalid UTF-8 in Str blob at addr {}: {e}", addr.hex())
             })?;
             let blob_addr = addr.clone();
-            let hash = KExpr::<M>::str_hash(&blob_addr, &mdata);
+            let key = super::env::ExprKey::Str(blob_addr.clone());
             values.push(timed_intern_or_build(
               intern,
-              hash,
-              |a| KExpr::str_mdata_with_addr(s, blob_addr, mdata, a),
+              &key,
+              || KExpr::str_mdata(s, blob_addr, mdata),
               stats,
             ));
           },
@@ -1215,11 +1211,11 @@ fn ingress_expr<M: KernelMode>(
             }
             let n = Nat::from_le_bytes(&blob);
             let blob_addr = addr.clone();
-            let hash = KExpr::<M>::nat_hash(&blob_addr, &mdata);
+            let key = super::env::ExprKey::Nat(blob_addr.clone());
             values.push(timed_intern_or_build(
               intern,
-              hash,
-              |a| KExpr::nat_mdata_with_addr(n, blob_addr, mdata, a),
+              &key,
+              || KExpr::nat_mdata(n, blob_addr, mdata),
               stats,
             ));
           },
@@ -1241,11 +1237,11 @@ fn ingress_expr<M: KernelMode>(
         let cont_t0 = if stats.enabled { Some(Instant::now()) } else { None };
         let a = values.pop().unwrap();
         let f = values.pop().unwrap();
-        let hash = KExpr::<M>::app_hash(&f, &a, &mdata);
+        let key = super::env::ExprKey::App(*f.addr(), *a.addr());
         values.push(timed_intern_or_build(
           intern,
-          hash,
-          |addr| KExpr::app_mdata_with_addr(f, a, mdata, addr),
+          &key,
+          || KExpr::app_mdata(f, a, mdata),
           stats,
         ));
         if let Some(t0) = cont_t0 {
@@ -1264,11 +1260,11 @@ fn ingress_expr<M: KernelMode>(
         let cont_t0 = if stats.enabled { Some(Instant::now()) } else { None };
         let body = values.pop().unwrap();
         let ty = values.pop().unwrap();
-        let hash = KExpr::<M>::lam_hash(&name, &bi, &ty, &body, &mdata);
+        let key = super::env::ExprKey::Lam(*ty.addr(), *body.addr());
         values.push(timed_intern_or_build(
           intern,
-          hash,
-          |addr| KExpr::lam_mdata_with_addr(name, bi, ty, body, mdata, addr),
+          &key,
+          || KExpr::lam_mdata(name, bi, ty, body, mdata),
           stats,
         ));
         if let Some(t0) = cont_t0 {
@@ -1287,11 +1283,11 @@ fn ingress_expr<M: KernelMode>(
         let cont_t0 = if stats.enabled { Some(Instant::now()) } else { None };
         let body = values.pop().unwrap();
         let ty = values.pop().unwrap();
-        let hash = KExpr::<M>::all_hash(&name, &bi, &ty, &body, &mdata);
+        let key = super::env::ExprKey::All(*ty.addr(), *body.addr());
         values.push(timed_intern_or_build(
           intern,
-          hash,
-          |addr| KExpr::all_mdata_with_addr(name, bi, ty, body, mdata, addr),
+          &key,
+          || KExpr::all_mdata(name, bi, ty, body, mdata),
           stats,
         ));
         if let Some(t0) = cont_t0 {
@@ -1312,13 +1308,12 @@ fn ingress_expr<M: KernelMode>(
         let body = values.pop().unwrap();
         let val = values.pop().unwrap();
         let ty = values.pop().unwrap();
-        let hash = KExpr::<M>::let_hash(&name, &ty, &val, &body, nd, &mdata);
+        let key =
+          super::env::ExprKey::Let(*ty.addr(), *val.addr(), *body.addr(), nd);
         values.push(timed_intern_or_build(
           intern,
-          hash,
-          |addr| {
-            KExpr::let_mdata_with_addr(name, ty, val, body, nd, mdata, addr)
-          },
+          &key,
+          || KExpr::let_mdata(name, ty, val, body, nd, mdata),
           stats,
         ));
         if let Some(t0) = cont_t0 {
@@ -1342,11 +1337,12 @@ fn ingress_expr<M: KernelMode>(
       ExprFrame::PrjDone { type_id, field_idx, mdata } => {
         let cont_t0 = if stats.enabled { Some(Instant::now()) } else { None };
         let s = values.pop().unwrap();
-        let hash = KExpr::<M>::prj_hash(&type_id, field_idx, &s, &mdata);
+        let key =
+          super::env::ExprKey::Prj(type_id.addr.clone(), field_idx, *s.addr());
         values.push(timed_intern_or_build(
           intern,
-          hash,
-          |addr| KExpr::prj_mdata_with_addr(type_id, field_idx, s, mdata, addr),
+          &key,
+          || KExpr::prj_mdata(type_id, field_idx, s, mdata),
           stats,
         ));
         if let Some(t0) = cont_t0 {
@@ -2174,7 +2170,7 @@ pub fn resolve_lean_name_addr(
 /// Compute a stable hash for a `param_names` slice, used as part of the
 /// ingress cache key. Two calls with the same param names (in the same
 /// order) produce the same hash.
-pub fn param_names_hash(param_names: &[Name]) -> Addr {
+pub fn param_names_hash(param_names: &[Name]) -> CtxAddr {
   let mut hasher = blake3::Hasher::new();
   hasher.update(&(param_names.len() as u64).to_le_bytes());
   for n in param_names {
@@ -2254,8 +2250,8 @@ pub fn lean_expr_to_zexpr_cached(
   intern: &mut InternTable<Meta>,
   n2a: Option<&DashMap<Name, Address>>,
   aux_n2a: Option<&DashMap<Name, Address>>,
-  mut cache: Option<&mut FxHashMap<(Addr, Addr), KExpr<Meta>>>,
-  pn_hash: Option<&Addr>,
+  mut cache: Option<&mut FxHashMap<(CtxAddr, CtxAddr), KExpr<Meta>>>,
+  pn_hash: Option<&CtxAddr>,
 ) -> KExpr<Meta> {
   // Check cache
   if let (Some(cache), Some(pn_hash)) = (cache.as_ref(), pn_hash) {
@@ -2295,8 +2291,8 @@ fn lean_expr_to_zexpr_raw(
   intern: &mut InternTable<Meta>,
   n2a: Option<&DashMap<Name, Address>>,
   aux_n2a: Option<&DashMap<Name, Address>>,
-  mut cache: Option<&mut FxHashMap<(Addr, Addr), KExpr<Meta>>>,
-  pn_hash: Option<&Addr>,
+  mut cache: Option<&mut FxHashMap<(CtxAddr, CtxAddr), KExpr<Meta>>>,
+  pn_hash: Option<&CtxAddr>,
 ) -> KExpr<Meta> {
   // Walk through any consecutive `Mdata` wrappers first, accumulating them
   // as kernel-side `MData` layers. Lean represents `Mdata(a, Mdata(b, e))`
diff --git a/crates/kernel/src/level.rs b/crates/kernel/src/level.rs
index 0ff6df8c..1c899314 100644
--- a/crates/kernel/src/level.rs
+++ b/crates/kernel/src/level.rs
@@ -37,7 +37,7 @@ use std::collections::BTreeMap;
 use std::fmt;
 use std::sync::Arc;
 
-use ix_common::env::{Name, UIMAX, UMAX, UPARAM, USUCC, UZERO};
+use ix_common::env::Name;
 
 use super::env::Addr;
 use super::mode::{KernelMode, MetaDisplay};
@@ -81,11 +81,33 @@ impl<M: KernelMode> KUniv<M> {
     Arc::ptr_eq(&self.0, &other.0)
   }
 
-  /// Structural equality by Merkle hash (pointer-first fast path).
+  /// Canonical-identity equality: `ptr_eq` or equal intern uid. Sound in
+  /// the affirmative; incomplete in the negative (independently built
+  /// equal levels carry distinct uids) — `==` adds the structural
+  /// fallback.
   pub fn hash_eq(&self, other: &KUniv<M>) -> bool {
     self.ptr_eq(other) || self.addr() == other.addr()
   }
 
+  /// Structural equality mirroring the intern identity: `Param` display
+  /// names are NOT compared (matching the historical hash semantics).
+  /// Equal interned subtrees prune at the uid fast path.
+  fn structural_eq(&self, other: &Self) -> bool {
+    if self.hash_eq(other) {
+      return true;
+    }
+    match (self.data(), other.data()) {
+      (UnivData::Zero(_), UnivData::Zero(_)) => true,
+      (UnivData::Succ(a, _), UnivData::Succ(b, _)) => a.structural_eq(b),
+      (UnivData::Max(a1, b1, _), UnivData::Max(a2, b2, _))
+      | (UnivData::IMax(a1, b1, _), UnivData::IMax(a2, b2, _)) => {
+        a1.structural_eq(a2) && b1.structural_eq(b2)
+      },
+      (UnivData::Param(i, _, _), UnivData::Param(j, _, _)) => i == j,
+      _ => false,
+    }
+  }
+
   /// True if this level is definitionally zero (Prop).
   pub fn is_zero(&self) -> bool {
     matches!(self.data(), UnivData::Zero(_))
@@ -130,14 +152,11 @@ impl<M: KernelMode> KUniv<M> {
 
 impl<M: KernelMode> KUniv<M> {
   pub fn zero() -> Self {
-    KUniv::new(UnivData::Zero(blake3::hash(&[UZERO])))
+    KUniv::new(UnivData::Zero(super::expr::fresh_uid()))
   }
 
   pub fn succ(inner: KUniv<M>) -> Self {
-    let mut hasher = blake3::Hasher::new();
-    hasher.update(&[USUCC]);
-    hasher.update(inner.addr().as_bytes());
-    KUniv::new(UnivData::Succ(inner, hasher.finalize()))
+    KUniv::new(UnivData::Succ(inner, super::expr::fresh_uid()))
   }
 
   /// Construct `max(a, b)` with Lean-style simplifications:
@@ -193,11 +212,7 @@ impl<M: KernelMode> KUniv<M> {
   /// Raw `Max` constructor without simplification. Used by `max()` after
   /// all simplification opportunities are exhausted.
   fn max_raw(a: KUniv<M>, b: KUniv<M>) -> Self {
-    let mut hasher = blake3::Hasher::new();
-    hasher.update(&[UMAX]);
-    hasher.update(a.addr().as_bytes());
-    hasher.update(b.addr().as_bytes());
-    KUniv::new(UnivData::Max(a, b, hasher.finalize()))
+    KUniv::new(UnivData::Max(a, b, super::expr::fresh_uid()))
   }
 
   /// Construct `imax(a, b)` with Lean-style simplifications:
@@ -228,25 +243,18 @@ impl<M: KernelMode> KUniv<M> {
       return a; // imax(a, a) = a
     }
     // No simplification — construct raw IMax node.
-    let mut hasher = blake3::Hasher::new();
-    hasher.update(&[UIMAX]);
-    hasher.update(a.addr().as_bytes());
-    hasher.update(b.addr().as_bytes());
-    KUniv::new(UnivData::IMax(a, b, hasher.finalize()))
+    KUniv::new(UnivData::IMax(a, b, super::expr::fresh_uid()))
   }
 
   pub fn param(idx: u64, name: M::MField<Name>) -> Self {
-    let mut hasher = blake3::Hasher::new();
-    hasher.update(&[UPARAM]);
-    hasher.update(&idx.to_le_bytes());
-    KUniv::new(UnivData::Param(idx, name, hasher.finalize()))
+    KUniv::new(UnivData::Param(idx, name, super::expr::fresh_uid()))
   }
 }
 
-// Structural equality by Merkle hash.
+// Structural equality (uid fast path, recursive fallback).
 impl<M: KernelMode> PartialEq for KUniv<M> {
   fn eq(&self, other: &Self) -> bool {
-    self.hash_eq(other)
+    self.structural_eq(other)
   }
 }
 
@@ -725,8 +733,10 @@ mod tests {
 
   #[test]
   fn zero_hash_deterministic() {
-    assert_eq!(MU::zero().addr(), MU::zero().addr());
-    assert_eq!(AU::zero().addr(), AU::zero().addr());
+    // Identity is intern-assigned now: independent constructions carry
+    // distinct uids but remain structurally equal (`==`).
+    assert_eq!(MU::zero(), MU::zero());
+    assert_eq!(AU::zero(), AU::zero());
   }
 
   #[test]
@@ -774,14 +784,14 @@ mod tests {
   fn meta_param_name_does_not_affect_hash() {
     let a = MU::param(0, mk_name("u"));
     let b = MU::param(0, mk_name("v"));
-    assert_eq!(a.addr(), b.addr());
+    assert_eq!(a, b);
   }
 
   #[test]
   fn meta_param_same_name_same_hash() {
     let a = MU::param(0, mk_name("u"));
     let b = MU::param(0, mk_name("u"));
-    assert_eq!(a.addr(), b.addr());
+    assert_eq!(a, b);
   }
 
   // ---- Anon mode: names erased ----
@@ -790,23 +800,31 @@ mod tests {
   fn anon_param_same_index_same_hash() {
     let a = AU::param(0, ());
     let b = AU::param(0, ());
-    assert_eq!(a.addr(), b.addr());
+    assert_eq!(a, b);
   }
 
   // ---- Anon vs Meta structural hash matches (metadata erased) ----
 
   #[test]
   fn anon_vs_meta_named_param_match() {
+    // Cross-mode structural identity is visible through the shallow
+    // intern keys, which are mode-independent.
     let anon = AU::param(0, ());
     let meta = MU::param(0, mk_name("u"));
-    assert_eq!(anon.addr(), meta.addr());
+    assert_eq!(
+      super::super::env::univ_key(&anon),
+      super::super::env::univ_key(&meta)
+    );
   }
 
   #[test]
   fn anon_vs_meta_anon_param_same() {
     let anon = AU::param(0, ());
     let meta = MU::param(0, Name::anon());
-    assert_eq!(anon.addr(), meta.addr());
+    assert_eq!(
+      super::super::env::univ_key(&anon),
+      super::super::env::univ_key(&meta)
+    );
   }
 
   // ---- PartialEq ----
@@ -1011,8 +1029,8 @@ mod tests {
     // Same structure, different names — semantically equal
     let a = MU::param(0, mk_name("u"));
     let b = MU::param(0, mk_name("v"));
-    // Hashes are metadata-erased, and Géran comparison sees the same index.
-    assert_eq!(a.addr(), b.addr());
+    // Identity is metadata-erased, and Géran comparison sees the same index.
+    assert_eq!(a, b);
     assert!(univ_eq(&a, &b));
   }
 
diff --git a/crates/kernel/src/subst.rs b/crates/kernel/src/subst.rs
index 11ccac4a..32ac5cac 100644
--- a/crates/kernel/src/subst.rs
+++ b/crates/kernel/src/subst.rs
@@ -11,6 +11,9 @@
 //! uses a `PtrMap Expr Expr` for the same reason (see
 //! `refs/lean4lean/Lean4Lean/Expr.lean:14`).
 
+use std::cell::OnceCell;
+use std::sync::Arc;
+
 use rustc_hash::FxHashMap;
 
 use super::env::{Addr, InternTable};
@@ -488,6 +491,217 @@ fn lift_cached<M: KernelMode>(
   interned
 }
 
+// ============================================================================
+// Closures for the WHNF environment machine (see `whnf.rs` machine loop and
+// `docs/env_machine_whnf.md`). Beta/zeta there are O(1) pushes onto an
+// `MEnv`; substitution happens only here, at machine exit points
+// ("readback"), and only for the parts of the term the reduction actually
+// hands to a plain-expression consumer.
+// ============================================================================
+
+/// Expression closed over a machine environment: `Clo { e, env }` denotes
+/// `e[Var(i) := env[i]]` for `i < env.len()`, with `Var(i)` for
+/// `i >= env.len()` shifting DOWN by `env.len()` into the ambient context.
+/// WHNF never reduces under binders, so environments never need lifting.
+pub(crate) struct Clo<M: KernelMode> {
+  pub(crate) e: KExpr<M>,
+  pub(crate) env: MEnv<M>,
+  /// Memoized depth-0 readback. Closures are shared (one env entry may be
+  /// referenced by many variables; spine args survive multiple machine
+  /// re-entries), and without a global content-addressed memo this is
+  /// what keeps each shared closure's substitution from re-running.
+  readback: OnceCell<KExpr<M>>,
+}
+
+impl<M: KernelMode> Clo<M> {
+  pub(crate) fn new(e: KExpr<M>, env: MEnv<M>) -> Self {
+    Clo { e, env, readback: OnceCell::new() }
+  }
+
+  /// Closure over the empty environment (a plain expression).
+  pub(crate) fn closed(e: KExpr<M>) -> Self {
+    Clo::new(e, MEnv::empty())
+  }
+}
+
+struct MEnvNode<M: KernelMode> {
+  head: Arc<Clo<M>>,
+  tail: MEnv<M>,
+}
+
+/// Persistent cons-list environment: O(1) push with structural sharing
+/// across the closures captured at each binder. `len` is carried on the
+/// handle — recomputing it per suffix was a measured cost on the IxVM
+/// port of this machine.
+pub(crate) struct MEnv<M: KernelMode> {
+  node: Option<Arc<MEnvNode<M>>>,
+  len: u64,
+}
+
+// Manual impl: `#[derive(Clone)]` would demand `M: Clone`.
+impl<M: KernelMode> Clone for MEnv<M> {
+  fn clone(&self) -> Self {
+    MEnv { node: self.node.clone(), len: self.len }
+  }
+}
+
+impl<M: KernelMode> MEnv<M> {
+  pub(crate) fn empty() -> Self {
+    MEnv { node: None, len: 0 }
+  }
+
+  pub(crate) fn len(&self) -> u64 {
+    self.len
+  }
+
+  pub(crate) fn push(&self, c: Arc<Clo<M>>) -> Self {
+    MEnv {
+      node: Some(Arc::new(MEnvNode { head: c, tail: self.clone() })),
+      len: self.len + 1,
+    }
+  }
+
+  /// O(i) cons-list walk; `i` must be `< self.len()`. Machine variable
+  /// lookups are typically near the front (recently pushed args).
+  pub(crate) fn get(&self, i: u64) -> &Arc<Clo<M>> {
+    let mut node = self.node.as_ref().expect("MEnv::get out of range");
+    let mut i = i;
+    while i > 0 {
+      node = node.tail.node.as_ref().expect("MEnv::get out of range");
+      i -= 1;
+    }
+    &node.head
+  }
+}
+
+/// Materialize a closure into a plain expression:
+/// `e[Var(i) := readback(env[i])]` for `i < env.len()`, `Var(j)` above
+/// shifted down by `env.len()`. Memoized per closure.
+pub(crate) fn clo_readback<M: KernelMode>(
+  intern: &mut InternTable<M>,
+  c: &Clo<M>,
+) -> KExpr<M> {
+  if c.env.len() == 0 {
+    return c.e.clone();
+  }
+  if let Some(v) = c.readback.get() {
+    return v.clone();
+  }
+  let v = clo_subst(intern, &c.e, &c.env, 0);
+  let _ = c.readback.set(v.clone());
+  v
+}
+
+/// Simultaneous environment substitution at binder depth `depth` — the
+/// machine's only substitution ("readback"). Var arm semantics:
+///   `i < depth`              → unchanged (locally bound)
+///   `depth <= i < depth + n` → `lift(readback(env[i - depth]), depth)`
+///   `i >= depth + n`         → `Var(i - n)`
+/// where `n = env.len()`. lbr-guarded at every node (a no-op subtree
+/// returns its original Arc), per-call memoized by `(uid, depth)`, and
+/// results interned — the same discipline as `subst`/`simul_subst`.
+///
+/// The memo scratch comes from a pool (`clo_scratch_pool`) rather than a
+/// single buffer because the Var arm re-enters `clo_subst` through
+/// `clo_readback` of environment entries, each under a *different*
+/// environment; nesting levels must not share memo entries.
+pub(crate) fn clo_subst<M: KernelMode>(
+  intern: &mut InternTable<M>,
+  e: &KExpr<M>,
+  env: &MEnv<M>,
+  depth: u64,
+) -> KExpr<M> {
+  if env.len() == 0 || e.lbr() <= depth {
+    return e.clone();
+  }
+  let mut cache = intern.clo_scratch_pool.pop().unwrap_or_default();
+  let result = clo_subst_cached(intern, e, env, depth, &mut cache);
+  cache.clear();
+  intern.clo_scratch_pool.push(cache);
+  result
+}
+
+fn clo_subst_cached<M: KernelMode>(
+  intern: &mut InternTable<M>,
+  e: &KExpr<M>,
+  env: &MEnv<M>,
+  depth: u64,
+  cache: &mut FxHashMap<(Addr, u64), KExpr<M>>,
+) -> KExpr<M> {
+  if e.lbr() <= depth {
+    return e.clone();
+  }
+
+  let key = (e.hash_key(), depth);
+  if let Some(cached) = cache.get(&key) {
+    return cached.clone();
+  }
+
+  let n = env.len();
+
+  let result = match e.data() {
+    ExprData::Var(i, name, _) => {
+      let i = *i;
+      if i < depth + n {
+        // `i < depth` is unreachable under the outer lbr guard, so this
+        // is an env hit: substitute the entry's readback, lifted over
+        // the local binders we are under.
+        let c = env.get(i - depth).clone();
+        let v = clo_readback(intern, &c);
+        let r = lift(intern, &v, depth, 0);
+        cache.insert(key, r.clone());
+        return r;
+      }
+      KExpr::var(i - n, name.clone())
+    },
+
+    ExprData::App(f, x, _) => {
+      let f2 = clo_subst_cached(intern, f, env, depth, cache);
+      let x2 = clo_subst_cached(intern, x, env, depth, cache);
+      KExpr::app(f2, x2)
+    },
+
+    ExprData::Lam(name, bi, ty, inner, _) => {
+      let ty2 = clo_subst_cached(intern, ty, env, depth, cache);
+      let inner2 = clo_subst_cached(intern, inner, env, depth + 1, cache);
+      KExpr::lam(name.clone(), bi.clone(), ty2, inner2)
+    },
+
+    ExprData::All(name, bi, ty, inner, _) => {
+      let ty2 = clo_subst_cached(intern, ty, env, depth, cache);
+      let inner2 = clo_subst_cached(intern, inner, env, depth + 1, cache);
+      KExpr::all(name.clone(), bi.clone(), ty2, inner2)
+    },
+
+    ExprData::Let(name, ty, val, inner, nd, _) => {
+      let ty2 = clo_subst_cached(intern, ty, env, depth, cache);
+      let val2 = clo_subst_cached(intern, val, env, depth, cache);
+      let inner2 = clo_subst_cached(intern, inner, env, depth + 1, cache);
+      KExpr::let_(name.clone(), ty2, val2, inner2, *nd)
+    },
+
+    ExprData::Prj(id, field, val, _) => {
+      let val2 = clo_subst_cached(intern, val, env, depth, cache);
+      KExpr::prj(id.clone(), *field, val2)
+    },
+
+    ExprData::FVar(..)
+    | ExprData::Sort(..)
+    | ExprData::Const(..)
+    | ExprData::Nat(..)
+    | ExprData::Str(..) => {
+      // Closed atoms — unreachable under the lbr guard; defensive.
+      let r = e.clone();
+      cache.insert(key, r.clone());
+      return r;
+    },
+  };
+
+  let interned = intern.intern_expr(result);
+  cache.insert(key, interned.clone());
+  interned
+}
+
 /// Cheap beta reduction: peephole-reduce `App(λ...λ. body, args)` shapes
 /// without invoking the full [`subst`] machinery in trivial cases.
 ///
diff --git a/crates/kernel/src/tc.rs b/crates/kernel/src/tc.rs
index 257761f8..53d1c814 100644
--- a/crates/kernel/src/tc.rs
+++ b/crates/kernel/src/tc.rs
@@ -13,7 +13,7 @@ use ix_common::address::Address;
 use ixon::env::Env as IxonEnv;
 
 use super::constant::{KConst, RecRule};
-use super::env::{Addr, KEnv};
+use super::env::{Addr, CtxAddr, KEnv};
 use super::equiv::EquivManager;
 use super::error::{TcError, u64_to_usize};
 use super::expr::{ExprData, FVarId, KExpr};
@@ -29,9 +29,9 @@ use super::primitive::Primitives;
 use super::subst::{instantiate_rev, lift};
 
 /// Content-addressed context identity for the empty context (no bindings).
-pub fn empty_ctx_addr() -> Addr {
+pub fn empty_ctx_addr() -> CtxAddr {
   use std::sync::LazyLock;
-  static ADDR: LazyLock<Addr> =
+  static ADDR: LazyLock<CtxAddr> =
     LazyLock::new(|| blake3::hash(b"ix.kernel.ctx.empty"));
   *ADDR
 }
@@ -126,9 +126,9 @@ pub struct TypeChecker<'a, M: KernelMode> {
   pub num_let_bindings: usize,
   /// Content-addressed context identity: a blake3 hash derived from the
   /// binding-type chain. Immune to the ABA pointer-reuse problem.
-  pub ctx_id: Addr,
+  pub ctx_id: CtxAddr,
   /// Stack of previous ctx_ids for O(1) pop.
-  ctx_id_stack: Vec<Addr>,
+  ctx_id_stack: Vec<CtxAddr>,
 
   // -- Thread-local optimization --
   /// Union-find for transitive def-eq caching (lean4lean EquivManager).
@@ -183,7 +183,7 @@ pub struct TypeChecker<'a, M: KernelMode> {
   /// equal in the suffix-relevant prefix (`ctx_id` content-addresses the
   /// full context). The cache lifetime is the `TypeChecker` (one per
   /// `check_const`), so it is automatically reclaimed.
-  ctx_addr_cache: FxHashMap<(Addr, u64), Addr>,
+  ctx_addr_cache: FxHashMap<(CtxAddr, u64), CtxAddr>,
 
   // -- Free-variable infrastructure --
   /// Local context for fvar-based binder opening. Some validation paths still
@@ -355,7 +355,7 @@ impl<'a, M: KernelMode> TypeChecker<'a, M> {
   /// Sharing two distinct outer contexts that share a relevant suffix is the
   /// payoff: the same WHNF subterm can hit cache across them.
   #[inline]
-  pub fn whnf_key(&mut self, e: &KExpr<M>) -> (Addr, Addr) {
+  pub fn whnf_key(&mut self, e: &KExpr<M>) -> (Addr, CtxAddr) {
     (e.hash_key(), self.ctx_addr_for_lbr(e.lbr()))
   }
 
@@ -366,7 +366,7 @@ impl<'a, M: KernelMode> TypeChecker<'a, M> {
   /// dependencies, so two equal open subterms can share an infer result across
   /// different outer binders when the relevant local suffix is identical.
   #[inline]
-  pub fn infer_key(&mut self, e: &KExpr<M>) -> (Addr, Addr) {
+  pub fn infer_key(&mut self, e: &KExpr<M>) -> (Addr, CtxAddr) {
     (e.hash_key(), self.ctx_addr_for_lbr(e.lbr()))
   }
 
@@ -378,11 +378,11 @@ impl<'a, M: KernelMode> TypeChecker<'a, M> {
   /// expressions, so the relevant context is the suffix needed by the larger
   /// `lbr`.
   #[inline]
-  pub fn def_eq_ctx_key(&mut self, a: &KExpr<M>, b: &KExpr<M>) -> Addr {
+  pub fn def_eq_ctx_key(&mut self, a: &KExpr<M>, b: &KExpr<M>) -> CtxAddr {
     self.ctx_addr_for_lbr(a.lbr().max(b.lbr()))
   }
 
-  pub(crate) fn ctx_addr_for_lbr(&mut self, lbr: u64) -> Addr {
+  pub(crate) fn ctx_addr_for_lbr(&mut self, lbr: u64) -> CtxAddr {
     if lbr == 0 || self.ctx.is_empty() {
       return empty_ctx_addr();
     }
@@ -428,12 +428,12 @@ impl<'a, M: KernelMode> TypeChecker<'a, M> {
         match &self.let_vals[i] {
           Some(val) => {
             h.update(b"let");
-            h.update(self.ctx[i].addr().as_bytes());
-            h.update(val.addr().as_bytes());
+            h.update(&self.ctx[i].addr().to_le_bytes());
+            h.update(&val.addr().to_le_bytes());
           },
           None => {
             h.update(b"local");
-            h.update(self.ctx[i].addr().as_bytes());
+            h.update(&self.ctx[i].addr().to_le_bytes());
           },
         }
       }
@@ -445,10 +445,13 @@ impl<'a, M: KernelMode> TypeChecker<'a, M> {
   }
 
   /// Push a local variable type (lambda/forall binding, no let-value).
+  /// The type is interned first: ctx suffix hashing keys on uids, so a
+  /// canonical frame is what lets equal suffixes share cache entries.
   pub fn push_local(&mut self, ty: KExpr<M>) {
+    let ty = self.env.intern.intern_expr(ty);
     let mut h = blake3::Hasher::new();
     h.update(b"ctx.local");
-    h.update(ty.addr().as_bytes());
+    h.update(&ty.addr().to_le_bytes());
     h.update(self.ctx_id.as_bytes());
     self.ctx_id_stack.push(self.ctx_id);
     self.ctx_id = h.finalize();
@@ -459,10 +462,12 @@ impl<'a, M: KernelMode> TypeChecker<'a, M> {
   /// Push a let-bound variable (type + value). WHNF will zeta-reduce references
   /// to this variable by substituting the value (lean4lean withExtendedLetCtx).
   pub fn push_let(&mut self, ty: KExpr<M>, val: KExpr<M>) {
+    let ty = self.env.intern.intern_expr(ty);
+    let val = self.env.intern.intern_expr(val);
     let mut h = blake3::Hasher::new();
     h.update(b"ctx.let");
-    h.update(ty.addr().as_bytes());
-    h.update(val.addr().as_bytes());
+    h.update(&ty.addr().to_le_bytes());
+    h.update(&val.addr().to_le_bytes());
     h.update(self.ctx_id.as_bytes());
     self.ctx_id_stack.push(self.ctx_id);
     self.ctx_id = h.finalize();
@@ -982,7 +987,7 @@ impl<'a, M: KernelMode> TypeChecker<'a, M> {
       let ctx = self.ctx_addr_for_lbr(e.lbr());
       key.push_str(&format!(
         " ctx={} depth={}",
-        short_addr(&ctx),
+        short_ctx_addr(&ctx),
         self.depth()
       ));
     }
@@ -999,7 +1004,7 @@ impl<'a, M: KernelMode> TypeChecker<'a, M> {
       let ctx = self.def_eq_ctx_key(a, b);
       key.push_str(&format!(
         " ctx={} depth={}",
-        short_addr(&ctx),
+        short_ctx_addr(&ctx),
         self.depth()
       ));
     }
@@ -1155,6 +1160,10 @@ fn hot_expr_shape<M: KernelMode>(e: &KExpr<M>) -> String {
 }
 
 fn short_addr(addr: &Addr) -> String {
+  format!("uid{addr}")
+}
+
+fn short_ctx_addr(addr: &CtxAddr) -> String {
   addr.to_hex().chars().take(12).collect()
 }
 
@@ -1259,13 +1268,19 @@ mod tests {
 
   #[test]
   fn ctx_id_same_pushes_yield_same_hash() {
-    let mut tc1 = new_tc();
-    let mut tc2 = new_tc();
-    tc1.push_local(sort0());
-    tc1.push_local(sort1());
+    // ctx hashing keys on intern uids, so the sharing property holds for
+    // checkers over the SAME env (which is also the cache-sharing scope).
+    let env: *mut KEnv<Meta> = Box::leak(Box::new(KEnv::<Meta>::new()));
+    let id1 = {
+      let mut tc1 = TypeChecker::new(unsafe { &mut *env });
+      tc1.push_local(sort0());
+      tc1.push_local(sort1());
+      tc1.ctx_id
+    };
+    let mut tc2 = TypeChecker::new(unsafe { &mut *env });
     tc2.push_local(sort0());
     tc2.push_local(sort1());
-    assert_eq!(tc1.ctx_id, tc2.ctx_id);
+    assert_eq!(id1, tc2.ctx_id);
   }
 
   #[test]
@@ -1354,19 +1369,22 @@ mod tests {
     // outer frame. A `var(0)` with lbr=1 should key only by the inner
     // suffix, so the two `whnf_key`s should match even though the outer
     // contexts (and hence ctx_ids) differ.
-    let mut tc1 = new_tc();
-    tc1.push_local(sort0()); // outer A
-    tc1.push_local(sort1()); // inner X
-
-    let mut tc2 = new_tc();
+    let env: *mut KEnv<Meta> = Box::leak(Box::new(KEnv::<Meta>::new()));
+    let e = var(0); // lbr = 1, depends only on innermost frame
+    let (h1, ctx1, outer1) = {
+      let mut tc1 = TypeChecker::new(unsafe { &mut *env });
+      tc1.push_local(sort0()); // outer A
+      tc1.push_local(sort1()); // inner X
+      let (h, c) = tc1.whnf_key(&e);
+      (h, c, tc1.ctx_id)
+    };
+    let mut tc2 = TypeChecker::new(unsafe { &mut *env });
     tc2.push_local(sort1()); // outer B (different from A)
     tc2.push_local(sort1()); // inner X (same as tc1's inner)
 
     // ctx_ids differ (different outer frames).
-    assert_ne!(tc1.ctx_id, tc2.ctx_id);
+    assert_ne!(outer1, tc2.ctx_id);
 
-    let e = var(0); // lbr = 1, depends only on innermost frame
-    let (h1, ctx1) = tc1.whnf_key(&e);
     let (h2, ctx2) = tc2.whnf_key(&e);
     assert_eq!(h1, h2);
     assert_eq!(
diff --git a/crates/kernel/src/whnf.rs b/crates/kernel/src/whnf.rs
index 78fa5df9..dc7319f7 100644
--- a/crates/kernel/src/whnf.rs
+++ b/crates/kernel/src/whnf.rs
@@ -2,7 +2,7 @@
 //!
 //! Multi-phase: whnf_core (beta, iota, zeta) → proj → nat → quot → delta.
 
-use std::sync::LazyLock;
+use std::sync::{Arc, LazyLock};
 
 use rustc_hash::FxHashSet;
 
@@ -66,7 +66,9 @@ use super::expr::{ExprData, KExpr};
 use super::id::KId;
 use super::level::KUniv;
 use super::mode::KernelMode;
-use super::subst::{simul_subst, subst, subst_no_intern};
+use super::subst::{
+  Clo, MEnv, clo_readback, clo_subst, subst, subst_no_intern,
+};
 use super::tc::{IotaInfo, MAX_WHNF_FUEL, TypeChecker, collect_app_spine};
 
 use bignat::Nat;
@@ -107,6 +109,80 @@ struct NatRecLiteralParts<M: KernelMode> {
   major: Nat,
   base_idx: usize,
   step_idx: usize,
+  major_idx: usize,
+}
+
+/// Which primitive-reducer family a head constant belongs to. The WHNF
+/// loops used to probe every reducer per iteration — each collecting its
+/// own app spine and running its own gauntlet of 32-byte address
+/// compares. Classifying the head ONCE (memoized per address in
+/// `KEnv::prim_family_cache`) lets an iteration call at most one family
+/// reducer and skip everything for ordinary constants. Mirrors the IxVM
+/// `prim_family` dispatch memo.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum PrimFamily {
+  Native,
+  BitVec,
+  Nat,
+  Decidable,
+  Str,
+  Other,
+}
+
+fn prim_family_uncached<M: KernelMode>(
+  p: &Primitives<M>,
+  addr: &Address,
+) -> PrimFamily {
+  if *addr == p.nat_succ.addr
+    || *addr == p.nat_add.addr
+    || *addr == p.nat_sub.addr
+    || *addr == p.nat_mul.addr
+    || *addr == p.nat_div.addr
+    || *addr == p.nat_mod.addr
+    || *addr == p.nat_pow.addr
+    || *addr == p.nat_gcd.addr
+    || *addr == p.nat_land.addr
+    || *addr == p.nat_lor.addr
+    || *addr == p.nat_xor.addr
+    || *addr == p.nat_shift_left.addr
+    || *addr == p.nat_shift_right.addr
+    || *addr == p.nat_beq.addr
+    || *addr == p.nat_ble.addr
+  {
+    return PrimFamily::Nat;
+  }
+  if *addr == p.nat_dec_le.addr
+    || *addr == p.nat_dec_eq.addr
+    || *addr == p.nat_dec_lt.addr
+    || *addr == p.int_dec_le.addr
+    || *addr == p.int_dec_eq.addr
+    || *addr == p.int_dec_lt.addr
+  {
+    return PrimFamily::Decidable;
+  }
+  if *addr == p.bit_vec_to_nat.addr
+    || *addr == p.bit_vec_ult.addr
+    || *addr == p.decidable_decide.addr
+  {
+    return PrimFamily::BitVec;
+  }
+  if *addr == p.punit_size_of_1.addr
+    || *addr == p.subtype_val.addr
+    || *addr == p.size_of_size_of.addr
+    || *addr == p.system_platform_num_bits.addr
+    || *addr == p.reduce_bool.addr
+    || *addr == p.reduce_nat.addr
+  {
+    return PrimFamily::Native;
+  }
+  if *addr == p.string_back.addr
+    || *addr == p.string_legacy_back.addr
+    || *addr == p.string_utf8_byte_size.addr
+    || *addr == p.string_to_byte_array.addr
+  {
+    return PrimFamily::Str;
+  }
+  PrimFamily::Other
 }
 
 impl<M: KernelMode> TypeChecker<'_, M> {
@@ -290,10 +366,17 @@ impl<M: KernelMode> TypeChecker<'_, M> {
         break;
       }
 
-      // Native reduction: Lean.reduceBool, Lean.reduceNat, System.Platform.numBits
-      // (mirrors lean4 `type_checker.cpp:667-672` and lean4lean
-      // `TypeChecker.lean:438` — `reduce_native` runs before `reduce_nat`).
-      if let Some(reduced) = self.try_reduce_native(&cur)? {
+      // Primitive reduction, dispatched by memoized head family — the
+      // five recognizers below each collect their own spine and run their
+      // own address gauntlet, so probing all of them every iteration was
+      // a measurable tax on ordinary (Other-headed) terms. Semantics are
+      // unchanged: the families' head-address sets are disjoint, so at
+      // most one recognizer could fire anyway. Reference order preserved
+      // (native before nat: lean4 `type_checker.cpp:667-672`).
+      let family = self.head_prim_family(&cur);
+      if family == PrimFamily::Native
+        && let Some(reduced) = self.try_reduce_native(&cur)?
+      {
         cur = reduced;
         continue;
       }
@@ -301,7 +384,9 @@ impl<M: KernelMode> TypeChecker<'_, M> {
       // BitVec definitions reduce through Nat comparisons. Keep this before
       // delta so small definitional facts such as `x < 0#w` collapse
       // without unfolding the full Fin-backed representation of BitVec.
-      if let Some(reduced) = self.try_reduce_bitvec(&cur)? {
+      if family == PrimFamily::BitVec
+        && let Some(reduced) = self.try_reduce_bitvec(&cur)?
+      {
         cur = reduced;
         continue;
       }
@@ -309,25 +394,46 @@ impl<M: KernelMode> TypeChecker<'_, M> {
       // Nat primitive reduction in main WHNF loop (lean4lean TypeChecker.lean:439).
       // Must run BEFORE delta_unfold_one, so that Nat.sub/Nat.pow/etc. get
       // short-circuited before their bodies (which use Nat.rec) are exposed.
-      if let Some(reduced) =
-        self.try_reduce_nat_with_succ_mode(&cur, nat_succ_mode)?
+      if family == PrimFamily::Nat
+        && let Some(reduced) =
+          self.try_reduce_nat_with_succ_mode(&cur, nat_succ_mode)?
       {
         cur = reduced;
         continue;
       }
       // Nat decidability: Nat.decLe/decEq/decLt on literals → Decidable.isTrue/isFalse.
       // Must run BEFORE delta, so the body (which uses dite/Nat.rec) is never exposed.
-      if let Some(reduced) = self.try_reduce_decidable(&cur)? {
+      if family == PrimFamily::Decidable
+        && let Some(reduced) = self.try_reduce_decidable(&cur)?
+      {
         cur = reduced;
         continue;
       }
 
       // String literal primitives such as `String.back ""`.
-      if let Some(reduced) = self.try_reduce_string(&cur) {
+      if family == PrimFamily::Str
+        && let Some(reduced) = self.try_reduce_string(&cur)
+      {
         cur = reduced;
         continue;
       }
 
+      // Keep `Nat.add base (Lit n)` (symbolic base, n > 0) and
+      // `Nat.div/mod base (Lit k)` (k ≥ 2) STUCK as a compact offset instead
+      // of delta-unfolding: `Nat.add` would materialize succ^n(base) — O(n)
+      // substitution per layer — and `Nat.div/mod` would expand the division
+      // algorithm, even though both are irreducible for a symbolic base.
+      // (`Nat.shiftRight x k` unfolds to k nested `Nat.div _ 2`, which now
+      // stay stuck.) Iota over such a major still works via
+      // `cleanup_nat_offset_major`; def-eq decides offset pairs in
+      // `try_def_eq_offset`. Mirrors IxVM 5dcab7f/f7cfe23.
+      if family == PrimFamily::Nat
+        && let Some(stuck) = self.try_nat_offset_stuck(&cur)?
+      {
+        cur = stuck;
+        break;
+      }
+
       if let Some(unfolded) = self.delta_unfold_one(&cur)? {
         cur = unfolded;
         continue;
@@ -538,31 +644,15 @@ impl<M: KernelMode> TypeChecker<'_, M> {
       let (f0, args) = collect_app_spine(&cur);
       let f = self.whnf_core_with_flags(&f0, flags)?;
 
-      // Multi-arg beta
+      // Beta: enter the environment machine. Subsequent betas/zetas are
+      // O(1) environment pushes; substitution materializes only at the
+      // machine's exit, which re-enters this loop as the new `cur` (so
+      // ambient zeta / iota / prim dispatch see exactly what the old
+      // eager-substitution path produced). Entry is gated on an actual
+      // beta firing — Const-headed terms (e.g. literal recursor loops)
+      // never pay the closure-wrap + readback overhead.
       if matches!(f.data(), ExprData::Lam(..)) {
-        let mut body = f;
-        // Pre-size: at most one arg is consumed per outer Lam, capped by
-        // `args.len()`. Pre-sizing skips the first growth reallocation
-        // for non-trivial spines on this hot path.
-        let mut consumed_args = Vec::with_capacity(args.len());
-        while consumed_args.len() < args.len() {
-          if let ExprData::Lam(_, _, _, inner, _) = body.data() {
-            let inner = inner.clone();
-            consumed_args.push(args[consumed_args.len()].clone());
-            body = inner;
-          } else {
-            break;
-          }
-        }
-        let remaining_start = consumed_args.len();
-        if !consumed_args.is_empty() {
-          consumed_args.reverse();
-          body = simul_subst(&mut self.env.intern, &body, &consumed_args, 0);
-        }
-        for arg in &args[remaining_start..] {
-          body = self.intern(KExpr::app(body, arg.clone()));
-        }
-        cur = body;
+        cur = self.machine_whnf(f, &args, &mut fuel, flags)?;
         continue;
       }
 
@@ -589,6 +679,324 @@ impl<M: KernelMode> TypeChecker<'_, M> {
     }
   }
 
+  /// The WHNF environment machine (Krivine-style, structural fragment).
+  /// See `docs/env_machine_whnf.md`; mirrors the IxVM `mwhnf_spine`.
+  ///
+  /// Entered from the App arm of [`Self::whnf_core_with_flags_uncached`]
+  /// when a beta is about to fire: `head` is the already-whnf_core'd
+  /// `Lam`, `args` the plain argument spine nearest-first. Machine state
+  /// is `(head, env, spine)`: `head` a raw subterm of the program, `env`
+  /// the closure environment its loose `Var`s refer to, and `spine` the
+  /// pending argument closures used as a stack — NEAREST argument last.
+  ///
+  /// Transitions (each O(1), no substitution):
+  ///   `App(f, a)`         push `Clo{a, env}`; descend to `f`
+  ///   `Lam` + pending arg pop spine, push env (**beta**)
+  ///   `Let`               push `Clo{val, env}` (**zeta**)
+  ///   `Var(i)`, `i < n`   jump into `env[i]`'s closure
+  ///
+  /// Everything else EXITS: the head and remaining spine read back to a
+  /// plain expression (`clo_subst` — work proportional to what the
+  /// reduction actually consumes), and the caller's loop continues with
+  /// it. That exit contract keeps ambient let/LDecl zeta (the outer
+  /// Var/FVar arms), iota, projection reduction under the `flags` cheap
+  /// policy, and prim dispatch byte-identical to the eager path the
+  /// machine replaces.
+  ///
+  /// Fuel: beta and zeta charge the caller's budget — the same
+  /// one-substitution-event-per-tick granularity as the eager path.
+  /// The transitions between charges (App peels, var jumps) are bounded
+  /// by program structure reachable from the fueled pushes, so total
+  /// work stays fuel-bounded.
+  fn machine_whnf(
+    &mut self,
+    head: KExpr<M>,
+    args: &[KExpr<M>],
+    fuel: &mut u32,
+    flags: WhnfFlags,
+  ) -> Result<KExpr<M>, TcError<M>> {
+    let mut head = head;
+    let mut env: MEnv<M> = MEnv::empty();
+    let mut spine: Vec<Arc<Clo<M>>> =
+      args.iter().rev().map(|a| Arc::new(Clo::closed(a.clone()))).collect();
+
+    loop {
+      match head.data() {
+        ExprData::App(f, a, _) => {
+          let c = Arc::new(Clo::new(a.clone(), env.clone()));
+          let f = f.clone();
+          spine.push(c);
+          head = f;
+        },
+
+        ExprData::Lam(name, bi, ty, body, _) => {
+          if let Some(c) = spine.pop() {
+            // Beta: O(1) environment push.
+            if *fuel == 0 {
+              return Err(TcError::MaxRecDepth);
+            }
+            *fuel -= 1;
+            let body = body.clone();
+            env = env.push(c);
+            head = body;
+          } else {
+            // Value. Read back under one binder.
+            if env.len() == 0 {
+              return Ok(head.clone());
+            }
+            let ty2 = clo_subst(&mut self.env.intern, ty, &env, 0);
+            let body2 = clo_subst(&mut self.env.intern, body, &env, 1);
+            return Ok(self.intern(KExpr::lam(
+              name.clone(),
+              bi.clone(),
+              ty2,
+              body2,
+            )));
+          }
+        },
+
+        ExprData::Let(_, _, val, body, _, _) => {
+          // Zeta: O(1) environment push.
+          if *fuel == 0 {
+            return Err(TcError::MaxRecDepth);
+          }
+          *fuel -= 1;
+          let c = Arc::new(Clo::new(val.clone(), env.clone()));
+          let body = body.clone();
+          env = env.push(c);
+          head = body;
+        },
+
+        ExprData::Var(i, name, _) => {
+          let i = *i;
+          if i < env.len() {
+            // Machine-bound variable: jump into its closure. This is
+            // where laziness pays — the entry was never substituted.
+            let c = env.get(i).clone();
+            head = c.e.clone();
+            env = c.env.clone();
+          } else {
+            // Ambient variable: shift below the machine binders and
+            // exit stuck; the outer loop's Var arm handles legacy
+            // let-bound zeta on it.
+            let h = self.intern(KExpr::var(i - env.len(), name.clone()));
+            return Ok(self.machine_exit(h, &spine));
+          }
+        },
+
+        // Closure-iota (Phase B; mirrors IxVM try_iota_c): a recursor
+        // head consumes the spine LAZILY. On the main ctor-rule path the
+        // rule RHS re-enters the machine with the params/motives/minors
+        // and post-major arguments as their ORIGINAL closures (plus the
+        // ctor's fields wrapped closed) — the rule's Lam-chain betas push
+        // them straight into an environment, so unselected minors
+        // (dropped match/Decidable branches) are never substituted and
+        // never read back. Anything off the main path misses to the
+        // plain readback exit, whose `try_iota` redoes the major's whnf
+        // against a warm cache. Gated like the eager path: cheap_rec
+        // mode never runs full iota from inside the machine.
+        ExprData::Const(id, us, _) => {
+          if !flags.cheap_rec {
+            let id = id.clone();
+            let us: Vec<KUniv<M>> = us.to_vec();
+            if let Some((rhs, new_spine)) =
+              self.try_iota_clo(&id, &us, &spine)?
+            {
+              if *fuel == 0 {
+                return Err(TcError::MaxRecDepth);
+              }
+              *fuel -= 1;
+              head = rhs;
+              env = MEnv::empty();
+              spine = new_spine;
+              continue;
+            }
+          }
+          let h = head.clone();
+          return Ok(self.machine_exit(h, &spine));
+        },
+
+        // Stuck or dispatch-owned heads (no loose Vars of their own):
+        // exit; the outer loop applies prim dispatch / LDecl zeta
+        // exactly as before.
+        ExprData::FVar(..)
+        | ExprData::Sort(..)
+        | ExprData::Nat(..)
+        | ExprData::Str(..) => {
+          let h = head.clone();
+          return Ok(self.machine_exit(h, &spine));
+        },
+
+        // Pi value (or ill-typed Pi application): read back; outer loop
+        // returns it (spine empty) or leaves it stuck (non-empty), as
+        // the eager path did.
+        ExprData::All(name, bi, ty, body, _) => {
+          let h = if env.len() == 0 {
+            head.clone()
+          } else {
+            let ty2 = clo_subst(&mut self.env.intern, ty, &env, 0);
+            let body2 = clo_subst(&mut self.env.intern, body, &env, 1);
+            self.intern(KExpr::all(name.clone(), bi.clone(), ty2, body2))
+          };
+          return Ok(self.machine_exit(h, &spine));
+        },
+
+        // Projection: read the scrutinee back and exit; the outer loop
+        // reduces the projection under the caller's `flags` cheap/full
+        // policy (its Prj arm, or the App arm's head reduction when the
+        // spine is non-empty).
+        ExprData::Prj(id, field, val, _) => {
+          let h = if env.len() == 0 {
+            head.clone()
+          } else {
+            let val2 = clo_subst(&mut self.env.intern, val, &env, 0);
+            self.intern(KExpr::prj(id.clone(), *field, val2))
+          };
+          return Ok(self.machine_exit(h, &spine));
+        },
+      }
+    }
+  }
+
+  /// Read the machine's remaining spine back onto an exit head:
+  /// `h a₁ … aₙ` with each argument materialized via [`clo_readback`].
+  /// `spine` holds the nearest argument LAST.
+  fn machine_exit(&mut self, h: KExpr<M>, spine: &[Arc<Clo<M>>]) -> KExpr<M> {
+    let mut cur = h;
+    for c in spine.iter().rev() {
+      let a = clo_readback(&mut self.env.intern, c);
+      cur = self.intern(KExpr::app(cur, a));
+    }
+    cur
+  }
+
+  /// Closure-spine iota for the machine's `Const` exit: the main
+  /// ctor-rule path of [`Self::try_iota_with_flags`], consuming the
+  /// machine spine lazily. Returns the level-instantiated rule RHS and
+  /// the machine spine to re-enter with — the params/motives/minors and
+  /// post-major arguments ride through as their original closures
+  /// (never read back; only the major materializes), the ctor's field
+  /// arguments are wrapped closed.
+  ///
+  /// Everything off the main path returns `None`, and the caller falls
+  /// back to the plain readback exit whose `try_iota` is complete:
+  /// - K recursors (nullary-ctor synthesis needs infer + def-eq),
+  /// - `Nat` literal majors (the transient-work discipline must keep
+  ///   literal succ-chains out of the interner and the whnf caches, and
+  ///   the linear-rec/offset shortcuts live on the plain path),
+  /// - `Str` literal majors (ctor coercion + re-whnf),
+  /// - struct-eta candidates and genuinely stuck majors.
+  ///
+  /// A miss costs one readback of the major closure; the plain path's
+  /// own whnf of that major then hits a warm cache.
+  fn try_iota_clo(
+    &mut self,
+    rec_id: &KId<M>,
+    rec_us: &[KUniv<M>],
+    spine: &[Arc<Clo<M>>],
+  ) -> Result<Option<(KExpr<M>, Vec<Arc<Clo<M>>>)>, TcError<M>> {
+    let recr = match self.try_get_const(rec_id)? {
+      Some(KConst::Recr {
+        k,
+        params,
+        motives,
+        minors,
+        indices,
+        rules,
+        lvls,
+        ..
+      }) => {
+        if k {
+          return Ok(None);
+        }
+        let major_idx = u64_to_usize::<M>(params + motives + minors + indices)?;
+        if spine.len() <= major_idx {
+          return Ok(None);
+        }
+        // H6: level params arity (lean4lean Reduce.lean:76).
+        if rec_us.len() as u64 != lvls {
+          return Ok(None);
+        }
+        IotaInfo {
+          k,
+          params: u64_to_usize::<M>(params)?,
+          motives: u64_to_usize::<M>(motives)?,
+          minors: u64_to_usize::<M>(minors)?,
+          indices: u64_to_usize::<M>(indices)?,
+          major_idx,
+          rules,
+          lvls,
+        }
+      },
+      _ => return Ok(None),
+    };
+
+    // Materialize ONLY the major. `spine` is nearest-LAST, so the i-th
+    // argument nearest-first sits at `spine[len - 1 - i]`.
+    let len = spine.len();
+    let major_clo = spine[len - 1 - recr.major_idx].clone();
+    let major = clo_readback(&mut self.env.intern, &major_clo);
+    // Mirror the eager path's pre- and post-whnf offset cleanups so
+    // `Nat.add base k` majors expose a `Nat.succ` layer here instead of
+    // missing to a second full attempt.
+    let major = match self.cleanup_nat_offset_major(&major)? {
+      Some(cleaned) => cleaned,
+      None => major,
+    };
+    let mut major_whnf = self.whnf(&major)?;
+    if matches!(major_whnf.data(), ExprData::Nat(..)) {
+      return Ok(None);
+    }
+    if let Some(cleaned) = self.cleanup_nat_offset_major(&major_whnf)? {
+      major_whnf = cleaned;
+    }
+    if matches!(major_whnf.data(), ExprData::Str(..)) {
+      return Ok(None);
+    }
+
+    let (ctor_head, ctor_args) = collect_app_spine(&major_whnf);
+    let ctor_id = match ctor_head.data() {
+      ExprData::Const(id, _, _) => id.clone(),
+      _ => return Ok(None),
+    };
+    let (cidx, ctor_fields) = match self.try_get_const(&ctor_id)? {
+      Some(KConst::Ctor { cidx, fields, .. }) => {
+        (u64_to_usize::<M>(cidx)?, u64_to_usize::<M>(fields)?)
+      },
+      _ => return Ok(None),
+    };
+    if cidx >= recr.rules.len() {
+      return Ok(None);
+    }
+    // H5: nfields ≤ major args (lean4lean Reduce.lean:75).
+    if ctor_fields > ctor_args.len() {
+      return Ok(None);
+    }
+
+    crate::perf::record_iota_histo(&rec_id.addr);
+    let rule = &recr.rules[cidx];
+    let rec_us_vec: Vec<_> = rec_us.to_vec();
+    let rhs = self.instantiate_univ_params(&rule.rhs, &rec_us_vec)?;
+
+    // New machine spine, nearest-LAST. Nearest-first the rule sees
+    // `pmm ++ ctor fields ++ post-major`; in machine order that is the
+    // post-major prefix of `spine` (indices below the major's slot),
+    // then the fields reversed, then the pmm suffix of `spine`. The
+    // index arguments between pmm and the major are dropped, as in the
+    // eager path.
+    let pmm_end = recr.params + recr.motives + recr.minors;
+    let field_start = ctor_args.len() - ctor_fields;
+    let post_len = len - 1 - recr.major_idx;
+    let mut new_spine: Vec<Arc<Clo<M>>> =
+      Vec::with_capacity(post_len + ctor_fields + pmm_end);
+    new_spine.extend_from_slice(&spine[..post_len]);
+    for arg in ctor_args[field_start..].iter().rev() {
+      new_spine.push(Arc::new(Clo::closed(arg.clone())));
+    }
+    new_spine.extend_from_slice(&spine[len - pmm_end..]);
+    Ok(Some((rhs, new_spine)))
+  }
+
   /// WHNF without delta: whnf_core → proj-app → nat/native/string → quot.
   /// Projection values use full WHNF, preserving the public/full semantics.
   pub fn whnf_no_delta(
@@ -696,15 +1104,23 @@ impl<M: KernelMode> TypeChecker<'_, M> {
         continue;
       }
 
+      // Primitive reduction, dispatched by memoized head family (see the
+      // main WHNF loop) — family head sets are disjoint, so dispatching
+      // replaces probe-everything without changing semantics.
+      let family = self.head_prim_family(&cur);
+
       // BitVec.toNat/ult reductions are definitional wrappers around Nat.
-      if let Some(reduced) = self.try_reduce_bitvec(&cur)? {
+      if family == PrimFamily::BitVec
+        && let Some(reduced) = self.try_reduce_bitvec(&cur)?
+      {
         cur = reduced;
         continue;
       }
 
       // Nat primitive reduction
-      if let Some(reduced) =
-        self.try_reduce_nat_with_succ_mode(&cur, nat_succ_mode)?
+      if family == PrimFamily::Nat
+        && let Some(reduced) =
+          self.try_reduce_nat_with_succ_mode(&cur, nat_succ_mode)?
       {
         cur = reduced;
         continue;
@@ -715,13 +1131,17 @@ impl<M: KernelMode> TypeChecker<'_, M> {
       // `Subtype.val` and `String.toByteArray` are projection definitions;
       // once rewritten to `Prj`, the cheap primitive recognizers no longer
       // see the original head.
-      if let Some(reduced) = self.try_reduce_native(&cur)? {
+      if family == PrimFamily::Native
+        && let Some(reduced) = self.try_reduce_native(&cur)?
+      {
         cur = reduced;
         continue;
       }
 
       // String literal primitives.
-      if let Some(reduced) = self.try_reduce_string(&cur) {
+      if family == PrimFamily::Str
+        && let Some(reduced) = self.try_reduce_string(&cur)
+      {
         cur = reduced;
         continue;
       }
@@ -885,7 +1305,9 @@ impl<M: KernelMode> TypeChecker<'_, M> {
           minors: u64_to_usize::<M>(minors)?,
           indices: u64_to_usize::<M>(indices)?,
           major_idx,
-          rules: rules.clone(),
+          // `rules` is already owned here (moved out of the KConst clone
+          // `try_get_const` returned) — do not clone it again.
+          rules,
           lvls,
         }
       },
@@ -1066,23 +1488,43 @@ impl<M: KernelMode> TypeChecker<'_, M> {
   /// Nat literal iota can create a long chain of distinct predecessor terms.
   /// These terms are useful only while the current WHNF is executing; keeping
   /// each one in the global WHNF caches makes RSS linear in the literal.
+  /// Allocation-free spine probe: head expression and arg count without
+  /// materializing the spine. The transient-nat probes below run on
+  /// *every* whnf call **before** the cache lookup, so they must not
+  /// heap-allocate on the (overwhelmingly common) non-Nat-recursor path —
+  /// the previous implementation paid two `collect_app_spine` Vec
+  /// allocations plus a `KConst::Recr` clone per call, defeating the
+  /// cache on the hottest path in a full check. (Ported from jcb/fixes
+  /// H-15.)
+  fn spine_head_and_len(e: &KExpr<M>) -> (&KExpr<M>, usize) {
+    let mut cur = e;
+    let mut n = 0usize;
+    while let ExprData::App(f, _, _) = cur.data() {
+      n += 1;
+      cur = f;
+    }
+    (cur, n)
+  }
+
   fn is_transient_nat_literal_work(
     &mut self,
     e: &KExpr<M>,
   ) -> Result<bool, TcError<M>> {
-    if self.is_nat_literal_recursor_app(e)? {
-      return Ok(true);
-    }
-
-    let (head, args) = collect_app_spine(e);
+    let (head, nargs) = Self::spine_head_and_len(e);
     let ExprData::Const(id, _, _) = head.data() else {
       return Ok(false);
     };
-
-    if id.addr == self.prims.nat_succ.addr && args.len() == 1 {
-      return self.is_nat_literal_recursor_app(&args[0]);
+    if id.addr == self.prims.nat_rec.addr
+      || id.addr == self.prims.nat_cases_on.addr
+    {
+      return self.is_nat_literal_recursor_app(e);
+    }
+    if id.addr == self.prims.nat_succ.addr
+      && nargs == 1
+      && let ExprData::App(_, arg, _) = e.data()
+    {
+      return self.is_nat_literal_recursor_app(arg);
     }
-
     Ok(false)
   }
 
@@ -1090,7 +1532,9 @@ impl<M: KernelMode> TypeChecker<'_, M> {
     &mut self,
     e: &KExpr<M>,
   ) -> Result<bool, TcError<M>> {
-    let (head, spine) = collect_app_spine(e);
+    // Cheap pre-filter first; only fall through to the allocating spine
+    // collection + recursor lookup when the head can actually match.
+    let (head, _) = Self::spine_head_and_len(e);
     let ExprData::Const(id, _, _) = head.data() else {
       return Ok(false);
     };
@@ -1105,6 +1549,7 @@ impl<M: KernelMode> TypeChecker<'_, M> {
     else {
       return Ok(false);
     };
+    let (_, spine) = collect_app_spine(e);
     let major_idx = u64_to_usize::<M>(params + motives + minors + indices)?;
     Ok(
       spine
@@ -1233,6 +1678,114 @@ impl<M: KernelMode> TypeChecker<'_, M> {
     None
   }
 
+  /// If `e` is `Nat.add base (Lit n)` (n > 0) or `Nat.div/mod base (Lit k)`
+  /// (k ≥ 2) with a non-literal base, return the same operation in canonical
+  /// compact form so the WHNF loop can leave it stuck instead of
+  /// delta-unfolding. Thresholds keep `x + 0`, `x / 1`, `x / 0` (and mod)
+  /// reducing through the normal path. `None` means "not this shape —
+  /// proceed normally".
+  fn try_nat_offset_stuck(
+    &mut self,
+    e: &KExpr<M>,
+  ) -> Result<Option<KExpr<M>>, TcError<M>> {
+    // Allocation-free quick reject: this probe runs once per delta-unfold
+    // loop iteration, so don't collect a spine Vec unless the head constant
+    // is one of the three Nat primitives.
+    {
+      let mut cur = e;
+      loop {
+        match cur.data() {
+          ExprData::App(f, _, _) => cur = f,
+          ExprData::Const(id, _, _) => {
+            if id.addr != self.prims.nat_add.addr
+              && id.addr != self.prims.nat_div.addr
+              && id.addr != self.prims.nat_mod.addr
+            {
+              return Ok(None);
+            }
+            break;
+          },
+          _ => return Ok(None),
+        }
+      }
+    }
+    let (head, args) = collect_app_spine(e);
+    let ExprData::Const(id, _, _) = head.data() else {
+      return Ok(None);
+    };
+    let is_add = id.addr == self.prims.nat_add.addr;
+    let is_divmod =
+      id.addr == self.prims.nat_div.addr || id.addr == self.prims.nat_mod.addr;
+    if (!is_add && !is_divmod) || args.len() != 2 {
+      return Ok(None);
+    }
+    let Some(wb) = self.whnf_nat_reducer_arg(&args[1])? else {
+      return Ok(None);
+    };
+    let Some(n) = extract_nat_value(&wb, &self.prims) else {
+      return Ok(None);
+    };
+    if n.0 == num_bigint::BigUint::ZERO {
+      return Ok(None);
+    }
+    if is_divmod && n.0 == num_bigint::BigUint::from(1u64) {
+      return Ok(None);
+    }
+    let Some(wa) = self.whnf_nat_reducer_arg(&args[0])? else {
+      return Ok(None);
+    };
+    if extract_nat_value(&wa, &self.prims).is_some() {
+      // Both sides literal: this is closed arithmetic for `try_reduce_nat`,
+      // not a stuck offset.
+      return Ok(None);
+    }
+    let lit = self.nat_expr_from_value(n);
+    let inner = self.intern(KExpr::app(head.clone(), wa));
+    Ok(Some(self.intern(KExpr::app(inner, lit))))
+  }
+
+  /// Decompose a (whnf'd) Nat term into `(base, offset)` for offset-aware
+  /// def-eq: `Lit n` → `(None, n)`; `succ^j(Nat.add core (Lit m))` →
+  /// `(Some(core), j + m)` read in O(1) per layer via [`Self::nat_offset`]
+  /// instead of peeled one `is_def_eq` recursion level at a time.
+  /// `base = None` means the core is literal zero (the term IS a numeral).
+  /// `None` (the outer Option) means "not offset-shaped".
+  pub(super) fn nat_offset_decompose(
+    &mut self,
+    e: &KExpr<M>,
+  ) -> Result<Option<(Option<KExpr<M>>, Nat)>, TcError<M>> {
+    if let Some(v) = extract_nat_value(e, &self.prims) {
+      return Ok(Some((None, v)));
+    }
+    match self.nat_offset(e, 0)? {
+      Some((base, off)) if off.0 > num_bigint::BigUint::ZERO => {
+        if let Some(bv) = extract_nat_value(&base, &self.prims) {
+          Ok(Some((None, Nat(bv.0 + off.0))))
+        } else {
+          Ok(Some((Some(base), off)))
+        }
+      },
+      _ => Ok(None),
+    }
+  }
+
+  /// Rebuild `base + r` in the compact offset form used by
+  /// [`Self::try_nat_offset_stuck`].
+  pub(super) fn nat_offset_rebuild(
+    &mut self,
+    base: Option<KExpr<M>>,
+    r: Nat,
+  ) -> KExpr<M> {
+    match base {
+      None => self.nat_expr_from_value(r),
+      Some(b) if r.0 == num_bigint::BigUint::ZERO => b,
+      Some(b) => {
+        let lit = self.nat_expr_from_value(r);
+        self.mk_nat_add(b, lit)
+      },
+    }
+  }
+
   fn mk_nat_succ(&mut self, pred: KExpr<M>) -> KExpr<M> {
     let succ = KExpr::cnst(self.prims.nat_succ.clone(), Box::new([]));
     KExpr::app(succ, pred)
@@ -1694,6 +2247,26 @@ impl<M: KernelMode> TypeChecker<'_, M> {
   }
 
   /// Nat primitive reduction (add, sub, mul, div, mod, pow, gcd, bitwise, predicates).
+  /// Classify `e`'s head constant into a primitive-reducer family.
+  /// Allocation-free app-chain walk + per-address memo.
+  pub(super) fn head_prim_family(&mut self, e: &KExpr<M>) -> PrimFamily {
+    let mut cur = e;
+    loop {
+      match cur.data() {
+        ExprData::App(f, _, _) => cur = f,
+        ExprData::Const(id, _, _) => {
+          if let Some(&fam) = self.env.prim_family_cache.get(&id.addr) {
+            return fam;
+          }
+          let fam = prim_family_uncached(&self.prims, &id.addr);
+          self.env.prim_family_cache.insert(id.addr.clone(), fam);
+          return fam;
+        },
+        _ => return PrimFamily::Other,
+      }
+    }
+  }
+
   pub(super) fn try_reduce_nat(
     &mut self,
     e: &KExpr<M>,
@@ -1803,7 +2376,7 @@ impl<M: KernelMode> TypeChecker<'_, M> {
     if self.env.nat_succ_stuck.contains(&entry_key) {
       return Ok(None);
     }
-    let mut visited: Vec<(super::env::Addr, super::env::Addr)> =
+    let mut visited: Vec<(super::env::Addr, super::env::CtxAddr)> =
       vec![entry_key];
     let mut offset = num_bigint::BigUint::from(1u64);
     let mut cur = arg.clone();
@@ -1885,7 +2458,20 @@ impl<M: KernelMode> TypeChecker<'_, M> {
     let base = base.clone();
     let base_whnf = self.whnf(&base)?;
     let Some(base_val) = extract_nat_value(&base_whnf, &self.prims) else {
-      return Ok(None);
+      // Symbolic base: collapse `succ^offset(Nat.rec base succ-step (Lit n))`
+      // to the compact offset `Nat.add base (Lit (n + offset))` rather than
+      // declining into n iota steps that materialize succ^n(base). Keeps the
+      // value in the same `base + k` form a literal already has, so def-eq
+      // converges instead of descending n unary succ layers. Conservative:
+      // only when the recursor application carries no post-major arguments.
+      // Mirrors IxVM dbc4177.
+      if parts.spine.len() != parts.major_idx + 1 {
+        return Ok(None);
+      }
+      let total = Nat(&parts.major.0 + offset);
+      let lit = self.nat_expr_from_value(total);
+      let result = self.mk_nat_add(base_whnf, lit);
+      return Ok(Some(result));
     };
 
     let mut total = base_val.0;
@@ -1932,7 +2518,7 @@ impl<M: KernelMode> TypeChecker<'_, M> {
     };
     let major = major.clone();
 
-    Ok(Some(NatRecLiteralParts { spine, major, base_idx, step_idx }))
+    Ok(Some(NatRecLiteralParts { spine, major, base_idx, step_idx, major_idx }))
   }
 
   fn is_nat_succ_ih_step(
@@ -2908,6 +3494,15 @@ fn compute_nat_bin<M: KernelMode>(
   b: &Nat,
 ) -> Option<Nat> {
   use num_bigint::BigUint;
+  // Output-size cap for natively-computed results. `shiftLeft` and `pow`
+  // amplify a tiny term into a result exponentially larger than the input
+  // bytes (`Nat.shiftLeft 1 (1 << 40)` is a ~12-byte term demanding a
+  // ~128 GiB allocation). Refuse to materialize anything wider and leave
+  // the term stuck, exactly like the `pow` exponent guard below. 2^26 bits
+  // (8 MiB) keeps every `2^e, e <= ReducePowMaxExp` reduction the C++
+  // kernel performs while bounding the allocation an adversarial term can
+  // force. (Ported from jcb/fixes H-12.)
+  const MAX_NAT_REDUCE_BITS: u64 = 1 << 26;
   let zero = BigUint::ZERO;
   let r = if *addr == p.nat_add.addr {
     &a.0 + &b.0
@@ -2924,7 +3519,14 @@ fn compute_nat_bin<M: KernelMode>(
     const REDUCE_POW_MAX_EXP: u64 = 1 << 24; // 16_777_216
     match b.to_u64() {
       #[allow(clippy::cast_possible_truncation)] // guarded: exp <= 2^24
-      Some(exp) if exp <= REDUCE_POW_MAX_EXP => a.0.pow(exp as u32),
+      Some(exp) if exp <= REDUCE_POW_MAX_EXP => {
+        // The exponent cap alone does not bound the *result*: a wide base
+        // with an allowed exponent still explodes (bits(a) * exp).
+        if a.0.bits().saturating_mul(exp) > MAX_NAT_REDUCE_BITS {
+          return None;
+        }
+        a.0.pow(exp as u32)
+      },
       _ => return None, // too large to compute
     }
   } else if *addr == p.nat_gcd.addr {
@@ -2936,8 +3538,16 @@ fn compute_nat_bin<M: KernelMode>(
   } else if *addr == p.nat_xor.addr {
     &a.0 ^ &b.0
   } else if *addr == p.nat_shift_left.addr {
-    let shift = usize::try_from(b.to_u64()?).ok()?;
-    &a.0 << shift
+    let shift = b.to_u64()?;
+    if a.0 == zero {
+      zero
+    } else if a.0.bits().saturating_add(shift) > MAX_NAT_REDUCE_BITS {
+      // Result has exactly bits(a) + shift bits — refuse to materialize.
+      return None;
+    } else {
+      let shift = usize::try_from(shift).ok()?;
+      &a.0 << shift
+    }
   } else if *addr == p.nat_shift_right.addr {
     let shift = usize::try_from(b.to_u64()?).ok()?;
     &a.0 >> shift
@@ -3195,6 +3805,166 @@ mod tests {
     assert_eq!(tc.whnf(&app).unwrap(), sort0());
   }
 
+  // ---- environment-machine readback paths ----
+
+  #[test]
+  fn whnf_machine_open_term_shift() {
+    let mut env = env_with_id();
+    let mut tc = TypeChecker::new(&mut env);
+    // (λ x. x #3) opaque → opaque #2: the ambient Var must shift down
+    // past the consumed machine binder during spine readback.
+    let opaque = AE::cnst(mk_id("opaque"), Box::new([]));
+    let body = AE::app(AE::var(0, ()), AE::var(3, ()));
+    let lam = AE::lam((), (), sort0(), body);
+    let app = AE::app(lam, opaque.clone());
+    let expected = AE::app(opaque, AE::var(2, ()));
+    assert_eq!(tc.whnf(&app).unwrap(), expected);
+  }
+
+  #[test]
+  fn whnf_machine_partial_app_value() {
+    let mut env = env_with_id();
+    let mut tc = TypeChecker::new(&mut env);
+    // (λ x y. x y) opaque → λ y. opaque y: Lam value exit reads the body
+    // back at binder depth 1.
+    let opaque = AE::cnst(mk_id("opaque"), Box::new([]));
+    let body = AE::app(AE::var(1, ()), AE::var(0, ()));
+    let lam2 = AE::lam((), (), sort0(), AE::lam((), (), sort0(), body));
+    let app = AE::app(lam2, opaque.clone());
+    let expected = AE::lam((), (), sort0(), AE::app(opaque, AE::var(0, ())));
+    assert_eq!(tc.whnf(&app).unwrap(), expected);
+  }
+
+  #[test]
+  fn whnf_machine_env_entry_lifts_under_binder() {
+    let mut env = env_with_id();
+    let mut tc = TypeChecker::new(&mut env);
+    // (λ x y. x) #3 → λ y. #4: substituting an OPEN argument under the
+    // surviving binder must lift it (Var arm of clo_subst at depth 1).
+    let lam2 =
+      AE::lam((), (), sort0(), AE::lam((), (), sort0(), AE::var(1, ())));
+    let app = AE::app(lam2, AE::var(3, ()));
+    let expected = AE::lam((), (), sort0(), AE::var(4, ()));
+    assert_eq!(tc.whnf(&app).unwrap(), expected);
+  }
+
+  #[test]
+  fn whnf_machine_zeta_under_beta() {
+    let mut env = env_with_id();
+    let mut tc = TypeChecker::new(&mut env);
+    // (λ x. let z := x in z) opaque → opaque: machine zeta push.
+    let opaque = AE::cnst(mk_id("opaque"), Box::new([]));
+    let body = AE::let_((), sort0(), AE::var(0, ()), AE::var(0, ()), true);
+    let lam = AE::lam((), (), sort0(), body);
+    let app = AE::app(lam, opaque.clone());
+    assert_eq!(tc.whnf(&app).unwrap(), opaque);
+  }
+
+  #[test]
+  fn whnf_machine_closure_iota_via_beta() {
+    use super::super::constant::RecRule;
+    // A beta whose body is a recursor application: the machine's Const
+    // exit must take the closure-iota path (rule args ride through as
+    // closures) and produce the same result as the eager iota.
+    //
+    // Unit-like inductive `U` with one ctor `U.mk` (no params/fields)
+    // and recursor `U.rec : (motive) (minor) (major) → minor`.
+    let u_id = mk_id("Test.U");
+    let u_mk_id = mk_id("Test.U.mk");
+    let u_rec_id = mk_id("Test.U.rec");
+    let mut env = env_with_id();
+    env.insert(
+      u_id.clone(),
+      KConst::Indc {
+        name: (),
+        level_params: (),
+        lvls: 0,
+        params: 0,
+        indices: 0,
+        is_rec: true, // disable struct-eta so the ctor path is the only route
+        is_refl: false,
+        is_unsafe: false,
+        nested: 0,
+        block: u_id.clone(),
+        member_idx: 0,
+        ty: sort0(),
+        ctors: vec![u_mk_id.clone()],
+        lean_all: (),
+      },
+    );
+    env.insert(
+      u_mk_id.clone(),
+      KConst::Ctor {
+        name: (),
+        level_params: (),
+        is_unsafe: false,
+        lvls: 0,
+        induct: u_id.clone(),
+        cidx: 0,
+        params: 0,
+        fields: 0,
+        ty: AE::cnst(u_id.clone(), Box::new([])),
+      },
+    );
+    env.insert(
+      u_rec_id.clone(),
+      KConst::Recr {
+        name: (),
+        level_params: (),
+        k: false,
+        is_unsafe: false,
+        lvls: 0,
+        params: 0,
+        indices: 0,
+        motives: 1,
+        minors: 1,
+        block: u_id.clone(),
+        member_idx: 0,
+        ty: sort0(),
+        // rule rhs: λ motive minor. minor
+        rules: vec![RecRule {
+          ctor: (),
+          fields: 0,
+          rhs: AE::lam(
+            (),
+            (),
+            sort0(),
+            AE::lam((), (), sort0(), AE::var(0, ())),
+          ),
+        }],
+        lean_all: (),
+      },
+    );
+    let mut tc = TypeChecker::new(&mut env);
+    let rec = AE::cnst(u_rec_id, Box::new([]));
+    let mk = AE::cnst(u_mk_id, Box::new([]));
+    let opaque = AE::cnst(mk_id("opaque"), Box::new([]));
+    // (λ m. U.rec Sort0 m U.mk #2) opaque
+    //   → machine beta (m := opaque), Const(U.rec) exit → closure-iota
+    //   → minor = opaque, post-major arg #3 shifts to #2.
+    let body = AE::app(
+      AE::app(AE::app(AE::app(rec, sort0()), AE::var(0, ())), mk),
+      AE::var(3, ()),
+    );
+    let lam = AE::lam((), (), sort0(), body);
+    let app = AE::app(lam, opaque.clone());
+    let expected = AE::app(opaque, AE::var(2, ()));
+    assert_eq!(tc.whnf(&app).unwrap(), expected);
+  }
+
+  #[test]
+  fn whnf_machine_chained_beta() {
+    let mut env = env_with_id();
+    let mut tc = TypeChecker::new(&mut env);
+    // ((λ x. x) (λ y. y)) opaque → opaque: the intermediate beta result
+    // is consumed by the next beta without materializing.
+    let inner = AE::lam((), (), sort0(), AE::var(0, ()));
+    let outer = AE::lam((), (), sort0(), AE::var(0, ()));
+    let opaque = AE::cnst(mk_id("opaque"), Box::new([]));
+    let app = AE::app(AE::app(outer, inner), opaque.clone());
+    assert_eq!(tc.whnf(&app).unwrap(), opaque);
+  }
+
   #[test]
   fn whnf_delta_opaque_blocked() {
     let mut env = env_with_id();
@@ -3883,7 +4653,13 @@ mod tests {
   }
 
   #[test]
-  fn whnf_nat_add_symbolic_literal_rhs_exposes_succ() {
+  fn whnf_nat_add_symbolic_literal_rhs_stays_compact() {
+    // `Nat.add x (Lit 2)` with a symbolic base must stay STUCK in compact
+    // offset form (`try_nat_offset_stuck`) instead of delta-unfolding into
+    // `succ(succ(x))` — that tower is O(n) substitution per layer and the
+    // dominant cost on Nat-arithmetic proofs. Iota over such a major still
+    // works via `cleanup_nat_offset_major`; def-eq decides offset pairs in
+    // `try_def_eq_offset`.
     let mut env = nat_env();
     let empty = KEnv::new();
     let prims = Primitives::from_env(&empty);
@@ -3891,10 +4667,29 @@ mod tests {
 
     let mut tc = TypeChecker::new(&mut env);
     let add = AE::cnst(tc.prims.nat_add.clone(), Box::new([]));
-    let expr = app(app(add, var(0)), mk_nat(2));
+    let expr = app(app(add.clone(), var(0)), mk_nat(2));
     let result = tc.whnf(&expr).unwrap();
+    assert_eq!(result, app(app(add, var(0)), mk_nat(2)));
+  }
+
+  #[test]
+  fn def_eq_nat_offset_succ_tower_vs_compact_add() {
+    // `succ(succ(x))` ≡ `Nat.add x (Lit 2)` must be decided by the offset
+    // decomposition in `try_def_eq_offset` (equal offsets → compare bases),
+    // and `succ(x) ≟ Nat.add x (Lit 2)` must NOT be equal.
+    let mut env = nat_env();
+    let empty = KEnv::new();
+    let prims = Primitives::from_env(&empty);
+    insert_nat_add_model(&mut env, prims.nat_add.clone());
+
+    let mut tc = TypeChecker::new(&mut env);
+    let add = AE::cnst(tc.prims.nat_add.clone(), Box::new([]));
     let succ = AE::cnst(tc.prims.nat_succ.clone(), Box::new([]));
-    assert_eq!(result, app(succ.clone(), app(succ, var(0))));
+    let tower = app(succ.clone(), app(succ.clone(), var(0)));
+    let compact = app(app(add, var(0)), mk_nat(2));
+    assert!(tc.is_def_eq(&tower, &compact).unwrap());
+    let one = app(succ, var(0));
+    assert!(!tc.is_def_eq(&one, &compact).unwrap());
   }
 
   #[test]
diff --git a/docs/env_machine_whnf.md b/docs/env_machine_whnf.md
new file mode 100644
index 00000000..3fa1bcfd
--- /dev/null
+++ b/docs/env_machine_whnf.md
@@ -0,0 +1,259 @@
+# Design: environment-machine WHNF for the Rust kernel
+
+Port of the IxVM environment machine (`~/ix-aiur` commits `d0d3375`
+Phase A, `b4c4ea3` Phase B; `Ix/IxVM/Kernel/Whnf.lean:209-460`,
+`Subst.lean:352-400`) to `crates/kernel/src/whnf.rs`. Status:
+IMPLEMENTED — Phase A in `7825e0b` (machine loop in `whnf.rs`,
+`Clo`/`MEnv`/`clo_subst` in `subst.rs`), Phase B closure-iota in
+`acd0780` (`try_iota_clo`); guest cycle benchmarks pending. One
+deviation from §7: machine-native delta (the IxVM Phase C1.5 win)
+does not port at this layer — our machine lives inside `whnf_core`,
+which must stay delta-free for def-eq's lazy-delta unfold ordering,
+whereas the IxVM machine sits in a whnf that includes delta. Spanning
+`whnf`'s delta loop with closures is a separate, larger design.
+
+## 1. Problem and evidence
+
+Eager substitution is the dominant residual cost on reduction-heavy
+shards. Every beta materializes the substituted body via
+`simul_subst` (a full walk + intern of every reachable subterm), and
+every iota substitutes a whole recursor rule — including subterms the
+reduction then discards (unselected `Decidable`/match branches,
+unused minor premises, arguments dropped by projections).
+
+Current guest profiles (`/tmp/prof3-*.log`, cumulative):
+
+- `Vector.extract_append._proof_1` (4.95B steps): `simul_subst` 49%,
+  `try_same_head_spine` 71%, `whnf_core` 73%.
+- UTF-8 codec proof (1.16B steps): `try_iota_with_flags` **85.6%**,
+  `try_same_head_spine` 85.7% — one deep def-eq descent with iota at
+  every level, substituting giant match trees whose branches are
+  mostly dropped.
+- mergesort shard (2.3B steps): `whnf_core` 38%, `simul_subst` 16%.
+
+The same wall was hit independently on the Aiur side ("the
+substitution wall is THE remaining structural frontier — needs a
+representation-level fix, not another stuck-form"), and the machine
+is its proven fix, measured there (FFT cost):
+
+- Phase A: Int16 −33.8%, Vector._proof_2 −17.1%, mergeSortBench
+  −6.8%, String.append −7.7%, gcd −5.0%; **synthetic foldAdd_2000
+  +14.7%** (see Risks).
+- Phase B (closure-iota): a further −3.4% Int16, −2.0% mergeSort;
+  headline payoff (dropped-branch laziness on UTF-8-class) not
+  represented in that suite.
+
+## 2. Core idea
+
+A Krivine-style abstract machine for the *structural* fragment of
+WHNF (beta, zeta, app-spine, bound-variable lookup):
+
+- **Beta and zeta are O(1) environment pushes.** No substitution
+  walk, no materialized intermediate body.
+- **Substitution happens only at exit points** ("readback"), and only
+  for the parts of the term the reduction actually hands to a
+  consumer. A beta chain ending in another beta never materializes
+  its intermediate bodies; an iota that selects rule 3 never touches
+  rules 0–2.
+- WHNF never reduces under binders, so environments **never need
+  lifting** — no full explicit-substitution calculus required. A
+  `Lam`/`All` value reads back with `clo_subst` at binder depth 1.
+
+## 3. Data types (Rust)
+
+```rust
+/// Expression closed over a machine environment:
+/// `Clo { e, env }` denotes e[Var(i) := env[i]] for i < env.len(),
+/// with Var(i) for i >= env.len() shifting DOWN by env.len() into the
+/// ambient context. Environments are built only by the machine's
+/// beta/zeta pushes.
+struct Clo<M: KernelMode> {
+  e: KExpr<M>,
+  env: MEnv<M>,
+}
+
+/// Persistent cons-list environment: O(1) push, structural sharing
+/// across the closures captured at each binder. `len` is cached — the
+/// IxVM port found recomputing it per suffix cost ~94M rows on the
+/// UTF-8 check.
+struct MEnvNode<M: KernelMode> { head: Arc<Clo<M>>, tail: MEnv<M> }
+type MEnv<M> = Option<Arc<MEnvNode<M>>>;  // + len: u32 carried alongside
+
+/// Machine spine: args nearest-first, each a closure over the env in
+/// force where the App layer was peeled. Also a persistent cons list
+/// (the machine only pushes at the front and pops from the front).
+type CSpine<M> = MEnv-like list of Arc<Clo<M>>;
+```
+
+No `CWhnf` enum in Phase A: the Rust machine is a private loop inside
+`whnf_core_with_flags_uncached`, and every exit reads back to a plain
+`KExpr` that re-enters the *existing* loop (see §5). The symbolic
+result type only becomes necessary for Phase C (closure-aware
+def-eq), which this design defers.
+
+## 4. The machine loop
+
+A single iterative loop (NOT recursion — machine depth equals
+reduction depth and must not consume native stack; every IxVM
+transition is tail-call-shaped, so the loop is mechanical). State:
+`(head: KExpr, env: MEnv, n: u32, spine: CSpine)`. Fuel: decrements
+the same `MAX_WHNF_FUEL` budget as the enclosing `whnf_core` loop —
+unlike IxVM we KEEP fuel (adversarial-input posture; see
+`docs/kernel_identity.md` for the threat model).
+
+Transitions, mirroring `mwhnf_spine`:
+
+| head | action |
+|---|---|
+| `App(f, a)` | push `Clo{a, env}` onto spine; `head = f` (O(1) per layer, no `collect_app_spine` Vec) |
+| `Lam(_, _, _, body)` | spine non-empty → pop `c`, push onto env, `head = body` (**beta, O(1)**); spine empty → EXIT: read back `Lam` via `clo_subst` (a value) |
+| `Let(_, _, val, body)` | push `Clo{val, env}` onto env, `head = body` (**zeta, O(1)**) |
+| `Var(i)` with `i < n` | fetch `Clo{e2, env2}` = env[i]; `head, env = e2, env2` (O(1) jump — this is where laziness pays) |
+| `Var(i)` with `i >= n` | EXIT stuck: read back spine onto `Var(i - n)`; the *outer* loop's Var arm then does ambient `lookup_let_val` zeta |
+| `FVar(_)` | EXIT stuck (readback); outer loop's FVar arm does LDecl zeta |
+| `All(..)` | spine empty → EXIT readback; non-empty → ill-typed application, readback stuck (mirror IxVM) |
+| `Const(..)` | EXIT: read back spine, rebuild application, hand to the existing dispatch (prim families, iota, projection-definitions, delta in the enclosing loops). Phase B refines the `Recr` case — see §7 |
+| `Prj(id, f, val)` | scrutinee = `Clo{val, env}`: whnf it through the machine **respecting `flags.cheap_proj`** (cheap → machine + structural-only exits; full → full `whnf`), then the existing `try_proj_reduce` on the result; spine read back |
+| `Sort/Nat/Str` | EXIT readback (stuck application of a non-function — outer loop handles/returns) |
+
+**Exit contract:** every machine exit produces a plain `KExpr` and
+*re-enters `whnf_core_with_flags_uncached`'s loop* as the new `cur`
+(or returns, where the old code returned). This keeps EVERY existing
+behavior — ambient let/LDecl zeta, iota, K-synthesis,
+`cleanup_nat_offset_major`, string/nat literal exposure,
+proj-definitions, the nat-offset-stuck probe, delta in `whnf` — byte
+identical in semantics. The machine replaces only the
+App-collect/beta/zeta core.
+
+## 5. Entry gating (the hybrid — load-bearing)
+
+IxVM measured a machine-everywhere variant at **+26% on beta-free
+literal-recursor loops** (closure-wrap + readback overhead with no
+beta to amortize it). The shipped design enters the machine **only
+when a beta actually fires**:
+
+In `whnf_core_with_flags_uncached`'s App arm (whnf.rs:641ff), after
+`f = whnf_core(f0)`:
+
+- `f` is `Lam` → wrap the already-collected `args` as empty-env
+  closures (`spine_wrap`) and run the machine starting from the beta.
+  This REPLACES the current multi-lambda peel + `simul_subst` block.
+- otherwise → existing path unchanged (rebuild-if-changed, iota,
+  return).
+
+Everything outside the App arm (Const-headed terms, literal recursor
+loops, the whole no-delta/delta loop structure) never touches the
+machine. This is also why the existing *caches need no changes*: the
+machine is invisible between `whnf_core` entry and exit, and cache
+keys/values remain plain interned `KExpr`s.
+
+## 6. Readback: `clo_subst`
+
+New functions in `subst.rs`, mirroring IxVM `Subst.lean:352-400`:
+
+```rust
+/// e[Var(depth + i) := lift(readback(env[i]), depth)] for i < n,
+/// Var(j) for j >= depth + n shifted down by n. lbr-guarded at every
+/// node (the no-op-subtree guard measured −68% substitution entries
+/// on the IxVM side). Uses a per-call memo scratch keyed (uid, depth)
+/// (same pattern as subst_scratch — a NEW scratch map on InternTable;
+/// the two must not share entries). Results interned per node, like
+/// every other subst.
+fn clo_subst(intern, e, env, n, depth) -> KExpr
+fn clo_readback(intern, c: &Clo) -> KExpr  // clo_subst(e, env, n, 0)
+```
+
+`clo_subst`'s Var arm composes `clo_readback` of the env entry with
+the existing `lift` (subst.rs:341) at non-zero depth. Spine readback
+maps `clo_readback` over the spine and rebuilds Apps via interning —
+exactly what the current post-beta rebuild does, so uid/interning
+discipline (see `docs/kernel_identity.md`) is unchanged: readback
+goes through the same constructors + intern table as today's
+substitution output.
+
+## 7. Phases
+
+**Phase A — machine for beta/zeta (this design's deliverable 1).**
+Scope: `whnf.rs` App arm + the machine loop + `subst.rs::clo_subst`.
+All Const/Proj exits read back and use existing machinery. Expected
+from IxVM Phase A scaled to our profiles: vector double-digit
+(its 49% `simul_subst` is the direct target), utf8 and mergesort
+mid-single-digit, intRxc small.
+
+**Phase B — closure-iota (`try_iota_c`).** The `Recr` Const-exit no
+longer reads its spine back: the major whnfs through its closure
+(`clo_whnf`), and on the main ctor-rule hit the rule RHS returns with
+params/motives/minors + ctor fields + post-major args as CLOSURES —
+the rule's Lam-chain betas push them straight into an env, so
+**unselected rules and dropped branches are never substituted and
+never read back**. Off-main-path cases (K-flag synthesis, the
+nat linear-rec shortcut, struct-eta, quot, `cleanup_nat_offset_major`
+interplay) MISS to the existing plain `try_iota_with_flags` on a
+readback spine — semantics preserved by construction. This phase is
+the UTF-8-class payoff (its 85% iota share is mostly discarded-branch
+substitution). `flags.cheap_rec` gates exactly as today (cheap mode
+skips the machine-iota entirely and misses to the plain path).
+
+**Phase C (deferred, separate design):** closure-aware def-eq (the
+IxVM working tree's `CWhnf` capture route) and env-trimming for the
+curried-sharing regression. Do not start until A+B are measured.
+
+## 8. What explicitly does NOT change
+
+- All caches (whnf/whnf_core/cheap variants, infer, def-eq, unfold,
+  equiv, failure) — keys, values, clearing. The machine lives strictly
+  between one cache miss and its fill.
+- The delta loop in `whnf`, prim-family dispatch, nat-offset-stuck
+  probes, transient-nat cache exclusion, native/string/decidable
+  reducers.
+- def-eq tier structure; `whnf_core_for_def_eq` keeps cheap flags,
+  which the machine consults only at its Proj exit (A) and Recr exit
+  (B).
+- Fuel and error semantics (`MaxRecDepth`).
+- `simul_subst` itself stays — instantiation sites outside whnf
+  (infer's pi-instantiation etc.) still use it.
+
+## 9. Risks and mitigations
+
+1. **Curried-sharing regression (measured +14.7% on IxVM's synthetic
+   foldAdd_2000):** eager sequential substitution shared
+   constant-prefix substitutions across loop iterations; per-iteration
+   env closures do not. Our suite lacks a foldAdd-shaped input —
+   **add one before implementing** (e.g. dump `natfoldsucc.ixe` /
+   `natreclinear.ixe` from the repo root into the bench suite) so the
+   regression class is visible. Mitigation if it bites: env-trimming
+   at readback to each subterm's visible slice (IxVM's designed
+   fix), or tighten the entry gate (enter only on multi-arg beta).
+2. **Native stack:** the machine MUST be an iterative loop;
+   `clo_subst` recursion depth is term depth (same exposure as
+   today's `subst`).
+3. **Memory:** env chains are Arc-shared cons cells; worst case holds
+   alive every closure of a deep reduction until exit. Guest RAM is
+   512MB; the same terms today materialize MORE memory (full
+   substituted bodies), so this should be net-negative pressure, but
+   watch the RAM column in `ziskemu -m` on vector/utf8.
+4. **Cheap-flag fidelity:** the def-eq false-negative discipline
+   (cheap results must not enter full caches) is keyed at the
+   `whnf_core_with_flags` wrapper, which is untouched; the machine
+   only needs to route its Proj/Recr exits by `flags`. Test: the
+   existing 604 suite covers cheap/full splits.
+5. **Behavioral drift in exits:** any machine exit that fails to
+   re-enter the outer loop (instead of returning) silently loses an
+   ambient zeta/iota step. The exit contract in §4 is the review
+   checklist: every exit either returns where the old code returned,
+   or sets `cur` and continues.
+
+## 10. Test & acceptance plan
+
+Per phase: 604 kernel + 200 ixon tests; native `check_one` on
+Vector._proof_1, Int16, UTF-8; full 11-input guest cycle suite +
+`natfoldsucc`/`natreclinear` additions; per the acceptance bar, keep
+a phase only if the suite nets ≥1% (expected: well above).
+Phase A first, committed alone; Phase B on top, committed alone.
+
+## 11. Effort
+
+Phase A: ~350–500 lines (machine loop ~150, clo_subst ~120, App-arm
+rewire ~50, spine helpers, tests). Phase B: ~200 more
+(try_iota_c + clo_whnf + Recr exit). The IxVM source is the line-by-
+line reference for both.
diff --git a/docs/kernel_identity.md b/docs/kernel_identity.md
new file mode 100644
index 00000000..ffe6af7a
--- /dev/null
+++ b/docs/kernel_identity.md
@@ -0,0 +1,169 @@
+# Kernel term identity vs. Ixon content addressing
+
+Ix has **two distinct identity layers**, and they serve different masters.
+This document records the boundary between them — in particular why the
+kernel's switch from per-node blake3 hashing to intern-assigned uids
+(`1e3029d`, 2026-06) does not affect the proof-carrying-code story, and
+what a future feature must do if it ever needs cryptographic identity at
+sub-constant granularity.
+
+## Layer 1: Ixon content addresses (external, cryptographic, in proofs)
+
+The unit of cryptographic identity is the **constant**. A constant's
+`Address` is the blake3 hash of its alpha-invariant Ixon serialization
+(see `docs/Ixon.md`); blobs (the bytes behind `Nat`/`String` literals)
+are addressed the same way. Everything that leaves the kernel speaks
+this language:
+
+- `Constant.refs` point at constant and blob `Address`es.
+- `KId.addr` — the identity the typechecker certifies — is the Ixon
+  `Address`.
+- The Zisk guest's committed claim is built from `Address`es:
+  `subject_root` / `assumptions_root` are `merkle_root_canonical` over
+  the certified / assumed constant addresses (`crates/kernel/src/claim.rs`,
+  `zisk/guest/src/main.rs`), and `env_hash` is blake3 over the exact env
+  payload.
+- Aggregation (`Claim::CheckEnv`, the `Contains` discharge) resolves
+  assumption leaves against subject roots **by address**, and the
+  cross-run proof store is keyed by address (content-addressed: the same
+  constant has the same address in every env).
+- Address↔bytes binding is enforced by `Address::hash(bytes) == addr`,
+  verified at first materialization (`ixon::lazy::LazyConstant::get`).
+  Every constant the kernel certifies is necessarily materialized, so
+  everything that can enter a subject root has passed the check.
+
+**Merkle inclusion proofs for individual constants live entirely in this
+layer** and are unaffected by anything below: proving "address `A` is in
+this proof's subject root" is an inclusion path in a tree whose leaves
+are constant addresses.
+
+The trust chain, end to end:
+
+```
+claim roots  ──Merkle──▶  constant Addresses
+             ──verify──▶  serialized constant bytes   (blake3 binding)
+             ──parse───▶  terms the kernel typechecked
+```
+
+## Layer 2: kernel node identity (internal, ephemeral, never serialized)
+
+Inside one checker run, the kernel needs cheap identity for the
+**in-memory expression/universe nodes** (`KExpr`/`KUniv`) churned out by
+substitution, WHNF, and def-eq — for the hash-consing intern table, the
+whnf/infer/def-eq cache keys, and quick equality. These objects are
+ephemeral: they are never serialized, never compared against Ixon
+addresses, and are torn down when the `KEnv` clears.
+
+Historically this layer ALSO used blake3: every constructed node hashed
+`(variant tag ‖ child hashes)`. That was a *separate scheme* from Ixon
+addressing (a Merkle-DAG over node tags, not blake3-of-serialization) —
+the two were never interchangeable — and it cost ~20% of all guest
+cycles on reduction-heavy constants in the Zisk prover.
+
+Since `1e3029d`, layer-2 identity is an **intern-assigned `u64` uid**
+(`crates/kernel/src/env.rs::Addr`):
+
+- Uids come from a process-global counter and are **never reused**, so
+  `uid(a) == uid(b)` implies "same construction event or same
+  intern-table canonical value" — i.e. structural equality. Stale cache
+  keys can only miss, never alias.
+- The intern table keys on **shallow structural keys**
+  (`ExprKey`/`UnivKey`: variant tag + child uids + semantic payload),
+  mirroring exactly what the old content hash covered — display names,
+  binder info, and mdata are excluded, so interning semantics are
+  unchanged.
+- `PartialEq` for `KExpr`/`KUniv` is structural with the uid fast path:
+  canonical-vs-canonical comparison is O(1), and completeness-critical
+  `==` sites are unaffected. `hash_eq` remains the fast-but-incomplete
+  uid check for cache/quick-path callers, where a false negative only
+  costs a fallback.
+- Soundness direction: caches and def-eq quick paths rely only on the
+  affirmative ("equal uid ⇒ equal term"), which holds by construction.
+  An imperfect intern hit-rate degrades performance, not correctness.
+
+Nat/Str literal nodes keep their blake3 **blob** `Address` (that is
+layer-1 data riding on the node, used by `refs`/assumption filtering).
+
+## The boundary rule
+
+> Anything that crosses the kernel boundary — claims, public values,
+> proof-store entries, assumption sets — is identified by Ixon
+> `Address`. Kernel uids must never escape into a serialized artifact.
+
+This held before the change (nothing consumed the old expr hashes
+outside the kernel) and is now structurally enforced by the type split:
+`Address` (32-byte content hash) vs `Addr = u64` (kernel uid) vs
+`CtxAddr` (blake3 digests for local-context cache keys, fed by uids,
+also internal-only).
+
+## Adversarial model: why uid identity cannot be cache-poisoned
+
+The natural worry about replacing blake3 with `u64` identity is hash
+collision: *if two subterms had the same internal hash, a def-eq/whnf
+cache hit could return a result for the wrong term.* The design avoids
+this not by making collisions unlikely but by making them impossible —
+**uids are assigned, not computed**:
+
+- A uid comes from a process-global sequential counter
+  (`expr.rs::fresh_uid`), never from hashing input content. There is no
+  function from term content to uid for an adversary to find collisions
+  in — and no birthday bound either: uids are handed out once each, not
+  sampled from a space two terms could land in. Two `KExpr` values share
+  a uid only if they are clones of one construction event or the
+  canonical node the intern table returned for the same shallow key.
+  `fresh_uid` aborts on counter exhaustion (which would require >2^67
+  guest cycles) rather than wrap, and the only constructors that accept
+  a caller-supplied uid (`*_mdata_with_addr`) are private to `expr.rs`,
+  so no code outside the module can copy one node's uid into a
+  different structure.
+- **Between shards** (separate guest processes), uid collisions are
+  certain — every process counts from 1 — and harmless by design: a uid
+  is meaningless outside its process. Nothing serializes a uid (the
+  committed publics are blake3 addresses and Merkle roots; intern tables
+  and caches die with the process; manifests and the proof store carry
+  `Address`es only), so there is no channel through which shard A's
+  uid 42 could meet shard B's. All cross-shard identity — claims,
+  assumption resolution, proof reuse — lives in the Ixon `Address`
+  layer. Any future feature that wants to persist or ship a
+  reduction/intern cache MUST re-key it by content (e.g. Ixon
+  serialization hashes) first; see the boundary rule above.
+- The intern table is keyed by [`ExprKey`]/[`UnivKey`] — exact
+  structural keys compared by full `Eq` (variant tag, child uids,
+  complete 32-byte payload addresses; never truncated, never a digest).
+  `FxHashMap` bucketing uses a weak hash, but a bucket collision only
+  causes an extra key comparison, not a wrong hit. Children in a key are
+  canonical by induction, so key equality ⇔ structural equality. Debug
+  builds assert structural equality on every intern hit.
+- Reduction caches key on `(uid, CtxAddr)`; a hit requires uid equality,
+  hence structural equality. `CtxAddr` (local-context digests) remains
+  blake3 — collision-resistant — fed with uid bytes.
+- An adversarial env cannot choose uids at all: deserialization never
+  produces a uid; every node built from input takes the next counter
+  value. Compared to the old scheme — where cache keys were blake3 over
+  attacker-chosen content, so an attacker could at least grind contents
+  to flood `FxHashMap` buckets of their choosing — sequential uids give
+  the adversary strictly *less* influence over key distribution.
+
+The one place adversarial content does enter identity is **literals**:
+`ExprKey::Nat/Str` and structural equality key on the blob `Address`
+(mirroring the old content hash, which also hashed only the address).
+That is sound only if an address is bound to its bytes — so all three
+env deserializers verify `Address::hash(bytes)` for every blob at load
+(the constants' equivalent check runs lazily at first materialization),
+and the literal arms of structural equality compare values as well as
+addresses as defense in depth. Residual exposure is limited to DoS
+(bucket flooding of maps keyed by attacker-chosen `Address`es slows the
+prover, which is spending its own cycles), not soundness.
+
+## Future: sub-constant commitments
+
+If a feature ever needs cryptographic identity for **subterms** (e.g.
+Merkle inclusion of a specific expression inside a constant's body,
+rather than inclusion of a constant in a claim), expression-level
+content hashes no longer exist precomputed. The right move is to compute
+them at that boundary: a one-time egress pass over the relevant term —
+either the old `(tag ‖ child hashes)` Merkle-DAG scheme or blake3 over
+the Ixon serialization of the subterm. That is cheap where it is needed
+and keeps the per-node cost out of the prover's hottest loop. No current
+artifact (claims, aggregation, reuse, inclusion proofs at constant
+granularity) needs it.
diff --git a/zisk/scripts/bench-cycles.sh b/zisk/scripts/bench-cycles.sh
new file mode 100755
index 00000000..49aeb3c4
--- /dev/null
+++ b/zisk/scripts/bench-cycles.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+# Cycle benchmark for the Zisk leaf guest across the standard input suite.
+#
+# Runs each dumped guest stdin in ~/benchdata/zisk-inputs through ziskemu,
+# records steps (-m) and checks the committed `failures` publics word (-c,
+# u32 word index 10) is zero — a non-zero value means the kernel REJECTED a
+# constant and the run is invalid for benchmarking.
+#
+# Usage:
+#   bench-cycles.sh <label> [elf]        # full suite
+#   BENCH_ONLY="int16rxc vectorextract" bench-cycles.sh <label>
+#
+# Results append to ~/benchdata/zisk-bench.tsv: label, git-rev, input, steps.
+# Inputs were dumped via `zisk-host --dump-input` (see zisk_guest_profiling
+# memory / scripts/mergesort-250k.sh for provenance).
+
+set -euo pipefail
+
+LABEL="${1:?usage: bench-cycles.sh <label> [elf]}"
+ELF="${2:-/home/ubuntu/ix/zisk/guest/target/elf/riscv64ima-zisk-zkvm-elf/release/zisk-guest}"
+INPUTS_DIR="$HOME/benchdata/zisk-inputs"
+OUT="$HOME/benchdata/zisk-bench.tsv"
+ZISKEMU="$HOME/.zisk/bin/ziskemu"
+REV="$(git -C /home/ubuntu/ix rev-parse --short HEAD 2>/dev/null || echo unknown)$(git -C /home/ubuntu/ix diff --quiet 2>/dev/null || echo +dirty)"
+
+# Suite ordered smallest-first so failures surface fast.
+# natreclinear/natfoldsucc are the foldAdd-class sentinels for the
+# env-machine curried-sharing regression (docs/env_machine_whnf.md §9.1).
+SUITE="${BENCH_ONLY:-stringappend natreclinear natfoldsucc natgcdcomm rbmap nataddcomm binsearch mergesort int16rxc int32rxc int64rxc vectorextract}"
+
+input_path() {
+  case "$1" in
+    mergesort) echo "$INPUTS_DIR/mergesort-whole.bin" ;;
+    *) echo "$INPUTS_DIR/input-$1.bin" ;;
+  esac
+}
+
+printf '%-15s %-22s %16s  %s\n' "LABEL" "INPUT" "STEPS" "REV"
+for name in $SUITE; do
+  bin="$(input_path "$name")"
+  [ -f "$bin" ] || { echo "missing input: $bin" >&2; exit 1; }
+  steps="$("$ZISKEMU" -e "$ELF" -i "$bin" -m 2>&1 | grep -oP 'steps=\K[0-9]+')"
+  failures="$("$ZISKEMU" -e "$ELF" -i "$bin" -c 2>&1 | sed -n '11p')"
+  if [ "$failures" != "00000000" ]; then
+    echo "FAIL $name: kernel failures word = $failures (run invalid)" >&2
+    exit 1
+  fi
+  printf '%-15s %-22s %16s  %s\n' "$LABEL" "$name" "$steps" "$REV"
+  printf '%s\t%s\t%s\t%s\n' "$LABEL" "$REV" "$name" "$steps" >> "$OUT"
+done