diff --git a/crates/compile/src/kernel_egress.rs b/crates/compile/src/kernel_egress.rs index 743f97db..ea1299c1 100644 --- a/crates/compile/src/kernel_egress.rs +++ b/crates/compile/src/kernel_egress.rs @@ -384,9 +384,9 @@ struct EgressCtx { /// Memoized expression conversion. Keyed by `KExpr::addr()` (content /// hash); same hash → same Ixon expression (within a single block's /// tables). - expr_cache: FxHashMap>, + expr_cache: FxHashMap>, /// Memoized universe conversion. - univ_cache: FxHashMap>, + univ_cache: FxHashMap>, } impl EgressCtx { diff --git a/crates/ixon/src/lazy.rs b/crates/ixon/src/lazy.rs index f08330e1..612e67e9 100644 --- a/crates/ixon/src/lazy.rs +++ b/crates/ixon/src/lazy.rs @@ -104,6 +104,15 @@ pub struct LazyConstant { /// this `None`; their `get()` parses fresh and does not store — /// see the module-level "Cache policy" docs. cache: Option>, + /// Deferred address verification: when set (the `.ixe` load paths), + /// `get()` checks `Address::hash(bytes) == pending_addr` before + /// parsing. This moves the per-constant content-hash from env parse + /// time to first materialization, so constants that are shipped in a + /// closure but never forced by the typechecker are never hashed — + /// while everything the kernel CERTIFIES is still verified (checking + /// a constant forces its materialization). `None` for the + /// compile-side path, which owns the parsed value already. + pending_addr: Option
, } impl LazyConstant { @@ -114,7 +123,22 @@ impl LazyConstant { /// `Constant::put` produced for the address this entry is stored /// under. Use [`Self::verify_address`] for an explicit check. pub fn from_bytes(bytes: Arc<[u8]>) -> Self { - LazyConstant { bytes: BytesSource::Heap(bytes), cache: None } + LazyConstant { + bytes: BytesSource::Heap(bytes), + cache: None, + pending_addr: None, + } + } + + /// Like [`Self::from_bytes`], but with the address-binding check + /// deferred to first [`Self::get`]. Used by `Env::get_anon` so env + /// parse time does not pay one content hash per constant. + pub fn from_bytes_deferred(bytes: Arc<[u8]>, addr: Address) -> Self { + LazyConstant { + bytes: BytesSource::Heap(bytes), + cache: None, + pending_addr: Some(addr), + } } /// Construct from a memory-mapped window. The `Arc` keeps @@ -124,7 +148,11 @@ impl LazyConstant { /// Used by `Env::get_anon_mmap` to avoid heap-copying the on-disk /// byte stream — the OS page cache is the source of truth. pub fn from_mmap_slice(mmap: Arc, offset: usize, len: usize) -> Self { - LazyConstant { bytes: BytesSource::Mmap { mmap, offset, len }, cache: None } + LazyConstant { + bytes: BytesSource::Mmap { mmap, offset, len }, + cache: None, + pending_addr: None, + } } /// Construct from a structured `Constant` (the in-memory build path, @@ -137,6 +165,7 @@ impl LazyConstant { LazyConstant { bytes: BytesSource::Heap(buf.into()), cache: Some(Arc::new(c)), + pending_addr: None, } } @@ -151,6 +180,16 @@ impl LazyConstant { if let Some(c) = &self.cache { return Ok(c.clone()); } + if let Some(expected) = &self.pending_addr { + let computed = Address::hash(self.bytes.as_slice()); + if computed != *expected { + return Err(format!( + "LazyConstant::get: bytes hash to {} but stored under {}", + computed.hex(), + expected.hex() + )); + } + } let mut slice: &[u8] = self.bytes.as_slice(); let parsed = Constant::get(&mut slice) .map_err(|e| format!("LazyConstant::get: {e}"))?; diff --git a/crates/ixon/src/serialize.rs b/crates/ixon/src/serialize.rs index 4233ea66..cdbee2a9 100644 --- a/crates/ixon/src/serialize.rs +++ b/crates/ixon/src/serialize.rs @@ -1419,6 +1419,20 @@ impl Env { } let (bytes, rest) = buf.split_at(len); *buf = rest; + // Blob bytes are bound to their address HERE, eagerly: literal + // identity downstream (kernel `ExprKey::Nat`/`Str` interning, + // `Constant.refs` blob entries, assumption filtering) keys on the + // address, so an unverified blob would let an adversarial env bind + // wrong bytes to a content address. Blobs are a small fraction of + // env bytes; constants stay lazily verified (see LazyConstant). + let computed = Address::hash(bytes); + if computed != addr { + return Err(format!( + "Env::get: blob bytes hash to {} but stored under {}", + computed.hex(), + addr.hex() + )); + } env.blobs.insert(addr, bytes.to_vec()); } @@ -1568,6 +1582,20 @@ impl Env { } let (bytes, rest) = buf.split_at(len); *buf = rest; + // Blob bytes are bound to their address HERE, eagerly: literal + // identity downstream (kernel `ExprKey::Nat`/`Str` interning, + // `Constant.refs` blob entries, assumption filtering) keys on the + // address, so an unverified blob would let an adversarial env bind + // wrong bytes to a content address. Blobs are a small fraction of + // env bytes; constants stay lazily verified (see LazyConstant). + let computed = Address::hash(bytes); + if computed != addr { + return Err(format!( + "Env::get_anon: blob bytes hash to {} but stored under {}", + computed.hex(), + addr.hex() + )); + } env.blobs.insert(addr, bytes.to_vec()); } @@ -1585,17 +1613,16 @@ impl Env { } let (bytes, rest) = buf.split_at(len); *buf = rest; - let computed = Address::hash(bytes); - if computed != addr { - return Err(format!( - "Env::get_anon: const at idx {i} bytes hash to {} but stored under {}", - computed.hex(), - addr.hex() - )); - } - env - .consts - .insert(addr, crate::lazy::LazyConstant::from_bytes(bytes.into())); + let _ = i; + // Address binding is verified lazily at first materialization + // (`LazyConstant::get`) instead of one content hash per constant + // here — constants shipped in a closure but never forced by the + // typechecker are never hashed, while everything the kernel + // certifies still gets verified on the way in. + env.consts.insert( + addr.clone(), + crate::lazy::LazyConstant::from_bytes_deferred(bytes.into(), addr), + ); } // Section 3: Names — parse and DISCARD. We still need a populated @@ -1769,6 +1796,20 @@ impl Env { } let (bytes, rest) = buf.split_at(len); buf = rest; + // Blob bytes are bound to their address HERE, eagerly: literal + // identity downstream (kernel `ExprKey::Nat`/`Str` interning, + // `Constant.refs` blob entries, assumption filtering) keys on the + // address, so an unverified blob would let an adversarial env bind + // wrong bytes to a content address. Blobs are a small fraction of + // env bytes; constants stay lazily verified (see LazyConstant). + let computed = Address::hash(bytes); + if computed != addr { + return Err(format!( + "Env::get_anon_mmap: blob bytes hash to {} but stored under {}", + computed.hex(), + addr.hex() + )); + } env.blobs.insert(addr, bytes.to_vec()); } @@ -2452,8 +2493,16 @@ mod tests { let off = 68 + 3; assert!(off < buf.len()); buf[off] ^= 0xFF; - let res = Env::get_anon(&mut buf.as_slice()); - let err = res.expect_err("tampered const bytes should be rejected"); + // Address binding is verified lazily: parse succeeds, but the + // tampered constant is rejected at first materialization (which is + // what gates anything the kernel certifies). + let parsed = Env::get_anon(&mut buf.as_slice()) + .expect("get_anon defers per-const verification"); + let entry = parsed.consts.iter().next().expect("one const"); + let err = entry + .value() + .get() + .expect_err("tampered const bytes should be rejected at get()"); assert!( err.contains("bytes hash to") && err.contains("but stored under"), "expected per-entry verify error, got: {err}" diff --git a/crates/kernel/Cargo.toml b/crates/kernel/Cargo.toml index 996eab3e..ebb91120 100644 --- a/crates/kernel/Cargo.toml +++ b/crates/kernel/Cargo.toml @@ -36,5 +36,12 @@ quickcheck_macros = { workspace = true } name = "perf_check" path = "examples/perf_check.rs" +# Native single-constant check (one work item via the lazy-anon path the +# Zisk guest uses), so the `IX_*` instrumentation can target one expensive +# constant without re-checking its whole env. +[[example]] +name = "check_one" +path = "examples/check_one.rs" + [lints] workspace = true diff --git a/crates/kernel/examples/check_one.rs b/crates/kernel/examples/check_one.rs new file mode 100644 index 00000000..4ae4d946 --- /dev/null +++ b/crates/kernel/examples/check_one.rs @@ -0,0 +1,72 @@ +//! Native single-constant check, mirroring the Zisk guest's reuse-mode path +//! (`zisk/guest/src/main.rs`): resolve a Lean NAME through the env's `named` +//! metadata to its ingress block, then `check_const` exactly that work item +//! with the lazy-anon TypeChecker. Exists so the kernel's `IX_*` +//! instrumentation (inert inside zkVM guests — see `src/lib.rs::env_var_os`) +//! can be pointed at one expensive constant without re-checking a whole env. +//! +//! IX_PERF_COUNTERS=1 cargo run --release --example check_one -- \ +//! ~/ix/init.ixe '_private.Init.Data.Range.Polymorphic.SInt.0.Int16.instRxcHasSize_eq' + +use std::env; +use std::fs; +use std::time::Instant; + +use ix_kernel::anon_work::build_anon_work; +use ix_kernel::env::KEnv; +use ix_kernel::id::KId; +use ix_kernel::mode::Anon; +use ix_kernel::tc::TypeChecker; +use ixon::constant::ConstantInfo as CI; +use ixon::env::Env as IxonEnv; + +fn main() { + env_logger::builder() + .filter_level(log::LevelFilter::Info) + .format_timestamp(None) + .init(); + let path = env::args().nth(1).expect("usage: check_one "); + let name = env::args().nth(2).expect("usage: check_one "); + let bytes = fs::read(&path).expect("read ixe"); + + let t_parse = Instant::now(); + let env = IxonEnv::get(&mut &bytes[..]).expect("decode env"); + eprintln!("[check_one] parse: {:>8.2?}", t_parse.elapsed()); + + // Name → constant address → ingress block (mirrors zisk-host + // `block_of_addr`): a projection's `block`, else the address itself. + let addr = env + .named + .iter() + .find(|e| e.key().pretty() == name || e.key().to_string() == name) + .map_or_else( + || panic!("no constant named {name:?} in {path}"), + |e| e.value().addr.clone(), + ); + let block = match env.get_const(&addr).map(|c| c.info.clone()) { + Some(CI::IPrj(p)) => p.block, + Some(CI::CPrj(p)) => p.block, + Some(CI::RPrj(p)) => p.block, + Some(CI::DPrj(p)) => p.block, + _ => addr.clone(), + }; + + let work = build_anon_work(&env).expect("build_anon_work"); + let item = + work.iter().find(|it| *it.primary() == block).expect("work item for block"); + + let mut kenv = KEnv::::new(); + let mut tc = TypeChecker::::new_with_lazy_anon(&mut kenv, &env); + let kid = KId::::new(item.primary().clone(), ()); + let t_check = Instant::now(); + let result = tc.check_const(&kid); + eprintln!( + "[check_one] check: {:>8.2?} result: {:?}", + t_check.elapsed(), + result.is_ok() + ); + if let Err(e) = result { + eprintln!("[check_one] error: {e:?}"); + std::process::exit(1); + } +} diff --git a/crates/kernel/src/def_eq.rs b/crates/kernel/src/def_eq.rs index c0868636..fb873cfc 100644 --- a/crates/kernel/src/def_eq.rs +++ b/crates/kernel/src/def_eq.rs @@ -21,6 +21,7 @@ use super::subst::{instantiate_rev, lift}; use super::tc::{ MAX_DEF_EQ_DEPTH, MAX_WHNF_FUEL, TypeChecker, collect_app_spine, }; +use super::whnf::PrimFamily; /// When set, trace every `is_def_eq` call where one side's head constant /// starts with the prefix in `IX_DEF_EQ_TRACE` (e.g. `IX_DEF_EQ_TRACE=bmod` @@ -349,11 +350,17 @@ impl TypeChecker<'_, M> { // primitives entirely when either side has a free variable, unless // eagerReduce is active. let nat_ok = (!wa.has_fvars() && !wb.has_fvars()) || self.eager_reduce; + let fam_a = self.head_prim_family(&wa); + let fam_b = self.head_prim_family(&wb); if nat_ok { - if let Some(wa2) = self.try_reduce_nat(&wa)? { + if fam_a == PrimFamily::Nat + && let Some(wa2) = self.try_reduce_nat(&wa)? + { return self.is_def_eq(&wa2, &wb); } - if let Some(wb2) = self.try_reduce_nat(&wb)? { + if fam_b == PrimFamily::Nat + && let Some(wb2) = self.try_reduce_nat(&wb)? + { return self.is_def_eq(&wa, &wb2); } } @@ -363,17 +370,25 @@ impl TypeChecker<'_, M> { // (lean4 `type_checker.cpp:986-991`, lean4lean `TypeChecker.lean:625-628`). // Ix-specific `try_reduce_decidable` runs after native to keep the // reference-aligned segment tight. - if let Some(wa2) = self.try_reduce_native(&wa)? { + if fam_a == PrimFamily::Native + && let Some(wa2) = self.try_reduce_native(&wa)? + { return self.is_def_eq(&wa2, &wb); } - if let Some(wb2) = self.try_reduce_native(&wb)? { + if fam_b == PrimFamily::Native + && let Some(wb2) = self.try_reduce_native(&wb)? + { return self.is_def_eq(&wa, &wb2); } - if let Some(wa2) = self.try_reduce_decidable(&wa)? { + if fam_a == PrimFamily::Decidable + && let Some(wa2) = self.try_reduce_decidable(&wa)? + { return self.is_def_eq(&wa2, &wb); } - if let Some(wb2) = self.try_reduce_decidable(&wb)? { + if fam_b == PrimFamily::Decidable + && let Some(wb2) = self.try_reduce_decidable(&wb)? + { return self.is_def_eq(&wa, &wb2); } @@ -920,6 +935,26 @@ impl TypeChecker<'_, M> { } } + /// Allocation-free check that `e` could decompose to `base + offset`: + /// a Nat literal, `Nat.zero`/`Nat.succ`, or an app whose head constant is + /// `Nat.succ`/`Nat.add`. Walks the app chain by reference — no spine Vec. + fn nat_offset_candidate(&self, e: &KExpr) -> bool { + let mut cur = e; + loop { + match cur.data() { + ExprData::Nat(..) => return true, + ExprData::Const(id, _, _) => { + let p = &self.prims; + return id.addr == p.nat_zero.addr + || id.addr == p.nat_succ.addr + || id.addr == p.nat_add.addr; + }, + ExprData::App(f, _, _) => cur = f, + _ => return false, + } + } + } + /// If expression is nat-succ, return the predecessor. /// Matches both `Nat(n+1)` → `Nat(n)` and `Nat.succ e` → `e`. fn nat_succ_of(&mut self, e: &KExpr) -> Option> { @@ -965,8 +1000,16 @@ impl TypeChecker<'_, M> { } } - /// M2: Nat offset reduction for lazy delta loop (lean4lean isDefEqOffset). - /// Returns Some(true/false) if both are nat-zero or nat-succ, None otherwise. + /// M2: Nat offset reduction for lazy delta loop (lean4lean isDefEqOffset), + /// generalized to offset form: each side decomposes to `base + offset` + /// (`Lit n`, `succ` layers, and `Nat.add base (Lit m)` — the compact stuck + /// form WHNF now leaves — all read in O(1) per layer), the shared offset + /// is stripped in ONE step, and the remainders compare through full + /// def-eq. This collapses `succ^k(x) ≟ succ^k(x)` from k `is_def_eq` + /// recursion levels (which blew `MAX_DEF_EQ_DEPTH` for large k) to one. + /// Stripping is verdict-preserving: `+k` is definitionally injective, the + /// same semantics the previous one-succ-peel already relied on. + /// Non-offset shapes fall back (`None`) to the generic path unchanged. fn try_def_eq_offset( &mut self, a: &KExpr, @@ -981,12 +1024,27 @@ impl TypeChecker<'_, M> { if self.is_nat_zero(a) && self.is_nat_zero(b) { return Ok(Some(true)); } - match (self.nat_succ_of(a), self.nat_succ_of(b)) { - (Some(a_pred), Some(b_pred)) => { - Ok(Some(self.is_def_eq(&a_pred, &b_pred)?)) - }, - _ => Ok(None), + // Allocation-free quick reject: decompose walks app spines, so only run + // it when both heads are plausibly offset-shaped (the old one-succ-peel + // rejected non-Nat shapes in O(1) off `e.data()` — keep that property). + if !self.nat_offset_candidate(a) || !self.nat_offset_candidate(b) { + return Ok(None); + } + let Some((base_a, ka)) = self.nat_offset_decompose(a)? else { + return Ok(None); + }; + let Some((base_b, kb)) = self.nat_offset_decompose(b)? else { + return Ok(None); + }; + let k = ka.0.clone().min(kb.0.clone()); + if k == num_bigint::BigUint::ZERO { + // No shared offset to strip (e.g. literal 0 vs offset-shaped): defer + // to the generic path, matching the previous conservative behavior. + return Ok(None); } + let ra = self.nat_offset_rebuild(base_a, bignat::Nat(ka.0 - &k)); + let rb = self.nat_offset_rebuild(base_b, bignat::Nat(kb.0 - &k)); + Ok(Some(self.is_def_eq(&ra, &rb)?)) } // ----------------------------------------------------------------------- @@ -1630,12 +1688,12 @@ fn compact_def_eq_head(e: &KExpr) -> String { } fn short_def_eq_addr(e: &KExpr) -> String { - e.addr().to_hex().chars().take(12).collect() + format!("uid{}", e.addr()) } -/// Canonical ordering for cache keys: (min, max) by hash bytes. +/// Canonical ordering for cache keys: (min, max) by uid. fn canonical_pair(a: Addr, b: Addr) -> (Addr, Addr) { - if a.as_bytes() <= b.as_bytes() { (a, b) } else { (b, a) } + if a <= b { (a, b) } else { (b, a) } } /// Extract head constant KId from expression or app spine. @@ -1900,6 +1958,10 @@ mod tests { ); let plain = ME::cnst(id, Box::new([])); + // Interning ignores mdata (the shallow Const key carries only the + // address + universe uids), so both collapse to one canonical node. + let tagged = tc.intern(tagged); + let plain = tc.intern(plain); assert_eq!(tagged.addr(), plain.addr()); assert!(tc.is_def_eq(&tagged, &plain).unwrap()); } diff --git a/crates/kernel/src/env.rs b/crates/kernel/src/env.rs index f93a372e..a1e3d5a8 100644 --- a/crates/kernel/src/env.rs +++ b/crates/kernel/src/env.rs @@ -22,27 +22,69 @@ use super::mode::KernelMode; use super::perf::PerfCounters; use super::primitive::Primitives; -/// Content-addressed Merkle hash. 32 bytes, `Copy`, no allocation. +/// Canonical identity of an expression or universe node: the +/// intern-assigned uid. Plain `u64`, allocated from a process-global +/// counter (`expr.rs::fresh_uid`) and NEVER reused, so uid equality +/// implies structural equality and cache keys built from uids cannot +/// alias across intern-table clears (a stale key can only miss). /// -/// Earlier revisions stored `Addr = Arc` and threaded all -/// constructions through a process-global `DashMap` intern table to dedup -/// the inner allocation. On full-mathlib kernel-check runs that table grew -/// to 100M+ entries (≈8+ GiB) and dominated RSS, even though the per-worker -/// `KEnv` caches were correctly cleared per scheduled block. Switching to a -/// `Copy` value drops the global intern, eliminates one allocation per -/// `KExpr`/`KUniv` construction, and reduces per-`ExprData` overhead -/// from `Arc` (8-byte pointer + 16-byte heap header + 32-byte -/// Hash) to a single in-place 32-byte field. Identity comparison falls -/// back from `Arc::ptr_eq` (single pointer compare) to a 32-byte memcmp, -/// which is a single AVX2 cycle on modern x86 and dominated by the -/// surrounding kernel work. -pub type Addr = blake3::Hash; +/// This is KERNEL-INTERNAL identity for ephemeral in-memory nodes — +/// distinct from the Ixon `Address` layer (blake3 over serialized +/// content) that constants/blobs carry into claims, Merkle roots, and +/// the proof store. Uids must never escape into a serialized artifact; +/// see `docs/kernel_identity.md` for the boundary rule and the +/// proof-carrying-code implications. +/// +/// Earlier revisions stored the blake3 content hash of every node here, +/// computed at construction: profiling on the Zisk guest put that hashing +/// (`app_hash` + the blake3 wrapper) at ~20% of guest cycles on +/// reduction-heavy constants. Identity is now assigned by the intern +/// table from shallow structural keys ([`ExprKey`]/[`UnivKey`]) instead +/// of computed from content. +pub type Addr = u64; + +/// Key type for local-context hashing (`tc.rs::ctx_addr_for_lbr`) and +/// other places that still need a collision-resistant digest over +/// variable-length content. Per-node expression identity uses [`Addr`]. +pub type CtxAddr = blake3::Hash; + +/// Shallow structural key of an expression node: the variant tag plus the +/// uids of its children and its semantic payload. Mirrors EXACTLY what the +/// historical per-node content hash covered — display names, binder info, +/// and mdata are excluded — so interning semantics are unchanged. Children +/// are identified by uid: within one table, equal keys ⇔ structurally +/// equal nodes (children canonical by induction). +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +pub enum ExprKey { + Var(u64), + FVar(u64), + Sort(Addr), + Const(Address, Box<[Addr]>), + App(Addr, Addr), + Lam(Addr, Addr), + All(Addr, Addr), + Let(Addr, Addr, Addr, bool), + Prj(Address, u64, Addr), + Nat(Address), + Str(Address), +} + +/// Shallow structural key of a universe node. `Param` display names are +/// excluded, mirroring the historical hash semantics. +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub enum UnivKey { + Zero, + Succ(Addr), + Max(Addr, Addr), + IMax(Addr, Addr), + Param(u64), +} /// Hash-consing intern table for expressions and universes. /// -/// Single-threaded and owned by one `KEnv`. Guarantees pointer uniqueness -/// by blake3 hash within that environment: `ptr(a) == ptr(b)` iff -/// `hash(a) == hash(b)`. +/// Single-threaded and owned by one `KEnv`. Guarantees uid/pointer +/// uniqueness per shallow structural key within that environment: +/// `ptr(a) == ptr(b)` iff `key(a) == key(b)` for interned values. /// /// Also owns reusable scratch buffers used by `subst`, `simul_subst`, and /// `lift` to memoize content-addressed sub-traversals within a single @@ -52,8 +94,14 @@ pub type Addr = blake3::Hash; /// already passed for hash-consing eliminates the malloc/free churn while /// keeping the per-call invariant (caches are cleared on entry). pub struct InternTable { - pub(crate) univs: FxHashMap>, - pub(crate) exprs: FxHashMap>, + pub(crate) univs: FxHashMap>, + pub(crate) exprs: FxHashMap>, + /// Uids of canonical (interned) nodes. `intern_expr`/`intern_univ` + /// short-circuit on members, and use it to detect non-canonical + /// children that must be canonicalized before the shallow key is + /// meaningful. + pub(crate) canon_exprs: FxHashSet, + pub(crate) canon_univs: FxHashSet, /// Scratch buffer for `subst` / `simul_subst` per-call memoization, /// keyed by `(addr, depth)`. Cleared on entry. Owned here so the /// allocation persists across calls. @@ -63,6 +111,14 @@ pub struct InternTable { /// because `lift` is invoked from inside `subst_cached`, and the two /// caches have different semantics, so they must not share entries. pub(crate) lift_scratch: FxHashMap<(Addr, u64), KExpr>, + /// Pool of scratch maps for `clo_subst` per-call memoization, keyed by + /// `(addr, depth)`. A pool rather than a single buffer because + /// `clo_subst` re-enters itself through `clo_readback` of environment + /// entries (each readback substitutes under a *different* environment, + /// so the memo must not be shared between the nesting levels). Maps are + /// cleared and returned to the pool on exit, so allocations persist + /// across calls like the other scratches. + pub(crate) clo_scratch_pool: Vec>>, } impl Default for InternTable { @@ -71,50 +127,201 @@ impl Default for InternTable { } } +/// Shallow structural key of an expression whose children are canonical. +pub fn expr_key(e: &KExpr) -> ExprKey { + use super::expr::ExprData; + match e.data() { + ExprData::Var(i, _, _) => ExprKey::Var(*i), + ExprData::FVar(id, _, _) => ExprKey::FVar(id.0), + ExprData::Sort(u, _) => ExprKey::Sort(*u.addr()), + ExprData::Const(id, us, _) => { + ExprKey::Const(id.addr.clone(), us.iter().map(|u| *u.addr()).collect()) + }, + ExprData::App(f, a, _) => ExprKey::App(*f.addr(), *a.addr()), + ExprData::Lam(_, _, t, b, _) => ExprKey::Lam(*t.addr(), *b.addr()), + ExprData::All(_, _, t, b, _) => ExprKey::All(*t.addr(), *b.addr()), + ExprData::Let(_, t, v, b, nd, _) => { + ExprKey::Let(*t.addr(), *v.addr(), *b.addr(), *nd) + }, + ExprData::Prj(id, f, v, _) => ExprKey::Prj(id.addr.clone(), *f, *v.addr()), + ExprData::Nat(_, ba, _) => ExprKey::Nat(ba.clone()), + ExprData::Str(_, ba, _) => ExprKey::Str(ba.clone()), + } +} + +/// Shallow structural key of a universe whose children are canonical. +pub fn univ_key(u: &KUniv) -> UnivKey { + use super::level::UnivData; + match u.data() { + UnivData::Zero(_) => UnivKey::Zero, + UnivData::Succ(inner, _) => UnivKey::Succ(*inner.addr()), + UnivData::Max(a, b, _) => UnivKey::Max(*a.addr(), *b.addr()), + UnivData::IMax(a, b, _) => UnivKey::IMax(*a.addr(), *b.addr()), + UnivData::Param(idx, _, _) => UnivKey::Param(*idx), + } +} + impl InternTable { pub fn new() -> Self { InternTable { univs: FxHashMap::default(), exprs: FxHashMap::default(), + canon_exprs: FxHashSet::default(), + canon_univs: FxHashSet::default(), subst_scratch: FxHashMap::default(), lift_scratch: FxHashMap::default(), + clo_scratch_pool: Vec::new(), } } /// Read-only fast path: return the canonical interned universe for - /// `hash` if already present. Used by instrumented callers that want - /// to record hit/miss separately; plain callers should use - /// `intern_univ`. + /// `key` if already present. The key must be built from CANONICAL + /// children (their uids). Plain callers should use `intern_univ`. #[inline] - pub fn try_get_univ(&self, hash: &blake3::Hash) -> Option> { - self.univs.get(hash).cloned() + pub fn try_get_univ(&self, key: &UnivKey) -> Option> { + self.univs.get(key).cloned() } /// Read-only fast path counterpart of `try_get_univ` for expressions. #[inline] - pub fn try_get_expr(&self, hash: &blake3::Hash) -> Option> { - self.exprs.get(hash).cloned() + pub fn try_get_expr(&self, key: &ExprKey) -> Option> { + self.exprs.get(key).cloned() } - /// Intern a universe: if one with the same hash exists, return the - /// existing Arc (ensuring pointer uniqueness). Otherwise insert and - /// return. + /// Intern a universe: returns the canonical value for its structural + /// identity, recursively canonicalizing children as needed so the + /// shallow key is meaningful. pub fn intern_univ(&mut self, u: KUniv) -> KUniv { - let key = *u.addr(); + use super::level::UnivData; + if self.canon_univs.contains(u.addr()) { + return u; + } + // Canonicalize children first; rebuild only if any child changed. + let u = match u.data() { + UnivData::Succ(inner, _) => { + let ci = self.intern_univ(inner.clone()); + if ci.ptr_eq(inner) { + u + } else { + KUniv::new(UnivData::Succ(ci, super::expr::fresh_uid())) + } + }, + UnivData::Max(a, b, _) => { + let ca = self.intern_univ(a.clone()); + let cb = self.intern_univ(b.clone()); + if ca.ptr_eq(a) && cb.ptr_eq(b) { + u + } else { + KUniv::new(UnivData::Max(ca, cb, super::expr::fresh_uid())) + } + }, + UnivData::IMax(a, b, _) => { + let ca = self.intern_univ(a.clone()); + let cb = self.intern_univ(b.clone()); + if ca.ptr_eq(a) && cb.ptr_eq(b) { + u + } else { + KUniv::new(UnivData::IMax(ca, cb, super::expr::fresh_uid())) + } + }, + UnivData::Zero(_) | UnivData::Param(..) => u, + }; + let key = univ_key(&u); if let Some(existing) = self.univs.get(&key) { return existing.clone(); } - self.univs.entry(key).or_insert(u).clone() + self.canon_univs.insert(*u.addr()); + self.univs.insert(key, u.clone()); + u } - /// Intern an expression: same pointer-uniqueness guarantee as - /// `intern_univ`. + /// Intern an expression: returns the canonical value for its structural + /// identity. Children are canonicalized recursively when needed (a node + /// built outside the table has non-canonical children whose uids would + /// make the shallow key meaningless), preserving the historical + /// content-hash interning semantics. pub fn intern_expr(&mut self, e: KExpr) -> KExpr { - let key = *e.addr(); + use super::expr::ExprData; + if self.canon_exprs.contains(e.addr()) { + return e; + } + let e = match e.data() { + ExprData::Sort(un, _) => { + let cu = self.intern_univ(un.clone()); + if cu.ptr_eq(un) { e } else { KExpr::sort_mdata(cu, e.mdata().clone()) } + }, + ExprData::Const(id, us, _) => { + let cus: Box<[KUniv]> = + us.iter().map(|un| self.intern_univ(un.clone())).collect(); + if cus.iter().zip(us.iter()).all(|(a, b)| a.ptr_eq(b)) { + e + } else { + KExpr::cnst_mdata(id.clone(), cus, e.mdata().clone()) + } + }, + ExprData::App(f, a, _) => { + let cf = self.intern_expr(f.clone()); + let ca = self.intern_expr(a.clone()); + if cf.ptr_eq(f) && ca.ptr_eq(a) { + e + } else { + KExpr::app_mdata(cf, ca, e.mdata().clone()) + } + }, + ExprData::Lam(n, bi, t, b, _) => { + let ct = self.intern_expr(t.clone()); + let cb = self.intern_expr(b.clone()); + if ct.ptr_eq(t) && cb.ptr_eq(b) { + e + } else { + KExpr::lam_mdata(n.clone(), bi.clone(), ct, cb, e.mdata().clone()) + } + }, + ExprData::All(n, bi, t, b, _) => { + let ct = self.intern_expr(t.clone()); + let cb = self.intern_expr(b.clone()); + if ct.ptr_eq(t) && cb.ptr_eq(b) { + e + } else { + KExpr::all_mdata(n.clone(), bi.clone(), ct, cb, e.mdata().clone()) + } + }, + ExprData::Let(n, t, v, b, nd, _) => { + let ct = self.intern_expr(t.clone()); + let cv = self.intern_expr(v.clone()); + let cb = self.intern_expr(b.clone()); + if ct.ptr_eq(t) && cv.ptr_eq(v) && cb.ptr_eq(b) { + e + } else { + KExpr::let_mdata(n.clone(), ct, cv, cb, *nd, e.mdata().clone()) + } + }, + ExprData::Prj(id, f, v, _) => { + let cv = self.intern_expr(v.clone()); + if cv.ptr_eq(v) { + e + } else { + KExpr::prj_mdata(id.clone(), *f, cv, e.mdata().clone()) + } + }, + ExprData::Var(..) + | ExprData::FVar(..) + | ExprData::Nat(..) + | ExprData::Str(..) => e, + }; + let key = expr_key(&e); if let Some(existing) = self.exprs.get(&key) { + // The shallow key (exact structural Eq over variant tag + child + // uids + payload — never a truncated or content-hashed key) plus + // canonical children make this hit structurally exact. Checked in + // debug builds; a violation here would be an interning bug, not + // an input an adversary can craft (uids are assigned, not hashed). + debug_assert!(existing == &e, "intern hit is not structurally equal"); return existing.clone(); } - self.exprs.entry(key).or_insert(e).clone() + self.canon_exprs.insert(*e.addr()); + self.exprs.insert(key, e.clone()); + e } } @@ -254,9 +461,9 @@ pub struct KEnv { // than `Arc::as_ptr` pointers, avoiding the ABA problem where deallocated // pointers are reused by the allocator for semantically different expressions. /// WHNF cache (full, with delta): (expr_hash, ctx_hash)-keyed. - pub whnf_cache: FxHashMap<(Addr, Addr), KExpr>, + pub whnf_cache: FxHashMap<(Addr, CtxAddr), KExpr>, /// WHNF cache (no delta): (expr_hash, ctx_hash)-keyed. - pub whnf_no_delta_cache: FxHashMap<(Addr, Addr), KExpr>, + pub whnf_no_delta_cache: FxHashMap<(Addr, CtxAddr), KExpr>, /// Cheap-mode WHNF cache (no delta, DEF_EQ_CORE flags): same key shape as /// `whnf_no_delta_cache`, but populated by cheap-projection callers in the /// def-eq lazy-delta loop. Cheap output is NOT shared with full callers @@ -265,41 +472,41 @@ pub struct KEnv { /// gated to cheap-mode callers only — mirrors the `def_eq_cheap_cache` /// pattern. Without this, every iteration of the lazy-delta loop redoes /// `whnf_no_delta_for_def_eq` from scratch (mathlib hot path). - pub whnf_no_delta_cheap_cache: FxHashMap<(Addr, Addr), KExpr>, + pub whnf_no_delta_cheap_cache: FxHashMap<(Addr, CtxAddr), KExpr>, /// WHNF core cache: structural-only reduction (beta/iota/zeta/proj), /// no native primitives, no delta. Mirrors lean4lean's `whnfCoreCache` /// (refs/lean4lean/Lean4Lean/TypeChecker.lean:19) and lean4 C++'s /// `m_whnf_core`. Populated only when flags are FULL — cheap-projection /// results are not safe to share with full callers. - pub whnf_core_cache: FxHashMap<(Addr, Addr), KExpr>, + pub whnf_core_cache: FxHashMap<(Addr, CtxAddr), KExpr>, /// Cheap-mode WHNF core cache: same key shape as `whnf_core_cache`, but /// populated by cheap-projection callers (DEF_EQ_CORE flags) inside the /// def-eq lazy-delta loop. Same soundness reasoning as /// `whnf_no_delta_cheap_cache` — cheap output stays in its own pool so /// full callers always see a properly-reduced result. - pub whnf_core_cheap_cache: FxHashMap<(Addr, Addr), KExpr>, + pub whnf_core_cheap_cache: FxHashMap<(Addr, CtxAddr), KExpr>, /// Infer cache: keyed by (expr_hash, ctx_hash). Context-dependent. /// Populated only from full-mode `infer` (i.e. not from `with_infer_only`), /// so every cached result has passed the validation `infer_only` skips. /// Both modes read from this same cache — an `infer_only` lookup happily /// consumes a full-mode result since it's strictly stronger. - pub infer_cache: FxHashMap<(Addr, Addr), KExpr>, + pub infer_cache: FxHashMap<(Addr, CtxAddr), KExpr>, /// Infer-only cache: keyed like `infer_cache`, but populated only by /// `with_infer_only` synthesis and read only while infer-only is active. /// This keeps unchecked results out of the validated full-mode cache while /// still sharing repeated proof-irrelevance/projection probes. - pub infer_only_cache: FxHashMap<(Addr, Addr), KExpr>, + pub infer_only_cache: FxHashMap<(Addr, CtxAddr), KExpr>, /// Full def-eq cache: keyed by (expr_hash, expr_hash, ctx_hash). /// Context-dependent. Entries in this cache are valid for both full and /// cheap def-eq callers. - pub def_eq_cache: FxHashMap<(Addr, Addr, Addr), bool>, + pub def_eq_cache: FxHashMap<(Addr, Addr, CtxAddr), bool>, /// Cheap def-eq cache: same key as `def_eq_cache`, but only for comparisons /// performed inside cheap projection reductions. Cheap `false` can be a /// full-mode false negative, so those entries must not be visible to full /// callers. - pub def_eq_cheap_cache: FxHashMap<(Addr, Addr, Addr), bool>, + pub def_eq_cheap_cache: FxHashMap<(Addr, Addr, CtxAddr), bool>, /// Failed def-eq pairs in lazy delta: canonical ordering by hash. - pub def_eq_failure: FxHashSet<(Addr, Addr, Addr)>, + pub def_eq_failure: FxHashSet<(Addr, Addr, CtxAddr)>, /// Constant-instantiation cache: caches the result of /// `instantiate_univ_params(val, us)` for each `Const(id, us)` head encountered /// during delta unfolding. Keyed by the head expression's content hash, which @@ -314,11 +521,11 @@ pub struct KEnv { /// this memo a stuck `Nat.succ^k(x)` chain is re-peeled from every /// depth it is encountered at — O(k²) fuel for symbolic-plus-literal /// Nat arithmetic (e.g. `x + 0xC0` in the UTF-8 codec proofs). - pub nat_succ_stuck: FxHashSet<(Addr, Addr)>, + pub nat_succ_stuck: FxHashSet<(Addr, CtxAddr)>, /// Ingress cache: LeanExpr → KExpr conversion results. /// Keyed by (expr_hash, param_names_hash) to account for different /// level param bindings producing different KExprs from the same LeanExpr. - pub ingress_cache: FxHashMap<(Addr, Addr), KExpr>, + pub ingress_cache: FxHashMap<(CtxAddr, CtxAddr), KExpr>, /// "Is this type Prop?" cache, keyed by (type_hash, ctx_hash). /// /// `try_proof_irrel` is called on essentially every `is_def_eq` @@ -330,7 +537,7 @@ pub struct KEnv { /// proof-irrelevance probe skip those three calls. Empirically this /// is the dominant cost on mathlib proof-heavy blocks, where the same /// propositions are tested for equality thousands of times. - pub is_prop_cache: FxHashMap<(Addr, Addr), bool>, + pub is_prop_cache: FxHashMap<(Addr, CtxAddr), bool>, /// Generated recursors, keyed by inductive Muts block id. pub recursor_cache: FxHashMap, Vec>>, /// Nested-auxiliary order expected by stored recursors in this environment. @@ -347,6 +554,12 @@ pub struct KEnv { /// so every member of a bad block reports the same structured failure. pub block_check_results: FxHashMap, Result<(), TcError>>, + /// Primitive-reducer family per head-constant address (see + /// `whnf.rs::PrimFamily`). Pure function of the address; memoized so + /// the WHNF loops classify each head with one map probe instead of a + /// ~30-address compare gauntlet across five recognizers. + pub prim_family_cache: FxHashMap, + /// Next free-variable id for checker-local binder openings. /// /// Type-checking caches live on `KEnv`, not on one `TypeChecker`, so FVar @@ -421,6 +634,7 @@ impl KEnv { rec_majors_cache: FxHashMap::default(), block_peer_agreement_cache: FxHashSet::default(), block_check_results: FxHashMap::default(), + prim_family_cache: FxHashMap::default(), next_fvar_id: 0, perf: PerfCounters::default(), profile_sink: None, @@ -511,11 +725,14 @@ impl KEnv { self.blocks.clear(); self.intern.univs.clear(); self.intern.exprs.clear(); + self.intern.canon_exprs.clear(); + self.intern.canon_univs.clear(); // Scratch buffers retain entries from the most recent subst/lift call; // emptying them releases the KExpr Arc references they hold so the // intern.exprs cleanup above can actually drop ExprData allocations. self.intern.subst_scratch.clear(); self.intern.lift_scratch.clear(); + self.intern.clo_scratch_pool.clear(); let _ = self.prims.take(); self.whnf_cache.clear(); self.whnf_no_delta_cache.clear(); @@ -535,6 +752,7 @@ impl KEnv { self.rec_majors_cache.clear(); self.block_peer_agreement_cache.clear(); self.block_check_results.clear(); + self.prim_family_cache.clear(); self.next_fvar_id = 0; } @@ -597,6 +815,7 @@ impl KEnv { self.rec_majors_cache = FxHashMap::default(); self.block_peer_agreement_cache = FxHashSet::default(); self.block_check_results = FxHashMap::default(); + self.prim_family_cache = FxHashMap::default(); self.next_fvar_id = 0; } diff --git a/crates/kernel/src/equiv.rs b/crates/kernel/src/equiv.rs index aeec2727..2bde4d1c 100644 --- a/crates/kernel/src/equiv.rs +++ b/crates/kernel/src/equiv.rs @@ -6,10 +6,10 @@ use rustc_hash::FxHashMap; -use super::env::Addr; +use super::env::{Addr, CtxAddr}; /// Composite key: (expression content hash, context content hash). -pub type EqKey = (Addr, Addr); +pub type EqKey = (Addr, CtxAddr); /// Union-find structure for tracking definitional equality between /// (expr_hash, ctx_hash) pairs. @@ -141,13 +141,17 @@ mod tests { use super::*; fn addr(n: u64) -> Addr { + n + } + + fn ctx(n: u64) -> CtxAddr { blake3::hash(&n.to_le_bytes()) } #[test] fn test_basic_equiv() { let mut em = EquivManager::new(); - let zero = addr(0); + let zero = ctx(0); assert!(!em.is_equiv(&(addr(100), zero), &(addr(200), zero))); em.add_equiv((addr(100), zero), (addr(200), zero)); assert!(em.is_equiv(&(addr(100), zero), &(addr(200), zero))); @@ -157,7 +161,7 @@ mod tests { #[test] fn test_transitivity() { let mut em = EquivManager::new(); - let zero = addr(0); + let zero = ctx(0); em.add_equiv((addr(100), zero), (addr(200), zero)); em.add_equiv((addr(200), zero), (addr(300), zero)); assert!(em.is_equiv(&(addr(100), zero), &(addr(300), zero))); @@ -166,8 +170,8 @@ mod tests { #[test] fn test_context_isolation() { let mut em = EquivManager::new(); - let ctx1 = addr(1); - let ctx2 = addr(2); + let ctx1 = ctx(1); + let ctx2 = ctx(2); em.add_equiv((addr(100), ctx1), (addr(200), ctx1)); assert!(em.is_equiv(&(addr(100), ctx1), &(addr(200), ctx1))); assert!(!em.is_equiv(&(addr(100), ctx2), &(addr(200), ctx2))); diff --git a/crates/kernel/src/expr.rs b/crates/kernel/src/expr.rs index fe5ce40c..33a59916 100644 --- a/crates/kernel/src/expr.rs +++ b/crates/kernel/src/expr.rs @@ -8,10 +8,7 @@ use std::sync::Arc; use bignat::Nat; use ix_common::address::Address; -use ix_common::env::{ - BinderInfo, DataValue, EALL, EAPP, EFVAR, ELAM, ELET, ENAT, EPRJ, EREF, - ESORT, ESTR, EVAR, Name, -}; +use ix_common::env::{BinderInfo, DataValue, Name}; use super::env::Addr; use super::id::KId; @@ -138,8 +135,10 @@ impl KExpr { &self.info().mdata } - /// Content-addressed key for cache lookups. Returns the blake3 hash - /// by value — `Addr` is `Copy`, so this is a 32-byte memcpy. + /// Canonical identity key for cache lookups: the intern-assigned uid. + /// `Addr` is a plain `u64`, allocated process-globally and never reused, + /// so uid equality implies "same construction event or same intern-table + /// canonical value" — strictly, structural equality. pub fn hash_key(&self) -> Addr { *self.addr() } @@ -148,25 +147,73 @@ impl KExpr { Arc::ptr_eq(&self.0, &other.0) } - /// Content-addressed equality with a layered fast path. - /// - /// 1. `ptr_eq` on the outer `KExpr` Arc — fires when both sides - /// came through the [`InternTable`](super::env::InternTable). - /// 2. 32-byte Blake3 hash compare — sound on its own (collisions - /// require an adversarial preimage attack), and a single AVX2 - /// cycle on modern x86. Earlier revisions interposed an - /// `Arc::ptr_eq` fast path on a process-globally-interned `Addr`, - /// but that intern table dominated RSS at mathlib scale; the - /// pure-content compare keeps the same correctness with no - /// process-global state. + /// Canonical-identity equality: `ptr_eq` or equal intern uid. Sound in + /// the affirmative (equal uid ⇒ structurally equal); INCOMPLETE in the + /// negative — two structurally equal expressions built without sharing + /// an [`InternTable`](super::env::InternTable) carry distinct uids. + /// Callers that need a definitive verdict use `==` (structural with this + /// as the fast path); callers where a false negative merely costs a + /// cache miss or a def-eq fallback use this directly. pub fn hash_eq(&self, other: &KExpr) -> bool { self.ptr_eq(other) || self.addr() == other.addr() } + + /// Structural equality mirroring exactly what the intern identity + /// covers: display names, binder info, and mdata are NOT compared + /// (matching the historical content-hash semantics). Equal interned + /// subtrees prune at the uid fast path, so canonical-vs-canonical + /// comparison is O(1). + fn structural_eq(&self, other: &Self) -> bool { + if self.hash_eq(other) { + return true; + } + match (self.data(), other.data()) { + (ExprData::Var(i, _, _), ExprData::Var(j, _, _)) => i == j, + (ExprData::FVar(x, _, _), ExprData::FVar(y, _, _)) => x == y, + (ExprData::Sort(u, _), ExprData::Sort(v, _)) => u == v, + (ExprData::Const(id1, us1, _), ExprData::Const(id2, us2, _)) => { + id1.addr == id2.addr + && us1.len() == us2.len() + && us1.iter().zip(us2.iter()).all(|(a, b)| a == b) + }, + (ExprData::App(f1, a1, _), ExprData::App(f2, a2, _)) => { + f1.structural_eq(f2) && a1.structural_eq(a2) + }, + (ExprData::Lam(_, _, t1, b1, _), ExprData::Lam(_, _, t2, b2, _)) + | (ExprData::All(_, _, t1, b1, _), ExprData::All(_, _, t2, b2, _)) => { + t1.structural_eq(t2) && b1.structural_eq(b2) + }, + ( + ExprData::Let(_, t1, v1, b1, nd1, _), + ExprData::Let(_, t2, v2, b2, nd2, _), + ) => { + nd1 == nd2 + && t1.structural_eq(t2) + && v1.structural_eq(v2) + && b1.structural_eq(b2) + }, + (ExprData::Prj(id1, f1, v1, _), ExprData::Prj(id2, f2, v2, _)) => { + id1.addr == id2.addr && f1 == f2 && v1.structural_eq(v2) + }, + // Literals: the blob address IS the identity (mirroring the old + // content hash, which hashed only the address). The value conjunct + // is defense in depth: load-time blob verification makes it + // redundant, and if that invariant were ever violated it degrades + // to inequality — the conservative direction. + (ExprData::Nat(v1, ba1, _), ExprData::Nat(v2, ba2, _)) => { + ba1 == ba2 && v1 == v2 + }, + (ExprData::Str(v1, ba1, _), ExprData::Str(v2, ba2, _)) => { + ba1 == ba2 && v1 == v2 + }, + _ => false, + } + } } impl PartialEq for KExpr { fn eq(&self, other: &Self) -> bool { - self.hash_eq(other) + self.structural_eq(other) } } @@ -186,18 +233,40 @@ fn mk_info( ExprInfo { addr, lbr, count_0, has_fvars, mdata } } +/// Process-global uid allocator for expression/universe identity. +/// +/// Every constructed node takes a fresh uid, so uid equality means "same +/// construction event" — or, after [`InternTable`](super::env::InternTable) +/// hash-consing, "same canonical value". Uids are NEVER reused (the counter +/// is global, not per-table), so cache keys built from uids stay sound +/// across intern-table clears: a stale key can only miss, never alias. +/// +/// This replaces the per-node blake3 content hash: profiling on the Zisk +/// guest put `app_hash` + the blake3 wrapper at ~20% of cycles on +/// reduction-heavy constants, all of it spent computing identity that the +/// intern table can assign in one atomic increment. +static NEXT_UID: std::sync::atomic::AtomicU64 = + std::sync::atomic::AtomicU64::new(1); + +#[inline] +pub(crate) fn fresh_uid() -> Addr { + let uid = NEXT_UID.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + // Uid exhaustion guard. Uids are ASSIGNED (sequential), never computed + // from content, so two distinct terms can only alias if the counter + // wraps — which would take centuries at nanosecond allocation rates + // (and >2^67 guest cycles in-circuit). Abort rather than reason about + // it: identity soundness must not rest on "probably won't happen". + assert!(uid < u64::MAX, "kernel uid counter exhausted"); + uid +} + // ============================================================================= -// Hash-first interning: each `*_mdata` constructor is split into a -// hash-only function (no allocation) and a `*_mdata_with_addr` builder -// that takes a precomputed canonical [`Addr`]. The plain `*_mdata` form is -// kept as a convenience wrapper for callers that don't pre-check the -// intern table. -// -// Hot-path callers in `ingress.rs` use the split form so they can ask -// `InternTable::try_get_expr(&hash)` *before* paying the -// blake3-hash + `intern_addr` + `Arc` allocation cost — a -// significant win because >60% of constructed values are immediately -// discarded for an existing canonical Arc on the intern table. +// Constructors: each `*_mdata` form allocates a fresh uid; the +// `*_mdata_with_addr` builders remain for the intern table and callers +// that already hold a fresh uid. Structural identity (what used to be the +// content hash) is decided by `InternTable`'s shallow keys — see +// `env.rs::ExprKey` — which mirror what the historical hash functions +// covered: names, binder info, and mdata are excluded. // ============================================================================= impl KExpr { @@ -207,21 +276,7 @@ impl KExpr { /// Compute the content hash for [`KExpr::var_mdata`] without allocating. /// - /// `name` is descriptive metadata only and intentionally NOT hashed — - /// two `Var(i)` nodes with different display names are content-equal, - /// keeping hash equality alpha-invariant even in `Meta` mode. - pub fn var_hash( - idx: u64, - _name: &M::MField, - _mdata: &M::MField>, - ) -> blake3::Hash { - let mut h = blake3::Hasher::new(); - h.update(&[EVAR]); - h.update(&idx.to_le_bytes()); - h.finalize() - } - - pub fn var_mdata_with_addr( + fn var_mdata_with_addr( idx: u64, name: M::MField, mdata: M::MField>, @@ -236,22 +291,7 @@ impl KExpr { Self::fvar_mdata(id, name, no_mdata::()) } - /// Compute the content hash for [`KExpr::fvar_mdata`] without allocating. - /// Includes the [`FVarId`] so distinct fvars produce distinct hashes — the - /// soundness lever for keying caches by expression alone. `name` is - /// descriptive only and intentionally NOT hashed. - pub fn fvar_hash( - id: FVarId, - _name: &M::MField, - _mdata: &M::MField>, - ) -> blake3::Hash { - let mut h = blake3::Hasher::new(); - h.update(&[EFVAR]); - h.update(&id.0.to_le_bytes()); - h.finalize() - } - - pub fn fvar_mdata_with_addr( + fn fvar_mdata_with_addr( id: FVarId, name: M::MField, mdata: M::MField>, @@ -268,7 +308,7 @@ impl KExpr { name: M::MField, mdata: M::MField>, ) -> Self { - let addr = Self::fvar_hash(id, &name, &mdata); + let addr = fresh_uid(); Self::fvar_mdata_with_addr(id, name, mdata, addr) } @@ -277,7 +317,7 @@ impl KExpr { name: M::MField, mdata: M::MField>, ) -> Self { - let addr = Self::var_hash(idx, &name, &mdata); + let addr = fresh_uid(); Self::var_mdata_with_addr(idx, name, mdata, addr) } @@ -285,17 +325,7 @@ impl KExpr { Self::sort_mdata(u, no_mdata::()) } - pub fn sort_hash( - u: &KUniv, - _mdata: &M::MField>, - ) -> blake3::Hash { - let mut h = blake3::Hasher::new(); - h.update(&[ESORT]); - h.update(u.addr().as_bytes()); - h.finalize() - } - - pub fn sort_mdata_with_addr( + fn sort_mdata_with_addr( u: KUniv, mdata: M::MField>, addr: Addr, @@ -304,7 +334,7 @@ impl KExpr { } pub fn sort_mdata(u: KUniv, mdata: M::MField>) -> Self { - let addr = Self::sort_hash(&u, &mdata); + let addr = fresh_uid(); Self::sort_mdata_with_addr(u, mdata, addr) } @@ -312,25 +342,7 @@ impl KExpr { Self::cnst_mdata(id, univs, no_mdata::()) } - /// `id.addr` is the constant's content-address — its identity. The - /// `id.name` field is display-only metadata, intentionally NOT hashed, - /// so two references to the same address with different display names - /// remain content-equal. - pub fn cnst_hash( - id: &KId, - univs: &[KUniv], - _mdata: &M::MField>, - ) -> blake3::Hash { - let mut h = blake3::Hasher::new(); - h.update(&[EREF]); - h.update(id.addr.as_bytes()); - for u in univs.iter() { - h.update(u.addr().as_bytes()); - } - h.finalize() - } - - pub fn cnst_mdata_with_addr( + fn cnst_mdata_with_addr( id: KId, univs: Box<[KUniv]>, mdata: M::MField>, @@ -348,7 +360,7 @@ impl KExpr { univs: Box<[KUniv]>, mdata: M::MField>, ) -> Self { - let addr = Self::cnst_hash(&id, &univs, &mdata); + let addr = fresh_uid(); Self::cnst_mdata_with_addr(id, univs, mdata, addr) } @@ -356,19 +368,7 @@ impl KExpr { Self::app_mdata(f, a, no_mdata::()) } - pub fn app_hash( - f: &KExpr, - a: &KExpr, - _mdata: &M::MField>, - ) -> blake3::Hash { - let mut h = blake3::Hasher::new(); - h.update(&[EAPP]); - h.update(f.addr().as_bytes()); - h.update(a.addr().as_bytes()); - h.finalize() - } - - pub fn app_mdata_with_addr( + fn app_mdata_with_addr( f: KExpr, a: KExpr, mdata: M::MField>, @@ -389,7 +389,7 @@ impl KExpr { a: KExpr, mdata: M::MField>, ) -> Self { - let addr = Self::app_hash(&f, &a, &mdata); + let addr = fresh_uid(); Self::app_mdata_with_addr(f, a, mdata, addr) } @@ -404,26 +404,7 @@ impl KExpr { /// Compute the content hash for [`KExpr::lam_mdata`]. /// - /// Binder `name` and `bi` are display/elaboration metadata only and are - /// intentionally NOT hashed. The kernel does not distinguish lambdas - /// that differ only in binder name or binder info; this keeps hash - /// equality structural and alpha-invariant in `Meta` mode (matching - /// `Anon` mode where these fields are erased). - pub fn lam_hash( - _name: &M::MField, - _bi: &M::MField, - ty: &KExpr, - body: &KExpr, - _mdata: &M::MField>, - ) -> blake3::Hash { - let mut h = blake3::Hasher::new(); - h.update(&[ELAM]); - h.update(ty.addr().as_bytes()); - h.update(body.addr().as_bytes()); - h.finalize() - } - - pub fn lam_mdata_with_addr( + fn lam_mdata_with_addr( name: M::MField, bi: M::MField, ty: KExpr, @@ -448,7 +429,7 @@ impl KExpr { body: KExpr, mdata: M::MField>, ) -> Self { - let addr = Self::lam_hash(&name, &bi, &ty, &body, &mdata); + let addr = fresh_uid(); Self::lam_mdata_with_addr(name, bi, ty, body, mdata, addr) } @@ -461,22 +442,7 @@ impl KExpr { Self::all_mdata(name, bi, ty, body, no_mdata::()) } - /// See [`KExpr::lam_hash`] — binder `name`/`bi` intentionally not hashed. - pub fn all_hash( - _name: &M::MField, - _bi: &M::MField, - ty: &KExpr, - body: &KExpr, - _mdata: &M::MField>, - ) -> blake3::Hash { - let mut h = blake3::Hasher::new(); - h.update(&[EALL]); - h.update(ty.addr().as_bytes()); - h.update(body.addr().as_bytes()); - h.finalize() - } - - pub fn all_mdata_with_addr( + fn all_mdata_with_addr( name: M::MField, bi: M::MField, ty: KExpr, @@ -501,7 +467,7 @@ impl KExpr { body: KExpr, mdata: M::MField>, ) -> Self { - let addr = Self::all_hash(&name, &bi, &ty, &body, &mdata); + let addr = fresh_uid(); Self::all_mdata_with_addr(name, bi, ty, body, mdata, addr) } @@ -515,28 +481,7 @@ impl KExpr { Self::let_mdata(name, ty, val, body, non_dep, no_mdata::()) } - /// See [`KExpr::lam_hash`] — binder `name` is intentionally not hashed. - /// `non_dep` IS hashed: dropping it would intern two letEs that differ only - /// in `non_dep` to the same KExpr, and egress would then return whichever - /// `non_dep` was interned first, breaking Ixon roundtrip fidelity. - pub fn let_hash( - _name: &M::MField, - ty: &KExpr, - val: &KExpr, - body: &KExpr, - non_dep: bool, - _mdata: &M::MField>, - ) -> blake3::Hash { - let mut h = blake3::Hasher::new(); - h.update(&[ELET]); - h.update(ty.addr().as_bytes()); - h.update(val.addr().as_bytes()); - h.update(body.addr().as_bytes()); - h.update(&[u8::from(non_dep)]); - h.finalize() - } - - pub fn let_mdata_with_addr( + fn let_mdata_with_addr( name: M::MField, ty: KExpr, val: KExpr, @@ -563,7 +508,7 @@ impl KExpr { non_dep: bool, mdata: M::MField>, ) -> Self { - let addr = Self::let_hash(&name, &ty, &val, &body, non_dep, &mdata); + let addr = fresh_uid(); Self::let_mdata_with_addr(name, ty, val, body, non_dep, mdata, addr) } @@ -571,22 +516,7 @@ impl KExpr { Self::prj_mdata(id, field, val, no_mdata::()) } - /// `id.name` is display-only metadata, intentionally NOT hashed. - pub fn prj_hash( - id: &KId, - field: u64, - val: &KExpr, - _mdata: &M::MField>, - ) -> blake3::Hash { - let mut h = blake3::Hasher::new(); - h.update(&[EPRJ]); - h.update(id.addr.as_bytes()); - h.update(&field.to_le_bytes()); - h.update(val.addr().as_bytes()); - h.finalize() - } - - pub fn prj_mdata_with_addr( + fn prj_mdata_with_addr( id: KId, field: u64, val: KExpr, @@ -604,7 +534,7 @@ impl KExpr { val: KExpr, mdata: M::MField>, ) -> Self { - let addr = Self::prj_hash(&id, field, &val, &mdata); + let addr = fresh_uid(); Self::prj_mdata_with_addr(id, field, val, mdata, addr) } @@ -612,17 +542,7 @@ impl KExpr { Self::nat_mdata(val, blob_addr, no_mdata::()) } - pub fn nat_hash( - blob_addr: &Address, - _mdata: &M::MField>, - ) -> blake3::Hash { - let mut h = blake3::Hasher::new(); - h.update(&[ENAT]); - h.update(blob_addr.as_bytes()); - h.finalize() - } - - pub fn nat_mdata_with_addr( + fn nat_mdata_with_addr( val: Nat, blob_addr: Address, mdata: M::MField>, @@ -640,7 +560,7 @@ impl KExpr { blob_addr: Address, mdata: M::MField>, ) -> Self { - let addr = Self::nat_hash(&blob_addr, &mdata); + let addr = fresh_uid(); Self::nat_mdata_with_addr(val, blob_addr, mdata, addr) } @@ -648,17 +568,7 @@ impl KExpr { Self::str_mdata(val, blob_addr, no_mdata::()) } - pub fn str_hash( - blob_addr: &Address, - _mdata: &M::MField>, - ) -> blake3::Hash { - let mut h = blake3::Hasher::new(); - h.update(&[ESTR]); - h.update(blob_addr.as_bytes()); - h.finalize() - } - - pub fn str_mdata_with_addr( + fn str_mdata_with_addr( val: String, blob_addr: Address, mdata: M::MField>, @@ -676,7 +586,7 @@ impl KExpr { blob_addr: Address, mdata: M::MField>, ) -> Self { - let addr = Self::str_hash(&blob_addr, &mdata); + let addr = fresh_uid(); Self::str_mdata_with_addr(val, blob_addr, mdata, addr) } } @@ -826,7 +736,9 @@ mod tests { #[test] fn var_hash_deterministic() { - assert_eq!(AE::var(0, ()).addr(), AE::var(0, ()).addr()); + // Identity is intern-assigned now: independently built nodes carry + // distinct uids but remain structurally equal (`==`). + assert_eq!(AE::var(0, ()), AE::var(0, ())); } #[test] @@ -839,10 +751,7 @@ mod tests { // Binder names are descriptive metadata only — they do NOT contribute // to the content hash, so two `Var(0)` nodes with different display // names are content-equal. Keeps hash equality alpha-invariant. - assert_eq!( - ME::var(0, mk_name("x")).addr(), - ME::var(0, mk_name("y")).addr() - ); + assert_eq!(ME::var(0, mk_name("x")), ME::var(0, mk_name("y"))); } #[test] @@ -867,7 +776,7 @@ mod tests { // their display names. let a = ME::cnst(KId::new(mk_addr("Nat"), mk_name("Nat")), Box::new([])); let b = ME::cnst(KId::new(mk_addr("Nat"), mk_name("Int")), Box::new([])); - assert_eq!(a.addr(), b.addr()); + assert_eq!(a, b); } #[test] @@ -894,7 +803,7 @@ mod tests { let a = ME::lam(mk_name("x"), BinderInfo::Default, ty.clone(), body.clone()); let b = ME::lam(mk_name("y"), BinderInfo::Default, ty, body); - assert_eq!(a.addr(), b.addr()); + assert_eq!(a, b); } #[test] @@ -907,7 +816,7 @@ mod tests { let a = ME::lam(mk_name("x"), BinderInfo::Default, ty.clone(), body.clone()); let b = ME::lam(mk_name("x"), BinderInfo::Implicit, ty, body); - assert_eq!(a.addr(), b.addr()); + assert_eq!(a, b); } #[test] diff --git a/crates/kernel/src/inductive.rs b/crates/kernel/src/inductive.rs index 52e81028..9773d554 100644 --- a/crates/kernel/src/inductive.rs +++ b/crates/kernel/src/inductive.rs @@ -501,7 +501,7 @@ impl TypeChecker<'_, M> { let mut flat: Vec> = Vec::new(); // (ext_ind_addr, spec_params content hashes) for dedup. // Uses [u8; 32] blake3 digest for structural equality. - let mut aux_seen: Vec<(Address, Vec<[u8; 32]>)> = Vec::new(); + let mut aux_seen: Vec<(Address, Vec>)> = Vec::new(); // Seed with original block inductives. for ind_id in block_inds { @@ -622,7 +622,7 @@ impl TypeChecker<'_, M> { dom: &KExpr, block_addrs: &[Address], flat: &mut Vec>, - aux_seen: &mut Vec<(Address, Vec<[u8; 32]>)>, + aux_seen: &mut Vec<(Address, Vec>)>, univ_offset: u64, param_depth: usize, // depth at the param context (before field locals) n_rec_params: u64, // number of inductive parameters (valid Var refs in spec_params) @@ -713,9 +713,9 @@ impl TypeChecker<'_, M> { } // Dedup: check if we've already seen this (ext_ind, spec_params) pair. - // Use blake3 content hash (addr) for structural dedup. - let spec_hashes: Vec<[u8; 32]> = - spec_params.iter().map(|e| *e.addr().as_bytes()).collect(); + // Structural comparison (uid fast path + recursive fallback) so + // equal-but-separately-built spec params still collapse. + let spec_hashes: Vec> = spec_params.clone(); if aux_seen.iter().any(|(a, s)| { *a == head_id.addr && s.len() == spec_hashes.len() @@ -1106,10 +1106,10 @@ impl TypeChecker<'_, M> { h.update(&(source_idx as u64).to_le_bytes()); h.update(member.id.addr.as_bytes()); for sp in &member.spec_params { - h.update(sp.addr().as_bytes()); + h.update(&sp.addr().to_le_bytes()); } for u in member.occurrence_us.iter() { - h.update(u.addr().as_bytes()); + h.update(&u.addr().to_le_bytes()); } let aux_addr = Address::from_blake3_hash(h.finalize()); let aux_id = KId::new(aux_addr.clone(), M::meta_field(seed_name.clone())); diff --git a/crates/kernel/src/infer.rs b/crates/kernel/src/infer.rs index e65748eb..cbe061c0 100644 --- a/crates/kernel/src/infer.rs +++ b/crates/kernel/src/infer.rs @@ -598,7 +598,7 @@ fn compact_head(e: &KExpr) -> String { } fn short_addr(e: &KExpr) -> String { - e.addr().to_hex().chars().take(12).collect() + format!("uid{}", e.addr()) } #[cfg(test)] diff --git a/crates/kernel/src/ingress.rs b/crates/kernel/src/ingress.rs index 0b2a4e83..e3c16b63 100644 --- a/crates/kernel/src/ingress.rs +++ b/crates/kernel/src/ingress.rs @@ -19,7 +19,6 @@ use rustc_hash::{FxHashMap, FxHashSet}; use dashmap::DashMap; -use crate::env::Addr; use bignat::Nat; use ix_common::address::Address; #[cfg(not(target_arch = "riscv64"))] @@ -42,7 +41,7 @@ use ixon::metadata::{ use ixon::univ::Univ as IxonUniv; use super::constant::{KConst, RecRule}; -use super::env::{InternTable, KEnv}; +use super::env::{CtxAddr, InternTable, KEnv}; use super::expr::{KExpr, MData}; use super::id::KId; use super::level::KUniv; @@ -257,7 +256,7 @@ fn timed_intern_univ( return intern.intern_univ(u); } let t0 = Instant::now(); - let key = *u.addr(); + let key = super::env::univ_key(&u); let result = if let Some(existing) = intern.try_get_univ(&key) { stats.intern_univ_get_hits += 1; existing @@ -269,56 +268,45 @@ fn timed_intern_univ( result } -/// Hash-first interning. Precomputes the content hash, asks the intern -/// table for an existing canonical KExpr; only on a miss does it call -/// `build(addr)` to allocate a new KExpr. +/// Key-first interning. Builds the shallow structural key from already +/// canonical children, asks the intern table for an existing canonical +/// KExpr; only on a miss does it call `build()` to allocate a new KExpr +/// (which takes a fresh uid). /// -/// Why this exists: profiling on Mathlib shows `kexpr_construct` (the -/// blake3 hash + `Arc` allocation pair) is ~45% of `convert` -/// worker-sum, of which ~62% is wasted because the intern table -/// already has the same canonical value. By computing just the hash up -/// front and skipping construction entirely on a hit, we avoid the -/// allocation in the majority case. -/// -/// The `build` closure receives the precomputed `Addr` (a `blake3::Hash` -/// by value) and is expected to call one of the -/// `KExpr::*_mdata_with_addr` constructors so it can plug the hash into -/// `ExprInfo` without re-hashing. -/// -/// Stats accounting (when enabled): the hit path bumps -/// `intern_expr_get_hits`. The miss path also bumps `kexpr_construct_*` -/// for the cost of the closure body. `intern_expr_ns` covers the -/// surrounding DashMap traffic on both paths but excludes the -/// closure-internal time. +/// Why this exists: profiling on Mathlib showed the construction + +/// interning pair dominates `convert` worker-sum, and most constructed +/// values are immediately discarded for an existing canonical Arc. By +/// probing with just the shallow key we skip construction entirely on a +/// hit. (Historically the probe key was a precomputed blake3 content +/// hash; the shallow key preserves the skip while removing the hash.) #[inline] fn timed_intern_or_build( intern: &mut InternTable, - hash: blake3::Hash, - build: impl FnOnce(Addr) -> KExpr, + key: &super::env::ExprKey, + build: impl FnOnce() -> KExpr, stats: &mut ConvertStats, ) -> KExpr { if !stats.enabled { - if let Some(existing) = intern.try_get_expr(&hash) { + if let Some(existing) = intern.try_get_expr(key) { return existing; } - return intern.intern_expr(build(hash)); + return intern.intern_expr(build()); } let t0 = Instant::now(); - if let Some(existing) = intern.try_get_expr(&hash) { + if let Some(existing) = intern.try_get_expr(key) { stats.intern_expr_get_hits += 1; stats.intern_expr_calls += 1; stats.intern_expr_ns += elapsed_ns(t0); return existing; } - let addr = hash; let kc_t0 = Instant::now(); - let new = build(addr); + let new = build(); let kc_elapsed = elapsed_ns(kc_t0); stats.kexpr_construct_ns += kc_elapsed; stats.kexpr_construct_calls += 1; let interned = intern.intern_expr(new); let total = elapsed_ns(t0); - // Account for the DashMap traffic only — the closure body's time is + // Account for the map traffic only — the closure body's time is // already in `kexpr_construct_ns`. stats.intern_expr_ns += total.saturating_sub(kc_elapsed); stats.intern_expr_calls += 1; @@ -711,25 +699,21 @@ fn ingress_expr( if mdata_layers.is_empty() { let name_field = M::meta_field(name); let mdata_field: M::MField> = M::meta_field(vec![]); - let hash = KExpr::::var_hash(*idx, &name_field, &mdata_field); + let key = super::env::ExprKey::Var(*idx); values.push(timed_intern_or_build( intern, - hash, - |addr| { - KExpr::var_mdata_with_addr(*idx, name_field, mdata_field, addr) - }, + &key, + || KExpr::var_mdata(*idx, name_field, mdata_field), stats, )); } else { let name_field = M::meta_field(name); let mdata_field = M::meta_field(mdata_layers); - let hash = KExpr::::var_hash(*idx, &name_field, &mdata_field); + let key = super::env::ExprKey::Var(*idx); values.push(timed_intern_or_build( intern, - hash, - |addr| { - KExpr::var_mdata_with_addr(*idx, name_field, mdata_field, addr) - }, + &key, + || KExpr::var_mdata(*idx, name_field, mdata_field), stats, )); } @@ -762,11 +746,11 @@ fn ingress_expr( })?) .ok_or_else(|| format!("invalid Sort univ index {idx}"))?; let zu = ingress_univ(u, ctx, intern, univ_cache, stats); - let hash = KExpr::::sort_hash(&zu, &mdata); + let key = super::env::ExprKey::Sort(*zu.addr()); values.push(timed_intern_or_build( intern, - hash, - |addr| KExpr::sort_mdata_with_addr(zu, mdata, addr), + &key, + || KExpr::sort_mdata(zu, mdata), stats, )); }, @@ -802,11 +786,14 @@ fn ingress_expr( let univs = ingress_univ_args(univ_idxs, ctx, intern, univ_cache, stats)?; let id = KId::new(addr, name_field); - let hash = KExpr::::cnst_hash(&id, &univs, &mdata); + let key = super::env::ExprKey::Const( + id.addr.clone(), + univs.iter().map(|u| *u.addr()).collect(), + ); values.push(timed_intern_or_build( intern, - hash, - |a| KExpr::cnst_mdata_with_addr(id, univs, mdata, a), + &key, + || KExpr::cnst_mdata(id, univs, mdata), stats, )); }, @@ -823,11 +810,14 @@ fn ingress_expr( .clone(); let univs = ingress_univ_args(univ_idxs, ctx, intern, univ_cache, stats)?; - let hash = KExpr::::cnst_hash(&mid, &univs, &mdata); + let key = super::env::ExprKey::Const( + mid.addr.clone(), + univs.iter().map(|u| *u.addr()).collect(), + ); values.push(timed_intern_or_build( intern, - hash, - |a| KExpr::cnst_mdata_with_addr(mid, univs, mdata, a), + &key, + || KExpr::cnst_mdata(mid, univs, mdata), stats, )); }, @@ -930,11 +920,14 @@ fn ingress_expr( let id = KId::new(addr, M::meta_field(name)); let mdata_field: M::MField> = M::meta_field(vec![]); - let hash = KExpr::::cnst_hash(&id, &univs, &mdata_field); + let key = super::env::ExprKey::Const( + id.addr.clone(), + univs.iter().map(|u| *u.addr()).collect(), + ); timed_intern_or_build( intern, - hash, - |a| KExpr::cnst_mdata_with_addr(id, univs, mdata_field, a), + &key, + || KExpr::cnst_mdata(id, univs, mdata_field), stats, ) }, @@ -957,11 +950,14 @@ fn ingress_expr( )?; let mdata_field: M::MField> = M::meta_field(vec![]); - let hash = KExpr::::cnst_hash(&mid, &univs, &mdata_field); + let key = super::env::ExprKey::Const( + mid.addr.clone(), + univs.iter().map(|u| *u.addr()).collect(), + ); timed_intern_or_build( intern, - hash, - |a| KExpr::cnst_mdata_with_addr(mid, univs, mdata_field, a), + &key, + || KExpr::cnst_mdata(mid, univs, mdata_field), stats, ) }, @@ -1188,11 +1184,11 @@ fn ingress_expr( format!("invalid UTF-8 in Str blob at addr {}: {e}", addr.hex()) })?; let blob_addr = addr.clone(); - let hash = KExpr::::str_hash(&blob_addr, &mdata); + let key = super::env::ExprKey::Str(blob_addr.clone()); values.push(timed_intern_or_build( intern, - hash, - |a| KExpr::str_mdata_with_addr(s, blob_addr, mdata, a), + &key, + || KExpr::str_mdata(s, blob_addr, mdata), stats, )); }, @@ -1215,11 +1211,11 @@ fn ingress_expr( } let n = Nat::from_le_bytes(&blob); let blob_addr = addr.clone(); - let hash = KExpr::::nat_hash(&blob_addr, &mdata); + let key = super::env::ExprKey::Nat(blob_addr.clone()); values.push(timed_intern_or_build( intern, - hash, - |a| KExpr::nat_mdata_with_addr(n, blob_addr, mdata, a), + &key, + || KExpr::nat_mdata(n, blob_addr, mdata), stats, )); }, @@ -1241,11 +1237,11 @@ fn ingress_expr( let cont_t0 = if stats.enabled { Some(Instant::now()) } else { None }; let a = values.pop().unwrap(); let f = values.pop().unwrap(); - let hash = KExpr::::app_hash(&f, &a, &mdata); + let key = super::env::ExprKey::App(*f.addr(), *a.addr()); values.push(timed_intern_or_build( intern, - hash, - |addr| KExpr::app_mdata_with_addr(f, a, mdata, addr), + &key, + || KExpr::app_mdata(f, a, mdata), stats, )); if let Some(t0) = cont_t0 { @@ -1264,11 +1260,11 @@ fn ingress_expr( let cont_t0 = if stats.enabled { Some(Instant::now()) } else { None }; let body = values.pop().unwrap(); let ty = values.pop().unwrap(); - let hash = KExpr::::lam_hash(&name, &bi, &ty, &body, &mdata); + let key = super::env::ExprKey::Lam(*ty.addr(), *body.addr()); values.push(timed_intern_or_build( intern, - hash, - |addr| KExpr::lam_mdata_with_addr(name, bi, ty, body, mdata, addr), + &key, + || KExpr::lam_mdata(name, bi, ty, body, mdata), stats, )); if let Some(t0) = cont_t0 { @@ -1287,11 +1283,11 @@ fn ingress_expr( let cont_t0 = if stats.enabled { Some(Instant::now()) } else { None }; let body = values.pop().unwrap(); let ty = values.pop().unwrap(); - let hash = KExpr::::all_hash(&name, &bi, &ty, &body, &mdata); + let key = super::env::ExprKey::All(*ty.addr(), *body.addr()); values.push(timed_intern_or_build( intern, - hash, - |addr| KExpr::all_mdata_with_addr(name, bi, ty, body, mdata, addr), + &key, + || KExpr::all_mdata(name, bi, ty, body, mdata), stats, )); if let Some(t0) = cont_t0 { @@ -1312,13 +1308,12 @@ fn ingress_expr( let body = values.pop().unwrap(); let val = values.pop().unwrap(); let ty = values.pop().unwrap(); - let hash = KExpr::::let_hash(&name, &ty, &val, &body, nd, &mdata); + let key = + super::env::ExprKey::Let(*ty.addr(), *val.addr(), *body.addr(), nd); values.push(timed_intern_or_build( intern, - hash, - |addr| { - KExpr::let_mdata_with_addr(name, ty, val, body, nd, mdata, addr) - }, + &key, + || KExpr::let_mdata(name, ty, val, body, nd, mdata), stats, )); if let Some(t0) = cont_t0 { @@ -1342,11 +1337,12 @@ fn ingress_expr( ExprFrame::PrjDone { type_id, field_idx, mdata } => { let cont_t0 = if stats.enabled { Some(Instant::now()) } else { None }; let s = values.pop().unwrap(); - let hash = KExpr::::prj_hash(&type_id, field_idx, &s, &mdata); + let key = + super::env::ExprKey::Prj(type_id.addr.clone(), field_idx, *s.addr()); values.push(timed_intern_or_build( intern, - hash, - |addr| KExpr::prj_mdata_with_addr(type_id, field_idx, s, mdata, addr), + &key, + || KExpr::prj_mdata(type_id, field_idx, s, mdata), stats, )); if let Some(t0) = cont_t0 { @@ -2174,7 +2170,7 @@ pub fn resolve_lean_name_addr( /// Compute a stable hash for a `param_names` slice, used as part of the /// ingress cache key. Two calls with the same param names (in the same /// order) produce the same hash. -pub fn param_names_hash(param_names: &[Name]) -> Addr { +pub fn param_names_hash(param_names: &[Name]) -> CtxAddr { let mut hasher = blake3::Hasher::new(); hasher.update(&(param_names.len() as u64).to_le_bytes()); for n in param_names { @@ -2254,8 +2250,8 @@ pub fn lean_expr_to_zexpr_cached( intern: &mut InternTable, n2a: Option<&DashMap>, aux_n2a: Option<&DashMap>, - mut cache: Option<&mut FxHashMap<(Addr, Addr), KExpr>>, - pn_hash: Option<&Addr>, + mut cache: Option<&mut FxHashMap<(CtxAddr, CtxAddr), KExpr>>, + pn_hash: Option<&CtxAddr>, ) -> KExpr { // Check cache if let (Some(cache), Some(pn_hash)) = (cache.as_ref(), pn_hash) { @@ -2295,8 +2291,8 @@ fn lean_expr_to_zexpr_raw( intern: &mut InternTable, n2a: Option<&DashMap>, aux_n2a: Option<&DashMap>, - mut cache: Option<&mut FxHashMap<(Addr, Addr), KExpr>>, - pn_hash: Option<&Addr>, + mut cache: Option<&mut FxHashMap<(CtxAddr, CtxAddr), KExpr>>, + pn_hash: Option<&CtxAddr>, ) -> KExpr { // Walk through any consecutive `Mdata` wrappers first, accumulating them // as kernel-side `MData` layers. Lean represents `Mdata(a, Mdata(b, e))` diff --git a/crates/kernel/src/level.rs b/crates/kernel/src/level.rs index 0ff6df8c..1c899314 100644 --- a/crates/kernel/src/level.rs +++ b/crates/kernel/src/level.rs @@ -37,7 +37,7 @@ use std::collections::BTreeMap; use std::fmt; use std::sync::Arc; -use ix_common::env::{Name, UIMAX, UMAX, UPARAM, USUCC, UZERO}; +use ix_common::env::Name; use super::env::Addr; use super::mode::{KernelMode, MetaDisplay}; @@ -81,11 +81,33 @@ impl KUniv { Arc::ptr_eq(&self.0, &other.0) } - /// Structural equality by Merkle hash (pointer-first fast path). + /// Canonical-identity equality: `ptr_eq` or equal intern uid. Sound in + /// the affirmative; incomplete in the negative (independently built + /// equal levels carry distinct uids) — `==` adds the structural + /// fallback. pub fn hash_eq(&self, other: &KUniv) -> bool { self.ptr_eq(other) || self.addr() == other.addr() } + /// Structural equality mirroring the intern identity: `Param` display + /// names are NOT compared (matching the historical hash semantics). + /// Equal interned subtrees prune at the uid fast path. + fn structural_eq(&self, other: &Self) -> bool { + if self.hash_eq(other) { + return true; + } + match (self.data(), other.data()) { + (UnivData::Zero(_), UnivData::Zero(_)) => true, + (UnivData::Succ(a, _), UnivData::Succ(b, _)) => a.structural_eq(b), + (UnivData::Max(a1, b1, _), UnivData::Max(a2, b2, _)) + | (UnivData::IMax(a1, b1, _), UnivData::IMax(a2, b2, _)) => { + a1.structural_eq(a2) && b1.structural_eq(b2) + }, + (UnivData::Param(i, _, _), UnivData::Param(j, _, _)) => i == j, + _ => false, + } + } + /// True if this level is definitionally zero (Prop). pub fn is_zero(&self) -> bool { matches!(self.data(), UnivData::Zero(_)) @@ -130,14 +152,11 @@ impl KUniv { impl KUniv { pub fn zero() -> Self { - KUniv::new(UnivData::Zero(blake3::hash(&[UZERO]))) + KUniv::new(UnivData::Zero(super::expr::fresh_uid())) } pub fn succ(inner: KUniv) -> Self { - let mut hasher = blake3::Hasher::new(); - hasher.update(&[USUCC]); - hasher.update(inner.addr().as_bytes()); - KUniv::new(UnivData::Succ(inner, hasher.finalize())) + KUniv::new(UnivData::Succ(inner, super::expr::fresh_uid())) } /// Construct `max(a, b)` with Lean-style simplifications: @@ -193,11 +212,7 @@ impl KUniv { /// Raw `Max` constructor without simplification. Used by `max()` after /// all simplification opportunities are exhausted. fn max_raw(a: KUniv, b: KUniv) -> Self { - let mut hasher = blake3::Hasher::new(); - hasher.update(&[UMAX]); - hasher.update(a.addr().as_bytes()); - hasher.update(b.addr().as_bytes()); - KUniv::new(UnivData::Max(a, b, hasher.finalize())) + KUniv::new(UnivData::Max(a, b, super::expr::fresh_uid())) } /// Construct `imax(a, b)` with Lean-style simplifications: @@ -228,25 +243,18 @@ impl KUniv { return a; // imax(a, a) = a } // No simplification — construct raw IMax node. - let mut hasher = blake3::Hasher::new(); - hasher.update(&[UIMAX]); - hasher.update(a.addr().as_bytes()); - hasher.update(b.addr().as_bytes()); - KUniv::new(UnivData::IMax(a, b, hasher.finalize())) + KUniv::new(UnivData::IMax(a, b, super::expr::fresh_uid())) } pub fn param(idx: u64, name: M::MField) -> Self { - let mut hasher = blake3::Hasher::new(); - hasher.update(&[UPARAM]); - hasher.update(&idx.to_le_bytes()); - KUniv::new(UnivData::Param(idx, name, hasher.finalize())) + KUniv::new(UnivData::Param(idx, name, super::expr::fresh_uid())) } } -// Structural equality by Merkle hash. +// Structural equality (uid fast path, recursive fallback). impl PartialEq for KUniv { fn eq(&self, other: &Self) -> bool { - self.hash_eq(other) + self.structural_eq(other) } } @@ -725,8 +733,10 @@ mod tests { #[test] fn zero_hash_deterministic() { - assert_eq!(MU::zero().addr(), MU::zero().addr()); - assert_eq!(AU::zero().addr(), AU::zero().addr()); + // Identity is intern-assigned now: independent constructions carry + // distinct uids but remain structurally equal (`==`). + assert_eq!(MU::zero(), MU::zero()); + assert_eq!(AU::zero(), AU::zero()); } #[test] @@ -774,14 +784,14 @@ mod tests { fn meta_param_name_does_not_affect_hash() { let a = MU::param(0, mk_name("u")); let b = MU::param(0, mk_name("v")); - assert_eq!(a.addr(), b.addr()); + assert_eq!(a, b); } #[test] fn meta_param_same_name_same_hash() { let a = MU::param(0, mk_name("u")); let b = MU::param(0, mk_name("u")); - assert_eq!(a.addr(), b.addr()); + assert_eq!(a, b); } // ---- Anon mode: names erased ---- @@ -790,23 +800,31 @@ mod tests { fn anon_param_same_index_same_hash() { let a = AU::param(0, ()); let b = AU::param(0, ()); - assert_eq!(a.addr(), b.addr()); + assert_eq!(a, b); } // ---- Anon vs Meta structural hash matches (metadata erased) ---- #[test] fn anon_vs_meta_named_param_match() { + // Cross-mode structural identity is visible through the shallow + // intern keys, which are mode-independent. let anon = AU::param(0, ()); let meta = MU::param(0, mk_name("u")); - assert_eq!(anon.addr(), meta.addr()); + assert_eq!( + super::super::env::univ_key(&anon), + super::super::env::univ_key(&meta) + ); } #[test] fn anon_vs_meta_anon_param_same() { let anon = AU::param(0, ()); let meta = MU::param(0, Name::anon()); - assert_eq!(anon.addr(), meta.addr()); + assert_eq!( + super::super::env::univ_key(&anon), + super::super::env::univ_key(&meta) + ); } // ---- PartialEq ---- @@ -1011,8 +1029,8 @@ mod tests { // Same structure, different names — semantically equal let a = MU::param(0, mk_name("u")); let b = MU::param(0, mk_name("v")); - // Hashes are metadata-erased, and Géran comparison sees the same index. - assert_eq!(a.addr(), b.addr()); + // Identity is metadata-erased, and Géran comparison sees the same index. + assert_eq!(a, b); assert!(univ_eq(&a, &b)); } diff --git a/crates/kernel/src/subst.rs b/crates/kernel/src/subst.rs index 11ccac4a..32ac5cac 100644 --- a/crates/kernel/src/subst.rs +++ b/crates/kernel/src/subst.rs @@ -11,6 +11,9 @@ //! uses a `PtrMap Expr Expr` for the same reason (see //! `refs/lean4lean/Lean4Lean/Expr.lean:14`). +use std::cell::OnceCell; +use std::sync::Arc; + use rustc_hash::FxHashMap; use super::env::{Addr, InternTable}; @@ -488,6 +491,217 @@ fn lift_cached( interned } +// ============================================================================ +// Closures for the WHNF environment machine (see `whnf.rs` machine loop and +// `docs/env_machine_whnf.md`). Beta/zeta there are O(1) pushes onto an +// `MEnv`; substitution happens only here, at machine exit points +// ("readback"), and only for the parts of the term the reduction actually +// hands to a plain-expression consumer. +// ============================================================================ + +/// Expression closed over a machine environment: `Clo { e, env }` denotes +/// `e[Var(i) := env[i]]` for `i < env.len()`, with `Var(i)` for +/// `i >= env.len()` shifting DOWN by `env.len()` into the ambient context. +/// WHNF never reduces under binders, so environments never need lifting. +pub(crate) struct Clo { + pub(crate) e: KExpr, + pub(crate) env: MEnv, + /// Memoized depth-0 readback. Closures are shared (one env entry may be + /// referenced by many variables; spine args survive multiple machine + /// re-entries), and without a global content-addressed memo this is + /// what keeps each shared closure's substitution from re-running. + readback: OnceCell>, +} + +impl Clo { + pub(crate) fn new(e: KExpr, env: MEnv) -> Self { + Clo { e, env, readback: OnceCell::new() } + } + + /// Closure over the empty environment (a plain expression). + pub(crate) fn closed(e: KExpr) -> Self { + Clo::new(e, MEnv::empty()) + } +} + +struct MEnvNode { + head: Arc>, + tail: MEnv, +} + +/// Persistent cons-list environment: O(1) push with structural sharing +/// across the closures captured at each binder. `len` is carried on the +/// handle — recomputing it per suffix was a measured cost on the IxVM +/// port of this machine. +pub(crate) struct MEnv { + node: Option>>, + len: u64, +} + +// Manual impl: `#[derive(Clone)]` would demand `M: Clone`. +impl Clone for MEnv { + fn clone(&self) -> Self { + MEnv { node: self.node.clone(), len: self.len } + } +} + +impl MEnv { + pub(crate) fn empty() -> Self { + MEnv { node: None, len: 0 } + } + + pub(crate) fn len(&self) -> u64 { + self.len + } + + pub(crate) fn push(&self, c: Arc>) -> Self { + MEnv { + node: Some(Arc::new(MEnvNode { head: c, tail: self.clone() })), + len: self.len + 1, + } + } + + /// O(i) cons-list walk; `i` must be `< self.len()`. Machine variable + /// lookups are typically near the front (recently pushed args). + pub(crate) fn get(&self, i: u64) -> &Arc> { + let mut node = self.node.as_ref().expect("MEnv::get out of range"); + let mut i = i; + while i > 0 { + node = node.tail.node.as_ref().expect("MEnv::get out of range"); + i -= 1; + } + &node.head + } +} + +/// Materialize a closure into a plain expression: +/// `e[Var(i) := readback(env[i])]` for `i < env.len()`, `Var(j)` above +/// shifted down by `env.len()`. Memoized per closure. +pub(crate) fn clo_readback( + intern: &mut InternTable, + c: &Clo, +) -> KExpr { + if c.env.len() == 0 { + return c.e.clone(); + } + if let Some(v) = c.readback.get() { + return v.clone(); + } + let v = clo_subst(intern, &c.e, &c.env, 0); + let _ = c.readback.set(v.clone()); + v +} + +/// Simultaneous environment substitution at binder depth `depth` — the +/// machine's only substitution ("readback"). Var arm semantics: +/// `i < depth` → unchanged (locally bound) +/// `depth <= i < depth + n` → `lift(readback(env[i - depth]), depth)` +/// `i >= depth + n` → `Var(i - n)` +/// where `n = env.len()`. lbr-guarded at every node (a no-op subtree +/// returns its original Arc), per-call memoized by `(uid, depth)`, and +/// results interned — the same discipline as `subst`/`simul_subst`. +/// +/// The memo scratch comes from a pool (`clo_scratch_pool`) rather than a +/// single buffer because the Var arm re-enters `clo_subst` through +/// `clo_readback` of environment entries, each under a *different* +/// environment; nesting levels must not share memo entries. +pub(crate) fn clo_subst( + intern: &mut InternTable, + e: &KExpr, + env: &MEnv, + depth: u64, +) -> KExpr { + if env.len() == 0 || e.lbr() <= depth { + return e.clone(); + } + let mut cache = intern.clo_scratch_pool.pop().unwrap_or_default(); + let result = clo_subst_cached(intern, e, env, depth, &mut cache); + cache.clear(); + intern.clo_scratch_pool.push(cache); + result +} + +fn clo_subst_cached( + intern: &mut InternTable, + e: &KExpr, + env: &MEnv, + depth: u64, + cache: &mut FxHashMap<(Addr, u64), KExpr>, +) -> KExpr { + if e.lbr() <= depth { + return e.clone(); + } + + let key = (e.hash_key(), depth); + if let Some(cached) = cache.get(&key) { + return cached.clone(); + } + + let n = env.len(); + + let result = match e.data() { + ExprData::Var(i, name, _) => { + let i = *i; + if i < depth + n { + // `i < depth` is unreachable under the outer lbr guard, so this + // is an env hit: substitute the entry's readback, lifted over + // the local binders we are under. + let c = env.get(i - depth).clone(); + let v = clo_readback(intern, &c); + let r = lift(intern, &v, depth, 0); + cache.insert(key, r.clone()); + return r; + } + KExpr::var(i - n, name.clone()) + }, + + ExprData::App(f, x, _) => { + let f2 = clo_subst_cached(intern, f, env, depth, cache); + let x2 = clo_subst_cached(intern, x, env, depth, cache); + KExpr::app(f2, x2) + }, + + ExprData::Lam(name, bi, ty, inner, _) => { + let ty2 = clo_subst_cached(intern, ty, env, depth, cache); + let inner2 = clo_subst_cached(intern, inner, env, depth + 1, cache); + KExpr::lam(name.clone(), bi.clone(), ty2, inner2) + }, + + ExprData::All(name, bi, ty, inner, _) => { + let ty2 = clo_subst_cached(intern, ty, env, depth, cache); + let inner2 = clo_subst_cached(intern, inner, env, depth + 1, cache); + KExpr::all(name.clone(), bi.clone(), ty2, inner2) + }, + + ExprData::Let(name, ty, val, inner, nd, _) => { + let ty2 = clo_subst_cached(intern, ty, env, depth, cache); + let val2 = clo_subst_cached(intern, val, env, depth, cache); + let inner2 = clo_subst_cached(intern, inner, env, depth + 1, cache); + KExpr::let_(name.clone(), ty2, val2, inner2, *nd) + }, + + ExprData::Prj(id, field, val, _) => { + let val2 = clo_subst_cached(intern, val, env, depth, cache); + KExpr::prj(id.clone(), *field, val2) + }, + + ExprData::FVar(..) + | ExprData::Sort(..) + | ExprData::Const(..) + | ExprData::Nat(..) + | ExprData::Str(..) => { + // Closed atoms — unreachable under the lbr guard; defensive. + let r = e.clone(); + cache.insert(key, r.clone()); + return r; + }, + }; + + let interned = intern.intern_expr(result); + cache.insert(key, interned.clone()); + interned +} + /// Cheap beta reduction: peephole-reduce `App(λ...λ. body, args)` shapes /// without invoking the full [`subst`] machinery in trivial cases. /// diff --git a/crates/kernel/src/tc.rs b/crates/kernel/src/tc.rs index 257761f8..53d1c814 100644 --- a/crates/kernel/src/tc.rs +++ b/crates/kernel/src/tc.rs @@ -13,7 +13,7 @@ use ix_common::address::Address; use ixon::env::Env as IxonEnv; use super::constant::{KConst, RecRule}; -use super::env::{Addr, KEnv}; +use super::env::{Addr, CtxAddr, KEnv}; use super::equiv::EquivManager; use super::error::{TcError, u64_to_usize}; use super::expr::{ExprData, FVarId, KExpr}; @@ -29,9 +29,9 @@ use super::primitive::Primitives; use super::subst::{instantiate_rev, lift}; /// Content-addressed context identity for the empty context (no bindings). -pub fn empty_ctx_addr() -> Addr { +pub fn empty_ctx_addr() -> CtxAddr { use std::sync::LazyLock; - static ADDR: LazyLock = + static ADDR: LazyLock = LazyLock::new(|| blake3::hash(b"ix.kernel.ctx.empty")); *ADDR } @@ -126,9 +126,9 @@ pub struct TypeChecker<'a, M: KernelMode> { pub num_let_bindings: usize, /// Content-addressed context identity: a blake3 hash derived from the /// binding-type chain. Immune to the ABA pointer-reuse problem. - pub ctx_id: Addr, + pub ctx_id: CtxAddr, /// Stack of previous ctx_ids for O(1) pop. - ctx_id_stack: Vec, + ctx_id_stack: Vec, // -- Thread-local optimization -- /// Union-find for transitive def-eq caching (lean4lean EquivManager). @@ -183,7 +183,7 @@ pub struct TypeChecker<'a, M: KernelMode> { /// equal in the suffix-relevant prefix (`ctx_id` content-addresses the /// full context). The cache lifetime is the `TypeChecker` (one per /// `check_const`), so it is automatically reclaimed. - ctx_addr_cache: FxHashMap<(Addr, u64), Addr>, + ctx_addr_cache: FxHashMap<(CtxAddr, u64), CtxAddr>, // -- Free-variable infrastructure -- /// Local context for fvar-based binder opening. Some validation paths still @@ -355,7 +355,7 @@ impl<'a, M: KernelMode> TypeChecker<'a, M> { /// Sharing two distinct outer contexts that share a relevant suffix is the /// payoff: the same WHNF subterm can hit cache across them. #[inline] - pub fn whnf_key(&mut self, e: &KExpr) -> (Addr, Addr) { + pub fn whnf_key(&mut self, e: &KExpr) -> (Addr, CtxAddr) { (e.hash_key(), self.ctx_addr_for_lbr(e.lbr())) } @@ -366,7 +366,7 @@ impl<'a, M: KernelMode> TypeChecker<'a, M> { /// dependencies, so two equal open subterms can share an infer result across /// different outer binders when the relevant local suffix is identical. #[inline] - pub fn infer_key(&mut self, e: &KExpr) -> (Addr, Addr) { + pub fn infer_key(&mut self, e: &KExpr) -> (Addr, CtxAddr) { (e.hash_key(), self.ctx_addr_for_lbr(e.lbr())) } @@ -378,11 +378,11 @@ impl<'a, M: KernelMode> TypeChecker<'a, M> { /// expressions, so the relevant context is the suffix needed by the larger /// `lbr`. #[inline] - pub fn def_eq_ctx_key(&mut self, a: &KExpr, b: &KExpr) -> Addr { + pub fn def_eq_ctx_key(&mut self, a: &KExpr, b: &KExpr) -> CtxAddr { self.ctx_addr_for_lbr(a.lbr().max(b.lbr())) } - pub(crate) fn ctx_addr_for_lbr(&mut self, lbr: u64) -> Addr { + pub(crate) fn ctx_addr_for_lbr(&mut self, lbr: u64) -> CtxAddr { if lbr == 0 || self.ctx.is_empty() { return empty_ctx_addr(); } @@ -428,12 +428,12 @@ impl<'a, M: KernelMode> TypeChecker<'a, M> { match &self.let_vals[i] { Some(val) => { h.update(b"let"); - h.update(self.ctx[i].addr().as_bytes()); - h.update(val.addr().as_bytes()); + h.update(&self.ctx[i].addr().to_le_bytes()); + h.update(&val.addr().to_le_bytes()); }, None => { h.update(b"local"); - h.update(self.ctx[i].addr().as_bytes()); + h.update(&self.ctx[i].addr().to_le_bytes()); }, } } @@ -445,10 +445,13 @@ impl<'a, M: KernelMode> TypeChecker<'a, M> { } /// Push a local variable type (lambda/forall binding, no let-value). + /// The type is interned first: ctx suffix hashing keys on uids, so a + /// canonical frame is what lets equal suffixes share cache entries. pub fn push_local(&mut self, ty: KExpr) { + let ty = self.env.intern.intern_expr(ty); let mut h = blake3::Hasher::new(); h.update(b"ctx.local"); - h.update(ty.addr().as_bytes()); + h.update(&ty.addr().to_le_bytes()); h.update(self.ctx_id.as_bytes()); self.ctx_id_stack.push(self.ctx_id); self.ctx_id = h.finalize(); @@ -459,10 +462,12 @@ impl<'a, M: KernelMode> TypeChecker<'a, M> { /// Push a let-bound variable (type + value). WHNF will zeta-reduce references /// to this variable by substituting the value (lean4lean withExtendedLetCtx). pub fn push_let(&mut self, ty: KExpr, val: KExpr) { + let ty = self.env.intern.intern_expr(ty); + let val = self.env.intern.intern_expr(val); let mut h = blake3::Hasher::new(); h.update(b"ctx.let"); - h.update(ty.addr().as_bytes()); - h.update(val.addr().as_bytes()); + h.update(&ty.addr().to_le_bytes()); + h.update(&val.addr().to_le_bytes()); h.update(self.ctx_id.as_bytes()); self.ctx_id_stack.push(self.ctx_id); self.ctx_id = h.finalize(); @@ -982,7 +987,7 @@ impl<'a, M: KernelMode> TypeChecker<'a, M> { let ctx = self.ctx_addr_for_lbr(e.lbr()); key.push_str(&format!( " ctx={} depth={}", - short_addr(&ctx), + short_ctx_addr(&ctx), self.depth() )); } @@ -999,7 +1004,7 @@ impl<'a, M: KernelMode> TypeChecker<'a, M> { let ctx = self.def_eq_ctx_key(a, b); key.push_str(&format!( " ctx={} depth={}", - short_addr(&ctx), + short_ctx_addr(&ctx), self.depth() )); } @@ -1155,6 +1160,10 @@ fn hot_expr_shape(e: &KExpr) -> String { } fn short_addr(addr: &Addr) -> String { + format!("uid{addr}") +} + +fn short_ctx_addr(addr: &CtxAddr) -> String { addr.to_hex().chars().take(12).collect() } @@ -1259,13 +1268,19 @@ mod tests { #[test] fn ctx_id_same_pushes_yield_same_hash() { - let mut tc1 = new_tc(); - let mut tc2 = new_tc(); - tc1.push_local(sort0()); - tc1.push_local(sort1()); + // ctx hashing keys on intern uids, so the sharing property holds for + // checkers over the SAME env (which is also the cache-sharing scope). + let env: *mut KEnv = Box::leak(Box::new(KEnv::::new())); + let id1 = { + let mut tc1 = TypeChecker::new(unsafe { &mut *env }); + tc1.push_local(sort0()); + tc1.push_local(sort1()); + tc1.ctx_id + }; + let mut tc2 = TypeChecker::new(unsafe { &mut *env }); tc2.push_local(sort0()); tc2.push_local(sort1()); - assert_eq!(tc1.ctx_id, tc2.ctx_id); + assert_eq!(id1, tc2.ctx_id); } #[test] @@ -1354,19 +1369,22 @@ mod tests { // outer frame. A `var(0)` with lbr=1 should key only by the inner // suffix, so the two `whnf_key`s should match even though the outer // contexts (and hence ctx_ids) differ. - let mut tc1 = new_tc(); - tc1.push_local(sort0()); // outer A - tc1.push_local(sort1()); // inner X - - let mut tc2 = new_tc(); + let env: *mut KEnv = Box::leak(Box::new(KEnv::::new())); + let e = var(0); // lbr = 1, depends only on innermost frame + let (h1, ctx1, outer1) = { + let mut tc1 = TypeChecker::new(unsafe { &mut *env }); + tc1.push_local(sort0()); // outer A + tc1.push_local(sort1()); // inner X + let (h, c) = tc1.whnf_key(&e); + (h, c, tc1.ctx_id) + }; + let mut tc2 = TypeChecker::new(unsafe { &mut *env }); tc2.push_local(sort1()); // outer B (different from A) tc2.push_local(sort1()); // inner X (same as tc1's inner) // ctx_ids differ (different outer frames). - assert_ne!(tc1.ctx_id, tc2.ctx_id); + assert_ne!(outer1, tc2.ctx_id); - let e = var(0); // lbr = 1, depends only on innermost frame - let (h1, ctx1) = tc1.whnf_key(&e); let (h2, ctx2) = tc2.whnf_key(&e); assert_eq!(h1, h2); assert_eq!( diff --git a/crates/kernel/src/whnf.rs b/crates/kernel/src/whnf.rs index 78fa5df9..dc7319f7 100644 --- a/crates/kernel/src/whnf.rs +++ b/crates/kernel/src/whnf.rs @@ -2,7 +2,7 @@ //! //! Multi-phase: whnf_core (beta, iota, zeta) → proj → nat → quot → delta. -use std::sync::LazyLock; +use std::sync::{Arc, LazyLock}; use rustc_hash::FxHashSet; @@ -66,7 +66,9 @@ use super::expr::{ExprData, KExpr}; use super::id::KId; use super::level::KUniv; use super::mode::KernelMode; -use super::subst::{simul_subst, subst, subst_no_intern}; +use super::subst::{ + Clo, MEnv, clo_readback, clo_subst, subst, subst_no_intern, +}; use super::tc::{IotaInfo, MAX_WHNF_FUEL, TypeChecker, collect_app_spine}; use bignat::Nat; @@ -107,6 +109,80 @@ struct NatRecLiteralParts { major: Nat, base_idx: usize, step_idx: usize, + major_idx: usize, +} + +/// Which primitive-reducer family a head constant belongs to. The WHNF +/// loops used to probe every reducer per iteration — each collecting its +/// own app spine and running its own gauntlet of 32-byte address +/// compares. Classifying the head ONCE (memoized per address in +/// `KEnv::prim_family_cache`) lets an iteration call at most one family +/// reducer and skip everything for ordinary constants. Mirrors the IxVM +/// `prim_family` dispatch memo. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum PrimFamily { + Native, + BitVec, + Nat, + Decidable, + Str, + Other, +} + +fn prim_family_uncached( + p: &Primitives, + addr: &Address, +) -> PrimFamily { + if *addr == p.nat_succ.addr + || *addr == p.nat_add.addr + || *addr == p.nat_sub.addr + || *addr == p.nat_mul.addr + || *addr == p.nat_div.addr + || *addr == p.nat_mod.addr + || *addr == p.nat_pow.addr + || *addr == p.nat_gcd.addr + || *addr == p.nat_land.addr + || *addr == p.nat_lor.addr + || *addr == p.nat_xor.addr + || *addr == p.nat_shift_left.addr + || *addr == p.nat_shift_right.addr + || *addr == p.nat_beq.addr + || *addr == p.nat_ble.addr + { + return PrimFamily::Nat; + } + if *addr == p.nat_dec_le.addr + || *addr == p.nat_dec_eq.addr + || *addr == p.nat_dec_lt.addr + || *addr == p.int_dec_le.addr + || *addr == p.int_dec_eq.addr + || *addr == p.int_dec_lt.addr + { + return PrimFamily::Decidable; + } + if *addr == p.bit_vec_to_nat.addr + || *addr == p.bit_vec_ult.addr + || *addr == p.decidable_decide.addr + { + return PrimFamily::BitVec; + } + if *addr == p.punit_size_of_1.addr + || *addr == p.subtype_val.addr + || *addr == p.size_of_size_of.addr + || *addr == p.system_platform_num_bits.addr + || *addr == p.reduce_bool.addr + || *addr == p.reduce_nat.addr + { + return PrimFamily::Native; + } + if *addr == p.string_back.addr + || *addr == p.string_legacy_back.addr + || *addr == p.string_utf8_byte_size.addr + || *addr == p.string_to_byte_array.addr + { + return PrimFamily::Str; + } + PrimFamily::Other } impl TypeChecker<'_, M> { @@ -290,10 +366,17 @@ impl TypeChecker<'_, M> { break; } - // Native reduction: Lean.reduceBool, Lean.reduceNat, System.Platform.numBits - // (mirrors lean4 `type_checker.cpp:667-672` and lean4lean - // `TypeChecker.lean:438` — `reduce_native` runs before `reduce_nat`). - if let Some(reduced) = self.try_reduce_native(&cur)? { + // Primitive reduction, dispatched by memoized head family — the + // five recognizers below each collect their own spine and run their + // own address gauntlet, so probing all of them every iteration was + // a measurable tax on ordinary (Other-headed) terms. Semantics are + // unchanged: the families' head-address sets are disjoint, so at + // most one recognizer could fire anyway. Reference order preserved + // (native before nat: lean4 `type_checker.cpp:667-672`). + let family = self.head_prim_family(&cur); + if family == PrimFamily::Native + && let Some(reduced) = self.try_reduce_native(&cur)? + { cur = reduced; continue; } @@ -301,7 +384,9 @@ impl TypeChecker<'_, M> { // BitVec definitions reduce through Nat comparisons. Keep this before // delta so small definitional facts such as `x < 0#w` collapse // without unfolding the full Fin-backed representation of BitVec. - if let Some(reduced) = self.try_reduce_bitvec(&cur)? { + if family == PrimFamily::BitVec + && let Some(reduced) = self.try_reduce_bitvec(&cur)? + { cur = reduced; continue; } @@ -309,25 +394,46 @@ impl TypeChecker<'_, M> { // Nat primitive reduction in main WHNF loop (lean4lean TypeChecker.lean:439). // Must run BEFORE delta_unfold_one, so that Nat.sub/Nat.pow/etc. get // short-circuited before their bodies (which use Nat.rec) are exposed. - if let Some(reduced) = - self.try_reduce_nat_with_succ_mode(&cur, nat_succ_mode)? + if family == PrimFamily::Nat + && let Some(reduced) = + self.try_reduce_nat_with_succ_mode(&cur, nat_succ_mode)? { cur = reduced; continue; } // Nat decidability: Nat.decLe/decEq/decLt on literals → Decidable.isTrue/isFalse. // Must run BEFORE delta, so the body (which uses dite/Nat.rec) is never exposed. - if let Some(reduced) = self.try_reduce_decidable(&cur)? { + if family == PrimFamily::Decidable + && let Some(reduced) = self.try_reduce_decidable(&cur)? + { cur = reduced; continue; } // String literal primitives such as `String.back ""`. - if let Some(reduced) = self.try_reduce_string(&cur) { + if family == PrimFamily::Str + && let Some(reduced) = self.try_reduce_string(&cur) + { cur = reduced; continue; } + // Keep `Nat.add base (Lit n)` (symbolic base, n > 0) and + // `Nat.div/mod base (Lit k)` (k ≥ 2) STUCK as a compact offset instead + // of delta-unfolding: `Nat.add` would materialize succ^n(base) — O(n) + // substitution per layer — and `Nat.div/mod` would expand the division + // algorithm, even though both are irreducible for a symbolic base. + // (`Nat.shiftRight x k` unfolds to k nested `Nat.div _ 2`, which now + // stay stuck.) Iota over such a major still works via + // `cleanup_nat_offset_major`; def-eq decides offset pairs in + // `try_def_eq_offset`. Mirrors IxVM 5dcab7f/f7cfe23. + if family == PrimFamily::Nat + && let Some(stuck) = self.try_nat_offset_stuck(&cur)? + { + cur = stuck; + break; + } + if let Some(unfolded) = self.delta_unfold_one(&cur)? { cur = unfolded; continue; @@ -538,31 +644,15 @@ impl TypeChecker<'_, M> { let (f0, args) = collect_app_spine(&cur); let f = self.whnf_core_with_flags(&f0, flags)?; - // Multi-arg beta + // Beta: enter the environment machine. Subsequent betas/zetas are + // O(1) environment pushes; substitution materializes only at the + // machine's exit, which re-enters this loop as the new `cur` (so + // ambient zeta / iota / prim dispatch see exactly what the old + // eager-substitution path produced). Entry is gated on an actual + // beta firing — Const-headed terms (e.g. literal recursor loops) + // never pay the closure-wrap + readback overhead. if matches!(f.data(), ExprData::Lam(..)) { - let mut body = f; - // Pre-size: at most one arg is consumed per outer Lam, capped by - // `args.len()`. Pre-sizing skips the first growth reallocation - // for non-trivial spines on this hot path. - let mut consumed_args = Vec::with_capacity(args.len()); - while consumed_args.len() < args.len() { - if let ExprData::Lam(_, _, _, inner, _) = body.data() { - let inner = inner.clone(); - consumed_args.push(args[consumed_args.len()].clone()); - body = inner; - } else { - break; - } - } - let remaining_start = consumed_args.len(); - if !consumed_args.is_empty() { - consumed_args.reverse(); - body = simul_subst(&mut self.env.intern, &body, &consumed_args, 0); - } - for arg in &args[remaining_start..] { - body = self.intern(KExpr::app(body, arg.clone())); - } - cur = body; + cur = self.machine_whnf(f, &args, &mut fuel, flags)?; continue; } @@ -589,6 +679,324 @@ impl TypeChecker<'_, M> { } } + /// The WHNF environment machine (Krivine-style, structural fragment). + /// See `docs/env_machine_whnf.md`; mirrors the IxVM `mwhnf_spine`. + /// + /// Entered from the App arm of [`Self::whnf_core_with_flags_uncached`] + /// when a beta is about to fire: `head` is the already-whnf_core'd + /// `Lam`, `args` the plain argument spine nearest-first. Machine state + /// is `(head, env, spine)`: `head` a raw subterm of the program, `env` + /// the closure environment its loose `Var`s refer to, and `spine` the + /// pending argument closures used as a stack — NEAREST argument last. + /// + /// Transitions (each O(1), no substitution): + /// `App(f, a)` push `Clo{a, env}`; descend to `f` + /// `Lam` + pending arg pop spine, push env (**beta**) + /// `Let` push `Clo{val, env}` (**zeta**) + /// `Var(i)`, `i < n` jump into `env[i]`'s closure + /// + /// Everything else EXITS: the head and remaining spine read back to a + /// plain expression (`clo_subst` — work proportional to what the + /// reduction actually consumes), and the caller's loop continues with + /// it. That exit contract keeps ambient let/LDecl zeta (the outer + /// Var/FVar arms), iota, projection reduction under the `flags` cheap + /// policy, and prim dispatch byte-identical to the eager path the + /// machine replaces. + /// + /// Fuel: beta and zeta charge the caller's budget — the same + /// one-substitution-event-per-tick granularity as the eager path. + /// The transitions between charges (App peels, var jumps) are bounded + /// by program structure reachable from the fueled pushes, so total + /// work stays fuel-bounded. + fn machine_whnf( + &mut self, + head: KExpr, + args: &[KExpr], + fuel: &mut u32, + flags: WhnfFlags, + ) -> Result, TcError> { + let mut head = head; + let mut env: MEnv = MEnv::empty(); + let mut spine: Vec>> = + args.iter().rev().map(|a| Arc::new(Clo::closed(a.clone()))).collect(); + + loop { + match head.data() { + ExprData::App(f, a, _) => { + let c = Arc::new(Clo::new(a.clone(), env.clone())); + let f = f.clone(); + spine.push(c); + head = f; + }, + + ExprData::Lam(name, bi, ty, body, _) => { + if let Some(c) = spine.pop() { + // Beta: O(1) environment push. + if *fuel == 0 { + return Err(TcError::MaxRecDepth); + } + *fuel -= 1; + let body = body.clone(); + env = env.push(c); + head = body; + } else { + // Value. Read back under one binder. + if env.len() == 0 { + return Ok(head.clone()); + } + let ty2 = clo_subst(&mut self.env.intern, ty, &env, 0); + let body2 = clo_subst(&mut self.env.intern, body, &env, 1); + return Ok(self.intern(KExpr::lam( + name.clone(), + bi.clone(), + ty2, + body2, + ))); + } + }, + + ExprData::Let(_, _, val, body, _, _) => { + // Zeta: O(1) environment push. + if *fuel == 0 { + return Err(TcError::MaxRecDepth); + } + *fuel -= 1; + let c = Arc::new(Clo::new(val.clone(), env.clone())); + let body = body.clone(); + env = env.push(c); + head = body; + }, + + ExprData::Var(i, name, _) => { + let i = *i; + if i < env.len() { + // Machine-bound variable: jump into its closure. This is + // where laziness pays — the entry was never substituted. + let c = env.get(i).clone(); + head = c.e.clone(); + env = c.env.clone(); + } else { + // Ambient variable: shift below the machine binders and + // exit stuck; the outer loop's Var arm handles legacy + // let-bound zeta on it. + let h = self.intern(KExpr::var(i - env.len(), name.clone())); + return Ok(self.machine_exit(h, &spine)); + } + }, + + // Closure-iota (Phase B; mirrors IxVM try_iota_c): a recursor + // head consumes the spine LAZILY. On the main ctor-rule path the + // rule RHS re-enters the machine with the params/motives/minors + // and post-major arguments as their ORIGINAL closures (plus the + // ctor's fields wrapped closed) — the rule's Lam-chain betas push + // them straight into an environment, so unselected minors + // (dropped match/Decidable branches) are never substituted and + // never read back. Anything off the main path misses to the + // plain readback exit, whose `try_iota` redoes the major's whnf + // against a warm cache. Gated like the eager path: cheap_rec + // mode never runs full iota from inside the machine. + ExprData::Const(id, us, _) => { + if !flags.cheap_rec { + let id = id.clone(); + let us: Vec> = us.to_vec(); + if let Some((rhs, new_spine)) = + self.try_iota_clo(&id, &us, &spine)? + { + if *fuel == 0 { + return Err(TcError::MaxRecDepth); + } + *fuel -= 1; + head = rhs; + env = MEnv::empty(); + spine = new_spine; + continue; + } + } + let h = head.clone(); + return Ok(self.machine_exit(h, &spine)); + }, + + // Stuck or dispatch-owned heads (no loose Vars of their own): + // exit; the outer loop applies prim dispatch / LDecl zeta + // exactly as before. + ExprData::FVar(..) + | ExprData::Sort(..) + | ExprData::Nat(..) + | ExprData::Str(..) => { + let h = head.clone(); + return Ok(self.machine_exit(h, &spine)); + }, + + // Pi value (or ill-typed Pi application): read back; outer loop + // returns it (spine empty) or leaves it stuck (non-empty), as + // the eager path did. + ExprData::All(name, bi, ty, body, _) => { + let h = if env.len() == 0 { + head.clone() + } else { + let ty2 = clo_subst(&mut self.env.intern, ty, &env, 0); + let body2 = clo_subst(&mut self.env.intern, body, &env, 1); + self.intern(KExpr::all(name.clone(), bi.clone(), ty2, body2)) + }; + return Ok(self.machine_exit(h, &spine)); + }, + + // Projection: read the scrutinee back and exit; the outer loop + // reduces the projection under the caller's `flags` cheap/full + // policy (its Prj arm, or the App arm's head reduction when the + // spine is non-empty). + ExprData::Prj(id, field, val, _) => { + let h = if env.len() == 0 { + head.clone() + } else { + let val2 = clo_subst(&mut self.env.intern, val, &env, 0); + self.intern(KExpr::prj(id.clone(), *field, val2)) + }; + return Ok(self.machine_exit(h, &spine)); + }, + } + } + } + + /// Read the machine's remaining spine back onto an exit head: + /// `h a₁ … aₙ` with each argument materialized via [`clo_readback`]. + /// `spine` holds the nearest argument LAST. + fn machine_exit(&mut self, h: KExpr, spine: &[Arc>]) -> KExpr { + let mut cur = h; + for c in spine.iter().rev() { + let a = clo_readback(&mut self.env.intern, c); + cur = self.intern(KExpr::app(cur, a)); + } + cur + } + + /// Closure-spine iota for the machine's `Const` exit: the main + /// ctor-rule path of [`Self::try_iota_with_flags`], consuming the + /// machine spine lazily. Returns the level-instantiated rule RHS and + /// the machine spine to re-enter with — the params/motives/minors and + /// post-major arguments ride through as their original closures + /// (never read back; only the major materializes), the ctor's field + /// arguments are wrapped closed. + /// + /// Everything off the main path returns `None`, and the caller falls + /// back to the plain readback exit whose `try_iota` is complete: + /// - K recursors (nullary-ctor synthesis needs infer + def-eq), + /// - `Nat` literal majors (the transient-work discipline must keep + /// literal succ-chains out of the interner and the whnf caches, and + /// the linear-rec/offset shortcuts live on the plain path), + /// - `Str` literal majors (ctor coercion + re-whnf), + /// - struct-eta candidates and genuinely stuck majors. + /// + /// A miss costs one readback of the major closure; the plain path's + /// own whnf of that major then hits a warm cache. + fn try_iota_clo( + &mut self, + rec_id: &KId, + rec_us: &[KUniv], + spine: &[Arc>], + ) -> Result, Vec>>)>, TcError> { + let recr = match self.try_get_const(rec_id)? { + Some(KConst::Recr { + k, + params, + motives, + minors, + indices, + rules, + lvls, + .. + }) => { + if k { + return Ok(None); + } + let major_idx = u64_to_usize::(params + motives + minors + indices)?; + if spine.len() <= major_idx { + return Ok(None); + } + // H6: level params arity (lean4lean Reduce.lean:76). + if rec_us.len() as u64 != lvls { + return Ok(None); + } + IotaInfo { + k, + params: u64_to_usize::(params)?, + motives: u64_to_usize::(motives)?, + minors: u64_to_usize::(minors)?, + indices: u64_to_usize::(indices)?, + major_idx, + rules, + lvls, + } + }, + _ => return Ok(None), + }; + + // Materialize ONLY the major. `spine` is nearest-LAST, so the i-th + // argument nearest-first sits at `spine[len - 1 - i]`. + let len = spine.len(); + let major_clo = spine[len - 1 - recr.major_idx].clone(); + let major = clo_readback(&mut self.env.intern, &major_clo); + // Mirror the eager path's pre- and post-whnf offset cleanups so + // `Nat.add base k` majors expose a `Nat.succ` layer here instead of + // missing to a second full attempt. + let major = match self.cleanup_nat_offset_major(&major)? { + Some(cleaned) => cleaned, + None => major, + }; + let mut major_whnf = self.whnf(&major)?; + if matches!(major_whnf.data(), ExprData::Nat(..)) { + return Ok(None); + } + if let Some(cleaned) = self.cleanup_nat_offset_major(&major_whnf)? { + major_whnf = cleaned; + } + if matches!(major_whnf.data(), ExprData::Str(..)) { + return Ok(None); + } + + let (ctor_head, ctor_args) = collect_app_spine(&major_whnf); + let ctor_id = match ctor_head.data() { + ExprData::Const(id, _, _) => id.clone(), + _ => return Ok(None), + }; + let (cidx, ctor_fields) = match self.try_get_const(&ctor_id)? { + Some(KConst::Ctor { cidx, fields, .. }) => { + (u64_to_usize::(cidx)?, u64_to_usize::(fields)?) + }, + _ => return Ok(None), + }; + if cidx >= recr.rules.len() { + return Ok(None); + } + // H5: nfields ≤ major args (lean4lean Reduce.lean:75). + if ctor_fields > ctor_args.len() { + return Ok(None); + } + + crate::perf::record_iota_histo(&rec_id.addr); + let rule = &recr.rules[cidx]; + let rec_us_vec: Vec<_> = rec_us.to_vec(); + let rhs = self.instantiate_univ_params(&rule.rhs, &rec_us_vec)?; + + // New machine spine, nearest-LAST. Nearest-first the rule sees + // `pmm ++ ctor fields ++ post-major`; in machine order that is the + // post-major prefix of `spine` (indices below the major's slot), + // then the fields reversed, then the pmm suffix of `spine`. The + // index arguments between pmm and the major are dropped, as in the + // eager path. + let pmm_end = recr.params + recr.motives + recr.minors; + let field_start = ctor_args.len() - ctor_fields; + let post_len = len - 1 - recr.major_idx; + let mut new_spine: Vec>> = + Vec::with_capacity(post_len + ctor_fields + pmm_end); + new_spine.extend_from_slice(&spine[..post_len]); + for arg in ctor_args[field_start..].iter().rev() { + new_spine.push(Arc::new(Clo::closed(arg.clone()))); + } + new_spine.extend_from_slice(&spine[len - pmm_end..]); + Ok(Some((rhs, new_spine))) + } + /// WHNF without delta: whnf_core → proj-app → nat/native/string → quot. /// Projection values use full WHNF, preserving the public/full semantics. pub fn whnf_no_delta( @@ -696,15 +1104,23 @@ impl TypeChecker<'_, M> { continue; } + // Primitive reduction, dispatched by memoized head family (see the + // main WHNF loop) — family head sets are disjoint, so dispatching + // replaces probe-everything without changing semantics. + let family = self.head_prim_family(&cur); + // BitVec.toNat/ult reductions are definitional wrappers around Nat. - if let Some(reduced) = self.try_reduce_bitvec(&cur)? { + if family == PrimFamily::BitVec + && let Some(reduced) = self.try_reduce_bitvec(&cur)? + { cur = reduced; continue; } // Nat primitive reduction - if let Some(reduced) = - self.try_reduce_nat_with_succ_mode(&cur, nat_succ_mode)? + if family == PrimFamily::Nat + && let Some(reduced) = + self.try_reduce_nat_with_succ_mode(&cur, nat_succ_mode)? { cur = reduced; continue; @@ -715,13 +1131,17 @@ impl TypeChecker<'_, M> { // `Subtype.val` and `String.toByteArray` are projection definitions; // once rewritten to `Prj`, the cheap primitive recognizers no longer // see the original head. - if let Some(reduced) = self.try_reduce_native(&cur)? { + if family == PrimFamily::Native + && let Some(reduced) = self.try_reduce_native(&cur)? + { cur = reduced; continue; } // String literal primitives. - if let Some(reduced) = self.try_reduce_string(&cur) { + if family == PrimFamily::Str + && let Some(reduced) = self.try_reduce_string(&cur) + { cur = reduced; continue; } @@ -885,7 +1305,9 @@ impl TypeChecker<'_, M> { minors: u64_to_usize::(minors)?, indices: u64_to_usize::(indices)?, major_idx, - rules: rules.clone(), + // `rules` is already owned here (moved out of the KConst clone + // `try_get_const` returned) — do not clone it again. + rules, lvls, } }, @@ -1066,23 +1488,43 @@ impl TypeChecker<'_, M> { /// Nat literal iota can create a long chain of distinct predecessor terms. /// These terms are useful only while the current WHNF is executing; keeping /// each one in the global WHNF caches makes RSS linear in the literal. + /// Allocation-free spine probe: head expression and arg count without + /// materializing the spine. The transient-nat probes below run on + /// *every* whnf call **before** the cache lookup, so they must not + /// heap-allocate on the (overwhelmingly common) non-Nat-recursor path — + /// the previous implementation paid two `collect_app_spine` Vec + /// allocations plus a `KConst::Recr` clone per call, defeating the + /// cache on the hottest path in a full check. (Ported from jcb/fixes + /// H-15.) + fn spine_head_and_len(e: &KExpr) -> (&KExpr, usize) { + let mut cur = e; + let mut n = 0usize; + while let ExprData::App(f, _, _) = cur.data() { + n += 1; + cur = f; + } + (cur, n) + } + fn is_transient_nat_literal_work( &mut self, e: &KExpr, ) -> Result> { - if self.is_nat_literal_recursor_app(e)? { - return Ok(true); - } - - let (head, args) = collect_app_spine(e); + let (head, nargs) = Self::spine_head_and_len(e); let ExprData::Const(id, _, _) = head.data() else { return Ok(false); }; - - if id.addr == self.prims.nat_succ.addr && args.len() == 1 { - return self.is_nat_literal_recursor_app(&args[0]); + if id.addr == self.prims.nat_rec.addr + || id.addr == self.prims.nat_cases_on.addr + { + return self.is_nat_literal_recursor_app(e); + } + if id.addr == self.prims.nat_succ.addr + && nargs == 1 + && let ExprData::App(_, arg, _) = e.data() + { + return self.is_nat_literal_recursor_app(arg); } - Ok(false) } @@ -1090,7 +1532,9 @@ impl TypeChecker<'_, M> { &mut self, e: &KExpr, ) -> Result> { - let (head, spine) = collect_app_spine(e); + // Cheap pre-filter first; only fall through to the allocating spine + // collection + recursor lookup when the head can actually match. + let (head, _) = Self::spine_head_and_len(e); let ExprData::Const(id, _, _) = head.data() else { return Ok(false); }; @@ -1105,6 +1549,7 @@ impl TypeChecker<'_, M> { else { return Ok(false); }; + let (_, spine) = collect_app_spine(e); let major_idx = u64_to_usize::(params + motives + minors + indices)?; Ok( spine @@ -1233,6 +1678,114 @@ impl TypeChecker<'_, M> { None } + /// If `e` is `Nat.add base (Lit n)` (n > 0) or `Nat.div/mod base (Lit k)` + /// (k ≥ 2) with a non-literal base, return the same operation in canonical + /// compact form so the WHNF loop can leave it stuck instead of + /// delta-unfolding. Thresholds keep `x + 0`, `x / 1`, `x / 0` (and mod) + /// reducing through the normal path. `None` means "not this shape — + /// proceed normally". + fn try_nat_offset_stuck( + &mut self, + e: &KExpr, + ) -> Result>, TcError> { + // Allocation-free quick reject: this probe runs once per delta-unfold + // loop iteration, so don't collect a spine Vec unless the head constant + // is one of the three Nat primitives. + { + let mut cur = e; + loop { + match cur.data() { + ExprData::App(f, _, _) => cur = f, + ExprData::Const(id, _, _) => { + if id.addr != self.prims.nat_add.addr + && id.addr != self.prims.nat_div.addr + && id.addr != self.prims.nat_mod.addr + { + return Ok(None); + } + break; + }, + _ => return Ok(None), + } + } + } + let (head, args) = collect_app_spine(e); + let ExprData::Const(id, _, _) = head.data() else { + return Ok(None); + }; + let is_add = id.addr == self.prims.nat_add.addr; + let is_divmod = + id.addr == self.prims.nat_div.addr || id.addr == self.prims.nat_mod.addr; + if (!is_add && !is_divmod) || args.len() != 2 { + return Ok(None); + } + let Some(wb) = self.whnf_nat_reducer_arg(&args[1])? else { + return Ok(None); + }; + let Some(n) = extract_nat_value(&wb, &self.prims) else { + return Ok(None); + }; + if n.0 == num_bigint::BigUint::ZERO { + return Ok(None); + } + if is_divmod && n.0 == num_bigint::BigUint::from(1u64) { + return Ok(None); + } + let Some(wa) = self.whnf_nat_reducer_arg(&args[0])? else { + return Ok(None); + }; + if extract_nat_value(&wa, &self.prims).is_some() { + // Both sides literal: this is closed arithmetic for `try_reduce_nat`, + // not a stuck offset. + return Ok(None); + } + let lit = self.nat_expr_from_value(n); + let inner = self.intern(KExpr::app(head.clone(), wa)); + Ok(Some(self.intern(KExpr::app(inner, lit)))) + } + + /// Decompose a (whnf'd) Nat term into `(base, offset)` for offset-aware + /// def-eq: `Lit n` → `(None, n)`; `succ^j(Nat.add core (Lit m))` → + /// `(Some(core), j + m)` read in O(1) per layer via [`Self::nat_offset`] + /// instead of peeled one `is_def_eq` recursion level at a time. + /// `base = None` means the core is literal zero (the term IS a numeral). + /// `None` (the outer Option) means "not offset-shaped". + pub(super) fn nat_offset_decompose( + &mut self, + e: &KExpr, + ) -> Result>, Nat)>, TcError> { + if let Some(v) = extract_nat_value(e, &self.prims) { + return Ok(Some((None, v))); + } + match self.nat_offset(e, 0)? { + Some((base, off)) if off.0 > num_bigint::BigUint::ZERO => { + if let Some(bv) = extract_nat_value(&base, &self.prims) { + Ok(Some((None, Nat(bv.0 + off.0)))) + } else { + Ok(Some((Some(base), off))) + } + }, + _ => Ok(None), + } + } + + /// Rebuild `base + r` in the compact offset form used by + /// [`Self::try_nat_offset_stuck`]. + pub(super) fn nat_offset_rebuild( + &mut self, + base: Option>, + r: Nat, + ) -> KExpr { + match base { + None => self.nat_expr_from_value(r), + Some(b) if r.0 == num_bigint::BigUint::ZERO => b, + Some(b) => { + let lit = self.nat_expr_from_value(r); + self.mk_nat_add(b, lit) + }, + } + } + fn mk_nat_succ(&mut self, pred: KExpr) -> KExpr { let succ = KExpr::cnst(self.prims.nat_succ.clone(), Box::new([])); KExpr::app(succ, pred) @@ -1694,6 +2247,26 @@ impl TypeChecker<'_, M> { } /// Nat primitive reduction (add, sub, mul, div, mod, pow, gcd, bitwise, predicates). + /// Classify `e`'s head constant into a primitive-reducer family. + /// Allocation-free app-chain walk + per-address memo. + pub(super) fn head_prim_family(&mut self, e: &KExpr) -> PrimFamily { + let mut cur = e; + loop { + match cur.data() { + ExprData::App(f, _, _) => cur = f, + ExprData::Const(id, _, _) => { + if let Some(&fam) = self.env.prim_family_cache.get(&id.addr) { + return fam; + } + let fam = prim_family_uncached(&self.prims, &id.addr); + self.env.prim_family_cache.insert(id.addr.clone(), fam); + return fam; + }, + _ => return PrimFamily::Other, + } + } + } + pub(super) fn try_reduce_nat( &mut self, e: &KExpr, @@ -1803,7 +2376,7 @@ impl TypeChecker<'_, M> { if self.env.nat_succ_stuck.contains(&entry_key) { return Ok(None); } - let mut visited: Vec<(super::env::Addr, super::env::Addr)> = + let mut visited: Vec<(super::env::Addr, super::env::CtxAddr)> = vec![entry_key]; let mut offset = num_bigint::BigUint::from(1u64); let mut cur = arg.clone(); @@ -1885,7 +2458,20 @@ impl TypeChecker<'_, M> { let base = base.clone(); let base_whnf = self.whnf(&base)?; let Some(base_val) = extract_nat_value(&base_whnf, &self.prims) else { - return Ok(None); + // Symbolic base: collapse `succ^offset(Nat.rec base succ-step (Lit n))` + // to the compact offset `Nat.add base (Lit (n + offset))` rather than + // declining into n iota steps that materialize succ^n(base). Keeps the + // value in the same `base + k` form a literal already has, so def-eq + // converges instead of descending n unary succ layers. Conservative: + // only when the recursor application carries no post-major arguments. + // Mirrors IxVM dbc4177. + if parts.spine.len() != parts.major_idx + 1 { + return Ok(None); + } + let total = Nat(&parts.major.0 + offset); + let lit = self.nat_expr_from_value(total); + let result = self.mk_nat_add(base_whnf, lit); + return Ok(Some(result)); }; let mut total = base_val.0; @@ -1932,7 +2518,7 @@ impl TypeChecker<'_, M> { }; let major = major.clone(); - Ok(Some(NatRecLiteralParts { spine, major, base_idx, step_idx })) + Ok(Some(NatRecLiteralParts { spine, major, base_idx, step_idx, major_idx })) } fn is_nat_succ_ih_step( @@ -2908,6 +3494,15 @@ fn compute_nat_bin( b: &Nat, ) -> Option { use num_bigint::BigUint; + // Output-size cap for natively-computed results. `shiftLeft` and `pow` + // amplify a tiny term into a result exponentially larger than the input + // bytes (`Nat.shiftLeft 1 (1 << 40)` is a ~12-byte term demanding a + // ~128 GiB allocation). Refuse to materialize anything wider and leave + // the term stuck, exactly like the `pow` exponent guard below. 2^26 bits + // (8 MiB) keeps every `2^e, e <= ReducePowMaxExp` reduction the C++ + // kernel performs while bounding the allocation an adversarial term can + // force. (Ported from jcb/fixes H-12.) + const MAX_NAT_REDUCE_BITS: u64 = 1 << 26; let zero = BigUint::ZERO; let r = if *addr == p.nat_add.addr { &a.0 + &b.0 @@ -2924,7 +3519,14 @@ fn compute_nat_bin( const REDUCE_POW_MAX_EXP: u64 = 1 << 24; // 16_777_216 match b.to_u64() { #[allow(clippy::cast_possible_truncation)] // guarded: exp <= 2^24 - Some(exp) if exp <= REDUCE_POW_MAX_EXP => a.0.pow(exp as u32), + Some(exp) if exp <= REDUCE_POW_MAX_EXP => { + // The exponent cap alone does not bound the *result*: a wide base + // with an allowed exponent still explodes (bits(a) * exp). + if a.0.bits().saturating_mul(exp) > MAX_NAT_REDUCE_BITS { + return None; + } + a.0.pow(exp as u32) + }, _ => return None, // too large to compute } } else if *addr == p.nat_gcd.addr { @@ -2936,8 +3538,16 @@ fn compute_nat_bin( } else if *addr == p.nat_xor.addr { &a.0 ^ &b.0 } else if *addr == p.nat_shift_left.addr { - let shift = usize::try_from(b.to_u64()?).ok()?; - &a.0 << shift + let shift = b.to_u64()?; + if a.0 == zero { + zero + } else if a.0.bits().saturating_add(shift) > MAX_NAT_REDUCE_BITS { + // Result has exactly bits(a) + shift bits — refuse to materialize. + return None; + } else { + let shift = usize::try_from(shift).ok()?; + &a.0 << shift + } } else if *addr == p.nat_shift_right.addr { let shift = usize::try_from(b.to_u64()?).ok()?; &a.0 >> shift @@ -3195,6 +3805,166 @@ mod tests { assert_eq!(tc.whnf(&app).unwrap(), sort0()); } + // ---- environment-machine readback paths ---- + + #[test] + fn whnf_machine_open_term_shift() { + let mut env = env_with_id(); + let mut tc = TypeChecker::new(&mut env); + // (λ x. x #3) opaque → opaque #2: the ambient Var must shift down + // past the consumed machine binder during spine readback. + let opaque = AE::cnst(mk_id("opaque"), Box::new([])); + let body = AE::app(AE::var(0, ()), AE::var(3, ())); + let lam = AE::lam((), (), sort0(), body); + let app = AE::app(lam, opaque.clone()); + let expected = AE::app(opaque, AE::var(2, ())); + assert_eq!(tc.whnf(&app).unwrap(), expected); + } + + #[test] + fn whnf_machine_partial_app_value() { + let mut env = env_with_id(); + let mut tc = TypeChecker::new(&mut env); + // (λ x y. x y) opaque → λ y. opaque y: Lam value exit reads the body + // back at binder depth 1. + let opaque = AE::cnst(mk_id("opaque"), Box::new([])); + let body = AE::app(AE::var(1, ()), AE::var(0, ())); + let lam2 = AE::lam((), (), sort0(), AE::lam((), (), sort0(), body)); + let app = AE::app(lam2, opaque.clone()); + let expected = AE::lam((), (), sort0(), AE::app(opaque, AE::var(0, ()))); + assert_eq!(tc.whnf(&app).unwrap(), expected); + } + + #[test] + fn whnf_machine_env_entry_lifts_under_binder() { + let mut env = env_with_id(); + let mut tc = TypeChecker::new(&mut env); + // (λ x y. x) #3 → λ y. #4: substituting an OPEN argument under the + // surviving binder must lift it (Var arm of clo_subst at depth 1). + let lam2 = + AE::lam((), (), sort0(), AE::lam((), (), sort0(), AE::var(1, ()))); + let app = AE::app(lam2, AE::var(3, ())); + let expected = AE::lam((), (), sort0(), AE::var(4, ())); + assert_eq!(tc.whnf(&app).unwrap(), expected); + } + + #[test] + fn whnf_machine_zeta_under_beta() { + let mut env = env_with_id(); + let mut tc = TypeChecker::new(&mut env); + // (λ x. let z := x in z) opaque → opaque: machine zeta push. + let opaque = AE::cnst(mk_id("opaque"), Box::new([])); + let body = AE::let_((), sort0(), AE::var(0, ()), AE::var(0, ()), true); + let lam = AE::lam((), (), sort0(), body); + let app = AE::app(lam, opaque.clone()); + assert_eq!(tc.whnf(&app).unwrap(), opaque); + } + + #[test] + fn whnf_machine_closure_iota_via_beta() { + use super::super::constant::RecRule; + // A beta whose body is a recursor application: the machine's Const + // exit must take the closure-iota path (rule args ride through as + // closures) and produce the same result as the eager iota. + // + // Unit-like inductive `U` with one ctor `U.mk` (no params/fields) + // and recursor `U.rec : (motive) (minor) (major) → minor`. + let u_id = mk_id("Test.U"); + let u_mk_id = mk_id("Test.U.mk"); + let u_rec_id = mk_id("Test.U.rec"); + let mut env = env_with_id(); + env.insert( + u_id.clone(), + KConst::Indc { + name: (), + level_params: (), + lvls: 0, + params: 0, + indices: 0, + is_rec: true, // disable struct-eta so the ctor path is the only route + is_refl: false, + is_unsafe: false, + nested: 0, + block: u_id.clone(), + member_idx: 0, + ty: sort0(), + ctors: vec![u_mk_id.clone()], + lean_all: (), + }, + ); + env.insert( + u_mk_id.clone(), + KConst::Ctor { + name: (), + level_params: (), + is_unsafe: false, + lvls: 0, + induct: u_id.clone(), + cidx: 0, + params: 0, + fields: 0, + ty: AE::cnst(u_id.clone(), Box::new([])), + }, + ); + env.insert( + u_rec_id.clone(), + KConst::Recr { + name: (), + level_params: (), + k: false, + is_unsafe: false, + lvls: 0, + params: 0, + indices: 0, + motives: 1, + minors: 1, + block: u_id.clone(), + member_idx: 0, + ty: sort0(), + // rule rhs: λ motive minor. minor + rules: vec![RecRule { + ctor: (), + fields: 0, + rhs: AE::lam( + (), + (), + sort0(), + AE::lam((), (), sort0(), AE::var(0, ())), + ), + }], + lean_all: (), + }, + ); + let mut tc = TypeChecker::new(&mut env); + let rec = AE::cnst(u_rec_id, Box::new([])); + let mk = AE::cnst(u_mk_id, Box::new([])); + let opaque = AE::cnst(mk_id("opaque"), Box::new([])); + // (λ m. U.rec Sort0 m U.mk #2) opaque + // → machine beta (m := opaque), Const(U.rec) exit → closure-iota + // → minor = opaque, post-major arg #3 shifts to #2. + let body = AE::app( + AE::app(AE::app(AE::app(rec, sort0()), AE::var(0, ())), mk), + AE::var(3, ()), + ); + let lam = AE::lam((), (), sort0(), body); + let app = AE::app(lam, opaque.clone()); + let expected = AE::app(opaque, AE::var(2, ())); + assert_eq!(tc.whnf(&app).unwrap(), expected); + } + + #[test] + fn whnf_machine_chained_beta() { + let mut env = env_with_id(); + let mut tc = TypeChecker::new(&mut env); + // ((λ x. x) (λ y. y)) opaque → opaque: the intermediate beta result + // is consumed by the next beta without materializing. + let inner = AE::lam((), (), sort0(), AE::var(0, ())); + let outer = AE::lam((), (), sort0(), AE::var(0, ())); + let opaque = AE::cnst(mk_id("opaque"), Box::new([])); + let app = AE::app(AE::app(outer, inner), opaque.clone()); + assert_eq!(tc.whnf(&app).unwrap(), opaque); + } + #[test] fn whnf_delta_opaque_blocked() { let mut env = env_with_id(); @@ -3883,7 +4653,13 @@ mod tests { } #[test] - fn whnf_nat_add_symbolic_literal_rhs_exposes_succ() { + fn whnf_nat_add_symbolic_literal_rhs_stays_compact() { + // `Nat.add x (Lit 2)` with a symbolic base must stay STUCK in compact + // offset form (`try_nat_offset_stuck`) instead of delta-unfolding into + // `succ(succ(x))` — that tower is O(n) substitution per layer and the + // dominant cost on Nat-arithmetic proofs. Iota over such a major still + // works via `cleanup_nat_offset_major`; def-eq decides offset pairs in + // `try_def_eq_offset`. let mut env = nat_env(); let empty = KEnv::new(); let prims = Primitives::from_env(&empty); @@ -3891,10 +4667,29 @@ mod tests { let mut tc = TypeChecker::new(&mut env); let add = AE::cnst(tc.prims.nat_add.clone(), Box::new([])); - let expr = app(app(add, var(0)), mk_nat(2)); + let expr = app(app(add.clone(), var(0)), mk_nat(2)); let result = tc.whnf(&expr).unwrap(); + assert_eq!(result, app(app(add, var(0)), mk_nat(2))); + } + + #[test] + fn def_eq_nat_offset_succ_tower_vs_compact_add() { + // `succ(succ(x))` ≡ `Nat.add x (Lit 2)` must be decided by the offset + // decomposition in `try_def_eq_offset` (equal offsets → compare bases), + // and `succ(x) ≟ Nat.add x (Lit 2)` must NOT be equal. + let mut env = nat_env(); + let empty = KEnv::new(); + let prims = Primitives::from_env(&empty); + insert_nat_add_model(&mut env, prims.nat_add.clone()); + + let mut tc = TypeChecker::new(&mut env); + let add = AE::cnst(tc.prims.nat_add.clone(), Box::new([])); let succ = AE::cnst(tc.prims.nat_succ.clone(), Box::new([])); - assert_eq!(result, app(succ.clone(), app(succ, var(0)))); + let tower = app(succ.clone(), app(succ.clone(), var(0))); + let compact = app(app(add, var(0)), mk_nat(2)); + assert!(tc.is_def_eq(&tower, &compact).unwrap()); + let one = app(succ, var(0)); + assert!(!tc.is_def_eq(&one, &compact).unwrap()); } #[test] diff --git a/docs/env_machine_whnf.md b/docs/env_machine_whnf.md new file mode 100644 index 00000000..3fa1bcfd --- /dev/null +++ b/docs/env_machine_whnf.md @@ -0,0 +1,259 @@ +# Design: environment-machine WHNF for the Rust kernel + +Port of the IxVM environment machine (`~/ix-aiur` commits `d0d3375` +Phase A, `b4c4ea3` Phase B; `Ix/IxVM/Kernel/Whnf.lean:209-460`, +`Subst.lean:352-400`) to `crates/kernel/src/whnf.rs`. Status: +IMPLEMENTED — Phase A in `7825e0b` (machine loop in `whnf.rs`, +`Clo`/`MEnv`/`clo_subst` in `subst.rs`), Phase B closure-iota in +`acd0780` (`try_iota_clo`); guest cycle benchmarks pending. One +deviation from §7: machine-native delta (the IxVM Phase C1.5 win) +does not port at this layer — our machine lives inside `whnf_core`, +which must stay delta-free for def-eq's lazy-delta unfold ordering, +whereas the IxVM machine sits in a whnf that includes delta. Spanning +`whnf`'s delta loop with closures is a separate, larger design. + +## 1. Problem and evidence + +Eager substitution is the dominant residual cost on reduction-heavy +shards. Every beta materializes the substituted body via +`simul_subst` (a full walk + intern of every reachable subterm), and +every iota substitutes a whole recursor rule — including subterms the +reduction then discards (unselected `Decidable`/match branches, +unused minor premises, arguments dropped by projections). + +Current guest profiles (`/tmp/prof3-*.log`, cumulative): + +- `Vector.extract_append._proof_1` (4.95B steps): `simul_subst` 49%, + `try_same_head_spine` 71%, `whnf_core` 73%. +- UTF-8 codec proof (1.16B steps): `try_iota_with_flags` **85.6%**, + `try_same_head_spine` 85.7% — one deep def-eq descent with iota at + every level, substituting giant match trees whose branches are + mostly dropped. +- mergesort shard (2.3B steps): `whnf_core` 38%, `simul_subst` 16%. + +The same wall was hit independently on the Aiur side ("the +substitution wall is THE remaining structural frontier — needs a +representation-level fix, not another stuck-form"), and the machine +is its proven fix, measured there (FFT cost): + +- Phase A: Int16 −33.8%, Vector._proof_2 −17.1%, mergeSortBench + −6.8%, String.append −7.7%, gcd −5.0%; **synthetic foldAdd_2000 + +14.7%** (see Risks). +- Phase B (closure-iota): a further −3.4% Int16, −2.0% mergeSort; + headline payoff (dropped-branch laziness on UTF-8-class) not + represented in that suite. + +## 2. Core idea + +A Krivine-style abstract machine for the *structural* fragment of +WHNF (beta, zeta, app-spine, bound-variable lookup): + +- **Beta and zeta are O(1) environment pushes.** No substitution + walk, no materialized intermediate body. +- **Substitution happens only at exit points** ("readback"), and only + for the parts of the term the reduction actually hands to a + consumer. A beta chain ending in another beta never materializes + its intermediate bodies; an iota that selects rule 3 never touches + rules 0–2. +- WHNF never reduces under binders, so environments **never need + lifting** — no full explicit-substitution calculus required. A + `Lam`/`All` value reads back with `clo_subst` at binder depth 1. + +## 3. Data types (Rust) + +```rust +/// Expression closed over a machine environment: +/// `Clo { e, env }` denotes e[Var(i) := env[i]] for i < env.len(), +/// with Var(i) for i >= env.len() shifting DOWN by env.len() into the +/// ambient context. Environments are built only by the machine's +/// beta/zeta pushes. +struct Clo { + e: KExpr, + env: MEnv, +} + +/// Persistent cons-list environment: O(1) push, structural sharing +/// across the closures captured at each binder. `len` is cached — the +/// IxVM port found recomputing it per suffix cost ~94M rows on the +/// UTF-8 check. +struct MEnvNode { head: Arc>, tail: MEnv } +type MEnv = Option>>; // + len: u32 carried alongside + +/// Machine spine: args nearest-first, each a closure over the env in +/// force where the App layer was peeled. Also a persistent cons list +/// (the machine only pushes at the front and pops from the front). +type CSpine = MEnv-like list of Arc>; +``` + +No `CWhnf` enum in Phase A: the Rust machine is a private loop inside +`whnf_core_with_flags_uncached`, and every exit reads back to a plain +`KExpr` that re-enters the *existing* loop (see §5). The symbolic +result type only becomes necessary for Phase C (closure-aware +def-eq), which this design defers. + +## 4. The machine loop + +A single iterative loop (NOT recursion — machine depth equals +reduction depth and must not consume native stack; every IxVM +transition is tail-call-shaped, so the loop is mechanical). State: +`(head: KExpr, env: MEnv, n: u32, spine: CSpine)`. Fuel: decrements +the same `MAX_WHNF_FUEL` budget as the enclosing `whnf_core` loop — +unlike IxVM we KEEP fuel (adversarial-input posture; see +`docs/kernel_identity.md` for the threat model). + +Transitions, mirroring `mwhnf_spine`: + +| head | action | +|---|---| +| `App(f, a)` | push `Clo{a, env}` onto spine; `head = f` (O(1) per layer, no `collect_app_spine` Vec) | +| `Lam(_, _, _, body)` | spine non-empty → pop `c`, push onto env, `head = body` (**beta, O(1)**); spine empty → EXIT: read back `Lam` via `clo_subst` (a value) | +| `Let(_, _, val, body)` | push `Clo{val, env}` onto env, `head = body` (**zeta, O(1)**) | +| `Var(i)` with `i < n` | fetch `Clo{e2, env2}` = env[i]; `head, env = e2, env2` (O(1) jump — this is where laziness pays) | +| `Var(i)` with `i >= n` | EXIT stuck: read back spine onto `Var(i - n)`; the *outer* loop's Var arm then does ambient `lookup_let_val` zeta | +| `FVar(_)` | EXIT stuck (readback); outer loop's FVar arm does LDecl zeta | +| `All(..)` | spine empty → EXIT readback; non-empty → ill-typed application, readback stuck (mirror IxVM) | +| `Const(..)` | EXIT: read back spine, rebuild application, hand to the existing dispatch (prim families, iota, projection-definitions, delta in the enclosing loops). Phase B refines the `Recr` case — see §7 | +| `Prj(id, f, val)` | scrutinee = `Clo{val, env}`: whnf it through the machine **respecting `flags.cheap_proj`** (cheap → machine + structural-only exits; full → full `whnf`), then the existing `try_proj_reduce` on the result; spine read back | +| `Sort/Nat/Str` | EXIT readback (stuck application of a non-function — outer loop handles/returns) | + +**Exit contract:** every machine exit produces a plain `KExpr` and +*re-enters `whnf_core_with_flags_uncached`'s loop* as the new `cur` +(or returns, where the old code returned). This keeps EVERY existing +behavior — ambient let/LDecl zeta, iota, K-synthesis, +`cleanup_nat_offset_major`, string/nat literal exposure, +proj-definitions, the nat-offset-stuck probe, delta in `whnf` — byte +identical in semantics. The machine replaces only the +App-collect/beta/zeta core. + +## 5. Entry gating (the hybrid — load-bearing) + +IxVM measured a machine-everywhere variant at **+26% on beta-free +literal-recursor loops** (closure-wrap + readback overhead with no +beta to amortize it). The shipped design enters the machine **only +when a beta actually fires**: + +In `whnf_core_with_flags_uncached`'s App arm (whnf.rs:641ff), after +`f = whnf_core(f0)`: + +- `f` is `Lam` → wrap the already-collected `args` as empty-env + closures (`spine_wrap`) and run the machine starting from the beta. + This REPLACES the current multi-lambda peel + `simul_subst` block. +- otherwise → existing path unchanged (rebuild-if-changed, iota, + return). + +Everything outside the App arm (Const-headed terms, literal recursor +loops, the whole no-delta/delta loop structure) never touches the +machine. This is also why the existing *caches need no changes*: the +machine is invisible between `whnf_core` entry and exit, and cache +keys/values remain plain interned `KExpr`s. + +## 6. Readback: `clo_subst` + +New functions in `subst.rs`, mirroring IxVM `Subst.lean:352-400`: + +```rust +/// e[Var(depth + i) := lift(readback(env[i]), depth)] for i < n, +/// Var(j) for j >= depth + n shifted down by n. lbr-guarded at every +/// node (the no-op-subtree guard measured −68% substitution entries +/// on the IxVM side). Uses a per-call memo scratch keyed (uid, depth) +/// (same pattern as subst_scratch — a NEW scratch map on InternTable; +/// the two must not share entries). Results interned per node, like +/// every other subst. +fn clo_subst(intern, e, env, n, depth) -> KExpr +fn clo_readback(intern, c: &Clo) -> KExpr // clo_subst(e, env, n, 0) +``` + +`clo_subst`'s Var arm composes `clo_readback` of the env entry with +the existing `lift` (subst.rs:341) at non-zero depth. Spine readback +maps `clo_readback` over the spine and rebuilds Apps via interning — +exactly what the current post-beta rebuild does, so uid/interning +discipline (see `docs/kernel_identity.md`) is unchanged: readback +goes through the same constructors + intern table as today's +substitution output. + +## 7. Phases + +**Phase A — machine for beta/zeta (this design's deliverable 1).** +Scope: `whnf.rs` App arm + the machine loop + `subst.rs::clo_subst`. +All Const/Proj exits read back and use existing machinery. Expected +from IxVM Phase A scaled to our profiles: vector double-digit +(its 49% `simul_subst` is the direct target), utf8 and mergesort +mid-single-digit, intRxc small. + +**Phase B — closure-iota (`try_iota_c`).** The `Recr` Const-exit no +longer reads its spine back: the major whnfs through its closure +(`clo_whnf`), and on the main ctor-rule hit the rule RHS returns with +params/motives/minors + ctor fields + post-major args as CLOSURES — +the rule's Lam-chain betas push them straight into an env, so +**unselected rules and dropped branches are never substituted and +never read back**. Off-main-path cases (K-flag synthesis, the +nat linear-rec shortcut, struct-eta, quot, `cleanup_nat_offset_major` +interplay) MISS to the existing plain `try_iota_with_flags` on a +readback spine — semantics preserved by construction. This phase is +the UTF-8-class payoff (its 85% iota share is mostly discarded-branch +substitution). `flags.cheap_rec` gates exactly as today (cheap mode +skips the machine-iota entirely and misses to the plain path). + +**Phase C (deferred, separate design):** closure-aware def-eq (the +IxVM working tree's `CWhnf` capture route) and env-trimming for the +curried-sharing regression. Do not start until A+B are measured. + +## 8. What explicitly does NOT change + +- All caches (whnf/whnf_core/cheap variants, infer, def-eq, unfold, + equiv, failure) — keys, values, clearing. The machine lives strictly + between one cache miss and its fill. +- The delta loop in `whnf`, prim-family dispatch, nat-offset-stuck + probes, transient-nat cache exclusion, native/string/decidable + reducers. +- def-eq tier structure; `whnf_core_for_def_eq` keeps cheap flags, + which the machine consults only at its Proj exit (A) and Recr exit + (B). +- Fuel and error semantics (`MaxRecDepth`). +- `simul_subst` itself stays — instantiation sites outside whnf + (infer's pi-instantiation etc.) still use it. + +## 9. Risks and mitigations + +1. **Curried-sharing regression (measured +14.7% on IxVM's synthetic + foldAdd_2000):** eager sequential substitution shared + constant-prefix substitutions across loop iterations; per-iteration + env closures do not. Our suite lacks a foldAdd-shaped input — + **add one before implementing** (e.g. dump `natfoldsucc.ixe` / + `natreclinear.ixe` from the repo root into the bench suite) so the + regression class is visible. Mitigation if it bites: env-trimming + at readback to each subterm's visible slice (IxVM's designed + fix), or tighten the entry gate (enter only on multi-arg beta). +2. **Native stack:** the machine MUST be an iterative loop; + `clo_subst` recursion depth is term depth (same exposure as + today's `subst`). +3. **Memory:** env chains are Arc-shared cons cells; worst case holds + alive every closure of a deep reduction until exit. Guest RAM is + 512MB; the same terms today materialize MORE memory (full + substituted bodies), so this should be net-negative pressure, but + watch the RAM column in `ziskemu -m` on vector/utf8. +4. **Cheap-flag fidelity:** the def-eq false-negative discipline + (cheap results must not enter full caches) is keyed at the + `whnf_core_with_flags` wrapper, which is untouched; the machine + only needs to route its Proj/Recr exits by `flags`. Test: the + existing 604 suite covers cheap/full splits. +5. **Behavioral drift in exits:** any machine exit that fails to + re-enter the outer loop (instead of returning) silently loses an + ambient zeta/iota step. The exit contract in §4 is the review + checklist: every exit either returns where the old code returned, + or sets `cur` and continues. + +## 10. Test & acceptance plan + +Per phase: 604 kernel + 200 ixon tests; native `check_one` on +Vector._proof_1, Int16, UTF-8; full 11-input guest cycle suite + +`natfoldsucc`/`natreclinear` additions; per the acceptance bar, keep +a phase only if the suite nets ≥1% (expected: well above). +Phase A first, committed alone; Phase B on top, committed alone. + +## 11. Effort + +Phase A: ~350–500 lines (machine loop ~150, clo_subst ~120, App-arm +rewire ~50, spine helpers, tests). Phase B: ~200 more +(try_iota_c + clo_whnf + Recr exit). The IxVM source is the line-by- +line reference for both. diff --git a/docs/kernel_identity.md b/docs/kernel_identity.md new file mode 100644 index 00000000..ffe6af7a --- /dev/null +++ b/docs/kernel_identity.md @@ -0,0 +1,169 @@ +# Kernel term identity vs. Ixon content addressing + +Ix has **two distinct identity layers**, and they serve different masters. +This document records the boundary between them — in particular why the +kernel's switch from per-node blake3 hashing to intern-assigned uids +(`1e3029d`, 2026-06) does not affect the proof-carrying-code story, and +what a future feature must do if it ever needs cryptographic identity at +sub-constant granularity. + +## Layer 1: Ixon content addresses (external, cryptographic, in proofs) + +The unit of cryptographic identity is the **constant**. A constant's +`Address` is the blake3 hash of its alpha-invariant Ixon serialization +(see `docs/Ixon.md`); blobs (the bytes behind `Nat`/`String` literals) +are addressed the same way. Everything that leaves the kernel speaks +this language: + +- `Constant.refs` point at constant and blob `Address`es. +- `KId.addr` — the identity the typechecker certifies — is the Ixon + `Address`. +- The Zisk guest's committed claim is built from `Address`es: + `subject_root` / `assumptions_root` are `merkle_root_canonical` over + the certified / assumed constant addresses (`crates/kernel/src/claim.rs`, + `zisk/guest/src/main.rs`), and `env_hash` is blake3 over the exact env + payload. +- Aggregation (`Claim::CheckEnv`, the `Contains` discharge) resolves + assumption leaves against subject roots **by address**, and the + cross-run proof store is keyed by address (content-addressed: the same + constant has the same address in every env). +- Address↔bytes binding is enforced by `Address::hash(bytes) == addr`, + verified at first materialization (`ixon::lazy::LazyConstant::get`). + Every constant the kernel certifies is necessarily materialized, so + everything that can enter a subject root has passed the check. + +**Merkle inclusion proofs for individual constants live entirely in this +layer** and are unaffected by anything below: proving "address `A` is in +this proof's subject root" is an inclusion path in a tree whose leaves +are constant addresses. + +The trust chain, end to end: + +``` +claim roots ──Merkle──▶ constant Addresses + ──verify──▶ serialized constant bytes (blake3 binding) + ──parse───▶ terms the kernel typechecked +``` + +## Layer 2: kernel node identity (internal, ephemeral, never serialized) + +Inside one checker run, the kernel needs cheap identity for the +**in-memory expression/universe nodes** (`KExpr`/`KUniv`) churned out by +substitution, WHNF, and def-eq — for the hash-consing intern table, the +whnf/infer/def-eq cache keys, and quick equality. These objects are +ephemeral: they are never serialized, never compared against Ixon +addresses, and are torn down when the `KEnv` clears. + +Historically this layer ALSO used blake3: every constructed node hashed +`(variant tag ‖ child hashes)`. That was a *separate scheme* from Ixon +addressing (a Merkle-DAG over node tags, not blake3-of-serialization) — +the two were never interchangeable — and it cost ~20% of all guest +cycles on reduction-heavy constants in the Zisk prover. + +Since `1e3029d`, layer-2 identity is an **intern-assigned `u64` uid** +(`crates/kernel/src/env.rs::Addr`): + +- Uids come from a process-global counter and are **never reused**, so + `uid(a) == uid(b)` implies "same construction event or same + intern-table canonical value" — i.e. structural equality. Stale cache + keys can only miss, never alias. +- The intern table keys on **shallow structural keys** + (`ExprKey`/`UnivKey`: variant tag + child uids + semantic payload), + mirroring exactly what the old content hash covered — display names, + binder info, and mdata are excluded, so interning semantics are + unchanged. +- `PartialEq` for `KExpr`/`KUniv` is structural with the uid fast path: + canonical-vs-canonical comparison is O(1), and completeness-critical + `==` sites are unaffected. `hash_eq` remains the fast-but-incomplete + uid check for cache/quick-path callers, where a false negative only + costs a fallback. +- Soundness direction: caches and def-eq quick paths rely only on the + affirmative ("equal uid ⇒ equal term"), which holds by construction. + An imperfect intern hit-rate degrades performance, not correctness. + +Nat/Str literal nodes keep their blake3 **blob** `Address` (that is +layer-1 data riding on the node, used by `refs`/assumption filtering). + +## The boundary rule + +> Anything that crosses the kernel boundary — claims, public values, +> proof-store entries, assumption sets — is identified by Ixon +> `Address`. Kernel uids must never escape into a serialized artifact. + +This held before the change (nothing consumed the old expr hashes +outside the kernel) and is now structurally enforced by the type split: +`Address` (32-byte content hash) vs `Addr = u64` (kernel uid) vs +`CtxAddr` (blake3 digests for local-context cache keys, fed by uids, +also internal-only). + +## Adversarial model: why uid identity cannot be cache-poisoned + +The natural worry about replacing blake3 with `u64` identity is hash +collision: *if two subterms had the same internal hash, a def-eq/whnf +cache hit could return a result for the wrong term.* The design avoids +this not by making collisions unlikely but by making them impossible — +**uids are assigned, not computed**: + +- A uid comes from a process-global sequential counter + (`expr.rs::fresh_uid`), never from hashing input content. There is no + function from term content to uid for an adversary to find collisions + in — and no birthday bound either: uids are handed out once each, not + sampled from a space two terms could land in. Two `KExpr` values share + a uid only if they are clones of one construction event or the + canonical node the intern table returned for the same shallow key. + `fresh_uid` aborts on counter exhaustion (which would require >2^67 + guest cycles) rather than wrap, and the only constructors that accept + a caller-supplied uid (`*_mdata_with_addr`) are private to `expr.rs`, + so no code outside the module can copy one node's uid into a + different structure. +- **Between shards** (separate guest processes), uid collisions are + certain — every process counts from 1 — and harmless by design: a uid + is meaningless outside its process. Nothing serializes a uid (the + committed publics are blake3 addresses and Merkle roots; intern tables + and caches die with the process; manifests and the proof store carry + `Address`es only), so there is no channel through which shard A's + uid 42 could meet shard B's. All cross-shard identity — claims, + assumption resolution, proof reuse — lives in the Ixon `Address` + layer. Any future feature that wants to persist or ship a + reduction/intern cache MUST re-key it by content (e.g. Ixon + serialization hashes) first; see the boundary rule above. +- The intern table is keyed by [`ExprKey`]/[`UnivKey`] — exact + structural keys compared by full `Eq` (variant tag, child uids, + complete 32-byte payload addresses; never truncated, never a digest). + `FxHashMap` bucketing uses a weak hash, but a bucket collision only + causes an extra key comparison, not a wrong hit. Children in a key are + canonical by induction, so key equality ⇔ structural equality. Debug + builds assert structural equality on every intern hit. +- Reduction caches key on `(uid, CtxAddr)`; a hit requires uid equality, + hence structural equality. `CtxAddr` (local-context digests) remains + blake3 — collision-resistant — fed with uid bytes. +- An adversarial env cannot choose uids at all: deserialization never + produces a uid; every node built from input takes the next counter + value. Compared to the old scheme — where cache keys were blake3 over + attacker-chosen content, so an attacker could at least grind contents + to flood `FxHashMap` buckets of their choosing — sequential uids give + the adversary strictly *less* influence over key distribution. + +The one place adversarial content does enter identity is **literals**: +`ExprKey::Nat/Str` and structural equality key on the blob `Address` +(mirroring the old content hash, which also hashed only the address). +That is sound only if an address is bound to its bytes — so all three +env deserializers verify `Address::hash(bytes)` for every blob at load +(the constants' equivalent check runs lazily at first materialization), +and the literal arms of structural equality compare values as well as +addresses as defense in depth. Residual exposure is limited to DoS +(bucket flooding of maps keyed by attacker-chosen `Address`es slows the +prover, which is spending its own cycles), not soundness. + +## Future: sub-constant commitments + +If a feature ever needs cryptographic identity for **subterms** (e.g. +Merkle inclusion of a specific expression inside a constant's body, +rather than inclusion of a constant in a claim), expression-level +content hashes no longer exist precomputed. The right move is to compute +them at that boundary: a one-time egress pass over the relevant term — +either the old `(tag ‖ child hashes)` Merkle-DAG scheme or blake3 over +the Ixon serialization of the subterm. That is cheap where it is needed +and keeps the per-node cost out of the prover's hottest loop. No current +artifact (claims, aggregation, reuse, inclusion proofs at constant +granularity) needs it. diff --git a/zisk/scripts/bench-cycles.sh b/zisk/scripts/bench-cycles.sh new file mode 100755 index 00000000..49aeb3c4 --- /dev/null +++ b/zisk/scripts/bench-cycles.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Cycle benchmark for the Zisk leaf guest across the standard input suite. +# +# Runs each dumped guest stdin in ~/benchdata/zisk-inputs through ziskemu, +# records steps (-m) and checks the committed `failures` publics word (-c, +# u32 word index 10) is zero — a non-zero value means the kernel REJECTED a +# constant and the run is invalid for benchmarking. +# +# Usage: +# bench-cycles.sh