From 3774860c1144dda6ea0fc9b04d774f4572115714 Mon Sep 17 00:00:00 2001
From: Arthur Paulino <arthurleonardo.ap@gmail.com>
Date: Tue, 9 Jun 2026 11:38:20 -0700
Subject: [PATCH 01/13] IxVM: cheaper Expr.Share / universe lookups (list_drop
 + field index)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Expr.Share resolution and universe lookups walked the sharing/univ lists
with `list_lookup_u64` (per-step `u64_is_zero` + `relaxed_u64_pred`). Switch
to:

- universe lookups: `list_lookup(univs, flatten_u64(idx))` — cheap per-step
  field subtraction instead of the U64 predecessor.
- Expr.Share (Convert convert_expr + Ingress deref_share):
  `let ListNode.Cons(e, _) = load(list_drop(sharing, flatten_u64(idx)));`.
  list_drop returns the sublist *pointer*; repeated Share lookups collapse
  to shared memo entries, so the walk is near-free.

Measured on `lake exe ix check --ixe init.ixe --ixes init.ixes --shard 10`
(Init, owned=3, closure=676): total FFT 3.89G -> 2.11G. The merged
pointer-list walk drops from 1.93G (list_lookup) to 5.8M (list_drop, 0.27%).

(Measurement run had in-circuit blake3 verification disabled — not committed.)
---
 Ix/IxVM/Convert.lean | 11 ++++++++---
 Ix/IxVM/Ingress.lean |  4 +++-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/Ix/IxVM/Convert.lean b/Ix/IxVM/Convert.lean
index 4917ec0f..b9cbd972 100644
--- a/Ix/IxVM/Convert.lean
+++ b/Ix/IxVM/Convert.lean
@@ -68,7 +68,9 @@ def convert := ⟦
     match load(idxs) {
       ListNode.Nil => store(ListNode.Nil),
       ListNode.Cons(idx, rest) =>
-        let u = load(list_lookup_u64(univs, idx));
+        -- universe indices are small; walk with a field index (cheap per-step
+        -- field sub) instead of `list_lookup_u64`'s per-step U64 predecessor.
+        let u = load(list_lookup(univs, flatten_u64(idx)));
         store(ListNode.Cons(store(convert_univ(u)), convert_univ_idxs(rest, univs))),
     }
   }
@@ -145,7 +147,9 @@ def convert := ⟦
   ) -> KExpr {
     match load(e) {
       Expr.Srt(univ_idx) =>
-        let u = load(list_lookup_u64(univs, univ_idx));
+        -- field-indexed walk (see `convert_univ_idxs`): avoids the per-step
+        -- U64 predecessor of `list_lookup_u64` on this hot universe lookup.
+        let u = load(list_lookup(univs, flatten_u64(univ_idx)));
         store(KExprNode.Srt(store(convert_univ(u)))),
 
       Expr.Var(idx) =>
@@ -199,7 +203,8 @@ def convert := ⟦
           convert_expr(body, sharing, ref_idxs, recur_idxs, lit_blobs, univs))),
 
       Expr.Share(idx) =>
-        convert_expr(list_lookup_u64(sharing, idx), sharing, ref_idxs, recur_idxs, lit_blobs, univs),
+        let ListNode.Cons(e, _) = load(list_drop(sharing, flatten_u64(idx)));
+        convert_expr(e, sharing, ref_idxs, recur_idxs, lit_blobs, univs),
     }
   }
 
diff --git a/Ix/IxVM/Ingress.lean b/Ix/IxVM/Ingress.lean
index 6630cc33..e285cfe0 100644
--- a/Ix/IxVM/Ingress.lean
+++ b/Ix/IxVM/Ingress.lean
@@ -792,7 +792,9 @@ def ingress := ⟦
   -- Deref Expr.Share via the constant's sharing list.
   fn deref_share(e: Expr, sharing: List‹&Expr›) -> Expr {
     match e {
-      Expr.Share(idx) => deref_share(load(list_lookup_u64(sharing, idx)), sharing),
+      Expr.Share(idx) =>
+        let ListNode.Cons(e, _) = load(list_drop(sharing, flatten_u64(idx)));
+        deref_share(load(e), sharing),
       _ => e,
     }
   }

From 69b6936679c43966e3b0c6e08c8aa8fd0c17c1d6 Mon Sep 17 00:00:00 2001
From: Arthur Paulino <arthurleonardo.ap@gmail.com>
Date: Tue, 9 Jun 2026 12:12:28 -0700
Subject: [PATCH 02/13] IxVM: hot/cold split address_eq on a limb-0 prefilter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The primitive-dispatch gauntlet in whnf compares each Const head against ~50
known primitive addresses, so address_eq is the single hottest circuit. The
full 32-limb compare ran at width 109 on every call.

Split it: `address_eq` now compares limb 0 only and rejects (the common case —
a single differing limb proves inequality), delegating the remaining 31-limb
compare to a separate `address_eq_tail`. Because Aiur charges a function's full
width on every one of its rows, the cold path must be a separate function (a
nested match in address_eq changes nothing — measured identical); as its own
circuit, address_eq_tail's height is only the rare limb-0-match rows.

Measured on `lake exe ix check --ixe init.ixe --ixes init.ixes --shard 10`:
- address_eq: width 109 -> 80, FFT 509.8M -> 374.1M
- address_eq_tail (new): width 108, height 1683, FFT 1.9M
- total shard FFT: 2.106G -> 1.973G

Limb-0 is the sweet spot: hot height (259991) dominates tail height (1683), so
pulling a second limb forward (N=2) widens the hot circuit by more than it
saves in the tail (measured 1.976G, worse). Result is identical to a full
32-limb compare (sound).

(Measurement run had in-circuit blake3 verification disabled — not committed.)
---
 Ix/IxVM/Ingress.lean | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/Ix/IxVM/Ingress.lean b/Ix/IxVM/Ingress.lean
index e285cfe0..5fee5918 100644
--- a/Ix/IxVM/Ingress.lean
+++ b/Ix/IxVM/Ingress.lean
@@ -53,17 +53,23 @@ def ingress := ⟦
     bytes
   }
 
-  -- Compare two 32-byte addresses for equality
-  fn address_eq(a: Addr, b: Addr) -> G {
-    let [a0, a1, a2, a3, a4, a5, a6, a7,
+  -- Compare two 32-byte addresses for equality.
+  --
+  -- Cold path: limb 0 already matched, compare the remaining 31 limbs.
+  -- Factored into its own function so it forms a separate circuit whose height
+  -- is only the (rare) limb-0-match rows; Aiur charges a function's full width
+  -- on every one of its rows, so a nested match in `address_eq` would not save
+  -- anything — the split must be a function boundary.
+  fn address_eq_tail(a: Addr, b: Addr) -> G {
+    let [_, a1, a2, a3, a4, a5, a6, a7,
          a8, a9, a10, a11, a12, a13, a14, a15,
          a16, a17, a18, a19, a20, a21, a22, a23,
          a24, a25, a26, a27, a28, a29, a30, a31] = load(a);
-    let [b0, b1, b2, b3, b4, b5, b6, b7,
+    let [_, b1, b2, b3, b4, b5, b6, b7,
          b8, b9, b10, b11, b12, b13, b14, b15,
          b16, b17, b18, b19, b20, b21, b22, b23,
          b24, b25, b26, b27, b28, b29, b30, b31] = load(b);
-    match [to_field(a0) - to_field(b0), to_field(a1) - to_field(b1),
+    match [to_field(a1) - to_field(b1),
            to_field(a2) - to_field(b2), to_field(a3) - to_field(b3),
            to_field(a4) - to_field(b4), to_field(a5) - to_field(b5),
            to_field(a6) - to_field(b6), to_field(a7) - to_field(b7),
@@ -80,7 +86,20 @@ def ingress := ⟦
            to_field(a28) - to_field(b28), to_field(a29) - to_field(b29),
            to_field(a30) - to_field(b30), to_field(a31) - to_field(b31)] {
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] => 1,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] => 1,
+      _ => 0,
+    }
+  }
+
+  -- Limb-0 prefilter: a single differing limb proves inequality, and almost
+  -- every comparison (the primitive-dispatch gauntlet in whnf) mismatches at
+  -- limb 0. Hot rows reject here at narrow width; only limb-0 matches pay the
+  -- wide `address_eq_tail` compare. Identical result to a full 32-limb compare.
+  fn address_eq(a: Addr, b: Addr) -> G {
+    let av = load(a);
+    let bv = load(b);
+    match to_field(av[0]) - to_field(bv[0]) {
+      0 => address_eq_tail(a, b),
       _ => 0,
     }
   }

From 0e30d786ae0a0e994faa8e5f87167661dd81ad55 Mon Sep 17 00:00:00 2001
From: Arthur Paulino <arthurleonardo.ap@gmail.com>
Date: Tue, 9 Jun 2026 14:40:48 -0700
Subject: [PATCH 03/13] IxVM: hot/cold split whnf_with_spine (Const + Proj head
 dispatch)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

whnf_with_spine is the single hottest circuit on reduction-heavy consts: width
89 charged on every one of its 3.36M rows. The width was set by two arms that
only a minority of rows reach — the Const-head dispatch (delta/iota/quot/prim
gauntlet) and the Proj-head reduction.

Factor both into their own functions (`whnf_const_head`, `whnf_proj_head`).
Because Aiur charges a function's full width on every row, the ~76% of steps
that are App/Lam/Let now run at the narrow residual width instead of 89; the
wide dispatch only taxes its own (smaller) row count.

Measured on `lake exe ix check --ixe init.ixe --ixes init.ixes --shard 9`
(1 reduction-heavy owned const):
- whnf_with_spine: width 89 -> 34, FFT 6.48G -> 2.48G
- whnf_const_head (new): width 77, height 821490, FFT 1.24G
- whnf_proj_head (new): width 56, height 3452, FFT 0.002G
- total shard FFT: 45.54G -> 42.78G (-6.1%)

The Proj arm is rare (3452 rows) but was nearly half the width; extracting it
dropped whnf_with_spine 65 -> 34 for ~zero relocated cost. Pure refactor.
---
 Ix/IxVM/Kernel/Whnf.lean | 193 ++++++++++++++++++++++-----------------
 1 file changed, 107 insertions(+), 86 deletions(-)

diff --git a/Ix/IxVM/Kernel/Whnf.lean b/Ix/IxVM/Kernel/Whnf.lean
index a80c5ddb..63cb5e1a 100644
--- a/Ix/IxVM/Kernel/Whnf.lean
+++ b/Ix/IxVM/Kernel/Whnf.lean
@@ -107,66 +107,119 @@ def whnf := ⟦
       KExprNode.Lam(ty, body) =>
         whnf_apply_beta(spine, head, types, top, addrs),
       KExprNode.Const(idx, lvls) =>
-        let head_addr = list_lookup(addrs, idx);
-        let ci = load(list_lookup(top, idx));
-        -- Recr / Quot heads can never match a primitive address (Nat ops,
-        -- Str ops, BitVec, native, decidable, proj-def all live as Ctor or
-        -- Defn). Skip the primitive dispatch chain for those.
-        match ci {
-          KConstantInfo.Rec(num_lvls, _, num_params, num_indices, num_motives, num_minors, rules, k_flag, _, _) =>
-            let iota = try_iota(lvls, spine, num_lvls, num_params, num_indices, num_motives, num_minors, rules, k_flag, types, top, addrs);
-            match iota {
-              (1, reduced2) => whnf(reduced2, types, top, addrs),
-              (0, _) => apply_spine(head, spine),
-            },
-          KConstantInfo.Quot(_, _, kind) =>
-            let qiota = try_quot_iota(kind, spine, types, top, addrs);
-            match qiota {
-              (1, reduced_q) => whnf(reduced_q, types, top, addrs),
-              (0, _) => apply_spine(head, spine),
+        -- Const-head reduction (delta / iota / quot / primitive dispatch) is the
+        -- widest arm by far. Factored into `whnf_const_head` so `whnf_with_spine`
+        -- stays narrow for the ~76% of reduction steps that are App/Lam/Proj —
+        -- Aiur charges a function's full width on every row, so the wide dispatch
+        -- only taxes the Const-head rows in its own circuit.
+        whnf_const_head(idx, lvls, head, spine, types, top, addrs),
+      KExprNode.Let(_, val, body) =>
+        let next = expr_inst1(body, val, 0);
+        whnf_with_spine(next, spine, types, top, addrs),
+      KExprNode.Proj(tidx, fidx, inner) =>
+        -- Proj reduction (whnf the scrutinee, fin-val rewrite, ctor field pull)
+        -- is the next-widest arm. Factored out for the same reason as Const.
+        whnf_proj_head(tidx, fidx, inner, spine, types, top, addrs),
+      _ => apply_spine(head, spine),
+    }
+  }
+
+  -- Proj-head WHNF dispatch, split out of `whnf_with_spine` (see its Proj arm).
+  fn whnf_proj_head(tidx: G, fidx: G, inner: KExpr, spine: List‹KExpr›,
+                    types: List‹KExpr›, top: List‹&KConstantInfo›, addrs: List‹Addr›) -> KExpr {
+    let inner_whnf = whnf(inner, types, top, addrs);
+    let inner_pair = collect_spine(inner_whnf);
+    match inner_pair {
+      (inner_head, inner_args) =>
+        -- Mirror: whnf.rs:1441-1500 try_reduce_fin_val_decidable_rec.
+        -- Pushes Fin.val inside Decidable.rec minors; allows iota.
+        let fvd_pair = try_reduce_fin_val_decidable_rec(tidx, fidx, inner_head, inner_args, addrs);
+        match fvd_pair {
+          (1, rewritten) => whnf_with_spine(rewritten, spine, types, top, addrs),
+          (0, _) =>
+            match load(inner_head) {
+              KExprNode.Const(cidx, _) =>
+                let cci = load(list_lookup(top, cidx));
+                match cci {
+                  KConstantInfo.Ctor(_, _, _, _, nparams, _, _) =>
+                    let field = list_lookup_or_nil(inner_args, nparams + fidx);
+                    whnf_with_spine(field, spine, types, top, addrs),
+                  _ =>
+                    let stuck = store(KExprNode.Proj(tidx, fidx, inner_whnf));
+                    apply_spine(stuck, spine),
+                },
+              _ =>
+                let stuck = store(KExprNode.Proj(tidx, fidx, inner_whnf));
+                apply_spine(stuck, spine),
             },
-          _ =>
-            let nat_pair = try_nat_dispatch(head_addr, spine, types, top, addrs);
-            match nat_pair {
-              (1, reduced) => whnf(reduced, types, top, addrs),
+        },
+    }
+  }
+
+  -- Const-head WHNF dispatch, split out of `whnf_with_spine` (see its Const arm).
+  -- `head` is the original `Const(idx, lvls)` KExpr, passed for the stuck
+  -- `apply_spine(head, spine)` fallbacks.
+  fn whnf_const_head(idx: G, lvls: List‹&KLevel›, head: KExpr, spine: List‹KExpr›,
+                     types: List‹KExpr›, top: List‹&KConstantInfo›, addrs: List‹Addr›) -> KExpr {
+    let head_addr = list_lookup(addrs, idx);
+    let ci = load(list_lookup(top, idx));
+    -- Recr / Quot heads can never match a primitive address (Nat ops,
+    -- Str ops, BitVec, native, decidable, proj-def all live as Ctor or
+    -- Defn). Skip the primitive dispatch chain for those.
+    match ci {
+      KConstantInfo.Rec(num_lvls, _, num_params, num_indices, num_motives, num_minors, rules, k_flag, _, _) =>
+        let iota = try_iota(lvls, spine, num_lvls, num_params, num_indices, num_motives, num_minors, rules, k_flag, types, top, addrs);
+        match iota {
+          (1, reduced2) => whnf(reduced2, types, top, addrs),
+          (0, _) => apply_spine(head, spine),
+        },
+      KConstantInfo.Quot(_, _, kind) =>
+        let qiota = try_quot_iota(kind, spine, types, top, addrs);
+        match qiota {
+          (1, reduced_q) => whnf(reduced_q, types, top, addrs),
+          (0, _) => apply_spine(head, spine),
+        },
+      _ =>
+        let nat_pair = try_nat_dispatch(head_addr, spine, types, top, addrs);
+        match nat_pair {
+          (1, reduced) => whnf(reduced, types, top, addrs),
+          (0, _) =>
+            let str_pair = try_str_dispatch(head_addr, spine, addrs);
+            match str_pair {
+              (1, reduced_s) => whnf(reduced_s, types, top, addrs),
               (0, _) =>
-                let str_pair = try_str_dispatch(head_addr, spine, addrs);
-                match str_pair {
-                  (1, reduced_s) => whnf(reduced_s, types, top, addrs),
+                let bv_pair = try_bitvec_dispatch(head_addr, spine, types, top, addrs);
+                match bv_pair {
+                  (1, reduced_b) => whnf(reduced_b, types, top, addrs),
                   (0, _) =>
-                    let bv_pair = try_bitvec_dispatch(head_addr, spine, types, top, addrs);
-                    match bv_pair {
-                      (1, reduced_b) => whnf(reduced_b, types, top, addrs),
+                    let nat_pair2 = try_reduce_native(head_addr, spine, types, top, addrs);
+                    match nat_pair2 {
+                      (1, reduced_n) => whnf(reduced_n, types, top, addrs),
                       (0, _) =>
-                        let nat_pair2 = try_reduce_native(head_addr, spine, types, top, addrs);
-                        match nat_pair2 {
-                          (1, reduced_n) => whnf(reduced_n, types, top, addrs),
+                        let dec_pair = try_reduce_decidable(head_addr, idx, lvls, spine, types, top, addrs);
+                        match dec_pair {
+                          (1, reduced_d) => whnf(reduced_d, types, top, addrs),
                           (0, _) =>
-                            let dec_pair = try_reduce_decidable(head_addr, idx, lvls, spine, types, top, addrs);
-                            match dec_pair {
-                              (1, reduced_d) => whnf(reduced_d, types, top, addrs),
+                            let proj_def_pair = try_reduce_projection_definition(idx, spine, top);
+                            match proj_def_pair {
+                              (1, reduced_pd) => whnf(reduced_pd, types, top, addrs),
                               (0, _) =>
-                                let proj_def_pair = try_reduce_projection_definition(idx, spine, top);
-                                match proj_def_pair {
-                                  (1, reduced_pd) => whnf(reduced_pd, types, top, addrs),
-                                  (0, _) =>
-                                    -- Mirror src/ix/kernel/whnf.rs:756-774
-                                    -- (`delta_unfold_one`): unfold any Defn
-                                    -- regardless of `ReducibilityHints`. The
-                                    -- hint is consulted by lazy-delta's
-                                    -- `delta_rank` for def-eq priority, not
-                                    -- as a gate on plain whnf delta. Without
-                                    -- unfolding here, ctor field types
-                                    -- written via opaque defs (e.g.
-                                    -- `constType (n α) (n α)`) stay stuck
-                                    -- and `check_positivity_aug` misclassifies.
-                                    match ci {
-                                      KConstantInfo.Defn(_, _, value, _, _) =>
-                                        let body = expr_inst_levels(value, lvls);
-                                        whnf_with_spine(body, spine, types, top, addrs),
-                                      KConstantInfo.Thm(_, _, _) => apply_spine(head, spine),
-                                      _ => apply_spine(head, spine),
-                                    },
+                                -- Mirror src/ix/kernel/whnf.rs:756-774
+                                -- (`delta_unfold_one`): unfold any Defn
+                                -- regardless of `ReducibilityHints`. The
+                                -- hint is consulted by lazy-delta's
+                                -- `delta_rank` for def-eq priority, not
+                                -- as a gate on plain whnf delta. Without
+                                -- unfolding here, ctor field types
+                                -- written via opaque defs (e.g.
+                                -- `constType (n α) (n α)`) stay stuck
+                                -- and `check_positivity_aug` misclassifies.
+                                match ci {
+                                  KConstantInfo.Defn(_, _, value, _, _) =>
+                                    let body = expr_inst_levels(value, lvls);
+                                    whnf_with_spine(body, spine, types, top, addrs),
+                                  KConstantInfo.Thm(_, _, _) => apply_spine(head, spine),
+                                  _ => apply_spine(head, spine),
                                 },
                             },
                         },
@@ -174,38 +227,6 @@ def whnf := ⟦
                 },
             },
         },
-      KExprNode.Let(_, val, body) =>
-        let next = expr_inst1(body, val, 0);
-        whnf_with_spine(next, spine, types, top, addrs),
-      KExprNode.Proj(tidx, fidx, inner) =>
-        let inner_whnf = whnf(inner, types, top, addrs);
-        let inner_pair = collect_spine(inner_whnf);
-        match inner_pair {
-          (inner_head, inner_args) =>
-            -- Mirror: whnf.rs:1441-1500 try_reduce_fin_val_decidable_rec.
-            -- Pushes Fin.val inside Decidable.rec minors; allows iota.
-            let fvd_pair = try_reduce_fin_val_decidable_rec(tidx, fidx, inner_head, inner_args, addrs);
-            match fvd_pair {
-              (1, rewritten) => whnf_with_spine(rewritten, spine, types, top, addrs),
-              (0, _) =>
-                match load(inner_head) {
-                  KExprNode.Const(cidx, _) =>
-                    let cci = load(list_lookup(top, cidx));
-                    match cci {
-                      KConstantInfo.Ctor(_, _, _, _, nparams, _, _) =>
-                        let field = list_lookup_or_nil(inner_args, nparams + fidx);
-                        whnf_with_spine(field, spine, types, top, addrs),
-                      _ =>
-                        let stuck = store(KExprNode.Proj(tidx, fidx, inner_whnf));
-                        apply_spine(stuck, spine),
-                    },
-                  _ =>
-                    let stuck = store(KExprNode.Proj(tidx, fidx, inner_whnf));
-                    apply_spine(stuck, spine),
-                },
-            },
-        },
-      _ => apply_spine(head, spine),
     }
   }
 

From 5b7fabcf4a4c49a593a783ab2c9424f48dc9fbfb Mon Sep 17 00:00:00 2001
From: Arthur Paulino <arthurleonardo.ap@gmail.com>
Date: Tue, 9 Jun 2026 16:44:21 -0700
Subject: [PATCH 04/13] IxVM: non-tail collect_spine, single shared definition

Replace the tail-recursive `collect_spine_go(e, acc)` (and its verbatim copy
`collect_spine_simple_go`) with one non-tail `collect_spine(e)` in the
kernelTypes block. Repoint all callers (whnf, primitive, inductive_check,
def_eq); delete both old copies. Keyed on `e` alone, it now dedups shared
sub-spines (0 -> 2.19M cache hits).

Measured on `lake exe ix check --ixe init.ixe --ixes init.ixes --shard 9`:
- collect_spine: 5.38M rows / 2.65G -> 2.17M rows / 0.96G
- list_snoc.Ptr: +0.96G (new)
- total shard FFT: 42.78G -> 41.67G
---
 Ix/IxVM/Kernel/DefEq.lean     |  2 +-
 Ix/IxVM/Kernel/Inductive.lean | 33 +++++++++------------------------
 Ix/IxVM/Kernel/Primitive.lean | 12 ++++++------
 Ix/IxVM/Kernel/Whnf.lean      | 15 ---------------
 Ix/IxVM/KernelTypes.lean      | 18 ++++++++++++++++++
 5 files changed, 34 insertions(+), 46 deletions(-)

diff --git a/Ix/IxVM/Kernel/DefEq.lean b/Ix/IxVM/Kernel/DefEq.lean
index c5dde3b6..75916064 100644
--- a/Ix/IxVM/Kernel/DefEq.lean
+++ b/Ix/IxVM/Kernel/DefEq.lean
@@ -146,7 +146,7 @@ def defEq := ⟦
 
   -- 1 iff ty is `Const(I, _) args` for non-rec 1-ctor 0-field inductive.
   fn is_unit_like_type(ty: KExpr, top: List‹&KConstantInfo›) -> G {
-    match collect_spine_simple(ty) {
+    match collect_spine(ty) {
       (head, _) =>
         match load(head) {
           KExprNode.Const(idx, _) =>
diff --git a/Ix/IxVM/Kernel/Inductive.lean b/Ix/IxVM/Kernel/Inductive.lean
index b3f89177..50b39f84 100644
--- a/Ix/IxVM/Kernel/Inductive.lean
+++ b/Ix/IxVM/Kernel/Inductive.lean
@@ -32,7 +32,7 @@ def inductive_check := ⟦
                             n_params: G, n_indices: G, n_fields: G,
                             ind_idx: G, ind_num_lvls: G) {
     let body = peel_n_foralls(ctor_ty, n_params + n_fields);
-    let pair = collect_spine_simple(body);
+    let pair = collect_spine(body);
     match pair {
       (head, args) =>
         match load(head) {
@@ -59,21 +59,6 @@ def inductive_check := ⟦
     }
   }
 
-  -- Walk a left-associative App chain, return (head, args-in-application-order).
-  -- Inlined to avoid Whnf import here (module ordering — Inductive precedes Whnf
-  -- in the dependency-first build order); identical to whnf.lean's collect_spine.
-  fn collect_spine_simple_go(e: KExpr, acc: List‹KExpr›) -> (KExpr, List‹KExpr›) {
-    match load(e) {
-      KExprNode.App(f, a) =>
-        collect_spine_simple_go(f, store(ListNode.Cons(a, acc))),
-      _ => (e, acc),
-    }
-  }
-
-  fn collect_spine_simple(e: KExpr) -> (KExpr, List‹KExpr›) {
-    collect_spine_simple_go(e, store(ListNode.Nil))
-  }
-
   -- Each `lvls[i]` must be `Param(expected_start + i)` for i in 0..count.
   fn assert_lvls_are_params(lvls: List‹&KLevel›, count: G, idx: G) {
     match count {
@@ -256,7 +241,7 @@ def inductive_check := ⟦
             let types2 = store(ListNode.Cons(inner_dom, types));
             check_positivity_aug(inner_body, block_idxs, types2, top, addrs),
           _ =>
-            match collect_spine_simple(dom_w) {
+            match collect_spine(dom_w) {
               (head, args) =>
                 match load(head) {
                   KExprNode.Const(idx, _) =>
@@ -576,7 +561,7 @@ def inductive_check := ⟦
                               data_bvars: List‹G›) -> G {
     match n_fields - field_idx {
       0 =>
-        match collect_spine_simple(ty) {
+        match collect_spine(ty) {
           (_, args) => all_bvars_in_args(data_bvars, args),
         },
       _ =>
@@ -853,7 +838,7 @@ def inductive_check := ⟦
         let n_ihs = list_length(rec_indices);
         let n_binders = n_fields + n_ihs;
         let depth_now = minor_saved + n_binders;
-        let ret_pair = collect_spine_simple(ret_ty);
+        let ret_pair = collect_spine(ret_ty);
         match ret_pair {
           (_ret_head, ret_args) =>
             -- Drop n_own_params from ret to expose indices.
@@ -1000,7 +985,7 @@ def inductive_check := ⟦
       (doms, body) =>
         let inner_types = list_concat(list_reverse(doms), types);
         let body_w = whnf(body, inner_types, top, addrs);
-        match collect_spine_simple(body_w) {
+        match collect_spine(body_w) {
           (head, _) =>
             match load(head) {
               KExprNode.Const(idx, _) => find_member_local_idx(block_member_idxs, idx, 0),
@@ -1454,7 +1439,7 @@ def inductive_check := ⟦
               ((inner_depth - 1) - n_params), 0);
             let with_minors = build_apply_minors(with_motives, n_minors,
               (((inner_depth - 1) - n_params) - n_motives), 0);
-            match collect_spine_simple(inner_body) {
+            match collect_spine(inner_body) {
               (_dh, dargs) =>
                 let idx_args = list_drop(dargs, target_n_params);
                 let with_idx = apply_spine(with_minors, idx_args);
@@ -1512,7 +1497,7 @@ def inductive_check := ⟦
     let after_skip = peel_n_foralls(ty, skip);
     match load(after_skip) {
       KExprNode.Forall(major_ty, _) =>
-        match collect_spine_simple(major_ty) {
+        match collect_spine(major_ty) {
           (head, _) =>
             match load(head) {
               KExprNode.Const(idx, _) => idx,
@@ -2014,7 +1999,7 @@ def inductive_check := ⟦
                           -> List‹(G, List‹KExpr›, List‹&KLevel›)› {
     match peel_leading_foralls(dom) {
       (_doms, body) =>
-        match collect_spine_simple(body) {
+        match collect_spine(body) {
           (head, args) =>
             match load(head) {
               KExprNode.Const(idx, occ_us) =>
@@ -2304,7 +2289,7 @@ def inductive_check := ⟦
             let inner_depth = depth + n_xs;
             let motive_bvar = (inner_depth - 1) - (motive_base + mem_idx);
             let field_bvar = (inner_depth - 1) - (minor_saved + field_idx);
-            match collect_spine_simple(inner_body) {
+            match collect_spine(inner_body) {
               (_h, dom_args) =>
                 let idx_args = list_drop(dom_args, target_n_params);
                 let motive_ref = store(KExprNode.BVar(motive_bvar));
diff --git a/Ix/IxVM/Kernel/Primitive.lean b/Ix/IxVM/Kernel/Primitive.lean
index ac5c7397..1e7bffec 100644
--- a/Ix/IxVM/Kernel/Primitive.lean
+++ b/Ix/IxVM/Kernel/Primitive.lean
@@ -1511,7 +1511,7 @@ def primitive := ⟦
   -- Returns (1, width_e, n_e) if `e` is `BitVec.ofNat W N` or
   -- `OfNat.ofNat (BitVec W) N _inst`. Else (0, _, _).
   fn bitvec_of_nat_args(e: KExpr, addrs: List‹Addr›) -> (G, KExpr, KExpr) {
-    match collect_spine_simple(e) {
+    match collect_spine(e) {
       (head, args) =>
         match load(head) {
           KExprNode.Const(idx, _) =>
@@ -1534,7 +1534,7 @@ def primitive := ⟦
                       1 => (0, store(KExprNode.BVar(0)), store(KExprNode.BVar(0))),
                       0 =>
                         let ty_arg = list_lookup(args, 0);
-                        match collect_spine_simple(ty_arg) {
+                        match collect_spine(ty_arg) {
                           (ty_head, ty_args) =>
                             match load(ty_head) {
                               KExprNode.Const(ty_idx, _) =>
@@ -1639,7 +1639,7 @@ def primitive := ⟦
       1 => (0, store(KExprNode.BVar(0))),
       0 =>
         let prop = list_lookup(spine, 0);
-        match collect_spine_simple(prop) {
+        match collect_spine(prop) {
           (lt_head, lt_args) =>
             match load(lt_head) {
               KExprNode.Const(lt_idx, _) =>
@@ -1651,7 +1651,7 @@ def primitive := ⟦
                       1 => (0, store(KExprNode.BVar(0))),
                       0 =>
                         let ty_arg = list_lookup(lt_args, 0);
-                        match collect_spine_simple(ty_arg) {
+                        match collect_spine(ty_arg) {
                           (ty_head, ty_args) =>
                             match load(ty_head) {
                               KExprNode.Const(ty_idx, _) =>
@@ -1749,7 +1749,7 @@ def primitive := ⟦
       1 => (0, store(KExprNode.BVar(0))),
       0 =>
         let val_arg = list_lookup(spine, 2);
-        match collect_spine_simple(val_arg) {
+        match collect_spine(val_arg) {
           (head, _) =>
             match load(head) {
               KExprNode.Const(idx, _) =>
@@ -1771,7 +1771,7 @@ def primitive := ⟦
       1 => (0, store(KExprNode.BVar(0))),
       0 =>
         let ty_arg = list_lookup(spine, 0);
-        match collect_spine_simple(ty_arg) {
+        match collect_spine(ty_arg) {
           (head, _) =>
             match load(head) {
               KExprNode.Const(idx, _) =>
diff --git a/Ix/IxVM/Kernel/Whnf.lean b/Ix/IxVM/Kernel/Whnf.lean
index 63cb5e1a..8a50f6c6 100644
--- a/Ix/IxVM/Kernel/Whnf.lean
+++ b/Ix/IxVM/Kernel/Whnf.lean
@@ -46,21 +46,6 @@ ingress (slot mapping in `Primitive.lean`).
 -/
 
 def whnf := ⟦
-  -- ============================================================================
-  -- Spine collection
-  -- ============================================================================
-  fn collect_spine_go(e: KExpr, acc: List‹KExpr›) -> (KExpr, List‹KExpr›) {
-    match load(e) {
-      KExprNode.App(f, a) =>
-        collect_spine_go(f, store(ListNode.Cons(a, acc))),
-      _ => (e, acc),
-    }
-  }
-
-  fn collect_spine(e: KExpr) -> (KExpr, List‹KExpr›) {
-    collect_spine_go(e, store(ListNode.Nil))
-  }
-
   fn apply_spine(head: KExpr, spine: List‹KExpr›) -> KExpr {
     match load(spine) {
       ListNode.Nil => head,
diff --git a/Ix/IxVM/KernelTypes.lean b/Ix/IxVM/KernelTypes.lean
index 950a8f2c..b4a57b34 100644
--- a/Ix/IxVM/KernelTypes.lean
+++ b/Ix/IxVM/KernelTypes.lean
@@ -50,6 +50,24 @@ def kernelTypes := ⟦
 
   type KExpr = &KExprNode
 
+  -- Collect an application spine: peel `App(f, a)` layers, returning the head
+  -- and the args in application order. The single shared definition for every
+  -- kernel caller (whnf, primitive, inductive_check, def_eq) — defined here in
+  -- `kernelTypes` so it precedes them all in the merge.
+  --
+  -- Non-tail (no accumulator): keyed on `e` alone, so Aiur memoization dedups
+  -- shared sub-spines across reductions. An accumulator would thread `acc`
+  -- through the memo key and block that sharing (tail recursion buys nothing in
+  -- Aiur — stack depth is free). `list_snoc` keeps order and is itself memoized.
+  fn collect_spine(e: KExpr) -> (KExpr, List‹KExpr›) {
+    match load(e) {
+      KExprNode.App(f, a) =>
+        let (head, args) = collect_spine(f);
+        (head, list_snoc(args, a)),
+      _ => (e, store(ListNode.Nil)),
+    }
+  }
+
   -- ============================================================================
   -- Values (NbE semantic domain)
   -- ============================================================================

From a52967f88b94befc2ef58b887845f8c474eabe4c Mon Sep 17 00:00:00 2001
From: Arthur Paulino <arthurleonardo.ap@gmail.com>
Date: Tue, 9 Jun 2026 19:31:20 -0700
Subject: [PATCH 05/13] IxVM: keyed skip-set for sharded check (replace
 addr_in_list scan)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

check_all_skipping tested each closure const against the assumption-leaf list
via addr_in_list, an O(N) linear address_eq scan — the single largest address_eq
source on sharded checks (1.81M calls on Init shard 9, more than all primitive
dispatch combined).

Build an RBTreeMap skip-set once (keyed on the first 4 address bytes), then test
membership with one rbtree lookup + one confirming address_eq. Key collisions
overwrite: sound because a missed skip only rechecks a frontier const, and the
confirming address_eq rules out false positives.

Measured on `lake exe ix check --ixe init.ixe --ixes init.ixes --shard 9`:
- address_eq: 2.07M rows / 3.48G -> 274K rows / 0.40G
- rbtree build + lookups: +~0.46G
- total shard FFT: 41.67G -> 37.84G
---
 Ix/IxVM/Kernel/Claim.lean | 50 +++++++++++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 5 deletions(-)

diff --git a/Ix/IxVM/Kernel/Claim.lean b/Ix/IxVM/Kernel/Claim.lean
index 27c9a0ca..7122431a 100644
--- a/Ix/IxVM/Kernel/Claim.lean
+++ b/Ix/IxVM/Kernel/Claim.lean
@@ -804,6 +804,43 @@ def claim := ⟦
     }
   }
 
+  -- Pack the first 4 address bytes (LE) into a u32 key for the skip-set rbtree.
+  --
+  -- Capped at 4 bytes because `RBTreeMap` orders keys with `u32_less_than`, a
+  -- 32-bit comparator gadget — a wider key (a single `G` could hold 7 bytes in
+  -- Goldilocks) would overflow it and corrupt the tree ordering. A 32-bit key
+  -- space means key collisions are possible (~N²/2^33 over N leaves), but they
+  -- are harmless: a collision makes `addr_in_set`'s confirming `address_eq`
+  -- fail, yielding a false negative (a frontier const gets rechecked) never a
+  -- false positive (a wrong skip). See `build_skip_set`.
+  fn addr_key(a: Addr) -> G {
+    let arr = load(a);
+    to_field(arr[0])
+      + to_field(arr[1]) * 256
+      + to_field(arr[2]) * 65536
+      + to_field(arr[3]) * 16777216
+  }
+
+  -- Build an O(log N) membership set from the assumption-leaf list, keyed on
+  -- `addr_key`. Key collisions overwrite — sound because the only consequence is
+  -- a missed skip (a frontier const gets rechecked, never wrongly trusted); the
+  -- confirming `address_eq` in `addr_in_set` rules out false positives.
+  fn build_skip_set(leaves: List‹Addr›, acc: RBTreeMap‹Addr›) -> RBTreeMap‹Addr› {
+    match load(leaves) {
+      ListNode.Nil => acc,
+      ListNode.Cons(a, rest) =>
+        build_skip_set(rest, rbtree_map_insert(addr_key(a), a, acc)),
+    }
+  }
+
+  -- Membership via one rbtree lookup (cheap u32 key compares) + one confirming
+  -- full `address_eq`, replacing the O(N) linear `addr_in_list` scan that
+  -- dominated address_eq cost on sharded checks.
+  fn addr_in_set(target: Addr, skip_set: RBTreeMap‹Addr›) -> G {
+    let found = rbtree_map_lookup_or_default(addr_key(target), skip_set, store([0u8; 32]));
+    address_eq(found, target)
+  }
+
   -- ============================================================================
   -- check_all variant that skips positions whose addr is in the
   -- supplied assumption-leaf list.
@@ -813,24 +850,27 @@ def claim := ⟦
                         addrs: List‹Addr›,
                         asm_leaves: List‹Addr›) {
     let _ = check_canonical_block_sort(top);
-    check_all_skipping_iter(consts, top, addrs, asm_leaves, 0)
+    -- Build the skip-set once (O(N log N)) instead of an O(N) linear scan per
+    -- checked const.
+    let skip_set = build_skip_set(asm_leaves, RBTreeMap.Nil);
+    check_all_skipping_iter(consts, top, addrs, skip_set, 0)
   }
 
   fn check_all_skipping_iter(consts: List‹&KConstantInfo›,
                              top: List‹&KConstantInfo›,
                              addrs: List‹Addr›,
-                             asm_leaves: List‹Addr›,
+                             skip_set: RBTreeMap‹Addr›,
                              pos: G) {
     match load(consts) {
       ListNode.Nil => (),
       ListNode.Cons(&ci, rest) =>
         let addr = list_lookup(addrs, pos);
-        match addr_in_list(addr, asm_leaves) {
+        match addr_in_set(addr, skip_set) {
           1 =>
-            check_all_skipping_iter(rest, top, addrs, asm_leaves, pos + 1),
+            check_all_skipping_iter(rest, top, addrs, skip_set, pos + 1),
           _ =>
             let _ = check_const(ci, pos, top, addrs);
-            check_all_skipping_iter(rest, top, addrs, asm_leaves, pos + 1),
+            check_all_skipping_iter(rest, top, addrs, skip_set, pos + 1),
         },
     }
   }

From dbc4177143f0c61f1f7cc07606ad3e17c56dd3d3 Mon Sep 17 00:00:00 2001
From: Arthur Paulino <arthurleonardo.ap@gmail.com>
Date: Wed, 10 Jun 2026 08:05:11 -0700
Subject: [PATCH 06/13] IxVM: collapse symbolic-base Nat.rec succ-tower to an
 offset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

try_nat_linear_rec only folded `Nat.rec base (λ_ ih => succ ih) (Lit n)` when
the base was also a literal; a symbolic base declined and fell through to n
iota steps that materialize succ^n(base). Reduce the symbolic-base case
directly to the offset primitive `Nat.add base (Lit n)` instead.

This keeps the value in the same compact `base + n` form a literal already has,
so def-eq comparing it converges instead of descending n unary succ layers (the
UTF-8 `x + 0xC0` reduction class). Offset magnitude stays KLimbs; only the
nat_add top-position index is a (small) G.

Measured on a `x + D = Nat.rec x succ-step D` benchmark: the unary def-eq
descent collapses (nat_succ_of D -> ~constant), ~-15% total FFT at D=512. The
remaining cost is the in-whnf chain reduction (substitution), unchanged here.
---
 Ix/IxVM/Kernel/Whnf.lean | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/Ix/IxVM/Kernel/Whnf.lean b/Ix/IxVM/Kernel/Whnf.lean
index 8a50f6c6..52ff6962 100644
--- a/Ix/IxVM/Kernel/Whnf.lean
+++ b/Ix/IxVM/Kernel/Whnf.lean
@@ -457,10 +457,29 @@ def whnf := ⟦
               1 =>
                 let raw_base = list_lookup(spine, base_idx);
                 let base_w = whnf(raw_base, types, top, addrs);
+                let post = list_drop(spine, major_idx + 1);
                 match try_extract_nat(base_w, addrs) {
-                  (0, _) => (0, store(KExprNode.BVar(0))),
+                  -- Symbolic base: collapse `Nat.rec base (succ-step) (Lit n)`
+                  -- to the offset primitive `Nat.add base (Lit n)` rather than
+                  -- materializing succ^n(base) via n iota steps. This keeps the
+                  -- value in the same compact `base + n` form a literal already
+                  -- has, so def-eq converges instead of descending n unary
+                  -- succ layers (the UTF-8 `x + 0xC0` pathology).
+                  (0, _) =>
+                    match klimbs_is_zero(n_klimbs) {
+                      1 => (1, apply_spine(base_w, post)),
+                      0 =>
+                        match find_addr_idx_safe(nat_add_addr(), addrs, 0) {
+                          (0, _) => (0, store(KExprNode.BVar(0))),
+                          (1, add_idx) =>
+                            let add_const = store(KExprNode.Const(add_idx, store(ListNode.Nil)));
+                            let off = store(KExprNode.App(
+                              store(KExprNode.App(add_const, base_w)),
+                              mk_nat_lit(n_klimbs)));
+                            (1, apply_spine(off, post)),
+                        },
+                    },
                   (1, b_klimbs) =>
-                    let post = list_drop(spine, major_idx + 1);
                     (1, apply_spine(mk_nat_lit(klimbs_add(b_klimbs, n_klimbs)), post)),
                 },
             },

From 5dcab7f85f26f9787ccc9c420b05e227af5e97ed Mon Sep 17 00:00:00 2001
From: Arthur Paulino <arthurleonardo.ap@gmail.com>
Date: Wed, 10 Jun 2026 08:33:55 -0700
Subject: [PATCH 07/13] IxVM: keep Nat base+literal compact (stop succ-tower
 materialization)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reducing `Nat.add x (Lit n)` (symbolic x) delta-unfolded into a succ^n(x)
tower, and the nat_succ dispatch then whnf-cascaded the whole chain — O(n)
substitution per reduction, the dominant cost on Nat-arithmetic proofs (the
UTF-8 codec class).

Two coordinated changes:
- whnf (try_nat_offset_stuck): leave `Nat.add base (Lit n)` (non-literal base,
  nonzero n) stuck as a compact offset instead of unfolding it.
- def-eq (nat_offset_of + try_def_eq_nat): decompose each side to base + KLimbs
  offset (reading `Nat.add base (Lit m)` in O(1), peeling the few succs whnf
  exposes) and decide `base_a ≟ base_b` only when offsets are equal; otherwise
  fall back. All magnitudes stay KLimbs (no Goldilocks collapse).

Measured on `x + D = Nat.rec x succ-step D` (depths 64..512): total FFT goes
from O(D) (328M at D=512) to a depth-independent ~28.5M (-91% at D=512). Full
`lake test -- --ignored ixvm` passes (297 tests; BAD cases still rejected);
a recursor-on-symbolic-offset completeness test reduces correctly.
---
 Ix/IxVM/Kernel/DefEq.lean | 82 ++++++++++++++++++++++++++++-----------
 Ix/IxVM/Kernel/Whnf.lean  | 50 +++++++++++++++++++++++-
 2 files changed, 107 insertions(+), 25 deletions(-)

diff --git a/Ix/IxVM/Kernel/DefEq.lean b/Ix/IxVM/Kernel/DefEq.lean
index 75916064..f8678931 100644
--- a/Ix/IxVM/Kernel/DefEq.lean
+++ b/Ix/IxVM/Kernel/DefEq.lean
@@ -218,36 +218,66 @@ def defEq := ⟦
     }
   }
 
-  -- Mirror: src/ix/kernel/def_eq.rs:930-948 fn nat_succ_of.
-  -- `Lit(n)` n>0 → (1, Lit(n-1)). `App(Const(Nat.succ), arg)` → (1, arg).
-  -- Else (0, _).
-  fn nat_succ_of(e: KExpr, addrs: List‹Addr›) -> (G, KExpr) {
+
+  -- Decompose a WHNF'd Nat into `base + offset` where `base` is the
+  -- non-offset core and `offset` a KLimbs literal. Recognizes:
+  --   Lit n               -> (matched, 0-base, n)
+  --   Nat.succ e          -> base/offset of e, offset+1
+  --   Nat.add e (Lit m)   -> base/offset of e, offset+m
+  -- `matched=1` iff `e` is offset-shaped (succ/add/lit). The few succ layers
+  -- whnf exposes are peeled, but `Nat.add base (Lit m)` is read in O(1) — so a
+  -- `succ^k(x)` chain (which whnf leaves as `succ(Nat.add x (Lit k-1))`)
+  -- decomposes to `(x, k)` in O(1) instead of k unary steps.
+  fn nat_offset_of(e: KExpr, addrs: List‹Addr›) -> (G, KExpr, KLimbs) {
     match load(e) {
       KExprNode.Lit(lit) =>
         match lit {
-          KLiteral.Nat(limbs) =>
-            match klimbs_is_zero(limbs) {
-              1 => (0, store(KExprNode.BVar(0))),
-              0 => (1, mk_nat_lit(klimbs_dec(limbs))),
-            },
-          _ => (0, store(KExprNode.BVar(0))),
+          KLiteral.Nat(n) => (1, mk_nat_lit(store(ListNode.Nil)), n),
+          _ => (0, e, store(ListNode.Nil)),
         },
       KExprNode.App(f, a) =>
         match load(f) {
           KExprNode.Const(idx, _) =>
             match address_eq(list_lookup(addrs, idx), nat_succ_addr()) {
-              1 => (1, a),
-              0 => (0, store(KExprNode.BVar(0))),
+              1 =>
+                match nat_offset_of(a, addrs) {
+                  (_, base, o) => (1, base, klimbs_succ(o)),
+                },
+              0 => (0, e, store(ListNode.Nil)),
             },
-          _ => (0, store(KExprNode.BVar(0))),
+          KExprNode.App(g, x) =>
+            match load(g) {
+              KExprNode.Const(idx, _) =>
+                match address_eq(list_lookup(addrs, idx), nat_add_addr()) {
+                  1 =>
+                    match load(a) {
+                      KExprNode.Lit(alit) =>
+                        match alit {
+                          KLiteral.Nat(m) =>
+                            match nat_offset_of(x, addrs) {
+                              (1, base, o) => (1, base, klimbs_add(o, m)),
+                              (0, _, _) => (1, x, m),
+                            },
+                          _ => (0, e, store(ListNode.Nil)),
+                        },
+                      _ => (0, e, store(ListNode.Nil)),
+                    },
+                  0 => (0, e, store(ListNode.Nil)),
+                },
+              _ => (0, e, store(ListNode.Nil)),
+            },
+          _ => (0, e, store(ListNode.Nil)),
         },
-      _ => (0, store(KExprNode.BVar(0))),
+      _ => (0, e, store(ListNode.Nil)),
     }
   }
 
-  -- Mirror: src/ix/kernel/def_eq.rs:953-995 is_def_eq_nat / try_def_eq_offset.
-  -- Returns (matched, eq). `matched=1` iff both sides are nat-shaped (both
-  -- zero, both succ-headed, or both literals); `eq` is the verdict.
+  -- Mirror: src/ix/kernel/def_eq.rs:953-995 is_def_eq_nat / try_def_eq_offset,
+  -- generalized to offset form. Returns (matched, eq). Conservative: only
+  -- decides when both sides are offset-shaped with EQUAL offsets (then the
+  -- verdict is `base_a ≟ base_b`, sound because `+k` is injective); differing
+  -- offsets or non-offset shapes fall back (matched=0) to the generic path.
+  -- Collapses `succ^k(x) ≟ succ^k(x)` from k unary steps to one klimbs compare.
   fn try_def_eq_nat(a: KExpr, b: KExpr, types: List‹KExpr›,
                      top: List‹&KConstantInfo›,
                      addrs: List‹Addr›) -> (G, G) {
@@ -256,13 +286,19 @@ def defEq := ⟦
     match za * zb {
       1 => (1, 1),
       0 =>
-        match nat_succ_of(a, addrs) {
-          (1, ap) =>
-            match nat_succ_of(b, addrs) {
-              (1, bp) => (1, k_is_def_eq(ap, bp, types, top, addrs)),
-              _ => (0, 0),
+        match nat_offset_of(a, addrs) {
+          (ma, ba, oa) =>
+            match nat_offset_of(b, addrs) {
+              (mb, bb, ob) =>
+                match ma * mb {
+                  0 => (0, 0),
+                  _ =>
+                    match klimbs_eq(oa, ob) {
+                      0 => (0, 0),
+                      1 => (1, k_is_def_eq(ba, bb, types, top, addrs)),
+                    },
+                },
             },
-          _ => (0, 0),
         },
     }
   }
diff --git a/Ix/IxVM/Kernel/Whnf.lean b/Ix/IxVM/Kernel/Whnf.lean
index 52ff6962..193ede68 100644
--- a/Ix/IxVM/Kernel/Whnf.lean
+++ b/Ix/IxVM/Kernel/Whnf.lean
@@ -141,6 +141,44 @@ def whnf := ⟦
     }
   }
 
+  -- If `head` is `Nat.add` applied to exactly (non-literal base, nonzero Lit n),
+  -- return (1, `Nat.add base (Lit n)`) in canonical form so whnf leaves it stuck
+  -- as a compact offset instead of delta-unfolding it into succ^n(base). All
+  -- magnitudes stay KLimbs. `(0, _)` means "not this shape — proceed normally".
+  fn try_nat_offset_stuck(head: KExpr, spine: List‹KExpr›, types: List‹KExpr›,
+                          top: List‹&KConstantInfo›, addrs: List‹Addr›) -> (G, KExpr) {
+    let head_addr = nat_add_addr();
+    match load(head) {
+      KExprNode.Const(idx, _) =>
+        match address_eq(list_lookup(addrs, idx), head_addr) {
+          0 => (0, store(KExprNode.BVar(0))),
+          1 =>
+            match list_length(spine) {
+              2 =>
+                let a0_w = whnf(list_lookup(spine, 0), types, top, addrs);
+                let a1_w = whnf(list_lookup(spine, 1), types, top, addrs);
+                match try_extract_nat(a1_w, addrs) {
+                  (0, _) => (0, store(KExprNode.BVar(0))),
+                  (1, n) =>
+                    match klimbs_is_zero(n) {
+                      1 => (0, store(KExprNode.BVar(0))),
+                      0 =>
+                        match try_extract_nat(a0_w, addrs) {
+                          (1, _) => (0, store(KExprNode.BVar(0))),
+                          (0, _) =>
+                            (1, store(KExprNode.App(
+                                  store(KExprNode.App(head, a0_w)),
+                                  mk_nat_lit(n)))),
+                        },
+                    },
+                },
+              _ => (0, store(KExprNode.BVar(0))),
+            },
+        },
+      _ => (0, store(KExprNode.BVar(0))),
+    }
+  }
+
   -- Const-head WHNF dispatch, split out of `whnf_with_spine` (see its Const arm).
   -- `head` is the original `Const(idx, lvls)` KExpr, passed for the stuck
   -- `apply_spine(head, spine)` fallbacks.
@@ -201,8 +239,16 @@ def whnf := ⟦
                                 -- and `check_positivity_aug` misclassifies.
                                 match ci {
                                   KConstantInfo.Defn(_, _, value, _, _) =>
-                                    let body = expr_inst_levels(value, lvls);
-                                    whnf_with_spine(body, spine, types, top, addrs),
+                                    -- Keep `Nat.add base (Lit n)` (symbolic base)
+                                    -- stuck as a compact offset instead of
+                                    -- delta-unfolding into a succ^n tower. Pairs
+                                    -- with offset-aware def-eq.
+                                    match try_nat_offset_stuck(head, spine, types, top, addrs) {
+                                      (1, stuck) => stuck,
+                                      (0, _) =>
+                                        let body = expr_inst_levels(value, lvls);
+                                        whnf_with_spine(body, spine, types, top, addrs),
+                                    },
                                   KConstantInfo.Thm(_, _, _) => apply_spine(head, spine),
                                   _ => apply_spine(head, spine),
                                 },

From f7cfe23bdc1530f92387c6177348aba5895c5a75 Mon Sep 17 00:00:00 2001
From: Arthur Paulino <arthurleonardo.ap@gmail.com>
Date: Wed, 10 Jun 2026 11:14:07 -0700
Subject: [PATCH 08/13] IxVM: keep symbolic Nat.div/mod stuck (stop
 division-tower blowup)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`Nat.div x (Lit k)` / `Nat.mod x (Lit k)` with a symbolic base delta-unfolded
the division algorithm and expanded hugely, even though the result is
irreducible for a symbolic base. `Nat.shiftRight x k` unfolds to k nested
`Nat.div _ 2`, so a symbolic `>>>` materialized a deep, super-linear tower —
the dominant cost in the UTF-8 codec's `c >>> 6/12/18` encoding.

Generalize `try_nat_offset_stuck` (which already keeps `Nat.add base (Lit n)`
compact) to also leave `Nat.div`/`Nat.mod base (Lit k)` (k ≥ 2) stuck. Thresholds
keep `x/1 = x` and `x/0 = 0` reducing. Both `x >>> 18` and `(x >>> 9) >>> 9` then
reduce to the same nested-stuck-div form, so def-eq stays structural and the
identity still holds.

Measured on `x >>> 18 = (x >>> 9) >>> 9`: total FFT 15.65G -> 455M (-97%), now
depth-independent (matches `x >>> 6`). No regression on the Nat.add reproducers;
full `lake test -- --ignored ixvm` passes (297 tests).
---
 Ix/IxVM/Kernel/Whnf.lean | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/Ix/IxVM/Kernel/Whnf.lean b/Ix/IxVM/Kernel/Whnf.lean
index 193ede68..b2671d8e 100644
--- a/Ix/IxVM/Kernel/Whnf.lean
+++ b/Ix/IxVM/Kernel/Whnf.lean
@@ -141,18 +141,25 @@ def whnf := ⟦
     }
   }
 
-  -- If `head` is `Nat.add` applied to exactly (non-literal base, nonzero Lit n),
-  -- return (1, `Nat.add base (Lit n)`) in canonical form so whnf leaves it stuck
-  -- as a compact offset instead of delta-unfolding it into succ^n(base). All
-  -- magnitudes stay KLimbs. `(0, _)` means "not this shape — proceed normally".
+  -- If `head` is a Nat primitive (`Nat.add` / `Nat.div` / `Nat.mod`) applied to
+  -- exactly (non-literal base, literal second arg), return (1, the same op in
+  -- canonical form) so whnf leaves it STUCK instead of delta-unfolding it. This
+  -- stops `Nat.add x n` from materializing succ^n(x), and `Nat.div`/`Nat.mod x n`
+  -- (n ≥ 2) from expanding the division algorithm — both are irreducible for a
+  -- symbolic base, so the compact form IS the normal form. `Nat.shiftRight x k`
+  -- unfolds to k nested `Nat.div _ 2`, which now stay stuck. Thresholds: `add`
+  -- keeps any nonzero n; `div`/`mod` keep n ≥ 2 (so `x/1 = x`, `x/0 = 0` still
+  -- reduce). All magnitudes stay KLimbs. `(0, _)` = "not this shape".
   fn try_nat_offset_stuck(head: KExpr, spine: List‹KExpr›, types: List‹KExpr›,
                           top: List‹&KConstantInfo›, addrs: List‹Addr›) -> (G, KExpr) {
-    let head_addr = nat_add_addr();
     match load(head) {
       KExprNode.Const(idx, _) =>
-        match address_eq(list_lookup(addrs, idx), head_addr) {
+        let ca = list_lookup(addrs, idx);
+        let is_add = address_eq(ca, nat_add_addr());
+        let is_divmod = address_eq(ca, nat_div_addr()) + address_eq(ca, nat_mod_addr());
+        match is_add + is_divmod {
           0 => (0, store(KExprNode.BVar(0))),
-          1 =>
+          _ =>
             match list_length(spine) {
               2 =>
                 let a0_w = whnf(list_lookup(spine, 0), types, top, addrs);
@@ -160,8 +167,9 @@ def whnf := ⟦
                 match try_extract_nat(a1_w, addrs) {
                   (0, _) => (0, store(KExprNode.BVar(0))),
                   (1, n) =>
-                    match klimbs_is_zero(n) {
-                      1 => (0, store(KExprNode.BVar(0))),
+                    -- reject n=0 (all ops) and n=1 (div/mod only).
+                    let bad = klimbs_is_zero(n) + is_divmod * klimbs_is_zero(klimbs_dec(n));
+                    match bad {
                       0 =>
                         match try_extract_nat(a0_w, addrs) {
                           (1, _) => (0, store(KExprNode.BVar(0))),
@@ -170,6 +178,7 @@ def whnf := ⟦
                                   store(KExprNode.App(head, a0_w)),
                                   mk_nat_lit(n)))),
                         },
+                      _ => (0, store(KExprNode.BVar(0))),
                     },
                 },
               _ => (0, store(KExprNode.BVar(0))),

From fa3447e804011810800372cfa442678006f0b32a Mon Sep 17 00:00:00 2001
From: Arthur Paulino <arthurleonardo.ap@gmail.com>
Date: Wed, 10 Jun 2026 15:52:56 -0700
Subject: [PATCH 09/13] IxVM: narrow Ixon expr deserialization (Let split +
 telescope by &Expr)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ingress deserialization (parsing every closure const's Ixon bytes into Expr
trees) is the dominant cost when checking large-closure constants. Two width
cuts on the hottest parsers:

- get_expr: the Let arm (three recursive get_expr calls) set the function width
  though Let is rare. Factored into get_expr_let — width 102 -> 75.
- get_app_telescope: passed `func` by value (the wide Expr union) on every
  recursion step. Pass it by `&Expr`, loaded only at the base case — width
  95 -> 78.

Measured by profiling ingress of the `ByteArray.utf8DecodeChar?…` closure
(2000 consts): get_expr 766M -> 563M, get_app_telescope 477M -> 392M, total
ingress FFT 3.13G -> 2.85G. Full `lake test -- --ignored ixvm` passes (297).
---
 Ix/IxVM/IxonDeserialize.lean | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/Ix/IxVM/IxonDeserialize.lean b/Ix/IxVM/IxonDeserialize.lean
index 59aa0b7d..4a7cf34b 100644
--- a/Ix/IxVM/IxonDeserialize.lean
+++ b/Ix/IxVM/IxonDeserialize.lean
@@ -101,15 +101,17 @@ def ixonDeserialize := ⟦
   -- Expression deserialization
   -- ============================================================================
 
-  -- App telescope: read count args, wrapping func in App nodes
-  fn get_app_telescope(func: Expr, stream: ByteStream, count: U64) -> (Expr, ByteStream) {
+  -- App telescope: read count args, wrapping func in App nodes. `func` is passed
+  -- by pointer (loaded only at the base case) so the recursion doesn't carry the
+  -- wide `Expr` union by value on every row.
+  fn get_app_telescope(func: &Expr, stream: ByteStream, count: U64) -> (Expr, ByteStream) {
     let is_zero = u64_is_zero(count);
     match is_zero {
-      1 => (func, stream),
+      1 => (load(func), stream),
       0 =>
         let (arg, s) = get_expr(stream);
-        let app = Expr.App(store(func), store(arg));
-        get_app_telescope(app, s, relaxed_u64_pred(count)),
+        let app = Expr.App(func, store(arg));
+        get_app_telescope(store(app), s, relaxed_u64_pred(count)),
     }
   }
 
@@ -180,7 +182,7 @@ def ixonDeserialize := ⟦
       -- App: Tag4(0x7, count) + func + args...
       0x7 =>
         let (func, s2) = get_expr(s);
-        get_app_telescope(func, s2, size),
+        get_app_telescope(store(func), s2, size),
 
       -- Lam: Tag4(0x8, count) + types... + body
       0x8 => get_lam_telescope(s, size),
@@ -189,17 +191,23 @@ def ixonDeserialize := ⟦
       0x9 => get_all_telescope(s, size),
 
       -- Let: Tag4(0xA, non_dep) + expr(ty) + expr(val) + expr(body)
-      0xA =>
-        let (ty, s2) = get_expr(s);
-        let (val, s3) = get_expr(s2);
-        let (body, s4) = get_expr(s3);
-        (Expr.Let(size, store(ty), store(val), store(body)), s4),
+      0xA => get_expr_let(s, size),
 
       -- Share: Tag4(0xB, idx)
       0xB => (Expr.Share(size), s),
     }
   }
 
+  -- Let arm of get_expr, split out: three recursive `get_expr` calls make it the
+  -- widest (and a rare) arm, so inlined it taxes every get_expr row.
+  fn get_expr_let(s: ByteStream, size: U64) -> (Expr, ByteStream) {
+    let (ty, s2) = get_expr(s);
+    let (val, s3) = get_expr(s2);
+    let (body, s4) = get_expr(s3);
+    (Expr.Let(size, store(ty), store(val), store(body)), s4)
+  }
+
+
   -- ============================================================================
   -- Universe deserialization
   -- ============================================================================

From d96c738a8527a0e13e01bcd0315dfe36f5812d64 Mon Sep 17 00:00:00 2001
From: Arthur Paulino <arthurleonardo.ap@gmail.com>
Date: Thu, 11 Jun 2026 10:51:52 -0700
Subject: [PATCH 10/13] IxVM: lbr-trimmed context keys for whnf/infer/def-eq
 memoization

Key whnf / k_infer / k_is_def_eq memoization on the loose-bvar-reachable
suffix of the local context (ctx_trim / ctx_reachable), mirroring Rust's
whnf_key / infer_key / def_eq_ctx_key. A closed subterm (lbr 0) keys to
the empty context and shares its result across every binder depth; eager
substitution is unchanged. Fast paths (closed -> Nil, full-reach -> as-is)
keep the boundary cheap.

FFT (lake exe ix check <const> --stats-out): Nat.add_comm 58.28M -> 57.90M
(-0.66%); Int.emod_emod_of_dvd 5.090G -> 4.691G (-7.85%); Init shard 10
4.738G -> 4.721G (-0.37%). 297 ixvm tests pass.
---
 Ix/IxVM/Kernel/DefEq.lean | 17 ++++++++-----
 Ix/IxVM/Kernel/Infer.lean |  8 +++++++
 Ix/IxVM/Kernel/Subst.lean | 50 +++++++++++++++++++++++++++++++++++++++
 Ix/IxVM/Kernel/Whnf.lean  | 18 +++++++++-----
 4 files changed, 81 insertions(+), 12 deletions(-)

diff --git a/Ix/IxVM/Kernel/DefEq.lean b/Ix/IxVM/Kernel/DefEq.lean
index f8678931..f84c9aa4 100644
--- a/Ix/IxVM/Kernel/DefEq.lean
+++ b/Ix/IxVM/Kernel/DefEq.lean
@@ -42,13 +42,19 @@ def defEq := ⟦
     -- Tier 1: pointer equality short-circuit.
     match ptr_val(a) - ptr_val(b) {
       0 => 1,
-      _ =>
+      -- Context-trimmed memo key (mirror Rust `def_eq_ctx_key`): decide on the
+      -- suffix of `types` reachable from either side; closed pairs (lbr 0) key
+      -- to the empty context and share across binder depths.
+      _ => k_is_def_eq_core(a, b, ctx_trim(types, lbr_max(expr_lbr(a), expr_lbr(b))), top, addrs),
+    }
+  }
+
+  fn k_is_def_eq_core(a: KExpr, b: KExpr, types: List‹KExpr›,
+                      top: List‹&KConstantInfo›, addrs: List‹Addr›) -> G {
         -- Tier 1.5: lazy-delta app-congruence pre-WHNF. Mirror:
         -- src/ix/kernel/def_eq.rs:1262-1287 try_def_eq_app. When both
         -- sides share Const(idx, lvls) head with same arg count, recurse
-        -- on args directly — skips delta+beta of the def's body.
-        -- Sound: only accepts when args recursively def-eq; bails to
-        -- WHNF path otherwise.
+        -- on args directly. Sound: only accepts when args recursively def-eq.
         match try_lazy_delta_app(a, b, types, top, addrs) {
           1 => 1,
           0 =>
@@ -100,8 +106,7 @@ def defEq := ⟦
                 },
             },
         },
-        },
-    }
+        }
   }
 
   -- Mirror: src/ix/kernel/def_eq.rs:801-818 fn try_proof_irrel.
diff --git a/Ix/IxVM/Kernel/Infer.lean b/Ix/IxVM/Kernel/Infer.lean
index 0aafb651..a549d3b2 100644
--- a/Ix/IxVM/Kernel/Infer.lean
+++ b/Ix/IxVM/Kernel/Infer.lean
@@ -52,8 +52,16 @@ def infer := ⟦
   -- Lvls placed at outermost; level-params instantiation handled where
   -- the constant's declared type is fetched.
   -- ============================================================================
+  -- Context-trimmed memo wrapper (mirror Rust `infer_key`): key inference on
+  -- the suffix of `types` reachable from `e`'s loose-bvar range, so a closed
+  -- subterm (`lbr == 0`) shares its inferred type across every binder depth.
   fn k_infer(e: KExpr, types: List‹KExpr›,
              top: List‹&KConstantInfo›, addrs: List‹Addr›) -> KExpr {
+    k_infer_core(e, ctx_trim(types, expr_lbr(e)), top, addrs)
+  }
+
+  fn k_infer_core(e: KExpr, types: List‹KExpr›,
+                  top: List‹&KConstantInfo›, addrs: List‹Addr›) -> KExpr {
     match load(e) {
       KExprNode.BVar(i) => types_lookup(types, i),
 
diff --git a/Ix/IxVM/Kernel/Subst.lean b/Ix/IxVM/Kernel/Subst.lean
index b3776274..e262314d 100644
--- a/Ix/IxVM/Kernel/Subst.lean
+++ b/Ix/IxVM/Kernel/Subst.lean
@@ -77,6 +77,56 @@ def subst := ⟦
     }
   }
 
+  fn lbr_min(a: G, b: G) -> G {
+    match u32_less_than(a, b) {
+      1 => a,
+      0 => b,
+    }
+  }
+
+  -- Number of leading `types` frames an expr with loose-bvar-range `base` can
+  -- reach. `BVar(i)` reads `types[i]`, and a kept frame's own stored type can
+  -- reference further-out frames, so expand `need` to the fixpoint that closes
+  -- over every kept frame's loose range. Mirror Rust `ctx_addr_for_lbr`
+  -- (tc.rs). Trimming `types` to this prefix canonicalizes the whnf/infer/
+  -- def-eq memo key: a closed expr (base 0) keys to the empty context and
+  -- shares across all binder depths. Memoized on (types, base).
+  fn ctx_reachable(types: List‹KExpr›, base: G) -> G {
+    let len = list_length(types);
+    ctx_reachable_fix(types, len, lbr_min(base, len))
+  }
+
+  fn ctx_reachable_fix(types: List‹KExpr›, len: G, need: G) -> G {
+    let scanned = lbr_min(ctx_reachable_scan(types, need, 0, need), len);
+    match u32_less_than(need, scanned) {
+      1 => ctx_reachable_fix(types, len, scanned),
+      0 => need,
+    }
+  }
+
+  fn ctx_reachable_scan(types: List‹KExpr›, limit: G, i: G, acc: G) -> G {
+    match u32_less_than(i, limit) {
+      0 => acc,
+      1 =>
+        let fi = (i + 1) + expr_lbr(list_lookup(types, i));
+        ctx_reachable_scan(types, limit, i + 1, lbr_max(acc, fi)),
+    }
+  }
+
+  -- Trim `types` to the suffix reachable from an expr with loose range `base`.
+  -- Fast paths skip the fixpoint: `base == 0` (closed) → empty context;
+  -- `base >= len` → nothing to trim. Only partially-open exprs pay the scan.
+  fn ctx_trim(types: List‹KExpr›, base: G) -> List‹KExpr› {
+    match base {
+      0 => store(ListNode.Nil),
+      _ =>
+        match u32_less_than(base, list_length(types)) {
+          0 => types,
+          1 => list_take(types, ctx_reachable(types, base)),
+        },
+    }
+  }
+
   -- ============================================================================
   -- expr_lift
   --
diff --git a/Ix/IxVM/Kernel/Whnf.lean b/Ix/IxVM/Kernel/Whnf.lean
index b2671d8e..eea288f9 100644
--- a/Ix/IxVM/Kernel/Whnf.lean
+++ b/Ix/IxVM/Kernel/Whnf.lean
@@ -277,18 +277,24 @@ def whnf := ⟦
   fn whnf(e: KExpr, types: List‹KExpr›,
           top: List‹&KConstantInfo›, addrs: List‹Addr›) -> KExpr {
     -- Fast path: trivial whnf normal forms. Srt / Lit / Lam / Forall / BVar
-    -- never reduce — skip collect_spine + dispatch.
+    -- never reduce — skip collect_spine + dispatch (and the context trim).
     match load(e) {
       KExprNode.Srt(_) => e,
       KExprNode.Lit(_) => e,
       KExprNode.Lam(_, _) => e,
       KExprNode.Forall(_, _) => e,
       KExprNode.BVar(_) => e,
-      _ =>
-        let pair = collect_spine(e);
-        match pair {
-          (head, spine) => whnf_with_spine(head, spine, types, top, addrs),
-        },
+      -- Context-trimmed memo key (mirror Rust `whnf_key`): reduce on the
+      -- reachable suffix so a closed term shares its WHNF across binder depths.
+      _ => whnf_core(e, ctx_trim(types, expr_lbr(e)), top, addrs),
+    }
+  }
+
+  fn whnf_core(e: KExpr, types: List‹KExpr›,
+               top: List‹&KConstantInfo›, addrs: List‹Addr›) -> KExpr {
+    let pair = collect_spine(e);
+    match pair {
+      (head, spine) => whnf_with_spine(head, spine, types, top, addrs),
     }
   }
 

From 6ec124a215261c410ef57b9231f5aea85e7afb87 Mon Sep 17 00:00:00 2001
From: Arthur Paulino <arthurleonardo.ap@gmail.com>
Date: Thu, 11 Jun 2026 10:52:20 -0700
Subject: [PATCH 11/13] ix CLI: resolve private Lean.Name args in check/prove

parseName can't rebuild a private name (_private.M.0.foo) from its
display string -- the marker / scope-index components don't round-trip
through naive dot-splitting. Add toString fallbacks: resolveName for the
compiled-in env and resolveIxeAddr for a .ixe env, so check and prove
(which share forEachClaim) resolve names like
_private.Init.Data.Vector.Extract.0.Vector.extract_append._proof_1
across both the compiled-in and --ixe paths.
---
 Ix/Cli/CheckCmd.lean | 48 +++++++++++++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/Ix/Cli/CheckCmd.lean b/Ix/Cli/CheckCmd.lean
index 8e72bc75..5fab8cbf 100644
--- a/Ix/Cli/CheckCmd.lean
+++ b/Ix/Cli/CheckCmd.lean
@@ -45,6 +45,16 @@ def parseName (arg : String) : Lean.Name :=
       | some n => Lean.Name.mkNum acc n
       | none   => Lean.Name.mkStr acc s
 
+/-- Resolve a CLI name argument against the env. `parseName` can't rebuild
+    private names (`_private.M.0.foo`) — the marker/scope-index components
+    don't round-trip through naive dot-splitting. So if the parsed name
+    isn't present, fall back to matching the arg against each constant's
+    `toString` (the displayed form the user copied). -/
+def resolveName (env : Lean.Environment) (arg : String) : Option Lean.Name :=
+  let parsed := parseName arg
+  if env.constants.contains parsed then some parsed
+  else (env.constants.toList.find? (fun (k, _) => toString k == arg)).map (·.1)
+
 def addrOfHex! (label : String) (s : String) : IO Address := do
   match Address.fromString s with
   | some a => pure a
@@ -88,6 +98,16 @@ partial def ixNameToLeanName : Ix.Name → Lean.Name
   | .str p s _ => .str (ixNameToLeanName p) s
   | .num p n _ => .num (ixNameToLeanName p) n
 
+/-- Resolve a CLI name argument against a `.ixe` env's named map. Like
+    `resolveName` for the compiled-in env: `parseName` can't rebuild private
+    names, so when the direct lookup misses, fall back to matching the arg
+    against each named constant's displayed `toString`. -/
+def resolveIxeAddr (ixonEnv : Ixon.Env) (arg : String) : Option Address :=
+  match ixonEnv.getAddr? (Ix.Name.fromLeanName (parseName arg)) with
+  | some a => some a
+  | none =>
+    (ixonEnv.named.toList.find? (fun (k, _) => toString (ixNameToLeanName k) == arg)).map (·.2.addr)
+
 /-- Build a `ClaimWitness` for the `verify_claim` entrypoint against
     `Ix.Claim.check addr none` (full transitive typecheck of the
     target's closure). -/
@@ -189,14 +209,13 @@ def forEachClaim
           if !keepGoing then break
     else
       for arg in names do
-        let name := parseName arg
-        match ixonEnv.getAddr? (Ix.Name.fromLeanName name) with
+        match resolveIxeAddr ixonEnv arg with
         | none =>
-          IO.eprintln s!"{name} not found in {path}"
-          failures := failures.push (toString name)
+          IO.eprintln s!"{arg} not found in {path}"
+          failures := failures.push arg
           if !keepGoing then break
         | some addr =>
-          let label := toString name
+          let label := arg
           let claim := Ix.Claim.check addr none
           let witness ← mkWitness addr ixonEnv
           if (← runOne claim witness label) ≠ 0 then
@@ -224,17 +243,18 @@ def forEachClaim
           if !keepGoing then break
     else
       for arg in names do
-        let name := parseName arg
-        if !env.constants.contains name then
-          IO.eprintln s!"{name} not found"
-          failures := failures.push (toString name)
+        match resolveName env arg with
+        | none =>
+          IO.eprintln s!"{arg} not found"
+          failures := failures.push arg
           if !keepGoing then break
           else continue
-        let label := toString name
-        let (claim, witness) ← buildOne name
-        if (← runOne claim witness label) ≠ 0 then
-          failures := failures.push label
-          if !keepGoing then break
+        | some name =>
+          let label := toString name
+          let (claim, witness) ← buildOne name
+          if (← runOne claim witness label) ≠ 0 then
+            failures := failures.push label
+            if !keepGoing then break
 
   if failures.isEmpty then pure 0
   else

From 32036918a43d6186137f448190069affedfe997e Mon Sep 17 00:00:00 2001
From: Arthur Paulino <arthurleonardo.ap@gmail.com>
Date: Thu, 11 Jun 2026 12:29:04 -0700
Subject: [PATCH 12/13] IxVM: multi-arg beta with narrow substitution walk
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

whnf_apply_beta substituted one spine arg per expr_inst1, re-walking the
(shrinking) body for each, so a K-arg application `(λλλ.body) x y z` walked
the body K times. Mirror Rust whnf.rs:541-567: peel the whole telescope
(peel_beta) and substitute all consumed args in ONE expr_inst_many walk
(simul_subst semantics). Single-arg betas stay on the cheap expr_inst1 path,
so short telescopes don't pay the list overhead (why the earlier
unconditional simul_subst regressed). Hot/cold split expr_inst_many_walk's
BVar arm (list_length / list_lookup / expr_lift) into expr_inst_many_bvar so
the hot App/Lam walk stays narrow.

Substitution was 53% of FFT on omega-heavy proofs. FFT (ix check <const>):
_private.…Vector.extract_append._proof_1 56.64G -> 38.31G (-32.4%);
Nat.add_comm 57.90M -> 56.17M; Init shard 10 4.721G -> 4.668G (-1.1%);
shard 9 16.346G -> 16.314G. 297 ixvm tests pass.
---
 Ix/IxVM/Kernel/Subst.lean | 62 +++++++++++++++++++++++++++++++++++++++
 Ix/IxVM/Kernel/Whnf.lean  | 37 ++++++++++++++++++-----
 2 files changed, 92 insertions(+), 7 deletions(-)

diff --git a/Ix/IxVM/Kernel/Subst.lean b/Ix/IxVM/Kernel/Subst.lean
index e262314d..e37d1e0d 100644
--- a/Ix/IxVM/Kernel/Subst.lean
+++ b/Ix/IxVM/Kernel/Subst.lean
@@ -238,6 +238,68 @@ def subst := ⟦
         store(KExprNode.Proj(tidx, fidx, expr_inst1(e1, arg, depth))),
     }
   }
+
+  -- ============================================================================
+  -- expr_inst_many — simultaneous substitution of `substs` for the `n` binders
+  -- at `depth` in ONE walk (mirror `src/ix/kernel/subst.rs::simul_subst`). Used
+  -- for multi-arg beta: `(λλλ. body) x y z` substitutes all three at once
+  -- instead of three separate `expr_inst1` re-walks of the body.
+  --
+  -- `substs` is innermost-first: `substs[0]` replaces `BVar(depth)`, …,
+  -- `substs[n-1]` replaces `BVar(depth+n-1)` (each lifted by `depth`); a free
+  -- `BVar(i ≥ depth+n)` shifts down by `n`. Same fast-path as `expr_inst1`.
+  -- ============================================================================
+  fn expr_inst_many(e: KExpr, substs: List‹KExpr›, depth: G) -> KExpr {
+    let l = expr_lbr(e);
+    match u32_less_than(depth, l) {
+      0 => e,
+      1 => expr_inst_many_walk(e, substs, depth),
+    }
+  }
+
+  -- Cold BVar arm of `expr_inst_many_walk`, split into its own circuit so the
+  -- hot App/Lam/… walk stays narrow (the `list_length` / `list_lookup` /
+  -- `expr_lift` machinery only charges the BVar rows). Mirror the hot/cold
+  -- split pattern used for `address_eq` / `whnf_with_spine`.
+  fn expr_inst_many_bvar(i: G, substs: List‹KExpr›, depth: G) -> KExpr {
+    match u32_less_than(i, depth) {
+      1 => store(KExprNode.BVar(i)),
+      0 =>
+        let n = list_length(substs);
+        match u32_less_than(i, depth + n) {
+          1 => expr_lift(list_lookup(substs, i - depth), depth, 0),
+          0 => store(KExprNode.BVar(i - n)),
+        },
+    }
+  }
+
+  fn expr_inst_many_walk(e: KExpr, substs: List‹KExpr›, depth: G) -> KExpr {
+    match load(e) {
+      KExprNode.BVar(i) => expr_inst_many_bvar(i, substs, depth),
+      KExprNode.Srt(l) => store(KExprNode.Srt(l)),
+      KExprNode.Const(idx, lvls) => store(KExprNode.Const(idx, lvls)),
+      KExprNode.App(f, a) =>
+        store(KExprNode.App(
+          expr_inst_many(f, substs, depth),
+          expr_inst_many(a, substs, depth))),
+      KExprNode.Lam(ty, body) =>
+        store(KExprNode.Lam(
+          expr_inst_many(ty, substs, depth),
+          expr_inst_many(body, substs, depth + 1))),
+      KExprNode.Forall(ty, body) =>
+        store(KExprNode.Forall(
+          expr_inst_many(ty, substs, depth),
+          expr_inst_many(body, substs, depth + 1))),
+      KExprNode.Let(ty, val, body) =>
+        store(KExprNode.Let(
+          expr_inst_many(ty, substs, depth),
+          expr_inst_many(val, substs, depth),
+          expr_inst_many(body, substs, depth + 1))),
+      KExprNode.Lit(lit) => store(KExprNode.Lit(lit)),
+      KExprNode.Proj(tidx, fidx, e1) =>
+        store(KExprNode.Proj(tidx, fidx, expr_inst_many(e1, substs, depth))),
+    }
+  }
 ⟧
 
 end IxVM
diff --git a/Ix/IxVM/Kernel/Whnf.lean b/Ix/IxVM/Kernel/Whnf.lean
index eea288f9..6b11b2f2 100644
--- a/Ix/IxVM/Kernel/Whnf.lean
+++ b/Ix/IxVM/Kernel/Whnf.lean
@@ -57,9 +57,22 @@ def whnf := ⟦
   -- ============================================================================
   -- Beta
   -- ============================================================================
-  fn beta_step(lam: KExpr, arg: KExpr) -> KExpr {
+
+  -- Peel a beta telescope: consume one spine arg per leading `Lam` of `lam`,
+  -- accumulating the consumed args innermost-first in `acc` (prepend), until
+  -- the head is no longer a `Lam` or the spine is empty. Returns the deepest
+  -- body, the consumed args (innermost-first, ready for `expr_inst_many`), and
+  -- the unconsumed spine tail.
+  fn peel_beta(lam: KExpr, spine: List‹KExpr›, acc: List‹KExpr›)
+              -> (KExpr, List‹KExpr›, List‹KExpr›) {
     match load(lam) {
-      KExprNode.Lam(_, body) => expr_inst1(body, arg, 0),
+      KExprNode.Lam(_, body) =>
+        match load(spine) {
+          ListNode.Cons(a, rest) =>
+            peel_beta(body, rest, store(ListNode.Cons(a, acc))),
+          ListNode.Nil => (lam, acc, spine),
+        },
+      _ => (lam, acc, spine),
     }
   }
 
@@ -67,13 +80,23 @@ def whnf := ⟦
   -- WHNF main loop with `prims` for nat-primitive dispatch.
   -- ============================================================================
 
+  -- Multi-arg beta (mirror src/ix/kernel/whnf.rs:541-567): peel the whole
+  -- telescope and substitute all consumed args in ONE `expr_inst_many` walk,
+  -- instead of one `expr_inst1` re-walk of the body per arg. Single-arg betas
+  -- stay on the cheap `expr_inst1` path (no list overhead).
   fn whnf_apply_beta(spine: List‹KExpr›, lam: KExpr, types: List‹KExpr›,
                      top: List‹&KConstantInfo›, addrs: List‹Addr›) -> KExpr {
-    match load(spine) {
-      ListNode.Nil => lam,
-      ListNode.Cons(a, rest) =>
-        let next = beta_step(lam, a);
-        whnf_with_spine(next, rest, types, top, addrs),
+    match peel_beta(lam, spine, store(ListNode.Nil)) {
+      (deep, consumed, rest) =>
+        match list_length(consumed) {
+          0 => apply_spine(lam, spine),
+          1 =>
+            let body2 = expr_inst1(deep, list_lookup(consumed, 0), 0);
+            whnf_with_spine(body2, rest, types, top, addrs),
+          _ =>
+            let body2 = expr_inst_many(deep, consumed, 0);
+            whnf_with_spine(body2, rest, types, top, addrs),
+        },
     }
   }
 

From 87cb8b0676e26a08000a347d0f5a679791fb44a3 Mon Sep 17 00:00:00 2001
From: Arthur Paulino <arthurleonardo.ap@gmail.com>
Date: Thu, 11 Jun 2026 16:03:47 -0700
Subject: [PATCH 13/13] IxVM: gate the primitive gauntlet behind a memoized
 prim_any_addr
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

whnf_const_head ran 5 try_* dispatchers (nat / str / bitvec / native /
decidable) in sequence on every Const head, each comparing the head address
against its family's primitive addresses — wasted on the common non-primitive
head, and worse, several try_* speculatively WHNF their args (to test for
literals / decidables), cascading into deep reductions that are then thrown
away. Factor the chain into try_address_primitives and gate it behind
prim_any_addr: a memoized (per head address) sum of the 43 head-dispatched
primitive address_eq checks — 1 only for a real primitive, so a 0 skips the
whole gauntlet straight to projection-definition / delta.

Validated by a differential assert (prim_any_addr must be 1 whenever the
gauntlet matched) run over the 297-test suite + Int.emod_emod_of_dvd +
Vector.extract_append._proof_1; it caught two missing addresses
(string_utf8_byte_size, punit_size_of_1), then passed clean across millions of
reductions. Assert removed after validation.

FFT (ix check <const>): _private.…Vector.extract_append._proof_1
38.31G -> 28.02G (-26.8%); Int.emod_emod_of_dvd 4.69G -> 3.97G (-15%);
Nat.add_comm 56.17M -> 56.06M; Init shard 9 16.314G -> 16.279G; shard 10
4.668G -> 4.659G. 297 ixvm tests pass.
---
 Ix/IxVM/Kernel/Primitive.lean |  55 ++++++++++++++++++
 Ix/IxVM/Kernel/Whnf.lean      | 102 ++++++++++++++++++----------------
 2 files changed, 110 insertions(+), 47 deletions(-)

diff --git a/Ix/IxVM/Kernel/Primitive.lean b/Ix/IxVM/Kernel/Primitive.lean
index 1e7bffec..9300ea1c 100644
--- a/Ix/IxVM/Kernel/Primitive.lean
+++ b/Ix/IxVM/Kernel/Primitive.lean
@@ -1307,6 +1307,61 @@ def primitive := ⟦
     store(KExprNode.Lit(KLiteral.Nat(n)))
   }
 
+  -- 1 iff `head_addr` is one of the head-dispatched primitive ops checked by
+  -- `try_nat_dispatch` / `try_str_dispatch` / `try_bitvec_dispatch` /
+  -- `try_reduce_native` / `try_reduce_decidable`. These addresses are mutually
+  -- exclusive (a const has one content address), so the sum is 0 or 1.
+  -- Memoized on `head_addr` (a content pointer): computed once per distinct
+  -- const, vs the per-whnf gauntlet of 5 `try_*` calls. Lets `whnf_const_head`
+  -- skip the gauntlet for the common non-primitive head. MUST stay in sync
+  -- with the `address_eq(head_addr, …)` checks in those functions — a
+  -- differential `assert` guards that during development.
+  fn prim_any_addr(head_addr: Addr) -> G {
+    address_eq(head_addr, nat_add_addr())
+    + address_eq(head_addr, nat_sub_addr())
+    + address_eq(head_addr, nat_mul_addr())
+    + address_eq(head_addr, nat_div_addr())
+    + address_eq(head_addr, nat_mod_addr())
+    + address_eq(head_addr, nat_pow_addr())
+    + address_eq(head_addr, nat_gcd_addr())
+    + address_eq(head_addr, nat_land_addr())
+    + address_eq(head_addr, nat_lor_addr())
+    + address_eq(head_addr, nat_xor_addr())
+    + address_eq(head_addr, nat_shift_left_addr())
+    + address_eq(head_addr, nat_shift_right_addr())
+    + address_eq(head_addr, nat_succ_addr())
+    + address_eq(head_addr, nat_pred_addr())
+    + address_eq(head_addr, nat_beq_addr())
+    + address_eq(head_addr, nat_ble_addr())
+    + address_eq(head_addr, nat_dec_eq_addr())
+    + address_eq(head_addr, nat_dec_le_addr())
+    + address_eq(head_addr, nat_dec_lt_addr())
+    + address_eq(head_addr, bool_true_addr())
+    + address_eq(head_addr, bool_false_addr())
+    + address_eq(head_addr, int_of_nat_addr())
+    + address_eq(head_addr, int_neg_succ_addr())
+    + address_eq(head_addr, int_dec_eq_addr())
+    + address_eq(head_addr, int_dec_le_addr())
+    + address_eq(head_addr, int_dec_lt_addr())
+    + address_eq(head_addr, bit_vec_of_nat_addr())
+    + address_eq(head_addr, bit_vec_to_nat_addr())
+    + address_eq(head_addr, bit_vec_ult_addr())
+    + address_eq(head_addr, decidable_decide_addr())
+    + address_eq(head_addr, reduce_bool_addr())
+    + address_eq(head_addr, reduce_nat_addr())
+    + address_eq(head_addr, size_of_size_of_addr())
+    + address_eq(head_addr, string_back_addr())
+    + address_eq(head_addr, string_legacy_back_addr())
+    + address_eq(head_addr, string_to_byte_array_addr())
+    + address_eq(head_addr, subtype_val_addr())
+    + address_eq(head_addr, system_platform_get_num_bits_addr())
+    + address_eq(head_addr, system_platform_num_bits_addr())
+    + address_eq(head_addr, punit_addr())
+    + address_eq(head_addr, punit_size_of_1_addr())
+    + address_eq(head_addr, unit_addr())
+    + address_eq(head_addr, string_utf8_byte_size_addr())
+  }
+
   -- Mirror: src/ix/kernel/whnf.rs:500-700 Nat-on-literals dispatch.
   -- Address-keyed (no positional prims): given the head Const's blake3
   -- address and the unreduced spine, fold a Nat primitive op when both
diff --git a/Ix/IxVM/Kernel/Whnf.lean b/Ix/IxVM/Kernel/Whnf.lean
index 6b11b2f2..d3e76b69 100644
--- a/Ix/IxVM/Kernel/Whnf.lean
+++ b/Ix/IxVM/Kernel/Whnf.lean
@@ -211,6 +211,35 @@ def whnf := ⟦
     }
   }
 
+  -- The address-keyed primitive gauntlet: nat → str → bitvec → native →
+  -- decidable, in order. Returns (1, reduced) on the first family that matches
+  -- and reduces, else (0, _). Factored out so `prim_any_addr` can gate the
+  -- whole chain for non-primitive heads in one shot.
+  fn try_address_primitives(head_addr: Addr, idx: G, lvls: List‹&KLevel›,
+                            spine: List‹KExpr›, types: List‹KExpr›,
+                            top: List‹&KConstantInfo›, addrs: List‹Addr›) -> (G, KExpr) {
+    match try_nat_dispatch(head_addr, spine, types, top, addrs) {
+      (1, r) => (1, r),
+      (0, _) =>
+        match try_str_dispatch(head_addr, spine, addrs) {
+          (1, r) => (1, r),
+          (0, _) =>
+            match try_bitvec_dispatch(head_addr, spine, types, top, addrs) {
+              (1, r) => (1, r),
+              (0, _) =>
+                match try_reduce_native(head_addr, spine, types, top, addrs) {
+                  (1, r) => (1, r),
+                  (0, _) =>
+                    match try_reduce_decidable(head_addr, idx, lvls, spine, types, top, addrs) {
+                      (1, r) => (1, r),
+                      (0, _) => (0, store(KExprNode.BVar(0))),
+                    },
+                },
+            },
+        },
+    }
+  }
+
   -- Const-head WHNF dispatch, split out of `whnf_with_spine` (see its Const arm).
   -- `head` is the original `Const(idx, lvls)` KExpr, passed for the stuck
   -- `apply_spine(head, spine)` fallbacks.
@@ -235,61 +264,40 @@ def whnf := ⟦
           (0, _) => apply_spine(head, spine),
         },
       _ =>
-        let nat_pair = try_nat_dispatch(head_addr, spine, types, top, addrs);
-        match nat_pair {
+        -- Skip the address-primitive gauntlet for the common non-primitive
+        -- head: `prim_any_addr` (memoized per head address) is 1 only for the
+        -- ~43 ops the gauntlet recognizes, so a 0 means no `try_*` can match —
+        -- go straight to projection-definition / delta. Validated by a
+        -- differential assert (now removed) over the suite + heavy consts.
+        let addr_prim = match prim_any_addr(head_addr) {
+          1 => try_address_primitives(head_addr, idx, lvls, spine, types, top, addrs),
+          _ => (0, store(KExprNode.BVar(0))),
+        };
+        match addr_prim {
           (1, reduced) => whnf(reduced, types, top, addrs),
           (0, _) =>
-            let str_pair = try_str_dispatch(head_addr, spine, addrs);
-            match str_pair {
-              (1, reduced_s) => whnf(reduced_s, types, top, addrs),
-              (0, _) =>
-                let bv_pair = try_bitvec_dispatch(head_addr, spine, types, top, addrs);
-                match bv_pair {
-                  (1, reduced_b) => whnf(reduced_b, types, top, addrs),
+                let proj_def_pair = try_reduce_projection_definition(idx, spine, top);
+                match proj_def_pair {
+                  (1, reduced_pd) => whnf(reduced_pd, types, top, addrs),
                   (0, _) =>
-                    let nat_pair2 = try_reduce_native(head_addr, spine, types, top, addrs);
-                    match nat_pair2 {
-                      (1, reduced_n) => whnf(reduced_n, types, top, addrs),
-                      (0, _) =>
-                        let dec_pair = try_reduce_decidable(head_addr, idx, lvls, spine, types, top, addrs);
-                        match dec_pair {
-                          (1, reduced_d) => whnf(reduced_d, types, top, addrs),
+                    -- Mirror src/ix/kernel/whnf.rs:756-774 (`delta_unfold_one`):
+                    -- unfold any Defn regardless of `ReducibilityHints`.
+                    match ci {
+                      KConstantInfo.Defn(_, _, value, _, _) =>
+                        -- Keep `Nat.add base (Lit n)` (symbolic base) stuck as a
+                        -- compact offset instead of delta-unfolding into a
+                        -- succ^n tower. Pairs with offset-aware def-eq.
+                        match try_nat_offset_stuck(head, spine, types, top, addrs) {
+                          (1, stuck) => stuck,
                           (0, _) =>
-                            let proj_def_pair = try_reduce_projection_definition(idx, spine, top);
-                            match proj_def_pair {
-                              (1, reduced_pd) => whnf(reduced_pd, types, top, addrs),
-                              (0, _) =>
-                                -- Mirror src/ix/kernel/whnf.rs:756-774
-                                -- (`delta_unfold_one`): unfold any Defn
-                                -- regardless of `ReducibilityHints`. The
-                                -- hint is consulted by lazy-delta's
-                                -- `delta_rank` for def-eq priority, not
-                                -- as a gate on plain whnf delta. Without
-                                -- unfolding here, ctor field types
-                                -- written via opaque defs (e.g.
-                                -- `constType (n α) (n α)`) stay stuck
-                                -- and `check_positivity_aug` misclassifies.
-                                match ci {
-                                  KConstantInfo.Defn(_, _, value, _, _) =>
-                                    -- Keep `Nat.add base (Lit n)` (symbolic base)
-                                    -- stuck as a compact offset instead of
-                                    -- delta-unfolding into a succ^n tower. Pairs
-                                    -- with offset-aware def-eq.
-                                    match try_nat_offset_stuck(head, spine, types, top, addrs) {
-                                      (1, stuck) => stuck,
-                                      (0, _) =>
-                                        let body = expr_inst_levels(value, lvls);
-                                        whnf_with_spine(body, spine, types, top, addrs),
-                                    },
-                                  KConstantInfo.Thm(_, _, _) => apply_spine(head, spine),
-                                  _ => apply_spine(head, spine),
-                                },
-                            },
+                            let body = expr_inst_levels(value, lvls);
+                            whnf_with_spine(body, spine, types, top, addrs),
                         },
+                      KConstantInfo.Thm(_, _, _) => apply_spine(head, spine),
+                      _ => apply_spine(head, spine),
                     },
                 },
             },
-        },
     }
   }