diff --git a/Cargo.lock b/Cargo.lock
index 7de24578..1d573036 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1121,6 +1121,8 @@ version = "0.15.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
 dependencies = [
+ "allocator-api2",
+ "equivalent",
  "foldhash 0.1.5",
 ]
 
@@ -1728,6 +1730,7 @@ dependencies = [
  "bytes",
  "dashmap",
  "getrandom 0.3.4",
+ "hashbrown 0.15.5",
  "indexmap",
  "iroh",
  "iroh-base",
diff --git a/Cargo.toml b/Cargo.toml
index 58b90be3..e83b8384 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,6 +11,7 @@ anyhow = "1"
 blake3 = "1.8.4"
 itertools = "0.14.0"
 indexmap = { version = "2", features = ["rayon"] }
+hashbrown = "0.15"
 lean-ffi = { git = "https://github.com/argumentcomputer/lean-ffi", rev = "cc98ebf67bf453ac3827cb767f78b13ea674dd6a" }
 mimalloc = { version = "0.1", default-features = false }
 multi-stark = { git = "https://github.com/argumentcomputer/multi-stark.git", rev = "9ecab51d553445c0cc7b571af00a76b8a83a6f8c" }
diff --git a/Ix/Cli/RefsOfCmd.lean b/Ix/Cli/RefsOfCmd.lean
new file mode 100644
index 00000000..c2824903
--- /dev/null
+++ b/Ix/Cli/RefsOfCmd.lean
@@ -0,0 +1,70 @@
+/-
+  `ix refs-of <Lean.Name> --ixe <path>`: print the direct constant
+  references of a constant — its `Constant.refs` entries that resolve to
+  constants in the env (literal blobs are skipped: they are pinned by
+  content-addressing and are not well-typedness obligations).
+
+  Prints a comma-separated list of 64-char hex addresses on stdout, the
+  exact shape `ix tree canonical` expects, so a frontier assumption tree
+  for a single constant is:
+
+      ix tree canonical $(ix refs-of <name> --ixe env.ixe)
+-/
+module
+public import Cli
+public import Ix.Address
+public import Ix.Common
+public import Ix.Ixon
+
+public section
+
+namespace Ix.Cli.RefsOfCmd
+
+def parseName (arg : String) : Lean.Name :=
+  arg.splitOn "." |>.foldl (init := .anonymous)
+    fun acc s => match s.toNat? with
+      | some n => Lean.Name.mkNum acc n
+      | none   => Lean.Name.mkStr acc s
+
+def runRefsOfCmd (p : Cli.Parsed) : IO UInt32 := do
+  let some nameArg := p.positionalArg? "name"
+    | p.printError "error: must specify <name>"; return 1
+  let name := parseName (nameArg.as! String)
+  let some ixeFlag := p.flag? "ixe"
+    | p.printError "error: --ixe is required"; return 1
+  let path := ixeFlag.as! String
+  let bytes ← IO.FS.readBinFile path
+  let ixonEnv ← match Ixon.rsDeEnv bytes with
+    | .error e =>
+      IO.eprintln s!"error: failed to deserialize {path}: {e}"; return 1
+    | .ok env => pure env
+  let some addr := ixonEnv.getAddr? (Ix.Name.fromLeanName name)
+    | IO.eprintln s!"error: {name} not found in {path}"; return 1
+  let some c := ixonEnv.getConst? addr
+    | IO.eprintln s!"error: constant for {name} missing from env"; return 1
+  let mut seen : Std.HashSet String := {}
+  let mut out : Array String := #[]
+  for r in c.refs do
+    if (ixonEnv.getConst? r).isSome then
+      let s := toString r
+      if !seen.contains s then
+        seen := seen.insert s
+        out := out.push s
+  IO.println (String.intercalate "," out.toList)
+  return 0
+
+end Ix.Cli.RefsOfCmd
+
+open Ix.Cli.RefsOfCmd in
+def refsOfCmd : Cli.Cmd := `[Cli|
+  "refs-of" VIA runRefsOfCmd;
+  "Print a constant's direct constant references (comma-separated hex), the leaf list for a frontier assumption tree"
+
+  FLAGS:
+    "ixe" : String; "Path to a serialized `.ixe` env (required)."
+
+  ARGS:
+    name : String; "Fully-qualified Lean.Name (e.g. `Nat.add_comm`)."
+]
+
+end
diff --git a/Ix/IxVM/Kernel/Claim.lean b/Ix/IxVM/Kernel/Claim.lean
index 7122431a..3a9d54d5 100644
--- a/Ix/IxVM/Kernel/Claim.lean
+++ b/Ix/IxVM/Kernel/Claim.lean
@@ -804,43 +804,22 @@ def claim := ⟦
     }
   }
 
-  -- Pack the first 4 address bytes (LE) into a u32 key for the skip-set rbtree.
-  --
-  -- Capped at 4 bytes because `RBTreeMap` orders keys with `u32_less_than`, a
-  -- 32-bit comparator gadget — a wider key (a single `G` could hold 7 bytes in
-  -- Goldilocks) would overflow it and corrupt the tree ordering. A 32-bit key
-  -- space means key collisions are possible (~N²/2^33 over N leaves), but they
-  -- are harmless: a collision makes `addr_in_set`'s confirming `address_eq`
-  -- fail, yielding a false negative (a frontier const gets rechecked) never a
-  -- false positive (a wrong skip). See `build_skip_set`.
-  fn addr_key(a: Addr) -> G {
-    let arr = load(a);
-    to_field(arr[0])
-      + to_field(arr[1]) * 256
-      + to_field(arr[2]) * 65536
-      + to_field(arr[3]) * 16777216
-  }
-
-  -- Build an O(log N) membership set from the assumption-leaf list, keyed on
-  -- `addr_key`. Key collisions overwrite — sound because the only consequence is
-  -- a missed skip (a frontier const gets rechecked, never wrongly trusted); the
-  -- confirming `address_eq` in `addr_in_set` rules out false positives.
-  fn build_skip_set(leaves: List‹Addr›, acc: RBTreeMap‹Addr›) -> RBTreeMap‹Addr› {
+  -- Membership map over the assumption leaves, keyed by `ptr_val`. Sound
+  -- by the same invariant as `build_addr_pos_map` (Ingress.lean): one
+  -- pointer maps to one content, so a ptr hit implies the address IS a
+  -- leaf (no false skip); a de-interned pointer reads as absent, which
+  -- here only means the constant gets CHECKED instead of skipped —
+  -- conservative. One tree lookup per constant, no confirming
+  -- address_eq, no per-lookup address loads (ported from jcb/fixes
+  -- H-14; replaces the 4-byte-content-keyed set + confirm compare).
+  fn build_asm_leaf_map(leaves: List‹Addr›) -> RBTreeMap‹G› {
     match load(leaves) {
-      ListNode.Nil => acc,
+      ListNode.Nil => RBTreeMap.Nil,
       ListNode.Cons(a, rest) =>
-        build_skip_set(rest, rbtree_map_insert(addr_key(a), a, acc)),
+        rbtree_map_insert(ptr_val(a), 1, build_asm_leaf_map(rest)),
     }
   }
 
-  -- Membership via one rbtree lookup (cheap u32 key compares) + one confirming
-  -- full `address_eq`, replacing the O(N) linear `addr_in_list` scan that
-  -- dominated address_eq cost on sharded checks.
-  fn addr_in_set(target: Addr, skip_set: RBTreeMap‹Addr›) -> G {
-    let found = rbtree_map_lookup_or_default(addr_key(target), skip_set, store([0u8; 32]));
-    address_eq(found, target)
-  }
-
   -- ============================================================================
   -- check_all variant that skips positions whose addr is in the
   -- supplied assumption-leaf list.
@@ -850,27 +829,32 @@ def claim := ⟦
                         addrs: List‹Addr›,
                         asm_leaves: List‹Addr›) {
     let _ = check_canonical_block_sort(top);
-    -- Build the skip-set once (O(N log N)) instead of an O(N) linear scan per
-    -- checked const.
-    let skip_set = build_skip_set(asm_leaves, RBTreeMap.Nil);
-    check_all_skipping_iter(consts, top, addrs, skip_set, 0)
+    let asm_map = store(build_asm_leaf_map(asm_leaves));
+    check_all_skipping_iter(consts, top, addrs, addrs, asm_map, 0)
   }
 
+  -- `cur_addrs` is the suffix of `addrs` aligned with `consts`, walked in
+  -- lockstep instead of `list_lookup(addrs, pos)` — which re-walks the
+  -- list prefix every iteration, a standalone quadratic in closure size
+  -- (ported from jcb/fixes H-14).
   fn check_all_skipping_iter(consts: List‹&KConstantInfo›,
                              top: List‹&KConstantInfo›,
                              addrs: List‹Addr›,
-                             skip_set: RBTreeMap‹Addr›,
+                             cur_addrs: List‹Addr›,
+                             asm_map: &RBTreeMap‹G›,
                              pos: G) {
     match load(consts) {
       ListNode.Nil => (),
       ListNode.Cons(&ci, rest) =>
-        let addr = list_lookup(addrs, pos);
-        match addr_in_set(addr, skip_set) {
-          1 =>
-            check_all_skipping_iter(rest, top, addrs, skip_set, pos + 1),
-          _ =>
-            let _ = check_const(ci, pos, top, addrs);
-            check_all_skipping_iter(rest, top, addrs, skip_set, pos + 1),
+        match load(cur_addrs) {
+          ListNode.Cons(addr, rest_addrs) =>
+            match rbtree_map_lookup_or_default(ptr_val(addr), load(asm_map), 0) {
+              0 =>
+                let _ = check_const(ci, pos, top, addrs);
+                check_all_skipping_iter(rest, top, addrs, rest_addrs, asm_map, pos + 1),
+              _ =>
+                check_all_skipping_iter(rest, top, addrs, rest_addrs, asm_map, pos + 1),
+            },
         },
     }
   }
diff --git a/Ix/IxVM/Kernel/Levels.lean b/Ix/IxVM/Kernel/Levels.lean
index e9801669..f8bbbfad 100644
--- a/Ix/IxVM/Kernel/Levels.lean
+++ b/Ix/IxVM/Kernel/Levels.lean
@@ -57,142 +57,583 @@ def levels := ⟦
     }
   }
 
-  fn level_has_param(l: KLevel) -> G {
-    match l {
-      KLevel.Zero => 0,
-      KLevel.Param(_) => 1,
-      KLevel.Succ(&a) => level_has_param(a),
-      KLevel.Max(&a, &b) =>
-        let ha = level_has_param(a);
-        match ha {
-          1 => 1,
-          0 => level_has_param(b),
+  -- ============================================================================
+  -- Canonical level normalization (mirror: src/ix/kernel/level.rs
+  -- normalize_level / norm_level_eq / norm_level_le, itself a port of
+  -- Lean4Lean's Level.Normalize with the documented subsumption and
+  -- covers-split fixes).
+  --
+  -- Semantic level comparison via the recursive `Level.leq` with its
+  -- two-way param-substitution split per Max/IMax-with-param is
+  -- exponential in the number of params, and every branch materializes
+  -- freshly substituted levels (the Int16.instRxcHasSize_eq pathology:
+  -- 93% of the record was level_subst_reduce/level_leq rows).
+  -- Normalization is linear in the level size, `level_normalize` is
+  -- keyed on the level ALONE so each distinct level normalizes once per
+  -- run, and equality/leq on canonical forms is cheap.
+  --
+  -- Canonical form: a path-sorted list of entries `(path, const, vars)`
+  -- where `path` is the sorted list of param indices conditioning an
+  -- imax chain (all assumed >= 1 along that branch), `const` the max
+  -- constant contribution, `vars` the idx-sorted `(param, offset)`
+  -- contributions.
+  -- ============================================================================
+  enum NLVar {
+    Mk(G, G)
+  }
+
+  enum NLEntry {
+    Mk(List‹G›, G, List‹&NLVar›)
+  }
+
+  -- Lexicographic compare of sorted G-lists: 0 = eq, 1 = a < b, 2 = a > b.
+  fn glist_cmp(a: List‹G›, b: List‹G›) -> G {
+    match load(a) {
+      ListNode.Nil =>
+        match load(b) {
+          ListNode.Nil => 0,
+          ListNode.Cons(_, _) => 1,
         },
-      KLevel.IMax(&a, &b) =>
-        let hb = level_has_param(b);
-        match hb {
-          1 => 1,
-          0 => level_has_param(a),
+      ListNode.Cons(x, ar) =>
+        match load(b) {
+          ListNode.Nil => 2,
+          ListNode.Cons(y, br) =>
+            match u32_less_than(x, y) {
+              1 => 1,
+              0 =>
+                match u32_less_than(y, x) {
+                  1 => 2,
+                  0 => glist_cmp(ar, br),
+                },
+            },
         },
     }
   }
 
-  fn level_any_param(l: KLevel) -> G {
-    match l {
-      KLevel.Param(i) => i,
-      KLevel.Succ(&a) => level_any_param(a),
-      KLevel.Max(&a, &b) =>
-        let ha = level_has_param(a);
-        match ha {
-          1 => level_any_param(a),
-          0 => level_any_param(b),
+  -- Sorted-list subset test.
+  fn glist_subset(xs: List‹G›, ys: List‹G›) -> G {
+    match load(xs) {
+      ListNode.Nil => 1,
+      ListNode.Cons(x, xr) =>
+        match load(ys) {
+          ListNode.Nil => 0,
+          ListNode.Cons(y, yr) =>
+            match u32_less_than(x, y) {
+              1 => 0,
+              0 =>
+                match u32_less_than(y, x) {
+                  1 => glist_subset(xs, yr),
+                  0 => glist_subset(xr, yr),
+                },
+            },
         },
-      KLevel.IMax(&a, &b) =>
-        let hb = level_has_param(b);
-        match hb {
-          1 => level_any_param(b),
-          0 => level_any_param(a),
+    }
+  }
+
+  fn glist_eq_len(a: List‹G›, b: List‹G›) -> G {
+    match load(a) {
+      ListNode.Nil =>
+        match load(b) {
+          ListNode.Nil => 1,
+          ListNode.Cons(_, _) => 0,
+        },
+      ListNode.Cons(_, ar) =>
+        match load(b) {
+          ListNode.Nil => 0,
+          ListNode.Cons(_, br) => glist_eq_len(ar, br),
         },
-      KLevel.Zero => 0,
     }
   }
 
-  fn level_subst_reduce(l: KLevel, p: G, repl: KLevel) -> KLevel {
-    match l {
-      KLevel.Zero => KLevel.Zero,
-      KLevel.Param(i) =>
-        match i - p {
-          0 => repl,
-          _ => KLevel.Param(i),
+  -- Insert into a sorted G-list. Returns (1, new_list) on insert,
+  -- (0, original) when already present. Mirrors level.rs ordered_insert.
+  fn glist_ordered_insert(x: G, l: List‹G›) -> (G, List‹G›) {
+    match load(l) {
+      ListNode.Nil => (1, store(ListNode.Cons(x, store(ListNode.Nil)))),
+      ListNode.Cons(h, r) =>
+        match u32_less_than(x, h) {
+          1 => (1, store(ListNode.Cons(x, l))),
+          0 =>
+            match u32_less_than(h, x) {
+              0 => (0, l),
+              1 =>
+                match glist_ordered_insert(x, r) {
+                  (f, r2) => (f, store(ListNode.Cons(h, r2))),
+                },
+            },
         },
-      KLevel.Succ(&a) =>
-        KLevel.Succ(store(level_subst_reduce(a, p, repl))),
-      KLevel.Max(&a, &b) =>
-        level_max(level_subst_reduce(a, p, repl), level_subst_reduce(b, p, repl)),
-      KLevel.IMax(&a, &b) =>
-        level_imax(level_subst_reduce(a, p, repl), level_subst_reduce(b, p, repl)),
     }
   }
 
-  fn level_leq(a: KLevel, b: KLevel) -> G {
-    match a {
-      KLevel.Zero => 1,
-      KLevel.Max(&a1, &a2) =>
-        level_leq(a1, b) * level_leq(a2, b),
-      KLevel.Succ(&a1) =>
-        match a1 {
-          KLevel.Max(&x, &y) =>
-            level_leq(KLevel.Succ(store(x)), b) * level_leq(KLevel.Succ(store(y)), b),
-          _ =>
-            match b {
-              KLevel.Succ(&b1) => level_leq(a1, b1),
-              KLevel.Max(&b1, &b2) =>
-                let r1 = level_leq(a, b1);
-                match r1 {
-                  1 => 1,
+  -- Insert (idx, k) into an idx-sorted var list; on duplicate idx keep
+  -- the max offset. Mirrors Node::add_var.
+  fn nlvars_add(vars: List‹&NLVar›, idx: G, k: G) -> List‹&NLVar› {
+    match load(vars) {
+      ListNode.Nil =>
+        store(ListNode.Cons(store(NLVar.Mk(idx, k)), store(ListNode.Nil))),
+      ListNode.Cons(v, rest) =>
+        match load(v) {
+          NLVar.Mk(vi, vo) =>
+            match u32_less_than(idx, vi) {
+              1 => store(ListNode.Cons(store(NLVar.Mk(idx, k)), vars)),
+              0 =>
+                match u32_less_than(vi, idx) {
+                  1 => store(ListNode.Cons(v, nlvars_add(rest, idx, k))),
                   0 =>
-                    let r2 = level_leq(a, b2);
-                    match r2 {
-                      1 => 1,
+                    match u32_less_than(vo, k) {
+                      1 => store(ListNode.Cons(store(NLVar.Mk(vi, k)), rest)),
+                      0 => vars,
+                    },
+                },
+            },
+        },
+    }
+  }
+
+  fn nlvars_eq(a: List‹&NLVar›, b: List‹&NLVar›) -> G {
+    match load(a) {
+      ListNode.Nil =>
+        match load(b) {
+          ListNode.Nil => 1,
+          ListNode.Cons(_, _) => 0,
+        },
+      ListNode.Cons(x, ar) =>
+        match load(b) {
+          ListNode.Nil => 0,
+          ListNode.Cons(y, br) =>
+            match load(x) {
+              NLVar.Mk(xi, xo) =>
+                match load(y) {
+                  NLVar.Mk(yi, yo) =>
+                    match xi - yi {
+                      0 =>
+                        match xo - yo {
+                          0 => nlvars_eq(ar, br),
+                          _ => 0,
+                        },
+                      _ => 0,
+                    },
+                },
+            },
+        },
+    }
+  }
+
+  fn nlvars_max_offset(vars: List‹&NLVar›) -> G {
+    match load(vars) {
+      ListNode.Nil => 0,
+      ListNode.Cons(v, rest) =>
+        match load(v) {
+          NLVar.Mk(_, vo) =>
+            let m = nlvars_max_offset(rest);
+            match u32_less_than(m, vo) {
+              1 => vo,
+              0 => m,
+            },
+        },
+    }
+  }
+
+  -- Keep x in xs unless ys has the same idx with offset >= x's.
+  -- Mirrors level.rs subsume_vars (sorted merge walk).
+  fn nlvars_subsume(xs: List‹&NLVar›, ys: List‹&NLVar›) -> List‹&NLVar› {
+    match load(xs) {
+      ListNode.Nil => xs,
+      ListNode.Cons(x, xr) =>
+        match load(ys) {
+          ListNode.Nil => xs,
+          ListNode.Cons(y, yr) =>
+            match load(x) {
+              NLVar.Mk(xi, xo) =>
+                match load(y) {
+                  NLVar.Mk(yi, yo) =>
+                    match u32_less_than(xi, yi) {
+                      1 => store(ListNode.Cons(x, nlvars_subsume(xr, ys))),
                       0 =>
-                        let bfull = KLevel.Max(store(b1), store(b2));
-                        let hp = level_has_param(bfull);
-                        match hp {
+                        match u32_less_than(yi, xi) {
+                          1 => nlvars_subsume(xs, yr),
+                          0 =>
+                            match u32_less_than(yo, xo) {
+                              1 => store(ListNode.Cons(x, nlvars_subsume(xr, yr))),
+                              0 => nlvars_subsume(xr, yr),
+                            },
+                        },
+                    },
+                },
+            },
+        },
+    }
+  }
+
+  -- Max the constant contribution into the entry at `path` (path-sorted
+  -- insert). Mirrors norm_add_const incl. its skip rule: k = 0 never
+  -- contributes; k = 1 only at the empty path (along a non-empty path
+  -- all conditioning params are >= 1 so the branch value is >= 1 anyway).
+  fn nl_add_const(acc: List‹&NLEntry›, path: List‹G›, k: G) -> List‹&NLEntry› {
+    match k {
+      0 => acc,
+      1 =>
+        match load(path) {
+          ListNode.Nil => nl_add_const_go(acc, path, k),
+          ListNode.Cons(_, _) => acc,
+        },
+      _ => nl_add_const_go(acc, path, k),
+    }
+  }
+
+  fn nl_add_const_go(acc: List‹&NLEntry›, path: List‹G›, k: G) -> List‹&NLEntry› {
+    match load(acc) {
+      ListNode.Nil =>
+        store(ListNode.Cons(
+          store(NLEntry.Mk(path, k, store(ListNode.Nil))),
+          store(ListNode.Nil))),
+      ListNode.Cons(e, rest) =>
+        match load(e) {
+          NLEntry.Mk(ep, ec, ev) =>
+            match glist_cmp(path, ep) {
+              0 =>
+                match u32_less_than(ec, k) {
+                  1 => store(ListNode.Cons(store(NLEntry.Mk(ep, k, ev)), rest)),
+                  0 => acc,
+                },
+              1 =>
+                store(ListNode.Cons(
+                  store(NLEntry.Mk(path, k, store(ListNode.Nil))), acc)),
+              _ => store(ListNode.Cons(e, nl_add_const_go(rest, path, k))),
+            },
+        },
+    }
+  }
+
+  -- Add var (idx, k) to the entry at `path` (path-sorted insert).
+  fn nl_add_var(acc: List‹&NLEntry›, path: List‹G›, idx: G, k: G) -> List‹&NLEntry› {
+    match load(acc) {
+      ListNode.Nil =>
+        store(ListNode.Cons(
+          store(NLEntry.Mk(path, 0,
+            store(ListNode.Cons(store(NLVar.Mk(idx, k)), store(ListNode.Nil))))),
+          store(ListNode.Nil))),
+      ListNode.Cons(e, rest) =>
+        match load(e) {
+          NLEntry.Mk(ep, ec, ev) =>
+            match glist_cmp(path, ep) {
+              0 =>
+                store(ListNode.Cons(
+                  store(NLEntry.Mk(ep, ec, nlvars_add(ev, idx, k))), rest)),
+              1 =>
+                store(ListNode.Cons(
+                  store(NLEntry.Mk(path, 0,
+                    store(ListNode.Cons(store(NLVar.Mk(idx, k)), store(ListNode.Nil))))),
+                  acc)),
+              _ => store(ListNode.Cons(e, nl_add_var(rest, path, idx, k))),
+            },
+        },
+    }
+  }
+
+  -- Flatten a level into canonical form. `path` = imax conditioning chain
+  -- (sorted param idxs), `k` = accumulated Succ offset. Mirrors
+  -- normalize_aux; the IMax arm delegates to the dispatch (whose cases
+  -- replicate the aux IMax-shape cases verbatim, as in level.rs).
+  fn normalize_aux(l: KLevel, path: List‹G›, k: G,
+                   acc: List‹&NLEntry›) -> List‹&NLEntry› {
+    match l {
+      KLevel.Zero => nl_add_const(acc, path, k),
+      KLevel.Succ(&inner) => normalize_aux(inner, path, k + 1, acc),
+      KLevel.Max(&a, &b) =>
+        normalize_aux(b, path, k, normalize_aux(a, path, k, acc)),
+      KLevel.IMax(&u, &b) => normalize_imax_dispatch(u, b, path, k, acc),
+      KLevel.Param(idx) =>
+        match glist_ordered_insert(idx, path) {
+          (1, new_path) =>
+            nl_add_var(nl_add_const(acc, path, k), new_path, idx, k),
+          (0, _) =>
+            match k {
+              0 => acc,
+              _ => nl_add_var(acc, path, idx, k),
+            },
+        },
+    }
+  }
+
+  -- imax(a, b) by b's shape: zero kills the branch; succ is never-zero so
+  -- imax = max; max/imax distribute; param conditions the path.
+  fn normalize_imax_dispatch(a: KLevel, b: KLevel, path: List‹G›, k: G,
+                             acc: List‹&NLEntry›) -> List‹&NLEntry› {
+    match b {
+      KLevel.Zero => nl_add_const(acc, path, k),
+      KLevel.Succ(&v) =>
+        normalize_aux(v, path, k + 1, normalize_aux(a, path, k, acc)),
+      KLevel.Max(&v, &w) =>
+        -- imax(a, max(v, w)) = max(imax(a, v), imax(a, w))
+        normalize_imax_dispatch(a, w, path, k,
+          normalize_imax_dispatch(a, v, path, k, acc)),
+      KLevel.IMax(&v, &w) =>
+        -- imax(a, imax(v, w)) = max(imax(a, w), imax(v, w))
+        normalize_imax_dispatch(v, w, path, k,
+          normalize_imax_dispatch(a, w, path, k, acc)),
+      KLevel.Param(idx) =>
+        match glist_ordered_insert(idx, path) {
+          (1, new_path) =>
+            -- param(idx) = 0 branch: imax(a, 0) = 0, contributing k.
+            normalize_aux(a, new_path, k,
+              nl_add_var(nl_add_const(acc, path, k), new_path, idx, k)),
+          (0, _) =>
+            let acc2 = match k {
+              0 => acc,
+              _ => nl_add_var(acc, path, idx, k),
+            };
+            normalize_aux(a, path, k, acc2),
+        },
+    }
+  }
+
+  -- Phase 2 subsumption: drop contributions dominated by another entry
+  -- whose path is a subset (active whenever the dominated one is).
+  -- Each entry folds over the ORIGINAL (snapshot) list, mirroring the
+  -- Rust snapshot semantics.
+  fn nl_subsumption(acc: List‹&NLEntry›) -> List‹&NLEntry› {
+    nl_subsumption_walk(acc, acc)
+  }
+
+  fn nl_subsumption_walk(rem: List‹&NLEntry›,
+                         snapshot: List‹&NLEntry›) -> List‹&NLEntry› {
+    match load(rem) {
+      ListNode.Nil => rem,
+      ListNode.Cons(e, rest) =>
+        store(ListNode.Cons(
+          nl_subsume_entry(e, snapshot),
+          nl_subsumption_walk(rest, snapshot))),
+    }
+  }
+
+  fn nl_subsume_entry(e: &NLEntry, snap: List‹&NLEntry›) -> &NLEntry {
+    match load(snap) {
+      ListNode.Nil => e,
+      ListNode.Cons(s, srest) =>
+        match load(e) {
+          NLEntry.Mk(p1, c1, v1) =>
+            match load(s) {
+              NLEntry.Mk(p2, c2, v2) =>
+                match glist_subset(p2, p1) {
+                  0 => nl_subsume_entry(e, srest),
+                  _ =>
+                    -- subset + equal length <=> p1 == p2 (self entry)
+                    let same = glist_eq_len(p1, p2);
+                    let c1n = match c1 {
+                      0 => 0,
+                      _ =>
+                        let or1 = match same {
+                          1 => 1,
+                          0 => u32_less_than(c2, c1),
+                        };
+                        let or2 = match load(v2) {
+                          ListNode.Nil => 1,
+                          ListNode.Cons(_, _) =>
+                            u32_less_than(nlvars_max_offset(v1) + 1, c1),
+                        };
+                        match or1 * or2 {
                           0 => 0,
-                          _ =>
-                            let p = level_any_param(bfull);
-                            let sp = KLevel.Succ(store(KLevel.Param(p)));
-                            let a0 = level_subst_reduce(a, p, KLevel.Zero);
-                            let b0 = level_subst_reduce(bfull, p, KLevel.Zero);
-                            let a1s = level_subst_reduce(a, p, sp);
-                            let b1s = level_subst_reduce(bfull, p, sp);
-                            level_leq(a0, b0) * level_leq(a1s, b1s),
+                          _ => c1,
                         },
+                    };
+                    let v1n = match same {
+                      1 => v1,
+                      0 =>
+                        match load(v2) {
+                          ListNode.Nil => v1,
+                          ListNode.Cons(_, _) => nlvars_subsume(v1, v2),
+                        },
+                    };
+                    nl_subsume_entry(store(NLEntry.Mk(p1, c1n, v1n)), srest),
+                },
+            },
+        },
+    }
+  }
+
+  fn nl_eq(a: List‹&NLEntry›, b: List‹&NLEntry›) -> G {
+    match load(a) {
+      ListNode.Nil =>
+        match load(b) {
+          ListNode.Nil => 1,
+          ListNode.Cons(_, _) => 0,
+        },
+      ListNode.Cons(x, ar) =>
+        match load(b) {
+          ListNode.Nil => 0,
+          ListNode.Cons(y, br) =>
+            match load(x) {
+              NLEntry.Mk(xp, xc, xv) =>
+                match load(y) {
+                  NLEntry.Mk(yp, yc, yv) =>
+                    match glist_cmp(xp, yp) {
+                      0 =>
+                        match xc - yc {
+                          0 =>
+                            match nlvars_eq(xv, yv) {
+                              1 => nl_eq(ar, br),
+                              0 => 0,
+                            },
+                          _ => 0,
+                        },
+                      _ => 0,
                     },
                 },
-              _ => 0,
             },
         },
-      KLevel.Param(i) =>
-        match b {
-          KLevel.Param(j) => eq_zero(i - j),
-          KLevel.Succ(&b1) => level_leq(a, b1),
-          KLevel.Max(&b1, &b2) =>
-            let r1 = level_leq(a, b1);
-            match r1 {
-              1 => 1,
-              0 => level_leq(a, b2),
+    }
+  }
+
+  -- Does some entry (p2, n2) in l2 with p2 SUBSET-OF p1 dominate the
+  -- constant c along every assignment activating p1? n2.const counts
+  -- unconditionally in that branch; each var (idx, off) counts as
+  -- >= off + 1 because idx IN p2 implies the param is >= 1 there.
+  -- Mirrors level.rs covers_const (the fixed, split-search version).
+  fn nl_covers_const(l2: List‹&NLEntry›, p1: List‹G›, c: G) -> G {
+    match load(l2) {
+      ListNode.Nil => 0,
+      ListNode.Cons(e, rest) =>
+        match load(e) {
+          NLEntry.Mk(p2, c2, v2) =>
+            match glist_subset(p2, p1) {
+              0 => nl_covers_const(rest, p1, c),
+              _ =>
+                let hit = match u32_less_than(c2, c) {
+                  0 => 1,
+                  1 => nlvars_any_offset_geq(v2, c),
+                };
+                match hit {
+                  1 => 1,
+                  0 => nl_covers_const(rest, p1, c),
+                },
             },
-          KLevel.IMax(&b1, &b2) =>
-            let p = level_any_param(b2);
-            let sp = KLevel.Succ(store(KLevel.Param(p)));
-            let a0 = level_subst_reduce(a, p, KLevel.Zero);
-            let bfull = KLevel.IMax(store(b1), store(b2));
-            let b0 = level_subst_reduce(bfull, p, KLevel.Zero);
-            let a1s = level_subst_reduce(a, p, sp);
-            let b1s = level_subst_reduce(bfull, p, sp);
-            level_leq(a0, b0) * level_leq(a1s, b1s),
-          KLevel.Zero => 0,
         },
-      KLevel.IMax(&a1, &a2) =>
-        let not_zero = level_is_not_zero(a2);
-        match not_zero {
-          1 => level_leq(a1, b) * level_leq(a2, b),
-          0 =>
-            let p = level_any_param(a2);
-            let sp = KLevel.Succ(store(KLevel.Param(p)));
-            let afull = KLevel.IMax(store(a1), store(a2));
-            let a0 = level_subst_reduce(afull, p, KLevel.Zero);
-            let b0 = level_subst_reduce(b, p, KLevel.Zero);
-            let a1s = level_subst_reduce(afull, p, sp);
-            let b1s = level_subst_reduce(b, p, sp);
-            level_leq(a0, b0) * level_leq(a1s, b1s),
+    }
+  }
+
+  -- Any var with offset + 1 >= c?
+  fn nlvars_any_offset_geq(vars: List‹&NLVar›, c: G) -> G {
+    match load(vars) {
+      ListNode.Nil => 0,
+      ListNode.Cons(v, rest) =>
+        match load(v) {
+          NLVar.Mk(_, vo) =>
+            match u32_less_than(vo + 1, c) {
+              0 => 1,
+              1 => nlvars_any_offset_geq(rest, c),
+            },
+        },
+    }
+  }
+
+  -- Does some entry (p2, n2) in l2 with p2 SUBSET-OF p1 contain a var
+  -- (w, off2) with off2 >= off? Mirrors level.rs covers_var.
+  fn nl_covers_var(l2: List‹&NLEntry›, p1: List‹G›, w: G, off: G) -> G {
+    match load(l2) {
+      ListNode.Nil => 0,
+      ListNode.Cons(e, rest) =>
+        match load(e) {
+          NLEntry.Mk(p2, _, v2) =>
+            match glist_subset(p2, p1) {
+              0 => nl_covers_var(rest, p1, w, off),
+              _ =>
+                match nlvars_dominates(v2, w, off) {
+                  1 => 1,
+                  0 => nl_covers_var(rest, p1, w, off),
+                },
+            },
+        },
+    }
+  }
+
+  fn nlvars_dominates(vars: List‹&NLVar›, w: G, off: G) -> G {
+    match load(vars) {
+      ListNode.Nil => 0,
+      ListNode.Cons(v, rest) =>
+        match load(v) {
+          NLVar.Mk(vi, vo) =>
+            match vi - w {
+              0 =>
+                match u32_less_than(vo, off) {
+                  0 => 1,
+                  1 => nlvars_dominates(rest, w, off),
+                },
+              _ => nlvars_dominates(rest, w, off),
+            },
         },
     }
   }
 
+  -- Semantic l1 <= l2 on canonical forms: every entry's contribution in
+  -- its activation branch must be dominated by subset-path entries of l2.
+  -- Mirrors level.rs norm_level_le (with its covers split, sound where
+  -- Lean4Lean's single-entry search is incomplete).
+  fn nl_le(l1: List‹&NLEntry›, l2: List‹&NLEntry›) -> G {
+    match load(l1) {
+      ListNode.Nil => 1,
+      ListNode.Cons(e, rest) =>
+        match load(e) {
+          NLEntry.Mk(p1, c1, v1) =>
+            let c_ok = match c1 {
+              0 => 1,
+              _ => nl_covers_const(l2, p1, c1),
+            };
+            match c_ok {
+              0 => 0,
+              1 =>
+                match nl_le_vars(v1, l2, p1) {
+                  0 => 0,
+                  1 => nl_le(rest, l2),
+                },
+            },
+        },
+    }
+  }
+
+  fn nl_le_vars(vars: List‹&NLVar›, l2: List‹&NLEntry›, p1: List‹G›) -> G {
+    match load(vars) {
+      ListNode.Nil => 1,
+      ListNode.Cons(v, rest) =>
+        match load(v) {
+          NLVar.Mk(vi, vo) =>
+            match nl_covers_var(l2, p1, vi, vo) {
+              0 => 0,
+              1 => nl_le_vars(rest, l2, p1),
+            },
+        },
+    }
+  }
+
+  -- Keyed on the level alone: each distinct level normalizes once per run.
+  -- Seeded with the empty-path entry, mirroring normalize_level.
+  fn level_normalize(l: KLevel) -> List‹&NLEntry› {
+    let seed = store(ListNode.Cons(
+      store(NLEntry.Mk(store(ListNode.Nil), 0, store(ListNode.Nil))),
+      store(ListNode.Nil)));
+    nl_subsumption(normalize_aux(l, store(ListNode.Nil), 0, seed))
+  }
+
+  fn level_leq(a: KLevel, b: KLevel) -> G {
+    match level_eq(a, b) {
+      1 => 1,
+      0 =>
+        match a {
+          KLevel.Zero => 1,
+          _ => nl_le(level_normalize(a), level_normalize(b)),
+        },
+    }
+  }
+
+  -- Semantic level equality via canonical forms (mirror: level.rs
+  -- univ_eq). The previous leq(a,b)*leq(b,a) did a two-way
+  -- param-substitution split per Max/IMax-with-param -- exponential in
+  -- params and the dominant record cost on instance-heavy checks.
   fn level_equal(a: KLevel, b: KLevel) -> G {
-    level_leq(a, b) * level_leq(b, a)
+    match level_eq(a, b) {
+      1 => 1,
+      0 => nl_eq(level_normalize(a), level_normalize(b)),
+    }
   }
 
   fn level_max(a: KLevel, b: KLevel) -> KLevel {
diff --git a/Ix/IxVM/Kernel/Primitive.lean b/Ix/IxVM/Kernel/Primitive.lean
index 9300ea1c..31e57dd2 100644
--- a/Ix/IxVM/Kernel/Primitive.lean
+++ b/Ix/IxVM/Kernel/Primitive.lean
@@ -1228,24 +1228,18 @@ def primitive := ⟦
     }
   }
 
-  -- Convert a KLimbs n into a chain `App(Const(succ), App(Const(succ),
-  -- ... Const(zero)))` for n calls of succ. Used by nat-literal-to-ctor
-  -- coercion in iota.
-  fn klimbs_to_ctor_form(n: KLimbs, zero_idx: G, succ_idx: G) -> KExpr {
-    match load(n) {
-      ListNode.Nil =>
-        store(KExprNode.Const(zero_idx, store(ListNode.Nil))),
-      ListNode.Cons(_, _) =>
-        let dec = klimbs_dec(n);
-        let pred = klimbs_to_ctor_form(dec, zero_idx, succ_idx);
-        let succ_const = store(KExprNode.Const(succ_idx, store(ListNode.Nil)));
-        store(KExprNode.App(succ_const, pred)),
-    }
-  }
-
   -- If `e` is `Lit(Nat(klimbs))` and addrs contains both Nat.zero and
-  -- Nat.succ, expand to ctor chain. Else return `e` unchanged. Mirror:
-  -- src/ix/kernel/whnf.rs:929-946 nat_to_constructor.
+  -- Nat.succ, expose ONE constructor layer: `0 -> Const(zero)`,
+  -- `n -> App(Const(succ), Lit(n-1))`. Else return `e` unchanged. Mirror:
+  -- src/ix/kernel/whnf.rs nat_to_constructor (one level, matching lean4
+  -- inductive.h:91-93 and lean4lean Reduce.lean:70).
+  --
+  -- One level, NOT the full succ chain: iota only needs the outermost
+  -- ctor to fire, and the predecessor stays a compact literal that
+  -- downstream whnf/def_eq/extract hit via the Nat fast paths. A full
+  -- `succ^n(zero)` chain materialized n fresh nodes per expansion event
+  -- and made every traversal of the major O(n) with distinct memo keys
+  -- per depth.
   fn nat_lit_to_ctor_or_self(e: KExpr, addrs: List‹Addr›) -> KExpr {
     match load(e) {
       KExprNode.Lit(lit) =>
@@ -1256,7 +1250,15 @@ def primitive := ⟦
             match z {
               (1, zero_idx) =>
                 match s {
-                  (1, succ_idx) => klimbs_to_ctor_form(klimbs, zero_idx, succ_idx),
+                  (1, succ_idx) =>
+                    match load(klimbs) {
+                      ListNode.Nil =>
+                        store(KExprNode.Const(zero_idx, store(ListNode.Nil))),
+                      ListNode.Cons(_, _) =>
+                        let pred = store(KExprNode.Lit(KLiteral.Nat(klimbs_dec(klimbs))));
+                        let succ_const = store(KExprNode.Const(succ_idx, store(ListNode.Nil)));
+                        store(KExprNode.App(succ_const, pred)),
+                    },
                   _ => e,
                 },
               _ => e,
@@ -1307,59 +1309,232 @@ def primitive := ⟦
     store(KExprNode.Lit(KLiteral.Nat(n)))
   }
 
-  -- 1 iff `head_addr` is one of the head-dispatched primitive ops checked by
-  -- `try_nat_dispatch` / `try_str_dispatch` / `try_bitvec_dispatch` /
-  -- `try_reduce_native` / `try_reduce_decidable`. These addresses are mutually
-  -- exclusive (a const has one content address), so the sum is 0 or 1.
-  -- Memoized on `head_addr` (a content pointer): computed once per distinct
-  -- const, vs the per-whnf gauntlet of 5 `try_*` calls. Lets `whnf_const_head`
-  -- skip the gauntlet for the common non-primitive head. MUST stay in sync
-  -- with the `address_eq(head_addr, …)` checks in those functions — a
-  -- differential `assert` guards that during development.
-  fn prim_any_addr(head_addr: Addr) -> G {
-    address_eq(head_addr, nat_add_addr())
-    + address_eq(head_addr, nat_sub_addr())
-    + address_eq(head_addr, nat_mul_addr())
-    + address_eq(head_addr, nat_div_addr())
-    + address_eq(head_addr, nat_mod_addr())
-    + address_eq(head_addr, nat_pow_addr())
-    + address_eq(head_addr, nat_gcd_addr())
-    + address_eq(head_addr, nat_land_addr())
-    + address_eq(head_addr, nat_lor_addr())
-    + address_eq(head_addr, nat_xor_addr())
-    + address_eq(head_addr, nat_shift_left_addr())
-    + address_eq(head_addr, nat_shift_right_addr())
-    + address_eq(head_addr, nat_succ_addr())
-    + address_eq(head_addr, nat_pred_addr())
-    + address_eq(head_addr, nat_beq_addr())
-    + address_eq(head_addr, nat_ble_addr())
-    + address_eq(head_addr, nat_dec_eq_addr())
-    + address_eq(head_addr, nat_dec_le_addr())
-    + address_eq(head_addr, nat_dec_lt_addr())
-    + address_eq(head_addr, bool_true_addr())
-    + address_eq(head_addr, bool_false_addr())
-    + address_eq(head_addr, int_of_nat_addr())
-    + address_eq(head_addr, int_neg_succ_addr())
-    + address_eq(head_addr, int_dec_eq_addr())
-    + address_eq(head_addr, int_dec_le_addr())
-    + address_eq(head_addr, int_dec_lt_addr())
-    + address_eq(head_addr, bit_vec_of_nat_addr())
-    + address_eq(head_addr, bit_vec_to_nat_addr())
-    + address_eq(head_addr, bit_vec_ult_addr())
-    + address_eq(head_addr, decidable_decide_addr())
-    + address_eq(head_addr, reduce_bool_addr())
-    + address_eq(head_addr, reduce_nat_addr())
-    + address_eq(head_addr, size_of_size_of_addr())
-    + address_eq(head_addr, string_back_addr())
-    + address_eq(head_addr, string_legacy_back_addr())
-    + address_eq(head_addr, string_to_byte_array_addr())
-    + address_eq(head_addr, subtype_val_addr())
-    + address_eq(head_addr, system_platform_get_num_bits_addr())
-    + address_eq(head_addr, system_platform_num_bits_addr())
-    + address_eq(head_addr, punit_addr())
-    + address_eq(head_addr, punit_size_of_1_addr())
-    + address_eq(head_addr, unit_addr())
-    + address_eq(head_addr, string_utf8_byte_size_addr())
+  -- Build the compact stuck offset `Nat.add base (Lit n)` applied to `post`,
+  -- the canonical normal form for a symbolic-base Nat offset (paired with
+  -- offset-aware def-eq). n = 0 collapses to `base` directly. Shared by the
+  -- nat dispatch (symbolic add/div/mod kept stuck) and the linear-rec
+  -- collapse, keeping the offset construction out of those circuits' width.
+  fn mk_nat_offset_stuck(base_w: KExpr, n: KLimbs, post: List‹KExpr›,
+                         addrs: List‹Addr›) -> (G, KExpr) {
+    match klimbs_is_zero(n) {
+      1 => (1, apply_spine(base_w, post)),
+      0 =>
+        match find_addr_idx_safe(nat_add_addr(), addrs, 0) {
+          (0, _) => (0, store(KExprNode.BVar(0))),
+          (1, add_idx) =>
+            let add_const = store(KExprNode.Const(add_idx, store(ListNode.Nil)));
+            let off = store(KExprNode.App(
+              store(KExprNode.App(add_const, base_w)),
+              mk_nat_lit(n)));
+            (1, apply_spine(off, post)),
+        },
+    }
+  }
+
+  -- ============================================================================
+  -- Primitive-family classification
+  --
+  -- `prim_family(a)` maps a Const-head address to the one reducer family
+  -- that could possibly fire on it: 0 = none, 1 = nat (`try_nat_dispatch`),
+  -- 2 = str, 3 = bitvec, 4 = native, 5 = decidable. The family address
+  -- sets are mutually disjoint (verified against each reducer's
+  -- comparisons), so classification is unambiguous.
+  --
+  -- Keyed on the ADDRESS ALONE: one memo row per distinct constant
+  -- address in the whole run (~hundreds), and the inner `address_eq`
+  -- chains are likewise keyed `(a, prim)` so they collapse after the
+  -- first occurrence of each address. This replaces running every
+  -- reducer as a gauntlet from `whnf_const_head`, whose try_* keys
+  -- include the spine and context — 4-6 wide rows per Const-head whnf,
+  -- almost all guaranteed misses.
+  -- ============================================================================
+  fn prim_family(a: Addr) -> G {
+    match is_nat_prim_addr(a) {
+      1 => 1,
+      0 =>
+        match is_str_prim_addr(a) {
+          1 => 2,
+          0 =>
+            match is_bitvec_prim_addr(a) {
+              1 => 3,
+              0 =>
+                match is_native_prim_addr(a) {
+                  1 => 4,
+                  0 =>
+                    match is_decidable_prim_addr(a) {
+                      1 => 5,
+                      0 => 0,
+                    },
+                },
+            },
+        },
+    }
+  }
+
+  -- Addresses handled by `try_nat_dispatch`: succ/pred plus the binop
+  -- set of `try_nat_binop_addr`.
+  fn is_nat_prim_addr(a: Addr) -> G {
+    match address_eq(a, nat_succ_addr()) {
+      1 => 1,
+      0 =>
+      match address_eq(a, nat_pred_addr()) {
+        1 => 1,
+        0 =>
+        match address_eq(a, nat_add_addr()) {
+          1 => 1,
+          0 =>
+          match address_eq(a, nat_sub_addr()) {
+            1 => 1,
+            0 =>
+            match address_eq(a, nat_mul_addr()) {
+              1 => 1,
+              0 =>
+              match address_eq(a, nat_div_addr()) {
+                1 => 1,
+                0 =>
+                match address_eq(a, nat_mod_addr()) {
+                  1 => 1,
+                  0 =>
+                  match address_eq(a, nat_pow_addr()) {
+                    1 => 1,
+                    0 =>
+                    match address_eq(a, nat_gcd_addr()) {
+                      1 => 1,
+                      0 =>
+                      match address_eq(a, nat_beq_addr()) {
+                        1 => 1,
+                        0 =>
+                        match address_eq(a, nat_ble_addr()) {
+                          1 => 1,
+                          0 =>
+                          match address_eq(a, nat_land_addr()) {
+                            1 => 1,
+                            0 =>
+                            match address_eq(a, nat_lor_addr()) {
+                              1 => 1,
+                              0 =>
+                              match address_eq(a, nat_xor_addr()) {
+                                1 => 1,
+                                0 =>
+                                match address_eq(a, nat_shift_left_addr()) {
+                                  1 => 1,
+                                  0 =>
+                                  match address_eq(a, nat_shift_right_addr()) {
+                                    1 => 1,
+                                    0 =>
+                                    0,
+                                  },
+                                },
+                              },
+                            },
+                          },
+                        },
+                      },
+                    },
+                  },
+                },
+              },
+            },
+          },
+        },
+      },
+    }
+  }
+
+  -- Addresses handled by `try_str_dispatch`.
+  fn is_str_prim_addr(a: Addr) -> G {
+    match address_eq(a, string_utf8_byte_size_addr()) {
+      1 => 1,
+      0 =>
+      match address_eq(a, string_to_byte_array_addr()) {
+        1 => 1,
+        0 =>
+        match address_eq(a, string_back_addr()) {
+          1 => 1,
+          0 =>
+          match address_eq(a, string_legacy_back_addr()) {
+            1 => 1,
+            0 =>
+            0,
+          },
+        },
+      },
+    }
+  }
+
+  -- Addresses handled by `try_bitvec_dispatch`.
+  fn is_bitvec_prim_addr(a: Addr) -> G {
+    match address_eq(a, bit_vec_to_nat_addr()) {
+      1 => 1,
+      0 =>
+      match address_eq(a, bit_vec_ult_addr()) {
+        1 => 1,
+        0 =>
+        match address_eq(a, decidable_decide_addr()) {
+          1 => 1,
+          0 =>
+          0,
+        },
+      },
+    }
+  }
+
+  -- Addresses handled by `try_reduce_native`.
+  fn is_native_prim_addr(a: Addr) -> G {
+    match address_eq(a, reduce_bool_addr()) {
+      1 => 1,
+      0 =>
+      match address_eq(a, reduce_nat_addr()) {
+        1 => 1,
+        0 =>
+        match address_eq(a, subtype_val_addr()) {
+          1 => 1,
+          0 =>
+          match address_eq(a, size_of_size_of_addr()) {
+            1 => 1,
+            0 =>
+            match address_eq(a, punit_size_of_1_addr()) {
+              1 => 1,
+              0 =>
+              match address_eq(a, system_platform_num_bits_addr()) {
+                1 => 1,
+                0 =>
+                0,
+              },
+            },
+          },
+        },
+      },
+    }
+  }
+
+  -- Addresses handled by `try_reduce_decidable`.
+  fn is_decidable_prim_addr(a: Addr) -> G {
+    match address_eq(a, nat_dec_eq_addr()) {
+      1 => 1,
+      0 =>
+      match address_eq(a, nat_dec_lt_addr()) {
+        1 => 1,
+        0 =>
+        match address_eq(a, nat_dec_le_addr()) {
+          1 => 1,
+          0 =>
+          match address_eq(a, int_dec_eq_addr()) {
+            1 => 1,
+            0 =>
+            match address_eq(a, int_dec_lt_addr()) {
+              1 => 1,
+              0 =>
+              match address_eq(a, int_dec_le_addr()) {
+                1 => 1,
+                0 =>
+                0,
+              },
+            },
+          },
+        },
+      },
+    }
   }
 
   -- Mirror: src/ix/kernel/whnf.rs:500-700 Nat-on-literals dispatch.
@@ -1422,13 +1597,46 @@ def primitive := ⟦
                         },
                       _ => (0, store(KExprNode.BVar(0))),
                     },
-                  _ => (0, store(KExprNode.BVar(0))),
+                  _ =>
+                    -- Symbolic base: route to the cold offset-stuck check
+                    -- (separate circuit so its width only taxes actual
+                    -- symbolic-base events, not every literal binop row).
+                    match pb {
+                      (1, nb) =>
+                        try_nat_offset_dispatch(head_addr, a0_w, nb, spine, addrs),
+                      _ => (0, store(KExprNode.BVar(0))),
+                    },
                 },
             },
         },
     }
   }
 
+  -- Cold path of try_nat_dispatch: a binary Nat op whose whnf'd base is
+  -- symbolic and whose second arg is `Lit nb`. For `Nat.add` (any nb) and
+  -- `Nat.div`/`Nat.mod` (nb >= 2) the term is irreducible: return verdict
+  -- 2 = "already stuck in compact offset form" so whnf keeps it instead of
+  -- delta-unfolding into a succ^nb tower / the division algorithm
+  -- (`x >>> k` = k nested `Nat.div _ 2`). Pairs with offset-aware def-eq.
+  fn try_nat_offset_dispatch(head_addr: Addr, a0_w: KExpr, nb: KLimbs,
+                             spine: List‹KExpr›, addrs: List‹Addr›) -> (G, KExpr) {
+    let is_add = address_eq(head_addr, nat_add_addr());
+    let is_divmod = address_eq(head_addr, nat_div_addr())
+      + address_eq(head_addr, nat_mod_addr());
+    let eligible =
+      is_add + is_divmod * (1 - klimbs_is_zero(nb))
+        * (1 - klimbs_is_zero(klimbs_dec(nb)));
+    match eligible {
+      0 => (0, store(KExprNode.BVar(0))),
+      _ =>
+        let post = list_drop(spine, 2);
+        match mk_nat_offset_stuck(a0_w, nb, post, addrs) {
+          (1, stuck) => (2, stuck),
+          (0, _) => (0, store(KExprNode.BVar(0))),
+        },
+    }
+  }
+
   -- Dispatch a Nat binary op by head address. Bool result for beq/ble
   -- wraps via Bool.true / Bool.false ctors (mk_bool).
   fn try_nat_binop_addr(head_addr: Addr, a: KLimbs, b: KLimbs,
@@ -2418,26 +2626,57 @@ def primitive := ⟦
   -- Recognizes Defn-kind constants whose body is `λ x_1 ... x_n → Prj S i (BVar k)`
   -- and shortcuts the unfolding to a direct `Prj S i args[arity-1-k]`. Pure
   -- performance: standard delta+beta still produces same result.
-  fn try_reduce_projection_definition(head_idx: G, spine: List‹KExpr›,
-                                       top: List‹&KConstantInfo›) -> (G, KExpr) {
+  -- Whether the constant at `head_idx` is a projection-definition, and
+  -- its (arity, struct_idx, field, struct_arg_idx) when so. Keyed on the
+  -- INDEX alone (top is constant per run): one memo row per distinct
+  -- constant, where the previous spine-keyed probe paid a row per delta
+  -- candidate — 12.7M entries on the UTF-8 codec check, scaling 39x per
+  -- SInt width doubling (structure-bundle instances force projections
+  -- constantly).
+  fn proj_def_info_of(head_idx: G,
+                      top: List‹&KConstantInfo›) -> (G, G, G, G, G) {
     match load(list_lookup(top, head_idx)) {
       KConstantInfo.Defn(_, _, value, _, _) =>
-        match projection_definition_info(value, 0) {
-          (1, arity, struct_idx, field, struct_arg_idx) =>
-            match u32_less_than(list_length(spine), arity) {
-              1 => (0, store(KExprNode.BVar(0))),
-              0 =>
-                let target_arg = list_lookup(spine, struct_arg_idx);
-                let proj_expr = store(KExprNode.Proj(struct_idx, field, target_arg));
-                let post = list_drop(spine, arity);
-                (1, apply_spine(proj_expr, post)),
-            },
-          _ => (0, store(KExprNode.BVar(0))),
-        },
+        projection_definition_info(value, 0),
+      _ => (0, 0, 0, 0, 0),
+    }
+  }
+
+  -- Scalar gate for inline use (let-bound matches only support scalar
+  -- scrutinees): 1 iff the constant is a projection-definition.
+  fn is_proj_def(head_idx: G, top: List‹&KConstantInfo›) -> G {
+    match proj_def_info_of(head_idx, top) {
+      (1, _, _, _, _) => 1,
+      _ => 0,
+    }
+  }
+
+  -- The spine-keyed application, called ONLY when `is_proj_def` already
+  -- said yes (rare), so the per-delta-candidate row of the old
+  -- ungated form is gone.
+  fn try_reduce_projection_definition(head_idx: G, spine: List‹KExpr›,
+                                       top: List‹&KConstantInfo›) -> (G, KExpr) {
+    match proj_def_info_of(head_idx, top) {
+      (1, arity, struct_idx, field, struct_arg_idx) =>
+        proj_def_apply(arity, struct_idx, field, struct_arg_idx, spine),
       _ => (0, store(KExprNode.BVar(0))),
     }
   }
 
+  -- The spine-dependent application, reached only for ACTUAL
+  -- projection-definitions (rare).
+  fn proj_def_apply(arity: G, struct_idx: G, field: G, struct_arg_idx: G,
+                    spine: List‹KExpr›) -> (G, KExpr) {
+    match u32_less_than(list_length(spine), arity) {
+      1 => (0, store(KExprNode.BVar(0))),
+      0 =>
+        let target_arg = list_lookup(spine, struct_arg_idx);
+        let proj_expr = store(KExprNode.Proj(struct_idx, field, target_arg));
+        let post = list_drop(spine, arity);
+        (1, apply_spine(proj_expr, post)),
+    }
+  }
+
   -- Mirror: src/ix/kernel/whnf.rs:1441-1500 try_reduce_fin_val_decidable_rec
   -- + project_decidable_fin_val_minor.
   --
diff --git a/Ix/IxVM/Kernel/Subst.lean b/Ix/IxVM/Kernel/Subst.lean
index e37d1e0d..9d8a7ce6 100644
--- a/Ix/IxVM/Kernel/Subst.lean
+++ b/Ix/IxVM/Kernel/Subst.lean
@@ -202,6 +202,14 @@ def subst := ⟦
     }
   }
 
+  -- Recursive walk. App children are guarded INLINE at the call site
+  -- (lbr check, then straight into the walk): a no-op subtree produces
+  -- NO query row at all, and a real-work child produces one walk row
+  -- instead of a dispatch row + a walk row — 35% of all expr_inst1
+  -- entries on instantiation-heavy checks were no-op dispatch rows.
+  -- `expr_lbr` is keyed on the child alone, so the guard memoizes
+  -- across arg/depth. App-arm only: guarding the binder arms too blew
+  -- the walk width 53 -> 141 and REGRESSED FFT +4%.
   fn expr_inst1_walk(e: KExpr, arg: KExpr, depth: G) -> KExpr {
     match load(e) {
       KExprNode.BVar(i) =>
@@ -214,12 +222,18 @@ def subst := ⟦
               _ => store(KExprNode.BVar(i - 1)),
             },
         },
-      KExprNode.Srt(l) => store(KExprNode.Srt(l)),
-      KExprNode.Const(idx, lvls) => store(KExprNode.Const(idx, lvls)),
+      KExprNode.Srt(_) => e,
+      KExprNode.Const(_, _) => e,
       KExprNode.App(f, a) =>
-        store(KExprNode.App(
-          expr_inst1(f, arg, depth),
-          expr_inst1(a, arg, depth))),
+        let f2 = match u32_less_than(depth, expr_lbr(f)) {
+          0 => f,
+          1 => expr_inst1_walk(f, arg, depth),
+        };
+        let a2 = match u32_less_than(depth, expr_lbr(a)) {
+          0 => a,
+          1 => expr_inst1_walk(a, arg, depth),
+        };
+        store(KExprNode.App(f2, a2)),
       KExprNode.Lam(ty, body) =>
         store(KExprNode.Lam(
           expr_inst1(ty, arg, depth),
@@ -233,7 +247,7 @@ def subst := ⟦
           expr_inst1(ty, arg, depth),
           expr_inst1(val, arg, depth),
           expr_inst1(body, arg, depth + 1))),
-      KExprNode.Lit(lit) => store(KExprNode.Lit(lit)),
+      KExprNode.Lit(_) => e,
       KExprNode.Proj(tidx, fidx, e1) =>
         store(KExprNode.Proj(tidx, fidx, expr_inst1(e1, arg, depth))),
     }
diff --git a/Ix/IxVM/Kernel/Whnf.lean b/Ix/IxVM/Kernel/Whnf.lean
index d3e76b69..93348a0e 100644
--- a/Ix/IxVM/Kernel/Whnf.lean
+++ b/Ix/IxVM/Kernel/Whnf.lean
@@ -56,6 +56,15 @@ def whnf := ⟦
 
   -- ============================================================================
   -- Beta
+  --
+  -- One argument per step, NOT a simultaneous multi-arg substitution: a
+  -- measured choice. A simultaneous `instantiate_rev`-style peel was
+  -- tried (2026-06-10) and REGRESSED FFT ~3%: recursor betas share a
+  -- constant argument prefix across iterations (motive/base/step), so
+  -- the sequential chain's per-arg memo keys hit across every iteration,
+  -- while a combined args-list key is unique per iteration and re-walks
+  -- the whole body each time. Curried beats tupled under Aiur's
+  -- content-memoization.
   -- ============================================================================
 
   -- Peel a beta telescope: consume one spine arg per leading `Lam` of `lam`,
@@ -164,82 +173,6 @@ def whnf := ⟦
     }
   }
 
-  -- If `head` is a Nat primitive (`Nat.add` / `Nat.div` / `Nat.mod`) applied to
-  -- exactly (non-literal base, literal second arg), return (1, the same op in
-  -- canonical form) so whnf leaves it STUCK instead of delta-unfolding it. This
-  -- stops `Nat.add x n` from materializing succ^n(x), and `Nat.div`/`Nat.mod x n`
-  -- (n ≥ 2) from expanding the division algorithm — both are irreducible for a
-  -- symbolic base, so the compact form IS the normal form. `Nat.shiftRight x k`
-  -- unfolds to k nested `Nat.div _ 2`, which now stay stuck. Thresholds: `add`
-  -- keeps any nonzero n; `div`/`mod` keep n ≥ 2 (so `x/1 = x`, `x/0 = 0` still
-  -- reduce). All magnitudes stay KLimbs. `(0, _)` = "not this shape".
-  fn try_nat_offset_stuck(head: KExpr, spine: List‹KExpr›, types: List‹KExpr›,
-                          top: List‹&KConstantInfo›, addrs: List‹Addr›) -> (G, KExpr) {
-    match load(head) {
-      KExprNode.Const(idx, _) =>
-        let ca = list_lookup(addrs, idx);
-        let is_add = address_eq(ca, nat_add_addr());
-        let is_divmod = address_eq(ca, nat_div_addr()) + address_eq(ca, nat_mod_addr());
-        match is_add + is_divmod {
-          0 => (0, store(KExprNode.BVar(0))),
-          _ =>
-            match list_length(spine) {
-              2 =>
-                let a0_w = whnf(list_lookup(spine, 0), types, top, addrs);
-                let a1_w = whnf(list_lookup(spine, 1), types, top, addrs);
-                match try_extract_nat(a1_w, addrs) {
-                  (0, _) => (0, store(KExprNode.BVar(0))),
-                  (1, n) =>
-                    -- reject n=0 (all ops) and n=1 (div/mod only).
-                    let bad = klimbs_is_zero(n) + is_divmod * klimbs_is_zero(klimbs_dec(n));
-                    match bad {
-                      0 =>
-                        match try_extract_nat(a0_w, addrs) {
-                          (1, _) => (0, store(KExprNode.BVar(0))),
-                          (0, _) =>
-                            (1, store(KExprNode.App(
-                                  store(KExprNode.App(head, a0_w)),
-                                  mk_nat_lit(n)))),
-                        },
-                      _ => (0, store(KExprNode.BVar(0))),
-                    },
-                },
-              _ => (0, store(KExprNode.BVar(0))),
-            },
-        },
-      _ => (0, store(KExprNode.BVar(0))),
-    }
-  }
-
-  -- The address-keyed primitive gauntlet: nat → str → bitvec → native →
-  -- decidable, in order. Returns (1, reduced) on the first family that matches
-  -- and reduces, else (0, _). Factored out so `prim_any_addr` can gate the
-  -- whole chain for non-primitive heads in one shot.
-  fn try_address_primitives(head_addr: Addr, idx: G, lvls: List‹&KLevel›,
-                            spine: List‹KExpr›, types: List‹KExpr›,
-                            top: List‹&KConstantInfo›, addrs: List‹Addr›) -> (G, KExpr) {
-    match try_nat_dispatch(head_addr, spine, types, top, addrs) {
-      (1, r) => (1, r),
-      (0, _) =>
-        match try_str_dispatch(head_addr, spine, addrs) {
-          (1, r) => (1, r),
-          (0, _) =>
-            match try_bitvec_dispatch(head_addr, spine, types, top, addrs) {
-              (1, r) => (1, r),
-              (0, _) =>
-                match try_reduce_native(head_addr, spine, types, top, addrs) {
-                  (1, r) => (1, r),
-                  (0, _) =>
-                    match try_reduce_decidable(head_addr, idx, lvls, spine, types, top, addrs) {
-                      (1, r) => (1, r),
-                      (0, _) => (0, store(KExprNode.BVar(0))),
-                    },
-                },
-            },
-        },
-    }
-  }
-
   -- Const-head WHNF dispatch, split out of `whnf_with_spine` (see its Const arm).
   -- `head` is the original `Const(idx, lvls)` KExpr, passed for the stuck
   -- `apply_spine(head, spine)` fallbacks.
@@ -264,40 +197,61 @@ def whnf := ⟦
           (0, _) => apply_spine(head, spine),
         },
       _ =>
-        -- Skip the address-primitive gauntlet for the common non-primitive
-        -- head: `prim_any_addr` (memoized per head address) is 1 only for the
-        -- ~43 ops the gauntlet recognizes, so a 0 means no `try_*` can match —
-        -- go straight to projection-definition / delta. Validated by a
-        -- differential assert (now removed) over the suite + heavy consts.
-        let addr_prim = match prim_any_addr(head_addr) {
-          1 => try_address_primitives(head_addr, idx, lvls, spine, types, top, addrs),
+        -- Family-gated primitive dispatch. `prim_family` is keyed on the
+        -- head ADDRESS alone, so it memoizes to one row per distinct
+        -- constant address in the run; at most ONE family reducer is
+        -- then called. The previous form ran every reducer as a gauntlet
+        -- (nat -> str -> bitvec -> native -> decidable), charging 4-6
+        -- wide rows per Const-head whnf for guaranteed misses — the
+        -- families' address sets are mutually disjoint, so the gauntlet
+        -- order never mattered. A family-reducer miss (e.g. `Nat.add` on
+        -- non-literal args) falls through to proj-def/delta exactly as
+        -- the gauntlet's final arm did.
+        let fam = prim_family(head_addr);
+        let attempt = match fam {
+          1 => try_nat_dispatch(head_addr, spine, types, top, addrs),
+          2 => try_str_dispatch(head_addr, spine, addrs),
+          3 => try_bitvec_dispatch(head_addr, spine, types, top, addrs),
+          4 => try_reduce_native(head_addr, spine, types, top, addrs),
+          5 => try_reduce_decidable(head_addr, idx, lvls, spine, types, top, addrs),
           _ => (0, store(KExprNode.BVar(0))),
         };
-        match addr_prim {
+        match attempt {
           (1, reduced) => whnf(reduced, types, top, addrs),
+          -- verdict 2: the reducer normalized the term to an already-stuck
+          -- compact form (symbolic Nat offset); return it WITHOUT re-whnf,
+          -- which would loop.
+          (2, stuck) => stuck,
           (0, _) =>
-                let proj_def_pair = try_reduce_projection_definition(idx, spine, top);
-                match proj_def_pair {
-                  (1, reduced_pd) => whnf(reduced_pd, types, top, addrs),
-                  (0, _) =>
-                    -- Mirror src/ix/kernel/whnf.rs:756-774 (`delta_unfold_one`):
-                    -- unfold any Defn regardless of `ReducibilityHints`.
-                    match ci {
-                      KConstantInfo.Defn(_, _, value, _, _) =>
-                        -- Keep `Nat.add base (Lit n)` (symbolic base) stuck as a
-                        -- compact offset instead of delta-unfolding into a
-                        -- succ^n tower. Pairs with offset-aware def-eq.
-                        match try_nat_offset_stuck(head, spine, types, top, addrs) {
-                          (1, stuck) => stuck,
-                          (0, _) =>
-                            let body = expr_inst_levels(value, lvls);
-                            whnf_with_spine(body, spine, types, top, addrs),
-                        },
-                      KConstantInfo.Thm(_, _, _) => apply_spine(head, spine),
-                      _ => apply_spine(head, spine),
-                    },
+            -- gate on the idx-keyed proj-def classification (memoized per
+            -- constant): only real projection-definitions pay the
+            -- spine-keyed apply row
+            let proj_def_pair = match is_proj_def(idx, top) {
+              1 => try_reduce_projection_definition(idx, spine, top),
+              _ => (0, store(KExprNode.BVar(0))),
+            };
+            match proj_def_pair {
+              (1, reduced_pd) => whnf(reduced_pd, types, top, addrs),
+              (0, _) =>
+                -- Mirror src/ix/kernel/whnf.rs:756-774
+                -- (`delta_unfold_one`): unfold any Defn
+                -- regardless of `ReducibilityHints`. The
+                -- hint is consulted by lazy-delta's
+                -- `delta_rank` for def-eq priority, not
+                -- as a gate on plain whnf delta. Without
+                -- unfolding here, ctor field types
+                -- written via opaque defs (e.g.
+                -- `constType (n α) (n α)`) stay stuck
+                -- and `check_positivity_aug` misclassifies.
+                match ci {
+                  KConstantInfo.Defn(_, _, value, _, _) =>
+                    let body = expr_inst_levels(value, lvls);
+                    whnf_with_spine(body, spine, types, top, addrs),
+                  KConstantInfo.Thm(_, _, _) => apply_spine(head, spine),
+                  _ => apply_spine(head, spine),
                 },
             },
+        },
     }
   }
 
@@ -557,20 +511,7 @@ def whnf := ⟦
                   -- value in the same compact `base + n` form a literal already
                   -- has, so def-eq converges instead of descending n unary
                   -- succ layers (the UTF-8 `x + 0xC0` pathology).
-                  (0, _) =>
-                    match klimbs_is_zero(n_klimbs) {
-                      1 => (1, apply_spine(base_w, post)),
-                      0 =>
-                        match find_addr_idx_safe(nat_add_addr(), addrs, 0) {
-                          (0, _) => (0, store(KExprNode.BVar(0))),
-                          (1, add_idx) =>
-                            let add_const = store(KExprNode.Const(add_idx, store(ListNode.Nil)));
-                            let off = store(KExprNode.App(
-                              store(KExprNode.App(add_const, base_w)),
-                              mk_nat_lit(n_klimbs)));
-                            (1, apply_spine(off, post)),
-                        },
-                    },
+                  (0, _) => mk_nat_offset_stuck(base_w, n_klimbs, post, addrs),
                   (1, b_klimbs) =>
                     (1, apply_spine(mk_nat_lit(klimbs_add(b_klimbs, n_klimbs)), post)),
                 },
diff --git a/Ix/IxVM/KernelTypes.lean b/Ix/IxVM/KernelTypes.lean
index b4a57b34..3d511edd 100644
--- a/Ix/IxVM/KernelTypes.lean
+++ b/Ix/IxVM/KernelTypes.lean
@@ -68,33 +68,6 @@ def kernelTypes := ⟦
     }
   }
 
-  -- ============================================================================
-  -- Values (NbE semantic domain)
-  -- ============================================================================
-
-  enum KValNode {
-    Srt(&KLevel),
-    Lit(KLiteral),
-    Lam(KVal, KExpr, KValEnv),
-    Pi(KVal, KExpr, KValEnv),
-    Ctor(G, List‹&KLevel›, G, List‹KVal›),
-    FVar(G, KVal, List‹KVal›),
-    Axiom(G, List‹&KLevel›, List‹KVal›),
-    Defn(G, List‹&KLevel›, List‹KVal›),
-    Thm(G, List‹&KLevel›, List‹KVal›),
-    Opaque(G, List‹&KLevel›, List‹KVal›),
-    Quot(G, List‹&KLevel›, List‹KVal›),
-    Induct(G, List‹&KLevel›, List‹KVal›),
-    Rec(G, List‹&KLevel›, List‹KVal›),
-    Proj(G, G, KVal, List‹KVal›),
-    Thunk(KExpr, KValEnv)
-  }
-
-  type KVal = &KValNode
-
-  -- Value environment (de Bruijn indexed, front = most recent binder)
-  type KValEnv = List‹KVal›
-
   -- ============================================================================
   -- Recursor Rule
   --
diff --git a/Main.lean b/Main.lean
index e72ae1e8..5f70ba71 100644
--- a/Main.lean
+++ b/Main.lean
@@ -1,5 +1,6 @@
 --import Ix.Cli.StoreCmd
 import Ix.Cli.AddrOfCmd
+import Ix.Cli.RefsOfCmd
 import Ix.Cli.CheckCmd
 import Ix.Cli.CheckRsCmd
 import Ix.Cli.ClaimCmd
@@ -34,6 +35,7 @@ def ixCmd : Cli.Cmd := `[Cli|
     shardCmd;
     verifyCmd;
     addrOfCmd;
+    refsOfCmd;
     ingressCmd;
     validateCmd;
     serveCmd;
diff --git a/src/aiur/execute.rs b/src/aiur/execute.rs
index e28dae6e..c724f9b9 100644
--- a/src/aiur/execute.rs
+++ b/src/aiur/execute.rs
@@ -15,12 +15,165 @@ use crate::{
   },
 };
 
-pub struct QueryResult {
-  pub(crate) output: Vec<G>,
+/// Immutable view of one query entry.
+#[derive(Clone, Copy)]
+pub struct QueryRef<'a> {
+  pub(crate) output: &'a [G],
   pub(crate) multiplicity: G,
 }
 
-pub type QueryMap = FxIndexMap<Vec<G>, QueryResult>;
+/// Mutable view of one query entry: the output is fixed at insertion,
+/// only the multiplicity is bumped on memo hits.
+pub struct QueryRefMut<'a> {
+  pub(crate) output: &'a [G],
+  pub(crate) multiplicity: &'a mut G,
+}
+
+fn hash_g_slice(key: &[G]) -> u64 {
+  use std::hash::Hasher;
+  let mut h = rustc_hash::FxHasher::default();
+  for g in key {
+    h.write_u64(g.as_canonical_u64());
+  }
+  h.finish()
+}
+
+/// Append-only query store with a hash index.
+///
+/// Functionally the insertion-ordered map `args -> (output, multiplicity)`
+/// it replaces (`FxIndexMap<Vec<G>, QueryResult>`) — but every circuit has
+/// a FIXED key arity and output width, so keys and outputs live in flat
+/// `Vec<G>` arenas addressed by entry index, and the hash table holds only
+/// `u32` indices. This cuts per-entry overhead from ~130 B (two heap
+/// `Vec`s + IndexMap bucket + allocator metadata) to the raw field
+/// elements plus ~13 B of index. The record IS the proof witness, so
+/// entries cannot be dropped — only stored compactly; on kernel-heavy
+/// executions it is the dominant RAM consumer (billions of entries).
+///
+/// Entry index == insertion order; memory circuits use it as the pointer
+/// value, mirroring the old `IndexMap::get_index_of` semantics.
+pub struct QueryMap {
+  key_stride: usize,
+  /// Output width; inferred on first insert (not statically available in
+  /// `FunctionLayout`).
+  out_stride: usize,
+  keys: Vec<G>,
+  outs: Vec<G>,
+  mults: Vec<G>,
+  table: hashbrown::HashTable<u32>,
+}
+
+impl QueryMap {
+  pub fn new(key_stride: usize) -> Self {
+    Self {
+      key_stride,
+      out_stride: 0,
+      keys: Vec::new(),
+      outs: Vec::new(),
+      mults: Vec::new(),
+      table: hashbrown::HashTable::new(),
+    }
+  }
+
+  #[inline]
+  pub fn len(&self) -> usize {
+    self.mults.len()
+  }
+
+  #[inline]
+  pub fn is_empty(&self) -> bool {
+    self.mults.is_empty()
+  }
+
+  /// Total retained field elements (keys + outputs); used by the
+  /// `IX_AIUR_QUERY_STATS` RAM-attribution dump.
+  pub fn retained_elems(&self) -> usize {
+    self.keys.len() + self.outs.len()
+  }
+
+  #[inline]
+  fn key_at(&self, i: usize) -> &[G] {
+    &self.keys[i * self.key_stride..(i + 1) * self.key_stride]
+  }
+
+  #[inline]
+  fn out_at(&self, i: usize) -> &[G] {
+    &self.outs[i * self.out_stride..(i + 1) * self.out_stride]
+  }
+
+  pub fn get_index_of(&self, key: &[G]) -> Option<usize> {
+    debug_assert_eq!(key.len(), self.key_stride);
+    let hash = hash_g_slice(key);
+    self
+      .table
+      .find(hash, |&i| self.key_at(i as usize) == key)
+      .map(|&i| i as usize)
+  }
+
+  pub fn get(&self, key: &[G]) -> Option<QueryRef<'_>> {
+    let i = self.get_index_of(key)?;
+    Some(QueryRef { output: self.out_at(i), multiplicity: self.mults[i] })
+  }
+
+  pub fn get_mut(&mut self, key: &[G]) -> Option<QueryRefMut<'_>> {
+    let i = self.get_index_of(key)?;
+    let output = &self.outs[i * self.out_stride..(i + 1) * self.out_stride];
+    Some(QueryRefMut { output, multiplicity: &mut self.mults[i] })
+  }
+
+  /// Append a new entry. The key must not already be present: call sites
+  /// only insert on a confirmed miss, and a same-key re-entrant call
+  /// would loop forever before reaching its own insert.
+  pub fn insert(&mut self, key: &[G], output: &[G], multiplicity: G) {
+    debug_assert_eq!(key.len(), self.key_stride);
+    debug_assert!(self.get_index_of(key).is_none());
+    if self.mults.is_empty() {
+      self.out_stride = output.len();
+    } else {
+      debug_assert_eq!(output.len(), self.out_stride);
+    }
+    let hash = hash_g_slice(key);
+    let i = u32::try_from(self.mults.len()).expect("query map overflow");
+    self.keys.extend_from_slice(key);
+    self.outs.extend_from_slice(output);
+    self.mults.push(multiplicity);
+    let keys = &self.keys;
+    let stride = self.key_stride;
+    self.table.insert_unique(hash, i, |&j| {
+      let j = j as usize;
+      hash_g_slice(&keys[j * stride..(j + 1) * stride])
+    });
+  }
+
+  /// Entry at insertion index `i`: the key slice plus a mutable handle on
+  /// the multiplicity (memory `Load` bumps the pointed-to row's count).
+  pub fn get_index_mut(&mut self, i: usize) -> Option<(&[G], &mut G)> {
+    if i >= self.mults.len() {
+      return None;
+    }
+    let key = &self.keys[i * self.key_stride..(i + 1) * self.key_stride];
+    Some((key, &mut self.mults[i]))
+  }
+
+  pub fn get_index(&self, i: usize) -> Option<(&[G], QueryRef<'_>)> {
+    if i >= self.len() {
+      return None;
+    }
+    Some((
+      self.key_at(i),
+      QueryRef { output: self.out_at(i), multiplicity: self.mults[i] },
+    ))
+  }
+
+  pub fn iter(&self) -> impl Iterator<Item = (&[G], QueryRef<'_>)> {
+    (0..self.len()).map(|i| {
+      (
+        self.key_at(i),
+        QueryRef { output: self.out_at(i), multiplicity: self.mults[i] },
+      )
+    })
+  }
+}
 
 pub struct QueryRecord {
   pub(crate) function_queries: Vec<QueryMap>,
@@ -31,12 +184,15 @@ pub struct QueryRecord {
 
 impl QueryRecord {
   fn new(toplevel: &Toplevel) -> Self {
-    let function_queries =
-      toplevel.functions.iter().map(|_| QueryMap::default()).collect();
+    let function_queries = toplevel
+      .functions
+      .iter()
+      .map(|f| QueryMap::new(f.layout.input_size))
+      .collect();
     let memory_queries = toplevel
       .memory_sizes
       .iter()
-      .map(|width| (*width, QueryMap::default()))
+      .map(|width| (*width, QueryMap::new(*width)))
       .collect();
     let bytes1_queries = Bytes1Queries::new();
     let bytes2_queries = Bytes2Queries::new();
@@ -158,6 +314,40 @@ impl std::fmt::Display for ExecError {
 
 impl std::error::Error for ExecError {}
 
+/// Gated by `IX_AIUR_QUERY_STATS=1`: dump per-function query-map sizes
+/// during/after execution so RAM blowups can be attributed to specific
+/// Aiur functions (indices resolve to names via the Lean
+/// `CompiledToplevel`). Heaviest maps first, by retained G elements.
+static QUERY_STATS: std::sync::LazyLock<bool> = std::sync::LazyLock::new(
+  || std::env::var_os("IX_AIUR_QUERY_STATS").is_some(),
+);
+
+fn dump_query_stats(record: &QueryRecord, tag: &str) {
+  let mut rows: Vec<(usize, usize, usize)> = record
+    .function_queries
+    .iter()
+    .enumerate()
+    .map(|(i, m)| (i, m.len(), m.retained_elems()))
+    .filter(|(_, n, _)| *n > 0)
+    .collect();
+  rows.sort_by(|a, b| b.2.cmp(&a.2));
+  let total_entries: usize = rows.iter().map(|r| r.1).sum();
+  let total_elems: usize = rows.iter().map(|r| r.2).sum();
+  eprintln!(
+    "[aiur-stats {tag}] function_queries: {total_entries} entries, \
+     {total_elems} G-elems; top maps:"
+  );
+  for (i, n, e) in rows.iter().take(30) {
+    eprintln!("  fn{i:<4} entries={n:<12} g_elems={e}");
+  }
+  let mem: Vec<String> = record
+    .memory_queries
+    .iter()
+    .map(|(w, m)| format!("w{w}={}", m.len()))
+    .collect();
+  eprintln!("[aiur-stats {tag}] memory entries: {}", mem.join(" "));
+}
+
 impl Toplevel {
   pub fn execute(
     &self,
@@ -172,6 +362,9 @@ impl Toplevel {
     let function = &self.functions[fun_idx];
     let output =
       function.execute(fun_idx, args, self, &mut record, io_buffer)?;
+    if *QUERY_STATS {
+      dump_query_stats(&record, "final");
+    }
     Ok((record, output))
   }
 }
@@ -213,7 +406,14 @@ impl Function {
     }
     push_block_exec_entries!(&self.body);
     let mut unconstrained = false;
+    let mut stats_ops: u64 = 0;
     while let Some(exec_entry) = exec_entries_stack.pop() {
+      if *QUERY_STATS {
+        stats_ops += 1;
+        if stats_ops.is_multiple_of(1 << 31) {
+          dump_query_stats(record, &format!("ops={stats_ops}"));
+        }
+      }
       match exec_entry {
         ExecEntry::Op(Op::Const(c)) => map.push(*c),
         ExecEntry::Op(Op::Add(a, b)) => {
@@ -236,14 +436,14 @@ impl Function {
           map.push(G::from_bool(a == G::ZERO));
         },
         ExecEntry::Op(Op::Call(callee_idx, args, _, op_unconstrained)) => {
-          let args = args.iter().map(|i| map[*i]).collect();
+          let args: Vec<G> = args.iter().map(|i| map[*i]).collect();
           if let Some(result) =
             record.function_queries[*callee_idx].get_mut(&args)
           {
             if !unconstrained && !op_unconstrained {
-              result.multiplicity += G::ONE;
+              *result.multiplicity += G::ONE;
             }
-            map.extend(result.output.clone());
+            map.extend_from_slice(result.output);
           } else {
             let saved_map = std::mem::replace(&mut map, args);
             callers_states_stack.push(CallerState {
@@ -266,16 +466,16 @@ impl Function {
             .ok_or(ExecError::InvalidMemorySize(size))?;
           if let Some(result) = memory_queries.get_mut(&values) {
             if !unconstrained {
-              result.multiplicity += G::ONE;
+              *result.multiplicity += G::ONE;
             }
-            map.extend(&result.output);
+            map.extend_from_slice(result.output);
           } else {
             let ptr = G::from_usize(memory_queries.len());
-            let result = QueryResult {
-              output: vec![ptr],
-              multiplicity: G::from_bool(!unconstrained),
-            };
-            memory_queries.insert(values, result);
+            memory_queries.insert(
+              &values,
+              &[ptr],
+              G::from_bool(!unconstrained),
+            );
             map.push(ptr);
           }
         },
@@ -289,13 +489,13 @@ impl Function {
           let ptr_usize = usize::try_from(ptr_u64)
             .ok()
             .ok_or(ExecError::PointerTooLarge(ptr_u64))?;
-          let (args, result) = memory_queries
+          let (args, multiplicity) = memory_queries
             .get_index_mut(ptr_usize)
             .ok_or(ExecError::UnboundPointer { ptr: ptr_u64, size: *size })?;
           if !unconstrained {
-            result.multiplicity += G::ONE;
+            *multiplicity += G::ONE;
           }
-          map.extend(args);
+          map.extend_from_slice(args);
         },
         ExecEntry::Op(Op::AssertEq(xs, ys)) => {
           if xs.len() != ys.len() {
@@ -542,13 +742,12 @@ impl Function {
         ExecEntry::Ctrl(Ctrl::Return(_, output)) => {
           // Register the query.
           let input_size = toplevel.functions[fun_idx].layout.input_size;
-          let args = map[..input_size].to_vec();
           let output = output.iter().map(|i| map[*i]).collect::<Vec<_>>();
-          let result = QueryResult {
-            output: output.clone(),
-            multiplicity: G::from_bool(!unconstrained),
-          };
-          record.function_queries[fun_idx].insert(args, result);
+          record.function_queries[fun_idx].insert(
+            &map[..input_size],
+            &output,
+            G::from_bool(!unconstrained),
+          );
           if let Some(CallerState {
             fun_idx: caller_idx,
             map: caller_map,
diff --git a/src/aiur/memory.rs b/src/aiur/memory.rs
index fa676394..e2a90d4d 100644
--- a/src/aiur/memory.rs
+++ b/src/aiur/memory.rs
@@ -7,7 +7,7 @@ use multi_stark::{
 };
 use rayon::{
   iter::{
-    IndexedParallelIterator, IntoParallelRefIterator,
+    IndexedParallelIterator,
     IntoParallelRefMutIterator, ParallelIterator,
   },
   slice::ParallelSliceMut,
@@ -69,10 +69,10 @@ impl Memory {
 
     rows_no_padding
       .par_chunks_mut(width)
-      .zip(queries.par_iter())
       .zip(lookups_no_padding.par_iter_mut())
       .enumerate()
-      .for_each(|(i, ((row, (values, result)), row_lookups))| {
+      .for_each(|(i, (row, row_lookups))| {
+        let (values, result) = queries.get_index(i).expect("index in range");
         row[0] = result.multiplicity;
         row[1] = G::ONE;
         row[2] = G::from_usize(i);