From b86c632879a819b56e0ac405ce735853c93314c0 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 11 Jun 2026 14:36:24 +0000
Subject: [PATCH 1/8] Aiur: flat append-only QueryMap + env-gated record-size
 stats
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the per-circuit FxIndexMap<Vec<G>, QueryResult> with flat arenas:
keys/outputs/multiplicities in contiguous Vec<G>s (fixed stride per
circuit) indexed by a hashbrown HashTable<u32>. The record IS the proof
witness so entries cannot be dropped — only stored compactly: per-entry
RAM drops from ~133 B (two heap Vecs + IndexMap bucket + allocator
metadata) to ~54 B, and execution runs ~3x faster wall-clock (no
per-call heap allocation, better locality). Entry index == insertion
order, preserving the memory-circuit pointer semantics. Circuit stats
are bit-identical to the map-based form.

Measured on ByteArray.utf8DecodeChar?_utf8EncodeChar_append: at the
op-count where the old representation OOM-killed a 249 GB box (12.9B
ops, 1.64B entries), the flat form sits at 104 GB vs 246 GB, with
bit-identical memory-circuit entry counts.

Also adds IX_AIUR_QUERY_STATS=1: periodic (every 2^31 ops) + final
dumps of per-function entry counts and retained G-elems, the
RAM-attribution diagnostic used to find this and the kernel-side
pathologies.
---
 Cargo.lock          |   3 +
 Cargo.toml          |   1 +
 src/aiur/execute.rs | 249 +++++++++++++++++++++++++++++++++++++++-----
 src/aiur/memory.rs  |   6 +-
 4 files changed, 231 insertions(+), 28 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 7de24578..1d573036 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1121,6 +1121,8 @@ version = "0.15.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
 dependencies = [
+ "allocator-api2",
+ "equivalent",
  "foldhash 0.1.5",
 ]
 
@@ -1728,6 +1730,7 @@ dependencies = [
  "bytes",
  "dashmap",
  "getrandom 0.3.4",
+ "hashbrown 0.15.5",
  "indexmap",
  "iroh",
  "iroh-base",
diff --git a/Cargo.toml b/Cargo.toml
index 58b90be3..e83b8384 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,6 +11,7 @@ anyhow = "1"
 blake3 = "1.8.4"
 itertools = "0.14.0"
 indexmap = { version = "2", features = ["rayon"] }
+hashbrown = "0.15"
 lean-ffi = { git = "https://github.com/argumentcomputer/lean-ffi", rev = "cc98ebf67bf453ac3827cb767f78b13ea674dd6a" }
 mimalloc = { version = "0.1", default-features = false }
 multi-stark = { git = "https://github.com/argumentcomputer/multi-stark.git", rev = "9ecab51d553445c0cc7b571af00a76b8a83a6f8c" }
diff --git a/src/aiur/execute.rs b/src/aiur/execute.rs
index e28dae6e..c724f9b9 100644
--- a/src/aiur/execute.rs
+++ b/src/aiur/execute.rs
@@ -15,12 +15,165 @@ use crate::{
   },
 };
 
-pub struct QueryResult {
-  pub(crate) output: Vec<G>,
+/// Immutable view of one query entry.
+#[derive(Clone, Copy)]
+pub struct QueryRef<'a> {
+  pub(crate) output: &'a [G],
   pub(crate) multiplicity: G,
 }
 
-pub type QueryMap = FxIndexMap<Vec<G>, QueryResult>;
+/// Mutable view of one query entry: the output is fixed at insertion,
+/// only the multiplicity is bumped on memo hits.
+pub struct QueryRefMut<'a> {
+  pub(crate) output: &'a [G],
+  pub(crate) multiplicity: &'a mut G,
+}
+
+fn hash_g_slice(key: &[G]) -> u64 {
+  use std::hash::Hasher;
+  let mut h = rustc_hash::FxHasher::default();
+  for g in key {
+    h.write_u64(g.as_canonical_u64());
+  }
+  h.finish()
+}
+
+/// Append-only query store with a hash index.
+///
+/// Functionally the insertion-ordered map `args -> (output, multiplicity)`
+/// it replaces (`FxIndexMap<Vec<G>, QueryResult>`) — but every circuit has
+/// a FIXED key arity and output width, so keys and outputs live in flat
+/// `Vec<G>` arenas addressed by entry index, and the hash table holds only
+/// `u32` indices. This cuts per-entry overhead from ~130 B (two heap
+/// `Vec`s + IndexMap bucket + allocator metadata) to the raw field
+/// elements plus ~13 B of index. The record IS the proof witness, so
+/// entries cannot be dropped — only stored compactly; on kernel-heavy
+/// executions it is the dominant RAM consumer (billions of entries).
+///
+/// Entry index == insertion order; memory circuits use it as the pointer
+/// value, mirroring the old `IndexMap::get_index_of` semantics.
+pub struct QueryMap {
+  key_stride: usize,
+  /// Output width; inferred on first insert (not statically available in
+  /// `FunctionLayout`).
+  out_stride: usize,
+  keys: Vec<G>,
+  outs: Vec<G>,
+  mults: Vec<G>,
+  table: hashbrown::HashTable<u32>,
+}
+
+impl QueryMap {
+  pub fn new(key_stride: usize) -> Self {
+    Self {
+      key_stride,
+      out_stride: 0,
+      keys: Vec::new(),
+      outs: Vec::new(),
+      mults: Vec::new(),
+      table: hashbrown::HashTable::new(),
+    }
+  }
+
+  #[inline]
+  pub fn len(&self) -> usize {
+    self.mults.len()
+  }
+
+  #[inline]
+  pub fn is_empty(&self) -> bool {
+    self.mults.is_empty()
+  }
+
+  /// Total retained field elements (keys + outputs); used by the
+  /// `IX_AIUR_QUERY_STATS` RAM-attribution dump.
+  pub fn retained_elems(&self) -> usize {
+    self.keys.len() + self.outs.len()
+  }
+
+  #[inline]
+  fn key_at(&self, i: usize) -> &[G] {
+    &self.keys[i * self.key_stride..(i + 1) * self.key_stride]
+  }
+
+  #[inline]
+  fn out_at(&self, i: usize) -> &[G] {
+    &self.outs[i * self.out_stride..(i + 1) * self.out_stride]
+  }
+
+  pub fn get_index_of(&self, key: &[G]) -> Option<usize> {
+    debug_assert_eq!(key.len(), self.key_stride);
+    let hash = hash_g_slice(key);
+    self
+      .table
+      .find(hash, |&i| self.key_at(i as usize) == key)
+      .map(|&i| i as usize)
+  }
+
+  pub fn get(&self, key: &[G]) -> Option<QueryRef<'_>> {
+    let i = self.get_index_of(key)?;
+    Some(QueryRef { output: self.out_at(i), multiplicity: self.mults[i] })
+  }
+
+  pub fn get_mut(&mut self, key: &[G]) -> Option<QueryRefMut<'_>> {
+    let i = self.get_index_of(key)?;
+    let output = &self.outs[i * self.out_stride..(i + 1) * self.out_stride];
+    Some(QueryRefMut { output, multiplicity: &mut self.mults[i] })
+  }
+
+  /// Append a new entry. The key must not already be present: call sites
+  /// only insert on a confirmed miss, and a same-key re-entrant call
+  /// would loop forever before reaching its own insert.
+  pub fn insert(&mut self, key: &[G], output: &[G], multiplicity: G) {
+    debug_assert_eq!(key.len(), self.key_stride);
+    debug_assert!(self.get_index_of(key).is_none());
+    if self.mults.is_empty() {
+      self.out_stride = output.len();
+    } else {
+      debug_assert_eq!(output.len(), self.out_stride);
+    }
+    let hash = hash_g_slice(key);
+    let i = u32::try_from(self.mults.len()).expect("query map overflow");
+    self.keys.extend_from_slice(key);
+    self.outs.extend_from_slice(output);
+    self.mults.push(multiplicity);
+    let keys = &self.keys;
+    let stride = self.key_stride;
+    self.table.insert_unique(hash, i, |&j| {
+      let j = j as usize;
+      hash_g_slice(&keys[j * stride..(j + 1) * stride])
+    });
+  }
+
+  /// Entry at insertion index `i`: the key slice plus a mutable handle on
+  /// the multiplicity (memory `Load` bumps the pointed-to row's count).
+  pub fn get_index_mut(&mut self, i: usize) -> Option<(&[G], &mut G)> {
+    if i >= self.mults.len() {
+      return None;
+    }
+    let key = &self.keys[i * self.key_stride..(i + 1) * self.key_stride];
+    Some((key, &mut self.mults[i]))
+  }
+
+  pub fn get_index(&self, i: usize) -> Option<(&[G], QueryRef<'_>)> {
+    if i >= self.len() {
+      return None;
+    }
+    Some((
+      self.key_at(i),
+      QueryRef { output: self.out_at(i), multiplicity: self.mults[i] },
+    ))
+  }
+
+  pub fn iter(&self) -> impl Iterator<Item = (&[G], QueryRef<'_>)> {
+    (0..self.len()).map(|i| {
+      (
+        self.key_at(i),
+        QueryRef { output: self.out_at(i), multiplicity: self.mults[i] },
+      )
+    })
+  }
+}
 
 pub struct QueryRecord {
   pub(crate) function_queries: Vec<QueryMap>,
@@ -31,12 +184,15 @@ pub struct QueryRecord {
 
 impl QueryRecord {
   fn new(toplevel: &Toplevel) -> Self {
-    let function_queries =
-      toplevel.functions.iter().map(|_| QueryMap::default()).collect();
+    let function_queries = toplevel
+      .functions
+      .iter()
+      .map(|f| QueryMap::new(f.layout.input_size))
+      .collect();
     let memory_queries = toplevel
       .memory_sizes
       .iter()
-      .map(|width| (*width, QueryMap::default()))
+      .map(|width| (*width, QueryMap::new(*width)))
       .collect();
     let bytes1_queries = Bytes1Queries::new();
     let bytes2_queries = Bytes2Queries::new();
@@ -158,6 +314,40 @@ impl std::fmt::Display for ExecError {
 
 impl std::error::Error for ExecError {}
 
+/// Gated by `IX_AIUR_QUERY_STATS=1`: dump per-function query-map sizes
+/// during/after execution so RAM blowups can be attributed to specific
+/// Aiur functions (indices resolve to names via the Lean
+/// `CompiledToplevel`). Heaviest maps first, by retained G elements.
+static QUERY_STATS: std::sync::LazyLock<bool> = std::sync::LazyLock::new(
+  || std::env::var_os("IX_AIUR_QUERY_STATS").is_some(),
+);
+
+fn dump_query_stats(record: &QueryRecord, tag: &str) {
+  let mut rows: Vec<(usize, usize, usize)> = record
+    .function_queries
+    .iter()
+    .enumerate()
+    .map(|(i, m)| (i, m.len(), m.retained_elems()))
+    .filter(|(_, n, _)| *n > 0)
+    .collect();
+  rows.sort_by(|a, b| b.2.cmp(&a.2));
+  let total_entries: usize = rows.iter().map(|r| r.1).sum();
+  let total_elems: usize = rows.iter().map(|r| r.2).sum();
+  eprintln!(
+    "[aiur-stats {tag}] function_queries: {total_entries} entries, \
+     {total_elems} G-elems; top maps:"
+  );
+  for (i, n, e) in rows.iter().take(30) {
+    eprintln!("  fn{i:<4} entries={n:<12} g_elems={e}");
+  }
+  let mem: Vec<String> = record
+    .memory_queries
+    .iter()
+    .map(|(w, m)| format!("w{w}={}", m.len()))
+    .collect();
+  eprintln!("[aiur-stats {tag}] memory entries: {}", mem.join(" "));
+}
+
 impl Toplevel {
   pub fn execute(
     &self,
@@ -172,6 +362,9 @@ impl Toplevel {
     let function = &self.functions[fun_idx];
     let output =
       function.execute(fun_idx, args, self, &mut record, io_buffer)?;
+    if *QUERY_STATS {
+      dump_query_stats(&record, "final");
+    }
     Ok((record, output))
   }
 }
@@ -213,7 +406,14 @@ impl Function {
     }
     push_block_exec_entries!(&self.body);
     let mut unconstrained = false;
+    let mut stats_ops: u64 = 0;
     while let Some(exec_entry) = exec_entries_stack.pop() {
+      if *QUERY_STATS {
+        stats_ops += 1;
+        if stats_ops.is_multiple_of(1 << 31) {
+          dump_query_stats(record, &format!("ops={stats_ops}"));
+        }
+      }
       match exec_entry {
         ExecEntry::Op(Op::Const(c)) => map.push(*c),
         ExecEntry::Op(Op::Add(a, b)) => {
@@ -236,14 +436,14 @@ impl Function {
           map.push(G::from_bool(a == G::ZERO));
         },
         ExecEntry::Op(Op::Call(callee_idx, args, _, op_unconstrained)) => {
-          let args = args.iter().map(|i| map[*i]).collect();
+          let args: Vec<G> = args.iter().map(|i| map[*i]).collect();
           if let Some(result) =
             record.function_queries[*callee_idx].get_mut(&args)
           {
             if !unconstrained && !op_unconstrained {
-              result.multiplicity += G::ONE;
+              *result.multiplicity += G::ONE;
             }
-            map.extend(result.output.clone());
+            map.extend_from_slice(result.output);
           } else {
             let saved_map = std::mem::replace(&mut map, args);
             callers_states_stack.push(CallerState {
@@ -266,16 +466,16 @@ impl Function {
             .ok_or(ExecError::InvalidMemorySize(size))?;
           if let Some(result) = memory_queries.get_mut(&values) {
             if !unconstrained {
-              result.multiplicity += G::ONE;
+              *result.multiplicity += G::ONE;
             }
-            map.extend(&result.output);
+            map.extend_from_slice(result.output);
           } else {
             let ptr = G::from_usize(memory_queries.len());
-            let result = QueryResult {
-              output: vec![ptr],
-              multiplicity: G::from_bool(!unconstrained),
-            };
-            memory_queries.insert(values, result);
+            memory_queries.insert(
+              &values,
+              &[ptr],
+              G::from_bool(!unconstrained),
+            );
             map.push(ptr);
           }
         },
@@ -289,13 +489,13 @@ impl Function {
           let ptr_usize = usize::try_from(ptr_u64)
             .ok()
             .ok_or(ExecError::PointerTooLarge(ptr_u64))?;
-          let (args, result) = memory_queries
+          let (args, multiplicity) = memory_queries
             .get_index_mut(ptr_usize)
             .ok_or(ExecError::UnboundPointer { ptr: ptr_u64, size: *size })?;
           if !unconstrained {
-            result.multiplicity += G::ONE;
+            *multiplicity += G::ONE;
           }
-          map.extend(args);
+          map.extend_from_slice(args);
         },
         ExecEntry::Op(Op::AssertEq(xs, ys)) => {
           if xs.len() != ys.len() {
@@ -542,13 +742,12 @@ impl Function {
         ExecEntry::Ctrl(Ctrl::Return(_, output)) => {
           // Register the query.
           let input_size = toplevel.functions[fun_idx].layout.input_size;
-          let args = map[..input_size].to_vec();
           let output = output.iter().map(|i| map[*i]).collect::<Vec<_>>();
-          let result = QueryResult {
-            output: output.clone(),
-            multiplicity: G::from_bool(!unconstrained),
-          };
-          record.function_queries[fun_idx].insert(args, result);
+          record.function_queries[fun_idx].insert(
+            &map[..input_size],
+            &output,
+            G::from_bool(!unconstrained),
+          );
           if let Some(CallerState {
             fun_idx: caller_idx,
             map: caller_map,
diff --git a/src/aiur/memory.rs b/src/aiur/memory.rs
index fa676394..e2a90d4d 100644
--- a/src/aiur/memory.rs
+++ b/src/aiur/memory.rs
@@ -7,7 +7,7 @@ use multi_stark::{
 };
 use rayon::{
   iter::{
-    IndexedParallelIterator, IntoParallelRefIterator,
+    IndexedParallelIterator,
     IntoParallelRefMutIterator, ParallelIterator,
   },
   slice::ParallelSliceMut,
@@ -69,10 +69,10 @@ impl Memory {
 
     rows_no_padding
       .par_chunks_mut(width)
-      .zip(queries.par_iter())
       .zip(lookups_no_padding.par_iter_mut())
       .enumerate()
-      .for_each(|(i, ((row, (values, result)), row_lookups))| {
+      .for_each(|(i, (row, row_lookups))| {
+        let (values, result) = queries.get_index(i).expect("index in range");
         row[0] = result.multiplicity;
         row[1] = G::ONE;
         row[2] = G::from_usize(i);

From 4940b9f23406d27702c86a9429ed1fc5411758f8 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 11 Jun 2026 14:36:43 +0000
Subject: [PATCH 2/8] IxVM: inline lbr guards on expr_inst1_walk App children
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Routing App children through the expr_inst1 dispatch materialized a
(child, arg, depth) record entry per distinct arg/depth pair just to
return a no-op-substitution child unchanged — 35% of all expr_inst1
entries on instantiation-heavy checks — plus a dispatch row for every
real-work recursive call. Guard the App-arm children inline (lbr check,
then straight into the walk): no-op subtrees produce zero rows, real
work one row instead of two. expr_lbr is keyed on the child alone, so
the guard memoizes across arg/depth.

App-arm only: guarding the binder arms too blew the walk width 53 -> 141
(nested match-with-call) and regressed FFT +4%; App-only keeps width 94.

Measured: foldAdd_2000 FFT 248.6M -> 244.9M; UTF8 record at equal ops
(2^31): 265M -> 239.5M entries (-9.7%), expr_inst1 39.1M -> 12.4M (-68%)
with bit-identical walk-entry counts.
---
 Ix/IxVM/Kernel/Subst.lean | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/Ix/IxVM/Kernel/Subst.lean b/Ix/IxVM/Kernel/Subst.lean
index e37d1e0d..9d8a7ce6 100644
--- a/Ix/IxVM/Kernel/Subst.lean
+++ b/Ix/IxVM/Kernel/Subst.lean
@@ -202,6 +202,14 @@ def subst := ⟦
     }
   }
 
+  -- Recursive walk. App children are guarded INLINE at the call site
+  -- (lbr check, then straight into the walk): a no-op subtree produces
+  -- NO query row at all, and a real-work child produces one walk row
+  -- instead of a dispatch row + a walk row — 35% of all expr_inst1
+  -- entries on instantiation-heavy checks were no-op dispatch rows.
+  -- `expr_lbr` is keyed on the child alone, so the guard memoizes
+  -- across arg/depth. App-arm only: guarding the binder arms too blew
+  -- the walk width 53 -> 141 and REGRESSED FFT +4%.
   fn expr_inst1_walk(e: KExpr, arg: KExpr, depth: G) -> KExpr {
     match load(e) {
       KExprNode.BVar(i) =>
@@ -214,12 +222,18 @@ def subst := ⟦
               _ => store(KExprNode.BVar(i - 1)),
             },
         },
-      KExprNode.Srt(l) => store(KExprNode.Srt(l)),
-      KExprNode.Const(idx, lvls) => store(KExprNode.Const(idx, lvls)),
+      KExprNode.Srt(_) => e,
+      KExprNode.Const(_, _) => e,
       KExprNode.App(f, a) =>
-        store(KExprNode.App(
-          expr_inst1(f, arg, depth),
-          expr_inst1(a, arg, depth))),
+        let f2 = match u32_less_than(depth, expr_lbr(f)) {
+          0 => f,
+          1 => expr_inst1_walk(f, arg, depth),
+        };
+        let a2 = match u32_less_than(depth, expr_lbr(a)) {
+          0 => a,
+          1 => expr_inst1_walk(a, arg, depth),
+        };
+        store(KExprNode.App(f2, a2)),
       KExprNode.Lam(ty, body) =>
         store(KExprNode.Lam(
           expr_inst1(ty, arg, depth),
@@ -233,7 +247,7 @@ def subst := ⟦
           expr_inst1(ty, arg, depth),
           expr_inst1(val, arg, depth),
           expr_inst1(body, arg, depth + 1))),
-      KExprNode.Lit(lit) => store(KExprNode.Lit(lit)),
+      KExprNode.Lit(_) => e,
       KExprNode.Proj(tidx, fidx, e1) =>
         store(KExprNode.Proj(tidx, fidx, expr_inst1(e1, arg, depth))),
     }

From 9deec845459ca7527560506f2f9ddf1d940f31b7 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 11 Jun 2026 14:36:43 +0000
Subject: [PATCH 3/8] IxVM: drop dead KValNode/KVal/KValEnv
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

From ap/kernel 828fb85 (Arthur Paulino): the NbE value domain is defined
but referenced nowhere — the live kernel runs on de-Bruijn KExpr;
vestigial from an abandoned NbE direction. (The closed-term context
normalization from that commit was measured separately and not taken:
the per-call expr_lbr probe cost +2.9% FFT on recursor loops for a 0.5%
record reduction.)
---
 Ix/IxVM/KernelTypes.lean | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/Ix/IxVM/KernelTypes.lean b/Ix/IxVM/KernelTypes.lean
index b4a57b34..3d511edd 100644
--- a/Ix/IxVM/KernelTypes.lean
+++ b/Ix/IxVM/KernelTypes.lean
@@ -68,33 +68,6 @@ def kernelTypes := ⟦
     }
   }
 
-  -- ============================================================================
-  -- Values (NbE semantic domain)
-  -- ============================================================================
-
-  enum KValNode {
-    Srt(&KLevel),
-    Lit(KLiteral),
-    Lam(KVal, KExpr, KValEnv),
-    Pi(KVal, KExpr, KValEnv),
-    Ctor(G, List‹&KLevel›, G, List‹KVal›),
-    FVar(G, KVal, List‹KVal›),
-    Axiom(G, List‹&KLevel›, List‹KVal›),
-    Defn(G, List‹&KLevel›, List‹KVal›),
-    Thm(G, List‹&KLevel›, List‹KVal›),
-    Opaque(G, List‹&KLevel›, List‹KVal›),
-    Quot(G, List‹&KLevel›, List‹KVal›),
-    Induct(G, List‹&KLevel›, List‹KVal›),
-    Rec(G, List‹&KLevel›, List‹KVal›),
-    Proj(G, G, KVal, List‹KVal›),
-    Thunk(KExpr, KValEnv)
-  }
-
-  type KVal = &KValNode
-
-  -- Value environment (de Bruijn indexed, front = most recent binder)
-  type KValEnv = List‹KVal›
-
   -- ============================================================================
   -- Recursor Rule
   --

From 130f30b9e1797ac31c80d3dd8c0eba0a0fcd5939 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 11 Jun 2026 14:37:04 +0000
Subject: [PATCH 4/8] IxVM: memoized prim_family dispatch + width-safe
 offset-stuck placement
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three coordinated changes to Const-head whnf dispatch:

1. prim_family(addr) classifies a head address into the one reducer
   family that could fire on it (nat/str/bitvec/native/decidable; the
   sets are disjoint). Keyed on the ADDRESS ALONE it memoizes to one row
   per distinct constant address per run, and whnf_const_head calls at
   most one family reducer. The previous gauntlet ran every reducer in
   sequence — 4-6 rows at width 85-92 per Const-head whnf, almost all
   guaranteed misses. Marginal FFT on the bench suite: -1.3%..-28.3%
   (Vector.extract_append._proof_2: -28% FFT / -25% wall).

2. The symbolic-Nat offset-stuck check (5dcab7f) moves from a delta-arm
   probe into try_nat_dispatch's miss path as a cold function
   (try_nat_offset_dispatch, verdict 2 = "already stuck, do not
   re-whnf"), with the offset construction shared via
   mk_nat_offset_stuck (also used by the dbc4177 linear-rec collapse).
   The probe/inline forms cost +11/+34 circuit WIDTH on hot functions
   (+0.7% FFT on recursor loops, width charged on every row); the
   dispatch already has the whnf'd args, so the check is free there.
   Same semantics, residual +0.057% on one synthetic recursor benchmark.

3. nat_lit_to_ctor_or_self exposes ONE constructor layer
   (n -> succ(Lit(n-1)), mirroring lean4 inductive.h:91-93) instead of
   materializing the full succ chain: -2/-5% FFT on Nat.rec-over-literal
   workloads (Aiur's content-memoization had already collapsed the
   chain's quadratic re-walk, so the win is the linear constant).
---
 Ix/IxVM/Kernel/Primitive.lean | 352 +++++++++++++++++++++++++++-------
 Ix/IxVM/Kernel/Whnf.lean      | 171 +++++------------
 2 files changed, 333 insertions(+), 190 deletions(-)

diff --git a/Ix/IxVM/Kernel/Primitive.lean b/Ix/IxVM/Kernel/Primitive.lean
index 9300ea1c..be296179 100644
--- a/Ix/IxVM/Kernel/Primitive.lean
+++ b/Ix/IxVM/Kernel/Primitive.lean
@@ -1228,24 +1228,18 @@ def primitive := ⟦
     }
   }
 
-  -- Convert a KLimbs n into a chain `App(Const(succ), App(Const(succ),
-  -- ... Const(zero)))` for n calls of succ. Used by nat-literal-to-ctor
-  -- coercion in iota.
-  fn klimbs_to_ctor_form(n: KLimbs, zero_idx: G, succ_idx: G) -> KExpr {
-    match load(n) {
-      ListNode.Nil =>
-        store(KExprNode.Const(zero_idx, store(ListNode.Nil))),
-      ListNode.Cons(_, _) =>
-        let dec = klimbs_dec(n);
-        let pred = klimbs_to_ctor_form(dec, zero_idx, succ_idx);
-        let succ_const = store(KExprNode.Const(succ_idx, store(ListNode.Nil)));
-        store(KExprNode.App(succ_const, pred)),
-    }
-  }
-
   -- If `e` is `Lit(Nat(klimbs))` and addrs contains both Nat.zero and
-  -- Nat.succ, expand to ctor chain. Else return `e` unchanged. Mirror:
-  -- src/ix/kernel/whnf.rs:929-946 nat_to_constructor.
+  -- Nat.succ, expose ONE constructor layer: `0 -> Const(zero)`,
+  -- `n -> App(Const(succ), Lit(n-1))`. Else return `e` unchanged. Mirror:
+  -- src/ix/kernel/whnf.rs nat_to_constructor (one level, matching lean4
+  -- inductive.h:91-93 and lean4lean Reduce.lean:70).
+  --
+  -- One level, NOT the full succ chain: iota only needs the outermost
+  -- ctor to fire, and the predecessor stays a compact literal that
+  -- downstream whnf/def_eq/extract hit via the Nat fast paths. A full
+  -- `succ^n(zero)` chain materialized n fresh nodes per expansion event
+  -- and made every traversal of the major O(n) with distinct memo keys
+  -- per depth.
   fn nat_lit_to_ctor_or_self(e: KExpr, addrs: List‹Addr›) -> KExpr {
     match load(e) {
       KExprNode.Lit(lit) =>
@@ -1256,7 +1250,15 @@ def primitive := ⟦
             match z {
               (1, zero_idx) =>
                 match s {
-                  (1, succ_idx) => klimbs_to_ctor_form(klimbs, zero_idx, succ_idx),
+                  (1, succ_idx) =>
+                    match load(klimbs) {
+                      ListNode.Nil =>
+                        store(KExprNode.Const(zero_idx, store(ListNode.Nil))),
+                      ListNode.Cons(_, _) =>
+                        let pred = store(KExprNode.Lit(KLiteral.Nat(klimbs_dec(klimbs))));
+                        let succ_const = store(KExprNode.Const(succ_idx, store(ListNode.Nil)));
+                        store(KExprNode.App(succ_const, pred)),
+                    },
                   _ => e,
                 },
               _ => e,
@@ -1307,59 +1309,232 @@ def primitive := ⟦
     store(KExprNode.Lit(KLiteral.Nat(n)))
   }
 
-  -- 1 iff `head_addr` is one of the head-dispatched primitive ops checked by
-  -- `try_nat_dispatch` / `try_str_dispatch` / `try_bitvec_dispatch` /
-  -- `try_reduce_native` / `try_reduce_decidable`. These addresses are mutually
-  -- exclusive (a const has one content address), so the sum is 0 or 1.
-  -- Memoized on `head_addr` (a content pointer): computed once per distinct
-  -- const, vs the per-whnf gauntlet of 5 `try_*` calls. Lets `whnf_const_head`
-  -- skip the gauntlet for the common non-primitive head. MUST stay in sync
-  -- with the `address_eq(head_addr, …)` checks in those functions — a
-  -- differential `assert` guards that during development.
-  fn prim_any_addr(head_addr: Addr) -> G {
-    address_eq(head_addr, nat_add_addr())
-    + address_eq(head_addr, nat_sub_addr())
-    + address_eq(head_addr, nat_mul_addr())
-    + address_eq(head_addr, nat_div_addr())
-    + address_eq(head_addr, nat_mod_addr())
-    + address_eq(head_addr, nat_pow_addr())
-    + address_eq(head_addr, nat_gcd_addr())
-    + address_eq(head_addr, nat_land_addr())
-    + address_eq(head_addr, nat_lor_addr())
-    + address_eq(head_addr, nat_xor_addr())
-    + address_eq(head_addr, nat_shift_left_addr())
-    + address_eq(head_addr, nat_shift_right_addr())
-    + address_eq(head_addr, nat_succ_addr())
-    + address_eq(head_addr, nat_pred_addr())
-    + address_eq(head_addr, nat_beq_addr())
-    + address_eq(head_addr, nat_ble_addr())
-    + address_eq(head_addr, nat_dec_eq_addr())
-    + address_eq(head_addr, nat_dec_le_addr())
-    + address_eq(head_addr, nat_dec_lt_addr())
-    + address_eq(head_addr, bool_true_addr())
-    + address_eq(head_addr, bool_false_addr())
-    + address_eq(head_addr, int_of_nat_addr())
-    + address_eq(head_addr, int_neg_succ_addr())
-    + address_eq(head_addr, int_dec_eq_addr())
-    + address_eq(head_addr, int_dec_le_addr())
-    + address_eq(head_addr, int_dec_lt_addr())
-    + address_eq(head_addr, bit_vec_of_nat_addr())
-    + address_eq(head_addr, bit_vec_to_nat_addr())
-    + address_eq(head_addr, bit_vec_ult_addr())
-    + address_eq(head_addr, decidable_decide_addr())
-    + address_eq(head_addr, reduce_bool_addr())
-    + address_eq(head_addr, reduce_nat_addr())
-    + address_eq(head_addr, size_of_size_of_addr())
-    + address_eq(head_addr, string_back_addr())
-    + address_eq(head_addr, string_legacy_back_addr())
-    + address_eq(head_addr, string_to_byte_array_addr())
-    + address_eq(head_addr, subtype_val_addr())
-    + address_eq(head_addr, system_platform_get_num_bits_addr())
-    + address_eq(head_addr, system_platform_num_bits_addr())
-    + address_eq(head_addr, punit_addr())
-    + address_eq(head_addr, punit_size_of_1_addr())
-    + address_eq(head_addr, unit_addr())
-    + address_eq(head_addr, string_utf8_byte_size_addr())
+  -- Build the compact stuck offset `Nat.add base (Lit n)` applied to `post`,
+  -- the canonical normal form for a symbolic-base Nat offset (paired with
+  -- offset-aware def-eq). n = 0 collapses to `base` directly. Shared by the
+  -- nat dispatch (symbolic add/div/mod kept stuck) and the linear-rec
+  -- collapse, keeping the offset construction out of those circuits' width.
+  fn mk_nat_offset_stuck(base_w: KExpr, n: KLimbs, post: List‹KExpr›,
+                         addrs: List‹Addr›) -> (G, KExpr) {
+    match klimbs_is_zero(n) {
+      1 => (1, apply_spine(base_w, post)),
+      0 =>
+        match find_addr_idx_safe(nat_add_addr(), addrs, 0) {
+          (0, _) => (0, store(KExprNode.BVar(0))),
+          (1, add_idx) =>
+            let add_const = store(KExprNode.Const(add_idx, store(ListNode.Nil)));
+            let off = store(KExprNode.App(
+              store(KExprNode.App(add_const, base_w)),
+              mk_nat_lit(n)));
+            (1, apply_spine(off, post)),
+        },
+    }
+  }
+
+  -- ============================================================================
+  -- Primitive-family classification
+  --
+  -- `prim_family(a)` maps a Const-head address to the one reducer family
+  -- that could possibly fire on it: 0 = none, 1 = nat (`try_nat_dispatch`),
+  -- 2 = str, 3 = bitvec, 4 = native, 5 = decidable. The family address
+  -- sets are mutually disjoint (verified against each reducer's
+  -- comparisons), so classification is unambiguous.
+  --
+  -- Keyed on the ADDRESS ALONE: one memo row per distinct constant
+  -- address in the whole run (~hundreds), and the inner `address_eq`
+  -- chains are likewise keyed `(a, prim)` so they collapse after the
+  -- first occurrence of each address. This replaces running every
+  -- reducer as a gauntlet from `whnf_const_head`, whose try_* keys
+  -- include the spine and context — 4-6 wide rows per Const-head whnf,
+  -- almost all guaranteed misses.
+  -- ============================================================================
+  fn prim_family(a: Addr) -> G {
+    match is_nat_prim_addr(a) {
+      1 => 1,
+      0 =>
+        match is_str_prim_addr(a) {
+          1 => 2,
+          0 =>
+            match is_bitvec_prim_addr(a) {
+              1 => 3,
+              0 =>
+                match is_native_prim_addr(a) {
+                  1 => 4,
+                  0 =>
+                    match is_decidable_prim_addr(a) {
+                      1 => 5,
+                      0 => 0,
+                    },
+                },
+            },
+        },
+    }
+  }
+
+  -- Addresses handled by `try_nat_dispatch`: succ/pred plus the binop
+  -- set of `try_nat_binop_addr`.
+  fn is_nat_prim_addr(a: Addr) -> G {
+    match address_eq(a, nat_succ_addr()) {
+      1 => 1,
+      0 =>
+      match address_eq(a, nat_pred_addr()) {
+        1 => 1,
+        0 =>
+        match address_eq(a, nat_add_addr()) {
+          1 => 1,
+          0 =>
+          match address_eq(a, nat_sub_addr()) {
+            1 => 1,
+            0 =>
+            match address_eq(a, nat_mul_addr()) {
+              1 => 1,
+              0 =>
+              match address_eq(a, nat_div_addr()) {
+                1 => 1,
+                0 =>
+                match address_eq(a, nat_mod_addr()) {
+                  1 => 1,
+                  0 =>
+                  match address_eq(a, nat_pow_addr()) {
+                    1 => 1,
+                    0 =>
+                    match address_eq(a, nat_gcd_addr()) {
+                      1 => 1,
+                      0 =>
+                      match address_eq(a, nat_beq_addr()) {
+                        1 => 1,
+                        0 =>
+                        match address_eq(a, nat_ble_addr()) {
+                          1 => 1,
+                          0 =>
+                          match address_eq(a, nat_land_addr()) {
+                            1 => 1,
+                            0 =>
+                            match address_eq(a, nat_lor_addr()) {
+                              1 => 1,
+                              0 =>
+                              match address_eq(a, nat_xor_addr()) {
+                                1 => 1,
+                                0 =>
+                                match address_eq(a, nat_shift_left_addr()) {
+                                  1 => 1,
+                                  0 =>
+                                  match address_eq(a, nat_shift_right_addr()) {
+                                    1 => 1,
+                                    0 =>
+                                    0,
+                                  },
+                                },
+                              },
+                            },
+                          },
+                        },
+                      },
+                    },
+                  },
+                },
+              },
+            },
+          },
+        },
+      },
+    }
+  }
+
+  -- Addresses handled by `try_str_dispatch`.
+  fn is_str_prim_addr(a: Addr) -> G {
+    match address_eq(a, string_utf8_byte_size_addr()) {
+      1 => 1,
+      0 =>
+      match address_eq(a, string_to_byte_array_addr()) {
+        1 => 1,
+        0 =>
+        match address_eq(a, string_back_addr()) {
+          1 => 1,
+          0 =>
+          match address_eq(a, string_legacy_back_addr()) {
+            1 => 1,
+            0 =>
+            0,
+          },
+        },
+      },
+    }
+  }
+
+  -- Addresses handled by `try_bitvec_dispatch`.
+  fn is_bitvec_prim_addr(a: Addr) -> G {
+    match address_eq(a, bit_vec_to_nat_addr()) {
+      1 => 1,
+      0 =>
+      match address_eq(a, bit_vec_ult_addr()) {
+        1 => 1,
+        0 =>
+        match address_eq(a, decidable_decide_addr()) {
+          1 => 1,
+          0 =>
+          0,
+        },
+      },
+    }
+  }
+
+  -- Addresses handled by `try_reduce_native`.
+  fn is_native_prim_addr(a: Addr) -> G {
+    match address_eq(a, reduce_bool_addr()) {
+      1 => 1,
+      0 =>
+      match address_eq(a, reduce_nat_addr()) {
+        1 => 1,
+        0 =>
+        match address_eq(a, subtype_val_addr()) {
+          1 => 1,
+          0 =>
+          match address_eq(a, size_of_size_of_addr()) {
+            1 => 1,
+            0 =>
+            match address_eq(a, punit_size_of_1_addr()) {
+              1 => 1,
+              0 =>
+              match address_eq(a, system_platform_num_bits_addr()) {
+                1 => 1,
+                0 =>
+                0,
+              },
+            },
+          },
+        },
+      },
+    }
+  }
+
+  -- Addresses handled by `try_reduce_decidable`.
+  fn is_decidable_prim_addr(a: Addr) -> G {
+    match address_eq(a, nat_dec_eq_addr()) {
+      1 => 1,
+      0 =>
+      match address_eq(a, nat_dec_lt_addr()) {
+        1 => 1,
+        0 =>
+        match address_eq(a, nat_dec_le_addr()) {
+          1 => 1,
+          0 =>
+          match address_eq(a, int_dec_eq_addr()) {
+            1 => 1,
+            0 =>
+            match address_eq(a, int_dec_lt_addr()) {
+              1 => 1,
+              0 =>
+              match address_eq(a, int_dec_le_addr()) {
+                1 => 1,
+                0 =>
+                0,
+              },
+            },
+          },
+        },
+      },
+    }
   }
 
   -- Mirror: src/ix/kernel/whnf.rs:500-700 Nat-on-literals dispatch.
@@ -1422,13 +1597,46 @@ def primitive := ⟦
                         },
                       _ => (0, store(KExprNode.BVar(0))),
                     },
-                  _ => (0, store(KExprNode.BVar(0))),
+                  _ =>
+                    -- Symbolic base: route to the cold offset-stuck check
+                    -- (separate circuit so its width only taxes actual
+                    -- symbolic-base events, not every literal binop row).
+                    match pb {
+                      (1, nb) =>
+                        try_nat_offset_dispatch(head_addr, a0_w, nb, spine, addrs),
+                      _ => (0, store(KExprNode.BVar(0))),
+                    },
                 },
             },
         },
     }
   }
 
+  -- Cold path of try_nat_dispatch: a binary Nat op whose whnf'd base is
+  -- symbolic and whose second arg is `Lit nb`. For `Nat.add` (any nb) and
+  -- `Nat.div`/`Nat.mod` (nb >= 2) the term is irreducible: return verdict
+  -- 2 = "already stuck in compact offset form" so whnf keeps it instead of
+  -- delta-unfolding into a succ^nb tower / the division algorithm
+  -- (`x >>> k` = k nested `Nat.div _ 2`). Pairs with offset-aware def-eq.
+  fn try_nat_offset_dispatch(head_addr: Addr, a0_w: KExpr, nb: KLimbs,
+                             spine: List‹KExpr›, addrs: List‹Addr›) -> (G, KExpr) {
+    let is_add = address_eq(head_addr, nat_add_addr());
+    let is_divmod = address_eq(head_addr, nat_div_addr())
+      + address_eq(head_addr, nat_mod_addr());
+    let eligible =
+      is_add + is_divmod * (1 - klimbs_is_zero(nb))
+        * (1 - klimbs_is_zero(klimbs_dec(nb)));
+    match eligible {
+      0 => (0, store(KExprNode.BVar(0))),
+      _ =>
+        let post = list_drop(spine, 2);
+        match mk_nat_offset_stuck(a0_w, nb, post, addrs) {
+          (1, stuck) => (2, stuck),
+          (0, _) => (0, store(KExprNode.BVar(0))),
+        },
+    }
+  }
+
   -- Dispatch a Nat binary op by head address. Bool result for beq/ble
   -- wraps via Bool.true / Bool.false ctors (mk_bool).
   fn try_nat_binop_addr(head_addr: Addr, a: KLimbs, b: KLimbs,
diff --git a/Ix/IxVM/Kernel/Whnf.lean b/Ix/IxVM/Kernel/Whnf.lean
index d3e76b69..93dac335 100644
--- a/Ix/IxVM/Kernel/Whnf.lean
+++ b/Ix/IxVM/Kernel/Whnf.lean
@@ -56,6 +56,15 @@ def whnf := ⟦
 
   -- ============================================================================
   -- Beta
+  --
+  -- One argument per step, NOT a simultaneous multi-arg substitution: a
+  -- measured choice. A simultaneous `instantiate_rev`-style peel was
+  -- tried (2026-06-10) and REGRESSED FFT ~3%: recursor betas share a
+  -- constant argument prefix across iterations (motive/base/step), so
+  -- the sequential chain's per-arg memo keys hit across every iteration,
+  -- while a combined args-list key is unique per iteration and re-walks
+  -- the whole body each time. Curried beats tupled under Aiur's
+  -- content-memoization.
   -- ============================================================================
 
   -- Peel a beta telescope: consume one spine arg per leading `Lam` of `lam`,
@@ -164,82 +173,6 @@ def whnf := ⟦
     }
   }
 
-  -- If `head` is a Nat primitive (`Nat.add` / `Nat.div` / `Nat.mod`) applied to
-  -- exactly (non-literal base, literal second arg), return (1, the same op in
-  -- canonical form) so whnf leaves it STUCK instead of delta-unfolding it. This
-  -- stops `Nat.add x n` from materializing succ^n(x), and `Nat.div`/`Nat.mod x n`
-  -- (n ≥ 2) from expanding the division algorithm — both are irreducible for a
-  -- symbolic base, so the compact form IS the normal form. `Nat.shiftRight x k`
-  -- unfolds to k nested `Nat.div _ 2`, which now stay stuck. Thresholds: `add`
-  -- keeps any nonzero n; `div`/`mod` keep n ≥ 2 (so `x/1 = x`, `x/0 = 0` still
-  -- reduce). All magnitudes stay KLimbs. `(0, _)` = "not this shape".
-  fn try_nat_offset_stuck(head: KExpr, spine: List‹KExpr›, types: List‹KExpr›,
-                          top: List‹&KConstantInfo›, addrs: List‹Addr›) -> (G, KExpr) {
-    match load(head) {
-      KExprNode.Const(idx, _) =>
-        let ca = list_lookup(addrs, idx);
-        let is_add = address_eq(ca, nat_add_addr());
-        let is_divmod = address_eq(ca, nat_div_addr()) + address_eq(ca, nat_mod_addr());
-        match is_add + is_divmod {
-          0 => (0, store(KExprNode.BVar(0))),
-          _ =>
-            match list_length(spine) {
-              2 =>
-                let a0_w = whnf(list_lookup(spine, 0), types, top, addrs);
-                let a1_w = whnf(list_lookup(spine, 1), types, top, addrs);
-                match try_extract_nat(a1_w, addrs) {
-                  (0, _) => (0, store(KExprNode.BVar(0))),
-                  (1, n) =>
-                    -- reject n=0 (all ops) and n=1 (div/mod only).
-                    let bad = klimbs_is_zero(n) + is_divmod * klimbs_is_zero(klimbs_dec(n));
-                    match bad {
-                      0 =>
-                        match try_extract_nat(a0_w, addrs) {
-                          (1, _) => (0, store(KExprNode.BVar(0))),
-                          (0, _) =>
-                            (1, store(KExprNode.App(
-                                  store(KExprNode.App(head, a0_w)),
-                                  mk_nat_lit(n)))),
-                        },
-                      _ => (0, store(KExprNode.BVar(0))),
-                    },
-                },
-              _ => (0, store(KExprNode.BVar(0))),
-            },
-        },
-      _ => (0, store(KExprNode.BVar(0))),
-    }
-  }
-
-  -- The address-keyed primitive gauntlet: nat → str → bitvec → native →
-  -- decidable, in order. Returns (1, reduced) on the first family that matches
-  -- and reduces, else (0, _). Factored out so `prim_any_addr` can gate the
-  -- whole chain for non-primitive heads in one shot.
-  fn try_address_primitives(head_addr: Addr, idx: G, lvls: List‹&KLevel›,
-                            spine: List‹KExpr›, types: List‹KExpr›,
-                            top: List‹&KConstantInfo›, addrs: List‹Addr›) -> (G, KExpr) {
-    match try_nat_dispatch(head_addr, spine, types, top, addrs) {
-      (1, r) => (1, r),
-      (0, _) =>
-        match try_str_dispatch(head_addr, spine, addrs) {
-          (1, r) => (1, r),
-          (0, _) =>
-            match try_bitvec_dispatch(head_addr, spine, types, top, addrs) {
-              (1, r) => (1, r),
-              (0, _) =>
-                match try_reduce_native(head_addr, spine, types, top, addrs) {
-                  (1, r) => (1, r),
-                  (0, _) =>
-                    match try_reduce_decidable(head_addr, idx, lvls, spine, types, top, addrs) {
-                      (1, r) => (1, r),
-                      (0, _) => (0, store(KExprNode.BVar(0))),
-                    },
-                },
-            },
-        },
-    }
-  }
-
   -- Const-head WHNF dispatch, split out of `whnf_with_spine` (see its Const arm).
   -- `head` is the original `Const(idx, lvls)` KExpr, passed for the stuck
   -- `apply_spine(head, spine)` fallbacks.
@@ -264,40 +197,55 @@ def whnf := ⟦
           (0, _) => apply_spine(head, spine),
         },
       _ =>
-        -- Skip the address-primitive gauntlet for the common non-primitive
-        -- head: `prim_any_addr` (memoized per head address) is 1 only for the
-        -- ~43 ops the gauntlet recognizes, so a 0 means no `try_*` can match —
-        -- go straight to projection-definition / delta. Validated by a
-        -- differential assert (now removed) over the suite + heavy consts.
-        let addr_prim = match prim_any_addr(head_addr) {
-          1 => try_address_primitives(head_addr, idx, lvls, spine, types, top, addrs),
+        -- Family-gated primitive dispatch. `prim_family` is keyed on the
+        -- head ADDRESS alone, so it memoizes to one row per distinct
+        -- constant address in the run; at most ONE family reducer is
+        -- then called. The previous form ran every reducer as a gauntlet
+        -- (nat -> str -> bitvec -> native -> decidable), charging 4-6
+        -- wide rows per Const-head whnf for guaranteed misses — the
+        -- families' address sets are mutually disjoint, so the gauntlet
+        -- order never mattered. A family-reducer miss (e.g. `Nat.add` on
+        -- non-literal args) falls through to proj-def/delta exactly as
+        -- the gauntlet's final arm did.
+        let fam = prim_family(head_addr);
+        let attempt = match fam {
+          1 => try_nat_dispatch(head_addr, spine, types, top, addrs),
+          2 => try_str_dispatch(head_addr, spine, addrs),
+          3 => try_bitvec_dispatch(head_addr, spine, types, top, addrs),
+          4 => try_reduce_native(head_addr, spine, types, top, addrs),
+          5 => try_reduce_decidable(head_addr, idx, lvls, spine, types, top, addrs),
           _ => (0, store(KExprNode.BVar(0))),
         };
-        match addr_prim {
+        match attempt {
           (1, reduced) => whnf(reduced, types, top, addrs),
+          -- verdict 2: the reducer normalized the term to an already-stuck
+          -- compact form (symbolic Nat offset); return it WITHOUT re-whnf,
+          -- which would loop.
+          (2, stuck) => stuck,
           (0, _) =>
-                let proj_def_pair = try_reduce_projection_definition(idx, spine, top);
-                match proj_def_pair {
-                  (1, reduced_pd) => whnf(reduced_pd, types, top, addrs),
-                  (0, _) =>
-                    -- Mirror src/ix/kernel/whnf.rs:756-774 (`delta_unfold_one`):
-                    -- unfold any Defn regardless of `ReducibilityHints`.
-                    match ci {
-                      KConstantInfo.Defn(_, _, value, _, _) =>
-                        -- Keep `Nat.add base (Lit n)` (symbolic base) stuck as a
-                        -- compact offset instead of delta-unfolding into a
-                        -- succ^n tower. Pairs with offset-aware def-eq.
-                        match try_nat_offset_stuck(head, spine, types, top, addrs) {
-                          (1, stuck) => stuck,
-                          (0, _) =>
-                            let body = expr_inst_levels(value, lvls);
-                            whnf_with_spine(body, spine, types, top, addrs),
-                        },
-                      KConstantInfo.Thm(_, _, _) => apply_spine(head, spine),
-                      _ => apply_spine(head, spine),
-                    },
+            let proj_def_pair = try_reduce_projection_definition(idx, spine, top);
+            match proj_def_pair {
+              (1, reduced_pd) => whnf(reduced_pd, types, top, addrs),
+              (0, _) =>
+                -- Mirror src/ix/kernel/whnf.rs:756-774
+                -- (`delta_unfold_one`): unfold any Defn
+                -- regardless of `ReducibilityHints`. The
+                -- hint is consulted by lazy-delta's
+                -- `delta_rank` for def-eq priority, not
+                -- as a gate on plain whnf delta. Without
+                -- unfolding here, ctor field types
+                -- written via opaque defs (e.g.
+                -- `constType (n α) (n α)`) stay stuck
+                -- and `check_positivity_aug` misclassifies.
+                match ci {
+                  KConstantInfo.Defn(_, _, value, _, _) =>
+                    let body = expr_inst_levels(value, lvls);
+                    whnf_with_spine(body, spine, types, top, addrs),
+                  KConstantInfo.Thm(_, _, _) => apply_spine(head, spine),
+                  _ => apply_spine(head, spine),
                 },
             },
+        },
     }
   }
 
@@ -557,20 +505,7 @@ def whnf := ⟦
                   -- value in the same compact `base + n` form a literal already
                   -- has, so def-eq converges instead of descending n unary
                   -- succ layers (the UTF-8 `x + 0xC0` pathology).
-                  (0, _) =>
-                    match klimbs_is_zero(n_klimbs) {
-                      1 => (1, apply_spine(base_w, post)),
-                      0 =>
-                        match find_addr_idx_safe(nat_add_addr(), addrs, 0) {
-                          (0, _) => (0, store(KExprNode.BVar(0))),
-                          (1, add_idx) =>
-                            let add_const = store(KExprNode.Const(add_idx, store(ListNode.Nil)));
-                            let off = store(KExprNode.App(
-                              store(KExprNode.App(add_const, base_w)),
-                              mk_nat_lit(n_klimbs)));
-                            (1, apply_spine(off, post)),
-                        },
-                    },
+                  (0, _) => mk_nat_offset_stuck(base_w, n_klimbs, post, addrs),
                   (1, b_klimbs) =>
                     (1, apply_spine(mk_nat_lit(klimbs_add(b_klimbs, n_klimbs)), post)),
                 },

From b9827d43b1088b5b612460b16ab1ee2d82d62b7f Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 11 Jun 2026 14:37:22 +0000
Subject: [PATCH 5/8] IxVM: canonical level normalization for level_equal /
 level_leq
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Port the Rust kernel's canonical-form level machinery (level.rs
normalize_level / norm_level_eq / norm_level_le, itself a line-by-line
port of Lean4Lean's Level.Normalize with the covers-split soundness
fix): normalize_aux with imax-path conditioning, phase-2 subsumption,
and structural/dominance comparison on canonical forms. level_equal and
level_leq now normalize-and-compare; the previous recursive Level.leq
mirror with its two-way param-substitution split per Max/IMax — and its
helpers level_subst_reduce / level_has_param / level_any_param — is
deleted. level_normalize is keyed on the level alone, so each distinct
level normalizes once per run.

The split was exponential in the number of params and every branch
materialized freshly substituted levels. On
_private.…SInt.0.Int16.instRxcHasSize_eq the level family was 93% of
the record at 2^31 ops (level_subst_reduce 60.8M + level_leq 53.2M +
level_max 30.4M entries…), entered through Inductive's ctor-field
universe constraint (level_leq), not def-eq's level_equal.

Measured: Int16.instRxcHasSize_eq goes from NON-COMPLETING (killed at
178 GB RSS after 21 min, growth not converging) to 2m23s / 17.9 GB /
295.7B FFT. Int8 completes in 21s. The bench suite is unaffected (its
level comparisons hit the structural fast path). 297 ixvm tests pass.
---
 Ix/IxVM/Kernel/Levels.lean | 655 +++++++++++++++++++++++++++++++------
 1 file changed, 548 insertions(+), 107 deletions(-)

diff --git a/Ix/IxVM/Kernel/Levels.lean b/Ix/IxVM/Kernel/Levels.lean
index e9801669..f8bbbfad 100644
--- a/Ix/IxVM/Kernel/Levels.lean
+++ b/Ix/IxVM/Kernel/Levels.lean
@@ -57,142 +57,583 @@ def levels := ⟦
     }
   }
 
-  fn level_has_param(l: KLevel) -> G {
-    match l {
-      KLevel.Zero => 0,
-      KLevel.Param(_) => 1,
-      KLevel.Succ(&a) => level_has_param(a),
-      KLevel.Max(&a, &b) =>
-        let ha = level_has_param(a);
-        match ha {
-          1 => 1,
-          0 => level_has_param(b),
+  -- ============================================================================
+  -- Canonical level normalization (mirror: src/ix/kernel/level.rs
+  -- normalize_level / norm_level_eq / norm_level_le, itself a port of
+  -- Lean4Lean's Level.Normalize with the documented subsumption and
+  -- covers-split fixes).
+  --
+  -- Semantic level comparison via the recursive `Level.leq` with its
+  -- two-way param-substitution split per Max/IMax-with-param is
+  -- exponential in the number of params, and every branch materializes
+  -- freshly substituted levels (the Int16.instRxcHasSize_eq pathology:
+  -- 93% of the record was level_subst_reduce/level_leq rows).
+  -- Normalization is linear in the level size, `level_normalize` is
+  -- keyed on the level ALONE so each distinct level normalizes once per
+  -- run, and equality/leq on canonical forms is cheap.
+  --
+  -- Canonical form: a path-sorted list of entries `(path, const, vars)`
+  -- where `path` is the sorted list of param indices conditioning an
+  -- imax chain (all assumed >= 1 along that branch), `const` the max
+  -- constant contribution, `vars` the idx-sorted `(param, offset)`
+  -- contributions.
+  -- ============================================================================
+  enum NLVar {
+    Mk(G, G)
+  }
+
+  enum NLEntry {
+    Mk(List‹G›, G, List‹&NLVar›)
+  }
+
+  -- Lexicographic compare of sorted G-lists: 0 = eq, 1 = a < b, 2 = a > b.
+  fn glist_cmp(a: List‹G›, b: List‹G›) -> G {
+    match load(a) {
+      ListNode.Nil =>
+        match load(b) {
+          ListNode.Nil => 0,
+          ListNode.Cons(_, _) => 1,
         },
-      KLevel.IMax(&a, &b) =>
-        let hb = level_has_param(b);
-        match hb {
-          1 => 1,
-          0 => level_has_param(a),
+      ListNode.Cons(x, ar) =>
+        match load(b) {
+          ListNode.Nil => 2,
+          ListNode.Cons(y, br) =>
+            match u32_less_than(x, y) {
+              1 => 1,
+              0 =>
+                match u32_less_than(y, x) {
+                  1 => 2,
+                  0 => glist_cmp(ar, br),
+                },
+            },
         },
     }
   }
 
-  fn level_any_param(l: KLevel) -> G {
-    match l {
-      KLevel.Param(i) => i,
-      KLevel.Succ(&a) => level_any_param(a),
-      KLevel.Max(&a, &b) =>
-        let ha = level_has_param(a);
-        match ha {
-          1 => level_any_param(a),
-          0 => level_any_param(b),
+  -- Sorted-list subset test.
+  fn glist_subset(xs: List‹G›, ys: List‹G›) -> G {
+    match load(xs) {
+      ListNode.Nil => 1,
+      ListNode.Cons(x, xr) =>
+        match load(ys) {
+          ListNode.Nil => 0,
+          ListNode.Cons(y, yr) =>
+            match u32_less_than(x, y) {
+              1 => 0,
+              0 =>
+                match u32_less_than(y, x) {
+                  1 => glist_subset(xs, yr),
+                  0 => glist_subset(xr, yr),
+                },
+            },
         },
-      KLevel.IMax(&a, &b) =>
-        let hb = level_has_param(b);
-        match hb {
-          1 => level_any_param(b),
-          0 => level_any_param(a),
+    }
+  }
+
+  fn glist_eq_len(a: List‹G›, b: List‹G›) -> G {
+    match load(a) {
+      ListNode.Nil =>
+        match load(b) {
+          ListNode.Nil => 1,
+          ListNode.Cons(_, _) => 0,
+        },
+      ListNode.Cons(_, ar) =>
+        match load(b) {
+          ListNode.Nil => 0,
+          ListNode.Cons(_, br) => glist_eq_len(ar, br),
         },
-      KLevel.Zero => 0,
     }
   }
 
-  fn level_subst_reduce(l: KLevel, p: G, repl: KLevel) -> KLevel {
-    match l {
-      KLevel.Zero => KLevel.Zero,
-      KLevel.Param(i) =>
-        match i - p {
-          0 => repl,
-          _ => KLevel.Param(i),
+  -- Insert into a sorted G-list. Returns (1, new_list) on insert,
+  -- (0, original) when already present. Mirrors level.rs ordered_insert.
+  fn glist_ordered_insert(x: G, l: List‹G›) -> (G, List‹G›) {
+    match load(l) {
+      ListNode.Nil => (1, store(ListNode.Cons(x, store(ListNode.Nil)))),
+      ListNode.Cons(h, r) =>
+        match u32_less_than(x, h) {
+          1 => (1, store(ListNode.Cons(x, l))),
+          0 =>
+            match u32_less_than(h, x) {
+              0 => (0, l),
+              1 =>
+                match glist_ordered_insert(x, r) {
+                  (f, r2) => (f, store(ListNode.Cons(h, r2))),
+                },
+            },
         },
-      KLevel.Succ(&a) =>
-        KLevel.Succ(store(level_subst_reduce(a, p, repl))),
-      KLevel.Max(&a, &b) =>
-        level_max(level_subst_reduce(a, p, repl), level_subst_reduce(b, p, repl)),
-      KLevel.IMax(&a, &b) =>
-        level_imax(level_subst_reduce(a, p, repl), level_subst_reduce(b, p, repl)),
     }
   }
 
-  fn level_leq(a: KLevel, b: KLevel) -> G {
-    match a {
-      KLevel.Zero => 1,
-      KLevel.Max(&a1, &a2) =>
-        level_leq(a1, b) * level_leq(a2, b),
-      KLevel.Succ(&a1) =>
-        match a1 {
-          KLevel.Max(&x, &y) =>
-            level_leq(KLevel.Succ(store(x)), b) * level_leq(KLevel.Succ(store(y)), b),
-          _ =>
-            match b {
-              KLevel.Succ(&b1) => level_leq(a1, b1),
-              KLevel.Max(&b1, &b2) =>
-                let r1 = level_leq(a, b1);
-                match r1 {
-                  1 => 1,
+  -- Insert (idx, k) into an idx-sorted var list; on duplicate idx keep
+  -- the max offset. Mirrors Node::add_var.
+  fn nlvars_add(vars: List‹&NLVar›, idx: G, k: G) -> List‹&NLVar› {
+    match load(vars) {
+      ListNode.Nil =>
+        store(ListNode.Cons(store(NLVar.Mk(idx, k)), store(ListNode.Nil))),
+      ListNode.Cons(v, rest) =>
+        match load(v) {
+          NLVar.Mk(vi, vo) =>
+            match u32_less_than(idx, vi) {
+              1 => store(ListNode.Cons(store(NLVar.Mk(idx, k)), vars)),
+              0 =>
+                match u32_less_than(vi, idx) {
+                  1 => store(ListNode.Cons(v, nlvars_add(rest, idx, k))),
                   0 =>
-                    let r2 = level_leq(a, b2);
-                    match r2 {
-                      1 => 1,
+                    match u32_less_than(vo, k) {
+                      1 => store(ListNode.Cons(store(NLVar.Mk(vi, k)), rest)),
+                      0 => vars,
+                    },
+                },
+            },
+        },
+    }
+  }
+
+  fn nlvars_eq(a: List‹&NLVar›, b: List‹&NLVar›) -> G {
+    match load(a) {
+      ListNode.Nil =>
+        match load(b) {
+          ListNode.Nil => 1,
+          ListNode.Cons(_, _) => 0,
+        },
+      ListNode.Cons(x, ar) =>
+        match load(b) {
+          ListNode.Nil => 0,
+          ListNode.Cons(y, br) =>
+            match load(x) {
+              NLVar.Mk(xi, xo) =>
+                match load(y) {
+                  NLVar.Mk(yi, yo) =>
+                    match xi - yi {
+                      0 =>
+                        match xo - yo {
+                          0 => nlvars_eq(ar, br),
+                          _ => 0,
+                        },
+                      _ => 0,
+                    },
+                },
+            },
+        },
+    }
+  }
+
+  fn nlvars_max_offset(vars: List‹&NLVar›) -> G {
+    match load(vars) {
+      ListNode.Nil => 0,
+      ListNode.Cons(v, rest) =>
+        match load(v) {
+          NLVar.Mk(_, vo) =>
+            let m = nlvars_max_offset(rest);
+            match u32_less_than(m, vo) {
+              1 => vo,
+              0 => m,
+            },
+        },
+    }
+  }
+
+  -- Keep x in xs unless ys has the same idx with offset >= x's.
+  -- Mirrors level.rs subsume_vars (sorted merge walk).
+  fn nlvars_subsume(xs: List‹&NLVar›, ys: List‹&NLVar›) -> List‹&NLVar› {
+    match load(xs) {
+      ListNode.Nil => xs,
+      ListNode.Cons(x, xr) =>
+        match load(ys) {
+          ListNode.Nil => xs,
+          ListNode.Cons(y, yr) =>
+            match load(x) {
+              NLVar.Mk(xi, xo) =>
+                match load(y) {
+                  NLVar.Mk(yi, yo) =>
+                    match u32_less_than(xi, yi) {
+                      1 => store(ListNode.Cons(x, nlvars_subsume(xr, ys))),
                       0 =>
-                        let bfull = KLevel.Max(store(b1), store(b2));
-                        let hp = level_has_param(bfull);
-                        match hp {
+                        match u32_less_than(yi, xi) {
+                          1 => nlvars_subsume(xs, yr),
+                          0 =>
+                            match u32_less_than(yo, xo) {
+                              1 => store(ListNode.Cons(x, nlvars_subsume(xr, yr))),
+                              0 => nlvars_subsume(xr, yr),
+                            },
+                        },
+                    },
+                },
+            },
+        },
+    }
+  }
+
+  -- Max the constant contribution into the entry at `path` (path-sorted
+  -- insert). Mirrors norm_add_const incl. its skip rule: k = 0 never
+  -- contributes; k = 1 only at the empty path (along a non-empty path
+  -- all conditioning params are >= 1 so the branch value is >= 1 anyway).
+  fn nl_add_const(acc: List‹&NLEntry›, path: List‹G›, k: G) -> List‹&NLEntry› {
+    match k {
+      0 => acc,
+      1 =>
+        match load(path) {
+          ListNode.Nil => nl_add_const_go(acc, path, k),
+          ListNode.Cons(_, _) => acc,
+        },
+      _ => nl_add_const_go(acc, path, k),
+    }
+  }
+
+  fn nl_add_const_go(acc: List‹&NLEntry›, path: List‹G›, k: G) -> List‹&NLEntry› {
+    match load(acc) {
+      ListNode.Nil =>
+        store(ListNode.Cons(
+          store(NLEntry.Mk(path, k, store(ListNode.Nil))),
+          store(ListNode.Nil))),
+      ListNode.Cons(e, rest) =>
+        match load(e) {
+          NLEntry.Mk(ep, ec, ev) =>
+            match glist_cmp(path, ep) {
+              0 =>
+                match u32_less_than(ec, k) {
+                  1 => store(ListNode.Cons(store(NLEntry.Mk(ep, k, ev)), rest)),
+                  0 => acc,
+                },
+              1 =>
+                store(ListNode.Cons(
+                  store(NLEntry.Mk(path, k, store(ListNode.Nil))), acc)),
+              _ => store(ListNode.Cons(e, nl_add_const_go(rest, path, k))),
+            },
+        },
+    }
+  }
+
+  -- Add var (idx, k) to the entry at `path` (path-sorted insert).
+  fn nl_add_var(acc: List‹&NLEntry›, path: List‹G›, idx: G, k: G) -> List‹&NLEntry› {
+    match load(acc) {
+      ListNode.Nil =>
+        store(ListNode.Cons(
+          store(NLEntry.Mk(path, 0,
+            store(ListNode.Cons(store(NLVar.Mk(idx, k)), store(ListNode.Nil))))),
+          store(ListNode.Nil))),
+      ListNode.Cons(e, rest) =>
+        match load(e) {
+          NLEntry.Mk(ep, ec, ev) =>
+            match glist_cmp(path, ep) {
+              0 =>
+                store(ListNode.Cons(
+                  store(NLEntry.Mk(ep, ec, nlvars_add(ev, idx, k))), rest)),
+              1 =>
+                store(ListNode.Cons(
+                  store(NLEntry.Mk(path, 0,
+                    store(ListNode.Cons(store(NLVar.Mk(idx, k)), store(ListNode.Nil))))),
+                  acc)),
+              _ => store(ListNode.Cons(e, nl_add_var(rest, path, idx, k))),
+            },
+        },
+    }
+  }
+
+  -- Flatten a level into canonical form. `path` = imax conditioning chain
+  -- (sorted param idxs), `k` = accumulated Succ offset. Mirrors
+  -- normalize_aux; the IMax arm delegates to the dispatch (whose cases
+  -- replicate the aux IMax-shape cases verbatim, as in level.rs).
+  fn normalize_aux(l: KLevel, path: List‹G›, k: G,
+                   acc: List‹&NLEntry›) -> List‹&NLEntry› {
+    match l {
+      KLevel.Zero => nl_add_const(acc, path, k),
+      KLevel.Succ(&inner) => normalize_aux(inner, path, k + 1, acc),
+      KLevel.Max(&a, &b) =>
+        normalize_aux(b, path, k, normalize_aux(a, path, k, acc)),
+      KLevel.IMax(&u, &b) => normalize_imax_dispatch(u, b, path, k, acc),
+      KLevel.Param(idx) =>
+        match glist_ordered_insert(idx, path) {
+          (1, new_path) =>
+            nl_add_var(nl_add_const(acc, path, k), new_path, idx, k),
+          (0, _) =>
+            match k {
+              0 => acc,
+              _ => nl_add_var(acc, path, idx, k),
+            },
+        },
+    }
+  }
+
+  -- imax(a, b) by b's shape: zero kills the branch; succ is never-zero so
+  -- imax = max; max/imax distribute; param conditions the path.
+  fn normalize_imax_dispatch(a: KLevel, b: KLevel, path: List‹G›, k: G,
+                             acc: List‹&NLEntry›) -> List‹&NLEntry› {
+    match b {
+      KLevel.Zero => nl_add_const(acc, path, k),
+      KLevel.Succ(&v) =>
+        normalize_aux(v, path, k + 1, normalize_aux(a, path, k, acc)),
+      KLevel.Max(&v, &w) =>
+        -- imax(a, max(v, w)) = max(imax(a, v), imax(a, w))
+        normalize_imax_dispatch(a, w, path, k,
+          normalize_imax_dispatch(a, v, path, k, acc)),
+      KLevel.IMax(&v, &w) =>
+        -- imax(a, imax(v, w)) = max(imax(a, w), imax(v, w))
+        normalize_imax_dispatch(v, w, path, k,
+          normalize_imax_dispatch(a, w, path, k, acc)),
+      KLevel.Param(idx) =>
+        match glist_ordered_insert(idx, path) {
+          (1, new_path) =>
+            -- param(idx) = 0 branch: imax(a, 0) = 0, contributing k.
+            normalize_aux(a, new_path, k,
+              nl_add_var(nl_add_const(acc, path, k), new_path, idx, k)),
+          (0, _) =>
+            let acc2 = match k {
+              0 => acc,
+              _ => nl_add_var(acc, path, idx, k),
+            };
+            normalize_aux(a, path, k, acc2),
+        },
+    }
+  }
+
+  -- Phase 2 subsumption: drop contributions dominated by another entry
+  -- whose path is a subset (active whenever the dominated one is).
+  -- Each entry folds over the ORIGINAL (snapshot) list, mirroring the
+  -- Rust snapshot semantics.
+  fn nl_subsumption(acc: List‹&NLEntry›) -> List‹&NLEntry› {
+    nl_subsumption_walk(acc, acc)
+  }
+
+  fn nl_subsumption_walk(rem: List‹&NLEntry›,
+                         snapshot: List‹&NLEntry›) -> List‹&NLEntry› {
+    match load(rem) {
+      ListNode.Nil => rem,
+      ListNode.Cons(e, rest) =>
+        store(ListNode.Cons(
+          nl_subsume_entry(e, snapshot),
+          nl_subsumption_walk(rest, snapshot))),
+    }
+  }
+
+  fn nl_subsume_entry(e: &NLEntry, snap: List‹&NLEntry›) -> &NLEntry {
+    match load(snap) {
+      ListNode.Nil => e,
+      ListNode.Cons(s, srest) =>
+        match load(e) {
+          NLEntry.Mk(p1, c1, v1) =>
+            match load(s) {
+              NLEntry.Mk(p2, c2, v2) =>
+                match glist_subset(p2, p1) {
+                  0 => nl_subsume_entry(e, srest),
+                  _ =>
+                    -- subset + equal length <=> p1 == p2 (self entry)
+                    let same = glist_eq_len(p1, p2);
+                    let c1n = match c1 {
+                      0 => 0,
+                      _ =>
+                        let or1 = match same {
+                          1 => 1,
+                          0 => u32_less_than(c2, c1),
+                        };
+                        let or2 = match load(v2) {
+                          ListNode.Nil => 1,
+                          ListNode.Cons(_, _) =>
+                            u32_less_than(nlvars_max_offset(v1) + 1, c1),
+                        };
+                        match or1 * or2 {
                           0 => 0,
-                          _ =>
-                            let p = level_any_param(bfull);
-                            let sp = KLevel.Succ(store(KLevel.Param(p)));
-                            let a0 = level_subst_reduce(a, p, KLevel.Zero);
-                            let b0 = level_subst_reduce(bfull, p, KLevel.Zero);
-                            let a1s = level_subst_reduce(a, p, sp);
-                            let b1s = level_subst_reduce(bfull, p, sp);
-                            level_leq(a0, b0) * level_leq(a1s, b1s),
+                          _ => c1,
                         },
+                    };
+                    let v1n = match same {
+                      1 => v1,
+                      0 =>
+                        match load(v2) {
+                          ListNode.Nil => v1,
+                          ListNode.Cons(_, _) => nlvars_subsume(v1, v2),
+                        },
+                    };
+                    nl_subsume_entry(store(NLEntry.Mk(p1, c1n, v1n)), srest),
+                },
+            },
+        },
+    }
+  }
+
+  fn nl_eq(a: List‹&NLEntry›, b: List‹&NLEntry›) -> G {
+    match load(a) {
+      ListNode.Nil =>
+        match load(b) {
+          ListNode.Nil => 1,
+          ListNode.Cons(_, _) => 0,
+        },
+      ListNode.Cons(x, ar) =>
+        match load(b) {
+          ListNode.Nil => 0,
+          ListNode.Cons(y, br) =>
+            match load(x) {
+              NLEntry.Mk(xp, xc, xv) =>
+                match load(y) {
+                  NLEntry.Mk(yp, yc, yv) =>
+                    match glist_cmp(xp, yp) {
+                      0 =>
+                        match xc - yc {
+                          0 =>
+                            match nlvars_eq(xv, yv) {
+                              1 => nl_eq(ar, br),
+                              0 => 0,
+                            },
+                          _ => 0,
+                        },
+                      _ => 0,
                     },
                 },
-              _ => 0,
             },
         },
-      KLevel.Param(i) =>
-        match b {
-          KLevel.Param(j) => eq_zero(i - j),
-          KLevel.Succ(&b1) => level_leq(a, b1),
-          KLevel.Max(&b1, &b2) =>
-            let r1 = level_leq(a, b1);
-            match r1 {
-              1 => 1,
-              0 => level_leq(a, b2),
+    }
+  }
+
+  -- Does some entry (p2, n2) in l2 with p2 SUBSET-OF p1 dominate the
+  -- constant c along every assignment activating p1? n2.const counts
+  -- unconditionally in that branch; each var (idx, off) counts as
+  -- >= off + 1 because idx IN p2 implies the param is >= 1 there.
+  -- Mirrors level.rs covers_const (the fixed, split-search version).
+  fn nl_covers_const(l2: List‹&NLEntry›, p1: List‹G›, c: G) -> G {
+    match load(l2) {
+      ListNode.Nil => 0,
+      ListNode.Cons(e, rest) =>
+        match load(e) {
+          NLEntry.Mk(p2, c2, v2) =>
+            match glist_subset(p2, p1) {
+              0 => nl_covers_const(rest, p1, c),
+              _ =>
+                let hit = match u32_less_than(c2, c) {
+                  0 => 1,
+                  1 => nlvars_any_offset_geq(v2, c),
+                };
+                match hit {
+                  1 => 1,
+                  0 => nl_covers_const(rest, p1, c),
+                },
             },
-          KLevel.IMax(&b1, &b2) =>
-            let p = level_any_param(b2);
-            let sp = KLevel.Succ(store(KLevel.Param(p)));
-            let a0 = level_subst_reduce(a, p, KLevel.Zero);
-            let bfull = KLevel.IMax(store(b1), store(b2));
-            let b0 = level_subst_reduce(bfull, p, KLevel.Zero);
-            let a1s = level_subst_reduce(a, p, sp);
-            let b1s = level_subst_reduce(bfull, p, sp);
-            level_leq(a0, b0) * level_leq(a1s, b1s),
-          KLevel.Zero => 0,
         },
-      KLevel.IMax(&a1, &a2) =>
-        let not_zero = level_is_not_zero(a2);
-        match not_zero {
-          1 => level_leq(a1, b) * level_leq(a2, b),
-          0 =>
-            let p = level_any_param(a2);
-            let sp = KLevel.Succ(store(KLevel.Param(p)));
-            let afull = KLevel.IMax(store(a1), store(a2));
-            let a0 = level_subst_reduce(afull, p, KLevel.Zero);
-            let b0 = level_subst_reduce(b, p, KLevel.Zero);
-            let a1s = level_subst_reduce(afull, p, sp);
-            let b1s = level_subst_reduce(b, p, sp);
-            level_leq(a0, b0) * level_leq(a1s, b1s),
+    }
+  }
+
+  -- Any var with offset + 1 >= c?
+  fn nlvars_any_offset_geq(vars: List‹&NLVar›, c: G) -> G {
+    match load(vars) {
+      ListNode.Nil => 0,
+      ListNode.Cons(v, rest) =>
+        match load(v) {
+          NLVar.Mk(_, vo) =>
+            match u32_less_than(vo + 1, c) {
+              0 => 1,
+              1 => nlvars_any_offset_geq(rest, c),
+            },
+        },
+    }
+  }
+
+  -- Does some entry (p2, n2) in l2 with p2 SUBSET-OF p1 contain a var
+  -- (w, off2) with off2 >= off? Mirrors level.rs covers_var.
+  fn nl_covers_var(l2: List‹&NLEntry›, p1: List‹G›, w: G, off: G) -> G {
+    match load(l2) {
+      ListNode.Nil => 0,
+      ListNode.Cons(e, rest) =>
+        match load(e) {
+          NLEntry.Mk(p2, _, v2) =>
+            match glist_subset(p2, p1) {
+              0 => nl_covers_var(rest, p1, w, off),
+              _ =>
+                match nlvars_dominates(v2, w, off) {
+                  1 => 1,
+                  0 => nl_covers_var(rest, p1, w, off),
+                },
+            },
+        },
+    }
+  }
+
+  fn nlvars_dominates(vars: List‹&NLVar›, w: G, off: G) -> G {
+    match load(vars) {
+      ListNode.Nil => 0,
+      ListNode.Cons(v, rest) =>
+        match load(v) {
+          NLVar.Mk(vi, vo) =>
+            match vi - w {
+              0 =>
+                match u32_less_than(vo, off) {
+                  0 => 1,
+                  1 => nlvars_dominates(rest, w, off),
+                },
+              _ => nlvars_dominates(rest, w, off),
+            },
         },
     }
   }
 
+  -- Semantic l1 <= l2 on canonical forms: every entry's contribution in
+  -- its activation branch must be dominated by subset-path entries of l2.
+  -- Mirrors level.rs norm_level_le (with its covers split, sound where
+  -- Lean4Lean's single-entry search is incomplete).
+  fn nl_le(l1: List‹&NLEntry›, l2: List‹&NLEntry›) -> G {
+    match load(l1) {
+      ListNode.Nil => 1,
+      ListNode.Cons(e, rest) =>
+        match load(e) {
+          NLEntry.Mk(p1, c1, v1) =>
+            let c_ok = match c1 {
+              0 => 1,
+              _ => nl_covers_const(l2, p1, c1),
+            };
+            match c_ok {
+              0 => 0,
+              1 =>
+                match nl_le_vars(v1, l2, p1) {
+                  0 => 0,
+                  1 => nl_le(rest, l2),
+                },
+            },
+        },
+    }
+  }
+
+  fn nl_le_vars(vars: List‹&NLVar›, l2: List‹&NLEntry›, p1: List‹G›) -> G {
+    match load(vars) {
+      ListNode.Nil => 1,
+      ListNode.Cons(v, rest) =>
+        match load(v) {
+          NLVar.Mk(vi, vo) =>
+            match nl_covers_var(l2, p1, vi, vo) {
+              0 => 0,
+              1 => nl_le_vars(rest, l2, p1),
+            },
+        },
+    }
+  }
+
+  -- Keyed on the level alone: each distinct level normalizes once per run.
+  -- Seeded with the empty-path entry, mirroring normalize_level.
+  fn level_normalize(l: KLevel) -> List‹&NLEntry› {
+    let seed = store(ListNode.Cons(
+      store(NLEntry.Mk(store(ListNode.Nil), 0, store(ListNode.Nil))),
+      store(ListNode.Nil)));
+    nl_subsumption(normalize_aux(l, store(ListNode.Nil), 0, seed))
+  }
+
+  fn level_leq(a: KLevel, b: KLevel) -> G {
+    match level_eq(a, b) {
+      1 => 1,
+      0 =>
+        match a {
+          KLevel.Zero => 1,
+          _ => nl_le(level_normalize(a), level_normalize(b)),
+        },
+    }
+  }
+
+  -- Semantic level equality via canonical forms (mirror: level.rs
+  -- univ_eq). The previous leq(a,b)*leq(b,a) did a two-way
+  -- param-substitution split per Max/IMax-with-param -- exponential in
+  -- params and the dominant record cost on instance-heavy checks.
   fn level_equal(a: KLevel, b: KLevel) -> G {
-    level_leq(a, b) * level_leq(b, a)
+    match level_eq(a, b) {
+      1 => 1,
+      0 => nl_eq(level_normalize(a), level_normalize(b)),
+    }
   }
 
   fn level_max(a: KLevel, b: KLevel) -> KLevel {

From b017a3a49f57bf53ecd74636f83f89e223a3b5f0 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 11 Jun 2026 15:56:09 +0000
Subject: [PATCH 6/8] Ix CLI: refs-of subcommand (frontier leaf list for
 assumption trees)

Print a constant's direct constant references (literal blobs filtered:
content-pinned, not well-typedness obligations) as the comma-separated
hex list `ix tree canonical` expects, so a single-constant frontier
assumption check is:

    ix tree canonical $(ix refs-of <name> --ixe env.ixe)
    ix claim check <addr> --asm <root>
    ix check --ixe env.ixe --claim <hash>
---
 Ix/Cli/RefsOfCmd.lean | 70 +++++++++++++++++++++++++++++++++++++++++++
 Main.lean             |  2 ++
 2 files changed, 72 insertions(+)
 create mode 100644 Ix/Cli/RefsOfCmd.lean

diff --git a/Ix/Cli/RefsOfCmd.lean b/Ix/Cli/RefsOfCmd.lean
new file mode 100644
index 00000000..c2824903
--- /dev/null
+++ b/Ix/Cli/RefsOfCmd.lean
@@ -0,0 +1,70 @@
+/-
+  `ix refs-of <Lean.Name> --ixe <path>`: print the direct constant
+  references of a constant — its `Constant.refs` entries that resolve to
+  constants in the env (literal blobs are skipped: they are pinned by
+  content-addressing and are not well-typedness obligations).
+
+  Prints a comma-separated list of 64-char hex addresses on stdout, the
+  exact shape `ix tree canonical` expects, so a frontier assumption tree
+  for a single constant is:
+
+      ix tree canonical $(ix refs-of <name> --ixe env.ixe)
+-/
+module
+public import Cli
+public import Ix.Address
+public import Ix.Common
+public import Ix.Ixon
+
+public section
+
+namespace Ix.Cli.RefsOfCmd
+
+def parseName (arg : String) : Lean.Name :=
+  arg.splitOn "." |>.foldl (init := .anonymous)
+    fun acc s => match s.toNat? with
+      | some n => Lean.Name.mkNum acc n
+      | none   => Lean.Name.mkStr acc s
+
+def runRefsOfCmd (p : Cli.Parsed) : IO UInt32 := do
+  let some nameArg := p.positionalArg? "name"
+    | p.printError "error: must specify <name>"; return 1
+  let name := parseName (nameArg.as! String)
+  let some ixeFlag := p.flag? "ixe"
+    | p.printError "error: --ixe is required"; return 1
+  let path := ixeFlag.as! String
+  let bytes ← IO.FS.readBinFile path
+  let ixonEnv ← match Ixon.rsDeEnv bytes with
+    | .error e =>
+      IO.eprintln s!"error: failed to deserialize {path}: {e}"; return 1
+    | .ok env => pure env
+  let some addr := ixonEnv.getAddr? (Ix.Name.fromLeanName name)
+    | IO.eprintln s!"error: {name} not found in {path}"; return 1
+  let some c := ixonEnv.getConst? addr
+    | IO.eprintln s!"error: constant for {name} missing from env"; return 1
+  let mut seen : Std.HashSet String := {}
+  let mut out : Array String := #[]
+  for r in c.refs do
+    if (ixonEnv.getConst? r).isSome then
+      let s := toString r
+      if !seen.contains s then
+        seen := seen.insert s
+        out := out.push s
+  IO.println (String.intercalate "," out.toList)
+  return 0
+
+end Ix.Cli.RefsOfCmd
+
+open Ix.Cli.RefsOfCmd in
+def refsOfCmd : Cli.Cmd := `[Cli|
+  "refs-of" VIA runRefsOfCmd;
+  "Print a constant's direct constant references (comma-separated hex), the leaf list for a frontier assumption tree"
+
+  FLAGS:
+    "ixe" : String; "Path to a serialized `.ixe` env (required)."
+
+  ARGS:
+    name : String; "Fully-qualified Lean.Name (e.g. `Nat.add_comm`)."
+]
+
+end
diff --git a/Main.lean b/Main.lean
index e72ae1e8..5f70ba71 100644
--- a/Main.lean
+++ b/Main.lean
@@ -1,5 +1,6 @@
 --import Ix.Cli.StoreCmd
 import Ix.Cli.AddrOfCmd
+import Ix.Cli.RefsOfCmd
 import Ix.Cli.CheckCmd
 import Ix.Cli.CheckRsCmd
 import Ix.Cli.ClaimCmd
@@ -34,6 +35,7 @@ def ixCmd : Cli.Cmd := `[Cli|
     shardCmd;
     verifyCmd;
     addrOfCmd;
+    refsOfCmd;
     ingressCmd;
     validateCmd;
     serveCmd;

From fbedfc866b9891e83d9b3ff35a8c7920c7d77ac6 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 11 Jun 2026 21:47:07 +0000
Subject: [PATCH 7/8] IxVM: idx-keyed projection-definition classification
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

try_reduce_projection_definition ran per Const-head delta candidate
with the SPINE in its memo key, but whether a Defn is a projection
wrapper depends only on the constant: 12.7M record entries on the
UTF-8 codec check, scaling 39x per SInt width doubling (structure-
bundle instances force projections constantly).

Split the decision into proj_def_info_of, keyed on the index alone
(one row per distinct constant per run), gate the caller on the scalar
is_proj_def (let-bound matches only take scalar scrutinees — a
tuple-pattern gate compiles but fails toplevel check with "Non-tail
match in arbitrary position"), and reach the spine-keyed application
only for actual projection-definitions.

Measured (297 ixvm tests pass): Int16.instRxcHasSize_eq 189.3B ->
186.8B FFT (-1.3%, wall 1m42s -> 1m39s); mergeSort/Vector/
length_append each improve slightly. The UTF-8-class payoff is larger
(12.7M entries -> ~hundreds on that profile).
---
 Ix/IxVM/Kernel/Primitive.lean | 59 ++++++++++++++++++++++++++---------
 Ix/IxVM/Kernel/Whnf.lean      |  8 ++++-
 2 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/Ix/IxVM/Kernel/Primitive.lean b/Ix/IxVM/Kernel/Primitive.lean
index be296179..31e57dd2 100644
--- a/Ix/IxVM/Kernel/Primitive.lean
+++ b/Ix/IxVM/Kernel/Primitive.lean
@@ -2626,26 +2626,57 @@ def primitive := ⟦
   -- Recognizes Defn-kind constants whose body is `λ x_1 ... x_n → Prj S i (BVar k)`
   -- and shortcuts the unfolding to a direct `Prj S i args[arity-1-k]`. Pure
   -- performance: standard delta+beta still produces same result.
-  fn try_reduce_projection_definition(head_idx: G, spine: List‹KExpr›,
-                                       top: List‹&KConstantInfo›) -> (G, KExpr) {
+  -- Whether the constant at `head_idx` is a projection-definition, and
+  -- its (arity, struct_idx, field, struct_arg_idx) when so. Keyed on the
+  -- INDEX alone (top is constant per run): one memo row per distinct
+  -- constant, where the previous spine-keyed probe paid a row per delta
+  -- candidate — 12.7M entries on the UTF-8 codec check, scaling 39x per
+  -- SInt width doubling (structure-bundle instances force projections
+  -- constantly).
+  fn proj_def_info_of(head_idx: G,
+                      top: List‹&KConstantInfo›) -> (G, G, G, G, G) {
     match load(list_lookup(top, head_idx)) {
       KConstantInfo.Defn(_, _, value, _, _) =>
-        match projection_definition_info(value, 0) {
-          (1, arity, struct_idx, field, struct_arg_idx) =>
-            match u32_less_than(list_length(spine), arity) {
-              1 => (0, store(KExprNode.BVar(0))),
-              0 =>
-                let target_arg = list_lookup(spine, struct_arg_idx);
-                let proj_expr = store(KExprNode.Proj(struct_idx, field, target_arg));
-                let post = list_drop(spine, arity);
-                (1, apply_spine(proj_expr, post)),
-            },
-          _ => (0, store(KExprNode.BVar(0))),
-        },
+        projection_definition_info(value, 0),
+      _ => (0, 0, 0, 0, 0),
+    }
+  }
+
+  -- Scalar gate for inline use (let-bound matches only support scalar
+  -- scrutinees): 1 iff the constant is a projection-definition.
+  fn is_proj_def(head_idx: G, top: List‹&KConstantInfo›) -> G {
+    match proj_def_info_of(head_idx, top) {
+      (1, _, _, _, _) => 1,
+      _ => 0,
+    }
+  }
+
+  -- The spine-keyed application, called ONLY when `is_proj_def` already
+  -- said yes (rare), so the per-delta-candidate row of the old
+  -- ungated form is gone.
+  fn try_reduce_projection_definition(head_idx: G, spine: List‹KExpr›,
+                                       top: List‹&KConstantInfo›) -> (G, KExpr) {
+    match proj_def_info_of(head_idx, top) {
+      (1, arity, struct_idx, field, struct_arg_idx) =>
+        proj_def_apply(arity, struct_idx, field, struct_arg_idx, spine),
       _ => (0, store(KExprNode.BVar(0))),
     }
   }
 
+  -- The spine-dependent application, reached only for ACTUAL
+  -- projection-definitions (rare).
+  fn proj_def_apply(arity: G, struct_idx: G, field: G, struct_arg_idx: G,
+                    spine: List‹KExpr›) -> (G, KExpr) {
+    match u32_less_than(list_length(spine), arity) {
+      1 => (0, store(KExprNode.BVar(0))),
+      0 =>
+        let target_arg = list_lookup(spine, struct_arg_idx);
+        let proj_expr = store(KExprNode.Proj(struct_idx, field, target_arg));
+        let post = list_drop(spine, arity);
+        (1, apply_spine(proj_expr, post)),
+    }
+  }
+
   -- Mirror: src/ix/kernel/whnf.rs:1441-1500 try_reduce_fin_val_decidable_rec
   -- + project_decidable_fin_val_minor.
   --
diff --git a/Ix/IxVM/Kernel/Whnf.lean b/Ix/IxVM/Kernel/Whnf.lean
index 93dac335..93348a0e 100644
--- a/Ix/IxVM/Kernel/Whnf.lean
+++ b/Ix/IxVM/Kernel/Whnf.lean
@@ -223,7 +223,13 @@ def whnf := ⟦
           -- which would loop.
           (2, stuck) => stuck,
           (0, _) =>
-            let proj_def_pair = try_reduce_projection_definition(idx, spine, top);
+            -- gate on the idx-keyed proj-def classification (memoized per
+            -- constant): only real projection-definitions pay the
+            -- spine-keyed apply row
+            let proj_def_pair = match is_proj_def(idx, top) {
+              1 => try_reduce_projection_definition(idx, spine, top),
+              _ => (0, store(KExprNode.BVar(0))),
+            };
             match proj_def_pair {
               (1, reduced_pd) => whnf(reduced_pd, types, top, addrs),
               (0, _) =>

From 2d83be1a890aec92d216ad6542fe44c5172d2eab Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 11 Jun 2026 21:50:14 +0000
Subject: [PATCH 8/8] =?UTF-8?q?IxVM:=20port=20jcb/fixes=20H-14=20=E2=80=94?=
 =?UTF-8?q?=20ptr=5Fval=20skip=20map=20+=20lockstep=20addr=20cursor?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two quadratic/constant-factor fixes to check_all_skipping, ported from
jcb/fixes (3763356, John C. Burnham):

- The assumption-leaf skip set keys on ptr_val instead of the first 4
  address bytes: one tree lookup per constant, no per-lookup address
  load, no confirming address_eq. Sound by the build_addr_pos_map
  interning invariant (one pointer, one content): a ptr hit implies the
  address IS a leaf; a de-interned pointer reads as absent and the
  constant just gets checked — fail-closed.
- The iterator walks the addrs list in LOCKSTEP with consts (cur_addrs
  suffix) instead of list_lookup(addrs, pos) per constant, which
  re-walked the prefix every iteration — a standalone O(closure^2).

Affects sharded checks and assumption-gated claims. 297 ixvm tests;
frontier-assumption claim smoke passes.

Assessment of the rest of jcb/fixes for Aiur: H-13 is compile-pipeline
wall-clock (not Aiur), H-15 is the Rust convenience kernel only. The
soundness items (C-1, H-2, H-3, H-12, eta/proj guards) touch
Whnf/DefEq/Primitive files this branch has restructured — flagged for
merge planning, not ported here.
---
 Ix/IxVM/Kernel/Claim.lean | 72 +++++++++++++++------------------------
 1 file changed, 28 insertions(+), 44 deletions(-)

diff --git a/Ix/IxVM/Kernel/Claim.lean b/Ix/IxVM/Kernel/Claim.lean
index 7122431a..3a9d54d5 100644
--- a/Ix/IxVM/Kernel/Claim.lean
+++ b/Ix/IxVM/Kernel/Claim.lean
@@ -804,43 +804,22 @@ def claim := ⟦
     }
   }
 
-  -- Pack the first 4 address bytes (LE) into a u32 key for the skip-set rbtree.
-  --
-  -- Capped at 4 bytes because `RBTreeMap` orders keys with `u32_less_than`, a
-  -- 32-bit comparator gadget — a wider key (a single `G` could hold 7 bytes in
-  -- Goldilocks) would overflow it and corrupt the tree ordering. A 32-bit key
-  -- space means key collisions are possible (~N²/2^33 over N leaves), but they
-  -- are harmless: a collision makes `addr_in_set`'s confirming `address_eq`
-  -- fail, yielding a false negative (a frontier const gets rechecked) never a
-  -- false positive (a wrong skip). See `build_skip_set`.
-  fn addr_key(a: Addr) -> G {
-    let arr = load(a);
-    to_field(arr[0])
-      + to_field(arr[1]) * 256
-      + to_field(arr[2]) * 65536
-      + to_field(arr[3]) * 16777216
-  }
-
-  -- Build an O(log N) membership set from the assumption-leaf list, keyed on
-  -- `addr_key`. Key collisions overwrite — sound because the only consequence is
-  -- a missed skip (a frontier const gets rechecked, never wrongly trusted); the
-  -- confirming `address_eq` in `addr_in_set` rules out false positives.
-  fn build_skip_set(leaves: List‹Addr›, acc: RBTreeMap‹Addr›) -> RBTreeMap‹Addr› {
+  -- Membership map over the assumption leaves, keyed by `ptr_val`. Sound
+  -- by the same invariant as `build_addr_pos_map` (Ingress.lean): one
+  -- pointer maps to one content, so a ptr hit implies the address IS a
+  -- leaf (no false skip); a de-interned pointer reads as absent, which
+  -- here only means the constant gets CHECKED instead of skipped —
+  -- conservative. One tree lookup per constant, no confirming
+  -- address_eq, no per-lookup address loads (ported from jcb/fixes
+  -- H-14; replaces the 4-byte-content-keyed set + confirm compare).
+  fn build_asm_leaf_map(leaves: List‹Addr›) -> RBTreeMap‹G› {
     match load(leaves) {
-      ListNode.Nil => acc,
+      ListNode.Nil => RBTreeMap.Nil,
       ListNode.Cons(a, rest) =>
-        build_skip_set(rest, rbtree_map_insert(addr_key(a), a, acc)),
+        rbtree_map_insert(ptr_val(a), 1, build_asm_leaf_map(rest)),
     }
   }
 
-  -- Membership via one rbtree lookup (cheap u32 key compares) + one confirming
-  -- full `address_eq`, replacing the O(N) linear `addr_in_list` scan that
-  -- dominated address_eq cost on sharded checks.
-  fn addr_in_set(target: Addr, skip_set: RBTreeMap‹Addr›) -> G {
-    let found = rbtree_map_lookup_or_default(addr_key(target), skip_set, store([0u8; 32]));
-    address_eq(found, target)
-  }
-
   -- ============================================================================
   -- check_all variant that skips positions whose addr is in the
   -- supplied assumption-leaf list.
@@ -850,27 +829,32 @@ def claim := ⟦
                         addrs: List‹Addr›,
                         asm_leaves: List‹Addr›) {
     let _ = check_canonical_block_sort(top);
-    -- Build the skip-set once (O(N log N)) instead of an O(N) linear scan per
-    -- checked const.
-    let skip_set = build_skip_set(asm_leaves, RBTreeMap.Nil);
-    check_all_skipping_iter(consts, top, addrs, skip_set, 0)
+    let asm_map = store(build_asm_leaf_map(asm_leaves));
+    check_all_skipping_iter(consts, top, addrs, addrs, asm_map, 0)
   }
 
+  -- `cur_addrs` is the suffix of `addrs` aligned with `consts`, walked in
+  -- lockstep instead of `list_lookup(addrs, pos)` — which re-walks the
+  -- list prefix every iteration, a standalone quadratic in closure size
+  -- (ported from jcb/fixes H-14).
   fn check_all_skipping_iter(consts: List‹&KConstantInfo›,
                              top: List‹&KConstantInfo›,
                              addrs: List‹Addr›,
-                             skip_set: RBTreeMap‹Addr›,
+                             cur_addrs: List‹Addr›,
+                             asm_map: &RBTreeMap‹G›,
                              pos: G) {
     match load(consts) {
       ListNode.Nil => (),
       ListNode.Cons(&ci, rest) =>
-        let addr = list_lookup(addrs, pos);
-        match addr_in_set(addr, skip_set) {
-          1 =>
-            check_all_skipping_iter(rest, top, addrs, skip_set, pos + 1),
-          _ =>
-            let _ = check_const(ci, pos, top, addrs);
-            check_all_skipping_iter(rest, top, addrs, skip_set, pos + 1),
+        match load(cur_addrs) {
+          ListNode.Cons(addr, rest_addrs) =>
+            match rbtree_map_lookup_or_default(ptr_val(addr), load(asm_map), 0) {
+              0 =>
+                let _ = check_const(ci, pos, top, addrs);
+                check_all_skipping_iter(rest, top, addrs, rest_addrs, asm_map, pos + 1),
+              _ =>
+                check_all_skipping_iter(rest, top, addrs, rest_addrs, asm_map, pos + 1),
+            },
         },
     }
   }