diff --git a/Cargo.lock b/Cargo.lock
index 7de24578..b1df7dd0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1954,7 +1954,6 @@ dependencies = [
 [[package]]
 name = "multi-stark"
 version = "0.1.0"
-source = "git+https://github.com/argumentcomputer/multi-stark.git?rev=9ecab51d553445c0cc7b571af00a76b8a83a6f8c#9ecab51d553445c0cc7b571af00a76b8a83a6f8c"
 dependencies = [
  "bincode",
  "p3-air",
diff --git a/Cargo.toml b/Cargo.toml
index 58b90be3..8339ae85 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@ itertools = "0.14.0"
 indexmap = { version = "2", features = ["rayon"] }
 lean-ffi = { git = "https://github.com/argumentcomputer/lean-ffi", rev = "cc98ebf67bf453ac3827cb767f78b13ea674dd6a" }
 mimalloc = { version = "0.1", default-features = false }
-multi-stark = { git = "https://github.com/argumentcomputer/multi-stark.git", rev = "9ecab51d553445c0cc7b571af00a76b8a83a6f8c" }
+multi-stark = { path = "multi-stark" }
 num-bigint = "0.4.6"
 rayon = "1"
 rustc-hash = "2"
@@ -31,8 +31,8 @@ getrandom = { version = "0.3", optional = true }
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 tracing-texray = { git = "https://github.com/argumentcomputer/tracing-texray", rev = "8ce04e3422cd48e68ef47fab95dba7d06b8c368c" }
-bincode = { version = "2.0.1", optional = true }
-serde = { version = "1.0.219", features = ["derive"], optional = true }
+bincode = { version = "2.0.1" }
+serde = { version = "1.0.219", features = ["derive"] }
 
 [dev-dependencies]
 quickcheck = "1.0.3"
@@ -43,7 +43,7 @@ quickcheck_macros = "1.0.0"
 default = []
 parallel = ["multi-stark/parallel"]
 test-ffi = []
-net = ["bytes", "tokio", "iroh", "iroh-base", "n0-error", "getrandom", "bincode", "serde" ]
+net = ["bytes", "tokio", "iroh", "iroh-base", "n0-error", "getrandom" ]
 
 [profile.dev]
 panic = "abort"
diff --git a/Ix/Aiur/Protocol.lean b/Ix/Aiur/Protocol.lean
index df320811..7bfdd5df 100644
--- a/Ix/Aiur/Protocol.lean
+++ b/Ix/Aiur/Protocol.lean
@@ -47,6 +47,10 @@ namespace AiurSystem
 @[extern "rs_aiur_system_build"]
 opaque build : @&Bytecode.Toplevel → @&CommitmentParameters → AiurSystem
 
+/-- Serialize the verifying key (`System<AiurCircuit>`) to bytes. -/
+@[extern "rs_aiur_system_vk_bytes"]
+opaque vkBytes : @& AiurSystem → ByteArray
+
 @[extern "rs_aiur_system_prove"]
 private opaque prove' : @& AiurSystem → @& FriParameters →
   @& Bytecode.FunIdx → @& Array G →
diff --git a/Ix/MultiStark.lean b/Ix/MultiStark.lean
new file mode 100644
index 00000000..f30912ef
--- /dev/null
+++ b/Ix/MultiStark.lean
@@ -0,0 +1,98 @@
+module
+public import Ix.Aiur.Meta
+public import Ix.IxVM.Core
+public import Ix.IxVM.ByteStream
+public import Ix.MultiStark.Goldilocks
+public import Ix.MultiStark.Deserialize
+public import Ix.MultiStark.Keccak
+public import Ix.MultiStark.Pcs
+public import Ix.MultiStark.SystemDeserialize
+public import Ix.MultiStark.Verifier
+public import Ix.MultiStark.Tests
+
+/-!
+# Multi-STARK proof verifier (Aiur)
+
+The recursive verifier. Its public statement is purely existential: *"there
+exists a valid multi-stark proof, under the FRI parameters given as public
+input, for the constraint system with this keccak-256 digest and these public
+claims."* The proof itself is **non-deterministic advice** (fed on IO channel 0,
+never hashed or otherwise bound as a public input): the Fiat-Shamir transcript
+replay plus the Merkle/OOD/FRI checks are exactly what make any accepted advice
+a valid proof — a hash binding of the proof bytes would add nothing to the
+statement, while costing one keccak-f per 136 bytes in-circuit.
+
+The verifying key and claims, by contrast, ARE digest-bound (`system_digest`,
+`claims_digest`): they determine *what was proven*.
+
+Fixed protocol assumptions (our system): `queryProofOfWorkBits = 0`,
+`capHeight = 0`, `maxLogArity = 1`, `logFinalPolyLen = 0`. The variable FRI
+parameters (`num_queries`, `commit_pow_bits`, `log_blowup`) are public inputs.
+-/
+
+public section
+
+namespace MultiStark
+
+def entrypoints := ⟦
+  -- Public inputs: the keccak-256 digests of the verifying key and the claims
+  -- (4 little-endian u64 lanes each) plus the variable FRI parameters. The
+  -- proof is pure non-deterministic advice on IO channel 0 — see the module
+  -- docstring. One stream per channel (0 = proof, 1 = vk, 2 = claims), each
+  -- registered under key `[0]` on its channel.
+  pub fn verify_multi_stark_proof(system_digest: [[U8; 8]; 4], claims_digest: [[U8; 8]; 4], num_queries: G, commit_pow_bits: G, log_blowup: G) {
+    -- Proof advice from IO channel 0: deserialize, assert fully consumed.
+    let (idx, len) = io_get_info(0, [0]);
+    let bytes = #read_byte_stream(0, idx, len);
+    let (proof, rest) = read_proof(bytes);
+    assert_eq!(load(rest), ListNode.Nil);
+    -- Verifying key (`System<AiurCircuit>`) from IO channel 1: bind the bytes
+    -- to the public keccak-256 `system_digest`, then reconstruct the system.
+    let (sidx, slen) = io_get_info(1, [0]);
+    let sbytes = #read_byte_stream(1, sidx, slen);
+    assert_eq!(keccak256(sbytes), system_digest);
+    let (sys, srest) = read_system(sbytes);
+    assert_eq!(load(srest), ListNode.Nil);
+    -- Public claims (`&[&[Val]]`) from IO channel 2: bind the bytes to the
+    -- public keccak-256 `claims_digest`, then deserialize. Binding them as a
+    -- public input is what makes the lookup argument sound (a prover cannot
+    -- choose claims adaptively).
+    let (cidx, clen) = io_get_info(2, [0]);
+    let cbytes = #read_byte_stream(2, cidx, clen);
+    assert_eq!(keccak256(cbytes), claims_digest);
+    let (claims, crest) = read_claims(cbytes);
+    assert_eq!(load(crest), ListNode.Nil);
+    -- Structural + accumulator + PCS checks.
+    assert_eq!(verify(proof), 1);
+    -- Step 3 + 5: prover-faithful Fiat-Shamir replay and the out-of-domain
+    -- composition/quotient check, `composition(ζ)·inv_vanishing(ζ) == quotient(ζ)`.
+    assert_eq!(ood_verify(sys, proof, claims, num_queries, commit_pow_bits, log_blowup), 1);
+    ()
+  }
+⟧
+
+/-- The standalone Multi-STARK verifier toplevel: `core` (lists/options) +
+`byteStream` (`U64`, `flatten_u64`, `read_byte_stream`, …) + the deserializer,
+the keccak-256 implementation, and the entrypoint. -/
+def multiStark : Except Aiur.Global Aiur.Source.Toplevel := do
+  let t ← IxVM.core.merge IxVM.byteStream
+  let t ← t.merge MultiStark.goldilocks
+  let t ← t.merge deserialize
+  let t ← t.merge keccak
+  let t ← t.merge systemDeserialize
+  let t ← t.merge pcs
+  let t ← t.merge verifier
+  t.merge entrypoints
+
+/-- The verifier toplevel PLUS its self-test entrypoints
+(`Ix/MultiStark/Tests.lean`). Kept separate from `multiStark` because every
+`pub fn` adds a circuit to the compiled system — the production verifier should
+not carry test-only width. Use this toplevel only to run the `*_test`
+entrypoints. -/
+def multiStarkTests : Except Aiur.Global Aiur.Source.Toplevel := do
+  let t ← multiStark
+  t.merge tests
+
+end MultiStark
+
+end
diff --git a/Ix/MultiStark/Deserialize.lean b/Ix/MultiStark/Deserialize.lean
new file mode 100644
index 00000000..d814a434
--- /dev/null
+++ b/Ix/MultiStark/Deserialize.lean
@@ -0,0 +1,406 @@
+module
+public import Ix.Aiur.Meta
+public import Ix.IxVM.Core
+public import Ix.IxVM.ByteStream
+
+/-!
+# Multi-STARK proof deserializer (Aiur)
+
+Aiur deserializer for `multi_stark::prover::Proof`, mirroring the hand-written
+bincode reader in `multi-stark/src/manual_codec.rs` (`fn deserialize`). Only
+the deserializer is ported (no serializer).
+
+The wire format is bincode `standard().with_little_endian().with_fixed_int_encoding()`:
+
+* every `Vec` / length prefix : `u64`, 8 bytes little-endian
+* `Option`                    : 1 tag byte (`0` = None, `1` = Some), then value
+* `u8`                        : 1 byte
+* struct                      : fields in declaration order, no framing
+* `[T; N]` / tuple            : N elements back-to-back, no length prefix
+* `Val` (Goldilocks)          : raw `u64`, 8 bytes LE (NOT reduced mod p)
+* `ExtVal` (deg-2 ext)        : `[u64; 2]`, 16 bytes LE
+* Merkle digest               : `[u64; 4]`, 32 bytes LE
+
+`read_proof` builds a `Proof` object; `Ix/MultiStark.lean` hangs the entrypoint
+(and eventual verification logic) off it.
+-/
+
+public section
+
+namespace MultiStark
+
+def deserialize := ⟦
+  -- ==========================================================================
+  -- Wire-level type mirrors of `multi-stark/src/manual_codec.rs`.
+  -- `U64`/`ByteStream` come from `IxVM.byteStream`; raw Goldilocks `Val` is the
+  -- non-canonical `u64`, kept here as the 8 little-endian bytes (`U64`).
+  -- ==========================================================================
+
+  -- `ExtVal = BinomialExtensionField<Goldilocks, 2> = 𝔽_p[X]/(X² - 7)`, stored
+  -- as its two Goldilocks coefficients `[c0, c1]` (= `c0 + c1·X`). Each coefficient
+  -- is a NON-NATIVE Goldilocks element (`[U8; 8]`, canonical LE bytes) so that the
+  -- composition-polynomial arithmetic is field-agnostic — it emulates Goldilocks
+  -- on bytes instead of using Aiur's native (outer) field. See `Goldilocks.lean`.
+  type Ext = [[U8; 8]; 2]
+
+  -- A Merkle digest: `[u64; DIGEST_ELEMS]` with `DIGEST_ELEMS = 4`.
+  type Digest = [U64; 4]
+
+  -- `Commitment = MerkleCap<..>`: only the `cap: Vec<Digest>` is on the wire.
+  type MerkleCap = List‹Digest›
+
+  -- `OpenedValuesForRound<ExtVal> = Vec<Vec<Vec<ExtVal>>>`.
+  type OpenedRound = List‹List‹List‹Ext›››
+
+  -- The three trace/quotient commitments at the head of a `Proof`.
+  enum Commitments {
+    Mk(MerkleCap, MerkleCap, MerkleCap)
+  }
+
+  -- `BatchOpening<Val, Mmcs>`: per-round input opening of a FRI query.
+  --   opened_values : Vec<Vec<Val>>   (one row of base-field values per matrix)
+  --   opening_proof : Vec<Digest>     (Merkle authentication path)
+  enum BatchOpening {
+    Mk(List‹List‹U64››, List‹Digest›)
+  }
+
+  -- `CommitPhaseProofStep<ExtVal, ExtMmcs>`: one folding step of a FRI query.
+  --   log_arity      : u8
+  --   sibling_values : Vec<Ext>  (arity - 1 siblings)
+  --   opening_proof  : Vec<Digest>
+  enum CommitPhaseProofStep {
+    Mk(U8, List‹Ext›, List‹Digest›)
+  }
+
+  -- `QueryProof<ExtVal, ExtMmcs, Vec<BatchOpening<Val, Mmcs>>>`.
+  enum QueryProof {
+    Mk(List‹BatchOpening›, List‹CommitPhaseProofStep›)
+  }
+
+  -- `PcsProof = FriProof<ExtVal, ExtMmcs, Witness = Val, ..>`.
+  --   commit_phase_commits : Vec<MerkleCap>
+  --   commit_pow_witnesses : Vec<Val>
+  --   query_proofs         : Vec<QueryProof>
+  --   final_poly           : Vec<Ext>
+  --   query_pow_witness    : Val
+  enum FriProof {
+    Mk(List‹MerkleCap›, List‹U64›, List‹QueryProof›, List‹Ext›, U64)
+  }
+
+  -- `Option<OpenedRound>` for `preprocessed_opened_values`. A dedicated
+  -- non-generic enum keeps constructor inference unambiguous.
+  enum PreprocessedOpt {
+    NoPreprocessed,
+    SomePreprocessed(OpenedRound)
+  }
+
+  -- Mirror of `multi_stark::prover::Proof`, in serialization order.
+  enum Proof {
+    Mk(
+      Commitments,        -- commitments
+      List‹Ext›,          -- intermediate_accumulators
+      List‹U8›,           -- log_degrees
+      FriProof,           -- opening_proof
+      OpenedRound,        -- quotient_opened_values
+      PreprocessedOpt,    -- preprocessed_opened_values
+      OpenedRound,        -- stage_1_opened_values
+      OpenedRound         -- stage_2_opened_values
+    )
+  }
+
+  -- ==========================================================================
+  -- Byte-reading primitives. Every reader threads the remaining `ByteStream`.
+  -- ==========================================================================
+
+  fn read_u8(stream: ByteStream) -> (U8, ByteStream) {
+    match load(stream) {
+      ListNode.Cons(byte, rest) => (byte, rest),
+      ListNode.Nil => (0u8, store(ListNode.Nil)),
+    }
+  }
+
+  -- One raw `u64`: 8 little-endian bytes, kept as `U64` (no field reduction).
+  fn read_u64(stream: ByteStream) -> (U64, ByteStream) {
+    let (b0, s0) = read_u8(stream);
+    let (b1, s1) = read_u8(s0);
+    let (b2, s2) = read_u8(s1);
+    let (b3, s3) = read_u8(s2);
+    let (b4, s4) = read_u8(s3);
+    let (b5, s5) = read_u8(s4);
+    let (b6, s6) = read_u8(s5);
+    let (b7, s7) = read_u8(s6);
+    ([b0, b1, b2, b3, b4, b5, b6, b7], s7)
+  }
+
+  -- A bincode length prefix: a fixed-width `u64`, flattened to a field count.
+  -- `flatten_u64` asserts the top byte is zero (< 2^56), always true for the
+  -- list lengths in a proof.
+  fn read_count(stream: ByteStream) -> (G, ByteStream) {
+    let (val, s) = read_u64(stream);
+    (flatten_u64(val), s)
+  }
+
+  -- Interpret a raw little-endian `u64` limb as a Goldilocks field element. The
+  -- field add/mul reduce mod p, so a non-canonical wire repr (e.g. `0` shipped
+  -- as `p = 0xFFFFFFFF00000001`) maps to the canonical field element.
+  fn limb_to_field(b: U64) -> G {
+    to_field(b[0])
+      + 0x100 * to_field(b[1])
+      + 0x10000 * to_field(b[2])
+      + 0x1000000 * to_field(b[3])
+      + 0x100000000 * to_field(b[4])
+      + 0x10000000000 * to_field(b[5])
+      + 0x1000000000000 * to_field(b[6])
+      + 0x100000000000000 * to_field(b[7])
+  }
+
+  -- `ExtVal` -> two `u64` limbs (no length prefix), each canonicalized into a
+  -- non-native Goldilocks element (`gl_reduce` maps a non-canonical wire repr,
+  -- e.g. `0` shipped as `p`, to its canonical `< p` bytes).
+  fn read_ext(stream: ByteStream) -> (Ext, ByteStream) {
+    let (a, s0) = read_u64(stream);
+    let (b, s1) = read_u64(s0);
+    ([gl_reduce(a), gl_reduce(b)], s1)
+  }
+
+
+  -- Merkle digest -> `[u64; 4]`, no length prefix.
+  fn read_digest(stream: ByteStream) -> (Digest, ByteStream) {
+    let (a, s0) = read_u64(stream);
+    let (b, s1) = read_u64(s0);
+    let (c, s2) = read_u64(s1);
+    let (d, s3) = read_u64(s2);
+    ([a, b, c, d], s3)
+  }
+
+  -- ==========================================================================
+  -- Length-prefixed `Vec<T>` readers. First-order, so one pair per element
+  -- type: `read_T_vec` reads the `u64` length then loops `read_T_vec_n`.
+  -- ==========================================================================
+
+  fn read_u8_vec(stream: ByteStream) -> (List‹U8›, ByteStream) {
+    let (n, s) = read_count(stream);
+    read_u8_vec_n(s, n)
+  }
+  fn read_u8_vec_n(stream: ByteStream, n: G) -> (List‹U8›, ByteStream) {
+    match n {
+      0 => (store(ListNode.Nil), stream),
+      _ =>
+        let (x, s) = read_u8(stream);
+        let (rest, s2) = read_u8_vec_n(s, n - 1);
+        (store(ListNode.Cons(x, rest)), s2),
+    }
+  }
+
+  fn read_u64_vec(stream: ByteStream) -> (List‹U64›, ByteStream) {
+    let (n, s) = read_count(stream);
+    read_u64_vec_n(s, n)
+  }
+  fn read_u64_vec_n(stream: ByteStream, n: G) -> (List‹U64›, ByteStream) {
+    match n {
+      0 => (store(ListNode.Nil), stream),
+      _ =>
+        let (x, s) = read_u64(stream);
+        let (rest, s2) = read_u64_vec_n(s, n - 1);
+        (store(ListNode.Cons(x, rest)), s2),
+    }
+  }
+
+  fn read_ext_vec(stream: ByteStream) -> (List‹Ext›, ByteStream) {
+    let (n, s) = read_count(stream);
+    read_ext_vec_n(s, n)
+  }
+  fn read_ext_vec_n(stream: ByteStream, n: G) -> (List‹Ext›, ByteStream) {
+    match n {
+      0 => (store(ListNode.Nil), stream),
+      _ =>
+        let (x, s) = read_ext(stream);
+        let (rest, s2) = read_ext_vec_n(s, n - 1);
+        (store(ListNode.Cons(x, rest)), s2),
+    }
+  }
+
+  fn read_digest_vec(stream: ByteStream) -> (List‹Digest›, ByteStream) {
+    let (n, s) = read_count(stream);
+    read_digest_vec_n(s, n)
+  }
+  fn read_digest_vec_n(stream: ByteStream, n: G) -> (List‹Digest›, ByteStream) {
+    match n {
+      0 => (store(ListNode.Nil), stream),
+      _ =>
+        let (x, s) = read_digest(stream);
+        let (rest, s2) = read_digest_vec_n(s, n - 1);
+        (store(ListNode.Cons(x, rest)), s2),
+    }
+  }
+
+  -- `Vec<Vec<Val>>` for `BatchOpening.opened_values`.
+  fn read_u64_vec_vec(stream: ByteStream) -> (List‹List‹U64››, ByteStream) {
+    let (n, s) = read_count(stream);
+    read_u64_vec_vec_n(s, n)
+  }
+  fn read_u64_vec_vec_n(stream: ByteStream, n: G) -> (List‹List‹U64››, ByteStream) {
+    match n {
+      0 => (store(ListNode.Nil), stream),
+      _ =>
+        let (x, s) = read_u64_vec(stream);
+        let (rest, s2) = read_u64_vec_vec_n(s, n - 1);
+        (store(ListNode.Cons(x, rest)), s2),
+    }
+  }
+
+  -- `OpenedRound = Vec<Vec<Vec<Ext>>>`, built bottom-up from `read_ext_vec`.
+  fn read_ext_vec_vec(stream: ByteStream) -> (List‹List‹Ext››, ByteStream) {
+    let (n, s) = read_count(stream);
+    read_ext_vec_vec_n(s, n)
+  }
+  fn read_ext_vec_vec_n(stream: ByteStream, n: G) -> (List‹List‹Ext››, ByteStream) {
+    match n {
+      0 => (store(ListNode.Nil), stream),
+      _ =>
+        let (x, s) = read_ext_vec(stream);
+        let (rest, s2) = read_ext_vec_vec_n(s, n - 1);
+        (store(ListNode.Cons(x, rest)), s2),
+    }
+  }
+
+  fn read_opened_round(stream: ByteStream) -> (OpenedRound, ByteStream) {
+    let (n, s) = read_count(stream);
+    read_opened_round_n(s, n)
+  }
+  fn read_opened_round_n(stream: ByteStream, n: G) -> (OpenedRound, ByteStream) {
+    match n {
+      0 => (store(ListNode.Nil), stream),
+      _ =>
+        let (x, s) = read_ext_vec_vec(stream);
+        let (rest, s2) = read_opened_round_n(s, n - 1);
+        (store(ListNode.Cons(x, rest)), s2),
+    }
+  }
+
+  -- ==========================================================================
+  -- Struct readers (fields in declaration order, no framing).
+  -- ==========================================================================
+
+  fn read_merkle_cap(stream: ByteStream) -> (MerkleCap, ByteStream) {
+    read_digest_vec(stream)
+  }
+  fn read_merkle_cap_vec(stream: ByteStream) -> (List‹MerkleCap›, ByteStream) {
+    let (n, s) = read_count(stream);
+    read_merkle_cap_vec_n(s, n)
+  }
+  fn read_merkle_cap_vec_n(stream: ByteStream, n: G) -> (List‹MerkleCap›, ByteStream) {
+    match n {
+      0 => (store(ListNode.Nil), stream),
+      _ =>
+        let (x, s) = read_merkle_cap(stream);
+        let (rest, s2) = read_merkle_cap_vec_n(s, n - 1);
+        (store(ListNode.Cons(x, rest)), s2),
+    }
+  }
+
+  fn read_commitments(stream: ByteStream) -> (Commitments, ByteStream) {
+    let (stage_1_trace, s0) = read_merkle_cap(stream);
+    let (stage_2_trace, s1) = read_merkle_cap(s0);
+    let (quotient_chunks, s2) = read_merkle_cap(s1);
+    (Commitments.Mk(stage_1_trace, stage_2_trace, quotient_chunks), s2)
+  }
+
+  fn read_batch_opening(stream: ByteStream) -> (BatchOpening, ByteStream) {
+    let (opened_values, s0) = read_u64_vec_vec(stream);
+    let (opening_proof, s1) = read_digest_vec(s0);
+    (BatchOpening.Mk(opened_values, opening_proof), s1)
+  }
+  fn read_batch_opening_vec(stream: ByteStream) -> (List‹BatchOpening›, ByteStream) {
+    let (n, s) = read_count(stream);
+    read_batch_opening_vec_n(s, n)
+  }
+  fn read_batch_opening_vec_n(stream: ByteStream, n: G) -> (List‹BatchOpening›, ByteStream) {
+    match n {
+      0 => (store(ListNode.Nil), stream),
+      _ =>
+        let (x, s) = read_batch_opening(stream);
+        let (rest, s2) = read_batch_opening_vec_n(s, n - 1);
+        (store(ListNode.Cons(x, rest)), s2),
+    }
+  }
+
+  fn read_commit_phase_step(stream: ByteStream) -> (CommitPhaseProofStep, ByteStream) {
+    let (log_arity, s0) = read_u8(stream);
+    let (sibling_values, s1) = read_ext_vec(s0);
+    let (opening_proof, s2) = read_digest_vec(s1);
+    (CommitPhaseProofStep.Mk(log_arity, sibling_values, opening_proof), s2)
+  }
+  fn read_commit_phase_step_vec(stream: ByteStream) -> (List‹CommitPhaseProofStep›, ByteStream) {
+    let (n, s) = read_count(stream);
+    read_commit_phase_step_vec_n(s, n)
+  }
+  fn read_commit_phase_step_vec_n(stream: ByteStream, n: G) -> (List‹CommitPhaseProofStep›, ByteStream) {
+    match n {
+      0 => (store(ListNode.Nil), stream),
+      _ =>
+        let (x, s) = read_commit_phase_step(stream);
+        let (rest, s2) = read_commit_phase_step_vec_n(s, n - 1);
+        (store(ListNode.Cons(x, rest)), s2),
+    }
+  }
+
+  fn read_query_proof(stream: ByteStream) -> (QueryProof, ByteStream) {
+    let (input_proof, s0) = read_batch_opening_vec(stream);
+    let (commit_phase_openings, s1) = read_commit_phase_step_vec(s0);
+    (QueryProof.Mk(input_proof, commit_phase_openings), s1)
+  }
+  fn read_query_proof_vec(stream: ByteStream) -> (List‹QueryProof›, ByteStream) {
+    let (n, s) = read_count(stream);
+    read_query_proof_vec_n(s, n)
+  }
+  fn read_query_proof_vec_n(stream: ByteStream, n: G) -> (List‹QueryProof›, ByteStream) {
+    match n {
+      0 => (store(ListNode.Nil), stream),
+      _ =>
+        let (x, s) = read_query_proof(stream);
+        let (rest, s2) = read_query_proof_vec_n(s, n - 1);
+        (store(ListNode.Cons(x, rest)), s2),
+    }
+  }
+
+  fn read_fri_proof(stream: ByteStream) -> (FriProof, ByteStream) {
+    let (commit_phase_commits, s0) = read_merkle_cap_vec(stream);
+    let (commit_pow_witnesses, s1) = read_u64_vec(s0);
+    let (query_proofs, s2) = read_query_proof_vec(s1);
+    let (final_poly, s3) = read_ext_vec(s2);
+    let (query_pow_witness, s4) = read_u64(s3);
+    (FriProof.Mk(commit_phase_commits, commit_pow_witnesses, query_proofs,
+                 final_poly, query_pow_witness), s4)
+  }
+
+  -- `Option<OpenedRound>`: 1 tag byte, then the value when `Some`.
+  fn read_preprocessed(stream: ByteStream) -> (PreprocessedOpt, ByteStream) {
+    let (tag, s) = read_u8(stream);
+    match tag {
+      0 => (PreprocessedOpt.NoPreprocessed, s),
+      _ =>
+        let (r, s2) = read_opened_round(s);
+        (PreprocessedOpt.SomePreprocessed(r), s2),
+    }
+  }
+
+  -- Full `Proof` in declaration order (see `manual_codec.rs::Reader::proof`).
+  fn read_proof(stream: ByteStream) -> (Proof, ByteStream) {
+    let (commitments, s0) = read_commitments(stream);
+    let (intermediate_accumulators, s1) = read_ext_vec(s0);
+    let (log_degrees, s2) = read_u8_vec(s1);
+    let (opening_proof, s3) = read_fri_proof(s2);
+    let (quotient_opened_values, s4) = read_opened_round(s3);
+    let (preprocessed_opened_values, s5) = read_preprocessed(s4);
+    let (stage_1_opened_values, s6) = read_opened_round(s5);
+    let (stage_2_opened_values, s7) = read_opened_round(s6);
+    (Proof.Mk(commitments, intermediate_accumulators, log_degrees, opening_proof,
+              quotient_opened_values, preprocessed_opened_values,
+              stage_1_opened_values, stage_2_opened_values), s7)
+  }
+⟧
+
+end MultiStark
+
+end
diff --git a/Ix/MultiStark/Goldilocks.lean b/Ix/MultiStark/Goldilocks.lean
new file mode 100644
index 00000000..6bfc8c3a
--- /dev/null
+++ b/Ix/MultiStark/Goldilocks.lean
@@ -0,0 +1,318 @@
+module
+public import Ix.Aiur.Meta
+
+/-!
+# Non-native Goldilocks field arithmetic (`Goldilocks = [U8; 8]`)
+
+The recursive verifier computes on the *inner* proof's field (Goldilocks),
+but Aiur compiles the verifier to its *own* (outer) field — which is **not
+always Goldilocks** (it may be the BLS scalar field). So inner-field arithmetic
+cannot use Aiur's native `+`/`*` (those compute in the outer field); it must be
+**emulated** on bytes.
+
+A `Goldilocks` element is its canonical value `< p = 2⁶⁴ − 2³² + 1` held as 8
+little-endian bytes (`[U8; 8]`), and `ExtGoldilocks = [Goldilocks; 2]` is the
+degree-2 extension `𝔽_p[X]/(X² − 7)`. All ops are built from the u8 gadgets
+(`u8_add`/`u8_sub`/`u8_mul` give full carry/borrow/16-bit-product), so they are
+field-agnostic and produce bytes directly (no field→byte decomposition needed).
+
+`EPSILON = 2³² − 1`, and `2⁶⁴ ≡ EPSILON (mod p)`, the basis of the reductions.
+
+Validated byte-for-byte against `multi-stark`'s native Goldilocks (`gl_ops_ref`
+in `multi-stark/src/types.rs`).
+-/
+
+public section
+
+namespace MultiStark
+
+def goldilocks := ⟦
+  type Goldilocks = [U8; 8]
+  type ExtGoldilocks = [Goldilocks; 2]
+
+  -- p = 2⁶⁴ − 2³² + 1 and EPSILON = 2³² − 1, little-endian.
+  fn gl_p() -> Goldilocks { [1u8, 0u8, 0u8, 0u8, 255u8, 255u8, 255u8, 255u8] }
+  fn gl_eps() -> Goldilocks { [255u8, 255u8, 255u8, 255u8, 0u8, 0u8, 0u8, 0u8] }
+  fn gl_zero() -> Goldilocks { [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8] }
+  fn gl_one() -> Goldilocks { [1u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8] }
+
+  -- ==========================================================================
+  -- Byte-vector primitives (carry/borrow chains over the u8 gadgets).
+  -- ==========================================================================
+
+  -- Add two bytes with a carry-in (0/1). Returns (sum byte, carry-out 0/1).
+  -- `u8_add` adds only two bytes; the carry-in is folded with a second add. At
+  -- most one of the two overflow flags fires, so the carry-out stays 0/1.
+  fn adc(a: U8, b: U8, cin: G) -> (U8, G) {
+    let (s1, o1) = u8_add(a, b);
+    let (s2, o2) = u8_add(s1, u8_from_field_unsafe(cin));
+    (s2, to_field(o1) + to_field(o2))
+  }
+
+  -- Subtract two bytes with a borrow-in (0/1). Returns (diff byte, borrow-out).
+  fn sbb(a: U8, b: U8, bin: G) -> (U8, G) {
+    let (d1, w1) = u8_sub(a, b);
+    let (d2, w2) = u8_sub(d1, u8_from_field_unsafe(bin));
+    (d2, to_field(w1) + to_field(w2))
+  }
+
+  -- 8-byte (64-bit) add. Returns the 8-byte sum and the final carry (0/1).
+  fn add8(a: Goldilocks, b: Goldilocks) -> (Goldilocks, G) {
+    let (r0, c0) = adc(a[0], b[0], 0);
+    let (r1, c1) = adc(a[1], b[1], c0);
+    let (r2, c2) = adc(a[2], b[2], c1);
+    let (r3, c3) = adc(a[3], b[3], c2);
+    let (r4, c4) = adc(a[4], b[4], c3);
+    let (r5, c5) = adc(a[5], b[5], c4);
+    let (r6, c6) = adc(a[6], b[6], c5);
+    let (r7, c7) = adc(a[7], b[7], c6);
+    ([r0, r1, r2, r3, r4, r5, r6, r7], c7)
+  }
+
+  -- 8-byte (64-bit) subtract. Returns the 8-byte difference and final borrow.
+  fn sub8(a: Goldilocks, b: Goldilocks) -> (Goldilocks, G) {
+    let (r0, w0) = sbb(a[0], b[0], 0);
+    let (r1, w1) = sbb(a[1], b[1], w0);
+    let (r2, w2) = sbb(a[2], b[2], w1);
+    let (r3, w3) = sbb(a[3], b[3], w2);
+    let (r4, w4) = sbb(a[4], b[4], w3);
+    let (r5, w5) = sbb(a[5], b[5], w4);
+    let (r6, w6) = sbb(a[6], b[6], w5);
+    let (r7, w7) = sbb(a[7], b[7], w6);
+    ([r0, r1, r2, r3, r4, r5, r6, r7], w7)
+  }
+
+  -- Branchless select on a 0/1 flag: `cond ? x : y` (per byte). `cond` is a
+  -- field element in {0,1}; each result byte equals x[i] or y[i] (both < 256),
+  -- so `u8_from_field_unsafe` is safe.
+  fn sel(cond: G, x: U8, y: U8) -> U8 {
+    u8_from_field_unsafe(to_field(x) * cond + to_field(y) * (1 - cond))
+  }
+  fn select8(cond: G, x: Goldilocks, y: Goldilocks) -> Goldilocks {
+    [sel(cond, x[0], y[0]), sel(cond, x[1], y[1]), sel(cond, x[2], y[2]),
+     sel(cond, x[3], y[3]), sel(cond, x[4], y[4]), sel(cond, x[5], y[5]),
+     sel(cond, x[6], y[6]), sel(cond, x[7], y[7])]
+  }
+
+  -- ==========================================================================
+  -- Base field add / sub (mod p).
+  -- ==========================================================================
+
+  -- a + b mod p. The raw sum is `c·2⁶⁴ + s` with `c ∈ {0,1}`, value `< 2p`:
+  --   • c = 1 ⇒ s < p (since 2⁶⁴ > p), and `2⁶⁴ + s − p = s + EPSILON`
+  --     (a non-overflowing add since the true result is < p);
+  --   • c = 0 ⇒ reduce by one conditional subtraction of p (when s ≥ p).
+  fn gl_add(a: Goldilocks, b: Goldilocks) -> Goldilocks {
+    let (s, c) = add8(a, b);
+    let (s_minus_p, borrow) = sub8(s, gl_p());  -- borrow = 1 iff s < p
+    let (s_plus_eps, _) = add8(s, gl_eps());
+    let c0 = select8(1 - borrow, s_minus_p, s); -- c=0 branch: (s≥p ? s−p : s)
+    select8(c, s_plus_eps, c0)
+  }
+
+  -- a − b mod p. `sub8` borrow = 1 iff a < b, in which case the true result is
+  -- `(a − b mod 2⁶⁴) − EPSILON = a − b + p`.
+  fn gl_sub(a: Goldilocks, b: Goldilocks) -> Goldilocks {
+    let (d, borrow) = sub8(a, b);
+    let (d_minus_eps, _) = sub8(d, gl_eps());
+    select8(borrow, d_minus_eps, d)
+  }
+
+  fn gl_neg(a: Goldilocks) -> Goldilocks { gl_sub(gl_zero(), a) }
+  fn gl_seven() -> Goldilocks { [7u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8] }
+  fn gl_two() -> Goldilocks { [2u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8] }
+
+  -- Canonicalize an arbitrary 8-byte value (possibly ≥ p, e.g. a raw hash limb
+  -- or a non-canonical wire encoding) to the canonical representative `< p`.
+  -- Any 8-byte value is `< 2⁶⁴ < 2p`, so one conditional subtraction of p
+  -- suffices (borrow = 1 iff x < p ⇒ already canonical).
+  fn gl_reduce(x: Goldilocks) -> Goldilocks {
+    let (xmp, borrow) = sub8(x, gl_p());
+    select8(borrow, x, xmp)
+  }
+
+  -- 1 iff `x` is the field zero (canonicalize first, then test all bytes zero).
+  -- The byte sum is `< 8·256 = 2048`, so `eq_zero` of the sum is field-agnostic.
+  fn gl_is_zero(x: Goldilocks) -> G {
+    let r = gl_reduce(x);
+    eq_zero(to_field(r[0]) + to_field(r[1]) + to_field(r[2]) + to_field(r[3])
+          + to_field(r[4]) + to_field(r[5]) + to_field(r[6]) + to_field(r[7]))
+  }
+
+  -- 1 iff two canonical Goldilocks values are equal (byte-exact). Inputs must be
+  -- canonical (`gl_*` outputs always are); each `to_field` difference is small.
+  fn gl_eq(a: Goldilocks, b: Goldilocks) -> G {
+    eq_zero(to_field(a[0]) - to_field(b[0])) * eq_zero(to_field(a[1]) - to_field(b[1]))
+      * eq_zero(to_field(a[2]) - to_field(b[2])) * eq_zero(to_field(a[3]) - to_field(b[3]))
+      * eq_zero(to_field(a[4]) - to_field(b[4])) * eq_zero(to_field(a[5]) - to_field(b[5]))
+      * eq_zero(to_field(a[6]) - to_field(b[6])) * eq_zero(to_field(a[7]) - to_field(b[7]))
+  }
+
+  -- ==========================================================================
+  -- Base field multiply (mod p): schoolbook 64×64 → 128-bit product, then the
+  -- Goldilocks `reduce128` (ported from Plonky3 `goldilocks.rs`).
+  -- ==========================================================================
+
+  -- 8-byte × 1-byte → 9-byte product (`a · m`, with `a < 2⁶⁴`, `m < 2⁸`, so the
+  -- result is `< 2⁷²`). Column `k` is `lo_k + hi_{k-1} + carry`, and the carry
+  -- provably stays in {0,1} (each column sum ≤ 255+255+1 = 511 < 512).
+  fn mul1(a: Goldilocks, m: U8) -> [U8; 9] {
+    let (l0, h0) = u8_mul(a[0], m);
+    let (l1, h1) = u8_mul(a[1], m);
+    let (l2, h2) = u8_mul(a[2], m);
+    let (l3, h3) = u8_mul(a[3], m);
+    let (l4, h4) = u8_mul(a[4], m);
+    let (l5, h5) = u8_mul(a[5], m);
+    let (l6, h6) = u8_mul(a[6], m);
+    let (l7, h7) = u8_mul(a[7], m);
+    let (r1, c1) = adc(l1, h0, 0);
+    let (r2, c2) = adc(l2, h1, c1);
+    let (r3, c3) = adc(l3, h2, c2);
+    let (r4, c4) = adc(l4, h3, c3);
+    let (r5, c5) = adc(l5, h4, c4);
+    let (r6, c6) = adc(l6, h5, c5);
+    let (r7, c7) = adc(l7, h6, c6);
+    let (r8, _)  = adc(h7, 0u8, c7);
+    [l0, r1, r2, r3, r4, r5, r6, r7, r8]
+  }
+
+  -- 16-byte (128-bit) add, carry chain. The final carry is discarded (used only
+  -- to accumulate partial-product rows whose total provably fits in 128 bits).
+  fn add16(x: [U8; 16], y: [U8; 16]) -> [U8; 16] {
+    let (r0,  c0)  = adc(x[0],  y[0],  0);
+    let (r1,  c1)  = adc(x[1],  y[1],  c0);
+    let (r2,  c2)  = adc(x[2],  y[2],  c1);
+    let (r3,  c3)  = adc(x[3],  y[3],  c2);
+    let (r4,  c4)  = adc(x[4],  y[4],  c3);
+    let (r5,  c5)  = adc(x[5],  y[5],  c4);
+    let (r6,  c6)  = adc(x[6],  y[6],  c5);
+    let (r7,  c7)  = adc(x[7],  y[7],  c6);
+    let (r8,  c8)  = adc(x[8],  y[8],  c7);
+    let (r9,  c9)  = adc(x[9],  y[9],  c8);
+    let (r10, c10) = adc(x[10], y[10], c9);
+    let (r11, c11) = adc(x[11], y[11], c10);
+    let (r12, c12) = adc(x[12], y[12], c11);
+    let (r13, c13) = adc(x[13], y[13], c12);
+    let (r14, c14) = adc(x[14], y[14], c13);
+    let (r15, _)   = adc(x[15], y[15], c14);
+    [r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15]
+  }
+
+  -- Full 64×64 → 128-bit product as 16 little-endian bytes. Eight single-byte
+  -- rows `a · b[i]`, each shifted left by `i` bytes and accumulated.
+  fn mul128(a: Goldilocks, b: Goldilocks) -> [U8; 16] {
+    let m0 = mul1(a, b[0]);
+    let m1 = mul1(a, b[1]);
+    let m2 = mul1(a, b[2]);
+    let m3 = mul1(a, b[3]);
+    let m4 = mul1(a, b[4]);
+    let m5 = mul1(a, b[5]);
+    let m6 = mul1(a, b[6]);
+    let m7 = mul1(a, b[7]);
+    let acc = [m0[0], m0[1], m0[2], m0[3], m0[4], m0[5], m0[6], m0[7], m0[8],
+               0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8];
+    let rb1 = [0u8, m1[0], m1[1], m1[2], m1[3], m1[4], m1[5], m1[6], m1[7], m1[8],
+               0u8, 0u8, 0u8, 0u8, 0u8, 0u8];
+    let acc = add16(acc, rb1);
+    let rb2 = [0u8, 0u8, m2[0], m2[1], m2[2], m2[3], m2[4], m2[5], m2[6], m2[7], m2[8],
+               0u8, 0u8, 0u8, 0u8, 0u8];
+    let acc = add16(acc, rb2);
+    let rb3 = [0u8, 0u8, 0u8, m3[0], m3[1], m3[2], m3[3], m3[4], m3[5], m3[6], m3[7], m3[8],
+               0u8, 0u8, 0u8, 0u8];
+    let acc = add16(acc, rb3);
+    let rb4 = [0u8, 0u8, 0u8, 0u8, m4[0], m4[1], m4[2], m4[3], m4[4], m4[5], m4[6], m4[7], m4[8],
+               0u8, 0u8, 0u8];
+    let acc = add16(acc, rb4);
+    let rb5 = [0u8, 0u8, 0u8, 0u8, 0u8, m5[0], m5[1], m5[2], m5[3], m5[4], m5[5], m5[6], m5[7], m5[8],
+               0u8, 0u8];
+    let acc = add16(acc, rb5);
+    let rb6 = [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, m6[0], m6[1], m6[2], m6[3], m6[4], m6[5], m6[6], m6[7], m6[8],
+               0u8];
+    let acc = add16(acc, rb6);
+    let rb7 = [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, m7[0], m7[1], m7[2], m7[3], m7[4], m7[5], m7[6], m7[7], m7[8]];
+    add16(acc, rb7)
+  }
+
+  -- Reduce a 128-bit value (16 LE bytes) mod p. Port of Plonky3 `reduce128`:
+  -- with `x = x_lo + x_hi·2⁶⁴`, split `x_hi = x_hi_lo + x_hi_hi·2³²`; then
+  -- `2⁶⁴ ≡ EPSILON` and `2⁹⁶ ≡ −1` give `x ≡ x_lo − x_hi_hi + x_hi_lo·EPSILON`.
+  -- A final conditional subtraction of p canonicalizes (Plonky3 keeps a
+  -- non-canonical u64; we need the canonical bytes).
+  fn reduce128(p: [U8; 16]) -> Goldilocks {
+    let xlo = [p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]];
+    let xhl = [p[8], p[9], p[10], p[11], 0u8, 0u8, 0u8, 0u8];   -- low 32 bits of x_hi
+    let xhh = [p[12], p[13], p[14], p[15], 0u8, 0u8, 0u8, 0u8]; -- high 32 bits of x_hi
+    -- t0 = x_lo − x_hi_hi (subtract EPSILON on borrow; cannot underflow).
+    let (t0w, borrow) = sub8(xlo, xhh);
+    let (t0b, _) = sub8(t0w, gl_eps());
+    let t0 = select8(borrow, t0b, t0w);
+    -- t1 = x_hi_lo · EPSILON  (< 2⁶⁴, so only the low 8 product bytes matter).
+    let t1full = mul128(xhl, gl_eps());
+    let t1 = [t1full[0], t1full[1], t1full[2], t1full[3],
+              t1full[4], t1full[5], t1full[6], t1full[7]];
+    -- t2 = t0 + t1  (add EPSILON on overflow; result stays < 2⁶⁴).
+    let (t2w, carry) = add8(t0, t1);
+    let (t2c, _) = add8(t2w, gl_eps());
+    let t2 = select8(carry, t2c, t2w);
+    -- canonicalize: borrow2 = 1 iff t2 < p ⇒ keep t2, else subtract p.
+    let (t2mp, borrow2) = sub8(t2, gl_p());
+    select8(borrow2, t2, t2mp)
+  }
+
+  fn gl_mul(a: Goldilocks, b: Goldilocks) -> Goldilocks { reduce128(mul128(a, b)) }
+  fn gl_sq(a: Goldilocks) -> Goldilocks { gl_mul(a, a) }
+
+  -- ==========================================================================
+  -- Base field inverse / divide via Fermat: a⁻¹ = a^(p−2).
+  -- p − 2 = 2⁶⁴ − 2³² − 1 — in binary: bit 32 is 0, every other bit is 1
+  -- (31 ones, one zero, 32 ones). `gl_run` applies `n` steps of `acc ← acc²·b`.
+  -- ==========================================================================
+  fn gl_run(acc: Goldilocks, base: Goldilocks, n: G) -> Goldilocks {
+    match n {
+      0 => acc,
+      _ => gl_run(gl_mul(gl_sq(acc), base), base, n - 1),
+    }
+  }
+  fn gl_inverse(x: Goldilocks) -> Goldilocks {
+    let acc = gl_run(x, x, 30);  -- bits 63..33: 31 ones (initial acc = x is bit 63)
+    let acc = gl_sq(acc);        -- bit 32: a single 0
+    gl_run(acc, x, 32)           -- bits 31..0: 32 ones
+  }
+  fn gl_div(a: Goldilocks, b: Goldilocks) -> Goldilocks { gl_mul(a, gl_inverse(b)) }
+
+  -- ==========================================================================
+  -- Extension field ExtGoldilocks = 𝔽_p[X]/(X² − 7).
+  -- ==========================================================================
+  fn eg_add(a: ExtGoldilocks, b: ExtGoldilocks) -> ExtGoldilocks {
+    [gl_add(a[0], b[0]), gl_add(a[1], b[1])]
+  }
+  fn eg_sub(a: ExtGoldilocks, b: ExtGoldilocks) -> ExtGoldilocks {
+    [gl_sub(a[0], b[0]), gl_sub(a[1], b[1])]
+  }
+  fn eg_neg(a: ExtGoldilocks) -> ExtGoldilocks {
+    [gl_neg(a[0]), gl_neg(a[1])]
+  }
+  -- (a0 + a1·X)(b0 + b1·X) = (a0·b0 + 7·a1·b1) + (a0·b1 + a1·b0)·X.
+  fn eg_mul(a: ExtGoldilocks, b: ExtGoldilocks) -> ExtGoldilocks {
+    [gl_add(gl_mul(a[0], b[0]), gl_mul(gl_seven(), gl_mul(a[1], b[1]))),
+     gl_add(gl_mul(a[0], b[1]), gl_mul(a[1], b[0]))]
+  }
+  -- conjugate ā = a0 − a1·X, norm a·ā = a0² − 7·a1² ∈ 𝔽_p, a⁻¹ = ā / norm.
+  fn eg_inverse(a: ExtGoldilocks) -> ExtGoldilocks {
+    let norm = gl_sub(gl_sq(a[0]), gl_mul(gl_seven(), gl_sq(a[1])));
+    let ninv = gl_inverse(norm);
+    [gl_mul(a[0], ninv), gl_mul(gl_neg(a[1]), ninv)]
+  }
+  fn eg_div(a: ExtGoldilocks, b: ExtGoldilocks) -> ExtGoldilocks {
+    eg_mul(a, eg_inverse(b))
+  }
+  -- 1 iff two extension elements are equal (both coordinates byte-exact).
+  fn eg_eq(a: ExtGoldilocks, b: ExtGoldilocks) -> G {
+    gl_eq(a[0], b[0]) * gl_eq(a[1], b[1])
+  }
+
+⟧
+
+end MultiStark
+
+end
diff --git a/Ix/MultiStark/Keccak.lean b/Ix/MultiStark/Keccak.lean
new file mode 100644
index 00000000..a2eb7e73
--- /dev/null
+++ b/Ix/MultiStark/Keccak.lean
@@ -0,0 +1,355 @@
+module
+public import Ix.Aiur.Meta
+public import Ix.IxVM.Core
+public import Ix.IxVM.ByteStream
+
+/-!
+# Keccak-256 in Aiur
+
+The hash multi-stark uses (`p3_keccak`: `Keccak256Hash` + `KeccakF` permutation,
+keccak-f[1600] sponge). This is the Ethereum/original Keccak variant
+(`tiny_keccak::Keccak::v256`): pad `10*1` with first pad byte `0x01` and final
+`0x80`, rate = 136 bytes (17 lanes), capacity 512, 256-bit output.
+
+Representation:
+* lane  = 64-bit word = `[U8; 8]`, little-endian bytes (byte 0 = least significant).
+* state = `[Lane; 25]`, lane index `L(x,y) = x + 5*y`.
+
+Bit ops use the `u8` gadgets (`u8_xor`/`u8_and`; NOT = `u8_xor(b, 0xFF)`).
+Rotations go through `u8_bit_decomposition` (u8 → `[field; 8]`) and a generic
+bit-list rotate, recomposing bytes with `u8_from_field_unsafe`.
+
+**Scope:** single-block only — message length ≤ 135 bytes (one keccak-f call).
+Multi-block absorption is future work.
+-/
+
+public section
+
+namespace MultiStark
+
+def keccak := ⟦
+  -- A 64-bit Keccak lane: a pointer to 8 little-endian bytes. Storing lanes
+  -- behind a pointer keeps the state (and every lane-passing function) one
+  -- column wide instead of eight.
+  type Lane = &[U8; 8]
+
+  -- The keccak-f state: a pointer to the 5×5 = 25 lanes. Passed behind a
+  -- pointer too, so state-threading functions stay one column wide.
+  type State = &[Lane; 25]
+
+  -- ==========================================================================
+  -- Lane bit-logic (byte-wise u8 gadgets).
+  -- ==========================================================================
+
+  fn xor8(a: Lane, b: Lane) -> Lane {
+    let x = load(a);
+    let y = load(b);
+    store([u8_xor(x[0], y[0]), u8_xor(x[1], y[1]), u8_xor(x[2], y[2]), u8_xor(x[3], y[3]),
+           u8_xor(x[4], y[4]), u8_xor(x[5], y[5]), u8_xor(x[6], y[6]), u8_xor(x[7], y[7])])
+  }
+
+  fn and8(a: Lane, b: Lane) -> Lane {
+    let x = load(a);
+    let y = load(b);
+    store([u8_and(x[0], y[0]), u8_and(x[1], y[1]), u8_and(x[2], y[2]), u8_and(x[3], y[3]),
+           u8_and(x[4], y[4]), u8_and(x[5], y[5]), u8_and(x[6], y[6]), u8_and(x[7], y[7])])
+  }
+
+  -- Bitwise NOT via XOR with 0xFF (keeps the byte `u8`-typed).
+  fn not8(a: Lane) -> Lane {
+    let x = load(a);
+    store([u8_xor(x[0], 255u8), u8_xor(x[1], 255u8), u8_xor(x[2], 255u8), u8_xor(x[3], 255u8),
+           u8_xor(x[4], 255u8), u8_xor(x[5], 255u8), u8_xor(x[6], 255u8), u8_xor(x[7], 255u8)])
+  }
+
+  -- ==========================================================================
+  -- Generic cyclic left-rotation of a lane by `n` bits (0 ≤ n < 64).
+  -- Decompose to a 64-element bit list (LSB first), reindex, recompose.
+  -- ==========================================================================
+
+  -- Prepend the 8 bits of a byte decomposition (LSB first) onto `tail`.
+  fn cons8(d: [G; 8], tail: List‹G›) -> List‹G› {
+    store(ListNode.Cons(d[0], store(ListNode.Cons(d[1], store(ListNode.Cons(d[2],
+    store(ListNode.Cons(d[3], store(ListNode.Cons(d[4], store(ListNode.Cons(d[5],
+    store(ListNode.Cons(d[6], store(ListNode.Cons(d[7], tail))))))))))))))))
+  }
+
+  -- Lane → 64 bits, index z = 8*byte + bit, LSB first.
+  fn lane_bits(l: Lane) -> List‹G› {
+    let v = load(l);
+    cons8(u8_bit_decomposition(v[0]),
+    cons8(u8_bit_decomposition(v[1]),
+    cons8(u8_bit_decomposition(v[2]),
+    cons8(u8_bit_decomposition(v[3]),
+    cons8(u8_bit_decomposition(v[4]),
+    cons8(u8_bit_decomposition(v[5]),
+    cons8(u8_bit_decomposition(v[6]),
+    cons8(u8_bit_decomposition(v[7]), store(ListNode.Nil)))))))))
+  }
+
+  -- x mod 64 for x in [0, 127]: drop the 64-bit (bit 6).
+  fn mod64(x: G) -> G {
+    let bits = u8_bit_decomposition(u8_from_field_unsafe(x));
+    x - 64 * bits[6]
+  }
+
+  -- Recompose a byte (as `u8`) from bits[off .. off+7] (LSB first). Indexed
+  -- access is inlined as `list_drop` (not `list_lookup`): `list_drop` recurses
+  -- on the same list pointer, so the consecutive drops here share the cached
+  -- drop-chain (far better caching than `list_lookup`).
+  fn byte_from_bits(bs: List‹G›, off: G) -> U8 {
+    -- One `list_drop` to the byte's first bit; the remaining 7 bits are the
+    -- successive tails (a deref each), so no further drops are needed.
+    let &ListNode.Cons(b0, r0) = list_drop(bs, off);
+    let &ListNode.Cons(b1, r1) = r0;
+    let &ListNode.Cons(b2, r2) = r1;
+    let &ListNode.Cons(b3, r3) = r2;
+    let &ListNode.Cons(b4, r4) = r3;
+    let &ListNode.Cons(b5, r5) = r4;
+    let &ListNode.Cons(b6, r6) = r5;
+    let &ListNode.Cons(b7, _) = r6;
+    u8_from_field_unsafe(
+      b0 + 2 * b1 + 4 * b2 + 8 * b3 + 16 * b4 + 32 * b5 + 64 * b6 + 128 * b7)
+  }
+
+  fn lane_from_bits(bs: List‹G›) -> Lane {
+    store([byte_from_bits(bs, 0), byte_from_bits(bs, 8), byte_from_bits(bs, 16), byte_from_bits(bs, 24),
+           byte_from_bits(bs, 32), byte_from_bits(bs, 40), byte_from_bits(bs, 48), byte_from_bits(bs, 56)])
+  }
+
+  -- Build the rotated bit list: output position p (0..63) takes source
+  -- bit (p - n) mod 64 = mod64(p + 64 - n).
+  fn build_rot(bs: List‹G›, n: G, p: G) -> List‹G› {
+    match 64 - p {
+      0 => store(ListNode.Nil),
+      _ =>
+        let src = mod64(p + 64 - n);
+        let &ListNode.Cons(head, _) = list_drop(bs, src);
+        store(ListNode.Cons(head, build_rot(bs, n, p + 1))),
+    }
+  }
+
+  fn rotl(l: Lane, n: G) -> Lane {
+    lane_from_bits(build_rot(lane_bits(l), n, 0))
+  }
+
+  -- ==========================================================================
+  -- keccak-f[1600]: 24 rounds of θ ρ π χ ι.
+  -- ==========================================================================
+
+  fn keccak_round(sp: State, rc: Lane) -> State {
+    let s = load(sp);
+    -- θ: column parities and the D correction.
+    let c0 = xor8(xor8(xor8(xor8(s[0], s[5]), s[10]), s[15]), s[20]);
+    let c1 = xor8(xor8(xor8(xor8(s[1], s[6]), s[11]), s[16]), s[21]);
+    let c2 = xor8(xor8(xor8(xor8(s[2], s[7]), s[12]), s[17]), s[22]);
+    let c3 = xor8(xor8(xor8(xor8(s[3], s[8]), s[13]), s[18]), s[23]);
+    let c4 = xor8(xor8(xor8(xor8(s[4], s[9]), s[14]), s[19]), s[24]);
+    let d0 = xor8(c4, rotl(c1, 1));
+    let d1 = xor8(c0, rotl(c2, 1));
+    let d2 = xor8(c1, rotl(c3, 1));
+    let d3 = xor8(c2, rotl(c4, 1));
+    let d4 = xor8(c3, rotl(c0, 1));
+    let a0 = xor8(s[0], d0);   let a1 = xor8(s[1], d1);   let a2 = xor8(s[2], d2);
+    let a3 = xor8(s[3], d3);   let a4 = xor8(s[4], d4);
+    let a5 = xor8(s[5], d0);   let a6 = xor8(s[6], d1);   let a7 = xor8(s[7], d2);
+    let a8 = xor8(s[8], d3);   let a9 = xor8(s[9], d4);
+    let a10 = xor8(s[10], d0); let a11 = xor8(s[11], d1); let a12 = xor8(s[12], d2);
+    let a13 = xor8(s[13], d3); let a14 = xor8(s[14], d4);
+    let a15 = xor8(s[15], d0); let a16 = xor8(s[16], d1); let a17 = xor8(s[17], d2);
+    let a18 = xor8(s[18], d3); let a19 = xor8(s[19], d4);
+    let a20 = xor8(s[20], d0); let a21 = xor8(s[21], d1); let a22 = xor8(s[22], d2);
+    let a23 = xor8(s[23], d3); let a24 = xor8(s[24], d4);
+    -- ρ + π: B[dest] = rotl(A'[src], offset).
+    let b0  = a0;
+    let b1  = rotl(a6, 44);
+    let b2  = rotl(a12, 43);
+    let b3  = rotl(a18, 21);
+    let b4  = rotl(a24, 14);
+    let b5  = rotl(a3, 28);
+    let b6  = rotl(a9, 20);
+    let b7  = rotl(a10, 3);
+    let b8  = rotl(a16, 45);
+    let b9  = rotl(a22, 61);
+    let b10 = rotl(a1, 1);
+    let b11 = rotl(a7, 6);
+    let b12 = rotl(a13, 25);
+    let b13 = rotl(a19, 8);
+    let b14 = rotl(a20, 18);
+    let b15 = rotl(a4, 27);
+    let b16 = rotl(a5, 36);
+    let b17 = rotl(a11, 10);
+    let b18 = rotl(a17, 15);
+    let b19 = rotl(a23, 56);
+    let b20 = rotl(a2, 62);
+    let b21 = rotl(a8, 55);
+    let b22 = rotl(a14, 39);
+    let b23 = rotl(a15, 41);
+    let b24 = rotl(a21, 2);
+    -- χ: A''[x][y] = B[x][y] ^ (¬B[x+1][y] & B[x+2][y]).
+    let e0  = xor8(b0,  and8(not8(b1),  b2));
+    let e1  = xor8(b1,  and8(not8(b2),  b3));
+    let e2  = xor8(b2,  and8(not8(b3),  b4));
+    let e3  = xor8(b3,  and8(not8(b4),  b0));
+    let e4  = xor8(b4,  and8(not8(b0),  b1));
+    let e5  = xor8(b5,  and8(not8(b6),  b7));
+    let e6  = xor8(b6,  and8(not8(b7),  b8));
+    let e7  = xor8(b7,  and8(not8(b8),  b9));
+    let e8  = xor8(b8,  and8(not8(b9),  b5));
+    let e9  = xor8(b9,  and8(not8(b5),  b6));
+    let e10 = xor8(b10, and8(not8(b11), b12));
+    let e11 = xor8(b11, and8(not8(b12), b13));
+    let e12 = xor8(b12, and8(not8(b13), b14));
+    let e13 = xor8(b13, and8(not8(b14), b10));
+    let e14 = xor8(b14, and8(not8(b10), b11));
+    let e15 = xor8(b15, and8(not8(b16), b17));
+    let e16 = xor8(b16, and8(not8(b17), b18));
+    let e17 = xor8(b17, and8(not8(b18), b19));
+    let e18 = xor8(b18, and8(not8(b19), b15));
+    let e19 = xor8(b19, and8(not8(b15), b16));
+    let e20 = xor8(b20, and8(not8(b21), b22));
+    let e21 = xor8(b21, and8(not8(b22), b23));
+    let e22 = xor8(b22, and8(not8(b23), b24));
+    let e23 = xor8(b23, and8(not8(b24), b20));
+    let e24 = xor8(b24, and8(not8(b20), b21));
+    -- ι: add round constant to lane (0,0).
+    let f0 = xor8(e0, rc);
+    store([f0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14,
+           e15, e16, e17, e18, e19, e20, e21, e22, e23, e24])
+  }
+
+  -- Round constant for round `i` (little-endian byte lane). A match (not an
+  -- array `get`, which needs a literal index) so it can be indexed at runtime.
+  fn rc_lane(i: G) -> Lane {
+    match i {
+      0  => store([0x01u8, 0u8,    0u8, 0u8,    0u8, 0u8, 0u8, 0u8   ]),
+      1  => store([0x82u8, 0x80u8, 0u8, 0u8,    0u8, 0u8, 0u8, 0u8   ]),
+      2  => store([0x8au8, 0x80u8, 0u8, 0u8,    0u8, 0u8, 0u8, 0x80u8]),
+      3  => store([0x00u8, 0x80u8, 0u8, 0x80u8, 0u8, 0u8, 0u8, 0x80u8]),
+      4  => store([0x8bu8, 0x80u8, 0u8, 0u8,    0u8, 0u8, 0u8, 0u8   ]),
+      5  => store([0x01u8, 0u8,    0u8, 0x80u8, 0u8, 0u8, 0u8, 0u8   ]),
+      6  => store([0x81u8, 0x80u8, 0u8, 0x80u8, 0u8, 0u8, 0u8, 0x80u8]),
+      7  => store([0x09u8, 0x80u8, 0u8, 0u8,    0u8, 0u8, 0u8, 0x80u8]),
+      8  => store([0x8au8, 0u8,    0u8, 0u8,    0u8, 0u8, 0u8, 0u8   ]),
+      9  => store([0x88u8, 0u8,    0u8, 0u8,    0u8, 0u8, 0u8, 0u8   ]),
+      10 => store([0x09u8, 0x80u8, 0u8, 0x80u8, 0u8, 0u8, 0u8, 0u8   ]),
+      11 => store([0x0au8, 0u8,    0u8, 0x80u8, 0u8, 0u8, 0u8, 0u8   ]),
+      12 => store([0x8bu8, 0x80u8, 0u8, 0x80u8, 0u8, 0u8, 0u8, 0u8   ]),
+      13 => store([0x8bu8, 0u8,    0u8, 0u8,    0u8, 0u8, 0u8, 0x80u8]),
+      14 => store([0x89u8, 0x80u8, 0u8, 0u8,    0u8, 0u8, 0u8, 0x80u8]),
+      15 => store([0x03u8, 0x80u8, 0u8, 0u8,    0u8, 0u8, 0u8, 0x80u8]),
+      16 => store([0x02u8, 0x80u8, 0u8, 0u8,    0u8, 0u8, 0u8, 0x80u8]),
+      17 => store([0x80u8, 0u8,    0u8, 0u8,    0u8, 0u8, 0u8, 0x80u8]),
+      18 => store([0x0au8, 0x80u8, 0u8, 0u8,    0u8, 0u8, 0u8, 0u8   ]),
+      19 => store([0x0au8, 0u8,    0u8, 0x80u8, 0u8, 0u8, 0u8, 0x80u8]),
+      20 => store([0x81u8, 0x80u8, 0u8, 0x80u8, 0u8, 0u8, 0u8, 0x80u8]),
+      21 => store([0x80u8, 0x80u8, 0u8, 0u8,    0u8, 0u8, 0u8, 0x80u8]),
+      22 => store([0x01u8, 0u8,    0u8, 0x80u8, 0u8, 0u8, 0u8, 0u8   ]),
+      _  => store([0x08u8, 0x80u8, 0u8, 0x80u8, 0u8, 0u8, 0u8, 0x80u8]),
+    }
+  }
+
+  -- Apply the last `n` keccak-f rounds recursively (round index 24 - n), so
+  -- each round is its own call frame rather than 24 inlined copies (`fold`),
+  -- keeping the circuit narrow. `keccak_f_fold(s, 24)` is the full permutation.
+  fn keccak_f_fold(s: State, n: G) -> State {
+    match n {
+      0 => s,
+      _ => keccak_f_fold(keccak_round(s, rc_lane(24 - n)), n - 1),
+    }
+  }
+
+  -- ==========================================================================
+  -- Single-block sponge (keccak-256, message ≤ 135 bytes).
+  -- ==========================================================================
+
+  -- Pad bytes for positions i..135 once the message is exhausted. `first`=1 at
+  -- position `len` (gets the 0x01 pad bit); position 135 gets the 0x80 bit
+  -- (combining to 0x81 when len == 135).
+  -- `first` is 1 on the first pad position (gets 0x01), 0 afterwards. The 0x80
+  -- bit lands on position 135 (combining to 0x81 when first==1 there). Uses
+  -- `eq_zero` (an op) instead of a match so it stays tail-position-compilable.
+  fn build_pad(i: G, first: G) -> ByteStream {
+    match 136 - i {
+      0 => store(ListNode.Nil),
+      _ =>
+        store(ListNode.Cons(u8_from_field_unsafe(first + 128 * eq_zero(135 - i)),
+                            build_pad(i + 1, 0))),
+    }
+  }
+
+  -- Take one 136-byte rate block from `stream` starting at position `i`.
+  -- Returns (block, rest, full): a length-136 byte list, the unconsumed stream,
+  -- and full=1 if 136 message bytes were consumed (no padding yet), else 0 (the
+  -- message ended in this block, so pad10*1 was applied). When the stream is
+  -- already empty this yields an all-padding block with full=0 — the extra
+  -- block keccak appends when the message length is a multiple of 136.
+  fn read_block(stream: ByteStream, i: G) -> (ByteStream, ByteStream, G) {
+    match 136 - i {
+      0 => (store(ListNode.Nil), stream, 1),
+      _ => match load(stream) {
+        ListNode.Cons(b, rest) =>
+          let (blk, r, f) = read_block(rest, i + 1);
+          (store(ListNode.Cons(b, blk)), r, f),
+        ListNode.Nil => (build_pad(i, 1), store(ListNode.Nil), 0),
+      },
+    }
+  }
+
+  fn rate_lane(rate: ByteStream, b: G) -> Lane {
+    -- One `list_drop` to the lane's first byte; the next 7 are successive tails.
+    let &ListNode.Cons(r0, t0) = list_drop(rate, b);
+    let &ListNode.Cons(r1, t1) = t0;
+    let &ListNode.Cons(r2, t2) = t1;
+    let &ListNode.Cons(r3, t3) = t2;
+    let &ListNode.Cons(r4, t4) = t3;
+    let &ListNode.Cons(r5, t5) = t4;
+    let &ListNode.Cons(r6, t6) = t5;
+    let &ListNode.Cons(r7, _) = t6;
+    store([r0, r1, r2, r3, r4, r5, r6, r7])
+  }
+
+  -- XOR one 136-byte rate block into the first 17 lanes, then permute.
+  fn absorb_one(sp: State, rate: ByteStream) -> State {
+    let state = load(sp);
+    let s = store([
+      xor8(state[0],  rate_lane(rate, 0)),   xor8(state[1],  rate_lane(rate, 8)),
+      xor8(state[2],  rate_lane(rate, 16)),  xor8(state[3],  rate_lane(rate, 24)),
+      xor8(state[4],  rate_lane(rate, 32)),  xor8(state[5],  rate_lane(rate, 40)),
+      xor8(state[6],  rate_lane(rate, 48)),  xor8(state[7],  rate_lane(rate, 56)),
+      xor8(state[8],  rate_lane(rate, 64)),  xor8(state[9],  rate_lane(rate, 72)),
+      xor8(state[10], rate_lane(rate, 80)),  xor8(state[11], rate_lane(rate, 88)),
+      xor8(state[12], rate_lane(rate, 96)),  xor8(state[13], rate_lane(rate, 104)),
+      xor8(state[14], rate_lane(rate, 112)), xor8(state[15], rate_lane(rate, 120)),
+      xor8(state[16], rate_lane(rate, 128)),
+      state[17], state[18], state[19], state[20], state[21], state[22], state[23], state[24]
+    ]);
+    keccak_f_fold(s, 24)
+  }
+
+  -- Absorb every rate block of the (padded) message into the state.
+  fn absorb_blocks(stream: ByteStream, state: State) -> State {
+    let (block, rest, full) = read_block(stream, 0);
+    let st2 = absorb_one(state, block);
+    match full {
+      0 => st2,
+      _ => absorb_blocks(rest, st2),
+    }
+  }
+
+  -- keccak-256: absorb the byte stream into the zero state, then squeeze the
+  -- first 32 bytes (lanes 0..3). Lanes are `load`ed back to inline bytes so the
+  -- digest is a plain `[[U8; 8]; 4]`.
+  fn keccak256(bytes: ByteStream) -> [[U8; 8]; 4] {
+    let z = store([0u8; 8]);
+    let init = store([z, z, z, z, z, z, z, z, z, z, z, z, z, z, z, z, z, z, z, z, z, z, z, z, z]);
+    let sp = absorb_blocks(bytes, init);
+    let s = load(sp);
+    [load(s[0]), load(s[1]), load(s[2]), load(s[3])]
+  }
+
+⟧
+
+end MultiStark
+
+end
diff --git a/Ix/MultiStark/Pcs.lean b/Ix/MultiStark/Pcs.lean
new file mode 100644
index 00000000..024adb87
--- /dev/null
+++ b/Ix/MultiStark/Pcs.lean
@@ -0,0 +1,772 @@
+module
+public import Ix.Aiur.Meta
+public import Ix.MultiStark.Deserialize
+public import Ix.MultiStark.Keccak
+
+/-!
+# PCS (FRI) verification
+
+`multi-stark/src/verifier.rs` calls `pcs.verify(coms_to_verify, opening_proof,
+&mut challenger)` — a `TwoAdicFriPcs` FRI verification: query openings, Merkle
+authentication paths, FRI folding consistency. This is the heaviest part of the
+verifier and is being ported here in stages.
+
+## Stage 1 — Merkle (MMCS) hash primitives (this file, done + validated)
+
+The input/commit-phase commitments are `MerkleTreeMmcs` over the keccak hasher
+configured in `multi-stark/src/types.rs`:
+
+* leaf hash  : `SerializingHasher<PaddingFreeSponge<KeccakF, 25, 17, 4>>`
+  — serialize each `Goldilocks` element to its canonical `u64`, then run a
+  padding-free **overwrite-mode** sponge (rate 17 lanes, capacity 8, output 4)
+  built on the keccak-f[1600] permutation (reused from `Keccak.lean`).
+* compression: `CompressionFunctionFromHasher<PaddingFreeSponge<…>, 2, 4>`
+  — hash the 8 lanes of two concatenated digests into a 4-lane digest.
+
+`PaddingFreeSponge` differs from keccak-256 (`Keccak.lean`): no `10*1` padding,
+rate 17 *lanes* (not 136 bytes), and each input block **overwrites** the rate
+region (keccak-256 XORs). A full block triggers a permute; a final partial block
+permutes once iff it absorbed ≥1 element; an empty trailing block does not
+permute.
+
+Validated against `multi-stark`'s own hasher (see `pcs_hash_test`, reference
+values from the `pcs_ref_values` test in `multi-stark/src/types.rs`).
+
+## Stage 2+ — TODO
+
+Merkle `verify_batch` (binary tree, multi-height injection), challenger
+threading, the FRI fold chain (`open_input` / `verify_query`), and the final
+polynomial check. Until those land, `pcs_verify` remains an accept-stub.
+-/
+
+public section
+
+namespace MultiStark
+
+def pcs := ⟦
+  -- ==========================================================================
+  -- PaddingFreeSponge<KeccakF, 25, 17, 4> in overwrite mode.
+  --
+  -- Lanes are the keccak `Lane` type (`&[U8; 8]`, LE). A `U64` opened value is
+  -- (for an honest, canonical proof) exactly the 8 LE bytes of a lane, so a lane
+  -- is just `store(u64)`. The 25-lane keccak `State` and `keccak_f_fold` are
+  -- reused from `Keccak.lean`.
+  -- ==========================================================================
+
+  -- Take one input element as the next rate lane, or keep the existing state
+  -- lane `dflt` when the input is exhausted. The flag is 1 iff an input element
+  -- was consumed (input is consumed front-to-back, so the per-block flags are a
+  -- run of 1s followed by 0s).
+  fn u64_pick(vals: List‹U64›, dflt: Lane) -> (Lane, List‹U64›, G) {
+    match load(vals) {
+      ListNode.Nil => (dflt, vals, 0),
+      ListNode.Cons(x, rest) => (store(x), rest, 1),
+    }
+  }
+
+  -- Absorb the input lanes into the state, 17 (= RATE) at a time, overwriting
+  -- the rate region. After a FULL block of 17, permute and continue. A final
+  -- partial block (1..16 elements) permutes once; an exactly-empty trailing
+  -- block does not permute (matches `PaddingFreeSponge::hash_iter`).
+  fn pf_absorb(sp: State, vals: List‹U64›) -> State {
+    let old = load(sp);
+    let (l0,  v1,  f0)  = u64_pick(vals, old[0]);
+    let (l1,  v2,  _f1) = u64_pick(v1,  old[1]);
+    let (l2,  v3,  _f2) = u64_pick(v2,  old[2]);
+    let (l3,  v4,  _f3) = u64_pick(v3,  old[3]);
+    let (l4,  v5,  _f4) = u64_pick(v4,  old[4]);
+    let (l5,  v6,  _f5) = u64_pick(v5,  old[5]);
+    let (l6,  v7,  _f6) = u64_pick(v6,  old[6]);
+    let (l7,  v8,  _f7) = u64_pick(v7,  old[7]);
+    let (l8,  v9,  _f8) = u64_pick(v8,  old[8]);
+    let (l9,  v10, _f9) = u64_pick(v9,  old[9]);
+    let (l10, v11, _fa) = u64_pick(v10, old[10]);
+    let (l11, v12, _fb) = u64_pick(v11, old[11]);
+    let (l12, v13, _fc) = u64_pick(v12, old[12]);
+    let (l13, v14, _fd) = u64_pick(v13, old[13]);
+    let (l14, v15, _fe) = u64_pick(v14, old[14]);
+    let (l15, v16, _ff) = u64_pick(v15, old[15]);
+    let (l16, v17, f16) = u64_pick(v16, old[16]);
+    let sp2 = store([l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13,
+                     l14, l15, l16, old[17], old[18], old[19], old[20], old[21],
+                     old[22], old[23], old[24]]);
+    match f16 {
+      1 => pf_absorb(keccak_f_fold(sp2, 24), v17),
+      _ => match f0 {
+        0 => sp2,
+        _ => keccak_f_fold(sp2, 24),
+      },
+    }
+  }
+
+  -- Hash a list of `u64` lanes (a serialized field-element row, or two
+  -- concatenated digests) into a 4-lane `Digest`.
+  fn pf_sponge_u64(vals: List‹U64›) -> Digest {
+    let z = store([0u8; 8]);
+    let init = store([z, z, z, z, z, z, z, z, z, z, z, z, z, z, z, z, z, z, z, z,
+                      z, z, z, z, z]);
+    let sp = pf_absorb(init, vals);
+    let s = load(sp);
+    [load(s[0]), load(s[1]), load(s[2]), load(s[3])]
+  }
+
+  -- The MMCS leaf hash of a single matrix row (`SerializingHasher` over the row
+  -- of canonical `u64`s). For a leaf joining several same-height matrices, the
+  -- rows are concatenated first (see `verify_batch`, future work).
+  fn mmcs_hash_row(row: List‹U64›) -> Digest {
+    pf_sponge_u64(row)
+  }
+
+  -- The MMCS 2-to-1 compression: hash the 8 lanes of two concatenated digests.
+  fn mmcs_compress(a: Digest, b: Digest) -> Digest {
+    let t0 = store(ListNode.Nil);
+    let t1 = store(ListNode.Cons(b[3], t0));
+    let t2 = store(ListNode.Cons(b[2], t1));
+    let t3 = store(ListNode.Cons(b[1], t2));
+    let t4 = store(ListNode.Cons(b[0], t3));
+    let t5 = store(ListNode.Cons(a[3], t4));
+    let t6 = store(ListNode.Cons(a[2], t5));
+    let t7 = store(ListNode.Cons(a[1], t6));
+    let t8 = store(ListNode.Cons(a[0], t7));
+    pf_sponge_u64(t8)
+  }
+
+  -- ==========================================================================
+  -- Merkle MMCS `verify_batch` (binary tree, DIGEST_ELEMS = 4).
+  --
+  -- Ports `multi-stark/Plonky3/merkle-tree/src/mmcs.rs::verify_batch` for the
+  -- binary (N = 2) case. All committed matrices have power-of-two heights, so a
+  -- matrix's height is `2^log_height`. The opened rows arrive in matrix order;
+  -- `lhs` is the matching list of per-matrix log-heights. The query `index` is
+  -- threaded as a bit list (LSB first = leaf→root path) to avoid field division.
+  --
+  -- The leaf hash joins all matrices at the maximum log-height. Walking down,
+  -- each level folds with one proof sibling (ordered by the path bit), then —
+  -- if any matrix lives at the new log-height — injects that matrix group's leaf
+  -- hash via a second compression (this consumes no proof sibling), exactly as
+  -- the Rust loop's `next_height_openings_digest` injection.
+  -- ==========================================================================
+
+  -- 1 iff two digests are equal (compared as field elements; hash outputs are
+  -- canonical so this is exact).
+  fn digest_eq(a: Digest, b: Digest) -> G {
+    eq_zero(limb_to_field(a[0]) - limb_to_field(b[0])) *
+    eq_zero(limb_to_field(a[1]) - limb_to_field(b[1])) *
+    eq_zero(limb_to_field(a[2]) - limb_to_field(b[2])) *
+    eq_zero(limb_to_field(a[3]) - limb_to_field(b[3]))
+  }
+
+  -- Compress (current, sibling) in path order: path bit 0 ⇒ current is the left
+  -- child, bit 1 ⇒ current is the right child.
+  fn compress_ordered(bit: G, d: Digest, s: Digest) -> Digest {
+    match bit {
+      0 => mmcs_compress(d, s),
+      _ => mmcs_compress(s, d),
+    }
+  }
+
+  -- 1 iff some matrix has log-height `target`.
+  fn has_height(lhs: List‹G›, target: G) -> G {
+    match load(lhs) {
+      ListNode.Nil => 0,
+      ListNode.Cons(h, rest) =>
+        match eq_zero(h - target) {
+          1 => 1,
+          _ => has_height(rest, target),
+        },
+    }
+  }
+
+  -- Concatenate the opened rows of every matrix whose log-height is `target`
+  -- (in matrix order — the stable height-sort preserves it), for the joint leaf
+  -- hash `hash_iter_slices`.
+  fn concat_at(rows: List‹List‹U64››, lhs: List‹G›, target: G) -> List‹U64› {
+    match load(rows) {
+      ListNode.Nil => store(ListNode.Nil),
+      ListNode.Cons(r, rrest) =>
+        let &ListNode.Cons(lh, lrest) = lhs;
+        concat_at_step(eq_zero(lh - target), r, rrest, lrest, target),
+    }
+  }
+  fn concat_at_step(hit: G, r: List‹U64›, rrest: List‹List‹U64››,
+      lrest: List‹G›, target: G) -> List‹U64› {
+    match hit {
+      0 => concat_at(rrest, lrest, target),
+      _ => list_concat(r, concat_at(rrest, lrest, target)),
+    }
+  }
+
+  -- Canonicalize each lane: the MMCS leaf hash serializes `as_canonical_u64`,
+  -- but opened base values are on the wire in the (possibly non-canonical)
+  -- internal Goldilocks repr — e.g. field zero ships as `p`. `gl_reduce` maps
+  -- them to `< p` before hashing (idempotent on already-canonical lanes).
+  fn canon_lanes(l: List‹U64›) -> List‹U64› {
+    match load(l) {
+      ListNode.Nil => store(ListNode.Nil),
+      ListNode.Cons(x, rest) => store(ListNode.Cons(gl_reduce(x), canon_lanes(rest))),
+    }
+  }
+  -- The joint leaf hash of all matrices at log-height `target`.
+  fn leaf_hash_at(rows: List‹List‹U64››, lhs: List‹G›, target: G) -> Digest {
+    pf_sponge_u64(canon_lanes(concat_at(rows, lhs, target)))
+  }
+
+  -- Inject the leaf hash of any matrices at log-height `lh` (if present) via a
+  -- second compression onto `d`.
+  fn inject_maybe(rows: List‹List‹U64››, lhs: List‹G›, lh: G, d: Digest) -> Digest {
+    match has_height(lhs, lh) {
+      0 => d,
+      _ => mmcs_compress(d, leaf_hash_at(rows, lhs, lh)),
+    }
+  }
+
+  -- Recompose remaining path bits (LSB first) into the cap index.
+  fn bits_to_num(bits: List‹G›) -> G {
+    match load(bits) {
+      ListNode.Nil => 0,
+      ListNode.Cons(b, rest) => b + 2 * bits_to_num(rest),
+    }
+  }
+
+  -- Walk the authentication path: one proof sibling per level (fold), with a
+  -- possible leaf injection at the new log-height `lh`. Returns the recomputed
+  -- root and the leftover cap index.
+  fn mmcs_fold(d: Digest, rows: List‹List‹U64››, lhs: List‹G›,
+      proof: List‹Digest›, ibits: List‹G›, lh: G) -> (Digest, G) {
+    match load(proof) {
+      ListNode.Nil => (d, bits_to_num(ibits)),
+      ListNode.Cons(s, prest) =>
+        let &ListNode.Cons(bit, brest) = ibits;
+        let d1 = compress_ordered(bit, d, s);
+        let d2 = inject_maybe(rows, lhs, lh, d1);
+        mmcs_fold(d2, rows, lhs, prest, brest, lh - 1),
+    }
+  }
+
+  -- Recompute the Merkle root from the opened rows + authentication path.
+  fn mmcs_root(rows: List‹List‹U64››, lhs: List‹G›, ibits: List‹G›,
+      proof: List‹Digest›, log_max: G) -> (Digest, G) {
+    let leaf = leaf_hash_at(rows, lhs, log_max);
+    mmcs_fold(leaf, rows, lhs, proof, ibits, log_max - 1)
+  }
+
+  -- 1 iff the recomputed root matches the commitment cap at the cap index.
+  fn mmcs_verify(cap: MerkleCap, rows: List‹List‹U64››, lhs: List‹G›,
+      ibits: List‹G›, proof: List‹Digest›, log_max: G) -> G {
+    let (root, capidx) = mmcs_root(rows, lhs, ibits, proof, log_max);
+    digest_eq(list_lookup(cap, capidx), root)
+  }
+
+  -- ==========================================================================
+  -- FRI fold step (`TwoAdicFriFolding::fold_row`, arity-2 case).
+  --
+  -- `innerFri` uses `maxLogArity = 1`, so every FRI fold is binary. Ports
+  -- `fold_row` for `log_arity = 1`: given the sibling pair (e0, e1) of a node
+  -- and the round challenge β,
+  --   folded = (e0 + e1)/2 + β·(e0 − e1)/(2s),
+  --   s = g_{log_height+1}^{reverse_bits_len(index, log_height)}     (base field)
+  -- where `g_k = two_adic_gen(k)`. The index is threaded as its low-`log_height`
+  -- bit list (LSB first), matching `ch_sample_bits`; `reverse_bits_len` is then
+  -- just reversing that list.
+  -- ==========================================================================
+
+  -- Reverse a `G` (bit) list onto `acc`.
+  fn glist_rev(l: List‹G›, acc: List‹G›) -> List‹G› {
+    match load(l) {
+      ListNode.Nil => acc,
+      ListNode.Cons(b, rest) => glist_rev(rest, store(ListNode.Cons(b, acc))),
+    }
+  }
+
+  -- base^(Σ bits_i · 2^i), bits LSB-first (square-and-multiply over the bits).
+  -- `base` is a non-native Goldilocks element; `bits` is a native bit list.
+  fn exp_by_bits(base: [U8; 8], bits: List‹G›) -> [U8; 8] {
+    match load(bits) {
+      ListNode.Nil => gl_one(),
+      ListNode.Cons(b, rest) =>
+        let half = exp_by_bits(gl_sq(base), rest);
+        match b {
+          0 => half,
+          _ => gl_mul(base, half),
+        },
+    }
+  }
+
+  -- The arity-2 FRI fold. `index_bits` = the low `log_height` index bits, LSB
+  -- first (so `reverse_bits_len` = reversing the list).
+  fn fri_fold2(index_bits: List‹G›, log_height: G, beta: Ext, e0: Ext, e1: Ext) -> Ext {
+    let g = two_adic_gen(log_height + 1);
+    let s = exp_by_bits(g, glist_rev(index_bits, store(ListNode.Nil)));
+    let two_s = gl_add(s, s);
+    let t1 = eg_div(eg_add(e0, e1), [gl_two(), gl_zero()]);
+    let t2 = eg_mul(beta, eg_div(eg_sub(e0, e1), [two_s, gl_zero()]));
+    eg_add(t1, t2)
+  }
+
+  -- ==========================================================================
+  -- `open_input` reduced openings (`fri/verifier.rs::open_input` inner loop).
+  --
+  -- For a matrix opened at a point z with verifier domain point x, accumulate
+  -- over the matrix columns:
+  --   ro += alpha_pow · (p_z − p_x) · q ;  alpha_pow *= alpha,   q = 1/(z − x)
+  -- where p_x are the INPUT opened base values (from the query's batch opening,
+  -- authenticated by the input MMCS) and p_z the OOD opened ext values. The
+  -- query domain point is
+  --   x = GENERATOR(7) · two_adic_gen(log_height)^{reverse_bits_len(idx, log_height)}.
+  -- All extension arithmetic — no Merkle hashing here.
+  -- ==========================================================================
+
+  -- The base-field query domain point x. `index_bits` = low-`log_height` index
+  -- bits, LSB first (so reverse_bits_len = reversing the list).
+  fn ro_x(index_bits: List‹G›, log_height: G) -> [U8; 8] {
+    gl_mul(gl_seven(), exp_by_bits(two_adic_gen(log_height), glist_rev(index_bits, store(ListNode.Nil))))
+  }
+
+  -- Accumulate one matrix-point's column contributions. `q = 1/(z − x)`.
+  fn ro_fold(p_x: List‹[U8; 8]›, p_z: List‹Ext›, q: Ext, alpha: Ext, ro: Ext, ap: Ext)
+      -> (Ext, Ext) {
+    match load(p_x) {
+      ListNode.Nil => (ro, ap),
+      ListNode.Cons(px, pxr) =>
+        let &ListNode.Cons(pz, pzr) = p_z;
+        let term = eg_mul(eg_mul(ap, eg_sub(pz, [px, gl_zero()])), q);
+        ro_fold(pxr, pzr, q, alpha, eg_add(ro, term), eg_mul(ap, alpha)),
+    }
+  }
+
+  -- ==========================================================================
+  -- PCS (FRI) verification — `two_adic_pcs::verify` + `fri::verify_fri`.
+  --
+  -- Specialised to `innerFri`: arity 2 (log_arity = 1 every round),
+  -- log_blowup = 1, log_final_poly_len = 0 (final_poly is ONE constant
+  -- coefficient ⇒ the final Horner eval is just `final_poly[0]`, no `x` needed),
+  -- num_queries = 1, commit/query PoW bits = 0 (a challenger no-op). Field
+  -- arithmetic is the non-native byte Goldilocks (`gl_*`/`eg_*`).
+  --
+  -- A reduced-opening accumulator, one per distinct log-height. `alpha_pow`
+  -- threads across every (batch, matrix, point, column) at that height, in the
+  -- prover's observation order (stage_1, stage_2, quotient, preprocessed).
+  -- ==========================================================================
+  enum Bucket { Mk(G, Ext, Ext) }   -- log_height, alpha_pow, reduced_opening
+
+  -- ── challenger: observe the opened values (observe_algebra_slice) ──────────
+  -- One ext element = its two base coordinates, each 8 LE bytes.
+  fn obs_ext_row(input: ByteStream, row: List‹Ext›) -> ByteStream {
+    match load(row) {
+      ListNode.Nil => input,
+      ListNode.Cons(e, rest) => obs_ext_row(snoc_b8(snoc_b8(input, e[0]), e[1]), rest),
+    }
+  }
+  fn obs_points(input: ByteStream, pts: List‹List‹Ext››) -> ByteStream {
+    match load(pts) {
+      ListNode.Nil => input,
+      ListNode.Cons(row, rest) => obs_points(obs_ext_row(input, row), rest),
+    }
+  }
+  fn obs_round(input: ByteStream, round: OpenedRound) -> ByteStream {
+    match load(round) {
+      ListNode.Nil => input,
+      ListNode.Cons(mat, rest) => obs_round(obs_points(input, mat), rest),
+    }
+  }
+  fn obs_prep(input: ByteStream, prep_opt: PreprocessedOpt) -> ByteStream {
+    match prep_opt {
+      PreprocessedOpt.NoPreprocessed => input,
+      PreprocessedOpt.SomePreprocessed(round) => obs_round(input, round),
+    }
+  }
+  -- Observe one Val (= 1) per FRI round, the variable-arity schedule.
+  fn obs_log_arities(input: ByteStream, comms: List‹MerkleCap›) -> ByteStream {
+    match load(comms) {
+      ListNode.Nil => input,
+      ListNode.Cons(_c, rest) =>
+        obs_log_arities(snoc_b8(input, [1u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8]), rest),
+    }
+  }
+  -- Per round: observe the commitment, then sample β (PoW bits = 0 ⇒ no-op).
+  -- `GrindingChallenger::check_witness` for one commit round: when `bits > 0`,
+  -- observe the PoW witness then sample `bits` bits and assert they are all zero
+  -- (else InvalidPowWitness). Returns the post-PoW `(input, output)` so the
+  -- immediately-following β sample continues the SAME hash stream (no observe in
+  -- between). `bits == 0` is the short-circuit: no observe, no sample.
+  fn pcs_commit_pow(input: ByteStream, witness: U64, bits: G) -> (ByteStream, ByteStream) {
+    match bits {
+      0 => (input, store(ListNode.Nil)),
+      _ =>
+        let (pbits, i1, o1) = ch_sample_bits(snoc_b8(input, witness), store(ListNode.Nil), bits);
+        assert_eq!(bits_to_num(pbits), 0);
+        (i1, o1),
+    }
+  }
+  -- Per round: observe the commitment, run the commit-phase PoW check, then
+  -- sample the folding challenge β (continuing the challenger past the PoW).
+  fn pcs_betas(input: ByteStream, comms: List‹MerkleCap›, witnesses: List‹U64›, bits: G)
+      -> (List‹Ext›, ByteStream) {
+    match load(comms) {
+      ListNode.Nil => (store(ListNode.Nil), input),
+      ListNode.Cons(c, rest) =>
+        let &ListNode.Cons(w, wrest) = witnesses;
+        let (i1, o1) = pcs_commit_pow(snoc_cap(input, c), w, bits);
+        let (b0, b1, i2, _o) = ch_sample_ext(i1, o1);
+        let (bs, i3) = pcs_betas(i2, rest, wrest, bits);
+        (store(ListNode.Cons([gl_reduce(b0), gl_reduce(b1)], bs)), i3),
+    }
+  }
+
+  -- ── reduced-opening buckets ───────────────────────────────────────────────
+  fn repeat_g(v: G, n: G) -> List‹G› {
+    match n {
+      0 => store(ListNode.Nil),
+      _ => store(ListNode.Cons(v, repeat_g(v, n - 1))),
+    }
+  }
+  -- 1 iff some circuit `i < rem` has log-height `log_degrees[i] + log_blowup == h`.
+  fn circ_has_height(log_degrees: List‹U8›, log_blowup: G, rem: G, i: G, h: G) -> G {
+    match rem {
+      0 => 0,
+      _ => match eq_zero(to_field(list_lookup(log_degrees, i)) + log_blowup - h) {
+        1 => 1,
+        _ => circ_has_height(log_degrees, log_blowup, rem - 1, i + 1, h),
+      },
+    }
+  }
+  -- One bucket per distinct log-height, built DESCENDING by counting `h` down
+  -- from `log_global_max`. Each starts (alpha_pow = 1, reduced_opening = 0).
+  fn build_buckets(log_degrees: List‹U8›, log_blowup: G, num_circuits: G, h: G) -> List‹Bucket› {
+    match h {
+      0 => store(ListNode.Nil),
+      _ => match circ_has_height(log_degrees, log_blowup, num_circuits, 0, h) {
+        0 => build_buckets(log_degrees, log_blowup, num_circuits, h - 1),
+        _ => store(ListNode.Cons(
+               Bucket.Mk(h, [gl_one(), gl_zero()], [gl_zero(), gl_zero()]),
+               build_buckets(log_degrees, log_blowup, num_circuits, h - 1))),
+      },
+    }
+  }
+  -- Find the bucket at log-height `lh`, fold one matrix-point's columns into it
+  -- (`ro_fold` threads its `alpha_pow`), and write it back.
+  fn bucket_update(buckets: List‹Bucket›, lh: G, p_x: List‹[U8; 8]›, p_z: List‹Ext›,
+      q: Ext, alpha: Ext) -> List‹Bucket› {
+    match load(buckets) {
+      ListNode.Nil => store(ListNode.Nil),
+      ListNode.Cons(b, rest) =>
+        let Bucket.Mk(h, ap, ro) = b;
+        match eq_zero(h - lh) {
+          1 =>
+            let (ro2, ap2) = ro_fold(p_x, p_z, q, alpha, ro, ap);
+            store(ListNode.Cons(Bucket.Mk(h, ap2, ro2), rest)),
+          _ => store(ListNode.Cons(b, bucket_update(rest, lh, p_x, p_z, q, alpha))),
+        },
+    }
+  }
+  -- A bucket at log-height `log_blowup` would come from a trace matrix of height
+  -- 1 (a constant polynomial); then `(f(ζ) − f(x))/(ζ − x)` must be 0. Assert it
+  -- (`open_input`'s `FinalPolyMismatch` guard). No-op if no such bucket exists.
+  fn assert_blowup_zero(buckets: List‹Bucket›, log_blowup: G) -> G {
+    match load(buckets) {
+      ListNode.Nil => 1,
+      ListNode.Cons(b, rest) =>
+        let Bucket.Mk(h, _ap, ro) = b;
+        match eq_zero(h - log_blowup) {
+          1 => assert_eq!(eg_eq(ro, [gl_zero(), gl_zero()]), 1); 1,
+          _ => assert_blowup_zero(rest, log_blowup),
+        },
+    }
+  }
+  -- 1 iff the proof carries a preprocessed opened round (used for the input-proof
+  -- batch-count check).
+  fn prep_count(prep_opt: PreprocessedOpt) -> G {
+    match prep_opt {
+      PreprocessedOpt.NoPreprocessed => 0,
+      PreprocessedOpt.SomePreprocessed(_r) => 1,
+    }
+  }
+  -- Compute x = GENERATOR·g^{revbits} for this height and fold the contribution.
+  fn ri_apply(buckets: List‹Bucket›, lh: G, idxbits: List‹G›, log_gmax: G,
+      z: Ext, p_x: List‹[U8; 8]›, p_z: List‹Ext›, alpha: Ext) -> List‹Bucket› {
+    -- the base opening and the ext opening at this point must have equal width
+    -- (PointEvaluationCountMismatch); `ro_fold` walks them in lockstep.
+    assert_eq!(eq_zero(list_length(p_x) - list_length(p_z)), 1);
+    let x = ro_x(list_drop(idxbits, log_gmax - lh), lh);
+    let q = eg_inverse(eg_sub(z, [x, gl_zero()]));
+    bucket_update(buckets, lh, p_x, p_z, q, alpha)
+  }
+
+  -- A stage_1/stage_2/preprocessed-style matrix: two opening points
+  -- (ζ, ζ·g) with the same base row `p_x`. `g` = trace subgroup generator.
+  fn open_2pt_mat(buckets: List‹Bucket›, idxbits: List‹G›, log_gmax: G, lh: G,
+      ldeg: G, zeta: Ext, p_x: List‹[U8; 8]›, mat: List‹List‹Ext››, alpha: Ext)
+      -> List‹Bucket› {
+    let pz0 = list_lookup(mat, 0);
+    let pz1 = list_lookup(mat, 1);
+    let zn = eg_mul(zeta, [two_adic_gen(ldeg), gl_zero()]);
+    let b1 = ri_apply(buckets, lh, idxbits, log_gmax, zeta, p_x, pz0, alpha);
+    ri_apply(b1, lh, idxbits, log_gmax, zn, p_x, pz1, alpha)
+  }
+  fn open_batch_2pt(buckets: List‹Bucket›, idxbits: List‹G›, log_gmax: G, log_blowup: G, ci: G,
+      rem: G, log_degrees: List‹U8›, zeta: Ext, base_rows: List‹List‹U64››,
+      opened: OpenedRound, alpha: Ext) -> List‹Bucket› {
+    match rem {
+      0 => buckets,
+      _ =>
+        let ldeg = to_field(list_lookup(log_degrees, ci));
+        let b = open_2pt_mat(buckets, idxbits, log_gmax, ldeg + log_blowup, ldeg, zeta,
+                  list_lookup(base_rows, ci), list_lookup(opened, ci), alpha);
+        open_batch_2pt(b, idxbits, log_gmax, log_blowup, ci + 1, rem - 1, log_degrees, zeta,
+                       base_rows, opened, alpha),
+    }
+  }
+
+  -- The quotient batch: `qd` chunks per circuit, one opening point (ζ) each.
+  fn open_q_chunks(buckets: List‹Bucket›, idxbits: List‹G›, log_gmax: G, lh: G,
+      chunk: G, qrem: G, zeta: Ext, base_rows: List‹List‹U64››,
+      q_opened: OpenedRound, alpha: Ext) -> (List‹Bucket›, G) {
+    match qrem {
+      0 => (buckets, chunk),
+      _ =>
+        let b = ri_apply(buckets, lh, idxbits, log_gmax, zeta,
+                  list_lookup(base_rows, chunk), list_lookup(list_lookup(q_opened, chunk), 0), alpha);
+        open_q_chunks(b, idxbits, log_gmax, lh, chunk + 1, qrem - 1, zeta, base_rows, q_opened, alpha),
+    }
+  }
+  fn open_quotient(buckets: List‹Bucket›, idxbits: List‹G›, log_gmax: G, log_blowup: G, ci: G,
+      rem: G, chunk: G, circuits: List‹SysCircuit›, log_degrees: List‹U8›, zeta: Ext,
+      base_rows: List‹List‹U64››, q_opened: OpenedRound, alpha: Ext) -> List‹Bucket› {
+    match rem {
+      0 => buckets,
+      _ =>
+        let SysCircuit.Mk(_a, _cc, md, _ph, _pw, _w1, _w2) = list_lookup(circuits, ci);
+        let qd = quotient_degree_of(md);
+        let lh = to_field(list_lookup(log_degrees, ci)) + log_blowup;
+        let (b, chunk2) = open_q_chunks(buckets, idxbits, log_gmax, lh, chunk, qd, zeta, base_rows, q_opened, alpha);
+        open_quotient(b, idxbits, log_gmax, log_blowup, ci + 1, rem - 1, chunk2, circuits, log_degrees, zeta, base_rows, q_opened, alpha),
+    }
+  }
+
+  -- The preprocessed batch: only circuits with `prep_indices[i] = Some(j)`;
+  -- `k` tracks the position in the preprocessed commitment (= base-row index).
+  fn open_prep(buckets: List‹Bucket›, idxbits: List‹G›, log_gmax: G, log_blowup: G, ci: G, rem: G,
+      k: G, log_degrees: List‹U8›, prep_indices: List‹OptIdx›, zeta: Ext,
+      base_rows: List‹List‹U64››, prep_round: OpenedRound, alpha: Ext) -> List‹Bucket› {
+    match rem {
+      0 => buckets,
+      _ => match list_lookup(prep_indices, ci) {
+        OptIdx.NoIdx =>
+          open_prep(buckets, idxbits, log_gmax, log_blowup, ci + 1, rem - 1, k, log_degrees,
+                    prep_indices, zeta, base_rows, prep_round, alpha),
+        OptIdx.SomeIdx(_j) =>
+          let ldeg = to_field(list_lookup(log_degrees, ci));
+          let b = open_2pt_mat(buckets, idxbits, log_gmax, ldeg + log_blowup, ldeg, zeta,
+                    list_lookup(base_rows, k), list_lookup(prep_round, k), alpha);
+          open_prep(b, idxbits, log_gmax, log_blowup, ci + 1, rem - 1, k + 1, log_degrees,
+                    prep_indices, zeta, base_rows, prep_round, alpha),
+      },
+    }
+  }
+  fn open_prep_batch(buckets: List‹Bucket›, input_proof: List‹BatchOpening›,
+      prep_commit: MerkleCap, prep_opt: PreprocessedOpt, prep_indices: List‹OptIdx›,
+      log_degrees: List‹U8›, num_circuits: G, idxbits: List‹G›, log_gmax: G, log_blowup: G,
+      zeta: Ext, alpha: Ext) -> List‹Bucket› {
+    match prep_opt {
+      PreprocessedOpt.NoPreprocessed => buckets,
+      PreprocessedOpt.SomePreprocessed(prep_round) =>
+        let BatchOpening.Mk(rows_p, proof_p) = list_lookup(input_proof, 3);
+        -- one opened base row per preprocessed matrix (BatchOpenedValuesCountMismatch)
+        assert_eq!(eq_zero(list_length(rows_p) - list_length(prep_round)), 1);
+        assert_eq!(mmcs_verify(prep_commit, rows_p,
+          heights_prep(log_degrees, log_blowup, prep_indices, num_circuits, 0), idxbits, proof_p, log_gmax), 1);
+        open_prep(buckets, idxbits, log_gmax, log_blowup, 0, num_circuits, 0, log_degrees,
+                  prep_indices, zeta, rows_p, prep_round, alpha),
+    }
+  }
+
+  -- ── per-batch input-MMCS matrix log-heights (`log_degree + log_blowup`) ────
+  fn heights_all(log_degrees: List‹U8›, log_blowup: G, rem: G, i: G) -> List‹G› {
+    match rem {
+      0 => store(ListNode.Nil),
+      _ => store(ListNode.Cons(to_field(list_lookup(log_degrees, i)) + log_blowup,
+                               heights_all(log_degrees, log_blowup, rem - 1, i + 1))),
+    }
+  }
+  fn heights_quotient(circuits: List‹SysCircuit›, log_degrees: List‹U8›, log_blowup: G, rem: G, i: G) -> List‹G› {
+    match rem {
+      0 => store(ListNode.Nil),
+      _ =>
+        let SysCircuit.Mk(_a, _cc, md, _ph, _pw, _w1, _w2) = list_lookup(circuits, i);
+        list_concat(repeat_g(to_field(list_lookup(log_degrees, i)) + log_blowup, quotient_degree_of(md)),
+                    heights_quotient(circuits, log_degrees, log_blowup, rem - 1, i + 1)),
+    }
+  }
+  fn heights_prep(log_degrees: List‹U8›, log_blowup: G, prep_indices: List‹OptIdx›, rem: G, i: G) -> List‹G› {
+    match rem {
+      0 => store(ListNode.Nil),
+      _ => match list_lookup(prep_indices, i) {
+        OptIdx.NoIdx => heights_prep(log_degrees, log_blowup, prep_indices, rem - 1, i + 1),
+        OptIdx.SomeIdx(_j) =>
+          store(ListNode.Cons(to_field(list_lookup(log_degrees, i)) + log_blowup,
+                              heights_prep(log_degrees, log_blowup, prep_indices, rem - 1, i + 1))),
+      },
+    }
+  }
+
+  -- ── FRI fold chain (`verify_query`, arity 2) ──────────────────────────────
+  -- Reconstruct the sibling pair: evals[index_in_group] = folded, other = sib.
+  fn recon_evals(bit: G, folded: Ext, sib: Ext) -> (Ext, Ext) {
+    match bit {
+      0 => (folded, sib),
+      _ => (sib, folded),
+    }
+  }
+  -- Flatten two ext evals to the 4 base coords of the ExtensionMmcs leaf row.
+  fn flatten2(e0: Ext, e1: Ext) -> List‹U64› {
+    store(ListNode.Cons(e0[0], store(ListNode.Cons(e0[1],
+      store(ListNode.Cons(e1[0], store(ListNode.Cons(e1[1], store(ListNode.Nil)))))))))
+  }
+  -- Roll the next reduced opening into the folded eval when its height matches
+  -- the new folded height: `folded += beta^(2^log_arity) · ro`  (log_arity = 1).
+  fn rollin(folded: Ext, log_folded: G, beta: Ext, ro_rest: List‹Bucket›) -> (Ext, List‹Bucket›) {
+    match load(ro_rest) {
+      ListNode.Nil => (folded, ro_rest),
+      ListNode.Cons(b, rest) =>
+        let Bucket.Mk(h, _ap, ro) = b;
+        match eq_zero(h - log_folded) {
+          1 => (eg_add(folded, eg_mul(ext_exp_pow2(beta, 1), ro)), rest),
+          _ => (folded, ro_rest),
+        },
+    }
+  }
+  fn verify_query(folded: Ext, betas: List‹Ext›, comms: List‹MerkleCap›,
+      openings: List‹CommitPhaseProofStep›, domidx: List‹G›, log_cur: G,
+      ro_rest: List‹Bucket›, log_final: G) -> Ext {
+    match load(openings) {
+      ListNode.Nil =>
+        -- must have folded down to exactly the final domain size, and every
+        -- reduced opening must have been rolled in (FinalFoldHeightMismatch /
+        -- UnconsumedReducedOpenings).
+        assert_eq!(eq_zero(log_cur - log_final), 1);
+        assert_eq!(list_length(ro_rest), 0);
+        folded,
+      ListNode.Cons(op, op_rest) =>
+        let &ListNode.Cons(beta, beta_rest) = betas;
+        let &ListNode.Cons(comm, comm_rest) = comms;
+        let CommitPhaseProofStep.Mk(_la, sibs, oproof) = op;
+        -- arity 2 ⇒ exactly arity-1 = 1 sibling (SiblingValuesLengthMismatch).
+        assert_eq!(list_length(sibs), 1);
+        let &ListNode.Cons(ibit, idrest) = domidx;     -- index_in_group = LSB
+        let log_folded = log_cur - 1;
+        let (e0, e1) = recon_evals(ibit, folded, list_lookup(sibs, 0));
+        -- authenticate the sibling pair against this round's commitment
+        assert_eq!(mmcs_verify(comm, store(ListNode.Cons(flatten2(e0, e1), store(ListNode.Nil))),
+          store(ListNode.Cons(log_folded, store(ListNode.Nil))), idrest, oproof, log_folded), 1);
+        let folded1 = fri_fold2(idrest, log_folded, beta, e0, e1);
+        let (folded2, ro_rest2) = rollin(folded1, log_folded, beta, ro_rest);
+        verify_query(folded2, beta_rest, comm_rest, op_rest, idrest, log_folded, ro_rest2, log_final),
+    }
+  }
+
+  -- ── one FRI query ─────────────────────────────────────────────────────────
+  -- For the query index `idxbits`: build the reduced-opening accumulators,
+  -- authenticate each input batch (input MMCS), run the fold chain, and check
+  -- the final polynomial. `log_final = log_blowup` (log_final_poly_len = 0).
+  fn verify_one_query(idxbits: List‹G›, qp: QueryProof, alpha: Ext,
+      stage1: OpenedRound, stage2: OpenedRound, q_opened: OpenedRound,
+      prep_opt: PreprocessedOpt, s1c: MerkleCap, s2c: MerkleCap, qc: MerkleCap,
+      prep_commit: MerkleCap, circuits: List‹SysCircuit›, prep_indices: List‹OptIdx›,
+      log_degrees: List‹U8›, zeta: Ext, num_circuits: G, log_blowup: G, log_gmax: G,
+      betas: List‹Ext›, commit_phase_commits: List‹MerkleCap›, final_poly: List‹Ext›,
+      num_rounds: G) -> G {
+    let QueryProof.Mk(input_proof, commit_phase_openings) = qp;
+    -- one commit-phase opening per round (QueryCommitPhaseOpeningsCountMismatch),
+    -- one input batch per commitment (InputProofBatchCountMismatch).
+    assert_eq!(eq_zero(list_length(commit_phase_openings) - num_rounds), 1);
+    assert_eq!(eq_zero(list_length(input_proof) - (3 + prep_count(prep_opt))), 1);
+    let buckets = build_buckets(log_degrees, log_blowup, num_circuits, log_gmax);
+    let BatchOpening.Mk(rows_s1, proof_s1) = list_lookup(input_proof, 0);
+    assert_eq!(eq_zero(list_length(rows_s1) - num_circuits), 1);
+    assert_eq!(mmcs_verify(s1c, rows_s1, heights_all(log_degrees, log_blowup, num_circuits, 0), idxbits, proof_s1, log_gmax), 1);
+    let buckets = open_batch_2pt(buckets, idxbits, log_gmax, log_blowup, 0, num_circuits, log_degrees, zeta, rows_s1, stage1, alpha);
+    let BatchOpening.Mk(rows_s2, proof_s2) = list_lookup(input_proof, 1);
+    assert_eq!(eq_zero(list_length(rows_s2) - num_circuits), 1);
+    assert_eq!(mmcs_verify(s2c, rows_s2, heights_all(log_degrees, log_blowup, num_circuits, 0), idxbits, proof_s2, log_gmax), 1);
+    let buckets = open_batch_2pt(buckets, idxbits, log_gmax, log_blowup, 0, num_circuits, log_degrees, zeta, rows_s2, stage2, alpha);
+    let BatchOpening.Mk(rows_q, proof_q) = list_lookup(input_proof, 2);
+    assert_eq!(eq_zero(list_length(rows_q) - list_length(q_opened)), 1);
+    assert_eq!(mmcs_verify(qc, rows_q, heights_quotient(circuits, log_degrees, log_blowup, num_circuits, 0), idxbits, proof_q, log_gmax), 1);
+    let buckets = open_quotient(buckets, idxbits, log_gmax, log_blowup, 0, num_circuits, 0, circuits, log_degrees, zeta, rows_q, q_opened, alpha);
+    let buckets = open_prep_batch(buckets, input_proof, prep_commit, prep_opt, prep_indices, log_degrees, num_circuits, idxbits, log_gmax, log_blowup, zeta, alpha);
+    -- a height-`log_blowup` (constant-poly) reduced opening must be zero
+    let _cz = assert_blowup_zero(buckets, log_blowup);
+    -- the first reduced opening must sit at log_global_max_height
+    -- (InitialReducedOpeningHeightMismatch).
+    let &ListNode.Cons(b0, ro_rest) = buckets;
+    let Bucket.Mk(h0, _ap0, folded_start) = b0;
+    assert_eq!(eq_zero(h0 - log_gmax), 1);
+    let folded = verify_query(folded_start, betas, commit_phase_commits, commit_phase_openings, idxbits, log_gmax, ro_rest, log_blowup);
+    -- final check: with log_final_poly_len = 0, eval = final_poly[0]
+    assert_eq!(eg_eq(list_lookup(final_poly, 0), folded), 1);
+    1
+  }
+
+  -- Loop over all `num_queries` query proofs, sampling one index per query
+  -- (consecutive `sample_bits` continue the same challenger stream).
+  fn query_loop(input: ByteStream, output: ByteStream, query_proofs: List‹QueryProof›,
+      alpha: Ext, stage1: OpenedRound, stage2: OpenedRound, q_opened: OpenedRound,
+      prep_opt: PreprocessedOpt, s1c: MerkleCap, s2c: MerkleCap, qc: MerkleCap,
+      prep_commit: MerkleCap, circuits: List‹SysCircuit›, prep_indices: List‹OptIdx›,
+      log_degrees: List‹U8›, zeta: Ext, num_circuits: G, log_blowup: G, log_gmax: G,
+      betas: List‹Ext›, commit_phase_commits: List‹MerkleCap›, final_poly: List‹Ext›,
+      num_rounds: G) -> G {
+    match load(query_proofs) {
+      ListNode.Nil => 1,
+      ListNode.Cons(qp, rest) =>
+        let (idxbits, input2, output2) = ch_sample_bits(input, output, log_gmax);
+        let _q = verify_one_query(idxbits, qp, alpha, stage1, stage2, q_opened, prep_opt,
+          s1c, s2c, qc, prep_commit, circuits, prep_indices, log_degrees, zeta, num_circuits,
+          log_blowup, log_gmax, betas, commit_phase_commits, final_poly, num_rounds);
+        query_loop(input2, output2, rest, alpha, stage1, stage2, q_opened, prep_opt,
+          s1c, s2c, qc, prep_commit, circuits, prep_indices, log_degrees, zeta, num_circuits,
+          log_blowup, log_gmax, betas, commit_phase_commits, final_poly, num_rounds),
+    }
+  }
+
+  -- ── top-level FRI verification ────────────────────────────────────────────
+  -- `log_blowup` comes from the verifying key; `num_queries` / `commit_pow_bits`
+  -- are the protocol's FRI parameters (public). `query_pow_bits` is taken to be 0
+  -- (our system), so the query-phase grinding check is a no-op.
+  fn pcs_fri_verify(post_zeta_input: ByteStream, stage1: OpenedRound, stage2: OpenedRound,
+      q_opened: OpenedRound, prep_opt: PreprocessedOpt, opening: FriProof,
+      s1c: MerkleCap, s2c: MerkleCap, qc: MerkleCap, prep_commit: MerkleCap,
+      circuits: List‹SysCircuit›, prep_indices: List‹OptIdx›, log_degrees: List‹U8›,
+      zeta: Ext, num_circuits: G, log_blowup: G, num_queries: G, commit_pow_bits: G) -> G {
+    let FriProof.Mk(commit_phase_commits, pw, query_proofs, final_poly, _qpw) = opening;
+    let num_rounds = list_length(commit_phase_commits);
+    -- FRI shape: one PoW witness per round, num_queries query proofs, and (since
+    -- log_final_poly_len = 0) a single final-poly coefficient.
+    assert_eq!(eq_zero(list_length(pw) - num_rounds), 1);
+    assert_eq!(eq_zero(list_length(query_proofs) - num_queries), 1);
+    assert_eq!(list_length(final_poly), 1);
+    -- challenger continuation: observe all opened values (coms_to_verify order)
+    let input = obs_round(post_zeta_input, stage1);
+    let input = obs_round(input, stage2);
+    let input = obs_round(input, q_opened);
+    let input = obs_prep(input, prep_opt);
+    -- PCS batch-combination challenge α
+    let (a0, a1, input, _oa) = ch_sample_ext(input, store(ListNode.Nil));
+    let alpha = [gl_reduce(a0), gl_reduce(a1)];
+    -- per-round FRI fold challenges β (with commit-phase PoW), then observe
+    -- final_poly + the log-arity schedule.
+    let (betas, input) = pcs_betas(input, commit_phase_commits, pw, commit_pow_bits);
+    let input = obs_ext_row(input, final_poly);
+    let input = obs_log_arities(input, commit_phase_commits);
+    -- query indices + per-query verification (log_global_max_height = #rounds + log_blowup)
+    let log_gmax = num_rounds + log_blowup;
+    query_loop(input, store(ListNode.Nil), query_proofs, alpha, stage1, stage2, q_opened,
+      prep_opt, s1c, s2c, qc, prep_commit, circuits, prep_indices, log_degrees, zeta,
+      num_circuits, log_blowup, log_gmax, betas, commit_phase_commits, final_poly, num_rounds)
+  }
+
+⟧
+
+end MultiStark
+
+end
diff --git a/Ix/MultiStark/SystemDeserialize.lean b/Ix/MultiStark/SystemDeserialize.lean
new file mode 100644
index 00000000..fbbbc046
--- /dev/null
+++ b/Ix/MultiStark/SystemDeserialize.lean
@@ -0,0 +1,286 @@
+module
+public import Ix.Aiur.Meta
+public import Ix.IxVM.Core
+public import Ix.IxVM.ByteStream
+public import Ix.MultiStark.Deserialize
+
+/-!
+# Verifying-key deserializer (Aiur)
+
+Aiur port of `src/aiur/vk_codec.rs::manual_deserialize` — reconstructs the
+verifier's `System<AiurCircuit>` from the bytes the prover places on the IO
+channel. Same bincode wire format the Rust side validated against serde:
+
+* enum tag       : `u32`, 4 bytes LE
+* `Option`       : 1 tag byte (`0` = None, `1` = Some)
+* `usize`/`u64`  : 8 bytes LE
+* `Vec<T>`       : `u64` length, then elements
+* struct         : fields in declaration order
+* `Range<usize>` : `start`, `end` (two `u64`)
+* `MerkleCap`    : `Vec<[u64; 4]>`
+* Goldilocks `G` : raw `u64`, 8 bytes LE (reduced mod p on read)
+
+Reuses the proof deserializer's byte primitives (`read_u8`, `read_u64`,
+`read_count`, `read_merkle_cap`, `limb_to_field`, `Digest`, `MerkleCap`).
+-/
+
+public section
+
+namespace MultiStark
+
+def systemDeserialize := ⟦
+  -- ==========================================================================
+  -- Reconstructed `System<AiurCircuit>` as Aiur data.
+  -- ==========================================================================
+
+  -- `SymbolicVariable.entry` (a column reference within the window).
+  enum SysEntry {
+    Preprocessed(G),   -- offset
+    Main(G),
+    Stage2(G),
+    Public,
+    Stage2Public,
+    Challenge
+  }
+
+  -- `SymbolicExpression<G>` — the AIR constraint tree. Children are pointers;
+  -- the trailing `G` on Add/Sub/Neg/Mul is the cached `degree_multiple`.
+  enum SymExpr {
+    Var(SysEntry, G),            -- entry, index
+    IsFirstRow,
+    IsLastRow,
+    IsTransition,
+    Const([U8; 8]),             -- a non-native Goldilocks constant (canonical LE bytes)
+    Add(&SymExpr, &SymExpr, G),
+    Sub(&SymExpr, &SymExpr, G),
+    Neg(&SymExpr, G),
+    Mul(&SymExpr, &SymExpr, G)
+  }
+
+  enum SysLookup { Mk(SymExpr, List‹SymExpr›) }          -- multiplicity, args
+
+  enum SysConstraints { Mk(List‹SymExpr›, G, G, G) }     -- zeros, sel_start, sel_end, width
+
+  enum SysMemory { Mk(G) }                               -- width
+
+  enum SysAir {
+    Function(SysConstraints),
+    Memory(SysMemory),
+    Bytes1,
+    Bytes2
+  }
+
+  enum SysLookupAir { Mk(SysAir, List‹SysLookup›) }      -- inner_air, lookups
+
+  -- air, constraint_count, max_constraint_degree, preprocessed_height,
+  -- preprocessed_width, stage_1_width, stage_2_width.
+  enum SysCircuit { Mk(SysLookupAir, G, G, G, G, G, G) }
+
+  enum SysParams { Mk(G, G) }                            -- log_blowup, cap_height
+
+  -- `Option`s as dedicated non-generic enums (unambiguous constructors).
+  enum OptCommit { NoCommit, SomeCommit(MerkleCap) }
+  enum OptIdx { NoIdx, SomeIdx(G) }
+
+  -- commitment_parameters, circuits, preprocessed_commit, preprocessed_indices.
+  enum Sys { Mk(SysParams, List‹SysCircuit›, OptCommit, List‹OptIdx›) }
+
+  -- ==========================================================================
+  -- Byte primitives specific to the VK format.
+  -- ==========================================================================
+
+  -- A `u32` enum tag: 4 little-endian bytes folded into a field element.
+  fn read_u32(stream: ByteStream) -> (G, ByteStream) {
+    let (b0, s0) = read_u8(stream);
+    let (b1, s1) = read_u8(s0);
+    let (b2, s2) = read_u8(s1);
+    let (b3, s3) = read_u8(s2);
+    (to_field(b0) + 0x100 * to_field(b1) + 0x10000 * to_field(b2)
+       + 0x1000000 * to_field(b3), s3)
+  }
+
+  -- A raw `u64` Goldilocks constant, canonicalized into non-native Goldilocks
+  -- bytes (`gl_reduce`) so it can feed the byte-emulated composition arithmetic.
+  fn read_field(stream: ByteStream) -> ([U8; 8], ByteStream) {
+    let (u, s) = read_u64(stream);
+    (gl_reduce(u), s)
+  }
+
+  -- ==========================================================================
+  -- Recursive readers (mirror `vk_codec` reader-by-reader).
+  -- ==========================================================================
+
+  fn read_sys_entry(stream: ByteStream) -> (SysEntry, ByteStream) {
+    let (tag, s) = read_u32(stream);
+    match tag {
+      0 => let (o, s1) = read_count(s); (SysEntry.Preprocessed(o), s1),
+      1 => let (o, s1) = read_count(s); (SysEntry.Main(o), s1),
+      2 => let (o, s1) = read_count(s); (SysEntry.Stage2(o), s1),
+      3 => (SysEntry.Public, s),
+      4 => (SysEntry.Stage2Public, s),
+      _ => (SysEntry.Challenge, s),
+    }
+  }
+
+  fn read_sym_expr(stream: ByteStream) -> (SymExpr, ByteStream) {
+    let (tag, s) = read_u32(stream);
+    match tag {
+      0 =>
+        let (e, s1) = read_sys_entry(s);
+        let (i, s2) = read_count(s1);
+        (SymExpr.Var(e, i), s2),
+      1 => (SymExpr.IsFirstRow, s),
+      2 => (SymExpr.IsLastRow, s),
+      3 => (SymExpr.IsTransition, s),
+      4 => let (c, s1) = read_field(s); (SymExpr.Const(c), s1),
+      5 =>
+        let (x, s1) = read_sym_expr(s);
+        let (y, s2) = read_sym_expr(s1);
+        let (d, s3) = read_count(s2);
+        (SymExpr.Add(store(x), store(y), d), s3),
+      6 =>
+        let (x, s1) = read_sym_expr(s);
+        let (y, s2) = read_sym_expr(s1);
+        let (d, s3) = read_count(s2);
+        (SymExpr.Sub(store(x), store(y), d), s3),
+      7 =>
+        let (x, s1) = read_sym_expr(s);
+        let (d, s2) = read_count(s1);
+        (SymExpr.Neg(store(x), d), s2),
+      _ =>
+        let (x, s1) = read_sym_expr(s);
+        let (y, s2) = read_sym_expr(s1);
+        let (d, s3) = read_count(s2);
+        (SymExpr.Mul(store(x), store(y), d), s3),
+    }
+  }
+
+  fn read_sym_exprs(stream: ByteStream) -> (List‹SymExpr›, ByteStream) {
+    let (n, s) = read_count(stream);
+    read_sym_exprs_n(s, n)
+  }
+  fn read_sym_exprs_n(stream: ByteStream, n: G) -> (List‹SymExpr›, ByteStream) {
+    match n {
+      0 => (store(ListNode.Nil), stream),
+      _ =>
+        let (x, s) = read_sym_expr(stream);
+        let (rest, s2) = read_sym_exprs_n(s, n - 1);
+        (store(ListNode.Cons(x, rest)), s2),
+    }
+  }
+
+  fn read_sys_lookup(stream: ByteStream) -> (SysLookup, ByteStream) {
+    let (m, s) = read_sym_expr(stream);
+    let (args, s2) = read_sym_exprs(s);
+    (SysLookup.Mk(m, args), s2)
+  }
+  fn read_sys_lookups(stream: ByteStream) -> (List‹SysLookup›, ByteStream) {
+    let (n, s) = read_count(stream);
+    read_sys_lookups_n(s, n)
+  }
+  fn read_sys_lookups_n(stream: ByteStream, n: G) -> (List‹SysLookup›, ByteStream) {
+    match n {
+      0 => (store(ListNode.Nil), stream),
+      _ =>
+        let (x, s) = read_sys_lookup(stream);
+        let (rest, s2) = read_sys_lookups_n(s, n - 1);
+        (store(ListNode.Cons(x, rest)), s2),
+    }
+  }
+
+  fn read_sys_constraints(stream: ByteStream) -> (SysConstraints, ByteStream) {
+    let (zeros, s) = read_sym_exprs(stream);
+    let (sel_start, s1) = read_count(s);
+    let (sel_end, s2) = read_count(s1);
+    let (width, s3) = read_count(s2);
+    (SysConstraints.Mk(zeros, sel_start, sel_end, width), s3)
+  }
+
+  fn read_sys_air(stream: ByteStream) -> (SysAir, ByteStream) {
+    let (tag, s) = read_u32(stream);
+    match tag {
+      0 => let (c, s1) = read_sys_constraints(s); (SysAir.Function(c), s1),
+      1 => let (w, s1) = read_count(s); (SysAir.Memory(SysMemory.Mk(w)), s1),
+      2 => (SysAir.Bytes1, s),
+      _ => (SysAir.Bytes2, s),
+    }
+  }
+
+  fn read_sys_lookupair(stream: ByteStream) -> (SysLookupAir, ByteStream) {
+    let (inner, s) = read_sys_air(stream);
+    let (lookups, s1) = read_sys_lookups(s);
+    (SysLookupAir.Mk(inner, lookups), s1)
+  }
+
+  fn read_sys_circuit(stream: ByteStream) -> (SysCircuit, ByteStream) {
+    let (air, s) = read_sys_lookupair(stream);
+    let (cc, s1) = read_count(s);
+    let (md, s2) = read_count(s1);
+    let (ph, s3) = read_count(s2);
+    let (pw, s4) = read_count(s3);
+    let (w1, s5) = read_count(s4);
+    let (w2, s6) = read_count(s5);
+    (SysCircuit.Mk(air, cc, md, ph, pw, w1, w2), s6)
+  }
+  fn read_sys_circuits(stream: ByteStream) -> (List‹SysCircuit›, ByteStream) {
+    let (n, s) = read_count(stream);
+    read_sys_circuits_n(s, n)
+  }
+  fn read_sys_circuits_n(stream: ByteStream, n: G) -> (List‹SysCircuit›, ByteStream) {
+    match n {
+      0 => (store(ListNode.Nil), stream),
+      _ =>
+        let (x, s) = read_sys_circuit(stream);
+        let (rest, s2) = read_sys_circuits_n(s, n - 1);
+        (store(ListNode.Cons(x, rest)), s2),
+    }
+  }
+
+  -- `Option` tag is a single byte (bincode special-cases Option).
+  fn read_opt_commit(stream: ByteStream) -> (OptCommit, ByteStream) {
+    let (tag, s) = read_u8(stream);
+    match tag {
+      0 => (OptCommit.NoCommit, s),
+      _ => let (c, s1) = read_merkle_cap(s); (OptCommit.SomeCommit(c), s1),
+    }
+  }
+  fn read_opt_idx(stream: ByteStream) -> (OptIdx, ByteStream) {
+    let (tag, s) = read_u8(stream);
+    match tag {
+      0 => (OptIdx.NoIdx, s),
+      _ => let (i, s1) = read_count(s); (OptIdx.SomeIdx(i), s1),
+    }
+  }
+  fn read_opt_idx_list(stream: ByteStream) -> (List‹OptIdx›, ByteStream) {
+    let (n, s) = read_count(stream);
+    read_opt_idx_list_n(s, n)
+  }
+  fn read_opt_idx_list_n(stream: ByteStream, n: G) -> (List‹OptIdx›, ByteStream) {
+    match n {
+      0 => (store(ListNode.Nil), stream),
+      _ =>
+        let (x, s) = read_opt_idx(stream);
+        let (rest, s2) = read_opt_idx_list_n(s, n - 1);
+        (store(ListNode.Cons(x, rest)), s2),
+    }
+  }
+
+  fn read_sys_params(stream: ByteStream) -> (SysParams, ByteStream) {
+    let (log_blowup, s) = read_count(stream);
+    let (cap_height, s1) = read_count(s);
+    (SysParams.Mk(log_blowup, cap_height), s1)
+  }
+
+  -- Full `System<AiurCircuit>`.
+  fn read_system(stream: ByteStream) -> (Sys, ByteStream) {
+    let (params, s) = read_sys_params(stream);
+    let (circuits, s1) = read_sys_circuits(s);
+    let (commit, s2) = read_opt_commit(s1);
+    let (indices, s3) = read_opt_idx_list(s2);
+    (Sys.Mk(params, circuits, commit, indices), s3)
+  }
+⟧
+
+end MultiStark
+
+end
diff --git a/Ix/MultiStark/Tests.lean b/Ix/MultiStark/Tests.lean
new file mode 100644
index 00000000..80148123
--- /dev/null
+++ b/Ix/MultiStark/Tests.lean
@@ -0,0 +1,294 @@
+module
+public import Ix.Aiur.Meta
+
+/-!
+# Self-tests for the Multi-STARK recursive verifier
+
+The `pub fn *_test` entrypoints validating the verifier's primitives against
+Rust reference values (`multi-stark/src/types.rs` test outputs). They live in a
+SEPARATE toplevel fragment, merged on top of the production verifier toplevel
+(`MultiStark.multiStarkTests` in `Ix/MultiStark.lean`): every `pub fn` adds a
+circuit to the compiled system, so keeping the tests out of
+`MultiStark.multiStark` keeps the production verifier's width free of
+test-only circuits.
+
+Cross-fragment references (e.g. `gl_add`, `mmcs_hash_row`, `fri_fold2`,
+`ch_sample_bits`) resolve after the merge — the merged toplevel has a flat
+namespace.
+-/
+
+public section
+
+namespace MultiStark
+
+def tests := ⟦
+  -- ==========================================================================
+  -- Non-native Goldilocks byte arithmetic (vs `gl_ops_ref`).
+  -- ==========================================================================
+  fn assert_g8(x: Goldilocks, e: Goldilocks) -> G {
+    assert_eq!(to_field(x[0]), to_field(e[0]));
+    assert_eq!(to_field(x[1]), to_field(e[1]));
+    assert_eq!(to_field(x[2]), to_field(e[2]));
+    assert_eq!(to_field(x[3]), to_field(e[3]));
+    assert_eq!(to_field(x[4]), to_field(e[4]));
+    assert_eq!(to_field(x[5]), to_field(e[5]));
+    assert_eq!(to_field(x[6]), to_field(e[6]));
+    assert_eq!(to_field(x[7]), to_field(e[7]));
+    1
+  }
+  pub fn gl_addsub_test() -> G {
+    let a = [16u8, 50u8, 84u8, 118u8, 152u8, 186u8, 220u8, 254u8]; -- 0xFEDCBA9876543210
+    let b = [240u8, 222u8, 188u8, 154u8, 120u8, 86u8, 52u8, 18u8]; -- 0x123456789ABCDEF0
+    assert_eq!(assert_g8(gl_add(a, b), [255u8, 16u8, 17u8, 17u8, 18u8, 17u8, 17u8, 17u8]), 1);
+    assert_eq!(assert_g8(gl_sub(a, b), [32u8, 83u8, 151u8, 219u8, 31u8, 100u8, 168u8, 236u8]), 1);
+    assert_eq!(assert_g8(gl_sub(b, a), [225u8, 172u8, 104u8, 36u8, 223u8, 155u8, 87u8, 19u8]), 1);
+    -- edge: (p-1) + 5 ≡ 4 ; 5 - (p-1) ≡ 6
+    let pm1 = [0u8, 0u8, 0u8, 0u8, 255u8, 255u8, 255u8, 255u8]; -- 0xFFFFFFFF00000000
+    let five = [5u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8];
+    assert_eq!(assert_g8(gl_add(pm1, five), [4u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8]), 1);
+    assert_eq!(assert_g8(gl_sub(five, pm1), [6u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8]), 1);
+    1
+  }
+
+  fn assert_eg(x: ExtGoldilocks, e0: Goldilocks, e1: Goldilocks) -> G {
+    assert_eq!(assert_g8(x[0], e0), 1);
+    assert_eq!(assert_g8(x[1], e1), 1);
+    1
+  }
+  pub fn gl_muldiv_test() -> G {
+    let a = [16u8, 50u8, 84u8, 118u8, 152u8, 186u8, 220u8, 254u8]; -- 0xFEDCBA9876543210
+    let b = [240u8, 222u8, 188u8, 154u8, 120u8, 86u8, 52u8, 18u8]; -- 0x123456789ABCDEF0
+    assert_eq!(assert_g8(gl_mul(a, b), [212u8, 186u8, 123u8, 108u8, 31u8, 253u8, 234u8, 250u8]), 1);
+    assert_eq!(assert_g8(gl_inverse(a), [97u8, 29u8, 109u8, 46u8, 183u8, 100u8, 8u8, 102u8]), 1);
+    assert_eq!(assert_g8(gl_div(a, b), [63u8, 59u8, 61u8, 54u8, 46u8, 255u8, 29u8, 186u8]), 1);
+    -- edge: (p-1)·5 ≡ p-5
+    let pm1 = [0u8, 0u8, 0u8, 0u8, 255u8, 255u8, 255u8, 255u8];
+    let five = [5u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8];
+    assert_eq!(assert_g8(gl_mul(pm1, five), [252u8, 255u8, 255u8, 255u8, 254u8, 255u8, 255u8, 255u8]), 1);
+    -- a·a⁻¹ = 1 and b·b⁻¹ = 1
+    assert_eq!(assert_g8(gl_mul(a, gl_inverse(a)), gl_one()), 1);
+    assert_eq!(assert_g8(gl_mul(b, gl_inverse(b)), gl_one()), 1);
+    1
+  }
+  pub fn eg_ops_test() -> G {
+    -- e0 = (0xFEDCBA9876543210, 0x0123456789ABCDEF), e1 = (0x1111111122222222, 0x3333333344444444)
+    let e0 = [[16u8, 50u8, 84u8, 118u8, 152u8, 186u8, 220u8, 254u8],
+              [239u8, 205u8, 171u8, 137u8, 103u8, 69u8, 35u8, 1u8]];
+    let e1 = [[34u8, 34u8, 34u8, 34u8, 17u8, 17u8, 17u8, 17u8],
+              [68u8, 68u8, 68u8, 68u8, 51u8, 51u8, 51u8, 51u8]];
+    assert_eq!(assert_eg(eg_add(e0, e1),
+      [49u8, 84u8, 118u8, 152u8, 170u8, 203u8, 237u8, 15u8],
+      [51u8, 18u8, 240u8, 205u8, 154u8, 120u8, 86u8, 52u8]), 1);
+    assert_eq!(assert_eg(eg_mul(e0, e1),
+      [10u8, 238u8, 162u8, 36u8, 224u8, 127u8, 182u8, 134u8],
+      [215u8, 234u8, 152u8, 224u8, 219u8, 254u8, 32u8, 67u8]), 1);
+    assert_eq!(assert_eg(eg_inverse(e0),
+      [221u8, 238u8, 29u8, 131u8, 179u8, 89u8, 214u8, 216u8],
+      [114u8, 99u8, 206u8, 108u8, 15u8, 88u8, 161u8, 246u8]), 1);
+    assert_eq!(assert_eg(eg_div(e0, e1),
+      [42u8, 59u8, 64u8, 77u8, 226u8, 214u8, 95u8, 63u8],
+      [200u8, 46u8, 148u8, 147u8, 124u8, 180u8, 248u8, 140u8]), 1);
+    -- e0 · e0⁻¹ = 1
+    assert_eq!(assert_eg(eg_mul(e0, eg_inverse(e0)), gl_one(), gl_zero()), 1);
+    1
+  }
+
+  -- ==========================================================================
+  -- Challenger (vs `pcs_challenger_ref` / `pcs_challenger4_ref`).
+  -- ==========================================================================
+
+  -- Self-test: `sample_bits(20)` after observing the single `Val`
+  -- `0x0102030405060708` (8 LE bytes) must equal the reference `799146`
+  -- (`pcs_challenger_ref` in `multi-stark/src/types.rs`).
+  pub fn sample_bits_test() -> G {
+    let input = store(ListNode.Cons(8u8, store(ListNode.Cons(7u8, store(ListNode.Cons(6u8,
+      store(ListNode.Cons(5u8, store(ListNode.Cons(4u8, store(ListNode.Cons(3u8,
+      store(ListNode.Cons(2u8, store(ListNode.Cons(1u8, store(ListNode.Nil)))))))))))))))));
+    let (bits, _i, _o) = ch_sample_bits(input, store(ListNode.Nil), 20);
+    assert_eq!(bits_to_num(bits), 799146);
+    1
+  }
+
+  -- Self-test: replay the synthetic PCS challenger sequence from
+  -- `pcs_challenger4_ref` (multi-stark/src/types.rs) and check every sampled
+  -- challenge against the reference. Decisively exercises the two consecutive
+  -- ext samples (α_pcs then α_fri) sharing one hash `output` stream.
+  pub fn pcs_challenger4_test() -> G {
+    -- post-ζ input buffer = observe V0 then V1 (forward / observation order).
+    let v0 = [8u8, 7u8, 6u8, 5u8, 4u8, 3u8, 2u8, 1u8];     -- 0x0102030405060708
+    let v1 = [136u8, 119u8, 102u8, 85u8, 68u8, 51u8, 34u8, 17u8]; -- 0x1122334455667788
+    let input = snoc_b8(snoc_b8(store(ListNode.Nil), v0), v1);
+    -- α_pcs (output empty ⇒ flush), then α_fri (CONSECUTIVE ⇒ thread output).
+    let (apcs, input, o1) = pcs_sample_ext(input, store(ListNode.Nil));
+    let (afri, input, o2) = pcs_sample_ext(input, o1);
+    assert_eq!(limb_to_field(apcs[0]), 2882912772410685996);
+    assert_eq!(limb_to_field(apcs[1]), 910933442133595775);
+    assert_eq!(limb_to_field(afri[0]), 14440140149289897216);
+    assert_eq!(limb_to_field(afri[1]), 8092267645441512944);
+    -- observe commit (clears output), sample β.
+    let v2 = [239u8, 190u8, 173u8, 222u8, 0u8, 0u8, 0u8, 0u8]; -- 0x00000000deadbeef
+    let (input, _oc) = ch_observe_val(input, v2);
+    let (beta, input, _ob) = pcs_sample_ext(input, store(ListNode.Nil));
+    assert_eq!(limb_to_field(beta[0]), 10456048119516576995);
+    assert_eq!(limb_to_field(beta[1]), 3173538015651228593);
+    -- observe final_poly coeff + log_arity (each a Val), then sample the index.
+    let v3 = [4u8, 3u8, 2u8, 1u8, 13u8, 12u8, 11u8, 10u8]; -- 0x0a0b0c0d01020304
+    let v4 = [2u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8];     -- 0x0000000000000002
+    let (input, _o3) = ch_observe_val(input, v3);
+    let (input, _o4) = ch_observe_val(input, v4);
+    let (bits, _bi, _bo) = ch_sample_bits(input, store(ListNode.Nil), 20);
+    assert_eq!(bits_to_num(bits), 336138);
+    1
+  }
+
+  -- ==========================================================================
+  -- FRI fold + reduced openings (vs `fri_fold_ref` / `ro_fold_ref`).
+  -- ==========================================================================
+
+  -- Self-test: arity-2 fold at index 5, log_height 3 vs the `fri_fold_ref`
+  -- reference (computed by the real `TwoAdicFriFolding::fold_row`).
+  pub fn fri_fold_test() -> G {
+    let index_bits = store(ListNode.Cons(1, store(ListNode.Cons(0,
+                       store(ListNode.Cons(1, store(ListNode.Nil)))))));
+    let e0 = [[17u8, 17u8, 17u8, 17u8, 17u8, 17u8, 17u8, 17u8],
+              [34u8, 34u8, 34u8, 34u8, 34u8, 34u8, 34u8, 34u8]];
+    let e1 = [[51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8],
+              [68u8, 68u8, 68u8, 68u8, 68u8, 68u8, 68u8, 68u8]];
+    let beta = [[85u8, 85u8, 85u8, 85u8, 85u8, 85u8, 85u8, 85u8],
+                [102u8, 102u8, 102u8, 102u8, 102u8, 102u8, 102u8, 102u8]];
+    let folded = fri_fold2(index_bits, 3, beta, e0, e1);
+    assert_eq!(limb_to_field(folded[0]), 9349172584842537206);
+    assert_eq!(limb_to_field(folded[1]), 984486879173118962);
+    1
+  }
+
+  -- Self-test vs `ro_fold_ref`: x at index 5 / log_height 3, then accumulate
+  -- (p_z − p_x)/(z − x) over 3 columns with alpha powers.
+  pub fn ro_fold_test() -> G {
+    let index_bits = store(ListNode.Cons(1, store(ListNode.Cons(0,
+                       store(ListNode.Cons(1, store(ListNode.Nil)))))));
+    let x = ro_x(index_bits, 3);
+    assert_eq!(limb_to_field(x), 117440512);
+    let z = [[154u8, 120u8, 86u8, 52u8, 18u8, 0u8, 0u8, 0u8],
+             [1u8, 239u8, 205u8, 171u8, 0u8, 0u8, 0u8, 0u8]];
+    let alpha = [[17u8, 17u8, 17u8, 17u8, 17u8, 17u8, 17u8, 17u8],
+                 [2u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8]];
+    let px0 = [11u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8];
+    let px1 = [22u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8];
+    let px2 = [33u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8];
+    let p_x = store(ListNode.Cons(px0, store(ListNode.Cons(px1,
+                store(ListNode.Cons(px2, store(ListNode.Nil)))))));
+    let pz0 = [[100u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], [1u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8]];
+    let pz1 = [[200u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], [2u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8]];
+    let pz2 = [[44u8, 1u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], [3u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8]];
+    let p_z = store(ListNode.Cons(pz0, store(ListNode.Cons(pz1,
+                store(ListNode.Cons(pz2, store(ListNode.Nil)))))));
+    let q = eg_inverse(eg_sub(z, [x, gl_zero()]));
+    let (ro, _ap) = ro_fold(p_x, p_z, q, alpha, [gl_zero(), gl_zero()], [gl_one(), gl_zero()]);
+    assert_eq!(limb_to_field(ro[0]), 7130765474285082575);
+    assert_eq!(limb_to_field(ro[1]), 12254464995725315436);
+    1
+  }
+
+  -- ==========================================================================
+  -- Keccak MMCS sponge/compression + Merkle verify_batch
+  -- (vs `pcs_ref_values` / `pcs_merkle_ref`). Compares each output lane mod p
+  -- via `limb_to_field` (all reference lanes are canonical, so this is exact).
+  -- ==========================================================================
+
+  -- A small single-byte value as a `U64` (8 LE bytes).
+  fn u64_of(b: U8) -> U64 {
+    [b, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8]
+  }
+  -- `[i+1, i+2, …, n]` as a `List‹U64›`.
+  fn build_range(i: G, n: G) -> List‹U64› {
+    match n - i {
+      0 => store(ListNode.Nil),
+      _ => store(ListNode.Cons(u64_of(u8_from_field_unsafe(i + 1)), build_range(i + 1, n))),
+    }
+  }
+  fn assert_digest(d: Digest, e0: G, e1: G, e2: G, e3: G) -> G {
+    assert_eq!(limb_to_field(d[0]), e0);
+    assert_eq!(limb_to_field(d[1]), e1);
+    assert_eq!(limb_to_field(d[2]), e2);
+    assert_eq!(limb_to_field(d[3]), e3);
+    1
+  }
+  pub fn pcs_hash_test() -> G {
+    -- LEAF3: hash([1,2,3])
+    let d3 = mmcs_hash_row(build_range(0, 3));
+    assert_eq!(assert_digest(d3, 0xc55a6a1beaea9fec, 0xc8f0dbc4c59ec440,
+                                 0xacb1295de9bfe032, 0x445d569d3dfc9543), 1);
+    -- LEAF17: exactly one full block, no extra permute.
+    let d17 = mmcs_hash_row(build_range(0, 17));
+    assert_eq!(assert_digest(d17, 0x388da73622e8fdd5, 0xec687be9c50d2218,
+                                  0x528d145dfe6571af, 0xd2eb808dfba4703c), 1);
+    -- LEAF22: full block + 5-element partial (two permutes), >20 lanes.
+    let d22 = mmcs_hash_row(build_range(0, 22));
+    assert_eq!(assert_digest(d22, 520358013996801752, 12301199992631688477,
+      8732686820159480415, 10883226686987971725), 1);
+    -- LEAF20: full block + 3-element partial (two permutes).
+    let d20 = mmcs_hash_row(build_range(0, 20));
+    assert_eq!(assert_digest(d20, 0xec696847be88d358, 0x202861c67ff4cec8,
+                                  0x88e006a48aaa0661, 0xabaddb9d32ecd024), 1);
+    -- COMPRESS([1,2,3,4],[5,6,7,8])
+    let c = mmcs_compress([u64_of(1u8), u64_of(2u8), u64_of(3u8), u64_of(4u8)],
+                          [u64_of(5u8), u64_of(6u8), u64_of(7u8), u64_of(8u8)]);
+    assert_eq!(assert_digest(c, 0xda1ef0642722b22e, 0x4851efdbdb2a2fd8,
+                                0x37e8ff900ea95d47, 0xa153eee7805376fb), 1);
+    1
+  }
+
+  -- Merkle `verify_batch` self-test against the `pcs_merkle_ref` reference: a
+  -- cap-height-0 tree over 3 matrices of heights 8/4/2 (log-heights 3/2/1),
+  -- opened at index 5 (path bits 1,0,1). Checks the recomputed root matches the
+  -- committed root and that the cap index is 0, then that a tampered opened row
+  -- yields a different (rejected) root.
+  pub fn pcs_merkle_test() -> G {
+    -- opened rows (matrix order): m0 row5, m1 row2, m2 row1.
+    let row0 = store(ListNode.Cons(u64_of(11u8), store(ListNode.Cons(u64_of(12u8), store(ListNode.Nil)))));
+    let row1 = store(ListNode.Cons(u64_of(107u8), store(ListNode.Cons(u64_of(108u8),
+                 store(ListNode.Cons(u64_of(109u8), store(ListNode.Nil)))))));
+    let row2 = store(ListNode.Cons(u64_of(202u8), store(ListNode.Nil)));
+    let rows = store(ListNode.Cons(row0, store(ListNode.Cons(row1,
+                 store(ListNode.Cons(row2, store(ListNode.Nil)))))));
+    -- log-heights and path bits (index 5 = 0b101, LSB first).
+    let lhs = store(ListNode.Cons(3, store(ListNode.Cons(2, store(ListNode.Cons(1, store(ListNode.Nil)))))));
+    let ibits = store(ListNode.Cons(1, store(ListNode.Cons(0, store(ListNode.Cons(1, store(ListNode.Nil)))))));
+    -- authentication path SIB0, SIB1, SIB2 (each a Digest = [U64; 4]).
+    let sib0 = [[9u8, 36u8, 179u8, 127u8, 205u8, 83u8, 105u8, 203u8],
+                [95u8, 229u8, 105u8, 223u8, 113u8, 55u8, 97u8, 122u8],
+                [135u8, 8u8, 65u8, 248u8, 163u8, 163u8, 68u8, 81u8],
+                [9u8, 11u8, 20u8, 209u8, 10u8, 168u8, 151u8, 125u8]];
+    let sib1 = [[227u8, 58u8, 255u8, 213u8, 77u8, 152u8, 42u8, 77u8],
+                [113u8, 86u8, 2u8, 151u8, 97u8, 63u8, 58u8, 45u8],
+                [228u8, 139u8, 228u8, 194u8, 182u8, 115u8, 107u8, 221u8],
+                [248u8, 16u8, 30u8, 93u8, 176u8, 36u8, 205u8, 88u8]];
+    let sib2 = [[236u8, 144u8, 115u8, 218u8, 140u8, 5u8, 86u8, 229u8],
+                [95u8, 186u8, 252u8, 175u8, 21u8, 247u8, 153u8, 25u8],
+                [113u8, 78u8, 92u8, 200u8, 212u8, 175u8, 247u8, 47u8],
+                [78u8, 145u8, 206u8, 54u8, 175u8, 155u8, 165u8, 206u8]];
+    let proof = store(ListNode.Cons(sib0, store(ListNode.Cons(sib1,
+                  store(ListNode.Cons(sib2, store(ListNode.Nil)))))));
+    let (root, capidx) = mmcs_root(rows, lhs, ibits, proof, 3);
+    assert_eq!(capidx, 0);
+    assert_eq!(assert_digest(root, 0x6211b9a1a116a006, 0x435ee98e1504880f,
+                                   0x900c7274b9a215f, 0xf6e3aaac5dcd90bd), 1);
+    -- tamper: perturb m0's first opened value → root must change.
+    let bad0 = store(ListNode.Cons(u64_of(99u8), store(ListNode.Cons(u64_of(12u8), store(ListNode.Nil)))));
+    let bad_rows = store(ListNode.Cons(bad0, store(ListNode.Cons(row1,
+                     store(ListNode.Cons(row2, store(ListNode.Nil)))))));
+    let cap = store(ListNode.Cons([[6u8, 160u8, 22u8, 161u8, 161u8, 185u8, 17u8, 98u8],
+                                   [15u8, 136u8, 4u8, 21u8, 142u8, 233u8, 94u8, 67u8],
+                                   [95u8, 33u8, 154u8, 75u8, 39u8, 199u8, 0u8, 9u8],
+                                   [189u8, 144u8, 205u8, 93u8, 172u8, 170u8, 227u8, 246u8]],
+                    store(ListNode.Nil)));
+    assert_eq!(mmcs_verify(cap, rows, lhs, ibits, proof, 3), 1);
+    assert_eq!(mmcs_verify(cap, bad_rows, lhs, ibits, proof, 3), 0);
+    1
+  }
+⟧
+
+end MultiStark
+
+end
diff --git a/Ix/MultiStark/Verifier.lean b/Ix/MultiStark/Verifier.lean
new file mode 100644
index 00000000..e3207e31
--- /dev/null
+++ b/Ix/MultiStark/Verifier.lean
@@ -0,0 +1,816 @@
+module
+public import Ix.Aiur.Meta
+public import Ix.IxVM.Core
+public import Ix.IxVM.ByteStream
+public import Ix.MultiStark.Deserialize
+public import Ix.MultiStark.Keccak
+public import Ix.MultiStark.Pcs
+public import Ix.MultiStark.SystemDeserialize
+
+/-!
+# Multi-STARK verifier (Aiur)
+
+Reimplementation of `multi-stark/src/verifier.rs` (`System::verify_multiple_claims`)
+over the deserialized `Proof` (`Ix/MultiStark/Deserialize.lean`).
+
+The Rust verifier runs these steps:
+
+1. **Shape check** — proof array dimensions match the system's circuit count and
+   column widths.
+2. **Accumulator balance** — the last intermediate accumulator is zero (all
+   lookup pushes/pulls cancel).
+3. **Fiat-Shamir replay** — reconstruct the Keccak challenger: observe
+   commitments / trace heights / claims, sample (lookup, fingerprint, α, ζ).
+4. **PCS verification** — FRI opening proofs (see `Ix/MultiStark/Pcs.lean`).
+5. **OOD evaluation** — recompute the composition polynomial at ζ and check
+   `composition(ζ) · inv_vanishing(ζ) == quotient(ζ)`.
+
+### Implemented here
+* Step 1 (the system-independent part): the proof is internally consistent —
+  `stage_1`, `stage_2` and `intermediate_accumulators` all have the same length
+  (the circuit count) and it is non-zero.
+* Step 2: accumulator balance — the last `intermediate_accumulator` is the zero
+  extension element.
+* Step 3: the Fiat-Shamir challenger replay (`fiat_shamir`). Prover-faithful:
+  observes the verifying key's preprocessed commitment, the stage_1 commitment,
+  the trace heights, and the public claims (in that order), then samples and
+  re-observes the lookup/fingerprint challenges, observes stage_2, samples α,
+  observes the quotient commitment, and samples ζ — matching
+  `verify_multiple_claims` byte-for-byte.
+* Step 5: the out-of-domain composition/quotient check (`ood_verify`). For each
+  circuit it recomputes `composition(ζ)` by replaying the AIR constraint folder
+  (`VerifierConstraintFolder` + `LookupAir::eval`) over the deserialized
+  symbolic system and the opened values, recomputes `quotient(ζ)` from the
+  opened quotient chunks (barycentric `zps` weights over the split quotient
+  domains), and asserts `composition(ζ) · inv_vanishing(ζ) == quotient(ζ)`.
+  Validated end-to-end against a real factorial proof (`RecursiveVerifier.lean`):
+  the verifier accepts the honest proof and rejects a tampered claim.
+
+### Stubbed / TODO
+* Base-field samples are rejection-sampled (`ch_sample_field`): a raw 8-byte
+  limb in the band `[p, 2⁶⁴)` (probability ≈ 2⁻³²) is discarded and redrawn,
+  consuming challenger bytes exactly as `SerializingChallenger64::sample` does.
+* The PCS opening proof (`pcs_verify`) is still an accept-stub. With the PCS
+  stubbed, this verifier checks every algebraic relation except FRI proximity,
+  so it is not yet fully sound.
+-/
+
+public section
+
+namespace MultiStark
+
+def verifier := ⟦
+  -- An extension element `[c0, c1]` (`= c0 + c1·X`) is zero iff both Goldilocks
+  -- coefficients are zero. (`read_ext` already reduced the limbs mod p.)
+  fn ext_is_zero(e: Ext) -> G {
+    gl_is_zero(e[0]) * gl_is_zero(e[1])
+  }
+
+  -- 1 iff the LAST element of the accumulator list is the zero extension
+  -- element (Rust: `intermediate_accumulators.last() == Some(ExtVal::ZERO)`).
+  -- The empty list returns 0 (there is no last element to balance).
+  fn last_acc_is_zero(accs: List‹Ext›) -> G {
+    match load(accs) {
+      ListNode.Nil => 0,
+      ListNode.Cons(e, rest) =>
+        match load(rest) {
+          ListNode.Nil => ext_is_zero(e),
+          _ => last_acc_is_zero(rest),
+        },
+    }
+  }
+
+  -- ==========================================================================
+  -- Fiat-Shamir challenger: `SerializingChallenger64<Val, HashChallenger<u8,
+  -- Keccak256Hash, 32>>`. The inner byte challenger keeps an `input` buffer; a
+  -- `sample` with empty `output` flushes (`input := output := keccak256(input)`)
+  -- and pops bytes from the END of the hash output. The outer layer serializes
+  -- field elements as 8 little-endian bytes and samples field elements as
+  -- 8-byte little-endian u64s.
+  --
+  -- The challenger is threaded as a pair `(input, output)` of byte lists, where
+  -- `output` is held in pop order (front = next byte = hash byte 31, 30, …).
+  -- ==========================================================================
+
+  -- Cons 8 bytes (LSB-first) of `b` onto `tail` (one byte list segment).
+  fn b8_onto(b: [U8; 8], tail: ByteStream) -> ByteStream {
+    store(ListNode.Cons(b[0], store(ListNode.Cons(b[1], store(ListNode.Cons(b[2],
+    store(ListNode.Cons(b[3], store(ListNode.Cons(b[4], store(ListNode.Cons(b[5],
+    store(ListNode.Cons(b[6], store(ListNode.Cons(b[7], tail))))))))))))))))
+  }
+
+  -- A digest = `[u64; 4]` = 32 bytes (each limb little-endian) onto `tail`.
+  fn digest_onto(d: Digest, tail: ByteStream) -> ByteStream {
+    b8_onto(d[0], b8_onto(d[1], b8_onto(d[2], b8_onto(d[3], tail))))
+  }
+
+  -- A commitment (`MerkleCap` = `Vec<Digest>`): all digest bytes, onto `tail`.
+  fn cap_onto(cap: MerkleCap, tail: ByteStream) -> ByteStream {
+    match load(cap) {
+      ListNode.Nil => tail,
+      ListNode.Cons(d, rest) => digest_onto(d, cap_onto(rest, tail)),
+    }
+  }
+
+  -- Observe `log_degrees`: each is a `Val::from_u8`, i.e. 8 LE bytes `[ld,0,…]`.
+  fn log_degrees_onto(lds: List‹U8›, tail: ByteStream) -> ByteStream {
+    match load(lds) {
+      ListNode.Nil => tail,
+      ListNode.Cons(ld, rest) =>
+        b8_onto([ld, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], log_degrees_onto(rest, tail)),
+    }
+  }
+
+  -- Reverse `l` onto `acc` (used to put a hash output into pop order).
+  fn rev_onto(l: ByteStream, acc: ByteStream) -> ByteStream {
+    match load(l) {
+      ListNode.Nil => acc,
+      ListNode.Cons(b, rest) => rev_onto(rest, store(ListNode.Cons(b, acc))),
+    }
+  }
+
+  -- Sample one byte. If `output` is empty, flush: hash the `input` buffer, set
+  -- the new input to the 32 hash bytes, and refill `output` (in pop order).
+  fn ch_sample_byte(input: ByteStream, output: ByteStream) -> (U8, ByteStream, ByteStream) {
+    match load(output) {
+      ListNode.Cons(b, rest) => (b, input, rest),
+      ListNode.Nil =>
+        let h = keccak256(input);
+        let fwd = digest_onto(h, store(ListNode.Nil));
+        let rev = rev_onto(fwd, store(ListNode.Nil));
+        let &ListNode.Cons(b, rest) = rev;
+        (b, fwd, rest),
+    }
+  }
+
+  -- Sample 8 bytes = `sample_array()` (for one base-field element, LE).
+  fn ch_sample8(input: ByteStream, output: ByteStream) -> ([U8; 8], ByteStream, ByteStream) {
+    let (b0, i0, o0) = ch_sample_byte(input, output);
+    let (b1, i1, o1) = ch_sample_byte(i0, o0);
+    let (b2, i2, o2) = ch_sample_byte(i1, o1);
+    let (b3, i3, o3) = ch_sample_byte(i2, o2);
+    let (b4, i4, o4) = ch_sample_byte(i3, o3);
+    let (b5, i5, o5) = ch_sample_byte(i4, o4);
+    let (b6, i6, o6) = ch_sample_byte(i5, o5);
+    let (b7, i7, o7) = ch_sample_byte(i6, o6);
+    ([b0, b1, b2, b3, b4, b5, b6, b7], i7, o7)
+  }
+
+  -- Sample one base-field element with REJECTION SAMPLING, mirroring
+  -- `SerializingChallenger64::sample`'s inner loop: draw 8 bytes as a LE u64
+  -- (the `log2_ceil(p) = 64` mask is a no-op for Goldilocks); if the raw value
+  -- is ≥ p (probability ≈ 2⁻³²), DISCARD it and draw the next 8 bytes — a
+  -- rejected draw consumes challenger bytes, shifting every later sample,
+  -- exactly as in the reference. `raw < p` ⟺ `sub8(raw, p)` borrows. The
+  -- accepted limb is canonical (< p) by construction.
+  fn ch_sample_field(input: ByteStream, output: ByteStream) -> ([U8; 8], ByteStream, ByteStream) {
+    let (raw, i1, o1) = ch_sample8(input, output);
+    let (_d, borrow) = sub8(raw, gl_p());
+    match borrow {
+      1 => (raw, i1, o1),
+      _ => ch_sample_field(i1, o1),
+    }
+  }
+
+  -- Sample a degree-2 extension element: two base samples (`from_basis_*`),
+  -- each rejection-sampled, returning their 8-byte LE limbs (canonical, but
+  -- also re-observable as raw bytes) and the threaded challenger.
+  fn ch_sample_ext(input: ByteStream, output: ByteStream) -> ([U8; 8], [U8; 8], ByteStream, ByteStream) {
+    let (c0, i0, o0) = ch_sample_field(input, output);
+    let (c1, i1, o1) = ch_sample_field(i0, o0);
+    (c0, c1, i1, o1)
+  }
+
+  -- `sample_bits(n)` (FRI query index). `SerializingChallenger64::sample_bits`
+  -- reads one 8-byte sample as a little-endian u64 and masks the low `n` bits.
+  -- We return the low `n` bits as a list (LSB first = the leaf→root Merkle/FRI
+  -- path), built from the 8 sampled bytes' bit decompositions (reusing keccak's
+  -- `cons8`), truncated to `n`.
+  fn sample8_bits(bytes: [U8; 8]) -> List‹G› {
+    cons8(u8_bit_decomposition(bytes[0]),
+    cons8(u8_bit_decomposition(bytes[1]),
+    cons8(u8_bit_decomposition(bytes[2]),
+    cons8(u8_bit_decomposition(bytes[3]),
+    cons8(u8_bit_decomposition(bytes[4]),
+    cons8(u8_bit_decomposition(bytes[5]),
+    cons8(u8_bit_decomposition(bytes[6]),
+    cons8(u8_bit_decomposition(bytes[7]), store(ListNode.Nil)))))))))
+  }
+  fn take_bits(bits: List‹G›, n: G) -> List‹G› {
+    match n {
+      0 => store(ListNode.Nil),
+      _ =>
+        let &ListNode.Cons(b, rest) = bits;
+        store(ListNode.Cons(b, take_bits(rest, n - 1))),
+    }
+  }
+  fn ch_sample_bits(input: ByteStream, output: ByteStream, n: G)
+      -> (List‹G›, ByteStream, ByteStream) {
+    let (bytes, i1, o1) = ch_sample8(input, output);
+    (take_bits(sample8_bits(bytes), n), i1, o1)
+  }
+
+  -- Append (observe) 8 little-endian bytes of `b` at the END of the challenger
+  -- input buffer. The transcript is held front-to-back (front = first observed =
+  -- first hashed, matching `keccak256`'s absorption order), so an observation
+  -- appends — `b8_onto` PREPENDS, hence the `list_concat`.
+  fn snoc_b8(input: ByteStream, b: [U8; 8]) -> ByteStream {
+    list_concat(input, b8_onto(b, store(ListNode.Nil)))
+  }
+  -- Append (observe) a commitment (`MerkleCap`) at the end of the buffer.
+  fn snoc_cap(input: ByteStream, cap: MerkleCap) -> ByteStream {
+    list_concat(input, cap_onto(cap, store(ListNode.Nil)))
+  }
+
+  -- ==========================================================================
+  -- PCS challenger continuation (Phase 4): the post-ζ transcript that
+  -- `two_adic_pcs::verify` + `verify_fri` replay. Unlike `fiat_shamir` — where
+  -- every sample is followed by an observe (so each sample re-flushes from an
+  -- empty `output`) — the PCS phase has *consecutive* samples with no observe
+  -- between (the PCS batch challenge α, then immediately the FRI batch challenge
+  -- α). So both challenger buffers must be threaded: `output` carries the
+  -- leftover hash bytes from one sample into the next instead of re-flushing.
+  -- ==========================================================================
+
+  -- Observe one `Val` (8 LE bytes): append to `input`, CLEAR `output` (any
+  -- leftover sampled bytes are discarded), per `HashChallenger::observe`.
+  fn ch_observe_val(input: ByteStream, v: U64) -> (ByteStream, ByteStream) {
+    (snoc_b8(input, v), store(ListNode.Nil))
+  }
+
+  -- Sample a degree-2 extension element, threading BOTH challenger buffers so a
+  -- following consecutive sample continues from the same hash `output` stream
+  -- (no re-flush). Limbs are rejection-sampled (`ch_sample_field`), so they are
+  -- canonical; the `gl_reduce` is a no-op kept for type/intent clarity.
+  fn pcs_sample_ext(input: ByteStream, output: ByteStream)
+      -> (Ext, ByteStream, ByteStream) {
+    let (c0, c1, i1, o1) = ch_sample_ext(input, output);
+    ([gl_reduce(c0), gl_reduce(c1)], i1, o1)
+  }
+
+  -- Append a claim's values (each `Val` as 8 LE bytes) onto `tail`, in order.
+  fn claim_vals_onto(vals: List‹U64›, tail: ByteStream) -> ByteStream {
+    match load(vals) {
+      ListNode.Nil => tail,
+      ListNode.Cons(v, rest) => b8_onto(v, claim_vals_onto(rest, tail)),
+    }
+  }
+  -- Append every claim's values (`challenger.observe_slice(claim)` per claim).
+  fn claims_onto(claims: List‹List‹U64››, tail: ByteStream) -> ByteStream {
+    match load(claims) {
+      ListNode.Nil => tail,
+      ListNode.Cons(c, rest) => claim_vals_onto(c, claims_onto(rest, tail)),
+    }
+  }
+
+  -- The preprocessed commitment cap from the verifying key, or an empty cap
+  -- (observes nothing) when there is none.
+  fn opt_commit_cap(commit: OptCommit) -> MerkleCap {
+    match commit {
+      OptCommit.NoCommit => store(ListNode.Nil),
+      OptCommit.SomeCommit(c) => c,
+    }
+  }
+
+  -- Replay the verifier transcript and derive the four challenges
+  -- `(lookup, fingerprint, alpha, zeta)`. Mirrors `verify_multiple_claims`'s
+  -- challenger sequence exactly:
+  --   observe preprocessed_commit (if any) → stage_1 → log_degrees → claims;
+  --   sample lookup, observe it; sample fingerprint, observe it;
+  --   observe stage_2; sample α; observe quotient; sample ζ.
+  -- `observe` clears the challenger's output buffer, and every sample here is
+  -- preceded by an observe, so each `ch_sample_ext` re-flushes from an empty
+  -- output (hence the `store(ListNode.Nil)` output argument each time).
+  -- Every sample is rejection-sampled (`ch_sample_field` inside
+  -- `ch_sample_ext`), so a limb in the band `[p, 2⁶⁴)` is redrawn exactly as in
+  -- the reference challenger, and the limbs observed back are canonical.
+  -- Also returns the post-ζ challenger `input` buffer, which the PCS phase
+  -- (Phase 4+) continues observing into. The leftover `output` after the ζ
+  -- sample is discarded — the next challenger op is an observe (of the opened
+  -- values), which clears `output` anyway.
+  fn fiat_shamir(prep: MerkleCap, s1: MerkleCap, s2: MerkleCap, q: MerkleCap,
+      lds: List‹U8›, claims: List‹List‹U64››) -> (Ext, Ext, Ext, Ext, ByteStream) {
+    -- Initial transcript, front-to-back: prep, stage_1, log_degrees, claims.
+    -- Built inner-to-outer with the prepend helpers so the result is in
+    -- forward (observation) order.
+    let input = claims_onto(claims, store(ListNode.Nil));
+    let input = log_degrees_onto(lds, input);
+    let input = cap_onto(s1, input);
+    let input = cap_onto(prep, input);
+    -- sample lookup challenge, then observe it back (append)
+    let (l0, l1, input, _ol) = ch_sample_ext(input, store(ListNode.Nil));
+    let input = snoc_b8(snoc_b8(input, l0), l1);
+    -- sample fingerprint challenge, then observe it back
+    let (f0, f1, input, _of) = ch_sample_ext(input, store(ListNode.Nil));
+    let input = snoc_b8(snoc_b8(input, f0), f1);
+    -- observe stage_2 commitment
+    let input = snoc_cap(input, s2);
+    -- sample constraint challenge α (not observed)
+    let (a0, a1, input, _oa) = ch_sample_ext(input, store(ListNode.Nil));
+    -- observe quotient commitment
+    let input = snoc_cap(input, q);
+    -- sample out-of-domain point ζ; keep the resulting `input` for the PCS phase
+    let (z0, z1, zinput, _oz) = ch_sample_ext(input, store(ListNode.Nil));
+    ([gl_reduce(l0), gl_reduce(l1)],
+     [gl_reduce(f0), gl_reduce(f1)],
+     [gl_reduce(a0), gl_reduce(a1)],
+     [gl_reduce(z0), gl_reduce(z1)],
+     zinput)
+  }
+
+  -- ==========================================================================
+  -- OOD evaluation domain math (`TwoAdicMultiplicativeCoset`, Goldilocks).
+  -- The trace domain for a circuit of size 2^L is the order-2^L subgroup H
+  -- (shift = 1) generated by `two_adic_gen(L)`.
+  -- ==========================================================================
+
+  -- e^(2^k) in the extension field.
+  fn ext_exp_pow2(e: Ext, k: G) -> Ext {
+    match k {
+      0 => e,
+      _ => ext_exp_pow2(eg_mul(e, e), k - 1),
+    }
+  }
+
+  -- `two_adic_generator(bits)` — a primitive 2^bits root of unity in Goldilocks
+  -- (`Plonky3/goldilocks/src/goldilocks.rs::TWO_ADIC_GENERATORS`).
+  fn two_adic_gen(bits: G) -> [U8; 8] {
+    match bits {
+      0  => [1u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8],
+      1  => [0u8, 0u8, 0u8, 0u8, 255u8, 255u8, 255u8, 255u8],
+      2  => [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 1u8, 0u8],
+      3  => [1u8, 0u8, 0u8, 255u8, 254u8, 255u8, 255u8, 255u8],
+      4  => [1u8, 0u8, 0u8, 0u8, 255u8, 255u8, 255u8, 239u8],
+      5  => [0u8, 192u8, 255u8, 255u8, 255u8, 63u8, 0u8, 0u8],
+      6  => [0u8, 0u8, 0u8, 0u8, 128u8, 0u8, 0u8, 0u8],
+      7  => [1u8, 0u8, 0u8, 8u8, 255u8, 7u8, 0u8, 248u8],
+      8  => [102u8, 169u8, 12u8, 230u8, 60u8, 20u8, 121u8, 191u8],
+      9  => [78u8, 31u8, 65u8, 92u8, 42u8, 208u8, 5u8, 25u8],
+      10 => [114u8, 217u8, 254u8, 139u8, 215u8, 42u8, 143u8, 157u8],
+      11 => [207u8, 200u8, 161u8, 29u8, 128u8, 180u8, 83u8, 6u8],
+      12 => [182u8, 252u8, 157u8, 149u8, 153u8, 81u8, 195u8, 242u8],
+      13 => [151u8, 121u8, 209u8, 53u8, 35u8, 239u8, 68u8, 21u8],
+      14 => [226u8, 161u8, 187u8, 16u8, 147u8, 9u8, 238u8, 224u8],
+      15 => [172u8, 186u8, 6u8, 35u8, 254u8, 207u8, 178u8, 246u8],
+      16 => [14u8, 69u8, 121u8, 191u8, 48u8, 150u8, 223u8, 84u8],
+      17 => [14u8, 138u8, 61u8, 170u8, 232u8, 166u8, 208u8, 171u8],
+      18 => [172u8, 190u8, 249u8, 5u8, 123u8, 26u8, 40u8, 129u8],
+      19 => [2u8, 51u8, 170u8, 140u8, 107u8, 28u8, 212u8, 251u8],
+      20 => [109u8, 231u8, 147u8, 94u8, 205u8, 46u8, 186u8, 48u8],
+      21 => [84u8, 38u8, 50u8, 50u8, 245u8, 174u8, 2u8, 245u8],
+      22 => [181u8, 70u8, 114u8, 230u8, 173u8, 24u8, 42u8, 75u8],
+      23 => [139u8, 201u8, 251u8, 54u8, 19u8, 90u8, 157u8, 234u8],
+      24 => [113u8, 225u8, 7u8, 195u8, 49u8, 204u8, 205u8, 134u8],
+      25 => [216u8, 239u8, 207u8, 110u8, 151u8, 245u8, 186u8, 75u8],
+      26 => [134u8, 226u8, 214u8, 120u8, 91u8, 208u8, 65u8, 237u8],
+      27 => [29u8, 23u8, 90u8, 145u8, 216u8, 141u8, 215u8, 16u8],
+      28 => [133u8, 68u8, 74u8, 0u8, 0u8, 149u8, 4u8, 89u8],
+      29 => [102u8, 38u8, 109u8, 164u8, 59u8, 201u8, 168u8, 223u8],
+      30 => [69u8, 8u8, 106u8, 184u8, 9u8, 208u8, 155u8, 126u8],
+      31 => [89u8, 230u8, 136u8, 85u8, 117u8, 127u8, 10u8, 64u8],
+      _  => [140u8, 135u8, 88u8, 218u8, 220u8, 41u8, 86u8, 24u8],
+    }
+  }
+
+  -- Vanishing polynomial of the trace domain (shift = 1, size 2^L) at point ζ:
+  -- `Z_H(ζ) = ζ^(2^L) - 1`.
+  fn trace_vanishing(zeta: Ext, l: G) -> Ext {
+    eg_sub(ext_exp_pow2(zeta, l), [gl_one(), gl_zero()])
+  }
+
+  -- Lagrange selectors at ζ for the trace domain (shift = 1), mirroring
+  -- `domain.rs::selectors_at_point`:
+  --   is_first_row   = Z_H(ζ) / (ζ - 1)
+  --   is_last_row    = Z_H(ζ) / (ζ - g⁻¹)
+  --   is_transition  = ζ - g⁻¹
+  --   inv_vanishing  = 1 / Z_H(ζ)
+  -- where g = two_adic_gen(L) is the subgroup generator.
+  fn trace_selectors(zeta: Ext, l: G) -> (Ext, Ext, Ext, Ext) {
+    let zh = trace_vanishing(zeta, l);
+    let ginv = gl_inverse(two_adic_gen(l));
+    let is_first = eg_div(zh, eg_sub(zeta, [gl_one(), gl_zero()]));
+    let is_last = eg_div(zh, eg_sub(zeta, [ginv, gl_zero()]));
+    let is_trans = eg_sub(zeta, [ginv, gl_zero()]);
+    let inv_van = eg_inverse(zh);
+    (is_first, is_last, is_trans, inv_van)
+  }
+
+  -- Structural + accumulator + PCS checks of a deserialized proof (steps 1, 2,
+  -- 4). Fiat-Shamir (step 3) and the OOD check (step 5) live in `ood_verify`,
+  -- which needs the verifying key and the claims.
+  --
+  -- Returns 1 on success; `assert_eq!` aborts the (proof) execution on any
+  -- failed check, exactly as the Rust verifier returns `Err`.
+  fn verify(proof: Proof) -> G {
+    match proof {
+      Proof.Mk(_commitments, accs, _log_degrees, _opening,
+               _quotient, _preprocessed, stage_1, stage_2) =>
+        -- Step 1 (shape, system-independent): the per-round opened-value lists
+        -- and the accumulator list all have the same length = the circuit count.
+        let num_circuits = list_length(accs);
+        -- there must be at least one circuit (Rust: InvalidSystem)
+        assert_eq!(eq_zero(num_circuits), 0);
+        assert_eq!(list_length(stage_1), num_circuits);
+        assert_eq!(list_length(stage_2), num_circuits);
+
+        -- Step 2: accumulator balance — the last accumulator must be zero.
+        assert_eq!(last_acc_is_zero(accs), 1);
+        -- Step 4 (PCS/FRI) now runs inside `ood_verify`, which has the verifying
+        -- key, the challenger continuation, and the opened values it needs.
+        1,
+    }
+  }
+
+  -- ==========================================================================
+  -- Step 5: out-of-domain (OOD) evaluation.
+  --
+  -- Mirrors the per-circuit loop in `verifier.rs::verify_multiple_claims`
+  -- (lines 329-434). For each circuit it recomputes the composition polynomial
+  -- `composition(ζ)` from the opened values by replaying the AIR constraint
+  -- folder (`VerifierConstraintFolder` + `LookupAir::eval`), recomputes the
+  -- quotient `quotient(ζ)` from the opened quotient chunks via the barycentric
+  -- weights `zps`, and asserts
+  --   composition(ζ) · inv_vanishing(ζ) == quotient(ζ).
+  --
+  -- The challenges (lookup, fingerprint, α, ζ) come from `fiat_shamir` above.
+  -- As with `fiat_shamir`, claims are not observed; this assumes the no-claims
+  -- case, so the running lookup accumulator starts at the zero extension
+  -- element (`acc` in Rust = ExtVal::ZERO).
+  -- ==========================================================================
+
+  -- One Horner fold step of the constraint folder: `acc := acc·α + x`
+  -- (`VerifierConstraintFolder::assert_zero` / `assert_zero_ext`).
+  fn ood_fold(acc: Ext, alpha: Ext, x: Ext) -> Ext {
+    eg_add(eg_mul(acc, alpha), x)
+  }
+
+  -- Reconstruct an extension element from its two opened base coordinates,
+  -- `from_ext_basis([c0, c1]) = c0 + c1·X` (the ExtVal basis is `[1, X]`).
+  fn from_ext_basis(c0: Ext, c1: Ext) -> Ext {
+    eg_add(c0, eg_mul(c1, [gl_zero(), gl_one()]))
+  }
+
+  -- A stage-2 / quotient opened row arrives as `stage_2_width·2` extension
+  -- coordinates; fold consecutive pairs back into `stage_2_width` extension
+  -- elements (Rust: `chunks_exact(2).map(from_ext_basis)`).
+  fn reconstruct_ext_row(raw: List‹Ext›) -> List‹Ext› {
+    match load(raw) {
+      ListNode.Nil => store(ListNode.Nil),
+      ListNode.Cons(c0, t1) =>
+        let ListNode.Cons(c1, t2) = load(t1);
+        store(ListNode.Cons(from_ext_basis(c0, c1), reconstruct_ext_row(t2))),
+    }
+  }
+
+  -- Evaluate a symbolic AIR expression at the opened point (Rust:
+  -- `SymbolicExpression::interpret`). Function-circuit `zeros` and lookup
+  -- args/multiplicities only ever reference `Main`/`Preprocessed` entries
+  -- (offset 0) and constants; the other entries never appear here.
+  fn eval_sym(e: SymExpr, main: List‹Ext›, prep: List‹Ext›) -> Ext {
+    match e {
+      SymExpr.Var(entry, idx) =>
+        match entry {
+          SysEntry.Main(_o) => list_lookup(main, idx),
+          SysEntry.Preprocessed(_o) => list_lookup(prep, idx),
+          SysEntry.Stage2(_o) => [gl_zero(), gl_zero()],
+          SysEntry.Public => [gl_zero(), gl_zero()],
+          SysEntry.Stage2Public => [gl_zero(), gl_zero()],
+          SysEntry.Challenge => [gl_zero(), gl_zero()],
+        },
+      SymExpr.IsFirstRow => [gl_zero(), gl_zero()],
+      SymExpr.IsLastRow => [gl_zero(), gl_zero()],
+      SymExpr.IsTransition => [gl_zero(), gl_zero()],
+      SymExpr.Const(c) => [c, gl_zero()],
+      SymExpr.Add(x, y, _d) => eg_add(eval_sym(load(x), main, prep), eval_sym(load(y), main, prep)),
+      SymExpr.Sub(x, y, _d) => eg_sub(eval_sym(load(x), main, prep), eval_sym(load(y), main, prep)),
+      SymExpr.Neg(x, _d) => eg_neg(eval_sym(load(x), main, prep)),
+      SymExpr.Mul(x, y, _d) => eg_mul(eval_sym(load(x), main, prep), eval_sym(load(y), main, prep)),
+    }
+  }
+
+  -- `fingerprint(r, args) = Σ argᵢ·rⁱ` (Horner over the reversed argument list,
+  -- `lookup.rs::fingerprint`).
+  fn fingerprint_ext(r: Ext, args: List‹SymExpr›, main: List‹Ext›, prep: List‹Ext›) -> Ext {
+    match load(args) {
+      ListNode.Nil => [gl_zero(), gl_zero()],
+      ListNode.Cons(a, rest) =>
+        eg_add(eval_sym(a, main, prep), eg_mul(r, fingerprint_ext(r, rest, main, prep))),
+    }
+  }
+
+  -- Fold the inner-AIR `zeros` constraints (Function circuit): each is asserted
+  -- zero on the main (stage-1) row, with no preprocessed row.
+  fn fold_zeros(acc: Ext, alpha: Ext, zeros: List‹SymExpr›, main: List‹Ext›) -> Ext {
+    match load(zeros) {
+      ListNode.Nil => acc,
+      ListNode.Cons(z, rest) =>
+        fold_zeros(ood_fold(acc, alpha, eval_sym(z, main, store(ListNode.Nil))), alpha, rest, main),
+    }
+  }
+
+  -- Fold the selector boolean checks (Function circuit): `assert_bool(row[s])`
+  -- = `assert_zero(s·(s-1))` for `s` in the selector range `[idx, idx+count)`.
+  fn fold_sel_bools(acc: Ext, alpha: Ext, main: List‹Ext›, idx: G, count: G) -> Ext {
+    match count {
+      0 => acc,
+      _ =>
+        let x = list_lookup(main, idx);
+        let bc = eg_mul(x, eg_sub(x, [gl_one(), gl_zero()]));
+        fold_sel_bools(ood_fold(acc, alpha, bc), alpha, main, idx + 1, count - 1),
+    }
+  }
+
+  -- Fold the per-lookup constraints (`LookupAir::eval`): for lookup `k`,
+  -- `assert_one_ext(messageₖ · minvₖ)` = `assert_zero_ext(messageₖ·minvₖ - 1)`,
+  -- where `minvₖ = stage_2_row[1+k]` and
+  -- `messageₖ = lookup_challenge + fingerprint(fingerprint_challenge, argsₖ)`.
+  -- Simultaneously builds `acc_expr = stage_2_row[0] + Σ multiplicityₖ·minvₖ`.
+  -- Returns `(folder_acc, acc_expr)`.
+  fn fold_lookups(acc: Ext, alpha: Ext, lookups: List‹SysLookup›, k: G,
+      main: List‹Ext›, prep: List‹Ext›, s2row: List‹Ext›,
+      lch: Ext, fch: Ext, acc_expr: Ext) -> (Ext, Ext) {
+    match load(lookups) {
+      ListNode.Nil => (acc, acc_expr),
+      ListNode.Cons(lk, rest) =>
+        let SysLookup.Mk(mult_e, args) = lk;
+        let minv = list_lookup(s2row, k + 1);
+        let mult = eval_sym(mult_e, main, prep);
+        let fp = fingerprint_ext(fch, args, main, prep);
+        let message = eg_add(lch, fp);
+        let c = eg_sub(eg_mul(message, minv), [gl_one(), gl_zero()]);
+        let acc = ood_fold(acc, alpha, c);
+        let acc_expr = eg_add(acc_expr, eg_mul(mult, minv));
+        fold_lookups(acc, alpha, rest, k + 1, main, prep, s2row, lch, fch, acc_expr),
+    }
+  }
+
+  -- The composition polynomial `composition(ζ)` for one circuit: replays the
+  -- inner-AIR constraints (per air kind) followed by the lookup-argument
+  -- constraints, folding each with `α` exactly as `LookupAir::eval` drives the
+  -- verifier folder. `accp`/`naccp` are the current/next lookup accumulators.
+  fn ood_composition(air: SysAir, lookups: List‹SysLookup›,
+      main: List‹Ext›, main_next: List‹Ext›, s2row: List‹Ext›, s2next: List‹Ext›,
+      prep: List‹Ext›, isf: Ext, isl: Ext, ist: Ext,
+      lch: Ext, fch: Ext, accp: Ext, naccp: Ext, alpha: Ext) -> Ext {
+    -- inner-AIR constraints first, then hand the accumulator to the lookup tail
+    -- (split so each `match air` arm ends in a tail call — Aiur forbids
+    -- non-tail matches).
+    match air {
+      SysAir.Function(c) =>
+        let SysConstraints.Mk(zeros, ss, se, _w) = c;
+        let acc = fold_zeros([gl_zero(), gl_zero()], alpha, zeros, main);
+        let acc = fold_sel_bools(acc, alpha, main, ss, se - ss);
+        ood_comp_tail(acc, lookups, main, prep, s2row, s2next, isf, isl, ist, lch, fch, accp, naccp, alpha),
+      SysAir.Memory(m) =>
+        let SysMemory.Mk(_w) = m;
+        -- `Memory::eval`: is_real = col 1, ptr = col 2 (current and next row).
+        let is_real = list_lookup(main, 1);
+        let ptr = list_lookup(main, 2);
+        let is_real_next = list_lookup(main_next, 1);
+        let ptr_next = list_lookup(main_next, 2);
+        -- assert_bool(is_real)
+        let acc = ood_fold([gl_zero(), gl_zero()], alpha, eg_mul(is_real, eg_sub(is_real, [gl_one(), gl_zero()])));
+        -- is_real_transition = is_real_next · is_transition
+        let irt = eg_mul(is_real_next, ist);
+        -- when(irt).assert_one(is_real) = irt·(is_real - 1)
+        let acc = ood_fold(acc, alpha, eg_mul(irt, eg_sub(is_real, [gl_one(), gl_zero()])));
+        -- when(irt).assert_eq(ptr+1, ptr_next) = irt·(ptr + 1 - ptr_next)
+        let acc = ood_fold(acc, alpha, eg_mul(irt, eg_sub(eg_add(ptr, [gl_one(), gl_zero()]), ptr_next)));
+        ood_comp_tail(acc, lookups, main, prep, s2row, s2next, isf, isl, ist, lch, fch, accp, naccp, alpha),
+      SysAir.Bytes1 =>
+        ood_comp_tail([gl_zero(), gl_zero()], lookups, main, prep, s2row, s2next, isf, isl, ist, lch, fch, accp, naccp, alpha),
+      SysAir.Bytes2 =>
+        ood_comp_tail([gl_zero(), gl_zero()], lookups, main, prep, s2row, s2next, isf, isl, ist, lch, fch, accp, naccp, alpha),
+    }
+  }
+
+  -- Tail of `ood_composition`: fold the lookup-argument constraints onto the
+  -- inner-AIR accumulator, then the three accumulator boundary constraints.
+  fn ood_comp_tail(acc: Ext, lookups: List‹SysLookup›, main: List‹Ext›, prep: List‹Ext›,
+      s2row: List‹Ext›, s2next: List‹Ext›, isf: Ext, isl: Ext, ist: Ext,
+      lch: Ext, fch: Ext, accp: Ext, naccp: Ext, alpha: Ext) -> Ext {
+    let (acc, acc_expr) =
+      fold_lookups(acc, alpha, lookups, 0, main, prep, s2row, lch, fch, list_lookup(s2row, 0));
+    let acc_col = list_lookup(s2row, 0);
+    let next_acc_col = list_lookup(s2next, 0);
+    -- when_first_row: acc_col = accp
+    let acc = ood_fold(acc, alpha, eg_mul(isf, eg_sub(acc_col, accp)));
+    -- when_transition: acc_expr = next_acc_col
+    let acc = ood_fold(acc, alpha, eg_mul(ist, eg_sub(acc_expr, next_acc_col)));
+    -- when_last_row: acc_expr = naccp
+    ood_fold(acc, alpha, eg_mul(isl, eg_sub(acc_expr, naccp)))
+  }
+
+  -- ==========================================================================
+  -- Quotient evaluation from the opened quotient chunks.
+  --
+  -- The trace domain is the order-2^L subgroup H (shift = 1). The quotient
+  -- domain is the disjoint coset `7·H'` of size `2^(L+log_qd)` (7 = Goldilocks
+  -- GENERATOR), split into `qd = 2^log_qd` chunk domains `Dⱼ` of size `2^L`,
+  -- shift `7·g_q^j` where `g_q = two_adic_gen(L + log_qd)`. Then
+  --   quotient(ζ) = Σⱼ zpsⱼ · from_ext_basis(chunkⱼ),
+  --   zpsⱼ = Πₖ≠ⱼ Z_{Dₖ}(ζ) / Z_{Dₖ}(first_point(Dⱼ)),
+  -- with `Z_{Dₖ}(x) = (x · shift_k⁻¹)^(2^L) - 1`.
+  -- ==========================================================================
+
+  -- base-field power `base^e` (e small: the chunk index, < qd). `base` is a
+  -- non-native Goldilocks element; `e` is a native loop counter.
+  fn g_pow(base: [U8; 8], e: G) -> [U8; 8] {
+    match e {
+      0 => gl_one(),
+      _ => gl_mul(base, g_pow(base, e - 1)),
+    }
+  }
+
+  -- `Z_{Dⱼ}(x) = (x · shift_j⁻¹)^(2^L) - 1`, evaluated at extension point `x`.
+  fn vanish_chunk(x: Ext, l: G, shiftinv: [U8; 8]) -> Ext {
+    eg_sub(ext_exp_pow2(eg_mul(x, [shiftinv, gl_zero()]), l), [gl_one(), gl_zero()])
+  }
+
+  -- `zpsₜ = Πⱼ≠ₜ Z_{Dⱼ}(ζ) / Z_{Dⱼ}(shift_t)`. Iterates j over `[jidx, jidx+rem)`.
+  fn zps_prod(acc: Ext, zeta: Ext, l: G, g_q: [U8; 8], shift_t: [U8; 8], jidx: G, rem: G, t: G) -> Ext {
+    match rem {
+      0 => acc,
+      _ =>
+        let shiftinv = gl_inverse(gl_mul(gl_seven(), g_pow(g_q, jidx)));
+        -- skip the j = t factor (the chunk's own domain); branch in tail
+        -- position so the inner match is not a non-tail match.
+        match eq_zero(jidx - t) {
+          1 => zps_prod(acc, zeta, l, g_q, shift_t, jidx + 1, rem - 1, t),
+          _ =>
+            let factor = eg_mul(vanish_chunk(zeta, l, shiftinv),
+                                 eg_inverse(vanish_chunk([shift_t, gl_zero()], l, shiftinv)));
+            zps_prod(eg_mul(acc, factor), zeta, l, g_q, shift_t, jidx + 1, rem - 1, t),
+        },
+    }
+  }
+
+  -- `quotient(ζ) = Σₜ zpsₜ · from_ext_basis(chunkₜ)`, iterating the `qd` chunks
+  -- (`q_opened[idx][0] = [c0, c1]`).
+  fn quotient_sum(acc: Ext, zeta: Ext, l: G, qd: G, g_q: [U8; 8],
+      q_opened: OpenedRound, idx: G, rem: G, t: G) -> Ext {
+    match rem {
+      0 => acc,
+      _ =>
+        let shift_t = gl_mul(gl_seven(), g_pow(g_q, t));
+        let zps_t = zps_prod([gl_one(), gl_zero()], zeta, l, g_q, shift_t, 0, qd, t);
+        let ch = list_lookup(q_opened, idx);
+        let row = list_lookup(ch, 0);
+        let qv = from_ext_basis(list_lookup(row, 0), list_lookup(row, 1));
+        quotient_sum(eg_add(acc, eg_mul(zps_t, qv)), zeta, l, qd, g_q,
+                     q_opened, idx + 1, rem - 1, t + 1),
+    }
+  }
+
+  -- `quotient_degree = (max(md, 2) - 1).next_power_of_two()` and its log2.
+  -- Tabulated for `max_constraint_degree ≤ 17` (covers all current circuits);
+  -- larger degrees fall through to the `_` arm.
+  fn quotient_degree_of(md: G) -> G {
+    match md {
+      0 => 1, 1 => 1, 2 => 1,
+      3 => 2,
+      4 => 4, 5 => 4,
+      6 => 8, 7 => 8, 8 => 8, 9 => 8,
+      10 => 16, 11 => 16, 12 => 16, 13 => 16, 14 => 16, 15 => 16, 16 => 16, 17 => 16,
+      _ => 32,
+    }
+  }
+  fn log_qd_of(md: G) -> G {
+    match md {
+      0 => 0, 1 => 0, 2 => 0,
+      3 => 1,
+      4 => 2, 5 => 2,
+      6 => 3, 7 => 3, 8 => 3, 9 => 3,
+      10 => 4, 11 => 4, 12 => 4, 13 => 4, 14 => 4, 15 => 4, 16 => 4, 17 => 4,
+      _ => 5,
+    }
+  }
+
+  -- The preprocessed opened row at ζ for circuit `i`, or `Nil` if the circuit
+  -- has no preprocessed trace (`preprocessed_indices[i] = None`).
+  fn ood_prep_row(prep_opt: PreprocessedOpt, oi: OptIdx) -> List‹Ext› {
+    match oi {
+      OptIdx.NoIdx => store(ListNode.Nil),
+      OptIdx.SomeIdx(j) =>
+        match prep_opt {
+          PreprocessedOpt.NoPreprocessed => store(ListNode.Nil),
+          PreprocessedOpt.SomePreprocessed(round) =>
+            let pr = list_lookup(round, j);
+            list_lookup(pr, 0),
+        },
+    }
+  }
+
+  -- Per-circuit OOD loop: for each circuit, recompute composition(ζ) and
+  -- quotient(ζ) and assert `composition · inv_vanishing == quotient`. Threads
+  -- the running lookup accumulator `accp` and the quotient-chunk offset `lastq`.
+  fn ood_loop(circuits: List‹SysCircuit›, prep_indices: List‹OptIdx›,
+      log_degrees: List‹U8›, accs: List‹Ext›,
+      stage1: OpenedRound, stage2: OpenedRound, prep_opt: PreprocessedOpt,
+      q_opened: OpenedRound, i: G, accp: Ext, lastq: G,
+      lch: Ext, fch: Ext, alpha: Ext, zeta: Ext) -> G {
+    match load(circuits) {
+      ListNode.Nil => 1,
+      ListNode.Cons(circ, rest) =>
+        let SysCircuit.Mk(lair, _cc, md, _ph, _pw, _w1, _w2) = circ;
+        let SysLookupAir.Mk(air, lookups) = lair;
+        let l = to_field(list_lookup(log_degrees, i));
+        let qd = quotient_degree_of(md);
+        let log_qd = log_qd_of(md);
+        let naccp = list_lookup(accs, i);
+        let s1 = list_lookup(stage1, i);
+        let main = list_lookup(s1, 0);
+        let main_next = list_lookup(s1, 1);
+        let s2 = list_lookup(stage2, i);
+        let s2row = reconstruct_ext_row(list_lookup(s2, 0));
+        let s2next = reconstruct_ext_row(list_lookup(s2, 1));
+        let prep = ood_prep_row(prep_opt, list_lookup(prep_indices, i));
+        let (isf, isl, ist, invv) = trace_selectors(zeta, l);
+        let comp = ood_composition(air, lookups, main, main_next, s2row, s2next,
+                                   prep, isf, isl, ist, lch, fch, accp, naccp, alpha);
+        let g_q = two_adic_gen(l + log_qd);
+        let quot = quotient_sum([gl_zero(), gl_zero()], zeta, l, qd, g_q, q_opened, lastq, qd, 0);
+        assert_eq!(eg_eq(eg_mul(comp, invv), quot), 1);
+        ood_loop(rest, prep_indices, log_degrees, accs, stage1, stage2, prep_opt,
+                 q_opened, i + 1, naccp, lastq + qd, lch, fch, alpha, zeta),
+    }
+  }
+
+  -- The fingerprint of one claim's values: `Σ vᵢ · fch^i` (each `vᵢ` lifted from
+  -- its raw u64 limb to an extension element). Mirrors `lookup::fingerprint`.
+  fn fingerprint_vals(fch: Ext, vals: List‹U64›) -> Ext {
+    match load(vals) {
+      ListNode.Nil => [gl_zero(), gl_zero()],
+      ListNode.Cons(v, rest) =>
+        eg_add([gl_reduce(v), gl_zero()], eg_mul(fch, fingerprint_vals(fch, rest))),
+    }
+  }
+
+  -- The initial lookup accumulator built from the public claims:
+  -- `acc = Σ_claims 1 / (lookup_challenge + fingerprint(fingerprint_challenge, claim))`
+  -- (Rust `verify_multiple_claims`, lines 227-232). Empty claim list → zero.
+  fn claims_acc(acc: Ext, claims: List‹List‹U64››, lch: Ext, fch: Ext) -> Ext {
+    match load(claims) {
+      ListNode.Nil => acc,
+      ListNode.Cons(c, rest) =>
+        let msg = eg_add(lch, fingerprint_vals(fch, c));
+        claims_acc(eg_add(acc, eg_inverse(msg)), rest, lch, fch),
+    }
+  }
+
+  -- Step 3 + 5: derive the challenges via the (prover-faithful) Fiat-Shamir
+  -- replay over the verifying key's preprocessed commitment + the proof
+  -- commitments + log_degrees + claims, seed the lookup accumulator from the
+  -- claims, then run the OOD composition/quotient check for every circuit.
+  -- Returns 1 on success (any mismatch aborts via `assert_eq!`).
+  fn ood_verify(sys: Sys, proof: Proof, claims: List‹List‹U64››,
+      num_queries: G, commit_pow_bits: G, log_blowup_pub: G) -> G {
+    -- The FRI parameters (`num_queries`, `commit_pow_bits`, `log_blowup`) are
+    -- public inputs. `log_blowup` also lives in the (digest-bound) verifying
+    -- key's CommitmentParameters — assert the two agree.
+    let Sys.Mk(params, circuits, commit, prep_indices) = sys;
+    let SysParams.Mk(log_blowup, _cap_height) = params;
+    assert_eq!(eq_zero(log_blowup - log_blowup_pub), 1);
+    match proof {
+      Proof.Mk(commitments, accs, log_degrees, opening,
+               q_opened, prep_opt, stage1, stage2) =>
+        let Commitments.Mk(s1c, s2c, qc) = commitments;
+        let prep_cap = opt_commit_cap(commit);
+        let (lch, fch, alpha, zeta, post_zeta_input) = fiat_shamir(prep_cap, s1c, s2c, qc, log_degrees, claims);
+        let acc0 = claims_acc([gl_zero(), gl_zero()], claims, lch, fch);
+        -- Step 5: OOD composition/quotient identity for every circuit.
+        let _ood = ood_loop(circuits, prep_indices, log_degrees, accs, stage1, stage2,
+                 prep_opt, q_opened, 0, acc0, 0, lch, fch, alpha, zeta);
+        -- Step 4: FRI PCS proximity + opening consistency, continuing the same
+        -- Fiat-Shamir transcript past ζ (observe opened values → α, βs, query).
+        pcs_fri_verify(post_zeta_input, stage1, stage2, q_opened, prep_opt, opening,
+          s1c, s2c, qc, prep_cap, circuits, prep_indices, log_degrees, zeta,
+          list_length(circuits), log_blowup, num_queries, commit_pow_bits),
+    }
+  }
+
+  -- Read the public claims from the verifier's IO channel. Wire format (set by
+  -- the prover-side harness): u64 `num_claims`, then per claim a u64 `num_vals`
+  -- followed by `num_vals` raw `u64` `Val`s (8 LE bytes each, canonical < p).
+  fn read_claims(stream: ByteStream) -> (List‹List‹U64››, ByteStream) {
+    let (n, s) = read_count(stream);
+    read_claims_n(s, n)
+  }
+  fn read_claims_n(stream: ByteStream, n: G) -> (List‹List‹U64››, ByteStream) {
+    match n {
+      0 => (store(ListNode.Nil), stream),
+      _ =>
+        let (c, s) = read_one_claim(stream);
+        let (rest, s2) = read_claims_n(s, n - 1);
+        (store(ListNode.Cons(c, rest)), s2),
+    }
+  }
+  fn read_one_claim(stream: ByteStream) -> (List‹U64›, ByteStream) {
+    let (m, s) = read_count(stream);
+    read_u64_vec_n(s, m)
+  }
+⟧
+
+end MultiStark
+
+end
diff --git a/RecursiveVerifier.lean b/RecursiveVerifier.lean
new file mode 100644
index 00000000..b55d93ac
--- /dev/null
+++ b/RecursiveVerifier.lean
@@ -0,0 +1,201 @@
+import Ix.Aiur.Meta
+import Ix.Aiur.Protocol
+import Ix.Aiur.Compiler
+import Ix.Aiur.Statistics
+import Ix.MultiStark
+import Ix.Keccak
+import Tests.Aiur.Common
+
+/-!
+# Recursive verifier end-to-end test
+
+A standalone executable that exercises the Multi-STARK verifier scaffold
+(`Ix/MultiStark.lean`) against a real proof:
+
+1. Define a tiny Aiur toplevel with a `factorial` entrypoint.
+2. Prove `factorial(5) = 120` with the Multi-STARK backend.
+3. Serialize that proof (`Proof.toBytes`, bincode) and seed it on IO
+   channel 0 (key `[0]`; vk on channel 1, claims on channel 2) — the hint the
+   recursive verifier reads non-deterministically via `#read_byte_stream`.
+4. Run the `verify_multi_stark_proof` entrypoint over that IO buffer and
+   prove *its* execution, producing a recursive proof.
+
+Run with `lake exe recursive-verifier`.
+-/
+
+open Aiur
+
+private def toHex (b : ByteArray) : String :=
+  b.data.foldl (fun s x =>
+    let h := String.ofList (Nat.toDigits 16 x.toNat)
+    s ++ (if h.length == 1 then "0" ++ h else h)) ""
+
+/-- 8 little-endian bytes of a `Nat` (taken mod 2^64). -/
+private def u64le (n : Nat) : Array UInt8 :=
+  (Array.range 8).map (fun i => UInt8.ofNat ((n >>> (8 * i)) % 256))
+
+/-- Serialize the public claims for the verifier's IO channel, matching the
+in-circuit `read_claims` wire format: u64 `num_claims`, then per claim a u64
+`num_vals` followed by the `Val`s as canonical 8-byte little-endian `u64`s. -/
+private def serializeClaims (claims : Array (Array Aiur.G)) : ByteArray := Id.run do
+  let mut out : Array UInt8 := u64le claims.size
+  for c in claims do
+    out := out ++ u64le c.size
+    for g in c do
+      out := out ++ u64le g.val.toNat
+  return ⟨out⟩
+
+/-- A tiny Aiur program: `factorial` as the proving entrypoint. -/
+def factorialProgram : Source.Toplevel := ⟦
+  pub fn factorial(n: G) -> G {
+    match n {
+      0 => 1,
+      _ => n * factorial(n - 1),
+    }
+  }
+⟧
+
+/-- A tractable subset of the production FRI config that still exercises every
+generalized verifier path: `numQueries := 2` drives the multi-query loop,
+`commitProofOfWorkBits := 8` drives the commit-phase proof-of-work grinding
+check (one per FRI round), and `log_blowup` is read from the verifying key (the
+verifier code is blowup-value-agnostic). The full production parameters
+(`logBlowup := 2`, `numQueries := 100`, `commitProofOfWorkBits := 20`) use the
+same code paths but make *proving* (large LDE + 2²⁰ grinding per round) too slow
+for this end-to-end harness; the verifier itself handles them unchanged. -/
+def recCommitParams : Aiur.CommitmentParameters :=
+  { logBlowup := 2, capHeight := 0 }
+def innerFri : FriParameters :=
+  { logFinalPolyLen := 0, maxLogArity := 1, numQueries := 3,
+    commitProofOfWorkBits := 20, queryProofOfWorkBits := 0 }
+
+/-- Compile a toplevel and build its proving system, or fail with a message. -/
+def buildSystem (label : String) (top : Source.Toplevel) :
+    IO (CompiledToplevel × AiurSystem) := do
+  let compiled ← match top.compile with
+    | .error e => throw <| IO.userError s!"{label}: compilation failed: {e}"
+    | .ok c => pure c
+  pure (compiled, AiurSystem.build compiled.bytecode recCommitParams)
+
+def main : IO UInt32 := do
+  -- ── 1. factorial system ──────────────────────────────────────────────────
+  let (facCompiled, facSystem) ← buildSystem "factorial" factorialProgram
+  let facIdx ← match facCompiled.getFuncIdx `factorial with
+    | some i => pure i
+    | none => IO.eprintln "factorial entrypoint not found"; return 1
+
+  -- ── 2. prove factorial(5) = 120 ──────────────────────────────────────────
+  -- `G` is a reserved token (the Aiur DSL), so spell the field type qualified.
+  let input := #[Aiur.G.ofNat 5]
+  let (claim, proof, _) := facSystem.prove innerFri facIdx input default
+  let expected := buildClaim facIdx input #[Aiur.G.ofNat 120]
+  if claim != expected then
+    IO.eprintln s!"unexpected factorial claim:\n  got {claim}\n  want {expected}"
+    return 1
+  IO.println s!"✓ proved factorial(5) = 120  (claim = {claim})"
+  match facSystem.verify innerFri claim (.ofBytes proof.toBytes) with
+  | .error e => IO.eprintln s!"inner proof failed to verify: {e}"; return 1
+  | .ok _ => IO.println "✓ inner proof verifies"
+
+  -- ── 3. serialize the proof — pure non-deterministic advice (NOT hashed, not
+  --       a public input; the verifier's own checks are what bind it) ─────────
+  let proofBytes := proof.toBytes
+  IO.println s!"  serialized proof: {proofBytes.size} bytes (advice, not digest-bound)"
+  let proofGs : Array Aiur.G := proofBytes.data.map .ofUInt8
+
+  -- Verifying key (`System<AiurCircuit>`) bytes + its keccak-256 digest.
+  let vkBytes := facSystem.vkBytes
+  IO.println s!"  verifying key: {vkBytes.size} bytes  (~{(vkBytes.size + 1) / 136 + 1} keccak-f blocks)"
+  let sysDigest := Keccak.hash vkBytes
+  IO.println s!"  keccak256(vk)    = {toHex sysDigest}"
+  let sysDigestInput : Array Aiur.G := sysDigest.data.map .ofUInt8
+  let vkGs : Array Aiur.G := vkBytes.data.map .ofUInt8
+
+  -- Public claims (`&[&[Val]]` = the single factorial claim) + keccak digest.
+  let claimBytes := serializeClaims #[claim]
+  IO.println s!"  claims: {claimBytes.size} bytes"
+  let claimsDigest := Keccak.hash claimBytes
+  IO.println s!"  keccak256(claims)= {toHex claimsDigest}"
+  let claimsDigestInput : Array Aiur.G := claimsDigest.data.map .ofUInt8
+  let claimGs : Array Aiur.G := claimBytes.data.map .ofUInt8
+
+  -- Public input = vk digest ++ claims digest ++ FRI params (num_queries,
+  -- commit_pow_bits, log_blowup); IO hints: proof at [0], vk at [1], claims at [2].
+  let friParamInput : Array Aiur.G :=
+    #[Aiur.G.ofNat innerFri.numQueries, Aiur.G.ofNat innerFri.commitProofOfWorkBits,
+      Aiur.G.ofNat recCommitParams.logBlowup]
+  let input : Array Aiur.G := sysDigestInput ++ claimsDigestInput ++ friParamInput
+  let verifierIO : IOBuffer :=
+    (((default : IOBuffer).extend 0 #[Aiur.G.ofNat 0] proofGs).extend 1 #[Aiur.G.ofNat 0] vkGs).extend 2 #[Aiur.G.ofNat 0] claimGs
+
+  -- ── 4. recursive verifier system ─────────────────────────────────────────
+  -- The PRODUCTION toplevel (no test circuits — every `pub fn` adds width).
+  let vTop ← match MultiStark.multiStark with
+    | .error e => IO.eprintln s!"verifier toplevel merge failed: {e}"; return 1
+    | .ok t => pure t
+  let (vCompiled, _vSystem) ← buildSystem "verifier" vTop
+  -- The TESTS toplevel: the verifier plus the `*_test` entrypoints. Compiled
+  -- separately so the test circuits never widen the production system; only the
+  -- bytecode is needed (tests are executed, never proven).
+  let tTop ← match MultiStark.multiStarkTests with
+    | .error e => IO.eprintln s!"tests toplevel merge failed: {e}"; return 1
+    | .ok t => pure t
+  let tCompiled ← match tTop.compile with
+    | .error e => throw <| IO.userError s!"verifier-tests: compilation failed: {e}"
+    | .ok c => pure c
+  let runTest (name : Lean.Name) (okMsg : String) : IO Bool := do
+    match tCompiled.bytecode.execute (tCompiled.getFuncIdx name).get! #[] default with
+    | .error e => IO.eprintln s!"✗ {name} FAILED — {e}"; pure false
+    | .ok _ => IO.println s!"✓ {okMsg}"; pure true
+  -- Validate the verifier's primitives against the Rust reference values.
+  if !(← runTest `pcs_hash_test "pcs_hash_test: keccak MMCS sponge/compress match reference") then return 1
+  if !(← runTest `pcs_merkle_test "pcs_merkle_test: Merkle verify_batch matches reference (root + tamper)") then return 1
+  if !(← runTest `sample_bits_test "sample_bits_test: challenger sample_bits matches reference") then return 1
+  if !(← runTest `pcs_challenger4_test "pcs_challenger4_test: PCS challenger continuation (α_pcs/α_fri/β/index) matches reference") then return 1
+  if !(← runTest `fri_fold_test "fri_fold_test: FRI arity-2 fold_row matches reference") then return 1
+  if !(← runTest `ro_fold_test "ro_fold_test: open_input reduced-opening math matches reference") then return 1
+  if !(← runTest `gl_addsub_test "gl_addsub_test: non-native Goldilocks add/sub match reference") then return 1
+  if !(← runTest `gl_muldiv_test "gl_muldiv_test: non-native Goldilocks mul/inverse/div match reference") then return 1
+  if !(← runTest `eg_ops_test "eg_ops_test: non-native ExtGoldilocks add/mul/inverse/div match reference") then return 1
+  let vIdx ← match vCompiled.getFuncIdx `verify_multi_stark_proof with
+    | some i => pure i
+    | none => IO.eprintln "verify_multi_stark_proof entrypoint not found"; return 1
+
+  -- ── 5. run the verifier: deserialize proof (advice) + vk, replay
+  --       Fiat-Shamir, run the OOD + FRI checks ─────────────────────────────
+  IO.println "running verifier (advice proof + vk digest binding + OOD + FRI)…"
+  match vCompiled.bytecode.execute vIdx input verifierIO with
+  | .error e => IO.eprintln s!"✗ verifier rejected: {e}"; return 1
+  | .ok (_, _, queryCounts) =>
+    IO.println "✓ verifier accepted: vk digest bound, OOD + FRI checks pass"
+    -- ── 6. negative test: tampered proof ADVICE must be rejected by the
+    --        verification checks themselves (no digest binding any more) —
+    --        this isolates the Fiat-Shamir/Merkle/OOD/FRI rejection path.
+    --        Byte 0 is the first stage_1-commitment limb: it diverges the
+    --        replayed transcript from the one the proof was generated under.
+    let badProofGs := proofGs.set! 0 (Aiur.G.ofNat ((proofBytes.data[0]!.toNat + 1) % 256))
+    let badProofIO : IOBuffer :=
+      (((default : IOBuffer).extend 0 #[Aiur.G.ofNat 0] badProofGs).extend 1 #[Aiur.G.ofNat 0] vkGs).extend 2 #[Aiur.G.ofNat 0] claimGs
+    match vCompiled.bytecode.execute vIdx input badProofIO with
+    | .error _ => IO.println "✓ tampered proof advice correctly rejected (verification checks)"
+    | .ok _ => IO.eprintln "✗ tampered proof advice was NOT rejected"; return 1
+    -- ── 6b. negative test: a tampered CLAIM (with a matching keccak digest)
+    --        must be rejected by the OOD / accumulator check. Changing the
+    --        claim feeds a different value into Fiat-Shamir (→ different ζ) and
+    --        into the lookup accumulator, so the composition/quotient identity
+    --        no longer holds even though the keccak binding passes. ──────────
+    let badClaim : Array Aiur.G := claim.set! (claim.size - 1) (Aiur.G.ofNat 121)
+    let badClaimBytes := serializeClaims #[badClaim]
+    let badClaimsDigest := Keccak.hash badClaimBytes
+    let badClaimInput : Array Aiur.G :=
+      sysDigestInput ++ (badClaimsDigest.data.map .ofUInt8) ++ friParamInput
+    let badClaimIO : IOBuffer :=
+      (((default : IOBuffer).extend 0 #[Aiur.G.ofNat 0] proofGs).extend 1 #[Aiur.G.ofNat 0] vkGs).extend
+        2 #[Aiur.G.ofNat 0] (badClaimBytes.data.map .ofUInt8)
+    match vCompiled.bytecode.execute vIdx badClaimInput badClaimIO with
+    | .error _ => IO.println "✓ tampered claim correctly rejected (OOD/accumulator mismatch)"
+    | .ok _ => IO.eprintln "✗ tampered claim was NOT rejected"; return 1
+    -- ── 7. circuit statistics ───────────────────────────────────────────────
+    IO.println "\n── verifier circuit statistics ──"
+    Aiur.printStats (Aiur.computeStats vCompiled queryCounts)
+  pure 0
diff --git a/lakefile.lean b/lakefile.lean
index 5362daa8..853c8191 100644
--- a/lakefile.lean
+++ b/lakefile.lean
@@ -98,6 +98,10 @@ lean_exe «arena-exclude» where
   root := `Tests.Ix.Kernel.ArenaExclude
   supportInterpreter := true
 
+lean_exe «recursive-verifier» where
+  root := `RecursiveVerifier
+  supportInterpreter := true
+
 end Tests
 
 section Benchmarks
diff --git a/src/aiur.rs b/src/aiur.rs
index d54325cc..a218d9b8 100644
--- a/src/aiur.rs
+++ b/src/aiur.rs
@@ -5,6 +5,7 @@ pub mod gadgets;
 pub mod memory;
 pub mod synthesis;
 pub mod trace;
+pub mod vk_codec;
 
 use multi_stark::p3_field::PrimeCharacteristicRing;
 
diff --git a/src/aiur/constraints.rs b/src/aiur/constraints.rs
index abdbef2b..4ad382b0 100644
--- a/src/aiur/constraints.rs
+++ b/src/aiur/constraints.rs
@@ -33,6 +33,7 @@ type Degree = u8;
 static INV_256: LazyLock<G> = LazyLock::new(|| G::from_u64(256).inverse());
 
 /// Holds data for a function circuit.
+#[derive(serde::Serialize, serde::Deserialize)]
 pub struct Constraints {
   pub zeros: Vec<Expr>,
   pub selectors: Range<usize>,
diff --git a/src/aiur/memory.rs b/src/aiur/memory.rs
index fa676394..d6069942 100644
--- a/src/aiur/memory.rs
+++ b/src/aiur/memory.rs
@@ -15,8 +15,9 @@ use rayon::{
 
 use crate::aiur::{G, execute::QueryRecord, memory_channel};
 
+#[derive(serde::Serialize, serde::Deserialize)]
 pub struct Memory {
-  width: usize,
+  pub(crate) width: usize,
 }
 
 impl Memory {
diff --git a/src/aiur/synthesis.rs b/src/aiur/synthesis.rs
index 1dd7f971..bc861635 100644
--- a/src/aiur/synthesis.rs
+++ b/src/aiur/synthesis.rs
@@ -26,10 +26,11 @@ pub struct AiurSystem {
   toplevel: Toplevel,
   // perhaps remove the key from the system in verifier only mode?
   key: ProverKey,
-  system: System<AiurCircuit>,
+  pub(crate) system: System<AiurCircuit>,
 }
 
-enum AiurCircuit {
+#[derive(serde::Serialize, serde::Deserialize)]
+pub(crate) enum AiurCircuit {
   Function(Constraints),
   Memory(Memory),
   Bytes1,
diff --git a/src/aiur/vk_codec.rs b/src/aiur/vk_codec.rs
new file mode 100644
index 00000000..373ca6e7
--- /dev/null
+++ b/src/aiur/vk_codec.rs
@@ -0,0 +1,314 @@
+//! (De)serialization of the verifier's key: `System<AiurCircuit>`.
+//!
+//! The verifier needs the system's per-circuit AIR (symbolic constraints +
+//! lookups + widths), the shared preprocessed commitment, the preprocessed
+//! index map, and the commitment parameters — i.e. everything in
+//! [`System<AiurCircuit>`] except the prover-only preprocessed traces (the
+//! large gadget tables in each `LookupAir.preprocessed`, which are
+//! `#[serde(skip)]`-ed and reconstructed/committed separately).
+//!
+//! This uses serde (`#[derive(Serialize, Deserialize)]` on `System`, `Circuit`,
+//! `LookupAir`, `Lookup`, `SymbolicExpression`, `SymbolicVariable`, `Entry`,
+//! `CommitmentParameters`, and the Ix `AiurCircuit`/`Constraints`/`Memory`)
+//! with the **same bincode config the proof uses**
+//! (`standard().with_little_endian().with_fixed_int_encoding()`), so the wire
+//! format matches `Proof`'s and can be re-implemented in Aiur the same way.
+
+// The codec is exercised by tests today and wired to the FFI / Aiur port next.
+#![allow(dead_code)]
+
+use bincode::{
+  config::{Configuration, Fixint, LittleEndian, standard},
+  serde::{decode_from_slice, encode_to_vec},
+};
+use multi_stark::system::System;
+
+use crate::aiur::synthesis::{AiurCircuit, AiurSystem};
+
+/// The bincode configuration shared with `multi_stark::prover::Proof`.
+fn serde_config() -> Configuration<LittleEndian, Fixint> {
+  standard().with_little_endian().with_fixed_int_encoding()
+}
+
+/// Serialize the verifying key `System<AiurCircuit>` (preprocessed traces are
+/// skipped — see the module docs).
+pub(crate) fn to_bytes(
+  system: &System<AiurCircuit>,
+) -> Result<Vec<u8>, String> {
+  encode_to_vec(system, serde_config()).map_err(|e| e.to_string())
+}
+
+/// Deserialize a `System<AiurCircuit>` from [`to_bytes`] output, requiring that
+/// every byte is consumed.
+pub(crate) fn from_bytes(bytes: &[u8]) -> Result<System<AiurCircuit>, String> {
+  let (system, consumed) =
+    decode_from_slice(bytes, serde_config()).map_err(|e| e.to_string())?;
+  if consumed != bytes.len() {
+    return Err(format!(
+      "trailing data: consumed {consumed} of {}",
+      bytes.len()
+    ));
+  }
+  Ok(system)
+}
+
+/// Convenience: serialize the verifying key of a built [`AiurSystem`].
+pub(crate) fn aiur_system_to_bytes(
+  sys: &AiurSystem,
+) -> Result<Vec<u8>, String> {
+  to_bytes(&sys.system)
+}
+
+#[cfg(test)]
+mod tests {
+  use super::*;
+  use crate::aiur::gadgets::{AiurGadget, bytes1::Bytes1, bytes2::Bytes2};
+  use multi_stark::{lookup::LookupAir, types::CommitmentParameters};
+
+  /// Build a small real `System<AiurCircuit>` from the two byte gadgets and
+  /// check the verifying-key codec round-trips (re-encoding is byte-identical).
+  #[test]
+  fn system_vk_round_trips() {
+    let cp = CommitmentParameters { log_blowup: 1, cap_height: 0 };
+    let (system, _key) = System::new(
+      cp,
+      [
+        LookupAir::new(AiurCircuit::Bytes1, Bytes1.lookups()),
+        LookupAir::new(AiurCircuit::Bytes2, Bytes2.lookups()),
+      ],
+    );
+    let bytes = to_bytes(&system).expect("encode");
+    let back = from_bytes(&bytes).expect("decode");
+    let reencoded = to_bytes(&back).expect("re-encode");
+    assert_eq!(bytes, reencoded, "verifying-key codec round-trip mismatch");
+  }
+
+  #[test]
+  fn rejects_trailing_bytes() {
+    let cp = CommitmentParameters { log_blowup: 1, cap_height: 0 };
+    let (system, _key) =
+      System::new(cp, [LookupAir::new(AiurCircuit::Bytes1, Bytes1.lookups())]);
+    let mut bytes = to_bytes(&system).expect("encode");
+    bytes.push(0);
+    assert!(from_bytes(&bytes).is_err(), "should reject trailing data");
+  }
+}
+
+// ════════════════════════════════════════════════════════════════════════════
+// Manual deserializer — a hand-written reader matching the exact bincode bytes
+// `to_bytes` produces, decoding straight into the real `System<AiurCircuit>`.
+// This is the reference re-implemented in Aiur (the verifier reads its key from
+// this stream); `manual_matches_serde` proves it agrees with serde.
+//
+// bincode `standard().little_endian().fixed_int` layout:
+//   enum tag : u32 (4 bytes LE)       Option : 1 byte (0 = None / 1 = Some)
+//   usize / u64 / G : 8 bytes LE      Vec : u64 length, then elements
+//   struct : fields in declaration order      Range<usize> : start, end (u64)
+//   MerkleCap : Vec<[u64; 4]>         PhantomData : 0 bytes
+// ════════════════════════════════════════════════════════════════════════════
+
+use multi_stark::{
+  builder::symbolic::{Entry, SymbolicExpression, SymbolicVariable},
+  lookup::{Lookup, LookupAir},
+  p3_field::PrimeCharacteristicRing,
+  system::Circuit,
+  types::{Commitment, CommitmentParameters, Val},
+};
+
+use crate::aiur::{constraints::Constraints, memory::Memory};
+
+type Expr = SymbolicExpression<Val>;
+
+struct R<'a> {
+  buf: &'a [u8],
+  pos: usize,
+}
+
+impl<'a> R<'a> {
+  fn take(&mut self, n: usize) -> Result<&'a [u8], String> {
+    let end = self.pos.checked_add(n).ok_or("length overflow")?;
+    if end > self.buf.len() {
+      return Err(format!("eof: need {n} at offset {}", self.pos));
+    }
+    let s = &self.buf[self.pos..end];
+    self.pos = end;
+    Ok(s)
+  }
+  fn u8(&mut self) -> Result<u8, String> {
+    Ok(self.take(1)?[0])
+  }
+  fn u32(&mut self) -> Result<u32, String> {
+    Ok(u32::from_le_bytes(self.take(4)?.try_into().unwrap()))
+  }
+  fn u64(&mut self) -> Result<u64, String> {
+    Ok(u64::from_le_bytes(self.take(8)?.try_into().unwrap()))
+  }
+  fn usize(&mut self) -> Result<usize, String> {
+    usize::try_from(self.u64()?).map_err(|e| format!("usize overflow: {e}"))
+  }
+  fn g(&mut self) -> Result<Val, String> {
+    Ok(Val::from_u64(self.u64()?))
+  }
+  fn entry(&mut self) -> Result<Entry, String> {
+    Ok(match self.u32()? {
+      0 => Entry::Preprocessed { offset: self.usize()? },
+      1 => Entry::Main { offset: self.usize()? },
+      2 => Entry::Stage2 { offset: self.usize()? },
+      3 => Entry::Public,
+      4 => Entry::Stage2Public,
+      5 => Entry::Challenge,
+      t => return Err(format!("bad Entry tag {t}")),
+    })
+  }
+  fn expr(&mut self) -> Result<Expr, String> {
+    Ok(match self.u32()? {
+      0 => SymbolicExpression::Variable(SymbolicVariable::new(
+        self.entry()?,
+        self.usize()?,
+      )),
+      1 => SymbolicExpression::IsFirstRow,
+      2 => SymbolicExpression::IsLastRow,
+      3 => SymbolicExpression::IsTransition,
+      4 => SymbolicExpression::Constant(self.g()?),
+      5 => {
+        let x = Box::new(self.expr()?);
+        let y = Box::new(self.expr()?);
+        SymbolicExpression::Add { x, y, degree_multiple: self.usize()? }
+      },
+      6 => {
+        let x = Box::new(self.expr()?);
+        let y = Box::new(self.expr()?);
+        SymbolicExpression::Sub { x, y, degree_multiple: self.usize()? }
+      },
+      7 => {
+        let x = Box::new(self.expr()?);
+        SymbolicExpression::Neg { x, degree_multiple: self.usize()? }
+      },
+      8 => {
+        let x = Box::new(self.expr()?);
+        let y = Box::new(self.expr()?);
+        SymbolicExpression::Mul { x, y, degree_multiple: self.usize()? }
+      },
+      t => return Err(format!("bad SymbolicExpression tag {t}")),
+    })
+  }
+  fn vec<T>(
+    &mut self,
+    mut f: impl FnMut(&mut Self) -> Result<T, String>,
+  ) -> Result<Vec<T>, String> {
+    let n = self.usize()?;
+    let mut v = Vec::with_capacity(n.min(1 << 16));
+    for _ in 0..n {
+      v.push(f(self)?);
+    }
+    Ok(v)
+  }
+  fn lookup(&mut self) -> Result<Lookup<Expr>, String> {
+    Ok(Lookup { multiplicity: self.expr()?, args: self.vec(Self::expr)? })
+  }
+  fn aircircuit(&mut self) -> Result<AiurCircuit, String> {
+    Ok(match self.u32()? {
+      0 => AiurCircuit::Function(Constraints {
+        zeros: self.vec(Self::expr)?,
+        selectors: self.usize()?..self.usize()?,
+        width: self.usize()?,
+      }),
+      1 => AiurCircuit::Memory(Memory { width: self.usize()? }),
+      2 => AiurCircuit::Bytes1,
+      3 => AiurCircuit::Bytes2,
+      t => return Err(format!("bad AiurCircuit tag {t}")),
+    })
+  }
+  fn circuit(&mut self) -> Result<Circuit<AiurCircuit>, String> {
+    // LookupAir: inner_air, lookups (preprocessed is `#[serde(skip)]` -> None).
+    let air = LookupAir {
+      inner_air: self.aircircuit()?,
+      lookups: self.vec(Self::lookup)?,
+      preprocessed: None,
+    };
+    Ok(Circuit {
+      air,
+      constraint_count: self.usize()?,
+      max_constraint_degree: self.usize()?,
+      preprocessed_height: self.usize()?,
+      preprocessed_width: self.usize()?,
+      stage_1_width: self.usize()?,
+      stage_2_width: self.usize()?,
+    })
+  }
+  fn commitment(&mut self) -> Result<Commitment, String> {
+    let caps = self.vec(|r| Ok([r.u64()?, r.u64()?, r.u64()?, r.u64()?]))?;
+    Ok(Commitment::from(caps))
+  }
+  fn option<T>(
+    &mut self,
+    f: impl FnOnce(&mut Self) -> Result<T, String>,
+  ) -> Result<Option<T>, String> {
+    match self.u8()? {
+      0 => Ok(None),
+      1 => Ok(Some(f(self)?)),
+      t => Err(format!("bad Option tag {t}")),
+    }
+  }
+}
+
+/// Hand-written deserializer of the verifying-key `System<AiurCircuit>`,
+/// matching serde/bincode byte-for-byte (the Aiur port mirrors this).
+pub(crate) fn manual_deserialize(
+  bytes: &[u8],
+) -> Result<System<AiurCircuit>, String> {
+  let mut r = R { buf: bytes, pos: 0 };
+  let commitment_parameters =
+    CommitmentParameters { log_blowup: r.usize()?, cap_height: r.usize()? };
+  let circuits = r.vec(R::circuit)?;
+  let preprocessed_commit = r.option(R::commitment)?;
+  let preprocessed_indices = r.vec(|r| r.option(R::usize))?;
+  if r.pos != bytes.len() {
+    return Err(format!(
+      "trailing data: consumed {} of {}",
+      r.pos,
+      bytes.len()
+    ));
+  }
+  Ok(System {
+    commitment_parameters,
+    circuits,
+    preprocessed_commit,
+    preprocessed_indices,
+  })
+}
+
+#[cfg(test)]
+mod manual_tests {
+  use super::*;
+  use crate::aiur::gadgets::{AiurGadget, bytes1::Bytes1, bytes2::Bytes2};
+  use multi_stark::{lookup::LookupAir, types::CommitmentParameters};
+
+  /// The manual deserializer agrees with serde: decoding serde's bytes and
+  /// re-encoding with serde reproduces them byte-for-byte.
+  #[test]
+  fn manual_matches_serde() {
+    let cp = CommitmentParameters { log_blowup: 1, cap_height: 0 };
+    let (system, _key) = System::new(
+      cp,
+      [
+        LookupAir::new(AiurCircuit::Bytes1, Bytes1.lookups()),
+        LookupAir::new(AiurCircuit::Bytes2, Bytes2.lookups()),
+      ],
+    );
+    let bytes = to_bytes(&system).expect("serde encode");
+    let manual = manual_deserialize(&bytes).expect("manual decode");
+    let reencoded = to_bytes(&manual).expect("serde re-encode");
+    assert_eq!(bytes, reencoded, "manual deserializer disagrees with serde");
+  }
+
+  #[test]
+  fn manual_rejects_trailing() {
+    let cp = CommitmentParameters { log_blowup: 1, cap_height: 0 };
+    let (system, _key) =
+      System::new(cp, [LookupAir::new(AiurCircuit::Bytes2, Bytes2.lookups())]);
+    let mut bytes = to_bytes(&system).expect("encode");
+    bytes.push(0);
+    assert!(manual_deserialize(&bytes).is_err());
+  }
+}
diff --git a/src/ffi/aiur/protocol.rs b/src/ffi/aiur/protocol.rs
index ed2ec7b4..a26d7391 100644
--- a/src/ffi/aiur/protocol.rs
+++ b/src/ffi/aiur/protocol.rs
@@ -57,6 +57,19 @@ extern "C" fn rs_aiur_proof_of_bytes(
   LeanExternal::alloc(&AIUR_PROOF_CLASS, proof)
 }
 
+/// `Aiur.AiurSystem.vkBytes : @& AiurSystem → ByteArray`
+///
+/// Serializes the verifying key (`System<AiurCircuit>`) — see
+/// `crate::aiur::vk_codec`.
+#[unsafe(no_mangle)]
+extern "C" fn rs_aiur_system_vk_bytes(
+  system: LeanExternal<AiurSystem, LeanBorrowed<'_>>,
+) -> LeanByteArray<LeanOwned> {
+  let bytes = crate::aiur::vk_codec::aiur_system_to_bytes(system.get())
+    .expect("VK serialization error");
+  LeanByteArray::from_bytes(&bytes)
+}
+
 /// `AiurSystem.build : @&Bytecode.Toplevel → @&CommitmentParameters → AiurSystem`
 #[unsafe(no_mangle)]
 extern "C" fn rs_aiur_system_build(