From c1831081b0aeacb6f84b27858c9e6de98d6f28b0 Mon Sep 17 00:00:00 2001
From: shreyas-londhe <shreyas_londhe@outlook.com>
Date: Wed, 13 May 2026 16:51:43 +0530
Subject: [PATCH 1/5] perf(whir_zk): drop linear_forms after covector build

Take linear_forms by value in prepare_and_sumcheck / prove_blinded_polynomial
and drop it as soon as the combined covector has been built. Each Covector
in linear_forms holds num_witnesses field elements; for R1CS circuits with
3 matrices (A, B, C) and millions of witnesses this is ~100 MB freed
before the WHIR commit phase, where peak memory is hit.

Measured peak reduction on provekit (m=20 circuits):
- complete_age_check: 880 -> 805 MB (-8.5%)
- t_add_dsc_1850:     533 -> 497 MB (-6.8%)
- t_add_id_data_1850: 222 -> 203 MB (-8.6%)
- poseidon-rounds:    467 -> 467 MB (no change, small linear forms)

Protocol-equivalent. Transcript byte-identical. E2E prove+verify passes.
---
 src/protocols/whir_zk/prover.rs | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/src/protocols/whir_zk/prover.rs b/src/protocols/whir_zk/prover.rs
index a20c23fa..beafc51d 100644
--- a/src/protocols/whir_zk/prover.rs
+++ b/src/protocols/whir_zk/prover.rs
@@ -119,7 +119,7 @@ where
         &mut self,
         vectors: Vec<Cow<'_, [F]>>,
         g_polys: &[Vec<F>],
-        linear_forms: &[Box<dyn LinearForm<F>>],
+        linear_forms: Vec<Box<dyn LinearForm<F>>>,
         evaluations: &[F],
     ) -> PrepareResult<F> {
         let num_vectors = self.dims.num_vectors;
@@ -159,7 +159,7 @@ where
         let g_claims: Vec<F> = {
             let mut buf = vec![F::ZERO; size];
             let mut claims = Vec::with_capacity(linear_forms.len());
-            for w in linear_forms {
+            for w in &linear_forms {
                 buf.fill(F::ZERO);
                 w.accumulate(&mut buf, F::ONE);
                 claims.push(dot(&buf, &g_poly));
@@ -247,6 +247,8 @@ where
         for (coeff, lf) in constraint_rlc_coeffs.iter().zip(linear_forms.iter()) {
             lf.accumulate(&mut covector, *coeff);
         }
+        // Only the combined `covector` is needed past this point.
+        drop(linear_forms);
 
         let mut the_sum: F = constraint_rlc_coeffs
             .iter()
@@ -475,7 +477,7 @@ impl<F: FftField> Config<F> {
         f_hat_polys: Vec<Vec<F>>,
         masking_polys: &[Vec<F>],
         g_polys: &[Vec<F>],
-        linear_forms: &[Box<dyn LinearForm<F>>],
+        linear_forms: Vec<Box<dyn LinearForm<F>>>,
         evaluations: &[F],
     ) -> BlindedProveResult<F>
     where
@@ -682,13 +684,12 @@ impl<F: FftField> Config<F> {
             f_hat_polys,
             &secrets.masking_polys,
             &secrets.g_polys,
-            &linear_forms,
+            linear_forms,
             &evaluations,
         );
 
         // Free fields only needed during Steps 2-6, before Step 7.
         drop(f_hat_witness);
-        drop(linear_forms);
 
         // Step 7: batched blinding polynomial proof.
         self.prove_blinding_polynomial(

From fc3e614471634e803314b3b6b3dd78cd049a5ed6 Mon Sep 17 00:00:00 2001
From: shreyas-londhe <shreyas_londhe@outlook.com>
Date: Fri, 15 May 2026 11:27:52 +0530
Subject: [PATCH 2/5] perf(whir_zk): hold IRS coefficients, re-encode codeword
 on demand

The initial IRS commit witnesses (f_hat and blinding_poly) previously held
their full Reed-Solomon encoded codewords resident from commit through the
entire whir_zk::prove. The codeword is only consumed at open time (Merkle
path generation + queried row extraction); the coefficients are smaller by
the blowup factor (e.g. 4x at rate 1/4) and already retained for other
protocol uses.

Drop matrix immediately after commit. Re-encode transiently around each
open and drop again after. Three encodes per whir_zk::prove call: one
for each of f_hat's two opens (ood_stir_and_rounds, gamma_check) and one
for blinding_poly's open in prove_blinding_polynomial.

Measured on complete_age_check (m=20, N=5 interleaved):
- peak: 805 -> 706 MB (-99 MB / -12.3%)
- wall (median): 3500 -> 4220 ms (+20.6%, +720 ms)
- allocs: 3.56M -> 3.61M (+50k)

Combined with linear_forms drop (c183108) versus unoptimised v1:
- peak: 880 -> 706 MB (-174 MB / -19.8%)

Protocol-equivalent. Prove + verify roundtrip passes byte-identically.
Re-encoded codeword matches the original since interleaved_rs_encode is
deterministic.
---
 src/protocols/whir_zk/committer.rs |  18 ++++-
 src/protocols/whir_zk/prover.rs    | 107 ++++++++++++++++++++++++-----
 2 files changed, 104 insertions(+), 21 deletions(-)

diff --git a/src/protocols/whir_zk/committer.rs b/src/protocols/whir_zk/committer.rs
index 7b431565..12bdcbc9 100644
--- a/src/protocols/whir_zk/committer.rs
+++ b/src/protocols/whir_zk/committer.rs
@@ -103,7 +103,14 @@ impl<F: FftField> Config<F> {
 
         // Step 1b: Commit [[f̂]] via first WHIR instance.
         let f_hat_refs: Vec<&[F]> = f_hat_polys.iter().map(|p| p.as_slice()).collect();
-        let f_hat_witness = self.blinded_polynomial.commit(prover_state, &f_hat_refs);
+        let mut f_hat_witness = self.blinded_polynomial.commit(prover_state, &f_hat_refs);
+
+        // Drop the encoded codeword; will be re-encoded immediately before each
+        // open in prove_blinded_polynomial (Steps 5 and 6). This keeps the
+        // ~codeword_length × interleaving_depth field elements out of the
+        // resident set during the prepare_and_sumcheck rounds where global peak
+        // hits.
+        f_hat_witness.matrix = Vec::new();
 
         // Step 1c: Sample ν + 1 random ℓ-variate blinding polynomials ĝ₀..ĝ_ν.
         let num_blinding_polys = dims.num_g_polys();
@@ -138,10 +145,17 @@ impl<F: FftField> Config<F> {
         }
         let blinding_refs: Vec<&[F]> = blinding_vectors.iter().map(|v| v.as_slice()).collect();
 
-        let blinding_poly_witness = self
+        let mut blinding_poly_witness = self
             .blinding_polynomial
             .commit(prover_state, &blinding_refs);
 
+        // The encoded codeword is only needed when [[M, ĝ]] is opened in
+        // Step 7. Until then it is dead weight (held resident through all of
+        // prove_blinded_polynomial, where global peak hits). Drop the matrix
+        // here; the prover re-encodes from `secrets.blinding_vectors` just
+        // before calling `blinding_polynomial.prove`.
+        blinding_poly_witness.matrix = Vec::new();
+
         Witness {
             f_hat_witness,
             blinding_poly_witness,
diff --git a/src/protocols/whir_zk/prover.rs b/src/protocols/whir_zk/prover.rs
index beafc51d..ad34bab9 100644
--- a/src/protocols/whir_zk/prover.rs
+++ b/src/protocols/whir_zk/prover.rs
@@ -22,7 +22,9 @@ use crate::{
         embedding::Identity,
         geometric_sequence,
         linear_form::{Covector, Evaluate, LinearForm, UnivariateEvaluation},
-        multilinear_extend, univariate_evaluate, MultilinearPoint,
+        multilinear_extend,
+        ntt::interleaved_rs_encode,
+        univariate_evaluate, MultilinearPoint,
     },
     hash::Hash,
     protocols::{
@@ -314,8 +316,11 @@ where
 
     /// Step 5: OOD/STIR queries, STIR constraint accumulation, and remaining WHIR rounds.
     ///
-    /// Takes ownership of `f_hat_polys` so it can be freed after OOD evaluations,
-    /// before the memory-intensive WHIR rounds begin.
+    /// Borrows `f_hat_polys` so it remains available for re-encoding the
+    /// f_hat codeword in Step 6 (`gamma_check`). The codeword in
+    /// `f_hat_witness.matrix` is re-encoded just before its open and cleared
+    /// immediately after, to keep it out of the resident set during the
+    /// memory-intensive sumcheck rounds.
     #[allow(clippy::too_many_arguments)]
     fn ood_stir_and_rounds(
         &mut self,
@@ -323,8 +328,8 @@ where
         alpha_coeffs: &[F],
         rho: F,
         folding_randomness: MultilinearPoint<F>,
-        f_hat_witness: &irs_commit::Witness<F, F>,
-        f_hat_polys: Vec<Vec<F>>,
+        f_hat_witness: &mut irs_commit::Witness<F, F>,
+        f_hat_polys: &[Vec<F>],
         masking_polys: &[Vec<F>],
         g_polys: &[Vec<F>],
     ) -> OodStirResult<F> {
@@ -336,11 +341,27 @@ where
             .irs_committer
             .commit(self.prover_state, &[state.vector.as_slice()]);
         round_config.pow.prove(self.prover_state);
+
+        // Re-encode f_hat codeword for the upcoming open, then drop it again.
+        let f_hat_refs: Vec<&[F]> = f_hat_polys.iter().map(|p| p.as_slice()).collect();
+        f_hat_witness.matrix = interleaved_rs_encode(
+            &f_hat_refs,
+            self.config
+                .blinded_polynomial
+                .initial_committer
+                .codeword_length,
+            self.config
+                .blinded_polynomial
+                .initial_committer
+                .interleaving_depth,
+        );
+        drop(f_hat_refs);
         let in_domain = self
             .config
             .blinded_polynomial
             .initial_committer
-            .open(self.prover_state, &[f_hat_witness]);
+            .open(self.prover_state, &[&*f_hat_witness]);
+        f_hat_witness.matrix = Vec::new();
 
         let r_bar = folding_randomness.0;
         let eq_weights = compute_eq_weights(&r_bar);
@@ -385,9 +406,9 @@ where
             lambda_z_points.push(z);
         }
 
-        // Release f̂ data before WHIR rounds.
+        // Release f̂_combined before WHIR rounds. f_hat_polys is borrowed
+        // from the caller (still needed for re-encoding in gamma_check).
         drop(f_hat_combined);
-        drop(f_hat_polys);
 
         // --- STIR responses ---
         for &z in &in_domain.points {
@@ -436,9 +457,12 @@ where
     /// Step 6: Γ consistency check.
     ///
     /// Opens [[f̂]] at Γ indices and sends blinding evaluations for each γ ∈ Γ.
+    /// Re-encodes the f_hat codeword into `f_hat_witness.matrix` before the
+    /// open and clears it after, mirroring the pattern in `ood_stir_and_rounds`.
     fn gamma_check(
         &mut self,
-        f_hat_witness: &irs_commit::Witness<F, F>,
+        f_hat_witness: &mut irs_commit::Witness<F, F>,
+        f_hat_polys: &[Vec<F>],
         masking_coeffs_all: &[Vec<F>],
         g_i_coeffs: &[Vec<F>],
         gamma_points: &[F],
@@ -446,6 +470,21 @@ where
     ) {
         let gamma_f_hat_indices = gamma_to_f_hat_indices(gamma_points, self.config);
 
+        // Re-encode f_hat codeword for the open at Γ indices.
+        let f_hat_refs: Vec<&[F]> = f_hat_polys.iter().map(|p| p.as_slice()).collect();
+        f_hat_witness.matrix = interleaved_rs_encode(
+            &f_hat_refs,
+            self.config
+                .blinded_polynomial
+                .initial_committer
+                .codeword_length,
+            self.config
+                .blinded_polynomial
+                .initial_committer
+                .interleaving_depth,
+        );
+        drop(f_hat_refs);
+
         // Writes [[f̂]] openings at Γ indices to the transcript.
         // The verifier uses these to reconstruct fold(r̄, [[f̂]])(γ).
         // Return value (Evaluations) is unused: the prover already knows the values.
@@ -453,7 +492,10 @@ where
             .config
             .blinded_polynomial
             .initial_committer
-            .open_at_indices(self.prover_state, &[f_hat_witness], &gamma_f_hat_indices);
+            .open_at_indices(self.prover_state, &[&*f_hat_witness], &gamma_f_hat_indices);
+
+        // Drop the codeword again; nothing else in this protocol needs it.
+        f_hat_witness.matrix = Vec::new();
 
         for &gamma in gamma_points {
             send_blinding_evals(self.prover_state, gamma, masking_coeffs_all, g_i_coeffs);
@@ -465,16 +507,18 @@ where
 impl<F: FftField> Config<F> {
     /// Steps 2-6: Prove the blinded polynomial instance.
     ///
-    /// `f_hat_polys` is taken by value and freed during OOD evaluations (Step 5),
-    /// before the memory-intensive WHIR rounds begin.
-    /// Other witness fields are borrowed; the caller frees them before Step 7.
+    /// `f_hat_witness.matrix` is empty on entry (cleared at commit time); it
+    /// is re-encoded transiently around each open and cleared afterwards to
+    /// keep the codeword out of the resident set during sumcheck rounds.
+    /// `f_hat_polys` is borrowed (needed for re-encoding in both Step 5
+    /// `ood_stir_and_rounds` and Step 6 `gamma_check`).
     #[allow(clippy::too_many_arguments)]
     fn prove_blinded_polynomial<H, R>(
         &self,
         prover_state: &mut ProverState<H, R>,
         vectors: Vec<Cow<'_, [F]>>,
-        f_hat_witness: &irs_commit::Witness<F, F>,
-        f_hat_polys: Vec<Vec<F>>,
+        f_hat_witness: &mut irs_commit::Witness<F, F>,
+        f_hat_polys: &[Vec<F>],
         masking_polys: &[Vec<F>],
         g_polys: &[Vec<F>],
         linear_forms: Vec<Box<dyn LinearForm<F>>>,
@@ -550,6 +594,7 @@ impl<F: FftField> Config<F> {
 
         ctx.gamma_check(
             f_hat_witness,
+            f_hat_polys,
             &masking_coeffs_all,
             &g_i_coeffs,
             &gamma_points,
@@ -670,18 +715,25 @@ impl<F: FftField> Config<F> {
         Hash: ProverMessage<[H::U]>,
     {
         let Witness {
-            f_hat_witness,
-            blinding_poly_witness,
+            mut f_hat_witness,
+            mut blinding_poly_witness,
             f_hat_polys,
             secrets,
         } = witness;
 
         // Steps 2-6: blinded polynomial proof.
+        // Both `f_hat_witness.matrix` and `blinding_poly_witness.matrix` are
+        // empty here (cleared at commit time). The blinded prover re-encodes
+        // f_hat transiently around each of its two opens; blinding_poly is
+        // re-encoded just before Step 7 below. This keeps both codewords
+        // (~codeword_length × interleaving_depth field elements each) out of
+        // the resident set during the prepare_and_sumcheck rounds where global
+        // peak hits.
         let blinded = self.prove_blinded_polynomial(
             prover_state,
             vectors,
-            &f_hat_witness,
-            f_hat_polys,
+            &mut f_hat_witness,
+            &f_hat_polys,
             &secrets.masking_polys,
             &secrets.g_polys,
             linear_forms,
@@ -690,6 +742,23 @@ impl<F: FftField> Config<F> {
 
         // Free fields only needed during Steps 2-6, before Step 7.
         drop(f_hat_witness);
+        drop(f_hat_polys);
+
+        // Re-encode the [[M, ĝ]] codeword, which was dropped at commit time
+        // to keep the resident set small through Step 6.
+        let blinding_refs: Vec<&[F]> = secrets
+            .blinding_vectors
+            .iter()
+            .map(|v| v.as_slice())
+            .collect();
+        blinding_poly_witness.matrix = interleaved_rs_encode(
+            &blinding_refs,
+            self.blinding_polynomial.initial_committer.codeword_length,
+            self.blinding_polynomial
+                .initial_committer
+                .interleaving_depth,
+        );
+        drop(blinding_refs);
 
         // Step 7: batched blinding polynomial proof.
         self.prove_blinding_polynomial(

From 97fea7c62dea4f67a6bc258dc5798623f4e249f1 Mon Sep 17 00:00:00 2001
From: shreyas-londhe <shreyas_londhe@outlook.com>
Date: Fri, 15 May 2026 14:28:42 +0530
Subject: [PATCH 3/5] =?UTF-8?q?perf(whir=5Fzk):=20output-pruned=20NTT=20fo?=
 =?UTF-8?q?r=20f=CC=82=20opens?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace full Reed-Solomon re-encode at the two [[f̂]] open sites
(ood_stir_and_rounds, gamma_check) with an output-pruned NTT that
materialises only the queried codeword rows. The full
`(num_cols × codeword_length)` codeword matrix is never resident:
peak memory at the IRS-coeff bottleneck drops by a factor of
`codeword_length / in_domain_samples` (≈ 4000× at m=20, k=127), and
the per-encode flop count drops from O(N log N) to O(N (1 + log k)).

Algorithm: Sorensen-Burrus radix-2 DIT, walking the butterfly DAG
backwards from the query set to mark only the cone of butterflies
contributing to the requested outputs. Reuses the existing roots-of-
unity cache. Reference: Sorensen & Burrus, "Efficient computation of
the DFT with only a subset of input or output points" (IEEE TSP 41,
1993). See doc comment on `NttEngine::ntt_partial`.

API additions:
  - `NttEngine::ntt_partial` + `ntt_partial_with_plan_into`
  - `PartialNttPlan` (per-(size,indices) pruning plan, reusable
    across batched NTTs that share the same query set)
  - `ntt::partial_interleaved_rs_encode` (mirrors
    `interleaved_rs_encode` but emits only the rows at `indices`)
  - `irs_commit::Config::{open_from_coeffs, open_at_indices_from_coeffs}`
    (functionally identical transcripts to `open`/`open_at_indices`;
    do not require `witness.matrix` to be populated)

The blinding-poly re-encode in `prove()` is left untouched (small
codeword, negligible cost).

Tests:
  - Randomised property tests vs full NTT across sizes 4..2^15,
    sparse and dense query subsets, zero-padded M<N inputs, and
    edge cases (empty, singletons, repeated indices, size=1).
  - `partial_interleaved_rs_encode` byte-identity against
    `interleaved_rs_encode` + row extraction across four shapes
    spanning the regimes used in whir_zk (depth 1 vs 8, single vs
    multi-poly, rate-1/4 blowup).
  - All 155 existing whir tests still pass; fixed the pre-existing
    `test_rejects_g_claim_forgery_via_rho` to mirror the production
    open path (re-encode blinding_poly_witness before
    `prove_blinding_polynomial`; use new partial-encode opens for
    f̂).
---
 src/algebra/ntt/cooley_tukey.rs | 247 ++++++++++++++++++++++++++++++++
 src/algebra/ntt/mod.rs          | 106 +++++++++++++-
 src/protocols/irs_commit.rs     | 108 ++++++++++++++
 src/protocols/whir_zk/mod.rs    |  34 ++++-
 src/protocols/whir_zk/prover.rs |  79 ++++------
 5 files changed, 516 insertions(+), 58 deletions(-)

diff --git a/src/algebra/ntt/cooley_tukey.rs b/src/algebra/ntt/cooley_tukey.rs
index 74cc5bdb..a405842c 100644
--- a/src/algebra/ntt/cooley_tukey.rs
+++ b/src/algebra/ntt/cooley_tukey.rs
@@ -356,6 +356,164 @@ impl<F: Field> NttEngine<F> {
             size => self.ntt_recurse(values, roots, size),
         }
     }
+
+    /// Output-pruned NTT (Sorensen-Burrus, radix-2 DIT).
+    ///
+    /// Computes the size-`size` NTT of `values` (zero-padded to `size` if
+    /// shorter) and returns the outputs at positions `indices`, in input
+    /// order. Output `j` equals the full NTT at position `indices[j]`.
+    ///
+    /// Walks the butterfly DAG backwards from `indices` to mark only the
+    /// cone of butterflies that contribute to the queried outputs, then
+    /// runs only the marked butterflies on the forward pass. Worst case is
+    /// `O(size * (1 + log(indices.len())))` field operations, vs
+    /// `O(size * log(size))` for a full NTT.
+    ///
+    /// `size` must be a power of two.
+    pub fn ntt_partial(&self, values: &[F], size: usize, indices: &[usize]) -> Vec<F> {
+        let plan = PartialNttPlan::new(size, indices);
+        let mut out = vec![F::ZERO; indices.len()];
+        self.ntt_partial_with_plan_into(values, &plan, &mut out, 1);
+        out
+    }
+
+    /// Run a pruned NTT using a precomputed plan and write outputs into
+    /// `out` at stride `stride` (so `out[j * stride]` holds the result for
+    /// `plan.indices[j]`). When `stride == 1`, output is contiguous.
+    ///
+    /// Sharing a single plan across many NTTs with the same `(size, indices)`
+    /// avoids re-running the O(size · log size) mask construction per call.
+    pub fn ntt_partial_with_plan_into(
+        &self,
+        values: &[F],
+        plan: &PartialNttPlan,
+        out: &mut [F],
+        stride: usize,
+    ) {
+        let size = plan.size;
+        let indices = &plan.indices;
+        assert!(values.len() <= size, "input longer than NTT size");
+        if indices.is_empty() {
+            return;
+        }
+        assert!(
+            out.len() >= (indices.len() - 1) * stride + 1,
+            "output buffer too small for stride"
+        );
+        if size == 1 {
+            let v = values.first().copied().unwrap_or(F::ZERO);
+            for j in 0..indices.len() {
+                out[j * stride] = v;
+            }
+            return;
+        }
+
+        let log_n = size.trailing_zeros() as usize;
+        let roots = self.roots_table(size);
+
+        // Load bit-reversed input into work buffer, gated by mask[0].
+        let mut work = vec![F::ZERO; size];
+        let shift = (usize::BITS as usize) - log_n;
+        for (j, &c) in values.iter().enumerate() {
+            let rev = j.reverse_bits() >> shift;
+            if plan.mask[0][rev] {
+                work[rev] = c;
+            }
+        }
+
+        // Forward DIT, skipping butterflies with no needed outputs.
+        // The shared roots table may hold roots at a larger order than `size`;
+        // `roots[k * twiddle_step]` retrieves ω_m^k regardless.
+        for stage in 1..=log_n {
+            let m = 1usize << stage;
+            let half = m >> 1;
+            let twiddle_step = roots.len() / m;
+            let cur = &plan.mask[stage];
+            let mut base = 0;
+            while base < size {
+                for k in 0..half {
+                    let a = base + k;
+                    let b = a + half;
+                    if cur[a] || cur[b] {
+                        let w = roots[k * twiddle_step];
+                        let t = work[b] * w;
+                        let u = work[a];
+                        work[a] = u + t;
+                        work[b] = u - t;
+                    }
+                }
+                base += m;
+            }
+        }
+
+        for (j, &i) in indices.iter().enumerate() {
+            out[j * stride] = work[i];
+        }
+    }
+}
+
+/// Pruning plan for an output-pruned NTT.
+///
+/// Holds the queried output indices and the precomputed per-stage
+/// "needed-position" masks used by [`NttEngine::ntt_partial_with_plan_into`].
+/// Construct once per `(size, indices)` and reuse across multiple NTTs of
+/// the same shape (e.g. all polynomials in an interleaved batch).
+#[derive(Debug, Clone)]
+pub struct PartialNttPlan {
+    size: usize,
+    indices: Vec<usize>,
+    /// `mask[stage][p]` is true iff position `p` after `stage` DIT stages
+    /// must be correct for the final outputs. `mask[log_n]` mirrors
+    /// `indices`; `mask[0]` selects the bit-reversed input positions that
+    /// must be loaded.
+    mask: Vec<Vec<bool>>,
+}
+
+impl PartialNttPlan {
+    pub fn new(size: usize, indices: &[usize]) -> Self {
+        assert!(size.is_power_of_two(), "size must be a power of two");
+        assert!(
+            indices.iter().all(|&i| i < size),
+            "query index out of range"
+        );
+        let log_n = size.trailing_zeros() as usize;
+        let mut mask: Vec<Vec<bool>> = vec![vec![false; size]; log_n + 1];
+        for &i in indices {
+            mask[log_n][i] = true;
+        }
+        for stage in (1..=log_n).rev() {
+            let m = 1usize << stage;
+            let half = m >> 1;
+            let (lo, hi) = mask.split_at_mut(stage);
+            let cur = &hi[0];
+            let prev = &mut lo[stage - 1];
+            let mut base = 0;
+            while base < size {
+                for k in 0..half {
+                    let a = base + k;
+                    let b = a + half;
+                    if cur[a] || cur[b] {
+                        prev[a] = true;
+                        prev[b] = true;
+                    }
+                }
+                base += m;
+            }
+        }
+        Self {
+            size,
+            indices: indices.to_vec(),
+            mask,
+        }
+    }
+
+    pub fn size(&self) -> usize {
+        self.size
+    }
+
+    pub fn indices(&self) -> &[usize] {
+        &self.indices
+    }
 }
 
 /// Applies twiddle factors to a slice of field elements in-place.
@@ -963,4 +1121,93 @@ mod tests {
 
         assert_eq!(values_ntt, expected_values);
     }
+
+    #[test]
+    fn test_ntt_partial_matches_full() {
+        use ark_std::{rand::Rng, UniformRand};
+
+        let engine = NttEngine::<Field64>::new_from_fftfield();
+        let mut rng = ark_std::test_rng();
+
+        for &size in &[4usize, 16, 64, 256, 1024, 1 << 15] {
+            for _ in 0..8 {
+                // Full NTT reference.
+                let coeffs: Vec<_> = (0..size).map(|_| Field64::rand(&mut rng)).collect();
+                let mut full = coeffs.clone();
+                engine.ntt_batch(&mut full, size);
+
+                // Random subset of varying size (cover dense + sparse).
+                let k = rng.gen_range(1..=size.min(64));
+                let mut perm: Vec<usize> = (0..size).collect();
+                for i in (1..size).rev() {
+                    perm.swap(i, rng.gen_range(0..=i));
+                }
+                let indices: Vec<usize> = perm.into_iter().take(k).collect();
+
+                let partial = engine.ntt_partial(&coeffs, size, &indices);
+                assert_eq!(partial.len(), indices.len());
+                for (j, &idx) in indices.iter().enumerate() {
+                    assert_eq!(partial[j], full[idx], "size={size} idx={idx}");
+                }
+            }
+        }
+    }
+
+    #[test]
+    fn test_ntt_partial_zero_padded_input() {
+        // M < N: input is zero-padded. Partial NTT must agree with full NTT
+        // computed over the zero-padded coefficient vector.
+        use ark_std::UniformRand;
+
+        let engine = NttEngine::<Field64>::new_from_fftfield();
+        let mut rng = ark_std::test_rng();
+
+        for (m, size) in [(1usize, 4), (4, 16), (256, 1024), (1 << 13, 1 << 15)] {
+            let coeffs: Vec<_> = (0..m).map(|_| Field64::rand(&mut rng)).collect();
+            let mut padded = coeffs.clone();
+            padded.resize(size, Field64::ZERO);
+            engine.ntt_batch(&mut padded, size);
+
+            let stride = (size / 8).max(1);
+            let indices: Vec<usize> = (0..size).step_by(stride).take(8).collect();
+            let partial = engine.ntt_partial(&coeffs, size, &indices);
+            for (j, &idx) in indices.iter().enumerate() {
+                assert_eq!(partial[j], padded[idx], "m={m} size={size} idx={idx}");
+            }
+        }
+    }
+
+    #[test]
+    fn test_ntt_partial_edge_cases() {
+        use ark_std::UniformRand;
+
+        let engine = NttEngine::<Field64>::new_from_fftfield();
+        let mut rng = ark_std::test_rng();
+
+        // Empty index set.
+        let coeffs: Vec<_> = (0..16).map(|_| Field64::rand(&mut rng)).collect();
+        let out = engine.ntt_partial(&coeffs, 16, &[]);
+        assert!(out.is_empty());
+
+        // Singleton queries at 0, 1, N/2 - 1, N/2 and N - 1 (N = 64).
+        let coeffs: Vec<_> = (0..64).map(|_| Field64::rand(&mut rng)).collect();
+        let mut full = coeffs.clone();
+        engine.ntt_batch(&mut full, 64);
+        for idx in [0usize, 1, 31, 32, 63] {
+            let out = engine.ntt_partial(&coeffs, 64, &[idx]);
+            assert_eq!(out, vec![full[idx]], "idx={idx}");
+        }
+
+        // Repeated indices: each occurrence must yield the matching output.
+        let indices = vec![5usize, 5, 17, 5, 17];
+        let out = engine.ntt_partial(&coeffs, 64, &indices);
+        for (j, &idx) in indices.iter().enumerate() {
+            assert_eq!(out[j], full[idx]);
+        }
+
+        // size = 1: any indices must all return values[0].
+        let single = vec![Field64::from(42)];
+        let out = engine.ntt_partial(&single, 1, &[0, 0, 0]);
+        assert_eq!(out, vec![Field64::from(42); 3]);
+    }
 }
diff --git a/src/algebra/ntt/mod.rs b/src/algebra/ntt/mod.rs
index 525a4985..cdaed755 100644
--- a/src/algebra/ntt/mod.rs
+++ b/src/algebra/ntt/mod.rs
@@ -21,9 +21,9 @@ use static_assertions::assert_obj_safe;
 #[cfg(feature = "tracing")]
 use tracing::instrument;
 
-use self::matrix::MatrixMut;
+use self::{cooley_tukey::NttEngine, matrix::MatrixMut};
 pub use self::{
-    cooley_tukey::{generator, intt, intt_batch, ntt, ntt_batch},
+    cooley_tukey::{generator, intt, intt_batch, ntt, ntt_batch, PartialNttPlan},
     transpose::transpose,
     wavelet::{inverse_wavelet_transform, wavelet_transform},
 };
@@ -93,6 +93,54 @@ pub fn interleaved_rs_encode<F: 'static>(
     engine.interleaved_encode(interleaved_coeffs, codeword_length, interleaving_depth)
 }
 
+/// Partial Reed-Solomon encode that materialises only the rows at `indices`.
+///
+/// Equivalent to taking [`interleaved_rs_encode`]'s output (a row-major
+/// `(codeword_length, num_polys * interleaving_depth)` matrix) and
+/// extracting the rows whose row index is in `indices`. Output layout is
+/// row-major `(indices.len(), num_polys * interleaving_depth)`, byte-exact
+/// against the full encode.
+///
+/// Uses an output-pruned NTT (see [`PartialNttPlan`]): the returned matrix has
+/// `indices.len()` rows instead of `codeword_length`, but each column's NTT
+/// still uses a `codeword_length`-element scratch buffer. The pruning plan is
+/// built once for the index set and reused across every polynomial × slot.
+#[cfg_attr(feature = "tracing", instrument(level = "debug", skip(coeffs, indices), fields(size = coeffs.len(), k = indices.len())))]
+pub fn partial_interleaved_rs_encode<F: FftField>(
+    coeffs: &[&[F]],
+    codeword_length: usize,
+    interleaving_depth: usize,
+    indices: &[usize],
+) -> Vec<F> {
+    if coeffs.is_empty() || indices.is_empty() {
+        return Vec::new();
+    }
+    let poly_size = coeffs[0].len();
+    for poly in coeffs {
+        assert_eq!(poly.len(), poly_size);
+    }
+    assert!(poly_size.is_multiple_of(interleaving_depth));
+    let message_length = poly_size / interleaving_depth;
+    assert!(codeword_length.is_multiple_of(message_length));
+
+    let num_polys = coeffs.len();
+    let num_cols = num_polys * interleaving_depth;
+    let k = indices.len();
+
+    let engine = NttEngine::<F>::new_from_cache();
+    let plan = PartialNttPlan::new(codeword_length, indices);
+
+    let mut out = vec![F::ZERO; k * num_cols];
+    for (poly_idx, poly) in coeffs.iter().enumerate() {
+        for slot_idx in 0..interleaving_depth {
+            let col = poly_idx * interleaving_depth + slot_idx;
+            let block = &poly[slot_idx * message_length..(slot_idx + 1) * message_length];
+            engine.ntt_partial_with_plan_into(block, &plan, &mut out[col..], num_cols);
+        }
+    }
+    out
+}
+
 ///
 /// RS encode coefficients grouped in `interleaving_depth` contiguous blocks
 /// at the rate 1/`expansion`, then interleave the evaluations per point.
@@ -350,4 +398,58 @@ mod tests {
             interleaved_rs_encode(&[poly.as_slice()], codeword_length, 1 << folding_factor);
         assert_eq!(expected, interleaved_ntt);
     }
+
+    #[test]
+    fn test_partial_interleaved_rs_encode_matches_full() {
+        use ark_std::{rand::Rng, UniformRand};
+
+        let mut rng = ark_std::test_rng();
+
+        // Span several (num_polys, interleaving_depth, M, N) shapes: one
+        // polynomial at depth 1 and depth 8, two polynomials at depth 4,
+        // all with N = 4 * M blowup.
+        let cases = [
+            (1usize, 1usize, 64usize, 256usize),
+            (1, 8, 16, 64),
+            (2, 4, 32, 128),
+            (1, 8, 1 << 10, 1 << 12),
+        ];
+
+        for (num_polys, interleaving_depth, message_length, codeword_length) in cases {
+            let poly_size = message_length * interleaving_depth;
+            let polys: Vec<Vec<Field64>> = (0..num_polys)
+                .map(|_| (0..poly_size).map(|_| Field64::rand(&mut rng)).collect())
+                .collect();
+            let poly_slices: Vec<&[Field64]> = polys.iter().map(Vec::as_slice).collect();
+
+            let full = interleaved_rs_encode(&poly_slices, codeword_length, interleaving_depth);
+            let num_cols = num_polys * interleaving_depth;
+            assert_eq!(full.len(), codeword_length * num_cols);
+
+            // Random subset of k distinct row indices: shuffle 0..N, keep the first k.
+            let k = rng.gen_range(1..=codeword_length.min(16));
+            let mut perm: Vec<usize> = (0..codeword_length).collect();
+            for i in (1..codeword_length).rev() {
+                perm.swap(i, rng.gen_range(0..=i));
+            }
+            let indices: Vec<usize> = perm.into_iter().take(k).collect();
+
+            let partial = partial_interleaved_rs_encode(
+                &poly_slices,
+                codeword_length,
+                interleaving_depth,
+                &indices,
+            );
+            assert_eq!(partial.len(), k * num_cols);
+
+            for (row, &idx) in indices.iter().enumerate() {
+                let full_row = &full[idx * num_cols..(idx + 1) * num_cols];
+                let partial_row = &partial[row * num_cols..(row + 1) * num_cols];
+                assert_eq!(
+                    partial_row, full_row,
+                    "shape=({num_polys},{interleaving_depth},{message_length},{codeword_length}) row idx={idx}"
+                );
+            }
+        }
+    }
 }
diff --git a/src/protocols/irs_commit.rs b/src/protocols/irs_commit.rs
index f7fbdbec..d7c0df49 100644
--- a/src/protocols/irs_commit.rs
+++ b/src/protocols/irs_commit.rs
@@ -465,6 +465,114 @@ where
         self.verify_inner(verifier_state, commitments, indices, points)
     }
 
+    /// Opens the commitment without requiring `witness.matrix` to be
+    /// populated.
+    ///
+    /// Functionally identical to [`open`]: same in-domain challenges, same
+    /// transcript bytes (submatrix hint + Merkle paths), same returned
+    /// [`Evaluations`]. The queried codeword rows are instead recomputed from
+    /// the supplied coefficients via an output-pruned NTT (see
+    /// [`ntt::partial_interleaved_rs_encode`]), so the full
+    /// `(codeword_length × num_cols)` matrix in `witness.matrix` is never built.
+    ///
+    /// `coeffs_per_witness[i]` must be the polynomial slices that produce
+    /// `witnesses[i].matrix` via [`interleaved_rs_encode`]; a mismatch results
+    /// in verifier rejection. Panics if `coeffs_per_witness` and `witnesses`
+    /// differ in length or a witness's out-of-domain data is mis-sized.
+    pub fn open_from_coeffs<H, R>(
+        &self,
+        prover_state: &mut ProverState<H, R>,
+        coeffs_per_witness: &[&[&[M::Source]]],
+        witnesses: &[&Witness<M::Source, M::Target>],
+    ) -> Evaluations<M::Source>
+    where
+        H: DuplexSpongeInterface,
+        R: RngCore + CryptoRng,
+        u8: Decoding<[H::U]>,
+        Hash: ProverMessage<[H::U]>,
+    {
+        assert_eq!(coeffs_per_witness.len(), witnesses.len());
+        for witness in witnesses {
+            assert_eq!(witness.out_of_domain.points.len(), self.out_domain_samples);
+            assert_eq!(
+                witness.out_of_domain.matrix.len(),
+                self.out_domain_samples * self.num_vectors
+            );
+        }
+        let (indices, points) = self.in_domain_challenges(prover_state);
+        self.open_inner_from_coeffs(
+            prover_state,
+            coeffs_per_witness,
+            witnesses,
+            &indices,
+            points,
+        )
+    }
+
+    /// Like [`open_from_coeffs`] but with caller-provided indices, mirroring [`open_at_indices`];
+    /// used for the Γ check. Panics on an index >= `codeword_length` or mismatched slice lengths.
+    pub fn open_at_indices_from_coeffs<H, R>(
+        &self,
+        prover_state: &mut ProverState<H, R>,
+        coeffs_per_witness: &[&[&[M::Source]]],
+        witnesses: &[&Witness<M::Source, M::Target>],
+        indices: &[usize],
+    ) -> Evaluations<M::Source>
+    where
+        H: DuplexSpongeInterface,
+        R: RngCore + CryptoRng,
+        Hash: ProverMessage<[H::U]>,
+    {
+        assert!(
+            indices.iter().all(|&i| i < self.codeword_length),
+            "index out of bounds: all indices must be < codeword_length ({})",
+            self.codeword_length
+        );
+        assert_eq!(coeffs_per_witness.len(), witnesses.len());
+        let generator = self.generator();
+        let points: Vec<M::Source> = indices.iter().map(|&i| generator.pow([i as u64])).collect();
+        self.open_inner_from_coeffs(prover_state, coeffs_per_witness, witnesses, indices, points)
+    }
+
+    /// Shared open logic for [`open_from_coeffs`] and [`open_at_indices_from_coeffs`].
+    fn open_inner_from_coeffs<H, R>(
+        &self,
+        prover_state: &mut ProverState<H, R>,
+        coeffs_per_witness: &[&[&[M::Source]]],
+        witnesses: &[&Witness<M::Source, M::Target>],
+        indices: &[usize],
+        points: Vec<M::Source>,
+    ) -> Evaluations<M::Source>
+    where
+        H: DuplexSpongeInterface,
+        R: RngCore + CryptoRng,
+        Hash: ProverMessage<[H::U]>,
+    {
+        let num_cols = self.num_cols();
+        let stride = witnesses.len() * num_cols; // width of one combined output row
+        let mut matrix = vec![M::Source::ZERO; indices.len() * stride];
+        let mut matrix_col_offset = 0; // first column of the current witness
+        for (coeffs, witness) in coeffs_per_witness.iter().zip(witnesses) {
+            let submatrix = ntt::partial_interleaved_rs_encode(
+                coeffs,
+                self.codeword_length,
+                self.interleaving_depth,
+                indices,
+            );
+            debug_assert_eq!(submatrix.len(), indices.len() * num_cols);
+            for (row, src) in submatrix.chunks_exact(num_cols).enumerate() {
+                let dst = &mut matrix[row * stride + matrix_col_offset
+                    ..row * stride + matrix_col_offset + num_cols];
+                dst.copy_from_slice(src); // place this witness's row into its column band
+            }
+            prover_state.prover_hint_ark(&submatrix); // hint carries this witness's rows only
+            self.matrix_commit
+                .open(prover_state, &witness.matrix_witness, indices);
+            matrix_col_offset += num_cols;
+        }
+        Evaluations { points, matrix }
+    }
+
     /// Shared open logic for [`open`] and [`open_at_indices`].
     fn open_inner<H, R>(
         &self,
diff --git a/src/protocols/whir_zk/mod.rs b/src/protocols/whir_zk/mod.rs
index 8b449c69..ee782854 100644
--- a/src/protocols/whir_zk/mod.rs
+++ b/src/protocols/whir_zk/mod.rs
@@ -964,10 +964,11 @@ mod tests {
             .irs_committer
             .commit(&mut prover_state, &[&f_zk]);
         round_config.pow.prove(&mut prover_state);
+        let f_hat_refs: Vec<&[F]> = f_hat_polys.iter().map(Vec::as_slice).collect();
         let in_domain = config
             .blinded_polynomial
             .initial_committer
-            .open(&mut prover_state, &[&f_hat_witness]);
+            .open_from_coeffs(&mut prover_state, &[&f_hat_refs], &[&f_hat_witness]);
 
         let mut lambda_z_points: Vec<F> = Vec::new();
         let send_blinding = |ps: &mut ProverState<_, _>, z: F| {
@@ -989,7 +990,6 @@ mod tests {
             send_blinding(&mut prover_state, z);
             lambda_z_points.push(z);
         }
-        drop(f_hat_polys);
         for &z in &in_domain.points {
             send_blinding(&mut prover_state, z);
             lambda_z_points.push(z);
@@ -1034,11 +1034,16 @@ mod tests {
             &round0_folding,
         );
         let gamma_points = remaining.first_in_domain_points;
-        let _ = config.blinded_polynomial.initial_committer.open_at_indices(
-            &mut prover_state,
-            &[&f_hat_witness],
-            &gamma_to_f_hat_indices(&gamma_points, &config),
-        );
+        let _ = config
+            .blinded_polynomial
+            .initial_committer
+            .open_at_indices_from_coeffs(
+                &mut prover_state,
+                &[&f_hat_refs],
+                &[&f_hat_witness],
+                &gamma_to_f_hat_indices(&gamma_points, &config),
+            );
+        drop(f_hat_polys);
         for &gamma in &gamma_points {
             send_blinding(&mut prover_state, gamma);
             lambda_z_points.push(gamma);
@@ -1068,6 +1073,21 @@ mod tests {
             .iter()
             .map(|v| Cow::Borrowed(v.as_slice()))
             .collect();
+        // Re-encode blinding_poly_witness.matrix (cleared at commit time);
+        // mirrors prover.rs::prove_blinded_polynomial before
+        // `prove_blinding_polynomial`.
+        let blinding_refs: Vec<&[F]> = secrets
+            .blinding_vectors
+            .iter()
+            .map(|v| v.as_slice())
+            .collect();
+        let mut blinding_poly_witness = blinding_poly_witness;
+        blinding_poly_witness.matrix = crate::algebra::ntt::interleaved_rs_encode(
+            &blinding_refs,
+            config.blinding_polynomial.initial_committer.codeword_length,
+            config.blinding_polynomial.initial_committer.interleaving_depth,
+        );
+        drop(blinding_refs);
         let _ = config.blinding_polynomial.prove(
             &mut prover_state,
             blinding_cows,
diff --git a/src/protocols/whir_zk/prover.rs b/src/protocols/whir_zk/prover.rs
index ad34bab9..f471f0d1 100644
--- a/src/protocols/whir_zk/prover.rs
+++ b/src/protocols/whir_zk/prover.rs
@@ -316,11 +316,10 @@ where
 
     /// Step 5: OOD/STIR queries, STIR constraint accumulation, and remaining WHIR rounds.
     ///
-    /// Borrows `f_hat_polys` so it remains available for re-encoding the
-    /// f_hat codeword in Step 6 (`gamma_check`). The codeword in
-    /// `f_hat_witness.matrix` is re-encoded just before its open and cleared
-    /// immediately after, to keep it out of the resident set during the
-    /// memory-intensive sumcheck rounds.
+    /// Borrows `f_hat_polys` so it remains available for the f̂ open in
+    /// Step 6 (`gamma_check`). The [[f̂]] open uses an output-pruned NTT
+    /// (`open_from_coeffs`) that materialises only the queried codeword
+    /// rows, so `f_hat_witness.matrix` stays empty throughout.
     #[allow(clippy::too_many_arguments)]
     fn ood_stir_and_rounds(
         &mut self,
@@ -342,26 +341,20 @@ where
             .commit(self.prover_state, &[state.vector.as_slice()]);
         round_config.pow.prove(self.prover_state);
 
-        // Re-encode f_hat codeword for the upcoming open, then drop it again.
+        // Open [[f̂]] at in-domain indices via output-pruned NTT: only the
+        // k = in_domain_samples queried codeword rows are materialised,
+        // skipping the full Reed-Solomon re-encode and its (num_cols ×
+        // codeword_length) allocation.
         let f_hat_refs: Vec<&[F]> = f_hat_polys.iter().map(|p| p.as_slice()).collect();
-        f_hat_witness.matrix = interleaved_rs_encode(
-            &f_hat_refs,
-            self.config
-                .blinded_polynomial
-                .initial_committer
-                .codeword_length,
-            self.config
-                .blinded_polynomial
-                .initial_committer
-                .interleaving_depth,
-        );
-        drop(f_hat_refs);
         let in_domain = self
             .config
             .blinded_polynomial
             .initial_committer
-            .open(self.prover_state, &[&*f_hat_witness]);
-        f_hat_witness.matrix = Vec::new();
+            .open_from_coeffs(
+                self.prover_state,
+                &[&f_hat_refs],
+                &[&*f_hat_witness],
+            );
 
         let r_bar = folding_randomness.0;
         let eq_weights = compute_eq_weights(&r_bar);
@@ -456,9 +449,9 @@ where
 
     /// Step 6: Γ consistency check.
     ///
-    /// Opens [[f̂]] at Γ indices and sends blinding evaluations for each γ ∈ Γ.
-    /// Re-encodes the f_hat codeword into `f_hat_witness.matrix` before the
-    /// open and clears it after, mirroring the pattern in `ood_stir_and_rounds`.
+    /// Opens [[f̂]] at Γ indices via `open_at_indices_from_coeffs` (output-
+    /// pruned NTT) and sends blinding evaluations for each γ ∈ Γ. The
+    /// codeword matrix is never materialised.
     fn gamma_check(
         &mut self,
         f_hat_witness: &mut irs_commit::Witness<F, F>,
@@ -470,32 +463,20 @@ where
     ) {
         let gamma_f_hat_indices = gamma_to_f_hat_indices(gamma_points, self.config);
 
-        // Re-encode f_hat codeword for the open at Γ indices.
+        // Open [[f̂]] at Γ indices via output-pruned NTT: the verifier
+        // reconstructs fold(r̄, [[f̂]])(γ) from these openings. Return value
+        // is unused; the prover already knows the values.
         let f_hat_refs: Vec<&[F]> = f_hat_polys.iter().map(|p| p.as_slice()).collect();
-        f_hat_witness.matrix = interleaved_rs_encode(
-            &f_hat_refs,
-            self.config
-                .blinded_polynomial
-                .initial_committer
-                .codeword_length,
-            self.config
-                .blinded_polynomial
-                .initial_committer
-                .interleaving_depth,
-        );
-        drop(f_hat_refs);
-
-        // Writes [[f̂]] openings at Γ indices to the transcript.
-        // The verifier uses these to reconstruct fold(r̄, [[f̂]])(γ).
-        // Return value (Evaluations) is unused: the prover already knows the values.
         let _f_hat_openings = self
             .config
             .blinded_polynomial
             .initial_committer
-            .open_at_indices(self.prover_state, &[&*f_hat_witness], &gamma_f_hat_indices);
-
-        // Drop the codeword again; nothing else in this protocol needs it.
-        f_hat_witness.matrix = Vec::new();
+            .open_at_indices_from_coeffs(
+                self.prover_state,
+                &[&f_hat_refs],
+                &[&*f_hat_witness],
+                &gamma_f_hat_indices,
+            );
 
         for &gamma in gamma_points {
             send_blinding_evals(self.prover_state, gamma, masking_coeffs_all, g_i_coeffs);
@@ -507,11 +488,11 @@ where
 impl<F: FftField> Config<F> {
     /// Steps 2-6: Prove the blinded polynomial instance.
     ///
-    /// `f_hat_witness.matrix` is empty on entry (cleared at commit time); it
-    /// is re-encoded transiently around each open and cleared afterwards to
-    /// keep the codeword out of the resident set during sumcheck rounds.
-    /// `f_hat_polys` is borrowed (needed for re-encoding in both Step 5
-    /// `ood_stir_and_rounds` and Step 6 `gamma_check`).
+    /// `f_hat_witness.matrix` is empty on entry (cleared at commit time)
+    /// and stays empty: both [[f̂]] opens (in `ood_stir_and_rounds` and
+    /// `gamma_check`) use output-pruned encoding, so the full codeword
+    /// matrix is never materialised. `f_hat_polys` is borrowed because
+    /// both opens read coefficients from it.
     #[allow(clippy::too_many_arguments)]
     fn prove_blinded_polynomial<H, R>(
         &self,

From d4ad1fb46300aa1aacf6ea894b4ab8eae917040c Mon Sep 17 00:00:00 2001
From: shreyas-londhe <shreyas_londhe@outlook.com>
Date: Fri, 15 May 2026 14:47:52 +0530
Subject: [PATCH 4/5] perf(whir_zk): parallelise partial_interleaved_rs_encode
 batches

Each (poly_idx, slot_idx) NTT in the partial encode is independent.
Switch to a batch-major intermediate (`(num_cols, k)`) populated via
`par_chunks_exact_mut` and transpose to the row-major output. Brings
the partial encode in line with the parallel batching the existing
`ntt_batch` performs inside the full encode.
---
 src/algebra/ntt/mod.rs | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/src/algebra/ntt/mod.rs b/src/algebra/ntt/mod.rs
index cdaed755..2103a28a 100644
--- a/src/algebra/ntt/mod.rs
+++ b/src/algebra/ntt/mod.rs
@@ -17,6 +17,8 @@ use std::{
 };
 
 use ark_ff::{FftField, Field};
+#[cfg(feature = "parallel")]
+use rayon::prelude::*;
 use static_assertions::assert_obj_safe;
 #[cfg(feature = "tracing")]
 use tracing::instrument;
@@ -130,15 +132,37 @@ pub fn partial_interleaved_rs_encode<F: FftField>(
     let engine = NttEngine::<F>::new_from_cache();
     let plan = PartialNttPlan::new(codeword_length, indices);
 
-    let mut out = vec![F::ZERO; k * num_cols];
-    for (poly_idx, poly) in coeffs.iter().enumerate() {
-        for slot_idx in 0..interleaving_depth {
-            let col = poly_idx * interleaving_depth + slot_idx;
-            let block = &poly[slot_idx * message_length..(slot_idx + 1) * message_length];
-            engine.ntt_partial_with_plan_into(block, &plan, &mut out[col..], num_cols);
-        }
+    // Build the submatrix in batch-major layout (`(num_cols, k)`): each
+    // contiguous k-chunk is one NTT's outputs. Batches are independent, so
+    // populate in parallel across (poly_idx, slot_idx). Final transpose
+    // converts to the row-major `(k, num_cols)` layout that
+    // `irs_commit::open_inner_from_coeffs` expects.
+    let mut batch_major = vec![F::ZERO; num_cols * k];
+
+    #[cfg(feature = "parallel")]
+    {
+        batch_major
+            .par_chunks_exact_mut(k)
+            .enumerate()
+            .for_each(|(col, dst)| {
+                let poly_idx = col / interleaving_depth;
+                let slot_idx = col % interleaving_depth;
+                let block = &coeffs[poly_idx]
+                    [slot_idx * message_length..(slot_idx + 1) * message_length];
+                engine.ntt_partial_with_plan_into(block, &plan, dst, 1);
+            });
+    }
+    #[cfg(not(feature = "parallel"))]
+    for (col, dst) in batch_major.chunks_exact_mut(k).enumerate() {
+        let poly_idx = col / interleaving_depth;
+        let slot_idx = col % interleaving_depth;
+        let block =
+            &coeffs[poly_idx][slot_idx * message_length..(slot_idx + 1) * message_length];
+        engine.ntt_partial_with_plan_into(block, &plan, dst, 1);
     }
-    out
+
+    transpose(&mut batch_major, num_cols, k);
+    batch_major
 }
 
 ///

From c39ce013d24e095a6153681120f2c1e8bed2ed8e Mon Sep 17 00:00:00 2001
From: shreyas-londhe <shreyas_londhe@outlook.com>
Date: Fri, 15 May 2026 15:19:00 +0530
Subject: [PATCH 5/5] chore(whir_zk): clippy + signature cleanup
 post-pruned-NTT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewer flagged that `f_hat_witness: &mut` in `ood_stir_and_rounds`,
`gamma_check`, and `prove_blinded_polynomial` is no longer accurate —
the partial-encode path never mutates the witness. Switch to `&` and
drop the now-redundant `&*` reborrows. Misleading `&mut` could mask
future bugs where the witness is unintentionally mutated.

Also applies the smaller clippy/fmt nits the reviewer surfaced:
- ntt_partial: allow(dead_code) (kept pub for external callers; the
  hot path uses ntt_partial_with_plan_into)
- PartialNttPlan::size: const fn
- ntt_partial_with_plan_into: allow(significant_drop_tightening); the
  roots-table RwLockReadGuard is intentionally held across all DIT
  stages, mirroring ntt_dispatch
- assertion comparison form: `> n` instead of `>= n + 1`
- cargo fmt

`cargo clippy -- -D warnings` is now clean; 155 lib tests still pass.
---
 src/algebra/ntt/cooley_tukey.rs |  6 ++++--
 src/protocols/whir_zk/prover.rs | 14 +++++++-------
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/algebra/ntt/cooley_tukey.rs b/src/algebra/ntt/cooley_tukey.rs
index a405842c..a9380b3e 100644
--- a/src/algebra/ntt/cooley_tukey.rs
+++ b/src/algebra/ntt/cooley_tukey.rs
@@ -370,6 +370,7 @@ impl<F: Field> NttEngine<F> {
     /// `O(size * log(size))` for a full NTT.
     ///
     /// `size` must be a power of two.
+    #[allow(dead_code)] // public single-shot entry; batched callers use the plan-based path
     pub fn ntt_partial(&self, values: &[F], size: usize, indices: &[usize]) -> Vec<F> {
         let plan = PartialNttPlan::new(size, indices);
         let mut out = vec![F::ZERO; indices.len()];
@@ -383,6 +384,7 @@ impl<F: Field> NttEngine<F> {
     ///
     /// Sharing a single plan across many NTTs with the same `(size, indices)`
     /// avoids re-running the O(size · log size) mask construction per call.
+    #[allow(clippy::significant_drop_tightening)] // roots guard intentionally held across DIT stages
     pub fn ntt_partial_with_plan_into(
         &self,
         values: &[F],
@@ -397,7 +399,7 @@ impl<F: Field> NttEngine<F> {
             return;
         }
         assert!(
-            out.len() >= (indices.len() - 1) * stride + 1,
+            out.len() > (indices.len() - 1) * stride,
             "output buffer too small for stride"
         );
         if size == 1 {
@@ -507,7 +509,7 @@ impl PartialNttPlan {
         }
     }
 
-    pub fn size(&self) -> usize {
+    pub const fn size(&self) -> usize {
         self.size
     }
 
diff --git a/src/protocols/whir_zk/prover.rs b/src/protocols/whir_zk/prover.rs
index f471f0d1..57ec8f36 100644
--- a/src/protocols/whir_zk/prover.rs
+++ b/src/protocols/whir_zk/prover.rs
@@ -327,7 +327,7 @@ where
         alpha_coeffs: &[F],
         rho: F,
         folding_randomness: MultilinearPoint<F>,
-        f_hat_witness: &mut irs_commit::Witness<F, F>,
+        f_hat_witness: &irs_commit::Witness<F, F>,
         f_hat_polys: &[Vec<F>],
         masking_polys: &[Vec<F>],
         g_polys: &[Vec<F>],
@@ -353,7 +353,7 @@ where
             .open_from_coeffs(
                 self.prover_state,
                 &[&f_hat_refs],
-                &[&*f_hat_witness],
+                &[f_hat_witness],
             );
 
         let r_bar = folding_randomness.0;
@@ -454,7 +454,7 @@ where
     /// codeword matrix is never materialised.
     fn gamma_check(
         &mut self,
-        f_hat_witness: &mut irs_commit::Witness<F, F>,
+        f_hat_witness: &irs_commit::Witness<F, F>,
         f_hat_polys: &[Vec<F>],
         masking_coeffs_all: &[Vec<F>],
         g_i_coeffs: &[Vec<F>],
@@ -474,7 +474,7 @@ where
             .open_at_indices_from_coeffs(
                 self.prover_state,
                 &[&f_hat_refs],
-                &[&*f_hat_witness],
+                &[f_hat_witness],
                 &gamma_f_hat_indices,
             );
 
@@ -498,7 +498,7 @@ impl<F: FftField> Config<F> {
         &self,
         prover_state: &mut ProverState<H, R>,
         vectors: Vec<Cow<'_, [F]>>,
-        f_hat_witness: &mut irs_commit::Witness<F, F>,
+        f_hat_witness: &irs_commit::Witness<F, F>,
         f_hat_polys: &[Vec<F>],
         masking_polys: &[Vec<F>],
         g_polys: &[Vec<F>],
@@ -696,7 +696,7 @@ impl<F: FftField> Config<F> {
         Hash: ProverMessage<[H::U]>,
     {
         let Witness {
-            mut f_hat_witness,
+            f_hat_witness,
             mut blinding_poly_witness,
             f_hat_polys,
             secrets,
@@ -713,7 +713,7 @@ impl<F: FftField> Config<F> {
         let blinded = self.prove_blinded_polynomial(
             prover_state,
             vectors,
-            &mut f_hat_witness,
+            &f_hat_witness,
             &f_hat_polys,
             &secrets.masking_polys,
             &secrets.g_polys,