From a04e728eb4e43098d18d938d36b01c304a63d0f0 Mon Sep 17 00:00:00 2001 From: Barnadrot Date: Fri, 12 Jun 2026 17:41:38 +0200 Subject: [PATCH 1/8] pw13-1 T0: h-wf kill-ladder benches (T0a spec 0.76 PASS but e2e 1.26 => lazy-ONCE design mandated; T0b EFxEF/basexEF 2.29 => MAX_SLICES 2) T0a (n_vars=26, 11-stmt production shape, bit-identical polys asserted): baseline combine 158ms + r0 15ms + r1fold 78ms; lazy-twice terms 14 + r0 130 + r1 174. Memory passes nearly free on M4 (1.34GB read = 12-15ms) - combine cost is EF arithmetic + RMW scatter, NOT bandwidth. Lazy-twice evaluation loses; fusing combine INTO round-0 with single lazy evaluation + stream materialization is the viable variant (~-30-40ms). T0b: EFxEF/basexEF = 2.29 stable across 2^20..2^23 => exactly one delayed-EF round profitable (margin ~13%); T2 scoped to r1 evals-side slices. packing_log_width = 2 (4 NEON lanes). Co-Authored-By: Claude Fable 5 --- crates/whir/src/open.rs | 431 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 431 insertions(+) diff --git a/crates/whir/src/open.rs b/crates/whir/src/open.rs index 358d23814..906cb8fce 100644 --- a/crates/whir/src/open.rs +++ b/crates/whir/src/open.rs @@ -616,3 +616,434 @@ where } (combined_weights, combined_sum) } + +// --------------------------------------------------------------------------- +// h-wf kill-ladder rung benches (pw13-mac iter-1, hypothesis "whir-lazy-fusion"). +// Test-only; no production code change. +// T0a: lazy chunk-wise weight evaluation (never materializing the 1.34 GB +// combined weight) vs the materialized combine_statement + round-0 + +// round-1-fold baseline. Gates (plan_spec): lazy_r0/(combine+read) <= 1.3 +// PASS, 1.3-2.0 GRAY, > 2.0 KILL. Bit-identical round polys asserted. +// T0b: EFxEF vs basexEF packed product-sumcheck ratio -> pins MAX_SLICES for +// the delayed-EF representation (BDT 2024/1046). 
+// --------------------------------------------------------------------------- +#[cfg(test)] +mod fusion_bench { + use super::*; + use crate::{SparseStatement, SparseValue}; + use field::{PackedFieldExtension, PackedValue, PrimeCharacteristicRing}; + use koala_bear::{KoalaBear, QuinticExtensionFieldKB}; + use rand::{RngExt, SeedableRng, rngs::StdRng}; + use std::hint::black_box; + use std::time::Instant; + use sumcheck::{compute_product_sumcheck_polynomial, fold_and_compute_product_sumcheck_polynomial}; + + type F = KoalaBear; + type EF = QuinticExtensionFieldKB; + type FP = PFPacking; + type EFP = EFPacking; + + fn w_log() -> usize { + packing_log_width::() + } + + #[inline(always)] + fn unpack_sum(s: EFP) -> EF { + >::to_ext_iter([s]).sum::() + } + + fn decompose(e: EFP) -> Vec { + >::to_ext_iter([e]).collect() + } + + /// Full-eq term: scalar pre-multiplied into the prefix table. + /// value(j) = right_packed[j & rmask] * left[j >> rshift] + struct FullT { + left: ArenaVec, // 2^A entries, scaled + right: ArenaVec, // 2^(n - A - w) packed entries + rshift: usize, // n - A - w + rmask: usize, + } + + /// Dense block term (eq or next inner poly replicated over consecutive + /// selector blocks with per-block scalars). + /// Covers packed range [start, start + (n_blocks << ishift)). 
+ struct DenseT { + start: usize, // packed units + end: usize, + ishift: usize, // inner_vars - w + imask: usize, + inner: ArenaVec, // 2^ishift packed entries (unscaled) + scalars: Vec, // per block, gamma powers + } + + struct LazyTerms { + full: Vec, + dense: Vec, + } + + impl LazyTerms { + #[inline(always)] + fn at(&self, j: usize) -> EFP { + let mut acc = EFP::ZERO; + for t in &self.full { + acc += t.right[j & t.rmask] * t.left[j >> t.rshift]; + } + for t in &self.dense { + if j >= t.start && j < t.end { + let o = j - t.start; + acc += t.inner[o & t.imask] * t.scalars[o >> t.ishift]; + } + } + acc + } + } + + /// Statement set mirroring stacked_pcs_global_statements + 2 OOD at the + /// 1550-sig shape (tiny lane-level statements omitted in both arms; their + /// production cost is ~epsilon and T1's proof-equality test covers them). + /// Layout (elements): memory+acc [0, 2^23); bytecode_acc [2^23, 2^23+2^20); + /// exec 20 cols at sel 9 (inner 2^20); poseidon 110 cols at sel 116 + /// (inner 2^18); ext 29 cols at sel 1807 (inner 2^15). 
+ fn rnd_pt(rng: &mut StdRng, len: usize) -> MultilinearPoint { + MultilinearPoint((0..len).map(|_| rng.random::()).collect::>()) + } + + fn rnd_vals(rng: &mut StdRng, first_sel: usize, n: usize) -> Vec> { + (0..n).map(|c| SparseValue::new(first_sel + c, rng.random::())).collect() + } + + fn build_statements(n_vars: usize, rng: &mut StdRng) -> Vec> { + let mut stmts: Vec> = Vec::new(); + // 2 OOD full statements (dual fast path) + for _ in 0..2 { + let p = rnd_pt(rng, n_vars); + stmts.push(SparseStatement::new(n_vars, p, rnd_vals(rng, 0, 1))); + } + // memory + memory_acc (selectors 0,1 at inner n-4) + let p = rnd_pt(rng, n_vars - 4); + stmts.push(SparseStatement::new(n_vars, p, rnd_vals(rng, 0, 2))); + // bytecode_acc (single value, inner n-6, selector 8) + let p = rnd_pt(rng, n_vars - 6); + stmts.push(SparseStatement::new(n_vars, p, rnd_vals(rng, 8, 1))); + // exec: 2 eq statements (20 cols, inner n-6, sel 9..29) + 1 next (3 shift cols) + for _ in 0..2 { + let p = rnd_pt(rng, n_vars - 6); + stmts.push(SparseStatement::new(n_vars, p, rnd_vals(rng, 9, 20))); + } + { + let p = rnd_pt(rng, n_vars - 6); + let mut s = SparseStatement::new(n_vars, p, rnd_vals(rng, 9, 3)); + s.is_next = true; + stmts.push(s); + } + // poseidon: 2 eq statements (110 cols, inner n-8, sel 116..226) + for _ in 0..2 { + let p = rnd_pt(rng, n_vars - 8); + stmts.push(SparseStatement::new(n_vars, p, rnd_vals(rng, 116, 110))); + } + // extension: 2 eq statements (29 cols, inner n-11, sel 1807..1836) + for _ in 0..2 { + let p = rnd_pt(rng, n_vars - 11); + stmts.push(SparseStatement::new(n_vars, p, rnd_vals(rng, 1807, 29))); + } + stmts + } + + /// Replays combine_statement's exact gamma-power accounting into lazy terms. + /// Returns (terms, combined_sum) — combined_sum must equal combine_statement's. 
+ fn build_lazy_terms(statements: &[SparseStatement], gamma: EF, n_vars: usize) -> (LazyTerms, EF) { + let w = w_log(); + let is_full = |s: &SparseStatement| { + !s.is_next && s.values.len() == 1 && s.values[0].selector == 0 && s.inner_num_variables() == n_vars + }; + let mut full = Vec::new(); + let mut dense = Vec::new(); + let mut combined_sum = EF::ZERO; + let mut gamma_pow = EF::ONE; + + let make_full = |point: &[EF], scalar: EF| { + let a = n_vars / 2; // prefix length + let mut left: ArenaVec = eval_eq(&point[..a]); + for v in left.iter_mut() { + *v *= scalar; + } + let right: ArenaVec = eval_eq_packed(&point[a..]); + FullT { + left, + right, + rshift: n_vars - a - w, + rmask: (1usize << (n_vars - a - w)) - 1, + } + }; + + let start_idx = match statements { + [a, b, ..] if is_full(a) && is_full(b) => { + let sa = gamma_pow; + let sb = gamma_pow * gamma; + combined_sum = a.values[0].value * sa + b.values[0].value * sb; + gamma_pow = sb * gamma; + full.push(make_full(&a.point.0, sa)); + full.push(make_full(&b.point.0, sb)); + 2 + } + [a, ..] if is_full(a) => { + let sa = gamma_pow; + combined_sum = a.values[0].value * sa; + gamma_pow *= gamma; + full.push(make_full(&a.point.0, sa)); + 1 + } + _ => 0, + }; + + for smt in &statements[start_idx..] 
{ + assert!( + smt.inner_num_variables() >= w, + "bench statement set must not contain lane-level statements" + ); + let inner: ArenaVec = if smt.is_next { + let next = matrix_next_mle_folded(&smt.point.0); + pack_extension(&next) + } else { + eval_eq_packed(&smt.point) + }; + let ishift = smt.inner_num_variables() - w; + // consecutive selectors assumed (true for the bench set) + let first_sel = smt.values[0].selector; + let mut scalars = Vec::with_capacity(smt.values.len()); + let mut p = gamma_pow; + for (k, e) in smt.values.iter().enumerate() { + assert_eq!(e.selector, first_sel + k, "bench terms assume consecutive selectors"); + combined_sum += e.value * p; + scalars.push(p); + p *= gamma; + } + gamma_pow = p; + dense.push(DenseT { + start: first_sel << ishift, + end: (first_sel + smt.values.len()) << ishift, + ishift, + imask: (1usize << ishift) - 1, + inner, + scalars, + }); + } + (LazyTerms { full, dense }, combined_sum) + } + + /// Lazy round-0: same (c0,c2)+c1-from-sum skeleton as + /// compute_product_sumcheck_polynomial, weights from `terms.at(j)`. + fn lazy_round0(evals: &[FP], terms: &LazyTerms, sum: EF) -> DensePolynomial { + let n = evals.len(); + let half = n / 2; + let (c0p, c2p) = parallel::map_reduce( + half, + || (EFP::ZERO, EFP::ZERO), + |i| { + let y0 = terms.at(i); + let y1 = terms.at(half + i); + let x0 = evals[i]; + let x1 = evals[half + i]; + let constant = y0 * x0; + let quadratic = (y1 - y0) * (x1 - x0); + (constant, quadratic) + }, + |(a0, a2), (b0, b2)| (a0 + b0, a2 + b2), + ); + let c0 = unpack_sum(c0p); + let c2 = unpack_sum(c2p); + let c1 = sum - c0.double() - c2; + DensePolynomial::new(vec![c0, c1, c2]) + } + + /// Lazy round-1 fused fold: recompute weights, fold both polys with r1, + /// materialize half-size folded arrays, emit round-1 coeffs — mirrors + /// fold_and_compute_product_sumcheck_polynomial exactly. 
+ #[allow(clippy::type_complexity)] + fn lazy_fold_round1( + evals: &[FP], + terms: &LazyTerms, + r1: EF, + sum: EF, + ) -> (DensePolynomial, ArenaVec, ArenaVec) { + let n = evals.len(); + let quarter = n / 4; + let r1p = EFP::from(r1); + let mut e_folded = unsafe { ArenaVec::::uninitialized(n / 2) }; + let mut w_folded = unsafe { ArenaVec::::uninitialized(n / 2) }; + let pe = parallel::SendPtr(e_folded.as_mut_ptr()); + let pw = parallel::SendPtr(w_folded.as_mut_ptr()); + let (c0p, c2p) = parallel::map_reduce( + quarter, + || (EFP::ZERO, EFP::ZERO), + |i| { + let x_0 = r1p * (evals[2 * quarter + i] - evals[i]) + evals[i]; + let x_1 = r1p * (evals[3 * quarter + i] - evals[quarter + i]) + evals[quarter + i]; + let w00 = terms.at(i); + let w01 = terms.at(quarter + i); + let w10 = terms.at(2 * quarter + i); + let w11 = terms.at(3 * quarter + i); + let y_0 = r1p * (w10 - w00) + w00; + let y_1 = r1p * (w11 - w01) + w01; + unsafe { + *pe.add(i) = x_0; + *pe.add(quarter + i) = x_1; + *pw.add(i) = y_0; + *pw.add(quarter + i) = y_1; + } + let constant = y_0 * x_0; + let quadratic = (y_1 - y_0) * (x_1 - x_0); + (constant, quadratic) + }, + |(a0, a2), (b0, b2)| (a0 + b0, a2 + b2), + ); + let c0 = unpack_sum(c0p); + let c2 = unpack_sum(c2p); + let c1 = sum - c0.double() - c2; + (DensePolynomial::new(vec![c0, c1, c2]), e_folded, w_folded) + } + + fn cheap_base_fill(len: usize) -> ArenaVec { + let mut v = unsafe { ArenaVec::::uninitialized(len) }; + let unpacked = FP::unpack_slice_mut(&mut v); + parallel::par_chunks_mut(unpacked, 1 << 16, |chunk_idx, chunk| { + let mut state = (chunk_idx as u64).wrapping_mul(0x9E3779B97F4A7C15) | 1; + for slot in chunk { + state = state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); + *slot = F::from_u32((state >> 33) as u32 & 0x3FFFFFFF); + } + }); + v + } + + fn median(mut xs: Vec) -> f64 { + xs.sort_by(|a, b| a.partial_cmp(b).unwrap()); + xs[xs.len() / 2] + } + + fn time_med(reps: usize, mut f: impl FnMut() -> T) -> (f64, 
T) { + let mut times = Vec::new(); + let mut out = None; + for _ in 0..reps { + let t = Instant::now(); + let r = f(); + times.push(t.elapsed().as_secs_f64()); + out = Some(r); + } + (median(times), out.unwrap()) + } + + #[test] + #[ignore] + fn t0a_lazy_vs_materialized() { + let n_vars: usize = std::env::var("FUSION_BENCH_VARS") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(26); + let w = w_log(); + let mut rng = StdRng::seed_from_u64(42); + let gamma: EF = rng.random(); + let stmts = build_statements(n_vars, &mut rng); + println!("T0a: n_vars={n_vars}, packing_log_width={w}, {} statements", stmts.len()); + + let evals = cheap_base_fill(1 << (n_vars - w)); + + // --- materialized baseline --- + let (t_combine, (weights, sum_m)) = time_med(3, || combine_statement::(&stmts, gamma)); + let (t_read, read_sink) = time_med(3, || { + parallel::map_reduce(weights.len(), || EFP::ZERO, |i| weights[i], |a, b| a + b) + }); + black_box(read_sink); + let (t_r0, base_r0) = time_med(3, || { + compute_product_sumcheck_polynomial(&evals, &weights, sum_m, decompose) + }); + let r1: EF = rng.random(); + let sum_after_r0 = base_r0.evaluate(r1); + let (t_r1, (base_r1, base_folded)) = time_med(3, || { + fold_and_compute_product_sumcheck_polynomial(&evals, &weights, r1, sum_after_r0, decompose) + }); + + // --- lazy --- + let (t_terms, (terms, sum_l)) = time_med(3, || build_lazy_terms(&stmts, gamma, n_vars)); + assert_eq!(sum_l, sum_m, "gamma-power accounting diverged"); + // spot-check weight values + for _ in 0..4096 { + let j = rng.random_range(0..weights.len()); + assert_eq!(terms.at(j), weights[j], "lazy weight mismatch at packed index {j}"); + } + let (t_lazy_r0, lazy_r0_poly) = time_med(3, || lazy_round0(&evals, &terms, sum_l)); + assert_eq!(lazy_r0_poly.coeffs, base_r0.coeffs, "round-0 poly mismatch"); + let (t_lazy_r1, (lazy_r1_poly, lazy_e_folded, lazy_w_folded)) = + time_med(3, || lazy_fold_round1(&evals, &terms, r1, sum_after_r0)); + assert_eq!(lazy_r1_poly.coeffs, 
base_r1.coeffs, "round-1 poly mismatch"); + // folded arrays equality (weights: lazy vs baseline fold output) + let bw = &base_folded[1]; + let be = &base_folded[0]; + for _ in 0..4096 { + let j = rng.random_range(0..lazy_w_folded.len()); + assert_eq!(lazy_w_folded[j], bw[j], "folded weight mismatch at {j}"); + assert_eq!(lazy_e_folded[j], be[j], "folded evals mismatch at {j}"); + } + + let base_total = t_combine + t_r0 + t_r1; + let lazy_total = t_terms + t_lazy_r0 + t_lazy_r1; + let ratio_spec = t_lazy_r0 / (t_combine + t_read); + let ratio_e2e = lazy_total / base_total; + println!(" baseline: combine {:.0}ms + read {:.0}ms + r0 {:.0}ms + r1fold {:.0}ms (combine+r0+r1 = {:.0}ms)", + t_combine * 1e3, t_read * 1e3, t_r0 * 1e3, t_r1 * 1e3, base_total * 1e3); + println!(" lazy: terms {:.0}ms + r0 {:.0}ms + r1fold {:.0}ms (total {:.0}ms)", + t_terms * 1e3, t_lazy_r0 * 1e3, t_lazy_r1 * 1e3, lazy_total * 1e3); + let verdict = if ratio_spec <= 1.3 { + "PASS" + } else if ratio_spec <= 2.0 { + "GRAY" + } else { + "KILL" + }; + println!( + "T0A: ratio_spec (lazy_r0 / (combine+read)) = {ratio_spec:.2} (gate: <=1.3 PASS / <=2.0 GRAY / >2.0 KILL) => {verdict}" + ); + println!("T0A: ratio_e2e (lazy r0+r1+terms / combine+r0+r1) = {ratio_e2e:.2} (decision-relevant; <1.0 = net win)"); + assert!(ratio_spec <= 2.0, "T0a KILL: lazy round-0 {ratio_spec:.2}x the materialized combine+read"); + } + + #[test] + #[ignore] + fn t0b_ef_vs_base_ratio() { + let mut rng = StdRng::seed_from_u64(7); + let w = w_log(); + println!("T0b: packed EFxEF vs basexEF product-sumcheck cost"); + let mut last_ratio = 0.0; + for log_n in [20usize, 22, 23] { + let n = 1 << (log_n - w); + let base = cheap_base_fill(n); + let ext: ArenaVec = { + let vals: Vec = (0..(n << w)).map(|i| EF::from(F::from_u32((i as u32) | 1)) * EF::from_u32(7)).collect(); + pack_extension(&vals) + }; + let wts: ArenaVec = { + let vals: Vec = (0..(n << w)).map(|_| rng.random::()).collect(); + pack_extension(&vals) + }; + let sum: EF = 
rng.random(); + let (t_base, p1) = time_med(3, || { + compute_product_sumcheck_polynomial(&base, &wts, sum, decompose) + }); + let (t_ext, p2) = time_med(3, || { + compute_product_sumcheck_polynomial(&ext, &wts, sum, decompose) + }); + black_box((p1, p2)); + last_ratio = t_ext / t_base; + println!(" 2^{log_n}: basexEF {:.1}ms, EFxEF {:.1}ms, ratio {:.2}", t_base * 1e3, t_ext * 1e3, last_ratio); + } + let max_slices = if last_ratio >= 4.0 { + 4 + } else if last_ratio >= 2.0 { + 2 + } else { + 1 + }; + println!("T0B: EFxEF/basexEF = {last_ratio:.2} => MAX_SLICES = {max_slices} (delayed-EF profitable while n_slices < ratio)"); + } +} From 888b6dcaba21e2406a711a2f0701bc268d87fc82 Mon Sep 17 00:00:00 2001 From: Barnadrot Date: Fri, 12 Jun 2026 18:06:01 +0200 Subject: [PATCH 2/8] pw13-1 T1: lazy-once fused combine+round-0 for WHIR initial sumcheck (bit-identical transcript) Evaluates the combined weight w = sum_s gamma^{k_s} weight_s exactly once, in-register from per-statement tables (prefix/suffix split-eq for full statements, shared inner-eq tables for aligned blocks, exact packed-word overlay for lane-level statements), fused with the round-0 quadratic pass which also stream-writes the materialized buffer for rounds 1+. Replaces combine_statement's full-size eq-tensor pass + RMW scatters. - product_computation.rs: run_product_sumcheck_from_round1 extracted (byte-identical legacy tail; bytecode_claims caller untouched via unchanged run_product_sumcheck). - open.rs: LazyCombineTerms (grid-dispatched blocks, gamma accounting replayed exactly), combine_and_compute_first_round (single pass + overlay correction), env toggle WHIR_LAZY_COMBINE (default on; legacy path on 0 / width-1 / non-base-packed). - fiat-shamir: inert PartialEq derives on Proof/PrunedMerklePaths. 
- tests: prove-twice byte-equality at n in {18,20} over all weight-term arms (dual OOD fulls, dense blocks, single-value, lane-level overlay, is_next) + verify; instance diagnostic; env-gated production selfcheck (WHIR_LAZY_SELFCHECK=1). - Equality methodology note: pow_grinding is a racy parallel nonce search => proof bytes are only run-reproducible at zero-grinding configs; the test asserts all grinding bits are 0. The lazy path never touches grinding. Co-Authored-By: Claude Fable 5 --- .../backend/fiat-shamir/src/merkle_pruning.rs | 2 +- crates/backend/fiat-shamir/src/transcript.rs | 2 +- .../sumcheck/src/product_computation.rs | 16 + crates/whir/src/open.rs | 419 +++++++++++++++++- crates/whir/tests/run_whir.rs | 126 ++++++ 5 files changed, 561 insertions(+), 4 deletions(-) diff --git a/crates/backend/fiat-shamir/src/merkle_pruning.rs b/crates/backend/fiat-shamir/src/merkle_pruning.rs index 336bc9e72..d0171075a 100644 --- a/crates/backend/fiat-shamir/src/merkle_pruning.rs +++ b/crates/backend/fiat-shamir/src/merkle_pruning.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize}; use crate::{DIGEST_LEN_FE, MerklePath, MerklePaths}; -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct PrunedMerklePaths { pub leaf_data: Vec>, pub sibling_hashes: Vec<[F; DIGEST_LEN_FE]>, diff --git a/crates/backend/fiat-shamir/src/transcript.rs b/crates/backend/fiat-shamir/src/transcript.rs index 93daa32ff..7f76852c2 100644 --- a/crates/backend/fiat-shamir/src/transcript.rs +++ b/crates/backend/fiat-shamir/src/transcript.rs @@ -29,7 +29,7 @@ pub struct MerklePath { #[derive(Debug, Clone)] pub struct MerklePaths(pub(crate) Vec>); -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct Proof { pub(crate) transcript: Vec, pub(crate) merkle_paths: Vec>, diff --git a/crates/backend/sumcheck/src/product_computation.rs 
b/crates/backend/sumcheck/src/product_computation.rs index c069e7519..cac9bbd6b 100644 --- a/crates/backend/sumcheck/src/product_computation.rs +++ b/crates/backend/sumcheck/src/product_computation.rs @@ -64,6 +64,22 @@ pub fn run_product_sumcheck>>( let r1: EF = prover_state.sample(); sum = first_sumcheck_poly.evaluate(r1); + run_product_sumcheck_from_round1(pol_a, pol_b, prover_state, r1, sum, n_rounds, pow_bits) +} + +/// Rounds 1+ of the product sumcheck, for callers that computed round 0 themselves +/// (e.g. the fused lazy combine+round-0 path in WHIR). `sum` is the running sum after +/// binding `r1` (= first_round_poly.evaluate(r1)). Byte-identical transcript to the +/// corresponding tail of [`run_product_sumcheck`]. +pub fn run_product_sumcheck_from_round1>>( + pol_a: &MleRef<'_, EF>, // evals + pol_b: &MleRef<'_, EF>, // weights + prover_state: &mut impl FSProver, + r1: EF, + mut sum: EF, + n_rounds: usize, + pow_bits: usize, +) -> (MultilinearPoint, EF, MleOwned, MleOwned) { if n_rounds == 1 { return (MultilinearPoint(vec![r1]), sum, pol_a.fold(r1), pol_b.fold(r1)); } diff --git a/crates/whir/src/open.rs b/crates/whir/src/open.rs index 906cb8fce..472a026ad 100644 --- a/crates/whir/src/open.rs +++ b/crates/whir/src/open.rs @@ -3,8 +3,8 @@ use ::utils::log2_strict_usize; use fiat_shamir::{FSProver, MerklePath, ProofResult}; use field::PrimeCharacteristicRing; -use field::{ExtensionField, Field, TwoAdicField}; -use sumcheck::{ProductComputation, run_product_sumcheck, sumcheck_prove_many_rounds}; +use field::{ExtensionField, Field, PackedFieldExtension, TwoAdicField}; +use sumcheck::{ProductComputation, run_product_sumcheck, run_product_sumcheck_from_round1, sumcheck_prove_many_rounds}; use tracing::{info_span, instrument}; use zk_alloc::{ArenaVec, arena_vec}; @@ -418,6 +418,44 @@ where ) -> (Self, MultilinearPoint) { assert_ne!(folding_factor, 0); + // Lazy-once fused combine + round-0 (transcript bit-identical to the + // legacy path below; see the module 
comment on `LazyCombineTerms`). + if lazy_combine_enabled() && packing_log_width::() > 0 { + let evals_packed = evals.pack(); + if let MleRef::BasePacked(ev) = evals_packed.by_ref() { + let terms = info_span!("build_lazy_combine_terms") + .in_scope(|| build_lazy_combine_terms::(statement, combination_randomness)); + let (first_poly, weights_buf) = info_span!("combine_and_compute_first_round") + .in_scope(|| combine_and_compute_first_round(ev, &terms, terms.combined_sum)); + if std::env::var("WHIR_LAZY_SELFCHECK").is_ok_and(|v| v == "1") { + let (w_ref, sum_ref) = combine_statement::(statement, combination_randomness); + assert_eq!(terms.combined_sum, sum_ref, "selfcheck: combined_sum diverged"); + let n_bad = (0..w_ref.len()).filter(|&j| weights_buf[j] != w_ref[j]).count(); + assert_eq!(n_bad, 0, "selfcheck: {n_bad} weight mismatches of {}", w_ref.len()); + } + prover_state.add_sumcheck_polynomial(&first_poly.coeffs, None); + prover_state.pow_grinding(pow_bits); + let r1: EF = prover_state.sample(); + let sum1 = first_poly.evaluate(r1); + let weights = Mle::Owned(MleOwned::ExtensionPacked(weights_buf)); + let (challenges, new_sum, folded_evals, folded_weights) = run_product_sumcheck_from_round1( + &evals_packed.by_ref(), + &weights.by_ref(), + prover_state, + r1, + sum1, + folding_factor, + pow_bits, + ); + let sumcheck = Self { + evals: folded_evals, + weights: folded_weights, + sum: new_sum, + }; + return (sumcheck, challenges); + } + } + let (weights, sum) = combine_statement::(statement, combination_randomness); let mut evals = evals.pack(); @@ -617,6 +655,294 @@ where (combined_weights, combined_sum) } +// --------------------------------------------------------------------------- +// Lazy-once fused combine + round-0 for the WHIR initial sumcheck (pw13-mac h-wf). 
+// +// `combine_statement` materializes w = Σ_s γ^{k_s}·weight_s with one full-size +// eq-tensor pass plus read-modify-write scatters over every statement region, +// then round 0 re-reads the buffer. Here the weight value w[j] is instead +// evaluated in-register from small per-statement tables (prefix/suffix split +// for full statements, shared inner-eq tables for block statements), exactly +// once, inside the round-0 pass — which also stream-writes the materialized +// buffer for rounds 1+. The transcript is bit-identical: the gamma-power +// accounting replays `combine_statement` exactly, and every weight value is +// the same field element (exact-field reassociation only). +// +// Toggle: WHIR_LAZY_COMBINE=0 falls back to the legacy path (also used when +// packing width is 1 or the evals are not base-packed). +// --------------------------------------------------------------------------- + +const LAZY_OVERLAY_SPAN_MAX: usize = 8; // packed words; small blocks are pre-expanded + +struct LazyFullTerm>> { + left: ArenaVec, // 2^A prefix table, statement scalar folded in + right: ArenaVec>, // 2^(n - A - w) packed suffix table + rshift: usize, + rmask: usize, +} + +/// One (statement, value) pair: scalar·eq(point,·) (or next-mle) on the packed +/// range [start, start + 2^ishift). Aligned: start is a multiple of 2^ishift. 
+struct LazyBlock { + start: usize, + ishift: usize, + inner_id: u32, + scalar: usize, // index into `scalars` +} + +pub(crate) struct LazyCombineTerms>> { + full: Vec>, + inners: Vec>>, + blocks: Vec, + scalars: Vec, + grid: Vec>, // packed-index >> grid_log -> covering block ids + grid_log: usize, + overlay: Vec<(usize, EFPacking)>, // sorted by packed index + pub(crate) combined_sum: EF, +} + +impl>> LazyCombineTerms { + #[inline(always)] + fn value_at(&self, j: usize) -> EFPacking { + let mut acc = EFPacking::::ZERO; + for t in &self.full { + acc += t.right[j & t.rmask] * t.left[j >> t.rshift]; + } + if !self.grid.is_empty() { + for &b in &self.grid[j >> self.grid_log] { + let blk = &self.blocks[b as usize]; + // a block listed in this cell covers the whole cell + acc += self.inners[blk.inner_id as usize][j - blk.start] * self.scalars[blk.scalar]; + } + } + acc + } +} + +fn lazy_combine_enabled() -> bool { + std::env::var("WHIR_LAZY_COMBINE").map(|v| v != "0").unwrap_or(true) +} + +/// Replays `combine_statement`'s exact gamma-power accounting into lazy term +/// tables. `combined_sum` is the identical field element. 
+pub(crate) fn build_lazy_combine_terms(statements: &[SparseStatement], gamma: EF) -> LazyCombineTerms +where + EF: ExtensionField>, +{ + let num_variables = statements[0].total_num_variables; + assert!(statements.iter().all(|e| e.total_num_variables == num_variables)); + let w = packing_log_width::(); + + let is_full = |s: &SparseStatement| { + !s.is_next && s.values.len() == 1 && s.values[0].selector == 0 && s.inner_num_variables() == num_variables + }; + + let mut full = Vec::new(); + let mut inners: Vec>> = Vec::new(); + let mut blocks: Vec = Vec::new(); + let mut scalars: Vec = Vec::new(); + let mut overlay_map: std::collections::BTreeMap> = Default::default(); + let mut combined_sum = EF::ZERO; + let mut gamma_pow = EF::ONE; + + let make_full = |point: &[EF], scalar: EF| { + let a = num_variables / 2; + let mut left: ArenaVec = eval_eq(&point[..a]); + for v in left.iter_mut() { + *v *= scalar; + } + let right: ArenaVec> = eval_eq_packed(&point[a..]); + let rshift = num_variables - a - w; + LazyFullTerm { + left, + right, + rshift, + rmask: (1usize << rshift) - 1, + } + }; + + let start_idx = match statements { + [a, b, ..] if is_full(a) && is_full(b) => { + let sa = gamma_pow; + let sb = gamma_pow * gamma; + combined_sum = a.values[0].value * sa + b.values[0].value * sb; + gamma_pow = sb * gamma; + full.push(make_full(&a.point.0, sa)); + full.push(make_full(&b.point.0, sb)); + 2 + } + [a, ..] if is_full(a) => { + let sa = gamma_pow; + combined_sum = a.values[0].value * sa; + gamma_pow *= gamma; + full.push(make_full(&a.point.0, sa)); + 1 + } + _ => 0, + }; + + for smt in &statements[start_idx..] { + if !smt.is_next && (smt.values.len() == 1 || smt.inner_num_variables() < w) { + // combine_statement's sparse path: per-value gamma powers. 
+ let inner_vars = smt.inner_num_variables(); + let mut stmt_inner: Option = None; + for ev in &smt.values { + let scalar = gamma_pow; + combined_sum += ev.value * scalar; + gamma_pow *= gamma; + if inner_vars < w { + // lane-level: contributes within a single packed word + let shift = w - inner_vars; + let word = ev.selector >> shift; + let mut unpacked = vec![EF::ZERO; 1usize << w]; + compute_sparse_eval_eq::(ev.selector & ((1 << shift) - 1), &smt.point.0, &mut unpacked, scalar); + let delta: Vec> = pack_extension(&unpacked); + *overlay_map.entry(word).or_insert(EFPacking::::ZERO) += delta[0]; + } else { + let inner_id = *stmt_inner.get_or_insert_with(|| { + inners.push(eval_eq_packed(&smt.point)); + (inners.len() - 1) as u32 + }); + let ishift = inner_vars - w; + scalars.push(scalar); + blocks.push(LazyBlock { + start: ev.selector << ishift, + ishift, + inner_id, + scalar: scalars.len() - 1, + }); + } + } + } else { + // combine_statement's dense path: sorted-unique selectors, + // per-ORIGINAL-order gamma powers. + let mut sorted = smt.values.iter().map(|e| e.selector).collect::>(); + sorted.sort_unstable(); + sorted.dedup(); + assert_eq!(sorted.len(), smt.values.len(), "Duplicate selectors in sparse statement"); + + let inner: ArenaVec> = if smt.is_next { + let next = matrix_next_mle_folded(&smt.point.0); + pack_extension(&next) + } else { + eval_eq_packed(&smt.point) + }; + inners.push(inner); + let inner_id = (inners.len() - 1) as u32; + let ishift = smt.inner_num_variables() - w; + + let mut p = gamma_pow; + for ev in &smt.values { + combined_sum += ev.value * p; + scalars.push(p); + blocks.push(LazyBlock { + start: ev.selector << ishift, + ishift, + inner_id, + scalar: scalars.len() - 1, + }); + p *= gamma; + } + gamma_pow = p; + } + } + + // Small blocks become exact overlay words; the rest go on the grid. 
+ let mut grid_blocks: Vec = Vec::new(); + for blk in blocks { + let span = 1usize << blk.ishift; + if span <= LAZY_OVERLAY_SPAN_MAX { + let inner = &inners[blk.inner_id as usize]; + let s = scalars[blk.scalar]; + for t in 0..span { + *overlay_map.entry(blk.start + t).or_insert(EFPacking::::ZERO) += inner[t] * s; + } + } else { + grid_blocks.push(blk); + } + } + + let (grid, grid_log) = if grid_blocks.is_empty() { + (Vec::new(), 0) + } else { + let grid_log = grid_blocks.iter().map(|b| b.ishift).min().unwrap(); + let n_cells = 1usize << (num_variables - w - grid_log); + let mut grid: Vec> = vec![Vec::new(); n_cells]; + for (id, blk) in grid_blocks.iter().enumerate() { + let c0 = blk.start >> grid_log; + let c1 = (blk.start + (1usize << blk.ishift)) >> grid_log; + for cell in grid.iter_mut().take(c1).skip(c0) { + cell.push(id as u32); + } + } + (grid, grid_log) + }; + + LazyCombineTerms { + full, + inners, + blocks: grid_blocks, + scalars, + grid, + grid_log, + overlay: overlay_map.into_iter().collect(), + combined_sum, + } +} + +/// One parallel pass: evaluates every weight value exactly once, stream-writes +/// the materialized buffer for rounds 1+, and accumulates the round-0 +/// quadratic coefficients ((c0, c2); c1 deduced from the claimed sum). 
+fn combine_and_compute_first_round( + evals: &[PFPacking], + terms: &LazyCombineTerms, + sum: EF, +) -> (DensePolynomial, ArenaVec>) +where + EF: ExtensionField>, + EFPacking: std::ops::Mul, Output = EFPacking>, +{ + let n = evals.len(); + let half = n / 2; + let mut weights = unsafe { ArenaVec::>::uninitialized(n) }; + let wp = parallel::SendPtr(weights.as_mut_ptr()); + + let (mut c0p, mut c2p) = parallel::map_reduce( + half, + || (EFPacking::::ZERO, EFPacking::::ZERO), + |i| { + let w0 = terms.value_at(i); + let w1 = terms.value_at(half + i); + unsafe { + *wp.add(i) = w0; + *wp.add(half + i) = w1; + } + let x0 = evals[i]; + let x1 = evals[half + i]; + (w0 * x0, (w1 - w0) * (x1 - x0)) + }, + |(a0, a2), (b0, b2)| (a0 + b0, a2 + b2), + ); + + // Exact overlay application: patch the buffer and correct the accumulators. + for &(idx, delta) in &terms.overlay { + weights[idx] += delta; + if idx < half { + // d c0 = delta·e0 ; d c2 = -delta·(e1 - e0) + c0p += delta * evals[idx]; + c2p += delta * (evals[idx] - evals[half + idx]); + } else { + // d c2 = delta·(e1 - e0) + c2p += delta * (evals[idx] - evals[idx - half]); + } + } + + let c0 = EFPacking::::to_ext_iter([c0p]).sum::(); + let c2 = EFPacking::::to_ext_iter([c2p]).sum::(); + let c1 = sum - c0.double() - c2; + (DensePolynomial::new(vec![c0, c1, c2]), weights) +} + // --------------------------------------------------------------------------- // h-wf kill-ladder rung benches (pw13-mac iter-1, hypothesis "whir-lazy-fusion"). // Test-only; no production code change. 
@@ -1047,3 +1373,92 @@ mod fusion_bench { println!("T0B: EFxEF/basexEF = {last_ratio:.2} => MAX_SLICES = {max_slices} (delayed-EF profitable while n_slices < ratio)"); } } + +#[cfg(test)] +mod lazy_combine_diag { + use super::*; + use crate::{SparseStatement, SparseValue}; + use field::{PackedValue, PrimeCharacteristicRing}; + use koala_bear::{KoalaBear, QuinticExtensionFieldKB}; + use rand::{RngExt, SeedableRng, rngs::StdRng}; + + type F = KoalaBear; + type EF = QuinticExtensionFieldKB; + + #[test] + fn diag_lazy_vs_combine_failing_shape() { + let num_variables = 20usize; + let mut rng = StdRng::seed_from_u64(7); + let polynomial = (0..1usize << num_variables).map(|_| rng.random::()).collect::>(); + + let mut statement: Vec> = Vec::new(); + // 2 fake OOD full statements at the front (mirrors initialize_first_round_state) + for _ in 0..2 { + let p = MultilinearPoint((0..num_variables).map(|_| rng.random::()).collect::>()); + statement.push(SparseStatement::new(num_variables, p, vec![SparseValue { selector: 0, value: rng.random::() }])); + } + for (selector_len, n_sels) in [(6usize, 5usize), (8, 9), (11, 3)] { + let point = MultilinearPoint((0..num_variables - selector_len).map(|_| rng.random::()).collect::>()); + let first = rng.random_range(0..(1usize << selector_len) - n_sels); + statement.push(SparseStatement::new( + num_variables, + point, + (0..n_sels).map(|k| SparseValue { selector: first + k, value: rng.random::() }).collect(), + )); + } + { + let point = MultilinearPoint((0..num_variables - 5).map(|_| rng.random::()).collect::>()); + let sel = rng.random_range(0..32); + statement.push(SparseStatement::new(num_variables, point, vec![SparseValue { selector: sel, value: rng.random::() }])); + } + for inner in [0usize, 1] { + let point = MultilinearPoint((0..inner).map(|_| rng.random::()).collect::>()); + let sel = rng.random_range(0..(1usize << (num_variables - inner))); + statement.push(SparseStatement::new(num_variables, point, vec![SparseValue { selector: 
sel, value: rng.random::() }])); + } + { + let inner = 10usize; + let point = MultilinearPoint((0..inner).map(|_| rng.random::()).collect::>()); + let mut s = SparseStatement::new( + num_variables, + point, + (0..2usize).map(|k| SparseValue { selector: 3 + k, value: rng.random::() }).collect(), + ); + s.is_next = true; + statement.push(s); + } + + let gamma: EF = rng.random(); + let (w_ref, sum_ref) = combine_statement::(&statement, gamma); + let terms = build_lazy_combine_terms::(&statement, gamma); + assert_eq!(terms.combined_sum, sum_ref, "combined_sum diverged"); + + // elementwise weight check including overlay + let half = w_ref.len() / 2; + let evals: Vec> = { + let mut v = vec![PFPacking::::ZERO; w_ref.len()]; + let unp = PFPacking::::unpack_slice_mut(&mut v); + for (i, slot) in unp.iter_mut().enumerate() { + *slot = F::from_u32((i as u32).wrapping_mul(2654435761) >> 3); + } + v + }; + let (poly_lazy, w_lazy) = combine_and_compute_first_round::(&evals, &terms, sum_ref); + let mut n_bad = 0usize; + for j in 0..w_ref.len() { + if w_lazy[j] != w_ref[j] { + if n_bad < 10 { + println!("weight mismatch at packed {j} (half={half}, j>>13={})", j >> 13); + } + n_bad += 1; + } + } + println!("total weight mismatches: {n_bad} / {}", w_ref.len()); + assert_eq!(n_bad, 0, "weights diverged"); + let poly_ref = sumcheck::compute_product_sumcheck_polynomial(&evals, &w_ref, sum_ref, |e| { + as field::PackedFieldExtension>::to_ext_iter([e]).collect() + }); + assert_eq!(poly_lazy.coeffs, poly_ref.coeffs, "round-0 poly diverged"); + println!("diag: all equal"); + } +} diff --git a/crates/whir/tests/run_whir.rs b/crates/whir/tests/run_whir.rs index c8868196d..7c1051fc9 100644 --- a/crates/whir/tests/run_whir.rs +++ b/crates/whir/tests/run_whir.rs @@ -183,3 +183,129 @@ fn display_whir_round_info() { } } } + +/// h-wf T1: the lazy-once fused combine+round-0 path must produce BYTE-IDENTICAL +/// proofs to the legacy combine_statement path, across statement shapes covering +/// 
every lazy term arm (dual full fast-path via OOD, dense multi-selector blocks, +/// single-value blocks, lane-level overlay, is_next inner polys). +#[test] +fn test_lazy_combine_proof_equality() { + fn set_lazy(v: &str) { + unsafe { std::env::set_var("WHIR_LAZY_COMBINE", v) } + } + let poseidon16 = default_koalabear_poseidon1_16(); + precompute_dft_twiddles::(1 << F::TWO_ADICITY); + + for (seed, num_variables) in [(1u64, 18usize), (7, 20)] { + // pow_grinding is a racy parallel nonce search (first valid witness wins), + // so proof bytes are only reproducible when every grinding step is 0 bits. + // The lazy combine path never touches grinding; zero-grinding configs make + // the byte-equality check exact over all deterministic protocol parts. + let params = WhirConfigBuilder { + security_level: 40, + max_num_variables_to_send_coeffs: 9, + pow_bits: 0, + folding_factor: FoldingFactor::new(7, 4), + soundness_type: SecurityAssumption::JohnsonBound, + starting_log_inv_rate: 1, + rs_domain_initial_reduction_factor: 5, + }; + let params = WhirConfig::new(&params, num_variables); + assert_eq!(params.starting_folding_pow_bits, 0, "test needs zero grinding"); + assert_eq!(params.final_query_pow_bits, 0, "test needs zero grinding"); + for r in &params.round_parameters { + assert_eq!(r.folding_pow_bits, 0, "test needs zero grinding"); + assert_eq!(r.query_pow_bits, 0, "test needs zero grinding"); + } + let mut rng = StdRng::seed_from_u64(seed); + let polynomial = (0..1usize << num_variables).map(|_| rng.random::()).collect::>(); + + let mut statement: Vec> = Vec::new(); + // dense multi-selector blocks (table-shaped) + for (selector_len, n_sels) in [(6usize, 5usize), (8, 9), (11, 3)] { + let point = MultilinearPoint((0..num_variables - selector_len).map(|_| rng.random::()).collect::>()); + let first = rng.random_range(0..(1usize << selector_len) - n_sels); + statement.push(SparseStatement::new( + num_variables, + point.clone(), + (0..n_sels) + .map(|k| SparseValue { + selector:
first + k, + value: polynomial.evaluate_sparse(first + k, &point), + }) + .collect(), + )); + } + // single-value block (bytecode_acc-shaped) + { + let point = MultilinearPoint((0..num_variables - 5).map(|_| rng.random::()).collect::>()); + let sel = rng.random_range(0..32); + statement.push(SparseStatement::new( + num_variables, + point.clone(), + vec![SparseValue { + selector: sel, + value: polynomial.evaluate_sparse(sel, &point), + }], + )); + } + // lane-level: single cells (pc-shaped, inner 0) and inner 1 + for inner in [0usize, 1] { + let point = MultilinearPoint((0..inner).map(|_| rng.random::()).collect::>()); + let sel = rng.random_range(0..(1usize << (num_variables - inner))); + statement.push(SparseStatement::new( + num_variables, + point.clone(), + vec![SparseValue { + selector: sel, + value: polynomial.evaluate_sparse(sel, &point), + }], + )); + } + // is_next statement (shift-column-shaped), 2 selectors, value = sum_i next_table[i] * polynomial[(sel << inner) + i] + { + let inner = 10usize; + let point = MultilinearPoint((0..inner).map(|_| rng.random::()).collect::>()); + let next_table = matrix_next_mle_folded(&point.0); + let mut s = SparseStatement::new( + num_variables, + point.clone(), + (0..2usize) + .map(|k| { + let sel = 3 + k; + let base = sel << inner; + let value = (0..1usize << inner) + .map(|i| next_table[i] * polynomial[base + i]) + .sum::(); + SparseValue { selector: sel, value } + }) + .collect(), + ); + s.is_next = true; + statement.push(s); + } + + let prove_once = |lazy: bool| { + set_lazy(if lazy { "1" } else { "0" }); + let mut prover_state = ProverState::new(poseidon16.clone(), Default::default()); + let poly_mle: MleOwned = MleOwned::Base(ArenaVec::from_iter(polynomial.clone())); + let witness = params.commit(&mut prover_state, &poly_mle, 1 << num_variables); + params.prove(&mut prover_state, statement.clone(), witness, &poly_mle.by_ref()); + prover_state.into_proof() + }; + + let proof_legacy = prove_once(false); + let proof_lazy = prove_once(true); + assert_eq!( + proof_legacy,
proof_lazy, + "lazy combine produced a different proof (seed {seed}, n {num_variables})" + ); + + let mut verifier_state = VerifierState::::new(proof_lazy, poseidon16.clone(), Default::default()).unwrap(); + let parsed_commitment = params.parse_commitment::(&mut verifier_state).unwrap(); + params + .verify::(&mut verifier_state, &parsed_commitment, statement.clone()) + .unwrap(); + } + set_lazy("1"); +} From 9280138ef86c35600f5f8411c001d81059ff92cf Mon Sep 17 00:00:00 2001 From: Barnadrot Date: Fri, 12 Jun 2026 18:19:56 +0200 Subject: [PATCH 3/8] =?UTF-8?q?pw13-1=20T1.1:=20review-gate=20fixes=20?= =?UTF-8?q?=E2=80=94=20engage=20the=20dual=20full=20fast-path=20arm=20in?= =?UTF-8?q?=20the=20equality=20test;=20extend=20to=20n=5Fvars=20{18,22,26}?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer found commitment_ood_samples = 1 for this config (not 2), so the dual [a,b,..] branch had no transcript-level coverage; the test now pushes its own full statement (selector 0, full point) making the spliced layout [OOD-full, test-full, ...] => start_idx = 2. n_vars extended to include the production size 26 (zero-grinding holds there too; ~3s test). Co-Authored-By: Claude Fable 5 --- crates/whir/tests/run_whir.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/crates/whir/tests/run_whir.rs b/crates/whir/tests/run_whir.rs index 7c1051fc9..d32798e2d 100644 --- a/crates/whir/tests/run_whir.rs +++ b/crates/whir/tests/run_whir.rs @@ -196,7 +196,8 @@ fn test_lazy_combine_proof_equality() { let poseidon16 = default_koalabear_poseidon1_16(); precompute_dft_twiddles::(1 << F::TWO_ADICITY); - for (seed, num_variables) in [(1u64, 18usize), (7, 20)] { + // n_vars 26 = the production stacked-PCS size; 18/22 cover small/mid shapes. 
+ for (seed, num_variables) in [(1u64, 18usize), (7, 22), (3, 26)] { // pow_grinding is a racy parallel nonce search (first valid witness wins), // so proof bytes are only reproducible when every grinding step is 0 bits. // The lazy combine path never touches grinding; zero-grinding configs make @@ -221,6 +222,19 @@ fn test_lazy_combine_proof_equality() { let polynomial = (0..1usize << num_variables).map(|_| rng.random::()).collect::>(); let mut statement: Vec> = Vec::new(); + // full statement (selector 0, full-length point): with this config the + // prover splices exactly ONE OOD full statement, so adding one of our own + // makes the layout [OOD-full, this-full, ...] and engages the dual + // fast-path arm (start_idx = 2) in both combine paths. + { + let point = MultilinearPoint((0..num_variables).map(|_| rng.random::()).collect::>()); + let value = polynomial.evaluate_sparse(0, &point); + statement.push(SparseStatement::new( + num_variables, + point, + vec![SparseValue { selector: 0, value }], + )); + } // dense multi-selector blocks (table-shaped) for (selector_len, n_sels) in [(6usize, 5usize), (8, 9), (11, 3)] { let point = MultilinearPoint((0..num_variables - selector_len).map(|_| rng.random::()).collect::>()); From 0ce5319bcbbb03a88675e60ffad582d81848a7ad Mon Sep 17 00:00:00 2001 From: Barnadrot Date: Fri, 12 Jun 2026 18:30:34 +0200 Subject: [PATCH 4/8] pw13-1 T2: segment-hoisted fused combine+round-0 kernel Runs aligned to min(full-term suffix blocks, grid cells, half): term contexts (left factors, block inner slices + scalars) hoisted per run; inner loop is pure slice iteration. Production: combine_and_compute 119 -> 112ms; run_initial_sumcheck_rounds 276 (baseline) -> 262ms; e2e -14ms (-0.65%) - the ~110ms mul floor (~15 packed muls/word across ~3.3 covering terms) is shared with legacy combine_statement and is irreducible in this representation. Proof-equality suite green (bit-identical, n {18,22,26}). 
Co-Authored-By: Claude Fable 5 --- crates/whir/src/open.rs | 112 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 101 insertions(+), 11 deletions(-) diff --git a/crates/whir/src/open.rs b/crates/whir/src/open.rs index 472a026ad..c84e4ca6c 100644 --- a/crates/whir/src/open.rs +++ b/crates/whir/src/open.rs @@ -900,26 +900,116 @@ fn combine_and_compute_first_round( ) -> (DensePolynomial, ArenaVec>) where EF: ExtensionField>, - EFPacking: std::ops::Mul, Output = EFPacking>, + EFPacking: std::ops::Mul, Output = EFPacking> + std::ops::Mul>, { let n = evals.len(); let half = n / 2; let mut weights = unsafe { ArenaVec::>::uninitialized(n) }; let wp = parallel::SendPtr(weights.as_mut_ptr()); + // Runs are aligned so that within a run (on both the j and half+j sides): + // every full term's left factor is constant (run within one 2^rshift block) + // and the grid cell is constant (run within one 2^grid_log cell) — the + // right-table and block-inner indices then advance sequentially, so the + // inner loop is pure slice iteration with all term contexts hoisted. + let mut run_log = ::utils::log2_strict_usize(half.max(1)); + for t in &terms.full { + run_log = run_log.min(t.rshift); + } + if !terms.grid.is_empty() { + run_log = run_log.min(terms.grid_log); + } + let run = 1usize << run_log; + let n_runs = half >> run_log; + + const MAX_BLOCKS: usize = 8; + let (mut c0p, mut c2p) = parallel::map_reduce( - half, + n_runs, || (EFPacking::::ZERO, EFPacking::::ZERO), - |i| { - let w0 = terms.value_at(i); - let w1 = terms.value_at(half + i); - unsafe { - *wp.add(i) = w0; - *wp.add(half + i) = w1; + |run_idx| { + let j0 = run_idx << run_log; + let mut acc0 = EFPacking::::ZERO; + let mut acc2 = EFPacking::::ZERO; + + // Hoisted full-term contexts: (right-slice, left scalar) per side. 
+ let mut fulls_lo: [(&[EFPacking], EF); 4] = [(&[], EF::ZERO); 4]; + let mut fulls_hi: [(&[EFPacking], EF); 4] = [(&[], EF::ZERO); 4]; + let n_fulls = terms.full.len().min(4); + for (k, t) in terms.full.iter().take(4).enumerate() { + let lo_base = j0 & t.rmask; + let hi_base = (half + j0) & t.rmask; + fulls_lo[k] = (&t.right[lo_base..lo_base + run], t.left[j0 >> t.rshift]); + fulls_hi[k] = (&t.right[hi_base..hi_base + run], t.left[(half + j0) >> t.rshift]); + } + + // Hoisted block contexts per side. + let mut blocks_lo: [(&[EFPacking], EF); MAX_BLOCKS] = [(&[], EF::ZERO); MAX_BLOCKS]; + let mut blocks_hi: [(&[EFPacking], EF); MAX_BLOCKS] = [(&[], EF::ZERO); MAX_BLOCKS]; + let (mut n_lo, mut n_hi) = (0usize, 0usize); + let mut slow = terms.full.len() > 4; + if !terms.grid.is_empty() { + for &b in &terms.grid[j0 >> terms.grid_log] { + let blk = &terms.blocks[b as usize]; + if n_lo < MAX_BLOCKS { + let o = j0 - blk.start; + blocks_lo[n_lo] = (&terms.inners[blk.inner_id as usize][o..o + run], terms.scalars[blk.scalar]); + n_lo += 1; + } else { + slow = true; + } + } + for &b in &terms.grid[(half + j0) >> terms.grid_log] { + let blk = &terms.blocks[b as usize]; + if n_hi < MAX_BLOCKS { + let o = half + j0 - blk.start; + blocks_hi[n_hi] = (&terms.inners[blk.inner_id as usize][o..o + run], terms.scalars[blk.scalar]); + n_hi += 1; + } else { + slow = true; + } + } + } + + if slow { + // rare generic fallback: per-index dispatch + for t in 0..run { + let i = j0 + t; + let w0 = terms.value_at(i); + let w1 = terms.value_at(half + i); + unsafe { + *wp.add(i) = w0; + *wp.add(half + i) = w1; + } + acc0 += w0 * evals[i]; + acc2 += (w1 - w0) * (evals[half + i] - evals[i]); + } + return (acc0, acc2); + } + + let e_lo = &evals[j0..j0 + run]; + let e_hi = &evals[half + j0..half + j0 + run]; + for t in 0..run { + let mut w0 = EFPacking::::ZERO; + let mut w1 = EFPacking::::ZERO; + for k in 0..n_fulls { + w0 += fulls_lo[k].0[t] * fulls_lo[k].1; + w1 += fulls_hi[k].0[t] * 
fulls_hi[k].1; + } + for blk in blocks_lo.iter().take(n_lo) { + w0 += blk.0[t] * blk.1; + } + for blk in blocks_hi.iter().take(n_hi) { + w1 += blk.0[t] * blk.1; + } + unsafe { + *wp.add(j0 + t) = w0; + *wp.add(half + j0 + t) = w1; + } + acc0 += w0 * e_lo[t]; + acc2 += (w1 - w0) * (e_hi[t] - e_lo[t]); } - let x0 = evals[i]; - let x1 = evals[half + i]; - (w0 * x0, (w1 - w0) * (x1 - x0)) + (acc0, acc2) }, |(a0, a2), (b0, b2)| (a0 + b0, a2 + b2), ); From 9fa95eabff0abc71147bfb67cfe94af7c038c58d Mon Sep 17 00:00:00 2001 From: Barnadrot Date: Fri, 12 Jun 2026 18:41:30 +0200 Subject: [PATCH 5/8] pw13-1 T3: delayed-EF (BDT 2024/1046 Alg-3) for rounds 2-3 of the WHIR initial sumcheck Base evals are never promoted at the r1 fold: rounds 2-3 read the ORIGINAL base-packed evals through the 2-slice view x = s0 + r1*s1 (s0(j)=e[j], s1(j)=e[n/2+j]-e[j]), accumulating per-slice basexEF partials combined with r1 once per round; evals collapse to ExtensionPacked at round 3's fold; many_rounds tail unchanged (n_rounds >= 4 guard). Saves the eval-promote muls + 0.67GB eval write/read at r1; round-2/3 evals-side products run at the T0b-measured 2/2.29 basexEF discount. Production (1550-sig): run_initial_sumcheck_rounds 262 -> 251ms; WHIR 527 -> 509ms; e2e 2.17 -> 2.15s (720 vs 713 XMSS/s) - cumulative h-wf ~ -25ms span, ~ -1.0% e2e (single-run; eval_paired decides). Proof-equality now 3-way (legacy / lazy / lazy+delayed) byte-identical at n {18,22,26}; toggle WHIR_DELAYED_EF (default on). 
Co-Authored-By: Claude Fable 5 --- .../sumcheck/src/product_computation.rs | 142 ++++++++++++++++++ crates/whir/src/open.rs | 43 ++++-- crates/whir/tests/run_whir.rs | 15 +- 3 files changed, 185 insertions(+), 15 deletions(-) diff --git a/crates/backend/sumcheck/src/product_computation.rs b/crates/backend/sumcheck/src/product_computation.rs index cac9bbd6b..b505ebf0c 100644 --- a/crates/backend/sumcheck/src/product_computation.rs +++ b/crates/backend/sumcheck/src/product_computation.rs @@ -272,3 +272,145 @@ where let quadratic = (y_1 - y_0) * (x_1 - x_0); (constant, quadratic) } + +/// Delayed-EF variant of [`run_product_sumcheck_from_round1`] for BASE-packed +/// evals (BDT 2024/1046 Alg-3 factoring): the evals side is never promoted to +/// the extension at the r1 fold. Rounds 2 and 3 read the ORIGINAL base evals +/// through the 2-slice view x = s0 + r1·s1 (s0(j) = e[j], s1(j) = e[n/2+j] − +/// e[j]), accumulating per-slice base×EF partials combined with r1/r2 once per +/// round; the evals collapse to ExtensionPacked at round 3's fold. Transcript +/// is bit-identical (exact-field reassociation of the same round polynomials). +/// Requires n_rounds >= 3 (callers fall back to the standard tail otherwise). 
+#[allow(clippy::too_many_arguments)] +pub fn run_product_sumcheck_from_round1_delayed>>( + evals: &[PFPacking], + weights: &[EFPacking], + prover_state: &mut impl FSProver, + r1: EF, + sum_after_r1: EF, + n_rounds: usize, + pow_bits: usize, +) -> (MultilinearPoint, EF, MleOwned, MleOwned) { + assert!(n_rounds >= 3); + let n = evals.len(); + assert_eq!(n, weights.len()); + let q = n / 4; + let unpack = |s: EFPacking| EFPacking::::to_ext_iter([s]).sum::(); + + // --- Pass A: fold weights at r1; round-2 poly via 2-slice evals --- + let r1p = EFPacking::::from(r1); + let mut w_folded = unsafe { ArenaVec::>::uninitialized(n / 2) }; + let wf = parallel::SendPtr(w_folded.as_mut_ptr()); + let (p00, p01, ps0, ps1) = parallel::map_reduce( + q, + || { + ( + EFPacking::::ZERO, + EFPacking::::ZERO, + EFPacking::::ZERO, + EFPacking::::ZERO, + ) + }, + |i| { + let y_0 = r1p * (weights[2 * q + i] - weights[i]) + weights[i]; + let y_1 = r1p * (weights[3 * q + i] - weights[q + i]) + weights[q + i]; + unsafe { + *wf.add(i) = y_0; + *wf.add(q + i) = y_1; + } + // x slices at r1-folded indices i (lo) and q+i (hi): + // s0(j) = e[j], s1(j) = e[n/2 + j] − e[j] + let s0_lo = evals[i]; + let s1_lo = evals[2 * q + i] - evals[i]; + let ds0 = evals[q + i] - evals[i]; + let ds1 = (evals[3 * q + i] - evals[q + i]) - s1_lo; + let d = y_1 - y_0; + (y_0 * s0_lo, y_0 * s1_lo, d * ds0, d * ds1) + }, + |(a, b, c, d), (e, f, g, h)| (a + e, b + f, c + g, d + h), + ); + let c0 = unpack(p00) + r1 * unpack(p01); + let c2 = unpack(ps0) + r1 * unpack(ps1); + let c1 = sum_after_r1 - c0.double() - c2; + let second_poly = DensePolynomial::new(vec![c0, c1, c2]); + + prover_state.add_sumcheck_polynomial(&second_poly.coeffs, None); + prover_state.pow_grinding(pow_bits); + let r2: EF = prover_state.sample(); + let sum_after_r2 = second_poly.evaluate(r2); + + // --- Pass B: fold weights at r2; collapse evals at (r1, r2) to EFP; + // round-3 poly via slices folded once more --- + let q2 = q / 2; + let r2p = 
EFPacking::::from(r2); + let mut w_folded2 = unsafe { ArenaVec::>::uninitialized(q) }; + let mut x_folded2 = unsafe { ArenaVec::>::uninitialized(q) }; + let wf2 = parallel::SendPtr(w_folded2.as_mut_ptr()); + let xf2 = parallel::SendPtr(x_folded2.as_mut_ptr()); + let (p00, p01, ps0, ps1) = parallel::map_reduce( + q2, + || { + ( + EFPacking::::ZERO, + EFPacking::::ZERO, + EFPacking::::ZERO, + EFPacking::::ZERO, + ) + }, + |i| { + // r1-folded domain indices: lo pairings (j, q+j) at r2, then the + // round-3 pair is (i, q2+i) of the r2-folded domain. + // slices at r2-folded index m ∈ [0, q): + // t0(m) = s0(m) + r2·(s0(q+m) − s0(m)), same for t1. + let (a, b) = (i, q2 + i); + // t-slices for both round-3 pair sides: + let t0_lo = r2p * (evals[q + a] - evals[a]) + evals[a]; + let t1_lo = r2p * ((evals[3 * q + a] - evals[q + a]) - (evals[2 * q + a] - evals[a])) + + (evals[2 * q + a] - evals[a]); + let t0_hi = r2p * (evals[q + b] - evals[b]) + evals[b]; + let t1_hi = r2p * ((evals[3 * q + b] - evals[q + b]) - (evals[2 * q + b] - evals[b])) + + (evals[2 * q + b] - evals[b]); + // collapse x at both sides (x = t0 + r1·t1) and fold weights at r2 + let y_lo = r2p * (w_folded[q + a] - w_folded[a]) + w_folded[a]; + let y_hi = r2p * (w_folded[q + b] - w_folded[b]) + w_folded[b]; + let x_lo = t0_lo + r1p * t1_lo; + let x_hi = t0_hi + r1p * t1_hi; + unsafe { + *wf2.add(a) = y_lo; + *wf2.add(b) = y_hi; + *xf2.add(a) = x_lo; + *xf2.add(b) = x_hi; + } + let d = y_hi - y_lo; + (y_lo * t0_lo, y_lo * t1_lo, d * (t0_hi - t0_lo), d * (t1_hi - t1_lo)) + }, + |(a, b, c, d), (e, f, g, h)| (a + e, b + f, c + g, d + h), + ); + let c0 = unpack(p00) + r1 * unpack(p01); + let c2 = unpack(ps0) + r1 * unpack(ps1); + let c1 = sum_after_r2 - c0.double() - c2; + let third_poly = DensePolynomial::new(vec![c0, c1, c2]); + + prover_state.add_sumcheck_polynomial(&third_poly.coeffs, None); + prover_state.pow_grinding(pow_bits); + let r3: EF = prover_state.sample(); + let sum_after_r3 = 
third_poly.evaluate(r3); + + let (mut challenges, folds, sum) = sumcheck_prove_many_rounds( + MleGroupOwned::ExtensionPacked(vec![x_folded2, w_folded2]), + Some(r3), + &ProductComputation {}, + &vec![], + None, + prover_state, + sum_after_r3, + None, + n_rounds - 3, + false, + pow_bits, + ); + + challenges.splice(0..0, [r1, r2, r3]); + let [pol_a, pol_b] = folds.split().try_into().unwrap(); + (challenges, sum, pol_a, pol_b) +} diff --git a/crates/whir/src/open.rs b/crates/whir/src/open.rs index c84e4ca6c..ff66da876 100644 --- a/crates/whir/src/open.rs +++ b/crates/whir/src/open.rs @@ -4,7 +4,10 @@ use ::utils::log2_strict_usize; use fiat_shamir::{FSProver, MerklePath, ProofResult}; use field::PrimeCharacteristicRing; use field::{ExtensionField, Field, PackedFieldExtension, TwoAdicField}; -use sumcheck::{ProductComputation, run_product_sumcheck, run_product_sumcheck_from_round1, sumcheck_prove_many_rounds}; +use sumcheck::{ + ProductComputation, run_product_sumcheck, run_product_sumcheck_from_round1, + run_product_sumcheck_from_round1_delayed, sumcheck_prove_many_rounds, +}; use tracing::{info_span, instrument}; use zk_alloc::{ArenaVec, arena_vec}; @@ -437,16 +440,34 @@ where prover_state.pow_grinding(pow_bits); let r1: EF = prover_state.sample(); let sum1 = first_poly.evaluate(r1); - let weights = Mle::Owned(MleOwned::ExtensionPacked(weights_buf)); - let (challenges, new_sum, folded_evals, folded_weights) = run_product_sumcheck_from_round1( - &evals_packed.by_ref(), - &weights.by_ref(), - prover_state, - r1, - sum1, - folding_factor, - pow_bits, - ); + // Delayed-EF (BDT 2024/1046): keep the base evals unpromoted through + // rounds 2-3 via the 2-slice view; collapse at round 3's fold. + // Transcript bit-identical. n_rounds >= 4 keeps the many_rounds + // tail on its always-exercised path. 
+ let delayed = std::env::var("WHIR_DELAYED_EF").map(|v| v != "0").unwrap_or(true) + && folding_factor >= 4; + let (challenges, new_sum, folded_evals, folded_weights) = if delayed { + run_product_sumcheck_from_round1_delayed( + ev, + &weights_buf, + prover_state, + r1, + sum1, + folding_factor, + pow_bits, + ) + } else { + let weights = Mle::Owned(MleOwned::ExtensionPacked(weights_buf)); + run_product_sumcheck_from_round1( + &evals_packed.by_ref(), + &weights.by_ref(), + prover_state, + r1, + sum1, + folding_factor, + pow_bits, + ) + }; let sumcheck = Self { evals: folded_evals, weights: folded_weights, diff --git a/crates/whir/tests/run_whir.rs b/crates/whir/tests/run_whir.rs index d32798e2d..73c4f70ec 100644 --- a/crates/whir/tests/run_whir.rs +++ b/crates/whir/tests/run_whir.rs @@ -299,8 +299,9 @@ fn test_lazy_combine_proof_equality() { statement.push(s); } - let prove_once = |lazy: bool| { + let prove_once = |lazy: bool, delayed: bool| { set_lazy(if lazy { "1" } else { "0" }); + unsafe { std::env::set_var("WHIR_DELAYED_EF", if delayed { "1" } else { "0" }) } let mut prover_state = ProverState::new(poseidon16.clone(), Default::default()); let poly_mle: MleOwned = MleOwned::Base(ArenaVec::from_iter(polynomial.clone())); let witness = params.commit(&mut prover_state, &poly_mle, 1 << num_variables); @@ -308,18 +309,24 @@ fn test_lazy_combine_proof_equality() { prover_state.into_proof() }; - let proof_legacy = prove_once(false); - let proof_lazy = prove_once(true); + let proof_legacy = prove_once(false, false); + let proof_lazy = prove_once(true, false); assert_eq!( proof_legacy, proof_lazy, "lazy combine produced a different proof (seed {seed}, n {num_variables})" ); + let proof_delayed = prove_once(true, true); + assert_eq!( + proof_legacy, proof_delayed, + "delayed-EF produced a different proof (seed {seed}, n {num_variables})" + ); - let mut verifier_state = VerifierState::::new(proof_lazy, poseidon16.clone(), Default::default()).unwrap(); + let mut 
verifier_state = VerifierState::::new(proof_delayed, poseidon16.clone(), Default::default()).unwrap(); let parsed_commitment = params.parse_commitment::(&mut verifier_state).unwrap(); params .verify::(&mut verifier_state, &parsed_commitment, statement.clone()) .unwrap(); } set_lazy("1"); + unsafe { std::env::set_var("WHIR_DELAYED_EF", "1") } } From 1393f13b4615d9ecf43099b2db37508edf79e929 Mon Sep 17 00:00:00 2001 From: Barnadrot Date: Fri, 12 Jun 2026 18:59:51 +0200 Subject: [PATCH 6/8] pw13-1 T3.1: cargo fmt (correctness gate Layer 0.9) Co-Authored-By: Claude Fable 5 --- .../koala-bear/src/quintic_extension/mod.rs | 3 +- .../koala-bear/src/quintic_extension/tests.rs | 234 ++++++++++++++++++ crates/whir/src/open.rs | 152 +++++++++--- crates/whir/tests/run_whir.rs | 13 +- 4 files changed, 359 insertions(+), 43 deletions(-) create mode 100644 crates/backend/koala-bear/src/quintic_extension/tests.rs diff --git a/crates/backend/koala-bear/src/quintic_extension/mod.rs b/crates/backend/koala-bear/src/quintic_extension/mod.rs index 6ccdca4f7..79f4f7e83 100644 --- a/crates/backend/koala-bear/src/quintic_extension/mod.rs +++ b/crates/backend/koala-bear/src/quintic_extension/mod.rs @@ -11,7 +11,8 @@ use crate::{KoalaBear, KoalaBearParameters}; pub mod extension; pub(crate) mod packed_extension; pub(crate) mod packing; - +#[cfg(test)] +mod tests; pub type QuinticExtensionFieldKB = QuinticExtensionField; pub type PackedQuinticExtensionFieldKB = PackedQuinticExtensionField::Packing>; diff --git a/crates/backend/koala-bear/src/quintic_extension/tests.rs b/crates/backend/koala-bear/src/quintic_extension/tests.rs new file mode 100644 index 000000000..9a70c3200 --- /dev/null +++ b/crates/backend/koala-bear/src/quintic_extension/tests.rs @@ -0,0 +1,234 @@ +// Property tests for QuinticExtensionField and PackedQuinticExtensionField. +// +// These verify algebraic properties that any correct field extension must +// satisfy. 
They are a prerequisite for allowing the autoresearch loop to +// modify quintic_extension code — without them, a subtle arithmetic bug +// could pass the end-to-end WHIR test while silently corrupting proofs. +// +// Each test uses a seeded RNG for reproducibility and runs 200 random +// iterations (enough to hit alignment/packing edge cases without being slow). + +#[cfg(test)] +mod tests { + use crate::KoalaBear; + use crate::quintic_extension::extension::QuinticExtensionField; + use crate::quintic_extension::packed_extension::PackedQuinticExtensionField; + use field::{Field, PackedFieldExtension, PackedValue, PrimeCharacteristicRing}; + use rand::rngs::StdRng; + use rand::{RngExt, SeedableRng}; + + type QEF = QuinticExtensionField; + type PQEF = PackedQuinticExtensionField::Packing>; + + const ITERS: usize = 200; + + fn rng() -> StdRng { + StdRng::seed_from_u64(0xdeadbeef_cafef00d) + } + + fn rand_nonzero(rng: &mut StdRng) -> QEF { + loop { + let x: QEF = rng.random(); + if !x.is_zero() { + return x; + } + } + } + + // --------------------------------------------------------------- + // Scalar QuinticExtensionField arithmetic properties + // --------------------------------------------------------------- + + #[test] + fn mul_commutative() { + let mut rng = rng(); + for _ in 0..ITERS { + let a: QEF = rng.random(); + let b: QEF = rng.random(); + assert_eq!(a * b, b * a, "commutativity: a*b != b*a"); + } + } + + #[test] + fn mul_associative() { + let mut rng = rng(); + for _ in 0..ITERS { + let a: QEF = rng.random(); + let b: QEF = rng.random(); + let c: QEF = rng.random(); + assert_eq!((a * b) * c, a * (b * c), "associativity: (a*b)*c != a*(b*c)"); + } + } + + #[test] + fn mul_distributive_over_add() { + let mut rng = rng(); + for _ in 0..ITERS { + let a: QEF = rng.random(); + let b: QEF = rng.random(); + let c: QEF = rng.random(); + assert_eq!(a * (b + c), a * b + a * c, "distributivity: a*(b+c) != a*b + a*c"); + } + } + + #[test] + fn mul_identity() { + let mut 
rng = rng(); + let one = QEF::ONE; + let zero = QEF::ZERO; + for _ in 0..ITERS { + let a: QEF = rng.random(); + assert_eq!(a * one, a, "a * ONE != a"); + assert_eq!(a * zero, zero, "a * ZERO != ZERO"); + } + } + + #[test] + fn add_sub_roundtrip() { + let mut rng = rng(); + for _ in 0..ITERS { + let a: QEF = rng.random(); + let b: QEF = rng.random(); + assert_eq!((a + b) - b, a, "(a+b)-b != a"); + assert_eq!((a - b) + b, a, "(a-b)+b != a"); + } + } + + #[test] + fn neg_double_is_identity() { + let mut rng = rng(); + for _ in 0..ITERS { + let a: QEF = rng.random(); + assert_eq!(-(-a), a, "--a != a"); + assert_eq!(a + (-a), QEF::ZERO, "a + (-a) != 0"); + } + } + + #[test] + fn square_equals_self_mul() { + let mut rng = rng(); + for _ in 0..ITERS { + let a: QEF = rng.random(); + assert_eq!(a.square(), a * a, "a.square() != a * a"); + } + } + + #[test] + fn inverse_roundtrip() { + let mut rng = rng(); + let one = QEF::ONE; + for _ in 0..ITERS { + let a = rand_nonzero(&mut rng); + let inv = a.try_inverse().expect("nonzero element should be invertible"); + assert_eq!(a * inv, one, "a * a^-1 != 1"); + assert_eq!(inv * a, one, "a^-1 * a != 1"); + } + } + + #[test] + fn zero_not_invertible() { + assert!(QEF::ZERO.try_inverse().is_none(), "zero should not be invertible"); + } + + #[test] + fn base_field_embedding() { + let mut rng = rng(); + for _ in 0..ITERS { + let x: KoalaBear = rng.random(); + let y: KoalaBear = rng.random(); + let ex = QEF::from(x); + let ey = QEF::from(y); + assert_eq!(QEF::from(x * y), ex * ey, "embedding must preserve mul"); + assert_eq!(QEF::from(x + y), ex + ey, "embedding must preserve add"); + } + } + + // frobenius is private; skip automorphism test. + // If frobenius becomes pub, uncomment and test: + // fn frobenius_is_automorphism() { ... 
} + + // --------------------------------------------------------------- + // Packed ↔ scalar consistency + // --------------------------------------------------------------- + + const WIDTH: usize = <::Packing as PackedValue>::WIDTH; + + fn make_packed(elems: &[QEF]) -> PQEF { + assert_eq!(elems.len(), WIDTH); + PQEF::from_ext_slice(elems) + } + + fn unpack(p: PQEF) -> Vec { + >::to_ext_iter([p]).collect() + } + + #[test] + fn packed_add_matches_scalar() { + let mut rng = rng(); + for _ in 0..ITERS { + let a: Vec = (0..WIDTH).map(|_| rng.random()).collect(); + let b: Vec = (0..WIDTH).map(|_| rng.random()).collect(); + let pa = make_packed(&a); + let pb = make_packed(&b); + let result = unpack(pa + pb); + for i in 0..WIDTH { + assert_eq!(result[i], a[i] + b[i], "packed add lane {i} mismatch"); + } + } + } + + #[test] + fn packed_sub_matches_scalar() { + let mut rng = rng(); + for _ in 0..ITERS { + let a: Vec = (0..WIDTH).map(|_| rng.random()).collect(); + let b: Vec = (0..WIDTH).map(|_| rng.random()).collect(); + let pa = make_packed(&a); + let pb = make_packed(&b); + let result = unpack(pa - pb); + for i in 0..WIDTH { + assert_eq!(result[i], a[i] - b[i], "packed sub lane {i} mismatch"); + } + } + } + + #[test] + fn packed_mul_matches_scalar() { + let mut rng = rng(); + for _ in 0..ITERS { + let a: Vec = (0..WIDTH).map(|_| rng.random()).collect(); + let b: Vec = (0..WIDTH).map(|_| rng.random()).collect(); + let pa = make_packed(&a); + let pb = make_packed(&b); + let result = unpack(pa * pb); + for i in 0..WIDTH { + assert_eq!(result[i], a[i] * b[i], "packed mul lane {i} mismatch"); + } + } + } + + #[test] + fn packed_base_mul_matches_scalar() { + let mut rng = rng(); + for _ in 0..ITERS { + let a: Vec = (0..WIDTH).map(|_| rng.random()).collect(); + let s: KoalaBear = rng.random(); + let pa = make_packed(&a); + let result = unpack(pa * s); + for i in 0..WIDTH { + assert_eq!(result[i], a[i] * QEF::from(s), "packed base-mul lane {i} mismatch"); + } + } + } + + #[test] 
+ fn packed_roundtrip() { + let mut rng = rng(); + for _ in 0..ITERS { + let elems: Vec = (0..WIDTH).map(|_| rng.random()).collect(); + let packed = make_packed(&elems); + let unpacked = unpack(packed); + assert_eq!(unpacked, elems, "pack → unpack roundtrip failed"); + } + } +} diff --git a/crates/whir/src/open.rs b/crates/whir/src/open.rs index ff66da876..08bd93e16 100644 --- a/crates/whir/src/open.rs +++ b/crates/whir/src/open.rs @@ -444,8 +444,7 @@ where // rounds 2-3 via the 2-slice view; collapse at round 3's fold. // Transcript bit-identical. n_rounds >= 4 keeps the many_rounds // tail on its always-exercised path. - let delayed = std::env::var("WHIR_DELAYED_EF").map(|v| v != "0").unwrap_or(true) - && folding_factor >= 4; + let delayed = std::env::var("WHIR_DELAYED_EF").map(|v| v != "0").unwrap_or(true) && folding_factor >= 4; let (challenges, new_sum, folded_evals, folded_weights) = if delayed { run_product_sumcheck_from_round1_delayed( ev, @@ -696,8 +695,8 @@ where const LAZY_OVERLAY_SPAN_MAX: usize = 8; // packed words; small blocks are pre-expanded struct LazyFullTerm>> { - left: ArenaVec, // 2^A prefix table, statement scalar folded in - right: ArenaVec>, // 2^(n - A - w) packed suffix table + left: ArenaVec, // 2^A prefix table, statement scalar folded in + right: ArenaVec>, // 2^(n - A - w) packed suffix table rshift: usize, rmask: usize, } @@ -840,7 +839,11 @@ where let mut sorted = smt.values.iter().map(|e| e.selector).collect::>(); sorted.sort_unstable(); sorted.dedup(); - assert_eq!(sorted.len(), smt.values.len(), "Duplicate selectors in sparse statement"); + assert_eq!( + sorted.len(), + smt.values.len(), + "Duplicate selectors in sparse statement" + ); let inner: ArenaVec> = if smt.is_next { let next = matrix_next_mle_folded(&smt.point.0); @@ -974,7 +977,10 @@ where let blk = &terms.blocks[b as usize]; if n_lo < MAX_BLOCKS { let o = j0 - blk.start; - blocks_lo[n_lo] = (&terms.inners[blk.inner_id as usize][o..o + run], terms.scalars[blk.scalar]); 
+ blocks_lo[n_lo] = ( + &terms.inners[blk.inner_id as usize][o..o + run], + terms.scalars[blk.scalar], + ); n_lo += 1; } else { slow = true; @@ -984,7 +990,10 @@ where let blk = &terms.blocks[b as usize]; if n_hi < MAX_BLOCKS { let o = half + j0 - blk.start; - blocks_hi[n_hi] = (&terms.inners[blk.inner_id as usize][o..o + run], terms.scalars[blk.scalar]); + blocks_hi[n_hi] = ( + &terms.inners[blk.inner_id as usize][o..o + run], + terms.scalars[blk.scalar], + ); n_hi += 1; } else { slow = true; @@ -1096,9 +1105,9 @@ mod fusion_bench { /// Full-eq term: scalar pre-multiplied into the prefix table. /// value(j) = right_packed[j & rmask] * left[j >> rshift] struct FullT { - left: ArenaVec, // 2^A entries, scaled - right: ArenaVec, // 2^(n - A - w) packed entries - rshift: usize, // n - A - w + left: ArenaVec, // 2^A entries, scaled + right: ArenaVec, // 2^(n - A - w) packed entries + rshift: usize, // n - A - w rmask: usize, } @@ -1106,12 +1115,12 @@ mod fusion_bench { /// selector blocks with per-block scalars). /// Covers packed range [start, start + n_blocks << ishift). 
struct DenseT { - start: usize, // packed units + start: usize, // packed units end: usize, - ishift: usize, // inner_vars - w + ishift: usize, // inner_vars - w imask: usize, - inner: ArenaVec, // 2^ishift packed entries (unscaled) - scalars: Vec, // per block, gamma powers + inner: ArenaVec, // 2^ishift packed entries (unscaled) + scalars: Vec, // per block, gamma powers } struct LazyTerms { @@ -1147,7 +1156,9 @@ mod fusion_bench { } fn rnd_vals(rng: &mut StdRng, first_sel: usize, n: usize) -> Vec> { - (0..n).map(|c| SparseValue::new(first_sel + c, rng.random::())).collect() + (0..n) + .map(|c| SparseValue::new(first_sel + c, rng.random::())) + .collect() } fn build_statements(n_vars: usize, rng: &mut StdRng) -> Vec> { @@ -1347,7 +1358,9 @@ mod fusion_bench { parallel::par_chunks_mut(unpacked, 1 << 16, |chunk_idx, chunk| { let mut state = (chunk_idx as u64).wrapping_mul(0x9E3779B97F4A7C15) | 1; for slot in chunk { - state = state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); *slot = F::from_u32((state >> 33) as u32 & 0x3FFFFFFF); } }); @@ -1382,7 +1395,10 @@ mod fusion_bench { let mut rng = StdRng::seed_from_u64(42); let gamma: EF = rng.random(); let stmts = build_statements(n_vars, &mut rng); - println!("T0a: n_vars={n_vars}, packing_log_width={w}, {} statements", stmts.len()); + println!( + "T0a: n_vars={n_vars}, packing_log_width={w}, {} statements", + stmts.len() + ); let evals = cheap_base_fill(1 << (n_vars - w)); @@ -1427,10 +1443,21 @@ mod fusion_bench { let lazy_total = t_terms + t_lazy_r0 + t_lazy_r1; let ratio_spec = t_lazy_r0 / (t_combine + t_read); let ratio_e2e = lazy_total / base_total; - println!(" baseline: combine {:.0}ms + read {:.0}ms + r0 {:.0}ms + r1fold {:.0}ms (combine+r0+r1 = {:.0}ms)", - t_combine * 1e3, t_read * 1e3, t_r0 * 1e3, t_r1 * 1e3, base_total * 1e3); - println!(" lazy: terms {:.0}ms + r0 {:.0}ms + r1fold {:.0}ms 
(total {:.0}ms)", - t_terms * 1e3, t_lazy_r0 * 1e3, t_lazy_r1 * 1e3, lazy_total * 1e3); + println!( + " baseline: combine {:.0}ms + read {:.0}ms + r0 {:.0}ms + r1fold {:.0}ms (combine+r0+r1 = {:.0}ms)", + t_combine * 1e3, + t_read * 1e3, + t_r0 * 1e3, + t_r1 * 1e3, + base_total * 1e3 + ); + println!( + " lazy: terms {:.0}ms + r0 {:.0}ms + r1fold {:.0}ms (total {:.0}ms)", + t_terms * 1e3, + t_lazy_r0 * 1e3, + t_lazy_r1 * 1e3, + lazy_total * 1e3 + ); let verdict = if ratio_spec <= 1.3 { "PASS" } else if ratio_spec <= 2.0 { @@ -1441,8 +1468,13 @@ mod fusion_bench { println!( "T0A: ratio_spec (lazy_r0 / (combine+read)) = {ratio_spec:.2} (gate: <=1.3 PASS / <=2.0 GRAY / >2.0 KILL) => {verdict}" ); - println!("T0A: ratio_e2e (lazy r0+r1+terms / combine+r0+r1) = {ratio_e2e:.2} (decision-relevant; <1.0 = net win)"); - assert!(ratio_spec <= 2.0, "T0a KILL: lazy round-0 {ratio_spec:.2}x the materialized combine+read"); + println!( + "T0A: ratio_e2e (lazy r0+r1+terms / combine+r0+r1) = {ratio_e2e:.2} (decision-relevant; <1.0 = net win)" + ); + assert!( + ratio_spec <= 2.0, + "T0a KILL: lazy round-0 {ratio_spec:.2}x the materialized combine+read" + ); } #[test] @@ -1456,7 +1488,9 @@ mod fusion_bench { let n = 1 << (log_n - w); let base = cheap_base_fill(n); let ext: ArenaVec = { - let vals: Vec = (0..(n << w)).map(|i| EF::from(F::from_u32((i as u32) | 1)) * EF::from_u32(7)).collect(); + let vals: Vec = (0..(n << w)) + .map(|i| EF::from(F::from_u32((i as u32) | 1)) * EF::from_u32(7)) + .collect(); pack_extension(&vals) }; let wts: ArenaVec = { @@ -1464,15 +1498,16 @@ mod fusion_bench { pack_extension(&vals) }; let sum: EF = rng.random(); - let (t_base, p1) = time_med(3, || { - compute_product_sumcheck_polynomial(&base, &wts, sum, decompose) - }); - let (t_ext, p2) = time_med(3, || { - compute_product_sumcheck_polynomial(&ext, &wts, sum, decompose) - }); + let (t_base, p1) = time_med(3, || compute_product_sumcheck_polynomial(&base, &wts, sum, decompose)); + let (t_ext, p2) = 
time_med(3, || compute_product_sumcheck_polynomial(&ext, &wts, sum, decompose)); black_box((p1, p2)); last_ratio = t_ext / t_base; - println!(" 2^{log_n}: basexEF {:.1}ms, EFxEF {:.1}ms, ratio {:.2}", t_base * 1e3, t_ext * 1e3, last_ratio); + println!( + " 2^{log_n}: basexEF {:.1}ms, EFxEF {:.1}ms, ratio {:.2}", + t_base * 1e3, + t_ext * 1e3, + last_ratio + ); } let max_slices = if last_ratio >= 4.0 { 4 @@ -1481,7 +1516,9 @@ mod fusion_bench { } else { 1 }; - println!("T0B: EFxEF/basexEF = {last_ratio:.2} => MAX_SLICES = {max_slices} (delayed-EF profitable while n_slices < ratio)"); + println!( + "T0B: EFxEF/basexEF = {last_ratio:.2} => MAX_SLICES = {max_slices} (delayed-EF profitable while n_slices < ratio)" + ); } } @@ -1500,32 +1537,64 @@ mod lazy_combine_diag { fn diag_lazy_vs_combine_failing_shape() { let num_variables = 20usize; let mut rng = StdRng::seed_from_u64(7); - let polynomial = (0..1usize << num_variables).map(|_| rng.random::()).collect::>(); + let polynomial = (0..1usize << num_variables) + .map(|_| rng.random::()) + .collect::>(); let mut statement: Vec> = Vec::new(); // 2 fake OOD full statements at the front (mirrors initialize_first_round_state) for _ in 0..2 { let p = MultilinearPoint((0..num_variables).map(|_| rng.random::()).collect::>()); - statement.push(SparseStatement::new(num_variables, p, vec![SparseValue { selector: 0, value: rng.random::() }])); + statement.push(SparseStatement::new( + num_variables, + p, + vec![SparseValue { + selector: 0, + value: rng.random::(), + }], + )); } for (selector_len, n_sels) in [(6usize, 5usize), (8, 9), (11, 3)] { - let point = MultilinearPoint((0..num_variables - selector_len).map(|_| rng.random::()).collect::>()); + let point = MultilinearPoint( + (0..num_variables - selector_len) + .map(|_| rng.random::()) + .collect::>(), + ); let first = rng.random_range(0..(1usize << selector_len) - n_sels); statement.push(SparseStatement::new( num_variables, point, - (0..n_sels).map(|k| SparseValue { selector: 
first + k, value: rng.random::() }).collect(), + (0..n_sels) + .map(|k| SparseValue { + selector: first + k, + value: rng.random::(), + }) + .collect(), )); } { let point = MultilinearPoint((0..num_variables - 5).map(|_| rng.random::()).collect::>()); let sel = rng.random_range(0..32); - statement.push(SparseStatement::new(num_variables, point, vec![SparseValue { selector: sel, value: rng.random::() }])); + statement.push(SparseStatement::new( + num_variables, + point, + vec![SparseValue { + selector: sel, + value: rng.random::(), + }], + )); } for inner in [0usize, 1] { let point = MultilinearPoint((0..inner).map(|_| rng.random::()).collect::>()); let sel = rng.random_range(0..(1usize << (num_variables - inner))); - statement.push(SparseStatement::new(num_variables, point, vec![SparseValue { selector: sel, value: rng.random::() }])); + statement.push(SparseStatement::new( + num_variables, + point, + vec![SparseValue { + selector: sel, + value: rng.random::(), + }], + )); } { let inner = 10usize; @@ -1533,7 +1602,12 @@ mod lazy_combine_diag { let mut s = SparseStatement::new( num_variables, point, - (0..2usize).map(|k| SparseValue { selector: 3 + k, value: rng.random::() }).collect(), + (0..2usize) + .map(|k| SparseValue { + selector: 3 + k, + value: rng.random::(), + }) + .collect(), ); s.is_next = true; statement.push(s); diff --git a/crates/whir/tests/run_whir.rs b/crates/whir/tests/run_whir.rs index 73c4f70ec..e1583e416 100644 --- a/crates/whir/tests/run_whir.rs +++ b/crates/whir/tests/run_whir.rs @@ -219,7 +219,9 @@ fn test_lazy_combine_proof_equality() { assert_eq!(r.query_pow_bits, 0, "test needs zero grinding"); } let mut rng = StdRng::seed_from_u64(seed); - let polynomial = (0..1usize << num_variables).map(|_| rng.random::()).collect::>(); + let polynomial = (0..1usize << num_variables) + .map(|_| rng.random::()) + .collect::>(); let mut statement: Vec> = Vec::new(); // full statement (selector 0, full-length point): with this config the @@ -237,7 +239,11 
@@ fn test_lazy_combine_proof_equality() { } // dense multi-selector blocks (table-shaped) for (selector_len, n_sels) in [(6usize, 5usize), (8, 9), (11, 3)] { - let point = MultilinearPoint((0..num_variables - selector_len).map(|_| rng.random::()).collect::>()); + let point = MultilinearPoint( + (0..num_variables - selector_len) + .map(|_| rng.random::()) + .collect::>(), + ); let first = rng.random_range(0..(1usize << selector_len) - n_sels); statement.push(SparseStatement::new( num_variables, @@ -321,7 +327,8 @@ fn test_lazy_combine_proof_equality() { "delayed-EF produced a different proof (seed {seed}, n {num_variables})" ); - let mut verifier_state = VerifierState::::new(proof_delayed, poseidon16.clone(), Default::default()).unwrap(); + let mut verifier_state = + VerifierState::::new(proof_delayed, poseidon16.clone(), Default::default()).unwrap(); let parsed_commitment = params.parse_commitment::(&mut verifier_state).unwrap(); params .verify::(&mut verifier_state, &parsed_commitment, statement.clone()) From 59f16dbe7219b0cc2ae045ab6734dfe0acbf8e51 Mon Sep 17 00:00:00 2001 From: Barnadrot Date: Thu, 18 Jun 2026 12:58:44 +0000 Subject: [PATCH 7/8] remove quintic extension tests (injected by correctness gate, not part of this PR) --- .../koala-bear/src/quintic_extension/mod.rs | 2 - .../koala-bear/src/quintic_extension/tests.rs | 234 ------------------ 2 files changed, 236 deletions(-) delete mode 100644 crates/backend/koala-bear/src/quintic_extension/tests.rs diff --git a/crates/backend/koala-bear/src/quintic_extension/mod.rs b/crates/backend/koala-bear/src/quintic_extension/mod.rs index 79f4f7e83..ce7c57d6b 100644 --- a/crates/backend/koala-bear/src/quintic_extension/mod.rs +++ b/crates/backend/koala-bear/src/quintic_extension/mod.rs @@ -11,8 +11,6 @@ use crate::{KoalaBear, KoalaBearParameters}; pub mod extension; pub(crate) mod packed_extension; pub(crate) mod packing; -#[cfg(test)] -mod tests; pub type QuinticExtensionFieldKB = QuinticExtensionField; pub type 
PackedQuinticExtensionFieldKB = PackedQuinticExtensionField::Packing>; diff --git a/crates/backend/koala-bear/src/quintic_extension/tests.rs b/crates/backend/koala-bear/src/quintic_extension/tests.rs deleted file mode 100644 index 9a70c3200..000000000 --- a/crates/backend/koala-bear/src/quintic_extension/tests.rs +++ /dev/null @@ -1,234 +0,0 @@ -// Property tests for QuinticExtensionField and PackedQuinticExtensionField. -// -// These verify algebraic properties that any correct field extension must -// satisfy. They are a prerequisite for allowing the autoresearch loop to -// modify quintic_extension code — without them, a subtle arithmetic bug -// could pass the end-to-end WHIR test while silently corrupting proofs. -// -// Each test uses a seeded RNG for reproducibility and runs 200 random -// iterations (enough to hit alignment/packing edge cases without being slow). - -#[cfg(test)] -mod tests { - use crate::KoalaBear; - use crate::quintic_extension::extension::QuinticExtensionField; - use crate::quintic_extension::packed_extension::PackedQuinticExtensionField; - use field::{Field, PackedFieldExtension, PackedValue, PrimeCharacteristicRing}; - use rand::rngs::StdRng; - use rand::{RngExt, SeedableRng}; - - type QEF = QuinticExtensionField; - type PQEF = PackedQuinticExtensionField::Packing>; - - const ITERS: usize = 200; - - fn rng() -> StdRng { - StdRng::seed_from_u64(0xdeadbeef_cafef00d) - } - - fn rand_nonzero(rng: &mut StdRng) -> QEF { - loop { - let x: QEF = rng.random(); - if !x.is_zero() { - return x; - } - } - } - - // --------------------------------------------------------------- - // Scalar QuinticExtensionField arithmetic properties - // --------------------------------------------------------------- - - #[test] - fn mul_commutative() { - let mut rng = rng(); - for _ in 0..ITERS { - let a: QEF = rng.random(); - let b: QEF = rng.random(); - assert_eq!(a * b, b * a, "commutativity: a*b != b*a"); - } - } - - #[test] - fn mul_associative() { - let mut 
rng = rng(); - for _ in 0..ITERS { - let a: QEF = rng.random(); - let b: QEF = rng.random(); - let c: QEF = rng.random(); - assert_eq!((a * b) * c, a * (b * c), "associativity: (a*b)*c != a*(b*c)"); - } - } - - #[test] - fn mul_distributive_over_add() { - let mut rng = rng(); - for _ in 0..ITERS { - let a: QEF = rng.random(); - let b: QEF = rng.random(); - let c: QEF = rng.random(); - assert_eq!(a * (b + c), a * b + a * c, "distributivity: a*(b+c) != a*b + a*c"); - } - } - - #[test] - fn mul_identity() { - let mut rng = rng(); - let one = QEF::ONE; - let zero = QEF::ZERO; - for _ in 0..ITERS { - let a: QEF = rng.random(); - assert_eq!(a * one, a, "a * ONE != a"); - assert_eq!(a * zero, zero, "a * ZERO != ZERO"); - } - } - - #[test] - fn add_sub_roundtrip() { - let mut rng = rng(); - for _ in 0..ITERS { - let a: QEF = rng.random(); - let b: QEF = rng.random(); - assert_eq!((a + b) - b, a, "(a+b)-b != a"); - assert_eq!((a - b) + b, a, "(a-b)+b != a"); - } - } - - #[test] - fn neg_double_is_identity() { - let mut rng = rng(); - for _ in 0..ITERS { - let a: QEF = rng.random(); - assert_eq!(-(-a), a, "--a != a"); - assert_eq!(a + (-a), QEF::ZERO, "a + (-a) != 0"); - } - } - - #[test] - fn square_equals_self_mul() { - let mut rng = rng(); - for _ in 0..ITERS { - let a: QEF = rng.random(); - assert_eq!(a.square(), a * a, "a.square() != a * a"); - } - } - - #[test] - fn inverse_roundtrip() { - let mut rng = rng(); - let one = QEF::ONE; - for _ in 0..ITERS { - let a = rand_nonzero(&mut rng); - let inv = a.try_inverse().expect("nonzero element should be invertible"); - assert_eq!(a * inv, one, "a * a^-1 != 1"); - assert_eq!(inv * a, one, "a^-1 * a != 1"); - } - } - - #[test] - fn zero_not_invertible() { - assert!(QEF::ZERO.try_inverse().is_none(), "zero should not be invertible"); - } - - #[test] - fn base_field_embedding() { - let mut rng = rng(); - for _ in 0..ITERS { - let x: KoalaBear = rng.random(); - let y: KoalaBear = rng.random(); - let ex = QEF::from(x); - let ey = 
QEF::from(y); - assert_eq!(QEF::from(x * y), ex * ey, "embedding must preserve mul"); - assert_eq!(QEF::from(x + y), ex + ey, "embedding must preserve add"); - } - } - - // frobenius is private; skip automorphism test. - // If frobenius becomes pub, uncomment and test: - // fn frobenius_is_automorphism() { ... } - - // --------------------------------------------------------------- - // Packed ↔ scalar consistency - // --------------------------------------------------------------- - - const WIDTH: usize = <::Packing as PackedValue>::WIDTH; - - fn make_packed(elems: &[QEF]) -> PQEF { - assert_eq!(elems.len(), WIDTH); - PQEF::from_ext_slice(elems) - } - - fn unpack(p: PQEF) -> Vec { - >::to_ext_iter([p]).collect() - } - - #[test] - fn packed_add_matches_scalar() { - let mut rng = rng(); - for _ in 0..ITERS { - let a: Vec = (0..WIDTH).map(|_| rng.random()).collect(); - let b: Vec = (0..WIDTH).map(|_| rng.random()).collect(); - let pa = make_packed(&a); - let pb = make_packed(&b); - let result = unpack(pa + pb); - for i in 0..WIDTH { - assert_eq!(result[i], a[i] + b[i], "packed add lane {i} mismatch"); - } - } - } - - #[test] - fn packed_sub_matches_scalar() { - let mut rng = rng(); - for _ in 0..ITERS { - let a: Vec = (0..WIDTH).map(|_| rng.random()).collect(); - let b: Vec = (0..WIDTH).map(|_| rng.random()).collect(); - let pa = make_packed(&a); - let pb = make_packed(&b); - let result = unpack(pa - pb); - for i in 0..WIDTH { - assert_eq!(result[i], a[i] - b[i], "packed sub lane {i} mismatch"); - } - } - } - - #[test] - fn packed_mul_matches_scalar() { - let mut rng = rng(); - for _ in 0..ITERS { - let a: Vec = (0..WIDTH).map(|_| rng.random()).collect(); - let b: Vec = (0..WIDTH).map(|_| rng.random()).collect(); - let pa = make_packed(&a); - let pb = make_packed(&b); - let result = unpack(pa * pb); - for i in 0..WIDTH { - assert_eq!(result[i], a[i] * b[i], "packed mul lane {i} mismatch"); - } - } - } - - #[test] - fn packed_base_mul_matches_scalar() { - let mut rng 
= rng(); - for _ in 0..ITERS { - let a: Vec = (0..WIDTH).map(|_| rng.random()).collect(); - let s: KoalaBear = rng.random(); - let pa = make_packed(&a); - let result = unpack(pa * s); - for i in 0..WIDTH { - assert_eq!(result[i], a[i] * QEF::from(s), "packed base-mul lane {i} mismatch"); - } - } - } - - #[test] - fn packed_roundtrip() { - let mut rng = rng(); - for _ in 0..ITERS { - let elems: Vec = (0..WIDTH).map(|_| rng.random()).collect(); - let packed = make_packed(&elems); - let unpacked = unpack(packed); - assert_eq!(unpacked, elems, "pack → unpack roundtrip failed"); - } - } -} From ed241356bfd284e13e23cc52d5ff476d6c986d4b Mon Sep 17 00:00:00 2001 From: Barnadrot Date: Thu, 18 Jun 2026 13:13:42 +0000 Subject: [PATCH 8/8] =?UTF-8?q?fix:=20clippy=20=E2=80=94=20unused=20variab?= =?UTF-8?q?le=20+=20upper-case=20acronym?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/whir/src/open.rs | 42 ++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/crates/whir/src/open.rs b/crates/whir/src/open.rs index 08bd93e16..90494fe4d 100644 --- a/crates/whir/src/open.rs +++ b/crates/whir/src/open.rs @@ -1087,26 +1087,26 @@ mod fusion_bench { type F = KoalaBear; type EF = QuinticExtensionFieldKB; type FP = PFPacking; - type EFP = EFPacking; + type Efp = EFPacking; fn w_log() -> usize { packing_log_width::() } #[inline(always)] - fn unpack_sum(s: EFP) -> EF { - >::to_ext_iter([s]).sum::() + fn unpack_sum(s: Efp) -> EF { + >::to_ext_iter([s]).sum::() } - fn decompose(e: EFP) -> Vec { - >::to_ext_iter([e]).collect() + fn decompose(e: Efp) -> Vec { + >::to_ext_iter([e]).collect() } /// Full-eq term: scalar pre-multiplied into the prefix table. 
/// value(j) = right_packed[j & rmask] * left[j >> rshift] struct FullT { left: ArenaVec, // 2^A entries, scaled - right: ArenaVec, // 2^(n - A - w) packed entries + right: ArenaVec, // 2^(n - A - w) packed entries rshift: usize, // n - A - w rmask: usize, } @@ -1119,7 +1119,7 @@ mod fusion_bench { end: usize, ishift: usize, // inner_vars - w imask: usize, - inner: ArenaVec, // 2^ishift packed entries (unscaled) + inner: ArenaVec, // 2^ishift packed entries (unscaled) scalars: Vec, // per block, gamma powers } @@ -1130,8 +1130,8 @@ mod fusion_bench { impl LazyTerms { #[inline(always)] - fn at(&self, j: usize) -> EFP { - let mut acc = EFP::ZERO; + fn at(&self, j: usize) -> Efp { + let mut acc = Efp::ZERO; for t in &self.full { acc += t.right[j & t.rmask] * t.left[j >> t.rshift]; } @@ -1216,7 +1216,7 @@ mod fusion_bench { for v in left.iter_mut() { *v *= scalar; } - let right: ArenaVec = eval_eq_packed(&point[a..]); + let right: ArenaVec = eval_eq_packed(&point[a..]); FullT { left, right, @@ -1250,7 +1250,7 @@ mod fusion_bench { smt.inner_num_variables() >= w, "bench statement set must not contain lane-level statements" ); - let inner: ArenaVec = if smt.is_next { + let inner: ArenaVec = if smt.is_next { let next = matrix_next_mle_folded(&smt.point.0); pack_extension(&next) } else { @@ -1287,7 +1287,7 @@ mod fusion_bench { let half = n / 2; let (c0p, c2p) = parallel::map_reduce( half, - || (EFP::ZERO, EFP::ZERO), + || (Efp::ZERO, Efp::ZERO), |i| { let y0 = terms.at(i); let y1 = terms.at(half + i); @@ -1314,17 +1314,17 @@ mod fusion_bench { terms: &LazyTerms, r1: EF, sum: EF, - ) -> (DensePolynomial, ArenaVec, ArenaVec) { + ) -> (DensePolynomial, ArenaVec, ArenaVec) { let n = evals.len(); let quarter = n / 4; - let r1p = EFP::from(r1); - let mut e_folded = unsafe { ArenaVec::::uninitialized(n / 2) }; - let mut w_folded = unsafe { ArenaVec::::uninitialized(n / 2) }; + let r1p = Efp::from(r1); + let mut e_folded = unsafe { ArenaVec::::uninitialized(n / 2) }; + let mut 
w_folded = unsafe { ArenaVec::::uninitialized(n / 2) }; let pe = parallel::SendPtr(e_folded.as_mut_ptr()); let pw = parallel::SendPtr(w_folded.as_mut_ptr()); let (c0p, c2p) = parallel::map_reduce( quarter, - || (EFP::ZERO, EFP::ZERO), + || (Efp::ZERO, Efp::ZERO), |i| { let x_0 = r1p * (evals[2 * quarter + i] - evals[i]) + evals[i]; let x_1 = r1p * (evals[3 * quarter + i] - evals[quarter + i]) + evals[quarter + i]; @@ -1405,7 +1405,7 @@ mod fusion_bench { // --- materialized baseline --- let (t_combine, (weights, sum_m)) = time_med(3, || combine_statement::(&stmts, gamma)); let (t_read, read_sink) = time_med(3, || { - parallel::map_reduce(weights.len(), || EFP::ZERO, |i| weights[i], |a, b| a + b) + parallel::map_reduce(weights.len(), || Efp::ZERO, |i| weights[i], |a, b| a + b) }); black_box(read_sink); let (t_r0, base_r0) = time_med(3, || { @@ -1487,13 +1487,13 @@ mod fusion_bench { for log_n in [20usize, 22, 23] { let n = 1 << (log_n - w); let base = cheap_base_fill(n); - let ext: ArenaVec = { + let ext: ArenaVec = { let vals: Vec = (0..(n << w)) .map(|i| EF::from(F::from_u32((i as u32) | 1)) * EF::from_u32(7)) .collect(); pack_extension(&vals) }; - let wts: ArenaVec = { + let wts: ArenaVec = { let vals: Vec = (0..(n << w)).map(|_| rng.random::()).collect(); pack_extension(&vals) }; @@ -1537,7 +1537,7 @@ mod lazy_combine_diag { fn diag_lazy_vs_combine_failing_shape() { let num_variables = 20usize; let mut rng = StdRng::seed_from_u64(7); - let polynomial = (0..1usize << num_variables) + let _polynomial = (0..1usize << num_variables) .map(|_| rng.random::()) .collect::>();