From 8bc4baf740a59c4f919fc8626509a7a851a2e57b Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 12 Jun 2026 17:20:39 +0000
Subject: [PATCH] Channel-parallel float dot for high channel counts (perf C6)

Frame-major history + register-blocked 8/4/2/1 channel tiles when
channels >= 4, floating-point samples, host targets only (compile-time
gate; every embedded ratchet scenario verified at 0.00% delta).
Bit-exact vs planar (per-channel tap order unchanged; hash-verified
over 30k blocks x 4 configs). Same-minute A/B: float 8/12/16ch
-38/-38/-42% wall-clock with AVX2+FMA, -4 to -5% on baseline SSE2.
Fixed-point measured ~1.5x slower in channel-parallel form, so it
keeps planar (taps-axis auto-vectorization already optimal) - negative
result recorded in PERFORMANCE.md along with the two implementation
traps (memory accumulators; runtime gate in hot loops).

https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9
---
 .gitignore                       |   1 +
 README.md                        |  30 ++++-----
 docs/PERFORMANCE.md              |  33 ++++++---
 include/srt/polyphase_filter.hpp | 111 +++++++++++++++++++++++++++++--
 4 files changed, 143 insertions(+), 32 deletions(-)

diff --git a/.gitignore b/.gitignore
index 20cfe70..94fcdc7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ compile_commands.json
 CMakeUserPresets.json
 .vscode/
 .idea/
+.claude/
diff --git a/README.md b/README.md
index b24046f..117e595 100644
--- a/README.md
+++ b/README.md
@@ -254,21 +254,21 @@ Indicative numbers from a shared machine (Intel(R) Xeon(R) Processor @ 2.80GHz,
 
 | Benchmark | ns/item | ×realtime @48k |
 |---|---:|---:|
-| `BM_Kernel_Float_Fast` | 54.5 | 382× |
-| `BM_Kernel_Float_Balanced` | 86.6 | 241× |
-| `BM_Kernel_Float_Transparent` | 132.4 | 157× |
-| `BM_Kernel_Q15_Balanced` | 51.6 | 404× |
-| `BM_Kernel_Q31_Balanced` | 66.5 | 313× |
-| `BM_Pipeline_Float_Balanced_1ch` | 88.8 | 235× |
-| `BM_Pipeline_Float_Balanced_2ch` | 131.2 | 159× |
-| `BM_Pipeline_Float_Balanced_8ch` | 460.2 | 45× |
-| `BM_Pipeline_Q15_Balanced_2ch` | 70.8 | 294× |
-| `BM_Pipeline_Q31_Balanced_2ch` | 130.3 | 160× |
-| `BM_Pipeline_Float_Transparent_2ch` | 241.5 | 86× |
-| `BM_Pipeline_Float_Balanced_12ch` | 678.3 | 31× |
-| `BM_Pipeline_Q15_Balanced_12ch` | 233.8 | 89× |
-| `BM_Pipeline_Float_Balanced_16ch` | 894.6 | 23× |
-| `BM_Pipeline_Q15_Balanced_16ch` | 290.3 | 72× |
+| `BM_Kernel_Float_Fast` | 48.9 | 426× |
+| `BM_Kernel_Float_Balanced` | 69.0 | 302× |
+| `BM_Kernel_Float_Transparent` | 109.1 | 191× |
+| `BM_Kernel_Q15_Balanced` | 45.2 | 461× |
+| `BM_Kernel_Q31_Balanced` | 62.9 | 331× |
+| `BM_Pipeline_Float_Balanced_1ch` | 67.7 | 308× |
+| `BM_Pipeline_Float_Balanced_2ch` | 107.8 | 193× |
+| `BM_Pipeline_Float_Balanced_8ch` | 336.2 | 62× |
+| `BM_Pipeline_Q15_Balanced_2ch` | 56.0 | 372× |
+| `BM_Pipeline_Q31_Balanced_2ch` | 120.8 | 173× |
+| `BM_Pipeline_Float_Transparent_2ch` | 164.2 | 127× |
+| `BM_Pipeline_Float_Balanced_12ch` | 511.8 | 41× |
+| `BM_Pipeline_Q15_Balanced_12ch` | 189.1 | 110× |
+| `BM_Pipeline_Float_Balanced_16ch` | 649.6 | 32× |
+| `BM_Pipeline_Q15_Balanced_16ch` | 241.8 | 86× |
 <!-- PERF:END -->
 
 ## Sample types
diff --git a/docs/PERFORMANCE.md b/docs/PERFORMANCE.md
index b446381..deb5a8d 100644
--- a/docs/PERFORMANCE.md
+++ b/docs/PERFORMANCE.md
@@ -163,14 +163,27 @@ table is already enforced by test thresholds.
   shape for HVX. The HVX-compatible shape is **channel-parallel** (one
   64-bit lane-pair per channel; 16 channels fill one vector exactly),
   recorded as hypothesis C6 below.
-- [ ] **PR C6…** — channel-parallel dot for high channel counts
-  (12-channel 7.1.4 and 16-channel AVB-with-reference-mics are real
-  deployments): channels in SIMD lanes, one accumulator lane per
-  channel, coefficient broadcast — bit-exact for *every* sample type
-  including float, since each channel's accumulation order is unchanged.
-  Wants a channel-major history layout above a channel threshold.
-  Candidates: AVX2 (4 double lanes), Helium, HVX (16 x int64 lanes).
-  Profile the 12-channel deinterleave/scatter cost first. Hypothesis 5
+- [x] **PR C6** — channel-parallel dot for high channel counts
+  (frame-major history + register-blocked 8/4/2/1 channel tiles,
+  `SRT_CP_MIN_CHANNELS` = 4, hosts only). Profile first (callgrind,
+  12ch Q15): per-channel dot MACs ≈ 85% of instructions, deinterleave
+  ~2% — the dots were the target. Results, same-minute A/B:
+  **float 8/12/16-channel −38/−38/−42% wall-clock with AVX2+FMA**
+  (`-march=native`; −4 to −5% on baseline SSE2 builds — gains scale with
+  SIMD width because the channel axis is the *only* axis the float path
+  may vectorize on: per-channel double accumulation order is unchanged,
+  so it is bit-exact, hash-verified against planar over 30k blocks ×
+  4 configs). **Fixed-point: negative result, planar kept** — the
+  channel-parallel form measured ~1.5× SLOWER than planar Q15 on hosts
+  (planar already auto-vectorizes over taps; integer reduction is
+  exactly reassociable, so that axis was never blocked). Two
+  implementation lessons recorded: a naive channels-inner loop with
+  memory accumulators is 2.8× slower than planar (register-block or
+  don't bother), and the mode gate must be compile-time — a runtime
+  bool in the hot loops cost +6–8% on the M55 ratchet before the
+  constexpr gate restored every embedded scenario to 0.00%.
+  Embedded channel-parallel (HVX 16×int64-lane, Helium) remains a
+  follow-up candidate if DSP budgets demand it. Hypothesis 5
   (deferred): explicit 4-way double accumulation for the float dot —
-  est. 2–3× float kernel on AVX2-class SIMD, but bit-changing; take only
-  if budgets demand it.
+  bit-changing; superseded for N≥4 by C6's bit-exact channel axis, only
+  relevant for mono/stereo float if ever needed.
diff --git a/include/srt/polyphase_filter.hpp b/include/srt/polyphase_filter.hpp
index 5432724..390a474 100644
--- a/include/srt/polyphase_filter.hpp
+++ b/include/srt/polyphase_filter.hpp
@@ -37,6 +37,30 @@
 #define SRT_Q15_SMLALD 0
 #endif
 
+// Channel-parallel dot product for high channel counts (hypothesis C6,
+// docs/PERFORMANCE.md): history stored frame-major so the per-tap inner
+// loop runs across channels — contiguous loads, one accumulator lane per
+// channel, coefficient broadcast. Bit-exact because each channel's
+// accumulation order over taps is unchanged (lanes are channels, not
+// taps), which is what lets the FLOAT path vectorize at all: its strict
+// per-channel double accumulation forbids tap-axis SIMD (PERFORMANCE.md
+// hypothesis 5), but the channel axis is free. Float-only by measurement:
+// fixed-point planar dots already auto-vectorize over taps on hosts
+// (integer reduction is exactly reassociable) and measured ~1.5x FASTER
+// than the channel-parallel form. Host-only: the embedded targets keep
+// their proven planar codegen (Helium on M55, SMLALD on M33-class,
+// Hexagon's measured scalar floor — see PERFORMANCE.md C4/C5).
+#if !defined(__ARM_FEATURE_MVE) && !defined(__ARM_FEATURE_DSP) && !defined(__hexagon__)
+#define SRT_CHANNEL_PARALLEL 1
+#else // MVE, Arm DSP-extension or Hexagon target: planar path only
+#define SRT_CHANNEL_PARALLEL 0
+#endif // host vs embedded gate
+// Minimum channel count for the frame-major path (#ifndef-guarded so A/B runs
+// can override it; the blend-sharing planar path stays better at low counts).
+#ifndef SRT_CP_MIN_CHANNELS
+#define SRT_CP_MIN_CHANNELS 4
+#endif // SRT_CP_MIN_CHANNELS
+
 namespace srt {
 
 /// Specification of the interpolation prototype filter.
@@ -243,6 +267,53 @@ inline S dotRow(const typename SampleTraits<S>::Coeff* SRT_RESTRICT row, const S
     return Tr::finalize(acc);
 }
 
+/// One K-channel tile of the channel-parallel dot (hypothesis C6): for each
+/// tap t < `taps`, Tr::mac folds `row[t]` and the K adjacent samples at
+/// `x + t * stride` into K accumulators, finalized into `out[0..K)`. K is the
+/// register-blocking factor — a compile-time-size local array, meant to stay
+/// in registers; a naive channels-inner loop with accumulators in memory
+/// measures ~2.8x SLOWER than planar (accumulator stack round-trip per mac).
+template <SampleType S, std::size_t K>
+inline void dotTileFrameMajor(const typename SampleTraits<S>::Coeff* SRT_RESTRICT row,
+                              const S* SRT_RESTRICT x, std::size_t taps, std::size_t stride,
+                              S* SRT_RESTRICT out) noexcept {
+    using Tr = SampleTraits<S>;
+    typename Tr::Accum acc[K]{}; // value-initialized, one accumulator per tile channel
+    for (std::size_t t = 0; t < taps; ++t) {
+        const auto coeff = row[t]; // shared by all K channels at tap t
+        const S* SRT_RESTRICT frame = x + t * stride; // tap t: K adjacent samples
+        for (std::size_t k = 0; k < K; ++k) // compile-time trip count
+            acc[k] = Tr::mac(acc[k], frame[k], coeff);
+    }
+    for (std::size_t k = 0; k < K; ++k)
+        out[k] = Tr::finalize(acc[k]); // K results, contiguous in out
+}
+
+/// Channel-parallel dot products over a frame-major history block: writes
+/// `channels` outputs to `out` from `taps` frames of `channels` samples at
+/// `x`, in register-blocked tiles of 8/4/2/1. Per channel the accumulation
+/// order over taps equals dotRow's, so the outputs are bit-exact vs the
+/// planar path for every sample type — float included, since each channel's
+/// double accumulator still sums the taps in order (lanes are channels).
+template <SampleType S>
+inline void dotRowsFrameMajor(const typename SampleTraits<S>::Coeff* SRT_RESTRICT row,
+                              const S* SRT_RESTRICT x, std::size_t taps, std::size_t channels,
+                              S* SRT_RESTRICT out) noexcept {
+    std::size_t c = 0; // first channel not yet covered by a tile
+    for (; c + 8 <= channels; c += 8) // as many full 8-channel tiles as fit
+        dotTileFrameMajor<S, 8>(row, x + c, taps, channels, out + c);
+    if (c + 4 <= channels) { // remainder is < 8: at most one 4-, 2- and 1-tile
+        dotTileFrameMajor<S, 4>(row, x + c, taps, channels, out + c);
+        c += 4;
+    }
+    if (c + 2 <= channels) {
+        dotTileFrameMajor<S, 2>(row, x + c, taps, channels, out + c);
+        c += 2;
+    }
+    if (c < channels) // one odd channel left
+        dotTileFrameMajor<S, 1>(row, x + c, taps, channels, out + c);
+}
+
 /// Streaming fractional-delay engine for one converter instance.
 ///
 /// Owns the per-channel history delay lines (planar, contiguous windows with
@@ -262,16 +333,22 @@ inline S dotRow(const typename SampleTraits<S>::Coeff* SRT_RESTRICT row, const S
 template <SampleType S>
 class FractionalResampler {
 public:
+    /// Frame-major channel-parallel mode is compiled in only on CP targets
+    /// and only for floating-point samples (see SRT_CHANNEL_PARALLEL).
+    static constexpr bool kChannelParallel =
+        SRT_CHANNEL_PARALLEL != 0 && std::is_floating_point_v<S>;
+
     /// Allocates histories and the pop scratch buffer; setup time only.
     FractionalResampler(const PolyphaseFilterBank<S>& bank, std::size_t channels,
                         std::size_t chunkFrames = 64)
         : bank_(&bank), channels_(channels), chunk_(chunkFrames),
-          histCap_(bank.taps() + chunkFrames), scratch_(chunkFrames * channels), hist_(channels),
-          row_(bank.taps()) {
+          histCap_(bank.taps() + chunkFrames), scratch_(chunkFrames * channels),
+          frameMajor_(kChannelParallel && channels >= SRT_CP_MIN_CHANNELS),
+          hist_(frameMajor_ ? 1 : channels), row_(bank.taps()) {
         if (channels_ == 0 || chunk_ == 0)
             throw std::invalid_argument("FractionalResampler: bad config");
         for (auto& h : hist_)
-            h.assign(histCap_, SampleTraits<S>::silence());
+            h.assign(histCap_ * (frameMajor_ ? channels_ : 1), SampleTraits<S>::silence());
         reset();
     }
 
@@ -340,6 +417,13 @@ class FractionalResampler {
             constexpr bool kPreferDotRow = SRT_Q15_SMLALD && std::is_same_v<S, std::int16_t>;
             if (channels_ == 1 && !kPreferDotRow) { // fused blend+mac; no scratch traffic
                 out[n] = interpolatePhase(*bank_, window(0), m);
+            } else if (kChannelParallel && frameMajor_) { // constant-folds away off-host
+                // High channel counts: one blend, then all channels' dots in
+                // a single channel-parallel pass over the frame-major window.
+                blendRowPhase(*bank_, row_.data(), m);
+                const std::size_t taps = bank_->taps();
+                const S* base = hist_[0].data() + (end_ - taps) * channels_;
+                dotRowsFrameMajor<S>(row_.data(), base, taps, channels_, out + n * channels_);
             } else {
                 // Blend once per frame, dot per channel: the blend is the
                 // same for every channel, so this halves the inner-loop work
@@ -364,15 +448,23 @@ class FractionalResampler {
             if (scratchFrames_ == 0)
                 return false;
         }
-        if (end_ == histCap_) { // compact: keep the newest T-1 samples at the front
+        if (end_ == histCap_) { // compact: keep the newest T-1 frames at the front
             const std::size_t keep = bank_->taps() - 1;
+            // Samples per frame slot; the gate is compile-time so non-CP
+            // targets keep their previous codegen exactly (the runtime form
+            // measured +6-8% on the M55 ratchet from hot-loop branch bloat).
+            const std::size_t w = (kChannelParallel && frameMajor_) ? channels_ : 1;
             for (auto& h : hist_)
-                std::memmove(h.data(), h.data() + end_ - keep, keep * sizeof(S));
+                std::memmove(h.data(), h.data() + (end_ - keep) * w, keep * w * sizeof(S));
             end_ = keep;
         }
         const S* frame = scratch_.data() + scratchPos_ * channels_;
-        for (std::size_t c = 0; c < channels_; ++c)
-            hist_[c][end_] = frame[c];
+        if (kChannelParallel && frameMajor_) { // frames stay interleaved: one contiguous copy
+            std::memcpy(hist_[0].data() + end_ * channels_, frame, channels_ * sizeof(S));
+        } else {
+            for (std::size_t c = 0; c < channels_; ++c)
+                hist_[c][end_] = frame[c];
+        }
         ++end_;
         ++scratchPos_;
         return true;
@@ -383,6 +475,11 @@ class FractionalResampler {
     std::size_t chunk_;
     std::size_t histCap_;
     std::vector<S> scratch_; // interleaved staging for bulk pops
+    // History storage: frame-major (single interleaved line, hist_[0]) when
+    // kChannelParallel (SRT_CHANNEL_PARALLEL target, floating-point S) and
+    // channels >= SRT_CP_MIN_CHANNELS; otherwise planar (one delay line per
+    // channel, hist_[c]). end_/histCap_ count frames in both modes.
+    bool frameMajor_;
     std::vector<std::vector<S>> hist_;
     std::vector<typename SampleTraits<S>::Coeff> row_; // per-frame blended coefficients
     std::size_t end_ = 0; // shared end index; all channels advance in lockstep