From 8bc4baf740a59c4f919fc8626509a7a851a2e57b Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 12 Jun 2026 17:20:39 +0000 Subject: [PATCH] Channel-parallel float dot for high channel counts (perf C6) Frame-major history + register-blocked 8/4/2/1 channel tiles when channels >= 4, floating-point samples, host targets only (compile-time gate; every embedded ratchet scenario verified 0.00%). Bit-exact vs planar (per-channel tap order unchanged; hash-verified over 30k blocks x 4 configs). Same-minute A/B: float 8/12/16ch -38/-38/-42% wall-clock with AVX2+FMA, -4-5% on baseline SSE2. Fixed-point measured ~1.5x slower channel-parallel and keeps planar (taps-axis auto-vectorization already optimal) - negative result recorded in PERFORMANCE.md along with the two implementation traps (memory accumulators; runtime gate in hot loops). https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9 --- .gitignore | 1 + README.md | 30 ++++----- docs/PERFORMANCE.md | 33 ++++++--- include/srt/polyphase_filter.hpp | 111 +++++++++++++++++++++++++++++-- 4 files changed, 143 insertions(+), 32 deletions(-) diff --git a/.gitignore b/.gitignore index 20cfe70..94fcdc7 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ compile_commands.json CMakeUserPresets.json .vscode/ .idea/ +.claude/ diff --git a/README.md b/README.md index b24046f..117e595 100644 --- a/README.md +++ b/README.md @@ -254,21 +254,21 @@ Indicative numbers from a shared machine (Intel(R) Xeon(R) Processor @ 2.80GHz, | Benchmark | ns/item | ×realtime @48k | |---|---:|---:| -| `BM_Kernel_Float_Fast` | 54.5 | 382× | -| `BM_Kernel_Float_Balanced` | 86.6 | 241× | -| `BM_Kernel_Float_Transparent` | 132.4 | 157× | -| `BM_Kernel_Q15_Balanced` | 51.6 | 404× | -| `BM_Kernel_Q31_Balanced` | 66.5 | 313× | -| `BM_Pipeline_Float_Balanced_1ch` | 88.8 | 235× | -| `BM_Pipeline_Float_Balanced_2ch` | 131.2 | 159× | -| `BM_Pipeline_Float_Balanced_8ch` | 460.2 | 45× | -| `BM_Pipeline_Q15_Balanced_2ch` | 70.8 | 294× | -| 
`BM_Pipeline_Q31_Balanced_2ch` | 130.3 | 160× | -| `BM_Pipeline_Float_Transparent_2ch` | 241.5 | 86× | -| `BM_Pipeline_Float_Balanced_12ch` | 678.3 | 31× | -| `BM_Pipeline_Q15_Balanced_12ch` | 233.8 | 89× | -| `BM_Pipeline_Float_Balanced_16ch` | 894.6 | 23× | -| `BM_Pipeline_Q15_Balanced_16ch` | 290.3 | 72× | +| `BM_Kernel_Float_Fast` | 48.9 | 426× | +| `BM_Kernel_Float_Balanced` | 69.0 | 302× | +| `BM_Kernel_Float_Transparent` | 109.1 | 191× | +| `BM_Kernel_Q15_Balanced` | 45.2 | 461× | +| `BM_Kernel_Q31_Balanced` | 62.9 | 331× | +| `BM_Pipeline_Float_Balanced_1ch` | 67.7 | 308× | +| `BM_Pipeline_Float_Balanced_2ch` | 107.8 | 193× | +| `BM_Pipeline_Float_Balanced_8ch` | 336.2 | 62× | +| `BM_Pipeline_Q15_Balanced_2ch` | 56.0 | 372× | +| `BM_Pipeline_Q31_Balanced_2ch` | 120.8 | 173× | +| `BM_Pipeline_Float_Transparent_2ch` | 164.2 | 127× | +| `BM_Pipeline_Float_Balanced_12ch` | 511.8 | 41× | +| `BM_Pipeline_Q15_Balanced_12ch` | 189.1 | 110× | +| `BM_Pipeline_Float_Balanced_16ch` | 649.6 | 32× | +| `BM_Pipeline_Q15_Balanced_16ch` | 241.8 | 86× | ## Sample types diff --git a/docs/PERFORMANCE.md b/docs/PERFORMANCE.md index b446381..deb5a8d 100644 --- a/docs/PERFORMANCE.md +++ b/docs/PERFORMANCE.md @@ -163,14 +163,27 @@ table is already enforced by test thresholds. shape for HVX. The HVX-compatible shape is **channel-parallel** (one 64-bit lane-pair per channel; 16 channels fill one vector exactly), recorded as hypothesis C6 below. -- [ ] **PR C6…** — channel-parallel dot for high channel counts - (12-channel 7.1.4 and 16-channel AVB-with-reference-mics are real - deployments): channels in SIMD lanes, one accumulator lane per - channel, coefficient broadcast — bit-exact for *every* sample type - including float, since each channel's accumulation order is unchanged. - Wants a channel-major history layout above a channel threshold. - Candidates: AVX2 (4 double lanes), Helium, HVX (16 x int64 lanes). - Profile the 12-channel deinterleave/scatter cost first. 
Hypothesis 5 +- [x] **PR C6** — channel-parallel dot for high channel counts + (frame-major history + register-blocked 8/4/2/1 channel tiles, + `SRT_CP_MIN_CHANNELS` = 4, hosts only). Profile first (callgrind, + 12ch Q15): per-channel dot MACs ≈ 85% of instructions, deinterleave + ~2% — the dots were the target. Results, same-minute A/B: + **float 8/12/16-channel −38/−38/−42% wall-clock with AVX2+FMA** + (`-march=native`; −4–5% on baseline SSE2 builds — gains scale with + SIMD width because the channel axis is the *only* axis the float path + may vectorize on: per-channel double accumulation order is unchanged, + so it is bit-exact, hash-verified against planar over 30k blocks × + 4 configs). **Fixed-point: negative result, planar kept** — the + channel-parallel form measured ~1.5× SLOWER than planar Q15 on hosts + (planar already auto-vectorizes over taps; integer reduction is + exactly reassociable, so that axis was never blocked). Two + implementation lessons recorded: a naive channels-inner loop with + memory accumulators is 2.8× slower than planar (register-block or + don't bother), and the mode gate must be compile-time — a runtime + bool in the hot loops cost +6–8% on the M55 ratchet before the + constexpr gate restored every embedded scenario to 0.00%. + Embedded channel-parallel (HVX 16×int64-lane, Helium) remains a + follow-up candidate if DSP budgets demand it. Hypothesis 5 (deferred): explicit 4-way double accumulation for the float dot — - est. 2–3× float kernel on AVX2-class SIMD, but bit-changing; take only - if budgets demand it. + bit-changing; superseded for N≥4 by C6's bit-exact channel axis, only + relevant for mono/stereo float if ever needed. 
diff --git a/include/srt/polyphase_filter.hpp b/include/srt/polyphase_filter.hpp index 5432724..390a474 100644 --- a/include/srt/polyphase_filter.hpp +++ b/include/srt/polyphase_filter.hpp @@ -37,6 +37,30 @@ #define SRT_Q15_SMLALD 0 #endif +// Channel-parallel dot product for high channel counts (hypothesis C6, +// docs/PERFORMANCE.md): history stored frame-major so the per-tap inner +// loop runs across channels — contiguous loads, one accumulator lane per +// channel, coefficient broadcast. Bit-exact because each channel's +// accumulation order over taps is unchanged (lanes are channels, not +// taps), which is what lets the FLOAT path vectorize at all: its strict +// per-channel double accumulation forbids tap-axis SIMD (PERFORMANCE.md +// hypothesis 5), but the channel axis is free. Float-only by measurement: +// fixed-point planar dots already auto-vectorize over taps on hosts +// (integer reduction is exactly reassociable) and measured ~1.5x FASTER +// than the channel-parallel form. Host-only: the embedded targets keep +// their proven planar codegen (Helium on M55, SMLALD on M33-class, +// Hexagon's measured scalar floor — see PERFORMANCE.md C4/C5). +#if !defined(__ARM_FEATURE_MVE) && !defined(__ARM_FEATURE_DSP) && !defined(__hexagon__) +#define SRT_CHANNEL_PARALLEL 1 +#else +#define SRT_CHANNEL_PARALLEL 0 +#endif +// Minimum channel count for the frame-major path (overridable for A/B +// measurements; the blend-share planar path stays better at low counts). +#ifndef SRT_CP_MIN_CHANNELS +#define SRT_CP_MIN_CHANNELS 4 +#endif + namespace srt { /// Specification of the interpolation prototype filter. 
@@ -243,6 +267,53 @@ inline S dotRow(const typename SampleTraits::Coeff* SRT_RESTRICT row, const S return Tr::finalize(acc); } +/// One K-channel tile of the channel-parallel dot (hypothesis C6): K +/// accumulators live in a constexpr-size local array — registers, not +/// memory — while the tap loop walks the frame-major window with stride +/// `stride` samples per frame. K is the register-blocking factor; a naive +/// channels-inner loop with accumulators in memory measures ~2.8x SLOWER +/// than planar (each mac round-trips its accumulator through the stack). +template +inline void dotTileFrameMajor(const typename SampleTraits::Coeff* SRT_RESTRICT row, + const S* SRT_RESTRICT x, std::size_t taps, std::size_t stride, + S* SRT_RESTRICT out) noexcept { + using Tr = SampleTraits; + typename Tr::Accum acc[K]{}; + for (std::size_t t = 0; t < taps; ++t) { + const auto coeff = row[t]; + const S* SRT_RESTRICT frame = x + t * stride; + for (std::size_t k = 0; k < K; ++k) + acc[k] = Tr::mac(acc[k], frame[k], coeff); + } + for (std::size_t k = 0; k < K; ++k) + out[k] = Tr::finalize(acc[k]); +} + +/// Channel-parallel dot products over a frame-major history block: all +/// channels' outputs for one frame in register-blocked tiles of 8/4/2/1. +/// Per channel the accumulation order over taps equals dotRow's, so the +/// outputs are bit-exact vs the planar path for every sample type — float +/// included, since each channel's double accumulator still sums the taps +/// in the same order (lanes are channels, not taps). 
+template +inline void dotRowsFrameMajor(const typename SampleTraits::Coeff* SRT_RESTRICT row, + const S* SRT_RESTRICT x, std::size_t taps, std::size_t channels, + S* SRT_RESTRICT out) noexcept { + std::size_t c = 0; + for (; c + 8 <= channels; c += 8) + dotTileFrameMajor(row, x + c, taps, channels, out + c); + if (c + 4 <= channels) { + dotTileFrameMajor(row, x + c, taps, channels, out + c); + c += 4; + } + if (c + 2 <= channels) { + dotTileFrameMajor(row, x + c, taps, channels, out + c); + c += 2; + } + if (c < channels) + dotTileFrameMajor(row, x + c, taps, channels, out + c); +} + /// Streaming fractional-delay engine for one converter instance. /// /// Owns the per-channel history delay lines (planar, contiguous windows with @@ -262,16 +333,22 @@ inline S dotRow(const typename SampleTraits::Coeff* SRT_RESTRICT row, const S template class FractionalResampler { public: + /// Frame-major channel-parallel mode is compiled in only on CP targets + /// and only for floating-point samples (see SRT_CHANNEL_PARALLEL). + static constexpr bool kChannelParallel = + SRT_CHANNEL_PARALLEL != 0 && std::is_floating_point_v; + /// Allocates histories and the pop scratch buffer; setup time only. FractionalResampler(const PolyphaseFilterBank& bank, std::size_t channels, std::size_t chunkFrames = 64) : bank_(&bank), channels_(channels), chunk_(chunkFrames), - histCap_(bank.taps() + chunkFrames), scratch_(chunkFrames * channels), hist_(channels), - row_(bank.taps()) { + histCap_(bank.taps() + chunkFrames), scratch_(chunkFrames * channels), + frameMajor_(kChannelParallel && channels >= SRT_CP_MIN_CHANNELS), + hist_(frameMajor_ ? 1 : channels), row_(bank.taps()) { if (channels_ == 0 || chunk_ == 0) throw std::invalid_argument("FractionalResampler: bad config"); for (auto& h : hist_) - h.assign(histCap_, SampleTraits::silence()); + h.assign(histCap_ * (frameMajor_ ? 
channels_ : 1), SampleTraits::silence()); reset(); } @@ -340,6 +417,13 @@ class FractionalResampler { constexpr bool kPreferDotRow = SRT_Q15_SMLALD && std::is_same_v; if (channels_ == 1 && !kPreferDotRow) { // fused blend+mac; no scratch traffic out[n] = interpolatePhase(*bank_, window(0), m); + } else if (kChannelParallel && frameMajor_) { // constant-folds away off-host + // High channel counts: one blend, then all channels' dots in + // a single channel-parallel pass over the frame-major window. + blendRowPhase(*bank_, row_.data(), m); + const std::size_t taps = bank_->taps(); + const S* base = hist_[0].data() + (end_ - taps) * channels_; + dotRowsFrameMajor(row_.data(), base, taps, channels_, out + n * channels_); } else { // Blend once per frame, dot per channel: the blend is the // same for every channel, so this halves the inner-loop work @@ -364,15 +448,23 @@ class FractionalResampler { if (scratchFrames_ == 0) return false; } - if (end_ == histCap_) { // compact: keep the newest T-1 samples at the front + if (end_ == histCap_) { // compact: keep the newest T-1 frames at the front const std::size_t keep = bank_->taps() - 1; + // Samples per frame slot; the gate is compile-time so non-CP + // targets keep their previous codegen exactly (the runtime form + // measured +6-8% on the M55 ratchet from hot-loop branch bloat). + const std::size_t w = (kChannelParallel && frameMajor_) ? 
channels_ : 1; for (auto& h : hist_) - std::memmove(h.data(), h.data() + end_ - keep, keep * sizeof(S)); + std::memmove(h.data(), h.data() + (end_ - keep) * w, keep * w * sizeof(S)); end_ = keep; } const S* frame = scratch_.data() + scratchPos_ * channels_; - for (std::size_t c = 0; c < channels_; ++c) - hist_[c][end_] = frame[c]; + if (kChannelParallel && frameMajor_) { // frames stay interleaved: one contiguous copy + std::memcpy(hist_[0].data() + end_ * channels_, frame, channels_ * sizeof(S)); + } else { + for (std::size_t c = 0; c < channels_; ++c) + hist_[c][end_] = frame[c]; + } ++end_; ++scratchPos_; return true; @@ -383,6 +475,11 @@ class FractionalResampler { std::size_t chunk_; std::size_t histCap_; std::vector scratch_; // interleaved staging for bulk pops + // History storage: frame-major (single interleaved line, hist_[0]) when + // kChannelParallel holds (CP target, floating-point S) and channels >= + // SRT_CP_MIN_CHANNELS; planar (one delay line per channel, hist_[c]) + // otherwise. end_/histCap_ count frames in both modes. + bool frameMajor_; std::vector> hist_; std::vector::Coeff> row_; // per-frame blended coefficients std::size_t end_ = 0; // shared end index; all channels advance in lockstep