From 5bfa7188147074336fdd3553fce4671620cbc61c Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 1 Jul 2026 21:09:15 +0000 Subject: [PATCH 01/16] Book foundation: mdBook skeleton, live-excerpt CI gate, first chapters book/: mdBook project with the full 22-chapter outline (placeholders build), the introduction, and the first complete chapter - the SPSC ring concurrency deep-dive, written as the tone benchmark: algorithm + memory-model argument + rejected alternatives + the wraparound proof + what tests can and cannot certify, ending in verify-it-yourself commands. Code excerpts are included live from the headers via mdBook anchors (ANCHOR comments added to spsc_ring.hpp; comment-only, format-clean). New CI job builds the book with a pinned mdbook binary (v0.4.40, sha256-verified) and treats warnings as errors, so a refactor that orphans an excerpt fails CI - the same freshness contract as the generated README tables. https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9 --- .github/workflows/ci.yml | 31 +++ .gitignore | 1 + book/book.toml | 13 ++ book/src/SUMMARY.md | 47 +++++ book/src/appendix/bibliography.md | 3 + book/src/appendix/cpp-decisions.md | 3 + book/src/appendix/glossary.md | 3 + book/src/introduction.md | 110 ++++++++++ book/src/part0/budgets.md | 3 + book/src/part0/two-crystals.md | 3 + book/src/part1/asrc.md | 3 + book/src/part1/fractional-resampler.md | 3 + book/src/part1/kaiser.md | 3 + book/src/part1/pi-servo.md | 3 + book/src/part1/polyphase-bank.md | 3 + book/src/part1/sample-traits.md | 3 + book/src/part1/spsc-ring.md | 269 +++++++++++++++++++++++++ book/src/part2/icount.md | 3 + book/src/part2/notebooks.md | 3 + book/src/part2/tests.md | 3 + book/src/part3/c1-c2.md | 3 + book/src/part3/c3-c5.md | 3 + book/src/part3/c6.md | 3 + book/src/part4/c-abi.md | 3 + book/src/part4/cortex-m.md | 3 + book/src/part4/hexagon.md | 3 + book/src/part5/hardware.md | 3 + book/src/part5/scaling.md | 3 + include/srt/spsc_ring.hpp | 10 + 29 files changed, 547 
insertions(+) create mode 100644 book/book.toml create mode 100644 book/src/SUMMARY.md create mode 100644 book/src/appendix/bibliography.md create mode 100644 book/src/appendix/cpp-decisions.md create mode 100644 book/src/appendix/glossary.md create mode 100644 book/src/introduction.md create mode 100644 book/src/part0/budgets.md create mode 100644 book/src/part0/two-crystals.md create mode 100644 book/src/part1/asrc.md create mode 100644 book/src/part1/fractional-resampler.md create mode 100644 book/src/part1/kaiser.md create mode 100644 book/src/part1/pi-servo.md create mode 100644 book/src/part1/polyphase-bank.md create mode 100644 book/src/part1/sample-traits.md create mode 100644 book/src/part1/spsc-ring.md create mode 100644 book/src/part2/icount.md create mode 100644 book/src/part2/notebooks.md create mode 100644 book/src/part2/tests.md create mode 100644 book/src/part3/c1-c2.md create mode 100644 book/src/part3/c3-c5.md create mode 100644 book/src/part3/c6.md create mode 100644 book/src/part4/c-abi.md create mode 100644 book/src/part4/cortex-m.md create mode 100644 book/src/part4/hexagon.md create mode 100644 book/src/part5/hardware.md create mode 100644 book/src/part5/scaling.md diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ff16a06..588ba2b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -486,3 +486,34 @@ jobs: bench/*.cpp bench/icount/*.cpp bench/compare/*.cpp \ tools/capi/*.cpp tools/qemu_insn_plugin/*.c \ tests/*.cpp tests/support/*.hpp examples/*.cpp platform/*.c + + # The book (book/) quotes library code via mdBook anchor includes; this + # gate makes a refactor that orphans an excerpt fail CI, the same + # freshness contract as the README's generated tables. Warnings are + # errors: a missing anchor is a warning, and a missing anchor is rot. 
+ book: + name: Book build + runs-on: ubuntu-latest + timeout-minutes: 10 + env: + MDBOOK_URL: https://github.com/rust-lang/mdBook/releases/download/v0.4.40/mdbook-v0.4.40-x86_64-unknown-linux-gnu.tar.gz + MDBOOK_SHA256: "9ef07fd288ba58ff3b99d1c94e6d414d431c9a61fdb20348e5beb74b823d546b" + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 + + - name: Install mdBook (pinned) + run: | + curl -sfLo /tmp/mdbook.tar.gz "$MDBOOK_URL" + actual=$(sha256sum /tmp/mdbook.tar.gz | cut -d' ' -f1) + if [ "$actual" != "$MDBOOK_SHA256" ]; then + echo "::error::mdbook checksum mismatch"; exit 1 + fi + tar -xzf /tmp/mdbook.tar.gz -C /tmp + + - name: Build (warnings are errors) + run: | + set -o pipefail; /tmp/mdbook build book 2>&1 | tee /tmp/book-build.log # pipefail: tee must not mask mdbook's exit status + if grep -qiE 'warn|error' /tmp/book-build.log; then # mdBook tags log levels as [WARN]/[ERROR], not "warning" + echo "::error::mdbook reported warnings/errors (stale anchor or broken include?)" + exit 1 + fi diff --git a/.gitignore b/.gitignore index 94fcdc7..b8e7965 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ CMakeUserPresets.json .vscode/ .idea/ .claude/ +book/book/ diff --git a/book/book.toml b/book/book.toml new file mode 100644 index 0000000..ea6834d --- /dev/null +++ b/book/book.toml @@ -0,0 +1,13 @@ +[book] +title = "Inside SampleRateTap" +description = "A working tour of a real-time asynchronous sample rate converter: the DSP, the C++, the concurrency, and the measurements that hold it together." 
+authors = ["The SampleRateTap project"] +language = "en" +src = "src" + +[build] +create-missing = false + +[output.html] +default-theme = "rust" +git-repository-url = "https://github.com/tap/SampleRateTap" diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md new file mode 100644 index 0000000..cad7d1a --- /dev/null +++ b/book/src/SUMMARY.md @@ -0,0 +1,47 @@ +# Summary + +[Introduction](introduction.md) + +# Part 0 — The problem + +- [Two crystals, one stream](part0/two-crystals.md) +- [Budgets: latency, quality, compute](part0/budgets.md) + +# Part I — The machine, file by file + +- [Designing the filter: kaiser.hpp](part1/kaiser.md) +- [The polyphase bank](part1/polyphase-bank.md) +- [Sample types as a customization point: sample_traits.hpp](part1/sample-traits.md) +- [The lock-free ring: spsc_ring.hpp](part1/spsc-ring.md) +- [The clock servo: pi_servo.hpp](part1/pi-servo.md) +- [The fractional resampler](part1/fractional-resampler.md) +- [Composition: asrc.hpp](part1/asrc.md) + +# Part II — The proof system + +- [Tests as specifications](part2/tests.md) +- [Counting instructions, deterministically](part2/icount.md) +- [Notebooks as calibrated instruments](part2/notebooks.md) + +# Part III — Optimizing honestly + +- [Profile first, claim later (C1–C2)](part3/c1-c2.md) +- [The integer phase and the wide MACs (C3–C5)](part3/c3-c5.md) +- [The channel axis (C6)](part3/c6.md) + +# Part IV — Portability + +- [Hexagon: a DSP that keeps secrets](part4/hexagon.md) +- [Cortex-M: bare metal, two ways](part4/cortex-m.md) +- [The C ABI](part4/c-abi.md) + +# Part V — Deployment + +- [Real clocks: bridges and firmware](part5/hardware.md) +- [Channels, rates, and the rules that scale](part5/scaling.md) + +--- + +[Appendix A: The C++ decision log](appendix/cpp-decisions.md) +[Appendix B: Glossary](appendix/glossary.md) +[Appendix C: Annotated bibliography](appendix/bibliography.md) diff --git a/book/src/appendix/bibliography.md b/book/src/appendix/bibliography.md new file mode 
100644 index 0000000..4372f09 --- /dev/null +++ b/book/src/appendix/bibliography.md @@ -0,0 +1,3 @@ +# bibliography + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/appendix/cpp-decisions.md b/book/src/appendix/cpp-decisions.md new file mode 100644 index 0000000..1d5cf71 --- /dev/null +++ b/book/src/appendix/cpp-decisions.md @@ -0,0 +1,3 @@ +# cpp decisions + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/appendix/glossary.md b/book/src/appendix/glossary.md new file mode 100644 index 0000000..e0b2772 --- /dev/null +++ b/book/src/appendix/glossary.md @@ -0,0 +1,3 @@ +# glossary + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/introduction.md b/book/src/introduction.md new file mode 100644 index 0000000..d3c61ed --- /dev/null +++ b/book/src/introduction.md @@ -0,0 +1,110 @@ +# Introduction + +This book explains one piece of software completely. + +The software is **SampleRateTap**, a header-only C++20 library that solves a +narrow, stubborn problem in real-time audio: two devices both claim to run at +48 kHz, but each owns its own crystal oscillator, so neither actually does. +One drifts a few parts per million against the other — imperceptibly slowly +and absolutely relentlessly — and any system that moves audio between them +must either resample adaptively or eventually glitch. The library converts +between two such clock domains transparently (about 135 dB of measured +fidelity), in real time (about 1.5 ms of latency), on hardware from Xeon +servers down to a $5 microcontroller. + +That is a small enough problem to fit in your head and a deep enough one to +teach from. 
Solving it well demands working knowledge of half a dozen fields +that are usually taught separately: FIR filter design, fixed-point +arithmetic, control theory, lock-free concurrency, the C++ memory model, +SIMD micro-architecture, and the discipline of measuring instead of +guessing. The premise of this book is that you learn those subjects better +around one real, shipping artifact — where every design decision had to +survive contact with every other — than from isolated examples built to +illustrate exactly one thing. + +## Who this is for + +You are comfortable in C++ — templates, RAII, the standard library — but you +have not necessarily written audio code, used `std::memory_order_acquire` in +anger, designed a filter, or counted the instructions your compiler emits. +No DSP background is assumed; the mathematics is built up exactly as far as +the code needs it and no further. Where a result has a textbook derivation, +we cite the textbook and spend our pages on what the textbooks omit: why +*this* form of the equation, in *this* code, on *this* hardware. + +## How this book stays honest + +Two mechanical commitments distinguish this book from most code walkthroughs. + +**The excerpts are live.** Every block of library code you read is included +into the book at build time from the actual header in the repository, by +anchor. If the code changes, the book changes or the book's build breaks — +in this project's continuous integration, like every other published number. +There is no possibility of the classic tutorial failure where prose +describes code that no longer exists. + +**Every claim ends in a command.** The library's culture is that performance +and quality numbers are measured, gated, and regenerated — never asserted +from memory. The book inherits that: each chapter closes with a *Verify it +yourself* section listing the exact tests, benchmarks, or notebooks that +back what you just read. 
When this book says the ring buffer is correct +under weak memory ordering, you will be holding the ThreadSanitizer +invocation that fails if it is not. + +## The history is the curriculum + +This codebase was built measurement-first, and its history contains real +reversals, preserved deliberately: + +- An optimization hypothesis about the Cortex-M55's floating-point unit that + was **wrong**, discovered because a 1.4% instruction-count regression + contradicted the project's own documentation — and the documentation, not + the measurement, turned out to be at fault. +- A Hexagon vectorization effort that was implemented, proven bit-exact, + measured at a 0.31% improvement — and then **deliberately deleted**, with + the disassembly evidence recorded so nobody re-derives the dead end. +- A correctness bug that survived months of green CI because every test and + benchmark happened to be configured just clear of it, found by an + adversarial audit, and demonstrated before it was fixed. +- A toolchain that turned out to be unable to catch C++ exceptions at all — + discovered the day the first `EXPECT_THROW` reached it. + +These are not embarrassments to be edited out; they are the most valuable +material in the book. Anyone can present a finished design as if it were +inevitable. Watching a design *survive falsification* teaches you what the +finished form is actually load-bearing against. + +## The shape of the book + +**Part 0** establishes the problem and its budgets: why a plain FIFO +measurably fails (−34.7 dB!), what near-unity specialization buys, and the +arithmetic that connects picoseconds of timing jitter to decibels of +fidelity. + +**Part I** is the heart: the library's seven headers, one chapter each, in +dependency order — filter design, the polyphase table, the sample-type +traits, the lock-free ring, the clock servo, the fractional resampler, and +the converter that composes them. 
Each chapter covers the algorithm, the +C++ idioms chosen *and rejected*, and the failure modes the design guards +against. + +**Part II** explains the proof system: deterministic two-clock simulation, +sine-fit metrology, and the instruction-count ratchet that lets a CI runner +gate embedded performance to the exact instruction. + +**Part III** retells the optimization campaign as it actually happened — +six efforts, four wins, one honest draw, one deliberate revert — with the +real numbers and the two implementation traps that cost a day each. + +**Part IV** is portability: what a Qualcomm DSP, two bare-metal ARM cores, +and a C foreign-function interface each demanded. + +**Part V** reaches hardware: real crystals, real cycle counters, and the +configuration rules that scale across channel counts and sample rates. + +The appendices collect the C++ decision log (every idiom adopted or +rejected, with reasons), a glossary, and an annotated bibliography. + +Chapters are largely self-contained, but Part I builds on itself; if you +read only one chapter, make it [the lock-free ring](part1/spsc-ring.md) — +it is short, complete, and representative of the whole book's method. diff --git a/book/src/part0/budgets.md b/book/src/part0/budgets.md new file mode 100644 index 0000000..11b8221 --- /dev/null +++ b/book/src/part0/budgets.md @@ -0,0 +1,3 @@ +# budgets + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/part0/two-crystals.md b/book/src/part0/two-crystals.md new file mode 100644 index 0000000..cef6102 --- /dev/null +++ b/book/src/part0/two-crystals.md @@ -0,0 +1,3 @@ +# two crystals + +*Chapter not yet written — see the outline in the book plan. 
Each chapter lands as its own PR.* diff --git a/book/src/part1/asrc.md b/book/src/part1/asrc.md new file mode 100644 index 0000000..fa5e198 --- /dev/null +++ b/book/src/part1/asrc.md @@ -0,0 +1,3 @@ +# asrc + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/part1/fractional-resampler.md b/book/src/part1/fractional-resampler.md new file mode 100644 index 0000000..2024cee --- /dev/null +++ b/book/src/part1/fractional-resampler.md @@ -0,0 +1,3 @@ +# fractional resampler + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/part1/kaiser.md b/book/src/part1/kaiser.md new file mode 100644 index 0000000..0b0bbe1 --- /dev/null +++ b/book/src/part1/kaiser.md @@ -0,0 +1,3 @@ +# kaiser + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/part1/pi-servo.md b/book/src/part1/pi-servo.md new file mode 100644 index 0000000..b8ef7d9 --- /dev/null +++ b/book/src/part1/pi-servo.md @@ -0,0 +1,3 @@ +# pi servo + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/part1/polyphase-bank.md b/book/src/part1/polyphase-bank.md new file mode 100644 index 0000000..415f083 --- /dev/null +++ b/book/src/part1/polyphase-bank.md @@ -0,0 +1,3 @@ +# polyphase bank + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/part1/sample-traits.md b/book/src/part1/sample-traits.md new file mode 100644 index 0000000..54a8df2 --- /dev/null +++ b/book/src/part1/sample-traits.md @@ -0,0 +1,3 @@ +# sample traits + +*Chapter not yet written — see the outline in the book plan. 
Each chapter lands as its own PR.* diff --git a/book/src/part1/spsc-ring.md b/book/src/part1/spsc-ring.md new file mode 100644 index 0000000..cc909ce --- /dev/null +++ b/book/src/part1/spsc-ring.md @@ -0,0 +1,269 @@ +# The lock-free ring: `spsc_ring.hpp` + +Every other component in this library is mathematics. This one is physics. + +The converter's whole purpose is to sit between two threads that must never +wait for each other: an audio capture callback pushing frames at its +device's pace, and a playback callback pulling frames at a *different* +device's pace. If either thread ever blocks — on a mutex, on an allocation, +on a priority-inverted anything — the audio glitches, and a glitch is the +one failure this library exists to prevent. So the channel between the +threads must be **lock-free**, and not in the loose marketing sense: every +operation must complete in a bounded number of steps regardless of what the +other thread is doing, including being suspended indefinitely at the worst +possible instruction. + +The ring also serves a second master, and this is the design's quiet +novelty: its **occupancy is the control system's sensor**. The clock servo +(next chapter) estimates the rate mismatch between the two crystals +entirely from how full this buffer is. That is why the class exposes exact +`readAvailable()` and a consumer-side `discard()` — operations a generic +SPSC queue wouldn't bother with — and why "approximately full" isn't good +enough anywhere in this file: a biased occupancy reading would become a +biased frequency estimate. + +Here is the entire contract: + +```cpp +{{#include ../../../include/srt/spsc_ring.hpp:contract}} +``` + +Fourteen lines of comment and assertion before any logic. Three things deserve +attention already. + +**`is_trivially_copyable_v<T>`** — the ring moves data with `memcpy`, in at +most two segments per transfer. This is a *bulk* ring: the producer hands +over whole blocks of interleaved frames, not elements one at a time. 
A +`memcpy`-based design rules out element types with non-trivial copy +operations or destructors, and the +`static_assert` makes that a compile error instead of undefined behavior. + +**`std::atomic<std::size_t>::is_always_lock_free`** — the class claims +lock-freedom, so it asserts the precondition. On every target this project +ships to, a `size_t` atomic compiles to plain loads and stores plus memory +ordering. But "every target this project ships to" is exactly the kind of +claim that rots silently; the assert costs nothing and converts rot into a +compile error. (This line has its own small history: it was added by an +audit that noticed the library asserted lock-freedom for its *telemetry* +counters but not for the indices the entire hot path rests on.) + +**Indices are monotonic, not wrapped.** `head_` and `tail_` count every +element ever written and read, forever; only at the moment of buffer access +are they masked down to a position. This is the single most consequential +decision in the file, and it earns its own section below — including what +happens when "forever" meets a 32-bit `size_t`. + +## The memory model, from the only direction that matters + +There are two ways to teach C++ memory ordering. The textbook way starts +from the six `memory_order` enumerators and their formal guarantees. The +way that actually sticks starts from a bug. + +Suppose both threads used `memory_order_relaxed` everywhere. The producer +writes 64 samples into the buffer, then advances `head_` by 64. The +consumer reads the new `head_`, concludes 64 samples are available, and +copies them out. On x86 this works every time you test it. On a Cortex-A +or M-class core — or under ThreadSanitizer — the consumer can observe the +*index* update **before** it observes the *sample data* the index claims to +cover, because nothing told either the compiler or the CPU that those +writes were related. The consumer then plays whatever stale bytes were in +the buffer. The bug is silent, rare, load-dependent, and absolutely real. 
+ +The fix is a single pairing, used twice, and it is the only synchronization +in the file: + +> The producer **releases** `head_` after writing data; the consumer +> **acquires** `head_` before reading data. Everything the producer did +> before the release-store is visible to the consumer after the +> acquire-load that observes it. + +Read the producer side with that lens: + +```cpp +{{#include ../../../include/srt/spsc_ring.hpp:write}} +``` + +The two `memcpy` calls happen *before* the `release` store of the new head. +That ordering — data first, then the index that publishes it — is the +entire correctness argument for the data path. Symmetrically: + +```cpp +{{#include ../../../include/srt/spsc_ring.hpp:read}} +``` + +The consumer `acquire`-loads `head_` (inside the cache-refresh branch, +discussed next), and only then copies data the head covers. Its own +`release` store of `tail_` plays the mirrored role for a subtler resource: +**buffer reuse**. The producer may overwrite a slot only after the consumer +has finished copying out of it; the consumer's release of `tail_` and the +producer's acquire of it order exactly that. Miss this second pairing and +you have a bug that no amount of staring at the "obvious" head-side pairing +will reveal. + +Notice also what is *relaxed*: each side loads **its own** index with +`memory_order_relaxed`. The producer is the only writer of `head_`, so it +cannot race with itself; a thread always observes its own prior writes. +Using `acquire` there would be harmless but dishonest — ordering +annotations in this codebase are documentation, and claiming +synchronization where none is needed misleads the next reader. This is a +deliberate idiom: **the memory orderings are chosen to be exactly +sufficient, so that each one tells you why it exists.** + +### What was rejected + +A sequentially-consistent version (`memory_order_seq_cst` everywhere, the +default) would be correct. 
It was rejected for two reasons, in order of +importance: first, on ARM it compiles to strictly stronger barriers than +the algorithm needs, in the hottest loop the library owns; second — again +the documentation argument — `seq_cst` says "I didn't think about this," +and in a file whose whole job is to be thought about, that is the wrong +message. A mutex-based version was never on the table: it would forfeit the +bounded-progress guarantee the audio contract requires, priority inversion +being the canonical way real-time audio dies. + +## The cached-index trick + +Correctness needs one acquire/release pair per direction. Performance is +about how *rarely* you can afford to do even that. + +Every atomic load of the other thread's index is a potential cache-line +transfer between cores — the line bounces from the writer's L1 to the +reader's, hundreds of cycles when it goes badly, and it goes badly +precisely when both threads are busiest. The standard remedy (this design +follows the well-known pattern used by production SPSC queues) is for each +side to keep a **stale local copy** of the other side's index and consult +the real atomic only when the stale copy makes the operation look +impossible: + +- The producer computes free space against `tailCache_`. Only if that says + "not enough room" does it acquire-load the real `tail_` and retry the + computation. If space *still* falls short, the answer is truthful — the + buffer really is that full *right now* — and the write is clipped. +- The consumer does the same dance with `headCache_` for availability. + +The asymmetry of staleness is safe by construction: a stale `tailCache_` +can only *underestimate* free space (the consumer only ever frees), and a +stale `headCache_` can only *underestimate* availability (the producer only +ever adds). Stale data makes the ring conservative, never wrong. 
In the +steady state the converter lives in — producer and consumer chasing each +other around a buffer that is never near full or empty — the fast path +touches **no foreign cache lines at all**: one relaxed load of your own +index, arithmetic against a plain local member, two `memcpy`s, one release +store. + +The member layout enforces the same philosophy at the hardware level: + +```cpp +{{#include ../../../include/srt/spsc_ring.hpp:layout}} +``` + +Producer-owned state (`head_`, `tailCache_`), consumer-owned state +(`tail_`, `headCache_`), and the shared read-only state (`buf_`, `mask_`) +each get their own 64-byte cache line, so neither side's writes invalidate +lines the other side reads in its fast path. The comment records a rejected +alternative worth pausing on: +`std::hardware_destructive_interference_size` is the standard's name for +exactly this constant, and this file deliberately doesn't use it. The +constant is **ABI-fragile** — its value can differ between translation +units compiled with different tuning flags, which is why GCC warns when you +use it in a header — and a header-only library lives entirely in that +danger zone. A plain `64` with a comment is less clever and more correct. +The general lesson recurs throughout this codebase: *between a standard +facility and a constraint you can state plainly, prefer the one whose +failure mode you can reason about.* + +## Monotonic indices and the wraparound proof + +Most ring buffers wrap their indices at the capacity and pay for it twice: +one slot must be wasted to distinguish full from empty, and every index +update needs a conditional wrap. This ring's indices instead run forever +and are masked (`idx = head & mask_`) only at access time, which is why the +capacity must be a power of two (`std::bit_ceil` in the constructor) — the +mask replaces a modulo, and the full capacity is usable because occupancy +is computed by subtraction, not by comparing wrapped positions. 
+ +The objection arrives immediately: *forever* is finite. On Hexagon and +Cortex-M, `size_t` is 32 bits; at 48 kHz stereo, the indices wrap every +twelve hours or so of continuous audio. What happens then? + +Nothing — and the reason is worth proving rather than waving at, because +the proof is two lines of modular arithmetic that many engineers have +never consciously done. Unsigned arithmetic in C++ is arithmetic modulo +2^N, where N is the bit width of `size_t`. Occupancy is computed as `head - tail`; if the true (unbounded) counts +are H and T, the machine computes `((H mod 2^N) - (T mod 2^N)) mod 2^N`, +which equals `(H - T) mod 2^N`. Since the algorithm guarantees +`0 ≤ H - T ≤ capacity` and capacity is at most 2^31 on a 32-bit target, the +true difference is always representable, so the modular result *is* the +true result — through the wrap, across the wrap, at the wrap. The masked +position is likewise exact: capacity divides 2^N (it's a power of two), so +`(H mod 2^N) & mask = H mod capacity`. The wrap is not an edge case the +code handles; it is a case the arithmetic never notices. + +This was verified the trustworthy way as well: the audit that reviewed this +file ran the ring with indices initialized to `0xFFFFFFF8` and watched +transfers stride across the 2^32 boundary, byte-exact. The proof says it +must work; the test removes the possibility that the proof was about a +slightly different program than the one we shipped. + +## What the tests can and cannot certify + +Three layers of evidence back this file, and their *limits* are as +instructive as their coverage. + +**Single-threaded exactness** (`tests/test_spsc_ring.cpp`): fill/drain +equality, wraparound data preservation, partial writes near full, discard +accounting. These pin the sequential semantics — necessary, and nowhere +near sufficient. 
+ +**A two-thread stress test** (`tests/test_spsc_ring_threads.cpp`): millions +of elements of a counting sequence pushed and popped with randomized chunk +sizes, verified in order on the consumer side, run under ThreadSanitizer in +CI. TSan observes the actual ordering annotations, so it would flag the +relaxed-everywhere bug described above as a data race. + +**And the honest limitation**: a sanitizer can only judge the interleavings +the hardware deigns to produce during the run, and an x86 host barely +reorders anything. A memory-ordering bug can be invisible on x86 *and* pass +TSan there, then fire on a weakly-ordered ARM core in production. This +project's answer is a weekly CI job that runs the same TSan stress on +genuinely weakly-ordered arm64 hardware, plus the per-push macOS Apple +Silicon leg. That is also a limit worth naming: none of this *proves* the +algorithm; it raises the price of being wrong. The proof remains the +acquire/release argument above — which is exactly why this chapter spent +its pages on the argument rather than the test list. 
+ +## Why these ~130 lines look the way they do + +A summary of the decisions, several of which recur throughout the library: + +| Decision | Alternative rejected | Reason | +|---|---|---| +| Lock-free SPSC, two fixed roles | mutex; MPMC generality | bounded progress is the audio contract; generality costs exactly the cycles this file exists to save | +| Bulk `memcpy` transfers | element-at-a-time queue | the workload is blocks of frames; two `memcpy` segments beat N atomic handoffs | +| Exact occupancy + `discard()` | "approximate size is fine" | occupancy is the servo's sensor; bias here becomes frequency-estimate bias | +| Acquire/release, minimal | `seq_cst` everywhere | sufficiency-as-documentation; weaker barriers on ARM | +| Cached cross-indices | always load the atomic | steady-state fast path touches no foreign cache line | +| Monotonic masked indices | wrap-at-capacity | full capacity usable, no full/empty ambiguity; wrap is provably benign | +| `alignas(64)` literal | `hardware_destructive_interference_size` | the standard constant is ABI-fragile in headers; GCC warns for good reason | +| `static_assert` the preconditions | trust the porting engineer | rot becomes a compile error, not a field failure | + +## Verify it yourself + +```sh +# Sequential semantics, wraparound, discard accounting: +ctest --test-dir build -R SpscRing --output-on-failure + +# The two-thread counting-sequence stress (built when threads exist): +ctest --test-dir build -R TwoThreadStress --output-on-failure + +# The same stress under ThreadSanitizer (as CI runs it): +cmake -B build-tsan -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_FLAGS="-fsanitize=thread" -DSRT_BUILD_EXAMPLES=OFF +cmake --build build-tsan -j && ctest --test-dir build-tsan -R SpscRing + +# Break it on purpose: change memory_order_release to relaxed in write(), +# rebuild the TSan variant, and watch the stress test report the race. +``` + +The last suggestion is the chapter in one line. 
The annotations are not +incantations; remove one and the tooling shows you precisely the disaster +it was holding back. diff --git a/book/src/part2/icount.md b/book/src/part2/icount.md new file mode 100644 index 0000000..28ff28a --- /dev/null +++ b/book/src/part2/icount.md @@ -0,0 +1,3 @@ +# icount + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/part2/notebooks.md b/book/src/part2/notebooks.md new file mode 100644 index 0000000..c5b84f1 --- /dev/null +++ b/book/src/part2/notebooks.md @@ -0,0 +1,3 @@ +# notebooks + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/part2/tests.md b/book/src/part2/tests.md new file mode 100644 index 0000000..8098ed7 --- /dev/null +++ b/book/src/part2/tests.md @@ -0,0 +1,3 @@ +# tests + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/part3/c1-c2.md b/book/src/part3/c1-c2.md new file mode 100644 index 0000000..595ff7f --- /dev/null +++ b/book/src/part3/c1-c2.md @@ -0,0 +1,3 @@ +# c1 c2 + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/part3/c3-c5.md b/book/src/part3/c3-c5.md new file mode 100644 index 0000000..f88096c --- /dev/null +++ b/book/src/part3/c3-c5.md @@ -0,0 +1,3 @@ +# c3 c5 + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/part3/c6.md b/book/src/part3/c6.md new file mode 100644 index 0000000..68dfc7d --- /dev/null +++ b/book/src/part3/c6.md @@ -0,0 +1,3 @@ +# c6 + +*Chapter not yet written — see the outline in the book plan. 
Each chapter lands as its own PR.* diff --git a/book/src/part4/c-abi.md b/book/src/part4/c-abi.md new file mode 100644 index 0000000..f1eac17 --- /dev/null +++ b/book/src/part4/c-abi.md @@ -0,0 +1,3 @@ +# c abi + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/part4/cortex-m.md b/book/src/part4/cortex-m.md new file mode 100644 index 0000000..a38fa83 --- /dev/null +++ b/book/src/part4/cortex-m.md @@ -0,0 +1,3 @@ +# cortex m + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/part4/hexagon.md b/book/src/part4/hexagon.md new file mode 100644 index 0000000..48b639a --- /dev/null +++ b/book/src/part4/hexagon.md @@ -0,0 +1,3 @@ +# hexagon + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/part5/hardware.md b/book/src/part5/hardware.md new file mode 100644 index 0000000..ef2c709 --- /dev/null +++ b/book/src/part5/hardware.md @@ -0,0 +1,3 @@ +# hardware + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/book/src/part5/scaling.md b/book/src/part5/scaling.md new file mode 100644 index 0000000..b0a2db4 --- /dev/null +++ b/book/src/part5/scaling.md @@ -0,0 +1,3 @@ +# scaling + +*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* diff --git a/include/srt/spsc_ring.hpp b/include/srt/spsc_ring.hpp index e70b562..fa94eae 100644 --- a/include/srt/spsc_ring.hpp +++ b/include/srt/spsc_ring.hpp @@ -20,6 +20,7 @@ namespace srt { +// ANCHOR: contract /// Lock-free SPSC ring buffer of trivially copyable elements. /// /// Thread contract: write() and writeAvailable() may only be called from the @@ -34,6 +35,7 @@ class SpscRing { static_assert(std::is_trivially_copyable_v); // The lock-free claim of the whole audio path rests on these indices. 
static_assert(std::atomic::is_always_lock_free); + // ANCHOR_END: contract public: /// Allocates the buffer; capacity is rounded up to a power of two. @@ -45,6 +47,7 @@ class SpscRing { std::size_t capacity() const noexcept { return buf_.size(); } + // ANCHOR: write /// Producer: append up to n elements; returns the number actually written. std::size_t write(const T* src, std::size_t n) noexcept { const std::size_t head = head_.load(std::memory_order_relaxed); @@ -64,12 +67,15 @@ class SpscRing { return n; } + // ANCHOR_END: write + /// Producer: exact free space at the time of the call. std::size_t writeAvailable() noexcept { tailCache_ = tail_.load(std::memory_order_acquire); return capacity() - (head_.load(std::memory_order_relaxed) - tailCache_); } + // ANCHOR: read /// Consumer: remove up to n elements; returns the number actually read. std::size_t read(T* dst, std::size_t n) noexcept { const std::size_t tail = tail_.load(std::memory_order_relaxed); @@ -89,6 +95,8 @@ class SpscRing { return n; } + // ANCHOR_END: read + /// Consumer: exact occupancy at the time of the call. std::size_t readAvailable() noexcept { headCache_ = head_.load(std::memory_order_acquire); @@ -110,6 +118,7 @@ class SpscRing { } private: + // ANCHOR: layout // 64-byte separation to keep producer- and consumer-owned state on // distinct cache lines (std::hardware_destructive_interference_size is // deliberately avoided: it is ABI-fragile and warns on GCC). 
The @@ -123,6 +132,7 @@ class SpscRing { alignas(kCacheLine) std::size_t tailCache_{0}; // producer's view of tail alignas(kCacheLine) std::atomic tail_{0}; // written by consumer alignas(kCacheLine) std::size_t headCache_{0}; // consumer's view of head + // ANCHOR_END: layout }; } // namespace srt From 26039a48b3a6b7ad08b374231b79cee4a1b16471 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 1 Jul 2026 21:29:39 +0000 Subject: [PATCH 02/16] book: the composition chapter (asrc.hpp) with the feasibility-bug case study https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9 --- book/src/part1/asrc.md | 247 ++++++++++++++++++++++++++++++++++++++++- include/srt/asrc.hpp | 8 ++ 2 files changed, 253 insertions(+), 2 deletions(-) diff --git a/book/src/part1/asrc.md b/book/src/part1/asrc.md index fa5e198..366e23b 100644 --- a/book/src/part1/asrc.md +++ b/book/src/part1/asrc.md @@ -1,3 +1,246 @@ -# asrc +# Composition: `asrc.hpp` -*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* +Every previous chapter built a component that is correct on its own terms. +This chapter is about the file that has no terms of its own: `asrc.hpp` +contains almost no algorithm, no mathematics, and fewer than three hundred +lines that mostly call other files' code. It is also where the only serious +bug in the library's history lived. Both facts have the same cause. +Composition is where each component's assumptions meet every other +component's guarantees, and the gaps between them are invisible from inside +any single file. + +The cast, assembled: a `PolyphaseFilterBank` designed at construction, a +`FractionalResampler` that owns the history and the phase, a `SpscRing` +carrying interleaved frames between the two clock domains, and a `PiServo` +turning ring occupancy into a rate estimate. 
`BasicAsyncSampleRateConverter` +wires them together and adds the four things none of them could own alone: +a lifecycle state machine, an under/overrun policy, telemetry, and +validation. + +## The two-agent shape + +The public surface is two functions and a contract: + +- `push(interleaved, frames)` — called by exactly one producer agent, at + the input clock's pace. +- `pull(interleaved, frames)` — called by exactly one consumer agent, at + the output clock's pace. + +"Agent" rather than "thread" is deliberate. On a workstation the two agents +are threads; on the dual-core RP2350 firmware they are two processor cores; +in the deterministic test simulator they are interleaved events on one +thread. The converter never creates a thread, never names a thread, and +never synchronizes beyond what the ring already provides — it is a passive +object that two callers animate. This is why the library contains no +`std::thread`, no executor, and no callback registration: the moment a +library owns threads it owns scheduling policy, priorities, and shutdown +order, all of which belong to the application. The cost of this design is a +sharp, documented affinity contract (push is producer-only, pull is +consumer-only, `resetFromConsumer` is consumer-only); the C-ABI header +restates it because FFI callers can't read C++ doc comments. + +`push()` is eight lines and nearly trivial — clip to free space, write, +count an overrun if clipped. All composition complexity lives on the +consumer side, and that too is a decision: the producer is often an +interrupt-context audio callback with the tightest budget in the system, so +every gram of policy was moved to the puller. + +## The state machine + +`pull()` runs a three-state lifecycle — Filling, then a servo that is +Acquiring or Locked — plus two exceptional transitions. 
Here is the filling +and resync machinery as it ships: + +```cpp +{{#include ../../../include/srt/asrc.hpp:asrc_filling}} +``` + +```cpp +{{#include ../../../include/srt/asrc.hpp:asrc_resync}} +``` + +Filling exists because the resampler cannot produce its first output until +a full window of `taps()` history frames exists, and the servo cannot +regulate an occupancy that is still climbing toward its setpoint. So the +converter emits silence until the backlog reaches `setpoint + taps`, primes +the resampler's window in one gulp, seeds the servo's smoothers at the +observed occupancy (so the loop starts from truth rather than slewing from +zero), and begins converting — with a fade-in, discussed below. + +The two exceptional transitions are the under/overrun policy, and their +asymmetry rewards attention. **Underrun** (the consumer outran the data): +pad the rest of the block with silence, count it, return to Filling — but +call `servo_.reset(true)`, the flavor that *keeps the integrator*. The ppm +estimate is the accumulated knowledge of where the other crystal sits; a +dropout interrupts the audio, not the physics, so the estimate survives and +re-lock after a dropout takes a fraction of the original acquisition time. +**Overrun pressure** (the consumer stalled long enough for occupancy to +pass the high watermark): discard down to the setpoint in one cut, count a +resync, and re-seed the smoothers — because after a deliberate +discontinuity in the observable, letting the loop "discover" the jump would +inject exactly the transient the seed avoids. One subtlety in the resync +was wrong for months: the discard must be clamped to what the *ring* +actually holds, because the occupancy figure includes frames already staged +inside the resampler's pop scratch, which no ring discard can reach. With a +setpoint smaller than that staging buffer, the unclamped subtraction +drained the ring to zero and the converter fell into a refill-underrun +cascade. 
An audit found it; a regression test now pins it. + +The fade-in deserves its sentence of honesty, which the header also +carries: after every (re)fill the first 64 frames ramp linearly from +silence, so *recovery* never clicks — but the dropout's onset, and a +resync's splice, are unfaded cuts, because at the moment they happen there +is nothing valid to fade toward. A design can only be honest about which +discontinuities it removes. + +## The bug that composition hid + +Now the centerpiece, and the reason this chapter exists in its current +form. + +Every component below this file was correct. The ring transferred bytes +exactly; the servo regulated occupancy to its setpoint with textbook +dynamics; the resampler synthesized precisely the frames asked of it. And +for months, a converter built from these correct parts, at default +configuration, was **silently broken for the most common audio callback +size in the world.** + +The mechanism is embarrassingly simple once stated. A `pull(N)` must +synthesize N frames from data *already in the backlog* — in a real +deployment, no pushes land during the microseconds a pull executes. The +servo, meanwhile, faithfully regulates the backlog toward +`targetLatencyFrames`, which defaults to 48. If N is greater than 48, the +servo's goal and the consumer's need are in direct contradiction: the loop +steers occupancy *down* toward a level from which the next pull cannot be +served. Occupancy drains at the rate clamp, hits the floor, underruns, +refills, fades in — and repeats, forever. Measured at default +configuration: a 64-frame callback drops out every ~0.24 seconds +indefinitely, never reaching Locked, with the reported ppm pegged at a +false +1500 (the clamp, mistaken for the answer). A 240-frame callback +produced 80% silence. + +Why didn't anything catch it? Because every artifact that exercised the +converter had, innocently, been configured just clear of the cliff. 
The +quality tests pull one frame at a time — the metrologically correct choice +for their purpose. The benchmarks set the setpoint to twice the block size — +the performance-measurement-correct choice. The lock tests used 32-frame +blocks against the 48-frame default — feasible. Correct component tests, +correct measurement configurations, months of green CI, and a defaults +matrix with a hole exactly where real applications live. The lesson +generalizes and is worth stating as a rule: **a test suite validates the +configurations it contains, and silence about a configuration is not +evidence about it.** It took an adversarial audit — one explicitly tasked +with constructing failure scenarios rather than confirming passing ones — +to demonstrate it. + +The fix is the first thing `pull()` now does: + +```cpp +{{#include ../../../include/srt/asrc.hpp:asrc_feasibility}} +``` + +The design choices inside those lines carry the interesting reasoning: + +- **Adapt rather than reject.** The constructor cannot validate this — + the pull size isn't known until the first pull. Throwing from `pull()` + is forbidden by the noexcept contract, and returning an error the caller + must check is how the original silent failure happened, one layer up. + So the converter raises its *effective* setpoint to what the observed + block requires and reports the raise through + `Status::effectiveTargetLatencyFrames`. Latency follows the raised + setpoint: the honest price, visibly labeled, instead of a dropout cycle. +- **The margin is a half block.** Feasibility strictly needs + `setpoint ≥ N`; equality grazes, because block-quantized occupancy + sawtooths around the setpoint. The audit's data located the boundary + (pull = setpoint showed occasional underruns; pull comfortably below the + setpoint was clean), and `N/2` covers the sawtooth with room. +- **The raise is bounded by capacity**, computed once in the constructor — + a setpoint the FIFO cannot sustain would just move the failure. 
The + auto-sized FIFO's floor was raised to 1024 frames (21 ms of stereo float + costs 8 KB — memory is the cheap resource here) so that callbacks up to + roughly 340 frames work with zero configuration; beyond that, the + documentation now says plainly: size `fifoFrames` yourself. +- **Feasible configurations are untouched.** The 32-frame-against-48 + default keeps its exact behavior — verified not just by tests but by the + instruction-count ratchet: every scenario on every embedded target + measured within ±0.07% across the change, which is construction-cost + noise. The adaptation is invisible until the moment it is needed. + +The audit's failing scenarios became the regression suite +(`Feasibility.Pull64LocksCleanly` and siblings), so the bug's exact shape +is now permanently load-bearing. + +## Validation: what the constructor refuses to build + +The same audit rewrote `validated()`, and the before/after is a compact +study in what config validation is *for*. The original checked three +fields for zero. The current version rejects, with reasons recorded in a +comment: NaN or infinity anywhere in the numeric config (a NaN sample rate +previously flowed into the filter designer and constructed a converter +that emitted NaN audio — construction succeeding is worse than throwing +when what it constructs is poison); band-edge sums above the sample rate +(an anti-image filter whose cutoff exceeds input Nyquist passes images +wholesale — numerically fine, acoustically wrong); a deviation clamp large +enough that the Q0.64 conversion in the resampler would overflow an +`int64` (undefined behavior guarded at the only gate that sees the value +early enough); and size products that would wrap 32-bit `size_t` on the +embedded targets before `bad_alloc` could save anyone. The principle: +**validate at the boundary where throwing is allowed, against the +invariants of every component downstream** — the resampler can't defend +itself against a config it never sees whole. 
+ +One postscript from the portability chapter belongs here too: on one +supported toolchain (Hexagon's static-musl configuration), C++ exceptions +cannot unwind at all, so even this careful `throw` terminates the process +there. Validation still protects — a loud death beats NaN audio — but +callers on that target are documented to validate before constructing. +Contracts end where toolchains do. + +## Telemetry that cannot lie about being lock-free + +`status()` may be called from any thread, which makes it the one place a +third agent touches the object. Every field crosses via a relaxed atomic, +single-writer, individually coherent but deliberately not mutually so — a +snapshot for humans and supervisory logic, not a synchronization +primitive. The type choices encode a portability fact worth remembering: +the counters are 32-bit atomics because on the 32-bit targets a 64-bit +`std::atomic` falls back to lock-based emulation, and a converter whose +*telemetry* takes a lock has quietly broken the lock-free promise its hot +path makes. The counters wrap at 2^32; the doc comment says so and says +what to do about it. Precision was traded for the contract, and the trade +is written down. + +## The underrun tail, end to end + +```cpp +{{#include ../../../include/srt/asrc.hpp:asrc_underrun}} +``` + +Read this excerpt slowly and you can see the whole chapter in ten lines: +the resampler asked to do exactly one job; the fade applied only when +there is something real to fade; the silence pad honoring `pull()`'s +always-fills guarantee; the integrator-preserving reset encoding what a +dropout does and does not destroy; the telemetry publish last, so +observers see states, not mid-transition fictions. 
+ +## Verify it yourself + +```sh +# The composed state machine, end to end: +ctest --test-dir build -R 'AsrcLock' --output-on-failure + +# The feasibility bug's exact former shape, now a regression gate: +ctest --test-dir build -R 'Feasibility' --output-on-failure + +# What the constructor refuses to build (NaN, image-passing bands, +# UB-range ppm, undersized FIFOs): +ctest --test-dir build -R 'ConfigValidation' --output-on-failure + +# Resync clamping, consumer reset, fade behavior, degenerate calls: +ctest --test-dir build -R 'Resync|Reset|Fade|EdgeCalls' --output-on-failure +``` + +And one experiment worth running because it *shows you the bug*: check out +any commit before the feasibility fix, build the lock test with +`chunkOut = 64`, and watch a fully green library drop audio four times a +second. Correct parts. Broken whole. That gap is what this file is for. diff --git a/include/srt/asrc.hpp b/include/srt/asrc.hpp index a26291d..6386c28 100644 --- a/include/srt/asrc.hpp +++ b/include/srt/asrc.hpp @@ -146,6 +146,7 @@ class BasicAsyncSampleRateConverter { return ring_.read(dst, maxFrames * cfg_.channels) / cfg_.channels; }; + // ANCHOR: asrc_feasibility // Feasibility: a pull must synthesize from frames already buffered, // so the occupancy setpoint must exceed the pull block size or the // loop drains into a permanent underrun limit cycle (dropouts every @@ -173,8 +174,10 @@ class BasicAsyncSampleRateConverter { } } + // ANCHOR_END: asrc_feasibility double occ = backlogFrames(); + // ANCHOR: asrc_filling if (filling_) { if (occ < static_cast(fillThresholdFrames_)) { fillSilence(interleaved, frames * ch); @@ -190,6 +193,8 @@ class BasicAsyncSampleRateConverter { fadeFramesLeft_ = kFadeFrames; } + // ANCHOR_END: asrc_filling + // ANCHOR: asrc_resync if (occ > static_cast(highWaterFrames_)) { // hard resync const double target = static_cast(targetFrames_); // The discard can only come from the ring; frames staged in the @@ -206,9 +211,11 @@ class 
BasicAsyncSampleRateConverter { servo_.seed(occ + resampler_.mu()); } + // ANCHOR_END: asrc_resync const double dt = static_cast(frames) / cfg_.sampleRateHz; const double epsHat = servo_.update(occ, resampler_.mu(), dt); + // ANCHOR: asrc_underrun const std::size_t made = resampler_.process(interleaved, frames, epsHat, popFn); if (fadeFramesLeft_ != 0 && made != 0) applyFadeIn(interleaved, made); @@ -220,6 +227,7 @@ class BasicAsyncSampleRateConverter { } publishStatus(); return made; + // ANCHOR_END: asrc_underrun } /// Any thread: telemetry snapshot (relaxed atomics; fields are individually From b6fc0bd196b91306c5084a4a8cdd5fb3e1105f13 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 1 Jul 2026 21:29:39 +0000 Subject: [PATCH 03/16] book: Appendix A - the C++ decision log https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9 --- book/src/appendix/cpp-decisions.md | 756 ++++++++++++++++++++++++++++- 1 file changed, 754 insertions(+), 2 deletions(-) diff --git a/book/src/appendix/cpp-decisions.md b/book/src/appendix/cpp-decisions.md index 1d5cf71..56ab9df 100644 --- a/book/src/appendix/cpp-decisions.md +++ b/book/src/appendix/cpp-decisions.md @@ -1,3 +1,755 @@ -# cpp decisions +# Appendix A: The C++ decision log -*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* +Every chapter of this book has defended C++ decisions in passing, in the +context that made them necessary. This appendix collects them in one place, +in one format: the decision, what was rejected, why, and where in the +repository the evidence lives — because in this codebase the decisions are +*recorded*, mostly as comments at the point of consequence, and a decision +whose reason you cannot locate is a decision you cannot safely revisit. + +A theme will emerge quickly, so it is worth stating up front. 
Almost every +entry below is the same decision wearing different clothes: **between a +clever general mechanism and a plain constraint you can state and verify, +this library picks the constraint.** A literal `64` over a standard +interference constant; a `static_assert` over trust; a compile-time gate +over a runtime flag; a comment that shows its arithmetic over a comment +that waves at it. Where the two genuinely conflict, the tiebreaker is +always the same pair of masters: the real-time audio contract and the +embedded targets that cannot fake their way around a bad choice. + +## 1. Header-only distribution + +The entire library is seven headers under `include/srt/`. The build system +declares exactly one library target, and it has no compiled artifact: + +```cmake +add_library(SampleRateTap INTERFACE) +add_library(SampleRateTap::SampleRateTap ALIAS SampleRateTap) +target_compile_features(SampleRateTap INTERFACE cxx_std_20) +``` + +Consumption is `add_subdirectory` or `FetchContent`, deliberately and +exclusively — the README's *Consuming the library* section says so in as +many words: "there are no install/package rules yet." The tests, examples, +benchmarks and the C ABI shim are all opt-in options that default off when +the project is not top-level, and the warning flags live on a separate +`srt_warnings` target so that the library's own `-Wall -Wextra -Wpedantic +-Wconversion` discipline is never propagated into a consumer's build +(`CMakeLists.txt` carries the comment: "not propagated to consumers"). + +What was rejected is the conventional pair: a compiled static/shared +library, and a packaged install with exported config files. The costs of +header-only are real and were accepted knowingly. Every translation unit +that includes `srt/srt.hpp` re-parses and re-instantiates the templates — +compile time is paid repeatedly. 
There is no ABI boundary, so there is +nothing to version at link time and no way to ship a fixed `.so` to a +customer who cannot rebuild (the C ABI shim in section 15 exists precisely +for the one consumer class that needs a binary boundary). + +What it buys is decisive for this library's actual deployment surface. +The code ships to bare-metal Cortex-M33/M55 firmware, a musl-libc Hexagon +toolchain, and ordinary hosts — four toolchains in CI alone, each with its +own flags, each producing incompatible binaries. A prebuilt library per +target multiplies the release matrix; a header vanishes into whatever +build the consumer already has, including builds with LTO, `-march=native` +or MVE auto-vectorization, where cross-TU inlining of the hot kernels is +exactly what the performance chapters measured. And a template library is +header-shaped by nature: the sample-type axis of section 2 means the +"library" is not a fixed set of functions but a recipe the consumer's +compiler executes. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| `INTERFACE` target, `add_subdirectory`/`FetchContent` only | compiled library; install/export packaging | four incompatible toolchains in CI; templates need instantiation in the consumer's TU; costs (compile time, no ABI) accepted, C ABI shim covers the binary-boundary case | `CMakeLists.txt`; README "Consuming the library"; `tools/capi/` | + +## 2. Templates and a concept for the sample-type axis + +The datapath comes in three sample types — `float`, Q15 `int16_t`, Q31 +`int32_t` — and the axis is expressed as a template parameter constrained +by a concept: + +```cpp +template <SampleType S> +class BasicAsyncSampleRateConverter { ... 
}; + +using AsyncSampleRateConverter = BasicAsyncSampleRateConverter<float>; +using AsyncSampleRateConverterQ15 = BasicAsyncSampleRateConverter<std::int16_t>; +using AsyncSampleRateConverterQ31 = BasicAsyncSampleRateConverter<std::int32_t>; +``` + +The first rejected alternative is virtual dispatch: an abstract +`ISampleOps` with `mac()`, `blend()`, `finalize()` virtuals. That dies on +arithmetic grounds before it reaches performance grounds — the three +datapaths do not share signatures. The float path accumulates in `double`, +the fixed-point paths in `int64_t`; the blend factor is a `float`, a Q15 +`int32_t`, or a Q20 `int32_t` depending on the type. Virtual functions +cannot vary their associated types per implementation; you would be forced +to launder everything through the widest type, which is precisely the +soft-double catastrophe the fixed-point paths exist to avoid (the M33 +baselines put the float path at roughly 19× the M55's instruction count +for exactly that reason — README, platform section). And even if the types +had lined up, an indirect call per multiply-accumulate inside a 48–80-tap +loop would forfeit the inlining and auto-vectorization that Part III +measured: the M55's Q15 kernel is fast *because* GCC can see through +`SampleTraits::mac` and emit Helium. + +The second rejected alternative is CRTP — compile-time polymorphism via +inheritance. It solves the dispatch cost but contorts the shape: the +sample type here is `int16_t` itself, a builtin, not a class that can +inherit from a base. CRTP would demand wrapper types around the samples, +and wrapped samples are no longer the raw interleaved buffers that device +drivers and the `memcpy`-based ring (section 6 of the ring chapter) +require. The concept does the one job the template needs guarding for: + +```cpp +template <typename S> +concept SampleType = requires(...) { + { SampleTraits<S>::mac(a, x, c) } -> std::same_as<typename SampleTraits<S>::Accum>; + // ... 
six more operations, each with its exact type checked +}; +``` + +A wrong instantiation fails at the constraint with the list of missing +operations, not three template layers deep in the dot-product loop. The +header then `static_assert`s the concept against all three shipped types — +the same trust-nothing reflex as the ring's lock-free asserts. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| templates constrained by the `SampleType` concept | virtual `ISampleOps`; CRTP wrappers | per-type associated types (`Accum`, `BlendFactor`) are impossible to express virtually; builtins can't inherit; hot loops must inline and vectorize | `include/srt/sample_traits.hpp` (concept + `static_assert`s); `include/srt/asrc.hpp` aliases; README platform notes (19× soft-double) | + +## 3. A traits struct as the customization point + +Given templates, the customization could still have taken several shapes. +The library chose a traits struct with an intentionally undefined primary +template: + +```cpp +/// Primary template intentionally undefined; specialize per sample type. +template <typename S> +struct SampleTraits; +``` + +Each specialization bundles three associated types (`Coeff`, `Accum`, +`BlendFactor`) with seven static functions (`makeCoeff`, `mac`, `blend`, +`finalize`, ...). Why this over the alternatives? + +**Free functions found by ADL** — the customary `swap`-style mechanism — +were worse for two reasons. First, the customization is mostly *types*, +not functions: the fact that Q15 stores coefficients as Q1.14 `int16_t` +but accumulates in `int64_t` is the design (the header's comments derive +it: Q0.15 × Q1.14 products summed exactly, one rounding in `finalize()`). +Free functions cannot carry associated types; you would need separate type +traits anyway, and the customization point would smear across two +mechanisms. 
Second, ADL on builtin types like `int16_t` has no associated +namespace to hook — the overloads would all pile into `srt` and be +distinguishable only by overload resolution, silently, which is exactly +how a Q15/Q31 mixup would compile and produce garbage. + +**Member policies** — making the sample type a class that knows its own +arithmetic — fail as in section 2: the sample types must remain raw +builtins so buffers stay `memcpy`-compatible and ABI-identical to what +audio drivers produce. A traits struct is the standard C++ answer for +attaching behavior to types you cannot modify, and the undefined primary +template makes "I forgot to specialize" a clean compile error at the point +of use rather than a link error or a default that half-works. + +The struct also keeps each datapath's documentation in one screenful: the +Q15 specialization's header comment is a complete fixed-point error budget +(coefficient quantization at ~−86 dB, single rounding point, "the +converter is Q15-transparent"), sitting directly above the ten lines that +implement it. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| `SampleTraits` struct, undefined primary template | ADL free functions; member policies on sample classes | customization is chiefly associated types; builtins have no ADL namespace and can't have members; missing specialization = clean compile error | `include/srt/sample_traits.hpp` | + +## 4. The real-time contract: exceptions at setup, `noexcept` forever after + +This is the load-bearing wall of the whole API, stated as a contract in +the converter's class comment: + +```cpp +/// Real-time contract: the constructor performs all allocation and filter +/// design and may throw; push(), pull(), status() and resetFromConsumer() +/// are noexcept, lock-free and allocation-free. +``` + +The README's feature bullets repeat it, because it is the feature. 
The +constructor allocates every buffer the object will ever touch — ring, +polyphase table, histories, scratch — designs the filter in double +precision, validates the configuration, and throws `std::invalid_argument` +or `std::bad_alloc` on anything wrong. From that point on, the audio path +never allocates, never locks, never throws; every hot function is spelled +`noexcept`, and the `validated()` function exists to make the constructor +*more* throw-happy, rejecting configurations that "would otherwise +construct successfully and misbehave silently" — NaN sample rates that +design all-NaN tables, band edges that pass images wholesale, deviation +clamps that overflow the Q0.64 conversion (its comment lists each one). + +The rejected alternatives are the two ways other libraries split this. +Error codes at setup ("check the return of `init()`") were rejected +because a partially-constructed converter is not a state this object can +represent — there is no meaningful "converter without a filter table," and +C++ constructors-that-throw are precisely the tool that makes invalid +objects unrepresentable. Exceptions on the audio path were never +considered — an unwind inside a device callback is a glitch at best — but +the *strength* of the setup/hot-path split was reinforced from an +unexpected direction. When the first `EXPECT_THROW` test reached the +Hexagon CI leg, it discovered that the hexagon-linux-musl toolchain +cannot catch exceptions at all: a constructor throw terminates via +libc++abi instead of propagating. `docs/PERFORMANCE.md` records it under +Known debt, with the deployment note ("treat invalid Config as fatal — +validate inputs before constructing") and the candidate fix +(`-unwindlib=libunwind`). The discovery cost one excluded test on one leg +— because exceptions had been confined to a code region where "terminate +instead of propagate" is survivable. Had the audio path thrown, the same +toolchain quirk would have been a field failure. 
+ +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| all allocation + throwing in the constructor; `noexcept`/lock-free/allocation-free hot path | `init()` + error codes; exceptions anywhere near audio | invalid objects unrepresentable; RT contract is the product; Hexagon's no-unwind toolchain proved the value of confining throws to setup | `include/srt/asrc.hpp` (class comment, `validated()`); README bullets; `docs/PERFORMANCE.md` Known debt; commit "Hexagon: exclude ConfigValidation" | + +## 5. Runtime filter design, not `constexpr` tables + +A modern-C++ reflex says the Kaiser-windowed prototype — pure math on +compile-time-known presets — should be a `constexpr` table. The library +computes it at runtime, in the constructor, and `kaiser.hpp` opens with +the reason, arithmetic included: + +```cpp +/// Design note — runtime vs constexpr: the prototype tables run 12K-33K taps +/// and each tap needs sin/sqrt plus a ~50-term Bessel I0 series. Constexpr +/// evaluation is interpreted (roughly 1e3-1e4x slower than native), would +/// need hand-rolled constexpr transcendentals before C++26, and would cost +/// tens of seconds to minutes of compile time in every including translation +/// unit. Runtime design takes well under 10 ms, runs once in a constructor, +/// and is off the audio path, so all design math here is plain runtime +/// double precision. +``` + +Unpack the trade. The `balanced()` preset's prototype is 256 × 48 = +12,288 taps, and the presets range upward from there — the comment's +"12K-33K taps". Each tap evaluates `sin`, +`sqrt`, and a Bessel-I0 power series that runs to ~50 terms. `constexpr` +evaluation is an interpreter inside the compiler — three to four orders +of magnitude slower than native — and, before C++26, `std::sin` and +friends are not `constexpr`, so the transcendentals would have to be +hand-rolled *and then trusted* to match runtime libm behavior. 
In a +header-only library the bill lands in every consumer TU, repeatedly. The +runtime version costs under 10 ms, once, in the constructor — which +section 4 already designated as the place where expensive things happen. +And a runtime design accepts *runtime* configurations: `FilterSpec` is +not limited to the three presets, so a compile-time table would have been +a special case bolted alongside the general path, not a replacement. + +This is the header-only cost model (section 1) feeding back into design: +having accepted per-TU compilation, the library polices what each TU +costs. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| filter designed at runtime in the constructor | `constexpr` coefficient tables | 12K–33K taps × transcendentals ≈ minutes of interpreted compile time per TU vs <10 ms once at runtime; needs pre-C++26 hand-rolled constexpr math; runtime `FilterSpec` must work anyway | `include/srt/detail/kaiser.hpp` header comment | + +## 6. `<bit>` over hand-rolled bit tricks; masks over modulo + +Everywhere the library needs power-of-two arithmetic it reaches for +C++20's `<bit>`: `std::bit_ceil` rounds the ring capacity up +(`SpscRing`'s constructor), rounds the phase count up +(`PolyphaseFilterBank`), and sizes the FIFO (`ringCapacityElems` in +`asrc.hpp`); `std::countr_zero` recovers log₂(L) in the phase-indexed +kernels so the polyphase branch is the top bits of the Q0.64 phase word: + +```cpp +const int lg = std::countr_zero(bank.numPhases()); // L is a power of two +const std::size_t p = static_cast<std::size_t>(phase >> (64 - lg)); +``` + +The rejected alternatives are the folklore versions — the +shift-or-shift `bit_ceil`, the de Bruijn log₂ — which every C programmer +has written and half have gotten wrong at the boundaries (what does your +hand-rolled `bit_ceil` do at 0? at values above 2⁶³?).
The standard +functions have specified edge behavior, compile to single instructions +where they exist, and *name the intent* — `countr_zero(numPhases())` +under the comment "L is a power of two" is an invariant stated twice. + +The deeper decision is what the powers of two are *for*: indexing by mask +instead of modulo. The ring's monotonic indices are wrapped by `head & +mask_` — its class comment: "Indices are monotonic and wrapped by a +power-of-two mask, so the full capacity is usable" — and the ring chapter +proves the wraparound benign. The polyphase table's L being a power of +two is what lets the Q0.64 phase word split into branch index and blend +fraction by pure shifts, with no division and no double arithmetic on the +per-sample path (the phase-accumulator comment in +`polyphase_filter.hpp`). A general-modulo design would put an integer +divide — tens of cycles on the M-class cores, and a serialization point +everywhere — inside the tightest loops the library owns, to support +capacities nobody asked for. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| `std::bit_ceil` / `std::countr_zero`; power-of-two capacities indexed by mask | hand-rolled bit tricks; arbitrary sizes with `%` | specified edge cases, single instructions, intent named; masks keep divides and doubles off the per-sample path | `include/srt/spsc_ring.hpp` ctor + class comment; `include/srt/polyphase_filter.hpp` (`blendRowPhase`, `interpolatePhase`); `include/srt/asrc.hpp` (`ringCapacityElems`) | + +## 7. Memory orderings chosen to be exactly sufficient + +The ring chapter walked this in full; the appendix records it as policy, +because it generalizes beyond the ring.
Every atomic operation in the +library carries an explicit ordering argument, and each ordering is the +*weakest* that keeps the algorithm correct: `release` on the store that +publishes data, `acquire` on the load that consumes a foreign index, +`relaxed` on a thread's loads of its own index — and `relaxed` on all +telemetry, whose fields are documented as "individually coherent, not +mutually" (`status()` in `asrc.hpp`). + +The rejected alternative is `seq_cst`-by-default — writing +`head_.store(x)` and letting the strongest ordering paper over the +analysis. It would be correct. It was rejected first because it is +measurably stronger than needed on the weakly-ordered targets (full +barriers on ARM in the hottest loop the library owns), and second — the +argument this codebase actually leads with — because **orderings are +documentation**. An explicit `memory_order_relaxed` on `tail_.load()` in +the producer tells the reader "this is my own index; no synchronization +happens here" — a claim the ring chapter spells out and ThreadSanitizer +checks against reality in CI. A default `seq_cst` says only "I didn't +think about this," and in the one file whose entire job is to be thought +about, that is the wrong message. The same honesty cuts the other way: +where synchronization *is* needed, the annotation names which one, so a +future editor who weakens it is contradicting a written claim, not +merely changing a default. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| explicit, minimal orderings on every atomic | `seq_cst` defaults | weaker barriers on ARM where it matters; each annotation documents exactly why it exists; TSan-checked in CI | `include/srt/spsc_ring.hpp`; `include/srt/asrc.hpp` telemetry; the ring chapter's "What was rejected" | + +## 8. 
`alignas(64)`, not `std::hardware_destructive_interference_size` + +The ring separates producer-owned, consumer-owned and shared-read-only +state onto distinct cache lines, and it does so with a named literal: + +```cpp +// 64-byte separation to keep producer- and consumer-owned state on +// distinct cache lines (std::hardware_destructive_interference_size is +// deliberately avoided: it is ABI-fragile and warns on GCC). ... +static constexpr std::size_t kCacheLine = 64; +``` + +The standard offers a constant whose whole purpose is this alignment, and +the file's comment rejects it by name. The problem is that +`hardware_destructive_interference_size` is not a constant of the +architecture; it is a constant of the *compiler invocation* — its value +can change with `-mtune`, which means two translation units in the same +program can disagree about the layout of the same type. That is an ODR +violation waiting for a victim, and GCC ships a warning +(`-Winterference-size`) telling you exactly this whenever the constant is +used in a context that might cross an ABI boundary. A header-only library +(section 1) lives *entirely* in that danger zone: every consumer TU +re-instantiates `SpscRing`, potentially under different flags. + +A plain `64` is correct on every target this project ships to, cannot +vary between TUs, and states its assumption in a comment a porting +engineer will read. The general lesson — the ring chapter phrases it as +"between a standard facility and a constraint you can state plainly, +prefer the one whose failure mode you can reason about" — is this +appendix's opening theme in miniature. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| `alignas(kCacheLine)` with `kCacheLine = 64` | `std::hardware_destructive_interference_size` | the standard constant varies with tuning flags → ODR/ABI fragility in a header; GCC warns; 64 is right everywhere shipped | `include/srt/spsc_ring.hpp` member layout comment | + +## 9. 
32-bit telemetry atomics + +The converter's telemetry — state, ppm, fill, underrun/overrun/resync +counters, effective setpoint — is deliberately 32 bits wide, and the +comment above the members carries the whole argument: + +```cpp +// Telemetry is 32-bit on purpose: 64-bit atomics fall back to lock-based +// libatomic on 32-bit targets (e.g. Hexagon), which would break the +// lock-free contract of the hot path. float carries ~7 significant +// digits — ample for ppm/fill observability; counters wrap at 2^32. +``` + +The rejected alternative — `std::atomic<std::uint64_t>` counters and +`double` gauges, the "obviously roomier" choice — is a trap on exactly +the targets this library most cares about. On a 32-bit ISA without a +64-bit atomic instruction, `std::atomic<std::uint64_t>` still compiles and +still works: libatomic implements it *with a lock*. The hot path would +remain formally correct and silently stop being lock-free — the one +property section 4 declared as contract, broken invisibly by a telemetry +counter. The 32-bit choice keeps every telemetry access a plain +lock-free operation on Hexagon and the M-class cores, and the class +`static_assert`s it rather than assuming: + +```cpp +static_assert(std::atomic::is_always_lock_free && + std::atomic::is_always_lock_free && + std::atomic::is_always_lock_free, + "telemetry atomics must be lock-free for the RT contract"); +``` + +The cost is range, and it is documented rather than hidden: `Status`'s +comment tells callers the counters "wrap at 2^32 — far beyond any +plausible event count, but treat them as modular if you difference them +over very long horizons." (The `Status` struct itself still presents +`uint64_t` fields — the narrowing is an internal representation choice, +widened at the snapshot.) A `float` gauge carries about seven significant +digits, which comfortably resolves tenths of a ppm and hundredths of a +frame of fill — observability, not metrology.
+ +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| `atomic<uint32_t>`/`atomic<float>` telemetry, wrap documented | 64-bit atomic counters/doubles | 64-bit atomics lock via libatomic on 32-bit targets, silently voiding the lock-free contract; 32-bit range/precision suffices and is asserted | `include/srt/asrc.hpp` telemetry members + `static_assert`; `Status` doc comment | + +## 10. Designated initializers as API + +The filter presets are written the way a datasheet reads: + +```cpp +static FilterSpec transparent() noexcept { + return {.numPhases = 512, + .tapsPerPhase = 80, + .passbandHz = 20000.0, + .stopbandHz = 26000.0, + .stopbandAttenDb = 140.0}; +} +``` + +`FilterSpec`, `Config` and `ServoConfig` are aggregates with member +initializers supplying defaults, and C++20 designated initializers do the +rest. The rejected alternatives are the two classic config-struct styles. +A positional constructor — +`FilterSpec(512, 80, 20000.0, 26000.0, 140.0)` — puts two adjacent +`double` band edges next to each other where a swap compiles silently and +mis-designs the filter (which, per `validated()`'s comment, is the kind +of error that "passes images wholesale"). A builder/setter chain adds a +mutable construction protocol and a second way for every field to be set, +to solve a problem the language now solves natively: fields are named at +the call site, unmentioned fields keep their documented defaults, and — +because designated initializers must follow declaration order — the +compiler rejects reorderings instead of reinterpreting them. + +The style is also the library's own consumption idiom: the README quick +start and every test build configs by naming only what deviates from +default. Readable initialization is not cosmetic in a config API; the +config *is* the API surface where users make their quality-versus-cost +decisions, and the presets double as documentation of three known-good +points in that space.
+ +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| aggregate configs + designated initializers | positional constructors; builder chains | named fields make adjacent-double swaps impossible; defaults stay declarative; declaration-order enforcement | `include/srt/polyphase_filter.hpp` (`FilterSpec` presets); `include/srt/asrc.hpp` (`Config`); `include/srt/pi_servo.hpp` (`ServoConfig`) | + +## 11. `SRT_RESTRICT`: a portable `__restrict__`, adopted on measurement + +C++ has no standard `restrict`. The library defines a two-line macro over +the compiler extensions and applies it to the kernel pointer parameters — +and the comment above the macro is careful to claim only what was +verified: + +```cpp +// No-alias qualifier for the kernel hot loops: without it the compiler +// versions the blend loop behind a runtime aliasing check (verified with +// -fopt-info-vec; see docs/PERFORMANCE.md, hypothesis 2). +``` + +This entry is here as much for its *method* as its content. The +vectorization audit (PERFORMANCE.md, PR C2) did not assume aliasing was a +problem; it asked the compiler. `-fopt-info-vec` showed `blendRow` +vectorizing — but behind a runtime aliasing check, the loop compiled +twice with a pointer-overlap branch choosing between versions. +`SRT_RESTRICT` on the row/history pointers removes the check, and the +measured effect is recorded with the honesty this project's performance +docs enforce: **M55 `pipeline_float` −1.35% instructions, every other +embedded scenario exactly 0.00%, x86 same-state A/B −3.7% wall-clock.** +Small, real, and cheap — the qualifier documents a true invariant (the +scratch row never aliases the history), so it costs nothing to maintain. + +The rejected alternatives: doing nothing (leaving the versioned loop and +its branch in the hot path), and restructuring the code so the compiler +could prove non-aliasing itself (possible, but contorting call signatures +to communicate what one keyword states directly). 
MSVC spells the +extension `__restrict`, everyone else `__restrict__`; hence the macro +rather than a raw keyword. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| `SRT_RESTRICT` macro on kernel pointers | nothing (alias-versioned loops); structural non-aliasing proofs | verified with `-fopt-info-vec`, measured: M55 float −1.35% insns, x86 −3.7% wall-clock; states a true invariant | `include/srt/polyphase_filter.hpp` macro + comment; `docs/PERFORMANCE.md` C2 | + +## 12. Compile-time feature gates — and the measured cost of a runtime one + +Target-specific code paths are selected by preprocessor and `constexpr` +machinery, never by runtime flags. `SRT_Q15_SMLALD` turns on the dual-MAC +Q15 dot product exactly where it wins: + +```cpp +#if defined(__ARM_FEATURE_DSP) && !defined(__ARM_FEATURE_MVE) +``` + +— DSP-extension cores *without* Helium (the M33/Pico class), because on +the M55 the compiler already auto-vectorizes the scalar loop with MVE and +the intrinsic would replace vectors with dual-MACs (the gate's comment; +PERFORMANCE.md C4 verified 0.00% change on every M55 scenario). +`SRT_CHANNEL_PARALLEL` enables the frame-major channel axis on hosts only, +and inside the class it becomes a `constexpr` member flag that +`if constexpr` and plain constant folding erase from non-participating +builds: + +```cpp +static constexpr bool kChannelParallel = + SRT_CHANNEL_PARALLEL != 0 && std::is_floating_point_v; +``` + +The reason this is dogma rather than taste is that the alternative was +tried, by accident, and measured. During C6 the mode gate was briefly an +ordinary runtime `bool` consulted in the hot loops — and the M55 +instruction ratchet, which had nothing to do with the change (C6 is +host-only), moved **+6–8%** from hot-loop branch bloat. 
PERFORMANCE.md +records the lesson verbatim: "the mode gate must be compile-time — a +runtime bool in the hot loops cost +6–8% on the M55 ratchet before the +constexpr gate restored every embedded scenario to 0.00%." The compaction +path in `appendOne` carries the same note at the exact line that was +guilty. A ±3% two-sided CI gate is what turned this from a silent tax +into a failed build; the constexpr gate is what turned the fix from "fast +again" into "provably byte-identical again." + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| preprocessor + `constexpr` flags + `if constexpr` gates | runtime mode flags | a runtime bool in the hot loop measured +6–8% on the M55 ratchet; compile-time gates keep non-participating targets' codegen byte-identical (0.00%) | `include/srt/polyphase_filter.hpp` (`SRT_Q15_SMLALD`, `SRT_CHANNEL_PARALLEL`, `kChannelParallel`, `appendOne` comment); `docs/PERFORMANCE.md` C4/C6 | + +## 13. `std::function` in the simulator, templated callables in the library + +The test harness's two-clock simulator configures its signal generators +as `std::function` fields: + +```cpp +std::function<S(std::uint64_t)> gen = [](std::uint64_t) { return S{}; }; +std::function<double(double)> fsInScale = [](double) { return 1.0; }; +``` + +The library's hot path, facing the identical "caller supplies a callable" +problem, does something else entirely. `FractionalResampler::process` +takes its frame source as a template parameter — +`template <typename PopFn> std::size_t process(..., PopFn&& popFrames) +noexcept` — and the converter passes a `noexcept` lambda that wraps the +ring read. Same need, opposite tools, and the split is deliberate. + +`std::function` is the right tool in the simulator: tests assign +different generators per test case at runtime, the cost of a type-erased +call per sample is irrelevant next to the double-precision sine it +invokes, and construction-time allocation in a test fixture harms +nothing. It would be the wrong tool in `process()` three ways at once.
+Its call is an indirect jump through erased type information that the +optimizer cannot inline — and `popFn` is invoked inside the per-frame +loop, where the entire benefit of the current design is that the ring's +`read()` inlines into the resampler's refill path. Assigning one may +allocate, which is forbidden anywhere reachable from `pull()` +(section 4). And its call operator is not `noexcept` — an empty +`std::function` throws `bad_function_call` — which poisons the `noexcept` +audio path either with a formal lie or a terminate-on-bug. The template +parameter has none of these problems and costs only what templates +always cost: the code is instantiated per callable type, which for +exactly one production callable is nothing. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| templated `PopFn&&` in the library; `std::function` only in test config | `std::function` on the hot path; templates in test fixtures | hot path needs inlining, no allocation, honest `noexcept`; tests need runtime reassignment and don't care about a type-erased call | `include/srt/polyphase_filter.hpp` (`process`, `prime`); `include/srt/asrc.hpp` (`popFn` lambda); `tests/support/two_clock_sim.hpp` | + +## 14. `std::vector` everywhere, custom allocators nowhere + +Every owned buffer in the library is a plain `std::vector`: the ring's +storage, the coefficient table, the resampler's histories, scratch and +blended row. No allocator parameters, no PMR, no small-buffer tricks. In +a real-time audio library this looks, at first glance, like negligence — +until you notice *when* those vectors are touched. Every `resize`, +`assign` and construction happens in a constructor or in `prime()`-time +setup; the hot path only ever reads `data()` and indexes. The RT problem +with allocation is not that heap memory is slow; it is that allocation +is unbounded and lock-taking *at the moment you cannot afford it*. 
+Section 4's contract solves that by construction-time-only allocation — +after which a custom allocator has nothing left to fix. It would add a +template parameter that infects every class signature, a policy decision +for every consumer, and a second code path to test, in exchange for +optimizing events that occur once per converter lifetime, off the audio +thread, in a place explicitly allowed to throw `bad_alloc`. + +The rejected-in-spirit alternatives — fixed `std::array` capacities, or +caller-supplied arenas — also fail the configurability test: table and +buffer sizes derive from runtime `FilterSpec` and `Config` values +(section 5), so compile-time capacities would cap the very parameters +the config API exposes. Embedded consumers who must avoid the heap +entirely have the honest option the design leaves open: construct the +converter during initialization, when the heap (or a bump allocator +behind `operator new`) is still a fine place to get memory from. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| `std::vector` storage, default allocator | allocator/PMR parameters; fixed arrays; arenas | allocation is construction-only by contract, so allocators optimize a non-problem at the cost of infecting every signature; sizes are runtime config | `include/srt/spsc_ring.hpp`, `polyphase_filter.hpp`, `asrc.hpp` (members); RT contract in section 4 | + +## 15. The C ABI: opaque handles, `reinterpret_cast`, and `impl()` outside `extern "C"` + +The FFI surface (`tools/capi/`) wraps the float converter behind an +opaque `SrtHandle*`. The pattern is textbook, but two details record +decisions. First, the handle is a declared-but-never-defined struct, and +the conversion is a `reinterpret_cast` in a pair of helpers: + +```cpp +extern "C" { struct SrtHandle; } // opaque + +namespace { +srt::AsyncSampleRateConverter* impl(SrtHandle* h) noexcept { ... } +const srt::AsyncSampleRateConverter* impl(const SrtHandle* h) noexcept { ... 
} +} +``` + +The helpers live in an anonymous namespace *outside* the `extern "C"` +block for a reason C++ makes easy to forget: those two `impl` functions +are overloads (const and non-const), and **overloading is illegal under C +linkage** — C linkage names carry no type information to distinguish +them. Keeping the C++ conveniences in C++ linkage and only the exported +symbols in `extern "C"` is the discipline that lets the shim be written +as C++ without leaking C++ into the ABI. + +The rejected alternatives for the handle: exposing the class definition +(no ABI stability — the whole point of the shim is a boundary the C++ +headers don't have, per section 1), or a lookup table of integer handles +(indirection and lifetime bookkeeping to solve a problem the opaque +pointer already solves). Around the handle, the shim converts the C++ +error model to C conventions at the boundary: `srt_create` catches +everything and returns null; every entry point tolerates a null handle, +because — the file's own comment — the documented "check srt_create for +NULL" convention "otherwise invites a crash on exactly the path where the +caller forgot to check." An unchecked failure degrades to silence, not a +crash, which for an audio library is the correct failure sound. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| opaque `SrtHandle*` + `reinterpret_cast`; `impl()` overloads outside `extern "C"`; null-tolerant entry points | exposed class; handle tables; unguarded entries | ABI boundary with zero C++ leakage; C linkage forbids overloads; unchecked create must fail soft | `tools/capi/srt_capi.cpp`, `tools/capi/srt_capi.h` | + +## 16. Deleted copy operations: these are identity types + +Both concurrency-bearing classes delete copying: + +```cpp +SpscRing(const SpscRing&) = delete; +SpscRing& operator=(const SpscRing&) = delete; +``` + +and likewise `BasicAsyncSampleRateConverter`. 
The rejected alternative — +letting the compiler generate copies, or writing "deep copy" semantics — +fails the simplest question first: *what would a copy even mean?* A ring +mid-stream has a producer thread and a consumer thread holding a +reference to *this specific object*; a copy would duplicate the buffer +contents but not the relationship, producing an orphan that no thread +feeds. (Mechanically, `std::atomic` members are not copyable anyway — +the language is trying to tell you the same thing.) The converter is +worse: copying would snapshot servo state, telemetry and half-consumed +scratch into a second object whose FIFO occupancy no longer corresponds +to any real clock relationship. These are what the two-agent contract +makes them: objects with identity, addressed by the threads that share +them, not values to be passed around. Deleting the operations turns the +meaningless question into a compile error — the same conversion of +convention into compiler-enforced fact as the `static_assert`s +(sections 2, 9) and the concept (section 2). Moves are deleted along +with copies (declaring the deleted copy suppresses them), which is also +right: a moved-from ring would invalidate the pointers the other thread +is using *right now*. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| deleted copy (and hence move) on ring and converter | default/deep copies | two live threads reference the object by identity; a copy duplicates state but not the clock relationship; atomics aren't copyable | `include/srt/spsc_ring.hpp`, `include/srt/asrc.hpp` | + +## 17. Rejected wholesale, with reasons + +Some decisions are visible only as absences. For each, the reason is on +record. 
+ +**`std::simd` / `std::experimental::simd`.** Not in C++20 — the library's +floor — and the portable-SIMD abstraction solves a problem this codebase +measured its way out of differently: where explicit SIMD wins, it is +gated per target and per measurement (the SMLALD path, +measured, kept; +the Hexagon `vrmpyh` path, −0.31%, implemented, proven bit-exact, and +*deliberately deleted* per the stop rule — PERFORMANCE.md C5). Where +auto-vectorization already wins (Helium on the M55, host AVX2 via the +channel axis), abstraction would only obscure what `-fopt-info-vec` and +`objdump` verified. + +**Coroutines.** The library's callers are device callbacks with hard +deadlines: `push()` on the capture thread, `pull()` on the playback +thread, both synchronous by the nature of the contract. No async model +fits — a suspension point inside a real-time callback is a category +error, and the frame flow the library does need (the resampler pulling +from the ring mid-synthesis) is expressed by the `PopFn` callable of +section 13 at zero machinery. + +**CRTP mixins.** Section 2's reasons in general form: the concept + traits +pair already delivers static dispatch and constraint checking without +forcing an inheritance shape onto builtin sample types or wrapper types +onto raw buffers. + +**Exceptions on the audio path.** Section 4; reinforced by a toolchain +that cannot unwind at all. + +**`std::jthread` (or any thread) in the library.** The library owns *no* +threads. It is a passive object with a two-agent contract — "one producer +thread calls push() at the input clock; one consumer thread calls pull() +at the output clock" (`asrc.hpp`) — and the threads belong to the caller, +because they already exist: they are the audio device callbacks. 
Spawning +threads would also be unbuildable on half the CI matrix; the bare-metal +targets have no `std::thread` at all, which is why even the *tests* +compile the two-thread stress only where `find_package(Threads)` succeeds +(`tests/CMakeLists.txt`). + +**Virtual interfaces for "pluggable filters."** The filter is not a +plugin point; it is a *parameter space*. `FilterSpec` exposes the five +numbers that matter (L, T, band edges, attenuation) and the design +machinery is one fixed, well-understood method (Kaiser-windowed sinc) +whose properties the quality tests pin. An `IFilterDesigner` interface +would buy the ability to substitute arbitrary coefficient tables at the +cost of an indirect call chain into the kernel (section 2's costs) and +the loss of every invariant the code currently states about its own +tables — per-branch DC gain, the extra phase row's exact continuity, +the measured |diff| ≤ 41 adjacent-phase delta of section 18. + +| Rejected | Reason | Evidence | +|---|---|---| +| `std::simd` | not in C++20; per-target measured intrinsics (kept or deleted by number) beat portable abstraction | `docs/PERFORMANCE.md` C4/C5 | +| coroutines | hard-RT synchronous callbacks; no async model fits | `include/srt/asrc.hpp` thread contract | +| CRTP mixins | concept + traits already give static dispatch without inheritance shape | `include/srt/sample_traits.hpp` | +| audio-path exceptions | RT contract; Hexagon cannot unwind | section 4 | +| `std::jthread` in the library | passive two-agent object; caller owns the (callback) threads; bare metal has none | `include/srt/asrc.hpp`; `tests/CMakeLists.txt` Threads probe | +| virtual pluggable filters | filter is a parameter space, not a plugin point; would cost kernel inlining and table invariants | `include/srt/polyphase_filter.hpp` (`FilterSpec`) | + +## 18. 
The meta-decision: comments that show their arithmetic + +Read back through the evidence column of this appendix and notice where +it points: overwhelmingly at *comments*. The library's final C++ decision +is about prose. Its comments do not narrate ("increment the index"); +they state constraints and record arithmetic at the point where the code +depends on them. The Q15 traits comment derives the accumulator budget +("48-80 taps add ~6-7 bits — no overflow, no intermediate rounding"). The +`kaiser.hpp` note quantifies the constexpr rejection (section 5). The +resampler's eps conversion documents its own safety margin ("|eps| is +servo-clamped to ~1e-3, so eps * 2^64 fits int64 comfortably"). The +`appendOne` compaction comment carries the +6–8% scar of section 12. +These comments are load-bearing: they are the reasons future editors +will weigh before changing the code, so they are held to the same +standard as the code. + +Including being *audited*. The package audit that hardened the core +(commit `029607f`, "Core hardening from the package audit") checked the +comments' arithmetic along with the code's, and found one wrong: the Q15 +`blend()` comment claimed the int32 product had "~5% margin" against a +worst-case adjacent-phase delta. The audit did the multiplication — +32767 × 65535 = 2,147,385,345, which sits 0.005% under `INT32_MAX`, not +5% — and the commit's own summary records the fix: "Q15 blend margin +comment corrected (0.005%, not ~5%)." The corrected comment in +`sample_traits.hpp` now shows the numbers and the measurement +(real deltas: |diff| ≤ 41 on the transparent table) and draws the +conclusion the wrong margin obscured: "a margin that thin is not an +invariant worth relying on silently" — which is precisely why the code +computes the blend in `int64_t`. Note what did *not* change: the code +was already right. The comment was the bug. + +That is the standard this appendix has been documenting all along. 
A +decision is not what the code happens to do; it is a claim, written where +the code makes it true, precise enough to be checked — and checked. From 0492a4128b815bbe2d157e9100440689c4448ef3 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 1 Jul 2026 21:30:40 +0000 Subject: [PATCH 04/16] book: Part 0 - the problem and its budgets https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9 --- book/src/part0/budgets.md | 351 ++++++++++++++++++++++++++++++- book/src/part0/two-crystals.md | 292 ++++++++++++++++++++++++- include/srt/asrc.hpp | 2 + include/srt/polyphase_filter.hpp | 2 + 4 files changed, 643 insertions(+), 4 deletions(-) diff --git a/book/src/part0/budgets.md b/book/src/part0/budgets.md index 11b8221..cae9d0e 100644 --- a/book/src/part0/budgets.md +++ b/book/src/part0/budgets.md @@ -1,3 +1,350 @@ -# budgets +# Budgets: latency, quality, compute -*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* +The previous chapter ended with three words used as if they were +self-explanatory: latency, quality, compute. This chapter turns each into +a number with a derivation behind it, because everything in Part I is an +expenditure against one of these three accounts, and you cannot audit an +expenditure without knowing the budget. + +The three budgets are not independent. A longer filter buys stopband +attenuation (quality) at the price of group delay (latency) and +multiply-accumulates (compute). A deeper FIFO buys servo stability +(quality, indirectly) at the price of latency. A finer polyphase table +buys interpolation accuracy at the price of memory and cache traffic. The +design that ships is not the best possible point on any single axis; it is +a defensible allocation across all three, and the allocation is different +for a Xeon than for a microcontroller. That is why the library has presets +and sample-type variants rather than one configuration: same architecture, +different budget splits. 
+ +We take the three in the order of most surprising to least. + +## The quality budget, denominated in picoseconds + +The README makes a claim that deserves suspicion on first reading: the +phase accumulator's resolution is "far below the ~8 ps jitter budget for +120 dB transparency at 20 kHz." Eight *picoseconds* — in an audio system, +where a sample lasts twenty-one microseconds, six orders of magnitude +longer. Where does a number like that come from? + +It comes from the first real mathematics in this book, and the derivation +is three lines. This library's entire datapath is, as the last chapter +established, a creeping fractional delay: every output sample is the input +signal evaluated at a slightly wrong time, deliberately. So the natural +question is: how wrong is *acceptably* wrong? If we evaluate the signal at +time `t + Δt` instead of `t`, how large may `Δt` be before the error +matters at the quality level we are targeting? + +Take the worst case the audio band can offer: a full-scale sine at the top +of the band, + +```text +s(t) = A · sin(2π f t), f = 20 kHz. +``` + +The error caused by a small timing offset is governed by how fast the +signal can change. Differentiating, the slope is `2π f A · cos(2π f t)`, +whose magnitude peaks — at the zero crossings — at + +```text +max |ds/dt| = 2π f A. +``` + +A timing error `Δt` therefore produces an amplitude error of at most the +slope times the error: + +```text +e = 2π f A · Δt. +``` + +Now impose the quality target. The filter at the heart of this library is +designed with a 120 dB stopband — the "120 dB transparency" figure that +recurs throughout the project — and −120 dB as an amplitude ratio is +`10^(−120/20) = 10⁻⁶`. Demanding that the timing-induced error stay below +that, relative to full scale: + +```text +2π f · Δt ≤ 10⁻⁶ +Δt ≤ 10⁻⁶ / (2π · 20 000 Hz) = 7.96 × 10⁻¹² s ≈ 8 ps. +``` + +Eight picoseconds. 
Not because audio hardware keeps time that precisely — +it does not, remotely — but because *this library's job is to manufacture +sampling instants*. The two crystals define real time; the converter +invents the fractional positions in between, and any noise in those +invented positions is indistinguishable from noise added to the audio, at +the exchange rate the slope sets: one picosecond of timing error at 20 kHz +full scale costs about an eighth of a microvolt-per-volt, and 8 ps costs +−120 dB. Position error *is* amplitude error. That single sentence is the +reason a resampling library must care about time resolution that would be +absurd anywhere else in audio. + +Two honest qualifications keep the number from overclaiming. First, this +is a worst-case bound — the full-scale 20 kHz zero crossing — and real +program material spends almost no energy there; at 1 kHz the same +derivation gives a 20× looser budget, which is one reason the measured SNR +table is 135 dB at 997 Hz but 105 dB at 19.5 kHz. Second, the budget +governs *random or signal-uncorrelated* timing error. Slowly varying +timing error is not noise but frequency modulation — pitch wobble — and it +gets its own, much stricter treatment when the servo chapter derives why +the Quiet stage must reject its input sawtooth to roughly −120 dBc +equivalent at 20 kHz. Same currency, different account. + +## Spending the budget: sixty-four bits of phase + +With the budget in hand, we can now read the library's most important +data-representation decision as the budget allocation it is. Convert 8 ps +into the datapath's native unit, fractions of a sample at 48 kHz: + +```text +8 ps / 20.8 µs ≈ 3.8 × 10⁻⁷ samples ≈ 2⁻²¹ samples. +``` + +So the fractional position µ must be carried to about 21 fractional bits +before timing quantization alone could threaten 120 dB. 
Here is what the +library actually does, in the inner loop of the fractional resampler — +this is the Q0.64 phase accumulator the README describes, live from +`include/srt/polyphase_filter.hpp`: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:p0_phase_step}} +``` + +The fractional position lives in an unsigned 64-bit integer interpreted as +Q0.64: all 64 bits are fraction, so the resolution is 2⁻⁶⁴ of a sample — +forty-three binary orders of magnitude below the 2⁻²¹ the budget demands. +The servo's rate-deviation estimate `epsHat` is converted from double to +this fixed-point form **once per block**, and from there the per-sample +path is pure integer arithmetic: one 64-bit addition per output sample, +with the two slip cases — the fractional position creeping past 1.0 or +below 0.0, the "whole-sample slip roughly every `1/ppm` samples" of +Chapter 1 — detected by unsigned wraparound rather than comparison against +a threshold. + +Why carry 43 bits more resolution than the budget requires? Because the +excess is free, and what it buys is not resolution but *exactness*. A +phase accumulator adds a tiny ε thousands of times per second; do that in +floating point and every addition rounds, because a double near 1.0 has +2⁻⁵² of absolute resolution and a double's rounding depends on the current +magnitude of the accumulator. The earlier version of this code did exactly +that, and worked. But integer addition modulo 2⁶⁴ does not round — ever — +so the only quantization in the entire phase path is the once-per-block +conversion of ε itself, and the accumulated position between servo updates +is bit-exact. (The conversion is safe by construction: the servo clamps +|ε| to about 10⁻³, so `ε · 2⁶⁴` fits comfortably in the signed 64-bit +intermediate — the code comment above carries the argument, and the +configuration validator refuses `maxDeviationPpm` settings that could +break it.) 
+ +The project's performance log records what this decision measured when it +landed as change C3 of the optimization campaign: the *motivation* was the +compute budget — an integer-only per-sample path with no doubles is what +keeps the inner loop cheap on DSPs without double-precision floating-point +units, and it cut Hexagon's Q31 pipeline cost by 15.5 % — but quality +*improved* as a side effect, to 135.0 dB at 997 Hz, with the log noting +the phase resolution change from 2⁻⁵² to 2⁻⁶⁴. One representation change, +paid from no budget, credited to two. Those are rare, and worth designing +toward. + +## The latency budget + +Latency is the easiest budget to state and the easiest to spend by +accident. Here is where every frame of it is decided — the converter's +entire configuration surface, live from `include/srt/asrc.hpp`: + +```cpp +{{#include ../../../include/srt/asrc.hpp:p0_config}} +``` + +The README's latency equation prices the defaults: + +```text +latency = targetLatencyFrames + (L·T − 1) / (2L) [input frames] + = 48 + (256·48 − 1)/512 + = 48 + ~24 ≈ 72 frames ≈ 1.5 ms at 48 kHz. +``` + +Two terms, and they are budget lines of entirely different character. + +The second term is the **filter group delay**, and it is a law of physics +wearing a configuration option's clothes. The interpolation filter is a +linear-phase FIR — symmetric coefficients, which is what guarantees every +frequency is delayed equally, and waveform shape is preserved — and a +symmetric filter *must* delay the signal by half its span: with `L = 256` +polyphase branches of `T = 48` taps each, `(L·T − 1)/(2L)` is 23.998 +input frames, ~0.50 ms. You cannot negotiate this term down at constant +quality; you can only buy a shorter filter. `FilterSpec::fast()` does +exactly that, cutting group delay to about 16 frames at reduced stopband, +and the `transparent()` preset spends the other way — 80 taps, 40 frames, +0.83 ms — for its extra high-frequency headroom. 
Quality and latency, +trading at a posted exchange rate of half a frame per tap. + +The first term, the 48-frame **FIFO setpoint**, is not physics but control +headroom, and it is the term you own. The FIFO between the clock domains +must never run empty (an audible underrun) and never hit its high +watermark (a resync), so the servo regulates its occupancy around a +setpoint — and that standing occupancy is buffered audio you are listening +through. Forty-eight frames is one millisecond at 48 kHz: enough to absorb +the push/pull phase jitter of real callbacks with margin, small enough to +keep the total design latency at 1.5 ms. + +The setpoint carries a feasibility rule that the README states in bold and +the constructor-plus-`pull()` logic enforces, because violating it does +not degrade the system — it destroys it: **the setpoint must exceed the +pull block size.** A `pull()` synthesizes output only from frames already +buffered; if the callback asks for 128 frames while the servo holds the +buffer at 48, every callback drains the FIFO through empty, and the +converter falls into a permanent dropout cycle that no amount of servo +cleverness can escape, because the geometry is simply infeasible. Rather +than document a footgun, the converter adapts: when it observes pull +blocks larger than the configured setpoint, it raises the effective +setpoint to the block size plus about half a block of margin (bounded by +FIFO capacity — callbacks above ~340 frames also need `fifoFrames` sized +explicitly), reports the raised value in +`Status::effectiveTargetLatencyFrames`, and lets latency follow. The +latency budget, in other words, has a hard floor set by your callback +size, and the library will spend up to that floor without asking — the +one budget line it refuses to let you underfund. 
On top of the rule sits +its softer sibling: the setpoint must also stay above the peak occupancy +excursion of your push/pull jitter, and the FIFO term breathes by a +fraction of the block size as the servo tracks drift, so 1.5 ms is a +design center, not a guarantee etched per-sample. + +`designedLatencySeconds()` reports the resulting figure at runtime, and +`tests/test_latency.cpp` closes the loop the project's way: it pushes an +impulse through a locked converter and asserts that the impulse emerges +where the equation said it would. + +## The compute budget + +The third budget is the one whose *unit* changes with the deployment. On a +server, compute is a fraction of a core; on a microcontroller, it is a +question of existence — does the workload fit under the clock rate or not. +This library targets both ends simultaneously, which is why its +performance culture is unusual, and why `docs/PERFORMANCE.md` is one of +the two canonical history documents this book draws on. + +Start at the comfortable end. On the shared 2.80 GHz Xeon that produced +the README's benchmark table, the default float converter processes a +stereo 48 kHz stream at 107.8 ns per frame — 193× faster than real time, +meaning one live stream costs about half a percent of one core. At that +end the compute budget is not about survival but about citizenship: how +many streams per core, how much headroom the rest of the audio graph +inherits. + +Now the other end. The README's platform matrix ends at the Arm +Cortex-M33 — the Raspberry Pi Pico 2's core, bare metal, no FP64 hardware, +no vector unit — and the project publishes, in the README's +instruction-count table, exactly what every workload costs there. The +numbers are *executed instructions*, measured by running fixed workloads +under QEMU with a counting plugin, and they are brutal and instructive. 
+The float interpolation kernel that costs the Cortex-M55 99.5 million +instructions costs the M33 1.90 **billion** — about 19× — for one reason: +the float datapath accumulates in double precision by design, and on a +core with no double-precision FPU every one of those accumulations becomes +a software floating-point library call. The compute budget on such a +target is not tightened; it is a different budget entirely, and the +Q15/Q31 fixed-point datapaths exist precisely as the correctly-denominated +response — integer-only inner loops that make the M33's cost land near the +M55's instead of 19× above it. + +What does an instruction budget *mean* on a 150 MHz M33? Divide. A 150 MHz +core executing (optimistically) one instruction per cycle retires 150 +million instructions per second, and a 48 kHz stream demands a frame every +20.8 µs — about 3,100 instructions of total budget per frame, forever, +before the rest of the firmware has run at all. Against that, the measured +comparison workloads put the full Q15 converter — servo and FIFO included +— at roughly 5,043 instructions per stereo frame on the M33: about 242 +million instructions per second for stereo, over the core's ceiling even +at ideal IPC. Mono, at roughly half that, fits. This is exactly the +README's guidance, now visible as arithmetic rather than advice: 48 kHz +Q15 mono fits a 150 MHz M33; stereo wants the `fast()` preset or the +RP2350's second core. On a Xeon the same library is a rounding error; on +the M33 the default preset is *infeasible in stereo*, and knowing that +before flashing hardware is the entire point of keeping the budget in a +table. + +The honesty clause matters as much as the numbers, and `docs/PERFORMANCE.md` +states it in its metrics table: instruction counts are deterministic to +the instruction, noise-free, and well-correlated with real cost *for +scalar code* — and they are still not cycles. They know nothing of wait +states, flash caches, or dual-issue. 
Cycle truth requires vendor +simulators or real silicon, which is why the repository carries +`examples/pico2_cyccnt/`, a flashable RP2350 harness that measures +DWT.CYCCNT cycles per block against these same instruction baselines, and +why the README explicitly frames the counts as "budgets pending +real-silicon validation." What determinism *does* buy is enforcement: the +counts are committed to `bench/baselines.json` and CI re-measures every +push, failing on any drift beyond ±3 % in either direction — a regression +is rejected, and an unexplained improvement is also rejected until the +baseline is re-recorded in the same diff, so stale slack cannot accumulate +to hide the next regression. Wall-clock numbers, by contrast, are never a +hard gate: shared runners are too noisy, and a gate that flakes teaches +people to ignore it. Instructions are gated because they are exact; +wall-clock is reported because it is real. Both disciplines are the same +policy — publish only what you can re-measure — applied to metrics of +different reliability. Part II returns to this machinery in detail. + +## Each budget line becomes a file + +Part 0 has now done its work: a physical problem (two crystals), a +measured cost of ignoring it (−34.7 dB), and three budgets with numbers +attached. Part I walks the library's headers in dependency order, and the +tour is really the budget ledger read line by line: + +`kaiser.hpp` is the quality budget's opening entry — the 120 dB stopband +that made the 8 ps derivation's target, purchased with a windowed-sinc +design whose tap count is the latency and compute budgets' first expense. +The polyphase bank spends memory to make one branch-pair evaluation per +output sample possible at all, and its `L = 256` branch count is sized by +the interpolation-residual rule the README quotes (−12 dB per doubling of +`L`, +12 dB per octave of signal frequency) — the reason the measured +table slopes from 135 dB at 997 Hz to 105 dB at 19.5 kHz. 
+`sample_traits.hpp` is the compute budget's answer to the M33 column +above: the Q15/Q31 datapaths as a customization point rather than a fork. +`spsc_ring.hpp` holds the latency budget physically — its occupancy *is* +the 48-frame line item — and doubles as the servo's sensor. `pi_servo.hpp` +polices the quality budget's FM account, rejecting the occupancy sawtooth +to the −120 dBc figure this chapter bounded. The fractional resampler +carries the Q0.64 accumulator you have already read. And `asrc.hpp` +composes the whole, enforcing the feasibility rule so the latency budget +can never be underfunded into a dropout cycle. + +Every number in those chapters traces back to one of this chapter's three +accounts. When a design choice seems baroque — a 64-bit integer phase, an +extra row in a coefficient table, a third servo stage — the question to +ask is always the same: *which budget is it spending, and which is it +defending?* + +## Verify it yourself + +```sh +# The 8 ps budget, re-derived in one line: +python3 -c "import math; print(1e-6 / (2 * math.pi * 20000))" + +# The quality budget, enforced: the pinned SNR thresholds behind the +# README's 135/120/112/105 dB table: +cmake -B build -DCMAKE_BUILD_TYPE=Release +cmake --build build -j +ctest --test-dir build -R AsrcQuality --output-on-failure + +# The latency budget, enforced: an impulse must emerge exactly where +# designedLatencySeconds() promises (48 + ~24 frames by default): +ctest --test-dir build -R Latency --output-on-failure + +# The host compute budget (Google Benchmark; the README table's source): +cmake -B build-bench -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_BENCHMARKS=ON +cmake --build build-bench -j +./build-bench/bench/srt_bench + +# The embedded compute budget: fixed workloads under QEMU, compared to +# the committed baselines at ±3% (needs the cross toolchain and a +# TCG-plugin-capable QEMU — docs/PERFORMANCE.md has the mechanics): +python3 scripts/icount.py --target m33 --build-dir --plugin +``` 
The instruction-count and benchmark tables in the README regenerate from +these same measurements (via `scripts/update_icount_docs.py` and +`scripts/update_perf_docs.py`), and CI fails if the published tables drift +from the measured baselines — the budgets in this chapter are audited on +every push.
+ +## The integral that cannot be argued with + +Suppose you capture audio from device A and play it on device B, and +suppose the two clocks disagree by +200 ppm — the offset used throughout +this project's measurements as a realistic mid-scale case: the input side +runs at 48 009.6 Hz against the output's 48 000 Hz. The rate mismatch is +0.02 %. Per sample it is invisible. But a rate mismatch does not average +out; it *integrates*. Every second, the capture side produces 9.6 more +frames than the playback side consumes. Every second, forever. + +Put a buffer between them — the obvious move, and a correct first move — +and you have only chosen where the failure happens. The surplus +accumulates in the buffer at 9.6 frames per second. A 1,024-frame FIFO +(the converter's own default capacity floor, for scale) started half full +gives you about 53 seconds before it is completely full and something has +to give. Make the buffer deeper and you buy time linearly while paying +latency for every frame of depth; a buffer deep enough to survive an +hour-long session at 200 ppm would hold about three quarters of a second +of audio, all of which you would then be monitoring through. Flip the sign +of the mismatch and the same argument drains the buffer to empty instead. +There is no buffer size that fixes a rate mismatch, because the problem is +not jitter — which a buffer genuinely absorbs — but a nonzero mean. The +README states the consequence as the library's founding fact: whole-sample +slips occur roughly once every `1/ppm` samples, and any system that moves +audio between independent clocks must either resample adaptively or +eventually glitch. + +So the plain FIFO must fail. The interesting question — and the one this +project answered by measurement rather than assertion, because that is its +habit — is *how badly*. 
+ +## Measuring the do-nothing option + +The comparison notebook (`notebooks/asrc_comparison.ipynb`, results +recorded in `docs/COMPARISON.md`) includes, alongside the serious +contenders, a subject called the **naive FIFO**: a buffer that simply +drops the newest samples when full, which is what "we'll deal with it +later" compiles to. It was measured under exactly the same conditions as +everything else — a 997 Hz tone at −1 dBFS crossing a +200 ppm clock +boundary, an AES17-style THD+N analysis with the fundamental removed and +the residual integrated across the 20 Hz–20 kHz band. + +The naive FIFO measures **−34.7 dB THD+N** and 94.7 dB of A-weighted +dynamic range. The converter this book describes, on the same signal and +the same clocks, measures −132.1 dB. + +What does −34.7 dB sound like? The number means that the error left after +subtracting the test tone sits only 34.7 dB below the tone itself — a +residual of about 1.8 % of the signal. If that residual were smooth +harmonic distortion, 1.8 % would already be far into plain audibility. But +it is worse than that, because of *how* the error is distributed in time. +At +200 ppm the buffer overflows and discards a sample about 9.6 times per +second, and each discard splices the waveform to a point one sample later: +a step discontinuity. A step is the broadest-band event a sampled signal +can contain; its energy smears across the entire spectrum. So the +subjective experience is not a haze of distortion but a steady mechanical +ticking — roughly ten clicks per second at this offset — riding on +otherwise clean audio. It is the sound that anyone who has misconfigured a +USB audio loopback already knows, and once heard it cannot be unheard. The +dynamic-range figure tells the same story from below: quiet passages sit +on a floor of click energy, tens of decibels above where the converter's +floor lies. + +That row of the table is the cost of doing nothing, and it calibrates +everything else in this book. 
Every design decision in the chapters ahead +is ultimately justified by the distance between −34.7 dB and −132.1 dB. + +## The two industry answers + +The two-crystal problem is decades old, and industry converged on two +families of solution. `docs/COMPARISON.md` opens by insisting on the +distinction, because both families are marketed under the same three +letters: there are **full ASRCs** that recover the clock ratio themselves, +and **resampler libraries** that must be handed the ratio from outside. + +**The hardware answer** is the asynchronous sample rate converter chip. +The canonical part is Analog Devices' AD1896 — the lineage this library's +architecture explicitly follows — joined by parts like TI's SRC4392. These +are dedicated silicon: serial audio in on one clock, serial audio out on +another, and the chip does everything, including the part that makes the +problem *asynchronous* — discovering the ratio between the two clocks by +itself, continuously, without being told. The datasheet numbers are +excellent: −117 dB THD+N minimum (−133 dB best case) and 142 dB dynamic +range for the AD1896; −140 dB typical and 144 dB dynamic range for the +SRC4392. Their ratio ranges are enormous — 1:8 up and 7.75:1 down for the +AD1896, 1:16 to 16:1 for the SRC4392 — because these chips are built to +convert 44.1 kHz material to 48 kHz and every other crossing a studio can +produce, not merely to absorb drift. The costs are the obvious ones: a +proprietary part, a place on the board, one stereo pair per chip, and no +help at all if your audio exists as bytes in memory rather than as a +bitstream between codecs. (A caveat the comparison document is careful +about, and this book inherits: those figures are datasheet values measured +through analog test loops, not this project's measurement. They are +comparable to the software numbers in definition, not in environment.) + +**The software answer** is the resampler library: libsamplerate, soxr, +zita-resampler. 
These are superb pieces of engineering with a structural +gap that `docs/COMPARISON.md` names precisely: they must be handed the +ratio by an external servo, and so they solve *only half of the drift +problem*. A resampler library answers the question "given that the input +runs 200 ppm fast, compute the output samples" — flawlessly, at any ratio +you ask. It does not answer "how fast is the input actually running right +now?", and that is the question the two-crystal problem poses, because +nothing in your system knows the answer. The true ratio is not written +down anywhere; it exists only physically, in the beat between two +oscillators, and it moves as the room warms up. In the comparison +measurements the libraries were fed the exact ratio by an oracle — the +harness knew the true offset because it had synthesized it — and under +those conditions they measure at the format ceilings: −143.5 dB THD+N +through a 24-bit interface for libsamplerate's `sinc_best`, −143.8 dB for +soxr's `VHQ`. Real numbers, and also unobtainable in the field as stated, +because the oracle does not ship. (Near-unity is their easy regime, too: +libsamplerate's published 97 dB worst case belongs to aggressive ratios, +not this one.) + +The missing half has a name: clock recovery. Somebody must observe the two +domains, estimate their ratio from evidence, and track it as it drifts — a +control problem, not a signal-processing one. The Linux/JACK ecosystem +shows what bolting that half on looks like: zita-ajbridge wraps a +delay-locked loop around zita-resampler. Operating systems solve it too, +invisibly — CoreAudio, WASAPI shared mode, and PipeWire all run ASRCs +inside their engines — with unpublished quality and typically 5–20 ms of +latency, fine for notification sounds and disqualifying for live +monitoring. 
+ +So the field, surveyed honestly: chips that solve the whole problem in +proprietary silicon; libraries that solve the easy half in portable +software at reference quality; system engines that solve the whole problem +opaquely at whatever quality and latency they choose. What did not exist — +and what this library is — is the whole problem solved in open, portable, +embeddable software at measured quality: an AD1896-shaped architecture, +polyphase FIR plus clock servo, that you can compile. + +## The specialization that pays for everything + +You cannot simply transcribe the AD1896 into C++ and expect it to fit on a +microcontroller; the chips' generality is exactly the expensive part. +SampleRateTap's founding decision is to refuse most of the problem the +chips solve. It handles *only* the near-unity case: two domains at +nominally the same rate, within ±1000 ppm by default. It will never +convert 44.1 kHz to 48 kHz — the README lists this first among its +limitations, and `docs/COMPARISON.md` is blunt that for genuine rate +*conversion* you should put soxr or libsamplerate in the chain. + +Here is what the restriction buys. A general-ratio converter must be able +to place output samples anywhere relative to input samples, at any +spectral relationship between the rates — including downward conversions +where the filter must also band-limit, and ratios that change which parts +of its machinery dominate. In the near-unity regime none of that machinery +earns its keep. When the ratio is 1 + ε with ε a few hundred parts per +million, each output sample lands *almost exactly* on an input sample: +just a hair early or late, by a fractional offset that creeps by ε per +sample and wraps once every `1/ε` samples. The README's "How it works" +section states the consequence in one phrase: the conversion degenerates +into a **creeping fractional delay**. 
The datapath's job collapses to +evaluating one interpolation at a slowly sliding fractional position — a +48-tap dot product per output sample in the default configuration — plus a +servo deciding how fast the position should creep. And because the two +rates are spectrally indistinguishable, anti-imaging and anti-aliasing +collapse into a single fixed filter design, flat to 20 kHz, done once in +the constructor. + +The computational tables in `docs/COMPARISON.md` measure what that is +worth. Against libsamplerate — the closest architectural analog, a +streaming time-domain polyphase resampler — at the matched ~120 dB quality +tier, SampleRateTap converts 2.9–3.6× more frames per second (mono/stereo; +2.1× at 8 channels, where both engines amortize), while carrying half the +algorithmic latency: 24 frames (0.50 ms) of filter group delay against 46 +frames (0.96 ms). At the ~140 dB tier the gap widens to 6.2× in throughput +and to 40 frames against 143 in latency. That is the near-unity dividend, +and the comparison document names its mechanism exactly: a 48-tap window +with a creeping phase, instead of general-ratio machinery. On targets +without floating-point hardware the dividend compounds — the Q15 +fixed-point datapath has no libsamplerate analog at all, and on a +Pico-class Cortex-M33 the cheapest libsamplerate option costs about 9.8× +what SampleRateTap's intended configuration does. + +The soxr rows teach a different lesson, and reading them honestly is a +preview of the next chapter. At the ~120 dB tier soxr converts 32.4 +million stereo frames per second on the same host to SampleRateTap's 10.5 +million — soxr wins raw throughput, decisively, by processing in large +SIMD-friendly internal batches. The latency column is the price: 556 to +607 frames of algorithmic delay, 11.6 to 12.6 ms, rising to 777 frames +(16.2 ms) at its highest quality tier. 
Those are fine numbers for batch +conversion and impossible ones inside a 1–2 ms live-monitoring budget, and +— as `docs/COMPARISON.md` puts it — there is no setting that buys soxr's +throughput at SampleRateTap's latency. Throughput, latency, and quality +are not independent virtues to be maximized; they are a budget to be +allocated, and different tools have allocated it for different lives. + +One more number from the measured table completes the picture, because +this book does not deal in free lunches. Fed by its own servo rather than +an oracle, running causally at 1.5 ms of total design latency, +SampleRateTap measures −132.1 dB THD+N against the oracle-fed libraries' +−143.5 dB. The ~11 dB gap is the measured price of solving the *whole* +problem — discovering the ratio from buffer occupancy in real time instead +of being told it — and the comparison document presents it as exactly +that. Eleven decibels, spent 132 dB below the signal, purchasing the half +of the problem that was actually hard. The rest of this book is an account +of how both numbers — the 132 and the 11 — were achieved, measured, and +defended. + +## Watching the invisible + +Before the budgets, one more thing Chapter 1 owes you: a way to *see* the +problem, because 200 ppm is below anything your ears will report until the +FIFO finally gives way. The repository's first example, +`examples/drifting_clocks.cpp`, exists for exactly this. It runs two real +threads: a producer pushing a 997 Hz sine at a virtual 48 000.0 Hz, and a +consumer pulling at 48 kHz plus 500 ppm, both paced with absolute +`sleep_until` deadlines so the long-term rates are exact even though every +individual wakeup jitters by operating-system amounts — far rougher timing +than any real audio callback delivers. A status line prints the servo's +state and its rate estimate as it converges toward the −500 ppm +consumption deviation. 
+ +Two of the example's own caveats are worth reading before you run it, +because each is a preview of a later chapter. First, since scheduler +jitter here is on the order of milliseconds, the demo configures a 20 ms +FIFO setpoint rather than the library's 1 ms default — your first sighting +of the latency budget bending to its environment, which is the next +chapter's subject. Second, the converter observes the clocks only through +whole 96-frame chunks, so its estimate of the ratio cannot firm up faster +than the chunk-beat period `1/(ppm × chunkRate)` — about four seconds per +beat cycle at 500 ppm — and the instantaneous estimate visibly wobbles at +that beat, which is why the display shows a three-second moving average. +The information available about two clocks is quantized by how coarsely +you watch them exchange data; that observation will return as the entire +justification for the servo's three-stage design. + +Run it and watch the state go `Filling`, then `Acquiring`, then `Locked`, +and the ppm readout settle toward −500. Nothing about the audio would have +told you any of this for the first minute — and that is the point. The +drift is always there; the only choice is whether something in the system +is measuring it. + +First, though, the budgets. Claims like "a 1–2 ms live-monitoring budget" +and "120 dB transparency" have been used here as if self-evident. They are +not. The next chapter derives each one — including why this library's +quality target works out to a timing tolerance of about eight +*picoseconds*. + +## Verify it yourself + +```sh +# Two real threads, two clocks 500 ppm apart; watch the servo lock and +# the ppm estimate converge: +cmake -B build -DCMAKE_BUILD_TYPE=Release +cmake --build build -j +./build/examples/drifting_clocks + +# Reproduce the measured table — including the −34.7 dB naive-FIFO row +# and the oracle-fed library ceilings. 
Needs numpy, matplotlib, and the +# `samplerate` and `soxr` Python packages; the first cell builds the +# C ABI shared library if missing: +jupyter execute notebooks/asrc_comparison.ipynb + +# The computational head-to-head on your own host (requires the system +# libsamplerate and soxr development packages, found via pkg-config): +cmake -B build-cmp -DCMAKE_BUILD_TYPE=Release \ + -DSRT_BUILD_BENCHMARKS=ON -DSRT_BUILD_COMPARE_BENCH=ON +cmake --build build-cmp -j +./build-cmp/bench/compare/srt_bench_compare +``` + +The comparison notebook pins SampleRateTap's own results with assertions, +so a regression in the library makes the reproduction fail loudly. The +numbers in this chapter are load-bearing, not decoration. diff --git a/include/srt/asrc.hpp b/include/srt/asrc.hpp index 6386c28..959e9ad 100644 --- a/include/srt/asrc.hpp +++ b/include/srt/asrc.hpp @@ -19,6 +19,7 @@ namespace srt { +// ANCHOR: p0_config /// Converter configuration. The defaults give ~1.5 ms designed latency at /// 48 kHz (FIFO setpoint 48 frames + ~24 frames filter group delay; see /// the README latency section), transparent for clocks within +/-1000 ppm. @@ -29,6 +30,7 @@ struct Config { std::size_t fifoFrames = 0; ///< ring capacity; 0 => automatic FilterSpec filter{}; ServoConfig servo{}; + // ANCHOR_END: p0_config /// Defaults adapted to a nominal rate other than 48 kHz. The filter /// band edges and servo bandwidths are absolute Hz designed for 48 kHz; diff --git a/include/srt/polyphase_filter.hpp b/include/srt/polyphase_filter.hpp index 345604a..d9365dd 100644 --- a/include/srt/polyphase_filter.hpp +++ b/include/srt/polyphase_filter.hpp @@ -414,6 +414,7 @@ class FractionalResampler { /// interleaved frames, returning the count actually delivered. template std::size_t process(S* out, std::size_t maxFrames, double epsHat, PopFn&& popFrames) noexcept { + // ANCHOR: p0_phase_step // eps in Q0.64, converted once per call (block rate). 
|eps| is // servo-clamped to ~1e-3, so eps * 2^64 fits int64 comfortably. const auto epsFix = static_cast(epsHat * 0x1p64); @@ -432,6 +433,7 @@ class FractionalResampler { return n; // dry: phase_ not advanced for this frame } phase_ = m; + // ANCHOR_END: p0_phase_step // Q15 on SMLALD targets routes mono through blendRow+dotRow as // well: dotRow carries the dual-MAC loop, and the two paths are // bit-exact by construction (see dotRow). From 50c3fba3aa71c66274b16f6e6565f0b5ebcbc956 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 1 Jul 2026 21:35:05 +0000 Subject: [PATCH 05/16] book: filter design chapters (kaiser.hpp and the polyphase bank) Two Part I chapters: the Kaiser prototype design walk (Bessel series, beta fit, harris length estimate, runtime-vs-constexpr decision) and the polyphase bank (extra row L, tap-reversed rows, coefficient blending). Adds comment-only kai_*/bank_* ANCHOR markers to kaiser.hpp and polyphase_filter.hpp for the live excerpts. https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9 --- book/src/part1/kaiser.md | 388 ++++++++++++++++++++++++++++++- book/src/part1/polyphase-bank.md | 355 +++++++++++++++++++++++++++- include/srt/detail/kaiser.hpp | 12 + include/srt/polyphase_filter.hpp | 10 + 4 files changed, 761 insertions(+), 4 deletions(-) diff --git a/book/src/part1/kaiser.md b/book/src/part1/kaiser.md index 0b0bbe1..d3cd50f 100644 --- a/book/src/part1/kaiser.md +++ b/book/src/part1/kaiser.md @@ -1,3 +1,387 @@ -# kaiser +# Designing the filter: `kaiser.hpp` -*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* +This is the only file in the library that runs exactly once per converter, +and it decides the quality ceiling of everything downstream. Every output +sample the converter will ever produce is a dot product against +coefficients this file computes in a few milliseconds at construction. 
If +the design here reaches 120 dB of image rejection, no later cleverness is +needed to preserve it — the hot path is exact integer or double +accumulation all the way out. If the design here falls short, no later +cleverness can recover it. So before touching the code, this chapter builds +the minimum filter-design theory a C++ reader actually needs — which is +less than a DSP course and different from one — and then spends its pages +where the textbooks stop: on the iteration cap, the clamp, the +normalization constant, and the compile-time-versus-runtime decision that +the textbooks never had to make. + +## The problem: evaluate a signal between its samples + +The converter's core operation (next chapter) is a *fractional delay*: given +the last `T` input samples of a signal, produce its value at a position μ +that falls between two of them. Sampling theory says this is not guesswork. +A signal sampled at rate `fs` with no content above `fs/2` is *completely +determined* between its samples; the reconstruction is + +```text +x(t) = Σₙ x[n] · sinc(t − n), sinc(u) = sin(πu) / (πu) +``` + +— every sample contributes a sinc centered on itself, and the sum +interpolates exactly. The `sinc` in this file is that function, with the +one hazard a numeric programmer would expect handled explicitly: + +```cpp +{{#include ../../../include/srt/detail/kaiser.hpp:kai_sinc}} +``` + +(The 0/0 at x = 0 is a *removable* singularity — the limit is 1 — but IEEE +arithmetic doesn't take limits, so the code must.) + +The catch is the `Σₙ`: it runs over **all** samples, and sinc decays like +1/t, which is uselessly slow. Truncating the sum to a window of `T` samples +around the evaluation point is mandatory. How you truncate is the entire +design problem. + +## Why plain truncation rings + +Chopping the sinc off after T samples is the same thing as multiplying the +ideal infinite filter by a rectangular window. 
Multiplication in time is +convolution in frequency, so the ideal filter's perfectly sharp frequency +response gets smeared by the rectangle's spectrum — and the rectangle's +spectrum is awful: its sidelobes start at −13 dB and buy you a stopband of +only about −21 dB. Worse, this is a *shape* problem, not a *size* problem. +Doubling T squeezes the smearing into a narrower band (the transition +sharpens) but the first sidelobe stays at the same level — the Gibbs +phenomenon. A truncated sinc leaks images at −21 dB whether it has 12 taps +or 12,000, and −21 dB is roughly the error of cheap linear interpolation. +For a 120 dB budget, truncation alone is off by five orders of magnitude. + +The fix is to taper instead of chop: multiply the sinc by a window that +falls smoothly to zero at the edges. Every smooth window trades the same +two currencies — a wider main lobe (slower transition, so more taps for the +same band edges) buys lower sidelobes (deeper stopband). The question is +only which window spends the taps most efficiently. + +## The Kaiser window, and what to cite + +James Kaiser's answer (Kaiser 1974; the survey that made it standard +practice is harris 1978) is the *I₀–sinh* window, + +```text +w[i] = I₀(β · √(1 − u²)) / I₀(β), u ∈ [−1, 1] across the window, +``` + +where I₀ is the zeroth-order modified Bessel function. It is a closed-form +approximation to the *prolate spheroidal* window — the provably optimal +concentration of energy in the main lobe — that costs one special function +instead of an eigenvalue problem. Its virtue for engineering is the single +knob: **β alone sets the sidelobe level**, continuously, from rectangular +(β = 0) to arbitrarily deep, and Kaiser published empirical formulas mapping +a stopband spec in dB directly to β and to the filter length. No iteration, +no optimization run, no table lookup: attenuation in, coefficients out. 
+ +That is the theory, and this book will not re-derive it — Kaiser's paper +and harris's survey do it properly. What they do *not* tell you is how to +evaluate I₀ in a `noexcept` header without a math library that provides it, +what happens to the length formula when a caller hands it garbage, why the +normalization constant is `L` and not 1, or whether any of this should run +at compile time. That is the rest of this chapter. + +## `besselI0`: a power series with an escape hatch + +`<cmath>` has no I₀ (`std::cyl_bessel_i` exists in the special-functions +annex, but it is optional, absent from libc++, and this library targets +toolchains as odd as hexagon-musl). So the file computes it from the power +series + +```text +I₀(x) = Σₖ [ (x/2)ᵏ / k! ]² +``` + +which converges for every finite x: + +```cpp +{{#include ../../../include/srt/detail/kaiser.hpp:kai_besseli0}} +``` + +Three details carry all the engineering. + +**The recurrence.** Each term is the previous term times `(x/2k)²` — no +factorials, no powers, no overflow staging. Term k relates to term k−1 by +exactly the ratio `r²`, computed in two multiplies. For the β values this +library ever produces (about 12.3 for the 120 dB preset, 14.5 for 140 dB) +the terms grow until k ≈ x/2 ≈ 6 and then collapse factorially; a few dozen +terms reach full double precision, matching the "~50-term" budget the +file's header comment charges against constexpr evaluation. + +**The stopping criterion.** `term < 1e-21 * sum` stops when the next term +can no longer perturb the sum's 16 significant digits — a *relative* test, +so it is correct whether I₀ is 1.0001 or 10⁴ (it is about 19,000 at β = 12). +The margin below double epsilon (≈ 2.2·10⁻¹⁶) costs a handful of extra +iterations and removes any temptation to reason about rounding at the +boundary.
+ +**The iteration cap — the line a textbook would not print.** The loop bound +`k < 1000` looks redundant: the series *always* converges, so the relative +test *always* fires eventually. For every real x, yes. Feed the function a +NaN — say, from an uninitialized config field three call frames up — and +every comparison involving `term` is false, including the exit test. An +unbounded loop in a `noexcept` function would hang the caller's constructor +forever. With the cap, the worst case is a garbage return value that the +converter-level validation (chapter after next) rejects anyway. The cap is +not about convergence; it is about making *termination* independent of +floating-point semantics. This costs one integer compare per iteration and +turns an unprovable property into a checkable one. + +The unit test pins the function against reference values computed +independently (`besselI0(1.0) = 1.2660658777520084…`), at tolerances that +scale with the magnitude — 10⁻¹² absolute near 1, 10⁻⁶ near 19,000 — i.e. +constant *relative* accuracy, which is what the window formula's ratio +`I₀(β·…)/I₀(β)` actually consumes. + +## `kaiserBeta`: an empirical fit, taken as published + +```cpp +{{#include ../../../include/srt/detail/kaiser.hpp:kai_beta}} +``` + +This is Kaiser's published fit, digit for digit — `0.1102`, `0.5842`, +`0.07886` are his constants, not this library's, and the right response to +magic numbers with a citation is to copy them exactly and test them exactly +(the unit test asserts the formulas symbolically, so a typo in a constant +cannot hide). Two things are worth understanding rather than memorizing: + +- **Why piecewise.** The relationship between β and achieved attenuation is + smooth but not polynomial; Kaiser fit it in two regimes. Above 50 dB the + relationship is essentially linear. Between 21 and 50 dB the fractional + power term takes over. 
Every preset this library ships (96–140 dB) lives + on the first line; the second exists so that off-spec experiments degrade + gracefully instead of nonsensically. +- **Why zero below 21 dB.** A rectangular window — no taper at all — + already achieves about 21 dB. Asking the fit for less than the free + floor correctly returns "don't taper." + +## `estimateTaps`: the cost formula, with a seatbelt + +β sets the stopband *depth*; the number of taps sets how fast the response +can *fall* into it. Kaiser's length estimate (the form popularized by +harris) says taps scale linearly with attenuation and inversely with +transition width: + +```cpp +{{#include ../../../include/srt/detail/kaiser.hpp:kai_estimate}} +``` + +Note what the signature normalizes to: transition width *as a fraction of +the input rate*, and the return is taps *per polyphase branch*. The full +prototype (next section) has `L·T` taps at an oversampled rate of `L·fs`; +run the classic formula at that rate and both numerator and denominator +pick up the same factor of L, which cancels. Expressing the estimate per +branch keeps the caller's arithmetic in the units the caller actually has — +"8 kHz of transition at 48 kHz" — with no L in sight. + +Plug in the `balanced()` preset: 120 dB across a 20→28 kHz transition at +48 kHz gives `(120 − 8) / (2.285 · 2π · 8000/48000) ≈ 46.8`, so 47 taps; +the unit test (`Kaiser.TapEstimateMatchesHarrisFormula`) brackets exactly +this computation at 45–49, and the shipped preset says `tapsPerPhase = 48` +— the estimate rounded up to an even count (even matters later: the SMLALD +kernel on Cortex-M33-class parts consumes taps in pairs). This function is +how the presets were *chosen*; the bank itself takes `T` from the spec, so +the estimate is a design aid with a unit test rather than a hot dependency. + +Then there is the comment at the top of the body, which earns its own +paragraph because it was not in the first version of this file. 
The raw +formula misbehaves at both edges of its domain: `attenDb < 8` makes the +numerator negative, and a zero or negative transition width divides to +±infinity. Both would then hit `static_cast<std::size_t>` — and converting +a negative or non-finite `double` to an unsigned integer is **undefined +behavior** in C++, not "some big number." Not implementation-defined: +undefined, the kind UBSan flags and optimizers exploit. An adversarial +audit of the library flagged the cast; the guard was added in response. The +predicate is written `!(transWidthNorm > 0.0)` rather than +`transWidthNorm <= 0.0` deliberately — the negated form is also true for +NaN, so all three pathologies (negative, zero, NaN) funnel into the same +clamp, and the attenuation edge is covered by the `n > 4.0` select on the +other side. The floor of 4 taps is the smallest window the bank will accept. +A design helper this cheap has no business having *any* input that invokes +UB, however silly the input. + +## `designPrototype`: where all of it lands + +```cpp +{{#include ../../../include/srt/detail/kaiser.hpp:kai_prototype}} +``` + +One pass, one output array, but four decisions are packed into these lines. + +**The grid.** The prototype is the windowed sinc sampled `L` times per +input sample — `t = (i − center) / numPhases` is time measured in *input* +samples. This is the oversampled master filter that the next chapter slices +into L branches; length `L·T` means 4,096 doubles for `fast()`, 12,288 for +`balanced()`, 40,960 for `transparent()`. `center` places the peak exactly +mid-array, which makes the filter linear-phase by symmetry — its group +delay is a constant `T/2` input samples, the number the converter's latency +formula quotes. + +**The window argument, defensively.** `u` sweeps [−1, 1] across the array +and feeds `√(1 − u²)`.
At the exact endpoints `1 − u²` is zero in real +arithmetic but can round a few ulps *negative* in floating point, and +`std::sqrt` of a negative is NaN — one NaN tap would silently poison every +dot product that ever touches that row. The `std::max(0.0, …)` costs +nothing and closes the hole. (Notice the theme: this file trusts +floating-point identities nowhere — not in `sinc`, not in the series exit, +not here.) + +**What `cutoffNorm` means, and its surprising value.** The cutoff is +normalized so 1.0 sits at the *input* Nyquist, and the caller centers it in +the transition band: `(passbandHz + stopbandHz) / fs`. For the balanced +preset that is (20,000 + 28,000)/48,000 = **exactly 1.0** — the −6 dB point +of this anti-imaging filter sits *at* 24 kHz, with the response still flat +at 20 kHz and 120 dB down by 28 kHz. A reader trained on decimation filters +may flinch: doesn't a cutoff at Nyquist let aliasing through? No — this +filter's job is *interpolation* in a near-unity converter. The images it +must kill are reflections of the input spectrum around `fs`, so content +below 20 kHz images no lower than 28 kHz; the band between 20 and 28 kHz +contains, by construction of the spec, nothing anyone claimed to protect. +Splitting the transition symmetrically across Nyquist spends the taps where +they buy audible margin on both sides. This is the first of several places +where "near-unity only" (the library's headline restriction) converts +directly into cheaper mathematics. + +**The normalization: sum = L, not 1.** A textbook lowpass normalizes its +coefficient sum to 1 so DC passes at unity gain. This prototype normalizes +to `L` — because no output sample is ever computed with the whole +prototype. Each output uses one branch of `T` taps: every L-th coefficient. 
+The L branch sums partition the total, and for a good lowpass they +partition it *evenly* — each branch's DC gain deviates from the mean only +by stopband-sized leakage (a branch sum is, in DFT terms, the prototype's +response sampled at multiples of the input rate: exactly the image +frequencies the stopband suppresses). Normalize the total to L and every +branch lands at 1 ± leakage; feed the converter DC and DC comes out, at any +fractional position. That is not left to inspection: +`Polyphase.DcGainIsUnityAcrossMu` pushes an all-ones window through the +*built* bank at 64 random μ values and requires unity within 10⁻⁴ — a +bound loose enough to admit float coefficient storage and row blending, +tight enough that a normalization bug (off by one branch, off by a factor +of L) fails by orders of magnitude. One subtle consequence lands two +chapters from now: with branch gains pinned near 1, the *peak* coefficient +also sits near 1.0, which is precisely why the fixed-point formats must +spend a headroom bit (Q1.14, Q1.30) on their coefficients. + +## The headline decision: runtime design, not `constexpr` + +Everything above is pure functions of compile-time-lookable values — and +this is C++20, where `constexpr` has teeth. The obvious modern move is to +evaluate the whole design at compile time: coefficients in `.rodata` +(attractive on a flash-based microcontroller), zero construction cost, even +`static_assert`s on the response. The file's own header records why that +was rejected, and since the reasoning is a design artifact it is kept where +refactors will trip over it: + +```cpp +{{#include ../../../include/srt/detail/kaiser.hpp:kai_design_note}} +``` + +Present the alternative fairly, because it *almost* works: + +- **The language isn't there yet.** `std::sin`, `std::sqrt`, `std::pow` + are not `constexpr` before C++26 (P1383 fixes this). 
A C++20 constexpr + design needs hand-rolled constexpr transcendentals — several hundred + lines of the most bug-prone code in numerics, duplicating functions the + runtime already has, in a library whose entire test story leans on + comparing against exactly those runtime functions. +- **The compile-time cost is not a rounding error.** Constexpr evaluation + is interpretation, three to four orders of magnitude slower than native + code. The design touches every one of 12K–41K taps with a `sin`, a + `sqrt`, and a ~50-term Bessel series. What runs in well under 10 ms + native becomes tens of seconds to minutes interpreted — **per translation + unit**, because a header-only library re-instantiates in every TU that + includes it. A user with twenty includes pays twenty times, on every + rebuild, forever. +- **The inputs are not actually compile-time.** The band edges are scaled + by the *runtime* sample rate (`FilterSpec::scaledTo`, + `Config::forSampleRate`) — a converter constructed for a rate read from + an ALSA descriptor at startup cannot have baked coefficients at all. A + constexpr path would be a second, divergent code path serving only the + subset of users with fully static configs. + +Against all that, the runtime cost being amortized is: one design, under +10 ms, in a constructor documented as setup-time-only, off the audio path +by the library's own RT rules. The trade is lopsided once written down — +but only once written down, which is why the file writes it down. (If +C++26 constexpr math plus a measured compile-time budget someday flips the +trade for static configs, the pure functions here are already shaped for +it: no state, no allocation, `std::span` in, coefficients out.) + +## The test evidence: the spec, measured by DFT + +A filter design module invites a lazy test — "coefficients equal last +week's coefficients." That freezes bugs in amber. 
What the library pins +instead is the *specification*: `tests/test_kaiser.cpp` computes the +prototype's actual frequency response by direct DFT and asserts the numbers +the presets advertise. + +The measurement function evaluates `|H(f)|` at arbitrary frequencies in Hz +against the oversampled prototype (rate `L·fs`), normalized by L so the +passband reads 0 dB — a direct O(n) sum per frequency. No FFT: an FFT +would demand a power-of-two grid, deliver frequencies nobody asked for, +and drag in a dependency, all to accelerate a few hundred evaluations in a +test that runs in milliseconds. Then, for each shipped preset: + +- **Passband flatness:** every 500 Hz from DC to the passband edge, + response within ±0.01 dB of unity. That is the "flat to 20 kHz" claim in + the README, as an executable inequality. +- **Stopband depth:** every 250 Hz from the stopband edge out to *three + times the sample rate*, response below −(spec − 1) dB. The 3·fs reach + matters: the polyphase structure's images repeat around every multiple + of fs, so a stopband that sagged past the first image would pass junk at + 96 kHz even if 28 kHz looked fine. The 250 Hz step is calibrated to the + filter, not guessed: a T-tap-per-branch prototype has sidelobe nulls + spaced fs/T ≈ 1 kHz apart, so 250 Hz sampling puts about four probes on + every lobe — a peak cannot hide between probes. The 1 dB grace absorbs + the gap between Kaiser's empirical β fit and the realized window; the + presets' 120 means "at least 119 measured," and in practice the margin + is comfortable. + +Honest limits, as always: these tests certify the *double-precision +prototype*. Coefficient quantization (float, Q1.14, Q1.30) and the +row-blending residual are downstream effects certified by the next +chapter's tests and the end-to-end SNR suite — the layering is deliberate, +so a failure names its culprit. 
+ +## Why these ~100 lines look the way they do + +| Decision | Alternative rejected | Reason | +|---|---|---| +| Kaiser window | Parks–McClellan / remez | one β knob, closed form, no iteration to converge or fail at setup; near-optimal is optimal enough at 120 dB | +| Power-series I₀ | `std::cyl_bessel_i` | optional annex, missing on libc++/embedded toolchains; the series is 12 lines and testable | +| Iteration cap `k < 1000` | trust convergence | NaN input defeats the relative-error exit; termination must not depend on FP semantics in a `noexcept` function | +| UB clamp in `estimateTaps` | trust callers | negative/infinite → `size_t` cast is UB; found by audit, closed for one branch | +| Cutoff centered in transition, up to input Nyquist | classic conservative cutoff | near-unity interpolation only fights images of the protected band; symmetric transition spends taps evenly | +| Normalize sum to L | sum to 1 | per-*branch* DC gain is what reaches the output; pinned by the DC unit test | +| Runtime design | C++20 constexpr tables | pre-C++26 constexpr math gap; minutes of interpreted evaluation per TU; runtime sample rates exist; <10 ms once at setup | +| Spec-based DFT tests | golden coefficient files | tests the claim, not the bits; refactors that preserve the response pass | + +## Verify it yourself + +```sh +# Build and run the design-math tests: Bessel/beta reference values, the +# harris estimate bracket, and the DFT passband/stopband spec checks for +# all three presets: +cmake -B build && cmake --build build -j +ctest --test-dir build -R Kaiser --output-on-failure + +# The claim the normalization exists to protect (unity DC gain through the +# built bank, swept over mu): +ctest --test-dir build -R Polyphase.DcGain --output-on-failure + +# Break it on purpose: in designPrototype, change the normalization to +# `1.0 / sum` (the textbook choice) and watch DcGainIsUnityAcrossMu fail by +# a factor of numPhases; or weaken kaiserBeta's 0.1102 to 0.11 and watch 
+# the Transparent stopband check report the exact frequency that leaks. +``` + +Both sabotage runs are worth the five minutes: the first shows you which +test owns the normalization contract, and the second shows the empirical β +fit has no slack at 140 dB — which is precisely why the constants are +copied from Kaiser 1974 to the last digit. diff --git a/book/src/part1/polyphase-bank.md b/book/src/part1/polyphase-bank.md index 415f083..ef3292a 100644 --- a/book/src/part1/polyphase-bank.md +++ b/book/src/part1/polyphase-bank.md @@ -1,3 +1,354 @@ -# polyphase bank +# The polyphase bank -*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* +The previous chapter ended with a prototype filter: 12,288 double-precision +coefficients (for the default preset) describing one ideal anti-imaging +lowpass, oversampled 256× against the input rate. This chapter is about a +data structure. Per output sample, the converter's budget is one dot +product of 48 multiply-accumulates — not 12,288 — and the fractional +position μ arrives with 2⁻⁶⁴-sample resolution, demanding a filter for a +delay the table cannot possibly enumerate. `PolyphaseFilterBank` is the +arrangement of those 12,288 numbers that makes the right 48 of them, for +*any* μ, a matter of two pointer offsets and a linear blend. Almost +everything interesting about it is in the layout: one extra row nobody +asked for, every row stored backwards, and a table that no code path may +touch after its constructor returns. + +## The decomposition: L filters hiding in one + +Recall what the prototype is: the windowed sinc sampled on a grid of 1/L +input samples, `L·T` taps long. 
Evaluating the input signal at a position +`p/L` between samples means dotting the T input samples in the window +against the sinc *offset by p/L* — which, on the prototype's grid, is +simply every L-th coefficient starting at p: + +```text +branch 0: h[0], h[L], h[2L], … h[(T−1)L] delay 0 +branch 1: h[1], h[L+1], h[2L+1] … delay 1/L sample +branch p: h[p], h[L+p], h[2L+p] … delay p/L sample +branch L−1: … delay (L−1)/L +``` + +That is the entire polyphase decomposition for this use case — no z-domain +identities required. One oversampled filter *is* L ordinary T-tap filters +interleaved, each a fractional-delay filter for one grid position. Nothing +is computed to "decompose" it; the bank merely copies the prototype into a +`(rows × T)` table so that each branch's taps — which are strided L apart +in the prototype — become contiguous in memory, because the dot product +will read them T-at-a-time, millions of times, and the prototype order +would stride the cache to death. The classic references derive this +structure for rational resamplers (it is also how commercial ASRC silicon +like the AD1896 organizes its ROM); here it is simpler, because near-unity +operation means each output needs exactly *one* branch evaluation — the +question is only which branch, and what to do between branches. + +## Between the branches: why L = 256 and a linear blend + +μ is a 64-bit fraction; the table has L rows. Rounding μ to the nearest +row would quantize the delay to 1/L of a sample, and delay quantization on +a moving signal is *noise* — worse at high frequencies, where a fixed time +error subtends more phase. The bank's answer is the standard one at this +quality tier: pick the two rows adjacent to μ·L and interpolate the +*coefficients* linearly between them. 
The residual error of that blend is +the quality knob the spec exposes: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:bank_spec}} +``` + +The comment's two slopes are the design law for choosing L, and they are +measured properties of this code, not folklore (the README derives its +quality table from the test suite): the blend residual falls **about 12 dB +for every doubling of L** — linear interpolation has second-order error, +so halving the grid step quarters the error — and rises **about 12 dB per +octave of signal frequency**, because coefficient interpolation error acts +like a second derivative and high frequencies bend faster. You can see the +frequency slope directly in the shipped numbers for `balanced()` +(L = 256): 135 dB SNR at 997 Hz, 120 dB at 6 kHz, 112 dB at 12 kHz, 105 dB +at 19.5 kHz — once the signal frequency is high enough for the blend +residual to dominate, each octave costs roughly the predicted 8–12 dB. The +unit tests pin the same staircase at the kernel level, single tones against +the analytic sine: worst-case error below −120 dB at 997 Hz, −110 dB at +4 kHz, −100 dB at 10 kHz, −90 dB at 19 kHz. + +Why not simply crank L and skip the blend? Cost. Nearest-row lookup has +*first*-order error — about 6 dB per doubling — so matching the blend's +accuracy at 19.5 kHz would take L in the hundreds of thousands and a table +in the hundreds of megabytes. With the blend, `balanced()` is +(256 + 1) × 48 float coefficients ≈ 48 KB — resident in L2, arguably L1, +on hosts, and tolerable in MCU RAM at Q15 (≈ 24 KB). `transparent()` +doubles L *and* stretches T for ≈ 160 KB in float, buying its extra margin +mostly at the top of the band (108 dB vs 105 dB at 19.5 kHz measured end to +end). Why not a fancier blend — cubic across four rows? 
It would double +the coefficient traffic and the blend arithmetic in the innermost loop the +library owns, to fix the *highest-frequency* residual only; L = 256 already +puts that residual below the 105 dB the rest of the chain sustains. The +linear blend is the cheapest operation that keeps the table small and the +error second-order; everything faster is worse, everything better is not +needed at this budget. + +## The extra row: L + 1 rows for an L-phase filter + +Here is the file's cleverest line, and it is a line of *allocation*, not of +algorithm: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:bank_layout}} +``` + +The problem it dissolves: blending needs rows `p` and `p + 1`. For +p = 0 … L−2 both exist. At **p = L−1** the blend wants "row L" — the +branch for a delay of exactly one whole sample. Modular thinking says row L +"is" row 0, and arithmetically it is — *for a different window*. Branch 0 +is a delay of zero against the current window; the position μ → 1 is a +delay of one, which equals a delay of zero against the window advanced by +one input sample. Using row 0 against the *current* window would be wrong +by exactly one sample — not subtly wrong: it would blend the correct filter +with a copy of the signal shifted a full sample, an error at signal level. + +The conventional fixes are all branches. Detect p = L−1 and handle the +wrap specially — a data-dependent branch in the per-sample path, taken at +the beat frequency between the two crystals (at 200 ppm, about ten times a +second), which is also precisely the moment the resampler executes a +whole-sample slip, the most delicate step it performs. Or clamp μ short of +1.0 and accept a periodic discontinuity — a spur at the beat frequency, +in a library chasing 120 dB. + +The bank's fix: **store row L explicitly, as branch 0 advanced by one input +sample**. 
It falls out of the construction loop with no special case: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:bank_build}} +``` + +Follow the index math for `p == phases_`: the prototype index is +`m = t·L + L = (t+1)·L` — branch 0's tap `t + 1`. So row L holds branch 0's +coefficients shifted one *tap*, i.e. one input sample; the final tap +(`m = T·L`) falls off the prototype's end and the `(m < n)` guard writes a +zero. Row L computed against the current window is *identically* branch 0 +computed against next window. The consequences, in the order they matter: + +- **Branch-free interpolation.** `interpolate()` may always read + `phase(p)` and `phase(p + 1)` for any p ≤ L−1. No modulo, no compare, no + special case — the hot loop's structure is independent of μ. +- **Exact continuity at the μ-wrap.** As μ → 1 the blend converges to pure + row L; the whole-sample slip then advances the window and resets μ to 0, + where pure row 0 takes over — and those two evaluations are the same + arithmetic on the same samples. The seam has *zero* width: not "small + error," but bit-level agreement of the limits from both sides, up to the + one blend step of the approach. + +Neither property is left to prose. `Polyphase.ExtraRowEqualsPhaseZeroAdvancedOneTap` +asserts the layout claim coefficient by coefficient — `phase(L)[0] == 0` +and `phase(L)[u] == phase(0)[u−1]` with `EXPECT_EQ`, exact equality, no +tolerance, because the construction loop is supposed to make them the +*same numbers*, not similar ones. `Polyphase.MuWrapIsContinuousWithWindowShift` +then asserts the consequence at the semantic level: `interpolate(hist, +μ → 1)` equals `interpolate(hist + 1, μ = 0)` on random data — the +whole-sample-slip invariant the resampler (two chapters from now) leans on +every time the crystals drift one full sample apart. The cost of all this: +48 extra coefficients — 192 bytes in float — and one `+ 1` in a `resize()`. 
+It is the best byte-per-correctness trade in the library. + +## Rows stored backwards + +The second line of that layout comment: rows are **tap-reversed**. +Convolution is inherently a reversal — output = Σ h[k] · x[now − k] — so +either the coefficient array or the history walk must run backwards. +The resampler keeps each channel's history as an *oldest-first* window +(natural for its append-and-compact delay line, and the friendly direction +for hardware prefetchers). Storing each row reversed at construction — +`table_[p·T + (T−1−t)]` — lets the kernel be the loop every SIMD unit +wants: + +```text +for t in 0…T−1: acc += hist[t] · row[t] +``` + +both arrays walked forward, contiguously, from element zero. The +reversal is paid once per converter at build time instead of once per +sample as backwards addressing, and the payoff is documented downstream in +this book's optimization chapters: the auto-vectorized Q15 kernels, the +SMLALD pair-loads on Cortex-M33 (which require adjacent taps to sit in +ascending order in one 32-bit load), and the `SRT_RESTRICT` blend loop all +assume exactly this orientation. One subtlety the test above already +banked: "advanced one tap" for the reversed row L means shifted one slot +*toward the newer end*, which is why the zero lands in slot 0 (the oldest) +— the kind of double-negation a comment can state but only an `EXPECT_EQ` +can enforce. + +## Quantization happens here, once + +The table's element type is not `double` — it is +`SampleTraits::Coeff`, and the constructor's `makeCoeff(v)` is the +single point where the design-precision prototype becomes datapath +coefficients. Quantizing once at build time, rather than converting on the +fly, means the hot path reads exactly what it dots and the quantization +error is a fixed property of the constructed object, measurable by the +tests rather than dependent on the code path taken. 
+ +What each sample type stores (the full traits treatment is the next +chapter; here is what the *bank* needs you to know): + +- **float** stores float coefficients: quantization at roughly −150 dB + against the double prototype — comfortably irrelevant under a 120 dB + target, which is why the float path's quality tests read the same as the + design spec. +- **Q15 and Q31 store Q1.14 and Q1.30**, not Q0.15/Q0.31 — one bit of + headroom spent because of a fact the *previous* chapter created: the + prototype is normalized so each branch has DC gain 1, which puts the + peak (center) tap at ≈ 1.0, and 1.0 does not fit a pure fractional + format whose ceiling is 1 − 2⁻¹⁵. Rather than rescale the filter (and + move the problem into output gain), each fixed-point format trades its + top precision bit for range. `makeCoeff` rounds half-away-from-zero and + saturates, so even a tap of exactly 1.0000…1 from design rounding + becomes the format's max instead of wrapping to −1 — a wraparound there + would be a −∞ dB event, not a noise-floor one. + +The bank is thus one template with three concrete personalities, and the +table *is* the personality: same layout, same extra row, same reversal, +different arithmetic downstream. + +## Validation in two layers, and the all-NaN table + +The constructor rejects what it can see is nonsense: a non-positive sample +rate, fewer than 4 taps, fewer than 2 phases, inverted or out-of-range band +edges — throwing `std::invalid_argument` at setup time, where exceptions +are allowed and cheap. This is necessary and insufficient, and the gap +between those two words is an audit story worth retelling precisely. + +Every check in the constructor is a comparison. Feed the converter a +`Config` whose `sampleRateHz` is NaN — one uninitialized field in caller +code — and every comparison is *false*: `sampleRateHz <= 0.0`? False. +`stopbandHz > sampleRateHz`? False. 
The constructor sails through, +`cutoffNorm` goes NaN, `designPrototype` dutifully computes 12,288 NaN +coefficients (recall the previous chapter: the Bessel iteration cap exists +so even *this* terminates), and the object constructs successfully. The +converter then runs, produces NaN audio forever, and never throws, never +asserts, never glitches in a way a log would catch. The adversarial audit +of the library built exactly this object (finding F2); the fix is the +converter-level `validated()` gate, which enforces what the bank's local +comparisons cannot express: + +- **finiteness of every double in the config** — the only guard NaN cannot + slip, because it is `std::isfinite`, not an ordering; +- **the band-edge sum rule**: `passbandHz + stopbandHz ≤ sampleRateHz`. + The bank alone accepts `stopbandHz` up to the sample rate, but the + cutoff is *centered* at `(pass + stop)/fs` — let the sum exceed fs and + the anti-image cutoff lands above the input Nyquist, a filter that + passes the very images it exists to kill, while every local check still + passes; +- plus the servo's eps-overflow clamp and 32-bit size-product overflow, + which belong to later chapters. + +All of it is pinned by `ConfigValidation.RejectsSilentMisbehavior` — each +formerly-constructible pathology now `EXPECT_THROW`s — and, just as +deliberately, by two `EXPECT_NO_THROW`s: the rate-scaling factory +`Config::forSampleRate` produces specs sitting *exactly on* the sum-rule +boundary (passband + stopband == fs up to rounding), and a validation rule +that rejected its own library's presets would be a different bug. The +division of labor is a pattern to copy: the class rejects what it can +express *locally*; the composition layer owns the invariants that only +exist between components; and every rejected configuration is one a real +caller could plausibly write. 
+ +## C++ notes: immutability, `bit_ceil`, and the accessors + +**Immutable after construction — as architecture, not style.** The class +has no mutating member functions; every accessor is `const noexcept`. This +buys three unrelated things at once. *Thread safety by subtraction*: the +bank is built on the setup thread and read from the real-time consumer +thread; with no writes after publication there is nothing to synchronize — +the ring buffer chapter's acquire/release agonies simply do not apply to +this object. *RT discipline*: the only allocation is in the constructor, +which the header explicitly assigns to setup time; the audio path holds a +`const` pointer and cannot even express a reallocation. *Exception +containment*: everything that can throw (`bad_alloc`, +`invalid_argument`) throws before the object exists, so a constructed bank +is unconditionally valid — there is no half-designed state for the hot +path to trip over. + +**`std::bit_ceil` for L.** The constructor rounds `numPhases` up to a +power of two rather than validating it, and the reason lives in the +resampler's fast path: the Q0.64 phase accumulator selects the row by +taking the top log₂ L bits of a 64-bit fraction — one shift — and the +intra-row blend factor from the bits below — one more shift. That indexing +scheme *requires* a power-of-two L; `bit_ceil` (C++20, ``, exact and +self-describing where the old `1 << ceil(log2(n))` dance was neither) +guarantees it while giving any spec at least the resolution it asked for. +Rounding up rather than throwing is deliberate policy: more phases is +strictly better along the quality axis, so a spec of 200 phases quietly +becomes 256 rather than a setup error. The same power-of-two guarantee is +what lets `blendRowPhase` recover log₂ L with `std::countr_zero` instead +of storing it. 
+ +**The accessor surface is four functions, and their shapes are load-bearing:** + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:bank_accessors}} +``` + +`phase(p)` returns a raw `const Coeff*`, not a `std::span` — the kernels +consume rows through `SRT_RESTRICT`-qualified pointer parameters (that +no-alias promise is worth measured percentage points; see the +vectorization-audit chapter), and a span would be unpacked back to a +pointer at every call site while implying a bounds story the hot path +cannot afford to check. The domain quietly includes `p == numPhases()` — +the extra row is a first-class citizen of the API, which is exactly how +`interpolate()` gets to be branch-free: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:bank_interpolate}} +``` + +Note the one guard that *does* exist — clamping `p` when μ rounds up to +exactly L — protects against a floating-point edge of the *caller's* μ, +not of the table; and `groupDelaySamples()` reports `(L·T − 1)/(2L)`, the +true center of the linear-phase prototype in input samples, which is +"T/2" only to the resolution of the 1/(2L) half-step that the kernel +accuracy tests must account for when they compute the expected analytic +delay. The bank knows its own delay exactly; approximations are for prose. 
+ +## Why this table looks the way it does + +| Decision | Alternative rejected | Reason | +|---|---|---| +| Contiguous T-tap rows per branch | dot the strided prototype directly | the kernel reads rows millions of times; stride-L access wastes the cache the table was sized to fit | +| Linear blend between adjacent rows | nearest row; cubic blend | nearest needs astronomically large L (first-order error); cubic doubles hot-loop work to fix a residual already below the chain's floor | +| L = 256 default | 128 / 512 | −12 dB residual per doubling vs table size; 48 KB meets the 105 dB @ 19.5 kHz budget; presets bracket it both ways | +| **Extra row L** | wrap to row 0 + branch; clamp μ | branch-free hot loop; μ-wrap/whole-sample slip exactly continuous; costs 192 bytes | +| Tap-reversed rows | reversed iteration per sample | reversal paid once at build; forward contiguous dot is what vectorizers and SMLALD pair-loads require | +| Quantize via `makeCoeff` at build | convert coefficients on the fly | error becomes a fixed, testable property of the object; hot path reads storage type directly | +| Q1.14 / Q1.30 coefficients | Q0.15 / Q0.31 | peak tap ≈ 1.0 by DC normalization; headroom bit beats wraparound at the table's largest value | +| Throw in constructor + converter `validated()` | validate in one place | the class can only check local comparisons; NaN defeats comparisons — finiteness and the band-edge *sum* rule are composition-level invariants (audit F2) | +| Immutable after construction | resettable/redesignable bank | cross-thread reads need no sync; allocation and throws confined to setup; no invalid intermediate states | +| `std::bit_ceil(numPhases)` | reject non-power-of-two | phase-bit row indexing requires 2ᵏ; rounding up is strictly quality-positive | +| Raw `const Coeff*` accessor | `std::span` row | kernels take restrict pointers; span adds implied checking the per-sample path cannot spend | + +## Verify it yourself + +```sh +# Build, then run this 
chapter's direct evidence: DC gain across mu, the +# extra-row layout equality, the mu-wrap continuity invariant, and the +# fractional-delay error staircase for balanced and transparent: +cmake -B build && cmake --build build -j +ctest --test-dir build -R Polyphase --output-on-failure + +# The audit's rejected-config suite (NaN rate, image-passing band edges), +# including the boundary cases that must keep constructing: +ctest --test-dir build -R ConfigValidation --output-on-failure + +# The end-to-end SNR numbers the L=256 decision is quoted against +# (997 Hz / 6 k / 12 k / 19.5 k, both presets, servo in the loop): +ctest --test-dir build -R AsrcQuality --output-on-failure + +# Break it on purpose: in the constructor, change `p <= phases_` to +# `p < phases_` and resize to phases_ * taps_ (no extra row), then make +# interpolate wrap p+1 to 0. DcGain still passes — DC can't see a +# one-sample shift — but MuWrapIsContinuousWithWindowShift fails loudly, +# which is exactly the gap between "looks fine on steady signals" and +# "correct at the slip." +``` + +The sabotage run is the section on the extra row, compressed: the wrap bug +is invisible to the easiest test and to DC reasoning, and the suite was +built by someone who knew that. diff --git a/include/srt/detail/kaiser.hpp b/include/srt/detail/kaiser.hpp index e9ac8a3..011bd67 100644 --- a/include/srt/detail/kaiser.hpp +++ b/include/srt/detail/kaiser.hpp @@ -1,3 +1,4 @@ +// ANCHOR: kai_design_note /// \file kaiser.hpp /// \brief Kaiser-window FIR prototype design for the polyphase interpolation bank. /// @@ -8,6 +9,7 @@ /// minutes of compile time in every including translation unit. Runtime design /// takes well under 10 ms, runs once in a constructor, and is off the audio path, /// so all design math here is plain runtime double precision. 
+// ANCHOR_END: kai_design_note #ifndef SRT_DETAIL_KAISER_HPP #define SRT_DETAIL_KAISER_HPP @@ -18,6 +20,7 @@ namespace srt::detail { +// ANCHOR: kai_besseli0 /// Modified Bessel function of the first kind, order zero, by power series. /// Converges for all practical Kaiser betas (|x| < ~40); terms are added until /// they no longer contribute at double precision. @@ -34,7 +37,9 @@ inline double besselI0(double x) noexcept { } return sum; } +// ANCHOR_END: kai_besseli0 +// ANCHOR: kai_beta /// Kaiser window shape parameter for a given stopband attenuation in dB /// (Kaiser's published empirical fit). inline double kaiserBeta(double attenDb) noexcept { @@ -44,7 +49,9 @@ inline double kaiserBeta(double attenDb) noexcept { return 0.5842 * std::pow(attenDb - 21.0, 0.4) + 0.07886 * (attenDb - 21.0); return 0.0; } +// ANCHOR_END: kai_beta +// ANCHOR: kai_estimate /// Kaiser/harris FIR length estimate, expressed per polyphase branch. /// /// \param attenDb target stopband attenuation in dB @@ -59,7 +66,9 @@ inline std::size_t estimateTaps(double attenDb, double transWidthNorm) noexcept const double n = (attenDb - 8.0) / (2.285 * 2.0 * std::numbers::pi * transWidthNorm); return n > 4.0 ? static_cast(std::ceil(n)) : 4; } +// ANCHOR_END: kai_estimate +// ANCHOR: kai_sinc /// sin(pi x)/(pi x) with the removable singularity handled. inline double sinc(double x) noexcept { if (std::abs(x) < 1e-12) @@ -67,7 +76,9 @@ inline double sinc(double x) noexcept { const double px = std::numbers::pi * x; return std::sin(px) / px; } +// ANCHOR_END: kai_sinc +// ANCHOR: kai_prototype /// Designs the Kaiser-windowed sinc prototype lowpass for an L-phase /// interpolation bank. 
/// @@ -98,6 +109,7 @@ inline void designPrototype(std::span h, std::size_t numPhases, double c for (auto& v : h) v *= gain; } +// ANCHOR_END: kai_prototype } // namespace srt::detail diff --git a/include/srt/polyphase_filter.hpp b/include/srt/polyphase_filter.hpp index d9365dd..dad83c2 100644 --- a/include/srt/polyphase_filter.hpp +++ b/include/srt/polyphase_filter.hpp @@ -63,6 +63,7 @@ namespace srt { +// ANCHOR: bank_spec /// Specification of the interpolation prototype filter. /// /// numPhases (L) sets the polyphase table resolution: the residual images from @@ -94,6 +95,7 @@ struct FilterSpec { .stopbandHz = 26000.0, .stopbandAttenDb = 140.0}; } + // ANCHOR_END: bank_spec /// This spec with the band edges rescaled from the 48 kHz design rate /// to sampleRateHz. The presets' passband/stopband are absolute Hz @@ -111,6 +113,7 @@ struct FilterSpec { } }; +// ANCHOR: bank_layout /// Immutable polyphase coefficient table designed at construction. /// /// Storage layout: (L+1) rows of T coefficients. Row p in [0, L) is polyphase @@ -119,11 +122,13 @@ struct FilterSpec { /// and the mu wrap 1.0 -> 0.0 (window shifted by one sample) is exactly /// continuous. Rows are stored tap-reversed so the dot product runs forward /// over an oldest-first history window. +// ANCHOR_END: bank_layout template class PolyphaseFilterBank { public: using Coeff = typename SampleTraits::Coeff; + // ANCHOR: bank_build /// Designs the prototype (double precision) and builds the table. /// Allocates; may throw std::invalid_argument / std::bad_alloc. Do this at /// setup time, not on the audio path. @@ -150,7 +155,9 @@ class PolyphaseFilterBank { } } } + // ANCHOR_END: bank_build + // ANCHOR: bank_accessors /// Row pointer for phase p in [0, numPhases()]; T contiguous coefficients. 
const Coeff* phase(std::size_t p) const noexcept { return table_.data() + p * taps_; } std::size_t numPhases() const noexcept { return phases_; } ///< L @@ -160,6 +167,7 @@ class PolyphaseFilterBank { double groupDelaySamples() const noexcept { return static_cast(phases_ * taps_ - 1) / (2.0 * static_cast(phases_)); } + // ANCHOR_END: bank_accessors private: std::size_t phases_; @@ -167,6 +175,7 @@ class PolyphaseFilterBank { std::vector table_; // (L+1) x T, rows tap-reversed }; +// ANCHOR: bank_interpolate /// Evaluates one output sample at fractional position mu in [0, 1). /// /// \param hist oldest-first window of the newest T input samples of one channel @@ -192,6 +201,7 @@ inline S interpolate(const PolyphaseFilterBank& bank, const S* hist, double m acc = Tr::mac(acc, hist[t], Tr::blend(c0[t], c1[t], fr)); return Tr::finalize(acc); } +// ANCHOR_END: bank_interpolate /// Blends the two phase rows adjacent to mu into `row` (taps() entries). /// Multichannel datapaths do this once per output frame and then run From 2bd804133abb509f2fb7cf6650d7f74541fdf133 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 1 Jul 2026 21:36:16 +0000 Subject: [PATCH 06/16] book: the servo and the fractional resampler chapters Part I chapters for pi_servo.hpp (plant equation, PLL-matched gains, three-stage machine, both tuning war stories) and the FractionalResampler (Q0.64 phase, slip logic, dispatch, history management). Adds comment-only sv_*/rs_* ANCHOR markers; rs_slip deliberately co-brackets the p0_phase_step region since both chapters excerpt the same loop. 
https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9 --- book/src/part1/fractional-resampler.md | 447 +++++++++++++++++++++- book/src/part1/pi-servo.md | 505 ++++++++++++++++++++++++- include/srt/pi_servo.hpp | 16 + include/srt/polyphase_filter.hpp | 22 ++ 4 files changed, 986 insertions(+), 4 deletions(-) diff --git a/book/src/part1/fractional-resampler.md b/book/src/part1/fractional-resampler.md index 2024cee..0d0a395 100644 --- a/book/src/part1/fractional-resampler.md +++ b/book/src/part1/fractional-resampler.md @@ -1,3 +1,446 @@ -# fractional resampler +# The fractional resampler -*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* +The servo chapter ended with a number: ε̂, the rate-deviation estimate, +delivered once per output block. This chapter spends it. + +Somebody has to turn "consume 1.000 000 2 input frames per output frame" +into actual audio, forever, without drift, without glitches at the moments +the books balance, and within a per-sample cycle budget that must hold on +a Xeon and on a DSP with no double-precision FPU. That somebody is +`FractionalResampler`, the streaming engine at the bottom of +`polyphase_filter.hpp`. It owns three things: the **history** (the last T +input frames of every channel, kept where the filter can reach them), the +**phase** (where between two input samples the next output lands), and the +**slip logic** (what happens when the phase creeps across a whole-sample +boundary). + +The near-unity specialization shapes everything here. A general-ratio +resampler schedules different numbers of outputs per input and needs +control flow to match. At ±1000 ppm, the conversion degenerates into a +*creeping fractional delay*: one output per input, plus a fractional +position μ that drifts by parts per million per sample and occasionally — +every few thousand samples — crosses a boundary and forces the window to +slip by one frame. 
The steady state is metronomic; all the difficulty +concentrates into keeping μ exact over unbounded time and making the +slips invisible. Those two problems are this chapter. + +## The job, one output sample at a time + +The polyphase bank chapter built the table: L + 1 rows of T coefficients, +row p holding the FIR that interpolates a signal value p/L of the way +between two input samples. `interpolate()` evaluates one output at +fractional position μ ∈ [0, 1): + +1. Scale: `pos = μ · L`. The integer part picks the phase row p; the + fractional part `fr` says how far μ sits between row p and row p+1. +2. Blend: form `c[t] = c0[t] + fr · (c1[t] − c0[t])` across the T taps — + linear interpolation between adjacent rows, the trick that makes a + 256-row table act like a continuum (the residual falls ~12 dB per + doubling of L). +3. Dot: multiply the blended row against the oldest-first history window + of the newest T input samples and accumulate — in double for float + samples, int64 for fixed point. + +μ = 0 lands the output exactly on history sample T/2 − 1; μ → 1 +approaches sample T/2. And the μ wrap 1.0 → 0.0 — the whole-sample slip — +is exactly where the bank's extra row L pays off: row L equals row 0 +advanced by one input sample, so "μ reaches 1.0 on this window" and +"μ = 0.0 on the window shifted one frame" are *the same filter*, +bit-identically, with no branch. The slip machinery below leans on that +continuity; `Polyphase.MuWrapIsContinuousWithWindowShift` pins it. + +That is the whole kernel: blend, then dot. Roughly T multiply-adds of +blending plus T of dot product per output sample, and everything else in +this chapter is about doing it cheaper, more exactly, and for more +channels — without ever changing an output bit unintentionally. 
+ +## Sharing the blend: the C1 split + +The first optimization campaign result (Part III tells the full story; +`docs/PERFORMANCE.md` is the canonical record) started from an +observation you can make by reading the loop above: in a multichannel +converter, every channel of a frame is evaluated at the *same* μ. Calling +the fused `interpolate()` per channel recomputes an identical T-tap +coefficient blend N times per frame — for stereo, half the inner-loop +work is duplicate. + +The fix is to split the kernel at its natural seam: blend once per frame +into a scratch row, then run a plain dot product per channel. + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_dot_row}} +``` + +Two things about this function beyond its arithmetic. First, the comment +at the top is a *bit-exactness contract*: given the same μ, blend-then-mac +per tap in the same order is literally the same sequence of floating-point +(or integer) operations as the fused form, so the split changes no output +bit — and the C1 entry in `docs/PERFORMANCE.md` records "outputs unchanged +bit-for-bit" as a checked result, not a hope. This library treats +bit-exactness as the boundary between an optimization (free to ship) and +an algorithm change (needs its own quality evidence); you will see the +same distinction drawn twice more in this chapter. Second, the +`SRT_RESTRICT` qualifiers are C2's contribution: without them the +compiler versioned these loops behind runtime aliasing checks (verified +with `-fopt-info-vec`, not assumed). + +The measured C1 result: **stereo pipeline −36% wall-clock on x86, +8-channel −52%**, and −15/−30/−21% instructions (float/Q15/Q31) on the +Cortex-M55 — with the mono kernels count-identical as the control, since +mono keeps the fused path. One target barely moved, though: Hexagon +improved only −3.6/−3.3/−0.2%. 
Profiling explained why, and the +explanation became the next hypothesis: Hexagon's pipelines were not +dominated by blends or dots at all, but by **per-sample soft-double phase +math**. Which brings us to the centerpiece. + +## The phase accumulator: Q0.64 + +Here is the failure that motivates the design. The obvious phase state is +a `double mu`, updated per output sample as `mu += 1 + eps` with the +integer part peeled off into window advances. On a Xeon that costs a few +cheap FPU ops. On Hexagon — a 32-bit audio DSP with **no double-precision +FPU** — every one of those operations is a soft-float library call, per +sample, on the hottest path in the library. C1's flat Hexagon numbers +were this cost dominating everything else. (Honest correction from the +record, because the project's documentation initially got it wrong: the +Cortex-M55 was *assumed* to share this problem, but its scalar FPU does +support FP64 — only its MVE vector unit is fp16/fp32 — so M55 float was +never soft-double-bound. The measurement that exposed the doc error is +Part III material; the resampler design below is motivated by Hexagon and +its HiFi-class cousins, where the problem is real.) + +The C3 redesign eliminates the per-sample double entirely by changing +what the phase *is*: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_class_doc}} +``` + +The fractional position lives in `phase_`, an unsigned 64-bit integer +read as a pure binary fraction — **Q0.64**: the value μ = `phase_` / 2⁶⁴, +so the representable range is exactly [0, 1) and the resolution is 2⁻⁶⁴ +of a sample. The key move is what it accumulates: **only ε**, the +deviation. The "1" in "advance 1 + ε input frames per output frame" is +handled by the integer machinery — consume one input frame per output +frame — and never touches the fraction. 
Near-unity specialization again: +because the nominal ratio is exactly 1, the fraction only has to carry +the few-hundred-ppm creep, and 64 bits of headroom below the binary point +carry it essentially forever. + +Per `process()` call — once per block, not per sample — the servo's +double ε̂ is converted to fixed point: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_slip}} +``` + +Walk the slip logic carefully; it is the subtlest six lines in the +datapath, and the trick is that **wraparound of the unsigned add is the +slip detector**, for both signs of ε, with no comparisons against 1.0 or +0.0 anywhere: + +- **ε ≥ 0** (input clock fast; the window must occasionally hurry). The + fraction creeps upward by `epsU` each sample. When the true position + would cross 1.0, the 64-bit add wraps: `m = phase_ + epsU` comes out + *smaller* than `phase_`, which is otherwise impossible for a positive + increment. That wrap **is** the forward slip: consume one *extra* input + frame (`advance = 2` — the regular frame plus the slipped one), and the + wrapped `m` is already the correct new fraction, because mod-2⁶⁴ + arithmetic subtracted exactly the 1.0 that the extra frame consumed. +- **ε < 0** (input clock slow; the window must occasionally wait). + `epsU` is the two's-complement reinterpretation of a negative `epsFix` + — a huge unsigned number — so the same add normally wraps every + sample, and *not* wrapping is the anomaly: `m > phase_` means the + fraction dipped below 0.0. That is the backward slip: consume **no** + input frame this output (`advance = 0`, reuse the current window), and + again the modular result is already the correct fraction just below + 1.0. +- Otherwise `advance = 1`: the metronomic case. + +At +500 ppm a forward slip fires every 2 000 output samples, and thanks +to the bank's extra row the filter evaluated after `advance = 2` at small +μ is the exact continuation of the filter before it at μ ≈ 1. 
+`AsrcLock.WholeSampleSlipsAreGlitchFree` runs 500 ppm for seconds and +bounds the output's *second difference* by the analytic bound A·ω² of a +clean sine — a discontinuity detector that would trip on any window +mis-step at any slip. + +Note also what happens between the `appendOne` calls and `phase_ = m`: +if the source runs dry midway through an `advance = 2` slip, the function +returns with the history advanced by one frame but the phase *not* +updated. History and phase are now one frame apart — a state the class +cannot repair locally. That is not a bug; it is a documented precondition +(the contract section below), and the converter's dropout path always +resets and re-primes before processing again. + +Downstream, the phase bits feed the kernel directly: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_blend_row_phase}} +``` + +The top log₂ L bits *are* the phase-row index; the bits below, shifted +up, *are* the intra-phase blend fraction. No multiply by L, no floor, no +subtract — the Q0.64 representation makes the split between "which row" +and "how far between rows" a matter of bit fields. One conversion to the +datapath's blend-factor type per output frame (`blendFactorFromQ64`: +single-precision for float, integer for Q15/Q31) is all that remains of +the floating-point phase math. The fused mono form is the same bit +surgery around the same blend-and-mac loop: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_interpolate_phase}} +``` + +**Is 2⁻⁶⁴ enough?** Part 0 derived the timing-jitter budget for 120 dB +transparency at 20 kHz: about 8 picoseconds. One sample at 48 kHz is +20.8 µs; 2⁻⁶⁴ of that is ~10⁻²⁴ seconds — twelve orders of magnitude +inside the budget. The double-μ design's 2⁻⁵² was also far inside it, so +resolution was never the emergency; the deeper numerical win is +*exactness over time*. 
An integer accumulator adds ε with **zero +rounding error per step**, forever — the only quantization is the +once-per-block conversion of ε̂, a rate error below 10⁻¹⁹ that the servo +absorbs like any other infinitesimal drift. A double μ, by contrast, +rounds on every `+=` and carries the fraction with absolute precision +limited by its integer part's magnitude. Measured, from the C3 entry: +quality *improved* to **135.0 dB at 997 Hz** when the integer phase +landed. An optimization PR whose quality guardrail moved the right +direction — the A/B discipline (benchmarks for speed, pinned SNR +thresholds for correctness) catching a pleasant surprise instead of a +regression. + +And the cost side, from the same entry: Hexagon pipelines **−10.3% (Q15) +and −15.5% (Q31)**, with float −2.6% — the soft-double phase math C1 +identified was simply gone, and the Hexagon *kernels* stayed +count-identical as the control. M55: Q15 −5.3%, Q31 −4.6%, float +1.4% — +a genuine, accepted regression on one scenario, because the M55's scalar +FP64 hardware made doubles cheap and the integer phase traded them for +int64 ops; the cross-target win justified it, and the ratchet baseline +records the trade explicitly. x86 same-minute A/B: float −5.4%, Q15 +−12.0%. + +## Dispatching the datapath + +With phase in hand, each output frame takes one of three routes: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_dispatch}} +``` + +Mono takes the fused `interpolatePhase` — no scratch-row traffic for a +single channel (with one exception: Q15 on SMLALD-capable Cortex-M cores +routes mono through blend + dot too, because the dual-MAC loop lives in +`dotRow`; the two paths are bit-exact by construction, which is what +makes that rerouting a non-event). Low channel counts blend once into +`row_` and dot per channel over planar histories — the C1 shape. High +channel counts on hosts take the frame-major branch, which is the next +section but one. 
Note the branch condition `kChannelParallel && +frameMajor_`: the first operand is `constexpr`, so on embedded targets +the entire branch constant-folds away. That is not tidiness — a runtime +flag in this loop measured **+6–8%** on the M55 instruction ratchet +before the compile-time gate restored every embedded scenario to exactly +0.00%. The ratchet is why the lesson is a number and not an anecdote. + +## Feeding the window: history management + +The filter needs the newest T frames of every channel, contiguous, +oldest-first, per channel. Input arrives interleaved, in whatever chunks +the FIFO happens to hold. Between those two facts sits `appendOne`: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_append}} +``` + +Three mechanisms, each with an RT-safety argument: + +**Chunked staging.** Frames are pulled from the caller-supplied `popFn` +in bulk (the converter passes 16-frame chunks) into the interleaved +`scratch_` buffer, then peeled off one frame at a time as the window +advances. Bulk pops amortize the ring's index synchronization across +many frames — the cached-index design from two chapters ago does its +best work when you ask it for blocks — while the resampler still +consumes with single-frame granularity, because slips need exactly-one +extra frame on demand. Frames staged in scratch have left the ring but +not yet entered the filter, which is why `bufferedFrames()` exists: the +servo's occupancy observable must count them or the estimate would carry +a chunk-sized bias. + +**Bounded compaction.** Histories are not ring buffers; they are flat +arrays with a moving end index, sized `taps + chunkFrames`. When the end +hits capacity, `memmove` slides the newest T − 1 frames back to the +front and synthesis continues. Why copy at all, when a circular buffer +would avoid it? 
Because the *filter* needs a contiguous window every +sample: a ring would either split the dot product at the wrap seam +(a branch and a second loop in the hottest code in the library) or copy +into a linear scratch every frame — a memmove per *sample* instead of +one per *chunk*. The flat layout pays T − 1 frames of copy once per +`chunkFrames` appends: bounded, branch-predictable, allocation-free — +worst-case cost is fixed at construction time, which is the entire +definition of RT-safe this library uses. `process()` is `noexcept`, no +locks, no allocation; every buffer was sized in the constructor, which +is allowed to throw precisely because it runs at setup time. + +**Two storage shapes.** The member block records the fork: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_members}} +``` + +Planar — one delay line per channel — below the channel-parallel +threshold: each channel's dot product walks its own contiguous line, and +the deinterleave happens once per frame at append time (a scalar loop +over channels). Frame-major — a single interleaved line — at or above +it: appends become one contiguous `memcpy` per frame and the compaction +one `memmove` per line-fill, but the real reason for the layout is the +kernel it enables. + +## The channel axis: C6, briefly + +For high channel counts the per-frame cost is dominated by N dot +products, and the float dot product has a vectorization problem you can +now state precisely: its accumulation order is contractual (strict +per-channel double accumulation — reassociating it changes output bits), +so the *tap axis* may not be vectorized without breaking bit-exactness. +The C2 audit verified GCC obeys: float `dotRow` compiles scalar, by +design. + +But nobody said anything about the *channel* axis. 
Channels are +independent accumulators; computing eight of them in lockstep, one tap +at a time, keeps every channel's tap order identical to `dotRow`'s while +filling SIMD lanes with channels instead of taps. That requires the +history to deliver all channels of tap t contiguously — the frame-major +layout — and a register-blocked kernel: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_dot_rows_frame_major}} +``` + +The measured C6 results, condensed (the full campaign, including the +callgrind profile that justified targeting the dots and the negative +results that bounded the design, is Part III's last chapter): **float +8/12/16-channel pipelines −38/−38/−42% wall-clock with AVX2+FMA**, only +−4–5% on baseline SSE2 — the gain scales with SIMD width, as it must if +the mechanism is what we claim. Bit-exact versus planar, hash-verified +over 30 000 blocks × 4 configs. The gate is deliberately narrow, each +edge measured rather than assumed: + +- **Float-only**: fixed-point channel-parallel measured ~1.5× *slower* + than planar — integer accumulation is exactly reassociable, so the + planar Q15/Q31 dots already auto-vectorize over taps, and the tap axis + beats the channel axis when both are available. +- **Channels ≥ 4** (`SRT_CP_MIN_CHANNELS`, overridable for A/B runs): + below that, lane utilization loses to the planar path's simplicity. +- **Hosts only**: the embedded targets keep their proven codegen (Helium + on M55, SMLALD on M33-class, Hexagon's measured scalar floor); the + compile-time macro gate keeps their binaries byte-for-byte ignorant of + the mode. + +And one lesson worth carrying out of context: the first channel-parallel +attempt — accumulators in a plain array the compiler kept in memory — +measured **2.8× slower than planar**. Register-block or don't bother; +`dotTileFrameMajor`'s `constexpr`-size tiles of 8/4/2/1 are that lesson +in code form. 
+ +## The contract: prime, process, and the one-frame lie + +`FractionalResampler` is deliberately not foolproof; it is *fast*, and +its safety is a documented protocol that the converter — its only +in-tree caller — upholds. The documentation is the code's own: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_process_doc}} +``` + +**Prime before process.** `prime()` fills the window with T real frames +(or reports dry and stays unprimed). Call `process()` unprimed and +`window()`'s pointer arithmetic `end_ − taps()` underflows a `size_t` — +the converter guarantees priming by construction, since it only leaves +its Filling state once the backlog exceeds setpoint + taps. + +**Reset after any dry return.** You now know exactly why from the slip +walk-through: a `process()` that runs dry on the *second* append of an +`advance = 2` forward slip has already advanced the history when it +returns, but never executed `phase_ = m`. History says one frame passed; +phase says none did. Every output synthesized after resuming would be +computed one frame late relative to its nominal position — not a crash, +a *silent sub-window skew*. The class cannot un-append (the frame is +deinterleaved into the histories) and does not try to special-case it; +it defines the recovery protocol instead: `reset()` clears phase, +history, and staged scratch (stale across a discontinuity anyway), then +re-prime. The converter's underrun path does exactly this, with the +servo keeping its ppm estimate and a fade-in masking the splice. + +Finally, the small read-side API that closes the control loop: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_mu}} +``` + +`mu()` converts the phase to double **once per pull, not per sample** — +the block-rate boundary where doubles are cheap even on Hexagon, the +same boundary the ε̂ conversion crosses in the other direction. 
The +servo adds it to the frame count so the observable `occ + mu` moves +*continuously* through slips: at the instant a forward slip fires, the +count drops by one exactly as μ wraps from ~1 to ~0, and the sum crosses +smoothly. Without μ in the observable, every slip would inject a +one-frame staircase into the servo's error at the beat frequency — +manufacturing the very sawtooth the previous chapter spent three filter +poles suppressing. `bufferedFrames()` completes the accounting for the +staged scratch. Two accessors, and the sensor the whole control system +reads is honest to sub-sample resolution. + +## Why this file looks the way it does + +| Decision | Alternative rejected | Reason | +|---|---|---| +| Q0.64 integer phase, ε-only | `double mu += 1 + eps` per sample | soft-double per sample dominated Hexagon pipelines (C1 finding); integer add is exact forever; measured −10/−15% Hexagon, quality up to 135.0 dB | +| Slips by unsigned wraparound | compare/floor against 1.0 and 0.0 | the mod-2⁶⁴ result *is* the corrected fraction; both slip directions fall out of one add | +| Blend once per frame + per-channel dot | fused interpolate per channel | N×(blend+dot) → blend + N×dot; bit-exact by identical per-tap order; stereo −36% wall-clock (C1) | +| Flat history + bounded memmove compaction | circular history | the dot needs a contiguous window every sample; one bounded copy per chunk beats a seam branch per sample | +| Chunked popFn staging | pop one frame at a time | amortizes ring synchronization; staged frames stay visible to the servo via `bufferedFrames()` | +| Frame-major + channel-parallel dots (float, ≥4ch, hosts) | vectorize the float tap axis | tap-axis SIMD changes accumulation order = output bits; the channel axis is free and bit-exact (−38…−42% at 8–16ch) | +| Compile-time mode gate | runtime `if (frameMajor_)` alone | a hot-loop runtime flag cost +6–8% M55 instructions; `constexpr` restored embedded codegen to 0.00% | +| Documented preconditions + 
`reset()` | internal auto-repair of dry slips | the failure needs a reprime anyway (stale window); a repair path would be untestable dead weight on the hot path | + +## Verify it yourself + +```sh +# Quality with the Q0.64 phase in the loop — the pinned thresholds +# include the 135 dB figure C3 improved: +ctest --test-dir build -R 'AsrcQuality\.' --output-on-failure + +# Slip continuity: the second-difference bound at +500 ppm (a slip +# every 2000 samples), plus lock/drift behavior: +ctest --test-dir build -R 'AsrcLock\.' --output-on-failure + +# The mu-wrap/extra-row continuity the slips depend on: +ctest --test-dir build -R 'Polyphase\.' --output-on-failure + +# Channel independence at 12/16 channels — on a host float build this +# exercises the frame-major channel-parallel path: +ctest --test-dir build -R 'MultiChannel' --output-on-failure + +# A/B the channel axis yourself: benchmark, then rebuild with the +# threshold pushed out of reach and benchmark again (use -march=native +# to see the AVX2 headline; SSE2 shows a few percent): +cmake -B build-bench -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_BENCHMARKS=ON \ + -DCMAKE_CXX_FLAGS="-march=native" +cmake --build build-bench -j && \ + ./build-bench/bench/srt_bench --benchmark_filter='Pipeline_Float.*(8|12|16)ch' +cmake -B build-planar -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_BENCHMARKS=ON \ + -DCMAKE_CXX_FLAGS="-march=native -DSRT_CP_MIN_CHANNELS=999" +cmake --build build-planar -j && \ + ./build-planar/bench/srt_bench --benchmark_filter='Pipeline_Float.*(8|12|16)ch' + +# Break it on purpose: change `advance = 2` to `advance = 1` in the +# forward-wrap branch of process(), rebuild, and watch +# AsrcLock.WholeSampleSlipsAreGlitchFree fail its second-difference +# bound — every slip becomes an audible one-frame stutter. +``` + +The last experiment is worth actually running once. 
The slip logic is +six quiet lines that look like integer bookkeeping; breaking them turns +a 135 dB converter into a machine that clicks every forty-two +milliseconds. That gap — between how little the code looks like it is +doing and how much the measurements say it is — is the fractional +resampler in one sentence. diff --git a/book/src/part1/pi-servo.md b/book/src/part1/pi-servo.md index b8ef7d9..a83dbb6 100644 --- a/book/src/part1/pi-servo.md +++ b/book/src/part1/pi-servo.md @@ -1,3 +1,504 @@ -# pi servo +# The clock servo: `pi_servo.hpp` -*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* +There is a number this entire library exists to find, and nobody will tell +it to us. + +Call it ε: the fractional rate mismatch between the two crystals. The +producer's device claims 48 kHz and delivers 48 000 × (1 + ε) frames per +second; the consumer's device claims 48 kHz and takes them away at +48 000 × (1 + something else). ε is a few parts per million, it wanders +with temperature, and no API on either side will report it — the whole +premise of the problem is that both devices believe they are correct. The +resampler in the next chapter can apply any rate correction we ask of it, +to a resolution of 2⁻⁶⁴ samples. It just needs to be told the number. + +The only observable we have is the elastic buffer between the domains: the +SPSC ring from the last chapter, whose occupancy was designed to be *exact* +for precisely this reason. If the producer's clock is fast by ε and we +consume at exactly the nominal rate, the buffer fills at ε × fs frames per +second — about ten frames per second at 200 ppm. That trickle is the +entire signal.
The servo's job is to turn it into an estimate ε̂ good +enough that the resampler's output carries no audible trace of the +estimation process — and "audible trace" here means fluctuations in ε̂, +because whatever wobble the servo passes into the rate estimate +frequency-modulates every sample of the audio. + +This chapter is control theory for someone who has never tuned a loop, +taught the way this file was actually designed: start with the physics of +the thing being controlled, discover why the obvious controller fails, +derive the one that works, and then spend most of our effort on the real +enemy — which turns out not to be the clocks at all, but the fact that we +can only *count*. + +## The plant: a buffer that integrates + +Control theory calls the thing you are controlling the *plant*. Ours is +the FIFO, and its equation of motion is one line. The producer inserts +fs × (1 + ε_true) frames per second. The converter synthesizes fs output +frames per second, and for each output frame it consumes (1 + ε̂) input +frames — that is what "phase advance = 1 + ε̂" will mean in the next +chapter. Occupancy changes at the difference of those rates: + +```text +d(occ)/dt = fs · (ε_true − ε̂) +``` + +The buffer is a **pure integrator** with gain fs. Feed it a rate error and +it does not settle at some proportional level — it ramps, forever, until +it hits a wall (empty: dropout; full: overflow). Two consequences follow +immediately. First, doing nothing is not an option even for arbitrarily +small ε: any uncorrected mismatch is a glitch with a countdown timer on +it. Second, the plant's own integration is going to interact with whatever +memory the controller has, and getting that interaction right *is* the +design. 
+ +The servo observes the occupancy once per `pull()` — the converter calls +`update(occ, mu, dt)` with the raw backlog in frames, the resampler's +current fractional position μ (so the observable `occ + mu` moves +continuously through whole-sample slips instead of staircasing by ±1), +and the elapsed time `dt = framesPulled / fs`. + +## Why proportional control is not enough + +The obvious controller is proportional: measure the occupancy error +`e = occ − target`, set ε̂ = Kp·e. If the buffer is too full, consume +faster; too empty, consume slower. It even works, in the sense that it +does not fall over. + +Now ask what it converges *to*. In steady state the occupancy stops +moving, so the plant equation forces ε̂ = ε_true — the estimate must equal +the true offset exactly. But a proportional controller can only produce +ε̂ = Kp·e, so the error cannot be zero: it must park at + +```text +e_ss = ε_true / Kp +``` + +a *standing occupancy offset* proportional to the clock mismatch. Plug in +the numbers this library actually uses and the problem stops being +academic. At the steady-state loop bandwidth of 0.05 Hz (we will get to +why it is that low), Kp ≈ 1.3 × 10⁻⁵ per frame. A routine 300 ppm crystal +offset parks the buffer **23 frames** away from its setpoint — half the +default 48-frame latency budget gone, sitting one frame shy of the default +24-frame unlock threshold, and different for every unit in the field +because every crystal pair drifts differently. Latency that depends on +which two devices you happened to plug in is not a spec anyone signs. + +The fix is memory. Add an integral term: + +```text +ε̂ = Kp·e + Ki·∫e dt +``` + +The integrator accumulates error until the error is gone: in steady state +it holds the entire ppm estimate by itself, ε̂ = ε_true with **zero +standing occupancy error**. 
Control theory calls the combination a *type-2 +loop* — two integrators around the cycle, the plant's and the +controller's — and type-2 is exactly the order needed to null a constant +rate offset. `tests/test_servo.cpp` pins this down against a pure +simulation of the plant equation: after settling at +300 ppm, the +occupancy must sit within 0.05 frames of the setpoint and ε̂ within 1 ppm +of the truth +(`Servo.LocksFromConstantOffsetAndNullsError`). + +A type-2 loop also does something a type-1 cannot: it follows a *ramp* in +the offset — a crystal warming up, drifting at 1 ppm/s — with bounded +rather than growing error. The residual is the classic acceleration error +`e_ss = (dε/dt · fs) / ωₙ²`, about 0.49 frames for 1 ppm/s at the 0.05 Hz +bandwidth, and `Servo.TracksSlowDriftRampWithBoundedLag` holds the +measured lag under one frame while `epsHat` tracks the moving truth to +2 ppm. + +If this structure sounds familiar, it should. Replace "FIFO occupancy" +with "phase difference" and this is a **phase-locked loop**: the FIFO +comparison is the phase detector, the PI filter is the loop filter, and +the resampler's μ accumulator is the numerically controlled oscillator. +The README states the analogy flatly and it is worth internalizing, +because it means every result in fifty years of PLL literature applies — +including the one that matters most here: the loop bandwidth f_L +*partitions* the input timing jitter. Components above f_L are absorbed +by the buffer and never reach the audio; components below f_L pass into +ε̂ and frequency-modulate it. Choosing f_L is choosing which noise you +eat. + +## From bandwidth to gains + +So the designer picks a bandwidth and a damping; the gains should follow +mechanically. 
Close the PI controller around the integrator plant and the +loop's characteristic equation is + +```text +s² + fs·Kp·s + fs·Ki = 0 +``` + +Match it against the standard second-order form +`s² + 2ζωₙs + ωₙ² = 0` — the form whose behavior every control textbook +tabulates — and read off the gains: + +```text +ωₙ = 2π·f_L Kp = 2ζωₙ / fs Ki = ωₙ² / fs +``` + +The code computes exactly this, nothing more: + +```cpp +{{#include ../../../include/srt/pi_servo.hpp:sv_gains}} +``` + +Note the division by `fs_` in both gains: the plant's gain is fs, so the +controller divides it back out, and the *closed-loop* behavior depends +only on f_L and ζ. That innocuous-looking normalization is load-bearing — +it is why the gains formula is rate-portable, and (foreshadowing the first +war story) why everything *else* in the config is not. + +Damping defaults to ζ = 1, critical damping: the fastest settling that +never overshoots. Overshoot in this loop is not a cosmetic wiggle — an +occupancy overshoot is latency spent grazing the underrun floor, so the +choice is not stylistic. + +Here is the full tuning surface, with the defaults that suit a 48 kHz +near-unity converter: + +```cpp +{{#include ../../../include/srt/pi_servo.hpp:sv_config}} +``` + +Three bandwidths, three smoother corners, and a small state machine's +worth of thresholds. A single PI loop needs exactly two numbers; this +config carries fourteen. The rest of the chapter is about earning each of +the extra twelve. + +## The enemy: a sawtooth made of counting + +If the occupancy were a real number observed noiselessly, one PI loop at +a modest bandwidth would end this chapter. It is not. The occupancy is a +**count** — quantized to whole frames on the producer side, or to whole +*push blocks* when the producer delivers audio in callbacks — and that +quantization is not benign random noise. It is deterministic and +periodic. + +Picture the steady state at +200 ppm with sample-granular transfer. 
The +true (unquantized) backlog creeps upward by ε input samples per sample +consumed; every time the creep accumulates one whole frame, the count +steps. The observable is a perfect sawtooth: one push-block peak to peak, +repeating at the *beat frequency* + +```text +f_beat = ε · fs / pushBlock (the README's "ppm × pushRate") +``` + +At 200 ppm and sample-granular push that is 9.6 Hz with a one-frame tooth. +With 32-frame callbacks it is 0.3 Hz with a **32-frame** tooth — the +occupancy legitimately swings ±16 frames with neither clock having +moved. (`AsrcLock.LocksAndHoldsAtConstantOffset` averages straight +through that sawtooth and requires the *mean* fill and ppm to land on the +truth.) + +Why care about a deterministic wobble in a number we only use for its +average? Because the loop does not know it is a wobble. Whatever fraction +of the sawtooth survives into ε̂ becomes a periodic modulation of the +resampling rate — FM sidebands on every tone in the program material, at +offsets of f_beat and its harmonics. And a PI controller is a terrible +filter: above f_L its proportional path passes measurement noise straight +through at gain Kp, flat, forever. Narrowing f_L does not fix this by +itself; it lowers Kp (helping linearly) while the sawtooth needs 60–120 dB +of suppression. The loop needs help *before* the loop: error prefilters. + +But a prefilter is lag, and lag inside a feedback loop erodes phase +margin; you cannot smooth aggressively *and* acquire quickly with the same +settings. There is no single operating point that pulls in a cold start +within a second, rejects a 9.6 Hz sawtooth by 100+ dB, and follows a +warming crystal. So the servo refuses to pick one point. It picks three.
+ +## Three loops, one integrator + +| Stage | Loop bandwidth | Error prefilter | Role | +|---|---|---|---| +| **Acquire** | 10 Hz | 1-pole, 50 Hz | pull in from a cold start (~1 s to lock) | +| **Track** | 1 Hz | 1-pole, 5 Hz | robust lock; terminal stage for coarse-block transfer | +| **Quiet** | 0.05 Hz | 3-pole cascade, 0.5 Hz | steady state for fine-grained transfer | + +Each stage is the same PI structure with gains from the same +`computeGains`, differing only in bandwidth and in how hard the +measurement is smoothed before the loop sees it. The update begins by +maintaining *both* kinds of smoothed error on every call: + +```cpp +{{#include ../../../include/srt/pi_servo.hpp:sv_update_smooth}} +``` + +Two details here repay attention. The smoothing coefficient +`alpha(cornerHz, dt) = 1 − exp(−2π·f·dt)` is the exact discrete step of a +one-pole lowpass over an arbitrary interval, so the filter corners are +honest frequencies in Hz regardless of how large or irregular the pull +blocks are — the same property the gain formulas have via `dt` in the +integrator. And the three-pole quiet cascade (`q1_ → q2_ → q3_`) runs +**always**, even in Acquire and Track where its output does not drive the +loop. That costs three multiply-adds per block and buys two things: the +promotion gate into Quiet has real data to judge (next section), and at +the instant of promotion the cascade is already settled on the observable +— no filter warm-up transient handed to the narrowest, most fragile +stage. + +Why a *cascade* of three identical poles rather than one pole three times +lower, or something sharper? Rolloff. One pole buys 6 dB/octave above its +corner; three poles buy 18 dB/octave. Against the 9.6 Hz sawtooth, a +0.5 Hz three-pole cascade provides roughly (9.6/0.5)³ ≈ 77 dB of rejection +before the loop even sees the error — while adding only manageable lag at +the 0.05 Hz loop bandwidth two decades below. 
The file header states the +net result as a system-level figure: in Quiet, a one-frame sawtooth is +rejected to roughly −120 dBc equivalent at 20 kHz, while the loop still +follows a 1 ppm/s drift ramp with under half a frame of standing error. +Sharper IIR shapes (resonant poles, elliptic-style) would trade that +clean, phase-predictable lag for ringing inside a feedback loop — exactly +the wrong place for it. + +## The promotion machine + +Three stages need transitions, and transitions are where multi-mode +controllers usually betray you — a bandwidth switch with mismatched state +is a step input injected into your own loop. Here is the whole state +machine: + +```cpp +{{#include ../../../include/srt/pi_servo.hpp:sv_update_stages}} +``` + +Reading it as a protocol: promotion out of Acquire requires the *fast* +smoothed error to stay inside one frame for half a second; promotion out +of Track requires the *cascade* error to stay inside one frame for two +full seconds. Demotion is the same test run backwards with a much wider +threshold — 24 frames — and drops exactly one stage. The asymmetry +(narrow gate up, wide gate down, long holds) is hysteresis by +construction: the servo would rather linger a stage wide than oscillate +between modes. + +The choice of *which* error gates the Track→Quiet promotion is the +subtlest line in the file, and it earns the second war story below. +Gating on the cascade-smoothed error means the promotion asks precisely +the question that matters: *after the smoothing Quiet would actually use, +is the observable quiet enough to run a 0.05 Hz loop?* When a large block +beat dominates the occupancy, the answer is naturally and persistently +no — the cascade output wobbles by more than a frame at the beat +frequency, the hold timer keeps resetting, and the servo stays in Track. +Nobody wrote a rule that says "coarse-block configurations must not enter +Quiet." The physics writes it. 
+ +Both promotions share their hold logic, and it does double duty: + +```cpp +{{#include ../../../include/srt/pi_servo.hpp:sv_hold}} +``` + +While the hold window runs, the servo is not just waiting — it is +averaging its own output ε̂ with a time constant of a fifth of the hold. +Here is why that average exists. The wide stages do not *reject* the +quantization sawtooth; they phase-track it, riding the wobble with their +whole loop. Their instantaneous ε̂ is therefore a good estimate wrapped in +a periodic error. Averaging over the hold window (many beat cycles) +strips the wobble and leaves the clean central value — and at the moment +of promotion, *that* is what gets loaded into the narrower stage's +integrator (`integ_ = clamp(epsAvg_, ...)` in the state machine above). + +Recall what the integrator *is* in steady state: the entire rate +estimate. Handing the next stage a clean integrator means handing it a +loop that is already essentially converged; the proportional path only +has to clean up residuals. That is the transient-free handoff — "to first +order," as the header says, because the smoothers keep their state and +the observable keeps its continuity, so nothing steps. +`Servo.BandwidthSwitchIsTransientFree` runs the plant through lock and +across both promotions and requires the occupancy never to leave the +one-frame lock threshold afterwards: a handoff you cannot find in the +data. + +## The output stage, and why the clamp is inside + +The last lines of `update()` are the PI itself: + +```cpp +{{#include ../../../include/srt/pi_servo.hpp:sv_update_out}} +``` + +The clamp appears twice, and the first one — on the integrator, not just +the output — is the anti-windup that every practical PI needs and every +first implementation forgets. Consider a consumer stall: the occupancy +error goes huge and stays huge for seconds while the converter waits for +the high-watermark resync. 
An unclamped integrator would spend that whole +time charging toward a rate estimate of thousands of ppm — a number no +crystal pair can produce — and then, after the disturbance clears, the +loop would have to *discharge* all of that false conviction through its +narrow bandwidth, dragging the occupancy through a huge excursion for tens +of seconds. Clamping the integrator at 1.5 × `maxDeviationPpm` bounds the +lie the loop can tell itself: the estimate can never leave the range +physics allows, so recovery from any disturbance starts at most one clamp +width from the truth. The output clamp then bounds what the resampler is +asked to do per sample (which also protects the Q0.64 conversion in the +next chapter). `Servo.ClampsToMaxDeviation` feeds a 10 000-frame error and +requires the output to saturate exactly at 1.5× the configured range. + +## Knowing when not to chase: `seed()` and `reset()` + +```cpp +{{#include ../../../include/srt/pi_servo.hpp:sv_reset}} +``` + +A feedback loop's reflex is to chase every step in its input. Some steps +carry no information, and the API encodes each such case explicitly: + +- **`seed(occPlusMu)`** snaps all four smoothers onto the current + observable. The converter calls it when the occupancy jumps *for a + known reason* — acquisition start, a hard resync discard. Without it, + the smoothers would report the jump as a genuine multi-frame error and + the loop would obediently swerve. +- **`reset(keepIntegrator=true)`** re-arms the state machine after a + dropout but preserves the integrator — because a dropout says nothing + about the crystals. The ppm estimate from before the glitch is still + the best available number, and relock becomes a formality + (`Servo.DropoutResetKeepsPpmEstimate` pins both flavors: `true` + preserves the estimate to 5 ppm, `false` zeroes it). 
+- **`setTarget()`** moves the setpoint while keeping the integrator *and* + the smoothers' tracking state, so the loop slews to the new occupancy + at its clamped rate with no discontinuity — used by the converter's + adaptive pull-block setpoint raise, where the setpoint moves but, + again, the clocks have not. + +The shared principle: the integrator is the loop's knowledge and the +smoothers are its perception. Each event handler keeps exactly the state +that is still true and resets exactly the state that is not. + +## War story one: 16 kHz, minus 32 decibels + +For a long time this library's defaults were "the defaults," full stop — +designed, tested, and shipped at 48 kHz. Then a real deployment shape +arrived: 16 kHz reference-microphone processing. Same code, same presets, +a third of the sample rate. The quality suite was duplicated at 16 kHz, +expecting boring numbers. + +The numbers came back **~32 dB worse at every tone**, falling a further +6 dB per octave of signal frequency. That frequency signature is the +fingerprint of small-index FM — phase modulation of the resampling +position, whose sidebands grow with the modulated signal's frequency — +which pointed at the servo, not the filter. + +The mechanism, worked out in +`tests/test_asrc_quality_16k.cpp`'s header comment and now baked into the +config comment: servo bandwidths and smoother corners are **absolute +hertz**, but the disturbance they exist to reject is not. The slip-beat +sawtooth sits at ε × fs — 9.6 Hz at 48 kHz, only **3.2 Hz at 16 kHz**. +The three-pole 0.5 Hz cascade whose rejection goes as f³ therefore does +(16/48)³ ≈ 28.6 dB *less* damage to the beat at 16 kHz, and the +measurement becomes servo-FM-limited: predicted ≈ 28.6 dB, measured +≈ 32 dB. The loop was not misbehaving. It was doing exactly what its +absolute-Hz constants said, against a disturbance that had moved. 
+ +The rule that fixes it is now a method, so it cannot be half-remembered: + +```cpp +{{#include ../../../include/srt/pi_servo.hpp:sv_scaled_to}} +``` + +Every field with units of Hz scales with the rate — keeping the loop +identical in *normalized*, per-sample terms, which is the frame the +disturbance lives in. Every field denominated in frames or ppm +(`lockThresholdFrames`, `unlockThresholdFrames`, `maxDeviationPpm`) is +already normalized and stays put. And the hold times scale *inversely*: +a loop with a third the bandwidth has time constants three times longer, +so waiting "2 seconds" before promoting would mean waiting a third as +many loop time constants — the gates would fire on less evidence. The +original hand-scaled 16 kHz configuration missed the hold-time rule; +adding it left the measurement identical within noise, and the test suite now +covers the factory (`Config::forSampleRate`, which applies this and the +matching `FilterSpec::scaledTo`) both structurally +(`AsrcQuality16k.ForSampleRateScalesHzFieldsOnly` checks exactly which +fields move) and behaviorally: through the factory, 16 kHz measures +136.6 dB at 333 Hz — within ~1 dB of 48 kHz at the same normalized +frequency, the 32 dB fully recovered. + +One more cost of scaling, honestly: at 16 kHz the Quiet loop runs at +~0.017 Hz, so the quality tests run for 120 seconds of simulated audio +instead of 40 — the same number of loop time constants. Slow loops are +slow everywhere, including in CI. + +## War story two: when Track is the ceiling + +The block-size study (`notebooks/asrc_block_size_study.ipynb`) asked what +happens as transfer granularity coarsens from sample-granular toward the +32- and 240-frame callbacks real audio APIs deliver.
The finding shapes +how you should read the stage table: with blocks of 32 frames and up, +**the servo never promotes to Quiet — and must not.** + +The information-theoretic version of the argument: at a 32-frame block, +the occupancy observable updates a few hundred times per second with a +±16-frame deterministic sawtooth on top of a sub-frame-per-second signal. +Quiet-level performance means resolving the backlog trend to a small +fraction of a frame *through* that tooth using counts alone; the counts +simply do not carry the information. The promotion gate discovers this +without being told: the cascade-smoothed error keeps excursing past one +frame at the beat frequency, the two-second hold never completes, and +Track becomes the terminal stage — the discriminator working as designed. + +What does Track-forever sound like? The 1 Hz loop phase-tracks the block +beat: most of the sawtooth is absorbed as **latency breathing** — the +buffer level, and hence the delay, swaying by a fraction of the block at +the beat rate, inaudible by construction. The remainder leaks into ε̂ as +low-rate FM, and the study put calibrated numbers on it: **~0.9 cents rms +of frequency wobble (61 dB wideband quality) at 32-frame blocks, ~1.3 +cents / 53 dB at 5 ms blocks**, as the README reports. Cent-scale wobble +at sub-hertz rates is at the edge of perception for sustained pure tones +and irrelevant for program material — but it is a real ceiling, and it is +a *sensor* ceiling, not a servo defect. The README's limitations section +draws the forward-looking conclusion: breaking it requires a better +observable (per-block timestamps for sub-sample phase observation), not +a cleverer filter behind the same counts. + +The practical corollary is the config comment you may have skimmed past +on `unlockThresholdFrames`: it must sit comfortably above **half the +push/pull block size**, because a coarse-block sawtooth legitimately +swings that far with the clocks standing still. 
The default 24 clears +a 32-frame transfer's ±16 with margin. Undersize it — say, 8 against +32-frame callbacks — and the healthy beat itself trips demotion: +Track→Acquire, re-lock, promote, trip again, a mode limit cycle +manufactured entirely in configuration. If you change one servo number +for an embedded deployment, this is the one to check. + +## The shape of the design + +| Decision | Alternative rejected | Reason | +|---|---|---| +| PI (type-2) loop | proportional-only | P parks a ppm-dependent occupancy offset (≈23 frames at 300 ppm in Quiet); the integrator nulls it | +| Gains derived from (f_L, ζ) via 2nd-order matching | hand-tuned constants | tuning surface is two physical numbers; `computeGains` is the textbook formula, verifiable by inspection | +| Three stages | one compromise bandwidth | pull-in wants 10 Hz, sawtooth rejection wants 0.05 Hz + heavy smoothing; no single point does both | +| Cascade error gates promotion | timer or lock-counter | asks the exact question ("could Quiet's own filtered error hold lock?"); auto-excludes coarse blocks | +| Integrator seeded from hold-window average | reset on transition | wide stages phase-track the sawtooth; the average is the clean estimate — handoffs transient-free | +| Integrator clamp (anti-windup) | clamp output only | disturbances must not charge the estimate past physics; recovery starts near the truth | +| `seed()`/`reset(keepIntegrator)` API | let the loop chase every step | known-cause jumps carry no clock information; keep the knowledge, refresh the perception | +| `scaledTo()` for other rates | reuse 48 kHz defaults | absolute-Hz constants vs a rate-proportional disturbance: measured −32 dB at 16 kHz | + +## Verify it yourself + +```sh +# The five servo unit tests against the pure plant equation +# (type-2 nulling, ramp tracking, transient-free handoff, clamp, reset): +ctest --test-dir build -R 'Servo\.' 
--output-on-failure + +# The servo inside the real converter: lock/hold through the 32-frame +# block beat, drift-ramp tracking, slip continuity, stall recovery: +ctest --test-dir build -R 'AsrcLock\.' --output-on-failure + +# War story one, end to end (long: 120 s simulated per tone; prints the +# measured SNRs — compare against the thresholds in the file): +ctest --test-dir build -R 'AsrcQuality16k\.' --output-on-failure + +# War story two: regenerate the block-size study (32 / 64 / 240 frames, +# latency breathing and the cents-rms FM decomposition): +jupyter nbconvert --to notebook --execute notebooks/asrc_block_size_study.ipynb + +# Break it on purpose: in tests/test_asrc_quality_16k.cpp, replace +# Config::forSampleRate(kFs) with a default-constructed Config (keeping +# cfg.sampleRateHz = 16000.0) and watch ~32 dB vanish from every tone. +``` + +As with the ring buffer, the last item is the chapter in one line. The +three stages, the cascade, the scaling rule — none of it is decoration. +Take any piece away and a measurement, not an opinion, tells you what it +was holding back. diff --git a/include/srt/pi_servo.hpp b/include/srt/pi_servo.hpp index a9ce41d..762c1e4 100644 --- a/include/srt/pi_servo.hpp +++ b/include/srt/pi_servo.hpp @@ -47,6 +47,7 @@ namespace srt { +// ANCHOR: sv_config /// Servo tuning. Defaults suit a 48 kHz near-unity converter. 
/// unlockThresholdFrames should stay comfortably above half the push/pull /// block size, since block-quantized occupancy legitimately excursions by @@ -64,7 +65,9 @@ struct ServoConfig { double quietHoldSeconds = 2.0; ///< cascade-|e| hold => track -> quiet double unlockThresholdFrames = 24.0; ///< |e| above this => demote a stage double maxDeviationPpm = 1000.0; ///< epsHat clamp = +/- 1.5x this + // ANCHOR_END: sv_config + // ANCHOR: sv_scaled_to /// This config rescaled from the 48 kHz design rate to sampleRateHz: /// the loop bandwidths and error-smoother corners are absolute Hz and /// must track the rate, or the slip-sawtooth beat (ppm * fs) walks out @@ -87,6 +90,7 @@ struct ServoConfig { s.quietHoldSeconds /= r; return s; } + // ANCHOR_END: sv_scaled_to }; /// PI loop filter + three-stage lock-state machine. Pure double-precision @@ -103,6 +107,7 @@ class PiServo { reset(false); } + // ANCHOR: sv_reset /// Re-arm the loop. keepIntegrator preserves the accumulated ppm estimate /// (the right choice after a dropout: the clocks have not changed). void reset(bool keepIntegrator) noexcept { @@ -124,7 +129,9 @@ class PiServo { /// to the new setpoint at its clamped rate with no transient discontinuity /// — used by the converter's adaptive pull-block setpoint raise. void setTarget(double targetFrames) noexcept { target_ = targetFrames; } + // ANCHOR_END: sv_reset + // ANCHOR: sv_update_smooth /// One control update; call once per pull() before synthesis. 
/// \param occFrames raw backlog in frames (FIFO + staged frames) /// \param mu current fractional read position; occ + mu changes @@ -143,7 +150,9 @@ class PiServo { q3_ += aq * (q2_ - q3_); const double eFast = lpFast_ - target_; const double eQuiet = q3_ - target_; + // ANCHOR_END: sv_update_smooth + // ANCHOR: sv_update_stages const double limit = 1.5 * cfg_.maxDeviationPpm * 1e-6; switch (stage_) { case Stage::Acquire: @@ -168,7 +177,9 @@ class PiServo { } break; } + // ANCHOR_END: sv_update_stages + // ANCHOR: sv_update_out double kp = 0.0; double ki = 0.0; double e = 0.0; @@ -187,6 +198,7 @@ class PiServo { epsHat_ = std::clamp(kp * e + integ_, -limit, limit); return epsHat_; } + // ANCHOR_END: sv_update_out Stage stage() const noexcept { return stage_; } bool locked() const noexcept { return stage_ != Stage::Acquire; } @@ -199,6 +211,7 @@ class PiServo { return 1.0 - std::exp(-2.0 * std::numbers::pi * cornerHz * dt); } + // ANCHOR: sv_hold /// Hold-window logic shared by both promotions: |e| must stay below the /// threshold for holdSeconds; meanwhile epsHat is averaged (time constant /// holdSeconds/5) so the promotion can hand a clean estimate to the @@ -218,12 +231,15 @@ class PiServo { holdTimer_ = 0.0; return true; } + // ANCHOR_END: sv_hold + // ANCHOR: sv_gains void computeGains(double bandwidthHz, double& kp, double& ki) const noexcept { const double wn = 2.0 * std::numbers::pi * bandwidthHz; kp = 2.0 * cfg_.damping * wn / fs_; ki = wn * wn / fs_; } + // ANCHOR_END: sv_gains ServoConfig cfg_; double fs_; diff --git a/include/srt/polyphase_filter.hpp b/include/srt/polyphase_filter.hpp index dad83c2..b7f188c 100644 --- a/include/srt/polyphase_filter.hpp +++ b/include/srt/polyphase_filter.hpp @@ -223,6 +223,7 @@ inline void blendRow(const PolyphaseFilterBank& bank, row[t] = Tr::blend(c0[t], c1[t], fr); } +// ANCHOR: rs_blend_row_phase /// Phase-bit variants: the fractional position as an unsigned Q0.64 /// fraction. 
The polyphase index is the top log2(L) bits and the intra-phase /// blend factor comes from the bits below — no double arithmetic per sample, @@ -242,7 +243,9 @@ inline void blendRowPhase(const PolyphaseFilterBank& bank, for (std::size_t t = 0; t < taps; ++t) row[t] = Tr::blend(c0[t], c1[t], fr); } +// ANCHOR_END: rs_blend_row_phase +// ANCHOR: rs_interpolate_phase /// interpolate() over a Q0.64 phase; fused blend+mac (mono fast path). template inline S interpolatePhase(const PolyphaseFilterBank& bank, const S* hist, @@ -259,7 +262,9 @@ inline S interpolatePhase(const PolyphaseFilterBank& bank, const S* hist, acc = Tr::mac(acc, hist[t], Tr::blend(c0[t], c1[t], fr)); return Tr::finalize(acc); } +// ANCHOR_END: rs_interpolate_phase +// ANCHOR: rs_dot_row /// Dot product of a pre-blended coefficient row against a history window. /// Identical arithmetic to interpolate() given the same mu: blend then mac, /// per tap, in the same order — outputs are bit-exact either way. @@ -291,6 +296,7 @@ inline S dotRow(const typename SampleTraits::Coeff* SRT_RESTRICT row, const S acc = Tr::mac(acc, hist[t], row[t]); return Tr::finalize(acc); } +// ANCHOR_END: rs_dot_row /// One K-channel tile of the channel-parallel dot (hypothesis C6): K /// accumulators live in a constexpr-size local array — registers, not @@ -314,6 +320,7 @@ inline void dotTileFrameMajor(const typename SampleTraits::Coeff* SRT_RESTRIC out[k] = Tr::finalize(acc[k]); } +// ANCHOR: rs_dot_rows_frame_major /// Channel-parallel dot products over a frame-major history block: all /// channels' outputs for one frame in register-blocked tiles of 8/4/2/1. 
/// Per channel the accumulation order over taps equals dotRow's, so the @@ -338,7 +345,9 @@ inline void dotRowsFrameMajor(const typename SampleTraits::Coeff* SRT_RESTRIC if (c < channels) dotTileFrameMajor(row, x + c, taps, channels, out + c); } +// ANCHOR_END: rs_dot_rows_frame_major +// ANCHOR: rs_class_doc /// Streaming fractional-delay engine for one converter instance. /// /// Owns the history delay lines (planar per-channel below the @@ -358,6 +367,7 @@ inline void dotRowsFrameMajor(const typename SampleTraits::Coeff* SRT_RESTRIC /// detected by 64-bit wraparound instead of comparisons. template class FractionalResampler { + // ANCHOR_END: rs_class_doc public: /// Frame-major channel-parallel mode is compiled in only on CP targets /// and only for floating-point samples (see SRT_CHANNEL_PARALLEL). @@ -388,6 +398,7 @@ class FractionalResampler { scratchPos_ = 0; } + // ANCHOR: rs_mu /// Fractional position in [0,1) as a double; used by the servo at block /// rate (one conversion per pull, not per sample). double mu() const noexcept { return static_cast(phase_) * 0x1p-64; } @@ -396,6 +407,7 @@ class FractionalResampler { /// Frames popped from the source but not yet consumed by the filter; part /// of the effective backlog the servo must observe. std::size_t bufferedFrames() const noexcept { return scratchFrames_ - scratchPos_; } + // ANCHOR_END: rs_mu /// Fills the history window with taps() frames from the source. /// Returns false (and stays unprimed) if the source ran dry. @@ -410,6 +422,7 @@ class FractionalResampler { return true; } + // ANCHOR: rs_process_doc /// Synthesizes up to maxFrames output frames (interleaved) advancing the /// read position by (1 + epsHat) input frames per output frame. Returns /// the number produced; fewer than maxFrames means the source ran dry @@ -424,7 +437,9 @@ class FractionalResampler { /// interleaved frames, returning the count actually delivered. 
template std::size_t process(S* out, std::size_t maxFrames, double epsHat, PopFn&& popFrames) noexcept { + // ANCHOR_END: rs_process_doc // ANCHOR: p0_phase_step + // ANCHOR: rs_slip // eps in Q0.64, converted once per call (block rate). |eps| is // servo-clamped to ~1e-3, so eps * 2^64 fits int64 comfortably. const auto epsFix = static_cast(epsHat * 0x1p64); @@ -444,6 +459,8 @@ class FractionalResampler { } phase_ = m; // ANCHOR_END: p0_phase_step + // ANCHOR_END: rs_slip + // ANCHOR: rs_dispatch // Q15 on SMLALD targets routes mono through blendRow+dotRow as // well: dotRow carries the dual-MAC loop, and the two paths are // bit-exact by construction (see dotRow). @@ -466,6 +483,7 @@ class FractionalResampler { for (std::size_t c = 0; c < channels_; ++c) out[n * channels_ + c] = dotRow(row_.data(), window(c), taps); } + // ANCHOR_END: rs_dispatch } return maxFrames; } @@ -473,6 +491,7 @@ class FractionalResampler { private: const S* window(std::size_t c) const noexcept { return hist_[c].data() + end_ - bank_->taps(); } + // ANCHOR: rs_append template bool appendOne(PopFn&& popFrames) noexcept { if (scratchPos_ == scratchFrames_) { @@ -502,12 +521,14 @@ class FractionalResampler { ++scratchPos_; return true; } + // ANCHOR_END: rs_append const PolyphaseFilterBank* bank_; std::size_t channels_; std::size_t chunk_; std::size_t histCap_; std::vector scratch_; // interleaved staging for bulk pops + // ANCHOR: rs_members // History storage: planar (one delay line per channel, hist_[c]) below // SRT_CP_MIN_CHANNELS, frame-major (single interleaved line, hist_[0]) // at or above it on SRT_CHANNEL_PARALLEL targets. 
end_/histCap_ count @@ -519,6 +540,7 @@ class FractionalResampler { std::size_t scratchFrames_ = 0; std::size_t scratchPos_ = 0; std::uint64_t phase_ = 0; // fractional position, unsigned Q0.64 + // ANCHOR_END: rs_members bool primed_ = false; }; From c5d91fa3dfd57e027973791bd8575f1d04fb6495 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 1 Jul 2026 21:37:09 +0000 Subject: [PATCH 07/16] book: the optimization campaign chapters (C1-C6) Part III told as it happened: C1-C2 profile-first and the restrict audit, C3's Q0.64 phase with the M55 float regression accepted, C4's bounded win, C5's implemented-then-reverted vrmpyh with disassembly evidence, and C6's channel axis with both recorded traps. Adds comment-only opt_* ANCHOR markers to polyphase_filter.hpp (opt_dot_rows co-brackets the rs_ region). https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9 --- book/src/part3/c1-c2.md | 325 ++++++++++++++++++++++++++++++- book/src/part3/c3-c5.md | 315 +++++++++++++++++++++++++++++- book/src/part3/c6.md | 325 ++++++++++++++++++++++++++++++- include/srt/polyphase_filter.hpp | 6 + 4 files changed, 965 insertions(+), 6 deletions(-) diff --git a/book/src/part3/c1-c2.md b/book/src/part3/c1-c2.md index 595ff7f..bbed89e 100644 --- a/book/src/part3/c1-c2.md +++ b/book/src/part3/c1-c2.md @@ -1,3 +1,324 @@ -# c1 c2 +# Profile first, claim later (C1–C2) -*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* +Part III is a story, told in the order it happened. The introduction promised +six optimization efforts — four wins, one honest draw, one deliberate revert — +and the next three chapters deliver them with the real numbers, including the +two that went sideways. This chapter covers the method and the first two +efforts. The method matters more than either result, because the method is +what made the later reversals *visible* instead of silently absorbed. + +A word about why the campaign existed at all. 
By the time it started, the +converter already beat its closest architectural analog, libsamplerate's +streaming polyphase engine, by roughly 3× at matched quality on the host +(the full head-to-head lives in `docs/COMPARISON.md`). Nobody was losing +sleep over Xeon throughput. The pressure came from the other end of the +target list: the embedded parts. A converter that costs ~1.6% of a Xeon +core even at eight channels is invisible; the same converter on a +Cortex-M33 or a Hexagon DSP is a line item in someone's cycle budget, and +every instruction shaved is budget returned to the application. That framing shaped the campaign's stopping +rule, written down before any code changed: + +> Optimization stops by budget, not by exhaustion. Stop when targets are +> met, when the profile is flat (no single hotspot ≥ 10%), or when the next +> win requires per-arch complexity the budget does not justify. + +Keep that third clause in mind. It fires, verbatim, two chapters from now. + +## The loop + +`docs/PERFORMANCE.md` opens with a working agreement — not aspiration, +process. Every PR that touched the hot path followed the same five steps: + +1. **Baseline** on the benchmark matrix. +2. **Profile** — `perf record` and a flamegraph for time, `-fopt-info-vec` + or `-Rpass=loop-vectorize` for any claim about vectorization. +3. **One hypothesis, one change, one PR** — each optimization PR carries + its before/after numbers in the description. +4. **A/B** — benchmarks for speed, the full test suite for correctness. The + pinned SNR thresholds are the quality guardrail: an optimization that + costs decibels fails CI by design, so "it's faster" can never quietly + mean "it's faster and slightly worse". +5. Repeat until a stopping condition triggers. + +Two measurement instruments back the loop, and they have opposite +personalities. 
+ +**Wall-clock throughput** (Google Benchmark, `bench/bench_asrc.cpp`) is what +users feel, and it is noisy — the project's benches run on shared CI +runners, where a neighbor's workload can move a number more than a real +regression does. So the docs state a rule this book has already quoted and +will quote again: *wall-clock benches are never a hard gate on shared +runners*. They run as a smoke test and produce trend artifacts. When this +chapter reports a wall-clock delta, it was measured as a same-machine, +same-session A/B — the only configuration in which the ratio means +anything. + +**Executed instructions** (the QEMU TCG plugin harness, `bench/icount/`) is +the opposite: deterministic to the instruction. Each embedded scenario is a +fixed-workload binary — bare metal has no argv, so there is one binary per +scenario — run under an instruction-counting plugin on emulated Cortex-M55, +Cortex-M33, and Hexagon. Counts are exact across runs; the project verified +that before trusting them. CI compares every scenario against a checked-in +`bench/baselines.json` and fails if any metric moves more than 3% in +*either* direction. The two-sidedness is the clever part: an improvement +beyond tolerance also fails until the baseline is re-recorded in the same +diff, because stale slack in the baseline is exactly the room a later +regression would hide in. + +Instruction counts are not cycle counts — no cache misses, no dual-issue, +no branch predictor. For the scalar code these targets run, they correlate +well with real cost, and they buy something cycles on shared hardware never +can: the ability to assert that a number did not change *at all*. That +ability is the backbone of everything that follows. + +Before the first change, the hypotheses were written down in expected-ROI +order: per-channel blend redundancy first, then auto-vectorization quality, +then a fixed-point phase accumulator, then explicit SIMD kernels. 
Writing +the list first is cheap insurance against the oldest failure mode in +optimization work — doing the fun change instead of the valuable one, then +constructing the justification afterward. + +## C1: the blend that was computed N times + +Recall the datapath from Part I. To produce one output sample, +`interpolate()` picks the two polyphase coefficient rows adjacent to the +fractional position μ, blends them tap-by-tap by the intra-phase fraction, +and dot-products the blended coefficients against the history window: + +```cpp +typename Tr::Accum acc{}; +for (std::size_t t = 0; t < taps; ++t) + acc = Tr::mac(acc, hist[t], Tr::blend(c0[t], c1[t], fr)); +``` + +Per output sample, per channel: one blend and one multiply-accumulate per +tap. Now watch what happens in a multichannel stream. Every channel of an +output frame is evaluated at the *same* μ — the channels advance through +time together; that is what makes them a frame. So the coefficient blend, +which depends only on μ, was being recomputed identically for every +channel. For stereo, half of the inner-loop arithmetic was duplicate work. +For twelve channels, eleven-twelfths of the blend work was. + +The fix is the obvious factoring, and its entire risk profile lives in one +question: does it change the output? Compute the blended row once per +output frame into a small scratch buffer — at most 80 entries, the +`transparent` preset's tap count — then run a plain dot product per +channel: + +```cpp +blendRow(bank, row, mu); // once per frame +for (std::size_t c = 0; c < channels; ++c) + out[c] = dotRow(row, window(c), taps); +``` + +The arithmetic per channel is *identical* to the fused loop: blend then +mac, per tap, in the same order, in the same types. Only the schedule +changed — blend hoisted out of the channel loop. 
Identical operations in +identical order produce identical bits, even in floating point, so the A/B +had a correctness criterion stronger than any SNR threshold: outputs +unchanged **bit-for-bit**. They were. + +The measured results, from the C1 entry in the performance log: + +| Measurement | Result | +|---|---:| +| Stereo pipeline, x86 wall-clock (same-machine A/B) | −36% | +| 8-channel pipeline, x86 wall-clock | −52% | +| M55 pipeline instructions, float / Q15 / Q31 | −15% / −30% / −21% | +| Hexagon pipeline instructions, float / Q15 / Q31 | −3.6% / −3.3% / −0.2% | +| Mono kernels, both targets | count-identical | + +Three of those rows deserve commentary; they carry the chapter's lessons. + +**The mono row is a control.** Mono has no duplicate blend — one channel, +one blend, nothing to hoist — so the change should not touch the mono +kernels at all. "Should not touch" is a hypothesis, and the deterministic +counter can test it exactly: the mono kernel scenarios were +count-identical, to the instruction, on both targets. Had they moved by +even a handful of instructions, that would have meant the change did +something beyond its stated mechanism — maybe harmless, maybe not, but +either way the PR's description would have been wrong, and a wrong +description is a review failure even when the numbers are green. Every +subsequent effort in this campaign carries controls like this, and the +discipline is worth stating as a rule: **a change must move what it claims +to move, and everything else must measure 0.00%.** Wall-clock benchmarks +cannot enforce that rule; nothing measures 0.00% on a shared Xeon. +Instruction counting can, and does. + +**The scaling is the hypothesis confirmed.** Stereo −36%, 8-channel −52%: +the win grows with channel count, exactly as a per-channel redundancy +elimination should. Numbers that match the *shape* of the prediction, not +just its sign, are how you know the mechanism you described is the +mechanism that acted. 
+ +**And then there is Hexagon.** The M55 dropped double-digit percentages; +Hexagon barely moved — −3.6% at best, and the Q31 pipeline a rounding-error +−0.2%. The same source code, the same factoring, the same eliminated +arithmetic. An under-delivering result like this is where measurement-first +culture earns its keep, because the temptation is to shrug — Hexagon is +weird, ship the M55 win — and the log did not shrug. If eliminating most of +the per-channel blend work barely dents the pipeline cost, then the +pipeline cost must be dominated by something that is neither blend nor dot +product. The remaining candidate was the per-sample *phase bookkeeping*: +μ lived in a `double`, and Hexagon has no double-precision FPU, so every μ +increment, wrap and index conversion was a soft-float library call. The +kernels were cheap; the glue between kernels was expensive. The C1 entry +records this diagnosis in one clause — "its pipelines are dominated by +per-sample soft-double phase math" — and flags it as the motivation for +C3. A disappointing result, read carefully, fingered the next target. That +is not a consolation prize; over the campaign it turned out to be C1's +second-most-valuable output. + +## C2: the audit — verify, don't assume + +Hypothesis 2 on the list was not a change at all. It was an audit: do the +hot loops actually vectorize, under the compilers and flags the project +ships with? Everyone who has read optimization folklore "knows" the +answers — contiguous arrays vectorize, reductions vectorize, the compiler +is smart. The project's rule, stated in the hypothesis itself: *verify, +don't assume.* The tool is the compiler's own testimony: +`-fopt-info-vec` on GCC, `-Rpass=loop-vectorize` (and its `-missed` +sibling) on Clang, which report loop by loop what vectorized and what did +not, and why. + +The audit produced four findings — one actionable, three that reshaped the +rest of the campaign's roadmap. 
+ +**Finding 1: `blendRow` vectorized, but behind a runtime aliasing check.** +The compiler could not prove that the output row and the coefficient table +don't overlap — they arrive as separate pointers, and separate pointers may +alias — so it emitted *two* versions of the loop, vector and scalar, with a +runtime overlap test choosing between them every call. The fix is the +oldest annotation in the C toolbox, wrapped for portability: + +```cpp +#if defined(_MSC_VER) +#define SRT_RESTRICT __restrict +#else +#define SRT_RESTRICT __restrict__ +#endif +``` + +`SRT_RESTRICT` on the kernel pointer parameters is a promise to the +compiler — these regions do not overlap — and the caller's structure makes +the promise true: the row is a private scratch member, the table is +immutable, the histories are distinct vectors. The versioning check and the +dead scalar copy disappear. The header carries a comment tying the +qualifier to the evidence (`verified with -fopt-info-vec; see +docs/PERFORMANCE.md, hypothesis 2`), so the next maintainer knows it is +load-bearing and not cargo cult. + +**Finding 2: the Q15 dot product auto-vectorizes, no help needed.** This is +worth a paragraph, because *why* it vectorizes is the piece of theory the +next two chapters stand on. A dot product is a reduction — every iteration +folds into one accumulator, a serial dependence chain. Vectorizing it means +computing partial sums in lanes and combining them at the end, which +**reorders the additions**. For integer arithmetic that reordering is free: +int64 addition is exactly associative, every 16×16 product is exact, so any +order of summation produces the same bits. The compiler knows this and +vectorizes integer reductions at `-O2` without being asked. + +**Finding 3: the float dot product is scalar — and stays scalar, by +design.** Floating-point addition is *not* associative; reordering the +accumulation changes the rounding, which changes the output bits. 
The +library's float datapath promises double-precision accumulation in a +defined order — that is part of what its measured 135 dB rests on — so the +compiler correctly refuses to vectorize the reduction, and the project +correctly declined to force it with `-ffast-math` or manual partial sums. +The audit *did* record the option: explicit 4-way double accumulation would +vectorize the float dot and change output bits, and it entered the log as +**deferred hypothesis 5** — a bit-changing optimization, parked until the +budget demands it and the quality harness can re-baseline around it. Hold +that thought; hypothesis 5 has a surprising fate in the C6 chapter, where +an axis nobody had listed makes float vectorization possible *without* +changing a bit. + +**Finding 4: the Q31 dot product is scalar too**, for a blunter reason — +baseline ISAs have no packed 64-bit multiply, and Q31 MACs need 32×32→64 +products. No annotation fixes an instruction set. Noted, filed, moved on. + +One actionable change, then: `SRT_RESTRICT` on the kernel signatures. The +measured effect, and this time the controls are the headline: + +| Scenario | Δ instructions (M55) | +|---|---:| +| `pipeline_float` | −1.35% | +| every other scenario, both targets | **0.00%** | + +On x86, a same-state wall-clock A/B measured −3.7% — the aliasing check sat +in a hotter relative position there. But look at the M55 table with C1's +rule in mind. The claim was narrow: *restrict removes a runtime aliasing +check from `blendRow`*. The fixed-point pipelines blend through the same +function — but their loop bodies differ, the versioning overhead lands +differently, and on M55 only the float pipeline was paying measurably. +Fine. What the claim *requires* is that nothing else moves: the qualifier +is documentation to the optimizer, not arithmetic, so any scenario where +the codegen was already clean must be bit-identical binary. All of them +were, to the instruction. 
A −1.35% win surrounded by exact zeros is a +*verified mechanism*. A −1.35% win alone is just a number. + +It is worth pausing on how unusual that sentence is. In most performance +work, "this change affects only X" is a belief. Here it is a measurement, +because the instrument has no noise floor. The ratchet infrastructure was +built to catch regressions; the campaign discovered its second use almost +immediately — it certifies *non-effects*, which is what turns an +optimization PR from "trust me" into an experiment with controls. Chapter +C6 will show the dramatic version: an embedded control that *failed* — a +hosts-only feature that leaked +6–8% into the M55 — and stopped a merge. + +## What two efforts bought + +The scoreboard after C1 and C2: multichannel wall-clock roughly halved at +high channel counts, double-digit instruction reductions on the M55 +pipelines, a `restrict` qualifier with a paper trail — and, less tangibly, +three pieces of map. Hexagon's cost lives in soft-double phase math (C3's +target). The M33-class parts, with no vector unit at all, will need +something explicit for Q15 (C4's target). And the float dot product cannot +be vectorized over taps without changing bits (the constraint C6 +eventually routes around). None of those three facts was known before; all +three came from measurements that individually looked like disappointments +or non-events. + +That is the method chapter's actual thesis. The loop — baseline, profile, +one hypothesis, A/B with controls — is not bureaucracy around the real work +of optimizing. On this evidence it *is* the real work: every effort in the +next two chapters was aimed by an anomaly this chapter's measurements +surfaced and refused to explain away. 
+ +## Verify it yourself + +```sh +# Host wall-clock benchmarks (Google Benchmark): +cmake -B build -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_BENCHMARKS=ON +cmake --build build -j +./build/bench/srt_bench --benchmark_filter='Pipeline' + +# Wall-clock deltas in this chapter are same-machine A/Bs. To reproduce +# one, run the benchmark at the parent commit and at the change on the +# same machine in the same session — the project never gates on +# wall-clock from shared runners, and neither should you. + +# The compiler's own vectorization testimony (C2's instrument): +cmake -B build-vec -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_FLAGS="-fopt-info-vec-optimized -fopt-info-vec-missed" +cmake --build build-vec -j 2>&1 | grep -i 'polyphase' + +# Deterministic instruction counts, exactly as CI gates them +# (arm-none-eabi-gcc + qemu-system-arm + the counting plugin; see +# .github/workflows/ci.yml for the plugin build): +cmake -B build-m55 -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m55-mps3.cmake \ + -DSRT_BUILD_TESTS=OFF -DSRT_BUILD_EXAMPLES=OFF \ + -DSRT_BUILD_ICOUNT_BENCH=ON +cmake --build build-m55 -j +python3 scripts/icount.py --target m55 --build-dir build-m55 \ + --plugin /tmp/libinsncount.so + +# The quality guardrail that every optimization PR had to clear: +ctest --test-dir build -R Quality --output-on-failure +``` + +Run the icount harness twice and diff the outputs: identical, to the +instruction. That reproducibility is the entire epistemology of this +chapter — it is what lets "all other scenarios: 0.00%" be a finding +instead of a hope. diff --git a/book/src/part3/c3-c5.md b/book/src/part3/c3-c5.md index f88096c..8b56db6 100644 --- a/book/src/part3/c3-c5.md +++ b/book/src/part3/c3-c5.md @@ -1,3 +1,314 @@ -# c3 c5 +# The integer phase and the wide MACs (C3–C5) -*Chapter not yet written — see the outline in the book plan. 
Each chapter lands as its own PR.* +The previous chapter ended with an anomaly: C1 stripped most of the +per-channel blend work out of the datapath, the M55 pipelines dropped by +double digits, and Hexagon's barely moved. The diagnosis written into the +log was that Hexagon's pipeline cost was dominated not by the kernels but +by the glue between them — the per-sample phase bookkeeping, done in +`double`, on a DSP with no double-precision FPU. Every μ update was a +soft-float library call. + +This chapter is what happened when the campaign acted on that diagnosis, +and then kept going: one clean win that falsified the project's own +documentation on the way through (C3), one honest, bounded win on the +smallest target (C4), and one implementation that was correct, complete, +measured — and deliberately deleted (C5). The theme, if the chapter has +one: **a negative result recorded is a win.** Not a moral victory — an +actual asset, with a measurable replacement cost. + +## C3: evicting the last double from the per-sample path + +The fractional resampler tracks its position between input samples as a +phase in [0, 1). Before C3, that phase was a `double` named μ, and the +per-sample loop did double-precision work even on the fixed-point +datapaths: advance μ by the rate ratio, detect wrap past 1.0 or below 0.0, +scale by the phase count L, split into integer row index and fractional +blend factor. On a Xeon this is noise. On a core where every double +operation is a function call, it was — per C1's evidence — the dominant +per-sample cost. + +The C3 design replaces the double with an unsigned **Q0.64** fixed-point +fraction: a plain `uint64_t` whose full range represents [0, 1) with +resolution 2⁻⁶⁴. Three properties make this format almost suspiciously +well-suited to the job. + +**The unity part of the ratio never enters the accumulator.** This is the +near-unity specialization paying out one more time. 
The converter's ratio +is 1 + ε with |ε| servo-clamped to around 10⁻³, so the resampler advances +one input frame per output frame *structurally* and only the deviation ε +accumulates in the phase. ε is converted from the servo's double to a +signed Q0.64 increment **once per `process()` call** — block rate, not +sample rate. That single conversion is the only double arithmetic left +near the hot path. + +**Wraparound detection is free.** Unsigned overflow is defined, modular +arithmetic — the same property the ring buffer's monotonic indices leaned +on in Part I. If adding a positive ε wraps the phase past 2⁶⁴, the sum +comes out *smaller* than the old phase; that comparison **is** the slip +detector, and the response is to consume one extra input frame. A negative +ε wrapping below zero comes out *larger*, and the window is reused. No +epsilon-comparisons against 1.0, no branch on the sign of a floating-point +residual — two integer compares. + +**The table index and blend factor are bit fields.** L is a power of two, +so the polyphase row index is simply the top log₂ L bits of the phase, and +the intra-phase blend factor is the bits below, shifted up: + +```cpp +const int lg = std::countr_zero(bank.numPhases()); +const std::size_t p = static_cast<std::size_t>(phase >> (64 - lg)); +const auto fr = Tr::blendFactorFromQ64(phase << lg); +``` + +No multiply by L, no floor, no subtract — shifts. The per-sample path is +now integer-only on the fixed-point datapaths, plus a single +single-precision conversion on the float path. + +Notice also what the format does to *resolution*: a double's mantissa gives +the old μ about 2⁻⁵² of precision; Q0.64 gives 2⁻⁶⁴. Twelve extra bits of +phase resolution means less quantization of the sampling instant, and Part +0's arithmetic connected phase jitter directly to distortion.
So the C3 +entry contains a line that optimization logs almost never get to contain: +**quality improved** — 135.0 dB at 997 Hz, versus the previous baseline, +measured by the same pinned tests that would have failed the PR had it cost +decibels. Faster and cleaner, from one change. + +### The falsification + +Now the numbers, and the embarrassing one first, because the log put it +first. M55 instruction counts: + +| M55 scenario | Δ instructions | +|---|---:| +| pipeline Q15 | −5.3% | +| pipeline Q31 | −4.6% | +| pipeline float | **+1.4%** | + +The fixed-point wins were expected. The float *regression* was not — it +contradicted the project's own documentation. The performance plan's +hypothesis list had asserted, in writing, that the M55's float path was +soft-double-bound, just like Hexagon's; the M55 was on the list of targets +the integer phase was supposed to rescue. If that were true, replacing +per-sample double math with int64 math should have helped the float +pipeline too. Instead the float pipeline got slightly *worse*. + +One of three things had to be wrong: the measurement, the change, or the +documentation. The measurement is deterministic and was reproduced. The +change was doing exactly what it claimed on every other scenario. That +left the documentation — and a check of the architecture manuals settled +it: **the Cortex-M55's scalar FPU supports FP64.** Only its vector +extension, Helium/MVE, is limited to fp16/fp32. The M55 float path had +never been soft-double-bound; its doubles were cheap hardware doubles all +along, and C3 had traded them for int64 sequences that cost slightly more. +The genuinely double-less target in the fleet is Hexagon, and only +Hexagon. + +The correction is recorded *in the hypothesis list itself* — hypothesis 3 +in `docs/PERFORMANCE.md` now reads as a correction notice ("discovered +while measuring: Cortex-M55's *scalar* FPU does support FP64…"), so the +false belief cannot quietly re-seed a future roadmap. 
And the +1.4% was +accepted, eyes open, as the price of a cross-target win: the phase +accumulator is one implementation shared by every datapath, and forking it +per-target to claw back 1.4% on one scenario is exactly the per-arch +complexity the stop rule exists to refuse. + +This is the campaign's cleanest specimen of the culture the introduction +promised. A 1.4% regression on one scenario of one target is the kind of +number a wall-clock benchmark would eat as noise. The deterministic +harness surfaced it; the loop's rule — explain every number, especially +the small ugly ones — forced the investigation; the investigation +falsified a documented belief about the hardware. *The measurement audited +the documentation*, not the other way around. + +### The target it was aimed at + +Hexagon, from the PR's gating run: + +| Hexagon scenario | Δ instructions | +|---|---:| +| pipeline Q31 | −15.5% | +| pipeline Q15 | −10.3% | +| pipeline float | −2.6% | +| kernels (all types) | count-identical | + +The per-sample soft-double phase math that C1 had identified as dominating +Hexagon's pipelines is simply gone. The kernel scenarios — which measure +`interpolate()` in isolation, no phase bookkeeping — were count-identical, +the control confirming the change touched only what it claimed. On x86, a +same-minute A/B measured float −5.4% and Q15 −12.0% wall-clock; hosts keep +score too, they just don't gate. + +C1 found the target; C3 hit it. That is the loop working across PRs, not +just within one. + +## C4: two MACs per instruction, where the compiler won't + +Next on the list: explicit SIMD, "partially moot" before it started. The +audit trail from C2 explains why. On the M55, objdump had confirmed that +GCC already auto-vectorizes the Q15/Q31 kernels with Helium at -O2 — the +M55's roughly 4× Q15 advantage over the scalar M33 in the baselines is MVE +at work, no intrinsics required. 
But the fleet has a whole class of parts +below the M55: Cortex-M33, M4, M7 — the Raspberry Pi Pico 2 class. These +have no vector unit at all. What they *do* have is the Armv7E-M/Armv8-M +**DSP extension**: scalar instructions that treat a 32-bit register as two +16-bit lanes. The one that matters here is `SMLALD` — *signed multiply +accumulate long dual* — which takes two such registers, forms both 16×16 +products, and adds both into a 64-bit accumulator. One instruction, two +Q15 MACs: precisely the inner operation of `dotRow`, at double width. + +The bit-exactness argument is short enough to carry in your head, and it +is the same argument C2's finding 2 established: every 16×16 product is +exact in int32, int64 addition is associative, therefore summing the +products in pairs instead of one-by-one changes no output bit. The +intrinsic path and the scalar loop are not "close" — they are the same +function, by construction. (Contrast the float dot, where this argument is +exactly what fails.) + +The subtle part of C4 is not the intrinsic; it is the **gate**. Here is +the actual block from `include/srt/polyphase_filter.hpp`, pulled in live: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:opt_smlald_gate}} +``` + +Read the condition: DSP extension present *and MVE absent*. The naive gate +— "use the fast intrinsic wherever the ISA has it" — would have enabled +SMLALD on the M55 too, where the compiler is currently vectorizing that +loop with Helium. The intrinsic loop is hand-written; the compiler will +not auto-vectorize *through* it; enabling it on the M55 would have +silently replaced full vector arithmetic with dual scalar MACs — an +optimization for one target acting as a pessimization on a better one. 
+This is the MVE-gate discovery, and it generalizes: **an intrinsic is a +floor and a ceiling at once.** The ratchet verified the gate the only way +that counts — every M55 and every Hexagon scenario at exactly 0.00%, +meaning those binaries are instruction-for-instruction unaffected by C4's +existence. + +One routing consequence: mono Q15 on these targets now goes through +`blendRow` + `dotRow` rather than the fused `interpolatePhase()`, because +the dual-MAC loop lives in `dotRow` — legitimate only because C1 +established the two paths are bit-exact against each other. + +The result: **M33 `pipeline_q15` −3.1%.** And here the log does something +worth imitating, under the heading of honest accounting. A 2×-wide MAC +did not halve the frame cost, or even trim a tenth off it — why? Because the +M33's Q15 frame cost is not dominated by the dot product. It is dominated +by the coefficient *blend* — whose per-tap `fr * diff >> 15` is a 64-bit +product, one `smull` each, already one instruction — and by transport +around the datapath. The candidate follow-up, a packed blend, would change +the documented int64 blend invariant that the bit-exactness proofs rest +on, and was declined at current budgets. The entry even flags that the +`kernel_q15` scenario still measures the fused `interpolate()` call, which +C4 intentionally does not touch — so nobody later mistakes that flat +number for a failed optimization. This is the campaign's honest draw: a +real, kept, correctly-gated improvement, described at its true size, with +the reason it is small written next to it. + +## C5: the revert, and why it is the best entry in the log + +Hexagon's turn. The C4 argument seemed to transfer directly, at double the +width again: Hexagon's scalar ISA has `vrmpyh`, which forms **four** exact +16×16 products per instruction and can feed an int64 accumulator.
Four Q15 +MACs per instruction against a loop the profile says is pure MACs — the +back-of-envelope says this is the biggest single win in the campaign. + +It was implemented. It passed the full test suite under Hexagon QEMU, +bit-exactly — the same associativity argument held. And it measured: + +> `pipeline_q15`: 119,847,854 → 119,478,758 instructions. **−0.31%.** + +A 4-wide MAC bought less than a third of a percent. + +The reflexive explanation — the one everyone reaches for first — is that +the compiler had already vectorized the loop, so the intrinsic replaced +equivalent code. The team checked, because the reflexive explanation is +also checkable: disassemble both binaries (CI's llvm-objdump, pre and +post) and count the wide MACs. The baseline binary contains **zero**. The +intrinsic build contains **10**. The compiler had *not* done it already; +the intrinsic genuinely landed, four products at a time, exactly as +designed — and it barely mattered. + +The real explanation is better, and it is the piece of architecture +knowledge the whole effort purchased: **Hexagon's scalar ISA is already +half a DSP.** Its ordinary instruction set has single-instruction 64-bit +multiply-accumulates (`Rxx += mpy`) and 64-bit loads, so the "scalar" +baseline loop was already running at a density that would take intrinsics +to reach on a Cortex-M. On top of that, the history window is 2-byte +aligned — it slides one sample at a time by design — so feeding `vrmpyh`'s +packed operands costs combine/alignment work that eats most of what the +wide multiply saves. The instruction *was* wide; the loop around it had to +pay to keep it fed. + +Now the stop rule from the first page of this Part, firing on schedule: +*the next win requires per-arch complexity the budget does not justify.* +A −0.31% improvement is real — deterministic, reproducible, green across +the suite. 
It is also the definition of not worth it: a Hexagon-specific +intrinsic path is a second implementation to review, to gate, to keep +bit-exact against the reference forever, purchased for three-tenths of a +percent. The code was reverted. Not lost to a branch nobody can find — +**reverted, with the entry as the deliverable**: the numbers, the +disassembly evidence, and the analysis now live in `docs/PERFORMANCE.md` +under C5, so the next engineer who has the vrmpyh idea (and someone will; +it is a *good* idea) spends five minutes reading instead of two days +re-deriving a dead end. + +The entry's final paragraph is the part that turned out to be prophetic. +Having established that the win wasn't there in scalar-wide instructions, +it asks whether HVX — Hexagon's actual 128-byte vector unit — could do +better, and answers with a shape argument: a 48–80-tap dot product does +not fill one HVX vector, and HVX's 16-bit MACs accumulate into 32-bit +lanes, which overflows the library's exact-int64 invariant after about 24 +worst-case taps. **Per-channel dot products are the wrong shape for HVX** +— not slow, *wrong-shaped*: the axis being vectorized (taps) is too short +and demands too much accumulator width per lane. The shape that fits is +turned ninety degrees: one 64-bit lane-pair per *channel*, sixteen +channels filling one vector exactly — vectorize across channels, not +across taps. That observation was recorded as hypothesis C6, and the next +chapter is what happened when it met the float datapath's +may-not-reorder-additions constraint and dissolved it. + +Score the chapter the way the introduction scored the campaign. C3: a win +that corrected the project's documentation. C4: a draw, honestly sized. +C5: a revert that produced no code and two durable facts — Hexagon's +scalar MAC density, and the channel-axis insight that C6 is built on. The +log entry for the revert cost nothing to keep and pointed directly at the +campaign's largest remaining win. 
Negative results, *recorded*, compound. + +## Verify it yourself + +```sh +# C3's quality claim — the pinned SNR thresholds (135 dB at 997 Hz for +# the float path) are asserted by the test suite, not the docs: +ctest --test-dir build -R Quality --output-on-failure + +# C3/C4 instruction counts on the Arm targets (the M33 leg is where C4's +# −3.1% lives; every M55 scenario is C4's 0.00% control): +cmake -B build-m33 -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m33-mps2.cmake \ + -DSRT_BUILD_TESTS=OFF -DSRT_BUILD_EXAMPLES=OFF \ + -DSRT_BUILD_ICOUNT_BENCH=ON +cmake --build build-m33 -j +python3 scripts/icount.py --target m33 --build-dir build-m33 \ + --plugin /tmp/libinsncount.so + +# C4's gate, interrogated directly: preprocess for an M33 and for an M55 +# and watch SRT_Q15_SMLALD flip (DSP extension without MVE vs with): +arm-none-eabi-gcc -mcpu=cortex-m33 -dM -E - ::Coeff* SRT_RESTRICT row, const S } // ANCHOR_END: rs_dot_row +// ANCHOR: opt_dot_tile /// One K-channel tile of the channel-parallel dot (hypothesis C6): K /// accumulators live in a constexpr-size local array — registers, not /// memory — while the tap loop walks the frame-major window with stride @@ -319,8 +322,10 @@ inline void dotTileFrameMajor(const typename SampleTraits::Coeff* SRT_RESTRIC for (std::size_t k = 0; k < K; ++k) out[k] = Tr::finalize(acc[k]); } +// ANCHOR_END: opt_dot_tile // ANCHOR: rs_dot_rows_frame_major +// ANCHOR: opt_dot_rows /// Channel-parallel dot products over a frame-major history block: all /// channels' outputs for one frame in register-blocked tiles of 8/4/2/1. /// Per channel the accumulation order over taps equals dotRow's, so the @@ -346,6 +351,7 @@ inline void dotRowsFrameMajor(const typename SampleTraits::Coeff* SRT_RESTRIC dotTileFrameMajor(row, x + c, taps, channels, out + c); } // ANCHOR_END: rs_dot_rows_frame_major +// ANCHOR_END: opt_dot_rows // ANCHOR: rs_class_doc /// Streaming fractional-delay engine for one converter instance. 
From 073d886cf76e124eb9e5c3925bba5de8311d0cb3 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 1 Jul 2026 21:37:49 +0000 Subject: [PATCH 08/16] book: sample traits and the C ABI chapters The traits/concept mechanism interleaved with a fixed-point course (Q-format headroom, recomputed accumulation bounds, the blend-margin audit correction) and the C-ABI shim chapter (opaque handles, C-linkage overload workaround, null-guard audit history, no-unwind caveat). Adds comment-only st_*/abi_* ANCHOR markers. https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9 --- book/src/part1/sample-traits.md | 428 +++++++++++++++++++++++++++++++- book/src/part4/c-abi.md | 322 +++++++++++++++++++++++- include/srt/sample_traits.hpp | 28 +++ tools/capi/srt_capi.cpp | 8 + tools/capi/srt_capi.h | 4 + 5 files changed, 786 insertions(+), 4 deletions(-) diff --git a/book/src/part1/sample-traits.md b/book/src/part1/sample-traits.md index 54a8df2..1a2d2c9 100644 --- a/book/src/part1/sample-traits.md +++ b/book/src/part1/sample-traits.md @@ -1,3 +1,427 @@ -# sample traits +# Sample types as a customization point: `sample_traits.hpp` -*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* +The polyphase machinery of the last two chapters computes one thing: a dot +product between a window of input samples and an interpolated row of filter +coefficients. The problem is that this library ships to machines that do not +agree on what a number is. A Xeon host wants `float` samples and will happily +accumulate in `double`. A Hexagon DSP has no double-precision FPU at all — +every `double` operation is a soft-float library call. A Cortex-M33 has no +vector unit and wants 16-bit samples it can crunch two at a time. The same +algorithm must therefore run in three different arithmetic systems, produce +measured quality in each, and pay nothing for the flexibility. + +Here is what "nothing" has to mean, concretely. 
The inner loop of +`interpolate()` runs one multiply-accumulate and one coefficient blend per +tap, per channel, per output sample. At 48 kHz stereo with the default +balanced preset (48 taps), that is about 4.6 million multiply-accumulates +per second — and every one of them goes through the customization point this +chapter describes. Any mechanism that adds even one indirect call to that +path has already lost. + +This chapter tells two interleaved stories. The C++ story is how a traits +struct and a concept make the sample type a *compile-time* customization +point — and why the obvious alternatives (virtual dispatch, CRTP) were +rejected. The arithmetic story is fixed-point numerics from scratch: what +Q-formats are, where the headroom bits went, why the accumulators are +exactly as wide as they are, and two places where the file's own comments +record hard-won corrections. The two stories are one file: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_overview}} +``` + +Three sample types, and a division of labor worth pausing on: the clock +servo and the filter *design* always run in `double`, because they execute a +handful of operations per block or once at construction. Only the datapath — +the code that touches every sample — is templated. Optimizing anything else +would be effort spent where the profile isn't. + +## The mechanism: a struct full of static functions + +The customization point is a class template with no primary definition: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_primary}} +``` + +Leaving the primary template *undefined* is deliberate. A defined primary +template would need default behavior, and there is no honest default for +"how do I multiply-accumulate your type" — any guess would compile for +unsupported types and be silently wrong. Undefined, the template turns an +unsupported type into a compile error at the first use. (A more *readable* +error is the concept's job, below.) 
+ +Each supported type then gets a full specialization. The float one is the +simplest and shows the complete vocabulary — three associated types and +seven operations: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_float}} +``` + +Every operation the datapath performs on samples is named here: convert a +designed coefficient to storage form (`makeCoeff`), convert the fractional +position to the blend representation (`makeBlendFactor`, +`blendFactorFromQ64`), the multiply-accumulate (`mac`), the adjacent-phase +coefficient blend (`blend`), the accumulator-to-sample conversion +(`finalize`), and silence. The polyphase chapter's `interpolate()` is written +entirely in this vocabulary: + +```cpp +acc = Tr::mac(acc, hist[t], Tr::blend(c0[t], c1[t], fr)); +``` + +and consequently never mentions `int16_t`, `double`, or a shift instruction. +One algorithm, one body of tests, three number systems. + +### Why not virtual dispatch + +The classical OO answer — an abstract `SampleOps` interface with `mac()` and +`blend()` as virtual functions — fails on the arithmetic of the hot loop. +A virtual call is an indirect call through a vtable: the compiler cannot +inline it, and what it cannot inline it cannot optimize *across*. The Q15 +`mac` below compiles to roughly two instructions when inlined; as a virtual +call it would be a call, a return, an argument setup, and — far worse — an +opaque boundary in the middle of the loop. Everything Part III wins depends +on the compiler seeing through these functions: the Q15 dot product +auto-vectorizes on hosts and gets Helium code on the M55 (the C2 audit +verified both), and the C4 SMLALD kernel exists because the products were +visible as exact 16×16 multiplies. Four and a half million vtable +indirections per second, each one an optimization fence, was never a +candidate. + +Virtual dispatch also answers a question nobody asked. 
Dynamic dispatch +buys the ability to choose the implementation *at run time* — but a +converter's sample type is fixed at the moment you write +`AsyncSampleRateConverterQ15`. Paying the vtable price for flexibility that +is never exercised is the definition of the wrong tool. + +### Why not CRTP + +The curiously recurring template pattern is the usual zero-cost answer to +virtual dispatch, and it was rejected for a simpler reason: CRTP customizes +through *inheritance* — `class MySample : SampleBase<MySample>` — and the +sample types here are `float`, `std::int16_t`, and `std::int32_t`. You +cannot derive from a built-in type, and you should not have to wrap one in +a class (with all the conversion friction that implies) just to teach a +library how to multiply it. A traits struct attaches behavior to a type +*from the outside*, without requiring the type's cooperation. This is the +same reason the standard library uses `std::char_traits` rather than +requiring your character type to inherit from something: the type being +customized is not yours to modify. + +The cost of the traits approach is one level of naming indirection +(`SampleTraits<S>::mac` instead of `x.mac`), which a `using Tr =` alias +reduces to nothing. The benefit is that the whole mechanism evaporates at +compile time: every call in this file is a `static` member function, +resolved by the template machinery, inlined by any compiler at any +optimization level worth shipping. + +## Q-formats, from zero + +Now the arithmetic story. Fixed-point notation **Qm.n** describes an +integer reinterpreted as a fraction: *n* bits after the binary point, *m* +bits (beyond the sign) before it. The stored integer *k* represents the +value *k* / 2ⁿ. So: + +- **Q0.15** ("Q15"): an `int16_t` representing *k* / 2¹⁵. Range −1.0 to + +0.99997. This is what 16-bit audio *is* — the industry just rarely says + so out loud. +- **Q0.31** ("Q31"): the same idea in an `int32_t`, range −1.0 to + +(1 − 2⁻³¹).
+- **Q1.14**: an `int16_t` representing *k* / 2¹⁴ — one bit of *headroom* + above ±1.0, range −2.0 to +1.99994, at the cost of one bit of precision. + +Addition in a Q-format is ordinary integer addition. Multiplication adds +the fractional bit counts: Q0.15 × Q1.14 gives a product with 29 fractional +bits (Q29). Nothing is approximate yet — an integer multiply of two 16-bit +values is *exact* in 32 bits. Fixed-point arithmetic done carefully is not +"lossy integer math"; it is exact arithmetic with explicitly scheduled +rounding. The whole craft is deciding where the one rounding happens and +proving nothing overflows before it. + +## The headroom bit: why coefficients are Q1.14, not Q0.15 + +The obvious choice for 16-bit coefficients is Q0.15, same as the samples. +It does not work, and the reason is a property of the filter itself: each +polyphase row has unity DC gain, and the prototype's *peak tap* reaches +approximately 1.0. Q0.15's most positive value is 0.99997 — the peak tap +does not fit. Saturating it would dent the filter's frequency response +precisely at the row where the response matters most. + +So the coefficients trade one precision bit for one headroom bit: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_q15_coeff}} +``` + +with the conversion doing round-half-away-from-zero and saturating at the +integer limits (the *design* is checked separately; saturation here is a +belt against future filter specs, not an expected event): + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_roundsat}} +``` + +What did the traded bit cost? Quantizing coefficients to Q14 puts the +filter's stopband floor at roughly −86 dB — and the header's comment makes +the argument that matters: the Q15 *output* format's own noise floor is of +the same order. A 16-bit datapath cannot deliver more than the 16-bit +format can carry, so spending coefficient precision beyond the format's +floor would purchase nothing measurable. 
The end-to-end test agrees: the +Q15 converter measures **~77 dB SNR** on a half-scale 997 Hz sine across a ++200 ppm clock crossing (`tests/test_fixed_point.cpp` prints it; the CI +threshold sits at 73 dB), and that number is the *format's* floor, not the +converter's. The same trade at 32 bits gives Q1.30 coefficients +(`makeCoeff` scales by 2³⁰), where the quantization floor is so far down +that the Q31 path measures **133 dB** — statistically the float datapath's +own 135 dB. + +The two unit tests pinning the scale factors are almost insultingly simple, +and that is their virtue: `Q15::makeCoeff(1.0) == 16384` is the sentence +"the peak tap fits" written as an assertion. + +## The accumulation story: exact until the last line + +Here is the Q15 multiply-accumulate: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_q15_mac}} +``` + +Two things are chosen here. The product is computed in `int32_t` — a +16×16→32 multiply, which every target does in one instruction — and it is +**exact**: the worst-case product is −32768 × −32768 = 2³⁰, comfortably +inside `int32_t`'s ±2³¹ range. But note how *thin* that comfort is: a +single worst-case product uses all but one bit of an `int32_t`. Summing +even two of them could wrap. An `int32_t` accumulator is therefore not +"risky"; it is simply wrong. + +The accumulator is `int64_t`, and now do the arithmetic the comment +gestures at. The shipping filters run 32 to 80 taps per phase (fast, +balanced, transparent presets). Summing N values adds at most log₂N bits +to the worst-case magnitude: 48 taps add ~5.6 bits, 80 taps add ~6.3 — call +it six to seven bits. Worst case for the transparent preset: +80 × 2³⁰ < 2³⁷, against an accumulator that holds ±2⁶³. Twenty-six bits of +spare headroom. That surplus is the point: the sum is exact — not +approximately safe, *exact*, every intermediate value representable — no +matter what the samples and coefficients do. 
There is no intermediate +rounding anywhere in the loop, which also means the accumulation is +associative, which is why the C4 chapter's dual-MAC kernel and the C1 +blended-row rewrite could both be verified *bit-exact* rather than +"close enough." + +All of the rounding budget is spent in one place: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_q15_finalize}} +``` + +The accumulator holds a Q29 value (Q0.15 sample × Q1.14 coefficient); the +output wants Q15; so shift right by 14 after adding half an output LSB +(1 << 13). That is round-half-up. A numericist will object that +round-half-up carries a bias and round-half-even does not — and the comment +answers the objection with scale: the bias exists only on exact half values +and is a fraction of one sub-LSB rounding step, orders below the Q15 noise +floor that the 77 dB measurement already includes. Half-even costs extra +operations per output sample to fix an error you cannot measure. The +`clampSat` around it is the saturation that makes hot signals *clip* +instead of wrap — and wrapping is the catastrophic failure mode: + +```cpp +EXPECT_EQ(Q15::finalize(std::int64_t{1} << 40), 32767); +``` + +plus an end-to-end test (`FullScaleSineDoesNotWrapQ15`) that drives a +99%-of-full-scale sine through a +500 ppm crossing and asserts the output's +second difference never exceeds the analytic bound for a clean sine — a +wraparound anywhere inside would blow that bound by orders of magnitude. + +## Q31 and the pre-shift: when even int64 isn't enough + +The 32-bit path cannot copy the 16-bit strategy, and the reason is worth +computing rather than asserting. A full-precision Q0.31 × Q1.30 product +carries 61 fractional bits and a worst-case magnitude near 2⁶¹ (full-scale +sample, peak ~1.0 coefficient). An `int64_t` holds ±2⁶³ — barely four such +products of margin. The shortest shipping filter sums 32 of them; the +transparent preset sums 80. 
At 48 taps the worst-case sum is +48 × 2⁶¹ ≈ 2⁶⁶·⁶, over the accumulator's limit by a factor of about twelve. +Full-precision products simply do not fit, and there is no 128-bit +accumulator worth having on the targets this path exists for. + +So each product gives up 16 bits *before* joining the sum: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_q31_mac}} +``` + +Now redo the bound: Q45 products have worst-case magnitude 2⁴⁵, and +80 × 2⁴⁵ < 2⁵² — eleven bits of headroom restored. What did the discarded +bits cost? Each truncation throws away less than one Q45 LSB, and the +final conversion (`finalize` shifts a further 14 bits, Q45 → Q31) puts a +Q45 LSB **14 bits below the output's own LSB**. Even if all 80 taps' +truncation errors conspired in the same direction, the accumulated error is +under 80 × 2⁻⁴⁵ ≈ 2⁻³⁸·⁷ — less than 1/200 of one Q31 output LSB. The +measurement closes the argument: the Q31 converter's 133 dB / 105 dB +(997 Hz / 19.5 kHz) match the float datapath's numbers, whose residual is +set by the phase-table interpolation, not by anyone's arithmetic. The +discarded bits are provably and measurably inaudible — this is the +fixed-point craft in one line of code: *decide* where precision dies, +prove the grave is deep enough, then measure anyway. + +The full specialization, for reference — note the doc comment carries the +same overflow argument, so the file survives without the book: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_q31}} +``` + +## The blend, and the comment that was wrong by three orders of magnitude + +`blend` linearly interpolates between the same tap of two adjacent phase +rows (the polyphase chapter explains why; the residual falls ~12 dB per +doubling of the phase count). In Q15 it looks like this: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_q15_blend}} +``` + +That comment has a history, and the history is this book's whole +methodology in miniature.
The blend multiplies a Q15 fraction +(`fr` ≤ 32767) by a coefficient difference (`diff` = b − a, two `int16_t` +values, so |diff| ≤ 65535). The original version of this comment justified +the `int64_t` by claiming the `int32_t` product would fit "but only with +~5% margin." An audit later recomputed it: the worst-case product is +32767 × 65535 = 2,147,385,345, and `INT32_MAX` is 2,147,483,647. The +margin is 98,302 out of 2.1 billion — **0.005%**, not 5%. Three orders of +magnitude, in a comment whose entire job was to quantify safety. + +Nothing was wrong with the *code* — it used `int64_t` and still does. But +consider what the wrong comment was waiting to do: some future optimizer, +squeezing the M33 (where the C4 campaign found this very blend dominates +the Q15 frame cost — each `fr * diff` is a `smull`), reads "~5% margin," +concludes the `int32_t` version is comfortably safe, and ships a datapath +that is one adjacent-phase anomaly away from integer overflow. The audit +also measured the *actual* worst |diff| on the transparent table: 41 — +real coefficients come nowhere near the bound. The corrected comment keeps +both numbers and the conclusion: a margin of 0.005% against a theoretical +bound is not an invariant to lean on silently, whatever today's table +does. The lesson generalizes: **a safety-margin comment is arithmetic, and +arithmetic in comments rots exactly as fast as arithmetic in code — the +difference is that no test ever fails on it.** Verify the numbers you +write in prose. This book's build system exists because of that sentence. + +(The Q31 blend uses a Q20 fraction rather than Q15 — since the product runs +in `int64_t` anyway, the five extra fraction bits are free.) + +## `blendFactorFromQ64`: feeding the integer phase + +One trait remains, and it earns its keep on exactly one class of hardware.
+The C3 optimization (Part III) replaced the resampler's `double` phase +accumulator with a Q0.64 integer — after which the *only* floating-point +left on the fixed-point per-sample path was the conversion of the phase +fraction into a blend factor. `blendFactorFromQ64` closes that hole. The +Q15 version is a single shift — the top 15 bits of the fraction *are* the +Q15 blend factor: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_q15_q64}} +``` + +The float version is subtler: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_blend_q64_float}} +``` + +Why reduce to 24 bits first? Because a `float` significand holds exactly +24 bits: any integer up to 2²⁴ converts to `float` *exactly*, and the +subsequent multiply by 2⁻²⁴ (a power of two) is also exact. Convert the +full 64-bit fraction instead and the compiler must round — correctly, but +via a path that on a double-less target may detour through software +arithmetic. This two-instruction dance keeps the conversion +single-precision, exact, and branchless. The target it matters on is +Hexagon, the one genuinely FP64-less machine in the fleet (the C3 write-up +records the correction: the M55's *scalar* FPU turned out to support +doubles after all — only its vector unit doesn't). C3's gating run showed +what removing per-sample soft-double math is worth on Hexagon: −15.5% +instructions on the Q31 pipeline, −10.3% on Q15. And because 2⁻⁶⁴ phase +resolution beats the old double path's 2⁻⁵², quality *improved* while the +code got faster: 135.0 dB at 997 Hz. + +## The concept: making the contract legible + +Everything above defines the customization point; the last twenty lines of +the file *enforce* it: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_concept}} +``` + +The datapath templates constrain themselves with it — +`template <SampleType T> class BasicAsyncSampleRateConverter` — and the +payoff is the shape of the failure.
Instantiate the converter with +`double` (no specialization exists) and, without the concept, the error +would surface wherever the template machinery first touched the undefined +traits — some line deep inside `interpolate()`, wearing five frames of +instantiation context. With the concept, the compiler rejects +`BasicAsyncSampleRateConverter<double>` *at the declaration you wrote*, +and its diagnostic walks the `requires`-expression clause by clause: which +operation is missing, what signature it expected. The concept turns "a +missing operation somewhere" into a checklist. Write a partial +`SampleTraits` specialization — say, everything but `blendFactorFromQ64` — and +the error names exactly that member. + +Note the return-type constraints (`-> std::same_as<...>`) are doing real +work: a `finalize` that returned `int` instead of `int16_t` would satisfy +a naive "does it compile" check and then quietly change overload and +conversion behavior downstream. The concept pins the whole signature. + +The three `static_assert`s at the bottom are the file testing itself: every +translation unit that includes the header re-verifies that the three +shipped specializations satisfy the concept they claim to. If a future +edit breaks one — renames a member, fumbles a return type — the diagnostic +arrives at header-parse time, before any user code, naming the assert. +Cost: zero, everywhere except the compiler's own microseconds.
+ +## Why these ~220 lines look the way they do + +| Decision | Alternative rejected | Reason | +|---|---|---| +| Traits struct of `static` functions | virtual `SampleOps` interface | 4.6M `mac`/s in the hot loop; virtual calls block inlining and every Part III optimization behind an opaque boundary | +| External traits | CRTP / member functions | sample types are `int16_t`/`float` — built-ins can't inherit and aren't ours to modify | +| Undefined primary template | primary with defaults | no honest default for foreign arithmetic; silence would be wrongness | +| Q1.14 / Q1.30 coefficients | Q0.15 / Q0.31 | the ~1.0 peak tap must fit; one headroom bit costs a precision bit the output format couldn't carry anyway | +| `int64_t` accumulator, no intermediate rounding | `int32_t` accumulator | one worst-case Q15 product nearly fills `int32_t`; exactness makes every kernel rewrite bit-verifiable | +| Q31 products pre-shifted to Q45 | full 62-bit products | 48 taps of 2⁶¹ ≈ 2⁶⁶·⁶ overflows `int64_t` ~12×; truncation cost < 1/200 output LSB, measured invisible | +| Round-half-up in `finalize` | round-half-even | the bias is sub-sub-LSB; half-even costs real per-sample work to fix an unmeasurable error | +| `int64_t` blend product | `int32_t` (it *almost* fits) | 0.005% worst-case margin — recomputed by audit from a comment that claimed 5% | +| `SampleType` concept + self-`static_assert`s | let instantiation errors happen | failures surface at the declaration, itemized per missing operation | + +## Verify it yourself + +```sh +# The whole fixed-point suite: scale factors, saturation, DC gain, +# measured SNRs (watch for the "[ measured ]" lines), full-scale non-wrap: +ctest --test-dir build -R FixedPoint --output-on-failure + +# The measured numbers this chapter quoted: +# [ measured ] 997 Hz, 16-bit fixed: SNR ~77 dB +# [ measured ] 997 Hz, 32-bit fixed: SNR ~133 dB +# [ measured ] 19500 Hz, 32-bit fixed: SNR ~105 dB + +# Recompute the blend margin the audit checked (don't 
trust this book either): +python3 -c "print(32767*65535, 2**31-1, 1 - 32767*65535/(2**31-1))" + +# Break it on purpose, three ways: +# 1. In makeCoeff (Q15), change 16384.0 to 32768.0 — the peak tap saturates +# and DcGainIsUnityQ15 fails its ±4 tolerance. +# 2. In finalize (Q15), delete clampSat and cast directly — the full-scale +# sine test detects wraparound as a blown second difference. +# 3. Instantiate srt::BasicAsyncSampleRateConverter anywhere and +# read the concept diagnostic: every missing operation, by name, at the +# line you wrote. +``` + +The third experiment is the C++ half of this chapter in one error message; +the first two are the arithmetic half in two failing assertions. diff --git a/book/src/part4/c-abi.md b/book/src/part4/c-abi.md index f1eac17..3eadf9f 100644 --- a/book/src/part4/c-abi.md +++ b/book/src/part4/c-abi.md @@ -1,3 +1,321 @@ -# c abi +# The C ABI -*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* +This chapter exists because of a plot. Part II's notebooks are the +library's most persuasive evidence — the servo locking from a cold start, +the 135 dB money plot, the naive-FIFO spectrogram full of clicks — and +every curve in them comes from the *actual shipping library*, not a Python +reimplementation of it. A reimplementation would prove nothing: the entire +point of measurement-first development is that you measure the artifact you +ship. So Python has to call the C++ converter. + +And Python cannot. Neither can Julia, nor anything else that loads shared +libraries at run time, because C++ deliberately has no stable binary +interface. Symbol names are mangled, and the mangling differs by compiler. +Exceptions, RTTI, and the layout of `std::` types differ by compiler *and* +by standard-library vendor. A template — and this whole library is +templates — doesn't exist in a binary at all until something instantiates +it. 
The one interface every FFI on earth speaks (`ctypes`, `cffi`, Julia's +`ccall`, Rust's `extern "C"`, every scripting language's dlopen wrapper) is +the C ABI: plain functions, plain data, names that mean what they say. + +So the library ships a shim: `tools/capi/`, about ninety lines of C++ +presenting a C face, built as a shared library with `-DSRT_BUILD_CAPI=ON`. +This chapter is small because the shim is small, but three of its design +decisions were paid for the hard way — one by a compile error, one by an +audit finding, and one by a toolchain that turned out to be unable to +throw an exception at all. + +## The surface: eight functions + +The entire foreign-function interface: + +```c +{{#include ../../../tools/capi/srt_capi.h:abi_surface}} +``` + +Create, destroy, push, pull, status, latency, reset, version. The shim +wraps the *float* converter only — the notebooks are metrology instruments +and float is what they measure with; tripling the surface for Q15/Q31 +would triple the contract for consumers that don't exist yet. Minimalism +here is a feature: every function in an ABI is a promise you keep forever. + +`SrtHandle` is the classic opaque-handle pattern: a `typedef` of a struct +that is *declared* and never *defined*. C callers can hold a +`SrtHandle*`, pass it around, and store it — but never dereference it, +size it, or copy what it points to, because the compiler has no idea what +it is. Compared to the lazier convention of handing out `void*`, the named +opaque type keeps some type checking alive at the boundary: pass a +`FILE*` where an `SrtHandle*` belongs and a C compiler will at least warn. +The pointer's true identity lives on the other side of the wall. 
+ +## Two `extern "C"` blocks and the lesson between them + +Here is the other side of the wall, and the file structure is itself a +fossil of a compile error: + +```cpp +{{#include ../../../tools/capi/srt_capi.cpp:abi_impl}} +``` + +The handle is simply the converter pointer in disguise — +`reinterpret_cast` in `srt_create`, `reinterpret_cast` back on every call. +No wrapper struct, no registry of live handles, no indirection table: +there is nothing to store beyond the object itself, so the handle *is* the +object. + +Look at where the `impl()` helpers live: in an anonymous namespace, +*between* two `extern "C"` regions rather than inside one. That placement +is load-bearing. There are two `impl()` functions — one taking +`SrtHandle*`, one taking `const SrtHandle*` — which is to say, `impl` is +**overloaded**. And overloading is illegal for functions with C linkage: +C has no name mangling, both overloads would demand the same symbol name, +and the program is ill-formed. Write the helpers in the obvious place — +inside the `extern "C"` block where everything else in the file lives — +and the compiler stops you cold. That is exactly how it was discovered +here. The fix is what you see: the helpers sit outside the C-linkage +region, in an anonymous namespace that both gives them ordinary C++ +linkage (overloading welcome) and keeps them out of the shared library's +exported symbol table, where an FFI user enumerating symbols would only be +confused by them. The general rule: `extern "C"` is for the eight names +you are promising to the world, and *nothing else* belongs inside it. + +## The error convention, and why every function tolerates `NULL` + +The shim's entire error vocabulary is one value: + +```cpp +{{#include ../../../tools/capi/srt_capi.cpp:abi_create}} +``` + +`srt_create` returns `NULL` on invalid configuration or allocation +failure. 
No error codes, no `errno`, no last-error string: for a +constructor with a handful of scalar parameters, "it didn't work, and the +header tells you the two reasons it can't" is a complete diagnostic, and +every additional error channel is more contract to keep frozen forever. + +The subtle decision is downstream of that one. The first version of this +shim checked nothing: `srt_push` cast the handle and called through it, +unconditionally. The hardening audit changed every entry point to this +shape: + +```cpp +{{#include ../../../tools/capi/srt_capi.cpp:abi_null}} +``` + +The reasoning is stated in the file's own header comment, and it is worth +reading as a small essay on API design: + +```cpp +{{#include ../../../tools/capi/srt_capi.cpp:abi_doc}} +``` + +A "check create for NULL" convention *concentrates* failure on precisely +the caller who forgot the check — the one writing quick notebook code, the +one least prepared for a segfault in a foreign runtime where the crash +arrives with no C++ stack and no Python traceback, just a dead kernel. +With the guards, an unchecked failed create degrades to a converter that +accepts nothing and produces zeros: `srt_pull` returns silence, which is — +not coincidentally — the same thing the real converter produces on +underrun. The failure is still visible (`srt_status` reports zeros, the +audio is silent), but it is *debuggable* instead of fatal. Eight null +checks on functions that move hundreds of frames per call cost nothing +measurable; they buy an FFI that fails the way dynamic-language users can +diagnose. + +## The header is the contract + +`srt_capi.h` did not exist in the shim's first version — the notebook +simply re-declared the prototypes in `ctypes`, which worked and proved +nothing for anyone else. 
The audit shipped the header, and its top comment +is the ABI's real substance — the part no binary interface can encode: + +```c +{{#include ../../../tools/capi/srt_capi.h:abi_contract}} +``` + +Three promises deserve emphasis, because each answers a real foreign-caller +failure mode. + +**Thread affinity is spelled out per function.** The C++ API's +single-producer/single-consumer contract (the ring chapter) does not +dissolve because the caller is Python or Julia — but an FFI user cannot see +`std::memory_order` annotations, so the header must say it in words: one +thread pushes, one thread pulls, `srt_status` from anywhere, +`srt_reset_from_consumer` only from the consumer, create/destroy never +concurrent with anything. An ABI that documents signatures but not thread +affinity has documented the easy half. + +**`size_t` follows the platform ABI.** On every 64-bit host this is +invisible; on a 32-bit target (and this library ships to several) `size_t` +is 32 bits, and a foreign declaration hard-coding `uint64` for `frames` +corrupts the argument list. `ctypes.c_size_t` tracks the platform +automatically — the notebook uses it — but `cffi` and Julia users write +their own declarations, so the header says it explicitly. This is the kind +of sentence you only think to write after watching Part IV's 32-bit ports +in action. + +**`srt_version()` is a probe.** It returns +`major*10000 + minor*100 + patch` — `100` for today's 0.1.0. A version +*macro* would vanish into the caller's compile; a version *function* +reports what the loaded shared library actually is, which is the question +an FFI user is really asking when their symbols don't match their +expectations. It is also the cheapest possible smoke test that the DSO +loaded and calls marshal correctly — one integer, no state, no handle. 
+ +## Six doubles and two return values: marshaling without a struct + +Two smaller conventions in the surface reward a moment each, because both +are shaped by what FFIs do badly. + +`srt_status` reports six quantities — state, ppm estimate, FIFO fill, +underruns, overruns, resyncs — and the obvious C design is a struct. +The shim instead fills a caller-provided `double out[6]`. A struct +returned across an FFI boundary is a *layout* contract: the foreign side +must re-declare every field, in order, with matching types, padding, and +alignment, and nothing checks the re-declaration — get it wrong and the +fields silently shear. An array of one scalar type is the +lowest-common-denominator marshaling that every FFI on earth handles in +one line (`(ctypes.c_double * 6)()` in the notebook). The price is that +counters and an enum ride in doubles — harmless, since a double carries +integers exactly to 2⁵³ and the header documents each slot by index. One +type, one array, zero layout risk: for six values polled a few times per +second, the trade is not close. + +The push/pull return values encode the real-time contract from the ring +chapter, translated for callers who never read it. `srt_push` returns the +frames *accepted*, which may be fewer than offered — the clipped write +when the FIFO is full. `srt_pull` is deliberately asymmetric: it **always +fills** the requested frames, substituting silence while the converter is +still filling or after an underrun, and its return value reports how many +frames came from real input. An audio callback must hand *something* to +the DAC in bounded time; an API that could return "no data, try again" +would push retry logic — and the opportunity to get it wrong — into every +consumer. Silence-on-shortfall keeps the failure mode the library already +promised (a dropout sounds like silence, then a fade back in), and the +return value keeps it observable. 
FFI code that ignores both return values +still plays audio; FFI code that reads them gets telemetry. Both are valid +clients, and neither can deadlock or glitch the other side. + +## Exceptions must not cross — and one target where they cannot even fly + +Look again at `srt_create`'s body: the `new` is wrapped in +`try { ... } catch (...) { return nullptr; }`. This is not defensive +decoration. A C++ exception that propagates out of an `extern "C"` +function into a C caller is undefined behavior — there is no agreement +about what unwinding even *means* across that boundary, and the practical +result ranges from `std::terminate` to stack corruption inside a foreign +interpreter. The converter's constructor is the one place this library +throws (`Config` validation and allocation); the shim's job is to convert +that exception into the ABI's error vocabulary — `NULL` — before it +reaches the boundary. `catch (...)` rather than `catch (const +std::exception&)` because the boundary does not care *what* was thrown; +everything becomes `NULL`. + +Now the hard lesson, recorded in `docs/PERFORMANCE.md` under *Known debt*. +One of this library's supported toolchains — the Hexagon static-musl +configuration from the Part IV DSP chapter — **cannot unwind at all**. Its +runtime lacks the unwinder: when a constructor throws, the exception does +not propagate to *any* catch block, anywhere; the process terminates via +`libc++abi`. This was not discovered by reading toolchain documentation. +It was discovered the day the first `EXPECT_THROW` test reached that CI +leg and the test *runner* died — the `ConfigValidation` suite is excluded +on Hexagon to this day, and the candidate fix +(`-unwindlib=libunwind` in the toolchain file) sits unclaimed in the debt +list. + +Think through what that does to this shim's design. 
The `catch (...)` in +`srt_create` is *necessary* — on normal targets it is the entire error +mechanism — but on a no-unwind target it is **unreachable**: the throw +terminates the process before the catch can run. A caller on such a target +cannot be saved by any code positioned *after* the throw. The only +placement that works is *before* it: **validate, then construct.** The +deployment guidance in the debt entry says exactly this: on that +toolchain, treat an invalid `Config` as fatal and validate inputs *before* +constructing — check them against the constraints the constructor +enforces (positive finite sample rate, nonzero channels, band edges that +sum under the rate, and the rest of `validated()`'s list) so the +constructor is never asked to throw. It is a weaker mechanism than a +`catch`, and that is the point: it is the strongest mechanism the target +actually has. + +The generalizable ABI lesson: an FFI boundary that reports failure by +*catching* is betting that every target can unwind, and that bet is not +safe even within one library's own CI matrix. Error strategies that +*return* — validate-before-construct, factory functions, status codes — +degrade gracefully on runtimes where error strategies that *throw* simply +end the process. + +## The client: forty lines of ctypes + +The notebook's first code cell is the reference consumer, and it exercises +every clause above: it locates the DSO (building it on first run), declares +each prototype, and wraps the handle in a small numpy-aware class. Two +lines carry the load: + +```python +_lib.srt_create.restype = ctypes.c_void_p +_lib.srt_push.argtypes = [ctypes.c_void_p, _FLOATP, ctypes.c_size_t] +``` + +Without the explicit `restype`, `ctypes` assumes functions return a C +`int` — on a 64-bit machine the handle comes back truncated to its low 32 +bits, and the crash lands on some *later* call, far from the actual +mistake. 
Declaring the full prototypes is the ctypes equivalent of +including the header, and `c_size_t` is the notebook honoring the width +caveat. The wrapper's `__del__` calls `srt_destroy` (guarded, per the +convention, against a handle that never existed), and its constructor +asserts `srt_create` succeeded — the check the null-tolerance exists to +forgive, present anyway, because tolerance is for accidents, not policy. +Everything downstream — the lock-acquisition plot, the ≥125 dB +transparency assertion, the impulse-response latency check that agrees +with `srt_designed_latency_seconds()` to within 0.3 ms — runs through +these eight functions. + +## Why these ~90 lines look the way they do + +| Decision | Alternative rejected | Reason | +|---|---|---| +| C shim over the C++ API | Python bindings / pybind11 | one C ABI serves ctypes, cffi, Julia, and everything else; bindings serve one language and drag in a build dependency | +| Float converter only | mirror all three sample types | the consumers are metrology notebooks; unused surface is unpaid-for contract | +| Named opaque handle | `void*` | keeps compiler type-checking alive at the FFI edge | +| Handle = object pointer, `reinterpret_cast` | handle registry / wrapper struct | there is nothing else to store; indirection would add state and failure modes | +| `impl()` overloads outside `extern "C"` | helpers inside the block | overloading is ill-formed with C linkage — the compiler enforced this one personally | +| `NULL` return + null-tolerant entry points | "caller must check" | the convention otherwise concentrates crashes on exactly the caller who forgot, in a runtime with no useful stack trace | +| `catch (...)` → `NULL` in `srt_create` | let exceptions cross | UB across the C boundary; and see below | +| Validate-before-construct guidance | rely on the `catch` | one supported toolchain cannot unwind at all — a throw terminates before any catch runs | +| `srt_version()` function | version macro | reports the 
loaded binary, not the caller's compile-time assumption | +| Thread affinity + `size_t` width in the header | "see the C++ docs" | the header is the only artifact an FFI consumer reads | + +## Verify it yourself + +```sh +# Build the shared library: +cmake -B build -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_CAPI=ON +cmake --build build --target srt_capi -j + +# The exported surface — eight srt_* symbols, unmangled, and nothing else +# from this file (the impl() helpers are invisible, as promised): +nm -D --defined-only build/tools/capi/libsrt_capi.so | grep srt_ + +# The one-integer smoke test (0.1.0 -> 100): +python3 -c "import ctypes; \ + print(ctypes.CDLL('build/tools/capi/libsrt_capi.so').srt_version())" + +# The null-tolerance convention, exercised directly — no crash, zero frames: +python3 -c "import ctypes; lib = ctypes.CDLL('build/tools/capi/libsrt_capi.so'); \ + lib.srt_create.restype = ctypes.c_void_p; \ + print('bad create:', lib.srt_create(ctypes.c_double(-1.0), 0, 0, 1)); \ + print('push on NULL:', lib.srt_push(None, None, 128))" + +# The full reference client, plots and assertions included: +jupyter nbconvert --to notebook --execute notebooks/asrc_demo.ipynb \ + --output /tmp/asrc_demo_run.ipynb + +# Break it on purpose: move the two impl() overloads inside the extern "C" +# block and rebuild — the compiler rejects the overload set, which is the +# whole story of this file's structure in one diagnostic. +``` + +The second Python one-liner is the chapter's argument compressed: an +invalid configuration and a forgotten check, and the program prints `None` +(the null handle) and `0` instead of dying. diff --git a/include/srt/sample_traits.hpp b/include/srt/sample_traits.hpp index b3eb832..5fd4647 100644 --- a/include/srt/sample_traits.hpp +++ b/include/srt/sample_traits.hpp @@ -1,3 +1,4 @@ +// ANCHOR: st_overview /// \file sample_traits.hpp /// \brief Sample-type customization point for the resampling datapath.
/// @@ -14,6 +15,7 @@ /// The clock servo and the filter design always run in double regardless of /// sample type (control path and one-time init, not the audio path), so the /// fixed-point datapaths contain no floating-point inner loops. +// ANCHOR_END: st_overview #ifndef SRT_SAMPLE_TRAITS_HPP #define SRT_SAMPLE_TRAITS_HPP @@ -26,6 +28,7 @@ namespace srt { namespace detail { +// ANCHOR: st_roundsat /// Round-and-saturate a double to a signed integer coefficient/sample type. template constexpr I roundSat(double v) noexcept { @@ -38,6 +41,7 @@ constexpr I roundSat(double v) noexcept { return std::numeric_limits::max(); return static_cast(r); } +// ANCHOR_END: st_roundsat /// Saturate a 64-bit accumulator result to a narrower signed integer. template @@ -49,10 +53,13 @@ constexpr I clampSat(std::int64_t v) noexcept { } // namespace detail +// ANCHOR: st_primary /// Primary template intentionally undefined; specialize per sample type. template struct SampleTraits; +// ANCHOR_END: st_primary +// ANCHOR: st_float /// Float datapath: float samples and coefficients, double accumulation. /// The double accumulator keeps the dot-product noise floor far below the /// 120 dB transparency target; float coefficient storage quantizes the @@ -69,6 +76,7 @@ struct SampleTraits { /// Convert the intra-phase fraction (in [0,1)) once per output sample. static BlendFactor makeBlendFactor(double fr) noexcept { return static_cast(fr); } + // ANCHOR: st_blend_q64_float /// Blend factor from the top bits of a Q0.64 intra-phase fraction. /// Single-precision only: the value is reduced to 24 bits first so the /// uint->float conversion is exact and no double op is needed @@ -76,6 +84,7 @@ struct SampleTraits { static BlendFactor blendFactorFromQ64(std::uint64_t frac) noexcept { return static_cast(frac >> 40) * 0x1p-24f; } + // ANCHOR_END: st_blend_q64_float /// acc + x * c, in the accumulator domain. 
static Accum mac(Accum acc, float x, Coeff c) noexcept { @@ -91,7 +100,9 @@ struct SampleTraits { /// The zero/silence sample value. static float silence() noexcept { return 0.0f; } }; +// ANCHOR_END: st_float +// ANCHOR: st_q15_header /// Q15 fixed-point datapath (samples are int16_t in Q0.15). /// /// Coefficients are stored in Q1.14: the prototype's peak tap reaches ~1.0 @@ -107,26 +118,34 @@ struct SampleTraits { using Coeff = std::int16_t; using Accum = std::int64_t; using BlendFactor = std::int32_t; ///< fraction in Q15 + // ANCHOR_END: st_q15_header + // ANCHOR: st_q15_coeff static Coeff makeCoeff(double c) noexcept { return detail::roundSat(c * 16384.0); // Q1.14 } + // ANCHOR_END: st_q15_coeff static BlendFactor makeBlendFactor(double fr) noexcept { return static_cast(fr * 32768.0); // Q15 } + // ANCHOR: st_q15_q64 /// Q15 blend factor straight from a Q0.64 fraction's top bits: no /// floating point at all on the fixed-point per-sample path. static BlendFactor blendFactorFromQ64(std::uint64_t frac) noexcept { return static_cast(frac >> 49); // Q15 } + // ANCHOR_END: st_q15_q64 + // ANCHOR: st_q15_mac static Accum mac(Accum acc, std::int16_t x, Coeff c) noexcept { return acc + static_cast(static_cast(x) * static_cast(c)); } + // ANCHOR_END: st_q15_mac + // ANCHOR: st_q15_blend static Coeff blend(Coeff a, Coeff b, BlendFactor fr) noexcept { // Q14 + (Q15 * Q14) >> 15, in int64: the worst-case int32 product // 32767 * 65535 = 2,147,385,345 sits 0.005% under INT32_MAX — @@ -136,16 +155,20 @@ struct SampleTraits { const std::int64_t diff = static_cast(b) - a; return static_cast(a + ((fr * diff) >> 15)); } + // ANCHOR_END: st_q15_blend + // ANCHOR: st_q15_finalize static std::int16_t finalize(Accum acc) noexcept { // Round-half-up, not half-even: the bias is a fraction of one // sub-LSB rounding step, far below the Q15 noise floor. 
return detail::clampSat((acc + (1 << 13)) >> 14); // Q29 -> Q15 } + // ANCHOR_END: st_q15_finalize static std::int16_t silence() noexcept { return 0; } }; +// ANCHOR: st_q31 /// Q31 fixed-point datapath (samples are int32_t in Q0.31). /// /// Coefficients are stored in Q1.30 (one headroom bit for the ~1.0 peak @@ -174,9 +197,11 @@ struct SampleTraits { return static_cast(frac >> 44); // Q20 } + // ANCHOR: st_q31_mac static Accum mac(Accum acc, std::int32_t x, Coeff c) noexcept { return acc + ((static_cast(x) * c) >> 16); // Q61 -> Q45 } + // ANCHOR_END: st_q31_mac static Coeff blend(Coeff a, Coeff b, BlendFactor fr) noexcept { const std::int64_t diff = static_cast(b) - a; @@ -189,7 +214,9 @@ struct SampleTraits { static std::int32_t silence() noexcept { return 0; } }; +// ANCHOR_END: st_q31 +// ANCHOR: st_concept /// Satisfied by any type with a complete, well-formed SampleTraits /// specialization. template @@ -212,6 +239,7 @@ concept SampleType = static_assert(SampleType); static_assert(SampleType); static_assert(SampleType); +// ANCHOR_END: st_concept } // namespace srt diff --git a/tools/capi/srt_capi.cpp b/tools/capi/srt_capi.cpp index 0858bb9..04b2207 100644 --- a/tools/capi/srt_capi.cpp +++ b/tools/capi/srt_capi.cpp @@ -1,3 +1,4 @@ +// ANCHOR: abi_doc /// \file srt_capi.cpp /// \brief C ABI shim over the float converter, for FFI consumers (ctypes, /// cffi, Julia, ...). Build with SRT_BUILD_CAPI=ON; srt_capi.h is the @@ -9,12 +10,14 @@ /// zero return values, and every entry point tolerates a null handle — the /// documented error convention ("check srt_create for NULL") otherwise /// invites a crash on exactly the path where the caller forgot to check. 
+// ANCHOR_END: abi_doc #include #include #include #include "srt/srt.hpp" +// ANCHOR: abi_impl extern "C" { struct SrtHandle; // opaque } @@ -27,6 +30,7 @@ const srt::AsyncSampleRateConverter* impl(const SrtHandle* h) noexcept { return reinterpret_cast(h); } } // namespace +// ANCHOR_END: abi_impl extern "C" { @@ -34,6 +38,7 @@ unsigned srt_version(void) noexcept { return SRT_VERSION_MAJOR * 10000u + SRT_VERSION_MINOR * 100u + SRT_VERSION_PATCH; } +// ANCHOR: abi_create /// preset: 0 = fast, 1 = balanced, 2 = transparent. SrtHandle* srt_create(double sampleRateHz, std::size_t channels, std::size_t targetLatencyFrames, int preset) noexcept { @@ -51,11 +56,13 @@ SrtHandle* srt_create(double sampleRateHz, std::size_t channels, std::size_t tar return nullptr; } } +// ANCHOR_END: abi_create void srt_destroy(SrtHandle* h) noexcept { delete impl(h); } +// ANCHOR: abi_null std::size_t srt_push(SrtHandle* h, const float* interleaved, std::size_t frames) noexcept { return h ? impl(h)->push(interleaved, frames) : 0; } @@ -63,6 +70,7 @@ std::size_t srt_push(SrtHandle* h, const float* interleaved, std::size_t frames) std::size_t srt_pull(SrtHandle* h, float* interleaved, std::size_t frames) noexcept { return h ? impl(h)->pull(interleaved, frames) : 0; } +// ANCHOR_END: abi_null /// out[0]=state (0 Filling, 1 Acquiring, 2 Locked), out[1]=ppm, /// out[2]=fifoFillFrames, out[3]=underruns, out[4]=overruns, out[5]=resyncs. diff --git a/tools/capi/srt_capi.h b/tools/capi/srt_capi.h index b8b3195..0f6ad4f 100644 --- a/tools/capi/srt_capi.h +++ b/tools/capi/srt_capi.h @@ -1,3 +1,4 @@ +/* ANCHOR: abi_contract */ /* SampleRateTap C ABI — FFI surface over the float converter. * * Build the shared library with -DSRT_BUILD_CAPI=ON. This header is the @@ -17,6 +18,7 @@ * size_t in these signatures follows the platform ABI (32-bit on 32-bit * targets) — declare foreign types accordingly. 
*/ +/* ANCHOR_END: abi_contract */ #ifndef SRT_CAPI_H #define SRT_CAPI_H @@ -26,6 +28,7 @@ extern "C" { #endif +/* ANCHOR: abi_surface */ typedef struct SrtHandle SrtHandle; /* ABI/version probe: returns SRT_VERSION_MAJOR*10000 + @@ -55,6 +58,7 @@ double srt_designed_latency_seconds(const SrtHandle* h); /* Consumer thread: discard all buffered input, forget the ppm estimate, * return to Filling. */ void srt_reset_from_consumer(SrtHandle* h); +/* ANCHOR_END: abi_surface */ #ifdef __cplusplus } From 343778a6598055c77c7c6732733e52986bb5b1f8 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 1 Jul 2026 21:38:04 +0000 Subject: [PATCH 09/16] book: hardware, scaling, glossary, and bibliography Part V chapters (real-clock bridges and firmware; the channels/rates/ blocks scaling rules with their measured evidence) plus the glossary (~55 entries) and the annotated bibliography. No new anchors; snippets are short hand-copies per the style contract. https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9 --- book/src/appendix/bibliography.md | 104 +++++++- book/src/appendix/glossary.md | 266 ++++++++++++++++++- book/src/part5/hardware.md | 428 +++++++++++++++++++++++++++++- book/src/part5/scaling.md | 312 +++++++++++++++++++++- 4 files changed, 1102 insertions(+), 8 deletions(-) diff --git a/book/src/appendix/bibliography.md b/book/src/appendix/bibliography.md index 4372f09..62fb056 100644 --- a/book/src/appendix/bibliography.md +++ b/book/src/appendix/bibliography.md @@ -1,3 +1,103 @@ -# bibliography +# Appendix C: Annotated bibliography -*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* +This project's provenance statement is short: all code implements +long-published methods, and no third-party source was copied. This +appendix lists those methods' sources — plus the tools and competitors the +measurements depend on — with a note on what the project *actually took* +from each. 
It deliberately cites nothing the codebase does not genuinely +draw on. + +## Signal processing + +**J. F. Kaiser, "Nonrecursive digital filter design using the I₀-sinh +window function," *Proc. IEEE Int. Symp. Circuits and Systems*, 1974.** +The origin of the Kaiser window and of the two empirical fits the library +evaluates verbatim in `include/srt/detail/kaiser.hpp`: stopband +attenuation → window shape parameter β, and the attenuation/transition- +width → filter-length estimate. The project took the closed forms exactly +as published — the value of the Kaiser window here is precisely that its +design procedure is a page of code with known error bounds, needing no +iterative optimization at construction time. + +**f. harris, *Multirate Signal Processing for Communication Systems*, +Prentice Hall, 2004.** The standard reference for polyphase decomposition +— factoring one long prototype filter into L short branches indexed by +fractional delay — which is the structure of the library's coefficient +table. The tap-length estimate in `estimateTaps()` is the Kaiser/harris +formula in the form `N = (A − 8) / (2.285 · Δω)`, applied per polyphase +branch; the codebase credits both names, as the literature does. + +**J. O. Smith, "Digital Audio Resampling Home Page" (and the *Bandlimited +Interpolation* material), CCRMA, Stanford University.** The theory the +datapath implements: resampling as evaluation of a windowed-sinc +interpolation kernel at fractional positions, with a finite table of +kernel phases and *linear interpolation between adjacent table entries*. +Smith's analysis of that last step is where the library's most-quoted +scaling law comes from — interpolation residue falling ~12 dB per doubling +of the table size L and rising with signal frequency — which Part 0 turns +into the budget arithmetic connecting L to decibels. + +**Analog Devices, AD1896 datasheet ("192 kHz Stereo Asynchronous Sample +Rate Converter").** The architectural ancestor. 
The README describes the +library as "the classic commercial-ASRC architecture (AD1896-style +polyphase FIR + clock servo), specialized for the near-unity regime," and +the datasheet documents that architecture: a polyphase interpolation +filter addressed by a recovered rate ratio, with a FIFO between the clock +domains. It also supplies the hardware row in the comparison table — +quoted as datasheet values, with the caveats about measurement environment +stated in `docs/COMPARISON.md`. + +**AES17, *AES standard method for digital audio engineering — Measurement +of digital audio equipment* (Audio Engineering Society).** The measurement +definition behind the headline quality numbers: remove the fundamental, +integrate the residual across the audio band for THD+N, measure dynamic +range at −60 dBFS with A-weighting. The comparison notebook implements an +AES17-style procedure (exact fit plus ±20 Hz notch, 20 Hz–20 kHz +integration) and calibrates it against synthetic signals before use — the +standard is what makes the −132 dB figure commensurable with silicon +datasheets rather than a house metric. + +## The measured competitors + +**libsamplerate (Secret Rabbit Code), E. de Castro Lopo — +documentation at libsndfile.github.io/libsamplerate.** The closest +architectural analog (streaming time-domain polyphase resampler) and one +of the two software subjects measured under identical conditions in +`docs/COMPARISON.md` and the comparison notebook. Its documentation also +supplied the honesty check the comparison repeats: the published "97 dB +worst case" figure applies to aggressive ratios, so near-unity results at +the format ceiling are its *easy* regime, not a contradiction. + +**soxr (the SoX Resampler library) — github.com/chirlu/soxr.** The second +measured competitor, and the source of its own latency figure via +`soxr_delay()`. 
What the project took from soxr is mostly a boundary +lesson made quantitative: soxr wins raw host throughput decisively and +carries ~12–16 ms of latency doing it, which is the measured statement of +why a 1–2 ms live-monitoring budget needs a different design. + +## C++ + +**Anthony Williams, *C++ Concurrency in Action*, 2nd ed., Manning, 2019.** +The working reference for the C++ memory model as this book teaches it: +acquire/release pairing as the establishment of happens-before, the +legitimacy of relaxed loads of data a thread itself owns, and lock-free +queue design generally. The ring chapter's proof style — argue the two +release/acquire pairs, then treat everything else as sequential code — +is Williams's method applied to a hundred-line class. + +**cppreference.com — in particular `std::memory_order`, +`std::atomic<T>::is_always_lock_free`, `std::bit_ceil`, and +`std::hardware_destructive_interference_size`.** The day-to-day authority +for the exact semantics the headers rely on: the ordering guarantees the +ring asserts, the compile-time lock-freedom predicate the audit added, +the power-of-two rounding used by the ring and the polyphase table, and — +for the interference-size constant — the documented ABI fragility that +justified *rejecting* the standard facility in favor of a literal `64`. + +## Tooling + +**mdBook — rust-lang.github.io/mdBook.** The tool this book is built +with. Its `\{{#include path:anchor}}` mechanism is what makes the book's +central honesty commitment mechanical rather than aspirational: code +excerpts are pulled from the real headers at build time, so prose that +drifts from the code breaks the build in CI instead of quietly lying. diff --git a/book/src/appendix/glossary.md b/book/src/appendix/glossary.md index e0b2772..953ba34 100644 --- a/book/src/appendix/glossary.md +++ b/book/src/appendix/glossary.md @@ -1,3 +1,265 @@ -# glossary +# Appendix B: Glossary -*Chapter not yet written — see the outline in the book plan.
Each chapter lands as its own PR.* +Terms of art as this book uses them. Where the general meaning and this +project's usage differ, the entry gives the project's. + +**Acquire/release** — the pair of C++ memory orderings that establishes +*happens-before* across threads: everything written before a +release-store is visible after the acquire-load that observes it. The +only synchronization in the library's ring buffer, used once per +direction; the same pair carries the converter across the RP2350's two +cores in the dual-core firmware. + +**AES17** — the Audio Engineering Society's standard for measuring +digital audio equipment, defining how THD+N and dynamic range are taken +(notch the fundamental, integrate the residual over the audio band, +A-weight for DR). The comparison notebook implements an AES17-style +measurement so the library's numbers are commensurable with hardware +datasheets. + +**Anti-image filter** — the lowpass that removes the spectral copies +(images) created by interpolating between sample instants. In this +library it is the Kaiser-windowed sinc prototype: pass the audio band +flat, suppress everything from the first image down by the stopband +attenuation. + +**ASRC (asynchronous sample rate converter)** — a converter between two +sample streams whose clocks are *independent*: the ratio is not known in +advance, is never exactly rational, and drifts, so it must be recovered +continuously by a servo. Distinct from a resampler library, which must be +handed the ratio from outside. + +**Beat frequency** — the rate at which a slow periodic alignment +recurs; here, the rate at which whole-sample slips (and hence occupancy +sawteeth) arrive: `ppm × fs` for sample-granular transfer, divided by +the block size for block transfer. + +**Blend factor** — the fractional weight μ used to linearly interpolate +between the two polyphase coefficient rows adjacent to the current +fractional position. 
Computed once per output frame and shared across +all channels, which is why N channels cost `blend + N × dot`. + +**Block-beat sawtooth** — the deterministic waveform that block-quantized +transfer imprints on the FIFO occupancy observable: one push/pull block +peak-to-peak, at the beat frequency. It is measurement quantization, not +clock movement; the servo's stage gating and the unlock threshold both +exist to keep it out of the rate estimate. + +**Cache line** — the unit (64 bytes on the targets here) in which cores +move memory between their caches. Data structures shared between two +real-time threads are laid out in whole cache lines per owner. + +**Cache-line ping-pong** — the performance failure where a line written +by one core and read by another migrates back and forth on every access, +costing hundreds of cycles each trip. The ring buffer's cached-index +design exists so the steady-state fast path touches no foreign line at +all. + +**Cent** — one hundredth of a semitone, about 0.06% in frequency; the +unit in which the block-size study reports the low-rate FM that coarse +blocks impose (~0.9 cents rms at 32-frame blocks). + +**dBc** — decibels relative to the carrier: the level of a sideband or +spur measured against the signal that carries it, used for the servo's +sawtooth-rejection figures. + +**dBFS** — decibels relative to digital full scale; −1 dBFS is the AES17 +measurement level, 0.5 FS (−6 dBFS) the quality suite's. + +**DWT / CYCCNT** — the Data Watchpoint and Trace unit of Arm M-profile +cores and its free-running 32-bit cycle counter. Optional silicon (hence +the `NOCYCCNT` runtime check), per-core on the RP2350, and the +instrument that converts QEMU instruction baselines into real cycle +budgets. + +**False sharing** — two logically unrelated variables sharing one cache +line, so writes to either invalidate both readers. 
Prevented in the ring +by giving producer state, consumer state, and shared read-only state a +64-byte-aligned line each. + +**FIFO** — first-in-first-out buffer. In this library the SPSC ring +between the clock domains; its occupancy doubles as the servo's phase +detector, which is why it exposes exact occupancy rather than an +approximation. + +**Fractional delay** — a delay of a non-integer number of samples, +realized by interpolating between stored samples. The near-unity ASRC's +datapath is a fractional delay that *creeps*: the fractional position +advances by the small rate deviation every frame. + +**Frame** — one sample per channel at one time instant; interleaved +buffers store frame after frame. Latency and occupancy are denominated +in frames so they are channel-count-invariant. + +**Group delay** — the delay a filter imposes on signal envelopes; for +the linear-phase FIR here it is a constant (T−1)/2 taps ≈ 24 input +samples for the default filter, the fixed half of the converter's +latency budget. + +**Header-only** — a library shipped entirely as headers, compiled into +each consuming translation unit. It buys trivial integration and full +inlining, and costs ABI fragility discipline (see the rejected +`hardware_destructive_interference_size`). + +**Interleaved** — channel-multiplexed sample layout +(`L R L R …`), the wire format of `push()`/`pull()`. + +**Kaiser window** — the near-optimal FIR design window with one shape +parameter β trading main-lobe width against sidelobe level, plus +published closed-form fits from stopband attenuation to β and to filter +length. Chosen because the design math is a page of code with known +error bounds, evaluated once at construction. + +**Latency breathing** — the slow wander of the FIFO term of end-to-end +latency (a fraction of the block size) as the servo phase-tracks the +block beat in Track stage; benign, and distinct from an actual setpoint +change. 
+ +**Lock-free** — progress guarantee: every operation completes in a +bounded number of steps regardless of what other threads do, including +being suspended at the worst instruction (strictly, this per-operation bound is what the literature calls *wait-free*, which implies lock-free). Required of everything on the +audio path; asserted at compile time for every atomic the hot path +touches. + +**Memory model / `std::memory_order`** — the C++ rules defining which +values a load may observe across threads, controlled per-operation by +ordering annotations. This codebase's idiom is *sufficiency as +documentation*: each annotation is exactly as strong as the proof needs, +so each one tells the reader why it exists. + +**MVE / Helium** — Arm's M-profile Vector Extension (Cortex-M55 class): +128-bit SIMD including fp32, but no double precision. Its presence or +absence gates which Q15 kernel the library compiles. + +**NCO (numerically controlled oscillator)** — an accumulator whose +increment sets its frequency. The converter's μ phase accumulator is the +NCO of its PLL: the servo's ε̂ sets the increment, and wraps mark whole-sample +slips. + +**Near-unity** — the regime this library specializes in: conversion +ratios within a few hundred ppm of 1.0 (two "48 kHz" clocks), where the +general resampling problem degenerates into a creeping fractional delay. +The specialization is what buys the 48-tap datapath and sub-millisecond +filter delay. + +**Occupancy** — the number of frames currently buffered between the +domains (ring plus staged frames). The servo's only sensor; its +quantization is the fundamental measurement limit of the design. + +**Phase accumulator** — the unsigned Q0.64 integer holding the +fractional resampling position. It accumulates only the rate *deviation* +per output sample, in integer arithmetic (resolution 2⁻⁶⁴ samples), and +detects whole-sample slips by 64-bit wraparound.
+ +**Polyphase decomposition** — factoring one long interpolation filter +into L short branches, one per fractional-delay phase, so each output +sample evaluates T taps instead of L·T. The table stores L+1 rows so the +μ wrap 1→0 is branch-free and exactly continuous. + +**ppm (parts per million)** — 10⁻⁶, the natural unit of crystal +tolerance and drift. Consumer crystals sit tens of ppm from nominal; the +converter accepts ±1000 ppm by default. + +**Q-format (Q0.15, Q1.14, Q1.30, Q0.64 …)** — fixed-point notation: +Qm.n has m integer bits and n fractional bits in a signed word (the +project writes the unsigned 64-bit phase as Q0.64). Q15 audio samples +are Q0.15; the corresponding coefficients are Q1.14 so values slightly +above 1.0 survive; accumulation is int64. + +**Ratchet** — the CI mechanism that compares deterministic instruction +counts against committed baselines at ±3% in *both* directions: a +regression fails, and an unexplained improvement also fails until the +baseline is deliberately re-committed. Two-sided so that numbers can +only change on purpose. + +**Semihosting** — a debug protocol by which a bare-metal program calls +into its host/debugger for I/O; how the Cortex-M test binaries print +results and exit under QEMU system emulation. + +**Seqlock** — a reader-retry publication scheme: the writer makes a +sequence counter odd, writes the payload, makes it even; readers retry +until one even value brackets a whole read. Used by the dual-core +firmware to publish multi-word statistics coherently with only 32-bit +atomics. + +**Servo** — a feedback controller steering a plant toward a setpoint; +here the PI controller that steers FIFO occupancy to the target by +adjusting the resampling rate, thereby *becoming* the clock-ratio +estimator. + +**Setpoint** — the target FIFO occupancy (`targetLatencyFrames`), +i.e. the buffering half of the latency budget. 
Must exceed the pull +block and the peak jitter excursion; the converter raises its +*effective* value when it observes otherwise. + +**Sine-fit metrology** — measuring quality by least-squares-fitting the +known test tone (amplitude, phase, frequency) and analyzing the residual +after exact subtraction. Sharper than FFT bins for single-tone tests and +immune to window leakage — leakage of the fitted tone cannot masquerade +as noise or crosstalk. + +**Slip** — the whole-sample event in near-unity conversion: after +roughly 1/ppm samples the accumulated fractional position crosses a +sample boundary and the read window shifts by one input sample. The +extra polyphase row makes the slip exactly continuous in the output. + +**SNR (signal-to-noise ratio)** — here, the fitted test tone's power +against everything else in the analysis window (a THD+N-style residual, +so distortion counts as noise), in dB. + +**Soft float / soft double** — floating-point arithmetic emulated in +integer instructions because the hardware lacks the format — FP64 +everywhere on Cortex-M33 and Hexagon. The reason the fixed-point +datapaths exist and the reason the servo's double math is budgeted per +block, not per sample. + +**SPSC (single-producer single-consumer)** — the concurrency restriction +of the library's ring: exactly one pushing agent and one pulling agent. +The restriction is what makes lock-freedom cheap — and it is a contract +about agents, not threads, which is what lets two CPU cores satisfy it. + +**TCG plugin** — an instrumentation hook in QEMU's Tiny Code Generator; +the project's counting plugin observes every executed guest instruction, +yielding the deterministic per-workload counts the ratchet gates. + +**THD+N (total harmonic distortion plus noise)** — everything that is +not the test signal — harmonics, spurs, noise — integrated over the +audio band and expressed relative to the signal. 
The AES17 measurement +the comparison document reports (−132 dB at the 24-bit interface). + +**ThreadSanitizer (TSan)** — a compiler-instrumented data-race detector +that observes the ordering annotations actually used. It certifies only +the interleavings a run produces, which is why the project also runs it +on genuinely weakly-ordered arm64 hardware. + +**Type-2 loop** — a control loop with two integrators around the cycle +(here: the PI's integrator plus the FIFO, which integrates rate error +into occupancy). Type 2 is what nulls a *constant* rate offset with zero +standing occupancy error. + +**UF2** — the drag-and-drop flashing format of Raspberry Pi boards; the +build artifact of both Pico 2 firmware harnesses. + +**Underrun / overrun / resync** — the converter's three accounting +events: a pull found too little data (output silence-padded, refill and +re-lock), a push found the FIFO full (newest frames dropped), and the +consumer-side hard discard back to the setpoint after the high watermark +is reached. All three are counted, published in `Status`, and expected +to be zero after lock. + +**VLIW (very long instruction word)** — an architecture that packs +several operations into one issue packet scheduled by the compiler, as +on Qualcomm's Hexagon DSP. Why "instructions executed" and "packets +executed" differ there, and part of why instruction counts are budgets +rather than cycle counts. + +**Wraparound arithmetic** — unsigned integer arithmetic modulo 2^N, +which C++ defines exactly. The ring's monotonic indices and the DWT +cycle deltas both rely on the same theorem: a difference that fits the +word is computed exactly *through* the wrap, so the wrap is not an edge +case but a non-event. + +**xrun** — ALSA's collective name for a device-level underrun or overrun +(the OS missed the hardware's deadline). Handled in the bridge by +`snd_pcm_recover`; distinct from the converter's own underrun/overrun +accounting, which sits one layer up. 
diff --git a/book/src/part5/hardware.md b/book/src/part5/hardware.md index ef2c709..554ea06 100644 --- a/book/src/part5/hardware.md +++ b/book/src/part5/hardware.md @@ -1,3 +1,427 @@ -# hardware +# Real clocks: bridges and firmware -*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* +Everything measured so far in this book — the 135 dB residual, the lock in +~1 s, the drift ramp tracked without unlocking — came out of a simulation. +A good one: deterministic, sample-granular, reproducible to the bit, able +to synthesize a +200 ppm offset that is *exactly* +200 ppm so the servo's +estimate has a truth value to be judged against. That determinism is the +whole reason Part II's proof system works, and it is also, unavoidably, a +confession. A simulated clock is a number in a loop. It has no crystal, no +temperature coefficient, no USB host controller rescheduling its transfers, +no twelve-hour soak in a warm room. The library exists to reconcile two +*physical* oscillators, and at some point the only honest move is to plug +two of them in. + +This chapter is about that move: what real hardware can prove that the +deterministic suite cannot, the three test setups the project defined for +it, and the three harnesses that shipped — an ALSA bridge for Linux hosts +and two firmware images for the Raspberry Pi Pico 2. It ends by stating +plainly which numbers exist today and which still await a physical board. + +## What simulation cannot say + +Be precise about the gap, because it is narrower than "simulation isn't +real." The two-clock simulator *is* the library's use case in every +algorithmic respect; nothing about the datapath or the servo mathematics +changes on hardware. What changes is the input to the control loop: + +- **The offset stops being constant.** Real crystals sit typically + 20–200 ppm apart and move several ppm with temperature — slowly, over + minutes, as the room warms or a component self-heats. 
The suite tests a + *scripted* drift ramp; hardware supplies an unscripted one, forever. +- **The pacing stops being clean.** A simulated push arrives exactly on + schedule. A USB audio dongle's data arrives when the host controller and + the kernel get around to it — jitter that is structured, bursty, and + unlike anything a deterministic loop generates. The FIFO setpoint rule + ("exceed the peak occupancy excursion of your push/pull jitter") is only + ever *exercised* by real jitter. +- **Time stops being short.** The quality suite analyzes one second of + audio after settling. The claim a deployment actually cares about — + *zero* underruns, overruns, or resyncs over hours — is a statement about + the tails of every distribution at once, and the only instrument that + measures tails is a soak. A multi-hour run on independent oscillators is + the test no simulation honestly replaces. + +There is also one thing simulation does *better*, worth keeping: a +synthesized offset is exact, so convergence can be asserted to a tolerance. +Two real crystals give you a true offset you don't know — you can check the +estimate is *stable* and *independently corroborated* (count frames from +each device against `CLOCK_MONOTONIC` for ten minutes; the measured rate +ratio should match the servo's estimate to well under 1 ppm), but never +that it equals a known constant. The hardware plan uses both kinds of truth +deliberately, as we'll see. + +`docs/HARDWARE_TESTING.md` defines three setups, in increasing order of +effort, all commodity parts: + +1. **One Pi, two USB audio dongles** (~$15 of adapters). Each dongle clocks + its own 48 kHz from its own crystal; the library bridges them. The + canonical real-world test, and the source of the headline result the + project wants: "locked to the real inter-crystal offset of X ppm, N + hours, zero discontinuities." +2. 
**Pi + Raspberry Pi Pico 2.** Validates the QEMU-derived Cortex-M33 + numbers on an actual RP2350: real cycles against emulated instruction + counts, and the dual-core deployment shape. +3. **Two Pis over Ethernet.** The network-audio case, where `push()` sees + bursty UDP delivery instead of callback-paced blocks — the setpoint rule + under genuinely hostile jitter. + +Setup 1's harness is `examples/alsa_bridge.cpp`; Setup 2's are +`examples/pico2_cyccnt/` and `examples/pico2_dualcore/`. Setup 3's programs +are not yet written. Each shipped harness is worth reading closely, because +each one is the library's documented rules applied under witness. + +## The ALSA bridge: two blocking threads, on purpose + +The bridge is ~370 lines and structurally almost insolently simple: open a +capture device and a playback device, start two threads, and let each +thread block on its device. + +```cpp +std::thread capture([&] { + // ... + const snd_pcm_sframes_t n = snd_pcm_readi(in.pcm, dst, period); + // ... + asrc.push(buf.data(), frames); // overruns counted by the converter +}); + +std::thread playback([&] { + // ... + asrc.pull(buf.data(), period); // silence-pads while filling/underrun + // ... + snd_pcm_writei(out.pcm, src, period - done); +}); +``` + +The simplicity is the point. The library's runtime contract is one producer +agent and one consumer agent, each paced by its own clock — and a blocking +ALSA read *is* a clock. `snd_pcm_readi()` returns when the capture device +has delivered a period of frames, which happens at the cadence of that +device's crystal; `snd_pcm_writei()` blocks until the playback device has +made room, at the cadence of the other crystal. The two threads never +communicate except through the converter, which is exactly the interface +the whole library was designed around. No callbacks, no timers, no event +loop: the hardware paces the threads, and the converter absorbs the +difference. 
If you want to see the two-agent contract of +[the ring chapter](../part1/spsc-ring.md) with the abstractions removed, +this file is it. + +A few decisions inside deserve attention. + +**Format negotiation prefers honesty over generality.** The bridge asks +each device for `FLOAT_LE` — the converter's native sample type, no +conversion — and falls back to `S16_LE` with explicit scale-and-clamp +helpers when the hardware refuses. That is the entire format matrix. Cheap +dongles are overwhelmingly S16 devices, and a test harness that negotiated +every ALSA format under the sun would bury its purpose in plumbing. It +also *refuses* a rate it didn't ask for: if the device counters with +anything but the requested rate, the bridge errors out rather than +silently measuring the wrong experiment. + +**Xrun recovery is delegated, then observed.** When a read or write +returns an error, the bridge calls `snd_pcm_recover()` and continues; +only an unrecoverable error stops the run. This is a deliberate division of +labor: ALSA xruns are a *device*-level discontinuity (the OS failed to +service the hardware in time), and the converter has its own machinery — +silence-padding, refill, re-lock with the ppm estimate kept — for the +*converter*-level consequences. The bridge does not try to be clever +across that boundary; it recovers the PCM and lets the converter's +counters record whatever backlash arrives. During a soak, the +once-per-second status line is where you watch both layers at once. + +**The one configuration rule in the file is the ServoConfig rule.** The +bridge runs with `--period` frames per ALSA transfer (default 128), and +block-quantized transfer means the FIFO occupancy legitimately swings +by around half a block without the clocks having moved. 
The servo's +`unlockThresholdFrames` defaults to 24 — tuned for fine-grained transfer — +so the bridge applies the documented rule in code: + +```cpp +// Per the ServoConfig guidance: the unlock threshold must sit +// comfortably above half the transfer block, or block-quantized +// occupancy excursions can demote the servo stage spuriously. +cfg.servo.unlockThresholdFrames = + std::max(cfg.servo.unlockThresholdFrames, 1.5 * static_cast<double>(args.period)); +``` + +Miss this and the harness would report spurious servo demotions that have +nothing to do with the clocks — a measurement artifact manufactured by the +measurement tool. (The next chapter returns to this rule as one of the +three scaling axes.) + +**The telemetry switches are the experiment design.** Three flags turn the +bridge from a demo into an instrument: + +- `--csv <path>` appends the once-per-second `status()` snapshot — state, + ppm, smoothed fill, underrun/overrun/resync counters — as a CSV row. + This is the soak's evidence: the ppm trace over hours *is* the + thermal-drift measurement, and the counters' final values *are* the + zero-discontinuity claim. Point a hair dryer at one dongle and the trace + should show the crystal move several ppm in real time, tracked without + anything audible; a fast ±50 ppm step should show a stage demotion and a + re-lock. +- `--dump <path>` has the playback thread also write the post-ASRC float + stream to disk, raw. This exists because of an honest limitation of + cheap hardware: a $7 dongle's analog path measures around −80 dB, and + no quality claim about a 135 dB converter survives passage through it. + The dump sidesteps the analog path entirely — the *clocks* are real even + if the signal never goes analog — and the notebook tooling + (`notebooks/asrc_comparison.ipynb` carries the AES17-style measurement + machinery) analyzes the capture offline. +- `--tone <hz>` completes that thought. 
In tone mode the capture thread + *still blocks on* `snd_pcm_readi()` — the input device's crystal still + paces every push — but the captured samples are discarded and a clean + synthetic sine is pushed instead. Real clocks, known signal, no trust + placed in an ADC that hasn't earned it. The combination + `--tone 997 --dump out.raw --csv trace.csv` is Setup 1's full + measurement: a 997 Hz tone through two real crystals into the AES17 + notebook. + +## `pico2_cyccnt`: buying cycles with instructions + +Part II built a performance ratchet on QEMU instruction counts: +deterministic, noise-free, gateable in CI at ±3%. The README's Cortex-M33 +table says a 2-second Q15 stereo workload executes 484,146,844 +instructions — that number will be identical tomorrow, which is what makes +it a regression gate. But it is a count of *instructions*, and silicon +budgets are spent in *cycles*. An instruction can take one cycle or ten; +memory waits, pipeline stalls, and branch penalties exist in silicon and +not in QEMU's functional model. So every deployment claim derived from the +ratchet — "Q15 mono fits a 150 MHz core with room to spare, stereo is +tight" — has been carrying an asterisk: *instruction counts are not cycle +counts; treat these as budgets pending real-silicon validation.* + +`examples/pico2_cyccnt/` is the firmware that removes the asterisk. 
It is +a standalone flashable UF2 (deliberately *not* part of the root build — +it drags in the whole Pico SDK) that runs the exact steady-state workload +of the icount benchmarks — the same `push(32)`/`pull(32)` duplex loop — +on a real RP2350, timing every block with the Cortex-M33's DWT cycle +counter: + +```cpp +bool enableCycleCounter() { + CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk; + if (DWT->CTRL & DWT_CTRL_NOCYCCNT_Msk) + return false; // implementation without a cycle counter + DWT->CYCCNT = 0; + DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk; + return true; +} +``` + +DWT — the Data Watchpoint and Trace unit — is optional silicon on +M-profile cores, so the firmware checks `NOCYCCNT` at runtime rather than +assuming; `TRCENA` gates the whole trace block and must be set first. The +counter is 32 bits free-running, which wraps in ~28.6 s at 150 MHz — fine, +because the firmware only ever takes per-block unsigned deltas, and +unsigned subtraction across a wrap is exact by the same modular-arithmetic +argument the ring buffer's indices rested on. A thousand warmup iterations +run first (past the Filling state, servo settled), then two thousand +measured blocks, reported as mean, p99, and max — the tail statistics +matter, because the workload runs with interrupts live and USB +housekeeping shows up in the max column. + +The output table covers Q15 in both presets at 1, 2, and 12 channels, plus +float at one channel. The float rows are not there in the hope of good +news; they exist to put a *measured* number on "soft-double accumulation +is the wrong datapath on an FP64-less core" — the QEMU baselines already +price float at roughly 3.8× the Q15 instruction count, and a cycle figure +makes the guidance concrete rather than rhetorical. + +The deeper purpose is calibration. The committed M33 baselines divide out +to 5,043 instructions per frame for the stereo Q15 pipeline and 10,027 for +the 12-channel one. 
Divide the firmware's measured cycles-per-frame by +those figures and you get the constant the whole ratchet has been waiting +for: *one QEMU instruction ≈ N RP2350 cycles*. That single ratio converts +every current and future M33 instruction baseline into a real cycle +budget — the ratchet keeps its CI-grade determinism, and hardware +contributes exactly one number, measured once per silicon revision instead +of once per commit. + +One scoping note recorded in the README of the harness: the cycled input +buffer is 4,800 frames rather than the icount workload's 12,000, so that +the 12-channel case fits the RP2350's 520 KB SRAM alongside the converter. +Per-block work is unchanged; the deviation is documented because an +unexplained difference between two "identical" workloads is how +calibration constants go quietly wrong. + +## `pico2_dualcore`: one clock domain per core + +The README's platform guidance ends with a suggestion: on Pico-class +parts, stereo `balanced()` wants either the `fast()` preset *or the +RP2350's second core*. `examples/pico2_dualcore/` is that suggestion built +and made falsifiable — the converter's two ends on the two Cortex-M33 +cores, one core per clock domain, judging its own run against PASS/FAIL +gates. + +- **core0 is the producer.** It pushes 32-frame blocks paced by the + microsecond timer at `rate × (1 + 200e-6)` — a +200 ppm offset + synthesized from the shared timebase. This is the simulation trick + imported onto silicon, and it is what real crystals can never give: an + offset that is *exactly* +200 ppm, so the converged estimate can be + asserted within ±5 ppm rather than merely admired. core0 also owns all + USB telemetry. 
+- **core1 is the consumer.** It pulls 32-frame blocks at exactly the + nominal rate and times every `pull()` with DWT.CYCCNT — enabled *on + core1*, because each RP2350 core has a private DWT behind the same + fixed address (the 0xE000_0000 private peripheral region is core-local; + enabling the counter from core0 would start the wrong one). core1 never + prints: contending on the stdio mutex from the paced core would put USB + stalls onto the output clock domain. + +Is running the two ends on two *cores* even within the library's +contract? The firmware answers this in its opening comment, and the +reasoning belongs in this book: the contract is one producer *agent* and +one consumer *agent* around a lock-free SPSC ring with acquire/release +atomics. It names agents and memory ordering, not `std::thread`. The +RP2350's cores share coherent SRAM with no data caches in front of it, so +C++ atomics behave across cores exactly as they do across threads — two +cores satisfy the contract precisely as two threads do. `push()` stays +core0-only, `pull()` stays core1-only, `status()` is documented +any-thread. The chapter on the ring said the memory-ordering argument was +the proof and the tests merely raised the price of being wrong; here the +same argument, unchanged, carries the design onto a second processor. + +Everything else that crosses cores is an explicit block of **32-bit** +atomics, and the width is a load-bearing decision inherited from the +library itself: on the M33, 64-bit `std::atomic` is not lock-free — it +routes through a library lock, which is exactly the failure the library's +own telemetry avoided by keeping its counters 32-bit. The firmware +`static_assert`s the lock-freedom of every cross-core type. 
The phase +handoff is a single release store of the converter pointer (publishing +every plain write the constructor performed) matched by an acquire load on +core1; the teardown is the mirrored pair through a `consumerDone` flag, so +destroying the converter cannot race core1's last `pull()`. + +The consumer's statistics need more than individual atomicity, though: a +printed telemetry line should describe one *instant*, not a mean from this +second next to a max from the last. With 64-bit atomics off the table, the +firmware uses a seqlock — the sequence counter goes odd while the writer +updates, even when it finishes, and the reader retries until the same even +value brackets its whole read: + +```cpp +void publishSnapshot(const Snapshot& s) { + const std::uint32_t q = g.seq.load(std::memory_order_relaxed); + g.seq.store(q + 1, std::memory_order_relaxed); + std::atomic_thread_fence(std::memory_order_release); + // ... payload stores, all relaxed 32-bit atomics ... + g.seq.store(q + 2, std::memory_order_release); +} +``` + +The payload fields are themselves relaxed atomics — no torn reads, no +undefined behavior — so the seqlock adds only mutual coherence, and a +retry costs nothing at a 1 Hz read rate. It is the cheapest possible +answer to "publish five numbers atomically on a core with 32-bit +atomics," and a pattern worth stealing. + +### The honest scoping decision + +The firmware runs two ~30-second phases. Phase A is Q15 stereo +`balanced()` at 48 kHz — the configuration the README calls tight on one +core, now with the input domain moved off the consumer's core entirely. +Phase B is the 12-channel reference-microphone/AVB shape... at **16 kHz**, +not 48. 
Its README records why, and the passage is a model of how to scope +a demo honestly: + +> Phase B is 16 kHz **by arithmetic, not caution**: the M33 QEMU baseline +> puts `pipeline12_q15` at 10,027 insns/frame against a 150 MHz / 48 kHz +> budget of 3,125 cycles/frame — more than 3× over, and `pull()` of a +> single instance is one consumer by contract, so no core assignment can +> split it across cores. Dual-core buys one clock domain per core, not +> more datapath than one core has. + +That last sentence is the chapter's most important deployment fact. The +SPSC contract that makes the converter lock-free is also a ceiling: one +consumer agent means the entire per-pull datapath — all twelve channels of +it — executes on whichever core calls `pull()`. A second core removes the +*other* clock domain's work and everything else the application does, and +that is all it removes. At 16 kHz the per-frame budget triples to 9,375 +cycles and the 12-channel shape fits; and since measured cycles per block +are rate-independent, phase B still delivers the real-silicon counterpart +of the 12-channel instruction baseline. Nothing was hidden by the rate +change — 16 kHz is that configuration's actual deployment rate (the +next chapter's rate-scaling rules are applied in the phase B config, +`FilterSpec` band edges and servo bandwidths scaled by 16/48) — but the +README refuses to let you believe dual-core bought compute it didn't. + +Two more of the library's documented rules appear in this firmware as +lived decisions rather than advice. The FIFO setpoint is 144 frames, not +the default 48: the producer core shares its time with USB logging, whose +worst-case writer stall is capped at 2 ms in the build — 96 frames of +consumer progress at 48 kHz — so the setpoint must exceed that excursion +with margin. That is the README latency rule applied to a producer that +also logs. 
And the pacing schedules compute absolute due times +(`t0 + (b·num)/den` in integer microseconds), so a stall is followed by +catch-up pushes rather than permanent schedule slip — the difference +between jitter the FIFO absorbs and a rate error the servo would chase. + +A PASS requires: Locked within 2 s of cold start (6 s for phase B, whose +scaled servo is proportionally slower), every 1 Hz ppm sample after the +settling gate within ±5 of the synthesized +200, and zero underruns, +overruns, *and* resyncs after first lock — overruns and resyncs gate too, +because they are the signature of a consumer that cannot keep up. The +firmware prints per-phase verdicts, an `OVERALL` line, and a sentinel +string, so a future self-hosted CI lane can parse a soak the same way the +QEMU lanes parse emulated runs. + +The dual-core README also states its own limit, and it belongs here +verbatim in spirit: both domains are paced from the RP2350's one timer — +that is what makes +200.0 an exact, assertable truth — so this firmware +*cannot* prove the inter-crystal lock that Setups 1 and 2 ultimately want. +It proves the deployment shape: two cores, two clock domains, lock-free +handoff, real cycle headroom. + +## What is measured, and what is not yet + +The project's culture is that numbers are measured or absent, so here is +the ledger as it stands: + +- **Shipped and measured on real clocks: nothing yet.** All quality and + performance figures in this book so far come from deterministic + simulation, host benchmarks, and QEMU instruction counting. +- **Shipped and awaiting hardware:** all three harnesses build — the ALSA + bridge wherever ALSA exists, both Pico 2 firmwares as flashable UF2s — + but `docs/HARDWARE_TESTING.md` says it plainly: *the measured numbers + await a physical Pico 2*, and the multi-hour dongle soak awaits an + afternoon with a Pi. 
The cycles-per-instruction calibration constant, + the real `%core@48k` figures, the hour-scale zero-discontinuity claim, + and the thermal-drift trace are all, today, well-instrumented empty + columns. +- **Not yet written:** the small script that plots a `--csv` ppm trace and + runs the notebook analysis over a `--dump` capture, and both Setup 3 + programs (UDP sender, receiver-with-ASRC — the plan is to reuse the + bridge's output half). + +A book that inherited this project's habits could not end the chapter any +other way. The harnesses are the falsifiable form of the library's +deployment claims; until a board runs them, the claims stay labeled as +budgets. + +## Verify it yourself + +```sh +# No hardware: two OS threads 500 ppm apart, lock and estimate on live +# (jittery) scheduling — the software rehearsal of the bridge: +cmake -B build -DSRT_BUILD_EXAMPLES=ON && cmake --build build -j +./build/examples/drifting_clocks + +# Setup 1 (Linux + two audio devices; srt_alsa_bridge builds when ALSA +# is found). Real clocks, synthetic tone, telemetry + capture: +./build/examples/srt_alsa_bridge --in hw:1,0 --out hw:2,0 \ + --tone 997 --csv trace.csv --dump post_asrc.f32 --seconds 3600 +# Then: ppm column of trace.csv is the thermal-drift instrument; analyze +# post_asrc.f32 with the AES17 machinery in notebooks/asrc_comparison.ipynb. + +# Setup 2 firmware (standalone builds; arm-none-eabi-gcc + network for +# the Pico SDK fetch): +(cd examples/pico2_cyccnt && cmake -B build -DPICO_BOARD=pico2 && cmake --build build -j) +(cd examples/pico2_dualcore && cmake -B build -DPICO_BOARD=pico2 && cmake --build build -j) +# Flash the UF2s, open the USB serial port, and wait for the sentinel +# lines: SRT_PICO2_DONE / SRT_PICO2_DUALCORE_DONE with per-phase PASS/FAIL. +``` + +If you have the hardware this project's authors did not have on their +bench, you are holding the most valuable contribution available: run the +soak, and turn the empty columns into numbers. 
diff --git a/book/src/part5/scaling.md b/book/src/part5/scaling.md index b0a2db4..2f783c3 100644 --- a/book/src/part5/scaling.md +++ b/book/src/part5/scaling.md @@ -1,3 +1,311 @@ -# scaling +# Channels, rates, and the rules that scale -*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* +Every measured number in this book so far was taken at one operating point: +48 kHz, one or two channels, fine-grained transfer. Real deployments move +along three axes away from that point — more **channels**, a different +**sample rate**, coarser **blocks** — and each axis has a rule, a failure +mode when the rule is ignored, and a measurement that pins both. This is +the chapter a deploying engineer should read twice: once before choosing a +configuration, and once after the first surprising telemetry line. + +The three rules, stated up front: + +1. **Channels**: one converter instance per *clock domain*, never per + channel group; channel count is then a nearly-free multiplier on the + dot product. +2. **Rates**: every configuration field denominated in absolute hertz must + scale with the sample rate — start from `Config::forSampleRate()`. +3. **Blocks**: the FIFO setpoint must exceed the pull block size (the + converter now enforces this) and the servo's unlock threshold must + clear the block-quantization sawtooth; coarse blocks also move you into + a measurably different quality regime. + +## Channels: coherence is free, so don't pay for it + +`Config::channels` is a runtime count with no architectural limit — mono +through 7.1.4 and beyond. The design rule is about instance boundaries: +**one instance per clock domain**. If a 12-channel AVB stream and a stereo +monitor feed arrive on the *same* recovered clock, they are one domain and +could share one instance or take one each, as convenient; but never split one +stream's channels across instances, and never funnel two clock domains +into one. 
+ +The reason to keep a stream's channels together is a property the +implementation gives you by construction. Within one instance, every +channel of a frame is resampled at *literally the same fractional +position*: the phase accumulator, the servo, and the coefficient blend are +all per-instance state, and the per-channel work is only the dot product. +There is no per-channel phase to drift, so inter-channel phase coherence +is exact — not "matched to within a specification," but bit-identical in +the only quantity that could differ. Two audiences care intensely: + +- **Surround imaging.** Phantom sources between speakers are constructed + from inter-channel amplitude and time relationships; an ASRC that + resampled channels at even slightly different phases would smear them. + Here there is no skew to budget for. +- **Microphone arrays.** Beamforming and cross-correlation live entirely + on inter-channel time differences at sub-sample precision. The README + calls out the AVB case directly: a stream bundling reference microphones + with the program feed keeps its array geometry intact through the + converter. (AVB Class A's 8-frame packets are also fine-grained enough + for the Quiet servo stage — the block axis, below, cooperates.) + +Split those channels across instances and you forfeit the guarantee: each +instance runs its own servo on its own FIFO, and two servos tracking the +same physical clock still produce two independently-wobbling phase +trajectories. The coherence rule costs nothing and buys exactness; its +violation costs exactness and buys nothing. + +### What N channels cost + +Sharing one fractional position per frame also shapes the cost. Each +output frame computes the coefficient blend — the interpolation between +adjacent polyphase rows — *once*, then reuses it for every channel's dot +product. N channels cost `blend + N × dot`, not `N × (blend + dot)`; the +fixed overhead amortizes, so the marginal channel is cheaper than the +first. 
+ +The instruction-count table in the README measures this shape. Comparing +the 12-channel Q15 pipeline against stereo across the three gated targets: + +| Target | `pipeline_q15` (2 ch) | `pipeline12_q15` (12 ch) | ratio | +|---|---:|---:|---:| +| Cortex-M33 | 484,146,844 | 962,613,655 | 2.0× | +| Cortex-M55 | 127,446,817 | 387,876,968 | 3.0× | +| Hexagon | 119,847,854 | 378,858,793 | 3.2× | + +Six times the channels for 2.0–3.2× the instructions. The spread itself is +informative: the M33's 2.0× says its per-frame cost is dominated by +shared work (the servo's soft-double arithmetic on an FP64-less core), so +extra channels are nearly half price; the M55 and Hexagon, whose shared +work is cheap, sit closer to the pure dot-product slope. On the host, the +same shape: Q15 stereo at 56.0 ns/frame versus 12-channel at 189.1 — +3.4× for 6× the channels, with a 12-channel stream still running at 110× +realtime on one Xeon core. + +### The proof that channels don't leak + +Coherence and cost say nothing about *correctness* — an interleave bug or +a channel permutation would sail through every single-channel quality +metric in the suite. `tests/test_multichannel.cpp` exists for exactly +that blind spot: every channel of one instance gets its own tone (600 + +731·c Hz — distinct, non-harmonically related, all inside the flat +passband up to 16 channels), and after a +200 ppm crossing each channel +must contain its own tone at full quality and nothing measurable of any +neighbor's. + +"Nothing measurable" is made rigorous the way this project usually is: the +channel's own tone is removed by tracked least-squares fit before the +other channels' frequencies are fitted on the residual, so the own tone's +spectral leakage (about −67 dB over a 1 s rectangular window at these +spacings) cannot masquerade as crosstalk. 
The gated results: worst +crosstalk below **−100 dB** for 12-channel float, below **−72 dB** for +16-channel Q15 — the latter sitting at the 16-bit format's own floor, +which is the honest bound for that datapath. Amplitude and per-channel +SNR are asserted in the same run, so a permutation, a gain error, and +crosstalk are all caught by one test. + +One coverage note is worth repeating because of how it was found. The +host's channel-parallel float kernel tiles channels in blocks of 8/4/2/1, +and an audit noticed that no test ever ran the K=2 and K=1 remainder +tiles — every configured count happened to decompose without them. The +suite now runs 5- and 7-channel variants (5 = 4+1, 7 = 4+2+1) precisely +to execute those tiles. The general lesson from Part II recurs: coverage +you haven't verified reaches the code is coverage you don't have. + +## Rates: hertz-denominated defaults are a 48 kHz assumption + +The library's defaults read as innocently portable — until you notice +which fields carry units. `FilterSpec::balanced()` places the passband +edge at 20,000 Hz and the first image to suppress at 28,000 Hz; +`ServoConfig` sets loop bandwidths of 10/1/0.05 Hz and smoother corners of +50/5/0.5 Hz. Every one of those is an *absolute frequency chosen for +48 kHz operation*, and the two misconfigurations they invite fail in +instructively different ways. + +The filter misconfiguration fails loudly, by design. Default band edges at +a 16 kHz rate would put the anti-image cutoff far above the input Nyquist +— a filter that passes images wholesale — and the constructor's validation +rejects the geometry outright (`passbandHz + stopbandHz` must not exceed +the sample rate), so you cannot ship it by accident. 
The servo +misconfiguration is the dangerous one, because nothing forces you to +notice: scale the filter (you must, to construct at all), keep the default +servo, and the converter builds, locks, tracks, and converts — while +silently costing about **32 dB** of quality at 16 kHz. That number is +measured, and the mechanism is worth understanding because it is the whole +rate-scaling story in one incident. + +At 16 kHz with a 200 ppm offset, the whole-sample slips arrive at +`ppm × fs` = 3.2 Hz instead of 9.6 Hz. The servo's three-pole Quiet +smoother has an absolute 0.5 Hz corner, so a beat at one-third the +frequency is rejected `(16/48)³` ≈ 28.6 dB *less* — the slip sawtooth +walks out from under the smoother, leaks into the rate estimate, and +frequency-modulates the audio. The measurement wears the FM signature +openly: roughly 32 dB below the 48 kHz figures at every tone, falling +6 dB per octave of signal frequency, exactly as small-index FM sidebands +scale. Nothing was wrong with the filter; the *control loop* was mistuned +by a factor of three because its tuning was written in hertz. + +The remedy is the `scaledTo` trio, and the factory that applies it: + +```cpp +srt::Config cfg = srt::Config::forSampleRate(16000.0); +cfg.channels = ...; // then adjust as usual +``` + +`FilterSpec::scaledTo` multiplies the band edges by `fs/48000` — same L +and T, so the same table size and per-frame cost, with the identical +response at every *normalized* frequency. `ServoConfig::scaledTo` does the +same to the six bandwidth/corner fields, keeping the loop identical in +per-sample terms — and scales the two hold times *inversely*, so the +promotion gates wait the same number of loop time constants rather than +the same number of wall-clock seconds. (That last refinement postdates the +first hand-scaled fix; re-measured, it changed nothing within noise, and +the test asserting it exists so the equivalence stays checked rather than +remembered.) 
Frame-denominated fields — lock and unlock thresholds, +`targetLatencyFrames`, ppm limits — are rate-invariant and stay put, +though their *duration* in milliseconds scales inversely with the rate. + +`tests/test_asrc_quality_16k.cpp` runs the full quality methodology +through the factory, and the outcome is the point of the design: 16 kHz +matches the 48 kHz *normalized-frequency structure*. The tones sit at the +same f/fs as the 48 kHz suite's 997 Hz / 6 k / 12 k / 19.5 k, and measure +136.6 / 121.9 / 114.3 / 106.5 dB against the 48 kHz suite's 135.0 / +120.0 / 112.8 / 105.8 on the same host — within about 2 dB down the line, +confirming that interpolation noise depends only on f/fs. Two consequences +deploy with you: group delay at the same tap count stays ~24 *input +samples*, which is three times as many milliseconds at 16 kHz (1.5 ms vs +0.5 ms); and the scaled Quiet loop at ~0.017 Hz settles proportionally +slower — the 16 kHz test runs 120 s where the 48 kHz one ran 40 s, the +same number of samples and of time constants. + +## Blocks: feasibility, then observability + +The block-size axis has two boundaries, one hard and one +information-theoretic. + +### The hard one: a pull can only synthesize from what is buffered + +`pull(frames)` produces output from frames already in the FIFO. If the +occupancy setpoint sits at or below the pull block size, the loop is +infeasible: each pull drains the buffer past the setpoint, the servo +steers to refill it, the next pull drains it again — a permanent underrun +limit cycle, dropouts every few hundred milliseconds, never locking. Early +versions documented the rule ("the setpoint must exceed the pull block +size") and trusted the integrator between chair and keyboard; the current +converter enforces it. 
When `pull()` observes a block larger than the +setpoint in force, it raises the *effective* setpoint to the block plus a +margin — half a block, at least one pop chunk — sized so the entry +occupancy never grazes the pull size even at the bottom of the block-beat +sawtooth, and bounded by FIFO capacity: + +```cpp +const std::size_t needed = frames + std::max(frames / 2, kPopChunkFrames); +const std::size_t newTarget = + std::clamp(needed, cfg_.targetLatencyFrames, maxTargetFrames_); +``` + +Configurations that already satisfy the rule are left exactly as +configured; the servo slews to a raised setpoint glitch-free (integrator +kept — the clocks haven't changed, only the target). The cost is not +hidden: latency follows the raised setpoint, `designedLatencySeconds()` +reports it, and `Status::effectiveTargetLatencyFrames` differs from the +configured value exactly when the adaptation has occurred — a field worth +plotting in deployment telemetry, because it is the converter telling you +your latency budget and your callback size disagree. Capacity bounds the +raise: the default ring (a 1024-frame floor) accommodates pull blocks up +to ~340 frames; larger callbacks need `fifoFrames` sized explicitly. + +### The soft one: what a coarse count can tell a servo + +The servo's only sensor is FIFO occupancy, and occupancy is quantized — +to whole frames at best, to whole *blocks* with block transfer. At +deviation ε the observable carries a deterministic sawtooth, one push +block peak-to-peak, at the beat frequency `ε × fs / block`. Whatever the +loop passes into its estimate frequency-modulates the audio. With +sample-granular transfer the sawtooth is one frame and the Quiet stage's +three-pole cascade rejects it to roughly −120 dBc equivalent at 20 kHz. 
+With ≥32-frame callbacks, that level of quiet is +**information-theoretically unavailable from counts alone** — no filter +recovers sub-sawtooth phase from an observable whose quantization *is* +the sawtooth, not while still tracking real drift. + +The design response is to stop pretending. Promotion from Track to Quiet +is gated on the cascade-smoothed error staying small, which is naturally +false while a large block beat dominates the observable — the gate is +itself the discriminator between the two regimes, so coarse-block +operation deliberately stays in Track. There the block beat is mostly +phase-tracked as benign *latency breathing* (the FIFO term of the latency +wanders by a fraction of the block as the servo follows the beat), and +the remainder appears as low-rate FM measured in cents: +`notebooks/asrc_block_size_study.ipynb` puts it at ~0.9 cents rms over a +61 dB wideband floor at 32-frame blocks, ~1.3 cents rms over 53 dB at +5 ms (240-frame) blocks. Those are honest numbers for a different regime, +not a degradation of the headline ones — the 135 dB figures are for +fine-grained transfer, and the comparison document says so plainly. If +your deployment pushes hardware-DMA-sized blocks and needs studio +transparency, the current converter is information-limited, and not by +accident; the limitations section of the README sketches the eventual +answer (per-block timestamps for sub-sample phase observation). + +One more block-denominated rule closes the loop with the previous +chapter. The servo's `unlockThresholdFrames` (default 24) is the +excursion that demotes a stage; block-quantized occupancy legitimately +swings by about half a block without the clocks having moved. 
The +guidance in `pi_servo.hpp` — keep the threshold comfortably above half +the block — is applied literally in the ALSA bridge (`1.5 ×` the period), +and ignoring it produces the most confusing failure on this axis: a +converter that locks, runs cleanly, and "spuriously" demotes itself on +schedule, at the beat frequency, forever. + +## The configuration walk, in order + +The axes compose, so a deployment configures them in dependency order: + +1. Start from `Config::forSampleRate(rate)` — never raw defaults at a + non-48 kHz rate. +2. Set `channels` to the full width of each clock domain's stream; one + instance per domain. +3. Set `targetLatencyFrames` above your pull block *and* your worst + push/pull jitter excursion (the dual-core firmware's 144-frame + setpoint against a 2 ms logging stall is the worked example); set + `fifoFrames` explicitly past ~340-frame callbacks. +4. Raise `unlockThresholdFrames` above ~1.5× your transfer block. +5. Then watch `Status::effectiveTargetLatencyFrames` and the resync + counters in production — they are the converter's own opinion of + whether steps 3 and 4 were done right. + +## Verify it yourself + +```sh +# Channel independence: 12ch float (< -100 dB crosstalk), 16ch Q15 +# (< -72 dB), plus the 5/7-channel remainder-tile variants: +ctest --test-dir build -R MultiChannel --output-on-failure + +# The rate-scaling rule and the 16 kHz measurements (slow: each case is +# a 120 s simulated run; the first test checks the factory arithmetic +# deterministically): +ctest --test-dir build -R AsrcQuality16k --output-on-failure + +# The -32 dB failure itself, reproduced: in test_asrc_quality_16k.cpp, +# keep Config::forSampleRate(kFs) but overwrite the servo with unscaled +# defaults (cfg.servo = srt::ServoConfig{};) — the converter still builds +# and locks, and every threshold fails by ~30 dB, falling 6 dB per octave +# of tone frequency: the FM signature. 
(Restoring the unscaled *filter* +# instead fails fast: the constructor rejects band edges above the input +# Nyquist.) + +# The block axis, measured: latency breathing and the cents-scale FM +# decomposition at 32/64/240-frame blocks: +jupyter nbconvert --execute notebooks/asrc_block_size_study.ipynb + +# The feasibility rule live: run the drifting-clocks example, then rerun +# with cfg.targetLatencyFrames set below kChunk in the source — the +# adaptive raise reports itself in effectiveTargetLatencyFrames instead +# of dropping out: +./build/examples/drifting_clocks +``` + +The break-it-on-purpose suggestions are, as ever, the chapter in +miniature: each rule here was learned from a measured failure, and each +failure is still one edit away from being watched happening. From 4a1ac4b65927979ef55c5f99a74ce6bb37e95d91 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 1 Jul 2026 21:38:30 +0000 Subject: [PATCH 10/16] book: the portability chapters (Hexagon and Cortex-M) Hexagon as the one-file cross+emulator port with its four lessons (soft-double phase, restrict deltas, the C5 revert, no-unwind exceptions) and the two Cortex-M bare-metal ports (startup anatomy, memory maps, one-shot ctest protocol, Helium and SMLALD discoveries). Adds comment-only pt_* ANCHOR markers to the startup file, both linker scripts, and the M55 toolchain file; the .ld C-style anchor comments resolve in mdBook. 
https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9 --- book/src/part4/cortex-m.md | 404 +++++++++++++++++++++++++++++- book/src/part4/hexagon.md | 381 +++++++++++++++++++++++++++- cmake/arm-cortex-m55-mps3.cmake | 2 + platform/armv8m_startup.c | 10 + platform/mps2_an505/mps2_an505.ld | 4 + platform/mps3_an547/mps3_an547.ld | 4 + 6 files changed, 801 insertions(+), 4 deletions(-) diff --git a/book/src/part4/cortex-m.md b/book/src/part4/cortex-m.md index a38fa83..33ecd3e 100644 --- a/book/src/part4/cortex-m.md +++ b/book/src/part4/cortex-m.md @@ -1,3 +1,403 @@ -# cortex m +# Cortex-M: bare metal, two ways -*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* +The Hexagon port ran the library on a strange ISA under a familiar OS. +The Cortex-M ports remove the OS. No loader, no threads, no filesystem, +no `argv`, no reliable way to even return an exit code — and the library +must still build, run its test suite, and hold its instruction budgets, +because MCU-class parts are where a $5 deployment actually lives. + +The project runs two of them, and the pairing is deliberate. Each board +exists to prove something the other cannot: + +- **Cortex-M55**, on QEMU's MPS3 AN547 board model. The M55 has Helium + (MVE, the M-profile vector extension) and a full scalar FPU. It proves + the library survives *bare metal itself* — the startup, the memory + map, the missing runtime — and it turned out to be hiding the single + most surprising compiler discovery in the project's history. +- **Cortex-M33**, on QEMU's MPS2+ AN505 board model. The M33 is the + Raspberry Pi Pico 2 / RP2350 class of core: single-precision FPU only, + no Helium, DSP extension present. It proves what deployment on a cheap + part actually costs, in numbers concrete enough to be budgets. + +Both share one startup file and one CTest strategy; they differ in linker +script and in what their instruction counts taught the project. 
This +chapter covers the shared anatomy first, then the two boards' discoveries. + +## What `-nostartfiles` obligates you to + +The toolchain files (`cmake/arm-cortex-m55-mps3.cmake`, +`cmake/arm-cortex-m33-mps2.cmake`) link with `--specs=rdimon.specs +-nostartfiles`: newlib with semihosted I/O, and *no* toolchain crt0. From +that moment the project owes the CPU everything crt0 used to provide, and +the debt is paid in one C file, `platform/armv8m_startup.c`, shared by +both targets. + +It starts where the core starts — the vector table: + +```c +{{#include ../../../platform/armv8m_startup.c:pt_vectors}} +``` + +An Armv8-M core fetches its initial stack pointer from word 0 and its +reset address from word 1; the linker scripts pin this array at the +address the core will look (`KEEP(*(.vectors))`, first section — ITCM +address 0 on the AN547, the secure-alias base on the AN505, where VTOR +points at reset). The `used` attribute stops the compiler discarding an +array nothing references; `KEEP` stops `--gc-sections` doing the same at +link time. Belt and suspenders, because the failure mode — a garbage +vector table — doesn't diagnose itself; the core simply jumps into +nothing. + +There is a subtlety in how this file reaches the link, and it is the kind +of decision this book exists to record. The toolchain files pass the +startup source *on the linker command line*, from +`cmake/arm-cortex-m55-mps3.cmake`: + +```cmake +{{#include ../../../cmake/arm-cortex-m55-mps3.cmake:pt_linkline}} +``` + +The `g++` driver would otherwise compile a `.c` link input as C++, and C++ +is allowed to lower those `(uintptr_t)&Reset_Handler` initializers to +*dynamic* initialization — code that runs at startup, initializing the +table that decides where startup begins. C guarantees address-constant +initializers are link-time constants. The table must be constant for the +same reason a ladder's bottom rung must not be attached to the top of the +ladder. 
(The `extern "C"` guards keep the file well-defined if someone +ever does compile it as C++; the `-x c` makes sure nobody has to find out +the hard way.) + +### Reset, in the only order that works + +```c +{{#include ../../../platform/armv8m_startup.c:pt_reset}} +``` + +Four moves, each ordered by a hazard: + +**MSPLIM first.** Armv8-M Mainline gives the main stack a hardware floor: +write an address to `MSPLIM` and any stack-pointer excursion below it +faults immediately, instead of the stack silently growing down into +whatever data lives below it. Why does this matter enough to be +instruction one? Because the alternative failure is the worst kind: +a deep call chain during one test overwrites the heap's top, the +corruption surfaces ten allocations later in an unrelated structure, and +the emulated target has no debugger attached and no memory protection +unit configured. A stack limit register converts that archaeology into a +HardFault at the exact instruction that crossed the line — and the +startup file gives HardFault its own handler (a `bkpt` and a park loop, +distinct from `Default_Handler`) precisely so the fault is identifiable. +This wasn't in the first version of the file; it was added by the same +infrastructure audit that hardened the Hexagon toolchain cache, and it +cost two linker-script symbols and one instruction. Insurance is rarely +priced this low. + +**FPU enable before any FP instruction, with `DSB; ISB`.** At reset, +coprocessors CP10/CP11 — the scalar FPU and MVE — are disabled; the +first FP instruction would fault. The CPACR write grants access, and the +barrier pair is not decoration: `DSB` forces the write to complete, `ISB` +flushes instructions already fetched under the old permissions. Omit the +barriers and the enable *usually* works — until an instruction prefetched +before the write faults on a real pipeline. The startup does this before +touching newlib because newlib code may legitimately use FP registers. 
+ +**Zero `.bss`, but do not copy `.data`.** C guarantees zero-initialized +statics; nobody has provided that guarantee yet, so `memset` over the +linker-defined `__bss_start__..__bss_end__` does. The conspicuous absence +is the traditional `.data` copy loop — see the linker scripts below, +because that absence is a documented dependency on QEMU, not an +oversight. + +**Then the runtime, in dependency order:** semihosting file handles +(`initialise_monitor_handles`) so `printf` works, `__libc_init_array` so +C++ static constructors run, then `exit(main(0, NULL))` — `exit`, not a +bare return, so `atexit` handlers and stream flushes happen before the +semihosting exit call. `main` receives no arguments. There is no one to +pass any; that fact shapes the whole test harness below. + +### The runtime pieces the toolchain didn't bring + +Two more gaps get filled in the same file. First, the heap. +`librdimon`'s weak `_sbrk` sizes the heap by asking the host, via the +semihosting `SYS_HEAPINFO` call, where the heap should live — an answer +that depends on the emulator's mood for a given board model. The startup +overrides it with the boring, deterministic version: + +```c +{{#include ../../../platform/armv8m_startup.c:pt_sbrk}} +``` + +The heap is exactly the region the linker script says, ends exactly where +the script says, and `malloc` fails with `ENOMEM` — a *testable* +condition — rather than wandering into memory the map never granted. + +Second, 64-bit atomics. The library's telemetry counters are +`std::atomic<std::uint64_t>`; M-profile has no 64-bit exclusive-access +instructions, GCC lowers those operations to `__atomic_*_8` library +calls, and the bare-metal toolchain ships no libatomic. 
The startup +provides the four helpers the link actually needs, built on the classic +single-core primitive — mask interrupts, do the plain 64-bit access, +restore: + +```c +{{#include ../../../platform/armv8m_startup.c:pt_irqlock}} +``` + +```c +{{#include ../../../platform/armv8m_startup.c:pt_atomic_rmw}} +``` + +Why is PRIMASK sufficient where a mutex or an exclusive-access loop would +be required elsewhere? Because on a single-core part, the only agent that +can interleave with a sequence of instructions is an interrupt handler on +the same core — there is no second observer, no other cache, no store +buffer visible from elsewhere. `cpsid i` makes the critical section +literally uninterruptible, so the load-modify-store is atomic with +respect to everything that exists on the machine. The reasoning is sound +*only* single-core, which is why the dual-core RP2350 firmware at the end +of this chapter pointedly refuses to rely on it, and shares nothing +across cores except 32-bit atomics. Note also what the file does *not* +do: it implements only the helpers currently linked, and deliberately +omits the rest (compare-exchange and friends), so any future need +surfaces as a link error instead of as a silently wrong fallback. + +## Two linker scripts, two philosophies of stack + +The memory maps mirror each board model. The AN547: + +```ld +{{#include ../../../platform/mps3_an547/mps3_an547.ld:pt_memory}} +``` + +Four regions, four jobs: vectors in ITCM (address 0, where VTOR resets), +code in SRAM, **the stack owning all of DTCM**, data/bss/heap in ISRAM. +Giving the stack a private 512 KB region is a luxury the board offers and +the script accepts gratefully — the stack limit is simply the region's +base, and stack and heap physically cannot collide because they do not +share a region. 
+ +The AN505 has only the two big SRAMs, so stack and heap must cohabit, +and the script makes the boundary explicit rather than hopeful: + +```ld +{{#include ../../../platform/mps2_an505/mps2_an505.ld:pt_heap_stack}} +``` + +The stack descends from the top of DATA; the heap is *capped* 64 KB below +the top; `__stack_limit` is set exactly at the cap. Between `_sbrk` +refusing to grow past `__heap_end__` and MSPLIM faulting below +`__stack_limit`, the classic bare-metal failure — stack and heap growing +silently into each other — is fenced from both sides. One side returns +`ENOMEM`; the other side HardFaults. Neither corrupts. + +And the honesty clause, stated in both scripts' headers: **QEMU's +`-kernel` loader places the ELF directly into RAM, so VMA == LMA and +`.data` needs no load-time copy.** On real silicon booting from flash, +initialized data must be linked with a load address in flash and copied +to RAM by the startup — the loop this startup deliberately does not +have. The scripts say so in as many words. This is the same discipline +as the performance documentation: the artifact records what it is +validated for, and the boundary of that validation, in the place the next +user will actually look. A linker script that works under QEMU while +*looking* like a flash-boot script would be a trap; one that documents +"QEMU-only, here's why" is a foundation. + +## CTest without an operating system + +The toolchain files end with `set(SRT_BARE_METAL ON)`, and +`tests/CMakeLists.txt` branches on it. The problem it solves: CTest's +contract with a test binary is "run it with arguments, read its exit +code," and bare metal breaks both halves. There is no `argv` to pass a +`--gtest_filter`, and semihosting does not reliably propagate the guest's +exit status through `qemu-system-arm`. + +The replacement is a one-shot protocol. 
A dedicated `main` bakes the +filter in at compile time, and the *pass criterion is a printed string*: + +```cpp +{{#include ../../../tests/bare_metal_main.cpp}} +``` + +CTest registers a single test whose `PASS_REGULAR_EXPRESSION` is +`SRT_TESTS_COMPLETE rc=0` and whose `FAIL_REGULAR_EXPRESSION` is gtest's +`[ FAILED ]` marker: the run passes only if the summary line is printed +*and* no failure marker ever appears. The completion line is printed at +the last possible moment, so a crash, fault, or park-loop after the tests +cannot masquerade as success — the harness times out instead (the +`Default_Handler` comment in the startup file closes this loop: faults +park, parking times out, timeouts fail). + +Three details in that file repay attention: + +- **The filter excludes by category, not by taste.** What is cut is + minutes of soft-float virtual audio proving target-independent control + math already proven on every host leg; what stays is everything only + the target can falsify — datapath arithmetic, ring behavior on 32-bit + `size_t`, the end-to-end converter. The comment about `AsrcQuality*` + versus `AsrcQuality.*` records a real trap: in gtest filters the dot is + a literal, and the wrong spelling silently *narrows* the exclusion. +- **The empty-run guard.** A filter typo can select zero tests, and + `RUN_ALL_TESTS()` cheerfully returns 0 for an empty run — a green CI + leg testing nothing, forever. The guard fails the run if fewer than 15 + tests were selected (the real selection is ~20; the slack allows + legitimate removals). It must be checked *after* `RUN_ALL_TESTS()`, + because gtest applies the filter inside it — the count reads zero + before. This guard, like MSPLIM, arrived via audit: the theme of that + audit was hunting for ways a passing signal could be vacuous. 
+- **GoogleTest itself needs the bare-metal treatment.** Newlib ships stub + `pthread.h`/`regex.h` headers that make POSIX feature *detection* + succeed spuriously, so the build doesn't probe for threads at all on + bare metal and pins the feature macros (`GTEST_HAS_POSIX_RE=0`, stream + redirection and filesystem off) — value-checked macros only, since + gtest tests `GTEST_HAS_DEATH_TEST` with `#ifdef` and defining it to 0 + would *enable* what it names. + +The result: `ctest --test-dir build` on a developer machine runs ~20 +tests on an emulated Cortex-M55 exactly as transparently as the Hexagon +chapter's suite — `CMAKE_CROSSCOMPILING_EMULATOR` is doing the same work, +with `qemu-system-arm -M mps3-an547 -nographic -semihosting -kernel` +prefixed to the binary instead of `qemu-hexagon`. + +## What the M55 was hiding: Helium at plain `-O2` + +The M55 port existed for correctness. Its instruction baselines then sat +quietly in `bench/baselines.json` until the M33 arrived and gave them a +comparison point — and the comparison didn't add up. Identical source, +identical flags, same GCC: the M33's Q15 pipeline count came in at +roughly **4× the M55's**. Slower silicon-for-silicon was expected; +4× in *executed instructions* was not, because instruction counts don't +care about clock speed or memory latency. Something was executing +different instructions. + +`objdump` answered in one line of shell: the M55 binary contained **71 +MVE instructions**. The M33 binary contained **zero** (it has no MVE to +contain). Nobody had written a line of SIMD — **GCC auto-vectorizes the +Q15/Q31 kernels with Helium at plain `-O2`** when targeting +`-mcpu=cortex-m55`. The M55's numbers had been MVE-accelerated from the +day the target landed, and the project's own performance plan — which +listed "explicit Helium kernels for the M55" as future optimization +headroom — was describing work the compiler had already done. 
The +hypothesis list in `docs/PERFORMANCE.md` was rewritten the same day: +explicit M55 SIMD is *moot*; the real headroom was on the cores without +MVE, which became C4. + +The M55 also supplied the project's most instructive documentation bug, +told in this book's introduction: the C3 integer-phase change showed +`pipeline_float` **+1.4%** on the M55, contradicting the expectation that +removing double math must help a core documented (in the project's own +notes) as having no FP64. The measurement was right and the notes were +wrong: the M55's *scalar* FPU executes FP64 in hardware — only the MVE +vector unit is fp16/fp32. C3 had traded cheap hardware doubles for int64 +arithmetic on that one target, a fair price for the large cross-target +wins (Q15 −5.3%, Q31 −4.6% on the same core), and the correction is +recorded in the plan's hypothesis list. A 1.4% anomaly in a deterministic +metric was enough to falsify a "fact" everyone involved would have sworn +to. Noisy metrics don't generate that kind of pressure; this is why the +ratchet gates on instructions and not on milliseconds. + +## What the M33 exists to say about the Pico 2 + +The M33 leg is the deployment-realism target, and its numbers are meant +to be read as a datasheet for the Raspberry Pi Pico 2 class of part. + +**Float is not a datapath here.** The committed baselines put +`kernel_float` at 1,897,321,329 instructions against the M55's 99,468,474 +for the same workload — the README's "~19×" — because every `double` +accumulation in the float kernel is soft-float library calls on a core +with a single-precision-only FPU. The consequence is stated as guidance, +not lament: on Pico-class parts, use Q15 or Q31, the formats the +fixed-point traits chapter built for exactly this moment. + +**The DSP extension was idle until C4.** Disassembly of the original M33 +binaries found barely any use of the DSP extension (two `smlal`s). 
The +C4 kernel fixed that with `SMLALD` — packed dual 16×16 MAC into a 64-bit +accumulator — gated on `__ARM_FEATURE_DSP && !__ARM_FEATURE_MVE` so the +M55 keeps its auto-vectorized loop (verified: 0.00% change on every M55 +and Hexagon scenario), bit-exact by construction because the products are +exact in int32 and int64 accumulation is associative. It bought −3.1% on +`pipeline_q15`, and the C4 entry keeps honest books about why the win is +bounded: the M33's Q15 frame cost is dominated by the coefficient blend's +64-bit products and transport, not by the dot product the intrinsic +accelerates. + +**Budgets, stated as instructions, pending cycles.** Dividing the +baselines out: `pipeline_q15` is 484,146,844 instructions per 96,000 +frames ≈ **5,043 instructions per stereo frame**; the 12-channel shape is +≈ 10,027. A 150 MHz core at 48 kHz has 3,125 *cycles* per frame. The +README draws the honest conclusion in instruction-space — Q15 mono fits +a 150 MHz core, stereo wants the `fast()` preset or the RP2350's second +core — and then refuses to pretend the units match: instructions are not +cycles, the ratio between them is an empirical property of real silicon, +and the guidance is explicitly a budget *pending real-silicon +validation*. + +Two flashable firmwares exist to close exactly that loop, and they are +the bridge from this chapter's emulated world to Part V's hardware: + +- **`examples/pico2_cyccnt`** runs the same fixed pipeline workloads on a + real Pico 2 and times each 32-frame block with the M33's DWT.CYCCNT + hardware cycle counter. Its output divided by the committed baselines + (5,043 and 10,027 instructions per frame) yields the + cycles-per-QEMU-instruction calibration constant that turns *every* + M33 baseline, current and future, into a real cycle budget. +- **`examples/pico2_dualcore`** is the "second core" clause made + literal — and it is the library's concurrency story passing its + sternest exam. 
The `push()`/`pull()` contract names one producer agent + and one consumer agent around the lock-free ring; it never says + *threads*. On the RP2350, core 0 becomes the producer clock domain + (pushing at a synthesized +200 ppm offset, so the servo's estimate has + an exact truth value to be judged against — the one thing two real + crystals can never give you) and core 1 becomes the consumer, timing + every `pull()` with its own per-core DWT. Two cores over coherent SRAM + satisfy the acquire/release contract exactly as two threads do. + Everything else crossing cores is 32-bit atomics only — because on the + M33, 64-bit `std::atomic` is not lock-free, the same fact the startup + file's PRIMASK helpers exist to paper over on *one* core and which no + single-core trick can fix across two. Even the firmware's 12-channel + phase runs at 16 kHz *by arithmetic, not caution*: 10,027 + instructions per frame against a 3,125-cycle budget cannot fit at + 48 kHz on one core, and `pull()` of one converter instance is one + consumer by contract — a second core buys one clock domain per core, + not more datapath than one core has. + +## Verify it yourself + +```sh +# Both bare-metal legs, end to end (arm-none-eabi-g++ and qemu-system-arm +# on PATH — exactly what CI installs): +cmake -B build-m55 -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m55-mps3.cmake -DSRT_BUILD_EXAMPLES=OFF +cmake --build build-m55 -j && ctest --test-dir build-m55 --output-on-failure + +cmake -B build-m33 -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m33-mps2.cmake -DSRT_BUILD_EXAMPLES=OFF +cmake --build build-m33 -j && ctest --test-dir build-m33 --output-on-failure + +# The Helium discovery, on today's binaries: MVE loads/MACs present in the +# M55 build, absent in the M33 build. (The recorded count at discovery was +# 71 vs 0; the exact number moves with the compiler — the zero does not.) 
+arm-none-eabi-objdump -d build-m55/tests/srt_tests | grep -cE 'vldr[bhw]\.|vmlaldav' +arm-none-eabi-objdump -d build-m33/tests/srt_tests | grep -cE 'vldr[bhw]\.|vmlaldav' + +# The empty-run guard, demonstrated: break the filter in +# tests/bare_metal_main.cpp (e.g. filter = "NoSuchTest*"), rebuild, and the +# run fails with "filter is broken" instead of passing green. + +# The instruction budgets (counting-plugin build is in ci.yml icount-ratchet; +# same configure for m33 with the other toolchain file): +cmake -B build-m55-ic -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m55-mps3.cmake \ + -DSRT_BUILD_TESTS=OFF -DSRT_BUILD_EXAMPLES=OFF -DSRT_BUILD_ICOUNT_BENCH=ON +cmake --build build-m55-ic -j +python3 scripts/icount.py --target m55 --build-dir build-m55-ic --plugin /tmp/libinsncount.so + +# The budgets on real silicon (a Raspberry Pi Pico 2 and a USB cable): +# examples/pico2_cyccnt/README.md — cycles per frame, DWT.CYCCNT +# examples/pico2_dualcore/README.md — one clock domain per core, self-judging +``` + +The two `objdump` lines are this chapter compressed: the same source, the +same compiler, the same flags — and the difference between the binaries +is a discovery you can grep for. Bare metal did not make the library +different; it made what the library was already doing *visible*, one +instruction at a time. diff --git a/book/src/part4/hexagon.md b/book/src/part4/hexagon.md index 48b639a..6a95749 100644 --- a/book/src/part4/hexagon.md +++ b/book/src/part4/hexagon.md @@ -1,3 +1,380 @@ -# hexagon +# Hexagon: a DSP that keeps secrets -*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* +Every portability chapter in this part answers the same question: what did +the target force the library to learn that no amount of host testing could +have taught it? 
Hexagon — Qualcomm's DSP architecture, the kind of core +that audio actually ships on inside a phone — answered it four times, and +three of the four answers contradicted a reasonable engineer's prior. This +chapter walks through the port itself (which is small) and then the four +lessons (which are the point), in the order the project learned them. + +First, the ground rules the target sets. Hexagon here is +`hexagon-unknown-linux-musl`: a 32-bit `size_t` (the ring chapter's +wraparound proof stops being theoretical), musl instead of glibc, clang +instead of GCC, and — the fact that ends up organizing half of Part III — +**no double-precision FPU**. Every `double` the library touches on this +target is a call into soft-float routines. The library's float datapath +accumulates in `double` deliberately (that decision is defended in the +polyphase chapter); on Hexagon that choice has a price tag, and this +chapter contains the receipts. + +## The whole port is one file + +Here is everything SampleRateTap needed to run its test suite on a +Qualcomm DSP: + +```cmake +{{#include ../../../cmake/hexagon-linux-musl.cmake}} +``` + +Thirty lines, and most of them are comments. Two decisions carry the file. + +**`CMAKE_CROSSCOMPILING_EMULATOR qemu-hexagon`.** This single line is what +makes the port *routine* instead of a parallel test infrastructure. CMake +prepends the emulator to every test command it registers, so `ctest` runs +each cross-compiled binary under `qemu-hexagon` user-mode emulation without +knowing it is doing anything unusual. It goes further than the obvious +case: `gtest_discover_tests()` needs to *execute* the test binary at build +time to enumerate its tests, and the emulator prefix makes discovery work +too — which is why `tests/CMakeLists.txt` raises `DISCOVERY_TIMEOUT` to +120 seconds and the per-test timeout to 900. Instruction-set emulation is +slow, roughly an order of magnitude or two; the timeouts are the only +place the build system admits it. 
+ +The same pattern is deliberately generic. The commented-out HiFi4/HiFi5 +job template in `.github/workflows/ci.yml` is this toolchain file with the +names changed (`xt-clang++`, `xt-run`): any target with a cross-compiler +and an instruction-set emulator drops into the same shape, and the test +suite — the project's real asset — transfers unmodified. + +**`-static`.** A dynamically linked musl binary needs the emulator to be +told where the target's loader and shared libraries live (`qemu-hexagon +-L /path/to/sysroot`), and that path would have to thread through CMake, +CTest, CI, and every developer's shell. Static linking deletes the whole +problem: the binary is self-contained, the emulator invocation is just +`qemu-hexagon ./srt_tests`, and nothing about the sysroot can drift out of +sync. For a test rig this is the right trade without much argument — the +binaries are throwaway artifacts, nobody cares that they are megabytes +instead of kilobytes. Keep this decision in mind, though. It comes back at +the end of the chapter with teeth. + +The CI leg (`hexagon-qemu` in `ci.yml`) runs the suite with an exclusion +list: the multi-minute quality and lock simulations, the 10-million-element +thread stress, and a few others. The reasoning is stated in the workflow +and worth internalizing: those tests prove target-independent control +mathematics and host concurrency, which emulation neither speeds up nor +measures meaningfully. What stays *in* is exactly what the target can +falsify — kernel accuracy, fixed-point arithmetic, 32-bit `size_t` +behavior, atomics lowering, musl's corners. An emulated test leg should +run only the tests that target can fail. + +One boundary must be drawn before any number in this chapter is quoted, +and the toolchain file draws it in its own header comment: user-mode +emulation validates ISA-level *correctness*, never performance. 
QEMU +translates guest instructions to host instructions and runs them as fast +as it can; nothing about its timing resembles the DSP's. What emulation +*can* produce deterministically is the count of guest instructions +executed — the metric Part II's ratchet chapter is built on — and that +count is a good proxy for scalar-code cost while remaining a proxy. +Cycle-accurate Hexagon numbers require the proprietary Hexagon SDK +simulator, which this project does not have; the documentation says so +rather than letting instruction counts impersonate cycles. Every Hexagon +figure below is therefore an instruction count, exact to the instruction, +reproducible on your machine, and honest about what it is not. + +## Lesson one: the genuinely FP64-less target + +The first thing Hexagon did was refuse to be impressed by an optimization +that worked everywhere else. + +C1 — the blended-row precompute, Part III's opening win — cut the M55 +pipeline instruction counts by 15–30% and host stereo wall-clock by 36%. +The same change on Hexagon: **−3.6% float, −3.3% Q15, −0.2% Q31**. Not +wrong, not a regression — just strangely small, and "strangely small" is +the most informative result an instruction counter can produce. If a +change that halves the inner-loop arithmetic barely moves the total, the +total is not made of inner-loop arithmetic. The diagnosis, recorded in +the C1 entry of `docs/PERFORMANCE.md`: Hexagon's pipelines were dominated +by the per-sample phase bookkeeping, done in `double` and therefore +soft-floated — every phase increment, wrap and blend-factor conversion +expanding into library-call arithmetic that dwarfed the MACs the +optimization had so carefully thinned. + +That diagnosis did two things. It motivated C3, the Q0.64 integer phase +accumulator, whose design you have already seen in Part III. 
And it +forced a correction that is preserved in `docs/PERFORMANCE.md`'s +hypothesis list: the project had been assuming the Cortex-M55 was also in +this soft-double class, and it is not — the M55's *scalar* FPU executes +FP64 in hardware (only the MVE vector unit is fp16/fp32). The M55's +float numbers had never been soft-double-bound. **Hexagon is the +genuinely FP64-less target**, the only one in CI where "the phase math is +done in doubles" translates to "the phase math is done in subroutines." + +Which is why C3's Hexagon column is the loudest in the whole optimization +campaign. Eliminating soft-double phase math from the per-sample path +bought, from the PR's gating run: + +| Scenario | Hexagon instructions | +|---|---:| +| `pipeline_q31` | **−15.5%** | +| `pipeline_q15` | **−10.3%** | +| `pipeline_float` | −2.6% | +| kernels | count-identical (control) | + +The kernels-identical row deserves its footnote: the change touched only +the converter's per-sample phase path, so the isolated-kernel workloads +*must* not move. They didn't, to the instruction. That is what a control +group looks like in this project's methodology, and the deterministic +QEMU counts are what make a control group meaningful at all — a +wall-clock benchmark can certify "similar," never "identical." + +## Lesson two: hexagon-clang wants aliasing proven, not promised + +C2, the vectorization audit, restrict-qualified the kernel hot-loop +pointers after `-fopt-info-vec` showed GCC vectorizing the blend loop +only behind a runtime aliasing check ("loop versioned for +vectorization"). On the M55 the payoff was real but narrow: +`pipeline_float` −1.35%, every other scenario exactly 0.00%. 
+ +The same one-line annotation on Hexagon, from the PR's gating run: + +| Scenario | arm-gcc (M55) | hexagon-clang | +|---|---:|---:| +| `pipeline_float` | −1.35% | −1.6% | +| `pipeline_q15` | 0.00% | **−6.2%** | +| `pipeline_q31` | 0.00% | **−12.3%** | +| kernels | 0.00% | 0.00% (control) | + +Same source, same semantics, wildly different sensitivity. The commit +that pinned the new Hexagon baselines states the finding plainly: +*hexagon-clang benefits from provable no-aliasing far more than arm-gcc +did* — once aliasing is provable it schedules the dot loops +substantially better. That is consistent with what Hexagon is: a VLIW +machine whose compiler packs multiple operations per issue packet and +therefore lives or dies by how freely it may reorder memory operations. +A `restrict` that merely deletes one runtime check on an in-order ARM +core instead unlocks the scheduler on a DSP. + +The portable lesson is about division of labor: `SRT_RESTRICT` was added +for a measured GCC reason, and the *same annotation* paid a much larger, +unlooked-for dividend on the DSP compiler. Aliasing facts belong in the +source, stated once, precisely — because you cannot predict which +backend will be able to spend them. + +## Lesson three: the ISA already had the trick (C5) + +By C5 the project had a pattern that worked: the C4 packed dual-MAC Q15 +kernel had just bought −3.1% on the Cortex-M33 with a small block of +intrinsics. Hexagon has a directly analogous instruction, `vrmpyh` — +four exact 16×16 products summed into a 64-bit accumulator per +instruction, C4's argument at twice the width. The hypothesis practically +wrote itself. + +It was implemented properly: a `vrmpyh` intrinsic loop for the Q15 dot +product, bit-exact against the portable path, full suite green on Hexagon +QEMU. Then it was measured, and the ratchet reported: + +> `pipeline_q15`: 119,847,854 → 119,478,758. 
**−0.31%.** + +A result that small demands an explanation before it demands a decision, +because there are two very different ways to earn −0.31%: either the +compiler was already emitting wide MACs (making the intrinsics +redundant), or the wide MACs genuinely don't matter here. The two imply +opposite things about future work, so the project pulled disassembly from +CI (`llvm-objdump`, pre and post): the baseline binary contains **zero** +wide-MAC instructions; the intrinsic build contains **10**. The compiler +had not already done it. The instructions landed, executed — and saved +almost nothing. + +The explanation is in the scalar ISA. Hexagon already issues +single-instruction 64-bit multiply-accumulates (`Rxx += mpy`) and 64-bit +loads, so the portable C++ loop was already running close to one MAC per +instruction, with none of the per-element overheads the M33's baseline +loop had been paying. And what a 4-wide reduce could still have saved, +the fix-up work ate: the history window is 2-byte aligned by nature (it +is a stream of Q15 samples), so feeding `vrmpyh` requires combine and +alignment work that costs nearly what the wider multiply saves. C4 won on +the M33 because there was fat to cut; Hexagon's baseline had none. + +You can see the same fact from the committed baselines, without any +intrinsics experiment at all. The README's instruction-count table has +`kernel_q15` at 102,819,852 on Hexagon against 181,994,196 on the +Cortex-M55 — the scalar DSP executes *fewer* instructions than the core +whose Q15 loops GCC vectorizes with Helium. Cross-ISA instruction counts +must be read with care (an instruction is not a unit of work, and fewer +instructions is not the same claim as faster), but as a measure of *MAC +density* the comparison is legitimate: Hexagon's ISA packs so much of +this workload into each instruction that there was structurally little +for a wider multiply to remove. 
C5's failure was, in hindsight, already +sitting in the baseline table. The experiment's value was turning "in +hindsight" into a checked fact with disassembly attached. + +So the code was deleted. Not shelved, not flag-gated: reverted, per the +stop rule in `docs/PERFORMANCE.md` — per-architecture complexity must +justify itself, and −0.31% does not justify a permanent intrinsic code +path that every future refactor must keep bit-exact. The C5 entry in +`docs/PERFORMANCE.md` *is* the deliverable: the numbers, the disassembly +evidence, and the reasoning, recorded so that nobody re-derives this dead +end in two years when the file looks temptingly scalar again. + +The entry also pre-empts the obvious follow-up — "fine, scalar `vrmpyh` +is redundant, but what about HVX, the 128-byte vector unit?" — with +arithmetic instead of enthusiasm. A 48–80-tap dot product doesn't fill +one HVX vector; worse, HVX 16-bit MACs accumulate in 32-bit lanes, and +the library's exact-int64 accumulation invariant overflows 32 bits after +about 24 worst-case taps. Per-channel tap-axis dots are simply the wrong +*shape* for HVX. The shape that fits — one 64-bit lane pair per channel, +16 channels filling a vector exactly — is the channel-parallel form, and +that observation, recorded as the successor hypothesis, became C6. + +Negative results are worth exactly what you write down about them. + +## Lesson four: the exception secret + +For months the Hexagon leg was the quiet one. Then a hardening PR added +the library's first `EXPECT_THROW` tests — constructor validation, +`Config::validated()` throwing on nonsense configurations — and the +Hexagon leg turned red in a way no other platform did. The constructor +throws correctly. The `EXPECT_THROW` machinery is standing by to catch. 
+And the exception never arrives: **this static-musl toolchain +configuration cannot unwind the stack.** The throw reaches the runtime, +the unwinder that should walk the frames is not part of the link, and +`libc++abi` does the only honest thing left — terminate. Every other +platform passed; main was red on exactly one leg, because that leg was +the first place a C++ exception had ever actually been *thrown* in this +project's CI history. + +Remember `-static`, the convenience decision from the top of the chapter? +This is its bill arriving. The configuration had silently shipped without +a working unwind path, and nothing in months of green CI could have said +so, because exception propagation is invisible until the first frame +needs unwinding. A capability you never exercise is a capability you do +not have — you merely have no evidence yet. + +The response is a case study in how this project metabolizes a +limitation, three moves in one commit: + +1. **Quarantine precisely.** `ConfigValidation` is excluded from the + Hexagon `ctest` invocation — that suite and nothing else, with a + comment in `ci.yml` explaining why. Validation logic is + target-independent and still covered on every other leg; what Hexagon + cannot test is the *unwinding*, not the *validating*. +2. **Record it where deployers look.** The Known-debt ledger in + `docs/PERFORMANCE.md` gets an entry with the deployment rule stated as + a rule: on this toolchain configuration, an invalid `Config` is + **fatal** — validate inputs *before* constructing, because the + constructor's throw will take the process down rather than propagate. + The toolchain file itself carries the same caveat, so the next person + to cross-compile inherits the warning at the point of use. +3. 
**Name the candidate fix without pretending it is done.** Linking an + unwinder (`-unwindlib=libunwind`) in the toolchain file would likely + restore propagation; it stays a recorded candidate until someone + verifies it, because "probably fixable" and "fixed" are different + ledger states. + +The library's API already leaned the right way — `validated()` exists +precisely so callers can validate before constructing — so the rule +costs a deployer one line. But the general finding stands, and it is the +chapter's title: a target can keep a secret like this indefinitely, and +the only way to surface it is to route every kind of behavior through the +target. The first `EXPECT_THROW` to reach the leg was, in effect, the +first test of a claim the toolchain had been silently making all along. + +## The CI craft: trusting your emulator and your compiler + +Two pieces of infrastructure make the Hexagon numbers in this book +reproducible rather than anecdotal, and both are about supply chain more +than about DSP. + +**The emulator is built from source, on purpose.** The instruction-count +ratchet needs a `qemu-hexagon` with TCG plugin support — the counting +plugin is how "executed instructions" becomes a number at all. Neither +Debian's `qemu-user` package nor the qemu bundled with the Hexagon +toolchain enables plugins. So the `icount-ratchet` job compiles its own: +the pinned QEMU 8.2.2 source tarball, verified against a hard-pinned +SHA256, configured minimally — + +```sh +./configure --target-list=hexagon-linux-user --enable-plugins \ + --disable-docs --disable-tools --disable-system +``` + +— about four minutes to build the one binary needed, cached thereafter. +The job then *probes* the result (`qemu-hexagon -plugin help`, judged by +the error text because qemu exits nonzero either way when given no guest +binary) rather than assuming the cache returned what was put in. 
The +plugin header is pinned to the commit the v8.2.2 tag pointed at, by +commit SHA — tags are movable; commits are not. + +**The toolchain is verified twice, against two different threats.** The +cross-compiler is the prebuilt open-source release from +`quic/toolchain_for_hexagon` (clang 19.1.5, hosted on CodeLinaro). On +download, CI checks it against the *published* `SHA256SUMS` file — which +catches corruption and cache poisoning — and against a *hard pin* baked +into the workflow, which is the only check that catches an origin +compromise, since an attacker who can replace the tarball can replace the +SUMS file beside it. The cache key is derived from the pinned digest +itself, so no job that has not verified the pin can ever write the cache +entry a trusting job will read. That last detail was not free: an audit +found two other jobs sharing the trusted cache key while downloading +without verification — a classic poisoning window — and the fix (verify +everywhere, key on the digest) is part of the same hardening commit that +gave the Cortex-M targets their stack-limit register in the next chapter. + +None of this is DSP knowledge. All of it is what "the Hexagon numbers are +CI-gated" has to mean if the phrase is to carry weight: the compiler +whose output is being counted and the emulator doing the counting are +both pinned, verified artifacts, not whatever the package manager felt +like resolving that morning. + +## What the port did not require + +It is worth pausing on the dog that didn't bark. Running a modern C++20 +template library on a Qualcomm DSP required: one 30-line toolchain file, +a test-filter list, and zero changes to library code. No `#ifdef +__hexagon__` exists in any header. 
The 32-bit `size_t` was already +handled by the ring's wraparound arithmetic (proved, then tested, in the +ring chapter); the absence of threads never came up because the library +never spawns one; the atomics lowered correctly because the ring asserts +`is_always_lock_free` at compile time and would have refused to build +otherwise. The port was boring precisely to the degree that the library's +portability claims were already true — and interesting precisely where +the *toolchain*, not the library, had been making claims nobody had +tested. Both halves of that sentence are the reason to port early: the +boring half is regression-proofed for free from then on, and the +interesting half you want to hear about from CI, not from a customer. + +## Verify it yourself + +```sh +# The port, end to end (hexagon-unknown-linux-musl-clang++ and qemu-hexagon +# on PATH; .github/workflows/ci.yml "hexagon-qemu" has the toolchain URLs): +cmake -B build-hex -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=cmake/hexagon-linux-musl.cmake \ + -DSRT_BUILD_EXAMPLES=OFF +cmake --build build-hex -j +ctest --test-dir build-hex --output-on-failure \ + -E 'AsrcQuality|AsrcLock|TwoThreadStress|TransparentPrototypeMeetsSpec|MultiChannel\.|Feasibility|Reset\.|ConfigValidation' + +# The exception secret, demonstrated: remove ConfigValidation from the -E +# list above and watch libc++abi terminate instead of EXPECT_THROW passing. 
+ +# The instruction counts (needs the plugin-enabled qemu-hexagon; the +# icount-ratchet job in ci.yml shows the 4-minute from-source build): +cmake -B build-hex-ic -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=cmake/hexagon-linux-musl.cmake \ + -DSRT_BUILD_TESTS=OFF -DSRT_BUILD_EXAMPLES=OFF -DSRT_BUILD_ICOUNT_BENCH=ON +cmake --build build-hex-ic -j +python3 scripts/icount.py --target hexagon --build-dir build-hex-ic \ + --plugin /path/to/libinsncount.so + +# The C5 negative result's disassembly evidence, reproduced on today's +# binary (the count should be zero — the intrinsics were reverted): +llvm-objdump -d build-hex-ic/bench/icount/srt_icount_pipeline_q15 | grep -c vrmpy +``` + +The last command is this chapter's thesis in one line. The claim "the +wide-MAC intrinsics were deliberately not kept" is not a story in a +design document; it is a property of the shipped binary that you can +count, and the C5 entry in `docs/PERFORMANCE.md` is the record of why +counting it settled the question. diff --git a/cmake/arm-cortex-m55-mps3.cmake b/cmake/arm-cortex-m55-mps3.cmake index aa5a904..23a8991 100644 --- a/cmake/arm-cortex-m55-mps3.cmake +++ b/cmake/arm-cortex-m55-mps3.cmake @@ -23,6 +23,7 @@ set(CMAKE_C_FLAGS_INIT "-mcpu=cortex-m55 -mthumb -mfloat-abi=hard -ffunction-sec set(CMAKE_CXX_FLAGS_INIT "${CMAKE_C_FLAGS_INIT}") get_filename_component(_srt_platform "${CMAKE_CURRENT_LIST_DIR}/../platform/mps3_an547" ABSOLUTE) +# ANCHOR: pt_linkline # The startup .c is handed to the link line directly; the gcc driver # compiles it with the same -mcpu/-mfloat-abi flags as everything else. # `-x c` forces C compilation even under the g++ driver (which would treat @@ -30,6 +31,7 @@ get_filename_component(_srt_platform "${CMAKE_CURRENT_LIST_DIR}/../platform/mps3 # initializers are link-time constants, never dynamic initialization. 
set(CMAKE_EXE_LINKER_FLAGS_INIT "--specs=rdimon.specs -nostartfiles -Wl,--gc-sections -T${_srt_platform}/mps3_an547.ld -x c ${CMAKE_CURRENT_LIST_DIR}/../platform/armv8m_startup.c -x none") +# ANCHOR_END: pt_linkline set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-system-arm;-M;mps3-an547;-nographic;-semihosting;-kernel") diff --git a/platform/armv8m_startup.c b/platform/armv8m_startup.c index a0007e8..4db20bd 100644 --- a/platform/armv8m_startup.c +++ b/platform/armv8m_startup.c @@ -47,6 +47,7 @@ void* __dso_handle; void _init(void) {} void _fini(void) {} +/* ANCHOR: pt_sbrk */ void* _sbrk(ptrdiff_t increment) { static char* brk = &__heap_start__; if (brk + increment > &__heap_end__) { @@ -57,7 +58,9 @@ void* _sbrk(ptrdiff_t increment) { brk += increment; return prev; } +/* ANCHOR_END: pt_sbrk */ +/* ANCHOR: pt_irqlock */ static inline uint32_t irqLock(void) { uint32_t primask; __asm volatile("mrs %0, PRIMASK\n cpsid i" : "=r"(primask)::"memory"); @@ -67,6 +70,7 @@ static inline uint32_t irqLock(void) { static inline void irqRestore(uint32_t primask) { __asm volatile("msr PRIMASK, %0" ::"r"(primask) : "memory"); } +/* ANCHOR_END: pt_irqlock */ uint64_t __atomic_load_8(const volatile void* ptr, int memorder) { (void)memorder; @@ -83,6 +87,7 @@ void __atomic_store_8(volatile void* ptr, uint64_t value, int memorder) { irqRestore(m); } +/* ANCHOR: pt_atomic_rmw */ uint64_t __atomic_fetch_add_8(volatile void* ptr, uint64_t value, int memorder) { (void)memorder; const uint32_t m = irqLock(); @@ -91,6 +96,7 @@ uint64_t __atomic_fetch_add_8(volatile void* ptr, uint64_t value, int memorder) irqRestore(m); return prev; } +/* ANCHOR_END: pt_atomic_rmw */ uint64_t __atomic_exchange_8(volatile void* ptr, uint64_t value, int memorder) { (void)memorder; @@ -101,6 +107,7 @@ uint64_t __atomic_exchange_8(volatile void* ptr, uint64_t value, int memorder) { return prev; } +/* ANCHOR: pt_reset */ void Reset_Handler(void) { /* MSPLIM exists on Armv8-M Mainline only (both targets are M33/M55 * 
class): a main-stack overflow past __stack_limit raises a fault @@ -119,6 +126,7 @@ void Reset_Handler(void) { __libc_init_array(); /* C++ static constructors */ exit(main(0, (char**)0)); } +/* ANCHOR_END: pt_reset */ void Default_Handler(void) { for (;;) { @@ -134,6 +142,7 @@ void HardFault_Handler(void) { } } +/* ANCHOR: pt_vectors */ __attribute__((section(".vectors"), used)) static const uintptr_t vectors[16] = { (uintptr_t)&__stack_top, (uintptr_t)&Reset_Handler, @@ -152,6 +161,7 @@ __attribute__((section(".vectors"), used)) static const uintptr_t vectors[16] = (uintptr_t)&Default_Handler, /* PendSV */ (uintptr_t)&Default_Handler, /* SysTick */ }; +/* ANCHOR_END: pt_vectors */ #ifdef __cplusplus } /* extern "C" */ diff --git a/platform/mps2_an505/mps2_an505.ld b/platform/mps2_an505/mps2_an505.ld index d90299f..f2a7c42 100644 --- a/platform/mps2_an505/mps2_an505.ld +++ b/platform/mps2_an505/mps2_an505.ld @@ -8,6 +8,7 @@ * SSRAM1 4 MB @ 0x10000000 - vector table + code + rodata * SSRAM2/3 4 MB @ 0x38000000 - data + bss + heap + stack */ +/* ANCHOR: pt_memory */ MEMORY { CODE (rx) : ORIGIN = 0x10000000, LENGTH = 4M @@ -15,6 +16,7 @@ MEMORY } __stack_top = ORIGIN(DATA) + LENGTH(DATA); +/* ANCHOR_END: pt_memory */ ENTRY(Reset_Handler) @@ -72,6 +74,7 @@ SECTIONS __bss_end__ = .; } > DATA + /* ANCHOR: pt_heap_stack */ /* Stack lives at the top of DATA; cap the heap 64 KB below it. */ .heap (NOLOAD) : ALIGN(8) { __heap_start__ = .; @@ -82,6 +85,7 @@ SECTIONS /* MSPLIM (set in Reset_Handler): the stack may descend to the heap cap * but no further — overflow into the heap faults instead of corrupting. */ __stack_limit = __heap_end__; + /* ANCHOR_END: pt_heap_stack */ /* librdimon's (unused, weak) _sbrk references `end`; satisfy it. 
*/ PROVIDE(end = __heap_start__); diff --git a/platform/mps3_an547/mps3_an547.ld b/platform/mps3_an547/mps3_an547.ld index a0a4d40..c2777f4 100644 --- a/platform/mps3_an547/mps3_an547.ld +++ b/platform/mps3_an547/mps3_an547.ld @@ -8,6 +8,7 @@ * DTCM 512 KB @ 0x20000000 - stack * ISRAM 2 MB @ 0x21000000 - data + bss + heap */ +/* ANCHOR: pt_memory */ MEMORY { ITCM (rx) : ORIGIN = 0x00000000, LENGTH = 512K @@ -20,6 +21,7 @@ __stack_top = ORIGIN(DTCM) + LENGTH(DTCM); /* MSPLIM (set in Reset_Handler): the stack owns all of DTCM, so the lowest * address it may legally reach is the region base. */ __stack_limit = ORIGIN(DTCM); +/* ANCHOR_END: pt_memory */ ENTRY(Reset_Handler) @@ -77,11 +79,13 @@ SECTIONS __bss_end__ = .; } > DATA + /* ANCHOR: pt_heap */ .heap (NOLOAD) : ALIGN(8) { __heap_start__ = .; . = ORIGIN(DATA) + LENGTH(DATA); __heap_end__ = .; } > DATA + /* ANCHOR_END: pt_heap */ /* librdimon's (unused, weak) _sbrk references `end`; satisfy it. */ PROVIDE(end = __heap_start__); From b52c7d98ecec5c3eef2771b91b870891aea2e66e Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 1 Jul 2026 21:39:10 +0000 Subject: [PATCH 11/16] book: the proof-system chapters (tests, icount, notebooks) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Part II: tests-as-specifications (virtual-time simulator, sine-fit metrology, bare-metal one-shot protocol), deterministic instruction counting (plugin walk, two-sided ratchet, instructions-vs-cycles), and notebooks as calibrated instruments with their five measurement traps. Adds comment-only pf_* ANCHOR markers to two_clock_sim.hpp and insn_count.c. Also fixes the demo notebook's summary cell to the measured 126.4 dB — the docs truth sweep claimed this fix but never applied it — and an adjacent 'dB dB' typo. 
https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9 --- book/src/part2/icount.md | 319 ++++++++++++++++++++- book/src/part2/notebooks.md | 320 ++++++++++++++++++++- book/src/part2/tests.md | 420 +++++++++++++++++++++++++++- notebooks/asrc_demo.ipynb | 4 +- tests/support/two_clock_sim.hpp | 4 + tools/qemu_insn_plugin/insn_count.c | 2 + 6 files changed, 1061 insertions(+), 8 deletions(-) diff --git a/book/src/part2/icount.md b/book/src/part2/icount.md index 28ff28a..70501b1 100644 --- a/book/src/part2/icount.md +++ b/book/src/part2/icount.md @@ -1,3 +1,318 @@ -# icount +# Counting instructions, deterministically -*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* +The optimization campaign of Part III makes claims like "−5.3% on the M55 +Q15 pipeline" and expects you to believe the decimal point. This chapter is +about the machinery that makes such a decimal point *mean* something — and +about why the obvious metric, time, had to be fired from that job first. + +## Wall-clock cannot hold a gate + +The project's benchmarks run in CI on shared, virtualized runners: machines +whose actual delivered performance depends on what every other tenant is +doing, what frequency the host decided on, and which physical box the job +landed on today. `docs/PERFORMANCE.md` states the resulting policy without +hedging: *wall-clock benches are never a hard gate on shared runners; they +run as a smoke test and produce trend artifacts only.* + +That policy was not adopted from theory. During the C2 vectorization audit +(Part III), the README's wall-clock table was deliberately *not* +regenerated, because the shared machine was measurably in a different state +than the annotated session that produced the table — about **20% slower +across the board on unchanged code**. Sit with that number for a moment: the +optimization being evaluated in that PR was worth 3.7% on the same metric. 
+A gate that must detect 3% shifts through 20% ambient swings is not a gate; +it is a random number generator with a pass rate. You can fight the noise +statistically — pin runners, repeat runs, compare medians — and projects +do, but every mitigation buys precision with CI minutes and still cannot +promise that a 1% regression fails *deterministically*. + +The library's answer is to gate a different quantity entirely: **executed +instructions**. Run a fixed workload under an emulator, count every guest +instruction that retires, and the result is a property of the *binary*, not +the weather — bit-identical across runs (the project verified this before +trusting it), independent of host load, and, for the scalar code these +embedded targets run, well correlated with real cost. The metrics table in +`docs/PERFORMANCE.md` is careful about that last clause, and so is the end +of this chapter; but first, the machinery. + +## Forty lines of plugin + +QEMU's TCG (Tiny Code Generator) translates guest instructions into host +code one *translation block* at a time, and since QEMU 4.2 it exposes a +plugin API that lets you hook that translation. The project's entire +counting instrument is `tools/qemu_insn_plugin/insn_count.c` — small enough +that its two working functions fit here: + +```c +{{#include ../../../tools/qemu_insn_plugin/insn_count.c:pf_hooks}} +``` + +The design point that matters is `qemu_plugin_register_vcpu_insn_exec_inline` +with `QEMU_PLUGIN_INLINE_ADD_U64`. There are two ways a TCG plugin can count +executions: register a *callback* per instruction — a host function call +every time the guest retires one instruction — or register an *inline +operation*, which asks QEMU to plant a bare 64-bit add into the generated +host code itself. The callback form would multiply the emulation time of a +billion-instruction workload by a large constant; the inline form costs one +host add per guest instruction and no calls. 
`tb_trans` fires once per +translation block *translation* (not execution), walks the block's +instructions, and attaches an inline `+1` to each — after which counting +proceeds at essentially full emulation speed forever, because translated +blocks are cached and re-executed. + +The header comment is candid about the accuracy contract this buys: +"the single counter is exact for our single-vCPU deterministic workloads." +A plain `uint64_t` incremented from generated code would be a data race on +an SMP guest; every target this ratchet gates is a single emulated core +running a single-threaded workload, so the simple counter is exact — and +the precondition is written down where the next porter will read it. + +The second function is the entire output interface: an `atexit` callback +prints one line, `SRT_INSN_COUNT <count>`, through `qemu_plugin_outs()`. That +choice has a trap the driver script had to learn about: + +```python +def qemu_cmd(target: str, plugin: str, binary: str) -> list[str]: + # "-d plugin" routes qemu_plugin_outs() to stderr; without it the count + # line is silently dropped. + if target == "hexagon": + return ["qemu-hexagon", "-d", "plugin", "-plugin", plugin, binary] +``` + +`qemu_plugin_outs` writes to QEMU's *log*, and unless `-d plugin` enables +the plugin log channel, the write goes nowhere — no error, no warning, no +line. The comment in `scripts/icount.py` preserves the discovery so nobody +re-makes it, and the script's parser treats a missing count line as a hard +failure ("plugin not loaded?") rather than a zero, so the silent-drop +failure mode cannot masquerade as a measurement. + +## One binary per scenario + +What gets counted matters as much as how. The counted workloads live in +`bench/icount/` — and they are *not* the Google Benchmark suite, which +auto-tunes its iteration counts to the machine's speed and would therefore +execute a different number of instructions on every run.
A countable +workload must be **fixed**: same work, same iteration counts, same +everything, decided at compile time. + +`bench/icount/icount_main.cpp` defines seven scenarios — `interpolate()` in +isolation and the full push/pull pipeline, each in float/Q15/Q31, plus a +12-channel Q15 pipeline for the 7.1.4 deployment shape — selected by +preprocessor definitions (`SRT_SC_KIND`, `SRT_SC_TYPE`, `SRT_SC_CH`) into +one binary each, because the bare-metal targets have no argv to select with +at runtime. Each binary runs a deterministic loop (two virtual seconds of +audio through the pipeline; 200 000 interpolations for the kernels), +accumulates a checksum, and ends with: + +```cpp + const bool ok = checksum == checksum; // NaN check + std::printf("SRT_ICOUNT_DONE ok=%d checksum=%.17g\n", ok ? 1 : 0, checksum); +``` + +The three gated targets each run under the QEMU mode that matches their +deployment reality. Hexagon binaries are Linux user-space processes, so +`qemu-hexagon` (user-mode emulation) runs them directly. The two Cortex-M +targets are bare metal: `qemu-system-arm` boots each binary as a kernel on +a full board model — MPS3 AN547 for the M55, MPS2 AN505 for the M33 — with +semihosting for the printf. That fidelity matters for the metric: a +system-mode count includes the startup code, vector table dance, and +runtime the deployed firmware will actually execute, which is why the +plugin counts the whole run and the workloads are sized so the measured +loop dominates. 
+ +The checksum earns its place three times over: it defeats dead-code +elimination (a compiler that deleted the unobserved workload would produce +a spectacular "improvement"); printed to 17 significant digits, it pins +cross-run determinism — if two runs of one binary ever printed different +checksums, the instruction counts would be incomparable and something would +be deeply wrong; and the pipeline workload deliberately poisons it with a +NaN if the converter ever underruns, so a broken configuration cannot +produce a plausible count. `icount.py` refuses to record anything unless +`SRT_ICOUNT_DONE ok=1` appeared. + +## The ratchet, and why it is two-sided + +`scripts/icount.py` glues plugin to workloads: find every `srt_icount_*` +binary in the build directory, run each under the target's QEMU with the +plugin, and compare against the committed `bench/baselines.json` at a +tolerance of ±3%. A scenario with no recorded baseline fails. A recorded +baseline of zero fails. A regression beyond tolerance fails. And — the +clause that makes this a *ratchet* rather than a mere alarm — an +**improvement** beyond tolerance fails too: + +```python + elif delta < -args.tolerance: + # Two-sided: a stale (too-high) baseline would let future + # regressions hide inside the slack, so improvements must be + # committed too. + verdict = ("IMPROVED beyond tolerance — run icount.py --update " + "and commit bench/baselines.json") + failures.append(scenario) +``` + +The two-sidedness was not in the original design. The first version of the +ratchet failed only on regression, which sounds like the point — until an +infrastructure audit traced the incentive structure. Suppose your PR makes +`pipeline_q15` 10% cheaper and you don't update the baseline. CI passes; +everyone is happy; the baseline is now 10% stale. The *next* PR can regress +the same scenario by 9% — undoing nearly all of your win — and CI passes +again, because measured-vs-baseline is still inside the slack. 
Improvements +that go unclaimed become a hiding place for regressions exactly their size. +The audit's fix (the same infrastructure-hardening pass that added the +bare-metal empty-run guard of the previous chapter) makes the gate +symmetric: if you made it faster, you must *say so*, in the same PR, by +re-recording the baseline — `icount.py --update` — and committing the diff. +The improvement becomes reviewable history, the gate snaps tight around the +new value, and there is never slack for anything to hide in. + +`--update` has its own small discipline: it rewrites the target's entry to +*exactly* the measured scenarios, so a renamed or deleted workload cannot +linger in the JSON as a dead gate entry that never fails and never means +anything. + +One boundary of the ratchet is drawn in a CMake naming convention. The +cross-resampler comparison workloads (`docs/COMPARISON.md` runs the same +fixed task through this library and through libsamplerate, per target) are +built as `cmp_icount_*` precisely so that `icount.py`'s `srt_icount_*` glob +never picks them up: competitor counts are *recorded* in the docs with +their date and toolchain, but not *gated*. The distinction is deliberate. +A gate on someone else's code would fail on their releases, punish this +project for their regressions, and pressure nobody who can act on it; a +gate is a promise, and you can only promise about code you maintain. + +The tolerance deserves a sentence, because "±3% on a deterministic count" +sounds contradictory. Counts are bit-identical across runs *of one binary*; +the slack absorbs a different variation: innocuous recompilation effects. +Code layout, inlining decisions, and register allocation shift by fractions +of a percent when unrelated code changes; the C6 work measured its embedded +control scenarios at exactly 0.00% only because nothing in their path +changed. 
Three percent is wide enough that touching a comment never fails +the gate, and narrow enough that the +6–8% cost of a runtime flag in a hot +loop — a real mistake, caught by this exact gate during C6 and fixed with a +compile-time gate before merge — cannot pass it. + +## Baselines are compiler-dependent, by design + +An instruction count is a property of the binary, and the binary is a +product of the compiler. When the CI image's `gcc-arm-none-eabi` or +hexagon-clang package updates, every count moves a little, and the ratchet +job fails on unchanged library code. `docs/PERFORMANCE.md` is explicit that +this is **working as intended, not a flake**: the response is to re-record +the baselines in a reviewed commit whose diff *is* the record of what the +toolchain update did to the library's cost. The alternative — normalizing +counts, or pinning tolerances wide enough to ride out compiler churn — +would trade an occasional, explainable, reviewable failure for permanent +blindness to exactly the kind of shift a performance-conscious project most +wants to see. + +The same philosophy shows up in how the tools themselves are provisioned. +The plugin compiles against a `qemu-plugin.h` pinned to the exact commit +QEMU 8.2.2's tag pointed at, checksum-verified on download. And the Hexagon +leg builds its own emulator: neither Debian's `qemu-hexagon` nor the one +bundled with the CodeLinaro toolchain enables TCG plugins, so CI compiles a +plugin-capable `qemu-hexagon` from the pinned QEMU source (linux-user +target only, cached thereafter). A measurement gate whose instruments are +unpinned is a gate whose meaning can change without a diff. + +## What instructions do and do not predict + +Time to honor the caveat. 
An instruction count is not a cycle count, and +the project's documentation never claims otherwise — the metrics table +says "well-correlated with real cost **for scalar code**," and +cycle-accurate numbers are explicitly delegated to vendor simulators or +hardware counters. + +Where the correlation is good: in-order scalar cores running out of +tightly-coupled memory, which describes the Cortex-M33 and M55 targets +closely. Most instructions are single-cycle, there is no cache hierarchy to +miss in, and a 5% instruction reduction is a real, similar-sized cycle +reduction. + +Where it bends: anything that changes the *mix* rather than the count. +The C3 fixed-point phase accumulator made the M55 float pipeline count +**+1.4%** worse — it replaced hardware-double operations with int64 +sequences, more instructions of cheaper mix — and the project accepted the +regression for the cross-target win, with the reasoning in the PR rather +than hidden in an average. + +Where it bends furthest is Hexagon, and the reason is architectural: +Hexagon is a VLIW machine that issues *packets* of up to four instructions +per cycle. Two versions of a loop with identical instruction counts can +differ meaningfully in cycles depending on how well their instructions pack +into packets — and conversely, removing instructions that packed for free +saves nothing. The C5 experiment (Part III) is the cautionary tale: a +hand-written `vrmpyh` wide-MAC kernel, proven bit-exact, verified by +disassembly to contain ten wide MACs where the baseline had zero, measured +**−0.31%** — 119,847,854 to 119,478,758 instructions on `pipeline_q15`. The +instruction metric faithfully reported that the change barely mattered; on +a VLIW machine it takes packet-level analysis (or silicon) to know whether +even that number survives translation to time. 
+ +The project's calibration path for the gap is hardware, and it ships in the +repository: `examples/pico2_cyccnt/` is a flashable RP2350 firmware that +runs the *same* `runPipeline` workload as the icount scenarios — 32-frame +push/pull blocks, 997 Hz sine, 1 000 warm-up and 2 000 measured iterations +— timed per block with the Cortex-M33's DWT.CYCCNT cycle counter, printing +mean/p99/max cycles per block, cycles per frame, and the fraction of a +150 MHz core one 48 kHz stream costs. Correlating those cycle figures +against the committed M33 instruction baselines yields the +cycles-per-instruction ratio for exactly this code on exactly that silicon +— after which the deterministic, CI-friendly instruction gate can be read +in real-time units. Until that correlation is run on hardware you own, the +documentation deliberately states the M33 figures as instruction *budgets*, +not cycle claims; the truth-sweep audit that enforced that wording appears +again in the next chapter. + +## The last mile: numbers that cannot go stale + +A gated number that is hand-copied into a README is a number waiting to +rot. The published instruction-count table is therefore not written by +anyone: `scripts/update_icount_docs.py` regenerates it **1:1 from +`bench/baselines.json`** — every row, every comma — between +a pair of begin/end HTML comment markers, and the CI +ratchet job's final step is: + +```sh +python3 scripts/update_icount_docs.py +git diff --exit-code README.md || { + echo "::error::README icount table is stale; run scripts/update_icount_docs.py"; exit 1; } +``` + +Regenerate and diff. If the committed README does not match the committed +baselines exactly, the build fails — so the numbers a visitor reads are, by +construction, the numbers the gate enforces.
It is the same commitment this +book makes with live-included code, applied to a table: *derived artifacts +must be derived, in CI, every time, or they are testimony rather than +evidence.* + +## Verify it yourself + +```sh +# Build the counting plugin (fetch qemu-plugin.h for QEMU 8.2.x first; +# ci.yml pins the exact URL and checksum): +gcc -shared -fPIC $(pkg-config --cflags glib-2.0) -I/path/to/plugin-header \ + tools/qemu_insn_plugin/insn_count.c -o /tmp/libinsncount.so + +# Cross-build the fixed workloads and run the ratchet (arm-none-eabi-gcc): +cmake -B build-m55 -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m55-mps3.cmake \ + -DSRT_BUILD_TESTS=OFF -DSRT_BUILD_EXAMPLES=OFF -DSRT_BUILD_ICOUNT_BENCH=ON +cmake --build build-m55 -j +python3 scripts/icount.py --target m55 --build-dir build-m55 \ + --plugin /tmp/libinsncount.so + +# Determinism: run any one binary twice and compare the counts exactly. +qemu-system-arm -M mps3-an547 -nographic -semihosting -d plugin \ + -plugin /tmp/libinsncount.so -kernel build-m55/bench/icount/srt_icount_pipeline_q15 + +# See the two-sided gate work: re-run icount.py with --tolerance 0.0001 +# and watch benign recompilation deltas fail in *both* directions. + +# The docs-freshness gate: +python3 scripts/update_icount_docs.py && git diff --exit-code README.md +``` + +And the experiment that motivates the whole chapter: run any wall-clock +benchmark from `bench/` twice on a shared machine, an hour apart, and +compare. The instruction counts you just produced will not have moved by a +single instruction; the nanoseconds will tell you about the machine's day. diff --git a/book/src/part2/notebooks.md b/book/src/part2/notebooks.md index c5b84f1..1b74152 100644 --- a/book/src/part2/notebooks.md +++ b/book/src/part2/notebooks.md @@ -1,3 +1,319 @@ -# notebooks +# Notebooks as calibrated instruments -*Chapter not yet written — see the outline in the book plan. 
Each chapter lands as its own PR.* +The previous two chapters covered claims a machine can gate: thresholds in +tests, instruction counts in a ratchet. But some of this project's most +consequential claims are not pass/fail propositions. *How much worse is a +naive FIFO?* *What does block size cost in latency and pitch stability?* +*How does the converter measure against libsamplerate, soxr, and two +hardware ASRC chips, under one definition of THD+N?* Answering those takes +plots, long simulated runs, and a measurement methodology that itself needs +defending — which is to say, it takes a lab notebook. The repository has +three, under `notebooks/`, and they are treated with the same severity as +the test suite: **committed with their outputs, calibrated before they +measure, and pinned with assertions so that a regression fails the re-run.** + +This chapter is about that discipline — and about five specific ways a +quality measurement can lie, each of which this project actually hit, and +each of which is now encoded in the notebooks as a guard, a docstring, or a +scar. + +## Three instruments, one method + +**`asrc_demo.ipynb`** is the front door: it loads the library through its C +ABI with `ctypes` (no Python bindings, ~80 lines of wrapper), reproduces +the naive-FIFO disaster, then walks lock acquisition, transparency, +spectrograms, latency, drift tracking, and dropout recovery. Its committed +outputs are where the README's "what does it sound like" numbers come from: +clicks roughly ten times per second at 29 dB SNR for the naive path, +126.4 dB for the converter under the notebook's instrument. + +**`asrc_block_size_study.ipynb`** answers a deployment question: what +happens at block sizes 32, 64, and 240 frames? 
Its committed conclusion — +Track-stage operation turns block quantization into cent-scale, low-rate FM +over a 53–61 dB wideband floor, while designed latency scales as roughly +`2·B/fs + 0.5 ms` — is quoted by `docs/COMPARISON.md` whenever coarse-block +operation comes up. + +**`asrc_comparison.ipynb`** is the adversarial one: a single AES17-style +measurement implementation applied identically to SampleRateTap, +libsamplerate `sinc_best`, soxr `VHQ`, and a naive FIFO, with the deck +deliberately stacked *against* the home team — the libraries are handed the +exact clock ratio as an oracle, while the converter must discover it from +FIFO occupancy and still gets measured on the result. Every software number +in `docs/COMPARISON.md`'s tables is a committed output of this notebook. + +All three share a spine: the deterministic two-clock simulation from the +[tests chapter](tests.md), re-implemented in a few lines of Python around +the C ABI. Producer and consumer events interleave by next-event virtual +time, so a +200 ppm producer delivers its extra sample every 5 000 exactly, +and a re-run reproduces the committed outputs. Determinism is what makes +"committed outputs" meaningful — a notebook whose numbers wander between +runs is a screenshot, not an instrument. + +Why notebooks at all, rather than more tests? Because the *output* here is +the plot and the number-in-context, not a boolean; because the runs are +minutes long and belong in a manually-triggered lab rather than every CI +push; and because a reader deciding whether to trust the library should be +able to see the methodology, the code, and the result in one document — +then re-execute it. The committed outputs are the published lab record; the +re-run is the replication. 
+ +## Calibrate the instrument before believing it + +The core discipline, stated as a rule: **no measurement function in these +notebooks reports on the converter until it has first reported on a +synthetic signal with a known answer, in the same notebook, above the +measurement.** + +The comparison notebook builds an AES17-style THD+N meter — and then +immediately feeds it a pure 997 Hz tone plus white noise injected at +exactly −100 and −130 dBFS, computes what a perfect meter must read (the +injected level, corrected for the fundamental's RMS and the fraction of +white noise falling in the 20 Hz–20 kHz integration band), and asserts +agreement within half a dB: + +```python + got, f0 = thdn_db(sig, 997.0) + ... + assert abs(got - expect) < 0.5 +print("instrument calibrated") +``` + +Only after `instrument calibrated` prints does any subject get measured. +The block-size study does the same for a subtler instrument — a +decomposition of a near-sinusoid into low-rate pitch modulation (in cents) +and a wideband noise floor — by synthesizing a tone with *exactly* 1 cent +of 10 Hz FM over a −120 dB noise floor. The committed output reads: + +```text +calibration: peak 1.000 cents (true 1.000), rms 0.707 (true 0.707), +wideband 111.0 dB (true ~111) +``` + +That calibration cell carries the project's most candid admission, in its +own markdown: "This cell earned its keep: three earlier formulations of the +split each leaked modulation into the noise figure, and the calibration +caught every one." One of those three failures survives as a docstring on +the low-pass filter inside the decomposition — a boxcar smoother's passband +droop left a percent-level copy of the modulation in the high-passed +remainder, silently bounding the measurable floor — and another as a +warning that reconstruct-and-subtract in the signal domain fails subtly +(sub-split phase errors multiply the carrier). 
Without the synthetic-signal +check, every one of those buggy instruments would have produced a plausible +wrong number about the converter, and the notebook would have published it +with a straight face. Calibration converts "my measurement code is probably +right" into a demonstrated property, at the cost of one cell. + +## Pin the result, or the notebook is a brochure + +Every notebook ends its key measurements with `assert`. The demo, after +measuring transparency: + +```python +assert snr_asrc > 125.0, "transparency regression" +``` + +The comparison, after the full table: + +```python +first = names[0] +assert thdn[first] < -130 and dr24[first] > 130 +``` + +The block-size study, after the FM decomposition — with a comment that +names the philosophy: + +```python + # Documented behavior as of this measurement: FM peaks stay below the + # ~5-8 cent audibility region (B=240 gets closest) and the wideband + # floor stays above 50 dB. These pin behavior, not aspiration. + assert metrics[B]["cents_peak"] < 5.0, f"FM at B={B} reached audibility" +``` + +This is the notebook version of the test suite's +thresholds-just-under-measured convention. A notebook without assertions +degrades into marketing: it gets re-run after some future change, a plot +looks subtly worse, and nobody's eye catches it. With assertions, re-running +the notebook *is* a regression test — `docs/COMPARISON.md` says exactly +this in its caveats: software figures "regenerate by re-running the +comparison notebook; its assertions pin SampleRateTap's results so +regressions fail the run." The notebook is simultaneously the lab record +and the gate on its own claims. + +The rest of this chapter is the honest-measurement traps those assertions +and calibrations exist to catch — each one a mistake this project made, or +nearly made, with the receipt still in the file. 
+ +## Trap one: your window lies about the floor + +A 997 Hz fundamental at −1 dBFS sits some 130 dB above the residual being +measured. Take a plain FFT of that and the window function smears the +fundamental's energy across the spectrum at the window's sidelobe level — +with common windows, far above the thing you are trying to see. The +notebooks handle this on two fronts. For *display*, the demo's spectrum +helper documents its choice: a Kaiser window with β = 24, "sidelobes +~−190 dB, so a −130 dB noise floor is actually visible." For *measurement*, +no window is trusted at all: the comparison notebook refines the +fundamental's frequency by the phase-slope method (per-window phase of a +least-squares fit, regressed against time — "precision far beyond FFT bins, +which a 130 dB measurement needs," as its markdown puts it), then removes +the fundamental by a single global least-squares fit, *exactly*, before any +spectrum is taken. Only the residual — fundamental already subtracted — +meets an FFT, and then only for integration. A ±20 Hz notch around the +fundamental catches what the fit leaves; the notebook notes this notch is +far *narrower* than AES17 permits hardware testers, a conservatism that +works against the software subjects in every comparison. + +This is the same decision the test suite made with its tracked sine fit, +arrived at for the same reason: at these dynamic ranges, subtraction is +exact and windows are not. + +## Trap two: measure the converter, not its transient + +An ASRC has stages. Fresh from a cold start it acquires; once locked it +tracks; given sample-granular occupancy data for long enough, the servo +promotes to its low-bandwidth Quiet stage — and the residual keeps +improving for tens of seconds as the loop forgets its own acquisition. A +measurement window placed too early reads the servo's history, not the +converter's quality. + +The numbers make the point better than prose. 
The comparison notebook runs +32 seconds and discards the first 25 before analyzing ("we analyze its +output well after the servo's Quiet stage engages," its markdown says). The +48 kHz quality tests run 40 seconds and analyze the final one. And when the +16 kHz suite was built by scaling the servo bandwidths with the sample rate, +the settle time scaled *inversely*: the quiet loop lands at ~0.017 Hz, and +the suite had to run **120 seconds** — the same number of samples, the same +number of loop time constants as 40 s at 48 kHz — with the test's comment +recording that a 40-second run still sits ~15 dB above the settled +residual. Fifteen decibels is the difference between a correct claim and an +embarrassing one, controlled entirely by *when you look*. + +The flip side matters equally: the block-size study measures the Track +stage *on purpose*, because block-fed deployments never reach Quiet — that +is the regime under study. Neither window placement is "right"; what is +right is that each notebook states which regime it is measuring and why. + +## Trap three: the flush at the end of the stream + +The comparison notebook hands each competitor the same input and analyzes a +window of its output. Where you cut that window turned out to matter more +than anything else in the file: + +```python +def mid_window(y, analyze_s, guard_s=1.0): + """Trim both ends: one-shot converters flush a filter tail at the end of + the stream, and including it poisons the measurement by ~60 dB (found + the hard way; a control experiment at 2:1 exposed it).""" + y = np.asarray(y, dtype=np.float32) + end = len(y) - int(guard_s * FS) + return y[end - int(analyze_s * FS):end] +``` + +A one-shot resampler API, given the whole stream at once, drains its filter +state at the end — a tail of samples that are not steady-state conversion +output. 
Include that tail in the analysis window and the measured THD+N +degrades by roughly **60 dB**: enough to turn soxr's −150 dB into an +apparently mediocre converter. The bug was found "the hard way," and the +docstring preserves how: a control experiment at a 2:1 ratio — where the +correct answer was known independently — read absurdly wrong, and the +investigation traced it to the tail. Every one-shot subject is therefore +measured on a mid-stream window that ends one second before the stream does. + +Note whose numbers this guard protects: the *competitors'*. An honest +comparison has to be most careful about errors that flatter the home team, +and an unguarded tail window would have been exactly that kind of error. + +## Trap four: comparing float software to 24-bit silicon + +The comparison's final tables land next to datasheet values for the AD1896 +and SRC4392 — hardware ASRCs measured at their pins, which are 24 bits +wide. A float32 pipeline has no fixed noise floor at all (its noise scales +down with the signal), so its "native" dynamic range mostly measures the +arithmetic format, not the converter. Quoting float numbers against silicon +datasheets would be a category error dressed as a benchmark. + +The notebook's equalizer is four lines: + +```python +def q24(y): + """Round to a 24-bit interface, undithered -- what a hardware ASRC + presents at its pins. The equalizer that makes software and silicon + numbers directly comparable.""" + return np.round(np.asarray(y, np.float64) * 8388608.0) / 8388608.0 +``` + +Every subject's output is measured both ways, and `docs/COMPARISON.md` +leads with the 24-bit columns as the chip-comparable condition.
The result +reads differently than bravado would: at that interface the oracle-fed +libraries measure at the 24-bit format ceiling itself (~−143.5 dB THD+N), +all three real converters share the identical 149.1 dB A-weighted +dynamic-range ceiling, and SampleRateTap's −132.1 dB sits ~11 dB behind the +oracles — a gap the document does not explain away but *prices*: it is the +measured cost of solving the clock-recovery half of the problem, which the +libraries do not attempt. Even so, the caveats refuse the flattering frame +in the other direction too: datasheet numbers come from analog test loops +with wider notches, and "a pristine-digital software measurement and a +bench measurement of a chip are comparable in definition, not in +environment." + +## Trap five: the summary cell nobody executes + +The last trap is the quietest, and this project walked into it. The demo +notebook's measurement cell printed, in its committed output: + +```text +ASRC SNR: 126.4 dB | naive: 29.4 dB | improvement: 97 dB +``` + +with `assert snr_asrc > 125.0` enforcing it. The *summary table* at the +bottom of the same notebook claimed "SNR > 130 dB." Nothing failed. Nothing +could fail: markdown does not execute, so no assertion, calibration, or +re-run will ever check a number typed into prose. The two cells sat a few +screens apart, one measured and one remembered, disagreeing by 4 dB — the +one place a documentation audit found the repository overstating its own +results. (The measured 135 dB figure from the test suite is real, but it is +a *different instrument* — a tracked global fit over a different window — +and a summary must quote its own cell, not the best number available +elsewhere in the repo.) The fix was the boring, correct one: the summary +now states 126.4 dB and points at the assertion. 
+ +The lesson generalizes beyond notebooks: **summaries drift from cells the +same way READMEs drift from benchmarks and comments drift from code.** +Executable claims stay honest by execution; prose claims stay honest only +by audit. This project's response operates at both levels — push every +number it can into asserted, regenerated, machine-checked form (the test +thresholds, the icount table's regenerate-and-diff gate, the notebook +assertions), and schedule adversarial audits for the residue that only +prose can carry. This book is itself downstream of that lesson: the code +you read here is included live from the headers, because an author's +summary of code is just one more markdown cell. + +## Verify it yourself + +```sh +# Build the C ABI once; the notebooks find (or build) it themselves: +cmake -B build -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_CAPI=ON +cmake --build build --target srt_capi -j + +# Re-run each instrument end to end; any pinned regression fails the run +# (deps: numpy, matplotlib, plus samplerate and soxr for the comparison): +jupyter nbconvert --to notebook --execute notebooks/asrc_demo.ipynb +jupyter nbconvert --to notebook --execute notebooks/asrc_block_size_study.ipynb +jupyter nbconvert --to notebook --execute notebooks/asrc_comparison.ipynb + +# Watch a calibration catch a broken instrument: in asrc_comparison.ipynb, +# widen the notch (notch_hz=20.0 -> 2000.0) or in the block-size study +# replace lowpass_fft with a boxcar mean — the synthetic-signal cell fails +# before any subject is measured. + +# The traps, in the sources' own words: +grep -rn "poisons the measurement" notebooks/asrc_comparison.ipynb +grep -rn "earned its keep" notebooks/asrc_block_size_study.ipynb +grep -rn "pin behavior, not aspiration" notebooks/asrc_block_size_study.ipynb +``` + +The demo notebook's summary table is the one artifact in this chapter that +no command can verify — which is the point. 
Read it next to the measurement +cell above it, check that the numbers agree, and you will have performed, +by hand, the audit that fixed it. diff --git a/book/src/part2/tests.md b/book/src/part2/tests.md index 8098ed7..85d6e6b 100644 --- a/book/src/part2/tests.md +++ b/book/src/part2/tests.md @@ -1,3 +1,419 @@ -# tests +# Tests as specifications -*Chapter not yet written — see the outline in the book plan. Each chapter lands as its own PR.* +Part I ended each chapter with a list of tests. This chapter is about what +those tests actually *are* — because in this project they are not the usual +smoke detectors bolted on after the fact. They are the specification. The +README publishes a table of signal-to-noise figures; the reason that table +can be trusted is not editorial diligence, it is that every number in it has +a test asserting something just below it, and CI runs the assertion on every +push. `docs/PERFORMANCE.md` states the policy in one line: "The SNR table is +already enforced by test thresholds." + +That sentence hides three design problems, each with a wrong answer that +most test suites pick by default. How tight do you pin a measured quantity? +How do you make a two-clock, two-thread, analog-flavored system produce the +same bits every run? And how do you measure 135 dB of fidelity without your +measuring instrument lying to you? The suite's answers are the subject of +this chapter. + +## Thresholds a few dB under reality + +Here is the convention, straight from the top of the quality suite +(`tests/test_asrc_quality.cpp`): + +```cpp +// Thresholds sit 4-7 dB under measured performance (135/120/113/106 dB for +// balanced at 997/6k/12k/19.5k; 133/108 dB for transparent). The residual at +// high frequencies is dominated by the linear interpolation between adjacent +// phase-table rows, which falls ~12 dB per doubling of numPhases and rises +// ~12 dB per octave of signal frequency. 
+``` + +And a representative enforcement: + +```cpp +TEST(AsrcQuality, Balanced997Hz) { + EXPECT_GT(measureSnrDb(srt::FilterSpec::balanced(), 997.0), 128.0); +} +``` + +Measured 135.0, asserted 128.0. Consider the two alternatives this rejects. + +**A loose threshold** — say, "SNR must exceed 60 dB, comfortably transparent +for casual listening" — turns the test into a tautology. The converter could +regress by seventy decibels, an *enormous* defect by this library's +standards, and CI would stay green while the README continued to advertise +135 dB. A loose threshold means the published claim and the enforced claim +are different claims, and only the weaker one is real. This suite's position +is that a quality number you publish is a number you gate, at very nearly +the value you publish. + +**An exact threshold** — asserting 135.0 because you measured 135.0 — fails +for the opposite reason: the measurement is a physical quantity with +legitimate variation. Different hosts, compilers, and math libraries move +the residual by fractions of a dB; the float path's strict double +accumulation keeps outputs bit-stable per platform but not across them. The +4–7 dB of headroom is sized to absorb that variation and nothing else: any +*algorithmic* regression — a filter redesign that loses stopband, a servo +change that leaks more clock noise into the passband — costs whole decibels +and lands outside the slack. + +The comment carries a second load worth noticing: it explains *where the +residual comes from* (phase-table interpolation, with its 12 dB scaling laws +in both `numPhases` and signal frequency). 
That converts the threshold table +from arbitrary constants into a checkable physical model — when the 16 kHz +suite was added later, its expectations could be *predicted* from the same +model (the residual depends on the normalized frequency f/fs, so tones at +the same f/fs should measure the same), then measured, and they matched +within about 1 dB (`tests/test_asrc_quality_16k.cpp` records both sets of +numbers). A threshold you can predict is a specification; a threshold you +can only observe is a snapshot. + +The convention also imposes a maintenance discipline that deserves to be +stated honestly: when performance *improves*, the thresholds are stale and +must be re-pinned upward, or the enforcement quietly loosens. That happened +in this repository — the Q0.64 phase accumulator (Part III) improved the +997 Hz figure to 135.0 dB, and a subsequent documentation audit re-aligned +the published headline and threshold comments to the post-change reality. +The instruction-count ratchet in the next chapter solves the same +staleness problem mechanically, with a two-sided gate; the quality suite +solves it by convention and audit. The difference is instructive: ±3% on a +deterministic integer can be automated; "4–7 dB under a measurement that +legitimately varies by platform" still needs a human to re-pin. + +## The two-clock simulator + +Every quality number above comes from the same experimental rig, and it fits +in a page of header (`tests/support/two_clock_sim.hpp`). The problem it +solves: the converter's whole reason to exist is that *two independent +clocks* drive it, but tests that use two real threads and real timers are +nondeterministic — schedulers differ, load differs, and a 0.2 dB shift in a +measurement could be the code or could be the machine. For metrology you +want the clocks without the threads. 
+ +The rig is a struct of knobs: + +```cpp +{{#include ../../../tests/support/two_clock_sim.hpp:pf_knobs}} +``` + +and one loop: + +```cpp +{{#include ../../../tests/support/two_clock_sim.hpp:pf_run}} +``` + +This is discrete-event simulation reduced to its minimum. Two virtual +clocks, `tIn` and `tOut`, advance in *virtual time*: a producer event pushes +`chunkIn` frames and advances `tIn` by `chunkIn / fsIn`; a consumer event +pulls `chunkOut` frames and advances `tOut` by `chunkOut / fsOut`; whichever +clock is behind fires next. With `fsIn = 48 000 × (1 + 200 ppm)` and +`fsOut = 48 000`, the producer naturally lands one extra sample every 5 000 +— the exact asynchrony a real capture/playback pair exhibits, with zero +dependence on the host scheduler. Runs are exactly reproducible: same +sequence of pushes and pulls, same occupancy trajectory seen by the servo, +same output samples, every time, on every machine. + +Why determinism beats realism for regression work: + +- **A failure is a coordinate, not a weather report.** When + `Balanced19_5kHz` drops below 100 dB, re-running reproduces the identical + run; you can bisect it, instrument it, and diff intermediate state against + a good commit. A threads-and-timers failure reproduces "sometimes." +- **Thresholds can be tight.** The 4–7 dB convention above is only possible + because run-to-run variance is zero; scheduler-dependent tests must budget + slack for the scheduler, and that slack is exactly where regressions hide. +- **The interesting parameter becomes controllable.** Transfer granularity + — how many frames move per event — is a *physical property of real + deployments* (sample-synchronous codecs at one extreme, USB and network + audio moving multi-frame bursts at the other), and it changes converter + behavior: the servo promotes to its low-bandwidth Quiet stage only when + occupancy is observed at fine granularity. 
The quality suites run + `chunkIn = chunkOut = 1` to reach the Quiet stage; the multichannel short + variants run `chunk = 8` deliberately, to certify the Track stage that + block-fed deployments actually live in. In a real-threads test, + granularity would be an accident of scheduling; here it is an axis of the + test matrix. +- **Slow clock dynamics are testable at all.** `fsInScale` lets a test ramp + the input rate — the lock suite sweeps drift ramps and asserts the servo + follows without unlocking — which on real hardware would require a + programmable oscillator and a lab. + +What determinism deliberately does *not* cover is the one thing it removes: +real concurrency. The memory-ordering claims of the ring buffer are tested +by the separate two-thread stress under ThreadSanitizer (the +[ring chapter](../part1/spsc-ring.md) walks its limits). The division of +labor is explicit — realism where realism is the subject, simulation +everywhere else — and the technique travels: the same virtual-time +interleaving reappears in Python inside every notebook of the +[notebooks chapter](notebooks.md). + +One number shows what the rig's determinism costs in patience rather than +trust. The quality runs last 40 virtual seconds because, as the test's +comment puts it, "the 0.05 Hz locked loop must fully forget the acquisition +transient before the measurement window" — and only the final second is +analyzed. At 16 kHz the servo bandwidths scale down with the rate, so the +same suite runs 120 seconds to cover the identical number of loop time +constants; its comment records that a 40 s run still sits ~15 dB above the +settled residual. Deterministic time is cheap; *skipping* settling time is +how you measure your transient instead of your converter. + +## Sine-fit metrology + +The simulator produces a signal; something must turn it into a decibel +figure, and at 135 dB the instrument is the hard part. 
The suite's +instrument (`tests/support/sine_analysis.hpp`) is a least-squares sine fit: +model the output window as `a·sin(ωi) + b·cos(ωi) + c`, solve the 3×3 +normal equations for the best-fit fundamental, subtract it *exactly*, and +call everything that remains — harmonics, images, servo noise, quantization +— the residual. `snrDb()` is then the fitted fundamental's power over the +residual's. + +Why a fit instead of an FFT? Because subtraction is exact and windows are +not. A windowed spectrum smears the near-full-scale fundamental across +neighboring bins at the window's sidelobe level; measuring a residual 135 dB +down *under* that skirt means fighting your own instrument. The fit has no +window: the fundamental is removed to the precision of the arithmetic +(double throughout), and the method's own floor sits far below anything the +converter produces. (The notebooks meet the same problem with the same +answer, plus a notch — that chapter tells the ~60 dB horror story that +motivates the extra guard.) + +One refinement matters enough to justify its own function. `fitSine` +requires the frequency; `fitSineTracked` *finds* it, starting from the +nominal value: + +```cpp + for (int iter = 0; iter < 4; ++iter) { + const SineFit a = fitSine(x.first(half), f); + const SineFit b = fitSine(x.subspan(half), f); + // b.phase is relative to the second half's start; predict it from a. + const double twoPi = 2.0 * std::numbers::pi; + const double predicted = a.phase + twoPi * f * static_cast<double>(half); + const double dphi = std::remainder(b.phase - predicted, twoPi); + f += dphi / (twoPi * static_cast<double>(half)); + } +``` + +Fit each half of the window; if the assumed frequency is slightly wrong, the +second half's phase arrives shifted from where the first half's fit predicts +it; the shift, divided by the half-window's span, is the frequency error. +Four iterations converge far below the starting error. + +The reason this exists is a property of the device under test.
An ASRC's +rate estimate converges *asymptotically* — the Quiet-stage loop is +deliberately slow, so even after a 40-second run the estimate can sit a +fraction of a ppm off the true ratio. A rigid fit at the nominal frequency +would see the output tone microscopically detuned from the model and book +the mismatch as residual: a completely inaudible frequency offset, misread +as noise. Tracking the fundamental before measuring distortion is exactly +what commercial THD analyzers do, and the header's comment says so — the +test instrument follows metrology practice, not convenience. + +But an instrument that *tracks* the signal could also *excuse* it: a +converter that genuinely played the wrong pitch would have its error +absorbed into the tracked frequency and measure clean. The suite closes +that hole with a guard on the tracker itself: + +```cpp + // The tracked frequency must still match the true clock ratio closely. + EXPECT_NEAR(fit.freqNorm / nuOutExpected, 1.0, 2e-6); +``` + +The fit may refine the frequency, but only within 2 ppm of what the clock +ratio dictates — enough for servo convergence tails, nowhere near enough to +hide a real pitch error. Every use of the tracked fit carries this check. +It is the measurement-code version of a lesson this book keeps repeating: +whenever you give a tool freedom, pin the freedom. + +## Crosstalk that cannot hide, leakage that cannot masquerade + +Single-channel quality metrics are structurally blind to a whole class of +multichannel bugs: swap two channels in the deinterleave, or bleed a percent +of channel 3 into channel 4, and every per-channel SNR still measures +perfect. `tests/test_multichannel.cpp` exists for exactly those bugs, and +its design is a small case study in adversarial measurement. 
+ +The setup: one converter instance, every channel carrying a *distinct* tone +— `600 + 731·c` Hz, non-harmonically related, all inside the flat passband +for up to 16 channels — with per-channel phase offsets to decorrelate the +waveforms. After conversion across the usual +200 ppm crossing, each channel +must contain its own tone at full quality and nothing measurable of any +other channel's. The deployment shapes are real: 12 channels is 7.1.4 +surround, 16 is an AVB stream bundling reference microphones with the +program feed. + +The subtlety is in the analysis order, and the file header explains it: + +```cpp +// Method: own tone is removed by tracked least-squares fit; the other +// channels' frequencies are then fitted on the residual, so the own tone's +// spectral leakage (about -67 dB at these spacings over a 1 s rectangular +// window) cannot masquerade as crosstalk. The fit noise floor on the +// residual is ~43 dB below the residual RMS, far under every threshold. +``` + +Fit channel *k*'s frequency directly on channel *c*'s raw signal and the +finite one-second window makes channel *c*'s own tone leak energy into that +fit at about −67 dB — the test would "detect" crosstalk at −67 dB on a +converter with none, capping the assertable threshold right there. Removing +the own tone first (exact subtraction of the tracked fit) drops the +masquerade floor to the fit noise on the residual, far under every +threshold. Order of operations *is* the instrument here: same data, same +fits, and only one sequencing yields a measurement capable of asserting +−100 dB. The pinned claims follow the quality suite's convention: crosstalk +below −100 dB per channel for float (−72 dB for Q15, whose own quantization +floor is the binding constraint), with amplitude and SNR checked alongside. 
+ +One more design decision hides in the channel counts of the short variants: + +```cpp +// Channels 5 and 7 are the only counts that reach the channel-parallel +// K=2 and K=1 remainder tiles (8/4/2/1 tiling: 5 = 4+1, 7 = 4+2+1) — the +// audit found those tiles had zero coverage. +``` + +The C6 optimization (Part III) processes channels in register-blocked tiles +of 8, 4, 2, and 1. Testing 2, 12, and 16 channels — every *deployment* +shape — exercises only the wide tiles. Five and seven channels are useless +deployment shapes and ideal test shapes: they force the remainder paths. An +audit found those tiles had zero coverage across the entire suite; the fix +was not more assertions but better-chosen *inputs*. Coverage lives in the +test matrix, not the expectation count. + +## The bare-metal one-shot, and the filter that needed a test + +On the Cortex-M55 and M33 CI legs, the suite runs as a bare-metal kernel +under `qemu-system-arm`: no OS, no filesystem, no command line. That +environment breaks three assumptions ordinary gtest runs lean on, and +`tests/bare_metal_main.cpp` plus `tests/CMakeLists.txt` repair them one by +one — each repair with a story. + +**No argv** means no `--gtest_filter` from the harness, so the +emulation-appropriate filter is baked into a custom `main()`: + +```cpp + ::testing::GTEST_FLAG(filter) = "-AsrcQuality*:AsrcLock.*:Servo.*:Kaiser.*MeetsSpec:" + "FixedPoint.AsrcQuality*:" + "FixedPoint.FullScaleSineDoesNotWrapQ15:" + "MultiChannel.*:Feasibility.*:Reset.*"; +``` + +**No reliable exit codes** — semihosting does not dependably propagate a +process status through the emulator — means the run is judged on text. 
+CTest watches for a sentinel: + +```cmake + add_test(NAME srt_tests_emulated COMMAND srt_tests) + set_tests_properties(srt_tests_emulated PROPERTIES + PASS_REGULAR_EXPRESSION "SRT_TESTS_COMPLETE rc=0" + FAIL_REGULAR_EXPRESSION "\\[ FAILED \\]" + TIMEOUT 1800) +``` + +The sentinel is printed as the *last* act of `main()`, after +`RUN_ALL_TESTS()` returns — deliberately, so a crash after gtest's own +summary (a static destructor, a late fault) cannot register as a pass. The +`FAIL_REGULAR_EXPRESSION` is a second, independent tripwire: even if a +mangled run somehow emitted the sentinel, any visible test-failure line +still fails the CTest. + +**Nobody watching** is the third broken assumption, and its repair has the +best history. `RUN_ALL_TESTS()` returns 0 when every selected test passes — +including when the filter selects *zero* tests. A typo in that baked-in +filter string would produce an empty run, print the sentinel with `rc=0`, +and turn the entire on-target suite green forever. An infrastructure audit +realized this, and the guard went in: + +```cpp + const int selected = ::testing::UnitTest::GetInstance()->test_to_run_count(); + if (selected < 15) { + std::printf("only %d tests selected (expected >= 15): filter is broken\n", selected); + std::printf("SRT_TESTS_COMPLETE rc=1\n"); + return 1; + } +``` + +Two details show the care level. The count is checked *after* the run, +because gtest applies the filter inside `RUN_ALL_TESTS()` — read it before +and it is always zero, which was verified on target rather than assumed. +And the bound is 15 against a selection of roughly 20, leaving headroom for +legitimate test removals without masking a typo. + +The guard was not paranoia; the filter had *already* had a real bug. When +the 16 kHz quality suite (`AsrcQuality16k`) was added, the exclusion then +read `-AsrcQuality.*` — and in gtest filter syntax, unlike regex, `.` is a +literal character. 
`AsrcQuality.*` matches `AsrcQuality.Balanced997Hz` but +not `AsrcQuality16k.Balanced333Hz`, so the new two-minute simulations would +have quietly joined every bare-metal CI run, at emulation speed. The fix +widened the pattern to `AsrcQuality*` (no dot). Look back at the filter +string and you can now read its dots as deliberate: `MultiChannel.*` — +*with* the literal dot — excludes exactly the `MultiChannel` suite while +keeping `MultiChannelShort` in, which the comment beside it calls out as the +only on-target coverage of the N-channel deinterleave and wide-MAC dotRow +paths. The same character is a bug in one line and a scalpel in the next; +the difference is whether its meaning was chosen. + +## What the emulated targets deliberately skip + +The baked filter and its `ctest -E` sibling on the Hexagon leg exclude the +same family: the quality suites, the lock and servo simulations, the filter +design verification, the feasibility and reset sims — collectively, as the +file header puts it, "minutes of soft-float virtual audio that validate +target-independent control math already covered on every host platform." +That phrase is the policy. A 40-second sample-granular quality run is cheap +arithmetic on a Xeon and an eternity under instruction-set emulation — and +it would re-prove something that *cannot differ* on the target: the servo's +control law and the filter designer's mathematics are pure functions of +their inputs, identical on every conforming C++ implementation. + +What *can* differ on target — and therefore what the on-target run keeps — +is the datapath: kernel accuracy on the target's arithmetic, the fixed-point +paths (including the SMLALD dual-MAC route on M33-class cores), the ring +buffer, the deinterleave, the end-to-end latency path. 
The exclusion list is +not a shortcut; it is a claim about *where target-dependence lives*, and the +short multichannel variants exist precisely because that claim would +otherwise have left the N > 2 datapath uncovered on the machines it was +written for. + +One exclusion is different in kind, and the CI file is honest about it: +`ConfigValidation` is skipped on Hexagon not because it is slow but because +that leg's static-musl toolchain cannot unwind — the constructor throws +correctly, `EXPECT_THROW` never catches, and libc++abi terminates. The +limitation is recorded in `docs/PERFORMANCE.md` under known debt, with the +deployment guidance it implies (validate configs before constructing on that +toolchain). A skipped test with a documented reason is a specification too: +it specifies the boundary of what the platform supports. + +## Verify it yourself + +```sh +# The quality suite: watch the printed [ measured ] lines clear the +# thresholds by the documented few dB: +ctest --test-dir build -R AsrcQuality --output-on-failure + +# The threshold convention, in the tests' own words: +grep -n -A4 "Thresholds sit" tests/test_asrc_quality.cpp tests/test_asrc_quality_16k.cpp + +# Multichannel independence, long and short (per-channel crosstalk prints): +ctest --test-dir build -R MultiChannel --output-on-failure + +# Determinism of the rig: run a quality test twice and diff the output. +ctest --test-dir build -R Balanced997 --output-on-failure # (run it twice) + +# The bare-metal one-shot, exactly as CI runs it (needs arm-none-eabi-gcc +# and qemu-system-arm): +cmake -B build-m55 -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m55-mps3.cmake +cmake --build build-m55 -j && ctest --test-dir build-m55 -V + +# Break the empty-run guard on purpose: change the baked filter in +# tests/bare_metal_main.cpp to a typo like "NoSuchSuite.*", rebuild, and +# watch the run fail with "filter is broken" instead of passing green. 
+``` + +The last experiment is this chapter's thesis in miniature. A test suite is +only a specification if an empty, wrong, or stale version of it *fails* — +and every mechanism in this chapter, from pinned thresholds to the +fifteen-test floor, exists to make silence impossible to mistake for +success. diff --git a/notebooks/asrc_demo.ipynb b/notebooks/asrc_demo.ipynb index b026d74..08d0f77 100644 --- a/notebooks/asrc_demo.ipynb +++ b/notebooks/asrc_demo.ipynb @@ -727,8 +727,8 @@ "\n", "| What | Measured here |\n", "|---|---|\n", - "| Naive FIFO at +200 ppm | clicks ~10×/s, SNR around 29 dB dB |\n", - "| SampleRateTap, same conditions | **SNR > 130 dB** — at the 24-bit noise floor |\n", + "| Naive FIFO at +200 ppm | clicks ~10×/s, SNR around 29 dB |\n", + "| SampleRateTap, same conditions | **SNR 126.4 dB measured** (cell asserts > 125) |\n", "| Lock from cold start | ~1 s |\n", "| Latency | ≈ designed 1.5 ms, linear phase |\n", "| 50 ppm/s drift ramp | tracked, locked, zero underruns |\n", diff --git a/tests/support/two_clock_sim.hpp b/tests/support/two_clock_sim.hpp index adbc5df..6dd2b1a 100644 --- a/tests/support/two_clock_sim.hpp +++ b/tests/support/two_clock_sim.hpp @@ -12,6 +12,7 @@ namespace srt_test { +// ANCHOR: pf_knobs template struct TwoClockSimT { srt::BasicAsyncSampleRateConverter& asrc; @@ -27,7 +28,9 @@ struct TwoClockSimT { /// Optional input-rate modulation: fsIn scale factor at virtual time t /// (e.g. for drift-ramp tests). Defaults to constant 1. std::function fsInScale = [](double) { return 1.0; }; + // ANCHOR_END: pf_knobs + // ANCHOR: pf_run /// Runs for `seconds` of output-clock virtual time. onOut receives every /// pulled block: (interleavedSamples, frames, virtualTime). 
template @@ -59,6 +62,7 @@ struct TwoClockSimT { } } } + // ANCHOR_END: pf_run }; using TwoClockSim = TwoClockSimT; diff --git a/tools/qemu_insn_plugin/insn_count.c b/tools/qemu_insn_plugin/insn_count.c index 9f8fb2d..0f67c74 100644 --- a/tools/qemu_insn_plugin/insn_count.c +++ b/tools/qemu_insn_plugin/insn_count.c @@ -21,6 +21,7 @@ QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION; static uint64_t insn_count; +/* ANCHOR: pf_hooks */ static void tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb* tb) { (void)id; size_t n = qemu_plugin_tb_n_insns(tb); @@ -37,6 +38,7 @@ static void at_exit(qemu_plugin_id_t id, void* userdata) { g_autofree gchar* msg = g_strdup_printf("SRT_INSN_COUNT %" PRIu64 "\n", insn_count); qemu_plugin_outs(msg); } +/* ANCHOR_END: pf_hooks */ QEMU_PLUGIN_EXPORT int qemu_plugin_install(qemu_plugin_id_t id, const qemu_info_t* info, int argc, char** argv) { From 81480c1810506896d28b387140e951b208241509 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 1 Jul 2026 22:18:47 +0000 Subject: [PATCH 12/16] book: regenerable figures Six SVG figures wired into the chapters, all produced by scripts/book_figures.py from the sources the text cites: the Kaiser window and prototype responses re-run kaiser.hpp's math formula-for- formula; the servo lock/dropout/re-lock trace and both feasibility panels are measured by compiling scripts/book_figures_trace.cpp against the real headers (the before panel against the pre-fix commit's tree via git archive); the Q0.64 slip figure runs the actual mod-2^64 arithmetic; the architecture diagram is drawn and labeled as such. The introduction gains a third mechanical commitment describing this, and the book CI job now verifies every image reference resolves (regeneration is not gated: matplotlib SVG output is not byte-stable across versions). 
https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9 --- .github/workflows/ci.yml | 22 + book/src/img/architecture.svg | 1760 +++++++++ book/src/img/feasibility.svg | 4870 ++++++++++++++++++++++++ book/src/img/kaiser-response.svg | 3683 ++++++++++++++++++ book/src/img/kaiser-window.svg | 1791 +++++++++ book/src/img/q064-slip.svg | 2039 ++++++++++ book/src/img/servo-lock.svg | 3005 +++++++++++++++ book/src/introduction.md | 12 +- book/src/part1/asrc.md | 20 + book/src/part1/fractional-resampler.md | 10 + book/src/part1/kaiser.md | 17 + book/src/part1/pi-servo.md | 17 + scripts/book_figures.py | 459 +++ scripts/book_figures_trace.cpp | 63 + 14 files changed, 17767 insertions(+), 1 deletion(-) create mode 100644 book/src/img/architecture.svg create mode 100644 book/src/img/feasibility.svg create mode 100644 book/src/img/kaiser-response.svg create mode 100644 book/src/img/kaiser-window.svg create mode 100644 book/src/img/q064-slip.svg create mode 100644 book/src/img/servo-lock.svg create mode 100644 scripts/book_figures.py create mode 100644 scripts/book_figures_trace.cpp diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 588ba2b..8ed298b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -517,3 +517,25 @@ jobs: echo "::error::mdbook reported warnings/errors (stale anchor or broken include?)" exit 1 fi + + # mdBook does not fail on a missing image, so check every relative + # image reference resolves. (The SVGs are committed, generated by + # scripts/book_figures.py; regeneration is not gated because + # matplotlib's SVG output is not byte-stable across versions.) 
+ - name: Check image references resolve + run: | + python3 - <<'EOF' + import pathlib, re, sys + src = pathlib.Path("book/src") + missing = [] + for md in src.rglob("*.md"): + for target in re.findall(r"!\[[^\]]*\]\(([^)#?]+)", md.read_text()): + if target.startswith(("http://", "https://")): + continue + if not (md.parent / target).resolve().exists(): + missing.append(f"{md}: {target}") + if missing: + print("::error::broken image reference(s):") + print("\n".join(missing)) + sys.exit(1) + EOF diff --git a/book/src/img/architecture.svg b/book/src/img/architecture.svg new file mode 100644 index 0000000..e44ea4c --- /dev/null +++ b/book/src/img/architecture.svg @@ -0,0 +1,1760 @@ + + + + + + + + 2026-07-01T22:14:40.852216 + image/svg+xml + + + Matplotlib v3.10.9, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/book/src/img/feasibility.svg b/book/src/img/feasibility.svg new file mode 100644 index 
0000000..d0c31ac --- /dev/null +++ b/book/src/img/feasibility.svg @@ -0,0 +1,4870 @@ + + + + + + + + 2026-07-01T22:14:43.804899 + image/svg+xml + + + Matplotlib v3.10.9, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/book/src/img/kaiser-response.svg b/book/src/img/kaiser-response.svg new file mode 100644 index 0000000..4b8ccd3 --- /dev/null +++ b/book/src/img/kaiser-response.svg @@ -0,0 +1,3683 @@ + + + + + + + + 2026-07-01T22:14:40.234973 + image/svg+xml + + + Matplotlib v3.10.9, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/book/src/img/kaiser-window.svg b/book/src/img/kaiser-window.svg new file mode 100644 index 0000000..98b5525 --- /dev/null +++ b/book/src/img/kaiser-window.svg @@ -0,0 +1,1791 @@ + + + + + + + + 2026-07-01T22:14:39.749395 + image/svg+xml + + + Matplotlib v3.10.9, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/book/src/img/q064-slip.svg b/book/src/img/q064-slip.svg new file mode 100644 index 
0000000..ece9f37 --- /dev/null +++ b/book/src/img/q064-slip.svg @@ -0,0 +1,2039 @@ + + + + + + + + 2026-07-01T22:14:40.600970 + image/svg+xml + + + Matplotlib v3.10.9, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff 
--git a/book/src/img/servo-lock.svg b/book/src/img/servo-lock.svg new file mode 100644 index 0000000..32f7d63 --- /dev/null +++ b/book/src/img/servo-lock.svg @@ -0,0 +1,3005 @@ + + + + + + + + 2026-07-01T22:14:43.379611 + image/svg+xml + + + Matplotlib v3.10.9, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/book/src/introduction.md b/book/src/introduction.md index d3c61ed..1916add 100644 --- a/book/src/introduction.md +++ b/book/src/introduction.md @@ -34,7 +34,7 @@ we cite the textbook and spend our pages on what the textbooks omit: why ## How this book stays honest -Two mechanical commitments distinguish this book from most code walkthroughs. +Three mechanical commitments distinguish this book from most code walkthroughs. **The excerpts are live.** Every block of library code you read is included into the book at build time from the actual header in the repository, by @@ -51,6 +51,16 @@ back what you just read. When this book says the ring buffer is correct under weak memory ordering, you will be holding the ThreadSanitizer invocation that fails if it is not. +**The figures are regenerable.** Every plot in this book is produced by +`scripts/book_figures.py` from the sources the text cites: the filter +curves re-run the header's design math formula-for-formula, and the servo +and feasibility traces are *measured* — the script compiles a small trace +dumper against the real headers (and, for the before-the-fix panel, +against the pre-fix commit's headers pulled from git history) and runs it +in deterministic virtual time. Rerun the script and you reproduce every +figure; nothing is drawn from memory except the one architecture diagram, +which is labeled as drawn. 
+ ## The history is the curriculum This codebase was built measurement-first, and its history contains real diff --git a/book/src/part1/asrc.md b/book/src/part1/asrc.md index 366e23b..82453dd 100644 --- a/book/src/part1/asrc.md +++ b/book/src/part1/asrc.md @@ -17,6 +17,15 @@ wires them together and adds the four things none of them could own alone: a lifecycle state machine, an under/overrun policy, telemetry, and validation. +![The composed converter: producer pushes into the ring, the servo turns +ring occupancy into a rate estimate, the resampler consumes at that rate, +the consumer pulls](../img/architecture.svg) + +*The whole machine on one page. The ring is the only structure both clock +domains touch; everything downstream of it — servo, resampler, and both +their states — lives on the consumer's side, which is why `pull()` carries +all the policy and `push()` is eight lines.* + ## The two-agent shape The public surface is two functions and a contract: @@ -119,6 +128,17 @@ indefinitely, never reaching Locked, with the reported ppm pegged at a false +1500 (the clamp, mistaken for the answer). A 240-frame callback produced 80% silence. +![Measured FIFO occupancy for pull(64) at default configuration, before +and after the feasibility fix](../img/feasibility.svg) + +*Both panels are measurements, not models: `scripts/book_figures.py` +compiles the same trace dumper against the include/ tree of the last +pre-fix commit (via `git archive`) and against HEAD, and runs the +identical scenario. Before: drain, underrun, refill — four dropouts a +second, forever. After: one adaptive raise on the first pull, then the +servo regulates the effective setpoint and the underrun count stays at +zero.* + Why didn't anything catch it? Because every artifact that exercised the converter had, innocently, been configured just clear of the cliff. 
The quality tests pull one frame at a time — the metrologically correct choice diff --git a/book/src/part1/fractional-resampler.md b/book/src/part1/fractional-resampler.md index 0d0a395..5e9a313 100644 --- a/book/src/part1/fractional-resampler.md +++ b/book/src/part1/fractional-resampler.md @@ -157,6 +157,16 @@ slip detector**, for both signs of ε, with no comparisons against 1.0 or 1.0. - Otherwise `advance = 1`: the metronomic case. +![The Q0.64 phase accumulator slipping by wraparound, for both signs of +epsilon](../img/q064-slip.svg) + +*The slip logic run with the real mod-2⁶⁴ arithmetic, ε exaggerated to +0.09 so the wraps are visible (at the real |ε| ≈ 2×10⁻⁴ a slip fires once +every few thousand frames). Left: the fraction creeps upward until the add +wraps past 1.0 — consume one extra frame. Right: with ε negative the add +wraps on *every* ordinary frame, and the anomaly is the one that doesn't — +reuse the window. From `scripts/book_figures.py`.* + At +500 ppm a forward slip fires every 2 000 output samples, and thanks to the bank's extra row the filter evaluated after `advance = 2` at small μ is the exact continuation of the filter before it at μ ≈ 1. diff --git a/book/src/part1/kaiser.md b/book/src/part1/kaiser.md index d3cd50f..de66bf5 100644 --- a/book/src/part1/kaiser.md +++ b/book/src/part1/kaiser.md @@ -87,6 +87,15 @@ what happens to the length formula when a caller hands it garbage, why the normalization constant is `L` and not 1, or whether any of this should run at compile time. That is the rest of this chapter. 
+![The Kaiser window for the three presets' beta values: higher attenuation +targets produce more strongly tapered windows](../img/kaiser-window.svg) + +*The knob in action: the presets' attenuation targets (96/120/140 dB) map +through `kaiserBeta` to β = 9.6/12.3/14.5, and higher β buys its deeper +stopband by tapering the window harder — which widens the main lobe, which +is why `estimateTaps` charges more taps for the same transition width. +Generated by `scripts/book_figures.py` from the same formulas.* + ## `besselI0`: a power series with an escape hatch `<cmath>` has no I₀ (`std::cyl_bessel_i` exists in the special-functions @@ -321,6 +330,14 @@ instead is the *specification*: `tests/test_kaiser.cpp` computes the prototype's actual frequency response by direct DFT and asserts the numbers the presets advertise. +![Prototype magnitude response of the three presets, with a passband-ripple +detail panel](../img/kaiser-response.svg) + +*What the spec tests pin: each preset's transition starts at its passband +edge and reaches its rated floor by its stopband edge, and the detail panel +shows all three passbands flat within ±0.01 dB. The curves come from +`scripts/book_figures.py`, which re-runs `designPrototype`'s math verbatim.* + The measurement function evaluates `|H(f)|` at arbitrary frequencies in Hz against the oversampled prototype (rate `L·fs`), normalized by L so the passband reads 0 dB — a direct O(n) sum per frequency. No FFT: an FFT diff --git a/book/src/part1/pi-servo.md b/book/src/part1/pi-servo.md index a83dbb6..36fae99 100644 --- a/book/src/part1/pi-servo.md +++ b/book/src/part1/pi-servo.md @@ -461,6 +461,23 @@ Track→Acquire, re-lock, promote, trip again, a mode limit cycle manufactured entirely in configuration. If you change one servo number for an embedded deployment, this is the one to check. 
+## The whole life cycle, measured + +Everything this chapter described is visible in one trace: the converter +driven at +200 ppm in deterministic virtual time (1-frame pushes — the +long tests' methodology), with a 50 ms producer stall injected at t = 28 s. + +![Measured occupancy and ppm estimate through acquire, lock, a 50 ms +dropout, and re-lock](../img/servo-lock.svg) + +*Acquiring's 10 Hz loop rings clamp-to-clamp on the quantized occupancy — +the sawtooth of the "enemy" section, live — yet the smoothed occupancy +never strays two frames from the setpoint, and promotion lands in half a +second. After the stall, `reset(true)` keeps the integrator, so the +re-acquire rings around 200 ppm rather than starting over from zero. +Generated by `scripts/book_figures.py`, which compiles a small trace +dumper against the real headers and runs exactly this scenario.* + ## The shape of the design | Decision | Alternative rejected | Reason | diff --git a/scripts/book_figures.py b/scripts/book_figures.py new file mode 100644 index 0000000..6e705a1 --- /dev/null +++ b/scripts/book_figures.py @@ -0,0 +1,459 @@ +#!/usr/bin/env python3 +"""Regenerates the book's figures (book/src/img/*.svg). + +Every figure is produced from the same sources the text cites: + +- the filter figures re-run the exact design math of + include/srt/detail/kaiser.hpp (formula-for-formula port below); +- the servo and feasibility figures are MEASURED: this script compiles + scripts/book_figures_trace.cpp against the current include/ tree and runs + it in deterministic virtual time. The feasibility "before" panel compiles + the same tool against the include/ tree of commit 045de5d — the last + commit before the PR #25 feasibility fix — extracted with `git archive`, + so both panels of that figure are measurements, not models; +- the phase-wraparound figure runs the resampler's actual uint64 slip + arithmetic (mod 2^64) in Python integers; +- the architecture figure is drawn, not computed. 
+ +Usage: python3 scripts/book_figures.py (from the repo root) +Needs: numpy, matplotlib, g++, git. + +The SVGs are committed. CI does not regenerate them — matplotlib's SVG +output is not byte-stable across matplotlib versions, so a regenerate-and- +diff gate would ratchet toolchain noise, not truth — but the book CI job +does verify that every image the chapters reference exists. +""" + +import os +import subprocess +import sys +import tempfile + +import numpy as np +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +from matplotlib.patches import FancyBboxPatch, FancyArrowPatch + +ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +OUT = os.path.join(ROOT, "book", "src", "img") +PREFIX_COMMIT = "045de5d" # last commit before the feasibility fix (PR #25) + +# Palette (validated categorical slots + chrome ink; light surface). +SURFACE = "#fcfcfb" +INK = "#0b0b0b" +SECONDARY = "#52514e" +MUTED = "#898781" +GRID = "#e1e0d9" +BASELINE = "#c3c2b7" +BLUE = "#2a78d6" # slot 1 +AQUA = "#1baf7a" # slot 2 (sub-3:1 contrast: always direct-labeled) +YELLOW = "#eda100" # slot 3 (sub-3:1 contrast: always direct-labeled) +RED = "#e34948" # slot 6, used only for the pre-fix (failing) trace + +plt.rcParams.update({ + "figure.facecolor": SURFACE, + "axes.facecolor": SURFACE, + "savefig.facecolor": SURFACE, + "font.family": "sans-serif", + "font.sans-serif": ["DejaVu Sans"], + "font.size": 9, + "text.color": INK, + "axes.edgecolor": BASELINE, + "axes.labelcolor": SECONDARY, + "axes.titlecolor": INK, + "axes.titlesize": 10, + "axes.linewidth": 0.75, + "axes.grid": True, + "grid.color": GRID, + "grid.linewidth": 0.75, + "grid.linestyle": "-", + "xtick.color": MUTED, + "ytick.color": MUTED, + "xtick.labelcolor": MUTED, + "ytick.labelcolor": MUTED, + "lines.linewidth": 1.5, + "lines.solid_joinstyle": "round", + "lines.solid_capstyle": "round", + "legend.frameon": False, + "svg.hashsalt": "sampleratetap-book", +}) + + +def save(fig, name): + 
fig.savefig(os.path.join(OUT, name + ".svg")) + png_dir = os.environ.get("PNG_OUT") # optional raster copies for review + if png_dir: + fig.savefig(os.path.join(png_dir, name + ".png"), dpi=110) + + +def despine(ax): + for side in ("top", "right"): + ax.spines[side].set_visible(False) + + +# --- the filter design math, ported formula-for-formula from kaiser.hpp --- + +def bessel_i0(x): + x = np.asarray(x, dtype=float) + half = 0.5 * x + term = np.ones_like(x) + total = np.ones_like(x) + for k in range(1, 1000): + r = half / k + term = term * r * r + total = total + term + if np.all(term < 1e-21 * total): + break + return total + + +def kaiser_beta(atten_db): + if atten_db > 50.0: + return 0.1102 * (atten_db - 8.7) + if atten_db > 21.0: + return 0.5842 * (atten_db - 21.0) ** 0.4 + 0.07886 * (atten_db - 21.0) + return 0.0 + + +def design_prototype(num_phases, taps_per_phase, cutoff_norm, beta): + n = num_phases * taps_per_phase + i = np.arange(n, dtype=float) + center = 0.5 * (n - 1) + t = (i - center) / num_phases + u = (i - center) / center + w = bessel_i0(beta * np.sqrt(np.maximum(0.0, 1.0 - u * u))) / bessel_i0(beta) + h = cutoff_norm * np.sinc(cutoff_norm * t) * w # np.sinc is sin(pi x)/(pi x) + return h * (num_phases / h.sum()) + + +# FilterSpec presets, verbatim from polyphase_filter.hpp. 
+PRESETS = [ + ("fast", 128, 32, 18000.0, 30000.0, 96.0, BLUE), + ("balanced", 256, 48, 20000.0, 28000.0, 120.0, AQUA), + ("transparent", 512, 80, 20000.0, 26000.0, 140.0, YELLOW), +] +FS = 48000.0 + + +def preset_response(L, T, pass_hz, stop_hz, atten_db, nfft=1 << 21): + cutoff = (pass_hz + stop_hz) / FS + h = design_prototype(L, T, cutoff, kaiser_beta(atten_db)) + H = np.fft.rfft(h, nfft) / L + f = np.arange(H.size) * (L * FS) / nfft + keep = f <= 48000.0 + return f[keep], 20.0 * np.log10(np.maximum(np.abs(H[keep]), 1e-12)) + + +def fig_kaiser_window(): + fig, ax = plt.subplots(figsize=(6.4, 3.2), layout="constrained") + u = np.linspace(-1.0, 1.0, 801) + iu = 180 # u = -0.55, where the three curves are well separated + for name, _, _, _, _, atten, color in PRESETS: + beta = kaiser_beta(atten) + w = bessel_i0(beta * np.sqrt(1.0 - u * u)) / bessel_i0(beta) + ax.plot(u, w, color=color, label=f"{name}: {atten:.0f} dB, β = {beta:.1f}") + ax.annotate(name, (u[iu], w[iu]), xytext=(-4, 4), + textcoords="offset points", color=SECONDARY, fontsize=8.5, + ha="right", bbox=dict(fc=SURFACE, ec="none", pad=1.0)) + ax.set_xlabel("window argument u (full aperture −1 … 1)") + ax.set_ylabel("w(u)") + ax.set_title("Kaiser window: attenuation buys taper") + ax.legend(loc="upper right", fontsize=8.5) + ax.set_xlim(-1.0, 1.0) + ax.set_ylim(0.0, 1.05) + despine(ax) + save(fig, "kaiser-window") + plt.close(fig) + + +def fig_kaiser_response(): + fig, (ax, axz) = plt.subplots( + 2, 1, figsize=(7.0, 5.6), layout="constrained", height_ratios=[2.4, 1.0]) + for name, L, T, pass_hz, stop_hz, atten, color in PRESETS: + f, db = preset_response(L, T, pass_hz, stop_hz, atten) + ax.plot(f / 1e3, db, color=color, label=name) + axz.plot(f / 1e3, db, color=color) + # direct label at each preset's measured stopband floor + floor = db[f >= stop_hz].max() + ax.annotate(f"{name}: {floor:.0f} dB past {stop_hz/1e3:.0f} kHz", + (47.0, floor), xytext=(0, 7), + textcoords="offset points", color=SECONDARY, 
fontsize=8.5, + ha="right", bbox=dict(fc=SURFACE, ec="none", pad=1.0)) + for x, label in ((20.0, "20 kHz passband edge"), (24.0, "input Nyquist")): + ax.axvline(x, color=BASELINE, lw=0.75, zorder=0) + ax.annotate(label, (x, -182), rotation=90, xytext=(-3, 0), + textcoords="offset points", color=MUTED, + fontsize=7.5, ha="right", va="bottom", + bbox=dict(fc=SURFACE, ec="none", pad=1.0)) + ax.set_ylim(-185, 8) + ax.set_xlim(0, 48) + ax.set_ylabel("magnitude (dB)") + ax.set_title("Prototype magnitude response, the three presets") + ax.legend(loc="upper right", fontsize=8.5) + despine(ax) + axz.set_xlim(0, 22) + axz.set_ylim(-0.031, 0.031) + axz.set_xlabel("frequency at 48 kHz (kHz)") + axz.set_ylabel("passband detail (dB)") + axz.annotate("all three presets flat within ±0.01 dB across their passbands", + (0.5, 0.021), color=SECONDARY, fontsize=8.5, ha="left") + despine(axz) + save(fig, "kaiser-response") + plt.close(fig) + + +# --- measured traces via the C++ tool --- + +def build_trace_tool(include_dir, exe): + subprocess.run( + ["g++", "-O2", "-std=c++20", f"-I{include_dir}", + os.path.join(ROOT, "scripts", "book_figures_trace.cpp"), "-o", exe], + check=True) + + +def run_trace(exe, *args): + out = subprocess.run([exe] + [str(a) for a in args], + check=True, capture_output=True, text=True).stdout + rows = [line.split(",") for line in out.strip().splitlines()[1:]] + a = np.array(rows, dtype=float) + return {"t": a[:, 0], "fill": a[:, 1], "state": a[:, 2], + "ppm": a[:, 3], "underruns": a[:, 4]} + + +def fig_servo_lock(head_exe): + # 1-frame pushes: the long tests' methodology — block-quantized pushes + # would hide the 200 ppm surplus in one 32-frame lump every ~3.3 s. 
+ tr = run_trace(head_exe, 32, 1, 200, 45, 28.0, 0.05) + fig, (axf, axp) = plt.subplots( + 2, 1, figsize=(7.0, 4.8), sharex=True, layout="constrained") + + state = tr["state"] + t_lock1 = tr["t"][np.argmax(state == 2)] + i_stall = int(np.searchsorted(tr["underruns"], 0.5)) + t_stall = tr["t"][i_stall] + after_stall = (tr["t"] > t_stall) & (state == 2) + t_lock2 = tr["t"][np.argmax(after_stall)] + + axf.plot(tr["t"], tr["fill"], color=BLUE, lw=1.2) + axf.axhline(48, color=BASELINE, lw=0.75, zorder=0) + axf.annotate("setpoint 48", (44.8, 48), xytext=(0, 5), + textcoords="offset points", color=MUTED, fontsize=8, ha="right") + axf.set_ylim(46.6, 50.6) + axf.annotate(f"cold start: Locked in {t_lock1:.2f} s", + (t_lock1, 50.0), xytext=(2.5, 50.0), textcoords="data", + color=SECONDARY, fontsize=8, + arrowprops=dict(arrowstyle="-", color=SECONDARY, lw=0.75)) + axf.annotate("50 ms producer stall → refill → " + f"re-Locked {t_lock2 - t_stall:.2f} s later", + (t_stall, 50.0), xytext=(30.5, 50.0), textcoords="data", + color=SECONDARY, fontsize=8, + arrowprops=dict(arrowstyle="-", color=SECONDARY, lw=0.75)) + axf.set_ylabel("FIFO occupancy (frames)") + axf.set_title("Acquire, lock, dropout, re-lock (measured, producer +200 ppm)") + + axp.plot(tr["t"], tr["ppm"], color=BLUE, lw=1.2) + for y in (1500, -1500): + axp.axhline(y, color=BASELINE, lw=0.75, zorder=0) + axp.axhline(200, color=BASELINE, lw=0.75, zorder=0) + axp.annotate("true offset 200 ppm", (44.8, 200), xytext=(0, 5), + textcoords="offset points", color=MUTED, fontsize=8, ha="right") + axp.annotate("servo clamp ±1500 ppm", (44.8, 1500), xytext=(0, -10), + textcoords="offset points", color=MUTED, fontsize=8, + ha="right", va="top") + axp.annotate("Acquiring (10 Hz) rings against the clamp\n" + "on the ±1-frame quantized occupancy", + (2.7, 900), color=SECONDARY, fontsize=8, ha="left") + axp.annotate("Locked (1 Hz): settles on the true offset", + (14, 480), color=SECONDARY, fontsize=8, ha="left") + axp.set_ylim(-1750, 
1950) + axp.set_ylabel("estimated ppm") + axp.set_xlabel("time (s)") + + # hairlines at the recorded stage transitions + for i in np.flatnonzero(np.diff(state) != 0): + for ax in (axf, axp): + ax.axvline(tr["t"][i + 1], color=GRID, lw=0.75, zorder=0) + for ax in (axf, axp): + despine(ax) + save(fig, "servo-lock") + plt.close(fig) + return tr + + +def fig_feasibility(head_exe, prefix_exe): + before = run_trace(prefix_exe, 64, 32, 200, 6) + after = run_trace(head_exe, 64, 32, 200, 6) + fig, (axb, axa) = plt.subplots( + 2, 1, figsize=(7.0, 4.8), sharex=True, sharey=True, layout="constrained") + + axb.plot(before["t"], before["fill"], color=RED, lw=1.2) + hits = np.flatnonzero(np.diff(before["underruns"]) > 0) + axb.plot(before["t"][hits + 1], before["fill"][hits + 1], "o", + ms=4.5, color=RED, mec=SURFACE, mew=1.0, ls="none") + axb.annotate(f"{int(before['underruns'][-1])} underruns in 6 s — " + "one every ~0.25 s, forever", + (1.6, 116), color=SECONDARY, fontsize=8.5, ha="left") + axb.axhline(48, color=BASELINE, lw=0.75, zorder=0) + axb.set_title(f"Before (commit {PREFIX_COMMIT}, measured): " + "pull(64) against setpoint 48") + axb.set_ylabel("FIFO occupancy (frames)") + + axa.plot(after["t"], after["fill"], color=BLUE, lw=1.2) + axa.axhline(48, color=BASELINE, lw=0.75, zorder=0) + axa.axhline(96, color=BASELINE, lw=0.75, zorder=0) + axa.annotate("configured setpoint 48", (5.95, 48), xytext=(0, 5), + textcoords="offset points", color=MUTED, fontsize=8, ha="right", + bbox=dict(fc=SURFACE, ec="none", pad=1.0)) + axa.annotate("effective setpoint 96 = 64 + 64/2, raised on first pull", + (5.95, 96), xytext=(0, 5), textcoords="offset points", + color=MUTED, fontsize=8, ha="right", + bbox=dict(fc=SURFACE, ec="none", pad=1.0)) + axa.set_title(f"After (HEAD, measured): {int(after['underruns'][-1])} underruns, " + "servo regulates the raised setpoint") + axa.set_ylabel("FIFO occupancy (frames)") + axa.set_xlabel("time (s)") + axa.set_ylim(38, 132) + for ax in (axb, axa): + 
despine(ax) + save(fig, "feasibility") + plt.close(fig) + return before, after + + +# --- the Q0.64 wraparound, run with the real modular arithmetic --- + +def fig_q064(): + M = 1 << 64 + eps_mag = 0.09 # exaggerated so the wrap is visible; real |eps| ~ 2e-4 + fig, axes = plt.subplots( + 1, 2, figsize=(7.0, 3.0), sharey=True, layout="constrained") + for ax, sign, title, note in ( + (axes[0], +1, "ε > 0: wrap past 1.0 → advance 2", + "consume one extra input frame"), + (axes[1], -1, "ε < 0: wrap below 0.0 → advance 0", + "re-use the current window"), + ): + eps_fix = int(sign * eps_mag * M) % M # two's-complement, like the C++ + phase, mu = 0 if sign > 0 else int(0.5 * M), [] + wraps = [] + for n in range(26): + m = (phase + eps_fix) % M + if sign > 0 and m < phase: + wraps.append(n) + if sign < 0 and m > phase: + wraps.append(n) + phase = m + mu.append(phase / M) + n = np.arange(26) + mu = np.array(mu) + ax.plot(n, mu, color=BLUE, lw=1.2, marker="o", ms=4.5, + mec=SURFACE, mew=1.0) + for w in wraps: + ax.plot([w], [mu[w]], "o", ms=6, color=BLUE, mec=SURFACE, mew=1.0) + w0 = wraps[0] # annotate the first wrap only; the rest just repeat + ax.annotate(note, (w0, mu[w0]), xytext=(8, 16 * sign), + textcoords="offset points", color=SECONDARY, fontsize=8, + bbox=dict(fc=SURFACE, ec="none", pad=1.0), + arrowprops=dict(arrowstyle="-", color=SECONDARY, lw=0.75)) + ax.set_title(title, fontsize=9) + ax.set_xlabel("output frame n") + ax.set_ylim(-0.06, 1.06) + despine(ax) + axes[0].set_ylabel("phase μ = phase_ / 2⁶⁴") + fig.suptitle("The Q0.64 accumulator slips by wrapping (ε exaggerated to 0.09; real |ε| ≈ 2×10⁻⁴)", + fontsize=9.5, color=INK) + save(fig, "q064-slip") + plt.close(fig) + + +# --- the architecture diagram (drawn) --- + +def fig_architecture(): + fig, ax = plt.subplots(figsize=(8.2, 3.6), layout="constrained") + ax.set_xlim(0, 100) + ax.set_ylim(0, 44) + ax.axis("off") + + # clock-domain washes + for x0, x1, color, label in ((0, 33, BLUE, "input clock domain 
(producer)"), + (52, 100, AQUA, "output clock domain (consumer)")): + ax.add_patch(FancyBboxPatch((x0 + 0.5, 1), x1 - x0 - 1, 42, + boxstyle="round,pad=0,rounding_size=1.5", + fc=color, ec="none", alpha=0.08)) + ax.text((x0 + x1) / 2, 2.6, label, ha="center", color=SECONDARY, + fontsize=8) + + def box(x, y, w, h, title, sub=None, weight="bold"): + ax.add_patch(FancyBboxPatch((x, y), w, h, + boxstyle="round,pad=0,rounding_size=1.2", + fc=SURFACE, ec=BASELINE, lw=1.0)) + cy = y + h / 2 + (1.6 if sub else 0) + ax.text(x + w / 2, cy, title, ha="center", va="center", + color=INK, fontsize=8.6, fontweight=weight) + if sub: + ax.text(x + w / 2, cy - 3.8, sub, ha="center", va="center", + color=SECONDARY, fontsize=7.6) + + def arrow(p, q, label=None, dy=1.4, style="-|>"): + ax.add_patch(FancyArrowPatch(p, q, arrowstyle=style, color=SECONDARY, + lw=1.1, mutation_scale=9, + shrinkA=1, shrinkB=1)) + if label: + ax.text((p[0] + q[0]) / 2, (p[1] + q[1]) / 2 + dy, label, + ha="center", color=SECONDARY, fontsize=7.6) + + box(3, 24, 16, 10, "producer", "audio callback / core 0") + box(36, 24, 15, 10, "SpscRing", "interleaved frames") + box(60, 7, 16, 10, "PiServo", "occupancy → ε̂") + box(57, 24, 22, 10, "FractionalResampler", "polyphase bank + Q0.64 phase") + box(84, 24, 13, 10, "consumer", "core 1 / thread") + + arrow((19, 29), (36, 29), "push()") + arrow((51, 29), (57, 29), "pop") + arrow((79, 29), (84, 29), "pull()") + # occupancy: ring bottom, down and across to the servo + arrow((43.5, 24), (43.5, 12), None, style="-") + arrow((43.5, 12), (60, 12), None) + ax.text(51.5, 13.4, "occupancy", ha="center", color=SECONDARY, fontsize=7.6) + # rate estimate: servo top, up into the resampler + arrow((70, 17), (70, 24), None) + ax.text(71.5, 20.2, "ε̂ (rate estimate)", ha="left", color=SECONDARY, + fontsize=7.6) + ax.text(50, 41.5, "one passive object, two callers — the converter owns no threads", + ha="center", color=SECONDARY, fontsize=8.4) + save(fig, "architecture") + 
plt.close(fig) + + +def main(): + os.makedirs(OUT, exist_ok=True) + fig_kaiser_window() + fig_kaiser_response() + fig_q064() + fig_architecture() + + with tempfile.TemporaryDirectory() as tmp: + head_exe = os.path.join(tmp, "trace_head") + build_trace_tool(os.path.join(ROOT, "include"), head_exe) + prefix_tree = os.path.join(tmp, "prefix") + os.makedirs(prefix_tree) + archive = subprocess.run(["git", "-C", ROOT, "archive", PREFIX_COMMIT, "include"], + check=True, capture_output=True).stdout + subprocess.run(["tar", "-x", "-C", prefix_tree], input=archive, check=True) + prefix_exe = os.path.join(tmp, "trace_prefix") + build_trace_tool(os.path.join(prefix_tree, "include"), prefix_exe) + + tr = fig_servo_lock(head_exe) + before, after = fig_feasibility(head_exe, prefix_exe) + + print(f"servo: locked at t={tr['t'][np.argmax(tr['state'] == 2)]:.1f}s, " + f"final ppm {tr['ppm'][-1]:.1f}, underruns {int(tr['underruns'][-1])}") + print(f"feasibility: before {int(before['underruns'][-1])} underruns/6s, " + f"after {int(after['underruns'][-1])}") + print(f"wrote 6 SVGs to {OUT}") + + +if __name__ == "__main__": + main() diff --git a/scripts/book_figures_trace.cpp b/scripts/book_figures_trace.cpp new file mode 100644 index 0000000..22e58f9 --- /dev/null +++ b/scripts/book_figures_trace.cpp @@ -0,0 +1,63 @@ +// Trace dumper for the book's measured figures (scripts/book_figures.py). +// +// Runs the converter in deterministic virtual time — the same event-driven +// two-clock scheme as tests/support/two_clock_sim.hpp — and prints one CSV +// row per pull: t,fill,state,ppm,underruns. book_figures.py compiles this +// file twice, once against the current include/ tree and once against the +// tree of the last pre-feasibility-fix commit, so the before/after figure +// in the composition chapter is measured on both sides of the fix, not +// modeled. Only Status fields that exist in both versions are printed. 
+// +// Usage: trace pullBlock pushBlock ppm seconds [dropStart dropDur] +#include +#include +#include +#include +#include + +#include + +int main(int argc, char** argv) { + if (argc < 5) { + std::fprintf(stderr, "usage: %s pullBlock pushBlock ppm seconds [dropStart dropDur]\n", + argv[0]); + return 2; + } + const std::size_t pullBlock = static_cast(std::atol(argv[1])); + const std::size_t pushBlock = static_cast(std::atol(argv[2])); + const double ppm = std::atof(argv[3]); + const double seconds = std::atof(argv[4]); + const double dropStart = argc > 5 ? std::atof(argv[5]) : -1.0; + const double dropDur = argc > 6 ? std::atof(argv[6]) : 0.0; + + srt::Config cfg; + cfg.channels = 1; + srt::AsyncSampleRateConverter conv(cfg); + + const double fsOut = cfg.sampleRateHz; + const double fsIn = fsOut * (1.0 + ppm * 1e-6); // producer's crystal + std::vector in(pushBlock), out(pullBlock); + + double tPush = 0.0, tPull = 0.0, phase = 0.0; + const double dPhase = 2.0 * std::numbers::pi * 997.0 / fsIn; + std::puts("t,fill,state,ppm,underruns"); + while (tPull < seconds) { + if (tPush <= tPull) { + if (!(tPush >= dropStart && tPush < dropStart + dropDur)) { + for (auto& v : in) { + v = 0.5f * static_cast(std::sin(phase)); + phase += dPhase; + } + conv.push(in.data(), pushBlock); + } + tPush += static_cast(pushBlock) / fsIn; + continue; + } + conv.pull(out.data(), pullBlock); + tPull += static_cast(pullBlock) / fsOut; + const srt::Status s = conv.status(); + std::printf("%.6f,%.2f,%d,%.2f,%llu\n", tPull, s.fifoFillFrames, static_cast(s.state), + s.ppm, static_cast(s.underruns)); + } + return 0; +} From 57181fcf8aaecbbf60be2f793c0b8842060e1d1c Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 1 Jul 2026 22:27:11 +0000 Subject: [PATCH 13/16] README: point at the book https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9 --- README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/README.md b/README.md index 4cbba32..8574c4b 100644 --- a/README.md +++ 
b/README.md @@ -74,6 +74,29 @@ there are no install/package rules yet. Version 0.1.0 (`SRT_VERSION_*` in `srt/srt.hpp`, `srt_version()` over the C ABI); pre-1.0, the API may still change between versions. +## The book + +The repository includes a full-length tutorial book (`book/`) that walks +every header file line by line — the DSP, the C++ idioms chosen and +rejected, how thread safety works, how the servo was tuned, and the +optimization campaign with its dead ends preserved. It is written for a +reader learning C++, DSP, and real-time concurrency with this converter as +the running example. Three mechanical commitments keep it honest: every +code excerpt is included live from the actual headers at build time (CI +fails on a stale anchor), every figure is regenerated from the same math +and measured traces by `scripts/book_figures.py`, and every chapter ends +with runnable commands that reproduce its claims. + +Build it with [mdBook](https://rust-lang.github.io/mdBook/) (CI pins +v0.4.40): + +```sh +mdbook build book # or: mdbook serve book --open +``` + +Start at `book/src/SUMMARY.md` for the table of contents, or read the +sources directly — they are plain Markdown. + ## How it works The design follows the classic commercial-ASRC architecture (AD1896-style From 0d68d697f8f278d15041fbb8a21acabfa2cf5414 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 1 Jul 2026 22:35:12 +0000 Subject: [PATCH 14/16] book: retitle, and an epigraph for every chapter The book is now 'SampleRateTap: The Story of an Asynchronous Sample Rate Converter'. Each of the 24 pages opens with a quote matched to its theme (Heraclitus for the two crystals, Maxwell's On Governors for the servo, Kronecker for Q0.64, Dijkstra for the tests, the full Knuth quote for C1-C2, Postel for the C ABI, ...). Uncertain attributions are marked 'attributed to'. 
https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9 --- book/book.toml | 2 +- book/src/appendix/bibliography.md | 4 ++++ book/src/appendix/cpp-decisions.md | 4 ++++ book/src/appendix/glossary.md | 4 ++++ book/src/introduction.md | 4 ++++ book/src/part0/budgets.md | 4 ++++ book/src/part0/two-crystals.md | 4 ++++ book/src/part1/asrc.md | 4 ++++ book/src/part1/fractional-resampler.md | 4 ++++ book/src/part1/kaiser.md | 4 ++++ book/src/part1/pi-servo.md | 4 ++++ book/src/part1/polyphase-bank.md | 4 ++++ book/src/part1/sample-traits.md | 4 ++++ book/src/part1/spsc-ring.md | 4 ++++ book/src/part2/icount.md | 4 ++++ book/src/part2/notebooks.md | 4 ++++ book/src/part2/tests.md | 4 ++++ book/src/part3/c1-c2.md | 4 ++++ book/src/part3/c3-c5.md | 4 ++++ book/src/part3/c6.md | 4 ++++ book/src/part4/c-abi.md | 4 ++++ book/src/part4/cortex-m.md | 4 ++++ book/src/part4/hexagon.md | 4 ++++ book/src/part5/hardware.md | 4 ++++ book/src/part5/scaling.md | 4 ++++ 25 files changed, 97 insertions(+), 1 deletion(-) diff --git a/book/book.toml b/book/book.toml index ea6834d..fe2e72b 100644 --- a/book/book.toml +++ b/book/book.toml @@ -1,5 +1,5 @@ [book] -title = "Inside SampleRateTap" +title = "SampleRateTap: The Story of an Asynchronous Sample Rate Converter" description = "A working tour of a real-time asynchronous sample rate converter: the DSP, the C++, the concurrency, and the measurements that hold it together." authors = ["The SampleRateTap project"] language = "en" diff --git a/book/src/appendix/bibliography.md b/book/src/appendix/bibliography.md index 62fb056..34a1700 100644 --- a/book/src/appendix/bibliography.md +++ b/book/src/appendix/bibliography.md @@ -1,5 +1,9 @@ # Appendix C: Annotated bibliography +> If I have seen further it is by standing on the shoulders of giants. +> +> — Isaac Newton, letter to Robert Hooke + This project's provenance statement is short: all code implements long-published methods, and no third-party source was copied. 
This appendix lists those methods' sources — plus the tools and competitors the diff --git a/book/src/appendix/cpp-decisions.md b/book/src/appendix/cpp-decisions.md index 56ab9df..c1744db 100644 --- a/book/src/appendix/cpp-decisions.md +++ b/book/src/appendix/cpp-decisions.md @@ -1,5 +1,9 @@ # Appendix A: The C++ decision log +> There are only two kinds of languages: the ones people complain about and the ones nobody uses. +> +> — Bjarne Stroustrup + Every chapter of this book has defended C++ decisions in passing, in the context that made them necessary. This appendix collects them in one place, in one format: the decision, what was rejected, why, and where in the diff --git a/book/src/appendix/glossary.md b/book/src/appendix/glossary.md index 953ba34..c767f1a 100644 --- a/book/src/appendix/glossary.md +++ b/book/src/appendix/glossary.md @@ -1,5 +1,9 @@ # Appendix B: Glossary +> The limits of my language mean the limits of my world. +> +> — Ludwig Wittgenstein, *Tractatus Logico-Philosophicus* + Terms of art as this book uses them. Where the general meaning and this project's usage differ, the entry gives the project's. diff --git a/book/src/introduction.md b/book/src/introduction.md index 1916add..ac5da99 100644 --- a/book/src/introduction.md +++ b/book/src/introduction.md @@ -1,5 +1,9 @@ # Introduction +> Talk is cheap. Show me the code. +> +> — Linus Torvalds + This book explains one piece of software completely. The software is **SampleRateTap**, a header-only C++20 library that solves a diff --git a/book/src/part0/budgets.md b/book/src/part0/budgets.md index cae9d0e..add62e8 100644 --- a/book/src/part0/budgets.md +++ b/book/src/part0/budgets.md @@ -1,5 +1,9 @@ # Budgets: latency, quality, compute +> Perfection is achieved, not when there is nothing more to add, but when there is nothing left to take away. 
+> +> — Antoine de Saint-Exupéry, *Wind, Sand and Stars* + The previous chapter ended with three words used as if they were self-explanatory: latency, quality, compute. This chapter turns each into a number with a derivation behind it, because everything in Part I is an diff --git a/book/src/part0/two-crystals.md b/book/src/part0/two-crystals.md index fdb1537..d9d8696 100644 --- a/book/src/part0/two-crystals.md +++ b/book/src/part0/two-crystals.md @@ -1,5 +1,9 @@ # Two crystals, one stream +> No man ever steps in the same river twice, for it is not the same river and he is not the same man. +> +> — attributed to Heraclitus + Every specification of this library begins with a lie that the audio industry tells itself daily: "48 kHz." diff --git a/book/src/part1/asrc.md b/book/src/part1/asrc.md index 82453dd..45a676e 100644 --- a/book/src/part1/asrc.md +++ b/book/src/part1/asrc.md @@ -1,5 +1,9 @@ # Composition: `asrc.hpp` +> The whole is something beside the parts. +> +> — Aristotle, *Metaphysics* + Every previous chapter built a component that is correct on its own terms. This chapter is about the file that has no terms of its own: `asrc.hpp` contains almost no algorithm, no mathematics, and fewer than three hundred diff --git a/book/src/part1/fractional-resampler.md b/book/src/part1/fractional-resampler.md index 5e9a313..a8bf3ce 100644 --- a/book/src/part1/fractional-resampler.md +++ b/book/src/part1/fractional-resampler.md @@ -1,5 +1,9 @@ # The fractional resampler +> God made the integers; all else is the work of man. +> +> — Leopold Kronecker + The servo chapter ended with a number: ε̂, the rate-deviation estimate, delivered once per output block. This chapter spends it. diff --git a/book/src/part1/kaiser.md b/book/src/part1/kaiser.md index de66bf5..be64433 100644 --- a/book/src/part1/kaiser.md +++ b/book/src/part1/kaiser.md @@ -1,5 +1,9 @@ # Designing the filter: `kaiser.hpp` +> The purpose of computing is insight, not numbers. 
+> +> — Richard Hamming + This is the only file in the library that runs exactly once per converter, and it decides the quality ceiling of everything downstream. Every output sample the converter will ever produce is a dot product against diff --git a/book/src/part1/pi-servo.md b/book/src/part1/pi-servo.md index 36fae99..b407866 100644 --- a/book/src/part1/pi-servo.md +++ b/book/src/part1/pi-servo.md @@ -1,5 +1,9 @@ # The clock servo: `pi_servo.hpp` +> A governor is a part of a machine by means of which the velocity of the machine is kept nearly uniform, notwithstanding variations in the driving-power or the resistance. +> +> — James Clerk Maxwell, *On Governors* (1868) + There is a number this entire library exists to find, and nobody will tell it to us. diff --git a/book/src/part1/polyphase-bank.md b/book/src/part1/polyphase-bank.md index ef3292a..a9c07a6 100644 --- a/book/src/part1/polyphase-bank.md +++ b/book/src/part1/polyphase-bank.md @@ -1,5 +1,9 @@ # The polyphase bank +> Show me your flowcharts and conceal your tables, and I shall continue to be mystified. Show me your tables, and I won't usually need your flowcharts; they'll be obvious. +> +> — Fred Brooks, *The Mythical Man-Month* + The previous chapter ended with a prototype filter: 12,288 double-precision coefficients (for the default preset) describing one ideal anti-imaging lowpass, oversampled 256× against the input rate. This chapter is about a diff --git a/book/src/part1/sample-traits.md b/book/src/part1/sample-traits.md index 1a2d2c9..0347ff9 100644 --- a/book/src/part1/sample-traits.md +++ b/book/src/part1/sample-traits.md @@ -1,5 +1,9 @@ # Sample types as a customization point: `sample_traits.hpp` +> Make illegal states unrepresentable. +> +> — Yaron Minsky + The polyphase machinery of the last two chapters computes one thing: a dot product between a window of input samples and an interpolated row of filter coefficients. 
The problem is that this library ships to machines that do not diff --git a/book/src/part1/spsc-ring.md b/book/src/part1/spsc-ring.md index cc909ce..15419f1 100644 --- a/book/src/part1/spsc-ring.md +++ b/book/src/part1/spsc-ring.md @@ -1,5 +1,9 @@ # The lock-free ring: `spsc_ring.hpp` +> Time is what keeps everything from happening at once. +> +> — Ray Cummings + Every other component in this library is mathematics. This one is physics. The converter's whole purpose is to sit between two threads that must never diff --git a/book/src/part2/icount.md b/book/src/part2/icount.md index 70501b1..17fc574 100644 --- a/book/src/part2/icount.md +++ b/book/src/part2/icount.md @@ -1,5 +1,9 @@ # Counting instructions, deterministically +> When you can measure what you are speaking about, and express it in numbers, you know something about it; but when you cannot measure it, when you cannot express it in numbers, your knowledge is of a meagre and unsatisfactory kind. +> +> — Lord Kelvin + The optimization campaign of Part III makes claims like "−5.3% on the M55 Q15 pipeline" and expects you to believe the decimal point. This chapter is about the machinery that makes such a decimal point *mean* something — and diff --git a/book/src/part2/notebooks.md b/book/src/part2/notebooks.md index 1b74152..c9b2b2f 100644 --- a/book/src/part2/notebooks.md +++ b/book/src/part2/notebooks.md @@ -1,5 +1,9 @@ # Notebooks as calibrated instruments +> The first principle is that you must not fool yourself — and you are the easiest person to fool. +> +> — Richard Feynman + The previous two chapters covered claims a machine can gate: thresholds in tests, instruction counts in a ratchet. But some of this project's most consequential claims are not pass/fail propositions. 
*How much worse is a diff --git a/book/src/part2/tests.md b/book/src/part2/tests.md index 85d6e6b..51c4721 100644 --- a/book/src/part2/tests.md +++ b/book/src/part2/tests.md @@ -1,5 +1,9 @@ # Tests as specifications +> Program testing can be used to show the presence of bugs, but never to show their absence! +> +> — Edsger W. Dijkstra + Part I ended each chapter with a list of tests. This chapter is about what those tests actually *are* — because in this project they are not the usual smoke detectors bolted on after the fact. They are the specification. The diff --git a/book/src/part3/c1-c2.md b/book/src/part3/c1-c2.md index bbed89e..f060f86 100644 --- a/book/src/part3/c1-c2.md +++ b/book/src/part3/c1-c2.md @@ -1,5 +1,9 @@ # Profile first, claim later (C1–C2) +> We should forget about small efficiencies, say about 97% of the time: premature optimization is the root of all evil. Yet we should not pass up our opportunities in that critical 3%. +> +> — Donald Knuth + Part III is a story, told in the order it happened. The introduction promised six optimization efforts — four wins, one honest draw, one deliberate revert — and the next three chapters deliver them with the real numbers, including the diff --git a/book/src/part3/c3-c5.md b/book/src/part3/c3-c5.md index 8b56db6..f497a71 100644 --- a/book/src/part3/c3-c5.md +++ b/book/src/part3/c3-c5.md @@ -1,5 +1,9 @@ # The integer phase and the wide MACs (C3–C5) +> If it disagrees with experiment, it is wrong. In that simple statement is the key to science. +> +> — Richard Feynman + The previous chapter ended with an anomaly: C1 stripped most of the per-channel blend work out of the datapath, the M55 pipelines dropped by double digits, and Hexagon's barely moved. 
The diagnosis written into the diff --git a/book/src/part3/c6.md b/book/src/part3/c6.md index 2bb7083..a4da423 100644 --- a/book/src/part3/c6.md +++ b/book/src/part3/c6.md @@ -1,5 +1,9 @@ # The channel axis (C6) +> If you were plowing a field, which would you rather use: two strong oxen or 1024 chickens? +> +> — attributed to Seymour Cray + The campaign's last effort began with an inheritance and a constraint. The inheritance came from the C5 revert: per-channel dot products are the diff --git a/book/src/part4/c-abi.md b/book/src/part4/c-abi.md index 3eadf9f..2c2d6a6 100644 --- a/book/src/part4/c-abi.md +++ b/book/src/part4/c-abi.md @@ -1,5 +1,9 @@ # The C ABI +> Be conservative in what you do, be liberal in what you accept from others. +> +> — Jon Postel, RFC 761 + This chapter exists because of a plot. Part II's notebooks are the library's most persuasive evidence — the servo locking from a cold start, the 135 dB money plot, the naive-FIFO spectrogram full of clicks — and diff --git a/book/src/part4/cortex-m.md b/book/src/part4/cortex-m.md index 33ecd3e..9701be3 100644 --- a/book/src/part4/cortex-m.md +++ b/book/src/part4/cortex-m.md @@ -1,5 +1,9 @@ # Cortex-M: bare metal, two ways +> Civilization advances by extending the number of important operations which we can perform without thinking about them. +> +> — Alfred North Whitehead + The Hexagon port ran the library on a strange ISA under a familiar OS. The Cortex-M ports remove the OS. No loader, no threads, no filesystem, no `argv`, no reliable way to even return an exit code — and the library diff --git a/book/src/part4/hexagon.md b/book/src/part4/hexagon.md index 6a95749..dc8f2ea 100644 --- a/book/src/part4/hexagon.md +++ b/book/src/part4/hexagon.md @@ -1,5 +1,9 @@ # Hexagon: a DSP that keeps secrets +> Trust, but verify. 
+> +> — Russian proverb + Every portability chapter in this part answers the same question: what did the target force the library to learn that no amount of host testing could have taught it? Hexagon — Qualcomm's DSP architecture, the kind of core diff --git a/book/src/part5/hardware.md b/book/src/part5/hardware.md index 554ea06..1e705b5 100644 --- a/book/src/part5/hardware.md +++ b/book/src/part5/hardware.md @@ -1,5 +1,9 @@ # Real clocks: bridges and firmware +> In theory, there is no difference between theory and practice. In practice, there is. +> +> — attributed to Yogi Berra, among others + Everything measured so far in this book — the 135 dB residual, the lock in ~1 s, the drift ramp tracked without unlocking — came out of a simulation. A good one: deterministic, sample-granular, reproducible to the bit, able diff --git a/book/src/part5/scaling.md b/book/src/part5/scaling.md index 2f783c3..1651156 100644 --- a/book/src/part5/scaling.md +++ b/book/src/part5/scaling.md @@ -1,5 +1,9 @@ # Channels, rates, and the rules that scale +> For every type of animal there is a most convenient size, and a large change in size inevitably carries with it a change of form. +> +> — J. B. S. Haldane, *On Being the Right Size* + Every measured number in this book so far was taken at one operating point: 48 kHz, one or two channels, fine-grained transfer. Real deployments move along three axes away from that point — more **channels**, a different From acfa63a2895dbc8b0967db1471c607efca8ac432 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 1 Jul 2026 23:41:00 +0000 Subject: [PATCH 15/16] book: three epigraph swaps toward the philosophical Notebooks gets 'lies, damned lies, and statistics' (its five traps are that quote, measured); sample_traits gets the Heart Sutra (the primary template is deliberately empty, the specializations are its form); hardware trades the theory/practice aphorism for the Zen woodcutter. Also resolves the double-Feynman. 
https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9 --- book/src/part1/sample-traits.md | 4 ++-- book/src/part2/notebooks.md | 4 ++-- book/src/part5/hardware.md | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/book/src/part1/sample-traits.md b/book/src/part1/sample-traits.md index 0347ff9..31f5902 100644 --- a/book/src/part1/sample-traits.md +++ b/book/src/part1/sample-traits.md @@ -1,8 +1,8 @@ # Sample types as a customization point: `sample_traits.hpp` -> Make illegal states unrepresentable. +> Form is exactly emptiness; emptiness is exactly form. > -> — Yaron Minsky +> — the Heart Sutra The polyphase machinery of the last two chapters computes one thing: a dot product between a window of input samples and an interpolated row of filter diff --git a/book/src/part2/notebooks.md b/book/src/part2/notebooks.md index c9b2b2f..a9441b6 100644 --- a/book/src/part2/notebooks.md +++ b/book/src/part2/notebooks.md @@ -1,8 +1,8 @@ # Notebooks as calibrated instruments -> The first principle is that you must not fool yourself — and you are the easiest person to fool. +> There are three kinds of lies: lies, damned lies, and statistics. > -> — Richard Feynman +> — popularized by Mark Twain, who credited Benjamin Disraeli The previous two chapters covered claims a machine can gate: thresholds in tests, instruction counts in a ratchet. But some of this project's most diff --git a/book/src/part5/hardware.md b/book/src/part5/hardware.md index 1e705b5..fe120e9 100644 --- a/book/src/part5/hardware.md +++ b/book/src/part5/hardware.md @@ -1,8 +1,8 @@ # Real clocks: bridges and firmware -> In theory, there is no difference between theory and practice. In practice, there is. +> Before enlightenment: chop wood, carry water. After enlightenment: chop wood, carry water. 
> -> — attributed to Yogi Berra, among others +> — Zen proverb Everything measured so far in this book — the 135 dB residual, the lock in ~1 s, the drift ramp tracked without unlocking — came out of a simulation. From dbd651a68f1e150f3594d17586964e9ab4a9afa8 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 1 Jul 2026 23:51:34 +0000 Subject: [PATCH 16/16] book: publish to GitHub Pages from main New book-pages workflow: builds with the pinned mdBook and deploys book/book to GitHub Pages (public site, deliberate) on pushes to main that touch the book or any excerpt source, plus workflow_dispatch. book.toml gains site-url for the /SampleRateTap/ path; README links the published site. https://claude.ai/code/session_01HuAFfoeD5a5Xe5aGNA16M9 --- .github/workflows/book-pages.yml | 74 ++++++++++++++++++++++++++++++++ README.md | 6 ++- book/book.toml | 1 + 3 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/book-pages.yml diff --git a/.github/workflows/book-pages.yml b/.github/workflows/book-pages.yml new file mode 100644 index 0000000..f53db92 --- /dev/null +++ b/.github/workflows/book-pages.yml @@ -0,0 +1,74 @@ +# Publishes the book to GitHub Pages on every push to main. +# +# The site is PUBLIC (https://tap.github.io/SampleRateTap/) even though the +# repository is private; it contains the book's code excerpts by design. +# One-time setup if the first run complains: Settings -> Pages -> Source: +# "GitHub Actions" (configure-pages below attempts to enable it itself). 
+name: book-pages + +on: + push: + branches: [main] + paths: + - "book/**" + - "include/**" + - "platform/**" + - "tests/support/**" + - "tools/**" + - "cmake/**" + - ".github/workflows/book-pages.yml" + workflow_dispatch: + +permissions: + contents: read + pages: write + id-token: write + +concurrency: + group: pages + cancel-in-progress: false + +env: + MDBOOK_URL: https://github.com/rust-lang/mdBook/releases/download/v0.4.40/mdbook-v0.4.40-x86_64-unknown-linux-gnu.tar.gz + MDBOOK_SHA256: "9ef07fd288ba58ff3b99d1c94e6d414d431c9a61fdb20348e5beb74b823d546b" + +jobs: + deploy: + runs-on: ubuntu-latest + timeout-minutes: 10 + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 + + - name: Install mdBook (pinned) + run: | + curl -sfLo /tmp/mdbook.tar.gz "$MDBOOK_URL" + actual=$(sha256sum /tmp/mdbook.tar.gz | cut -d' ' -f1) + if [ "$actual" != "$MDBOOK_SHA256" ]; then + echo "::error::mdbook checksum mismatch"; exit 1 + fi + tar -xzf /tmp/mdbook.tar.gz -C /tmp + + - name: Build (warnings are errors) + run: | + /tmp/mdbook build book 2>&1 | tee /tmp/book-build.log + if grep -qiE 'warning|error' /tmp/book-build.log; then + echo "::error::mdbook reported warnings/errors" + exit 1 + fi + + - name: Configure Pages + uses: actions/configure-pages@983d7736d9b0ae728b81ab479565c72886d7745b # v5 + with: + enablement: true + + - name: Upload site + uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa # v3 + with: + path: book/book + + - name: Deploy + id: deployment + uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4 diff --git a/README.md b/README.md index 8574c4b..a15065c 100644 --- a/README.md +++ b/README.md @@ -87,8 +87,10 @@ fails on a stale anchor), every figure is regenerated from the same math and measured traces by `scripts/book_figures.py`, and every chapter ends with runnable commands that reproduce its claims. 
-Build it with [mdBook](https://rust-lang.github.io/mdBook/) (CI pins -v0.4.40): +Read it at **<https://tap.github.io/SampleRateTap/>** (published from +`main` by the `book-pages` workflow; note the site is public even though +this repository is private), or build it locally with +[mdBook](https://rust-lang.github.io/mdBook/) (CI pins v0.4.40): ```sh mdbook build book # or: mdbook serve book --open diff --git a/book/book.toml b/book/book.toml index fe2e72b..2765989 100644 --- a/book/book.toml +++ b/book/book.toml @@ -11,3 +11,4 @@ create-missing = false [output.html] default-theme = "rust" git-repository-url = "https://github.com/tap/SampleRateTap" +site-url = "/SampleRateTap/"