diff --git a/.github/workflows/book-pages.yml b/.github/workflows/book-pages.yml
new file mode 100644
index 0000000..f53db92
--- /dev/null
+++ b/.github/workflows/book-pages.yml
@@ -0,0 +1,74 @@
+# Publishes the book to GitHub Pages on every push to main.
+#
+# The site is PUBLIC (https://tap.github.io/SampleRateTap/) even though the
+# repository is private; it contains the book's code excerpts by design.
+# One-time setup if the first run complains: Settings -> Pages -> Source:
+# "GitHub Actions" (configure-pages below attempts to enable it itself).
+name: book-pages
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - "book/**"
+      - "include/**"
+      - "platform/**"
+      - "tests/support/**"
+      - "tools/**"
+      - "cmake/**"
+      - ".github/workflows/book-pages.yml"
+  workflow_dispatch:  # also runnable by hand from the Actions tab
+
+permissions:
+  contents: read
+  pages: write  # needed by actions/deploy-pages to publish the site
+  id-token: write  # OIDC token that deploy-pages presents when deploying
+
+concurrency:
+  group: pages
+  cancel-in-progress: false  # let an in-flight deployment finish rather than cancel it
+
+env:
+  MDBOOK_URL: https://github.com/rust-lang/mdBook/releases/download/v0.4.40/mdbook-v0.4.40-x86_64-unknown-linux-gnu.tar.gz
+  MDBOOK_SHA256: "9ef07fd288ba58ff3b99d1c94e6d414d431c9a61fdb20348e5beb74b823d546b"
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    steps:
+      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6
+
+      - name: Install mdBook (pinned)
+        run: |
+          curl -fsSL --retry 3 -o /tmp/mdbook.tar.gz "$MDBOOK_URL"  # -S: say why a download failed
+          actual=$(sha256sum /tmp/mdbook.tar.gz | cut -d' ' -f1)
+          if [ "$actual" != "$MDBOOK_SHA256" ]; then
+            echo "::error::mdbook checksum mismatch"; exit 1
+          fi
+          tar -xzf /tmp/mdbook.tar.gz -C /tmp
+
+      - name: Build (warnings are errors)
+        run: |
+          set -o pipefail  # default `bash -e` has no pipefail: tee's status would hide an mdbook failure
+          /tmp/mdbook build book 2>&1 | tee /tmp/book-build.log
+          if grep -qiE 'warning|error' /tmp/book-build.log; then
+            echo "::error::mdbook reported warnings/errors"; exit 1
+          fi
+
+      - name: Configure Pages
+        uses: actions/configure-pages@983d7736d9b0ae728b81ab479565c72886d7745b # v5
+        with:
+          enablement: true  # per the action's docs, enabling needs a token other than GITHUB_TOKEN
+
+      - name: Upload site
+        uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa # v3
+        with:
+          path: book/book
+
+      - name: Deploy
+        id: deployment
+        uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ff16a06..8ed298b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -486,3 +486,56 @@ jobs:
bench/*.cpp bench/icount/*.cpp bench/compare/*.cpp \
tools/capi/*.cpp tools/qemu_insn_plugin/*.c \
tests/*.cpp tests/support/*.hpp examples/*.cpp platform/*.c
+
+ # The book (book/) quotes library code via mdBook anchor includes; this
+ # gate makes a refactor that orphans an excerpt fail CI, the same
+ # freshness contract as the README's generated tables. Warnings are
+ # errors: a missing anchor is a warning, and a missing anchor is rot.
+  book:
+    name: Book build
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    env:
+      MDBOOK_URL: https://github.com/rust-lang/mdBook/releases/download/v0.4.40/mdbook-v0.4.40-x86_64-unknown-linux-gnu.tar.gz
+      MDBOOK_SHA256: "9ef07fd288ba58ff3b99d1c94e6d414d431c9a61fdb20348e5beb74b823d546b"
+    steps:
+      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6
+
+      - name: Install mdBook (pinned)
+        run: |
+          curl -fsSL --retry 3 -o /tmp/mdbook.tar.gz "$MDBOOK_URL"  # -S: say why a download failed
+          actual=$(sha256sum /tmp/mdbook.tar.gz | cut -d' ' -f1)
+          if [ "$actual" != "$MDBOOK_SHA256" ]; then
+            echo "::error::mdbook checksum mismatch"; exit 1
+          fi
+          tar -xzf /tmp/mdbook.tar.gz -C /tmp
+
+      - name: Build (warnings are errors)
+        run: |
+          set -o pipefail  # default `bash -e` has no pipefail: tee's status would hide an mdbook failure
+          /tmp/mdbook build book 2>&1 | tee /tmp/book-build.log
+          if grep -qiE 'warning|error' /tmp/book-build.log; then
+            echo "::error::mdbook reported warnings/errors (stale anchor or broken include?)"; exit 1
+          fi
+
+      # mdBook does not fail on a missing image, so check every relative
+      # image reference resolves. (The SVGs are committed, generated by
+      # scripts/book_figures.py; regeneration is not gated because
+      # matplotlib's SVG output is not byte-stable across versions.)
+      - name: Check image references resolve
+        run: |
+          python3 - <<'EOF'
+          import pathlib, re, sys
+          missing = []
+          # The target stops at whitespace, '#', '?' or ')' so ![alt](img.svg "title") checks img.svg.
+          for md in pathlib.Path("book/src").rglob("*.md"):
+              for target in re.findall(r"!\[[^\]]*\]\(\s*([^)#?\s]+)", md.read_text(encoding="utf-8")):
+                  if target.startswith(("http://", "https://")):
+                      continue
+                  if not (md.parent / target).resolve().exists():
+                      missing.append(f"{md}: {target}")
+          if missing:
+              print("::error::broken image reference(s):")
+              print("\n".join(missing))
+              sys.exit(1)
+          EOF
diff --git a/.gitignore b/.gitignore
index 94fcdc7..b8e7965 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ CMakeUserPresets.json
.vscode/
.idea/
.claude/
+book/book/
diff --git a/README.md b/README.md
index 4cbba32..a15065c 100644
--- a/README.md
+++ b/README.md
@@ -74,6 +74,31 @@ there are no install/package rules yet. Version 0.1.0 (`SRT_VERSION_*` in
`srt/srt.hpp`, `srt_version()` over the C ABI); pre-1.0, the API may
still change between versions.
+## The book
+
+The repository includes a full-length tutorial book (`book/`) that walks
+every header file line by line — the DSP, the C++ idioms chosen and
+rejected, how thread safety works, how the servo was tuned, and the
+optimization campaign with its dead ends preserved. It is written for a
+reader learning C++, DSP, and real-time concurrency with this converter as
+the running example. Three mechanical commitments keep it honest: every
+code excerpt is included live from the actual headers at build time (CI
+fails on a stale anchor), every figure is regenerated from the same math
+and measured traces by `scripts/book_figures.py`, and every chapter ends
+with runnable commands that reproduce its claims.
+
+Read it at **<https://tap.github.io/SampleRateTap/>** (published from
+`main` by the `book-pages` workflow; note the site is public even though
+this repository is private), or build it locally with
+[mdBook](https://rust-lang.github.io/mdBook/) (CI pins v0.4.40):
+
+```sh
+mdbook build book # or: mdbook serve book --open
+```
+
+Start at `book/src/SUMMARY.md` for the table of contents, or read the
+sources directly — they are plain Markdown.
+
## How it works
The design follows the classic commercial-ASRC architecture (AD1896-style
diff --git a/book/book.toml b/book/book.toml
new file mode 100644
index 0000000..2765989
--- /dev/null
+++ b/book/book.toml
@@ -0,0 +1,14 @@
+[book]
+title = "SampleRateTap: The Story of an Asynchronous Sample Rate Converter"
+description = "A working tour of a real-time asynchronous sample rate converter: the DSP, the C++, the concurrency, and the measurements that hold it together."
+authors = ["The SampleRateTap project"]
+language = "en"
+src = "src"
+
+[build]
+create-missing = false
+
+[output.html]
+default-theme = "rust"
+git-repository-url = "https://github.com/tap/SampleRateTap"
+site-url = "/SampleRateTap/"
diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md
new file mode 100644
index 0000000..cad7d1a
--- /dev/null
+++ b/book/src/SUMMARY.md
@@ -0,0 +1,47 @@
+# Summary
+
+[Introduction](introduction.md)
+
+# Part 0 — The problem
+
+- [Two crystals, one stream](part0/two-crystals.md)
+- [Budgets: latency, quality, compute](part0/budgets.md)
+
+# Part I — The machine, file by file
+
+- [Designing the filter: kaiser.hpp](part1/kaiser.md)
+- [The polyphase bank](part1/polyphase-bank.md)
+- [Sample types as a customization point: sample_traits.hpp](part1/sample-traits.md)
+- [The lock-free ring: spsc_ring.hpp](part1/spsc-ring.md)
+- [The clock servo: pi_servo.hpp](part1/pi-servo.md)
+- [The fractional resampler](part1/fractional-resampler.md)
+- [Composition: asrc.hpp](part1/asrc.md)
+
+# Part II — The proof system
+
+- [Tests as specifications](part2/tests.md)
+- [Counting instructions, deterministically](part2/icount.md)
+- [Notebooks as calibrated instruments](part2/notebooks.md)
+
+# Part III — Optimizing honestly
+
+- [Profile first, claim later (C1–C2)](part3/c1-c2.md)
+- [The integer phase and the wide MACs (C3–C5)](part3/c3-c5.md)
+- [The channel axis (C6)](part3/c6.md)
+
+# Part IV — Portability
+
+- [Hexagon: a DSP that keeps secrets](part4/hexagon.md)
+- [Cortex-M: bare metal, two ways](part4/cortex-m.md)
+- [The C ABI](part4/c-abi.md)
+
+# Part V — Deployment
+
+- [Real clocks: bridges and firmware](part5/hardware.md)
+- [Channels, rates, and the rules that scale](part5/scaling.md)
+
+---
+
+[Appendix A: The C++ decision log](appendix/cpp-decisions.md)
+[Appendix B: Glossary](appendix/glossary.md)
+[Appendix C: Annotated bibliography](appendix/bibliography.md)
diff --git a/book/src/appendix/bibliography.md b/book/src/appendix/bibliography.md
new file mode 100644
index 0000000..34a1700
--- /dev/null
+++ b/book/src/appendix/bibliography.md
@@ -0,0 +1,107 @@
+# Appendix C: Annotated bibliography
+
+> If I have seen further it is by standing on the shoulders of giants.
+>
+> — Isaac Newton, letter to Robert Hooke
+
+This project's provenance statement is short: all code implements
+long-published methods, and no third-party source was copied. This
+appendix lists those methods' sources — plus the tools and competitors the
+measurements depend on — with a note on what the project *actually took*
+from each. It deliberately cites nothing the codebase does not genuinely
+draw on.
+
+## Signal processing
+
+**J. F. Kaiser, "Nonrecursive digital filter design using the I₀-sinh
+window function," *Proc. IEEE Int. Symp. Circuits and Systems*, 1974.**
+The origin of the Kaiser window and of the two empirical fits the library
+evaluates verbatim in `include/srt/detail/kaiser.hpp`: stopband
+attenuation → window shape parameter β, and the attenuation/transition-
+width → filter-length estimate. The project took the closed forms exactly
+as published — the value of the Kaiser window here is precisely that its
+design procedure is a page of code with known error bounds, needing no
+iterative optimization at construction time.
+
+**f. harris, *Multirate Signal Processing for Communication Systems*,
+Prentice Hall, 2004.** The standard reference for polyphase decomposition
+— factoring one long prototype filter into L short branches indexed by
+fractional delay — which is the structure of the library's coefficient
+table. The tap-length estimate in `estimateTaps()` is the Kaiser/harris
+formula in the form `N = (A − 8) / (2.285 · Δω)`, applied per polyphase
+branch; the codebase credits both names, as the literature does.
+
+**J. O. Smith, "Digital Audio Resampling Home Page" (and the *Bandlimited
+Interpolation* material), CCRMA, Stanford University.** The theory the
+datapath implements: resampling as evaluation of a windowed-sinc
+interpolation kernel at fractional positions, with a finite table of
+kernel phases and *linear interpolation between adjacent table entries*.
+Smith's analysis of that last step is where the library's most-quoted
+scaling law comes from — interpolation residue falling ~12 dB per doubling
+of the table size L and rising with signal frequency — which Part 0 turns
+into the budget arithmetic connecting L to decibels.
+
+**Analog Devices, AD1896 datasheet ("192 kHz Stereo Asynchronous Sample
+Rate Converter").** The architectural ancestor. The README describes the
+library as "the classic commercial-ASRC architecture (AD1896-style
+polyphase FIR + clock servo), specialized for the near-unity regime," and
+the datasheet documents that architecture: a polyphase interpolation
+filter addressed by a recovered rate ratio, with a FIFO between the clock
+domains. It also supplies the hardware row in the comparison table —
+quoted as datasheet values, with the caveats about measurement environment
+stated in `docs/COMPARISON.md`.
+
+**AES17, *AES standard method for digital audio engineering — Measurement
+of digital audio equipment* (Audio Engineering Society).** The measurement
+definition behind the headline quality numbers: remove the fundamental,
+integrate the residual across the audio band for THD+N, measure dynamic
+range at −60 dBFS with A-weighting. The comparison notebook implements an
+AES17-style procedure (exact fit plus ±20 Hz notch, 20 Hz–20 kHz
+integration) and calibrates it against synthetic signals before use — the
+standard is what makes the −132 dB figure commensurable with silicon
+datasheets rather than a house metric.
+
+## The measured competitors
+
+**libsamplerate (Secret Rabbit Code), E. de Castro Lopo —
+documentation at libsndfile.github.io/libsamplerate.** The closest
+architectural analog (streaming time-domain polyphase resampler) and one
+of the two software subjects measured under identical conditions in
+`docs/COMPARISON.md` and the comparison notebook. Its documentation also
+supplied the honesty check the comparison repeats: the published "97 dB
+worst case" figure applies to aggressive ratios, so near-unity results at
+the format ceiling are its *easy* regime, not a contradiction.
+
+**soxr (the SoX Resampler library) — github.com/chirlu/soxr.** The second
+measured competitor, and the source of its own latency figure via
+`soxr_delay()`. What the project took from soxr is mostly a boundary
+lesson made quantitative: soxr wins raw host throughput decisively and
+carries ~12–16 ms of latency doing it, which is the measured statement of
+why a 1–2 ms live-monitoring budget needs a different design.
+
+## C++
+
+**Anthony Williams, *C++ Concurrency in Action*, 2nd ed., Manning, 2019.**
+The working reference for the C++ memory model as this book teaches it:
+acquire/release pairing as the establishment of happens-before, the
+legitimacy of relaxed loads of data a thread itself owns, and lock-free
+queue design generally. The ring chapter's proof style — argue the two
+release/acquire pairs, then treat everything else as sequential code —
+is the book's method applied to a hundred-line class.
+
+**cppreference.com — in particular `std::memory_order`,
+`std::atomic::is_always_lock_free`, `std::bit_ceil`, and
+`std::hardware_destructive_interference_size`.** The day-to-day authority
+for the exact semantics the headers rely on: the ordering guarantees the
+ring asserts, the compile-time lock-freedom predicate the audit added,
+the power-of-two rounding used by the ring and the polyphase table, and —
+for the interference-size constant — the documented ABI fragility that
+justified *rejecting* the standard facility in favor of a literal `64`.
+
+## Tooling
+
+**mdBook — rust-lang.github.io/mdBook.** The tool this book is built
+with. Its `\{{#include path:anchor}}` mechanism is what makes the book's
+central honesty commitment mechanical rather than aspirational: code
+excerpts are pulled from the real headers at build time, so prose that
+drifts from the code breaks the build in CI instead of quietly lying.
diff --git a/book/src/appendix/cpp-decisions.md b/book/src/appendix/cpp-decisions.md
new file mode 100644
index 0000000..c1744db
--- /dev/null
+++ b/book/src/appendix/cpp-decisions.md
@@ -0,0 +1,759 @@
+# Appendix A: The C++ decision log
+
+> There are only two kinds of languages: the ones people complain about and the ones nobody uses.
+>
+> — Bjarne Stroustrup
+
+Every chapter of this book has defended C++ decisions in passing, in the
+context that made them necessary. This appendix collects them in one place,
+in one format: the decision, what was rejected, why, and where in the
+repository the evidence lives — because in this codebase the decisions are
+*recorded*, mostly as comments at the point of consequence, and a decision
+whose reason you cannot locate is a decision you cannot safely revisit.
+
+A theme will emerge quickly, so it is worth stating up front. Almost every
+entry below is the same decision wearing different clothes: **between a
+clever general mechanism and a plain constraint you can state and verify,
+this library picks the constraint.** A literal `64` over a standard
+interference constant; a `static_assert` over trust; a compile-time gate
+over a runtime flag; a comment that shows its arithmetic over a comment
+that waves at it. Where the two genuinely conflict, the tiebreaker is
+always the same pair of masters: the real-time audio contract and the
+embedded targets that cannot fake their way around a bad choice.
+
+## 1. Header-only distribution
+
+The entire library is seven headers under `include/srt/`. The build system
+declares exactly one library target, and it has no compiled artifact:
+
+```cmake
+add_library(SampleRateTap INTERFACE)
+add_library(SampleRateTap::SampleRateTap ALIAS SampleRateTap)
+target_compile_features(SampleRateTap INTERFACE cxx_std_20)
+```
+
+Consumption is `add_subdirectory` or `FetchContent`, deliberately and
+exclusively — the README's *Consuming the library* section says so in as
+many words: "there are no install/package rules yet." The tests, examples,
+benchmarks and the C ABI shim are all opt-in options that default off when
+the project is not top-level, and the warning flags live on a separate
+`srt_warnings` target so that the library's own `-Wall -Wextra -Wpedantic
+-Wconversion` discipline is never propagated into a consumer's build
+(`CMakeLists.txt` carries the comment: "not propagated to consumers").
+
+What was rejected is the conventional pair: a compiled static/shared
+library, and a packaged install with exported config files. The costs of
+header-only are real and were accepted knowingly. Every translation unit
+that includes `srt/srt.hpp` re-parses and re-instantiates the templates —
+compile time is paid repeatedly. There is no ABI boundary, so there is
+nothing to version at link time and no way to ship a fixed `.so` to a
+customer who cannot rebuild (the C ABI shim in section 15 exists precisely
+for the one consumer class that needs a binary boundary).
+
+What it buys is decisive for this library's actual deployment surface.
+The code ships to bare-metal Cortex-M33/M55 firmware, a musl-libc Hexagon
+toolchain, and ordinary hosts — four toolchains in CI alone, each with its
+own flags, each producing incompatible binaries. A prebuilt library per
+target multiplies the release matrix; a header vanishes into whatever
+build the consumer already has, including builds with LTO, `-march=native`
+or MVE auto-vectorization, where cross-TU inlining of the hot kernels is
+exactly what the performance chapters measured. And a template library is
+header-shaped by nature: the sample-type axis of section 2 means the
+"library" is not a fixed set of functions but a recipe the consumer's
+compiler executes.
+
+| Decision | Rejected | Reason | Evidence |
+|---|---|---|---|
+| `INTERFACE` target, `add_subdirectory`/`FetchContent` only | compiled library; install/export packaging | four incompatible toolchains in CI; templates need instantiation in the consumer's TU; costs (compile time, no ABI) accepted, C ABI shim covers the binary-boundary case | `CMakeLists.txt`; README "Consuming the library"; `tools/capi/` |
+
+## 2. Templates and a concept for the sample-type axis
+
+The datapath comes in three sample types — `float`, Q15 `int16_t`, Q31
+`int32_t` — and the axis is expressed as a template parameter constrained
+by a concept:
+
+```cpp
+template <SampleType T>
+class BasicAsyncSampleRateConverter { ... };
+
+using AsyncSampleRateConverter = BasicAsyncSampleRateConverter<float>;
+using AsyncSampleRateConverterQ15 = BasicAsyncSampleRateConverter<std::int16_t>;
+using AsyncSampleRateConverterQ31 = BasicAsyncSampleRateConverter<std::int32_t>;
+```
+
+The first rejected alternative is virtual dispatch: an abstract
+`ISampleOps` with `mac()`, `blend()`, `finalize()` virtuals. That dies on
+arithmetic grounds before it reaches performance grounds — the three
+datapaths do not share signatures. The float path accumulates in `double`,
+the fixed-point paths in `int64_t`; the blend factor is a `float`, a Q15
+`int32_t`, or a Q20 `int32_t` depending on the type. Virtual functions
+cannot vary their associated types per implementation; you would be forced
+to launder everything through the widest type, which is precisely the
+soft-double catastrophe the fixed-point paths exist to avoid (the M33
+baselines put the float path at roughly 19× the M55's instruction count
+for exactly that reason — README, platform section). And even if the types
+had lined up, an indirect call per multiply-accumulate inside a 48–80-tap
+loop would forfeit the inlining and auto-vectorization that Part III
+measured: the M55's Q15 kernel is fast *because* GCC can see through
+`SampleTraits::mac` and emit Helium.
+
+The second rejected alternative is CRTP — compile-time polymorphism via
+inheritance. It solves the dispatch cost but contorts the shape: the
+sample type here is `int16_t` itself, a builtin, not a class that can
+inherit from a base. CRTP would demand wrapper types around the samples,
+and wrapped samples are no longer the raw interleaved buffers that device
+drivers and the `memcpy`-based ring (section 6 of the ring chapter)
+require. The concept does the one job the template needs guarding for:
+
+```cpp
+template <typename T>
+concept SampleType = requires(...) {
+    { SampleTraits<T>::mac(a, x, c) } -> std::same_as<typename SampleTraits<T>::Accum>;
+    // ... six more operations, each with its exact type checked
+};
+```
+
+A wrong instantiation fails at the constraint with the list of missing
+operations, not three template layers deep in the dot-product loop. The
+header then `static_assert`s the concept against all three shipped types —
+the same trust-nothing reflex as the ring's lock-free asserts.
+
+| Decision | Rejected | Reason | Evidence |
+|---|---|---|---|
+| templates constrained by the `SampleType` concept | virtual `ISampleOps`; CRTP wrappers | per-type associated types (`Accum`, `BlendFactor`) are impossible to express virtually; builtins can't inherit; hot loops must inline and vectorize | `include/srt/sample_traits.hpp` (concept + `static_assert`s); `include/srt/asrc.hpp` aliases; README platform notes (19× soft-double) |
+
+## 3. A traits struct as the customization point
+
+Given templates, the customization could still have taken several shapes.
+The library chose a traits struct with an intentionally undefined primary
+template:
+
+```cpp
+/// Primary template intentionally undefined; specialize per sample type.
+template <typename T>
+struct SampleTraits;
+```
+
+Each specialization bundles three associated types (`Coeff`, `Accum`,
+`BlendFactor`) with seven static functions (`makeCoeff`, `mac`, `blend`,
+`finalize`, ...). Why this over the alternatives?
+
+**Free functions found by ADL** — the customary `swap`-style mechanism —
+were worse for two reasons. First, the customization is mostly *types*,
+not functions: the fact that Q15 stores coefficients as Q1.14 `int16_t`
+but accumulates in `int64_t` is the design (the header's comments derive
+it: Q0.15 × Q1.14 products summed exactly, one rounding in `finalize()`).
+Free functions cannot carry associated types; you would need separate type
+traits anyway, and the customization point would smear across two
+mechanisms. Second, ADL on builtin types like `int16_t` has no associated
+namespace to hook — the overloads would all pile into `srt` and be
+distinguishable only by overload resolution, silently, which is exactly
+how a Q15/Q31 mixup would compile and produce garbage.
+
+**Member policies** — making the sample type a class that knows its own
+arithmetic — fail as in section 2: the sample types must remain raw
+builtins so buffers stay `memcpy`-compatible and ABI-identical to what
+audio drivers produce. A traits struct is the standard C++ answer for
+attaching behavior to types you cannot modify, and the undefined primary
+template makes "I forgot to specialize" a clean compile error at the point
+of use rather than a link error or a default that half-works.
+
+The struct also keeps each datapath's documentation in one screenful: the
+Q15 specialization's header comment is a complete fixed-point error budget
+(coefficient quantization at ~−86 dB, single rounding point, "the
+converter is Q15-transparent"), sitting directly above the ten lines that
+implement it.
+
+| Decision | Rejected | Reason | Evidence |
+|---|---|---|---|
+| `SampleTraits` struct, undefined primary template | ADL free functions; member policies on sample classes | customization is chiefly associated types; builtins have no ADL namespace and can't have members; missing specialization = clean compile error | `include/srt/sample_traits.hpp` |
+
+## 4. The real-time contract: exceptions at setup, `noexcept` forever after
+
+This is the load-bearing wall of the whole API, stated as a contract in
+the converter's class comment:
+
+```cpp
+/// Real-time contract: the constructor performs all allocation and filter
+/// design and may throw; push(), pull(), status() and resetFromConsumer()
+/// are noexcept, lock-free and allocation-free.
+```
+
+The README's feature bullets repeat it, because it is the feature. The
+constructor allocates every buffer the object will ever touch — ring,
+polyphase table, histories, scratch — designs the filter in double
+precision, validates the configuration, and throws `std::invalid_argument`
+or `std::bad_alloc` on anything wrong. From that point on, the audio path
+never allocates, never locks, never throws; every hot function is spelled
+`noexcept`, and the `validated()` function exists to make the constructor
+*more* throw-happy, rejecting configurations that "would otherwise
+construct successfully and misbehave silently" — NaN sample rates that
+design all-NaN tables, band edges that pass images wholesale, deviation
+clamps that overflow the Q0.64 conversion (its comment lists each one).
+
+The rejected alternatives are the two ways other libraries split this.
+Error codes at setup ("check the return of `init()`") were rejected
+because a partially-constructed converter is not a state this object can
+represent — there is no meaningful "converter without a filter table," and
+C++ constructors-that-throw are precisely the tool that makes invalid
+objects unrepresentable. Exceptions on the audio path were never
+considered — an unwind inside a device callback is a glitch at best — but
+the *strength* of the setup/hot-path split was reinforced from an
+unexpected direction. When the first `EXPECT_THROW` test reached the
+Hexagon CI leg, it discovered that the hexagon-linux-musl toolchain
+cannot catch exceptions at all: a constructor throw terminates via
+libc++abi instead of propagating. `docs/PERFORMANCE.md` records it under
+Known debt, with the deployment note ("treat invalid Config as fatal —
+validate inputs before constructing") and the candidate fix
+(`-unwindlib=libunwind`). The discovery cost one excluded test on one leg
+— because exceptions had been confined to a code region where "terminate
+instead of propagate" is survivable. Had the audio path thrown, the same
+toolchain quirk would have been a field failure.
+
+| Decision | Rejected | Reason | Evidence |
+|---|---|---|---|
+| all allocation + throwing in the constructor; `noexcept`/lock-free/allocation-free hot path | `init()` + error codes; exceptions anywhere near audio | invalid objects unrepresentable; RT contract is the product; Hexagon's no-unwind toolchain proved the value of confining throws to setup | `include/srt/asrc.hpp` (class comment, `validated()`); README bullets; `docs/PERFORMANCE.md` Known debt; commit "Hexagon: exclude ConfigValidation" |
+
+## 5. Runtime filter design, not `constexpr` tables
+
+A modern-C++ reflex says the Kaiser-windowed prototype — pure math on
+compile-time-known presets — should be a `constexpr` table. The library
+computes it at runtime, in the constructor, and `kaiser.hpp` opens with
+the reason, arithmetic included:
+
+```cpp
+/// Design note — runtime vs constexpr: the prototype tables run 12K-33K taps
+/// and each tap needs sin/sqrt plus a ~50-term Bessel I0 series. Constexpr
+/// evaluation is interpreted (roughly 1e3-1e4x slower than native), would
+/// need hand-rolled constexpr transcendentals before C++26, and would cost
+/// tens of seconds to minutes of compile time in every including translation
+/// unit. Runtime design takes well under 10 ms, runs once in a constructor,
+/// and is off the audio path, so all design math here is plain runtime
+/// double precision.
+```
+
+Unpack the trade. The `balanced()` preset's prototype is 256 × 48 =
+12,288 taps, and the presets range upward from there — the comment's
+"12K-33K taps". Each tap evaluates `sin`,
+`sqrt`, and a Bessel-I0 power series that runs to ~50 terms. `constexpr`
+evaluation is an interpreter inside the compiler — three to four orders
+of magnitude slower than native — and, before C++26, `std::sin` and
+friends are not `constexpr`, so the transcendentals would have to be
+hand-rolled *and then trusted* to match runtime libm behavior. In a
+header-only library the bill lands in every consumer TU, repeatedly. The
+runtime version costs under 10 ms, once, in the constructor — which
+section 4 already designated as the place where expensive things happen.
+And a runtime design accepts *runtime* configurations: `FilterSpec` is
+not limited to the three presets, so a compile-time table would have been
+a special case bolted alongside the general path, not a replacement.
+
+This is the header-only cost model (section 1) feeding back into design:
+having accepted per-TU compilation, the library polices what each TU
+costs.
+
+| Decision | Rejected | Reason | Evidence |
+|---|---|---|---|
+| filter designed at runtime in the constructor | `constexpr` coefficient tables | 12K–33K taps × transcendentals ≈ minutes of interpreted compile time per TU vs <10 ms once at runtime; needs pre-C++26 hand-rolled constexpr math; runtime `FilterSpec` must work anyway | `include/srt/detail/kaiser.hpp` header comment |
+
+## 6. `<bit>` over hand-rolled bit tricks; masks over modulo
+
+Everywhere the library needs power-of-two arithmetic it reaches for
+C++20's `<bit>`: `std::bit_ceil` rounds the ring capacity up
+(`SpscRing`'s constructor), rounds the phase count up
+(`PolyphaseFilterBank`), and sizes the FIFO (`ringCapacityElems` in
+`asrc.hpp`); `std::countr_zero` recovers log₂(L) in the phase-indexed
+kernels so the polyphase branch is the top bits of the Q0.64 phase word:
+
+```cpp
+const int lg = std::countr_zero(bank.numPhases()); // L is a power of two
+const std::size_t p = static_cast<std::size_t>(phase >> (64 - lg));
+```
+
+The rejected alternative is the folklore versions — the
+shift-or-shift `bit_ceil`, the de Bruijn log₂ — which every C programmer
+has written and half have gotten wrong at the boundaries (what does your
+hand-rolled `bit_ceil` do at 0? at values above 2⁶³?). The standard
+functions have specified edge behavior, compile to single instructions
+where they exist, and *name the intent* — `countr_zero(numPhases())`
+under the comment "L is a power of two" is an invariant stated twice.
+
+The deeper decision is what the powers of two are *for*: indexing by mask
+instead of modulo. The ring's monotonic indices are wrapped by `head &
+mask_` — its class comment: "Indices are monotonic and wrapped by a
+power-of-two mask, so the full capacity is usable" — and the ring chapter
+proves the wraparound benign. The polyphase table's L being a power of
+two is what lets the Q0.64 phase word split into branch index and blend
+fraction by pure shifts, with no division and no double arithmetic on the
+per-sample path (the phase-accumulator comment in
+`polyphase_filter.hpp`). A general-modulo design would put an integer
+divide — tens of cycles on the M-class cores, and a serialization point
+everywhere — inside the tightest loops the library owns, to support
+capacities nobody asked for.
+
+| Decision | Rejected | Reason | Evidence |
+|---|---|---|---|
+| `std::bit_ceil` / `std::countr_zero`; power-of-two capacities indexed by mask | hand-rolled bit tricks; arbitrary sizes with `%` | specified edge cases, single instructions, intent named; masks keep divides and doubles off the per-sample path | `include/srt/spsc_ring.hpp` ctor + class comment; `include/srt/polyphase_filter.hpp` (`blendRowPhase`, `interpolatePhase`); `include/srt/asrc.hpp` (`ringCapacityElems`) |
+
+## 7. Memory orderings chosen to be exactly sufficient
+
+The ring chapter walked this in full; the appendix records it as policy,
+because it generalizes beyond the ring. Every atomic operation in the
+library carries an explicit ordering argument, and each ordering is the
+*weakest* that keeps the algorithm correct: `release` on the store that
+publishes data, `acquire` on the load that consumes a foreign index,
+`relaxed` on a thread's loads of its own index — and `relaxed` on all
+telemetry, whose fields are documented as "individually coherent, not
+mutually" (`status()` in `asrc.hpp`).
+
+The rejected alternative is `seq_cst`-by-default — writing
+`head_.store(x)` and letting the strongest ordering paper over the
+analysis. It would be correct. It was rejected first because it is
+measurably stronger than needed on the weakly-ordered targets (full
+barriers on ARM in the hottest loop the library owns), and second — the
+argument this codebase actually leads with — because **orderings are
+documentation**. An explicit `memory_order_relaxed` on `tail_.load()` in
+the producer tells the reader "this is my own index; no synchronization
+happens here" — a claim the ring chapter spells out and ThreadSanitizer
+checks against reality in CI. A default `seq_cst` says only "I didn't
+think about this," and in the one file whose entire job is to be thought
+about, that is the wrong message. The same honesty cuts the other way:
+where synchronization *is* needed, the annotation names which one, so a
+future editor who weakens it is contradicting a written claim, not
+merely changing a default.
+
+| Decision | Rejected | Reason | Evidence |
+|---|---|---|---|
+| explicit, minimal orderings on every atomic | `seq_cst` defaults | weaker barriers on ARM where it matters; each annotation documents exactly why it exists; TSan-checked in CI | `include/srt/spsc_ring.hpp`; `include/srt/asrc.hpp` telemetry; the ring chapter's "What was rejected" |
+
+## 8. `alignas(64)`, not `std::hardware_destructive_interference_size`
+
+The ring separates producer-owned, consumer-owned and shared-read-only
+state onto distinct cache lines, and it does so with a named literal:
+
+```cpp
+// 64-byte separation to keep producer- and consumer-owned state on
+// distinct cache lines (std::hardware_destructive_interference_size is
+// deliberately avoided: it is ABI-fragile and warns on GCC). ...
+static constexpr std::size_t kCacheLine = 64;
+```
+
+The standard offers a constant whose whole purpose is this alignment, and
+the file's comment rejects it by name. The problem is that
+`hardware_destructive_interference_size` is not a constant of the
+architecture; it is a constant of the *compiler invocation* — its value
+can change with `-mtune`, which means two translation units in the same
+program can disagree about the layout of the same type. That is an ODR
+violation waiting for a victim, and GCC ships a warning
+(`-Winterference-size`) telling you exactly this whenever the constant is
+used in a context that might cross an ABI boundary. A header-only library
+(section 1) lives *entirely* in that danger zone: every consumer TU
+re-instantiates `SpscRing`, potentially under different flags.
+
+A plain `64` is correct on every target this project ships to, cannot
+vary between TUs, and states its assumption in a comment a porting
+engineer will read. The general lesson — the ring chapter phrases it as
+"between a standard facility and a constraint you can state plainly,
+prefer the one whose failure mode you can reason about" — is this
+appendix's opening theme in miniature.
+
+| Decision | Rejected | Reason | Evidence |
+|---|---|---|---|
+| `alignas(kCacheLine)` with `kCacheLine = 64` | `std::hardware_destructive_interference_size` | the standard constant varies with tuning flags → ODR/ABI fragility in a header; GCC warns; 64 is right everywhere shipped | `include/srt/spsc_ring.hpp` member layout comment |
+
+## 9. 32-bit telemetry atomics
+
+The converter's telemetry — state, ppm, fill, underrun/overrun/resync
+counters, effective setpoint — is deliberately 32 bits wide, and the
+comment above the members carries the whole argument:
+
+```cpp
+// Telemetry is 32-bit on purpose: 64-bit atomics fall back to lock-based
+// libatomic on 32-bit targets (e.g. Hexagon), which would break the
+// lock-free contract of the hot path. float carries ~7 significant
+// digits — ample for ppm/fill observability; counters wrap at 2^32.
+```
+
+The rejected alternative — `std::atomic<std::uint64_t>` counters and
+`double` gauges, the "obviously roomier" choice — is a trap on exactly
+the targets this library most cares about. On a 32-bit ISA without a
+64-bit atomic instruction, `std::atomic<std::uint64_t>` still compiles and
+still works: libatomic implements it *with a lock*. The hot path would
+remain formally correct and silently stop being lock-free — the one
+property section 4 declared as contract, broken invisibly by a telemetry
+counter. The 32-bit choice keeps every telemetry access a plain
+lock-free operation on Hexagon and the M-class cores, and the class
+`static_assert`s it rather than assuming:
+
+```cpp
+static_assert(std::atomic<std::uint32_t>::is_always_lock_free &&
+ std::atomic<std::int32_t>::is_always_lock_free &&
+ std::atomic<float>::is_always_lock_free,
+ "telemetry atomics must be lock-free for the RT contract");
+```
+
+The cost is range, and it is documented rather than hidden: `Status`'s
+comment tells callers the counters "wrap at 2^32 — far beyond any
+plausible event count, but treat them as modular if you difference them
+over very long horizons." (The `Status` struct itself still presents
+`uint64_t` fields — the narrowing is an internal representation choice,
+widened at the snapshot.) A `float` gauge carries about seven significant
+digits, which comfortably resolves tenths of a ppm and hundredths of a
+frame of fill — observability, not metrology.
+
+| Decision | Rejected | Reason | Evidence |
+|---|---|---|---|
+| `atomic<uint32_t>`/`atomic<float>` telemetry, wrap documented | 64-bit atomic counters/doubles | 64-bit atomics lock via libatomic on 32-bit targets, silently voiding the lock-free contract; 32-bit range/precision suffices and is asserted | `include/srt/asrc.hpp` telemetry members + `static_assert`; `Status` doc comment |
+
+## 10. Designated initializers as API
+
+The filter presets are written the way a datasheet reads:
+
+```cpp
+static FilterSpec transparent() noexcept {
+ return {.numPhases = 512,
+ .tapsPerPhase = 80,
+ .passbandHz = 20000.0,
+ .stopbandHz = 26000.0,
+ .stopbandAttenDb = 140.0};
+}
+```
+
+`FilterSpec`, `Config` and `ServoConfig` are aggregates with member
+initializers supplying defaults, and C++20 designated initializers do the
+rest. The rejected alternatives are the two classic config-struct styles.
+A positional constructor —
+`FilterSpec(512, 80, 20000.0, 26000.0, 140.0)` — puts two `double`
+band edges next to each other, where a swap compiles silently and
+mis-designs the filter (which, per `validated()`'s comment, is the kind
+of error that "passes images wholesale"). A builder/setter chain adds a
+mutable construction protocol and a second way for every field to be set,
+to solve a problem the language now solves natively: fields are named at
+the call site, unmentioned fields keep their documented defaults, and —
+because designated initializers must follow declaration order — the
+compiler rejects reorderings instead of reinterpreting them.
+
+The style is also the library's own consumption idiom: the README quick
+start and every test build configs by naming only what deviates from
+default. Readable initialization is not cosmetic in a config API; the
+config *is* the API surface where users make their quality-versus-cost
+decisions, and the presets double as documentation of three known-good
+points in that space.
+
+| Decision | Rejected | Reason | Evidence |
+|---|---|---|---|
+| aggregate configs + designated initializers | positional constructors; builder chains | named fields make adjacent-double swaps impossible; defaults stay declarative; declaration-order enforcement | `include/srt/polyphase_filter.hpp` (`FilterSpec` presets); `include/srt/asrc.hpp` (`Config`); `include/srt/pi_servo.hpp` (`ServoConfig`) |
+
+## 11. `SRT_RESTRICT`: a portable `__restrict__`, adopted on measurement
+
+C++ has no standard `restrict`. The library defines a two-line macro over
+the compiler extensions and applies it to the kernel pointer parameters —
+and the comment above the macro is careful to claim only what was
+verified:
+
+```cpp
+// No-alias qualifier for the kernel hot loops: without it the compiler
+// versions the blend loop behind a runtime aliasing check (verified with
+// -fopt-info-vec; see docs/PERFORMANCE.md, hypothesis 2).
+```
+
+This entry is here as much for its *method* as its content. The
+vectorization audit (PERFORMANCE.md, PR C2) did not assume aliasing was a
+problem; it asked the compiler. `-fopt-info-vec` showed `blendRow`
+vectorizing — but behind a runtime aliasing check, the loop compiled
+twice with a pointer-overlap branch choosing between versions.
+`SRT_RESTRICT` on the row/history pointers removes the check, and the
+measured effect is recorded with the honesty this project's performance
+docs enforce: **M55 `pipeline_float` −1.35% instructions, every other
+embedded scenario exactly 0.00%, x86 same-state A/B −3.7% wall-clock.**
+Small, real, and cheap — the qualifier documents a true invariant (the
+scratch row never aliases the history), so it costs nothing to maintain.
+
+The rejected alternatives: doing nothing (leaving the versioned loop and
+its branch in the hot path), and restructuring the code so the compiler
+could prove non-aliasing itself (possible, but contorting call signatures
+to communicate what one keyword states directly). MSVC spells the
+extension `__restrict`, everyone else `__restrict__`; hence the macro
+rather than a raw keyword.
+
+| Decision | Rejected | Reason | Evidence |
+|---|---|---|---|
+| `SRT_RESTRICT` macro on kernel pointers | nothing (alias-versioned loops); structural non-aliasing proofs | verified with `-fopt-info-vec`, measured: M55 float −1.35% insns, x86 −3.7% wall-clock; states a true invariant | `include/srt/polyphase_filter.hpp` macro + comment; `docs/PERFORMANCE.md` C2 |
+
+## 12. Compile-time feature gates — and the measured cost of a runtime one
+
+Target-specific code paths are selected by preprocessor and `constexpr`
+machinery, never by runtime flags. `SRT_Q15_SMLALD` turns on the dual-MAC
+Q15 dot product exactly where it wins:
+
+```cpp
+#if defined(__ARM_FEATURE_DSP) && !defined(__ARM_FEATURE_MVE)
+```
+
+— DSP-extension cores *without* Helium (the M33/Pico class), because on
+the M55 the compiler already auto-vectorizes the scalar loop with MVE and
+the intrinsic would replace vectors with dual-MACs (the gate's comment;
+PERFORMANCE.md C4 verified 0.00% change on every M55 scenario).
+`SRT_CHANNEL_PARALLEL` enables the frame-major channel axis on hosts only,
+and inside the class it becomes a `constexpr` member flag that
+`if constexpr` and plain constant folding erase from non-participating
+builds:
+
+```cpp
+static constexpr bool kChannelParallel =
+ SRT_CHANNEL_PARALLEL != 0 && std::is_floating_point_v<S>;
+```
+
+The reason this is dogma rather than taste is that the alternative was
+tried, by accident, and measured. During C6 the mode gate was briefly an
+ordinary runtime `bool` consulted in the hot loops — and the M55
+instruction ratchet, which had nothing to do with the change (C6 is
+host-only), moved **+6–8%** from hot-loop branch bloat. PERFORMANCE.md
+records the lesson verbatim: "the mode gate must be compile-time — a
+runtime bool in the hot loops cost +6–8% on the M55 ratchet before the
+constexpr gate restored every embedded scenario to 0.00%." The compaction
+path in `appendOne` carries the same note at the exact line that was
+guilty. A ±3% two-sided CI gate is what turned this from a silent tax
+into a failed build; the constexpr gate is what turned the fix from "fast
+again" into "provably byte-identical again."
+
+| Decision | Rejected | Reason | Evidence |
+|---|---|---|---|
+| preprocessor + `constexpr` flags + `if constexpr` gates | runtime mode flags | a runtime bool in the hot loop measured +6–8% on the M55 ratchet; compile-time gates keep non-participating targets' codegen byte-identical (0.00%) | `include/srt/polyphase_filter.hpp` (`SRT_Q15_SMLALD`, `SRT_CHANNEL_PARALLEL`, `kChannelParallel`, `appendOne` comment); `docs/PERFORMANCE.md` C4/C6 |
+
+## 13. `std::function` in the simulator, templated callables in the library
+
+The test harness's two-clock simulator configures its signal generators
+as `std::function` fields:
+
+```cpp
+std::function<S(std::uint64_t)> gen = [](std::uint64_t) { return S{}; };
+std::function<double(double)> fsInScale = [](double) { return 1.0; };
+```
+
+The library's hot path, facing the identical "caller supplies a callable"
+problem, does something else entirely. `FractionalResampler::process`
+takes its frame source as a template parameter —
+`template <class PopFn> std::size_t process(..., PopFn&& popFrames)
+noexcept` — and the converter passes a `noexcept` lambda that wraps the
+ring read. Same need, opposite tools, and the split is deliberate.
+
+`std::function` is the right tool in the simulator: tests assign
+different generators per test case at runtime, the cost of a type-erased
+call per sample is irrelevant next to the double-precision sine it
+invokes, and construction-time allocation in a test fixture harms
+nothing. It would be the wrong tool in `process()` three ways at once.
+Its call is an indirect jump through erased type information that the
+optimizer cannot inline — and `popFn` is invoked inside the per-frame
+loop, where the entire benefit of the current design is that the ring's
+`read()` inlines into the resampler's refill path. Assigning one may
+allocate, which is forbidden anywhere reachable from `pull()`
+(section 4). And its call operator is not `noexcept` — an empty
+`std::function` throws `bad_function_call` — which poisons the `noexcept`
+audio path either with a formal lie or a terminate-on-bug. The template
+parameter has none of these problems and costs only what templates
+always cost: the code is instantiated per callable type, which for
+exactly one production callable is nothing.
+
+| Decision | Rejected | Reason | Evidence |
+|---|---|---|---|
+| templated `PopFn&&` in the library; `std::function` only in test config | `std::function` on the hot path; templates in test fixtures | hot path needs inlining, no allocation, honest `noexcept`; tests need runtime reassignment and don't care about a type-erased call | `include/srt/polyphase_filter.hpp` (`process`, `prime`); `include/srt/asrc.hpp` (`popFn` lambda); `tests/support/two_clock_sim.hpp` |
+
+## 14. `std::vector` everywhere, custom allocators nowhere
+
+Every owned buffer in the library is a plain `std::vector`: the ring's
+storage, the coefficient table, the resampler's histories, scratch and
+blended row. No allocator parameters, no PMR, no small-buffer tricks. In
+a real-time audio library this looks, at first glance, like negligence —
+until you notice *when* those vectors are touched. Every `resize`,
+`assign` and construction happens in a constructor or in `prime()`-time
+setup; the hot path only ever reads `data()` and indexes. The RT problem
+with allocation is not that heap memory is slow; it is that allocation
+is unbounded and lock-taking *at the moment you cannot afford it*.
+Section 4's contract solves that by construction-time-only allocation —
+after which a custom allocator has nothing left to fix. It would add a
+template parameter that infects every class signature, a policy decision
+for every consumer, and a second code path to test, in exchange for
+optimizing events that occur once per converter lifetime, off the audio
+thread, in a place explicitly allowed to throw `bad_alloc`.
+
+The rejected-in-spirit alternatives — fixed `std::array` capacities, or
+caller-supplied arenas — also fail the configurability test: table and
+buffer sizes derive from runtime `FilterSpec` and `Config` values
+(section 5), so compile-time capacities would cap the very parameters
+the config API exposes. Embedded consumers who must avoid the heap
+entirely have the honest option the design leaves open: construct the
+converter during initialization, when the heap (or a bump allocator
+behind `operator new`) is still a fine place to get memory from.
+
+| Decision | Rejected | Reason | Evidence |
+|---|---|---|---|
+| `std::vector` storage, default allocator | allocator/PMR parameters; fixed arrays; arenas | allocation is construction-only by contract, so allocators optimize a non-problem at the cost of infecting every signature; sizes are runtime config | `include/srt/spsc_ring.hpp`, `polyphase_filter.hpp`, `asrc.hpp` (members); RT contract in section 4 |
+
+## 15. The C ABI: opaque handles, `reinterpret_cast`, and `impl()` outside `extern "C"`
+
+The FFI surface (`tools/capi/`) wraps the float converter behind an
+opaque `SrtHandle*`. The pattern is textbook, but two details record
+decisions. First, the handle is a declared-but-never-defined struct, and
+the conversion is a `reinterpret_cast` in a pair of helpers:
+
+```cpp
+extern "C" { struct SrtHandle; } // opaque
+
+namespace {
+srt::AsyncSampleRateConverter* impl(SrtHandle* h) noexcept { ... }
+const srt::AsyncSampleRateConverter* impl(const SrtHandle* h) noexcept { ... }
+}
+```
+
+The helpers live in an anonymous namespace *outside* the `extern "C"`
+block for a reason C++ makes easy to forget: those two `impl` functions
+are overloads (const and non-const), and **overloading is illegal under C
+linkage** — C linkage names carry no type information to distinguish
+them. Keeping the C++ conveniences in C++ linkage and only the exported
+symbols in `extern "C"` is the discipline that lets the shim be written
+as C++ without leaking C++ into the ABI.
+
+The rejected alternatives for the handle: exposing the class definition
+(no ABI stability — the whole point of the shim is a boundary the C++
+headers don't have, per section 1), or a lookup table of integer handles
+(indirection and lifetime bookkeeping to solve a problem the opaque
+pointer already solves). Around the handle, the shim converts the C++
+error model to C conventions at the boundary: `srt_create` catches
+everything and returns null; every entry point tolerates a null handle,
+because — the file's own comment — the documented "check srt_create for
+NULL" convention "otherwise invites a crash on exactly the path where the
+caller forgot to check." An unchecked failure degrades to silence, not a
+crash, which for an audio library is the correct failure sound.
+
+| Decision | Rejected | Reason | Evidence |
+|---|---|---|---|
+| opaque `SrtHandle*` + `reinterpret_cast`; `impl()` overloads outside `extern "C"`; null-tolerant entry points | exposed class; handle tables; unguarded entries | ABI boundary with zero C++ leakage; C linkage forbids overloads; unchecked create must fail soft | `tools/capi/srt_capi.cpp`, `tools/capi/srt_capi.h` |
+
+## 16. Deleted copy operations: these are identity types
+
+Both concurrency-bearing classes delete copying:
+
+```cpp
+SpscRing(const SpscRing&) = delete;
+SpscRing& operator=(const SpscRing&) = delete;
+```
+
+and likewise `BasicAsyncSampleRateConverter`. The rejected alternative —
+letting the compiler generate copies, or writing "deep copy" semantics —
+fails the simplest question first: *what would a copy even mean?* A ring
+mid-stream has a producer thread and a consumer thread holding a
+reference to *this specific object*; a copy would duplicate the buffer
+contents but not the relationship, producing an orphan that no thread
+feeds. (Mechanically, `std::atomic` members are not copyable anyway —
+the language is trying to tell you the same thing.) The converter is
+worse: copying would snapshot servo state, telemetry and half-consumed
+scratch into a second object whose FIFO occupancy no longer corresponds
+to any real clock relationship. These are what the two-agent contract
+makes them: objects with identity, addressed by the threads that share
+them, not values to be passed around. Deleting the operations turns the
+meaningless question into a compile error — the same conversion of
+convention into compiler-enforced fact as the `static_assert`s
+(sections 2, 9) and the concept (section 2). Moves go with the copies
+(user-declared copy operations suppress the implicit moves, so a move
+falls back to the deleted copy), which is also right: a moved-from ring
+would invalidate the pointers the other thread is using *right now*.
+
+| Decision | Rejected | Reason | Evidence |
+|---|---|---|---|
+| deleted copy (and hence move) on ring and converter | default/deep copies | two live threads reference the object by identity; a copy duplicates state but not the clock relationship; atomics aren't copyable | `include/srt/spsc_ring.hpp`, `include/srt/asrc.hpp` |
+
+## 17. Rejected wholesale, with reasons
+
+Some decisions are visible only as absences. For each, the reason is on
+record.
+
+**`std::simd` / `std::experimental::simd`.** Not in C++20 — the library's
+floor — and the portable-SIMD abstraction solves a problem this codebase
+measured its way out of differently: where explicit SIMD wins, it is
+gated per target and per measurement (the SMLALD path, +measured, kept;
+the Hexagon `vrmpyh` path, −0.31%, implemented, proven bit-exact, and
+*deliberately deleted* per the stop rule — PERFORMANCE.md C5). Where
+auto-vectorization already wins (Helium on the M55, host AVX2 via the
+channel axis), abstraction would only obscure what `-fopt-info-vec` and
+`objdump` verified.
+
+**Coroutines.** The library's callers are device callbacks with hard
+deadlines: `push()` on the capture thread, `pull()` on the playback
+thread, both synchronous by the nature of the contract. No async model
+fits — a suspension point inside a real-time callback is a category
+error, and the frame flow the library does need (the resampler pulling
+from the ring mid-synthesis) is expressed by the `PopFn` callable of
+section 13 at zero machinery.
+
+**CRTP mixins.** Section 2's reasons in general form: the concept + traits
+pair already delivers static dispatch and constraint checking without
+forcing an inheritance shape onto builtin sample types or wrapper types
+onto raw buffers.
+
+**Exceptions on the audio path.** Section 4; reinforced by a toolchain
+that cannot unwind at all.
+
+**`std::jthread` (or any thread) in the library.** The library owns *no*
+threads. It is a passive object with a two-agent contract — "one producer
+thread calls push() at the input clock; one consumer thread calls pull()
+at the output clock" (`asrc.hpp`) — and the threads belong to the caller,
+because they already exist: they are the audio device callbacks. Spawning
+threads would also be unbuildable on half the CI matrix; the bare-metal
+targets have no `std::thread` at all, which is why even the *tests*
+compile the two-thread stress only where `find_package(Threads)` succeeds
+(`tests/CMakeLists.txt`).
+
+**Virtual interfaces for "pluggable filters."** The filter is not a
+plugin point; it is a *parameter space*. `FilterSpec` exposes the five
+numbers that matter (L, T, band edges, attenuation) and the design
+machinery is one fixed, well-understood method (Kaiser-windowed sinc)
+whose properties the quality tests pin. An `IFilterDesigner` interface
+would buy the ability to substitute arbitrary coefficient tables at the
+cost of an indirect call chain into the kernel (section 2's costs) and
+the loss of every invariant the code currently states about its own
+tables — per-branch DC gain, the extra phase row's exact continuity,
+the measured |diff| ≤ 41 adjacent-phase delta of section 18.
+
+| Rejected | Reason | Evidence |
+|---|---|---|
+| `std::simd` | not in C++20; per-target measured intrinsics (kept or deleted by number) beat portable abstraction | `docs/PERFORMANCE.md` C4/C5 |
+| coroutines | hard-RT synchronous callbacks; no async model fits | `include/srt/asrc.hpp` thread contract |
+| CRTP mixins | concept + traits already give static dispatch without inheritance shape | `include/srt/sample_traits.hpp` |
+| audio-path exceptions | RT contract; Hexagon cannot unwind | section 4 |
+| `std::jthread` in the library | passive two-agent object; caller owns the (callback) threads; bare metal has none | `include/srt/asrc.hpp`; `tests/CMakeLists.txt` Threads probe |
+| virtual pluggable filters | filter is a parameter space, not a plugin point; would cost kernel inlining and table invariants | `include/srt/polyphase_filter.hpp` (`FilterSpec`) |
+
+## 18. The meta-decision: comments that show their arithmetic
+
+Read back through the evidence column of this appendix and notice where
+it points: overwhelmingly at *comments*. The library's final C++ decision
+is about prose. Its comments do not narrate ("increment the index");
+they state constraints and record arithmetic at the point where the code
+depends on them. The Q15 traits comment derives the accumulator budget
+("48-80 taps add ~6-7 bits — no overflow, no intermediate rounding"). The
+`kaiser.hpp` note quantifies the constexpr rejection (section 5). The
+resampler's eps conversion documents its own safety margin ("|eps| is
+servo-clamped to ~1e-3, so eps * 2^64 fits int64 comfortably"). The
+`appendOne` compaction comment carries the +6–8% scar of section 12.
+These comments are load-bearing: they are the reasons future editors
+will weigh before changing the code, so they are held to the same
+standard as the code.
+
+Including being *audited*. The package audit that hardened the core
+(commit `029607f`, "Core hardening from the package audit") checked the
+comments' arithmetic along with the code's, and found one wrong: the Q15
+`blend()` comment claimed the int32 product had "~5% margin" against a
+worst-case adjacent-phase delta. The audit did the multiplication —
+32767 × 65535 = 2,147,385,345, which sits 0.005% under `INT32_MAX`, not
+5% — and the commit's own summary records the fix: "Q15 blend margin
+comment corrected (0.005%, not ~5%)." The corrected comment in
+`sample_traits.hpp` now shows the numbers and the measurement
+(real deltas: |diff| ≤ 41 on the transparent table) and draws the
+conclusion the wrong margin obscured: "a margin that thin is not an
+invariant worth relying on silently" — which is precisely why the code
+computes the blend in `int64_t`. Note what did *not* change: the code
+was already right. The comment was the bug.
+
+That is the standard this appendix has been documenting all along. A
+decision is not what the code happens to do; it is a claim, written where
+the code makes it true, precise enough to be checked — and checked.
diff --git a/book/src/appendix/glossary.md b/book/src/appendix/glossary.md
new file mode 100644
index 0000000..c767f1a
--- /dev/null
+++ b/book/src/appendix/glossary.md
@@ -0,0 +1,269 @@
+# Appendix B: Glossary
+
+> The limits of my language mean the limits of my world.
+>
+> — Ludwig Wittgenstein, *Tractatus Logico-Philosophicus*
+
+Terms of art as this book uses them. Where the general meaning and this
+project's usage differ, the entry gives the project's.
+
+**Acquire/release** — the pair of C++ memory orderings that establishes
+*happens-before* across threads: everything written before a
+release-store is visible after the acquire-load that observes it. The
+only synchronization in the library's ring buffer, used once per
+direction; the same pair carries the converter across the RP2350's two
+cores in the dual-core firmware.
+
+**AES17** — the Audio Engineering Society's standard for measuring
+digital audio equipment, defining how THD+N and dynamic range are taken
+(notch the fundamental, integrate the residual over the audio band,
+A-weight for DR). The comparison notebook implements an AES17-style
+measurement so the library's numbers are commensurable with hardware
+datasheets.
+
+**Anti-image filter** — the lowpass that removes the spectral copies
+(images) created by interpolating between sample instants. In this
+library it is the Kaiser-windowed sinc prototype: pass the audio band
+flat, suppress everything from the first image upward by the stopband
+attenuation.
+
+**ASRC (asynchronous sample rate converter)** — a converter between two
+sample streams whose clocks are *independent*: the ratio is not known in
+advance, is never exactly rational, and drifts, so it must be recovered
+continuously by a servo. Distinct from a resampler library, which must be
+handed the ratio from outside.
+
+**Beat frequency** — the rate at which a slow periodic alignment recurs;
+here, the rate at which whole-sample slips (and hence occupancy sawteeth)
+arrive: `ppm × 10⁻⁶ × fs` for sample-granular transfer (4.8 per second at
+100 ppm and 48 kHz), divided by the block size for block transfer.
+
+**Blend factor** — the fractional weight μ used to linearly interpolate
+between the two polyphase coefficient rows adjacent to the current
+fractional position. Computed once per output frame and shared across
+all channels, which is why N channels cost `blend + N × dot`.
+
+**Block-beat sawtooth** — the deterministic waveform that block-quantized
+transfer imprints on the FIFO occupancy observable: one push/pull block
+peak-to-peak, at the beat frequency. It is measurement quantization, not
+clock movement; the servo's stage gating and the unlock threshold both
+exist to keep it out of the rate estimate.
+
+**Cache line** — the unit (64 bytes on the targets here) in which cores
+move memory between their caches. Data structures shared between two
+real-time threads are laid out in whole cache lines per owner.
+
+**Cache-line ping-pong** — the performance failure where a line written
+by one core and read by another migrates back and forth on every access,
+costing hundreds of cycles each trip. The ring buffer's cached-index
+design exists so the steady-state fast path touches no foreign line at
+all.
+
+**Cent** — one hundredth of a semitone, about 0.06% in frequency; the
+unit in which the block-size study reports the low-rate FM that coarse
+blocks impose (~0.9 cents rms at 32-frame blocks).
+
+**dBc** — decibels relative to the carrier: the level of a sideband or
+spur measured against the signal that carries it, used for the servo's
+sawtooth-rejection figures.
+
+**dBFS** — decibels relative to digital full scale; −1 dBFS is the AES17
+measurement level, 0.5 FS (−6 dBFS) the quality suite's.
+
+**DWT / CYCCNT** — the Data Watchpoint and Trace unit of Arm M-profile
+cores and its free-running 32-bit cycle counter. Optional silicon (hence
+the `NOCYCCNT` runtime check), per-core on the RP2350, and the
+instrument that converts QEMU instruction baselines into real cycle
+budgets.
+
+**False sharing** — two logically unrelated variables sharing one cache
+line, so writes to either invalidate both readers. Prevented in the ring
+by giving producer state, consumer state, and shared read-only state a
+64-byte-aligned line each.
+
+**FIFO** — first-in-first-out buffer. In this library the SPSC ring
+between the clock domains; its occupancy doubles as the servo's phase
+detector, which is why it exposes exact occupancy rather than an
+approximation.
+
+**Fractional delay** — a delay of a non-integer number of samples,
+realized by interpolating between stored samples. The near-unity ASRC's
+datapath is a fractional delay that *creeps*: the fractional position
+advances by the small rate deviation every frame.
+
+**Frame** — one sample per channel at one time instant; interleaved
+buffers store frame after frame. Latency and occupancy are denominated
+in frames so they are channel-count-invariant.
+
+**Group delay** — the delay a filter imposes on signal envelopes; for
+the linear-phase FIR here it is a constant (L·T − 1)/(2L) ≈ 24 input
+samples for the default filter (L = 256, T = 48), the fixed half of the
+converter's latency budget.
+
+**Header-only** — a library shipped entirely as headers, compiled into
+each consuming translation unit. It buys trivial integration and full
+inlining, and costs discipline about ABI fragility (see the rejected
+`hardware_destructive_interference_size`).
+
+**Interleaved** — channel-multiplexed sample layout
+(`L R L R …`), the wire format of `push()`/`pull()`.
+
+**Kaiser window** — the near-optimal FIR design window with one shape
+parameter β trading main-lobe width against sidelobe level, plus
+published closed-form fits from stopband attenuation to β and to filter
+length. Chosen because the design math is a page of code with known
+error bounds, evaluated once at construction.
+
+**Latency breathing** — the slow wander of the FIFO term of end-to-end
+latency (a fraction of the block size) as the servo phase-tracks the
+block beat in Track stage; benign, and distinct from an actual setpoint
+change.
+
+**Lock-free** — progress guarantee: every operation completes in a
+bounded number of steps regardless of what other threads do, including
+being suspended at the worst instruction. Required of everything on the
+audio path; asserted at compile time for every atomic the hot path
+touches.
+
+**Memory model / `std::memory_order`** — the C++ rules defining which
+values a load may observe across threads, controlled per-operation by
+ordering annotations. This codebase's idiom is *sufficiency as
+documentation*: each annotation is exactly as strong as the proof needs,
+so each one tells the reader why it exists.
+
+**MVE / Helium** — Arm's M-profile Vector Extension (Cortex-M55 class):
+128-bit SIMD including fp32, but no double precision. Its presence or
+absence gates which Q15 kernel the library compiles.
+
+**NCO (numerically controlled oscillator)** — an accumulator whose
+increment sets its frequency. The converter's μ phase accumulator is the
+NCO of its PLL: the servo's ε̂ sets the increment, wraps mark whole-sample
+slips.
+
+**Near-unity** — the regime this library specializes in: conversion
+ratios within a few hundred ppm of 1.0 (two "48 kHz" clocks), where the
+general resampling problem degenerates into a creeping fractional delay.
+The specialization is what buys the 48-tap datapath and sub-millisecond
+filter delay.
+
+**Occupancy** — the number of frames currently buffered between the
+domains (ring plus staged frames). The servo's only sensor; its
+quantization is the fundamental measurement limit of the design.
+
+**Phase accumulator** — the unsigned Q0.64 integer holding the
+fractional resampling position. It accumulates only the rate *deviation*
+per output sample, in integer arithmetic (resolution 2⁻⁶⁴ samples), and
+detects whole-sample slips by 64-bit wraparound.
+
+**Polyphase decomposition** — factoring one long interpolation filter
+into L short branches, one per fractional-delay phase, so each output
+sample evaluates T taps instead of L·T. The table stores L+1 rows so the
+μ wrap 1→0 is branch-free and exactly continuous.
+
+**ppm (parts per million)** — 10⁻⁶, the natural unit of crystal
+tolerance and drift. Consumer crystals sit tens of ppm from nominal; the
+converter accepts ±1000 ppm by default.
+
+**Q-format (Q0.15, Q1.14, Q1.30, Q0.64 …)** — fixed-point notation:
+Qm.n has m integer bits and n fractional bits in a signed word (the
+project writes the unsigned 64-bit phase as Q0.64). Q15 audio samples
+are Q0.15; the corresponding coefficients are Q1.14 so values slightly
+above 1.0 survive; accumulation is int64.
+
+**Ratchet** — the CI mechanism that compares deterministic instruction
+counts against committed baselines at ±3% in *both* directions: a
+regression fails, and an unexplained improvement also fails until the
+baseline is deliberately re-committed. Two-sided so that numbers can
+only change on purpose.
+
+**Semihosting** — a debug protocol by which a bare-metal program calls
+into its host/debugger for I/O; how the Cortex-M test binaries print
+results and exit under QEMU system emulation.
+
+**Seqlock** — a reader-retry publication scheme: the writer makes a
+sequence counter odd, writes the payload, makes it even; readers retry
+until they see the same even value before and after a whole read. Used by
+the dual-core firmware to publish multi-word statistics coherently with
+only 32-bit atomics.
+
+**Servo** — a feedback controller steering a plant toward a setpoint;
+here the PI controller that steers FIFO occupancy to the target by
+adjusting the resampling rate, thereby *becoming* the clock-ratio
+estimator.
+
+**Setpoint** — the target FIFO occupancy (`targetLatencyFrames`),
+i.e. the buffering half of the latency budget. Must exceed the pull
+block and the peak jitter excursion; the converter raises its
+*effective* value when it observes otherwise.
+
+**Sine-fit metrology** — measuring quality by least-squares-fitting the
+known test tone (amplitude, phase, frequency) and analyzing the residual
+after exact subtraction. Sharper than FFT bins for single-tone tests and
+immune to window leakage — leakage of the fitted tone cannot masquerade
+as noise or crosstalk.
+
+**Slip** — the whole-sample event in near-unity conversion: after roughly 1/ε
+samples (ε the rate deviation; 10⁴ at 100 ppm) the accumulated fractional
+position crosses a sample boundary and the read window shifts by one input
+sample. The extra polyphase row makes the slip exactly continuous in the output.
+
+**SNR (signal-to-noise ratio)** — here, the fitted test tone's power
+against everything else in the analysis window (a THD+N-style residual,
+so distortion counts as noise), in dB.
+
+**Soft float / soft double** — floating-point arithmetic emulated in
+integer instructions because the hardware lacks the format, as is the case
+for all FP64 on Cortex-M33 and Hexagon. It is the reason the fixed-point
+datapaths exist and the reason the servo's double math is budgeted per
+block, not per sample.
+
+**SPSC (single-producer single-consumer)** — the concurrency restriction
+of the library's ring: exactly one pushing agent and one pulling agent.
+The restriction is what makes lock-freedom cheap — and it is a contract
+about agents, not threads, which is what lets two CPU cores satisfy it.
+
+**TCG plugin** — an instrumentation hook in QEMU's Tiny Code Generator;
+the project's counting plugin observes every executed guest instruction,
+yielding the deterministic per-workload counts the ratchet gates.
+
+**THD+N (total harmonic distortion plus noise)** — everything that is
+not the test signal — harmonics, spurs, noise — integrated over the
+audio band and expressed relative to the signal. The AES17 measurement
+the comparison document reports (−132 dB at the 24-bit interface).
+
+**ThreadSanitizer (TSan)** — a compiler-instrumented data-race detector
+that observes the ordering annotations actually used. It certifies only
+the interleavings a run produces, which is why the project also runs it
+on genuinely weakly-ordered arm64 hardware.
+
+**Type-2 loop** — a control loop with two integrators around the cycle
+(here: the PI's integrator plus the FIFO, which integrates rate error
+into occupancy). Type 2 is what nulls a *constant* rate offset with zero
+standing occupancy error.
+
+**UF2** — the drag-and-drop flashing format of Raspberry Pi Pico-class
+boards; the build artifact of both Pico 2 firmware harnesses.
+
+**Underrun / overrun / resync** — the converter's three accounting
+events: a pull found too little data (output silence-padded, refill and
+re-lock), a push found the FIFO full (newest frames dropped), and the
+consumer-side hard discard back to the setpoint after the high watermark
+is reached. All three are counted, published in `Status`, and expected
+to be zero after lock.
+
+**VLIW (very long instruction word)** — an architecture that packs
+several operations into one issue packet scheduled by the compiler, as
+on Qualcomm's Hexagon DSP. Why "instructions executed" and "packets
+executed" differ there, and part of why instruction counts are budgets
+rather than cycle counts.
+
+**Wraparound arithmetic** — unsigned integer arithmetic modulo 2^N,
+which C++ defines exactly. The ring's monotonic indices and the DWT
+cycle deltas both rely on the same theorem: a difference that fits the
+word is computed exactly *through* the wrap, so the wrap is not an edge
+case but a non-event.
+
+**xrun** — ALSA's collective name for a device-level underrun or overrun
+(the OS missed the hardware's deadline). Handled in the bridge by
+`snd_pcm_recover`; distinct from the converter's own underrun/overrun
+accounting, which sits one layer up.
diff --git a/book/src/img/architecture.svg b/book/src/img/architecture.svg
new file mode 100644
index 0000000..e44ea4c
--- /dev/null
+++ b/book/src/img/architecture.svg
@@ -0,0 +1,1760 @@
+
+
+
diff --git a/book/src/img/feasibility.svg b/book/src/img/feasibility.svg
new file mode 100644
index 0000000..d0c31ac
--- /dev/null
+++ b/book/src/img/feasibility.svg
@@ -0,0 +1,4870 @@
+
+
+
diff --git a/book/src/img/kaiser-response.svg b/book/src/img/kaiser-response.svg
new file mode 100644
index 0000000..4b8ccd3
--- /dev/null
+++ b/book/src/img/kaiser-response.svg
@@ -0,0 +1,3683 @@
+
+
+
diff --git a/book/src/img/kaiser-window.svg b/book/src/img/kaiser-window.svg
new file mode 100644
index 0000000..98b5525
--- /dev/null
+++ b/book/src/img/kaiser-window.svg
@@ -0,0 +1,1791 @@
+
+
+
diff --git a/book/src/img/q064-slip.svg b/book/src/img/q064-slip.svg
new file mode 100644
index 0000000..ece9f37
--- /dev/null
+++ b/book/src/img/q064-slip.svg
@@ -0,0 +1,2039 @@
+
+
+
diff --git a/book/src/img/servo-lock.svg b/book/src/img/servo-lock.svg
new file mode 100644
index 0000000..32f7d63
--- /dev/null
+++ b/book/src/img/servo-lock.svg
@@ -0,0 +1,3005 @@
+
+
+
diff --git a/book/src/introduction.md b/book/src/introduction.md
new file mode 100644
index 0000000..ac5da99
--- /dev/null
+++ b/book/src/introduction.md
@@ -0,0 +1,124 @@
+# Introduction
+
+> Talk is cheap. Show me the code.
+>
+> — Linus Torvalds
+
+This book explains one piece of software completely.
+
+The software is **SampleRateTap**, a header-only C++20 library that solves a
+narrow, stubborn problem in real-time audio: two devices both claim to run at
+48 kHz, but each owns its own crystal oscillator, so neither actually does.
+One drifts a few parts per million against the other — imperceptibly slowly
+and absolutely relentlessly — and any system that moves audio between them
+must either resample adaptively or eventually glitch. The library converts
+between two such clock domains transparently (about 135 dB of measured
+fidelity), in real time (about 1.5 ms of latency), on hardware from Xeon
+servers down to a $5 microcontroller.
+
+That is a small enough problem to fit in your head and a deep enough one to
+teach from. Solving it well demands working knowledge of half a dozen fields
+that are usually taught separately: FIR filter design, fixed-point
+arithmetic, control theory, lock-free concurrency, the C++ memory model,
+SIMD micro-architecture, and the discipline of measuring instead of
+guessing. The premise of this book is that you learn those subjects better
+around one real, shipping artifact — where every design decision had to
+survive contact with every other — than from isolated examples built to
+illustrate exactly one thing.
+
+## Who this is for
+
+You are comfortable in C++ — templates, RAII, the standard library — but you
+have not necessarily written audio code, used `std::memory_order_acquire` in
+anger, designed a filter, or counted the instructions your compiler emits.
+No DSP background is assumed; the mathematics is built up exactly as far as
+the code needs it and no further. Where a result has a textbook derivation,
+we cite the textbook and spend our pages on what the textbooks omit: why
+*this* form of the equation, in *this* code, on *this* hardware.
+
+## How this book stays honest
+
+Three mechanical commitments distinguish this book from most code walkthroughs.
+
+**The excerpts are live.** Every block of library code you read is included
+into the book at build time from the actual header in the repository, by
+anchor. If the code changes, the book changes or the book's build breaks —
+and it breaks in this project's continuous integration, which gates the book
+just as it gates every published number. There is no possibility of the
+classic tutorial failure where prose describes code that no longer exists.
+
+**Every claim ends in a command.** The library's culture is that performance
+and quality numbers are measured, gated, and regenerated — never asserted
+from memory. The book inherits that: each chapter closes with a *Verify it
+yourself* section listing the exact tests, benchmarks, or notebooks that
+back what you just read. When this book says the ring buffer is correct
+under weak memory ordering, you will be holding the ThreadSanitizer
+invocation that fails if it is not.
+
+**The figures are regenerable.** Every plot in this book is produced by
+`scripts/book_figures.py` from the sources the text cites: the filter
+curves re-run the header's design math formula-for-formula, and the servo
+and feasibility traces are *measured* — the script compiles a small trace
+dumper against the real headers (and, for the before-the-fix panel,
+against the pre-fix commit's headers pulled from git history) and runs it
+in deterministic virtual time. Rerun the script and you reproduce every
+figure; nothing is drawn from memory except the one architecture diagram,
+which is labeled as drawn.
+
+## The history is the curriculum
+
+This codebase was built measurement-first, and its history contains real
+reversals, preserved deliberately:
+
+- An optimization hypothesis about the Cortex-M55's floating-point unit that
+ was **wrong**, discovered because a 1.4% instruction-count regression
+ contradicted the project's own documentation — and the documentation, not
+ the measurement, turned out to be at fault.
+- A Hexagon vectorization effort that was implemented, proven bit-exact,
+ measured at a 0.31% improvement — and then **deliberately deleted**, with
+ the disassembly evidence recorded so nobody re-derives the dead end.
+- A correctness bug that survived months of green CI because every test and
+ benchmark happened to be configured just clear of it, found by an
+ adversarial audit, and demonstrated before it was fixed.
+- A toolchain that turned out to be unable to catch C++ exceptions at all —
+ discovered the day the first `EXPECT_THROW` reached it.
+
+These are not embarrassments to be edited out; they are the most valuable
+material in the book. Anyone can present a finished design as if it were
+inevitable. Watching a design *survive falsification* teaches you what the
+finished form is actually load-bearing against.
+
+## The shape of the book
+
+**Part 0** establishes the problem and its budgets: why a plain FIFO
+measurably fails (−34.7 dB!), what near-unity specialization buys, and the
+arithmetic that connects picoseconds of timing jitter to decibels of
+fidelity.
+
+**Part I** is the heart: the library's seven headers, one chapter each, in
+dependency order — filter design, the polyphase table, the sample-type
+traits, the lock-free ring, the clock servo, the fractional resampler, and
+the converter that composes them. Each chapter covers the algorithm, the
+C++ idioms chosen *and rejected*, and the failure modes the design guards
+against.
+
+**Part II** explains the proof system: deterministic two-clock simulation,
+sine-fit metrology, and the instruction-count ratchet that lets a CI runner
+gate embedded performance to the exact instruction.
+
+**Part III** retells the optimization campaign as it actually happened —
+six efforts, four wins, one honest draw, one deliberate revert — with the
+real numbers and the two implementation traps that cost a day each.
+
+**Part IV** is portability: what a Qualcomm DSP, two bare-metal Arm cores,
+and a C foreign-function interface each demanded.
+
+**Part V** reaches hardware: real crystals, real cycle counters, and the
+configuration rules that scale across channel counts and sample rates.
+
+The appendices collect the C++ decision log (every idiom adopted or
+rejected, with reasons), a glossary, and an annotated bibliography.
+
+Chapters are largely self-contained, but Part I builds on itself; if you
+read only one chapter, make it [the lock-free ring](part1/spsc-ring.md) —
+it is short, complete, and representative of the whole book's method.
diff --git a/book/src/part0/budgets.md b/book/src/part0/budgets.md
new file mode 100644
index 0000000..add62e8
--- /dev/null
+++ b/book/src/part0/budgets.md
@@ -0,0 +1,354 @@
+# Budgets: latency, quality, compute
+
+> Perfection is achieved, not when there is nothing more to add, but when there is nothing left to take away.
+>
+> — Antoine de Saint-Exupéry, *Wind, Sand and Stars*
+
+The previous chapter ended with three words used as if they were
+self-explanatory: latency, quality, compute. This chapter turns each into
+a number with a derivation behind it, because everything in Part I is an
+expenditure against one of these three accounts, and you cannot audit an
+expenditure without knowing the budget.
+
+The three budgets are not independent. A longer filter buys stopband
+attenuation (quality) at the price of group delay (latency) and
+multiply-accumulates (compute). A deeper FIFO buys servo stability
+(quality, indirectly) at the price of latency. A finer polyphase table
+buys interpolation accuracy at the price of memory and cache traffic. The
+design that ships is not the best possible point on any single axis; it is
+a defensible allocation across all three, and the allocation is different
+for a Xeon than for a microcontroller. That is why the library has presets
+and sample-type variants rather than one configuration: same architecture,
+different budget splits.
+
+We take the three in the order of most surprising to least.
+
+## The quality budget, denominated in picoseconds
+
+The README makes a claim that deserves suspicion on first reading: the
+phase accumulator's resolution is "far below the ~8 ps jitter budget for
+120 dB transparency at 20 kHz." Eight *picoseconds* — in an audio system,
+where a sample lasts twenty-one microseconds, six orders of magnitude
+longer. Where does a number like that come from?
+
+It comes from the first real mathematics in this book, and the derivation
+is three lines. This library's entire datapath is, as the last chapter
+established, a creeping fractional delay: every output sample is the input
+signal evaluated at a slightly wrong time, deliberately. So the natural
+question is: how wrong is *acceptably* wrong? If we evaluate the signal at
+time `t + Δt` instead of `t`, how large may `Δt` be before the error
+matters at the quality level we are targeting?
+
+Take the worst case the audio band can offer: a full-scale sine at the top
+of the band,
+
+```text
+s(t) = A · sin(2π f t), f = 20 kHz.
+```
+
+The error caused by a small timing offset is governed by how fast the
+signal can change. Differentiating, the slope is `2π f A · cos(2π f t)`,
+whose magnitude peaks — at the zero crossings — at
+
+```text
+max |ds/dt| = 2π f A.
+```
+
+A timing error `Δt` therefore produces an amplitude error of at most the
+slope times the error:
+
+```text
+e = 2π f A · Δt.
+```
+
+Now impose the quality target. The filter at the heart of this library is
+designed with a 120 dB stopband — the "120 dB transparency" figure that
+recurs throughout the project — and −120 dB as an amplitude ratio is
+`10^(−120/20) = 10⁻⁶`. Demanding that the timing-induced error stay below
+that, relative to full scale:
+
+```text
+2π f · Δt ≤ 10⁻⁶
+Δt ≤ 10⁻⁶ / (2π · 20 000 Hz) = 7.96 × 10⁻¹² s ≈ 8 ps.
+```
+
+Eight picoseconds. Not because audio hardware keeps time that precisely —
+it does not, remotely — but because *this library's job is to manufacture
+sampling instants*. The two crystals define real time; the converter
+invents the fractional positions in between, and any noise in those
+invented positions is indistinguishable from noise added to the audio, at
+the exchange rate the slope sets: one picosecond of timing error at 20 kHz
+full scale costs about an eighth of a microvolt-per-volt, and 8 ps costs
+−120 dB. Position error *is* amplitude error. That single sentence is the
+reason a resampling library must care about time resolution that would be
+absurd anywhere else in audio.
+
+Two honest qualifications keep the number from overclaiming. First, this
+is a worst-case bound — the full-scale 20 kHz zero crossing — and real
+program material spends almost no energy there; at 1 kHz the same
+derivation gives a 20× looser budget, which is one reason the measured SNR
+table is 135 dB at 997 Hz but 105 dB at 19.5 kHz. Second, the budget
+governs *random or signal-uncorrelated* timing error. Slowly varying
+timing error is not noise but frequency modulation — pitch wobble — and it
+gets its own, much stricter treatment when the servo chapter derives why
+the Quiet stage must reject its input sawtooth to roughly −120 dBc
+equivalent at 20 kHz. Same currency, different account.
+
+## Spending the budget: sixty-four bits of phase
+
+With the budget in hand, we can now read the library's most important
+data-representation decision as the budget allocation it is. Convert 8 ps
+into the datapath's native unit, fractions of a sample at 48 kHz:
+
+```text
+8 ps / 20.8 µs ≈ 3.8 × 10⁻⁷ samples ≈ 2⁻²¹ samples.
+```
+
+So the fractional position µ must be carried to about 21 fractional bits
+before timing quantization alone could threaten 120 dB. Here is what the
+library actually does, in the inner loop of the fractional resampler —
+this is the Q0.64 phase accumulator the README describes, live from
+`include/srt/polyphase_filter.hpp`:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:p0_phase_step}}
+```
+
+The fractional position lives in an unsigned 64-bit integer interpreted as
+Q0.64: all 64 bits are fraction, so the resolution is 2⁻⁶⁴ of a sample —
+forty-three binary orders of magnitude below the 2⁻²¹ the budget demands.
+The servo's rate-deviation estimate `epsHat` is converted from double to
+this fixed-point form **once per block**, and from there the per-sample
+path is pure integer arithmetic: one 64-bit addition per output sample,
+with the two slip cases — the fractional position creeping past 1.0 or
+below 0.0, the "whole-sample slip roughly every `1/ppm` samples" of
+Chapter 1 — detected by unsigned wraparound rather than comparison against
+a threshold.
+
+Why carry 43 bits more resolution than the budget requires? Because the
+excess is free, and what it buys is not resolution but *exactness*. A
+phase accumulator adds a tiny ε thousands of times per second; do that in
+floating point and every addition rounds, because a double near 1.0 has
+2⁻⁵² of absolute resolution and a double's rounding depends on the current
+magnitude of the accumulator. The earlier version of this code did exactly
+that, and worked. But integer addition modulo 2⁶⁴ does not round — ever —
+so the only quantization in the entire phase path is the once-per-block
+conversion of ε itself, and the accumulated position between servo updates
+is bit-exact. (The conversion is safe by construction: the servo clamps
+|ε| to about 10⁻³, so `ε · 2⁶⁴` fits comfortably in the signed 64-bit
+intermediate — the code comment above carries the argument, and the
+configuration validator refuses `maxDeviationPpm` settings that could
+break it.)
+
+The project's performance log records what this decision measured when it
+landed as change C3 of the optimization campaign: the *motivation* was the
+compute budget — an integer-only per-sample path with no doubles is what
+keeps the inner loop cheap on DSPs without double-precision floating-point
+units, and it cut Hexagon's Q31 pipeline cost by 15.5 % — but quality
+*improved* as a side effect, to 135.0 dB at 997 Hz, with the log noting
+the phase resolution change from 2⁻⁵² to 2⁻⁶⁴. One representation change,
+paid from no budget, credited to two. Those are rare, and worth designing
+toward.
+
+## The latency budget
+
+Latency is the easiest budget to state and the easiest to spend by
+accident. Here is where every frame of it is decided — the converter's
+entire configuration surface, live from `include/srt/asrc.hpp`:
+
+```cpp
+{{#include ../../../include/srt/asrc.hpp:p0_config}}
+```
+
+The README's latency equation prices the defaults:
+
+```text
+latency = targetLatencyFrames + (L·T − 1) / (2L) [input frames]
+ = 48 + (256·48 − 1)/512
+ = 48 + ~24 ≈ 72 frames ≈ 1.5 ms at 48 kHz.
+```
+
+Two terms, and they are budget lines of entirely different character.
+
+The second term is the **filter group delay**, and it is a law of physics
+wearing a configuration option's clothes. The interpolation filter is a
+linear-phase FIR — symmetric coefficients, which is what guarantees every
+frequency is delayed equally, so that waveform shape is preserved — and a
+symmetric filter *must* delay the signal by half its span: with `L = 256`
+polyphase branches of `T = 48` taps each, `(L·T − 1)/(2L)` is 23.998
+input frames, ~0.50 ms. You cannot negotiate this term down at constant
+quality; you can only buy a shorter filter. `FilterSpec::fast()` does
+exactly that, cutting group delay to about 16 frames at reduced stopband,
+and the `transparent()` preset spends the other way — 80 taps, 40 frames,
+0.83 ms — for its extra high-frequency headroom. Quality and latency,
+trading at a posted exchange rate of half a frame per tap.
+
+The first term, the 48-frame **FIFO setpoint**, is not physics but control
+headroom, and it is the term you own. The FIFO between the clock domains
+must never run empty (an audible underrun) and never hit its high
+watermark (a resync), so the servo regulates its occupancy around a
+setpoint — and that standing occupancy is buffered audio you are listening
+through. Forty-eight frames is one millisecond at 48 kHz: enough to absorb
+the push/pull phase jitter of real callbacks with margin, small enough to
+keep the total design latency at 1.5 ms.
+
+The setpoint carries a feasibility rule that the README states in bold and
+the constructor-plus-`pull()` logic enforces, because violating it does
+not degrade the system — it destroys it: **the setpoint must exceed the
+pull block size.** A `pull()` synthesizes output only from frames already
+buffered; if the callback asks for 128 frames while the servo holds the
+buffer at 48, every callback drains the FIFO through empty, and the
+converter falls into a permanent dropout cycle that no amount of servo
+cleverness can escape, because the geometry is simply infeasible. Rather
+than document a footgun, the converter adapts: when it observes pull
+blocks larger than the configured setpoint, it raises the effective
+setpoint to the block size plus about half a block of margin (bounded by
+FIFO capacity — callbacks above ~340 frames also need `fifoFrames` sized
+explicitly), reports the raised value in
+`Status::effectiveTargetLatencyFrames`, and lets latency follow. The
+latency budget, in other words, has a hard floor set by your callback
+size, and the library will spend up to that floor without asking — the
+one budget line it refuses to let you underfund. On top of the rule sits
+its softer sibling: the setpoint must also stay above the peak occupancy
+excursion of your push/pull jitter, and the FIFO term breathes by a
+fraction of the block size as the servo tracks drift, so 1.5 ms is a
+design center, not a guarantee etched per-sample.
+
+`designedLatencySeconds()` reports the resulting figure at runtime, and
+`tests/test_latency.cpp` closes the loop the project's way: it pushes an
+impulse through a locked converter and asserts that the impulse emerges
+where the equation said it would.
+
+## The compute budget
+
+The third budget is the one whose *unit* changes with the deployment. On a
+server, compute is a fraction of a core; on a microcontroller, it is a
+question of existence — does the workload fit under the clock rate or not.
+This library targets both ends simultaneously, which is why its
+performance culture is unusual, and why `docs/PERFORMANCE.md` is one of
+the two canonical history documents this book draws on.
+
+Start at the comfortable end. On the shared 2.80 GHz Xeon that produced
+the README's benchmark table, the default float converter processes a
+stereo 48 kHz stream at 107.8 ns per frame — 193× faster than real time,
+meaning one live stream costs about half a percent of one core. At that
+end the compute budget is not about survival but about citizenship: how
+many streams per core, how much headroom the rest of the audio graph
+inherits.
+
+Now the other end. The README's platform matrix ends at the Arm
+Cortex-M33 — the Raspberry Pi Pico 2's core, bare metal, no FP64 hardware,
+no vector unit — and the project publishes, in the README's
+instruction-count table, exactly what every workload costs there. The
+numbers are *executed instructions*, measured by running fixed workloads
+under QEMU with a counting plugin, and they are brutal and instructive.
+The float interpolation kernel that costs the Cortex-M55 99.5 million
+instructions costs the M33 1.90 **billion** — about 19× — for one reason:
+the float datapath accumulates in double precision by design, and on a
+core with no double-precision FPU every one of those accumulations becomes
+a software floating-point library call. The compute budget on such a
+target is not tightened; it is a different budget entirely, and the
+Q15/Q31 fixed-point datapaths exist precisely as the correctly-denominated
+response — integer-only inner loops that make the M33's cost land near the
+M55's instead of 19× above it.
+
+What does an instruction budget *mean* on a 150 MHz M33? Divide. A 150 MHz
+core executing (optimistically) one instruction per cycle retires 150
+million instructions per second, and a 48 kHz stream demands a frame every
+20.8 µs — about 3,100 instructions of total budget per frame, forever,
+before the rest of the firmware has run at all. Against that, the measured
+comparison workloads put the full Q15 converter — servo and FIFO included
+— at roughly 5,043 instructions per stereo frame on the M33: about 242
+million instructions per second for stereo, over the core's ceiling even
+at ideal IPC. Mono, at roughly half that, fits. This is exactly the
+README's guidance, now visible as arithmetic rather than advice: 48 kHz
+Q15 mono fits a 150 MHz M33; stereo wants the `fast()` preset or the
+RP2350's second core. On a Xeon the same library is a rounding error; on
+the M33 the default preset is *infeasible in stereo*, and knowing that
+before flashing hardware is the entire point of keeping the budget in a
+table.
+
+The honesty clause matters as much as the numbers, and `docs/PERFORMANCE.md`
+states it in its metrics table: instruction counts are deterministic to
+the instruction, noise-free, and well-correlated with real cost *for
+scalar code* — and they are still not cycles. They know nothing of wait
+states, flash caches, or dual-issue. Cycle truth requires vendor
+simulators or real silicon, which is why the repository carries
+`examples/pico2_cyccnt/`, a flashable RP2350 harness that measures
+DWT.CYCCNT cycles per block against these same instruction baselines, and
+why the README explicitly frames the counts as "budgets pending
+real-silicon validation." What determinism *does* buy is enforcement: the
+counts are committed to `bench/baselines.json` and CI re-measures every
+push, failing on any drift beyond ±3 % in either direction — a regression
+is rejected, and an unexplained improvement is also rejected until the
+baseline is re-recorded in the same diff, so stale slack cannot accumulate
+to hide the next regression. Wall-clock numbers, by contrast, are never a
+hard gate: shared runners are too noisy, and a gate that flakes teaches
+people to ignore it. Instructions are gated because they are exact;
+wall-clock is reported because it is real. Both disciplines are the same
+policy — publish only what you can re-measure — applied to metrics of
+different reliability. Part II returns to this machinery in detail.
+
+## Each budget line becomes a file
+
+Part 0 has now done its work: a physical problem (two crystals), a
+measured cost of ignoring it (−34.7 dB), and three budgets with numbers
+attached. Part I walks the library's headers in dependency order, and the
+tour is really the budget ledger read line by line:
+
+`kaiser.hpp` is the quality budget's opening entry — the 120 dB stopband
+that was the 8 ps derivation's target, purchased with a windowed-sinc
+design whose tap count is the latency and compute budgets' first expense.
+The polyphase bank spends memory to make one branch-pair evaluation per
+output sample possible at all, and its `L = 256` branch count is sized by
+the interpolation-residual rule the README quotes (−12 dB per doubling of
+`L`, +12 dB per octave of signal frequency) — the reason the measured
+table slopes from 135 dB at 997 Hz to 105 dB at 19.5 kHz.
+`sample_traits.hpp` is the compute budget's answer to the M33 column
+above: the Q15/Q31 datapaths as a customization point rather than a fork.
+`spsc_ring.hpp` holds the latency budget physically — its occupancy *is*
+the 48-frame line item — and doubles as the servo's sensor. `pi_servo.hpp`
+polices the quality budget's FM account, rejecting the occupancy sawtooth
+to the −120 dBc figure this chapter bounded. The fractional resampler
+carries the Q0.64 accumulator you have already read. And `asrc.hpp`
+composes the whole, enforcing the feasibility rule so the latency budget
+can never be underfunded into a dropout cycle.
+
+Every number in those chapters traces back to one of this chapter's three
+accounts. When a design choice seems baroque — a 64-bit integer phase, an
+extra row in a coefficient table, a third servo stage — the question to
+ask is always the same: *which budget is it spending, and which is it
+defending?*
+
+## Verify it yourself
+
+```sh
+# The 8 ps budget, re-derived in one line:
+python3 -c "import math; print(1e-6 / (2 * math.pi * 20000))"
+
+# The quality budget, enforced: the pinned SNR thresholds behind the
+# README's 135/120/112/105 dB table:
+cmake -B build -DCMAKE_BUILD_TYPE=Release
+cmake --build build -j
+ctest --test-dir build -R AsrcQuality --output-on-failure
+
+# The latency budget, enforced: an impulse must emerge exactly where
+# designedLatencySeconds() promises (48 + ~24 frames by default):
+ctest --test-dir build -R Latency --output-on-failure
+
+# The host compute budget (Google Benchmark; the README table's source):
+cmake -B build-bench -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_BENCHMARKS=ON
+cmake --build build-bench -j
+./build-bench/bench/srt_bench
+
+# The embedded compute budget: fixed workloads under QEMU, compared to
+# the committed baselines at ±3% (needs the cross toolchain and a
+# TCG-plugin-capable QEMU — docs/PERFORMANCE.md has the mechanics):
+python3 scripts/icount.py --target m33 --build-dir <build-dir> --plugin <path/to/plugin.so>
+```
+
+The instruction-count and benchmark tables in the README regenerate from
+these same commands (`scripts/update_icount_docs.py`,
+`scripts/update_perf_docs.py`), and CI fails if the published tables drift
+from the measured baselines — the budgets in this chapter are audited on
+every push.
diff --git a/book/src/part0/two-crystals.md b/book/src/part0/two-crystals.md
new file mode 100644
index 0000000..d9d8696
--- /dev/null
+++ b/book/src/part0/two-crystals.md
@@ -0,0 +1,295 @@
+# Two crystals, one stream
+
+> No man ever steps in the same river twice, for it is not the same river and he is not the same man.
+>
+> — attributed to Heraclitus
+
+Every specification of this library begins with a lie that the audio
+industry tells itself daily: "48 kHz."
+
+There is no such thing as 48 kHz. There is a quartz crystal on the capture
+device's board resonating at very nearly the frequency its datasheet
+promises, and a different quartz crystal on the playback device's board
+doing the same, and neither of them consulted the other. Each was cut,
+trimmed, and aged in its own factory; each sits at its own temperature,
+warming with the electronics around it; each is divided down to a sample
+clock through its own board's logic. When both devices claim 48 kHz, what
+they mean is 48 kHz plus or minus some parts per million — and *whose*
+parts per million is exactly the question. This library's working
+envelope, inherited from the kind of hardware it targets, is a few hundred
+ppm of offset per device, drifting slowly as temperatures change; the
+default configuration accepts anything within ±1000 ppm, and the test
+suite drives it across that range deliberately — including a 0 → 300 ppm
+drift ramp at 10 ppm/s that must be tracked without losing lock.
+
+A part per million sounds like nothing. It is worth pausing on why it is
+everything.
+
+## The integral that cannot be argued with
+
+Suppose you capture audio from device A and play it on device B, and
+suppose the two clocks disagree by +200 ppm — the offset used throughout
+this project's measurements as a realistic mid-scale case: the input side
+runs at 48 009.6 Hz against the output's 48 000 Hz. The rate mismatch is
+0.02 %. Per sample it is invisible. But a rate mismatch does not average
+out; it *integrates*. Every second, the capture side produces 9.6 more
+frames than the playback side consumes. Every second, forever.
+
+Put a buffer between them — the obvious move, and a correct first move —
+and you have only chosen where the failure happens. The surplus
+accumulates in the buffer at 9.6 frames per second. A 1,024-frame FIFO
+(the converter's own default capacity floor, for scale) started half full
+gives you about 53 seconds before it is completely full and something has
+to give. Make the buffer deeper and you buy time linearly while paying
+latency for every frame of depth; a buffer deep enough to survive an
+hour-long session at 200 ppm would hold about three quarters of a second
+of audio, all of which you would then be monitoring through. Flip the sign
+of the mismatch and the same argument drains the buffer to empty instead.
+There is no buffer size that fixes a rate mismatch, because the problem is
+not jitter — which a buffer genuinely absorbs — but a nonzero mean. The
+README states the consequence as the library's founding fact: whole-sample
+slips occur roughly once every `1/ppm` samples (offset as a fraction: every
+5,000 samples at 200 ppm), and any system that moves audio between
+independent clocks must either resample adaptively or eventually glitch.
+
+So the plain FIFO must fail. The interesting question — and the one this
+project answered by measurement rather than assertion, because that is its
+habit — is *how badly*.
+
+## Measuring the do-nothing option
+
+The comparison notebook (`notebooks/asrc_comparison.ipynb`, results
+recorded in `docs/COMPARISON.md`) includes, alongside the serious
+contenders, a subject called the **naive FIFO**: a buffer that simply
+drops the newest samples when full, which is what "we'll deal with it
+later" compiles to. It was measured under exactly the same conditions as
+everything else — a 997 Hz tone at −1 dBFS crossing a +200 ppm clock
+boundary, an AES17-style THD+N analysis with the fundamental removed and
+the residual integrated across the 20 Hz–20 kHz band.
+
+The naive FIFO measures **−34.7 dB THD+N** and 94.7 dB of A-weighted
+dynamic range. The converter this book describes, on the same signal and
+the same clocks, measures −132.1 dB.
+
+What does −34.7 dB sound like? The number means that the error left after
+subtracting the test tone sits only 34.7 dB below the tone itself — a
+residual of about 1.8 % of the signal. If that residual were smooth
+harmonic distortion, 1.8 % would already be far into plain audibility. But
+it is worse than that, because of *how* the error is distributed in time.
+At +200 ppm the buffer overflows and discards a sample about 9.6 times per
+second, and each discard splices the waveform to a point one sample later:
+a step discontinuity. A step is the broadest-band event a sampled signal
+can contain; its energy smears across the entire spectrum. So the
+subjective experience is not a haze of distortion but a steady mechanical
+ticking — roughly ten clicks per second at this offset — riding on
+otherwise clean audio. It is the sound that anyone who has misconfigured a
+USB audio loopback already knows, and once heard it cannot be unheard. The
+dynamic-range figure tells the same story from below: quiet passages sit
+on a floor of click energy, tens of decibels above where the converter's
+floor lies.
+
+That row of the table is the cost of doing nothing, and it calibrates
+everything else in this book. Every design decision in the chapters ahead
+is ultimately justified by the distance between −34.7 dB and −132.1 dB.
+
+## The two industry answers
+
+The two-crystal problem is decades old, and industry converged on two
+families of solution. `docs/COMPARISON.md` opens by insisting on the
+distinction, because both families are marketed under the same three
+letters: there are **full ASRCs** that recover the clock ratio themselves,
+and **resampler libraries** that must be handed the ratio from outside.
+
+**The hardware answer** is the asynchronous sample rate converter chip.
+The canonical part is Analog Devices' AD1896 — the lineage this library's
+architecture explicitly follows — joined by parts like TI's SRC4392. These
+are dedicated silicon: serial audio in on one clock, serial audio out on
+another, and the chip does everything, including the part that makes the
+problem *asynchronous* — discovering the ratio between the two clocks by
+itself, continuously, without being told. The datasheet numbers are
+excellent: −117 dB THD+N minimum (−133 dB best case) and 142 dB dynamic
+range for the AD1896; −140 dB typical and 144 dB dynamic range for the
+SRC4392. Their ratio ranges are enormous — 1:8 up and 7.75:1 down for the
+AD1896, 1:16 to 16:1 for the SRC4392 — because these chips are built to
+convert 44.1 kHz material to 48 kHz and every other crossing a studio can
+produce, not merely to absorb drift. The costs are the obvious ones: a
+proprietary part, a place on the board, one stereo pair per chip, and no
+help at all if your audio exists as bytes in memory rather than as a
+bitstream between codecs. (A caveat the comparison document is careful
+about, and this book inherits: those figures are datasheet values measured
+through analog test loops, not this project's measurement. They are
+comparable to the software numbers in definition, not in environment.)
+
+**The software answer** is the resampler library: libsamplerate, soxr,
+zita-resampler. These are superb pieces of engineering with a structural
+gap that `docs/COMPARISON.md` names precisely: they must be handed the
+ratio by an external servo, and so they solve *only half of the drift
+problem*. A resampler library answers the question "given that the input
+runs 200 ppm fast, compute the output samples" — flawlessly, at any ratio
+you ask. It does not answer "how fast is the input actually running right
+now?", and that is the question the two-crystal problem poses, because
+nothing in your system knows the answer. The true ratio is not written
+down anywhere; it exists only physically, in the beat between two
+oscillators, and it moves as the room warms up. In the comparison
+measurements the libraries were fed the exact ratio by an oracle — the
+harness knew the true offset because it had synthesized it — and under
+those conditions they measure at the format ceilings: −143.5 dB THD+N
+through a 24-bit interface for libsamplerate's `sinc_best`, −143.8 dB for
+soxr's `VHQ`. Real numbers, and also unobtainable in the field as stated,
+because the oracle does not ship. (Near-unity is their easy regime, too:
+libsamplerate's published 97 dB worst case belongs to aggressive ratios,
+not this one.)
+
+The missing half has a name: clock recovery. Somebody must observe the two
+domains, estimate their ratio from evidence, and track it as it drifts — a
+control problem, not a signal-processing one. The Linux/JACK ecosystem
+shows what bolting that half on looks like: zita-ajbridge wraps a
+delay-locked loop around zita-resampler. Operating systems solve it too,
+invisibly — CoreAudio, WASAPI shared mode, and PipeWire all run ASRCs
+inside their engines — with unpublished quality and typically 5–20 ms of
+latency, fine for notification sounds and disqualifying for live
+monitoring.
+
+So the field, surveyed honestly: chips that solve the whole problem in
+proprietary silicon; libraries that solve the easy half in portable
+software at reference quality; system engines that solve the whole problem
+opaquely at whatever quality and latency they choose. What did not exist —
+and what this library is — is the whole problem solved in open, portable,
+embeddable software at measured quality: an AD1896-shaped architecture,
+polyphase FIR plus clock servo, that you can compile.
+
+## The specialization that pays for everything
+
+You cannot simply transcribe the AD1896 into C++ and expect it to fit on a
+microcontroller; the chips' generality is exactly the expensive part.
+SampleRateTap's founding decision is to refuse most of the problem the
+chips solve. It handles *only* the near-unity case: two domains at
+nominally the same rate, within ±1000 ppm by default. It will never
+convert 44.1 kHz to 48 kHz — the README lists this first among its
+limitations, and `docs/COMPARISON.md` is blunt that for genuine rate
+*conversion* you should put soxr or libsamplerate in the chain.
+
+Here is what the restriction buys. A general-ratio converter must be able
+to place output samples anywhere relative to input samples, at any
+spectral relationship between the rates — including downward conversions
+where the filter must also band-limit, and ratios that change which parts
+of its machinery dominate. In the near-unity regime none of that machinery
+earns its keep. When the ratio is 1 + ε with ε a few hundred parts per
+million, each output sample lands *almost exactly* on an input sample:
+just a hair early or late, by a fractional offset that creeps by ε per
+sample and wraps once every `1/ε` samples. The README's "How it works"
+section states the consequence in one phrase: the conversion degenerates
+into a **creeping fractional delay**. The datapath's job collapses to
+evaluating one interpolation at a slowly sliding fractional position — a
+48-tap dot product per output sample in the default configuration — plus a
+servo deciding how fast the position should creep. And because the two
+rates are spectrally indistinguishable, anti-imaging and anti-aliasing
+collapse into a single fixed filter design, flat to 20 kHz, done once in
+the constructor.
+
+The computational tables in `docs/COMPARISON.md` measure what that is
+worth. Against libsamplerate — the closest architectural analog, a
+streaming time-domain polyphase resampler — at the matched ~120 dB quality
+tier, SampleRateTap converts 2.9–3.6× more frames per second (mono/stereo;
+2.1× at 8 channels, where both engines amortize), while carrying half the
+algorithmic latency: 24 frames (0.50 ms) of filter group delay against 46
+frames (0.96 ms). At the ~140 dB tier the gap widens to 6.2× in throughput
+and to 40 frames against 143 in latency. That is the near-unity dividend,
+and the comparison document names its mechanism exactly: a 48-tap window
+with a creeping phase, instead of general-ratio machinery. On targets
+without floating-point hardware the dividend compounds — the Q15
+fixed-point datapath has no libsamplerate analog at all, and on a
+Pico-class Cortex-M33 the cheapest libsamplerate option costs about 9.8×
+what SampleRateTap's intended configuration does.
+
+The soxr rows teach a different lesson, and reading them honestly is a
+preview of the next chapter. At the ~120 dB tier soxr converts 32.4
+million stereo frames per second on the same host to SampleRateTap's 10.5
+million — soxr wins raw throughput, decisively, by processing in large
+SIMD-friendly internal batches. The latency column is the price: 556 to
+607 frames of algorithmic delay, 11.6 to 12.6 ms, rising to 777 frames
+(16.2 ms) at its highest quality tier. Those are fine numbers for batch
+conversion and impossible ones inside a 1–2 ms live-monitoring budget, and
+— as `docs/COMPARISON.md` puts it — there is no setting that buys soxr's
+throughput at SampleRateTap's latency. Throughput, latency, and quality
+are not independent virtues to be maximized; they are a budget to be
+allocated, and different tools have allocated it for different lives.
+
+One more number from the measured table completes the picture, because
+this book does not deal in free lunches. Fed by its own servo rather than
+an oracle, running causally at 1.5 ms of total design latency,
+SampleRateTap measures −132.1 dB THD+N against the oracle-fed libraries'
+−143.5 dB. The ~11 dB gap is the measured price of solving the *whole*
+problem — discovering the ratio from buffer occupancy in real time instead
+of being told it — and the comparison document presents it as exactly
+that. Eleven decibels, spent 132 dB below the signal, purchasing the half
+of the problem that was actually hard. The rest of this book is an account
+of how both numbers — the 132 and the 11 — were achieved, measured, and
+defended.
+
+## Watching the invisible
+
+Before the budgets, one more thing Chapter 1 owes you: a way to *see* the
+problem, because 200 ppm is below anything your ears will report until the
+FIFO finally gives way. The repository's first example,
+`examples/drifting_clocks.cpp`, exists for exactly this. It runs two real
+threads: a producer pushing a 997 Hz sine at a virtual 48 000.0 Hz, and a
+consumer pulling at 48 kHz plus 500 ppm, both paced with absolute
+`sleep_until` deadlines so the long-term rates are exact even though every
+individual wakeup jitters by operating-system amounts — far rougher timing
+than any real audio callback delivers. A status line prints the servo's
+state and its rate estimate as it converges toward the −500 ppm
+consumption deviation.
+
+Two of the example's own caveats are worth reading before you run it,
+because each is a preview of a later chapter. First, since scheduler
+jitter here is on the order of milliseconds, the demo configures a 20 ms
+FIFO setpoint rather than the library's 1 ms default — your first sighting
+of the latency budget bending to its environment, which is the next
+chapter's subject. Second, the converter observes the clocks only through
+whole 96-frame chunks, so its estimate of the ratio cannot firm up faster
+than the chunk-beat period `1/(ppm × chunkRate)` — about four seconds per
+beat cycle at 500 ppm — and the instantaneous estimate visibly wobbles at
+that beat, which is why the display shows a three-second moving average.
+The information available about two clocks is quantized by how coarsely
+you watch them exchange data; that observation will return as the entire
+justification for the servo's three-stage design.
+
+Run it and watch the state go `Filling`, then `Acquiring`, then `Locked`,
+and the ppm readout settle toward −500. Nothing about the audio would have
+told you any of this for the first minute — and that is the point. The
+drift is always there; the only choice is whether something in the system
+is measuring it.
+
+First, though, the budgets. Claims like "a 1–2 ms live-monitoring budget"
+and "120 dB transparency" have been used here as if self-evident. They are
+not. The next chapter derives each one — including why this library's
+quality target works out to a timing tolerance of about eight
+*picoseconds*.
+
+## Verify it yourself
+
+```sh
+# Two real threads, two clocks 500 ppm apart; watch the servo lock and
+# the ppm estimate converge:
+cmake -B build -DCMAKE_BUILD_TYPE=Release
+cmake --build build -j
+./build/examples/drifting_clocks
+
+# Reproduce the measured table — including the −34.7 dB naive-FIFO row
+# and the oracle-fed library ceilings. Needs numpy, matplotlib, and the
+# `samplerate` and `soxr` Python packages; the first cell builds the
+# C ABI shared library if missing:
+jupyter execute notebooks/asrc_comparison.ipynb
+
+# The computational head-to-head on your own host (requires the system
+# libsamplerate and soxr development packages, found via pkg-config):
+cmake -B build-cmp -DCMAKE_BUILD_TYPE=Release \
+ -DSRT_BUILD_BENCHMARKS=ON -DSRT_BUILD_COMPARE_BENCH=ON
+cmake --build build-cmp -j
+./build-cmp/bench/compare/srt_bench_compare
+```
+
+The comparison notebook pins SampleRateTap's own results with assertions,
+so a regression in the library makes the reproduction fail loudly. The
+numbers in this chapter are load-bearing, not decoration.
diff --git a/book/src/part1/asrc.md b/book/src/part1/asrc.md
new file mode 100644
index 0000000..45a676e
--- /dev/null
+++ b/book/src/part1/asrc.md
@@ -0,0 +1,270 @@
+# Composition: `asrc.hpp`
+
+> The whole is something beside the parts.
+>
+> — Aristotle, *Metaphysics*
+
+Every previous chapter built a component that is correct on its own terms.
+This chapter is about the file that has no terms of its own: `asrc.hpp`
+contains almost no algorithm, no mathematics, and fewer than three hundred
+lines that mostly call other files' code. It is also where the only serious
+bug in the library's history lived. Both facts have the same cause.
+Composition is where each component's assumptions meet every other
+component's guarantees, and the gaps between them are invisible from inside
+any single file.
+
+The cast, assembled: a `PolyphaseFilterBank` designed at construction, a
+`FractionalResampler` that owns the history and the phase, a `SpscRing`
+carrying interleaved frames between the two clock domains, and a `PiServo`
+turning ring occupancy into a rate estimate. `BasicAsyncSampleRateConverter`
+wires them together and adds the four things none of them could own alone:
+a lifecycle state machine, an under/overrun policy, telemetry, and
+validation.
+
+
+
+*The whole machine on one page. The ring is the only structure both clock
+domains touch; everything downstream of it — servo, resampler, and both
+their states — lives on the consumer's side, which is why `pull()` carries
+all the policy and `push()` is eight lines.*
+
+## The two-agent shape
+
+The public surface is two functions and a contract:
+
+- `push(interleaved, frames)` — called by exactly one producer agent, at
+ the input clock's pace.
+- `pull(interleaved, frames)` — called by exactly one consumer agent, at
+ the output clock's pace.
+
+"Agent" rather than "thread" is deliberate. On a workstation the two agents
+are threads; on the dual-core RP2350 firmware they are two processor cores;
+in the deterministic test simulator they are interleaved events on one
+thread. The converter never creates a thread, never names a thread, and
+never synchronizes beyond what the ring already provides — it is a passive
+object that two callers animate. This is why the library contains no
+`std::thread`, no executor, and no callback registration: the moment a
+library owns threads it owns scheduling policy, priorities, and shutdown
+order, all of which belong to the application. The cost of this design is a
+sharp, documented affinity contract (push is producer-only, pull is
+consumer-only, `resetFromConsumer` is consumer-only); the C-ABI header
+restates it because FFI callers can't read C++ doc comments.
+
+`push()` is eight lines and nearly trivial — clip to free space, write,
+count an overrun if clipped. All composition complexity lives on the
+consumer side, and that too is a decision: the producer is often an
+interrupt-context audio callback with the tightest budget in the system, so
+every gram of policy was moved to the puller.
+
+## The state machine
+
+`pull()` runs a three-state lifecycle — Filling, then a servo that is
+Acquiring or Locked — plus two exceptional transitions. Here is the filling
+and resync machinery as it ships:
+
+```cpp
+{{#include ../../../include/srt/asrc.hpp:asrc_filling}}
+```
+
+```cpp
+{{#include ../../../include/srt/asrc.hpp:asrc_resync}}
+```
+
+Filling exists because the resampler cannot produce its first output until
+a full window of `taps()` history frames exists, and the servo cannot
+regulate an occupancy that is still climbing toward its setpoint. So the
+converter emits silence until the backlog reaches `setpoint + taps`, primes
+the resampler's window in one gulp, seeds the servo's smoothers at the
+observed occupancy (so the loop starts from truth rather than slewing from
+zero), and begins converting — with a fade-in, discussed below.
+
+The two exceptional transitions are the under/overrun policy, and their
+asymmetry rewards attention. **Underrun** (the consumer outran the data):
+pad the rest of the block with silence, count it, return to Filling — but
+call `servo_.reset(true)`, the flavor that *keeps the integrator*. The ppm
+estimate is the accumulated knowledge of where the other crystal sits; a
+dropout interrupts the audio, not the physics, so the estimate survives and
+re-lock after a dropout takes a fraction of the original acquisition time.
+**Overrun pressure** (the consumer stalled long enough for occupancy to
+pass the high watermark): discard down to the setpoint in one cut, count a
+resync, and re-seed the smoothers — because after a deliberate
+discontinuity in the observable, letting the loop "discover" the jump would
+inject exactly the transient the seed avoids. One subtlety in the resync
+was wrong for months: the discard must be clamped to what the *ring*
+actually holds, because the occupancy figure includes frames already staged
+inside the resampler's pop scratch, which no ring discard can reach. With a
+setpoint smaller than that staging buffer, the unclamped subtraction
+drained the ring to zero and the converter fell into a refill-underrun
+cascade. An audit found it; a regression test now pins it.
+
+The fade-in deserves its sentence of honesty, which the header also
+carries: after every (re)fill the first 64 frames ramp linearly from
+silence, so *recovery* never clicks — but the dropout's onset, and a
+resync's splice, are unfaded cuts, because at the moment they happen there
+is nothing valid to fade toward. A design can only be honest about which
+discontinuities it removes.
+
+## The bug that composition hid
+
+Now the centerpiece, and the reason this chapter exists in its current
+form.
+
+Every component below this file was correct. The ring transferred bytes
+exactly; the servo regulated occupancy to its setpoint with textbook
+dynamics; the resampler synthesized precisely the frames asked of it. And
+for months, a converter built from these correct parts, at default
+configuration, was **silently broken for the most common audio callback
+size in the world.**
+
+The mechanism is embarrassingly simple once stated. A `pull(N)` must
+synthesize N frames from data *already in the backlog* — in a real
+deployment, no pushes land during the microseconds a pull executes. The
+servo, meanwhile, faithfully regulates the backlog toward
+`targetLatencyFrames`, which defaults to 48. If N is greater than 48, the
+servo's goal and the consumer's need are in direct contradiction: the loop
+steers occupancy *down* toward a level from which the next pull cannot be
+served. Occupancy drains at the rate clamp, hits the floor, underruns,
+refills, fades in — and repeats, forever. Measured at default
+configuration: a 64-frame callback drops out every ~0.24 seconds
+indefinitely, never reaching Locked, with the reported ppm pegged at a
+false +1500 (the clamp, mistaken for the answer). A 240-frame callback
+produced 80% silence.
+
+
+
+*Both panels are measurements, not models: `scripts/book_figures.py`
+compiles the same trace dumper against the include/ tree of the last
+pre-fix commit (via `git archive`) and against HEAD, and runs the
+identical scenario. Before: drain, underrun, refill — four dropouts a
+second, forever. After: one adaptive raise on the first pull, then the
+servo regulates the effective setpoint and the underrun count stays at
+zero.*
+
+Why didn't anything catch it? Because every artifact that exercised the
+converter had, innocently, been configured just clear of the cliff. The
+quality tests pull one frame at a time — the metrologically correct choice
+for their purpose. The benchmarks set the setpoint to twice the block size —
+the performance-measurement-correct choice. The lock tests used 32-frame
+blocks against the 48-frame default — feasible. Correct component tests,
+correct measurement configurations, months of green CI, and a defaults
+matrix with a hole exactly where real applications live. The lesson
+generalizes and is worth stating as a rule: **a test suite validates the
+configurations it contains, and silence about a configuration is not
+evidence about it.** It took an adversarial audit — one explicitly tasked
+with constructing failure scenarios rather than confirming passing ones —
+to demonstrate it.
+
+The fix is the first thing `pull()` now does:
+
+```cpp
+{{#include ../../../include/srt/asrc.hpp:asrc_feasibility}}
+```
+
+The design choices inside those lines carry the interesting reasoning:
+
+- **Adapt rather than reject.** The constructor cannot validate this —
+ the pull size isn't known until the first pull. Throwing from `pull()`
+ is forbidden by the noexcept contract, and returning an error the caller
+ must check is how the original silent failure happened, one layer up.
+ So the converter raises its *effective* setpoint to what the observed
+ block requires and reports the raise through
+ `Status::effectiveTargetLatencyFrames`. Latency follows the raised
+ setpoint: the honest price, visibly labeled, instead of a dropout cycle.
+- **The margin is a half block.** Feasibility strictly needs
+ `setpoint ≥ N`; equality grazes, because block-quantized occupancy
+ sawtooths around the setpoint. The audit's data located the boundary
+ (pull = setpoint showed occasional underruns; pull comfortably below the
+ setpoint was clean), and `N/2` covers the sawtooth with room.
+- **The raise is bounded by capacity**, computed once in the constructor —
+ a setpoint the FIFO cannot sustain would just move the failure. The
+ auto-sized FIFO's floor was raised to 1024 frames (21 ms of stereo float
+ costs 8 KB — memory is the cheap resource here) so that callbacks up to
+ roughly 340 frames work with zero configuration; beyond that, the
+ documentation now says plainly: size `fifoFrames` yourself.
+- **Feasible configurations are untouched.** The 32-frame-against-48
+ default keeps its exact behavior — verified not just by tests but by the
+ instruction-count ratchet: every scenario on every embedded target
+ measured within ±0.07% across the change, which is construction-cost
+ noise. The adaptation is invisible until the moment it is needed.
+
+The audit's failing scenarios became the regression suite
+(`Feasibility.Pull64LocksCleanly` and siblings), so the bug's exact shape
+is now permanently load-bearing.
+
+## Validation: what the constructor refuses to build
+
+The same audit rewrote `validated()`, and the before/after is a compact
+study in what config validation is *for*. The original checked three
+fields for zero. The current version rejects, with reasons recorded in a
+comment: NaN or infinity anywhere in the numeric config (a NaN sample rate
+previously flowed into the filter designer and constructed a converter
+that emitted NaN audio — construction succeeding is worse than throwing
+when what it constructs is poison); band-edge sums above the sample rate
+(an anti-image filter whose cutoff exceeds input Nyquist passes images
+wholesale — numerically fine, acoustically wrong); a deviation clamp large
+enough that the Q0.64 conversion in the resampler would overflow an
+`int64` (undefined behavior guarded at the only gate that sees the value
+early enough); and size products that would wrap 32-bit `size_t` on the
+embedded targets before `bad_alloc` could save anyone. The principle:
+**validate at the boundary where throwing is allowed, against the
+invariants of every component downstream** — the resampler can't defend
+itself against a config it never sees whole.
+
+One postscript from the portability chapter belongs here too: on one
+supported toolchain (Hexagon's static-musl configuration), C++ exceptions
+cannot unwind at all, so even this careful `throw` terminates the process
+there. Validation still protects — a loud death beats NaN audio — but
+callers on that target are documented to validate before constructing.
+Contracts end where toolchains do.
+
+## Telemetry that cannot lie about being lock-free
+
+`status()` may be called from any thread, which makes it the one place a
+third agent touches the object. Every field crosses via a relaxed atomic,
+single-writer, individually coherent but deliberately not mutually so — a
+snapshot for humans and supervisory logic, not a synchronization
+primitive. The type choices encode a portability fact worth remembering:
+the counters are 32-bit atomics because on the 32-bit targets a 64-bit
+`std::atomic` falls back to lock-based emulation, and a converter whose
+*telemetry* takes a lock has quietly broken the lock-free promise its hot
+path makes. The counters wrap at 2³²; the doc comment says so and says
+what to do about it. Precision was traded for the contract, and the trade
+is written down.
+
+## The underrun tail, end to end
+
+```cpp
+{{#include ../../../include/srt/asrc.hpp:asrc_underrun}}
+```
+
+Read this excerpt slowly and you can see the whole chapter in ten lines:
+the resampler asked to do exactly one job; the fade applied only when
+there is something real to fade; the silence pad honoring `pull()`'s
+always-fills guarantee; the integrator-preserving reset encoding what a
+dropout does and does not destroy; the telemetry publish last, so
+observers see states, not mid-transition fictions.
+
+## Verify it yourself
+
+```sh
+# The composed state machine, end to end:
+ctest --test-dir build -R 'AsrcLock' --output-on-failure
+
+# The feasibility bug's exact former shape, now a regression gate:
+ctest --test-dir build -R 'Feasibility' --output-on-failure
+
+# What the constructor refuses to build (NaN, image-passing bands,
+# UB-range ppm, undersized FIFOs):
+ctest --test-dir build -R 'ConfigValidation' --output-on-failure
+
+# Resync clamping, consumer reset, fade behavior, degenerate calls:
+ctest --test-dir build -R 'Resync|Reset|Fade|EdgeCalls' --output-on-failure
+```
+
+And one experiment worth running because it *shows you the bug*: check out
+any commit before the feasibility fix, build the lock test with
+`chunkOut = 64`, and watch a fully green library drop audio four times a
+second. Correct parts. Broken whole. That gap is what this file is for.
diff --git a/book/src/part1/fractional-resampler.md b/book/src/part1/fractional-resampler.md
new file mode 100644
index 0000000..a8bf3ce
--- /dev/null
+++ b/book/src/part1/fractional-resampler.md
@@ -0,0 +1,460 @@
+# The fractional resampler
+
+> God made the integers; all else is the work of man.
+>
+> — Leopold Kronecker
+
+The servo chapter ended with a number: ε̂, the rate-deviation estimate,
+delivered once per output block. This chapter spends it.
+
+Somebody has to turn "consume 1.000 000 2 input frames per output frame"
+into actual audio, forever, without drift, without glitches at the moments
+the books balance, and within a per-sample cycle budget that must hold on
+a Xeon and on a DSP with no double-precision FPU. That somebody is
+`FractionalResampler`, the streaming engine at the bottom of
+`polyphase_filter.hpp`. It owns three things: the **history** (the last T
+input frames of every channel, kept where the filter can reach them), the
+**phase** (where between two input samples the next output lands), and the
+**slip logic** (what happens when the phase creeps across a whole-sample
+boundary).
+
+The near-unity specialization shapes everything here. A general-ratio
+resampler schedules different numbers of outputs per input and needs
+control flow to match. At ±1000 ppm, the conversion degenerates into a
+*creeping fractional delay*: one output per input, plus a fractional
+position μ that drifts by parts per million per sample and occasionally —
+every few thousand samples — crosses a boundary and forces the window to
+slip by one frame. The steady state is metronomic; all the difficulty
+concentrates into keeping μ exact over unbounded time and making the
+slips invisible. Those two problems are this chapter.
+
+## The job, one output sample at a time
+
+The polyphase bank chapter built the table: L + 1 rows of T coefficients,
+row p holding the FIR that interpolates a signal value p/L of the way
+between two input samples. `interpolate()` evaluates one output at
+fractional position μ ∈ [0, 1):
+
+1. Scale: `pos = μ · L`. The integer part picks the phase row p; the
+ fractional part `fr` says how far μ sits between row p and row p+1.
+2. Blend: form `c[t] = c0[t] + fr · (c1[t] − c0[t])` across the T taps —
+ linear interpolation between adjacent rows, the trick that makes a
+ 256-row table act like a continuum (the residual falls ~12 dB per
+ doubling of L).
+3. Dot: multiply the blended row against the oldest-first history window
+ of the newest T input samples and accumulate — in double for float
+ samples, int64 for fixed point.
+
+μ = 0 lands the output exactly on history sample T/2 − 1; μ → 1
+approaches sample T/2. And the μ wrap 1.0 → 0.0 — the whole-sample slip —
+is exactly where the bank's extra row L pays off: row L equals row 0
+advanced by one input sample, so "μ reaches 1.0 on this window" and
+"μ = 0.0 on the window shifted one frame" are *the same filter*,
+bit-identically, with no branch. The slip machinery below leans on that
+continuity; `Polyphase.MuWrapIsContinuousWithWindowShift` pins it.
+
+That is the whole kernel: blend, then dot. Roughly T multiply-adds of
+blending plus T of dot product per output sample, and everything else in
+this chapter is about doing it cheaper, more exactly, and for more
+channels — without ever changing an output bit unintentionally.
+
+## Sharing the blend: the C1 split
+
+The first result of the optimization campaign (Part III tells the full
+story; `docs/PERFORMANCE.md` is the canonical record) started from an
+observation you can make by reading the loop above: in a multichannel
+converter, every channel of a frame is evaluated at the *same* μ. Calling
+the fused `interpolate()` per channel recomputes an identical T-tap
+coefficient blend once per channel — N times per frame for N channels, so
+for stereo half of all blending work is duplicate.
+
+The fix is to split the kernel at its natural seam: blend once per frame
+into a scratch row, then run a plain dot product per channel.
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:rs_dot_row}}
+```
+
+Two things about this function beyond its arithmetic. First, the comment
+at the top is a *bit-exactness contract*: given the same μ, blend-then-mac
+per tap in the same order is literally the same sequence of floating-point
+(or integer) operations as the fused form, so the split changes no output
+bit — and the C1 entry in `docs/PERFORMANCE.md` records "outputs unchanged
+bit-for-bit" as a checked result, not a hope. This library treats
+bit-exactness as the boundary between an optimization (free to ship) and
+an algorithm change (needs its own quality evidence); you will see the
+same distinction drawn twice more in this chapter. Second, the
+`SRT_RESTRICT` qualifiers are C2's contribution: without them the
+compiler versioned these loops behind runtime aliasing checks (verified
+with `-fopt-info-vec`, not assumed).
+
+The measured C1 result: **stereo pipeline −36% wall-clock on x86,
+8-channel −52%**, and −15/−30/−21% instructions (float/Q15/Q31) on the
+Cortex-M55 — with the mono kernels count-identical as the control, since
+mono keeps the fused path. One target barely moved, though: Hexagon
+improved only −3.6/−3.3/−0.2%. Profiling explained why, and the
+explanation became the next hypothesis: Hexagon's pipelines were not
+dominated by blends or dots at all, but by **per-sample soft-double phase
+math**. Which brings us to the centerpiece.
+
+## The phase accumulator: Q0.64
+
+Here is the failure that motivates the design. The obvious phase state is
+a `double mu`, updated per output sample as `mu += 1 + eps` with the
+integer part peeled off into window advances. On a Xeon that costs a few
+cheap FPU ops. On Hexagon — a 32-bit audio DSP with **no double-precision
+FPU** — every one of those operations is a soft-float library call, per
+sample, on the hottest path in the library. C1's flat Hexagon numbers
+were this cost dominating everything else. (Honest correction from the
+record, because the project's documentation initially got it wrong: the
+Cortex-M55 was *assumed* to share this problem, but its scalar FPU does
+support FP64 — only its MVE vector unit is fp16/fp32 — so M55 float was
+never soft-double-bound. The measurement that exposed the doc error is
+Part III material; the resampler design below is motivated by Hexagon and
+its HiFi-class cousins, where the problem is real.)
+
+The C3 redesign eliminates the per-sample double entirely by changing
+what the phase *is*:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:rs_class_doc}}
+```
+
+The fractional position lives in `phase_`, an unsigned 64-bit integer
+read as a pure binary fraction — **Q0.64**: the value μ = `phase_` / 2⁶⁴,
+so the representable range is exactly [0, 1) and the resolution is 2⁻⁶⁴
+of a sample. The key move is what it accumulates: **only ε**, the
+deviation. The "1" in "advance 1 + ε input frames per output frame" is
+handled by the integer machinery — consume one input frame per output
+frame — and never touches the fraction. Near-unity specialization again:
+because the nominal ratio is exactly 1, the fraction only has to carry
+the few-hundred-ppm creep, and 64 bits of headroom below the binary point
+carry it essentially forever.
+
+Per `process()` call — once per block, not per sample — the servo's
+double ε̂ is converted to fixed point:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:rs_slip}}
+```
+
+Walk the slip logic carefully; it is the subtlest six lines in the
+datapath, and the trick is that **wraparound of the unsigned add is the
+slip detector**, for both signs of ε, with no comparisons against 1.0 or
+0.0 anywhere:
+
+- **ε ≥ 0** (input clock fast; the window must occasionally hurry). The
+ fraction creeps upward by `epsU` each sample. When the true position
+ would cross 1.0, the 64-bit add wraps: `m = phase_ + epsU` comes out
+ *smaller* than `phase_`, which is otherwise impossible for a positive
+ increment. That wrap **is** the forward slip: consume one *extra* input
+ frame (`advance = 2` — the regular frame plus the slipped one), and the
+ wrapped `m` is already the correct new fraction, because mod-2⁶⁴
+ arithmetic subtracted exactly the 1.0 that the extra frame consumed.
+- **ε < 0** (input clock slow; the window must occasionally wait).
+ `epsU` is the two's-complement reinterpretation of a negative `epsFix`
+ — a huge unsigned number — so the same add normally wraps every
+ sample, and *not* wrapping is the anomaly: `m > phase_` means the
+ fraction dipped below 0.0. That is the backward slip: consume **no**
+ input frame this output (`advance = 0`, reuse the current window), and
+ again the modular result is already the correct fraction just below
+ 1.0.
+- Otherwise `advance = 1`: the metronomic case.
+
+
+
+*The slip logic run with the real mod-2⁶⁴ arithmetic, ε exaggerated to
+0.09 so the wraps are visible (at the real |ε| ≈ 2×10⁻⁴ a slip fires once
+every few thousand frames). Left: the fraction creeps upward until the add
+wraps past 1.0 — consume one extra frame. Right: with ε negative the add
+wraps on *every* ordinary frame, and the anomaly is the one that doesn't —
+reuse the window. From `scripts/book_figures.py`.*
+
+At +500 ppm a forward slip fires every 2 000 output samples, and thanks
+to the bank's extra row the filter evaluated after `advance = 2` at small
+μ is the exact continuation of the filter before it at μ ≈ 1.
+`AsrcLock.WholeSampleSlipsAreGlitchFree` runs 500 ppm for seconds and
+bounds the output's *second difference* by the analytic bound A·ω² of a
+clean sine — a discontinuity detector that would trip on any window
+mis-step at any slip.
+
+Note also what happens between the `appendOne` calls and `phase_ = m`:
+if the source runs dry midway through an `advance = 2` slip, the function
+returns with the history advanced by one frame but the phase *not*
+updated. History and phase are now one frame apart — a state the class
+cannot repair locally. That is not a bug; it is a documented precondition
+(the contract section below), and the converter's dropout path always
+resets and re-primes before processing again.
+
+Downstream, the phase bits feed the kernel directly:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:rs_blend_row_phase}}
+```
+
+The top log₂ L bits *are* the phase-row index; the bits below, shifted
+up, *are* the intra-phase blend fraction. No multiply by L, no floor, no
+subtract — the Q0.64 representation makes the split between "which row"
+and "how far between rows" a matter of bit fields. One conversion to the
+datapath's blend-factor type per output frame (`blendFactorFromQ64`:
+single-precision for float, integer for Q15/Q31) is all that remains of
+the floating-point phase math. The fused mono form is the same bit
+surgery around the same blend-and-mac loop:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:rs_interpolate_phase}}
+```
+
+**Is 2⁻⁶⁴ enough?** Part 0 derived the timing-jitter budget for 120 dB
+transparency at 20 kHz: about 8 picoseconds. One sample at 48 kHz is
+20.8 µs; 2⁻⁶⁴ of that is ~10⁻²⁴ seconds — twelve orders of magnitude
+inside the budget. The double-μ design's 2⁻⁵² was also far inside it, so
+resolution was never the emergency; the deeper numerical win is
+*exactness over time*. An integer accumulator adds ε with **zero
+rounding error per step**, forever — the only quantization is the
+once-per-block conversion of ε̂, a rate error below 10⁻¹⁹ that the servo
+absorbs like any other infinitesimal drift. A double μ, by contrast,
+rounds on every `+=` and carries the fraction with absolute precision
+limited by its integer part's magnitude. Measured, from the C3 entry:
+quality *improved* to **135.0 dB at 997 Hz** when the integer phase
+landed. An optimization PR whose quality guardrail moved the right
+direction — the A/B discipline (benchmarks for speed, pinned SNR
+thresholds for correctness) catching a pleasant surprise instead of a
+regression.
+
+And the cost side, from the same entry: Hexagon pipelines **−10.3% (Q15)
+and −15.5% (Q31)**, with float −2.6% — the soft-double phase math C1
+identified was simply gone, and the Hexagon *kernels* stayed
+count-identical as the control. M55: Q15 −5.3%, Q31 −4.6%, float +1.4% —
+a genuine, accepted regression on one scenario, because the M55's scalar
+FP64 hardware made doubles cheap and the integer phase traded them for
+int64 ops; the cross-target win justified it, and the ratchet baseline
+records the trade explicitly. x86 same-minute A/B: float −5.4%, Q15
+−12.0%.
+
+## Dispatching the datapath
+
+With phase in hand, each output frame takes one of three routes:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:rs_dispatch}}
+```
+
+Mono takes the fused `interpolatePhase` — no scratch-row traffic for a
+single channel (with one exception: Q15 on SMLALD-capable Cortex-M cores
+routes mono through blend + dot too, because the dual-MAC loop lives in
+`dotRow`; the two paths are bit-exact by construction, which is what
+makes that rerouting a non-event). Low channel counts blend once into
+`row_` and dot per channel over planar histories — the C1 shape. High
+channel counts on hosts take the frame-major branch, which is the next
+section but one. Note the branch condition `kChannelParallel &&
+frameMajor_`: the first operand is `constexpr`, so on embedded targets
+the entire branch constant-folds away. That is not tidiness — a runtime
+flag in this loop measured **+6–8%** on the M55 instruction ratchet
+before the compile-time gate restored every embedded scenario to exactly
+0.00%. The ratchet is why the lesson is a number and not an anecdote.
+
+## Feeding the window: history management
+
+The filter needs the newest T frames of every channel, contiguous,
+oldest-first, per channel. Input arrives interleaved, in whatever chunks
+the FIFO happens to hold. Between those two facts sits `appendOne`:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:rs_append}}
+```
+
+Three mechanisms, each with an RT-safety argument:
+
+**Chunked staging.** Frames are pulled from the caller-supplied `popFn`
+in bulk (the converter passes 16-frame chunks) into the interleaved
+`scratch_` buffer, then peeled off one frame at a time as the window
+advances. Bulk pops amortize the ring's index synchronization across
+many frames — the cached-index design from two chapters ago does its
+best work when you ask it for blocks — while the resampler still
+consumes with single-frame granularity, because slips need exactly-one
+extra frame on demand. Frames staged in scratch have left the ring but
+not yet entered the filter, which is why `bufferedFrames()` exists: the
+servo's occupancy observable must count them or the estimate would carry
+a chunk-sized bias.
+
+**Bounded compaction.** Histories are not ring buffers; they are flat
+arrays with a moving end index, sized `taps + chunkFrames`. When the end
+hits capacity, `memmove` slides the newest T − 1 frames back to the
+front and synthesis continues. Why copy at all, when a circular buffer
+would avoid it? Because the *filter* needs a contiguous window every
+sample: a ring would either split the dot product at the wrap seam
+(a branch and a second loop in the hottest code in the library) or copy
+into a linear scratch every frame — a memmove per *sample* instead of
+one per *chunk*. The flat layout pays T − 1 frames of copy once per
+`chunkFrames` appends: bounded, branch-predictable, allocation-free —
+worst-case cost is fixed at construction time, which is the entire
+definition of RT-safe this library uses. `process()` is `noexcept`, takes
+no locks, and allocates nothing; every buffer was sized in the constructor,
+which is allowed to throw precisely because it runs at setup time.
+
+**Two storage shapes.** The member block records the fork:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:rs_members}}
+```
+
+Planar — one delay line per channel — below the channel-parallel
+threshold: each channel's dot product walks its own contiguous line, and
+the deinterleave happens once per frame at append time (a scalar loop
+over channels). Frame-major — a single interleaved line — at or above
+it: appends become one contiguous `memcpy` per frame and the compaction
+one `memmove` per line-fill, but the real reason for the layout is the
+kernel it enables.
+
+## The channel axis: C6, briefly
+
+For high channel counts the per-frame cost is dominated by N dot
+products, and the float dot product has a vectorization problem you can
+now state precisely: its accumulation order is contractual (strict
+per-channel double accumulation — reassociating it changes output bits),
+so the *tap axis* may not be vectorized without breaking bit-exactness.
+The C2 audit verified GCC obeys: float `dotRow` compiles scalar, by
+design.
+
+But nobody said anything about the *channel* axis. Channels are
+independent accumulators; computing eight of them in lockstep, one tap
+at a time, keeps every channel's tap order identical to `dotRow`'s while
+filling SIMD lanes with channels instead of taps. That requires the
+history to deliver all channels of tap t contiguously — the frame-major
+layout — and a register-blocked kernel:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:rs_dot_rows_frame_major}}
+```
+
+The measured C6 results, condensed (the full campaign, including the
+callgrind profile that justified targeting the dots and the negative
+results that bounded the design, is Part III's last chapter): **float
+8/12/16-channel pipelines −38/−38/−42% wall-clock with AVX2+FMA**, only
+−4–5% on baseline SSE2 — the gain scales with SIMD width, as it must if
+the mechanism is what we claim. Bit-exact versus planar, hash-verified
+over 30 000 blocks × 4 configs. The gate is deliberately narrow, each
+edge measured rather than assumed:
+
+- **Float-only**: fixed-point channel-parallel measured ~1.5× *slower*
+ than planar — integer accumulation is exactly reassociable, so the
+ planar Q15/Q31 dots already auto-vectorize over taps, and the tap axis
+ beats the channel axis when both are available.
+- **Channels ≥ 4** (`SRT_CP_MIN_CHANNELS`, overridable for A/B runs):
+ below that, lane utilization loses to the planar path's simplicity.
+- **Hosts only**: the embedded targets keep their proven codegen (Helium
+ on M55, SMLALD on M33-class, Hexagon's measured scalar floor); the
+ compile-time macro gate keeps their binaries byte-for-byte ignorant of
+ the mode.
+
+And one lesson worth carrying out of context: the first channel-parallel
+attempt — accumulators in a plain array the compiler kept in memory —
+measured **2.8× slower than planar**. Register-block or don't bother;
+`dotTileFrameMajor`'s `constexpr`-size tiles of 8/4/2/1 are that lesson
+in code form.
+
+## The contract: prime, process, and the one-frame lie
+
+`FractionalResampler` is deliberately not foolproof; it is *fast*, and
+its safety is a documented protocol that the converter — its only
+in-tree caller — upholds. The documentation is the code's own:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:rs_process_doc}}
+```
+
+**Prime before process.** `prime()` fills the window with T real frames
+(or reports dry and stays unprimed). Call `process()` unprimed and
+`window()`'s pointer arithmetic `end_ − taps()` underflows a `size_t` —
+the converter guarantees priming by construction, since it only leaves
+its Filling state once the backlog exceeds setpoint + taps.
+
+**Reset after any dry return.** You now know exactly why from the slip
+walk-through: a `process()` that runs dry on the *second* append of an
+`advance = 2` forward slip has already advanced the history when it
+returns, but never executed `phase_ = m`. History says one frame passed;
+phase says none did. Every output synthesized after resuming would be
+computed one frame late relative to its nominal position — not a crash,
+a *silent sub-window skew*. The class cannot un-append (the frame is
+deinterleaved into the histories) and does not try to special-case it;
+it defines the recovery protocol instead: `reset()` clears phase,
+history, and staged scratch (stale across a discontinuity anyway), then
+re-prime. The converter's underrun path does exactly this, with the
+servo keeping its ppm estimate and a fade-in masking the splice.
+
+Finally, the small read-side API that closes the control loop:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:rs_mu}}
+```
+
+`mu()` converts the phase to double **once per pull, not per sample** —
+the block-rate boundary where doubles are cheap even on Hexagon, the
+same boundary the ε̂ conversion crosses in the other direction. The
+servo adds it to the frame count so the observable `occ + mu` moves
+*continuously* through slips: at the instant a forward slip fires, the
+count drops by one exactly as μ wraps from ~1 to ~0, and the sum crosses
+smoothly. Without μ in the observable, every slip would inject a
+one-frame staircase into the servo's error at the beat frequency —
+manufacturing the very sawtooth the previous chapter spent three filter
+poles suppressing. `bufferedFrames()` completes the accounting for the
+staged scratch. Two accessors, and the sensor the whole control system
+reads is honest to sub-sample resolution.
+
+## Why this file looks the way it does
+
+| Decision | Alternative rejected | Reason |
+|---|---|---|
+| Q0.64 integer phase, ε-only | `double mu += 1 + eps` per sample | soft-double per sample dominated Hexagon pipelines (C1 finding); integer add is exact forever; measured −10/−15% Hexagon, quality up to 135.0 dB |
+| Slips by unsigned wraparound | compare/floor against 1.0 and 0.0 | the mod-2⁶⁴ result *is* the corrected fraction; both slip directions fall out of one add |
+| Blend once per frame + per-channel dot | fused interpolate per channel | N×(blend+dot) → blend + N×dot; bit-exact by identical per-tap order; stereo −36% wall-clock (C1) |
+| Flat history + bounded memmove compaction | circular history | the dot needs a contiguous window every sample; one bounded copy per chunk beats a seam branch per sample |
+| Chunked popFn staging | pop one frame at a time | amortizes ring synchronization; staged frames stay visible to the servo via `bufferedFrames()` |
+| Frame-major + channel-parallel dots (float, ≥4ch, hosts) | vectorize the float tap axis | tap-axis SIMD changes accumulation order = output bits; the channel axis is free and bit-exact (−38…−42% at 8–16ch) |
+| Compile-time mode gate | runtime `if (frameMajor_)` alone | a hot-loop runtime flag cost +6–8% M55 instructions; `constexpr` restored embedded codegen to 0.00% |
+| Documented preconditions + `reset()` | internal auto-repair of dry slips | the failure needs a reprime anyway (stale window); a repair path would be untestable dead weight on the hot path |
+
+## Verify it yourself
+
+```sh
+# Quality with the Q0.64 phase in the loop — the pinned thresholds
+# include the 135 dB figure C3 improved:
+ctest --test-dir build -R 'AsrcQuality\.' --output-on-failure
+
+# Slip continuity: the second-difference bound at +500 ppm (a slip
+# every 2000 samples), plus lock/drift behavior:
+ctest --test-dir build -R 'AsrcLock\.' --output-on-failure
+
+# The mu-wrap/extra-row continuity the slips depend on:
+ctest --test-dir build -R 'Polyphase\.' --output-on-failure
+
+# Channel independence at 12/16 channels — on a host float build this
+# exercises the frame-major channel-parallel path:
+ctest --test-dir build -R 'MultiChannel' --output-on-failure
+
+# A/B the channel axis yourself: benchmark, then rebuild with the
+# threshold pushed out of reach and benchmark again (use -march=native
+# to see the AVX2 headline; SSE2 shows a few percent):
+cmake -B build-bench -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_BENCHMARKS=ON \
+ -DCMAKE_CXX_FLAGS="-march=native"
+cmake --build build-bench -j && \
+ ./build-bench/bench/srt_bench --benchmark_filter='Pipeline_Float.*(8|12|16)ch'
+cmake -B build-planar -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_BENCHMARKS=ON \
+ -DCMAKE_CXX_FLAGS="-march=native -DSRT_CP_MIN_CHANNELS=999"
+cmake --build build-planar -j && \
+ ./build-planar/bench/srt_bench --benchmark_filter='Pipeline_Float.*(8|12|16)ch'
+
+# Break it on purpose: change `advance = 2` to `advance = 1` in the
+# forward-wrap branch of process(), rebuild, and watch
+# AsrcLock.WholeSampleSlipsAreGlitchFree fail its second-difference
+# bound — every slip becomes an audible one-frame stutter.
+```
+
+The last experiment is worth actually running once. The slip logic is
+six quiet lines that look like integer bookkeeping; breaking them turns
+a 135 dB converter into a machine that clicks every forty-two
+milliseconds. That gap — between how little the code looks like it is
+doing and how much the measurements say it is — is the fractional
+resampler in one sentence.
diff --git a/book/src/part1/kaiser.md b/book/src/part1/kaiser.md
new file mode 100644
index 0000000..be64433
--- /dev/null
+++ b/book/src/part1/kaiser.md
@@ -0,0 +1,408 @@
+# Designing the filter: `kaiser.hpp`
+
+> The purpose of computing is insight, not numbers.
+>
+> — Richard Hamming
+
+This is the only file in the library that runs exactly once per converter,
+and it decides the quality ceiling of everything downstream. Every output
+sample the converter will ever produce is a dot product against
+coefficients this file computes in a few milliseconds at construction. If
+the design here reaches 120 dB of image rejection, no later cleverness is
+needed to preserve it — the hot path is exact integer or double
+accumulation all the way out. If the design here falls short, no later
+cleverness can recover it. So before touching the code, this chapter builds
+the minimum filter-design theory a C++ reader actually needs — which is
+less than a DSP course and different from one — and then spends its pages
+where the textbooks stop: on the iteration cap, the clamp, the
+normalization constant, and the compile-time-versus-runtime decision that
+the textbooks never had to make.
+
+## The problem: evaluate a signal between its samples
+
+The converter's core operation (next chapter) is a *fractional delay*: given
+the last `T` input samples of a signal, produce its value at a position μ
+that falls between two of them. Sampling theory says this is not guesswork.
+A signal sampled at rate `fs` with no content above `fs/2` is *completely
+determined* between its samples; the reconstruction is
+
+```text
+x(t) = Σₙ x[n] · sinc(t − n), sinc(u) = sin(πu) / (πu)
+```
+
+— every sample contributes a sinc centered on itself, and the sum
+interpolates exactly. The `sinc` in this file is that function, with the
+one hazard a numeric programmer would expect handled explicitly:
+
+```cpp
+{{#include ../../../include/srt/detail/kaiser.hpp:kai_sinc}}
+```
+
+(The 0/0 at x = 0 is a *removable* singularity — the limit is 1 — but IEEE
+arithmetic doesn't take limits, so the code must.)
+
+The catch is the `Σₙ`: it runs over **all** samples, and sinc decays like
+1/t, which is uselessly slow. Truncating the sum to a window of `T` samples
+around the evaluation point is mandatory. How you truncate is the entire
+design problem.
+
+## Why plain truncation rings
+
+Chopping the sinc off after T samples is the same thing as multiplying the
+ideal infinite filter by a rectangular window. Multiplication in time is
+convolution in frequency, so the ideal filter's perfectly sharp frequency
+response gets smeared by the rectangle's spectrum — and the rectangle's
+spectrum is awful: its sidelobes start at −13 dB and buy you a stopband of
+only about −21 dB. Worse, this is a *shape* problem, not a *size* problem.
+Doubling T squeezes the smearing into a narrower band (the transition
+sharpens) but the first sidelobe stays at the same level — the Gibbs
+phenomenon. A truncated sinc leaks images at −21 dB whether it has 12 taps
+or 12,000, and −21 dB is roughly the error of cheap linear interpolation.
+For a 120 dB budget, truncation alone is off by five orders of magnitude.
+
+The fix is to taper instead of chop: multiply the sinc by a window that
+falls smoothly to zero at the edges. Every smooth window trades the same
+two currencies — a wider main lobe (slower transition, so more taps for the
+same band edges) buys lower sidelobes (deeper stopband). The question is
+only which window spends the taps most efficiently.
+
+## The Kaiser window, and what to cite
+
+James Kaiser's answer (Kaiser 1974; the survey that made it standard
+practice is harris 1978) is the *I₀–sinh* window,
+
+```text
+w[i] = I₀(β · √(1 − u²)) / I₀(β), u ∈ [−1, 1] across the window,
+```
+
+where I₀ is the zeroth-order modified Bessel function. It is a closed-form
+approximation to the *prolate spheroidal* window — the provably optimal
+concentration of energy in the main lobe — that costs one special function
+instead of an eigenvalue problem. Its virtue for engineering is the single
+knob: **β alone sets the sidelobe level**, continuously, from rectangular
+(β = 0) to arbitrarily deep, and Kaiser published empirical formulas mapping
+a stopband spec in dB directly to β and to the filter length. No iteration,
+no optimization run, no table lookup: attenuation in, coefficients out.
+
+That is the theory, and this book will not re-derive it — Kaiser's paper
+and harris's survey do it properly. What they do *not* tell you is how to
+evaluate I₀ in a `noexcept` header without a math library that provides it,
+what happens to the length formula when a caller hands it garbage, why the
+normalization constant is `L` and not 1, or whether any of this should run
+at compile time. That is the rest of this chapter.
+
+
+
+*The knob in action: the presets' attenuation targets (96/120/140 dB) map
+through `kaiserBeta` to β = 9.6/12.3/14.5, and higher β buys its deeper
+stopband by tapering the window harder — which widens the main lobe, which
+is why `estimateTaps` charges more taps for the same transition width.
+Generated by `scripts/book_figures.py` from the same formulas.*
+
+## `besselI0`: a power series with an escape hatch
+
+`<cmath>` has no I₀ (`std::cyl_bessel_i` exists in the special-functions
+annex, but it is optional, absent from libc++, and this library targets
+toolchains as odd as hexagon-musl). So the file computes it from the power
+series
+
+```text
+I₀(x) = Σₖ [ (x/2)ᵏ / k! ]²
+```
+
+which converges for every finite x:
+
+```cpp
+{{#include ../../../include/srt/detail/kaiser.hpp:kai_besseli0}}
+```
+
+Three details carry all the engineering.
+
+**The recurrence.** Each term is the previous term times `(x/2k)²` — no
+factorials, no powers, no overflow staging. Term k relates to term k−1 by
+exactly the ratio `r²`, computed in two multiplies. For the β values this
+library ever produces (about 12.3 for the 120 dB preset, 14.5 for 140 dB)
+the terms grow until k ≈ x/2 ≈ 6 and then collapse factorially; a few dozen
+terms reach full double precision, matching the "~50-term" budget the
+file's header comment charges against constexpr evaluation.
+
+**The stopping criterion.** `term < 1e-21 * sum` stops when the next term
+can no longer perturb the sum's 16 significant digits — a *relative* test,
+so it is correct whether I₀ is 1.0001 or 10⁴ (it is about 19,000 at β = 12).
+The margin below double epsilon (≈ 2.2·10⁻¹⁶) costs a handful of extra
+iterations and removes any temptation to reason about rounding at the
+boundary.
+
+**The iteration cap — the line a textbook would not print.** The loop bound
+`k < 1000` looks redundant: the series *always* converges, so the relative
+test *always* fires eventually. For every real x, yes. Feed the function a
+NaN — say, from an uninitialized config field three call frames up — and
+every comparison involving `term` is false, including the exit test. An
+unbounded loop in a `noexcept` function would hang the caller's constructor
+forever. With the cap, the worst case is a garbage return value that the
+converter-level validation (chapter after next) rejects anyway. The cap is
+not about convergence; it is about making *termination* independent of
+floating-point semantics. This costs one integer compare per iteration and
+turns an unprovable property into a checkable one.
+
+The unit test pins the function against reference values computed
+independently (`besselI0(1.0) = 1.2660658777520084…`), at tolerances that
+scale with the magnitude — 10⁻¹² absolute near 1, 10⁻⁶ near 19,000 — i.e.
+constant *relative* accuracy, which is what the window formula's ratio
+`I₀(β·…)/I₀(β)` actually consumes.
+
+## `kaiserBeta`: an empirical fit, taken as published
+
+```cpp
+{{#include ../../../include/srt/detail/kaiser.hpp:kai_beta}}
+```
+
+This is Kaiser's published fit, digit for digit — `0.1102`, `0.5842`,
+`0.07886` are his constants, not this library's, and the right response to
+magic numbers with a citation is to copy them exactly and test them exactly
+(the unit test asserts the formulas symbolically, so a typo in a constant
+cannot hide). Two things are worth understanding rather than memorizing:
+
+- **Why piecewise.** The relationship between β and achieved attenuation is
+ smooth but not polynomial; Kaiser fit it in two regimes. Above 50 dB the
+ relationship is essentially linear. Between 21 and 50 dB the fractional
+ power term takes over. Every preset this library ships (96–140 dB) lives
+ on the first line; the second exists so that off-spec experiments degrade
+ gracefully instead of nonsensically.
+- **Why zero below 21 dB.** A rectangular window — no taper at all —
+ already achieves about 21 dB. Asking the fit for less than the free
+ floor correctly returns "don't taper."
+
+## `estimateTaps`: the cost formula, with a seatbelt
+
+β sets the stopband *depth*; the number of taps sets how fast the response
+can *fall* into it. Kaiser's length estimate (the form popularized by
+harris) says taps scale linearly with attenuation and inversely with
+transition width:
+
+```cpp
+{{#include ../../../include/srt/detail/kaiser.hpp:kai_estimate}}
+```
+
+Note what the signature normalizes to: transition width *as a fraction of
+the input rate*, and the return is taps *per polyphase branch*. The full
+prototype (next section) has `L·T` taps at an oversampled rate of `L·fs`;
+run the classic formula at that rate and both numerator and denominator
+pick up the same factor of L, which cancels. Expressing the estimate per
+branch keeps the caller's arithmetic in the units the caller actually has —
+"8 kHz of transition at 48 kHz" — with no L in sight.
+
+Plug in the `balanced()` preset: 120 dB across a 20→28 kHz transition at
+48 kHz gives `(120 − 8) / (2.285 · 2π · 8000/48000) ≈ 46.8`, so 47 taps;
+the unit test (`Kaiser.TapEstimateMatchesHarrisFormula`) brackets exactly
+this computation at 45–49, and the shipped preset says `tapsPerPhase = 48`
+— the estimate rounded up to an even count (even matters later: the SMLALD
+kernel on Cortex-M33-class parts consumes taps in pairs). This function is
+how the presets were *chosen*; the bank itself takes `T` from the spec, so
+the estimate is a design aid with a unit test rather than a hot dependency.
+
+Then there is the comment at the top of the body, which earns its own
+paragraph because it was not in the first version of this file. The raw
+formula misbehaves at both edges of its domain: `attenDb < 8` makes the
+numerator negative, and a zero or negative transition width divides to
+±infinity. Both would then hit `static_cast<std::size_t>` — and converting
+a negative or non-finite `double` to an unsigned integer is **undefined
+behavior** in C++, not "some big number." Not implementation-defined:
+undefined, the kind UBSan flags and optimizers exploit. An adversarial
+audit of the library flagged the cast; the guard was added in response. The
+predicate is written `!(transWidthNorm > 0.0)` rather than
+`transWidthNorm <= 0.0` deliberately — the negated form is also true for
+NaN, so all three pathologies (negative, zero, NaN) funnel into the same
+clamp, and the attenuation edge is covered by the `n > 4.0` select on the
+other side. The floor of 4 taps is the smallest window the bank will accept.
+A design helper this cheap has no business having *any* input that invokes
+UB, however silly the input.
+
+## `designPrototype`: where all of it lands
+
+```cpp
+{{#include ../../../include/srt/detail/kaiser.hpp:kai_prototype}}
+```
+
+One pass, one output array, but four decisions are packed into these lines.
+
+**The grid.** The prototype is the windowed sinc sampled `L` times per
+input sample — `t = (i − center) / numPhases` is time measured in *input*
+samples. This is the oversampled master filter that the next chapter slices
+into L branches; length `L·T` means 4,096 doubles for `fast()`, 12,288 for
+`balanced()`, 40,960 for `transparent()`. `center` places the peak exactly
+mid-array, which makes the filter linear-phase by symmetry — its group
+delay is a constant `T/2` input samples, the number the converter's latency
+formula quotes.
+
+**The window argument, defensively.** `u` sweeps [−1, 1] across the array
+and feeds `√(1 − u²)`. At the exact endpoints `1 − u²` is zero in real
+arithmetic but can round a few ulps *negative* in floating point, and
+`std::sqrt` of a negative is NaN — one NaN tap would silently poison every
+dot product that ever touches that row. The `std::max(0.0, …)` costs
+nothing and closes the hole. (Notice the theme: this file trusts
+floating-point identities nowhere — not in `sinc`, not in the series exit,
+not here.)
+
+**What `cutoffNorm` means, and its surprising value.** The cutoff is
+normalized so 1.0 sits at the *input* Nyquist, and the caller centers it in
+the transition band: `(passbandHz + stopbandHz) / fs`. For the balanced
+preset that is (20,000 + 28,000)/48,000 = **exactly 1.0** — the −6 dB point
+of this anti-imaging filter sits *at* 24 kHz, with the response still flat
+at 20 kHz and 120 dB down by 28 kHz. A reader trained on decimation filters
+may flinch: doesn't a cutoff at Nyquist let aliasing through? No — this
+filter's job is *interpolation* in a near-unity converter. The images it
+must kill are reflections of the input spectrum around `fs`, so content
+below 20 kHz images no lower than 28 kHz; the band between 20 and 28 kHz
+contains, by construction of the spec, nothing anyone claimed to protect.
+Splitting the transition symmetrically across Nyquist spends the taps where
+they buy audible margin on both sides. This is the first of several places
+where "near-unity only" (the library's headline restriction) converts
+directly into cheaper mathematics.
+
+**The normalization: sum = L, not 1.** A textbook lowpass normalizes its
+coefficient sum to 1 so DC passes at unity gain. This prototype normalizes
+to `L` — because no output sample is ever computed with the whole
+prototype. Each output uses one branch of `T` taps: every L-th coefficient.
+The L branch sums partition the total, and for a good lowpass they
+partition it *evenly* — each branch's DC gain deviates from the mean only
+by stopband-sized leakage (a branch sum is, in DFT terms, the prototype's
+response sampled at multiples of the input rate: exactly the image
+frequencies the stopband suppresses). Normalize the total to L and every
+branch lands at 1 ± leakage; feed the converter DC and DC comes out, at any
+fractional position. That is not left to inspection:
+`Polyphase.DcGainIsUnityAcrossMu` pushes an all-ones window through the
+*built* bank at 64 random μ values and requires unity within 10⁻⁴ — a
+bound loose enough to admit float coefficient storage and row blending,
+tight enough that a normalization bug (off by one branch, off by a factor
+of L) fails by orders of magnitude. One subtle consequence lands two
+chapters from now: with branch gains pinned near 1, the *peak* coefficient
+also sits near 1.0, which is precisely why the fixed-point formats must
+spend a headroom bit (Q1.14, Q1.30) on their coefficients.
+
+## The headline decision: runtime design, not `constexpr`
+
+Everything above is pure functions of compile-time-lookable values — and
+this is C++20, where `constexpr` has teeth. The obvious modern move is to
+evaluate the whole design at compile time: coefficients in `.rodata`
+(attractive on a flash-based microcontroller), zero construction cost, even
+`static_assert`s on the response. The file's own header records why that
+was rejected, and since the reasoning is a design artifact it is kept where
+refactors will trip over it:
+
+```cpp
+{{#include ../../../include/srt/detail/kaiser.hpp:kai_design_note}}
+```
+
+Present the alternative fairly, because it *almost* works:
+
+- **The language isn't there yet.** `std::sin`, `std::sqrt`, `std::pow`
+ are not `constexpr` before C++26 (P1383 fixes this). A C++20 constexpr
+ design needs hand-rolled constexpr transcendentals — several hundred
+ lines of the most bug-prone code in numerics, duplicating functions the
+ runtime already has, in a library whose entire test story leans on
+ comparing against exactly those runtime functions.
+- **The compile-time cost is not a rounding error.** Constexpr evaluation
+ is interpretation, three to four orders of magnitude slower than native
+ code. The design touches every one of 12K–41K taps with a `sin`, a
+ `sqrt`, and a ~50-term Bessel series. What runs in well under 10 ms
+ native becomes tens of seconds to minutes interpreted — **per translation
+ unit**, because a header-only library re-instantiates in every TU that
+ includes it. A user with twenty includes pays twenty times, on every
+ rebuild, forever.
+- **The inputs are not actually compile-time.** The band edges are scaled
+ by the *runtime* sample rate (`FilterSpec::scaledTo`,
+ `Config::forSampleRate`) — a converter constructed for a rate read from
+ an ALSA descriptor at startup cannot have baked coefficients at all. A
+ constexpr path would be a second, divergent code path serving only the
+ subset of users with fully static configs.
+
+Against all that, the runtime cost being amortized is: one design, under
+10 ms, in a constructor documented as setup-time-only, off the audio path
+by the library's own RT rules. The trade is lopsided once written down —
+but only once written down, which is why the file writes it down. (If
+C++26 constexpr math plus a measured compile-time budget someday flips the
+trade for static configs, the pure functions here are already shaped for
+it: no state, no allocation, `std::span` in, coefficients out.)
+
+## The test evidence: the spec, measured by DFT
+
+A filter design module invites a lazy test — "coefficients equal last
+week's coefficients." That freezes bugs in amber. What the library pins
+instead is the *specification*: `tests/test_kaiser.cpp` computes the
+prototype's actual frequency response by direct DFT and asserts the numbers
+the presets advertise.
+
+
+
+*What the spec tests pin: each preset's transition starts at its passband
+edge and reaches its rated floor by its stopband edge, and the detail panel
+shows all three passbands flat within ±0.01 dB. The curves come from
+`scripts/book_figures.py`, which re-runs `designPrototype`'s math verbatim.*
+
+The measurement function evaluates `|H(f)|` at arbitrary frequencies in Hz
+against the oversampled prototype (rate `L·fs`), normalized by L so the
+passband reads 0 dB — a direct O(n) sum per frequency. No FFT: an FFT
+would demand a power-of-two grid, deliver frequencies nobody asked for,
+and drag in a dependency, all to accelerate a few hundred evaluations in a
+test that runs in milliseconds. Then, for each shipped preset:
+
+- **Passband flatness:** every 500 Hz from DC to the passband edge,
+ response within ±0.01 dB of unity. That is the "flat to 20 kHz" claim in
+ the README, as an executable inequality.
+- **Stopband depth:** every 250 Hz from the stopband edge out to *three
+ times the sample rate*, response below −(spec − 1) dB. The 3·fs reach
+ matters: the polyphase structure's images repeat around every multiple
+ of fs, so a stopband that sagged past the first image would pass junk at
+ 96 kHz even if 28 kHz looked fine. The 250 Hz step is calibrated to the
+ filter, not guessed: a T-tap-per-branch prototype has sidelobe nulls
+ spaced fs/T ≈ 1 kHz apart, so 250 Hz sampling puts about four probes on
+ every lobe — a peak cannot hide between probes. The 1 dB grace absorbs
+ the gap between Kaiser's empirical β fit and the realized window; the
+ presets' 120 means "at least 119 measured," and in practice the margin
+ is comfortable.
+
+Honest limits, as always: these tests certify the *double-precision
+prototype*. Coefficient quantization (float, Q1.14, Q1.30) and the
+row-blending residual are downstream effects certified by the next
+chapter's tests and the end-to-end SNR suite — the layering is deliberate,
+so a failure names its culprit.
+
+## Why these ~100 lines look the way they do
+
+| Decision | Alternative rejected | Reason |
+|---|---|---|
+| Kaiser window | Parks–McClellan / remez | one β knob, closed form, no iteration to converge or fail at setup; near-optimal is optimal enough at 120 dB |
+| Power-series I₀ | `std::cyl_bessel_i` | optional annex, missing on libc++/embedded toolchains; the series is 12 lines and testable |
+| Iteration cap `k < 1000` | trust convergence | NaN input defeats the relative-error exit; termination must not depend on FP semantics in a `noexcept` function |
+| UB clamp in `estimateTaps` | trust callers | negative/infinite → `size_t` cast is UB; found by audit, closed at the cost of a single branch |
+| Cutoff centered in transition, up to input Nyquist | classic conservative cutoff | near-unity interpolation only fights images of the protected band; symmetric transition spends taps evenly |
+| Normalize sum to L | sum to 1 | per-*branch* DC gain is what reaches the output; pinned by the DC unit test |
+| Runtime design | C++20 constexpr tables | pre-C++26 constexpr math gap; minutes of interpreted evaluation per TU; runtime sample rates exist; <10 ms once at setup |
+| Spec-based DFT tests | golden coefficient files | tests the claim, not the bits; refactors that preserve the response pass |
+
+## Verify it yourself
+
+```sh
+# Build and run the design-math tests: Bessel/beta reference values, the
+# harris estimate bracket, and the DFT passband/stopband spec checks for
+# all three presets:
+cmake -B build && cmake --build build -j
+ctest --test-dir build -R Kaiser --output-on-failure
+
+# The claim the normalization exists to protect (unity DC gain through the
+# built bank, swept over mu):
+ctest --test-dir build -R Polyphase.DcGain --output-on-failure
+
+# Break it on purpose: in designPrototype, change the normalization to
+# `1.0 / sum` (the textbook choice) and watch DcGainIsUnityAcrossMu fail by
+# a factor of numPhases; or weaken kaiserBeta's 0.1102 to 0.11 and watch
+# the Transparent stopband check report the exact frequency that leaks.
+```
+
+Both sabotage runs are worth the five minutes: the first shows you which
+test owns the normalization contract, and the second shows the empirical β
+fit has no slack at 140 dB — which is precisely why the constants are
+copied from Kaiser 1974 to the last digit.
diff --git a/book/src/part1/pi-servo.md b/book/src/part1/pi-servo.md
new file mode 100644
index 0000000..b407866
--- /dev/null
+++ b/book/src/part1/pi-servo.md
@@ -0,0 +1,525 @@
+# The clock servo: `pi_servo.hpp`
+
+> A governor is a part of a machine by means of which the velocity of the machine is kept nearly uniform, notwithstanding variations in the driving-power or the resistance.
+>
+> — James Clerk Maxwell, *On Governors* (1868)
+
+There is a number this entire library exists to find, and nobody will tell
+it to us.
+
+Call it ε: the fractional rate mismatch between the two crystals. The
+producer's device claims 48 kHz and delivers 48 000 × (1 + ε) frames per
+second; the consumer's device claims 48 kHz and takes them away at
+48 000 × (1 + something else). ε is a few parts per million, it wanders
+with temperature, and no API on either side will report it — the whole
+premise of the problem is that both devices believe they are correct. The
+resampler in the next chapter can apply any rate correction we ask of it,
+to a resolution of 2⁻⁶⁴ samples. It just needs to be told the number.
+
+The only observable we have is the elastic buffer between the domains: the
+SPSC ring from the last chapter, whose occupancy was designed to be *exact*
+for precisely this reason. If the producer's clock is fast by ε and we
+consume at exactly the nominal rate, the buffer fills at ε × fs frames per
+second — just under ten at 200 ppm and 48 kHz. That trickle is the
+entire signal. The servo's job is to turn it into an estimate ε̂ good
+enough that the resampler's output carries no audible trace of the
+estimation process — and "audible trace" here means fluctuations in ε̂,
+because whatever wobble the servo passes into the rate estimate
+frequency-modulates every sample of the audio.
+
+This chapter is control theory for someone who has never tuned a loop,
+taught the way this file was actually designed: start with the physics of
+the thing being controlled, discover why the obvious controller fails,
+derive the one that works, and then spend most of our effort on the real
+enemy — which turns out not to be the clocks at all, but the fact that we
+can only *count*.
+
+## The plant: a buffer that integrates
+
+Control theory calls the thing you are controlling the *plant*. Ours is
+the FIFO, and its equation of motion is one line. The producer inserts
+fs × (1 + ε_true) frames per second. The converter synthesizes fs output
+frames per second, and for each output frame it consumes (1 + ε̂) input
+frames — that is what "phase advance = 1 + ε̂" will mean in the next
+chapter. Occupancy changes at the difference of those rates:
+
+```text
+d(occ)/dt = fs · (ε_true − ε̂)
+```
+
+The buffer is a **pure integrator** with gain fs. Feed it a rate error and
+it does not settle at some proportional level — it ramps, forever, until
+it hits a wall (empty: dropout; full: overflow). Two consequences follow
+immediately. First, doing nothing is not an option even for arbitrarily
+small ε: any uncorrected mismatch is a glitch with a countdown timer on
+it. Second, the plant's own integration is going to interact with whatever
+memory the controller has, and getting that interaction right *is* the
+design.
+
+The servo observes the occupancy once per `pull()` — the converter calls
+`update(occ, mu, dt)` with the raw backlog in frames, the resampler's
+current fractional position μ (so the observable `occ + mu` moves
+continuously through whole-sample slips instead of staircasing by ±1),
+and the elapsed time `dt = framesPulled / fs`.
+
+## Why proportional control is not enough
+
+The obvious controller is proportional: measure the occupancy error
+`e = occ − target`, set ε̂ = Kp·e. If the buffer is too full, consume
+faster; too empty, consume slower. It even works, in the sense that it
+does not fall over.
+
+Now ask what it converges *to*. In steady state the occupancy stops
+moving, so the plant equation forces ε̂ = ε_true — the estimate must equal
+the true offset exactly. But a proportional controller can only produce
+ε̂ = Kp·e, so the error cannot be zero: it must park at
+
+```text
+e_ss = ε_true / Kp
+```
+
+a *standing occupancy offset* proportional to the clock mismatch. Plug in
+the numbers this library actually uses and the problem stops being
+academic. At the steady-state loop bandwidth of 0.05 Hz (we will get to
+why it is that low), Kp ≈ 1.3 × 10⁻⁵ per frame. A routine 300 ppm crystal
+offset parks the buffer **23 frames** away from its setpoint — half the
+default 48-frame latency budget gone, sitting one frame shy of the default
+24-frame unlock threshold, and different for every unit in the field
+because every crystal pair drifts differently. Latency that depends on
+which two devices you happened to plug in is not a spec anyone signs.
+
+The fix is memory. Add an integral term:
+
+```text
+ε̂ = Kp·e + Ki·∫e dt
+```
+
+The integrator accumulates error until the error is gone: in steady state
+it holds the entire ppm estimate by itself, ε̂ = ε_true with **zero
+standing occupancy error**. Control theory calls the combination a *type-2
+loop* — two integrators around the cycle, the plant's and the
+controller's — and type-2 is exactly the order needed to null a constant
+rate offset. `tests/test_servo.cpp` pins this down against a pure
+simulation of the plant equation: after settling at +300 ppm, the
+occupancy must sit within 0.05 frames of the setpoint and ε̂ within 1 ppm
+of the truth
+(`Servo.LocksFromConstantOffsetAndNullsError`).
+
+A type-2 loop also does something a type-1 cannot: it follows a *ramp* in
+the offset — a crystal warming up, drifting at 1 ppm/s — with bounded
+rather than growing error. The residual is the classic acceleration error
+`e_ss = (dε/dt · fs) / ωₙ²`, about 0.49 frames for 1 ppm/s at the 0.05 Hz
+bandwidth, and `Servo.TracksSlowDriftRampWithBoundedLag` holds the
+measured lag under one frame while `epsHat` tracks the moving truth to
+2 ppm.
+
+If this structure sounds familiar, it should. Replace "FIFO occupancy"
+with "phase difference" and this is a **phase-locked loop**: the FIFO
+comparison is the phase detector, the PI filter is the loop filter, and
+the resampler's μ accumulator is the numerically controlled oscillator.
+The README states the analogy flatly and it is worth internalizing,
+because it means every result in fifty years of PLL literature applies —
+including the one that matters most here: the loop bandwidth f_L
+*partitions* the input timing jitter. Components above f_L are absorbed
+by the buffer and never reach the audio; components below f_L pass into
+ε̂ and frequency-modulate it. Choosing f_L is choosing which noise you
+eat.
+
+## From bandwidth to gains
+
+So the designer picks a bandwidth and a damping; the gains should follow
+mechanically. Close the PI controller around the integrator plant and the
+loop's characteristic equation is
+
+```text
+s² + fs·Kp·s + fs·Ki = 0
+```
+
+Match it against the standard second-order form
+`s² + 2ζωₙs + ωₙ² = 0` — the form whose behavior every control textbook
+tabulates — and read off the gains:
+
+```text
+ωₙ = 2π·f_L Kp = 2ζωₙ / fs Ki = ωₙ² / fs
+```
+
+The code computes exactly this, nothing more:
+
+```cpp
+{{#include ../../../include/srt/pi_servo.hpp:sv_gains}}
+```
+
+Note the division by `fs_` in both gains: the plant's gain is fs, so the
+controller divides it back out, and the *closed-loop* behavior depends
+only on f_L and ζ. That innocuous-looking normalization is load-bearing —
+it is why the gains formula is rate-portable, and (foreshadowing the first
+war story) why everything *else* in the config is not.
+
+Damping defaults to ζ = 1, critical damping: the fastest settling that
+never overshoots. Overshoot in this loop is not a cosmetic wiggle — an
+occupancy overshoot is latency spent grazing the underrun floor, so the
+choice is not stylistic.
+
+Here is the full tuning surface, with the defaults that suit a 48 kHz
+near-unity converter:
+
+```cpp
+{{#include ../../../include/srt/pi_servo.hpp:sv_config}}
+```
+
+Three bandwidths, three smoother corners, and a small state machine's
+worth of thresholds. A single PI loop needs exactly two numbers; this
+config carries fourteen. The rest of the chapter is about earning each of
+the extra twelve.
+
+## The enemy: a sawtooth made of counting
+
+If the occupancy were a real number observed noiselessly, one PI loop at
+a modest bandwidth would end this chapter. It is not. The occupancy is a
+**count** — quantized to whole frames on the producer side, or to whole
+*push blocks* when the producer delivers audio in callbacks — and that
+quantization is not benign random noise. It is deterministic and
+periodic.
+
+Picture the steady state at +200 ppm with sample-granular transfer. The
+true (unquantized) backlog creeps upward by ε input samples per sample
+consumed; every time the creep accumulates one whole frame, the count
+steps. The observable is a perfect sawtooth: one push-block peak to peak,
+repeating at the *beat frequency*
+
+```text
+f_beat = ε · fs / pushBlock (the README's "ppm × pushRate")
+```
+
+At 200 ppm and sample-granular push that is 9.6 Hz with a one-frame tooth.
+With 32-frame callbacks it is 0.3 Hz with a **32-frame** tooth — the
+occupancy legitimately swings ±16 frames with neither clock having
+moved. (`AsrcLock.LocksAndHoldsAtConstantOffset` averages straight
+through that sawtooth and requires the *mean* fill and ppm to land on the
+truth.)
+
+Why care about a deterministic wobble in a number we only use for its
+average? Because the loop does not know it is a wobble. Whatever fraction
+of the sawtooth survives into ε̂ becomes a periodic modulation of the
+resampling rate — FM sidebands on every tone in the program material, at
+offsets of f_beat and its harmonics. And a PI controller is a terrible
+filter: above f_L its proportional path passes measurement noise straight
+through at gain Kp, flat, forever. Narrowing f_L does not fix this by
+itself; it lowers Kp (helping linearly) while the sawtooth needs 60–120 dB
+of suppression. The loop needs help *before* the loop: error prefilters.
+
+But a prefilter is lag, and lag inside a feedback loop erodes phase
+margin; you cannot smooth aggressively *and* acquire quickly with the same
+settings. There is no single operating point that pulls in a cold start
+within a second, rejects a 9.6 Hz sawtooth by 100+ dB, and follows a
+warming crystal. So the servo refuses to pick one point. It picks three.
+
+## Three loops, one integrator
+
+| Stage | Loop bandwidth | Error prefilter | Role |
+|---|---|---|---|
+| **Acquire** | 10 Hz | 1-pole, 50 Hz | pull in from a cold start (~1 s to lock) |
+| **Track** | 1 Hz | 1-pole, 5 Hz | robust lock; terminal stage for coarse-block transfer |
+| **Quiet** | 0.05 Hz | 3-pole cascade, 0.5 Hz | steady state for fine-grained transfer |
+
+Each stage is the same PI structure with gains from the same
+`computeGains`, differing only in bandwidth and in how hard the
+measurement is smoothed before the loop sees it. The update begins by
+maintaining *both* kinds of smoothed error on every call:
+
+```cpp
+{{#include ../../../include/srt/pi_servo.hpp:sv_update_smooth}}
+```
+
+Two details here repay attention. The smoothing coefficient
+`alpha(cornerHz, dt) = 1 − exp(−2π·cornerHz·dt)` is the exact discrete step of a
+one-pole lowpass over an arbitrary interval, so the filter corners are
+honest frequencies in Hz regardless of how large or irregular the pull
+blocks are — the same property the gain formulas have via `dt` in the
+integrator. And the three-pole quiet cascade (`q1_ → q2_ → q3_`) runs
+**always**, even in Acquire and Track where its output does not drive the
+loop. That costs three multiply-adds per block and buys two things: the
+promotion gate into Quiet has real data to judge (next section), and at
+the instant of promotion the cascade is already settled on the observable
+— no filter warm-up transient handed to the narrowest, most fragile
+stage.
+
+Why a *cascade* of three identical poles rather than one pole three times
+lower, or something sharper? Rolloff. One pole buys 6 dB/octave above its
+corner; three poles buy 18 dB/octave. Against the 9.6 Hz sawtooth, a
+0.5 Hz three-pole cascade provides roughly (9.6/0.5)³ ≈ 77 dB of rejection
+before the loop even sees the error — while adding only manageable lag at
+the 0.05 Hz loop bandwidth, two decades below the beat. The file header states the
+net result as a system-level figure: in Quiet, a one-frame sawtooth is
+rejected to roughly −120 dBc equivalent at 20 kHz, while the loop still
+follows a 1 ppm/s drift ramp with under half a frame of standing error.
+Sharper IIR shapes (resonant poles, elliptic-style) would trade that
+clean, phase-predictable lag for ringing inside a feedback loop — exactly
+the wrong place for it.
+
+## The promotion machine
+
+Three stages need transitions, and transitions are where multi-mode
+controllers usually betray you — a bandwidth switch with mismatched state
+is a step input injected into your own loop. Here is the whole state
+machine:
+
+```cpp
+{{#include ../../../include/srt/pi_servo.hpp:sv_update_stages}}
+```
+
+Reading it as a protocol: promotion out of Acquire requires the *fast*
+smoothed error to stay inside one frame for half a second; promotion out
+of Track requires the *cascade* error to stay inside one frame for two
+full seconds. Demotion is the same test run backwards with a much wider
+threshold — 24 frames — and drops exactly one stage. The asymmetry
+(narrow gate up, wide gate down, long holds) is hysteresis by
+construction: the servo would rather linger a stage wide than oscillate
+between modes.
+
+The choice of *which* error gates the Track→Quiet promotion is the
+subtlest line in the file, and it earns the second war story below.
+Gating on the cascade-smoothed error means the promotion asks precisely
+the question that matters: *after the smoothing Quiet would actually use,
+is the observable quiet enough to run a 0.05 Hz loop?* When a large block
+beat dominates the occupancy, the answer is naturally and persistently
+no — the cascade output wobbles by more than a frame at the beat
+frequency, the hold timer keeps resetting, and the servo stays in Track.
+Nobody wrote a rule that says "coarse-block configurations must not enter
+Quiet." The physics writes it.
+
+Both promotions share their hold logic, and it does double duty:
+
+```cpp
+{{#include ../../../include/srt/pi_servo.hpp:sv_hold}}
+```
+
+While the hold window runs, the servo is not just waiting — it is
+averaging its own output ε̂ with a time constant of a fifth of the hold.
+Here is why that average exists. The wide stages do not *reject* the
+quantization sawtooth; they phase-track it, riding the wobble with their
+whole loop. Their instantaneous ε̂ is therefore a good estimate wrapped in
+a periodic error. Averaging over the hold window (many beat cycles)
+strips the wobble and leaves the clean central value — and at the moment
+of promotion, *that* is what gets loaded into the narrower stage's
+integrator (`integ_ = clamp(epsAvg_, ...)` in the state machine above).
+
+Recall what the integrator *is* in steady state: the entire rate
+estimate. Handing the next stage a clean integrator means handing it a
+loop that is already essentially converged; the proportional path only
+has to clean up residuals. That is the transient-free handoff — "to first
+order," as the header says, because the smoothers keep their state and
+the observable keeps its continuity, so nothing steps.
+`Servo.BandwidthSwitchIsTransientFree` runs the plant through lock and
+across both promotions and requires the occupancy never to leave the
+one-frame lock threshold afterwards: a handoff you cannot find in the
+data.
+
+## The output stage, and why the clamp is inside
+
+The last lines of `update()` are the PI itself:
+
+```cpp
+{{#include ../../../include/srt/pi_servo.hpp:sv_update_out}}
+```
+
+The clamp appears twice, and the first one — on the integrator, not just
+the output — is the anti-windup that every practical PI needs and every
+first implementation forgets. Consider a consumer stall: the occupancy
+error goes huge and stays huge for seconds while the converter waits for
+the high-watermark resync. An unclamped integrator would spend that whole
+time charging toward a rate estimate of thousands of ppm — a number no
+crystal pair can produce — and then, after the disturbance clears, the
+loop would have to *discharge* all of that false conviction through its
+narrow bandwidth, dragging the occupancy through a huge excursion for tens
+of seconds. Clamping the integrator at 1.5 × `maxDeviationPpm` bounds the
+lie the loop can tell itself: the estimate can never leave the range
+physics allows, so recovery from any disturbance starts at most one clamp
+width from the truth. The output clamp then bounds what the resampler is
+asked to do per sample (which also protects the Q0.64 conversion in the
+next chapter). `Servo.ClampsToMaxDeviation` feeds a 10 000-frame error and
+requires the output to saturate exactly at 1.5× the configured range.
+
+## Knowing when not to chase: `seed()` and `reset()`
+
+```cpp
+{{#include ../../../include/srt/pi_servo.hpp:sv_reset}}
+```
+
+A feedback loop's reflex is to chase every step in its input. Some steps
+carry no information, and the API encodes each such case explicitly:
+
+- **`seed(occPlusMu)`** snaps all four smoothers onto the current
+ observable. The converter calls it when the occupancy jumps *for a
+ known reason* — acquisition start, a hard resync discard. Without it,
+ the smoothers would report the jump as a genuine multi-frame error and
+ the loop would obediently swerve.
+- **`reset(keepIntegrator=true)`** re-arms the state machine after a
+ dropout but preserves the integrator — because a dropout says nothing
+ about the crystals. The ppm estimate from before the glitch is still
+ the best available number, and relock becomes a formality
+ (`Servo.DropoutResetKeepsPpmEstimate` pins both flavors: `true`
+ preserves the estimate to 5 ppm, `false` zeroes it).
+- **`setTarget()`** moves the setpoint while keeping the integrator *and*
+ the smoothers' tracking state, so the loop slews to the new occupancy
+ at its clamped rate with no discontinuity — used by the converter's
+ adaptive pull-block setpoint raise, where the setpoint moves but,
+ again, the clocks have not.
+
+The shared principle: the integrator is the loop's knowledge and the
+smoothers are its perception. Each event handler keeps exactly the state
+that is still true and resets exactly the state that is not.
+
+## War story one: 16 kHz, minus 32 decibels
+
+For a long time this library's defaults were "the defaults," full stop —
+designed, tested, and shipped at 48 kHz. Then a real deployment shape
+arrived: 16 kHz reference-microphone processing. Same code, same presets,
+a third of the sample rate. The quality suite was duplicated at 16 kHz,
+expecting boring numbers.
+
+The numbers came back **~32 dB worse at every tone**, falling a further
+6 dB per octave of signal frequency. That frequency signature is the
+fingerprint of small-index FM — phase modulation of the resampling
+position, whose sidebands grow with the modulated signal's frequency —
+which pointed at the servo, not the filter.
+
+The mechanism, worked out in
+`tests/test_asrc_quality_16k.cpp`'s header comment and now baked into the
+config comment: servo bandwidths and smoother corners are **absolute
+hertz**, but the disturbance they exist to reject is not. The slip-beat
+sawtooth sits at ε × fs — 9.6 Hz at 48 kHz, only **3.2 Hz at 16 kHz**.
+The three-pole 0.5 Hz cascade whose rejection goes as f³ therefore does
+(48/16)³ = 27× (≈ 28.6 dB) *less* damage to the beat at 16 kHz, and the
+measurement becomes servo-FM-limited: predicted ≈ 28.6 dB, measured
+≈ 32 dB. The loop was not misbehaving. It was doing exactly what its
+absolute-Hz constants said, against a disturbance that had moved.
+
+The rule that fixes it is now a method, so it cannot be half-remembered:
+
+```cpp
+{{#include ../../../include/srt/pi_servo.hpp:sv_scaled_to}}
+```
+
+Every field with units of Hz scales with the rate — keeping the loop
+identical in *normalized*, per-sample terms, which is the frame the
+disturbance lives in. Every field denominated in frames or ppm
+(`lockThresholdFrames`, `unlockThresholdFrames`, `maxDeviationPpm`) is
+already normalized and stays put. And the hold times scale *inversely*:
+a loop with a third the bandwidth has time constants three times longer,
+so waiting "2 seconds" before promoting would mean waiting a third as
+many loop time constants — the gates would fire on less evidence. The
+original hand-scaled 16 kHz configuration missed the hold-time rule;
+adding it left the measurements identical within noise, and the test suite now
+covers the factory (`Config::forSampleRate`, which applies this and the
+matching `FilterSpec::scaledTo`) both structurally
+(`AsrcQuality16k.ForSampleRateScalesHzFieldsOnly` checks exactly which
+fields move) and behaviorally: through the factory, 16 kHz measures
+136.6 dB at 333 Hz — within ~1 dB of 48 kHz at the same normalized
+frequency, the 32 dB fully recovered.
+
+One more cost of scaling, honestly: at 16 kHz the Quiet loop runs at
+~0.017 Hz, so the quality tests run for 120 seconds of simulated audio
+instead of 40 — the same number of loop time constants. Slow loops are
+slow everywhere, including in CI.
+
+## War story two: when Track is the ceiling
+
+The block-size study (`notebooks/asrc_block_size_study.ipynb`) asked what
+happens as transfer granularity coarsens from sample-granular toward the
+32- and 240-frame callbacks real audio APIs deliver. The finding shapes
+how you should read the stage table: with blocks of 32 frames and up,
+**the servo never promotes to Quiet — and must not.**
+
+The information-theoretic version of the argument: at a 32-frame block,
+the occupancy observable updates a few hundred times per second with a
+±16-frame deterministic sawtooth on top of a sub-frame-per-second signal.
+Quiet-level performance means resolving the backlog trend to a small
+fraction of a frame *through* that tooth using counts alone; the counts
+simply do not carry the information. The promotion gate discovers this
+without being told: the cascade-smoothed error keeps swinging past one
+frame at the beat frequency, the two-second hold never completes, and
+Track becomes the terminal stage — the discriminator working as designed.
+
+What does Track-forever sound like? The 1 Hz loop phase-tracks the block
+beat: most of the sawtooth is absorbed as **latency breathing** — the
+buffer level, and hence the delay, swaying by a fraction of the block at
+the beat rate, inaudible by construction. The remainder leaks into ε̂ as
+low-rate FM, and the study put calibrated numbers on it: **~0.9 cents rms
+of frequency wobble (61 dB wideband quality) at 32-frame blocks, ~1.3
+cents / 53 dB at 5 ms blocks**, as the README reports. Cent-scale wobble
+at sub-hertz rates is at the edge of perception for sustained pure tones
+and irrelevant for program material — but it is a real ceiling, and it is
+a *sensor* ceiling, not a servo defect. The README's limitations section
+draws the forward-looking conclusion: breaking it requires a better
+observable (per-block timestamps for sub-sample phase observation), not
+a cleverer filter behind the same counts.
+
+The practical corollary is the config comment you may have skimmed past
+on `unlockThresholdFrames`: it must sit comfortably above **half the
+push/pull block size**, because a coarse-block sawtooth legitimately
+swings that far with the clocks standing still. The default 24 clears
+a 32-frame transfer's ±16 with margin. Undersize it — say, 8 against
+32-frame callbacks — and the healthy beat itself trips demotion:
+Track→Acquire, re-lock, promote, trip again, a mode limit cycle
+manufactured entirely in configuration. If you change one servo number
+for an embedded deployment, this is the one to check.
+
+## The whole life cycle, measured
+
+Everything this chapter described is visible in one trace: the converter
+driven at +200 ppm in deterministic virtual time (1-frame pushes — the
+long tests' methodology), with a 50 ms producer stall injected at t = 28 s.
+
+
+
+*Acquiring's 10 Hz loop rings clamp-to-clamp on the quantized occupancy —
+the sawtooth of the "enemy" section, live — yet the smoothed occupancy
+never strays two frames from the setpoint, and promotion lands in half a
+second. After the stall, `reset(true)` keeps the integrator, so the
+re-acquire rings around 200 ppm rather than starting over from zero.
+Generated by `scripts/book_figures.py`, which compiles a small trace
+dumper against the real headers and runs exactly this scenario.*
+
+## The shape of the design
+
+| Decision | Alternative rejected | Reason |
+|---|---|---|
+| PI (type-2) loop | proportional-only | P parks a ppm-dependent occupancy offset (≈23 frames at 300 ppm in Quiet); the integrator nulls it |
+| Gains derived from (f_L, ζ) via 2nd-order matching | hand-tuned constants | tuning surface is two physical numbers; `computeGains` is the textbook formula, verifiable by inspection |
+| Three stages | one compromise bandwidth | pull-in wants 10 Hz, sawtooth rejection wants 0.05 Hz + heavy smoothing; no single point does both |
+| Cascade error gates promotion | timer or lock-counter | asks the exact question ("could Quiet's own filtered error hold lock?"); auto-excludes coarse blocks |
+| Integrator seeded from hold-window average | reset on transition | wide stages phase-track the sawtooth; the average is the clean estimate — handoffs transient-free |
+| Integrator clamp (anti-windup) | clamp output only | disturbances must not charge the estimate past physics; recovery starts near the truth |
+| `seed()`/`reset(keepIntegrator)` API | let the loop chase every step | known-cause jumps carry no clock information; keep the knowledge, refresh the perception |
+| `scaledTo()` for other rates | reuse 48 kHz defaults | absolute-Hz constants vs a rate-proportional disturbance: measured −32 dB at 16 kHz |
+
+## Verify it yourself
+
+```sh
+# The five servo unit tests against the pure plant equation
+# (type-2 nulling, ramp tracking, transient-free handoff, clamp, reset):
+ctest --test-dir build -R 'Servo\.' --output-on-failure
+
+# The servo inside the real converter: lock/hold through the 32-frame
+# block beat, drift-ramp tracking, slip continuity, stall recovery:
+ctest --test-dir build -R 'AsrcLock\.' --output-on-failure
+
+# War story one, end to end (long: 120 s simulated per tone; prints the
+# measured SNRs — compare against the thresholds in the file):
+ctest --test-dir build -R 'AsrcQuality16k\.' --output-on-failure
+
+# War story two: regenerate the block-size study (32 / 64 / 240 frames,
+# latency breathing and the cents-rms FM decomposition):
+jupyter nbconvert --to notebook --execute notebooks/asrc_block_size_study.ipynb
+
+# Break it on purpose: in tests/test_asrc_quality_16k.cpp, replace
+# Config::forSampleRate(kFs) with a default-constructed Config (keeping
+# cfg.sampleRateHz = 16000.0) and watch ~32 dB vanish from every tone.
+```
+
+As with the ring buffer, the last item is the chapter in one line. The
+three stages, the cascade, the scaling rule — none of it is decoration.
+Take any piece away and a measurement, not an opinion, tells you what it
+was holding back.
diff --git a/book/src/part1/polyphase-bank.md b/book/src/part1/polyphase-bank.md
new file mode 100644
index 0000000..a9c07a6
--- /dev/null
+++ b/book/src/part1/polyphase-bank.md
@@ -0,0 +1,358 @@
+# The polyphase bank
+
+> Show me your flowcharts and conceal your tables, and I shall continue to be mystified. Show me your tables, and I won't usually need your flowcharts; they'll be obvious.
+>
+> — Fred Brooks, *The Mythical Man-Month*
+
+The previous chapter ended with a prototype filter: 12,288 double-precision
+coefficients (for the default preset) describing one ideal anti-imaging
+lowpass, oversampled 256× against the input rate. This chapter is about a
+data structure. Per output sample, the converter's budget is one dot
+product of 48 multiply-accumulates — not 12,288 — and the fractional
+position μ arrives with 2⁻⁶⁴-sample resolution, demanding a filter for a
+delay the table cannot possibly enumerate. `PolyphaseFilterBank` is the
+arrangement of those 12,288 numbers that makes the right 48 of them, for
+*any* μ, a matter of two pointer offsets and a linear blend. Almost
+everything interesting about it is in the layout: one extra row nobody
+asked for, every row stored backwards, and a table that no code path may
+touch after its constructor returns.
+
+## The decomposition: L filters hiding in one
+
+Recall what the prototype is: the windowed sinc sampled on a grid of 1/L
+input samples, `L·T` taps long. Evaluating the input signal at a position
+`p/L` between samples means dotting the T input samples in the window
+against the sinc *offset by p/L* — which, on the prototype's grid, is
+simply every L-th coefficient starting at p:
+
+```text
+branch 0: h[0], h[L], h[2L], … h[(T−1)L] delay 0
+branch 1: h[1], h[L+1], h[2L+1] … delay 1/L sample
+branch p: h[p], h[L+p], h[2L+p] … delay p/L sample
+branch L−1: … delay (L−1)/L
+```
+
+That is the entire polyphase decomposition for this use case — no z-domain
+identities required. One oversampled filter *is* L ordinary T-tap filters
+interleaved, each a fractional-delay filter for one grid position. Nothing
+is computed to "decompose" it; the bank merely copies the prototype into a
+`(rows × T)` table so that each branch's taps — which are strided L apart
+in the prototype — become contiguous in memory, because the dot product
+will read them T-at-a-time, millions of times, and the prototype order
+would stride the cache to death. The classic references derive this
+structure for rational resamplers (it is also how commercial ASRC silicon
+like the AD1896 organizes its ROM); here it is simpler, because near-unity
+operation means each output needs exactly *one* branch evaluation — the
+question is only which branch, and what to do between branches.
+
+## Between the branches: why L = 256 and a linear blend
+
+μ is a 64-bit fraction; the table has L rows. Rounding μ to the nearest
+row would quantize the delay to 1/L of a sample, and delay quantization on
+a moving signal is *noise* — worse at high frequencies, where a fixed time
+error subtends more phase. The bank's answer is the standard one at this
+quality tier: pick the two rows adjacent to μ·L and interpolate the
+*coefficients* linearly between them. The residual error of that blend is
+the quality knob the spec exposes:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:bank_spec}}
+```
+
+The comment's two slopes are the design law for choosing L, and they are
+measured properties of this code, not folklore (the README derives its
+quality table from the test suite): the blend residual falls **about 12 dB
+for every doubling of L** — linear interpolation has second-order error,
+so halving the grid step quarters the error — and rises **about 12 dB per
+octave of signal frequency**, because coefficient interpolation error acts
+like a second derivative and high frequencies bend faster. You can see the
+frequency slope directly in the shipped numbers for `balanced()`
+(L = 256): 135 dB SNR at 997 Hz, 120 dB at 6 kHz, 112 dB at 12 kHz, 105 dB
+at 19.5 kHz — once the signal frequency is high enough for the blend
+residual to dominate, each octave costs roughly the predicted 8–12 dB. The
+unit tests pin the same staircase at the kernel level, single tones against
+the analytic sine: worst-case error below −120 dB at 997 Hz, −110 dB at
+4 kHz, −100 dB at 10 kHz, −90 dB at 19 kHz.
+
+Why not simply crank L and skip the blend? Cost. Nearest-row lookup has
+*first*-order error — about 6 dB per doubling — so matching the blend's
+accuracy at 19.5 kHz would take L in the hundreds of thousands and a table
+in the hundreds of megabytes. With the blend, `balanced()` is
+(256 + 1) × 48 float coefficients ≈ 48 KB — resident in L2, arguably L1,
+on hosts, and tolerable in MCU RAM at Q15 (≈ 24 KB). `transparent()`
+doubles L *and* stretches T for ≈ 160 KB in float, buying its extra margin
+mostly at the top of the band (108 dB vs 105 dB at 19.5 kHz measured end to
+end). Why not a fancier blend — cubic across four rows? It would double
+the coefficient traffic and the blend arithmetic in the innermost loop the
+library owns, to fix the *highest-frequency* residual only; L = 256 already
+puts that residual below the 105 dB the rest of the chain sustains. The
+linear blend is the cheapest operation that keeps the table small and the
+error second-order; everything faster is worse, everything better is not
+needed at this budget.
+
+## The extra row: L + 1 rows for an L-phase filter
+
+Here is the file's cleverest line, and it is a line of *allocation*, not of
+algorithm:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:bank_layout}}
+```
+
+The problem it dissolves: blending needs rows `p` and `p + 1`. For
+p = 0 … L−2 both exist. At **p = L−1** the blend wants "row L" — the
+branch for a delay of exactly one whole sample. Modular thinking says row L
+"is" row 0, and arithmetically it is — *for a different window*. Branch 0
+is a delay of zero against the current window; the position μ → 1 is a
+delay of one, which equals a delay of zero against the window advanced by
+one input sample. Using row 0 against the *current* window would be wrong
+by exactly one sample — not subtly wrong: it would blend the correct filter
+with a copy of the signal shifted a full sample, an error at signal level.
+
+The conventional fixes are all branches. Detect p = L−1 and handle the
+wrap specially — a data-dependent branch in the per-sample path, taken at
+the beat frequency between the two crystals (at 200 ppm, about ten times a
+second), which is also precisely the moment the resampler executes a
+whole-sample slip, the most delicate step it performs. Or clamp μ short of
+1.0 and accept a periodic discontinuity — a spur at the beat frequency,
+in a library chasing 120 dB.
+
+The bank's fix: **store row L explicitly, as branch 0 advanced by one input
+sample**. It falls out of the construction loop with no special case:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:bank_build}}
+```
+
+Follow the index math for `p == phases_`: the prototype index is
+`m = t·L + L = (t+1)·L` — branch 0's tap `t + 1`. So row L holds branch 0's
+coefficients shifted one *tap*, i.e. one input sample; the final tap
+(`m = T·L`) falls off the prototype's end and the `(m < n)` guard writes a
+zero. Row L computed against the current window is *identically* branch 0
+computed against the next window. The consequences, in the order they matter:
+
+- **Branch-free interpolation.** `interpolate()` may always read
+ `phase(p)` and `phase(p + 1)` for any p ≤ L−1. No modulo, no compare, no
+ special case — the hot loop's structure is independent of μ.
+- **Exact continuity at the μ-wrap.** As μ → 1 the blend converges to pure
+ row L; the whole-sample slip then advances the window and resets μ to 0,
+ where pure row 0 takes over — and those two evaluations are the same
+ arithmetic on the same samples. The seam has *zero* width: not "small
+ error," but bit-level agreement of the limits from both sides, up to the
+ one blend step of the approach.
+
+Neither property is left to prose. `Polyphase.ExtraRowEqualsPhaseZeroAdvancedOneTap`
+asserts the layout claim coefficient by coefficient — `phase(L)[0] == 0`
+and `phase(L)[u] == phase(0)[u−1]` with `EXPECT_EQ`, exact equality, no
+tolerance, because the construction loop is supposed to make them the
+*same numbers*, not similar ones. `Polyphase.MuWrapIsContinuousWithWindowShift`
+then asserts the consequence at the semantic level: `interpolate(hist,
+μ → 1)` equals `interpolate(hist + 1, μ = 0)` on random data — the
+whole-sample-slip invariant the resampler (two chapters from now) leans on
+every time the crystals drift one full sample apart. The cost of all this:
+48 extra coefficients — 192 bytes in float — and one `+ 1` in a `resize()`.
+It is the best byte-per-correctness trade in the library.
+
+## Rows stored backwards
+
+The second line of that layout comment: rows are **tap-reversed**.
+Convolution is inherently a reversal — output = Σ h[k] · x[now − k] — so
+either the coefficient array or the history walk must run backwards.
+The resampler keeps each channel's history as an *oldest-first* window
+(natural for its append-and-compact delay line, and the friendly direction
+for hardware prefetchers). Storing each row reversed at construction —
+`table_[p·T + (T−1−t)]` — lets the kernel be the loop every SIMD unit
+wants:
+
+```text
+for t in 0…T−1: acc += hist[t] · row[t]
+```
+
+both arrays walked forward, contiguously, from element zero. The
+reversal is paid once per converter at build time instead of once per
+sample as backwards addressing, and the payoff is documented downstream in
+this book's optimization chapters: the auto-vectorized Q15 kernels, the
+SMLALD pair-loads on Cortex-M33 (which require adjacent taps to sit in
+ascending order in one 32-bit load), and the `SRT_RESTRICT` blend loop all
+assume exactly this orientation. One subtlety the test above already
+banked: "advanced one tap" for the reversed row L means shifted one slot
+*toward the newer end*, which is why the zero lands in slot 0 (the oldest)
+— the kind of double-negation a comment can state but only an `EXPECT_EQ`
+can enforce.
+
+## Quantization happens here, once
+
+The table's element type is not `double` — it is
+`SampleTraits<S>::Coeff`, and the constructor's `makeCoeff(v)` is the
+single point where the design-precision prototype becomes datapath
+coefficients. Quantizing once at build time, rather than converting on the
+fly, means the hot path reads exactly what it dots and the quantization
+error is a fixed property of the constructed object, measurable by the
+tests rather than dependent on the code path taken.
+
+What each sample type stores (the full traits treatment is the next
+chapter; here is what the *bank* needs you to know):
+
+- **float** stores float coefficients: quantization at roughly −150 dB
+ against the double prototype — comfortably irrelevant under a 120 dB
+ target, which is why the float path's quality tests read the same as the
+ design spec.
+- **Q15 and Q31 store Q1.14 and Q1.30**, not Q0.15/Q0.31 — one bit of
+ headroom spent because of a fact the *previous* chapter created: the
+ prototype is normalized so each branch has DC gain 1, which puts the
+ peak (center) tap at ≈ 1.0, and 1.0 does not fit a pure fractional
+ format whose ceiling is 1 − 2⁻¹⁵. Rather than rescale the filter (and
+ move the problem into output gain), each fixed-point format trades its
+ top precision bit for range. `makeCoeff` rounds half-away-from-zero and
+ saturates, so a tap that design rounding nudges to 1.0000…1 simply
+ lands in the headroom, and one that ever exceeded the format's range
+ would pin at the format's max instead of wrapping negative — a
+ wraparound there would be a −∞ dB event, not a noise-floor one.
+
+The bank is thus one template with three concrete personalities, and the
+table *is* the personality: same layout, same extra row, same reversal,
+different arithmetic downstream.
+
+## Validation in two layers, and the all-NaN table
+
+The constructor rejects what it can see is nonsense: a non-positive sample
+rate, fewer than 4 taps, fewer than 2 phases, inverted or out-of-range band
+edges — throwing `std::invalid_argument` at setup time, where exceptions
+are allowed and cheap. This is necessary and insufficient, and the gap
+between those two words is an audit story worth retelling precisely.
+
+Every check in the constructor is a comparison. Feed the converter a
+`Config` whose `sampleRateHz` is NaN — one uninitialized field in caller
+code — and every comparison is *false*: `sampleRateHz <= 0.0`? False.
+`stopbandHz > sampleRateHz`? False. The constructor sails through,
+`cutoffNorm` goes NaN, `designPrototype` dutifully computes 12,288 NaN
+coefficients (recall the previous chapter: the Bessel iteration cap exists
+so even *this* terminates), and the object constructs successfully. The
+converter then runs, produces NaN audio forever, and never throws, never
+asserts, never glitches in a way a log would catch. The adversarial audit
+of the library built exactly this object (finding F2); the fix is the
+converter-level `validated()` gate, which enforces what the bank's local
+comparisons cannot express:
+
+- **finiteness of every double in the config** — the only guard NaN cannot
+ slip, because it is `std::isfinite`, not an ordering;
+- **the band-edge sum rule**: `passbandHz + stopbandHz ≤ sampleRateHz`.
+ The bank alone accepts `stopbandHz` up to the sample rate, but the
+ cutoff is *centered* at `(pass + stop)/fs` — let the sum exceed fs and
+ the anti-image cutoff lands above the input Nyquist, a filter that
+ passes the very images it exists to kill, while every local check still
+ passes;
+- plus the servo's eps-overflow clamp and 32-bit size-product overflow,
+ which belong to later chapters.
+
+All of it is pinned by `ConfigValidation.RejectsSilentMisbehavior` — each
+formerly-constructible pathology now `EXPECT_THROW`s — and, just as
+deliberately, by two `EXPECT_NO_THROW`s: the rate-scaling factory
+`Config::forSampleRate` produces specs sitting *exactly on* the sum-rule
+boundary (passband + stopband == fs up to rounding), and a validation rule
+that rejected its own library's presets would be a different bug. The
+division of labor is a pattern to copy: the class rejects what it can
+express *locally*; the composition layer owns the invariants that only
+exist between components; and every rejected configuration is one a real
+caller could plausibly write.
+
+## C++ notes: immutability, `bit_ceil`, and the accessors
+
+**Immutable after construction — as architecture, not style.** The class
+has no mutating member functions; every accessor is `const noexcept`. This
+buys three unrelated things at once. *Thread safety by subtraction*: the
+bank is built on the setup thread and read from the real-time consumer
+thread; with no writes after publication there is nothing to synchronize —
+the ring buffer chapter's acquire/release agonies simply do not apply to
+this object. *RT discipline*: the only allocation is in the constructor,
+which the header explicitly assigns to setup time; the audio path holds a
+`const` pointer and cannot even express a reallocation. *Exception
+containment*: everything that can throw (`bad_alloc`,
+`invalid_argument`) throws before the object exists, so a constructed bank
+is unconditionally valid — there is no half-designed state for the hot
+path to trip over.
+
+**`std::bit_ceil` for L.** The constructor rounds `numPhases` up to a
+power of two rather than validating it, and the reason lives in the
+resampler's fast path: the Q0.64 phase accumulator selects the row by
+taking the top log₂ L bits of a 64-bit fraction — one shift — and the
+intra-row blend factor from the bits below — one more shift. That indexing
+scheme *requires* a power-of-two L; `bit_ceil` (C++20, `<bit>`, exact and
+self-describing where the old `1 << ceil(log2(n))` dance was neither)
+guarantees it while giving any spec at least the resolution it asked for.
+Rounding up rather than throwing is deliberate policy: more phases is
+strictly better along the quality axis, so a spec of 200 phases quietly
+becomes 256 rather than a setup error. The same power-of-two guarantee is
+what lets `blendRowPhase` recover log₂ L with `std::countr_zero` instead
+of storing it.
+
+**The accessor surface is four functions, and their shapes are load-bearing:**
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:bank_accessors}}
+```
+
+`phase(p)` returns a raw `const Coeff*`, not a `std::span` — the kernels
+consume rows through `SRT_RESTRICT`-qualified pointer parameters (that
+no-alias promise is worth measured percentage points; see the
+vectorization-audit chapter), and a span would be unpacked back to a
+pointer at every call site while implying a bounds story the hot path
+cannot afford to check. The domain quietly includes `p == numPhases()` —
+the extra row is a first-class citizen of the API, which is exactly how
+`interpolate()` gets to be branch-free:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:bank_interpolate}}
+```
+
+Note the one guard that *does* exist — clamping `p` when μ·L rounds up to
+exactly L — protects against a floating-point edge of the *caller's* μ,
+not of the table; and `groupDelaySamples()` reports `(L·T − 1)/(2L)`, the
+true center of the linear-phase prototype in input samples, which is
+"T/2" only to the resolution of the 1/(2L) half-step that the kernel
+accuracy tests must account for when they compute the expected analytic
+delay. The bank knows its own delay exactly; approximations are for prose.
+
+## Why this table looks the way it does
+
+| Decision | Alternative rejected | Reason |
+|---|---|---|
+| Contiguous T-tap rows per branch | dot the strided prototype directly | the kernel reads rows millions of times; stride-L access wastes the cache the table was sized to fit |
+| Linear blend between adjacent rows | nearest row; cubic blend | nearest needs astronomically large L (first-order error); cubic doubles hot-loop work to fix a residual already below the chain's floor |
+| L = 256 default | 128 / 512 | −12 dB residual per doubling vs table size; 48 KB meets the 105 dB @ 19.5 kHz budget; presets bracket it both ways |
+| **Extra row L** | wrap to row 0 + branch; clamp μ | branch-free hot loop; μ-wrap/whole-sample slip exactly continuous; costs 192 bytes |
+| Tap-reversed rows | reversed iteration per sample | reversal paid once at build; forward contiguous dot is what vectorizers and SMLALD pair-loads require |
+| Quantize via `makeCoeff` at build | convert coefficients on the fly | error becomes a fixed, testable property of the object; hot path reads storage type directly |
+| Q1.14 / Q1.30 coefficients | Q0.15 / Q0.31 | peak tap ≈ 1.0 by DC normalization; headroom bit beats wraparound at the table's largest value |
+| Throw in constructor + converter `validated()` | validate in one place | the class can only check local comparisons; NaN defeats comparisons — finiteness and the band-edge *sum* rule are composition-level invariants (audit F2) |
+| Immutable after construction | resettable/redesignable bank | cross-thread reads need no sync; allocation and throws confined to setup; no invalid intermediate states |
+| `std::bit_ceil(numPhases)` | reject non-power-of-two | phase-bit row indexing requires 2ᵏ; rounding up is strictly quality-positive |
+| Raw `const Coeff*` accessor | `std::span` row | kernels take restrict pointers; span adds implied checking the per-sample path cannot spend |
+
+## Verify it yourself
+
+```sh
+# Build, then run this chapter's direct evidence: DC gain across mu, the
+# extra-row layout equality, the mu-wrap continuity invariant, and the
+# fractional-delay error staircase for balanced and transparent:
+cmake -B build && cmake --build build -j
+ctest --test-dir build -R Polyphase --output-on-failure
+
+# The audit's rejected-config suite (NaN rate, image-passing band edges),
+# including the boundary cases that must keep constructing:
+ctest --test-dir build -R ConfigValidation --output-on-failure
+
+# The end-to-end SNR numbers the L=256 decision is quoted against
+# (997 Hz / 6 k / 12 k / 19.5 k, both presets, servo in the loop):
+ctest --test-dir build -R AsrcQuality --output-on-failure
+
+# Break it on purpose: in the constructor, change `p <= phases_` to
+# `p < phases_` and resize to phases_ * taps_ (no extra row), then make
+# interpolate wrap p+1 to 0. DcGain still passes — DC can't see a
+# one-sample shift — but MuWrapIsContinuousWithWindowShift fails loudly,
+# which is exactly the gap between "looks fine on steady signals" and
+# "correct at the slip."
+```
+
+The sabotage run is the section on the extra row, compressed: the wrap bug
+is invisible to the easiest test and to DC reasoning, and the suite was
+built by someone who knew that.
diff --git a/book/src/part1/sample-traits.md b/book/src/part1/sample-traits.md
new file mode 100644
index 0000000..31f5902
--- /dev/null
+++ b/book/src/part1/sample-traits.md
@@ -0,0 +1,431 @@
+# Sample types as a customization point: `sample_traits.hpp`
+
+> Form is exactly emptiness; emptiness is exactly form.
+>
+> — the Heart Sutra
+
+The polyphase machinery of the last two chapters computes one thing: a dot
+product between a window of input samples and an interpolated row of filter
+coefficients. The problem is that this library ships to machines that do not
+agree on what a number is. A Xeon host wants `float` samples and will happily
+accumulate in `double`. A Hexagon DSP has no double-precision FPU at all —
+every `double` operation is a soft-float library call. A Cortex-M33 has no
+vector unit and wants 16-bit samples it can crunch two at a time. The same
+algorithm must therefore run in three different arithmetic systems, produce
+measured quality in each, and pay nothing for the flexibility.
+
+Here is what "nothing" has to mean, concretely. The inner loop of
+`interpolate()` runs one multiply-accumulate and one coefficient blend per
+tap, per channel, per output sample. At 48 kHz stereo with the default
+balanced preset (48 taps), that is about 4.6 million multiply-accumulates
+per second — and every one of them goes through the customization point this
+chapter describes. Any mechanism that adds even one indirect call to that
+path has already lost.
+
+This chapter tells two interleaved stories. The C++ story is how a traits
+struct and a concept make the sample type a *compile-time* customization
+point — and why the obvious alternatives (virtual dispatch, CRTP) were
+rejected. The arithmetic story is fixed-point numerics from scratch: what
+Q-formats are, where the headroom bits went, why the accumulators are
+exactly as wide as they are, and two places where the file's own comments
+record hard-won corrections. The two stories are one file:
+
+```cpp
+{{#include ../../../include/srt/sample_traits.hpp:st_overview}}
+```
+
+Three sample types, and a division of labor worth pausing on: the clock
+servo and the filter *design* always run in `double`, because they execute a
+handful of operations per block or once at construction. Only the datapath —
+the code that touches every sample — is templated. Optimizing anything else
+would be effort spent where the profile isn't.
+
+## The mechanism: a struct full of static functions
+
+The customization point is a class template with no primary definition:
+
+```cpp
+{{#include ../../../include/srt/sample_traits.hpp:st_primary}}
+```
+
+Leaving the primary template *undefined* is deliberate. A defined primary
+template would need default behavior, and there is no honest default for
+"how do I multiply-accumulate your type" — any guess would compile for
+unsupported types and be silently wrong. Undefined, the template turns an
+unsupported type into a compile error at the first use. (A more *readable*
+error is the concept's job, below.)
+
+Each supported type then gets a full specialization. The float one is the
+simplest and shows the complete vocabulary — three associated types and
+seven operations:
+
+```cpp
+{{#include ../../../include/srt/sample_traits.hpp:st_float}}
+```
+
+Every operation the datapath performs on samples is named here: convert a
+designed coefficient to storage form (`makeCoeff`), convert the fractional
+position to the blend representation (`makeBlendFactor`,
+`blendFactorFromQ64`), the multiply-accumulate (`mac`), the adjacent-phase
+coefficient blend (`blend`), the accumulator-to-sample conversion
+(`finalize`), and silence. The polyphase chapter's `interpolate()` is written
+entirely in this vocabulary:
+
+```cpp
+acc = Tr::mac(acc, hist[t], Tr::blend(c0[t], c1[t], fr));
+```
+
+and consequently never mentions `int16_t`, `double`, or a shift instruction.
+One algorithm, one body of tests, three number systems.
+
+### Why not virtual dispatch
+
+The classical OO answer — an abstract `SampleOps` interface with `mac()` and
+`blend()` as virtual functions — fails on the arithmetic of the hot loop.
+A virtual call is an indirect call through a vtable: the compiler cannot
+inline it, and what it cannot inline it cannot optimize *across*. The Q15
+`mac` below compiles to roughly two instructions when inlined; as a virtual
+call it would be a call, a return, an argument setup, and — far worse — an
+opaque boundary in the middle of the loop. Everything Part III wins depends
+on the compiler seeing through these functions: the Q15 dot product
+auto-vectorizes on hosts and gets Helium code on the M55 (the C2 audit
+verified both), and the C4 SMLALD kernel exists because the products were
+visible as exact 16×16 multiplies. Four and a half million vtable
+indirections per second, each one an optimization fence, was never a
+candidate.
+
+Virtual dispatch also answers a question nobody asked. Dynamic dispatch
+buys the ability to choose the implementation *at run time* — but a
+converter's sample type is fixed at the moment you write
+`AsyncSampleRateConverterQ15`. Paying the vtable price for flexibility that
+is never exercised is the definition of the wrong tool.
+
+### Why not CRTP
+
+The curiously recurring template pattern is the usual zero-cost answer to
+virtual dispatch, and it was rejected for a simpler reason: CRTP customizes
+through *inheritance* — `class MySample : SampleBase<MySample>` — and the
+sample types here are `float`, `std::int16_t`, and `std::int32_t`. You
+cannot derive from a built-in type, and you should not have to wrap one in
+a class (with all the conversion friction that implies) just to teach a
+library how to multiply it. A traits struct attaches behavior to a type
+*from the outside*, without requiring the type's cooperation. This is the
+same reason the standard library uses `std::char_traits` rather than
+requiring your character type to inherit from something: the type being
+customized is not yours to modify.
+
+The cost of the traits approach is one level of naming indirection
+(`SampleTraits<S>::mac` instead of `x.mac`), which a `using Tr =` alias
+reduces to nothing. The benefit is that the whole mechanism evaporates at
+compile time: every call in this file is a `static` member function,
+resolved by the template machinery, inlined by any compiler at any
+optimization level worth shipping.
+
+## Q-formats, from zero
+
+Now the arithmetic story. Fixed-point notation **Qm.n** describes an
+integer reinterpreted as a fraction: *n* bits after the binary point, *m*
+bits (beyond the sign) before it. The stored integer *k* represents the
+value *k* / 2ⁿ. So:
+
+- **Q0.15** ("Q15"): an `int16_t` representing *k* / 2¹⁵. Range −1.0 to
+ +0.99997. This is what 16-bit audio *is* — the industry just rarely says
+ so out loud.
+- **Q0.31** ("Q31"): the same idea in an `int32_t`, range −1.0 to
+ +(1 − 2⁻³¹).
+- **Q1.14**: an `int16_t` representing *k* / 2¹⁴ — one bit of *headroom*
+ above ±1.0, range −2.0 to +1.99994, at the cost of one bit of precision.
+
+Addition in a Q-format is ordinary integer addition. Multiplication adds
+the fractional bit counts: Q0.15 × Q1.14 gives a product with 29 fractional
+bits (Q29). Nothing is approximate yet — an integer multiply of two 16-bit
+values is *exact* in 32 bits. Fixed-point arithmetic done carefully is not
+"lossy integer math"; it is exact arithmetic with explicitly scheduled
+rounding. The whole craft is deciding where the one rounding happens and
+proving nothing overflows before it.
+
+## The headroom bit: why coefficients are Q1.14, not Q0.15
+
+The obvious choice for 16-bit coefficients is Q0.15, same as the samples.
+It does not work, and the reason is a property of the filter itself: each
+polyphase row has unity DC gain, and the prototype's *peak tap* reaches
+approximately 1.0. Q0.15's most positive value is 0.99997 — the peak tap
+does not fit. Saturating it would dent the filter's frequency response
+precisely at the row where the response matters most.
+
+So the coefficients trade one precision bit for one headroom bit:
+
+```cpp
+{{#include ../../../include/srt/sample_traits.hpp:st_q15_coeff}}
+```
+
+with the conversion doing round-half-away-from-zero and saturating at the
+integer limits (the *design* is checked separately; saturation here is a
+belt against future filter specs, not an expected event):
+
+```cpp
+{{#include ../../../include/srt/sample_traits.hpp:st_roundsat}}
+```
+
+What did the traded bit cost? Quantizing coefficients to Q14 puts the
+filter's stopband floor at roughly −86 dB — and the header's comment makes
+the argument that matters: the Q15 *output* format's own noise floor is of
+the same order. A 16-bit datapath cannot deliver more than the 16-bit
+format can carry, so spending coefficient precision beyond the format's
+floor would purchase nothing measurable. The end-to-end test agrees: the
+Q15 converter measures **~77 dB SNR** on a half-scale 997 Hz sine across a
++200 ppm clock crossing (`tests/test_fixed_point.cpp` prints it; the CI
+threshold sits at 73 dB), and that number is the *format's* floor, not the
+converter's. The same trade at 32 bits gives Q1.30 coefficients
+(`makeCoeff` scales by 2³⁰), where the quantization floor is so far down
+that the Q31 path measures **133 dB** — statistically the float datapath's
+own 135 dB.
+
+The two unit tests pinning the scale factors are almost insultingly simple,
+and that is their virtue: `Q15::makeCoeff(1.0) == 16384` is the sentence
+"the peak tap fits" written as an assertion.
+
+## The accumulation story: exact until the last line
+
+Here is the Q15 multiply-accumulate:
+
+```cpp
+{{#include ../../../include/srt/sample_traits.hpp:st_q15_mac}}
+```
+
+Two things are chosen here. The product is computed in `int32_t` — a
+16×16→32 multiply, which every target does in one instruction — and it is
+**exact**: the worst-case product is −32768 × −32768 = 2³⁰, comfortably
+inside `int32_t`'s ±2³¹ range. But note how *thin* that comfort is: a
+single worst-case product uses all but one bit of an `int32_t`. Summing
+even two of them could wrap. An `int32_t` accumulator is therefore not
+"risky"; it is simply wrong.
+
+The accumulator is `int64_t`, and now do the arithmetic the comment
+gestures at. The shipping filters run 32 to 80 taps per phase (fast,
+balanced, transparent presets). Summing N values adds at most log₂N bits
+to the worst-case magnitude: 48 taps add ~5.6 bits, 80 taps add ~6.3 — call
+it six to seven bits. Worst case for the transparent preset:
+80 × 2³⁰ < 2³⁷, against an accumulator that holds ±2⁶³. Twenty-six bits of
+spare headroom. That surplus is the point: the sum is exact — not
+approximately safe, *exact*, every intermediate value representable — no
+matter what the samples and coefficients do. There is no intermediate
+rounding anywhere in the loop, which also means the accumulation is
+associative, which is why the C4 chapter's dual-MAC kernel and the C1
+blended-row rewrite could both be verified *bit-exact* rather than
+"close enough."
+
+All of the rounding budget is spent in one place:
+
+```cpp
+{{#include ../../../include/srt/sample_traits.hpp:st_q15_finalize}}
+```
+
+The accumulator holds a Q29 value (Q0.15 sample × Q1.14 coefficient); the
+output wants Q15; so shift right by 14 after adding half an output LSB
+(1 << 13). That is round-half-up. A numericist will object that
+round-half-up carries a bias and round-half-even does not — and the comment
+answers the objection with scale: the bias exists only on exact half values
+and is a fraction of one sub-LSB rounding step, orders of magnitude below the Q15 noise
+floor that the 77 dB measurement already includes. Half-even costs extra
+operations per output sample to fix an error you cannot measure. The
+`clampSat` around it is the saturation that makes hot signals *clip*
+instead of wrap — and wrapping is the catastrophic failure mode:
+
+```cpp
+EXPECT_EQ(Q15::finalize(std::int64_t{1} << 40), 32767);
+```
+
+plus an end-to-end test (`FullScaleSineDoesNotWrapQ15`) that drives a
+99%-of-full-scale sine through a +500 ppm crossing and asserts the output's
+second difference never exceeds the analytic bound for a clean sine — a
+wraparound anywhere inside would blow that bound by orders of magnitude.
+
+## Q31 and the pre-shift: when even int64 isn't enough
+
+The 32-bit path cannot copy the 16-bit strategy, and the reason is worth
+computing rather than asserting. A full-precision Q0.31 × Q1.30 product
+carries 61 fractional bits and a worst-case magnitude near 2⁶¹ (full-scale
+sample, peak ~1.0 coefficient). An `int64_t` holds ±2⁶³ — room for barely
+four such products. The shortest shipping filter sums 32 of them; the
+transparent preset sums 80. At 48 taps the worst-case sum is
+48 × 2⁶¹ ≈ 2⁶⁶·⁶, over the accumulator's limit by a factor of about twelve.
+Full-precision products simply do not fit, and there is no 128-bit
+accumulator worth having on the targets this path exists for.
+
+So each product gives up 16 bits *before* joining the sum:
+
+```cpp
+{{#include ../../../include/srt/sample_traits.hpp:st_q31_mac}}
+```
+
+Now redo the bound: Q45 products have worst-case magnitude 2⁴⁵, and
+80 × 2⁴⁵ < 2⁵² — eleven bits of headroom restored. What did the discarded
+bits cost? Each truncation throws away less than one Q45 LSB, and the
+final conversion (`finalize` shifts a further 14 bits, Q45 → Q31) puts a
+Q45 LSB **14 bits below the output's own LSB**. Even if all 80 taps'
+truncation errors conspired in the same direction, the accumulated error is
+under 80 × 2⁻⁴⁵ ≈ 2⁻³⁸·⁷ — less than 1/200 of one Q31 output LSB. The
+measurement closes the argument: the Q31 converter's 133 dB / 105 dB
+(997 Hz / 19.5 kHz) match the float datapath's numbers, whose residual is
+set by the phase-table interpolation, not by anyone's arithmetic. The
+discarded bits are provably and measurably inaudible — this is the
+fixed-point craft in one line of code: *decide* where precision dies,
+prove the grave is deep enough, then measure anyway.
+
+The full specialization, for reference — note the doc comment carries the
+same overflow argument, so the file survives without the book:
+
+```cpp
+{{#include ../../../include/srt/sample_traits.hpp:st_q31}}
+```
+
+## The blend, and the comment that was wrong by three orders of magnitude
+
+`blend` linearly interpolates between the same tap of two adjacent phase
+rows (the polyphase chapter explains why; the residual falls ~12 dB per
+doubling of the phase count). In Q15 it looks like this:
+
+```cpp
+{{#include ../../../include/srt/sample_traits.hpp:st_q15_blend}}
+```
+
+That comment has a history, and the history is this book's whole
+methodology in miniature. The blend multiplies a Q15 fraction
+(`fr` ≤ 32767) by a coefficient difference (`diff` = b − a, two `int16_t`
+values, so |diff| ≤ 65535). The original version of this comment justified
+the `int64_t` by claiming the `int32_t` product would fit "but only with
+~5% margin." An audit later recomputed it: the worst-case product is
+32767 × 65535 = 2,147,385,345, and `INT32_MAX` is 2,147,483,647. The
+margin is 98,302 out of 2.1 billion — **0.005%**, not 5%. Three orders of
+magnitude, in a comment whose entire job was to quantify safety.
+
+Nothing was wrong with the *code* — it used `int64_t` and still does. But
+consider what the wrong comment was waiting to do: some future optimizer,
+squeezing the M33 (where the C4 campaign found this very blend dominates
+the Q15 frame cost — each `fr * diff` is a `smull`), reads "~5% margin,"
+concludes the `int32_t` version is comfortably safe, and ships a datapath
+that is one adjacent-phase anomaly away from integer overflow. The audit
+also measured the *actual* worst |diff| on the transparent table: 41 —
+real coefficients come nowhere near the bound. The corrected comment keeps
+both numbers and the conclusion: a margin of 0.005% against a theoretical
+bound is not an invariant to lean on silently, whatever today's table
+does. The lesson generalizes: **a safety-margin comment is arithmetic, and
+arithmetic in comments rots exactly as fast as arithmetic in code — the
+difference is that no test ever fails on it.** Verify the numbers you
+write in prose. This book's build system exists because of that sentence.
+
+(The Q31 blend uses a Q20 fraction rather than Q15 — since the product runs
+in `int64_t` anyway, the five extra fraction bits are free.)
+
+## `blendFactorFromQ64`: feeding the integer phase
+
+One trait remains, and it earns its keep on exactly one class of hardware.
+The C3 optimization (Part III) replaced the resampler's `double` phase
+accumulator with a Q0.64 integer — after which the *only* floating-point
+left on the fixed-point per-sample path was the conversion of the phase
+fraction into a blend factor. `blendFactorFromQ64` closes that hole. The
+Q15 version is a single shift — the top 15 bits of the fraction *are* the
+Q15 blend factor:
+
+```cpp
+{{#include ../../../include/srt/sample_traits.hpp:st_q15_q64}}
+```
+
+The float version is subtler:
+
+```cpp
+{{#include ../../../include/srt/sample_traits.hpp:st_blend_q64_float}}
+```
+
+Why reduce to 24 bits first? Because a `float` significand holds exactly
+24 bits: any integer up to 2²⁴ converts to `float` *exactly*, and the
+subsequent multiply by 2⁻²⁴ (a power of two) is also exact. Convert the
+full 64-bit fraction instead and the compiler must round — correctly, but
+via a path that on a double-less target may detour through software
+arithmetic. This two-instruction dance keeps the conversion
+single-precision, exact, and branchless. The target it matters on is
+Hexagon, the one genuinely FP64-less machine in the fleet (the C3 write-up
+records the correction: the M55's *scalar* FPU turned out to support
+doubles after all — only its vector unit doesn't). C3's gating run showed
+what removing per-sample soft-double math is worth on Hexagon: −15.5%
+instructions on the Q31 pipeline, −10.3% on Q15. And because 2⁻⁶⁴ phase
+resolution beats the old double path's 2⁻⁵², quality *improved* while the
+code got faster: 135.0 dB at 997 Hz.
+
+## The concept: making the contract legible
+
+Everything above defines the customization point; the last twenty lines of
+the file *enforce* it:
+
+```cpp
+{{#include ../../../include/srt/sample_traits.hpp:st_concept}}
+```
+
+The datapath templates constrain themselves with it —
+`template <SampleType T> class BasicAsyncSampleRateConverter` — and the
+payoff is the shape of the failure. Instantiate the converter with
+`double` (no specialization exists) and, without the concept, the error
+would surface wherever the template machinery first touched the undefined
+traits — some line deep inside `interpolate()`, wearing five frames of
+instantiation context. With the concept, the compiler rejects
+`BasicAsyncSampleRateConverter<double>` *at the declaration you wrote*,
+and its diagnostic walks the `requires`-expression clause by clause: which
+operation is missing, what signature it expected. The concept turns "a
+missing operation somewhere" into a checklist. Write a partial
+`SampleTraits<MyType>` — say, everything but `blendFactorFromQ64` — and
+the error names exactly that member.
+
+Note the return-type constraints (`-> std::same_as<...>`) are doing real
+work: a `finalize` that returned `int` instead of `int16_t` would satisfy
+a naive "does it compile" check and then quietly change overload and
+conversion behavior downstream. The concept pins the whole signature.
+
+The three `static_assert`s at the bottom are the file testing itself: every
+translation unit that includes the header re-verifies that the three
+shipped specializations satisfy the concept they claim to. If a future
+edit breaks one — renames a member, fumbles a return type — the diagnostic
+arrives at header-parse time, before any user code, naming the assert.
+Cost: zero, everywhere except the compiler's own microseconds.
+
+## Why these ~220 lines look the way they do
+
+| Decision | Alternative rejected | Reason |
+|---|---|---|
+| Traits struct of `static` functions | virtual `SampleOps` interface | 4.6M `mac`/s in the hot loop; virtual calls block inlining and put every Part III optimization behind an opaque boundary |
+| External traits | CRTP / member functions | sample types are `int16_t`/`float` — built-ins can't inherit and aren't ours to modify |
+| Undefined primary template | primary with defaults | no honest default for foreign arithmetic; silence would be wrongness |
+| Q1.14 / Q1.30 coefficients | Q0.15 / Q0.31 | the ~1.0 peak tap must fit; one headroom bit costs a precision bit the output format couldn't carry anyway |
+| `int64_t` accumulator, no intermediate rounding | `int32_t` accumulator | one worst-case Q15 product nearly fills `int32_t`; exactness makes every kernel rewrite bit-verifiable |
+| Q31 products pre-shifted to Q45 | full 62-bit products | 48 taps of 2⁶¹ ≈ 2⁶⁶·⁶ overflows `int64_t` ~12×; truncation cost < 1/200 output LSB, measured invisible |
+| Round-half-up in `finalize` | round-half-even | the bias is sub-sub-LSB; half-even costs real per-sample work to fix an unmeasurable error |
+| `int64_t` blend product | `int32_t` (it *almost* fits) | 0.005% worst-case margin — recomputed by audit from a comment that claimed 5% |
+| `SampleType` concept + self-`static_assert`s | let instantiation errors happen | failures surface at the declaration, itemized per missing operation |
+
+## Verify it yourself
+
+```sh
+# The whole fixed-point suite: scale factors, saturation, DC gain,
+# measured SNRs (watch for the "[ measured ]" lines), full-scale non-wrap:
+ctest --test-dir build -R FixedPoint --output-on-failure
+
+# The measured numbers this chapter quoted:
+# [ measured ] 997 Hz, 16-bit fixed: SNR ~77 dB
+# [ measured ] 997 Hz, 32-bit fixed: SNR ~133 dB
+# [ measured ] 19500 Hz, 32-bit fixed: SNR ~105 dB
+
+# Recompute the blend margin the audit checked (don't trust this book either):
+python3 -c "print(32767*65535, 2**31-1, 1 - 32767*65535/(2**31-1))"
+
+# Break it on purpose, three ways:
+# 1. In makeCoeff (Q15), change 16384.0 to 32768.0 — the peak tap saturates
+# and DcGainIsUnityQ15 fails its ±4 tolerance.
+# 2. In finalize (Q15), delete clampSat and cast directly — the full-scale
+# sine test detects wraparound as a blown second difference.
+#   3. Instantiate srt::BasicAsyncSampleRateConverter<double> anywhere and
+# read the concept diagnostic: every missing operation, by name, at the
+# line you wrote.
+```
+
+The third experiment is the C++ half of this chapter in one error message;
+the first two are the arithmetic half in two failing assertions.
diff --git a/book/src/part1/spsc-ring.md b/book/src/part1/spsc-ring.md
new file mode 100644
index 0000000..15419f1
--- /dev/null
+++ b/book/src/part1/spsc-ring.md
@@ -0,0 +1,273 @@
+# The lock-free ring: `spsc_ring.hpp`
+
+> Time is what keeps everything from happening at once.
+>
+> — Ray Cummings
+
+Every other component in this library is mathematics. This one is physics.
+
+The converter's whole purpose is to sit between two threads that must never
+wait for each other: an audio capture callback pushing frames at its
+device's pace, and a playback callback pulling frames at a *different*
+device's pace. If either thread ever blocks — on a mutex, on an allocation,
+on a priority-inverted anything — the audio glitches, and a glitch is the
+one failure this library exists to prevent. So the channel between the
+threads must be **lock-free**, and not in the loose marketing sense: every
+operation must complete in a bounded number of steps regardless of what the
+other thread is doing, including being suspended indefinitely at the worst
+possible instruction.
+
+The ring also serves a second master, and this is the design's quiet
+novelty: its **occupancy is the control system's sensor**. The clock servo
+(next chapter) estimates the rate mismatch between the two crystals
+entirely from how full this buffer is. That is why the class exposes exact
+`readAvailable()` and a consumer-side `discard()` — operations a generic
+SPSC queue wouldn't bother with — and why "approximately full" isn't good
+enough anywhere in this file: a biased occupancy reading would become a
+biased frequency estimate.
+
+Here is the entire contract:
+
+```cpp
+{{#include ../../../include/srt/spsc_ring.hpp:contract}}
+```
+
+Forty lines of comment and assertion before any logic. Three things deserve
+attention already.
+
+**`is_trivially_copyable_v<T>`** — the ring moves data with `memcpy`, in at
+most two segments per transfer. This is a *bulk* ring: the producer hands
+over whole blocks of interleaved frames, not elements one at a time. A
+`memcpy`-based design rules out element types with constructors, and the
+`static_assert` makes that a compile error instead of undefined behavior.
+
+**`std::atomic<std::size_t>::is_always_lock_free`** — the class claims
+lock-freedom, so it asserts the precondition. On every target this project
+ships to, a `size_t` atomic compiles to plain loads and stores plus memory
+ordering. But "every target this project ships to" is exactly the kind of
+claim that rots silently; the assert costs nothing and converts rot into a
+compile error. (This line has its own small history: it was added by an
+audit that noticed the library asserted lock-freedom for its *telemetry*
+counters but not for the indices the entire hot path rests on.)
+
+**Indices are monotonic, not wrapped.** `head_` and `tail_` count every
+element ever written and read, forever; only at the moment of buffer access
+are they masked down to a position. This is the single most consequential
+decision in the file, and it earns its own section below — including what
+happens when "forever" meets a 32-bit `size_t`.
+
+## The memory model, from the only direction that matters
+
+There are two ways to teach C++ memory ordering. The textbook way starts
+from the six `memory_order` enumerators and their formal guarantees. The
+way that actually sticks starts from a bug.
+
+Suppose both threads used `memory_order_relaxed` everywhere. The producer
+writes 64 samples into the buffer, then advances `head_` by 64. The
+consumer reads the new `head_`, concludes 64 samples are available, and
+copies them out. On x86 this works every time you test it. On a Cortex-A
+or M-class core — or under ThreadSanitizer — the consumer can observe the
+*index* update **before** it observes the *sample data* the index claims to
+cover, because nothing told either the compiler or the CPU that those
+writes were related. The consumer then plays whatever stale bytes were in
+the buffer. The bug is silent, rare, load-dependent, and absolutely real.
+
+The fix is a single pairing, used twice, and it is the only synchronization
+in the file:
+
+> The producer **releases** `head_` after writing data; the consumer
+> **acquires** `head_` before reading data. Everything the producer did
+> before the release-store is visible to the consumer after the
+> acquire-load that observes it.
+
+Read the producer side with that lens:
+
+```cpp
+{{#include ../../../include/srt/spsc_ring.hpp:write}}
+```
+
+The two `memcpy` calls happen *before* the `release` store of the new head.
+That ordering — data first, then the index that publishes it — is the
+entire correctness argument for the data path. Symmetrically:
+
+```cpp
+{{#include ../../../include/srt/spsc_ring.hpp:read}}
+```
+
+The consumer `acquire`-loads `head_` (inside the cache-refresh branch,
+discussed next), and only then copies data the head covers. Its own
+`release` store of `tail_` plays the mirrored role for a subtler resource:
+**buffer reuse**. The producer may overwrite a slot only after the consumer
+has finished copying out of it; the consumer's release of `tail_` and the
+producer's acquire of it order exactly that. Miss this second pairing and
+you have a bug that no amount of staring at the "obvious" head-side pairing
+will reveal.
+
+Notice also what is *relaxed*: each side loads **its own** index with
+`memory_order_relaxed`. The producer is the only writer of `head_`, so it
+cannot race with itself; a thread always observes its own prior writes.
+Using `acquire` there would be harmless but dishonest — ordering
+annotations in this codebase are documentation, and claiming
+synchronization where none is needed misleads the next reader. This is a
+deliberate idiom: **the memory orderings are chosen to be exactly
+sufficient, so that each one tells you why it exists.**
+
+### What was rejected
+
+A sequentially-consistent version (`memory_order_seq_cst` everywhere, the
+default) would be correct. It was rejected for two reasons, in order of
+importance: first, on ARM it compiles to strictly stronger barriers than
+the algorithm needs, in the hottest loop the library owns; second — again
+the documentation argument — `seq_cst` says "I didn't think about this,"
+and in a file whose whole job is to be thought about, that is the wrong
+message. A mutex-based version was never on the table: it would forfeit the
+bounded-progress guarantee the audio contract requires, priority inversion
+being the canonical way real-time audio dies.
+
+## The cached-index trick
+
+Correctness needs one acquire/release pair per direction. Performance is
+about how *rarely* you can afford to do even that.
+
+Every atomic load of the other thread's index is a potential cache-line
+transfer between cores — the line bounces from the writer's L1 to the
+reader's, hundreds of cycles when it goes badly, and it goes badly
+precisely when both threads are busiest. The standard remedy (this design
+follows the well-known pattern used by production SPSC queues) is for each
+side to keep a **stale local copy** of the other side's index and consult
+the real atomic only when the stale copy makes the operation look
+impossible:
+
+- The producer computes free space against `tailCache_`. Only if that says
+ "not enough room" does it acquire-load the real `tail_` and retry the
+ computation. If space *still* falls short, the answer is truthful — the
+ buffer really is that full *right now* — and the write is clipped.
+- The consumer does the same dance with `headCache_` for availability.
+
+The asymmetry of staleness is safe by construction: a stale `tailCache_`
+can only *underestimate* free space (the consumer only ever frees), and a
+stale `headCache_` can only *underestimate* availability (the producer only
+ever adds). Stale data makes the ring conservative, never wrong. In the
+steady state the converter lives in — producer and consumer chasing each
+other around a buffer that is never near full or empty — the fast path
+touches **no foreign cache lines at all**: one relaxed load of your own
+index, arithmetic against a plain local member, two `memcpy`s, one release
+store.
+
+The member layout enforces the same philosophy at the hardware level:
+
+```cpp
+{{#include ../../../include/srt/spsc_ring.hpp:layout}}
+```
+
+Producer-owned state (`head_`, `tailCache_`), consumer-owned state
+(`tail_`, `headCache_`), and the shared read-only state (`buf_`, `mask_`)
+each get their own 64-byte cache line, so neither side's writes invalidate
+lines the other side reads in its fast path. The comment records a rejected
+alternative worth pausing on:
+`std::hardware_destructive_interference_size` is the standard's name for
+exactly this constant, and this file deliberately doesn't use it. The
+constant is **ABI-fragile** — its value can differ between translation
+units compiled with different tuning flags, which is why GCC warns when you
+use it in a header — and a header-only library lives entirely in that
+danger zone. A plain `64` with a comment is less clever and more correct.
+The general lesson recurs throughout this codebase: *between a standard
+facility and a constraint you can state plainly, prefer the one whose
+failure mode you can reason about.*
+
+## Monotonic indices and the wraparound proof
+
+Most ring buffers wrap their indices at the capacity and pay for it twice:
+one slot must be wasted to distinguish full from empty, and every index
+update needs a conditional wrap. This ring's indices instead run forever
+and are masked (`idx = head & mask_`) only at access time, which is why the
+capacity must be a power of two (`std::bit_ceil` in the constructor) — the
+mask replaces a modulo, and the full capacity is usable because occupancy
+is computed by subtraction, not by comparing wrapped positions.
+
+The objection arrives immediately: *forever* is finite. On Hexagon and
+Cortex-M, `size_t` is 32 bits; at 48 kHz stereo, the indices wrap every
+twelve hours or so of continuous audio. What happens then?
+
+Nothing — and the reason is worth proving rather than waving at, because
+the proof is two lines of modular arithmetic that many engineers have
+never consciously done. Unsigned arithmetic in C++ is arithmetic modulo
+2^N. Occupancy is computed as `head - tail`; if the true (unbounded) counts
+are H and T, the machine computes `((H mod 2^N) - (T mod 2^N)) mod 2^N`,
+which equals `(H - T) mod 2^N`. Since the algorithm guarantees
+`0 ≤ H - T ≤ capacity` and capacity is at most 2^31 on a 32-bit target, the
+true difference is always representable, so the modular result *is* the
+true result — through the wrap, across the wrap, at the wrap. The masked
+position is likewise exact: capacity divides 2^N (it's a power of two), so
+`(H mod 2^N) & mask = H mod capacity`. The wrap is not an edge case the
+code handles; it is a case the arithmetic never notices.
+
+This was verified the trustworthy way as well: the audit that reviewed this
+file ran the ring with indices initialized to `0xFFFFFFF8` and watched
+transfers stride across the 2^32 boundary, byte-exact. The proof says it
+must work; the test removes the possibility that the proof was about a
+slightly different program than the one we shipped.
+
+## What the tests can and cannot certify
+
+Three layers of evidence back this file, and their *limits* are as
+instructive as their coverage.
+
+**Single-threaded exactness** (`tests/test_spsc_ring.cpp`): fill/drain
+equality, wraparound data preservation, partial writes near full, discard
+accounting. These pin the sequential semantics — necessary, and nowhere
+near sufficient.
+
+**A two-thread stress test** (`tests/test_spsc_ring_threads.cpp`): millions
+of elements of a counting sequence pushed and popped with randomized chunk
+sizes, verified in order on the consumer side, run under ThreadSanitizer in
+CI. TSan observes the actual ordering annotations, so it would flag the
+relaxed-everywhere bug described above as a data race.
+
+**And the honest limitation**: a sanitizer can only judge the interleavings
+the hardware deigns to produce during the run, and an x86 host barely
+reorders anything. A memory-ordering bug can be invisible on x86 *and* pass
+TSan there, then fire on a weakly-ordered ARM core in production. This
+project's answer is a weekly CI job that runs the same TSan stress on
+genuinely weakly-ordered arm64 hardware, plus the per-push macOS Apple
+Silicon leg. That is also a limit worth naming: none of this *proves* the
+algorithm; it raises the price of being wrong. The proof remains the
+acquire/release argument above — which is exactly why this chapter spent
+its pages on the argument rather than the test list.
+
+## Why these ~130 lines look the way they do
+
+A summary of the decisions, several of which recur throughout the library:
+
+| Decision | Alternative rejected | Reason |
+|---|---|---|
+| Lock-free SPSC, two fixed roles | mutex; MPMC generality | bounded progress is the audio contract; generality costs exactly the cycles this file exists to save |
+| Bulk `memcpy` transfers | element-at-a-time queue | the workload is blocks of frames; two `memcpy` segments beat N atomic handoffs |
+| Exact occupancy + `discard()` | "approximate size is fine" | occupancy is the servo's sensor; bias here becomes frequency-estimate bias |
+| Acquire/release, minimal | `seq_cst` everywhere | sufficiency-as-documentation; weaker barriers on ARM |
+| Cached cross-indices | always load the atomic | steady-state fast path touches no foreign cache line |
+| Monotonic masked indices | wrap-at-capacity | full capacity usable, no full/empty ambiguity; wrap is provably benign |
+| `alignas(64)` literal | `hardware_destructive_interference_size` | the standard constant is ABI-fragile in headers; GCC warns for good reason |
+| `static_assert` the preconditions | trust the porting engineer | rot becomes a compile error, not a field failure |
+
+## Verify it yourself
+
+```sh
+# Sequential semantics, wraparound, discard accounting:
+ctest --test-dir build -R SpscRing --output-on-failure
+
+# The two-thread counting-sequence stress (built when threads exist):
+ctest --test-dir build -R TwoThreadStress --output-on-failure
+
+# The same stress under ThreadSanitizer (as CI runs it):
+cmake -B build-tsan -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+ -DCMAKE_CXX_FLAGS="-fsanitize=thread" -DSRT_BUILD_EXAMPLES=OFF
+cmake --build build-tsan -j && ctest --test-dir build-tsan -R SpscRing
+
+# Break it on purpose: change memory_order_release to relaxed in write(),
+# rebuild the TSan variant, and watch the stress test report the race.
+```
+
+The last suggestion is the chapter in one line. The annotations are not
+incantations; remove one and the tooling shows you precisely the disaster
+it was holding back.
diff --git a/book/src/part2/icount.md b/book/src/part2/icount.md
new file mode 100644
index 0000000..17fc574
--- /dev/null
+++ b/book/src/part2/icount.md
@@ -0,0 +1,322 @@
+# Counting instructions, deterministically
+
+> When you can measure what you are speaking about, and express it in numbers, you know something about it; but when you cannot measure it, when you cannot express it in numbers, your knowledge is of a meagre and unsatisfactory kind.
+>
+> — Lord Kelvin
+
+The optimization campaign of Part III makes claims like "−5.3% on the M55
+Q15 pipeline" and expects you to believe the decimal point. This chapter is
+about the machinery that makes such a decimal point *mean* something — and
+about why the obvious metric, time, had to be fired from that job first.
+
+## Wall-clock cannot hold a gate
+
+The project's benchmarks run in CI on shared, virtualized runners: machines
+whose actual delivered performance depends on what every other tenant is
+doing, what frequency the host decided on, and which physical box the job
+landed on today. `docs/PERFORMANCE.md` states the resulting policy without
+hedging: *wall-clock benches are never a hard gate on shared runners; they
+run as a smoke test and produce trend artifacts only.*
+
+That policy was not adopted from theory. During the C2 vectorization audit
+(Part III), the README's wall-clock table was deliberately *not*
+regenerated, because the shared machine was measurably in a different state
+than the annotated session that produced the table — about **20% slower
+across the board on unchanged code**. Sit with that number for a moment: the
+optimization being evaluated in that PR was worth 3.7% on the same metric.
+A gate that must detect 3% shifts through 20% ambient swings is not a gate;
+it is a random number generator with a pass rate. You can fight the noise
+statistically — pin runners, repeat runs, compare medians — and projects
+do, but every mitigation buys precision with CI minutes and still cannot
+promise that a 1% regression fails *deterministically*.
+
+The library's answer is to gate a different quantity entirely: **executed
+instructions**. Run a fixed workload under an emulator, count every guest
+instruction that retires, and the result is a property of the *binary*, not
+the weather — bit-identical across runs (the project verified this before
+trusting it), independent of host load, and, for the scalar code these
+embedded targets run, well correlated with real cost. The metrics table in
+`docs/PERFORMANCE.md` is careful about that last clause, and so is the end
+of this chapter; but first, the machinery.
+
+## Forty lines of plugin
+
+QEMU's TCG (Tiny Code Generator) translates guest instructions into host
+code one *translation block* at a time, and since QEMU 4.2 it exposes a
+plugin API that lets you hook that translation. The project's entire
+counting instrument is `tools/qemu_insn_plugin/insn_count.c` — small enough
+that its two working functions fit here:
+
+```c
+{{#include ../../../tools/qemu_insn_plugin/insn_count.c:pf_hooks}}
+```
+
+The design point that matters is `qemu_plugin_register_vcpu_insn_exec_inline`
+with `QEMU_PLUGIN_INLINE_ADD_U64`. There are two ways a TCG plugin can count
+executions: register a *callback* per instruction — a host function call
+every time the guest retires one instruction — or register an *inline
+operation*, which asks QEMU to plant a bare 64-bit add into the generated
+host code itself. The callback form would multiply the emulation time of a
+billion-instruction workload by a large constant; the inline form costs one
+host add per guest instruction and no calls. `tb_trans` fires once per
+translation block *translation* (not execution), walks the block's
+instructions, and attaches an inline `+1` to each — after which counting
+proceeds at essentially full emulation speed forever, because translated
+blocks are cached and re-executed.
+
+The header comment is candid about the accuracy contract this buys:
+"the single counter is exact for our single-vCPU deterministic workloads."
+A plain `uint64_t` incremented from generated code would be a data race on
+an SMP guest; every target this ratchet gates is a single emulated core
+running a single-threaded workload, so the simple counter is exact — and
+the precondition is written down where the next porter will read it.
+
+The second function is the entire output interface: an `atexit` callback
+prints one line, `SRT_INSN_COUNT <count>`, through `qemu_plugin_outs()`. That
+choice has a trap the driver script had to learn about:
+
+```python
+def qemu_cmd(target: str, plugin: str, binary: str) -> list[str]:
+ # "-d plugin" routes qemu_plugin_outs() to stderr; without it the count
+ # line is silently dropped.
+ if target == "hexagon":
+ return ["qemu-hexagon", "-d", "plugin", "-plugin", plugin, binary]
+```
+
+`qemu_plugin_outs` writes to QEMU's *log*, and unless `-d plugin` enables
+the plugin log channel, the write goes nowhere — no error, no warning, no
+line. The comment in `scripts/icount.py` preserves the discovery so nobody
+re-makes it, and the script's parser treats a missing count line as a hard
+failure ("plugin not loaded?") rather than a zero, so the silent-drop
+failure mode cannot masquerade as a measurement.
+
+## One binary per scenario
+
+What gets counted matters as much as how. The counted workloads live in
+`bench/icount/` — and they are *not* the Google Benchmark suite, which
+auto-tunes its iteration counts to the machine's speed and would therefore
+execute a different number of instructions on every run. A countable
+workload must be **fixed**: same work, same iteration counts, same
+everything, decided at compile time.
+
+`bench/icount/icount_main.cpp` defines seven scenarios — `interpolate()` in
+isolation and the full push/pull pipeline, each in float/Q15/Q31, plus a
+12-channel Q15 pipeline for the 7.1.4 deployment shape — selected by
+preprocessor definitions (`SRT_SC_KIND`, `SRT_SC_TYPE`, `SRT_SC_CH`) into
+one binary each, because the bare-metal targets have no argv to select with
+at runtime. Each binary runs a deterministic loop (two virtual seconds of
+audio through the pipeline; 200 000 interpolations for the kernels),
+accumulates a checksum, and ends with:
+
+```cpp
+ const bool ok = checksum == checksum; // NaN check
+ std::printf("SRT_ICOUNT_DONE ok=%d checksum=%.17g\n", ok ? 1 : 0, checksum);
+```
+
+The three gated targets each run under the QEMU mode that matches their
+deployment reality. Hexagon binaries are Linux user-space processes, so
+`qemu-hexagon` (user-mode emulation) runs them directly. The two Cortex-M
+targets are bare metal: `qemu-system-arm` boots each binary as a kernel on
+a full board model — MPS3 AN547 for the M55, MPS2 AN505 for the M33 — with
+semihosting for the printf. That fidelity matters for the metric: a
+system-mode count includes the startup code, vector table dance, and
+runtime the deployed firmware will actually execute, which is why the
+plugin counts the whole run and the workloads are sized so the measured
+loop dominates.
+
+The checksum earns its place three times over: it defeats dead-code
+elimination (a compiler that deleted the unobserved workload would produce
+a spectacular "improvement"); printed to 17 significant digits, it pins
+cross-run determinism — if two runs of one binary ever printed different
+checksums, the instruction counts would be incomparable and something would
+be deeply wrong; and the pipeline workload deliberately poisons it with a
+NaN if the converter ever underruns, so a broken configuration cannot
+produce a plausible count. `icount.py` refuses to record anything unless
+`SRT_ICOUNT_DONE ok=1` appeared.
+
+## The ratchet, and why it is two-sided
+
+`scripts/icount.py` glues plugin to workloads: find every `srt_icount_*`
+binary in the build directory, run each under the target's QEMU with the
+plugin, and compare against the committed `bench/baselines.json` at a
+tolerance of ±3%. A scenario with no recorded baseline fails. A recorded
+baseline of zero fails. A regression beyond tolerance fails. And — the
+clause that makes this a *ratchet* rather than a mere alarm — an
+**improvement** beyond tolerance fails too:
+
+```python
+ elif delta < -args.tolerance:
+ # Two-sided: a stale (too-high) baseline would let future
+ # regressions hide inside the slack, so improvements must be
+ # committed too.
+ verdict = ("IMPROVED beyond tolerance — run icount.py --update "
+ "and commit bench/baselines.json")
+ failures.append(scenario)
+```
+
+The two-sidedness was not in the original design. The first version of the
+ratchet failed only on regression, which sounds like the point — until an
+infrastructure audit traced the incentive structure. Suppose your PR makes
+`pipeline_q15` 10% cheaper and you don't update the baseline. CI passes;
+everyone is happy; the baseline is now 10% stale. The *next* PR can regress
+the same scenario by 9% — undoing nearly all of your win — and CI passes
+again, because measured-vs-baseline is still inside the slack. Improvements
+that go unclaimed become a hiding place for regressions exactly their size.
+The audit's fix (the same infrastructure-hardening pass that added the
+bare-metal empty-run guard of the previous chapter) makes the gate
+symmetric: if you made it faster, you must *say so*, in the same PR, by
+re-recording the baseline — `icount.py --update` — and committing the diff.
+The improvement becomes reviewable history, the gate snaps tight around the
+new value, and there is never slack for anything to hide in.
+
+`--update` has its own small discipline: it rewrites the target's entry to
+*exactly* the measured scenarios, so a renamed or deleted workload cannot
+linger in the JSON as a dead gate entry that never fails and never means
+anything.
+
+One boundary of the ratchet is drawn in a CMake naming convention. The
+cross-resampler comparison workloads (`docs/COMPARISON.md` runs the same
+fixed task through this library and through libsamplerate, per target) are
+built as `cmp_icount_*` precisely so that `icount.py`'s `srt_icount_*` glob
+never picks them up: competitor counts are *recorded* in the docs with
+their date and toolchain, but not *gated*. The distinction is deliberate.
+A gate on someone else's code would fail on their releases, punish this
+project for their regressions, and pressure nobody who can act on it; a
+gate is a promise, and you can only promise about code you maintain.
+
+The tolerance deserves a sentence, because "±3% on a deterministic count"
+sounds contradictory. Counts are bit-identical across runs *of one binary*;
+the slack absorbs a different variation: innocuous recompilation effects.
+Code layout, inlining decisions, and register allocation shift by fractions
+of a percent when unrelated code changes; the C6 work measured its embedded
+control scenarios at exactly 0.00% only because nothing in their path
+changed. Three percent is wide enough that touching a comment never fails
+the gate, and narrow enough that the +6–8% cost of a runtime flag in a hot
+loop — a real mistake, caught by this exact gate during C6 and fixed with a
+compile-time gate before merge — cannot pass it.
+
+## Baselines are compiler-dependent, by design
+
+An instruction count is a property of the binary, and the binary is a
+product of the compiler. When the CI image's `gcc-arm-none-eabi` or
+hexagon-clang package updates, every count moves a little, and the ratchet
+job can fail on unchanged library code. `docs/PERFORMANCE.md` is explicit that
+this is **working as intended, not a flake**: the response is to re-record
+the baselines in a reviewed commit whose diff *is* the record of what the
+toolchain update did to the library's cost. The alternative — normalizing
+counts, or pinning tolerances wide enough to ride out compiler churn —
+would trade an occasional, explainable, reviewable failure for permanent
+blindness to exactly the kind of shift a performance-conscious project most
+wants to see.
+
+The same philosophy shows up in how the tools themselves are provisioned.
+The plugin compiles against a `qemu-plugin.h` pinned to the exact commit
+QEMU 8.2.2's tag pointed at, checksum-verified on download. And the Hexagon
+leg builds its own emulator: neither Debian's `qemu-hexagon` nor the one
+bundled with the CodeLinaro toolchain enables TCG plugins, so CI compiles a
+plugin-capable `qemu-hexagon` from the pinned QEMU source (linux-user
+target only, cached thereafter). A measurement gate whose instruments are
+unpinned is a gate whose meaning can change without a diff.
+
+## What instructions do and do not predict
+
+Time to honor the caveat. An instruction count is not a cycle count, and
+the project's documentation never claims otherwise — the metrics table
+says "well-correlated with real cost **for scalar code**," and
+cycle-accurate numbers are explicitly delegated to vendor simulators or
+hardware counters.
+
+Where the correlation is good: in-order scalar cores running out of
+tightly-coupled memory, which describes the Cortex-M33 and M55 targets
+closely. Most instructions are single-cycle, there is no cache hierarchy to
+miss in, and a 5% instruction reduction is a real, similar-sized cycle
+reduction.
+
+Where it bends: anything that changes the *mix* rather than the count.
+The C3 fixed-point phase accumulator made the M55 float pipeline count
+**+1.4%** worse — it replaced hardware-double operations with int64
+sequences, more instructions of cheaper mix — and the project accepted the
+regression for the cross-target win, with the reasoning in the PR rather
+than hidden in an average.
+
+Where it bends furthest is Hexagon, and the reason is architectural:
+Hexagon is a VLIW machine that issues *packets* of up to four instructions
+per cycle. Two versions of a loop with identical instruction counts can
+differ meaningfully in cycles depending on how well their instructions pack
+into packets — and conversely, removing instructions that packed for free
+saves nothing. The C5 experiment (Part III) is the cautionary tale: a
+hand-written `vrmpyh` wide-MAC kernel, proven bit-exact, verified by
+disassembly to contain ten wide MACs where the baseline had zero, measured
+**−0.31%** — 119,847,854 to 119,478,758 instructions on `pipeline_q15`. The
+instruction metric faithfully reported that the change barely mattered; on
+a VLIW machine it takes packet-level analysis (or silicon) to know whether
+even that number survives translation to time.
+
+The project's calibration path for the gap is hardware, and it ships in the
+repository: `examples/pico2_cyccnt/` is a flashable RP2350 firmware that
+runs the *same* `runPipeline` workload as the icount scenarios — 32-frame
+push/pull blocks, 997 Hz sine, 1 000 warm-up and 2 000 measured iterations
+— timed per block with the Cortex-M33's DWT.CYCCNT cycle counter, printing
+mean/p99/max cycles per block, cycles per frame, and the fraction of a
+150 MHz core one 48 kHz stream costs. Correlating those cycle figures
+against the committed M33 instruction baselines yields the
+cycles-per-instruction ratio for exactly this code on exactly that silicon
+— after which the deterministic, CI-friendly instruction gate can be read
+in real-time units. Until that correlation is run on hardware you own, the
+documentation deliberately states the M33 figures as instruction *budgets*,
+not cycle claims; the truth-sweep audit that enforced that wording appears
+again in the next chapter.
+
+## The last mile: numbers that cannot go stale
+
+A gated number that is hand-copied into a README is a number waiting to
+rot. The published instruction-count table is therefore not written by
+anyone: `scripts/update_icount_docs.py` regenerates it **1:1 from
+`bench/baselines.json`** — every row, every comma — between
+begin and end markers in `README.md`, and the CI
+ratchet job's final step is:
+
+```sh
+python3 scripts/update_icount_docs.py
+git diff --exit-code README.md || {
+ echo "::error::README icount table is stale; run scripts/update_icount_docs.py"; exit 1; }
+```
+
+Regenerate and diff. If the committed README does not match the committed
+baselines exactly, the build fails — so the numbers a visitor reads are, by
+construction, the numbers the gate enforces. It is the same commitment this
+book makes with live-included code, applied to a table: *derived artifacts
+must be derived, in CI, every time, or they are testimony rather than
+evidence.*
+
+## Verify it yourself
+
+```sh
+# Build the counting plugin (fetch qemu-plugin.h for QEMU 8.2.x first;
+# ci.yml pins the exact URL and checksum):
+gcc -shared -fPIC $(pkg-config --cflags glib-2.0) -I/path/to/plugin-header \
+ tools/qemu_insn_plugin/insn_count.c -o /tmp/libinsncount.so
+
+# Cross-build the fixed workloads and run the ratchet (arm-none-eabi-gcc):
+cmake -B build-m55 -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m55-mps3.cmake \
+ -DSRT_BUILD_TESTS=OFF -DSRT_BUILD_EXAMPLES=OFF -DSRT_BUILD_ICOUNT_BENCH=ON
+cmake --build build-m55 -j
+python3 scripts/icount.py --target m55 --build-dir build-m55 \
+ --plugin /tmp/libinsncount.so
+
+# Determinism: run any one binary twice and compare the counts exactly.
+qemu-system-arm -M mps3-an547 -nographic -semihosting -d plugin \
+ -plugin /tmp/libinsncount.so -kernel build-m55/bench/icount/srt_icount_pipeline_q15
+
+# See the two-sided gate work: re-run icount.py with --tolerance 0.0001
+# and watch benign recompilation deltas fail in *both* directions.
+
+# The docs-freshness gate:
+python3 scripts/update_icount_docs.py && git diff --exit-code README.md
+```
+
+And the experiment that motivates the whole chapter: run any wall-clock
+benchmark from `bench/` twice on a shared machine, an hour apart, and
+compare. The instruction counts you just produced will not have moved by a
+single instruction; the nanoseconds will tell you about the machine's day.
diff --git a/book/src/part2/notebooks.md b/book/src/part2/notebooks.md
new file mode 100644
index 0000000..a9441b6
--- /dev/null
+++ b/book/src/part2/notebooks.md
@@ -0,0 +1,323 @@
+# Notebooks as calibrated instruments
+
+> There are three kinds of lies: lies, damned lies, and statistics.
+>
+> — popularized by Mark Twain, who credited Benjamin Disraeli
+
+The previous two chapters covered claims a machine can gate: thresholds in
+tests, instruction counts in a ratchet. But some of this project's most
+consequential claims are not pass/fail propositions. *How much worse is a
+naive FIFO?* *What does block size cost in latency and pitch stability?*
+*How does the converter measure against libsamplerate, soxr, and two
+hardware ASRC chips, under one definition of THD+N?* Answering those takes
+plots, long simulated runs, and a measurement methodology that itself needs
+defending — which is to say, it takes a lab notebook. The repository has
+three, under `notebooks/`, and they are treated with the same severity as
+the test suite: **committed with their outputs, calibrated before they
+measure, and pinned with assertions so that a regression fails the re-run.**
+
+This chapter is about that discipline — and about five specific ways a
+quality measurement can lie, each of which this project actually hit, and
+each of which is now encoded in the notebooks as a guard, a docstring, or a
+scar.
+
+## Three instruments, one method
+
+**`asrc_demo.ipynb`** is the front door: it loads the library through its C
+ABI with `ctypes` (no Python bindings, ~80 lines of wrapper), reproduces
+the naive-FIFO disaster, then walks lock acquisition, transparency,
+spectrograms, latency, drift tracking, and dropout recovery. Its committed
+outputs are where the README's "what does it sound like" numbers come from:
+clicks roughly ten times per second at 29 dB SNR for the naive path,
+126.4 dB for the converter under the notebook's instrument.
+
+**`asrc_block_size_study.ipynb`** answers a deployment question: what
+happens at block sizes 32, 64, and 240 frames? Its committed conclusion —
+Track-stage operation turns block quantization into cent-scale, low-rate FM
+over a 53–61 dB wideband floor, while designed latency scales as roughly
+`2·B/fs + 0.5 ms` — is quoted by `docs/COMPARISON.md` whenever coarse-block
+operation comes up.
+
+**`asrc_comparison.ipynb`** is the adversarial one: a single AES17-style
+measurement implementation applied identically to SampleRateTap,
+libsamplerate `sinc_best`, soxr `VHQ`, and a naive FIFO, with the deck
+deliberately stacked *against* the home team — the libraries are handed the
+exact clock ratio as an oracle, while the converter must discover it from
+FIFO occupancy and still gets measured on the result. Every software number
+in `docs/COMPARISON.md`'s tables is a committed output of this notebook.
+
+All three share a spine: the deterministic two-clock simulation from the
+[tests chapter](tests.md), re-implemented in a few lines of Python around
+the C ABI. Producer and consumer events interleave by next-event virtual
+time, so a +200 ppm producer gains exactly one sample in every 5 000,
+and a re-run reproduces the committed outputs. Determinism is what makes
+"committed outputs" meaningful — a notebook whose numbers wander between
+runs is a screenshot, not an instrument.
+
+Why notebooks at all, rather than more tests? Because the *output* here is
+the plot and the number-in-context, not a boolean; because the runs are
+minutes long and belong in a manually-triggered lab rather than every CI
+push; and because a reader deciding whether to trust the library should be
+able to see the methodology, the code, and the result in one document —
+then re-execute it. The committed outputs are the published lab record; the
+re-run is the replication.
+
+## Calibrate the instrument before believing it
+
+The core discipline, stated as a rule: **no measurement function in these
+notebooks reports on the converter until it has first reported on a
+synthetic signal with a known answer, in the same notebook, above the
+measurement.**
+
+The comparison notebook builds an AES17-style THD+N meter — and then
+immediately feeds it a pure 997 Hz tone plus white noise injected at
+exactly −100 and −130 dBFS, computes what a perfect meter must read (the
+injected level, corrected for the fundamental's RMS and the fraction of
+white noise falling in the 20 Hz–20 kHz integration band), and asserts
+agreement within half a dB:
+
+```python
+ got, f0 = thdn_db(sig, 997.0)
+ ...
+ assert abs(got - expect) < 0.5
+print("instrument calibrated")
+```
+
+Only after `instrument calibrated` prints does any subject get measured.
+The block-size study does the same for a subtler instrument — a
+decomposition of a near-sinusoid into low-rate pitch modulation (in cents)
+and a wideband noise floor — by synthesizing a tone with *exactly* 1 cent
+of 10 Hz FM over a −120 dB noise floor. The committed output reads:
+
+```text
+calibration: peak 1.000 cents (true 1.000), rms 0.707 (true 0.707),
+wideband 111.0 dB (true ~111)
+```
+
+That calibration cell carries the project's most candid admission, in its
+own markdown: "This cell earned its keep: three earlier formulations of the
+split each leaked modulation into the noise figure, and the calibration
+caught every one." One of those three failures survives as a docstring on
+the low-pass filter inside the decomposition — a boxcar smoother's passband
+droop left a percent-level copy of the modulation in the high-passed
+remainder, silently bounding the measurable floor — and another as a
+warning that reconstruct-and-subtract in the signal domain fails subtly
+(sub-split phase errors multiply the carrier). Without the synthetic-signal
+check, every one of those buggy instruments would have produced a plausible
+wrong number about the converter, and the notebook would have published it
+with a straight face. Calibration converts "my measurement code is probably
+right" into a demonstrated property, at the cost of one cell.
+
+## Pin the result, or the notebook is a brochure
+
+Every notebook ends its key measurements with `assert`. The demo, after
+measuring transparency:
+
+```python
+assert snr_asrc > 125.0, "transparency regression"
+```
+
+The comparison, after the full table:
+
+```python
+first = names[0]
+assert thdn[first] < -130 and dr24[first] > 130
+```
+
+The block-size study, after the FM decomposition — with a comment that
+names the philosophy:
+
+```python
+ # Documented behavior as of this measurement: FM peaks stay below the
+ # ~5-8 cent audibility region (B=240 gets closest) and the wideband
+ # floor stays above 50 dB. These pin behavior, not aspiration.
+ assert metrics[B]["cents_peak"] < 5.0, f"FM at B={B} reached audibility"
+```
+
+This is the notebook version of the test suite's
+thresholds-just-under-measured convention. A notebook without assertions
+degrades into marketing: it gets re-run after some future change, a plot
+looks subtly worse, and nobody's eye catches it. With assertions, re-running
+the notebook *is* a regression test — `docs/COMPARISON.md` says exactly
+this in its caveats: software figures "regenerate by re-running the
+comparison notebook; its assertions pin SampleRateTap's results so
+regressions fail the run." The notebook is simultaneously the lab record
+and the gate on its own claims.
+
+The rest of this chapter is the honest-measurement traps those assertions
+and calibrations exist to catch — each one a mistake this project made, or
+nearly made, with the receipt still in the file.
+
+## Trap one: your window lies about the floor
+
+A 997 Hz fundamental at −1 dBFS sits some 130 dB above the residual being
+measured. Take a plain FFT of that and the window function smears the
+fundamental's energy across the spectrum at the window's sidelobe level —
+with common windows, far above the thing you are trying to see. The
+notebooks handle this on two fronts. For *display*, the demo's spectrum
+helper documents its choice: a Kaiser window with β = 24, "sidelobes
+~−190 dB, so a −130 dB noise floor is actually visible." For *measurement*,
+no window is trusted at all: the comparison notebook refines the
+fundamental's frequency by the phase-slope method (per-window phase of a
+least-squares fit, regressed against time — "precision far beyond FFT bins,
+which a 130 dB measurement needs," as its markdown puts it), then removes
+the fundamental by a single global least-squares fit, *exactly*, before any
+spectrum is taken. Only the residual — fundamental already subtracted —
+meets an FFT, and then only for integration. A ±20 Hz notch around the
+fundamental catches what the fit leaves; the notebook notes this notch is
+far *narrower* than AES17 permits hardware testers, a conservatism that
+works against the software subjects in every comparison.
+
+This is the same decision the test suite made with its tracked sine fit,
+arrived at for the same reason: at these dynamic ranges, subtraction is
+exact and windows are not.
+
+## Trap two: measure the converter, not its transient
+
+An ASRC has stages. Fresh from a cold start it acquires; once locked it
+tracks; given sample-granular occupancy data for long enough, the servo
+promotes to its low-bandwidth Quiet stage — and the residual keeps
+improving for tens of seconds as the loop forgets its own acquisition. A
+measurement window placed too early reads the servo's history, not the
+converter's quality.
+
+The numbers make the point better than prose. The comparison notebook runs
+32 seconds and discards the first 25 before analyzing ("we analyze its
+output well after the servo's Quiet stage engages," its markdown says). The
+48 kHz quality tests run 40 seconds and analyze the final one. And when the
+16 kHz suite was built by scaling the servo bandwidths with the sample rate,
+the settle time scaled *inversely*: the quiet loop lands at ~0.017 Hz, and
+the suite had to run **120 seconds** — the same number of samples, the same
+number of loop time constants as 40 s at 48 kHz — with the test's comment
+recording that a 40-second run still sits ~15 dB above the settled
+residual. Fifteen decibels is the difference between a correct claim and an
+embarrassing one, controlled entirely by *when you look*.
+
+The flip side matters equally: the block-size study measures the Track
+stage *on purpose*, because block-fed deployments never reach Quiet — that
+is the regime under study. Neither window placement is "right"; what is
+right is that each notebook states which regime it is measuring and why.
+
+## Trap three: the flush at the end of the stream
+
+The comparison notebook hands each competitor the same input and analyzes a
+window of its output. Where you cut that window turned out to matter more
+than anything else in the file:
+
+```python
+def mid_window(y, analyze_s, guard_s=1.0):
+ """Trim both ends: one-shot converters flush a filter tail at the end of
+ the stream, and including it poisons the measurement by ~60 dB (found
+ the hard way; a control experiment at 2:1 exposed it)."""
+ y = np.asarray(y, dtype=np.float32)
+ end = len(y) - int(guard_s * FS)
+ return y[end - int(analyze_s * FS):end]
+```
+
+A one-shot resampler API, given the whole stream at once, drains its filter
+state at the end — a tail of samples that are not steady-state conversion
+output. Include that tail in the analysis window and the measured THD+N
+degrades by roughly **60 dB**: enough to make soxr's −150 dB read as a
+mediocre converter's figure. The bug was found "the hard way," and the
+docstring preserves how: a control experiment at a 2:1 ratio — where the
+correct answer was known independently — read absurdly wrong, and the
+investigation traced it to the tail. Every one-shot subject is therefore
+measured on a mid-stream window that stops one second short of the end.
+
+Note whose numbers this guard protects: the *competitors'*. An honest
+comparison has to be most careful about errors that flatter the home team,
+and an unguarded tail window would have been exactly that kind of error.
+
+## Trap four: comparing float software to 24-bit silicon
+
+The comparison's final tables land next to datasheet values for the AD1896
+and SRC4392 — hardware ASRCs measured at their pins, which are 24 bits
+wide. A float32 pipeline has no fixed noise floor at all (its noise scales
+down with the signal), so its "native" dynamic range mostly measures the
+arithmetic format, not the converter. Quoting float numbers against silicon
+datasheets would be a category error dressed as a benchmark.
+
+The notebook's equalizer is a single line of arithmetic:
+
+```python
+def q24(y):
+ """Round to a 24-bit interface, undithered -- what a hardware ASRC
+ presents at its pins. The equalizer that makes software and silicon
+ numbers directly comparable."""
+ return np.round(np.asarray(y, np.float64) * 8388608.0) / 8388608.0
+```
+
+Every subject's output is measured both ways, and `docs/COMPARISON.md`
+leads with the 24-bit columns as the chip-comparable condition. The result
+reads differently than bravado would: at that interface the oracle-fed
+libraries measure at the 24-bit format ceiling itself (~−143.5 dB THD+N),
+all three real converters share the identical 149.1 dB A-weighted
+dynamic-range ceiling, and SampleRateTap's −132.1 dB sits ~11 dB behind the
+oracles — a gap the document does not explain away but *prices*: it is the
+measured cost of solving the clock-recovery half of the problem, which the
+libraries do not attempt. Even so, the caveats refuse the flattering frame
+in the other direction too: datasheet numbers come from analog test loops
+with wider notches, and "a pristine-digital software measurement and a
+bench measurement of a chip are comparable in definition, not in
+environment."
+
+## Trap five: the summary cell nobody executes
+
+The last trap is the quietest, and this project walked into it. The demo
+notebook's measurement cell printed, in its committed output:
+
+```text
+ASRC SNR: 126.4 dB | naive: 29.4 dB | improvement: 97 dB
+```
+
+with `assert snr_asrc > 125.0` enforcing it. The *summary table* at the
+bottom of the same notebook claimed "SNR > 130 dB." Nothing failed. Nothing
+could fail: markdown does not execute, so no assertion, calibration, or
+re-run will ever check a number typed into prose. The two cells sat a few
+screens apart, one measured and one remembered, disagreeing by 4 dB — the
+one place a documentation audit found the repository overstating its own
+results. (The measured 135 dB figure from the test suite is real, but it is
+a *different instrument* — a tracked global fit over a different window —
+and a summary must quote its own cell, not the best number available
+elsewhere in the repo.) The fix was the boring, correct one: the summary
+now states 126.4 dB and points at the assertion.
+
+The lesson generalizes beyond notebooks: **summaries drift from cells the
+same way READMEs drift from benchmarks and comments drift from code.**
+Executable claims stay honest by execution; prose claims stay honest only
+by audit. This project's response operates at both levels — push every
+number it can into asserted, regenerated, machine-checked form (the test
+thresholds, the icount table's regenerate-and-diff gate, the notebook
+assertions), and schedule adversarial audits for the residue that only
+prose can carry. This book is itself downstream of that lesson: the code
+you read here is included live from the headers, because an author's
+summary of code is just one more markdown cell.
+
+## Verify it yourself
+
+```sh
+# Build the C ABI once; the notebooks find (or build) it themselves:
+cmake -B build -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_CAPI=ON
+cmake --build build --target srt_capi -j
+
+# Re-run each instrument end to end; any pinned regression fails the run
+# (deps: numpy, matplotlib, plus samplerate and soxr for the comparison):
+jupyter nbconvert --to notebook --execute notebooks/asrc_demo.ipynb
+jupyter nbconvert --to notebook --execute notebooks/asrc_block_size_study.ipynb
+jupyter nbconvert --to notebook --execute notebooks/asrc_comparison.ipynb
+
+# Watch a calibration catch a broken instrument: in asrc_comparison.ipynb,
+# widen the notch (notch_hz=20.0 -> 2000.0) or in the block-size study
+# replace lowpass_fft with a boxcar mean — the synthetic-signal cell fails
+# before any subject is measured.
+
+# The traps, in the sources' own words:
+grep -rn "poisons the measurement" notebooks/asrc_comparison.ipynb
+grep -rn "earned its keep" notebooks/asrc_block_size_study.ipynb
+grep -rn "pin behavior, not aspiration" notebooks/asrc_block_size_study.ipynb
+```
+
+The demo notebook's summary table is the one artifact in this chapter that
+no command can verify — which is the point. Read it next to the measurement
+cell above it, check that the numbers agree, and you will have performed,
+by hand, the audit that fixed it.
diff --git a/book/src/part2/tests.md b/book/src/part2/tests.md
new file mode 100644
index 0000000..51c4721
--- /dev/null
+++ b/book/src/part2/tests.md
@@ -0,0 +1,423 @@
+# Tests as specifications
+
+> Program testing can be used to show the presence of bugs, but never to show their absence!
+>
+> — Edsger W. Dijkstra
+
+Part I ended each chapter with a list of tests. This chapter is about what
+those tests actually *are* — because in this project they are not the usual
+smoke detectors bolted on after the fact. They are the specification. The
+README publishes a table of signal-to-noise figures; the reason that table
+can be trusted is not editorial diligence, it is that every number in it has
+a test asserting something just below it, and CI runs the assertion on every
+push. `docs/PERFORMANCE.md` states the policy in one line: "The SNR table is
+already enforced by test thresholds."
+
+That sentence hides three design problems, each with a wrong answer that
+most test suites pick by default. How tight do you pin a measured quantity?
+How do you make a two-clock, two-thread, analog-flavored system produce the
+same bits every run? And how do you measure 135 dB of fidelity without your
+measuring instrument lying to you? The suite's answers are the subject of
+this chapter.
+
+## Thresholds a few dB under reality
+
+Here is the convention, straight from the top of the quality suite
+(`tests/test_asrc_quality.cpp`):
+
+```cpp
+// Thresholds sit 4-7 dB under measured performance (135/120/113/106 dB for
+// balanced at 997/6k/12k/19.5k; 133/108 dB for transparent). The residual at
+// high frequencies is dominated by the linear interpolation between adjacent
+// phase-table rows, which falls ~12 dB per doubling of numPhases and rises
+// ~12 dB per octave of signal frequency.
+```
+
+And a representative enforcement:
+
+```cpp
+TEST(AsrcQuality, Balanced997Hz) {
+ EXPECT_GT(measureSnrDb(srt::FilterSpec::balanced(), 997.0), 128.0);
+}
+```
+
+Measured 135.0, asserted 128.0. Consider the two alternatives this rejects.
+
+**A loose threshold** — say, "SNR must exceed 60 dB, comfortably transparent
+for casual listening" — turns the test into a tautology. The converter could
+regress by seventy decibels, an *enormous* defect by this library's
+standards, and CI would stay green while the README continued to advertise
+135 dB. A loose threshold means the published claim and the enforced claim
+are different claims, and only the weaker one is real. This suite's position
+is that a quality number you publish is a number you gate, at very nearly
+the value you publish.
+
+**An exact threshold** — asserting 135.0 because you measured 135.0 — fails
+for the opposite reason: the measurement is a physical quantity with
+legitimate variation. Different hosts, compilers, and math libraries move
+the residual by fractions of a dB; the float path's strict double
+accumulation keeps outputs bit-stable per platform but not across them. The
+4–7 dB of headroom is sized to absorb that variation and nothing else: any
+*algorithmic* regression — a filter redesign that loses stopband, a servo
+change that leaks more clock noise into the passband — costs whole decibels
+and lands outside the slack.
+
+The comment carries a second load worth noticing: it explains *where the
+residual comes from* (phase-table interpolation, with its 12 dB scaling laws
+in both `numPhases` and signal frequency). That converts the threshold table
+from arbitrary constants into a checkable physical model — when the 16 kHz
+suite was added later, its expectations could be *predicted* from the same
+model (the residual depends on the normalized frequency f/fs, so tones at
+the same f/fs should measure the same), then measured, and they matched
+within about 1 dB (`tests/test_asrc_quality_16k.cpp` records both sets of
+numbers). A threshold you can predict is a specification; a threshold you
+can only observe is a snapshot.
+
+The convention also imposes a maintenance discipline that deserves to be
+stated honestly: when performance *improves*, the thresholds are stale and
+must be re-pinned upward, or the enforcement quietly loosens. That happened
+in this repository — the Q0.64 phase accumulator (Part III) improved the
+997 Hz figure to 135.0 dB, and a subsequent documentation audit re-aligned
+the published headline and threshold comments to the post-change reality.
+The instruction-count ratchet in the next chapter solves the same
+staleness problem mechanically, with a two-sided gate; the quality suite
+solves it by convention and audit. The difference is instructive: ±3% on a
+deterministic integer can be automated; "4–7 dB under a measurement that
+legitimately varies by platform" still needs a human to re-pin.
+
+## The two-clock simulator
+
+Every quality number above comes from the same experimental rig, and it fits
+in a page of header (`tests/support/two_clock_sim.hpp`). The problem it
+solves: the converter's whole reason to exist is that *two independent
+clocks* drive it, but tests that use two real threads and real timers are
+nondeterministic — schedulers differ, load differs, and a 0.2 dB shift in a
+measurement could be the code or could be the machine. For metrology you
+want the clocks without the threads.
+
+The rig is a struct of knobs:
+
+```cpp
+{{#include ../../../tests/support/two_clock_sim.hpp:pf_knobs}}
+```
+
+and one loop:
+
+```cpp
+{{#include ../../../tests/support/two_clock_sim.hpp:pf_run}}
+```
+
+This is discrete-event simulation reduced to its minimum. Two virtual
+clocks, `tIn` and `tOut`, advance in *virtual time*: a producer event pushes
+`chunkIn` frames and advances `tIn` by `chunkIn / fsIn`; a consumer event
+pulls `chunkOut` frames and advances `tOut` by `chunkOut / fsOut`; whichever
+clock is behind fires next. With `fsIn = 48 000 × (1 + 200 ppm)` and
+`fsOut = 48 000`, the producer naturally lands one extra sample every 5 000
+— the exact asynchrony a real capture/playback pair exhibits, with zero
+dependence on the host scheduler. Runs are exactly reproducible: same
+sequence of pushes and pulls, same occupancy trajectory seen by the servo,
+same output samples, every time, on any given platform.
+
+Why determinism beats realism for regression work:
+
+- **A failure is a coordinate, not a weather report.** When
+ `Balanced19_5kHz` drops below 100 dB, re-running reproduces the identical
+ run; you can bisect it, instrument it, and diff intermediate state against
+ a good commit. A threads-and-timers failure reproduces "sometimes."
+- **Thresholds can be tight.** The 4–7 dB convention above is only possible
+ because run-to-run variance is zero; scheduler-dependent tests must budget
+ slack for the scheduler, and that slack is exactly where regressions hide.
+- **The interesting parameter becomes controllable.** Transfer granularity
+ — how many frames move per event — is a *physical property of real
+ deployments* (sample-synchronous codecs at one extreme, USB and network
+ audio moving multi-frame bursts at the other), and it changes converter
+ behavior: the servo promotes to its low-bandwidth Quiet stage only when
+ occupancy is observed at fine granularity. The quality suites run
+ `chunkIn = chunkOut = 1` to reach the Quiet stage; the multichannel short
+ variants run `chunk = 8` deliberately, to certify the Track stage that
+ block-fed deployments actually live in. In a real-threads test,
+ granularity would be an accident of scheduling; here it is an axis of the
+ test matrix.
+- **Slow clock dynamics are testable at all.** `fsInScale` lets a test ramp
+ the input rate — the lock suite sweeps drift ramps and asserts the servo
+ follows without unlocking — which on real hardware would require a
+ programmable oscillator and a lab.
+
+What determinism deliberately does *not* cover is the one thing it removes:
+real concurrency. The memory-ordering claims of the ring buffer are tested
+by the separate two-thread stress under ThreadSanitizer (the
+[ring chapter](../part1/spsc-ring.md) walks its limits). The division of
+labor is explicit — realism where realism is the subject, simulation
+everywhere else — and the technique travels: the same virtual-time
+interleaving reappears in Python inside every notebook of the
+[notebooks chapter](notebooks.md).
+
+One number shows what the rig's determinism costs in patience rather than
+trust. The quality runs last 40 virtual seconds because, as the test's
+comment puts it, "the 0.05 Hz locked loop must fully forget the acquisition
+transient before the measurement window" — and only the final second is
+analyzed. At 16 kHz the servo bandwidths scale down with the rate, so the
+same suite runs 120 seconds to cover the identical number of loop time
+constants; its comment records that a 40 s run still sits ~15 dB above the
+settled residual. Deterministic time is cheap; *skipping* settling time is
+how you measure your transient instead of your converter.
+
+## Sine-fit metrology
+
+The simulator produces a signal; something must turn it into a decibel
+figure, and at 135 dB the instrument is the hard part. The suite's
+instrument (`tests/support/sine_analysis.hpp`) is a least-squares sine fit:
+model the output window as `a·sin(ωi) + b·cos(ωi) + c`, solve the 3×3
+normal equations for the best-fit fundamental, subtract it *exactly*, and
+call everything that remains — harmonics, images, servo noise, quantization
+— the residual. `snrDb()` is then the fitted fundamental's power over the
+residual's.
+
+Why a fit instead of an FFT? Because subtraction is exact and windows are
+not. A windowed spectrum smears the near-full-scale fundamental across
+neighboring bins at the window's sidelobe level; measuring a residual 135 dB
+down *under* that skirt means fighting your own instrument. The fit has no
+window: the fundamental is removed to the precision of the arithmetic
+(double throughout), and the method's own floor sits far below anything the
+converter produces. (The notebooks meet the same problem with the same
+answer, plus a notch — that chapter tells the ~60 dB horror story that
+motivates the extra guard.)
+
+One refinement matters enough to justify its own function. `fitSine`
+requires the frequency; `fitSineTracked` *finds* it, starting from the
+nominal value:
+
+```cpp
+ for (int iter = 0; iter < 4; ++iter) {
+ const SineFit a = fitSine(x.first(half), f);
+ const SineFit b = fitSine(x.subspan(half), f);
+ // b.phase is relative to the second half's start; predict it from a.
+ const double twoPi = 2.0 * std::numbers::pi;
+ const double predicted = a.phase + twoPi * f * static_cast<double>(half);
+ const double dphi = std::remainder(b.phase - predicted, twoPi);
+ f += dphi / (twoPi * static_cast<double>(half));
+ }
+```
+
+Fit each half of the window; if the assumed frequency is slightly wrong, the
+second half's phase arrives shifted from where the first half's fit predicts
+it; the shift, divided by 2π times the half-window's span, is the frequency
+error in cycles per sample. Four iterations converge far below the starting
+error.
+
+The reason this exists is a property of the device under test. An ASRC's
+rate estimate converges *asymptotically* — the Quiet-stage loop is
+deliberately slow, so even after a 40-second run the estimate can sit a
+fraction of a ppm off the true ratio. A rigid fit at the nominal frequency
+would see the output tone microscopically detuned from the model and book
+the mismatch as residual: a completely inaudible frequency offset, misread
+as noise. Tracking the fundamental before measuring distortion is exactly
+what commercial THD analyzers do, and the header's comment says so — the
+test instrument follows metrology practice, not convenience.
+
+But an instrument that *tracks* the signal could also *excuse* it: a
+converter that genuinely played the wrong pitch would have its error
+absorbed into the tracked frequency and measure clean. The suite closes
+that hole with a guard on the tracker itself:
+
+```cpp
+ // The tracked frequency must still match the true clock ratio closely.
+ EXPECT_NEAR(fit.freqNorm / nuOutExpected, 1.0, 2e-6);
+```
+
+The fit may refine the frequency, but only within 2 ppm of what the clock
+ratio dictates — enough for servo convergence tails, nowhere near enough to
+hide a real pitch error. Every use of the tracked fit carries this check.
+It is the measurement-code version of a lesson this book keeps repeating:
+whenever you give a tool freedom, pin the freedom.
+
+## Crosstalk that cannot hide, leakage that cannot masquerade
+
+Single-channel quality metrics are structurally blind to a whole class of
+multichannel bugs: swap two channels in the deinterleave, or bleed a percent
+of channel 3 into channel 4, and every per-channel SNR still measures
+perfect. `tests/test_multichannel.cpp` exists for exactly those bugs, and
+its design is a small case study in adversarial measurement.
+
+The setup: one converter instance, every channel carrying a *distinct* tone
+— `600 + 731·c` Hz, non-harmonically related, all inside the flat passband
+for up to 16 channels — with per-channel phase offsets to decorrelate the
+waveforms. After conversion across the usual +200 ppm crossing, each channel
+must contain its own tone at full quality and nothing measurable of any
+other channel's. The deployment shapes are real: 12 channels is 7.1.4
+surround, 16 is an AVB stream bundling reference microphones with the
+program feed.
+
+The subtlety is in the analysis order, and the file header explains it:
+
+```cpp
+// Method: own tone is removed by tracked least-squares fit; the other
+// channels' frequencies are then fitted on the residual, so the own tone's
+// spectral leakage (about -67 dB at these spacings over a 1 s rectangular
+// window) cannot masquerade as crosstalk. The fit noise floor on the
+// residual is ~43 dB below the residual RMS, far under every threshold.
+```
+
+Fit channel *k*'s frequency directly on channel *c*'s raw signal and the
+finite one-second window makes channel *c*'s own tone leak energy into that
+fit at about −67 dB — the test would "detect" crosstalk at −67 dB on a
+converter with none, capping the assertable threshold right there. Removing
+the own tone first (exact subtraction of the tracked fit) drops the
+masquerade floor to the fit noise on the residual, far under every
+threshold. Order of operations *is* the instrument here: same data, same
+fits, and only one sequencing yields a measurement capable of asserting
+−100 dB. The pinned claims follow the quality suite's convention: crosstalk
+below −100 dB per channel for float (−72 dB for Q15, whose own quantization
+floor is the binding constraint), with amplitude and SNR checked alongside.
+
+One more design decision hides in the channel counts of the short variants:
+
+```cpp
+// Channels 5 and 7 are the only counts that reach the channel-parallel
+// K=2 and K=1 remainder tiles (8/4/2/1 tiling: 5 = 4+1, 7 = 4+2+1) — the
+// audit found those tiles had zero coverage.
+```
+
+The C6 optimization (Part III) processes channels in register-blocked tiles
+of 8, 4, 2, and 1. Testing 2, 12, and 16 channels — every *deployment*
+shape — exercises only the wide tiles. Five and seven channels are useless
+deployment shapes and ideal test shapes: they force the remainder paths. An
+audit found those tiles had zero coverage across the entire suite; the fix
+was not more assertions but better-chosen *inputs*. Coverage lives in the
+test matrix, not the expectation count.
+
+## The bare-metal one-shot, and the filter that needed a test
+
+On the Cortex-M55 and M33 CI legs, the suite runs as a bare-metal kernel
+under `qemu-system-arm`: no OS, no filesystem, no command line. That
+environment breaks three assumptions ordinary gtest runs lean on, and
+`tests/bare_metal_main.cpp` plus `tests/CMakeLists.txt` repair them one by
+one — each repair with a story.
+
+**No argv** means no `--gtest_filter` from the harness, so the
+emulation-appropriate filter is baked into a custom `main()`:
+
+```cpp
+ ::testing::GTEST_FLAG(filter) = "-AsrcQuality*:AsrcLock.*:Servo.*:Kaiser.*MeetsSpec:"
+ "FixedPoint.AsrcQuality*:"
+ "FixedPoint.FullScaleSineDoesNotWrapQ15:"
+ "MultiChannel.*:Feasibility.*:Reset.*";
+```
+
+**No reliable exit codes** — semihosting does not dependably propagate a
+process status through the emulator — means the run is judged on text.
+CTest watches for a sentinel:
+
+```cmake
+ add_test(NAME srt_tests_emulated COMMAND srt_tests)
+ set_tests_properties(srt_tests_emulated PROPERTIES
+ PASS_REGULAR_EXPRESSION "SRT_TESTS_COMPLETE rc=0"
+ FAIL_REGULAR_EXPRESSION "\\[ FAILED \\]"
+ TIMEOUT 1800)
+```
+
+The sentinel is printed as the *last* act of `main()`, after
+`RUN_ALL_TESTS()` returns — deliberately, so a crash after gtest's own
+summary (a static destructor, a late fault) cannot register as a pass. The
+`FAIL_REGULAR_EXPRESSION` is a second, independent tripwire: even if a
+mangled run somehow emitted the sentinel, any visible test-failure line
+still fails the CTest.
+
+**Nobody watching** is the third broken assumption, and its repair has the
+best history. `RUN_ALL_TESTS()` returns 0 when every selected test passes —
+including when the filter selects *zero* tests. A typo in that baked-in
+filter string would produce an empty run, print the sentinel with `rc=0`,
+and turn the entire on-target suite green forever. An infrastructure audit
+realized this, and the guard went in:
+
+```cpp
+ const int selected = ::testing::UnitTest::GetInstance()->test_to_run_count();
+ if (selected < 15) {
+ std::printf("only %d tests selected (expected >= 15): filter is broken\n", selected);
+ std::printf("SRT_TESTS_COMPLETE rc=1\n");
+ return 1;
+ }
+```
+
+Two details show the care level. The count is checked *after* the run,
+because gtest applies the filter inside `RUN_ALL_TESTS()` — read it before
+and it is always zero, which was verified on target rather than assumed.
+And the bound is 15 against a selection of roughly 20, leaving headroom for
+legitimate test removals without masking a typo.
+
+The guard was not paranoia; the filter had *already* had a real bug. When
+the 16 kHz quality suite (`AsrcQuality16k`) was added, the exclusion then
+read `-AsrcQuality.*` — and in gtest filter syntax, unlike regex, `.` is a
+literal character. `AsrcQuality.*` matches `AsrcQuality.Balanced997Hz` but
+not `AsrcQuality16k.Balanced333Hz`, so the new two-minute simulations would
+have quietly joined every bare-metal CI run, at emulation speed. The fix
+widened the pattern to `AsrcQuality*` (no dot). Look back at the filter
+string and you can now read its dots as deliberate: `MultiChannel.*` —
+*with* the literal dot — excludes exactly the `MultiChannel` suite while
+keeping `MultiChannelShort` in, which the comment beside it calls out as the
+only on-target coverage of the N-channel deinterleave and wide-MAC dotRow
+paths. The same character is a bug in one line and a scalpel in the next;
+the difference is whether its meaning was chosen.
+
+## What the emulated targets deliberately skip
+
+The baked filter and its `ctest -E` sibling on the Hexagon leg exclude the
+same family: the quality suites, the lock and servo simulations, the filter
+design verification, the feasibility and reset sims — collectively, as the
+file header puts it, "minutes of soft-float virtual audio that validate
+target-independent control math already covered on every host platform."
+That phrase is the policy. A 40-second sample-granular quality run is cheap
+arithmetic on a Xeon and an eternity under instruction-set emulation — and
+it would re-prove something that *cannot differ* on the target: the servo's
+control law and the filter designer's mathematics are pure functions of
+their inputs, identical on every conforming C++ implementation.
+
+What *can* differ on target — and therefore what the on-target run keeps —
+is the datapath: kernel accuracy on the target's arithmetic, the fixed-point
+paths (including the SMLALD dual-MAC route on M33-class cores), the ring
+buffer, the deinterleave, the end-to-end latency path. The exclusion list is
+not a shortcut; it is a claim about *where target-dependence lives*, and the
+short multichannel variants exist precisely because that claim would
+otherwise have left the N > 2 datapath uncovered on the machines it was
+written for.
+
+One exclusion is different in kind, and the CI file is honest about it:
+`ConfigValidation` is skipped on Hexagon not because it is slow but because
+that leg's static-musl toolchain cannot unwind — the constructor throws
+correctly, `EXPECT_THROW` never catches, and libc++abi terminates. The
+limitation is recorded in `docs/PERFORMANCE.md` under known debt, with the
+deployment guidance it implies (validate configs before constructing on that
+toolchain). A skipped test with a documented reason is a specification too:
+it specifies the boundary of what the platform supports.
+
+## Verify it yourself
+
+```sh
+# The quality suite: watch the printed [ measured ] lines clear the
+# thresholds by the documented few dB (-V shows output from passing tests):
+ctest --test-dir build -R AsrcQuality -V
+
+# The threshold convention, in the tests' own words:
+grep -n -A4 "Thresholds sit" tests/test_asrc_quality.cpp tests/test_asrc_quality_16k.cpp
+
+# Multichannel independence, long and short (per-channel crosstalk prints):
+ctest --test-dir build -R MultiChannel -V
+
+# Determinism of the rig: run a quality test twice and compare the printed
+# measurement lines (-V is needed to see output from a passing test).
+ctest --test-dir build -R Balanced997 -V # (run it twice)
+
+# The bare-metal one-shot, exactly as CI runs it (needs arm-none-eabi-gcc
+# and qemu-system-arm):
+cmake -B build-m55 -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m55-mps3.cmake
+cmake --build build-m55 -j && ctest --test-dir build-m55 -V
+
+# Break the empty-run guard on purpose: change the baked filter in
+# tests/bare_metal_main.cpp to a typo like "NoSuchSuite.*", rebuild, and
+# watch the run fail with "filter is broken" instead of passing green.
+```
+
+The last experiment is this chapter's thesis in miniature. A test suite is
+only a specification if an empty, wrong, or stale version of it *fails* —
+and every mechanism in this chapter, from pinned thresholds to the
+fifteen-test floor, exists to make silence impossible to mistake for
+success.
diff --git a/book/src/part3/c1-c2.md b/book/src/part3/c1-c2.md
new file mode 100644
index 0000000..f060f86
--- /dev/null
+++ b/book/src/part3/c1-c2.md
@@ -0,0 +1,328 @@
+# Profile first, claim later (C1–C2)
+
+> We should forget about small efficiencies, say about 97% of the time: premature optimization is the root of all evil. Yet we should not pass up our opportunities in that critical 3%.
+>
+> — Donald Knuth
+
+Part III is a story, told in the order it happened. The introduction promised
+six optimization efforts — four wins, one honest draw, one deliberate revert —
+and the next three chapters deliver them with the real numbers, including the
+two that went sideways. This chapter covers the method and the first two
+efforts. The method matters more than either result, because the method is
+what made the later reversals *visible* instead of silently absorbed.
+
+A word about why the campaign existed at all. By the time it started, the
+converter already beat its closest architectural analog, libsamplerate's
+streaming polyphase engine, by roughly 3× at matched quality on the host
+(the full head-to-head lives in `docs/COMPARISON.md`). Nobody was losing
+sleep over Xeon throughput. The pressure came from the other end of the
+target list: the embedded parts. A converter that costs ~1.6% of a Xeon
+core even at eight channels is invisible; the same converter on a
+Cortex-M33 or a Hexagon DSP is a line item in someone's cycle budget, and
+every instruction shaved is budget returned to the application. That framing shaped the campaign's stopping
+rule, written down before any code changed:
+
+> Optimization stops by budget, not by exhaustion. Stop when targets are
+> met, when the profile is flat (no single hotspot ≥ 10%), or when the next
+> win requires per-arch complexity the budget does not justify.
+
+Keep that third clause in mind. It fires, verbatim, two chapters from now.
+
+## The loop
+
+`docs/PERFORMANCE.md` opens with a working agreement — not aspiration,
+process. Every PR that touched the hot path followed the same five steps:
+
+1. **Baseline** on the benchmark matrix.
+2. **Profile** — `perf record` and a flamegraph for time, `-fopt-info-vec`
+ or `-Rpass=loop-vectorize` for any claim about vectorization.
+3. **One hypothesis, one change, one PR** — each optimization PR carries
+ its before/after numbers in the description.
+4. **A/B** — benchmarks for speed, the full test suite for correctness. The
+ pinned SNR thresholds are the quality guardrail: an optimization that
+ costs decibels fails CI by design, so "it's faster" can never quietly
+ mean "it's faster and slightly worse".
+5. Repeat until a stopping condition triggers.
+
+Two measurement instruments back the loop, and they have opposite
+personalities.
+
+**Wall-clock throughput** (Google Benchmark, `bench/bench_asrc.cpp`) is what
+users feel, and it is noisy — the project's benches run on shared CI
+runners, where a neighbor's workload can move a number more than a real
+regression does. So the docs state a rule this book has already quoted and
+will quote again: *wall-clock benches are never a hard gate on shared
+runners*. They run as a smoke test and produce trend artifacts. When this
+chapter reports a wall-clock delta, it was measured as a same-machine,
+same-session A/B — the only configuration in which the ratio means
+anything.
+
+**Executed instructions** (the QEMU TCG plugin harness, `bench/icount/`) is
+the opposite: deterministic to the instruction. Each embedded scenario is a
+fixed-workload binary — bare metal has no argv, so there is one binary per
+scenario — run under an instruction-counting plugin on emulated Cortex-M55,
+Cortex-M33, and Hexagon. Counts are exact across runs; the project verified
+that before trusting them. CI compares every scenario against a checked-in
+`bench/baselines.json` and fails if any metric moves more than 3% in
+*either* direction. The two-sidedness is the clever part: an improvement
+beyond tolerance also fails until the baseline is re-recorded in the same
+diff, because stale slack in the baseline is exactly the room a later
+regression would hide in.
+
+Instruction counts are not cycle counts — no cache misses, no dual-issue,
+no branch predictor. For the scalar code these targets run, they correlate
+well with real cost, and they buy something cycles on shared hardware never
+can: the ability to assert that a number did not change *at all*. That
+ability is the backbone of everything that follows.
+
+Before the first change, the hypotheses were written down in expected-ROI
+order: per-channel blend redundancy first, then auto-vectorization quality,
+then a fixed-point phase accumulator, then explicit SIMD kernels. Writing
+the list first is cheap insurance against the oldest failure mode in
+optimization work — doing the fun change instead of the valuable one, then
+constructing the justification afterward.
+
+## C1: the blend that was computed N times
+
+Recall the datapath from Part I. To produce one output sample,
+`interpolate()` picks the two polyphase coefficient rows adjacent to the
+fractional position μ, blends them tap-by-tap by the intra-phase fraction,
+and dot-products the blended coefficients against the history window:
+
+```cpp
+typename Tr::Accum acc{};
+for (std::size_t t = 0; t < taps; ++t)
+ acc = Tr::mac(acc, hist[t], Tr::blend(c0[t], c1[t], fr));
+```
+
+Per output sample, per channel: one blend and one multiply-accumulate per
+tap. Now watch what happens in a multichannel stream. Every channel of an
+output frame is evaluated at the *same* μ — the channels advance through
+time together; that is what makes them a frame. So the coefficient blend,
+which depends only on μ, was being recomputed identically for every
+channel. For stereo, half of the blend work was duplicate. For twelve
+channels, eleven-twelfths of it was.
+
+The fix is the obvious factoring, and its entire risk profile lives in one
+question: does it change the output? Compute the blended row once per
+output frame into a small scratch buffer — at most 80 entries, the
+`transparent` preset's tap count — then run a plain dot product per
+channel:
+
+```cpp
+blendRow(bank, row, mu); // once per frame
+for (std::size_t c = 0; c < channels; ++c)
+ out[c] = dotRow(row, window(c), taps);
+```
+
+The arithmetic per channel is *identical* to the fused loop: blend then
+mac, per tap, in the same order, in the same types. Only the schedule
+changed — blend hoisted out of the channel loop. Identical operations in
+identical order produce identical bits, even in floating point, so the A/B
+had a correctness criterion stronger than any SNR threshold: outputs
+unchanged **bit-for-bit**. They were.
+
+The measured results, from the C1 entry in the performance log:
+
+| Measurement | Result |
+|---|---:|
+| Stereo pipeline, x86 wall-clock (same-machine A/B) | −36% |
+| 8-channel pipeline, x86 wall-clock | −52% |
+| M55 pipeline instructions, float / Q15 / Q31 | −15% / −30% / −21% |
+| Hexagon pipeline instructions, float / Q15 / Q31 | −3.6% / −3.3% / −0.2% |
+| Mono kernels, both targets | count-identical |
+
+Three of those rows deserve commentary; they carry the chapter's lessons.
+
+**The mono row is a control.** Mono has no duplicate blend — one channel,
+one blend, nothing to hoist — so the change should not touch the mono
+kernels at all. "Should not touch" is a hypothesis, and the deterministic
+counter can test it exactly: the mono kernel scenarios were
+count-identical, to the instruction, on both targets. Had they moved by
+even a handful of instructions, that would have meant the change did
+something beyond its stated mechanism — maybe harmless, maybe not, but
+either way the PR's description would have been wrong, and a wrong
+description is a review failure even when the numbers are green. Every
+subsequent effort in this campaign carries controls like this, and the
+discipline is worth stating as a rule: **a change must move what it claims
+to move, and everything else must measure 0.00%.** Wall-clock benchmarks
+cannot enforce that rule; nothing measures 0.00% on a shared Xeon.
+Instruction counting can, and does.
+
+**The scaling is the hypothesis confirmed.** Stereo −36%, 8-channel −52%:
+the win grows with channel count, exactly as a per-channel redundancy
+elimination should. Numbers that match the *shape* of the prediction, not
+just its sign, are how you know the mechanism you described is the
+mechanism that acted.
+
+**And then there is Hexagon.** The M55 dropped double-digit percentages;
+Hexagon barely moved — −3.6% at best, and the Q31 pipeline a rounding-error
+−0.2%. The same source code, the same factoring, the same eliminated
+arithmetic. An under-delivering result like this is where measurement-first
+culture earns its keep, because the temptation is to shrug — Hexagon is
+weird, ship the M55 win — and the log did not shrug. If eliminating most of
+the per-channel blend work barely dents the pipeline cost, then the
+pipeline cost must be dominated by something that is neither blend nor dot
+product. The remaining candidate was the per-sample *phase bookkeeping*:
+μ lived in a `double`, and Hexagon has no double-precision FPU, so every μ
+increment, wrap and index conversion was a soft-float library call. The
+kernels were cheap; the glue between kernels was expensive. The C1 entry
+records this diagnosis in one clause — "its pipelines are dominated by
+per-sample soft-double phase math" — and flags it as the motivation for
+C3. A disappointing result, read carefully, fingered the next target. That
+is not a consolation prize; over the campaign it turned out to be C1's
+second-most-valuable output.
+
+## C2: the audit — verify, don't assume
+
+Hypothesis 2 on the list was not a change at all. It was an audit: do the
+hot loops actually vectorize, under the compilers and flags the project
+ships with? Everyone who has read optimization folklore "knows" the
+answers — contiguous arrays vectorize, reductions vectorize, the compiler
+is smart. The project's rule, stated in the hypothesis itself: *verify,
+don't assume.* The tool is the compiler's own testimony:
+`-fopt-info-vec` on GCC, `-Rpass=loop-vectorize` (and its `-missed`
+sibling) on Clang, which report loop by loop what vectorized and what did
+not, and why.
+
+The audit produced four findings — one actionable, three that reshaped the
+rest of the campaign's roadmap.
+
+**Finding 1: `blendRow` vectorized, but behind a runtime aliasing check.**
+The compiler could not prove that the output row and the coefficient table
+don't overlap — they arrive as separate pointers, and separate pointers may
+alias — so it emitted *two* versions of the loop, vector and scalar, with a
+runtime overlap test choosing between them every call. The fix is the
+oldest annotation in the C toolbox, wrapped for portability:
+
+```cpp
+#if defined(_MSC_VER)
+#define SRT_RESTRICT __restrict
+#else
+#define SRT_RESTRICT __restrict__
+#endif
+```
+
+`SRT_RESTRICT` on the kernel pointer parameters is a promise to the
+compiler — these regions do not overlap — and the caller's structure makes
+the promise true: the row is a private scratch member, the table is
+immutable, the histories are distinct vectors. The versioning check and the
+dead scalar copy disappear. The header carries a comment tying the
+qualifier to the evidence (`verified with -fopt-info-vec; see
+docs/PERFORMANCE.md, hypothesis 2`), so the next maintainer knows it is
+load-bearing and not cargo cult.
+
+**Finding 2: the Q15 dot product auto-vectorizes, no help needed.** This is
+worth a paragraph, because *why* it vectorizes is the piece of theory the
+next two chapters stand on. A dot product is a reduction — every iteration
+folds into one accumulator, a serial dependence chain. Vectorizing it means
+computing partial sums in lanes and combining them at the end, which
+**reorders the additions**. For integer arithmetic that reordering is free:
+int64 addition is exactly associative, every 16×16 product is exact, so any
+order of summation produces the same bits. The compiler knows this and
+vectorizes integer reductions at `-O2` without being asked.
+
+**Finding 3: the float dot product is scalar — and stays scalar, by
+design.** Floating-point addition is *not* associative; reordering the
+accumulation changes the rounding, which changes the output bits. The
+library's float datapath promises double-precision accumulation in a
+defined order — that is part of what its measured 135 dB rests on — so the
+compiler correctly refuses to vectorize the reduction, and the project
+correctly declined to force it with `-ffast-math` or manual partial sums.
+The audit *did* record the option: explicit 4-way double accumulation would
+vectorize the float dot and change output bits, and it entered the log as
+**deferred hypothesis 5** — a bit-changing optimization, parked until the
+budget demands it and the quality harness can re-baseline around it. Hold
+that thought; hypothesis 5 has a surprising fate in the C6 chapter, where
+an axis nobody had listed makes float vectorization possible *without*
+changing a bit.
+
+**Finding 4: the Q31 dot product is scalar too**, for a blunter reason —
+baseline ISAs have no packed 64-bit multiply, and Q31 MACs need 32×32→64
+products. No annotation fixes an instruction set. Noted, filed, moved on.
+
+One actionable change, then: `SRT_RESTRICT` on the kernel signatures. The
+measured effect, and this time the controls are the headline:
+
+| Scenario | Δ instructions (M55) |
+|---|---:|
+| `pipeline_float` | −1.35% |
+| every other scenario, both targets | **0.00%** |
+
+On x86, a same-state wall-clock A/B measured −3.7% — the aliasing check sat
+in a hotter relative position there. But look at the M55 table with C1's
+rule in mind. The claim was narrow: *restrict removes a runtime aliasing
+check from `blendRow`*. The fixed-point pipelines blend through the same
+function — but their loop bodies differ, the versioning overhead lands
+differently, and on M55 only the float pipeline was paying measurably.
+Fine. What the claim *requires* is that nothing else moves: the qualifier
+is documentation to the optimizer, not arithmetic, so any scenario where
+the codegen was already clean must build to a bit-identical binary. All of them
+were, to the instruction. A −1.35% win surrounded by exact zeros is a
+*verified mechanism*. A −1.35% win alone is just a number.
+
+It is worth pausing on how unusual that sentence is. In most performance
+work, "this change affects only X" is a belief. Here it is a measurement,
+because the instrument has no noise floor. The ratchet infrastructure was
+built to catch regressions; the campaign discovered its second use almost
+immediately — it certifies *non-effects*, which is what turns an
+optimization PR from "trust me" into an experiment with controls. Chapter
+C6 will show the dramatic version: an embedded control that *failed* — a
+hosts-only feature that leaked +6–8% into the M55 — and stopped a merge.
+
+## What two efforts bought
+
+The scoreboard after C1 and C2: multichannel wall-clock roughly halved at
+high channel counts, double-digit instruction reductions on the M55
+pipelines, a `restrict` qualifier with a paper trail — and, less tangibly,
+three pieces of map. Hexagon's cost lives in soft-double phase math (C3's
+target). The M33-class parts, with no vector unit at all, will need
+something explicit for Q15 (C4's target). And the float dot product cannot
+be vectorized over taps without changing bits (the constraint C6
+eventually routes around). None of those three facts was known before; all
+three came from measurements that individually looked like disappointments
+or non-events.
+
+That is the method chapter's actual thesis. The loop — baseline, profile,
+one hypothesis, A/B with controls — is not bureaucracy around the real work
+of optimizing. On this evidence it *is* the real work: every effort in the
+next two chapters was aimed by an anomaly this chapter's measurements
+surfaced and refused to explain away.
+
+## Verify it yourself
+
+```sh
+# Host wall-clock benchmarks (Google Benchmark):
+cmake -B build -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_BENCHMARKS=ON
+cmake --build build -j
+./build/bench/srt_bench --benchmark_filter='Pipeline'
+
+# Wall-clock deltas in this chapter are same-machine A/Bs. To reproduce
+# one, run the benchmark at the parent commit and at the change on the
+# same machine in the same session — the project never gates on
+# wall-clock from shared runners, and neither should you.
+
+# The compiler's own vectorization testimony (C2's instrument):
+cmake -B build-vec -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_CXX_FLAGS="-fopt-info-vec-optimized -fopt-info-vec-missed"
+cmake --build build-vec -j 2>&1 | grep -i 'polyphase'
+
+# Deterministic instruction counts, exactly as CI gates them
+# (arm-none-eabi-gcc + qemu-system-arm + the counting plugin; see
+# .github/workflows/ci.yml for the plugin build):
+cmake -B build-m55 -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m55-mps3.cmake \
+ -DSRT_BUILD_TESTS=OFF -DSRT_BUILD_EXAMPLES=OFF \
+ -DSRT_BUILD_ICOUNT_BENCH=ON
+cmake --build build-m55 -j
+python3 scripts/icount.py --target m55 --build-dir build-m55 \
+ --plugin /tmp/libinsncount.so
+
+# The quality guardrail that every optimization PR had to clear:
+ctest --test-dir build -R Quality --output-on-failure
+```
+
+Run the icount harness twice and diff the outputs: identical, to the
+instruction. That reproducibility is the entire epistemology of this
+chapter — it is what lets "all other scenarios: 0.00%" be a finding
+instead of a hope.
diff --git a/book/src/part3/c3-c5.md b/book/src/part3/c3-c5.md
new file mode 100644
index 0000000..f497a71
--- /dev/null
+++ b/book/src/part3/c3-c5.md
@@ -0,0 +1,318 @@
+# The integer phase and the wide MACs (C3–C5)
+
+> If it disagrees with experiment, it is wrong. In that simple statement is the key to science.
+>
+> — Richard Feynman
+
+The previous chapter ended with an anomaly: C1 stripped most of the
+per-channel blend work out of the datapath, the M55 pipelines dropped by
+double digits, and Hexagon's barely moved. The diagnosis written into the
+log was that Hexagon's pipeline cost was dominated not by the kernels but
+by the glue between them — the per-sample phase bookkeeping, done in
+`double`, on a DSP with no double-precision FPU. Every μ update was a
+soft-float library call.
+
+This chapter is what happened when the campaign acted on that diagnosis,
+and then kept going: one clean win that falsified the project's own
+documentation on the way through (C3), one honest, bounded win on the
+smallest target (C4), and one implementation that was correct, complete,
+measured — and deliberately deleted (C5). The theme, if the chapter has
+one: **a negative result recorded is a win.** Not a moral victory — an
+actual asset, with a measurable replacement cost.
+
+## C3: evicting the last double from the per-sample path
+
+The fractional resampler tracks its position between input samples as a
+phase in [0, 1). Before C3, that phase was a `double` named μ, and the
+per-sample loop did double-precision work even on the fixed-point
+datapaths: advance μ by the rate ratio, detect wrap past 1.0 or below 0.0,
+scale by the phase count L, split into integer row index and fractional
+blend factor. On a Xeon this is noise. On a core where every double
+operation is a function call, it was — per C1's evidence — the dominant
+per-sample cost.
+
+The C3 design replaces the double with an unsigned **Q0.64** fixed-point
+fraction: a plain `uint64_t` whose full range represents [0, 1) with
+resolution 2⁻⁶⁴. Three properties make this format almost suspiciously
+well-suited to the job.
+
+**The unity part of the ratio never enters the accumulator.** This is the
+near-unity specialization paying out one more time. The converter's ratio
+is 1 + ε with |ε| servo-clamped to around 10⁻³, so the resampler advances
+one input frame per output frame *structurally* and only the deviation ε
+accumulates in the phase. ε is converted from the servo's double to a
+signed Q0.64 increment **once per `process()` call** — block rate, not
+sample rate. That single conversion is the only double arithmetic left
+near the hot path.
+
+**Wraparound detection is free.** Unsigned overflow is defined, modular
+arithmetic — the same property the ring buffer's monotonic indices leaned
+on in Part I. If adding a positive ε wraps the phase past 2⁶⁴, the sum
+comes out *smaller* than the old phase; that comparison **is** the slip
+detector, and the response is to consume one extra input frame. A negative
+ε wrapping below zero comes out *larger*, and the window is reused. No
+epsilon-comparisons against 1.0, no branch on the sign of a floating-point
+residual — two integer compares.
+
+**The table index and blend factor are bit fields.** L is a power of two,
+so the polyphase row index is simply the top log₂ L bits of the phase, and
+the intra-phase blend factor is the bits below, shifted up:
+
+```cpp
+const int lg = std::countr_zero(bank.numPhases());
+const std::size_t p = static_cast<std::size_t>(phase >> (64 - lg));
+const auto fr = Tr::blendFactorFromQ64(phase << lg);
+```
+
+No multiply by L, no floor, no subtract — shifts. The per-sample path is
+now integer-only on the fixed-point datapaths, plus one
+single-precision conversion on the float path.
+
+Notice also what the format does to *resolution*: a double's mantissa gives
+the old μ about 2⁻⁵² of precision; Q0.64 gives 2⁻⁶⁴. Twelve extra bits of
+phase resolution means less quantization of the sampling instant, and Part
+0's arithmetic connected phase jitter directly to distortion. So the C3
+entry contains a line that optimization logs almost never get to contain:
+**quality improved** — 135.0 dB at 997 Hz, versus the previous baseline,
+measured by the same pinned tests that would have failed the PR had it cost
+decibels. Faster and cleaner, from one change.
+
+### The falsification
+
+Now the numbers, and the embarrassing one first, because the log put it
+first. M55 instruction counts:
+
+| M55 scenario | Δ instructions |
+|---|---:|
+| pipeline Q15 | −5.3% |
+| pipeline Q31 | −4.6% |
+| pipeline float | **+1.4%** |
+
+The fixed-point wins were expected. The float *regression* was not — it
+contradicted the project's own documentation. The performance plan's
+hypothesis list had asserted, in writing, that the M55's float path was
+soft-double-bound, just like Hexagon's; the M55 was on the list of targets
+the integer phase was supposed to rescue. If that were true, replacing
+per-sample double math with int64 math should have helped the float
+pipeline too. Instead the float pipeline got slightly *worse*.
+
+One of three things had to be wrong: the measurement, the change, or the
+documentation. The measurement is deterministic and was reproduced. The
+change was doing exactly what it claimed on every other scenario. That
+left the documentation — and a check of the architecture manuals settled
+it: **the Cortex-M55's scalar FPU supports FP64.** Only its vector
+extension, Helium/MVE, is limited to fp16/fp32. The M55 float path had
+never been soft-double-bound; its doubles were cheap hardware doubles all
+along, and C3 had traded them for int64 sequences that cost slightly more.
+The genuinely double-less target in the fleet is Hexagon, and only
+Hexagon.
+
+The correction is recorded *in the hypothesis list itself* — hypothesis 3
+in `docs/PERFORMANCE.md` now reads as a correction notice ("discovered
+while measuring: Cortex-M55's *scalar* FPU does support FP64…"), so the
+false belief cannot quietly re-seed a future roadmap. And the +1.4% was
+accepted, eyes open, as the price of a cross-target win: the phase
+accumulator is one implementation shared by every datapath, and forking it
+per-target to claw back 1.4% on one scenario is exactly the per-arch
+complexity the stop rule exists to refuse.
+
+This is the campaign's cleanest specimen of the culture the introduction
+promised. A 1.4% regression on one scenario of one target is the kind of
+number a wall-clock benchmark would eat as noise. The deterministic
+harness surfaced it; the loop's rule — explain every number, especially
+the small ugly ones — forced the investigation; the investigation
+falsified a documented belief about the hardware. *The measurement audited
+the documentation*, not the other way around.
+
+### The target it was aimed at
+
+Hexagon, from the PR's gating run:
+
+| Hexagon scenario | Δ instructions |
+|---|---:|
+| pipeline Q31 | −15.5% |
+| pipeline Q15 | −10.3% |
+| pipeline float | −2.6% |
+| kernels (all types) | count-identical |
+
+The per-sample soft-double phase math that C1 had identified as dominating
+Hexagon's pipelines is simply gone. The kernel scenarios — which measure
+`interpolate()` in isolation, no phase bookkeeping — were count-identical,
+the control confirming the change touched only what it claimed. On x86, a
+same-minute A/B measured float −5.4% and Q15 −12.0% wall-clock; hosts keep
+score too, they just don't gate.
+
+C1 found the target; C3 hit it. That is the loop working across PRs, not
+just within one.
+
+## C4: two MACs per instruction, where the compiler won't
+
+Next on the list: explicit SIMD, "partially moot" before it started. The
+audit trail from C2 explains why. On the M55, objdump had confirmed that
+GCC already auto-vectorizes the Q15/Q31 kernels with Helium at -O2 — the
+M55's roughly 4× Q15 advantage over the scalar M33 in the baselines is MVE
+at work, no intrinsics required. But the fleet has a whole class of parts
+below the M55: Cortex-M33, M4, M7 — the Raspberry Pi Pico 2 class. These
+have no vector unit at all. What they *do* have is the Armv7E-M/Armv8-M
+**DSP extension**: scalar instructions that treat a 32-bit register as two
+16-bit lanes. The one that matters here is `SMLALD` — *signed multiply
+accumulate long dual* — which takes two such registers, forms both 16×16
+products, and adds both into a 64-bit accumulator. One instruction, two
+Q15 MACs: precisely the inner operation of `dotRow`, at double width.
+
+The bit-exactness argument is short enough to carry in your head, and it
+is the same argument C2's finding 2 established: every 16×16 product is
+exact in int32, int64 addition is associative, therefore summing the
+products in pairs instead of one-by-one changes no output bit. The
+intrinsic path and the scalar loop are not "close" — they are the same
+function, by construction. (Contrast the float dot, where this argument is
+exactly what fails.)
+
+The subtle part of C4 is not the intrinsic; it is the **gate**. Here is
+the actual block from `include/srt/polyphase_filter.hpp`, pulled in live:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:opt_smlald_gate}}
+```
+
+Read the condition: DSP extension present *and MVE absent*. The naive gate
+— "use the fast intrinsic wherever the ISA has it" — would have enabled
+SMLALD on the M55 too, where the compiler is currently vectorizing that
+loop with Helium. The intrinsic loop is hand-written; the compiler will
+not auto-vectorize *through* it; enabling it on the M55 would have
+silently replaced full vector arithmetic with dual scalar MACs — an
+optimization for one target acting as a pessimization on a better one.
+This is the MVE-gate discovery, and it generalizes: **an intrinsic is a
+floor and a ceiling at once.** The ratchet verified the gate the only way
+that counts — every M55 and every Hexagon scenario at exactly 0.00%,
+meaning those binaries are instruction-for-instruction unaffected by C4's
+existence.
+
+One routing consequence: mono Q15 on these targets now goes through
+`blendRow` + `dotRow` rather than the fused `interpolatePhase()`, because
+the dual-MAC loop lives in `dotRow` — legitimate only because C1
+established the two paths are bit-exact against each other.
+
+The result: **M33 `pipeline_q15` −3.1%.** And here the log does something
+worth imitating, under the heading of honest accounting. A 2×-wide MAC
+did not halve the frame cost, or even trim a tenth of it — why? Because the
+M33's Q15 frame cost is not dominated by the dot product. It is dominated
+by the coefficient *blend* — whose per-tap `fr * diff >> 15` is a 64-bit
+product, one `smull` each, already one instruction — and by transport
+around the datapath. The candidate follow-up, a packed blend, would change
+the documented int64 blend invariant that the bit-exactness proofs rest
+on, and was declined at current budgets. The entry even flags that the
+`kernel_q15` scenario still measures the fused `interpolate()` call, which
+C4 intentionally does not touch — so nobody later mistakes that flat
+number for a failed optimization. This is the campaign's honest draw: a
+real, kept, correctly-gated improvement, described at its true size, with
+the reason it is small written next to it.
+
+## C5: the revert, and why it is the best entry in the log
+
+Hexagon's turn. The C4 argument seemed to transfer directly, at double the
+width again: Hexagon's scalar ISA has `vrmpyh`, which forms **four** exact
+16×16 products per instruction and can feed an int64 accumulator. Four Q15
+MACs per instruction against a loop the profile says is pure MACs — the
+back-of-envelope says this is the biggest single win in the campaign.
+
+It was implemented. It passed the full test suite under Hexagon QEMU,
+bit-exactly — the same associativity argument held. And it measured:
+
+> `pipeline_q15`: 119,847,854 → 119,478,758 instructions. **−0.31%.**
+
+A 4-wide MAC bought less than a third of a percent.
+
+The reflexive explanation — the one everyone reaches for first — is that
+the compiler had already vectorized the loop, so the intrinsic replaced
+equivalent code. The team checked, because the reflexive explanation is
+also checkable: disassemble both binaries (CI's llvm-objdump, pre and
+post) and count the wide MACs. The baseline binary contains **zero**. The
+intrinsic build contains **10**. The compiler had *not* done it already;
+the intrinsic genuinely landed, four products at a time, exactly as
+designed — and it barely mattered.
+
+The real explanation is better, and it is the piece of architecture
+knowledge the whole effort purchased: **Hexagon's scalar ISA is already
+half a DSP.** Its ordinary instruction set has single-instruction 64-bit
+multiply-accumulates (`Rxx += mpy`) and 64-bit loads, so the "scalar"
+baseline loop was already running at a density that would take intrinsics
+to reach on a Cortex-M. On top of that, the history window is 2-byte
+aligned — it slides one sample at a time by design — so feeding `vrmpyh`'s
+packed operands costs combine/alignment work that eats most of what the
+wide multiply saves. The instruction *was* wide; the loop around it had to
+pay to keep it fed.
+
+Now the stop rule from the first page of this Part, firing on schedule:
+*the next win requires per-arch complexity the budget does not justify.*
+A −0.31% improvement is real — deterministic, reproducible, green across
+the suite. It is also the definition of not worth it: a Hexagon-specific
+intrinsic path is a second implementation to review, to gate, to keep
+bit-exact against the reference forever, purchased for three-tenths of a
+percent. The code was reverted. Not lost to a branch nobody can find —
+**reverted, with the entry as the deliverable**: the numbers, the
+disassembly evidence, and the analysis now live in `docs/PERFORMANCE.md`
+under C5, so the next engineer who has the vrmpyh idea (and someone will;
+it is a *good* idea) spends five minutes reading instead of two days
+re-deriving a dead end.
+
+The entry's final paragraph is the part that turned out to be prophetic.
+Having established that the win wasn't there in scalar-wide instructions,
+it asks whether HVX — Hexagon's actual 128-byte vector unit — could do
+better, and answers with a shape argument: a 48–80-tap dot product does
+not fill one HVX vector, and HVX's 16-bit MACs accumulate into 32-bit
+lanes, which overflows the library's exact-int64 invariant after about 24
+worst-case taps. **Per-channel dot products are the wrong shape for HVX**
+— not slow, *wrong-shaped*: the axis being vectorized (taps) is too short
+and demands too much accumulator width per lane. The shape that fits is
+turned ninety degrees: one 64-bit lane-pair per *channel*, sixteen
+channels filling one vector exactly — vectorize across channels, not
+across taps. That observation was recorded as hypothesis C6, and the next
+chapter is what happened when it met the float datapath's
+may-not-reorder-additions constraint and dissolved it.
+
+Score the chapter the way the introduction scored the campaign. C3: a win
+that corrected the project's documentation. C4: a draw, honestly sized.
+C5: a revert that produced no code and two durable facts — Hexagon's
+scalar MAC density, and the channel-axis insight that C6 is built on. The
+log entry for the revert cost nothing to keep and pointed directly at the
+campaign's largest remaining win. Negative results, *recorded*, compound.
+
+## Verify it yourself
+
+```sh
+# C3's quality claim — the pinned SNR thresholds (135 dB at 997 Hz for
+# the float path) are asserted by the test suite, not the docs:
+ctest --test-dir build -R Quality --output-on-failure
+
+# C3/C4 instruction counts on the Arm targets (the M33 leg is where C4's
+# −3.1% lives; every M55 scenario is C4's 0.00% control):
+cmake -B build-m33 -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m33-mps2.cmake \
+ -DSRT_BUILD_TESTS=OFF -DSRT_BUILD_EXAMPLES=OFF \
+ -DSRT_BUILD_ICOUNT_BENCH=ON
+cmake --build build-m33 -j
+python3 scripts/icount.py --target m33 --build-dir build-m33 \
+ --plugin /tmp/libinsncount.so
+
+# C4's gate, interrogated directly: preprocess for an M33 and for an M55
+# and watch SRT_Q15_SMLALD flip (DSP extension without MVE vs with):
+arm-none-eabi-gcc -mcpu=cortex-m33 -dM -E - < /dev/null
+```
+
+> If you were plowing a field, which would you rather use: two strong oxen or 1024 chickens?
+>
+> — attributed to Seymour Cray
+
+The campaign's last effort began with an inheritance and a constraint.
+
+The inheritance came from the C5 revert: per-channel dot products are the
+wrong *shape* for wide vector units — the tap axis is too short to fill
+wide registers, and lane-width accumulators can't honor the library's
+exact-arithmetic invariants. The shape that fits is rotated ninety
+degrees: put *channels* in the lanes.
+
+The constraint came from the C2 audit: the float dot product may not be
+vectorized over taps at any price, because reordering a double-precision
+accumulation changes output bits, and bit-stability is part of the
+library's contract with its own test suite. The one idea that had been on
+the table — explicit 4-way partial sums, deferred hypothesis 5 — was known
+to be bit-changing and sat parked.
+
+C6 is the observation that the inheritance dissolves the constraint. But
+before the observation was allowed to become code, the loop's first rule
+applied: profile first, even when the hypothesis arrives pre-argued.
+
+## The profile that killed the other hypothesis
+
+There was a competing story about where the multichannel money was. The
+resampler's input arrives interleaved — frame by frame, channels adjacent —
+and the datapath stored history *planar*, one delay line per channel. So
+every input frame was deinterleaved on arrival: a scatter, channel by
+channel, sample by sample. Scatters offend performance intuition, the code
+looked like a textbook strided-access antipattern, and "the deinterleave is
+eating us at high channel counts" was a perfectly plausible claim.
+
+Callgrind, on the 12-channel Q15 pipeline — the 7.1.4 deployment shape —
+settled it before any design work started:
+
+| Where the instructions went (12-channel pipeline) | Share |
+|---|---:|
+| per-channel dot-product MACs | ≈ 85% |
+| deinterleave | ≈ 2% |
+
+The scatter-cost hypothesis died in one profiler run. Two percent is not a
+target; you cannot win more than two points by optimizing it to zero.
+Eighty-five percent in the dots meant the frame's cost was, to first
+order, the MACs themselves — and for float, those MACs were running
+scalar, one channel at a time, by contractual necessity. If the dots were
+to get cheaper, it would have to be by *widening* them, and the only legal
+axis for widening was the one C5's postmortem had pointed at.
+
+Worth pausing on the method point: the profile did not suggest the design
+— C5's shape argument did that. The profile's job was to confirm the
+design was aimed at the cost, and to kill the alternative before it
+consumed a week. Both jobs took one afternoon.
+
+## The insight: lanes are channels, not taps
+
+Here is the argument at the level of arithmetic, because everything else
+in C6 is bookkeeping around it.
+
+A vectorized loop computes several elements of *something* per
+instruction. Tap-axis vectorization computes several taps of **one
+channel's** sum per instruction — partial sums, combined at the end, which
+reorders that channel's additions and changes its rounding. Forbidden by
+the float contract.
+
+Channel-axis vectorization computes one tap of **several channels'** sums
+per instruction. Lane k holds channel k's accumulator; each step of the
+tap loop multiplies each lane's history sample by the *same* broadcast
+coefficient and adds into that lane. Watch what happens to any single
+channel: its accumulator receives tap 0, then tap 1, then tap 2 — the
+identical operations in the identical order as the scalar `dotRow`. No
+channel's sum is ever split, reassociated, or combined with another's. The
+channels were always independent computations; SIMD lanes are independent
+computations; the map is exact.
+
+So the channel axis is bit-exact *even for float* — not approximately,
+not within tolerance: the same IEEE operations in the same order per
+channel, hence the same bits. It is the only vectorization axis the float
+contract permits, and it had been sitting in plain sight behind a storage
+decision: you cannot load "tap t of channels 0..7" in one instruction if
+every channel lives in its own array. Channels-in-lanes requires
+**frame-major** history — samples of one frame adjacent in memory.
+
+Which is interleaved order. The deinterleave the profile had just measured
+at 2%? For the channel-parallel path it isn't cheap — it's *deleted*.
+Frames are copied into the history window as-is, one contiguous `memcpy`
+per frame.
+
+## Frame-major storage, concretely
+
+The storage change is worth seeing at the level of the data structure,
+because it is where a reader of `polyphase_filter.hpp` will first bump
+into C6. Below `SRT_CP_MIN_CHANNELS`, the resampler keeps what Part I
+described: one delay line per channel, `hist_[c]`, and a per-channel
+`window(c)` pointer for the planar dot. At or above the threshold, on
+channel-parallel targets, there is a single delay line, `hist_[0]`, whose
+slots are whole frames — `channels` samples wide — and the bookkeeping
+indices (`end_`, the capacity) count *frames* in both modes, so the sliding
+and compaction logic upstream is shared rather than forked. When the
+window slides past the end of the allocation, the same `memmove`
+compaction runs in either mode; the only difference is the width of a
+slot. Appending input is where the modes diverge visibly: planar scatters
+each frame's samples into their per-channel lines, frame-major copies the
+frame in one shot, still interleaved.
+
+The kernel then reads the window with a stride. For output frame *n*, the
+newest `taps` frames sit contiguously at the end of the window; tap *t* of
+channel *k* lives at `base[t * channels + k]`. The tile's inner step loads
+`frame[0..K-1]` — K adjacent samples, one vector load when K matches the
+lane count — multiplies by the broadcast coefficient, and accumulates. The
+per-frame schedule is C1's, unchanged: one `blendRowPhase()` per output
+frame, then all channels' dots; C6 replaces only the per-channel dot loop
+with a single channel-parallel pass over the frame-major window.
+
+The threshold itself was measured, not assumed. At one and two channels
+the planar path is better — the blend-share structure from C1 already
+amortizes the expensive part, the tile machinery has fixed overhead, and
+mono additionally keeps its fused no-scratch fast path — so
+`SRT_CP_MIN_CHANNELS` defaults to 4, the point where the lanes start
+paying. It is deliberately an overridable macro rather than a hard-coded
+constant: the A/B that chose 4 (`-DSRT_CP_MIN_CHANNELS=...` on the
+benchmark build, both directions) stays reproducible by anyone who
+suspects their machine bends the crossover.
+
+## Two traps, both recorded
+
+The performance log preserves two implementation lessons from C6, both of
+the kind that cost a day and read as obvious afterward. They are in the
+log precisely so they only cost a day once.
+
+**Trap one: the naive nest is worse than nothing.** The first-cut
+channel-parallel loop — taps outer, channels inner, accumulators in a
+plain memory array sized at runtime — measured **2.8× slower than
+planar**. Not slower than the target; slower than the *unoptimized
+baseline*. With the accumulator array's size unknown at compile time, the
+compiler could not promote the accumulators to registers, so every one of
+the frame's `taps × channels` MACs round-tripped its accumulator through
+the stack — load, fuse, store — and the memory traffic swamped the SIMD
+gain several times over. The fix is register blocking, and it must be
+structural, not hopeful: a fixed-size tile of K channels whose K
+accumulators live in a `constexpr`-sized local array the compiler
+demonstrably keeps in registers. The library's tile is a template on K,
+taken live from `include/srt/polyphase_filter.hpp`:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:opt_dot_tile}}
+```
+
+The doc comment carries the 2.8× number in the header itself — the trap
+is documented at the exact place where a refactor would re-arm it. The
+lesson generalizes: **register-block or don't bother.** A vector unit fed
+from the stack is slower than scalar code fed from registers.
+
+**Trap two: the mode gate must be free where the mode is off.** C6 is a
+hosts-only, float-only feature; the embedded targets keep their proven
+codegen — Helium auto-vectorization on the M55, C4's SMLALD on the
+M33 class, Hexagon's measured scalar floor. The first implementation
+selected the path with an ordinary runtime boolean, and the embedded
+ratchet — the instrument that certifies non-effects — refused the PR:
+**+6–8% on M55 scenarios** that were supposed to measure 0.00%. A bool
+that is false on every embedded run still costs branches in the hot loops
+and blocks the optimizer around them. The gate had to become
+compile-time — a `constexpr` flag that constant-folds the entire
+channel-parallel path out of existence on non-host targets — after which
+every embedded scenario returned to exactly 0.00% and the PR merged.
+
+Note who caught it: not review, not intuition — the controls. A
+wall-clock smoke test would never have resolved 6% on an emulated M55,
+because nothing wall-clock touches an emulated M55 at all. The
+deterministic ratchet made "this feature is free where it is disabled" a
+gated, falsifiable claim, and its first draft was false. This is the
+failure-mode twin of C2's all-zeros table, and together they are the
+strongest advertisement in the campaign for counting instructions.
+
+## The dispatcher and the odd channel counts
+
+Real streams are not all multiples of eight, so the channel loop runs a
+cascade of tiles — 8, then 4, then 2, then 1 — each a specialization of
+the same template:
+
+```cpp
+{{#include ../../../include/srt/polyphase_filter.hpp:opt_dot_rows}}
+```
+
+Twelve channels is 8+4. Sixteen is 8+8. The deployment shapes exercise
+only the big tiles — which is exactly why the odd shapes need deliberate
+tests: **5 channels (4+1) and 7 channels (4+2+1) are the only tested counts
+that reach the K=1 and K=2 remainder tiles.** Ship only 8/12/16-channel tests
+and two of the four tile instantiations are dead code with green CI —
+until a 7-channel consumer instantiates them in production.
+`tests/test_multichannel.cpp` therefore carries dedicated 5- and 7-channel
+independence tests (`MultiChannelShort.Independence5chFloat`,
+`Independence7chFloat`), with a comment stating the tiling arithmetic so
+nobody "simplifies" them away as redundant with the 12-channel test. An
+untested tile is untested code, however trivially it falls out of a
+template.
+
+## The results, and what they scale with
+
+Same-minute A/B on the host, float pipelines:
+
+| Configuration | Δ wall-clock |
+|---|---:|
+| 8-channel, AVX2+FMA (`-march=native`) | −38% |
+| 12-channel, AVX2+FMA | −38% |
+| 16-channel, AVX2+FMA | −42% |
+| 8–16-channel, baseline SSE2 build | −4–5% |
+
+Bit-exactness was not left to the argument above, clean as it is: the
+channel-parallel outputs were **hash-verified against the planar path over
+30,000 blocks across four configurations**. The proof says the bits must
+match; the harness removes the possibility that the proof described a
+slightly different loop than the one that shipped. (Part I's ring-buffer
+chapter made the same move with its wraparound proof. The habit is the
+point.)
+
+The spread between the SSE2 row and the AVX2 rows is the design's
+signature, and the log calls it out: **gains scale with SIMD width.** The
+C++ contains no intrinsics — no AVX2 code, no dispatch tables. It exposes
+an axis: independent accumulator lanes, contiguous frame-major loads, a
+broadcast coefficient, register-resident tiles. What the axis is worth
+depends on the vector unit the compiler is targeting — 4 double lanes with
+FMA under AVX2, 2 under SSE2, wider still if a consumer builds for
+AVX-512. For a header-only library this division of labor is exactly
+right: the header is compiled inside the consumer's translation units,
+with the consumer's flags, so *the consumer chooses the SIMD width* and
+the same source meets them at whatever width they paid for.
+
+### The fixed-point half: a negative result, kept as a boundary
+
+C6's hypothesis was drafted for HVX and Q15; it shipped for hosts and
+float. The fixed-point measurement is the reason, and it is the campaign's
+last negative result: **channel-parallel Q15 measured ~1.5× slower than
+planar on hosts, and planar was kept.**
+
+No mystery, just the C2 audit paying out one more time. The planar Q15 dot
+*already* auto-vectorizes — over taps, the axis the float path is denied,
+because integer reduction is exactly reassociable and the compiler has
+been quietly exploiting that since C2 verified it. So for Q15 the
+channel-parallel form was not competing against scalar code; it was
+competing against the compiler's own tap-axis vectorization, from a
+storage layout chosen for a different sample type, and it lost. The two
+datapaths end up mirror images, each vectorized along the one axis its
+arithmetic permits, neither layout right for both:
+
+| | Tap axis | Channel axis |
+|---|---|---|
+| float | forbidden (reorders double accumulation) | **C6: bit-exact, −38–42%** |
+| Q15 | **already auto-vectorized (reassociable)** | measured ~1.5× slower |
+
+So the shipped rule is: float at ≥ `SRT_CP_MIN_CHANNELS` channels goes
+frame-major and channel-parallel; fixed-point stays planar everywhere; the
+embedded targets see none of it, enforced at 0.00% by the ratchet. And
+hypothesis 5 — the bit-changing 4-way float accumulation from C2 — was
+superseded without ever being implemented: for four channels and up, C6
+vectorizes the float path *without* changing a bit, leaving hypothesis 5
+relevant only to mono and stereo float, if a budget ever demands it.
+
+## Where the campaign stopped
+
+The embedded version of the channel axis — HVX with one 64-bit lane-pair
+per channel, sixteen channels filling one 128-byte vector exactly; Helium
+similarly on the M-class — remains in the log as a follow-up candidate
+*if DSP budgets demand it*. That phrasing is the stop rule doing its job
+one last time. The per-channel dots were measured at 85% of the
+12-channel pipeline; the shape is proven on hosts; the invariant analysis
+for HVX is already written down in the C5 entry. The next engineer starts
+from all of that — or never starts, because no budget ever asks. Either
+outcome is correct. Optimization stops by budget, not by exhaustion.
+
+Six efforts. C1 and C2, the method proving itself on easy money and
+cataloging the walls. C3, the integer phase — the biggest embedded win,
+and a falsified line of documentation as a bonus. C4, the honest draw,
+its small size explained rather than excused. C5, the deliberate revert
+that paid for this chapter. C6, the campaign's insight compounding: a
+shape argument from a failed Hexagon experiment, applied through a
+constraint mapped by a vectorization audit, aimed by a callgrind profile,
+guarded by an instruction ratchet, landing a 38–42% win that is bit-exact
+to the code it replaced. None of these six PRs would look impressive in
+isolation. The system that produced them — one hypothesis at a time, every
+number explained, every dead end recorded — is the thing this Part was
+actually about.
+
+## Verify it yourself
+
+```sh
+# The profile that aimed C6 (Linux, valgrind + callgrind):
+cmake -B build -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_BENCHMARKS=ON
+cmake --build build -j
+valgrind --tool=callgrind ./build/bench/srt_bench \
+ --benchmark_filter='Pipeline_Q15_Balanced_12ch' \
+ --benchmark_min_time=1x
+callgrind_annotate callgrind.out.* # dots ≈ 85%, deinterleave ~2%
+
+# The headline A/B: build once portable, once with native SIMD, and run
+# the float multichannel benches on the same machine, same session
+# (wall-clock is never a hard gate; only a same-machine ratio is real):
+cmake -B build-native -DCMAKE_BUILD_TYPE=Release \
+ -DSRT_BUILD_BENCHMARKS=ON -DCMAKE_CXX_FLAGS="-march=native"
+cmake --build build-native -j
+./build/bench/srt_bench --benchmark_filter='Float_Balanced_(8|12|16)ch'
+./build-native/bench/srt_bench --benchmark_filter='Float_Balanced_(8|12|16)ch'
+
+# Bit-exactness and the remainder tiles (5 = 4+1 and 7 = 4+2+1 are the
+# only tested counts that reach the K=1/K=2 tiles):
+ctest --test-dir build -R MultiChannel --output-on-failure
+
+# The A/B knob C6's threshold was chosen with — force the planar path
+# and watch the multichannel float benches give the win back:
+cmake -B build-planar -DCMAKE_BUILD_TYPE=Release \
+ -DSRT_BUILD_BENCHMARKS=ON \
+ -DCMAKE_CXX_FLAGS="-march=native -DSRT_CP_MIN_CHANNELS=64"
+cmake --build build-planar -j
+./build-planar/bench/srt_bench --benchmark_filter='Float_Balanced_12ch'
+
+# The claim that embedded targets are untouched is CI-gated, not
+# rhetorical — every M55/M33/Hexagon scenario sits at 0.00% vs
+# bench/baselines.json:
+python3 scripts/icount.py --target m55 --build-dir build-m55 \
+ --plugin /tmp/libinsncount.so
+```
+
+The last command is the quiet one, and it is the one to internalize. A
+38% win on the machine in front of you is easy to believe. The mature
+habit is demanding equally hard evidence for the other half of the claim
+— that every target which was promised *nothing* received exactly that.
diff --git a/book/src/part4/c-abi.md b/book/src/part4/c-abi.md
new file mode 100644
index 0000000..2c2d6a6
--- /dev/null
+++ b/book/src/part4/c-abi.md
@@ -0,0 +1,325 @@
+# The C ABI
+
+> Be conservative in what you do, be liberal in what you accept from others.
+>
+> — Jon Postel, RFC 761
+
+This chapter exists because of a plot. Part II's notebooks are the
+library's most persuasive evidence — the servo locking from a cold start,
+the 135 dB money plot, the naive-FIFO spectrogram full of clicks — and
+every curve in them comes from the *actual shipping library*, not a Python
+reimplementation of it. A reimplementation would prove nothing: the entire
+point of measurement-first development is that you measure the artifact you
+ship. So Python has to call the C++ converter.
+
+And Python cannot. Neither can Julia, nor anything else that loads shared
+libraries at run time, because C++ deliberately has no stable binary
+interface. Symbol names are mangled, and the mangling differs by compiler.
+Exceptions, RTTI, and the layout of `std::` types differ by compiler *and*
+by standard-library vendor. A template — and this whole library is
+templates — doesn't exist in a binary at all until something instantiates
+it. The one interface every FFI on earth speaks (`ctypes`, `cffi`, Julia's
+`ccall`, Rust's `extern "C"`, every scripting language's dlopen wrapper) is
+the C ABI: plain functions, plain data, names that mean what they say.
+
+So the library ships a shim: `tools/capi/`, about ninety lines of C++
+presenting a C face, built as a shared library with `-DSRT_BUILD_CAPI=ON`.
+This chapter is small because the shim is small, but three of its design
+decisions were paid for the hard way — one by a compile error, one by an
+audit finding, and one by a toolchain that turned out to be unable to
+throw an exception at all.
+
+## The surface: eight functions
+
+The entire foreign-function interface:
+
+```c
+{{#include ../../../tools/capi/srt_capi.h:abi_surface}}
+```
+
+Create, destroy, push, pull, status, latency, reset, version. The shim
+wraps the *float* converter only — the notebooks are metrology instruments
+and float is what they measure with; tripling the surface for Q15/Q31
+would triple the contract for consumers that don't exist yet. Minimalism
+here is a feature: every function in an ABI is a promise you keep forever.
+
+`SrtHandle` is the classic opaque-handle pattern: a `typedef` of a struct
+that is *declared* and never *defined*. C callers can hold a
+`SrtHandle*`, pass it around, and store it — but never dereference it,
+size it, or copy what it points to, because the compiler has no idea what
+it is. Compared to the lazier convention of handing out `void*`, the named
+opaque type keeps some type checking alive at the boundary: pass a
+`FILE*` where an `SrtHandle*` belongs and a C compiler will at least warn.
+The pointer's true identity lives on the other side of the wall.
+
+## Two `extern "C"` blocks and the lesson between them
+
+Here is the other side of the wall, and the file structure is itself a
+fossil of a compile error:
+
+```cpp
+{{#include ../../../tools/capi/srt_capi.cpp:abi_impl}}
+```
+
+The handle is simply the converter pointer in disguise —
+`reinterpret_cast` in `srt_create`, `reinterpret_cast` back on every call.
+No wrapper struct, no registry of live handles, no indirection table:
+there is nothing to store beyond the object itself, so the handle *is* the
+object.
+
+Look at where the `impl()` helpers live: in an anonymous namespace,
+*between* two `extern "C"` regions rather than inside one. That placement
+is load-bearing. There are two `impl()` functions — one taking
+`SrtHandle*`, one taking `const SrtHandle*` — which is to say, `impl` is
+**overloaded**. And overloading is illegal for functions with C linkage:
+C has no name mangling, both overloads would demand the same symbol name,
+and the program is ill-formed. Write the helpers in the obvious place —
+inside the `extern "C"` block where everything else in the file lives —
+and the compiler stops you cold. That is exactly how it was discovered
+here. The fix is what you see: the helpers sit outside the C-linkage
+region, in an anonymous namespace that both gives them ordinary C++
+linkage (overloading welcome) and keeps them out of the shared library's
+exported symbol table, where an FFI user enumerating symbols would only be
+confused by them. The general rule: `extern "C"` is for the eight names
+you are promising to the world, and *nothing else* belongs inside it.
+
+## The error convention, and why every function tolerates `NULL`
+
+The shim's entire error vocabulary is one value:
+
+```cpp
+{{#include ../../../tools/capi/srt_capi.cpp:abi_create}}
+```
+
+`srt_create` returns `NULL` on invalid configuration or allocation
+failure. No error codes, no `errno`, no last-error string: for a
+constructor with a handful of scalar parameters, "it didn't work, and the
+header tells you the two reasons it can't" is a complete diagnostic, and
+every additional error channel is more contract to keep frozen forever.
+
+The subtle decision is downstream of that one. The first version of this
+shim checked nothing: `srt_push` cast the handle and called through it,
+unconditionally. The hardening audit changed every entry point to this
+shape:
+
+```cpp
+{{#include ../../../tools/capi/srt_capi.cpp:abi_null}}
+```
+
+The reasoning is stated in the file's own header comment, and it is worth
+reading as a small essay on API design:
+
+```cpp
+{{#include ../../../tools/capi/srt_capi.cpp:abi_doc}}
+```
+
+A "check create for NULL" convention *concentrates* failure on precisely
+the caller who forgot the check — the one writing quick notebook code, the
+one least prepared for a segfault in a foreign runtime where the crash
+arrives with no C++ stack and no Python traceback, just a dead kernel.
+With the guards, an unchecked failed create degrades to a converter that
+accepts nothing and produces zeros: `srt_pull` returns silence, which is —
+not coincidentally — the same thing the real converter produces on
+underrun. The failure is still visible (`srt_status` reports zeros, the
+audio is silent), but it is *debuggable* instead of fatal. Eight null
+checks on functions that move hundreds of frames per call cost nothing
+measurable; they buy an FFI that fails the way dynamic-language users can
+diagnose.
+
+## The header is the contract
+
+`srt_capi.h` did not exist in the shim's first version — the notebook
+simply re-declared the prototypes in `ctypes`, which worked and proved
+nothing for anyone else. The audit shipped the header, and its top comment
+is the ABI's real substance — the part no binary interface can encode:
+
+```c
+{{#include ../../../tools/capi/srt_capi.h:abi_contract}}
+```
+
+Three promises deserve emphasis, because each answers a real foreign-caller
+failure mode.
+
+**Thread affinity is spelled out per function.** The C++ API's
+single-producer/single-consumer contract (the ring chapter) does not
+dissolve because the caller is Python or Julia — but an FFI user cannot see
+`std::memory_order` annotations, so the header must say it in words: one
+thread pushes, one thread pulls, `srt_status` from anywhere,
+`srt_reset_from_consumer` only from the consumer, create/destroy never
+concurrent with anything. An ABI that documents signatures but not thread
+affinity has documented the easy half.
+
+**`size_t` follows the platform ABI.** On every 64-bit host this is
+invisible; on a 32-bit target (and this library ships to several) `size_t`
+is 32 bits, and a foreign declaration hard-coding `uint64` for `frames`
+corrupts the argument list. `ctypes.c_size_t` tracks the platform
+automatically — the notebook uses it — but `cffi` and Julia users write
+their own declarations, so the header says it explicitly. This is the kind
+of sentence you only think to write after watching Part IV's 32-bit ports
+in action.
+
+**`srt_version()` is a probe.** It returns
+`major*10000 + minor*100 + patch` — `100` for today's 0.1.0. A version
+*macro* would vanish into the caller's compile; a version *function*
+reports what the loaded shared library actually is, which is the question
+an FFI user is really asking when their symbols don't match their
+expectations. It is also the cheapest possible smoke test that the DSO
+loaded and calls marshal correctly — one integer, no state, no handle.
+
+## Six doubles and two return values: marshaling without a struct
+
+Two smaller conventions in the surface reward a moment each, because both
+are shaped by what FFIs do badly.
+
+`srt_status` reports six quantities — state, ppm estimate, FIFO fill,
+underruns, overruns, resyncs — and the obvious C design is a struct.
+The shim instead fills a caller-provided `double out[6]`. A struct
+returned across an FFI boundary is a *layout* contract: the foreign side
+must re-declare every field, in order, with matching types, padding, and
+alignment, and nothing checks the re-declaration — get it wrong and the
+fields silently shear. An array of one scalar type is the
+lowest-common-denominator marshaling that every FFI on earth handles in
+one line (`(ctypes.c_double * 6)()` in the notebook). The price is that
+counters and an enum ride in doubles — harmless, since a double carries
+integers exactly to 2⁵³ and the header documents each slot by index. One
+type, one array, zero layout risk: for six values polled a few times per
+second, the trade is not close.
+
+The push/pull return values encode the real-time contract from the ring
+chapter, translated for callers who never read it. `srt_push` returns the
+frames *accepted*, which may be fewer than offered — the clipped write
+when the FIFO is full. `srt_pull` is deliberately asymmetric: it **always
+fills** the requested frames, substituting silence while the converter is
+still filling or after an underrun, and its return value reports how many
+frames came from real input. An audio callback must hand *something* to
+the DAC in bounded time; an API that could return "no data, try again"
+would push retry logic — and the opportunity to get it wrong — into every
+consumer. Silence-on-shortfall keeps the failure mode the library already
+promised (a dropout sounds like silence, then a fade back in), and the
+return value keeps it observable. FFI code that ignores both return values
+still plays audio; FFI code that reads them gets telemetry. Both are valid
+clients, and neither can deadlock or glitch the other side.
+
+## Exceptions must not cross — and one target where they cannot even fly
+
+Look again at `srt_create`'s body: the `new` is wrapped in
+`try { ... } catch (...) { return nullptr; }`. This is not defensive
+decoration. A C++ exception that propagates out of an `extern "C"`
+function into a C caller is undefined behavior — there is no agreement
+about what unwinding even *means* across that boundary, and the practical
+result ranges from `std::terminate` to stack corruption inside a foreign
+interpreter. The converter's constructor is the one place this library
+throws (`Config` validation and allocation); the shim's job is to convert
+that exception into the ABI's error vocabulary — `NULL` — before it
+reaches the boundary. `catch (...)` rather than `catch (const
+std::exception&)` because the boundary does not care *what* was thrown;
+everything becomes `NULL`.
+
+Now the hard lesson, recorded in `docs/PERFORMANCE.md` under *Known debt*.
+One of this library's supported toolchains — the Hexagon static-musl
+configuration from the Part IV DSP chapter — **cannot unwind at all**. Its
+runtime lacks the unwinder: when a constructor throws, the exception does
+not propagate to *any* catch block, anywhere; the process terminates via
+`libc++abi`. This was not discovered by reading toolchain documentation.
+It was discovered the day the first `EXPECT_THROW` test reached that CI
+leg and the test *runner* died — the `ConfigValidation` suite is excluded
+on Hexagon to this day, and the candidate fix
+(`-unwindlib=libunwind` in the toolchain file) sits unclaimed in the debt
+list.
+
+Think through what that does to this shim's design. The `catch (...)` in
+`srt_create` is *necessary* — on normal targets it is the entire error
+mechanism — but on a no-unwind target it is **unreachable**: the throw
+terminates the process before the catch can run. A caller on such a target
+cannot be saved by any code positioned *after* the throw. The only
+placement that works is *before* it: **validate, then construct.** The
+deployment guidance in the debt entry says exactly this: on that
+toolchain, treat an invalid `Config` as fatal and validate inputs *before*
+constructing — check them against the constraints the constructor
+enforces (positive finite sample rate, nonzero channels, band edges that
+sum under the rate, and the rest of `validated()`'s list) so the
+constructor is never asked to throw. It is a weaker mechanism than a
+`catch`, and that is the point: it is the strongest mechanism the target
+actually has.
+
+The generalizable ABI lesson: an FFI boundary that reports failure by
+*catching* is betting that every target can unwind, and that bet is not
+safe even within one library's own CI matrix. Error strategies that
+*return* — validate-before-construct, factory functions, status codes —
+degrade gracefully on runtimes where error strategies that *throw* simply
+end the process.
+
+## The client: forty lines of ctypes
+
+The notebook's first code cell is the reference consumer, and it exercises
+every clause above: it locates the DSO (building it on first run), declares
+each prototype, and wraps the handle in a small numpy-aware class. Two
+lines carry the load:
+
+```python
+_lib.srt_create.restype = ctypes.c_void_p
+_lib.srt_push.argtypes = [ctypes.c_void_p, _FLOATP, ctypes.c_size_t]
+```
+
+Without the explicit `restype`, `ctypes` assumes functions return a C
+`int` — on a 64-bit machine the handle comes back truncated to its low 32
+bits, and the crash lands on some *later* call, far from the actual
+mistake. Declaring the full prototypes is the ctypes equivalent of
+including the header, and `c_size_t` is the notebook honoring the width
+caveat. The wrapper's `__del__` calls `srt_destroy` (guarded, per the
+convention, against a handle that never existed), and its constructor
+asserts `srt_create` succeeded — the check the null-tolerance exists to
+forgive, present anyway, because tolerance is for accidents, not policy.
+Everything downstream — the lock-acquisition plot, the ≥125 dB
+transparency assertion, the impulse-response latency check that agrees
+with `srt_designed_latency_seconds()` to within 0.3 ms — runs through
+these eight functions.
+
+## Why these ~90 lines look the way they do
+
+| Decision | Alternative rejected | Reason |
+|---|---|---|
+| C shim over the C++ API | Python bindings / pybind11 | one C ABI serves ctypes, cffi, Julia, and everything else; bindings serve one language and drag in a build dependency |
+| Float converter only | mirror all three sample types | the consumers are metrology notebooks; unused surface is unpaid-for contract |
+| Named opaque handle | `void*` | keeps compiler type-checking alive at the FFI edge |
+| Handle = object pointer, `reinterpret_cast` | handle registry / wrapper struct | there is nothing else to store; indirection would add state and failure modes |
+| `impl()` overloads outside `extern "C"` | helpers inside the block | overloading is ill-formed with C linkage — the compiler enforced this one personally |
+| `NULL` return + null-tolerant entry points | "caller must check" | the convention otherwise concentrates crashes on exactly the caller who forgot, in a runtime with no useful stack trace |
+| `catch (...)` → `NULL` in `srt_create` | let exceptions cross | UB across the C boundary; and see the next row |
+| Validate-before-construct guidance | rely on the `catch` | one supported toolchain cannot unwind at all — a throw terminates before any catch runs |
+| `srt_version()` function | version macro | reports the loaded binary, not the caller's compile-time assumption |
+| Thread affinity + `size_t` width in the header | "see the C++ docs" | the header is the only artifact an FFI consumer reads |
+
+## Verify it yourself
+
+```sh
+# Build the shared library:
+cmake -B build -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_CAPI=ON
+cmake --build build --target srt_capi -j
+
+# The exported surface — eight srt_* symbols, unmangled, and nothing else
+# from this file (the impl() helpers are invisible, as promised):
+nm -D --defined-only build/tools/capi/libsrt_capi.so | grep srt_
+
+# The one-integer smoke test (0.1.0 -> 100):
+python3 -c "import ctypes; \
+ print(ctypes.CDLL('build/tools/capi/libsrt_capi.so').srt_version())"
+
+# The null-tolerance convention, exercised directly — no crash, zero frames:
+python3 -c "import ctypes; lib = ctypes.CDLL('build/tools/capi/libsrt_capi.so'); \
+ lib.srt_create.restype = ctypes.c_void_p; \
+ print('bad create:', lib.srt_create(ctypes.c_double(-1.0), 0, 0, 1)); \
+ print('push on NULL:', lib.srt_push(None, None, 128))"
+
+# The full reference client, plots and assertions included:
+jupyter nbconvert --to notebook --execute notebooks/asrc_demo.ipynb \
+ --output /tmp/asrc_demo_run.ipynb
+
+# Break it on purpose: move the two impl() overloads inside the extern "C"
+# block and rebuild — the compiler rejects the overload set, which is the
+# whole story of this file's structure in one diagnostic.
+```
+
+The second Python one-liner is the chapter's argument compressed: an
+invalid configuration and a forgotten check, and the program prints
+`None` (ctypes' spelling of a `NULL` handle) and `0` instead of dying.
diff --git a/book/src/part4/cortex-m.md b/book/src/part4/cortex-m.md
new file mode 100644
index 0000000..9701be3
--- /dev/null
+++ b/book/src/part4/cortex-m.md
@@ -0,0 +1,407 @@
+# Cortex-M: bare metal, two ways
+
+> Civilization advances by extending the number of important operations which we can perform without thinking about them.
+>
+> — Alfred North Whitehead
+
+The Hexagon port ran the library on a strange ISA under a familiar OS.
+The Cortex-M ports remove the OS. No loader, no threads, no filesystem,
+no `argv`, no reliable way to even return an exit code — and the library
+must still build, run its test suite, and hold its instruction budgets,
+because MCU-class parts are where a $5 deployment actually lives.
+
+The project runs two of them, and the pairing is deliberate. Each board
+exists to prove something the other cannot:
+
+- **Cortex-M55**, on QEMU's MPS3 AN547 board model. The M55 has Helium
+ (MVE, the M-profile vector extension) and a full scalar FPU. It proves
+ the library survives *bare metal itself* — the startup, the memory
+ map, the missing runtime — and it turned out to be hiding the single
+ most surprising compiler discovery in the project's history.
+- **Cortex-M33**, on QEMU's MPS2+ AN505 board model. The M33 is the
+ Raspberry Pi Pico 2 / RP2350 class of core: single-precision FPU only,
+ no Helium, DSP extension present. It proves what deployment on a cheap
+ part actually costs, in numbers concrete enough to be budgets.
+
+Both share one startup file and one CTest strategy; they differ in linker
+script and in what their instruction counts taught the project. This
+chapter covers the shared anatomy first, then the two boards' discoveries.
+
+## What `-nostartfiles` obligates you to
+
+The toolchain files (`cmake/arm-cortex-m55-mps3.cmake`,
+`cmake/arm-cortex-m33-mps2.cmake`) link with `--specs=rdimon.specs
+-nostartfiles`: newlib with semihosted I/O, and *no* toolchain crt0. From
+that moment the project owes the CPU everything crt0 used to provide, and
+the debt is paid in one C file, `platform/armv8m_startup.c`, shared by
+both targets.
+
+It starts where the core starts — the vector table:
+
+```c
+{{#include ../../../platform/armv8m_startup.c:pt_vectors}}
+```
+
+An Armv8-M core fetches its initial stack pointer from word 0 and its
+reset address from word 1; the linker scripts pin this array at the
+address where the core will look (`KEEP(*(.vectors))`, first section — ITCM
+address 0 on the AN547, the secure-alias base on the AN505, where VTOR
+points at reset). The `used` attribute stops the compiler discarding an
+array nothing references; `KEEP` stops `--gc-sections` doing the same at
+link time. Belt and suspenders, because the failure mode — a garbage
+vector table — doesn't diagnose itself; the core simply jumps into
+nothing.
+
+There is a subtlety in how this file reaches the link, and it is the kind
+of decision this book exists to record. The toolchain files pass the
+startup source *on the linker command line*, from
+`cmake/arm-cortex-m55-mps3.cmake`:
+
+```cmake
+{{#include ../../../cmake/arm-cortex-m55-mps3.cmake:pt_linkline}}
+```
+
+Without the explicit `-x c`, the `g++` driver would compile a `.c` link
+input as C++, and C++ is allowed to lower those
+`(uintptr_t)&Reset_Handler` initializers to *dynamic* initialization —
+code that runs at startup, initializing the table that decides where
+startup begins. C guarantees address-constant initializers are link-time
+constants. The table must be constant for the same reason a ladder's
+bottom rung must not be attached to the top of the ladder. (The
+`extern "C"` guards keep the file well-defined if someone ever does
+compile it as C++; the `-x c` makes sure nobody has to find out the hard way.)
+
+### Reset, in the only order that works
+
+```c
+{{#include ../../../platform/armv8m_startup.c:pt_reset}}
+```
+
+Four moves, each ordered by a hazard:
+
+**MSPLIM first.** Armv8-M Mainline gives the main stack a hardware floor:
+write an address to `MSPLIM` and any stack-pointer excursion below it
+faults immediately, instead of the stack silently growing down into
+whatever data lives below it. Why does this matter enough to be
+instruction one? Because the alternative failure is the worst kind:
+a deep call chain during one test overwrites the heap's top, the
+corruption surfaces ten allocations later in an unrelated structure, and
+the emulated target has no debugger attached and no memory protection
+unit configured. A stack limit register converts that archaeology into a
+HardFault at the exact instruction that crossed the line — and the
+startup file gives HardFault its own handler (a `bkpt` and a park loop,
+distinct from `Default_Handler`) precisely so the fault is identifiable.
+This wasn't in the first version of the file; it was added by the same
+infrastructure audit that hardened the Hexagon toolchain cache, and it
+cost two linker-script symbols and one instruction. Insurance is rarely
+priced this low.
+
+**FPU enable before any FP instruction, with `DSB; ISB`.** At reset,
+coprocessors CP10/CP11 — the scalar FPU and MVE — are disabled; the
+first FP instruction would fault. The CPACR write grants access, and the
+barrier pair is not decoration: `DSB` forces the write to complete, `ISB`
+flushes instructions already fetched under the old permissions. Omit the
+barriers and the enable *usually* works — until an instruction prefetched
+before the write faults on a real pipeline. The startup does this before
+touching newlib because newlib code may legitimately use FP registers.
+
+**Zero `.bss`, but do not copy `.data`.** C guarantees zero-initialized
+statics; nobody has provided that guarantee yet, so `memset` over the
+linker-defined `__bss_start__..__bss_end__` does. The conspicuous absence
+is the traditional `.data` copy loop — see the linker scripts below,
+because that absence is a documented dependency on QEMU, not an
+oversight.
+
+**Then the runtime, in dependency order:** semihosting file handles
+(`initialise_monitor_handles`) so `printf` works, `__libc_init_array` so
+C++ static constructors run, then `exit(main(0, NULL))` — `exit`, not a
+bare return, so `atexit` handlers and stream flushes happen before the
+semihosting exit call. `main` receives no arguments. There is no one to
+pass any; that fact shapes the whole test harness below.
+
+### The runtime pieces the toolchain didn't bring
+
+Two more gaps get filled in the same file. First, the heap.
+`librdimon`'s weak `_sbrk` sizes the heap by asking the host, via the
+semihosting `SYS_HEAPINFO` call, where the heap should live — an answer
+that depends on the emulator's mood for a given board model. The startup
+overrides it with the boring, deterministic version:
+
+```c
+{{#include ../../../platform/armv8m_startup.c:pt_sbrk}}
+```
+
+The heap is exactly the region the linker script says, ends exactly where
+the script says, and `malloc` fails with `ENOMEM` — a *testable*
+condition — rather than wandering into memory the map never granted.
+
+Second, 64-bit atomics. The library's telemetry counters are 64-bit
+`std::atomic` objects; M-profile has no 64-bit exclusive-access
+instructions, so GCC lowers those operations to `__atomic_*_8` library
+calls, and the bare-metal toolchain ships no libatomic. The startup
+provides the four helpers the link actually needs, built on the classic
+single-core primitive — mask interrupts, do the plain 64-bit access,
+restore:
+
+```c
+{{#include ../../../platform/armv8m_startup.c:pt_irqlock}}
+```
+
+```c
+{{#include ../../../platform/armv8m_startup.c:pt_atomic_rmw}}
+```
+
+Why is PRIMASK sufficient where a mutex or an exclusive-access loop would
+be required elsewhere? Because on a single-core part, the only agent that
+can interleave with a sequence of instructions is an interrupt handler on
+the same core — there is no second observer, no other cache, no store
+buffer visible from elsewhere. `cpsid i` makes the critical section
+literally uninterruptible, so the load-modify-store is atomic with
+respect to everything that exists on the machine. The reasoning is sound
+*only* on a single core, which is why the dual-core RP2350 firmware at the end
+of this chapter pointedly refuses to rely on it, and shares nothing
+across cores except 32-bit atomics. Note also what the file does *not*
+do: it implements only the helpers currently linked, and deliberately
+omits the rest (compare-exchange and friends), so any future need
+surfaces as a link error instead of as a silently wrong fallback.
+
+## Two linker scripts, two philosophies of stack
+
+The memory maps mirror each board model. The AN547:
+
+```ld
+{{#include ../../../platform/mps3_an547/mps3_an547.ld:pt_memory}}
+```
+
+Four regions, four jobs: vectors in ITCM (address 0, where VTOR points at reset),
+code in SRAM, **the stack owning all of DTCM**, data/bss/heap in ISRAM.
+Giving the stack a private 512 KB region is a luxury the board offers and
+the script accepts gratefully — the stack limit is simply the region's
+base, and stack and heap physically cannot collide because they do not
+share a region.
+
+The AN505 has only the two big SRAMs, so stack and heap must cohabit,
+and the script makes the boundary explicit rather than hopeful:
+
+```ld
+{{#include ../../../platform/mps2_an505/mps2_an505.ld:pt_heap_stack}}
+```
+
+The stack descends from the top of DATA; the heap is *capped* 64 KB below
+the top; `__stack_limit` is set exactly at the cap. Between `_sbrk`
+refusing to grow past `__heap_end__` and MSPLIM faulting below
+`__stack_limit`, the classic bare-metal failure — stack and heap growing
+silently into each other — is fenced from both sides. One side returns
+`ENOMEM`; the other side HardFaults. Neither corrupts.
+
+And the honesty clause, stated in both scripts' headers: **QEMU's
+`-kernel` loader places the ELF directly into RAM, so VMA == LMA and
+`.data` needs no load-time copy.** On real silicon booting from flash,
+initialized data must be linked with a load address in flash and copied
+to RAM by the startup — the loop this startup deliberately does not
+have. The scripts say so in as many words. This is the same discipline
+as the performance documentation: the artifact records what it is
+validated for, and the boundary of that validation, in the place the next
+user will actually look. A linker script that works under QEMU while
+*looking* like a flash-boot script would be a trap; one that documents
+"QEMU-only, here's why" is a foundation.
+
+## CTest without an operating system
+
+The toolchain files end with `set(SRT_BARE_METAL ON)`, and
+`tests/CMakeLists.txt` branches on it. The problem it solves: CTest's
+contract with a test binary is "run it with arguments, read its exit
+code," and bare metal breaks both halves. There is no `argv` to pass a
+`--gtest_filter`, and semihosting does not reliably propagate the guest's
+exit status through `qemu-system-arm`.
+
+The replacement is a one-shot protocol. A dedicated `main` bakes the
+filter in at compile time, and the *pass criterion is a printed string*:
+
+```cpp
+{{#include ../../../tests/bare_metal_main.cpp}}
+```
+
+CTest registers a single test whose `PASS_REGULAR_EXPRESSION` is
+`SRT_TESTS_COMPLETE rc=0` and whose `FAIL_REGULAR_EXPRESSION` is gtest's
+`[ FAILED ]` marker: the run passes only if the summary line is printed
+*and* no failure marker ever appears. The completion line is printed at
+the last possible moment, so a crash, fault, or park-loop after the tests
+cannot masquerade as success — the harness times out instead (the
+`Default_Handler` comment in the startup file closes this loop: faults
+park, parking times out, timeouts fail).
+
+Three details in that file repay attention:
+
+- **The filter excludes by category, not by taste.** What is cut is
+ minutes of soft-float virtual audio proving target-independent control
+ math already proven on every host leg; what stays is everything only
+ the target can falsify — datapath arithmetic, ring behavior on 32-bit
+ `size_t`, the end-to-end converter. The comment about `AsrcQuality*`
+ versus `AsrcQuality.*` records a real trap: in gtest filters the dot is
+ a literal, and the wrong spelling silently *narrows* the exclusion.
+- **The empty-run guard.** A filter typo can select zero tests, and
+ `RUN_ALL_TESTS()` cheerfully returns 0 for an empty run — a green CI
+ leg testing nothing, forever. The guard fails the run if fewer than 15
+ tests were selected (the real selection is ~20; the slack allows
+ legitimate removals). It must be checked *after* `RUN_ALL_TESTS()`,
+ because gtest applies the filter inside it — the count reads zero
+ before. This guard, like MSPLIM, arrived via audit: the theme of that
+ audit was hunting for ways a passing signal could be vacuous.
+- **GoogleTest itself needs the bare-metal treatment.** Newlib ships stub
+ `pthread.h`/`regex.h` headers that make POSIX feature *detection*
+ succeed spuriously, so the build doesn't probe for threads at all on
+ bare metal and pins the feature macros (`GTEST_HAS_POSIX_RE=0`, stream
+ redirection and filesystem off) — value-checked macros only, since
+ gtest tests `GTEST_HAS_DEATH_TEST` with `#ifdef` and defining it to 0
+ would *enable* what it names.
+
+The result: `ctest --test-dir build` on a developer machine runs ~20
+tests on an emulated Cortex-M55 exactly as transparently as the Hexagon
+chapter's suite — `CMAKE_CROSSCOMPILING_EMULATOR` is doing the same work,
+with `qemu-system-arm -M mps3-an547 -nographic -semihosting -kernel`
+prefixed to the binary instead of `qemu-hexagon`.
+
+## What the M55 was hiding: Helium at plain `-O2`
+
+The M55 port existed for correctness. Its instruction baselines then sat
+quietly in `bench/baselines.json` until the M33 arrived and gave them a
+comparison point — and the comparison didn't add up. Identical source,
+identical flags, same GCC: the M33's Q15 pipeline count came in at
+roughly **4× the M55's**. Slower silicon-for-silicon was expected;
+4× in *executed instructions* was not, because instruction counts don't
+care about clock speed or memory latency. Something was executing
+different instructions.
+
+`objdump` answered in one line of shell: the M55 binary contained **71
+MVE instructions**. The M33 binary contained **zero** (it has no MVE to
+contain). Nobody had written a line of SIMD — **GCC auto-vectorizes the
+Q15/Q31 kernels with Helium at plain `-O2`** when targeting
+`-mcpu=cortex-m55`. The M55's numbers had been MVE-accelerated from the
+day the target landed, and the project's own performance plan — which
+listed "explicit Helium kernels for the M55" as future optimization
+headroom — was describing work the compiler had already done. The
+hypothesis list in `docs/PERFORMANCE.md` was rewritten the same day:
+explicit M55 SIMD is *moot*; the real headroom was on the cores without
+MVE, which became C4.
+
+The M55 also supplied the project's most instructive documentation bug,
+told in this book's introduction: the C3 integer-phase change showed
+`pipeline_float` **+1.4%** on the M55, contradicting the expectation that
+removing double math must help a core documented (in the project's own
+notes) as having no FP64. The measurement was right and the notes were
+wrong: the M55's *scalar* FPU executes FP64 in hardware — only the MVE
+vector unit is fp16/fp32. C3 had traded cheap hardware doubles for int64
+arithmetic on that one target, a fair price for the large cross-target
+wins (Q15 −5.3%, Q31 −4.6% on the same core), and the correction is
+recorded in the plan's hypothesis list. A 1.4% anomaly in a deterministic
+metric was enough to falsify a "fact" everyone involved would have sworn
+to. Noisy metrics don't generate that kind of pressure; this is why the
+ratchet gates on instructions and not on milliseconds.
+
+## What the M33 exists to say about the Pico 2
+
+The M33 leg is the deployment-realism target, and its numbers are meant
+to be read as a datasheet for the Raspberry Pi Pico 2 class of part.
+
+**Float is not a datapath here.** The committed baselines put
+`kernel_float` at 1,897,321,329 instructions against the M55's 99,468,474
+for the same workload — the README's "~19×" — because every `double`
+accumulation in the float kernel is soft-float library calls on a core
+with a single-precision-only FPU. The consequence is stated as guidance,
+not lament: on Pico-class parts, use Q15 or Q31, the formats the
+fixed-point traits chapter built for exactly this moment.
+
+**The DSP extension was idle until C4.** Disassembly of the original M33
+binaries found barely any use of the DSP extension (two `smlal`s). The
+C4 kernel fixed that with `SMLALD` — packed dual 16×16 MAC into a 64-bit
+accumulator — gated on `__ARM_FEATURE_DSP && !__ARM_FEATURE_MVE` so the
+M55 keeps its auto-vectorized loop (verified: 0.00% change on every M55
+and Hexagon scenario), bit-exact by construction because the products are
+exact in int32 and int64 accumulation is associative. It bought −3.1% on
+`pipeline_q15`, and the C4 entry keeps honest books about why the win is
+bounded: the M33's Q15 frame cost is dominated by the coefficient blend's
+64-bit products and transport, not by the dot product the intrinsic
+accelerates.
+
+**Budgets, stated as instructions, pending cycles.** Dividing the
+baselines out: `pipeline_q15` is 484,146,844 instructions per 96,000
+frames ≈ **5,043 instructions per stereo frame**; the 12-channel shape is
+≈ 10,027. A 150 MHz core at 48 kHz has 3,125 *cycles* per frame. The
+README draws the honest conclusion in instruction-space — Q15 mono fits
+a 150 MHz core, stereo wants the `fast()` preset or the RP2350's second
+core — and then refuses to pretend the units match: instructions are not
+cycles, the ratio between them is an empirical property of real silicon,
+and the guidance is explicitly a budget *pending real-silicon
+validation*.
+
+Two flashable firmwares exist to close exactly that loop, and they are
+the bridge from this chapter's emulated world to Part V's hardware:
+
+- **`examples/pico2_cyccnt`** runs the same fixed pipeline workloads on a
+ real Pico 2 and times each 32-frame block with the M33's DWT.CYCCNT
+ hardware cycle counter. Its output divided by the committed baselines
+ (5,043 and 10,027 instructions per frame) yields the
+ cycles-per-QEMU-instruction calibration constant that turns *every*
+ M33 baseline, current and future, into a real cycle budget.
+- **`examples/pico2_dualcore`** is the "second core" clause made
+ literal — and it is the library's concurrency story passing its
+ sternest exam. The `push()`/`pull()` contract names one producer agent
+ and one consumer agent around the lock-free ring; it never says
+ *threads*. On the RP2350, core 0 becomes the producer clock domain
+ (pushing at a synthesized +200 ppm offset, so the servo's estimate has
+ an exact truth value to be judged against — the one thing two real
+ crystals can never give you) and core 1 becomes the consumer, timing
+ every `pull()` with its own per-core DWT. Two cores over coherent SRAM
+ satisfy the acquire/release contract exactly as two threads do.
+ Everything else crossing cores is 32-bit atomics only — because on the
+ M33, 64-bit `std::atomic` is not lock-free, the same fact the startup
+ file's PRIMASK helpers exist to paper over on *one* core and which no
+ single-core trick can fix across two. Even the firmware's 12-channel
+ phase runs at 16 kHz *by arithmetic, not caution*: 10,027
+ instructions per frame against a 3,125-cycle budget cannot fit at
+ 48 kHz on one core, and `pull()` of one converter instance is one
+ consumer by contract — a second core buys one clock domain per core,
+ not more datapath than one core has.
+
+## Verify it yourself
+
+```sh
+# Both bare-metal legs, end to end (arm-none-eabi-g++ and qemu-system-arm
+# on PATH — exactly what CI installs):
+cmake -B build-m55 -DCMAKE_BUILD_TYPE=MinSizeRel \
+ -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m55-mps3.cmake -DSRT_BUILD_EXAMPLES=OFF
+cmake --build build-m55 -j && ctest --test-dir build-m55 --output-on-failure
+
+cmake -B build-m33 -DCMAKE_BUILD_TYPE=MinSizeRel \
+ -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m33-mps2.cmake -DSRT_BUILD_EXAMPLES=OFF
+cmake --build build-m33 -j && ctest --test-dir build-m33 --output-on-failure
+
+# The Helium discovery, on today's binaries: MVE loads/MACs present in the
+# M55 build, absent in the M33 build (71 vs 0 at discovery; the number moves
+# with the compiler, the zero does not). `vldr[bhwd]` skips the scalar-FPU `vldr`.
+arm-none-eabi-objdump -d build-m55/tests/srt_tests | grep -cE 'vldr[bhwd]|vmlaldav'
+arm-none-eabi-objdump -d build-m33/tests/srt_tests | grep -cE 'vldr[bhwd]|vmlaldav'
+
+# The empty-run guard, demonstrated: break the filter in
+# tests/bare_metal_main.cpp (e.g. filter = "NoSuchTest*"), rebuild, and the
+# run fails with "filter is broken" instead of passing green.
+
+# The instruction budgets (counting-plugin build is in ci.yml icount-ratchet;
+# same configure for m33 with the other toolchain file):
+cmake -B build-m55-ic -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m55-mps3.cmake \
+ -DSRT_BUILD_TESTS=OFF -DSRT_BUILD_EXAMPLES=OFF -DSRT_BUILD_ICOUNT_BENCH=ON
+cmake --build build-m55-ic -j
+python3 scripts/icount.py --target m55 --build-dir build-m55-ic --plugin /tmp/libinsncount.so
+
+# The budgets on real silicon (a Raspberry Pi Pico 2 and a USB cable):
+# examples/pico2_cyccnt/README.md — cycles per frame, DWT.CYCCNT
+# examples/pico2_dualcore/README.md — one clock domain per core, self-judging
+```
+
+The two `objdump` lines are this chapter compressed: the same source, the
+same compiler, the same flags — and the difference between the binaries
+is a discovery you can grep for. Bare metal did not make the library
+different; it made what the library was already doing *visible*, one
+instruction at a time.
diff --git a/book/src/part4/hexagon.md b/book/src/part4/hexagon.md
new file mode 100644
index 0000000..dc8f2ea
--- /dev/null
+++ b/book/src/part4/hexagon.md
@@ -0,0 +1,384 @@
+# Hexagon: a DSP that keeps secrets
+
+> Trust, but verify.
+>
+> — Russian proverb
+
+Every portability chapter in this part answers the same question: what did
+the target force the library to learn that no amount of host testing could
+have taught it? Hexagon — Qualcomm's DSP architecture, the kind of core
+that audio actually ships on inside a phone — answered it four times, and
+three of the four answers contradicted a reasonable engineer's prior. This
+chapter walks through the port itself (which is small) and then the four
+lessons (which are the point), in the order the project learned them.
+
+First, the ground rules the target sets. Hexagon here is
+`hexagon-unknown-linux-musl`: a 32-bit `size_t` (the ring chapter's
+wraparound proof stops being theoretical), musl instead of glibc, clang
+instead of GCC, and — the fact that ends up organizing half of Part III —
+**no double-precision FPU**. Every `double` the library touches on this
+target is a call into soft-float routines. The library's float datapath
+accumulates in `double` deliberately (that decision is defended in the
+polyphase chapter); on Hexagon that choice has a price tag, and this
+chapter contains the receipts.
+
+## The whole port is one file
+
+Here is everything SampleRateTap needed to run its test suite on a
+Qualcomm DSP:
+
+```cmake
+{{#include ../../../cmake/hexagon-linux-musl.cmake}}
+```
+
+Thirty lines, and most of them are comments. Two decisions carry the file.
+
+**`CMAKE_CROSSCOMPILING_EMULATOR qemu-hexagon`.** This single line is what
+makes the port *routine* instead of a parallel test infrastructure. CMake
+prepends the emulator to every test command it registers, so `ctest` runs
+each cross-compiled binary under `qemu-hexagon` user-mode emulation without
+knowing it is doing anything unusual. It goes further than the obvious
+case: `gtest_discover_tests()` needs to *execute* the test binary at build
+time to enumerate its tests, and the emulator prefix makes discovery work
+too — which is why `tests/CMakeLists.txt` raises `DISCOVERY_TIMEOUT` to
+120 seconds and the per-test timeout to 900. Instruction-set emulation is
+roughly an order of magnitude or two slower than native execution; the timeouts are the only
+place the build system admits it.
+
+The same pattern is deliberately generic. The commented-out HiFi4/HiFi5
+job template in `.github/workflows/ci.yml` is this toolchain file with the
+names changed (`xt-clang++`, `xt-run`): any target with a cross-compiler
+and an instruction-set emulator drops into the same shape, and the test
+suite — the project's real asset — transfers unmodified.
+
+**`-static`.** A dynamically linked musl binary needs the emulator to be
+told where the target's loader and shared libraries live (`qemu-hexagon
+-L /path/to/sysroot`), and that path would have to thread through CMake,
+CTest, CI, and every developer's shell. Static linking deletes the whole
+problem: the binary is self-contained, the emulator invocation is just
+`qemu-hexagon ./srt_tests`, and nothing about the sysroot can drift out of
+sync. For a test rig this is the right trade without much argument — the
+binaries are throwaway artifacts, nobody cares that they are megabytes
+instead of kilobytes. Keep this decision in mind, though. It comes back at
+the end of the chapter with teeth.
+
+The CI leg (`hexagon-qemu` in `ci.yml`) runs the suite with an exclusion
+list: the multi-minute quality and lock simulations, the 10-million-element
+thread stress, and a few others. The reasoning is stated in the workflow
+and worth internalizing: those tests prove target-independent control
+mathematics and host concurrency, which emulation neither speeds up nor
+measures meaningfully. What stays *in* is exactly what the target can
+falsify — kernel accuracy, fixed-point arithmetic, 32-bit `size_t`
+behavior, atomics lowering, musl's corners. An emulated test leg should
+run only the tests that target can fail.
+
+One boundary must be drawn before any number in this chapter is quoted,
+and the toolchain file draws it in its own header comment: user-mode
+emulation validates ISA-level *correctness*, never performance. QEMU
+translates guest instructions to host instructions and runs them as fast
+as it can; nothing about its timing resembles the DSP's. What emulation
+*can* produce deterministically is the count of guest instructions
+executed — the metric Part II's ratchet chapter is built on — and that
+count is a good proxy for scalar-code cost while remaining a proxy.
+Cycle-accurate Hexagon numbers require the proprietary Hexagon SDK
+simulator, which this project does not have; the documentation says so
+rather than letting instruction counts impersonate cycles. Every Hexagon
+figure below is therefore an instruction count, exact to the instruction,
+reproducible on your machine, and honest about what it is not.
+
+## Lesson one: the genuinely FP64-less target
+
+The first thing Hexagon did was refuse to be impressed by an optimization
+that worked everywhere else.
+
+C1 — the blended-row precompute, Part III's opening win — cut the M55
+pipeline instruction counts by 15–30% and host stereo wall-clock by 36%.
+The same change on Hexagon: **−3.6% float, −3.3% Q15, −0.2% Q31**. Not
+wrong, not a regression — just strangely small, and "strangely small" is
+the most informative result an instruction counter can produce. If a
+change that halves the inner-loop arithmetic barely moves the total, the
+total is not made of inner-loop arithmetic. The diagnosis, recorded in
+the C1 entry of `docs/PERFORMANCE.md`: Hexagon's pipelines were dominated
+by the per-sample phase bookkeeping, done in `double` and therefore
+soft-floated — every phase increment, wrap and blend-factor conversion
+expanding into library-call arithmetic that dwarfed the MACs the
+optimization had so carefully thinned.
+
+That diagnosis did two things. It motivated C3, the Q0.64 integer phase
+accumulator, whose design you have already seen in Part III. And it
+forced a correction that is preserved in `docs/PERFORMANCE.md`'s
+hypothesis list: the project had been assuming the Cortex-M55 was also in
+this soft-double class, and it is not — the M55's *scalar* FPU executes
+FP64 in hardware (only the MVE vector unit is fp16/fp32). The M55's
+float numbers had never been soft-double-bound. **Hexagon is the
+genuinely FP64-less target**, the only one in CI where "the phase math is
+done in doubles" translates to "the phase math is done in subroutines."
+
+Which is why C3's Hexagon column is the loudest in the whole optimization
+campaign. Eliminating soft-double phase math from the per-sample path
+bought, from the PR's gating run:
+
+| Scenario | Hexagon instructions |
+|---|---:|
+| `pipeline_q31` | **−15.5%** |
+| `pipeline_q15` | **−10.3%** |
+| `pipeline_float` | −2.6% |
+| kernels | count-identical (control) |
+
+The kernels-identical row deserves its footnote: the change touched only
+the converter's per-sample phase path, so the isolated-kernel workloads
+*must* not move. They didn't, to the instruction. That is what a control
+group looks like in this project's methodology, and the deterministic
+QEMU counts are what make a control group meaningful at all — a
+wall-clock benchmark can certify "similar," never "identical."
+
+## Lesson two: hexagon-clang wants aliasing proven, not promised
+
+C2, the vectorization audit, restrict-qualified the kernel hot-loop
+pointers after `-fopt-info-vec` showed GCC vectorizing the blend loop
+only behind a runtime aliasing check ("loop versioned for
+vectorization"). On the M55 the payoff was real but narrow:
+`pipeline_float` −1.35%, every other scenario exactly 0.00%.
+
+The same one-line annotation on Hexagon, from the PR's gating run:
+
+| Scenario | arm-gcc (M55) | hexagon-clang |
+|---|---:|---:|
+| `pipeline_float` | −1.35% | −1.6% |
+| `pipeline_q15` | 0.00% | **−6.2%** |
+| `pipeline_q31` | 0.00% | **−12.3%** |
+| kernels | 0.00% | 0.00% (control) |
+
+Same source, same semantics, wildly different sensitivity. The commit
+that pinned the new Hexagon baselines states the finding plainly:
+*hexagon-clang benefits from provable no-aliasing far more than arm-gcc
+did* — once aliasing is provable it schedules the dot loops
+substantially better. That is consistent with what Hexagon is: a VLIW
+machine whose compiler packs multiple operations per issue packet and
+therefore lives or dies by how freely it may reorder memory operations.
+A `restrict` that merely deletes one runtime check on an in-order ARM
+core instead unlocks the scheduler on a DSP.
+
+The portable lesson is about division of labor: `SRT_RESTRICT` was added
+for a measured GCC reason, and the *same annotation* paid a much larger,
+unlooked-for dividend on the DSP compiler. Aliasing facts belong in the
+source, stated once, precisely — because you cannot predict which
+backend will be able to spend them.
+
+## Lesson three: the ISA already had the trick (C5)
+
+By C5 the project had a pattern that worked: the C4 packed dual-MAC Q15
+kernel had just bought −3.1% on the Cortex-M33 with a small block of
+intrinsics. Hexagon has a directly analogous instruction, `vrmpyh` —
+four exact 16×16 products summed into a 64-bit accumulator per
+instruction, C4's argument at twice the width. The hypothesis practically
+wrote itself.
+
+It was implemented properly: a `vrmpyh` intrinsic loop for the Q15 dot
+product, bit-exact against the portable path, full suite green on Hexagon
+QEMU. Then it was measured, and the ratchet reported:
+
+> `pipeline_q15`: 119,847,854 → 119,478,758. **−0.31%.**
+
+A result that small demands an explanation before it demands a decision,
+because there are two very different ways to earn −0.31%: either the
+compiler was already emitting wide MACs (making the intrinsics
+redundant), or the wide MACs genuinely don't matter here. The two imply
+opposite things about future work, so the project pulled disassembly from
+CI (`llvm-objdump`, pre and post): the baseline binary contains **zero**
+wide-MAC instructions; the intrinsic build contains **10**. The compiler
+had not already done it. The instructions landed, executed — and saved
+almost nothing.
+
+The explanation is in the scalar ISA. Hexagon already issues
+single-instruction 64-bit multiply-accumulates (`Rxx += mpy`) and 64-bit
+loads, so the portable C++ loop was already running close to one MAC per
+instruction, with none of the per-element overheads the M33's baseline
+loop had been paying. And what a 4-wide reduce could still have saved,
+the fix-up work ate: the history window is 2-byte aligned by nature (it
+is a stream of Q15 samples), so feeding `vrmpyh` requires combine and
+alignment work that costs nearly what the wider multiply saves. C4 won on
+the M33 because there was fat to cut; Hexagon's baseline had none.
+
+You can see the same fact from the committed baselines, without any
+intrinsics experiment at all. The README's instruction-count table has
+`kernel_q15` at 102,819,852 on Hexagon against 181,994,196 on the
+Cortex-M55 — the scalar DSP executes *fewer* instructions than the core
+whose Q15 loops GCC vectorizes with Helium. Cross-ISA instruction counts
+must be read with care (an instruction is not a unit of work, and fewer
+instructions is not the same claim as faster), but as a measure of *MAC
+density* the comparison is legitimate: Hexagon's ISA packs so much of
+this workload into each instruction that there was structurally little
+for a wider multiply to remove. C5's failure was, in hindsight, already
+sitting in the baseline table. The experiment's value was turning "in
+hindsight" into a checked fact with disassembly attached.
+
+So the code was deleted. Not shelved, not flag-gated: reverted, per the
+stop rule in `docs/PERFORMANCE.md` — per-architecture complexity must
+justify itself, and −0.31% does not justify a permanent intrinsic code
+path that every future refactor must keep bit-exact. The C5 entry in
+`docs/PERFORMANCE.md` *is* the deliverable: the numbers, the disassembly
+evidence, and the reasoning, recorded so that nobody re-derives this dead
+end in two years when the file looks temptingly scalar again.
+
+The entry also pre-empts the obvious follow-up — "fine, scalar `vrmpyh`
+is redundant, but what about HVX, the 128-byte vector unit?" — with
+arithmetic instead of enthusiasm. A 48–80-tap dot product doesn't fill
+one HVX vector; worse, HVX 16-bit MACs accumulate in 32-bit lanes, and
+the library's exact-int64 accumulation invariant overflows 32 bits after
+about 24 worst-case taps. Per-channel tap-axis dots are simply the wrong
+*shape* for HVX. The shape that fits — one 64-bit lane pair per channel,
+16 channels filling a vector exactly — is the channel-parallel form, and
+that observation, recorded as the successor hypothesis, became C6.
+
+Negative results are worth exactly what you write down about them.
+
+## Lesson four: the exception secret
+
+For months the Hexagon leg was the quiet one. Then a hardening PR added
+the library's first `EXPECT_THROW` tests — constructor validation,
+`Config::validated()` throwing on nonsense configurations — and the
+Hexagon leg turned red in a way no other platform did. The constructor
+throws correctly. The `EXPECT_THROW` machinery is standing by to catch.
+And the exception never arrives: **this static-musl toolchain
+configuration cannot unwind the stack.** The throw reaches the runtime,
+the unwinder that should walk the frames is not part of the link, and
+`libc++abi` does the only honest thing left — terminate. Every other
+platform passed; main was red on exactly one leg, because that leg was
+the first place a C++ exception had ever actually been *thrown* in this
+project's CI history.
+
+Remember `-static`, the convenience decision from the top of the chapter?
+This is its bill arriving. The configuration had silently shipped without
+a working unwind path, and nothing in months of green CI could have said
+so, because exception propagation is invisible until the first frame
+needs unwinding. A capability you never exercise is a capability you do
+not have — you merely have no evidence yet.
+
+The response is a case study in how this project metabolizes a
+limitation, three moves in one commit:
+
+1. **Quarantine precisely.** `ConfigValidation` is excluded from the
+ Hexagon `ctest` invocation — that suite and nothing else, with a
+ comment in `ci.yml` explaining why. Validation logic is
+ target-independent and still covered on every other leg; what Hexagon
+ cannot test is the *unwinding*, not the *validating*.
+2. **Record it where deployers look.** The Known-debt ledger in
+ `docs/PERFORMANCE.md` gets an entry with the deployment rule stated as
+ a rule: on this toolchain configuration, an invalid `Config` is
+ **fatal** — validate inputs *before* constructing, because the
+ constructor's throw will take the process down rather than propagate.
+ The toolchain file itself carries the same caveat, so the next person
+ to cross-compile inherits the warning at the point of use.
+3. **Name the candidate fix without pretending it is done.** Linking an
+ unwinder (`-unwindlib=libunwind`) in the toolchain file would likely
+ restore propagation; it stays a recorded candidate until someone
+ verifies it, because "probably fixable" and "fixed" are different
+ ledger states.
+
+The library's API already leaned the right way — `validated()` exists
+precisely so callers can validate before constructing — so the rule
+costs a deployer one line. But the general finding stands, and it is the
+chapter's title: a target can keep a secret like this indefinitely, and
+the only way to surface it is to route every kind of behavior through the
+target. The first `EXPECT_THROW` to reach the leg was, in effect, the
+first test of a claim the toolchain had been silently making all along.
+
+## The CI craft: trusting your emulator and your compiler
+
+Two pieces of infrastructure make the Hexagon numbers in this book
+reproducible rather than anecdotal, and both are about supply chain more
+than about DSP.
+
+**The emulator is built from source, on purpose.** The instruction-count
+ratchet needs a `qemu-hexagon` with TCG plugin support — the counting
+plugin is how "executed instructions" becomes a number at all. Neither
+Debian's `qemu-user` package nor the qemu bundled with the Hexagon
+toolchain enables plugins. So the `icount-ratchet` job compiles its own:
+the pinned QEMU 8.2.2 source tarball, verified against a hard-pinned
+SHA256, configured minimally —
+
+```sh
+./configure --target-list=hexagon-linux-user --enable-plugins \
+ --disable-docs --disable-tools --disable-system
+```
+
+— about four minutes to build the one binary needed, cached thereafter.
+The job then *probes* the result (`qemu-hexagon -plugin help`, judged by
+the error text because qemu exits nonzero either way when given no guest
+binary) rather than assuming the cache returned what was put in. The
+plugin header is pinned to the commit the v8.2.2 tag pointed at, by
+commit SHA — tags are movable; commits are not.
+
+**The toolchain is verified twice, against two different threats.** The
+cross-compiler is the prebuilt open-source release from
+`quic/toolchain_for_hexagon` (clang 19.1.5, hosted on CodeLinaro). On
+download, CI checks it against the *published* `SHA256SUMS` file — which
+catches corruption and cache poisoning — and against a *hard pin* baked
+into the workflow, which is the only check that catches an origin
+compromise, since an attacker who can replace the tarball can replace the
+SUMS file beside it. The cache key is derived from the pinned digest
+itself, so no job that has not verified the pin can ever write the cache
+entry a trusting job will read. That last detail was not free: an audit
+found two other jobs sharing the trusted cache key while downloading
+without verification — a classic poisoning window — and the fix (verify
+everywhere, key on the digest) is part of the same hardening commit that
+gave the Cortex-M targets their stack-limit register in the next chapter.
+
+None of this is DSP knowledge. All of it is what "the Hexagon numbers are
+CI-gated" has to mean if the phrase is to carry weight: the compiler
+whose output is being counted and the emulator doing the counting are
+both pinned, verified artifacts, not whatever the package manager felt
+like resolving that morning.
+
+## What the port did not require
+
+It is worth pausing on the dog that didn't bark. Running a modern C++20
+template library on a Qualcomm DSP required: one 30-line toolchain file,
+a test-filter list, and zero changes to library code. No `#ifdef
+__hexagon__` exists in any header. The 32-bit `size_t` was already
+handled by the ring's wraparound arithmetic (proved, then tested, in the
+ring chapter); the absence of threads never came up because the library
+never spawns one; the atomics lowered correctly because the ring asserts
+`is_always_lock_free` at compile time and would have refused to build
+otherwise. The port was boring precisely to the degree that the library's
+portability claims were already true — and interesting precisely where
+the *toolchain*, not the library, had been making claims nobody had
+tested. Both halves of that sentence are the reason to port early: the
+boring half is regression-proofed for free from then on, and the
+interesting half you want to hear about from CI, not from a customer.
+
+## Verify it yourself
+
+```sh
+# The port, end to end (hexagon-unknown-linux-musl-clang++ and qemu-hexagon
+# on PATH; .github/workflows/ci.yml "hexagon-qemu" has the toolchain URLs):
+cmake -B build-hex -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_TOOLCHAIN_FILE=cmake/hexagon-linux-musl.cmake \
+ -DSRT_BUILD_EXAMPLES=OFF
+cmake --build build-hex -j
+ctest --test-dir build-hex --output-on-failure \
+ -E 'AsrcQuality|AsrcLock|TwoThreadStress|TransparentPrototypeMeetsSpec|MultiChannel\.|Feasibility|Reset\.|ConfigValidation'
+
+# The exception secret, demonstrated: remove ConfigValidation from the -E
+# list above and watch libc++abi terminate instead of EXPECT_THROW passing.
+
+# The instruction counts (needs the plugin-enabled qemu-hexagon; the
+# icount-ratchet job in ci.yml shows the 4-minute from-source build):
+cmake -B build-hex-ic -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_TOOLCHAIN_FILE=cmake/hexagon-linux-musl.cmake \
+ -DSRT_BUILD_TESTS=OFF -DSRT_BUILD_EXAMPLES=OFF -DSRT_BUILD_ICOUNT_BENCH=ON
+cmake --build build-hex-ic -j
+python3 scripts/icount.py --target hexagon --build-dir build-hex-ic \
+ --plugin /path/to/libinsncount.so
+
+# The C5 negative result's disassembly evidence, reproduced on today's
+# binary (the count should be zero — the intrinsics were reverted):
+llvm-objdump -d build-hex-ic/bench/icount/srt_icount_pipeline_q15 | grep -c vrmpy
+```
+
+The last command is this chapter's thesis in one line. The claim "the
+wide-MAC intrinsics were deliberately not kept" is not a story in a
+design document; it is a property of the shipped binary that you can
+count, and the C5 entry in `docs/PERFORMANCE.md` is the record of why
+counting it settled the question.
diff --git a/book/src/part5/hardware.md b/book/src/part5/hardware.md
new file mode 100644
index 0000000..fe120e9
--- /dev/null
+++ b/book/src/part5/hardware.md
@@ -0,0 +1,431 @@
+# Real clocks: bridges and firmware
+
+> Before enlightenment: chop wood, carry water. After enlightenment: chop wood, carry water.
+>
+> — Zen proverb
+
+Everything measured so far in this book — the 135 dB residual, the lock in
+~1 s, the drift ramp tracked without unlocking — came out of a simulation.
+A good one: deterministic, sample-granular, reproducible to the bit, able
+to synthesize a +200 ppm offset that is *exactly* +200 ppm so the servo's
+estimate has a truth value to be judged against. That determinism is the
+whole reason Part II's proof system works, and it is also, unavoidably, a
+confession. A simulated clock is a number in a loop. It has no crystal, no
+temperature coefficient, no USB host controller rescheduling its transfers,
+no twelve-hour soak in a warm room. The library exists to reconcile two
+*physical* oscillators, and at some point the only honest move is to plug
+two of them in.
+
+This chapter is about that move: what real hardware can prove that the
+deterministic suite cannot, the three test setups the project defined for
+it, and the three harnesses that shipped — an ALSA bridge for Linux hosts
+and two firmware images for the Raspberry Pi Pico 2. It ends by stating
+plainly which numbers exist today and which still await a physical board.
+
+## What simulation cannot say
+
+Be precise about the gap, because it is narrower than "simulation isn't
+real." The two-clock simulator *is* the library's use case in every
+algorithmic respect; nothing about the datapath or the servo mathematics
+changes on hardware. What changes is the input to the control loop:
+
+- **The offset stops being constant.** Real crystals typically sit
+ 20–200 ppm apart and move several ppm with temperature — slowly, over
+ minutes, as the room warms or a component self-heats. The suite tests a
+ *scripted* drift ramp; hardware supplies an unscripted one, forever.
+- **The pacing stops being clean.** A simulated push arrives exactly on
+ schedule. A USB audio dongle's data arrives when the host controller and
+ the kernel get around to it — jitter that is structured, bursty, and
+ unlike anything a deterministic loop generates. The FIFO setpoint rule
+ ("exceed the peak occupancy excursion of your push/pull jitter") is only
+ ever *exercised* by real jitter.
+- **Time stops being short.** The quality suite analyzes one second of
+ audio after settling. The claim a deployment actually cares about —
+ *zero* underruns, overruns, or resyncs over hours — is a statement about
+ the tails of every distribution at once, and the only instrument that
+ measures tails is a soak. A multi-hour run on independent oscillators is
+ the test no simulation honestly replaces.
+
+There is also one thing simulation does *better*, worth keeping: a
+synthesized offset is exact, so convergence can be asserted to a tolerance.
+Two real crystals give you a true offset you don't know — you can check the
+estimate is *stable* and *independently corroborated* (count frames from
+each device against `CLOCK_MONOTONIC` for ten minutes; the measured rate
+ratio should match the servo's estimate to well under 1 ppm), but never
+that it equals a known constant. The hardware plan uses both kinds of truth
+deliberately, as we'll see.
+
+`docs/HARDWARE_TESTING.md` defines three setups, in increasing order of
+effort, all commodity parts:
+
+1. **One Pi, two USB audio dongles** (~$15 of adapters). Each dongle clocks
+ its own 48 kHz from its own crystal; the library bridges them. The
+ canonical real-world test, and the source of the headline result the
+ project wants: "locked to the real inter-crystal offset of X ppm, N
+ hours, zero discontinuities."
+2. **Pi + Raspberry Pi Pico 2.** Validates the QEMU-derived Cortex-M33
+ numbers on an actual RP2350: real cycles against emulated instruction
+ counts, and the dual-core deployment shape.
+3. **Two Pis over Ethernet.** The network-audio case, where `push()` sees
+ bursty UDP delivery instead of callback-paced blocks — the setpoint rule
+ under genuinely hostile jitter.
+
+Setup 1's harness is `examples/alsa_bridge.cpp`; Setup 2's are
+`examples/pico2_cyccnt/` and `examples/pico2_dualcore/`. Setup 3's programs
+are not yet written. Each shipped harness is worth reading closely, because
+each one is the library's documented rules applied under witness.
+
+## The ALSA bridge: two blocking threads, on purpose
+
+The bridge is ~370 lines and structurally almost insolently simple: open a
+capture device and a playback device, start two threads, and let each
+thread block on its device.
+
+```cpp
+std::thread capture([&] {
+ // ...
+ const snd_pcm_sframes_t n = snd_pcm_readi(in.pcm, dst, period);
+ // ...
+ asrc.push(buf.data(), frames); // overruns counted by the converter
+});
+
+std::thread playback([&] {
+ // ...
+ asrc.pull(buf.data(), period); // silence-pads while filling/underrun
+ // ...
+ snd_pcm_writei(out.pcm, src, period - done);
+});
+```
+
+The simplicity is the point. The library's runtime contract is one producer
+agent and one consumer agent, each paced by its own clock — and a blocking
+ALSA read *is* a clock. `snd_pcm_readi()` returns when the capture device
+has delivered a period of frames, which happens at the cadence of that
+device's crystal; `snd_pcm_writei()` blocks until the playback device has
+made room, at the cadence of the other crystal. The two threads never
+communicate except through the converter, which is exactly the interface
+the whole library was designed around. No callbacks, no timers, no event
+loop: the hardware paces the threads, and the converter absorbs the
+difference. If you want to see the two-agent contract of
+[the ring chapter](../part1/spsc-ring.md) with the abstractions removed,
+this file is it.
+
+A few decisions inside deserve attention.
+
+**Format negotiation prefers honesty over generality.** The bridge asks
+each device for `FLOAT_LE` — the converter's native sample type, no
+conversion — and falls back to `S16_LE` with explicit scale-and-clamp
+helpers when the hardware refuses. That is the entire format matrix. Cheap
+dongles are overwhelmingly S16 devices, and a test harness that negotiated
+every ALSA format under the sun would bury its purpose in plumbing. It
+also *refuses* a rate it didn't ask for: if the device counters with
+anything but the requested rate, the bridge errors out rather than
+silently measuring the wrong experiment.
+
+**Xrun recovery is delegated, then observed.** When a read or write
+returns an error, the bridge calls `snd_pcm_recover()` and continues;
+only an unrecoverable error stops the run. This is deliberate division of
+labor: ALSA xruns are a *device*-level discontinuity (the OS failed to
+service the hardware in time), and the converter has its own machinery —
+silence-padding, refill, re-lock with the ppm estimate kept — for the
+*converter*-level consequences. The bridge does not try to be clever
+across that boundary; it recovers the PCM and lets the converter's
+counters record whatever backlash arrives. During a soak, the
+once-per-second status line is where you watch both layers at once.
+
+**The one configuration rule in the file is the ServoConfig rule.** The
+bridge runs with `--period` frames per ALSA transfer (default 128), and
+block-quantized transfer means the FIFO occupancy legitimately swings
+by around half a block without the clocks having moved. The servo's
+`unlockThresholdFrames` defaults to 24 — tuned for fine-grained transfer —
+so the bridge applies the documented rule in code:
+
+```cpp
+// Per the ServoConfig guidance: the unlock threshold must sit
+// comfortably above half the transfer block, or block-quantized
+// occupancy excursions can demote the servo stage spuriously.
+cfg.servo.unlockThresholdFrames =
+ std::max(cfg.servo.unlockThresholdFrames, 1.5 * static_cast<double>(args.period));
+```
+
+Miss this and the harness would report spurious servo demotions that have
+nothing to do with the clocks — a measurement artifact manufactured by the
+measurement tool. (The next chapter returns to this rule as one of the
+three scaling axes.)
+
+**The telemetry switches are the experiment design.** Three flags turn the
+bridge from a demo into an instrument:
+
+- `--csv <file>` appends the once-per-second `status()` snapshot — state,
+ ppm, smoothed fill, underrun/overrun/resync counters — as a CSV row.
+ This is the soak's evidence: the ppm trace over hours *is* the
+ thermal-drift measurement, and the counters' final values *are* the
+ zero-discontinuity claim. Point a hair dryer at one dongle and the trace
+ should show the crystal move several ppm in real time, tracked without
+ anything audible; a fast ±50 ppm step should show a stage demotion and a
+ re-lock.
+- `--dump <file>` has the playback thread also write the post-ASRC float
+ stream to disk, raw. This exists because of an honest limitation of
+ cheap hardware: a $7 dongle's analog path measures around −80 dB, and
+ no quality claim about a 135 dB converter survives passage through it.
+ The dump sidesteps the analog path entirely — the *clocks* are real even
+ if the signal never goes analog — and the notebook tooling
+ (`notebooks/asrc_comparison.ipynb` carries the AES17-style measurement
+ machinery) analyzes the capture offline.
+- `--tone <hz>` completes that thought. In tone mode the capture thread
+ *still blocks on* `snd_pcm_readi()` — the input device's crystal still
+ paces every push — but the captured samples are discarded and a clean
+ synthetic sine is pushed instead. Real clocks, known signal, no trust
+ placed in an ADC that hasn't earned it. The combination
+ `--tone 997 --dump out.raw --csv trace.csv` is Setup 1's full
+ measurement: a 997 Hz tone through two real crystals into the AES17
+ notebook.
+
+## `pico2_cyccnt`: buying cycles with instructions
+
+Part II built a performance ratchet on QEMU instruction counts:
+deterministic, noise-free, gateable in CI at ±3%. The README's Cortex-M33
+table says a 2-second Q15 stereo workload executes 484,146,844
+instructions — that number will be identical tomorrow, which is what makes
+it a regression gate. But it is a count of *instructions*, and silicon
+budgets are spent in *cycles*. An instruction can take one cycle or ten;
+memory waits, pipeline stalls, and branch penalties exist in silicon and
+not in QEMU's functional model. So every deployment claim derived from the
+ratchet — "Q15 mono fits a 150 MHz core with room to spare, stereo is
+tight" — has been carrying an asterisk: *instruction counts are not cycle
+counts; treat these as budgets pending real-silicon validation.*
+
+`examples/pico2_cyccnt/` is the firmware that removes the asterisk. It is
+a standalone flashable UF2 (deliberately *not* part of the root build —
+it drags in the whole Pico SDK) that runs the exact steady-state workload
+of the icount benchmarks — the same `push(32)`/`pull(32)` duplex loop —
+on a real RP2350, timing every block with the Cortex-M33's DWT cycle
+counter:
+
+```cpp
+bool enableCycleCounter() {
+ CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk;
+ if (DWT->CTRL & DWT_CTRL_NOCYCCNT_Msk)
+ return false; // implementation without a cycle counter
+ DWT->CYCCNT = 0;
+ DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk;
+ return true;
+}
+```
+
+DWT — the Data Watchpoint and Trace unit — is optional silicon on
+M-profile cores, so the firmware checks `NOCYCCNT` at runtime rather than
+assuming; `TRCENA` gates the whole trace block and must be set first. The
+counter is 32 bits free-running, which wraps in ~28.6 s at 150 MHz — fine,
+because the firmware only ever takes per-block unsigned deltas, and
+unsigned subtraction across a wrap is exact by the same modular-arithmetic
+argument the ring buffer's indices rested on. A thousand warmup iterations
+run first (past the Filling state, servo settled), then two thousand
+measured blocks, reported as mean, p99, and max — the tail statistics
+matter, because the workload runs with interrupts live and USB
+housekeeping shows up in the max column.
+
+The output table covers Q15 in both presets at 1, 2, and 12 channels, plus
+float at one channel. The float rows are not there in the hope of good
+news; they exist to put a *measured* number on "soft-double accumulation
+is the wrong datapath on an FP64-less core" — the QEMU baselines already
+price float at roughly 3.8× the Q15 instruction count, and a cycle figure
+makes the guidance concrete rather than rhetorical.
+
+The deeper purpose is calibration. The committed M33 baselines divide out
+to 5,043 instructions per frame for the stereo Q15 pipeline and 10,027 for
+the 12-channel one. Divide the firmware's measured cycles-per-frame by
+those figures and you get the constant the whole ratchet has been waiting
+for: *one QEMU instruction ≈ N RP2350 cycles*. That single ratio converts
+every current and future M33 instruction baseline into a real cycle
+budget — the ratchet keeps its CI-grade determinism, and hardware
+contributes exactly one number, measured once per silicon revision instead
+of once per commit.
+
+One scoping note recorded in the README of the harness: the cycled input
+buffer is 4,800 frames rather than the icount workload's 12,000, so that
+the 12-channel case fits the RP2350's 520 KB SRAM alongside the converter.
+Per-block work is unchanged; the deviation is documented because an
+unexplained difference between two "identical" workloads is how
+calibration constants go quietly wrong.
+
+## `pico2_dualcore`: one clock domain per core
+
+The README's platform guidance ends with a suggestion: on Pico-class
+parts, stereo `balanced()` wants either the `fast()` preset *or the
+RP2350's second core*. `examples/pico2_dualcore/` is that suggestion built
+and made falsifiable — the converter's two ends on the two Cortex-M33
+cores, one core per clock domain, judging its own run against PASS/FAIL
+gates.
+
+- **core0 is the producer.** It pushes 32-frame blocks paced by the
+ microsecond timer at `rate × (1 + 200e-6)` — a +200 ppm offset
+ synthesized from the shared timebase. This is the simulation trick
+ imported onto silicon, and it is what real crystals can never give: an
+ offset that is *exactly* +200 ppm, so the converged estimate can be
+ asserted within ±5 ppm rather than merely admired. core0 also owns all
+ USB telemetry.
+- **core1 is the consumer.** It pulls 32-frame blocks at exactly the
+ nominal rate and times every `pull()` with DWT.CYCCNT — enabled *on
+ core1*, because each RP2350 core has a private DWT behind the same
+ fixed address (the 0xE000_0000 private peripheral region is core-local;
+ enabling the counter from core0 would start the wrong one). core1 never
+ prints: contending on the stdio mutex from the paced core would put USB
+ stalls onto the output clock domain.
+
+Is running the two ends on two *cores* even within the library's
+contract? The firmware answers this in its opening comment, and the
+reasoning belongs in this book: the contract is one producer *agent* and
+one consumer *agent* around a lock-free SPSC ring with acquire/release
+atomics. It names agents and memory ordering, not `std::thread`. The
+RP2350's cores share coherent SRAM with no data caches in front of it, so
+C++ atomics behave across cores exactly as they do across threads — two
+cores satisfy the contract precisely as two threads do. `push()` stays
+core0-only, `pull()` stays core1-only, `status()` is documented
+any-thread. The chapter on the ring said the memory-ordering argument was
+the proof and the tests merely raised the price of being wrong; here the
+same argument, unchanged, carries the design onto a second processor.
+
+Everything else that crosses cores is an explicit block of **32-bit**
+atomics, and the width is a load-bearing decision inherited from the
+library itself: on the M33, 64-bit `std::atomic` is not lock-free — it
+routes through a library lock, which is exactly the failure the library's
+own telemetry avoided by keeping its counters 32-bit. The firmware
+`static_assert`s the lock-freedom of every cross-core type. The phase
+handoff is a single release store of the converter pointer (publishing
+every plain write the constructor performed) matched by an acquire load on
+core1; the teardown is the mirrored pair through a `consumerDone` flag, so
+destroying the converter cannot race core1's last `pull()`.
+
+The consumer's statistics need more than individual atomicity, though: a
+printed telemetry line should describe one *instant*, not a mean from this
+second next to a max from the last. With 64-bit atomics off the table, the
+firmware uses a seqlock — the sequence counter goes odd while the writer
+updates, even when it finishes, and the reader retries until the same even
+value brackets its whole read:
+
+```cpp
+void publishSnapshot(const Snapshot& s) {
+ const std::uint32_t q = g.seq.load(std::memory_order_relaxed);
+ g.seq.store(q + 1, std::memory_order_relaxed);
+ std::atomic_thread_fence(std::memory_order_release);
+ // ... payload stores, all relaxed 32-bit atomics ...
+ g.seq.store(q + 2, std::memory_order_release);
+}
+```
+
+The payload fields are themselves relaxed atomics — no torn reads, no
+undefined behavior — so the seqlock adds only mutual coherence, and a
+retry costs nothing at a 1 Hz read rate. It is the cheapest possible
+answer to "publish five numbers atomically on a core with 32-bit
+atomics," and a pattern worth stealing.
+
+### The honest scoping decision
+
+The firmware runs two ~30-second phases. Phase A is Q15 stereo
+`balanced()` at 48 kHz — the configuration the README calls tight on one
+core, now with the input domain moved off the consumer's core entirely.
+Phase B is the 12-channel reference-microphone/AVB shape... at **16 kHz**,
+not 48. Its README records why, and the passage is a model of how to scope
+a demo honestly:
+
+> Phase B is 16 kHz **by arithmetic, not caution**: the M33 QEMU baseline
+> puts `pipeline12_q15` at 10,027 insns/frame against a 150 MHz / 48 kHz
+> budget of 3,125 cycles/frame — more than 3× over, and `pull()` of a
+> single instance is one consumer by contract, so no core assignment can
+> split it across cores. Dual-core buys one clock domain per core, not
+> more datapath than one core has.
+
+That last sentence is the chapter's most important deployment fact. The
+SPSC contract that makes the converter lock-free is also a ceiling: one
+consumer agent means the entire per-pull datapath — all twelve channels of
+it — executes on whichever core calls `pull()`. A second core removes the
+*other* clock domain's work and everything else the application does, and
+that is all it removes. At 16 kHz the per-frame budget triples to 9,375
+cycles and the 12-channel shape fits; and since measured cycles per block
+are rate-independent, phase B still delivers the real-silicon counterpart
+of the 12-channel instruction baseline. Nothing was hidden by the rate
+change — 16 kHz is that configuration's actual deployment rate (the
+next chapter's rate-scaling rules are applied in the phase B config,
+`FilterSpec` band edges and servo bandwidths scaled by 16/48) — but the
+README refuses to let you believe dual-core bought compute it didn't.
+
+Two more of the library's documented rules appear in this firmware as
+lived decisions rather than advice. The FIFO setpoint is 144 frames, not
+the default 48: the producer core shares its time with USB logging, whose
+worst-case writer stall is capped at 2 ms in the build — 96 frames of
+consumer progress at 48 kHz — so the setpoint must exceed that excursion
+with margin. That is the README latency rule applied to a producer that
+also logs. And the pacing schedules compute absolute due times
+(`t0 + (b·num)/den` in integer microseconds), so a stall is followed by
+catch-up pushes rather than permanent schedule slip — the difference
+between jitter the FIFO absorbs and a rate error the servo would chase.
+
+A PASS requires: Locked within 2 s of cold start (6 s for phase B, whose
+scaled servo is proportionally slower), every 1 Hz ppm sample after the
+settling gate within ±5 of the synthesized +200, and zero underruns,
+overruns, *and* resyncs after first lock — overruns and resyncs gate too,
+because they are the signature of a consumer that cannot keep up. The
+firmware prints per-phase verdicts, an `OVERALL` line, and a sentinel
+string, so a future self-hosted CI lane can parse a soak the same way the
+QEMU lanes parse emulated runs.
+
+The dual-core README also states its own limit, and it belongs here
+verbatim in spirit: both domains are paced from the RP2350's one timer —
+that is what makes +200.0 an exact, assertable truth — so this firmware
+*cannot* prove the inter-crystal lock that Setups 1 and 2 ultimately want.
+It proves the deployment shape: two cores, two clock domains, lock-free
+handoff, real cycle headroom.
+
+## What is measured, and what is not yet
+
+The project's culture is that numbers are measured or absent, so here is
+the ledger as it stands:
+
+- **Shipped and measured on real clocks: nothing yet.** All quality and
+ performance figures in this book so far come from deterministic
+ simulation, host benchmarks, and QEMU instruction counting.
+- **Shipped and awaiting hardware:** all three harnesses build — the ALSA
+ bridge wherever ALSA exists, both Pico 2 firmwares as flashable UF2s —
+ but `docs/HARDWARE_TESTING.md` says it plainly: *the measured numbers
+ await a physical Pico 2*, and the multi-hour dongle soak awaits an
+ afternoon with a Pi. The cycles-per-instruction calibration constant,
+ the real `%core@48k` figures, the hour-scale zero-discontinuity claim,
+ and the thermal-drift trace are all, today, well-instrumented empty
+ columns.
+- **Not yet written:** the small script that plots a `--csv` ppm trace and
+ runs the notebook analysis over a `--dump` capture, and both Setup 3
+ programs (UDP sender, receiver-with-ASRC — the plan is to reuse the
+ bridge's output half).
+
+A book that inherited this project's habits could not end the chapter any
+other way. The harnesses are the falsifiable form of the library's
+deployment claims; until a board runs them, the claims stay labeled as
+budgets.
+
+## Verify it yourself
+
+```sh
+# No hardware: two OS threads 500 ppm apart, lock and estimate on live
+# (jittery) scheduling — the software rehearsal of the bridge:
+cmake -B build -DSRT_BUILD_EXAMPLES=ON && cmake --build build -j
+./build/examples/drifting_clocks
+
+# Setup 1 (Linux + two audio devices; srt_alsa_bridge builds when ALSA
+# is found). Real clocks, synthetic tone, telemetry + capture:
+./build/examples/srt_alsa_bridge --in hw:1,0 --out hw:2,0 \
+ --tone 997 --csv trace.csv --dump post_asrc.f32 --seconds 3600
+# Then: ppm column of trace.csv is the thermal-drift instrument; analyze
+# post_asrc.f32 with the AES17 machinery in notebooks/asrc_comparison.ipynb.
+
+# Setup 2 firmware (standalone builds; arm-none-eabi-gcc + network for
+# the Pico SDK fetch):
+(cd examples/pico2_cyccnt && cmake -B build -DPICO_BOARD=pico2 && cmake --build build -j)
+(cd examples/pico2_dualcore && cmake -B build -DPICO_BOARD=pico2 && cmake --build build -j)
+# Flash the UF2s, open the USB serial port, and wait for the sentinel
+# lines: SRT_PICO2_DONE / SRT_PICO2_DUALCORE_DONE with per-phase PASS/FAIL.
+```
+
+If you have the hardware this project's authors did not have on their
+bench, you are holding the most valuable contribution available: run the
+soak, and turn the empty columns into numbers.
diff --git a/book/src/part5/scaling.md b/book/src/part5/scaling.md
new file mode 100644
index 0000000..1651156
--- /dev/null
+++ b/book/src/part5/scaling.md
@@ -0,0 +1,315 @@
+# Channels, rates, and the rules that scale
+
+> For every type of animal there is a most convenient size, and a large change in size inevitably carries with it a change of form.
+>
+> — J. B. S. Haldane, *On Being the Right Size*
+
+Every measured number in this book so far was taken at one operating point:
+48 kHz, one or two channels, fine-grained transfer. Real deployments move
+along three axes away from that point — more **channels**, a different
+**sample rate**, coarser **blocks** — and each axis has a rule, a failure
+mode when the rule is ignored, and a measurement that pins both. This is
+the chapter a deploying engineer should read twice: once before choosing a
+configuration, and once after the first surprising telemetry line.
+
+The three rules, stated up front:
+
+1. **Channels**: one converter instance per *clock domain*, never per
+ channel group; channel count is then a nearly-free multiplier on the
+ dot product.
+2. **Rates**: every configuration field denominated in absolute hertz must
+ scale with the sample rate — start from `Config::forSampleRate()`.
+3. **Blocks**: the FIFO setpoint must exceed the pull block size (the
+ converter now enforces this) and the servo's unlock threshold must
+ clear the block-quantization sawtooth; coarse blocks also move you into
+ a measurably different quality regime.
+
+## Channels: coherence is free, so don't pay for it
+
+`Config::channels` is a runtime count with no architectural limit — mono
+through 7.1.4 and beyond. The design rule is about instance boundaries:
+**one instance per clock domain**. If a 12-channel AVB stream and a stereo
+monitor feed arrive on the *same* recovered clock, they are one domain and
+may share one instance or use one instance per stream, as convenient; but
+never split one stream's channels across instances, and never funnel two
+clock domains into one.
+
+The reason to keep a stream's channels together is a property the
+implementation gives you by construction. Within one instance, every
+channel of a frame is resampled at *literally the same fractional
+position*: the phase accumulator, the servo, and the coefficient blend are
+all per-instance state, and the per-channel work is only the dot product.
+There is no per-channel phase to drift, so inter-channel phase coherence
+is exact — not "matched to within a specification," but bit-identical in
+the only quantity that could differ. Two audiences care intensely:
+
+- **Surround imaging.** Phantom sources between speakers are constructed
+ from inter-channel amplitude and time relationships; an ASRC that
+ resampled channels at even slightly different phases would smear them.
+ Here there is no skew to budget for.
+- **Microphone arrays.** Beamforming and cross-correlation live entirely
+ on inter-channel time differences at sub-sample precision. The README
+ calls out the AVB case directly: a stream bundling reference microphones
+ with the program feed keeps its array geometry intact through the
+ converter. (AVB Class A's 8-frame packets are also fine-grained enough
+ for the Quiet servo stage — the block axis, below, cooperates.)
+
+Split those channels across instances and you forfeit the guarantee: each
+instance runs its own servo on its own FIFO, and two servos tracking the
+same physical clock still produce two independently-wobbling phase
+trajectories. The coherence rule costs nothing and buys exactness; its
+violation costs exactness and buys nothing.
+
+### What N channels cost
+
+Sharing one fractional position per frame also shapes the cost. Each
+output frame computes the coefficient blend — the interpolation between
+adjacent polyphase rows — *once*, then reuses it for every channel's dot
+product. N channels cost `blend + N × dot`, not `N × (blend + dot)`; the
+fixed overhead amortizes, so the marginal channel is cheaper than the
+first.
+
+The instruction-count table in the README measures this shape. Comparing
+the 12-channel Q15 pipeline against stereo across the three gated targets:
+
+| Target | `pipeline_q15` (2 ch) | `pipeline12_q15` (12 ch) | ratio |
+|---|---:|---:|---:|
+| Cortex-M33 | 484,146,844 | 962,613,655 | 2.0× |
+| Cortex-M55 | 127,446,817 | 387,876,968 | 3.0× |
+| Hexagon | 119,847,854 | 378,858,793 | 3.2× |
+
+Six times the channels for 2.0–3.2× the instructions. The spread itself is
+informative: the M33's 2.0× says its per-frame cost is dominated by
+shared work (the servo's soft-double arithmetic on an FP64-less core), so
+extra channels are nearly half price; the M55 and Hexagon, whose shared
+work is cheap, sit closer to the pure dot-product slope. On the host, the
+same shape: Q15 stereo at 56.0 ns/frame versus 12-channel at 189.1 —
+3.4× for 6× the channels, with a 12-channel stream still running at 110×
+realtime on one Xeon core.
+
+### The proof that channels don't leak
+
+Coherence and cost say nothing about *correctness* — an interleave bug or
+a channel permutation would sail through every single-channel quality
+metric in the suite. `tests/test_multichannel.cpp` exists for exactly
+that blind spot: every channel of one instance gets its own tone (600 +
+731·c Hz — distinct, non-harmonically related, all inside the flat
+passband up to 16 channels), and after a +200 ppm crossing each channel
+must contain its own tone at full quality and nothing measurable of any
+neighbor's.
+
+"Nothing measurable" is made rigorous the way this project usually is: the
+channel's own tone is removed by tracked least-squares fit before the
+other channels' frequencies are fitted on the residual, so the own tone's
+spectral leakage (about −67 dB over a 1 s rectangular window at these
+spacings) cannot masquerade as crosstalk. The gated results: worst
+crosstalk below **−100 dB** for 12-channel float, below **−72 dB** for
+16-channel Q15 — the latter sitting at the 16-bit format's own floor,
+which is the honest bound for that datapath. Amplitude and per-channel
+SNR are asserted in the same run, so a permutation, a gain error, and
+crosstalk are all caught by one test.
+
+One coverage note is worth repeating because of how it was found. The
+host's channel-parallel float kernel tiles channels in blocks of 8/4/2/1,
+and an audit noticed that no test ever ran the K=2 and K=1 remainder
+tiles — every configured count happened to decompose without them. The
+suite now runs 5- and 7-channel variants (5 = 4+1, 7 = 4+2+1) precisely
+to execute those tiles. The general lesson from Part II recurs: coverage
+you haven't verified reaches the code is coverage you don't have.
+
+## Rates: hertz-denominated defaults are a 48 kHz assumption
+
+The library's defaults read as innocently portable — until you notice
+which fields carry units. `FilterSpec::balanced()` places the passband
+edge at 20,000 Hz and the first image to suppress at 28,000 Hz;
+`ServoConfig` sets loop bandwidths of 10/1/0.05 Hz and smoother corners of
+50/5/0.5 Hz. Every one of those is an *absolute frequency chosen for
+48 kHz operation*, and the two misconfigurations they invite fail in
+instructively different ways.
+
+The filter misconfiguration fails loudly, by design. Default band edges at
+a 16 kHz rate would put the anti-image cutoff far above the input Nyquist
+— a filter that passes images wholesale — and the constructor's validation
+rejects the geometry outright (`passbandHz + stopbandHz` must not exceed
+the sample rate), so you cannot ship it by accident. The servo
+misconfiguration is the dangerous one, because nothing forces you to
+notice: scale the filter (you must, to construct at all), keep the default
+servo, and the converter builds, locks, tracks, and converts — while
+silently costing about **32 dB** of quality at 16 kHz. That number is
+measured, and the mechanism is worth understanding because it is the whole
+rate-scaling story in one incident.
+
+At 16 kHz with a 200 ppm offset, the whole-sample slips arrive at
+`ppm × fs` = 3.2 Hz instead of 9.6 Hz. The servo's three-pole Quiet
+smoother has an absolute 0.5 Hz corner, so a beat at one-third the
+frequency is rejected `(16/48)³` ≈ 28.6 dB *less* — the slip sawtooth
+walks out from under the smoother, leaks into the rate estimate, and
+frequency-modulates the audio. The measurement wears the FM signature
+openly: roughly 32 dB below the 48 kHz figures at every tone, falling
+6 dB per octave of signal frequency, exactly as small-index FM sidebands
+scale. Nothing was wrong with the filter; the *control loop* was mistuned
+by a factor of three because its tuning was written in hertz.
+
+The remedy is the `scaledTo` trio, and the factory that applies it:
+
+```cpp
+srt::Config cfg = srt::Config::forSampleRate(16000.0);
+cfg.channels = ...; // then adjust as usual
+```
+
+`FilterSpec::scaledTo` multiplies the band edges by `fs/48000` — same L
+and T, so the same table size and per-frame cost, with the identical
+response at every *normalized* frequency. `ServoConfig::scaledTo` does the
+same to the six bandwidth/corner fields, keeping the loop identical in
+per-sample terms — and scales the two hold times *inversely*, so the
+promotion gates wait the same number of loop time constants rather than
+the same number of wall-clock seconds. (That last refinement postdates the
+first hand-scaled fix; re-measured, it changed nothing within noise, and
+the test asserting it exists so the equivalence stays checked rather than
+remembered.) Frame- and ppm-denominated fields — lock and unlock
+thresholds, `targetLatencyFrames`, ppm limits — are rate-invariant and stay
+put, though a frame count's *duration* in ms scales inversely with the rate.
+
+`tests/test_asrc_quality_16k.cpp` runs the full quality methodology
+through the factory, and the outcome is the point of the design: 16 kHz
+matches the 48 kHz *normalized-frequency structure*. The tones sit at the
+same f/fs as the 48 kHz suite's 997 Hz / 6 k / 12 k / 19.5 k, and measure
+136.6 / 121.9 / 114.3 / 106.5 dB against the 48 kHz suite's 135.0 /
+120.0 / 112.8 / 105.8 on the same host — within about 2 dB down the line,
+confirming that interpolation noise depends only on f/fs. Two consequences
+deploy with you: group delay at the same tap count stays ~24 *input
+samples*, which is three times as many milliseconds at 16 kHz (1.5 ms vs
+0.5 ms); and the scaled Quiet loop at ~0.017 Hz settles proportionally
+slower — the 16 kHz test runs 120 s where the 48 kHz one ran 40 s, the
+same number of samples and of time constants.
+
+## Blocks: feasibility, then observability
+
+The block-size axis has two boundaries, one hard and one
+information-theoretic.
+
+### The hard one: a pull can only synthesize from what is buffered
+
+`pull(frames)` produces output from frames already in the FIFO. If the
+occupancy setpoint sits at or below the pull block size, the loop is
+infeasible: each pull drains the buffer past the setpoint, the servo
+steers to refill it, the next pull drains it again — a permanent underrun
+limit cycle, dropouts every few hundred milliseconds, never locking. Early
+versions documented the rule ("the setpoint must exceed the pull block
+size") and trusted the integrator between chair and keyboard; the current
+converter enforces it. When `pull()` observes a block larger than the
+setpoint in force, it raises the *effective* setpoint to the block plus a
+margin — half a block, at least one pop chunk — sized so the entry
+occupancy never grazes the pull size even at the bottom of the block-beat
+sawtooth, and bounded by FIFO capacity:
+
+```cpp
+const std::size_t needed = frames + std::max(frames / 2, kPopChunkFrames);
+const std::size_t newTarget =
+ std::clamp(needed, cfg_.targetLatencyFrames, maxTargetFrames_);
+```
+
+Configurations that already satisfy the rule are left exactly as
+configured; the servo slews to a raised setpoint glitch-free (integrator
+kept — the clocks haven't changed, only the target). The cost is not
+hidden: latency follows the raised setpoint, `designedLatencySeconds()`
+reports it, and `Status::effectiveTargetLatencyFrames` differs from the
+configured value exactly when the adaptation has occurred — a field worth
+plotting in deployment telemetry, because it is the converter telling you
+your latency budget and your callback size disagree. Capacity bounds the
+raise: the default ring (a 1024-frame floor) accommodates pull blocks up
+to ~340 frames; larger callbacks need `fifoFrames` sized explicitly.
+
+### The soft one: what a coarse count can tell a servo
+
+The servo's only sensor is FIFO occupancy, and occupancy is quantized —
+to whole frames at best, to whole *blocks* with block transfer. At
+deviation ε the observable carries a deterministic sawtooth, one push
+block peak-to-peak, at the beat frequency `ε × fs / block`. Whatever the
+loop passes into its estimate frequency-modulates the audio. With
+sample-granular transfer the sawtooth is one frame and the Quiet stage's
+three-pole cascade rejects it to roughly −120 dBc equivalent at 20 kHz.
+With ≥32-frame callbacks, that level of quiet is
+**information-theoretically unavailable from counts alone** — no filter
+recovers sub-sawtooth phase from an observable whose quantization *is*
+the sawtooth, not while still tracking real drift.
+
+The design response is to stop pretending. Promotion from Track to Quiet
+is gated on the cascade-smoothed error staying small, which is naturally
+false while a large block beat dominates the observable — the gate is
+itself the discriminator between the two regimes, so coarse-block
+operation deliberately stays in Track. There the block beat is mostly
+phase-tracked as benign *latency breathing* (the FIFO term of the latency
+wanders by a fraction of the block as the servo follows the beat), and
+the remainder appears as low-rate FM measured in cents:
+`notebooks/asrc_block_size_study.ipynb` puts it at ~0.9 cents rms over a
+61 dB wideband floor at 32-frame blocks, ~1.3 cents rms over 53 dB at
+5 ms (240-frame) blocks. Those are honest numbers for a different regime,
+not a degradation of the headline ones — the 135 dB figures are for
+fine-grained transfer, and the comparison document says so plainly. If
+your deployment pushes hardware-DMA-sized blocks and needs studio
+transparency, the current converter's limit is informational, not an
+accident of tuning; the README's limitations section sketches the eventual
+answer (per-block timestamps for sub-sample phase observation).
+
+One more block-denominated rule closes the loop with the previous
+chapter. The servo's `unlockThresholdFrames` (default 24) is the
+excursion that demotes a stage; block-quantized occupancy legitimately
+swings by about half a block without the clocks having moved. The
+guidance in `pi_servo.hpp` — keep the threshold comfortably above half
+the block — is applied literally in the ALSA bridge (`1.5 ×` the period),
+and ignoring it produces the most confusing failure on this axis: a
+converter that locks, runs cleanly, and "spuriously" demotes itself on
+schedule, at the beat frequency, forever.
+
+## The configuration walk, in order
+
+The axes compose, so a deployment configures them in dependency order:
+
+1. Start from `Config::forSampleRate(rate)` — never raw defaults at a
+ non-48 kHz rate.
+2. Set `channels` to the full width of each clock domain's stream; one
+ instance per domain.
+3. Set `targetLatencyFrames` above your pull block *and* your worst
+ push/pull jitter excursion (the dual-core firmware's 144-frame
+ setpoint against a 2 ms logging stall is the worked example); set
+ `fifoFrames` explicitly past ~340-frame callbacks.
+4. Raise `unlockThresholdFrames` above ~1.5× your transfer block.
+5. Then watch `Status::effectiveTargetLatencyFrames` and the resync
+ counters in production — they are the converter's own opinion of
+ whether steps 3 and 4 were done right.
+
+## Verify it yourself
+
+```sh
+# Channel independence: 12ch float (< -100 dB crosstalk), 16ch Q15
+# (< -72 dB), plus the 5/7-channel remainder-tile variants:
+ctest --test-dir build -R MultiChannel --output-on-failure
+
+# The rate-scaling rule and the 16 kHz measurements (slow: each case is
+# a 120 s simulated run; the first test checks the factory arithmetic
+# deterministically):
+ctest --test-dir build -R AsrcQuality16k --output-on-failure
+
+# The -32 dB failure itself, reproduced: in test_asrc_quality_16k.cpp,
+# keep Config::forSampleRate(kFs) but overwrite the servo with unscaled
+# defaults (cfg.servo = srt::ServoConfig{};) — the converter still builds
+# and locks, and every threshold fails by ~30 dB, falling 6 dB per octave
+# of tone frequency: the FM signature. (Restoring the unscaled *filter*
+# instead fails fast: the constructor rejects band edges above the input
+# Nyquist.)
+
+# The block axis, measured: latency breathing and the cents-scale FM
+# decomposition at 32/64/240-frame blocks:
+jupyter nbconvert --execute notebooks/asrc_block_size_study.ipynb
+
+# The feasibility rule live: run the drifting-clocks example, then rerun
+# with cfg.targetLatencyFrames set below kChunk in the source — the
+# adaptive raise reports itself in effectiveTargetLatencyFrames instead
+# of dropping out:
+./build/examples/drifting_clocks
+```
+
+The break-it-on-purpose suggestions are, as ever, the chapter in
+miniature: each rule here was learned from a measured failure, and each
+failure is still one edit away from being watched as it happens.
diff --git a/cmake/arm-cortex-m55-mps3.cmake b/cmake/arm-cortex-m55-mps3.cmake
index aa5a904..23a8991 100644
--- a/cmake/arm-cortex-m55-mps3.cmake
+++ b/cmake/arm-cortex-m55-mps3.cmake
@@ -23,6 +23,7 @@ set(CMAKE_C_FLAGS_INIT "-mcpu=cortex-m55 -mthumb -mfloat-abi=hard -ffunction-sec
set(CMAKE_CXX_FLAGS_INIT "${CMAKE_C_FLAGS_INIT}")
get_filename_component(_srt_platform "${CMAKE_CURRENT_LIST_DIR}/../platform/mps3_an547" ABSOLUTE)
+# ANCHOR: pt_linkline
# The startup .c is handed to the link line directly; the gcc driver
# compiles it with the same -mcpu/-mfloat-abi flags as everything else.
# `-x c` forces C compilation even under the g++ driver (which would treat
@@ -30,6 +31,7 @@ get_filename_component(_srt_platform "${CMAKE_CURRENT_LIST_DIR}/../platform/mps3
# initializers are link-time constants, never dynamic initialization.
set(CMAKE_EXE_LINKER_FLAGS_INIT
"--specs=rdimon.specs -nostartfiles -Wl,--gc-sections -T${_srt_platform}/mps3_an547.ld -x c ${CMAKE_CURRENT_LIST_DIR}/../platform/armv8m_startup.c -x none")
+# ANCHOR_END: pt_linkline
set(CMAKE_CROSSCOMPILING_EMULATOR
"qemu-system-arm;-M;mps3-an547;-nographic;-semihosting;-kernel")
diff --git a/include/srt/asrc.hpp b/include/srt/asrc.hpp
index a26291d..959e9ad 100644
--- a/include/srt/asrc.hpp
+++ b/include/srt/asrc.hpp
@@ -19,6 +19,7 @@
namespace srt {
+// ANCHOR: p0_config
/// Converter configuration. The defaults give ~1.5 ms designed latency at
/// 48 kHz (FIFO setpoint 48 frames + ~24 frames filter group delay; see
/// the README latency section), transparent for clocks within +/-1000 ppm.
@@ -29,6 +30,7 @@ struct Config {
std::size_t fifoFrames = 0; ///< ring capacity; 0 => automatic
FilterSpec filter{};
ServoConfig servo{};
+ // ANCHOR_END: p0_config
/// Defaults adapted to a nominal rate other than 48 kHz. The filter
/// band edges and servo bandwidths are absolute Hz designed for 48 kHz;
@@ -146,6 +148,7 @@ class BasicAsyncSampleRateConverter {
return ring_.read(dst, maxFrames * cfg_.channels) / cfg_.channels;
};
+ // ANCHOR: asrc_feasibility
// Feasibility: a pull must synthesize from frames already buffered,
// so the occupancy setpoint must exceed the pull block size or the
// loop drains into a permanent underrun limit cycle (dropouts every
@@ -173,8 +176,10 @@ class BasicAsyncSampleRateConverter {
}
}
+ // ANCHOR_END: asrc_feasibility
double occ = backlogFrames();
+ // ANCHOR: asrc_filling
if (filling_) {
if (occ < static_cast(fillThresholdFrames_)) {
fillSilence(interleaved, frames * ch);
@@ -190,6 +195,8 @@ class BasicAsyncSampleRateConverter {
fadeFramesLeft_ = kFadeFrames;
}
+ // ANCHOR_END: asrc_filling
+ // ANCHOR: asrc_resync
if (occ > static_cast(highWaterFrames_)) { // hard resync
const double target = static_cast(targetFrames_);
// The discard can only come from the ring; frames staged in the
@@ -206,9 +213,11 @@ class BasicAsyncSampleRateConverter {
servo_.seed(occ + resampler_.mu());
}
+ // ANCHOR_END: asrc_resync
const double dt = static_cast(frames) / cfg_.sampleRateHz;
const double epsHat = servo_.update(occ, resampler_.mu(), dt);
+ // ANCHOR: asrc_underrun
const std::size_t made = resampler_.process(interleaved, frames, epsHat, popFn);
if (fadeFramesLeft_ != 0 && made != 0)
applyFadeIn(interleaved, made);
@@ -220,6 +229,7 @@ class BasicAsyncSampleRateConverter {
}
publishStatus();
return made;
+ // ANCHOR_END: asrc_underrun
}
/// Any thread: telemetry snapshot (relaxed atomics; fields are individually
diff --git a/include/srt/detail/kaiser.hpp b/include/srt/detail/kaiser.hpp
index e9ac8a3..011bd67 100644
--- a/include/srt/detail/kaiser.hpp
+++ b/include/srt/detail/kaiser.hpp
@@ -1,3 +1,4 @@
+// ANCHOR: kai_design_note
/// \file kaiser.hpp
/// \brief Kaiser-window FIR prototype design for the polyphase interpolation bank.
///
@@ -8,6 +9,7 @@
/// minutes of compile time in every including translation unit. Runtime design
/// takes well under 10 ms, runs once in a constructor, and is off the audio path,
/// so all design math here is plain runtime double precision.
+// ANCHOR_END: kai_design_note
#ifndef SRT_DETAIL_KAISER_HPP
#define SRT_DETAIL_KAISER_HPP
@@ -18,6 +20,7 @@
namespace srt::detail {
+// ANCHOR: kai_besseli0
/// Modified Bessel function of the first kind, order zero, by power series.
/// Converges for all practical Kaiser betas (|x| < ~40); terms are added until
/// they no longer contribute at double precision.
@@ -34,7 +37,9 @@ inline double besselI0(double x) noexcept {
}
return sum;
}
+// ANCHOR_END: kai_besseli0
+// ANCHOR: kai_beta
/// Kaiser window shape parameter for a given stopband attenuation in dB
/// (Kaiser's published empirical fit).
inline double kaiserBeta(double attenDb) noexcept {
@@ -44,7 +49,9 @@ inline double kaiserBeta(double attenDb) noexcept {
return 0.5842 * std::pow(attenDb - 21.0, 0.4) + 0.07886 * (attenDb - 21.0);
return 0.0;
}
+// ANCHOR_END: kai_beta
+// ANCHOR: kai_estimate
/// Kaiser/harris FIR length estimate, expressed per polyphase branch.
///
/// \param attenDb target stopband attenuation in dB
@@ -59,7 +66,9 @@ inline std::size_t estimateTaps(double attenDb, double transWidthNorm) noexcept
const double n = (attenDb - 8.0) / (2.285 * 2.0 * std::numbers::pi * transWidthNorm);
return n > 4.0 ? static_cast(std::ceil(n)) : 4;
}
+// ANCHOR_END: kai_estimate
+// ANCHOR: kai_sinc
/// sin(pi x)/(pi x) with the removable singularity handled.
inline double sinc(double x) noexcept {
if (std::abs(x) < 1e-12)
@@ -67,7 +76,9 @@ inline double sinc(double x) noexcept {
const double px = std::numbers::pi * x;
return std::sin(px) / px;
}
+// ANCHOR_END: kai_sinc
+// ANCHOR: kai_prototype
/// Designs the Kaiser-windowed sinc prototype lowpass for an L-phase
/// interpolation bank.
///
@@ -98,6 +109,7 @@ inline void designPrototype(std::span h, std::size_t numPhases, double c
for (auto& v : h)
v *= gain;
}
+// ANCHOR_END: kai_prototype
} // namespace srt::detail
diff --git a/include/srt/pi_servo.hpp b/include/srt/pi_servo.hpp
index a9ce41d..762c1e4 100644
--- a/include/srt/pi_servo.hpp
+++ b/include/srt/pi_servo.hpp
@@ -47,6 +47,7 @@
namespace srt {
+// ANCHOR: sv_config
/// Servo tuning. Defaults suit a 48 kHz near-unity converter.
/// unlockThresholdFrames should stay comfortably above half the push/pull
/// block size, since block-quantized occupancy legitimately excursions by
@@ -64,7 +65,9 @@ struct ServoConfig {
double quietHoldSeconds = 2.0; ///< cascade-|e| hold => track -> quiet
double unlockThresholdFrames = 24.0; ///< |e| above this => demote a stage
double maxDeviationPpm = 1000.0; ///< epsHat clamp = +/- 1.5x this
+ // ANCHOR_END: sv_config
+ // ANCHOR: sv_scaled_to
/// This config rescaled from the 48 kHz design rate to sampleRateHz:
/// the loop bandwidths and error-smoother corners are absolute Hz and
/// must track the rate, or the slip-sawtooth beat (ppm * fs) walks out
@@ -87,6 +90,7 @@ struct ServoConfig {
s.quietHoldSeconds /= r;
return s;
}
+ // ANCHOR_END: sv_scaled_to
};
/// PI loop filter + three-stage lock-state machine. Pure double-precision
@@ -103,6 +107,7 @@ class PiServo {
reset(false);
}
+ // ANCHOR: sv_reset
/// Re-arm the loop. keepIntegrator preserves the accumulated ppm estimate
/// (the right choice after a dropout: the clocks have not changed).
void reset(bool keepIntegrator) noexcept {
@@ -124,7 +129,9 @@ class PiServo {
/// to the new setpoint at its clamped rate with no transient discontinuity
/// — used by the converter's adaptive pull-block setpoint raise.
void setTarget(double targetFrames) noexcept { target_ = targetFrames; }
+ // ANCHOR_END: sv_reset
+ // ANCHOR: sv_update_smooth
/// One control update; call once per pull() before synthesis.
/// \param occFrames raw backlog in frames (FIFO + staged frames)
/// \param mu current fractional read position; occ + mu changes
@@ -143,7 +150,9 @@ class PiServo {
q3_ += aq * (q2_ - q3_);
const double eFast = lpFast_ - target_;
const double eQuiet = q3_ - target_;
+ // ANCHOR_END: sv_update_smooth
+ // ANCHOR: sv_update_stages
const double limit = 1.5 * cfg_.maxDeviationPpm * 1e-6;
switch (stage_) {
case Stage::Acquire:
@@ -168,7 +177,9 @@ class PiServo {
}
break;
}
+ // ANCHOR_END: sv_update_stages
+ // ANCHOR: sv_update_out
double kp = 0.0;
double ki = 0.0;
double e = 0.0;
@@ -187,6 +198,7 @@ class PiServo {
epsHat_ = std::clamp(kp * e + integ_, -limit, limit);
return epsHat_;
}
+ // ANCHOR_END: sv_update_out
Stage stage() const noexcept { return stage_; }
bool locked() const noexcept { return stage_ != Stage::Acquire; }
@@ -199,6 +211,7 @@ class PiServo {
return 1.0 - std::exp(-2.0 * std::numbers::pi * cornerHz * dt);
}
+ // ANCHOR: sv_hold
/// Hold-window logic shared by both promotions: |e| must stay below the
/// threshold for holdSeconds; meanwhile epsHat is averaged (time constant
/// holdSeconds/5) so the promotion can hand a clean estimate to the
@@ -218,12 +231,15 @@ class PiServo {
holdTimer_ = 0.0;
return true;
}
+ // ANCHOR_END: sv_hold
+ // ANCHOR: sv_gains
void computeGains(double bandwidthHz, double& kp, double& ki) const noexcept {
const double wn = 2.0 * std::numbers::pi * bandwidthHz;
kp = 2.0 * cfg_.damping * wn / fs_;
ki = wn * wn / fs_;
}
+ // ANCHOR_END: sv_gains
ServoConfig cfg_;
double fs_;
diff --git a/include/srt/polyphase_filter.hpp b/include/srt/polyphase_filter.hpp
index 345604a..9d3205e 100644
--- a/include/srt/polyphase_filter.hpp
+++ b/include/srt/polyphase_filter.hpp
@@ -23,6 +23,7 @@
#define SRT_RESTRICT __restrict__
#endif
+// ANCHOR: opt_smlald_gate
// Dual 16x16 MAC (SMLALD) for the Q15 dot product on Arm cores that have
// the DSP extension but no Helium — the Cortex-M33/M4/M7 class (e.g.
// Raspberry Pi Pico 2). Gated off when MVE is present: on M55 the compiler
@@ -36,6 +37,7 @@
#else
#define SRT_Q15_SMLALD 0
#endif
+// ANCHOR_END: opt_smlald_gate
// Channel-parallel dot product for high channel counts (hypothesis C6,
// docs/PERFORMANCE.md): history stored frame-major so the per-tap inner
@@ -63,6 +65,7 @@
namespace srt {
+// ANCHOR: bank_spec
/// Specification of the interpolation prototype filter.
///
/// numPhases (L) sets the polyphase table resolution: the residual images from
@@ -94,6 +97,7 @@ struct FilterSpec {
.stopbandHz = 26000.0,
.stopbandAttenDb = 140.0};
}
+ // ANCHOR_END: bank_spec
/// This spec with the band edges rescaled from the 48 kHz design rate
/// to sampleRateHz. The presets' passband/stopband are absolute Hz
@@ -111,6 +115,7 @@ struct FilterSpec {
}
};
+// ANCHOR: bank_layout
/// Immutable polyphase coefficient table designed at construction.
///
/// Storage layout: (L+1) rows of T coefficients. Row p in [0, L) is polyphase
@@ -119,11 +124,13 @@ struct FilterSpec {
/// and the mu wrap 1.0 -> 0.0 (window shifted by one sample) is exactly
/// continuous. Rows are stored tap-reversed so the dot product runs forward
/// over an oldest-first history window.
+// ANCHOR_END: bank_layout
template
class PolyphaseFilterBank {
public:
using Coeff = typename SampleTraits::Coeff;
+ // ANCHOR: bank_build
/// Designs the prototype (double precision) and builds the table.
/// Allocates; may throw std::invalid_argument / std::bad_alloc. Do this at
/// setup time, not on the audio path.
@@ -150,7 +157,9 @@ class PolyphaseFilterBank {
}
}
}
+ // ANCHOR_END: bank_build
+ // ANCHOR: bank_accessors
/// Row pointer for phase p in [0, numPhases()]; T contiguous coefficients.
const Coeff* phase(std::size_t p) const noexcept { return table_.data() + p * taps_; }
std::size_t numPhases() const noexcept { return phases_; } ///< L
@@ -160,6 +169,7 @@ class PolyphaseFilterBank {
double groupDelaySamples() const noexcept {
return static_cast(phases_ * taps_ - 1) / (2.0 * static_cast(phases_));
}
+ // ANCHOR_END: bank_accessors
private:
std::size_t phases_;
@@ -167,6 +177,7 @@ class PolyphaseFilterBank {
std::vector table_; // (L+1) x T, rows tap-reversed
};
+// ANCHOR: bank_interpolate
/// Evaluates one output sample at fractional position mu in [0, 1).
///
/// \param hist oldest-first window of the newest T input samples of one channel
@@ -192,6 +203,7 @@ inline S interpolate(const PolyphaseFilterBank& bank, const S* hist, double m
acc = Tr::mac(acc, hist[t], Tr::blend(c0[t], c1[t], fr));
return Tr::finalize(acc);
}
+// ANCHOR_END: bank_interpolate
/// Blends the two phase rows adjacent to mu into `row` (taps() entries).
/// Multichannel datapaths do this once per output frame and then run
@@ -213,6 +225,7 @@ inline void blendRow(const PolyphaseFilterBank& bank,
row[t] = Tr::blend(c0[t], c1[t], fr);
}
+// ANCHOR: rs_blend_row_phase
/// Phase-bit variants: the fractional position as an unsigned Q0.64
/// fraction. The polyphase index is the top log2(L) bits and the intra-phase
/// blend factor comes from the bits below — no double arithmetic per sample,
@@ -232,7 +245,9 @@ inline void blendRowPhase(const PolyphaseFilterBank& bank,
for (std::size_t t = 0; t < taps; ++t)
row[t] = Tr::blend(c0[t], c1[t], fr);
}
+// ANCHOR_END: rs_blend_row_phase
+// ANCHOR: rs_interpolate_phase
/// interpolate() over a Q0.64 phase; fused blend+mac (mono fast path).
template
inline S interpolatePhase(const PolyphaseFilterBank& bank, const S* hist,
@@ -249,7 +264,9 @@ inline S interpolatePhase(const PolyphaseFilterBank& bank, const S* hist,
acc = Tr::mac(acc, hist[t], Tr::blend(c0[t], c1[t], fr));
return Tr::finalize(acc);
}
+// ANCHOR_END: rs_interpolate_phase
+// ANCHOR: rs_dot_row
/// Dot product of a pre-blended coefficient row against a history window.
/// Identical arithmetic to interpolate() given the same mu: blend then mac,
/// per tap, in the same order — outputs are bit-exact either way.
@@ -281,7 +298,9 @@ inline S dotRow(const typename SampleTraits::Coeff* SRT_RESTRICT row, const S
acc = Tr::mac(acc, hist[t], row[t]);
return Tr::finalize(acc);
}
+// ANCHOR_END: rs_dot_row
+// ANCHOR: opt_dot_tile
/// One K-channel tile of the channel-parallel dot (hypothesis C6): K
/// accumulators live in a constexpr-size local array — registers, not
/// memory — while the tap loop walks the frame-major window with stride
@@ -303,7 +322,10 @@ inline void dotTileFrameMajor(const typename SampleTraits::Coeff* SRT_RESTRIC
for (std::size_t k = 0; k < K; ++k)
out[k] = Tr::finalize(acc[k]);
}
+// ANCHOR_END: opt_dot_tile
+// ANCHOR: rs_dot_rows_frame_major
+// ANCHOR: opt_dot_rows
/// Channel-parallel dot products over a frame-major history block: all
/// channels' outputs for one frame in register-blocked tiles of 8/4/2/1.
/// Per channel the accumulation order over taps equals dotRow's, so the
@@ -328,7 +350,10 @@ inline void dotRowsFrameMajor(const typename SampleTraits::Coeff* SRT_RESTRIC
if (c < channels)
dotTileFrameMajor(row, x + c, taps, channels, out + c);
}
+// ANCHOR_END: rs_dot_rows_frame_major
+// ANCHOR_END: opt_dot_rows
+// ANCHOR: rs_class_doc
/// Streaming fractional-delay engine for one converter instance.
///
/// Owns the history delay lines (planar per-channel below the
@@ -348,6 +373,7 @@ inline void dotRowsFrameMajor(const typename SampleTraits::Coeff* SRT_RESTRIC
/// detected by 64-bit wraparound instead of comparisons.
template
class FractionalResampler {
+ // ANCHOR_END: rs_class_doc
public:
/// Frame-major channel-parallel mode is compiled in only on CP targets
/// and only for floating-point samples (see SRT_CHANNEL_PARALLEL).
@@ -378,6 +404,7 @@ class FractionalResampler {
scratchPos_ = 0;
}
+ // ANCHOR: rs_mu
/// Fractional position in [0,1) as a double; used by the servo at block
/// rate (one conversion per pull, not per sample).
double mu() const noexcept { return static_cast(phase_) * 0x1p-64; }
@@ -386,6 +413,7 @@ class FractionalResampler {
/// Frames popped from the source but not yet consumed by the filter; part
/// of the effective backlog the servo must observe.
std::size_t bufferedFrames() const noexcept { return scratchFrames_ - scratchPos_; }
+ // ANCHOR_END: rs_mu
/// Fills the history window with taps() frames from the source.
/// Returns false (and stays unprimed) if the source ran dry.
@@ -400,6 +428,7 @@ class FractionalResampler {
return true;
}
+ // ANCHOR: rs_process_doc
/// Synthesizes up to maxFrames output frames (interleaved) advancing the
/// read position by (1 + epsHat) input frames per output frame. Returns
/// the number produced; fewer than maxFrames means the source ran dry
@@ -414,6 +443,9 @@ class FractionalResampler {
/// interleaved frames, returning the count actually delivered.
template
std::size_t process(S* out, std::size_t maxFrames, double epsHat, PopFn&& popFrames) noexcept {
+ // ANCHOR_END: rs_process_doc
+ // ANCHOR: p0_phase_step
+ // ANCHOR: rs_slip
// eps in Q0.64, converted once per call (block rate). |eps| is
// servo-clamped to ~1e-3, so eps * 2^64 fits int64 comfortably.
const auto epsFix = static_cast(epsHat * 0x1p64);
@@ -432,6 +464,9 @@ class FractionalResampler {
return n; // dry: phase_ not advanced for this frame
}
phase_ = m;
+ // ANCHOR_END: p0_phase_step
+ // ANCHOR_END: rs_slip
+ // ANCHOR: rs_dispatch
// Q15 on SMLALD targets routes mono through blendRow+dotRow as
// well: dotRow carries the dual-MAC loop, and the two paths are
// bit-exact by construction (see dotRow).
@@ -454,6 +489,7 @@ class FractionalResampler {
for (std::size_t c = 0; c < channels_; ++c)
out[n * channels_ + c] = dotRow(row_.data(), window(c), taps);
}
+ // ANCHOR_END: rs_dispatch
}
return maxFrames;
}
@@ -461,6 +497,7 @@ class FractionalResampler {
private:
const S* window(std::size_t c) const noexcept { return hist_[c].data() + end_ - bank_->taps(); }
+ // ANCHOR: rs_append
template
bool appendOne(PopFn&& popFrames) noexcept {
if (scratchPos_ == scratchFrames_) {
@@ -490,12 +527,14 @@ class FractionalResampler {
++scratchPos_;
return true;
}
+ // ANCHOR_END: rs_append
const PolyphaseFilterBank* bank_;
std::size_t channels_;
std::size_t chunk_;
std::size_t histCap_;
std::vector scratch_; // interleaved staging for bulk pops
+ // ANCHOR: rs_members
// History storage: planar (one delay line per channel, hist_[c]) below
// SRT_CP_MIN_CHANNELS, frame-major (single interleaved line, hist_[0])
// at or above it on SRT_CHANNEL_PARALLEL targets. end_/histCap_ count
@@ -507,6 +546,7 @@ class FractionalResampler {
std::size_t scratchFrames_ = 0;
std::size_t scratchPos_ = 0;
std::uint64_t phase_ = 0; // fractional position, unsigned Q0.64
+ // ANCHOR_END: rs_members
bool primed_ = false;
};
diff --git a/include/srt/sample_traits.hpp b/include/srt/sample_traits.hpp
index b3eb832..5fd4647 100644
--- a/include/srt/sample_traits.hpp
+++ b/include/srt/sample_traits.hpp
@@ -1,3 +1,4 @@
+// ANCHOR: st_overview
/// \file sample_traits.hpp
/// \brief Sample-type customization point for the resampling datapath.
///
@@ -14,6 +15,7 @@
/// The clock servo and the filter design always run in double regardless of
/// sample type (control path and one-time init, not the audio path), so the
/// fixed-point datapaths contain no floating-point inner loops.
+// ANCHOR_END: st_overview
#ifndef SRT_SAMPLE_TRAITS_HPP
#define SRT_SAMPLE_TRAITS_HPP
@@ -26,6 +28,7 @@ namespace srt {
namespace detail {
+// ANCHOR: st_roundsat
/// Round-and-saturate a double to a signed integer coefficient/sample type.
template
constexpr I roundSat(double v) noexcept {
@@ -38,6 +41,7 @@ constexpr I roundSat(double v) noexcept {
return std::numeric_limits::max();
return static_cast(r);
}
+// ANCHOR_END: st_roundsat
/// Saturate a 64-bit accumulator result to a narrower signed integer.
template
@@ -49,10 +53,13 @@ constexpr I clampSat(std::int64_t v) noexcept {
} // namespace detail
+// ANCHOR: st_primary
/// Primary template intentionally undefined; specialize per sample type.
template
struct SampleTraits;
+// ANCHOR_END: st_primary
+// ANCHOR: st_float
/// Float datapath: float samples and coefficients, double accumulation.
/// The double accumulator keeps the dot-product noise floor far below the
/// 120 dB transparency target; float coefficient storage quantizes the
@@ -69,6 +76,7 @@ struct SampleTraits {
/// Convert the intra-phase fraction (in [0,1)) once per output sample.
static BlendFactor makeBlendFactor(double fr) noexcept { return static_cast(fr); }
+ // ANCHOR: st_blend_q64_float
/// Blend factor from the top bits of a Q0.64 intra-phase fraction.
/// Single-precision only: the value is reduced to 24 bits first so the
/// uint->float conversion is exact and no double op is needed
@@ -76,6 +84,7 @@ struct SampleTraits {
static BlendFactor blendFactorFromQ64(std::uint64_t frac) noexcept {
return static_cast(frac >> 40) * 0x1p-24f;
}
+ // ANCHOR_END: st_blend_q64_float
/// acc + x * c, in the accumulator domain.
static Accum mac(Accum acc, float x, Coeff c) noexcept {
@@ -91,7 +100,9 @@ struct SampleTraits {
/// The zero/silence sample value.
static float silence() noexcept { return 0.0f; }
};
+// ANCHOR_END: st_float
+// ANCHOR: st_q15_header
/// Q15 fixed-point datapath (samples are int16_t in Q0.15).
///
/// Coefficients are stored in Q1.14: the prototype's peak tap reaches ~1.0
@@ -107,26 +118,34 @@ struct SampleTraits {
using Coeff = std::int16_t;
using Accum = std::int64_t;
using BlendFactor = std::int32_t; ///< fraction in Q15
+ // ANCHOR_END: st_q15_header
+ // ANCHOR: st_q15_coeff
static Coeff makeCoeff(double c) noexcept {
return detail::roundSat(c * 16384.0); // Q1.14
}
+ // ANCHOR_END: st_q15_coeff
static BlendFactor makeBlendFactor(double fr) noexcept {
return static_cast(fr * 32768.0); // Q15
}
+ // ANCHOR: st_q15_q64
/// Q15 blend factor straight from a Q0.64 fraction's top bits: no
/// floating point at all on the fixed-point per-sample path.
static BlendFactor blendFactorFromQ64(std::uint64_t frac) noexcept {
return static_cast(frac >> 49); // Q15
}
+ // ANCHOR_END: st_q15_q64
+ // ANCHOR: st_q15_mac
static Accum mac(Accum acc, std::int16_t x, Coeff c) noexcept {
return acc + static_cast(static_cast(x) *
static_cast(c));
}
+ // ANCHOR_END: st_q15_mac
+ // ANCHOR: st_q15_blend
static Coeff blend(Coeff a, Coeff b, BlendFactor fr) noexcept {
// Q14 + (Q15 * Q14) >> 15, in int64: the worst-case int32 product
// 32767 * 65535 = 2,147,385,345 sits 0.005% under INT32_MAX —
@@ -136,16 +155,20 @@ struct SampleTraits {
const std::int64_t diff = static_cast(b) - a;
return static_cast(a + ((fr * diff) >> 15));
}
+ // ANCHOR_END: st_q15_blend
+ // ANCHOR: st_q15_finalize
static std::int16_t finalize(Accum acc) noexcept {
// Round-half-up, not half-even: the bias is a fraction of one
// sub-LSB rounding step, far below the Q15 noise floor.
return detail::clampSat((acc + (1 << 13)) >> 14); // Q29 -> Q15
}
+ // ANCHOR_END: st_q15_finalize
static std::int16_t silence() noexcept { return 0; }
};
+// ANCHOR: st_q31
/// Q31 fixed-point datapath (samples are int32_t in Q0.31).
///
/// Coefficients are stored in Q1.30 (one headroom bit for the ~1.0 peak
@@ -174,9 +197,11 @@ struct SampleTraits {
return static_cast(frac >> 44); // Q20
}
+ // ANCHOR: st_q31_mac
static Accum mac(Accum acc, std::int32_t x, Coeff c) noexcept {
return acc + ((static_cast(x) * c) >> 16); // Q61 -> Q45
}
+ // ANCHOR_END: st_q31_mac
static Coeff blend(Coeff a, Coeff b, BlendFactor fr) noexcept {
const std::int64_t diff = static_cast(b) - a;
@@ -189,7 +214,9 @@ struct SampleTraits {
static std::int32_t silence() noexcept { return 0; }
};
+// ANCHOR_END: st_q31
+// ANCHOR: st_concept
/// Satisfied by any type with a complete, well-formed SampleTraits
/// specialization.
template
@@ -212,6 +239,7 @@ concept SampleType =
static_assert(SampleType);
static_assert(SampleType);
static_assert(SampleType);
+// ANCHOR_END: st_concept
} // namespace srt
diff --git a/include/srt/spsc_ring.hpp b/include/srt/spsc_ring.hpp
index e70b562..fa94eae 100644
--- a/include/srt/spsc_ring.hpp
+++ b/include/srt/spsc_ring.hpp
@@ -20,6 +20,7 @@
namespace srt {
+// ANCHOR: contract
/// Lock-free SPSC ring buffer of trivially copyable elements.
///
/// Thread contract: write() and writeAvailable() may only be called from the
@@ -34,6 +35,7 @@ class SpscRing {
static_assert(std::is_trivially_copyable_v);
// The lock-free claim of the whole audio path rests on these indices.
static_assert(std::atomic::is_always_lock_free);
+ // ANCHOR_END: contract
public:
/// Allocates the buffer; capacity is rounded up to a power of two.
@@ -45,6 +47,7 @@ class SpscRing {
std::size_t capacity() const noexcept { return buf_.size(); }
+ // ANCHOR: write
/// Producer: append up to n elements; returns the number actually written.
std::size_t write(const T* src, std::size_t n) noexcept {
const std::size_t head = head_.load(std::memory_order_relaxed);
@@ -64,12 +67,15 @@ class SpscRing {
return n;
}
+ // ANCHOR_END: write
+
/// Producer: exact free space at the time of the call.
std::size_t writeAvailable() noexcept {
tailCache_ = tail_.load(std::memory_order_acquire);
return capacity() - (head_.load(std::memory_order_relaxed) - tailCache_);
}
+ // ANCHOR: read
/// Consumer: remove up to n elements; returns the number actually read.
std::size_t read(T* dst, std::size_t n) noexcept {
const std::size_t tail = tail_.load(std::memory_order_relaxed);
@@ -89,6 +95,8 @@ class SpscRing {
return n;
}
+ // ANCHOR_END: read
+
/// Consumer: exact occupancy at the time of the call.
std::size_t readAvailable() noexcept {
headCache_ = head_.load(std::memory_order_acquire);
@@ -110,6 +118,7 @@ class SpscRing {
}
private:
+ // ANCHOR: layout
// 64-byte separation to keep producer- and consumer-owned state on
// distinct cache lines (std::hardware_destructive_interference_size is
// deliberately avoided: it is ABI-fragile and warns on GCC). The
@@ -123,6 +132,7 @@ class SpscRing {
alignas(kCacheLine) std::size_t tailCache_{0}; // producer's view of tail
alignas(kCacheLine) std::atomic tail_{0}; // written by consumer
alignas(kCacheLine) std::size_t headCache_{0}; // consumer's view of head
+ // ANCHOR_END: layout
};
} // namespace srt
diff --git a/notebooks/asrc_demo.ipynb b/notebooks/asrc_demo.ipynb
index b026d74..08d0f77 100644
--- a/notebooks/asrc_demo.ipynb
+++ b/notebooks/asrc_demo.ipynb
@@ -727,8 +727,8 @@
"\n",
"| What | Measured here |\n",
"|---|---|\n",
- "| Naive FIFO at +200 ppm | clicks ~10×/s, SNR around 29 dB dB |\n",
- "| SampleRateTap, same conditions | **SNR > 130 dB** — at the 24-bit noise floor |\n",
+ "| Naive FIFO at +200 ppm | clicks ~10×/s, SNR around 29 dB |\n",
+ "| SampleRateTap, same conditions | **SNR 126.4 dB measured** (cell asserts > 125) |\n",
"| Lock from cold start | ~1 s |\n",
"| Latency | ≈ designed 1.5 ms, linear phase |\n",
"| 50 ppm/s drift ramp | tracked, locked, zero underruns |\n",
diff --git a/platform/armv8m_startup.c b/platform/armv8m_startup.c
index a0007e8..4db20bd 100644
--- a/platform/armv8m_startup.c
+++ b/platform/armv8m_startup.c
@@ -47,6 +47,7 @@ void* __dso_handle;
void _init(void) {}
void _fini(void) {}
+/* ANCHOR: pt_sbrk */
void* _sbrk(ptrdiff_t increment) {
static char* brk = &__heap_start__;
if (brk + increment > &__heap_end__) {
@@ -57,7 +58,9 @@ void* _sbrk(ptrdiff_t increment) {
brk += increment;
return prev;
}
+/* ANCHOR_END: pt_sbrk */
+/* ANCHOR: pt_irqlock */
static inline uint32_t irqLock(void) {
uint32_t primask;
__asm volatile("mrs %0, PRIMASK\n cpsid i" : "=r"(primask)::"memory");
@@ -67,6 +70,7 @@ static inline uint32_t irqLock(void) {
static inline void irqRestore(uint32_t primask) {
__asm volatile("msr PRIMASK, %0" ::"r"(primask) : "memory");
}
+/* ANCHOR_END: pt_irqlock */
uint64_t __atomic_load_8(const volatile void* ptr, int memorder) {
(void)memorder;
@@ -83,6 +87,7 @@ void __atomic_store_8(volatile void* ptr, uint64_t value, int memorder) {
irqRestore(m);
}
+/* ANCHOR: pt_atomic_rmw */
uint64_t __atomic_fetch_add_8(volatile void* ptr, uint64_t value, int memorder) {
(void)memorder;
const uint32_t m = irqLock();
@@ -91,6 +96,7 @@ uint64_t __atomic_fetch_add_8(volatile void* ptr, uint64_t value, int memorder)
irqRestore(m);
return prev;
}
+/* ANCHOR_END: pt_atomic_rmw */
uint64_t __atomic_exchange_8(volatile void* ptr, uint64_t value, int memorder) {
(void)memorder;
@@ -101,6 +107,7 @@ uint64_t __atomic_exchange_8(volatile void* ptr, uint64_t value, int memorder) {
return prev;
}
+/* ANCHOR: pt_reset */
void Reset_Handler(void) {
/* MSPLIM exists on Armv8-M Mainline only (both targets are M33/M55
* class): a main-stack overflow past __stack_limit raises a fault
@@ -119,6 +126,7 @@ void Reset_Handler(void) {
__libc_init_array(); /* C++ static constructors */
exit(main(0, (char**)0));
}
+/* ANCHOR_END: pt_reset */
void Default_Handler(void) {
for (;;) {
@@ -134,6 +142,7 @@ void HardFault_Handler(void) {
}
}
+/* ANCHOR: pt_vectors */
__attribute__((section(".vectors"), used)) static const uintptr_t vectors[16] = {
(uintptr_t)&__stack_top,
(uintptr_t)&Reset_Handler,
@@ -152,6 +161,7 @@ __attribute__((section(".vectors"), used)) static const uintptr_t vectors[16] =
(uintptr_t)&Default_Handler, /* PendSV */
(uintptr_t)&Default_Handler, /* SysTick */
};
+/* ANCHOR_END: pt_vectors */
#ifdef __cplusplus
} /* extern "C" */
diff --git a/platform/mps2_an505/mps2_an505.ld b/platform/mps2_an505/mps2_an505.ld
index d90299f..f2a7c42 100644
--- a/platform/mps2_an505/mps2_an505.ld
+++ b/platform/mps2_an505/mps2_an505.ld
@@ -8,6 +8,7 @@
* SSRAM1 4 MB @ 0x10000000 - vector table + code + rodata
* SSRAM2/3 4 MB @ 0x38000000 - data + bss + heap + stack
*/
+/* ANCHOR: pt_memory */
MEMORY
{
CODE (rx) : ORIGIN = 0x10000000, LENGTH = 4M
@@ -15,6 +16,7 @@ MEMORY
}
__stack_top = ORIGIN(DATA) + LENGTH(DATA);
+/* ANCHOR_END: pt_memory */
ENTRY(Reset_Handler)
@@ -72,6 +74,7 @@ SECTIONS
__bss_end__ = .;
} > DATA
+ /* ANCHOR: pt_heap_stack */
/* Stack lives at the top of DATA; cap the heap 64 KB below it. */
.heap (NOLOAD) : ALIGN(8) {
__heap_start__ = .;
@@ -82,6 +85,7 @@ SECTIONS
/* MSPLIM (set in Reset_Handler): the stack may descend to the heap cap
* but no further — overflow into the heap faults instead of corrupting. */
__stack_limit = __heap_end__;
+ /* ANCHOR_END: pt_heap_stack */
/* librdimon's (unused, weak) _sbrk references `end`; satisfy it. */
PROVIDE(end = __heap_start__);
diff --git a/platform/mps3_an547/mps3_an547.ld b/platform/mps3_an547/mps3_an547.ld
index a0a4d40..c2777f4 100644
--- a/platform/mps3_an547/mps3_an547.ld
+++ b/platform/mps3_an547/mps3_an547.ld
@@ -8,6 +8,7 @@
* DTCM 512 KB @ 0x20000000 - stack
* ISRAM 2 MB @ 0x21000000 - data + bss + heap
*/
+/* ANCHOR: pt_memory */
MEMORY
{
ITCM (rx) : ORIGIN = 0x00000000, LENGTH = 512K
@@ -20,6 +21,7 @@ __stack_top = ORIGIN(DTCM) + LENGTH(DTCM);
/* MSPLIM (set in Reset_Handler): the stack owns all of DTCM, so the lowest
* address it may legally reach is the region base. */
__stack_limit = ORIGIN(DTCM);
+/* ANCHOR_END: pt_memory */
ENTRY(Reset_Handler)
@@ -77,11 +79,13 @@ SECTIONS
__bss_end__ = .;
} > DATA
+ /* ANCHOR: pt_heap */
.heap (NOLOAD) : ALIGN(8) {
__heap_start__ = .;
. = ORIGIN(DATA) + LENGTH(DATA);
__heap_end__ = .;
} > DATA
+ /* ANCHOR_END: pt_heap */
/* librdimon's (unused, weak) _sbrk references `end`; satisfy it. */
PROVIDE(end = __heap_start__);
diff --git a/scripts/book_figures.py b/scripts/book_figures.py
new file mode 100644
index 0000000..6e705a1
--- /dev/null
+++ b/scripts/book_figures.py
@@ -0,0 +1,459 @@
+#!/usr/bin/env python3
+"""Regenerates the book's figures (book/src/img/*.svg).
+
+Every figure is produced from the same sources the text cites:
+
+- the filter figures re-run the exact design math of
+ include/srt/detail/kaiser.hpp (formula-for-formula port below);
+- the servo and feasibility figures are MEASURED: this script compiles
+ scripts/book_figures_trace.cpp against the current include/ tree and runs
+ it in deterministic virtual time. The feasibility "before" panel compiles
+ the same tool against the include/ tree of commit 045de5d — the last
+ commit before the PR #25 feasibility fix — extracted with `git archive`,
+ so both panels of that figure are measurements, not models;
+- the phase-wraparound figure runs the resampler's actual uint64 slip
+ arithmetic (mod 2^64) in Python integers;
+- the architecture figure is drawn, not computed.
+
+Usage: python3 scripts/book_figures.py (from the repo root)
+Needs: numpy, matplotlib, g++, git, tar.
+
+The SVGs are committed. CI does not regenerate them — matplotlib's SVG
+output is not byte-stable across matplotlib versions, so a regenerate-and-
+diff gate would ratchet toolchain noise, not truth — but the book CI job
+does verify that every image the chapters reference exists.
+"""
+
+import os
+import subprocess
+import sys
+import tempfile
+
+import numpy as np
+import matplotlib
+
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+from matplotlib.patches import FancyBboxPatch, FancyArrowPatch
+
+ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+OUT = os.path.join(ROOT, "book", "src", "img")
+PREFIX_COMMIT = "045de5d" # last commit before the feasibility fix (PR #25)
+
+# Palette (validated categorical slots + chrome ink; light surface).
+SURFACE = "#fcfcfb"
+INK = "#0b0b0b"
+SECONDARY = "#52514e"
+MUTED = "#898781"
+GRID = "#e1e0d9"
+BASELINE = "#c3c2b7"
+BLUE = "#2a78d6" # slot 1
+AQUA = "#1baf7a" # slot 2 (sub-3:1 contrast: always direct-labeled)
+YELLOW = "#eda100" # slot 3 (sub-3:1 contrast: always direct-labeled)
+RED = "#e34948" # slot 6, used only for the pre-fix (failing) trace
+
+plt.rcParams.update({
+ "figure.facecolor": SURFACE,
+ "axes.facecolor": SURFACE,
+ "savefig.facecolor": SURFACE,
+ "font.family": "sans-serif",
+ "font.sans-serif": ["DejaVu Sans"],
+ "font.size": 9,
+ "text.color": INK,
+ "axes.edgecolor": BASELINE,
+ "axes.labelcolor": SECONDARY,
+ "axes.titlecolor": INK,
+ "axes.titlesize": 10,
+ "axes.linewidth": 0.75,
+ "axes.grid": True,
+ "grid.color": GRID,
+ "grid.linewidth": 0.75,
+ "grid.linestyle": "-",
+ "xtick.color": MUTED,
+ "ytick.color": MUTED,
+ "xtick.labelcolor": MUTED,
+ "ytick.labelcolor": MUTED,
+ "lines.linewidth": 1.5,
+ "lines.solid_joinstyle": "round",
+ "lines.solid_capstyle": "round",
+ "legend.frameon": False,
+ "svg.hashsalt": "sampleratetap-book",
+})
+
+
+def save(fig, name):
+    """Write `fig` into OUT as `<name>.svg`, plus an optional PNG review copy.
+
+    fig:  object exposing `savefig` (a matplotlib Figure in this script).
+    name: file stem, without extension.
+
+    When the PNG_OUT environment variable is set and non-empty, a 110-dpi
+    `<name>.png` is also written into that directory.
+    """
+    fig.savefig(os.path.join(OUT, name + ".svg"))
+    png_dir = os.environ.get("PNG_OUT")  # optional raster copies for review
+    if png_dir:
+        # main() creates OUT, but nothing creates PNG_OUT; make it here so
+        # pointing PNG_OUT at a fresh directory does not abort the run.
+        os.makedirs(png_dir, exist_ok=True)
+        fig.savefig(os.path.join(png_dir, name + ".png"), dpi=110)
+
+
+def despine(ax):
+    """Hide the top and right spines of `ax`; other spines are not touched."""
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
+
+
+# --- the filter design math, ported formula-for-formula from kaiser.hpp ---
+
+def bessel_i0(x):
+    """Modified Bessel function I0 by its power series, element-wise.
+
+    x: scalar or array-like; converted to a float ndarray.
+
+    Each summand is the previous one times (x / (2k))^2, starting from 1.
+    Summation stops once every element's summand is below 1e-21 times its
+    running sum, or after k = 999, whichever comes first.
+
+    Returns an ndarray with the shape of `x`.
+    """
+    arg = np.asarray(x, dtype=float)
+    half_arg = 0.5 * arg
+    summand = np.ones_like(arg)
+    series = np.ones_like(arg)
+    k = 1
+    while k < 1000:
+        ratio = half_arg / k
+        summand = summand * ratio * ratio
+        series = series + summand
+        if np.all(summand < 1e-21 * series):
+            break
+        k += 1
+    return series
+
+
+def kaiser_beta(atten_db):
+    """Kaiser window beta for a target attenuation in dB.
+
+    Piecewise: linear above 50 dB, a power-plus-linear fit above 21 dB, and
+    0.0 otherwise (including when `atten_db` compares false to both bounds).
+    """
+    if atten_db > 50.0:
+        beta = 0.1102 * (atten_db - 8.7)
+    elif atten_db > 21.0:
+        excess = atten_db - 21.0
+        beta = 0.5842 * excess ** 0.4 + 0.07886 * excess
+    else:
+        beta = 0.0
+    return beta
+
+
+def design_prototype(num_phases, taps_per_phase, cutoff_norm, beta):
+    """Kaiser-windowed sinc prototype of num_phases * taps_per_phase taps.
+
+    num_phases, taps_per_phase: their product is the prototype length.
+    cutoff_norm: scales both the sinc argument and its amplitude.
+    beta:        Kaiser window shape parameter.
+
+    Returns the taps rescaled so that they sum to `num_phases`.
+    """
+    length = num_phases * taps_per_phase
+    mid = 0.5 * (length - 1)
+    offset = np.arange(length, dtype=float) - mid
+    t_axis = offset / num_phases  # tap offset from centre, in units of num_phases taps
+    aperture = offset / mid       # -1 at the first tap, +1 at the last
+    # The clamp keeps rounding from pushing the sqrt argument below zero.
+    inside = np.maximum(0.0, 1.0 - aperture * aperture)
+    window = bessel_i0(beta * np.sqrt(inside)) / bessel_i0(beta)
+    taps = cutoff_norm * np.sinc(cutoff_norm * t_axis) * window  # np.sinc is sin(pi x)/(pi x)
+    return taps * (num_phases / taps.sum())
+
+
+# FilterSpec presets, verbatim from polyphase_filter.hpp.
+PRESETS = [
+ ("fast", 128, 32, 18000.0, 30000.0, 96.0, BLUE),
+ ("balanced", 256, 48, 20000.0, 28000.0, 120.0, AQUA),
+ ("transparent", 512, 80, 20000.0, 26000.0, 140.0, YELLOW),
+]
+FS = 48000.0
+
+
+def preset_response(L, T, pass_hz, stop_hz, atten_db, nfft=1 << 21):
+    """Magnitude response in dB of one preset's prototype, up to 48 kHz.
+
+    L, T:             phase count and taps per phase for design_prototype().
+    pass_hz, stop_hz: band edges in Hz; the design cutoff is their sum
+                      divided by FS.
+    atten_db:         attenuation handed to kaiser_beta().
+    nfft:             zero-padded FFT length.
+
+    Returns (freq_hz, mag_db): the rfft bin frequencies at or below 48 kHz
+    and 20*log10|H| at those bins, with |H| floored at 1e-12 before the log.
+    """
+    beta = kaiser_beta(atten_db)
+    taps = design_prototype(L, T, (pass_hz + stop_hz) / FS, beta)
+    spectrum = np.fft.rfft(taps, nfft) / L
+    # Bin spacing treats L * FS as the sample rate of the tap sequence.
+    freq_hz = np.arange(spectrum.size) * (L * FS) / nfft
+    shown = freq_hz <= 48000.0
+    magnitude = np.maximum(np.abs(spectrum[shown]), 1e-12)
+    return freq_hz[shown], 20.0 * np.log10(magnitude)
+
+
+def fig_kaiser_window():
+ """Plot the Kaiser window w(u) for each preset and save kaiser-window.svg.
+
+ One curve per PRESETS entry over u in [-1, 1], using the beta that
+ kaiser_beta() returns for that preset's attenuation.
+ """
+ fig, ax = plt.subplots(figsize=(6.4, 3.2), layout="constrained")
+ u = np.linspace(-1.0, 1.0, 801)
+ iu = 180 # u = -0.55, where the three curves are well separated
+ for name, _, _, _, _, atten, color in PRESETS:
+ beta = kaiser_beta(atten)
+ # w(u) = I0(beta * sqrt(1 - u^2)) / I0(beta)
+ w = bessel_i0(beta * np.sqrt(1.0 - u * u)) / bessel_i0(beta)
+ ax.plot(u, w, color=color, label=f"{name}: {atten:.0f} dB, β = {beta:.1f}")
+ # Label the curve directly at u[iu], in addition to the legend entry.
+ ax.annotate(name, (u[iu], w[iu]), xytext=(-4, 4),
+ textcoords="offset points", color=SECONDARY, fontsize=8.5,
+ ha="right", bbox=dict(fc=SURFACE, ec="none", pad=1.0))
+ ax.set_xlabel("window argument u (full aperture −1 … 1)")
+ ax.set_ylabel("w(u)")
+ ax.set_title("Kaiser window: attenuation buys taper")
+ ax.legend(loc="upper right", fontsize=8.5)
+ ax.set_xlim(-1.0, 1.0)
+ ax.set_ylim(0.0, 1.05)
+ despine(ax)
+ save(fig, "kaiser-window")
+ plt.close(fig)
+
+
+def fig_kaiser_response():
+ """Plot the presets' prototype magnitude responses; save kaiser-response.svg.
+
+ Top panel (ax): each preset's response from 0 to 48 kHz with its stopband
+ floor labeled. Bottom panel (axz): the same curves zoomed to 0-22 kHz and
+ +/-0.031 dB to show passband flatness.
+ """
+ fig, (ax, axz) = plt.subplots(
+ 2, 1, figsize=(7.0, 5.6), layout="constrained", height_ratios=[2.4, 1.0])
+ for name, L, T, pass_hz, stop_hz, atten, color in PRESETS:
+ f, db = preset_response(L, T, pass_hz, stop_hz, atten)
+ ax.plot(f / 1e3, db, color=color, label=name)
+ axz.plot(f / 1e3, db, color=color)
+ # direct label at each preset's measured stopband floor: the highest
+ # level found at or beyond stop_hz, i.e. the worst case over the stopband
+ floor = db[f >= stop_hz].max()
+ ax.annotate(f"{name}: {floor:.0f} dB past {stop_hz/1e3:.0f} kHz",
+ (47.0, floor), xytext=(0, 7),
+ textcoords="offset points", color=SECONDARY, fontsize=8.5,
+ ha="right", bbox=dict(fc=SURFACE, ec="none", pad=1.0))
+ # vertical reference lines with rotated labels near the bottom of the axes
+ for x, label in ((20.0, "20 kHz passband edge"), (24.0, "input Nyquist")):
+ ax.axvline(x, color=BASELINE, lw=0.75, zorder=0)
+ ax.annotate(label, (x, -182), rotation=90, xytext=(-3, 0),
+ textcoords="offset points", color=MUTED,
+ fontsize=7.5, ha="right", va="bottom",
+ bbox=dict(fc=SURFACE, ec="none", pad=1.0))
+ ax.set_ylim(-185, 8)
+ ax.set_xlim(0, 48)
+ ax.set_ylabel("magnitude (dB)")
+ ax.set_title("Prototype magnitude response, the three presets")
+ ax.legend(loc="upper right", fontsize=8.5)
+ despine(ax)
+ # passband zoom panel
+ axz.set_xlim(0, 22)
+ axz.set_ylim(-0.031, 0.031)
+ axz.set_xlabel("frequency at 48 kHz (kHz)")
+ axz.set_ylabel("passband detail (dB)")
+ axz.annotate("all three presets flat within ±0.01 dB across their passbands",
+ (0.5, 0.021), color=SECONDARY, fontsize=8.5, ha="left")
+ despine(axz)
+ save(fig, "kaiser-response")
+ plt.close(fig)
+
+
+# --- measured traces via the C++ tool ---
+
+def build_trace_tool(include_dir, exe):
+    """Compile scripts/book_figures_trace.cpp against `include_dir` into `exe`.
+
+    include_dir: directory passed to g++ as -I.
+    exe:         output path of the executable.
+
+    Raises subprocess.CalledProcessError when g++ exits non-zero.
+    """
+    source = os.path.join(ROOT, "scripts", "book_figures_trace.cpp")
+    command = ["g++", "-O2", "-std=c++20", f"-I{include_dir}", source, "-o", exe]
+    subprocess.run(command, check=True)
+
+
+def run_trace(exe, *args):
+    """Run the trace tool and parse its CSV output into named columns.
+
+    exe:  path of the executable to run.
+    args: command-line arguments; each is converted with str().
+
+    The first output line is treated as a header and dropped; every other
+    line is split on commas and parsed as floats.
+
+    Returns a dict of 1-D float arrays keyed "t", "fill", "state", "ppm"
+    and "underruns" (columns 0 through 4, in that order).
+
+    Raises subprocess.CalledProcessError when the tool exits non-zero.
+    """
+    argv = [exe, *map(str, args)]
+    proc = subprocess.run(argv, check=True, capture_output=True, text=True)
+    data_lines = proc.stdout.strip().splitlines()[1:]
+    table = np.array([line.split(",") for line in data_lines], dtype=float)
+    columns = ("t", "fill", "state", "ppm", "underruns")
+    return {key: table[:, col] for col, key in enumerate(columns)}
+
+
+def fig_servo_lock(head_exe):
+    """Render servo-lock.svg from a measured trace and return that trace.
+
+    Runs the trace tool at `head_exe` for 45 s with pull block 32, push
+    block 1, the producer at +200 ppm and a 50 ms push dropout starting at
+    t = 28 s. Plots FIFO occupancy (top) and estimated ppm (bottom).
+
+    Returns the dict produced by run_trace().
+
+    Raises RuntimeError if the trace never reaches state 2 (Locked), never
+    records an underrun, or never returns to state 2 after the first
+    underrun. The annotations are presented as measurements, so a trace
+    lacking these events must fail loudly: np.argmax on an all-False mask
+    returns 0 and would silently label the figure with row 0's timestamp.
+    """
+    # 1-frame pushes: the long tests' methodology — block-quantized pushes
+    # would hide the 200 ppm surplus in one 32-frame lump every ~3.3 s.
+    tr = run_trace(head_exe, 32, 1, 200, 45, 28.0, 0.05)
+
+    # Locate the annotated events before any figure exists, so a degenerate
+    # trace raises without leaving a half-built figure open.
+    state = tr["state"]
+    locked = state == 2
+    if not locked.any():
+        raise RuntimeError("servo-lock trace never reached state 2 (Locked)")
+    t_lock1 = tr["t"][np.argmax(locked)]
+    # First row whose underrun count is non-zero (the count is assumed to be
+    # non-decreasing, as searchsorted requires).
+    i_stall = int(np.searchsorted(tr["underruns"], 0.5))
+    if i_stall >= tr["t"].size:
+        raise RuntimeError("servo-lock trace recorded no underrun for the stall")
+    t_stall = tr["t"][i_stall]
+    after_stall = (tr["t"] > t_stall) & locked
+    if not after_stall.any():
+        raise RuntimeError("servo-lock trace never re-locked after the stall")
+    t_lock2 = tr["t"][np.argmax(after_stall)]
+
+    fig, (axf, axp) = plt.subplots(
+        2, 1, figsize=(7.0, 4.8), sharex=True, layout="constrained")
+
+    # Top panel: FIFO occupancy with the two measured lock times called out.
+    axf.plot(tr["t"], tr["fill"], color=BLUE, lw=1.2)
+    axf.axhline(48, color=BASELINE, lw=0.75, zorder=0)
+    axf.annotate("setpoint 48", (44.8, 48), xytext=(0, 5),
+                 textcoords="offset points", color=MUTED, fontsize=8, ha="right")
+    axf.set_ylim(46.6, 50.6)
+    axf.annotate(f"cold start: Locked in {t_lock1:.2f} s",
+                 (t_lock1, 50.0), xytext=(2.5, 50.0), textcoords="data",
+                 color=SECONDARY, fontsize=8,
+                 arrowprops=dict(arrowstyle="-", color=SECONDARY, lw=0.75))
+    axf.annotate("50 ms producer stall → refill → "
+                 f"re-Locked {t_lock2 - t_stall:.2f} s later",
+                 (t_stall, 50.0), xytext=(30.5, 50.0), textcoords="data",
+                 color=SECONDARY, fontsize=8,
+                 arrowprops=dict(arrowstyle="-", color=SECONDARY, lw=0.75))
+    axf.set_ylabel("FIFO occupancy (frames)")
+    axf.set_title("Acquire, lock, dropout, re-lock (measured, producer +200 ppm)")
+
+    # Bottom panel: the servo's ppm estimate against the clamp and true offset.
+    axp.plot(tr["t"], tr["ppm"], color=BLUE, lw=1.2)
+    for y in (1500, -1500):
+        axp.axhline(y, color=BASELINE, lw=0.75, zorder=0)
+    axp.axhline(200, color=BASELINE, lw=0.75, zorder=0)
+    axp.annotate("true offset 200 ppm", (44.8, 200), xytext=(0, 5),
+                 textcoords="offset points", color=MUTED, fontsize=8, ha="right")
+    axp.annotate("servo clamp ±1500 ppm", (44.8, 1500), xytext=(0, -10),
+                 textcoords="offset points", color=MUTED, fontsize=8,
+                 ha="right", va="top")
+    axp.annotate("Acquiring (10 Hz) rings against the clamp\n"
+                 "on the ±1-frame quantized occupancy",
+                 (2.7, 900), color=SECONDARY, fontsize=8, ha="left")
+    axp.annotate("Locked (1 Hz): settles on the true offset",
+                 (14, 480), color=SECONDARY, fontsize=8, ha="left")
+    axp.set_ylim(-1750, 1950)
+    axp.set_ylabel("estimated ppm")
+    axp.set_xlabel("time (s)")
+
+    # hairlines at the recorded stage transitions
+    for i in np.flatnonzero(np.diff(state) != 0):
+        for ax in (axf, axp):
+            ax.axvline(tr["t"][i + 1], color=GRID, lw=0.75, zorder=0)
+    for ax in (axf, axp):
+        despine(ax)
+    save(fig, "servo-lock")
+    plt.close(fig)
+    return tr
+
+
+def fig_feasibility(head_exe, prefix_exe):
+ """Render feasibility.svg: FIFO occupancy before vs after the feasibility fix.
+
+ head_exe:   trace tool built against the current include/ tree.
+ prefix_exe: trace tool built against PREFIX_COMMIT's include/ tree.
+
+ Both tools run with pull block 64, push block 32, +200 ppm, for 6 s.
+ Returns (before, after), the two run_trace() dicts.
+ """
+ before = run_trace(prefix_exe, 64, 32, 200, 6)
+ after = run_trace(head_exe, 64, 32, 200, 6)
+ fig, (axb, axa) = plt.subplots(
+ 2, 1, figsize=(7.0, 4.8), sharex=True, sharey=True, layout="constrained")
+
+ axb.plot(before["t"], before["fill"], color=RED, lw=1.2)
+ # rows where the underrun counter increased; hits + 1 is the first row
+ # that shows the new count, and gets a marker
+ hits = np.flatnonzero(np.diff(before["underruns"]) > 0)
+ axb.plot(before["t"][hits + 1], before["fill"][hits + 1], "o",
+ ms=4.5, color=RED, mec=SURFACE, mew=1.0, ls="none")
+ # the total comes from the last row of the trace
+ axb.annotate(f"{int(before['underruns'][-1])} underruns in 6 s — "
+ "one every ~0.25 s, forever",
+ (1.6, 116), color=SECONDARY, fontsize=8.5, ha="left")
+ axb.axhline(48, color=BASELINE, lw=0.75, zorder=0)
+ axb.set_title(f"Before (commit {PREFIX_COMMIT}, measured): "
+ "pull(64) against setpoint 48")
+ axb.set_ylabel("FIFO occupancy (frames)")
+
+ axa.plot(after["t"], after["fill"], color=BLUE, lw=1.2)
+ # reference lines at the configured (48) and effective (96) setpoints
+ axa.axhline(48, color=BASELINE, lw=0.75, zorder=0)
+ axa.axhline(96, color=BASELINE, lw=0.75, zorder=0)
+ axa.annotate("configured setpoint 48", (5.95, 48), xytext=(0, 5),
+ textcoords="offset points", color=MUTED, fontsize=8, ha="right",
+ bbox=dict(fc=SURFACE, ec="none", pad=1.0))
+ axa.annotate("effective setpoint 96 = 64 + 64/2, raised on first pull",
+ (5.95, 96), xytext=(0, 5), textcoords="offset points",
+ color=MUTED, fontsize=8, ha="right",
+ bbox=dict(fc=SURFACE, ec="none", pad=1.0))
+ axa.set_title(f"After (HEAD, measured): {int(after['underruns'][-1])} underruns, "
+ "servo regulates the raised setpoint")
+ axa.set_ylabel("FIFO occupancy (frames)")
+ axa.set_xlabel("time (s)")
+ # sharey=True: this y-range applies to both panels
+ axa.set_ylim(38, 132)
+ for ax in (axb, axa):
+ despine(ax)
+ save(fig, "feasibility")
+ plt.close(fig)
+ return before, after
+
+
+# --- the Q0.64 wraparound, run with the real modular arithmetic ---
+
+def fig_q064():
+ """Render q064-slip.svg: the phase accumulator wrapping for eps > 0 and eps < 0.
+
+ Each panel steps a phase value 26 times with Python integers reduced
+ mod 2^64 and plots phase / 2^64 per output frame, marking every wrap.
+ """
+ M = 1 << 64
+ eps_mag = 0.09 # exaggerated so the wrap is visible; real |eps| ~ 2e-4
+ fig, axes = plt.subplots(
+ 1, 2, figsize=(7.0, 3.0), sharey=True, layout="constrained")
+ for ax, sign, title, note in (
+ (axes[0], +1, "ε > 0: wrap past 1.0 → advance 2",
+ "consume one extra input frame"),
+ (axes[1], -1, "ε < 0: wrap below 0.0 → advance 0",
+ "re-use the current window"),
+ ):
+ eps_fix = int(sign * eps_mag * M) % M # two's-complement, like the C++
+ # start at 0 for positive eps and at half scale for negative eps;
+ # mu collects the fraction after each step
+ phase, mu = 0 if sign > 0 else int(0.5 * M), []
+ wraps = []
+ for n in range(26):
+ m = (phase + eps_fix) % M
+ # positive eps: the sum wrapped iff the result is below the old phase
+ if sign > 0 and m < phase:
+ wraps.append(n)
+ # negative eps is stored as M - |eps|, so the modular add normally
+ # lowers the phase; a result above the old phase means it borrowed
+ if sign < 0 and m > phase:
+ wraps.append(n)
+ phase = m
+ mu.append(phase / M)
+ n = np.arange(26)
+ mu = np.array(mu)
+ ax.plot(n, mu, color=BLUE, lw=1.2, marker="o", ms=4.5,
+ mec=SURFACE, mew=1.0)
+ # larger markers on the frames where a wrap happened
+ for w in wraps:
+ ax.plot([w], [mu[w]], "o", ms=6, color=BLUE, mec=SURFACE, mew=1.0)
+ w0 = wraps[0] # annotate the first wrap only; the rest just repeat
+ ax.annotate(note, (w0, mu[w0]), xytext=(8, 16 * sign),
+ textcoords="offset points", color=SECONDARY, fontsize=8,
+ bbox=dict(fc=SURFACE, ec="none", pad=1.0),
+ arrowprops=dict(arrowstyle="-", color=SECONDARY, lw=0.75))
+ ax.set_title(title, fontsize=9)
+ ax.set_xlabel("output frame n")
+ ax.set_ylim(-0.06, 1.06)
+ despine(ax)
+ axes[0].set_ylabel("phase μ = phase_ / 2⁶⁴")
+ fig.suptitle("The Q0.64 accumulator slips by wrapping (ε exaggerated to 0.09; real |ε| ≈ 2×10⁻⁴)",
+ fontsize=9.5, color=INK)
+ save(fig, "q064-slip")
+ plt.close(fig)
+
+
+# --- the architecture diagram (drawn) ---
+
+def fig_architecture():
+ """Draw the block diagram and save architecture.svg.
+
+ Nothing is computed: boxes, arrows and labels are placed by hand on a
+ 100 x 44 data-unit canvas with the axes hidden.
+ """
+ fig, ax = plt.subplots(figsize=(8.2, 3.6), layout="constrained")
+ ax.set_xlim(0, 100)
+ ax.set_ylim(0, 44)
+ ax.axis("off")
+
+ # clock-domain washes
+ for x0, x1, color, label in ((0, 33, BLUE, "input clock domain (producer)"),
+ (52, 100, AQUA, "output clock domain (consumer)")):
+ ax.add_patch(FancyBboxPatch((x0 + 0.5, 1), x1 - x0 - 1, 42,
+ boxstyle="round,pad=0,rounding_size=1.5",
+ fc=color, ec="none", alpha=0.08))
+ ax.text((x0 + x1) / 2, 2.6, label, ha="center", color=SECONDARY,
+ fontsize=8)
+
+ # box(): rounded rectangle at (x, y) of size w x h with a centered title;
+ # when `sub` is given, the title moves up and `sub` is drawn beneath it
+ def box(x, y, w, h, title, sub=None, weight="bold"):
+ ax.add_patch(FancyBboxPatch((x, y), w, h,
+ boxstyle="round,pad=0,rounding_size=1.2",
+ fc=SURFACE, ec=BASELINE, lw=1.0))
+ cy = y + h / 2 + (1.6 if sub else 0)
+ ax.text(x + w / 2, cy, title, ha="center", va="center",
+ color=INK, fontsize=8.6, fontweight=weight)
+ if sub:
+ ax.text(x + w / 2, cy - 3.8, sub, ha="center", va="center",
+ color=SECONDARY, fontsize=7.6)
+
+ # arrow(): connector from p to q; an optional label sits at the midpoint,
+ # raised by dy. style="-" draws a plain segment with no arrowhead
+ def arrow(p, q, label=None, dy=1.4, style="-|>"):
+ ax.add_patch(FancyArrowPatch(p, q, arrowstyle=style, color=SECONDARY,
+ lw=1.1, mutation_scale=9,
+ shrinkA=1, shrinkB=1))
+ if label:
+ ax.text((p[0] + q[0]) / 2, (p[1] + q[1]) / 2 + dy, label,
+ ha="center", color=SECONDARY, fontsize=7.6)
+
+ box(3, 24, 16, 10, "producer", "audio callback / core 0")
+ box(36, 24, 15, 10, "SpscRing", "interleaved frames")
+ box(60, 7, 16, 10, "PiServo", "occupancy → ε̂")
+ box(57, 24, 22, 10, "FractionalResampler", "polyphase bank + Q0.64 phase")
+ box(84, 24, 13, 10, "consumer", "core 1 / thread")
+
+ # data path, left to right along y = 29
+ arrow((19, 29), (36, 29), "push()")
+ arrow((51, 29), (57, 29), "pop")
+ arrow((79, 29), (84, 29), "pull()")
+ # occupancy: ring bottom, down and across to the servo
+ arrow((43.5, 24), (43.5, 12), None, style="-")
+ arrow((43.5, 12), (60, 12), None)
+ ax.text(51.5, 13.4, "occupancy", ha="center", color=SECONDARY, fontsize=7.6)
+ # rate estimate: servo top, up into the resampler
+ arrow((70, 17), (70, 24), None)
+ ax.text(71.5, 20.2, "ε̂ (rate estimate)", ha="left", color=SECONDARY,
+ fontsize=7.6)
+ ax.text(50, 41.5, "one passive object, two callers — the converter owns no threads",
+ ha="center", color=SECONDARY, fontsize=8.4)
+ save(fig, "architecture")
+ plt.close(fig)
+
+
+def main():
+ """Regenerate all six SVGs into OUT and print a short summary.
+
+ The four figures that need no compiled tool are drawn first. The two
+ measured figures then use the trace tool built twice inside a temporary
+ directory: once against ROOT/include and once against the include/ tree
+ of PREFIX_COMMIT.
+ """
+ os.makedirs(OUT, exist_ok=True)
+ fig_kaiser_window()
+ fig_kaiser_response()
+ fig_q064()
+ fig_architecture()
+
+ with tempfile.TemporaryDirectory() as tmp:
+ head_exe = os.path.join(tmp, "trace_head")
+ build_trace_tool(os.path.join(ROOT, "include"), head_exe)
+ prefix_tree = os.path.join(tmp, "prefix")
+ os.makedirs(prefix_tree)
+ # `git archive` writes a tar stream of include/ at PREFIX_COMMIT to
+ # stdout; it is captured as bytes and piped into `tar -x` below
+ archive = subprocess.run(["git", "-C", ROOT, "archive", PREFIX_COMMIT, "include"],
+ check=True, capture_output=True).stdout
+ subprocess.run(["tar", "-x", "-C", prefix_tree], input=archive, check=True)
+ prefix_exe = os.path.join(tmp, "trace_prefix")
+ build_trace_tool(os.path.join(prefix_tree, "include"), prefix_exe)
+
+ # both measured figures run the executables that live under tmp
+ tr = fig_servo_lock(head_exe)
+ before, after = fig_feasibility(head_exe, prefix_exe)
+
+ # summary: time of the first state-2 row, last ppm estimate, and the final
+ # underrun counts of each trace
+ print(f"servo: locked at t={tr['t'][np.argmax(tr['state'] == 2)]:.1f}s, "
+ f"final ppm {tr['ppm'][-1]:.1f}, underruns {int(tr['underruns'][-1])}")
+ print(f"feasibility: before {int(before['underruns'][-1])} underruns/6s, "
+ f"after {int(after['underruns'][-1])}")
+ print(f"wrote 6 SVGs to {OUT}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/book_figures_trace.cpp b/scripts/book_figures_trace.cpp
new file mode 100644
index 0000000..22e58f9
--- /dev/null
+++ b/scripts/book_figures_trace.cpp
@@ -0,0 +1,63 @@
+// Trace dumper for the book's measured figures (scripts/book_figures.py).
+//
+// Runs the converter in deterministic virtual time — the same event-driven
+// two-clock scheme as tests/support/two_clock_sim.hpp — and prints one CSV
+// row per pull: t,fill,state,ppm,underruns. book_figures.py compiles this
+// file twice, once against the current include/ tree and once against the
+// tree of the last pre-feasibility-fix commit, so the before/after figure
+// in the composition chapter is measured on both sides of the fix, not
+// modeled. Only Status fields that exist in both versions are printed.
+//
+// Usage: trace pullBlock pushBlock ppm seconds [dropStart dropDur]
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <numbers>
+#include <vector>
+
+#include <srt/srt.hpp>
+
+// Entry point. Steps a producer clock and a consumer clock through virtual
+// time, pushing a 997 Hz test tone and pulling output blocks, and writes one
+// CSV row per pull to stdout.
+//
+// argv: pullBlock pushBlock ppm seconds [dropStart dropDur]
+//   pullBlock, pushBlock  frames per pull()/push() call; must be positive
+//   ppm                   producer-clock offset from the output rate, in ppm
+//   seconds               run length on the consumer's virtual clock
+//   dropStart, dropDur    optional window of producer time in which pushes
+//                         are skipped (the producer clock still advances)
+//
+// Returns 0 on success, 2 on a usage error.
+int main(int argc, char** argv) {
+    if (argc < 5) {
+        std::fprintf(stderr, "usage: %s pullBlock pushBlock ppm seconds [dropStart dropDur]\n",
+                     argv[0]);
+        return 2;
+    }
+    // Check the block sizes while they are still signed. atol yields 0 for
+    // unparsable text, and a zero block never advances its clock, so the loop
+    // below would spin forever; a negative value would wrap to a huge size_t
+    // and be used as a vector size.
+    const long pullArg = std::atol(argv[1]);
+    const long pushArg = std::atol(argv[2]);
+    if (pullArg <= 0 || pushArg <= 0) {
+        std::fprintf(stderr, "%s: pullBlock and pushBlock must be positive integers\n", argv[0]);
+        return 2;
+    }
+    const std::size_t pullBlock = static_cast<std::size_t>(pullArg);
+    const std::size_t pushBlock = static_cast<std::size_t>(pushArg);
+    const double ppm = std::atof(argv[3]);
+    const double seconds = std::atof(argv[4]);
+    // Defaults give an empty drop window: no tPush satisfies -1 <= tPush < -1.
+    const double dropStart = argc > 5 ? std::atof(argv[5]) : -1.0;
+    const double dropDur = argc > 6 ? std::atof(argv[6]) : 0.0;
+
+    srt::Config cfg;
+    cfg.channels = 1;  // mono, so one frame is one sample in the buffers below
+    srt::AsyncSampleRateConverter conv(cfg);
+
+    const double fsOut = cfg.sampleRateHz;
+    const double fsIn = fsOut * (1.0 + ppm * 1e-6);  // producer's crystal
+    std::vector<float> in(pushBlock), out(pullBlock);
+
+    double tPush = 0.0, tPull = 0.0, phase = 0.0;
+    // Phase step per input sample for a 997 Hz tone in the producer's time base.
+    const double dPhase = 2.0 * std::numbers::pi * 997.0 / fsIn;
+    std::puts("t,fill,state,ppm,underruns");
+    while (tPull < seconds) {
+        // Whichever clock is behind acts next; on a tie the producer goes first.
+        if (tPush <= tPull) {
+            if (!(tPush >= dropStart && tPush < dropStart + dropDur)) {
+                for (auto& v : in) {
+                    v = 0.5f * static_cast<float>(std::sin(phase));
+                    phase += dPhase;
+                }
+                conv.push(in.data(), pushBlock);
+            }
+            // The producer clock advances even when the push was dropped.
+            tPush += static_cast<double>(pushBlock) / fsIn;
+            continue;
+        }
+        conv.pull(out.data(), pullBlock);
+        tPull += static_cast<double>(pullBlock) / fsOut;
+        // One row per pull, stamped with the consumer time after the pull.
+        const srt::Status s = conv.status();
+        std::printf("%.6f,%.2f,%d,%.2f,%llu\n", tPull, s.fifoFillFrames, static_cast<int>(s.state),
+                    s.ppm, static_cast<unsigned long long>(s.underruns));
+    }
+    return 0;
+}
diff --git a/tests/support/two_clock_sim.hpp b/tests/support/two_clock_sim.hpp
index adbc5df..6dd2b1a 100644
--- a/tests/support/two_clock_sim.hpp
+++ b/tests/support/two_clock_sim.hpp
@@ -12,6 +12,7 @@
namespace srt_test {
+// ANCHOR: pf_knobs
template
struct TwoClockSimT {
srt::BasicAsyncSampleRateConverter& asrc;
@@ -27,7 +28,9 @@ struct TwoClockSimT {
/// Optional input-rate modulation: fsIn scale factor at virtual time t
/// (e.g. for drift-ramp tests). Defaults to constant 1.
std::function fsInScale = [](double) { return 1.0; };
+ // ANCHOR_END: pf_knobs
+ // ANCHOR: pf_run
/// Runs for `seconds` of output-clock virtual time. onOut receives every
/// pulled block: (interleavedSamples, frames, virtualTime).
template
@@ -59,6 +62,7 @@ struct TwoClockSimT {
}
}
}
+ // ANCHOR_END: pf_run
};
using TwoClockSim = TwoClockSimT;
diff --git a/tools/capi/srt_capi.cpp b/tools/capi/srt_capi.cpp
index 0858bb9..04b2207 100644
--- a/tools/capi/srt_capi.cpp
+++ b/tools/capi/srt_capi.cpp
@@ -1,3 +1,4 @@
+// ANCHOR: abi_doc
/// \file srt_capi.cpp
/// \brief C ABI shim over the float converter, for FFI consumers (ctypes,
/// cffi, Julia, ...). Build with SRT_BUILD_CAPI=ON; srt_capi.h is the
@@ -9,12 +10,14 @@
/// zero return values, and every entry point tolerates a null handle — the
/// documented error convention ("check srt_create for NULL") otherwise
/// invites a crash on exactly the path where the caller forgot to check.
+// ANCHOR_END: abi_doc
#include
#include
#include
#include "srt/srt.hpp"
+// ANCHOR: abi_impl
extern "C" {
struct SrtHandle; // opaque
}
@@ -27,6 +30,7 @@ const srt::AsyncSampleRateConverter* impl(const SrtHandle* h) noexcept {
return reinterpret_cast(h);
}
} // namespace
+// ANCHOR_END: abi_impl
extern "C" {
@@ -34,6 +38,7 @@ unsigned srt_version(void) noexcept {
return SRT_VERSION_MAJOR * 10000u + SRT_VERSION_MINOR * 100u + SRT_VERSION_PATCH;
}
+// ANCHOR: abi_create
/// preset: 0 = fast, 1 = balanced, 2 = transparent.
SrtHandle* srt_create(double sampleRateHz, std::size_t channels, std::size_t targetLatencyFrames,
int preset) noexcept {
@@ -51,11 +56,13 @@ SrtHandle* srt_create(double sampleRateHz, std::size_t channels, std::size_t tar
return nullptr;
}
}
+// ANCHOR_END: abi_create
void srt_destroy(SrtHandle* h) noexcept {
delete impl(h);
}
+// ANCHOR: abi_null
std::size_t srt_push(SrtHandle* h, const float* interleaved, std::size_t frames) noexcept {
return h ? impl(h)->push(interleaved, frames) : 0;
}
@@ -63,6 +70,7 @@ std::size_t srt_push(SrtHandle* h, const float* interleaved, std::size_t frames)
std::size_t srt_pull(SrtHandle* h, float* interleaved, std::size_t frames) noexcept {
return h ? impl(h)->pull(interleaved, frames) : 0;
}
+// ANCHOR_END: abi_null
/// out[0]=state (0 Filling, 1 Acquiring, 2 Locked), out[1]=ppm,
/// out[2]=fifoFillFrames, out[3]=underruns, out[4]=overruns, out[5]=resyncs.
diff --git a/tools/capi/srt_capi.h b/tools/capi/srt_capi.h
index b8b3195..0f6ad4f 100644
--- a/tools/capi/srt_capi.h
+++ b/tools/capi/srt_capi.h
@@ -1,3 +1,4 @@
+/* ANCHOR: abi_contract */
/* SampleRateTap C ABI — FFI surface over the float converter.
*
* Build the shared library with -DSRT_BUILD_CAPI=ON. This header is the
@@ -17,6 +18,7 @@
* size_t in these signatures follows the platform ABI (32-bit on 32-bit
* targets) — declare foreign types accordingly.
*/
+/* ANCHOR_END: abi_contract */
#ifndef SRT_CAPI_H
#define SRT_CAPI_H
@@ -26,6 +28,7 @@
extern "C" {
#endif
+/* ANCHOR: abi_surface */
typedef struct SrtHandle SrtHandle;
/* ABI/version probe: returns SRT_VERSION_MAJOR*10000 +
@@ -55,6 +58,7 @@ double srt_designed_latency_seconds(const SrtHandle* h);
/* Consumer thread: discard all buffered input, forget the ppm estimate,
* return to Filling. */
void srt_reset_from_consumer(SrtHandle* h);
+/* ANCHOR_END: abi_surface */
#ifdef __cplusplus
}
diff --git a/tools/qemu_insn_plugin/insn_count.c b/tools/qemu_insn_plugin/insn_count.c
index 9f8fb2d..0f67c74 100644
--- a/tools/qemu_insn_plugin/insn_count.c
+++ b/tools/qemu_insn_plugin/insn_count.c
@@ -21,6 +21,7 @@ QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;
static uint64_t insn_count;
+/* ANCHOR: pf_hooks */
static void tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb* tb) {
(void)id;
size_t n = qemu_plugin_tb_n_insns(tb);
@@ -37,6 +38,7 @@ static void at_exit(qemu_plugin_id_t id, void* userdata) {
g_autofree gchar* msg = g_strdup_printf("SRT_INSN_COUNT %" PRIu64 "\n", insn_count);
qemu_plugin_outs(msg);
}
+/* ANCHOR_END: pf_hooks */
QEMU_PLUGIN_EXPORT int qemu_plugin_install(qemu_plugin_id_t id, const qemu_info_t* info, int argc,
char** argv) {