diff --git a/.github/workflows/book-pages.yml b/.github/workflows/book-pages.yml new file mode 100644 index 0000000..f53db92 --- /dev/null +++ b/.github/workflows/book-pages.yml @@ -0,0 +1,74 @@ +# Publishes the book to GitHub Pages on every push to main. +# +# The site is PUBLIC (https://tap.github.io/SampleRateTap/) even though the +# repository is private; it contains the book's code excerpts by design. +# One-time setup if the first run complains: Settings -> Pages -> Source: +# "GitHub Actions" (configure-pages below attempts to enable it itself). +name: book-pages + +on: + push: + branches: [main] + paths: + - "book/**" + - "include/**" + - "platform/**" + - "tests/support/**" + - "tools/**" + - "cmake/**" + - ".github/workflows/book-pages.yml" + workflow_dispatch: + +permissions: + contents: read + pages: write + id-token: write + +concurrency: + group: pages + cancel-in-progress: false + +env: + MDBOOK_URL: https://github.com/rust-lang/mdBook/releases/download/v0.4.40/mdbook-v0.4.40-x86_64-unknown-linux-gnu.tar.gz + MDBOOK_SHA256: "9ef07fd288ba58ff3b99d1c94e6d414d431c9a61fdb20348e5beb74b823d546b" + +jobs: + deploy: + runs-on: ubuntu-latest + timeout-minutes: 10 + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 + + - name: Install mdBook (pinned) + run: | + curl -sfLo /tmp/mdbook.tar.gz "$MDBOOK_URL" + actual=$(sha256sum /tmp/mdbook.tar.gz | cut -d' ' -f1) + if [ "$actual" != "$MDBOOK_SHA256" ]; then + echo "::error::mdbook checksum mismatch"; exit 1 + fi + tar -xzf /tmp/mdbook.tar.gz -C /tmp + + - name: Build (warnings are errors) + run: | + /tmp/mdbook build book 2>&1 | tee /tmp/book-build.log + if grep -qiE 'warning|error' /tmp/book-build.log; then + echo "::error::mdbook reported warnings/errors" + exit 1 + fi + + - name: Configure Pages + uses: actions/configure-pages@983d7736d9b0ae728b81ab479565c72886d7745b # v5 + with: + enablement: true 
+ + - name: Upload site + uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa # v3 + with: + path: book/book + + - name: Deploy + id: deployment + uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ff16a06..8ed298b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -486,3 +486,56 @@ jobs: bench/*.cpp bench/icount/*.cpp bench/compare/*.cpp \ tools/capi/*.cpp tools/qemu_insn_plugin/*.c \ tests/*.cpp tests/support/*.hpp examples/*.cpp platform/*.c + + # The book (book/) quotes library code via mdBook anchor includes; this + # gate makes a refactor that orphans an excerpt fail CI, the same + # freshness contract as the README's generated tables. Warnings are + # errors: a missing anchor is a warning, and a missing anchor is rot. + book: + name: Book build + runs-on: ubuntu-latest + timeout-minutes: 10 + env: + MDBOOK_URL: https://github.com/rust-lang/mdBook/releases/download/v0.4.40/mdbook-v0.4.40-x86_64-unknown-linux-gnu.tar.gz + MDBOOK_SHA256: "9ef07fd288ba58ff3b99d1c94e6d414d431c9a61fdb20348e5beb74b823d546b" + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 + + - name: Install mdBook (pinned) + run: | + curl -sfLo /tmp/mdbook.tar.gz "$MDBOOK_URL" + actual=$(sha256sum /tmp/mdbook.tar.gz | cut -d' ' -f1) + if [ "$actual" != "$MDBOOK_SHA256" ]; then + echo "::error::mdbook checksum mismatch"; exit 1 + fi + tar -xzf /tmp/mdbook.tar.gz -C /tmp + + - name: Build (warnings are errors) + run: | + /tmp/mdbook build book 2>&1 | tee /tmp/book-build.log + if grep -qiE 'warning|error' /tmp/book-build.log; then + echo "::error::mdbook reported warnings/errors (stale anchor or broken include?)" + exit 1 + fi + + # mdBook does not fail on a missing image, so check every relative + # image reference resolves. 
(The SVGs are committed, generated by + # scripts/book_figures.py; regeneration is not gated because + # matplotlib's SVG output is not byte-stable across versions.) + - name: Check image references resolve + run: | + python3 - <<'EOF' + import pathlib, re, sys + src = pathlib.Path("book/src") + missing = [] + for md in src.rglob("*.md"): + for target in re.findall(r"!\[[^\]]*\]\(([^)#?]+)", md.read_text()): + if target.startswith(("http://", "https://")): + continue + if not (md.parent / target).resolve().exists(): + missing.append(f"{md}: {target}") + if missing: + print("::error::broken image reference(s):") + print("\n".join(missing)) + sys.exit(1) + EOF diff --git a/.gitignore b/.gitignore index 94fcdc7..b8e7965 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ CMakeUserPresets.json .vscode/ .idea/ .claude/ +book/book/ diff --git a/README.md b/README.md index 4cbba32..a15065c 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,31 @@ there are no install/package rules yet. Version 0.1.0 (`SRT_VERSION_*` in `srt/srt.hpp`, `srt_version()` over the C ABI); pre-1.0, the API may still change between versions. +## The book + +The repository includes a full-length tutorial book (`book/`) that walks +every header file line by line — the DSP, the C++ idioms chosen and +rejected, how thread safety works, how the servo was tuned, and the +optimization campaign with its dead ends preserved. It is written for a +reader learning C++, DSP, and real-time concurrency with this converter as +the running example. Three mechanical commitments keep it honest: every +code excerpt is included live from the actual headers at build time (CI +fails on a stale anchor), every figure is regenerated from the same math +and measured traces by `scripts/book_figures.py`, and every chapter ends +with runnable commands that reproduce its claims. 
+ +Read it at **<https://tap.github.io/SampleRateTap/>** (published from +`main` by the `book-pages` workflow; note the site is public even though +this repository is private), or build it locally with +[mdBook](https://rust-lang.github.io/mdBook/) (CI pins v0.4.40): + +```sh +mdbook build book # or: mdbook serve book --open +``` + +Start at `book/src/SUMMARY.md` for the table of contents, or read the +sources directly — they are plain Markdown. + ## How it works The design follows the classic commercial-ASRC architecture (AD1896-style diff --git a/book/book.toml b/book/book.toml new file mode 100644 index 0000000..2765989 --- /dev/null +++ b/book/book.toml @@ -0,0 +1,14 @@ +[book] +title = "SampleRateTap: The Story of an Asynchronous Sample Rate Converter" +description = "A working tour of a real-time asynchronous sample rate converter: the DSP, the C++, the concurrency, and the measurements that hold it together." +authors = ["The SampleRateTap project"] +language = "en" +src = "src" + +[build] +create-missing = false + +[output.html] +default-theme = "rust" +git-repository-url = "https://github.com/tap/SampleRateTap" +site-url = "/SampleRateTap/" diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md new file mode 100644 index 0000000..cad7d1a --- /dev/null +++ b/book/src/SUMMARY.md @@ -0,0 +1,47 @@ +# Summary + +[Introduction](introduction.md) + +# Part 0 — The problem + +- [Two crystals, one stream](part0/two-crystals.md) +- [Budgets: latency, quality, compute](part0/budgets.md) + +# Part I — The machine, file by file + +- [Designing the filter: kaiser.hpp](part1/kaiser.md) +- [The polyphase bank](part1/polyphase-bank.md) +- [Sample types as a customization point: sample_traits.hpp](part1/sample-traits.md) +- [The lock-free ring: spsc_ring.hpp](part1/spsc-ring.md) +- [The clock servo: pi_servo.hpp](part1/pi-servo.md) +- [The fractional resampler](part1/fractional-resampler.md) +- [Composition: asrc.hpp](part1/asrc.md) + +# Part II — The proof system + +- [Tests as specifications](part2/tests.md) 
+- [Counting instructions, deterministically](part2/icount.md) +- [Notebooks as calibrated instruments](part2/notebooks.md) + +# Part III — Optimizing honestly + +- [Profile first, claim later (C1–C2)](part3/c1-c2.md) +- [The integer phase and the wide MACs (C3–C5)](part3/c3-c5.md) +- [The channel axis (C6)](part3/c6.md) + +# Part IV — Portability + +- [Hexagon: a DSP that keeps secrets](part4/hexagon.md) +- [Cortex-M: bare metal, two ways](part4/cortex-m.md) +- [The C ABI](part4/c-abi.md) + +# Part V — Deployment + +- [Real clocks: bridges and firmware](part5/hardware.md) +- [Channels, rates, and the rules that scale](part5/scaling.md) + +--- + +[Appendix A: The C++ decision log](appendix/cpp-decisions.md) +[Appendix B: Glossary](appendix/glossary.md) +[Appendix C: Annotated bibliography](appendix/bibliography.md) diff --git a/book/src/appendix/bibliography.md b/book/src/appendix/bibliography.md new file mode 100644 index 0000000..34a1700 --- /dev/null +++ b/book/src/appendix/bibliography.md @@ -0,0 +1,107 @@ +# Appendix C: Annotated bibliography + +> If I have seen further it is by standing on the shoulders of giants. +> +> — Isaac Newton, letter to Robert Hooke + +This project's provenance statement is short: all code implements +long-published methods, and no third-party source was copied. This +appendix lists those methods' sources — plus the tools and competitors the +measurements depend on — with a note on what the project *actually took* +from each. It deliberately cites nothing the codebase does not genuinely +draw on. + +## Signal processing + +**J. F. Kaiser, "Nonrecursive digital filter design using the I₀-sinh +window function," *Proc. IEEE Int. Symp. Circuits and Systems*, 1974.** +The origin of the Kaiser window and of the two empirical fits the library +evaluates verbatim in `include/srt/detail/kaiser.hpp`: stopband +attenuation → window shape parameter β, and the attenuation/transition- +width → filter-length estimate. 
The project took the closed forms exactly +as published — the value of the Kaiser window here is precisely that its +design procedure is a page of code with known error bounds, needing no +iterative optimization at construction time. + +**f. harris, *Multirate Signal Processing for Communication Systems*, +Prentice Hall, 2004.** The standard reference for polyphase decomposition +— factoring one long prototype filter into L short branches indexed by +fractional delay — which is the structure of the library's coefficient +table. The tap-length estimate in `estimateTaps()` is the Kaiser/harris +formula in the form `N = (A − 8) / (2.285 · Δω)`, applied per polyphase +branch; the codebase credits both names, as the literature does. + +**J. O. Smith, "Digital Audio Resampling Home Page" (and the *Bandlimited +Interpolation* material), CCRMA, Stanford University.** The theory the +datapath implements: resampling as evaluation of a windowed-sinc +interpolation kernel at fractional positions, with a finite table of +kernel phases and *linear interpolation between adjacent table entries*. +Smith's analysis of that last step is where the library's most-quoted +scaling law comes from — interpolation residue falling ~12 dB per doubling +of the table size L and rising with signal frequency — which Part 0 turns +into the budget arithmetic connecting L to decibels. + +**Analog Devices, AD1896 datasheet ("192 kHz Stereo Asynchronous Sample +Rate Converter").** The architectural ancestor. The README describes the +library as "the classic commercial-ASRC architecture (AD1896-style +polyphase FIR + clock servo), specialized for the near-unity regime," and +the datasheet documents that architecture: a polyphase interpolation +filter addressed by a recovered rate ratio, with a FIFO between the clock +domains. It also supplies the hardware row in the comparison table — +quoted as datasheet values, with the caveats about measurement environment +stated in `docs/COMPARISON.md`. 
+ +**AES17, *AES standard method for digital audio engineering — Measurement +of digital audio equipment* (Audio Engineering Society).** The measurement +definition behind the headline quality numbers: remove the fundamental, +integrate the residual across the audio band for THD+N, measure dynamic +range at −60 dBFS with A-weighting. The comparison notebook implements an +AES17-style procedure (exact fit plus ±20 Hz notch, 20 Hz–20 kHz +integration) and calibrates it against synthetic signals before use — the +standard is what makes the −132 dB figure commensurable with silicon +datasheets rather than a house metric. + +## The measured competitors + +**libsamplerate (Secret Rabbit Code), E. de Castro Lopo — +documentation at libsndfile.github.io/libsamplerate.** The closest +architectural analog (streaming time-domain polyphase resampler) and one +of the two software subjects measured under identical conditions in +`docs/COMPARISON.md` and the comparison notebook. Its documentation also +supplied the honesty check the comparison repeats: the published "97 dB +worst case" figure applies to aggressive ratios, so near-unity results at +the format ceiling are its *easy* regime, not a contradiction. + +**soxr (the SoX Resampler library) — github.com/chirlu/soxr.** The second +measured competitor, and the source of its own latency figure via +`soxr_delay()`. What the project took from soxr is mostly a boundary +lesson made quantitative: soxr wins raw host throughput decisively and +carries ~12–16 ms of latency doing it, which is the measured statement of +why a 1–2 ms live-monitoring budget needs a different design. + +## C++ + +**Anthony Williams, *C++ Concurrency in Action*, 2nd ed., Manning, 2019.** +The working reference for the C++ memory model as this book teaches it: +acquire/release pairing as the establishment of happens-before, the +legitimacy of relaxed loads of data a thread itself owns, and lock-free +queue design generally. 
The ring chapter's proof style — argue the two +release/acquire pairs, then treat everything else as sequential code — +is the book's method applied to a hundred-line class. + +**cppreference.com — in particular `std::memory_order`, +`std::atomic::is_always_lock_free`, `std::bit_ceil`, and +`std::hardware_destructive_interference_size`.** The day-to-day authority +for the exact semantics the headers rely on: the ordering guarantees the +ring asserts, the compile-time lock-freedom predicate the audit added, +the power-of-two rounding used by the ring and the polyphase table, and — +for the interference-size constant — the documented ABI fragility that +justified *rejecting* the standard facility in favor of a literal `64`. + +## Tooling + +**mdBook — rust-lang.github.io/mdBook.** The tool this book is built +with. Its `\{{#include path:anchor}}` mechanism is what makes the book's +central honesty commitment mechanical rather than aspirational: code +excerpts are pulled from the real headers at build time, so prose that +drifts from the code breaks the build in CI instead of quietly lying. diff --git a/book/src/appendix/cpp-decisions.md b/book/src/appendix/cpp-decisions.md new file mode 100644 index 0000000..c1744db --- /dev/null +++ b/book/src/appendix/cpp-decisions.md @@ -0,0 +1,759 @@ +# Appendix A: The C++ decision log + +> There are only two kinds of languages: the ones people complain about and the ones nobody uses. +> +> — Bjarne Stroustrup + +Every chapter of this book has defended C++ decisions in passing, in the +context that made them necessary. This appendix collects them in one place, +in one format: the decision, what was rejected, why, and where in the +repository the evidence lives — because in this codebase the decisions are +*recorded*, mostly as comments at the point of consequence, and a decision +whose reason you cannot locate is a decision you cannot safely revisit. + +A theme will emerge quickly, so it is worth stating up front. 
Almost every +entry below is the same decision wearing different clothes: **between a +clever general mechanism and a plain constraint you can state and verify, +this library picks the constraint.** A literal `64` over a standard +interference constant; a `static_assert` over trust; a compile-time gate +over a runtime flag; a comment that shows its arithmetic over a comment +that waves at it. Where the two genuinely conflict, the tiebreaker is +always the same pair of masters: the real-time audio contract and the +embedded targets that cannot fake their way around a bad choice. + +## 1. Header-only distribution + +The entire library is seven headers under `include/srt/`. The build system +declares exactly one library target, and it has no compiled artifact: + +```cmake +add_library(SampleRateTap INTERFACE) +add_library(SampleRateTap::SampleRateTap ALIAS SampleRateTap) +target_compile_features(SampleRateTap INTERFACE cxx_std_20) +``` + +Consumption is `add_subdirectory` or `FetchContent`, deliberately and +exclusively — the README's *Consuming the library* section says so in as +many words: "there are no install/package rules yet." The tests, examples, +benchmarks and the C ABI shim are all opt-in options that default off when +the project is not top-level, and the warning flags live on a separate +`srt_warnings` target so that the library's own `-Wall -Wextra -Wpedantic +-Wconversion` discipline is never propagated into a consumer's build +(`CMakeLists.txt` carries the comment: "not propagated to consumers"). + +What was rejected is the conventional pair: a compiled static/shared +library, and a packaged install with exported config files. The costs of +header-only are real and were accepted knowingly. Every translation unit +that includes `srt/srt.hpp` re-parses and re-instantiates the templates — +compile time is paid repeatedly. 
There is no ABI boundary, so there is +nothing to version at link time and no way to ship a fixed `.so` to a +customer who cannot rebuild (the C ABI shim in section 15 exists precisely +for the one consumer class that needs a binary boundary). + +What it buys is decisive for this library's actual deployment surface. +The code ships to bare-metal Cortex-M33/M55 firmware, a musl-libc Hexagon +toolchain, and ordinary hosts — four toolchains in CI alone, each with its +own flags, each producing incompatible binaries. A prebuilt library per +target multiplies the release matrix; a header vanishes into whatever +build the consumer already has, including builds with LTO, `-march=native` +or MVE auto-vectorization, where cross-TU inlining of the hot kernels is +exactly what the performance chapters measured. And a template library is +header-shaped by nature: the sample-type axis of section 2 means the +"library" is not a fixed set of functions but a recipe the consumer's +compiler executes. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| `INTERFACE` target, `add_subdirectory`/`FetchContent` only | compiled library; install/export packaging | four incompatible toolchains in CI; templates need instantiation in the consumer's TU; costs (compile time, no ABI) accepted, C ABI shim covers the binary-boundary case | `CMakeLists.txt`; README "Consuming the library"; `tools/capi/` | + +## 2. Templates and a concept for the sample-type axis + +The datapath comes in three sample types — `float`, Q15 `int16_t`, Q31 +`int32_t` — and the axis is expressed as a template parameter constrained +by a concept: + +```cpp +template <SampleType T> +class BasicAsyncSampleRateConverter { ... 
}; + +using AsyncSampleRateConverter = BasicAsyncSampleRateConverter<float>; +using AsyncSampleRateConverterQ15 = BasicAsyncSampleRateConverter<std::int16_t>; +using AsyncSampleRateConverterQ31 = BasicAsyncSampleRateConverter<std::int32_t>; +``` + +The first rejected alternative is virtual dispatch: an abstract +`ISampleOps` with `mac()`, `blend()`, `finalize()` virtuals. That dies on +arithmetic grounds before it reaches performance grounds — the three +datapaths do not share signatures. The float path accumulates in `double`, +the fixed-point paths in `int64_t`; the blend factor is a `float`, a Q15 +`int32_t`, or a Q20 `int32_t` depending on the type. Virtual functions +cannot vary their associated types per implementation; you would be forced +to launder everything through the widest type, which is precisely the +soft-double catastrophe the fixed-point paths exist to avoid (the M33 +baselines put the float path at roughly 19× the M55's instruction count +for exactly that reason — README, platform section). And even if the types +had lined up, an indirect call per multiply-accumulate inside a 48–80-tap +loop would forfeit the inlining and auto-vectorization that Part III +measured: the M55's Q15 kernel is fast *because* GCC can see through +`SampleTraits::mac` and emit Helium. + +The second rejected alternative is CRTP — compile-time polymorphism via +inheritance. It solves the dispatch cost but contorts the shape: the +sample type here is `int16_t` itself, a builtin, not a class that can +inherit from a base. CRTP would demand wrapper types around the samples, +and wrapped samples are no longer the raw interleaved buffers that device +drivers and the `memcpy`-based ring (section 6 of the ring chapter) +require. The concept does the one job the template needs guarding for: + +```cpp +template <typename T> +concept SampleType = requires(...) { + { SampleTraits<T>::mac(a, x, c) } -> std::same_as<typename SampleTraits<T>::Accum>; + // ... 
six more operations, each with its exact type checked +}; +``` + +A wrong instantiation fails at the constraint with the list of missing +operations, not three template layers deep in the dot-product loop. The +header then `static_assert`s the concept against all three shipped types — +the same trust-nothing reflex as the ring's lock-free asserts. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| templates constrained by the `SampleType` concept | virtual `ISampleOps`; CRTP wrappers | per-type associated types (`Accum`, `BlendFactor`) are impossible to express virtually; builtins can't inherit; hot loops must inline and vectorize | `include/srt/sample_traits.hpp` (concept + `static_assert`s); `include/srt/asrc.hpp` aliases; README platform notes (19× soft-double) | + +## 3. A traits struct as the customization point + +Given templates, the customization could still have taken several shapes. +The library chose a traits struct with an intentionally undefined primary +template: + +```cpp +/// Primary template intentionally undefined; specialize per sample type. +template <typename T> +struct SampleTraits; +``` + +Each specialization bundles three associated types (`Coeff`, `Accum`, +`BlendFactor`) with seven static functions (`makeCoeff`, `mac`, `blend`, +`finalize`, ...). Why this over the alternatives? + +**Free functions found by ADL** — the customary `swap`-style mechanism — +were worse for two reasons. First, the customization is mostly *types*, +not functions: the fact that Q15 stores coefficients as Q1.14 `int16_t` +but accumulates in `int64_t` is the design (the header's comments derive +it: Q0.15 × Q1.14 products summed exactly, one rounding in `finalize()`). +Free functions cannot carry associated types; you would need separate type +traits anyway, and the customization point would smear across two +mechanisms. 
Second, ADL on builtin types like `int16_t` has no associated +namespace to hook — the overloads would all pile into `srt` and be +distinguishable only by overload resolution, silently, which is exactly +how a Q15/Q31 mixup would compile and produce garbage. + +**Member policies** — making the sample type a class that knows its own +arithmetic — fail as in section 2: the sample types must remain raw +builtins so buffers stay `memcpy`-compatible and ABI-identical to what +audio drivers produce. A traits struct is the standard C++ answer for +attaching behavior to types you cannot modify, and the undefined primary +template makes "I forgot to specialize" a clean compile error at the point +of use rather than a link error or a default that half-works. + +The struct also keeps each datapath's documentation in one screenful: the +Q15 specialization's header comment is a complete fixed-point error budget +(coefficient quantization at ~−86 dB, single rounding point, "the +converter is Q15-transparent"), sitting directly above the ten lines that +implement it. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| `SampleTraits` struct, undefined primary template | ADL free functions; member policies on sample classes | customization is chiefly associated types; builtins have no ADL namespace and can't have members; missing specialization = clean compile error | `include/srt/sample_traits.hpp` | + +## 4. The real-time contract: exceptions at setup, `noexcept` forever after + +This is the load-bearing wall of the whole API, stated as a contract in +the converter's class comment: + +```cpp +/// Real-time contract: the constructor performs all allocation and filter +/// design and may throw; push(), pull(), status() and resetFromConsumer() +/// are noexcept, lock-free and allocation-free. +``` + +The README's feature bullets repeat it, because it is the feature. 
The +constructor allocates every buffer the object will ever touch — ring, +polyphase table, histories, scratch — designs the filter in double +precision, validates the configuration, and throws `std::invalid_argument` +or `std::bad_alloc` on anything wrong. From that point on, the audio path +never allocates, never locks, never throws; every hot function is spelled +`noexcept`, and the `validated()` function exists to make the constructor +*more* throw-happy, rejecting configurations that "would otherwise +construct successfully and misbehave silently" — NaN sample rates that +design all-NaN tables, band edges that pass images wholesale, deviation +clamps that overflow the Q0.64 conversion (its comment lists each one). + +The rejected alternatives are the two ways other libraries split this. +Error codes at setup ("check the return of `init()`") were rejected +because a partially-constructed converter is not a state this object can +represent — there is no meaningful "converter without a filter table," and +C++ constructors-that-throw are precisely the tool that makes invalid +objects unrepresentable. Exceptions on the audio path were never +considered — an unwind inside a device callback is a glitch at best — but +the *strength* of the setup/hot-path split was reinforced from an +unexpected direction. When the first `EXPECT_THROW` test reached the +Hexagon CI leg, it discovered that the hexagon-linux-musl toolchain +cannot catch exceptions at all: a constructor throw terminates via +libc++abi instead of propagating. `docs/PERFORMANCE.md` records it under +Known debt, with the deployment note ("treat invalid Config as fatal — +validate inputs before constructing") and the candidate fix +(`-unwindlib=libunwind`). The discovery cost one excluded test on one leg +— because exceptions had been confined to a code region where "terminate +instead of propagate" is survivable. Had the audio path thrown, the same +toolchain quirk would have been a field failure. 
+ +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| all allocation + throwing in the constructor; `noexcept`/lock-free/allocation-free hot path | `init()` + error codes; exceptions anywhere near audio | invalid objects unrepresentable; RT contract is the product; Hexagon's no-unwind toolchain proved the value of confining throws to setup | `include/srt/asrc.hpp` (class comment, `validated()`); README bullets; `docs/PERFORMANCE.md` Known debt; commit "Hexagon: exclude ConfigValidation" | + +## 5. Runtime filter design, not `constexpr` tables + +A modern-C++ reflex says the Kaiser-windowed prototype — pure math on +compile-time-known presets — should be a `constexpr` table. The library +computes it at runtime, in the constructor, and `kaiser.hpp` opens with +the reason, arithmetic included: + +```cpp +/// Design note — runtime vs constexpr: the prototype tables run 12K-33K taps +/// and each tap needs sin/sqrt plus a ~50-term Bessel I0 series. Constexpr +/// evaluation is interpreted (roughly 1e3-1e4x slower than native), would +/// need hand-rolled constexpr transcendentals before C++26, and would cost +/// tens of seconds to minutes of compile time in every including translation +/// unit. Runtime design takes well under 10 ms, runs once in a constructor, +/// and is off the audio path, so all design math here is plain runtime +/// double precision. +``` + +Unpack the trade. The `balanced()` preset's prototype is 256 × 48 = +12,288 taps, and the presets range upward from there — the comment's +"12K-33K taps". Each tap evaluates `sin`, +`sqrt`, and a Bessel-I0 power series that runs to ~50 terms. `constexpr` +evaluation is an interpreter inside the compiler — three to four orders +of magnitude slower than native — and, before C++26, `std::sin` and +friends are not `constexpr`, so the transcendentals would have to be +hand-rolled *and then trusted* to match runtime libm behavior. 
In a +header-only library the bill lands in every consumer TU, repeatedly. The +runtime version costs under 10 ms, once, in the constructor — which +section 4 already designated as the place where expensive things happen. +And a runtime design accepts *runtime* configurations: `FilterSpec` is +not limited to the three presets, so a compile-time table would have been +a special case bolted alongside the general path, not a replacement. + +This is the header-only cost model (section 1) feeding back into design: +having accepted per-TU compilation, the library polices what each TU +costs. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| filter designed at runtime in the constructor | `constexpr` coefficient tables | 12K–33K taps × transcendentals ≈ minutes of interpreted compile time per TU vs <10 ms once at runtime; needs pre-C++26 hand-rolled constexpr math; runtime `FilterSpec` must work anyway | `include/srt/detail/kaiser.hpp` header comment | + +## 6. `<bit>` over hand-rolled bit tricks; masks over modulo + +Everywhere the library needs power-of-two arithmetic it reaches for +C++20's `<bit>`: `std::bit_ceil` rounds the ring capacity up +(`SpscRing`'s constructor), rounds the phase count up +(`PolyphaseFilterBank`), and sizes the FIFO (`ringCapacityElems` in +`asrc.hpp`); `std::countr_zero` recovers log₂(L) in the phase-indexed +kernels so the polyphase branch is the top bits of the Q0.64 phase word: + +```cpp +const int lg = std::countr_zero(bank.numPhases()); // L is a power of two +const std::size_t p = static_cast<std::size_t>(phase >> (64 - lg)); +``` + +The rejected alternative is the folklore versions — the +shift-or-shift `bit_ceil`, the de Bruijn log₂ — which every C programmer +has written and half have gotten wrong at the boundaries (what does your +hand-rolled `bit_ceil` do at 0? at values above 2⁶³?). 
The standard +functions have specified edge behavior, compile to single instructions +where they exist, and *name the intent* — `countr_zero(numPhases())` +under the comment "L is a power of two" is an invariant stated twice. + +The deeper decision is what the powers of two are *for*: indexing by mask +instead of modulo. The ring's monotonic indices are wrapped by `head & +mask_` — its class comment: "Indices are monotonic and wrapped by a +power-of-two mask, so the full capacity is usable" — and the ring chapter +proves the wraparound benign. The polyphase table's L being a power of +two is what lets the Q0.64 phase word split into branch index and blend +fraction by pure shifts, with no division and no double arithmetic on the +per-sample path (the phase-accumulator comment in +`polyphase_filter.hpp`). A general-modulo design would put an integer +divide — tens of cycles on the M-class cores, and a serialization point +everywhere — inside the tightest loops the library owns, to support +capacities nobody asked for. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| `std::bit_ceil` / `std::countr_zero`; power-of-two capacities indexed by mask | hand-rolled bit tricks; arbitrary sizes with `%` | specified edge cases, single instructions, intent named; masks keep divides and doubles off the per-sample path | `include/srt/spsc_ring.hpp` ctor + class comment; `include/srt/polyphase_filter.hpp` (`blendRowPhase`, `interpolatePhase`); `include/srt/asrc.hpp` (`ringCapacityElems`) | + +## 7. Memory orderings chosen to be exactly sufficient + +The ring chapter walked this in full; the appendix records it as policy, +because it generalizes beyond the ring. 
Every atomic operation in the +library carries an explicit ordering argument, and each ordering is the +*weakest* that keeps the algorithm correct: `release` on the store that +publishes data, `acquire` on the load that consumes a foreign index, +`relaxed` on a thread's loads of its own index — and `relaxed` on all +telemetry, whose fields are documented as "individually coherent, not +mutually" (`status()` in `asrc.hpp`). + +The rejected alternative is `seq_cst`-by-default — writing +`head_.store(x)` and letting the strongest ordering paper over the +analysis. It would be correct. It was rejected first because it is +measurably stronger than needed on the weakly-ordered targets (full +barriers on ARM in the hottest loop the library owns), and second — the +argument this codebase actually leads with — because **orderings are +documentation**. An explicit `memory_order_relaxed` on `tail_.load()` in +the producer tells the reader "this is my own index; no synchronization +happens here" — a claim the ring chapter spells out and ThreadSanitizer +checks against reality in CI. A default `seq_cst` says only "I didn't +think about this," and in the one file whose entire job is to be thought +about, that is the wrong message. The same honesty cuts the other way: +where synchronization *is* needed, the annotation names which one, so a +future editor who weakens it is contradicting a written claim, not +merely changing a default. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| explicit, minimal orderings on every atomic | `seq_cst` defaults | weaker barriers on ARM where it matters; each annotation documents exactly why it exists; TSan-checked in CI | `include/srt/spsc_ring.hpp`; `include/srt/asrc.hpp` telemetry; the ring chapter's "What was rejected" | + +## 8. 
`alignas(64)`, not `std::hardware_destructive_interference_size` + +The ring separates producer-owned, consumer-owned and shared-read-only +state onto distinct cache lines, and it does so with a named literal: + +```cpp +// 64-byte separation to keep producer- and consumer-owned state on +// distinct cache lines (std::hardware_destructive_interference_size is +// deliberately avoided: it is ABI-fragile and warns on GCC). ... +static constexpr std::size_t kCacheLine = 64; +``` + +The standard offers a constant whose whole purpose is this alignment, and +the file's comment rejects it by name. The problem is that +`hardware_destructive_interference_size` is not a constant of the +architecture; it is a constant of the *compiler invocation* — its value +can change with `-mtune`, which means two translation units in the same +program can disagree about the layout of the same type. That is an ODR +violation waiting for a victim, and GCC ships a warning +(`-Winterference-size`) telling you exactly this whenever the constant is +used in a context that might cross an ABI boundary. A header-only library +(section 1) lives *entirely* in that danger zone: every consumer TU +re-instantiates `SpscRing`, potentially under different flags. + +A plain `64` is correct on every target this project ships to, cannot +vary between TUs, and states its assumption in a comment a porting +engineer will read. The general lesson — the ring chapter phrases it as +"between a standard facility and a constraint you can state plainly, +prefer the one whose failure mode you can reason about" — is this +appendix's opening theme in miniature. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| `alignas(kCacheLine)` with `kCacheLine = 64` | `std::hardware_destructive_interference_size` | the standard constant varies with tuning flags → ODR/ABI fragility in a header; GCC warns; 64 is right everywhere shipped | `include/srt/spsc_ring.hpp` member layout comment | + +## 9. 
32-bit telemetry atomics + +The converter's telemetry — state, ppm, fill, underrun/overrun/resync +counters, effective setpoint — is deliberately 32 bits wide, and the +comment above the members carries the whole argument: + +```cpp +// Telemetry is 32-bit on purpose: 64-bit atomics fall back to lock-based +// libatomic on 32-bit targets (e.g. Hexagon), which would break the +// lock-free contract of the hot path. float carries ~7 significant +// digits — ample for ppm/fill observability; counters wrap at 2^32. +``` + +The rejected alternative — `std::atomic<std::uint64_t>` counters and +`double` gauges, the "obviously roomier" choice — is a trap on exactly +the targets this library most cares about. On a 32-bit ISA without a +64-bit atomic instruction, `std::atomic<std::uint64_t>` still compiles and +still works: libatomic implements it *with a lock*. The hot path would +remain formally correct and silently stop being lock-free — the one +property section 4 declared as contract, broken invisibly by a telemetry +counter. The 32-bit choice keeps every telemetry access a plain +lock-free operation on Hexagon and the M-class cores, and the class +`static_assert`s it rather than assuming: + +```cpp +static_assert(std::atomic::is_always_lock_free && + std::atomic::is_always_lock_free && + std::atomic::is_always_lock_free, + "telemetry atomics must be lock-free for the RT contract"); +``` + +The cost is range, and it is documented rather than hidden: `Status`'s +comment tells callers the counters "wrap at 2^32 — far beyond any +plausible event count, but treat them as modular if you difference them +over very long horizons." (The `Status` struct itself still presents +`uint64_t` fields — the narrowing is an internal representation choice, +widened at the snapshot.) A `float` gauge carries about seven significant +digits, which comfortably resolves tenths of a ppm and hundredths of a +frame of fill — observability, not metrology.
+ +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| `atomic<uint32_t>`/`atomic<float>` telemetry, wrap documented | 64-bit atomic counters/doubles | 64-bit atomics lock via libatomic on 32-bit targets, silently voiding the lock-free contract; 32-bit range/precision suffices and is asserted | `include/srt/asrc.hpp` telemetry members + `static_assert`; `Status` doc comment | + +## 10. Designated initializers as API + +The filter presets are written the way a datasheet reads: + +```cpp +static FilterSpec transparent() noexcept { + return {.numPhases = 512, + .tapsPerPhase = 80, + .passbandHz = 20000.0, + .stopbandHz = 26000.0, + .stopbandAttenDb = 140.0}; +} +``` + +`FilterSpec`, `Config` and `ServoConfig` are aggregates with member +initializers supplying defaults, and C++20 designated initializers do the +rest. The rejected alternatives are the two classic config-struct styles. +A positional constructor — +`FilterSpec(512, 80, 20000.0, 26000.0, 140.0)` — puts two adjacent +`double` band edges next to each other where a swap compiles silently and +mis-designs the filter (which, per `validated()`'s comment, is the kind +of error that "passes images wholesale"). A builder/setter chain adds a +mutable construction protocol and a second way for every field to be set, +to solve a problem the language now solves natively: fields are named at +the call site, unmentioned fields keep their documented defaults, and — +because designated initializers must follow declaration order — the +compiler rejects reorderings instead of reinterpreting them. + +The style is also the library's own consumption idiom: the README quick +start and every test build configs by naming only what deviates from +default. Readable initialization is not cosmetic in a config API; the +config *is* the API surface where users make their quality-versus-cost +decisions, and the presets double as documentation of three known-good +points in that space.
+ +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| aggregate configs + designated initializers | positional constructors; builder chains | named fields make adjacent-double swaps impossible; defaults stay declarative; declaration-order enforcement | `include/srt/polyphase_filter.hpp` (`FilterSpec` presets); `include/srt/asrc.hpp` (`Config`); `include/srt/pi_servo.hpp` (`ServoConfig`) | + +## 11. `SRT_RESTRICT`: a portable `__restrict__`, adopted on measurement + +C++ has no standard `restrict`. The library defines a two-line macro over +the compiler extensions and applies it to the kernel pointer parameters — +and the comment above the macro is careful to claim only what was +verified: + +```cpp +// No-alias qualifier for the kernel hot loops: without it the compiler +// versions the blend loop behind a runtime aliasing check (verified with +// -fopt-info-vec; see docs/PERFORMANCE.md, hypothesis 2). +``` + +This entry is here as much for its *method* as its content. The +vectorization audit (PERFORMANCE.md, PR C2) did not assume aliasing was a +problem; it asked the compiler. `-fopt-info-vec` showed `blendRow` +vectorizing — but behind a runtime aliasing check, the loop compiled +twice with a pointer-overlap branch choosing between versions. +`SRT_RESTRICT` on the row/history pointers removes the check, and the +measured effect is recorded with the honesty this project's performance +docs enforce: **M55 `pipeline_float` −1.35% instructions, every other +embedded scenario exactly 0.00%, x86 same-state A/B −3.7% wall-clock.** +Small, real, and cheap — the qualifier documents a true invariant (the +scratch row never aliases the history), so it costs nothing to maintain. + +The rejected alternatives: doing nothing (leaving the versioned loop and +its branch in the hot path), and restructuring the code so the compiler +could prove non-aliasing itself (possible, but contorting call signatures +to communicate what one keyword states directly). 
MSVC spells the +extension `__restrict`, everyone else `__restrict__`; hence the macro +rather than a raw keyword. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| `SRT_RESTRICT` macro on kernel pointers | nothing (alias-versioned loops); structural non-aliasing proofs | verified with `-fopt-info-vec`, measured: M55 float −1.35% insns, x86 −3.7% wall-clock; states a true invariant | `include/srt/polyphase_filter.hpp` macro + comment; `docs/PERFORMANCE.md` C2 | + +## 12. Compile-time feature gates — and the measured cost of a runtime one + +Target-specific code paths are selected by preprocessor and `constexpr` +machinery, never by runtime flags. `SRT_Q15_SMLALD` turns on the dual-MAC +Q15 dot product exactly where it wins: + +```cpp +#if defined(__ARM_FEATURE_DSP) && !defined(__ARM_FEATURE_MVE) +``` + +— DSP-extension cores *without* Helium (the M33/Pico class), because on +the M55 the compiler already auto-vectorizes the scalar loop with MVE and +the intrinsic would replace vectors with dual-MACs (the gate's comment; +PERFORMANCE.md C4 verified 0.00% change on every M55 scenario). +`SRT_CHANNEL_PARALLEL` enables the frame-major channel axis on hosts only, +and inside the class it becomes a `constexpr` member flag that +`if constexpr` and plain constant folding erase from non-participating +builds: + +```cpp +static constexpr bool kChannelParallel = + SRT_CHANNEL_PARALLEL != 0 && std::is_floating_point_v; +``` + +The reason this is dogma rather than taste is that the alternative was +tried, by accident, and measured. During C6 the mode gate was briefly an +ordinary runtime `bool` consulted in the hot loops — and the M55 +instruction ratchet, which had nothing to do with the change (C6 is +host-only), moved **+6–8%** from hot-loop branch bloat. 
PERFORMANCE.md +records the lesson verbatim: "the mode gate must be compile-time — a +runtime bool in the hot loops cost +6–8% on the M55 ratchet before the +constexpr gate restored every embedded scenario to 0.00%." The compaction +path in `appendOne` carries the same note at the exact line that was +guilty. A ±3% two-sided CI gate is what turned this from a silent tax +into a failed build; the constexpr gate is what turned the fix from "fast +again" into "provably byte-identical again." + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| preprocessor + `constexpr` flags + `if constexpr` gates | runtime mode flags | a runtime bool in the hot loop measured +6–8% on the M55 ratchet; compile-time gates keep non-participating targets' codegen byte-identical (0.00%) | `include/srt/polyphase_filter.hpp` (`SRT_Q15_SMLALD`, `SRT_CHANNEL_PARALLEL`, `kChannelParallel`, `appendOne` comment); `docs/PERFORMANCE.md` C4/C6 | + +## 13. `std::function` in the simulator, templated callables in the library + +The test harness's two-clock simulator configures its signal generators +as `std::function` fields: + +```cpp +std::function<S(std::uint64_t)> gen = [](std::uint64_t) { return S{}; }; +std::function<double(double)> fsInScale = [](double) { return 1.0; }; +``` + +The library's hot path, facing the identical "caller supplies a callable" +problem, does something else entirely. `FractionalResampler::process` +takes its frame source as a template parameter — +`template <typename PopFn> std::size_t process(..., PopFn&& popFrames) +noexcept` — and the converter passes a `noexcept` lambda that wraps the +ring read. Same need, opposite tools, and the split is deliberate. + +`std::function` is the right tool in the simulator: tests assign +different generators per test case at runtime, the cost of a type-erased +call per sample is irrelevant next to the double-precision sine it +invokes, and construction-time allocation in a test fixture harms +nothing. It would be the wrong tool in `process()` three ways at once.
+Its call is an indirect jump through erased type information that the +optimizer cannot inline — and `popFn` is invoked inside the per-frame +loop, where the entire benefit of the current design is that the ring's +`read()` inlines into the resampler's refill path. Assigning one may +allocate, which is forbidden anywhere reachable from `pull()` +(section 4). And its call operator is not `noexcept` — an empty +`std::function` throws `bad_function_call` — which poisons the `noexcept` +audio path either with a formal lie or a terminate-on-bug. The template +parameter has none of these problems and costs only what templates +always cost: the code is instantiated per callable type, which for +exactly one production callable is nothing. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| templated `PopFn&&` in the library; `std::function` only in test config | `std::function` on the hot path; templates in test fixtures | hot path needs inlining, no allocation, honest `noexcept`; tests need runtime reassignment and don't care about a type-erased call | `include/srt/polyphase_filter.hpp` (`process`, `prime`); `include/srt/asrc.hpp` (`popFn` lambda); `tests/support/two_clock_sim.hpp` | + +## 14. `std::vector` everywhere, custom allocators nowhere + +Every owned buffer in the library is a plain `std::vector`: the ring's +storage, the coefficient table, the resampler's histories, scratch and +blended row. No allocator parameters, no PMR, no small-buffer tricks. In +a real-time audio library this looks, at first glance, like negligence — +until you notice *when* those vectors are touched. Every `resize`, +`assign` and construction happens in a constructor or in `prime()`-time +setup; the hot path only ever reads `data()` and indexes. The RT problem +with allocation is not that heap memory is slow; it is that allocation +is unbounded and lock-taking *at the moment you cannot afford it*. 
+Section 4's contract solves that by construction-time-only allocation — +after which a custom allocator has nothing left to fix. It would add a +template parameter that infects every class signature, a policy decision +for every consumer, and a second code path to test, in exchange for +optimizing events that occur once per converter lifetime, off the audio +thread, in a place explicitly allowed to throw `bad_alloc`. + +The rejected-in-spirit alternatives — fixed `std::array` capacities, or +caller-supplied arenas — also fail the configurability test: table and +buffer sizes derive from runtime `FilterSpec` and `Config` values +(section 5), so compile-time capacities would cap the very parameters +the config API exposes. Embedded consumers who must avoid the heap +entirely have the honest option the design leaves open: construct the +converter during initialization, when the heap (or a bump allocator +behind `operator new`) is still a fine place to get memory from. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| `std::vector` storage, default allocator | allocator/PMR parameters; fixed arrays; arenas | allocation is construction-only by contract, so allocators optimize a non-problem at the cost of infecting every signature; sizes are runtime config | `include/srt/spsc_ring.hpp`, `polyphase_filter.hpp`, `asrc.hpp` (members); RT contract in section 4 | + +## 15. The C ABI: opaque handles, `reinterpret_cast`, and `impl()` outside `extern "C"` + +The FFI surface (`tools/capi/`) wraps the float converter behind an +opaque `SrtHandle*`. The pattern is textbook, but two details record +decisions. First, the handle is a declared-but-never-defined struct, and +the conversion is a `reinterpret_cast` in a pair of helpers: + +```cpp +extern "C" { struct SrtHandle; } // opaque + +namespace { +srt::AsyncSampleRateConverter* impl(SrtHandle* h) noexcept { ... } +const srt::AsyncSampleRateConverter* impl(const SrtHandle* h) noexcept { ... 
} +} +``` + +The helpers live in an anonymous namespace *outside* the `extern "C"` +block for a reason C++ makes easy to forget: those two `impl` functions +are overloads (const and non-const), and **overloading is illegal under C +linkage** — C linkage names carry no type information to distinguish +them. Keeping the C++ conveniences in C++ linkage and only the exported +symbols in `extern "C"` is the discipline that lets the shim be written +as C++ without leaking C++ into the ABI. + +The rejected alternatives for the handle: exposing the class definition +(no ABI stability — the whole point of the shim is a boundary the C++ +headers don't have, per section 1), or a lookup table of integer handles +(indirection and lifetime bookkeeping to solve a problem the opaque +pointer already solves). Around the handle, the shim converts the C++ +error model to C conventions at the boundary: `srt_create` catches +everything and returns null; every entry point tolerates a null handle, +because — the file's own comment — the documented "check srt_create for +NULL" convention "otherwise invites a crash on exactly the path where the +caller forgot to check." An unchecked failure degrades to silence, not a +crash, which for an audio library is the correct failure sound. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| opaque `SrtHandle*` + `reinterpret_cast`; `impl()` overloads outside `extern "C"`; null-tolerant entry points | exposed class; handle tables; unguarded entries | ABI boundary with zero C++ leakage; C linkage forbids overloads; unchecked create must fail soft | `tools/capi/srt_capi.cpp`, `tools/capi/srt_capi.h` | + +## 16. Deleted copy operations: these are identity types + +Both concurrency-bearing classes delete copying: + +```cpp +SpscRing(const SpscRing&) = delete; +SpscRing& operator=(const SpscRing&) = delete; +``` + +and likewise `BasicAsyncSampleRateConverter`. 
The rejected alternative — +letting the compiler generate copies, or writing "deep copy" semantics — +fails the simplest question first: *what would a copy even mean?* A ring +mid-stream has a producer thread and a consumer thread holding a +reference to *this specific object*; a copy would duplicate the buffer +contents but not the relationship, producing an orphan that no thread +feeds. (Mechanically, `std::atomic` members are not copyable anyway — +the language is trying to tell you the same thing.) The converter is +worse: copying would snapshot servo state, telemetry and half-consumed +scratch into a second object whose FIFO occupancy no longer corresponds +to any real clock relationship. These are what the two-agent contract +makes them: objects with identity, addressed by the threads that share +them, not values to be passed around. Deleting the operations turns the +meaningless question into a compile error — the same conversion of +convention into compiler-enforced fact as the `static_assert`s +(sections 2, 9) and the concept (section 2). Moves are deleted along +with copies (declaring the deleted copy suppresses them), which is also +right: a moved-from ring would invalidate the pointers the other thread +is using *right now*. + +| Decision | Rejected | Reason | Evidence | +|---|---|---|---| +| deleted copy (and hence move) on ring and converter | default/deep copies | two live threads reference the object by identity; a copy duplicates state but not the clock relationship; atomics aren't copyable | `include/srt/spsc_ring.hpp`, `include/srt/asrc.hpp` | + +## 17. Rejected wholesale, with reasons + +Some decisions are visible only as absences. For each, the reason is on +record. 
+ +**`std::simd` / `std::experimental::simd`.** Not in C++20 — the library's +floor — and the portable-SIMD abstraction solves a problem this codebase +measured its way out of differently: where explicit SIMD wins, it is +gated per target and per measurement (the SMLALD path, +measured, kept; +the Hexagon `vrmpyh` path, −0.31%, implemented, proven bit-exact, and +*deliberately deleted* per the stop rule — PERFORMANCE.md C5). Where +auto-vectorization already wins (Helium on the M55, host AVX2 via the +channel axis), abstraction would only obscure what `-fopt-info-vec` and +`objdump` verified. + +**Coroutines.** The library's callers are device callbacks with hard +deadlines: `push()` on the capture thread, `pull()` on the playback +thread, both synchronous by the nature of the contract. No async model +fits — a suspension point inside a real-time callback is a category +error, and the frame flow the library does need (the resampler pulling +from the ring mid-synthesis) is expressed by the `PopFn` callable of +section 13 at zero machinery. + +**CRTP mixins.** Section 2's reasons in general form: the concept + traits +pair already delivers static dispatch and constraint checking without +forcing an inheritance shape onto builtin sample types or wrapper types +onto raw buffers. + +**Exceptions on the audio path.** Section 4; reinforced by a toolchain +that cannot unwind at all. + +**`std::jthread` (or any thread) in the library.** The library owns *no* +threads. It is a passive object with a two-agent contract — "one producer +thread calls push() at the input clock; one consumer thread calls pull() +at the output clock" (`asrc.hpp`) — and the threads belong to the caller, +because they already exist: they are the audio device callbacks. 
Spawning +threads would also be unbuildable on half the CI matrix; the bare-metal +targets have no `std::thread` at all, which is why even the *tests* +compile the two-thread stress only where `find_package(Threads)` succeeds +(`tests/CMakeLists.txt`). + +**Virtual interfaces for "pluggable filters."** The filter is not a +plugin point; it is a *parameter space*. `FilterSpec` exposes the five +numbers that matter (L, T, band edges, attenuation) and the design +machinery is one fixed, well-understood method (Kaiser-windowed sinc) +whose properties the quality tests pin. An `IFilterDesigner` interface +would buy the ability to substitute arbitrary coefficient tables at the +cost of an indirect call chain into the kernel (section 2's costs) and +the loss of every invariant the code currently states about its own +tables — per-branch DC gain, the extra phase row's exact continuity, +the measured |diff| ≤ 41 adjacent-phase delta of section 18. + +| Rejected | Reason | Evidence | +|---|---|---| +| `std::simd` | not in C++20; per-target measured intrinsics (kept or deleted by number) beat portable abstraction | `docs/PERFORMANCE.md` C4/C5 | +| coroutines | hard-RT synchronous callbacks; no async model fits | `include/srt/asrc.hpp` thread contract | +| CRTP mixins | concept + traits already give static dispatch without inheritance shape | `include/srt/sample_traits.hpp` | +| audio-path exceptions | RT contract; Hexagon cannot unwind | section 4 | +| `std::jthread` in the library | passive two-agent object; caller owns the (callback) threads; bare metal has none | `include/srt/asrc.hpp`; `tests/CMakeLists.txt` Threads probe | +| virtual pluggable filters | filter is a parameter space, not a plugin point; would cost kernel inlining and table invariants | `include/srt/polyphase_filter.hpp` (`FilterSpec`) | + +## 18. 
The meta-decision: comments that show their arithmetic + +Read back through the evidence column of this appendix and notice where +it points: overwhelmingly at *comments*. The library's final C++ decision +is about prose. Its comments do not narrate ("increment the index"); +they state constraints and record arithmetic at the point where the code +depends on them. The Q15 traits comment derives the accumulator budget +("48-80 taps add ~6-7 bits — no overflow, no intermediate rounding"). The +`kaiser.hpp` note quantifies the constexpr rejection (section 5). The +resampler's eps conversion documents its own safety margin ("|eps| is +servo-clamped to ~1e-3, so eps * 2^64 fits int64 comfortably"). The +`appendOne` compaction comment carries the +6–8% scar of section 12. +These comments are load-bearing: they are the reasons future editors +will weigh before changing the code, so they are held to the same +standard as the code. + +Including being *audited*. The package audit that hardened the core +(commit `029607f`, "Core hardening from the package audit") checked the +comments' arithmetic along with the code's, and found one wrong: the Q15 +`blend()` comment claimed the int32 product had "~5% margin" against a +worst-case adjacent-phase delta. The audit did the multiplication — +32767 × 65535 = 2,147,385,345, which sits 0.005% under `INT32_MAX`, not +5% — and the commit's own summary records the fix: "Q15 blend margin +comment corrected (0.005%, not ~5%)." The corrected comment in +`sample_traits.hpp` now shows the numbers and the measurement +(real deltas: |diff| ≤ 41 on the transparent table) and draws the +conclusion the wrong margin obscured: "a margin that thin is not an +invariant worth relying on silently" — which is precisely why the code +computes the blend in `int64_t`. Note what did *not* change: the code +was already right. The comment was the bug. + +That is the standard this appendix has been documenting all along. 
A +decision is not what the code happens to do; it is a claim, written where +the code makes it true, precise enough to be checked — and checked. diff --git a/book/src/appendix/glossary.md b/book/src/appendix/glossary.md new file mode 100644 index 0000000..c767f1a --- /dev/null +++ b/book/src/appendix/glossary.md @@ -0,0 +1,269 @@ +# Appendix B: Glossary + +> The limits of my language mean the limits of my world. +> +> — Ludwig Wittgenstein, *Tractatus Logico-Philosophicus* + +Terms of art as this book uses them. Where the general meaning and this +project's usage differ, the entry gives the project's. + +**Acquire/release** — the pair of C++ memory orderings that establishes +*happens-before* across threads: everything written before a +release-store is visible after the acquire-load that observes it. The +only synchronization in the library's ring buffer, used once per +direction; the same pair carries the converter across the RP2350's two +cores in the dual-core firmware. + +**AES17** — the Audio Engineering Society's standard for measuring +digital audio equipment, defining how THD+N and dynamic range are taken +(notch the fundamental, integrate the residual over the audio band, +A-weight for DR). The comparison notebook implements an AES17-style +measurement so the library's numbers are commensurable with hardware +datasheets. + +**Anti-image filter** — the lowpass that removes the spectral copies +(images) created by interpolating between sample instants. In this +library it is the Kaiser-windowed sinc prototype: pass the audio band +flat, suppress everything from the first image down by the stopband +attenuation. + +**ASRC (asynchronous sample rate converter)** — a converter between two +sample streams whose clocks are *independent*: the ratio is not known in +advance, is never exactly rational, and drifts, so it must be recovered +continuously by a servo. Distinct from a resampler library, which must be +handed the ratio from outside. 
+ +**Beat frequency** — the rate at which a slow periodic alignment +recurs; here, the rate at which whole-sample slips (and hence occupancy +sawteeth) arrive: `ppm × fs` for sample-granular transfer, divided by +the block size for block transfer. + +**Blend factor** — the fractional weight μ used to linearly interpolate +between the two polyphase coefficient rows adjacent to the current +fractional position. Computed once per output frame and shared across +all channels, which is why N channels cost `blend + N × dot`. + +**Block-beat sawtooth** — the deterministic waveform that block-quantized +transfer imprints on the FIFO occupancy observable: one push/pull block +peak-to-peak, at the beat frequency. It is measurement quantization, not +clock movement; the servo's stage gating and the unlock threshold both +exist to keep it out of the rate estimate. + +**Cache line** — the unit (64 bytes on the targets here) in which cores +move memory between their caches. Data structures shared between two +real-time threads are laid out in whole cache lines per owner. + +**Cache-line ping-pong** — the performance failure where a line written +by one core and read by another migrates back and forth on every access, +costing hundreds of cycles each trip. The ring buffer's cached-index +design exists so the steady-state fast path touches no foreign line at +all. + +**Cent** — one hundredth of a semitone, about 0.06% in frequency; the +unit in which the block-size study reports the low-rate FM that coarse +blocks impose (~0.9 cents rms at 32-frame blocks). + +**dBc** — decibels relative to the carrier: the level of a sideband or +spur measured against the signal that carries it, used for the servo's +sawtooth-rejection figures. + +**dBFS** — decibels relative to digital full scale; −1 dBFS is the AES17 +measurement level, 0.5 FS (−6 dBFS) the quality suite's. 
+ +**DWT / CYCCNT** — the Data Watchpoint and Trace unit of Arm M-profile +cores and its free-running 32-bit cycle counter. Optional silicon (hence +the `NOCYCCNT` runtime check), per-core on the RP2350, and the +instrument that converts QEMU instruction baselines into real cycle +budgets. + +**False sharing** — two logically unrelated variables sharing one cache +line, so writes to either invalidate both readers. Prevented in the ring +by giving producer state, consumer state, and shared read-only state a +64-byte-aligned line each. + +**FIFO** — first-in-first-out buffer. In this library the SPSC ring +between the clock domains; its occupancy doubles as the servo's phase +detector, which is why it exposes exact occupancy rather than an +approximation. + +**Fractional delay** — a delay of a non-integer number of samples, +realized by interpolating between stored samples. The near-unity ASRC's +datapath is a fractional delay that *creeps*: the fractional position +advances by the small rate deviation every frame. + +**Frame** — one sample per channel at one time instant; interleaved +buffers store frame after frame. Latency and occupancy are denominated +in frames so they are channel-count-invariant. + +**Group delay** — the delay a filter imposes on signal envelopes; for +the linear-phase FIR here it is a constant (T−1)/2 taps ≈ 24 input +samples for the default filter, the fixed half of the converter's +latency budget. + +**Header-only** — a library shipped entirely as headers, compiled into +each consuming translation unit. It buys trivial integration and full +inlining, and costs ABI fragility discipline (see the rejected +`hardware_destructive_interference_size`). + +**Interleaved** — channel-multiplexed sample layout +(`L R L R …`), the wire format of `push()`/`pull()`. 
+ +**Kaiser window** — the near-optimal FIR design window with one shape +parameter β trading main-lobe width against sidelobe level, plus +published closed-form fits from stopband attenuation to β and to filter +length. Chosen because the design math is a page of code with known +error bounds, evaluated once at construction. + +**Latency breathing** — the slow wander of the FIFO term of end-to-end +latency (a fraction of the block size) as the servo phase-tracks the +block beat in Track stage; benign, and distinct from an actual setpoint +change. + +**Lock-free** — progress guarantee: some operation always completes in a +bounded number of steps regardless of what other threads do, including +being suspended at the worst instruction (when *every* operation is so +bounded, the guarantee is the stronger *wait-free*). Required of +everything on the audio path; asserted at compile time for every atomic +the hot path touches. + +**Memory model / `std::memory_order`** — the C++ rules defining which +values a load may observe across threads, controlled per-operation by +ordering annotations. This codebase's idiom is *sufficiency as +documentation*: each annotation is exactly as strong as the proof needs, +so each one tells the reader why it exists. + +**MVE / Helium** — Arm's M-profile Vector Extension (Cortex-M55 class): +128-bit SIMD including fp32, but no double precision. Its presence or +absence gates which Q15 kernel the library compiles. + +**NCO (numerically controlled oscillator)** — an accumulator whose +increment sets its frequency. The converter's μ phase accumulator is the +NCO of its PLL: the servo's ε̂ sets the increment, and its wraps mark +whole-sample slips. + +**Near-unity** — the regime this library specializes in: conversion +ratios within a few hundred ppm of 1.0 (two "48 kHz" clocks), where the +general resampling problem degenerates into a creeping fractional delay. +The specialization is what buys the 48-tap datapath and sub-millisecond +filter delay. 
+ +**Occupancy** — the number of frames currently buffered between the +domains (ring plus staged frames). The servo's only sensor; its +quantization is the fundamental measurement limit of the design. + +**Phase accumulator** — the unsigned Q0.64 integer holding the +fractional resampling position. It accumulates only the rate *deviation* +per output sample, in integer arithmetic (resolution 2⁻⁶⁴ samples), and +detects whole-sample slips by 64-bit wraparound. + +**Polyphase decomposition** — factoring one long interpolation filter +into L short branches, one per fractional-delay phase, so each output +sample evaluates T taps instead of L·T. The table stores L+1 rows so the +μ wrap 1→0 is branch-free and exactly continuous. + +**ppm (parts per million)** — 10⁻⁶, the natural unit of crystal +tolerance and drift. Consumer crystals sit tens of ppm from nominal; the +converter accepts ±1000 ppm by default. + +**Q-format (Q0.15, Q1.14, Q1.30, Q0.64 …)** — fixed-point notation: +Qm.n has m integer bits and n fractional bits in a signed word (the +project writes the unsigned 64-bit phase as Q0.64). Q15 audio samples +are Q0.15; the corresponding coefficients are Q1.14 so values slightly +above 1.0 survive; accumulation is int64. + +**Ratchet** — the CI mechanism that compares deterministic instruction +counts against committed baselines at ±3% in *both* directions: a +regression fails, and an unexplained improvement also fails until the +baseline is deliberately re-committed. Two-sided so that numbers can +only change on purpose. + +**Semihosting** — a debug protocol by which a bare-metal program calls +into its host/debugger for I/O; how the Cortex-M test binaries print +results and exit under QEMU system emulation. + +**Seqlock** — a reader-retry publication scheme: the writer makes a +sequence counter odd, writes the payload, makes it even; readers retry +until one even value brackets a whole read. 
Used by the dual-core +firmware to publish multi-word statistics coherently with only 32-bit +atomics. + +**Servo** — a feedback controller steering a plant toward a setpoint; +here the PI controller that steers FIFO occupancy to the target by +adjusting the resampling rate, thereby *becoming* the clock-ratio +estimator. + +**Setpoint** — the target FIFO occupancy (`targetLatencyFrames`), +i.e. the buffering half of the latency budget. Must exceed the pull +block and the peak jitter excursion; the converter raises its +*effective* value when it observes otherwise. + +**Sine-fit metrology** — measuring quality by least-squares-fitting the +known test tone (amplitude, phase, frequency) and analyzing the residual +after exact subtraction. Sharper than FFT bins for single-tone tests and +immune to window leakage — leakage of the fitted tone cannot masquerade +as noise or crosstalk. + +**Slip** — the whole-sample event in near-unity conversion: after +roughly 1/ppm samples the accumulated fractional position crosses a +sample boundary and the read window shifts by one input sample. The +extra polyphase row makes the slip exactly continuous in the output. + +**SNR (signal-to-noise ratio)** — here, the fitted test tone's power +against everything else in the analysis window (a THD+N-style residual, +so distortion counts as noise), in dB. + +**Soft float / soft double** — floating-point arithmetic emulated in +integer instructions because the hardware lacks the format — FP64 +everywhere on Cortex-M33 and Hexagon. The reason the fixed-point +datapaths exist and the reason the servo's double math is budgeted per +block, not per sample. + +**SPSC (single-producer single-consumer)** — the concurrency restriction +of the library's ring: exactly one pushing agent and one pulling agent. +The restriction is what makes lock-freedom cheap — and it is a contract +about agents, not threads, which is what lets two CPU cores satisfy it. 
+ +**TCG plugin** — an instrumentation hook in QEMU's Tiny Code Generator; +the project's counting plugin observes every executed guest instruction, +yielding the deterministic per-workload counts the ratchet gates. + +**THD+N (total harmonic distortion plus noise)** — everything that is +not the test signal — harmonics, spurs, noise — integrated over the +audio band and expressed relative to the signal. The AES17 measurement +the comparison document reports (−132 dB at the 24-bit interface). + +**ThreadSanitizer (TSan)** — a compiler-instrumented data-race detector +that observes the ordering annotations actually used. It certifies only +the interleavings a run produces, which is why the project also runs it +on genuinely weakly-ordered arm64 hardware. + +**Type-2 loop** — a control loop with two integrators around the cycle +(here: the PI's integrator plus the FIFO, which integrates rate error +into occupancy). Type 2 is what nulls a *constant* rate offset with zero +standing occupancy error. + +**UF2** — the drag-and-drop flashing format of Raspberry Pi boards; the +build artifact of both Pico 2 firmware harnesses. + +**Underrun / overrun / resync** — the converter's three accounting +events: a pull found too little data (output silence-padded, refill and +re-lock), a push found the FIFO full (newest frames dropped), and the +consumer-side hard discard back to the setpoint after the high watermark +is reached. All three are counted, published in `Status`, and expected +to be zero after lock. + +**VLIW (very long instruction word)** — an architecture that packs +several operations into one issue packet scheduled by the compiler, as +on Qualcomm's Hexagon DSP. Why "instructions executed" and "packets +executed" differ there, and part of why instruction counts are budgets +rather than cycle counts. + +**Wraparound arithmetic** — unsigned integer arithmetic modulo 2^N, +which C++ defines exactly. 
The ring's monotonic indices and the DWT +cycle deltas both rely on the same theorem: a difference that fits the +word is computed exactly *through* the wrap, so the wrap is not an edge +case but a non-event. + +**xrun** — ALSA's collective name for a device-level underrun or overrun +(the OS missed the hardware's deadline). Handled in the bridge by +`snd_pcm_recover`; distinct from the converter's own underrun/overrun +accounting, which sits one layer up. diff --git a/book/src/img/architecture.svg b/book/src/img/architecture.svg new file mode 100644 index 0000000..e44ea4c --- /dev/null +++ b/book/src/img/architecture.svg @@ -0,0 +1,1760 @@ + + + + + + + + 2026-07-01T22:14:40.852216 + image/svg+xml + + + Matplotlib v3.10.9, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/book/src/img/feasibility.svg b/book/src/img/feasibility.svg new file mode 100644 index 0000000..d0c31ac --- /dev/null +++ 
b/book/src/img/feasibility.svg @@ -0,0 +1,4870 @@ + + + + + + + + 2026-07-01T22:14:43.804899 + image/svg+xml + + + Matplotlib v3.10.9, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/book/src/img/kaiser-response.svg 
b/book/src/img/kaiser-response.svg new file mode 100644 index 0000000..4b8ccd3 --- /dev/null +++ b/book/src/img/kaiser-response.svg @@ -0,0 +1,3683 @@ + + + + + + + + 2026-07-01T22:14:40.234973 + image/svg+xml + + + Matplotlib v3.10.9, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/book/src/img/kaiser-window.svg b/book/src/img/kaiser-window.svg new file mode 100644 index 0000000..98b5525 --- /dev/null +++ b/book/src/img/kaiser-window.svg @@ -0,0 +1,1791 @@ + + + + + + + + 2026-07-01T22:14:39.749395 + image/svg+xml + + + Matplotlib v3.10.9, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/book/src/img/q064-slip.svg b/book/src/img/q064-slip.svg new file mode 100644 index 0000000..ece9f37 --- /dev/null +++ 
b/book/src/img/q064-slip.svg @@ -0,0 +1,2039 @@ + + + + + + + + 2026-07-01T22:14:40.600970 + image/svg+xml + + + Matplotlib v3.10.9, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/book/src/img/servo-lock.svg 
b/book/src/img/servo-lock.svg new file mode 100644 index 0000000..32f7d63 --- /dev/null +++ b/book/src/img/servo-lock.svg @@ -0,0 +1,3005 @@ + + + + + + + + 2026-07-01T22:14:43.379611 + image/svg+xml + + + Matplotlib v3.10.9, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/book/src/introduction.md b/book/src/introduction.md new file mode 100644 index 0000000..ac5da99 --- /dev/null +++ b/book/src/introduction.md @@ -0,0 +1,124 @@ +# Introduction + +> Talk is cheap. Show me the code. +> +> — Linus Torvalds + +This book explains one piece of software completely. + +The software is **SampleRateTap**, a header-only C++20 library that solves a +narrow, stubborn problem in real-time audio: two devices both claim to run at +48 kHz, but each owns its own crystal oscillator, so neither actually does. +One drifts a few parts per million against the other — imperceptibly slowly +and absolutely relentlessly — and any system that moves audio between them +must either resample adaptively or eventually glitch. The library converts +between two such clock domains transparently (about 135 dB of measured +fidelity), in real time (about 1.5 ms of latency), on hardware from Xeon +servers down to a $5 microcontroller. + +That is a small enough problem to fit in your head and a deep enough one to +teach from. Solving it well demands working knowledge of half a dozen fields +that are usually taught separately: FIR filter design, fixed-point +arithmetic, control theory, lock-free concurrency, the C++ memory model, +SIMD micro-architecture, and the discipline of measuring instead of +guessing. The premise of this book is that you learn those subjects better +around one real, shipping artifact — where every design decision had to +survive contact with every other — than from isolated examples built to +illustrate exactly one thing. 
+ +## Who this is for + +You are comfortable in C++ — templates, RAII, the standard library — but you +have not necessarily written audio code, used `std::memory_order_acquire` in +anger, designed a filter, or counted the instructions your compiler emits. +No DSP background is assumed; the mathematics is built up exactly as far as +the code needs it and no further. Where a result has a textbook derivation, +we cite the textbook and spend our pages on what the textbooks omit: why +*this* form of the equation, in *this* code, on *this* hardware. + +## How this book stays honest + +Three mechanical commitments distinguish this book from most code walkthroughs. + +**The excerpts are live.** Every block of library code you read is included +into the book at build time from the actual header in the repository, by +anchor. If the code changes, the book changes or the book's build breaks — +in this project's continuous integration, where it is gated like every +other published number. +There is no possibility of the classic tutorial failure where prose +describes code that no longer exists. + +**Every claim ends in a command.** The library's culture is that performance +and quality numbers are measured, gated, and regenerated — never asserted +from memory. The book inherits that: each chapter closes with a *Verify it +yourself* section listing the exact tests, benchmarks, or notebooks that +back what you just read. When this book says the ring buffer is correct +under weak memory ordering, you will be holding the ThreadSanitizer +invocation that fails if it is not. 
+ +**The figures are regenerable.** Every plot in this book is produced by +`scripts/book_figures.py` from the sources the text cites: the filter +curves re-run the header's design math formula-for-formula, and the servo +and feasibility traces are *measured* — the script compiles a small trace +dumper against the real headers (and, for the before-the-fix panel, +against the pre-fix commit's headers pulled from git history) and runs it +in deterministic virtual time. Rerun the script and you reproduce every +figure; nothing is drawn from memory except the one architecture diagram, +which is labeled as drawn. + +## The history is the curriculum + +This codebase was built measurement-first, and its history contains real +reversals, preserved deliberately: + +- An optimization hypothesis about the Cortex-M55's floating-point unit that + was **wrong**, discovered because a 1.4% instruction-count regression + contradicted the project's own documentation — and the documentation, not + the measurement, turned out to be at fault. +- A Hexagon vectorization effort that was implemented, proven bit-exact, + measured at a 0.31% improvement — and then **deliberately deleted**, with + the disassembly evidence recorded so nobody re-derives the dead end. +- A correctness bug that survived months of green CI because every test and + benchmark happened to be configured just clear of it, found by an + adversarial audit, and demonstrated before it was fixed. +- A toolchain that turned out to be unable to catch C++ exceptions at all — + discovered the day the first `EXPECT_THROW` reached it. + +These are not embarrassments to be edited out; they are the most valuable +material in the book. Anyone can present a finished design as if it were +inevitable. Watching a design *survive falsification* teaches you what the +finished form is actually load-bearing against. 
+ +## The shape of the book + +**Part 0** establishes the problem and its budgets: why a plain FIFO +measurably fails (−34.7 dB!), what near-unity specialization buys, and the +arithmetic that connects picoseconds of timing jitter to decibels of +fidelity. + +**Part I** is the heart: the library's seven headers, one chapter each, in +dependency order — filter design, the polyphase table, the sample-type +traits, the lock-free ring, the clock servo, the fractional resampler, and +the converter that composes them. Each chapter covers the algorithm, the +C++ idioms chosen *and rejected*, and the failure modes the design guards +against. + +**Part II** explains the proof system: deterministic two-clock simulation, +sine-fit metrology, and the instruction-count ratchet that lets a CI runner +gate embedded performance to the exact instruction. + +**Part III** retells the optimization campaign as it actually happened — +six efforts, four wins, one honest draw, one deliberate revert — with the +real numbers and the two implementation traps that cost a day each. + +**Part IV** is portability: what a Qualcomm DSP, two bare-metal ARM cores, +and a C foreign-function interface each demanded. + +**Part V** reaches hardware: real crystals, real cycle counters, and the +configuration rules that scale across channel counts and sample rates. + +The appendices collect the C++ decision log (every idiom adopted or +rejected, with reasons), a glossary, and an annotated bibliography. + +Chapters are largely self-contained, but Part I builds on itself; if you +read only one chapter, make it [the lock-free ring](part1/spsc-ring.md) — +it is short, complete, and representative of the whole book's method. 
diff --git a/book/src/part0/budgets.md b/book/src/part0/budgets.md new file mode 100644 index 0000000..add62e8 --- /dev/null +++ b/book/src/part0/budgets.md @@ -0,0 +1,354 @@ +# Budgets: latency, quality, compute + +> Perfection is achieved, not when there is nothing more to add, but when there is nothing left to take away. +> +> — Antoine de Saint-Exupéry, *Wind, Sand and Stars* + +The previous chapter ended with three words used as if they were +self-explanatory: latency, quality, compute. This chapter turns each into +a number with a derivation behind it, because everything in Part I is an +expenditure against one of these three accounts, and you cannot audit an +expenditure without knowing the budget. + +The three budgets are not independent. A longer filter buys stopband +attenuation (quality) at the price of group delay (latency) and +multiply-accumulates (compute). A deeper FIFO buys servo stability +(quality, indirectly) at the price of latency. A finer polyphase table +buys interpolation accuracy at the price of memory and cache traffic. The +design that ships is not the best possible point on any single axis; it is +a defensible allocation across all three, and the allocation is different +for a Xeon than for a microcontroller. That is why the library has presets +and sample-type variants rather than one configuration: same architecture, +different budget splits. + +We take the three in the order of most surprising to least. + +## The quality budget, denominated in picoseconds + +The README makes a claim that deserves suspicion on first reading: the +phase accumulator's resolution is "far below the ~8 ps jitter budget for +120 dB transparency at 20 kHz." Eight *picoseconds* — in an audio system, +where a sample lasts twenty-one microseconds, six orders of magnitude +longer. Where does a number like that come from? + +It comes from the first real mathematics in this book, and the derivation +is three lines. 
This library's entire datapath is, as the last chapter +established, a creeping fractional delay: every output sample is the input +signal evaluated at a slightly wrong time, deliberately. So the natural +question is: how wrong is *acceptably* wrong? If we evaluate the signal at +time `t + Δt` instead of `t`, how large may `Δt` be before the error +matters at the quality level we are targeting? + +Take the worst case the audio band can offer: a full-scale sine at the top +of the band, + +```text +s(t) = A · sin(2π f t), f = 20 kHz. +``` + +The error caused by a small timing offset is governed by how fast the +signal can change. Differentiating, the slope is `2π f A · cos(2π f t)`, +whose magnitude peaks — at the zero crossings — at + +```text +max |ds/dt| = 2π f A. +``` + +A timing error `Δt` therefore produces an amplitude error of at most the +slope times the error: + +```text +e = 2π f A · Δt. +``` + +Now impose the quality target. The filter at the heart of this library is +designed with a 120 dB stopband — the "120 dB transparency" figure that +recurs throughout the project — and −120 dB as an amplitude ratio is +`10^(−120/20) = 10⁻⁶`. Demanding that the timing-induced error stay below +that, relative to full scale: + +```text +2π f · Δt ≤ 10⁻⁶ +Δt ≤ 10⁻⁶ / (2π · 20 000 Hz) = 7.96 × 10⁻¹² s ≈ 8 ps. +``` + +Eight picoseconds. Not because audio hardware keeps time that precisely — +it does not, remotely — but because *this library's job is to manufacture +sampling instants*. The two crystals define real time; the converter +invents the fractional positions in between, and any noise in those +invented positions is indistinguishable from noise added to the audio, at +the exchange rate the slope sets: one picosecond of timing error at 20 kHz +full scale costs about an eighth of a microvolt-per-volt, and 8 ps costs +−120 dB. Position error *is* amplitude error. 
That single sentence is the +reason a resampling library must care about time resolution that would be +absurd anywhere else in audio. + +Two honest qualifications keep the number from overclaiming. First, this +is a worst-case bound — the full-scale 20 kHz zero crossing — and real +program material spends almost no energy there; at 1 kHz the same +derivation gives a 20× looser budget, which is one reason the measured SNR +table reads 135 dB at 997 Hz but 105 dB at 19.5 kHz. Second, the budget +governs *random or signal-uncorrelated* timing error. Slowly varying +timing error is not noise but frequency modulation — pitch wobble — and it +gets its own, much stricter treatment when the servo chapter derives why +the Quiet stage must reject its input sawtooth to roughly −120 dBc +equivalent at 20 kHz. Same currency, different account. + +## Spending the budget: sixty-four bits of phase + +With the budget in hand, we can now read the library's most important +data-representation decision as the budget allocation it is. Convert 8 ps +into the datapath's native unit, fractions of a sample at 48 kHz: + +```text +8 ps / 20.8 µs ≈ 3.8 × 10⁻⁷ samples ≈ 2⁻²¹ samples. +``` + +So the fractional position μ must be carried to about 21 fractional bits +before timing quantization alone could threaten 120 dB. Here is what the +library actually does, in the inner loop of the fractional resampler — +this is the Q0.64 phase accumulator the README describes, live from +`include/srt/polyphase_filter.hpp`: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:p0_phase_step}} +``` + +The fractional position lives in an unsigned 64-bit integer interpreted as +Q0.64: all 64 bits are fraction, so the resolution is 2⁻⁶⁴ of a sample — +forty-three binary orders of magnitude below the 2⁻²¹ the budget demands. 
+The servo's rate-deviation estimate `epsHat` is converted from double to +this fixed-point form **once per block**, and from there the per-sample +path is pure integer arithmetic: one 64-bit addition per output sample, +with the two slip cases — the fractional position creeping past 1.0 or +below 0.0, the "whole-sample slip roughly every `1/ppm` samples" of +Chapter 1 — detected by unsigned wraparound rather than comparison against +a threshold. + +Why carry 43 bits more resolution than the budget requires? Because the +excess is free, and what it buys is not resolution but *exactness*. A +phase accumulator adds a tiny ε thousands of times per second; do that in +floating point and every addition rounds, because a double near 1.0 has +2⁻⁵² of absolute resolution and a double's rounding depends on the current +magnitude of the accumulator. The earlier version of this code did exactly +that, and worked. But integer addition modulo 2⁶⁴ does not round — ever — +so the only quantization in the entire phase path is the once-per-block +conversion of ε itself, and the accumulated position between servo updates +is bit-exact. (The conversion is safe by construction: the servo clamps +|ε| to about 10⁻³, so `ε · 2⁶⁴` fits comfortably in the signed 64-bit +intermediate — the code comment above carries the argument, and the +configuration validator refuses `maxDeviationPpm` settings that could +break it.) + +The project's performance log records what this decision measured when it +landed as change C3 of the optimization campaign: the *motivation* was the +compute budget — an integer-only per-sample path with no doubles is what +keeps the inner loop cheap on DSPs without double-precision floating-point +units, and it cut Hexagon's Q31 pipeline cost by 15.5 % — but quality +*improved* as a side effect, to 135.0 dB at 997 Hz, with the log noting +the phase resolution change from 2⁻⁵² to 2⁻⁶⁴. One representation change, +paid from no budget, credited to two. 
Those are rare, and worth designing +toward. + +## The latency budget + +Latency is the easiest budget to state and the easiest to spend by +accident. Here is where every frame of it is decided — the converter's +entire configuration surface, live from `include/srt/asrc.hpp`: + +```cpp +{{#include ../../../include/srt/asrc.hpp:p0_config}} +``` + +The README's latency equation prices the defaults: + +```text +latency = targetLatencyFrames + (L·T − 1) / (2L) [input frames] + = 48 + (256·48 − 1)/512 + = 48 + ~24 ≈ 72 frames ≈ 1.5 ms at 48 kHz. +``` + +Two terms, and they are budget lines of entirely different character. + +The second term is the **filter group delay**, and it is a law of physics +wearing a configuration option's clothes. The interpolation filter is a +linear-phase FIR — symmetric coefficients, which is what guarantees every +frequency is delayed equally, and waveform shape is preserved — and a +symmetric filter *must* delay the signal by half its span: with `L = 256` +polyphase branches of `T = 48` taps each, `(L·T − 1)/(2L)` is 23.998 +input frames, ~0.50 ms. You cannot negotiate this term down at constant +quality; you can only buy a shorter filter. `FilterSpec::fast()` does +exactly that, cutting group delay to about 16 frames at reduced stopband, +and the `transparent()` preset spends the other way — 80 taps, 40 frames, +0.83 ms — for its extra high-frequency headroom. Quality and latency, +trading at a posted exchange rate of half a frame per tap. + +The first term, the 48-frame **FIFO setpoint**, is not physics but control +headroom, and it is the term you own. The FIFO between the clock domains +must never run empty (an audible underrun) and never hit its high +watermark (a resync), so the servo regulates its occupancy around a +setpoint — and that standing occupancy is buffered audio you are listening +through. 
Forty-eight frames is one millisecond at 48 kHz: enough to absorb +the push/pull phase jitter of real callbacks with margin, small enough to +keep the total design latency at 1.5 ms. + +The setpoint carries a feasibility rule that the README states in bold and +the constructor-plus-`pull()` logic enforces, because violating it does +not degrade the system — it destroys it: **the setpoint must exceed the +pull block size.** A `pull()` synthesizes output only from frames already +buffered; if the callback asks for 128 frames while the servo holds the +buffer at 48, every callback drains the FIFO through empty, and the +converter falls into a permanent dropout cycle that no amount of servo +cleverness can escape, because the geometry is simply infeasible. Rather +than document a footgun, the converter adapts: when it observes pull +blocks larger than the configured setpoint, it raises the effective +setpoint to the block size plus about half a block of margin (bounded by +FIFO capacity — callbacks above ~340 frames also need `fifoFrames` sized +explicitly), reports the raised value in +`Status::effectiveTargetLatencyFrames`, and lets latency follow. The +latency budget, in other words, has a hard floor set by your callback +size, and the library will spend up to that floor without asking — the +one budget line it refuses to let you underfund. On top of the rule sits +its softer sibling: the setpoint must also stay above the peak occupancy +excursion of your push/pull jitter, and the FIFO term breathes by a +fraction of the block size as the servo tracks drift, so 1.5 ms is a +design center, not a guarantee etched per-sample. + +`designedLatencySeconds()` reports the resulting figure at runtime, and +`tests/test_latency.cpp` closes the loop the project's way: it pushes an +impulse through a locked converter and asserts that the impulse emerges +where the equation said it would. 
+ +## The compute budget + +The third budget is the one whose *unit* changes with the deployment. On a +server, compute is a fraction of a core; on a microcontroller, it is a +question of existence — does the workload fit under the clock rate or not. +This library targets both ends simultaneously, which is why its +performance culture is unusual, and why `docs/PERFORMANCE.md` is one of +the two canonical history documents this book draws on. + +Start at the comfortable end. On the shared 2.80 GHz Xeon that produced +the README's benchmark table, the default float converter processes a +stereo 48 kHz stream at 107.8 ns per frame — 193× faster than real time, +meaning one live stream costs about half a percent of one core. At that +end the compute budget is not about survival but about citizenship: how +many streams per core, how much headroom the rest of the audio graph +inherits. + +Now the other end. The README's platform matrix ends at the Arm +Cortex-M33 — the Raspberry Pi Pico 2's core, bare metal, no FP64 hardware, +no vector unit — and the project publishes, in the README's +instruction-count table, exactly what every workload costs there. The +numbers are *executed instructions*, measured by running fixed workloads +under QEMU with a counting plugin, and they are brutal and instructive. +The float interpolation kernel that costs the Cortex-M55 99.5 million +instructions costs the M33 1.90 **billion** — about 19× — for one reason: +the float datapath accumulates in double precision by design, and on a +core with no double-precision FPU every one of those accumulations becomes +a software floating-point library call. The compute budget on such a +target is not tightened; it is a different budget entirely, and the +Q15/Q31 fixed-point datapaths exist precisely as the correctly-denominated +response — integer-only inner loops that make the M33's cost land near the +M55's instead of 19× above it. + +What does an instruction budget *mean* on a 150 MHz M33? Divide. 
A 150 MHz +core executing (optimistically) one instruction per cycle retires 150 +million instructions per second, and a 48 kHz stream demands a frame every +20.8 µs — about 3,100 instructions of total budget per frame, forever, +before the rest of the firmware has run at all. Against that, the measured +comparison workloads put the full Q15 converter — servo and FIFO included +— at roughly 5,043 instructions per stereo frame on the M33: about 242 +million instructions per second for stereo, over the core's ceiling even +at ideal IPC. Mono, at roughly half that, fits. This is exactly the +README's guidance, now visible as arithmetic rather than advice: 48 kHz +Q15 mono fits a 150 MHz M33; stereo wants the `fast()` preset or the +RP2350's second core. On a Xeon the same library is a rounding error; on +the M33 the default preset is *infeasible in stereo*, and knowing that +before flashing hardware is the entire point of keeping the budget in a +table. + +The honesty clause matters as much as the numbers, and `docs/PERFORMANCE.md` +states it in its metrics table: instruction counts are deterministic to +the instruction, noise-free, and well-correlated with real cost *for +scalar code* — and they are still not cycles. They know nothing of wait +states, flash caches, or dual-issue. Cycle truth requires vendor +simulators or real silicon, which is why the repository carries +`examples/pico2_cyccnt/`, a flashable RP2350 harness that measures +DWT.CYCCNT cycles per block against these same instruction baselines, and +why the README explicitly frames the counts as "budgets pending +real-silicon validation." 
What determinism *does* buy is enforcement: the +counts are committed to `bench/baselines.json` and CI re-measures every +push, failing on any drift beyond ±3 % in either direction — a regression +is rejected, and an unexplained improvement is also rejected until the +baseline is re-recorded in the same diff, so stale slack cannot accumulate +to hide the next regression. Wall-clock numbers, by contrast, are never a +hard gate: shared runners are too noisy, and a gate that flakes teaches +people to ignore it. Instructions are gated because they are exact; +wall-clock is reported because it is real. Both disciplines are the same +policy — publish only what you can re-measure — applied to metrics of +different reliability. Part II returns to this machinery in detail. + +## Each budget line becomes a file + +Part 0 has now done its work: a physical problem (two crystals), a +measured cost of ignoring it (−34.7 dB), and three budgets with numbers +attached. Part I walks the library's headers in dependency order, and the +tour is really the budget ledger read line by line: + +`kaiser.hpp` is the quality budget's opening entry — the 120 dB stopband +that made the 8 ps derivation's target, purchased with a windowed-sinc +design whose tap count is the latency and compute budgets' first expense. +The polyphase bank spends memory to make one branch-pair evaluation per +output sample possible at all, and its `L = 256` branch count is sized by +the interpolation-residual rule the README quotes (−12 dB per doubling of +`L`, +12 dB per octave of signal frequency) — the reason the measured +table slopes from 135 dB at 997 Hz to 105 dB at 19.5 kHz. +`sample_traits.hpp` is the compute budget's answer to the M33 column +above: the Q15/Q31 datapaths as a customization point rather than a fork. +`spsc_ring.hpp` holds the latency budget physically — its occupancy *is* +the 48-frame line item — and doubles as the servo's sensor. 
`pi_servo.hpp` +polices the quality budget's FM account, rejecting the occupancy sawtooth +to the −120 dBc figure this chapter bounded. The fractional resampler +carries the Q0.64 accumulator you have already read. And `asrc.hpp` +composes the whole, enforcing the feasibility rule so the latency budget +can never be underfunded into a dropout cycle. + +Every number in those chapters traces back to one of this chapter's three +accounts. When a design choice seems baroque — a 64-bit integer phase, an +extra row in a coefficient table, a third servo stage — the question to +ask is always the same: *which budget is it spending, and which is it +defending?* + +## Verify it yourself + +```sh +# The 8 ps budget, re-derived in one line: +python3 -c "import math; print(1e-6 / (2 * math.pi * 20000))" + +# The quality budget, enforced: the pinned SNR thresholds behind the +# README's 135/120/112/105 dB table: +cmake -B build -DCMAKE_BUILD_TYPE=Release +cmake --build build -j +ctest --test-dir build -R AsrcQuality --output-on-failure + +# The latency budget, enforced: an impulse must emerge exactly where +# designedLatencySeconds() promises (48 + ~24 frames by default): +ctest --test-dir build -R Latency --output-on-failure + +# The host compute budget (Google Benchmark; the README table's source): +cmake -B build-bench -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_BENCHMARKS=ON +cmake --build build-bench -j +./build-bench/bench/srt_bench + +# The embedded compute budget: fixed workloads under QEMU, compared to +# the committed baselines at ±3% (needs the cross toolchain and a +# TCG-plugin-capable QEMU — docs/PERFORMANCE.md has the mechanics): +python3 scripts/icount.py --target m33 --build-dir <cross-build-dir> --plugin <path-to-qemu-plugin.so> +``` + +The instruction-count and benchmark tables in the README regenerate from +these same commands (`scripts/update_icount_docs.py`, +`scripts/update_perf_docs.py`), and CI fails if the published tables drift +from the measured baselines — the budgets in this chapter are audited 
on +every push. diff --git a/book/src/part0/two-crystals.md b/book/src/part0/two-crystals.md new file mode 100644 index 0000000..d9d8696 --- /dev/null +++ b/book/src/part0/two-crystals.md @@ -0,0 +1,295 @@ +# Two crystals, one stream + +> No man ever steps in the same river twice, for it is not the same river and he is not the same man. +> +> — attributed to Heraclitus + +Every specification of this library begins with a lie that the audio +industry tells itself daily: "48 kHz." + +There is no such thing as 48 kHz. There is a quartz crystal on the capture +device's board resonating at very nearly the frequency its datasheet +promises, and a different quartz crystal on the playback device's board +doing the same, and neither of them consulted the other. Each was cut, +trimmed, and aged in its own factory; each sits at its own temperature, +warming with the electronics around it; each is divided down to a sample +clock through its own board's logic. When both devices claim 48 kHz, what +they mean is 48 kHz plus or minus some parts per million — and *whose* +parts per million is exactly the question. This library's working +envelope, inherited from the kind of hardware it targets, is a few hundred +ppm of offset per device, drifting slowly as temperatures change; the +default configuration accepts anything within ±1000 ppm, and the test +suite drives it across that range deliberately — including a 0 → 300 ppm +drift ramp at 10 ppm/s that must be tracked without losing lock. + +A part per million sounds like nothing. It is worth pausing on why it is +everything. + +## The integral that cannot be argued with + +Suppose you capture audio from device A and play it on device B, and +suppose the two clocks disagree by +200 ppm — the offset used throughout +this project's measurements as a realistic mid-scale case: the input side +runs at 48 009.6 Hz against the output's 48 000 Hz. The rate mismatch is +0.02 %. Per sample it is invisible. 
But a rate mismatch does not average +out; it *integrates*. Every second, the capture side produces 9.6 more +frames than the playback side consumes. Every second, forever. + +Put a buffer between them — the obvious move, and a correct first move — +and you have only chosen where the failure happens. The surplus +accumulates in the buffer at 9.6 frames per second. A 1,024-frame FIFO +(the converter's own default capacity floor, for scale) started half full +gives you about 53 seconds before it is completely full and something has +to give. Make the buffer deeper and you buy time linearly while paying +latency for every frame of depth; a buffer deep enough to survive an +hour-long session at 200 ppm would hold about three quarters of a second +of audio, all of which you would then be monitoring through. Flip the sign +of the mismatch and the same argument drains the buffer to empty instead. +There is no buffer size that fixes a rate mismatch, because the problem is +not jitter — which a buffer genuinely absorbs — but a nonzero mean. The +README states the consequence as the library's founding fact: whole-sample +slips occur roughly once every `1/ppm` samples (5,000 samples at 200 ppm), +and any system that moves audio between independent clocks must either +resample adaptively or eventually glitch. + +So the plain FIFO must fail. The interesting question — and the one this +project answered by measurement rather than assertion, because that is its +habit — is *how badly*. + +## Measuring the do-nothing option + +The comparison notebook (`notebooks/asrc_comparison.ipynb`, results +recorded in `docs/COMPARISON.md`) includes, alongside the serious +contenders, a subject called the **naive FIFO**: a buffer that simply +drops the newest samples when full, which is what "we'll deal with it +later" compiles to. 
It was measured under exactly the same conditions as +everything else — a 997 Hz tone at −1 dBFS crossing a +200 ppm clock +boundary, an AES17-style THD+N analysis with the fundamental removed and +the residual integrated across the 20 Hz–20 kHz band. + +The naive FIFO measures **−34.7 dB THD+N** and 94.7 dB of A-weighted +dynamic range. The converter this book describes, on the same signal and +the same clocks, measures −132.1 dB. + +What does −34.7 dB sound like? The number means that the error left after +subtracting the test tone sits only 34.7 dB below the tone itself — a +residual of about 1.8 % of the signal. If that residual were smooth +harmonic distortion, 1.8 % would already be far into plain audibility. But +it is worse than that, because of *how* the error is distributed in time. +At +200 ppm the buffer overflows and discards a sample about 9.6 times per +second, and each discard splices the waveform to a point one sample later: +a step discontinuity. A step is the broadest-band event a sampled signal +can contain; its energy smears across the entire spectrum. So the +subjective experience is not a haze of distortion but a steady mechanical +ticking — roughly ten clicks per second at this offset — riding on +otherwise clean audio. It is the sound that anyone who has misconfigured a +USB audio loopback already knows, and once heard it cannot be unheard. The +dynamic-range figure tells the same story from below: quiet passages sit +on a floor of click energy, tens of decibels above where the converter's +floor lies. + +That row of the table is the cost of doing nothing, and it calibrates +everything else in this book. Every design decision in the chapters ahead +is ultimately justified by the distance between −34.7 dB and −132.1 dB. + +## The two industry answers + +The two-crystal problem is decades old, and industry converged on two +families of solution. 
`docs/COMPARISON.md` opens by insisting on the +distinction, because both families are marketed under the same three +letters: there are **full ASRCs** that recover the clock ratio themselves, +and **resampler libraries** that must be handed the ratio from outside. + +**The hardware answer** is the asynchronous sample rate converter chip. +The canonical part is Analog Devices' AD1896 — the lineage this library's +architecture explicitly follows — joined by parts like TI's SRC4392. These +are dedicated silicon: serial audio in on one clock, serial audio out on +another, and the chip does everything, including the part that makes the +problem *asynchronous* — discovering the ratio between the two clocks by +itself, continuously, without being told. The datasheet numbers are +excellent: −117 dB THD+N minimum (−133 dB best case) and 142 dB dynamic +range for the AD1896; −140 dB typical and 144 dB dynamic range for the +SRC4392. Their ratio ranges are enormous — 1:8 up and 7.75:1 down for the +AD1896, 1:16 to 16:1 for the SRC4392 — because these chips are built to +convert 44.1 kHz material to 48 kHz and every other crossing a studio can +produce, not merely to absorb drift. The costs are the obvious ones: a +proprietary part, a place on the board, one stereo pair per chip, and no +help at all if your audio exists as bytes in memory rather than as a +bitstream between codecs. (A caveat the comparison document is careful +about, and this book inherits: those figures are datasheet values measured +through analog test loops, not this project's measurement. They are +comparable to the software numbers in definition, not in environment.) + +**The software answer** is the resampler library: libsamplerate, soxr, +zita-resampler. These are superb pieces of engineering with a structural +gap that `docs/COMPARISON.md` names precisely: they must be handed the +ratio by an external servo, and so they solve *only half of the drift +problem*. 
A resampler library answers the question "given that the input +runs 200 ppm fast, compute the output samples" — flawlessly, at any ratio +you ask. It does not answer "how fast is the input actually running right +now?", and that is the question the two-crystal problem poses, because +nothing in your system knows the answer. The true ratio is not written +down anywhere; it exists only physically, in the beat between two +oscillators, and it moves as the room warms up. In the comparison +measurements the libraries were fed the exact ratio by an oracle — the +harness knew the true offset because it had synthesized it — and under +those conditions they measure at the format ceilings: −143.5 dB THD+N +through a 24-bit interface for libsamplerate's `sinc_best`, −143.8 dB for +soxr's `VHQ`. Real numbers, and also unobtainable in the field as stated, +because the oracle does not ship. (Near-unity is their easy regime, too: +libsamplerate's published 97 dB worst case belongs to aggressive ratios, +not this one.) + +The missing half has a name: clock recovery. Somebody must observe the two +domains, estimate their ratio from evidence, and track it as it drifts — a +control problem, not a signal-processing one. The Linux/JACK ecosystem +shows what bolting that half on looks like: zita-ajbridge wraps a +delay-locked loop around zita-resampler. Operating systems solve it too, +invisibly — CoreAudio, WASAPI shared mode, and PipeWire all run ASRCs +inside their engines — with unpublished quality and typically 5–20 ms of +latency, fine for notification sounds and disqualifying for live +monitoring. + +So the field, surveyed honestly: chips that solve the whole problem in +proprietary silicon; libraries that solve the easy half in portable +software at reference quality; system engines that solve the whole problem +opaquely at whatever quality and latency they choose. 
What did not exist — +and what this library is — is the whole problem solved in open, portable, +embeddable software at measured quality: an AD1896-shaped architecture, +polyphase FIR plus clock servo, that you can compile. + +## The specialization that pays for everything + +You cannot simply transcribe the AD1896 into C++ and expect it to fit on a +microcontroller; the chips' generality is exactly the expensive part. +SampleRateTap's founding decision is to refuse most of the problem the +chips solve. It handles *only* the near-unity case: two domains at +nominally the same rate, within ±1000 ppm by default. It will never +convert 44.1 kHz to 48 kHz — the README lists this first among its +limitations, and `docs/COMPARISON.md` is blunt that for genuine rate +*conversion* you should put soxr or libsamplerate in the chain. + +Here is what the restriction buys. A general-ratio converter must be able +to place output samples anywhere relative to input samples, at any +spectral relationship between the rates — including downward conversions +where the filter must also band-limit, and ratios that change which parts +of its machinery dominate. In the near-unity regime none of that machinery +earns its keep. When the ratio is 1 + ε with ε a few hundred parts per +million, each output sample lands *almost exactly* on an input sample: +just a hair early or late, by a fractional offset that creeps by ε per +sample and wraps once every `1/ε` samples. The README's "How it works" +section states the consequence in one phrase: the conversion degenerates +into a **creeping fractional delay**. The datapath's job collapses to +evaluating one interpolation at a slowly sliding fractional position — a +48-tap dot product per output sample in the default configuration — plus a +servo deciding how fast the position should creep. 
And because the two +rates are spectrally indistinguishable, anti-imaging and anti-aliasing +collapse into a single fixed filter design, flat to 20 kHz, done once in +the constructor. + +The computational tables in `docs/COMPARISON.md` measure what that is +worth. Against libsamplerate — the closest architectural analog, a +streaming time-domain polyphase resampler — at the matched ~120 dB quality +tier, SampleRateTap converts 2.9–3.6× more frames per second (mono/stereo; +2.1× at 8 channels, where both engines amortize), while carrying half the +algorithmic latency: 24 frames (0.50 ms) of filter group delay against 46 +frames (0.96 ms). At the ~140 dB tier the gap widens to 6.2× in throughput +and to 40 frames against 143 in latency. That is the near-unity dividend, +and the comparison document names its mechanism exactly: a 48-tap window +with a creeping phase, instead of general-ratio machinery. On targets +without floating-point hardware the dividend compounds — the Q15 +fixed-point datapath has no libsamplerate analog at all, and on a +Pico-class Cortex-M33 the cheapest libsamplerate option costs about 9.8× +what SampleRateTap's intended configuration does. + +The soxr rows teach a different lesson, and reading them honestly is a +preview of the next chapter. At the ~120 dB tier soxr converts 32.4 +million stereo frames per second on the same host to SampleRateTap's 10.5 +million — soxr wins raw throughput, decisively, by processing in large +SIMD-friendly internal batches. The latency column is the price: 556 to +607 frames of algorithmic delay, 11.6 to 12.6 ms, rising to 777 frames +(16.2 ms) at its highest quality tier. Those are fine numbers for batch +conversion and impossible ones inside a 1–2 ms live-monitoring budget, and +— as `docs/COMPARISON.md` puts it — there is no setting that buys soxr's +throughput at SampleRateTap's latency. 
Throughput, latency, and quality +are not independent virtues to be maximized; they are a budget to be +allocated, and different tools have allocated it for different lives. + +One more number from the measured table completes the picture, because +this book does not deal in free lunches. Fed by its own servo rather than +an oracle, running causally at 1.5 ms of total design latency, +SampleRateTap measures −132.1 dB THD+N against the oracle-fed libraries' +−143.5 dB. The ~11 dB gap is the measured price of solving the *whole* +problem — discovering the ratio from buffer occupancy in real time instead +of being told it — and the comparison document presents it as exactly +that. Eleven decibels, spent 132 dB below the signal, purchasing the half +of the problem that was actually hard. The rest of this book is an account +of how both numbers — the 132 and the 11 — were achieved, measured, and +defended. + +## Watching the invisible + +Before the budgets, one more thing Chapter 1 owes you: a way to *see* the +problem, because 200 ppm is below anything your ears will report until the +FIFO finally gives way. The repository's first example, +`examples/drifting_clocks.cpp`, exists for exactly this. It runs two real +threads: a producer pushing a 997 Hz sine at a virtual 48 000.0 Hz, and a +consumer pulling at 48 kHz plus 500 ppm, both paced with absolute +`sleep_until` deadlines so the long-term rates are exact even though every +individual wakeup jitters by operating-system amounts — far rougher timing +than any real audio callback delivers. A status line prints the servo's +state and its rate estimate as it converges toward the −500 ppm +consumption deviation. + +Two of the example's own caveats are worth reading before you run it, +because each is a preview of a later chapter. 
First, since scheduler +jitter here is on the order of milliseconds, the demo configures a 20 ms +FIFO setpoint rather than the library's 1 ms default — your first sighting +of the latency budget bending to its environment, which is the next +chapter's subject. Second, the converter observes the clocks only through +whole 96-frame chunks, so its estimate of the ratio cannot firm up faster +than the chunk-beat period `1/(ppm × chunkRate)` — about four seconds per +beat cycle at 500 ppm — and the instantaneous estimate visibly wobbles at +that beat, which is why the display shows a three-second moving average. +The information available about two clocks is quantized by how coarsely +you watch them exchange data; that observation will return as the entire +justification for the servo's three-stage design. + +Run it and watch the state go `Filling`, then `Acquiring`, then `Locked`, +and the ppm readout settle toward −500. Nothing about the audio would have +told you any of this for the first minute — and that is the point. The +drift is always there; the only choice is whether something in the system +is measuring it. + +First, though, the budgets. Claims like "a 1–2 ms live-monitoring budget" +and "120 dB transparency" have been used here as if self-evident. They are +not. The next chapter derives each one — including why this library's +quality target works out to a timing tolerance of about eight +*picoseconds*. + +## Verify it yourself + +```sh +# Two real threads, two clocks 500 ppm apart; watch the servo lock and +# the ppm estimate converge: +cmake -B build -DCMAKE_BUILD_TYPE=Release +cmake --build build -j +./build/examples/drifting_clocks + +# Reproduce the measured table — including the −34.7 dB naive-FIFO row +# and the oracle-fed library ceilings. 
Needs numpy, matplotlib, and the +# `samplerate` and `soxr` Python packages; the first cell builds the +# C ABI shared library if missing: +jupyter execute notebooks/asrc_comparison.ipynb + +# The computational head-to-head on your own host (requires the system +# libsamplerate and soxr development packages, found via pkg-config): +cmake -B build-cmp -DCMAKE_BUILD_TYPE=Release \ + -DSRT_BUILD_BENCHMARKS=ON -DSRT_BUILD_COMPARE_BENCH=ON +cmake --build build-cmp -j +./build-cmp/bench/compare/srt_bench_compare +``` + +The comparison notebook pins SampleRateTap's own results with assertions, +so a regression in the library makes the reproduction fail loudly. The +numbers in this chapter are load-bearing, not decoration. diff --git a/book/src/part1/asrc.md b/book/src/part1/asrc.md new file mode 100644 index 0000000..45a676e --- /dev/null +++ b/book/src/part1/asrc.md @@ -0,0 +1,270 @@ +# Composition: `asrc.hpp` + +> The whole is something beside the parts. +> +> — Aristotle, *Metaphysics* + +Every previous chapter built a component that is correct on its own terms. +This chapter is about the file that has no terms of its own: `asrc.hpp` +contains almost no algorithm, no mathematics, and fewer than three hundred +lines that mostly call other files' code. It is also where the only serious +bug in the library's history lived. Both facts have the same cause. +Composition is where each component's assumptions meet every other +component's guarantees, and the gaps between them are invisible from inside +any single file. + +The cast, assembled: a `PolyphaseFilterBank` designed at construction, a +`FractionalResampler` that owns the history and the phase, a `SpscRing` +carrying interleaved frames between the two clock domains, and a `PiServo` +turning ring occupancy into a rate estimate. `BasicAsyncSampleRateConverter` +wires them together and adds the four things none of them could own alone: +a lifecycle state machine, an under/overrun policy, telemetry, and +validation. 
+ +![The composed converter: producer pushes into the ring, the servo turns +ring occupancy into a rate estimate, the resampler consumes at that rate, +the consumer pulls](../img/architecture.svg) + +*The whole machine on one page. The ring is the only structure both clock +domains touch; everything downstream of it — servo, resampler, and both +their states — lives on the consumer's side, which is why `pull()` carries +all the policy and `push()` is eight lines.* + +## The two-agent shape + +The public surface is two functions and a contract: + +- `push(interleaved, frames)` — called by exactly one producer agent, at + the input clock's pace. +- `pull(interleaved, frames)` — called by exactly one consumer agent, at + the output clock's pace. + +"Agent" rather than "thread" is deliberate. On a workstation the two agents +are threads; on the dual-core RP2350 firmware they are two processor cores; +in the deterministic test simulator they are interleaved events on one +thread. The converter never creates a thread, never names a thread, and +never synchronizes beyond what the ring already provides — it is a passive +object that two callers animate. This is why the library contains no +`std::thread`, no executor, and no callback registration: the moment a +library owns threads it owns scheduling policy, priorities, and shutdown +order, all of which belong to the application. The cost of this design is a +sharp, documented affinity contract (push is producer-only, pull is +consumer-only, `resetFromConsumer` is consumer-only); the C-ABI header +restates it because FFI callers can't read C++ doc comments. + +`push()` is eight lines and nearly trivial — clip to free space, write, +count an overrun if clipped. All composition complexity lives on the +consumer side, and that too is a decision: the producer is often an +interrupt-context audio callback with the tightest budget in the system, so +every gram of policy was moved to the puller. 
+ +## The state machine + +`pull()` runs a three-state lifecycle — Filling, then a servo that is +Acquiring or Locked — plus two exceptional transitions. Here is the filling +and resync machinery as it ships: + +```cpp +{{#include ../../../include/srt/asrc.hpp:asrc_filling}} +``` + +```cpp +{{#include ../../../include/srt/asrc.hpp:asrc_resync}} +``` + +Filling exists because the resampler cannot produce its first output until +a full window of `taps()` history frames exists, and the servo cannot +regulate an occupancy that is still climbing toward its setpoint. So the +converter emits silence until the backlog reaches `setpoint + taps`, primes +the resampler's window in one gulp, seeds the servo's smoothers at the +observed occupancy (so the loop starts from truth rather than slewing from +zero), and begins converting — with a fade-in, discussed below. + +The two exceptional transitions are the under/overrun policy, and their +asymmetry rewards attention. **Underrun** (the consumer outran the data): +pad the rest of the block with silence, count it, return to Filling — but +call `servo_.reset(true)`, the flavor that *keeps the integrator*. The ppm +estimate is the accumulated knowledge of where the other crystal sits; a +dropout interrupts the audio, not the physics, so the estimate survives and +re-lock after a dropout takes a fraction of the original acquisition time. +**Overrun pressure** (the consumer stalled long enough for occupancy to +pass the high watermark): discard down to the setpoint in one cut, count a +resync, and re-seed the smoothers — because after a deliberate +discontinuity in the observable, letting the loop "discover" the jump would +inject exactly the transient the seed avoids. One subtlety in the resync +was wrong for months: the discard must be clamped to what the *ring* +actually holds, because the occupancy figure includes frames already staged +inside the resampler's pop scratch, which no ring discard can reach. 
With a +setpoint smaller than that staging buffer, the unclamped subtraction +drained the ring to zero and the converter fell into a refill-underrun +cascade. An audit found it; a regression test now pins it. + +The fade-in deserves its sentence of honesty, which the header also +carries: after every (re)fill the first 64 frames ramp linearly from +silence, so *recovery* never clicks — but the dropout's onset, and a +resync's splice, are unfaded cuts, because at the moment they happen there +is nothing valid to fade toward. A design can only be honest about which +discontinuities it removes. + +## The bug that composition hid + +Now the centerpiece, and the reason this chapter exists in its current +form. + +Every component below this file was correct. The ring transferred bytes +exactly; the servo regulated occupancy to its setpoint with textbook +dynamics; the resampler synthesized precisely the frames asked of it. And +for months, a converter built from these correct parts, at default +configuration, was **silently broken for the most common audio callback +size in the world.** + +The mechanism is embarrassingly simple once stated. A `pull(N)` must +synthesize N frames from data *already in the backlog* — in a real +deployment, no pushes land during the microseconds a pull executes. The +servo, meanwhile, faithfully regulates the backlog toward +`targetLatencyFrames`, which defaults to 48. If N is greater than 48, the +servo's goal and the consumer's need are in direct contradiction: the loop +steers occupancy *down* toward a level from which the next pull cannot be +served. Occupancy drains at the rate clamp, hits the floor, underruns, +refills, fades in — and repeats, forever. Measured at default +configuration: a 64-frame callback drops out every ~0.24 seconds +indefinitely, never reaching Locked, with the reported ppm pegged at a +false +1500 (the clamp, mistaken for the answer). A 240-frame callback +produced 80% silence. 
+ +![Measured FIFO occupancy for pull(64) at default configuration, before +and after the feasibility fix](../img/feasibility.svg) + +*Both panels are measurements, not models: `scripts/book_figures.py` +compiles the same trace dumper against the include/ tree of the last +pre-fix commit (via `git archive`) and against HEAD, and runs the +identical scenario. Before: drain, underrun, refill — four dropouts a +second, forever. After: one adaptive raise on the first pull, then the +servo regulates the effective setpoint and the underrun count stays at +zero.* + +Why didn't anything catch it? Because every artifact that exercised the +converter had, innocently, been configured just clear of the cliff. The +quality tests pull one frame at a time — the metrologically correct choice +for their purpose. The benchmarks set the setpoint to twice the block size — +the performance-measurement-correct choice. The lock tests used 32-frame +blocks against the 48-frame default — feasible. Correct component tests, +correct measurement configurations, months of green CI, and a defaults +matrix with a hole exactly where real applications live. The lesson +generalizes and is worth stating as a rule: **a test suite validates the +configurations it contains, and silence about a configuration is not +evidence about it.** It took an adversarial audit — one explicitly tasked +with constructing failure scenarios rather than confirming passing ones — +to demonstrate it. + +The fix is the first thing `pull()` now does: + +```cpp +{{#include ../../../include/srt/asrc.hpp:asrc_feasibility}} +``` + +The design choices inside those lines carry the interesting reasoning: + +- **Adapt rather than reject.** The constructor cannot validate this — + the pull size isn't known until the first pull. Throwing from `pull()` + is forbidden by the noexcept contract, and returning an error the caller + must check is how the original silent failure happened, one layer up. 
+ So the converter raises its *effective* setpoint to what the observed + block requires and reports the raise through + `Status::effectiveTargetLatencyFrames`. Latency follows the raised + setpoint: the honest price, visibly labeled, instead of a dropout cycle. +- **The margin is a half block.** Feasibility strictly needs + `setpoint ≥ N`; equality grazes, because block-quantized occupancy + sawtooths around the setpoint. The audit's data located the boundary + (pull = setpoint showed occasional underruns; pull comfortably below the + setpoint was clean), and `N/2` covers the sawtooth with room. +- **The raise is bounded by capacity**, computed once in the constructor — + a setpoint the FIFO cannot sustain would just move the failure. The + auto-sized FIFO's floor was raised to 1024 frames (21 ms of stereo float + costs 8 KB — memory is the cheap resource here) so that callbacks up to + roughly 340 frames work with zero configuration; beyond that, the + documentation now says plainly: size `fifoFrames` yourself. +- **Feasible configurations are untouched.** The 32-frame-against-48 + default keeps its exact behavior — verified not just by tests but by the + instruction-count ratchet: every scenario on every embedded target + measured within ±0.07% across the change, which is construction-cost + noise. The adaptation is invisible until the moment it is needed. + +The audit's failing scenarios became the regression suite +(`Feasibility.Pull64LocksCleanly` and siblings), so the bug's exact shape +is now permanently load-bearing. + +## Validation: what the constructor refuses to build + +The same audit rewrote `validated()`, and the before/after is a compact +study in what config validation is *for*. The original checked three +fields for zero. 
The current version rejects, with reasons recorded in a +comment: NaN or infinity anywhere in the numeric config (a NaN sample rate +previously flowed into the filter designer and constructed a converter +that emitted NaN audio — construction succeeding is worse than throwing +when what it constructs is poison); band-edge sums above the sample rate +(an anti-image filter whose cutoff exceeds input Nyquist passes images +wholesale — numerically fine, acoustically wrong); a deviation clamp large +enough that the Q0.64 conversion in the resampler would overflow an +`int64` (undefined behavior guarded at the only gate that sees the value +early enough); and size products that would wrap 32-bit `size_t` on the +embedded targets before `bad_alloc` could save anyone. The principle: +**validate at the boundary where throwing is allowed, against the +invariants of every component downstream** — the resampler can't defend +itself against a config it never sees whole. + +One postscript from the portability chapter belongs here too: on one +supported toolchain (Hexagon's static-musl configuration), C++ exceptions +cannot unwind at all, so even this careful `throw` terminates the process +there. Validation still protects — a loud death beats NaN audio — but +callers on that target are documented to validate before constructing. +Contracts end where toolchains do. + +## Telemetry that cannot lie about being lock-free + +`status()` may be called from any thread, which makes it the one place a +third agent touches the object. Every field crosses via a relaxed atomic, +single-writer, individually coherent but deliberately not mutually so — a +snapshot for humans and supervisory logic, not a synchronization +primitive. 
The type choices encode a portability fact worth remembering: +the counters are 32-bit atomics because on the 32-bit targets a 64-bit +`std::atomic` falls back to lock-based emulation, and a converter whose +*telemetry* takes a lock has quietly broken the lock-free promise its hot +path makes. The counters wrap at 2³²; the doc comment says so and says +what to do about it. Range was traded for the contract, and the trade +is written down. + +## The underrun tail, end to end + +```cpp +{{#include ../../../include/srt/asrc.hpp:asrc_underrun}} +``` + +Read this excerpt slowly and you can see the whole chapter in ten lines: +the resampler asked to do exactly one job; the fade applied only when +there is something real to fade; the silence pad honoring `pull()`'s +always-fills guarantee; the integrator-preserving reset encoding what a +dropout does and does not destroy; the telemetry publish last, so +observers see states, not mid-transition fictions. + +## Verify it yourself + +```sh +# The composed state machine, end to end: +ctest --test-dir build -R 'AsrcLock' --output-on-failure + +# The feasibility bug's exact former shape, now a regression gate: +ctest --test-dir build -R 'Feasibility' --output-on-failure + +# What the constructor refuses to build (NaN, image-passing bands, +# UB-range ppm, undersized FIFOs): +ctest --test-dir build -R 'ConfigValidation' --output-on-failure + +# Resync clamping, consumer reset, fade behavior, degenerate calls: +ctest --test-dir build -R 'Resync|Reset|Fade|EdgeCalls' --output-on-failure +``` + +And one experiment worth running because it *shows you the bug*: check out +any commit before the feasibility fix, build the lock test with +`chunkOut = 64`, and watch a fully green library drop audio four times a +second. Correct parts. Broken whole. That gap is what this file is for. 
diff --git a/book/src/part1/fractional-resampler.md b/book/src/part1/fractional-resampler.md new file mode 100644 index 0000000..a8bf3ce --- /dev/null +++ b/book/src/part1/fractional-resampler.md @@ -0,0 +1,460 @@ +# The fractional resampler + +> God made the integers; all else is the work of man. +> +> — Leopold Kronecker + +The servo chapter ended with a number: ε̂, the rate-deviation estimate, +delivered once per output block. This chapter spends it. + +Somebody has to turn "consume 1.000 000 2 input frames per output frame" +into actual audio, forever, without drift, without glitches at the moments +the books balance, and within a per-sample cycle budget that must hold on +a Xeon and on a DSP with no double-precision FPU. That somebody is +`FractionalResampler`, the streaming engine at the bottom of +`polyphase_filter.hpp`. It owns three things: the **history** (the last T +input frames of every channel, kept where the filter can reach them), the +**phase** (where between two input samples the next output lands), and the +**slip logic** (what happens when the phase creeps across a whole-sample +boundary). + +The near-unity specialization shapes everything here. A general-ratio +resampler schedules different numbers of outputs per input and needs +control flow to match. At ±1000 ppm, the conversion degenerates into a +*creeping fractional delay*: one output per input, plus a fractional +position μ that drifts by parts per million per sample and occasionally — +every few thousand samples — crosses a boundary and forces the window to +slip by one frame. The steady state is metronomic; all the difficulty +concentrates into keeping μ exact over unbounded time and making the +slips invisible. Those two problems are this chapter. + +## The job, one output sample at a time + +The polyphase bank chapter built the table: L + 1 rows of T coefficients, +row p holding the FIR that interpolates a signal value p/L of the way +between two input samples. 
`interpolate()` evaluates one output at +fractional position μ ∈ [0, 1): + +1. Scale: `pos = μ · L`. The integer part picks the phase row p; the + fractional part `fr` says how far μ sits between row p and row p+1. +2. Blend: form `c[t] = c0[t] + fr · (c1[t] − c0[t])` across the T taps — + linear interpolation between adjacent rows, the trick that makes a + 256-row table act like a continuum (the residual falls ~12 dB per + doubling of L). +3. Dot: multiply the blended row against the oldest-first history window + of the newest T input samples and accumulate — in double for float + samples, int64 for fixed point. + +μ = 0 lands the output exactly on history sample T/2 − 1; μ → 1 +approaches sample T/2. And the μ wrap 1.0 → 0.0 — the whole-sample slip — +is exactly where the bank's extra row L pays off: row L equals row 0 +advanced by one input sample, so "μ reaches 1.0 on this window" and +"μ = 0.0 on the window shifted one frame" are *the same filter*, +bit-identically, with no branch. The slip machinery below leans on that +continuity; `Polyphase.MuWrapIsContinuousWithWindowShift` pins it. + +That is the whole kernel: blend, then dot. Roughly T multiply-adds of +blending plus T of dot product per output sample, and everything else in +this chapter is about doing it cheaper, more exactly, and for more +channels — without ever changing an output bit unintentionally. + +## Sharing the blend: the C1 split + +The first optimization campaign result (Part III tells the full story; +`docs/PERFORMANCE.md` is the canonical record) started from an +observation you can make by reading the loop above: in a multichannel +converter, every channel of a frame is evaluated at the *same* μ. Calling +the fused `interpolate()` per channel recomputes an identical T-tap +coefficient blend N times per frame — for stereo, half of the blending +work is duplicate. 
+ +The fix is to split the kernel at its natural seam: blend once per frame +into a scratch row, then run a plain dot product per channel. + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_dot_row}} +``` + +Two things about this function beyond its arithmetic. First, the comment +at the top is a *bit-exactness contract*: given the same μ, blend-then-mac +per tap in the same order is literally the same sequence of floating-point +(or integer) operations as the fused form, so the split changes no output +bit — and the C1 entry in `docs/PERFORMANCE.md` records "outputs unchanged +bit-for-bit" as a checked result, not a hope. This library treats +bit-exactness as the boundary between an optimization (free to ship) and +an algorithm change (needs its own quality evidence); you will see the +same distinction drawn twice more in this chapter. Second, the +`SRT_RESTRICT` qualifiers are C2's contribution: without them the +compiler versioned these loops behind runtime aliasing checks (verified +with `-fopt-info-vec`, not assumed). + +The measured C1 result: **stereo pipeline −36% wall-clock on x86, +8-channel −52%**, and −15/−30/−21% instructions (float/Q15/Q31) on the +Cortex-M55 — with the mono kernels count-identical as the control, since +mono keeps the fused path. One target barely moved, though: Hexagon +improved only −3.6/−3.3/−0.2%. Profiling explained why, and the +explanation became the next hypothesis: Hexagon's pipelines were not +dominated by blends or dots at all, but by **per-sample soft-double phase +math**. Which brings us to the centerpiece. + +## The phase accumulator: Q0.64 + +Here is the failure that motivates the design. The obvious phase state is +a `double mu`, updated per output sample as `mu += 1 + eps` with the +integer part peeled off into window advances. On a Xeon that costs a few +cheap FPU ops. 
On Hexagon — a 32-bit audio DSP with **no double-precision +FPU** — every one of those operations is a soft-float library call, per +sample, on the hottest path in the library. C1's flat Hexagon numbers +were this cost dominating everything else. (Honest correction from the +record, because the project's documentation initially got it wrong: the +Cortex-M55 was *assumed* to share this problem, but its scalar FPU does +support FP64 — only its MVE vector unit is fp16/fp32 — so M55 float was +never soft-double-bound. The measurement that exposed the doc error is +Part III material; the resampler design below is motivated by Hexagon and +its HiFi-class cousins, where the problem is real.) + +The C3 redesign eliminates the per-sample double entirely by changing +what the phase *is*: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_class_doc}} +``` + +The fractional position lives in `phase_`, an unsigned 64-bit integer +read as a pure binary fraction — **Q0.64**: the value μ = `phase_` / 2⁶⁴, +so the representable range is exactly [0, 1) and the resolution is 2⁻⁶⁴ +of a sample. The key move is what it accumulates: **only ε**, the +deviation. The "1" in "advance 1 + ε input frames per output frame" is +handled by the integer machinery — consume one input frame per output +frame — and never touches the fraction. Near-unity specialization again: +because the nominal ratio is exactly 1, the fraction only has to carry +the few-hundred-ppm creep, and 64 bits of headroom below the binary point +carry it essentially forever. 
+ +Per `process()` call — once per block, not per sample — the servo's +double ε̂ is converted to fixed point: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_slip}} +``` + +Walk the slip logic carefully; it is the subtlest six lines in the +datapath, and the trick is that **wraparound of the unsigned add is the +slip detector**, for both signs of ε, with no comparisons against 1.0 or +0.0 anywhere: + +- **ε ≥ 0** (input clock fast; the window must occasionally hurry). The + fraction creeps upward by `epsU` each sample. When the true position + would cross 1.0, the 64-bit add wraps: `m = phase_ + epsU` comes out + *smaller* than `phase_`, which is otherwise impossible for a positive + increment. That wrap **is** the forward slip: consume one *extra* input + frame (`advance = 2` — the regular frame plus the slipped one), and the + wrapped `m` is already the correct new fraction, because mod-2⁶⁴ + arithmetic subtracted exactly the 1.0 that the extra frame consumed. +- **ε < 0** (input clock slow; the window must occasionally wait). + `epsU` is the two's-complement reinterpretation of a negative `epsFix` + — a huge unsigned number — so the same add normally wraps every + sample, and *not* wrapping is the anomaly: `m > phase_` means the + fraction dipped below 0.0. That is the backward slip: consume **no** + input frame this output (`advance = 0`, reuse the current window), and + again the modular result is already the correct fraction just below + 1.0. +- Otherwise `advance = 1`: the metronomic case. + +![The Q0.64 phase accumulator slipping by wraparound, for both signs of +epsilon](../img/q064-slip.svg) + +*The slip logic run with the real mod-2⁶⁴ arithmetic, ε exaggerated to +0.09 so the wraps are visible (at the real |ε| ≈ 2×10⁻⁴ a slip fires once +every few thousand frames). Left: the fraction creeps upward until the add +wraps past 1.0 — consume one extra frame. 
Right: with ε negative the add +wraps on *every* ordinary frame, and the anomaly is the one that doesn't — +reuse the window. From `scripts/book_figures.py`.* + +At +500 ppm a forward slip fires every 2 000 output samples, and thanks +to the bank's extra row the filter evaluated after `advance = 2` at small +μ is the exact continuation of the filter before it at μ ≈ 1. +`AsrcLock.WholeSampleSlipsAreGlitchFree` runs 500 ppm for seconds and +bounds the output's *second difference* by the analytic bound A·ω² of a +clean sine — a discontinuity detector that would trip on any window +mis-step at any slip. + +Note also what happens between the `appendOne` calls and `phase_ = m`: +if the source runs dry midway through an `advance = 2` slip, the function +returns with the history advanced by one frame but the phase *not* +updated. History and phase are now one frame apart — a state the class +cannot repair locally. That is not a bug; it is a documented precondition +(the contract section below), and the converter's dropout path always +resets and re-primes before processing again. + +Downstream, the phase bits feed the kernel directly: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_blend_row_phase}} +``` + +The top log₂ L bits *are* the phase-row index; the bits below, shifted +up, *are* the intra-phase blend fraction. No multiply by L, no floor, no +subtract — the Q0.64 representation makes the split between "which row" +and "how far between rows" a matter of bit fields. One conversion to the +datapath's blend-factor type per output frame (`blendFactorFromQ64`: +single-precision for float, integer for Q15/Q31) is all that remains of +the floating-point phase math. The fused mono form is the same bit +surgery around the same blend-and-mac loop: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_interpolate_phase}} +``` + +**Is 2⁻⁶⁴ enough?** Part 0 derived the timing-jitter budget for 120 dB +transparency at 20 kHz: about 8 picoseconds. 
One sample at 48 kHz is +20.8 µs; 2⁻⁶⁴ of that is ~10⁻²⁴ seconds — twelve orders of magnitude +inside the budget. The double-μ design's 2⁻⁵² was also far inside it, so +resolution was never the emergency; the deeper numerical win is +*exactness over time*. An integer accumulator adds ε with **zero +rounding error per step**, forever — the only quantization is the +once-per-block conversion of ε̂, a rate error below 10⁻¹⁹ that the servo +absorbs like any other infinitesimal drift. A double μ, by contrast, +rounds on every `+=` and carries the fraction with absolute precision +limited by its integer part's magnitude. Measured, from the C3 entry: +quality *improved* to **135.0 dB at 997 Hz** when the integer phase +landed. An optimization PR whose quality guardrail moved the right +direction — the A/B discipline (benchmarks for speed, pinned SNR +thresholds for correctness) catching a pleasant surprise instead of a +regression. + +And the cost side, from the same entry: Hexagon pipelines **−10.3% (Q15) +and −15.5% (Q31)**, with float −2.6% — the soft-double phase math C1 +identified was simply gone, and the Hexagon *kernels* stayed +count-identical as the control. M55: Q15 −5.3%, Q31 −4.6%, float +1.4% — +a genuine, accepted regression on one scenario, because the M55's scalar +FP64 hardware made doubles cheap and the integer phase traded them for +int64 ops; the cross-target win justified it, and the ratchet baseline +records the trade explicitly. x86 same-minute A/B: float −5.4%, Q15 +−12.0%. 
+ +## Dispatching the datapath + +With phase in hand, each output frame takes one of three routes: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_dispatch}} +``` + +Mono takes the fused `interpolatePhase` — no scratch-row traffic for a +single channel (with one exception: Q15 on SMLALD-capable Cortex-M cores +routes mono through blend + dot too, because the dual-MAC loop lives in +`dotRow`; the two paths are bit-exact by construction, which is what +makes that rerouting a non-event). Low channel counts blend once into +`row_` and dot per channel over planar histories — the C1 shape. High +channel counts on hosts take the frame-major branch, which is the next +section but one. Note the branch condition `kChannelParallel && +frameMajor_`: the first operand is `constexpr`, so on embedded targets +the entire branch constant-folds away. That is not tidiness — a runtime +flag in this loop measured **+6–8%** on the M55 instruction ratchet +before the compile-time gate restored every embedded scenario to exactly +0.00%. The ratchet is why the lesson is a number and not an anecdote. + +## Feeding the window: history management + +The filter needs the newest T frames of every channel, contiguous, +oldest-first, per channel. Input arrives interleaved, in whatever chunks +the FIFO happens to hold. Between those two facts sits `appendOne`: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_append}} +``` + +Three mechanisms, each with an RT-safety argument: + +**Chunked staging.** Frames are pulled from the caller-supplied `popFn` +in bulk (the converter passes 16-frame chunks) into the interleaved +`scratch_` buffer, then peeled off one frame at a time as the window +advances. 
Bulk pops amortize the ring's index synchronization across +many frames — the cached-index design from two chapters ago does its +best work when you ask it for blocks — while the resampler still +consumes with single-frame granularity, because slips need exactly-one +extra frame on demand. Frames staged in scratch have left the ring but +not yet entered the filter, which is why `bufferedFrames()` exists: the +servo's occupancy observable must count them or the estimate would carry +a chunk-sized bias. + +**Bounded compaction.** Histories are not ring buffers; they are flat +arrays with a moving end index, sized `taps + chunkFrames`. When the end +hits capacity, `memmove` slides the newest T − 1 frames back to the +front and synthesis continues. Why copy at all, when a circular buffer +would avoid it? Because the *filter* needs a contiguous window every +sample: a ring would either split the dot product at the wrap seam +(a branch and a second loop in the hottest code in the library) or copy +into a linear scratch every frame — a memmove per *sample* instead of +one per *chunk*. The flat layout pays T − 1 frames of copy once per +`chunkFrames` appends: bounded, branch-predictable, allocation-free — +worst-case cost is fixed at construction time, which is the entire +definition of RT-safe this library uses. `process()` is `noexcept`, no +locks, no allocation; every buffer was sized in the constructor, which +is allowed to throw precisely because it runs at setup time. + +**Two storage shapes.** The member block records the fork: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_members}} +``` + +Planar — one delay line per channel — below the channel-parallel +threshold: each channel's dot product walks its own contiguous line, and +the deinterleave happens once per frame at append time (a scalar loop +over channels). 
Frame-major — a single interleaved line — at or above +it: appends become one contiguous `memcpy` per frame and the compaction +one `memmove` per line-fill, but the real reason for the layout is the +kernel it enables. + +## The channel axis: C6, briefly + +For high channel counts the per-frame cost is dominated by N dot +products, and the float dot product has a vectorization problem you can +now state precisely: its accumulation order is contractual (strict +per-channel double accumulation — reassociating it changes output bits), +so the *tap axis* may not be vectorized without breaking bit-exactness. +The C2 audit verified GCC obeys: float `dotRow` compiles scalar, by +design. + +But nobody said anything about the *channel* axis. Channels are +independent accumulators; computing eight of them in lockstep, one tap +at a time, keeps every channel's tap order identical to `dotRow`'s while +filling SIMD lanes with channels instead of taps. That requires the +history to deliver all channels of tap t contiguously — the frame-major +layout — and a register-blocked kernel: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_dot_rows_frame_major}} +``` + +The measured C6 results, condensed (the full campaign, including the +callgrind profile that justified targeting the dots and the negative +results that bounded the design, is Part III's last chapter): **float +8/12/16-channel pipelines −38/−38/−42% wall-clock with AVX2+FMA**, only +−4–5% on baseline SSE2 — the gain scales with SIMD width, as it must if +the mechanism is what we claim. Bit-exact versus planar, hash-verified +over 30 000 blocks × 4 configs. The gate is deliberately narrow, each +edge measured rather than assumed: + +- **Float-only**: fixed-point channel-parallel measured ~1.5× *slower* + than planar — integer accumulation is exactly reassociable, so the + planar Q15/Q31 dots already auto-vectorize over taps, and the tap axis + beats the channel axis when both are available. 
+- **Channels ≥ 4** (`SRT_CP_MIN_CHANNELS`, overridable for A/B runs): + below that, lane utilization loses to the planar path's simplicity. +- **Hosts only**: the embedded targets keep their proven codegen (Helium + on M55, SMLALD on M33-class, Hexagon's measured scalar floor); the + compile-time macro gate keeps their binaries byte-for-byte ignorant of + the mode. + +And one lesson worth carrying out of context: the first channel-parallel +attempt — accumulators in a plain array the compiler kept in memory — +measured **2.8× slower than planar**. Register-block or don't bother; +`dotTileFrameMajor`'s `constexpr`-size tiles of 8/4/2/1 are that lesson +in code form. + +## The contract: prime, process, and the one-frame lie + +`FractionalResampler` is deliberately not foolproof; it is *fast*, and +its safety is a documented protocol that the converter — its only +in-tree caller — upholds. The documentation is the code's own: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_process_doc}} +``` + +**Prime before process.** `prime()` fills the window with T real frames +(or reports dry and stays unprimed). Call `process()` unprimed and +`window()`'s pointer arithmetic `end_ − taps()` underflows a `size_t` — +the converter guarantees priming by construction, since it only leaves +its Filling state once the backlog exceeds setpoint + taps. + +**Reset after any dry return.** You now know exactly why from the slip +walk-through: a `process()` that runs dry on the *second* append of an +`advance = 2` forward slip has already advanced the history when it +returns, but never executed `phase_ = m`. History says one frame passed; +phase says none did. Every output synthesized after resuming would be +computed one frame late relative to its nominal position — not a crash, +a *silent sub-window skew*. 
The class cannot un-append (the frame is +deinterleaved into the histories) and does not try to special-case it; +it defines the recovery protocol instead: `reset()` clears phase, +history, and staged scratch (stale across a discontinuity anyway), then +re-prime. The converter's underrun path does exactly this, with the +servo keeping its ppm estimate and a fade-in masking the splice. + +Finally, the small read-side API that closes the control loop: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:rs_mu}} +``` + +`mu()` converts the phase to double **once per pull, not per sample** — +the block-rate boundary where doubles are cheap even on Hexagon, the +same boundary the ε̂ conversion crosses in the other direction. The +servo adds it to the frame count so the observable `occ + mu` moves +*continuously* through slips: at the instant a forward slip fires, the +count drops by one exactly as μ wraps from ~1 to ~0, and the sum crosses +smoothly. Without μ in the observable, every slip would inject a +one-frame staircase into the servo's error at the beat frequency — +manufacturing the very sawtooth the previous chapter spent three filter +poles suppressing. `bufferedFrames()` completes the accounting for the +staged scratch. Two accessors, and the sensor the whole control system +reads is honest to sub-sample resolution. 
+ +## Why this file looks the way it does + +| Decision | Alternative rejected | Reason | +|---|---|---| +| Q0.64 integer phase, ε-only | `double mu += 1 + eps` per sample | soft-double per sample dominated Hexagon pipelines (C1 finding); integer add is exact forever; measured −10/−15% Hexagon, quality up to 135.0 dB | +| Slips by unsigned wraparound | compare/floor against 1.0 and 0.0 | the mod-2⁶⁴ result *is* the corrected fraction; both slip directions fall out of one add | +| Blend once per frame + per-channel dot | fused interpolate per channel | N×(blend+dot) → blend + N×dot; bit-exact by identical per-tap order; stereo −36% wall-clock (C1) | +| Flat history + bounded memmove compaction | circular history | the dot needs a contiguous window every sample; one bounded copy per chunk beats a seam branch per sample | +| Chunked popFn staging | pop one frame at a time | amortizes ring synchronization; staged frames stay visible to the servo via `bufferedFrames()` | +| Frame-major + channel-parallel dots (float, ≥4ch, hosts) | vectorize the float tap axis | tap-axis SIMD changes accumulation order = output bits; the channel axis is free and bit-exact (−38…−42% at 8–16ch) | +| Compile-time mode gate | runtime `if (frameMajor_)` alone | a hot-loop runtime flag cost +6–8% M55 instructions; `constexpr` restored embedded codegen to 0.00% | +| Documented preconditions + `reset()` | internal auto-repair of dry slips | the failure needs a reprime anyway (stale window); a repair path would be untestable dead weight on the hot path | + +## Verify it yourself + +```sh +# Quality with the Q0.64 phase in the loop — the pinned thresholds +# include the 135 dB figure C3 improved: +ctest --test-dir build -R 'AsrcQuality\.' --output-on-failure + +# Slip continuity: the second-difference bound at +500 ppm (a slip +# every 2000 samples), plus lock/drift behavior: +ctest --test-dir build -R 'AsrcLock\.' 
--output-on-failure + +# The mu-wrap/extra-row continuity the slips depend on: +ctest --test-dir build -R 'Polyphase\.' --output-on-failure + +# Channel independence at 12/16 channels — on a host float build this +# exercises the frame-major channel-parallel path: +ctest --test-dir build -R 'MultiChannel' --output-on-failure + +# A/B the channel axis yourself: benchmark, then rebuild with the +# threshold pushed out of reach and benchmark again (use -march=native +# to see the AVX2 headline; SSE2 shows a few percent): +cmake -B build-bench -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_BENCHMARKS=ON \ + -DCMAKE_CXX_FLAGS="-march=native" +cmake --build build-bench -j && \ + ./build-bench/bench/srt_bench --benchmark_filter='Pipeline_Float.*(8|12|16)ch' +cmake -B build-planar -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_BENCHMARKS=ON \ + -DCMAKE_CXX_FLAGS="-march=native -DSRT_CP_MIN_CHANNELS=999" +cmake --build build-planar -j && \ + ./build-planar/bench/srt_bench --benchmark_filter='Pipeline_Float.*(8|12|16)ch' + +# Break it on purpose: change `advance = 2` to `advance = 1` in the +# forward-wrap branch of process(), rebuild, and watch +# AsrcLock.WholeSampleSlipsAreGlitchFree fail its second-difference +# bound — every slip becomes an audible one-frame stutter. +``` + +The last experiment is worth actually running once. The slip logic is +six quiet lines that look like integer bookkeeping; breaking them turns +a 135 dB converter into a machine that clicks every forty-two +milliseconds. That gap — between how little the code looks like it is +doing and how much the measurements say it is — is the fractional +resampler in one sentence. diff --git a/book/src/part1/kaiser.md b/book/src/part1/kaiser.md new file mode 100644 index 0000000..be64433 --- /dev/null +++ b/book/src/part1/kaiser.md @@ -0,0 +1,408 @@ +# Designing the filter: `kaiser.hpp` + +> The purpose of computing is insight, not numbers. 
+> +> — Richard Hamming + +This is the only file in the library that runs exactly once per converter, +and it decides the quality ceiling of everything downstream. Every output +sample the converter will ever produce is a dot product against +coefficients this file computes in a few milliseconds at construction. If +the design here reaches 120 dB of image rejection, no later cleverness is +needed to preserve it — the hot path is exact integer or double +accumulation all the way out. If the design here falls short, no later +cleverness can recover it. So before touching the code, this chapter builds +the minimum filter-design theory a C++ reader actually needs — which is +less than a DSP course and different from one — and then spends its pages +where the textbooks stop: on the iteration cap, the clamp, the +normalization constant, and the compile-time-versus-runtime decision that +the textbooks never had to make. + +## The problem: evaluate a signal between its samples + +The converter's core operation (next chapter) is a *fractional delay*: given +the last `T` input samples of a signal, produce its value at a position μ +that falls between two of them. Sampling theory says this is not guesswork. +A signal sampled at rate `fs` with no content above `fs/2` is *completely +determined* between its samples; the reconstruction is + +```text +x(t) = Σₙ x[n] · sinc(t − n), sinc(u) = sin(πu) / (πu) +``` + +— every sample contributes a sinc centered on itself, and the sum +interpolates exactly. The `sinc` in this file is that function, with the +one hazard a numeric programmer would expect handled explicitly: + +```cpp +{{#include ../../../include/srt/detail/kaiser.hpp:kai_sinc}} +``` + +(The 0/0 at x = 0 is a *removable* singularity — the limit is 1 — but IEEE +arithmetic doesn't take limits, so the code must.) + +The catch is the `Σₙ`: it runs over **all** samples, and sinc decays like +1/t, which is uselessly slow. 
Truncating the sum to a window of `T` samples +around the evaluation point is mandatory. How you truncate is the entire +design problem. + +## Why plain truncation rings + +Chopping the sinc off after T samples is the same thing as multiplying the +ideal infinite filter by a rectangular window. Multiplication in time is +convolution in frequency, so the ideal filter's perfectly sharp frequency +response gets smeared by the rectangle's spectrum — and the rectangle's +spectrum is awful: its sidelobes start at −13 dB and buy you a stopband of +only about −21 dB. Worse, this is a *shape* problem, not a *size* problem. +Doubling T squeezes the smearing into a narrower band (the transition +sharpens) but the first sidelobe stays at the same level — the Gibbs +phenomenon. A truncated sinc leaks images at −21 dB whether it has 12 taps +or 12,000, and −21 dB is roughly the error of cheap linear interpolation. +For a 120 dB budget, truncation alone is off by five orders of magnitude. + +The fix is to taper instead of chop: multiply the sinc by a window that +falls smoothly to zero at the edges. Every smooth window trades the same +two currencies — a wider main lobe (slower transition, so more taps for the +same band edges) buys lower sidelobes (deeper stopband). The question is +only which window spends the taps most efficiently. + +## The Kaiser window, and what to cite + +James Kaiser's answer (Kaiser 1974; the survey that made it standard +practice is harris 1978) is the *I₀–sinh* window, + +```text +w[i] = I₀(β · √(1 − u²)) / I₀(β), u ∈ [−1, 1] across the window, +``` + +where I₀ is the zeroth-order modified Bessel function. It is a closed-form +approximation to the *prolate spheroidal* window — the provably optimal +concentration of energy in the main lobe — that costs one special function +instead of an eigenvalue problem. 
Its virtue for engineering is the single +knob: **β alone sets the sidelobe level**, continuously, from rectangular +(β = 0) to arbitrarily deep, and Kaiser published empirical formulas mapping +a stopband spec in dB directly to β and to the filter length. No iteration, +no optimization run, no table lookup: attenuation in, coefficients out. + +That is the theory, and this book will not re-derive it — Kaiser's paper +and harris's survey do it properly. What they do *not* tell you is how to +evaluate I₀ in a `noexcept` header without a math library that provides it, +what happens to the length formula when a caller hands it garbage, why the +normalization constant is `L` and not 1, or whether any of this should run +at compile time. That is the rest of this chapter. + +![The Kaiser window for the three presets' beta values: higher attenuation +targets produce more strongly tapered windows](../img/kaiser-window.svg) + +*The knob in action: the presets' attenuation targets (96/120/140 dB) map +through `kaiserBeta` to β = 9.6/12.3/14.5, and higher β buys its deeper +stopband by tapering the window harder — which widens the main lobe, which +is why `estimateTaps` charges more taps for the same transition width. +Generated by `scripts/book_figures.py` from the same formulas.* + +## `besselI0`: a power series with an escape hatch + +`<cmath>` has no I₀ (`std::cyl_bessel_i` exists in the special-functions +annex, but it is optional, absent from libc++, and this library targets +toolchains as odd as hexagon-musl). So the file computes it from the power +series + +```text +I₀(x) = Σₖ [ (x/2)ᵏ / k! ]² +``` + +which converges for every finite x: + +```cpp +{{#include ../../../include/srt/detail/kaiser.hpp:kai_besseli0}} +``` + +Three details carry all the engineering. + +**The recurrence.** Each term is the previous term times `(x/2k)²` — no +factorials, no powers, no overflow staging. Term k relates to term k−1 by +exactly the ratio `r²`, computed in two multiplies. 
For the β values this +library ever produces (about 12.3 for the 120 dB preset, 14.5 for 140 dB) +the terms grow until k ≈ x/2 ≈ 6 and then collapse factorially; a few dozen +terms reach full double precision, matching the "~50-term" budget the +file's header comment charges against constexpr evaluation. + +**The stopping criterion.** `term < 1e-21 * sum` stops when the next term +can no longer perturb the sum's 16 significant digits — a *relative* test, +so it is correct whether I₀ is 1.0001 or 10⁴ (it is about 19,000 at β = 12). +The margin below double epsilon (≈ 2.2·10⁻¹⁶) costs a handful of extra +iterations and removes any temptation to reason about rounding at the +boundary. + +**The iteration cap — the line a textbook would not print.** The loop bound +`k < 1000` looks redundant: the series *always* converges, so the relative +test *always* fires eventually. For every real x, yes. Feed the function a +NaN — say, from an uninitialized config field three call frames up — and +every comparison involving `term` is false, including the exit test. An +unbounded loop in a `noexcept` function would hang the caller's constructor +forever. With the cap, the worst case is a garbage return value that the +converter-level validation (chapter after next) rejects anyway. The cap is +not about convergence; it is about making *termination* independent of +floating-point semantics. This costs one integer compare per iteration and +turns an unprovable property into a checkable one. + +The unit test pins the function against reference values computed +independently (`besselI0(1.0) = 1.2660658777520084…`), at tolerances that +scale with the magnitude — 10⁻¹² absolute near 1, 10⁻⁶ near 19,000 — i.e. +constant *relative* accuracy, which is what the window formula's ratio +`I₀(β·…)/I₀(β)` actually consumes. 
+ +## `kaiserBeta`: an empirical fit, taken as published + +```cpp +{{#include ../../../include/srt/detail/kaiser.hpp:kai_beta}} +``` + +This is Kaiser's published fit, digit for digit — `0.1102`, `0.5842`, +`0.07886` are his constants, not this library's, and the right response to +magic numbers with a citation is to copy them exactly and test them exactly +(the unit test asserts the formulas symbolically, so a typo in a constant +cannot hide). Two things are worth understanding rather than memorizing: + +- **Why piecewise.** The relationship between β and achieved attenuation is + smooth but not polynomial; Kaiser fit it in two regimes. Above 50 dB the + relationship is essentially linear. Between 21 and 50 dB the fractional + power term takes over. Every preset this library ships (96–140 dB) lives + on the first line; the second exists so that off-spec experiments degrade + gracefully instead of nonsensically. +- **Why zero below 21 dB.** A rectangular window — no taper at all — + already achieves about 21 dB. Asking the fit for less than the free + floor correctly returns "don't taper." + +## `estimateTaps`: the cost formula, with a seatbelt + +β sets the stopband *depth*; the number of taps sets how fast the response +can *fall* into it. Kaiser's length estimate (the form popularized by +harris) says taps scale linearly with attenuation and inversely with +transition width: + +```cpp +{{#include ../../../include/srt/detail/kaiser.hpp:kai_estimate}} +``` + +Note what the signature normalizes to: transition width *as a fraction of +the input rate*, and the return is taps *per polyphase branch*. The full +prototype (next section) has `L·T` taps at an oversampled rate of `L·fs`; +run the classic formula at that rate and both numerator and denominator +pick up the same factor of L, which cancels. Expressing the estimate per +branch keeps the caller's arithmetic in the units the caller actually has — +"8 kHz of transition at 48 kHz" — with no L in sight. 
+ +Plug in the `balanced()` preset: 120 dB across a 20→28 kHz transition at +48 kHz gives `(120 − 8) / (2.285 · 2π · 8000/48000) ≈ 46.8`, so 47 taps; +the unit test (`Kaiser.TapEstimateMatchesHarrisFormula`) brackets exactly +this computation at 45–49, and the shipped preset says `tapsPerPhase = 48` +— the estimate rounded up to an even count (even matters later: the SMLALD +kernel on Cortex-M33-class parts consumes taps in pairs). This function is +how the presets were *chosen*; the bank itself takes `T` from the spec, so +the estimate is a design aid with a unit test rather than a hot dependency. + +Then there is the comment at the top of the body, which earns its own +paragraph because it was not in the first version of this file. The raw +formula misbehaves at both edges of its domain: `attenDb < 8` makes the +numerator negative, and a zero or negative transition width divides to +±infinity. Both would then hit `static_cast<std::size_t>` — and converting +a negative or non-finite `double` to an unsigned integer is **undefined +behavior** in C++, not "some big number." Not implementation-defined: +undefined, the kind UBSan flags and optimizers exploit. An adversarial +audit of the library flagged the cast; the guard was added in response. The +predicate is written `!(transWidthNorm > 0.0)` rather than +`transWidthNorm <= 0.0` deliberately — the negated form is also true for +NaN, so all three pathologies (negative, zero, NaN) funnel into the same +clamp, and the attenuation edge is covered by the `n > 4.0` select on the +other side. The floor of 4 taps is the smallest window the bank will accept. +A design helper this cheap has no business having *any* input that invokes +UB, however silly the input. + +## `designPrototype`: where all of it lands + +```cpp +{{#include ../../../include/srt/detail/kaiser.hpp:kai_prototype}} +``` + +One pass, one output array, but four decisions are packed into these lines. 
+ +**The grid.** The prototype is the windowed sinc sampled `L` times per +input sample — `t = (i − center) / numPhases` is time measured in *input* +samples. This is the oversampled master filter that the next chapter slices +into L branches; length `L·T` means 4,096 doubles for `fast()`, 12,288 for +`balanced()`, 40,960 for `transparent()`. `center` places the peak exactly +mid-array, which makes the filter linear-phase by symmetry — its group +delay is a constant `T/2` input samples, the number the converter's latency +formula quotes. + +**The window argument, defensively.** `u` sweeps [−1, 1] across the array +and feeds `√(1 − u²)`. At the exact endpoints `1 − u²` is zero in real +arithmetic but can round a few ulps *negative* in floating point, and +`std::sqrt` of a negative is NaN — one NaN tap would silently poison every +dot product that ever touches that row. The `std::max(0.0, …)` costs +nothing and closes the hole. (Notice the theme: this file trusts +floating-point identities nowhere — not in `sinc`, not in the series exit, +not here.) + +**What `cutoffNorm` means, and its surprising value.** The cutoff is +normalized so 1.0 sits at the *input* Nyquist, and the caller centers it in +the transition band: `(passbandHz + stopbandHz) / fs`. For the balanced +preset that is (20,000 + 28,000)/48,000 = **exactly 1.0** — the −6 dB point +of this anti-imaging filter sits *at* 24 kHz, with the response still flat +at 20 kHz and 120 dB down by 28 kHz. A reader trained on decimation filters +may flinch: doesn't a cutoff at Nyquist let aliasing through? No — this +filter's job is *interpolation* in a near-unity converter. The images it +must kill are reflections of the input spectrum around `fs`, so content +below 20 kHz images no lower than 28 kHz; the band between 20 and 28 kHz +contains, by construction of the spec, nothing anyone claimed to protect. 
+Splitting the transition symmetrically across Nyquist spends the taps where +they buy audible margin on both sides. This is the first of several places +where "near-unity only" (the library's headline restriction) converts +directly into cheaper mathematics. + +**The normalization: sum = L, not 1.** A textbook lowpass normalizes its +coefficient sum to 1 so DC passes at unity gain. This prototype normalizes +to `L` — because no output sample is ever computed with the whole +prototype. Each output uses one branch of `T` taps: every L-th coefficient. +The L branch sums partition the total, and for a good lowpass they +partition it *evenly* — each branch's DC gain deviates from the mean only +by stopband-sized leakage (a branch sum is, in DFT terms, the prototype's +response sampled at multiples of the input rate: exactly the image +frequencies the stopband suppresses). Normalize the total to L and every +branch lands at 1 ± leakage; feed the converter DC and DC comes out, at any +fractional position. That is not left to inspection: +`Polyphase.DcGainIsUnityAcrossMu` pushes an all-ones window through the +*built* bank at 64 random μ values and requires unity within 10⁻⁴ — a +bound loose enough to admit float coefficient storage and row blending, +tight enough that a normalization bug (off by one branch, off by a factor +of L) fails by orders of magnitude. One subtle consequence lands two +chapters from now: with branch gains pinned near 1, the *peak* coefficient +also sits near 1.0, which is precisely why the fixed-point formats must +spend a headroom bit (Q1.14, Q1.30) on their coefficients. + +## The headline decision: runtime design, not `constexpr` + +Everything above is pure functions of compile-time-lookable values — and +this is C++20, where `constexpr` has teeth. 
The obvious modern move is to +evaluate the whole design at compile time: coefficients in `.rodata` +(attractive on a flash-based microcontroller), zero construction cost, even +`static_assert`s on the response. The file's own header records why that +was rejected, and since the reasoning is a design artifact it is kept where +refactors will trip over it: + +```cpp +{{#include ../../../include/srt/detail/kaiser.hpp:kai_design_note}} +``` + +Present the alternative fairly, because it *almost* works: + +- **The language isn't there yet.** `std::sin`, `std::sqrt`, `std::pow` + are not `constexpr` before C++26 (P1383 fixes this). A C++20 constexpr + design needs hand-rolled constexpr transcendentals — several hundred + lines of the most bug-prone code in numerics, duplicating functions the + runtime already has, in a library whose entire test story leans on + comparing against exactly those runtime functions. +- **The compile-time cost is not a rounding error.** Constexpr evaluation + is interpretation, three to four orders of magnitude slower than native + code. The design touches every one of 12K–41K taps with a `sin`, a + `sqrt`, and a ~50-term Bessel series. What runs in well under 10 ms + native becomes tens of seconds to minutes interpreted — **per translation + unit**, because a header-only library re-instantiates in every TU that + includes it. A user with twenty includes pays twenty times, on every + rebuild, forever. +- **The inputs are not actually compile-time.** The band edges are scaled + by the *runtime* sample rate (`FilterSpec::scaledTo`, + `Config::forSampleRate`) — a converter constructed for a rate read from + an ALSA descriptor at startup cannot have baked coefficients at all. A + constexpr path would be a second, divergent code path serving only the + subset of users with fully static configs. 
+ +Against all that, the runtime cost being amortized is: one design, under +10 ms, in a constructor documented as setup-time-only, off the audio path +by the library's own RT rules. The trade is lopsided once written down — +but only once written down, which is why the file writes it down. (If +C++26 constexpr math plus a measured compile-time budget someday flips the +trade for static configs, the pure functions here are already shaped for +it: no state, no allocation, `std::span` in, coefficients out.) + +## The test evidence: the spec, measured by DFT + +A filter design module invites a lazy test — "coefficients equal last +week's coefficients." That freezes bugs in amber. What the library pins +instead is the *specification*: `tests/test_kaiser.cpp` computes the +prototype's actual frequency response by direct DFT and asserts the numbers +the presets advertise. + +![Prototype magnitude response of the three presets, with a passband-ripple +detail panel](../img/kaiser-response.svg) + +*What the spec tests pin: each preset's transition starts at its passband +edge and reaches its rated floor by its stopband edge, and the detail panel +shows all three passbands flat within ±0.01 dB. The curves come from +`scripts/book_figures.py`, which re-runs `designPrototype`'s math verbatim.* + +The measurement function evaluates `|H(f)|` at arbitrary frequencies in Hz +against the oversampled prototype (rate `L·fs`), normalized by L so the +passband reads 0 dB — a direct O(n) sum per frequency. No FFT: an FFT +would demand a power-of-two grid, deliver frequencies nobody asked for, +and drag in a dependency, all to accelerate a few hundred evaluations in a +test that runs in milliseconds. Then, for each shipped preset: + +- **Passband flatness:** every 500 Hz from DC to the passband edge, + response within ±0.01 dB of unity. That is the "flat to 20 kHz" claim in + the README, as an executable inequality. 
+- **Stopband depth:** every 250 Hz from the stopband edge out to *three + times the sample rate*, response below −(spec − 1) dB. The 3·fs reach + matters: the polyphase structure's images repeat around every multiple + of fs, so a stopband that sagged past the first image would pass junk at + 96 kHz even if 28 kHz looked fine. The 250 Hz step is calibrated to the + filter, not guessed: a T-tap-per-branch prototype has sidelobe nulls + spaced fs/T ≈ 1 kHz apart, so 250 Hz sampling puts about four probes on + every lobe — a peak cannot hide between probes. The 1 dB grace absorbs + the gap between Kaiser's empirical β fit and the realized window; the + presets' 120 means "at least 119 measured," and in practice the margin + is comfortable. + +Honest limits, as always: these tests certify the *double-precision +prototype*. Coefficient quantization (float, Q1.14, Q1.30) and the +row-blending residual are downstream effects certified by the next +chapter's tests and the end-to-end SNR suite — the layering is deliberate, +so a failure names its culprit. 
+ +## Why these ~100 lines look the way they do + +| Decision | Alternative rejected | Reason | +|---|---|---| +| Kaiser window | Parks–McClellan / remez | one β knob, closed form, no iteration to converge or fail at setup; near-optimal is optimal enough at 120 dB | +| Power-series I₀ | `std::cyl_bessel_i` | optional annex, missing on libc++/embedded toolchains; the series is 12 lines and testable | +| Iteration cap `k < 1000` | trust convergence | NaN input defeats the relative-error exit; termination must not depend on FP semantics in a `noexcept` function | +| UB clamp in `estimateTaps` | trust callers | negative/infinite → `size_t` cast is UB; found by audit, closed at the cost of one branch | +| Cutoff centered in transition, up to input Nyquist | classic conservative cutoff | near-unity interpolation only fights images of the protected band; symmetric transition spends taps evenly | +| Normalize sum to L | sum to 1 | per-*branch* DC gain is what reaches the output; pinned by the DC unit test | +| Runtime design | C++20 constexpr tables | pre-C++26 constexpr math gap; minutes of interpreted evaluation per TU; runtime sample rates exist; <10 ms once at setup | +| Spec-based DFT tests | golden coefficient files | tests the claim, not the bits; refactors that preserve the response pass | + +## Verify it yourself + +```sh +# Build and run the design-math tests: Bessel/beta reference values, the +# harris estimate bracket, and the DFT passband/stopband spec checks for +# all three presets: +cmake -B build && cmake --build build -j +ctest --test-dir build -R Kaiser --output-on-failure + +# The claim the normalization exists to protect (unity DC gain through the +# built bank, swept over mu): +ctest --test-dir build -R Polyphase.DcGain --output-on-failure + +# Break it on purpose: in designPrototype, change the normalization to +# `1.0 / sum` (the textbook choice) and watch DcGainIsUnityAcrossMu fail by +# a factor of numPhases; or weaken kaiserBeta's 0.1102 to 0.11 and watch 
+# the Transparent stopband check report the exact frequency that leaks. +``` + +Both sabotage runs are worth the five minutes: the first shows you which +test owns the normalization contract, and the second shows the empirical β +fit has no slack at 140 dB — which is precisely why the constants are +copied from Kaiser 1974 to the last digit. diff --git a/book/src/part1/pi-servo.md b/book/src/part1/pi-servo.md new file mode 100644 index 0000000..b407866 --- /dev/null +++ b/book/src/part1/pi-servo.md @@ -0,0 +1,525 @@ +# The clock servo: `pi_servo.hpp` + +> A governor is a part of a machine by means of which the velocity of the machine is kept nearly uniform, notwithstanding variations in the driving-power or the resistance. +> +> — James Clerk Maxwell, *On Governors* (1868) + +There is a number this entire library exists to find, and nobody will tell +it to us. + +Call it ε: the fractional rate mismatch between the two crystals. The +producer's device claims 48 kHz and delivers 48 000 × (1 + ε) frames per +second; the consumer's device claims 48 kHz and takes them away at +48 000 × (1 + something else). ε is a few parts per million, it wanders +with temperature, and no API on either side will report it — the whole +premise of the problem is that both devices believe they are correct. The +resampler in the next chapter can apply any rate correction we ask of it, +to a resolution of 2⁻⁶⁴ samples. It just needs to be told the number. + +The only observable we have is the elastic buffer between the domains: the +SPSC ring from the last chapter, whose occupancy was designed to be *exact* +for precisely this reason. If the producer's clock is fast by ε and we +consume at exactly the nominal rate, the buffer fills at ε × fs frames per +second — about one frame every hundred milliseconds at 200 ppm and 48 kHz. That trickle is the +entire signal. 
The servo's job is to turn it into an estimate ε̂ good +enough that the resampler's output carries no audible trace of the +estimation process — and "audible trace" here means fluctuations in ε̂, +because whatever wobble the servo passes into the rate estimate +frequency-modulates every sample of the audio. + +This chapter is control theory for someone who has never tuned a loop, +taught the way this file was actually designed: start with the physics of +the thing being controlled, discover why the obvious controller fails, +derive the one that works, and then spend most of our effort on the real +enemy — which turns out not to be the clocks at all, but the fact that we +can only *count*. + +## The plant: a buffer that integrates + +Control theory calls the thing you are controlling the *plant*. Ours is +the FIFO, and its equation of motion is one line. The producer inserts +fs × (1 + ε_true) frames per second. The converter synthesizes fs output +frames per second, and for each output frame it consumes (1 + ε̂) input +frames — that is what "phase advance = 1 + ε̂" will mean in the next +chapter. Occupancy changes at the difference of those rates: + +```text +d(occ)/dt = fs · (ε_true − ε̂) +``` + +The buffer is a **pure integrator** with gain fs. Feed it a rate error and +it does not settle at some proportional level — it ramps, forever, until +it hits a wall (empty: dropout; full: overflow). Two consequences follow +immediately. First, doing nothing is not an option even for arbitrarily +small ε: any uncorrected mismatch is a glitch with a countdown timer on +it. Second, the plant's own integration is going to interact with whatever +memory the controller has, and getting that interaction right *is* the +design. 
+ +The servo observes the occupancy once per `pull()` — the converter calls +`update(occ, mu, dt)` with the raw backlog in frames, the resampler's +current fractional position μ (so the observable `occ + mu` moves +continuously through whole-sample slips instead of staircasing by ±1), +and the elapsed time `dt = framesPulled / fs`. + +## Why proportional control is not enough + +The obvious controller is proportional: measure the occupancy error +`e = occ − target`, set ε̂ = Kp·e. If the buffer is too full, consume +faster; too empty, consume slower. It even works, in the sense that it +does not fall over. + +Now ask what it converges *to*. In steady state the occupancy stops +moving, so the plant equation forces ε̂ = ε_true — the estimate must equal +the true offset exactly. But a proportional controller can only produce +ε̂ = Kp·e, so the error cannot be zero: it must park at + +```text +e_ss = ε_true / Kp +``` + +a *standing occupancy offset* proportional to the clock mismatch. Plug in +the numbers this library actually uses and the problem stops being +academic. At the steady-state loop bandwidth of 0.05 Hz (we will get to +why it is that low), Kp ≈ 1.3 × 10⁻⁵ per frame. A routine 300 ppm crystal +offset parks the buffer **23 frames** away from its setpoint — half the +default 48-frame latency budget gone, sitting one frame shy of the default +24-frame unlock threshold, and different for every unit in the field +because every crystal pair drifts differently. Latency that depends on +which two devices you happened to plug in is not a spec anyone signs. + +The fix is memory. Add an integral term: + +```text +ε̂ = Kp·e + Ki·∫e dt +``` + +The integrator accumulates error until the error is gone: in steady state +it holds the entire ppm estimate by itself, ε̂ = ε_true with **zero +standing occupancy error**. 
Control theory calls the combination a *type-2 +loop* — two integrators around the cycle, the plant's and the +controller's — and type-2 is exactly the order needed to null a constant +rate offset. `tests/test_servo.cpp` pins this down against a pure +simulation of the plant equation: after settling at +300 ppm, the +occupancy must sit within 0.05 frames of the setpoint and ε̂ within 1 ppm +of the truth +(`Servo.LocksFromConstantOffsetAndNullsError`). + +A type-2 loop also does something a type-1 cannot: it follows a *ramp* in +the offset — a crystal warming up, drifting at 1 ppm/s — with bounded +rather than growing error. The residual is the classic acceleration error +`e_ss = (dε/dt · fs) / ωₙ²`, about 0.49 frames for 1 ppm/s at the 0.05 Hz +bandwidth, and `Servo.TracksSlowDriftRampWithBoundedLag` holds the +measured lag under one frame while `epsHat` tracks the moving truth to +2 ppm. + +If this structure sounds familiar, it should. Replace "FIFO occupancy" +with "phase difference" and this is a **phase-locked loop**: the FIFO +comparison is the phase detector, the PI filter is the loop filter, and +the resampler's μ accumulator is the numerically controlled oscillator. +The README states the analogy flatly and it is worth internalizing, +because it means every result in fifty years of PLL literature applies — +including the one that matters most here: the loop bandwidth f_L +*partitions* the input timing jitter. Components above f_L are absorbed +by the buffer and never reach the audio; components below f_L pass into +ε̂ and frequency-modulate the audio. Choosing f_L is choosing which noise you +eat. + +## From bandwidth to gains + +So the designer picks a bandwidth and a damping; the gains should follow +mechanically. 
Close the PI controller around the integrator plant and the +loop's characteristic equation is + +```text +s² + fs·Kp·s + fs·Ki = 0 +``` + +Match it against the standard second-order form +`s² + 2ζωₙs + ωₙ² = 0` — the form whose behavior every control textbook +tabulates — and read off the gains: + +```text +ωₙ = 2π·f_L Kp = 2ζωₙ / fs Ki = ωₙ² / fs +``` + +The code computes exactly this, nothing more: + +```cpp +{{#include ../../../include/srt/pi_servo.hpp:sv_gains}} +``` + +Note the division by `fs_` in both gains: the plant's gain is fs, so the +controller divides it back out, and the *closed-loop* behavior depends +only on f_L and ζ. That innocuous-looking normalization is load-bearing — +it is why the gains formula is rate-portable, and (foreshadowing the first +war story) why everything *else* in the config is not. + +Damping defaults to ζ = 1, critical damping: the fastest settling that +never overshoots. Overshoot in this loop is not a cosmetic wiggle — an +occupancy overshoot is latency spent grazing the underrun floor, so the +choice is not stylistic. + +Here is the full tuning surface, with the defaults that suit a 48 kHz +near-unity converter: + +```cpp +{{#include ../../../include/srt/pi_servo.hpp:sv_config}} +``` + +Three bandwidths, three smoother corners, and a small state machine's +worth of thresholds. A single PI loop needs exactly two numbers; this +config carries fourteen. The rest of the chapter is about earning each of +the extra twelve. + +## The enemy: a sawtooth made of counting + +If the occupancy were a real number observed noiselessly, one PI loop at +a modest bandwidth would end this chapter. It is not. The occupancy is a +**count** — quantized to whole frames on the producer side, or to whole +*push blocks* when the producer delivers audio in callbacks — and that +quantization is not benign random noise. It is deterministic and +periodic. + +Picture the steady state at +200 ppm with sample-granular transfer. 
The +true (unquantized) backlog creeps upward by ε input samples per sample +consumed; every time the creep accumulates one whole frame, the count +steps. The observable is a perfect sawtooth: one push-block peak to peak, +repeating at the *beat frequency* + +```text +f_beat = ε · fs / pushBlock (the README's "ppm × pushRate") +``` + +At 200 ppm and sample-granular push that is 9.6 Hz with a one-frame tooth. +With 32-frame callbacks it is 0.3 Hz with a **32-frame** tooth — the +occupancy legitimately swings ±16 frames with neither clock having +moved. (`AsrcLock.LocksAndHoldsAtConstantOffset` averages straight +through that sawtooth and requires the *mean* fill and ppm to land on the +truth.) + +Why care about a deterministic wobble in a number we only use for its +average? Because the loop does not know it is a wobble. Whatever fraction +of the sawtooth survives into ε̂ becomes a periodic modulation of the +resampling rate — FM sidebands on every tone in the program material, at +offsets of f_beat and its harmonics. And a PI controller is a terrible +filter: above f_L its proportional path passes measurement noise straight +through at gain Kp, flat, forever. Narrowing f_L does not fix this by +itself; it lowers Kp (helping linearly) while the sawtooth needs 60–120 dB +of suppression. The loop needs help *before* the loop: error prefilters. + +But a prefilter is lag, and lag inside a feedback loop erodes phase +margin; you cannot smooth aggressively *and* acquire quickly with the same +settings. There is no single operating point that pulls in a cold start +within a second, rejects a 9.6 Hz sawtooth by 100+ dB, and follows a +warming crystal. So the servo refuses to pick one point. It picks three. 
+ +## Three loops, one integrator + +| Stage | Loop bandwidth | Error prefilter | Role | +|---|---|---|---| +| **Acquire** | 10 Hz | 1-pole, 50 Hz | pull in from a cold start (~1 s to lock) | +| **Track** | 1 Hz | 1-pole, 5 Hz | robust lock; terminal stage for coarse-block transfer | +| **Quiet** | 0.05 Hz | 3-pole cascade, 0.5 Hz | steady state for fine-grained transfer | + +Each stage is the same PI structure with gains from the same +`computeGains`, differing only in bandwidth and in how hard the +measurement is smoothed before the loop sees it. The update begins by +maintaining *both* kinds of smoothed error on every call: + +```cpp +{{#include ../../../include/srt/pi_servo.hpp:sv_update_smooth}} +``` + +Two details here repay attention. The smoothing coefficient +`alpha(cornerHz, dt) = 1 − exp(−2π·f·dt)` is the exact discrete step of a +one-pole lowpass over an arbitrary interval, so the filter corners are +honest frequencies in Hz regardless of how large or irregular the pull +blocks are — the same property the gain formulas have via `dt` in the +integrator. And the three-pole quiet cascade (`q1_ → q2_ → q3_`) runs +**always**, even in Acquire and Track where its output does not drive the +loop. That costs three multiply-adds per block and buys two things: the +promotion gate into Quiet has real data to judge (next section), and at +the instant of promotion the cascade is already settled on the observable +— no filter warm-up transient handed to the narrowest, most fragile +stage. + +Why a *cascade* of three identical poles rather than one pole three times +lower, or something sharper? Rolloff. One pole buys 6 dB/octave above its +corner; three poles buy 18 dB/octave. Against the 9.6 Hz sawtooth, a +0.5 Hz three-pole cascade provides roughly (9.6/0.5)³ ≈ 77 dB of rejection +before the loop even sees the error — while adding only manageable lag at +the 0.05 Hz loop bandwidth two decades below. 
The file header states the +net result as a system-level figure: in Quiet, a one-frame sawtooth is +rejected to roughly −120 dBc equivalent at 20 kHz, while the loop still +follows a 1 ppm/s drift ramp with under half a frame of standing error. +Sharper IIR shapes (resonant poles, elliptic-style) would trade that +clean, phase-predictable lag for ringing inside a feedback loop — exactly +the wrong place for it. + +## The promotion machine + +Three stages need transitions, and transitions are where multi-mode +controllers usually betray you — a bandwidth switch with mismatched state +is a step input injected into your own loop. Here is the whole state +machine: + +```cpp +{{#include ../../../include/srt/pi_servo.hpp:sv_update_stages}} +``` + +Reading it as a protocol: promotion out of Acquire requires the *fast* +smoothed error to stay inside one frame for half a second; promotion out +of Track requires the *cascade* error to stay inside one frame for two +full seconds. Demotion is the same test run backwards with a much wider +threshold — 24 frames — and drops exactly one stage. The asymmetry +(narrow gate up, wide gate down, long holds) is hysteresis by +construction: the servo would rather linger a stage wide than oscillate +between modes. + +The choice of *which* error gates the Track→Quiet promotion is the +subtlest line in the file, and it earns the second war story below. +Gating on the cascade-smoothed error means the promotion asks precisely +the question that matters: *after the smoothing Quiet would actually use, +is the observable quiet enough to run a 0.05 Hz loop?* When a large block +beat dominates the occupancy, the answer is naturally and persistently +no — the cascade output wobbles by more than a frame at the beat +frequency, the hold timer keeps resetting, and the servo stays in Track. +Nobody wrote a rule that says "coarse-block configurations must not enter +Quiet." The physics writes it. 
+ +Both promotions share their hold logic, and it does double duty: + +```cpp +{{#include ../../../include/srt/pi_servo.hpp:sv_hold}} +``` + +While the hold window runs, the servo is not just waiting — it is +averaging its own output ε̂ with a time constant of a fifth of the hold. +Here is why that average exists. The wide stages do not *reject* the +quantization sawtooth; they phase-track it, riding the wobble with their +whole loop. Their instantaneous ε̂ is therefore a good estimate wrapped in +a periodic error. Averaging over the hold window (many beat cycles) +strips the wobble and leaves the clean central value — and at the moment +of promotion, *that* is what gets loaded into the narrower stage's +integrator (`integ_ = clamp(epsAvg_, ...)` in the state machine above). + +Recall what the integrator *is* in steady state: the entire rate +estimate. Handing the next stage a clean integrator means handing it a +loop that is already essentially converged; the proportional path only +has to clean up residuals. That is the transient-free handoff — "to first +order," as the header says, because the smoothers keep their state and +the observable keeps its continuity, so nothing steps. +`Servo.BandwidthSwitchIsTransientFree` runs the plant through lock and +across both promotions and requires the occupancy never to leave the +one-frame lock threshold afterwards: a handoff you cannot find in the +data. + +## The output stage, and why the clamp is inside + +The last lines of `update()` are the PI itself: + +```cpp +{{#include ../../../include/srt/pi_servo.hpp:sv_update_out}} +``` + +The clamp appears twice, and the first one — on the integrator, not just +the output — is the anti-windup that every practical PI needs and every +first implementation forgets. Consider a consumer stall: the occupancy +error goes huge and stays huge for seconds while the converter waits for +the high-watermark resync. 
An unclamped integrator would spend that whole +time charging toward a rate estimate of thousands of ppm — a number no +crystal pair can produce — and then, after the disturbance clears, the +loop would have to *discharge* all of that false conviction through its +narrow bandwidth, dragging the occupancy through a huge excursion for tens +of seconds. Clamping the integrator at 1.5 × `maxDeviationPpm` bounds the +lie the loop can tell itself: the estimate can never leave the range +physics allows, so recovery from any disturbance starts at most one clamp +width from the truth. The output clamp then bounds what the resampler is +asked to do per sample (which also protects the Q0.64 conversion in the +next chapter). `Servo.ClampsToMaxDeviation` feeds a 10 000-frame error and +requires the output to saturate exactly at 1.5× the configured range. + +## Knowing when not to chase: `seed()` and `reset()` + +```cpp +{{#include ../../../include/srt/pi_servo.hpp:sv_reset}} +``` + +A feedback loop's reflex is to chase every step in its input. Some steps +carry no information, and the API encodes each such case explicitly: + +- **`seed(occPlusMu)`** snaps all four smoothers onto the current + observable. The converter calls it when the occupancy jumps *for a + known reason* — acquisition start, a hard resync discard. Without it, + the smoothers would report the jump as a genuine multi-frame error and + the loop would obediently swerve. +- **`reset(keepIntegrator=true)`** re-arms the state machine after a + dropout but preserves the integrator — because a dropout says nothing + about the crystals. The ppm estimate from before the glitch is still + the best available number, and relock becomes a formality + (`Servo.DropoutResetKeepsPpmEstimate` pins both flavors: `true` + preserves the estimate to 5 ppm, `false` zeroes it). 
+- **`setTarget()`** moves the setpoint while keeping the integrator *and* + the smoothers' tracking state, so the loop slews to the new occupancy + at its clamped rate with no discontinuity — used by the converter's + adaptive pull-block setpoint raise, where the setpoint moves but, + again, the clocks have not. + +The shared principle: the integrator is the loop's knowledge and the +smoothers are its perception. Each event handler keeps exactly the state +that is still true and resets exactly the state that is not. + +## War story one: 16 kHz, minus 32 decibels + +For a long time this library's defaults were "the defaults," full stop — +designed, tested, and shipped at 48 kHz. Then a real deployment shape +arrived: 16 kHz reference-microphone processing. Same code, same presets, +a third of the sample rate. The quality suite was duplicated at 16 kHz, +expecting boring numbers. + +The numbers came back **~32 dB worse at every tone**, falling a further +6 dB per octave of signal frequency. That frequency signature is the +fingerprint of small-index FM — phase modulation of the resampling +position, whose sidebands grow with the modulated signal's frequency — +which pointed at the servo, not the filter. + +The mechanism, worked out in +`tests/test_asrc_quality_16k.cpp`'s header comment and now baked into the +config comment: servo bandwidths and smoother corners are **absolute +hertz**, but the disturbance they exist to reject is not. The slip-beat +sawtooth sits at ε × fs — 9.6 Hz at 48 kHz, only **3.2 Hz at 16 kHz**. +The three-pole 0.5 Hz cascade whose rejection goes as f³ therefore does +(16/48)³ ≈ 28.6 dB *less* damage to the beat at 16 kHz, and the +measurement becomes servo-FM-limited: predicted ≈ 28.6 dB, measured +≈ 32 dB. The loop was not misbehaving. It was doing exactly what its +absolute-Hz constants said, against a disturbance that had moved. 
+ +The rule that fixes it is now a method, so it cannot be half-remembered: + +```cpp +{{#include ../../../include/srt/pi_servo.hpp:sv_scaled_to}} +``` + +Every field with units of Hz scales with the rate — keeping the loop +identical in *normalized*, per-sample terms, which is the frame the +disturbance lives in. Every field denominated in frames or ppm +(`lockThresholdFrames`, `unlockThresholdFrames`, `maxDeviationPpm`) is +already normalized and stays put. And the hold times scale *inversely*: +a loop with a third the bandwidth has time constants three times longer, +so waiting "2 seconds" before promoting would mean waiting a third as +many loop time constants — the gates would fire on less evidence. The +original hand-scaled 16 kHz configuration missed the hold-time rule; +after adding it, the measurements were identical within noise, and the test suite now +covers the factory (`Config::forSampleRate`, which applies this and the +matching `FilterSpec::scaledTo`) both structurally +(`AsrcQuality16k.ForSampleRateScalesHzFieldsOnly` checks exactly which +fields move) and behaviorally: through the factory, 16 kHz measures +136.6 dB at 333 Hz — within ~1 dB of 48 kHz at the same normalized +frequency, the 32 dB fully recovered. + +One more cost of scaling, honestly: at 16 kHz the Quiet loop runs at +~0.017 Hz, so the quality tests run for 120 seconds of simulated audio +instead of 40 — the same number of loop time constants. Slow loops are +slow everywhere, including in CI. + +## War story two: when Track is the ceiling + +The block-size study (`notebooks/asrc_block_size_study.ipynb`) asked what +happens as transfer granularity coarsens from sample-granular toward the +32- and 240-frame callbacks real audio APIs deliver. 
The finding shapes +how you should read the stage table: with blocks of 32 frames and up, +**the servo never promotes to Quiet — and must not.** + +The information-theoretic version of the argument: at a 32-frame block, +the occupancy observable updates a few hundred times per second with a +±16-frame deterministic sawtooth on top of a sub-frame-per-second signal. +Quiet-level performance means resolving the backlog trend to a small +fraction of a frame *through* that tooth using counts alone; the counts +simply do not carry the information. The promotion gate discovers this +without being told: the cascade-smoothed error keeps excursing past one +frame at the beat frequency, the two-second hold never completes, and +Track becomes the terminal stage — the discriminator working as designed. + +What does Track-forever sound like? The 1 Hz loop phase-tracks the block +beat: most of the sawtooth is absorbed as **latency breathing** — the +buffer level, and hence the delay, swaying by a fraction of the block at +the beat rate, inaudible by construction. The remainder leaks into ε̂ as +low-rate FM, and the study put calibrated numbers on it: **~0.9 cents rms +of frequency wobble (61 dB wideband quality) at 32-frame blocks, ~1.3 +cents / 53 dB at 5 ms blocks**, as the README reports. Cent-scale wobble +at sub-hertz rates is at the edge of perception for sustained pure tones +and irrelevant for program material — but it is a real ceiling, and it is +a *sensor* ceiling, not a servo defect. The README's limitations section +draws the forward-looking conclusion: breaking it requires a better +observable (per-block timestamps for sub-sample phase observation), not +a cleverer filter behind the same counts. + +The practical corollary is the config comment you may have skimmed past +on `unlockThresholdFrames`: it must sit comfortably above **half the +push/pull block size**, because a coarse-block sawtooth legitimately +swings that far with the clocks standing still. 
The default 24 clears +a 32-frame transfer's ±16 with margin. Undersize it — say, 8 against +32-frame callbacks — and the healthy beat itself trips demotion: +Track→Acquire, re-lock, promote, trip again, a mode limit cycle +manufactured entirely in configuration. If you change one servo number +for an embedded deployment, this is the one to check. + +## The whole life cycle, measured + +Everything this chapter described is visible in one trace: the converter +driven at +200 ppm in deterministic virtual time (1-frame pushes — the +long tests' methodology), with a 50 ms producer stall injected at t = 28 s. + +![Measured occupancy and ppm estimate through acquire, lock, a 50 ms +dropout, and re-lock](../img/servo-lock.svg) + +*Acquiring's 10 Hz loop rings clamp-to-clamp on the quantized occupancy — +the sawtooth of the "enemy" section, live — yet the smoothed occupancy +never strays two frames from the setpoint, and promotion lands in half a +second. After the stall, `reset(true)` keeps the integrator, so the +re-acquire rings around 200 ppm rather than starting over from zero. 
+Generated by `scripts/book_figures.py`, which compiles a small trace +dumper against the real headers and runs exactly this scenario.* + +## The shape of the design + +| Decision | Alternative rejected | Reason | +|---|---|---| +| PI (type-2) loop | proportional-only | P parks a ppm-dependent occupancy offset (≈23 frames at 300 ppm in Quiet); the integrator nulls it | +| Gains derived from (f_L, ζ) via 2nd-order matching | hand-tuned constants | tuning surface is two physical numbers; `computeGains` is the textbook formula, verifiable by inspection | +| Three stages | one compromise bandwidth | pull-in wants 10 Hz, sawtooth rejection wants 0.05 Hz + heavy smoothing; no single point does both | +| Cascade error gates promotion | timer or lock-counter | asks the exact question ("could Quiet's own filtered error hold lock?"); auto-excludes coarse blocks | +| Integrator seeded from hold-window average | reset on transition | wide stages phase-track the sawtooth; the average is the clean estimate — handoffs transient-free | +| Integrator clamp (anti-windup) | clamp output only | disturbances must not charge the estimate past physics; recovery starts near the truth | +| `seed()`/`reset(keepIntegrator)` API | let the loop chase every step | known-cause jumps carry no clock information; keep the knowledge, refresh the perception | +| `scaledTo()` for other rates | reuse 48 kHz defaults | absolute-Hz constants vs a rate-proportional disturbance: measured −32 dB at 16 kHz | + +## Verify it yourself + +```sh +# The five servo unit tests against the pure plant equation +# (type-2 nulling, ramp tracking, transient-free handoff, clamp, reset): +ctest --test-dir build -R 'Servo\.' --output-on-failure + +# The servo inside the real converter: lock/hold through the 32-frame +# block beat, drift-ramp tracking, slip continuity, stall recovery: +ctest --test-dir build -R 'AsrcLock\.' 
--output-on-failure + +# War story one, end to end (long: 120 s simulated per tone; prints the +# measured SNRs — compare against the thresholds in the file): +ctest --test-dir build -R 'AsrcQuality16k\.' --output-on-failure + +# War story two: regenerate the block-size study (32 / 64 / 240 frames, +# latency breathing and the cents-rms FM decomposition): +jupyter nbconvert --to notebook --execute notebooks/asrc_block_size_study.ipynb + +# Break it on purpose: in tests/test_asrc_quality_16k.cpp, replace +# Config::forSampleRate(kFs) with a default-constructed Config (keeping +# cfg.sampleRateHz = 16000.0) and watch ~32 dB vanish from every tone. +``` + +As with the ring buffer, the last item is the chapter in one line. The +three stages, the cascade, the scaling rule — none of it is decoration. +Take any piece away and a measurement, not an opinion, tells you what it +was holding back. diff --git a/book/src/part1/polyphase-bank.md b/book/src/part1/polyphase-bank.md new file mode 100644 index 0000000..a9c07a6 --- /dev/null +++ b/book/src/part1/polyphase-bank.md @@ -0,0 +1,358 @@ +# The polyphase bank + +> Show me your flowcharts and conceal your tables, and I shall continue to be mystified. Show me your tables, and I won't usually need your flowcharts; they'll be obvious. +> +> — Fred Brooks, *The Mythical Man-Month* + +The previous chapter ended with a prototype filter: 12,288 double-precision +coefficients (for the default preset) describing one ideal anti-imaging +lowpass, oversampled 256× against the input rate. This chapter is about a +data structure. Per output sample, the converter's budget is one dot +product of 48 multiply-accumulates — not 12,288 — and the fractional +position μ arrives with 2⁻⁶⁴-sample resolution, demanding a filter for a +delay the table cannot possibly enumerate. `PolyphaseFilterBank` is the +arrangement of those 12,288 numbers that makes the right 48 of them, for +*any* μ, a matter of two pointer offsets and a linear blend. 
Almost +everything interesting about it is in the layout: one extra row nobody +asked for, every row stored backwards, and a table that no code path may +touch after its constructor returns. + +## The decomposition: L filters hiding in one + +Recall what the prototype is: the windowed sinc sampled on a grid of 1/L +input samples, `L·T` taps long. Evaluating the input signal at a position +`p/L` between samples means dotting the T input samples in the window +against the sinc *offset by p/L* — which, on the prototype's grid, is +simply every L-th coefficient starting at p: + +```text +branch 0: h[0], h[L], h[2L], … h[(T−1)L] delay 0 +branch 1: h[1], h[L+1], h[2L+1] … delay 1/L sample +branch p: h[p], h[L+p], h[2L+p] … delay p/L sample +branch L−1: … delay (L−1)/L +``` + +That is the entire polyphase decomposition for this use case — no z-domain +identities required. One oversampled filter *is* L ordinary T-tap filters +interleaved, each a fractional-delay filter for one grid position. Nothing +is computed to "decompose" it; the bank merely copies the prototype into a +`(rows × T)` table so that each branch's taps — which are strided L apart +in the prototype — become contiguous in memory, because the dot product +will read them T-at-a-time, millions of times, and the prototype order +would stride the cache to death. The classic references derive this +structure for rational resamplers (it is also how commercial ASRC silicon +like the AD1896 organizes its ROM); here it is simpler, because near-unity +operation means each output needs exactly *one* branch evaluation — the +question is only which branch, and what to do between branches. + +## Between the branches: why L = 256 and a linear blend + +μ is a 64-bit fraction; the table has L rows. Rounding μ to the nearest +row would quantize the delay to 1/L of a sample, and delay quantization on +a moving signal is *noise* — worse at high frequencies, where a fixed time +error subtends more phase. 
The bank's answer is the standard one at this +quality tier: pick the two rows adjacent to μ·L and interpolate the +*coefficients* linearly between them. The residual error of that blend is +the quality knob the spec exposes: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:bank_spec}} +``` + +The comment's two slopes are the design law for choosing L, and they are +measured properties of this code, not folklore (the README derives its +quality table from the test suite): the blend residual falls **about 12 dB +for every doubling of L** — linear interpolation has second-order error, +so halving the grid step quarters the error — and rises **about 12 dB per +octave of signal frequency**, because coefficient interpolation error acts +like a second derivative and high frequencies bend faster. You can see the +frequency slope directly in the shipped numbers for `balanced()` +(L = 256): 135 dB SNR at 997 Hz, 120 dB at 6 kHz, 112 dB at 12 kHz, 105 dB +at 19.5 kHz — once the signal frequency is high enough for the blend +residual to dominate, each octave costs roughly the predicted 8–12 dB. The +unit tests pin the same staircase at the kernel level, single tones against +the analytic sine: worst-case error below −120 dB at 997 Hz, −110 dB at +4 kHz, −100 dB at 10 kHz, −90 dB at 19 kHz. + +Why not simply crank L and skip the blend? Cost. Nearest-row lookup has +*first*-order error — about 6 dB per doubling — so matching the blend's +accuracy at 19.5 kHz would take L in the hundreds of thousands and a table +in the hundreds of megabytes. With the blend, `balanced()` is +(256 + 1) × 48 float coefficients ≈ 48 KB — resident in L2, arguably L1, +on hosts, and tolerable in MCU RAM at Q15 (≈ 24 KB). `transparent()` +doubles L *and* stretches T for ≈ 160 KB in float, buying its extra margin +mostly at the top of the band (108 dB vs 105 dB at 19.5 kHz measured end to +end). Why not a fancier blend — cubic across four rows? 
It would double +the coefficient traffic and the blend arithmetic in the innermost loop the +library owns, to fix the *highest-frequency* residual only; L = 256 already +puts that residual below the 105 dB the rest of the chain sustains. The +linear blend is the cheapest operation that keeps the table small and the +error second-order; everything faster is worse, everything better is not +needed at this budget. + +## The extra row: L + 1 rows for an L-phase filter + +Here is the file's cleverest line, and it is a line of *allocation*, not of +algorithm: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:bank_layout}} +``` + +The problem it dissolves: blending needs rows `p` and `p + 1`. For +p = 0 … L−2 both exist. At **p = L−1** the blend wants "row L" — the +branch for a delay of exactly one whole sample. Modular thinking says row L +"is" row 0, and arithmetically it is — *for a different window*. Branch 0 +is a delay of zero against the current window; the position μ → 1 is a +delay of one, which equals a delay of zero against the window advanced by +one input sample. Using row 0 against the *current* window would be wrong +by exactly one sample — not subtly wrong: it would blend the correct filter +with a copy of the signal shifted a full sample, an error at signal level. + +The conventional fixes are all branches. Detect p = L−1 and handle the +wrap specially — a data-dependent branch in the per-sample path, taken at +the beat frequency between the two crystals (at 200 ppm, about ten times a +second), which is also precisely the moment the resampler executes a +whole-sample slip, the most delicate step it performs. Or clamp μ short of +1.0 and accept a periodic discontinuity — a spur at the beat frequency, +in a library chasing 120 dB. + +The bank's fix: **store row L explicitly, as branch 0 advanced by one input +sample**. 
It falls out of the construction loop with no special case: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:bank_build}} +``` + +Follow the index math for `p == phases_`: the prototype index is +`m = t·L + L = (t+1)·L` — branch 0's tap `t + 1`. So row L holds branch 0's +coefficients shifted one *tap*, i.e. one input sample; the final tap +(`m = T·L`) falls off the prototype's end and the `(m < n)` guard writes a +zero. Row L computed against the current window is *identically* branch 0 +computed against the next window. The consequences, in the order they matter: + +- **Branch-free interpolation.** `interpolate()` may always read + `phase(p)` and `phase(p + 1)` for any p ≤ L−1. No modulo, no compare, no + special case — the hot loop's structure is independent of μ. +- **Exact continuity at the μ-wrap.** As μ → 1 the blend converges to pure + row L; the whole-sample slip then advances the window and resets μ to 0, + where pure row 0 takes over — and those two evaluations are the same + arithmetic on the same samples. The seam has *zero* width: not "small + error," but bit-level agreement of the limits from both sides, up to the + one blend step of the approach. + +Neither property is left to prose. `Polyphase.ExtraRowEqualsPhaseZeroAdvancedOneTap` +asserts the layout claim coefficient by coefficient — `phase(L)[0] == 0` +and `phase(L)[u] == phase(0)[u−1]` with `EXPECT_EQ`, exact equality, no +tolerance, because the construction loop is supposed to make them the +*same numbers*, not similar ones. `Polyphase.MuWrapIsContinuousWithWindowShift` +then asserts the consequence at the semantic level: `interpolate(hist, +μ → 1)` equals `interpolate(hist + 1, μ = 0)` on random data — the +whole-sample-slip invariant the resampler (two chapters from now) leans on +every time the crystals drift one full sample apart. The cost of all this: +48 extra coefficients — 192 bytes in float — and one `+ 1` in a `resize()`. 
+It is the best byte-per-correctness trade in the library. + +## Rows stored backwards + +The second line of that layout comment: rows are **tap-reversed**. +Convolution is inherently a reversal — output = Σ h[k] · x[now − k] — so +either the coefficient array or the history walk must run backwards. +The resampler keeps each channel's history as an *oldest-first* window +(natural for its append-and-compact delay line, and the friendly direction +for hardware prefetchers). Storing each row reversed at construction — +`table_[p·T + (T−1−t)]` — lets the kernel be the loop every SIMD unit +wants: + +```text +for t in 0…T−1: acc += hist[t] · row[t] +``` + +both arrays walked forward, contiguously, from element zero. The +reversal is paid once per converter at build time instead of once per +sample as backwards addressing, and the payoff is documented downstream in +this book's optimization chapters: the auto-vectorized Q15 kernels, the +SMLALD pair-loads on Cortex-M33 (which require adjacent taps to sit in +ascending order in one 32-bit load), and the `SRT_RESTRICT` blend loop all +assume exactly this orientation. One subtlety the test above already +banked: "advanced one tap" for the reversed row L means shifted one slot +*toward the newer end*, which is why the zero lands in slot 0 (the oldest) +— the kind of double-negation a comment can state but only an `EXPECT_EQ` +can enforce. + +## Quantization happens here, once + +The table's element type is not `double` — it is +`SampleTraits::Coeff`, and the constructor's `makeCoeff(v)` is the +single point where the design-precision prototype becomes datapath +coefficients. Quantizing once at build time, rather than converting on the +fly, means the hot path reads exactly what it dots and the quantization +error is a fixed property of the constructed object, measurable by the +tests rather than dependent on the code path taken. 
+ +What each sample type stores (the full traits treatment is the next +chapter; here is what the *bank* needs you to know): + +- **float** stores float coefficients: quantization at roughly −150 dB + against the double prototype — comfortably irrelevant under a 120 dB + target, which is why the float path's quality tests read the same as the + design spec. +- **Q15 and Q31 store Q1.14 and Q1.30**, not Q0.15/Q0.31 — one bit of + headroom spent because of a fact the *previous* chapter created: the + prototype is normalized so each branch has DC gain 1, which puts the + peak (center) tap at ≈ 1.0, and 1.0 does not fit a pure fractional + format whose ceiling is 1 − 2⁻¹⁵. Rather than rescale the filter (and + move the problem into output gain), each fixed-point format trades its + top precision bit for range. `makeCoeff` rounds half-away-from-zero and + saturates, so even a tap of exactly 1.0000…1 from design rounding + becomes the format's max instead of wrapping to −1 — a wraparound there + would be a −∞ dB event, not a noise-floor one. + +The bank is thus one template with three concrete personalities, and the +table *is* the personality: same layout, same extra row, same reversal, +different arithmetic downstream. + +## Validation in two layers, and the all-NaN table + +The constructor rejects what it can see is nonsense: a non-positive sample +rate, fewer than 4 taps, fewer than 2 phases, inverted or out-of-range band +edges — throwing `std::invalid_argument` at setup time, where exceptions +are allowed and cheap. This is necessary and insufficient, and the gap +between those two words is an audit story worth retelling precisely. + +Every check in the constructor is a comparison. Feed the converter a +`Config` whose `sampleRateHz` is NaN — one uninitialized field in caller +code — and every comparison is *false*: `sampleRateHz <= 0.0`? False. +`stopbandHz > sampleRateHz`? False. 
The constructor sails through, +`cutoffNorm` goes NaN, `designPrototype` dutifully computes 12,288 NaN +coefficients (recall the previous chapter: the Bessel iteration cap exists +so even *this* terminates), and the object constructs successfully. The +converter then runs, produces NaN audio forever, and never throws, never +asserts, never glitches in a way a log would catch. The adversarial audit +of the library built exactly this object (finding F2); the fix is the +converter-level `validated()` gate, which enforces what the bank's local +comparisons cannot express: + +- **finiteness of every double in the config** — the only guard NaN cannot + slip, because it is `std::isfinite`, not an ordering; +- **the band-edge sum rule**: `passbandHz + stopbandHz ≤ sampleRateHz`. + The bank alone accepts `stopbandHz` up to the sample rate, but the + cutoff is *centered* at `(pass + stop)/fs` — let the sum exceed fs and + the anti-image cutoff lands above the input Nyquist, a filter that + passes the very images it exists to kill, while every local check still + passes; +- plus the servo's eps-overflow clamp and 32-bit size-product overflow, + which belong to later chapters. + +All of it is pinned by `ConfigValidation.RejectsSilentMisbehavior` — each +formerly-constructible pathology now `EXPECT_THROW`s — and, just as +deliberately, by two `EXPECT_NO_THROW`s: the rate-scaling factory +`Config::forSampleRate` produces specs sitting *exactly on* the sum-rule +boundary (passband + stopband == fs up to rounding), and a validation rule +that rejected its own library's presets would be a different bug. The +division of labor is a pattern to copy: the class rejects what it can +express *locally*; the composition layer owns the invariants that only +exist between components; and every rejected configuration is one a real +caller could plausibly write. 
+ +## C++ notes: immutability, `bit_ceil`, and the accessors + +**Immutable after construction — as architecture, not style.** The class +has no mutating member functions; every accessor is `const noexcept`. This +buys three unrelated things at once. *Thread safety by subtraction*: the +bank is built on the setup thread and read from the real-time consumer +thread; with no writes after publication there is nothing to synchronize — +the ring buffer chapter's acquire/release agonies simply do not apply to +this object. *RT discipline*: the only allocation is in the constructor, +which the header explicitly assigns to setup time; the audio path holds a +`const` pointer and cannot even express a reallocation. *Exception +containment*: everything that can throw (`bad_alloc`, +`invalid_argument`) throws before the object exists, so a constructed bank +is unconditionally valid — there is no half-designed state for the hot +path to trip over. + +**`std::bit_ceil` for L.** The constructor rounds `numPhases` up to a +power of two rather than validating it, and the reason lives in the +resampler's fast path: the Q0.64 phase accumulator selects the row by +taking the top log₂ L bits of a 64-bit fraction — one shift — and the +intra-row blend factor from the bits below — one more shift. That indexing +scheme *requires* a power-of-two L; `bit_ceil` (C++20, `<bit>`, exact and +self-describing where the old `1 << ceil(log2(n))` dance was neither) +guarantees it while giving any spec at least the resolution it asked for. +Rounding up rather than throwing is deliberate policy: more phases is +strictly better along the quality axis, so a spec of 200 phases quietly +becomes 256 rather than a setup error. The same power-of-two guarantee is +what lets `blendRowPhase` recover log₂ L with `std::countr_zero` instead +of storing it. 
+ +**The accessor surface is four functions, and their shapes are load-bearing:** + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:bank_accessors}} +``` + +`phase(p)` returns a raw `const Coeff*`, not a `std::span` — the kernels +consume rows through `SRT_RESTRICT`-qualified pointer parameters (that +no-alias promise is worth measured percentage points; see the +vectorization-audit chapter), and a span would be unpacked back to a +pointer at every call site while implying a bounds story the hot path +cannot afford to check. The domain quietly includes `p == numPhases()` — +the extra row is a first-class citizen of the API, which is exactly how +`interpolate()` gets to be branch-free: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:bank_interpolate}} +``` + +Note the one guard that *does* exist — clamping `p` when μ rounds up to +exactly L — protects against a floating-point edge of the *caller's* μ, +not of the table; and `groupDelaySamples()` reports `(L·T − 1)/(2L)`, the +true center of the linear-phase prototype in input samples, which is +"T/2" only to the resolution of the 1/(2L) half-step that the kernel +accuracy tests must account for when they compute the expected analytic +delay. The bank knows its own delay exactly; approximations are for prose. 
+ +## Why this table looks the way it does + +| Decision | Alternative rejected | Reason | +|---|---|---| +| Contiguous T-tap rows per branch | dot the strided prototype directly | the kernel reads rows millions of times; stride-L access wastes the cache the table was sized to fit | +| Linear blend between adjacent rows | nearest row; cubic blend | nearest needs astronomically large L (first-order error); cubic doubles hot-loop work to fix a residual already below the chain's floor | +| L = 256 default | 128 / 512 | −12 dB residual per doubling vs table size; 48 KB meets the 105 dB @ 19.5 kHz budget; presets bracket it both ways | +| **Extra row L** | wrap to row 0 + branch; clamp μ | branch-free hot loop; μ-wrap/whole-sample slip exactly continuous; costs 192 bytes | +| Tap-reversed rows | reversed iteration per sample | reversal paid once at build; forward contiguous dot is what vectorizers and SMLALD pair-loads require | +| Quantize via `makeCoeff` at build | convert coefficients on the fly | error becomes a fixed, testable property of the object; hot path reads storage type directly | +| Q1.14 / Q1.30 coefficients | Q0.15 / Q0.31 | peak tap ≈ 1.0 by DC normalization; headroom bit beats wraparound at the table's largest value | +| Throw in constructor + converter `validated()` | validate in one place | the class can only check local comparisons; NaN defeats comparisons — finiteness and the band-edge *sum* rule are composition-level invariants (audit F2) | +| Immutable after construction | resettable/redesignable bank | cross-thread reads need no sync; allocation and throws confined to setup; no invalid intermediate states | +| `std::bit_ceil(numPhases)` | reject non-power-of-two | phase-bit row indexing requires 2ᵏ; rounding up is strictly quality-positive | +| Raw `const Coeff*` accessor | `std::span` row | kernels take restrict pointers; span adds implied checking the per-sample path cannot spend | + +## Verify it yourself + +```sh +# Build, then run this 
chapter's direct evidence: DC gain across mu, the +# extra-row layout equality, the mu-wrap continuity invariant, and the +# fractional-delay error staircase for balanced and transparent: +cmake -B build && cmake --build build -j +ctest --test-dir build -R Polyphase --output-on-failure + +# The audit's rejected-config suite (NaN rate, image-passing band edges), +# including the boundary cases that must keep constructing: +ctest --test-dir build -R ConfigValidation --output-on-failure + +# The end-to-end SNR numbers the L=256 decision is quoted against +# (997 Hz / 6 k / 12 k / 19.5 k, both presets, servo in the loop): +ctest --test-dir build -R AsrcQuality --output-on-failure + +# Break it on purpose: in the constructor, change `p <= phases_` to +# `p < phases_` and resize to phases_ * taps_ (no extra row), then make +# interpolate wrap p+1 to 0. DcGain still passes — DC can't see a +# one-sample shift — but MuWrapIsContinuousWithWindowShift fails loudly, +# which is exactly the gap between "looks fine on steady signals" and +# "correct at the slip." +``` + +The sabotage run is the section on the extra row, compressed: the wrap bug +is invisible to the easiest test and to DC reasoning, and the suite was +built by someone who knew that. diff --git a/book/src/part1/sample-traits.md b/book/src/part1/sample-traits.md new file mode 100644 index 0000000..31f5902 --- /dev/null +++ b/book/src/part1/sample-traits.md @@ -0,0 +1,431 @@ +# Sample types as a customization point: `sample_traits.hpp` + +> Form is exactly emptiness; emptiness is exactly form. +> +> — the Heart Sutra + +The polyphase machinery of the last two chapters computes one thing: a dot +product between a window of input samples and an interpolated row of filter +coefficients. The problem is that this library ships to machines that do not +agree on what a number is. A Xeon host wants `float` samples and will happily +accumulate in `double`. 
A Hexagon DSP has no double-precision FPU at all — +every `double` operation is a soft-float library call. A Cortex-M33 has no +vector unit and wants 16-bit samples it can crunch two at a time. The same +algorithm must therefore run in three different arithmetic systems, produce +measured quality in each, and pay nothing for the flexibility. + +Here is what "nothing" has to mean, concretely. The inner loop of +`interpolate()` runs one multiply-accumulate and one coefficient blend per +tap, per channel, per output sample. At 48 kHz stereo with the default +balanced preset (48 taps), that is about 4.6 million multiply-accumulates +per second — and every one of them goes through the customization point this +chapter describes. Any mechanism that adds even one indirect call to that +path has already lost. + +This chapter tells two interleaved stories. The C++ story is how a traits +struct and a concept make the sample type a *compile-time* customization +point — and why the obvious alternatives (virtual dispatch, CRTP) were +rejected. The arithmetic story is fixed-point numerics from scratch: what +Q-formats are, where the headroom bits went, why the accumulators are +exactly as wide as they are, and two places where the file's own comments +record hard-won corrections. The two stories are one file: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_overview}} +``` + +Three sample types, and a division of labor worth pausing on: the clock +servo and the filter *design* always run in `double`, because they execute a +handful of operations per block or once at construction. Only the datapath — +the code that touches every sample — is templated. Optimizing anything else +would be effort spent where the profile isn't. 
+ +## The mechanism: a struct full of static functions + +The customization point is a class template with no primary definition: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_primary}} +``` + +Leaving the primary template *undefined* is deliberate. A defined primary +template would need default behavior, and there is no honest default for +"how do I multiply-accumulate your type" — any guess would compile for +unsupported types and be silently wrong. Undefined, the template turns an +unsupported type into a compile error at the first use. (A more *readable* +error is the concept's job, below.) + +Each supported type then gets a full specialization. The float one is the +simplest and shows the complete vocabulary — three associated types and +seven operations: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_float}} +``` + +Every operation the datapath performs on samples is named here: convert a +designed coefficient to storage form (`makeCoeff`), convert the fractional +position to the blend representation (`makeBlendFactor`, +`blendFactorFromQ64`), the multiply-accumulate (`mac`), the adjacent-phase +coefficient blend (`blend`), the accumulator-to-sample conversion +(`finalize`), and silence. The polyphase chapter's `interpolate()` is written +entirely in this vocabulary: + +```cpp +acc = Tr::mac(acc, hist[t], Tr::blend(c0[t], c1[t], fr)); +``` + +and consequently never mentions `int16_t`, `double`, or a shift instruction. +One algorithm, one body of tests, three number systems. + +### Why not virtual dispatch + +The classical OO answer — an abstract `SampleOps` interface with `mac()` and +`blend()` as virtual functions — fails on the arithmetic of the hot loop. +A virtual call is an indirect call through a vtable: the compiler cannot +inline it, and what it cannot inline it cannot optimize *across*. 
The Q15 +`mac` below compiles to roughly two instructions when inlined; as a virtual +call it would be a call, a return, an argument setup, and — far worse — an +opaque boundary in the middle of the loop. Everything Part III wins depends +on the compiler seeing through these functions: the Q15 dot product +auto-vectorizes on hosts and gets Helium code on the M55 (the C2 audit +verified both), and the C4 SMLALD kernel exists because the products were +visible as exact 16×16 multiplies. Four and a half million vtable +indirections per second, each one an optimization fence, was never a +candidate. + +Virtual dispatch also answers a question nobody asked. Dynamic dispatch +buys the ability to choose the implementation *at run time* — but a +converter's sample type is fixed at the moment you write +`AsyncSampleRateConverterQ15`. Paying the vtable price for flexibility that +is never exercised is the definition of the wrong tool. + +### Why not CRTP + +The curiously recurring template pattern is the usual zero-cost answer to +virtual dispatch, and it was rejected for a simpler reason: CRTP customizes +through *inheritance* — `class MySample : SampleBase<MySample>` — and the +sample types here are `float`, `std::int16_t`, and `std::int32_t`. You +cannot derive from a built-in type, and you should not have to wrap one in +a class (with all the conversion friction that implies) just to teach a +library how to multiply it. A traits struct attaches behavior to a type +*from the outside*, without requiring the type's cooperation. This is the +same reason the standard library uses `std::char_traits` rather than +requiring your character type to inherit from something: the type being +customized is not yours to modify. + +The cost of the traits approach is one level of naming indirection +(`SampleTraits<T>::mac` instead of `x.mac`), which a `using Tr =` alias +reduces to nothing. 
The benefit is that the whole mechanism evaporates at +compile time: every call in this file is a `static` member function, +resolved by the template machinery, inlined by any compiler at any +optimization level worth shipping. + +## Q-formats, from zero + +Now the arithmetic story. Fixed-point notation **Qm.n** describes an +integer reinterpreted as a fraction: *n* bits after the binary point, *m* +bits (beyond the sign) before it. The stored integer *k* represents the +value *k* / 2ⁿ. So: + +- **Q0.15** ("Q15"): an `int16_t` representing *k* / 2¹⁵. Range −1.0 to + +0.99997. This is what 16-bit audio *is* — the industry just rarely says + so out loud. +- **Q0.31** ("Q31"): the same idea in an `int32_t`, range −1.0 to + +(1 − 2⁻³¹). +- **Q1.14**: an `int16_t` representing *k* / 2¹⁴ — one bit of *headroom* + above ±1.0, range −2.0 to +1.99994, at the cost of one bit of precision. + +Addition in a Q-format is ordinary integer addition. Multiplication adds +the fractional bit counts: Q0.15 × Q1.14 gives a product with 29 fractional +bits (Q29). Nothing is approximate yet — an integer multiply of two 16-bit +values is *exact* in 32 bits. Fixed-point arithmetic done carefully is not +"lossy integer math"; it is exact arithmetic with explicitly scheduled +rounding. The whole craft is deciding where the one rounding happens and +proving nothing overflows before it. + +## The headroom bit: why coefficients are Q1.14, not Q0.15 + +The obvious choice for 16-bit coefficients is Q0.15, same as the samples. +It does not work, and the reason is a property of the filter itself: each +polyphase row has unity DC gain, and the prototype's *peak tap* reaches +approximately 1.0. Q0.15's most positive value is 0.99997 — the peak tap +does not fit. Saturating it would dent the filter's frequency response +precisely at the row where the response matters most. 
+ +So the coefficients trade one precision bit for one headroom bit: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_q15_coeff}} +``` + +with the conversion doing round-half-away-from-zero and saturating at the +integer limits (the *design* is checked separately; saturation here is a +belt against future filter specs, not an expected event): + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_roundsat}} +``` + +What did the traded bit cost? Quantizing coefficients to Q14 puts the +filter's stopband floor at roughly −86 dB — and the header's comment makes +the argument that matters: the Q15 *output* format's own noise floor is of +the same order. A 16-bit datapath cannot deliver more than the 16-bit +format can carry, so spending coefficient precision beyond the format's +floor would purchase nothing measurable. The end-to-end test agrees: the +Q15 converter measures **~77 dB SNR** on a half-scale 997 Hz sine across a ++200 ppm clock crossing (`tests/test_fixed_point.cpp` prints it; the CI +threshold sits at 73 dB), and that number is the *format's* floor, not the +converter's. The same trade at 32 bits gives Q1.30 coefficients +(`makeCoeff` scales by 2³⁰), where the quantization floor is so far down +that the Q31 path measures **133 dB** — statistically the float datapath's +own 135 dB. + +The two unit tests pinning the scale factors are almost insultingly simple, +and that is their virtue: `Q15::makeCoeff(1.0) == 16384` is the sentence +"the peak tap fits" written as an assertion. + +## The accumulation story: exact until the last line + +Here is the Q15 multiply-accumulate: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_q15_mac}} +``` + +Two things are chosen here. The product is computed in `int32_t` — a +16×16→32 multiply, which every target does in one instruction — and it is +**exact**: the worst-case product is −32768 × −32768 = 2³⁰, comfortably +inside `int32_t`'s ±2³¹ range. 
But note how *thin* that comfort is: a +single worst-case product uses all but one bit of an `int32_t`. Summing +even two of them could wrap. An `int32_t` accumulator is therefore not +"risky"; it is simply wrong. + +The accumulator is `int64_t`, and now do the arithmetic the comment +gestures at. The shipping filters run 32 to 80 taps per phase (fast, +balanced, transparent presets). Summing N values adds at most log₂N bits +to the worst-case magnitude: 48 taps add ~5.6 bits, 80 taps add ~6.3 — call +it six to seven bits. Worst case for the transparent preset: +80 × 2³⁰ < 2³⁷, against an accumulator that holds ±2⁶³. Twenty-six bits of +spare headroom. That surplus is the point: the sum is exact — not +approximately safe, *exact*, every intermediate value representable — no +matter what the samples and coefficients do. There is no intermediate +rounding anywhere in the loop, which also means the accumulation is +associative, which is why the C4 chapter's dual-MAC kernel and the C1 +blended-row rewrite could both be verified *bit-exact* rather than +"close enough." + +All of the rounding budget is spent in one place: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_q15_finalize}} +``` + +The accumulator holds a Q29 value (Q0.15 sample × Q1.14 coefficient); the +output wants Q15; so shift right by 14 after adding half an output LSB +(1 << 13). That is round-half-up. A numericist will object that +round-half-up carries a bias and round-half-even does not — and the comment +answers the objection with scale: the bias exists only on exact half values +and is a fraction of one sub-LSB rounding step, orders below the Q15 noise +floor that the 77 dB measurement already includes. Half-even costs extra +operations per output sample to fix an error you cannot measure. 
The +`clampSat` around it is the saturation that makes hot signals *clip* +instead of wrap — and wrapping is the catastrophic failure mode: + +```cpp +EXPECT_EQ(Q15::finalize(std::int64_t{1} << 40), 32767); +``` + +plus an end-to-end test (`FullScaleSineDoesNotWrapQ15`) that drives a +99%-of-full-scale sine through a +500 ppm crossing and asserts the output's +second difference never exceeds the analytic bound for a clean sine — a +wraparound anywhere inside would blow that bound by orders of magnitude. + +## Q31 and the pre-shift: when even int64 isn't enough + +The 32-bit path cannot copy the 16-bit strategy, and the reason is worth +computing rather than asserting. A full-precision Q0.31 × Q1.30 product +carries 61 fractional bits and a worst-case magnitude near 2⁶¹ (full-scale +sample, peak ~1.0 coefficient). An `int64_t` holds ±2⁶³ — barely four such +products of margin. The shortest shipping filter sums 32 of them; the +transparent preset sums 80. At 48 taps the worst-case sum is +48 × 2⁶¹ ≈ 2⁶⁶·⁶, over the accumulator's limit by a factor of about twelve. +Full-precision products simply do not fit, and there is no 128-bit +accumulator worth having on the targets this path exists for. + +So each product gives up 16 bits *before* joining the sum: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_q31_mac}} +``` + +Now redo the bound: Q45 products have worst-case magnitude 2⁴⁵, and +80 × 2⁴⁵ < 2⁵²— eleven bits of headroom restored. What did the discarded +bits cost? Each truncation throws away less than one Q45 LSB, and the +final conversion (`finalize` shifts a further 14 bits, Q45 → Q31) puts a +Q45 LSB **14 bits below the output's own LSB**. Even if all 80 taps' +truncation errors conspired in the same direction, the accumulated error is +under 80 × 2⁻⁴⁵ ≈ 2⁻³⁸·⁷ — less than 1/200 of one Q31 output LSB. 
The +measurement closes the argument: the Q31 converter's 133 dB / 105 dB +(997 Hz / 19.5 kHz) match the float datapath's numbers, whose residual is +set by the phase-table interpolation, not by anyone's arithmetic. The +discarded bits are provably and measurably inaudible — this is the +fixed-point craft in one line of code: *decide* where precision dies, +prove the grave is deep enough, then measure anyway. + +The full specialization, for reference — note the doc comment carries the +same overflow argument, so the file survives without the book: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_q31}} +``` + +## The blend, and the comment that was wrong by three orders of magnitude + +`blend` linearly interpolates between the same tap of two adjacent phase +rows (the polyphase chapter explains why; the residual falls ~12 dB per +doubling of the phase count). In Q15 it looks like this: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_q15_blend}} +``` + +That comment has a history, and the history is this book's whole +methodology in miniature. The blend multiplies a Q15 fraction +(`fr` ≤ 32767) by a coefficient difference (`diff` = b − a, two `int16_t` +values, so |diff| ≤ 65535). The original version of this comment justified +the `int64_t` by claiming the `int32_t` product would fit "but only with +~5% margin." An audit later recomputed it: the worst-case product is +32767 × 65535 = 2,147,385,345, and `INT32_MAX` is 2,147,483,647. The +margin is 98,302 out of 2.1 billion — **0.005%**, not 5%. Three orders of +magnitude, in a comment whose entire job was to quantify safety. + +Nothing was wrong with the *code* — it used `int64_t` and still does. 
But +consider what the wrong comment was waiting to do: some future optimizer, +squeezing the M33 (where the C4 campaign found this very blend dominates +the Q15 frame cost — each `fr * diff` is a `smull`), reads "~5% margin," +concludes the `int32_t` version is comfortably safe, and ships a datapath +that is one adjacent-phase anomaly away from integer overflow. The audit +also measured the *actual* worst |diff| on the transparent table: 41 — +real coefficients come nowhere near the bound. The corrected comment keeps +both numbers and the conclusion: a margin of 0.005% against a theoretical +bound is not an invariant to lean on silently, whatever today's table +does. The lesson generalizes: **a safety-margin comment is arithmetic, and +arithmetic in comments rots exactly as fast as arithmetic in code — the +difference is that no test ever fails on it.** Verify the numbers you +write in prose. This book's build system exists because of that sentence. + +(The Q31 blend uses a Q20 fraction rather than Q15 — since the product runs +in `int64_t` anyway, the five extra fraction bits are free.) + +## `blendFactorFromQ64`: feeding the integer phase + +One trait remains, and it earns its keep on exactly one class of hardware. +The C3 optimization (Part III) replaced the resampler's `double` phase +accumulator with a Q0.64 integer — after which the *only* floating-point +left on the fixed-point per-sample path was the conversion of the phase +fraction into a blend factor. `blendFactorFromQ64` closes that hole. The +Q15 version is a single shift — the top 15 bits of the fraction *are* the +Q15 blend factor: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_q15_q64}} +``` + +The float version is subtler: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_blend_q64_float}} +``` + +Why reduce to 24 bits first? 
Because a `float` significand holds exactly +24 bits: any integer up to 2²⁴ converts to `float` *exactly*, and the +subsequent multiply by 2⁻²⁴ (a power of two) is also exact. Convert the +full 64-bit fraction instead and the compiler must round — correctly, but +via a path that on a double-less target may detour through software +arithmetic. This two-instruction dance keeps the conversion +single-precision, exact, and branchless. The target it matters on is +Hexagon, the one genuinely FP64-less machine in the fleet (the C3 write-up +records the correction: the M55's *scalar* FPU turned out to support +doubles after all — only its vector unit doesn't). C3's gating run showed +what removing per-sample soft-double math is worth on Hexagon: −15.5% +instructions on the Q31 pipeline, −10.3% on Q15. And because 2⁻⁶⁴ phase +resolution beats the old double path's 2⁻⁵², quality *improved* while the +code got faster: 135.0 dB at 997 Hz. + +## The concept: making the contract legible + +Everything above defines the customization point; the last twenty lines of +the file *enforce* it: + +```cpp +{{#include ../../../include/srt/sample_traits.hpp:st_concept}} +``` + +The datapath templates constrain themselves with it — +`template <SampleType T> class BasicAsyncSampleRateConverter` — and the +payoff is the shape of the failure. Instantiate the converter with +`double` (no specialization exists) and, without the concept, the error +would surface wherever the template machinery first touched the undefined +traits — some line deep inside `interpolate()`, wearing five frames of +instantiation context. With the concept, the compiler rejects +`BasicAsyncSampleRateConverter<double>` *at the declaration you wrote*, +and its diagnostic walks the `requires`-expression clause by clause: which +operation is missing, what signature it expected. The concept turns "a +missing operation somewhere" into a checklist. 
Write a partial +`SampleTraits` — say, everything but `blendFactorFromQ64` — and +the error names exactly that member. + +Note the return-type constraints (`-> std::same_as<...>`) are doing real +work: a `finalize` that returned `int` instead of `int16_t` would satisfy +a naive "does it compile" check and then quietly change overload and +conversion behavior downstream. The concept pins the whole signature. + +The three `static_assert`s at the bottom are the file testing itself: every +translation unit that includes the header re-verifies that the three +shipped specializations satisfy the concept they claim to. If a future +edit breaks one — renames a member, fumbles a return type — the diagnostic +arrives at header-parse time, before any user code, naming the assert. +Cost: zero, everywhere except the compiler's own microseconds. + +## Why these ~220 lines look the way they do + +| Decision | Alternative rejected | Reason | +|---|---|---| +| Traits struct of `static` functions | virtual `SampleOps` interface | 4.6M `mac`/s in the hot loop; virtual calls block inlining and every Part III optimization behind an opaque boundary | +| External traits | CRTP / member functions | sample types are `int16_t`/`float` — built-ins can't inherit and aren't ours to modify | +| Undefined primary template | primary with defaults | no honest default for foreign arithmetic; silence would be wrongness | +| Q1.14 / Q1.30 coefficients | Q0.15 / Q0.31 | the ~1.0 peak tap must fit; one headroom bit costs a precision bit the output format couldn't carry anyway | +| `int64_t` accumulator, no intermediate rounding | `int32_t` accumulator | one worst-case Q15 product nearly fills `int32_t`; exactness makes every kernel rewrite bit-verifiable | +| Q31 products pre-shifted to Q45 | full 62-bit products | 48 taps of 2⁶¹ ≈ 2⁶⁶·⁶ overflows `int64_t` ~12×; truncation cost < 1/200 output LSB, measured invisible | +| Round-half-up in `finalize` | round-half-even | the bias is sub-sub-LSB; 
half-even costs real per-sample work to fix an unmeasurable error | +| `int64_t` blend product | `int32_t` (it *almost* fits) | 0.005% worst-case margin — recomputed by audit from a comment that claimed 5% | +| `SampleType` concept + self-`static_assert`s | let instantiation errors happen | failures surface at the declaration, itemized per missing operation | + +## Verify it yourself + +```sh +# The whole fixed-point suite: scale factors, saturation, DC gain, +# measured SNRs (watch for the "[ measured ]" lines), full-scale non-wrap: +ctest --test-dir build -R FixedPoint --output-on-failure + +# The measured numbers this chapter quoted: +# [ measured ] 997 Hz, 16-bit fixed: SNR ~77 dB +# [ measured ] 997 Hz, 32-bit fixed: SNR ~133 dB +# [ measured ] 19500 Hz, 32-bit fixed: SNR ~105 dB + +# Recompute the blend margin the audit checked (don't trust this book either): +python3 -c "print(32767*65535, 2**31-1, 1 - 32767*65535/(2**31-1))" + +# Break it on purpose, three ways: +# 1. In makeCoeff (Q15), change 16384.0 to 32768.0 — the peak tap saturates +# and DcGainIsUnityQ15 fails its ±4 tolerance. +# 2. In finalize (Q15), delete clampSat and cast directly — the full-scale +# sine test detects wraparound as a blown second difference. +# 3. Instantiate srt::BasicAsyncSampleRateConverter anywhere and +# read the concept diagnostic: every missing operation, by name, at the +# line you wrote. +``` + +The third experiment is the C++ half of this chapter in one error message; +the first two are the arithmetic half in two failing assertions. diff --git a/book/src/part1/spsc-ring.md b/book/src/part1/spsc-ring.md new file mode 100644 index 0000000..15419f1 --- /dev/null +++ b/book/src/part1/spsc-ring.md @@ -0,0 +1,273 @@ +# The lock-free ring: `spsc_ring.hpp` + +> Time is what keeps everything from happening at once. +> +> — Ray Cummings + +Every other component in this library is mathematics. This one is physics. 
+ +The converter's whole purpose is to sit between two threads that must never +wait for each other: an audio capture callback pushing frames at its +device's pace, and a playback callback pulling frames at a *different* +device's pace. If either thread ever blocks — on a mutex, on an allocation, +on a priority-inverted anything — the audio glitches, and a glitch is the +one failure this library exists to prevent. So the channel between the +threads must be **lock-free**, and not in the loose marketing sense: every +operation must complete in a bounded number of steps regardless of what the +other thread is doing, including being suspended indefinitely at the worst +possible instruction. + +The ring also serves a second master, and this is the design's quiet +novelty: its **occupancy is the control system's sensor**. The clock servo +(next chapter) estimates the rate mismatch between the two crystals +entirely from how full this buffer is. That is why the class exposes exact +`readAvailable()` and a consumer-side `discard()` — operations a generic +SPSC queue wouldn't bother with — and why "approximately full" isn't good +enough anywhere in this file: a biased occupancy reading would become a +biased frequency estimate. + +Here is the entire contract: + +```cpp +{{#include ../../../include/srt/spsc_ring.hpp:contract}} +``` + +Forty lines of comment and assertion before any logic. Three things deserve +attention already. + +**`is_trivially_copyable_v`** — the ring moves data with `memcpy`, in at +most two segments per transfer. This is a *bulk* ring: the producer hands +over whole blocks of interleaved frames, not elements one at a time. A +`memcpy`-based design rules out element types with non-trivial copy, move, or destruction, and the +`static_assert` makes that a compile error instead of undefined behavior. + +**`std::atomic<std::size_t>::is_always_lock_free`** — the class claims +lock-freedom, so it asserts the precondition. 
On every target this project +ships to, a `size_t` atomic compiles to plain loads and stores plus memory +ordering. But "every target this project ships to" is exactly the kind of +claim that rots silently; the assert costs nothing and converts rot into a +compile error. (This line has its own small history: it was added by an +audit that noticed the library asserted lock-freedom for its *telemetry* +counters but not for the indices the entire hot path rests on.) + +**Indices are monotonic, not wrapped.** `head_` and `tail_` count every +element ever written and read, forever; only at the moment of buffer access +are they masked down to a position. This is the single most consequential +decision in the file, and it earns its own section below — including what +happens when "forever" meets a 32-bit `size_t`. + +## The memory model, from the only direction that matters + +There are two ways to teach C++ memory ordering. The textbook way starts +from the six `memory_order` enumerators and their formal guarantees. The +way that actually sticks starts from a bug. + +Suppose both threads used `memory_order_relaxed` everywhere. The producer +writes 64 samples into the buffer, then advances `head_` by 64. The +consumer reads the new `head_`, concludes 64 samples are available, and +copies them out. On x86 this works every time you test it. On a Cortex-A +or M-class core — or under ThreadSanitizer — the consumer can observe the +*index* update **before** it observes the *sample data* the index claims to +cover, because nothing told either the compiler or the CPU that those +writes were related. The consumer then plays whatever stale bytes were in +the buffer. The bug is silent, rare, load-dependent, and absolutely real. + +The fix is a single pairing, used twice, and it is the only synchronization +in the file: + +> The producer **releases** `head_` after writing data; the consumer +> **acquires** `head_` before reading data. 
Everything the producer did +> before the release-store is visible to the consumer after the +> acquire-load that observes it. + +Read the producer side with that lens: + +```cpp +{{#include ../../../include/srt/spsc_ring.hpp:write}} +``` + +The two `memcpy` calls happen *before* the `release` store of the new head. +That ordering — data first, then the index that publishes it — is the +entire correctness argument for the data path. Symmetrically: + +```cpp +{{#include ../../../include/srt/spsc_ring.hpp:read}} +``` + +The consumer `acquire`-loads `head_` (inside the cache-refresh branch, +discussed next), and only then copies data the head covers. Its own +`release` store of `tail_` plays the mirrored role for a subtler resource: +**buffer reuse**. The producer may overwrite a slot only after the consumer +has finished copying out of it; the consumer's release of `tail_` and the +producer's acquire of it order exactly that. Miss this second pairing and +you have a bug that no amount of staring at the "obvious" head-side pairing +will reveal. + +Notice also what is *relaxed*: each side loads **its own** index with +`memory_order_relaxed`. The producer is the only writer of `head_`, so it +cannot race with itself; a thread always observes its own prior writes. +Using `acquire` there would be harmless but dishonest — ordering +annotations in this codebase are documentation, and claiming +synchronization where none is needed misleads the next reader. This is a +deliberate idiom: **the memory orderings are chosen to be exactly +sufficient, so that each one tells you why it exists.** + +### What was rejected + +A sequentially-consistent version (`memory_order_seq_cst` everywhere, the +default) would be correct. 
It was rejected for two reasons, in order of +importance: first, on ARM it compiles to strictly stronger barriers than +the algorithm needs, in the hottest loop the library owns; second — again +the documentation argument — `seq_cst` says "I didn't think about this," +and in a file whose whole job is to be thought about, that is the wrong +message. A mutex-based version was never on the table: it would forfeit the +bounded-progress guarantee the audio contract requires, priority inversion +being the canonical way real-time audio dies. + +## The cached-index trick + +Correctness needs one acquire/release pair per direction. Performance is +about how *rarely* you can afford to do even that. + +Every atomic load of the other thread's index is a potential cache-line +transfer between cores — the line bounces from the writer's L1 to the +reader's, hundreds of cycles when it goes badly, and it goes badly +precisely when both threads are busiest. The standard remedy (this design +follows the well-known pattern used by production SPSC queues) is for each +side to keep a **stale local copy** of the other side's index and consult +the real atomic only when the stale copy makes the operation look +impossible: + +- The producer computes free space against `tailCache_`. Only if that says + "not enough room" does it acquire-load the real `tail_` and retry the + computation. If space *still* falls short, the answer is truthful — the + buffer really is that full *right now* — and the write is clipped. +- The consumer does the same dance with `headCache_` for availability. + +The asymmetry of staleness is safe by construction: a stale `tailCache_` +can only *underestimate* free space (the consumer only ever frees), and a +stale `headCache_` can only *underestimate* availability (the producer only +ever adds). Stale data makes the ring conservative, never wrong. 
In the +steady state the converter lives in — producer and consumer chasing each +other around a buffer that is never near full or empty — the fast path +touches **no foreign cache lines at all**: one relaxed load of your own +index, arithmetic against a plain local member, two `memcpy`s, one release +store. + +The member layout enforces the same philosophy at the hardware level: + +```cpp +{{#include ../../../include/srt/spsc_ring.hpp:layout}} +``` + +Producer-owned state (`head_`, `tailCache_`), consumer-owned state +(`tail_`, `headCache_`), and the shared read-only state (`buf_`, `mask_`) +each get their own 64-byte cache line, so neither side's writes invalidate +lines the other side reads in its fast path. The comment records a rejected +alternative worth pausing on: +`std::hardware_destructive_interference_size` is the standard's name for +exactly this constant, and this file deliberately doesn't use it. The +constant is **ABI-fragile** — its value can differ between translation +units compiled with different tuning flags, which is why GCC warns when you +use it in a header — and a header-only library lives entirely in that +danger zone. A plain `64` with a comment is less clever and more correct. +The general lesson recurs throughout this codebase: *between a standard +facility and a constraint you can state plainly, prefer the one whose +failure mode you can reason about.* + +## Monotonic indices and the wraparound proof + +Most ring buffers wrap their indices at the capacity and pay for it twice: +one slot must be wasted to distinguish full from empty, and every index +update needs a conditional wrap. This ring's indices instead run forever +and are masked (`idx = head & mask_`) only at access time, which is why the +capacity must be a power of two (`std::bit_ceil` in the constructor) — the +mask replaces a modulo, and the full capacity is usable because occupancy +is computed by subtraction, not by comparing wrapped positions. 
+ +The objection arrives immediately: *forever* is finite. On Hexagon and +Cortex-M, `size_t` is 32 bits; at 48 kHz stereo, the indices wrap every +twelve hours or so of continuous audio. What happens then? + +Nothing — and the reason is worth proving rather than waving at, because +the proof is two lines of modular arithmetic that many engineers have +never consciously done. Unsigned arithmetic in C++ is arithmetic modulo +2^N. Occupancy is computed as `head - tail`; if the true (unbounded) counts +are H and T, the machine computes `((H mod 2^N) - (T mod 2^N)) mod 2^N`, +which equals `(H - T) mod 2^N`. Since the algorithm guarantees +`0 ≤ H - T ≤ capacity` and capacity is at most 2^31 on a 32-bit target, the +true difference is always representable, so the modular result *is* the +true result — through the wrap, across the wrap, at the wrap. The masked +position is likewise exact: capacity divides 2^N (it's a power of two), so +`(H mod 2^N) & mask = H mod capacity`. The wrap is not an edge case the +code handles; it is a case the arithmetic never notices. + +This was verified the trustworthy way as well: the audit that reviewed this +file ran the ring with indices initialized to `0xFFFFFFF8` and watched +transfers stride across the 2^32 boundary, byte-exact. The proof says it +must work; the test removes the possibility that the proof was about a +slightly different program than the one we shipped. + +## What the tests can and cannot certify + +Three layers of evidence back this file, and their *limits* are as +instructive as their coverage. + +**Single-threaded exactness** (`tests/test_spsc_ring.cpp`): fill/drain +equality, wraparound data preservation, partial writes near full, discard +accounting. These pin the sequential semantics — necessary, and nowhere +near sufficient. 
+ +**A two-thread stress test** (`tests/test_spsc_ring_threads.cpp`): millions +of elements of a counting sequence pushed and popped with randomized chunk +sizes, verified in order on the consumer side, run under ThreadSanitizer in +CI. TSan observes the actual ordering annotations, so it would flag the +relaxed-everywhere bug described above as a data race. + +**And the honest limitation**: a sanitizer can only judge the interleavings +the hardware deigns to produce during the run, and an x86 host barely +reorders anything. A memory-ordering bug can be invisible on x86 *and* pass +TSan there, then fire on a weakly-ordered ARM core in production. This +project's answer is a weekly CI job that runs the same TSan stress on +genuinely weakly-ordered arm64 hardware, plus the per-push macOS Apple +Silicon leg. That is also a limit worth naming: none of this *proves* the +algorithm; it raises the price of being wrong. The proof remains the +acquire/release argument above — which is exactly why this chapter spent +its pages on the argument rather than the test list. 
+ +## Why these ~130 lines look the way they do + +A summary of the decisions, several of which recur throughout the library: + +| Decision | Alternative rejected | Reason | +|---|---|---| +| Lock-free SPSC, two fixed roles | mutex; MPMC generality | bounded progress is the audio contract; generality costs exactly the cycles this file exists to save | +| Bulk `memcpy` transfers | element-at-a-time queue | the workload is blocks of frames; two `memcpy` segments beat N atomic handoffs | +| Exact occupancy + `discard()` | "approximate size is fine" | occupancy is the servo's sensor; bias here becomes frequency-estimate bias | +| Acquire/release, minimal | `seq_cst` everywhere | sufficiency-as-documentation; weaker barriers on ARM | +| Cached cross-indices | always load the atomic | steady-state fast path touches no foreign cache line | +| Monotonic masked indices | wrap-at-capacity | full capacity usable, no full/empty ambiguity; wrap is provably benign | +| `alignas(64)` literal | `hardware_destructive_interference_size` | the standard constant is ABI-fragile in headers; GCC warns for good reason | +| `static_assert` the preconditions | trust the porting engineer | rot becomes a compile error, not a field failure | + +## Verify it yourself + +```sh +# Sequential semantics, wraparound, discard accounting: +ctest --test-dir build -R SpscRing --output-on-failure + +# The two-thread counting-sequence stress (built when threads exist): +ctest --test-dir build -R TwoThreadStress --output-on-failure + +# The same stress under ThreadSanitizer (as CI runs it): +cmake -B build-tsan -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_FLAGS="-fsanitize=thread" -DSRT_BUILD_EXAMPLES=OFF +cmake --build build-tsan -j && ctest --test-dir build-tsan -R SpscRing + +# Break it on purpose: change memory_order_release to relaxed in write(), +# rebuild the TSan variant, and watch the stress test report the race. +``` + +The last suggestion is the chapter in one line. 
The annotations are not +incantations; remove one and the tooling shows you precisely the disaster +it was holding back. diff --git a/book/src/part2/icount.md b/book/src/part2/icount.md new file mode 100644 index 0000000..17fc574 --- /dev/null +++ b/book/src/part2/icount.md @@ -0,0 +1,322 @@ +# Counting instructions, deterministically + +> When you can measure what you are speaking about, and express it in numbers, you know something about it; but when you cannot measure it, when you cannot express it in numbers, your knowledge is of a meagre and unsatisfactory kind. +> +> — Lord Kelvin + +The optimization campaign of Part III makes claims like "−5.3% on the M55 +Q15 pipeline" and expects you to believe the decimal point. This chapter is +about the machinery that makes such a decimal point *mean* something — and +about why the obvious metric, time, had to be fired from that job first. + +## Wall-clock cannot hold a gate + +The project's benchmarks run in CI on shared, virtualized runners: machines +whose actual delivered performance depends on what every other tenant is +doing, what frequency the host decided on, and which physical box the job +landed on today. `docs/PERFORMANCE.md` states the resulting policy without +hedging: *wall-clock benches are never a hard gate on shared runners; they +run as a smoke test and produce trend artifacts only.* + +That policy was not adopted from theory. During the C2 vectorization audit +(Part III), the README's wall-clock table was deliberately *not* +regenerated, because the shared machine was measurably in a different state +than the annotated session that produced the table — about **20% slower +across the board on unchanged code**. Sit with that number for a moment: the +optimization being evaluated in that PR was worth 3.7% on the same metric. +A gate that must detect 3% shifts through 20% ambient swings is not a gate; +it is a random number generator with a pass rate. 
You can fight the noise +statistically — pin runners, repeat runs, compare medians — and projects +do, but every mitigation buys precision with CI minutes and still cannot +promise that a 1% regression fails *deterministically*. + +The library's answer is to gate a different quantity entirely: **executed +instructions**. Run a fixed workload under an emulator, count every guest +instruction that retires, and the result is a property of the *binary*, not +the weather — bit-identical across runs (the project verified this before +trusting it), independent of host load, and, for the scalar code these +embedded targets run, well correlated with real cost. The metrics table in +`docs/PERFORMANCE.md` is careful about that last clause, and so is the end +of this chapter; but first, the machinery. + +## Forty lines of plugin + +QEMU's TCG (Tiny Code Generator) translates guest instructions into host +code one *translation block* at a time, and since QEMU 4.2 it exposes a +plugin API that lets you hook that translation. The project's entire +counting instrument is `tools/qemu_insn_plugin/insn_count.c` — small enough +that its two working functions fit here: + +```c +{{#include ../../../tools/qemu_insn_plugin/insn_count.c:pf_hooks}} +``` + +The design point that matters is `qemu_plugin_register_vcpu_insn_exec_inline` +with `QEMU_PLUGIN_INLINE_ADD_U64`. There are two ways a TCG plugin can count +executions: register a *callback* per instruction — a host function call +every time the guest retires one instruction — or register an *inline +operation*, which asks QEMU to plant a bare 64-bit add into the generated +host code itself. The callback form would multiply the emulation time of a +billion-instruction workload by a large constant; the inline form costs one +host add per guest instruction and no calls. 
`tb_trans` fires once per +translation block *translation* (not execution), walks the block's +instructions, and attaches an inline `+1` to each — after which counting +proceeds at essentially full emulation speed forever, because translated +blocks are cached and re-executed. + +The header comment is candid about the accuracy contract this buys: +"the single counter is exact for our single-vCPU deterministic workloads." +A plain `uint64_t` incremented from generated code would be a data race on +an SMP guest; every target this ratchet gates is a single emulated core +running a single-threaded workload, so the simple counter is exact — and +the precondition is written down where the next porter will read it. + +The second function is the entire output interface: an `atexit` callback +prints one line, `SRT_INSN_COUNT <count>`, through `qemu_plugin_outs()`. That +choice has a trap the driver script had to learn about: + +```python +def qemu_cmd(target: str, plugin: str, binary: str) -> list[str]: + # "-d plugin" routes qemu_plugin_outs() to stderr; without it the count + # line is silently dropped. + if target == "hexagon": + return ["qemu-hexagon", "-d", "plugin", "-plugin", plugin, binary] +``` + +`qemu_plugin_outs` writes to QEMU's *log*, and unless `-d plugin` enables +the plugin log channel, the write goes nowhere — no error, no warning, no +line. The comment in `scripts/icount.py` preserves the discovery so nobody +re-makes it, and the script's parser treats a missing count line as a hard +failure ("plugin not loaded?") rather than a zero, so the silent-drop +failure mode cannot masquerade as a measurement. + +## One binary per scenario + +What gets counted matters as much as how. The counted workloads live in +`bench/icount/` — and they are *not* the Google Benchmark suite, which +auto-tunes its iteration counts to the machine's speed and would therefore +execute a different number of instructions on every run. 
A countable +workload must be **fixed**: same work, same iteration counts, same +everything, decided at compile time. + +`bench/icount/icount_main.cpp` defines seven scenarios — `interpolate()` in +isolation and the full push/pull pipeline, each in float/Q15/Q31, plus a +12-channel Q15 pipeline for the 7.1.4 deployment shape — selected by +preprocessor definitions (`SRT_SC_KIND`, `SRT_SC_TYPE`, `SRT_SC_CH`) into +one binary each, because the bare-metal targets have no argv to select with +at runtime. Each binary runs a deterministic loop (two virtual seconds of +audio through the pipeline; 200 000 interpolations for the kernels), +accumulates a checksum, and ends with: + +```cpp + const bool ok = checksum == checksum; // NaN check + std::printf("SRT_ICOUNT_DONE ok=%d checksum=%.17g\n", ok ? 1 : 0, checksum); +``` + +The three gated targets each run under the QEMU mode that matches their +deployment reality. Hexagon binaries are Linux user-space processes, so +`qemu-hexagon` (user-mode emulation) runs them directly. The two Cortex-M +targets are bare metal: `qemu-system-arm` boots each binary as a kernel on +a full board model — MPS3 AN547 for the M55, MPS2 AN505 for the M33 — with +semihosting for the printf. That fidelity matters for the metric: a +system-mode count includes the startup code, vector table dance, and +runtime the deployed firmware will actually execute, which is why the +plugin counts the whole run and the workloads are sized so the measured +loop dominates. 
+ +The checksum earns its place three times over: it defeats dead-code +elimination (a compiler that deleted the unobserved workload would produce +a spectacular "improvement"); printed to 17 significant digits, it pins +cross-run determinism — if two runs of one binary ever printed different +checksums, the instruction counts would be incomparable and something would +be deeply wrong; and the pipeline workload deliberately poisons it with a +NaN if the converter ever underruns, so a broken configuration cannot +produce a plausible count. `icount.py` refuses to record anything unless +`SRT_ICOUNT_DONE ok=1` appeared. + +## The ratchet, and why it is two-sided + +`scripts/icount.py` glues plugin to workloads: find every `srt_icount_*` +binary in the build directory, run each under the target's QEMU with the +plugin, and compare against the committed `bench/baselines.json` at a +tolerance of ±3%. A scenario with no recorded baseline fails. A recorded +baseline of zero fails. A regression beyond tolerance fails. And — the +clause that makes this a *ratchet* rather than a mere alarm — an +**improvement** beyond tolerance fails too: + +```python + elif delta < -args.tolerance: + # Two-sided: a stale (too-high) baseline would let future + # regressions hide inside the slack, so improvements must be + # committed too. + verdict = ("IMPROVED beyond tolerance — run icount.py --update " + "and commit bench/baselines.json") + failures.append(scenario) +``` + +The two-sidedness was not in the original design. The first version of the +ratchet failed only on regression, which sounds like the point — until an +infrastructure audit traced the incentive structure. Suppose your PR makes +`pipeline_q15` 10% cheaper and you don't update the baseline. CI passes; +everyone is happy; the baseline is now 10% stale. The *next* PR can regress +the same scenario by 9% — undoing nearly all of your win — and CI passes +again, because measured-vs-baseline is still inside the slack. 
Improvements +that go unclaimed become a hiding place for regressions exactly their size. +The audit's fix (the same infrastructure-hardening pass that added the +bare-metal empty-run guard of the previous chapter) makes the gate +symmetric: if you made it faster, you must *say so*, in the same PR, by +re-recording the baseline — `icount.py --update` — and committing the diff. +The improvement becomes reviewable history, the gate snaps tight around the +new value, and there is never slack for anything to hide in. + +`--update` has its own small discipline: it rewrites the target's entry to +*exactly* the measured scenarios, so a renamed or deleted workload cannot +linger in the JSON as a dead gate entry that never fails and never means +anything. + +One boundary of the ratchet is drawn in a CMake naming convention. The +cross-resampler comparison workloads (`docs/COMPARISON.md` runs the same +fixed task through this library and through libsamplerate, per target) are +built as `cmp_icount_*` precisely so that `icount.py`'s `srt_icount_*` glob +never picks them up: competitor counts are *recorded* in the docs with +their date and toolchain, but not *gated*. The distinction is deliberate. +A gate on someone else's code would fail on their releases, punish this +project for their regressions, and pressure nobody who can act on it; a +gate is a promise, and you can only promise about code you maintain. + +The tolerance deserves a sentence, because "±3% on a deterministic count" +sounds contradictory. Counts are bit-identical across runs *of one binary*; +the slack absorbs a different variation: innocuous recompilation effects. +Code layout, inlining decisions, and register allocation shift by fractions +of a percent when unrelated code changes; the C6 work measured its embedded +control scenarios at exactly 0.00% only because nothing in their path +changed. 
Three percent is wide enough that touching a comment never fails +the gate, and narrow enough that the +6–8% cost of a runtime flag in a hot +loop — a real mistake, caught by this exact gate during C6 and fixed with a +compile-time gate before merge — cannot pass it. + +## Baselines are compiler-dependent, by design + +An instruction count is a property of the binary, and the binary is a +product of the compiler. When the CI image's `gcc-arm-none-eabi` or +hexagon-clang package updates, every count moves a little, and the ratchet +job fails on unchanged library code. `docs/PERFORMANCE.md` is explicit that +this is **working as intended, not a flake**: the response is to re-record +the baselines in a reviewed commit whose diff *is* the record of what the +toolchain update did to the library's cost. The alternative — normalizing +counts, or pinning tolerances wide enough to ride out compiler churn — +would trade an occasional, explainable, reviewable failure for permanent +blindness to exactly the kind of shift a performance-conscious project most +wants to see. + +The same philosophy shows up in how the tools themselves are provisioned. +The plugin compiles against a `qemu-plugin.h` pinned to the exact commit +QEMU 8.2.2's tag pointed at, checksum-verified on download. And the Hexagon +leg builds its own emulator: neither Debian's `qemu-hexagon` nor the one +bundled with the CodeLinaro toolchain enables TCG plugins, so CI compiles a +plugin-capable `qemu-hexagon` from the pinned QEMU source (linux-user +target only, cached thereafter). A measurement gate whose instruments are +unpinned is a gate whose meaning can change without a diff. + +## What instructions do and do not predict + +Time to honor the caveat. 
An instruction count is not a cycle count, and +the project's documentation never claims otherwise — the metrics table +says "well-correlated with real cost **for scalar code**," and +cycle-accurate numbers are explicitly delegated to vendor simulators or +hardware counters. + +Where the correlation is good: in-order scalar cores running out of +tightly-coupled memory, which describes the Cortex-M33 and M55 targets +closely. Most instructions are single-cycle, there is no cache hierarchy to +miss in, and a 5% instruction reduction is a real, similar-sized cycle +reduction. + +Where it bends: anything that changes the *mix* rather than the count. +The C3 fixed-point phase accumulator made the M55 float pipeline count +**+1.4%** worse — it replaced hardware-double operations with int64 +sequences, more instructions of cheaper mix — and the project accepted the +regression for the cross-target win, with the reasoning in the PR rather +than hidden in an average. + +Where it bends furthest is Hexagon, and the reason is architectural: +Hexagon is a VLIW machine that issues *packets* of up to four instructions +per cycle. Two versions of a loop with identical instruction counts can +differ meaningfully in cycles depending on how well their instructions pack +into packets — and conversely, removing instructions that packed for free +saves nothing. The C5 experiment (Part III) is the cautionary tale: a +hand-written `vrmpyh` wide-MAC kernel, proven bit-exact, verified by +disassembly to contain ten wide MACs where the baseline had zero, measured +**−0.31%** — 119,847,854 to 119,478,758 instructions on `pipeline_q15`. The +instruction metric faithfully reported that the change barely mattered; on +a VLIW machine it takes packet-level analysis (or silicon) to know whether +even that number survives translation to time. 
+ +The project's calibration path for the gap is hardware, and it ships in the +repository: `examples/pico2_cyccnt/` is a flashable RP2350 firmware that +runs the *same* `runPipeline` workload as the icount scenarios — 32-frame +push/pull blocks, 997 Hz sine, 1 000 warm-up and 2 000 measured iterations +— timed per block with the Cortex-M33's DWT.CYCCNT cycle counter, printing +mean/p99/max cycles per block, cycles per frame, and the fraction of a +150 MHz core one 48 kHz stream costs. Correlating those cycle figures +against the committed M33 instruction baselines yields the +cycles-per-instruction ratio for exactly this code on exactly that silicon +— after which the deterministic, CI-friendly instruction gate can be read +in real-time units. Until that correlation is run on hardware you own, the +documentation deliberately states the M33 figures as instruction *budgets*, +not cycle claims; the truth-sweep audit that enforced that wording appears +again in the next chapter. + +## The last mile: numbers that cannot go stale + +A gated number that is hand-copied into a README is a number waiting to +rot. The published instruction-count table is therefore not written by +anyone: `scripts/update_icount_docs.py` regenerates it **1:1 from +`bench/baselines.json`** — every row, every comma — between +a pair of begin/end markers in `README.md`, and the CI +ratchet job's final step is: + +```sh +python3 scripts/update_icount_docs.py +git diff --exit-code README.md || { + echo "::error::README icount table is stale; run scripts/update_icount_docs.py"; exit 1; } +``` + +Regenerate and diff. If the committed README does not match the committed +baselines exactly, the build fails — so the numbers a visitor reads are, by +construction, the numbers the gate enforces. 
It is the same commitment this +book makes with live-included code, applied to a table: *derived artifacts +must be derived, in CI, every time, or they are testimony rather than +evidence.* + +## Verify it yourself + +```sh +# Build the counting plugin (fetch qemu-plugin.h for QEMU 8.2.x first; +# ci.yml pins the exact URL and checksum): +gcc -shared -fPIC $(pkg-config --cflags glib-2.0) -I/path/to/plugin-header \ + tools/qemu_insn_plugin/insn_count.c -o /tmp/libinsncount.so + +# Cross-build the fixed workloads and run the ratchet (arm-none-eabi-gcc): +cmake -B build-m55 -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m55-mps3.cmake \ + -DSRT_BUILD_TESTS=OFF -DSRT_BUILD_EXAMPLES=OFF -DSRT_BUILD_ICOUNT_BENCH=ON +cmake --build build-m55 -j +python3 scripts/icount.py --target m55 --build-dir build-m55 \ + --plugin /tmp/libinsncount.so + +# Determinism: run any one binary twice and compare the counts exactly. +qemu-system-arm -M mps3-an547 -nographic -semihosting -d plugin \ + -plugin /tmp/libinsncount.so -kernel build-m55/bench/icount/srt_icount_pipeline_q15 + +# See the two-sided gate work: re-run icount.py with --tolerance 0.0001 +# and watch benign recompilation deltas fail in *both* directions. + +# The docs-freshness gate: +python3 scripts/update_icount_docs.py && git diff --exit-code README.md +``` + +And the experiment that motivates the whole chapter: run any wall-clock +benchmark from `bench/` twice on a shared machine, an hour apart, and +compare. The instruction counts you just produced will not have moved by a +single instruction; the nanoseconds will tell you about the machine's day. diff --git a/book/src/part2/notebooks.md b/book/src/part2/notebooks.md new file mode 100644 index 0000000..a9441b6 --- /dev/null +++ b/book/src/part2/notebooks.md @@ -0,0 +1,323 @@ +# Notebooks as calibrated instruments + +> There are three kinds of lies: lies, damned lies, and statistics. 
+> +> — popularized by Mark Twain, who credited Benjamin Disraeli + +The previous two chapters covered claims a machine can gate: thresholds in +tests, instruction counts in a ratchet. But some of this project's most +consequential claims are not pass/fail propositions. *How much worse is a +naive FIFO?* *What does block size cost in latency and pitch stability?* +*How does the converter measure against libsamplerate, soxr, and two +hardware ASRC chips, under one definition of THD+N?* Answering those takes +plots, long simulated runs, and a measurement methodology that itself needs +defending — which is to say, it takes a lab notebook. The repository has +three, under `notebooks/`, and they are treated with the same severity as +the test suite: **committed with their outputs, calibrated before they +measure, and pinned with assertions so that a regression fails the re-run.** + +This chapter is about that discipline — and about five specific ways a +quality measurement can lie, each of which this project actually hit, and +each of which is now encoded in the notebooks as a guard, a docstring, or a +scar. + +## Three instruments, one method + +**`asrc_demo.ipynb`** is the front door: it loads the library through its C +ABI with `ctypes` (no Python bindings, ~80 lines of wrapper), reproduces +the naive-FIFO disaster, then walks lock acquisition, transparency, +spectrograms, latency, drift tracking, and dropout recovery. Its committed +outputs are where the README's "what does it sound like" numbers come from: +clicks roughly ten times per second at 29 dB SNR for the naive path, +126.4 dB for the converter under the notebook's instrument. + +**`asrc_block_size_study.ipynb`** answers a deployment question: what +happens at block sizes 32, 64, and 240 frames? 
Its committed conclusion — +Track-stage operation turns block quantization into cent-scale, low-rate FM +over a 53–61 dB wideband floor, while designed latency scales as roughly +`2·B/fs + 0.5 ms` — is quoted by `docs/COMPARISON.md` whenever coarse-block +operation comes up. + +**`asrc_comparison.ipynb`** is the adversarial one: a single AES17-style +measurement implementation applied identically to SampleRateTap, +libsamplerate `sinc_best`, soxr `VHQ`, and a naive FIFO, with the deck +deliberately stacked *against* the home team — the libraries are handed the +exact clock ratio as an oracle, while the converter must discover it from +FIFO occupancy and still gets measured on the result. Every software number +in `docs/COMPARISON.md`'s tables is a committed output of this notebook. + +All three share a spine: the deterministic two-clock simulation from the +[tests chapter](tests.md), re-implemented in a few lines of Python around +the C ABI. Producer and consumer events interleave by next-event virtual +time, so a +200 ppm producer delivers its extra sample every 5 000 exactly, +and a re-run reproduces the committed outputs. Determinism is what makes +"committed outputs" meaningful — a notebook whose numbers wander between +runs is a screenshot, not an instrument. + +Why notebooks at all, rather than more tests? Because the *output* here is +the plot and the number-in-context, not a boolean; because the runs are +minutes long and belong in a manually-triggered lab rather than every CI +push; and because a reader deciding whether to trust the library should be +able to see the methodology, the code, and the result in one document — +then re-execute it. The committed outputs are the published lab record; the +re-run is the replication. 
+ +## Calibrate the instrument before believing it + +The core discipline, stated as a rule: **no measurement function in these +notebooks reports on the converter until it has first reported on a +synthetic signal with a known answer, in the same notebook, above the +measurement.** + +The comparison notebook builds an AES17-style THD+N meter — and then +immediately feeds it a pure 997 Hz tone plus white noise injected at +exactly −100 and −130 dBFS, computes what a perfect meter must read (the +injected level, corrected for the fundamental's RMS and the fraction of +white noise falling in the 20 Hz–20 kHz integration band), and asserts +agreement within half a dB: + +```python + got, f0 = thdn_db(sig, 997.0) + ... + assert abs(got - expect) < 0.5 +print("instrument calibrated") +``` + +Only after `instrument calibrated` prints does any subject get measured. +The block-size study does the same for a subtler instrument — a +decomposition of a near-sinusoid into low-rate pitch modulation (in cents) +and a wideband noise floor — by synthesizing a tone with *exactly* 1 cent +of 10 Hz FM over a −120 dB noise floor. The committed output reads: + +```text +calibration: peak 1.000 cents (true 1.000), rms 0.707 (true 0.707), +wideband 111.0 dB (true ~111) +``` + +That calibration cell carries the project's most candid admission, in its +own markdown: "This cell earned its keep: three earlier formulations of the +split each leaked modulation into the noise figure, and the calibration +caught every one." One of those three failures survives as a docstring on +the low-pass filter inside the decomposition — a boxcar smoother's passband +droop left a percent-level copy of the modulation in the high-passed +remainder, silently bounding the measurable floor — and another as a +warning that reconstruct-and-subtract in the signal domain fails subtly +(sub-split phase errors multiply the carrier). 
Without the synthetic-signal +check, every one of those buggy instruments would have produced a plausible +wrong number about the converter, and the notebook would have published it +with a straight face. Calibration converts "my measurement code is probably +right" into a demonstrated property, at the cost of one cell. + +## Pin the result, or the notebook is a brochure + +Every notebook ends its key measurements with `assert`. The demo, after +measuring transparency: + +```python +assert snr_asrc > 125.0, "transparency regression" +``` + +The comparison, after the full table: + +```python +first = names[0] +assert thdn[first] < -130 and dr24[first] > 130 +``` + +The block-size study, after the FM decomposition — with a comment that +names the philosophy: + +```python + # Documented behavior as of this measurement: FM peaks stay below the + # ~5-8 cent audibility region (B=240 gets closest) and the wideband + # floor stays above 50 dB. These pin behavior, not aspiration. + assert metrics[B]["cents_peak"] < 5.0, f"FM at B={B} reached audibility" +``` + +This is the notebook version of the test suite's +thresholds-just-under-measured convention. A notebook without assertions +degrades into marketing: it gets re-run after some future change, a plot +looks subtly worse, and nobody's eye catches it. With assertions, re-running +the notebook *is* a regression test — `docs/COMPARISON.md` says exactly +this in its caveats: software figures "regenerate by re-running the +comparison notebook; its assertions pin SampleRateTap's results so +regressions fail the run." The notebook is simultaneously the lab record +and the gate on its own claims. + +The rest of this chapter is the honest-measurement traps those assertions +and calibrations exist to catch — each one a mistake this project made, or +nearly made, with the receipt still in the file. 
+ +## Trap one: your window lies about the floor + +A 997 Hz fundamental at −1 dBFS sits some 130 dB above the residual being +measured. Take a plain FFT of that and the window function smears the +fundamental's energy across the spectrum at the window's sidelobe level — +with common windows, far above the thing you are trying to see. The +notebooks handle this on two fronts. For *display*, the demo's spectrum +helper documents its choice: a Kaiser window with β = 24, "sidelobes +~−190 dB, so a −130 dB noise floor is actually visible." For *measurement*, +no window is trusted at all: the comparison notebook refines the +fundamental's frequency by the phase-slope method (per-window phase of a +least-squares fit, regressed against time — "precision far beyond FFT bins, +which a 130 dB measurement needs," as its markdown puts it), then removes +the fundamental by a single global least-squares fit, *exactly*, before any +spectrum is taken. Only the residual — fundamental already subtracted — +meets an FFT, and then only for integration. A ±20 Hz notch around the +fundamental catches what the fit leaves; the notebook notes this notch is +far *narrower* than AES17 permits hardware testers, a conservatism that +works against the software subjects in every comparison. + +This is the same decision the test suite made with its tracked sine fit, +arrived at for the same reason: at these dynamic ranges, subtraction is +exact and windows are not. + +## Trap two: measure the converter, not its transient + +An ASRC has stages. Fresh from a cold start it acquires; once locked it +tracks; given sample-granular occupancy data for long enough, the servo +promotes to its low-bandwidth Quiet stage — and the residual keeps +improving for tens of seconds as the loop forgets its own acquisition. A +measurement window placed too early reads the servo's history, not the +converter's quality. + +The numbers make the point better than prose. 
The comparison notebook runs +32 seconds and discards the first 25 before analyzing ("we analyze its +output well after the servo's Quiet stage engages," its markdown says). The +48 kHz quality tests run 40 seconds and analyze the final one. And when the +16 kHz suite was built by scaling the servo bandwidths with the sample rate, +the settle time scaled *inversely*: the quiet loop lands at ~0.017 Hz, and +the suite had to run **120 seconds** — the same number of samples, the same +number of loop time constants as 40 s at 48 kHz — with the test's comment +recording that a 40-second run still sits ~15 dB above the settled +residual. Fifteen decibels is the difference between a correct claim and an +embarrassing one, controlled entirely by *when you look*. + +The flip side matters equally: the block-size study measures the Track +stage *on purpose*, because block-fed deployments never reach Quiet — that +is the regime under study. Neither window placement is "right"; what is +right is that each notebook states which regime it is measuring and why. + +## Trap three: the flush at the end of the stream + +The comparison notebook hands each competitor the same input and analyzes a +window of its output. Where you cut that window turned out to matter more +than anything else in the file: + +```python +def mid_window(y, analyze_s, guard_s=1.0): + """Trim both ends: one-shot converters flush a filter tail at the end of + the stream, and including it poisons the measurement by ~60 dB (found + the hard way; a control experiment at 2:1 exposed it).""" + y = np.asarray(y, dtype=np.float32) + end = len(y) - int(guard_s * FS) + return y[end - int(analyze_s * FS):end] +``` + +A one-shot resampler API, given the whole stream at once, drains its filter +state at the end — a tail of samples that are not steady-state conversion +output. 
Include that tail in the analysis window and the measured THD+N +degrades by roughly **60 dB**: enough to make soxr, a −150 dB converter, +look mediocre. The bug was found "the hard way," and the +docstring preserves how: a control experiment at a 2:1 ratio — where the +correct answer was known independently — read absurdly wrong, and the +investigation traced it to the tail. Every one-shot subject is therefore +measured on a mid-stream window that ends a one-second guard before the +end of the stream. + +Note whose numbers this guard protects: the *competitors'*. An honest +comparison has to be most careful about errors that flatter the home team, +and an unguarded tail window would have been exactly that kind of error. + +## Trap four: comparing float software to 24-bit silicon + +The comparison's final tables land next to datasheet values for the AD1896 +and SRC4392 — hardware ASRCs measured at their pins, which are 24 bits +wide. A float32 pipeline has no fixed noise floor at all (its noise scales +down with the signal), so its "native" dynamic range mostly measures the +arithmetic format, not the converter. Quoting float numbers against silicon +datasheets would be a category error dressed as a benchmark. + +The notebook's equalizer is five lines: + +```python +def q24(y): + """Round to a 24-bit interface, undithered -- what a hardware ASRC + presents at its pins. The equalizer that makes software and silicon + numbers directly comparable.""" + return np.round(np.asarray(y, np.float64) * 8388608.0) / 8388608.0 +``` + +Every subject's output is measured both ways, and `docs/COMPARISON.md` +leads with the 24-bit columns as the chip-comparable condition. 
The result +reads differently than bravado would: at that interface the oracle-fed +libraries measure at the 24-bit format ceiling itself (~−143.5 dB THD+N), +all three real converters share the identical 149.1 dB A-weighted +dynamic-range ceiling, and SampleRateTap's −132.1 dB sits ~11 dB behind the +oracles — a gap the document does not explain away but *prices*: it is the +measured cost of solving the clock-recovery half of the problem, which the +libraries do not attempt. Even so, the caveats refuse the flattering frame +in the other direction too: datasheet numbers come from analog test loops +with wider notches, and "a pristine-digital software measurement and a +bench measurement of a chip are comparable in definition, not in +environment." + +## Trap five: the summary cell nobody executes + +The last trap is the quietest, and this project walked into it. The demo +notebook's measurement cell printed, in its committed output: + +```text +ASRC SNR: 126.4 dB | naive: 29.4 dB | improvement: 97 dB +``` + +with `assert snr_asrc > 125.0` enforcing it. The *summary table* at the +bottom of the same notebook claimed "SNR > 130 dB." Nothing failed. Nothing +could fail: markdown does not execute, so no assertion, calibration, or +re-run will ever check a number typed into prose. The two cells sat a few +screens apart, one measured and one remembered, disagreeing by 4 dB — the +one place a documentation audit found the repository overstating its own +results. (The measured 135 dB figure from the test suite is real, but it is +a *different instrument* — a tracked global fit over a different window — +and a summary must quote its own cell, not the best number available +elsewhere in the repo.) The fix was the boring, correct one: the summary +now states 126.4 dB and points at the assertion. 
+ +The lesson generalizes beyond notebooks: **summaries drift from cells the +same way READMEs drift from benchmarks and comments drift from code.** +Executable claims stay honest by execution; prose claims stay honest only +by audit. This project's response operates at both levels — push every +number it can into asserted, regenerated, machine-checked form (the test +thresholds, the icount table's regenerate-and-diff gate, the notebook +assertions), and schedule adversarial audits for the residue that only +prose can carry. This book is itself downstream of that lesson: the code +you read here is included live from the headers, because an author's +summary of code is just one more markdown cell. + +## Verify it yourself + +```sh +# Build the C ABI once; the notebooks find (or build) it themselves: +cmake -B build -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_CAPI=ON +cmake --build build --target srt_capi -j + +# Re-run each instrument end to end; any pinned regression fails the run +# (deps: numpy, matplotlib, plus samplerate and soxr for the comparison): +jupyter nbconvert --to notebook --execute notebooks/asrc_demo.ipynb +jupyter nbconvert --to notebook --execute notebooks/asrc_block_size_study.ipynb +jupyter nbconvert --to notebook --execute notebooks/asrc_comparison.ipynb + +# Watch a calibration catch a broken instrument: in asrc_comparison.ipynb, +# widen the notch (notch_hz=20.0 -> 2000.0) or in the block-size study +# replace lowpass_fft with a boxcar mean — the synthetic-signal cell fails +# before any subject is measured. + +# The traps, in the sources' own words: +grep -rn "poisons the measurement" notebooks/asrc_comparison.ipynb +grep -rn "earned its keep" notebooks/asrc_block_size_study.ipynb +grep -rn "pin behavior, not aspiration" notebooks/asrc_block_size_study.ipynb +``` + +The demo notebook's summary table is the one artifact in this chapter that +no command can verify — which is the point. 
Read it next to the measurement +cell above it, check that the numbers agree, and you will have performed, +by hand, the audit that fixed it. diff --git a/book/src/part2/tests.md b/book/src/part2/tests.md new file mode 100644 index 0000000..51c4721 --- /dev/null +++ b/book/src/part2/tests.md @@ -0,0 +1,423 @@ +# Tests as specifications + +> Program testing can be used to show the presence of bugs, but never to show their absence! +> +> — Edsger W. Dijkstra + +Part I ended each chapter with a list of tests. This chapter is about what +those tests actually *are* — because in this project they are not the usual +smoke detectors bolted on after the fact. They are the specification. The +README publishes a table of signal-to-noise figures; the reason that table +can be trusted is not editorial diligence, it is that every number in it has +a test asserting something just below it, and CI runs the assertion on every +push. `docs/PERFORMANCE.md` states the policy in one line: "The SNR table is +already enforced by test thresholds." + +That sentence hides three design problems, each with a wrong answer that +most test suites pick by default. How tight do you pin a measured quantity? +How do you make a two-clock, two-thread, analog-flavored system produce the +same bits every run? And how do you measure 135 dB of fidelity without your +measuring instrument lying to you? The suite's answers are the subject of +this chapter. + +## Thresholds a few dB under reality + +Here is the convention, straight from the top of the quality suite +(`tests/test_asrc_quality.cpp`): + +```cpp +// Thresholds sit 4-7 dB under measured performance (135/120/113/106 dB for +// balanced at 997/6k/12k/19.5k; 133/108 dB for transparent). The residual at +// high frequencies is dominated by the linear interpolation between adjacent +// phase-table rows, which falls ~12 dB per doubling of numPhases and rises +// ~12 dB per octave of signal frequency. 
+``` + +And a representative enforcement: + +```cpp +TEST(AsrcQuality, Balanced997Hz) { + EXPECT_GT(measureSnrDb(srt::FilterSpec::balanced(), 997.0), 128.0); +} +``` + +Measured 135.0, asserted 128.0. Consider the two alternatives this rejects. + +**A loose threshold** — say, "SNR must exceed 60 dB, comfortably transparent +for casual listening" — turns the test into a tautology. The converter could +regress by seventy decibels, an *enormous* defect by this library's +standards, and CI would stay green while the README continued to advertise +135 dB. A loose threshold means the published claim and the enforced claim +are different claims, and only the weaker one is real. This suite's position +is that a quality number you publish is a number you gate, at very nearly +the value you publish. + +**An exact threshold** — asserting 135.0 because you measured 135.0 — fails +for the opposite reason: the measurement is a physical quantity with +legitimate variation. Different hosts, compilers, and math libraries move +the residual by fractions of a dB; the float path's strict double +accumulation keeps outputs bit-stable per platform but not across them. The +4–7 dB of headroom is sized to absorb that variation and nothing else: any +*algorithmic* regression — a filter redesign that loses stopband, a servo +change that leaks more clock noise into the passband — costs whole decibels +and lands outside the slack. + +The comment carries a second load worth noticing: it explains *where the +residual comes from* (phase-table interpolation, with its 12 dB scaling laws +in both `numPhases` and signal frequency). 
That converts the threshold table +from arbitrary constants into a checkable physical model — when the 16 kHz +suite was added later, its expectations could be *predicted* from the same +model (the residual depends on the normalized frequency f/fs, so tones at +the same f/fs should measure the same), then measured, and they matched +within about 1 dB (`tests/test_asrc_quality_16k.cpp` records both sets of +numbers). A threshold you can predict is a specification; a threshold you +can only observe is a snapshot. + +The convention also imposes a maintenance discipline that deserves to be +stated honestly: when performance *improves*, the thresholds are stale and +must be re-pinned upward, or the enforcement quietly loosens. That happened +in this repository — the Q0.64 phase accumulator (Part III) improved the +997 Hz figure to 135.0 dB, and a subsequent documentation audit re-aligned +the published headline and threshold comments to the post-change reality. +The instruction-count ratchet in the next chapter solves the same +staleness problem mechanically, with a two-sided gate; the quality suite +solves it by convention and audit. The difference is instructive: ±3% on a +deterministic integer can be automated; "4–7 dB under a measurement that +legitimately varies by platform" still needs a human to re-pin. + +## The two-clock simulator + +Every quality number above comes from the same experimental rig, and it fits +in a page of header (`tests/support/two_clock_sim.hpp`). The problem it +solves: the converter's whole reason to exist is that *two independent +clocks* drive it, but tests that use two real threads and real timers are +nondeterministic — schedulers differ, load differs, and a 0.2 dB shift in a +measurement could be the code or could be the machine. For metrology you +want the clocks without the threads. 
+ +The rig is a struct of knobs: + +```cpp +{{#include ../../../tests/support/two_clock_sim.hpp:pf_knobs}} +``` + +and one loop: + +```cpp +{{#include ../../../tests/support/two_clock_sim.hpp:pf_run}} +``` + +This is discrete-event simulation reduced to its minimum. Two virtual +clocks, `tIn` and `tOut`, advance in *virtual time*: a producer event pushes +`chunkIn` frames and advances `tIn` by `chunkIn / fsIn`; a consumer event +pulls `chunkOut` frames and advances `tOut` by `chunkOut / fsOut`; whichever +clock is behind fires next. With `fsIn = 48 000 × (1 + 200 ppm)` and +`fsOut = 48 000`, the producer naturally lands one extra sample every 5 000 +— the exact asynchrony a real capture/playback pair exhibits, with zero +dependence on the host scheduler. Runs are exactly reproducible: same +sequence of pushes and pulls, same occupancy trajectory seen by the servo, +same output samples, every time, on every machine. + +Why determinism beats realism for regression work: + +- **A failure is a coordinate, not a weather report.** When + `Balanced19_5kHz` drops below 100 dB, re-running reproduces the identical + run; you can bisect it, instrument it, and diff intermediate state against + a good commit. A threads-and-timers failure reproduces "sometimes." +- **Thresholds can be tight.** The 4–7 dB convention above is only possible + because run-to-run variance is zero; scheduler-dependent tests must budget + slack for the scheduler, and that slack is exactly where regressions hide. +- **The interesting parameter becomes controllable.** Transfer granularity + — how many frames move per event — is a *physical property of real + deployments* (sample-synchronous codecs at one extreme, USB and network + audio moving multi-frame bursts at the other), and it changes converter + behavior: the servo promotes to its low-bandwidth Quiet stage only when + occupancy is observed at fine granularity. 
The quality suites run + `chunkIn = chunkOut = 1` to reach the Quiet stage; the multichannel short + variants run `chunk = 8` deliberately, to certify the Track stage that + block-fed deployments actually live in. In a real-threads test, + granularity would be an accident of scheduling; here it is an axis of the + test matrix. +- **Slow clock dynamics are testable at all.** `fsInScale` lets a test ramp + the input rate — the lock suite sweeps drift ramps and asserts the servo + follows without unlocking — which on real hardware would require a + programmable oscillator and a lab. + +What determinism deliberately does *not* cover is the one thing it removes: +real concurrency. The memory-ordering claims of the ring buffer are tested +by the separate two-thread stress under ThreadSanitizer (the +[ring chapter](../part1/spsc-ring.md) walks its limits). The division of +labor is explicit — realism where realism is the subject, simulation +everywhere else — and the technique travels: the same virtual-time +interleaving reappears in Python inside every notebook of the +[notebooks chapter](notebooks.md). + +One number shows what the rig's determinism costs in patience rather than +trust. The quality runs last 40 virtual seconds because, as the test's +comment puts it, "the 0.05 Hz locked loop must fully forget the acquisition +transient before the measurement window" — and only the final second is +analyzed. At 16 kHz the servo bandwidths scale down with the rate, so the +same suite runs 120 seconds to cover the identical number of loop time +constants; its comment records that a 40 s run still sits ~15 dB above the +settled residual. Deterministic time is cheap; *skipping* settling time is +how you measure your transient instead of your converter. + +## Sine-fit metrology + +The simulator produces a signal; something must turn it into a decibel +figure, and at 135 dB the instrument is the hard part. 
The suite's +instrument (`tests/support/sine_analysis.hpp`) is a least-squares sine fit: +model the output window as `a·sin(ωi) + b·cos(ωi) + c`, solve the 3×3 +normal equations for the best-fit fundamental, subtract it *exactly*, and +call everything that remains — harmonics, images, servo noise, quantization +— the residual. `snrDb()` is then the fitted fundamental's power over the +residual's. + +Why a fit instead of an FFT? Because subtraction is exact and windows are +not. A windowed spectrum smears the near-full-scale fundamental across +neighboring bins at the window's sidelobe level; measuring a residual 135 dB +down *under* that skirt means fighting your own instrument. The fit has no +window: the fundamental is removed to the precision of the arithmetic +(double throughout), and the method's own floor sits far below anything the +converter produces. (The notebooks meet the same problem with the same +answer, plus a notch — that chapter tells the ~60 dB horror story that +motivates the extra guard.) + +One refinement matters enough to justify its own function. `fitSine` +requires the frequency; `fitSineTracked` *finds* it, starting from the +nominal value: + +```cpp + for (int iter = 0; iter < 4; ++iter) { + const SineFit a = fitSine(x.first(half), f); + const SineFit b = fitSine(x.subspan(half), f); + // b.phase is relative to the second half's start; predict it from a. + const double twoPi = 2.0 * std::numbers::pi; + const double predicted = a.phase + twoPi * f * static_cast(half); + const double dphi = std::remainder(b.phase - predicted, twoPi); + f += dphi / (twoPi * static_cast(half)); + } +``` + +Fit each half of the window; if the assumed frequency is slightly wrong, the +second half's phase arrives shifted from where the first half's fit predicts +it; the shift, divided by the half-window's span, is the frequency error. +Four iterations converge far below the starting error. + +The reason this exists is a property of the device under test. 
An ASRC's +rate estimate converges *asymptotically* — the Quiet-stage loop is +deliberately slow, so even after a 40-second run the estimate can sit a +fraction of a ppm off the true ratio. A rigid fit at the nominal frequency +would see the output tone microscopically detuned from the model and book +the mismatch as residual: a completely inaudible frequency offset, misread +as noise. Tracking the fundamental before measuring distortion is exactly +what commercial THD analyzers do, and the header's comment says so — the +test instrument follows metrology practice, not convenience. + +But an instrument that *tracks* the signal could also *excuse* it: a +converter that genuinely played the wrong pitch would have its error +absorbed into the tracked frequency and measure clean. The suite closes +that hole with a guard on the tracker itself: + +```cpp + // The tracked frequency must still match the true clock ratio closely. + EXPECT_NEAR(fit.freqNorm / nuOutExpected, 1.0, 2e-6); +``` + +The fit may refine the frequency, but only within 2 ppm of what the clock +ratio dictates — enough for servo convergence tails, nowhere near enough to +hide a real pitch error. Every use of the tracked fit carries this check. +It is the measurement-code version of a lesson this book keeps repeating: +whenever you give a tool freedom, pin the freedom. + +## Crosstalk that cannot hide, leakage that cannot masquerade + +Single-channel quality metrics are structurally blind to a whole class of +multichannel bugs: swap two channels in the deinterleave, or bleed a percent +of channel 3 into channel 4, and every per-channel SNR still measures +perfect. `tests/test_multichannel.cpp` exists for exactly those bugs, and +its design is a small case study in adversarial measurement. 
+ +The setup: one converter instance, every channel carrying a *distinct* tone +— `600 + 731·c` Hz, non-harmonically related, all inside the flat passband +for up to 16 channels — with per-channel phase offsets to decorrelate the +waveforms. After conversion across the usual +200 ppm crossing, each channel +must contain its own tone at full quality and nothing measurable of any +other channel's. The deployment shapes are real: 12 channels is 7.1.4 +surround, 16 is an AVB stream bundling reference microphones with the +program feed. + +The subtlety is in the analysis order, and the file header explains it: + +```cpp +// Method: own tone is removed by tracked least-squares fit; the other +// channels' frequencies are then fitted on the residual, so the own tone's +// spectral leakage (about -67 dB at these spacings over a 1 s rectangular +// window) cannot masquerade as crosstalk. The fit noise floor on the +// residual is ~43 dB below the residual RMS, far under every threshold. +``` + +Fit channel *k*'s frequency directly on channel *c*'s raw signal and the +finite one-second window makes channel *c*'s own tone leak energy into that +fit at about −67 dB — the test would "detect" crosstalk at −67 dB on a +converter with none, capping the assertable threshold right there. Removing +the own tone first (exact subtraction of the tracked fit) drops the +masquerade floor to the fit noise on the residual, far under every +threshold. Order of operations *is* the instrument here: same data, same +fits, and only one sequencing yields a measurement capable of asserting +−100 dB. The pinned claims follow the quality suite's convention: crosstalk +below −100 dB per channel for float (−72 dB for Q15, whose own quantization +floor is the binding constraint), with amplitude and SNR checked alongside. 
+ +One more design decision hides in the channel counts of the short variants: + +```cpp +// Channels 5 and 7 are the only counts that reach the channel-parallel +// K=2 and K=1 remainder tiles (8/4/2/1 tiling: 5 = 4+1, 7 = 4+2+1) — the +// audit found those tiles had zero coverage. +``` + +The C6 optimization (Part III) processes channels in register-blocked tiles +of 8, 4, 2, and 1. Testing 2, 12, and 16 channels — every *deployment* +shape — exercises only the wide tiles. Five and seven channels are useless +deployment shapes and ideal test shapes: they force the remainder paths. An +audit found those tiles had zero coverage across the entire suite; the fix +was not more assertions but better-chosen *inputs*. Coverage lives in the +test matrix, not the expectation count. + +## The bare-metal one-shot, and the filter that needed a test + +On the Cortex-M55 and M33 CI legs, the suite runs as a bare-metal kernel +under `qemu-system-arm`: no OS, no filesystem, no command line. That +environment breaks three assumptions ordinary gtest runs lean on, and +`tests/bare_metal_main.cpp` plus `tests/CMakeLists.txt` repair them one by +one — each repair with a story. + +**No argv** means no `--gtest_filter` from the harness, so the +emulation-appropriate filter is baked into a custom `main()`: + +```cpp + ::testing::GTEST_FLAG(filter) = "-AsrcQuality*:AsrcLock.*:Servo.*:Kaiser.*MeetsSpec:" + "FixedPoint.AsrcQuality*:" + "FixedPoint.FullScaleSineDoesNotWrapQ15:" + "MultiChannel.*:Feasibility.*:Reset.*"; +``` + +**No reliable exit codes** — semihosting does not dependably propagate a +process status through the emulator — means the run is judged on text. 
+CTest watches for a sentinel: + +```cmake + add_test(NAME srt_tests_emulated COMMAND srt_tests) + set_tests_properties(srt_tests_emulated PROPERTIES + PASS_REGULAR_EXPRESSION "SRT_TESTS_COMPLETE rc=0" + FAIL_REGULAR_EXPRESSION "\\[ FAILED \\]" + TIMEOUT 1800) +``` + +The sentinel is printed as the *last* act of `main()`, after +`RUN_ALL_TESTS()` returns — deliberately, so a crash after gtest's own +summary (a static destructor, a late fault) cannot register as a pass. The +`FAIL_REGULAR_EXPRESSION` is a second, independent tripwire: even if a +mangled run somehow emitted the sentinel, any visible test-failure line +still fails the CTest. + +**Nobody watching** is the third broken assumption, and its repair has the +best history. `RUN_ALL_TESTS()` returns 0 when every selected test passes — +including when the filter selects *zero* tests. A typo in that baked-in +filter string would produce an empty run, print the sentinel with `rc=0`, +and turn the entire on-target suite green forever. An infrastructure audit +realized this, and the guard went in: + +```cpp + const int selected = ::testing::UnitTest::GetInstance()->test_to_run_count(); + if (selected < 15) { + std::printf("only %d tests selected (expected >= 15): filter is broken\n", selected); + std::printf("SRT_TESTS_COMPLETE rc=1\n"); + return 1; + } +``` + +Two details show the care level. The count is checked *after* the run, +because gtest applies the filter inside `RUN_ALL_TESTS()` — read it before +and it is always zero, which was verified on target rather than assumed. +And the bound is 15 against a selection of roughly 20, leaving headroom for +legitimate test removals without masking a typo. + +The guard was not paranoia; the filter had *already* had a real bug. When +the 16 kHz quality suite (`AsrcQuality16k`) was added, the exclusion then +read `-AsrcQuality.*` — and in gtest filter syntax, unlike regex, `.` is a +literal character. 
`AsrcQuality.*` matches `AsrcQuality.Balanced997Hz` but +not `AsrcQuality16k.Balanced333Hz`, so the new two-minute simulations would +have quietly joined every bare-metal CI run, at emulation speed. The fix +widened the pattern to `AsrcQuality*` (no dot). Look back at the filter +string and you can now read its dots as deliberate: `MultiChannel.*` — +*with* the literal dot — excludes exactly the `MultiChannel` suite while +keeping `MultiChannelShort` in, which the comment beside it calls out as the +only on-target coverage of the N-channel deinterleave and wide-MAC dotRow +paths. The same character is a bug in one line and a scalpel in the next; +the difference is whether its meaning was chosen. + +## What the emulated targets deliberately skip + +The baked filter and its `ctest -E` sibling on the Hexagon leg exclude the +same family: the quality suites, the lock and servo simulations, the filter +design verification, the feasibility and reset sims — collectively, as the +file header puts it, "minutes of soft-float virtual audio that validate +target-independent control math already covered on every host platform." +That phrase is the policy. A 40-second sample-granular quality run is cheap +arithmetic on a Xeon and an eternity under instruction-set emulation — and +it would re-prove something that *cannot differ* on the target: the servo's +control law and the filter designer's mathematics are pure functions of +their inputs, identical on every conforming C++ implementation. + +What *can* differ on target — and therefore what the on-target run keeps — +is the datapath: kernel accuracy on the target's arithmetic, the fixed-point +paths (including the SMLALD dual-MAC route on M33-class cores), the ring +buffer, the deinterleave, the end-to-end latency path. 
The exclusion list is +not a shortcut; it is a claim about *where target-dependence lives*, and the +short multichannel variants exist precisely because that claim would +otherwise have left the N > 2 datapath uncovered on the machines it was +written for. + +One exclusion is different in kind, and the CI file is honest about it: +`ConfigValidation` is skipped on Hexagon not because it is slow but because +that leg's static-musl toolchain cannot unwind — the constructor throws +correctly, `EXPECT_THROW` never catches, and libc++abi terminates. The +limitation is recorded in `docs/PERFORMANCE.md` under known debt, with the +deployment guidance it implies (validate configs before constructing on that +toolchain). A skipped test with a documented reason is a specification too: +it specifies the boundary of what the platform supports. + +## Verify it yourself + +```sh +# The quality suite: watch the printed [ measured ] lines clear the +# thresholds by the documented few dB: +ctest --test-dir build -R AsrcQuality --output-on-failure + +# The threshold convention, in the tests' own words: +grep -n -A4 "Thresholds sit" tests/test_asrc_quality.cpp tests/test_asrc_quality_16k.cpp + +# Multichannel independence, long and short (per-channel crosstalk prints): +ctest --test-dir build -R MultiChannel --output-on-failure + +# Determinism of the rig: run a quality test twice and diff the output. +ctest --test-dir build -R Balanced997 --output-on-failure # (run it twice) + +# The bare-metal one-shot, exactly as CI runs it (needs arm-none-eabi-gcc +# and qemu-system-arm): +cmake -B build-m55 -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m55-mps3.cmake +cmake --build build-m55 -j && ctest --test-dir build-m55 -V + +# Break the empty-run guard on purpose: change the baked filter in +# tests/bare_metal_main.cpp to a typo like "NoSuchSuite.*", rebuild, and +# watch the run fail with "filter is broken" instead of passing green. 
+``` + +The last experiment is this chapter's thesis in miniature. A test suite is +only a specification if an empty, wrong, or stale version of it *fails* — +and every mechanism in this chapter, from pinned thresholds to the +fifteen-test floor, exists to make silence impossible to mistake for +success. diff --git a/book/src/part3/c1-c2.md b/book/src/part3/c1-c2.md new file mode 100644 index 0000000..f060f86 --- /dev/null +++ b/book/src/part3/c1-c2.md @@ -0,0 +1,328 @@ +# Profile first, claim later (C1–C2) + +> We should forget about small efficiencies, say about 97% of the time: premature optimization is the root of all evil. Yet we should not pass up our opportunities in that critical 3%. +> +> — Donald Knuth + +Part III is a story, told in the order it happened. The introduction promised +six optimization efforts — four wins, one honest draw, one deliberate revert — +and the next three chapters deliver them with the real numbers, including the +two that went sideways. This chapter covers the method and the first two +efforts. The method matters more than either result, because the method is +what made the later reversals *visible* instead of silently absorbed. + +A word about why the campaign existed at all. By the time it started, the +converter already beat its closest architectural analog, libsamplerate's +streaming polyphase engine, by roughly 3× at matched quality on the host +(the full head-to-head lives in `docs/COMPARISON.md`). Nobody was losing +sleep over Xeon throughput. The pressure came from the other end of the +target list: the embedded parts. A converter that costs ~1.6% of a Xeon +core even at eight channels is invisible; the same converter on a +Cortex-M33 or a Hexagon DSP is a line item in someone's cycle budget, and +every instruction shaved is budget returned to the application. That framing shaped the campaign's stopping +rule, written down before any code changed: + +> Optimization stops by budget, not by exhaustion. 
Stop when targets are +> met, when the profile is flat (no single hotspot ≥ 10%), or when the next +> win requires per-arch complexity the budget does not justify. + +Keep that third clause in mind. It fires, verbatim, two chapters from now. + +## The loop + +`docs/PERFORMANCE.md` opens with a working agreement — not aspiration, +process. Every PR that touched the hot path followed the same five steps: + +1. **Baseline** on the benchmark matrix. +2. **Profile** — `perf record` and a flamegraph for time, `-fopt-info-vec` + or `-Rpass=loop-vectorize` for any claim about vectorization. +3. **One hypothesis, one change, one PR** — each optimization PR carries + its before/after numbers in the description. +4. **A/B** — benchmarks for speed, the full test suite for correctness. The + pinned SNR thresholds are the quality guardrail: an optimization that + costs decibels fails CI by design, so "it's faster" can never quietly + mean "it's faster and slightly worse". +5. Repeat until a stopping condition triggers. + +Two measurement instruments back the loop, and they have opposite +personalities. + +**Wall-clock throughput** (Google Benchmark, `bench/bench_asrc.cpp`) is what +users feel, and it is noisy — the project's benches run on shared CI +runners, where a neighbor's workload can move a number more than a real +regression does. So the docs state a rule this book has already quoted and +will quote again: *wall-clock benches are never a hard gate on shared +runners*. They run as a smoke test and produce trend artifacts. When this +chapter reports a wall-clock delta, it was measured as a same-machine, +same-session A/B — the only configuration in which the ratio means +anything. + +**Executed instructions** (the QEMU TCG plugin harness, `bench/icount/`) is +the opposite: deterministic to the instruction. 
Each embedded scenario is a +fixed-workload binary — bare metal has no argv, so there is one binary per +scenario — run under an instruction-counting plugin on emulated Cortex-M55, +Cortex-M33, and Hexagon. Counts are exact across runs; the project verified +that before trusting them. CI compares every scenario against a checked-in +`bench/baselines.json` and fails if any metric moves more than 3% in +*either* direction. The two-sidedness is the clever part: an improvement +beyond tolerance also fails until the baseline is re-recorded in the same +diff, because stale slack in the baseline is exactly the room a later +regression would hide in. + +Instruction counts are not cycle counts — no cache misses, no dual-issue, +no branch predictor. For the scalar code these targets run, they correlate +well with real cost, and they buy something cycles on shared hardware never +can: the ability to assert that a number did not change *at all*. That +ability is the backbone of everything that follows. + +Before the first change, the hypotheses were written down in expected-ROI +order: per-channel blend redundancy first, then auto-vectorization quality, +then a fixed-point phase accumulator, then explicit SIMD kernels. Writing +the list first is cheap insurance against the oldest failure mode in +optimization work — doing the fun change instead of the valuable one, then +constructing the justification afterward. + +## C1: the blend that was computed N times + +Recall the datapath from Part I. To produce one output sample, +`interpolate()` picks the two polyphase coefficient rows adjacent to the +fractional position μ, blends them tap-by-tap by the intra-phase fraction, +and dot-products the blended coefficients against the history window: + +```cpp +typename Tr::Accum acc{}; +for (std::size_t t = 0; t < taps; ++t) + acc = Tr::mac(acc, hist[t], Tr::blend(c0[t], c1[t], fr)); +``` + +Per output sample, per channel: one blend and one multiply-accumulate per +tap. 
Now watch what happens in a multichannel stream. Every channel of an +output frame is evaluated at the *same* μ — the channels advance through +time together; that is what makes them a frame. So the coefficient blend, +which depends only on μ, was being recomputed identically for every +channel. For stereo, half of the inner-loop arithmetic was duplicate work. +For twelve channels, eleven-twelfths of the blend work was. + +The fix is the obvious factoring, and its entire risk profile lives in one +question: does it change the output? Compute the blended row once per +output frame into a small scratch buffer — at most 80 entries, the +`transparent` preset's tap count — then run a plain dot product per +channel: + +```cpp +blendRow(bank, row, mu); // once per frame +for (std::size_t c = 0; c < channels; ++c) + out[c] = dotRow(row, window(c), taps); +``` + +The arithmetic per channel is *identical* to the fused loop: blend then +mac, per tap, in the same order, in the same types. Only the schedule +changed — blend hoisted out of the channel loop. Identical operations in +identical order produce identical bits, even in floating point, so the A/B +had a correctness criterion stronger than any SNR threshold: outputs +unchanged **bit-for-bit**. They were. + +The measured results, from the C1 entry in the performance log: + +| Measurement | Result | +|---|---:| +| Stereo pipeline, x86 wall-clock (same-machine A/B) | −36% | +| 8-channel pipeline, x86 wall-clock | −52% | +| M55 pipeline instructions, float / Q15 / Q31 | −15% / −30% / −21% | +| Hexagon pipeline instructions, float / Q15 / Q31 | −3.6% / −3.3% / −0.2% | +| Mono kernels, both targets | count-identical | + +Three of those rows deserve commentary; they carry the chapter's lessons. + +**The mono row is a control.** Mono has no duplicate blend — one channel, +one blend, nothing to hoist — so the change should not touch the mono +kernels at all. 
"Should not touch" is a hypothesis, and the deterministic +counter can test it exactly: the mono kernel scenarios were +count-identical, to the instruction, on both targets. Had they moved by +even a handful of instructions, that would have meant the change did +something beyond its stated mechanism — maybe harmless, maybe not, but +either way the PR's description would have been wrong, and a wrong +description is a review failure even when the numbers are green. Every +subsequent effort in this campaign carries controls like this, and the +discipline is worth stating as a rule: **a change must move what it claims +to move, and everything else must measure 0.00%.** Wall-clock benchmarks +cannot enforce that rule; nothing measures 0.00% on a shared Xeon. +Instruction counting can, and does. + +**The scaling is the hypothesis confirmed.** Stereo −36%, 8-channel −52%: +the win grows with channel count, exactly as a per-channel redundancy +elimination should. Numbers that match the *shape* of the prediction, not +just its sign, are how you know the mechanism you described is the +mechanism that acted. + +**And then there is Hexagon.** The M55 dropped double-digit percentages; +Hexagon barely moved — −3.6% at best, and the Q31 pipeline a rounding-error +−0.2%. The same source code, the same factoring, the same eliminated +arithmetic. An under-delivering result like this is where measurement-first +culture earns its keep, because the temptation is to shrug — Hexagon is +weird, ship the M55 win — and the log did not shrug. If eliminating most of +the per-channel blend work barely dents the pipeline cost, then the +pipeline cost must be dominated by something that is neither blend nor dot +product. The remaining candidate was the per-sample *phase bookkeeping*: +μ lived in a `double`, and Hexagon has no double-precision FPU, so every μ +increment, wrap and index conversion was a soft-float library call. The +kernels were cheap; the glue between kernels was expensive. 
The C1 entry +records this diagnosis in one clause — "its pipelines are dominated by +per-sample soft-double phase math" — and flags it as the motivation for +C3. A disappointing result, read carefully, fingered the next target. That +is not a consolation prize; over the campaign it turned out to be C1's +second-most-valuable output. + +## C2: the audit — verify, don't assume + +Hypothesis 2 on the list was not a change at all. It was an audit: do the +hot loops actually vectorize, under the compilers and flags the project +ships with? Everyone who has read optimization folklore "knows" the +answers — contiguous arrays vectorize, reductions vectorize, the compiler +is smart. The project's rule, stated in the hypothesis itself: *verify, +don't assume.* The tool is the compiler's own testimony: +`-fopt-info-vec` on GCC, `-Rpass=loop-vectorize` (and its `-missed` +sibling) on Clang, which report loop by loop what vectorized and what did +not, and why. + +The audit produced four findings — one actionable, three that reshaped the +rest of the campaign's roadmap. + +**Finding 1: `blendRow` vectorized, but behind a runtime aliasing check.** +The compiler could not prove that the output row and the coefficient table +don't overlap — they arrive as separate pointers, and separate pointers may +alias — so it emitted *two* versions of the loop, vector and scalar, with a +runtime overlap test choosing between them every call. The fix is the +oldest annotation in the C toolbox, wrapped for portability: + +```cpp +#if defined(_MSC_VER) +#define SRT_RESTRICT __restrict +#else +#define SRT_RESTRICT __restrict__ +#endif +``` + +`SRT_RESTRICT` on the kernel pointer parameters is a promise to the +compiler — these regions do not overlap — and the caller's structure makes +the promise true: the row is a private scratch member, the table is +immutable, the histories are distinct vectors. The versioning check and the +dead scalar copy disappear. 
The header carries a comment tying the +qualifier to the evidence (`verified with -fopt-info-vec; see +docs/PERFORMANCE.md, hypothesis 2`), so the next maintainer knows it is +load-bearing and not cargo cult. + +**Finding 2: the Q15 dot product auto-vectorizes, no help needed.** This is +worth a paragraph, because *why* it vectorizes is the piece of theory the +next two chapters stand on. A dot product is a reduction — every iteration +folds into one accumulator, a serial dependence chain. Vectorizing it means +computing partial sums in lanes and combining them at the end, which +**reorders the additions**. For integer arithmetic that reordering is free: +int64 addition is exactly associative, every 16×16 product is exact, so any +order of summation produces the same bits. The compiler knows this and +vectorizes integer reductions at `-O2` without being asked. + +**Finding 3: the float dot product is scalar — and stays scalar, by +design.** Floating-point addition is *not* associative; reordering the +accumulation changes the rounding, which changes the output bits. The +library's float datapath promises double-precision accumulation in a +defined order — that is part of what its measured 135 dB rests on — so the +compiler correctly refuses to vectorize the reduction, and the project +correctly declined to force it with `-ffast-math` or manual partial sums. +The audit *did* record the option: explicit 4-way double accumulation would +vectorize the float dot and change output bits, and it entered the log as +**deferred hypothesis 5** — a bit-changing optimization, parked until the +budget demands it and the quality harness can re-baseline around it. Hold +that thought; hypothesis 5 has a surprising fate in the C6 chapter, where +an axis nobody had listed makes float vectorization possible *without* +changing a bit. 
+ +**Finding 4: the Q31 dot product is scalar too**, for a blunter reason — +baseline ISAs have no packed 64-bit multiply, and Q31 MACs need 32×32→64 +products. No annotation fixes an instruction set. Noted, filed, moved on. + +One actionable change, then: `SRT_RESTRICT` on the kernel signatures. The +measured effect, and this time the controls are the headline: + +| Scenario | Δ instructions (M55) | +|---|---:| +| `pipeline_float` | −1.35% | +| every other scenario, both targets | **0.00%** | + +On x86, a same-state wall-clock A/B measured −3.7% — the aliasing check sat +in a hotter relative position there. But look at the M55 table with C1's +rule in mind. The claim was narrow: *restrict removes a runtime aliasing +check from `blendRow`*. The fixed-point pipelines blend through the same +function — but their loop bodies differ, the versioning overhead lands +differently, and on M55 only the float pipeline was paying measurably. +Fine. What the claim *requires* is that nothing else moves: the qualifier +is documentation to the optimizer, not arithmetic, so any scenario where +the codegen was already clean must be bit-identical binary. All of them +were, to the instruction. A −1.35% win surrounded by exact zeros is a +*verified mechanism*. A −1.35% win alone is just a number. + +It is worth pausing on how unusual that sentence is. In most performance +work, "this change affects only X" is a belief. Here it is a measurement, +because the instrument has no noise floor. The ratchet infrastructure was +built to catch regressions; the campaign discovered its second use almost +immediately — it certifies *non-effects*, which is what turns an +optimization PR from "trust me" into an experiment with controls. Chapter +C6 will show the dramatic version: an embedded control that *failed* — a +hosts-only feature that leaked +6–8% into the M55 — and stopped a merge. 
+ +## What two efforts bought + +The scoreboard after C1 and C2: multichannel wall-clock roughly halved at +high channel counts, double-digit instruction reductions on the M55 +pipelines, a `restrict` qualifier with a paper trail — and, less tangibly, +three pieces of map. Hexagon's cost lives in soft-double phase math (C3's +target). The M33-class parts, with no vector unit at all, will need +something explicit for Q15 (C4's target). And the float dot product cannot +be vectorized over taps without changing bits (the constraint C6 +eventually routes around). None of those three facts was known before; all +three came from measurements that individually looked like disappointments +or non-events. + +That is the method chapter's actual thesis. The loop — baseline, profile, +one hypothesis, A/B with controls — is not bureaucracy around the real work +of optimizing. On this evidence it *is* the real work: every effort in the +next two chapters was aimed by an anomaly this chapter's measurements +surfaced and refused to explain away. + +## Verify it yourself + +```sh +# Host wall-clock benchmarks (Google Benchmark): +cmake -B build -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_BENCHMARKS=ON +cmake --build build -j +./build/bench/srt_bench --benchmark_filter='Pipeline' + +# Wall-clock deltas in this chapter are same-machine A/Bs. To reproduce +# one, run the benchmark at the parent commit and at the change on the +# same machine in the same session — the project never gates on +# wall-clock from shared runners, and neither should you. 
+ +# The compiler's own vectorization testimony (C2's instrument): +cmake -B build-vec -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_FLAGS="-fopt-info-vec-optimized -fopt-info-vec-missed" +cmake --build build-vec -j 2>&1 | grep -i 'polyphase' + +# Deterministic instruction counts, exactly as CI gates them +# (arm-none-eabi-gcc + qemu-system-arm + the counting plugin; see +# .github/workflows/ci.yml for the plugin build): +cmake -B build-m55 -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m55-mps3.cmake \ + -DSRT_BUILD_TESTS=OFF -DSRT_BUILD_EXAMPLES=OFF \ + -DSRT_BUILD_ICOUNT_BENCH=ON +cmake --build build-m55 -j +python3 scripts/icount.py --target m55 --build-dir build-m55 \ + --plugin /tmp/libinsncount.so + +# The quality guardrail that every optimization PR had to clear: +ctest --test-dir build -R Quality --output-on-failure +``` + +Run the icount harness twice and diff the outputs: identical, to the +instruction. That reproducibility is the entire epistemology of this +chapter — it is what lets "all other scenarios: 0.00%" be a finding +instead of a hope. diff --git a/book/src/part3/c3-c5.md b/book/src/part3/c3-c5.md new file mode 100644 index 0000000..f497a71 --- /dev/null +++ b/book/src/part3/c3-c5.md @@ -0,0 +1,318 @@ +# The integer phase and the wide MACs (C3–C5) + +> If it disagrees with experiment, it is wrong. In that simple statement is the key to science. +> +> — Richard Feynman + +The previous chapter ended with an anomaly: C1 stripped most of the +per-channel blend work out of the datapath, the M55 pipelines dropped by +double digits, and Hexagon's barely moved. The diagnosis written into the +log was that Hexagon's pipeline cost was dominated not by the kernels but +by the glue between them — the per-sample phase bookkeeping, done in +`double`, on a DSP with no double-precision FPU. Every μ update was a +soft-float library call. 
+ +This chapter is what happened when the campaign acted on that diagnosis, +and then kept going: one clean win that falsified the project's own +documentation on the way through (C3), one honest, bounded win on the +smallest target (C4), and one implementation that was correct, complete, +measured — and deliberately deleted (C5). The theme, if the chapter has +one: **a negative result recorded is a win.** Not a moral victory — an +actual asset, with a measurable replacement cost. + +## C3: evicting the last double from the per-sample path + +The fractional resampler tracks its position between input samples as a +phase in [0, 1). Before C3, that phase was a `double` named μ, and the +per-sample loop did double-precision work even on the fixed-point +datapaths: advance μ by the rate ratio, detect wrap past 1.0 or below 0.0, +scale by the phase count L, split into integer row index and fractional +blend factor. On a Xeon this is noise. On a core where every double +operation is a function call, it was — per C1's evidence — the dominant +per-sample cost. + +The C3 design replaces the double with an unsigned **Q0.64** fixed-point +fraction: a plain `uint64_t` whose full range represents [0, 1) with +resolution 2⁻⁶⁴. Three properties make this format almost suspiciously +well-suited to the job. + +**The unity part of the ratio never enters the accumulator.** This is the +near-unity specialization paying out one more time. The converter's ratio +is 1 + ε with |ε| servo-clamped to around 10⁻³, so the resampler advances +one input frame per output frame *structurally* and only the deviation ε +accumulates in the phase. ε is converted from the servo's double to a +signed Q0.64 increment **once per `process()` call** — block rate, not +sample rate. That single conversion is the only double arithmetic left +near the hot path. 
+ +**Wraparound detection is free.** Unsigned overflow is defined, modular +arithmetic — the same property the ring buffer's monotonic indices leaned +on in Part I. If adding a positive ε wraps the phase past 2⁶⁴, the sum +comes out *smaller* than the old phase; that comparison **is** the slip +detector, and the response is to consume one extra input frame. A negative +ε wrapping below zero comes out *larger*, and the window is reused. No +epsilon-comparisons against 1.0, no branch on the sign of a floating-point +residual — two integer compares. + +**The table index and blend factor are bit fields.** L is a power of two, +so the polyphase row index is simply the top log₂ L bits of the phase, and +the intra-phase blend factor is the bits below, shifted up: + +```cpp +const int lg = std::countr_zero(bank.numPhases()); +const std::size_t p = static_cast<std::size_t>(phase >> (64 - lg)); +const auto fr = Tr::blendFactorFromQ64(phase << lg); +``` + +No multiply by L, no floor, no subtract — shifts. The per-sample path is +now integer-only on the fixed-point datapaths, plus a single +single-precision conversion on the float path. + +Notice also what the format does to *resolution*: a double's mantissa gives +the old μ about 2⁻⁵² of precision; Q0.64 gives 2⁻⁶⁴. Twelve extra bits of +phase resolution means less quantization of the sampling instant, and Part +0's arithmetic connected phase jitter directly to distortion. So the C3 +entry contains a line that optimization logs almost never get to contain: +**quality improved** — 135.0 dB at 997 Hz, versus the previous baseline, +measured by the same pinned tests that would have failed the PR had it cost +decibels. Faster and cleaner, from one change. + +### The falsification + +Now the numbers, and the embarrassing one first, because the log put it +first. 
M55 instruction counts: + +| M55 scenario | Δ instructions | +|---|---:| +| pipeline Q15 | −5.3% | +| pipeline Q31 | −4.6% | +| pipeline float | **+1.4%** | + +The fixed-point wins were expected. The float *regression* was not — it +contradicted the project's own documentation. The performance plan's +hypothesis list had asserted, in writing, that the M55's float path was +soft-double-bound, just like Hexagon's; the M55 was on the list of targets +the integer phase was supposed to rescue. If that were true, replacing +per-sample double math with int64 math should have helped the float +pipeline too. Instead the float pipeline got slightly *worse*. + +One of three things had to be wrong: the measurement, the change, or the +documentation. The measurement is deterministic and was reproduced. The +change was doing exactly what it claimed on every other scenario. That +left the documentation — and a check of the architecture manuals settled +it: **the Cortex-M55's scalar FPU supports FP64.** Only its vector +extension, Helium/MVE, is limited to fp16/fp32. The M55 float path had +never been soft-double-bound; its doubles were cheap hardware doubles all +along, and C3 had traded them for int64 sequences that cost slightly more. +The genuinely double-less target in the fleet is Hexagon, and only +Hexagon. + +The correction is recorded *in the hypothesis list itself* — hypothesis 3 +in `docs/PERFORMANCE.md` now reads as a correction notice ("discovered +while measuring: Cortex-M55's *scalar* FPU does support FP64…"), so the +false belief cannot quietly re-seed a future roadmap. And the +1.4% was +accepted, eyes open, as the price of a cross-target win: the phase +accumulator is one implementation shared by every datapath, and forking it +per-target to claw back 1.4% on one scenario is exactly the per-arch +complexity the stop rule exists to refuse. + +This is the campaign's cleanest specimen of the culture the introduction +promised. 
A 1.4% regression on one scenario of one target is the kind of +number a wall-clock benchmark would eat as noise. The deterministic +harness surfaced it; the loop's rule — explain every number, especially +the small ugly ones — forced the investigation; the investigation +falsified a documented belief about the hardware. *The measurement audited +the documentation*, not the other way around. + +### The target it was aimed at + +Hexagon, from the PR's gating run: + +| Hexagon scenario | Δ instructions | +|---|---:| +| pipeline Q31 | −15.5% | +| pipeline Q15 | −10.3% | +| pipeline float | −2.6% | +| kernels (all types) | count-identical | + +The per-sample soft-double phase math that C1 had identified as dominating +Hexagon's pipelines is simply gone. The kernel scenarios — which measure +`interpolate()` in isolation, no phase bookkeeping — were count-identical, +the control confirming the change touched only what it claimed. On x86, a +same-minute A/B measured float −5.4% and Q15 −12.0% wall-clock; hosts keep +score too, they just don't gate. + +C1 found the target; C3 hit it. That is the loop working across PRs, not +just within one. + +## C4: two MACs per instruction, where the compiler won't + +Next on the list: explicit SIMD, "partially moot" before it started. The +audit trail from C2 explains why. On the M55, objdump had confirmed that +GCC already auto-vectorizes the Q15/Q31 kernels with Helium at -O2 — the +M55's roughly 4× Q15 advantage over the scalar M33 in the baselines is MVE +at work, no intrinsics required. But the fleet has a whole class of parts +below the M55: Cortex-M33, M4, M7 — the Raspberry Pi Pico 2 class. These +have no vector unit at all. What they *do* have is the Armv7E-M/Armv8-M +**DSP extension**: scalar instructions that treat a 32-bit register as two +16-bit lanes. 
The one that matters here is `SMLALD` — *signed multiply +accumulate long dual* — which takes two such registers, forms both 16×16 +products, and adds both into a 64-bit accumulator. One instruction, two +Q15 MACs: precisely the inner operation of `dotRow`, at double width. + +The bit-exactness argument is short enough to carry in your head, and it +is the same argument C2's finding 2 established: every 16×16 product is +exact in int32, int64 addition is associative, therefore summing the +products in pairs instead of one-by-one changes no output bit. The +intrinsic path and the scalar loop are not "close" — they are the same +function, by construction. (Contrast the float dot, where this argument is +exactly what fails.) + +The subtle part of C4 is not the intrinsic; it is the **gate**. Here is +the actual block from `include/srt/polyphase_filter.hpp`, pulled in live: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:opt_smlald_gate}} +``` + +Read the condition: DSP extension present *and MVE absent*. The naive gate +— "use the fast intrinsic wherever the ISA has it" — would have enabled +SMLALD on the M55 too, where the compiler is currently vectorizing that +loop with Helium. The intrinsic loop is hand-written; the compiler will +not auto-vectorize *through* it; enabling it on the M55 would have +silently replaced full vector arithmetic with dual scalar MACs — an +optimization for one target acting as a pessimization on a better one. +This is the MVE-gate discovery, and it generalizes: **an intrinsic is a +floor and a ceiling at once.** The ratchet verified the gate the only way +that counts — every M55 and every Hexagon scenario at exactly 0.00%, +meaning those binaries are instruction-for-instruction unaffected by C4's +existence. 
+ +One routing consequence: mono Q15 on these targets now goes through +`blendRow` + `dotRow` rather than the fused `interpolatePhase()`, because +the dual-MAC loop lives in `dotRow` — legitimate only because C1 +established the two paths are bit-exact against each other. + +The result: **M33 `pipeline_q15` −3.1%.** And here the log does something +worth imitating, under the heading of honest accounting. A 2×-wide MAC +did not halve the frame cost, or even trim a tenth of it — why? Because the +M33's Q15 frame cost is not dominated by the dot product. It is dominated +by the coefficient *blend* — whose per-tap `fr * diff >> 15` is a 64-bit +product, one `smull` each, already one instruction — and by transport +around the datapath. The candidate follow-up, a packed blend, would change +the documented int64 blend invariant that the bit-exactness proofs rest +on, and was declined at current budgets. The entry even flags that the +`kernel_q15` scenario still measures the fused `interpolate()` call, which +C4 intentionally does not touch — so nobody later mistakes that flat +number for a failed optimization. This is the campaign's honest draw: a +real, kept, correctly-gated improvement, described at its true size, with +the reason it is small written next to it. + +## C5: the revert, and why it is the best entry in the log + +Hexagon's turn. The C4 argument seemed to transfer directly, at double the +width again: Hexagon's scalar ISA has `vrmpyh`, which forms **four** exact +16×16 products per instruction and can feed an int64 accumulator. Four Q15 +MACs per instruction against a loop the profile says is pure MACs — the +back-of-envelope says this is the biggest single win in the campaign. + +It was implemented. It passed the full test suite under Hexagon QEMU, +bit-exactly — the same associativity argument held. And it measured: + +> `pipeline_q15`: 119,847,854 → 119,478,758 instructions. **−0.31%.** + +A 4-wide MAC bought less than a third of a percent.
+ +The reflexive explanation — the one everyone reaches for first — is that +the compiler had already vectorized the loop, so the intrinsic replaced +equivalent code. The team checked, because the reflexive explanation is +also checkable: disassemble both binaries (CI's llvm-objdump, pre and +post) and count the wide MACs. The baseline binary contains **zero**. The +intrinsic build contains **10**. The compiler had *not* done it already; +the intrinsic genuinely landed, four products at a time, exactly as +designed — and it barely mattered. + +The real explanation is better, and it is the piece of architecture +knowledge the whole effort purchased: **Hexagon's scalar ISA is already +half a DSP.** Its ordinary instruction set has single-instruction 64-bit +multiply-accumulates (`Rxx += mpy`) and 64-bit loads, so the "scalar" +baseline loop was already running at a density that would take intrinsics +to reach on a Cortex-M. On top of that, the history window is 2-byte +aligned — it slides one sample at a time by design — so feeding `vrmpyh`'s +packed operands costs combine/alignment work that eats most of what the +wide multiply saves. The instruction *was* wide; the loop around it had to +pay to keep it fed. + +Now the stop rule from the first page of this Part, firing on schedule: +*the next win requires per-arch complexity the budget does not justify.* +A −0.31% improvement is real — deterministic, reproducible, green across +the suite. It is also the definition of not worth it: a Hexagon-specific +intrinsic path is a second implementation to review, to gate, to keep +bit-exact against the reference forever, purchased for three-tenths of a +percent. The code was reverted. 
Not lost to a branch nobody can find — +**reverted, with the entry as the deliverable**: the numbers, the +disassembly evidence, and the analysis now live in `docs/PERFORMANCE.md` +under C5, so the next engineer who has the vrmpyh idea (and someone will; +it is a *good* idea) spends five minutes reading instead of two days +re-deriving a dead end. + +The entry's final paragraph is the part that turned out to be prophetic. +Having established that the win wasn't there in scalar-wide instructions, +it asks whether HVX — Hexagon's actual 128-byte vector unit — could do +better, and answers with a shape argument: a 48–80-tap dot product does +not fill one HVX vector, and HVX's 16-bit MACs accumulate into 32-bit +lanes, which overflows the library's exact-int64 invariant after about 24 +worst-case taps. **Per-channel dot products are the wrong shape for HVX** +— not slow, *wrong-shaped*: the axis being vectorized (taps) is too short +and demands too much accumulator width per lane. The shape that fits is +turned ninety degrees: one 64-bit lane-pair per *channel*, sixteen +channels filling one vector exactly — vectorize across channels, not +across taps. That observation was recorded as hypothesis C6, and the next +chapter is what happened when it met the float datapath's +may-not-reorder-additions constraint and dissolved it. + +Score the chapter the way the introduction scored the campaign. C3: a win +that corrected the project's documentation. C4: a draw, honestly sized. +C5: a revert that produced no code and two durable facts — Hexagon's +scalar MAC density, and the channel-axis insight that C6 is built on. The +log entry for the revert cost nothing to keep and pointed directly at the +campaign's largest remaining win. Negative results, *recorded*, compound. 
+ +## Verify it yourself + +```sh +# C3's quality claim — the pinned SNR thresholds (135 dB at 997 Hz for +# the float path) are asserted by the test suite, not the docs: +ctest --test-dir build -R Quality --output-on-failure + +# C3/C4 instruction counts on the Arm targets (the M33 leg is where C4's +# −3.1% lives; every M55 scenario is C4's 0.00% control): +cmake -B build-m33 -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m33-mps2.cmake \ + -DSRT_BUILD_TESTS=OFF -DSRT_BUILD_EXAMPLES=OFF \ + -DSRT_BUILD_ICOUNT_BENCH=ON +cmake --build build-m33 -j +python3 scripts/icount.py --target m33 --build-dir build-m33 \ + --plugin /tmp/libinsncount.so + +# C4's gate, interrogated directly: preprocess for an M33 and for an M55 +# and watch SRT_Q15_SMLALD flip (DSP extension without MVE vs with): +arm-none-eabi-gcc -mcpu=cortex-m33 -dM -E - </dev/null | grep -E 'ARM_FEATURE_(DSP|MVE)' +arm-none-eabi-gcc -mcpu=cortex-m55 -mfloat-abi=hard -dM -E - </dev/null | grep -E 'ARM_FEATURE_(DSP|MVE)' +``` + +> If you were plowing a field, which would you rather use: two strong oxen or 1024 chickens? +> +> — attributed to Seymour Cray + +The campaign's last effort began with an inheritance and a constraint. + +The inheritance came from the C5 revert: per-channel dot products are the +wrong *shape* for wide vector units — the tap axis is too short to fill +wide registers, and lane-width accumulators can't honor the library's +exact-arithmetic invariants. The shape that fits is rotated ninety +degrees: put *channels* in the lanes. + +The constraint came from the C2 audit: the float dot product may not be +vectorized over taps at any price, because reordering a double-precision +accumulation changes output bits, and bit-stability is part of the +library's contract with its own test suite. The one idea that had been on +the table — explicit 4-way partial sums, deferred hypothesis 5 — was known +to be bit-changing and sat parked. + +C6 is the observation that the inheritance dissolves the constraint.
But +before the observation was allowed to become code, the loop's first rule +applied: profile first, even when the hypothesis arrives pre-argued. + +## The profile that killed the other hypothesis + +There was a competing story about where the multichannel money was. The +resampler's input arrives interleaved — frame by frame, channels adjacent — +and the datapath stored history *planar*, one delay line per channel. So +every input frame was deinterleaved on arrival: a scatter, channel by +channel, sample by sample. Scatters offend performance intuition, the code +looked like a textbook strided-access antipattern, and "the deinterleave is +eating us at high channel counts" was a perfectly plausible claim. + +Callgrind, on the 12-channel Q15 pipeline — the 7.1.4 deployment shape — +settled it before any design work started: + +| Where the instructions went (12-channel pipeline) | Share | +|---|---:| +| per-channel dot-product MACs | ≈ 85% | +| deinterleave | ~2% | + +The scatter-cost hypothesis died in one profiler run. Two percent is not a +target; you cannot win more than two points by optimizing it to zero. +Eighty-five percent in the dots meant the frame's cost was, to first +order, the MACs themselves — and for float, those MACs were running +scalar, one channel at a time, by contractual necessity. If the dots were +to get cheaper, it would have to be by *widening* them, and the only legal +axis for widening was the one C5's postmortem had pointed at. + +Worth pausing on the method point: the profile did not suggest the design +— C5's shape argument did that. The profile's job was to confirm the +design was aimed at the cost, and to kill the alternative before it +consumed a week. Both jobs took one afternoon. + +## The insight: lanes are channels, not taps + +Here is the argument at the level of arithmetic, because everything else +in C6 is bookkeeping around it. + +A vectorized loop computes several elements of *something* per +instruction. 
Tap-axis vectorization computes several taps of **one +channel's** sum per instruction — partial sums, combined at the end, which +reorders that channel's additions and changes its rounding. Forbidden by +the float contract. + +Channel-axis vectorization computes one tap of **several channels'** sums +per instruction. Lane k holds channel k's accumulator; each step of the +tap loop multiplies each lane's history sample by the *same* broadcast +coefficient and adds into that lane. Watch what happens to any single +channel: its accumulator receives tap 0, then tap 1, then tap 2 — the +identical operations in the identical order as the scalar `dotRow`. No +channel's sum is ever split, reassociated, or combined with another's. The +channels were always independent computations; SIMD lanes are independent +computations; the map is exact. + +So the channel axis is bit-exact *even for float* — not approximately, +not within tolerance: the same IEEE operations in the same order per +channel, hence the same bits. It is the only vectorization axis the float +contract permits, and it had been sitting in plain sight behind a storage +decision: you cannot load "tap t of channels 0..7" in one instruction if +every channel lives in its own array. Channels-in-lanes requires +**frame-major** history — samples of one frame adjacent in memory. + +Which is interleaved order. The deinterleave the profile had just measured +at 2%? For the channel-parallel path it isn't cheap — it's *deleted*. +Frames are copied into the history window as-is, one contiguous `memcpy` +per frame. + +## Frame-major storage, concretely + +The storage change is worth seeing at the level of the data structure, +because it is where a reader of `polyphase_filter.hpp` will first bump +into C6. Below `SRT_CP_MIN_CHANNELS`, the resampler keeps what Part I +described: one delay line per channel, `hist_[c]`, and a per-channel +`window(c)` pointer for the planar dot. 
At or above the threshold, on +channel-parallel targets, there is a single delay line, `hist_[0]`, whose +slots are whole frames — `channels` samples wide — and the bookkeeping +indices (`end_`, the capacity) count *frames* in both modes, so the sliding +and compaction logic upstream is shared rather than forked. When the +window slides past the end of the allocation, the same `memmove` +compaction runs in either mode; the only difference is the width of a +slot. Appending input is where the modes diverge visibly: planar scatters +each frame's samples into their per-channel lines, frame-major copies the +frame in one shot, still interleaved. + +The kernel then reads the window with a stride. For output frame *n*, the +newest `taps` frames sit contiguously at the end of the window; tap *t* of +channel *k* lives at `base[t * channels + k]`. The tile's inner step loads +`frame[0..K-1]` — K adjacent samples, one vector load when K matches the +lane count — multiplies by the broadcast coefficient, and accumulates. The +per-frame schedule is C1's, unchanged: one `blendRowPhase()` per output +frame, then all channels' dots; C6 replaces only the per-channel dot loop +with a single channel-parallel pass over the frame-major window. + +The threshold itself was measured, not assumed. At one and two channels +the planar path is better — the blend-share structure from C1 already +amortizes the expensive part, the tile machinery has fixed overhead, and +mono additionally keeps its fused no-scratch fast path — so +`SRT_CP_MIN_CHANNELS` defaults to 4, the point where the lanes start +paying. It is deliberately an overridable macro rather than a hard-coded +constant: the A/B that chose 4 (`-DSRT_CP_MIN_CHANNELS=...` on the +benchmark build, both directions) stays reproducible by anyone who +suspects their machine bends the crossover. 
+ +## Two traps, both recorded + +The performance log preserves two implementation lessons from C6, both of +the kind that cost a day and read as obvious afterward. They are in the +log precisely so they only cost a day once. + +**Trap one: the naive nest is worse than nothing.** The first-cut +channel-parallel loop — taps outer, channels inner, accumulators in a +plain memory array sized at runtime — measured **2.8× slower than +planar**. Not slower than the target; slower than the *unoptimized +baseline*. With the accumulator array's size unknown at compile time, the +compiler could not promote the accumulators to registers, so every one of +the frame's `taps × channels` MACs round-tripped its accumulator through +the stack — load, fuse, store — and the memory traffic swamped the SIMD +gain several times over. The fix is register blocking, and it must be +structural, not hopeful: a fixed-size tile of K channels whose K +accumulators live in a `constexpr`-sized local array the compiler +demonstrably keeps in registers. The library's tile is a template on K, +taken live from `include/srt/polyphase_filter.hpp`: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:opt_dot_tile}} +``` + +The doc comment carries the 2.8× number in the header itself — the trap +is documented at the exact place where a refactor would re-arm it. The +lesson generalizes: **register-block or don't bother.** A vector unit fed +from the stack is slower than scalar code fed from registers. + +**Trap two: the mode gate must be free where the mode is off.** C6 is a +hosts-only, float-only feature; the embedded targets keep their proven +codegen — Helium auto-vectorization on the M55, C4's SMLALD on the +M33 class, Hexagon's measured scalar floor. The first implementation +selected the path with an ordinary runtime boolean, and the embedded +ratchet — the instrument that certifies non-effects — refused the PR: +**+6–8% on M55 scenarios** that were supposed to measure 0.00%. 
A bool +that is false on every embedded run still costs branches in the hot loops +and blocks the optimizer around them. The gate had to become +compile-time — a `constexpr` flag that constant-folds the entire +channel-parallel path out of existence on non-host targets — after which +every embedded scenario returned to exactly 0.00% and the PR merged. + +Note who caught it: not review, not intuition — the controls. A +wall-clock smoke test would never have resolved 6% on an emulated M55, +because nothing wall-clock touches an emulated M55 at all. The +deterministic ratchet made "this feature is free where it is disabled" a +gated, falsifiable claim, and its first draft was false. This is the +failure-mode twin of C2's all-zeros table, and together they are the +strongest advertisement in the campaign for counting instructions. + +## The dispatcher and the odd channel counts + +Real streams are not all multiples of eight, so the channel loop runs a +cascade of tiles — 8, then 4, then 2, then 1 — each a specialization of +the same template: + +```cpp +{{#include ../../../include/srt/polyphase_filter.hpp:opt_dot_rows}} +``` + +Twelve channels is 8+4. Sixteen is 8+8. The deployment shapes exercise +only the big tiles — which is exactly why the odd shapes need deliberate +tests: **5 channels (4+1) and 7 channels (4+2+1) are the only tested counts that +reach the K=1 and K=2 remainder tiles.** Ship only 8/12/16-channel tests +and two of the four tile instantiations are dead code with green CI — +until a 7-channel consumer instantiates them in production. +`tests/test_multichannel.cpp` therefore carries dedicated 5- and 7-channel +independence tests (`MultiChannelShort.Independence5chFloat`, +`Independence7chFloat`), with a comment stating the tiling arithmetic so +nobody "simplifies" them away as redundant with the 12-channel test. An +untested tile is untested code, however trivially it falls out of a +template.
+ +## The results, and what they scale with + +Same-minute A/B on the host, float pipelines: + +| Configuration | Δ wall-clock | +|---|---:| +| 8-channel, AVX2+FMA (`-march=native`) | −38% | +| 12-channel, AVX2+FMA | −38% | +| 16-channel, AVX2+FMA | −42% | +| 8–16-channel, baseline SSE2 build | −4–5% | + +Bit-exactness was not left to the argument above, clean as it is: the +channel-parallel outputs were **hash-verified against the planar path over +30,000 blocks across four configurations**. The proof says the bits must +match; the harness removes the possibility that the proof described a +slightly different loop than the one that shipped. (Part I's ring-buffer +chapter made the same move with its wraparound proof. The habit is the +point.) + +The spread between the SSE2 row and the AVX2 rows is the design's +signature, and the log calls it out: **gains scale with SIMD width.** The +C++ contains no intrinsics — no AVX2 code, no dispatch tables. It exposes +an axis: independent accumulator lanes, contiguous frame-major loads, a +broadcast coefficient, register-resident tiles. What the axis is worth +depends on the vector unit the compiler is targeting — 4 double lanes with +FMA under AVX2, 2 under SSE2, wider still if a consumer builds for +AVX-512. For a header-only library this division of labor is exactly +right: the header is compiled inside the consumer's translation units, +with the consumer's flags, so *the consumer chooses the SIMD width* and +the same source meets them at whatever width they paid for. + +### The fixed-point half: a negative result, kept as a boundary + +C6's hypothesis was drafted for HVX and Q15; it shipped for hosts and +float. The fixed-point measurement is the reason, and it is the campaign's +last negative result: **channel-parallel Q15 measured ~1.5× slower than +planar on hosts, and planar was kept.** + +No mystery, just the C2 audit paying out one more time. 
The planar Q15 dot +*already* auto-vectorizes — over taps, the axis the float path is denied, +because integer reduction is exactly reassociable and the compiler has +been quietly exploiting that since C2 verified it. So for Q15 the +channel-parallel form was not competing against scalar code; it was +competing against the compiler's own tap-axis vectorization, from a +storage layout chosen for a different sample type, and it lost. The two +datapaths end up mirror images, each vectorized along the one axis its +arithmetic permits, neither layout right for both: + +| | Tap axis | Channel axis | +|---|---|---| +| float | forbidden (reorders double accumulation) | **C6: bit-exact, −38–42%** | +| Q15 | **already auto-vectorized (reassociable)** | measured ~1.5× slower | + +So the shipped rule is: float at ≥ `SRT_CP_MIN_CHANNELS` channels goes +frame-major and channel-parallel; fixed-point stays planar everywhere; the +embedded targets see none of it, enforced at 0.00% by the ratchet. And +hypothesis 5 — the bit-changing 4-way float accumulation from C2 — was +superseded without ever being implemented: for four channels and up, C6 +vectorizes the float path *without* changing a bit, leaving hypothesis 5 +relevant only to mono and stereo float, if a budget ever demands it. + +## Where the campaign stopped + +The embedded version of the channel axis — HVX with one 64-bit lane-pair +per channel, sixteen channels filling one 128-byte vector exactly; Helium +similarly on the M-class — remains in the log as a follow-up candidate +*if DSP budgets demand it*. That phrasing is the stop rule doing its job +one last time. The per-channel dots were measured at 85% of the +12-channel pipeline; the shape is proven on hosts; the invariant analysis +for HVX is already written down in the C5 entry. The next engineer starts +from all of that — or never starts, because no budget ever asks. Either +outcome is correct. Optimization stops by budget, not by exhaustion. + +Six efforts. 
C1 and C2, the method proving itself on easy money and +cataloging the walls. C3, the integer phase — the biggest embedded win, +and a falsified line of documentation as a bonus. C4, the honest draw, +its small size explained rather than excused. C5, the deliberate revert +that paid for this chapter. C6, the campaign's insight compounding: a +shape argument from a failed Hexagon experiment, applied through a +constraint mapped by a vectorization audit, aimed by a callgrind profile, +guarded by an instruction ratchet, landing a 38–42% win that is bit-exact +to the code it replaced. None of these six PRs would look impressive in +isolation. The system that produced them — one hypothesis at a time, every +number explained, every dead end recorded — is the thing this Part was +actually about. + +## Verify it yourself + +```sh +# The profile that aimed C6 (Linux, valgrind + callgrind): +cmake -B build -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_BENCHMARKS=ON +cmake --build build -j +valgrind --tool=callgrind ./build/bench/srt_bench \ + --benchmark_filter='Pipeline_Q15_Balanced_12ch' \ + --benchmark_min_time=1x +callgrind_annotate callgrind.out.* # dots ≈ 85%, deinterleave ~2% + +# The headline A/B: build once portable, once with native SIMD, and run +# the float multichannel benches on the same machine, same session +# (wall-clock is never a hard gate; only a same-machine ratio is real): +cmake -B build-native -DCMAKE_BUILD_TYPE=Release \ + -DSRT_BUILD_BENCHMARKS=ON -DCMAKE_CXX_FLAGS="-march=native" +cmake --build build-native -j +./build/bench/srt_bench --benchmark_filter='Float_Balanced_(8|12|16)ch' +./build-native/bench/srt_bench --benchmark_filter='Float_Balanced_(8|12|16)ch' + +# Bit-exactness and the remainder tiles (5 = 4+1 and 7 = 4+2+1 are the +# only counts that reach the K=1/K=2 tiles): +ctest --test-dir build -R MultiChannel --output-on-failure + +# The A/B knob C6's threshold was chosen with — force the planar path +# and watch the multichannel float benches 
give the win back: +cmake -B build-planar -DCMAKE_BUILD_TYPE=Release \ + -DSRT_BUILD_BENCHMARKS=ON \ + -DCMAKE_CXX_FLAGS="-march=native -DSRT_CP_MIN_CHANNELS=64" +cmake --build build-planar -j +./build-planar/bench/srt_bench --benchmark_filter='Float_Balanced_12ch' + +# The claim that embedded targets are untouched is CI-gated, not +# rhetorical — every M55/M33/Hexagon scenario sits at 0.00% vs +# bench/baselines.json: +python3 scripts/icount.py --target m55 --build-dir build-m55 \ + --plugin /tmp/libinsncount.so +``` + +The last command is the quiet one, and it is the one to internalize. A +38% win on the machine in front of you is easy to believe. The mature +habit is demanding equally hard evidence for the other half of the claim +— that every target which was promised *nothing* received exactly that. diff --git a/book/src/part4/c-abi.md b/book/src/part4/c-abi.md new file mode 100644 index 0000000..2c2d6a6 --- /dev/null +++ b/book/src/part4/c-abi.md @@ -0,0 +1,325 @@ +# The C ABI + +> Be conservative in what you do, be liberal in what you accept from others. +> +> — Jon Postel, RFC 761 + +This chapter exists because of a plot. Part II's notebooks are the +library's most persuasive evidence — the servo locking from a cold start, +the 135 dB money plot, the naive-FIFO spectrogram full of clicks — and +every curve in them comes from the *actual shipping library*, not a Python +reimplementation of it. A reimplementation would prove nothing: the entire +point of measurement-first development is that you measure the artifact you +ship. So Python has to call the C++ converter. + +And Python cannot. Neither can Julia, nor anything else that loads shared +libraries at run time, because C++ deliberately has no stable binary +interface. Symbol names are mangled, and the mangling differs by compiler. +Exceptions, RTTI, and the layout of `std::` types differ by compiler *and* +by standard-library vendor. 
A template — and this whole library is +templates — doesn't exist in a binary at all until something instantiates +it. The one interface every FFI on earth speaks (`ctypes`, `cffi`, Julia's +`ccall`, Rust's `extern "C"`, every scripting language's dlopen wrapper) is +the C ABI: plain functions, plain data, names that mean what they say. + +So the library ships a shim: `tools/capi/`, about ninety lines of C++ +presenting a C face, built as a shared library with `-DSRT_BUILD_CAPI=ON`. +This chapter is small because the shim is small, but three of its design +decisions were paid for the hard way — one by a compile error, one by an +audit finding, and one by a toolchain that turned out to be unable to +throw an exception at all. + +## The surface: eight functions + +The entire foreign-function interface: + +```c +{{#include ../../../tools/capi/srt_capi.h:abi_surface}} +``` + +Create, destroy, push, pull, status, latency, reset, version. The shim +wraps the *float* converter only — the notebooks are metrology instruments +and float is what they measure with; tripling the surface for Q15/Q31 +would triple the contract for consumers that don't exist yet. Minimalism +here is a feature: every function in an ABI is a promise you keep forever. + +`SrtHandle` is the classic opaque-handle pattern: a `typedef` of a struct +that is *declared* and never *defined*. C callers can hold a +`SrtHandle*`, pass it around, and store it — but never dereference it, +size it, or copy what it points to, because the compiler has no idea what +it is. Compared to the lazier convention of handing out `void*`, the named +opaque type keeps some type checking alive at the boundary: pass a +`FILE*` where an `SrtHandle*` belongs and a C compiler will at least warn. +The pointer's true identity lives on the other side of the wall. 
+ +## Two `extern "C"` blocks and the lesson between them + +Here is the other side of the wall, and the file structure is itself a +fossil of a compile error: + +```cpp +{{#include ../../../tools/capi/srt_capi.cpp:abi_impl}} +``` + +The handle is simply the converter pointer in disguise — +`reinterpret_cast` in `srt_create`, `reinterpret_cast` back on every call. +No wrapper struct, no registry of live handles, no indirection table: +there is nothing to store beyond the object itself, so the handle *is* the +object. + +Look at where the `impl()` helpers live: in an anonymous namespace, +*between* two `extern "C"` regions rather than inside one. That placement +is load-bearing. There are two `impl()` functions — one taking +`SrtHandle*`, one taking `const SrtHandle*` — which is to say, `impl` is +**overloaded**. And overloading is illegal for functions with C linkage: +C has no name mangling, both overloads would demand the same symbol name, +and the program is ill-formed. Write the helpers in the obvious place — +inside the `extern "C"` block where everything else in the file lives — +and the compiler stops you cold. That is exactly how it was discovered +here. The fix is what you see: the helpers sit outside the C-linkage +region, in an anonymous namespace that both gives them ordinary C++ +linkage (overloading welcome) and keeps them out of the shared library's +exported symbol table, where an FFI user enumerating symbols would only be +confused by them. The general rule: `extern "C"` is for the eight names +you are promising to the world, and *nothing else* belongs inside it. + +## The error convention, and why every function tolerates `NULL` + +The shim's entire error vocabulary is one value: + +```cpp +{{#include ../../../tools/capi/srt_capi.cpp:abi_create}} +``` + +`srt_create` returns `NULL` on invalid configuration or allocation +failure. 
No error codes, no `errno`, no last-error string: for a +constructor with a handful of scalar parameters, "it didn't work, and the +header tells you the two reasons it can't" is a complete diagnostic, and +every additional error channel is more contract to keep frozen forever. + +The subtle decision is downstream of that one. The first version of this +shim checked nothing: `srt_push` cast the handle and called through it, +unconditionally. The hardening audit changed every entry point to this +shape: + +```cpp +{{#include ../../../tools/capi/srt_capi.cpp:abi_null}} +``` + +The reasoning is stated in the file's own header comment, and it is worth +reading as a small essay on API design: + +```cpp +{{#include ../../../tools/capi/srt_capi.cpp:abi_doc}} +``` + +A "check create for NULL" convention *concentrates* failure on precisely +the caller who forgot the check — the one writing quick notebook code, the +one least prepared for a segfault in a foreign runtime where the crash +arrives with no C++ stack and no Python traceback, just a dead kernel. +With the guards, an unchecked failed create degrades to a converter that +accepts nothing and produces zeros: `srt_pull` returns silence, which is — +not coincidentally — the same thing the real converter produces on +underrun. The failure is still visible (`srt_status` reports zeros, the +audio is silent), but it is *debuggable* instead of fatal. Eight null +checks on functions that move hundreds of frames per call cost nothing +measurable; they buy an FFI that fails the way dynamic-language users can +diagnose. + +## The header is the contract + +`srt_capi.h` did not exist in the shim's first version — the notebook +simply re-declared the prototypes in `ctypes`, which worked and proved +nothing for anyone else. 
The audit shipped the header, and its top comment +is the ABI's real substance — the part no binary interface can encode: + +```c +{{#include ../../../tools/capi/srt_capi.h:abi_contract}} +``` + +Three promises deserve emphasis, because each answers a real foreign-caller +failure mode. + +**Thread affinity is spelled out per function.** The C++ API's +single-producer/single-consumer contract (the ring chapter) does not +dissolve because the caller is Python or Julia — but an FFI user cannot see +`std::memory_order` annotations, so the header must say it in words: one +thread pushes, one thread pulls, `srt_status` from anywhere, +`srt_reset_from_consumer` only from the consumer, create/destroy never +concurrent with anything. An ABI that documents signatures but not thread +affinity has documented the easy half. + +**`size_t` follows the platform ABI.** On every 64-bit host this is +invisible; on a 32-bit target (and this library ships to several) `size_t` +is 32 bits, and a foreign declaration hard-coding `uint64` for `frames` +corrupts the argument list. `ctypes.c_size_t` tracks the platform +automatically — the notebook uses it — but `cffi` and Julia users write +their own declarations, so the header says it explicitly. This is the kind +of sentence you only think to write after watching Part IV's 32-bit ports +in action. + +**`srt_version()` is a probe.** It returns +`major*10000 + minor*100 + patch` — `100` for today's 0.1.0. A version +*macro* would vanish into the caller's compile; a version *function* +reports what the loaded shared library actually is, which is the question +an FFI user is really asking when their symbols don't match their +expectations. It is also the cheapest possible smoke test that the DSO +loaded and calls marshal correctly — one integer, no state, no handle. 
+ +## Six doubles and two return values: marshaling without a struct + +Two smaller conventions in the surface reward a moment each, because both +are shaped by what FFIs do badly. + +`srt_status` reports six quantities — state, ppm estimate, FIFO fill, +underruns, overruns, resyncs — and the obvious C design is a struct. +The shim instead fills a caller-provided `double out[6]`. A struct +returned across an FFI boundary is a *layout* contract: the foreign side +must re-declare every field, in order, with matching types, padding, and +alignment, and nothing checks the re-declaration — get it wrong and the +fields silently shear. An array of one scalar type is the +lowest-common-denominator marshaling that every FFI on earth handles in +one line (`(ctypes.c_double * 6)()` in the notebook). The price is that +counters and an enum ride in doubles — harmless, since a double carries +integers exactly to 2⁵³ and the header documents each slot by index. One +type, one array, zero layout risk: for six values polled a few times per +second, the trade is not close. + +The push/pull return values encode the real-time contract from the ring +chapter, translated for callers who never read it. `srt_push` returns the +frames *accepted*, which may be fewer than offered — the clipped write +when the FIFO is full. `srt_pull` is deliberately asymmetric: it **always +fills** the requested frames, substituting silence while the converter is +still filling or after an underrun, and its return value reports how many +frames came from real input. An audio callback must hand *something* to +the DAC in bounded time; an API that could return "no data, try again" +would push retry logic — and the opportunity to get it wrong — into every +consumer. Silence-on-shortfall keeps the failure mode the library already +promised (a dropout sounds like silence, then a fade back in), and the +return value keeps it observable. 
FFI code that ignores both return values +still plays audio; FFI code that reads them gets telemetry. Both are valid +clients, and neither can deadlock or glitch the other side. + +## Exceptions must not cross — and one target where they cannot even fly + +Look again at `srt_create`'s body: the `new` is wrapped in +`try { ... } catch (...) { return nullptr; }`. This is not defensive +decoration. A C++ exception that propagates out of an `extern "C"` +function into a C caller is undefined behavior — there is no agreement +about what unwinding even *means* across that boundary, and the practical +result ranges from `std::terminate` to stack corruption inside a foreign +interpreter. The converter's constructor is the one place this library +throws (`Config` validation and allocation); the shim's job is to convert +that exception into the ABI's error vocabulary — `NULL` — before it +reaches the boundary. `catch (...)` rather than `catch (const +std::exception&)` because the boundary does not care *what* was thrown; +everything becomes `NULL`. + +Now the hard lesson, recorded in `docs/PERFORMANCE.md` under *Known debt*. +One of this library's supported toolchains — the Hexagon static-musl +configuration from the Part IV DSP chapter — **cannot unwind at all**. Its +runtime lacks the unwinder: when a constructor throws, the exception does +not propagate to *any* catch block, anywhere; the process terminates via +`libc++abi`. This was not discovered by reading toolchain documentation. +It was discovered the day the first `EXPECT_THROW` test reached that CI +leg and the test *runner* died — the `ConfigValidation` suite is excluded +on Hexagon to this day, and the candidate fix +(`-unwindlib=libunwind` in the toolchain file) sits unclaimed in the debt +list. + +Think through what that does to this shim's design. 
The `catch (...)` in +`srt_create` is *necessary* — on normal targets it is the entire error +mechanism — but on a no-unwind target it is **unreachable**: the throw +terminates the process before the catch can run. A caller on such a target +cannot be saved by any code positioned *after* the throw. The only +placement that works is *before* it: **validate, then construct.** The +deployment guidance in the debt entry says exactly this: on that +toolchain, treat an invalid `Config` as fatal and validate inputs *before* +constructing — check them against the constraints the constructor +enforces (positive finite sample rate, nonzero channels, band edges that +sum under the rate, and the rest of `validated()`'s list) so the +constructor is never asked to throw. It is a weaker mechanism than a +`catch`, and that is the point: it is the strongest mechanism the target +actually has. + +The generalizable ABI lesson: an FFI boundary that reports failure by +*catching* is betting that every target can unwind, and that bet is not +safe even within one library's own CI matrix. Error strategies that +*return* — validate-before-construct, factory functions, status codes — +degrade gracefully on runtimes where error strategies that *throw* simply +end the process. + +## The client: forty lines of ctypes + +The notebook's first code cell is the reference consumer, and it exercises +every clause above: it locates the DSO (building it on first run), declares +each prototype, and wraps the handle in a small numpy-aware class. Two +lines carry the load: + +```python +_lib.srt_create.restype = ctypes.c_void_p +_lib.srt_push.argtypes = [ctypes.c_void_p, _FLOATP, ctypes.c_size_t] +``` + +Without the explicit `restype`, `ctypes` assumes functions return a C +`int` — on a 64-bit machine the handle comes back truncated to its low 32 +bits, and the crash lands on some *later* call, far from the actual +mistake. 
Declaring the full prototypes is the ctypes equivalent of +including the header, and `c_size_t` is the notebook honoring the width +caveat. The wrapper's `__del__` calls `srt_destroy` (guarded, per the +convention, against a handle that never existed), and its constructor +asserts `srt_create` succeeded — the check the null-tolerance exists to +forgive, present anyway, because tolerance is for accidents, not policy. +Everything downstream — the lock-acquisition plot, the ≥125 dB +transparency assertion, the impulse-response latency check that agrees +with `srt_designed_latency_seconds()` to within 0.3 ms — runs through +these eight functions. + +## Why these ~90 lines look the way they do + +| Decision | Alternative rejected | Reason | +|---|---|---| +| C shim over the C++ API | Python bindings / pybind11 | one C ABI serves ctypes, cffi, Julia, and everything else; bindings serve one language and drag in a build dependency | +| Float converter only | mirror all three sample types | the consumers are metrology notebooks; unused surface is unpaid-for contract | +| Named opaque handle | `void*` | keeps compiler type-checking alive at the FFI edge | +| Handle = object pointer, `reinterpret_cast` | handle registry / wrapper struct | there is nothing else to store; indirection would add state and failure modes | +| `impl()` overloads outside `extern "C"` | helpers inside the block | overloading is ill-formed with C linkage — the compiler enforced this one personally | +| `NULL` return + null-tolerant entry points | "caller must check" | the convention otherwise concentrates crashes on exactly the caller who forgot, in a runtime with no useful stack trace | +| `catch (...)` → `NULL` in `srt_create` | let exceptions cross | UB across the C boundary; and see below | +| Validate-before-construct guidance | rely on the `catch` | one supported toolchain cannot unwind at all — a throw terminates before any catch runs | +| `srt_version()` function | version macro | reports the 
loaded binary, not the caller's compile-time assumption | +| Thread affinity + `size_t` width in the header | "see the C++ docs" | the header is the only artifact an FFI consumer reads | + +## Verify it yourself + +```sh +# Build the shared library: +cmake -B build -DCMAKE_BUILD_TYPE=Release -DSRT_BUILD_CAPI=ON +cmake --build build --target srt_capi -j + +# The exported surface — eight srt_* symbols, unmangled, and nothing else +# from this file (the impl() helpers are invisible, as promised): +nm -D --defined-only build/tools/capi/libsrt_capi.so | grep srt_ + +# The one-integer smoke test (0.1.0 -> 100): +python3 -c "import ctypes; \ + print(ctypes.CDLL('build/tools/capi/libsrt_capi.so').srt_version())" + +# The null-tolerance convention, exercised directly — no crash, zero frames: +python3 -c "import ctypes; lib = ctypes.CDLL('build/tools/capi/libsrt_capi.so'); \ + lib.srt_create.restype = ctypes.c_void_p; \ + print('bad create:', lib.srt_create(ctypes.c_double(-1.0), 0, 0, 1)); \ + print('push on NULL:', lib.srt_push(None, None, 128))" + +# The full reference client, plots and assertions included: +jupyter nbconvert --to notebook --execute notebooks/asrc_demo.ipynb \ + --output /tmp/asrc_demo_run.ipynb + +# Break it on purpose: move the two impl() overloads inside the extern "C" +# block and rebuild — the compiler rejects the overload set, which is the +# whole story of this file's structure in one diagnostic. +``` + +The second Python one-liner is the chapter's argument compressed: an +invalid configuration and a forgotten check, and the program prints +`None` (how `ctypes` reports a NULL `c_void_p`) and `0` instead of dying. diff --git a/book/src/part4/cortex-m.md b/book/src/part4/cortex-m.md new file mode 100644 index 0000000..9701be3 --- /dev/null +++ b/book/src/part4/cortex-m.md @@ -0,0 +1,407 @@ +# Cortex-M: bare metal, two ways + +> Civilization advances by extending the number of important operations which we can perform without thinking about them. 
+> +> — Alfred North Whitehead + +The Hexagon port ran the library on a strange ISA under a familiar OS. +The Cortex-M ports remove the OS. No loader, no threads, no filesystem, +no `argv`, no reliable way to even return an exit code — and the library +must still build, run its test suite, and hold its instruction budgets, +because MCU-class parts are where a $5 deployment actually lives. + +The project runs two of them, and the pairing is deliberate. Each board +exists to prove something the other cannot: + +- **Cortex-M55**, on QEMU's MPS3 AN547 board model. The M55 has Helium + (MVE, the M-profile vector extension) and a full scalar FPU. It proves + the library survives *bare metal itself* — the startup, the memory + map, the missing runtime — and it turned out to be hiding the single + most surprising compiler discovery in the project's history. +- **Cortex-M33**, on QEMU's MPS2+ AN505 board model. The M33 is the + Raspberry Pi Pico 2 / RP2350 class of core: single-precision FPU only, + no Helium, DSP extension present. It proves what deployment on a cheap + part actually costs, in numbers concrete enough to be budgets. + +Both share one startup file and one CTest strategy; they differ in linker +script and in what their instruction counts taught the project. This +chapter covers the shared anatomy first, then the two boards' discoveries. + +## What `-nostartfiles` obligates you to + +The toolchain files (`cmake/arm-cortex-m55-mps3.cmake`, +`cmake/arm-cortex-m33-mps2.cmake`) link with `--specs=rdimon.specs +-nostartfiles`: newlib with semihosted I/O, and *no* toolchain crt0. From +that moment the project owes the CPU everything crt0 used to provide, and +the debt is paid in one C file, `platform/armv8m_startup.c`, shared by +both targets. 
+ +It starts where the core starts — the vector table: + +```c +{{#include ../../../platform/armv8m_startup.c:pt_vectors}} +``` + +An Armv8-M core fetches its initial stack pointer from word 0 and its +reset address from word 1; the linker scripts pin this array at the +address the core will look (`KEEP(*(.vectors))`, first section — ITCM +address 0 on the AN547, the secure-alias base on the AN505, where VTOR +points at reset). The `used` attribute stops the compiler discarding an +array nothing references; `KEEP` stops `--gc-sections` doing the same at +link time. Belt and suspenders, because the failure mode — a garbage +vector table — doesn't diagnose itself; the core simply jumps into +nothing. + +There is a subtlety in how this file reaches the link, and it is the kind +of decision this book exists to record. The toolchain files pass the +startup source *on the linker command line*, from +`cmake/arm-cortex-m55-mps3.cmake`: + +```cmake +{{#include ../../../cmake/arm-cortex-m55-mps3.cmake:pt_linkline}} +``` + +The `g++` driver would otherwise compile a `.c` link input as C++, and C++ +is allowed to lower those `(uintptr_t)&Reset_Handler` initializers to +*dynamic* initialization — code that runs at startup, initializing the +table that decides where startup begins. C guarantees address-constant +initializers are link-time constants. The table must be constant for the +same reason a ladder's bottom rung must not be attached to the top of the +ladder. (The `extern "C"` guards keep the file well-defined if someone +ever does compile it as C++; the `-x c` makes sure nobody has to find out +the hard way.) 
+ +### Reset, in the only order that works + +```c +{{#include ../../../platform/armv8m_startup.c:pt_reset}} +``` + +Four moves, each ordered by a hazard: + +**MSPLIM first.** Armv8-M Mainline gives the main stack a hardware floor: +write an address to `MSPLIM` and any stack-pointer excursion below it +faults immediately, instead of the stack silently growing down into +whatever data lives below it. Why does this matter enough to be +instruction one? Because the alternative failure is the worst kind: +a deep call chain during one test overwrites the heap's top, the +corruption surfaces ten allocations later in an unrelated structure, and +the emulated target has no debugger attached and no memory protection +unit configured. A stack limit register converts that archaeology into a +HardFault at the exact instruction that crossed the line — and the +startup file gives HardFault its own handler (a `bkpt` and a park loop, +distinct from `Default_Handler`) precisely so the fault is identifiable. +This wasn't in the first version of the file; it was added by the same +infrastructure audit that hardened the Hexagon toolchain cache, and it +cost two linker-script symbols and one instruction. Insurance is rarely +priced this low. + +**FPU enable before any FP instruction, with `DSB; ISB`.** At reset, +coprocessors CP10/CP11 — the scalar FPU and MVE — are disabled; the +first FP instruction would fault. The CPACR write grants access, and the +barrier pair is not decoration: `DSB` forces the write to complete, `ISB` +flushes instructions already fetched under the old permissions. Omit the +barriers and the enable *usually* works — until an instruction prefetched +before the write faults on a real pipeline. The startup does this before +touching newlib because newlib code may legitimately use FP registers. 
+ +**Zero `.bss`, but do not copy `.data`.** C guarantees zero-initialized +statics; nobody has provided that guarantee yet, so `memset` over the +linker-defined `__bss_start__..__bss_end__` does. The conspicuous absence +is the traditional `.data` copy loop — see the linker scripts below, +because that absence is a documented dependency on QEMU, not an +oversight. + +**Then the runtime, in dependency order:** semihosting file handles +(`initialise_monitor_handles`) so `printf` works, `__libc_init_array` so +C++ static constructors run, then `exit(main(0, NULL))` — `exit`, not a +bare return, so `atexit` handlers and stream flushes happen before the +semihosting exit call. `main` receives no arguments. There is no one to +pass any; that fact shapes the whole test harness below. + +### The runtime pieces the toolchain didn't bring + +Two more gaps get filled in the same file. First, the heap. +`librdimon`'s weak `_sbrk` sizes the heap by asking the host, via the +semihosting `SYS_HEAPINFO` call, where the heap should live — an answer +that depends on the emulator's mood for a given board model. The startup +overrides it with the boring, deterministic version: + +```c +{{#include ../../../platform/armv8m_startup.c:pt_sbrk}} +``` + +The heap is exactly the region the linker script says, ends exactly where +the script says, and `malloc` fails with `ENOMEM` — a *testable* +condition — rather than wandering into memory the map never granted. + +Second, 64-bit atomics. The library's telemetry counters are 64-bit +`std::atomic`s; M-profile has no 64-bit exclusive-access +instructions, GCC lowers those operations to `__atomic_*_8` library +calls, and the bare-metal toolchain ships no libatomic. 
The startup +provides the four helpers the link actually needs, built on the classic +single-core primitive — mask interrupts, do the plain 64-bit access, +restore: + +```c +{{#include ../../../platform/armv8m_startup.c:pt_irqlock}} +``` + +```c +{{#include ../../../platform/armv8m_startup.c:pt_atomic_rmw}} +``` + +Why is PRIMASK sufficient where a mutex or an exclusive-access loop would +be required elsewhere? Because on a single-core part, the only agent that +can interleave with a sequence of instructions is an interrupt handler on +the same core — there is no second observer, no other cache, no store +buffer visible from elsewhere. `cpsid i` makes the critical section +literally uninterruptible, so the load-modify-store is atomic with +respect to everything that exists on the machine. The reasoning is sound +*only* single-core, which is why the dual-core RP2350 firmware at the end +of this chapter pointedly refuses to rely on it, and shares nothing +across cores except 32-bit atomics. Note also what the file does *not* +do: it implements only the helpers currently linked, and deliberately +omits the rest (compare-exchange and friends), so any future need +surfaces as a link error instead of as a silently wrong fallback. + +## Two linker scripts, two philosophies of stack + +The memory maps mirror each board model. The AN547: + +```ld +{{#include ../../../platform/mps3_an547/mps3_an547.ld:pt_memory}} +``` + +Four regions, four jobs: vectors in ITCM (address 0, where VTOR resets), +code in SRAM, **the stack owning all of DTCM**, data/bss/heap in ISRAM. +Giving the stack a private 512 KB region is a luxury the board offers and +the script accepts gratefully — the stack limit is simply the region's +base, and stack and heap physically cannot collide because they do not +share a region. 
+ +The AN505 has only the two big SRAMs, so stack and heap must cohabit, +and the script makes the boundary explicit rather than hopeful: + +```ld +{{#include ../../../platform/mps2_an505/mps2_an505.ld:pt_heap_stack}} +``` + +The stack descends from the top of DATA; the heap is *capped* 64 KB below +the top; `__stack_limit` is set exactly at the cap. Between `_sbrk` +refusing to grow past `__heap_end__` and MSPLIM faulting below +`__stack_limit`, the classic bare-metal failure — stack and heap growing +silently into each other — is fenced from both sides. One side returns +`ENOMEM`; the other side HardFaults. Neither corrupts. + +And the honesty clause, stated in both scripts' headers: **QEMU's +`-kernel` loader places the ELF directly into RAM, so VMA == LMA and +`.data` needs no load-time copy.** On real silicon booting from flash, +initialized data must be linked with a load address in flash and copied +to RAM by the startup — the loop this startup deliberately does not +have. The scripts say so in as many words. This is the same discipline +as the performance documentation: the artifact records what it is +validated for, and the boundary of that validation, in the place the next +user will actually look. A linker script that works under QEMU while +*looking* like a flash-boot script would be a trap; one that documents +"QEMU-only, here's why" is a foundation. + +## CTest without an operating system + +The toolchain files end with `set(SRT_BARE_METAL ON)`, and +`tests/CMakeLists.txt` branches on it. The problem it solves: CTest's +contract with a test binary is "run it with arguments, read its exit +code," and bare metal breaks both halves. There is no `argv` to pass a +`--gtest_filter`, and semihosting does not reliably propagate the guest's +exit status through `qemu-system-arm`. + +The replacement is a one-shot protocol. 
A dedicated `main` bakes the +filter in at compile time, and the *pass criterion is a printed string*: + +```cpp +{{#include ../../../tests/bare_metal_main.cpp}} +``` + +CTest registers a single test whose `PASS_REGULAR_EXPRESSION` is +`SRT_TESTS_COMPLETE rc=0` and whose `FAIL_REGULAR_EXPRESSION` is gtest's +`[ FAILED ]` marker: the run passes only if the summary line is printed +*and* no failure marker ever appears. The completion line is printed at +the last possible moment, so a crash, fault, or park-loop after the tests +cannot masquerade as success — the harness times out instead (the +`Default_Handler` comment in the startup file closes this loop: faults +park, parking times out, timeouts fail). + +Three details in that file repay attention: + +- **The filter excludes by category, not by taste.** What is cut is + minutes of soft-float virtual audio proving target-independent control + math already proven on every host leg; what stays is everything only + the target can falsify — datapath arithmetic, ring behavior on 32-bit + `size_t`, the end-to-end converter. The comment about `AsrcQuality*` + versus `AsrcQuality.*` records a real trap: in gtest filters the dot is + a literal, and the wrong spelling silently *narrows* the exclusion. +- **The empty-run guard.** A filter typo can select zero tests, and + `RUN_ALL_TESTS()` cheerfully returns 0 for an empty run — a green CI + leg testing nothing, forever. The guard fails the run if fewer than 15 + tests were selected (the real selection is ~20; the slack allows + legitimate removals). It must be checked *after* `RUN_ALL_TESTS()`, + because gtest applies the filter inside it — the count reads zero + before. This guard, like MSPLIM, arrived via audit: the theme of that + audit was hunting for ways a passing signal could be vacuous. 
+- **GoogleTest itself needs the bare-metal treatment.** Newlib ships stub + `pthread.h`/`regex.h` headers that make POSIX feature *detection* + succeed spuriously, so the build doesn't probe for threads at all on + bare metal and pins the feature macros (`GTEST_HAS_POSIX_RE=0`, stream + redirection and filesystem off) — value-checked macros only, since + gtest tests `GTEST_HAS_DEATH_TEST` with `#ifdef` and defining it to 0 + would *enable* what it names. + +The result: `ctest --test-dir build` on a developer machine runs ~20 +tests on an emulated Cortex-M55 exactly as transparently as the Hexagon +chapter's suite — `CMAKE_CROSSCOMPILING_EMULATOR` is doing the same work, +with `qemu-system-arm -M mps3-an547 -nographic -semihosting -kernel` +prefixed to the binary instead of `qemu-hexagon`. + +## What the M55 was hiding: Helium at plain `-O2` + +The M55 port existed for correctness. Its instruction baselines then sat +quietly in `bench/baselines.json` until the M33 arrived and gave them a +comparison point — and the comparison didn't add up. Identical source, +identical flags, same GCC: the M33's Q15 pipeline count came in at +roughly **4× the M55's**. Slower silicon-for-silicon was expected; +4× in *executed instructions* was not, because instruction counts don't +care about clock speed or memory latency. Something was executing +different instructions. + +`objdump` answered in one line of shell: the M55 binary contained **71 +MVE instructions**. The M33 binary contained **zero** (it has no MVE to +contain). Nobody had written a line of SIMD — **GCC auto-vectorizes the +Q15/Q31 kernels with Helium at plain `-O2`** when targeting +`-mcpu=cortex-m55`. The M55's numbers had been MVE-accelerated from the +day the target landed, and the project's own performance plan — which +listed "explicit Helium kernels for the M55" as future optimization +headroom — was describing work the compiler had already done. 
The +hypothesis list in `docs/PERFORMANCE.md` was rewritten the same day: +explicit M55 SIMD is *moot*; the real headroom was on the cores without +MVE, which became C4. + +The M55 also supplied the project's most instructive documentation bug, +told in this book's introduction: the C3 integer-phase change showed +`pipeline_float` **+1.4%** on the M55, contradicting the expectation that +removing double math must help a core documented (in the project's own +notes) as having no FP64. The measurement was right and the notes were +wrong: the M55's *scalar* FPU executes FP64 in hardware — only the MVE +vector unit is fp16/fp32. C3 had traded cheap hardware doubles for int64 +arithmetic on that one target, a fair price for the large cross-target +wins (Q15 −5.3%, Q31 −4.6% on the same core), and the correction is +recorded in the plan's hypothesis list. A 1.4% anomaly in a deterministic +metric was enough to falsify a "fact" everyone involved would have sworn +to. Noisy metrics don't generate that kind of pressure; this is why the +ratchet gates on instructions and not on milliseconds. + +## What the M33 exists to say about the Pico 2 + +The M33 leg is the deployment-realism target, and its numbers are meant +to be read as a datasheet for the Raspberry Pi Pico 2 class of part. + +**Float is not a datapath here.** The committed baselines put +`kernel_float` at 1,897,321,329 instructions against the M55's 99,468,474 +for the same workload — the README's "~19×" — because every `double` +accumulation in the float kernel is soft-float library calls on a core +with a single-precision-only FPU. The consequence is stated as guidance, +not lament: on Pico-class parts, use Q15 or Q31, the formats the +fixed-point traits chapter built for exactly this moment. + +**The DSP extension was idle until C4.** Disassembly of the original M33 +binaries found barely any use of the DSP extension (two `smlal`s). 
The +C4 kernel fixed that with `SMLALD` — packed dual 16×16 MAC into a 64-bit +accumulator — gated on `__ARM_FEATURE_DSP && !__ARM_FEATURE_MVE` so the +M55 keeps its auto-vectorized loop (verified: 0.00% change on every M55 +and Hexagon scenario), bit-exact by construction because the products are +exact in int32 and int64 accumulation is associative. It bought −3.1% on +`pipeline_q15`, and the C4 entry keeps honest books about why the win is +bounded: the M33's Q15 frame cost is dominated by the coefficient blend's +64-bit products and transport, not by the dot product the intrinsic +accelerates. + +**Budgets, stated as instructions, pending cycles.** Dividing the +baselines out: `pipeline_q15` is 484,146,844 instructions per 96,000 +frames ≈ **5,043 instructions per stereo frame**; the 12-channel shape is +≈ 10,027. A 150 MHz core at 48 kHz has 3,125 *cycles* per frame. The +README draws the honest conclusion in instruction-space — Q15 mono fits +a 150 MHz core, stereo wants the `fast()` preset or the RP2350's second +core — and then refuses to pretend the units match: instructions are not +cycles, the ratio between them is an empirical property of real silicon, +and the guidance is explicitly a budget *pending real-silicon +validation*. + +Two flashable firmwares exist to close exactly that loop, and they are +the bridge from this chapter's emulated world to Part V's hardware: + +- **`examples/pico2_cyccnt`** runs the same fixed pipeline workloads on a + real Pico 2 and times each 32-frame block with the M33's DWT.CYCCNT + hardware cycle counter. Its output divided by the committed baselines + (5,043 and 10,027 instructions per frame) yields the + cycles-per-QEMU-instruction calibration constant that turns *every* + M33 baseline, current and future, into a real cycle budget. +- **`examples/pico2_dualcore`** is the "second core" clause made + literal — and it is the library's concurrency story passing its + sternest exam. 
The `push()`/`pull()` contract names one producer agent + and one consumer agent around the lock-free ring; it never says + *threads*. On the RP2350, core 0 becomes the producer clock domain + (pushing at a synthesized +200 ppm offset, so the servo's estimate has + an exact truth value to be judged against — the one thing two real + crystals can never give you) and core 1 becomes the consumer, timing + every `pull()` with its own per-core DWT. Two cores over coherent SRAM + satisfy the acquire/release contract exactly as two threads do. + Everything else crossing cores is 32-bit atomics only — because on the + M33, 64-bit `std::atomic` is not lock-free, the same fact the startup + file's PRIMASK helpers exist to paper over on *one* core and which no + single-core trick can fix across two. Even the firmware's 12-channel + phase runs at 16 kHz *by arithmetic, not caution*: 10,027 + instructions per frame against a 3,125-cycle budget cannot fit at + 48 kHz on one core, and `pull()` of one converter instance is one + consumer by contract — a second core buys one clock domain per core, + not more datapath than one core has. + +## Verify it yourself + +```sh +# Both bare-metal legs, end to end (arm-none-eabi-g++ and qemu-system-arm +# on PATH — exactly what CI installs): +cmake -B build-m55 -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m55-mps3.cmake -DSRT_BUILD_EXAMPLES=OFF +cmake --build build-m55 -j && ctest --test-dir build-m55 --output-on-failure + +cmake -B build-m33 -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m33-mps2.cmake -DSRT_BUILD_EXAMPLES=OFF +cmake --build build-m33 -j && ctest --test-dir build-m33 --output-on-failure + +# The Helium discovery, on today's binaries: MVE loads/MACs present in the +# M55 build, absent in the M33 build. (The recorded count at discovery was +# 71 vs 0; the exact number moves with the compiler — the zero does not.) 
+arm-none-eabi-objdump -d build-m55/tests/srt_tests | grep -cE 'vldr[bhw]\.|vmlaldav' +arm-none-eabi-objdump -d build-m33/tests/srt_tests | grep -cE 'vldr[bhw]\.|vmlaldav' + +# The empty-run guard, demonstrated: break the filter in +# tests/bare_metal_main.cpp (e.g. filter = "NoSuchTest*"), rebuild, and the +# run fails with "filter is broken" instead of passing green. + +# The instruction budgets (counting-plugin build is in ci.yml icount-ratchet; +# same configure for m33 with the other toolchain file): +cmake -B build-m55-ic -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m55-mps3.cmake \ + -DSRT_BUILD_TESTS=OFF -DSRT_BUILD_EXAMPLES=OFF -DSRT_BUILD_ICOUNT_BENCH=ON +cmake --build build-m55-ic -j +python3 scripts/icount.py --target m55 --build-dir build-m55-ic --plugin /tmp/libinsncount.so + +# The budgets on real silicon (a Raspberry Pi Pico 2 and a USB cable): +# examples/pico2_cyccnt/README.md — cycles per frame, DWT.CYCCNT +# examples/pico2_dualcore/README.md — one clock domain per core, self-judging +``` + +The two `objdump` lines are this chapter compressed: the same source, the +same compiler, the same flags — and the difference between the binaries +is a discovery you can grep for. Bare metal did not make the library +different; it made what the library was already doing *visible*, one +instruction at a time. diff --git a/book/src/part4/hexagon.md b/book/src/part4/hexagon.md new file mode 100644 index 0000000..dc8f2ea --- /dev/null +++ b/book/src/part4/hexagon.md @@ -0,0 +1,384 @@ +# Hexagon: a DSP that keeps secrets + +> Trust, but verify. +> +> — Russian proverb + +Every portability chapter in this part answers the same question: what did +the target force the library to learn that no amount of host testing could +have taught it? Hexagon — Qualcomm's DSP architecture, the kind of core +that audio actually ships on inside a phone — answered it four times, and +three of the four answers contradicted a reasonable engineer's prior. 
This +chapter walks through the port itself (which is small) and then the four +lessons (which are the point), in the order the project learned them. + +First, the ground rules the target sets. Hexagon here is +`hexagon-unknown-linux-musl`: a 32-bit `size_t` (the ring chapter's +wraparound proof stops being theoretical), musl instead of glibc, clang +instead of GCC, and — the fact that ends up organizing half of Part III — +**no double-precision FPU**. Every `double` the library touches on this +target is a call into soft-float routines. The library's float datapath +accumulates in `double` deliberately (that decision is defended in the +polyphase chapter); on Hexagon that choice has a price tag, and this +chapter contains the receipts. + +## The whole port is one file + +Here is everything SampleRateTap needed to run its test suite on a +Qualcomm DSP: + +```cmake +{{#include ../../../cmake/hexagon-linux-musl.cmake}} +``` + +Thirty lines, and most of them are comments. Two decisions carry the file. + +**`CMAKE_CROSSCOMPILING_EMULATOR qemu-hexagon`.** This single line is what +makes the port *routine* instead of a parallel test infrastructure. CMake +prepends the emulator to every test command it registers, so `ctest` runs +each cross-compiled binary under `qemu-hexagon` user-mode emulation without +knowing it is doing anything unusual. It goes further than the obvious +case: `gtest_discover_tests()` needs to *execute* the test binary at build +time to enumerate its tests, and the emulator prefix makes discovery work +too — which is why `tests/CMakeLists.txt` raises `DISCOVERY_TIMEOUT` to +120 seconds and the per-test timeout to 900. Instruction-set emulation is +slow, roughly an order of magnitude or two; the timeouts are the only +place the build system admits it. + +The same pattern is deliberately generic. 
The commented-out HiFi4/HiFi5 +job template in `.github/workflows/ci.yml` is this toolchain file with the +names changed (`xt-clang++`, `xt-run`): any target with a cross-compiler +and an instruction-set emulator drops into the same shape, and the test +suite — the project's real asset — transfers unmodified. + +**`-static`.** A dynamically linked musl binary needs the emulator to be +told where the target's loader and shared libraries live (`qemu-hexagon +-L /path/to/sysroot`), and that path would have to thread through CMake, +CTest, CI, and every developer's shell. Static linking deletes the whole +problem: the binary is self-contained, the emulator invocation is just +`qemu-hexagon ./srt_tests`, and nothing about the sysroot can drift out of +sync. For a test rig this is the right trade without much argument — the +binaries are throwaway artifacts, nobody cares that they are megabytes +instead of kilobytes. Keep this decision in mind, though. It comes back at +the end of the chapter with teeth. + +The CI leg (`hexagon-qemu` in `ci.yml`) runs the suite with an exclusion +list: the multi-minute quality and lock simulations, the 10-million-element +thread stress, and a few others. The reasoning is stated in the workflow +and worth internalizing: those tests prove target-independent control +mathematics and host concurrency, which emulation neither speeds up nor +measures meaningfully. What stays *in* is exactly what the target can +falsify — kernel accuracy, fixed-point arithmetic, 32-bit `size_t` +behavior, atomics lowering, musl's corners. An emulated test leg should +run only the tests that target can fail. + +One boundary must be drawn before any number in this chapter is quoted, +and the toolchain file draws it in its own header comment: user-mode +emulation validates ISA-level *correctness*, never performance. QEMU +translates guest instructions to host instructions and runs them as fast +as it can; nothing about its timing resembles the DSP's. 
What emulation +*can* produce deterministically is the count of guest instructions +executed — the metric Part II's ratchet chapter is built on — and that +count is a good proxy for scalar-code cost while remaining a proxy. +Cycle-accurate Hexagon numbers require the proprietary Hexagon SDK +simulator, which this project does not have; the documentation says so +rather than letting instruction counts impersonate cycles. Every Hexagon +figure below is therefore an instruction count, exact to the instruction, +reproducible on your machine, and honest about what it is not. + +## Lesson one: the genuinely FP64-less target + +The first thing Hexagon did was refuse to be impressed by an optimization +that worked everywhere else. + +C1 — the blended-row precompute, Part III's opening win — cut the M55 +pipeline instruction counts by 15–30% and host stereo wall-clock by 36%. +The same change on Hexagon: **−3.6% float, −3.3% Q15, −0.2% Q31**. Not +wrong, not a regression — just strangely small, and "strangely small" is +the most informative result an instruction counter can produce. If a +change that halves the inner-loop arithmetic barely moves the total, the +total is not made of inner-loop arithmetic. The diagnosis, recorded in +the C1 entry of `docs/PERFORMANCE.md`: Hexagon's pipelines were dominated +by the per-sample phase bookkeeping, done in `double` and therefore +soft-floated — every phase increment, wrap and blend-factor conversion +expanding into library-call arithmetic that dwarfed the MACs the +optimization had so carefully thinned. + +That diagnosis did two things. It motivated C3, the Q0.64 integer phase +accumulator, whose design you have already seen in Part III. And it +forced a correction that is preserved in `docs/PERFORMANCE.md`'s +hypothesis list: the project had been assuming the Cortex-M55 was also in +this soft-double class, and it is not — the M55's *scalar* FPU executes +FP64 in hardware (only the MVE vector unit is fp16/fp32). 
The M55's +float numbers had never been soft-double-bound. **Hexagon is the +genuinely FP64-less target**, the only one in CI where "the phase math is +done in doubles" translates to "the phase math is done in subroutines." + +Which is why C3's Hexagon column is the loudest in the whole optimization +campaign. Eliminating soft-double phase math from the per-sample path +bought, from the PR's gating run: + +| Scenario | Hexagon instructions | +|---|---:| +| `pipeline_q31` | **−15.5%** | +| `pipeline_q15` | **−10.3%** | +| `pipeline_float` | −2.6% | +| kernels | count-identical (control) | + +The kernels-identical row deserves its footnote: the change touched only +the converter's per-sample phase path, so the isolated-kernel workloads +*must* not move. They didn't, to the instruction. That is what a control +group looks like in this project's methodology, and the deterministic +QEMU counts are what make a control group meaningful at all — a +wall-clock benchmark can certify "similar," never "identical." + +## Lesson two: hexagon-clang wants aliasing proven, not promised + +C2, the vectorization audit, restrict-qualified the kernel hot-loop +pointers after `-fopt-info-vec` showed GCC vectorizing the blend loop +only behind a runtime aliasing check ("loop versioned for +vectorization"). On the M55 the payoff was real but narrow: +`pipeline_float` −1.35%, every other scenario exactly 0.00%. + +The same one-line annotation on Hexagon, from the PR's gating run: + +| Scenario | arm-gcc (M55) | hexagon-clang | +|---|---:|---:| +| `pipeline_float` | −1.35% | −1.6% | +| `pipeline_q15` | 0.00% | **−6.2%** | +| `pipeline_q31` | 0.00% | **−12.3%** | +| kernels | 0.00% | 0.00% (control) | + +Same source, same semantics, wildly different sensitivity. 
The commit +that pinned the new Hexagon baselines states the finding plainly: +*hexagon-clang benefits from provable no-aliasing far more than arm-gcc +did* — once aliasing is provable it schedules the dot loops +substantially better. That is consistent with what Hexagon is: a VLIW +machine whose compiler packs multiple operations per issue packet and +therefore lives or dies by how freely it may reorder memory operations. +A `restrict` that merely deletes one runtime check on an in-order ARM +core instead unlocks the scheduler on a DSP. + +The portable lesson is about division of labor: `SRT_RESTRICT` was added +for a measured GCC reason, and the *same annotation* paid a much larger, +unlooked-for dividend on the DSP compiler. Aliasing facts belong in the +source, stated once, precisely — because you cannot predict which +backend will be able to spend them. + +## Lesson three: the ISA already had the trick (C5) + +By C5 the project had a pattern that worked: the C4 packed dual-MAC Q15 +kernel had just bought −3.1% on the Cortex-M33 with a small block of +intrinsics. Hexagon has a directly analogous instruction, `vrmpyh` — +four exact 16×16 products summed into a 64-bit accumulator per +instruction, C4's argument at twice the width. The hypothesis practically +wrote itself. + +It was implemented properly: a `vrmpyh` intrinsic loop for the Q15 dot +product, bit-exact against the portable path, full suite green on Hexagon +QEMU. Then it was measured, and the ratchet reported: + +> `pipeline_q15`: 119,847,854 → 119,478,758. **−0.31%.** + +A result that small demands an explanation before it demands a decision, +because there are two very different ways to earn −0.31%: either the +compiler was already emitting wide MACs (making the intrinsics +redundant), or the wide MACs genuinely don't matter here. 
The two imply +opposite things about future work, so the project pulled disassembly from +CI (`llvm-objdump`, pre and post): the baseline binary contains **zero** +wide-MAC instructions; the intrinsic build contains **10**. The compiler +had not already done it. The instructions landed, executed — and saved +almost nothing. + +The explanation is in the scalar ISA. Hexagon already issues +single-instruction 64-bit multiply-accumulates (`Rxx += mpy`) and 64-bit +loads, so the portable C++ loop was already running close to one MAC per +instruction, with none of the per-element overheads the M33's baseline +loop had been paying. And what a 4-wide reduce could still have saved, +the fix-up work ate: the history window is 2-byte aligned by nature (it +is a stream of Q15 samples), so feeding `vrmpyh` requires combine and +alignment work that costs nearly what the wider multiply saves. C4 won on +the M33 because there was fat to cut; Hexagon's baseline had none. + +You can see the same fact from the committed baselines, without any +intrinsics experiment at all. The README's instruction-count table has +`kernel_q15` at 102,819,852 on Hexagon against 181,994,196 on the +Cortex-M55 — the scalar DSP executes *fewer* instructions than the core +whose Q15 loops GCC vectorizes with Helium. Cross-ISA instruction counts +must be read with care (an instruction is not a unit of work, and fewer +instructions is not the same claim as faster), but as a measure of *MAC +density* the comparison is legitimate: Hexagon's ISA packs so much of +this workload into each instruction that there was structurally little +for a wider multiply to remove. C5's failure was, in hindsight, already +sitting in the baseline table. The experiment's value was turning "in +hindsight" into a checked fact with disassembly attached. + +So the code was deleted. 
Not shelved, not flag-gated: reverted, per the +stop rule in `docs/PERFORMANCE.md` — per-architecture complexity must +justify itself, and −0.31% does not justify a permanent intrinsic code +path that every future refactor must keep bit-exact. The C5 entry in +`docs/PERFORMANCE.md` *is* the deliverable: the numbers, the disassembly +evidence, and the reasoning, recorded so that nobody re-derives this dead +end in two years when the file looks temptingly scalar again. + +The entry also pre-empts the obvious follow-up — "fine, scalar `vrmpyh` +is redundant, but what about HVX, the 128-byte vector unit?" — with +arithmetic instead of enthusiasm. A 48–80-tap dot product doesn't fill +one HVX vector; worse, HVX 16-bit MACs accumulate in 32-bit lanes, and +the library's exact-int64 accumulation invariant overflows 32 bits after +about 24 worst-case taps. Per-channel tap-axis dots are simply the wrong +*shape* for HVX. The shape that fits — one 64-bit lane pair per channel, +16 channels filling a vector exactly — is the channel-parallel form, and +that observation, recorded as the successor hypothesis, became C6. + +Negative results are worth exactly what you write down about them. + +## Lesson four: the exception secret + +For months the Hexagon leg was the quiet one. Then a hardening PR added +the library's first `EXPECT_THROW` tests — constructor validation, +`Config::validated()` throwing on nonsense configurations — and the +Hexagon leg turned red in a way no other platform did. The constructor +throws correctly. The `EXPECT_THROW` machinery is standing by to catch. +And the exception never arrives: **this static-musl toolchain +configuration cannot unwind the stack.** The throw reaches the runtime, +the unwinder that should walk the frames is not part of the link, and +`libc++abi` does the only honest thing left — terminate. 
Every other +platform passed; main was red on exactly one leg, because that leg was +the first place a C++ exception had ever actually been *thrown* in this +project's CI history. + +Remember `-static`, the convenience decision from the top of the chapter? +This is its bill arriving. The configuration had silently shipped without +a working unwind path, and nothing in months of green CI could have said +so, because exception propagation is invisible until the first frame +needs unwinding. A capability you never exercise is a capability you do +not have — you merely have no evidence yet. + +The response is a case study in how this project metabolizes a +limitation, three moves in one commit: + +1. **Quarantine precisely.** `ConfigValidation` is excluded from the + Hexagon `ctest` invocation — that suite and nothing else, with a + comment in `ci.yml` explaining why. Validation logic is + target-independent and still covered on every other leg; what Hexagon + cannot test is the *unwinding*, not the *validating*. +2. **Record it where deployers look.** The Known-debt ledger in + `docs/PERFORMANCE.md` gets an entry with the deployment rule stated as + a rule: on this toolchain configuration, an invalid `Config` is + **fatal** — validate inputs *before* constructing, because the + constructor's throw will take the process down rather than propagate. + The toolchain file itself carries the same caveat, so the next person + to cross-compile inherits the warning at the point of use. +3. **Name the candidate fix without pretending it is done.** Linking an + unwinder (`-unwindlib=libunwind`) in the toolchain file would likely + restore propagation; it stays a recorded candidate until someone + verifies it, because "probably fixable" and "fixed" are different + ledger states. + +The library's API already leaned the right way — `validated()` exists +precisely so callers can validate before constructing — so the rule +costs a deployer one line. 
But the general finding stands, and it is the +chapter's title: a target can keep a secret like this indefinitely, and +the only way to surface it is to route every kind of behavior through the +target. The first `EXPECT_THROW` to reach the leg was, in effect, the +first test of a claim the toolchain had been silently making all along. + +## The CI craft: trusting your emulator and your compiler + +Two pieces of infrastructure make the Hexagon numbers in this book +reproducible rather than anecdotal, and both are about supply chain more +than about DSP. + +**The emulator is built from source, on purpose.** The instruction-count +ratchet needs a `qemu-hexagon` with TCG plugin support — the counting +plugin is how "executed instructions" becomes a number at all. Neither +Debian's `qemu-user` package nor the qemu bundled with the Hexagon +toolchain enables plugins. So the `icount-ratchet` job compiles its own: +the pinned QEMU 8.2.2 source tarball, verified against a hard-pinned +SHA256, configured minimally — + +```sh +./configure --target-list=hexagon-linux-user --enable-plugins \ + --disable-docs --disable-tools --disable-system +``` + +— about four minutes to build the one binary needed, cached thereafter. +The job then *probes* the result (`qemu-hexagon -plugin help`, judged by +the error text because qemu exits nonzero either way when given no guest +binary) rather than assuming the cache returned what was put in. The +plugin header is pinned to the commit the v8.2.2 tag pointed at, by +commit SHA — tags are movable; commits are not. + +**The toolchain is verified twice, against two different threats.** The +cross-compiler is the prebuilt open-source release from +`quic/toolchain_for_hexagon` (clang 19.1.5, hosted on CodeLinaro). 
On +download, CI checks it against the *published* `SHA256SUMS` file — which +catches corruption and cache poisoning — and against a *hard pin* baked +into the workflow, which is the only check that catches an origin +compromise, since an attacker who can replace the tarball can replace the +SUMS file beside it. The cache key is derived from the pinned digest +itself, so no job that has not verified the pin can ever write the cache +entry a trusting job will read. That last detail was not free: an audit +found two other jobs sharing the trusted cache key while downloading +without verification — a classic poisoning window — and the fix (verify +everywhere, key on the digest) is part of the same hardening commit that +gave the Cortex-M targets their stack-limit register in the next chapter. + +None of this is DSP knowledge. All of it is what "the Hexagon numbers are +CI-gated" has to mean if the phrase is to carry weight: the compiler +whose output is being counted and the emulator doing the counting are +both pinned, verified artifacts, not whatever the package manager felt +like resolving that morning. + +## What the port did not require + +It is worth pausing on the dog that didn't bark. Running a modern C++20 +template library on a Qualcomm DSP required: one 30-line toolchain file, +a test-filter list, and zero changes to library code. No `#ifdef +__hexagon__` exists in any header. The 32-bit `size_t` was already +handled by the ring's wraparound arithmetic (proved, then tested, in the +ring chapter); the absence of threads never came up because the library +never spawns one; the atomics lowered correctly because the ring asserts +`is_always_lock_free` at compile time and would have refused to build +otherwise. The port was boring precisely to the degree that the library's +portability claims were already true — and interesting precisely where +the *toolchain*, not the library, had been making claims nobody had +tested. 
Both halves of that sentence are the reason to port early: the +boring half is regression-proofed for free from then on, and the +interesting half you want to hear about from CI, not from a customer. + +## Verify it yourself + +```sh +# The port, end to end (hexagon-unknown-linux-musl-clang++ and qemu-hexagon +# on PATH; .github/workflows/ci.yml "hexagon-qemu" has the toolchain URLs): +cmake -B build-hex -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=cmake/hexagon-linux-musl.cmake \ + -DSRT_BUILD_EXAMPLES=OFF +cmake --build build-hex -j +ctest --test-dir build-hex --output-on-failure \ + -E 'AsrcQuality|AsrcLock|TwoThreadStress|TransparentPrototypeMeetsSpec|MultiChannel\.|Feasibility|Reset\.|ConfigValidation' + +# The exception secret, demonstrated: remove ConfigValidation from the -E +# list above and watch libc++abi terminate instead of EXPECT_THROW passing. + +# The instruction counts (needs the plugin-enabled qemu-hexagon; the +# icount-ratchet job in ci.yml shows the 4-minute from-source build): +cmake -B build-hex-ic -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=cmake/hexagon-linux-musl.cmake \ + -DSRT_BUILD_TESTS=OFF -DSRT_BUILD_EXAMPLES=OFF -DSRT_BUILD_ICOUNT_BENCH=ON +cmake --build build-hex-ic -j +python3 scripts/icount.py --target hexagon --build-dir build-hex-ic \ + --plugin /path/to/libinsncount.so + +# The C5 negative result's disassembly evidence, reproduced on today's +# binary (the count should be zero — the intrinsics were reverted): +llvm-objdump -d build-hex-ic/bench/icount/srt_icount_pipeline_q15 | grep -c vrmpy +``` + +The last command is this chapter's thesis in one line. The claim "the +wide-MAC intrinsics were deliberately not kept" is not a story in a +design document; it is a property of the shipped binary that you can +count, and the C5 entry in `docs/PERFORMANCE.md` is the record of why +counting it settled the question. 
diff --git a/book/src/part5/hardware.md b/book/src/part5/hardware.md new file mode 100644 index 0000000..fe120e9 --- /dev/null +++ b/book/src/part5/hardware.md @@ -0,0 +1,431 @@ +# Real clocks: bridges and firmware + +> Before enlightenment: chop wood, carry water. After enlightenment: chop wood, carry water. +> +> — Zen proverb + +Everything measured so far in this book — the 135 dB residual, the lock in +~1 s, the drift ramp tracked without unlocking — came out of a simulation. +A good one: deterministic, sample-granular, reproducible to the bit, able +to synthesize a +200 ppm offset that is *exactly* +200 ppm so the servo's +estimate has a truth value to be judged against. That determinism is the +whole reason Part II's proof system works, and it is also, unavoidably, a +confession. A simulated clock is a number in a loop. It has no crystal, no +temperature coefficient, no USB host controller rescheduling its transfers, +no twelve-hour soak in a warm room. The library exists to reconcile two +*physical* oscillators, and at some point the only honest move is to plug +two of them in. + +This chapter is about that move: what real hardware can prove that the +deterministic suite cannot, the three test setups the project defined for +it, and the three harnesses that shipped — an ALSA bridge for Linux hosts +and two firmware images for the Raspberry Pi Pico 2. It ends by stating +plainly which numbers exist today and which still await a physical board. + +## What simulation cannot say + +Be precise about the gap, because it is narrower than "simulation isn't +real." The two-clock simulator *is* the library's use case in every +algorithmic respect; nothing about the datapath or the servo mathematics +changes on hardware. What changes is the input to the control loop: + +- **The offset stops being constant.** Real crystals sit typically + 20–200 ppm apart and move several ppm with temperature — slowly, over + minutes, as the room warms or a component self-heats. 
The suite tests a + *scripted* drift ramp; hardware supplies an unscripted one, forever. +- **The pacing stops being clean.** A simulated push arrives exactly on + schedule. A USB audio dongle's data arrives when the host controller and + the kernel get around to it — jitter that is structured, bursty, and + unlike anything a deterministic loop generates. The FIFO setpoint rule + ("exceed the peak occupancy excursion of your push/pull jitter") is only + ever *exercised* by real jitter. +- **Time stops being short.** The quality suite analyzes one second of + audio after settling. The claim a deployment actually cares about — + *zero* underruns, overruns, or resyncs over hours — is a statement about + the tails of every distribution at once, and the only instrument that + measures tails is a soak. A multi-hour run on independent oscillators is + the test no simulation honestly replaces. + +There is also one thing simulation does *better*, worth keeping: a +synthesized offset is exact, so convergence can be asserted to a tolerance. +Two real crystals give you a true offset you don't know — you can check the +estimate is *stable* and *independently corroborated* (count frames from +each device against `CLOCK_MONOTONIC` for ten minutes; the measured rate +ratio should match the servo's estimate to well under 1 ppm), but never +that it equals a known constant. The hardware plan uses both kinds of truth +deliberately, as we'll see. + +`docs/HARDWARE_TESTING.md` defines three setups, in increasing order of +effort, all commodity parts: + +1. **One Pi, two USB audio dongles** (~$15 of adapters). Each dongle clocks + its own 48 kHz from its own crystal; the library bridges them. The + canonical real-world test, and the source of the headline result the + project wants: "locked to the real inter-crystal offset of X ppm, N + hours, zero discontinuities." +2. 
**Pi + Raspberry Pi Pico 2.** Validates the QEMU-derived Cortex-M33 + numbers on an actual RP2350: real cycles against emulated instruction + counts, and the dual-core deployment shape. +3. **Two Pis over Ethernet.** The network-audio case, where `push()` sees + bursty UDP delivery instead of callback-paced blocks — the setpoint rule + under genuinely hostile jitter. + +Setup 1's harness is `examples/alsa_bridge.cpp`; Setup 2's are +`examples/pico2_cyccnt/` and `examples/pico2_dualcore/`. Setup 3's programs +are not yet written. Each shipped harness is worth reading closely, because +each one is the library's documented rules applied under witness. + +## The ALSA bridge: two blocking threads, on purpose + +The bridge is ~370 lines and structurally almost insolently simple: open a +capture device and a playback device, start two threads, and let each +thread block on its device. + +```cpp +std::thread capture([&] { + // ... + const snd_pcm_sframes_t n = snd_pcm_readi(in.pcm, dst, period); + // ... + asrc.push(buf.data(), frames); // overruns counted by the converter +}); + +std::thread playback([&] { + // ... + asrc.pull(buf.data(), period); // silence-pads while filling/underrun + // ... + snd_pcm_writei(out.pcm, src, period - done); +}); +``` + +The simplicity is the point. The library's runtime contract is one producer +agent and one consumer agent, each paced by its own clock — and a blocking +ALSA read *is* a clock. `snd_pcm_readi()` returns when the capture device +has delivered a period of frames, which happens at the cadence of that +device's crystal; `snd_pcm_writei()` blocks until the playback device has +made room, at the cadence of the other crystal. The two threads never +communicate except through the converter, which is exactly the interface +the whole library was designed around. No callbacks, no timers, no event +loop: the hardware paces the threads, and the converter absorbs the +difference. 
If you want to see the two-agent contract of +[the ring chapter](../part1/spsc-ring.md) with the abstractions removed, +this file is it. + +A few decisions inside deserve attention. + +**Format negotiation prefers honesty over generality.** The bridge asks +each device for `FLOAT_LE` — the converter's native sample type, no +conversion — and falls back to `S16_LE` with explicit scale-and-clamp +helpers when the hardware refuses. That is the entire format matrix. Cheap +dongles are overwhelmingly S16 devices, and a test harness that negotiated +every ALSA format under the sun would bury its purpose in plumbing. It +also *refuses* a rate it didn't ask for: if the device counters with +anything but the requested rate, the bridge errors out rather than +silently measuring the wrong experiment. + +**Xrun recovery is delegated, then observed.** When a read or write +returns an error, the bridge calls `snd_pcm_recover()` and continues; +only an unrecoverable error stops the run. This is a deliberate division of +labor: ALSA xruns are a *device*-level discontinuity (the OS failed to +service the hardware in time), and the converter has its own machinery — +silence-padding, refill, re-lock with the ppm estimate kept — for the +*converter*-level consequences. The bridge does not try to be clever +across that boundary; it recovers the PCM and lets the converter's +counters record whatever backlash arrives. During a soak, the once-per- +second status line is where you watch both layers at once. + +**The one configuration rule in the file is the ServoConfig rule.** The +bridge runs with `--period` frames per ALSA transfer (default 128), and +block-quantized transfer means the FIFO occupancy legitimately swings +by around half a block without the clocks having moved.
The servo's +`unlockThresholdFrames` defaults to 24 — tuned for fine-grained transfer — +so the bridge applies the documented rule in code: + +```cpp +// Per the ServoConfig guidance: the unlock threshold must sit +// comfortably above half the transfer block, or block-quantized +// occupancy excursions can demote the servo stage spuriously. +cfg.servo.unlockThresholdFrames = + std::max(cfg.servo.unlockThresholdFrames, 1.5 * static_cast<double>(args.period)); +``` + +Miss this and the harness would report spurious servo demotions that have +nothing to do with the clocks — a measurement artifact manufactured by the +measurement tool. (The next chapter returns to this rule as one of the +three scaling axes.) + +**The telemetry switches are the experiment design.** Three flags turn the +bridge from a demo into an instrument: + +- `--csv <file>` appends the once-per-second `status()` snapshot — state, + ppm, smoothed fill, underrun/overrun/resync counters — as a CSV row. + This is the soak's evidence: the ppm trace over hours *is* the thermal- + drift measurement, and the counters' final values *are* the + zero-discontinuity claim. Point a hair dryer at one dongle and the trace + should show the crystal move several ppm in real time, tracked without + anything audible; a fast ±50 ppm step should show a stage demotion and a + re-lock. +- `--dump <file>` has the playback thread also write the post-ASRC float + stream to disk, raw. This exists because of an honest limitation of + cheap hardware: a $7 dongle's analog path measures around −80 dB, and + no quality claim about a 135 dB converter survives passage through it. + The dump sidesteps the analog path entirely — the *clocks* are real even + if the signal never goes analog — and the notebook tooling + (`notebooks/asrc_comparison.ipynb` carries the AES17-style measurement + machinery) analyzes the capture offline. +- `--tone <hz>` completes that thought.
In tone mode the capture thread + *still blocks on* `snd_pcm_readi()` — the input device's crystal still + paces every push — but the captured samples are discarded and a clean + synthetic sine is pushed instead. Real clocks, known signal, no trust + placed in an ADC that hasn't earned it. The combination + `--tone 997 --dump out.raw --csv trace.csv` is Setup 1's full + measurement: a 997 Hz tone through two real crystals into the AES17 + notebook. + +## `pico2_cyccnt`: buying cycles with instructions + +Part II built a performance ratchet on QEMU instruction counts: +deterministic, noise-free, gateable in CI at ±3%. The README's Cortex-M33 +table says a 2-second Q15 stereo workload executes 484,146,844 +instructions — that number will be identical tomorrow, which is what makes +it a regression gate. But it is a count of *instructions*, and silicon +budgets are spent in *cycles*. An instruction can take one cycle or ten; +memory waits, pipeline stalls, and branch penalties exist in silicon and +not in QEMU's functional model. So every deployment claim derived from the +ratchet — "Q15 mono fits a 150 MHz core with room to spare, stereo is +tight" — has been carrying an asterisk: *instruction counts are not cycle +counts; treat these as budgets pending real-silicon validation.* + +`examples/pico2_cyccnt/` is the firmware that removes the asterisk. 
It is +a standalone flashable UF2 (deliberately *not* part of the root build — +it drags in the whole Pico SDK) that runs the exact steady-state workload +of the icount benchmarks — the same `push(32)`/`pull(32)` duplex loop — +on a real RP2350, timing every block with the Cortex-M33's DWT cycle +counter: + +```cpp +bool enableCycleCounter() { + CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk; + if (DWT->CTRL & DWT_CTRL_NOCYCCNT_Msk) + return false; // implementation without a cycle counter + DWT->CYCCNT = 0; + DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk; + return true; +} +``` + +DWT — the Data Watchpoint and Trace unit — is optional silicon on +M-profile cores, so the firmware checks `NOCYCCNT` at runtime rather than +assuming; `TRCENA` gates the whole trace block and must be set first. The +counter is 32 bits free-running, which wraps in ~28.6 s at 150 MHz — fine, +because the firmware only ever takes per-block unsigned deltas, and +unsigned subtraction across a wrap is exact by the same modular-arithmetic +argument the ring buffer's indices rested on. A thousand warmup iterations +run first (past the Filling state, servo settled), then two thousand +measured blocks, reported as mean, p99, and max — the tail statistics +matter, because the workload runs with interrupts live and USB +housekeeping shows up in the max column. + +The output table covers Q15 in both presets at 1, 2, and 12 channels, plus +float at one channel. The float rows are not there in the hope of good +news; they exist to put a *measured* number on "soft-double accumulation +is the wrong datapath on an FP64-less core" — the QEMU baselines already +price float at roughly 3.8× the Q15 instruction count, and a cycle figure +makes the guidance concrete rather than rhetorical. + +The deeper purpose is calibration. The committed M33 baselines divide out +to 5,043 instructions per frame for the stereo Q15 pipeline and 10,027 for +the 12-channel one. 
Divide the firmware's measured cycles-per-frame by +those figures and you get the constant the whole ratchet has been waiting +for: *one QEMU instruction ≈ N RP2350 cycles*. That single ratio converts +every current and future M33 instruction baseline into a real cycle +budget — the ratchet keeps its CI-grade determinism, and hardware +contributes exactly one number, measured once per silicon revision instead +of once per commit. + +One scoping note recorded in the README of the harness: the cycled input +buffer is 4,800 frames rather than the icount workload's 12,000, so that +the 12-channel case fits the RP2350's 520 KB SRAM alongside the converter. +Per-block work is unchanged; the deviation is documented because an +unexplained difference between two "identical" workloads is how +calibration constants go quietly wrong. + +## `pico2_dualcore`: one clock domain per core + +The README's platform guidance ends with a suggestion: on Pico-class +parts, stereo `balanced()` wants either the `fast()` preset *or the +RP2350's second core*. `examples/pico2_dualcore/` is that suggestion built +and made falsifiable — the converter's two ends on the two Cortex-M33 +cores, one core per clock domain, judging its own run against PASS/FAIL +gates. + +- **core0 is the producer.** It pushes 32-frame blocks paced by the + microsecond timer at `rate × (1 + 200e-6)` — a +200 ppm offset + synthesized from the shared timebase. This is the simulation trick + imported onto silicon, and it is what real crystals can never give: an + offset that is *exactly* +200 ppm, so the converged estimate can be + asserted within ±5 ppm rather than merely admired. core0 also owns all + USB telemetry. 
+- **core1 is the consumer.** It pulls 32-frame blocks at exactly the + nominal rate and times every `pull()` with DWT.CYCCNT — enabled *on + core1*, because each RP2350 core has a private DWT behind the same + fixed address (the 0xE000_0000 private peripheral region is core-local; + enabling the counter from core0 would start the wrong one). core1 never + prints: contending on the stdio mutex from the paced core would put USB + stalls onto the output clock domain. + +Is running the two ends on two *cores* even within the library's +contract? The firmware answers this in its opening comment, and the +reasoning belongs in this book: the contract is one producer *agent* and +one consumer *agent* around a lock-free SPSC ring with acquire/release +atomics. It names agents and memory ordering, not `std::thread`. The +RP2350's cores share coherent SRAM with no data caches in front of it, so +C++ atomics behave across cores exactly as they do across threads — two +cores satisfy the contract precisely as two threads do. `push()` stays +core0-only, `pull()` stays core1-only, `status()` is documented +any-thread. The chapter on the ring said the memory-ordering argument was +the proof and the tests merely raised the price of being wrong; here the +same argument, unchanged, carries the design onto a second processor. + +Everything else that crosses cores is an explicit block of **32-bit** +atomics, and the width is a load-bearing decision inherited from the +library itself: on the M33, 64-bit `std::atomic` is not lock-free — it +routes through a library lock, which is exactly the failure the library's +own telemetry avoided by keeping its counters 32-bit. The firmware +`static_assert`s the lock-freedom of every cross-core type. 
The phase +handoff is a single release store of the converter pointer (publishing +every plain write the constructor performed) matched by an acquire load on +core1; the teardown is the mirrored pair through a `consumerDone` flag, so +destroying the converter cannot race core1's last `pull()`. + +The consumer's statistics need more than individual atomicity, though: a +printed telemetry line should describe one *instant*, not a mean from this +second next to a max from the last. With 64-bit atomics off the table, the +firmware uses a seqlock — the sequence counter goes odd while the writer +updates, even when it finishes, and the reader retries until the same even +value brackets its whole read: + +```cpp +void publishSnapshot(const Snapshot& s) { + const std::uint32_t q = g.seq.load(std::memory_order_relaxed); + g.seq.store(q + 1, std::memory_order_relaxed); + std::atomic_thread_fence(std::memory_order_release); + // ... payload stores, all relaxed 32-bit atomics ... + g.seq.store(q + 2, std::memory_order_release); +} +``` + +The payload fields are themselves relaxed atomics — no torn reads, no +undefined behavior — so the seqlock adds only mutual coherence, and a +retry costs nothing at a 1 Hz read rate. It is the cheapest possible +answer to "publish five numbers atomically on a core with 32-bit +atomics," and a pattern worth stealing. + +### The honest scoping decision + +The firmware runs two ~30-second phases. Phase A is Q15 stereo +`balanced()` at 48 kHz — the configuration the README calls tight on one +core, now with the input domain moved off the consumer's core entirely. +Phase B is the 12-channel reference-microphone/AVB shape... at **16 kHz**, +not 48. 
Its README records why, and the passage is a model of how to scope +a demo honestly: + +> Phase B is 16 kHz **by arithmetic, not caution**: the M33 QEMU baseline +> puts `pipeline12_q15` at 10,027 insns/frame against a 150 MHz / 48 kHz +> budget of 3,125 cycles/frame — more than 3× over, and `pull()` of a +> single instance is one consumer by contract, so no core assignment can +> split it across cores. Dual-core buys one clock domain per core, not +> more datapath than one core has. + +That last sentence is the chapter's most important deployment fact. The +SPSC contract that makes the converter lock-free is also a ceiling: one +consumer agent means the entire per-pull datapath — all twelve channels of +it — executes on whichever core calls `pull()`. A second core removes the +*other* clock domain's work and everything else the application does, and +that is all it removes. At 16 kHz the per-frame budget triples to 9,375 +cycles and the 12-channel shape fits; and since measured cycles per block +are rate-independent, phase B still delivers the real-silicon counterpart +of the 12-channel instruction baseline. Nothing was hidden by the rate +change — 16 kHz is that configuration's actual deployment rate (the +next chapter's rate-scaling rules are applied in the phase B config, +`FilterSpec` band edges and servo bandwidths scaled by 16/48) — but the +README refuses to let you believe dual-core bought compute it didn't. + +Two more of the library's documented rules appear in this firmware as +lived decisions rather than advice. The FIFO setpoint is 144 frames, not +the default 48: the producer core shares its time with USB logging, whose +worst-case writer stall is capped at 2 ms in the build — 96 frames of +consumer progress at 48 kHz — so the setpoint must exceed that excursion +with margin. That is the README latency rule applied to a producer that +also logs. 
And the pacing schedules compute absolute due times +(`t0 + (b·num)/den` in integer microseconds), so a stall is followed by +catch-up pushes rather than permanent schedule slip — the difference +between jitter the FIFO absorbs and a rate error the servo would chase. + +A PASS requires: Locked within 2 s of cold start (6 s for phase B, whose +scaled servo is proportionally slower), every 1 Hz ppm sample after the +settling gate within ±5 of the synthesized +200, and zero underruns, +overruns, *and* resyncs after first lock — overruns and resyncs gate too, +because they are the signature of a consumer that cannot keep up. The +firmware prints per-phase verdicts, an `OVERALL` line, and a sentinel +string, so a future self-hosted CI lane can parse a soak the same way the +QEMU lanes parse emulated runs. + +The dual-core README also states its own limit, and it belongs here +verbatim in spirit: both domains are paced from the RP2350's one timer — +that is what makes +200.0 an exact, assertable truth — so this firmware +*cannot* prove the inter-crystal lock that Setups 1 and 2 ultimately want. +It proves the deployment shape: two cores, two clock domains, lock-free +handoff, real cycle headroom. + +## What is measured, and what is not yet + +The project's culture is that numbers are measured or absent, so here is +the ledger as it stands: + +- **Shipped and measured on real clocks: nothing yet.** All quality and + performance figures in this book so far come from deterministic + simulation, host benchmarks, and QEMU instruction counting. +- **Shipped and awaiting hardware:** all three harnesses build — the ALSA + bridge wherever ALSA exists, both Pico 2 firmwares as flashable UF2s — + but `docs/HARDWARE_TESTING.md` says it plainly: *the measured numbers + await a physical Pico 2*, and the multi-hour dongle soak awaits an + afternoon with a Pi. 
The cycles-per-instruction calibration constant, + the real `%core@48k` figures, the hour-scale zero-discontinuity claim, + and the thermal-drift trace are all, today, well-instrumented empty + columns. +- **Not yet written:** the small script that plots a `--csv` ppm trace and + runs the notebook analysis over a `--dump` capture, and both Setup 3 + programs (UDP sender, receiver-with-ASRC — the plan is to reuse the + bridge's output half). + +A book that inherited this project's habits could not end the chapter any +other way. The harnesses are the falsifiable form of the library's +deployment claims; until a board runs them, the claims stay labeled as +budgets. + +## Verify it yourself + +```sh +# No hardware: two OS threads 500 ppm apart, lock and estimate on live +# (jittery) scheduling — the software rehearsal of the bridge: +cmake -B build -DSRT_BUILD_EXAMPLES=ON && cmake --build build -j +./build/examples/drifting_clocks + +# Setup 1 (Linux + two audio devices; srt_alsa_bridge builds when ALSA +# is found). Real clocks, synthetic tone, telemetry + capture: +./build/examples/srt_alsa_bridge --in hw:1,0 --out hw:2,0 \ + --tone 997 --csv trace.csv --dump post_asrc.f32 --seconds 3600 +# Then: ppm column of trace.csv is the thermal-drift instrument; analyze +# post_asrc.f32 with the AES17 machinery in notebooks/asrc_comparison.ipynb. + +# Setup 2 firmware (standalone builds; arm-none-eabi-gcc + network for +# the Pico SDK fetch): +cd examples/pico2_cyccnt && cmake -B build -DPICO_BOARD=pico2 && cmake --build build -j +cd examples/pico2_dualcore && cmake -B build -DPICO_BOARD=pico2 && cmake --build build -j +# Flash the UF2s, open the USB serial port, and wait for the sentinel +# lines: SRT_PICO2_DONE / SRT_PICO2_DUALCORE_DONE with per-phase PASS/FAIL. +``` + +If you have the hardware this project's authors did not have on their +bench, you are holding the most valuable contribution available: run the +soak, and turn the empty columns into numbers. 
diff --git a/book/src/part5/scaling.md b/book/src/part5/scaling.md new file mode 100644 index 0000000..1651156 --- /dev/null +++ b/book/src/part5/scaling.md @@ -0,0 +1,315 @@ +# Channels, rates, and the rules that scale + +> For every type of animal there is a most convenient size, and a large change in size inevitably carries with it a change of form. +> +> — J. B. S. Haldane, *On Being the Right Size* + +Every measured number in this book so far was taken at one operating point: +48 kHz, one or two channels, fine-grained transfer. Real deployments move +along three axes away from that point — more **channels**, a different +**sample rate**, coarser **blocks** — and each axis has a rule, a failure +mode when the rule is ignored, and a measurement that pins both. This is +the chapter a deploying engineer should read twice: once before choosing a +configuration, and once after the first surprising telemetry line. + +The three rules, stated up front: + +1. **Channels**: one converter instance per *clock domain*, never per + channel group; channel count is then a nearly-free multiplier on the + dot product. +2. **Rates**: every configuration field denominated in absolute hertz must + scale with the sample rate — start from `Config::forSampleRate()`. +3. **Blocks**: the FIFO setpoint must exceed the pull block size (the + converter now enforces this) and the servo's unlock threshold must + clear the block-quantization sawtooth; coarse blocks also move you into + a measurably different quality regime. + +## Channels: coherence is free, so don't pay for it + +`Config::channels` is a runtime count with no architectural limit — mono +through 7.1.4 and beyond. The design rule is about instance boundaries: +**one instance per clock domain**. 
If a 12-channel AVB stream and a stereo +monitor feed arrive on the *same* recovered clock, they are one domain and +may share one instance, or take one instance per stream, as convenient; but never split one +stream's channels across instances, and never funnel two clock domains +into one. + +The reason to keep a stream's channels together is a property the +implementation gives you by construction. Within one instance, every +channel of a frame is resampled at *literally the same fractional +position*: the phase accumulator, the servo, and the coefficient blend are +all per-instance state, and the per-channel work is only the dot product. +There is no per-channel phase to drift, so inter-channel phase coherence +is exact — not "matched to within a specification," but bit-identical in +the only quantity that could differ. Two audiences care intensely: + +- **Surround imaging.** Phantom sources between speakers are constructed + from inter-channel amplitude and time relationships; an ASRC that + resampled channels at even slightly different phases would smear them. + Here there is no skew to budget for. +- **Microphone arrays.** Beamforming and cross-correlation live entirely + on inter-channel time differences at sub-sample precision. The README + calls out the AVB case directly: a stream bundling reference microphones + with the program feed keeps its array geometry intact through the + converter. (AVB Class A's 8-frame packets are also fine-grained enough + for the Quiet servo stage — the block axis, below, cooperates.) + +Split those channels across instances and you forfeit the guarantee: each +instance runs its own servo on its own FIFO, and two servos tracking the +same physical clock still produce two independently-wobbling phase +trajectories. The coherence rule costs nothing and buys exactness; its +violation costs exactness and buys nothing. + +### What N channels cost + +Sharing one fractional position per frame also shapes the cost. 
Each +output frame computes the coefficient blend — the interpolation between +adjacent polyphase rows — *once*, then reuses it for every channel's dot +product. N channels cost `blend + N × dot`, not `N × (blend + dot)`; the +fixed overhead amortizes, so the marginal channel is cheaper than the +first. + +The instruction-count table in the README measures this shape. Comparing +the 12-channel Q15 pipeline against stereo across the three gated targets: + +| Target | `pipeline_q15` (2 ch) | `pipeline12_q15` (12 ch) | ratio | +|---|---:|---:|---:| +| Cortex-M33 | 484,146,844 | 962,613,655 | 2.0× | +| Cortex-M55 | 127,446,817 | 387,876,968 | 3.0× | +| Hexagon | 119,847,854 | 378,858,793 | 3.2× | + +Six times the channels for 2.0–3.2× the instructions. The spread itself is +informative: the M33's 2.0× says its per-frame cost is dominated by +shared work (the servo's soft-double arithmetic on an FP64-less core), so +extra channels are nearly half price; the M55 and Hexagon, whose shared +work is cheap, sit closer to the pure dot-product slope. On the host, the +same shape: Q15 stereo at 56.0 ns/frame versus 12-channel at 189.1 — +3.4× for 6× the channels, with a 12-channel stream still running at 110× +realtime on one Xeon core. + +### The proof that channels don't leak + +Coherence and cost say nothing about *correctness* — an interleave bug or +a channel permutation would sail through every single-channel quality +metric in the suite. `tests/test_multichannel.cpp` exists for exactly +that blind spot: every channel of one instance gets its own tone (600 + +731·c Hz — distinct, non-harmonically related, all inside the flat +passband up to 16 channels), and after a +200 ppm crossing each channel +must contain its own tone at full quality and nothing measurable of any +neighbor's. 
+ +"Nothing measurable" is made rigorous the way this project usually is: the +channel's own tone is removed by tracked least-squares fit before the +other channels' frequencies are fitted on the residual, so the own tone's +spectral leakage (about −67 dB over a 1 s rectangular window at these +spacings) cannot masquerade as crosstalk. The gated results: worst +crosstalk below **−100 dB** for 12-channel float, below **−72 dB** for +16-channel Q15 — the latter sitting at the 16-bit format's own floor, +which is the honest bound for that datapath. Amplitude and per-channel +SNR are asserted in the same run, so a permutation, a gain error, and +crosstalk are all caught by one test. + +One coverage note is worth repeating because of how it was found. The +host's channel-parallel float kernel tiles channels in blocks of 8/4/2/1, +and an audit noticed that no test ever ran the K=2 and K=1 remainder +tiles — every configured count happened to decompose without them. The +suite now runs 5- and 7-channel variants (5 = 4+1, 7 = 4+2+1) precisely +to execute those tiles. The general lesson from Part II recurs: coverage +you haven't verified reaches the code is coverage you don't have. + +## Rates: hertz-denominated defaults are a 48 kHz assumption + +The library's defaults read as innocently portable — until you notice +which fields carry units. `FilterSpec::balanced()` places the passband +edge at 20,000 Hz and the first image to suppress at 28,000 Hz; +`ServoConfig` sets loop bandwidths of 10/1/0.05 Hz and smoother corners of +50/5/0.5 Hz. Every one of those is an *absolute frequency chosen for +48 kHz operation*, and the two misconfigurations they invite fail in +instructively different ways. + +The filter misconfiguration fails loudly, by design. 
Default band edges at +a 16 kHz rate would put the anti-image cutoff far above the input Nyquist +— a filter that passes images wholesale — and the constructor's validation +rejects the geometry outright (`passbandHz + stopbandHz` must not exceed +the sample rate), so you cannot ship it by accident. The servo +misconfiguration is the dangerous one, because nothing forces you to +notice: scale the filter (you must, to construct at all), keep the default +servo, and the converter builds, locks, tracks, and converts — while +silently costing about **32 dB** of quality at 16 kHz. That number is +measured, and the mechanism is worth understanding because it is the whole +rate-scaling story in one incident. + +At 16 kHz with a 200 ppm offset, the whole-sample slips arrive at +`ppm × fs` = 3.2 Hz instead of 9.6 Hz. The servo's three-pole Quiet +smoother has an absolute 0.5 Hz corner, so a beat at one-third the +frequency is rejected `(16/48)³` ≈ 28.6 dB *less* — the slip sawtooth +walks out from under the smoother, leaks into the rate estimate, and +frequency-modulates the audio. The measurement wears the FM signature +openly: roughly 32 dB below the 48 kHz figures at every tone, falling +6 dB per octave of signal frequency, exactly as small-index FM sidebands +scale. Nothing was wrong with the filter; the *control loop* was mistuned +by a factor of three because its tuning was written in hertz. + +The remedy is the `scaledTo` trio, and the factory that applies it: + +```cpp +srt::Config cfg = srt::Config::forSampleRate(16000.0); +cfg.channels = ...; // then adjust as usual +``` + +`FilterSpec::scaledTo` multiplies the band edges by `fs/48000` — same L +and T, so the same table size and per-frame cost, with the identical +response at every *normalized* frequency. 
`ServoConfig::scaledTo` does the +same to the six bandwidth/corner fields, keeping the loop identical in +per-sample terms — and scales the two hold times *inversely*, so the +promotion gates wait the same number of loop time constants rather than +the same number of wall-clock seconds. (That last refinement postdates the +first hand-scaled fix; re-measured, it changed nothing within noise, and +the test asserting it exists so the equivalence stays checked rather than +remembered.) Frame-denominated fields — lock and unlock thresholds, +`targetLatencyFrames`, ppm limits — are rate-invariant and stay put, +though their *duration* in milliseconds scales inversely with the rate. + +`tests/test_asrc_quality_16k.cpp` runs the full quality methodology +through the factory, and the outcome is the point of the design: 16 kHz +matches the 48 kHz *normalized-frequency structure*. The tones sit at the +same f/fs as the 48 kHz suite's 997 Hz / 6 k / 12 k / 19.5 k, and measure +136.6 / 121.9 / 114.3 / 106.5 dB against the 48 kHz suite's 135.0 / +120.0 / 112.8 / 105.8 on the same host — within about 2 dB down the line, +confirming that interpolation noise depends only on f/fs. Two consequences +deploy with you: group delay at the same tap count stays ~24 *input +samples*, which is three times as many milliseconds at 16 kHz (1.5 ms vs +0.5 ms); and the scaled Quiet loop at ~0.017 Hz settles proportionally +slower — the 16 kHz test runs 120 s where the 48 kHz one ran 40 s, the +same number of samples and of time constants. + +## Blocks: feasibility, then observability + +The block-size axis has two boundaries, one hard and one +information-theoretic. + +### The hard one: a pull can only synthesize from what is buffered + +`pull(frames)` produces output from frames already in the FIFO. 
If the +occupancy setpoint sits at or below the pull block size, the loop is +infeasible: each pull drains the buffer past the setpoint, the servo +steers to refill it, the next pull drains it again — a permanent underrun +limit cycle, dropouts every few hundred milliseconds, never locking. Early +versions documented the rule ("the setpoint must exceed the pull block +size") and trusted the integrator between chair and keyboard; the current +converter enforces it. When `pull()` observes a block larger than the +setpoint in force, it raises the *effective* setpoint to the block plus a +margin — half a block, at least one pop chunk — sized so the entry +occupancy never grazes the pull size even at the bottom of the block-beat +sawtooth, and bounded by FIFO capacity: + +```cpp +const std::size_t needed = frames + std::max(frames / 2, kPopChunkFrames); +const std::size_t newTarget = + std::clamp(needed, cfg_.targetLatencyFrames, maxTargetFrames_); +``` + +Configurations that already satisfy the rule are left exactly as +configured; the servo slews to a raised setpoint glitch-free (integrator +kept — the clocks haven't changed, only the target). The cost is not +hidden: latency follows the raised setpoint, `designedLatencySeconds()` +reports it, and `Status::effectiveTargetLatencyFrames` differs from the +configured value exactly when the adaptation has occurred — a field worth +plotting in deployment telemetry, because it is the converter telling you +your latency budget and your callback size disagree. Capacity bounds the +raise: the default ring (a 1024-frame floor) accommodates pull blocks up +to ~340 frames; larger callbacks need `fifoFrames` sized explicitly. + +### The soft one: what a coarse count can tell a servo + +The servo's only sensor is FIFO occupancy, and occupancy is quantized — +to whole frames at best, to whole *blocks* with block transfer. 
At +deviation ε the observable carries a deterministic sawtooth, one push +block peak-to-peak, at the beat frequency `ε × fs / block`. Whatever the +loop passes into its estimate frequency-modulates the audio. With +sample-granular transfer the sawtooth is one frame and the Quiet stage's +three-pole cascade rejects it to roughly −120 dBc equivalent at 20 kHz. +With ≥32-frame callbacks, that level of quiet is +**information-theoretically unavailable from counts alone** — no filter +recovers sub-sawtooth phase from an observable whose quantization *is* +the sawtooth, not while still tracking real drift. + +The design response is to stop pretending. Promotion from Track to Quiet +is gated on the cascade-smoothed error staying small, which is naturally +false while a large block beat dominates the observable — the gate is +itself the discriminator between the two regimes, so coarse-block +operation deliberately stays in Track. There the block beat is mostly +phase-tracked as benign *latency breathing* (the FIFO term of the latency +wanders by a fraction of the block as the servo follows the beat), and +the remainder appears as low-rate FM measured in cents: +`notebooks/asrc_block_size_study.ipynb` puts it at ~0.9 cents rms over a +61 dB wideband floor at 32-frame blocks, ~1.3 cents rms over 53 dB at +5 ms (240-frame) blocks. Those are honest numbers for a different regime, +not a degradation of the headline ones — the 135 dB figures are for +fine-grained transfer, and the comparison document says so plainly. If +your deployment pushes hardware-DMA-sized blocks and needs studio +transparency, the current converter cannot provide it — the limit is one of +information, not an accident of implementation — and the limitations section of the README sketches the eventual +answer (per-block timestamps for sub-sample phase observation). + +One more block-denominated rule closes the loop with the previous +chapter. 
The servo's `unlockThresholdFrames` (default 24) is the +excursion that demotes a stage; block-quantized occupancy legitimately +swings by about half a block without the clocks having moved. The +guidance in `pi_servo.hpp` — keep the threshold comfortably above half +the block — is applied literally in the ALSA bridge (`1.5 ×` the period), +and ignoring it produces the most confusing failure on this axis: a +converter that locks, runs cleanly, and "spuriously" demotes itself on +schedule, at the beat frequency, forever. + +## The configuration walk, in order + +The axes compose, so a deployment configures them in dependency order: + +1. Start from `Config::forSampleRate(rate)` — never raw defaults at a + non-48 kHz rate. +2. Set `channels` to the full width of each clock domain's stream; one + instance per domain. +3. Set `targetLatencyFrames` above your pull block *and* your worst + push/pull jitter excursion (the dual-core firmware's 144-frame + setpoint against a 2 ms logging stall is the worked example); set + `fifoFrames` explicitly past ~340-frame callbacks. +4. Raise `unlockThresholdFrames` above ~1.5× your transfer block. +5. Then watch `Status::effectiveTargetLatencyFrames` and the resync + counters in production — they are the converter's own opinion of + whether steps 3 and 4 were done right. 
+ +## Verify it yourself + +```sh +# Channel independence: 12ch float (< -100 dB crosstalk), 16ch Q15 +# (< -72 dB), plus the 5/7-channel remainder-tile variants: +ctest --test-dir build -R MultiChannel --output-on-failure + +# The rate-scaling rule and the 16 kHz measurements (slow: each case is +# a 120 s simulated run; the first test checks the factory arithmetic +# deterministically): +ctest --test-dir build -R AsrcQuality16k --output-on-failure + +# The -32 dB failure itself, reproduced: in test_asrc_quality_16k.cpp, +# keep Config::forSampleRate(kFs) but overwrite the servo with unscaled +# defaults (cfg.servo = srt::ServoConfig{};) — the converter still builds +# and locks, and every threshold fails by ~30 dB, falling 6 dB per octave +# of tone frequency: the FM signature. (Restoring the unscaled *filter* +# instead fails fast: the constructor rejects band edges above the input +# Nyquist.) + +# The block axis, measured: latency breathing and the cents-scale FM +# decomposition at 32/64/240-frame blocks: +jupyter nbconvert --execute notebooks/asrc_block_size_study.ipynb + +# The feasibility rule live: run the drifting-clocks example, then rerun +# with cfg.targetLatencyFrames set below kChunk in the source — the +# adaptive raise reports itself in effectiveTargetLatencyFrames instead +# of dropping out: +./build/examples/drifting_clocks +``` + +The break-it-on-purpose suggestions are, as ever, the chapter in +miniature: each rule here was learned from a measured failure, and each +failure is still one edit away from being watched happening. 
diff --git a/cmake/arm-cortex-m55-mps3.cmake b/cmake/arm-cortex-m55-mps3.cmake index aa5a904..23a8991 100644 --- a/cmake/arm-cortex-m55-mps3.cmake +++ b/cmake/arm-cortex-m55-mps3.cmake @@ -23,6 +23,7 @@ set(CMAKE_C_FLAGS_INIT "-mcpu=cortex-m55 -mthumb -mfloat-abi=hard -ffunction-sec set(CMAKE_CXX_FLAGS_INIT "${CMAKE_C_FLAGS_INIT}") get_filename_component(_srt_platform "${CMAKE_CURRENT_LIST_DIR}/../platform/mps3_an547" ABSOLUTE) +# ANCHOR: pt_linkline # The startup .c is handed to the link line directly; the gcc driver # compiles it with the same -mcpu/-mfloat-abi flags as everything else. # `-x c` forces C compilation even under the g++ driver (which would treat @@ -30,6 +31,7 @@ get_filename_component(_srt_platform "${CMAKE_CURRENT_LIST_DIR}/../platform/mps3 # initializers are link-time constants, never dynamic initialization. set(CMAKE_EXE_LINKER_FLAGS_INIT "--specs=rdimon.specs -nostartfiles -Wl,--gc-sections -T${_srt_platform}/mps3_an547.ld -x c ${CMAKE_CURRENT_LIST_DIR}/../platform/armv8m_startup.c -x none") +# ANCHOR_END: pt_linkline set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-system-arm;-M;mps3-an547;-nographic;-semihosting;-kernel") diff --git a/include/srt/asrc.hpp b/include/srt/asrc.hpp index a26291d..959e9ad 100644 --- a/include/srt/asrc.hpp +++ b/include/srt/asrc.hpp @@ -19,6 +19,7 @@ namespace srt { +// ANCHOR: p0_config /// Converter configuration. The defaults give ~1.5 ms designed latency at /// 48 kHz (FIFO setpoint 48 frames + ~24 frames filter group delay; see /// the README latency section), transparent for clocks within +/-1000 ppm. @@ -29,6 +30,7 @@ struct Config { std::size_t fifoFrames = 0; ///< ring capacity; 0 => automatic FilterSpec filter{}; ServoConfig servo{}; + // ANCHOR_END: p0_config /// Defaults adapted to a nominal rate other than 48 kHz. 
The filter /// band edges and servo bandwidths are absolute Hz designed for 48 kHz; @@ -146,6 +148,7 @@ class BasicAsyncSampleRateConverter { return ring_.read(dst, maxFrames * cfg_.channels) / cfg_.channels; }; + // ANCHOR: asrc_feasibility // Feasibility: a pull must synthesize from frames already buffered, // so the occupancy setpoint must exceed the pull block size or the // loop drains into a permanent underrun limit cycle (dropouts every @@ -173,8 +176,10 @@ class BasicAsyncSampleRateConverter { } } + // ANCHOR_END: asrc_feasibility double occ = backlogFrames(); + // ANCHOR: asrc_filling if (filling_) { if (occ < static_cast(fillThresholdFrames_)) { fillSilence(interleaved, frames * ch); @@ -190,6 +195,8 @@ class BasicAsyncSampleRateConverter { fadeFramesLeft_ = kFadeFrames; } + // ANCHOR_END: asrc_filling + // ANCHOR: asrc_resync if (occ > static_cast(highWaterFrames_)) { // hard resync const double target = static_cast(targetFrames_); // The discard can only come from the ring; frames staged in the @@ -206,9 +213,11 @@ class BasicAsyncSampleRateConverter { servo_.seed(occ + resampler_.mu()); } + // ANCHOR_END: asrc_resync const double dt = static_cast(frames) / cfg_.sampleRateHz; const double epsHat = servo_.update(occ, resampler_.mu(), dt); + // ANCHOR: asrc_underrun const std::size_t made = resampler_.process(interleaved, frames, epsHat, popFn); if (fadeFramesLeft_ != 0 && made != 0) applyFadeIn(interleaved, made); @@ -220,6 +229,7 @@ class BasicAsyncSampleRateConverter { } publishStatus(); return made; + // ANCHOR_END: asrc_underrun } /// Any thread: telemetry snapshot (relaxed atomics; fields are individually diff --git a/include/srt/detail/kaiser.hpp b/include/srt/detail/kaiser.hpp index e9ac8a3..011bd67 100644 --- a/include/srt/detail/kaiser.hpp +++ b/include/srt/detail/kaiser.hpp @@ -1,3 +1,4 @@ +// ANCHOR: kai_design_note /// \file kaiser.hpp /// \brief Kaiser-window FIR prototype design for the polyphase interpolation bank. 
/// @@ -8,6 +9,7 @@ /// minutes of compile time in every including translation unit. Runtime design /// takes well under 10 ms, runs once in a constructor, and is off the audio path, /// so all design math here is plain runtime double precision. +// ANCHOR_END: kai_design_note #ifndef SRT_DETAIL_KAISER_HPP #define SRT_DETAIL_KAISER_HPP @@ -18,6 +20,7 @@ namespace srt::detail { +// ANCHOR: kai_besseli0 /// Modified Bessel function of the first kind, order zero, by power series. /// Converges for all practical Kaiser betas (|x| < ~40); terms are added until /// they no longer contribute at double precision. @@ -34,7 +37,9 @@ inline double besselI0(double x) noexcept { } return sum; } +// ANCHOR_END: kai_besseli0 +// ANCHOR: kai_beta /// Kaiser window shape parameter for a given stopband attenuation in dB /// (Kaiser's published empirical fit). inline double kaiserBeta(double attenDb) noexcept { @@ -44,7 +49,9 @@ inline double kaiserBeta(double attenDb) noexcept { return 0.5842 * std::pow(attenDb - 21.0, 0.4) + 0.07886 * (attenDb - 21.0); return 0.0; } +// ANCHOR_END: kai_beta +// ANCHOR: kai_estimate /// Kaiser/harris FIR length estimate, expressed per polyphase branch. /// /// \param attenDb target stopband attenuation in dB @@ -59,7 +66,9 @@ inline std::size_t estimateTaps(double attenDb, double transWidthNorm) noexcept const double n = (attenDb - 8.0) / (2.285 * 2.0 * std::numbers::pi * transWidthNorm); return n > 4.0 ? static_cast(std::ceil(n)) : 4; } +// ANCHOR_END: kai_estimate +// ANCHOR: kai_sinc /// sin(pi x)/(pi x) with the removable singularity handled. inline double sinc(double x) noexcept { if (std::abs(x) < 1e-12) @@ -67,7 +76,9 @@ inline double sinc(double x) noexcept { const double px = std::numbers::pi * x; return std::sin(px) / px; } +// ANCHOR_END: kai_sinc +// ANCHOR: kai_prototype /// Designs the Kaiser-windowed sinc prototype lowpass for an L-phase /// interpolation bank. 
/// @@ -98,6 +109,7 @@ inline void designPrototype(std::span h, std::size_t numPhases, double c for (auto& v : h) v *= gain; } +// ANCHOR_END: kai_prototype } // namespace srt::detail diff --git a/include/srt/pi_servo.hpp b/include/srt/pi_servo.hpp index a9ce41d..762c1e4 100644 --- a/include/srt/pi_servo.hpp +++ b/include/srt/pi_servo.hpp @@ -47,6 +47,7 @@ namespace srt { +// ANCHOR: sv_config /// Servo tuning. Defaults suit a 48 kHz near-unity converter. /// unlockThresholdFrames should stay comfortably above half the push/pull /// block size, since block-quantized occupancy legitimately excursions by @@ -64,7 +65,9 @@ struct ServoConfig { double quietHoldSeconds = 2.0; ///< cascade-|e| hold => track -> quiet double unlockThresholdFrames = 24.0; ///< |e| above this => demote a stage double maxDeviationPpm = 1000.0; ///< epsHat clamp = +/- 1.5x this + // ANCHOR_END: sv_config + // ANCHOR: sv_scaled_to /// This config rescaled from the 48 kHz design rate to sampleRateHz: /// the loop bandwidths and error-smoother corners are absolute Hz and /// must track the rate, or the slip-sawtooth beat (ppm * fs) walks out @@ -87,6 +90,7 @@ struct ServoConfig { s.quietHoldSeconds /= r; return s; } + // ANCHOR_END: sv_scaled_to }; /// PI loop filter + three-stage lock-state machine. Pure double-precision @@ -103,6 +107,7 @@ class PiServo { reset(false); } + // ANCHOR: sv_reset /// Re-arm the loop. keepIntegrator preserves the accumulated ppm estimate /// (the right choice after a dropout: the clocks have not changed). void reset(bool keepIntegrator) noexcept { @@ -124,7 +129,9 @@ class PiServo { /// to the new setpoint at its clamped rate with no transient discontinuity /// — used by the converter's adaptive pull-block setpoint raise. void setTarget(double targetFrames) noexcept { target_ = targetFrames; } + // ANCHOR_END: sv_reset + // ANCHOR: sv_update_smooth /// One control update; call once per pull() before synthesis. 
/// \param occFrames raw backlog in frames (FIFO + staged frames) /// \param mu current fractional read position; occ + mu changes @@ -143,7 +150,9 @@ class PiServo { q3_ += aq * (q2_ - q3_); const double eFast = lpFast_ - target_; const double eQuiet = q3_ - target_; + // ANCHOR_END: sv_update_smooth + // ANCHOR: sv_update_stages const double limit = 1.5 * cfg_.maxDeviationPpm * 1e-6; switch (stage_) { case Stage::Acquire: @@ -168,7 +177,9 @@ class PiServo { } break; } + // ANCHOR_END: sv_update_stages + // ANCHOR: sv_update_out double kp = 0.0; double ki = 0.0; double e = 0.0; @@ -187,6 +198,7 @@ class PiServo { epsHat_ = std::clamp(kp * e + integ_, -limit, limit); return epsHat_; } + // ANCHOR_END: sv_update_out Stage stage() const noexcept { return stage_; } bool locked() const noexcept { return stage_ != Stage::Acquire; } @@ -199,6 +211,7 @@ class PiServo { return 1.0 - std::exp(-2.0 * std::numbers::pi * cornerHz * dt); } + // ANCHOR: sv_hold /// Hold-window logic shared by both promotions: |e| must stay below the /// threshold for holdSeconds; meanwhile epsHat is averaged (time constant /// holdSeconds/5) so the promotion can hand a clean estimate to the @@ -218,12 +231,15 @@ class PiServo { holdTimer_ = 0.0; return true; } + // ANCHOR_END: sv_hold + // ANCHOR: sv_gains void computeGains(double bandwidthHz, double& kp, double& ki) const noexcept { const double wn = 2.0 * std::numbers::pi * bandwidthHz; kp = 2.0 * cfg_.damping * wn / fs_; ki = wn * wn / fs_; } + // ANCHOR_END: sv_gains ServoConfig cfg_; double fs_; diff --git a/include/srt/polyphase_filter.hpp b/include/srt/polyphase_filter.hpp index 345604a..9d3205e 100644 --- a/include/srt/polyphase_filter.hpp +++ b/include/srt/polyphase_filter.hpp @@ -23,6 +23,7 @@ #define SRT_RESTRICT __restrict__ #endif +// ANCHOR: opt_smlald_gate // Dual 16x16 MAC (SMLALD) for the Q15 dot product on Arm cores that have // the DSP extension but no Helium — the Cortex-M33/M4/M7 class (e.g. // Raspberry Pi Pico 2). 
Gated off when MVE is present: on M55 the compiler @@ -36,6 +37,7 @@ #else #define SRT_Q15_SMLALD 0 #endif +// ANCHOR_END: opt_smlald_gate // Channel-parallel dot product for high channel counts (hypothesis C6, // docs/PERFORMANCE.md): history stored frame-major so the per-tap inner @@ -63,6 +65,7 @@ namespace srt { +// ANCHOR: bank_spec /// Specification of the interpolation prototype filter. /// /// numPhases (L) sets the polyphase table resolution: the residual images from @@ -94,6 +97,7 @@ struct FilterSpec { .stopbandHz = 26000.0, .stopbandAttenDb = 140.0}; } + // ANCHOR_END: bank_spec /// This spec with the band edges rescaled from the 48 kHz design rate /// to sampleRateHz. The presets' passband/stopband are absolute Hz @@ -111,6 +115,7 @@ struct FilterSpec { } }; +// ANCHOR: bank_layout /// Immutable polyphase coefficient table designed at construction. /// /// Storage layout: (L+1) rows of T coefficients. Row p in [0, L) is polyphase @@ -119,11 +124,13 @@ struct FilterSpec { /// and the mu wrap 1.0 -> 0.0 (window shifted by one sample) is exactly /// continuous. Rows are stored tap-reversed so the dot product runs forward /// over an oldest-first history window. +// ANCHOR_END: bank_layout template class PolyphaseFilterBank { public: using Coeff = typename SampleTraits::Coeff; + // ANCHOR: bank_build /// Designs the prototype (double precision) and builds the table. /// Allocates; may throw std::invalid_argument / std::bad_alloc. Do this at /// setup time, not on the audio path. @@ -150,7 +157,9 @@ class PolyphaseFilterBank { } } } + // ANCHOR_END: bank_build + // ANCHOR: bank_accessors /// Row pointer for phase p in [0, numPhases()]; T contiguous coefficients. 
const Coeff* phase(std::size_t p) const noexcept { return table_.data() + p * taps_; } std::size_t numPhases() const noexcept { return phases_; } ///< L @@ -160,6 +169,7 @@ class PolyphaseFilterBank { double groupDelaySamples() const noexcept { return static_cast(phases_ * taps_ - 1) / (2.0 * static_cast(phases_)); } + // ANCHOR_END: bank_accessors private: std::size_t phases_; @@ -167,6 +177,7 @@ class PolyphaseFilterBank { std::vector table_; // (L+1) x T, rows tap-reversed }; +// ANCHOR: bank_interpolate /// Evaluates one output sample at fractional position mu in [0, 1). /// /// \param hist oldest-first window of the newest T input samples of one channel @@ -192,6 +203,7 @@ inline S interpolate(const PolyphaseFilterBank& bank, const S* hist, double m acc = Tr::mac(acc, hist[t], Tr::blend(c0[t], c1[t], fr)); return Tr::finalize(acc); } +// ANCHOR_END: bank_interpolate /// Blends the two phase rows adjacent to mu into `row` (taps() entries). /// Multichannel datapaths do this once per output frame and then run @@ -213,6 +225,7 @@ inline void blendRow(const PolyphaseFilterBank& bank, row[t] = Tr::blend(c0[t], c1[t], fr); } +// ANCHOR: rs_blend_row_phase /// Phase-bit variants: the fractional position as an unsigned Q0.64 /// fraction. The polyphase index is the top log2(L) bits and the intra-phase /// blend factor comes from the bits below — no double arithmetic per sample, @@ -232,7 +245,9 @@ inline void blendRowPhase(const PolyphaseFilterBank& bank, for (std::size_t t = 0; t < taps; ++t) row[t] = Tr::blend(c0[t], c1[t], fr); } +// ANCHOR_END: rs_blend_row_phase +// ANCHOR: rs_interpolate_phase /// interpolate() over a Q0.64 phase; fused blend+mac (mono fast path). 
template inline S interpolatePhase(const PolyphaseFilterBank& bank, const S* hist, @@ -249,7 +264,9 @@ inline S interpolatePhase(const PolyphaseFilterBank& bank, const S* hist, acc = Tr::mac(acc, hist[t], Tr::blend(c0[t], c1[t], fr)); return Tr::finalize(acc); } +// ANCHOR_END: rs_interpolate_phase +// ANCHOR: rs_dot_row /// Dot product of a pre-blended coefficient row against a history window. /// Identical arithmetic to interpolate() given the same mu: blend then mac, /// per tap, in the same order — outputs are bit-exact either way. @@ -281,7 +298,9 @@ inline S dotRow(const typename SampleTraits::Coeff* SRT_RESTRICT row, const S acc = Tr::mac(acc, hist[t], row[t]); return Tr::finalize(acc); } +// ANCHOR_END: rs_dot_row +// ANCHOR: opt_dot_tile /// One K-channel tile of the channel-parallel dot (hypothesis C6): K /// accumulators live in a constexpr-size local array — registers, not /// memory — while the tap loop walks the frame-major window with stride @@ -303,7 +322,10 @@ inline void dotTileFrameMajor(const typename SampleTraits::Coeff* SRT_RESTRIC for (std::size_t k = 0; k < K; ++k) out[k] = Tr::finalize(acc[k]); } +// ANCHOR_END: opt_dot_tile +// ANCHOR: rs_dot_rows_frame_major +// ANCHOR: opt_dot_rows /// Channel-parallel dot products over a frame-major history block: all /// channels' outputs for one frame in register-blocked tiles of 8/4/2/1. /// Per channel the accumulation order over taps equals dotRow's, so the @@ -328,7 +350,10 @@ inline void dotRowsFrameMajor(const typename SampleTraits::Coeff* SRT_RESTRIC if (c < channels) dotTileFrameMajor(row, x + c, taps, channels, out + c); } +// ANCHOR_END: rs_dot_rows_frame_major +// ANCHOR_END: opt_dot_rows +// ANCHOR: rs_class_doc /// Streaming fractional-delay engine for one converter instance. 
/// /// Owns the history delay lines (planar per-channel below the @@ -348,6 +373,7 @@ inline void dotRowsFrameMajor(const typename SampleTraits::Coeff* SRT_RESTRIC /// detected by 64-bit wraparound instead of comparisons. template class FractionalResampler { + // ANCHOR_END: rs_class_doc public: /// Frame-major channel-parallel mode is compiled in only on CP targets /// and only for floating-point samples (see SRT_CHANNEL_PARALLEL). @@ -378,6 +404,7 @@ class FractionalResampler { scratchPos_ = 0; } + // ANCHOR: rs_mu /// Fractional position in [0,1) as a double; used by the servo at block /// rate (one conversion per pull, not per sample). double mu() const noexcept { return static_cast(phase_) * 0x1p-64; } @@ -386,6 +413,7 @@ class FractionalResampler { /// Frames popped from the source but not yet consumed by the filter; part /// of the effective backlog the servo must observe. std::size_t bufferedFrames() const noexcept { return scratchFrames_ - scratchPos_; } + // ANCHOR_END: rs_mu /// Fills the history window with taps() frames from the source. /// Returns false (and stays unprimed) if the source ran dry. @@ -400,6 +428,7 @@ class FractionalResampler { return true; } + // ANCHOR: rs_process_doc /// Synthesizes up to maxFrames output frames (interleaved) advancing the /// read position by (1 + epsHat) input frames per output frame. Returns /// the number produced; fewer than maxFrames means the source ran dry @@ -414,6 +443,9 @@ class FractionalResampler { /// interleaved frames, returning the count actually delivered. template std::size_t process(S* out, std::size_t maxFrames, double epsHat, PopFn&& popFrames) noexcept { + // ANCHOR_END: rs_process_doc + // ANCHOR: p0_phase_step + // ANCHOR: rs_slip // eps in Q0.64, converted once per call (block rate). |eps| is // servo-clamped to ~1e-3, so eps * 2^64 fits int64 comfortably. 
const auto epsFix = static_cast(epsHat * 0x1p64); @@ -432,6 +464,9 @@ class FractionalResampler { return n; // dry: phase_ not advanced for this frame } phase_ = m; + // ANCHOR_END: p0_phase_step + // ANCHOR_END: rs_slip + // ANCHOR: rs_dispatch // Q15 on SMLALD targets routes mono through blendRow+dotRow as // well: dotRow carries the dual-MAC loop, and the two paths are // bit-exact by construction (see dotRow). @@ -454,6 +489,7 @@ class FractionalResampler { for (std::size_t c = 0; c < channels_; ++c) out[n * channels_ + c] = dotRow(row_.data(), window(c), taps); } + // ANCHOR_END: rs_dispatch } return maxFrames; } @@ -461,6 +497,7 @@ class FractionalResampler { private: const S* window(std::size_t c) const noexcept { return hist_[c].data() + end_ - bank_->taps(); } + // ANCHOR: rs_append template bool appendOne(PopFn&& popFrames) noexcept { if (scratchPos_ == scratchFrames_) { @@ -490,12 +527,14 @@ class FractionalResampler { ++scratchPos_; return true; } + // ANCHOR_END: rs_append const PolyphaseFilterBank* bank_; std::size_t channels_; std::size_t chunk_; std::size_t histCap_; std::vector scratch_; // interleaved staging for bulk pops + // ANCHOR: rs_members // History storage: planar (one delay line per channel, hist_[c]) below // SRT_CP_MIN_CHANNELS, frame-major (single interleaved line, hist_[0]) // at or above it on SRT_CHANNEL_PARALLEL targets. end_/histCap_ count @@ -507,6 +546,7 @@ class FractionalResampler { std::size_t scratchFrames_ = 0; std::size_t scratchPos_ = 0; std::uint64_t phase_ = 0; // fractional position, unsigned Q0.64 + // ANCHOR_END: rs_members bool primed_ = false; }; diff --git a/include/srt/sample_traits.hpp b/include/srt/sample_traits.hpp index b3eb832..5fd4647 100644 --- a/include/srt/sample_traits.hpp +++ b/include/srt/sample_traits.hpp @@ -1,3 +1,4 @@ +// ANCHOR: st_overview /// \file sample_traits.hpp /// \brief Sample-type customization point for the resampling datapath. 
/// @@ -14,6 +15,7 @@ /// The clock servo and the filter design always run in double regardless of /// sample type (control path and one-time init, not the audio path), so the /// fixed-point datapaths contain no floating-point inner loops. +// ANCHOR_END: st_overview #ifndef SRT_SAMPLE_TRAITS_HPP #define SRT_SAMPLE_TRAITS_HPP @@ -26,6 +28,7 @@ namespace srt { namespace detail { +// ANCHOR: st_roundsat /// Round-and-saturate a double to a signed integer coefficient/sample type. template constexpr I roundSat(double v) noexcept { @@ -38,6 +41,7 @@ constexpr I roundSat(double v) noexcept { return std::numeric_limits::max(); return static_cast(r); } +// ANCHOR_END: st_roundsat /// Saturate a 64-bit accumulator result to a narrower signed integer. template @@ -49,10 +53,13 @@ constexpr I clampSat(std::int64_t v) noexcept { } // namespace detail +// ANCHOR: st_primary /// Primary template intentionally undefined; specialize per sample type. template struct SampleTraits; +// ANCHOR_END: st_primary +// ANCHOR: st_float /// Float datapath: float samples and coefficients, double accumulation. /// The double accumulator keeps the dot-product noise floor far below the /// 120 dB transparency target; float coefficient storage quantizes the @@ -69,6 +76,7 @@ struct SampleTraits { /// Convert the intra-phase fraction (in [0,1)) once per output sample. static BlendFactor makeBlendFactor(double fr) noexcept { return static_cast(fr); } + // ANCHOR: st_blend_q64_float /// Blend factor from the top bits of a Q0.64 intra-phase fraction. /// Single-precision only: the value is reduced to 24 bits first so the /// uint->float conversion is exact and no double op is needed @@ -76,6 +84,7 @@ struct SampleTraits { static BlendFactor blendFactorFromQ64(std::uint64_t frac) noexcept { return static_cast(frac >> 40) * 0x1p-24f; } + // ANCHOR_END: st_blend_q64_float /// acc + x * c, in the accumulator domain. 
static Accum mac(Accum acc, float x, Coeff c) noexcept { @@ -91,7 +100,9 @@ struct SampleTraits { /// The zero/silence sample value. static float silence() noexcept { return 0.0f; } }; +// ANCHOR_END: st_float +// ANCHOR: st_q15_header /// Q15 fixed-point datapath (samples are int16_t in Q0.15). /// /// Coefficients are stored in Q1.14: the prototype's peak tap reaches ~1.0 @@ -107,26 +118,34 @@ struct SampleTraits { using Coeff = std::int16_t; using Accum = std::int64_t; using BlendFactor = std::int32_t; ///< fraction in Q15 + // ANCHOR_END: st_q15_header + // ANCHOR: st_q15_coeff static Coeff makeCoeff(double c) noexcept { return detail::roundSat(c * 16384.0); // Q1.14 } + // ANCHOR_END: st_q15_coeff static BlendFactor makeBlendFactor(double fr) noexcept { return static_cast(fr * 32768.0); // Q15 } + // ANCHOR: st_q15_q64 /// Q15 blend factor straight from a Q0.64 fraction's top bits: no /// floating point at all on the fixed-point per-sample path. static BlendFactor blendFactorFromQ64(std::uint64_t frac) noexcept { return static_cast(frac >> 49); // Q15 } + // ANCHOR_END: st_q15_q64 + // ANCHOR: st_q15_mac static Accum mac(Accum acc, std::int16_t x, Coeff c) noexcept { return acc + static_cast(static_cast(x) * static_cast(c)); } + // ANCHOR_END: st_q15_mac + // ANCHOR: st_q15_blend static Coeff blend(Coeff a, Coeff b, BlendFactor fr) noexcept { // Q14 + (Q15 * Q14) >> 15, in int64: the worst-case int32 product // 32767 * 65535 = 2,147,385,345 sits 0.005% under INT32_MAX — @@ -136,16 +155,20 @@ struct SampleTraits { const std::int64_t diff = static_cast(b) - a; return static_cast(a + ((fr * diff) >> 15)); } + // ANCHOR_END: st_q15_blend + // ANCHOR: st_q15_finalize static std::int16_t finalize(Accum acc) noexcept { // Round-half-up, not half-even: the bias is a fraction of one // sub-LSB rounding step, far below the Q15 noise floor. 
return detail::clampSat((acc + (1 << 13)) >> 14); // Q29 -> Q15 } + // ANCHOR_END: st_q15_finalize static std::int16_t silence() noexcept { return 0; } }; +// ANCHOR: st_q31 /// Q31 fixed-point datapath (samples are int32_t in Q0.31). /// /// Coefficients are stored in Q1.30 (one headroom bit for the ~1.0 peak @@ -174,9 +197,11 @@ struct SampleTraits { return static_cast(frac >> 44); // Q20 } + // ANCHOR: st_q31_mac static Accum mac(Accum acc, std::int32_t x, Coeff c) noexcept { return acc + ((static_cast(x) * c) >> 16); // Q61 -> Q45 } + // ANCHOR_END: st_q31_mac static Coeff blend(Coeff a, Coeff b, BlendFactor fr) noexcept { const std::int64_t diff = static_cast(b) - a; @@ -189,7 +214,9 @@ struct SampleTraits { static std::int32_t silence() noexcept { return 0; } }; +// ANCHOR_END: st_q31 +// ANCHOR: st_concept /// Satisfied by any type with a complete, well-formed SampleTraits /// specialization. template @@ -212,6 +239,7 @@ concept SampleType = static_assert(SampleType); static_assert(SampleType); static_assert(SampleType); +// ANCHOR_END: st_concept } // namespace srt diff --git a/include/srt/spsc_ring.hpp b/include/srt/spsc_ring.hpp index e70b562..fa94eae 100644 --- a/include/srt/spsc_ring.hpp +++ b/include/srt/spsc_ring.hpp @@ -20,6 +20,7 @@ namespace srt { +// ANCHOR: contract /// Lock-free SPSC ring buffer of trivially copyable elements. /// /// Thread contract: write() and writeAvailable() may only be called from the @@ -34,6 +35,7 @@ class SpscRing { static_assert(std::is_trivially_copyable_v); // The lock-free claim of the whole audio path rests on these indices. static_assert(std::atomic::is_always_lock_free); + // ANCHOR_END: contract public: /// Allocates the buffer; capacity is rounded up to a power of two. @@ -45,6 +47,7 @@ class SpscRing { std::size_t capacity() const noexcept { return buf_.size(); } + // ANCHOR: write /// Producer: append up to n elements; returns the number actually written. 
std::size_t write(const T* src, std::size_t n) noexcept { const std::size_t head = head_.load(std::memory_order_relaxed); @@ -64,12 +67,15 @@ class SpscRing { return n; } + // ANCHOR_END: write + /// Producer: exact free space at the time of the call. std::size_t writeAvailable() noexcept { tailCache_ = tail_.load(std::memory_order_acquire); return capacity() - (head_.load(std::memory_order_relaxed) - tailCache_); } + // ANCHOR: read /// Consumer: remove up to n elements; returns the number actually read. std::size_t read(T* dst, std::size_t n) noexcept { const std::size_t tail = tail_.load(std::memory_order_relaxed); @@ -89,6 +95,8 @@ class SpscRing { return n; } + // ANCHOR_END: read + /// Consumer: exact occupancy at the time of the call. std::size_t readAvailable() noexcept { headCache_ = head_.load(std::memory_order_acquire); @@ -110,6 +118,7 @@ class SpscRing { } private: + // ANCHOR: layout // 64-byte separation to keep producer- and consumer-owned state on // distinct cache lines (std::hardware_destructive_interference_size is // deliberately avoided: it is ABI-fragile and warns on GCC). 
The @@ -123,6 +132,7 @@ class SpscRing { alignas(kCacheLine) std::size_t tailCache_{0}; // producer's view of tail alignas(kCacheLine) std::atomic tail_{0}; // written by consumer alignas(kCacheLine) std::size_t headCache_{0}; // consumer's view of head + // ANCHOR_END: layout }; } // namespace srt diff --git a/notebooks/asrc_demo.ipynb b/notebooks/asrc_demo.ipynb index b026d74..08d0f77 100644 --- a/notebooks/asrc_demo.ipynb +++ b/notebooks/asrc_demo.ipynb @@ -727,8 +727,8 @@ "\n", "| What | Measured here |\n", "|---|---|\n", - "| Naive FIFO at +200 ppm | clicks ~10×/s, SNR around 29 dB dB |\n", - "| SampleRateTap, same conditions | **SNR > 130 dB** — at the 24-bit noise floor |\n", + "| Naive FIFO at +200 ppm | clicks ~10×/s, SNR around 29 dB |\n", + "| SampleRateTap, same conditions | **SNR 126.4 dB measured** (cell asserts > 125) |\n", "| Lock from cold start | ~1 s |\n", "| Latency | ≈ designed 1.5 ms, linear phase |\n", "| 50 ppm/s drift ramp | tracked, locked, zero underruns |\n", diff --git a/platform/armv8m_startup.c b/platform/armv8m_startup.c index a0007e8..4db20bd 100644 --- a/platform/armv8m_startup.c +++ b/platform/armv8m_startup.c @@ -47,6 +47,7 @@ void* __dso_handle; void _init(void) {} void _fini(void) {} +/* ANCHOR: pt_sbrk */ void* _sbrk(ptrdiff_t increment) { static char* brk = &__heap_start__; if (brk + increment > &__heap_end__) { @@ -57,7 +58,9 @@ void* _sbrk(ptrdiff_t increment) { brk += increment; return prev; } +/* ANCHOR_END: pt_sbrk */ +/* ANCHOR: pt_irqlock */ static inline uint32_t irqLock(void) { uint32_t primask; __asm volatile("mrs %0, PRIMASK\n cpsid i" : "=r"(primask)::"memory"); @@ -67,6 +70,7 @@ static inline uint32_t irqLock(void) { static inline void irqRestore(uint32_t primask) { __asm volatile("msr PRIMASK, %0" ::"r"(primask) : "memory"); } +/* ANCHOR_END: pt_irqlock */ uint64_t __atomic_load_8(const volatile void* ptr, int memorder) { (void)memorder; @@ -83,6 +87,7 @@ void __atomic_store_8(volatile void* ptr, uint64_t value, 
int memorder) { irqRestore(m); } +/* ANCHOR: pt_atomic_rmw */ uint64_t __atomic_fetch_add_8(volatile void* ptr, uint64_t value, int memorder) { (void)memorder; const uint32_t m = irqLock(); @@ -91,6 +96,7 @@ uint64_t __atomic_fetch_add_8(volatile void* ptr, uint64_t value, int memorder) irqRestore(m); return prev; } +/* ANCHOR_END: pt_atomic_rmw */ uint64_t __atomic_exchange_8(volatile void* ptr, uint64_t value, int memorder) { (void)memorder; @@ -101,6 +107,7 @@ uint64_t __atomic_exchange_8(volatile void* ptr, uint64_t value, int memorder) { return prev; } +/* ANCHOR: pt_reset */ void Reset_Handler(void) { /* MSPLIM exists on Armv8-M Mainline only (both targets are M33/M55 * class): a main-stack overflow past __stack_limit raises a fault @@ -119,6 +126,7 @@ void Reset_Handler(void) { __libc_init_array(); /* C++ static constructors */ exit(main(0, (char**)0)); } +/* ANCHOR_END: pt_reset */ void Default_Handler(void) { for (;;) { @@ -134,6 +142,7 @@ void HardFault_Handler(void) { } } +/* ANCHOR: pt_vectors */ __attribute__((section(".vectors"), used)) static const uintptr_t vectors[16] = { (uintptr_t)&__stack_top, (uintptr_t)&Reset_Handler, @@ -152,6 +161,7 @@ __attribute__((section(".vectors"), used)) static const uintptr_t vectors[16] = (uintptr_t)&Default_Handler, /* PendSV */ (uintptr_t)&Default_Handler, /* SysTick */ }; +/* ANCHOR_END: pt_vectors */ #ifdef __cplusplus } /* extern "C" */ diff --git a/platform/mps2_an505/mps2_an505.ld b/platform/mps2_an505/mps2_an505.ld index d90299f..f2a7c42 100644 --- a/platform/mps2_an505/mps2_an505.ld +++ b/platform/mps2_an505/mps2_an505.ld @@ -8,6 +8,7 @@ * SSRAM1 4 MB @ 0x10000000 - vector table + code + rodata * SSRAM2/3 4 MB @ 0x38000000 - data + bss + heap + stack */ +/* ANCHOR: pt_memory */ MEMORY { CODE (rx) : ORIGIN = 0x10000000, LENGTH = 4M @@ -15,6 +16,7 @@ MEMORY } __stack_top = ORIGIN(DATA) + LENGTH(DATA); +/* ANCHOR_END: pt_memory */ ENTRY(Reset_Handler) @@ -72,6 +74,7 @@ SECTIONS __bss_end__ = .; } > DATA + /* 
ANCHOR: pt_heap_stack */ /* Stack lives at the top of DATA; cap the heap 64 KB below it. */ .heap (NOLOAD) : ALIGN(8) { __heap_start__ = .; @@ -82,6 +85,7 @@ SECTIONS /* MSPLIM (set in Reset_Handler): the stack may descend to the heap cap * but no further — overflow into the heap faults instead of corrupting. */ __stack_limit = __heap_end__; + /* ANCHOR_END: pt_heap_stack */ /* librdimon's (unused, weak) _sbrk references `end`; satisfy it. */ PROVIDE(end = __heap_start__); diff --git a/platform/mps3_an547/mps3_an547.ld b/platform/mps3_an547/mps3_an547.ld index a0a4d40..c2777f4 100644 --- a/platform/mps3_an547/mps3_an547.ld +++ b/platform/mps3_an547/mps3_an547.ld @@ -8,6 +8,7 @@ * DTCM 512 KB @ 0x20000000 - stack * ISRAM 2 MB @ 0x21000000 - data + bss + heap */ +/* ANCHOR: pt_memory */ MEMORY { ITCM (rx) : ORIGIN = 0x00000000, LENGTH = 512K @@ -20,6 +21,7 @@ __stack_top = ORIGIN(DTCM) + LENGTH(DTCM); /* MSPLIM (set in Reset_Handler): the stack owns all of DTCM, so the lowest * address it may legally reach is the region base. */ __stack_limit = ORIGIN(DTCM); +/* ANCHOR_END: pt_memory */ ENTRY(Reset_Handler) @@ -77,11 +79,13 @@ SECTIONS __bss_end__ = .; } > DATA + /* ANCHOR: pt_heap */ .heap (NOLOAD) : ALIGN(8) { __heap_start__ = .; . = ORIGIN(DATA) + LENGTH(DATA); __heap_end__ = .; } > DATA + /* ANCHOR_END: pt_heap */ /* librdimon's (unused, weak) _sbrk references `end`; satisfy it. */ PROVIDE(end = __heap_start__); diff --git a/scripts/book_figures.py b/scripts/book_figures.py new file mode 100644 index 0000000..6e705a1 --- /dev/null +++ b/scripts/book_figures.py @@ -0,0 +1,459 @@ +#!/usr/bin/env python3 +"""Regenerates the book's figures (book/src/img/*.svg). 
+ +Every figure is produced from the same sources the text cites: + +- the filter figures re-run the exact design math of + include/srt/detail/kaiser.hpp (formula-for-formula port below); +- the servo and feasibility figures are MEASURED: this script compiles + scripts/book_figures_trace.cpp against the current include/ tree and runs + it in deterministic virtual time. The feasibility "before" panel compiles + the same tool against the include/ tree of commit 045de5d — the last + commit before the PR #25 feasibility fix — extracted with `git archive`, + so both panels of that figure are measurements, not models; +- the phase-wraparound figure runs the resampler's actual uint64 slip + arithmetic (mod 2^64) in Python integers; +- the architecture figure is drawn, not computed. + +Usage: python3 scripts/book_figures.py (from the repo root) +Needs: numpy, matplotlib, g++, git. + +The SVGs are committed. CI does not regenerate them — matplotlib's SVG +output is not byte-stable across matplotlib versions, so a regenerate-and- +diff gate would ratchet toolchain noise, not truth — but the book CI job +does verify that every image the chapters reference exists. +""" + +import os +import subprocess +import sys +import tempfile + +import numpy as np +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +from matplotlib.patches import FancyBboxPatch, FancyArrowPatch + +ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +OUT = os.path.join(ROOT, "book", "src", "img") +PREFIX_COMMIT = "045de5d" # last commit before the feasibility fix (PR #25) + +# Palette (validated categorical slots + chrome ink; light surface). 
+SURFACE = "#fcfcfb" +INK = "#0b0b0b" +SECONDARY = "#52514e" +MUTED = "#898781" +GRID = "#e1e0d9" +BASELINE = "#c3c2b7" +BLUE = "#2a78d6" # slot 1 +AQUA = "#1baf7a" # slot 2 (sub-3:1 contrast: always direct-labeled) +YELLOW = "#eda100" # slot 3 (sub-3:1 contrast: always direct-labeled) +RED = "#e34948" # slot 6, used only for the pre-fix (failing) trace + +plt.rcParams.update({ + "figure.facecolor": SURFACE, + "axes.facecolor": SURFACE, + "savefig.facecolor": SURFACE, + "font.family": "sans-serif", + "font.sans-serif": ["DejaVu Sans"], + "font.size": 9, + "text.color": INK, + "axes.edgecolor": BASELINE, + "axes.labelcolor": SECONDARY, + "axes.titlecolor": INK, + "axes.titlesize": 10, + "axes.linewidth": 0.75, + "axes.grid": True, + "grid.color": GRID, + "grid.linewidth": 0.75, + "grid.linestyle": "-", + "xtick.color": MUTED, + "ytick.color": MUTED, + "xtick.labelcolor": MUTED, + "ytick.labelcolor": MUTED, + "lines.linewidth": 1.5, + "lines.solid_joinstyle": "round", + "lines.solid_capstyle": "round", + "legend.frameon": False, + "svg.hashsalt": "sampleratetap-book", +}) + + +def save(fig, name): + fig.savefig(os.path.join(OUT, name + ".svg")) + png_dir = os.environ.get("PNG_OUT") # optional raster copies for review + if png_dir: + fig.savefig(os.path.join(png_dir, name + ".png"), dpi=110) + + +def despine(ax): + for side in ("top", "right"): + ax.spines[side].set_visible(False) + + +# --- the filter design math, ported formula-for-formula from kaiser.hpp --- + +def bessel_i0(x): + x = np.asarray(x, dtype=float) + half = 0.5 * x + term = np.ones_like(x) + total = np.ones_like(x) + for k in range(1, 1000): + r = half / k + term = term * r * r + total = total + term + if np.all(term < 1e-21 * total): + break + return total + + +def kaiser_beta(atten_db): + if atten_db > 50.0: + return 0.1102 * (atten_db - 8.7) + if atten_db > 21.0: + return 0.5842 * (atten_db - 21.0) ** 0.4 + 0.07886 * (atten_db - 21.0) + return 0.0 + + +def design_prototype(num_phases, 
taps_per_phase, cutoff_norm, beta): + n = num_phases * taps_per_phase + i = np.arange(n, dtype=float) + center = 0.5 * (n - 1) + t = (i - center) / num_phases + u = (i - center) / center + w = bessel_i0(beta * np.sqrt(np.maximum(0.0, 1.0 - u * u))) / bessel_i0(beta) + h = cutoff_norm * np.sinc(cutoff_norm * t) * w # np.sinc is sin(pi x)/(pi x) + return h * (num_phases / h.sum()) + + +# FilterSpec presets, verbatim from polyphase_filter.hpp. +PRESETS = [ + ("fast", 128, 32, 18000.0, 30000.0, 96.0, BLUE), + ("balanced", 256, 48, 20000.0, 28000.0, 120.0, AQUA), + ("transparent", 512, 80, 20000.0, 26000.0, 140.0, YELLOW), +] +FS = 48000.0 + + +def preset_response(L, T, pass_hz, stop_hz, atten_db, nfft=1 << 21): + cutoff = (pass_hz + stop_hz) / FS + h = design_prototype(L, T, cutoff, kaiser_beta(atten_db)) + H = np.fft.rfft(h, nfft) / L + f = np.arange(H.size) * (L * FS) / nfft + keep = f <= 48000.0 + return f[keep], 20.0 * np.log10(np.maximum(np.abs(H[keep]), 1e-12)) + + +def fig_kaiser_window(): + fig, ax = plt.subplots(figsize=(6.4, 3.2), layout="constrained") + u = np.linspace(-1.0, 1.0, 801) + iu = 180 # u = -0.55, where the three curves are well separated + for name, _, _, _, _, atten, color in PRESETS: + beta = kaiser_beta(atten) + w = bessel_i0(beta * np.sqrt(1.0 - u * u)) / bessel_i0(beta) + ax.plot(u, w, color=color, label=f"{name}: {atten:.0f} dB, β = {beta:.1f}") + ax.annotate(name, (u[iu], w[iu]), xytext=(-4, 4), + textcoords="offset points", color=SECONDARY, fontsize=8.5, + ha="right", bbox=dict(fc=SURFACE, ec="none", pad=1.0)) + ax.set_xlabel("window argument u (full aperture −1 … 1)") + ax.set_ylabel("w(u)") + ax.set_title("Kaiser window: attenuation buys taper") + ax.legend(loc="upper right", fontsize=8.5) + ax.set_xlim(-1.0, 1.0) + ax.set_ylim(0.0, 1.05) + despine(ax) + save(fig, "kaiser-window") + plt.close(fig) + + +def fig_kaiser_response(): + fig, (ax, axz) = plt.subplots( + 2, 1, figsize=(7.0, 5.6), layout="constrained", height_ratios=[2.4, 1.0]) 
+ for name, L, T, pass_hz, stop_hz, atten, color in PRESETS: + f, db = preset_response(L, T, pass_hz, stop_hz, atten) + ax.plot(f / 1e3, db, color=color, label=name) + axz.plot(f / 1e3, db, color=color) + # direct label at each preset's measured stopband floor + floor = db[f >= stop_hz].max() + ax.annotate(f"{name}: {floor:.0f} dB past {stop_hz/1e3:.0f} kHz", + (47.0, floor), xytext=(0, 7), + textcoords="offset points", color=SECONDARY, fontsize=8.5, + ha="right", bbox=dict(fc=SURFACE, ec="none", pad=1.0)) + for x, label in ((20.0, "20 kHz passband edge"), (24.0, "input Nyquist")): + ax.axvline(x, color=BASELINE, lw=0.75, zorder=0) + ax.annotate(label, (x, -182), rotation=90, xytext=(-3, 0), + textcoords="offset points", color=MUTED, + fontsize=7.5, ha="right", va="bottom", + bbox=dict(fc=SURFACE, ec="none", pad=1.0)) + ax.set_ylim(-185, 8) + ax.set_xlim(0, 48) + ax.set_ylabel("magnitude (dB)") + ax.set_title("Prototype magnitude response, the three presets") + ax.legend(loc="upper right", fontsize=8.5) + despine(ax) + axz.set_xlim(0, 22) + axz.set_ylim(-0.031, 0.031) + axz.set_xlabel("frequency at 48 kHz (kHz)") + axz.set_ylabel("passband detail (dB)") + axz.annotate("all three presets flat within ±0.01 dB across their passbands", + (0.5, 0.021), color=SECONDARY, fontsize=8.5, ha="left") + despine(axz) + save(fig, "kaiser-response") + plt.close(fig) + + +# --- measured traces via the C++ tool --- + +def build_trace_tool(include_dir, exe): + subprocess.run( + ["g++", "-O2", "-std=c++20", f"-I{include_dir}", + os.path.join(ROOT, "scripts", "book_figures_trace.cpp"), "-o", exe], + check=True) + + +def run_trace(exe, *args): + out = subprocess.run([exe] + [str(a) for a in args], + check=True, capture_output=True, text=True).stdout + rows = [line.split(",") for line in out.strip().splitlines()[1:]] + a = np.array(rows, dtype=float) + return {"t": a[:, 0], "fill": a[:, 1], "state": a[:, 2], + "ppm": a[:, 3], "underruns": a[:, 4]} + + +def fig_servo_lock(head_exe): + # 
1-frame pushes: the long tests' methodology — block-quantized pushes + # would hide the 200 ppm surplus in one 32-frame lump every ~3.3 s. + tr = run_trace(head_exe, 32, 1, 200, 45, 28.0, 0.05) + fig, (axf, axp) = plt.subplots( + 2, 1, figsize=(7.0, 4.8), sharex=True, layout="constrained") + + state = tr["state"] + t_lock1 = tr["t"][np.argmax(state == 2)] + i_stall = int(np.searchsorted(tr["underruns"], 0.5)) + t_stall = tr["t"][i_stall] + after_stall = (tr["t"] > t_stall) & (state == 2) + t_lock2 = tr["t"][np.argmax(after_stall)] + + axf.plot(tr["t"], tr["fill"], color=BLUE, lw=1.2) + axf.axhline(48, color=BASELINE, lw=0.75, zorder=0) + axf.annotate("setpoint 48", (44.8, 48), xytext=(0, 5), + textcoords="offset points", color=MUTED, fontsize=8, ha="right") + axf.set_ylim(46.6, 50.6) + axf.annotate(f"cold start: Locked in {t_lock1:.2f} s", + (t_lock1, 50.0), xytext=(2.5, 50.0), textcoords="data", + color=SECONDARY, fontsize=8, + arrowprops=dict(arrowstyle="-", color=SECONDARY, lw=0.75)) + axf.annotate("50 ms producer stall → refill → " + f"re-Locked {t_lock2 - t_stall:.2f} s later", + (t_stall, 50.0), xytext=(30.5, 50.0), textcoords="data", + color=SECONDARY, fontsize=8, + arrowprops=dict(arrowstyle="-", color=SECONDARY, lw=0.75)) + axf.set_ylabel("FIFO occupancy (frames)") + axf.set_title("Acquire, lock, dropout, re-lock (measured, producer +200 ppm)") + + axp.plot(tr["t"], tr["ppm"], color=BLUE, lw=1.2) + for y in (1500, -1500): + axp.axhline(y, color=BASELINE, lw=0.75, zorder=0) + axp.axhline(200, color=BASELINE, lw=0.75, zorder=0) + axp.annotate("true offset 200 ppm", (44.8, 200), xytext=(0, 5), + textcoords="offset points", color=MUTED, fontsize=8, ha="right") + axp.annotate("servo clamp ±1500 ppm", (44.8, 1500), xytext=(0, -10), + textcoords="offset points", color=MUTED, fontsize=8, + ha="right", va="top") + axp.annotate("Acquiring (10 Hz) rings against the clamp\n" + "on the ±1-frame quantized occupancy", + (2.7, 900), color=SECONDARY, fontsize=8, ha="left") 
+ axp.annotate("Locked (1 Hz): settles on the true offset", + (14, 480), color=SECONDARY, fontsize=8, ha="left") + axp.set_ylim(-1750, 1950) + axp.set_ylabel("estimated ppm") + axp.set_xlabel("time (s)") + + # hairlines at the recorded stage transitions + for i in np.flatnonzero(np.diff(state) != 0): + for ax in (axf, axp): + ax.axvline(tr["t"][i + 1], color=GRID, lw=0.75, zorder=0) + for ax in (axf, axp): + despine(ax) + save(fig, "servo-lock") + plt.close(fig) + return tr + + +def fig_feasibility(head_exe, prefix_exe): + before = run_trace(prefix_exe, 64, 32, 200, 6) + after = run_trace(head_exe, 64, 32, 200, 6) + fig, (axb, axa) = plt.subplots( + 2, 1, figsize=(7.0, 4.8), sharex=True, sharey=True, layout="constrained") + + axb.plot(before["t"], before["fill"], color=RED, lw=1.2) + hits = np.flatnonzero(np.diff(before["underruns"]) > 0) + axb.plot(before["t"][hits + 1], before["fill"][hits + 1], "o", + ms=4.5, color=RED, mec=SURFACE, mew=1.0, ls="none") + axb.annotate(f"{int(before['underruns'][-1])} underruns in 6 s — " + "one every ~0.25 s, forever", + (1.6, 116), color=SECONDARY, fontsize=8.5, ha="left") + axb.axhline(48, color=BASELINE, lw=0.75, zorder=0) + axb.set_title(f"Before (commit {PREFIX_COMMIT}, measured): " + "pull(64) against setpoint 48") + axb.set_ylabel("FIFO occupancy (frames)") + + axa.plot(after["t"], after["fill"], color=BLUE, lw=1.2) + axa.axhline(48, color=BASELINE, lw=0.75, zorder=0) + axa.axhline(96, color=BASELINE, lw=0.75, zorder=0) + axa.annotate("configured setpoint 48", (5.95, 48), xytext=(0, 5), + textcoords="offset points", color=MUTED, fontsize=8, ha="right", + bbox=dict(fc=SURFACE, ec="none", pad=1.0)) + axa.annotate("effective setpoint 96 = 64 + 64/2, raised on first pull", + (5.95, 96), xytext=(0, 5), textcoords="offset points", + color=MUTED, fontsize=8, ha="right", + bbox=dict(fc=SURFACE, ec="none", pad=1.0)) + axa.set_title(f"After (HEAD, measured): {int(after['underruns'][-1])} underruns, " + "servo regulates the raised 
setpoint") + axa.set_ylabel("FIFO occupancy (frames)") + axa.set_xlabel("time (s)") + axa.set_ylim(38, 132) + for ax in (axb, axa): + despine(ax) + save(fig, "feasibility") + plt.close(fig) + return before, after + + +# --- the Q0.64 wraparound, run with the real modular arithmetic --- + +def fig_q064(): + M = 1 << 64 + eps_mag = 0.09 # exaggerated so the wrap is visible; real |eps| ~ 2e-4 + fig, axes = plt.subplots( + 1, 2, figsize=(7.0, 3.0), sharey=True, layout="constrained") + for ax, sign, title, note in ( + (axes[0], +1, "ε > 0: wrap past 1.0 → advance 2", + "consume one extra input frame"), + (axes[1], -1, "ε < 0: wrap below 0.0 → advance 0", + "re-use the current window"), + ): + eps_fix = int(sign * eps_mag * M) % M # two's-complement, like the C++ + phase, mu = 0 if sign > 0 else int(0.5 * M), [] + wraps = [] + for n in range(26): + m = (phase + eps_fix) % M + if sign > 0 and m < phase: + wraps.append(n) + if sign < 0 and m > phase: + wraps.append(n) + phase = m + mu.append(phase / M) + n = np.arange(26) + mu = np.array(mu) + ax.plot(n, mu, color=BLUE, lw=1.2, marker="o", ms=4.5, + mec=SURFACE, mew=1.0) + for w in wraps: + ax.plot([w], [mu[w]], "o", ms=6, color=BLUE, mec=SURFACE, mew=1.0) + w0 = wraps[0] # annotate the first wrap only; the rest just repeat + ax.annotate(note, (w0, mu[w0]), xytext=(8, 16 * sign), + textcoords="offset points", color=SECONDARY, fontsize=8, + bbox=dict(fc=SURFACE, ec="none", pad=1.0), + arrowprops=dict(arrowstyle="-", color=SECONDARY, lw=0.75)) + ax.set_title(title, fontsize=9) + ax.set_xlabel("output frame n") + ax.set_ylim(-0.06, 1.06) + despine(ax) + axes[0].set_ylabel("phase μ = phase_ / 2⁶⁴") + fig.suptitle("The Q0.64 accumulator slips by wrapping (ε exaggerated to 0.09; real |ε| ≈ 2×10⁻⁴)", + fontsize=9.5, color=INK) + save(fig, "q064-slip") + plt.close(fig) + + +# --- the architecture diagram (drawn) --- + +def fig_architecture(): + fig, ax = plt.subplots(figsize=(8.2, 3.6), layout="constrained") + ax.set_xlim(0, 100) 
+ ax.set_ylim(0, 44) + ax.axis("off") + + # clock-domain washes + for x0, x1, color, label in ((0, 33, BLUE, "input clock domain (producer)"), + (52, 100, AQUA, "output clock domain (consumer)")): + ax.add_patch(FancyBboxPatch((x0 + 0.5, 1), x1 - x0 - 1, 42, + boxstyle="round,pad=0,rounding_size=1.5", + fc=color, ec="none", alpha=0.08)) + ax.text((x0 + x1) / 2, 2.6, label, ha="center", color=SECONDARY, + fontsize=8) + + def box(x, y, w, h, title, sub=None, weight="bold"): + ax.add_patch(FancyBboxPatch((x, y), w, h, + boxstyle="round,pad=0,rounding_size=1.2", + fc=SURFACE, ec=BASELINE, lw=1.0)) + cy = y + h / 2 + (1.6 if sub else 0) + ax.text(x + w / 2, cy, title, ha="center", va="center", + color=INK, fontsize=8.6, fontweight=weight) + if sub: + ax.text(x + w / 2, cy - 3.8, sub, ha="center", va="center", + color=SECONDARY, fontsize=7.6) + + def arrow(p, q, label=None, dy=1.4, style="-|>"): + ax.add_patch(FancyArrowPatch(p, q, arrowstyle=style, color=SECONDARY, + lw=1.1, mutation_scale=9, + shrinkA=1, shrinkB=1)) + if label: + ax.text((p[0] + q[0]) / 2, (p[1] + q[1]) / 2 + dy, label, + ha="center", color=SECONDARY, fontsize=7.6) + + box(3, 24, 16, 10, "producer", "audio callback / core 0") + box(36, 24, 15, 10, "SpscRing", "interleaved frames") + box(60, 7, 16, 10, "PiServo", "occupancy → ε̂") + box(57, 24, 22, 10, "FractionalResampler", "polyphase bank + Q0.64 phase") + box(84, 24, 13, 10, "consumer", "core 1 / thread") + + arrow((19, 29), (36, 29), "push()") + arrow((51, 29), (57, 29), "pop") + arrow((79, 29), (84, 29), "pull()") + # occupancy: ring bottom, down and across to the servo + arrow((43.5, 24), (43.5, 12), None, style="-") + arrow((43.5, 12), (60, 12), None) + ax.text(51.5, 13.4, "occupancy", ha="center", color=SECONDARY, fontsize=7.6) + # rate estimate: servo top, up into the resampler + arrow((70, 17), (70, 24), None) + ax.text(71.5, 20.2, "ε̂ (rate estimate)", ha="left", color=SECONDARY, + fontsize=7.6) + ax.text(50, 41.5, "one passive object, two 
callers — the converter owns no threads", + ha="center", color=SECONDARY, fontsize=8.4) + save(fig, "architecture") + plt.close(fig) + + +def main(): + os.makedirs(OUT, exist_ok=True) + fig_kaiser_window() + fig_kaiser_response() + fig_q064() + fig_architecture() + + with tempfile.TemporaryDirectory() as tmp: + head_exe = os.path.join(tmp, "trace_head") + build_trace_tool(os.path.join(ROOT, "include"), head_exe) + prefix_tree = os.path.join(tmp, "prefix") + os.makedirs(prefix_tree) + archive = subprocess.run(["git", "-C", ROOT, "archive", PREFIX_COMMIT, "include"], + check=True, capture_output=True).stdout + subprocess.run(["tar", "-x", "-C", prefix_tree], input=archive, check=True) + prefix_exe = os.path.join(tmp, "trace_prefix") + build_trace_tool(os.path.join(prefix_tree, "include"), prefix_exe) + + tr = fig_servo_lock(head_exe) + before, after = fig_feasibility(head_exe, prefix_exe) + + print(f"servo: locked at t={tr['t'][np.argmax(tr['state'] == 2)]:.1f}s, " + f"final ppm {tr['ppm'][-1]:.1f}, underruns {int(tr['underruns'][-1])}") + print(f"feasibility: before {int(before['underruns'][-1])} underruns/6s, " + f"after {int(after['underruns'][-1])}") + print(f"wrote 6 SVGs to {OUT}") + + +if __name__ == "__main__": + main() diff --git a/scripts/book_figures_trace.cpp b/scripts/book_figures_trace.cpp new file mode 100644 index 0000000..22e58f9 --- /dev/null +++ b/scripts/book_figures_trace.cpp @@ -0,0 +1,63 @@ +// Trace dumper for the book's measured figures (scripts/book_figures.py). +// +// Runs the converter in deterministic virtual time — the same event-driven +// two-clock scheme as tests/support/two_clock_sim.hpp — and prints one CSV +// row per pull: t,fill,state,ppm,underruns. book_figures.py compiles this +// file twice, once against the current include/ tree and once against the +// tree of the last pre-feasibility-fix commit, so the before/after figure +// in the composition chapter is measured on both sides of the fix, not +// modeled. 
Only Status fields that exist in both versions are printed. +// +// Usage: trace pullBlock pushBlock ppm seconds [dropStart dropDur] +#include +#include +#include +#include +#include + +#include + +int main(int argc, char** argv) { + if (argc < 5) { + std::fprintf(stderr, "usage: %s pullBlock pushBlock ppm seconds [dropStart dropDur]\n", + argv[0]); + return 2; + } + const std::size_t pullBlock = static_cast(std::atol(argv[1])); + const std::size_t pushBlock = static_cast(std::atol(argv[2])); + const double ppm = std::atof(argv[3]); + const double seconds = std::atof(argv[4]); + const double dropStart = argc > 5 ? std::atof(argv[5]) : -1.0; + const double dropDur = argc > 6 ? std::atof(argv[6]) : 0.0; + + srt::Config cfg; + cfg.channels = 1; + srt::AsyncSampleRateConverter conv(cfg); + + const double fsOut = cfg.sampleRateHz; + const double fsIn = fsOut * (1.0 + ppm * 1e-6); // producer's crystal + std::vector in(pushBlock), out(pullBlock); + + double tPush = 0.0, tPull = 0.0, phase = 0.0; + const double dPhase = 2.0 * std::numbers::pi * 997.0 / fsIn; + std::puts("t,fill,state,ppm,underruns"); + while (tPull < seconds) { + if (tPush <= tPull) { + if (!(tPush >= dropStart && tPush < dropStart + dropDur)) { + for (auto& v : in) { + v = 0.5f * static_cast(std::sin(phase)); + phase += dPhase; + } + conv.push(in.data(), pushBlock); + } + tPush += static_cast(pushBlock) / fsIn; + continue; + } + conv.pull(out.data(), pullBlock); + tPull += static_cast(pullBlock) / fsOut; + const srt::Status s = conv.status(); + std::printf("%.6f,%.2f,%d,%.2f,%llu\n", tPull, s.fifoFillFrames, static_cast(s.state), + s.ppm, static_cast(s.underruns)); + } + return 0; +} diff --git a/tests/support/two_clock_sim.hpp b/tests/support/two_clock_sim.hpp index adbc5df..6dd2b1a 100644 --- a/tests/support/two_clock_sim.hpp +++ b/tests/support/two_clock_sim.hpp @@ -12,6 +12,7 @@ namespace srt_test { +// ANCHOR: pf_knobs template struct TwoClockSimT { srt::BasicAsyncSampleRateConverter& asrc; @@ -27,7 
+28,9 @@ struct TwoClockSimT { /// Optional input-rate modulation: fsIn scale factor at virtual time t /// (e.g. for drift-ramp tests). Defaults to constant 1. std::function fsInScale = [](double) { return 1.0; }; + // ANCHOR_END: pf_knobs + // ANCHOR: pf_run /// Runs for `seconds` of output-clock virtual time. onOut receives every /// pulled block: (interleavedSamples, frames, virtualTime). template @@ -59,6 +62,7 @@ struct TwoClockSimT { } } } + // ANCHOR_END: pf_run }; using TwoClockSim = TwoClockSimT; diff --git a/tools/capi/srt_capi.cpp b/tools/capi/srt_capi.cpp index 0858bb9..04b2207 100644 --- a/tools/capi/srt_capi.cpp +++ b/tools/capi/srt_capi.cpp @@ -1,3 +1,4 @@ +// ANCHOR: abi_doc /// \file srt_capi.cpp /// \brief C ABI shim over the float converter, for FFI consumers (ctypes, /// cffi, Julia, ...). Build with SRT_BUILD_CAPI=ON; srt_capi.h is the @@ -9,12 +10,14 @@ /// zero return values, and every entry point tolerates a null handle — the /// documented error convention ("check srt_create for NULL") otherwise /// invites a crash on exactly the path where the caller forgot to check. +// ANCHOR_END: abi_doc #include #include #include #include "srt/srt.hpp" +// ANCHOR: abi_impl extern "C" { struct SrtHandle; // opaque } @@ -27,6 +30,7 @@ const srt::AsyncSampleRateConverter* impl(const SrtHandle* h) noexcept { return reinterpret_cast(h); } } // namespace +// ANCHOR_END: abi_impl extern "C" { @@ -34,6 +38,7 @@ unsigned srt_version(void) noexcept { return SRT_VERSION_MAJOR * 10000u + SRT_VERSION_MINOR * 100u + SRT_VERSION_PATCH; } +// ANCHOR: abi_create /// preset: 0 = fast, 1 = balanced, 2 = transparent. 
SrtHandle* srt_create(double sampleRateHz, std::size_t channels, std::size_t targetLatencyFrames, int preset) noexcept { @@ -51,11 +56,13 @@ SrtHandle* srt_create(double sampleRateHz, std::size_t channels, std::size_t tar return nullptr; } } +// ANCHOR_END: abi_create void srt_destroy(SrtHandle* h) noexcept { delete impl(h); } +// ANCHOR: abi_null std::size_t srt_push(SrtHandle* h, const float* interleaved, std::size_t frames) noexcept { return h ? impl(h)->push(interleaved, frames) : 0; } @@ -63,6 +70,7 @@ std::size_t srt_push(SrtHandle* h, const float* interleaved, std::size_t frames) std::size_t srt_pull(SrtHandle* h, float* interleaved, std::size_t frames) noexcept { return h ? impl(h)->pull(interleaved, frames) : 0; } +// ANCHOR_END: abi_null /// out[0]=state (0 Filling, 1 Acquiring, 2 Locked), out[1]=ppm, /// out[2]=fifoFillFrames, out[3]=underruns, out[4]=overruns, out[5]=resyncs. diff --git a/tools/capi/srt_capi.h b/tools/capi/srt_capi.h index b8b3195..0f6ad4f 100644 --- a/tools/capi/srt_capi.h +++ b/tools/capi/srt_capi.h @@ -1,3 +1,4 @@ +/* ANCHOR: abi_contract */ /* SampleRateTap C ABI — FFI surface over the float converter. * * Build the shared library with -DSRT_BUILD_CAPI=ON. This header is the @@ -17,6 +18,7 @@ * size_t in these signatures follows the platform ABI (32-bit on 32-bit * targets) — declare foreign types accordingly. */ +/* ANCHOR_END: abi_contract */ #ifndef SRT_CAPI_H #define SRT_CAPI_H @@ -26,6 +28,7 @@ extern "C" { #endif +/* ANCHOR: abi_surface */ typedef struct SrtHandle SrtHandle; /* ABI/version probe: returns SRT_VERSION_MAJOR*10000 + @@ -55,6 +58,7 @@ double srt_designed_latency_seconds(const SrtHandle* h); /* Consumer thread: discard all buffered input, forget the ppm estimate, * return to Filling. 
*/ void srt_reset_from_consumer(SrtHandle* h); +/* ANCHOR_END: abi_surface */ #ifdef __cplusplus } diff --git a/tools/qemu_insn_plugin/insn_count.c b/tools/qemu_insn_plugin/insn_count.c index 9f8fb2d..0f67c74 100644 --- a/tools/qemu_insn_plugin/insn_count.c +++ b/tools/qemu_insn_plugin/insn_count.c @@ -21,6 +21,7 @@ QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION; static uint64_t insn_count; +/* ANCHOR: pf_hooks */ static void tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb* tb) { (void)id; size_t n = qemu_plugin_tb_n_insns(tb); @@ -37,6 +38,7 @@ static void at_exit(qemu_plugin_id_t id, void* userdata) { g_autofree gchar* msg = g_strdup_printf("SRT_INSN_COUNT %" PRIu64 "\n", insn_count); qemu_plugin_outs(msg); } +/* ANCHOR_END: pf_hooks */ QEMU_PLUGIN_EXPORT int qemu_plugin_install(qemu_plugin_id_t id, const qemu_info_t* info, int argc, char** argv) {