diff --git a/.github/workflows/ci-arm64.yml b/.github/workflows/ci-arm64.yml index 6e47089..f9b6684 100644 --- a/.github/workflows/ci-arm64.yml +++ b/.github/workflows/ci-arm64.yml @@ -1,6 +1,8 @@ -# Native arm64 coverage: the only job that exercises the SPSC ring's -# acquire/release protocol on real weakly-ordered hardware (x86 TSan runs -# can't reorder like Arm does, and QEMU does not model weak memory). +# TSan on native arm64: ci.yml's macos-latest leg already runs the suite +# per push on Apple Silicon (arm64), but without TSan — this workflow's +# unique value is the ring stress under TSan on real weakly-ordered +# hardware (x86 TSan runs can't reorder like Arm does, and QEMU does not +# model weak memory). # # Kept out of the per-push workflow on purpose: GitHub's arm64 hosted # runners ("ubuntu-24.04-arm") are not available on every plan for private @@ -13,6 +15,10 @@ on: schedule: - cron: "17 6 * * 1" # Mondays 06:17 UTC +permissions: + contents: read + issues: write + jobs: arm64-native: name: Linux arm64 (native weak-memory) @@ -52,3 +58,22 @@ jobs: for i in 1 2 3 4 5; do ctest --test-dir build-tsan -R 'SpscRing' --output-on-failure done + + # Scheduled runs have no PR or push audience; without this a weekly + # failure goes unseen until someone checks the Actions tab. + - name: File or update failure issue + if: failure() + env: + GH_TOKEN: ${{ github.token }} + run: | + title="ci-arm64 weekly run failing" + body="Run failed: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + number=$(gh issue list --repo "${{ github.repository }}" \ + --state open --search "in:title \"$title\"" \ + --json number,title \ + --jq ".[] | select(.title == \"$title\") | .number" | head -1) + if [ -n "$number" ]; then + gh issue comment "$number" --repo "${{ github.repository }}" --body "$body" + else + gh issue create --repo "${{ github.repository }}" --title "$title" --body "$body" + fi diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 64f50b9..9041e99 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,7 +25,8 @@ jobs: - name: macOS AppleClang os: macos-latest werror: ON - # Warnings stay non-fatal on MSVC until /W4 output has been triaged. + # Warnings stay non-fatal on MSVC until /W4 output has been + # triaged (docs/PERFORMANCE.md "Known debt"). - name: Windows MSVC os: windows-latest werror: OFF @@ -107,10 +108,10 @@ jobs: uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5 with: path: ~/hexagon - # -verified suffix: caches predating checksum verification are not - # reused (the download step, and thus verification, only runs on a - # cache miss). - key: ${{ env.HEXAGON_TOOLCHAIN_URL }}-verified-1 + # Keyed on the pinned digest: every job that can write this key + # verifies its download against the same pin, so no unverified + # writer can poison the trusted entry. + key: hexagon-toolchain-${{ env.HEXAGON_TOOLCHAIN_SHA256 }}-1 - name: Download toolchain if: steps.cache.outputs.cache-hit != 'true' @@ -268,11 +269,17 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 45 env: - QEMU_PLUGIN_HEADER_URL: https://raw.githubusercontent.com/qemu/qemu/v8.2.2/include/qemu/qemu-plugin.h + # Commit the v8.2.2 tag pointed at when pinned (tags are movable; + # commit SHAs are not), with the header's digest verified on download. + QEMU_PLUGIN_HEADER_URL: https://raw.githubusercontent.com/qemu/qemu/11aa0b1ff115b86160c4d37e7c37e6a6b13b77ea/include/qemu/qemu-plugin.h + QEMU_PLUGIN_HEADER_SHA256: "c53a2af163e80e3f4bc6c60dbdfc84003db329d757e37cd8a16a77e1d82606ff" QEMU_SRC_URL: https://download.qemu.org/qemu-8.2.2.tar.xz # Hard pin from the "qemu source sha256:" line of run #24. QEMU_SRC_SHA256: "847346c1b82c1a54b2c38f6edbd85549edeb17430b7d4d3da12620e2962bc4f3" HEXAGON_TOOLCHAIN_URL: https://artifacts.codelinaro.org/artifactory/codelinaro-toolchain-for-hexagon/19.1.5/clang+llvm-19.1.5-cross-hexagon-unknown-linux-musl.tar.zst + # Same hard pin as the hexagon-qemu job: this job also writes the + # shared toolchain cache, so it must verify against the same digest. + HEXAGON_TOOLCHAIN_SHA256: "55b41922318f6331590ab7baa7f5dbdd99c109327a9c44a52c5e9878fab148c1" steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 @@ -285,6 +292,10 @@ jobs: - name: Build counting plugin run: | curl -sfLo /tmp/qemu-plugin.h "$QEMU_PLUGIN_HEADER_URL" + actual=$(sha256sum /tmp/qemu-plugin.h | cut -d' ' -f1) + if [ "$actual" != "$QEMU_PLUGIN_HEADER_SHA256" ]; then + echo "::error::qemu-plugin.h checksum mismatch"; exit 1 + fi gcc -shared -fPIC $(pkg-config --cflags glib-2.0) -I/tmp \ -o /tmp/libinsncount.so tools/qemu_insn_plugin/insn_count.c @@ -349,13 +360,20 @@ jobs: uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5 with: path: ~/hexagon - key: ${{ env.HEXAGON_TOOLCHAIN_URL }}-verified-1 + # Same digest-keyed name as the hexagon-qemu job; the download + # below verifies the same pin before anything is saved under it. + key: hexagon-toolchain-${{ env.HEXAGON_TOOLCHAIN_SHA256 }}-1 - name: Ratchet Hexagon run: | if [ "${{ steps.cache.outputs.cache-hit }}" != "true" ]; then mkdir -p ~/hexagon && cd ~/hexagon curl -sfLo toolchain.tar.zst "$HEXAGON_TOOLCHAIN_URL" + actual=$(sha256sum toolchain.tar.zst | cut -d' ' -f1) + if [ "$actual" != "$HEXAGON_TOOLCHAIN_SHA256" ]; then + echo "::error::toolchain checksum mismatch against pinned value" + exit 1 + fi tar --zstd -xf toolchain.tar.zst && rm toolchain.tar.zst cd "$GITHUB_WORKSPACE" fi @@ -416,6 +434,39 @@ jobs: - name: Run (smoke) run: ./build/bench/srt_bench --benchmark_min_time=0.01s + # Keeps the manually-triggered comparison paths (compare.yml, + # bench/compare) compiling per push; build-only — the measured numbers + # come from the manual workflow, never from here. + compare-smoke: + name: Comparison build smoke + runs-on: ubuntu-latest + timeout-minutes: 20 + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 + + - name: Install dependencies + run: > + sudo apt-get update -q && + sudo apt-get install -y -q libsamplerate0-dev libsoxr-dev + gcc-arm-none-eabi + + - name: Build host comparison bench + run: > + cmake -B build-host + -DCMAKE_BUILD_TYPE=Release + -DSRT_BUILD_BENCHMARKS=ON + -DSRT_BUILD_COMPARE_BENCH=ON + && cmake --build build-host -j 4 --target srt_bench_compare + + - name: Build M55 comparison workload + run: > + cmake -B build-m55 + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_TOOLCHAIN_FILE=cmake/arm-cortex-m55-mps3.cmake + -DSRT_BUILD_TESTS=OFF -DSRT_BUILD_EXAMPLES=OFF + -DSRT_BUILD_ICOUNT_BENCH=ON -DSRT_ICOUNT_COMPARE=ON + && cmake --build build-m55 -j 4 --target cmp_icount_lsr_medium + clang-format: name: clang-format runs-on: ubuntu-latest @@ -427,5 +478,6 @@ jobs: sudo apt-get update -q && sudo apt-get install -y -q clang-format-18 clang-format-18 --dry-run --Werror \ include/srt/*.hpp include/srt/detail/*.hpp \ - bench/*.cpp bench/icount/*.cpp tools/capi/*.cpp \ - tests/*.cpp tests/support/*.hpp examples/*.cpp + bench/*.cpp bench/icount/*.cpp bench/compare/*.cpp \ + tools/capi/*.cpp tools/qemu_insn_plugin/*.c \ + tests/*.cpp tests/support/*.hpp examples/*.cpp platform/*.c diff --git a/.github/workflows/compare.yml b/.github/workflows/compare.yml index f632ea0..691c92f 100644 --- a/.github/workflows/compare.yml +++ b/.github/workflows/compare.yml @@ -14,10 +14,16 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 60 env: - QEMU_PLUGIN_HEADER_URL: https://raw.githubusercontent.com/qemu/qemu/v8.2.2/include/qemu/qemu-plugin.h + # Commit the v8.2.2 tag pointed at when pinned (tags are movable; + # commit SHAs are not), with the header's digest verified on download. + QEMU_PLUGIN_HEADER_URL: https://raw.githubusercontent.com/qemu/qemu/11aa0b1ff115b86160c4d37e7c37e6a6b13b77ea/include/qemu/qemu-plugin.h + QEMU_PLUGIN_HEADER_SHA256: "c53a2af163e80e3f4bc6c60dbdfc84003db329d757e37cd8a16a77e1d82606ff" QEMU_SRC_URL: https://download.qemu.org/qemu-8.2.2.tar.xz QEMU_SRC_SHA256: "847346c1b82c1a54b2c38f6edbd85549edeb17430b7d4d3da12620e2962bc4f3" HEXAGON_TOOLCHAIN_URL: https://artifacts.codelinaro.org/artifactory/codelinaro-toolchain-for-hexagon/19.1.5/clang+llvm-19.1.5-cross-hexagon-unknown-linux-musl.tar.zst + # Same hard pin as ci.yml's hexagon jobs: this job also writes the + # shared toolchain cache, so it must verify against the same digest. + HEXAGON_TOOLCHAIN_SHA256: "55b41922318f6331590ab7baa7f5dbdd99c109327a9c44a52c5e9878fab148c1" steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 @@ -30,6 +36,10 @@ jobs: - name: Build counting plugin run: | curl -sfLo /tmp/qemu-plugin.h "$QEMU_PLUGIN_HEADER_URL" + actual=$(sha256sum /tmp/qemu-plugin.h | cut -d' ' -f1) + if [ "$actual" != "$QEMU_PLUGIN_HEADER_SHA256" ]; then + echo "::error::qemu-plugin.h checksum mismatch"; exit 1 + fi gcc -shared -fPIC $(pkg-config --cflags glib-2.0) -I/tmp \ -o /tmp/libinsncount.so tools/qemu_insn_plugin/insn_count.c @@ -85,13 +95,20 @@ jobs: uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5 with: path: ~/hexagon - key: ${{ env.HEXAGON_TOOLCHAIN_URL }}-verified-1 + # Same digest-keyed name as ci.yml's hexagon jobs; the download + # below verifies the same pin before anything is saved under it. + key: hexagon-toolchain-${{ env.HEXAGON_TOOLCHAIN_SHA256 }}-1 - name: Measure Hexagon run: | if [ "${{ steps.cache.outputs.cache-hit }}" != "true" ]; then mkdir -p ~/hexagon && cd ~/hexagon curl -sfLo toolchain.tar.zst "$HEXAGON_TOOLCHAIN_URL" + actual=$(sha256sum toolchain.tar.zst | cut -d' ' -f1) + if [ "$actual" != "$HEXAGON_TOOLCHAIN_SHA256" ]; then + echo "::error::toolchain checksum mismatch against pinned value" + exit 1 + fi tar --zstd -xf toolchain.tar.zst && rm toolchain.tar.zst cd "$GITHUB_WORKSPACE" fi diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt index f7c8f31..dd4dbd7 100644 --- a/bench/CMakeLists.txt +++ b/bench/CMakeLists.txt @@ -6,7 +6,8 @@ set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "" FORCE) FetchContent_Declare( googlebenchmark GIT_REPOSITORY https://github.com/google/benchmark.git - GIT_TAG v1.9.1) + # Commit pin, not the movable tag: tags can be re-pointed upstream. + GIT_TAG c58e6d0710581e3a08d65c349664128a8d9a2461) # v1.9.1 FetchContent_MakeAvailable(googlebenchmark) add_executable(srt_bench bench_asrc.cpp) diff --git a/docs/PERFORMANCE.md b/docs/PERFORMANCE.md index deb5a8d..0f6e8c0 100644 --- a/docs/PERFORMANCE.md +++ b/docs/PERFORMANCE.md @@ -65,10 +65,13 @@ baseline lands and revised deliberately. Stop when any of: ## Regression prevention -- **Deterministic ratchet (CI-gated)**: the QEMU instruction-count benches - compare against a checked-in `bench/baselines.json`; a PR fails if any - metric regresses > 3%. Improvements update the file *in the diff* — - reviewable, with history in git. +- **Deterministic ratchet (CI-gated, two-sided)**: the QEMU + instruction-count benches compare against a checked-in + `bench/baselines.json`; a PR fails if any metric moves more than 3% in + *either* direction. Regressions are rejected; improvements beyond + tolerance also fail until the baseline is re-recorded (`icount.py + --update`) *in the diff* — otherwise the stale slack would let later + regressions hide. Reviewable, with history in git. Mechanics: `bench/icount/` builds one fixed-workload binary per scenario (no argv on bare metal); `tools/qemu_insn_plugin/` is the counting @@ -95,6 +98,14 @@ from `bench/baselines.json`, and the icount-ratchet CI job regenerates it and fails on any diff — those published numbers cannot go stale. The SNR table is already enforced by test thresholds. +## Known debt + +- **MSVC /W4 triage outstanding**: the Windows CI leg builds with + `SRT_WERROR=OFF` until the /W4 output has been triaged (ci.yml carries + the matching comment). +- **Tail-latency benchmark not implemented**: the Metrics table promises + p99/max per-call `pull(128)` timing; no benchmark measures it yet. + ## Sequencing & status - [x] **PR A** — this document, Google Benchmark infrastructure diff --git a/platform/armv8m_startup.c b/platform/armv8m_startup.c index eba0f05..a0007e8 100644 --- a/platform/armv8m_startup.c +++ b/platform/armv8m_startup.c @@ -33,6 +33,7 @@ extern "C" { extern uint32_t __bss_start__, __bss_end__; extern uint32_t __stack_top; +extern uint32_t __stack_limit; extern char __heap_start__, __heap_end__; extern void __libc_init_array(void); @@ -101,6 +102,11 @@ uint64_t __atomic_exchange_8(volatile void* ptr, uint64_t value, int memorder) { } void Reset_Handler(void) { + /* MSPLIM exists on Armv8-M Mainline only (both targets are M33/M55 + * class): a main-stack overflow past __stack_limit raises a fault + * instead of silently corrupting whatever sits below the stack. */ + __asm volatile("msr msplim, %0" ::"r"(&__stack_limit)); + /* Grant full access to CP10/CP11 (scalar FPU + MVE) first: code below * may legitimately use FP registers once newlib is involved. */ volatile uint32_t* const cpacr = (volatile uint32_t*)0xE000ED88u; @@ -120,15 +126,23 @@ void Default_Handler(void) { } } +void HardFault_Handler(void) { + /* Distinct park loop so a HardFault (e.g. MSPLIM violation escalation) + * is distinguishable from other parked vectors under a debugger. */ + __asm volatile("bkpt #0"); + for (;;) { + } +} + __attribute__((section(".vectors"), used)) static const uintptr_t vectors[16] = { (uintptr_t)&__stack_top, (uintptr_t)&Reset_Handler, - (uintptr_t)&Default_Handler, /* NMI */ - (uintptr_t)&Default_Handler, /* HardFault */ - (uintptr_t)&Default_Handler, /* MemManage */ - (uintptr_t)&Default_Handler, /* BusFault */ - (uintptr_t)&Default_Handler, /* UsageFault */ - (uintptr_t)&Default_Handler, /* SecureFault */ + (uintptr_t)&Default_Handler, /* NMI */ + (uintptr_t)&HardFault_Handler, /* HardFault */ + (uintptr_t)&Default_Handler, /* MemManage */ + (uintptr_t)&Default_Handler, /* BusFault */ + (uintptr_t)&Default_Handler, /* UsageFault */ + (uintptr_t)&Default_Handler, /* SecureFault */ 0, 0, 0, diff --git a/platform/mps2_an505/mps2_an505.ld b/platform/mps2_an505/mps2_an505.ld index bc4463c..d90299f 100644 --- a/platform/mps2_an505/mps2_an505.ld +++ b/platform/mps2_an505/mps2_an505.ld @@ -79,6 +79,10 @@ SECTIONS __heap_end__ = .; } > DATA + /* MSPLIM (set in Reset_Handler): the stack may descend to the heap cap + * but no further — overflow into the heap faults instead of corrupting. */ + __stack_limit = __heap_end__; + /* librdimon's (unused, weak) _sbrk references `end`; satisfy it. */ PROVIDE(end = __heap_start__); } diff --git a/platform/mps3_an547/mps3_an547.ld b/platform/mps3_an547/mps3_an547.ld index 5be848e..a0a4d40 100644 --- a/platform/mps3_an547/mps3_an547.ld +++ b/platform/mps3_an547/mps3_an547.ld @@ -17,6 +17,9 @@ MEMORY } __stack_top = ORIGIN(DTCM) + LENGTH(DTCM); +/* MSPLIM (set in Reset_Handler): the stack owns all of DTCM, so the lowest + * address it may legally reach is the region base. */ +__stack_limit = ORIGIN(DTCM); ENTRY(Reset_Handler) diff --git a/scripts/icount.py b/scripts/icount.py index 0d27575..26856cd 100644 --- a/scripts/icount.py +++ b/scripts/icount.py @@ -4,12 +4,13 @@ Runs every srt_icount_* binary in a build directory under QEMU with the instruction-counting plugin, then compares against bench/baselines.json. - icount.py --target {hexagon,m55} --build-dir DIR --plugin LIB [--update] + icount.py --target {hexagon,m55,m33} --build-dir DIR --plugin LIB [--update] [--baselines bench/baselines.json] [--tolerance 0.03] -Exit nonzero if any scenario regresses beyond tolerance, or has no recorded -baseline (its measured value is printed so it can be committed). --update -rewrites the baselines file with the measured values instead. +The gate is two-sided: exit nonzero if any scenario regresses beyond +tolerance, improves beyond tolerance (the baseline must be re-recorded so +the gate stays tight), or has no recorded baseline. --update rewrites the +target's entry to exactly the measured scenarios instead. """ import argparse import glob @@ -38,8 +39,11 @@ def qemu_cmd(target: str, plugin: str, binary: str) -> list[str]: def measure(target: str, plugin: str, binary: str) -> int: - proc = subprocess.run(qemu_cmd(target, plugin, binary), timeout=3600, - capture_output=True, text=True) + try: + proc = subprocess.run(qemu_cmd(target, plugin, binary), timeout=600, + capture_output=True, text=True) + except subprocess.TimeoutExpired: + raise SystemExit(f"{binary}: timed out after 600 s under QEMU") out = proc.stdout + proc.stderr if "SRT_ICOUNT_DONE ok=1" not in out: print(out, file=sys.stderr) @@ -69,17 +73,22 @@ def main() -> int: path = pathlib.Path(args.baselines) baselines = json.loads(path.read_text()) if path.exists() else {} - base = baselines.setdefault(args.target, {}) + base = baselines.get(args.target, {}) failures = [] + measured = {} for binary in binaries: scenario = os.path.basename(binary).removeprefix("srt_icount_") count = measure(args.target, args.plugin, binary) + measured[scenario] = count recorded = base.get(scenario) if recorded is None: print(f"{scenario}: {count} insns (NO BASELINE — commit this value)") if not args.update: failures.append(scenario) + elif recorded == 0: + print(f"{scenario}: {count} insns vs baseline 0 (INVALID BASELINE)") + failures.append(scenario) else: delta = (count - recorded) / recorded verdict = "ok" @@ -87,13 +96,19 @@ def main() -> int: verdict = "REGRESSION" failures.append(scenario) elif delta < -args.tolerance: - verdict = "improved — update the baseline" + # Two-sided: a stale (too-high) baseline would let future + # regressions hide inside the slack, so improvements must be + # committed too. + verdict = ("IMPROVED beyond tolerance — run icount.py --update " + "and commit bench/baselines.json") + failures.append(scenario) print(f"{scenario}: {count} insns vs baseline {recorded} " f"({delta:+.2%}) {verdict}") - if args.update: - base[scenario] = count if args.update: + # Exactly the measured scenarios: stale keys for renamed/removed + # workloads must not linger as dead gate entries. + baselines[args.target] = measured path.write_text(json.dumps(baselines, indent=2, sort_keys=True) + "\n") print(f"updated {path}") return 0 diff --git a/scripts/update_perf_docs.py b/scripts/update_perf_docs.py index b69e39d..43389b6 100644 --- a/scripts/update_perf_docs.py +++ b/scripts/update_perf_docs.py @@ -36,6 +36,13 @@ def run_bench(bench: str) -> list[dict]: def table(rows: list[dict]) -> str: + usable = [b for b in rows + if b.get("run_type") == "iteration" and b.get("items_per_second")] + if not usable: + # An empty or items_per_second-less run would silently blank the + # README table. + raise SystemExit("benchmark produced no usable rows " + "(empty list or missing items_per_second)") lines = [ f"Indicative numbers from a shared machine ({cpu_name()}, " f"{datetime.date.today().isoformat()}); regenerate with " diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 999e5f2..9d94811 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -22,7 +22,8 @@ endif() FetchContent_Declare( googletest GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG v1.14.0 + # Commit pin, not the movable tag: tags can be re-pointed upstream. + GIT_TAG f8d7d77c06936315286eb55f8de22cd23c188571 # v1.14.0 FIND_PACKAGE_ARGS NAMES GTest) set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) FetchContent_MakeAvailable(googletest) diff --git a/tests/bare_metal_main.cpp b/tests/bare_metal_main.cpp index fd909db..5c4a554 100644 --- a/tests/bare_metal_main.cpp +++ b/tests/bare_metal_main.cpp @@ -21,6 +21,17 @@ int main() { "MultiChannel.*"; ::testing::InitGoogleTest(); const int rc = RUN_ALL_TESTS(); + // A filter typo selects zero tests and RUN_ALL_TESTS() returns 0 — an + // empty run must not pass green. Checked after the run because gtest + // only applies the filter inside RUN_ALL_TESTS (the count reads 0 + // before it). The on-target selection is ~20 tests; 15 leaves headroom + // for legitimate removals without masking a typo. + const int selected = ::testing::UnitTest::GetInstance()->test_to_run_count(); + if (selected < 15) { + std::printf("only %d tests selected (expected >= 15): filter is broken\n", selected); + std::printf("SRT_TESTS_COMPLETE rc=1\n"); + return 1; + } // CTest's pass criterion: printed only if we get all the way here, so a // crash after gtest's summary cannot register as a pass. std::printf("SRT_TESTS_COMPLETE rc=%d\n", rc);