div0rce · div0rce · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026 · codescene-delta-analysis
diff --git a/CMakePresets.json b/CMakePresets.json
@@ -31,6 +31,15 @@
         "QSL_BUILD_BENCHMARKS": "ON"
       }
     },
+    {
+      "name": "flamegraph",
+      "displayName": "Flamegraph profiling (frame pointers + debug info)",
+      "inherits": "bench",
+      "binaryDir": "${sourceDir}/build/flamegraph",
+      "cacheVariables": {
+        "CMAKE_CXX_FLAGS": "-fno-omit-frame-pointer -g"
+      }
+    },
     {
       "name": "asan",
       "displayName": "ASan",
@@ -56,6 +65,7 @@
     { "name": "dev", "configurePreset": "dev" },
     { "name": "release", "configurePreset": "release" },
     { "name": "bench", "configurePreset": "bench" },
+    { "name": "flamegraph", "configurePreset": "flamegraph" },
     { "name": "asan", "configurePreset": "asan" },
     { "name": "tsan", "configurePreset": "tsan" }
   ],

diff --git a/Makefile b/Makefile
@@ -64,11 +64,13 @@ perf-record:
 	QSL_BENCH_BIN=build/bench/qsl-bench bash scripts/perf_record.sh
 
 # Issue #32: render a perf call-graph flamegraph (SVG) from the benchmark harness. Linux-only.
+# Uses the dedicated frame-pointer build (build/flamegraph) so `perf --call-graph fp` yields clean,
+# fully-symbolized stacks (no [unknown] gaps); the latency `bench` build stays untouched.
 flamegraph:
 	@test "$$(uname -s)" = "Linux" || { echo "error: make flamegraph requires Linux perf; current OS is $$(uname -s)." >&2; exit 2; }
-	cmake --preset bench
-	cmake --build --preset bench --target qsl-bench
-	QSL_BENCH_BIN=build/bench/qsl-bench bash scripts/flamegraph.sh
+	cmake --preset flamegraph
+	cmake --build --preset flamegraph --target qsl-bench
+	QSL_BENCH_BIN=build/flamegraph/qsl-bench bash scripts/flamegraph.sh
 
 # M43: CPU-affinity / scheduler-migration / NUMA locality study. Linux-only.
 numa-study:

diff --git a/README.md b/README.md
@@ -111,17 +111,18 @@ the core numbers above.
 
 ### Flamegraph
 
-Where on-CPU time goes in the `qsl-bench` synthetic suite, rendered by `make flamegraph`
+Where on-CPU time goes in the matching engine, rendered by `make flamegraph`
 (`scripts/flamegraph.sh` → the dependency-free `scripts/flamegraph.py` — no external FlameGraph
-toolchain):
+toolchain). It records `perf --call-graph fp` against a dedicated frame-pointer build
+(`build/flamegraph`) while `qsl-bench profile` drives a warm, bounded order flow for 5s, so the
+capture is dense (~20k samples) and stacks are fully symbolized — no `[unknown]` frames:
 
-[![qsl-bench cpu-clock flamegraph](results/flamegraph.svg)](results/flamegraph.svg)
+[![qsl-bench matching-engine flamegraph](results/flamegraph.svg)](results/flamegraph.svg)
 
 This is a **software cpu-clock sampling** hot-symbol profile, **not** PMU evidence: frame width is
-proportional to on-CPU samples (329 folded across 159 stacks on this run), not wall-clock latency or
-throughput, and it is hardware/kernel/compiler/build dependent. The hot frames are protocol
-`decode_new_order`, gateway session framing, `MatchingEngine::new_limit`, and order-book
-cancel/allocation. Provenance and classification are in
+proportional to on-CPU samples, not wall-clock latency or throughput, and it is
+hardware/kernel/compiler/build dependent. The hot frames are `MatchingEngine::new_limit`/`cancel`,
+the order-book level/index operations, and the allocator. Provenance and classification are in
 [`results/flamegraph.txt`](results/flamegraph.txt); methodology in
 [docs/perf_analysis.md](docs/perf_analysis.md). GitHub renders the SVG statically; download the raw
 file for interactive zoom and search.

diff --git a/apps/qsl-bench/main.cpp b/apps/qsl-bench/main.cpp
@@ -13,7 +13,9 @@
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
+#include <cstdlib>
 #include <string>
+#include <string_view>
 #include <vector>
 
 namespace qsl::bench {
@@ -103,6 +105,113 @@ void run_diff_benchmarks() {
     }
 }
 
+const char *storage_name(qsl::engine::OrderBook::Storage s) {
+    switch (s) {
+    case qsl::engine::OrderBook::Storage::Baseline:
+        return "baseline";
+    case qsl::engine::OrderBook::Storage::Pooled:
+        return "pooled";
+    case qsl::engine::OrderBook::Storage::IntrusivePooled:
+        return "intrusive";
+    case qsl::engine::OrderBook::Storage::Contiguous:
+        return "contiguous";
+    }
+    return "baseline";
+}
+
+// QSL_BENCH_STORAGE lets the profiling workload A/B the order-book storage mode without rebuilding.
+qsl::engine::OrderBook::Storage profile_storage_from_env() {
+    const char *s = std::getenv("QSL_BENCH_STORAGE");
+    const std::string_view v = (s != nullptr) ? s : "";
+    if (v == "pooled") {
+        return qsl::engine::OrderBook::Storage::Pooled;
+    }
+    if (v == "intrusive") {
+        return qsl::engine::OrderBook::Storage::IntrusivePooled;
+    }
+    if (v == "contiguous") {
+        return qsl::engine::OrderBook::Storage::Contiguous;
+    }
+    return qsl::engine::OrderBook::Storage::Baseline;
+}
+
+// Long-running, warm, deterministic profiling workload for `make flamegraph`. Drives a bounded
+// steady-state order flow (add / cross / cancel / modify) through the matching engine for a
+// wall-clock duration, so perf collects a dense sample set on a stable working set rather than the
+// ~80ms one-shot benchmark suite. Wall-clock is fine here: this is the benchmark layer, never the
+// deterministic engine path. The book stays ~W deep (cancel-oldest), keeping a pooled allocator
+// warm so steady state issues no malloc/free.
+void run_profile_workload(int argc, char **argv) {
+    using namespace qsl;
+    using core::Side;
+    using core::TimeInForce;
+
+    double seconds = 5.0;
+    if (argc >= 3) {
+        seconds = std::strtod(argv[2], nullptr);
+    } else if (const char *e = std::getenv("QSL_BENCH_PROFILE_SECONDS")) {
+        seconds = std::strtod(e, nullptr);
+    }
+    if (!(seconds > 0.0)) {
+        seconds = 5.0;
+    }
+
+    const auto storage = profile_storage_from_env();
+    engine::MatchingEngine eng{storage};
+    const auto sym = eng.register_symbol("AAPL");
+
+    constexpr std::size_t kRing = 512; // bounded resting depth
+    std::vector<core::OrderId> ring;
+    ring.reserve(kRing);
+    std::size_t head = 0;
+
+    // splitmix64 keeps the flow reproducible across runs/compilers without <random> overhead.
+    std::uint64_t state = 0x9E3779B97F4A7C15ULL;
+    const auto next = [&state] {
+        state += 0x9E3779B97F4A7C15ULL;
+        std::uint64_t z = state;
+        z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL;
+        z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL;
+        return z ^ (z >> 31);
+    };
+
+    core::OrderId id = 1;
+    std::uint64_t ops = 0;
+    const auto t0 = clock_type::now();
+    const auto deadline = t0 + std::chrono::duration_cast<clock_type::duration>(
+                                   std::chrono::duration<double>(seconds));
+    while (clock_type::now() < deadline) {
+        for (int k = 0; k < 4096; ++k) { // batch between clock reads
+            const std::uint64_t r = next();
+            const Side side = ((r & 1U) != 0U) ? Side::Buy : Side::Sell;
+            const core::Price price = 100 + static_cast<core::Price>((r >> 1) % 64); // [100,164)
+            const auto qty = 1 + static_cast<core::Quantity>((r >> 8) % 8);
+            const core::OrderId oid = id++;
+            g_sink += eng.new_limit(sym, oid, side, price, qty, TimeInForce::GTC).size();
+            if (ring.size() < kRing) {
+                ring.push_back(oid);
+            } else {
+                g_sink += eng.cancel(sym, ring[head]).size();
+                ring[head] = oid;
+                head = (head + 1) % kRing;
+            }
+            if ((r & 7U) == 0U && !ring.empty()) {
+                const core::OrderId mid = ring[(head + (ring.size() / 2)) % ring.size()];
+                g_sink += eng.modify(sym, mid, price, qty).size();
+            }
+            if ((r & 15U) == 0U) {
+                g_sink +=
+                    eng.new_market(sym, id++, ((r & 2U) != 0U) ? Side::Sell : Side::Buy, 3).size();
+            }
+            ++ops;
+        }
+    }
+    const double secs = std::chrono::duration<double>(clock_type::now() - t0).count();
+    std::printf("profile workload: storage=%s ops=%llu elapsed=%.3fs (%.0f ops/sec) resting~%zu\n",
+                storage_name(storage), static_cast<unsigned long long>(ops), secs,
+                static_cast<double>(ops) / secs, ring.size());
+}
+
 // Run a named benchmark subcommand (argv[1]); returns true if one matched and ran, so main exits.
 bool run_subcommand(int argc, char **argv) {
     if (argc < 2) {
@@ -111,6 +220,8 @@ bool run_subcommand(int argc, char **argv) {
     const std::string command = argv[1];
     if (command == "diff") {
         run_diff_benchmarks();
+    } else if (command == "profile") {
+        run_profile_workload(argc, argv);
     } else if (command == "pool") {
         qsl::bench::run_order_pool_benchmarks();
     } else if (command == "storage") {

diff --git a/docs/perf_analysis.md b/docs/perf_analysis.md
@@ -62,12 +62,29 @@ make flamegraph
 ```
 
 This runs `scripts/flamegraph.sh`, which records call-graph samples
-(`perf record --call-graph dwarf -F 4000 -g -e cpu-clock`), folds them, and renders an SVG to
-`results/flamegraph.svg` plus a text companion `results/flamegraph.txt` (provenance, classification,
-and the top folded stacks). DWARF call graphs are used so stacks unwind correctly even though the
-`bench` (Release) preset omits frame pointers — the application symbols (`OrderBook::add_limit`,
-`MatchingEngine::new_limit`, the replay path, …) resolve from the symbol table without changing the
-optimization level under measurement.
+(`perf record --call-graph fp -F 4000 -g -e cpu-clock`) against a dedicated **frame-pointer build**
+(the `flamegraph` CMake preset → `build/flamegraph`, which adds `-fno-omit-frame-pointer -g` on top
+of the Release `bench` flags) while `qsl-bench profile` drives a long, warm, bounded order flow. It
+folds the samples and renders an SVG to `results/flamegraph.svg` plus a text companion
+`results/flamegraph.txt` (provenance, classification, and the top folded stacks). Frame-pointer
+unwinding is used because it produces clean, fully-symbolized stacks: the earlier DWARF default left
+`[unknown]` gaps (the Release build omits frame pointers and DWARF unwinding truncated deep stacks).
+The dedicated build keeps the latency `bench` numbers in `results/latest.txt` untouched — they come
+from the unmodified Release `bench` preset.
+
+Two design points address common flamegraph problems:
+
+- **Sample density and duration.** The artifact profiles `qsl-bench profile [seconds]` — a warm,
+  bounded, deterministic steady-state order flow (add / cross / cancel / modify, book held ~512
+  deep) — for 5s by default, so the capture carries tens of thousands of samples instead of the
+  ~80ms (~329-sample) one-shot benchmark suite. `QSL_FLAMEGRAPH_SECONDS` tunes the duration and
+  `QSL_BENCH_STORAGE={baseline,pooled,intrusive,contiguous}` selects the order-book storage mode.
+- **No `[unknown]` frames.** Frame-pointer unwinding resolves the whole application and C-runtime
+  startup chain. The one residual unresolvable frame is the glibc allocator boundary (Fedora's libc
+  is built without frame pointers), so `flamegraph.py` folds a lone `[unknown]` frame into its
+  caller by default — the sample is preserved and the real neighbours (the app frame and the named
+  libc symbol such as `cfree` / `operator new`) stay in the stack. Pass `--keep-unknown` to disable
+  the fold.
 
 The folding and SVG rendering live in `scripts/flamegraph.py`, a dependency-free Python script
 (standard library only) that reimplements the `stackcollapse` + flamegraph data model rather than

diff --git a/include/qsl/engine/matching_engine.hpp b/include/qsl/engine/matching_engine.hpp
@@ -59,12 +59,17 @@ class MatchingEngine {
     SymbolId register_symbol(std::string_view name);
     [[nodiscard]] std::optional<SymbolId> symbol_id(std::string_view name) const;
 
-    std::vector<EngineEvent> new_limit(SymbolId symbol, OrderId id, Side side, Price price,
-                                       Quantity quantity, TimeInForce tif);
-    std::vector<EngineEvent> new_market(SymbolId symbol, OrderId id, Side side, Quantity quantity);
-    std::vector<EngineEvent> cancel(SymbolId symbol, OrderId id);
-    std::vector<EngineEvent> modify(SymbolId symbol, OrderId id, Price new_price,
-                                    Quantity new_quantity);
+    // Mutators return a reference to a per-engine scratch buffer that is reused (and cleared) on
+    // every mutating call, so the hot path issues no per-operation heap allocation. The reference
+    // is valid only until the next mutating call on this engine; consumers that must retain events
+    // (the gateway result, replay accumulation) copy out, which they already do by owning a vector.
+    const std::vector<EngineEvent> &new_limit(SymbolId symbol, OrderId id, Side side, Price price,
+                                              Quantity quantity, TimeInForce tif);
+    const std::vector<EngineEvent> &new_market(SymbolId symbol, OrderId id, Side side,
+                                               Quantity quantity);
+    const std::vector<EngineEvent> &cancel(SymbolId symbol, OrderId id);
+    const std::vector<EngineEvent> &modify(SymbolId symbol, OrderId id, Price new_price,
+                                           Quantity new_quantity);
 
     [[nodiscard]] SeqNo last_seq() const noexcept { return seq_; }
     [[nodiscard]] EngineSnapshot snapshot() const;
@@ -91,10 +96,15 @@ class MatchingEngine {
     OrderBook *find_book(SymbolId symbol) noexcept;
     SeqNo next_seq() noexcept { return ++seq_; }
 
+    // Reused across mutating calls so the event stream needs no per-operation allocation; cleared
+    // (capacity retained) at the start of each mutator. Returned by const reference (see mutators).
+    std::vector<EngineEvent> &reset_events();
+
     SymbolRegistry registry_;
     std::map<SymbolId, OrderBook> books_; // ordered -> deterministic snapshot
     SeqNo seq_{0};
     OrderBook::Storage book_storage_{OrderBook::Storage::Baseline};
+    std::vector<EngineEvent> events_; // mutator scratch buffer (reused; see mutators)
 };
 
 } // namespace qsl::engine
diff --git a/results/flamegraph.svg b/results/flamegraph.svg