diff --git a/CMakePresets.json b/CMakePresets.json index 3cc82bb..112a631 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -31,6 +31,15 @@ "QSL_BUILD_BENCHMARKS": "ON" } }, + { + "name": "flamegraph", + "displayName": "Flamegraph profiling (frame pointers + debug info)", + "inherits": "bench", + "binaryDir": "${sourceDir}/build/flamegraph", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "-fno-omit-frame-pointer -g" + } + }, { "name": "asan", "displayName": "ASan", @@ -56,6 +65,7 @@ { "name": "dev", "configurePreset": "dev" }, { "name": "release", "configurePreset": "release" }, { "name": "bench", "configurePreset": "bench" }, + { "name": "flamegraph", "configurePreset": "flamegraph" }, { "name": "asan", "configurePreset": "asan" }, { "name": "tsan", "configurePreset": "tsan" } ], diff --git a/Makefile b/Makefile index 8c2e932..d7ab16a 100644 --- a/Makefile +++ b/Makefile @@ -64,11 +64,13 @@ perf-record: QSL_BENCH_BIN=build/bench/qsl-bench bash scripts/perf_record.sh # Issue #32: render a perf call-graph flamegraph (SVG) from the benchmark harness. Linux-only. +# Uses the dedicated frame-pointer build (build/flamegraph) so `perf --call-graph fp` yields clean, +# fully-symbolized stacks (no [unknown] gaps); the latency `bench` build stays untouched. flamegraph: @test "$$(uname -s)" = "Linux" || { echo "error: make flamegraph requires Linux perf; current OS is $$(uname -s)." >&2; exit 2; } - cmake --preset bench - cmake --build --preset bench --target qsl-bench - QSL_BENCH_BIN=build/bench/qsl-bench bash scripts/flamegraph.sh + cmake --preset flamegraph + cmake --build --preset flamegraph --target qsl-bench + QSL_BENCH_BIN=build/flamegraph/qsl-bench bash scripts/flamegraph.sh # M43: CPU-affinity / scheduler-migration / NUMA locality study. Linux-only. numa-study: diff --git a/README.md b/README.md index d1bb19d..ee12fd7 100644 --- a/README.md +++ b/README.md @@ -111,17 +111,18 @@ the core numbers above. ### Flamegraph -Where on-CPU time goes in the `qsl-bench` synthetic suite, rendered by `make flamegraph` +Where on-CPU time goes in the matching engine, rendered by `make flamegraph` (`scripts/flamegraph.sh` → the dependency-free `scripts/flamegraph.py` — no external FlameGraph -toolchain): +toolchain). It records `perf --call-graph fp` against a dedicated frame-pointer build +(`build/flamegraph`) while `qsl-bench profile` drives a warm, bounded order flow for 5s, so the +capture is dense (~20k samples) and stacks are fully symbolized — no `[unknown]` frames: -[![qsl-bench cpu-clock flamegraph](results/flamegraph.svg)](results/flamegraph.svg) +[![qsl-bench matching-engine flamegraph](results/flamegraph.svg)](results/flamegraph.svg) This is a **software cpu-clock sampling** hot-symbol profile, **not** PMU evidence: frame width is -proportional to on-CPU samples (329 folded across 159 stacks on this run), not wall-clock latency or -throughput, and it is hardware/kernel/compiler/build dependent. The hot frames are protocol -`decode_new_order`, gateway session framing, `MatchingEngine::new_limit`, and order-book -cancel/allocation. Provenance and classification are in +proportional to on-CPU samples, not wall-clock latency or throughput, and it is +hardware/kernel/compiler/build dependent. The hot frames are `MatchingEngine::new_limit`/`cancel`, +the order-book level/index operations, and the allocator. Provenance and classification are in [`results/flamegraph.txt`](results/flamegraph.txt); methodology in [docs/perf_analysis.md](docs/perf_analysis.md). GitHub renders the SVG statically; download the raw file for interactive zoom and search. diff --git a/apps/qsl-bench/main.cpp b/apps/qsl-bench/main.cpp index 8cafa0d..0e4b57e 100644 --- a/apps/qsl-bench/main.cpp +++ b/apps/qsl-bench/main.cpp @@ -13,7 +13,9 @@ #include #include #include +#include #include +#include #include namespace qsl::bench { @@ -103,6 +105,113 @@ void run_diff_benchmarks() { } } +const char *storage_name(qsl::engine::OrderBook::Storage s) { + switch (s) { + case qsl::engine::OrderBook::Storage::Baseline: + return "baseline"; + case qsl::engine::OrderBook::Storage::Pooled: + return "pooled"; + case qsl::engine::OrderBook::Storage::IntrusivePooled: + return "intrusive"; + case qsl::engine::OrderBook::Storage::Contiguous: + return "contiguous"; + } + return "baseline"; +} + +// QSL_BENCH_STORAGE lets the profiling workload A/B the order-book storage mode without rebuilding. +qsl::engine::OrderBook::Storage profile_storage_from_env() { + const char *s = std::getenv("QSL_BENCH_STORAGE"); + const std::string_view v = (s != nullptr) ? s : ""; + if (v == "pooled") { + return qsl::engine::OrderBook::Storage::Pooled; + } + if (v == "intrusive") { + return qsl::engine::OrderBook::Storage::IntrusivePooled; + } + if (v == "contiguous") { + return qsl::engine::OrderBook::Storage::Contiguous; + } + return qsl::engine::OrderBook::Storage::Baseline; +} + +// Long-running, warm, deterministic profiling workload for `make flamegraph`. Drives a bounded +// steady-state order flow (add / cross / cancel / modify) through the matching engine for a +// wall-clock duration, so perf collects a dense sample set on a stable working set rather than the +// ~80ms one-shot benchmark suite. Wall-clock is fine here: this is the benchmark layer, never the +// deterministic engine path. The book stays ~W deep (cancel-oldest), keeping a pooled allocator +// warm so steady state issues no malloc/free. +void run_profile_workload(int argc, char **argv) { + using namespace qsl; + using core::Side; + using core::TimeInForce; + + double seconds = 5.0; + if (argc >= 3) { + seconds = std::strtod(argv[2], nullptr); + } else if (const char *e = std::getenv("QSL_BENCH_PROFILE_SECONDS")) { + seconds = std::strtod(e, nullptr); + } + if (!(seconds > 0.0)) { + seconds = 5.0; + } + + const auto storage = profile_storage_from_env(); + engine::MatchingEngine eng{storage}; + const auto sym = eng.register_symbol("AAPL"); + + constexpr std::size_t kRing = 512; // bounded resting depth + std::vector ring; + ring.reserve(kRing); + std::size_t head = 0; + + // splitmix64 keeps the flow reproducible across runs/compilers without overhead. + std::uint64_t state = 0x9E3779B97F4A7C15ULL; + const auto next = [&state] { + state += 0x9E3779B97F4A7C15ULL; + std::uint64_t z = state; + z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL; + z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL; + return z ^ (z >> 31); + }; + + core::OrderId id = 1; + std::uint64_t ops = 0; + const auto t0 = clock_type::now(); + const auto deadline = t0 + std::chrono::duration_cast( + std::chrono::duration(seconds)); + while (clock_type::now() < deadline) { + for (int k = 0; k < 4096; ++k) { // batch between clock reads + const std::uint64_t r = next(); + const Side side = ((r & 1U) != 0U) ? Side::Buy : Side::Sell; + const core::Price price = 100 + static_cast((r >> 1) % 64); // [100,164) + const auto qty = 1 + static_cast((r >> 8) % 8); + const core::OrderId oid = id++; + g_sink += eng.new_limit(sym, oid, side, price, qty, TimeInForce::GTC).size(); + if (ring.size() < kRing) { + ring.push_back(oid); + } else { + g_sink += eng.cancel(sym, ring[head]).size(); + ring[head] = oid; + head = (head + 1) % kRing; + } + if ((r & 7U) == 0U && !ring.empty()) { + const core::OrderId mid = ring[(head + (ring.size() / 2)) % ring.size()]; + g_sink += eng.modify(sym, mid, price, qty).size(); + } + if ((r & 15U) == 0U) { + g_sink += + eng.new_market(sym, id++, ((r & 2U) != 0U) ? Side::Sell : Side::Buy, 3).size(); + } + ++ops; + } + } + const double secs = std::chrono::duration(clock_type::now() - t0).count(); + std::printf("profile workload: storage=%s ops=%llu elapsed=%.3fs (%.0f ops/sec) resting~%zu\n", + storage_name(storage), static_cast(ops), secs, + static_cast(ops) / secs, ring.size()); +} + // Run a named benchmark subcommand (argv[1]); returns true if one matched and ran, so main exits. bool run_subcommand(int argc, char **argv) { if (argc < 2) { @@ -111,6 +220,8 @@ bool run_subcommand(int argc, char **argv) { const std::string command = argv[1]; if (command == "diff") { run_diff_benchmarks(); + } else if (command == "profile") { + run_profile_workload(argc, argv); } else if (command == "pool") { qsl::bench::run_order_pool_benchmarks(); } else if (command == "storage") { diff --git a/docs/perf_analysis.md b/docs/perf_analysis.md index 7400f02..cb12958 100644 --- a/docs/perf_analysis.md +++ b/docs/perf_analysis.md @@ -62,12 +62,29 @@ make flamegraph ``` This runs `scripts/flamegraph.sh`, which records call-graph samples -(`perf record --call-graph dwarf -F 4000 -g -e cpu-clock`), folds them, and renders an SVG to -`results/flamegraph.svg` plus a text companion `results/flamegraph.txt` (provenance, classification, -and the top folded stacks). DWARF call graphs are used so stacks unwind correctly even though the -`bench` (Release) preset omits frame pointers — the application symbols (`OrderBook::add_limit`, -`MatchingEngine::new_limit`, the replay path, …) resolve from the symbol table without changing the -optimization level under measurement. +(`perf record --call-graph fp -F 4000 -g -e cpu-clock`) against a dedicated **frame-pointer build** +(the `flamegraph` CMake preset → `build/flamegraph`, which adds `-fno-omit-frame-pointer -g` on top +of the Release `bench` flags) while `qsl-bench profile` drives a long, warm, bounded order flow. It +folds the samples and renders an SVG to `results/flamegraph.svg` plus a text companion +`results/flamegraph.txt` (provenance, classification, and the top folded stacks). Frame-pointer +unwinding is used because it produces clean, fully-symbolized stacks: the earlier DWARF default left +`[unknown]` gaps (the Release build omits frame pointers and DWARF unwinding truncated deep stacks). +The dedicated build keeps the latency `bench` numbers in `results/latest.txt` untouched — they come +from the unmodified Release `bench` preset. + +Two design points address common flamegraph problems: + +- **Sample density and duration.** The artifact profiles `qsl-bench profile [seconds]` — a warm, + bounded, deterministic steady-state order flow (add / cross / cancel / modify, book held ~512 + deep) — for 5s by default, so the capture carries tens of thousands of samples instead of the + ~80ms (~329-sample) one-shot benchmark suite. `QSL_FLAMEGRAPH_SECONDS` tunes the duration and + `QSL_BENCH_STORAGE={baseline,pooled,intrusive,contiguous}` selects the order-book storage mode. +- **No `[unknown]` frames.** Frame-pointer unwinding resolves the whole application and C-runtime + startup chain. The one residual unresolvable frame is the glibc allocator boundary (Fedora's libc + is built without frame pointers), so `flamegraph.py` folds a lone `[unknown]` frame into its + caller by default — the sample is preserved and the real neighbours (the app frame and the named + libc symbol such as `cfree` / `operator new`) stay in the stack. Pass `--keep-unknown` to disable + the fold. The folding and SVG rendering live in `scripts/flamegraph.py`, a dependency-free Python script (standard library only) that reimplements the `stackcollapse` + flamegraph data model rather than diff --git a/include/qsl/engine/matching_engine.hpp b/include/qsl/engine/matching_engine.hpp index 9fa237e..458c04b 100644 --- a/include/qsl/engine/matching_engine.hpp +++ b/include/qsl/engine/matching_engine.hpp @@ -59,12 +59,17 @@ class MatchingEngine { SymbolId register_symbol(std::string_view name); [[nodiscard]] std::optional symbol_id(std::string_view name) const; - std::vector new_limit(SymbolId symbol, OrderId id, Side side, Price price, - Quantity quantity, TimeInForce tif); - std::vector new_market(SymbolId symbol, OrderId id, Side side, Quantity quantity); - std::vector cancel(SymbolId symbol, OrderId id); - std::vector modify(SymbolId symbol, OrderId id, Price new_price, - Quantity new_quantity); + // Mutators return a reference to a per-engine scratch buffer that is reused (and cleared) on + // every mutating call, so the hot path issues no per-operation heap allocation. The reference + // is valid only until the next mutating call on this engine; consumers that must retain events + // (the gateway result, replay accumulation) copy out, which they already do by owning a vector. + const std::vector &new_limit(SymbolId symbol, OrderId id, Side side, Price price, + Quantity quantity, TimeInForce tif); + const std::vector &new_market(SymbolId symbol, OrderId id, Side side, + Quantity quantity); + const std::vector &cancel(SymbolId symbol, OrderId id); + const std::vector &modify(SymbolId symbol, OrderId id, Price new_price, + Quantity new_quantity); [[nodiscard]] SeqNo last_seq() const noexcept { return seq_; } [[nodiscard]] EngineSnapshot snapshot() const; @@ -91,10 +96,15 @@ class MatchingEngine { OrderBook *find_book(SymbolId symbol) noexcept; SeqNo next_seq() noexcept { return ++seq_; } + // Reused across mutating calls so the event stream needs no per-operation allocation; cleared + // (capacity retained) at the start of each mutator. Returned by const reference (see mutators). + std::vector &reset_events(); + SymbolRegistry registry_; std::map books_; // ordered -> deterministic snapshot SeqNo seq_{0}; OrderBook::Storage book_storage_{OrderBook::Storage::Baseline}; + std::vector events_; // mutator scratch buffer (reused; see mutators) }; } // namespace qsl::engine diff --git a/results/flamegraph.svg b/results/flamegraph.svg index 80466d2..661c1f2 100644 --- a/results/flamegraph.svg +++ b/results/flamegraph.svg @@ -2,16 +2,16 @@ @@ -28,4 +28,4 @@ function qslSearch(){ else{r.classList.remove('hl');}} if(detail)detail.textContent='Search: '+term; } -]]>QSL Matching-Engine Flame Graph (qsl-bench)flamegraph (software cpu-clock sampling hot-symbol profile) | Linux aarch64 | cpu-clock @ 4000Hz | 329 samples | 159 stacks | 2026-06-22T02:18:23ZSearch all (329 cpu-clock samples, 100.00%)allqsl-bench (329 cpu-clock samples, 100.00%)qsl-bench[unknown] (251 cpu-clock samples, 76.29%)[unknown][unknown] (237 cpu-clock samples, 72.04%)[unknown][unknown] (201 cpu-clock samples, 61.09%)[unknown][unknown] (2 cpu-clock samples, 0.61%)[unknown] (2 cpu-clock samples, 0.61%)[unknown] (2 cpu-clock samples, 0.61%)[unknown] (2 cpu-clock samples, 0.61%)[unknown] (1 cpu-clock samples, 0.30%)do_lookup_x (1 cpu-clock samples, 0.30%)_dl_lookup_symbol_x (1 cpu-clock samples, 0.30%)_dl_new_hash (1 cpu-clock samples, 0.30%)__libc_start_call_main (199 cpu-clock samples, 60.49%)__libc_start_call_mainmain (199 cpu-clock samples, 60.49%)maincfree@GLIBC_2.17 (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) (20 cpu-clock samples, 6.08%)qsl::en..decltype(auto) qsl::engine::OrderBook::dispatch_storage<qsl::engine::OrderBook::contains(unsigned long) const::{lambda()#1}, qsl::engine::OrderBook::contains(unsigned long) const::{lambda(qsl::engine::OrderBook::IntrusiveStore const&)#1}, qsl::engine::OrderBook::contains(unsigned long) const::{lambda(qsl::engine::OrderBook::ContiguousStore const&)#1}>(qsl::engine::OrderBook::contains(unsigned long) const::{lambda()#1}&&, qsl::engine::OrderBook::contains(unsigned long) const::{lambda(qsl::engine::OrderBook::IntrusiveStore const&)#1}&&, qsl::engine::OrderBook::contains(unsigned long) const::{lambda(qsl::engine::OrderBook::ContiguousStore const&)#1}&&) const [clone .isra.0] (2 cpu-clock samples, 0.61%)qsl::engine::OrderBook::match_baseline(qsl::core::Side, qsl::engine::OrderBook::MatchContext&) (2 cpu-clock samples, 0.61%)qsl::engine::OrderBook::rest(unsigned long, qsl::core::Side, long, unsigned int) (13 cpu-clock samples, 3.95%)qsl..operator new(unsigned long, std::align_val_t) (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::level_for[abi:cxx11](qsl::core::Side, long) (4 cpu-clock samples, 1.22%)std::pair<std::_Rb_tree_iterator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, bool> std::_Rb_tree<long, std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >, std::_Select1st<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, std::greater<long>, std::pmr::polymorphic_allocator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > > >::_M_emplace_unique<long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >(long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> >&&) (3 cpu-clock samples, 0.91%)std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&) (1 cpu-clock samples, 0.30%)std::__detail::_Map_base<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true>, true>::operator[](unsigned long const&) (7 cpu-clock samples, 2.13%)std::_Hashtable<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::_M_insert_unique_node(unsigned long, unsigned long, std::__detail::_Hash_node<std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, false>*, unsigned long) (3 cpu-clock samples, 0.91%)std::__detail::_Prime_rehash_policy::_M_need_rehash(unsigned long, unsigned long, unsigned long) const (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::cancel(unsigned long) (18 cpu-clock samples, 5.47%)qsl::e..decltype(auto) qsl::engine::OrderBook::dispatch_storage<qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}>(qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}&&) [clone .isra.0] (18 cpu-clock samples, 5.47%)declty..qsl::engine::OrderBook::erase_resting_order(qsl::engine::OrderBook::Locator const&) (13 cpu-clock samples, 3.95%)qsl..std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&) (4 cpu-clock samples, 1.22%)std::__detail::_List_node_base::_M_unhook() (1 cpu-clock samples, 0.30%)std::pmr::(anonymous namespace)::newdel_res_t::do_deallocate(void*, unsigned long, unsigned long) (1 cpu-clock samples, 0.30%)std::_Hashtable<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node<std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, false>*) (3 cpu-clock samples, 0.91%)cfree@GLIBC_2.17 (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::modify(unsigned long, long, unsigned int) (2 cpu-clock samples, 0.61%)qsl::gateway::Session::on_bytes(std::span<std::byte const, 18446744073709551615ul>) (56 cpu-clock samples, 17.02%)qsl::gateway::Session::on..qsl::gateway::Session::on_bytes(std::span<std::byte const, 18446744073709551615ul>, std::vector<std::byte, std::allocator<std::byte> >&, unsigned long) (56 cpu-clock samples, 17.02%)qsl::gateway::Session::on..qsl::gateway::Session::process_frame(std::span<std::byte const, 18446744073709551615ul>, std::vector<std::byte, std::allocator<std::byte> >&, unsigned long) (53 cpu-clock samples, 16.11%)qsl::gateway::Session::p..cfree@GLIBC_2.17 (1 cpu-clock samples, 0.30%)qsl::gateway::(anonymous namespace)::emit_result(unsigned long, qsl::gateway::GatewayResult const&, std::vector<std::byte, std::allocator<std::byte> >&, unsigned long) (13 cpu-clock samples, 3.95%)qsl..cfree@GLIBC_2.17 (3 cpu-clock samples, 0.91%)qsl::gateway::(anonymous namespace)::append(std::vector<std::byte, std::allocator<std::byte> >&, std::vector<std::byte, std::allocator<std::byte> > const&, unsigned long) [clone .isra.0] (5 cpu-clock samples, 1.52%)__memcpy_generic (3 cpu-clock samples, 0.91%)qsl::protocol::encode(qsl::protocol::Fill const&) (2 cpu-clock samples, 0.61%)operator new(unsigned long) (1 cpu-clock samples, 0.30%)qsl::gateway::OrderGateway::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) (33 cpu-clock samples, 10.03%)qsl::gateway::..qsl::engine::MatchingEngine::can_store_limit(unsigned int, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) const (2 cpu-clock samples, 0.61%)qsl::engine::MatchingEngine::contains(unsigned int, unsigned long) const (4 cpu-clock samples, 1.22%)qsl::engine::MatchingEngine::has_symbol(unsigned int) const (1 cpu-clock samples, 0.30%)qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) (16 cpu-clock samples, 4.86%)qsl::..qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) (3 cpu-clock samples, 0.91%)qsl::engine::OrderBook::match_baseline(qsl::core::Side, qsl::engine::OrderBook::MatchContext&) (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::can_store_limit(qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) const (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::contains(unsigned long) const (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::can_store_limit(qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) const (2 cpu-clock samples, 0.61%)qsl::engine::OrderBook::contains(unsigned long) const (1 cpu-clock samples, 0.30%)qsl::engine::check_limit(qsl::engine::RiskConfig const&, qsl::core::Side, long, unsigned int) (1 cpu-clock samples, 0.30%)qsl::protocol::decode_header(std::span<std::byte const, 18446744073709551615ul>) (3 cpu-clock samples, 0.91%)qsl::protocol::decode_new_order(std::span<std::byte const, 18446744073709551615ul>) (3 cpu-clock samples, 0.91%)qsl::protocol::decode_header(std::span<std::byte const, 18446744073709551615ul>) (1 cpu-clock samples, 0.30%)qsl::protocol::decode_new_order(std::span<std::byte const, 18446744073709551615ul>) (15 cpu-clock samples, 4.56%)qsl:..qsl::protocol::encode(qsl::protocol::NewOrder const&, unsigned long) (1 cpu-clock samples, 0.30%)qsl::replay::apply(qsl::engine::MatchingEngine&, std::variant<qsl::replay::RegisterSymbol, qsl::replay::NewLimit, qsl::replay::NewMarket, qsl::replay::Cancel, qsl::replay::Modify> const&) (33 cpu-clock samples, 10.03%)qsl::replay::a..qsl::engine::MatchingEngine::cancel(unsigned int, unsigned long) (4 cpu-clock samples, 1.22%)qsl::engine::OrderBook::cancel(unsigned long) (3 cpu-clock samples, 0.91%)decltype(auto) qsl::engine::OrderBook::dispatch_storage<qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}>(qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}&&) [clone .isra.0] (3 cpu-clock samples, 0.91%)qsl::engine::OrderBook::erase_resting_order(qsl::engine::OrderBook::Locator const&) (2 cpu-clock samples, 0.61%)std::pmr::(anonymous namespace)::newdel_res_t::do_deallocate(void*, unsigned long, unsigned long) (1 cpu-clock samples, 0.30%)std::_Hashtable<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node<std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, false>*) (1 cpu-clock samples, 0.30%)qsl::engine::MatchingEngine::modify(unsigned int, unsigned long, long, unsigned int) (5 cpu-clock samples, 1.52%)qsl::engine::OrderBook::modify(unsigned long, long, unsigned int) (5 cpu-clock samples, 1.52%)decltype(auto) qsl::engine::OrderBook::dispatch_storage<qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}>(qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}&&) [clone .isra.0] (3 cpu-clock samples, 0.91%)qsl::engine::OrderBook::erase_resting_order(qsl::engine::OrderBook::Locator const&) (2 cpu-clock samples, 0.61%)std::_Hashtable<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node<std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, false>*) (1 cpu-clock samples, 0.30%)qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) (17 cpu-clock samples, 5.17%)qsl::..qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) (11 cpu-clock samples, 3.34%)qs..qsl::engine::OrderBook::match_baseline(qsl::core::Side, qsl::engine::OrderBook::MatchContext&) (7 cpu-clock samples, 2.13%)qsl::engine::OrderBook::fill_front_order(std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> >&, long, qsl::engine::OrderBook::MatchContext&) (2 cpu-clock samples, 0.61%)std::_Hashtable<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node<std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, false>*) (2 cpu-clock samples, 0.61%)std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&) (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::rest(unsigned long, qsl::core::Side, long, unsigned int) (3 cpu-clock samples, 0.91%)qsl::engine::OrderBook::level_for[abi:cxx11](qsl::core::Side, long) (3 cpu-clock samples, 0.91%)std::pair<std::_Rb_tree_iterator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, bool> std::_Rb_tree<long, std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >, std::_Select1st<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, std::greater<long>, std::pmr::polymorphic_allocator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > > >::_M_emplace_unique<long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >(long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> >&&) (2 cpu-clock samples, 0.61%)std::_Rb_tree_decrement(std::_Rb_tree_node_base*) (1 cpu-clock samples, 0.30%)std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&) (1 cpu-clock samples, 0.30%)std::pair<std::_Rb_tree_iterator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, bool> std::_Rb_tree<long, std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >, std::_Select1st<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, std::less<long>, std::pmr::polymorphic_allocator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > > >::_M_emplace_unique<long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >(long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> >&&) (1 cpu-clock samples, 0.30%)qsl::engine::MatchingEngine::new_market(unsigned int, unsigned long, qsl::core::Side, unsigned int) (3 cpu-clock samples, 0.91%)qsl::engine::OrderBook::add_market(unsigned long, qsl::core::Side, unsigned int) (2 cpu-clock samples, 0.61%)qsl::engine::OrderBook::match_baseline(qsl::core::Side, qsl::engine::OrderBook::MatchContext&) (2 cpu-clock samples, 0.61%)std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&) (1 cpu-clock samples, 0.30%)qsl::replay::generate_flow(unsigned long, unsigned int, unsigned long) (18 cpu-clock samples, 5.47%)qsl::r..qsl::engine::MatchingEngine::contains(unsigned int, unsigned long) const (11 cpu-clock samples, 3.34%)qs..qsl::engine::OrderBook::contains(unsigned long) const (5 cpu-clock samples, 1.52%)qsl::replay::apply(qsl::engine::MatchingEngine&, std::variant<qsl::replay::RegisterSymbol, qsl::replay::NewLimit, qsl::replay::NewMarket, qsl::replay::Cancel, qsl::replay::Modify> const&) (2 cpu-clock samples, 0.61%)qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) (2 cpu-clock samples, 0.61%)qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::rest(unsigned long, qsl::core::Side, long, unsigned int) (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::level_for[abi:cxx11](qsl::core::Side, long) (1 cpu-clock samples, 0.30%)std::pair<std::_Rb_tree_iterator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, bool> std::_Rb_tree<long, std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >, std::_Select1st<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, std::less<long>, std::pmr::polymorphic_allocator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > > >::_M_emplace_unique<long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >(long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> >&&) (1 cpu-clock samples, 0.30%)std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&) (1 cpu-clock samples, 0.30%)qsl::replay::replay(qsl::engine::MatchingEngine&, std::vector<qsl::replay::LogRecord, std::allocator<qsl::replay::LogRecord> > const&) (34 cpu-clock samples, 10.33%)qsl::replay::r..operator delete(void*, unsigned long) (1 cpu-clock samples, 0.30%)qsl::replay::apply(qsl::engine::MatchingEngine&, std::variant<qsl::replay::RegisterSymbol, qsl::replay::NewLimit, qsl::replay::NewMarket, qsl::replay::Cancel, qsl::replay::Modify> const&) (26 cpu-clock samples, 7.90%)qsl::repla..qsl::engine::MatchingEngine::cancel(unsigned int, unsigned long) (3 cpu-clock samples, 0.91%)qsl::engine::OrderBook::cancel(unsigned long) (1 cpu-clock samples, 0.30%)decltype(auto) qsl::engine::OrderBook::dispatch_storage<qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}>(qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}&&) [clone .isra.0] (1 cpu-clock samples, 0.30%)qsl::engine::MatchingEngine::modify(unsigned int, unsigned long, long, unsigned int) (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::modify(unsigned long, long, unsigned int) (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::match_baseline(qsl::core::Side, qsl::engine::OrderBook::MatchContext&) (1 cpu-clock samples, 0.30%)std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&) (1 cpu-clock samples, 0.30%)qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) (19 cpu-clock samples, 5.78%)qsl::e..qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) (17 cpu-clock samples, 5.17%)qsl::..operator delete(void*, unsigned long) (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::match_baseline(qsl::core::Side, qsl::engine::OrderBook::MatchContext&) (4 cpu-clock samples, 1.22%)std::_Hashtable<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node<std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, false>*) (1 cpu-clock samples, 0.30%)std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&) (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::rest(unsigned long, qsl::core::Side, long, unsigned int) (11 cpu-clock samples, 3.34%)qs..qsl::engine::OrderBook::level_for[abi:cxx11](qsl::core::Side, long) (7 cpu-clock samples, 2.13%)std::pair<std::_Rb_tree_iterator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, bool> std::_Rb_tree<long, std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >, std::_Select1st<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, std::greater<long>, std::pmr::polymorphic_allocator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > > >::_M_emplace_unique<long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >(long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> >&&) (4 cpu-clock samples, 1.22%)std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&) (2 cpu-clock samples, 0.61%)std::pair<std::_Rb_tree_iterator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, bool> std::_Rb_tree<long, std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >, std::_Select1st<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, std::less<long>, std::pmr::polymorphic_allocator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > > >::_M_emplace_unique<long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >(long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> >&&) (3 cpu-clock samples, 0.91%)std::_Rb_tree_decrement(std::_Rb_tree_node_base*) (1 cpu-clock samples, 0.30%)std::__detail::_Map_base<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true>, true>::operator[](unsigned long const&) (3 cpu-clock samples, 0.91%)std::_Hashtable<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::_M_insert_unique_node(unsigned long, unsigned long, std::__detail::_Hash_node<std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, false>*, unsigned long) (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::contains(unsigned long) const (2 cpu-clock samples, 0.61%)qsl::engine::MatchingEngine::new_market(unsigned int, unsigned long, qsl::core::Side, unsigned int) (1 cpu-clock samples, 0.30%)qsl::replay::decode_command(std::span<std::byte const, 18446744073709551615ul>) (3 cpu-clock samples, 0.91%)operator new(unsigned long) (5 cpu-clock samples, 1.52%)malloc@plt (5 cpu-clock samples, 1.52%)operator new(unsigned long, std::align_val_t) (2 cpu-clock samples, 0.61%)posix_memalign@plt (2 cpu-clock samples, 0.61%)qsl::engine::OrderBook::level_for[abi:cxx11](qsl::core::Side, long) (4 cpu-clock samples, 1.22%)[unknown] (4 cpu-clock samples, 1.22%)[unknown] (4 cpu-clock samples, 1.22%)[unknown] (2 cpu-clock samples, 0.61%)__posix_memalign (2 cpu-clock samples, 0.61%)malloc (2 cpu-clock samples, 0.61%)operator new(unsigned long, std::align_val_t) (2 cpu-clock samples, 0.61%)__posix_memalign (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::rest(unsigned long, qsl::core::Side, long, unsigned int) (7 cpu-clock samples, 2.13%)[unknown] (5 cpu-clock samples, 1.52%)[unknown] (5 cpu-clock samples, 1.52%)[unknown] (5 cpu-clock samples, 1.52%)[unknown] (1 cpu-clock samples, 0.30%)_mid_memalign (1 cpu-clock samples, 0.30%)__posix_memalign (4 cpu-clock samples, 1.22%)malloc (3 cpu-clock samples, 0.91%)operator new(unsigned long, std::align_val_t)@plt (2 cpu-clock samples, 0.61%)qsl::gateway::(anonymous namespace)::emit_result(unsigned long, qsl::gateway::GatewayResult const&, std::vector<std::byte, std::allocator<std::byte> >&, unsigned long) (10 cpu-clock samples, 3.04%)qs..[unknown] (9 cpu-clock samples, 2.74%)[..[unknown] (9 cpu-clock samples, 2.74%)[..cfree@GLIBC_2.17 (3 cpu-clock samples, 0.91%)operator new(unsigned long) (6 cpu-clock samples, 1.82%)malloc (4 cpu-clock samples, 1.22%)operator delete(void*)@plt (1 cpu-clock samples, 0.30%)qsl::gateway::OrderGateway::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) (8 cpu-clock samples, 2.43%)q..[unknown] (8 cpu-clock samples, 2.43%)[..[unknown] (8 cpu-clock samples, 2.43%)[..cfree@GLIBC_2.17 (1 cpu-clock samples, 0.30%)operator new(unsigned long) (7 cpu-clock samples, 2.13%)malloc (4 cpu-clock samples, 1.22%)decltype(auto) qsl::engine::OrderBook::dispatch_storage<qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}>(qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}&&) [clone .isra.0] (1 cpu-clock samples, 0.30%)[unknown] (1 cpu-clock samples, 0.30%)[unknown] (1 cpu-clock samples, 0.30%)cfree@GLIBC_2.17 (1 cpu-clock samples, 0.30%)main (1 cpu-clock samples, 0.30%)decltype(auto) qsl::engine::OrderBook::dispatch_storage<qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}>(qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}&&) [clone .isra.0] (1 cpu-clock samples, 0.30%)[unknown] (1 cpu-clock samples, 0.30%)[unknown] (1 cpu-clock samples, 0.30%)cfree@GLIBC_2.17 (1 cpu-clock samples, 0.30%)operator new(unsigned long) (1 cpu-clock samples, 0.30%)malloc@plt (1 cpu-clock samples, 0.30%)operator new(unsigned long, std::align_val_t) (1 cpu-clock samples, 0.30%)posix_memalign@plt (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::level_for[abi:cxx11](qsl::core::Side, long) (3 cpu-clock samples, 0.91%)[unknown] (3 cpu-clock samples, 0.91%)[unknown] (3 cpu-clock samples, 0.91%)[unknown] (1 cpu-clock samples, 0.30%)[unknown] (1 cpu-clock samples, 0.30%)_mid_memalign (1 cpu-clock samples, 0.30%)cfree@GLIBC_2.17 (1 cpu-clock samples, 0.30%)operator new(unsigned long, std::align_val_t) (1 cpu-clock samples, 0.30%)__posix_memalign (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::rest(unsigned long, qsl::core::Side, long, unsigned int) (3 cpu-clock samples, 0.91%)[unknown] (3 cpu-clock samples, 0.91%)[unknown] (3 cpu-clock samples, 0.91%)[unknown] (3 cpu-clock samples, 0.91%)[unknown] (1 cpu-clock samples, 0.30%)_mid_memalign (1 cpu-clock samples, 0.30%)__posix_memalign (2 cpu-clock samples, 0.61%)malloc (1 cpu-clock samples, 0.30%)qsl::gateway::Session::process_frame(std::span<std::byte const, 18446744073709551615ul>, std::vector<std::byte, std::allocator<std::byte> >&, unsigned long) (3 cpu-clock samples, 0.91%)[unknown] (2 cpu-clock samples, 0.61%)[unknown] (2 cpu-clock samples, 0.61%)cfree@GLIBC_2.17 (2 cpu-clock samples, 0.61%)free@plt (1 cpu-clock samples, 0.30%)std::__detail::_Map_base<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true>, true>::operator[](unsigned long const&) (1 cpu-clock samples, 0.30%)operator new(unsigned long, std::align_val_t)@plt (1 cpu-clock samples, 0.30%)__libc_start_call_main (9 cpu-clock samples, 2.74%)_..[unknown] (9 cpu-clock samples, 2.74%)[..[unknown] (9 cpu-clock samples, 2.74%)[..[unknown] (1 cpu-clock samples, 0.30%)[unknown] (1 cpu-clock samples, 0.30%)unlink_chunk.isra.0 (1 cpu-clock samples, 0.30%)cfree@GLIBC_2.17 (8 cpu-clock samples, 2.43%)c..decltype(auto) qsl::engine::OrderBook::dispatch_storage<qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}>(qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}&&) [clone .isra.0] (4 cpu-clock samples, 1.22%)[unknown] (4 cpu-clock samples, 1.22%)[unknown] (4 cpu-clock samples, 1.22%)cfree@GLIBC_2.17 (4 cpu-clock samples, 1.22%)main (11 cpu-clock samples, 3.34%)main[unknown] (5 cpu-clock samples, 1.52%)[unknown] (5 cpu-clock samples, 1.52%)[unknown] (1 cpu-clock samples, 0.30%)_int_free_merge_chunk (1 cpu-clock samples, 0.30%)operator new(unsigned long) (4 cpu-clock samples, 1.22%)malloc (4 cpu-clock samples, 1.22%)free@plt (2 cpu-clock samples, 0.61%)operator delete(void*)@plt (3 cpu-clock samples, 0.91%)operator delete(void*, unsigned long)@plt (1 cpu-clock samples, 0.30%)operator new(unsigned long) (4 cpu-clock samples, 1.22%)malloc@plt (4 cpu-clock samples, 1.22%)qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) (8 cpu-clock samples, 2.43%)q..[unknown] (3 cpu-clock samples, 0.91%)[unknown] (3 cpu-clock samples, 0.91%)cfree@GLIBC_2.17 (2 cpu-clock samples, 0.61%)operator new(unsigned long) (1 cpu-clock samples, 0.30%)malloc (1 cpu-clock samples, 0.30%)free@plt (1 cpu-clock samples, 0.30%)operator delete(void*)@plt (1 cpu-clock samples, 0.30%)operator delete(void*, unsigned long)@plt (1 cpu-clock samples, 0.30%)operator new(unsigned long)@plt (2 cpu-clock samples, 0.61%)qsl::engine::MatchingEngine::new_market(unsigned int, unsigned long, qsl::core::Side, unsigned int) (1 cpu-clock samples, 0.30%)operator new(unsigned long)@plt (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) (12 cpu-clock samples, 3.65%)qsl..[unknown] (10 cpu-clock samples, 3.04%)[u..[unknown] (10 cpu-clock samples, 3.04%)[u..[unknown] (7 cpu-clock samples, 2.13%)[unknown] (1 cpu-clock samples, 0.30%)_mid_memalign (1 cpu-clock samples, 0.30%)__posix_memalign (6 cpu-clock samples, 1.82%)malloc (4 cpu-clock samples, 1.22%)operator new(unsigned long, std::align_val_t) (3 cpu-clock samples, 0.91%)__posix_memalign (2 cpu-clock samples, 0.61%)memcpy@plt (1 cpu-clock samples, 0.30%)operator delete(void*)@plt (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::erase_resting_order(qsl::engine::OrderBook::Locator const&) (11 cpu-clock samples, 3.34%)qs..operator delete(void*, std::align_val_t)@plt (5 cpu-clock samples, 1.52%)operator delete(void*, unsigned long, std::align_val_t)@plt (5 cpu-clock samples, 1.52%)std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@plt (1 cpu-clock samples, 0.30%)qsl::engine::OrderBook::fill_front_order(std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> >&, long, qsl::engine::OrderBook::MatchContext&) (1 cpu-clock samples, 0.30%)operator new(unsigned long)@plt (1 cpu-clock samples, 0.30%)qsl::gateway::(anonymous namespace)::append(std::vector<std::byte, std::allocator<std::byte> >&, std::vector<std::byte, std::allocator<std::byte> > const&, unsigned long) [clone .isra.0] (1 cpu-clock samples, 0.30%)operator delete(void*)@plt (1 cpu-clock samples, 0.30%)qsl::gateway::Session::on_bytes(std::span<std::byte const, 18446744073709551615ul>, std::vector<std::byte, std::allocator<std::byte> >&, unsigned long) (3 cpu-clock samples, 0.91%)[unknown] (2 cpu-clock samples, 0.61%)[unknown] (2 cpu-clock samples, 0.61%)cfree@GLIBC_2.17 (2 cpu-clock samples, 0.61%)memcpy@plt (1 cpu-clock samples, 0.30%)qsl::protocol::encode(qsl::protocol::Ack const&) (1 cpu-clock samples, 0.30%)operator new(unsigned long)@plt (1 cpu-clock samples, 0.30%)qsl::protocol::encode(qsl::protocol::NewOrder const&, unsigned long) (1 cpu-clock samples, 0.30%)operator new(unsigned long)@plt (1 cpu-clock samples, 0.30%)qsl::replay::apply(qsl::engine::MatchingEngine&, std::variant<qsl::replay::RegisterSymbol, qsl::replay::NewLimit, qsl::replay::NewMarket, qsl::replay::Cancel, qsl::replay::Modify> const&) (1 cpu-clock samples, 0.30%)[unknown] (1 cpu-clock samples, 0.30%)[unknown] (1 cpu-clock samples, 0.30%)operator new(unsigned long) (1 cpu-clock samples, 0.30%)malloc (1 cpu-clock samples, 0.30%)qsl::replay::replay(qsl::engine::MatchingEngine&, std::vector<qsl::replay::LogRecord, std::allocator<qsl::replay::LogRecord> > const&) (1 cpu-clock samples, 0.30%)memcpy@plt (1 cpu-clock samples, 0.30%)std::_Hashtable<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node<std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, false>*) (7 cpu-clock samples, 2.13%)free@plt (2 cpu-clock samples, 0.61%)operator delete(void*, unsigned long, std::align_val_t)@plt (5 cpu-clock samples, 1.52%)std::pair<std::_Rb_tree_iterator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, bool> std::_Rb_tree<long, std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >, std::_Select1st<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, std::greater<long>, std::pmr::polymorphic_allocator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > > >::_M_emplace_unique<long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >(long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> >&&) (2 cpu-clock samples, 0.61%)free@plt (1 cpu-clock samples, 0.30%)std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@plt (1 cpu-clock samples, 0.30%) +]]>QSL Matching-Engine Flame Graph (qsl-bench)flamegraph (software cpu-clock sampling hot-symbol profile) | Linux aarch64 | cpu-clock @ 4000Hz | 20001 samples | 372 stacks | 2026-06-22T17:45:09ZSearch all (20001 cpu-clock samples, 100.00%)allqsl-bench (20001 cpu-clock samples, 100.00%)qsl-bench_start (20001 cpu-clock samples, 100.00%)_start__libc_start_main@@GLIBC_2.34 (19999 cpu-clock samples, 99.99%)__libc_start_main@@GLIBC_2.34__libc_start_call_main (19999 cpu-clock samples, 99.99%)__libc_start_call_mainmain (19999 cpu-clock samples, 99.99%)maincfree@GLIBC_2.17 (56 cpu-clock samples, 0.28%)qsl::engine::MatchingEngine::cancel(unsigned int, unsigned long) (3134 cpu-clock samples, 15.67%)qsl::engine::MatchingEn..qsl::engine::OrderBook::cancel(unsigned long) (2786 cpu-clock samples, 13.93%)qsl::engine::OrderBo..decltype(auto) qsl::engine::OrderBook::dispatch_storage<qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}>(qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}&&) [clone .isra.0] (2718 cpu-clock samples, 13.59%)decltype(auto) qsl::..cfree@GLIBC_2.17 (30 cpu-clock samples, 0.15%)qsl::engine::OrderBook::erase_resting_order(qsl::engine::OrderBook::Locator const&) (710 cpu-clock samples, 3.55%)qsl.._int_free_chunk (12 cpu-clock samples, 0.06%)_int_free_create_chunk (3 cpu-clock samples, 0.01%)_int_free_merge_chunk (6 cpu-clock samples, 0.03%)_int_free_create_chunk (4 cpu-clock samples, 0.02%)cfree@GLIBC_2.17 (57 cpu-clock samples, 0.28%)free@plt (24 cpu-clock samples, 0.12%)operator delete(void*, std::align_val_t)@plt (17 cpu-clock samples, 0.08%)operator delete(void*, unsigned long, std::align_val_t)@plt (11 cpu-clock samples, 0.05%)std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&) (14 cpu-clock samples, 0.07%)std::__detail::_List_node_base::_M_unhook() (15 cpu-clock samples, 0.07%)std::__detail::_List_node_base::_M_unhook()@plt (16 cpu-clock samples, 0.08%)std::_Hashtable<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node<std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, false>*) (206 cpu-clock samples, 1.03%)_int_free_chunk (13 cpu-clock samples, 0.06%)_int_free_maybe_trim (5 cpu-clock samples, 0.02%)_int_free_merge_chunk (5 cpu-clock samples, 0.02%)_int_free_create_chunk (3 cpu-clock samples, 0.01%)cfree@GLIBC_2.17 (68 cpu-clock samples, 0.34%)free@plt (14 cpu-clock samples, 0.07%)operator delete(void*, std::align_val_t)@plt (14 cpu-clock samples, 0.07%)operator delete(void*, unsigned long, std::align_val_t)@plt (28 cpu-clock samples, 0.14%)std::pmr::(anonymous namespace)::newdel_res_t::do_deallocate(void*, unsigned long, unsigned long) (8 cpu-clock samples, 0.04%)qsl::engine::MatchingEngine::modify(unsigned int, unsigned long, long, unsigned int) (881 cpu-clock samples, 4.40%)qsl:..cfree@GLIBC_2.17 (3 cpu-clock samples, 0.01%)qsl::engine::OrderBook::can_apply_modify(unsigned long, long, unsigned int) const (9 cpu-clock samples, 0.04%)qsl::engine::OrderBook::contains(unsigned long) const (233 cpu-clock samples, 1.16%)qsl::engine::OrderBook::modify(unsigned long, long, unsigned int) (557 cpu-clock samples, 2.78%)q..decltype(auto) qsl::engine::OrderBook::dispatch_storage<qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}>(qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}&&) [clone .isra.0] (170 cpu-clock samples, 0.85%)cfree@GLIBC_2.17 (5 cpu-clock samples, 0.02%)qsl::engine::OrderBook::erase_resting_order(qsl::engine::OrderBook::Locator const&) (115 cpu-clock samples, 0.57%)_int_free_chunk (4 cpu-clock samples, 0.02%)_int_free_merge_chunk (3 cpu-clock samples, 0.01%)_int_free_create_chunk (2 cpu-clock samples, 0.01%)cfree@GLIBC_2.17 (13 cpu-clock samples, 0.06%)free@plt (4 cpu-clock samples, 0.02%)operator delete(void*, std::align_val_t)@plt (4 cpu-clock samples, 0.02%)operator delete(void*, unsigned long, std::align_val_t)@plt (4 cpu-clock samples, 0.02%)std::__detail::_List_node_base::_M_unhook() (3 cpu-clock samples, 0.01%)std::__detail::_List_node_base::_M_unhook()@plt (3 cpu-clock samples, 0.01%)std::_Hashtable<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node<std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, false>*) (37 cpu-clock samples, 0.18%)cfree@GLIBC_2.17 (6 cpu-clock samples, 0.03%)free@plt (3 cpu-clock samples, 0.01%)operator delete(void*, unsigned long, std::align_val_t)@plt (3 cpu-clock samples, 0.01%)std::pmr::(anonymous namespace)::newdel_res_t::do_deallocate(void*, unsigned long, unsigned long) (2 cpu-clock samples, 0.01%)qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) (370 cpu-clock samples, 1.85%)__memcpy_generic (5 cpu-clock samples, 0.02%)cfree@GLIBC_2.17 (2 cpu-clock samples, 0.01%)decltype(auto) qsl::engine::OrderBook::dispatch_storage<qsl::engine::OrderBook::contains(unsigned long) const::{lambda()#1}, qsl::engine::OrderBook::contains(unsigned long) const::{lambda(qsl::engine::OrderBook::IntrusiveStore const&)#1}, qsl::engine::OrderBook::contains(unsigned long) const::{lambda(qsl::engine::OrderBook::ContiguousStore const&)#1}>(qsl::engine::OrderBook::contains(unsigned long) const::{lambda()#1}&&, qsl::engine::OrderBook::contains(unsigned long) const::{lambda(qsl::engine::OrderBook::IntrusiveStore const&)#1}&&, qsl::engine::OrderBook::contains(unsigned long) const::{lambda(qsl::engine::OrderBook::ContiguousStore const&)#1}&&) const [clone .isra.0] (26 cpu-clock samples, 0.13%)free@plt (2 cpu-clock samples, 0.01%)operator delete(void*)@plt (4 cpu-clock samples, 0.02%)operator delete(void*, unsigned long) (4 cpu-clock samples, 0.02%)operator delete(void*, unsigned long)@plt (3 cpu-clock samples, 0.01%)operator new(unsigned long) (8 cpu-clock samples, 0.04%)malloc (3 cpu-clock samples, 0.01%)malloc@plt (2 cpu-clock samples, 0.01%)operator new(unsigned long)@plt (2 cpu-clock samples, 0.01%)qsl::engine::OrderBook::match_baseline(qsl::core::Side, qsl::engine::OrderBook::MatchContext&) (151 cpu-clock samples, 0.75%)__memcpy_generic (4 cpu-clock samples, 0.02%)cfree@GLIBC_2.17 (15 cpu-clock samples, 0.07%)operator delete(void*, std::align_val_t)@plt (2 cpu-clock samples, 0.01%)operator new(unsigned long) (6 cpu-clock samples, 0.03%)malloc (3 cpu-clock samples, 0.01%)malloc@plt (2 cpu-clock samples, 0.01%)qsl::engine::OrderBook::fill_front_order(std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> >&, long, qsl::engine::OrderBook::MatchContext&) (55 cpu-clock samples, 0.27%)_int_free_chunk (2 cpu-clock samples, 0.01%)cfree@GLIBC_2.17 (2 cpu-clock samples, 0.01%)free@plt (4 cpu-clock samples, 0.02%)operator new(unsigned long) (7 cpu-clock samples, 0.03%)malloc (5 cpu-clock samples, 0.02%)malloc@plt (2 cpu-clock samples, 0.01%)std::_Hashtable<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node<std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, false>*) (22 cpu-clock samples, 0.11%)cfree@GLIBC_2.17 (5 cpu-clock samples, 0.02%)operator delete(void*, unsigned long, std::align_val_t)@plt (2 cpu-clock samples, 0.01%)std::pmr::(anonymous namespace)::newdel_res_t::do_deallocate(void*, unsigned long, unsigned long) (2 cpu-clock samples, 0.01%)std::__detail::_List_node_base::_M_unhook() (2 cpu-clock samples, 0.01%)std::_Hashtable<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node<std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, false>*) (7 cpu-clock samples, 0.03%)cfree@GLIBC_2.17 (2 cpu-clock samples, 0.01%)std::pmr::(anonymous namespace)::newdel_res_t::do_deallocate(void*, unsigned long, unsigned long) (2 cpu-clock samples, 0.01%)std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&) (15 cpu-clock samples, 0.07%)std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@plt (3 cpu-clock samples, 0.01%)std::__detail::_List_node_base::_M_unhook() (2 cpu-clock samples, 0.01%)qsl::engine::OrderBook::rest(unsigned long, qsl::core::Side, long, unsigned int) (146 cpu-clock samples, 0.73%)operator new(unsigned long, std::align_val_t) (17 cpu-clock samples, 0.08%)__posix_memalign (7 cpu-clock samples, 0.03%)_mid_memalign (2 cpu-clock samples, 0.01%)malloc (2 cpu-clock samples, 0.01%)qsl::engine::OrderBook::level_for[abi:cxx11](qsl::core::Side, long) (87 cpu-clock samples, 0.43%)std::pair<std::_Rb_tree_iterator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, bool> std::_Rb_tree<long, std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >, std::_Select1st<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, std::greater<long>, std::pmr::polymorphic_allocator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > > >::_M_emplace_unique<long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >(long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> >&&) (43 cpu-clock samples, 0.21%)cfree@GLIBC_2.17 (2 cpu-clock samples, 0.01%)operator new(unsigned long, std::align_val_t) (4 cpu-clock samples, 0.02%)__posix_memalign (3 cpu-clock samples, 0.01%)std::_Rb_tree_decrement(std::_Rb_tree_node_base*) (4 cpu-clock samples, 0.02%)std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@plt (2 cpu-clock samples, 0.01%)std::pair<std::_Rb_tree_iterator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, bool> std::_Rb_tree<long, std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >, std::_Select1st<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, std::less<long>, std::pmr::polymorphic_allocator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > > >::_M_emplace_unique<long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >(long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> >&&) (39 cpu-clock samples, 0.19%)cfree@GLIBC_2.17 (2 cpu-clock samples, 0.01%)operator new(unsigned long, std::align_val_t) (3 cpu-clock samples, 0.01%)__posix_memalign (2 cpu-clock samples, 0.01%)std::pmr::(anonymous namespace)::newdel_res_t::do_deallocate(void*, unsigned long, unsigned long) (2 cpu-clock samples, 0.01%)std::__detail::_List_node_base::_M_hook(std::__detail::_List_node_base*) (2 cpu-clock samples, 0.01%)std::__detail::_Map_base<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true>, true>::operator[](unsigned long const&) (33 cpu-clock samples, 0.16%)operator new(unsigned long, std::align_val_t) (11 cpu-clock samples, 0.05%)__posix_memalign (7 cpu-clock samples, 0.03%)malloc (4 cpu-clock samples, 0.02%)std::_Hashtable<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::_M_insert_unique_node(unsigned long, unsigned long, std::__detail::_Hash_node<std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, false>*, unsigned long) (5 cpu-clock samples, 0.02%)std::__detail::_Prime_rehash_policy::_M_need_rehash(unsigned long, unsigned long, unsigned long) const (3 cpu-clock samples, 0.01%)qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) (14752 cpu-clock samples, 73.76%)qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeI.._int_free_chunk (6 cpu-clock samples, 0.03%)_int_free_merge_chunk (4 cpu-clock samples, 0.02%)_int_free_create_chunk (3 cpu-clock samples, 0.01%)cfree@GLIBC_2.17 (175 cpu-clock samples, 0.87%)free@plt (32 cpu-clock samples, 0.16%)operator delete(void*)@plt (36 cpu-clock samples, 0.18%)operator delete(void*, unsigned long)@plt (18 cpu-clock samples, 0.09%)operator new(unsigned long) (30 cpu-clock samples, 0.15%)qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) (12793 cpu-clock samples, 63.96%)qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInF..__memcpy_generic (85 cpu-clock samples, 0.42%)cfree@GLIBC_2.17 (196 cpu-clock samples, 0.98%)decltype(auto) qsl::engine::OrderBook::dispatch_storage<qsl::engine::OrderBook::contains(unsigned long) const::{lambda()#1}, qsl::engine::OrderBook::contains(unsigned long) const::{lambda(qsl::engine::OrderBook::IntrusiveStore const&)#1}, qsl::engine::OrderBook::contains(unsigned long) const::{lambda(qsl::engine::OrderBook::ContiguousStore const&)#1}>(qsl::engine::OrderBook::contains(unsigned long) const::{lambda()#1}&&, qsl::engine::OrderBook::contains(unsigned long) const::{lambda(qsl::engine::OrderBook::IntrusiveStore const&)#1}&&, qsl::engine::OrderBook::contains(unsigned long) const::{lambda(qsl::engine::OrderBook::ContiguousStore const&)#1}&&) const [clone .isra.0] (224 cpu-clock samples, 1.12%)free@plt (28 cpu-clock samples, 0.14%)memcpy@plt (12 cpu-clock samples, 0.06%)operator delete(void*)@plt (76 cpu-clock samples, 0.38%)operator delete(void*, unsigned long) (107 cpu-clock samples, 0.53%)operator delete(void*, unsigned long)@plt (53 cpu-clock samples, 0.26%)operator new(unsigned long) (270 cpu-clock samples, 1.35%)__libc_malloc2 (23 cpu-clock samples, 0.11%)_int_malloc (21 cpu-clock samples, 0.10%)malloc (112 cpu-clock samples, 0.56%)malloc@plt (47 cpu-clock samples, 0.23%)operator new(unsigned long)@plt (36 cpu-clock samples, 0.18%)operator new(unsigned long, std::align_val_t) (2 cpu-clock samples, 0.01%)qsl::engine::OrderBook::match_baseline(qsl::core::Side, qsl::engine::OrderBook::MatchContext&) (5083 cpu-clock samples, 25.41%)qsl::engine::OrderBook::match_baseline(..__memcpy_generic (62 cpu-clock samples, 0.31%)_int_free_chunk (35 cpu-clock samples, 0.17%)_int_free_merge_chunk (26 cpu-clock samples, 0.13%)_int_free_create_chunk (16 cpu-clock samples, 0.08%)unlink_chunk.isra.0 (4 cpu-clock samples, 0.02%)unlink_chunk.isra.0 (2 cpu-clock samples, 0.01%)_int_free_maybe_trim (2 cpu-clock samples, 0.01%)_int_free_merge_chunk (2 cpu-clock samples, 0.01%)cfree@GLIBC_2.17 (266 cpu-clock samples, 1.33%)free@plt (70 cpu-clock samples, 0.35%)operator delete(void*)@plt (25 cpu-clock samples, 0.12%)operator delete(void*, std::align_val_t)@plt (55 cpu-clock samples, 0.27%)operator delete(void*, unsigned long) (62 cpu-clock samples, 0.31%)operator delete(void*, unsigned long)@plt (7 cpu-clock samples, 0.03%)operator delete(void*, unsigned long, std::align_val_t)@plt (83 cpu-clock samples, 0.41%)operator new(unsigned long) (211 cpu-clock samples, 1.05%)__libc_malloc2 (11 cpu-clock samples, 0.05%)_int_malloc (10 cpu-clock samples, 0.05%)malloc (107 cpu-clock samples, 0.53%)malloc@plt (39 cpu-clock samples, 0.19%)operator new(unsigned long)@plt (39 cpu-clock samples, 0.19%)qsl::engine::OrderBook::fill_front_order(std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> >&, long, qsl::engine::OrderBook::MatchContext&) (1266 cpu-clock samples, 6.33%)qsl::en..__memcpy_generic (36 cpu-clock samples, 0.18%)_int_free_chunk (24 cpu-clock samples, 0.12%)_int_free_maybe_trim (3 cpu-clock samples, 0.01%)_int_free_merge_chunk (17 cpu-clock samples, 0.08%)_int_free_create_chunk (7 cpu-clock samples, 0.03%)unlink_chunk.isra.0 (2 cpu-clock samples, 0.01%)cfree@GLIBC_2.17 (112 cpu-clock samples, 0.56%)free@plt (32 cpu-clock samples, 0.16%)operator delete(void*)@plt (17 cpu-clock samples, 0.08%)operator delete(void*, std::align_val_t)@plt (13 cpu-clock samples, 0.06%)operator delete(void*, unsigned long) (50 cpu-clock samples, 0.25%)operator delete(void*, unsigned long)@plt (4 cpu-clock samples, 0.02%)operator delete(void*, unsigned long, std::align_val_t)@plt (41 cpu-clock samples, 0.20%)operator new(unsigned long) (165 cpu-clock samples, 0.82%)__libc_malloc2 (8 cpu-clock samples, 0.04%)_int_malloc (7 cpu-clock samples, 0.03%)malloc (84 cpu-clock samples, 0.42%)malloc@plt (48 cpu-clock samples, 0.24%)operator new(unsigned long)@plt (32 cpu-clock samples, 0.16%)std::_Hashtable<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node<std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, false>*) (357 cpu-clock samples, 1.78%)_int_free_chunk (8 cpu-clock samples, 0.04%)_int_free_merge_chunk (5 cpu-clock samples, 0.02%)_int_free_create_chunk (2 cpu-clock samples, 0.01%)cfree@GLIBC_2.17 (58 cpu-clock samples, 0.29%)free@plt (14 cpu-clock samples, 0.07%)operator delete(void*, std::align_val_t)@plt (18 cpu-clock samples, 0.09%)operator delete(void*, unsigned long, std::align_val_t)@plt (37 cpu-clock samples, 0.18%)std::pmr::(anonymous namespace)::newdel_res_t::do_deallocate(void*, unsigned long, unsigned long) (14 cpu-clock samples, 0.07%)std::__detail::_List_node_base::_M_unhook() (24 cpu-clock samples, 0.12%)std::__detail::_List_node_base::_M_unhook()@plt (8 cpu-clock samples, 0.04%)std::pmr::(anonymous namespace)::newdel_res_t::do_deallocate(void*, unsigned long, unsigned long) (4 cpu-clock samples, 0.02%)std::_Hashtable<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node<std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, false>*) (453 cpu-clock samples, 2.26%)_int_free_chunk (9 cpu-clock samples, 0.04%)_int_free_merge_chunk (7 cpu-clock samples, 0.03%)_int_free_create_chunk (4 cpu-clock samples, 0.02%)_int_free_merge_chunk (2 cpu-clock samples, 0.01%)cfree@GLIBC_2.17 (80 cpu-clock samples, 0.40%)free@plt (24 cpu-clock samples, 0.12%)operator delete(void*, std::align_val_t)@plt (14 cpu-clock samples, 0.07%)operator delete(void*, unsigned long, std::align_val_t)@plt (49 cpu-clock samples, 0.24%)std::pmr::(anonymous namespace)::newdel_res_t::do_deallocate(void*, unsigned long, unsigned long) (33 cpu-clock samples, 0.16%)std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&) (489 cpu-clock samples, 2.44%)s..std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@plt (17 cpu-clock samples, 0.08%)std::__detail::_List_node_base::_M_unhook() (34 cpu-clock samples, 0.17%)std::__detail::_List_node_base::_M_unhook()@plt (9 cpu-clock samples, 0.04%)std::pmr::(anonymous namespace)::newdel_res_t::do_deallocate(void*, unsigned long, unsigned long) (26 cpu-clock samples, 0.13%)qsl::engine::OrderBook::rest(unsigned long, qsl::core::Side, long, unsigned int) (6024 cpu-clock samples, 30.12%)qsl::engine::OrderBook::rest(unsigned long, qsl..operator new(unsigned long, std::align_val_t) (583 cpu-clock samples, 2.91%)op..__posix_memalign (388 cpu-clock samples, 1.94%)__libc_malloc2 (50 cpu-clock samples, 0.25%)_int_malloc (45 cpu-clock samples, 0.22%)unlink_chunk.isra.0 (3 cpu-clock samples, 0.01%)_mid_memalign (64 cpu-clock samples, 0.32%)malloc (168 cpu-clock samples, 0.84%)_mid_memalign (27 cpu-clock samples, 0.13%)posix_memalign@plt (27 cpu-clock samples, 0.13%)operator new(unsigned long, std::align_val_t)@plt (34 cpu-clock samples, 0.17%)qsl::engine::OrderBook::level_for[abi:cxx11](qsl::core::Side, long) (3784 cpu-clock samples, 18.92%)qsl::engine::OrderBook::level..cfree@GLIBC_2.17 (41 cpu-clock samples, 0.20%)std::pair<std::_Rb_tree_iterator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, bool> std::_Rb_tree<long, std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >, std::_Select1st<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, std::greater<long>, std::pmr::polymorphic_allocator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > > >::_M_emplace_unique<long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >(long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> >&&) (1928 cpu-clock samples, 9.64%)std::pair<std..__posix_memalign (2 cpu-clock samples, 0.01%)cfree@GLIBC_2.17 (44 cpu-clock samples, 0.22%)free@plt (10 cpu-clock samples, 0.05%)operator delete(void*, std::align_val_t)@plt (7 cpu-clock samples, 0.03%)operator delete(void*, unsigned long, std::align_val_t)@plt (19 cpu-clock samples, 0.09%)operator new(unsigned long, std::align_val_t) (272 cpu-clock samples, 1.36%)__posix_memalign (169 cpu-clock samples, 0.84%)__libc_malloc2 (3 cpu-clock samples, 0.01%)_int_malloc (3 cpu-clock samples, 0.01%)_mid_memalign (29 cpu-clock samples, 0.14%)malloc (80 cpu-clock samples, 0.40%)_mid_memalign (24 cpu-clock samples, 0.12%)posix_memalign@plt (18 cpu-clock samples, 0.09%)operator new(unsigned long, std::align_val_t)@plt (32 cpu-clock samples, 0.16%)std::_Rb_tree_decrement(std::_Rb_tree_node_base*) (145 cpu-clock samples, 0.72%)std::_Rb_tree_decrement(std::_Rb_tree_node_base*)@plt (3 cpu-clock samples, 0.01%)std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&) (214 cpu-clock samples, 1.07%)std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@plt (30 cpu-clock samples, 0.15%)std::pmr::(anonymous namespace)::newdel_res_t::do_deallocate(void*, unsigned long, unsigned long) (11 cpu-clock samples, 0.05%)std::pair<std::_Rb_tree_iterator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, bool> std::_Rb_tree<long, std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >, std::_Select1st<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > >, std::less<long>, std::pmr::polymorphic_allocator<std::pair<long const, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > > > >::_M_emplace_unique<long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> > >(long&, std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> >&&) (1684 cpu-clock samples, 8.42%)std::pair<s..cfree@GLIBC_2.17 (31 cpu-clock samples, 0.15%)free@plt (9 cpu-clock samples, 0.04%)operator delete(void*, std::align_val_t)@plt (7 cpu-clock samples, 0.03%)operator delete(void*, unsigned long, std::align_val_t)@plt (12 cpu-clock samples, 0.06%)operator new(unsigned long, std::align_val_t) (277 cpu-clock samples, 1.38%)__posix_memalign (180 cpu-clock samples, 0.90%)__libc_malloc2 (3 cpu-clock samples, 0.01%)_int_malloc (3 cpu-clock samples, 0.01%)_mid_memalign (33 cpu-clock samples, 0.16%)malloc (83 cpu-clock samples, 0.41%)_mid_memalign (18 cpu-clock samples, 0.09%)posix_memalign@plt (20 cpu-clock samples, 0.10%)operator new(unsigned long, std::align_val_t)@plt (39 cpu-clock samples, 0.19%)std::_Rb_tree_decrement(std::_Rb_tree_node_base*) (102 cpu-clock samples, 0.51%)std::_Rb_tree_decrement(std::_Rb_tree_node_base*)@plt (13 cpu-clock samples, 0.06%)std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&) (191 cpu-clock samples, 0.95%)std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@plt (21 cpu-clock samples, 0.10%)std::pmr::(anonymous namespace)::newdel_res_t::do_allocate(unsigned long, unsigned long) (3 cpu-clock samples, 0.01%)std::pmr::(anonymous namespace)::newdel_res_t::do_deallocate(void*, unsigned long, unsigned long) (5 cpu-clock samples, 0.02%)std::__detail::_List_node_base::_M_hook(std::__detail::_List_node_base*) (29 cpu-clock samples, 0.14%)std::__detail::_List_node_base::_M_hook(std::__detail::_List_node_base*)@plt (34 cpu-clock samples, 0.17%)std::__detail::_Map_base<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true>, true>::operator[](unsigned long const&) (1423 cpu-clock samples, 7.11%)std::__de..__posix_memalign (23 cpu-clock samples, 0.11%)operator new(unsigned long, std::align_val_t) (561 cpu-clock samples, 2.80%)o..__posix_memalign (379 cpu-clock samples, 1.89%)__libc_malloc2 (30 cpu-clock samples, 0.15%)_int_malloc (27 cpu-clock samples, 0.13%)_mid_memalign (64 cpu-clock samples, 0.32%)malloc (193 cpu-clock samples, 0.96%)_mid_memalign (29 cpu-clock samples, 0.14%)posix_memalign@plt (30 cpu-clock samples, 0.15%)operator new(unsigned long, std::align_val_t)@plt (90 cpu-clock samples, 0.45%)std::_Hashtable<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::_M_insert_unique_node(unsigned long, unsigned long, std::__detail::_Hash_node<std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, false>*, unsigned long) (252 cpu-clock samples, 1.26%)std::__detail::_Prime_rehash_policy::_M_need_rehash(unsigned long, unsigned long, unsigned long) const (76 cpu-clock samples, 0.38%)std::__detail::_Prime_rehash_policy::_M_need_rehash(unsigned long, unsigned long, unsigned long) const@plt (28 cpu-clock samples, 0.14%)std::pmr::(anonymous namespace)::newdel_res_t::do_allocate(unsigned long, unsigned long) (21 cpu-clock samples, 0.10%)std::pmr::(anonymous namespace)::newdel_res_t::do_allocate(unsigned long, unsigned long) (2 cpu-clock samples, 0.01%)qsl::engine::OrderBook::can_store_limit(qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) const (131 cpu-clock samples, 0.65%)qsl::engine::OrderBook::contains(unsigned long) const (853 cpu-clock samples, 4.26%)qsl:..qsl::engine::MatchingEngine::new_market(unsigned int, unsigned long, qsl::core::Side, unsigned int) (628 cpu-clock samples, 3.14%)qs..cfree@GLIBC_2.17 (15 cpu-clock samples, 0.07%)free@plt (4 cpu-clock samples, 0.02%)operator delete(void*)@plt (7 cpu-clock samples, 0.03%)operator delete(void*, unsigned long)@plt (6 cpu-clock samples, 0.03%)qsl::engine::OrderBook::add_market(unsigned long, qsl::core::Side, unsigned int) (494 cpu-clock samples, 2.47%)q..decltype(auto) qsl::engine::OrderBook::dispatch_storage<qsl::engine::OrderBook::contains(unsigned long) const::{lambda()#1}, qsl::engine::OrderBook::contains(unsigned long) const::{lambda(qsl::engine::OrderBook::IntrusiveStore const&)#1}, qsl::engine::OrderBook::contains(unsigned long) const::{lambda(qsl::engine::OrderBook::ContiguousStore const&)#1}>(qsl::engine::OrderBook::contains(unsigned long) const::{lambda()#1}&&, qsl::engine::OrderBook::contains(unsigned long) const::{lambda(qsl::engine::OrderBook::IntrusiveStore const&)#1}&&, qsl::engine::OrderBook::contains(unsigned long) const::{lambda(qsl::engine::OrderBook::ContiguousStore const&)#1}&&) const [clone .isra.0] (22 cpu-clock samples, 0.11%)qsl::engine::OrderBook::match_baseline(qsl::core::Side, qsl::engine::OrderBook::MatchContext&) (454 cpu-clock samples, 2.27%)cfree@GLIBC_2.17 (11 cpu-clock samples, 0.05%)free@plt (5 cpu-clock samples, 0.02%)operator delete(void*, std::align_val_t)@plt (5 cpu-clock samples, 0.02%)operator delete(void*, unsigned long, std::align_val_t)@plt (5 cpu-clock samples, 0.02%)operator new(unsigned long) (6 cpu-clock samples, 0.03%)qsl::engine::OrderBook::fill_front_order(std::__cxx11::list<qsl::engine::Order, std::pmr::polymorphic_allocator<qsl::engine::Order> >&, long, qsl::engine::OrderBook::MatchContext&) (306 cpu-clock samples, 1.53%)__memcpy_generic (10 cpu-clock samples, 0.05%)_int_free_chunk (9 cpu-clock samples, 0.04%)_int_free_merge_chunk (8 cpu-clock samples, 0.04%)_int_free_create_chunk (4 cpu-clock samples, 0.02%)cfree@GLIBC_2.17 (32 cpu-clock samples, 0.16%)free@plt (7 cpu-clock samples, 0.03%)operator delete(void*)@plt (3 cpu-clock samples, 0.01%)operator delete(void*, std::align_val_t)@plt (6 cpu-clock samples, 0.03%)operator delete(void*, unsigned long) (10 cpu-clock samples, 0.05%)operator delete(void*, unsigned long, std::align_val_t)@plt (3 cpu-clock samples, 0.01%)operator new(unsigned long) (41 cpu-clock samples, 0.20%)malloc (24 cpu-clock samples, 0.12%)malloc@plt (10 cpu-clock samples, 0.05%)operator new(unsigned long)@plt (8 cpu-clock samples, 0.04%)std::_Hashtable<unsigned long, std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, std::pmr::polymorphic_allocator<std::pair<unsigned long const, qsl::engine::OrderBook::Locator> >, std::__detail::_Select1st, std::equal_to<unsigned long>, std::hash<unsigned long>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node<std::pair<unsigned long const, qsl::engine::OrderBook::Locator>, false>*) (72 cpu-clock samples, 0.36%)cfree@GLIBC_2.17 (12 cpu-clock samples, 0.06%)free@plt (3 cpu-clock samples, 0.01%)operator delete(void*, std::align_val_t)@plt (5 cpu-clock samples, 0.02%)operator delete(void*, unsigned long, std::align_val_t)@plt (8 cpu-clock samples, 0.04%)std::pmr::(anonymous namespace)::newdel_res_t::do_deallocate(void*, unsigned long, unsigned long) (10 cpu-clock samples, 0.05%)std::__detail::_List_node_base::_M_unhook() (3 cpu-clock samples, 0.01%)std::__detail::_List_node_base::_M_unhook()@plt (4 cpu-clock samples, 0.02%)std::pmr::(anonymous namespace)::newdel_res_t::do_deallocate(void*, unsigned long, unsigned long) (2 cpu-clock samples, 0.01%)std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&) (34 cpu-clock samples, 0.17%)std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@plt (2 cpu-clock samples, 0.01%)std::pmr::(anonymous namespace)::newdel_res_t::do_deallocate(void*, unsigned long, unsigned long) (5 cpu-clock samples, 0.02%)qsl::engine::OrderBook::contains(unsigned long) const (50 cpu-clock samples, 0.25%)_dl_start (2 cpu-clock samples, 0.01%)_dl_sysdep_start (2 cpu-clock samples, 0.01%)dl_main (2 cpu-clock samples, 0.01%)_dl_relocate_object (2 cpu-clock samples, 0.01%)_dl_relocate_object_no_relro (2 cpu-clock samples, 0.01%)_dl_lookup_symbol_x (2 cpu-clock samples, 0.01%) diff --git a/results/flamegraph.txt b/results/flamegraph.txt index 4969a22..85312a2 100644 --- a/results/flamegraph.txt +++ b/results/flamegraph.txt @@ -8,20 +8,20 @@ Perf: perf version 6.19.14-400.asahi.fc44.aarch64 Perf paranoid: 2 Build type: Release Provenance version: 1 -Git commit (informational): 31070b1 -Source digest: sha256:6aa521e6295a99f9dbf7dee9e5bcef04e93174ed12c3e8de9b991a8bfc14c809 +Git commit (informational): aee3387 +Source digest: sha256:69f5cb221ac0d6dc8b1f87c35c82c6689912a58d1241a8d326a23f8956def323 Source digest scope: flamegraph-benchmark Dirty inputs: no Generated output: results/flamegraph.svg -Date: 2026-06-22T02:18:23Z -Benchmark binary: build/bench/qsl-bench -Dataset: qsl-bench default synthetic benchmark suite -Call graph: dwarf +Date: 2026-06-22T17:45:09Z +Benchmark binary: build/flamegraph/qsl-bench +Dataset: qsl-bench profile workload (warm bounded order flow, 5s) +Call graph: fp Record event: cpu-clock Sample freq: 4000 Hz -Sample count (folded total): 329 -Sample count (perf record est.): 329 -Folded stacks: 159 +Sample count (folded total): 20001 +Sample count (perf record est.): 20001 +Folded stacks: 372 Minimum samples for hot profile: 200 Insufficient samples: no Record status: 0 @@ -35,25 +35,21 @@ investigation. Frame width is proportional to on-CPU samples, not wall-clock latency or throughput, and is hardware/kernel/compiler/build dependent. Top 15 folded stacks (count stack): - 15 qsl-bench;[unknown];[unknown];[unknown];__libc_start_call_main;main;qsl::protocol::decode_new_order(std::span) - 11 qsl-bench;[unknown];[unknown];[unknown];__libc_start_call_main;main;qsl::gateway::Session::on_bytes(std::span);qsl::gateway::Session::on_bytes(std::span, std::vector >&, unsigned long);qsl::gateway::Session::process_frame(std::span, std::vector >&, unsigned long);qsl::gateway::OrderGateway::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) - 11 qsl-bench;[unknown];[unknown];[unknown];__libc_start_call_main;main;qsl::replay::generate_flow(unsigned long, unsigned int, unsigned long);qsl::engine::MatchingEngine::contains(unsigned int, unsigned long) const - 8 qsl-bench;__libc_start_call_main;[unknown];[unknown];cfree@GLIBC_2.17 - 7 qsl-bench;[unknown];[unknown];[unknown];__libc_start_call_main;main;qsl::engine::OrderBook::cancel(unsigned long);decltype(auto) qsl::engine::OrderBook::dispatch_storage(qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}&&) [clone .isra.0];qsl::engine::OrderBook::erase_resting_order(qsl::engine::OrderBook::Locator const&) - 6 qsl-bench;[unknown];[unknown];[unknown];__libc_start_call_main;main;qsl::gateway::Session::on_bytes(std::span);qsl::gateway::Session::on_bytes(std::span, std::vector >&, unsigned long);qsl::gateway::Session::process_frame(std::span, std::vector >&, unsigned long);qsl::gateway::OrderGateway::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) - 6 qsl-bench;[unknown];[unknown];[unknown];__libc_start_call_main;main;qsl::replay::apply(qsl::engine::MatchingEngine&, std::variant const&);qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) - 5 qsl-bench;qsl::engine::OrderBook::erase_resting_order(qsl::engine::OrderBook::Locator const&);operator delete(void*, std::align_val_t)@plt - 5 qsl-bench;qsl::engine::OrderBook::erase_resting_order(qsl::engine::OrderBook::Locator const&);operator delete(void*, unsigned long, std::align_val_t)@plt - 5 qsl-bench;std::_Hashtable, std::pmr::polymorphic_allocator >, std::__detail::_Select1st, std::equal_to, std::hash, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node, false>*);operator delete(void*, unsigned long, std::align_val_t)@plt - 5 qsl-bench;[unknown];[unknown];operator new(unsigned long);malloc@plt - 5 qsl-bench;[unknown];[unknown];[unknown];__libc_start_call_main;main;qsl::replay::generate_flow(unsigned long, unsigned int, unsigned long);qsl::engine::OrderBook::contains(unsigned long) const - 4 qsl-bench;decltype(auto) qsl::engine::OrderBook::dispatch_storage(qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}&&) [clone .isra.0];[unknown];[unknown];cfree@GLIBC_2.17 - 4 qsl-bench;main;[unknown];[unknown];operator new(unsigned long);malloc - 4 qsl-bench;operator new(unsigned long);malloc@plt + 1870 qsl-bench;_start;__libc_start_main@@GLIBC_2.34;__libc_start_call_main;main;qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::match_baseline(qsl::core::Side, qsl::engine::OrderBook::MatchContext&) + 1772 qsl-bench;_start;__libc_start_main@@GLIBC_2.34;__libc_start_call_main;main;qsl::engine::MatchingEngine::cancel(unsigned int, unsigned long);qsl::engine::OrderBook::cancel(unsigned long);decltype(auto) qsl::engine::OrderBook::dispatch_storage(qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}&&) [clone .isra.0] + 1137 qsl-bench;_start;__libc_start_main@@GLIBC_2.34;__libc_start_call_main;main;qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::rest(unsigned long, qsl::core::Side, long, unsigned int);qsl::engine::OrderBook::level_for[abi:cxx11](qsl::core::Side, long);std::pair > > >, bool> std::_Rb_tree > >, std::_Select1st > > >, std::greater, std::pmr::polymorphic_allocator > > > >::_M_emplace_unique > >(long&, std::__cxx11::list >&&) + 974 qsl-bench;_start;__libc_start_main@@GLIBC_2.34;__libc_start_call_main;main;qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::rest(unsigned long, qsl::core::Side, long, unsigned int);qsl::engine::OrderBook::level_for[abi:cxx11](qsl::core::Side, long);std::pair > > >, bool> std::_Rb_tree > >, std::_Select1st > > >, std::less, std::pmr::polymorphic_allocator > > > >::_M_emplace_unique > >(long&, std::__cxx11::list >&&) + 853 qsl-bench;_start;__libc_start_main@@GLIBC_2.34;__libc_start_call_main;main;qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::contains(unsigned long) const + 677 qsl-bench;_start;__libc_start_main@@GLIBC_2.34;__libc_start_call_main;main;qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) + 596 qsl-bench;_start;__libc_start_main@@GLIBC_2.34;__libc_start_call_main;main;qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce) + 547 qsl-bench;_start;__libc_start_main@@GLIBC_2.34;__libc_start_call_main;main + 543 qsl-bench;_start;__libc_start_main@@GLIBC_2.34;__libc_start_call_main;main;qsl::engine::MatchingEngine::cancel(unsigned int, unsigned long);qsl::engine::OrderBook::cancel(unsigned long);decltype(auto) qsl::engine::OrderBook::dispatch_storage(qsl::engine::OrderBook::cancel(unsigned long)::{lambda()#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::IntrusiveStore&)#1}&&, qsl::engine::OrderBook::cancel(unsigned long)::{lambda(qsl::engine::OrderBook::ContiguousStore&)#1}&&) [clone .isra.0];qsl::engine::OrderBook::erase_resting_order(qsl::engine::OrderBook::Locator const&) + 489 qsl-bench;_start;__libc_start_main@@GLIBC_2.34;__libc_start_call_main;main;qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::match_baseline(qsl::core::Side, qsl::engine::OrderBook::MatchContext&);std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&) + 476 qsl-bench;_start;__libc_start_main@@GLIBC_2.34;__libc_start_call_main;main;qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::rest(unsigned long, qsl::core::Side, long, unsigned int);std::__detail::_Map_base, std::pmr::polymorphic_allocator >, std::__detail::_Select1st, std::equal_to, std::hash, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits, true>::operator[](unsigned long const&) + 348 qsl-bench;_start;__libc_start_main@@GLIBC_2.34;__libc_start_call_main;main;qsl::engine::MatchingEngine::cancel(unsigned int, unsigned long) + 344 qsl-bench;_start;__libc_start_main@@GLIBC_2.34;__libc_start_call_main;main;qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::match_baseline(qsl::core::Side, qsl::engine::OrderBook::MatchContext&);qsl::engine::OrderBook::fill_front_order(std::__cxx11::list >&, long, qsl::engine::OrderBook::MatchContext&) + 266 qsl-bench;_start;__libc_start_main@@GLIBC_2.34;__libc_start_call_main;main;qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::match_baseline(qsl::core::Side, qsl::engine::OrderBook::MatchContext&);cfree@GLIBC_2.17 + 242 qsl-bench;_start;__libc_start_main@@GLIBC_2.34;__libc_start_call_main;main;qsl::engine::MatchingEngine::new_limit(unsigned int, unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::add_limit(unsigned long, qsl::core::Side, long, unsigned int, qsl::core::TimeInForce);qsl::engine::OrderBook::match_baseline(qsl::core::Side, qsl::engine::OrderBook::MatchContext&);std::_Hashtable, std::pmr::polymorphic_allocator >, std::__detail::_Select1st, std::equal_to, std::hash, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node, false>*) Benchmark output: -order_book add/mod/cancel 200000 ops 132.8 ns/op 7531861 ops/sec -protocol encode+decode 500000 ops 20.5 ns/op 48773893 ops/sec -gateway session (fill) 200000 ops 127.4 ns/op 7848348 ops/sec -matching engine flow 5004 items 101.6 ns/item 9840697 items/sec -replay command log 5004 items 112.0 ns/item 8928265 items/sec +profile workload: storage=baseline ops=39497728 elapsed=5.000s (7899377 ops/sec) resting~512 diff --git a/scripts/flamegraph.py b/scripts/flamegraph.py index a9cc7f3..b10a145 100755 --- a/scripts/flamegraph.py +++ b/scripts/flamegraph.py @@ -63,8 +63,10 @@ class _Folder: loop flat (one if/elif/else) instead of a deeply nested block. """ - def __init__(self) -> None: + def __init__(self, drop_unknown: bool = True) -> None: self.folded: dict[str, int] = {} + self.dropped_unknown = 0 # count of unresolved frames folded into their caller + self._drop_unknown = drop_unknown self._comm = "" self._stack: list[str] = [] @@ -86,10 +88,19 @@ def end_sample(self) -> None: def _flush(self) -> None: if self._stack: frames = list(reversed(self._stack)) # perf prints leaf-first + if self._drop_unknown: + # Frame-pointer unwinding emits a single unresolvable "[unknown]" frame at the + # glibc allocator boundary (Fedora's libc is built without frame pointers). Fold it + # into its caller: the sample is preserved and the real neighbours (the app frame and + # the named libc symbol such as cfree/operator new) stay in the stack. + kept = [f for f in frames if f != "[unknown]"] + self.dropped_unknown += len(frames) - len(kept) + frames = kept if self._comm: frames.insert(0, self._comm) - key = ";".join(frames) - self.folded[key] = self.folded.get(key, 0) + 1 + if frames: + key = ";".join(frames) + self.folded[key] = self.folded.get(key, 0) + 1 self._stack = [] def result(self) -> dict[str, int]: @@ -97,9 +108,9 @@ def result(self) -> dict[str, int]: return self.folded -def fold_perf_script(lines) -> dict[str, int]: +def fold_perf_script(lines, drop_unknown: bool = True) -> dict[str, int]: """Collapse `perf script` output into {stack_string: sample_count}.""" - folder = _Folder() + folder = _Folder(drop_unknown=drop_unknown) for raw in lines: line = raw.rstrip("\n") if not line.strip(): @@ -108,7 +119,14 @@ def fold_perf_script(lines) -> dict[str, int]: folder.add_frame(line) else: folder.start_sample(line) - return folder.result() + result = folder.result() + if folder.dropped_unknown: + print( + f"flamegraph.py: folded {folder.dropped_unknown} unresolved [unknown] frame(s) " + "into their caller", + file=sys.stderr, + ) + return result def parse_collapsed(lines) -> dict[str, int]: @@ -333,12 +351,14 @@ def main(argv=None) -> int: ap.add_argument("--countname", default="samples") ap.add_argument("--root-name", default="all") ap.add_argument("--width", type=int, default=1200) + ap.add_argument("--keep-unknown", action="store_true", + help="keep unresolved [unknown] frames instead of folding them into the caller") args = ap.parse_args(argv) if args.from_collapsed: folded = parse_collapsed(sys.stdin) else: - folded = fold_perf_script(sys.stdin) + folded = fold_perf_script(sys.stdin, drop_unknown=not args.keep_unknown) if args.collapse_only: for stack in sorted(folded): diff --git a/scripts/flamegraph.sh b/scripts/flamegraph.sh index 3d7dbfa..c7bf338 100755 --- a/scripts/flamegraph.sh +++ b/scripts/flamegraph.sh @@ -1,9 +1,11 @@ #!/usr/bin/env bash # Generate a Linux perf flamegraph from the benchmark harness. # -# Records call-graph samples with `perf record --call-graph dwarf`, folds them -# with scripts/flamegraph.py (a dependency-free stackcollapse + SVG renderer), -# and writes: +# Records call-graph samples with `perf record --call-graph fp` against the +# dedicated frame-pointer build (build/flamegraph, -fno-omit-frame-pointer -g) +# while qsl-bench runs its long-running `profile` workload, then folds them with +# scripts/flamegraph.py (a dependency-free stackcollapse + SVG renderer), and +# writes: # results/flamegraph.svg -- the visual flamegraph (provenance embedded as a # leading XML comment + a visible subtitle) # results/flamegraph.txt -- provenance + classification + top folded stacks @@ -18,13 +20,19 @@ cd "$(dirname "$0")/.." # shellcheck source=scripts/qsl_common.sh source scripts/qsl_common.sh -BIN="${QSL_BENCH_BIN:-build/bench/qsl-bench}" +BIN="${QSL_BENCH_BIN:-build/flamegraph/qsl-bench}" OUT_SVG="${QSL_FLAMEGRAPH_SVG:-results/flamegraph.svg}" OUT_TXT="${QSL_FLAMEGRAPH_TXT:-results/flamegraph.txt}" DATA="${QSL_FLAMEGRAPH_DATA:-build/perf/qsl-bench.flame.data}" EVENT="${QSL_FLAMEGRAPH_EVENT:-cpu-clock}" FREQ="${QSL_FLAMEGRAPH_FREQ:-4000}" -CALLGRAPH="${QSL_FLAMEGRAPH_CALLGRAPH:-dwarf}" +# Frame-pointer unwinding (the flamegraph preset keeps frame pointers) gives clean, fully-symbolized +# stacks; the prior dwarf default left [unknown] gaps because the Release bench build omits them. +CALLGRAPH="${QSL_FLAMEGRAPH_CALLGRAPH:-fp}" +# Seconds of warm steady-state order flow to sample. ~5s at -F 4000 yields tens of thousands of +# samples, versus the ~80ms (~329-sample) one-shot benchmark suite. +PROFILE_SECONDS="${QSL_FLAMEGRAPH_SECONDS:-5}" +BENCH_ARGS=(profile "$PROFILE_SECONDS") MIN_SAMPLES="${QSL_FLAMEGRAPH_MIN_SAMPLES:-200}" TOP_STACKS="${QSL_FLAMEGRAPH_TOP_STACKS:-15}" BUILD_DIR="$(dirname "$BIN")" @@ -87,9 +95,10 @@ SVG_TMP="$(mktemp)" TXT_TMP="$(mktemp)" trap 'rm -f "$BENCH_OUT" "$RECORD_BENCH_OUT" "$RECORD_ERR" "$SCRIPT_OUT" "$SCRIPT_ERR" "$FOLDED" "$COLLAPSE_ERR" "$SVG_TMP" "$TXT_TMP"' EXIT -# Fail fast if the benchmark itself is broken (partial mode must not mask this). +# Fail fast if the benchmark itself is broken (partial mode must not mask this). A short profile +# run validates the workload quickly without paying the full sampling duration. BENCH_STATUS=0 -"$BIN" >"$BENCH_OUT" 2>&1 || BENCH_STATUS=$? +"$BIN" profile 0.2 >"$BENCH_OUT" 2>&1 || BENCH_STATUS=$? if [[ "$BENCH_STATUS" -ne 0 ]]; then echo "error: benchmark command failed before perf record (status $BENCH_STATUS); partial mode cannot override this." >&2 cat "$BENCH_OUT" >&2 @@ -97,7 +106,7 @@ if [[ "$BENCH_STATUS" -ne 0 ]]; then fi RECORD_STATUS=0 -perf record --call-graph "$CALLGRAPH" -F "$FREQ" -g -e "$EVENT" -o "$DATA" -- "$BIN" \ +perf record --call-graph "$CALLGRAPH" -F "$FREQ" -g -e "$EVENT" -o "$DATA" -- "$BIN" "${BENCH_ARGS[@]}" \ >"$RECORD_BENCH_OUT" 2>"$RECORD_ERR" || RECORD_STATUS=$? SCRIPT_STATUS=0 @@ -209,7 +218,7 @@ fi echo "Build type: $(qsl_build_type "$BUILD_DIR")" echo "$PROVENANCE" echo "Benchmark binary: $BIN" - echo "Dataset: qsl-bench default synthetic benchmark suite" + echo "Dataset: qsl-bench profile workload (warm bounded order flow, ${PROFILE_SECONDS}s)" echo "Call graph: $CALLGRAPH" echo "Record event: $EVENT" echo "Sample freq: $FREQ Hz" @@ -246,7 +255,13 @@ fi fi echo echo "Benchmark output:" - cat "$BENCH_OUT" + # Prefer the actually-sampled run's summary; fall back to the fail-fast pre-check on a + # partial/failed record so the section is never empty. + if [[ -s "$RECORD_BENCH_OUT" ]]; then + cat "$RECORD_BENCH_OUT" + else + cat "$BENCH_OUT" + fi } >"$TXT_TMP" qsl_publish_artifact "$TXT_TMP" "$OUT_TXT" echo "wrote $OUT_TXT" diff --git a/src/engine/matching_engine.cpp b/src/engine/matching_engine.cpp index 2fa0a50..4a98aba 100644 --- a/src/engine/matching_engine.cpp +++ b/src/engine/matching_engine.cpp @@ -40,10 +40,15 @@ OrderBook *MatchingEngine::find_book(SymbolId symbol) noexcept { return it == books_.end() ? nullptr : &it->second; } -std::vector MatchingEngine::new_limit(SymbolId symbol, OrderId id, Side side, - Price price, Quantity quantity, - TimeInForce tif) { - std::vector events; +std::vector &MatchingEngine::reset_events() { + events_.clear(); // retains capacity: no per-operation reallocation in steady state + return events_; +} + +const std::vector &MatchingEngine::new_limit(SymbolId symbol, OrderId id, Side side, + Price price, Quantity quantity, + TimeInForce tif) { + std::vector &events = reset_events(); OrderBook *book = find_book(symbol); if (book == nullptr) { return events; // unknown symbol: rejection is the risk layer's job (M5) @@ -62,9 +67,9 @@ std::vector MatchingEngine::new_limit(SymbolId symbol, OrderId id, return events; } -std::vector MatchingEngine::new_market(SymbolId symbol, OrderId id, Side side, - Quantity quantity) { - std::vector events; +const std::vector &MatchingEngine::new_market(SymbolId symbol, OrderId id, Side side, + Quantity quantity) { + std::vector &events = reset_events(); OrderBook *book = find_book(symbol); if (book == nullptr) { return events; @@ -80,8 +85,8 @@ std::vector MatchingEngine::new_market(SymbolId symbol, OrderId id, return events; } -std::vector MatchingEngine::cancel(SymbolId symbol, OrderId id) { - std::vector events; +const std::vector &MatchingEngine::cancel(SymbolId symbol, OrderId id) { + std::vector &events = reset_events(); OrderBook *book = find_book(symbol); if (book == nullptr) { return events; @@ -92,9 +97,9 @@ std::vector MatchingEngine::cancel(SymbolId symbol, OrderId id) { return events; } -std::vector MatchingEngine::modify(SymbolId symbol, OrderId id, Price new_price, - Quantity new_quantity) { - std::vector events; +const std::vector &MatchingEngine::modify(SymbolId symbol, OrderId id, Price new_price, + Quantity new_quantity) { + std::vector &events = reset_events(); OrderBook *book = find_book(symbol); if (book == nullptr || !book->contains(id)) { return events; // unknown symbol/order: rejection is the risk layer's job (M5) diff --git a/tests/shell/test_flamegraph.sh b/tests/shell/test_flamegraph.sh index 585ba34..bd5a349 100644 --- a/tests/shell/test_flamegraph.sh +++ b/tests/shell/test_flamegraph.sh @@ -104,6 +104,26 @@ expect_eq "collapse-only is deterministic" "$FOLDED" "$FOLDED2" SORTED="$(printf '%s\n' "$FOLDED" | LC_ALL=C sort)" expect_eq "collapse-only output is sorted" "$SORTED" "$FOLDED" +# --- [unknown] frame folding ------------------------------------------------ +# Frame-pointer unwinding emits a lone unresolvable "[unknown]" frame at the glibc allocator +# boundary. By default it is folded into its caller: the sample is preserved and the real +# neighbours (the app frame and the named libc symbol) stay. --keep-unknown disables the fold. +make_unknown_script() { + printf '%s\n' \ + "qsl-bench 100 1.0: 1000 cpu-clock:u:" \ + "${TAB}aaaa cfree+0x5 (/usr/lib64/libc.so.6)" \ + "${TAB}bbbb [unknown] (/usr/lib64/libc.so.6)" \ + "${TAB}415cd0 qsl::engine::OrderBook::add_limit(unsigned long)+0x10 (/path/qsl-bench)" \ + "${TAB}402887 main+0x10 (/path/qsl-bench)" \ + "" +} +UNK_FOLDED="$(make_unknown_script | python3 "$FG" --collapse-only 2>/dev/null)" +expect_eq "lone [unknown] frame is folded into caller, sample preserved with cfree leaf" \ + 'qsl-bench;main;qsl::engine::OrderBook::add_limit(unsigned long);cfree 1' "$UNK_FOLDED" +expect_not_contains "folded output has no [unknown]" '[unknown]' "$UNK_FOLDED" +UNK_KEPT="$(make_unknown_script | python3 "$FG" --collapse-only --keep-unknown 2>/dev/null)" +expect_contains "--keep-unknown preserves the [unknown] frame" '[unknown]' "$UNK_KEPT" + # --- SVG rendering ---------------------------------------------------------- SVG="$(make_perf_script | python3 "$FG" --title "T" --subtitle "S")"