diff --git a/.github/workflows/project-ci.yml b/.github/workflows/project-ci.yml
new file mode 100644
index 0000000..14d7ac1
--- /dev/null
+++ b/.github/workflows/project-ci.yml
@@ -0,0 +1,100 @@
+# managed by workflow os: project-ci
+# Generic CI: detects Python, Node, Go, Rust, and CMake harnesses in the
+# checkout and runs whichever test suites are present.
+name: project-ci
+
+on:
+  pull_request:
+  merge_group:
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  project-ci:
+    runs-on: ubuntu-latest
+    timeout-minutes: 25
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      # Optional toolchains are installed only when their manifest is present.
+      - name: Set up Node
+        if: ${{ hashFiles('package.json') != '' }}
+        uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - name: Set up Go
+        if: ${{ hashFiles('go.mod') != '' }}
+        uses: actions/setup-go@v5
+        with:
+          go-version: stable
+
+      - name: Set up Rust
+        if: ${{ hashFiles('Cargo.toml') != '' }}
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Run project CI
+        shell: bash
+        run: |
+          set -euo pipefail
+          ran_any=0
+
+          if [[ -f pyproject.toml || -f setup.py || -f requirements.txt || -d tests ]]; then
+            python -m pip install --upgrade pip
+            if [[ -f requirements.txt ]]; then python -m pip install -r requirements.txt; fi
+            if [[ -f requirements-dev.txt ]]; then python -m pip install -r requirements-dev.txt; fi
+            python -m pip install pytest
+            # Best-effort: not every repo with a pyproject.toml is installable,
+            # but surface the failure as a warning instead of hiding it.
+            if [[ -f pyproject.toml || -f setup.py ]]; then
+              python -m pip install -e . \
+                || echo "::warning::pip install -e . failed; continuing without an editable install"
+            fi
+            if [[ -d tests ]]; then
+              python -m pytest tests -q --tb=short
+              ran_any=1
+            fi
+          fi
+
+          if [[ -f package.json ]]; then
+            if [[ -f package-lock.json ]]; then npm ci; else npm install; fi
+            # Read scripts.test from package.json directly. Piping `npm run`
+            # into `grep -q` can fail under pipefail (SIGPIPE once grep exits
+            # early) and also matches "test" inside other scripts' commands.
+            if node -e 'const s = require("./package.json").scripts; process.exit(s && s.test ? 0 : 1)'; then
+              # Run once: an `|| npm test` retry would let a flaky failure pass.
+              npm test
+              ran_any=1
+            fi
+          fi
+
+          if [[ -f go.mod ]]; then
+            go test ./...
+            ran_any=1
+          fi
+
+          if [[ -f Cargo.toml ]]; then
+            cargo test --all-targets --all-features
+            ran_any=1
+          fi
+
+          if [[ -f CMakeLists.txt ]]; then
+            sudo apt-get update
+            sudo apt-get install -y cmake ninja-build
+            cmake -S . -B build -G Ninja
+            cmake --build build
+            ctest --test-dir build --output-on-failure
+            ran_any=1
+          fi
+
+          if [[ "$ran_any" -eq 0 ]]; then
+            echo "No recognized project CI harness found; passing as metadata-only repo."
+          fi
diff --git a/.github/workflows/secret-scan.yml b/.github/workflows/secret-scan.yml
new file mode 100644
index 0000000..00a3092
--- /dev/null
+++ b/.github/workflows/secret-scan.yml
@@ -0,0 +1,27 @@
+# managed by workflow os: secret-scan
+name: secret-scan
+
+on:
+  pull_request:
+  push:
+  merge_group:
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  secret-scan:
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Run gitleaks
+        uses: gitleaks/gitleaks-action@v2
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITLEAKS_CONFIG: .gitleaks.toml
diff --git a/.gitignore b/.gitignore
index e0d63ba..ba99364 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,48 @@ __pycache__/
 .venv/
 venv/
 *.pyc
+
+# CMake and out-of-source build trees
 /build/
-.cmake-test-build/
-reports/
+/build-*/
+/build_*/
+/.cmake-test-build/
+/cmake-build-*/
+/out/
+/CMakeCache.txt
+/CMakeFiles/
+/CTestTestfile.cmake
+/CTestCostData.txt
+/DartConfiguration.tcl
+/Testing/
+/Makefile
+/build.ninja
+/.ninja_deps
+/.ninja_log
+/rules.ninja
+/cmake_install.cmake
+/compile_commands.json
+/lob_engine
+/lob_engine.exe
+/lob_benchmark
+/lob_benchmark.exe
+/test_parser
+/test_parser.exe
+/test_order_book
+/test_order_book.exe
+/test_analytics
+/test_analytics.exe
+
+# Generated benchmark, analytics, and report artifacts
+/benchmark/*.csv
+/benchmark/*.json
+/benchmark/*.log
+/benchmark/*.out
+/benchmark/*.txt
+/report/*.csv
+/report/*.json
+/report/*.log
+/report/*.out
+/report/*.txt
+/report/generated/
+/reports/
diff --git a/.gitleaks.toml b/.gitleaks.toml
new file mode 100644
index 0000000..3b15916
--- /dev/null
+++ b/.gitleaks.toml
@@ -0,0 +1,11 @@
+# managed by workflow os: gitleaks-config
+title = "workflow managed secret scan configuration"
+
+[extend]
+useDefault = true
+
+[[rules]]
+id = "openclaw-auth-token"
+description = "OpenClaw auth or gateway token"
+regex = '''(?i)(?:OPENCLAW_(?:AUTH|GATEWAY|API)_TOKEN|openclaw(?:[_-]?(?:auth|gateway|api))?[_-]?token)[^\n]{0,32}[=:][^\S\r\n]*["']?[A-Za-z0-9._\-]{12,}["']?'''
+keywords = ["openclaw", "OPENCLAW_"]
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..745ff41
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,12 @@
+# managed by workflow os: pre-commit
+default_install_hook_types:
+  - pre-commit
+  - pre-push
+
+repos:
+  - repo: https://github.com/gitleaks/gitleaks
+    rev: v8.30.1
+    hooks:
+      - id: gitleaks
+        stages: [pre-commit, pre-push, manual]
+        args: ["--config=.gitleaks.toml"]
diff --git a/README.md b/README.md
index a9488d4..3db5760 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Real-Time Limit Order Book Engine in C++
 
-This repository implements a small, deterministic C++ limit-order-book engine for LOBSTER-style message data. It includes:
+This repository implements a small, deterministic C++ limit-order-book engine for LOBSTER-style message data. The parser and replay code operate on the LOBSTER six-column message schema, but the checked-in CSVs are tiny synthetic/reduced fixtures for reproducibility, not full proprietary LOBSTER distributions.
The repo includes: - typed CSV ingestion for LOBSTER message rows - order lifecycle processing for add, cancel, and execute events @@ -8,7 +8,7 @@ This repository implements a small, deterministic C++ limit-order-book engine fo - two price-level backends: `std::map` and flat sorted `std::vector` - rolling analytics and CSV export after every processed message - deterministic C++ and Python integration tests -- replay benchmark tooling and a checked-in benchmark report +- replay benchmark tooling and a hand-maintained benchmark reproducibility note ## Repository layout @@ -19,29 +19,42 @@ This repository implements a small, deterministic C++ limit-order-book engine fo - `data/`: checked-in small sample datasets used for deterministic tests and reproducible benchmark captures - `report/`: benchmark and methodology notes -## Build +## Reproducible build + +From a fresh clone, run the build, verifier, and benchmark commands below in order. Start with a clean temporary build directory instead of an in-repo build tree: + +```bash +build_dir="$(mktemp -d "${TMPDIR:-/tmp}/lob-engine-build.XXXXXX")" +cmake -S . -B "$build_dir" -DCMAKE_BUILD_TYPE=Release +cmake --build "$build_dir" --config Release +``` + +## Correctness verification + +Run the CMake/CTest verifier from that build directory, then run the existing Python test suite from the repo root: ```bash -cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -cmake --build build -ctest --test-dir build --output-on-failure +ctest --test-dir "$build_dir" --output-on-failure -C Release +python -m pytest tests -q --tb=short ``` +`ctest` runs the three C++ test executables plus the `lob_benchmark_smoke` smoke run of `lob_benchmark`. `python -m pytest tests -q --tb=short` configures and reuses a separate `.cmake-test-build/` directory under the repo root; that directory and the analytics CSVs produced there are ignored local test artifacts. 
+ ## CLI usage Replay a dataset and print final top-of-book state: ```bash -./build/lob_engine data/AAPL_sample_messages.csv --backend both --depth 10 --repeat 5 +"$build_dir/lob_engine" data/AAPL_sample_messages.csv --backend both --depth 10 --repeat 5 ``` Export analytics rows after every processed message: ```bash -./build/lob_engine \ +"$build_dir/lob_engine" \ data/AAPL_sample_messages.csv \ --backend both \ - --analytics-out build/analytics.csv \ + --analytics-out "$build_dir/analytics.csv" \ --trade-window-messages 1000 \ --realized-vol-window-seconds 300 ``` @@ -81,15 +94,13 @@ Deterministic parity tests assert that both backends produce identical book snap ## Benchmarking -The benchmark harness focuses on replay throughput and simple preallocation effects: +The benchmark harness focuses on replay throughput and simple preallocation effects on the checked-in reduced fixtures. These four commands are the final step in the fresh-clone verification sequence documented above: ```bash -./build/lob_benchmark \ - --dataset data/AAPL_sample_messages.csv \ - --backend both \ - --reserve both \ - --depth 5 \ - --repeat 100000 +"$build_dir/lob_benchmark" --dataset data/AAPL_sample_messages.csv --backend both --reserve both --depth 5 --repeat 100000 +"$build_dir/lob_benchmark" --dataset data/MSFT_sample_messages.csv --backend both --reserve both --depth 5 --repeat 100000 +"$build_dir/lob_benchmark" --dataset data/NVDA_sample_messages.csv --backend both --reserve both --depth 5 --repeat 100000 +"$build_dir/lob_benchmark" --dataset data/TSLA_sample_messages.csv --backend both --reserve both --depth 5 --repeat 100000 ``` What the benchmark compares: @@ -102,20 +113,21 @@ What the benchmark compares: - `unordered_map::reserve()` for order lookup - vector capacity reservation for the flat backend -This is the bounded hot-path allocation reduction implemented in the repo. The benchmark report records the measured effect on the checked-in sample datasets. 
- -On a fresh build of this repository on a 4-core AMD EPYC-Rome VM, the fastest AAPL replay configuration processed `60.1 million messages/second` with the flat-vector backend and reserve disabled. +This is the bounded hot-path allocation reduction implemented in the repo. Throughput numbers are host-dependent and should be treated as local measurements on the checked-in reduced fixtures, not as publishable claims about full vendor datasets. See `report/benchmark_report.md` for the exact datasets and commands used for reproducible reruns. ## Dataset note -The repo ships small checked-in reproducibility datasets: +The repo ships five checked-in reproducibility fixtures: - `AAPL_sample_messages.csv` - `MSFT_sample_messages.csv` - `NVDA_sample_messages.csv` - `TSLA_sample_messages.csv` +- `sample_messages.csv` + +The four ticker-named files are 25-line reduced fixtures with 20 valid messages plus 5 intentionally malformed rows each. `sample_messages.csv` is a legacy generic fixture with the same contents as `AAPL_sample_messages.csv`, kept because the parser and Python integration tests reference it directly. -They are intentionally tiny and deterministic so the build, tests, and benchmark report can run in CI or on a fresh clone without external data dependencies. They are suitable for correctness checks and relative replay comparisons, not production-grade market simulation. +These files are intentionally tiny and deterministic so the build, tests, and benchmark workflow can run on a fresh clone without external data dependencies. They are suitable for correctness checks and relative replay comparisons, not production-grade market simulation or claims about full vendor data. 
## Why this is useful for quant / HFT workflows diff --git a/report/benchmark_report.md b/report/benchmark_report.md index b0f25c5..743b834 100644 --- a/report/benchmark_report.md +++ b/report/benchmark_report.md @@ -1,78 +1,60 @@ # Benchmark Report -This report captures reproducible replay benchmarks from a clean checkout of the repository after the analytics/export and preallocation work landed. +This document is a hand-maintained reproducibility note for the replay benchmark harness. The repository does not track generated benchmark outputs, build trees, or machine-specific report artifacts. -## Reproduction +## Fixture scope -```bash -cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -cmake --build build -ctest --test-dir build --output-on-failure -./build/lob_benchmark --dataset data/AAPL_sample_messages.csv --backend both --reserve both --depth 5 --repeat 200000 -./build/lob_benchmark --dataset data/MSFT_sample_messages.csv --backend both --reserve both --depth 5 --repeat 100000 -./build/lob_benchmark --dataset data/NVDA_sample_messages.csv --backend both --reserve both --depth 5 --repeat 100000 -./build/lob_benchmark --dataset data/TSLA_sample_messages.csv --backend both --reserve both --depth 5 --repeat 100000 -``` - -## Scope - -- timed section: replay-only -- analytics/export: available in `lob_engine`, but excluded from the replay throughput timer -- backends compared: `std::map` and flat sorted `std::vector` -- allocation mode compared: reserve/preallocation `off` vs `on` - -## Results - -### AAPL sample +The checked-in CSV files are reduced reproducibility fixtures that match the LOBSTER message schema, not full proprietary LOBSTER distributions. 
-| Backend | Reserve | Throughput msgs/s | Avg ns/msg | -| --- | --- | ---: | ---: | -| map | off | 55,621,183 | 17.979 | -| flat | off | 59,151,913 | 16.906 | -| map | on | 57,645,505 | 17.347 | -| flat | on | 56,307,627 | 17.760 | +| Dataset | Purpose | Rows on disk | Parsed rows | Malformed rows | +| --- | --- | ---: | ---: | ---: | +| `data/AAPL_sample_messages.csv` | Reduced AAPL-like benchmark fixture | 25 | 20 | 5 | +| `data/MSFT_sample_messages.csv` | Reduced MSFT-like benchmark fixture | 25 | 20 | 5 | +| `data/NVDA_sample_messages.csv` | Reduced NVDA-like benchmark fixture | 25 | 20 | 5 | +| `data/TSLA_sample_messages.csv` | Reduced TSLA-like benchmark fixture | 25 | 20 | 5 | +| `data/sample_messages.csv` | Legacy parser/integration-test alias of the AAPL fixture | 25 | 20 | 5 | -Takeaway: on the checked-in AAPL sample, `flat` wins without reserve, while `map` slightly benefits from preallocation. +The fixtures intentionally include malformed rows so parser error accounting is exercised during correctness checks. They also include obviously synthetic values, so benchmark output should be interpreted only as a local engineering signal for this repository, not as a publishable claim about full proprietary datasets. -### MSFT sample +## Fresh-clone sequence -| Backend | Reserve | Throughput msgs/s | Avg ns/msg | -| --- | --- | ---: | ---: | -| map | off | 29,049,272 | 34.424 | -| flat | off | 55,216,799 | 18.110 | -| map | on | 51,246,182 | 19.514 | -| flat | on | 49,013,940 | 20.402 | - -Takeaway: the reserve hint materially improves the `map` path on this sample and narrows the gap to the flat-vector backend. - -### NVDA sample +```bash +build_dir="$(mktemp -d "${TMPDIR:-/tmp}/lob-engine-build.XXXXXX")" +cmake -S . 
-B "$build_dir" -DCMAKE_BUILD_TYPE=Release +cmake --build "$build_dir" --config Release +ctest --test-dir "$build_dir" --output-on-failure -C Release +python -m pytest tests -q --tb=short +"$build_dir/lob_benchmark" --dataset data/AAPL_sample_messages.csv --backend both --reserve both --depth 5 --repeat 100000 +"$build_dir/lob_benchmark" --dataset data/MSFT_sample_messages.csv --backend both --reserve both --depth 5 --repeat 100000 +"$build_dir/lob_benchmark" --dataset data/NVDA_sample_messages.csv --backend both --reserve both --depth 5 --repeat 100000 +"$build_dir/lob_benchmark" --dataset data/TSLA_sample_messages.csv --backend both --reserve both --depth 5 --repeat 100000 +``` -| Backend | Reserve | Throughput msgs/s | Avg ns/msg | -| --- | --- | ---: | ---: | -| map | off | 46,925,367 | 21.310 | -| flat | off | 54,675,249 | 18.290 | -| map | on | 53,910,207 | 18.549 | -| flat | on | 45,742,321 | 21.862 | +`ctest` covers the C++ test executables plus a smoke run of `lob_benchmark`. `python -m pytest tests -q --tb=short` reruns the existing Python integration suite, which configures and reuses a separate `.cmake-test-build/` directory under the repo root and emits local analytics CSV byproducts there. -Takeaway: preallocation helps the `map` backend more than the flat-vector backend on this sample. 
+## Benchmark commands -### TSLA sample +```bash +"$build_dir/lob_benchmark" --dataset data/AAPL_sample_messages.csv --backend both --reserve both --depth 5 --repeat 100000 +"$build_dir/lob_benchmark" --dataset data/MSFT_sample_messages.csv --backend both --reserve both --depth 5 --repeat 100000 +"$build_dir/lob_benchmark" --dataset data/NVDA_sample_messages.csv --backend both --reserve both --depth 5 --repeat 100000 +"$build_dir/lob_benchmark" --dataset data/TSLA_sample_messages.csv --backend both --reserve both --depth 5 --repeat 100000 +``` -| Backend | Reserve | Throughput msgs/s | Avg ns/msg | -| --- | --- | ---: | ---: | -| map | off | 55,859,805 | 17.902 | -| flat | off | 57,432,210 | 17.412 | -| map | on | 55,976,073 | 17.865 | -| flat | on | 52,862,960 | 18.917 | +Each command prints: -Takeaway: the two container choices are close on the TSLA sample, with flat-vector slightly ahead without reserve. +- parsed and malformed row counts +- throughput for both backends (`map` and `flat_vector`) +- each backend measured with reserve `off` and with reserve `on` +- final top-of-book snapshot for a quick sanity check -## Interpretation +## Scope -- The flat-vector backend can outperform `std::map` on these shallow sample books because the active level count stays low and the contiguous representation is cache-friendly. -- Reserve/preallocation mainly helps the order-ID `unordered_map` and removes some rehash churn on the replay path. -- The effect is workload-dependent. There is no single winner across every sample dataset, which is exactly why both container choices are kept in the repo and exposed through the same interface. 
+- timed section: replay-only +- analytics/export: available in `lob_engine`, but excluded from the replay throughput timer +- backends compared: `std::map` and flat sorted `std::vector` +- allocation mode compared: reserve/preallocation `off` vs `on` -## Important note +## Report regeneration -The checked-in datasets are intentionally tiny reproducibility samples, not large production LOBSTER files. These numbers should be treated as relative engineering signals for this repo, not as final claims about live-market performance. +There is no checked-in script that rewrites this file. To refresh the report, rerun the build, verifier, and benchmark commands above, then update this markdown manually with any observations you want to preserve.