From d28b2e0786b3d2b699f48cbd3c4af7b351ca4457 Mon Sep 17 00:00:00 2001 From: "codspeed-hq[bot]" <117304815+codspeed-hq[bot]@users.noreply.github.com> Date: Tue, 3 Mar 2026 16:47:22 +0000 Subject: [PATCH 1/2] Add CodSpeed performance benchmarks Set up continuous performance tracking using CodSpeed with google_benchmark wrappers for the taskflow runtime benchmarks (fib, skynet, nqueens, matmul). - Add cpp/codspeed/ directory with CMake build and benchmark source files - Add .github/workflows/codspeed.yml for CI integration - Add CodSpeed badge to README --- .github/workflows/codspeed.yml | 39 ++++++++++++++ README.md | 2 + cpp/codspeed/CMakeLists.txt | 50 ++++++++++++++++++ cpp/codspeed/bench_fib.cpp | 49 +++++++++++++++++ cpp/codspeed/bench_matmul.cpp | 88 +++++++++++++++++++++++++++++++ cpp/codspeed/bench_nqueens.cpp | 96 ++++++++++++++++++++++++++++++++++ cpp/codspeed/bench_skynet.cpp | 73 ++++++++++++++++++++++++++ 7 files changed, 397 insertions(+) create mode 100644 .github/workflows/codspeed.yml create mode 100644 cpp/codspeed/CMakeLists.txt create mode 100644 cpp/codspeed/bench_fib.cpp create mode 100644 cpp/codspeed/bench_matmul.cpp create mode 100644 cpp/codspeed/bench_nqueens.cpp create mode 100644 cpp/codspeed/bench_skynet.cpp diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml new file mode 100644 index 0000000..cc8ffdb --- /dev/null +++ b/.github/workflows/codspeed.yml @@ -0,0 +1,39 @@ +name: CodSpeed + +on: + push: + branches: + - "main" + pull_request: + workflow_dispatch: + +permissions: + contents: read + id-token: write + +jobs: + codspeed: + name: Run benchmarks + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Build CodSpeed benchmarks + run: | + cd cpp/codspeed + mkdir -p build && cd build + cmake -DCODSPEED_MODE=simulation -DCMAKE_BUILD_TYPE=RelWithDebInfo .. 
+ make -j$(nproc) bench_fib bench_skynet bench_nqueens bench_matmul + + - name: Run CodSpeed benchmarks + uses: CodSpeedHQ/action@v4 + with: + mode: simulation + run: | + cd cpp/codspeed/build + ./bench_fib + ./bench_skynet + ./bench_nqueens + ./bench_matmul diff --git a/README.md b/README.md index df3d90d..616f053 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # runtime-benchmarks Benchmarks to compare the performance of async runtimes / executors. +[![CodSpeed](https://img.shields.io/endpoint?url=https://codspeed.io/badge.json)](https://codspeed.io/tzcnt/runtime-benchmarks?utm_source=badge) + [](https://fleetcode.com/runtime-benchmarks/) An interactive view of the full results dataset is available at: https://fleetcode.com/runtime-benchmarks/ diff --git a/cpp/codspeed/CMakeLists.txt b/cpp/codspeed/CMakeLists.txt new file mode 100644 index 0000000..163e5f7 --- /dev/null +++ b/cpp/codspeed/CMakeLists.txt @@ -0,0 +1,50 @@ +cmake_minimum_required(VERSION 3.16) +project(runtime_benchmarks_codspeed) + +set(CMAKE_MODULE_PATH + ${runtime_benchmarks_codspeed_SOURCE_DIR}/../1CMake + ${CMAKE_MODULE_PATH}) + +set(CMAKE_EXPORT_COMPILE_COMMANDS "1") +set(CMAKE_CXX_STANDARD 20) + +add_definitions( + "-DTF_ENABLE_ATOMIC_NOTIFIER" +) + +include(../1CMake/CPM.cmake) + +CPMAddPackage( + NAME taskflow + GIT_REPOSITORY https://github.com/taskflow/taskflow.git + GIT_TAG 1ac5852c5a1679face4a4755eaab5dbcc558128e + DOWNLOAD_ONLY) + +set(BENCHMARK_DOWNLOAD_DEPENDENCIES ON) + +include(FetchContent) +FetchContent_Declare( + google_benchmark + GIT_REPOSITORY https://github.com/CodSpeedHQ/codspeed-cpp + SOURCE_SUBDIR google_benchmark + GIT_TAG main +) +FetchContent_MakeAvailable(google_benchmark) + +include_directories( + ${taskflow_SOURCE_DIR} + "../2common" +) + +add_executable(bench_fib bench_fib.cpp) +target_link_libraries(bench_fib benchmark::benchmark) + +add_executable(bench_skynet bench_skynet.cpp) +target_link_libraries(bench_skynet benchmark::benchmark) + 
+add_executable(bench_nqueens bench_nqueens.cpp)
+target_link_libraries(bench_nqueens benchmark::benchmark)
+target_compile_options(bench_nqueens PRIVATE "-falign-loops=64")
+
+add_executable(bench_matmul bench_matmul.cpp)
+target_link_libraries(bench_matmul benchmark::benchmark)
diff --git a/cpp/codspeed/bench_fib.cpp b/cpp/codspeed/bench_fib.cpp
new file mode 100644
index 0000000..f8f5595
--- /dev/null
+++ b/cpp/codspeed/bench_fib.cpp
@@ -0,0 +1,49 @@
+// CodSpeed benchmark wrapper for the recursive fork fibonacci test
+// using the taskflow runtime.
+
+// Adapted from cpp/taskflow/fib.cpp
+
+#include <benchmark/benchmark.h>
+
+#include <cstdio>
+#include <optional>
+#include <taskflow/taskflow.hpp>
+#include <thread>
+
+static size_t thread_count = std::thread::hardware_concurrency();
+std::optional<tf::Executor> executor;
+
+size_t fib(size_t n) {
+  if (n < 2) {
+    return n;
+  }
+
+  tf::TaskGroup tg = executor->task_group();
+
+  size_t x, y;
+
+  tg.silent_async([n, &x]() { x = fib(n - 1); });
+  y = fib(n - 2);
+
+  tg.corun();
+  return x + y;
+}
+
+static void BM_Fib(benchmark::State& state) {
+  size_t n = static_cast<size_t>(state.range(0));
+  if (!executor.has_value()) {
+    executor.emplace(thread_count);
+  }
+
+  // warmup
+  size_t result = 0;
+  executor->async([&result, n]() { result = fib(n); }).get();
+
+  for (auto _ : state) {
+    result = 0;
+    executor->async([&result, n]() { result = fib(n); }).get();
+    benchmark::DoNotOptimize(result);
+  }
+}
+BENCHMARK(BM_Fib)->Arg(30);
+BENCHMARK_MAIN();
diff --git a/cpp/codspeed/bench_matmul.cpp b/cpp/codspeed/bench_matmul.cpp
new file mode 100644
index 0000000..31a78fa
--- /dev/null
+++ b/cpp/codspeed/bench_matmul.cpp
@@ -0,0 +1,88 @@
+// CodSpeed benchmark wrapper for the matmul test
+// using the taskflow runtime.
+
+// Adapted from cpp/taskflow/matmul.cpp
+
+#include "matmul.hpp"
+#include <benchmark/benchmark.h>
+#include <taskflow/taskflow.hpp>
+
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <optional>
+#include <thread>
+#include <vector>
+
+static size_t thread_count = std::thread::hardware_concurrency();
+std::optional<tf::Executor> executor;
+
+void matmul(int* a, int* b, int* c, int n, int N) {
+  if (n <= 32) {
+    matmul_small(a, b, c, n, N);
+  } else {
+    int k = n / 2;
+
+    tf::TaskGroup tg = executor->task_group();
+    tg.silent_async([=]() { matmul(a, b, c, k, N); });
+    tg.silent_async([=]() { matmul(a, b + k, c + k, k, N); });
+    tg.silent_async(
+      [=]() { matmul(a + k * N, b, c + k * N, k, N); }
+    );
+    matmul(a + k * N, b + k, c + k * N + k, k, N);
+    tg.corun();
+
+    tg.silent_async(
+      [=]() { matmul(a + k, b + k * N, c, k, N); }
+    );
+    tg.silent_async(
+      [=]() { matmul(a + k, b + k * N + k, c + k, k, N); }
+    );
+    tg.silent_async(
+      [=]() { matmul(a + k * N + k, b + k * N, c + k * N, k, N); }
+    );
+    matmul(
+      a + k * N + k, b + k * N + k, c + k * N + k, k, N
+    );
+    tg.corun();
+  }
+}
+
+std::vector<int> run_matmul(tf::Executor& exec, int N) {
+  std::vector<int> A(N * N, 1);
+  std::vector<int> B(N * N, 1);
+  std::vector<int> C(N * N, 0);
+
+  int* a = A.data();
+  int* b = B.data();
+  int* c = C.data();
+
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < N; j++) {
+      a[i * N + j] = 1;
+      b[i * N + j] = 1;
+      c[i * N + j] = 0;
+    }
+  }
+
+  exec.async([=]() { matmul(a, b, c, N, N); }).get();
+  return C;
+}
+
+static void BM_Matmul(benchmark::State& state) {
+  int N = static_cast<int>(state.range(0));
+  if (!executor.has_value()) {
+    executor.emplace(thread_count);
+  }
+
+  // warmup
+  run_matmul(*executor, N);
+
+  for (auto _ : state) {
+    auto result = run_matmul(*executor, N);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+}
+BENCHMARK(BM_Matmul)->Arg(512);
+BENCHMARK_MAIN();
diff --git a/cpp/codspeed/bench_nqueens.cpp b/cpp/codspeed/bench_nqueens.cpp
new file mode 100644
index 0000000..72b1a4e
--- /dev/null
+++ b/cpp/codspeed/bench_nqueens.cpp
@@ -0,0 +1,96 @@
+// CodSpeed benchmark wrapper for the nqueens test
+// using the taskflow runtime.
+
+// Adapted from cpp/taskflow/nqueens.cpp
+
+#include <benchmark/benchmark.h>
+
+#include <array>
+#include <optional>
+#include <ranges>
+#include <taskflow/taskflow.hpp>
+#include <thread>
+
+static size_t thread_count = std::thread::hardware_concurrency();
+std::optional<tf::Executor> executor;
+
+inline constexpr int nqueens_work = 14;
+
+inline constexpr std::array<size_t, 19> answers = {
+  0, 1, 0, 0, 2, 10, 4,
+  40, 92, 352, 724, 2'680, 14'200, 73'712,
+  365'596, 2'279'184, 14'772'512, 95'815'104, 666'090'624,
+};
+
+template <size_t N>
+void nqueens(int xMax, std::array<char, N> buf, int& out) {
+  if (N == xMax) {
+    out = 1;
+    return;
+  }
+
+  size_t taskCount = 0;
+  std::array<int, N> results;
+  auto tasks =
+    std::ranges::views::iota(0UL, N) |
+    std::ranges::views::filter(
+      [xMax, &buf, &taskCount](int y) {
+        char q = y;
+        for (int x = 0; x < xMax; x++) {
+          char p = buf[x];
+          if (q == p || q == p - (xMax - x) ||
+              q == p + (xMax - x)) {
+            return false;
+          }
+        }
+        return true;
+      }
+    ) |
+    std::ranges::views::transform(
+      [xMax, &buf, &taskCount, &results](int y) {
+        buf[xMax] = y;
+        size_t idx = taskCount;
+        ++taskCount;
+        return [xMax, buf, idx, &results]() {
+          nqueens(xMax + 1, buf, results[idx]);
+        };
+      }
+    );
+
+  tf::TaskGroup tg = executor->task_group();
+
+  for (auto&& t : tasks) {
+    tg.silent_async(t);
+  }
+  tg.corun();
+
+  int ret = 0;
+  for (size_t i = 0; i < taskCount; ++i) {
+    ret += results[i];
+  }
+
+  out = ret;
+}
+
+static void BM_NQueens(benchmark::State& state) {
+  if (!executor.has_value()) {
+    executor.emplace(thread_count);
+  }
+
+  // warmup
+  {
+    std::array<char, nqueens_work> buf{};
+    int result;
+    executor->async([&]() { nqueens(0, buf, result); }).get();
+  }
+
+  for (auto _ : state) {
+    std::array<char, nqueens_work> buf{};
+    int result;
+    executor->async([&]() { nqueens(0, buf, result); }).get();
+    benchmark::DoNotOptimize(result);
+  }
+}
+BENCHMARK(BM_NQueens);
+BENCHMARK_MAIN();
diff --git a/cpp/codspeed/bench_skynet.cpp b/cpp/codspeed/bench_skynet.cpp
new file mode 100644
index 0000000..2457993
--- /dev/null
+++ b/cpp/codspeed/bench_skynet.cpp
@@ -0,0 +1,73 @@
+// CodSpeed benchmark wrapper for the skynet test
+// using the taskflow runtime.
+
+// Adapted from cpp/taskflow/skynet.cpp
+
+#include <benchmark/benchmark.h>
+
+#include <cinttypes>
+#include <cstdio>
+#include <optional>
+#include <taskflow/taskflow.hpp>
+#include <thread>
+
+static size_t thread_count = std::thread::hardware_concurrency();
+std::optional<tf::Executor> executor;
+
+template <size_t DepthMax>
+size_t skynet_one(size_t BaseNum, size_t Depth) {
+  if (Depth == DepthMax) {
+    return BaseNum;
+  }
+  size_t depthOffset = 1;
+  for (size_t i = 0; i < DepthMax - Depth - 1; ++i) {
+    depthOffset *= 10;
+  }
+
+  std::array<size_t, 10> results;
+
+  tf::TaskGroup tg = executor->task_group();
+
+  for (size_t i = 0; i < 9; ++i) {
+    tg.silent_async([=, &results, idx = i]() {
+      results[idx] =
+        skynet_one<DepthMax>(BaseNum + depthOffset * idx, Depth + 1);
+    });
+  }
+  results[9] =
+    skynet_one<DepthMax>(BaseNum + depthOffset * 9, Depth + 1);
+  tg.corun();
+
+  size_t count = 0;
+  for (size_t idx = 0; idx < 10; ++idx) {
+    count += results[idx];
+  }
+  return count;
+}
+
+template <size_t DepthMax>
+void skynet(tf::Executor& exec) {
+  size_t count;
+  exec.async([&]() { count = skynet_one<DepthMax>(0, 0); }).get();
+  if (count != 4999999950000000) {
+    std::fprintf(
+      stderr, "ERROR: wrong result - %" PRIu64 "\n", count
+    );
+  }
+}
+
+static void BM_Skynet(benchmark::State& state) {
+  if (!executor.has_value()) {
+    executor.emplace(thread_count);
+  }
+
+  // warmup
+  skynet<8>(*executor);
+
+  for (auto _ : state) {
+    skynet<8>(*executor);
+  }
+}
+BENCHMARK(BM_Skynet);
+BENCHMARK_MAIN();
From badc8355c62973fbd4ec285933be81adf754679a Mon Sep 17 00:00:00 2001
From: "codspeed-hq[bot]" <117304815+codspeed-hq[bot]@users.noreply.github.com>
Date: Tue, 3 Mar 2026 17:33:32 +0000
Subject: [PATCH 2/2] Reduce benchmark problem sizes for simulation mode

Reduce problem sizes to ensure reasonable CI runtimes under CodSpeed
simulation (valgrind) mode:
- fib: 30 -> 20
- skynet: depth 8 (100M tasks) -> depth 6 (1M tasks)
- nqueens: N=14 -> N=10
- matmul: 512x512 -> 256x256
---
 cpp/codspeed/bench_fib.cpp     |  2 +-
 cpp/codspeed/bench_matmul.cpp  |  4 +++-
 cpp/codspeed/bench_nqueens.cpp |  4 +++-
 cpp/codspeed/bench_skynet.cpp  | 14 +++++++++++---
 4 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/cpp/codspeed/bench_fib.cpp b/cpp/codspeed/bench_fib.cpp
index f8f5595..fb41bec 100644
--- a/cpp/codspeed/bench_fib.cpp
+++ b/cpp/codspeed/bench_fib.cpp
@@ -45,5 +45,5 @@ static void BM_Fib(benchmark::State& state) {
     benchmark::DoNotOptimize(result);
   }
 }
-BENCHMARK(BM_Fib)->Arg(30);
+BENCHMARK(BM_Fib)->Arg(20);
 BENCHMARK_MAIN();
diff --git a/cpp/codspeed/bench_matmul.cpp b/cpp/codspeed/bench_matmul.cpp
index 31a78fa..37517cf 100644
--- a/cpp/codspeed/bench_matmul.cpp
+++ b/cpp/codspeed/bench_matmul.cpp
@@ -2,6 +2,8 @@
 // using the taskflow runtime.
 
 // Adapted from cpp/taskflow/matmul.cpp
+// Uses 256x256 instead of larger sizes for reasonable CI runtimes
+// under simulation mode.
 
 #include "matmul.hpp"
 #include <benchmark/benchmark.h>
@@ -84,5 +86,5 @@ static void BM_Matmul(benchmark::State& state) {
     benchmark::ClobberMemory();
   }
 }
-BENCHMARK(BM_Matmul)->Arg(512);
+BENCHMARK(BM_Matmul)->Arg(256);
 BENCHMARK_MAIN();
diff --git a/cpp/codspeed/bench_nqueens.cpp b/cpp/codspeed/bench_nqueens.cpp
index 72b1a4e..3aa3cb7 100644
--- a/cpp/codspeed/bench_nqueens.cpp
+++ b/cpp/codspeed/bench_nqueens.cpp
@@ -2,6 +2,8 @@
 // using the taskflow runtime.
 
 // Adapted from cpp/taskflow/nqueens.cpp
+// Uses N=10 instead of N=14 for reasonable CI runtimes
+// under simulation mode.
 
 #include <benchmark/benchmark.h>
 
@@ -15,7 +17,7 @@
 static size_t thread_count = std::thread::hardware_concurrency();
 std::optional<tf::Executor> executor;
 
-inline constexpr int nqueens_work = 14;
+inline constexpr int nqueens_work = 10;
 
 inline constexpr std::array<size_t, 19> answers = {
   0, 1, 0, 0, 2, 10, 4,
diff --git a/cpp/codspeed/bench_skynet.cpp b/cpp/codspeed/bench_skynet.cpp
index 2457993..5215b21 100644
--- a/cpp/codspeed/bench_skynet.cpp
+++ b/cpp/codspeed/bench_skynet.cpp
@@ -2,6 +2,8 @@
 // using the taskflow runtime.
 
 // Adapted from cpp/taskflow/skynet.cpp
+// Uses depth 6 (1M tasks) instead of depth 8 (100M tasks)
+// for reasonable CI runtimes under simulation mode.
 
 #include <benchmark/benchmark.h>
 
@@ -50,7 +52,13 @@ template <size_t DepthMax>
 void skynet(tf::Executor& exec) {
   size_t count;
   exec.async([&]() { count = skynet_one<DepthMax>(0, 0); }).get();
-  if (count != 4999999950000000) {
+  size_t expected = 0;
+  size_t max_val = 1;
+  for (size_t i = 0; i < DepthMax; ++i) {
+    max_val *= 10;
+  }
+  expected = (max_val - 1) * max_val / 2;
+  if (count != expected) {
     std::fprintf(
       stderr, "ERROR: wrong result - %" PRIu64 "\n", count
     );
@@ -63,10 +71,10 @@ static void BM_Skynet(benchmark::State& state) {
   }
 
   // warmup
-  skynet<8>(*executor);
+  skynet<6>(*executor);
 
   for (auto _ : state) {
-    skynet<8>(*executor);
+    skynet<6>(*executor);
   }
 }
 BENCHMARK(BM_Skynet);