tzcnt · codspeed-hq · Mar 3, 2026 · Mar 3, 2026
diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml
@@ -0,0 +1,39 @@
+# Builds the benchmark executables under cpp/codspeed and runs them through
+# the CodSpeed GitHub Action.
+name: CodSpeed
+
+# Triggers: pushes to main, every pull request, and manual dispatch.
+on:
+  push:
+    branches:
+      - "main"
+  pull_request:
+  workflow_dispatch:
+
+# NOTE(review): id-token: write is presumably needed for OIDC authentication
+# with CodSpeed -- confirm against the action's documentation.
+permissions:
+  contents: read
+  id-token: write
+
+jobs:
+  codspeed:
+    name: Run benchmarks
+    runs-on: ubuntu-latest
+    steps:
+      # Submodules are checked out recursively along with the repository.
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      # Configures cpp/codspeed into cpp/codspeed/build with
+      # CODSPEED_MODE=simulation and a RelWithDebInfo build type, then builds
+      # the four benchmark targets with make.
+      - name: Build CodSpeed benchmarks
+        run: |
+          cd cpp/codspeed
+          mkdir -p build && cd build
+          cmake -DCODSPEED_MODE=simulation -DCMAKE_BUILD_TYPE=RelWithDebInfo ..
+          make -j$(nproc) bench_fib bench_skynet bench_nqueens bench_matmul
+
+      # Runs the four executables built above under the CodSpeed action.
+      # NOTE(review): `mode` here presumably has to match the CODSPEED_MODE
+      # passed to cmake in the build step -- keep the two in sync.
+      - name: Run CodSpeed benchmarks
+        uses: CodSpeedHQ/action@v4
+        with:
+          mode: simulation
+          run: |
+            cd cpp/codspeed/build
+            ./bench_fib
+            ./bench_skynet
+            ./bench_nqueens
+            ./bench_matmul
diff --git a/README.md b/README.md
@@ -1,6 +1,8 @@
 # runtime-benchmarks
 Benchmarks to compare the performance of async runtimes / executors.
 
+[![CodSpeed](https://img.shields.io/endpoint?url=https://codspeed.io/badge.json)](https://codspeed.io/tzcnt/runtime-benchmarks?utm_source=badge)
+
 [<img src="https://fleetcode.com/img/bench-splash.png">](https://fleetcode.com/runtime-benchmarks/)
 
 An interactive view of the full results dataset is available at: https://fleetcode.com/runtime-benchmarks/

diff --git a/cpp/codspeed/CMakeLists.txt b/cpp/codspeed/CMakeLists.txt
@@ -0,0 +1,50 @@
+# Build script for the CodSpeed benchmark wrappers (taskflow runtime).
+#
+# 3.18 is the first CMake release in which FetchContent_MakeAvailable() honours
+# the SOURCE_SUBDIR option used for google_benchmark below.
+cmake_minimum_required(VERSION 3.18)
+project(runtime_benchmarks_codspeed)
+
+# Make the shared CMake helpers in ../1CMake visible to include()/find_package().
+set(CMAKE_MODULE_PATH
+    ${runtime_benchmarks_codspeed_SOURCE_DIR}/../1CMake
+    ${CMAKE_MODULE_PATH})
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS "1")
+# The benchmark sources use C++20 library features (<ranges>), so fail at
+# configure time instead of silently falling back to an older standard.
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+add_definitions(
+    "-DTF_ENABLE_ATOMIC_NOTIFIER"
+)
+
+include(../1CMake/CPM.cmake)
+
+# taskflow is only downloaded (DOWNLOAD_ONLY); its source directory is added
+# to the include path below. Pinned to an exact commit.
+CPMAddPackage(
+    NAME taskflow
+    GIT_REPOSITORY https://github.com/taskflow/taskflow.git
+    GIT_TAG 1ac5852c5a1679face4a4755eaab5dbcc558128e
+    DOWNLOAD_ONLY)
+
+set(BENCHMARK_DOWNLOAD_DEPENDENCIES ON)
+
+# CodSpeed's google_benchmark fork lives in the google_benchmark subdirectory
+# of the codspeed-cpp repository.
+# NOTE(review): GIT_TAG main tracks a moving branch, so builds are not
+# reproducible (unlike the pinned taskflow commit above). Consider pinning a
+# release tag or commit hash.
+include(FetchContent)
+FetchContent_Declare(
+    google_benchmark
+    GIT_REPOSITORY https://github.com/CodSpeedHQ/codspeed-cpp
+    SOURCE_SUBDIR google_benchmark
+    GIT_TAG main
+)
+FetchContent_MakeAvailable(google_benchmark)
+
+include_directories(
+    ${taskflow_SOURCE_DIR}
+    "../2common"
+)
+
+# One executable per benchmark, each linked against benchmark::benchmark.
+add_executable(bench_fib bench_fib.cpp)
+target_link_libraries(bench_fib benchmark::benchmark)
+
+add_executable(bench_skynet bench_skynet.cpp)
+target_link_libraries(bench_skynet benchmark::benchmark)
+
+add_executable(bench_nqueens bench_nqueens.cpp)
+target_link_libraries(bench_nqueens benchmark::benchmark)
+# NOTE(review): -falign-loops is a GCC/Clang-style flag; other compilers will
+# reject it.
+target_compile_options(bench_nqueens PRIVATE "-falign-loops=64")
+
+add_executable(bench_matmul bench_matmul.cpp)
+target_link_libraries(bench_matmul benchmark::benchmark)
diff --git a/cpp/codspeed/bench_fib.cpp b/cpp/codspeed/bench_fib.cpp
@@ -0,0 +1,49 @@
+// CodSpeed benchmark wrapper for the recursive fork fibonacci test
+// using the taskflow runtime.
+
+// Adapted from cpp/taskflow/fib.cpp
+
+#include <taskflow/taskflow.hpp>
+
+#include <benchmark/benchmark.h>
+#include <cstdlib>
+#include <optional>
+#include <thread>
+
+// Value passed to the tf::Executor constructor when the executor is created.
+// NOTE(review): std::thread::hardware_concurrency() may return 0 when the
+// value cannot be determined; that case is not handled here.
+static size_t thread_count = std::thread::hardware_concurrency();
+// Executor used by fib(); starts empty and is emplaced by BM_Fib on its
+// first invocation.
+std::optional<tf::Executor> executor;
+
+// Computes the n-th Fibonacci number by recursive fork/join on the global
+// `executor`: fib(n - 1) is handed to a task group while fib(n - 2) is
+// evaluated directly by the caller.
+//
+// `executor` is dereferenced without a check, so it must already hold a
+// value when this is called with n >= 2.
+size_t fib(size_t n) {
+  if (n >= 2) {
+    tf::TaskGroup group = executor->task_group();
+
+    size_t forked;
+    size_t inlined;
+
+    // `forked` is written by the spawned task and only read after corun().
+    group.silent_async([&forked, n]() { forked = fib(n - 1); });
+    inlined = fib(n - 2);
+
+    group.corun();
+    return forked + inlined;
+  }
+
+  // Base cases: fib(0) == 0, fib(1) == 1.
+  return n;
+}
+
+// Google Benchmark entry point for the recursive fork fibonacci test.
+// state.range(0) supplies n. The global executor is created on first use.
+static void BM_Fib(benchmark::State& state) {
+  if (!executor.has_value()) {
+    executor.emplace(thread_count);
+  }
+  const auto n = static_cast<size_t>(state.range(0));
+
+  // Submits one fib(n) computation to the executor and blocks on its future.
+  const auto run_once = [n]() {
+    size_t value = 0;
+    executor->async([&value, n]() { value = fib(n); }).get();
+    return value;
+  };
+
+  // warmup: a single run outside the measured `state` loop
+  run_once();
+
+  for (auto _ : state) {
+    size_t result = run_once();
+    benchmark::DoNotOptimize(result);
+  }
+}
+// Registered with a single argument, n = 20.
+BENCHMARK(BM_Fib)->Arg(20);
+BENCHMARK_MAIN();
diff --git a/cpp/codspeed/bench_matmul.cpp b/cpp/codspeed/bench_matmul.cpp
@@ -0,0 +1,90 @@
+// CodSpeed benchmark wrapper for the matmul test
+// using the taskflow runtime.
+
+// Adapted from cpp/taskflow/matmul.cpp
+// Uses 256x256 instead of larger sizes for reasonable CI runtimes
+// under simulation mode.
+
+#include "matmul.hpp"
+#include <taskflow/algorithm/for_each.hpp>
+#include <taskflow/taskflow.hpp>
+
+#include <benchmark/benchmark.h>
+#include <cstdlib>
+#include <exception>
+#include <optional>
+#include <thread>
+#include <vector>
+
+// Value passed to the tf::Executor constructor when the executor is created.
+// NOTE(review): std::thread::hardware_concurrency() may return 0 when the
+// value cannot be determined; that case is not handled here.
+static size_t thread_count = std::thread::hardware_concurrency();
+// Executor used by matmul(); starts empty and is emplaced by BM_Matmul on
+// its first invocation.
+std::optional<tf::Executor> executor;
+
+// Recursive divide-and-conquer multiply on n x n sub-blocks of matrices
+// stored with row stride N. Blocks of size <= 32 are delegated to
+// matmul_small() (declared in matmul.hpp, not visible here -- presumably it
+// accumulates a * b into c; confirm). Larger blocks are split into four
+// quadrants and processed in two phases of four recursive calls each; within
+// a phase every call targets a different quadrant of c.
+//
+// Uses the global `executor` without checking it; it must already hold a
+// value when n > 32.
+void matmul(int* a, int* b, int* c, int n, int N) {
+  if (n <= 32) {
+    matmul_small(a, b, c, n, N);
+    return;
+  }
+
+  const int half = n / 2;
+  // Element offset from the top half of a block to its bottom half.
+  const int down = half * N;
+
+  // Quadrant origins: first digit = row half, second digit = column half.
+  int* const a00 = a;
+  int* const a01 = a + half;
+  int* const a10 = a + down;
+  int* const a11 = a + down + half;
+
+  int* const b00 = b;
+  int* const b01 = b + half;
+  int* const b10 = b + down;
+  int* const b11 = b + down + half;
+
+  int* const c00 = c;
+  int* const c01 = c + half;
+  int* const c10 = c + down;
+  int* const c11 = c + down + half;
+
+  tf::TaskGroup group = executor->task_group();
+
+  // Phase 1: products using the left column of a and the top row of b.
+  group.silent_async([=]() { matmul(a00, b00, c00, half, N); });
+  group.silent_async([=]() { matmul(a00, b01, c01, half, N); });
+  group.silent_async([=]() { matmul(a10, b00, c10, half, N); });
+  matmul(a10, b01, c11, half, N);
+  group.corun();
+
+  // Phase 2: products using the right column of a and the bottom row of b.
+  // It starts only after phase 1's corun(), and writes the same quadrants
+  // of c again.
+  group.silent_async([=]() { matmul(a01, b10, c00, half, N); });
+  group.silent_async([=]() { matmul(a01, b11, c01, half, N); });
+  group.silent_async([=]() { matmul(a11, b10, c10, half, N); });
+  matmul(a11, b11, c11, half, N);
+  group.corun();
+}
+
+// Multiplies two N x N all-ones matrices on `exec` and returns the result
+// matrix (row-major, N * N elements).
+//
+// A and B are filled with 1 and C with 0 directly by the vector
+// constructors, so no separate initialisation pass is needed. The element
+// count is computed in size_t so that N * N is not evaluated in int.
+//
+// NOTE(review): matmul() halves the block size down to <= 32 and still does
+// its own offset arithmetic in int; N is presumably expected to be a
+// modest power of two -- confirm against matmul.hpp.
+std::vector<int> run_matmul(tf::Executor& exec, int N) {
+  const size_t count = static_cast<size_t>(N) * static_cast<size_t>(N);
+
+  std::vector<int> A(count, 1);
+  std::vector<int> B(count, 1);
+  std::vector<int> C(count, 0);
+
+  int* a = A.data();
+  int* b = B.data();
+  int* c = C.data();
+
+  // Run the whole multiplication inside the executor and block until done;
+  // the raw pointers stay valid because A, B and C outlive the get().
+  exec.async([=]() { matmul(a, b, c, N, N); }).get();
+  return C;
+}
+
+// Google Benchmark entry point for the matmul test. state.range(0) supplies
+// the matrix dimension. The global executor is created on first use.
+static void BM_Matmul(benchmark::State& state) {
+  if (!executor.has_value()) {
+    executor.emplace(thread_count);
+  }
+  const int dim = static_cast<int>(state.range(0));
+  tf::Executor& exec = *executor;
+
+  // warmup: a single run outside the measured `state` loop
+  run_matmul(exec, dim);
+
+  for (auto _ : state) {
+    std::vector<int> product = run_matmul(exec, dim);
+    // Keep the result observable so the multiplication is not optimised away.
+    benchmark::DoNotOptimize(product.data());
+    benchmark::ClobberMemory();
+  }
+}
+// Registered with a single argument, dimension 256.
+BENCHMARK(BM_Matmul)->Arg(256);
+BENCHMARK_MAIN();
diff --git a/cpp/codspeed/bench_nqueens.cpp b/cpp/codspeed/bench_nqueens.cpp
@@ -0,0 +1,98 @@
+// CodSpeed benchmark wrapper for the nqueens test
+// using the taskflow runtime.
+
+// Adapted from cpp/taskflow/nqueens.cpp
+// Uses N=10 instead of N=14 for reasonable CI runtimes
+// under simulation mode.
+
+#include <taskflow/taskflow.hpp>
+
+#include <array>
+#include <benchmark/benchmark.h>
+#include <cstdlib>
+#include <optional>
+#include <ranges>
+#include <thread>
+
+// Value passed to the tf::Executor constructor when the executor is created.
+// NOTE(review): std::thread::hardware_concurrency() may return 0 when the
+// value cannot be determined; that case is not handled here.
+static size_t thread_count = std::thread::hardware_concurrency();
+// Executor used by nqueens(); starts empty and is emplaced by BM_NQueens on
+// its first invocation.
+std::optional<tf::Executor> executor;
+
+// Board size used by the benchmark (size of the buffer passed to nqueens()).
+inline constexpr int nqueens_work = 10;
+
+// Presumably the expected solution count indexed by board size (e.g.
+// answers[8] == 92) -- confirm. Only the first 19 entries are listed; the
+// remaining elements of the 28-element array are zero-initialised.
+// NOTE(review): not referenced anywhere in the visible part of this file.
+inline constexpr std::array<int, 28> answers = {
+  0,       1,         0,          0,          2,           10,     4,
+  40,      92,        352,        724,        2'680,       14'200, 73'712,
+  365'596, 2'279'184, 14'772'512, 95'815'104, 666'090'624,
+};
+
+// Counts the ways to complete a partial N-queens placement.
+//
+// xMax : number of columns already filled; buf[0 .. xMax-1] hold the row
+//        chosen for each of those columns.
+// buf  : the partial placement, taken by value so each task gets its own copy.
+// out  : receives the number of complete placements reachable from `buf`.
+//
+// Every non-conflicting row for column xMax becomes a task on the global
+// `executor` (which must already hold a value); the per-task counts are
+// summed after the task group has finished.
+template <size_t N>
+void nqueens(int xMax, std::array<char, N> buf, int& out) {
+  // All N columns are filled: this is one complete placement.
+  if (static_cast<size_t>(xMax) == N) {
+    out = 1;
+    return;
+  }
+
+  size_t taskCount = 0;
+  // One slot per candidate row. At most N rows can be valid, so the array is
+  // sized by N rather than by an unrelated global constant.
+  std::array<int, N> results;
+  auto tasks =
+    std::ranges::views::iota(size_t{0}, N) |
+    std::ranges::views::filter(
+      // Keep row y only if it does not share a row or a diagonal with any
+      // queen already placed in columns [0, xMax).
+      [xMax, &buf](int y) {
+        char q = y;
+        for (int x = 0; x < xMax; x++) {
+          char p = buf[x];
+          if (q == p || q == p - (xMax - x) ||
+              q == p + (xMax - x)) {
+            return false;
+          }
+        }
+        return true;
+      }
+    ) |
+    std::ranges::views::transform(
+      // Has side effects: writes buf[xMax] and bumps taskCount on every call,
+      // so the view must be traversed exactly once. Writing buf[xMax] does
+      // not disturb the filter above, which only reads indices below xMax.
+      [xMax, &buf, &taskCount, &results](int y) {
+        buf[xMax] = y;
+        size_t idx = taskCount;
+        ++taskCount;
+        // The task copies buf (with row y placed) and writes its count into
+        // its own results slot.
+        return [xMax, buf, idx, &results]() {
+          nqueens(xMax + 1, buf, results[idx]);
+        };
+      }
+    );
+
+  tf::TaskGroup tg = executor->task_group();
+
+  for (auto&& t : tasks) {
+    tg.silent_async(t);
+  }
+  tg.corun();
+
+  // Only the first taskCount slots were assigned; the rest are untouched.
+  int ret = 0;
+  for (size_t i = 0; i < taskCount; ++i) {
+    ret += results[i];
+  }
+
+  out = ret;
+}
+
+// Google Benchmark entry point for the nqueens test on a board of size
+// nqueens_work. The global executor is created on first use.
+static void BM_NQueens(benchmark::State& state) {
+  if (!executor.has_value()) {
+    executor.emplace(thread_count);
+  }
+
+  // Solves one board from an empty placement inside the executor and blocks
+  // until the count is available.
+  const auto solve_once = []() {
+    std::array<char, nqueens_work> board{};
+    int count;
+    executor->async([&]() { nqueens(0, board, count); }).get();
+    return count;
+  };
+
+  // warmup: a single run outside the measured `state` loop
+  solve_once();
+
+  for (auto _ : state) {
+    int result = solve_once();
+    benchmark::DoNotOptimize(result);
+  }
+}
+BENCHMARK(BM_NQueens);
+BENCHMARK_MAIN();