From d28b2e0786b3d2b699f48cbd3c4af7b351ca4457 Mon Sep 17 00:00:00 2001
From: "codspeed-hq[bot]" <117304815+codspeed-hq[bot]@users.noreply.github.com>
Date: Tue, 3 Mar 2026 16:47:22 +0000
Subject: [PATCH 1/2] Add CodSpeed performance benchmarks
Set up continuous performance tracking using CodSpeed with google_benchmark
wrappers for the taskflow runtime benchmarks (fib, skynet, nqueens, matmul).
- Add cpp/codspeed/ directory with CMake build and benchmark source files
- Add .github/workflows/codspeed.yml for CI integration
- Add CodSpeed badge to README
---
.github/workflows/codspeed.yml | 39 ++++++++++++++
README.md | 2 +
cpp/codspeed/CMakeLists.txt | 50 ++++++++++++++++++
cpp/codspeed/bench_fib.cpp | 49 +++++++++++++++++
cpp/codspeed/bench_matmul.cpp | 88 +++++++++++++++++++++++++++++++
cpp/codspeed/bench_nqueens.cpp | 96 ++++++++++++++++++++++++++++++++++
cpp/codspeed/bench_skynet.cpp | 73 ++++++++++++++++++++++++++
7 files changed, 397 insertions(+)
create mode 100644 .github/workflows/codspeed.yml
create mode 100644 cpp/codspeed/CMakeLists.txt
create mode 100644 cpp/codspeed/bench_fib.cpp
create mode 100644 cpp/codspeed/bench_matmul.cpp
create mode 100644 cpp/codspeed/bench_nqueens.cpp
create mode 100644 cpp/codspeed/bench_skynet.cpp
diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml
new file mode 100644
index 0000000..cc8ffdb
--- /dev/null
+++ b/.github/workflows/codspeed.yml
@@ -0,0 +1,39 @@
+name: CodSpeed
+
+on:
+ push:
+ branches:
+ - "main"
+ pull_request:
+ workflow_dispatch:
+
+permissions:
+ contents: read
+ id-token: write
+
+jobs:
+ codspeed:
+ name: Run benchmarks
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: recursive
+
+ - name: Build CodSpeed benchmarks
+ run: |
+ cd cpp/codspeed
+ mkdir -p build && cd build
+ cmake -DCODSPEED_MODE=simulation -DCMAKE_BUILD_TYPE=RelWithDebInfo ..
+ make -j$(nproc) bench_fib bench_skynet bench_nqueens bench_matmul
+
+ - name: Run CodSpeed benchmarks
+ uses: CodSpeedHQ/action@v4
+ with:
+ mode: simulation
+ run: |
+ cd cpp/codspeed/build
+ ./bench_fib
+ ./bench_skynet
+ ./bench_nqueens
+ ./bench_matmul
diff --git a/README.md b/README.md
index df3d90d..616f053 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,8 @@
# runtime-benchmarks
Benchmarks to compare the performance of async runtimes / executors.
+[![CodSpeed Badge](https://img.shields.io/endpoint?url=https://codspeed.io/badge.json)](https://codspeed.io/tzcnt/runtime-benchmarks?utm_source=badge)
+
[
](https://fleetcode.com/runtime-benchmarks/)
An interactive view of the full results dataset is available at: https://fleetcode.com/runtime-benchmarks/
diff --git a/cpp/codspeed/CMakeLists.txt b/cpp/codspeed/CMakeLists.txt
new file mode 100644
index 0000000..163e5f7
--- /dev/null
+++ b/cpp/codspeed/CMakeLists.txt
@@ -0,0 +1,50 @@
+cmake_minimum_required(VERSION 3.16)
+project(runtime_benchmarks_codspeed)
+
+set(CMAKE_MODULE_PATH
+ ${runtime_benchmarks_codspeed_SOURCE_DIR}/../1CMake
+ ${CMAKE_MODULE_PATH})
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS "1")
+set(CMAKE_CXX_STANDARD 20)
+
+add_definitions(
+ "-DTF_ENABLE_ATOMIC_NOTIFIER"
+)
+
+include(../1CMake/CPM.cmake)
+
+CPMAddPackage(
+ NAME taskflow
+ GIT_REPOSITORY https://github.com/taskflow/taskflow.git
+ GIT_TAG 1ac5852c5a1679face4a4755eaab5dbcc558128e
+ DOWNLOAD_ONLY)
+
+set(BENCHMARK_DOWNLOAD_DEPENDENCIES ON)
+
+include(FetchContent)
+FetchContent_Declare(
+ google_benchmark
+ GIT_REPOSITORY https://github.com/CodSpeedHQ/codspeed-cpp
+ SOURCE_SUBDIR google_benchmark
+ GIT_TAG main
+)
+FetchContent_MakeAvailable(google_benchmark)
+
+include_directories(
+ ${taskflow_SOURCE_DIR}
+ "../2common"
+)
+
+add_executable(bench_fib bench_fib.cpp)
+target_link_libraries(bench_fib benchmark::benchmark)
+
+add_executable(bench_skynet bench_skynet.cpp)
+target_link_libraries(bench_skynet benchmark::benchmark)
+
+add_executable(bench_nqueens bench_nqueens.cpp)
+target_link_libraries(bench_nqueens benchmark::benchmark)
+target_compile_options(bench_nqueens PRIVATE "-falign-loops=64")
+
+add_executable(bench_matmul bench_matmul.cpp)
+target_link_libraries(bench_matmul benchmark::benchmark)
diff --git a/cpp/codspeed/bench_fib.cpp b/cpp/codspeed/bench_fib.cpp
new file mode 100644
index 0000000..f8f5595
--- /dev/null
+++ b/cpp/codspeed/bench_fib.cpp
@@ -0,0 +1,49 @@
+// CodSpeed benchmark wrapper for the recursive fork fibonacci test
+// using the taskflow runtime.
+
+// Adapted from cpp/taskflow/fib.cpp
+
+#include <benchmark/benchmark.h>
+
+#include <cstddef>
+#include <optional>
+#include <taskflow/taskflow.hpp>
+#include <thread>
+
+static size_t thread_count = std::thread::hardware_concurrency();
+std::optional<tf::Executor> executor;
+
+size_t fib(size_t n) {
+ if (n < 2) {
+ return n;
+ }
+
+ tf::TaskGroup tg = executor->task_group();
+
+ size_t x, y;
+
+ tg.silent_async([n, &x]() { x = fib(n - 1); });
+ y = fib(n - 2);
+
+ tg.corun();
+ return x + y;
+}
+
+static void BM_Fib(benchmark::State& state) {
+ size_t n = static_cast<size_t>(state.range(0));
+ if (!executor.has_value()) {
+ executor.emplace(thread_count);
+ }
+
+ // warmup
+ size_t result = 0;
+ executor->async([&result, n]() { result = fib(n); }).get();
+
+ for (auto _ : state) {
+ result = 0;
+ executor->async([&result, n]() { result = fib(n); }).get();
+ benchmark::DoNotOptimize(result);
+ }
+}
+BENCHMARK(BM_Fib)->Arg(30);
+BENCHMARK_MAIN();
diff --git a/cpp/codspeed/bench_matmul.cpp b/cpp/codspeed/bench_matmul.cpp
new file mode 100644
index 0000000..31a78fa
--- /dev/null
+++ b/cpp/codspeed/bench_matmul.cpp
@@ -0,0 +1,88 @@
+// CodSpeed benchmark wrapper for the matmul test
+// using the taskflow runtime.
+
+// Adapted from cpp/taskflow/matmul.cpp
+
+#include "matmul.hpp"
+#include <benchmark/benchmark.h>
+#include <taskflow/taskflow.hpp>
+
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <optional>
+#include <thread>
+#include <vector>
+
+static size_t thread_count = std::thread::hardware_concurrency();
+std::optional<tf::Executor> executor;
+
+void matmul(int* a, int* b, int* c, int n, int N) {
+ if (n <= 32) {
+ matmul_small(a, b, c, n, N);
+ } else {
+ int k = n / 2;
+
+ tf::TaskGroup tg = executor->task_group();
+ tg.silent_async([=]() { matmul(a, b, c, k, N); });
+ tg.silent_async([=]() { matmul(a, b + k, c + k, k, N); });
+ tg.silent_async(
+ [=]() { matmul(a + k * N, b, c + k * N, k, N); }
+ );
+ matmul(a + k * N, b + k, c + k * N + k, k, N);
+ tg.corun();
+
+ tg.silent_async(
+ [=]() { matmul(a + k, b + k * N, c, k, N); }
+ );
+ tg.silent_async(
+ [=]() { matmul(a + k, b + k * N + k, c + k, k, N); }
+ );
+ tg.silent_async(
+ [=]() { matmul(a + k * N + k, b + k * N, c + k * N, k, N); }
+ );
+ matmul(
+ a + k * N + k, b + k * N + k, c + k * N + k, k, N
+ );
+ tg.corun();
+ }
+}
+
+std::vector<int> run_matmul(tf::Executor& exec, int N) {
+ std::vector A(N * N, 1);
+ std::vector B(N * N, 1);
+ std::vector C(N * N, 0);
+
+ int* a = A.data();
+ int* b = B.data();
+ int* c = C.data();
+
+ for (int i = 0; i < N; i++) {
+ for (int j = 0; j < N; j++) {
+ a[i * N + j] = 1;
+ b[i * N + j] = 1;
+ c[i * N + j] = 0;
+ }
+ }
+
+ exec.async([=]() { matmul(a, b, c, N, N); }).get();
+ return C;
+}
+
+static void BM_Matmul(benchmark::State& state) {
+ int N = static_cast<int>(state.range(0));
+ if (!executor.has_value()) {
+ executor.emplace(thread_count);
+ }
+
+ // warmup
+ run_matmul(*executor, N);
+
+ for (auto _ : state) {
+ auto result = run_matmul(*executor, N);
+ benchmark::DoNotOptimize(result.data());
+ benchmark::ClobberMemory();
+ }
+}
+BENCHMARK(BM_Matmul)->Arg(512);
+BENCHMARK_MAIN();
diff --git a/cpp/codspeed/bench_nqueens.cpp b/cpp/codspeed/bench_nqueens.cpp
new file mode 100644
index 0000000..72b1a4e
--- /dev/null
+++ b/cpp/codspeed/bench_nqueens.cpp
@@ -0,0 +1,96 @@
+// CodSpeed benchmark wrapper for the nqueens test
+// using the taskflow runtime.
+
+// Adapted from cpp/taskflow/nqueens.cpp
+
+#include <benchmark/benchmark.h>
+
+#include <array>
+#include <cstddef>
+#include <optional>
+#include <ranges>
+#include <taskflow/taskflow.hpp>
+#include <thread>
+
+static size_t thread_count = std::thread::hardware_concurrency();
+std::optional<tf::Executor> executor;
+
+inline constexpr int nqueens_work = 14;
+
+inline constexpr std::array<size_t, 19> answers = {
+ 0, 1, 0, 0, 2, 10, 4,
+ 40, 92, 352, 724, 2'680, 14'200, 73'712,
+ 365'596, 2'279'184, 14'772'512, 95'815'104, 666'090'624,
+};
+
+template <size_t N>
+void nqueens(int xMax, std::array<char, N> buf, int& out) {
+ if (N == xMax) {
+ out = 1;
+ return;
+ }
+
+ size_t taskCount = 0;
+ std::array<int, N> results;
+ auto tasks =
+ std::ranges::views::iota(0UL, N) |
+ std::ranges::views::filter(
+ [xMax, &buf, &taskCount](int y) {
+ char q = y;
+ for (int x = 0; x < xMax; x++) {
+ char p = buf[x];
+ if (q == p || q == p - (xMax - x) ||
+ q == p + (xMax - x)) {
+ return false;
+ }
+ }
+ return true;
+ }
+ ) |
+ std::ranges::views::transform(
+ [xMax, &buf, &taskCount, &results](int y) {
+ buf[xMax] = y;
+ size_t idx = taskCount;
+ ++taskCount;
+ return [xMax, buf, idx, &results]() {
+ nqueens(xMax + 1, buf, results[idx]);
+ };
+ }
+ );
+
+ tf::TaskGroup tg = executor->task_group();
+
+ for (auto&& t : tasks) {
+ tg.silent_async(t);
+ }
+ tg.corun();
+
+ int ret = 0;
+ for (size_t i = 0; i < taskCount; ++i) {
+ ret += results[i];
+ }
+
+ out = ret;
+}
+
+static void BM_NQueens(benchmark::State& state) {
+ if (!executor.has_value()) {
+ executor.emplace(thread_count);
+ }
+
+ // warmup
+ {
+ std::array<char, nqueens_work> buf{};
+ int result;
+ executor->async([&]() { nqueens(0, buf, result); }).get();
+ }
+
+ for (auto _ : state) {
+ std::array<char, nqueens_work> buf{};
+ int result;
+ executor->async([&]() { nqueens(0, buf, result); }).get();
+ benchmark::DoNotOptimize(result);
+ }
+}
+BENCHMARK(BM_NQueens);
+BENCHMARK_MAIN();
diff --git a/cpp/codspeed/bench_skynet.cpp b/cpp/codspeed/bench_skynet.cpp
new file mode 100644
index 0000000..2457993
--- /dev/null
+++ b/cpp/codspeed/bench_skynet.cpp
@@ -0,0 +1,73 @@
+// CodSpeed benchmark wrapper for the skynet test
+// using the taskflow runtime.
+
+// Adapted from cpp/taskflow/skynet.cpp
+
+#include <benchmark/benchmark.h>
+
+#include <array>
+#include <cinttypes>
+#include <cstdio>
+#include <optional>
+#include <taskflow/taskflow.hpp>
+#include <thread>
+
+static size_t thread_count = std::thread::hardware_concurrency();
+std::optional<tf::Executor> executor;
+
+template <size_t DepthMax>
+size_t skynet_one(size_t BaseNum, size_t Depth) {
+ if (Depth == DepthMax) {
+ return BaseNum;
+ }
+ size_t depthOffset = 1;
+ for (size_t i = 0; i < DepthMax - Depth - 1; ++i) {
+ depthOffset *= 10;
+ }
+
+ std::array<size_t, 10> results;
+
+ tf::TaskGroup tg = executor->task_group();
+
+ for (size_t i = 0; i < 9; ++i) {
+ tg.silent_async([=, &results, idx = i]() {
+ results[idx] =
+ skynet_one<DepthMax>(BaseNum + depthOffset * idx, Depth + 1);
+ });
+ }
+ results[9] =
+ skynet_one<DepthMax>(BaseNum + depthOffset * 9, Depth + 1);
+ tg.corun();
+
+ size_t count = 0;
+ for (size_t idx = 0; idx < 10; ++idx) {
+ count += results[idx];
+ }
+ return count;
+}
+
+template <size_t DepthMax>
+void skynet(tf::Executor& exec) {
+ size_t count;
+ exec.async([&]() { count = skynet_one<DepthMax>(0, 0); }).get();
+ if (count != 4999999950000000) {
+ std::fprintf(
+ stderr, "ERROR: wrong result - %" PRIu64 "\n", count
+ );
+ }
+}
+
+static void BM_Skynet(benchmark::State& state) {
+ if (!executor.has_value()) {
+ executor.emplace(thread_count);
+ }
+
+ // warmup
+ skynet<8>(*executor);
+
+ for (auto _ : state) {
+ skynet<8>(*executor);
+ }
+}
+BENCHMARK(BM_Skynet);
+BENCHMARK_MAIN();
From badc8355c62973fbd4ec285933be81adf754679a Mon Sep 17 00:00:00 2001
From: "codspeed-hq[bot]" <117304815+codspeed-hq[bot]@users.noreply.github.com>
Date: Tue, 3 Mar 2026 17:33:32 +0000
Subject: [PATCH 2/2] Reduce benchmark problem sizes for simulation mode
Reduce problem sizes to ensure reasonable CI runtimes under CodSpeed
simulation (valgrind) mode:
- fib: 30 -> 20
- skynet: depth 8 (100M tasks) -> depth 6 (1M tasks)
- nqueens: N=14 -> N=10
- matmul: 512x512 -> 256x256
---
cpp/codspeed/bench_fib.cpp | 2 +-
cpp/codspeed/bench_matmul.cpp | 4 +++-
cpp/codspeed/bench_nqueens.cpp | 4 +++-
cpp/codspeed/bench_skynet.cpp | 14 +++++++++++---
4 files changed, 18 insertions(+), 6 deletions(-)
diff --git a/cpp/codspeed/bench_fib.cpp b/cpp/codspeed/bench_fib.cpp
index f8f5595..fb41bec 100644
--- a/cpp/codspeed/bench_fib.cpp
+++ b/cpp/codspeed/bench_fib.cpp
@@ -45,5 +45,5 @@ static void BM_Fib(benchmark::State& state) {
benchmark::DoNotOptimize(result);
}
}
-BENCHMARK(BM_Fib)->Arg(30);
+BENCHMARK(BM_Fib)->Arg(20);
BENCHMARK_MAIN();
diff --git a/cpp/codspeed/bench_matmul.cpp b/cpp/codspeed/bench_matmul.cpp
index 31a78fa..37517cf 100644
--- a/cpp/codspeed/bench_matmul.cpp
+++ b/cpp/codspeed/bench_matmul.cpp
@@ -2,6 +2,8 @@
// using the taskflow runtime.
// Adapted from cpp/taskflow/matmul.cpp
+// Uses 256x256 instead of larger sizes for reasonable CI runtimes
+// under simulation mode.
#include "matmul.hpp"
 #include <benchmark/benchmark.h>
@@ -84,5 +86,5 @@ static void BM_Matmul(benchmark::State& state) {
benchmark::ClobberMemory();
}
}
-BENCHMARK(BM_Matmul)->Arg(512);
+BENCHMARK(BM_Matmul)->Arg(256);
BENCHMARK_MAIN();
diff --git a/cpp/codspeed/bench_nqueens.cpp b/cpp/codspeed/bench_nqueens.cpp
index 72b1a4e..3aa3cb7 100644
--- a/cpp/codspeed/bench_nqueens.cpp
+++ b/cpp/codspeed/bench_nqueens.cpp
@@ -2,6 +2,8 @@
// using the taskflow runtime.
// Adapted from cpp/taskflow/nqueens.cpp
+// Uses N=10 instead of N=14 for reasonable CI runtimes
+// under simulation mode.
 #include <benchmark/benchmark.h>
@@ -15,7 +17,7 @@
static size_t thread_count = std::thread::hardware_concurrency();
 std::optional<tf::Executor> executor;
-inline constexpr int nqueens_work = 14;
+inline constexpr int nqueens_work = 10;
 inline constexpr std::array<size_t, 19> answers = {
0, 1, 0, 0, 2, 10, 4,
diff --git a/cpp/codspeed/bench_skynet.cpp b/cpp/codspeed/bench_skynet.cpp
index 2457993..5215b21 100644
--- a/cpp/codspeed/bench_skynet.cpp
+++ b/cpp/codspeed/bench_skynet.cpp
@@ -2,6 +2,8 @@
// using the taskflow runtime.
// Adapted from cpp/taskflow/skynet.cpp
+// Uses depth 6 (1M tasks) instead of depth 8 (100M tasks)
+// for reasonable CI runtimes under simulation mode.
 #include <benchmark/benchmark.h>
@@ -50,7 +52,13 @@ template <size_t DepthMax>
void skynet(tf::Executor& exec) {
size_t count;
 exec.async([&]() { count = skynet_one<DepthMax>(0, 0); }).get();
- if (count != 4999999950000000) {
+ size_t expected = 0;
+ size_t max_val = 1;
+ for (size_t i = 0; i < DepthMax; ++i) {
+ max_val *= 10;
+ }
+ expected = (max_val - 1) * max_val / 2;
+ if (count != expected) {
std::fprintf(
stderr, "ERROR: wrong result - %" PRIu64 "\n", count
);
@@ -63,10 +71,10 @@ static void BM_Skynet(benchmark::State& state) {
}
// warmup
- skynet<8>(*executor);
+ skynet<6>(*executor);
for (auto _ : state) {
- skynet<8>(*executor);
+ skynet<6>(*executor);
}
}
BENCHMARK(BM_Skynet);