From d28b2e0786b3d2b699f48cbd3c4af7b351ca4457 Mon Sep 17 00:00:00 2001 From: "codspeed-hq[bot]" <117304815+codspeed-hq[bot]@users.noreply.github.com> Date: Tue, 3 Mar 2026 16:47:22 +0000 Subject: [PATCH 1/2] Add CodSpeed performance benchmarks Set up continuous performance tracking using CodSpeed with google_benchmark wrappers for the taskflow runtime benchmarks (fib, skynet, nqueens, matmul). - Add cpp/codspeed/ directory with CMake build and benchmark source files - Add .github/workflows/codspeed.yml for CI integration - Add CodSpeed badge to README --- .github/workflows/codspeed.yml | 39 ++++++++++++++ README.md | 2 + cpp/codspeed/CMakeLists.txt | 50 ++++++++++++++++++ cpp/codspeed/bench_fib.cpp | 49 +++++++++++++++++ cpp/codspeed/bench_matmul.cpp | 88 +++++++++++++++++++++++++++++++ cpp/codspeed/bench_nqueens.cpp | 96 ++++++++++++++++++++++++++++++++++ cpp/codspeed/bench_skynet.cpp | 73 ++++++++++++++++++++++++++ 7 files changed, 397 insertions(+) create mode 100644 .github/workflows/codspeed.yml create mode 100644 cpp/codspeed/CMakeLists.txt create mode 100644 cpp/codspeed/bench_fib.cpp create mode 100644 cpp/codspeed/bench_matmul.cpp create mode 100644 cpp/codspeed/bench_nqueens.cpp create mode 100644 cpp/codspeed/bench_skynet.cpp diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml new file mode 100644 index 0000000..cc8ffdb --- /dev/null +++ b/.github/workflows/codspeed.yml @@ -0,0 +1,39 @@ +name: CodSpeed + +on: + push: + branches: + - "main" + pull_request: + workflow_dispatch: + +permissions: + contents: read + id-token: write + +jobs: + codspeed: + name: Run benchmarks + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Build CodSpeed benchmarks + run: | + cd cpp/codspeed + mkdir -p build && cd build + cmake -DCODSPEED_MODE=simulation -DCMAKE_BUILD_TYPE=RelWithDebInfo .. 
+ make -j$(nproc) bench_fib bench_skynet bench_nqueens bench_matmul + + - name: Run CodSpeed benchmarks + uses: CodSpeedHQ/action@v4 + with: + mode: simulation + run: | + cd cpp/codspeed/build + ./bench_fib + ./bench_skynet + ./bench_nqueens + ./bench_matmul diff --git a/README.md b/README.md index df3d90d..616f053 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # runtime-benchmarks Benchmarks to compare the performance of async runtimes / executors. +[![CodSpeed](https://img.shields.io/endpoint?url=https://codspeed.io/badge.json)](https://codspeed.io/tzcnt/runtime-benchmarks?utm_source=badge) + [](https://fleetcode.com/runtime-benchmarks/) An interactive view of the full results dataset is available at: https://fleetcode.com/runtime-benchmarks/ diff --git a/cpp/codspeed/CMakeLists.txt b/cpp/codspeed/CMakeLists.txt new file mode 100644 index 0000000..163e5f7 --- /dev/null +++ b/cpp/codspeed/CMakeLists.txt @@ -0,0 +1,50 @@ +cmake_minimum_required(VERSION 3.16) +project(runtime_benchmarks_codspeed) + +set(CMAKE_MODULE_PATH + ${runtime_benchmarks_codspeed_SOURCE_DIR}/../1CMake + ${CMAKE_MODULE_PATH}) + +set(CMAKE_EXPORT_COMPILE_COMMANDS "1") +set(CMAKE_CXX_STANDARD 20) + +add_definitions( + "-DTF_ENABLE_ATOMIC_NOTIFIER" +) + +include(../1CMake/CPM.cmake) + +CPMAddPackage( + NAME taskflow + GIT_REPOSITORY https://github.com/taskflow/taskflow.git + GIT_TAG 1ac5852c5a1679face4a4755eaab5dbcc558128e + DOWNLOAD_ONLY) + +set(BENCHMARK_DOWNLOAD_DEPENDENCIES ON) + +include(FetchContent) +FetchContent_Declare( + google_benchmark + GIT_REPOSITORY https://github.com/CodSpeedHQ/codspeed-cpp + SOURCE_SUBDIR google_benchmark + GIT_TAG main +) +FetchContent_MakeAvailable(google_benchmark) + +include_directories( + ${taskflow_SOURCE_DIR} + "../2common" +) + +add_executable(bench_fib bench_fib.cpp) +target_link_libraries(bench_fib benchmark::benchmark) + +add_executable(bench_skynet bench_skynet.cpp) +target_link_libraries(bench_skynet benchmark::benchmark) + 
+add_executable(bench_nqueens bench_nqueens.cpp)
+target_link_libraries(bench_nqueens benchmark::benchmark)
+target_compile_options(bench_nqueens PRIVATE "-falign-loops=64")
+
+add_executable(bench_matmul bench_matmul.cpp)
+target_link_libraries(bench_matmul benchmark::benchmark)
diff --git a/cpp/codspeed/bench_fib.cpp b/cpp/codspeed/bench_fib.cpp
new file mode 100644
index 0000000..f8f5595
--- /dev/null
+++ b/cpp/codspeed/bench_fib.cpp
@@ -0,0 +1,49 @@
+// CodSpeed benchmark wrapper for the recursive fork fibonacci test
+// using the taskflow runtime.
+
+// Adapted from cpp/taskflow/fib.cpp
+
+#include <benchmark/benchmark.h>
+
+#include <cstdio>
+#include <optional>
+#include <taskflow/taskflow.hpp>
+#include <thread>
+
+static size_t thread_count = std::thread::hardware_concurrency();
+std::optional<tf::Executor> executor;
+
+size_t fib(size_t n) {
+  if (n < 2) {
+    return n;
+  }
+
+  tf::TaskGroup tg = executor->task_group();
+
+  size_t x, y;
+
+  tg.silent_async([n, &x]() { x = fib(n - 1); });
+  y = fib(n - 2);
+
+  tg.corun();
+  return x + y;
+}
+
+static void BM_Fib(benchmark::State& state) {
+  size_t n = static_cast<size_t>(state.range(0));
+  if (!executor.has_value()) {
+    executor.emplace(thread_count);
+  }
+
+  // warmup
+  size_t result = 0;
+  executor->async([&result, n]() { result = fib(n); }).get();
+
+  for (auto _ : state) {
+    result = 0;
+    executor->async([&result, n]() { result = fib(n); }).get();
+    benchmark::DoNotOptimize(result);
+  }
+}
+BENCHMARK(BM_Fib)->Arg(30);
+BENCHMARK_MAIN();
diff --git a/cpp/codspeed/bench_matmul.cpp b/cpp/codspeed/bench_matmul.cpp
new file mode 100644
index 0000000..31a78fa
--- /dev/null
+++ b/cpp/codspeed/bench_matmul.cpp
@@ -0,0 +1,88 @@
+// CodSpeed benchmark wrapper for the matmul test
+// using the taskflow runtime.
+
+// Adapted from cpp/taskflow/matmul.cpp
+
+#include "matmul.hpp"
+#include <benchmark/benchmark.h>
+#include <taskflow/taskflow.hpp>
+
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <optional>
+#include <thread>
+#include <vector>
+
+static size_t thread_count = std::thread::hardware_concurrency();
+std::optional<tf::Executor> executor;
+
+void matmul(int* a, int* b, int* c, int n, int N) {
+  if (n <= 32) {
+    matmul_small(a, b, c, n, N);
+  } else {
+    int k = n / 2;
+
+    tf::TaskGroup tg = executor->task_group();
+    tg.silent_async([=]() { matmul(a, b, c, k, N); });
+    tg.silent_async([=]() { matmul(a, b + k, c + k, k, N); });
+    tg.silent_async(
+      [=]() { matmul(a + k * N, b, c + k * N, k, N); }
+    );
+    matmul(a + k * N, b + k, c + k * N + k, k, N);
+    tg.corun();
+
+    tg.silent_async(
+      [=]() { matmul(a + k, b + k * N, c, k, N); }
+    );
+    tg.silent_async(
+      [=]() { matmul(a + k, b + k * N + k, c + k, k, N); }
+    );
+    tg.silent_async(
+      [=]() { matmul(a + k * N + k, b + k * N, c + k * N, k, N); }
+    );
+    matmul(
+      a + k * N + k, b + k * N + k, c + k * N + k, k, N
+    );
+    tg.corun();
+  }
+}
+
+std::vector<int> run_matmul(tf::Executor& exec, int N) {
+  std::vector<int> A(N * N, 1);
+  std::vector<int> B(N * N, 1);
+  std::vector<int> C(N * N, 0);
+
+  int* a = A.data();
+  int* b = B.data();
+  int* c = C.data();
+
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < N; j++) {
+      a[i * N + j] = 1;
+      b[i * N + j] = 1;
+      c[i * N + j] = 0;
+    }
+  }
+
+  exec.async([=]() { matmul(a, b, c, N, N); }).get();
+  return C;
+}
+
+static void BM_Matmul(benchmark::State& state) {
+  int N = static_cast<int>(state.range(0));
+  if (!executor.has_value()) {
+    executor.emplace(thread_count);
+  }
+
+  // warmup
+  run_matmul(*executor, N);
+
+  for (auto _ : state) {
+    auto result = run_matmul(*executor, N);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+}
+BENCHMARK(BM_Matmul)->Arg(512);
+BENCHMARK_MAIN();
diff --git a/cpp/codspeed/bench_nqueens.cpp b/cpp/codspeed/bench_nqueens.cpp
new file mode 100644
index 0000000..72b1a4e
--- /dev/null
+++ b/cpp/codspeed/bench_nqueens.cpp
@@ -0,0 +1,96 @@
+// CodSpeed benchmark wrapper for the nqueens test
+// using the taskflow runtime.
+
+// Adapted from cpp/taskflow/nqueens.cpp
+
+#include <benchmark/benchmark.h>
+
+#include <array>
+#include <optional>
+#include <ranges>
+#include <taskflow/taskflow.hpp>
+#include <thread>
+
+static size_t thread_count = std::thread::hardware_concurrency();
+std::optional<tf::Executor> executor;
+
+inline constexpr int nqueens_work = 14;
+
+inline constexpr std::array<size_t, 19> answers = {
+  0, 1, 0, 0, 2, 10, 4,
+  40, 92, 352, 724, 2'680, 14'200, 73'712,
+  365'596, 2'279'184, 14'772'512, 95'815'104, 666'090'624,
+};
+
+template <size_t N>
+void nqueens(int xMax, std::array<char, N> buf, int& out) {
+  if (N == xMax) {
+    out = 1;
+    return;
+  }
+
+  size_t taskCount = 0;
+  std::array<int, N> results;
+  auto tasks =
+    std::ranges::views::iota(0UL, N) |
+    std::ranges::views::filter(
+      [xMax, &buf, &taskCount](int y) {
+        char q = y;
+        for (int x = 0; x < xMax; x++) {
+          char p = buf[x];
+          if (q == p || q == p - (xMax - x) ||
+              q == p + (xMax - x)) {
+            return false;
+          }
+        }
+        return true;
+      }
+    ) |
+    std::ranges::views::transform(
+      [xMax, &buf, &taskCount, &results](int y) {
+        buf[xMax] = y;
+        size_t idx = taskCount;
+        ++taskCount;
+        return [xMax, buf, idx, &results]() {
+          nqueens(xMax + 1, buf, results[idx]);
+        };
+      }
+    );
+
+  tf::TaskGroup tg = executor->task_group();
+
+  for (auto&& t : tasks) {
+    tg.silent_async(t);
+  }
+  tg.corun();
+
+  int ret = 0;
+  for (size_t i = 0; i < taskCount; ++i) {
+    ret += results[i];
+  }
+
+  out = ret;
+}
+
+static void BM_NQueens(benchmark::State& state) {
+  if (!executor.has_value()) {
+    executor.emplace(thread_count);
+  }
+
+  // warmup
+  {
+    std::array<char, nqueens_work> buf{};
+    int result;
+    executor->async([&]() { nqueens(0, buf, result); }).get();
+  }
+
+  for (auto _ : state) {
+    std::array<char, nqueens_work> buf{};
+    int result;
+    executor->async([&]() { nqueens(0, buf, result); }).get();
+    benchmark::DoNotOptimize(result);
+  }
+}
+BENCHMARK(BM_NQueens);
+BENCHMARK_MAIN();
diff --git a/cpp/codspeed/bench_skynet.cpp b/cpp/codspeed/bench_skynet.cpp
new file mode 100644
index 0000000..2457993
--- /dev/null
+++ b/cpp/codspeed/bench_skynet.cpp
@@ -0,0 +1,73 @@
+// CodSpeed benchmark wrapper for the skynet test
+// using the taskflow runtime.
+
+// Adapted from cpp/taskflow/skynet.cpp
+
+#include <benchmark/benchmark.h>
+
+#include <cinttypes>
+#include <cstdio>
+#include <optional>
+#include <taskflow/taskflow.hpp>
+#include <thread>
+
+static size_t thread_count = std::thread::hardware_concurrency();
+std::optional<tf::Executor> executor;
+
+template <size_t DepthMax>
+size_t skynet_one(size_t BaseNum, size_t Depth) {
+  if (Depth == DepthMax) {
+    return BaseNum;
+  }
+  size_t depthOffset = 1;
+  for (size_t i = 0; i < DepthMax - Depth - 1; ++i) {
+    depthOffset *= 10;
+  }
+
+  std::array<size_t, 10> results;
+
+  tf::TaskGroup tg = executor->task_group();
+
+  for (size_t i = 0; i < 9; ++i) {
+    tg.silent_async([=, &results, idx = i]() {
+      results[idx] =
+        skynet_one<DepthMax>(BaseNum + depthOffset * idx, Depth + 1);
+    });
+  }
+  results[9] =
+    skynet_one<DepthMax>(BaseNum + depthOffset * 9, Depth + 1);
+  tg.corun();
+
+  size_t count = 0;
+  for (size_t idx = 0; idx < 10; ++idx) {
+    count += results[idx];
+  }
+  return count;
+}
+
+template <size_t DepthMax>
+void skynet(tf::Executor& exec) {
+  size_t count;
+  exec.async([&]() { count = skynet_one<DepthMax>(0, 0); }).get();
+  if (count != 4999999950000000) {
+    std::fprintf(
+      stderr, "ERROR: wrong result - %" PRIu64 "\n", count
+    );
+  }
+}
+
+static void BM_Skynet(benchmark::State& state) {
+  if (!executor.has_value()) {
+    executor.emplace(thread_count);
+  }
+
+  // warmup
+  skynet<8>(*executor);
+
+  for (auto _ : state) {
+    skynet<8>(*executor);
+  }
+}
+BENCHMARK(BM_Skynet);
+BENCHMARK_MAIN();
From badc8355c62973fbd4ec285933be81adf754679a Mon Sep 17 00:00:00 2001
From: "codspeed-hq[bot]" <117304815+codspeed-hq[bot]@users.noreply.github.com>
Date: Tue, 3 Mar 2026 17:33:32 +0000
Subject: [PATCH 2/2] Reduce benchmark problem sizes for simulation mode

Reduce problem sizes to ensure reasonable CI runtimes under CodSpeed
simulation (valgrind) mode:
- fib: 30 -> 20
- skynet: depth 8 (100M tasks) -> depth 6 (1M tasks)
- nqueens: N=14 -> N=10
- matmul: 512x512 -> 256x256
---
 cpp/codspeed/bench_fib.cpp     |  2 +-
 cpp/codspeed/bench_matmul.cpp  |  4 +++-
 cpp/codspeed/bench_nqueens.cpp |  4 +++-
 cpp/codspeed/bench_skynet.cpp  | 14 +++++++++++---
 4 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/cpp/codspeed/bench_fib.cpp b/cpp/codspeed/bench_fib.cpp
index f8f5595..fb41bec 100644
--- a/cpp/codspeed/bench_fib.cpp
+++ b/cpp/codspeed/bench_fib.cpp
@@ -45,5 +45,5 @@ static void BM_Fib(benchmark::State& state) {
     benchmark::DoNotOptimize(result);
   }
 }
-BENCHMARK(BM_Fib)->Arg(30);
+BENCHMARK(BM_Fib)->Arg(20);
 BENCHMARK_MAIN();
diff --git a/cpp/codspeed/bench_matmul.cpp b/cpp/codspeed/bench_matmul.cpp
index 31a78fa..37517cf 100644
--- a/cpp/codspeed/bench_matmul.cpp
+++ b/cpp/codspeed/bench_matmul.cpp
@@ -2,6 +2,8 @@
 // using the taskflow runtime.
 
 // Adapted from cpp/taskflow/matmul.cpp
+// Uses 256x256 instead of larger sizes for reasonable CI runtimes
+// under simulation mode.
 
 #include "matmul.hpp"
 #include <benchmark/benchmark.h>
@@ -84,5 +86,5 @@ static void BM_Matmul(benchmark::State& state) {
     benchmark::ClobberMemory();
   }
 }
-BENCHMARK(BM_Matmul)->Arg(512);
+BENCHMARK(BM_Matmul)->Arg(256);
 BENCHMARK_MAIN();
diff --git a/cpp/codspeed/bench_nqueens.cpp b/cpp/codspeed/bench_nqueens.cpp
index 72b1a4e..3aa3cb7 100644
--- a/cpp/codspeed/bench_nqueens.cpp
+++ b/cpp/codspeed/bench_nqueens.cpp
@@ -2,6 +2,8 @@
 // using the taskflow runtime.
 
 // Adapted from cpp/taskflow/nqueens.cpp
+// Uses N=10 instead of N=14 for reasonable CI runtimes
+// under simulation mode.
 
 #include <benchmark/benchmark.h>
 
@@ -15,7 +17,7 @@
 static size_t thread_count = std::thread::hardware_concurrency();
 std::optional<tf::Executor> executor;
 
-inline constexpr int nqueens_work = 14;
+inline constexpr int nqueens_work = 10;
 
 inline constexpr std::array<size_t, 19> answers = {
   0, 1, 0, 0, 2, 10, 4,
diff --git a/cpp/codspeed/bench_skynet.cpp b/cpp/codspeed/bench_skynet.cpp
index 2457993..5215b21 100644
--- a/cpp/codspeed/bench_skynet.cpp
+++ b/cpp/codspeed/bench_skynet.cpp
@@ -2,6 +2,8 @@
 // using the taskflow runtime.
 
 // Adapted from cpp/taskflow/skynet.cpp
+// Uses depth 6 (1M tasks) instead of depth 8 (100M tasks)
+// for reasonable CI runtimes under simulation mode.
 
 #include <benchmark/benchmark.h>
 
@@ -50,7 +52,13 @@ template <size_t DepthMax>
 void skynet(tf::Executor& exec) {
   size_t count;
   exec.async([&]() { count = skynet_one<DepthMax>(0, 0); }).get();
-  if (count != 4999999950000000) {
+  size_t expected = 0;
+  size_t max_val = 1;
+  for (size_t i = 0; i < DepthMax; ++i) {
+    max_val *= 10;
+  }
+  expected = (max_val - 1) * max_val / 2;
+  if (count != expected) {
     std::fprintf(
       stderr, "ERROR: wrong result - %" PRIu64 "\n", count
     );
@@ -63,10 +71,10 @@ static void BM_Skynet(benchmark::State& state) {
   }
 
   // warmup
-  skynet<8>(*executor);
+  skynet<6>(*executor);
 
   for (auto _ : state) {
-    skynet<8>(*executor);
+    skynet<6>(*executor);
   }
 }
 BENCHMARK(BM_Skynet);