Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions .github/workflows/codspeed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# CI workflow that builds the benchmark executables under cpp/codspeed and
# runs them through the CodSpeed GitHub Action.
name: CodSpeed

# Triggers: pushes to main, any pull request, and manual dispatch.
on:
push:
branches:
- "main"
pull_request:
workflow_dispatch:

permissions:
contents: read
# NOTE(review): presumably required so the CodSpeed action can request an
# OIDC token for uploading results - confirm against the action's docs.
id-token: write

jobs:
codspeed:
name: Run benchmarks
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
# Check out nested submodules as well.
submodules: recursive

# Configures with CODSPEED_MODE=simulation and builds only the four benchmark
# executables that the following step executes.
- name: Build CodSpeed benchmarks
run: |
cd cpp/codspeed
mkdir -p build && cd build
cmake -DCODSPEED_MODE=simulation -DCMAKE_BUILD_TYPE=RelWithDebInfo ..
make -j$(nproc) bench_fib bench_skynet bench_nqueens bench_matmul

# NOTE(review): the action's mode presumably has to agree with the
# CODSPEED_MODE value used at configure time above - confirm.
- name: Run CodSpeed benchmarks
uses: CodSpeedHQ/action@v4
with:
mode: simulation
run: |
cd cpp/codspeed/build
./bench_fib
./bench_skynet
./bench_nqueens
./bench_matmul
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# runtime-benchmarks
Benchmarks to compare the performance of async runtimes / executors.

[![CodSpeed](https://img.shields.io/endpoint?url=https://codspeed.io/badge.json)](https://codspeed.io/tzcnt/runtime-benchmarks?utm_source=badge)

[<img src="https://fleetcode.com/img/bench-splash.png">](https://fleetcode.com/runtime-benchmarks/)

An interactive view of the full results dataset is available at: https://fleetcode.com/runtime-benchmarks/
Expand Down
50 changes: 50 additions & 0 deletions cpp/codspeed/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Standalone build for the CodSpeed-instrumented Taskflow benchmarks.
#
# 3.18 is the first CMake release in which FetchContent_MakeAvailable()
# honors the SOURCE_SUBDIR option used for the benchmark library below.
cmake_minimum_required(VERSION 3.18)
project(runtime_benchmarks_codspeed)

# Make the repository's shared CMake helpers visible to include()/find_package().
set(CMAKE_MODULE_PATH
  ${runtime_benchmarks_codspeed_SOURCE_DIR}/../1CMake
  ${CMAKE_MODULE_PATH})

set(CMAKE_EXPORT_COMPILE_COMMANDS "1")
set(CMAKE_CXX_STANDARD 20)
# bench_nqueens.cpp uses <ranges>; fail at configure time instead of silently
# falling back to an older standard on compilers without C++20 support.
set(CMAKE_CXX_STANDARD_REQUIRED ON)

add_definitions(
  "-DTF_ENABLE_ATOMIC_NOTIFIER"
)

include(../1CMake/CPM.cmake)

# Taskflow is consumed as headers only (see include_directories below), so the
# package is downloaded but not added to the build.
CPMAddPackage(
  NAME taskflow
  GIT_REPOSITORY https://github.com/taskflow/taskflow.git
  GIT_TAG 1ac5852c5a1679face4a4755eaab5dbcc558128e
  DOWNLOAD_ONLY)

set(BENCHMARK_DOWNLOAD_DEPENDENCIES ON)

# CodSpeed's fork of Google Benchmark lives in a subdirectory of its repository.
# NOTE(review): GIT_TAG main follows a moving branch; consider pinning a release
# tag or commit hash (as done for taskflow above) for reproducible results.
include(FetchContent)
FetchContent_Declare(
  google_benchmark
  GIT_REPOSITORY https://github.com/CodSpeedHQ/codspeed-cpp
  SOURCE_SUBDIR google_benchmark
  GIT_TAG main
)
FetchContent_MakeAvailable(google_benchmark)

include_directories(
  ${taskflow_SOURCE_DIR}
  "../2common"
)

# One executable per benchmark; each provides its own BENCHMARK_MAIN().
add_executable(bench_fib bench_fib.cpp)
target_link_libraries(bench_fib PRIVATE benchmark::benchmark)

add_executable(bench_skynet bench_skynet.cpp)
target_link_libraries(bench_skynet PRIVATE benchmark::benchmark)

add_executable(bench_nqueens bench_nqueens.cpp)
target_link_libraries(bench_nqueens PRIVATE benchmark::benchmark)
target_compile_options(bench_nqueens PRIVATE "-falign-loops=64")

add_executable(bench_matmul bench_matmul.cpp)
target_link_libraries(bench_matmul PRIVATE benchmark::benchmark)
49 changes: 49 additions & 0 deletions cpp/codspeed/bench_fib.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// CodSpeed benchmark wrapper for the recursive fork fibonacci test
// using the taskflow runtime.

// Adapted from cpp/taskflow/fib.cpp

#include <taskflow/taskflow.hpp>

#include <benchmark/benchmark.h>
#include <cstdlib>
#include <optional>
#include <thread>

// Number of worker threads requested from the executor.
// std::thread::hardware_concurrency() is allowed to return 0 when the value
// cannot be determined, so fall back to a single worker instead of asking for
// an executor with no threads.
static size_t thread_count = [] {
  const unsigned detected = std::thread::hardware_concurrency();
  return detected != 0 ? static_cast<size_t>(detected) : size_t{1};
}();
// Shared Taskflow executor. Starts empty; BM_Fib constructs it on first use
// with thread_count workers, and fib() reaches it through this global to
// create its task groups.
std::optional<tf::Executor> executor;

size_t fib(size_t n) {
if (n < 2) {
return n;
}

tf::TaskGroup tg = executor->task_group();

size_t x, y;

tg.silent_async([n, &x]() { x = fib(n - 1); });
y = fib(n - 2);

tg.corun();
return x + y;
}

// Google Benchmark body: times one full fib(n) evaluation per iteration,
// where n is the benchmark's first range argument. Each evaluation is
// submitted to the shared executor and waited on from the benchmark thread.
static void BM_Fib(benchmark::State& state) {
  // The executor is created once and kept alive across benchmark invocations.
  if (!executor) {
    executor.emplace(thread_count);
  }

  const auto n = static_cast<size_t>(state.range(0));
  size_t result = 0;

  // Submits a single fib(n) computation and blocks until it completes.
  const auto run_once = [&result, n]() {
    executor->async([&result, n]() { result = fib(n); }).get();
  };

  // warmup
  run_once();

  for (auto _ : state) {
    result = 0;
    run_once();
    benchmark::DoNotOptimize(result);
  }
}
BENCHMARK(BM_Fib)->Arg(20);
BENCHMARK_MAIN();
90 changes: 90 additions & 0 deletions cpp/codspeed/bench_matmul.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// CodSpeed benchmark wrapper for the matmul test
// using the taskflow runtime.

// Adapted from cpp/taskflow/matmul.cpp
// Uses 256x256 instead of larger sizes for reasonable CI runtimes
// under simulation mode.

#include "matmul.hpp"
#include <taskflow/algorithm/for_each.hpp>
#include <taskflow/taskflow.hpp>

#include <benchmark/benchmark.h>
#include <cstdlib>
#include <exception>
#include <optional>
#include <thread>
#include <vector>

// Number of worker threads requested from the executor.
// std::thread::hardware_concurrency() is allowed to return 0 when the value
// cannot be determined, so fall back to a single worker instead of asking for
// an executor with no threads.
static size_t thread_count = [] {
  const unsigned detected = std::thread::hardware_concurrency();
  return detected != 0 ? static_cast<size_t>(detected) : size_t{1};
}();
// Shared Taskflow executor. Starts empty; BM_Matmul constructs it on first
// use with thread_count workers, and matmul() reaches it through this global
// to create its task groups.
std::optional<tf::Executor> executor;

void matmul(int* a, int* b, int* c, int n, int N) {
if (n <= 32) {
matmul_small(a, b, c, n, N);
} else {
int k = n / 2;

tf::TaskGroup tg = executor->task_group();
tg.silent_async([=]() { matmul(a, b, c, k, N); });
tg.silent_async([=]() { matmul(a, b + k, c + k, k, N); });
tg.silent_async(
[=]() { matmul(a + k * N, b, c + k * N, k, N); }
);
matmul(a + k * N, b + k, c + k * N + k, k, N);
tg.corun();

tg.silent_async(
[=]() { matmul(a + k, b + k * N, c, k, N); }
);
tg.silent_async(
[=]() { matmul(a + k, b + k * N + k, c + k, k, N); }
);
tg.silent_async(
[=]() { matmul(a + k * N + k, b + k * N, c + k * N, k, N); }
);
matmul(
a + k * N + k, b + k * N + k, c + k * N + k, k, N
);
tg.corun();
}
}

// Multiplies two N x N all-ones matrices on `exec` and returns the product
// buffer (row-major, N * N elements).
//
// The recursive matmul() is launched as a single async task on `exec`, and
// this function blocks until that task has completed.
std::vector<int> run_matmul(tf::Executor& exec, int N) {
  // Widen before multiplying so the element count is not computed in int,
  // where N * N could overflow.
  const size_t cells = static_cast<size_t>(N) * static_cast<size_t>(N);

  // The fill constructors already set every element (A = B = 1, C = 0), so no
  // separate initialization pass over the buffers is needed.
  std::vector<int> A(cells, 1);
  std::vector<int> B(cells, 1);
  std::vector<int> C(cells, 0);

  int* a = A.data();
  int* b = B.data();
  int* c = C.data();

  exec.async([=]() { matmul(a, b, c, N, N); }).get();
  return C;
}

// Google Benchmark body: times one run_matmul() call per iteration for an
// N x N problem, where N is the benchmark's first range argument. Note that
// run_matmul() allocates and fills its matrices itself, so that work is part
// of every timed iteration.
static void BM_Matmul(benchmark::State& state) {
  // The executor is created once and kept alive across benchmark invocations.
  if (!executor) {
    executor.emplace(thread_count);
  }
  tf::Executor& pool = *executor;

  const int N = static_cast<int>(state.range(0));

  // warmup; the product is discarded
  static_cast<void>(run_matmul(pool, N));

  for (auto _ : state) {
    std::vector<int> product = run_matmul(pool, N);
    // Keep the result observable so the computation cannot be elided.
    benchmark::DoNotOptimize(product.data());
    benchmark::ClobberMemory();
  }
}
BENCHMARK(BM_Matmul)->Arg(256);
BENCHMARK_MAIN();
98 changes: 98 additions & 0 deletions cpp/codspeed/bench_nqueens.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// CodSpeed benchmark wrapper for the nqueens test
// using the taskflow runtime.

// Adapted from cpp/taskflow/nqueens.cpp
// Uses N=10 instead of N=14 for reasonable CI runtimes
// under simulation mode.

#include <taskflow/taskflow.hpp>

#include <array>
#include <benchmark/benchmark.h>
#include <cstdlib>
#include <optional>
#include <ranges>
#include <thread>

// Number of worker threads requested from the executor.
// std::thread::hardware_concurrency() is allowed to return 0 when the value
// cannot be determined, so fall back to a single worker instead of asking for
// an executor with no threads.
static size_t thread_count = [] {
  const unsigned detected = std::thread::hardware_concurrency();
  return detected != 0 ? static_cast<size_t>(detected) : size_t{1};
}();
// Shared Taskflow executor. Starts empty; BM_NQueens constructs it on first
// use with thread_count workers, and nqueens() reaches it through this global
// to create its task groups.
std::optional<tf::Executor> executor;

// Board size used by this benchmark: BM_NQueens builds its board buffer as
// std::array<char, nqueens_work>.
inline constexpr int nqueens_work = 10;

// NOTE(review): these values look like the N-queens solution counts indexed by
// board size (e.g. answers[8] == 92) - confirm against the file this was
// adapted from.
// Only the first 19 of the 28 slots are listed; the remaining elements are
// value-initialized to 0.
inline constexpr std::array<int, 28> answers = {
0, 1, 0, 0, 2, 10, 4,
40, 92, 352, 724, 2'680, 14'200, 73'712,
365'596, 2'279'184, 14'772'512, 95'815'104, 666'090'624,
};

// Counts the ways to complete a partial N-queens placement, in parallel.
//
// buf[0 .. xMax) holds the value already chosen for each of the first xMax
// positions. Every candidate value y in [0, N) for position xMax is tested
// against the earlier positions and rejected if it equals an earlier value or
// differs from it by exactly the distance between the two positions. Each
// surviving candidate is explored recursively as its own task in a task group
// taken from the global executor, and the per-task counts are summed.
//
// Parameters:
//   xMax - index of the next position to fill; must be in [0, N].
//   buf  - placement so far; taken by value so each call owns its copy.
//   out  - receives the count; 1 when xMax == N (placement complete), 0 when
//          no candidate survives.
//
// Requires the global `executor` to hold a value.
template <size_t N>
void nqueens(int xMax, std::array<char, N> buf, int& out) {
  if (static_cast<size_t>(xMax) == N) {
    out = 1;
    return;
  }

  size_t taskCount = 0;
  // At most N candidates survive the filter, so N slots are always enough.
  // Sized by the template parameter rather than a global constant so the
  // function stays in bounds for any instantiation.
  std::array<int, N> results;
  auto tasks =
    std::ranges::views::iota(0UL, N) |
    std::ranges::views::filter([xMax, &buf](int y) {
      char q = y;
      for (int x = 0; x < xMax; x++) {
        char p = buf[x];
        if (q == p || q == p - (xMax - x) || q == p + (xMax - x)) {
          return false;
        }
      }
      return true;
    }) |
    std::ranges::views::transform(
      [xMax, &buf, &taskCount, &results](int y) {
        // Runs once per surviving candidate as the range is iterated below:
        // records the candidate in buf, reserves the next results slot, and
        // snapshots buf by value into the task so later candidates do not
        // disturb it.
        buf[xMax] = y;
        size_t idx = taskCount;
        ++taskCount;
        return [xMax, buf, idx, &results]() {
          nqueens(xMax + 1, buf, results[idx]);
        };
      }
    );

  tf::TaskGroup tg = executor->task_group();

  // The view is lazy: iterating it here is what builds and submits the tasks.
  for (auto&& t : tasks) {
    tg.silent_async(t);
  }
  tg.corun();

  int ret = 0;
  for (size_t i = 0; i < taskCount; ++i) {
    ret += results[i];
  }

  out = ret;
}

// Google Benchmark body: times one full nqueens() solve of an
// nqueens_work-sized board per iteration. Each solve is submitted to the
// shared executor and waited on from the benchmark thread.
//
// The warmup solve doubles as a correctness check against the `answers`
// table, so a broken solver is reported as a benchmark error instead of
// producing a meaningless timing. The check sits outside the timed loop and
// therefore does not affect the measurement.
static void BM_NQueens(benchmark::State& state) {
  static_assert(
    nqueens_work >= 0 &&
      static_cast<size_t>(nqueens_work) < answers.size(),
    "answers has no entry for nqueens_work"
  );

  // The executor is created once and kept alive across benchmark invocations.
  if (!executor.has_value()) {
    executor.emplace(thread_count);
  }

  // warmup + validation
  {
    std::array<char, nqueens_work> buf{};
    int result = 0;
    executor->async([&]() { nqueens(0, buf, result); }).get();
    if (result != answers[nqueens_work]) {
      state.SkipWithError("nqueens returned an unexpected solution count");
      return;
    }
  }

  for (auto _ : state) {
    std::array<char, nqueens_work> buf{};
    // Left uninitialized as before: nqueens() assigns `out` on every path.
    int result;
    executor->async([&]() { nqueens(0, buf, result); }).get();
    benchmark::DoNotOptimize(result);
  }
}
BENCHMARK(BM_NQueens);
BENCHMARK_MAIN();
Loading