Skip to content
7 changes: 7 additions & 0 deletions .github/workflows/baseline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ concurrency:

permissions:
contents: read
actions: write

env:
CARGO_TERM_COLOR: always
Expand Down Expand Up @@ -50,6 +51,12 @@ jobs:

# Cache keyed by SHA so each merge gets its own entry.
# benchmark.yml uses restore-keys prefix matching to find the latest one.
# Delete any stale entry for this SHA first: an existing cache key cannot be
# overwritten, so a re-run on the same commit could not save a fresh baseline.
- name: Clear stale baseline cache
run: gh cache delete "fluxbench-baseline-${{ github.sha }}" 2>/dev/null || true
env:
GH_TOKEN: ${{ github.token }}

- name: Cache baseline
uses: actions/cache/save@v4
with:
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ members = [

[workspace.package]
authors = ["Farhan Syah"]
version = "0.1.1"
version = "0.1.2"
edition = "2024"
rust-version = "1.85"
license = "Apache-2.0"
Expand Down
6 changes: 6 additions & 0 deletions examples/examples/ci_regression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ use std::hint::black_box;
group = "hot_path",
severity = "critical",
threshold = 5.0,
warmup = "2s",
measurement = "3s",
tags = "latency"
)]
fn request_handler(b: &mut Bencher) {
Expand All @@ -50,6 +52,8 @@ fn request_handler(b: &mut Bencher) {
group = "hot_path",
severity = "critical",
threshold = 3.0,
warmup = "2s",
measurement = "3s",
tags = "throughput"
)]
fn token_scan(b: &mut Bencher) {
Expand All @@ -69,6 +73,8 @@ fn token_scan(b: &mut Bencher) {
group = "hot_path",
severity = "warning",
threshold = 10.0,
warmup = "2s",
measurement = "3s",
tags = "throughput"
)]
fn batch_transform(b: &mut Bencher) {
Expand Down
3 changes: 2 additions & 1 deletion flux.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ save_baseline = false

[ci]
# Regression threshold percentage — fail CI if exceeded (default: 5.0)
regression_threshold = 5.0
# Shared CI runners (e.g. GitHub Actions) can show 10-40% run-to-run timing noise; use 25% or higher to avoid false positives.
regression_threshold = 25.0
# Emit GitHub Actions annotations (::warning, ::error) (default: false)
github_annotations = true
# Exit non-zero on critical verification failures (default: true)
Expand Down
1 change: 1 addition & 0 deletions fluxbench-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ path = "src/lib.rs"
[[bin]]
name = "fluxbench"
path = "src/main.rs"
doc = false

[dependencies]
fluxbench-core.workspace = true
Expand Down
6 changes: 6 additions & 0 deletions fluxbench-cli/src/executor/execution.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ pub struct BenchExecutionResult {
pub failure_kind: Option<String>,
pub backtrace: Option<String>,
pub severity: fluxbench_core::Severity,
/// Per-benchmark regression threshold (0.0 = use global)
pub threshold: f64,
}

/// Execute benchmarks and produce results (in-process mode)
Expand Down Expand Up @@ -217,6 +219,7 @@ impl Executor {
failure_kind: None,
backtrace: None,
severity: bench.severity,
threshold: bench.threshold,
}
}
Err(panic) => {
Expand Down Expand Up @@ -244,6 +247,7 @@ impl Executor {
failure_kind: Some("panic".to_string()),
backtrace: None,
severity: bench.severity,
threshold: bench.threshold,
}
}
}
Expand Down Expand Up @@ -357,6 +361,7 @@ impl IsolatedExecutor {
failure_kind: Some("crashed".to_string()),
backtrace: None,
severity: bench.severity,
threshold: bench.threshold,
});
pb.inc(1);
}
Expand Down Expand Up @@ -431,6 +436,7 @@ impl IsolatedExecutor {
failure_kind,
backtrace,
severity: bench.severity,
threshold: bench.threshold,
}
}
}
29 changes: 18 additions & 11 deletions fluxbench-cli/src/executor/formatting.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
//! - Comparison tables with speedup calculations
//! - Verification results summary

use fluxbench_report::{BenchmarkReportResult, BenchmarkStatus, Report};
use fluxbench_report::{BenchmarkReportResult, BenchmarkStatus, Report, format_duration};

/// Format a report for human-readable terminal display
///
Expand Down Expand Up @@ -51,20 +51,27 @@ pub fn format_human_output(report: &Report) -> String {

if let Some(metrics) = &result.metrics {
output.push_str(&format!(
" mean: {:.2} ns median: {:.2} ns stddev: {:.2} ns\n",
metrics.mean_ns, metrics.median_ns, metrics.std_dev_ns
" mean: {} median: {} stddev: {}\n",
format_duration(metrics.mean_ns),
format_duration(metrics.median_ns),
format_duration(metrics.std_dev_ns),
));
output.push_str(&format!(
" min: {:.2} ns max: {:.2} ns samples: {}\n",
metrics.min_ns, metrics.max_ns, metrics.samples
" min: {} max: {} samples: {}\n",
format_duration(metrics.min_ns),
format_duration(metrics.max_ns),
metrics.samples,
));
output.push_str(&format!(
" p50: {:.2} ns p95: {:.2} ns p99: {:.2} ns\n",
metrics.p50_ns, metrics.p95_ns, metrics.p99_ns
" p50: {} p95: {} p99: {}\n",
format_duration(metrics.p50_ns),
format_duration(metrics.p95_ns),
format_duration(metrics.p99_ns),
));
output.push_str(&format!(
" 95% CI: [{:.2}, {:.2}] ns\n",
metrics.ci_lower_ns, metrics.ci_upper_ns
" 95% CI: [{}, {}]\n",
format_duration(metrics.ci_lower_ns),
format_duration(metrics.ci_upper_ns),
));
if let Some(throughput) = metrics.throughput_ops_sec {
output.push_str(&format!(" throughput: {:.2} ops/sec\n", throughput));
Expand Down Expand Up @@ -133,9 +140,9 @@ pub fn format_human_output(report: &Report) -> String {
};

output.push_str(&format!(
" {:<width$} {:>12.2} {:>10}{}\n",
" {:<width$} {:>12} {:>10}{}\n",
entry.benchmark_id,
entry.value,
format_duration(entry.value),
speedup_str,
baseline_marker,
width = max_name_len
Expand Down
1 change: 1 addition & 0 deletions fluxbench-cli/src/executor/report.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ pub fn build_report(
file: result.file.clone(),
line: result.line,
metrics,
threshold: result.threshold,
comparison: None, // Filled when comparing to baseline
failure,
});
Expand Down
Loading