From 5936f984fdf11e3004bc84643609b12d70742ea6 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Fri, 13 Feb 2026 14:57:24 +0800 Subject: [PATCH 1/8] fix(ci): clear stale baseline cache on workflow re-runs Add cache deletion step to prevent stale baseline artifacts when re-running the same commit. GitHub Actions caches are immutable per key, so re-runs on the same SHA would fail to update the baseline without this fix. Also add actions:write permission required for cache deletion API. --- .github/workflows/baseline.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/baseline.yml b/.github/workflows/baseline.yml index 3aad88f..183c1c0 100644 --- a/.github/workflows/baseline.yml +++ b/.github/workflows/baseline.yml @@ -20,6 +20,7 @@ concurrency: permissions: contents: read + actions: write env: CARGO_TERM_COLOR: always @@ -50,6 +51,12 @@ jobs: # Cache keyed by SHA so each merge gets its own entry. # benchmark.yml uses restore-keys prefix matching to find the latest one. + # Delete stale cache first so re-runs on the same commit work. + - name: Clear stale baseline cache + run: gh cache delete "fluxbench-baseline-${{ github.sha }}" 2>/dev/null || true + env: + GH_TOKEN: ${{ github.token }} + - name: Cache baseline uses: actions/cache/save@v4 with: From 220430823915a6c6d0dea3f32bc15714bc8a1b6e Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Fri, 13 Feb 2026 14:58:30 +0800 Subject: [PATCH 2/8] feat(report): export duration formatting utility for reuse Make format_duration public and export from the report crate root, enabling consistent human-readable duration formatting (ns/us/ms/s) across CLI and report generation. 
--- fluxbench-report/src/github.rs | 3 ++- fluxbench-report/src/lib.rs | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fluxbench-report/src/github.rs b/fluxbench-report/src/github.rs index c5385cc..aaa66d5 100644 --- a/fluxbench-report/src/github.rs +++ b/fluxbench-report/src/github.rs @@ -213,7 +213,8 @@ fn write_verifications(output: &mut String, verifications: &[VerificationResult] output.push('\n'); } -fn format_duration(ns: f64) -> String { +/// Format a duration in nanoseconds to a human-readable string (ns/us/ms/s). +pub fn format_duration(ns: f64) -> String { if ns < 1_000.0 { format!("{:.0} ns", ns) } else if ns < 1_000_000.0 { diff --git a/fluxbench-report/src/lib.rs b/fluxbench-report/src/lib.rs index ff1ddd3..4e9d119 100644 --- a/fluxbench-report/src/lib.rs +++ b/fluxbench-report/src/lib.rs @@ -14,7 +14,7 @@ mod json; mod report; pub use csv::generate_csv_report; -pub use github::generate_github_summary; +pub use github::{format_duration, generate_github_summary}; pub use html::generate_html_report; pub use json::{ReportSchema, generate_json_report}; pub use report::{ From 768f77933f01b115e26e2551b95f902b10ac8f86 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Fri, 13 Feb 2026 14:58:40 +0800 Subject: [PATCH 3/8] feat(cli): use human-readable duration formatting in benchmark reports Replace raw nanosecond values with adaptive formatting in human-readable output. Metrics now display as "2.5 ms" instead of "2500000.00 ns", eliminating mental unit conversion. --- fluxbench-cli/src/executor/formatting.rs | 29 +++++++++++++++--------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/fluxbench-cli/src/executor/formatting.rs b/fluxbench-cli/src/executor/formatting.rs index 3768186..c87064f 100644 --- a/fluxbench-cli/src/executor/formatting.rs +++ b/fluxbench-cli/src/executor/formatting.rs @@ -10,7 +10,7 @@ //! - Comparison tables with speedup calculations //! 
- Verification results summary -use fluxbench_report::{BenchmarkReportResult, BenchmarkStatus, Report}; +use fluxbench_report::{BenchmarkReportResult, BenchmarkStatus, Report, format_duration}; /// Format a report for human-readable terminal display /// @@ -51,20 +51,27 @@ pub fn format_human_output(report: &Report) -> String { if let Some(metrics) = &result.metrics { output.push_str(&format!( - " mean: {:.2} ns median: {:.2} ns stddev: {:.2} ns\n", - metrics.mean_ns, metrics.median_ns, metrics.std_dev_ns + " mean: {} median: {} stddev: {}\n", + format_duration(metrics.mean_ns), + format_duration(metrics.median_ns), + format_duration(metrics.std_dev_ns), )); output.push_str(&format!( - " min: {:.2} ns max: {:.2} ns samples: {}\n", - metrics.min_ns, metrics.max_ns, metrics.samples + " min: {} max: {} samples: {}\n", + format_duration(metrics.min_ns), + format_duration(metrics.max_ns), + metrics.samples, )); output.push_str(&format!( - " p50: {:.2} ns p95: {:.2} ns p99: {:.2} ns\n", - metrics.p50_ns, metrics.p95_ns, metrics.p99_ns + " p50: {} p95: {} p99: {}\n", + format_duration(metrics.p50_ns), + format_duration(metrics.p95_ns), + format_duration(metrics.p99_ns), )); output.push_str(&format!( - " 95% CI: [{:.2}, {:.2}] ns\n", - metrics.ci_lower_ns, metrics.ci_upper_ns + " 95% CI: [{}, {}]\n", + format_duration(metrics.ci_lower_ns), + format_duration(metrics.ci_upper_ns), )); if let Some(throughput) = metrics.throughput_ops_sec { output.push_str(&format!(" throughput: {:.2} ops/sec\n", throughput)); @@ -133,9 +140,9 @@ pub fn format_human_output(report: &Report) -> String { }; output.push_str(&format!( - " {:12.2} {:>10}{}\n", + " {:12} {:>10}{}\n", entry.benchmark_id, - entry.value, + format_duration(entry.value), speedup_str, baseline_marker, width = max_name_len From 29aadedfbf494d824367d1e90decc6fb2a22db8b Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Fri, 13 Feb 2026 15:01:15 +0800 Subject: [PATCH 4/8] feat(cli): enhance baseline handling and add GitHub 
Actions integration Improve baseline path resolution: - Make --baseline flag accept optional path argument - Fall back to config baseline_path or default target/fluxbench/baseline.json - Enable simpler CLI usage: just --baseline instead of --baseline /path/to/file Add GitHub Actions annotations: - Emit ::error:: for crashed/failed benchmarks with file/line location - Annotate significant regressions with baseline comparison - Mark verification failures as errors or warnings based on severity - Annotations appear inline on PR diffs in GitHub CI Use human-readable duration formatting in comparison output for consistency with other reports. --- fluxbench-cli/src/lib.rs | 130 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 120 insertions(+), 10 deletions(-) diff --git a/fluxbench-cli/src/lib.rs b/fluxbench-cli/src/lib.rs index 7895c05..df4e498 100644 --- a/fluxbench-cli/src/lib.rs +++ b/fluxbench-cli/src/lib.rs @@ -36,8 +36,8 @@ use clap::{Parser, Subcommand}; use fluxbench_core::{BenchmarkDef, WorkerMain}; use fluxbench_logic::aggregate_verifications; use fluxbench_report::{ - OutputFormat, generate_csv_report, generate_github_summary, generate_html_report, - generate_json_report, + OutputFormat, format_duration, generate_csv_report, generate_github_summary, + generate_html_report, generate_json_report, }; use rayon::ThreadPoolBuilder; use regex::Regex; @@ -67,8 +67,9 @@ pub struct Cli { pub output: Option, /// Load baseline for comparison + /// Optionally specify a path; defaults to config or target/fluxbench/baseline.json #[arg(long)] - pub baseline: Option, + pub baseline: Option>, /// Dry run - list benchmarks without executing #[arg(long)] @@ -493,6 +494,11 @@ fn run_benchmarks( report.summary.critical_failures = verification_summary.critical_failures; report.summary.warnings = verification_summary.failed - verification_summary.critical_failures; + // Emit GitHub Actions annotations if enabled + if config.ci.github_annotations { + 
emit_github_annotations(&report); + } + // Generate output let output = match format { OutputFormat::Json => generate_json_report(&report)?, @@ -542,8 +548,8 @@ fn compare_benchmarks( git_ref: &str, format: OutputFormat, ) -> anyhow::Result<()> { - // Load baseline - let baseline_path = cli.baseline.as_ref().ok_or_else(|| { + // Load baseline — resolve path from CLI, config, or default + let baseline_path = resolve_baseline_path(&cli.baseline, config).ok_or_else(|| { anyhow::anyhow!( "--baseline required for comparison, or use 'compare' command with a git ref" ) @@ -556,7 +562,7 @@ fn compare_benchmarks( )); } - let baseline_json = std::fs::read_to_string(baseline_path)?; + let baseline_json = std::fs::read_to_string(&baseline_path)?; let baseline: fluxbench_report::Report = serde_json::from_str(&baseline_json)?; let resolved_git_ref = resolve_git_ref(git_ref)?; @@ -675,6 +681,11 @@ fn compare_benchmarks( report.summary.critical_failures = verification_summary.critical_failures; report.summary.warnings = verification_summary.failed - verification_summary.critical_failures; + // Emit GitHub Actions annotations if enabled + if config.ci.github_annotations { + emit_github_annotations(&report); + } + // Generate output let output = match format { OutputFormat::Json => generate_json_report(&report)?, @@ -747,6 +758,102 @@ fn save_baseline_if_needed( Ok(()) } +/// Resolve baseline path from CLI flag, config, or default. 
+/// +/// - `Some(Some(path))` — explicit path from `--baseline /path/to/file` +/// - `Some(None)` — `--baseline` with no value, use config or default +/// - `None` — flag not passed at all +fn resolve_baseline_path( + cli_baseline: &Option>, + config: &FluxConfig, +) -> Option { + match cli_baseline { + Some(Some(path)) => Some(path.clone()), + Some(None) => { + // --baseline passed without path: use config or default + Some( + config + .output + .baseline_path + .as_ref() + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from("target/fluxbench/baseline.json")), + ) + } + None => None, + } +} + +/// Emit `::error::` and `::warning::` annotations for GitHub Actions. +/// +/// These appear inline on PR diffs when running in GitHub Actions CI. +fn emit_github_annotations(report: &fluxbench_report::Report) { + // Annotate crashed/failed benchmarks + for result in &report.results { + match result.status { + fluxbench_report::BenchmarkStatus::Crashed => { + let msg = result + .failure + .as_ref() + .map(|f| f.message.as_str()) + .unwrap_or("benchmark crashed"); + println!( + "::error file={},line={}::{}: {}", + result.file, result.line, result.id, msg + ); + } + fluxbench_report::BenchmarkStatus::Failed => { + let msg = result + .failure + .as_ref() + .map(|f| f.message.as_str()) + .unwrap_or("benchmark failed"); + println!( + "::error file={},line={}::{}: {}", + result.file, result.line, result.id, msg + ); + } + _ => {} + } + + // Annotate significant regressions + if let Some(cmp) = &result.comparison { + if cmp.is_significant && cmp.relative_change > 0.0 { + println!( + "::error file={},line={}::{}: regression {:+.1}% ({} → {})", + result.file, + result.line, + result.id, + cmp.relative_change, + format_duration(cmp.baseline_mean_ns), + result + .metrics + .as_ref() + .map(|m| format_duration(m.mean_ns)) + .unwrap_or_default(), + ); + } + } + } + + // Annotate verification failures + for v in &report.verifications { + match &v.status { + 
fluxbench_logic::VerificationStatus::Failed => { + let level = match v.severity { + fluxbench_core::Severity::Critical => "error", + _ => "warning", + }; + println!("::{}::{}: {}", level, v.id, v.message); + } + fluxbench_logic::VerificationStatus::Error { message } => { + println!("::error::{}: evaluation error: {}", v.id, message); + } + _ => {} + } + } +} + fn resolve_git_ref(git_ref: &str) -> anyhow::Result { let output = std::process::Command::new("git") .args(["rev-parse", "--verify", git_ref]) @@ -816,12 +923,15 @@ fn format_comparison_output( }; output.push_str(&format!( - " baseline: {:.2} ns → current: {:.2} ns\n", - comparison.baseline_mean_ns, metrics.mean_ns + " baseline: {} → current: {}\n", + format_duration(comparison.baseline_mean_ns), + format_duration(metrics.mean_ns), )); output.push_str(&format!( - " change: {:+.2}% ({:+.2} ns) {}\n", - comparison.relative_change, comparison.absolute_change_ns, change_icon + " change: {:+.2}% ({}) {}\n", + comparison.relative_change, + format_duration(comparison.absolute_change_ns.abs()), + change_icon, )); } From 2203a41354ca2489fcd51e3ec92f81496240ae5c Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Fri, 13 Feb 2026 15:01:22 +0800 Subject: [PATCH 5/8] chore(release): bump version to 0.1.2 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index eb7bfa3..4c9c4d9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ members = [ [workspace.package] authors = ["Farhan Syah"] -version = "0.1.1" +version = "0.1.2" edition = "2024" rust-version = "1.85" license = "Apache-2.0" From 173b9a5dcb4ac6d57fd746ea389a6f4ec8ac6b86 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Fri, 13 Feb 2026 15:14:26 +0800 Subject: [PATCH 6/8] chore(cli): disable documentation generation for binary target Binary executables don't need API documentation, only library crates do. 
--- fluxbench-cli/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/fluxbench-cli/Cargo.toml b/fluxbench-cli/Cargo.toml index 8a40a37..555edbe 100644 --- a/fluxbench-cli/Cargo.toml +++ b/fluxbench-cli/Cargo.toml @@ -17,6 +17,7 @@ path = "src/lib.rs" [[bin]] name = "fluxbench" path = "src/main.rs" +doc = false [dependencies] fluxbench-core.workspace = true From 5bd2624a1348f853f44a737cbd1c82ff02f30eed Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Fri, 13 Feb 2026 15:14:33 +0800 Subject: [PATCH 7/8] refactor(cli): extract baseline comparison into reusable function Extract baseline comparison logic into apply_baseline_comparison() to eliminate code duplication between run_benchmarks and compare_benchmarks. This enables baseline comparison support when using --baseline flag with the run command, not just the compare command. Also fixes documentation formatting in ComparisonSeries field comment. --- fluxbench-cli/src/lib.rs | 158 ++++++++++++++++++++------------- fluxbench-report/src/report.rs | 2 +- 2 files changed, 98 insertions(+), 62 deletions(-) diff --git a/fluxbench-cli/src/lib.rs b/fluxbench-cli/src/lib.rs index df4e498..59d925d 100644 --- a/fluxbench-cli/src/lib.rs +++ b/fluxbench-cli/src/lib.rs @@ -481,6 +481,33 @@ fn run_benchmarks( let total_duration_ms = start_time.elapsed().as_secs_f64() * 1000.0; let mut report = build_report(&results, &stats, &exec_config, total_duration_ms); + // Load and apply baseline comparison if --baseline was passed + if let Some(baseline_path) = resolve_baseline_path(&cli.baseline, config) { + if baseline_path.exists() { + match std::fs::read_to_string(&baseline_path).and_then(|json| { + serde_json::from_str::(&json) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e)) + }) { + Ok(baseline) => { + let threshold = cli.threshold.unwrap_or(config.ci.regression_threshold); + apply_baseline_comparison(&mut report, &baseline, threshold); + } + Err(e) => { + eprintln!( + "Warning: failed to load baseline 
{}: {}", + baseline_path.display(), + e + ); + } + } + } else { + eprintln!( + "Warning: baseline file not found: {}", + baseline_path.display() + ); + } + } + // Run comparisons, synthetics, and verifications let (comparison_results, comparison_series, synthetic_results, verification_results) = execute_verifications(&results, &stats); @@ -607,68 +634,9 @@ fn compare_benchmarks( let total_duration_ms = start_time.elapsed().as_secs_f64() * 1000.0; let mut report = build_report(&results, &stats, &exec_config, total_duration_ms); - // Store baseline metadata for summary header - report.baseline_meta = Some(baseline.meta.clone()); - - // Add comparison data + // Apply baseline comparison data let regression_threshold = cli.threshold.unwrap_or(config.ci.regression_threshold); - let baseline_map: std::collections::HashMap<_, _> = baseline - .results - .iter() - .filter_map(|r| r.metrics.as_ref().map(|m| (r.id.clone(), m.clone()))) - .collect(); - - for result in &mut report.results { - if let (Some(metrics), Some(baseline_metrics)) = - (&result.metrics, baseline_map.get(&result.id)) - { - let baseline_mean = baseline_metrics.mean_ns; - let absolute_change = metrics.mean_ns - baseline_mean; - let relative_change = if baseline_mean > 0.0 { - (absolute_change / baseline_mean) * 100.0 - } else { - 0.0 - }; - - // Determine significance via CI non-overlap and threshold crossing. 
- let ci_non_overlap = metrics.ci_upper_ns < baseline_metrics.ci_lower_ns - || metrics.ci_lower_ns > baseline_metrics.ci_upper_ns; - let is_significant = relative_change.abs() > regression_threshold && ci_non_overlap; - - // Track regressions/improvements - if relative_change > regression_threshold { - report.summary.regressions += 1; - } else if relative_change < -regression_threshold { - report.summary.improvements += 1; - } - - let mut effect_size = if metrics.std_dev_ns > f64::EPSILON { - absolute_change / metrics.std_dev_ns - } else { - 0.0 - }; - if !effect_size.is_finite() { - effect_size = 0.0; - } - - let probability_regression = if ci_non_overlap { - if relative_change > 0.0 { 0.99 } else { 0.01 } - } else if relative_change > 0.0 { - 0.60 - } else { - 0.40 - }; - - result.comparison = Some(fluxbench_report::Comparison { - baseline_mean_ns: baseline_mean, - absolute_change_ns: absolute_change, - relative_change, - probability_regression, - is_significant, - effect_size, - }); - } - } + apply_baseline_comparison(&mut report, &baseline, regression_threshold); // Run comparisons, synthetics, and verifications let (comparison_results, comparison_series, synthetic_results, verification_results) = @@ -758,6 +726,74 @@ fn save_baseline_if_needed( Ok(()) } +/// Apply baseline comparison data to the report. +/// +/// Computes per-benchmark regression/improvement metrics by comparing current +/// results against baseline means, CI overlap, and effect size. 
+fn apply_baseline_comparison( + report: &mut fluxbench_report::Report, + baseline: &fluxbench_report::Report, + regression_threshold: f64, +) { + report.baseline_meta = Some(baseline.meta.clone()); + + let baseline_map: std::collections::HashMap<_, _> = baseline + .results + .iter() + .filter_map(|r| r.metrics.as_ref().map(|m| (r.id.clone(), m.clone()))) + .collect(); + + for result in &mut report.results { + if let (Some(metrics), Some(baseline_metrics)) = + (&result.metrics, baseline_map.get(&result.id)) + { + let baseline_mean = baseline_metrics.mean_ns; + let absolute_change = metrics.mean_ns - baseline_mean; + let relative_change = if baseline_mean > 0.0 { + (absolute_change / baseline_mean) * 100.0 + } else { + 0.0 + }; + + let ci_non_overlap = metrics.ci_upper_ns < baseline_metrics.ci_lower_ns + || metrics.ci_lower_ns > baseline_metrics.ci_upper_ns; + let is_significant = relative_change.abs() > regression_threshold && ci_non_overlap; + + if relative_change > regression_threshold { + report.summary.regressions += 1; + } else if relative_change < -regression_threshold { + report.summary.improvements += 1; + } + + let mut effect_size = if metrics.std_dev_ns > f64::EPSILON { + absolute_change / metrics.std_dev_ns + } else { + 0.0 + }; + if !effect_size.is_finite() { + effect_size = 0.0; + } + + let probability_regression = if ci_non_overlap { + if relative_change > 0.0 { 0.99 } else { 0.01 } + } else if relative_change > 0.0 { + 0.60 + } else { + 0.40 + }; + + result.comparison = Some(fluxbench_report::Comparison { + baseline_mean_ns: baseline_mean, + absolute_change_ns: absolute_change, + relative_change, + probability_regression, + is_significant, + effect_size, + }); + } + } +} + /// Resolve baseline path from CLI flag, config, or default. 
/// /// - `Some(Some(path))` — explicit path from `--baseline /path/to/file` diff --git a/fluxbench-report/src/report.rs b/fluxbench-report/src/report.rs index da998d2..2023fce 100644 --- a/fluxbench-report/src/report.rs +++ b/fluxbench-report/src/report.rs @@ -66,7 +66,7 @@ pub struct ComparisonSeries { pub x_values: Vec, /// Competitor/series names (benchmark IDs) pub series_names: Vec, - /// Data points: series_data[series_idx][x_idx] = value + /// Data points: `series_data[series_idx][x_idx] = value` pub series_data: Vec>, /// Metric used pub metric: String, From 12e2567e94ab8b4d77c4b04a19f54c711e7e5b13 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Fri, 13 Feb 2026 15:35:56 +0800 Subject: [PATCH 8/8] feat(cli): add per-benchmark regression thresholds Enable benchmarks to override the global CI regression threshold with benchmark-specific values. Critical benchmarks can now enforce stricter thresholds while less sensitive benchmarks use the global setting. Changes: - Add threshold field to BenchmarkReportResult for per-benchmark overrides - Thread threshold value through execution and reporting pipeline - Implement threshold selection logic (per-benchmark > 0 overrides global) - Display custom thresholds next to the REGRESSION status in the GitHub summary report - Add comprehensive unit tests for threshold precedence scenarios - Raise the regression_threshold configured in flux.toml to 25% to accommodate shared runner variance (the documented default remains 5.0) The per-benchmark threshold is specified via the bench macro attribute and defaults to 0.0 (use global). When set, it takes precedence over the global regression_threshold in regression detection and reporting. 
--- examples/examples/ci_regression.rs | 6 + flux.toml | 3 +- fluxbench-cli/src/executor/execution.rs | 6 + fluxbench-cli/src/executor/report.rs | 1 + fluxbench-cli/src/lib.rs | 192 +++++++++++++++++++++++- fluxbench-core/src/lib.rs | 2 +- fluxbench-report/src/github.rs | 12 +- fluxbench-report/src/report.rs | 3 + 8 files changed, 216 insertions(+), 9 deletions(-) diff --git a/examples/examples/ci_regression.rs b/examples/examples/ci_regression.rs index f76920b..fa4e7f3 100644 --- a/examples/examples/ci_regression.rs +++ b/examples/examples/ci_regression.rs @@ -30,6 +30,8 @@ use std::hint::black_box; group = "hot_path", severity = "critical", threshold = 5.0, + warmup = "2s", + measurement = "3s", tags = "latency" )] fn request_handler(b: &mut Bencher) { @@ -50,6 +52,8 @@ fn request_handler(b: &mut Bencher) { group = "hot_path", severity = "critical", threshold = 3.0, + warmup = "2s", + measurement = "3s", tags = "throughput" )] fn token_scan(b: &mut Bencher) { @@ -69,6 +73,8 @@ fn token_scan(b: &mut Bencher) { group = "hot_path", severity = "warning", threshold = 10.0, + warmup = "2s", + measurement = "3s", tags = "throughput" )] fn batch_transform(b: &mut Bencher) { diff --git a/flux.toml b/flux.toml index 113f3b5..bfc93f6 100644 --- a/flux.toml +++ b/flux.toml @@ -59,7 +59,8 @@ save_baseline = false [ci] # Regression threshold percentage — fail CI if exceeded (default: 5.0) -regression_threshold = 5.0 +# Shared CI runners (GitHub Actions) have 10-40% noise; use 25%+ to avoid false positives. 
+regression_threshold = 25.0 # Emit GitHub Actions annotations (::warning, ::error) (default: false) github_annotations = true # Exit non-zero on critical verification failures (default: true) diff --git a/fluxbench-cli/src/executor/execution.rs b/fluxbench-cli/src/executor/execution.rs index c6091e2..f655a5e 100644 --- a/fluxbench-cli/src/executor/execution.rs +++ b/fluxbench-cli/src/executor/execution.rs @@ -119,6 +119,8 @@ pub struct BenchExecutionResult { pub failure_kind: Option, pub backtrace: Option, pub severity: fluxbench_core::Severity, + /// Per-benchmark regression threshold (0.0 = use global) + pub threshold: f64, } /// Execute benchmarks and produce results (in-process mode) @@ -217,6 +219,7 @@ impl Executor { failure_kind: None, backtrace: None, severity: bench.severity, + threshold: bench.threshold, } } Err(panic) => { @@ -244,6 +247,7 @@ impl Executor { failure_kind: Some("panic".to_string()), backtrace: None, severity: bench.severity, + threshold: bench.threshold, } } } @@ -357,6 +361,7 @@ impl IsolatedExecutor { failure_kind: Some("crashed".to_string()), backtrace: None, severity: bench.severity, + threshold: bench.threshold, }); pb.inc(1); } @@ -431,6 +436,7 @@ impl IsolatedExecutor { failure_kind, backtrace, severity: bench.severity, + threshold: bench.threshold, } } } diff --git a/fluxbench-cli/src/executor/report.rs b/fluxbench-cli/src/executor/report.rs index cf1aab2..31fbb32 100644 --- a/fluxbench-cli/src/executor/report.rs +++ b/fluxbench-cli/src/executor/report.rs @@ -153,6 +153,7 @@ pub fn build_report( file: result.file.clone(), line: result.line, metrics, + threshold: result.threshold, comparison: None, // Filled when comparing to baseline failure, }); diff --git a/fluxbench-cli/src/lib.rs b/fluxbench-cli/src/lib.rs index 59d925d..98f0e7f 100644 --- a/fluxbench-cli/src/lib.rs +++ b/fluxbench-cli/src/lib.rs @@ -747,6 +747,13 @@ fn apply_baseline_comparison( if let (Some(metrics), Some(baseline_metrics)) = (&result.metrics, 
baseline_map.get(&result.id)) { + // Use per-benchmark threshold if set (> 0.0), otherwise global + let effective_threshold = if result.threshold > 0.0 { + result.threshold + } else { + regression_threshold + }; + let baseline_mean = baseline_metrics.mean_ns; let absolute_change = metrics.mean_ns - baseline_mean; let relative_change = if baseline_mean > 0.0 { @@ -757,11 +764,11 @@ fn apply_baseline_comparison( let ci_non_overlap = metrics.ci_upper_ns < baseline_metrics.ci_lower_ns || metrics.ci_lower_ns > baseline_metrics.ci_upper_ns; - let is_significant = relative_change.abs() > regression_threshold && ci_non_overlap; + let is_significant = relative_change.abs() > effective_threshold && ci_non_overlap; - if relative_change > regression_threshold { + if relative_change > effective_threshold { report.summary.regressions += 1; - } else if relative_change < -regression_threshold { + } else if relative_change < -effective_threshold { report.summary.improvements += 1; } @@ -987,3 +994,182 @@ fn format_comparison_output( output } + +#[cfg(test)] +mod tests { + use super::*; + use fluxbench_report::{ + BenchmarkMetrics, BenchmarkReportResult, BenchmarkStatus, Report, ReportConfig, ReportMeta, + ReportSummary, SystemInfo, + }; + + fn dummy_meta() -> ReportMeta { + ReportMeta { + schema_version: 1, + version: "0.1.0".to_string(), + timestamp: chrono::Utc::now(), + git_commit: None, + git_branch: None, + system: SystemInfo { + os: "linux".to_string(), + os_version: "6.0".to_string(), + cpu: "test".to_string(), + cpu_cores: 1, + memory_gb: 1.0, + }, + config: ReportConfig { + warmup_time_ns: 0, + measurement_time_ns: 0, + min_iterations: None, + max_iterations: None, + bootstrap_iterations: 0, + confidence_level: 0.95, + track_allocations: false, + }, + } + } + + fn dummy_metrics(mean: f64) -> BenchmarkMetrics { + BenchmarkMetrics { + samples: 100, + mean_ns: mean, + median_ns: mean, + std_dev_ns: mean * 0.01, + min_ns: mean * 0.9, + max_ns: mean * 1.1, + p50_ns: mean, + 
p90_ns: mean * 1.05, + p95_ns: mean * 1.07, + p99_ns: mean * 1.09, + p999_ns: mean * 1.1, + skewness: 0.0, + kurtosis: 3.0, + ci_lower_ns: mean * 0.98, + ci_upper_ns: mean * 1.02, + ci_level: 0.95, + throughput_ops_sec: None, + alloc_bytes: 0, + alloc_count: 0, + mean_cycles: 0.0, + median_cycles: 0.0, + min_cycles: 0, + max_cycles: 0, + cycles_per_ns: 0.0, + } + } + + fn dummy_result(id: &str, mean: f64, threshold: f64) -> BenchmarkReportResult { + BenchmarkReportResult { + id: id.to_string(), + name: id.to_string(), + group: "test".to_string(), + status: BenchmarkStatus::Passed, + severity: fluxbench_core::Severity::Warning, + file: "test.rs".to_string(), + line: 1, + metrics: Some(dummy_metrics(mean)), + threshold, + comparison: None, + failure: None, + } + } + + fn dummy_report(results: Vec) -> Report { + let total = results.len(); + Report { + meta: dummy_meta(), + results, + comparisons: vec![], + comparison_series: vec![], + synthetics: vec![], + verifications: vec![], + summary: ReportSummary { + total_benchmarks: total, + passed: total, + ..Default::default() + }, + baseline_meta: None, + } + } + + #[test] + fn per_bench_threshold_overrides_global() { + // Baseline: 100ns. Current: 108ns → 8% regression. + // Global threshold: 25%. Per-bench threshold: 5%. + // Should detect regression via per-bench threshold but not global. + let mut report = dummy_report(vec![dummy_result("fast_bench", 108.0, 5.0)]); + let baseline = dummy_report(vec![dummy_result("fast_bench", 100.0, 5.0)]); + + apply_baseline_comparison(&mut report, &baseline, 25.0); + + assert_eq!( + report.summary.regressions, 1, + "per-bench 5% should catch 8% regression" + ); + let cmp = report.results[0].comparison.as_ref().unwrap(); + assert!(cmp.is_significant); + } + + #[test] + fn zero_threshold_falls_back_to_global() { + // Baseline: 100ns. Current: 108ns → 8% regression. + // Global threshold: 25%. Per-bench threshold: 0.0 (use global). + // 8% < 25%, so no regression. 
+ let mut report = dummy_report(vec![dummy_result("normal_bench", 108.0, 0.0)]); + let baseline = dummy_report(vec![dummy_result("normal_bench", 100.0, 0.0)]); + + apply_baseline_comparison(&mut report, &baseline, 25.0); + + assert_eq!( + report.summary.regressions, 0, + "8% under 25% global should not regress" + ); + let cmp = report.results[0].comparison.as_ref().unwrap(); + assert!(!cmp.is_significant); + } + + #[test] + fn mixed_thresholds_independent() { + // Two benchmarks: one with tight per-bench threshold, one using global. + // Both regress by 8%. + let mut report = dummy_report(vec![ + dummy_result("tight", 108.0, 5.0), // per-bench 5% → should regress + dummy_result("loose", 108.0, 0.0), // global 25% → should not + ]); + let baseline = dummy_report(vec![ + dummy_result("tight", 100.0, 5.0), + dummy_result("loose", 100.0, 0.0), + ]); + + apply_baseline_comparison(&mut report, &baseline, 25.0); + + assert_eq!(report.summary.regressions, 1); + assert!( + report.results[0] + .comparison + .as_ref() + .unwrap() + .is_significant + ); + assert!( + !report.results[1] + .comparison + .as_ref() + .unwrap() + .is_significant + ); + } + + #[test] + fn per_bench_threshold_detects_improvement() { + // Baseline: 100ns. Current: 90ns → -10% improvement. + // Per-bench threshold: 5%. 
+ let mut report = dummy_report(vec![dummy_result("improving", 90.0, 5.0)]); + let baseline = dummy_report(vec![dummy_result("improving", 100.0, 5.0)]); + + apply_baseline_comparison(&mut report, &baseline, 25.0); + + assert_eq!(report.summary.improvements, 1); + assert_eq!(report.summary.regressions, 0); + } +} diff --git a/fluxbench-core/src/lib.rs b/fluxbench-core/src/lib.rs index 0bc84ed..bf328c5 100644 --- a/fluxbench-core/src/lib.rs +++ b/fluxbench-core/src/lib.rs @@ -32,7 +32,7 @@ pub struct BenchmarkDef { pub group: &'static str, /// Severity level for CI reporting pub severity: Severity, - /// Regression threshold percentage + /// Per-benchmark regression threshold percentage (0.0 = use global threshold) pub threshold: f64, /// Absolute time budget in nanoseconds pub budget_ns: Option, diff --git a/fluxbench-report/src/github.rs b/fluxbench-report/src/github.rs index aaa66d5..ebe119e 100644 --- a/fluxbench-report/src/github.rs +++ b/fluxbench-report/src/github.rs @@ -101,13 +101,17 @@ fn write_baseline_comparison(output: &mut String, report: &Report) { let baseline = format_duration(cmp.baseline_mean_ns); let change = format_change(cmp.relative_change); let status = if cmp.relative_change.abs() < 1.0 { - "stable" + "stable".to_string() } else if cmp.is_significant && cmp.relative_change > 0.0 { - "REGRESSION" + if result.threshold > 0.0 { + format!("REGRESSION (>{:.0}%)", result.threshold) + } else { + "REGRESSION".to_string() + } } else if cmp.is_significant && cmp.relative_change < 0.0 { - "improved" + "improved".to_string() } else { - "within noise" + "within noise".to_string() }; output.push_str(&format!( diff --git a/fluxbench-report/src/report.rs b/fluxbench-report/src/report.rs index 2023fce..1262346 100644 --- a/fluxbench-report/src/report.rs +++ b/fluxbench-report/src/report.rs @@ -144,6 +144,9 @@ pub struct BenchmarkReportResult { pub line: u32, /// Timing and statistical metrics (if successful) pub metrics: Option, + /// Per-benchmark regression 
threshold percentage (0.0 = use global) + #[serde(default)] + pub threshold: f64, /// Comparison results against baseline (if applicable) pub comparison: Option, /// Failure details (if failed)