From 5936f984fdf11e3004bc84643609b12d70742ea6 Mon Sep 17 00:00:00 2001
From: Farhan Syah <bizimpulse@gmail.com>
Date: Fri, 13 Feb 2026 14:57:24 +0800
Subject: [PATCH 1/8] fix(ci): clear stale baseline cache on workflow re-runs

Add cache deletion step to prevent stale baseline artifacts when re-running
the same commit. GitHub Actions caches are immutable per key, so re-runs on
the same SHA would fail to update the baseline without this fix.

Also add actions:write permission required for cache deletion API.
---
 .github/workflows/baseline.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/baseline.yml b/.github/workflows/baseline.yml
index 3aad88f..183c1c0 100644
--- a/.github/workflows/baseline.yml
+++ b/.github/workflows/baseline.yml
@@ -20,6 +20,7 @@ concurrency:
 
 permissions:
   contents: read
+  actions: write
 
 env:
   CARGO_TERM_COLOR: always
@@ -50,6 +51,12 @@ jobs:
 
       # Cache keyed by SHA so each merge gets its own entry.
       # benchmark.yml uses restore-keys prefix matching to find the latest one.
+      # Delete stale cache first so re-runs on the same commit work.
+      - name: Clear stale baseline cache
+        run: gh cache delete "fluxbench-baseline-${{ github.sha }}" 2>/dev/null || true
+        env:
+          GH_TOKEN: ${{ github.token }}
+
       - name: Cache baseline
         uses: actions/cache/save@v4
         with:

From 220430823915a6c6d0dea3f32bc15714bc8a1b6e Mon Sep 17 00:00:00 2001
From: Farhan Syah <bizimpulse@gmail.com>
Date: Fri, 13 Feb 2026 14:58:30 +0800
Subject: [PATCH 2/8] feat(report): export duration formatting utility for
 reuse

Make format_duration public and export from the report crate root, enabling
consistent human-readable duration formatting (ns/us/ms/s) across CLI and
report generation.
---
 fluxbench-report/src/github.rs | 3 ++-
 fluxbench-report/src/lib.rs    | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/fluxbench-report/src/github.rs b/fluxbench-report/src/github.rs
index c5385cc..aaa66d5 100644
--- a/fluxbench-report/src/github.rs
+++ b/fluxbench-report/src/github.rs
@@ -213,7 +213,8 @@ fn write_verifications(output: &mut String, verifications: &[VerificationResult]
     output.push('\n');
 }
 
-fn format_duration(ns: f64) -> String {
+/// Format a duration in nanoseconds to a human-readable string (ns/us/ms/s).
+pub fn format_duration(ns: f64) -> String {
     if ns < 1_000.0 {
         format!("{:.0} ns", ns)
     } else if ns < 1_000_000.0 {
diff --git a/fluxbench-report/src/lib.rs b/fluxbench-report/src/lib.rs
index ff1ddd3..4e9d119 100644
--- a/fluxbench-report/src/lib.rs
+++ b/fluxbench-report/src/lib.rs
@@ -14,7 +14,7 @@ mod json;
 mod report;
 
 pub use csv::generate_csv_report;
-pub use github::generate_github_summary;
+pub use github::{format_duration, generate_github_summary};
 pub use html::generate_html_report;
 pub use json::{ReportSchema, generate_json_report};
 pub use report::{

From 768f77933f01b115e26e2551b95f902b10ac8f86 Mon Sep 17 00:00:00 2001
From: Farhan Syah <bizimpulse@gmail.com>
Date: Fri, 13 Feb 2026 14:58:40 +0800
Subject: [PATCH 3/8] feat(cli): use human-readable duration formatting in
 benchmark reports

Replace raw nanosecond values with adaptive formatting in human-readable
output. Metrics now display as "2.5 ms" instead of "2500000.00 ns",
eliminating mental unit conversion.
---
 fluxbench-cli/src/executor/formatting.rs | 29 +++++++++++++++---------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/fluxbench-cli/src/executor/formatting.rs b/fluxbench-cli/src/executor/formatting.rs
index 3768186..c87064f 100644
--- a/fluxbench-cli/src/executor/formatting.rs
+++ b/fluxbench-cli/src/executor/formatting.rs
@@ -10,7 +10,7 @@
 //! - Comparison tables with speedup calculations
 //! - Verification results summary
 
-use fluxbench_report::{BenchmarkReportResult, BenchmarkStatus, Report};
+use fluxbench_report::{BenchmarkReportResult, BenchmarkStatus, Report, format_duration};
 
 /// Format a report for human-readable terminal display
 ///
@@ -51,20 +51,27 @@ pub fn format_human_output(report: &Report) -> String {
 
             if let Some(metrics) = &result.metrics {
                 output.push_str(&format!(
-                    "      mean: {:.2} ns  median: {:.2} ns  stddev: {:.2} ns\n",
-                    metrics.mean_ns, metrics.median_ns, metrics.std_dev_ns
+                    "      mean: {}  median: {}  stddev: {}\n",
+                    format_duration(metrics.mean_ns),
+                    format_duration(metrics.median_ns),
+                    format_duration(metrics.std_dev_ns),
                 ));
                 output.push_str(&format!(
-                    "      min: {:.2} ns  max: {:.2} ns  samples: {}\n",
-                    metrics.min_ns, metrics.max_ns, metrics.samples
+                    "      min: {}  max: {}  samples: {}\n",
+                    format_duration(metrics.min_ns),
+                    format_duration(metrics.max_ns),
+                    metrics.samples,
                 ));
                 output.push_str(&format!(
-                    "      p50: {:.2} ns  p95: {:.2} ns  p99: {:.2} ns\n",
-                    metrics.p50_ns, metrics.p95_ns, metrics.p99_ns
+                    "      p50: {}  p95: {}  p99: {}\n",
+                    format_duration(metrics.p50_ns),
+                    format_duration(metrics.p95_ns),
+                    format_duration(metrics.p99_ns),
                 ));
                 output.push_str(&format!(
-                    "      95% CI: [{:.2}, {:.2}] ns\n",
-                    metrics.ci_lower_ns, metrics.ci_upper_ns
+                    "      95% CI: [{}, {}]\n",
+                    format_duration(metrics.ci_lower_ns),
+                    format_duration(metrics.ci_upper_ns),
                 ));
                 if let Some(throughput) = metrics.throughput_ops_sec {
                     output.push_str(&format!("      throughput: {:.2} ops/sec\n", throughput));
@@ -133,9 +140,9 @@ pub fn format_human_output(report: &Report) -> String {
             };
 
             output.push_str(&format!(
-                "  {:<width$}  {:>12.2}  {:>10}{}\n",
+                "  {:<width$}  {:>12}  {:>10}{}\n",
                 entry.benchmark_id,
-                entry.value,
+                format_duration(entry.value),
                 speedup_str,
                 baseline_marker,
                 width = max_name_len

From 29aadedfbf494d824367d1e90decc6fb2a22db8b Mon Sep 17 00:00:00 2001
From: Farhan Syah <bizimpulse@gmail.com>
Date: Fri, 13 Feb 2026 15:01:15 +0800
Subject: [PATCH 4/8] feat(cli): enhance baseline handling and add GitHub
 Actions integration

Improve baseline path resolution:
- Make --baseline flag accept optional path argument
- Fall back to config baseline_path or default target/fluxbench/baseline.json
- Enable simpler CLI usage: just --baseline instead of --baseline /path/to/file

Add GitHub Actions annotations:
- Emit ::error:: for crashed/failed benchmarks with file/line location
- Annotate significant regressions with baseline comparison
- Mark verification failures as errors or warnings based on severity
- Annotations appear inline on PR diffs in GitHub CI

Use human-readable duration formatting in comparison output for consistency
with other reports.
---
 fluxbench-cli/src/lib.rs | 130 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 120 insertions(+), 10 deletions(-)

diff --git a/fluxbench-cli/src/lib.rs b/fluxbench-cli/src/lib.rs
index 7895c05..df4e498 100644
--- a/fluxbench-cli/src/lib.rs
+++ b/fluxbench-cli/src/lib.rs
@@ -36,8 +36,8 @@ use clap::{Parser, Subcommand};
 use fluxbench_core::{BenchmarkDef, WorkerMain};
 use fluxbench_logic::aggregate_verifications;
 use fluxbench_report::{
-    OutputFormat, generate_csv_report, generate_github_summary, generate_html_report,
-    generate_json_report,
+    OutputFormat, format_duration, generate_csv_report, generate_github_summary,
+    generate_html_report, generate_json_report,
 };
 use rayon::ThreadPoolBuilder;
 use regex::Regex;
@@ -67,8 +67,9 @@ pub struct Cli {
     pub output: Option<PathBuf>,
 
     /// Load baseline for comparison
+    /// (optionally from an explicit path; defaults to config or target/fluxbench/baseline.json)
     #[arg(long)]
-    pub baseline: Option<PathBuf>,
+    pub baseline: Option<Option<PathBuf>>,
 
     /// Dry run - list benchmarks without executing
     #[arg(long)]
@@ -493,6 +494,11 @@ fn run_benchmarks(
     report.summary.critical_failures = verification_summary.critical_failures;
     report.summary.warnings = verification_summary.failed - verification_summary.critical_failures;
 
+    // Emit GitHub Actions annotations if enabled
+    if config.ci.github_annotations {
+        emit_github_annotations(&report);
+    }
+
     // Generate output
     let output = match format {
         OutputFormat::Json => generate_json_report(&report)?,
@@ -542,8 +548,8 @@ fn compare_benchmarks(
     git_ref: &str,
     format: OutputFormat,
 ) -> anyhow::Result<()> {
-    // Load baseline
-    let baseline_path = cli.baseline.as_ref().ok_or_else(|| {
+    // Load baseline — resolve path from CLI, config, or default
+    let baseline_path = resolve_baseline_path(&cli.baseline, config).ok_or_else(|| {
         anyhow::anyhow!(
             "--baseline required for comparison, or use 'compare' command with a git ref"
         )
@@ -556,7 +562,7 @@ fn compare_benchmarks(
         ));
     }
 
-    let baseline_json = std::fs::read_to_string(baseline_path)?;
+    let baseline_json = std::fs::read_to_string(&baseline_path)?;
     let baseline: fluxbench_report::Report = serde_json::from_str(&baseline_json)?;
     let resolved_git_ref = resolve_git_ref(git_ref)?;
 
@@ -675,6 +681,11 @@ fn compare_benchmarks(
     report.summary.critical_failures = verification_summary.critical_failures;
     report.summary.warnings = verification_summary.failed - verification_summary.critical_failures;
 
+    // Emit GitHub Actions annotations if enabled
+    if config.ci.github_annotations {
+        emit_github_annotations(&report);
+    }
+
     // Generate output
     let output = match format {
         OutputFormat::Json => generate_json_report(&report)?,
@@ -747,6 +758,102 @@ fn save_baseline_if_needed(
     Ok(())
 }
 
+/// Resolve baseline path from CLI flag, config, or default.
+///
+/// - `Some(Some(path))` — explicit path from `--baseline /path/to/file`
+/// - `Some(None)` — `--baseline` with no value, use config or default
+/// - `None` — flag not passed at all
+fn resolve_baseline_path(
+    cli_baseline: &Option<Option<PathBuf>>,
+    config: &FluxConfig,
+) -> Option<PathBuf> {
+    match cli_baseline {
+        Some(Some(path)) => Some(path.clone()),
+        Some(None) => {
+            // --baseline passed without path: use config or default
+            Some(
+                config
+                    .output
+                    .baseline_path
+                    .as_ref()
+                    .map(PathBuf::from)
+                    .unwrap_or_else(|| PathBuf::from("target/fluxbench/baseline.json")),
+            )
+        }
+        None => None,
+    }
+}
+
+/// Emit `::error::` and `::warning::` annotations for GitHub Actions.
+///
+/// Only benchmark annotations carry file/line (shown inline on PR diffs); verification ones do not.
+fn emit_github_annotations(report: &fluxbench_report::Report) {
+    // Annotate crashed/failed benchmarks
+    for result in &report.results {
+        match result.status {
+            fluxbench_report::BenchmarkStatus::Crashed => {
+                let msg = result
+                    .failure
+                    .as_ref()
+                    .map(|f| f.message.as_str())
+                    .unwrap_or("benchmark crashed");
+                println!(
+                    "::error file={},line={}::{}: {}",
+                    result.file, result.line, result.id, msg
+                );
+            }
+            fluxbench_report::BenchmarkStatus::Failed => {
+                let msg = result
+                    .failure
+                    .as_ref()
+                    .map(|f| f.message.as_str())
+                    .unwrap_or("benchmark failed");
+                println!(
+                    "::error file={},line={}::{}: {}",
+                    result.file, result.line, result.id, msg
+                );
+            }
+            _ => {}
+        }
+
+        // Annotate significant regressions
+        if let Some(cmp) = &result.comparison {
+            if cmp.is_significant && cmp.relative_change > 0.0 {
+                println!(
+                    "::error file={},line={}::{}: regression {:+.1}% ({} → {})",
+                    result.file,
+                    result.line,
+                    result.id,
+                    cmp.relative_change,
+                    format_duration(cmp.baseline_mean_ns),
+                    result
+                        .metrics
+                        .as_ref()
+                        .map(|m| format_duration(m.mean_ns))
+                        .unwrap_or_default(),
+                );
+            }
+        }
+    }
+
+    // Annotate verification failures
+    for v in &report.verifications {
+        match &v.status {
+            fluxbench_logic::VerificationStatus::Failed => {
+                let level = match v.severity {
+                    fluxbench_core::Severity::Critical => "error",
+                    _ => "warning",
+                };
+                println!("::{}::{}: {}", level, v.id, v.message);
+            }
+            fluxbench_logic::VerificationStatus::Error { message } => {
+                println!("::error::{}: evaluation error: {}", v.id, message);
+            }
+            _ => {}
+        }
+    }
+}
+
 fn resolve_git_ref(git_ref: &str) -> anyhow::Result<String> {
     let output = std::process::Command::new("git")
         .args(["rev-parse", "--verify", git_ref])
@@ -816,12 +923,15 @@ fn format_comparison_output(
             };
 
             output.push_str(&format!(
-                "    baseline: {:.2} ns → current: {:.2} ns\n",
-                comparison.baseline_mean_ns, metrics.mean_ns
+                "    baseline: {} → current: {}\n",
+                format_duration(comparison.baseline_mean_ns),
+                format_duration(metrics.mean_ns),
             ));
             output.push_str(&format!(
-                "    change: {:+.2}% ({:+.2} ns) {}\n",
-                comparison.relative_change, comparison.absolute_change_ns, change_icon
+                "    change: {:+.2}% ({}) {}\n",
+                comparison.relative_change,
+                format_duration(comparison.absolute_change_ns.abs()),
+                change_icon,
             ));
         }
 

From 2203a41354ca2489fcd51e3ec92f81496240ae5c Mon Sep 17 00:00:00 2001
From: Farhan Syah <bizimpulse@gmail.com>
Date: Fri, 13 Feb 2026 15:01:22 +0800
Subject: [PATCH 5/8] chore(release): bump version to 0.1.2

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index eb7bfa3..4c9c4d9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,7 +14,7 @@ members = [
 
 [workspace.package]
 authors = ["Farhan Syah"]
-version = "0.1.1"
+version = "0.1.2"
 edition = "2024"
 rust-version = "1.85"
 license = "Apache-2.0"

From 173b9a5dcb4ac6d57fd746ea389a6f4ec8ac6b86 Mon Sep 17 00:00:00 2001
From: Farhan Syah <bizimpulse@gmail.com>
Date: Fri, 13 Feb 2026 15:14:26 +0800
Subject: [PATCH 6/8] chore(cli): disable documentation generation for binary
 target

Binary executables don't need API documentation; only library crates do.
---
 fluxbench-cli/Cargo.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fluxbench-cli/Cargo.toml b/fluxbench-cli/Cargo.toml
index 8a40a37..555edbe 100644
--- a/fluxbench-cli/Cargo.toml
+++ b/fluxbench-cli/Cargo.toml
@@ -17,6 +17,7 @@ path = "src/lib.rs"
 [[bin]]
 name = "fluxbench"
 path = "src/main.rs"
+doc = false
 
 [dependencies]
 fluxbench-core.workspace = true

From 5bd2624a1348f853f44a737cbd1c82ff02f30eed Mon Sep 17 00:00:00 2001
From: Farhan Syah <bizimpulse@gmail.com>
Date: Fri, 13 Feb 2026 15:14:33 +0800
Subject: [PATCH 7/8] refactor(cli): extract baseline comparison into reusable
 function

Extract baseline comparison logic into apply_baseline_comparison() to
eliminate code duplication between run_benchmarks and compare_benchmarks.
This enables baseline comparison support when using --baseline flag with
the run command, not just the compare command.

Also fixes documentation formatting in ComparisonSeries field comment.
---
 fluxbench-cli/src/lib.rs       | 158 ++++++++++++++++++++-------------
 fluxbench-report/src/report.rs |   2 +-
 2 files changed, 98 insertions(+), 62 deletions(-)

diff --git a/fluxbench-cli/src/lib.rs b/fluxbench-cli/src/lib.rs
index df4e498..59d925d 100644
--- a/fluxbench-cli/src/lib.rs
+++ b/fluxbench-cli/src/lib.rs
@@ -481,6 +481,33 @@ fn run_benchmarks(
     let total_duration_ms = start_time.elapsed().as_secs_f64() * 1000.0;
     let mut report = build_report(&results, &stats, &exec_config, total_duration_ms);
 
+    // Load and apply baseline comparison if --baseline was passed
+    if let Some(baseline_path) = resolve_baseline_path(&cli.baseline, config) {
+        if baseline_path.exists() {
+            match std::fs::read_to_string(&baseline_path).and_then(|json| {
+                serde_json::from_str::<fluxbench_report::Report>(&json)
+                    .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
+            }) {
+                Ok(baseline) => {
+                    let threshold = cli.threshold.unwrap_or(config.ci.regression_threshold);
+                    apply_baseline_comparison(&mut report, &baseline, threshold);
+                }
+                Err(e) => {
+                    eprintln!(
+                        "Warning: failed to load baseline {}: {}",
+                        baseline_path.display(),
+                        e
+                    );
+                }
+            }
+        } else {
+            eprintln!(
+                "Warning: baseline file not found: {}",
+                baseline_path.display()
+            );
+        }
+    }
+
     // Run comparisons, synthetics, and verifications
     let (comparison_results, comparison_series, synthetic_results, verification_results) =
         execute_verifications(&results, &stats);
@@ -607,68 +634,9 @@ fn compare_benchmarks(
     let total_duration_ms = start_time.elapsed().as_secs_f64() * 1000.0;
     let mut report = build_report(&results, &stats, &exec_config, total_duration_ms);
 
-    // Store baseline metadata for summary header
-    report.baseline_meta = Some(baseline.meta.clone());
-
-    // Add comparison data
+    // Apply baseline comparison data
     let regression_threshold = cli.threshold.unwrap_or(config.ci.regression_threshold);
-    let baseline_map: std::collections::HashMap<_, _> = baseline
-        .results
-        .iter()
-        .filter_map(|r| r.metrics.as_ref().map(|m| (r.id.clone(), m.clone())))
-        .collect();
-
-    for result in &mut report.results {
-        if let (Some(metrics), Some(baseline_metrics)) =
-            (&result.metrics, baseline_map.get(&result.id))
-        {
-            let baseline_mean = baseline_metrics.mean_ns;
-            let absolute_change = metrics.mean_ns - baseline_mean;
-            let relative_change = if baseline_mean > 0.0 {
-                (absolute_change / baseline_mean) * 100.0
-            } else {
-                0.0
-            };
-
-            // Determine significance via CI non-overlap and threshold crossing.
-            let ci_non_overlap = metrics.ci_upper_ns < baseline_metrics.ci_lower_ns
-                || metrics.ci_lower_ns > baseline_metrics.ci_upper_ns;
-            let is_significant = relative_change.abs() > regression_threshold && ci_non_overlap;
-
-            // Track regressions/improvements
-            if relative_change > regression_threshold {
-                report.summary.regressions += 1;
-            } else if relative_change < -regression_threshold {
-                report.summary.improvements += 1;
-            }
-
-            let mut effect_size = if metrics.std_dev_ns > f64::EPSILON {
-                absolute_change / metrics.std_dev_ns
-            } else {
-                0.0
-            };
-            if !effect_size.is_finite() {
-                effect_size = 0.0;
-            }
-
-            let probability_regression = if ci_non_overlap {
-                if relative_change > 0.0 { 0.99 } else { 0.01 }
-            } else if relative_change > 0.0 {
-                0.60
-            } else {
-                0.40
-            };
-
-            result.comparison = Some(fluxbench_report::Comparison {
-                baseline_mean_ns: baseline_mean,
-                absolute_change_ns: absolute_change,
-                relative_change,
-                probability_regression,
-                is_significant,
-                effect_size,
-            });
-        }
-    }
+    apply_baseline_comparison(&mut report, &baseline, regression_threshold);
 
     // Run comparisons, synthetics, and verifications
     let (comparison_results, comparison_series, synthetic_results, verification_results) =
@@ -758,6 +726,74 @@ fn save_baseline_if_needed(
     Ok(())
 }
 
+/// Apply baseline comparison data to the report.
+///
+/// Computes per-benchmark regression/improvement metrics by comparing current
+/// results against baseline means, CI overlap, and effect size.
+fn apply_baseline_comparison(
+    report: &mut fluxbench_report::Report,
+    baseline: &fluxbench_report::Report,
+    regression_threshold: f64,
+) {
+    report.baseline_meta = Some(baseline.meta.clone());
+
+    let baseline_map: std::collections::HashMap<_, _> = baseline
+        .results
+        .iter()
+        .filter_map(|r| r.metrics.as_ref().map(|m| (r.id.clone(), m.clone())))
+        .collect();
+
+    for result in &mut report.results {
+        if let (Some(metrics), Some(baseline_metrics)) =
+            (&result.metrics, baseline_map.get(&result.id))
+        {
+            let baseline_mean = baseline_metrics.mean_ns;
+            let absolute_change = metrics.mean_ns - baseline_mean;
+            let relative_change = if baseline_mean > 0.0 {
+                (absolute_change / baseline_mean) * 100.0
+            } else {
+                0.0
+            };
+
+            let ci_non_overlap = metrics.ci_upper_ns < baseline_metrics.ci_lower_ns
+                || metrics.ci_lower_ns > baseline_metrics.ci_upper_ns;
+            let is_significant = relative_change.abs() > regression_threshold && ci_non_overlap;
+
+            if relative_change > regression_threshold {
+                report.summary.regressions += 1;
+            } else if relative_change < -regression_threshold {
+                report.summary.improvements += 1;
+            }
+
+            let mut effect_size = if metrics.std_dev_ns > f64::EPSILON {
+                absolute_change / metrics.std_dev_ns
+            } else {
+                0.0
+            };
+            if !effect_size.is_finite() {
+                effect_size = 0.0;
+            }
+
+            let probability_regression = if ci_non_overlap {
+                if relative_change > 0.0 { 0.99 } else { 0.01 }
+            } else if relative_change > 0.0 {
+                0.60
+            } else {
+                0.40
+            };
+
+            result.comparison = Some(fluxbench_report::Comparison {
+                baseline_mean_ns: baseline_mean,
+                absolute_change_ns: absolute_change,
+                relative_change,
+                probability_regression,
+                is_significant,
+                effect_size,
+            });
+        }
+    }
+}
+
 /// Resolve baseline path from CLI flag, config, or default.
 ///
 /// - `Some(Some(path))` — explicit path from `--baseline /path/to/file`
diff --git a/fluxbench-report/src/report.rs b/fluxbench-report/src/report.rs
index da998d2..2023fce 100644
--- a/fluxbench-report/src/report.rs
+++ b/fluxbench-report/src/report.rs
@@ -66,7 +66,7 @@ pub struct ComparisonSeries {
     pub x_values: Vec<String>,
     /// Competitor/series names (benchmark IDs)
     pub series_names: Vec<String>,
-    /// Data points: series_data[series_idx][x_idx] = value
+    /// Data points: `series_data[series_idx][x_idx] = value`
     pub series_data: Vec<Vec<f64>>,
     /// Metric used
     pub metric: String,

From 12e2567e94ab8b4d77c4b04a19f54c711e7e5b13 Mon Sep 17 00:00:00 2001
From: Farhan Syah <bizimpulse@gmail.com>
Date: Fri, 13 Feb 2026 15:35:56 +0800
Subject: [PATCH 8/8] feat(cli): add per-benchmark regression thresholds

Enable benchmarks to override the global CI regression threshold with
benchmark-specific values. Critical benchmarks can now enforce stricter
thresholds while less sensitive benchmarks use the global setting.

Changes:
- Add threshold field to BenchmarkReportResult for per-benchmark overrides
- Thread threshold value through execution and reporting pipeline
- Implement threshold selection logic (per-benchmark > 0 overrides global)
- Display custom thresholds in GitHub Actions annotations
- Add comprehensive unit tests for threshold precedence scenarios
- Raise default CI threshold to 25% to accommodate shared runner variance

The per-benchmark threshold is specified via the bench macro attribute
and defaults to 0.0 (use global). When set, it takes precedence over
the global regression_threshold in regression detection and reporting.
---
 examples/examples/ci_regression.rs      |   6 +
 flux.toml                               |   3 +-
 fluxbench-cli/src/executor/execution.rs |   6 +
 fluxbench-cli/src/executor/report.rs    |   1 +
 fluxbench-cli/src/lib.rs                | 192 +++++++++++++++++++++++-
 fluxbench-core/src/lib.rs               |   2 +-
 fluxbench-report/src/github.rs          |  12 +-
 fluxbench-report/src/report.rs          |   3 +
 8 files changed, 216 insertions(+), 9 deletions(-)

diff --git a/examples/examples/ci_regression.rs b/examples/examples/ci_regression.rs
index f76920b..fa4e7f3 100644
--- a/examples/examples/ci_regression.rs
+++ b/examples/examples/ci_regression.rs
@@ -30,6 +30,8 @@ use std::hint::black_box;
     group = "hot_path",
     severity = "critical",
     threshold = 5.0,
+    warmup = "2s",
+    measurement = "3s",
     tags = "latency"
 )]
 fn request_handler(b: &mut Bencher) {
@@ -50,6 +52,8 @@ fn request_handler(b: &mut Bencher) {
     group = "hot_path",
     severity = "critical",
     threshold = 3.0,
+    warmup = "2s",
+    measurement = "3s",
     tags = "throughput"
 )]
 fn token_scan(b: &mut Bencher) {
@@ -69,6 +73,8 @@ fn token_scan(b: &mut Bencher) {
     group = "hot_path",
     severity = "warning",
     threshold = 10.0,
+    warmup = "2s",
+    measurement = "3s",
     tags = "throughput"
 )]
 fn batch_transform(b: &mut Bencher) {
diff --git a/flux.toml b/flux.toml
index 113f3b5..bfc93f6 100644
--- a/flux.toml
+++ b/flux.toml
@@ -59,7 +59,8 @@ save_baseline = false
 
 [ci]
 # Regression threshold percentage — fail CI if exceeded (default: 5.0)
-regression_threshold = 5.0
+# Shared CI runners (GitHub Actions) have 10-40% noise; use 25%+ to avoid false positives.
+regression_threshold = 25.0
 # Emit GitHub Actions annotations (::warning, ::error) (default: false)
 github_annotations = true
 # Exit non-zero on critical verification failures (default: true)
diff --git a/fluxbench-cli/src/executor/execution.rs b/fluxbench-cli/src/executor/execution.rs
index c6091e2..f655a5e 100644
--- a/fluxbench-cli/src/executor/execution.rs
+++ b/fluxbench-cli/src/executor/execution.rs
@@ -119,6 +119,8 @@ pub struct BenchExecutionResult {
     pub failure_kind: Option<String>,
     pub backtrace: Option<String>,
     pub severity: fluxbench_core::Severity,
+    /// Per-benchmark regression threshold (0.0 = use global)
+    pub threshold: f64,
 }
 
 /// Execute benchmarks and produce results (in-process mode)
@@ -217,6 +219,7 @@ impl Executor {
                     failure_kind: None,
                     backtrace: None,
                     severity: bench.severity,
+                    threshold: bench.threshold,
                 }
             }
             Err(panic) => {
@@ -244,6 +247,7 @@ impl Executor {
                     failure_kind: Some("panic".to_string()),
                     backtrace: None,
                     severity: bench.severity,
+                    threshold: bench.threshold,
                 }
             }
         }
@@ -357,6 +361,7 @@ impl IsolatedExecutor {
                         failure_kind: Some("crashed".to_string()),
                         backtrace: None,
                         severity: bench.severity,
+                        threshold: bench.threshold,
                     });
                     pb.inc(1);
                 }
@@ -431,6 +436,7 @@ impl IsolatedExecutor {
             failure_kind,
             backtrace,
             severity: bench.severity,
+            threshold: bench.threshold,
         }
     }
 }
diff --git a/fluxbench-cli/src/executor/report.rs b/fluxbench-cli/src/executor/report.rs
index cf1aab2..31fbb32 100644
--- a/fluxbench-cli/src/executor/report.rs
+++ b/fluxbench-cli/src/executor/report.rs
@@ -153,6 +153,7 @@ pub fn build_report(
             file: result.file.clone(),
             line: result.line,
             metrics,
+            threshold: result.threshold,
             comparison: None, // Filled when comparing to baseline
             failure,
         });
diff --git a/fluxbench-cli/src/lib.rs b/fluxbench-cli/src/lib.rs
index 59d925d..98f0e7f 100644
--- a/fluxbench-cli/src/lib.rs
+++ b/fluxbench-cli/src/lib.rs
@@ -747,6 +747,13 @@ fn apply_baseline_comparison(
         if let (Some(metrics), Some(baseline_metrics)) =
             (&result.metrics, baseline_map.get(&result.id))
         {
+            // Use per-benchmark threshold if set (> 0.0), otherwise global
+            let effective_threshold = if result.threshold > 0.0 {
+                result.threshold
+            } else {
+                regression_threshold
+            };
+
             let baseline_mean = baseline_metrics.mean_ns;
             let absolute_change = metrics.mean_ns - baseline_mean;
             let relative_change = if baseline_mean > 0.0 {
@@ -757,11 +764,11 @@ fn apply_baseline_comparison(
 
             let ci_non_overlap = metrics.ci_upper_ns < baseline_metrics.ci_lower_ns
                 || metrics.ci_lower_ns > baseline_metrics.ci_upper_ns;
-            let is_significant = relative_change.abs() > regression_threshold && ci_non_overlap;
+            let is_significant = relative_change.abs() > effective_threshold && ci_non_overlap;
 
-            if relative_change > regression_threshold {
+            if relative_change > effective_threshold {
                 report.summary.regressions += 1;
-            } else if relative_change < -regression_threshold {
+            } else if relative_change < -effective_threshold {
                 report.summary.improvements += 1;
             }
 
@@ -987,3 +994,182 @@ fn format_comparison_output(
 
     output
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use fluxbench_report::{
+        BenchmarkMetrics, BenchmarkReportResult, BenchmarkStatus, Report, ReportConfig, ReportMeta,
+        ReportSummary, SystemInfo,
+    };
+
+    /// Global regression threshold (percent) handed to every comparison in these tests.
+    const GLOBAL_THRESHOLD: f64 = 25.0;
+
+    /// Builds fixed placeholder metadata; only the timestamp varies between calls.
+    fn dummy_meta() -> ReportMeta {
+        let system = SystemInfo {
+            os: "linux".to_owned(),
+            os_version: "6.0".to_owned(),
+            cpu: "test".to_owned(),
+            cpu_cores: 1,
+            memory_gb: 1.0,
+        };
+        let config = ReportConfig {
+            warmup_time_ns: 0,
+            measurement_time_ns: 0,
+            min_iterations: None,
+            max_iterations: None,
+            bootstrap_iterations: 0,
+            confidence_level: 0.95,
+            track_allocations: false,
+        };
+        ReportMeta {
+            schema_version: 1,
+            version: "0.1.0".to_owned(),
+            timestamp: chrono::Utc::now(),
+            git_commit: None,
+            git_branch: None,
+            system,
+            config,
+        }
+    }
+
+    /// Synthesizes metrics centred on `mean_ns`, with a 95% confidence interval of ±2%.
+    ///
+    /// Narrow enough that the 8-10% shifts used below give non-overlapping intervals.
+    fn dummy_metrics(mean_ns: f64) -> BenchmarkMetrics {
+        BenchmarkMetrics {
+            samples: 100,
+            mean_ns,
+            median_ns: mean_ns,
+            std_dev_ns: mean_ns * 0.01,
+            min_ns: mean_ns * 0.9,
+            max_ns: mean_ns * 1.1,
+            p50_ns: mean_ns,
+            p90_ns: mean_ns * 1.05,
+            p95_ns: mean_ns * 1.07,
+            p99_ns: mean_ns * 1.09,
+            p999_ns: mean_ns * 1.1,
+            skewness: 0.0,
+            kurtosis: 3.0,
+            ci_lower_ns: mean_ns * 0.98,
+            ci_upper_ns: mean_ns * 1.02,
+            ci_level: 0.95,
+            throughput_ops_sec: None,
+            alloc_bytes: 0,
+            alloc_count: 0,
+            mean_cycles: 0.0,
+            median_cycles: 0.0,
+            min_cycles: 0,
+            max_cycles: 0,
+            cycles_per_ns: 0.0,
+        }
+    }
+
+    /// Wraps metrics for `mean_ns` in a passing result with the given per-bench threshold.
+    fn dummy_result(id: &str, mean_ns: f64, threshold: f64) -> BenchmarkReportResult {
+        BenchmarkReportResult {
+            id: id.to_owned(),
+            name: id.to_owned(),
+            group: "test".to_owned(),
+            status: BenchmarkStatus::Passed,
+            severity: fluxbench_core::Severity::Warning,
+            file: "test.rs".to_owned(),
+            line: 1,
+            metrics: Some(dummy_metrics(mean_ns)),
+            threshold,
+            comparison: None,
+            failure: None,
+        }
+    }
+
+    /// Wraps `results` in a report whose summary counts every entry as passed.
+    fn dummy_report(results: Vec<BenchmarkReportResult>) -> Report {
+        let summary = ReportSummary {
+            total_benchmarks: results.len(),
+            passed: results.len(),
+            ..Default::default()
+        };
+        Report {
+            meta: dummy_meta(),
+            results,
+            comparisons: Vec::new(),
+            comparison_series: Vec::new(),
+            synthetics: Vec::new(),
+            verifications: Vec::new(),
+            summary,
+            baseline_meta: None,
+        }
+    }
+
+    /// Reads `is_significant` from result `index`; panics if it has no comparison.
+    fn significant_at(report: &Report, index: usize) -> bool {
+        let comparison = report.results[index].comparison.as_ref();
+        comparison.unwrap().is_significant
+    }
+
+    #[test]
+    fn per_bench_threshold_overrides_global() {
+        // 100ns → 108ns is an 8% slowdown: under the 25% global threshold, but over
+        // this benchmark's own 5% threshold, so the per-bench value must decide.
+        let baseline = dummy_report(vec![dummy_result("fast_bench", 100.0, 5.0)]);
+        let mut current = dummy_report(vec![dummy_result("fast_bench", 108.0, 5.0)]);
+
+        apply_baseline_comparison(&mut current, &baseline, GLOBAL_THRESHOLD);
+
+        assert_eq!(
+            current.summary.regressions, 1,
+            "per-bench 5% should catch 8% regression"
+        );
+        assert!(significant_at(&current, 0));
+    }
+
+    #[test]
+    fn zero_threshold_falls_back_to_global() {
+        // A per-bench threshold of 0.0 means "use the global one": the 8% slowdown
+        // (100ns → 108ns) stays under the 25% global threshold and is not flagged.
+        let baseline = dummy_report(vec![dummy_result("normal_bench", 100.0, 0.0)]);
+        let mut current = dummy_report(vec![dummy_result("normal_bench", 108.0, 0.0)]);
+
+        apply_baseline_comparison(&mut current, &baseline, GLOBAL_THRESHOLD);
+
+        assert_eq!(
+            current.summary.regressions, 0,
+            "8% under 25% global should not regress"
+        );
+        assert!(!significant_at(&current, 0));
+    }
+
+    #[test]
+    fn mixed_thresholds_independent() {
+        // Both benchmarks slow down by 8%. "tight" carries its own 5% threshold and
+        // should regress; "loose" (0.0) is judged against the 25% global and should not.
+        let baseline = dummy_report(vec![
+            dummy_result("tight", 100.0, 5.0),
+            dummy_result("loose", 100.0, 0.0),
+        ]);
+        let mut current = dummy_report(vec![
+            dummy_result("tight", 108.0, 5.0),
+            dummy_result("loose", 108.0, 0.0),
+        ]);
+
+        apply_baseline_comparison(&mut current, &baseline, GLOBAL_THRESHOLD);
+
+        assert_eq!(current.summary.regressions, 1);
+        assert!(significant_at(&current, 0));
+        assert!(!significant_at(&current, 1));
+    }
+
+    #[test]
+    fn per_bench_threshold_detects_improvement() {
+        // 100ns → 90ns is a 10% speed-up, beyond the 5% per-bench threshold.
+        let baseline = dummy_report(vec![dummy_result("improving", 100.0, 5.0)]);
+        let mut current = dummy_report(vec![dummy_result("improving", 90.0, 5.0)]);
+
+        apply_baseline_comparison(&mut current, &baseline, GLOBAL_THRESHOLD);
+
+        assert_eq!(current.summary.improvements, 1);
+        assert_eq!(current.summary.regressions, 0);
+    }
+}
diff --git a/fluxbench-core/src/lib.rs b/fluxbench-core/src/lib.rs
index 0bc84ed..bf328c5 100644
--- a/fluxbench-core/src/lib.rs
+++ b/fluxbench-core/src/lib.rs
@@ -32,7 +32,7 @@ pub struct BenchmarkDef {
     pub group: &'static str,
     /// Severity level for CI reporting
     pub severity: Severity,
-    /// Regression threshold percentage
+    /// Per-benchmark regression threshold percentage (0.0 = use global threshold)
     pub threshold: f64,
     /// Absolute time budget in nanoseconds
     pub budget_ns: Option<u64>,
diff --git a/fluxbench-report/src/github.rs b/fluxbench-report/src/github.rs
index aaa66d5..ebe119e 100644
--- a/fluxbench-report/src/github.rs
+++ b/fluxbench-report/src/github.rs
@@ -101,13 +101,17 @@ fn write_baseline_comparison(output: &mut String, report: &Report) {
             let baseline = format_duration(cmp.baseline_mean_ns);
             let change = format_change(cmp.relative_change);
             let status = if cmp.relative_change.abs() < 1.0 {
-                "stable"
+                "stable".to_string()
             } else if cmp.is_significant && cmp.relative_change > 0.0 {
-                "REGRESSION"
+                if result.threshold > 0.0 {
+                    format!("REGRESSION (>{:.0}%)", result.threshold)
+                } else {
+                    "REGRESSION".to_string()
+                }
             } else if cmp.is_significant && cmp.relative_change < 0.0 {
-                "improved"
+                "improved".to_string()
             } else {
-                "within noise"
+                "within noise".to_string()
             };
 
             output.push_str(&format!(
diff --git a/fluxbench-report/src/report.rs b/fluxbench-report/src/report.rs
index 2023fce..1262346 100644
--- a/fluxbench-report/src/report.rs
+++ b/fluxbench-report/src/report.rs
@@ -144,6 +144,9 @@ pub struct BenchmarkReportResult {
     pub line: u32,
     /// Timing and statistical metrics (if successful)
     pub metrics: Option<BenchmarkMetrics>,
+    /// Per-benchmark regression threshold percentage (0.0 = use global)
+    #[serde(default)]
+    pub threshold: f64,
     /// Comparison results against baseline (if applicable)
     pub comparison: Option<Comparison>,
     /// Failure details (if failed)