ml-rust · farhan-syah · Feb 13, 2026 · Feb 13, 2026 · Feb 13, 2026 · Feb 13, 2026
diff --git a/.github/workflows/baseline.yml b/.github/workflows/baseline.yml
@@ -20,6 +20,7 @@ concurrency:
 
 permissions:
   contents: read
+  actions: write
 
 env:
   CARGO_TERM_COLOR: always
@@ -50,6 +51,12 @@ jobs:
 
       # Cache keyed by SHA so each merge gets its own entry.
       # benchmark.yml uses restore-keys prefix matching to find the latest one.
+      # Delete any existing cache entry for this SHA first, so re-runs on the same commit can save a fresh baseline.
+      - name: Clear stale baseline cache
+        run: gh cache delete "fluxbench-baseline-${{ github.sha }}" 2>/dev/null || true
+        env:
+          GH_TOKEN: ${{ github.token }}
+
       - name: Cache baseline
         uses: actions/cache/save@v4
         with:

diff --git a/Cargo.toml b/Cargo.toml
@@ -14,7 +14,7 @@ members = [
 
 [workspace.package]
 authors = ["Farhan Syah"]
-version = "0.1.1"
+version = "0.1.2"
 edition = "2024"
 rust-version = "1.85"
 license = "Apache-2.0"

diff --git a/examples/examples/ci_regression.rs b/examples/examples/ci_regression.rs
@@ -30,6 +30,8 @@ use std::hint::black_box;
     group = "hot_path",
     severity = "critical",
     threshold = 5.0,
+    warmup = "2s",
+    measurement = "3s",
     tags = "latency"
 )]
 fn request_handler(b: &mut Bencher) {
@@ -50,6 +52,8 @@ fn request_handler(b: &mut Bencher) {
     group = "hot_path",
     severity = "critical",
     threshold = 3.0,
+    warmup = "2s",
+    measurement = "3s",
     tags = "throughput"
 )]
 fn token_scan(b: &mut Bencher) {
@@ -69,6 +73,8 @@ fn token_scan(b: &mut Bencher) {
     group = "hot_path",
     severity = "warning",
     threshold = 10.0,
+    warmup = "2s",
+    measurement = "3s",
     tags = "throughput"
 )]
 fn batch_transform(b: &mut Bencher) {

diff --git a/flux.toml b/flux.toml
@@ -59,7 +59,8 @@ save_baseline = false
 
 [ci]
 # Regression threshold percentage — fail CI if exceeded (default: 5.0)
-regression_threshold = 5.0
+# Shared CI runners (GitHub Actions) show roughly 10-40% run-to-run noise; use 25% or more to reduce false positives.
+regression_threshold = 25.0
 # Emit GitHub Actions annotations (::warning, ::error) (default: false)
 github_annotations = true
 # Exit non-zero on critical verification failures (default: true)

diff --git a/fluxbench-cli/Cargo.toml b/fluxbench-cli/Cargo.toml
@@ -17,6 +17,7 @@ path = "src/lib.rs"
 [[bin]]
 name = "fluxbench"
 path = "src/main.rs"
+doc = false
 
 [dependencies]
 fluxbench-core.workspace = true

diff --git a/fluxbench-cli/src/executor/execution.rs b/fluxbench-cli/src/executor/execution.rs
@@ -119,6 +119,8 @@ pub struct BenchExecutionResult {
     pub failure_kind: Option<String>,
     pub backtrace: Option<String>,
     pub severity: fluxbench_core::Severity,
+    /// Per-benchmark regression threshold; `0.0` means "fall back to the global threshold".
+    pub threshold: f64,
 }
 
 /// Execute benchmarks and produce results (in-process mode)
@@ -217,6 +219,7 @@ impl Executor {
                     failure_kind: None,
                     backtrace: None,
                     severity: bench.severity,
+                    threshold: bench.threshold,
                 }
             }
             Err(panic) => {
@@ -244,6 +247,7 @@ impl Executor {
                     failure_kind: Some("panic".to_string()),
                     backtrace: None,
                     severity: bench.severity,
+                    threshold: bench.threshold,
                 }
             }
         }
@@ -357,6 +361,7 @@ impl IsolatedExecutor {
                         failure_kind: Some("crashed".to_string()),
                         backtrace: None,
                         severity: bench.severity,
+                        threshold: bench.threshold,
                     });
                     pb.inc(1);
                 }
@@ -431,6 +436,7 @@ impl IsolatedExecutor {
             failure_kind,
             backtrace,
             severity: bench.severity,
+            threshold: bench.threshold,
         }
     }
 }
diff --git a/fluxbench-cli/src/executor/formatting.rs b/fluxbench-cli/src/executor/formatting.rs
@@ -10,7 +10,7 @@
 //! - Comparison tables with speedup calculations
 //! - Verification results summary
 
-use fluxbench_report::{BenchmarkReportResult, BenchmarkStatus, Report};
+use fluxbench_report::{BenchmarkReportResult, BenchmarkStatus, Report, format_duration};
 
 /// Format a report for human-readable terminal display
 ///
@@ -51,20 +51,27 @@ pub fn format_human_output(report: &Report) -> String {
 
             if let Some(metrics) = &result.metrics {
                 output.push_str(&format!(
-                    "      mean: {:.2} ns  median: {:.2} ns  stddev: {:.2} ns\n",
-                    metrics.mean_ns, metrics.median_ns, metrics.std_dev_ns
+                    "      mean: {}  median: {}  stddev: {}\n",
+                    format_duration(metrics.mean_ns),
+                    format_duration(metrics.median_ns),
+                    format_duration(metrics.std_dev_ns),
                 ));
                 output.push_str(&format!(
-                    "      min: {:.2} ns  max: {:.2} ns  samples: {}\n",
-                    metrics.min_ns, metrics.max_ns, metrics.samples
+                    "      min: {}  max: {}  samples: {}\n",
+                    format_duration(metrics.min_ns),
+                    format_duration(metrics.max_ns),
+                    metrics.samples,
                 ));
                 output.push_str(&format!(
-                    "      p50: {:.2} ns  p95: {:.2} ns  p99: {:.2} ns\n",
-                    metrics.p50_ns, metrics.p95_ns, metrics.p99_ns
+                    "      p50: {}  p95: {}  p99: {}\n",
+                    format_duration(metrics.p50_ns),
+                    format_duration(metrics.p95_ns),
+                    format_duration(metrics.p99_ns),
                 ));
                 output.push_str(&format!(
-                    "      95% CI: [{:.2}, {:.2}] ns\n",
-                    metrics.ci_lower_ns, metrics.ci_upper_ns
+                    "      95% CI: [{}, {}]\n",
+                    format_duration(metrics.ci_lower_ns),
+                    format_duration(metrics.ci_upper_ns),
                 ));
                 if let Some(throughput) = metrics.throughput_ops_sec {
                     output.push_str(&format!("      throughput: {:.2} ops/sec\n", throughput));
@@ -133,9 +140,9 @@ pub fn format_human_output(report: &Report) -> String {
             };
 
             output.push_str(&format!(
-                "  {:<width$}  {:>12.2}  {:>10}{}\n",
+                "  {:<width$}  {:>12}  {:>10}{}\n",
                 entry.benchmark_id,
-                entry.value,
+                format_duration(entry.value),
                 speedup_str,
                 baseline_marker,
                 width = max_name_len

diff --git a/fluxbench-cli/src/executor/report.rs b/fluxbench-cli/src/executor/report.rs
@@ -153,6 +153,7 @@ pub fn build_report(
             file: result.file.clone(),
             line: result.line,
             metrics,
+            threshold: result.threshold,
             comparison: None, // Filled when comparing to baseline
             failure,
         });