ml-rust · farhan-syah · Feb 13, 2026 · Feb 10, 2026 · Feb 10, 2026 · Feb 10, 2026
diff --git a/.github/workflows/baseline.yml b/.github/workflows/baseline.yml
@@ -0,0 +1,55 @@
+# Save benchmark baseline.
+#
+# This workflow runs the CI regression benchmarks in "save" mode:
+# it writes a baseline JSON to the GitHub Actions cache, keyed by commit SHA.
+#
+# benchmark.yml (on PRs) restores this cache to compare against, enabling
+# regression detection. Cache keys use prefix matching so the latest baseline
+# from main is always picked up, even across many merges.
+#
+# Triggered manually via workflow_dispatch (should be run from the main branch).
+
+name: Baseline
+
+on:
+  workflow_dispatch:  # manual runs only
+
+permissions:
+  contents: read
+
+env:
+  CARGO_TERM_COLOR: always
+
+concurrency:
+  group: baseline-${{ github.ref }}
+  cancel-in-progress: true  # a newer run on the same ref replaces an older one
+
+jobs:
+  test:
+    uses: ./.github/workflows/test.yml  # reusable test workflow
+    name: Test Suite
+
+  baseline:
+    name: Save Benchmark Baseline
+    needs: test  # benchmark only after the test suite has passed
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: dtolnay/rust-toolchain@stable
+        name: Install Rust
+
+      - uses: Swatinem/rust-cache@v2
+        with:
+          prefix-key: bench  # same prefix as benchmark.yml
+
+      - name: Run benchmarks and save baseline
+        run: cargo bench --bench ci_regression -- --save-baseline
+
+      # The key embeds the commit SHA, so every benchmarked commit gets its own
+      # cache entry; benchmark.yml finds the newest one via a restore-keys prefix.
+      - name: Cache baseline
+        uses: actions/cache/save@v4
+        with:
+          key: numr-bench-baseline-${{ github.sha }}
+          path: target/fluxbench/baseline.json
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -0,0 +1,77 @@
+# Benchmark regression check.
+#
+# Runs on PRs (non-draft) and can be called by other workflows (e.g. release.yml).
+#
+# How regression detection works:
+#   1. baseline.yml saves a baseline JSON when it is run manually on main (cached by commit SHA).
+#   2. This workflow restores that baseline and passes it via --baseline to fluxbench.
+#   3. Each benchmark has a per-bench threshold — regressions beyond this are flagged.
+#   4. Exit codes are controlled by #[verify] expressions with severity levels:
+#        - critical: exits non-zero -> job fails -> PR blocked
+#        - warning:  exits zero -> shows warnings in summary
+#        - info:     logged in the summary only
+#   5. If no baseline exists yet (first run), benchmarks run without comparison.
+
+name: Benchmark
+
+on:
+  workflow_dispatch:  # manual runs
+  workflow_call:  # reused by other workflows such as release.yml
+  pull_request:
+    types: [opened, synchronize, reopened, ready_for_review]
+    branches: [main]
+
+permissions:
+  contents: read
+
+env:
+  CARGO_TERM_COLOR: always
+
+concurrency:
+  group: benchmark-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test:
+    if: github.event.pull_request.draft == false  # skip draft PRs
+    uses: ./.github/workflows/test.yml
+    name: Test Suite
+
+  benchmark:
+    name: Regression Check
+    needs: test  # wait for the test suite first
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: dtolnay/rust-toolchain@stable
+        name: Install Rust
+
+      - uses: Swatinem/rust-cache@v2
+        with:
+          prefix-key: bench
+
+      - name: Build benchmarks
+        run: cargo build --bench ci_regression --release
+
+      # Pull in the newest baseline that baseline.yml stored from main.
+      # The primary key is a placeholder that never matches on purpose; the
+      # restore-keys prefix "numr-bench-baseline-" then selects the latest entry.
+      # With no baseline cached yet, the step restores nothing and does not fail.
+      - name: Restore baseline from main
+        uses: actions/cache/restore@v4
+        with:
+          key: numr-bench-baseline-dummy
+          restore-keys: numr-bench-baseline-
+          path: target/fluxbench/baseline.json
+
+      # --format github-summary renders a markdown table for the step summary;
+      # --baseline is added only when a baseline file was restored. The exit code
+      # reflects critical verification failures (see flux.toml: fail_on_critical).
+      - name: Run benchmarks
+        run: |
+          ARGS=(--format github-summary)  # bash array: one element per argument
+          if [ -f target/fluxbench/baseline.json ]; then
+            ARGS+=(--baseline target/fluxbench/baseline.json)
+          fi
+          cargo bench --bench ci_regression -- "${ARGS[@]}" >> "$GITHUB_STEP_SUMMARY"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -1,10 +1,17 @@
+# CI — thin wrapper that calls the reusable test workflow.
+#
+# All test jobs (lint, cross-platform tests, backend compile gates, parity,
+# examples) live in test.yml to avoid duplication across ci.yml, benchmark.yml,
+# baseline.yml, and release.yml.
+
 name: CI
 
 on:
   pull_request:
     branches: [main]
     types: [opened, synchronize, reopened, ready_for_review]
   workflow_dispatch:
+  workflow_call:
 
 concurrency:
   group: ci-${{ github.ref }}
@@ -13,59 +20,8 @@ concurrency:
 permissions:
   contents: read
 
-env:
-  CARGO_TERM_COLOR: always
-
 jobs:
-  lint:
-    if: github.event.pull_request.draft == false
-    name: Lint, Format & Docs
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Install Rust
-        uses: dtolnay/rust-toolchain@stable
-        with:
-          components: rustfmt, clippy
-
-      - uses: Swatinem/rust-cache@v2
-        with:
-          prefix-key: lint
-
-      - name: Check formatting
-        run: cargo fmt --all --check
-
-      - name: Run clippy (all CI-safe features)
-        run: cargo clippy --all-targets --features f16,sparse -- -D warnings
-
-      - name: Build docs
-        run: cargo doc --no-deps --features f16,sparse
-
-      - name: Run doctests
-        run: cargo test --doc --features f16,sparse
-
   test:
     if: github.event.pull_request.draft == false
-    name: Test (${{ matrix.os }})
-    runs-on: ${{ matrix.os }}
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Install Rust
-        uses: dtolnay/rust-toolchain@stable
-
-      - uses: Swatinem/rust-cache@v2
-        with:
-          prefix-key: test
-
-      - name: Run tests (default)
-        run: cargo test
-
-      - name: Run tests (f16 + sparse)
-        run: cargo test --features f16,sparse
+    name: Test Suite
+    uses: ./.github/workflows/test.yml  # lint, cross-platform tests, backend checks
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -59,61 +59,15 @@ jobs:
 
           echo "version=$TAG_VERSION" >> $GITHUB_OUTPUT
 
-  lint:
-    name: Lint, Format & Docs
+  # Reuse benchmark.yml, which runs the full test suite (test.yml) and then the regression check.
+  ci:
+    name: CI + Benchmark
     needs: validate-version
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Install Rust
-        uses: dtolnay/rust-toolchain@stable
-        with:
-          components: rustfmt, clippy
-
-      - uses: Swatinem/rust-cache@v2
-        with:
-          prefix-key: lint
-
-      - name: Check formatting
-        run: cargo fmt --all --check
-
-      - name: Run clippy (all CI-safe features)
-        run: cargo clippy --all-targets --features f16,sparse -- -D warnings
-
-      - name: Build docs
-        run: cargo doc --no-deps --features f16,sparse
-
-      - name: Run doctests
-        run: cargo test --doc --features f16,sparse
-
-  test:
-    name: Test (${{ matrix.os }})
-    needs: validate-version
-    runs-on: ${{ matrix.os }}
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Install Rust
-        uses: dtolnay/rust-toolchain@stable
-
-      - uses: Swatinem/rust-cache@v2
-        with:
-          prefix-key: test
-
-      - name: Run tests (default)
-        run: cargo test
-
-      - name: Run tests (f16 + sparse)
-        run: cargo test --features f16,sparse
+    uses: ./.github/workflows/benchmark.yml
 
   publish:
     name: Publish to crates.io
-    needs: [validate-version, lint, test]
+    needs: [validate-version, ci]
     runs-on: ubuntu-latest
     environment: crates-io
     steps:

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -0,0 +1,118 @@
+# Reusable test workflow: lint, format, docs, cross-platform tests, backend checks.
+#
+# Called by:
+#   - ci.yml        (PR checks)
+#   - benchmark.yml (PR regression checks)
+#   - baseline.yml  (manually triggered baseline saves)
+#   - release.yml   (via benchmark.yml)
+#
+# Not triggered directly — use workflow_call only.
+
+name: Test
+
+on:
+  workflow_call:  # reusable only; no direct triggers
+
+env:
+  CARGO_TERM_COLOR: always
+
+permissions:
+  contents: read
+
+jobs:
+  lint:
+    name: Lint, Format & Docs
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+        with:
+          components: rustfmt, clippy  # used by the formatting and clippy steps below
+
+      - uses: Swatinem/rust-cache@v2
+        with:
+          prefix-key: lint  # cache prefix distinct from the other jobs in this file
+
+      - name: Check formatting  # --check reports differences instead of rewriting files
+        run: cargo fmt --all --check
+
+      - name: Run clippy (all CI-safe features)  # -D warnings: any warning fails the step
+        run: cargo clippy --all-targets --features f16,sparse -- -D warnings
+
+      - name: Build docs  # --no-deps: document this crate only
+        run: cargo doc --no-deps --features f16,sparse
+
+      - name: Run doctests  # --doc: documentation tests only
+        run: cargo test --doc --features f16,sparse
+
+  test:
+    name: Test (${{ matrix.os }})
+    runs-on: ${{ matrix.os }}  # one job per entry in matrix.os
+    strategy:
+      fail-fast: false  # keep the other OS legs running when one fails
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+
+      - uses: Swatinem/rust-cache@v2
+        with:
+          prefix-key: test  # cache prefix distinct from the other jobs in this file
+
+      - name: Run tests (default)  # default feature set
+        run: cargo test
+
+      - name: Run tests (f16 + sparse)  # same suite with the optional features enabled
+        run: cargo test --features f16,sparse
+
+  backend-and-parity:
+    name: Backend Compile, Parity & Examples
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+
+      - uses: Swatinem/rust-cache@v2
+        with:
+          prefix-key: backend-parity  # cache prefix distinct from the other jobs in this file
+
+      # Backend compile gates: `cargo check` / `cargo test --no-run` only compile, nothing is executed.
+      - name: "Compile: cpu-only (no default features)"
+        run: cargo check --no-default-features --features cpu
+
+      - name: "Compile: cpu + f16 + sparse"
+        run: cargo check --features f16,sparse
+
+      - name: "Compile: wgpu"
+        run: cargo check --features wgpu,f16,sparse
+
+      - name: "Compile tests: cpu-only"
+        run: cargo test --no-run --no-default-features --features cpu
+
+      - name: "Compile tests: wgpu"
+        run: cargo test --no-run --features wgpu,f16,sparse
+
+      # Backend parity: runs only the tests whose names match the "backend_parity" filter.
+      - name: Run backend parity tests
+        run: cargo test backend_parity --features f16,sparse
+
+      # Examples. NOTE(review): backend_switch_cpu_wgpu runs without --features wgpu; confirm intended.
+      - name: Build all examples
+        run: cargo build --examples --features sparse
+
+      - name: Run examples
+        run: |
+          cargo run --example basic_tensor_ops
+          cargo run --example autograd_linear_regression
+          cargo run --example conv_unfold_im2col
+          cargo run --example fft_roundtrip
+          cargo run --example sparse_coo_csr_workflow --features sparse
+          cargo run --example backend_switch_cpu_wgpu