Merged

132 commits
4b6709f
feat: enhance tensor layout with Hash trait and comprehensive API
farhan-syah Feb 13, 2026
cbedc0a
chore: bump version to 0.5.0
farhan-syah Feb 13, 2026
0f081df
refactor: extract dtype module into separate files and add DataType t…
farhan-syah Feb 17, 2026
6692cb7
feat: add associated DType type to Runtime trait
farhan-syah Feb 17, 2026
4591abe
refactor: make Tensor generic over Runtime::DType
farhan-syah Feb 17, 2026
9839951
refactor: update operations to use Runtime::DType bounds
farhan-syah Feb 17, 2026
e767407
refactor: update algorithm implementations with Runtime::DType bounds
farhan-syah Feb 17, 2026
b262189
refactor: update autograd system with Runtime::DType bounds
farhan-syah Feb 17, 2026
0b61ea9
refactor: update tests and library exports for dtype system changes
farhan-syah Feb 17, 2026
640fbde
feat(allocator): add TrackingAllocator with stats and reset support
farhan-syah Feb 18, 2026
1fe0556
feat(tensor): add ergonomic accessors and dimension unpacking
farhan-syah Feb 18, 2026
28c18ea
feat(runtime): add Graph trait and CUDA graph capture
farhan-syah Feb 18, 2026
02ff196
refactor(runtime): tighten Runtime::DType bounds to concrete DType
farhan-syah Feb 18, 2026
18a976e
test: add ML dtype audit for reduced-precision types
farhan-syah Feb 18, 2026
3c7ce59
feat(runtime): add Communicator trait for multi-device collective com…
farhan-syah Feb 18, 2026
89a19b7
fix(runtime): recover from mutex poison in TrackingAllocator
farhan-syah Feb 18, 2026
3f945e4
fix(tests): gate ml_dtype_audit items behind f16/fp8 feature flags
farhan-syah Feb 18, 2026
649e6e3
feat(tensor): add zero-copy host slice accessors to Storage
farhan-syah Feb 18, 2026
e3b6850
feat(autograd): add backward support for narrow and cat shape ops
farhan-syah Feb 18, 2026
61ab288
fix: correct mutex poison handling and mutable slice receiver
farhan-syah Feb 18, 2026
b3a6035
feat(indexing): add slice_assign operation across all backends
farhan-syah Feb 18, 2026
ed6a0c3
feat(runtime/cuda): add NCCL-backed communicator for multi-GPU collec…
farhan-syah Feb 19, 2026
81e4f37
feat(autograd): add differentiable rms_norm and layer_norm operations
farhan-syah Feb 19, 2026
72c3041
feat(runtime): add nexar-backed inter-node communicator
farhan-syah Feb 19, 2026
16c89e4
refactor(tensor): consolidate ptr() to return offset-adjusted data po…
farhan-syah Feb 19, 2026
d8cce58
feat(ops): add log_softmax and dropout activation operations
farhan-syah Feb 19, 2026
94ba72d
fix(algorithm): tighten Runtime<DType = DType> bounds in iterative so…
farhan-syah Feb 19, 2026
d8fff34
feat(autograd): add SiLU activation with backward pass
farhan-syah Feb 19, 2026
5c31e08
feat(ops): add softplus activation with autograd support
farhan-syah Feb 19, 2026
26ca4ad
refactor(runtime): split communicator into module with hierarchical a…
farhan-syah Feb 22, 2026
67acfcd
refactor(runtime): consolidate shared utilities into common submodule
farhan-syah Feb 22, 2026
9187449
fix(reduce): treat empty dims as full reduction instead of identity
farhan-syah Feb 22, 2026
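The reduce fix above changes what an empty `dims` argument means: a full reduction over every axis rather than an identity pass-through. A minimal sketch of that normalization step (a hypothetical helper, not numr's actual API):

```rust
// Normalize a reduction's `dims` argument: an empty list now means
// "reduce over every axis" instead of a no-op. Hypothetical helper
// illustrating the semantics of the fix; not numr's real code.
fn normalize_reduce_dims(dims: &[usize], ndim: usize) -> Vec<usize> {
    if dims.is_empty() {
        (0..ndim).collect() // full reduction
    } else {
        dims.to_vec()
    }
}

fn main() {
    assert_eq!(normalize_reduce_dims(&[], 3), vec![0, 1, 2]);
    assert_eq!(normalize_reduce_dims(&[1], 3), vec![1]);
}
```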
942276f
feat(sparse_linalg): add sparse QR factorization with multi-backend s…
farhan-syah Feb 22, 2026
12f5291
fix(tensor): preserve layout offset in reshape for non-zero-offset views
farhan-syah Feb 22, 2026
279919a
feat(autograd): add backward hooks for leaf gradient notifications
farhan-syah Feb 22, 2026
866d48c
feat(autograd): add activation checkpointing
farhan-syah Feb 22, 2026
6d5a381
feat(runtime): add StreamSyncOps for compute-communication overlap
farhan-syah Feb 22, 2026
9c89036
docs(readme): document autograd, normalization, einsum, and sparse li…
farhan-syah Feb 22, 2026
739ba78
feat(autograd): add differentiable dtype cast operation
farhan-syah Feb 23, 2026
4252b02
feat(autograd): add dropout operation with inverted scaling and gradi…
farhan-syah Feb 23, 2026
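"Inverted scaling" in the dropout commit above refers to scaling surviving elements by 1/(1-p) at training time, so inference needs no rescaling at all. A deterministic sketch of the scheme (the keep-mask is passed in explicitly here; the real op draws it randomly):

```rust
// Inverted dropout: zero dropped elements and scale kept ones by
// 1/(1-p), so the expected activation magnitude matches eval time
// without any rescale there. Deterministic sketch, not numr's op.
fn dropout_inverted(x: &[f32], keep_mask: &[bool], p: f32) -> Vec<f32> {
    let scale = 1.0 / (1.0 - p);
    x.iter()
        .zip(keep_mask)
        .map(|(&v, &keep)| if keep { v * scale } else { 0.0 })
        .collect()
}

fn main() {
    let y = dropout_inverted(&[2.0, 4.0], &[true, false], 0.5);
    assert_eq!(y, vec![4.0, 0.0]); // kept value doubled, dropped one zeroed
}
```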
a900e02
feat(normalization): add group normalization across all backends
farhan-syah Feb 23, 2026
fe0d26d
feat(autograd): add differentiable conv1d with full backward pass
farhan-syah Feb 23, 2026
0bbc463
feat(autograd): add fused SwiGLU activation with autograd support
farhan-syah Feb 23, 2026
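SwiGLU, the gated activation named in the commit above, is silu(a) * b elementwise over a gate half and a value half. A scalar sketch of what the fused kernel computes (the fused version just avoids materializing silu(a) as a temporary):

```rust
// SiLU (a.k.a. swish): x * sigmoid(x).
fn silu(x: f32) -> f32 {
    x / (1.0 + (-x).exp())
}

// SwiGLU: silu(gate) * value, elementwise. Scalar reference sketch.
fn swiglu(gate: &[f32], value: &[f32]) -> Vec<f32> {
    gate.iter().zip(value).map(|(&g, &v)| silu(g) * v).collect()
}

fn main() {
    assert_eq!(swiglu(&[0.0], &[5.0]), vec![0.0]); // silu(0) = 0
    assert!((silu(1.0) - 0.731_058).abs() < 1e-4);
}
```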
a43d689
chore(deps): upgrade cudarc to 0.19 and update client construction API
farhan-syah Feb 23, 2026
19149d4
fix(sparse_qr): correct WGSL binding order and readonly buffer counts
farhan-syah Feb 23, 2026
b8e926b
fix: correct contiguous check, wgpu cat, and doctest annotation
farhan-syah Feb 23, 2026
47ab73d
feat(cpu): extend f16/bf16 SIMD dispatch to all CPU kernels
farhan-syah Feb 23, 2026
47d2549
feat(activation): add fused activation-mul ops for gated architectures
farhan-syah Feb 23, 2026
1af225b
refactor(wgpu): replace dynamic shader generation with static WGSL files
farhan-syah Feb 23, 2026
e29220e
fix(autograd,ops): apply clippy suggestions for idiomatic Rust
farhan-syah Feb 24, 2026
eb4a031
refactor(wgpu): replace runtime shader generation with static WGSL files
farhan-syah Feb 24, 2026
d918a8c
feat(activation): add fused activation-mul CUDA kernels with backward…
farhan-syah Feb 24, 2026
0fc67cc
chore(tests): remove unused imports and dead helper functions in pari…
farhan-syah Feb 24, 2026
69787a2
feat(wgpu/activation): add fused activation-mul forward and backward ops
farhan-syah Feb 24, 2026
c2bba24
feat(norm): add fused add-norm operations with forward and backward p…
farhan-syah Feb 24, 2026
be8abad
perf(softmax): switch to online 2-pass algorithm in SIMD kernels
farhan-syah Feb 24, 2026
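The "online 2-pass" softmax in the commit above folds the max and sum reductions into a single sweep by rescaling the running sum whenever the running max grows, leaving only the normalization as a second pass. A plain scalar sketch of the idea (numr's actual kernels are SIMD):

```rust
// Online two-pass softmax: pass 1 maintains a running max `m` and a
// running sum `s` of exp(x - m), rescaling `s` whenever `m` grows, so
// both reductions come out of one sweep; pass 2 normalizes.
fn softmax_online(x: &[f32]) -> Vec<f32> {
    let mut m = f32::NEG_INFINITY; // running maximum
    let mut s = 0.0f32;            // running sum of exp(x - m)
    for &v in x {
        if v > m {
            s *= (m - v).exp(); // rescale old sum to the new max
            m = v;
        }
        s += (v - m).exp();
    }
    x.iter().map(|&v| (v - m).exp() / s).collect()
}

fn main() {
    let y = softmax_online(&[1.0, 2.0, 3.0]);
    assert!((y.iter().sum::<f32>() - 1.0).abs() < 1e-5);
    assert!(y[2] > y[1] && y[1] > y[0]);
}
```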
c085121
feat(autograd): add softmax_bwd op across CPU, CUDA, and WebGPU
farhan-syah Feb 24, 2026
435e88a
feat(ops): add fused GEMM epilogue with bias and activation
farhan-syah Feb 24, 2026
01b5958
feat(dtype): implement compound assignment operators for complex types
farhan-syah Feb 24, 2026
ac61392
feat(fp8): add FP8 matrix multiplication across all backends
farhan-syah Feb 24, 2026
a8383b7
feat(sparse): add 2:4 structured sparsity with multi-backend support
farhan-syah Feb 24, 2026
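2:4 structured sparsity, added in the commit above, constrains every group of four weights to have at most two nonzeros. A dense masking sketch of the pruning rule (real sparse-tensor-core paths store the two kept values plus 2-bit position metadata instead):

```rust
// 2:4 structured sparsity: within each group of four weights, keep
// the two of largest magnitude and zero the rest. Illustrative dense
// masking pass, not a compressed-storage implementation.
fn prune_2_4(w: &mut [f32]) {
    for group in w.chunks_mut(4) {
        let mut order: Vec<usize> = (0..group.len()).collect();
        // Sort indices by descending |value|.
        order.sort_by(|&a, &b| group[b].abs().total_cmp(&group[a].abs()));
        for &i in order.iter().skip(2) {
            group[i] = 0.0; // drop the smallest-magnitude entries
        }
    }
}

fn main() {
    let mut w = [1.0, -5.0, 2.0, 0.5];
    prune_2_4(&mut w);
    assert_eq!(w, [0.0, -5.0, 2.0, 0.0]);
}
```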
6705724
feat(ops): add fused elementwise operations across all backends
farhan-syah Feb 24, 2026
b785066
fix(cuda/distance): use native accumulation type per dtype
farhan-syah Feb 24, 2026
4358b45
feat(cuda/semiring_matmul): add Bool and U8 dtype support
farhan-syah Feb 24, 2026
1190f63
refactor(cuda/normalization): apply Clippy suggestions
farhan-syah Feb 24, 2026
5c6e512
test(backend_parity): add distance, semiring_matmul, conditional, log…
farhan-syah Feb 24, 2026
f58a2ed
fix(softmax): prevent NaN when input contains -inf values
farhan-syah Feb 24, 2026
4077f44
refactor(cuda/activation): extract shared activation helpers into act…
farhan-syah Feb 25, 2026
95b99e9
feat(cuda/gemm_epilogue): implement backward pass for fused matmul-bi…
farhan-syah Feb 25, 2026
f49c3e9
feat(autograd/gemm_epilogue): add var_matmul_bias_activation with bac…
farhan-syah Feb 25, 2026
50b2717
feat(autograd/normalization): add autograd support for fused add-norm…
farhan-syah Feb 25, 2026
5175536
fix(cpu/activation): clamp GELU inner value to prevent tanh exp overflow
farhan-syah Feb 25, 2026
ddff6f7
fix(wgpu/reduce): use valid WGSL literal for i32 minimum value
farhan-syah Feb 25, 2026
d88b143
fix(wgpu/sort): make bitonic sort stable and fix i32 min literal
farhan-syah Feb 25, 2026
50b5869
feat(wgpu/matmul): support N-dimensional tensor multiplication
farhan-syah Feb 25, 2026
2d619ea
feat(cpu/simd): add i32 binary ops and SIMD dot product kernels
farhan-syah Feb 26, 2026
1246e42
perf(norm): replace two-pass reduction with Welford algorithm in laye…
farhan-syah Feb 26, 2026
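Welford's algorithm, which the commit above adopts for layer-norm variance, computes mean and variance in a single numerically stable pass instead of a mean pass followed by a squared-deviation pass. An illustrative scalar version (the SIMD kernel in the commit will differ in layout):

```rust
// Welford's online mean/variance: one pass, numerically stable.
// Returns (mean, population variance). Scalar sketch only.
fn welford(data: &[f64]) -> (f64, f64) {
    let (mut mean, mut m2) = (0.0, 0.0);
    for (i, &x) in data.iter().enumerate() {
        let delta = x - mean;
        mean += delta / (i as f64 + 1.0);
        m2 += delta * (x - mean); // second factor uses the updated mean
    }
    (mean, m2 / data.len() as f64)
}

fn main() {
    let (mean, var) = welford(&[1.0, 2.0, 3.0, 4.0]);
    assert!((mean - 2.5).abs() < 1e-12);
    assert!((var - 1.25).abs() < 1e-12);
}
```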
2defb38
feat(cpu/matmul): add i8×i8→i32 quantized matrix multiplication
farhan-syah Feb 26, 2026
ea56079
fix(cuda/tests): skip tests gracefully when CUDA is unavailable
farhan-syah Feb 26, 2026
f758624
chore: misc cleanups and doc fixes
farhan-syah Feb 26, 2026
64c0e9a
feat(cuda/gemv): add GEMV kernel for small-M matmul dispatch
farhan-syah Feb 27, 2026
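The GEMV commits above add a dedicated matrix-vector path for matmuls where one side has few rows: at small M, a tiled GEMM wastes its blocking, and a per-row dot-product kernel wins. A naive CPU reference for what that path computes:

```rust
// GEMV reference: y = A . x for an m x k row-major A. Naive version
// for comparison against the dispatched fast path; not numr's kernel.
fn gemv(a: &[f32], x: &[f32], m: usize, k: usize) -> Vec<f32> {
    assert_eq!(a.len(), m * k);
    assert_eq!(x.len(), k);
    (0..m)
        .map(|i| (0..k).map(|j| a[i * k + j] * x[j]).sum())
        .collect()
}

fn main() {
    // [[1, 2], [3, 4]] . [1, 1] = [3, 7]
    let y = gemv(&[1.0, 2.0, 3.0, 4.0], &[1.0, 1.0], 2, 2);
    assert_eq!(y, vec![3.0, 7.0]);
}
```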
83d81c4
feat(cuda): add pipelined D2H copy stream for concurrent GPU execution
farhan-syah Feb 27, 2026
576c2f2
perf(cuda): remove unnecessary stream syncs after broadcast kernel la…
farhan-syah Feb 27, 2026
5e4dedc
feat(cuda/gemv): add transposed-B GEMV kernels for zero-copy weight m…
farhan-syah Feb 28, 2026
aa7a2fc
perf(cpu/matmul): add GEMV-BT fast path for transposed weight matrices
farhan-syah Feb 28, 2026
177ffbe
perf(wgpu/matmul): add GEMV-BT fast path for transposed weight matrices
farhan-syah Feb 28, 2026
f323e5f
fix(tensor): treat size-1 dim strides as irrelevant in is_contiguous
farhan-syah Feb 28, 2026
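The contiguity fix above rests on the observation that a dimension of size 1 contributes nothing to addressing, so its stride can hold any value without breaking contiguity. A sketch of a check with that exemption (illustrative, not numr's actual layout code):

```rust
// Contiguity check that skips size-1 dims: their stride is never used
// to address an element, so it must not disqualify the layout.
fn is_contiguous(shape: &[usize], strides: &[usize]) -> bool {
    let mut expected = 1;
    for (&dim, &stride) in shape.iter().zip(strides).rev() {
        if dim == 1 {
            continue; // stride irrelevant when the dim has one element
        }
        if stride != expected {
            return false;
        }
        expected *= dim;
    }
    true
}

fn main() {
    // Arbitrary stride on the size-1 middle dim is fine.
    assert!(is_contiguous(&[2, 1, 3], &[3, 999, 1]));
    // Column-major 2x3 is not row-contiguous.
    assert!(!is_contiguous(&[2, 3], &[1, 2]));
}
```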
8c2555d
perf(cuda): replace stream-ordered alloc with Rust-side caching alloc…
farhan-syah Feb 28, 2026
eaf8697
feat(cuda): expose preload_modules on CudaClient for warmup
farhan-syah Mar 1, 2026
0d5b057
perf(cuda/gemv): upgrade transposed-B path to multi-row vectorized ke…
farhan-syah Mar 1, 2026
437bc2d
perf(cpu/matmul): accelerate GEMV-BT for f16/bf16 and large matrices
farhan-syah Mar 1, 2026
aef4ab0
fix(cuda): make strided-copy kernel safe for CUDA graph capture
farhan-syah Mar 2, 2026
68da293
refactor(cuda): route Runtime alloc/dealloc through caching allocator
farhan-syah Mar 2, 2026
731f124
fix(cuda): implement allocator freeze/unfreeze for graph capture
farhan-syah Mar 2, 2026
fd76de4
fix(cpu/rmsnorm): accumulate sum of squares in f64 for numerical prec…
farhan-syah Mar 3, 2026
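Accumulating the RMSNorm sum of squares in f64, as the fix above does, avoids the cancellation and rounding drift that an f32 accumulator picks up over long rows. A sketch of the accumulation pattern (the normalization step itself is omitted):

```rust
// RMS of an f32 slice with the sum of squares accumulated in f64:
// wide accumulation keeps long-row reductions accurate. Sketch of the
// precision strategy only, not numr's rms_norm kernel.
fn rms(x: &[f32], eps: f64) -> f32 {
    let ss: f64 = x.iter().map(|&v| (v as f64) * (v as f64)).sum();
    ((ss / x.len() as f64 + eps).sqrt()) as f32
}

fn main() {
    let r = rms(&[3.0, 4.0], 0.0); // sqrt((9 + 16) / 2)
    assert!((r - 12.5f32.sqrt()).abs() < 1e-6);
}
```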
bb4ea2c
refactor(special): split monolithic mod.rs into constants, helpers, a…
farhan-syah Mar 4, 2026
d44981d
fix(sparse/qr): require caller-supplied host structural data in simpl…
farhan-syah Mar 4, 2026
a439eec
refactor(cpu/simd): extract dispatch logic into dedicated dispatch mo…
farhan-syah Mar 4, 2026
54fbe27
chore(cuda): remove dead recovery helpers and add missing safety docs
farhan-syah Mar 4, 2026
5ed9e02
test(parity): add multivariate distribution tests and fix unused vari…
farhan-syah Mar 4, 2026
4ba6ead
docs(readme): document swiglu, dropout, graph capture, and distribute…
farhan-syah Mar 4, 2026
df36c24
fix(cpu/simd): use absolute crate paths in half_macros to fix dispatc…
farhan-syah Mar 4, 2026
c6db2de
perf(cpu/matmul): add AVX-512 and AVX2+FMA dot product for half-preci…
farhan-syah Mar 4, 2026
53a9a40
refactor: make CPU backend unconditional
farhan-syah Mar 4, 2026
88c3820
fix(tests): suppress unused variable warnings in parity tests
farhan-syah Mar 4, 2026
c516877
fix(tests/semiring_matmul): scope to_vec call inside CUDA block
farhan-syah Mar 4, 2026
e738f3f
feat(random): add seeded uniform random generation across all backends
farhan-syah Mar 5, 2026
78eb577
perf(cpu/matmul): split SIMD dot into target_feature functions with d…
farhan-syah Mar 6, 2026
bdd28cc
perf(cpu): add aarch64 NEON GEMV-BT kernels and fix SIMD target-featu…
farhan-syah Mar 6, 2026
59021d8
perf(cpu/norm): use dual accumulators in AVX2/AVX512 norm variance re…
farhan-syah Mar 6, 2026
40ae4a9
fix(cuda/runtime): use AUTO_FREE_ON_LAUNCH flag for graph capture
farhan-syah Mar 7, 2026
1ac75e3
feat(autograd/conv): add var_conv2d and split conv autograd by dimension
farhan-syah Mar 7, 2026
f876829
refactor(cpu/rng): replace rand/rand_distr deps with internal RNG module
farhan-syah Mar 11, 2026
d19ebdc
refactor(ops): decouple RandomOps from TensorOps and clean up re-exports
farhan-syah Mar 11, 2026
bfc0fb9
fix(ops/matmul): support broadcasting in batched matmul across all ba…
farhan-syah Mar 13, 2026
dbec954
docs(cuda/sparse): add Safety sections to unsafe kernel launcher func…
farhan-syah Mar 13, 2026
9d8ec7e
refactor(cpu/kernels): scope DType import to cfg-gated SIMD blocks
farhan-syah Mar 14, 2026
e887fe7
refactor(autograd/reduce): split reduce.rs into per-operation modules
farhan-syah Mar 14, 2026
0dbad06
refactor(cuda/kernels): split index and sparse_merge launchers into m…
farhan-syah Mar 14, 2026
32e5bd0
feat(cuda/fp8): add FP8 kernel support across CUDA compute paths
farhan-syah Mar 14, 2026
f5a3af3
feat(ops/fp8): extend op dispatch to FP8 dtypes
farhan-syah Mar 14, 2026
7d569f1
fix(tests): adjust FP8E4M3 tolerance and suppress unused variable war…
farhan-syah Mar 14, 2026
60d971c
feat(cpu/simd): add AVX2 math kernels for transcendental and special …
farhan-syah Mar 14, 2026
268b63f
chore(deps): relax patch version pins to minor version constraints
farhan-syah Mar 14, 2026
e1e4ad4
fix(ops/cpu/distance): gate TypeConversionOps import behind fp8 feature
farhan-syah Mar 14, 2026
2b62cf5
ci: remove redundant --features cpu from no-default-features checks
farhan-syah Mar 14, 2026
4479423
fix(cpu/simd): resolve aarch64 NEON compilation warnings and correctn…
farhan-syah Mar 14, 2026
9568b3e
refactor(cpu/gemm): accumulate backward pass in precision-appropriate…
farhan-syah Mar 14, 2026
ba115bf
fix(test/conditional): use correct variable in WebGPU where_cond pari…
farhan-syah Mar 14, 2026
586451d
chore(ci): upgrade GitHub Actions to v5
farhan-syah Mar 14, 2026
08df4cb
docs(readme): document 0.5.0 feature additions
farhan-syah Mar 14, 2026
806596c
fix(cpu/gemm): clamp non-finite activation derivatives to zero in bac…
farhan-syah Mar 14, 2026
4 changes: 2 additions & 2 deletions .github/workflows/baseline.yml
@@ -34,7 +34,7 @@ jobs:
name: Save Benchmark Baseline
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5

- name: Install Rust
uses: dtolnay/rust-toolchain@stable
@@ -49,7 +49,7 @@ jobs:
# Cache keyed by SHA so each merge gets its own entry.
# benchmark.yml uses restore-keys prefix matching to find the latest one.
- name: Cache baseline
uses: actions/cache/save@v4
uses: actions/cache/save@v5
with:
path: target/fluxbench/baseline.json
key: numr-bench-baseline-${{ github.sha }}
4 changes: 2 additions & 2 deletions .github/workflows/benchmark.yml
@@ -42,7 +42,7 @@ jobs:
name: Regression Check
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5
with:
fetch-depth: 0

@@ -61,7 +61,7 @@ jobs:
# picks the latest cache entry starting with "numr-bench-baseline-".
- name: Restore baseline from main
id: baseline-cache
uses: actions/cache/restore@v4
uses: actions/cache/restore@v5
with:
path: target/fluxbench/baseline.json
key: numr-bench-baseline-dummy
4 changes: 2 additions & 2 deletions .github/workflows/release.yml
@@ -23,7 +23,7 @@ jobs:
outputs:
version: ${{ steps.version.outputs.version }}
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5

- name: Install Rust
uses: dtolnay/rust-toolchain@stable
@@ -71,7 +71,7 @@ jobs:
runs-on: ubuntu-latest
environment: crates-io
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5

- name: Install Rust
uses: dtolnay/rust-toolchain@stable
10 changes: 5 additions & 5 deletions .github/workflows/test.yml
@@ -24,7 +24,7 @@ jobs:
name: Lint, Format & Docs
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5

- name: Install Rust
uses: dtolnay/rust-toolchain@stable
@@ -56,7 +56,7 @@ jobs:
os: [ubuntu-latest, macos-latest, windows-latest]

steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5

- name: Install Rust
uses: dtolnay/rust-toolchain@stable
@@ -75,7 +75,7 @@ jobs:
name: Backend Compile, Parity & Examples
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5

- name: Install Rust
uses: dtolnay/rust-toolchain@stable
@@ -86,7 +86,7 @@

# Backend compile gates
- name: "Compile: cpu-only (no default features)"
run: cargo check --no-default-features --features cpu
run: cargo check --no-default-features

- name: "Compile: cpu + f16 + sparse"
run: cargo check --features f16,sparse
@@ -95,7 +95,7 @@
run: cargo check --features wgpu,f16,sparse

- name: "Compile tests: cpu-only"
run: cargo test --no-run --no-default-features --features cpu
run: cargo test --no-run --no-default-features

- name: "Compile tests: wgpu"
run: cargo test --no-run --features wgpu,f16,sparse
33 changes: 20 additions & 13 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "numr"
version = "0.4.0"
version = "0.5.0"
edition = "2024"
rust-version = "1.89"
description = "High-performance numerical computing with multi-backend GPU acceleration (CPU/CUDA/WebGPU)"
@@ -15,14 +15,20 @@ features = ["f16", "sparse"]
# cuda and wgpu require hardware SDKs not available on docs.rs

[features]
default = ["cpu", "rayon"]
cpu = []
default = ["rayon"]
cuda = ["dep:cudarc"]
nccl = ["cuda", "cudarc?/nccl"]
distributed = ["dep:nexar", "dep:tokio"]
distributed-gpu = ["distributed", "nccl", "dep:nexar-nccl"]
wgpu = ["dep:wgpu", "dep:pollster"]
rayon = ["dep:rayon"]
f16 = ["dep:half", "cudarc?/f16"] # Half-precision floats (F16, BF16) - optional reduced-precision support
fp8 = [] # 8-bit floats (FP8E4M3, FP8E5M2) - optional ultra-low-precision support
sparse = [] # Sparse tensor formats (CSR, CSC, COO) and operations
f16 = [
"dep:half",
"cudarc?/f16",
] # Half-precision floats (F16, BF16) - optional reduced-precision support
fp8 = [
] # 8-bit floats (FP8E4M3, FP8E5M2) - optional ultra-low-precision support
sparse = [] # Sparse tensor formats (CSR, CSC, COO) and operations

[dependencies]
# Core
@@ -35,11 +41,7 @@ parking_lot = "0.12"
# Optional: Parallelism
rayon = { version = "1.11", optional = true }

# Random number generation (required for rand/randn operations)
rand = "0.9"
rand_distr = "0.5"

# Zero-copy serialization for embedded data
# Zero-copy serialization for embedded data (used by sobol_data)
rkyv = "0.8"

# Optional: Half-precision floats
@@ -48,15 +50,20 @@
"num-traits",
] }

# Optional: Inter-node distributed communication
nexar = { version = "0.1", optional = true }
nexar-nccl = { version = "0.1", optional = true }
tokio = { version = "1", features = ["rt"], optional = true }

# Optional: CUDA backend
cudarc = { version = "0.18", optional = true, features = [
cudarc = { version = "0.19", optional = true, features = [
"cuda-version-from-build-system",
] }

# Optional: WebGPU backend
wgpu = { version = "28.0", optional = true }
pollster = { version = "0.4", optional = true }
paste = "1.0.15"
paste = "1.0"

[dev-dependencies]
approx = "0.5"
109 changes: 95 additions & 14 deletions README.md
@@ -90,7 +90,7 @@ numr implements a comprehensive set of tensor operations across CPU, CUDA, and W
### Shape and Data Movement

- **ShapeOps**: cat, stack, split, chunk, repeat, pad, roll
- **IndexingOps**: gather, scatter, gather_nd, scatter_reduce, index_select, masked_select, masked_fill, embedding_lookup, bincount, argmax, argmin
- **IndexingOps**: gather, scatter, gather_nd, scatter_reduce, index_select, masked_select, masked_fill, embedding_lookup, bincount, argmax, argmin, slice_assign
- **SortingOps**: sort, argsort, topk, unique, nonzero, searchsorted

### Reductions
@@ -106,22 +106,34 @@

### Activation & Normalization Functions

- **ActivationOps**: relu, sigmoid, silu, gelu, leaky_relu, elu, softmax
- **NormalizationOps**: rms_norm, layer_norm
- **ActivationOps**: relu, sigmoid, silu, gelu, swiglu, leaky_relu, elu, softmax, dropout, fused activation-mul (for gated architectures)
- **NormalizationOps**: rms_norm, layer_norm, batch_norm, group_norm, instance_norm, fused add-norm (residual + normalize in one pass)
- **GemmEpilogueOps**: fused matmul+bias+activation in a single kernel (forward + backward)
- **FusedElementwiseOps**: fused element-wise operation chains across all backends
- **ConvOps**: conv1d, conv2d, depthwise_conv2d (with stride, padding, dilation, groups)
- **EinsumOps**: Einstein summation notation

_These are mathematical functions commonly used in ML, but numr itself is not an ML framework._

### Linear Algebra

- **MatmulOps**: matmul, matmul_bias (fused GEMM+bias)
- **MatmulOps**: matmul, matmul_bias (fused GEMM+bias), i8×i8→i32 quantized matmul, FP8 matmul
- **LinalgOps**: solve, lstsq, pinverse, inverse, det, trace, matrix_rank, diag, matrix_norm, kron, khatri_rao
- **ComplexOps**: conj, real, imag, angle (for complex tensor support)

### Automatic Differentiation

- **Reverse-mode**: `Var<R>` tracked tensors, `backward()` for gradient computation
- **Forward-mode**: `jvp()`, `jacobian_forward()` via dual numbers
- **Second-order**: `hvp()` for Hessian-vector products, `backward_with_graph()` for higher-order gradients
- **Activation checkpointing**: `checkpoint()` to trade compute for memory
- **Backward hooks**: `BackwardHook` trait for gradient notifications (e.g., distributed allreduce)
- **Differentiable ops**: matmul, conv1d, conv2d, softmax, rms_norm, layer_norm, SiLU, softplus, SwiGLU, dropout, fused GEMM epilogue, fused add-norm, dtype cast, narrow, cat

### Statistics and Probability

- **StatisticalOps**: var, std, skew, kurtosis, quantile, percentile, median, cov, corrcoef
- **RandomOps**: rand, randn, randint, multinomial, bernoulli, poisson, binomial, beta, gamma, exponential, chi_squared, student_t, f_distribution
- **RandomOps**: rand, randn, randint, multinomial, bernoulli, poisson, binomial, beta, gamma, exponential, chi_squared, student_t, f_distribution (with seeded deterministic generation)
- **MultivariateRandomOps**: multivariate_normal, wishart, dirichlet
- **QuasirandomOps**: Sobol, Halton sequences

@@ -165,10 +177,38 @@ _These are mathematical functions commonly used in ML, but numr itself is not an

- polyroots, polyval, polyfromroots, polymul

**Iterative Solvers (`numr::iterative`):**

- **Linear solvers**: CG, MINRES, BiCGSTAB, GMRES, LGMRES, CGS, QMR, Jacobi, SOR, Adaptive GMRES
- **Eigensolvers**: Lanczos (symmetric), Arnoldi/IRAM (non-symmetric)
- **Sparse SVD**: via Lanczos bidiagonalization
- **Preconditioners**: ILU(0), IC(0), Algebraic Multigrid (AMG) with V-cycles

**Sparse Tensors (`numr::sparse`, feature-gated):**

- Formats: CSR, CSC, COO
- Operations: SpGEMM (sparse matrix multiplication), SpMV (sparse matrix-vector), DSMM (dense-sparse matrix)
- 2:4 structured sparsity with multi-backend support

**Sparse Linear Algebra (`numr::sparse_linalg`):**

- **Direct solvers**: Sparse LU (Gilbert-Peierls), sparse QR
- **Incomplete factorizations**: ILU(0), ILU(k), IC(0)
- **Preprocessing**: COLAMD ordering, maximum transversal
- **Symbolic/numeric split**: Reuse sparsity structure for repeated solves

**Graph Capture (`numr::runtime`):**

- **`Graph` trait**: Capture a sequence of operations and replay them with zero re-launch overhead
- **CUDA Graphs**: Full capture support—fixed-address buffer replay for inference loops and training steps
- **CPU / WebGPU**: Transparent no-op path; callers write backend-agnostic code using `R::supports_graph_capture()`

**Distributed Computing (`numr::communicator`, feature `nccl`):**

- **`CommunicatorGroup`**: Single-node multi-GPU all-reduce, broadcast, and allgather via NCCL
- **`HierarchicalCommunicator`**: Two-level collective—NCCL intra-node, nexar inter-node
- **`NexarNetCommunicator`**: Pure-Rust distributed transport (QUIC via nexar) for multi-machine tensor parallelism
- **`BackwardHook`**: Autograd hook interface—trigger cross-node gradient synchronization during `backward()`

## Dtypes

@@ -198,15 +238,15 @@

All backends implement identical algorithms with native kernels—no cuBLAS, MKL, or vendor library dependencies.

| Hardware | Backend | Feature | Status | Notes |
| ------------ | ------- | ------------- | ------- | ------------------ |
| CPU (x86-64) | CPU | cpu (default) | ✓ | AVX-512/AVX2 SIMD |
| CPU (ARM64) | CPU | cpu | ✓ | NEON SIMD |
| NVIDIA GPU | CUDA | cuda | ✓ | Native PTX kernels |
| AMD GPU | WebGPU | wgpu | ✓ | WGSL shaders |
| Intel GPU | WebGPU | wgpu | ✓ | WGSL shaders |
| Apple GPU | WebGPU | wgpu | ✓ | WGSL shaders |
| AMD GPU | ROCm | - | Planned | Native HIP kernels |
| Hardware | Backend | Feature | Status | Notes |
| ------------ | ------- | ------------- | ------- | ------------------------------------------------------ |
| CPU (x86-64) | CPU | cpu (default) | ✓ | AVX-512/AVX2 SIMD |
| CPU (ARM64) | CPU | cpu | ✓ | NEON SIMD |
| NVIDIA GPU | CUDA | cuda | ✓ | Native PTX kernels, caching allocator, GEMV fast paths |
| AMD GPU | WebGPU | wgpu | ✓ | WGSL shaders |
| Intel GPU | WebGPU | wgpu | ✓ | WGSL shaders |
| Apple GPU | WebGPU | wgpu | ✓ | WGSL shaders |
| AMD GPU | ROCm | - | Planned | Native HIP kernels |

### SIMD Acceleration

@@ -443,6 +483,45 @@ fn main() -> Result<()> {
}
```

### Automatic Differentiation

```rust
use numr::prelude::*;
use numr::autograd::*;

fn main() -> Result<()> {
let client = CpuRuntime::client()?;

// Create tracked variables
let x = Var::new(Tensor::<CpuRuntime>::from_slice(&[2.0, 3.0], &[2])?, true);
let w = Var::new(Tensor::<CpuRuntime>::from_slice(&[0.5, -1.0], &[2])?, true);

// Forward pass (builds computation graph)
let y = var_mul(&x, &w, &client)?;
let loss = var_sum(&y, &client)?;

// Backward pass
let grads = backward(&loss, &client)?;
let dx = grads.get(x.tensor()); // gradients w.r.t. x
let dw = grads.get(w.tensor()); // gradients w.r.t. w

// Activation checkpointing (trade compute for memory)
let checkpointed = checkpoint(|inputs| {
let h = var_relu(&inputs[0], &client)?;
var_matmul(&h, &inputs[1], &client)
}, &[&x, &w])?;

// Forward-mode AD (Jacobian-vector products)
let tangent = Tensor::<CpuRuntime>::ones(&[2], &client)?;
let jvp_result = jvp(|x| client.mul(x, x), &x.tensor(), &tangent, &client)?;

// Hessian-vector product
let hvp_result = hvp(|x, c| c.mul(x, x), &x.tensor(), &tangent, &client)?;

Ok(())
}
```

## Installation

### CPU-only (default)
@@ -484,7 +563,9 @@ numr = { version = "*", features = [
| `wgpu` | Cross-platform GPU (WebGPU) | ✗ |
| `rayon` | Multi-threaded CPU via Rayon | ✓ |
| `f16` | Half-precision floats (F16, BF16) | ✗ |
| `fp8` | FP8 precision (E4M3, E5M2) | ✗ |
| `sparse` | Sparse tensor support (CSR, CSC, COO) | ✗ |
| `nccl` | Multi-GPU communication via NCCL | ✗ |

## Building from Source
