From 8da96be6b3f7c237ecc107d19f9dfd109cec5b01 Mon Sep 17 00:00:00 2001
From: jinmanx <jinmanx@nvidia.com>
Date: Mon, 18 May 2026 00:57:47 -0700
Subject: [PATCH 1/7] add cutile jsd example

---
 benchmark/scripts/utils.py                    |   9 +-
 setup.py                                      |   9 +
 src/liger_kernel/ops/__init__.py              |  11 +-
 src/liger_kernel/ops/backends/README.md       |  31 +++-
 src/liger_kernel/ops/backends/__init__.py     |   1 +
 .../ops/backends/_cutile/__init__.py          |   6 +
 .../ops/backends/_cutile/ops/__init__.py      |  23 +++
 .../ops/backends/_cutile/ops/jsd.py           | 170 ++++++++++++++++++
 .../ops/backends/_cutile/ops/utils.py         |  16 ++
 src/liger_kernel/ops/backends/registry.py     |  27 +++
 test/transformers/test_cutile_backend.py      |  49 +++++
 11 files changed, 342 insertions(+), 10 deletions(-)
 create mode 100644 src/liger_kernel/ops/backends/_cutile/__init__.py
 create mode 100644 src/liger_kernel/ops/backends/_cutile/ops/__init__.py
 create mode 100644 src/liger_kernel/ops/backends/_cutile/ops/jsd.py
 create mode 100644 src/liger_kernel/ops/backends/_cutile/ops/utils.py
 create mode 100644 test/transformers/test_cutile_backend.py
diff --git a/benchmark/scripts/utils.py b/benchmark/scripts/utils.py
index f3774a133..5b761e59c 100644
--- a/benchmark/scripts/utils.py
+++ b/benchmark/scripts/utils.py
@@ -250,6 +250,13 @@ def get_formatted_time():
     return time.strftime("%Y-%m-%d %H:%M:%S")
 
 
+def get_display_kernel_provider(kernel_provider: str) -> str:
+    backend = os.environ.get("LIGER_KERNEL_BACKEND", "").strip().lower()
+    if kernel_provider == "liger" and backend:
+        return f"liger-{backend}"
+    return kernel_provider
+
+
 def get_gpu_name():
     """
     Returns the current GPU name, formatted to serve as a directory name
@@ -422,7 +429,7 @@ def run_benchmarks(
                 benchmark_run_data = BenchmarkData(
                     kernel_name=kernel_name,
                     kernel_operation_mode=kernel_operation_mode,
-                    kernel_provider=kernel_provider,
+                    kernel_provider=get_display_kernel_provider(kernel_provider),
                     metric_name=metric_name,
                     metric_unit=metric_unit,
                     gpu_name=gpu_name,
diff --git a/setup.py b/setup.py
index bf3457eeb..fab3ec386 100644
--- a/setup.py
+++ b/setup.py
@@ -31,7 +31,16 @@ def get_default_dependencies():
 
 def get_optional_dependencies():
     """Get optional dependency groups."""
+    cutile_deps = [
+        "cuda-tile",
+    ]
+    cutile_tileiras_deps = [
+        "cuda-tile[tileiras]",
+    ]
+
     return {
+        "cutile": cutile_deps,
+        "cutile-tileiras": cutile_tileiras_deps,
         "dev": [
             "transformers>=4.52.0",
             "matplotlib>=3.7.2",
diff --git a/src/liger_kernel/ops/__init__.py b/src/liger_kernel/ops/__init__.py
index a92caccc1..d6baff97c 100644
--- a/src/liger_kernel/ops/__init__.py
+++ b/src/liger_kernel/ops/__init__.py
@@ -106,8 +106,8 @@ def _replace_with_vendor_ops():
 
     This function is called automatically on module load. It:
     1. Detects the current device (cuda, npu, xpu, etc.)
-    2. Looks up the vendor for that device via VENDOR_REGISTRY
-    3. Loads and applies vendor-specific implementations
+    2. Selects the backend for that device, including explicit backend overrides
+    3. Loads and applies backend-specific implementations
 
     Vendor implementations should be placed in:
         liger_kernel/ops/backends/_<vendor>/ops/
@@ -117,13 +117,12 @@ def _replace_with_vendor_ops():
 
     Note: Vendor can both override existing ops AND add new vendor-specific ops.
     """
-    from liger_kernel.ops.backends import get_vendor_for_device
+    from liger_kernel.ops.backends import select_backend_for_device
     from liger_kernel.utils import infer_device
 
     device = infer_device()
 
-    # Look up vendor info for this device
-    vendor_info = get_vendor_for_device(device)
+    vendor_info = select_backend_for_device(device)
     if vendor_info is None:
         return
 
@@ -144,6 +143,8 @@ def _replace_with_vendor_ops():
             globals()[name] = getattr(vendor_ops, name)
 
     except ImportError:
+        if vendor_info.required:
+            raise
         # Vendor module not available, use default implementations
         pass
 
diff --git a/src/liger_kernel/ops/backends/README.md b/src/liger_kernel/ops/backends/README.md
index d4067157b..ce5060ca4 100644
--- a/src/liger_kernel/ops/backends/README.md
+++ b/src/liger_kernel/ops/backends/README.md
@@ -1,12 +1,15 @@
 # Adding a New Vendor Backend
 
-This directory contains vendor-specific operator implementations that automatically replace the default (CUDA) implementations when running on the corresponding device.
+This directory contains backend-specific operator implementations that can replace the default implementations.
+
+Most backends are selected automatically by device vendor, such as `npu -> ascend`. Optional backends for an existing device, such as `cutile` on CUDA, should be selected explicitly with `LIGER_KERNEL_BACKEND`.
 
 ## Concepts
 
 - **Vendor**: Chip manufacturer (e.g., `ascend`, `intel`, `nvidia`)
 - **Device**: Device type (e.g., `npu`, `xpu`, `cuda`)
 - **VendorInfo**: Defines the mapping between vendor and device
+- **Backend override**: An explicit backend selected with `LIGER_KERNEL_BACKEND`, used for optional implementations that are not the default backend for a device
 
 ## Directory Structure
 
@@ -14,12 +17,15 @@ This directory contains vendor-specific operator implementations that automatica
 backends/
 ├── README.md          
 ├── __init__.py         
-├── registry.py         # VendorInfo, register_vendor(), VENDOR_REGISTRY
+├── registry.py         # VendorInfo, register_vendor(), VENDOR_REGISTRY, select_backend_for_device()
 ├── _ascend/            # Ascend (Huawei) vendor - supports NPU
 │   ├── __init__.py     # Registers VendorInfo for NPU
 │   └── ops/
 │       ├── __init__.py # Exports vendor-specific implementations
 │       └── geglu.py    # NPU-specific GEGLU implementation
+├── _cutile/            # Optional CUDA backend - selected by LIGER_KERNEL_BACKEND=cutile
+│   └── ops/
+│       └── ...
 └── _<vendor>/          # Your new vendor backend
     └── ...
 ```
@@ -29,8 +35,11 @@ backends/
 1. When `liger_kernel.ops.backends` is imported, it imports all vendor packages (e.g., `_ascend`)
 2. Each vendor's `__init__.py` calls `register_vendor()` to register itself
 3. When `liger_kernel.ops` is imported, `_replace_with_vendor_ops()` is called
-4. It detects the current device via `infer_device()` and looks up the vendor
-5. Vendor implementations replace/add to the `liger_kernel.ops` namespace
+4. It detects the current device via `infer_device()`
+5. It calls `select_backend_for_device()`:
+   - If `LIGER_KERNEL_BACKEND` is not set, it falls back to `get_vendor_for_device(device)`
+   - If `LIGER_KERNEL_BACKEND=cutile`, it requires `device == "cuda"` and loads `_cutile.ops`
+6. Backend implementations replace/add to the `liger_kernel.ops` namespace
 
 ## Adding a New Vendor
 
@@ -149,3 +158,17 @@ Vendors can also **add new operators** that don't exist in the default implement
 ## Example: Ascend NPU Backend
 
 See `_ascend/` directory for a complete example of the Ascend NPU backend implementation.
+
+## Enable cuTile Backend
+
+We need to explicitly set environment variables to enable cuTile.
+
+For example, `cutile` is selected with:
+
+```bash
+LIGER_KERNEL_BACKEND=cutile python your_script.py
+```
+
+cuTile is only supported on CUDA devices. When `LIGER_KERNEL_BACKEND=cutile` is set, Liger-Kernel selects the cuTile operator implementations instead of the default CUDA implementations.
+
+If the selected backend cannot be imported, the import error is raised instead of silently falling back to the default implementation.
diff --git a/src/liger_kernel/ops/backends/__init__.py b/src/liger_kernel/ops/backends/__init__.py
index ad7779c48..8f20b5905 100644
--- a/src/liger_kernel/ops/backends/__init__.py
+++ b/src/liger_kernel/ops/backends/__init__.py
@@ -5,6 +5,7 @@
 from liger_kernel.ops.backends.registry import VendorInfo  # noqa: F401
 from liger_kernel.ops.backends.registry import get_vendor_for_device  # noqa: F401
 from liger_kernel.ops.backends.registry import register_vendor  # noqa: F401
+from liger_kernel.ops.backends.registry import select_backend_for_device  # noqa: F401
 
 # Auto-import all _<vendor> subpackages to trigger registration
 # Each vendor's __init__.py calls register_vendor() when imported
diff --git a/src/liger_kernel/ops/backends/_cutile/__init__.py b/src/liger_kernel/ops/backends/_cutile/__init__.py
new file mode 100644
index 000000000..16d000218
--- /dev/null
+++ b/src/liger_kernel/ops/backends/_cutile/__init__.py
@@ -0,0 +1,6 @@
+"""
+cuTile backend for Liger-Kernel.
+
+cuTile is an optional CUDA backend selected explicitly with
+LIGER_KERNEL_BACKEND=cutile, so it does not register as the default CUDA vendor.
+"""
diff --git a/src/liger_kernel/ops/backends/_cutile/ops/__init__.py b/src/liger_kernel/ops/backends/_cutile/ops/__init__.py
new file mode 100644
index 000000000..96abe1c9c
--- /dev/null
+++ b/src/liger_kernel/ops/backends/_cutile/ops/__init__.py
@@ -0,0 +1,23 @@
+"""
+cuTile-specific operator implementations.
+"""
+
+try:
+    import cuda.tile as ct  # noqa: F401
+except ImportError as exc:
+    raise ImportError(
+        "cuTile backend requires cuda-tile. Install it with `pip install cuda-tile` "
+        "or `pip install 'cuda-tile[tileiras]'` to include the optional tileiras compiler. "
+        "When installing Liger-Kernel, use `pip install 'liger-kernel[cutile]'` "
+        "or `pip install 'liger-kernel[cutile-tileiras]'`."
+    ) from exc
+
+from liger_kernel.ops.backends._cutile.ops.jsd import LigerJSDFunction
+from liger_kernel.ops.backends._cutile.ops.jsd import jsd_backward
+from liger_kernel.ops.backends._cutile.ops.jsd import jsd_forward
+
+__all__ = [
+    "LigerJSDFunction",
+    "jsd_backward",
+    "jsd_forward",
+]
diff --git a/src/liger_kernel/ops/backends/_cutile/ops/jsd.py b/src/liger_kernel/ops/backends/_cutile/ops/jsd.py
new file mode 100644
index 000000000..c70820f51
--- /dev/null
+++ b/src/liger_kernel/ops/backends/_cutile/ops/jsd.py
@@ -0,0 +1,170 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: MIT
+
+import math
+
+from typing import Optional
+
+import cuda.tile as ct
+import torch
+
+from liger_kernel.ops.backends._cutile.ops.utils import _next_power_of_2
+from liger_kernel.ops.utils import ensure_contiguous
+
+ConstFloat = ct.Constant[float]
+ConstInt = ct.Constant[int]
+JSD_BLOCK_SIZE = 4096
+
+
+@ct.kernel(occupancy=ct.ByTarget(sm_100=4))
+def jsd_kernel_ct(
+    x,  # (BT, V) log Q (student)
+    y,  # (BT, V) log P (teacher)
+    loss,  # (BT, V) float32 loss accumulator
+    dx,  # (BT, V) gradient output
+    label,  # (BT,) label tensor, or dummy tensor when HAS_LABEL=0
+    beta: ConstFloat,
+    inv_n_non_ignore: ConstFloat,
+    ignore_index: ConstInt,
+    n_cols: ConstInt,
+    BLOCK_SIZE: ConstInt,
+    HAS_LABEL: ConstInt,
+):
+    """
+    cuTile kernel for generalized Jensen-Shannon Divergence.
+    """
+    row_idx = ct.bid(0)
+
+    if HAS_LABEL:
+        lbl = ct.load(label, row_idx, shape=())
+        if lbl == ignore_index:
+            num_chunks_early = (n_cols + BLOCK_SIZE - 1) // BLOCK_SIZE
+            for ci in range(num_chunks_early):
+                col_indices = ct.arange(BLOCK_SIZE, dtype=ct.int32) + ci * BLOCK_SIZE
+                ct.scatter(dx, (row_idx, col_indices), ct.full((BLOCK_SIZE,), 0.0, dtype=dx.dtype), check_bounds=True)
+            return
+
+    num_chunks = (n_cols + BLOCK_SIZE - 1) // BLOCK_SIZE
+    for chunk_idx in range(num_chunks):
+        col_indices = ct.arange(BLOCK_SIZE, dtype=ct.int32) + chunk_idx * BLOCK_SIZE
+
+        x_tile = ct.gather(x, (row_idx, col_indices), check_bounds=True, padding_value=-math.inf)
+        y_tile = ct.gather(y, (row_idx, col_indices), check_bounds=True, padding_value=-math.inf)
+
+        x_f32 = ct.astype(x_tile, ct.float32)
+        y_f32 = ct.astype(y_tile, ct.float32)
+
+        loss_tile = ct.full((BLOCK_SIZE,), 0.0, dtype=ct.float32)
+        dx_tile = ct.full((BLOCK_SIZE,), 0.0, dtype=ct.float32)
+
+        if beta == 0.0:
+            y_max = ct.max(y_f32, 0, keepdims=True)
+            y_prob = ct.exp(y_f32 - y_max) * ct.exp(y_max)
+            loss_tile = y_prob * (y_f32 - x_f32)
+            dx_tile = -y_prob
+        elif beta == 1.0:
+            x_max = ct.max(x_f32, 0, keepdims=True)
+            x_prob = ct.exp(x_f32 - x_max) * ct.exp(x_max)
+            loss_tile = x_prob * (x_f32 - y_f32)
+            dx_tile = loss_tile + x_prob
+        else:
+            x_max = ct.max(x_f32, 0, keepdims=True)
+            y_max = ct.max(y_f32, 0, keepdims=True)
+            max_val = ct.maximum(x_max, y_max)
+            exp_max = ct.exp(max_val)
+            q_prob = ct.exp(x_f32 - max_val) * exp_max
+            p_prob = ct.exp(y_f32 - max_val) * exp_max
+            beta_p = beta * p_prob
+            one_minus_beta_q = (1.0 - beta) * q_prob
+            m_prob = beta_p + one_minus_beta_q
+            log_m = ct.log(m_prob)
+            loss_tile = beta_p * y_f32 + one_minus_beta_q * x_f32 - m_prob * log_m
+            dx_tile = one_minus_beta_q * (x_f32 - log_m)
+
+        loss_tile = loss_tile * inv_n_non_ignore
+        dx_tile = dx_tile * inv_n_non_ignore
+
+        ct.scatter(loss, (row_idx, col_indices), loss_tile, check_bounds=True)
+        ct.scatter(dx, (row_idx, col_indices), ct.astype(dx_tile, dx.dtype), check_bounds=True)
+
+
+def jsd_forward(_input, target, shift_labels, beta, ignore_index, has_label):
+    num_rows, vocab_size = _input.shape
+    BLOCK_SIZE = min(JSD_BLOCK_SIZE, _next_power_of_2(vocab_size))
+
+    loss = torch.zeros(_input.shape, dtype=torch.float32, device=_input.device)
+    dx = torch.empty_like(_input)
+
+    if has_label:
+        n_non_ignore = (shift_labels != ignore_index).sum().item()
+    else:
+        n_non_ignore = num_rows
+
+    if n_non_ignore == 0:
+        return torch.tensor(0.0, device=_input.device, dtype=_input.dtype), torch.zeros_like(_input)
+
+    inv_n_non_ignore = 1.0 / n_non_ignore
+    label_tensor = shift_labels if has_label else torch.empty(1, device=_input.device, dtype=torch.int64)
+
+    ct.launch(
+        torch.cuda.current_stream(),
+        (num_rows, 1, 1),
+        jsd_kernel_ct,
+        (
+            _input,
+            target,
+            loss,
+            dx,
+            label_tensor,
+            float(beta),
+            float(inv_n_non_ignore),
+            int(ignore_index),
+            int(vocab_size),
+            int(BLOCK_SIZE),
+            int(has_label),
+        ),
+    )
+
+    return torch.sum(loss).to(_input.dtype), dx
+
+
+def jsd_backward(dx, grad_output):
+    if torch.equal(grad_output, torch.tensor(1.0, device=grad_output.device)):
+        return dx
+    return grad_output * dx
+
+
+class LigerJSDFunction(torch.autograd.Function):
+    r"""
+    cuTile autograd wrapper for the generalized Jensen-Shannon Divergence loss.
+    """
+
+    @staticmethod
+    @ensure_contiguous
+    def forward(
+        ctx,
+        _input: torch.Tensor,
+        target: torch.Tensor,
+        shift_labels: Optional[torch.Tensor],
+        beta: float,
+        ignore_index: int,
+    ) -> torch.Tensor:
+        has_label = False
+        if shift_labels is not None:
+            assert shift_labels.shape == (_input.shape[0],), (
+                f"shift_labels must have shape (BT,). Got: {shift_labels.shape}"
+            )
+            shift_labels = shift_labels.contiguous()
+            has_label = True
+
+        loss, dx = jsd_forward(_input, target, shift_labels, beta, ignore_index, has_label)
+        ctx.save_for_backward(dx)
+        return loss
+
+    @staticmethod
+    @ensure_contiguous
+    def backward(ctx, grad_output: torch.Tensor):
+        (dx,) = ctx.saved_tensors
+        dx = jsd_backward(dx, grad_output)
+        return (dx, None, None, None, None)
diff --git a/src/liger_kernel/ops/backends/_cutile/ops/utils.py b/src/liger_kernel/ops/backends/_cutile/ops/utils.py
new file mode 100644
index 000000000..5bc958f22
--- /dev/null
+++ b/src/liger_kernel/ops/backends/_cutile/ops/utils.py
@@ -0,0 +1,16 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: MIT
+
+
+def _next_power_of_2(n: int):
+    """Return the smallest power of 2 greater than or equal to n."""
+    n -= 1
+    n |= n >> 1
+    n |= n >> 2
+    n |= n >> 4
+    n |= n >> 8
+    n |= n >> 16
+    n |= n >> 32
+    n += 1
+    return n
diff --git a/src/liger_kernel/ops/backends/registry.py b/src/liger_kernel/ops/backends/registry.py
index 5fe3613c8..b98af7b23 100644
--- a/src/liger_kernel/ops/backends/registry.py
+++ b/src/liger_kernel/ops/backends/registry.py
@@ -5,6 +5,8 @@
 Each vendor registers itself by calling register_vendor() in its __init__.py.
 """
 
+import os
+
 from dataclasses import dataclass
 from typing import Optional
 
@@ -20,10 +22,12 @@ class VendorInfo:
     Attributes:
         vendor: Vendor name (e.g., "ascend", "intel", "nvidia")
         device: Device type this vendor supports (e.g., "npu", "xpu")
+        required: Whether failing to load this backend should raise an error.
     """
 
     vendor: str
     device: str
+    required: bool = False
 
     @property
     def module_path(self) -> str:
@@ -59,3 +63,26 @@ def get_vendor_for_device(device: str) -> Optional[VendorInfo]:
         VendorInfo if found, None otherwise
     """
     return VENDOR_REGISTRY.get(device)
+
+
+def select_backend_for_device(device: str) -> Optional[VendorInfo]:
+    """
+    Select the backend implementation for a given device.
+
+    LIGER_KERNEL_BACKEND is an explicit override for optional backends that are
+    not the default vendor implementation for a device.
+    """
+    backend = os.environ.get("LIGER_KERNEL_BACKEND")
+    if backend is None:
+        return get_vendor_for_device(device)
+
+    backend = backend.strip().lower()
+    if backend == "":
+        return get_vendor_for_device(device)
+
+    if backend == "cutile":
+        if device != "cuda":
+            raise RuntimeError("LIGER_KERNEL_BACKEND=cutile requires a CUDA/GPU device.")
+        return VendorInfo(vendor="cutile", device="cuda", required=True)
+
+    raise RuntimeError(f"Unsupported LIGER_KERNEL_BACKEND: {backend}")
diff --git a/test/transformers/test_cutile_backend.py b/test/transformers/test_cutile_backend.py
new file mode 100644
index 000000000..75c112f26
--- /dev/null
+++ b/test/transformers/test_cutile_backend.py
@@ -0,0 +1,49 @@
+import os
+import subprocess
+import sys
+import textwrap
+
+from pathlib import Path
+
+import pytest
+import torch
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuTile backend requires CUDA")
+@pytest.mark.skipif(
+    os.environ.get("LIGER_KERNEL_BACKEND", "").strip().lower() != "cutile",
+    reason="cuTile backend selection test requires LIGER_KERNEL_BACKEND=cutile",
+)
+def test_liger_kernel_backend_cutile_selects_cutile_jsd_function():
+    repo_root = Path(__file__).resolve().parents[2]
+    pythonpath = os.pathsep.join(
+        [
+            str(repo_root / "src"),
+            str(repo_root),
+            os.environ.get("PYTHONPATH", ""),
+        ]
+    )
+    env = {
+        **os.environ,
+        "LIGER_KERNEL_BACKEND": "cutile",
+        "PYTHONPATH": pythonpath,
+    }
+    script = textwrap.dedent(
+        """
+        from liger_kernel.transformers.jsd import LigerJSDFunction
+
+        module_name = LigerJSDFunction.__module__
+        expected_prefix = "liger_kernel.ops.backends._cutile."
+        if not module_name.startswith(expected_prefix):
+            raise AssertionError(
+                f"Expected cuTile LigerJSDFunction from {expected_prefix}, got {module_name}"
+            )
+        """
+    )
+
+    subprocess.run(
+        [sys.executable, "-c", script],
+        check=True,
+        env=env,
+        cwd=repo_root,
+    )

From f2593b22206daf039d67373353f3582a7f2e80eb Mon Sep 17 00:00:00 2001
From: jinmanx <jinmanx@nvidia.com>
Date: Mon, 18 May 2026 04:07:48 -0700
Subject: [PATCH 2/7] refine

---
 README.md                                 | 20 +++++++-
 benchmark/data/all_benchmark_data.csv     | 60 +++++++++++++++++++++++
 setup.py                                  |  2 +-
 src/liger_kernel/ops/__init__.py          |  7 ++-
 src/liger_kernel/ops/backends/registry.py |  6 +--
 5 files changed, 87 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 69926e4ee..1e48e7a5d 100644
--- a/README.md
+++ b/README.md
@@ -113,7 +113,7 @@ y = orpo_loss(lm_head.weight, x, target)
 - **Ease of use:** Simply patch your Hugging Face model with one line of code, or compose your own model using our Liger Kernel modules.
 - **Time and memory efficient:** In the same spirit as Flash-Attn, but for layers like **RMSNorm**, **RoPE**, **SwiGLU**, and **CrossEntropy**! Increases multi-GPU training throughput by 20% and reduces memory usage by 60% with **kernel fusion**, **in-place replacement**, and **chunking** techniques.
 - **Exact:** Computation is exact—no approximations! Both forward and backward passes are implemented with rigorous unit tests and undergo convergence testing against training runs without Liger Kernel to ensure accuracy.
-- **Lightweight:** Liger Kernel has minimal dependencies, requiring only Torch and Triton—no extra libraries needed! Say goodbye to dependency headaches!
+- **Lightweight:** Liger Kernel has minimal default dependencies, requiring only Torch and Triton. Optional backends such as cuTile can be installed explicitly when needed.
 - **Multi-GPU supported:** Compatible with multi-GPU setups (PyTorch FSDP, DeepSpeed, DDP, etc.).
 - **Trainer Framework Integration**: [Axolotl](https://github.com/axolotl-ai-cloud/axolotl), [LLaMa-Factory](https://github.com/hiyouga/LLaMA-Factory), [SFTTrainer](https://github.com/huggingface/trl/releases/tag/v0.10.1), [Hugging Face Trainer](https://github.com/huggingface/transformers/pull/32860), [SWIFT](https://github.com/modelscope/ms-swift), [oumi](https://github.com/oumi-ai/oumi/tree/main)
 
@@ -139,6 +139,8 @@ pip3 install --pre torch torchvision torchaudio --index-url https://download.pyt
 ### Optional Dependencies
 
 - `transformers >= 4.x`: Required if you plan to use the transformers models patching APIs. The specific model you are working will dictate the minimum version of transformers.
+- `cuda-tile`: Required when enabling the optional cuTile backend on CUDA. Use this when your environment already provides CUDA Toolkit 13.1 or newer, or an existing tileiras compiler installation.
+- `cuda-tile[tileiras]`: Required when enabling the optional cuTile backend with the tileiras compiler installed directly into your Python environment.
 
 > **Note:**
 > Our kernels inherit the full spectrum of hardware compatibility offered by [Triton](https://github.com/triton-lang/triton).
@@ -168,10 +170,26 @@ pip install -e .
 # Setup Development Dependencies
 pip install -e ".[dev]"
 
+# Setup cuTile Dependencies
+pip install -e ".[cutile]"
+
+# Or install cuTile with the optional tileiras compiler
+pip install -e ".[cutile-tileiras]"
+
 # NOTE -> For AMD users only
 pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3/
 ```
 
+### Enable cuTile Backend
+
+cuTile is an optional CUDA-only backend. After installing the `cutile` or `cutile-tileiras` extra, enable it explicitly:
+
+```bash
+LIGER_KERNEL_BACKEND=cutile python your_script.py
+```
+
+`LIGER_KERNEL_BACKEND` currently only supports `cutile`. Selecting it on a non-CUDA device, or without the required cuTile dependencies, raises an error.
+
 
 ## Getting Started
 
diff --git a/benchmark/data/all_benchmark_data.csv b/benchmark/data/all_benchmark_data.csv
index 366ee7db5..b9b9fa49d 100644
--- a/benchmark/data/all_benchmark_data.csv
+++ b/benchmark/data/all_benchmark_data.csv
@@ -2187,3 +2187,63 @@ fused_moe,huggingface,backward,memory,MB,E,num_experts,16,2072.1728515625,2072.1
 fused_moe,huggingface,backward,memory,MB,E,num_experts,32,2737.08349609375,2737.08349609375,2737.08349609375,"{""sweep_dim"": ""E"", ""T"": 8192, ""E"": null, ""H"": 2048, ""intermediate_dim"": 768, ""K"": 8, ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2026-04-02 23:59:56,0.7.0
 fused_moe,huggingface,backward,memory,MB,E,num_experts,64,4078.97021484375,4078.97021484375,4078.97021484375,"{""sweep_dim"": ""E"", ""T"": 8192, ""E"": null, ""H"": 2048, ""intermediate_dim"": 768, ""K"": 8, ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2026-04-02 23:59:56,0.7.0
 fused_moe,huggingface,backward,memory,MB,E,num_experts,128,6763.82275390625,6763.82275390625,6763.82275390625,"{""sweep_dim"": ""E"", ""T"": 8192, ""E"": null, ""H"": 2048, ""intermediate_dim"": 768, ""K"": 8, ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2026-04-02 23:59:56,0.7.0
+jsd,liger,forward,speed,ms,BT,B * T,1024,4.802591800689697,4.792543983459472,4.822508716583252,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:50,0.8.0
+jsd,liger,forward,speed,ms,BT,B * T,2048,8.209935665130615,8.20173397064209,8.231609344482422,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:50,0.8.0
+jsd,liger,forward,speed,ms,BT,B * T,4096,16.51097583770752,16.486431121826172,16.534496307373047,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:50,0.8.0
+jsd,liger,forward,speed,ms,BT,B * T,8192,33.019935607910156,32.965196228027345,33.029823303222656,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:50,0.8.0
+jsd,liger,forward,speed,ms,BT,B * T,16384,68.23961639404297,68.23961639404297,68.23961639404297,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:50,0.8.0
+jsd,torch,forward,speed,ms,BT,B * T,1024,2.262079954147339,2.2600127696990966,2.263961601257324,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:51,0.8.0
+jsd,torch,forward,speed,ms,BT,B * T,2048,4.448192119598389,4.446419334411622,4.4502272605896,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:51,0.8.0
+jsd,torch,forward,speed,ms,BT,B * T,4096,8.790271759033203,8.787808418273926,8.79417610168457,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:51,0.8.0
+jsd,torch,forward,speed,ms,BT,B * T,8192,17.53606414794922,17.53264045715332,17.541554260253907,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:51,0.8.0
+jsd,torch,forward,speed,ms,BT,B * T,16384,35.041263580322266,35.0344955444336,35.048031616210935,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:51,0.8.0
+jsd,liger,backward,speed,ms,BT,B * T,1024,0.6543359756469727,0.6532800197601318,0.6553919911384583,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:52,0.8.0
+jsd,liger,backward,speed,ms,BT,B * T,2048,1.1776319742202759,1.176576018333435,1.1796480417251587,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:52,0.8.0
+jsd,liger,backward,speed,ms,BT,B * T,4096,2.2847520112991333,2.2845120429992676,2.286623954772949,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:52,0.8.0
+jsd,liger,backward,speed,ms,BT,B * T,8192,4.443583965301514,4.440294361114502,4.446220779418946,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:52,0.8.0
+jsd,liger,backward,speed,ms,BT,B * T,16384,8.761343955993652,8.759296417236328,8.764415740966797,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:52,0.8.0
+jsd,torch,backward,speed,ms,BT,B * T,1024,3.628959894180298,3.627891206741333,3.6328320026397707,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:53,0.8.0
+jsd,torch,backward,speed,ms,BT,B * T,2048,7.150623798370361,7.148947238922119,7.15374059677124,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:53,0.8.0
+jsd,torch,backward,speed,ms,BT,B * T,4096,14.258943557739258,14.257344245910645,14.26035213470459,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:53,0.8.0
+jsd,torch,backward,speed,ms,BT,B * T,8192,28.43654441833496,28.435987091064455,28.436717224121097,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:53,0.8.0
+jsd,torch,backward,speed,ms,BT,B * T,16384,56.82896041870117,56.82896041870117,56.82896041870117,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:53,0.8.0
+jsd,liger,full,speed,ms,BT,B * T,1024,5.446160078048706,5.43007984161377,5.45614709854126,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:55,0.8.0
+jsd,liger,full,speed,ms,BT,B * T,2048,9.357344150543213,9.346265411376953,9.371161460876465,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:55,0.8.0
+jsd,liger,full,speed,ms,BT,B * T,4096,18.769920349121094,18.760723114013672,18.82929916381836,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:55,0.8.0
+jsd,liger,full,speed,ms,BT,B * T,8192,37.44615936279297,37.407769012451176,37.48454971313477,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:55,0.8.0
+jsd,liger,full,speed,ms,BT,B * T,16384,76.7877426147461,76.7877426147461,76.7877426147461,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:55,0.8.0
+jsd,torch,full,speed,ms,BT,B * T,1024,5.885536193847656,5.883008003234863,5.8876800537109375,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:56,0.8.0
+jsd,torch,full,speed,ms,BT,B * T,2048,11.584159851074219,11.5828031539917,11.587251472473145,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:56,0.8.0
+jsd,torch,full,speed,ms,BT,B * T,4096,23.026704788208008,23.025049972534177,23.02828178405762,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:56,0.8.0
+jsd,torch,full,speed,ms,BT,B * T,8192,45.981807708740234,45.97930145263672,45.98431396484375,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:56,0.8.0
+jsd,torch,full,speed,ms,BT,B * T,16384,91.8117446899414,91.8117446899414,91.8117446899414,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:56,0.8.0
+jsd,liger,full,memory,MB,BT,B * T,1024,3012.0048828125,3012.0048828125,3012.0048828125,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:58,0.8.0
+jsd,liger,full,memory,MB,BT,B * T,2048,6012.0087890625,6012.0087890625,6012.0087890625,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:58,0.8.0
+jsd,liger,full,memory,MB,BT,B * T,4096,12024.0166015625,12024.0166015625,12024.0166015625,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:58,0.8.0
+jsd,liger,full,memory,MB,BT,B * T,8192,24048.015625,24048.015625,24048.015625,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:58,0.8.0
+jsd,liger,full,memory,MB,BT,B * T,16384,48096.015625,48096.015625,48096.015625,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:58,0.8.0
+jsd,torch,full,memory,MB,BT,B * T,1024,6519.0009765625,6519.0009765625,6519.0009765625,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:59,0.8.0
+jsd,torch,full,memory,MB,BT,B * T,2048,13026.0009765625,13026.0009765625,13026.0009765625,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:59,0.8.0
+jsd,torch,full,memory,MB,BT,B * T,4096,26052.0,26052.0,26052.0,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:59,0.8.0
+jsd,torch,full,memory,MB,BT,B * T,8192,52104.0,52104.0,52104.0,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:59,0.8.0
+jsd,torch,full,memory,MB,BT,B * T,16384,104208.0,104208.0,104208.0,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:59,0.8.0
+jsd,liger-cutile,forward,speed,ms,BT,B * T,1024,0.7814080119132996,0.7793023943901062,0.7831360101699829,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:22,0.8.0
+jsd,liger-cutile,forward,speed,ms,BT,B * T,2048,1.4285119771957397,1.4254208087921143,1.4325439929962158,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:22,0.8.0
+jsd,liger-cutile,forward,speed,ms,BT,B * T,4096,2.7792000770568848,2.773011255264282,2.783692789077759,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:22,0.8.0
+jsd,liger-cutile,forward,speed,ms,BT,B * T,8192,5.50931191444397,5.502655982971191,5.513644886016845,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:22,0.8.0
+jsd,liger-cutile,forward,speed,ms,BT,B * T,16384,10.931103706359863,10.921529960632324,10.938668823242187,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:22,0.8.0
+jsd,liger-cutile,backward,speed,ms,BT,B * T,1024,0.6397919952869415,0.6379775881767273,0.6410239934921265,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:24,0.8.0
+jsd,liger-cutile,backward,speed,ms,BT,B * T,2048,1.1632959842681885,1.1621824026107788,1.1643712043762207,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:24,0.8.0
+jsd,liger-cutile,backward,speed,ms,BT,B * T,4096,2.2692480087280273,2.267148876190186,2.272768020629883,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:24,0.8.0
+jsd,liger-cutile,backward,speed,ms,BT,B * T,8192,4.425759792327881,4.422994995117188,4.428908729553223,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:24,0.8.0
+jsd,liger-cutile,backward,speed,ms,BT,B * T,16384,8.757247924804688,8.7510404586792,8.760255813598633,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:24,0.8.0
+jsd,liger-cutile,full,speed,ms,BT,B * T,1024,1.4090240001678467,1.4069759845733643,1.4106112003326414,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:26,0.8.0
+jsd,liger-cutile,full,speed,ms,BT,B * T,2048,2.5795199871063232,2.5753151893615724,2.5846080780029297,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:26,0.8.0
+jsd,liger-cutile,full,speed,ms,BT,B * T,4096,5.042175769805908,5.039103984832764,5.045452690124511,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:26,0.8.0
+jsd,liger-cutile,full,speed,ms,BT,B * T,8192,9.92411184310913,9.921158599853515,9.930310440063476,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:26,0.8.0
+jsd,liger-cutile,full,speed,ms,BT,B * T,16384,19.671167373657227,19.667975234985352,19.68026809692383,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:26,0.8.0
+jsd,liger-cutile,full,memory,MB,BT,B * T,1024,3012.00537109375,3012.00537109375,3012.00537109375,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:29,0.8.0
+jsd,liger-cutile,full,memory,MB,BT,B * T,2048,6012.00927734375,6012.00927734375,6012.00927734375,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:29,0.8.0
+jsd,liger-cutile,full,memory,MB,BT,B * T,4096,12024.017578125,12024.017578125,12024.017578125,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:29,0.8.0
+jsd,liger-cutile,full,memory,MB,BT,B * T,8192,24048.017578125,24048.017578125,24048.017578125,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:29,0.8.0
+jsd,liger-cutile,full,memory,MB,BT,B * T,16384,48096.015625,48096.015625,48096.015625,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:29,0.8.0
diff --git a/setup.py b/setup.py
index fab3ec386..76487c24b 100644
--- a/setup.py
+++ b/setup.py
@@ -55,7 +55,7 @@ def get_optional_dependencies():
             "mkdocs-material",
             "torchvision>=0.20",
             "prek>=0.2.28",
-        ]
+        ],
     }
 
 
diff --git a/src/liger_kernel/ops/__init__.py b/src/liger_kernel/ops/__init__.py
index d6baff97c..147ad9676 100644
--- a/src/liger_kernel/ops/__init__.py
+++ b/src/liger_kernel/ops/__init__.py
@@ -143,9 +143,12 @@ def _replace_with_vendor_ops():
             globals()[name] = getattr(vendor_ops, name)
 
     except ImportError:
-        if vendor_info.required:
+        import os
+
+        backend = os.environ.get("LIGER_KERNEL_BACKEND", "").strip().lower()
+        if backend == "cutile":
             raise
-        # Vendor module not available, use default implementations
+        # Vendor module not available, use default implementations.
         pass
 
 
diff --git a/src/liger_kernel/ops/backends/registry.py b/src/liger_kernel/ops/backends/registry.py
index b98af7b23..c89febc72 100644
--- a/src/liger_kernel/ops/backends/registry.py
+++ b/src/liger_kernel/ops/backends/registry.py
@@ -22,12 +22,10 @@ class VendorInfo:
     Attributes:
         vendor: Vendor name (e.g., "ascend", "intel", "nvidia")
         device: Device type this vendor supports (e.g., "npu", "xpu")
-        required: Whether failing to load this backend should raise an error.
     """
 
     vendor: str
     device: str
-    required: bool = False
 
     @property
     def module_path(self) -> str:
@@ -83,6 +81,6 @@ def select_backend_for_device(device: str) -> Optional[VendorInfo]:
     if backend == "cutile":
         if device != "cuda":
             raise RuntimeError("LIGER_KERNEL_BACKEND=cutile requires a CUDA/GPU device.")
-        return VendorInfo(vendor="cutile", device="cuda", required=True)
+        return VendorInfo(vendor="cutile", device="cuda")
 
-    raise RuntimeError(f"Unsupported LIGER_KERNEL_BACKEND: {backend}")
+    raise RuntimeError(f"Unsupported LIGER_KERNEL_BACKEND: {backend}. Only 'cutile' is currently supported.")

From 7733ee49a9be4fd117541cc5bbd94a195a017d97 Mon Sep 17 00:00:00 2001
From: jinmanx <jinmanx@nvidia.com>
Date: Mon, 18 May 2026 20:11:56 -0700
Subject: [PATCH 3/7] refine readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ffa72ad71..16e5971ca 100644
--- a/README.md
+++ b/README.md
@@ -113,7 +113,7 @@ y = orpo_loss(lm_head.weight, x, target)
 - **Ease of use:** Simply patch your Hugging Face model with one line of code, or compose your own model using our Liger Kernel modules.
 - **Time and memory efficient:** In the same spirit as Flash-Attn, but for layers like **RMSNorm**, **RoPE**, **SwiGLU**, and **CrossEntropy**! Increases multi-GPU training throughput by 20% and reduces memory usage by 60% with **kernel fusion**, **in-place replacement**, and **chunking** techniques.
 - **Exact:** Computation is exact—no approximations! Both forward and backward passes are implemented with rigorous unit tests and undergo convergence testing against training runs without Liger Kernel to ensure accuracy.
-- **Lightweight:** Liger Kernel has minimal default dependencies, requiring only Torch and Triton. Optional backends such as cuTile can be installed explicitly when needed.
+- **Lightweight:** Liger Kernel has minimal dependencies, requiring only Torch and Triton—no extra libraries needed! Say goodbye to dependency headaches!
 - **Multi-GPU supported:** Compatible with multi-GPU setups (PyTorch FSDP, DeepSpeed, DDP, etc.).
 - **Trainer Framework Integration**: [Axolotl](https://github.com/axolotl-ai-cloud/axolotl), [LLaMa-Factory](https://github.com/hiyouga/LLaMA-Factory), [SFTTrainer](https://github.com/huggingface/trl/releases/tag/v0.10.1), [Hugging Face Trainer](https://github.com/huggingface/transformers/pull/32860), [SWIFT](https://github.com/modelscope/ms-swift), [oumi](https://github.com/oumi-ai/oumi/tree/main)
 

From 8ac80fd0ed6c017b887ae3dc5dd286dee1a7552c Mon Sep 17 00:00:00 2001
From: Vaibhav Jindal <vaibhav.jndl@gmail.com>
Date: Tue, 19 May 2026 13:35:07 -0700
Subject: [PATCH 4/7] Refactor backend registry: rename Vendor to Backend,
 support multi-device

The existing VendorInfo / VENDOR_REGISTRY model assumed one vendor per
device (e.g., Ascend on NPU). It doesn't model alternative DSLs on the
same device (e.g., cuTile on CUDA, where Triton is already the default),
which was being shoehorned in via a hardcoded `if backend == "cutile":`
branch and a _cutile/__init__.py that deliberately skipped registration.

Replace VendorInfo(vendor, device) with BackendInfo(name, devices,
default_devices):

  - devices: tuple of supported devices (multi-device backends OK)
  - default_devices: subset auto-applied on import; empty = opt-in
    only via LIGER_KERNEL_BACKEND

Both Ascend and cuTile now register through the same register_backend()
mechanism, no special cases. Ascend uses devices=("npu",),
default_devices=("npu",); cuTile uses devices=("cuda",) with empty
default_devices, making it opt-in. The dispatcher
(_replace_with_backend_ops) reads the env var once and threads explicit
through, dropping the duplicate parse in the import fallback.

No behavior change for end users: LIGER_KERNEL_BACKEND=cutile still
selects cuTile on CUDA, and Ascend on NPU still auto-applies.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/liger_kernel/ops/__init__.py              |  67 ++++----
 src/liger_kernel/ops/backends/README.md       | 147 ++++++++++--------
 src/liger_kernel/ops/backends/__init__.py     |  14 +-
 .../ops/backends/_ascend/__init__.py          |   8 +-
 .../ops/backends/_cutile/__init__.py          |  10 +-
 src/liger_kernel/ops/backends/registry.py     | 131 +++++++++-------
 6 files changed, 211 insertions(+), 166 deletions(-)

diff --git a/src/liger_kernel/ops/__init__.py b/src/liger_kernel/ops/__init__.py
index 6be4eae43..6eb2f7454 100644
--- a/src/liger_kernel/ops/__init__.py
+++ b/src/liger_kernel/ops/__init__.py
@@ -1,12 +1,12 @@
 """
-Liger-Kernel operators with automatic vendor-specific replacement.
+Liger-Kernel operators with automatic backend-specific replacement.
 
 This module provides two ways to import operators:
 
 1. Import from this package (recommended for Function classes):
        from liger_kernel.ops import LigerGELUMulFunction
 
-   This automatically uses vendor-specific implementation if available.
+   This automatically uses the active backend's implementation if any is selected.
 
 2. Import from submodules (for kernel functions or specific access):
        from liger_kernel.ops.geglu import geglu_forward, geglu_backward
@@ -15,10 +15,12 @@
 
 The replacement mechanism:
 1. Default implementations are imported from individual modules (e.g., geglu.py)
-2. On module load, device is detected via infer_device()
-3. If running on a supported vendor device (npu, xpu, etc.), the default
-   implementations are replaced with vendor-specific ones
-4. All subsequent imports from this package get the replaced versions
+2. On module load, device is detected via infer_device() and the env var
+   LIGER_KERNEL_BACKEND is read
+3. select_backend() picks an active backend (auto-applied for the device, or
+   explicitly requested via env var)
+4. If a backend is selected, its implementations replace/extend the symbols here
+5. All subsequent imports from this package get the replaced versions
 
 Note: Direct imports from submodules (e.g., from liger_kernel.ops.geglu import ...)
       are NOT affected by the replacement mechanism.
@@ -27,7 +29,7 @@
 # =============================================================================
 # Import default implementations
 # Both Function classes and kernel functions are imported here.
-# All of these can be replaced by vendor-specific implementations.
+# All of these can be replaced by backend-specific implementations.
 # =============================================================================
 
 from liger_kernel.ops.attn_res import LigerAttnResFunction  # noqa: F401
@@ -94,65 +96,68 @@
 from liger_kernel.ops.tvd import LigerTVDLossFunction  # noqa: F401
 
 # NOTE: __all__ is intentionally NOT defined.
-# - Import from this package (liger_kernel.ops) -> subject to vendor replacement
+# - Import from this package (liger_kernel.ops) -> subject to backend replacement
 # - Import from submodules (liger_kernel.ops.geglu) -> always use default implementation
 
 
 # =============================================================================
-# Vendor-specific replacement logic
+# Backend-specific replacement logic
 # =============================================================================
 
 
-def _replace_with_vendor_ops():
+def _replace_with_backend_ops():
     """
-    Replace/add vendor-specific operator implementations.
+    Replace/add backend-specific operator implementations.
 
     This function is called automatically on module load. It:
     1. Detects the current device (cuda, npu, xpu, etc.)
-    2. Selects the backend for that device, including explicit backend overrides
+    2. Selects the active backend via ``select_backend()``, honoring any
+       explicit ``LIGER_KERNEL_BACKEND`` override
     3. Loads and applies backend-specific implementations
 
-    Vendor implementations should be placed in:
-        liger_kernel/ops/backends/_<vendor>/ops/
+    Backend implementations should be placed in:
+        liger_kernel/ops/backends/_<name>/ops/
 
-    If the vendor module defines __all__, only those symbols are exported.
+    If the backend module defines __all__, only those symbols are exported.
     Otherwise, all public symbols (not starting with _) are auto-discovered.
 
-    Note: Vendor can both override existing ops AND add new vendor-specific ops.
+    Note: Backends can both override existing ops AND add new backend-specific ops.
     """
-    from liger_kernel.ops.backends import select_backend_for_device
+    import os
+
+    from liger_kernel.ops.backends import LIGER_KERNEL_BACKEND_ENV
+    from liger_kernel.ops.backends import select_backend
     from liger_kernel.utils import infer_device
 
     device = infer_device()
-
-    vendor_info = select_backend_for_device(device)
-    if vendor_info is None:
+    explicit = os.environ.get(LIGER_KERNEL_BACKEND_ENV, "").strip().lower() or None
+    backend_info = select_backend(device, explicit=explicit)
+    if backend_info is None:
         return
 
     try:
         import importlib
 
-        vendor_ops = importlib.import_module(vendor_info.module_path)
+        backend_ops = importlib.import_module(backend_info.module_path)
 
         # Get names to export: use __all__ if defined, otherwise auto-discover
-        names_to_export = getattr(vendor_ops, "__all__", None)
+        names_to_export = getattr(backend_ops, "__all__", None)
 
         if names_to_export is None:
             # Auto-discover: find all public symbols (classes and functions)
-            names_to_export = [name for name in dir(vendor_ops) if not name.startswith("_")]
+            names_to_export = [name for name in dir(backend_ops) if not name.startswith("_")]
 
         # Replace or add to this module's globals
         for name in names_to_export:
-            globals()[name] = getattr(vendor_ops, name)
+            globals()[name] = getattr(backend_ops, name)
 
     except ImportError:
-        import os
-
-        backend = os.environ.get("LIGER_KERNEL_BACKEND", "").strip().lower()
-        if backend == "cutile":
+        # An auto-selected backend that fails to import (e.g., missing optional
+        # vendor SDK in the environment) silently falls back to defaults. An
+        # explicitly-requested backend, however, must succeed — re-raise so the
+        # user sees the underlying error.
+        if explicit:
             raise
-        # Vendor module not available, use default implementations.
-        pass
 
 
-_replace_with_vendor_ops()
+_replace_with_backend_ops()
diff --git a/src/liger_kernel/ops/backends/README.md b/src/liger_kernel/ops/backends/README.md
index ce5060ca4..ce445e44d 100644
--- a/src/liger_kernel/ops/backends/README.md
+++ b/src/liger_kernel/ops/backends/README.md
@@ -1,79 +1,86 @@
-# Adding a New Vendor Backend
+# Adding a New Backend
 
-This directory contains backend-specific operator implementations that can replace the default implementations.
+This directory contains backend-specific operator implementations that can replace the default Liger implementations.
 
-Most backends are selected automatically by device vendor, such as `npu -> ascend`. Optional backends for an existing device, such as `cutile` on CUDA, should be selected explicitly with `LIGER_KERNEL_BACKEND`.
+A **backend** here is a named alternative implementation of Liger's operators. It may target a different hardware device (e.g., Ascend NPU vs. NVIDIA CUDA) or a different DSL on the same device (e.g., cuTile vs. Triton on CUDA), and may support one or more devices.
+
+Each backend declares two device sets:
+
+- **`devices`** — every device the backend supports.
+- **`default_devices`** — the subset on which the backend is auto-applied at import time. On supported devices not listed here, the backend is opt-in only and must be requested explicitly via the `LIGER_KERNEL_BACKEND` environment variable.
 
 ## Concepts
 
-- **Vendor**: Chip manufacturer (e.g., `ascend`, `intel`, `nvidia`)
-- **Device**: Device type (e.g., `npu`, `xpu`, `cuda`)
-- **VendorInfo**: Defines the mapping between vendor and device
-- **Backend override**: An explicit backend selected with `LIGER_KERNEL_BACKEND`, used for optional implementations that are not the default backend for a device
+- **Device**: PyTorch device type returned by `infer_device()` (e.g., `cuda`, `npu`, `xpu`)
+- **BackendInfo**: Declarative description of a backend (name, supported devices, default devices)
+- **Auto-applied backend**: A backend whose `default_devices` includes the current device — applied automatically (e.g., Ascend on NPU)
+- **Opt-in backend**: A backend whose `default_devices` is empty (or excludes the current device) — applied only when `LIGER_KERNEL_BACKEND=<name>` is set (e.g., cuTile on CUDA)
 
 ## Directory Structure
 
 ```
 backends/
-├── README.md          
-├── __init__.py         
-├── registry.py         # VendorInfo, register_vendor(), VENDOR_REGISTRY, select_backend_for_device()
-├── _ascend/            # Ascend (Huawei) vendor - supports NPU
-│   ├── __init__.py     # Registers VendorInfo for NPU
+├── README.md
+├── __init__.py
+├── registry.py          # BackendInfo, register_backend(), BACKEND_REGISTRY, select_backend()
+├── _ascend/             # Ascend backend — auto-applied on NPU
+│   ├── __init__.py      # register_backend(BackendInfo(name="ascend", devices=("npu",), default_devices=("npu",)))
 │   └── ops/
-│       ├── __init__.py # Exports vendor-specific implementations
-│       └── geglu.py    # NPU-specific GEGLU implementation
-├── _cutile/            # Optional CUDA backend - selected by LIGER_KERNEL_BACKEND=cutile
+│       ├── __init__.py  # Exports backend-specific implementations
+│       └── geglu.py     # NPU-specific GEGLU implementation
+├── _cutile/             # cuTile backend — opt-in on CUDA
+│   ├── __init__.py      # register_backend(BackendInfo(name="cutile", devices=("cuda",)))
 │   └── ops/
 │       └── ...
-└── _<vendor>/          # Your new vendor backend
+└── _<name>/             # Your new backend
     └── ...
 ```
 
 ## How It Works
 
-1. When `liger_kernel.ops.backends` is imported, it imports all vendor packages (e.g., `_ascend`)
-2. Each vendor's `__init__.py` calls `register_vendor()` to register itself
-3. When `liger_kernel.ops` is imported, `_replace_with_vendor_ops()` is called
-4. It detects the current device via `infer_device()`
-5. It calls `select_backend_for_device()`:
-   - If `LIGER_KERNEL_BACKEND` is not set, it falls back to `get_vendor_for_device(device)`
-   - If `LIGER_KERNEL_BACKEND=cutile`, it requires `device == "cuda"` and loads `_cutile.ops`
-6. Backend implementations replace/add to the `liger_kernel.ops` namespace
+1. When `liger_kernel.ops.backends` is imported, every `_<name>/` subpackage is auto-imported.
+2. Each backend's `__init__.py` calls `register_backend()` to register itself.
+3. When `liger_kernel.ops` is imported, `_replace_with_backend_ops()` is called.
+4. It detects the current device via `infer_device()` and reads `LIGER_KERNEL_BACKEND` from the environment.
+5. It calls `select_backend(device, explicit=...)`:
+   - If `LIGER_KERNEL_BACKEND` is **set**, the named backend is selected (validated against the current device).
+   - If `LIGER_KERNEL_BACKEND` is **unset**, the first registered backend that lists the current device in its `default_devices` is selected; otherwise the defaults are kept.
+6. The selected backend's operators replace/extend the symbols in the `liger_kernel.ops` namespace.
 
-## Adding a New Vendor
+If an auto-selected backend fails to import (e.g., the vendor SDK isn't installed), the dispatcher silently falls back to defaults. An explicitly-requested backend that fails to import re-raises so the user sees the underlying error.
 
-### Step 1: Create Directory Structure
+## Adding a New Backend
+
+### Step 1: Create the directory structure
 
 ```bash
-mkdir -p backends/_<vendor>/ops
-touch backends/_<vendor>/__init__.py
-touch backends/_<vendor>/ops/__init__.py
+mkdir -p backends/_<name>/ops
+touch backends/_<name>/__init__.py
+touch backends/_<name>/ops/__init__.py
 ```
 
-### Step 2: Register Your Vendor
+### Step 2: Register your backend
 
-In `backends/_<vendor>/__init__.py`, register your vendor:
+In `backends/_<name>/__init__.py`:
 
 ```python
 """
-<Vendor> backend for Liger-Kernel.
+<Name> backend for Liger-Kernel.
 """
 
-from liger_kernel.ops.backends.registry import VendorInfo, register_vendor
+from liger_kernel.ops.backends.registry import BackendInfo
+from liger_kernel.ops.backends.registry import register_backend
 
-register_vendor(
-    VendorInfo(
-        vendor="<vendor>",
-        device="<device>",
-    )
-)
-```
+# Auto-applied on the listed devices:
+register_backend(BackendInfo(name="<name>", devices=("<device>",), default_devices=("<device>",)))
 
+# Or, opt-in only (selected via LIGER_KERNEL_BACKEND=<name>):
+# register_backend(BackendInfo(name="<name>", devices=("<device>",)))
+```
 
-### Step 3: Ensure Device Detection Works
+### Step 3: Ensure device detection works
 
-Make sure `infer_device()` in `liger_kernel/utils.py` can detect your device:
+Make sure `infer_device()` in `liger_kernel/utils.py` recognizes your device:
 
 ```python
 def infer_device():
@@ -87,48 +94,42 @@ def infer_device():
     return "cpu"
 ```
 
-### Step 4: Implement Vendor-Specific Operators
+### Step 4: Implement backend-specific operators
 
-Create operator files in `backends/_<vendor>/ops/`. For example, `geglu.py`:
+Create operator files in `backends/_<name>/ops/`. For example, `geglu.py`:
 
 ```python
 import torch
 
 class LigerGELUMulFunction(torch.autograd.Function):
-    """
-    Vendor-specific LigerGELUMulFunction implementation.
-    """
+    """Backend-specific LigerGELUMulFunction implementation."""
+
     @staticmethod
     def forward(ctx, a, b):
-        # Your vendor-specific forward implementation
         ...
 
     @staticmethod
     def backward(ctx, dc):
-        # Your vendor-specific backward implementation
         ...
 
-# Optional: vendor-specific kernel functions
-def geglu_forward_vendor(a, b):
+def geglu_forward_backend(a, b):
     ...
 
-def geglu_backward_vendor(a, b, dc):
+def geglu_backward_backend(a, b, dc):
     ...
 ```
 
 ### Step 5: Export in `ops/__init__.py`
 
-In `backends/_<vendor>/ops/__init__.py`, export your implementations:
+In `backends/_<name>/ops/__init__.py`:
 
 ```python
-"""
-<Vendor>-specific operator implementations.
-"""
+"""<Name>-specific operator implementations."""
 
 from .<module> import (
     LigerGELUMulFunction,
-    geglu_forward_vendor as geglu_forward,   # Rename to match default API
-    geglu_backward_vendor as geglu_backward,
+    geglu_forward_backend as geglu_forward,   # Rename to match default API
+    geglu_backward_backend as geglu_backward,
 )
 
 # Explicitly declare what to export (recommended)
@@ -143,11 +144,11 @@ __all__ = [
 
 ### Incremental Override
 
-You **don't need to implement all operators**. Only implement the ones that require vendor-specific adaptations. Unimplemented operators will automatically fall back to the default (CUDA) implementation.
+You **don't need to implement all operators**. Only implement the ones that require backend-specific adaptations. Unimplemented operators automatically fall back to the default implementation.
 
-### Vendor-Specific Additions
+### Backend-Specific Additions
 
-Vendors can also **add new operators** that don't exist in the default implementation. These will be exported to `liger_kernel.ops` namespace for users to import.
+Backends can also **add new operators** that don't exist in the default implementation. These will be exported to the `liger_kernel.ops` namespace for users to import.
 
 ### Naming Convention
 
@@ -155,20 +156,30 @@ Vendors can also **add new operators** that don't exist in the default implement
 - This allows seamless replacement without changing user code
 - Use `as` imports to rename if your internal naming differs
 
+### Multi-Device Backends
+
+A backend can support multiple devices by listing them all in `devices`. It can be the default on a subset (or none) of them. Examples:
+
+```python
+# Supports CUDA and XPU; default on neither (opt-in everywhere):
+register_backend(BackendInfo(name="inductor", devices=("cuda", "xpu")))
+
+# Supports CUDA and XPU; auto-applied on XPU only:
+register_backend(BackendInfo(name="example", devices=("cuda", "xpu"), default_devices=("xpu",)))
+```
+
 ## Example: Ascend NPU Backend
 
-See `_ascend/` directory for a complete example of the Ascend NPU backend implementation.
+See `_ascend/` for a complete example of an auto-applied backend.
 
-## Enable cuTile Backend
+## Example: cuTile Backend
 
-We need to explicitly set environment variables to enable cuTile.
+See `_cutile/` for a complete example of an opt-in backend.
 
-For example, `cutile` is selected with:
+Enable it on a CUDA device with:
 
 ```bash
 LIGER_KERNEL_BACKEND=cutile python your_script.py
 ```
 
-cuTile is only supported on CUDA devices. When `LIGER_KERNEL_BACKEND=cutile` is set, Liger-Kernel selects the cuTile operator implementations instead of the default CUDA implementations.
-
-If the selected backend cannot be imported, the import error is raised instead of silently falling back to the default implementation.
+`select_backend()` validates the request: if the current device isn't in the backend's `devices`, or if `cuda-tile` isn't installed, the user gets a clear error instead of a silent fallback.
diff --git a/src/liger_kernel/ops/backends/__init__.py b/src/liger_kernel/ops/backends/__init__.py
index 8f20b5905..f293646b0 100644
--- a/src/liger_kernel/ops/backends/__init__.py
+++ b/src/liger_kernel/ops/backends/__init__.py
@@ -1,14 +1,14 @@
 import importlib
 import pkgutil
 
-from liger_kernel.ops.backends.registry import VENDOR_REGISTRY  # noqa: F401
-from liger_kernel.ops.backends.registry import VendorInfo  # noqa: F401
-from liger_kernel.ops.backends.registry import get_vendor_for_device  # noqa: F401
-from liger_kernel.ops.backends.registry import register_vendor  # noqa: F401
-from liger_kernel.ops.backends.registry import select_backend_for_device  # noqa: F401
+from liger_kernel.ops.backends.registry import BACKEND_REGISTRY  # noqa: F401
+from liger_kernel.ops.backends.registry import LIGER_KERNEL_BACKEND_ENV  # noqa: F401
+from liger_kernel.ops.backends.registry import BackendInfo  # noqa: F401
+from liger_kernel.ops.backends.registry import register_backend  # noqa: F401
+from liger_kernel.ops.backends.registry import select_backend  # noqa: F401
 
-# Auto-import all _<vendor> subpackages to trigger registration
-# Each vendor's __init__.py calls register_vendor() when imported
+# Auto-import all _<backend> subpackages to trigger registration.
+# Each backend's __init__.py calls register_backend() when imported.
 for _, modname, ispkg in pkgutil.iter_modules(__path__):
     if ispkg and modname.startswith("_"):
         importlib.import_module(f"{__name__}.{modname}")
diff --git a/src/liger_kernel/ops/backends/_ascend/__init__.py b/src/liger_kernel/ops/backends/_ascend/__init__.py
index a07e7ab09..e7358b81e 100644
--- a/src/liger_kernel/ops/backends/_ascend/__init__.py
+++ b/src/liger_kernel/ops/backends/_ascend/__init__.py
@@ -1,5 +1,5 @@
-from liger_kernel.ops.backends.registry import VendorInfo
-from liger_kernel.ops.backends.registry import register_vendor
+from liger_kernel.ops.backends.registry import BackendInfo
+from liger_kernel.ops.backends.registry import register_backend
 
-# Register Ascend vendor for NPU device
-register_vendor(VendorInfo(vendor="ascend", device="npu"))
+# Ascend NPU backend — default on NPU devices.
+register_backend(BackendInfo(name="ascend", devices=("npu",), default_devices=("npu",)))
diff --git a/src/liger_kernel/ops/backends/_cutile/__init__.py b/src/liger_kernel/ops/backends/_cutile/__init__.py
index 16d000218..d6f6a7663 100644
--- a/src/liger_kernel/ops/backends/_cutile/__init__.py
+++ b/src/liger_kernel/ops/backends/_cutile/__init__.py
@@ -1,6 +1,12 @@
 """
 cuTile backend for Liger-Kernel.
 
-cuTile is an optional CUDA backend selected explicitly with
-LIGER_KERNEL_BACKEND=cutile, so it does not register as the default CUDA vendor.
+cuTile is an optional CUDA backend. It is opt-in only — users select it
+explicitly via ``LIGER_KERNEL_BACKEND=cutile``. It is not auto-applied on
+any device (note the empty ``default_devices`` on the registration below).
 """
+
+from liger_kernel.ops.backends.registry import BackendInfo
+from liger_kernel.ops.backends.registry import register_backend
+
+register_backend(BackendInfo(name="cutile", devices=("cuda",)))
diff --git a/src/liger_kernel/ops/backends/registry.py b/src/liger_kernel/ops/backends/registry.py
index c89febc72..a138bcc29 100644
--- a/src/liger_kernel/ops/backends/registry.py
+++ b/src/liger_kernel/ops/backends/registry.py
@@ -1,86 +1,109 @@
 """
-Vendor registry for Liger-Kernel multi-backend support.
+Backend registry for Liger-Kernel multi-backend support.
 
-This module defines VendorInfo and the registry for vendor registration.
-Each vendor registers itself by calling register_vendor() in its __init__.py.
-"""
+A "backend" here is a named implementation of Liger's operators. It may correspond
+to a different hardware device (e.g., Ascend on NPU) or a different DSL on the
+same device (e.g., cuTile on CUDA), and it may support one or more devices.
+
+Each backend declares:
+  - the set of devices it supports
+  - the subset of those devices on which it is the *default* (auto-applied on
+    import). On any other supported device the backend is opt-in only and must
+    be requested explicitly via the LIGER_KERNEL_BACKEND environment variable.
 
-import os
+Each backend registers itself by calling register_backend() in its __init__.py.
+"""
 
 from dataclasses import dataclass
+from dataclasses import field
 from typing import Optional
+from typing import Tuple
 
 # Dynamically get backends package path to avoid hardcoding
 _BACKENDS_PACKAGE = __name__.rsplit(".", 1)[0]  # "liger_kernel.ops.backends"
 
+# Environment variable users set to explicitly select an opt-in backend.
+LIGER_KERNEL_BACKEND_ENV = "LIGER_KERNEL_BACKEND"
 
-@dataclass
-class VendorInfo:
+
+@dataclass(frozen=True)
+class BackendInfo:
     """
-    Information about a chip vendor and its supported device.
+    Information about a backend implementation.
 
     Attributes:
-        vendor: Vendor name (e.g., "ascend", "intel", "nvidia")
-        device: Device type this vendor supports (e.g., "npu", "xpu")
+        name: Backend identifier (e.g., "ascend", "cutile"). The on-disk
+            directory must be ``backends/_<name>/``.
+        devices: Tuple of device types this backend supports
+            (e.g., ``("npu",)``, ``("cuda",)``, ``("cuda", "xpu")``).
+        default_devices: Subset of ``devices`` on which this backend is
+            automatically applied at import time. On supported devices not
+            listed here, the backend is opt-in only via ``LIGER_KERNEL_BACKEND``.
+            Empty tuple (the default) means the backend is opt-in only on every
+            device it supports.
     """
 
-    vendor: str
-    device: str
+    name: str
+    devices: Tuple[str, ...]
+    default_devices: Tuple[str, ...] = field(default_factory=tuple)
+
+    def __post_init__(self):
+        if not self.devices:
+            raise ValueError(f"Backend {self.name!r} must declare at least one supported device.")
+        extra = set(self.default_devices) - set(self.devices)
+        if extra:
+            raise ValueError(
+                f"Backend {self.name!r}: default_devices {sorted(extra)} not in devices {list(self.devices)}."
+            )
 
     @property
     def module_path(self) -> str:
-        """Auto-generated module path based on vendor name."""
-        return f"{_BACKENDS_PACKAGE}._{self.vendor}.ops"
+        """Auto-generated module path based on backend name."""
+        return f"{_BACKENDS_PACKAGE}._{self.name}.ops"
 
 
-# Registry mapping device types to their vendor info
-# Vendors register themselves via register_vendor()
-VENDOR_REGISTRY: dict[str, VendorInfo] = {}
+# Registry mapping backend names to their info.
+BACKEND_REGISTRY: dict[str, BackendInfo] = {}
 
 
-def register_vendor(vendor_info: VendorInfo) -> None:
-    """
-    Register a vendor's info in the global registry.
+def register_backend(info: BackendInfo) -> None:
+    """Register a backend's info in the global registry."""
+    BACKEND_REGISTRY[info.name] = info
 
-    This should be called in each vendor's __init__.py to register itself.
 
-    Args:
-        vendor_info: VendorInfo instance to register
+def select_backend(device: str, explicit: Optional[str] = None) -> Optional[BackendInfo]:
     """
-    VENDOR_REGISTRY[vendor_info.device] = vendor_info
-
-
-def get_vendor_for_device(device: str) -> Optional[VendorInfo]:
-    """
-    Get the VendorInfo for a given device type.
+    Select the backend implementation for the current device.
 
     Args:
-        device: Device type (e.g., "npu", "xpu")
+        device: Device type from ``infer_device()`` (e.g., "cuda", "npu").
+        explicit: If set, force selection of this named backend. The backend's
+            supported devices are validated against the runtime.
 
     Returns:
-        VendorInfo if found, None otherwise
-    """
-    return VENDOR_REGISTRY.get(device)
-
-
-def select_backend_for_device(device: str) -> Optional[VendorInfo]:
-    """
-    Select the backend implementation for a given device.
+        ``BackendInfo`` if a backend should replace the defaults, ``None`` to keep defaults.
 
-    LIGER_KERNEL_BACKEND is an explicit override for optional backends that are
-    not the default vendor implementation for a device.
+    Raises:
+        RuntimeError: If ``explicit`` names an unknown backend or is incompatible
+            with the current device.
     """
-    backend = os.environ.get("LIGER_KERNEL_BACKEND")
-    if backend is None:
-        return get_vendor_for_device(device)
-
-    backend = backend.strip().lower()
-    if backend == "":
-        return get_vendor_for_device(device)
-
-    if backend == "cutile":
-        if device != "cuda":
-            raise RuntimeError("LIGER_KERNEL_BACKEND=cutile requires a CUDA/GPU device.")
-        return VendorInfo(vendor="cutile", device="cuda")
-
-    raise RuntimeError(f"Unsupported LIGER_KERNEL_BACKEND: {backend}. Only 'cutile' is currently supported.")
+    if explicit:
+        info = BACKEND_REGISTRY.get(explicit)
+        if info is None:
+            known = ", ".join(sorted(BACKEND_REGISTRY)) or "<none registered>"
+            raise RuntimeError(
+                f"Unknown {LIGER_KERNEL_BACKEND_ENV}={explicit!r}. Registered backends: {known}."
+            )
+        if device not in info.devices:
+            supported = ", ".join(info.devices)
+            raise RuntimeError(
+                f"{LIGER_KERNEL_BACKEND_ENV}={info.name!r} supports devices ({supported}), "
+                f"but the current device is {device!r}."
+            )
+        return info
+
+    # Auto-select: pick a backend that declares the current device as one of its defaults.
+    for info in BACKEND_REGISTRY.values():
+        if device in info.default_devices:
+            return info
+    return None

From f51c8f719d9de7c401ed7cad9e7694f5cd9b0592 Mon Sep 17 00:00:00 2001
From: Vaibhav Jindal <vaibhav.jndl@gmail.com>
Date: Mon, 25 May 2026 23:32:01 -0700
Subject: [PATCH 5/7] Restructure: hardware backends in backends/, DSLs at top
 level

Per design consensus on PR #1228 with @Tcc0403, @zheliuyu, and @kolehma8,
restructure the operator dispatch model to cleanly separate two axes:

- Hardware: default (CUDA, top-level) vs alternative (Ascend, ROCm, ...,
  under backends/_<name>/).
- DSL/implementation: Triton (default), cuTile, future CUTLASS / CuteDSL /
  TileLang on CUDA, future tilelang-ascend / triton-ascend on NPU, ...

Each (hardware, DSL) pair becomes one registered implementation.

Concrete changes:

1. Move src/liger_kernel/ops/backends/_cutile/ -> src/liger_kernel/ops/cutile/.
   cuTile is a DSL alternative for the default hardware (CUDA), so it lives
   as a peer of backends/, not inside it. backends/ now strictly means
   "non-default hardware backends," matching its original intent.

2. Rename the registry abstraction to remove name collision with the
   backends/ directory:

     BackendInfo                -> ImplInfo
     BACKEND_REGISTRY           -> IMPL_REGISTRY
     register_backend           -> register_impl
     select_backend             -> select_impl
     _replace_with_backend_ops  -> _replace_with_impl_ops
     env LIGER_KERNEL_BACKEND   -> env LIGER_KERNEL_IMPL
     const LIGER_KERNEL_BACKEND_ENV -> const LIGER_KERNEL_IMPL_ENV

3. ImplInfo.module_path becomes an explicit required field (was auto-derived
   from name). This is necessary because cutile/ at top level and _ascend/
   inside backends/ have different module paths; the old auto-derivation
   only worked for the latter.

4. New _discover_impls() in ops/__init__.py auto-imports both:
   - liger_kernel.ops.backends (triggers _<name>/ registration via the
     existing pkgutil loop in backends/__init__.py).
   - Non-private subpackages of ops/ (excluding "backends" and
     "experimental") so DSL peers like cutile/ self-register.

5. README.md, backends/README.md, test/transformers/test_cutile_backend.py,
   and benchmark/scripts/utils.py updated for the new env var name and
   module paths.

No behavior change for end users: LIGER_KERNEL_IMPL=cutile selects cuTile
on CUDA (was LIGER_KERNEL_BACKEND=cutile), and Ascend continues to
auto-apply on NPU.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 README.md                                     |   8 +-
 benchmark/scripts/utils.py                    |   6 +-
 src/liger_kernel/ops/__init__.py              |  96 +++++---
 src/liger_kernel/ops/backends/README.md       | 211 ++++++++++--------
 src/liger_kernel/ops/backends/__init__.py     |  15 +-
 .../ops/backends/_ascend/__init__.py          |  15 +-
 .../ops/backends/_cutile/__init__.py          |  12 -
 src/liger_kernel/ops/backends/registry.py     |  95 ++++----
 src/liger_kernel/ops/cutile/__init__.py       |  18 ++
 .../_cutile => cutile}/ops/__init__.py        |   6 +-
 .../{backends/_cutile => cutile}/ops/jsd.py   |   2 +-
 .../{backends/_cutile => cutile}/ops/utils.py |   0
 test/transformers/test_cutile_backend.py      |  10 +-
 13 files changed, 281 insertions(+), 213 deletions(-)
 delete mode 100644 src/liger_kernel/ops/backends/_cutile/__init__.py
 create mode 100644 src/liger_kernel/ops/cutile/__init__.py
 rename src/liger_kernel/ops/{backends/_cutile => cutile}/ops/__init__.py (73%)
 rename src/liger_kernel/ops/{backends/_cutile => cutile}/ops/jsd.py (98%)
 rename src/liger_kernel/ops/{backends/_cutile => cutile}/ops/utils.py (100%)

diff --git a/README.md b/README.md
index a633adeae..ce88d15a0 100644
--- a/README.md
+++ b/README.md
@@ -182,13 +182,13 @@ pip3 install --pre torch torchvision torchaudio --index-url https://download.pyt
 
 ### Enable cuTile Backend
 
-cuTile is an optional CUDA-only backend. After installing the `cutile` or `cutile-tileiras` extra, enable it explicitly:
+cuTile is an optional CUDA-only DSL implementation. After installing the `cutile` or `cutile-tileiras` extra, enable it explicitly:
 
 ```bash
-LIGER_KERNEL_BACKEND=cutile python your_script.py
+LIGER_KERNEL_IMPL=cutile python your_script.py
 ```
 
-`LIGER_KERNEL_BACKEND` currently only supports `cutile`. Selecting it on a non-CUDA device, or without the required cuTile dependencies, raises an error.
+`LIGER_KERNEL_IMPL` selects an opt-in implementation registered with Liger (currently `cutile`). Selecting one on an unsupported device, or without the required dependencies installed, raises an error.
 
 
 ## Getting Started
@@ -308,7 +308,7 @@ loss.backward()
 | **Kernel**                      | **API**                                                     |
 |---------------------------------|-------------------------------------------------------------|
 | RMSNorm                         | `liger_kernel.transformers.LigerRMSNorm`                    |
-| Modulated RMSNorm               | `liger_kernel.transformers.LigerModulatedRMSNorm`           |
+| Modulated RMSNorm               | `liger_kernel.transformers.LigerModulatedRMSNorm`           |
 | LayerNorm                       | `liger_kernel.transformers.LigerLayerNorm`                  |
 | RoPE                            | `liger_kernel.transformers.liger_rotary_pos_emb`            |
 | SwiGLU                          | `liger_kernel.transformers.LigerSwiGLUMLP`                  |
diff --git a/benchmark/scripts/utils.py b/benchmark/scripts/utils.py
index 20c569ebb..457483f22 100644
--- a/benchmark/scripts/utils.py
+++ b/benchmark/scripts/utils.py
@@ -262,9 +262,9 @@ def get_formatted_time():
 
 
 def get_display_kernel_provider(kernel_provider: str) -> str:
-    backend = os.environ.get("LIGER_KERNEL_BACKEND", "").strip().lower()
-    if kernel_provider == "liger" and backend:
-        return f"liger-{backend}"
+    impl = os.environ.get("LIGER_KERNEL_IMPL", "").strip().lower()
+    if kernel_provider == "liger" and impl:
+        return f"liger-{impl}"
     return kernel_provider
 
 
diff --git a/src/liger_kernel/ops/__init__.py b/src/liger_kernel/ops/__init__.py
index 6eb2f7454..b37384ec3 100644
--- a/src/liger_kernel/ops/__init__.py
+++ b/src/liger_kernel/ops/__init__.py
@@ -1,12 +1,12 @@
 """
-Liger-Kernel operators with automatic backend-specific replacement.
+Liger-Kernel operators with automatic implementation-specific replacement.
 
 This module provides two ways to import operators:
 
 1. Import from this package (recommended for Function classes):
        from liger_kernel.ops import LigerGELUMulFunction
 
-   This automatically uses the active backend's implementation if any is selected.
+   This automatically uses the active implementation if any is selected.
 
 2. Import from submodules (for kernel functions or specific access):
        from liger_kernel.ops.geglu import geglu_forward, geglu_backward
@@ -16,10 +16,10 @@
 The replacement mechanism:
 1. Default implementations are imported from individual modules (e.g., geglu.py)
 2. On module load, device is detected via infer_device() and the env var
-   LIGER_KERNEL_BACKEND is read
-3. select_backend() picks an active backend (auto-applied for the device, or
-   explicitly requested via env var)
-4. If a backend is selected, its implementations replace/extend the symbols here
+   LIGER_KERNEL_IMPL is read
+3. select_impl() picks an active implementation (auto-applied for the device,
+   or explicitly requested via env var)
+4. If one is selected, its operators replace/extend the symbols here
 5. All subsequent imports from this package get the replaced versions
 
 Note: Direct imports from submodules (e.g., from liger_kernel.ops.geglu import ...)
@@ -101,63 +101,89 @@
 
 
 # =============================================================================
-# Backend-specific replacement logic
+# Implementation discovery + dispatch
 # =============================================================================
 
 
-def _replace_with_backend_ops():
+def _discover_impls():
     """
-    Replace/add backend-specific operator implementations.
+    Trigger self-registration of all implementations.
+
+    Two sources of implementations:
+      - Hardware backends in ``backends/_<name>/`` (loaded by
+        ``backends/__init__.py``'s own auto-import loop).
+      - DSL alternatives at the top level of ``ops/`` (e.g., ``cutile/``).
+        Each DSL subpackage's ``__init__.py`` calls ``register_impl()``
+        when imported.
+    """
+    import importlib
+    import pkgutil
+
+    # Hardware backends self-register when `backends` is imported.
+    importlib.import_module("liger_kernel.ops.backends")
+
+    # DSL alternatives — non-private subpackages of `ops/`, minus reserved
+    # directories that aren't implementation containers.
+    reserved = {"backends", "experimental"}
+    for _, modname, ispkg in pkgutil.iter_modules(__path__):
+        if ispkg and not modname.startswith("_") and modname not in reserved:
+            importlib.import_module(f"{__name__}.{modname}")
+
+
+def _replace_with_impl_ops():
+    """
+    Replace/add implementation-specific operators on top of the defaults.
 
     This function is called automatically on module load. It:
-    1. Detects the current device (cuda, npu, xpu, etc.)
-    2. Selects the active backend via ``select_backend()``, honoring any
-       explicit ``LIGER_KERNEL_BACKEND`` override
-    3. Loads and applies backend-specific implementations
+    1. Detects the current device (cuda, npu, xpu, etc.).
+    2. Selects the active implementation via ``select_impl()``, honoring any
+       explicit ``LIGER_KERNEL_IMPL`` override.
+    3. Loads and applies the implementation's operators.
 
-    Backend implementations should be placed in:
-        liger_kernel/ops/backends/_<name>/ops/
+    Implementations live either at:
+        liger_kernel/ops/<name>/ops/                     (DSL alternatives)
+        liger_kernel/ops/backends/_<name>/ops/           (hardware backends)
 
-    If the backend module defines __all__, only those symbols are exported.
-    Otherwise, all public symbols (not starting with _) are auto-discovered.
+    If the implementation module defines ``__all__``, only those symbols are
+    exported. Otherwise, all public symbols (not starting with ``_``) are
+    auto-discovered.
 
-    Note: Backends can both override existing ops AND add new backend-specific ops.
+    Note: Implementations can both override existing ops AND add new ones.
     """
     import os
 
-    from liger_kernel.ops.backends import LIGER_KERNEL_BACKEND_ENV
-    from liger_kernel.ops.backends import select_backend
+    from liger_kernel.ops.backends import LIGER_KERNEL_IMPL_ENV
+    from liger_kernel.ops.backends import select_impl
     from liger_kernel.utils import infer_device
 
     device = infer_device()
-    explicit = os.environ.get(LIGER_KERNEL_BACKEND_ENV, "").strip().lower() or None
-    backend_info = select_backend(device, explicit=explicit)
-    if backend_info is None:
+    explicit = os.environ.get(LIGER_KERNEL_IMPL_ENV, "").strip().lower() or None
+    impl_info = select_impl(device, explicit=explicit)
+    if impl_info is None:
         return
 
     try:
         import importlib
 
-        backend_ops = importlib.import_module(backend_info.module_path)
-
-        # Get names to export: use __all__ if defined, otherwise auto-discover
-        names_to_export = getattr(backend_ops, "__all__", None)
+        impl_ops = importlib.import_module(impl_info.module_path)
 
+        # Get names to export: use __all__ if defined, otherwise auto-discover.
+        names_to_export = getattr(impl_ops, "__all__", None)
         if names_to_export is None:
-            # Auto-discover: find all public symbols (classes and functions)
-            names_to_export = [name for name in dir(backend_ops) if not name.startswith("_")]
+            names_to_export = [name for name in dir(impl_ops) if not name.startswith("_")]
 
-        # Replace or add to this module's globals
+        # Replace or add to this module's globals.
         for name in names_to_export:
-            globals()[name] = getattr(backend_ops, name)
+            globals()[name] = getattr(impl_ops, name)
 
     except ImportError:
-        # An auto-selected backend that fails to import (e.g., missing optional
-        # vendor SDK in the environment) silently falls back to defaults. An
-        # explicitly-requested backend, however, must succeed — re-raise so the
+        # An auto-selected implementation that fails to import (e.g., missing
+        # optional vendor SDK) silently falls back to defaults. An explicitly
+        # requested implementation, however, must succeed — re-raise so the
         # user sees the underlying error.
         if explicit:
             raise
 
 
-_replace_with_backend_ops()
+_discover_impls()
+_replace_with_impl_ops()
diff --git a/src/liger_kernel/ops/backends/README.md b/src/liger_kernel/ops/backends/README.md
index ce445e44d..6d06ce7f3 100644
--- a/src/liger_kernel/ops/backends/README.md
+++ b/src/liger_kernel/ops/backends/README.md
@@ -1,86 +1,93 @@
-# Adding a New Backend
+# Adding a New Hardware Backend
 
-This directory contains backend-specific operator implementations that can replace the default Liger implementations.
+This directory holds **alternative hardware backends** — operator implementations for devices other than the default (CUDA). Examples: Ascend NPU, future ROCm, future XPU.
 
-A **backend** here is a named alternative implementation of Liger's operators. It may target a different hardware device (e.g., Ascend NPU vs. NVIDIA CUDA) or a different DSL on the same device (e.g., cuTile vs. Triton on CUDA), and may support one or more devices.
+DSL alternatives for the *default* hardware (CUDA) — cuTile, future CUTLASS / CuteDSL / TileLang — live at the top level of `src/liger_kernel/ops/` (peers of this `backends/` directory), not inside it. The contract for registering them is the same; only the on-disk location differs.
 
-Each backend declares two device sets:
+## Concepts
 
-- **`devices`** — every device the backend supports.
-- **`default_devices`** — the subset on which the backend is auto-applied at import time. On supported devices not listed here, the backend is opt-in only and must be requested explicitly via the `LIGER_KERNEL_BACKEND` environment variable.
+An **implementation** is a named alternative kernel set. Each implementation declares:
 
-## Concepts
+- **`name`** — identifier (e.g., `ascend`, `cutile`). Users select it via `LIGER_KERNEL_IMPL=<name>`.
+- **`devices`** — every device the implementation supports.
+- **`default_devices`** — the subset where it is auto-applied at import time. On supported devices not in this set, the implementation is opt-in only (requires `LIGER_KERNEL_IMPL=<name>`). Empty means opt-in only on every supported device.
+- **`module_path`** — the Python module path where the kernels live (e.g., `liger_kernel.ops.cutile.ops`, `liger_kernel.ops.backends._ascend.ops`).
+
+Two flavors fall out of the data:
 
-- **Device**: PyTorch device type returned by `infer_device()` (e.g., `cuda`, `npu`, `xpu`)
-- **BackendInfo**: Declarative description of a backend (name, supported devices, default devices)
-- **Auto-applied backend**: A backend whose `default_devices` includes the current device — applied automatically (e.g., Ascend on NPU)
-- **Opt-in backend**: A backend whose `default_devices` is empty (or excludes the current device) — applied only when `LIGER_KERNEL_BACKEND=<name>` is set (e.g., cuTile on CUDA)
+- **Auto-applied** — `default_devices` includes the current device. Replaces defaults automatically (e.g., Ascend on NPU).
+- **Opt-in** — only selected when the user sets `LIGER_KERNEL_IMPL=<name>` (e.g., cuTile on CUDA).
 
-## Directory Structure
+## Directory layout (full tree)
 
 ```
-backends/
-├── README.md
-├── __init__.py
-├── registry.py          # BackendInfo, register_backend(), BACKEND_REGISTRY, select_backend()
-├── _ascend/             # Ascend backend — auto-applied on NPU
-│   ├── __init__.py      # register_backend(BackendInfo(name="ascend", devices=("npu",), default_devices=("npu",)))
+src/liger_kernel/ops/
+├── jsd.py, rms_norm.py, ...            # default Triton-on-CUDA — the canonical kernels
+├── cutile/                              # opt-in DSL on CUDA
+│   ├── __init__.py                      # register_impl(ImplInfo(name="cutile", devices=("cuda",), module_path=...))
 │   └── ops/
-│       ├── __init__.py  # Exports backend-specific implementations
-│       └── geglu.py     # NPU-specific GEGLU implementation
-├── _cutile/             # cuTile backend — opt-in on CUDA
-│   ├── __init__.py      # register_backend(BackendInfo(name="cutile", devices=("cuda",)))
-│   └── ops/
-│       └── ...
-└── _<name>/             # Your new backend
-    └── ...
+│       ├── __init__.py
+│       └── jsd.py
+├── backends/                            # alternative hardware backends (this directory)
+│   ├── README.md
+│   ├── __init__.py                      # auto-imports _<name>/ subpackages
+│   ├── registry.py                      # ImplInfo, register_impl(), select_impl(), IMPL_REGISTRY
+│   └── _ascend/                         # Ascend NPU — auto-applied on NPU
+│       ├── __init__.py                  # register_impl(ImplInfo(name="ascend", ...))
+│       └── ops/
+│           ├── __init__.py
+│           └── jsd.py, ...
+└── __init__.py                          # imports defaults, runs _replace_with_impl_ops()
 ```
 
-## How It Works
+## How dispatch works
 
-1. When `liger_kernel.ops.backends` is imported, every `_<name>/` subpackage is auto-imported.
-2. Each backend's `__init__.py` calls `register_backend()` to register itself.
-3. When `liger_kernel.ops` is imported, `_replace_with_backend_ops()` is called.
-4. It detects the current device via `infer_device()` and reads `LIGER_KERNEL_BACKEND` from the environment.
-5. It calls `select_backend(device, explicit=...)`:
-   - If `LIGER_KERNEL_BACKEND` is **set**, the named backend is selected (validated against the current device).
-   - If `LIGER_KERNEL_BACKEND` is **unset**, the first registered backend that lists the current device in its `default_devices` is selected; otherwise the defaults are kept.
-6. The selected backend's operators replace/extend the symbols in the `liger_kernel.ops` namespace.
+1. `liger_kernel.ops` is imported. Default top-level kernels (`ops/jsd.py`, etc.) load first.
+2. `_discover_impls()` runs:
+   - Imports `liger_kernel.ops.backends`, which auto-imports each `_<name>/` subpackage. Each subpackage's `__init__.py` calls `register_impl()`.
+   - Iterates the top-level non-private subpackages of `ops/` (e.g., `cutile/`), excluding reserved dirs (`backends`, `experimental`), and imports each. Same self-registration pattern.
+3. `_replace_with_impl_ops()` runs:
+   - Detects the current device via `infer_device()`.
+   - Reads `LIGER_KERNEL_IMPL` from the environment.
+   - Calls `select_impl(device, explicit=<env var or None>)`:
+     - If the env var is set, the named implementation is looked up and validated (device must be in `devices`).
+     - If unset, the first registered implementation listing the current device in its `default_devices` is returned; otherwise no replacement happens.
+   - If an implementation was selected, its operators replace/extend the `liger_kernel.ops` namespace.
 
-If an auto-selected backend fails to import (e.g., the vendor SDK isn't installed), the dispatcher silently falls back to defaults. An explicitly-requested backend that fails to import re-raises so the user sees the underlying error.
+If an auto-selected implementation fails to import (e.g., the vendor SDK isn't installed), the dispatcher silently falls back to defaults. An explicitly-requested implementation that fails to import re-raises so the user sees the underlying error.
 
-## Adding a New Backend
+## Adding a new hardware backend (lives in `backends/_<name>/`)
 
-### Step 1: Create the directory structure
+### Step 1: Create the directory
 
 ```bash
-mkdir -p backends/_<name>/ops
-touch backends/_<name>/__init__.py
-touch backends/_<name>/ops/__init__.py
+mkdir -p src/liger_kernel/ops/backends/_<name>/ops
+touch src/liger_kernel/ops/backends/_<name>/__init__.py
+touch src/liger_kernel/ops/backends/_<name>/ops/__init__.py
 ```
 
-### Step 2: Register your backend
+### Step 2: Register the implementation
 
 In `backends/_<name>/__init__.py`:
 
 ```python
-"""
-<Name> backend for Liger-Kernel.
-"""
+"""<Name> hardware backend for Liger-Kernel."""
 
-from liger_kernel.ops.backends.registry import BackendInfo
-from liger_kernel.ops.backends.registry import register_backend
+from liger_kernel.ops.backends.registry import ImplInfo
+from liger_kernel.ops.backends.registry import register_impl
 
 # Auto-applied on the listed devices:
-register_backend(BackendInfo(name="<name>", devices=("<device>",), default_devices=("<device>",)))
-
-# Or, opt-in only (selected via LIGER_KERNEL_BACKEND=<name>):
-# register_backend(BackendInfo(name="<name>", devices=("<device>",)))
+register_impl(ImplInfo(
+    name="<name>",
+    devices=("<device>",),
+    default_devices=("<device>",),
+    module_path=f"{__name__}.ops",
+))
 ```
 
 ### Step 3: Ensure device detection works
 
-Make sure `infer_device()` in `liger_kernel/utils.py` recognizes your device:
+Make sure `infer_device()` in `liger_kernel/utils.py` recognizes the device. Example:
 
 ```python
 def infer_device():
@@ -88,13 +95,12 @@ def infer_device():
         return "cuda"
     if is_npu_available():
         return "npu"
-    # Add your device detection here
     if is_<device>_available():
         return "<device>"
     return "cpu"
 ```
 
-### Step 4: Implement backend-specific operators
+### Step 4: Implement the operators
 
 Create operator files in `backends/_<name>/ops/`. For example, `geglu.py`:
 
@@ -102,7 +108,7 @@ Create operator files in `backends/_<name>/ops/`. For example, `geglu.py`:
 import torch
 
 class LigerGELUMulFunction(torch.autograd.Function):
-    """Backend-specific LigerGELUMulFunction implementation."""
+    """Backend-specific LigerGELUMulFunction."""
 
     @staticmethod
     def forward(ctx, a, b):
@@ -111,28 +117,19 @@ class LigerGELUMulFunction(torch.autograd.Function):
     @staticmethod
     def backward(ctx, dc):
         ...
-
-def geglu_forward_backend(a, b):
-    ...
-
-def geglu_backward_backend(a, b, dc):
-    ...
 ```
 
-### Step 5: Export in `ops/__init__.py`
+### Step 5: Export from `ops/__init__.py`
 
 In `backends/_<name>/ops/__init__.py`:
 
 ```python
 """<Name>-specific operator implementations."""
 
-from .<module> import (
-    LigerGELUMulFunction,
-    geglu_forward_backend as geglu_forward,   # Rename to match default API
-    geglu_backward_backend as geglu_backward,
-)
+from .geglu import LigerGELUMulFunction
+from .geglu import geglu_backward
+from .geglu import geglu_forward
 
-# Explicitly declare what to export (recommended)
 __all__ = [
     "LigerGELUMulFunction",
     "geglu_forward",
@@ -140,46 +137,74 @@ __all__ = [
 ]
 ```
 
-## Key Points
+## Adding a new DSL implementation (lives at top level of `ops/`)
 
-### Incremental Override
+The pattern is the same — only the on-disk location and the `module_path` differ:
 
-You **don't need to implement all operators**. Only implement the ones that require backend-specific adaptations. Unimplemented operators automatically fall back to the default implementation.
+```
+src/liger_kernel/ops/<name>/
+├── __init__.py                          # register_impl(...)
+└── ops/
+    ├── __init__.py                      # exports symbols
+    └── jsd.py, ...                      # kernel files
+```
 
-### Backend-Specific Additions
+```python
+# ops/<name>/__init__.py
 
-Backends can also **add new operators** that don't exist in the default implementation. These will be exported to the `liger_kernel.ops` namespace for users to import.
+from liger_kernel.ops.backends.registry import ImplInfo
+from liger_kernel.ops.backends.registry import register_impl
 
-### Naming Convention
+# Opt-in only (no `default_devices`):
+register_impl(ImplInfo(
+    name="<name>",
+    devices=("cuda",),
+    module_path=f"{__name__}.ops",  # liger_kernel.ops.<name>.ops
+))
+```
 
-- Use the **same class/function names** as the default implementations for overrides
-- This allows seamless replacement without changing user code
-- Use `as` imports to rename if your internal naming differs
+## Key points
 
-### Multi-Device Backends
+### Incremental override
 
-A backend can support multiple devices by listing them all in `devices`. It can be the default on a subset (or none) of them. Examples:
+You **don't need to implement all operators**. Only implement the ones that need a different version. Unimplemented operators fall back to the defaults.
 
-```python
-# Supports CUDA and XPU; default on neither (opt-in everywhere):
-register_backend(BackendInfo(name="inductor", devices=("cuda", "xpu")))
+### Adding new operators
 
-# Supports CUDA and XPU; auto-applied on XPU only:
-register_backend(BackendInfo(name="example", devices=("cuda", "xpu"), default_devices=("xpu",)))
-```
+An implementation can also **add new operators** that don't exist in the defaults. They are exported to `liger_kernel.ops` for users to import.
 
-## Example: Ascend NPU Backend
+### Naming convention
 
-See `_ascend/` for a complete example of an auto-applied backend.
+- Use the **same class/function names** as the defaults when overriding — lets user code stay unchanged.
+- Use `as` imports to rename if your internal naming differs.
 
-## Example: cuTile Backend
+### Multi-device implementations
 
-See `_cutile/` for a complete example of an opt-in backend.
+An implementation can support multiple devices by listing them all in `devices`. It can be the default on a subset (or none) of them.
 
-Enable it on a CUDA device with:
+```python
+# Supports CUDA and XPU; default on neither (opt-in everywhere):
+register_impl(ImplInfo(
+    name="inductor",
+    devices=("cuda", "xpu"),
+    module_path="liger_kernel.ops.inductor.ops",
+))
 
-```bash
-LIGER_KERNEL_BACKEND=cutile python your_script.py
+# Supports CUDA and XPU; auto-applied on XPU only:
+register_impl(ImplInfo(
+    name="example",
+    devices=("cuda", "xpu"),
+    default_devices=("xpu",),
+    module_path="liger_kernel.ops.example.ops",
+))
 ```
 
-`select_backend()` validates the request: if the current device isn't in the backend's `devices`, or if `cuda-tile` isn't installed, the user gets a clear error instead of a silent fallback.
+## Examples in this repo
+
+- `backends/_ascend/` — auto-applied hardware backend (Ascend NPU).
+- `../cutile/` — opt-in DSL implementation on CUDA. Enable with:
+  ```bash
+  LIGER_KERNEL_IMPL=cutile python your_script.py
+  ```
+
+`select_impl()` validates the request: if the current device isn't in the implementation's `devices`, or its module fails to import, the user gets a clear error instead of a silent fallback.
diff --git a/src/liger_kernel/ops/backends/__init__.py b/src/liger_kernel/ops/backends/__init__.py
index f293646b0..f9d0fea38 100644
--- a/src/liger_kernel/ops/backends/__init__.py
+++ b/src/liger_kernel/ops/backends/__init__.py
@@ -1,14 +1,15 @@
 import importlib
 import pkgutil
 
-from liger_kernel.ops.backends.registry import BACKEND_REGISTRY  # noqa: F401
-from liger_kernel.ops.backends.registry import LIGER_KERNEL_BACKEND_ENV  # noqa: F401
-from liger_kernel.ops.backends.registry import BackendInfo  # noqa: F401
-from liger_kernel.ops.backends.registry import register_backend  # noqa: F401
-from liger_kernel.ops.backends.registry import select_backend  # noqa: F401
+from liger_kernel.ops.backends.registry import IMPL_REGISTRY  # noqa: F401
+from liger_kernel.ops.backends.registry import LIGER_KERNEL_IMPL_ENV  # noqa: F401
+from liger_kernel.ops.backends.registry import ImplInfo  # noqa: F401
+from liger_kernel.ops.backends.registry import register_impl  # noqa: F401
+from liger_kernel.ops.backends.registry import select_impl  # noqa: F401
 
-# Auto-import all _<backend> subpackages to trigger registration.
-# Each backend's __init__.py calls register_backend() when imported.
+# Auto-import all _<name> subpackages to trigger registration of
+# alternative-hardware backends (e.g., _ascend/). Each one calls register_impl()
+# in its __init__.py.
 for _, modname, ispkg in pkgutil.iter_modules(__path__):
     if ispkg and modname.startswith("_"):
         importlib.import_module(f"{__name__}.{modname}")
diff --git a/src/liger_kernel/ops/backends/_ascend/__init__.py b/src/liger_kernel/ops/backends/_ascend/__init__.py
index e7358b81e..f4ad594b5 100644
--- a/src/liger_kernel/ops/backends/_ascend/__init__.py
+++ b/src/liger_kernel/ops/backends/_ascend/__init__.py
@@ -1,5 +1,14 @@
-from liger_kernel.ops.backends.registry import BackendInfo
-from liger_kernel.ops.backends.registry import register_backend
+from liger_kernel.ops.backends.registry import ImplInfo
+from liger_kernel.ops.backends.registry import register_impl
 
 # Ascend NPU backend — default on NPU devices.
-register_backend(BackendInfo(name="ascend", devices=("npu",), default_devices=("npu",)))
+# Future: when tilelang-ascend lands, this can be renamed to "ascend-triton"
+# and a second register_impl(ImplInfo(name="ascend-tilelang", ...)) added.
+register_impl(
+    ImplInfo(
+        name="ascend",
+        devices=("npu",),
+        default_devices=("npu",),
+        module_path=f"{__name__}.ops",  # liger_kernel.ops.backends._ascend.ops
+    )
+)
diff --git a/src/liger_kernel/ops/backends/_cutile/__init__.py b/src/liger_kernel/ops/backends/_cutile/__init__.py
deleted file mode 100644
index d6f6a7663..000000000
--- a/src/liger_kernel/ops/backends/_cutile/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-"""
-cuTile backend for Liger-Kernel.
-
-cuTile is an optional CUDA backend. It is opt-in only — users select it
-explicitly via ``LIGER_KERNEL_BACKEND=cutile``. It is not auto-applied on
-any device (note the empty ``default_devices`` on the registration below).
-"""
-
-from liger_kernel.ops.backends.registry import BackendInfo
-from liger_kernel.ops.backends.registry import register_backend
-
-register_backend(BackendInfo(name="cutile", devices=("cuda",)))
diff --git a/src/liger_kernel/ops/backends/registry.py b/src/liger_kernel/ops/backends/registry.py
index a138bcc29..572d33720 100644
--- a/src/liger_kernel/ops/backends/registry.py
+++ b/src/liger_kernel/ops/backends/registry.py
@@ -1,17 +1,21 @@
 """
-Backend registry for Liger-Kernel multi-backend support.
+Implementation registry for Liger-Kernel.
 
-A "backend" here is a named implementation of Liger's operators. It may correspond
-to a different hardware device (e.g., Ascend on NPU) or a different DSL on the
-same device (e.g., cuTile on CUDA), and it may support one or more devices.
+An "implementation" here is a named alternative kernel set. It may correspond
+to a different hardware device (e.g., Ascend on NPU, in ``backends/_ascend/``)
+or a different DSL on the same device (e.g., cuTile on CUDA, in ``ops/cutile/``).
+It may support one or more devices.
 
-Each backend declares:
+Each implementation declares:
   - the set of devices it supports
   - the subset of those devices on which it is the *default* (auto-applied on
-    import). On any other supported device the backend is opt-in only and must
-    be requested explicitly via the LIGER_KERNEL_BACKEND environment variable.
+    import). On any other supported device the implementation is opt-in only
+    and must be requested explicitly via the LIGER_KERNEL_IMPL environment
+    variable.
+  - the Python module path where its operators live.
 
-Each backend registers itself by calling register_backend() in its __init__.py.
+Each implementation registers itself by calling register_impl() in its
+__init__.py.
 """
 
 from dataclasses import dataclass
@@ -19,91 +23,88 @@
 from typing import Optional
 from typing import Tuple
 
-# Dynamically get backends package path to avoid hardcoding
-_BACKENDS_PACKAGE = __name__.rsplit(".", 1)[0]  # "liger_kernel.ops.backends"
-
-# Environment variable users set to explicitly select an opt-in backend.
-LIGER_KERNEL_BACKEND_ENV = "LIGER_KERNEL_BACKEND"
+# Environment variable users set to explicitly select an opt-in implementation.
+LIGER_KERNEL_IMPL_ENV = "LIGER_KERNEL_IMPL"
 
 
 @dataclass(frozen=True)
-class BackendInfo:
+class ImplInfo:
     """
-    Information about a backend implementation.
+    Information about a kernel implementation.
 
     Attributes:
-        name: Backend identifier (e.g., "ascend", "cutile"). The on-disk
-            directory must be ``backends/_<name>/``.
-        devices: Tuple of device types this backend supports
+        name: Implementation identifier (e.g., "ascend", "cutile"). Also the
+            value users pass via ``LIGER_KERNEL_IMPL=<name>``.
+        devices: Tuple of device types this implementation supports
             (e.g., ``("npu",)``, ``("cuda",)``, ``("cuda", "xpu")``).
-        default_devices: Subset of ``devices`` on which this backend is
-            automatically applied at import time. On supported devices not
-            listed here, the backend is opt-in only via ``LIGER_KERNEL_BACKEND``.
-            Empty tuple (the default) means the backend is opt-in only on every
-            device it supports.
+        default_devices: Subset of ``devices`` on which this implementation
+            is automatically applied at import time. On supported devices not
+            listed here, it is opt-in only via ``LIGER_KERNEL_IMPL``. Empty
+            tuple (the default) means opt-in only on every supported device.
+        module_path: Python module path where the operator implementations
+            live (e.g., ``"liger_kernel.ops.cutile.ops"``). Required.
     """
 
     name: str
     devices: Tuple[str, ...]
     default_devices: Tuple[str, ...] = field(default_factory=tuple)
+    module_path: str = ""
 
     def __post_init__(self):
         if not self.devices:
-            raise ValueError(f"Backend {self.name!r} must declare at least one supported device.")
+            raise ValueError(f"Implementation {self.name!r} must declare at least one supported device.")
+        if not self.module_path:
+            raise ValueError(f"Implementation {self.name!r} must declare a module_path.")
         extra = set(self.default_devices) - set(self.devices)
         if extra:
             raise ValueError(
-                f"Backend {self.name!r}: default_devices {sorted(extra)} not in devices {list(self.devices)}."
+                f"Implementation {self.name!r}: default_devices {sorted(extra)} not in devices {list(self.devices)}."
             )
 
-    @property
-    def module_path(self) -> str:
-        """Auto-generated module path based on backend name."""
-        return f"{_BACKENDS_PACKAGE}._{self.name}.ops"
-
 
-# Registry mapping backend names to their info.
-BACKEND_REGISTRY: dict[str, BackendInfo] = {}
+# Registry mapping implementation names to their info.
+IMPL_REGISTRY: dict[str, ImplInfo] = {}
 
 
-def register_backend(info: BackendInfo) -> None:
-    """Register a backend's info in the global registry."""
-    BACKEND_REGISTRY[info.name] = info
+def register_impl(info: ImplInfo) -> None:
+    """Register an implementation's info in the global registry."""
+    IMPL_REGISTRY[info.name] = info
 
 
-def select_backend(device: str, explicit: Optional[str] = None) -> Optional[BackendInfo]:
+def select_impl(device: str, explicit: Optional[str] = None) -> Optional[ImplInfo]:
     """
-    Select the backend implementation for the current device.
+    Select the implementation for the current device.
 
     Args:
         device: Device type from ``infer_device()`` (e.g., "cuda", "npu").
-        explicit: If set, force selection of this named backend. The backend's
+        explicit: If set, force selection of this named implementation. The
             supported devices are validated against the runtime.
 
     Returns:
-        ``BackendInfo`` if a backend should replace the defaults, ``None`` to keep defaults.
+        ``ImplInfo`` if an implementation should replace the defaults,
+        ``None`` to keep defaults.
 
     Raises:
-        RuntimeError: If ``explicit`` names an unknown backend or is incompatible
-            with the current device.
+        RuntimeError: If ``explicit`` names an unknown implementation or one
+            incompatible with the current device.
     """
     if explicit:
-        info = BACKEND_REGISTRY.get(explicit)
+        info = IMPL_REGISTRY.get(explicit)
         if info is None:
-            known = ", ".join(sorted(BACKEND_REGISTRY)) or "<none registered>"
+            known = ", ".join(sorted(IMPL_REGISTRY)) or "<none registered>"
             raise RuntimeError(
-                f"Unknown {LIGER_KERNEL_BACKEND_ENV}={explicit!r}. Registered backends: {known}."
+                f"Unknown {LIGER_KERNEL_IMPL_ENV}={explicit!r}. Registered implementations: {known}."
             )
         if device not in info.devices:
             supported = ", ".join(info.devices)
             raise RuntimeError(
-                f"{LIGER_KERNEL_BACKEND_ENV}={info.name!r} supports devices ({supported}), "
+                f"{LIGER_KERNEL_IMPL_ENV}={info.name!r} supports devices ({supported}), "
                 f"but the current device is {device!r}."
             )
         return info
 
-    # Auto-select: pick a backend that declares the current device as one of its defaults.
-    for info in BACKEND_REGISTRY.values():
+    # Auto-select: pick an implementation that lists the current device in its defaults.
+    for info in IMPL_REGISTRY.values():
         if device in info.default_devices:
             return info
     return None
diff --git a/src/liger_kernel/ops/cutile/__init__.py b/src/liger_kernel/ops/cutile/__init__.py
new file mode 100644
index 000000000..031f257c1
--- /dev/null
+++ b/src/liger_kernel/ops/cutile/__init__.py
@@ -0,0 +1,18 @@
+"""
+cuTile backend for Liger-Kernel.
+
+cuTile is an optional CUDA-only DSL. It is opt-in only — users select it
+explicitly via ``LIGER_KERNEL_IMPL=cutile``. It is not auto-applied on
+any device (note the empty ``default_devices`` on the registration below).
+"""
+
+from liger_kernel.ops.backends.registry import ImplInfo
+from liger_kernel.ops.backends.registry import register_impl
+
+register_impl(
+    ImplInfo(
+        name="cutile",
+        devices=("cuda",),
+        module_path=f"{__name__}.ops",  # liger_kernel.ops.cutile.ops
+    )
+)
diff --git a/src/liger_kernel/ops/backends/_cutile/ops/__init__.py b/src/liger_kernel/ops/cutile/ops/__init__.py
similarity index 73%
rename from src/liger_kernel/ops/backends/_cutile/ops/__init__.py
rename to src/liger_kernel/ops/cutile/ops/__init__.py
index 96abe1c9c..a3dafe684 100644
--- a/src/liger_kernel/ops/backends/_cutile/ops/__init__.py
+++ b/src/liger_kernel/ops/cutile/ops/__init__.py
@@ -12,9 +12,9 @@
         "or `pip install 'liger-kernel[cutile-tileiras]'`."
     ) from exc
 
-from liger_kernel.ops.backends._cutile.ops.jsd import LigerJSDFunction
-from liger_kernel.ops.backends._cutile.ops.jsd import jsd_backward
-from liger_kernel.ops.backends._cutile.ops.jsd import jsd_forward
+from liger_kernel.ops.cutile.ops.jsd import LigerJSDFunction
+from liger_kernel.ops.cutile.ops.jsd import jsd_backward
+from liger_kernel.ops.cutile.ops.jsd import jsd_forward
 
 __all__ = [
     "LigerJSDFunction",
diff --git a/src/liger_kernel/ops/backends/_cutile/ops/jsd.py b/src/liger_kernel/ops/cutile/ops/jsd.py
similarity index 98%
rename from src/liger_kernel/ops/backends/_cutile/ops/jsd.py
rename to src/liger_kernel/ops/cutile/ops/jsd.py
index c70820f51..bacb02bfe 100644
--- a/src/liger_kernel/ops/backends/_cutile/ops/jsd.py
+++ b/src/liger_kernel/ops/cutile/ops/jsd.py
@@ -9,7 +9,7 @@
 import cuda.tile as ct
 import torch
 
-from liger_kernel.ops.backends._cutile.ops.utils import _next_power_of_2
+from liger_kernel.ops.cutile.ops.utils import _next_power_of_2
 from liger_kernel.ops.utils import ensure_contiguous
 
 ConstFloat = ct.Constant[float]
diff --git a/src/liger_kernel/ops/backends/_cutile/ops/utils.py b/src/liger_kernel/ops/cutile/ops/utils.py
similarity index 100%
rename from src/liger_kernel/ops/backends/_cutile/ops/utils.py
rename to src/liger_kernel/ops/cutile/ops/utils.py
diff --git a/test/transformers/test_cutile_backend.py b/test/transformers/test_cutile_backend.py
index 75c112f26..762d25ff4 100644
--- a/test/transformers/test_cutile_backend.py
+++ b/test/transformers/test_cutile_backend.py
@@ -11,10 +11,10 @@
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuTile backend requires CUDA")
 @pytest.mark.skipif(
-    os.environ.get("LIGER_KERNEL_BACKEND", "").strip().lower() != "cutile",
-    reason="cuTile backend selection test requires LIGER_KERNEL_BACKEND=cutile",
+    os.environ.get("LIGER_KERNEL_IMPL", "").strip().lower() != "cutile",
+    reason="cuTile backend selection test requires LIGER_KERNEL_IMPL=cutile",
 )
-def test_liger_kernel_backend_cutile_selects_cutile_jsd_function():
+def test_liger_kernel_impl_cutile_selects_cutile_jsd_function():
     repo_root = Path(__file__).resolve().parents[2]
     pythonpath = os.pathsep.join(
         [
@@ -25,7 +25,7 @@ def test_liger_kernel_backend_cutile_selects_cutile_jsd_function():
     )
     env = {
         **os.environ,
-        "LIGER_KERNEL_BACKEND": "cutile",
+        "LIGER_KERNEL_IMPL": "cutile",
         "PYTHONPATH": pythonpath,
     }
     script = textwrap.dedent(
@@ -33,7 +33,7 @@ def test_liger_kernel_backend_cutile_selects_cutile_jsd_function():
         from liger_kernel.transformers.jsd import LigerJSDFunction
 
         module_name = LigerJSDFunction.__module__
-        expected_prefix = "liger_kernel.ops.backends._cutile."
+        expected_prefix = "liger_kernel.ops.cutile."
         if not module_name.startswith(expected_prefix):
             raise AssertionError(
                 f"Expected cuTile LigerJSDFunction from {expected_prefix}, got {module_name}"

From 651911b9fcc9e77e1641d24201777f898664bad2 Mon Sep 17 00:00:00 2001
From: jinmanx <jinmanx@nvidia.com>
Date: Wed, 27 May 2026 10:20:34 -0700
Subject: [PATCH 6/7] use a seperate csv file

---
 benchmark/data/all_benchmark_data.csv        | 92 +++++++-------------
 benchmark/data/all_benchmark_data_cutile.csv | 33 +++++++
 benchmark/scripts/utils.py                   | 13 +--
 3 files changed, 69 insertions(+), 69 deletions(-)
 create mode 100644 benchmark/data/all_benchmark_data_cutile.csv

diff --git a/benchmark/data/all_benchmark_data.csv b/benchmark/data/all_benchmark_data.csv
index b9b9fa49d..7b9b0c1b0 100644
--- a/benchmark/data/all_benchmark_data.csv
+++ b/benchmark/data/all_benchmark_data.csv
@@ -2187,63 +2187,35 @@ fused_moe,huggingface,backward,memory,MB,E,num_experts,16,2072.1728515625,2072.1
 fused_moe,huggingface,backward,memory,MB,E,num_experts,32,2737.08349609375,2737.08349609375,2737.08349609375,"{""sweep_dim"": ""E"", ""T"": 8192, ""E"": null, ""H"": 2048, ""intermediate_dim"": 768, ""K"": 8, ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2026-04-02 23:59:56,0.7.0
 fused_moe,huggingface,backward,memory,MB,E,num_experts,64,4078.97021484375,4078.97021484375,4078.97021484375,"{""sweep_dim"": ""E"", ""T"": 8192, ""E"": null, ""H"": 2048, ""intermediate_dim"": 768, ""K"": 8, ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2026-04-02 23:59:56,0.7.0
 fused_moe,huggingface,backward,memory,MB,E,num_experts,128,6763.82275390625,6763.82275390625,6763.82275390625,"{""sweep_dim"": ""E"", ""T"": 8192, ""E"": null, ""H"": 2048, ""intermediate_dim"": 768, ""K"": 8, ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2026-04-02 23:59:56,0.7.0
-jsd,liger,forward,speed,ms,BT,B * T,1024,4.802591800689697,4.792543983459472,4.822508716583252,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:50,0.8.0
-jsd,liger,forward,speed,ms,BT,B * T,2048,8.209935665130615,8.20173397064209,8.231609344482422,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:50,0.8.0
-jsd,liger,forward,speed,ms,BT,B * T,4096,16.51097583770752,16.486431121826172,16.534496307373047,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:50,0.8.0
-jsd,liger,forward,speed,ms,BT,B * T,8192,33.019935607910156,32.965196228027345,33.029823303222656,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:50,0.8.0
-jsd,liger,forward,speed,ms,BT,B * T,16384,68.23961639404297,68.23961639404297,68.23961639404297,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:50,0.8.0
-jsd,torch,forward,speed,ms,BT,B * T,1024,2.262079954147339,2.2600127696990966,2.263961601257324,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:51,0.8.0
-jsd,torch,forward,speed,ms,BT,B * T,2048,4.448192119598389,4.446419334411622,4.4502272605896,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:51,0.8.0
-jsd,torch,forward,speed,ms,BT,B * T,4096,8.790271759033203,8.787808418273926,8.79417610168457,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:51,0.8.0
-jsd,torch,forward,speed,ms,BT,B * T,8192,17.53606414794922,17.53264045715332,17.541554260253907,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:51,0.8.0
-jsd,torch,forward,speed,ms,BT,B * T,16384,35.041263580322266,35.0344955444336,35.048031616210935,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:51,0.8.0
-jsd,liger,backward,speed,ms,BT,B * T,1024,0.6543359756469727,0.6532800197601318,0.6553919911384583,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:52,0.8.0
-jsd,liger,backward,speed,ms,BT,B * T,2048,1.1776319742202759,1.176576018333435,1.1796480417251587,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:52,0.8.0
-jsd,liger,backward,speed,ms,BT,B * T,4096,2.2847520112991333,2.2845120429992676,2.286623954772949,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:52,0.8.0
-jsd,liger,backward,speed,ms,BT,B * T,8192,4.443583965301514,4.440294361114502,4.446220779418946,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:52,0.8.0
-jsd,liger,backward,speed,ms,BT,B * T,16384,8.761343955993652,8.759296417236328,8.764415740966797,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:52,0.8.0
-jsd,torch,backward,speed,ms,BT,B * T,1024,3.628959894180298,3.627891206741333,3.6328320026397707,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:53,0.8.0
-jsd,torch,backward,speed,ms,BT,B * T,2048,7.150623798370361,7.148947238922119,7.15374059677124,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:53,0.8.0
-jsd,torch,backward,speed,ms,BT,B * T,4096,14.258943557739258,14.257344245910645,14.26035213470459,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:53,0.8.0
-jsd,torch,backward,speed,ms,BT,B * T,8192,28.43654441833496,28.435987091064455,28.436717224121097,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:53,0.8.0
-jsd,torch,backward,speed,ms,BT,B * T,16384,56.82896041870117,56.82896041870117,56.82896041870117,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:53,0.8.0
-jsd,liger,full,speed,ms,BT,B * T,1024,5.446160078048706,5.43007984161377,5.45614709854126,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:55,0.8.0
-jsd,liger,full,speed,ms,BT,B * T,2048,9.357344150543213,9.346265411376953,9.371161460876465,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:55,0.8.0
-jsd,liger,full,speed,ms,BT,B * T,4096,18.769920349121094,18.760723114013672,18.82929916381836,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:55,0.8.0
-jsd,liger,full,speed,ms,BT,B * T,8192,37.44615936279297,37.407769012451176,37.48454971313477,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:55,0.8.0
-jsd,liger,full,speed,ms,BT,B * T,16384,76.7877426147461,76.7877426147461,76.7877426147461,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:55,0.8.0
-jsd,torch,full,speed,ms,BT,B * T,1024,5.885536193847656,5.883008003234863,5.8876800537109375,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:56,0.8.0
-jsd,torch,full,speed,ms,BT,B * T,2048,11.584159851074219,11.5828031539917,11.587251472473145,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:56,0.8.0
-jsd,torch,full,speed,ms,BT,B * T,4096,23.026704788208008,23.025049972534177,23.02828178405762,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:56,0.8.0
-jsd,torch,full,speed,ms,BT,B * T,8192,45.981807708740234,45.97930145263672,45.98431396484375,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:56,0.8.0
-jsd,torch,full,speed,ms,BT,B * T,16384,91.8117446899414,91.8117446899414,91.8117446899414,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:56,0.8.0
-jsd,liger,full,memory,MB,BT,B * T,1024,3012.0048828125,3012.0048828125,3012.0048828125,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:58,0.8.0
-jsd,liger,full,memory,MB,BT,B * T,2048,6012.0087890625,6012.0087890625,6012.0087890625,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:58,0.8.0
-jsd,liger,full,memory,MB,BT,B * T,4096,12024.0166015625,12024.0166015625,12024.0166015625,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:58,0.8.0
-jsd,liger,full,memory,MB,BT,B * T,8192,24048.015625,24048.015625,24048.015625,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:58,0.8.0
-jsd,liger,full,memory,MB,BT,B * T,16384,48096.015625,48096.015625,48096.015625,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:58,0.8.0
-jsd,torch,full,memory,MB,BT,B * T,1024,6519.0009765625,6519.0009765625,6519.0009765625,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:59,0.8.0
-jsd,torch,full,memory,MB,BT,B * T,2048,13026.0009765625,13026.0009765625,13026.0009765625,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:59,0.8.0
-jsd,torch,full,memory,MB,BT,B * T,4096,26052.0,26052.0,26052.0,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:59,0.8.0
-jsd,torch,full,memory,MB,BT,B * T,8192,52104.0,52104.0,52104.0,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:59,0.8.0
-jsd,torch,full,memory,MB,BT,B * T,16384,104208.0,104208.0,104208.0,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:58:59,0.8.0
-jsd,liger-cutile,forward,speed,ms,BT,B * T,1024,0.7814080119132996,0.7793023943901062,0.7831360101699829,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:22,0.8.0
-jsd,liger-cutile,forward,speed,ms,BT,B * T,2048,1.4285119771957397,1.4254208087921143,1.4325439929962158,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:22,0.8.0
-jsd,liger-cutile,forward,speed,ms,BT,B * T,4096,2.7792000770568848,2.773011255264282,2.783692789077759,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:22,0.8.0
-jsd,liger-cutile,forward,speed,ms,BT,B * T,8192,5.50931191444397,5.502655982971191,5.513644886016845,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:22,0.8.0
-jsd,liger-cutile,forward,speed,ms,BT,B * T,16384,10.931103706359863,10.921529960632324,10.938668823242187,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:22,0.8.0
-jsd,liger-cutile,backward,speed,ms,BT,B * T,1024,0.6397919952869415,0.6379775881767273,0.6410239934921265,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:24,0.8.0
-jsd,liger-cutile,backward,speed,ms,BT,B * T,2048,1.1632959842681885,1.1621824026107788,1.1643712043762207,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:24,0.8.0
-jsd,liger-cutile,backward,speed,ms,BT,B * T,4096,2.2692480087280273,2.267148876190186,2.272768020629883,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:24,0.8.0
-jsd,liger-cutile,backward,speed,ms,BT,B * T,8192,4.425759792327881,4.422994995117188,4.428908729553223,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:24,0.8.0
-jsd,liger-cutile,backward,speed,ms,BT,B * T,16384,8.757247924804688,8.7510404586792,8.760255813598633,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:24,0.8.0
-jsd,liger-cutile,full,speed,ms,BT,B * T,1024,1.4090240001678467,1.4069759845733643,1.4106112003326414,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:26,0.8.0
-jsd,liger-cutile,full,speed,ms,BT,B * T,2048,2.5795199871063232,2.5753151893615724,2.5846080780029297,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:26,0.8.0
-jsd,liger-cutile,full,speed,ms,BT,B * T,4096,5.042175769805908,5.039103984832764,5.045452690124511,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:26,0.8.0
-jsd,liger-cutile,full,speed,ms,BT,B * T,8192,9.92411184310913,9.921158599853515,9.930310440063476,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:26,0.8.0
-jsd,liger-cutile,full,speed,ms,BT,B * T,16384,19.671167373657227,19.667975234985352,19.68026809692383,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:26,0.8.0
-jsd,liger-cutile,full,memory,MB,BT,B * T,1024,3012.00537109375,3012.00537109375,3012.00537109375,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:29,0.8.0
-jsd,liger-cutile,full,memory,MB,BT,B * T,2048,6012.00927734375,6012.00927734375,6012.00927734375,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:29,0.8.0
-jsd,liger-cutile,full,memory,MB,BT,B * T,4096,12024.017578125,12024.017578125,12024.017578125,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:29,0.8.0
-jsd,liger-cutile,full,memory,MB,BT,B * T,8192,24048.017578125,24048.017578125,24048.017578125,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:29,0.8.0
-jsd,liger-cutile,full,memory,MB,BT,B * T,16384,48096.015625,48096.015625,48096.015625,"{""vocab_size"": 128256}",NVIDIA B300 SXM6 AC,2026-05-18 09:59:29,0.8.0
+jsd,torch,full,speed,ms,BT,total tokens,1024,5.921823978424072,5.921823978424072,5.921823978424072,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:13:37,0.8.0
+jsd,torch,full,speed,ms,BT,total tokens,2048,12.200063705444336,12.200063705444336,12.200063705444336,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:13:37,0.8.0
+jsd,torch,full,speed,ms,BT,total tokens,4096,24.145984649658203,24.145984649658203,24.145984649658203,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:13:37,0.8.0
+jsd,torch,full,speed,ms,BT,total tokens,8192,50.45283126831055,50.45283126831055,50.45283126831055,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:13:37,0.8.0
+jsd,liger,full,speed,ms,BT,total tokens,1024,6.0959038734436035,6.0959038734436035,6.0959038734436035,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:28,0.8.0
+jsd,liger,full,speed,ms,BT,total tokens,2048,10.940447807312012,10.940447807312012,10.940447807312012,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:28,0.8.0
+jsd,liger,full,speed,ms,BT,total tokens,4096,21.781631469726562,21.781631469726562,21.781631469726562,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:28,0.8.0
+jsd,liger,full,speed,ms,BT,total tokens,8192,44.07699203491211,44.07699203491211,44.07699203491211,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:28,0.8.0
+jsd,torch,forward,speed,ms,BT,total tokens,1024,2.2900800704956055,2.2883904933929444,2.2906303882598875,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:29,0.8.0
+jsd,torch,forward,speed,ms,BT,total tokens,2048,4.97105598449707,4.9135422706604,5.02856969833374,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:29,0.8.0
+jsd,torch,forward,speed,ms,BT,total tokens,4096,9.907423973083496,9.907423973083496,9.907423973083496,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:29,0.8.0
+jsd,torch,forward,speed,ms,BT,total tokens,8192,20.02751922607422,20.02751922607422,20.02751922607422,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:29,0.8.0
+jsd,liger,forward,speed,ms,BT,total tokens,1024,5.783552169799805,5.783552169799805,5.783552169799805,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:29,0.8.0
+jsd,liger,forward,speed,ms,BT,total tokens,2048,9.110560417175293,9.110560417175293,9.110560417175293,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:29,0.8.0
+jsd,liger,forward,speed,ms,BT,total tokens,4096,18.322431564331055,18.322431564331055,18.322431564331055,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:29,0.8.0
+jsd,liger,forward,speed,ms,BT,total tokens,8192,37.44358444213867,37.44358444213867,37.44358444213867,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:29,0.8.0
+jsd,torch,backward,speed,ms,BT,total tokens,1024,3.7858558893203735,3.7852798938751224,3.786431884765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
+jsd,torch,backward,speed,ms,BT,total tokens,2048,7.665791988372803,7.665791988372803,7.665791988372803,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
+jsd,torch,backward,speed,ms,BT,total tokens,4096,15.20956802368164,15.20956802368164,15.20956802368164,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
+jsd,torch,backward,speed,ms,BT,total tokens,8192,30.310592651367188,30.310592651367188,30.310592651367188,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
+jsd,liger,backward,speed,ms,BT,total tokens,1024,1.0158560276031494,1.004588794708252,1.0225855827331543,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
+jsd,liger,backward,speed,ms,BT,total tokens,2048,1.8555200099945068,1.8544960021972656,1.8571839809417723,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
+jsd,liger,backward,speed,ms,BT,total tokens,4096,3.7145920991897583,3.7130560874938965,3.71612811088562,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
+jsd,liger,backward,speed,ms,BT,total tokens,8192,7.243807792663574,7.243807792663574,7.243807792663574,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
+jsd,torch,full,memory,MB,BT,total tokens,1024,6526.0009765625,6526.0009765625,6526.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
+jsd,torch,full,memory,MB,BT,total tokens,2048,13026.0009765625,13026.0009765625,13026.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
+jsd,torch,full,memory,MB,BT,total tokens,4096,26052.0,26052.0,26052.0,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
+jsd,torch,full,memory,MB,BT,total tokens,8192,52104.0,52104.0,52104.0,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
+jsd,liger,full,memory,MB,BT,total tokens,1024,3514.0009765625,3514.0009765625,3514.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:32,0.8.0
+jsd,liger,full,memory,MB,BT,total tokens,2048,7014.0009765625,7014.0009765625,7014.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:32,0.8.0
+jsd,liger,full,memory,MB,BT,total tokens,4096,14028.0009765625,14028.0009765625,14028.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:32,0.8.0
+jsd,liger,full,memory,MB,BT,total tokens,8192,28056.0,28056.0,28056.0,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:32,0.8.0
diff --git a/benchmark/data/all_benchmark_data_cutile.csv b/benchmark/data/all_benchmark_data_cutile.csv
new file mode 100644
index 000000000..deba35977
--- /dev/null
+++ b/benchmark/data/all_benchmark_data_cutile.csv
@@ -0,0 +1,33 @@
+kernel_name,kernel_provider,kernel_operation_mode,metric_name,metric_unit,x_name,x_label,x_value,y_value_50,y_value_20,y_value_80,extra_benchmark_config_str,gpu_name,timestamp,liger_version
+jsd,torch,full,speed,ms,BT,total tokens,1024,5.9279680252075195,5.9279680252075195,5.9279680252075195,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:12,0.8.0
+jsd,torch,full,speed,ms,BT,total tokens,2048,12.093536376953125,12.093536376953125,12.093536376953125,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:12,0.8.0
+jsd,torch,full,speed,ms,BT,total tokens,4096,24.353023529052734,24.353023529052734,24.353023529052734,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:12,0.8.0
+jsd,torch,full,speed,ms,BT,total tokens,8192,51.63132858276367,51.63132858276367,51.63132858276367,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:12,0.8.0
+jsd,liger,full,speed,ms,BT,total tokens,1024,1.5985119938850403,1.5944639444351196,1.6005439758300781,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:15,0.8.0
+jsd,liger,full,speed,ms,BT,total tokens,2048,3.0249600410461426,3.024307155609131,3.0514752864837646,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:15,0.8.0
+jsd,liger,full,speed,ms,BT,total tokens,4096,6.043647766113281,6.043647766113281,6.043647766113281,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:15,0.8.0
+jsd,liger,full,speed,ms,BT,total tokens,8192,12.18057632446289,12.18057632446289,12.18057632446289,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:15,0.8.0
+jsd,torch,forward,speed,ms,BT,total tokens,1024,2.2989439964294434,2.2989439964294434,2.298969554901123,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
+jsd,torch,forward,speed,ms,BT,total tokens,2048,4.600415945053101,4.598918342590332,4.60191354751587,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
+jsd,torch,forward,speed,ms,BT,total tokens,4096,9.270400047302246,9.270400047302246,9.270400047302246,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
+jsd,torch,forward,speed,ms,BT,total tokens,8192,19.314847946166992,19.314847946166992,19.314847946166992,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
+jsd,liger,forward,speed,ms,BT,total tokens,1024,0.9553920030593872,0.9492863893508912,0.9575616240501403,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
+jsd,liger,forward,speed,ms,BT,total tokens,2048,1.4541120529174805,1.4528576374053954,1.4553215980529786,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
+jsd,liger,forward,speed,ms,BT,total tokens,4096,2.5651841163635254,2.5584064960479735,2.5675840854644774,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
+jsd,liger,forward,speed,ms,BT,total tokens,8192,5.1241278648376465,5.1241278648376465,5.1241278648376465,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
+jsd,torch,backward,speed,ms,BT,total tokens,1024,3.8217118978500366,3.8216639041900637,3.82175989151001,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
+jsd,torch,backward,speed,ms,BT,total tokens,2048,7.542975902557373,7.542975902557373,7.542975902557373,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
+jsd,torch,backward,speed,ms,BT,total tokens,4096,15.150239944458008,15.150239944458008,15.150239944458008,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
+jsd,torch,backward,speed,ms,BT,total tokens,8192,30.65158462524414,30.65158462524414,30.65158462524414,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
+jsd,liger,backward,speed,ms,BT,total tokens,1024,1.018943965435028,1.0006976008415223,1.0215808391571044,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:17,0.8.0
+jsd,liger,backward,speed,ms,BT,total tokens,2048,1.8514400124549866,1.8510143756866455,1.8518656492233276,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:17,0.8.0
+jsd,liger,backward,speed,ms,BT,total tokens,4096,3.6808160543441772,3.680499267578125,3.6811328411102293,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:17,0.8.0
+jsd,liger,backward,speed,ms,BT,total tokens,8192,7.2151360511779785,7.2151360511779785,7.2151360511779785,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:17,0.8.0
+jsd,torch,full,memory,MB,BT,total tokens,1024,6526.0009765625,6526.0009765625,6526.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:17,0.8.0
+jsd,torch,full,memory,MB,BT,total tokens,2048,13026.0009765625,13026.0009765625,13026.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:17,0.8.0
+jsd,torch,full,memory,MB,BT,total tokens,4096,26052.0,26052.0,26052.0,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:17,0.8.0
+jsd,torch,full,memory,MB,BT,total tokens,8192,52104.0,52104.0,52104.0,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:17,0.8.0
+jsd,liger,full,memory,MB,BT,total tokens,1024,3514.0009765625,3514.0009765625,3514.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:18,0.8.0
+jsd,liger,full,memory,MB,BT,total tokens,2048,7014.0009765625,7014.0009765625,7014.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:18,0.8.0
+jsd,liger,full,memory,MB,BT,total tokens,4096,14028.0009765625,14028.0009765625,14028.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:18,0.8.0
+jsd,liger,full,memory,MB,BT,total tokens,8192,28056.0,28056.0,28056.0,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:18,0.8.0
diff --git a/benchmark/scripts/utils.py b/benchmark/scripts/utils.py
index 457483f22..8459cab84 100644
--- a/benchmark/scripts/utils.py
+++ b/benchmark/scripts/utils.py
@@ -261,13 +261,6 @@ def get_formatted_time():
     return time.strftime("%Y-%m-%d %H:%M:%S")
 
 
-def get_display_kernel_provider(kernel_provider: str) -> str:
-    impl = os.environ.get("LIGER_KERNEL_IMPL", "").strip().lower()
-    if kernel_provider == "liger" and impl:
-        return f"liger-{impl}"
-    return kernel_provider
-
-
 def get_gpu_name():
     """
     Returns the current GPU name, formatted to serve as a directory name
@@ -440,7 +433,7 @@ def run_benchmarks(
                 benchmark_run_data = BenchmarkData(
                     kernel_name=kernel_name,
                     kernel_operation_mode=kernel_operation_mode,
-                    kernel_provider=get_display_kernel_provider(kernel_provider),
+                    kernel_provider=kernel_provider,
                     metric_name=metric_name,
                     metric_unit=metric_unit,
                     gpu_name=gpu_name,
@@ -459,7 +452,9 @@ def run_benchmarks(
 
     print_benchmark_data(benchmark_data_list)
 
-    update_benchmark_data_csv(benchmark_data_list=benchmark_data_list, overwrite=overwrite)
+    impl_name = os.environ.get("LIGER_KERNEL_IMPL", "").strip().lower()
+    file_name = "all_benchmark_data.csv" if impl_name == "" else f"all_benchmark_data_{impl_name}.csv"
+    update_benchmark_data_csv(benchmark_data_list=benchmark_data_list, filename=file_name, overwrite=overwrite)
 
 
 def parse_benchmark_script_args():

From 2f049f9d2827286055c5c3372d96dce2268d46fb Mon Sep 17 00:00:00 2001
From: Vaibhav Jindal <vaibhav.jndl@gmail.com>
Date: Wed, 27 May 2026 16:36:51 -0700
Subject: [PATCH 7/7] Apply ruff format

---
 src/liger_kernel/ops/backends/registry.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/liger_kernel/ops/backends/registry.py b/src/liger_kernel/ops/backends/registry.py
index 572d33720..16e65b1bd 100644
--- a/src/liger_kernel/ops/backends/registry.py
+++ b/src/liger_kernel/ops/backends/registry.py
@@ -92,9 +92,7 @@ def select_impl(device: str, explicit: Optional[str] = None) -> Optional[ImplInf
         info = IMPL_REGISTRY.get(explicit)
         if info is None:
             known = ", ".join(sorted(IMPL_REGISTRY)) or "<none registered>"
-            raise RuntimeError(
-                f"Unknown {LIGER_KERNEL_IMPL_ENV}={explicit!r}. Registered implementations: {known}."
-            )
+            raise RuntimeError(f"Unknown {LIGER_KERNEL_IMPL_ENV}={explicit!r}. Registered implementations: {known}.")
         if device not in info.devices:
             supported = ", ".join(info.devices)
             raise RuntimeError(