Skip to content
Merged
20 changes: 19 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ pip3 install --pre torch torchvision torchaudio --index-url https://download.pyt
### Optional Dependencies

- `transformers >= 4.x`: Required if you plan to use the transformers models patching APIs. The specific model you are working with will dictate the minimum version of transformers.
- `cuda-tile`: Required when enabling the optional cuTile backend on CUDA. Use this when your environment already provides CUDA Toolkit 13.1 or newer, or an existing tileiras compiler installation.
- `cuda-tile[tileiras]`: Required when enabling the optional cuTile backend with the tileiras compiler installed directly into your Python environment.

> **Note:**
> Our kernels inherit the full spectrum of hardware compatibility offered by [Triton](https://github.com/triton-lang/triton).
Expand Down Expand Up @@ -168,10 +170,26 @@ pip install -e .
# Setup Development Dependencies
pip install -e ".[dev]"

# Setup cuTile Dependencies
pip install -e ".[cutile]"

# Or install cuTile with the optional tileiras compiler
pip install -e ".[cutile-tileiras]"

# NOTE -> For AMD users only
pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3/
```

### Enable cuTile Backend

cuTile is an optional CUDA-only DSL implementation. After installing the `cutile` or `cutile-tileiras` extra, enable it explicitly:

```bash
LIGER_KERNEL_IMPL=cutile python your_script.py
```

`LIGER_KERNEL_IMPL` selects an opt-in implementation registered with Liger (currently `cutile`). Selecting one on an unsupported device, or without the required dependencies installed, raises an error.


## Getting Started

Expand Down Expand Up @@ -290,7 +308,7 @@ loss.backward()
| **Kernel** | **API** |
|---------------------------------|-------------------------------------------------------------|
| RMSNorm | `liger_kernel.transformers.LigerRMSNorm` |
| Modulated RMSNorm | `liger_kernel.transformers.LigerModulatedRMSNorm` |
| Modulated RMSNorm | `liger_kernel.transformers.LigerModulatedRMSNorm` |
| LayerNorm | `liger_kernel.transformers.LigerLayerNorm` |
| RoPE | `liger_kernel.transformers.liger_rotary_pos_emb` |
| SwiGLU | `liger_kernel.transformers.LigerSwiGLUMLP` |
Expand Down
32 changes: 32 additions & 0 deletions benchmark/data/all_benchmark_data.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2187,3 +2187,35 @@ fused_moe,huggingface,backward,memory,MB,E,num_experts,16,2072.1728515625,2072.1
fused_moe,huggingface,backward,memory,MB,E,num_experts,32,2737.08349609375,2737.08349609375,2737.08349609375,"{""sweep_dim"": ""E"", ""T"": 8192, ""E"": null, ""H"": 2048, ""intermediate_dim"": 768, ""K"": 8, ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2026-04-02 23:59:56,0.7.0
fused_moe,huggingface,backward,memory,MB,E,num_experts,64,4078.97021484375,4078.97021484375,4078.97021484375,"{""sweep_dim"": ""E"", ""T"": 8192, ""E"": null, ""H"": 2048, ""intermediate_dim"": 768, ""K"": 8, ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2026-04-02 23:59:56,0.7.0
fused_moe,huggingface,backward,memory,MB,E,num_experts,128,6763.82275390625,6763.82275390625,6763.82275390625,"{""sweep_dim"": ""E"", ""T"": 8192, ""E"": null, ""H"": 2048, ""intermediate_dim"": 768, ""K"": 8, ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2026-04-02 23:59:56,0.7.0
jsd,torch,full,speed,ms,BT,total tokens,1024,5.921823978424072,5.921823978424072,5.921823978424072,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:13:37,0.8.0
jsd,torch,full,speed,ms,BT,total tokens,2048,12.200063705444336,12.200063705444336,12.200063705444336,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:13:37,0.8.0
jsd,torch,full,speed,ms,BT,total tokens,4096,24.145984649658203,24.145984649658203,24.145984649658203,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:13:37,0.8.0
jsd,torch,full,speed,ms,BT,total tokens,8192,50.45283126831055,50.45283126831055,50.45283126831055,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:13:37,0.8.0
jsd,liger,full,speed,ms,BT,total tokens,1024,6.0959038734436035,6.0959038734436035,6.0959038734436035,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:28,0.8.0
jsd,liger,full,speed,ms,BT,total tokens,2048,10.940447807312012,10.940447807312012,10.940447807312012,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:28,0.8.0
jsd,liger,full,speed,ms,BT,total tokens,4096,21.781631469726562,21.781631469726562,21.781631469726562,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:28,0.8.0
jsd,liger,full,speed,ms,BT,total tokens,8192,44.07699203491211,44.07699203491211,44.07699203491211,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:28,0.8.0
jsd,torch,forward,speed,ms,BT,total tokens,1024,2.2900800704956055,2.2883904933929444,2.2906303882598875,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:29,0.8.0
jsd,torch,forward,speed,ms,BT,total tokens,2048,4.97105598449707,4.9135422706604,5.02856969833374,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:29,0.8.0
jsd,torch,forward,speed,ms,BT,total tokens,4096,9.907423973083496,9.907423973083496,9.907423973083496,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:29,0.8.0
jsd,torch,forward,speed,ms,BT,total tokens,8192,20.02751922607422,20.02751922607422,20.02751922607422,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:29,0.8.0
jsd,liger,forward,speed,ms,BT,total tokens,1024,5.783552169799805,5.783552169799805,5.783552169799805,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:29,0.8.0
jsd,liger,forward,speed,ms,BT,total tokens,2048,9.110560417175293,9.110560417175293,9.110560417175293,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:29,0.8.0
jsd,liger,forward,speed,ms,BT,total tokens,4096,18.322431564331055,18.322431564331055,18.322431564331055,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:29,0.8.0
jsd,liger,forward,speed,ms,BT,total tokens,8192,37.44358444213867,37.44358444213867,37.44358444213867,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:29,0.8.0
jsd,torch,backward,speed,ms,BT,total tokens,1024,3.7858558893203735,3.7852798938751224,3.786431884765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
jsd,torch,backward,speed,ms,BT,total tokens,2048,7.665791988372803,7.665791988372803,7.665791988372803,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
jsd,torch,backward,speed,ms,BT,total tokens,4096,15.20956802368164,15.20956802368164,15.20956802368164,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
jsd,torch,backward,speed,ms,BT,total tokens,8192,30.310592651367188,30.310592651367188,30.310592651367188,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
jsd,liger,backward,speed,ms,BT,total tokens,1024,1.0158560276031494,1.004588794708252,1.0225855827331543,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
jsd,liger,backward,speed,ms,BT,total tokens,2048,1.8555200099945068,1.8544960021972656,1.8571839809417723,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
jsd,liger,backward,speed,ms,BT,total tokens,4096,3.7145920991897583,3.7130560874938965,3.71612811088562,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
jsd,liger,backward,speed,ms,BT,total tokens,8192,7.243807792663574,7.243807792663574,7.243807792663574,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
jsd,torch,full,memory,MB,BT,total tokens,1024,6526.0009765625,6526.0009765625,6526.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
jsd,torch,full,memory,MB,BT,total tokens,2048,13026.0009765625,13026.0009765625,13026.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
jsd,torch,full,memory,MB,BT,total tokens,4096,26052.0,26052.0,26052.0,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
jsd,torch,full,memory,MB,BT,total tokens,8192,52104.0,52104.0,52104.0,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:30,0.8.0
jsd,liger,full,memory,MB,BT,total tokens,1024,3514.0009765625,3514.0009765625,3514.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:32,0.8.0
jsd,liger,full,memory,MB,BT,total tokens,2048,7014.0009765625,7014.0009765625,7014.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:32,0.8.0
jsd,liger,full,memory,MB,BT,total tokens,4096,14028.0009765625,14028.0009765625,14028.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:32,0.8.0
jsd,liger,full,memory,MB,BT,total tokens,8192,28056.0,28056.0,28056.0,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:14:32,0.8.0
33 changes: 33 additions & 0 deletions benchmark/data/all_benchmark_data_cutile.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
kernel_name,kernel_provider,kernel_operation_mode,metric_name,metric_unit,x_name,x_label,x_value,y_value_50,y_value_20,y_value_80,extra_benchmark_config_str,gpu_name,timestamp,liger_version
jsd,torch,full,speed,ms,BT,total tokens,1024,5.9279680252075195,5.9279680252075195,5.9279680252075195,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:12,0.8.0
jsd,torch,full,speed,ms,BT,total tokens,2048,12.093536376953125,12.093536376953125,12.093536376953125,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:12,0.8.0
jsd,torch,full,speed,ms,BT,total tokens,4096,24.353023529052734,24.353023529052734,24.353023529052734,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:12,0.8.0
jsd,torch,full,speed,ms,BT,total tokens,8192,51.63132858276367,51.63132858276367,51.63132858276367,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:12,0.8.0
jsd,liger,full,speed,ms,BT,total tokens,1024,1.5985119938850403,1.5944639444351196,1.6005439758300781,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:15,0.8.0
jsd,liger,full,speed,ms,BT,total tokens,2048,3.0249600410461426,3.024307155609131,3.0514752864837646,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:15,0.8.0
jsd,liger,full,speed,ms,BT,total tokens,4096,6.043647766113281,6.043647766113281,6.043647766113281,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:15,0.8.0
jsd,liger,full,speed,ms,BT,total tokens,8192,12.18057632446289,12.18057632446289,12.18057632446289,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:15,0.8.0
jsd,torch,forward,speed,ms,BT,total tokens,1024,2.2989439964294434,2.2989439964294434,2.298969554901123,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
jsd,torch,forward,speed,ms,BT,total tokens,2048,4.600415945053101,4.598918342590332,4.60191354751587,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
jsd,torch,forward,speed,ms,BT,total tokens,4096,9.270400047302246,9.270400047302246,9.270400047302246,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
jsd,torch,forward,speed,ms,BT,total tokens,8192,19.314847946166992,19.314847946166992,19.314847946166992,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
jsd,liger,forward,speed,ms,BT,total tokens,1024,0.9553920030593872,0.9492863893508912,0.9575616240501403,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
jsd,liger,forward,speed,ms,BT,total tokens,2048,1.4541120529174805,1.4528576374053954,1.4553215980529786,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
jsd,liger,forward,speed,ms,BT,total tokens,4096,2.5651841163635254,2.5584064960479735,2.5675840854644774,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
jsd,liger,forward,speed,ms,BT,total tokens,8192,5.1241278648376465,5.1241278648376465,5.1241278648376465,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
jsd,torch,backward,speed,ms,BT,total tokens,1024,3.8217118978500366,3.8216639041900637,3.82175989151001,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
jsd,torch,backward,speed,ms,BT,total tokens,2048,7.542975902557373,7.542975902557373,7.542975902557373,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
jsd,torch,backward,speed,ms,BT,total tokens,4096,15.150239944458008,15.150239944458008,15.150239944458008,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
jsd,torch,backward,speed,ms,BT,total tokens,8192,30.65158462524414,30.65158462524414,30.65158462524414,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:16,0.8.0
jsd,liger,backward,speed,ms,BT,total tokens,1024,1.018943965435028,1.0006976008415223,1.0215808391571044,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:17,0.8.0
jsd,liger,backward,speed,ms,BT,total tokens,2048,1.8514400124549866,1.8510143756866455,1.8518656492233276,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:17,0.8.0
jsd,liger,backward,speed,ms,BT,total tokens,4096,3.6808160543441772,3.680499267578125,3.6811328411102293,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:17,0.8.0
jsd,liger,backward,speed,ms,BT,total tokens,8192,7.2151360511779785,7.2151360511779785,7.2151360511779785,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:17,0.8.0
jsd,torch,full,memory,MB,BT,total tokens,1024,6526.0009765625,6526.0009765625,6526.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:17,0.8.0
jsd,torch,full,memory,MB,BT,total tokens,2048,13026.0009765625,13026.0009765625,13026.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:17,0.8.0
jsd,torch,full,memory,MB,BT,total tokens,4096,26052.0,26052.0,26052.0,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:17,0.8.0
jsd,torch,full,memory,MB,BT,total tokens,8192,52104.0,52104.0,52104.0,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:17,0.8.0
jsd,liger,full,memory,MB,BT,total tokens,1024,3514.0009765625,3514.0009765625,3514.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:18,0.8.0
jsd,liger,full,memory,MB,BT,total tokens,2048,7014.0009765625,7014.0009765625,7014.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:18,0.8.0
jsd,liger,full,memory,MB,BT,total tokens,4096,14028.0009765625,14028.0009765625,14028.0009765625,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:18,0.8.0
jsd,liger,full,memory,MB,BT,total tokens,8192,28056.0,28056.0,28056.0,"{""vocab_size"": 128256, ""bsz"": 1, ""seq_len"": 8192}",NVIDIA B200,2026-05-27 17:16:18,0.8.0
4 changes: 3 additions & 1 deletion benchmark/scripts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,9 @@ def run_benchmarks(

print_benchmark_data(benchmark_data_list)

update_benchmark_data_csv(benchmark_data_list=benchmark_data_list, overwrite=overwrite)
impl_name = os.environ.get("LIGER_KERNEL_IMPL", "").strip().lower()
file_name = "all_benchmark_data.csv" if impl_name == "" else f"all_benchmark_data_{impl_name}.csv"
update_benchmark_data_csv(benchmark_data_list=benchmark_data_list, filename=file_name, overwrite=overwrite)


def parse_benchmark_script_args():
Expand Down
11 changes: 10 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,16 @@ def get_default_dependencies():

def get_optional_dependencies():
"""Get optional dependency groups."""
cutile_deps = [
"cuda-tile",
]
cutile_tileiras_deps = [
"cuda-tile[tileiras]",
]

return {
"cutile": cutile_deps,
"cutile-tileiras": cutile_tileiras_deps,
"dev": [
"transformers>=4.52.0",
"matplotlib>=3.7.2",
Expand All @@ -48,7 +57,7 @@ def get_optional_dependencies():
"mkdocs-material",
"torchvision>=0.20",
"prek>=0.2.28",
]
],
}


Expand Down
Loading
Loading