HKUSTDial · LoserCheems · May 12, 2026 · May 11, 2026 · May 12, 2026 · May 12, 2026
diff --git a/flash_sparse_attn/ops/triton/autotuner.py b/flash_sparse_attn/ops/triton/autotuner.py
@@ -0,0 +1,247 @@
+import triton
+
+
+def get_fwd_dense_autotune_configs():
+    """Build the candidate ``triton.Config`` list consumed by ``make_fwd_dense_autotuned_kernel``."""
+    # 3 x 3 x 2 x 2 = 36 configs, ordered like nested loops over TILE_M, TILE_N, warps, stages.
+    return [
+        triton.Config(
+            {"TILE_M": block_m, "TILE_N": block_n},
+            num_warps=warps,
+            num_stages=stages,
+            num_ctas=1,
+        )
+        for block_m in [64, 128, 256]
+        for block_n in [32, 64, 128]
+        for warps in [4, 8]
+        for stages in [1, 2]
+    ]
+
+
+def get_fwd_sparse_autotune_configs():
+    """Build the candidate ``triton.Config`` list consumed by ``make_fwd_sparse_autotuned_kernel``."""
+    # 3 x 3 x 2 x 2 = 36 configs, ordered like nested loops over TILE_M, TILE_N, warps, stages.
+    return [
+        triton.Config(
+            {"TILE_M": block_m, "TILE_N": block_n},
+            num_warps=warps,
+            num_stages=stages,
+            num_ctas=1,
+        )
+        for block_m in [64, 128, 256]
+        for block_n in [32, 64, 128]
+        for warps in [4, 8]
+        for stages in [1, 2]
+    ]
+
+
+def get_fwd_gated_autotune_configs():
+    """Build the candidate ``triton.Config`` list consumed by ``make_fwd_gated_autotuned_kernel``."""
+    # 3 x 3 x 2 x 2 = 36 configs, ordered like nested loops over TILE_M, TILE_N, warps, stages.
+    return [
+        triton.Config(
+            {"TILE_M": block_m, "TILE_N": block_n},
+            num_warps=warps,
+            num_stages=stages,
+            num_ctas=1,
+        )
+        for block_m in [64, 128, 256]
+        for block_n in [32, 64, 128]
+        for warps in [4, 8]
+        for stages in [1, 2]
+    ]
+
+
+def get_bwd_dense_autotune_configs():
+    """Build the candidate ``triton.Config`` list consumed by ``make_bwd_dense_autotuned_kernel``."""
+    # 3 x 3 x 2 x 2 = 36 configs, ordered like nested loops over TILE_M, TILE_N, warps, stages.
+    return [
+        triton.Config(
+            {"TILE_M": block_m, "TILE_N": block_n},
+            num_warps=warps,
+            num_stages=stages,
+            num_ctas=1,
+        )
+        for block_m in [32, 64, 128]
+        for block_n in [64, 128, 256]
+        for warps in [4, 8]
+        for stages in [1, 2]
+    ]
+
+
+def get_bwd_sparse_autotune_configs():
+    """Build the candidate ``triton.Config`` list consumed by ``make_bwd_sparse_autotuned_kernel``."""
+    # 3 x 3 x 2 x 2 = 36 configs, ordered like nested loops over TILE_M, TILE_N, warps, stages.
+    return [
+        triton.Config(
+            {"TILE_M": block_m, "TILE_N": block_n},
+            num_warps=warps,
+            num_stages=stages,
+            num_ctas=1,
+        )
+        for block_m in [32, 64, 128]
+        for block_n in [64, 128, 256]
+        for warps in [4, 8]
+        for stages in [1, 2]
+    ]
+
+
+def get_bwd_gated_autotune_configs():
+    """Build the candidate ``triton.Config`` list consumed by ``make_bwd_gated_autotuned_kernel``."""
+    # 3 x 3 x 2 x 2 = 36 configs, ordered like nested loops over TILE_M, TILE_N, warps, stages.
+    return [
+        triton.Config(
+            {"TILE_M": block_m, "TILE_N": block_n},
+            num_warps=warps,
+            num_stages=stages,
+            num_ctas=1,
+        )
+        for block_m in [32, 64, 128]
+        for block_n in [64, 128, 256]
+        for warps in [4, 8]
+        for stages in [1, 2]
+    ]
+
+
+def get_dec_dense_autotune_configs():
+    """Build the candidate ``triton.Config`` list consumed by ``make_dec_dense_autotuned_kernel``."""
+    # 3 x 3 x 2 x 2 = 36 configs, ordered like nested loops over TILE_M, TILE_N, warps, stages.
+    return [
+        triton.Config(
+            {"TILE_M": block_m, "TILE_N": block_n},
+            num_warps=warps,
+            num_stages=stages,
+            num_ctas=1,
+        )
+        for block_m in [16, 32, 64]
+        for block_n in [64, 128, 256]
+        for warps in [4, 8]
+        for stages in [1, 2]
+    ]
+
+
+def get_dec_sparse_autotune_configs():
+    """Build the candidate ``triton.Config`` list consumed by ``make_dec_sparse_autotuned_kernel``."""
+    # 3 x 3 x 2 x 2 = 36 configs, ordered like nested loops over TILE_M, TILE_N, warps, stages.
+    return [
+        triton.Config(
+            {"TILE_M": block_m, "TILE_N": block_n},
+            num_warps=warps,
+            num_stages=stages,
+            num_ctas=1,
+        )
+        for block_m in [16, 32, 64]
+        for block_n in [64, 128, 256]
+        for warps in [4, 8]
+        for stages in [1, 2]
+    ]
+
+
+def get_dec_gated_autotune_configs():
+    """Build the candidate ``triton.Config`` list consumed by ``make_dec_gated_autotuned_kernel``."""
+    # 3 x 3 x 2 x 2 = 36 configs, ordered like nested loops over TILE_M, TILE_N, warps, stages.
+    return [
+        triton.Config(
+            {"TILE_M": block_m, "TILE_N": block_n},
+            num_warps=warps,
+            num_stages=stages,
+            num_ctas=1,
+        )
+        for block_m in [16, 32, 64]
+        for block_n in [64, 128, 256]
+        for warps in [4, 8]
+        for stages in [1, 2]
+    ]
+
+
+def make_fwd_dense_autotuned_kernel(jit_kernel):
+    """Return ``jit_kernel`` decorated by ``triton.autotune`` over ``get_fwd_dense_autotune_configs()``."""
+    # Kernel-argument names passed as the autotune ``key``.
+    cache_key = ["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"]
+    tuner = triton.autotune(configs=get_fwd_dense_autotune_configs(), key=cache_key)
+    return tuner(jit_kernel)
+
+
+def make_fwd_sparse_autotuned_kernel(jit_kernel):
+    """Return ``jit_kernel`` decorated by ``triton.autotune`` over ``get_fwd_sparse_autotune_configs()``."""
+    # Kernel-argument names passed as the autotune ``key``.
+    cache_key = ["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"]
+    tuner = triton.autotune(configs=get_fwd_sparse_autotune_configs(), key=cache_key)
+    return tuner(jit_kernel)
+
+
+def make_fwd_gated_autotuned_kernel(jit_kernel):
+    """Return ``jit_kernel`` decorated by ``triton.autotune`` over ``get_fwd_gated_autotune_configs()``."""
+    # Kernel-argument names passed as the autotune ``key``.
+    cache_key = ["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"]
+    tuner = triton.autotune(configs=get_fwd_gated_autotune_configs(), key=cache_key)
+    return tuner(jit_kernel)
+
+
+def make_bwd_dense_autotuned_kernel(jit_kernel):
+    """Return ``jit_kernel`` decorated by ``triton.autotune`` over ``get_bwd_dense_autotune_configs()``."""
+    # NOTE(review): tuning re-launches the kernel; if it accumulates into outputs, reset_to_zero/restore_value may be needed -- confirm.
+    cache_key = ["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"]
+    tuner = triton.autotune(configs=get_bwd_dense_autotune_configs(), key=cache_key)
+    return tuner(jit_kernel)
+
+
+def make_bwd_sparse_autotuned_kernel(jit_kernel):
+    """Return ``jit_kernel`` decorated by ``triton.autotune`` over ``get_bwd_sparse_autotune_configs()``."""
+    # NOTE(review): tuning re-launches the kernel; if it accumulates into outputs, reset_to_zero/restore_value may be needed -- confirm.
+    cache_key = ["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"]
+    tuner = triton.autotune(configs=get_bwd_sparse_autotune_configs(), key=cache_key)
+    return tuner(jit_kernel)
+
+
+def make_bwd_gated_autotuned_kernel(jit_kernel):
+    """Return ``jit_kernel`` decorated by ``triton.autotune`` over ``get_bwd_gated_autotune_configs()``."""
+    # NOTE(review): tuning re-launches the kernel; if it accumulates into outputs, reset_to_zero/restore_value may be needed -- confirm.
+    cache_key = ["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"]
+    tuner = triton.autotune(configs=get_bwd_gated_autotune_configs(), key=cache_key)
+    return tuner(jit_kernel)
+
+
+def make_dec_dense_autotuned_kernel(jit_kernel):
+    """Return ``jit_kernel`` decorated by ``triton.autotune`` over ``get_dec_dense_autotune_configs()``."""
+    # Kernel-argument names passed as the autotune ``key``.
+    cache_key = ["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"]
+    tuner = triton.autotune(configs=get_dec_dense_autotune_configs(), key=cache_key)
+    return tuner(jit_kernel)
+
+
+def make_dec_sparse_autotuned_kernel(jit_kernel):
+    """Return ``jit_kernel`` decorated by ``triton.autotune`` over ``get_dec_sparse_autotune_configs()``."""
+    # Kernel-argument names passed as the autotune ``key``.
+    cache_key = ["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"]
+    tuner = triton.autotune(configs=get_dec_sparse_autotune_configs(), key=cache_key)
+    return tuner(jit_kernel)
+
+
+def make_dec_gated_autotuned_kernel(jit_kernel):
+    """Return ``jit_kernel`` decorated by ``triton.autotune`` over ``get_dec_gated_autotune_configs()``."""
+    # Kernel-argument names passed as the autotune ``key``.
+    cache_key = ["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"]
+    tuner = triton.autotune(configs=get_dec_gated_autotune_configs(), key=cache_key)
+    return tuner(jit_kernel)
+
+
+class AutotunedKernel:
+    """Proxy over an autotuned kernel: ``proxy[grid](*a, **kw)`` drops ``STRIP_KWARGS`` from ``kw``, then launches; other attributes are forwarded."""
+    STRIP_KWARGS = {"TILE_M", "TILE_N", "num_warps", "num_stages", "num_ctas"}  # names set by the autotune configs in this module
+
+    def __init__(self, autotuned_kernel):
+        self._autotuned = autotuned_kernel
+
+    def __getitem__(self, grid):
+        autotuned = self._autotuned
+
+        def launch(*args, **kwargs):  # plain closure: avoids creating a new class object on every launch
+            kept = {k: v for k, v in kwargs.items() if k not in AutotunedKernel.STRIP_KWARGS}
+            return autotuned[grid](*args, **kept)
+        return launch
+
+    def __getattr__(self, name):
+        if name == "_autotuned":  # unset when __init__ was skipped (copy/pickle): raise instead of recursing forever
+            raise AttributeError(name)
+        return getattr(self._autotuned, name)
diff --git a/flash_sparse_attn/ops/triton/flash_dense_bwd.py b/flash_sparse_attn/ops/triton/flash_dense_bwd.py
@@ -17,6 +17,7 @@
     flash_bwd_preprocess,
     flash_bwd_postprocess,
     kernel_repr,
+    autotuner,
 )
 
 
@@ -163,6 +164,8 @@ def _bwd_dense_kernel(
     seqlen_q,
     seqlen_k,
     head_dim,
+    SEQLEN_Q_CACHE: tl.constexpr,
+    SEQLEN_K_CACHE: tl.constexpr,
     QHEADS_PER_KVHEAD: tl.constexpr,
     TILE_M: tl.constexpr,
     TILE_N: tl.constexpr,
@@ -639,6 +642,18 @@ def _bwd_dense_kernel(
 _bwd_dense_kernel = cache_utils.wrap_kernel(_bwd_dense_kernel)
 
 
+_bwd_dense_kernel_autotuned = None  # filled in by _get_autotuned_kernel() on first use
+
+
+def _get_autotuned_kernel():
+    """Return the module-wide ``AutotunedKernel`` for the dense backward pass, building it on the first call."""
+    global _bwd_dense_kernel_autotuned
+    if _bwd_dense_kernel_autotuned is None:
+        tuned = autotuner.make_bwd_dense_autotuned_kernel(_bwd_dense_kernel._kernel)
+        _bwd_dense_kernel_autotuned = autotuner.AutotunedKernel(tuned)
+    return _bwd_dense_kernel_autotuned
+
+
 def _flash_dense_attn_backward(
     query: torch.Tensor,
     key: torch.Tensor,
@@ -649,6 +664,7 @@ def _flash_dense_attn_backward(
     is_causal: bool = False,
     softmax_scale: float = None,
     window_size: Tuple[int, int] = (None, None),
+    is_autotune: bool = False,
     skip_checks: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     device = query.device
@@ -682,13 +698,19 @@ def _flash_dense_attn_backward(
 
     TILE_K = max(triton.next_power_of_2(head_dim), 16)
 
-    TILE_M, TILE_N, num_warps, num_stages, num_ctas = (
-        launch_template.get_bwd_dense_launch_config(
-            tile_k=TILE_K,
-            device=device,
-            arch=arch,
+    if is_autotune:
+        kernel = _get_autotuned_kernel()
+        TILE_M = TILE_N = 64
+        num_warps = num_stages = num_ctas = None
+    else:
+        kernel = _bwd_dense_kernel
+        TILE_M, TILE_N, num_warps, num_stages, num_ctas = (
+            launch_template.get_bwd_dense_launch_config(
+                tile_k=TILE_K,
+                device=device,
+                arch=arch,
+            )
         )
-    )
 
     seqlen_q_rounded = int(math.ceil(seqlen_q / TILE_M) * TILE_M)
     head_dim_rounded = int(math.ceil(head_dim / 32) * 32)
@@ -746,7 +768,7 @@ def _flash_dense_attn_backward(
         batch_size=batch_size,
     )
 
-    _bwd_dense_kernel[grid](
+    kernel[grid](
         query,
         key,
         value,
@@ -789,9 +811,11 @@ def _flash_dense_attn_backward(
         None,
         None,
         None,
-        seqlen_q,
-        seqlen_k,
-        head_dim,
+        seqlen_q=seqlen_q,
+        seqlen_k=seqlen_k,
+        head_dim=head_dim,
+        SEQLEN_Q_CACHE=seqlen_q // 1024,
+        SEQLEN_K_CACHE=seqlen_k // 1024,
         QHEADS_PER_KVHEAD=qhead_per_kvhead,
         TILE_M=TILE_M,
         TILE_N=TILE_N,
@@ -840,6 +864,7 @@ def _flash_dense_attn_varlen_backward(
     window_size: Tuple[int, int] = (None, None),
     seqused_q: Optional[torch.Tensor] = None,
     seqused_k: Optional[torch.Tensor] = None,
+    is_autotune: bool = False,
     skip_checks: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     device = query.device
@@ -876,13 +901,19 @@ def _flash_dense_attn_varlen_backward(
 
     TILE_K = max(triton.next_power_of_2(head_dim), 16)
 
-    TILE_M, TILE_N, num_warps, num_stages, num_ctas = (
-        launch_template.get_bwd_dense_launch_config(
-            tile_k=TILE_K,
-            device=device,
-            arch=arch,
+    if is_autotune:
+        kernel = _get_autotuned_kernel()
+        TILE_M = TILE_N = 64
+        num_warps = num_stages = num_ctas = None
+    else:
+        kernel = _bwd_dense_kernel
+        TILE_M, TILE_N, num_warps, num_stages, num_ctas = (
+            launch_template.get_bwd_dense_launch_config(
+                tile_k=TILE_K,
+                device=device,
+                arch=arch,
+            )
         )
-    )
 
     total_q_rounded_padded = int(
         math.ceil((total_q + batch_size * TILE_M) / TILE_M) * TILE_M
@@ -946,7 +977,7 @@ def _flash_dense_attn_varlen_backward(
         batch_size=batch_size,
     )
 
-    _bwd_dense_kernel[grid](
+    kernel[grid](
         query,
         key,
         value,
@@ -989,9 +1020,11 @@ def _flash_dense_attn_varlen_backward(
         cu_seqlens_k,
         seqused_q,
         seqused_k,
-        seqlen_q,
-        seqlen_k,
-        head_dim,
+        seqlen_q=seqlen_q,
+        seqlen_k=seqlen_k,
+        head_dim=head_dim,
+        SEQLEN_Q_CACHE=seqlen_q // 1024,
+        SEQLEN_K_CACHE=seqlen_k // 1024,
         QHEADS_PER_KVHEAD=qhead_per_kvhead,
         TILE_M=TILE_M,
         TILE_N=TILE_N,