HKUSTDial · LoserCheems · May 24, 2026 · May 16, 2026 · May 16, 2026 · May 16, 2026
diff --git a/flash_sparse_attn/ops/triton/autotuner.py b/flash_sparse_attn/ops/triton/autotuner.py
@@ -1,12 +1,113 @@
+import torch
 import triton
 
 
+def _get_max_shared_mem():
+    props = torch.cuda.get_device_properties(torch.cuda.current_device())
+    return getattr(
+        props, "shared_memory_per_block_optin", props.shared_memory_per_block
+    )
+
+
+def _smem_bytes_fwd(tile_m, tile_n, tile_k, num_stages, dtype_bytes):
+    resident = tile_m * tile_k * dtype_bytes
+    pipelined = 2 * tile_n * tile_k * dtype_bytes * num_stages
+    return resident + pipelined
+
+
+def _smem_bytes_bwd(tile_m, tile_n, tile_k, num_stages, dtype_bytes):
+    resident = 2 * tile_n * tile_k * dtype_bytes
+    pipelined = 2 * tile_m * tile_k * dtype_bytes * num_stages
+    return resident + pipelined
+
+
+def _smem_bytes_dec(tile_m, tile_n, tile_k, num_stages, dtype_bytes):
+    resident = tile_m * tile_k * dtype_bytes
+    pipelined = 2 * tile_n * tile_k * dtype_bytes * num_stages
+    return resident + pipelined
+
+
+def _prune_fwd_configs(configs, named_args, **kwargs):
+    tile_k = kwargs.get("TILE_K", named_args.get("TILE_K", 128))
+    dtype_bytes = named_args["Q"].element_size()
+    max_smem = _get_max_shared_mem() - 4 * 1024
+    pruned = []
+    for cfg in configs:
+        tm, tn, ns = cfg.kwargs["TILE_M"], cfg.kwargs["TILE_N"], cfg.num_stages
+        if _smem_bytes_fwd(tm, tn, tile_k, ns, dtype_bytes) <= max_smem:
+            pruned.append(cfg)
+    if not pruned:
+        pruned = [
+            min(
+                configs,
+                key=lambda c: _smem_bytes_fwd(
+                    c.kwargs["TILE_M"],
+                    c.kwargs["TILE_N"],
+                    tile_k,
+                    c.num_stages,
+                    dtype_bytes,
+                ),
+            )
+        ]
+    return pruned
+
+
+def _prune_bwd_configs(configs, named_args, **kwargs):
+    tile_k = kwargs.get("TILE_K", named_args.get("TILE_K", 128))
+    dtype_bytes = named_args["Q"].element_size()
+    max_smem = _get_max_shared_mem() - 4 * 1024
+    pruned = []
+    for cfg in configs:
+        tm, tn, ns = cfg.kwargs["TILE_M"], cfg.kwargs["TILE_N"], cfg.num_stages
+        if _smem_bytes_bwd(tm, tn, tile_k, ns, dtype_bytes) <= max_smem:
+            pruned.append(cfg)
+    if not pruned:
+        pruned = [
+            min(
+                configs,
+                key=lambda c: _smem_bytes_bwd(
+                    c.kwargs["TILE_M"],
+                    c.kwargs["TILE_N"],
+                    tile_k,
+                    c.num_stages,
+                    dtype_bytes,
+                ),
+            )
+        ]
+    return pruned
+
+
+def _prune_dec_configs(configs, named_args, **kwargs):
+    tile_k = kwargs.get("TILE_K", named_args.get("TILE_K", 128))
+    dtype_bytes = named_args["Q"].element_size()
+    max_smem = _get_max_shared_mem() - 4 * 1024
+    pruned = []
+    for cfg in configs:
+        tm, tn, ns = cfg.kwargs["TILE_M"], cfg.kwargs["TILE_N"], cfg.num_stages
+        if _smem_bytes_dec(tm, tn, tile_k, ns, dtype_bytes) <= max_smem:
+            pruned.append(cfg)
+    if not pruned:
+        pruned = [
+            min(
+                configs,
+                key=lambda c: _smem_bytes_dec(
+                    c.kwargs["TILE_M"],
+                    c.kwargs["TILE_N"],
+                    tile_k,
+                    c.num_stages,
+                    dtype_bytes,
+                ),
+            )
+        ]
+    return pruned
+
+
 def get_fwd_dense_autotune_configs():
     configs = []
     for tile_m in [64, 128, 256]:
         for tile_n in [32, 64, 128]:
             for num_warps in [4, 8]:
-                for num_stages in [1, 2]:
+                for num_stages in [1, 2, 3]:
                     configs.append(
                         triton.Config(
                             {"TILE_M": tile_m, "TILE_N": tile_n},
@@ -23,7 +124,7 @@ def get_fwd_sparse_autotune_configs():
     for tile_m in [64, 128, 256]:
         for tile_n in [32, 64, 128]:
             for num_warps in [4, 8]:
-                for num_stages in [1, 2]:
+                for num_stages in [1, 2, 3]:
                     configs.append(
                         triton.Config(
                             {"TILE_M": tile_m, "TILE_N": tile_n},
@@ -40,7 +141,7 @@ def get_fwd_gated_autotune_configs():
     for tile_m in [64, 128, 256]:
         for tile_n in [32, 64, 128]:
             for num_warps in [4, 8]:
-                for num_stages in [1, 2]:
+                for num_stages in [1, 2, 3]:
                     configs.append(
                         triton.Config(
                             {"TILE_M": tile_m, "TILE_N": tile_n},
@@ -57,7 +158,7 @@ def get_bwd_dense_autotune_configs():
     for tile_m in [32, 64, 128]:
         for tile_n in [64, 128, 256]:
             for num_warps in [4, 8]:
-                for num_stages in [1, 2]:
+                for num_stages in [1, 2, 3]:
                     configs.append(
                         triton.Config(
                             {"TILE_M": tile_m, "TILE_N": tile_n},
@@ -74,7 +175,7 @@ def get_bwd_sparse_autotune_configs():
     for tile_m in [32, 64, 128]:
         for tile_n in [64, 128, 256]:
             for num_warps in [4, 8]:
-                for num_stages in [1, 2]:
+                for num_stages in [1, 2, 3]:
                     configs.append(
                         triton.Config(
                             {"TILE_M": tile_m, "TILE_N": tile_n},
@@ -91,7 +192,7 @@ def get_bwd_gated_autotune_configs():
     for tile_m in [32, 64, 128]:
         for tile_n in [64, 128, 256]:
             for num_warps in [4, 8]:
-                for num_stages in [1, 2]:
+                for num_stages in [1, 2, 3]:
                     configs.append(
                         triton.Config(
                             {"TILE_M": tile_m, "TILE_N": tile_n},
@@ -108,7 +209,7 @@ def get_dec_dense_autotune_configs():
     for tile_m in [16, 32, 64]:
         for tile_n in [64, 128, 256]:
             for num_warps in [4, 8]:
-                for num_stages in [1, 2]:
+                for num_stages in [1, 2, 3]:
                     configs.append(
                         triton.Config(
                             {"TILE_M": tile_m, "TILE_N": tile_n},
@@ -125,7 +226,7 @@ def get_dec_sparse_autotune_configs():
     for tile_m in [16, 32, 64]:
         for tile_n in [64, 128, 256]:
             for num_warps in [4, 8]:
-                for num_stages in [1, 2]:
+                for num_stages in [1, 2, 3]:
                     configs.append(
                         triton.Config(
                             {"TILE_M": tile_m, "TILE_N": tile_n},
@@ -142,7 +243,7 @@ def get_dec_gated_autotune_configs():
     for tile_m in [16, 32, 64]:
         for tile_n in [64, 128, 256]:
             for num_warps in [4, 8]:
-                for num_stages in [1, 2]:
+                for num_stages in [1, 2, 3]:
                     configs.append(
                         triton.Config(
                             {"TILE_M": tile_m, "TILE_N": tile_n},
@@ -158,71 +259,80 @@ def make_fwd_dense_autotuned_kernel(jit_kernel):
     configs = get_fwd_dense_autotune_configs()
     return triton.autotune(
         configs=configs,
-        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"],
+        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K", "IS_CAUSAL", "IS_LOCAL"],
+        prune_configs_by={"early_config_prune": _prune_fwd_configs},
     )(jit_kernel)
 
 
 def make_fwd_sparse_autotuned_kernel(jit_kernel):
     configs = get_fwd_sparse_autotune_configs()
     return triton.autotune(
         configs=configs,
-        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"],
+        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K", "IS_CAUSAL", "IS_LOCAL"],
+        prune_configs_by={"early_config_prune": _prune_fwd_configs},
     )(jit_kernel)
 
 
 def make_fwd_gated_autotuned_kernel(jit_kernel):
     configs = get_fwd_gated_autotune_configs()
     return triton.autotune(
         configs=configs,
-        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"],
+        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K", "IS_CAUSAL", "IS_LOCAL"],
+        prune_configs_by={"early_config_prune": _prune_fwd_configs},
     )(jit_kernel)
 
 
 def make_bwd_dense_autotuned_kernel(jit_kernel):
     configs = get_bwd_dense_autotune_configs()
     return triton.autotune(
         configs=configs,
-        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"],
+        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K", "IS_CAUSAL", "IS_LOCAL"],
+        prune_configs_by={"early_config_prune": _prune_bwd_configs},
     )(jit_kernel)
 
 
 def make_bwd_sparse_autotuned_kernel(jit_kernel):
     configs = get_bwd_sparse_autotune_configs()
     return triton.autotune(
         configs=configs,
-        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"],
+        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K", "IS_CAUSAL", "IS_LOCAL"],
+        prune_configs_by={"early_config_prune": _prune_bwd_configs},
     )(jit_kernel)
 
 
 def make_bwd_gated_autotuned_kernel(jit_kernel):
     configs = get_bwd_gated_autotune_configs()
     return triton.autotune(
         configs=configs,
-        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"],
+        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K", "IS_CAUSAL", "IS_LOCAL"],
+        prune_configs_by={"early_config_prune": _prune_bwd_configs},
     )(jit_kernel)
 
 
 def make_dec_dense_autotuned_kernel(jit_kernel):
     configs = get_dec_dense_autotune_configs()
     return triton.autotune(
         configs=configs,
-        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"],
+        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K", "IS_LOCAL"],
+        prune_configs_by={"early_config_prune": _prune_dec_configs},
     )(jit_kernel)
 
 
 def make_dec_sparse_autotuned_kernel(jit_kernel):
     configs = get_dec_sparse_autotune_configs()
     return triton.autotune(
         configs=configs,
-        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"],
+        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K", "IS_LOCAL"],
+        prune_configs_by={"early_config_prune": _prune_dec_configs},
     )(jit_kernel)
 
 
 def make_dec_gated_autotuned_kernel(jit_kernel):
     configs = get_dec_gated_autotune_configs()
     return triton.autotune(
         configs=configs,
-        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K"],
+        key=["SEQLEN_Q_CACHE", "SEQLEN_K_CACHE", "TILE_K", "IS_LOCAL"],
+        prune_configs_by={"early_config_prune": _prune_dec_configs},
     )(jit_kernel)