libxsmm · sandlbn · Jun 1, 2026 · May 31, 2026 · May 31, 2026 · May 31, 2026
diff --git a/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py b/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py
@@ -10,15 +10,15 @@
 import triton.language as tl
 
 
-def _configs():
-    return [
+@triton.autotune(
+    configs=[
         triton.Config(
-            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-        ),
-    ]
-
-
-@triton.autotune(configs=_configs(), key=["M", "N", "K"])
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": gs},
+        )
+        for gs in [1, 2, 4, 8]
+    ],
+    key=["M", "N", "K"],
+)
 @triton.jit
 def _matmul_kernel(
     a_ptr,
@@ -101,6 +101,13 @@ def forward(self, A, B):
         c_flat = torch.empty((total_m, l), device=a.device, dtype=torch.bfloat16)
 
         def grid(META):
+            # Raise instead of assert: `python -O` strips asserts, and this check guards
+            # the launch that passes assume_in_bounds=True (presumably skips bounds checks).
+            if m % META["BLOCK_M"] or l % META["BLOCK_N"] or k % META["BLOCK_K"]:
+                raise ValueError(
+                    "M, L, and K must be divisible by BLOCK_M, BLOCK_N, and BLOCK_K respectively"
+                )
+            # Grid size = number of BLOCK_M x BLOCK_N tiles covering the (total_m, l) output.
             return (
                 triton.cdiv(total_m, META["BLOCK_M"]) * triton.cdiv(l, META["BLOCK_N"]),
             )
@@ -118,6 +125,7 @@ def grid(META):
             b.stride(1),
             c_flat.stride(0),
             c_flat.stride(1),
+            assume_in_bounds=True,
         )
 
         return c_flat.reshape(batch, m, l)

diff --git a/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py b/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py
@@ -29,16 +29,13 @@ def swizzle_tile(
     return pid_m, pid_n
 
 
-def get_autotune_configs():
-    return [
-        triton.Config(
-            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-        ),
-    ]
-
-
 @triton.autotune(
-    configs=get_autotune_configs(),
+    configs=[
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": gs},
+        )
+        for gs in [1, 2, 4, 8]
+    ],
     key=["M", "N", "K"],
 )
 @triton.jit
@@ -111,9 +108,15 @@ def forward(self, A, B):
 
         C_2d = torch.empty((M, N), device=A.device, dtype=torch.bfloat16)
 
-        grid = lambda META: (
-            triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),
-        )
+        def grid(META):
+            """Return the 1-D launch grid for the tile sizes in META."""
+            # Raise instead of assert: `python -O` strips asserts, and this check guards
+            # the launch that passes assume_in_bounds=True (presumably skips bounds checks).
+            if M % META["BLOCK_M"] or N % META["BLOCK_N"] or K % META["BLOCK_K"]:
+                raise ValueError(
+                    "M, N, and K must be divisible by BLOCK_M, BLOCK_N, and BLOCK_K respectively"
+                )
+            return (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
 
         _gemm_kernel[grid](
             A_flat,
@@ -128,6 +131,7 @@ def forward(self, A, B):
             B_fp16.stride(1),
             C_2d.stride(0),
             C_2d.stride(1),
+            assume_in_bounds=True,
         )
 
         result = C_2d.view(b_dim, i_dim, j_dim, k_dim)

diff --git a/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py b/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py
@@ -15,6 +15,9 @@
         triton.Config(
             {"BLOCK_M": 32, "BLOCK_N": 32},
         ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64},
+        ),
     ],
     key=["N", "M"],
 )

diff --git a/backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py b/backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py
@@ -29,15 +29,15 @@ def swizzle_tile(
     return pid_m, pid_n
 
 
-def _configs():
-    return [
+@triton.autotune(
+    configs=[
         triton.Config(
-            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-        ),
-    ]
-
-
-@triton.autotune(configs=_configs(), key=["M", "N", "K"])
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": gs},
+        )
+        for gs in [1, 2, 4, 8]
+    ],
+    key=["M", "N", "K"],
+)
 @triton.jit
 def _matmul_kernel(
     a_ptr,
@@ -111,9 +111,16 @@ def forward(self, A, B):
         N = B.shape[1]
         C = torch.empty((M, N), device=A.device, dtype=torch.bfloat16)
 
-        grid = lambda META: (
-            triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),
-        )
+        def grid(META):
+            """Return the 1-D launch grid for the tile sizes in META."""
+            # Raise instead of assert: `python -O` strips asserts, and this check guards
+            # the launch that passes assume_in_bounds=True (presumably skips bounds checks).
+            if M % META["BLOCK_M"] or N % META["BLOCK_N"] or K % META["BLOCK_K"]:
+                raise ValueError(
+                    "M, N, and K must be divisible by BLOCK_M, BLOCK_N, and BLOCK_K respectively"
+                )
+            return (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
+
         _matmul_kernel[grid](
             A,
             B,
@@ -130,6 +137,7 @@ def forward(self, A, B):
             DIVISIBLE_M=(M % 256 == 0),
             DIVISIBLE_N=(N % 128 == 0),
             DIVISIBLE_K=(K % 32 == 0),
+            assume_in_bounds=True,
         )
         return C
 

diff --git a/backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py b/backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py
@@ -34,8 +34,9 @@ def swizzle_tile(
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-        ),
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": gs},
+        )
+        for gs in [1, 2, 4, 8]
     ],
     key=["M", "N", "K"],
 )
@@ -117,9 +118,15 @@ def forward(self, A, B):
         N = B.shape[1]
         C = torch.zeros((M, N), device=A.device, dtype=A.dtype)
 
-        grid = lambda META: (
-            triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),
-        )
+        def grid(META):
+            """Return the 1-D launch grid for the tile sizes in META."""
+            # Raise instead of assert: `python -O` strips asserts, and this check guards
+            # the launch that passes assume_in_bounds=True (presumably skips bounds checks).
+            if M % META["BLOCK_M"] or N % META["BLOCK_N"] or K % META["BLOCK_K"]:
+                raise ValueError(
+                    "M, N, and K must be divisible by BLOCK_M, BLOCK_N, and BLOCK_K respectively"
+                )
+            return (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
 
         _triu_matmul_kernel[grid](
             A,
@@ -134,6 +141,7 @@ def forward(self, A, B):
             B.stride(1),
             C.stride(0),
             C.stride(1),
+            assume_in_bounds=True,
         )
         return C
 

diff --git a/backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py b/backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py
@@ -32,10 +32,11 @@ def swizzle_tile(
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-        ),
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": gs},
+        )
+        for gs in [1, 2, 4, 8]
     ],
-    key=["M"],
+    key=["M", "N", "K"],
 )
 @triton.jit
 def tril_matmul_kernel(
@@ -111,9 +112,14 @@ def forward(self, A, B):
         M = A.shape[0]
         C = torch.zeros(M, M, device=A.device, dtype=A.dtype)
 
-        grid = lambda META: (
-            triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(M, META["BLOCK_N"]),
-        )
+        def grid(META):
+            # Raise, don't assert: `python -O` strips asserts; launch sets assume_in_bounds.
+            if M % META["BLOCK_M"] or M % META["BLOCK_N"] or M % META["BLOCK_K"]:
+                raise ValueError(
+                    "M must be divisible by BLOCK_M, BLOCK_N, and BLOCK_K respectively"
+                )
+            return (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(M, META["BLOCK_N"]),)
+
         tril_matmul_kernel[grid](
             A,
             B,
@@ -125,6 +131,7 @@ def forward(self, A, B):
             B.stride(1),
             C.stride(0),
             C.stride(1),
+            assume_in_bounds=True,
         )
         return C
 

diff --git a/backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py b/backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py
@@ -10,16 +10,13 @@
 import triton.language as tl
 
 
-def get_autotune_configs():
-    return [
-        triton.Config(
-            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-        ),
-    ]
-
-
 @triton.autotune(
-    configs=get_autotune_configs(),
+    configs=[
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": gs},
+        )
+        for gs in [1, 2, 4, 8]
+    ],
     key=["M", "N", "K"],
 )
 @triton.jit
@@ -96,6 +93,13 @@ def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
         C = torch.empty((M, N), device=A.device, dtype=A.dtype)
 
         def grid(META):
+            # Raise instead of assert: `python -O` strips asserts, and this check guards
+            # the launch that passes assume_in_bounds=True (presumably skips bounds checks).
+            if M % META["BLOCK_M"] or N % META["BLOCK_N"] or K % META["BLOCK_K"]:
+                raise ValueError(
+                    "M, N, and K must be divisible by BLOCK_M, BLOCK_N, and BLOCK_K respectively"
+                )
+            # Grid size = number of BLOCK_M x BLOCK_N tiles covering the (M, N) output.
             return (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
 
         _matmul_at_kernel[grid](
@@ -111,6 +115,7 @@ def grid(META):
             B.stride(1),
             C.stride(0),
             C.stride(1),
+            assume_in_bounds=True,
         )
         return C
 

diff --git a/backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py b/backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py
@@ -10,16 +10,13 @@
 import triton.language as tl
 
 
-def get_autotune_configs():
-    return [
-        triton.Config(
-            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-        ),
-    ]
-
-
 @triton.autotune(
-    configs=get_autotune_configs(),
+    configs=[
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": gs},
+        )
+        for gs in [1, 2, 4, 8]
+    ],
     key=["M", "N", "K"],
 )
 @triton.jit
@@ -100,6 +97,13 @@ def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
         C = torch.empty((M, N), device=A.device, dtype=A.dtype)
 
         def grid(META):
+            # Raise instead of assert: `python -O` strips asserts, and this check guards
+            # the launch that passes assume_in_bounds=True (presumably skips bounds checks).
+            if M % META["BLOCK_M"] or N % META["BLOCK_N"] or K % META["BLOCK_K"]:
+                raise ValueError(
+                    "M, N, and K must be divisible by BLOCK_M, BLOCK_N, and BLOCK_K respectively"
+                )
+            # Grid size = number of BLOCK_M x BLOCK_N tiles covering the (M, N) output.
             return (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
 
         _matmul_bt_kernel[grid](
@@ -115,6 +119,7 @@ def grid(META):
             B.stride(1),
             C.stride(0),
             C.stride(1),
+            assume_in_bounds=True,
         )
 
         return C

diff --git a/backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py b/backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py
@@ -10,15 +10,15 @@
 import triton.language as tl
 
 
-def _configs():
-    return [
+@triton.autotune(
+    configs=[
         triton.Config(
-            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-        ),
-    ]
-
-
-@triton.autotune(configs=_configs(), key=["M", "N", "K"])
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": gs},
+        )
+        for gs in [1, 2, 4, 8]
+    ],
+    key=["M", "N", "K"],
+)
 @triton.jit
 def _matmul_tt_kernel(
     A_ptr,
@@ -96,6 +96,13 @@ def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
         C = torch.empty((M, N), device=A.device, dtype=A.dtype)
 
         def grid(META):
+            # Raise instead of assert: `python -O` strips asserts, and this check guards
+            # the launch that passes assume_in_bounds=True (presumably skips bounds checks).
+            if M % META["BLOCK_M"] or N % META["BLOCK_N"] or K % META["BLOCK_K"]:
+                raise ValueError(
+                    "M, N, and K must be divisible by BLOCK_M, BLOCK_N, and BLOCK_K respectively"
+                )
+            # Grid size = number of BLOCK_M x BLOCK_N tiles covering the (M, N) output.
             return (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
 
         _matmul_tt_kernel[grid](
@@ -111,6 +118,7 @@ def grid(META):
             B.stride(1),
             C.stride(0),
             C.stride(1),
+            assume_in_bounds=True,
         )
 
         return C