libxsmm · jopperm · Jun 1, 2026 · May 26, 2026 · May 27, 2026 · May 31, 2026
diff --git a/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py b/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py
@@ -0,0 +1,83 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_SIZE": 32},  # the only candidate configuration
+        ),
+    ],
+    key=["D"],
+)
+@triton.jit
+def _hinge_loss_kernel(  # per-row sum of max(1 - pred * targ, 0)
+    pred_ptr,  # predictions; rows are stride_pb elements apart, columns unit-stride
+    targ_ptr,  # targets, indexed by column and reused for every row
+    out_ptr,  # one float32 row sum is stored per program
+    B,  # row count; not read inside the kernel body
+    D,  # number of columns reduced per row
+    stride_pb,  # element offset between consecutive prediction rows
+    BLOCK_SIZE: tl.constexpr,  # columns handled per loop iteration
+):
+    row_idx = tl.program_id(0)
+    row_start = row_idx.to(tl.int64) * stride_pb
+    # row_start is int64: row_idx * stride_pb can exceed int32 on large inputs.
+    acc = 0.0
+
+    for col_start in range(0, D, BLOCK_SIZE):
+        cols = col_start + tl.arange(0, BLOCK_SIZE)
+        mask = cols < D
+        # Lanes at or beyond column D load 0.0 from both tensors.
+        targ = tl.load(targ_ptr + cols, mask=mask, other=0.0).to(tl.float32)
+        pred = tl.load(pred_ptr + row_start + cols, mask=mask, other=0.0).to(tl.float32)
+        # Those lanes would yield max(1 - 0 * 0, 0) = 1, so force them to 0.
+        hinge = tl.where(mask, tl.maximum(1.0 - pred * targ, 0.0), 0.0)
+        acc += tl.sum(hinge, axis=0)
+
+    tl.store(out_ptr + row_idx, acc.to(tl.float32))
+
+
+class Model(nn.Module):  # mean hinge loss, computed by _hinge_loss_kernel
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, predictions, targets):  # -> 0-dim float32 tensor
+        B, D = predictions.shape  # predictions must be 2-D
+        row_sums = torch.empty(B, device=predictions.device, dtype=torch.float32)
+        predictions, targets = predictions.contiguous(), targets.contiguous()
+        grid = (B,)  # one program per prediction row
+        _hinge_loss_kernel[grid](
+            predictions,  # contiguous, so the kernel's unit column stride holds
+            targets,  # indexed by column; assumes >= D elements (TODO confirm)
+            row_sums,
+            B,
+            D,
+            predictions.stride(0),
+        )
+
+        return row_sums.sum() / (B * D)  # mean over all B * D hinge terms
+
+
+batch_size = 32768  # first dim of predictions and length of targets in get_inputs
+input_shape = (32768,)  # trailing dims of predictions in get_inputs
+dim = 1  # not referenced by any other code in this file
+
+
+def get_inputs():
+    # Predictions are drawn before the labels, keeping the RNG call order.
+    scores = torch.rand(batch_size, *input_shape)
+    labels = torch.randint(0, 2, (batch_size,)).float() * 2 - 1  # -1.0 or 1.0
+    return [scores, labels]
+
+
+def get_init_inputs():  # Model.__init__ takes no arguments
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py b/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py
@@ -0,0 +1,133 @@
+# ruff: noqa: E731, E741
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def _configs():
+    # Single autotune candidate: 32x32x32 tiles, GROUP_SIZE_M of 4.
+    tile_params = {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}
+    only_config = triton.Config(tile_params)
+    # Returned as a list because triton.autotune takes a list of configs.
+    return [only_config]
+
+
+@triton.autotune(configs=_configs(), key=["M", "N", "K"])
+@triton.jit
+def _matmul_kernel(  # writes one BLOCK_M x BLOCK_N tile of C = A @ B per program
+    a_ptr,  # base of A, described below as an (M, K) matrix
+    b_ptr,  # base of B, described below as a (K, N) matrix
+    c_ptr,  # base of C, described below as an (M, N) matrix
+    M,
+    N,
+    K,
+    stride_am: tl.constexpr,  # stride_xy: stride of matrix x along dimension y
+    stride_ak: tl.constexpr,
+    stride_bk: tl.constexpr,
+    stride_bn: tl.constexpr,
+    stride_cm: tl.constexpr,
+    stride_cn: tl.constexpr,
+    BLOCK_M: tl.constexpr,  # tile extents along M, N and K
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,  # row tiles per group in the pid mapping below
+):
+    pid = tl.program_id(0)
+    # Map the 1-D program id to a (pid_m, pid_n) tile coordinate.
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    # A group covers GROUP_SIZE_M row tiles (fewer in the last) over all column tiles.
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    # Within a group, consecutive pids step through row tiles before column tiles.
+    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+    # Descriptors for the A and B operands; tiles are fetched with load([row, col]).
+    a_desc = tl.make_tensor_descriptor(
+        base=a_ptr,
+        shape=(M, K),
+        strides=(stride_am, stride_ak),
+        block_shape=(BLOCK_M, BLOCK_K),
+    )
+    b_desc = tl.make_tensor_descriptor(
+        base=b_ptr,
+        shape=(K, N),
+        strides=(stride_bk, stride_bn),
+        block_shape=(BLOCK_K, BLOCK_N),
+    )
+    # float32 accumulator; each iteration adds one BLOCK_K-deep partial product.
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for off_k in range(0, K, BLOCK_K):  # NOTE(review): unmasked; confirm edge tiles
+        a_tile = a_desc.load([pid_m * BLOCK_M, off_k])
+        b_tile = b_desc.load([off_k, pid_n * BLOCK_N])
+        acc += tl.dot(a_tile, b_tile)
+    c_desc = tl.make_tensor_descriptor(  # output tile is cast to C's element type
+        base=c_ptr,
+        shape=(M, N),
+        strides=(stride_cm, stride_cn),
+        block_shape=(BLOCK_M, BLOCK_N),
+    )
+    c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty))
+
+
+N = 16  # batch size of A in get_inputs
+M = 1024  # rows of each matrix in A
+K = 2048  # inner dimension shared by A and B
+L = 768  # columns of B
+
+
+class Model(nn.Module):  # computes A (batch, m, k) @ B (k, l) via _matmul_kernel
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A, B):  # -> (batch, m, l) tensor in A's dtype
+        batch, m, k = A.shape
+        _, l = B.shape  # B's first dim is assumed to equal k (not checked here)
+        # Inputs are converted to contiguous bfloat16 for the kernel.
+        a = A.to(torch.bfloat16).contiguous()
+        b = B.to(torch.bfloat16).contiguous()
+        # Fold the batch into the row dimension so a single 2-D GEMM covers it.
+        a_flat = a.reshape(batch * m, k)
+        total_m = batch * m
+
+        c_flat = torch.empty((total_m, l), device=a.device, dtype=torch.bfloat16)
+
+        def grid(META):  # one program per output tile
+            return (
+                triton.cdiv(total_m, META["BLOCK_M"]) * triton.cdiv(l, META["BLOCK_N"]),
+            )
+
+        _matmul_kernel[grid](
+            a_flat,
+            b,
+            c_flat,
+            total_m,
+            l,
+            k,
+            a_flat.stride(0),
+            a_flat.stride(1),
+            b.stride(0),
+            b.stride(1),
+            c_flat.stride(0),
+            c_flat.stride(1),
+        )
+        # Cast back so non-bfloat16 callers get their own dtype; no-op for bfloat16.
+        return c_flat.reshape(batch, m, l).to(A.dtype)
+
+
+def get_inputs():
+    # bfloat16 operands: A is (N, M, K), B is (K, L); A is drawn first.
+    shapes = ((N, M, K), (K, L))
+    return [torch.rand(*shape, dtype=torch.bfloat16) for shape in shapes]
+
+
+def get_init_inputs():  # Model.__init__ takes no arguments
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py b/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py
@@ -0,0 +1,153 @@
+# ruff: noqa: E731, E741
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def swizzle_tile(  # map a linear tile id to (pid_m, pid_n) tile coordinates
+    tile_id,
+    M,
+    N,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,  # row tiles per group
+):
+    grid_m = tl.cdiv(M, BLOCK_M)  # tile rows needed to cover M
+    grid_n = tl.cdiv(N, BLOCK_N)  # tile columns needed to cover N
+    width = GROUP_SIZE_M * grid_n  # tile ids consumed by one full group
+    group_id = tile_id // width  # the last group may hold fewer row tiles
+    group_size = tl.minimum(GROUP_SIZE_M, grid_m - group_id * GROUP_SIZE_M)
+    pid_m = group_id * GROUP_SIZE_M + ((tile_id % width) % group_size)  # rows first
+    pid_n = (tile_id % width) // group_size  # then advance to the next column tile
+    return pid_m, pid_n
+
+
+def get_autotune_configs():
+    # Single autotune candidate: 32x32x32 tiles, GROUP_SIZE_M of 4.
+    tile_params = {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}
+    only_config = triton.Config(tile_params)
+    # Returned as a list because triton.autotune takes a list of configs.
+    return [only_config]
+
+
+@triton.autotune(
+    configs=get_autotune_configs(),
+    key=["M", "N", "K"],
+)
+@triton.jit
+def _gemm_kernel(  # writes one BLOCK_M x BLOCK_N tile of C = A @ B per program
+    a_ptr,  # base of A, described below as an (M, K) matrix
+    b_ptr,  # base of B, described below as a (K, N) matrix
+    c_ptr,  # base of C, described below as an (M, N) matrix
+    M,
+    N,
+    K,
+    stride_am: tl.constexpr,  # stride_xy: stride of matrix x along dimension y
+    stride_ak: tl.constexpr,
+    stride_bk: tl.constexpr,
+    stride_bn: tl.constexpr,
+    stride_cm: tl.constexpr,
+    stride_cn: tl.constexpr,
+    BLOCK_M: tl.constexpr,  # tile extents along M, N and K
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,  # forwarded to swizzle_tile
+):
+    pid = tl.program_id(0)
+    pid_m, pid_n = swizzle_tile(pid, M, N, BLOCK_M, BLOCK_N, GROUP_SIZE_M)
+    # Descriptors for the A and B operands; tiles are fetched with load([row, col]).
+    a_desc = tl.make_tensor_descriptor(
+        base=a_ptr,
+        shape=(M, K),
+        strides=(stride_am, stride_ak),
+        block_shape=(BLOCK_M, BLOCK_K),
+    )
+    b_desc = tl.make_tensor_descriptor(
+        base=b_ptr,
+        shape=(K, N),
+        strides=(stride_bk, stride_bn),
+        block_shape=(BLOCK_K, BLOCK_N),
+    )
+    # float32 accumulator; each iteration adds one BLOCK_K-deep partial product.
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for off_k in range(0, K, BLOCK_K):  # NOTE(review): unmasked; confirm edge tiles
+        a_block = a_desc.load([pid_m * BLOCK_M, off_k])
+        b_block = b_desc.load([off_k, pid_n * BLOCK_N])
+        acc += tl.dot(a_block, b_block)
+    c_desc = tl.make_tensor_descriptor(  # output tile is cast to C's element type
+        base=c_ptr,
+        shape=(M, N),
+        strides=(stride_cm, stride_cn),
+        block_shape=(BLOCK_M, BLOCK_N),
+    )
+    c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty))
+
+
+class Model(nn.Module):
+    """Multiply a 4-D tensor by a matrix along its last axis via _gemm_kernel."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, A, B):
+        """Return A @ B with shape (b, i, j, k), in A's dtype."""
+        lead_b, lead_i, lead_j, inner = A.shape
+        # Only B's second dimension is read; its first is assumed to equal `inner`.
+        out_cols = B.shape[1]
+
+        # Collapse the three leading axes so the kernel sees one 2-D GEMM.
+        # Tensor.to() hands back the same tensor when the dtype already matches.
+        lhs_2d = A.contiguous().view(-1, inner).to(torch.bfloat16)
+        rhs_bf16 = B.contiguous().to(torch.bfloat16)
+        rows = lhs_2d.shape[0]
+
+        # The kernel writes bfloat16; the cast back happens on return.
+        out_2d = torch.empty((rows, out_cols), device=A.device, dtype=torch.bfloat16)
+
+        def grid(META):
+            # One program per BLOCK_M x BLOCK_N output tile.
+            tiles_m = triton.cdiv(rows, META["BLOCK_M"])
+            tiles_n = triton.cdiv(out_cols, META["BLOCK_N"])
+            return (tiles_m * tiles_n,)
+
+        _gemm_kernel[grid](
+            lhs_2d,
+            rhs_bf16,
+            out_2d,
+            rows,  # M
+            out_cols,  # N
+            inner,  # K
+            lhs_2d.stride(0),
+            lhs_2d.stride(1),
+            rhs_bf16.stride(0),
+            rhs_bf16.stride(1),
+            out_2d.stride(0),
+            out_2d.stride(1),
+        )
+
+        # Restore the 4-D layout and the caller's dtype (no-op for bfloat16 input).
+        return out_2d.view(lead_b, lead_i, lead_j, out_cols).to(A.dtype)
+
+
+b = 8  # b, i, j, l are the four axis sizes of A in get_inputs
+i = 256
+j = 512
+l = 256  # also the first dimension of B
+k = 768  # second dimension of B
+
+
+def get_inputs():
+    # bfloat16 operands: A is (b, i, j, l), B is (l, k); A is drawn first.
+    shapes = ((b, i, j, l), (l, k))
+    return [torch.rand(*shape, dtype=torch.bfloat16) for shape in shapes]
+
+
+def get_init_inputs():  # Model.__init__ takes no arguments
+    return []