libxsmm · jopperm · May 26, 2026 · May 26, 2026
diff --git a/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py b/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py
@@ -0,0 +1,75 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_X": 1024, "BLOCK_Y": 4}, num_warps=4, num_stages=2),
+        triton.Config({"BLOCK_X": 2048, "BLOCK_Y": 2}, num_warps=4, num_stages=2),
+        triton.Config({"BLOCK_X": 4096, "BLOCK_Y": 1}, num_warps=8, num_stages=2),
+    ],
+    key=["B", "D"],
+)
+@triton.jit
+def _hinge_loss_kernel(
+    pred_ptr,
+    targ_ptr,
+    out_ptr,
+    B,
+    D,
+    stride_pb,
+    stride_pd,
+    BLOCK_X: tl.constexpr,
+    BLOCK_Y: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    row_start = pid * BLOCK_Y
+    rows = row_start + tl.arange(0, BLOCK_Y)
+    mask_y = rows < B
+
+    acc = tl.zeros((BLOCK_Y,), dtype=tl.float32)
+
+    for col_start in range(0, D, BLOCK_X):
+        cols = col_start + tl.arange(0, BLOCK_X)
+        mask_x = cols < D
+
+        targ = tl.load(targ_ptr + cols, mask=mask_x, other=0.0).to(tl.float32)
+
+        offs = rows[:, None] * stride_pb + cols[None, :] * stride_pd
+        mask = mask_y[:, None] & mask_x[None, :]
+        pred = tl.load(pred_ptr + offs, mask=mask, other=0.0).to(tl.float32)
+
+        hinge = tl.maximum(1.0 - pred * targ[None, :], 0.0)
+        acc += tl.sum(hinge, axis=1)
+
+    tl.store(out_ptr + rows, acc, mask=mask_y)
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, predictions, targets):
+        B, D = predictions.shape
+        row_sums = torch.empty(B, device=predictions.device, dtype=torch.float32)
+
+        grid = lambda META: (triton.cdiv(B, META["BLOCK_Y"]),)
+        _hinge_loss_kernel[grid](
+            predictions,
+            targets,
+            row_sums,
+            B,
+            D,
+            predictions.stride(0),
+            predictions.stride(1),
+        )
+
+        return row_sums.sum() / (B * D)
diff --git a/backends/triton/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py
@@ -0,0 +1,90 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_N": 128}, num_warps=8, num_stages=2),
+        triton.Config({"BLOCK_N": 128}, num_warps=8, num_stages=2),
+        triton.Config({"BLOCK_N": 128}, num_warps=4, num_stages=3),
+        triton.Config({"BLOCK_N": 64}, num_warps=4, num_stages=2),
+        triton.Config({"BLOCK_N": 64}, num_warps=4, num_stages=2),
+        triton.Config({"BLOCK_N": 256}, num_warps=8, num_stages=2),
+        triton.Config({"BLOCK_N": 256}, num_warps=8, num_stages=2),
+        triton.Config({"BLOCK_N": 128}, num_warps=16, num_stages=2),
+    ],
+    key=["D1", "D2"],
+)
+@triton.jit
+def argmax_dim1_kernel(
+    x_ptr,
+    out_ptr,
+    D1: tl.constexpr,
+    D2: tl.constexpr,
+    stride_b,
+    stride_d1,
+    stride_d2,
+    stride_ob,
+    stride_od2,
+    BLOCK_N: tl.constexpr,
+):
+    pid_n = tl.program_id(0)
+    pid_b = tl.program_id(1)
+
+    col_start = pid_n * BLOCK_N
+    cols = col_start + tl.arange(0, BLOCK_N)
+    col_mask = cols < D2
+
+    max_val = tl.full((BLOCK_N,), -float("inf"), dtype=tl.float32)
+    max_idx = tl.zeros((BLOCK_N,), dtype=tl.int32)
+
+    batch_offset = pid_b.to(tl.int64) * stride_b
+    col_offsets = cols.to(tl.int64) * stride_d2
+    base = x_ptr + batch_offset + col_offsets
+
+    for k in tl.range(0, D1):
+        val = tl.load(base + k * stride_d1, mask=col_mask, other=-float("inf")).to(
+            tl.float32
+        )
+        update = val > max_val
+        max_val = tl.where(update, val, max_val)
+        max_idx = tl.where(update, k, max_idx)
+
+    out_ptrs = out_ptr + pid_b.to(tl.int64) * stride_ob + cols.to(tl.int64) * stride_od2
+    tl.store(out_ptrs, max_idx.to(tl.int64), mask=col_mask)
+
+
+class Model(nn.Module):
+    def __init__(self, dim=1):
+        super(Model, self).__init__()
+        try:
+            self.dim = int(dim)
+        except (ValueError, TypeError):
+            self.dim = 1
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        B, D1, D2 = x.shape
+        output = torch.empty((B, D2), device=x.device, dtype=torch.int64)
+
+        grid = lambda META: (triton.cdiv(D2, META["BLOCK_N"]), B)
+        argmax_dim1_kernel[grid](
+            x,
+            output,
+            D1,
+            D2,
+            x.stride(0),
+            x.stride(1),
+            x.stride(2),
+            output.stride(0),
+            output.stride(1),
+        )
+
+        return output
diff --git a/backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py
@@ -0,0 +1,135 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_D2": 256, "BLOCK_K": 16, "warp_size": 32}, num_warps=4, num_stages=2
+        ),
+        triton.Config(
+            {"BLOCK_D2": 256, "BLOCK_K": 32, "warp_size": 32}, num_warps=4, num_stages=2
+        ),
+        triton.Config(
+            {"BLOCK_D2": 512, "BLOCK_K": 16, "warp_size": 32}, num_warps=4, num_stages=2
+        ),
+        triton.Config(
+            {"BLOCK_D2": 512, "BLOCK_K": 32, "warp_size": 32}, num_warps=8, num_stages=2
+        ),
+        triton.Config(
+            {"BLOCK_D2": 256, "BLOCK_K": 64, "warp_size": 32}, num_warps=8, num_stages=2
+        ),
+        triton.Config(
+            {"BLOCK_D2": 128, "BLOCK_K": 32, "warp_size": 16}, num_warps=4, num_stages=2
+        ),
+        triton.Config(
+            {"BLOCK_D2": 256, "BLOCK_K": 32, "warp_size": 16}, num_warps=8, num_stages=2
+        ),
+        triton.Config(
+            {"BLOCK_D2": 512, "BLOCK_K": 16, "warp_size": 16}, num_warps=8, num_stages=2
+        ),
+        triton.Config(
+            {"BLOCK_D2": 512, "BLOCK_K": 32, "warp_size": 16},
+            num_warps=16,
+            num_stages=2,
+        ),
+        triton.Config(
+            {"BLOCK_D2": 1024, "BLOCK_K": 16, "warp_size": 32},
+            num_warps=8,
+            num_stages=2,
+        ),
+    ],
+    key=["D1", "D2"],
+)
+@triton.jit
+def argmin_kernel(
+    x_ptr,
+    out_ptr,
+    B,
+    D1,
+    D2,
+    stride_b,
+    stride_d1,
+    stride_d2,
+    out_stride_b,
+    out_stride_d2,
+    BLOCK_D2: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    warp_size: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    num_d2_blocks = tl.cdiv(D2, BLOCK_D2)
+    batch_idx = pid // num_d2_blocks
+    d2_block_idx = pid % num_d2_blocks
+
+    d2_start = d2_block_idx * BLOCK_D2
+    d2_offs = d2_start + tl.arange(0, BLOCK_D2)
+    d2_mask = d2_offs < D2
+
+    base = x_ptr + batch_idx.to(tl.int64) * stride_b
+
+    min_val = tl.full([BLOCK_D2], float("inf"), dtype=tl.float32)
+    min_idx = tl.zeros([BLOCK_D2], dtype=tl.int32)
+
+    k_offs_base = tl.arange(0, BLOCK_K)
+
+    for k_start in tl.range(0, D1, BLOCK_K):
+        k_offs = k_start + k_offs_base
+        k_mask = k_offs < D1
+        ptrs = (
+            base
+            + k_offs[:, None].to(tl.int64) * stride_d1
+            + d2_offs[None, :] * stride_d2
+        )
+        mask = k_mask[:, None] & d2_mask[None, :]
+        tile = tl.load(ptrs, mask=mask, other=float("inf")).to(tl.float32)
+
+        tile_min = tl.min(tile, axis=0)
+
+        update = tile_min < min_val
+
+        k_indices = k_offs[:, None]
+        large_k = tl.full([1], D1, dtype=tl.int32)
+        k_masked = tl.where(tile == tile_min[None, :], k_indices, large_k)
+        tile_argmin = tl.min(k_masked, axis=0)
+
+        min_idx = tl.where(update, tile_argmin, min_idx)
+        min_val = tl.where(update, tile_min, min_val)
+
+    out_ptrs = out_ptr + batch_idx.to(tl.int64) * out_stride_b + d2_offs * out_stride_d2
+    tl.store(out_ptrs, min_idx.to(tl.int64), mask=d2_mask)
+
+
+class Model(nn.Module):
+    def __init__(self, dim: int):
+        super(Model, self).__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        B, D1, D2 = x.shape
+        output = torch.empty(B, D2, device=x.device, dtype=torch.int64)
+
+        grid = lambda META: (B * triton.cdiv(D2, META["BLOCK_D2"]),)
+
+        argmin_kernel[grid](
+            x,
+            output,
+            B,
+            D1,
+            D2,
+            x.stride(0),
+            x.stride(1),
+            x.stride(2),
+            output.stride(0),
+            output.stride(1),
+        )
+
+        return output