Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions backends/triton/cpu/KernelBench/level1/100_HingeLoss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# ruff: noqa: E731
# AUTOGENERATED KERNEL (LLM)
# Source: LLM-generated candidate implementation
# Status: Experimental / uncurated
# Expectation: Correctness-first, performance not representative

import torch
import torch.nn as nn
import triton
import triton.language as tl


@triton.autotune(
    configs=[
        triton.Config({"BLOCK_X": 1024, "BLOCK_Y": 4}, num_warps=4, num_stages=2),
        triton.Config({"BLOCK_X": 2048, "BLOCK_Y": 2}, num_warps=4, num_stages=2),
        triton.Config({"BLOCK_X": 4096, "BLOCK_Y": 1}, num_warps=8, num_stages=2),
    ],
    key=["B", "D"],
)
@triton.jit
def _hinge_loss_kernel(
    pred_ptr,
    targ_ptr,
    out_ptr,
    B,
    D,
    stride_pb,
    stride_pd,
    BLOCK_X: tl.constexpr,
    BLOCK_Y: tl.constexpr,
):
    """Write the per-row sum of ``max(1 - pred * targ, 0)`` to ``out_ptr``.

    Each program handles ``BLOCK_Y`` consecutive rows and walks the ``D``
    columns in chunks of ``BLOCK_X``.

    Args:
        pred_ptr: Base pointer of the (B, D) predictions, addressed through
            ``stride_pb`` / ``stride_pd``.
        targ_ptr: Base pointer of the targets. They are indexed by column
            only, with unit stride, so one target row is shared by all rows.
        out_ptr: Base pointer of the B float32 row sums (unit stride).
        B: Number of rows.
        D: Number of columns.
        stride_pb: Element stride between consecutive prediction rows.
        stride_pd: Element stride between consecutive prediction columns.
        BLOCK_X: Columns processed per loop iteration.
        BLOCK_Y: Rows processed per program.
    """
    pid = tl.program_id(0)
    rows = pid * BLOCK_Y + tl.arange(0, BLOCK_Y)
    mask_y = rows < B

    # Widen before multiplying by the stride so that large tensors cannot
    # overflow 32-bit offset arithmetic.
    row_offs = rows[:, None].to(tl.int64) * stride_pb

    acc = tl.zeros((BLOCK_Y,), dtype=tl.float32)

    for col_start in range(0, D, BLOCK_X):
        cols = col_start + tl.arange(0, BLOCK_X)
        mask_x = cols < D

        targ = tl.load(targ_ptr + cols, mask=mask_x, other=0.0).to(tl.float32)

        offs = row_offs + cols[None, :].to(tl.int64) * stride_pd
        mask = mask_y[:, None] & mask_x[None, :]
        pred = tl.load(pred_ptr + offs, mask=mask, other=0.0).to(tl.float32)

        hinge = tl.maximum(1.0 - pred * targ[None, :], 0.0)
        # Masked-off lanes were loaded as pred == targ == 0, which evaluates to
        # a hinge of 1.0; zero them so column padding (D not a multiple of
        # BLOCK_X) does not inflate the row sums.
        hinge = tl.where(mask, hinge, 0.0)
        acc += tl.sum(hinge, axis=1)

    tl.store(out_ptr + rows, acc, mask=mask_y)


class Model(nn.Module):
    """Mean hinge loss ``mean(max(1 - predictions * targets, 0))`` via Triton.

    ``predictions`` is a 2-D (B, D) tensor and ``targets`` is one row of D
    values that is applied to every row of ``predictions``.
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets):
        """Return the scalar mean hinge loss as a float32 tensor.

        Args:
            predictions: Tensor of shape (B, D); any strides are supported.
            targets: Tensor holding exactly D elements laid out along its last
                dimension (e.g. shape (D,) or (1, D)).

        Raises:
            ValueError: If ``predictions`` is not 2-D or ``targets`` cannot be
                interpreted as a single row of D values.
        """
        if predictions.dim() != 2:
            raise ValueError(
                f"predictions must be 2-D (B, D), got shape {tuple(predictions.shape)}"
            )
        B, D = predictions.shape

        # The kernel reads D unit-stride target values and reuses them for
        # every row, so anything else would be silently misinterpreted or
        # read out of range.
        if targets.numel() != D or (targets.dim() > 0 and targets.shape[-1] != D):
            raise ValueError(
                f"targets must hold one row of {D} values, "
                f"got shape {tuple(targets.shape)}"
            )
        targets = targets.reshape(D).contiguous()

        row_sums = torch.empty(B, device=predictions.device, dtype=torch.float32)

        grid = lambda META: (triton.cdiv(B, META["BLOCK_Y"]),)
        _hinge_loss_kernel[grid](
            predictions,
            targets,
            row_sums,
            B,
            D,
            predictions.stride(0),
            predictions.stride(1),
        )

        # The kernel produces per-row sums; finish the mean on the host side.
        return row_sums.sum() / (B * D)
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# ruff: noqa: E731
# AUTOGENERATED KERNEL (LLM)
# Source: LLM-generated candidate implementation
# Status: Experimental / uncurated
# Expectation: Correctness-first, performance not representative

import torch
import torch.nn as nn
import triton
import triton.language as tl


@triton.autotune(
    # Each configuration is listed once; exact duplicates only lengthen
    # autotuning without widening the search space.
    configs=[
        triton.Config({"BLOCK_N": 128}, num_warps=8, num_stages=2),
        triton.Config({"BLOCK_N": 128}, num_warps=4, num_stages=3),
        triton.Config({"BLOCK_N": 64}, num_warps=4, num_stages=2),
        triton.Config({"BLOCK_N": 256}, num_warps=8, num_stages=2),
        triton.Config({"BLOCK_N": 128}, num_warps=16, num_stages=2),
    ],
    key=["D1", "D2"],
)
@triton.jit
def argmax_dim1_kernel(
    x_ptr,
    out_ptr,
    D1: tl.constexpr,
    D2: tl.constexpr,
    stride_b,
    stride_d1,
    stride_d2,
    stride_ob,
    stride_od2,
    BLOCK_N: tl.constexpr,
):
    """Argmax over the middle dimension of a (B, D1, D2) tensor.

    Program (pid_n, pid_b) handles ``BLOCK_N`` consecutive columns of batch
    ``pid_b`` and scans the ``D1`` candidates sequentially. The first maximal
    element wins ties. A NaN counts as larger than every number, so the first
    NaN in a column is selected, which is how ``torch.argmax`` behaves.

    Args:
        x_ptr: Base pointer of the input, addressed through ``stride_b``,
            ``stride_d1`` and ``stride_d2``.
        out_ptr: Base pointer of the int64 (B, D2) output, addressed through
            ``stride_ob`` and ``stride_od2``.
        D1: Length of the reduced dimension (compile-time constant).
        D2: Length of the kept trailing dimension (compile-time constant).
        stride_b, stride_d1, stride_d2: Element strides of the input.
        stride_ob, stride_od2: Element strides of the output.
        BLOCK_N: Number of columns handled per program.
    """
    pid_n = tl.program_id(0)
    pid_b = tl.program_id(1)

    cols = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    col_mask = cols < D2
    cols64 = cols.to(tl.int64)

    max_val = tl.full((BLOCK_N,), -float("inf"), dtype=tl.float32)
    max_idx = tl.zeros((BLOCK_N,), dtype=tl.int32)

    base = x_ptr + pid_b.to(tl.int64) * stride_b + cols64 * stride_d2

    for k in tl.range(0, D1):
        val = tl.load(base + k * stride_d1, mask=col_mask, other=-float("inf")).to(
            tl.float32
        )
        # ``x != x`` is true only for NaN. A NaN replaces any non-NaN running
        # maximum; once the running maximum is NaN both terms are false, so
        # the first NaN index is kept.
        val_is_nan = val != val
        best_not_nan = max_val == max_val
        update = (val > max_val) | (val_is_nan & best_not_nan)
        max_val = tl.where(update, val, max_val)
        max_idx = tl.where(update, k, max_idx)

    out_ptrs = out_ptr + pid_b.to(tl.int64) * stride_ob + cols64 * stride_od2
    tl.store(out_ptrs, max_idx.to(tl.int64), mask=col_mask)


class Model(nn.Module):
    """Argmax over one dimension of a 3-D tensor, computed by a Triton kernel."""

    def __init__(self, dim=1):
        """Store the dimension to reduce.

        Args:
            dim: Dimension to reduce. Values that cannot be converted with
                ``int()`` fall back to 1 (best-effort, kept from the original
                implementation).
        """
        super(Model, self).__init__()
        try:
            self.dim = int(dim)
        except (ValueError, TypeError):
            self.dim = 1

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the int64 indices of the maxima along ``self.dim``.

        Args:
            x: 3-D input tensor; any strides are supported.

        Returns:
            A contiguous int64 tensor with the shape of ``x`` minus the
            reduced dimension.

        Raises:
            ValueError: If ``x`` is not 3-D.
        """
        if x.dim() != 3:
            raise ValueError(f"expected a 3-D tensor, got shape {tuple(x.shape)}")

        # The kernel always reduces the middle dimension but is fully
        # stride-aware, so a zero-copy view with the requested dimension moved
        # to position 1 makes it honour ``self.dim``.
        x = x.movedim(self.dim, 1)
        B, D1, D2 = x.shape
        output = torch.empty((B, D2), device=x.device, dtype=torch.int64)
        if output.numel() == 0:
            # Nothing to compute; avoid launching an empty grid.
            return output

        grid = lambda META: (triton.cdiv(D2, META["BLOCK_N"]), B)
        argmax_dim1_kernel[grid](
            x,
            output,
            D1,
            D2,
            x.stride(0),
            x.stride(1),
            x.stride(2),
            output.stride(0),
            output.stride(1),
        )

        return output
135 changes: 135 additions & 0 deletions backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# ruff: noqa: E731
# AUTOGENERATED KERNEL (LLM)
# Source: LLM-generated candidate implementation
# Status: Experimental / uncurated
# Expectation: Correctness-first, performance not representative

import torch
import torch.nn as nn
import triton
import triton.language as tl


@triton.autotune(
    configs=[
        triton.Config(
            {"BLOCK_D2": 256, "BLOCK_K": 16, "warp_size": 32}, num_warps=4, num_stages=2
        ),
        triton.Config(
            {"BLOCK_D2": 256, "BLOCK_K": 32, "warp_size": 32}, num_warps=4, num_stages=2
        ),
        triton.Config(
            {"BLOCK_D2": 512, "BLOCK_K": 16, "warp_size": 32}, num_warps=4, num_stages=2
        ),
        triton.Config(
            {"BLOCK_D2": 512, "BLOCK_K": 32, "warp_size": 32}, num_warps=8, num_stages=2
        ),
        triton.Config(
            {"BLOCK_D2": 256, "BLOCK_K": 64, "warp_size": 32}, num_warps=8, num_stages=2
        ),
        triton.Config(
            {"BLOCK_D2": 128, "BLOCK_K": 32, "warp_size": 16}, num_warps=4, num_stages=2
        ),
        triton.Config(
            {"BLOCK_D2": 256, "BLOCK_K": 32, "warp_size": 16}, num_warps=8, num_stages=2
        ),
        triton.Config(
            {"BLOCK_D2": 512, "BLOCK_K": 16, "warp_size": 16}, num_warps=8, num_stages=2
        ),
        triton.Config(
            {"BLOCK_D2": 512, "BLOCK_K": 32, "warp_size": 16},
            num_warps=16,
            num_stages=2,
        ),
        triton.Config(
            {"BLOCK_D2": 1024, "BLOCK_K": 16, "warp_size": 32},
            num_warps=8,
            num_stages=2,
        ),
    ],
    key=["D1", "D2"],
)
@triton.jit
def argmin_kernel(
    x_ptr,
    out_ptr,
    B,
    D1,
    D2,
    stride_b,
    stride_d1,
    stride_d2,
    out_stride_b,
    out_stride_d2,
    BLOCK_D2: tl.constexpr,
    BLOCK_K: tl.constexpr,
    warp_size: tl.constexpr,
):
    """Argmin over the middle dimension of a (B, D1, D2) tensor.

    The 1-D grid is split into (batch, column-block) pairs. Each program
    reduces ``BLOCK_D2`` columns of one batch, reading the ``D1`` candidates
    in tiles of ``BLOCK_K`` rows. The first minimal element wins ties. A NaN
    counts as smaller than every number, so the first NaN in a column is
    selected, which is how ``torch.argmin`` behaves.

    Args:
        x_ptr: Base pointer of the input, addressed through ``stride_b``,
            ``stride_d1`` and ``stride_d2``.
        out_ptr: Base pointer of the int64 (B, D2) output, addressed through
            ``out_stride_b`` and ``out_stride_d2``.
        B: Batch size (not read by the kernel body).
        D1: Length of the reduced dimension.
        D2: Length of the kept trailing dimension.
        stride_b, stride_d1, stride_d2: Element strides of the input.
        out_stride_b, out_stride_d2: Element strides of the output.
        BLOCK_D2: Number of columns handled per program.
        BLOCK_K: Number of reduction rows loaded per tile.
        warp_size: Supplied by the autotune configs; not read by the kernel
            body.
    """
    pid = tl.program_id(0)
    num_d2_blocks = tl.cdiv(D2, BLOCK_D2)
    batch_idx = pid // num_d2_blocks
    d2_block_idx = pid % num_d2_blocks

    d2_offs = d2_block_idx * BLOCK_D2 + tl.arange(0, BLOCK_D2)
    d2_mask = d2_offs < D2
    # Widen once so every stride multiplication below happens in 64-bit.
    d2_offs64 = d2_offs.to(tl.int64)

    base = x_ptr + batch_idx.to(tl.int64) * stride_b

    min_val = tl.full([BLOCK_D2], float("inf"), dtype=tl.float32)
    min_idx = tl.zeros([BLOCK_D2], dtype=tl.int32)
    # Smallest row index holding a NaN per column; D1 means "none seen".
    first_nan = tl.zeros([BLOCK_D2], dtype=tl.int32) + D1

    k_offs_base = tl.arange(0, BLOCK_K)
    # Out-of-range index used as the "no candidate" marker in index
    # reductions. It is loop-invariant, so it is built once.
    sentinel = tl.full([1], D1, dtype=tl.int32)

    for k_start in tl.range(0, D1, BLOCK_K):
        k_offs = k_start + k_offs_base
        k_mask = k_offs < D1
        ptrs = (
            base
            + k_offs[:, None].to(tl.int64) * stride_d1
            + d2_offs64[None, :] * stride_d2
        )
        mask = k_mask[:, None] & d2_mask[None, :]
        tile = tl.load(ptrs, mask=mask, other=float("inf")).to(tl.float32)

        k_indices = k_offs[:, None]

        # Track NaNs separately, then neutralise them so the numeric
        # reduction below is well defined. ``x != x`` is true only for NaN;
        # masked lanes were loaded as +inf and never match.
        is_nan = tile != tile
        tile_first_nan = tl.min(tl.where(is_nan, k_indices, sentinel), axis=0)
        first_nan = tl.minimum(first_nan, tile_first_nan)
        tile = tl.where(is_nan, float("inf"), tile)

        tile_min = tl.min(tile, axis=0)

        # Strict comparison keeps the earliest tile on ties across tiles.
        update = tile_min < min_val

        # Within the tile, pick the smallest row index attaining the minimum.
        k_masked = tl.where(tile == tile_min[None, :], k_indices, sentinel)
        tile_argmin = tl.min(k_masked, axis=0)

        min_idx = tl.where(update, tile_argmin, min_idx)
        min_val = tl.where(update, tile_min, min_val)

    # A NaN anywhere in the column overrides the numeric minimum.
    result = tl.where(first_nan < D1, first_nan, min_idx)

    out_ptrs = (
        out_ptr + batch_idx.to(tl.int64) * out_stride_b + d2_offs64 * out_stride_d2
    )
    tl.store(out_ptrs, result.to(tl.int64), mask=d2_mask)


class Model(nn.Module):
    """Argmin over one dimension of a 3-D tensor, computed by a Triton kernel."""

    def __init__(self, dim: int):
        """Store the dimension to reduce.

        Args:
            dim: Dimension of the 3-D input to take the argmin over.
        """
        super(Model, self).__init__()
        self.dim = dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the int64 indices of the minima along ``self.dim``.

        Args:
            x: 3-D input tensor; any strides are supported.

        Returns:
            A contiguous int64 tensor with the shape of ``x`` minus the
            reduced dimension.

        Raises:
            ValueError: If ``x`` is not 3-D.
        """
        if x.dim() != 3:
            raise ValueError(f"expected a 3-D tensor, got shape {tuple(x.shape)}")

        # The kernel always reduces the middle dimension but is fully
        # stride-aware, so a zero-copy view with the requested dimension moved
        # to position 1 makes it honour ``self.dim``.
        x = x.movedim(self.dim, 1)
        B, D1, D2 = x.shape
        output = torch.empty(B, D2, device=x.device, dtype=torch.int64)
        if output.numel() == 0:
            # Nothing to compute; avoid launching an empty grid.
            return output

        grid = lambda META: (B * triton.cdiv(D2, META["BLOCK_D2"]),)

        argmin_kernel[grid](
            x,
            output,
            B,
            D1,
            D2,
            x.stride(0),
            x.stride(1),
            x.stride(2),
            output.stride(0),
            output.stride(1),
        )

        return output
Loading