Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 49 additions & 21 deletions .github/workflows/kernel_bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,36 @@ name: KernelBench Perf
on:
workflow_dispatch:
inputs:
RUN_CPU_TORCH:
description: "Run on CPU (PyTorch eager)"
DEVICE_CPU:
description: "Device: CPU"
type: boolean
default: false
RUN_CPU_MLIR:
description: "Run on CPU (MLIR)"
DEVICE_XPU:
description: "Device: Intel GPU"
type: boolean
default: true
DEVICE_CUDA:
description: "Device: Nvidia GPU"
type: boolean
default: false
RUN_XPU_TORCH:
description: "Run on Intel GPU (PyTorch eager)"
BACKEND_PYTORCH:
description: "Backend: PyTorch (eager)"
type: boolean
default: true
RUN_XPU_TORCH_COMPILE:
description: "Run on Intel GPU (PyTorch compile)"
BACKEND_PYTORCH_COMPILE:
description: "Backend: PyTorch (compile)"
type: boolean
default: false
RUN_XPU_TRITON:
description: "Run on Intel GPU (Triton)"
BACKEND_TRITON:
description: "Backend: Triton"
type: boolean
default: false
RUN_XPU_HELION:
description: "Run on Intel GPU (Helion)"
BACKEND_HELION:
description: "Backend: Helion"
type: boolean
default: false
RUN_CUDA_TORCH:
description: "Run on Nvidia GPU (PyTorch eager)"
BACKEND_MLIR:
description: "Backend: MLIR"
type: boolean
default: false

Expand All @@ -40,7 +44,7 @@ jobs:
CPU-PyTorch:
runs-on: pcl-tiergarten
if: |
(github.event_name == 'workflow_dispatch' && inputs.RUN_CPU_TORCH)
(github.event_name == 'workflow_dispatch' && inputs.DEVICE_CPU && inputs.BACKEND_PYTORCH)

steps:
- uses: actions/checkout@v5
Expand All @@ -52,7 +56,7 @@ jobs:
CPU-MLIR:
runs-on: pcl-tiergarten
if: |
(github.event_name == 'workflow_dispatch' && inputs.RUN_CPU_MLIR)
(github.event_name == 'workflow_dispatch' && inputs.DEVICE_CPU && inputs.BACKEND_MLIR)

steps:
- uses: actions/checkout@v5
Expand All @@ -64,7 +68,7 @@ jobs:
XPU-PyTorch:
runs-on: pcl-tiergarten
if: |
(github.event_name == 'workflow_dispatch' && inputs.RUN_XPU_TORCH)
(github.event_name == 'workflow_dispatch' && inputs.DEVICE_XPU && inputs.BACKEND_PYTORCH)

steps:
- uses: actions/checkout@v5
Expand All @@ -76,7 +80,7 @@ jobs:
XPU-PyTorch-Compile:
runs-on: pcl-tiergarten
if: |
(github.event_name == 'workflow_dispatch' && inputs.RUN_XPU_TORCH_COMPILE)
(github.event_name == 'workflow_dispatch' && inputs.DEVICE_XPU && inputs.BACKEND_PYTORCH_COMPILE)

steps:
- uses: actions/checkout@v5
Expand All @@ -88,7 +92,7 @@ jobs:
XPU-Triton:
runs-on: pcl-tiergarten
if: |
(github.event_name == 'workflow_dispatch' && inputs.RUN_XPU_TRITON)
(github.event_name == 'workflow_dispatch' && inputs.DEVICE_XPU && inputs.BACKEND_TRITON)

steps:
- uses: actions/checkout@v5
Expand All @@ -100,7 +104,7 @@ jobs:
XPU-Helion:
runs-on: pcl-tiergarten
if: |
(github.event_name == 'workflow_dispatch' && inputs.RUN_XPU_HELION)
(github.event_name == 'workflow_dispatch' && inputs.DEVICE_XPU && inputs.BACKEND_HELION)

steps:
- uses: actions/checkout@v5
Expand All @@ -112,11 +116,35 @@ jobs:
CUDA-PyTorch:
runs-on: pcl-tiergarten
if: |
(github.event_name == 'workflow_dispatch' && inputs.RUN_CUDA_TORCH)
(github.event_name == 'workflow_dispatch' && inputs.DEVICE_CUDA && inputs.BACKEND_PYTORCH)

steps:
- uses: actions/checkout@v5

- name: Nvidia A100
run: "${{ env.SRUN }} --partition=a100 --time=0:15:00 -- \
'${{ github.workspace }}/infra/scripts/ci-cuda-run-kernel-bench.sh -b torch'"

CUDA-Triton:
runs-on: pcl-tiergarten
if: |
(github.event_name == 'workflow_dispatch' && inputs.DEVICE_CUDA && inputs.BACKEND_TRITON)

steps:
- uses: actions/checkout@v5

- name: Nvidia A100
run: "${{ env.SRUN }} --partition=a100 --time=0:15:00 -- \
'${{ github.workspace }}/infra/scripts/ci-cuda-run-kernel-bench.sh -b triton'"

CUDA-Helion:
runs-on: pcl-tiergarten
if: |
(github.event_name == 'workflow_dispatch' && inputs.DEVICE_CUDA && inputs.BACKEND_HELION)

steps:
- uses: actions/checkout@v5

- name: Nvidia A100
run: "${{ env.SRUN }} --partition=a100 --time=0:15:00 -- \
'${{ github.workspace }}/infra/scripts/ci-cuda-run-kernel-bench.sh -b helion'"
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ A benchmarking framework for evaluating AI kernel implementations across multipl
|:---:|:---:|:---:|:---:|:---:|
| **CPU** | ✅ | ❌ | ❌ | ✅ |
| **XPU** | ✅ | ✅ | ✅ | ❌ |
| **CUDA** | ✅ | ⚠️ | ⚠️ | ❌ |
| **CUDA** | ✅ | | | ❌ |

✅ - Supported ⚠️ - Partially implemented ❌ - Unsupported

Expand Down
8 changes: 6 additions & 2 deletions ai_bench/harness/runner/kernel_bench_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,13 @@ def __init__(
if self.is_torch_backend():
self.kernels = ai_utils.kernel_bench_dir() / "KernelBench"
elif self.backend == ai_hc.Backend.TRITON:
self.kernels = ai_utils.triton_kernels_dir() / "KernelBench"
self.kernels = (
ai_utils.triton_kernels_dir() / self.device.type / "KernelBench"
)
elif self.backend == ai_hc.Backend.HELION:
self.kernels = ai_utils.helion_kernels_dir() / "KernelBench"
self.kernels = (
ai_utils.helion_kernels_dir() / self.device.type / "KernelBench"
)
elif self.backend == ai_hc.Backend.MLIR:
self.kernels = (
ai_utils.mlir_kernels_dir() / self.device.type / "KernelBench"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Example Helion CUDA kernel
# Source: helion matmul example
# Status: Experimental / uncurated
# Expectation: Correctness-first, performance not representative

import helion
import helion.language as hl
import torch
import torch.nn as nn


# Helion kernel for a square matrix product. The decorator is given
# `static_shapes=True` and an explicit list of four candidate configurations.
# NOTE(review): the three `block_sizes` entries presumably correspond to the
# M, N and K tile sizes of the `hl.tile` loops in the body, in that order --
# confirm against the Helion documentation before editing them.
# NOTE(review): how Helion selects among several `configs` entries is not
# visible in this file -- confirm before adding or removing entries.
@helion.kernel(
    static_shapes=True,
    configs=[
        # Candidate 1: block sizes 64/128/16, flat program IDs, 8 warps.
        helion.Config(
            block_sizes=[64, 128, 16],
            indexing="tensor_descriptor",
            l2_groupings=[32],
            loop_orders=[[1, 0]],
            num_stages=2,
            num_warps=8,
            pid_type="flat",
            range_flattens=[None, None],
            range_multi_buffers=[None, None],
            range_num_stages=[0, 2],
            range_unroll_factors=[0, 1],
        ),
        # Candidate 2: block sizes 256/256/32, flat program IDs, 32 warps.
        helion.Config(
            block_sizes=[256, 256, 32],
            indexing="tensor_descriptor",
            l2_groupings=[4],
            loop_orders=[[0, 1]],
            num_stages=2,
            num_warps=32,
            pid_type="flat",
            range_flattens=[None, False],
            range_multi_buffers=[None, False],
            range_num_stages=[0, 2],
            range_unroll_factors=[0, 1],
        ),
        # Candidate 3: block sizes 256/128/32, persistent-interleaved program
        # IDs, 32 warps, 4 stages.
        helion.Config(
            block_sizes=[256, 128, 32],
            indexing="tensor_descriptor",
            l2_groupings=[32],
            loop_orders=[[0, 1]],
            num_stages=4,
            num_warps=32,
            pid_type="persistent_interleaved",
            range_flattens=[None, False],
            range_multi_buffers=[True, False],
            range_num_stages=[1, 4],
            range_unroll_factors=[4, 1],
        ),
        # Candidate 4: block sizes 128/256/16, persistent-interleaved program
        # IDs, 32 warps, 5 stages.
        helion.Config(
            block_sizes=[128, 256, 16],
            indexing="tensor_descriptor",
            l2_groupings=[4],
            loop_orders=[[0, 1]],
            num_stages=5,
            num_warps=32,
            pid_type="persistent_interleaved",
            range_flattens=[None, True],
            range_multi_buffers=[False, False],
            range_num_stages=[1, 4],
            range_unroll_factors=[2, 0],
        ),
    ],
)
def _square_matmul_kernel(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    """
    Multiply two square matrices with a tiled Helion kernel.
    C = A @ B (matrix product, not element-wise multiplication)

    Args:
        A: Left operand, a 2-D tensor of shape (N, N)
        B: Right operand, a 2-D tensor of shape (N, N)

    Returns:
        Newly allocated output matrix C of shape (N, N), created on A's
        device with dtype torch.promote_types(A.dtype, B.dtype)

    Note:
        The shape check is an ``assert``: it fails with the message
        "size mismatch: ..." unless all four dimensions of A and B are equal.
    """
    # Two-way unpacking: both inputs are expected to be 2-D.
    N, N2 = A.size()
    N3, N4 = B.size()
    # Both matrices must be square and of the same size.
    assert N == N2 == N3 == N4, f"size mismatch: A{A.size()}, B{B.size()}"

    # Output buffer; left uninitialised because every tile is written below.
    out = torch.empty(
        [N, N], dtype=torch.promote_types(A.dtype, B.dtype), device=A.device
    )

    # Outer loop: walk the (N, N) output in 2-D tiles.
    for tile_m, tile_n in hl.tile([N, N]):
        # The per-tile accumulator is always float32, independent of the
        # input dtypes.
        acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
        # Reduction loop over the shared dimension (columns of A, rows of B).
        for tile_k in hl.tile(N):
            # addmm computes acc + A_tile @ B_tile.
            acc = torch.addmm(acc, A[tile_m, tile_k], B[tile_k, tile_n])
        # Store the finished tile.
        # NOTE(review): acc is float32 while `out` uses the promoted input
        # dtype; presumably Helion converts on store -- confirm.
        out[tile_m, tile_n] = acc

    return out


class Model(nn.Module):
    """Thin ``nn.Module`` wrapper that routes ``forward`` to the Helion kernel."""

    def __init__(self, *args, **kwargs):
        """Initialise the module; constructor arguments are accepted and ignored."""
        super().__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
        """Return the result of ``_square_matmul_kernel(A, B)``."""
        product = _square_matmul_kernel(A, B)
        return product
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Example Helion XPU kernel
# Source: helion matmul example
# Status: Experimental / uncurated
# Expectation: Correctness-first, performance not representative

import helion
import helion.language as hl
import torch
Expand Down
Loading