From 7f9e6f4ed5ffc22e9f4d16f559c663dca302e03b Mon Sep 17 00:00:00 2001
From: Julian Oppermann <julian.oppermann@intel.com>
Date: Tue, 26 May 2026 14:26:51 -0700
Subject: [PATCH 1/3] Add LLM-generated level 1 kernels for Triton CPU

Co-authored-by: Marcin Spoczynski <marcin@spoczynski.com>
---
 .../cpu/KernelBench/level1/100_HingeLoss.py   |  81 ++++
 .../10_3D_tensor_matrix_multiplication.py     | 135 +++++++
 .../11_4D_tensor_matrix_multiplication.py     | 155 ++++++++
 .../12_Matmul_with_diagonal_matrices_.py      |  95 +++++
 .../13_Matmul_for_symmetric_matrices.py       | 151 ++++++++
 ...14_Matmul_for_upper_triangular_matrices.py | 153 ++++++++
 ...15_Matmul_for_lower_triangular_matrices.py | 146 ++++++++
 .../level1/16_Matmul_with_transposed_A.py     | 132 +++++++
 .../level1/17_Matmul_with_transposed_B.py     | 137 +++++++
 .../level1/18_Matmul_with_transposed_both.py  | 133 +++++++
 .../triton/cpu/KernelBench/level1/19_ReLU.py  |  63 ++++
 .../level1/1_Square_matrix_multiplication_.py | 134 ++++---
 .../cpu/KernelBench/level1/20_LeakyReLU.py    |  69 ++++
 .../cpu/KernelBench/level1/21_Sigmoid.py      |  64 ++++
 .../triton/cpu/KernelBench/level1/22_Tanh.py  |  67 ++++
 .../cpu/KernelBench/level1/23_Softmax.py      | 101 +++++
 .../cpu/KernelBench/level1/24_LogSoftmax.py   |  94 +++++
 .../triton/cpu/KernelBench/level1/25_Swish.py |  65 ++++
 .../triton/cpu/KernelBench/level1/26_GELU_.py |  64 ++++
 .../triton/cpu/KernelBench/level1/27_SELU_.py |  64 ++++
 .../cpu/KernelBench/level1/28_HardSigmoid.py  |  62 +++
 .../cpu/KernelBench/level1/29_Softplus.py     |  61 +++
 .../2_Standard_matrix_multiplication_.py      | 124 ++++++
 .../cpu/KernelBench/level1/30_Softsign.py     |  58 +++
 .../triton/cpu/KernelBench/level1/31_ELU.py   |  69 ++++
 .../cpu/KernelBench/level1/32_HardTanh.py     |  59 +++
 .../cpu/KernelBench/level1/33_BatchNorm.py    | 202 ++++++++++
 .../cpu/KernelBench/level1/34_InstanceNorm.py |  87 +++++
 .../cpu/KernelBench/level1/35_GroupNorm_.py   | 183 +++++++++
 .../cpu/KernelBench/level1/36_RMSNorm_.py     | 109 ++++++
 .../KernelBench/level1/37_FrobeniusNorm_.py   | 110 ++++++
 .../cpu/KernelBench/level1/38_L1Norm_.py      |  88 +++++
 .../cpu/KernelBench/level1/39_L2Norm_.py      |  84 +++++
 .../level1/3_Batched_matrix_multiplication.py | 147 ++++++++
 .../cpu/KernelBench/level1/40_LayerNorm.py    | 111 ++++++
 .../KernelBench/level1/41_Max_Pooling_1D.py   | 121 ++++++
 .../KernelBench/level1/42_Max_Pooling_2D.py   | 127 +++++++
 .../KernelBench/level1/43_Max_Pooling_3D.py   | 172 +++++++++
 .../level1/44_Average_Pooling_1D.py           | 125 +++++++
 .../level1/45_Average_Pooling_2D.py           | 115 ++++++
 .../level1/46_Average_Pooling_3D.py           | 157 ++++++++
 .../47_Sum_reduction_over_a_dimension.py      | 103 +++++
 .../48_Mean_reduction_over_a_dimension.py     | 106 ++++++
 .../49_Max_reduction_over_a_dimension.py      | 105 ++++++
 .../level1/4_Matrix_vector_multiplication_.py |  76 ++++
 ...tandard_2D__square_input__square_kernel.py | 232 ++++++++++++
 .../level1/51_Argmax_over_a_dimension.py      |  98 +++++
 .../level1/52_Argmin_over_a_dimension.py      | 119 ++++++
 .../53_Min_reduction_over_a_dimension.py      | 107 ++++++
 ...tandard_3D__square_input__square_kernel.py | 252 +++++++++++++
 ...ard_2D__asymmetric_input__square_kernel.py | 214 +++++++++++
 ...2D__asymmetric_input__asymmetric_kernel.py | 205 ++++++++++
 ...nsposed_2D__square_input__square_kernel.py | 229 ++++++++++++
 ...3D__asymmetric_input__asymmetric_kernel.py | 238 ++++++++++++
 ...ard_3D__asymmetric_input__square_kernel.py | 246 ++++++++++++
 .../level1/5_Matrix_scalar_multiplication.py  |  63 ++++
 ...ard_3D__square_input__asymmetric_kernel.py | 252 +++++++++++++
 ...nsposed_3D__square_input__square_kernel.py | 243 ++++++++++++
 ...ard_2D__square_input__asymmetric_kernel.py | 205 ++++++++++
 ...tandard_2D__square_input__square_kernel.py | 201 ++++++++++
 .../level1/64_conv_transposed_1D.py           | 195 ++++++++++
 ...sed_2D__square_input__asymmetric_kernel.py | 229 ++++++++++++
 ...3D__asymmetric_input__asymmetric_kernel.py | 242 ++++++++++++
 .../KernelBench/level1/67_conv_standard_1D.py | 167 +++++++++
 ...sed_3D__square_input__asymmetric_kernel.py | 239 ++++++++++++
 ...2D__asymmetric_input__asymmetric_kernel.py | 229 ++++++++++++
 .../6_Matmul_with_large_K_dimension_.py       | 133 +++++++
 ...sed_3D__asymmetric_input__square_kernel.py | 234 ++++++++++++
 ...sed_2D__asymmetric_input__square_kernel.py | 229 ++++++++++++
 ...metric_kernel___strided_padded_grouped_.py | 289 ++++++++++++++
 ..._square_kernel__strided_padded__grouped.py | 309 +++++++++++++++
 .../level1/74_conv_transposed_1D_dilated.py   | 199 ++++++++++
 ...strided__grouped____padded____dilated__.py | 353 ++++++++++++++++++
 .../76_conv_standard_1D_dilated_strided__.py  | 174 +++++++++
 ...kernel___padded____dilated____strided__.py | 259 +++++++++++++
 ...tric_input_asymmetric_kernel___padded__.py | 244 ++++++++++++
 ...kernel___padded____strided____dilated__.py | 180 +++++++++
 .../7_Matmul_with_small_K_dimension_.py       | 136 +++++++
 ...asymmetric_kernel___dilated____padded__.py | 245 ++++++++++++
 ...kernel___dilated____padded____strided__.py | 242 ++++++++++++
 ...depthwise_2D_square_input_square_kernel.py | 117 ++++++
 ...hwise_2D_square_input_asymmetric_kernel.py | 122 ++++++
 ...hwise_2D_asymmetric_input_square_kernel.py | 119 ++++++
 ...e_2D_asymmetric_input_asymmetric_kernel.py | 179 +++++++++
 .../level1/86_conv_depthwise_separable_2D.py  | 214 +++++++++++
 .../level1/87_conv_pointwise_2D.py            | 147 ++++++++
 .../KernelBench/level1/88_MinGPTNewGelu.py    |  65 ++++
 .../cpu/KernelBench/level1/89_cumsum.py       |  90 +++++
 .../level1/8_Matmul_with_irregular_shapes_.py | 128 +++++++
 .../cpu/KernelBench/level1/90_cumprod.py      |  94 +++++
 .../KernelBench/level1/91_cumsum_reverse.py   |  91 +++++
 .../KernelBench/level1/92_cumsum_exclusive.py |  88 +++++
 .../KernelBench/level1/93_masked_cumsum.py    |  89 +++++
 .../cpu/KernelBench/level1/94_MSELoss.py      | 107 ++++++
 .../KernelBench/level1/95_CrossEntropyLoss.py | 107 ++++++
 .../cpu/KernelBench/level1/96_HuberLoss.py    |  97 +++++
 .../level1/97_ScaledDotProductAttention.py    | 280 ++++++++++++++
 .../cpu/KernelBench/level1/98_KLDivLoss.py    | 103 +++++
 .../level1/99_TripletMarginLoss.py            | 127 +++++++
 .../9_Tall_skinny_matrix_multiplication_.py   | 141 +++++++
 .../KernelBench/level1/100_HingeLoss.yaml     |   7 +
 .../10_3D_tensor_matrix_multiplication.yaml   |  10 +
 .../11_4D_tensor_matrix_multiplication.yaml   |  11 +
 .../12_Matmul_with_diagonal_matrices_.yaml    |   8 +
 .../13_Matmul_for_symmetric_matrices.yaml     |   6 +
 ..._Matmul_for_upper_triangular_matrices.yaml |   6 +
 ..._Matmul_for_lower_triangular_matrices.yaml |   6 +
 .../level1/16_Matmul_with_transposed_A.yaml   |   9 +
 .../level1/17_Matmul_with_transposed_B.yaml   |   9 +
 .../18_Matmul_with_transposed_both.yaml       |   9 +
 .../specs/KernelBench/level1/19_ReLU.yaml     |   7 +
 .../1_Square_matrix_multiplication_.yaml      |   6 +
 .../KernelBench/level1/20_LeakyReLU.yaml      |   8 +
 .../specs/KernelBench/level1/21_Sigmoid.yaml  |   7 +
 .../specs/KernelBench/level1/22_Tanh.yaml     |   7 +
 .../specs/KernelBench/level1/23_Softmax.yaml  |   7 +
 .../KernelBench/level1/24_LogSoftmax.yaml     |   7 +
 .../specs/KernelBench/level1/25_Swish.yaml    |   7 +
 .../specs/KernelBench/level1/26_GELU_.yaml    |   7 +
 .../specs/KernelBench/level1/27_SELU_.yaml    |   7 +
 .../KernelBench/level1/28_HardSigmoid.yaml    |   7 +
 .../specs/KernelBench/level1/29_Softplus.yaml |   7 +
 .../2_Standard_matrix_multiplication_.yaml    |   8 +
 .../specs/KernelBench/level1/30_Softsign.yaml |   7 +
 problems/specs/KernelBench/level1/31_ELU.yaml |   8 +
 .../specs/KernelBench/level1/32_HardTanh.yaml |   7 +
 .../KernelBench/level1/33_BatchNorm.yaml      |   9 +
 .../KernelBench/level1/34_InstanceNorm.yaml   |   9 +
 .../KernelBench/level1/35_GroupNorm_.yaml     |  10 +
 .../specs/KernelBench/level1/36_RMSNorm_.yaml |   9 +
 .../KernelBench/level1/37_FrobeniusNorm_.yaml |   9 +
 .../specs/KernelBench/level1/38_L1Norm_.yaml  |   7 +
 .../specs/KernelBench/level1/39_L2Norm_.yaml  |   7 +
 .../3_Batched_matrix_multiplication.yaml      |   9 +
 .../KernelBench/level1/40_LayerNorm.yaml      |  10 +
 .../KernelBench/level1/41_Max_Pooling_1D.yaml |  13 +
 .../KernelBench/level1/42_Max_Pooling_2D.yaml |  13 +
 .../KernelBench/level1/43_Max_Pooling_3D.yaml |  17 +-
 .../level1/44_Average_Pooling_1D.yaml         |  11 +
 .../level1/45_Average_Pooling_2D.yaml         |  10 +
 .../level1/46_Average_Pooling_3D.yaml         |  13 +
 .../47_Sum_reduction_over_a_dimension.yaml    |   9 +
 .../48_Mean_reduction_over_a_dimension.yaml   |   9 +
 .../49_Max_reduction_over_a_dimension.yaml    |   9 +
 .../4_Matrix_vector_multiplication_.yaml      |   9 +
 ...ndard_2D__square_input__square_kernel.yaml |  10 +
 .../level1/51_Argmax_over_a_dimension.yaml    |   9 +
 .../level1/52_Argmin_over_a_dimension.yaml    |   9 +
 .../53_Min_reduction_over_a_dimension.yaml    |   9 +
 ...ndard_3D__square_input__square_kernel.yaml |  12 +
 ...d_2D__asymmetric_input__square_kernel.yaml |  11 +
 ...__asymmetric_input__asymmetric_kernel.yaml |  12 +
 ...posed_2D__square_input__square_kernel.yaml |  11 +
 ...__asymmetric_input__asymmetric_kernel.yaml |  12 +
 ...d_3D__asymmetric_input__square_kernel.yaml |  12 +
 .../5_Matrix_scalar_multiplication.yaml       |   9 +
 ...d_3D__square_input__asymmetric_kernel.yaml |  12 +
 ...posed_3D__square_input__square_kernel.yaml |  12 +
 ...d_2D__square_input__asymmetric_kernel.yaml |  11 +
 ...ndard_2D__square_input__square_kernel.yaml |  11 +
 .../level1/64_conv_transposed_1D.yaml         |  10 +
 ...d_2D__square_input__asymmetric_kernel.yaml |  11 +
 ...__asymmetric_input__asymmetric_kernel.yaml |  12 +
 .../level1/67_conv_standard_1D.yaml           |  10 +
 ...d_3D__square_input__asymmetric_kernel.yaml |  12 +
 ...__asymmetric_input__asymmetric_kernel.yaml |  11 +
 .../6_Matmul_with_large_K_dimension_.yaml     |   9 +
 ...d_3D__asymmetric_input__square_kernel.yaml |  12 +
 ...d_2D__asymmetric_input__square_kernel.yaml |  11 +
 ...tric_kernel___strided_padded_grouped_.yaml |  16 +
 ...quare_kernel__strided_padded__grouped.yaml |  15 +
 .../level1/74_conv_transposed_1D_dilated.yaml |  13 +
 ...rided__grouped____padded____dilated__.yaml |  15 +
 ...76_conv_standard_1D_dilated_strided__.yaml |  12 +
 ...rnel___padded____dilated____strided__.yaml |  15 +
 ...ic_input_asymmetric_kernel___padded__.yaml |  13 +
 ...rnel___padded____strided____dilated__.yaml |  13 +
 .../7_Matmul_with_small_K_dimension_.yaml     |   9 +
 ...ymmetric_kernel___dilated____padded__.yaml |  14 +
 ...rnel___dilated____padded____strided__.yaml |  14 +
 ...pthwise_2D_square_input_square_kernel.yaml |  12 +
 ...ise_2D_square_input_asymmetric_kernel.yaml |  13 +
 ...ise_2D_asymmetric_input_square_kernel.yaml |  13 +
 ...2D_asymmetric_input_asymmetric_kernel.yaml |  19 +
 .../86_conv_depthwise_separable_2D.yaml       |  15 +
 .../level1/87_conv_pointwise_2D.yaml          |  10 +
 .../KernelBench/level1/88_MinGPTNewGelu.yaml  |   7 +
 .../specs/KernelBench/level1/89_cumsum.yaml   |   8 +
 .../8_Matmul_with_irregular_shapes_.yaml      |   9 +
 .../specs/KernelBench/level1/90_cumprod.yaml  |   8 +
 .../KernelBench/level1/91_cumsum_reverse.yaml |   8 +
 .../level1/92_cumsum_exclusive.yaml           |   8 +
 .../KernelBench/level1/93_masked_cumsum.yaml  |   8 +
 .../specs/KernelBench/level1/94_MSELoss.yaml  |   7 +
 .../level1/95_CrossEntropyLoss.yaml           |   7 +
 .../KernelBench/level1/96_HuberLoss.yaml      |   7 +
 .../level1/97_ScaledDotProductAttention.yaml  |   9 +
 .../KernelBench/level1/98_KLDivLoss.yaml      |   7 +
 .../level1/99_TripletMarginLoss.yaml          |   8 +
 .../9_Tall_skinny_matrix_multiplication_.yaml |   8 +
 pyproject.toml                                |   2 +-
 201 files changed, 15733 insertions(+), 56 deletions(-)
 create mode 100644 backends/triton/cpu/KernelBench/level1/100_HingeLoss.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/19_ReLU.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/21_Sigmoid.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/22_Tanh.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/23_Softmax.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/25_Swish.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/26_GELU_.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/27_SELU_.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/29_Softplus.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/30_Softsign.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/31_ELU.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/32_HardTanh.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/33_BatchNorm.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/34_InstanceNorm.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/38_L1Norm_.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/39_L2Norm_.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/40_LayerNorm.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/44_Average_Pooling_1D.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/45_Average_Pooling_2D.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/46_Average_Pooling_3D.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/48_Mean_reduction_over_a_dimension.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/49_Max_reduction_over_a_dimension.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/4_Matrix_vector_multiplication_.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/53_Min_reduction_over_a_dimension.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/64_conv_transposed_1D.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/67_conv_standard_1D.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/6_Matmul_with_large_K_dimension_.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/7_Matmul_with_small_K_dimension_.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/86_conv_depthwise_separable_2D.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/87_conv_pointwise_2D.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/89_cumsum.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/90_cumprod.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/91_cumsum_reverse.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/92_cumsum_exclusive.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/93_masked_cumsum.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/94_MSELoss.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/96_HuberLoss.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/97_ScaledDotProductAttention.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/99_TripletMarginLoss.py
 create mode 100644 backends/triton/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py

diff --git a/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py b/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py
new file mode 100644
index 0000000..1903395
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py
@@ -0,0 +1,81 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+    ],
+    key=["D"],
+)
+@triton.jit
+def _hinge_loss_kernel(
+    pred_ptr,
+    targ_ptr,
+    out_ptr,
+    B,
+    D,
+    stride_pb,
+    BLOCK_SIZE: tl.constexpr,
+):
+    row_idx = tl.program_id(0)
+    row_start = row_idx * stride_pb
+
+    acc = 0.0
+
+    for col_start in range(0, D, BLOCK_SIZE):
+        cols = col_start + tl.arange(0, BLOCK_SIZE)
+        mask = cols < D
+
+        targ = tl.load(targ_ptr + cols, mask=mask, other=0.0).to(tl.float32)
+        pred = tl.load(pred_ptr + row_start + cols, mask=mask, other=0.0).to(tl.float32)
+
+        hinge = tl.maximum(1.0 - pred * targ, 0.0)
+        acc += tl.sum(hinge, axis=0)
+
+    tl.store(out_ptr + row_idx, acc.to(tl.float32))
+
+
+class Model(nn.Module):  # hinge loss: mean(max(1 - predictions * targets, 0))
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, predictions, targets):  # returns a 0-dim float32 tensor
+        B, D = predictions.shape
+        row_sums = torch.empty(B, device=predictions.device, dtype=torch.float32)
+        # One kernel program per row; each writes that row's hinge sum.
+        grid = (B,)
+        _hinge_loss_kernel[grid](
+            predictions,
+            targets,  # indexed by column in the kernel, so it needs D entries
+            row_sums,
+            B,
+            D,
+            predictions.stride(0),  # row stride only; unit column stride assumed
+        )
+        # Finish the reduction with torch and normalise by the element count.
+        return row_sums.sum() / (B * D)
+
+
+batch_size = 32768  # number of prediction rows and of targets in get_inputs()
+input_shape = (32768,)  # trailing shape of each prediction row in get_inputs()
+dim = 1  # not referenced anywhere else in this file
+
+
+def get_inputs():  # random [predictions, targets] pair for Model.forward
+    return [
+        torch.rand(batch_size, *input_shape),  # predictions, uniform in [0, 1)
+        torch.randint(0, 2, (batch_size,)).float() * 2 - 1,  # targets in {-1, +1}
+    ]
+
+
+def get_init_inputs():
+    return []  # Model.__init__ takes no arguments
diff --git a/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py b/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py
new file mode 100644
index 0000000..b6629b8
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py
@@ -0,0 +1,135 @@
+# ruff: noqa: E731, E741
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def _configs():  # autotune candidates for _matmul_kernel (currently a single one)
+    return [
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+    ]
+
+
+@triton.autotune(configs=_configs(), key=["M", "N", "K"])
+@triton.jit
+def _matmul_kernel(  # C[M, N] = A[M, K] @ B[K, N]; one output tile per program
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    M,
+    N,
+    K,
+    stride_am: tl.constexpr,
+    stride_ak: tl.constexpr,
+    stride_bk: tl.constexpr,
+    stride_bn: tl.constexpr,
+    stride_cm: tl.constexpr,
+    stride_cn: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,  # row tiles per group in the pid -> tile mapping
+):
+    pid = tl.program_id(0)
+    # Tile counts along each output dimension, and pids covered by a full group.
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    # Programs are numbered group by group; the last group may hold fewer rows.
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    # Within a group, consecutive pids step through the rows before the columns.
+    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+    # Descriptors present A and B as 2-D tensors that are read block by block.
+    a_desc = tl.make_tensor_descriptor(
+        base=a_ptr,
+        shape=(M, K),
+        strides=(stride_am, stride_ak),
+        block_shape=(BLOCK_M, BLOCK_K),
+    )
+    b_desc = tl.make_tensor_descriptor(
+        base=b_ptr,
+        shape=(K, N),
+        strides=(stride_bk, stride_bn),
+        block_shape=(BLOCK_K, BLOCK_N),
+    )
+    # Accumulate in float32 over K, one BLOCK_K-wide slab per iteration.
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for off_k in range(0, K, BLOCK_K):  # partial tiles are left to the descriptor
+        a_tile = a_desc.load([pid_m * BLOCK_M, off_k])
+        b_tile = b_desc.load([off_k, pid_n * BLOCK_N])
+        acc += tl.dot(a_tile, b_tile)
+    c_desc = tl.make_tensor_descriptor(
+        base=c_ptr,
+        shape=(M, N),
+        strides=(stride_cm, stride_cn),
+        block_shape=(BLOCK_M, BLOCK_N),
+    )
+    c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty))
+
+
+N = 16  # leading (batch) dimension of A in get_inputs()
+M = 1024  # rows per batch entry of A
+K = 2048  # inner dimension shared by A and B
+L = 768  # columns of B
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A, B):
+        batch, m, k = A.shape
+        _, l = B.shape
+
+        a = A.to(torch.bfloat16).contiguous()
+        b = B.to(torch.bfloat16).contiguous()
+
+        a_flat = a.reshape(batch * m, k)
+        total_m = batch * m
+
+        c_flat = torch.empty((total_m, l), device=a.device, dtype=torch.bfloat16)
+
+        def grid(META):
+            return (
+                triton.cdiv(total_m, META["BLOCK_M"]) * triton.cdiv(l, META["BLOCK_N"]),
+            )
+
+        _matmul_kernel[grid](
+            a_flat,
+            b,
+            c_flat,
+            total_m,
+            l,
+            k,
+            a_flat.stride(0),
+            a_flat.stride(1),
+            b.stride(0),
+            b.stride(1),
+            c_flat.stride(0),
+            c_flat.stride(1),
+        )
+
+        return c_flat.reshape(batch, m, l)
+
+
+def get_inputs():  # random bfloat16 inputs: A is (N, M, K), B is (K, L)
+    A = torch.rand(N, M, K, dtype=torch.bfloat16)
+    B = torch.rand(K, L, dtype=torch.bfloat16)
+    return [A, B]
+
+
+def get_init_inputs():
+    return []  # Model.__init__ takes no arguments
diff --git a/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py b/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py
new file mode 100644
index 0000000..10f3601
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py
@@ -0,0 +1,155 @@
+# ruff: noqa: E731, E741
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def swizzle_tile(  # map a linear tile id to (pid_m, pid_n) tile coordinates
+    tile_id,
+    M,
+    N,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,  # row tiles per group
+):
+    grid_m = tl.cdiv(M, BLOCK_M)  # number of row tiles
+    grid_n = tl.cdiv(N, BLOCK_N)  # number of column tiles
+    width = GROUP_SIZE_M * grid_n  # tile ids covered by one full group
+    group_id = tile_id // width  # the final group may span fewer row tiles
+    group_size = tl.minimum(GROUP_SIZE_M, grid_m - group_id * GROUP_SIZE_M)
+    pid_m = group_id * GROUP_SIZE_M + ((tile_id % width) % group_size)
+    pid_n = (tile_id % width) // group_size  # rows vary fastest within a group
+    return pid_m, pid_n
+
+
+def get_autotune_configs():  # autotune candidates for _gemm_kernel (a single one)
+    return [
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+    ]
+
+
+@triton.autotune(
+    configs=get_autotune_configs(),
+    key=["M", "N", "K"],
+)
+@triton.jit
+def _gemm_kernel(  # C[M, N] = A[M, K] @ B[K, N]; one output tile per program
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    M,
+    N,
+    K,
+    stride_am: tl.constexpr,
+    stride_ak: tl.constexpr,
+    stride_bk: tl.constexpr,
+    stride_bn: tl.constexpr,
+    stride_cm: tl.constexpr,
+    stride_cn: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    pid_m, pid_n = swizzle_tile(pid, M, N, BLOCK_M, BLOCK_N, GROUP_SIZE_M)
+    # Descriptors present A and B as 2-D tensors that are read block by block.
+    a_desc = tl.make_tensor_descriptor(
+        base=a_ptr,
+        shape=(M, K),
+        strides=(stride_am, stride_ak),
+        block_shape=(BLOCK_M, BLOCK_K),
+    )
+    b_desc = tl.make_tensor_descriptor(
+        base=b_ptr,
+        shape=(K, N),
+        strides=(stride_bk, stride_bn),
+        block_shape=(BLOCK_K, BLOCK_N),
+    )
+    # Accumulate in float32 over K, one BLOCK_K-wide slab per iteration.
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for off_k in range(0, K, BLOCK_K):  # partial tiles are left to the descriptor
+        a_block = a_desc.load([pid_m * BLOCK_M, off_k])
+        b_block = b_desc.load([off_k, pid_n * BLOCK_N])
+        acc += tl.dot(a_block, b_block)
+    c_desc = tl.make_tensor_descriptor(
+        base=c_ptr,
+        shape=(M, N),
+        strides=(stride_cm, stride_cn),
+        block_shape=(BLOCK_M, BLOCK_N),
+    )
+    c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty))
+
+
+class Model(nn.Module):  # (b, i, j, l) x (l, k) -> (b, i, j, k) via one 2-D GEMM
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A, B):  # result is returned in A's dtype
+        b_dim, i_dim, j_dim, l_dim = A.shape
+        k_dim = B.shape[1]
+        # Collapse A's three leading dims into rows; operands become bfloat16.
+        A_flat = A.contiguous().view(-1, l_dim)
+        if A_flat.dtype != torch.bfloat16:
+            A_flat = A_flat.to(torch.bfloat16)
+        B_fp16 = B.contiguous()  # NOTE: despite the name, converted to bfloat16 below
+        if B_fp16.dtype != torch.bfloat16:
+            B_fp16 = B_fp16.to(torch.bfloat16)
+        # GEMM problem sizes: (M, K) @ (K, N).
+        M = A_flat.shape[0]
+        N = k_dim
+        K = l_dim
+        # Output buffer written by the kernel.
+        C_2d = torch.empty((M, N), device=A.device, dtype=torch.bfloat16)
+        # 1-D launch grid: one program per (BLOCK_M, BLOCK_N) output tile.
+        grid = lambda META: (
+            triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),
+        )
+
+        _gemm_kernel[grid](
+            A_flat,
+            B_fp16,
+            C_2d,
+            M,
+            N,
+            K,
+            A_flat.stride(0),
+            A_flat.stride(1),
+            B_fp16.stride(0),
+            B_fp16.stride(1),
+            C_2d.stride(0),
+            C_2d.stride(1),
+        )
+        # Restore the leading dims and, if needed, the caller's original dtype.
+        result = C_2d.view(b_dim, i_dim, j_dim, k_dim)
+        if A.dtype != torch.bfloat16:
+            result = result.to(A.dtype)
+        return result
+
+
+b = 8  # first dimension of A in get_inputs()
+i = 256  # second dimension of A
+j = 512  # third dimension of A
+l = 256  # contracted dimension shared by A and B
+k = 768  # columns of B
+
+
+def get_inputs():  # random bfloat16 inputs: A is (b, i, j, l), B is (l, k)
+    A = torch.rand(b, i, j, l, dtype=torch.bfloat16)
+    B = torch.rand(l, k, dtype=torch.bfloat16)
+    return [A, B]
+
+
+def get_init_inputs():
+    return []  # Model.__init__ takes no arguments
diff --git a/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py b/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py
new file mode 100644
index 0000000..499ab2a
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py
@@ -0,0 +1,95 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_M": 64, "BLOCK_N": 128}, num_warps=32, num_stages=2),
+    ],
+    key=["N", "M"],
+)
+@triton.jit
+def _diag_matmul_kernel(
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    N,
+    M,
+    stride_bn,
+    stride_bm,
+    stride_cn,
+    stride_cm,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    """Compute one (BLOCK_N, BLOCK_M) tile of ``C[n, m] = a[n] * B[n, m]``.
+
+    ``a_ptr`` is indexed with unit stride; ``B`` and ``C`` use the given strides.
+    Program axis 0 walks row tiles (N), axis 1 walks column tiles (M).
+    """
+    offs_n = tl.program_id(0) * BLOCK_N + tl.arange(0, BLOCK_N)
+    offs_m = tl.program_id(1) * BLOCK_M + tl.arange(0, BLOCK_M)
+    # Edge tiles are masked so nothing outside the (N, M) extent is touched.
+    mask_n = offs_n < N
+    mask = mask_n[:, None] & (offs_m < M)[None, :]
+
+    a_vals = tl.load(a_ptr + offs_n, mask=mask_n, other=0.0)
+    b_ptrs = b_ptr + offs_n[:, None] * stride_bn + offs_m[None, :] * stride_bm
+    b_vals = tl.load(b_ptrs, mask=mask, other=0.0)
+    # Multiply in float32, then round once to the output buffer's element type.
+    c_vals = a_vals[:, None].to(tl.float32) * b_vals.to(tl.float32)
+
+    c_ptrs = c_ptr + offs_n[:, None] * stride_cn + offs_m[None, :] * stride_cm
+    tl.store(c_ptrs, c_vals.to(c_ptr.type.element_ty), mask=mask)
+
+
+class Model(nn.Module):
+    """Compute ``diag(A) @ B`` without materialising the diagonal matrix."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, A, B):
+        """Return the (N, M) tensor ``C[n, m] = A[n] * B[n, m]``.
+
+        ``A`` is the diagonal, given as a 1-D tensor of length N; ``B`` is
+        (N, M).  The result takes ``A``'s dtype and device.
+
+        Raises:
+            ValueError: if the shapes are not ``(N,)`` and ``(N, M)``.
+        """
+        if A.dim() != 1 or B.dim() != 2 or A.shape[0] != B.shape[0]:
+            raise ValueError(f"Incompatible shapes: {A.shape} vs {B.shape}")
+        N, M = B.shape
+        # The kernel reads ``A`` with unit stride, so strided views are compacted.
+        A = A.contiguous()
+        C = torch.empty((N, M), device=A.device, dtype=A.dtype)
+
+        grid = lambda META: (
+            triton.cdiv(N, META["BLOCK_N"]),
+            triton.cdiv(M, META["BLOCK_M"]),
+        )
+        _diag_matmul_kernel[grid](A, B, C, N, M, *B.stride(), *C.stride())
+        return C
+
+
+M = 4096  # number of columns of B
+N = 4096  # length of A and number of rows of B
+
+
+def get_inputs():  # random bfloat16 operands; presumably fed to Model.forward
+    A = torch.rand(N, dtype=torch.bfloat16)  # 1-D diagonal entries, length N
+    B = torch.rand(N, M, dtype=torch.bfloat16)  # dense (N, M) matrix
+    return [A, B]
+
+
+def get_init_inputs():  # presumably the constructor arguments for Model
+    return []  # empty: Model.__init__ takes no arguments
diff --git a/backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py b/backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py
new file mode 100644
index 0000000..13dc6df
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py
@@ -0,0 +1,151 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def swizzle_tile(
+    tile_id,
+    M,
+    N,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    """Map a linear tile id to ``(pid_m, pid_n)``, grouping GROUP_SIZE_M row tiles."""
+    tiles_per_band = GROUP_SIZE_M * tl.cdiv(N, BLOCK_N)
+    band = tile_id // tiles_per_band
+    first_row = band * GROUP_SIZE_M
+    rows_in_band = tl.minimum(GROUP_SIZE_M, tl.cdiv(M, BLOCK_M) - first_row)
+    pid_m = first_row + (tile_id % rows_in_band)
+    pid_n = (tile_id % tiles_per_band) // rows_in_band
+    return pid_m, pid_n
+
+
+def _configs():
+    """Return the autotune candidate list for ``_matmul_kernel``.
+
+    Only one candidate is listed.
+    """
+    meta = {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}
+    candidate = triton.Config(meta, num_warps=32, num_stages=2)
+    return [candidate]
+
+
+@triton.autotune(configs=_configs(), key=["M", "N", "K"])
+@triton.jit
+def _matmul_kernel(  # C (M, N) = A (M, K) @ B (K, N); one output tile per program
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    M,
+    N,
+    K,
+    stride_am: tl.constexpr,
+    stride_ak: tl.constexpr,
+    stride_bk: tl.constexpr,
+    stride_bn: tl.constexpr,
+    stride_cm: tl.constexpr,
+    stride_cn: tl.constexpr,
+    DIVISIBLE_M: tl.constexpr,  # NOTE(review): DIVISIBLE_* are not read in the body
+    DIVISIBLE_N: tl.constexpr,
+    DIVISIBLE_K: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    pid = tl.program_id(0)  # linear tile id on the 1D launch grid
+    pid_m, pid_n = swizzle_tile(pid, M, N, BLOCK_M, BLOCK_N, GROUP_SIZE_M)
+
+    a_desc = tl.make_tensor_descriptor(
+        base=a_ptr,
+        shape=(M, K),
+        strides=(stride_am, stride_ak),
+        block_shape=(BLOCK_M, BLOCK_K),
+    )
+    b_desc = tl.make_tensor_descriptor(
+        base=b_ptr,
+        shape=(K, N),
+        strides=(stride_bk, stride_bn),
+        block_shape=(BLOCK_K, BLOCK_N),
+    )
+
+    off_m = pid_m * BLOCK_M  # first output row of this tile
+    off_n = pid_n * BLOCK_N  # first output column of this tile
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)  # float32 accumulator
+
+    for off_k in range(0, K, BLOCK_K):  # walk the shared K dimension tile by tile
+        a_tile = a_desc.load([off_m, off_k])
+        b_tile = b_desc.load([off_k, off_n])
+        acc += tl.dot(a_tile, b_tile)
+    c_desc = tl.make_tensor_descriptor(
+        base=c_ptr,
+        shape=(M, N),
+        strides=(stride_cm, stride_cn),
+        block_shape=(BLOCK_M, BLOCK_N),
+    )
+    c_desc.store([off_m, off_n], acc.to(c_ptr.type.element_ty))  # cast to C's dtype
+
+
+class Model(nn.Module):
+    """Dense 2D matrix product computed in bfloat16 by ``_matmul_kernel``."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, A, B):
+        """Return the 2D product ``A @ B`` as a bfloat16 tensor.
+
+        Inputs of any other dtype are converted to bfloat16 first, and
+        non-contiguous inputs are compacted before the launch.
+        """
+        lhs = A.to(torch.bfloat16).contiguous()
+        rhs = B.to(torch.bfloat16).contiguous()
+
+        rows, inner = lhs.shape
+        cols = rhs.shape[1]
+        out = torch.empty((rows, cols), device=lhs.device, dtype=torch.bfloat16)
+
+        def grid(META):
+            n_tiles_m = triton.cdiv(rows, META["BLOCK_M"])
+            n_tiles_n = triton.cdiv(cols, META["BLOCK_N"])
+            return (n_tiles_m * n_tiles_n,)
+
+        _matmul_kernel[grid](
+            lhs,
+            rhs,
+            out,
+            rows,
+            cols,
+            inner,
+            *lhs.stride(),
+            *rhs.stride(),
+            *out.stride(),
+            DIVISIBLE_M=(rows % 256 == 0),
+            DIVISIBLE_N=(cols % 128 == 0),
+            DIVISIBLE_K=(inner % 32 == 0),
+        )
+        return out
+
+
+N = 4096  # side length of the square inputs
+
+
+def get_inputs():  # two random symmetric bfloat16 matrices of shape (N, N)
+    A = torch.rand(N, N, dtype=torch.bfloat16)
+    A = (A + A.T) / 2  # average with the transpose to make A symmetric
+    B = torch.rand(N, N, dtype=torch.bfloat16)
+    B = (B + B.T) / 2  # same symmetrisation for B
+    return [A, B]
+
+
+def get_init_inputs():  # presumably the constructor arguments for Model
+    return []  # empty: Model.__init__ takes no arguments
diff --git a/backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py b/backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py
new file mode 100644
index 0000000..184f008
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py
@@ -0,0 +1,153 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def swizzle_tile(  # map a linear tile id to (pid_m, pid_n) in row groups
+    tile_id,
+    M,
+    N,
+    K,  # NOTE(review): K and BLOCK_K are accepted but not used in the body
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    grid_m = tl.cdiv(M, BLOCK_M)  # number of tile rows
+    grid_n = tl.cdiv(N, BLOCK_N)  # number of tile columns
+    width = GROUP_SIZE_M * grid_n  # tile ids covered by one full group
+    group_id = tile_id // width
+    group_size = tl.minimum(GROUP_SIZE_M, grid_m - group_id * GROUP_SIZE_M)
+    pid_m = group_id * GROUP_SIZE_M + (tile_id % group_size)  # row tile index
+    pid_n = (tile_id % width) // group_size  # column tile index
+    return pid_m, pid_n
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+    ],
+    key=["M", "N", "K"],
+)
+@triton.jit
+def _triu_matmul_kernel(  # one (BLOCK_M, BLOCK_N) tile of triu(A @ B) per program
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    M,
+    N,
+    K,
+    stride_am: tl.constexpr,
+    stride_ak: tl.constexpr,
+    stride_bk: tl.constexpr,
+    stride_bn: tl.constexpr,
+    stride_cm: tl.constexpr,
+    stride_cn: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    pid = tl.program_id(0)  # linear tile id on the 1D launch grid
+    pid_m, pid_n = swizzle_tile(pid, M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, GROUP_SIZE_M)
+
+    off_m = pid_m * BLOCK_M  # first output row of this tile
+    off_n = pid_n * BLOCK_N  # first output column of this tile
+
+    # Skip tiles entirely below the diagonal (output is upper triangular)
+    if off_m >= off_n + BLOCK_N:  # smallest row index exceeds largest column index
+        return  # nothing is stored for this tile
+
+    # K-loop trimming: A is upper tri so A[i,k]=0 for k<i, B is upper tri so B[k,j]=0 for k>j
+    # Effective K range: [off_m, min(off_n + BLOCK_N, K))
+    k_start = (off_m // BLOCK_K) * BLOCK_K  # round off_m down to a BLOCK_K multiple
+    k_end_raw = off_n + BLOCK_N
+    k_end = k_end_raw if k_end_raw < K else K  # clamp to the K extent
+
+    a_desc = tl.make_tensor_descriptor(
+        base=a_ptr,
+        shape=(M, K),
+        strides=(stride_am, stride_ak),
+        block_shape=(BLOCK_M, BLOCK_K),
+    )
+    b_desc = tl.make_tensor_descriptor(
+        base=b_ptr,
+        shape=(K, N),
+        strides=(stride_bk, stride_bn),
+        block_shape=(BLOCK_K, BLOCK_N),
+    )
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)  # float32 accumulator
+
+    for k_offset in range(k_start, k_end, BLOCK_K):
+        a_block = a_desc.load([off_m, k_offset])
+        b_block = b_desc.load([k_offset, off_n])
+        acc += tl.dot(a_block, b_block)
+    # Apply triu mask
+    row_idx = off_m + tl.arange(0, BLOCK_M)
+    col_idx = off_n + tl.arange(0, BLOCK_N)
+    triu_mask = row_idx[:, None] <= col_idx[None, :]  # True on or above the diagonal
+    acc = tl.where(triu_mask, acc, 0.0)
+
+    c_desc = tl.make_tensor_descriptor(
+        base=c_ptr,
+        shape=(M, N),
+        strides=(stride_cm, stride_cn),
+        block_shape=(BLOCK_M, BLOCK_N),
+    )
+    c_desc.store([off_m, off_n], acc.to(c_ptr.type.element_ty))  # cast to C's dtype
+
+
+class Model(nn.Module):
+    """Product of two upper-triangular matrices via ``_triu_matmul_kernel``."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, A, B):
+        """Return ``triu(A @ B)``; ``A`` and ``B`` are assumed upper triangular.
+
+        Entries below the diagonal of the result are left at zero.
+
+        Raises:
+            ValueError: if the inner dimensions of ``A`` and ``B`` differ.
+        """
+        M, K = A.shape
+        K2, N = B.shape
+        if K != K2:
+            raise ValueError(f"Incompatible K dimensions: {K} vs {K2}")
+        # Contiguous layouts give the tensor descriptors a unit inner stride.
+        A, B = A.contiguous(), B.contiguous()
+        # Zero-filled: tiles below the diagonal are skipped and never written.
+        C = torch.zeros((M, N), device=A.device, dtype=A.dtype)
+
+        grid = lambda META: (
+            triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),
+        )
+        _triu_matmul_kernel[grid](A, B, C, M, N, K, *A.stride(), *B.stride(), *C.stride())
+        return C
+
+
+N = 4096  # side length of the square inputs
+
+
+def get_inputs():  # two random upper-triangular bfloat16 matrices, (N, N) each
+    A = torch.triu(torch.rand(N, N, dtype=torch.bfloat16))  # zero below the diagonal
+    B = torch.triu(torch.rand(N, N, dtype=torch.bfloat16))  # zero below the diagonal
+    return [A, B]
+
+
+def get_init_inputs():  # presumably the constructor arguments for Model
+    return []  # empty: Model.__init__ takes no arguments
diff --git a/backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py b/backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py
new file mode 100644
index 0000000..8b755b7
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py
@@ -0,0 +1,146 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def swizzle_tile(
+    tile_id,
+    M,
+    N,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    """Turn a linear tile id into ``(pid_m, pid_n)`` using grouped row ordering."""
+    ids_per_group = tl.cdiv(N, BLOCK_N) * GROUP_SIZE_M
+    group = tile_id // ids_per_group
+    base_m = group * GROUP_SIZE_M
+    rows_here = tl.minimum(GROUP_SIZE_M, tl.cdiv(M, BLOCK_M) - base_m)
+    pid_m = base_m + (tile_id % rows_here)
+    pid_n = (tile_id % ids_per_group) // rows_here
+    return pid_m, pid_n
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+    ],
+    key=["M"],
+)
+@triton.jit
+def tril_matmul_kernel(  # one (BLOCK_M, BLOCK_N) tile of tril(A @ B) per program
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    M,  # side length; A, B and C are all treated as (M, M)
+    stride_am: tl.constexpr,
+    stride_ak: tl.constexpr,
+    stride_bk: tl.constexpr,
+    stride_bn: tl.constexpr,
+    stride_cm,
+    stride_cn,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    pid = tl.program_id(0)  # linear tile id on the 1D launch grid
+    pid_m, pid_n = swizzle_tile(pid, M, M, BLOCK_M, BLOCK_N, GROUP_SIZE_M)
+
+    off_m = pid_m * BLOCK_M  # first output row of this tile
+    off_n = pid_n * BLOCK_N  # first output column of this tile
+
+    # Skip tiles entirely in the upper triangle
+    if off_n > off_m + BLOCK_M - 1:  # smallest column index exceeds largest row index
+        return  # nothing is stored for this tile
+
+    # K-range optimization for triangular matrices:
+    # A is lower triangular: A[i,k]=0 for k>i, so max useful K = off_m + BLOCK_M
+    # B is lower triangular: B[k,j]=0 for j>k, so min useful K = off_n
+    # Align to BLOCK_K boundaries
+    k_start = (off_n // BLOCK_K) * BLOCK_K  # round off_n down to a BLOCK_K multiple
+    k_end_raw = off_m + BLOCK_M
+    k_end = tl.minimum(k_end_raw, M)  # clamp to the matrix extent
+
+    a_desc = tl.make_tensor_descriptor(
+        base=a_ptr,
+        shape=(M, M),
+        strides=(stride_am, stride_ak),
+        block_shape=(BLOCK_M, BLOCK_K),
+    )
+    b_desc = tl.make_tensor_descriptor(
+        base=b_ptr,
+        shape=(M, M),
+        strides=(stride_bk, stride_bn),
+        block_shape=(BLOCK_K, BLOCK_N),
+    )
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)  # float32 accumulator
+
+    for k in range(k_start, k_end, BLOCK_K):
+        a_tile = a_desc.load([off_m, k])
+        b_tile = b_desc.load([k, off_n])
+        acc += tl.dot(a_tile, b_tile)
+    # Apply tril mask
+    row_idx = off_m + tl.arange(0, BLOCK_M)
+    col_idx = off_n + tl.arange(0, BLOCK_N)
+    tril_mask = row_idx[:, None] >= col_idx[None, :]  # True on or below the diagonal
+    acc = tl.where(tril_mask, acc, 0.0)
+
+    # Store using raw pointers
+    bounds_mask = (row_idx[:, None] < M) & (col_idx[None, :] < M)  # stay inside C
+    c_ptrs = c_ptr + row_idx[:, None] * stride_cm + col_idx[None, :] * stride_cn
+    tl.store(c_ptrs, acc.to(c_ptr.type.element_ty), mask=bounds_mask & tril_mask)
+
+
+class Model(nn.Module):
+    """Product of two square lower-triangular matrices."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, A, B):
+        """Return ``tril(A @ B)``; ``A`` and ``B`` are assumed lower triangular.
+
+        Raises:
+            ValueError: if ``A`` and ``B`` are not square with equal shapes.
+        """
+        M = A.shape[0]
+        if tuple(A.shape) != (M, M) or tuple(B.shape) != (M, M):
+            raise ValueError(f"Expected equal square shapes: {A.shape} vs {B.shape}")
+        # Contiguous layouts give the tensor descriptors a unit inner stride.
+        A, B = A.contiguous(), B.contiguous()
+        C = torch.zeros(M, M, device=A.device, dtype=A.dtype)  # upper part stays 0
+
+        grid = lambda META: (
+            triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(M, META["BLOCK_N"]),
+        )
+        tril_matmul_kernel[grid](A, B, C, M, *A.stride(), *B.stride(), *C.stride())
+        return C
+
+
+M = 4096  # side length of the square inputs
+
+
+def get_inputs():  # two random lower-triangular bfloat16 matrices, (M, M) each
+    A = torch.rand(M, M, dtype=torch.bfloat16)
+    B = torch.rand(M, M, dtype=torch.bfloat16)
+    A = torch.tril(A)  # zero everything above the diagonal
+    B = torch.tril(B)  # zero everything above the diagonal
+    return [A, B]
+
+
+def get_init_inputs():  # presumably the constructor arguments for Model
+    return []  # empty: Model.__init__ takes no arguments
diff --git a/backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py b/backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py
new file mode 100644
index 0000000..1df1ce2
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py
@@ -0,0 +1,132 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def get_autotune_configs():
+    """Build the autotune candidate list for ``_matmul_at_kernel``."""
+    tiling = {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}
+    launch = {"num_warps": 32, "num_stages": 2}
+    return [
+        # A single candidate is listed.
+        triton.Config(tiling, **launch),
+    ]
+
+
+@triton.autotune(
+    configs=get_autotune_configs(),
+    key=["M", "N", "K"],
+)
+@triton.jit
+def _matmul_at_kernel(  # C (M, N) = A.T @ B with A stored as (K, M), B as (K, N)
+    A_ptr,
+    B_ptr,
+    C_ptr,
+    M,
+    N,
+    K,
+    stride_ak: tl.constexpr,
+    stride_am: tl.constexpr,
+    stride_bk: tl.constexpr,
+    stride_bn: tl.constexpr,
+    stride_cm: tl.constexpr,
+    stride_cn: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    pid = tl.program_id(0)  # linear tile id on the 1D launch grid
+
+    num_pid_m = tl.cdiv(M, BLOCK_M)  # number of tile rows
+    num_pid_n = tl.cdiv(N, BLOCK_N)  # number of tile columns
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n  # tile ids covered by one full group
+
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M)  # last may be short
+
+    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    # A is [K, M] — load [BLOCK_K, BLOCK_M] tiles, transpose in register
+    A_desc = tl.make_tensor_descriptor(
+        base=A_ptr,
+        shape=(K, M),
+        strides=(stride_ak, stride_am),
+        block_shape=(BLOCK_K, BLOCK_M),
+    )
+
+    # B is [K, N] — load [BLOCK_K, BLOCK_N] tiles
+    B_desc = tl.make_tensor_descriptor(
+        base=B_ptr,
+        shape=(K, N),
+        strides=(stride_bk, stride_bn),
+        block_shape=(BLOCK_K, BLOCK_N),
+    )
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)  # float32 accumulator
+    for off_k in range(0, K, BLOCK_K):  # walk the shared K dimension tile by tile
+        A_tile = A_desc.load([off_k, pid_m * BLOCK_M])
+        B_tile = B_desc.load([off_k, pid_n * BLOCK_N])
+        acc += tl.dot(A_tile.T, B_tile)  # (BLOCK_M, BLOCK_K) x (BLOCK_K, BLOCK_N)
+    C_desc = tl.make_tensor_descriptor(
+        base=C_ptr,
+        shape=(M, N),
+        strides=(stride_cm, stride_cn),
+        block_shape=(BLOCK_M, BLOCK_N),
+    )
+    C_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(C_ptr.type.element_ty))
+
+
+class Model(nn.Module):
+    """Compute ``A.T @ B`` for ``A`` of shape (K, M) and ``B`` of shape (K, N)."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        """Return the (M, N) product ``A.T @ B`` in ``A``'s dtype.
+
+        ``A`` is passed untransposed; the kernel transposes each loaded tile.
+
+        Raises:
+            ValueError: if ``A`` and ``B`` disagree on the shared K dimension.
+        """
+        K, M = A.shape
+        K2, N = B.shape
+        if K != K2:
+            raise ValueError(f"Incompatible K dimensions: {K} vs {K2}")
+
+        # Contiguous layouts give the tensor descriptors a unit inner stride.
+        A = A.contiguous()
+        B = B.contiguous()
+        C = torch.empty((M, N), device=A.device, dtype=A.dtype)
+
+        def grid(META):
+            return (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
+
+        _matmul_at_kernel[grid](A, B, C, M, N, K, *A.stride(), *B.stride(), *C.stride())
+        return C
+
+
+M = 1024 * 2  # columns of A, i.e. rows of the output
+K = 4096 * 2  # shared row count of A and B (the contracted dimension)
+N = 2048 * 2  # columns of B, i.e. columns of the output
+
+
+def get_inputs():  # random bfloat16 operands; presumably fed to Model.forward
+    A = torch.rand(K, M, dtype=torch.bfloat16)  # stored as (K, M)
+    B = torch.rand(K, N, dtype=torch.bfloat16)  # stored as (K, N)
+    return [A, B]
+
+
+def get_init_inputs():  # presumably the constructor arguments for Model
+    return []  # empty: Model.__init__ takes no arguments
diff --git a/backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py b/backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py
new file mode 100644
index 0000000..c7b9f5f
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py
@@ -0,0 +1,137 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def get_autotune_configs():
+    """Return the autotune candidates for ``_matmul_bt_kernel`` (one entry)."""
+    block_meta = {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}
+    only_config = triton.Config(
+        block_meta,
+        num_warps=32, num_stages=2
+    )
+    return [only_config]
+
+
+@triton.autotune(
+    configs=get_autotune_configs(),
+    key=["M", "N", "K"],
+)
+@triton.jit
+def _matmul_bt_kernel(  # C (M, N) = A @ B.T with A stored as (M, K), B as (N, K)
+    A_ptr,
+    B_ptr,
+    C_ptr,
+    M,
+    N,
+    K,
+    stride_am: tl.constexpr,
+    stride_ak: tl.constexpr,
+    stride_bn: tl.constexpr,
+    stride_bk: tl.constexpr,
+    stride_cm: tl.constexpr,
+    stride_cn: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    pid = tl.program_id(0)  # linear tile id on the 1D launch grid
+
+    num_pid_m = tl.cdiv(M, BLOCK_M)  # number of tile rows
+    num_pid_n = tl.cdiv(N, BLOCK_N)  # number of tile columns
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n  # tile ids covered by one full group
+
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M)  # last may be short
+
+    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    # A is [M, K] — load [BLOCK_M, BLOCK_K] tiles in natural layout
+    a_desc = tl.make_tensor_descriptor(
+        A_ptr,
+        shape=[M, K],
+        strides=[stride_am, stride_ak],
+        block_shape=[BLOCK_M, BLOCK_K],
+    )
+
+    # B is [N, K] — load [BLOCK_N, BLOCK_K] tiles in natural layout, then transpose
+    b_desc = tl.make_tensor_descriptor(
+        B_ptr,
+        shape=[N, K],
+        strides=[stride_bn, stride_bk],
+        block_shape=[BLOCK_N, BLOCK_K],
+    )
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)  # float32 accumulator
+    for off_k in range(0, K, BLOCK_K):  # walk the shared K dimension tile by tile
+        A_tile = a_desc.load([pid_m * BLOCK_M, off_k])
+        B_tile = b_desc.load([pid_n * BLOCK_N, off_k])
+
+        # Transpose B in-register: [BLOCK_N, BLOCK_K] -> [BLOCK_K, BLOCK_N]
+        acc += tl.dot(A_tile, B_tile.T)
+    c_desc = tl.make_tensor_descriptor(
+        C_ptr,
+        shape=[M, N],
+        strides=[stride_cm, stride_cn],
+        block_shape=[BLOCK_M, BLOCK_N],
+    )
+    c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(C_ptr.type.element_ty))
+
+
+class Model(nn.Module):
+    """Compute ``A @ B.T`` for ``A`` of shape (M, K) and ``B`` of shape (N, K)."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        """Return the (M, N) product ``A @ B.T`` in ``A``'s dtype.
+
+        ``B`` is passed untransposed as (N, K); the kernel transposes each
+        loaded tile of it before the dot product.
+
+        Raises:
+            ValueError: if ``A`` and ``B`` disagree on the shared K dimension.
+        """
+        M, K = A.shape
+        N, K2 = B.shape
+        if K != K2:
+            raise ValueError(f"Incompatible K dimensions: {K} vs {K2}")
+
+        # Contiguous layouts give the tensor descriptors a unit inner stride.
+        A = A.contiguous()
+        B = B.contiguous()
+
+        C = torch.empty((M, N), device=A.device, dtype=A.dtype)
+
+        def grid(META):
+            return (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
+
+        _matmul_bt_kernel[grid](A, B, C, M, N, K, *A.stride(), *B.stride(), *C.stride())
+
+        return C
+
+
+M = 1024 * 2  # rows of A, i.e. rows of the output
+K = 4096 * 2  # shared column count of A and B (the contracted dimension)
+N = 2048 * 2  # rows of B, i.e. columns of the output
+
+
+def get_inputs():  # random bfloat16 operands; presumably fed to Model.forward
+    A = torch.rand(M, K, dtype=torch.bfloat16)  # stored as (M, K)
+    B = torch.rand(N, K, dtype=torch.bfloat16)  # stored as (N, K)
+    return [A, B]
+
+
+def get_init_inputs():  # presumably the constructor arguments for Model
+    return []  # empty: Model.__init__ takes no arguments
diff --git a/backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py b/backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py
new file mode 100644
index 0000000..54c94c6
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py
@@ -0,0 +1,133 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def _configs():
+    """Return the autotune candidate list for ``_matmul_tt_kernel``."""
+    configs = []
+    # One fixed tiling is registered.
+    tile_meta = {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}
+    configs.append(triton.Config(tile_meta, num_warps=32, num_stages=2))
+
+    return configs
+
+
+@triton.autotune(configs=_configs(), key=["M", "N", "K"])
+@triton.jit
+def _matmul_tt_kernel(  # C (M, N) = A.T @ B.T with A stored as (K, M), B as (N, K)
+    A_ptr,
+    B_ptr,
+    C_ptr,
+    M,
+    N,
+    K,
+    stride_ak: tl.constexpr,
+    stride_am: tl.constexpr,
+    stride_bn: tl.constexpr,
+    stride_bk: tl.constexpr,
+    stride_cm: tl.constexpr,
+    stride_cn: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    pid = tl.program_id(0)  # linear tile id on the 1D launch grid
+
+    num_pid_m = tl.cdiv(M, BLOCK_M)  # number of tile rows
+    num_pid_n = tl.cdiv(N, BLOCK_N)  # number of tile columns
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n  # tile ids covered by one full group
+
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M)  # last may be short
+
+    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    # A is (K, M): load (BLOCK_K, BLOCK_M), transpose in-register
+    a_desc = tl.make_tensor_descriptor(
+        A_ptr,
+        shape=[K, M],
+        strides=[stride_ak, stride_am],
+        block_shape=[BLOCK_K, BLOCK_M],
+    )
+
+    # B is (N, K): load (BLOCK_N, BLOCK_K), transpose in-register
+    b_desc = tl.make_tensor_descriptor(
+        B_ptr,
+        shape=[N, K],
+        strides=[stride_bn, stride_bk],
+        block_shape=[BLOCK_N, BLOCK_K],
+    )
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)  # float32 accumulator
+    for off_k in range(0, K, BLOCK_K):  # walk the shared K dimension tile by tile
+        a = a_desc.load([off_k, pid_m * BLOCK_M])
+        b = b_desc.load([pid_n * BLOCK_N, off_k])
+
+        acc += tl.dot(a.T, b.T)  # (BLOCK_M, BLOCK_K) x (BLOCK_K, BLOCK_N)
+    c_desc = tl.make_tensor_descriptor(
+        C_ptr,
+        shape=[M, N],
+        strides=[stride_cm, stride_cn],
+        block_shape=[BLOCK_M, BLOCK_N],
+    )
+    c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(C_ptr.type.element_ty))
+
+
+class Model(nn.Module):
+    """Compute ``A.T @ B.T`` for ``A`` of shape (K, M) and ``B`` of shape (N, K)."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        """Return the (M, N) product ``A.T @ B.T`` in ``A``'s dtype.
+
+        Both operands are passed untransposed; the kernel transposes each
+        loaded tile before the dot product.
+
+        Raises:
+            ValueError: if ``A`` and ``B`` disagree on the shared K dimension.
+        """
+        K, M = A.shape
+        N, K2 = B.shape
+        if K != K2:
+            raise ValueError(f"Incompatible K dimensions: {K} vs {K2}")
+
+        # Contiguous layouts give the tensor descriptors a unit inner stride.
+        A = A.contiguous()
+        B = B.contiguous()
+
+        C = torch.empty((M, N), device=A.device, dtype=A.dtype)
+
+        def grid(META):
+            return (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
+
+        _matmul_tt_kernel[grid](A, B, C, M, N, K, *A.stride(), *B.stride(), *C.stride())
+
+        return C
+
+
+M = 1024 * 2  # columns of A, i.e. rows of the output
+K = 4096 * 2  # rows of A and columns of B (the contracted dimension)
+N = 2048 * 2  # rows of B, i.e. columns of the output
+
+
+def get_inputs():  # random bfloat16 operands; presumably fed to Model.forward
+    A = torch.rand(K, M, dtype=torch.bfloat16)  # stored as (K, M)
+    B = torch.rand(N, K, dtype=torch.bfloat16)  # stored as (N, K)
+    return [A, B]
+
+
+def get_init_inputs():  # presumably the constructor arguments for Model
+    return []  # empty: Model.__init__ takes no arguments
diff --git a/backends/triton/cpu/KernelBench/level1/19_ReLU.py b/backends/triton/cpu/KernelBench/level1/19_ReLU.py
new file mode 100644
index 0000000..1eed9d7
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/19_ReLU.py
@@ -0,0 +1,63 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_SIZE": 4096, "NUM_PROGRAMS": 256}, num_warps=8, num_stages=3
+        ),
+    ],
+    key=["n_elements"],
+)
+@triton.jit
+def relu_kernel_persistent(
+    x_ptr,
+    output_ptr,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+    NUM_PROGRAMS: tl.constexpr,
+):
+    """Write ``max(x, 0)`` element-wise; each program loops over several blocks."""
+    first_block = tl.program_id(0)
+    total_blocks = tl.cdiv(n_elements, BLOCK_SIZE)
+    lane = tl.arange(0, BLOCK_SIZE)
+
+    for blk in tl.range(first_block, total_blocks, NUM_PROGRAMS):
+        idx = blk * BLOCK_SIZE + lane
+        in_bounds = idx < n_elements
+        vals = tl.load(x_ptr + idx, mask=in_bounds)
+        tl.store(output_ptr + idx, tl.maximum(vals, 0.0), mask=in_bounds)
+
+
+class Model(nn.Module):
+    """Element-wise ReLU backed by ``relu_kernel_persistent``."""
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return ``max(x, 0)`` with ``x``'s shape and dtype."""
+        x = x.contiguous()  # the kernel indexes the buffer linearly
+        output = torch.empty_like(x)
+        grid = lambda META: (META["NUM_PROGRAMS"],)
+        relu_kernel_persistent[grid](x, output, x.numel())
+        return output
+
+
+batch_size = 4096  # rows of the benchmark input
+dim = 393216  # columns of the benchmark input
+
+
+def get_inputs():  # one random bfloat16 input; presumably fed to Model.forward
+    x = torch.rand(batch_size, dim, dtype=torch.bfloat16)  # shape (batch_size, dim)
+    return [x]
+
+
+def get_init_inputs():  # presumably the constructor arguments for Model
+    return []  # empty: Model.__init__ takes no arguments
diff --git a/backends/triton/cpu/KernelBench/level1/1_Square_matrix_multiplication_.py b/backends/triton/cpu/KernelBench/level1/1_Square_matrix_multiplication_.py
index 1582894..09c940e 100644
--- a/backends/triton/cpu/KernelBench/level1/1_Square_matrix_multiplication_.py
+++ b/backends/triton/cpu/KernelBench/level1/1_Square_matrix_multiplication_.py
@@ -1,5 +1,6 @@
 # ruff: noqa: E731
-# Example Triton CPU kernel
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
 # Status: Experimental / uncurated
 # Expectation: Correctness-first, performance not representative
 
@@ -9,13 +10,17 @@
 import triton.language as tl
 
 
-@triton.autotune(
-    configs=[
-        triton.Config({"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32}),
-        triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64}),
-    ],
-    key=["M", "N", "K"],  # autotune per problem size
-)
+def _configs():
+    """Return the autotune candidate list for ``_matmul_kernel``."""
+    launch_opts = {"num_warps": 32, "num_stages": 2}
+    block_opts = {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}
+
+    # One fixed candidate replaces the previous two-entry search space.
+    single = triton.Config(block_opts, **launch_opts)
+    return [single]
+
+
+@triton.autotune(configs=_configs(), key=["M", "N", "K"])
 @triton.jit
 def _matmul_kernel(
     a_ptr,
@@ -24,70 +29,95 @@ def _matmul_kernel(
     M,
     N,
     K,
+    stride_am,
+    stride_ak: tl.constexpr,
+    stride_bk,
+    stride_bn: tl.constexpr,
+    stride_cm,
+    stride_cn: tl.constexpr,
     BLOCK_M: tl.constexpr,
     BLOCK_N: tl.constexpr,
     BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
 ):
+    pid = tl.program_id(0)
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
     a_desc = tl.make_tensor_descriptor(
-        base=a_ptr, shape=(M, K), strides=(K, 1), block_shape=(BLOCK_M, BLOCK_K)
+        base=a_ptr,
+        shape=(M, K),
+        strides=(stride_am, stride_ak),
+        block_shape=(BLOCK_M, BLOCK_K),
     )
     b_desc = tl.make_tensor_descriptor(
-        base=b_ptr, shape=(K, N), strides=(N, 1), block_shape=(BLOCK_K, BLOCK_N)
-    )
-    c_desc = tl.make_tensor_descriptor(
-        base=c_ptr, shape=(M, N), strides=(N, 1), block_shape=(BLOCK_M, BLOCK_N)
+        base=b_ptr,
+        shape=(K, N),
+        strides=(stride_bk, stride_bn),
+        block_shape=(BLOCK_K, BLOCK_N),
     )
 
-    m = tl.program_id(0) * BLOCK_M
-    n = tl.program_id(1) * BLOCK_N
     acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-    for k in range(0, K, BLOCK_K):
-        a = a_desc.load((m, k))
-        b = b_desc.load((k, n))
-        acc = tl.dot(a, b, acc)
+    for off_k in range(0, K, BLOCK_K):
+        a_tile = a_desc.load([pid_m * BLOCK_M, off_k])
+        b_tile = b_desc.load([off_k, pid_n * BLOCK_N])
+        acc += tl.dot(a_tile, b_tile)
+    c_desc = tl.make_tensor_descriptor(
+        base=c_ptr,
+        shape=(M, N),
+        strides=(stride_cm, stride_cn),
+        block_shape=(BLOCK_M, BLOCK_N),
+    )
+    c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty))
 
-    c_desc.store((m, n), acc)
 
+class Model(nn.Module):
+    """Dense 2D matrix product ``A @ B`` backed by ``_matmul_kernel``."""
+
+    def __init__(self):
+        super().__init__()
 
-def _kernel_function_cpu(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
-    assert isinstance(A, torch.Tensor) and isinstance(B, torch.Tensor)
-    assert A.device.type == "cpu" and B.device.type == "cpu", "A and B must be on CPU"
-    assert A.is_floating_point() and B.is_floating_point(), (
-        "A and B must be floating point tensors"
-    )
-    assert A.dtype == B.dtype, f"dtype mismatch: {A.dtype} vs {B.dtype}"
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        """Return the (M, N) product of ``A`` (M, K) and ``B`` (K, N).
+
+        The result is allocated with ``A``'s dtype and device.
+
+        Raises:
+            ValueError: if the inner dimensions of ``A`` and ``B`` differ.
+        """
+        M, K = A.shape
+        K2, N = B.shape
+        if K != K2:
+            raise ValueError(f"Incompatible K dimensions: {K} vs {K2}")
 
-    orig_dtype = A.dtype
+        # Contiguous layouts give the tensor descriptors a unit inner stride.
+        A = A.contiguous()
+        B = B.contiguous()
 
-    M, K = A.shape
-    K2, N = B.shape
-    assert K == K2, f"Incompatible K dimensions: {K} vs {K2}"
+        # The kernel casts its float32 accumulator to this buffer's dtype.
+        C = torch.empty((M, N), device=A.device, dtype=A.dtype)
 
-    C32 = torch.empty((M, N), device=A.device, dtype=torch.float32)
+        def grid(META):
+            return (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
 
-    # Autotuned grid: depends on BLOCK_M/BLOCK_N chosen by config
-    grid = lambda META: (
-        triton.cdiv(M, META["BLOCK_M"]),
-        triton.cdiv(N, META["BLOCK_N"]),
-    )
+        _matmul_kernel[grid](A, B, C, M, N, K, *A.stride(), *B.stride(), *C.stride())
+        return C
 
-    _matmul_kernel[grid](
-        A,
-        B,
-        C32,
-        M,
-        N,
-        K,
-    )
 
-    return C32.to(orig_dtype)
+N = 2048 * 2  # side length of the square operands
 
 
-class Model(nn.Module):
-    """KernelBench-compatible wrapper"""
+def get_inputs():  # random bfloat16 operands; presumably fed to Model.forward
+    A = torch.rand(N, N, dtype=torch.bfloat16)  # square (N, N) left operand
+    B = torch.rand(N, N, dtype=torch.bfloat16)  # square (N, N) right operand
+    return [A, B]
 
-    def __init__(self, *args, **kwargs):
-        super(Model, self).__init__()
 
-    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
-        return _kernel_function_cpu(A, B)
+def get_init_inputs():  # presumably the constructor arguments for Model
+    return []  # empty: Model.__init__ takes no arguments
diff --git a/backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py b/backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py
new file mode 100644
index 0000000..8002e4f
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py
@@ -0,0 +1,69 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+    ],
+    key=["n_elements"],
+)
+@triton.jit
+def leaky_relu_kernel(
+    x_ptr,
+    output_ptr,
+    neg_slope_ptr,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+
+    neg_slope = tl.load(neg_slope_ptr)
+
+    x = tl.load(x_ptr + offsets, mask=mask)
+    out = tl.where(x >= 0, x, x * neg_slope)
+    tl.store(output_ptr + offsets, out, mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(self, negative_slope: float = 0.01):
+        super(Model, self).__init__()
+        self.negative_slope = negative_slope
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        output = torch.empty_like(x)
+        n_elements = x.numel()
+        neg_slope_t = torch.tensor(
+            [self.negative_slope], dtype=x.dtype, device=x.device
+        )
+        grid = lambda META: (triton.cdiv(n_elements, META["BLOCK_SIZE"]),)
+        leaky_relu_kernel[grid](
+            x,
+            output,
+            neg_slope_t,
+            n_elements,
+        )
+        return output
+
+
+batch_size = 4096
+dim = 393216
+
+
+def get_inputs():
+    x = torch.rand(batch_size, dim, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs():
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/21_Sigmoid.py b/backends/triton/cpu/KernelBench/level1/21_Sigmoid.py
new file mode 100644
index 0000000..7240e65
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/21_Sigmoid.py
@@ -0,0 +1,64 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+    ],
+    key=["N"],
+)
+@triton.jit
+def _sigmoid_kernel(x_ptr, out_ptr, N, BLOCK_SIZE: tl.constexpr):
+    """Elementwise sigmoid over a flat buffer of ``N`` elements.
+
+    Each program handles one ``BLOCK_SIZE`` tile. Math is done in float32 and
+    the result is cast to the element type of ``out_ptr`` on store.
+    """
+    pid = tl.program_id(0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < N
+
+    x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+    # exp(-x) == exp2(-x * log2(e)); the constant below is log2(e) == 1 / ln(2).
+    inv_ln2 = 1.4426950408889634
+    e = tl.math.exp2((-x) * inv_ln2)
+    y = 1.0 / (1.0 + e)
+    # Cast to the output buffer's dtype instead of hard-coding bfloat16.
+    tl.store(out_ptr + offsets, y.to(out_ptr.type.element_ty), mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x_flat = x.contiguous().view(-1)
+        N = x_flat.numel()
+        out_flat = torch.empty_like(x_flat)
+
+        grid = lambda META: (triton.cdiv(N, META["BLOCK_SIZE"]),)
+        _sigmoid_kernel[grid](x_flat, out_flat, N)
+
+        return out_flat.view_as(x)
+
+
+batch_size = 4096
+dim = 393216
+
+
+def get_inputs():
+    x = torch.rand(batch_size, dim, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs():
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/22_Tanh.py b/backends/triton/cpu/KernelBench/level1/22_Tanh.py
new file mode 100644
index 0000000..f5f6074
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/22_Tanh.py
@@ -0,0 +1,67 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+batch_size = 4096
+dim = 393216
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+    ],
+    key=["n_elements"],
+)
+@triton.jit
+def _tanh_kernel(x_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
+    """Elementwise tanh over a flat buffer of ``n_elements`` values.
+
+    One program processes one ``BLOCK_SIZE`` tile. The computation runs in
+    float32 and the result is cast to the element type of ``out_ptr``.
+    """
+    pid = tl.program_id(0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+
+    x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+
+    # tanh(x) = 2*sigmoid(2x) - 1
+    # sigmoid(z) = 1/(1 + exp2(-z * log2(e)))
+    inv_ln2: tl.constexpr = 1.4426950408889634
+    z = 2.0 * x
+    e = tl.math.exp2((-z) * inv_ln2)
+    sig = 1.0 / (1.0 + e)
+    result = 2.0 * sig - 1.0
+    # Store in the output buffer's own dtype rather than a fixed bfloat16.
+    tl.store(out_ptr + offsets, result.to(out_ptr.type.element_ty), mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x_flat = x.contiguous().view(-1)
+        n_elements = x_flat.numel()
+        output_flat = torch.empty_like(x_flat)
+
+        grid = lambda META: (triton.cdiv(n_elements, META["BLOCK_SIZE"]),)
+        _tanh_kernel[grid](x_flat, output_flat, n_elements)
+
+        return output_flat.view(x.shape)
+
+
+def get_inputs():
+    x = torch.rand(batch_size, dim, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs():
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/23_Softmax.py b/backends/triton/cpu/KernelBench/level1/23_Softmax.py
new file mode 100644
index 0000000..5e7ba1b
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/23_Softmax.py
@@ -0,0 +1,101 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def _softmax_configs():
+    return [
+        triton.Config({"BLOCK_N": 2048}, num_warps=8, num_stages=3),
+    ]
+
+
+@triton.autotune(configs=_softmax_configs(), key=["N"])
+@triton.jit
+def _softmax_kernel(
+    inp_ptr,
+    out_ptr,
+    M,
+    N,
+    stride_im,
+    stride_in,
+    stride_om,
+    stride_on,
+    BLOCK_N: tl.constexpr,
+):
+    """Row-wise softmax of an (M, N) matrix; one program handles one row.
+
+    Math is done in float32; results are stored in ``out_ptr``'s element type.
+    """
+    pid_m = tl.program_id(0).to(tl.int64)  # 64-bit so row * stride cannot wrap
+    row_inp = inp_ptr + pid_m * stride_im
+    row_out = out_ptr + pid_m * stride_om
+    LOG2E: tl.constexpr = 1.4426950408889634
+
+    # Pass 1: Online max + sum_exp
+    row_max = -float("inf")
+    row_sum = 0.0
+    for start in range(0, N, BLOCK_N):
+        offs = start + tl.arange(0, BLOCK_N)
+        mask = offs < N
+        raw = tl.load(row_inp + offs * stride_in, mask=mask, other=-float("inf"))
+        x = raw.to(tl.float32)
+        block_max = tl.max(x, axis=0)
+        new_max = tl.maximum(row_max, block_max)
+        row_sum = row_sum * tl.math.exp2((row_max - new_max) * LOG2E) + tl.sum(
+            tl.math.exp2((x - new_max) * LOG2E), axis=0
+        )
+        row_max = new_max
+
+    inv_sum = 1.0 / row_sum
+    # Pass 2: normalize and store
+    for start in range(0, N, BLOCK_N):
+        offs = start + tl.arange(0, BLOCK_N)
+        mask = offs < N
+        raw = tl.load(row_inp + offs * stride_in, mask=mask, other=-float("inf"))
+        x = raw.to(tl.float32)
+        e = tl.math.exp2((x - row_max) * LOG2E)
+        y = (e * inv_sum).to(out_ptr.type.element_ty)
+        tl.store(row_out + offs * stride_on, y, mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x.contiguous()
+        M, N = x.shape
+        out = torch.empty_like(x)
+
+        grid = (M,)
+        _softmax_kernel[grid](
+            x,
+            out,
+            M,
+            N,
+            x.stride(0),
+            x.stride(1),
+            out.stride(0),
+            out.stride(1),
+        )
+        return out
+
+
+batch_size = 4096
+dim = 393216
+
+
+def get_inputs():
+    x = torch.rand(batch_size, dim, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs():
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py b/backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py
new file mode 100644
index 0000000..c5e5f35
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py
@@ -0,0 +1,94 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_N": 2048, "warp_size": 32}, num_warps=4, num_stages=2),
+    ],
+    key=["N"],
+)
+@triton.jit
+def _logsoftmax_kernel(
+    inp_ptr,
+    out_ptr,
+    M,
+    N,
+    stride_im,
+    stride_om,
+    BLOCK_N: tl.constexpr,
+    warp_size: tl.constexpr,
+):
+    """Row-wise log-softmax of an (M, N) matrix with unit column stride.
+
+    One program per row; math in float32, stored in ``out_ptr``'s element type.
+    """
+    pid_m = tl.program_id(0)
+    row_inp = inp_ptr + pid_m.to(tl.int64) * stride_im
+    row_out = out_ptr + pid_m.to(tl.int64) * stride_om
+    LOG2E = 1.4426950408889634
+    LN2 = 0.6931471805599453
+    m = -float("inf")
+    s = 0.0
+    for start in range(0, N, BLOCK_N):
+        offs = start + tl.arange(0, BLOCK_N)
+        mask = offs < N
+        x = tl.load(row_inp + offs, mask=mask, other=-float("inf")).to(tl.float32)
+        block_max = tl.max(x, axis=0)
+        m_new = tl.maximum(m, block_max)
+        s = s * tl.math.exp2((m - m_new) * LOG2E) + tl.sum(
+            tl.math.exp2((x - m_new) * LOG2E), axis=0
+        )
+        m = m_new
+
+    log_s = tl.math.log2(s) * LN2
+    for start in range(0, N, BLOCK_N):
+        offs = start + tl.arange(0, BLOCK_N)
+        mask = offs < N
+        x = tl.load(row_inp + offs, mask=mask, other=-float("inf")).to(tl.float32)
+        y = x - m - log_s
+        tl.store(row_out + offs, y.to(out_ptr.type.element_ty), mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(self, dim: int = 1):
+        super(Model, self).__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return ``log_softmax(x, dim=self.dim)``.
+
+        The result has the shape and dtype of ``x``.
+        """
+        # The kernel only reduces over the last axis of a contiguous 2D tensor,
+        # so move ``self.dim`` there first instead of silently ignoring it.
+        rows = x.movedim(self.dim, -1).contiguous()
+        x2d = rows.view(-1, rows.shape[-1])
+        M, N = x2d.shape
+        out = torch.empty_like(x2d)
+
+        _logsoftmax_kernel[(M,)](x2d, out, M, N, x2d.stride(0), out.stride(0))
+
+        # Restore the original axis order (a no-op when ``dim`` is the last axis).
+        return out.view(rows.shape).movedim(-1, self.dim)
+
+
+batch_size = 4096
+dim = 393216
+
+
+def get_inputs():
+    x = torch.rand(batch_size, dim, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs():
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/25_Swish.py b/backends/triton/cpu/KernelBench/level1/25_Swish.py
new file mode 100644
index 0000000..0a682dd
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/25_Swish.py
@@ -0,0 +1,65 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _sigmoid_exp2(x):
+    inv_ln2 = 1.4426950408889634
+    e = tl.math.exp2((-x) * inv_ln2)
+    return 1.0 / (1.0 + e)
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+    ],
+    key=["n_elements"],
+)
+@triton.jit
+def swish_kernel(x_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
+    """Elementwise Swish, ``x * sigmoid(x)``, over ``n_elements`` flat values.
+
+    Each program handles one ``BLOCK_SIZE`` tile; math is done in float32 and
+    the result is written back in the dtype the input was loaded with.
+    """
+    pid = tl.program_id(0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+    x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
+    x_f32 = x.to(tl.float32)
+    result = x_f32 * _sigmoid_exp2(x_f32)
+    # Cast back to the input dtype instead of a hard-coded bfloat16.
+    tl.store(output_ptr + offsets, result.to(x.dtype), mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        output = torch.empty_like(x)
+        n_elements = x.numel()
+        grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
+        swish_kernel[grid](x, output, n_elements)
+        return output
+
+
+batch_size = 4096
+dim = 393216
+
+
+def get_inputs():
+    x = torch.rand(batch_size, dim, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs():
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/26_GELU_.py b/backends/triton/cpu/KernelBench/level1/26_GELU_.py
new file mode 100644
index 0000000..0d52473
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/26_GELU_.py
@@ -0,0 +1,64 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_SIZE": 4096, "NUM_PROGS": 160}, num_warps=4, num_stages=2
+        ),
+    ],
+    key=["n_elements"],
+)
+@triton.jit
+def gelu_persistent_kernel(
+    x_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr, NUM_PROGS: tl.constexpr
+):
+    """Exact (erf-based) GELU over a flat buffer of ``n_elements`` values.
+
+    Each program walks tiles ``pid``, ``pid + NUM_PROGS``, ... so the kernel
+    expects to be launched with exactly ``NUM_PROGS`` programs.
+    """
+    pid = tl.program_id(0)
+    num_tiles = tl.cdiv(n_elements, BLOCK_SIZE)
+    for tile_id in range(pid, num_tiles, NUM_PROGS):
+        offsets = tile_id * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+        mask = offsets < n_elements
+        x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+        # 0.7071... is 1/sqrt(2): gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))).
+        out = 0.5 * x * (1.0 + tl.math.erf(x * 0.70710678118654752440))
+        tl.store(out_ptr + offsets, out.to(out_ptr.type.element_ty), mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply GELU elementwise; the result has the shape and dtype of ``x``."""
+        x_flat = x.contiguous().view(-1)  # view() alone raises on non-contiguous x
+        out_flat = torch.empty_like(x_flat)
+        grid = lambda META: (META["NUM_PROGS"],)
+        gelu_persistent_kernel[grid](x_flat, out_flat, x_flat.numel())
+        return out_flat.view_as(x)
+
+
+batch_size = 4096
+dim = 393216
+
+
+def get_inputs():
+    x = torch.rand(batch_size, dim, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs():
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/27_SELU_.py b/backends/triton/cpu/KernelBench/level1/27_SELU_.py
new file mode 100644
index 0000000..1d8a3a7
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/27_SELU_.py
@@ -0,0 +1,64 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=3),
+    ],
+    key=["n_elements"],
+)
+@triton.jit
+def selu_kernel(x_ptr, out_ptr, n_elements, alpha, scale, BLOCK_SIZE: tl.constexpr):
+    """Elementwise SELU over a flat buffer of ``n_elements`` values.
+
+    Computes ``scale * x`` for ``x > 0`` and ``scale * alpha * (exp(x) - 1)``
+    otherwise, in float32, and stores the result in the input's dtype.
+    """
+    pid = tl.program_id(0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+
+    x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
+    x_f32 = x.to(tl.float32)
+
+    positive = scale * x_f32
+    negative = scale * alpha * (tl.exp(x_f32) - 1.0)
+    result = tl.where(x_f32 > 0.0, positive, negative)
+    # Cast back to the dtype that was loaded instead of a fixed bfloat16.
+    tl.store(out_ptr + offsets, result.to(x.dtype), mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+        self.alpha = 1.6732632423543772848170429916717
+        self.scale = 1.0507009873554804934193349852946
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        n_elements = x.numel()
+        output = torch.empty_like(x)
+        grid = lambda META: (triton.cdiv(n_elements, META["BLOCK_SIZE"]),)
+        selu_kernel[grid](x, output, n_elements, self.alpha, self.scale)
+        return output
+
+
+batch_size = 4096
+dim = 393216
+
+
+def get_inputs():
+    x = torch.rand(batch_size, dim, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs():
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py b/backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py
new file mode 100644
index 0000000..6bde4bb
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py
@@ -0,0 +1,62 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+    ],
+    key=["N"],
+)
+@triton.jit
+def hardsigmoid_kernel(
+    x_ptr,
+    out_ptr,
+    N,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < N
+
+    x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
+    x_f32 = x.to(tl.float32)
+
+    result = x_f32 * (1.0 / 6.0) + 0.5
+    result = tl.maximum(result, 0.0)
+    result = tl.minimum(result, 1.0)
+
+    tl.store(out_ptr + offsets, result.to(x.dtype), mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        N = x.numel()
+        grid = lambda META: (triton.cdiv(N, META["BLOCK_SIZE"]),)
+        hardsigmoid_kernel[grid](x, out, N)
+        return out
+
+
+batch_size = 4096
+dim = 393216
+
+
+def get_inputs():
+    x = torch.rand(batch_size, dim, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs():
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/29_Softplus.py b/backends/triton/cpu/KernelBench/level1/29_Softplus.py
new file mode 100644
index 0000000..d561bab
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/29_Softplus.py
@@ -0,0 +1,61 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+    ],
+    key=["n_elements"],
+)
+@triton.jit
+def softplus_kernel(x_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
+    """Elementwise softplus, ``log(1 + exp(x))``, over ``n_elements`` values.
+
+    Inputs above 20 are passed through unchanged. Math is done in float32 and
+    the result is cast to the element type of ``output_ptr`` on store.
+    """
+    pid = tl.program_id(0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+
+    x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+
+    # softplus(x) = log(1 + exp(x)), with threshold for numerical stability
+    THRESHOLD: tl.constexpr = 20.0
+    result = tl.where(x > THRESHOLD, x, tl.math.log(1.0 + tl.exp(x)))
+    # Use the output buffer's dtype rather than hard-coding bfloat16.
+    tl.store(output_ptr + offsets, result.to(output_ptr.type.element_ty), mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        output = torch.empty_like(x)
+        n_elements = x.numel()
+        grid = lambda META: (triton.cdiv(n_elements, META["BLOCK_SIZE"]),)
+        softplus_kernel[grid](x, output, n_elements)
+        return output
+
+
+batch_size = 4096
+dim = 393216
+
+
+def get_inputs():
+    x = torch.rand(batch_size, dim, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs():
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py b/backends/triton/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py
new file mode 100644
index 0000000..761dc6e
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py
@@ -0,0 +1,124 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def _configs():
+    return [
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+    ]
+
+
+@triton.autotune(configs=_configs(), key=["M", "N", "K"])
+@triton.jit
+def _matmul_kernel(
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    M,
+    N,
+    K,
+    stride_am: tl.constexpr,
+    stride_ak: tl.constexpr,
+    stride_bk: tl.constexpr,
+    stride_bn: tl.constexpr,
+    stride_cm: tl.constexpr,
+    stride_cn: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    pid = tl.program_id(0)
+
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    a_desc = tl.make_tensor_descriptor(
+        base=a_ptr,
+        shape=(M, K),
+        strides=(stride_am, stride_ak),
+        block_shape=(BLOCK_M, BLOCK_K),
+    )
+    b_desc = tl.make_tensor_descriptor(
+        base=b_ptr,
+        shape=(K, N),
+        strides=(stride_bk, stride_bn),
+        block_shape=(BLOCK_K, BLOCK_N),
+    )
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for off_k in range(0, K, BLOCK_K):
+        a_tile = a_desc.load([pid_m * BLOCK_M, off_k])
+        b_tile = b_desc.load([off_k, pid_n * BLOCK_N])
+        acc += tl.dot(a_tile, b_tile)
+    c_desc = tl.make_tensor_descriptor(
+        base=c_ptr,
+        shape=(M, N),
+        strides=(stride_cm, stride_cn),
+        block_shape=(BLOCK_M, BLOCK_N),
+    )
+    c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty))
+
+
+class Model(nn.Module):
+    """Matrix product ``A @ B`` computed in bfloat16 by ``_matmul_kernel``."""
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        """Multiply ``A`` (M, K) by ``B`` (K, N) and return a bfloat16 (M, N) tensor.
+
+        Both operands are cast to bfloat16 before the kernel runs.
+
+        Raises:
+            ValueError: if the inner dimensions of ``A`` and ``B`` differ.
+        """
+        a = A.to(torch.bfloat16).contiguous()
+        b = B.to(torch.bfloat16).contiguous()
+        M, K = a.shape
+        K_b, N = b.shape
+        if K != K_b:
+            # The kernel sizes B's descriptor from A's K, so a mismatch could
+            # address rows outside B's storage instead of failing.
+            raise ValueError(f"Incompatible inner dimensions: {K} vs {K_b}")
+        c = torch.empty((M, N), device=A.device, dtype=torch.bfloat16)
+
+        grid = lambda META: (
+            triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),
+        )
+        _matmul_kernel[grid](a, b, c, M, N, K, *a.stride(), *b.stride(), *c.stride())
+        return c
+
+
+M = 1024 * 2
+K = 4096 * 2
+N = 2048 * 2
+
+
+def get_inputs():
+    A = torch.rand(M, K, dtype=torch.bfloat16)
+    B = torch.rand(K, N, dtype=torch.bfloat16)
+    return [A, B]
+
+
+def get_init_inputs():
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/30_Softsign.py b/backends/triton/cpu/KernelBench/level1/30_Softsign.py
new file mode 100644
index 0000000..6290f9a
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/30_Softsign.py
@@ -0,0 +1,58 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+    ],
+    key=["n_elements"],
+)
+@triton.jit
+def softsign_kernel(
+    x_ptr,
+    output_ptr,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+    x = tl.load(x_ptr + offsets, mask=mask)
+    x_f32 = x.to(tl.float32)
+    abs_x = tl.abs(x_f32)
+    result = x_f32 / (1.0 + abs_x)
+    tl.store(output_ptr + offsets, result.to(x.dtype), mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        output = torch.empty_like(x)
+        n_elements = x.numel()
+        grid = lambda META: (triton.cdiv(n_elements, META["BLOCK_SIZE"]),)
+        softsign_kernel[grid](x, output, n_elements)
+        return output
+
+
+batch_size = 4096
+dim = 393216
+
+
+def get_inputs():
+    x = torch.rand(batch_size, dim, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs():
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/31_ELU.py b/backends/triton/cpu/KernelBench/level1/31_ELU.py
new file mode 100644
index 0000000..c9e0a26
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/31_ELU.py
@@ -0,0 +1,69 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+    ],
+    key=["n_elements"],
+)
+@triton.jit
+def elu_kernel(
+    x_ptr,
+    out_ptr,
+    alpha,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+
+    x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
+    x_f32 = x.to(tl.float32)
+
+    inv_ln2: tl.constexpr = 1.4426950408889634
+    exp_x = tl.math.exp2(x_f32 * inv_ln2)
+    neg_branch = alpha * (exp_x - 1.0)
+
+    result = tl.where(x_f32 > 0.0, x_f32, neg_branch)
+
+    tl.store(out_ptr + offsets, result.to(x.dtype), mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(self, alpha=1.0):
+        super(Model, self).__init__()
+        try:
+            self.alpha = float(alpha)
+        except (ValueError, TypeError):
+            self.alpha = 1.0
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        n_elements = x.numel()
+        grid = lambda META: (triton.cdiv(n_elements, META["BLOCK_SIZE"]),)
+        elu_kernel[grid](x, out, self.alpha, n_elements)
+        return out
+
+
+batch_size = 4096
+dim = 393216
+
+
+def get_inputs():
+    x = torch.rand(batch_size, dim, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs():
+    return [1.0]
diff --git a/backends/triton/cpu/KernelBench/level1/32_HardTanh.py b/backends/triton/cpu/KernelBench/level1/32_HardTanh.py
new file mode 100644
index 0000000..c5fa253
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/32_HardTanh.py
@@ -0,0 +1,59 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+    ],
+    key=["n_elements"],
+)
+@triton.jit
+def hardtanh_kernel(
+    x_ptr,
+    out_ptr,
+    n_elements,
+    min_val: tl.constexpr,
+    max_val: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+    x = tl.load(x_ptr + offsets, mask=mask)
+    x = tl.maximum(x, min_val)
+    x = tl.minimum(x, max_val)
+    tl.store(out_ptr + offsets, x, mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        n_elements = x.numel()
+        grid = lambda META: (triton.cdiv(n_elements, META["BLOCK_SIZE"]),)
+        hardtanh_kernel[grid](x, out, n_elements, -1.0, 1.0)
+        return out
+
+
+batch_size = 4096
+dim = 393216
+
+
+def get_inputs():
+    x = torch.rand(batch_size, dim, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs():
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/33_BatchNorm.py b/backends/triton/cpu/KernelBench/level1/33_BatchNorm.py
new file mode 100644
index 0000000..4b2eab4
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/33_BatchNorm.py
@@ -0,0 +1,202 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _bn_reduce_kernel(
+    x_ptr,
+    partial_sum_ptr,
+    partial_sq_ptr,
+    C,
+    HW,
+    stride_b,
+    stride_c,
+    B: tl.constexpr,
+    BLOCK_HW: tl.constexpr,
+):
+    pid_c = tl.program_id(0)
+    pid_b = tl.program_id(1)
+    base = pid_b.to(tl.int64) * stride_b + pid_c.to(tl.int64) * stride_c
+    acc_sum = 0.0
+    acc_sq = 0.0
+    for hw_start in range(0, HW, BLOCK_HW):
+        offs = hw_start + tl.arange(0, BLOCK_HW)
+        mask = offs < HW
+        x = tl.load(x_ptr + base + offs, mask=mask, other=0.0).to(tl.float32)
+        acc_sum += tl.sum(x, axis=0)
+        acc_sq += tl.sum(x * x, axis=0)
+    out_idx = pid_c * B + pid_b
+    tl.store(partial_sum_ptr + out_idx, acc_sum)
+    tl.store(partial_sq_ptr + out_idx, acc_sq)
+
+
+@triton.jit
+def _bn_stats_kernel(
+    partial_sum_ptr,
+    partial_sq_ptr,
+    scale_ptr,
+    shift_ptr,
+    weight_ptr,
+    bias_ptr,
+    inv_count,
+    eps,
+    B: tl.constexpr,
+    BLOCK_B: tl.constexpr,
+):
+    """Fold per-batch partial sums into per-channel ``scale`` / ``shift`` values."""
+    pid_c = tl.program_id(0)
+    offs_b = tl.arange(0, BLOCK_B)
+    mask = offs_b < B
+    s = tl.load(partial_sum_ptr + pid_c * B + offs_b, mask=mask, other=0.0)
+    sq = tl.load(partial_sq_ptr + pid_c * B + offs_b, mask=mask, other=0.0)
+    mean_val = tl.sum(s, axis=0) * inv_count
+    mean_sq = tl.sum(sq, axis=0) * inv_count
+    # E[x^2] - mean^2 can round slightly below zero; clamp so the sqrt below
+    # never sees a negative argument.
+    var_val = tl.maximum(mean_sq - mean_val * mean_val, 0.0)
+    inv_std = 1.0 / tl.sqrt(var_val + eps)
+    scale_val = tl.load(weight_ptr + pid_c) * inv_std
+    shift_val = tl.load(bias_ptr + pid_c) - mean_val * scale_val
+    tl.store(scale_ptr + pid_c, scale_val)
+    tl.store(shift_ptr + pid_c, shift_val)
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 4096, "warp_size": 32}, num_warps=8),
+    ],
+    key=["total_elements"],
+)
+@triton.jit
+def _bn_normalize_flat_kernel(
+    x_ptr,
+    out_ptr,
+    scale_ptr,
+    shift_ptr,
+    C,
+    HW,
+    total_elements,
+    BLOCK_SIZE: tl.constexpr,
+    warp_size: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    start = pid * BLOCK_SIZE
+    offs = start + tl.arange(0, BLOCK_SIZE)
+    mask = offs < total_elements
+    c = (offs // HW) % C
+    x = tl.load(x_ptr + offs, mask=mask, other=0.0).to(tl.float32)
+    scale = tl.load(scale_ptr + c, mask=mask, other=0.0)
+    shift = tl.load(shift_ptr + c, mask=mask, other=0.0)
+    y = x * scale + shift
+    tl.store(out_ptr + offs, y.to(tl.float32), mask=mask)
+
+
+class Model(nn.Module):
+    """Batch normalization of a (B, C, H, W) tensor; the result is float32.
+
+    Training uses batch statistics, eval uses the running buffers (never updated).
+    """
+
+    def __init__(self, num_features: int):
+        super().__init__()
+        self.num_features = num_features
+        self.weight = nn.Parameter(torch.ones(num_features))
+        self.bias = nn.Parameter(torch.zeros(num_features))
+        self.register_buffer("running_mean", torch.zeros(num_features))
+        self.register_buffer("running_var", torch.ones(num_features))
+        self.eps = 1e-5
+        self.momentum = 0.1
+        self._moved = False
+        self._bufs_ready = False
+
+    def _move_params(self, device):
+        self.weight.data = self.weight.data.to(device, dtype=torch.float32).contiguous()
+        self.bias.data = self.bias.data.to(device, dtype=torch.float32).contiguous()
+        self.running_mean = self.running_mean.to(
+            device, dtype=torch.float32
+        ).contiguous()
+        self.running_var = self.running_var.to(device, dtype=torch.float32).contiguous()
+        self._moved = True
+
+    def _alloc_bufs(self, B, C, device):
+        # Stale (C, B) scratch buffers would let the reduce kernel write out of
+        # bounds, so only reuse them when batch size, channels and device match.
+        if self._bufs_ready and self._buf_key == (B, C, device):
+            return
+        self._partial_sum = torch.empty((C, B), device=device, dtype=torch.float32)
+        self._partial_sq = torch.empty((C, B), device=device, dtype=torch.float32)
+        self._scale = torch.empty(C, device=device, dtype=torch.float32)
+        self._shift = torch.empty(C, device=device, dtype=torch.float32)
+        self._buf_key = (B, C, device)
+        self._bufs_ready = True
+
+    def forward(self, x):
+        if not self._moved:
+            self._move_params(x.device)
+        # Every kernel upcasts on load, so a float32 copy of ``x`` is unnecessary.
+        x = x.contiguous()
+        B, C, H, W = x.shape
+        HW = H * W
+        total_elements = B * C * HW
+        self._alloc_bufs(B, C, x.device)
+
+        if self.training:
+            _bn_reduce_kernel[(C, B)](
+                x,
+                self._partial_sum,
+                self._partial_sq,
+                C,
+                HW,
+                x.stride(0),
+                x.stride(1),
+                B=B,
+                BLOCK_HW=8192,
+                num_warps=8,
+            )
+            _bn_stats_kernel[(C,)](
+                self._partial_sum,
+                self._partial_sq,
+                self._scale,
+                self._shift,
+                self.weight,
+                self.bias,
+                1.0 / (B * HW),
+                self.eps,
+                B=B,
+                BLOCK_B=triton.next_power_of_2(B),
+                num_warps=4,
+            )
+        else:
+            inv_std = 1.0 / torch.sqrt(self.running_var + self.eps)
+            self._scale.copy_(self.weight * inv_std)
+            self._shift.copy_(self.bias - self.running_mean * self._scale)
+
+        out = torch.empty_like(x, dtype=torch.float32)
+        grid = lambda META: (triton.cdiv(total_elements, META["BLOCK_SIZE"]),)
+        _bn_normalize_flat_kernel[grid](
+            x, out, self._scale, self._shift, C, HW, total_elements
+        )
+        return out
+
+
+batch_size = 64
+features = 64
+dim1 = 512
+dim2 = 512
+
+
+def get_inputs():
+    x = torch.rand(batch_size, features, dim1, dim2, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs():
+    return [features]
diff --git a/backends/triton/cpu/KernelBench/level1/34_InstanceNorm.py b/backends/triton/cpu/KernelBench/level1/34_InstanceNorm.py
new file mode 100644
index 0000000..4149704
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/34_InstanceNorm.py
@@ -0,0 +1,87 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 2048, "warp_size": 32}, num_warps=8, num_stages=2),
+    ],
+    key=["N"],
+)
+@triton.jit
+def instance_norm_kernel(
+    x_ptr,
+    out_ptr,
+    N,
+    eps,
+    BLOCK_SIZE: tl.constexpr,
+    warp_size: tl.constexpr,
+):
+    """Normalize one length-N row per program to zero mean and unit variance."""
+    base = tl.program_id(0).to(tl.int64) * N  # 64-bit offset of this row
+    # Pass 1: per-lane fp32 partial sums of x and x*x over the row.
+    sum_acc = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
+    sq_acc = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
+    for start in range(0, N, BLOCK_SIZE):
+        offs = start + tl.arange(0, BLOCK_SIZE)
+        mask = offs < N
+        x = tl.load(x_ptr + base + offs, mask=mask, other=0.0).to(tl.float32)
+        sum_acc += x
+        sq_acc += x * x
+    mean = tl.sum(sum_acc, axis=0) / N
+    # E[x^2] - mean^2 can round to a small negative value; clamp it so that
+    # sqrt(var + eps) never sees a negative argument and returns NaN.
+    var = tl.maximum(tl.sum(sq_acc, axis=0) / N - mean * mean, 0.0)
+    inv_std = 1.0 / tl.sqrt(var + eps)
+
+    # Pass 2: re-read the row, normalize, and store as bfloat16.
+    for start in range(0, N, BLOCK_SIZE):
+        offs = start + tl.arange(0, BLOCK_SIZE)
+        mask = offs < N
+        x = tl.load(x_ptr + base + offs, mask=mask, other=0.0).to(tl.float32)
+        out = (x - mean) * inv_std
+        tl.store(out_ptr + base + offs, out.to(tl.bfloat16), mask=mask)
+
+
+class Model(nn.Module):
+    """Instance normalization without affine parameters for 4-D inputs."""
+
+    def __init__(self, num_features: int):
+        super().__init__()
+        self.num_features = num_features
+        self.eps = 1e-5
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Normalize every (sample, channel) plane of `x` independently."""
+        batch, channels, height, width = x.shape
+        plane = height * width
+        rows = batch * channels
+        # Each plane becomes one contiguous row of a 2-D view and is handled
+        # by its own kernel program.
+        x_flat = x.contiguous().view(rows, plane)
+        out = torch.empty_like(x_flat)
+        instance_norm_kernel[(rows,)](x_flat, out, plane, self.eps)
+        return out.view(batch, channels, height, width)
+
+
+batch_size = 16  # dim 0 of the generated input
+features = 64  # dim 1 of the generated input; also returned by get_init_inputs()
+dim1 = 256  # dim 2 of the generated input
+dim2 = 256  # dim 3 of the generated input
+
+
+def get_inputs():
+    """Return a one-element list holding a random bf16 4-D tensor."""
+    return [torch.rand(batch_size, features, dim1, dim2, dtype=torch.bfloat16)]
+
+
+def get_init_inputs():
+    return [features]  # matches Model.__init__(num_features)
diff --git a/backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py b/backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py
new file mode 100644
index 0000000..4fab4bb
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py
@@ -0,0 +1,183 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_HW": 4096, "warp_size": 32}, num_warps=8, num_stages=2),
+    ],
+    key=["HW", "channels_per_group"],
+)
+@triton.jit
+def group_norm_stats_kernel(
+    x_ptr,
+    mean_ptr,
+    invstd_ptr,
+    C: tl.constexpr,
+    HW: tl.constexpr,
+    num_groups: tl.constexpr,
+    eps: tl.constexpr,
+    channels_per_group: tl.constexpr,
+    BLOCK_HW: tl.constexpr,
+    warp_size: tl.constexpr,
+):
+    """Store the fp32 mean and 1/sqrt(var + eps) of one (sample, group) slab."""
+    pid = tl.program_id(0)
+    batch_idx = pid // num_groups
+    group_idx = pid % num_groups
+    # Element offsets are int64; x is indexed as a contiguous (N, C, HW) buffer.
+    channel_start = group_idx * channels_per_group
+    batch_offset = batch_idx.to(tl.int64) * C * HW
+    group_elems = channels_per_group * HW
+    # Running fp32 totals of x and x*x over every element of the group.
+    sum_val = 0.0
+    sq_val = 0.0
+
+    for c in range(channels_per_group):
+        c_offset = (channel_start + c).to(tl.int64) * HW
+        base = batch_offset + c_offset
+        for hw_start in range(0, HW, BLOCK_HW):
+            offs = hw_start + tl.arange(0, BLOCK_HW)
+            mask = offs < HW
+            x_raw = tl.load(x_ptr + base + offs.to(tl.int64), mask=mask, other=0.0)
+            x_val = x_raw.to(tl.float32)
+            sum_val += tl.sum(x_val, axis=0)
+            sq_val += tl.sum(x_val * x_val, axis=0)
+    # E[x^2] - mean^2 can round below zero; clamp so sqrt() cannot return NaN.
+    mean = sum_val / group_elems
+    variance = tl.maximum(sq_val / group_elems - mean * mean, 0.0)
+    inv_std = 1.0 / tl.sqrt(variance + eps)
+    # One (mean, inv_std) pair per program, stored at index pid.
+    tl.store(mean_ptr + pid, mean)
+    tl.store(invstd_ptr + pid, inv_std)
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_HW": 4096, "warp_size": 32}, num_warps=8, num_stages=2),
+    ],
+    key=["HW"],
+)
+@triton.jit
+def group_norm_apply_kernel(
+    x_ptr,
+    y_ptr,
+    mean_ptr,
+    invstd_ptr,
+    weight_ptr,
+    bias_ptr,
+    C: tl.constexpr,
+    HW: tl.constexpr,
+    num_groups: tl.constexpr,
+    channels_per_group: tl.constexpr,
+    BLOCK_HW: tl.constexpr,
+    warp_size: tl.constexpr,
+):
+    """Apply y = x * scale + shift to one (sample, channel) plane, stored as bf16."""
+    plane_id = tl.program_id(0)
+    sample = plane_id // C
+    channel = plane_id % C
+
+    # Statistics are stored per (sample, group); find this channel's entry.
+    stat_slot = sample * num_groups + channel // channels_per_group
+    mu = tl.load(mean_ptr + stat_slot)
+    rstd = tl.load(invstd_ptr + stat_slot)
+    gamma = tl.load(weight_ptr + channel).to(tl.float32)
+    beta = tl.load(bias_ptr + channel).to(tl.float32)
+
+    # Fold normalization and the affine transform into one multiply-add.
+    scale = rstd * gamma
+    shift = beta - mu * scale
+
+    # int64 element offset of the plane inside the (N, C, HW)-indexed buffer.
+    plane_base = sample.to(tl.int64) * C * HW + channel.to(tl.int64) * HW
+
+    for hw_start in range(0, HW, BLOCK_HW):
+        idx = hw_start + tl.arange(0, BLOCK_HW)
+        in_bounds = idx < HW
+        addr = plane_base + idx.to(tl.int64)
+        x_val = tl.load(x_ptr + addr, mask=in_bounds, other=0.0).to(tl.float32)
+        normed = x_val * scale + shift
+        tl.store(y_ptr + addr, normed.to(tl.bfloat16), mask=in_bounds)
+
+
+class Model(nn.Module):
+    """GroupNorm for (N, C, H, W) inputs built from a stats and an apply kernel."""
+
+    def __init__(self, num_features: int, num_groups: int):
+        super(Model, self).__init__()
+        self.gn = nn.GroupNorm(num_groups=num_groups, num_channels=num_features)
+        self.num_features = num_features
+        self.num_groups = num_groups
+        self._packed = False
+
+    def _pack_weights(self, device):
+        """Cache contiguous affine parameters of `self.gn` on `device`."""
+        self.weight_packed = self.gn.weight.data.to(device).contiguous()
+        self.bias_packed = self.gn.bias.data.to(device).contiguous()
+        self._packed = True
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return group-normalized `x` (same shape and dtype as the input)."""
+        device = x.device
+        x = x.contiguous()
+        # Re-pack when x lives on a different device than the cached parameters.
+        if not self._packed or self.weight_packed.device != device:
+            self._pack_weights(device)
+
+        N, C, H, W_dim = x.shape
+        HW = H * W_dim
+        channels_per_group = C // self.num_groups
+        n_stats = N * self.num_groups  # one (mean, inv_std) slot per (sample, group)
+        y = torch.empty_like(x)
+        mean_buf = torch.empty(n_stats, device=device, dtype=torch.float32)
+        invstd_buf = torch.empty(n_stats, device=device, dtype=torch.float32)
+
+        group_norm_stats_kernel[(n_stats,)](
+            x,
+            mean_buf,
+            invstd_buf,
+            C,
+            HW,
+            self.num_groups,
+            self.gn.eps,
+            channels_per_group,
+        )
+
+        group_norm_apply_kernel[(N * C,)](
+            x,
+            y,
+            mean_buf,
+            invstd_buf,
+            self.weight_packed,
+            self.bias_packed,
+            C,
+            HW,
+            self.num_groups,
+            channels_per_group,
+        )
+        return y
+
+
+batch_size = 16  # dim 0 of the generated input
+features = 64  # dim 1 of the generated input; first constructor argument
+num_groups = 8  # second constructor argument
+dim1 = 256  # dim 2 of the generated input
+dim2 = 256  # dim 3 of the generated input
+
+
+def get_inputs():
+    """Return a one-element list holding a random bf16 4-D tensor."""
+    return [torch.rand(batch_size, features, dim1, dim2, dtype=torch.bfloat16)]
+
+
+def get_init_inputs():
+    return [features, num_groups]  # matches Model.__init__(num_features, num_groups)
diff --git a/backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py b/backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py
new file mode 100644
index 0000000..de1949b
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py
@@ -0,0 +1,109 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_S": 64}, num_warps=4, num_stages=2),
+    ],
+    key=["S", "F"],
+)
+@triton.jit
+def rms_norm_kernel(
+    x_ptr,
+    out_ptr,
+    eps_ptr,
+    S: tl.constexpr,
+    F: tl.constexpr,
+    stride_batch,
+    stride_feat,
+    BLOCK_S: tl.constexpr,
+):
+    """RMS-normalize BLOCK_S spatial positions of one sample across F features."""
+    # Flat program id -> (sample, spatial tile).
+    tile_count = tl.cdiv(S, BLOCK_S)
+    pid = tl.program_id(0)
+    sample = pid // tile_count
+    tile = pid % tile_count
+
+    pos = tile * BLOCK_S + tl.arange(0, BLOCK_S)
+    in_range = pos < S
+    sample_base = sample.to(tl.int64) * stride_batch
+    eps = tl.load(eps_ptr).to(tl.float32)
+
+    # Pass 1: fp32 sum of squares over the feature axis for every position.
+    acc = tl.zeros((BLOCK_S,), dtype=tl.float32)
+    for f in tl.static_range(F):
+        addr = sample_base + f * stride_feat + pos
+        x_fp32 = tl.load(x_ptr + addr, mask=in_range, other=0.0).to(tl.float32)
+        acc += x_fp32 * x_fp32
+
+    mean_sq = acc / F
+    rms = tl.sqrt(mean_sq + eps)
+
+    # Pass 2: divide every feature value by its position's RMS; store as bf16.
+    for f in tl.static_range(F):
+        addr = sample_base + f * stride_feat + pos
+        x_fp32 = tl.load(x_ptr + addr, mask=in_range, other=0.0).to(tl.float32)
+        tl.store(out_ptr + addr, (x_fp32 / rms).to(tl.bfloat16), mask=in_range)
+
+
+class Model(nn.Module):
+    """RMS normalization over dim 1 of a (B, F, *spatial) tensor, output in bf16."""
+
+    def __init__(self, num_features: int, eps: float = 1e-5):
+        super().__init__()
+        self.num_features = num_features
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return x / sqrt(mean(x**2 over dim 1) + eps) as a bfloat16 tensor."""
+        # The kernel stores bf16 and uses unit stride along the flattened spatial axis.
+        x = x.to(torch.bfloat16).contiguous()
+
+        batch, feats = x.shape[0], x.shape[1]
+        spatial = 1
+        for extent in x.shape[2:]:
+            spatial *= extent
+
+        x_flat = x.view(batch, feats, spatial)
+        out_flat = torch.empty_like(x_flat)
+        # eps is handed to the kernel through a one-element fp32 tensor.
+        eps_t = torch.tensor([self.eps], dtype=torch.float32, device=x.device)
+
+        def grid(meta):
+            return (batch * triton.cdiv(spatial, meta["BLOCK_S"]),)
+
+        rms_norm_kernel[grid](
+            x_flat,
+            out_flat,
+            eps_t,
+            spatial,
+            feats,
+            feats * spatial,  # element stride between samples
+            spatial,  # element stride between features
+        )
+        return out_flat.view_as(x)
+
+
+batch_size = 16  # dim 0 of the generated input
+features = 64  # dim 1 of the generated input; also returned by get_init_inputs()
+dim1 = 256  # dim 2 of the generated input
+dim2 = 256  # dim 3 of the generated input
+
+
+def get_inputs():
+    """Return a one-element list holding a random bf16 4-D tensor."""
+    return [torch.rand(batch_size, features, dim1, dim2, dtype=torch.bfloat16)]
+
+
+def get_init_inputs():
+    return [features]  # first argument of Model.__init__; eps keeps its default
diff --git a/backends/triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py b/backends/triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py
new file mode 100644
index 0000000..d83afe2
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py
@@ -0,0 +1,110 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+REDUCE_BLOCK: tl.constexpr = 8192  # lane count passed to the single-program _reduce_kernel
+
+
+@triton.jit
+def _partial_sum_sq_kernel(
+    x_ptr,
+    out_ptr,
+    N,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """Write the fp32 sum of squares of one BLOCK_SIZE chunk of x to out[pid]."""
+    pid = tl.program_id(0)
+    # 64-bit start offset: pid * BLOCK_SIZE wraps in int32 once N reaches 2**31.
+    offsets = pid.to(tl.int64) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < N
+    x_fp32 = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+    tl.store(out_ptr + pid, tl.sum(x_fp32 * x_fp32))
+
+
+@triton.jit
+def _reduce_kernel(
+    partial_sums_ptr,
+    inv_norm_ptr,
+    num_partial,
+    REDUCE_BLOCK: tl.constexpr,
+):
+    """Sum all partial results and store 1/sqrt(total) at inv_norm_ptr."""
+    lanes = tl.arange(0, REDUCE_BLOCK)
+    running = tl.zeros([REDUCE_BLOCK], dtype=tl.float32)
+    for start in range(0, num_partial, REDUCE_BLOCK):
+        idx = start + lanes
+        running += tl.load(partial_sums_ptr + idx, mask=idx < num_partial, other=0.0)
+    # Store the reciprocal square root of the grand total as a single scalar.
+    total = tl.sum(running)
+    tl.store(inv_norm_ptr, tl.rsqrt(total))
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 2048}, num_warps=8, num_stages=2),
+    ],
+    key=["N"],
+)
+@triton.jit
+def _normalize_kernel(
+    x_ptr,
+    out_ptr,
+    inv_norm_ptr,
+    N,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """Scale one BLOCK_SIZE chunk of x by the scalar at inv_norm_ptr; store bf16."""
+    inv_norm = tl.load(inv_norm_ptr)
+    # 64-bit start offset: pid * BLOCK_SIZE wraps in int32 once N reaches 2**31.
+    offsets = tl.program_id(0).to(tl.int64) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < N
+    x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+    tl.store(out_ptr + offsets, (x * inv_norm).to(tl.bfloat16), mask=mask)
+
+
+class Model(nn.Module):
+    """Divide a tensor by its Frobenius norm using three Triton kernel launches."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return x multiplied by 1 / sqrt(sum(x**2)), in the original shape."""
+        flat = x.contiguous().view(-1)
+        total = flat.numel()
+        chunk = 8192  # elements reduced by each stage-1 program
+        n_chunks = triton.cdiv(total, chunk)
+
+        # Stage 1: one fp32 partial sum of squares per chunk.
+        partials = torch.empty(n_chunks, device=x.device, dtype=torch.float32)
+        _partial_sum_sq_kernel[(n_chunks,)](flat, partials, total, BLOCK_SIZE=chunk)
+        # Stage 2: a single program folds the partials into 1 / norm.
+        inv_norm = torch.empty(1, device=x.device, dtype=torch.float32)
+        _reduce_kernel[(1,)](partials, inv_norm, n_chunks, REDUCE_BLOCK=REDUCE_BLOCK)
+        # Stage 3: scale every element by the reciprocal norm.
+        scaled = torch.empty_like(flat)
+        launch = lambda META: (triton.cdiv(total, META["BLOCK_SIZE"]),)
+        _normalize_kernel[launch](flat, scaled, inv_norm, total)
+        return scaled.view(x.shape)
+
+
+batch_size = 16  # dim 0 of the generated input
+features = 64  # dim 1 of the generated input
+dim1 = 256  # dim 2 of the generated input
+dim2 = 256  # dim 3 of the generated input
+
+
+def get_inputs():
+    """Return a one-element list holding a random bf16 4-D tensor."""
+    return [torch.rand(batch_size, features, dim1, dim2, dtype=torch.bfloat16)]
+
+
+def get_init_inputs():
+    return []  # Model.__init__ takes no arguments besides self
diff --git a/backends/triton/cpu/KernelBench/level1/38_L1Norm_.py b/backends/triton/cpu/KernelBench/level1/38_L1Norm_.py
new file mode 100644
index 0000000..4a3a396
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/38_L1Norm_.py
@@ -0,0 +1,88 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=8, num_stages=2),
+    ],
+    key=["N"],
+)
+@triton.jit
+def l1_norm_kernel(
+    x_ptr,
+    out_ptr,
+    M,
+    N,
+    stride_xm,
+    stride_xn,
+    stride_om,
+    stride_on,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """Divide each element of one row by that row's mean absolute value."""
+    row64 = tl.program_id(0).to(tl.int64)
+    lanes = tl.arange(0, BLOCK_SIZE)
+    src_row = x_ptr + row64 * stride_xm
+    dst_row = out_ptr + row64 * stride_om
+
+    # Pass 1: accumulate sum(|x|) over the row in fp32.
+    abs_sum = 0.0
+    for col_start in range(0, N, BLOCK_SIZE):
+        cols = col_start + lanes
+        valid = cols < N
+        vals = tl.load(src_row + cols * stride_xn, mask=valid, other=0.0)
+        abs_sum += tl.sum(tl.abs(vals.to(tl.float32)), axis=0)
+
+    # The divisor is the mean of |x|, i.e. the L1 norm divided by N.
+    mean_val = abs_sum / N
+
+    # Pass 2: re-read the row, divide by the mean and store as bfloat16.
+    for col_start in range(0, N, BLOCK_SIZE):
+        cols = col_start + lanes
+        valid = cols < N
+        vals = tl.load(src_row + cols * stride_xn, mask=valid, other=0.0)
+        scaled = vals.to(tl.float32) / mean_val
+        tl.store(dst_row + cols * stride_on, scaled.to(tl.bfloat16), mask=valid)
+
+
+class Model(nn.Module):
+    """Row-wise scaling of a 2-D tensor by the mean absolute value of each row."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return x with every row divided by mean(|row|)."""
+        rows, cols = x.shape
+        out = torch.empty_like(x)
+        l1_norm_kernel[(rows,)](
+            x,
+            out,
+            rows,
+            cols,
+            *x.stride(),  # strides are forwarded, so x is used without a copy
+            *out.stride(),
+        )
+        return out
+
+
+batch_size = 4096  # dim 0 (rows) of the generated input
+dim = 16384  # dim 1 (columns) of the generated input
+
+
+def get_inputs():
+    """Return a one-element list holding a random bf16 2-D tensor."""
+    return [torch.rand(batch_size, dim, dtype=torch.bfloat16)]
+
+
+def get_init_inputs():
+    return []  # Model.__init__ takes no arguments besides self
diff --git a/backends/triton/cpu/KernelBench/level1/39_L2Norm_.py b/backends/triton/cpu/KernelBench/level1/39_L2Norm_.py
new file mode 100644
index 0000000..ecd571f
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/39_L2Norm_.py
@@ -0,0 +1,84 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+    ],
+    key=["N"],
+)
+@triton.jit
+def l2_norm_kernel(
+    x_ptr,
+    out_ptr,
+    M,
+    N,
+    stride_m,
+    stride_n,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """Scale one row of x to unit L2 norm; out is addressed with x's strides."""
+    # Widen before multiplying: row * stride_m overflows int32 on large inputs.
+    row_start = tl.program_id(0).to(tl.int64) * stride_m
+    row_in = x_ptr + row_start
+
+    # Pass 1: per-lane fp32 partial sums of x*x.
+    sum_sq = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
+    for off in range(0, N, BLOCK_SIZE):
+        cols = off + tl.arange(0, BLOCK_SIZE)
+        mask = cols < N
+        x = tl.load(row_in + cols * stride_n, mask=mask, other=0.0).to(tl.float32)
+        sum_sq += x * x
+
+    # The 1e-12 term keeps an all-zero row from dividing by zero.
+    inv_norm = 1.0 / tl.sqrt(tl.sum(sum_sq, axis=0) + 1e-12)
+
+    # Pass 2: re-read, scale and store as bfloat16.
+    for off in range(0, N, BLOCK_SIZE):
+        cols = off + tl.arange(0, BLOCK_SIZE)
+        mask = cols < N
+        x = tl.load(row_in + cols * stride_n, mask=mask, other=0.0).to(tl.float32)
+        out = x * inv_norm
+        tl.store(out_ptr + row_start + cols * stride_n, out.to(tl.bfloat16), mask=mask)
+
+
+class Model(nn.Module):
+    """Row-wise L2 normalization of a 2-D tensor via `l2_norm_kernel`."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return x with every row scaled by 1 / sqrt(sum(row**2) + 1e-12)."""
+        # The kernel addresses `out` with x's strides, so x is made contiguous
+        # first and `out` is allocated with the same layout.
+        x = x.contiguous()
+        rows, cols = x.shape
+        out = torch.empty_like(x)
+
+        row_stride, col_stride = x.stride()
+        l2_norm_kernel[(rows,)](x, out, rows, cols, row_stride, col_stride)
+
+        return out
+
+
+batch_size = 4096  # dim 0 (rows) of the generated input
+dim = 16384  # dim 1 (columns) of the generated input
+
+
+def get_inputs():
+    """Return a one-element list holding a random bf16 2-D tensor."""
+    return [torch.rand(batch_size, dim, dtype=torch.bfloat16)]
+
+
+def get_init_inputs():
+    return []  # Model.__init__ takes no arguments besides self
diff --git a/backends/triton/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py b/backends/triton/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py
new file mode 100644
index 0000000..9380f52
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py
@@ -0,0 +1,147 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+    ],
+    key=["M", "N", "K"],
+)
+@triton.jit
+def _batched_matmul_kernel(
+    A_ptr,
+    B_ptr,
+    C_ptr,
+    M,
+    N,
+    K,
+    stride_ab: tl.constexpr,
+    stride_am: tl.constexpr,
+    stride_ak: tl.constexpr,
+    stride_bb: tl.constexpr,
+    stride_bk: tl.constexpr,
+    stride_bn: tl.constexpr,
+    stride_cb: tl.constexpr,
+    stride_cm: tl.constexpr,
+    stride_cn: tl.constexpr,
+    BATCH: tl.constexpr,  # not referenced inside the kernel body
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    # Each program produces one (BLOCK_M, BLOCK_N) tile of C for one batch entry.
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    num_tiles_mn = num_pid_m * num_pid_n
+    # Split the flat program id into a batch index and a tile index within it.
+    batch_id = pid // num_tiles_mn
+    tile_id = pid % num_tiles_mn
+    # Grouped order: tile ids walk a band of up to GROUP_SIZE_M tile rows column-wise.
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = tile_id // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + ((tile_id % num_pid_in_group) % group_size_m)
+    pid_n = (tile_id % num_pid_in_group) // group_size_m
+    # Per-batch base offsets are computed in int64.
+    batch_offset_a = batch_id.to(tl.int64) * stride_ab
+    batch_offset_b = batch_id.to(tl.int64) * stride_bb
+    batch_offset_c = batch_id.to(tl.int64) * stride_cb
+
+    a_desc = tl.make_tensor_descriptor(
+        base=A_ptr + batch_offset_a,
+        shape=(M, K),
+        strides=(stride_am, stride_ak),
+        block_shape=(BLOCK_M, BLOCK_K),
+    )
+
+    b_desc = tl.make_tensor_descriptor(
+        base=B_ptr + batch_offset_b,
+        shape=(K, N),
+        strides=(stride_bk, stride_bn),
+        block_shape=(BLOCK_K, BLOCK_N),
+    )
+    # Accumulate in fp32 over K in BLOCK_K steps; cast to C's element type on store.
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for off_k in range(0, K, BLOCK_K):
+        a = a_desc.load([pid_m * BLOCK_M, off_k])
+        b = b_desc.load([off_k, pid_n * BLOCK_N])
+        acc += tl.dot(a, b)
+    c_desc = tl.make_tensor_descriptor(
+        base=C_ptr + batch_offset_c,
+        shape=(M, N),
+        strides=(stride_cm, stride_cn),
+        block_shape=(BLOCK_M, BLOCK_N),
+    )
+    c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(C_ptr.type.element_ty))
+
+
+class Model(nn.Module):
+    """Batched matrix multiplication backed by `_batched_matmul_kernel`."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        """Return the batched product of A and B.
+
+        A is unpacked as (batch, M, K) and B as (_, _, N); the result has shape
+        (batch, M, N) with A's dtype and device.
+        """
+        batch, rows, inner = A.shape
+        _, _, cols = B.shape
+
+        lhs = A.contiguous()
+        rhs = B.contiguous()
+        out = torch.empty((batch, rows, cols), device=lhs.device, dtype=lhs.dtype)
+
+        def grid(META):
+            tiles_m = triton.cdiv(rows, META["BLOCK_M"])
+            tiles_n = triton.cdiv(cols, META["BLOCK_N"])
+            return (batch * tiles_m * tiles_n,)
+
+        # Strides go in as (batch, row, column) triples for A, B and C in turn.
+        _batched_matmul_kernel[grid](
+            lhs,
+            rhs,
+            out,
+            rows,
+            cols,
+            inner,
+            *lhs.stride(),
+            *rhs.stride(),
+            *out.stride(),
+            BATCH=batch,
+        )
+        return out
+
+
+batch_size = 128  # dim 0 of both operands
+m = 128 * 4  # rows of A
+k = 256 * 4  # columns of A and rows of B
+n = 512 * 4  # columns of B
+
+
+def get_inputs():
+    """Return random bf16 operands A (batch_size, m, k) and B (batch_size, k, n)."""
+    shapes = ((batch_size, m, k), (batch_size, k, n))
+    return [torch.rand(*shape, dtype=torch.bfloat16) for shape in shapes]
+
+
+def get_init_inputs():
+    return []  # Model.__init__ takes no arguments besides self
diff --git a/backends/triton/cpu/KernelBench/level1/40_LayerNorm.py b/backends/triton/cpu/KernelBench/level1/40_LayerNorm.py
new file mode 100644
index 0000000..e0954b2
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/40_LayerNorm.py
@@ -0,0 +1,111 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_SIZE": 1024, "warp_size": 32}, num_warps=16, num_stages=2
+        ),
+    ],
+    key=["N"],
+)
+@triton.jit
+def _layer_norm_best_kernel(
+    X_ptr,
+    Y_ptr,
+    W_ptr,
+    B_ptr,
+    M,
+    N,
+    eps,
+    BLOCK_SIZE: tl.constexpr,
+    warp_size: tl.constexpr,
+):
+    """Layer-normalize one row of N elements: y = (x - mean) * rstd * w + b."""
+    row_start = tl.program_id(0).to(tl.int64) * N
+    _sum = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
+    _sum_sq = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
+    for off in range(0, N, BLOCK_SIZE):
+        cols = off + tl.arange(0, BLOCK_SIZE)
+        mask = cols < N
+        x = tl.load(X_ptr + row_start + cols, mask=mask, other=0.0).to(tl.float32)
+        _sum += x
+        _sum_sq += x * x
+    mean = tl.sum(_sum, axis=0) / N
+    # E[x^2] - mean^2 can round below zero; clamp so sqrt() cannot return NaN.
+    var = tl.maximum(tl.sum(_sum_sq, axis=0) / N - mean * mean, 0.0)
+    rstd = 1.0 / tl.sqrt(var + eps)
+    # Pass 2: normalize, apply the per-column weight and bias, store as bfloat16.
+    for off in range(0, N, BLOCK_SIZE):
+        cols = off + tl.arange(0, BLOCK_SIZE)
+        mask = cols < N
+        x = tl.load(X_ptr + row_start + cols, mask=mask, other=0.0).to(tl.float32)
+        w = tl.load(W_ptr + cols, mask=mask, other=1.0).to(tl.float32)
+        b = tl.load(B_ptr + cols, mask=mask, other=0.0).to(tl.float32)
+        y = (x - mean) * rstd * w + b
+        tl.store(Y_ptr + row_start + cols, y.to(tl.bfloat16), mask=mask)
+
+
+class Model(nn.Module):
+    """LayerNorm over the trailing `normalized_shape` dims via a Triton kernel."""
+
+    def __init__(self, normalized_shape: tuple):
+        super(Model, self).__init__()
+        self.ln = nn.LayerNorm(normalized_shape=normalized_shape)
+        self._moved = False
+
+    def _move_params(self, device):
+        """Cache flattened bf16 weight/bias on `device`, plus eps and the row length."""
+        params = (self.ln.weight.data, self.ln.bias.data)
+        self.w_flat, self.b_flat = (
+            p.to(device, dtype=torch.bfloat16).contiguous().flatten() for p in params
+        )
+        self._eps = self.ln.eps
+        self._norm_n = 1
+        for s in self.ln.normalized_shape:
+            self._norm_n *= s
+        self._moved = True
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return layer-normalized `x` with the same shape as the input."""
+        # Re-cache when x lives on a different device than the cached parameters.
+        if not self._moved or self.w_flat.device != x.device:
+            self._move_params(x.device)
+        N = self._norm_n
+        M = x.numel() // N
+        x_flat = x.contiguous().view(M, N)
+        y_flat = torch.empty_like(x_flat)
+        _layer_norm_best_kernel[(M,)](
+            x_flat,
+            y_flat,
+            self.w_flat,
+            self.b_flat,
+            M,
+            N,
+            self._eps,
+        )
+        return y_flat.view(x.shape)
+
+
+batch_size = 16  # dim 0 of the generated input
+features = 64  # dim 1 of the generated input; first entry of normalized_shape
+dim1 = 256  # dim 2 of the generated input; second entry of normalized_shape
+dim2 = 256  # dim 3 of the generated input; third entry of normalized_shape
+
+
+def get_inputs():
+    """Return a one-element list holding a random bf16 4-D tensor."""
+    return [torch.rand(batch_size, features, dim1, dim2, dtype=torch.bfloat16)]
+
+
+def get_init_inputs(*args, **kwargs):
+    return [(features, dim1, dim2)]  # normalized_shape; args/kwargs are ignored
diff --git a/backends/triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py b/backends/triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py
new file mode 100644
index 0000000..1d71cbb
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py
@@ -0,0 +1,121 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 256}, num_warps=4, num_stages=2),
+    ],
+    key=["output_length"],
+)
+@triton.jit
+def maxpool1d_kernel(
+    input_ptr,
+    output_ptr,
+    seq_length,
+    output_length,
+    num_channels,
+    stride_b,
+    stride_c,
+    stride_out_b,
+    stride_out_c,
+    KERNEL_SIZE: tl.constexpr,
+    STRIDE: tl.constexpr,
+    PADDING: tl.constexpr,
+    DILATION: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """Max-pool BLOCK_SIZE output positions of one (batch, channel) row."""
+    pid_bc = tl.program_id(0)
+    pid_o = tl.program_id(1)
+    # Widen to int64: b * stride_b would overflow int32 on large inputs.
+    b = (pid_bc // num_channels).to(tl.int64)
+    c = (pid_bc % num_channels).to(tl.int64)
+    o_offsets = pid_o * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    o_mask = o_offsets < output_length
+
+    base_in = input_ptr + b * stride_b + c * stride_c
+    running_max = tl.full([BLOCK_SIZE], value=-float("inf"), dtype=tl.float32)
+
+    # Taps outside [0, seq_length) are masked and read -inf, so they never win.
+    for k in range(KERNEL_SIZE):
+        inp_idx = o_offsets * STRIDE + k * DILATION - PADDING
+        valid = (inp_idx >= 0) & (inp_idx < seq_length) & o_mask
+        vals = tl.load(base_in + inp_idx, mask=valid, other=-float("inf"))
+        running_max = tl.maximum(running_max, vals.to(tl.float32))
+
+    base_out = output_ptr + b * stride_out_b + c * stride_out_c
+    tl.store(base_out + o_offsets, running_max.to(tl.bfloat16), mask=o_mask)
+
+
+class Model(nn.Module):
+    """1-D max pooling over the last dim of a (B, C, L) tensor."""
+
+    def __init__(
+        self,
+        kernel_size: int,
+        stride: int = None,
+        padding: int = 0,
+        dilation: int = 1,
+        return_indices: bool = False,
+    ):
+        super().__init__()
+        self.kernel_size = kernel_size
+        # An omitted stride falls back to the kernel size.
+        self.stride = kernel_size if stride is None else stride
+        self.padding = padding
+        self.dilation = dilation
+        # return_indices is accepted but never read by this implementation.
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return the pooled tensor of shape (B, C, output_length)."""
+        b, c, length = x.shape
+        src = x.contiguous()
+        window_span = self.dilation * (self.kernel_size - 1) + 1
+        output_length = (length + 2 * self.padding - window_span) // self.stride + 1
+        output = torch.empty((b, c, output_length), device=x.device, dtype=src.dtype)
+
+        grid = lambda META: (b * c, triton.cdiv(output_length, META["BLOCK_SIZE"]))
+        maxpool1d_kernel[grid](
+            src,
+            output,
+            length,
+            output_length,
+            c,
+            *src.stride()[:2],
+            *output.stride()[:2],
+            KERNEL_SIZE=self.kernel_size,
+            STRIDE=self.stride,
+            PADDING=self.padding,
+            DILATION=self.dilation,
+        )
+        return output
+
+
+batch_size = 64  # dim 0 of the generated input
+features = 192  # dim 1 of the generated input
+sequence_length = 65536  # dim 2 of the generated input
+
+kernel_size = 8  # constructor arguments, in Model.__init__ order
+stride = 1
+padding = 4
+dilation = 3
+
+return_indices = False
+
+
+def get_inputs():
+    """Return a one-element list holding a random bf16 3-D tensor."""
+    return [torch.rand(batch_size, features, sequence_length, dtype=torch.bfloat16)]
+
+
+def get_init_inputs():
+    return [kernel_size, stride, padding, dilation, return_indices]
diff --git a/backends/triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py b/backends/triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py
new file mode 100644
index 0000000..0d5aa8e
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py
@@ -0,0 +1,127 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_H": 1, "BLOCK_W": 256}, num_warps=4, num_stages=2),
+    ],
+    key=["OH", "OW"],  # autotune cache is keyed on the output height/width
+)
+@triton.jit
+def _maxpool2d_kernel(
+    x_ptr,  # input, indexed below as dense planes of H * W elements
+    out_ptr,  # output, indexed below as dense planes of OH * OW elements
+    H,
+    W,
+    OH,
+    OW,
+    BLOCK_H: tl.constexpr,
+    BLOCK_W: tl.constexpr,
+    KERNEL_SIZE: tl.constexpr,  # window edge, used for both axes
+    STRIDE: tl.constexpr,
+    PADDING: tl.constexpr,
+    DILATION: tl.constexpr,
+    IS_BF16: tl.constexpr,  # selects the bfloat16 store branch at the end
+):
+    pid_bc = tl.program_id(0)  # plane index
+    pid_oh = tl.program_id(1)  # tile index along output rows
+    pid_ow = tl.program_id(2)  # tile index along output columns
+
+    x_base = x_ptr + pid_bc.to(tl.int64) * H * W  # plane offset computed in int64
+    out_base = out_ptr + pid_bc.to(tl.int64) * OH * OW
+
+    oh_offsets = pid_oh * BLOCK_H + tl.arange(0, BLOCK_H)
+    ow_offsets = pid_ow * BLOCK_W + tl.arange(0, BLOCK_W)
+
+    max_vals = tl.full((BLOCK_H, BLOCK_W), float("-inf"), dtype=tl.float32)
+
+    for kh in range(KERNEL_SIZE):
+        ih = oh_offsets * STRIDE - PADDING + kh * DILATION  # input row of this tap
+        valid_h = (ih >= 0) & (ih < H) & (oh_offsets < OH)
+        for kw in range(KERNEL_SIZE):
+            iw = ow_offsets * STRIDE - PADDING + kw * DILATION  # input column of this tap
+            valid_w = (iw >= 0) & (iw < W) & (ow_offsets < OW)
+            mask = valid_h[:, None] & valid_w[None, :]  # masked-out taps load as -inf
+            ptrs = x_base + ih[:, None] * W + iw[None, :]
+            vals = tl.load(ptrs, mask=mask, other=float("-inf")).to(tl.float32)
+            max_vals = tl.maximum(max_vals, vals)
+
+    out_mask = (oh_offsets < OH)[:, None] & (ow_offsets < OW)[None, :]
+    out_ptrs = out_base + oh_offsets[:, None] * OW + ow_offsets[None, :]
+    if IS_BF16:
+        tl.store(out_ptrs, max_vals.to(tl.bfloat16), mask=out_mask)
+    else:
+        tl.store(out_ptrs, max_vals, mask=out_mask)  # float32 accumulator stored as is
+
+
+def maxpool2d(x, kernel_size, stride, padding, dilation):
+    """Max-pool a 4-D tensor with a square window via ``_maxpool2d_kernel``."""
+    B, C, H, W = x.shape
+    # Extent covered by one dilated window along either pooled axis.
+    span = dilation * (kernel_size - 1) + 1
+    OH = (H + 2 * padding - span) // stride + 1
+    OW = (W + 2 * padding - span) // stride + 1
+
+    x = x.contiguous()
+    out = torch.empty((B, C, OH, OW), device=x.device, dtype=x.dtype)
+
+    # Grid axes: flattened (b, c) plane, row tiles, column tiles.
+    grid = lambda META: (
+        B * C,
+        triton.cdiv(OH, META["BLOCK_H"]),
+        triton.cdiv(OW, META["BLOCK_W"]),
+    )
+    _maxpool2d_kernel[grid](
+        x,
+        out,
+        H,
+        W,
+        OH,
+        OW,
+        KERNEL_SIZE=kernel_size,
+        STRIDE=stride,
+        PADDING=padding,
+        DILATION=dilation,
+        IS_BF16=x.dtype == torch.bfloat16,
+    )
+    return out
+
+
+class Model(nn.Module):
+    def __init__(self, kernel_size: int, stride: int, padding: int, dilation: int):
+        # The four pooling settings are stored unchanged for forward().
+        super().__init__()
+        self.kernel_size, self.stride = kernel_size, stride
+        self.padding, self.dilation = padding, dilation
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply ``maxpool2d`` to ``x`` with the stored settings."""
+        return maxpool2d(x, self.kernel_size, self.stride, self.padding, self.dilation)
+
+
+batch_size = 32  # get_inputs(): tensor dim 0
+channels = 64  # get_inputs(): tensor dim 1
+height = 512  # get_inputs(): tensor dim 2
+width = 512  # get_inputs(): tensor dim 3
+kernel_size = 4  # this and the values below are returned by get_init_inputs()
+stride = 1
+padding = 1
+dilation = 1
+
+
+def get_inputs():
+    shape = (batch_size, channels, height, width)  # module-level sizes
+    return [torch.rand(shape, dtype=torch.bfloat16)]
+
+
+def get_init_inputs():  # positional arguments for Model.__init__, in signature order
+    return [kernel_size, stride, padding, dilation]
diff --git a/backends/triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py b/backends/triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py
new file mode 100644
index 0000000..84f6dc5
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py
@@ -0,0 +1,172 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_OW": 32}, num_warps=4, num_stages=2),
+    ],
+    key=["OW"],  # autotune cache is keyed on the output width
+)
+@triton.jit
+def _maxpool3d_kernel(
+    x_ptr,
+    out_ptr,
+    C,
+    D,
+    H,
+    W,
+    OD,
+    OH,
+    OW,
+    stride_xb,  # input strides, in elements
+    stride_xc,
+    stride_xd,
+    stride_xh,
+    stride_xw,
+    stride_ob,  # output strides, in elements
+    stride_oc,
+    stride_od,
+    stride_oh,
+    stride_ow,
+    POOL_STRIDE: tl.constexpr,
+    PAD: tl.constexpr,
+    DIL: tl.constexpr,
+    KS: tl.constexpr,  # window edge, shared by the three pooled axes
+    BLOCK_OW: tl.constexpr,
+):
+    pid_ow = tl.program_id(0)  # tile index along the output W axis
+    pid_dh = tl.program_id(1)  # flattened (od, oh)
+    pid_bc = tl.program_id(2)  # flattened (b, c)
+
+    b = pid_bc // C
+    c = pid_bc % C
+    od = pid_dh // OH
+    oh = pid_dh % OH
+
+    ow_start = pid_ow * BLOCK_OW
+    ow_offs = ow_start + tl.arange(0, BLOCK_OW)
+    mask_ow = ow_offs < OW
+
+    base = x_ptr + b.to(tl.int64) * stride_xb + c.to(tl.int64) * stride_xc
+
+    d_base = od * POOL_STRIDE - PAD  # first input index of the window on each axis
+    h_base = oh * POOL_STRIDE - PAD
+    w_bases = ow_offs * POOL_STRIDE - PAD
+
+    max_val = tl.full([BLOCK_OW], float("-inf"), dtype=tl.float32)
+
+    for kd in range(KS):
+        d_in = d_base + kd * DIL
+        d_valid = (d_in >= 0) & (d_in < D)
+        if d_valid:  # scalar test: skip depth taps that fall in the padding
+            for kh in range(KS):
+                h_in = h_base + kh * DIL
+                h_valid = (h_in >= 0) & (h_in < H)
+                if h_valid:  # scalar test: skip row taps that fall in the padding
+                    dh_offset = d_in * stride_xd + h_in * stride_xh
+                    for kw in range(KS):
+                        w_in = w_bases + kw * DIL
+                        w_valid = (w_in >= 0) & (w_in < W)
+                        ptrs = base + dh_offset + w_in * stride_xw
+                        valid_mask = mask_ow & w_valid  # masked lanes load as -inf
+                        val = tl.load(ptrs, mask=valid_mask, other=float("-inf"))
+                        max_val = tl.maximum(max_val, val.to(tl.float32))
+
+    out_base = out_ptr + b.to(tl.int64) * stride_ob + c.to(tl.int64) * stride_oc
+    out_base += od * stride_od + oh * stride_oh
+    out_ptrs = out_base + ow_offs * stride_ow
+    tl.store(out_ptrs, max_val.to(out_ptr.dtype.element_ty), mask=mask_ow)  # not bf16-only
+
+
+def maxpool3d_triton(x, kernel_size, stride_pool, padding, dilation):
+    """Max-pool a 5-D tensor with a cubic window using ``_maxpool3d_kernel``.
+
+    The same ``kernel_size``, ``stride_pool``, ``padding`` and ``dilation``
+    are applied to each of the last three axes. The returned tensor is
+    allocated with the dtype and device of ``x``.
+    """
+    B, C, D, H, W = x.shape
+    # Extent covered by one dilated window along any pooled axis.
+    span = dilation * (kernel_size - 1) + 1
+    OD = (D + 2 * padding - span) // stride_pool + 1
+    OH = (H + 2 * padding - span) // stride_pool + 1
+    OW = (W + 2 * padding - span) // stride_pool + 1
+
+    out = torch.empty(B, C, OD, OH, OW, device=x.device, dtype=x.dtype)
+
+    # Grid axes: tiles along OW, flattened (od, oh), flattened (b, c).
+    grid = lambda META: (
+        triton.cdiv(OW, META["BLOCK_OW"]),
+        OD * OH,
+        B * C,
+    )
+
+    _maxpool3d_kernel[grid](
+        x,
+        out,
+        C,
+        D,
+        H,
+        W,
+        OD,
+        OH,
+        OW,
+        *x.stride(),  # the five input strides, in dimension order
+        *out.stride(),  # the five output strides, in dimension order
+        POOL_STRIDE=stride_pool,
+        PAD=padding,
+        DIL=dilation,
+        KS=kernel_size,
+    )
+    return out
+
+
+class Model(nn.Module):
+    def __init__(
+        self,
+        kernel_size: int,
+        stride: int = None,
+        padding: int = 0,
+        dilation: int = 1,
+        return_indices: bool = False,
+        ceil_mode: bool = False,
+    ):
+        # NOTE: return_indices and ceil_mode are accepted but never read.
+        super().__init__()
+        self.kernel_size = kernel_size
+        self.stride_pool = kernel_size if stride is None else stride
+        self.padding = padding
+        self.dilation = dilation
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        args = (self.kernel_size, self.stride_pool, self.padding, self.dilation)
+        return maxpool3d_triton(x, *args)
+
+
+batch_size = 16  # get_inputs(): tensor dim 0
+channels = 32  # get_inputs(): tensor dim 1
+dim1 = 128  # get_inputs(): tensor dims 2-4
+dim2 = 128
+dim3 = 128
+kernel_size = 3  # this and the values below are returned by get_init_inputs()
+stride = 2
+padding = 1
+dilation = 3
+
+
+def get_inputs():
+    shape = (batch_size, channels, dim1, dim2, dim3)  # module-level sizes
+    return [torch.rand(shape, dtype=torch.bfloat16)]
+
+
+def get_init_inputs():  # first four positional arguments of Model.__init__
+    return [kernel_size, stride, padding, dilation]
diff --git a/backends/triton/cpu/KernelBench/level1/44_Average_Pooling_1D.py b/backends/triton/cpu/KernelBench/level1/44_Average_Pooling_1D.py
new file mode 100644
index 0000000..564bf7f
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/44_Average_Pooling_1D.py
@@ -0,0 +1,125 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 256}, num_warps=4, num_stages=2),
+    ],
+    key=["output_length", "kernel_size"],
+)
+@triton.jit
+def avg_pool1d_kernel(
+    input_ptr,
+    output_ptr,
+    input_length,
+    output_length,
+    kernel_size,  # runtime value, not a constexpr
+    stride,
+    padding,
+    stride_b,  # unused in the body
+    stride_c,  # element stride between rows addressed by program_id(0)
+    stride_l,
+    out_stride_b,  # unused in the body
+    out_stride_c,
+    out_stride_l,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid_bc = tl.program_id(0)  # row index
+    pid_l = tl.program_id(1)  # tile index along the output length
+
+    offs = pid_l * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offs < output_length
+
+    acc = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)  # float32 accumulator
+
+    for k in range(kernel_size):
+        inp_pos = offs * stride - padding + k
+        inp_mask = mask & (inp_pos >= 0) & (inp_pos < input_length)  # padding taps add 0
+        inp_ptr = input_ptr + pid_bc.to(tl.int64) * stride_c + inp_pos * stride_l
+        val = tl.load(inp_ptr, mask=inp_mask, other=0.0).to(tl.float32)
+        acc += val
+
+    inv_kernel_size = 1.0 / kernel_size  # divisor is always the full window size
+    acc = acc * inv_kernel_size
+
+    out_ptr = output_ptr + pid_bc.to(tl.int64) * out_stride_c + offs * out_stride_l
+    tl.store(out_ptr, acc.to(tl.bfloat16), mask=mask)  # result is cast to bfloat16
+
+
+def kernel_function(
+    x: torch.Tensor, kernel_size: int, stride: int, padding: int
+) -> torch.Tensor:
+    """Average-pool the last axis of a 3-D tensor with ``avg_pool1d_kernel``.
+
+    ``x`` is converted to contiguous bfloat16 first; the returned tensor is
+    always bfloat16 with shape ``(x.shape[0], x.shape[1], output_length)``.
+    """
+    n_batch, n_chan, in_len = x.shape
+    out_len = (in_len + 2 * padding - kernel_size) // stride + 1
+
+    # .to() returns x itself when it is already bfloat16.
+    x = x.to(torch.bfloat16).contiguous()
+    output = torch.empty(
+        (n_batch, n_chan, out_len), device=x.device, dtype=torch.bfloat16
+    )
+
+    # Collapse (batch, channel) into one row axis; both views share storage.
+    rows = n_batch * n_chan
+    x_flat = x.view(rows, in_len)
+    out_flat = output.view(rows, out_len)
+
+    grid = lambda META: (rows, triton.cdiv(out_len, META["BLOCK_SIZE"]))
+    avg_pool1d_kernel[grid](
+        x_flat,
+        out_flat,
+        in_len,
+        out_len,
+        kernel_size,
+        stride,
+        padding,
+        x_flat.stride(0),  # row stride, passed for both stride_b and stride_c
+        x_flat.stride(0),
+        x_flat.stride(1),
+        out_flat.stride(0),  # row stride, passed for both out_stride_b and out_stride_c
+        out_flat.stride(0),
+        out_flat.stride(1),
+    )
+
+    return output
+
+
+batch_size = 64  # get_inputs(): tensor dim 0
+in_channels = 128  # get_inputs(): tensor dim 1
+input_length = 65536  # get_inputs(): tensor dim 2
+kernel_size = 8  # this and the values below are returned by get_init_inputs()
+stride = 1
+padding = 4
+
+
+class Model(nn.Module):
+    def __init__(self, kernel_size: int, stride: int = 1, padding: int = 0):
+        super().__init__()
+        # Stored unchanged; forward() hands them to kernel_function().
+        self.kernel_size, self.stride, self.padding = kernel_size, stride, padding
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        cfg = (self.kernel_size, self.stride, self.padding)
+        return kernel_function(x, *cfg)
+
+
+def get_inputs():
+    shape = (batch_size, in_channels, input_length)  # module-level sizes
+    return [torch.rand(shape, dtype=torch.bfloat16)]
+
+
+def get_init_inputs():  # positional arguments for Model.__init__, in signature order
+    return [kernel_size, stride, padding]
diff --git a/backends/triton/cpu/KernelBench/level1/45_Average_Pooling_2D.py b/backends/triton/cpu/KernelBench/level1/45_Average_Pooling_2D.py
new file mode 100644
index 0000000..0e9e33f
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/45_Average_Pooling_2D.py
@@ -0,0 +1,115 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_OW": 16}, num_warps=4, num_stages=2),
+    ],
+    key=["OW"],
+)
+@triton.jit
+def avg_pool2d_kernel(
+    input_ptr,
+    output_ptr,
+    H,
+    W,
+    OW,
+    in_stride_nc,
+    out_stride_nc,
+    KERNEL_SIZE: tl.constexpr,
+    POOL_STRIDE: tl.constexpr,
+    PADDING: tl.constexpr,
+    BLOCK_OW: tl.constexpr,
+    OUTPUT_BF16: tl.constexpr,
+):
+    # One program averages BLOCK_OW outputs of output row pid_oh in plane pid_nc.
+    pid_nc = tl.program_id(0)
+    pid_oh = tl.program_id(1)
+    ow_offsets = tl.program_id(2) * BLOCK_OW + tl.arange(0, BLOCK_OW)
+    ow_mask = ow_offsets < OW
+
+    in_base = input_ptr + pid_nc.to(tl.int64) * in_stride_nc
+    out_base = output_ptr + pid_nc.to(tl.int64) * out_stride_nc
+
+    acc = tl.zeros((BLOCK_OW,), dtype=tl.float32)
+    for kh in range(KERNEL_SIZE):
+        h_in = pid_oh * POOL_STRIDE - PADDING + kh
+        h_ok = (h_in >= 0) & (h_in < H)
+        for kw in range(KERNEL_SIZE):
+            w_in = ow_offsets * POOL_STRIDE - PADDING + kw
+            # Taps that land in the padding border are masked and contribute 0.
+            tap_mask = ow_mask & h_ok & (w_in >= 0) & (w_in < W)
+            vals = tl.load(in_base + h_in * W + w_in, mask=tap_mask, other=0.0)
+            acc += vals.to(tl.float32)
+
+    # Divide by the full window area, so padded zeros count toward the mean.
+    result = acc * (1.0 / (KERNEL_SIZE * KERNEL_SIZE))
+
+    out_offset = pid_oh * OW + ow_offsets
+    if OUTPUT_BF16:
+        tl.store(out_base + out_offset, result.to(tl.bfloat16), mask=ow_mask)
+    else:
+        tl.store(out_base + out_offset, result, mask=ow_mask)
+
+
+class Model(nn.Module):
+    def __init__(self, kernel_size: int, stride: int = None, padding: int = 0):
+        super().__init__()
+        self.kernel_size = kernel_size
+        self.stride = stride if stride is not None else kernel_size
+        self.padding = padding
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Average-pool the last two axes of a 4-D tensor with a square window.
+
+        Padding acts as implicit zeros that are included in the divisor; the
+        output is allocated with the dtype and device of ``x``.
+        """
+        N, C, H, W = x.shape
+        x = x.contiguous()
+
+        OH = (H + 2 * self.padding - self.kernel_size) // self.stride + 1
+        OW = (W + 2 * self.padding - self.kernel_size) // self.stride + 1
+        output = torch.empty(N, C, OH, OW, device=x.device, dtype=x.dtype)
+
+        grid = lambda META: (N * C, OH, triton.cdiv(OW, META["BLOCK_OW"]))
+        avg_pool2d_kernel[grid](
+            x,
+            output,
+            H,
+            W,
+            OW,
+            H * W,  # element stride between input planes (x is contiguous)
+            OH * OW,  # element stride between output planes
+            KERNEL_SIZE=self.kernel_size,
+            POOL_STRIDE=self.stride,
+            PADDING=self.padding,
+            OUTPUT_BF16=x.dtype == torch.bfloat16,
+        )
+
+        return output
+
+
+batch_size = 16  # get_inputs(): tensor dim 0
+channels = 64  # get_inputs(): tensor dim 1
+height = 2048  # get_inputs(): tensor dim 2
+width = 2048  # get_inputs(): tensor dim 3
+kernel_size = 11  # the only value returned by get_init_inputs()
+
+
+def get_inputs():
+    shape = (batch_size, channels, height, width)  # module-level sizes
+    return [torch.rand(shape, dtype=torch.bfloat16)]
+
+
+def get_init_inputs():  # only kernel_size; Model's stride/padding defaults apply
+    return [kernel_size]
diff --git a/backends/triton/cpu/KernelBench/level1/46_Average_Pooling_3D.py b/backends/triton/cpu/KernelBench/level1/46_Average_Pooling_3D.py
new file mode 100644
index 0000000..f52c515
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/46_Average_Pooling_3D.py
@@ -0,0 +1,157 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_OW": 64}, num_warps=4, num_stages=2),
+    ],
+    key=["OW"],  # autotune cache is keyed on the output width
+)
+@triton.jit
+def avg_pool3d_kernel(
+    x_ptr,
+    out_ptr,
+    C,
+    D,
+    H,
+    W,
+    OD,
+    OH,
+    OW,
+    stride_xn,  # input strides in elements; the W stride is implicitly 1
+    stride_xc,
+    stride_xd,
+    stride_xh,
+    stride_on,  # output strides in elements; the W stride is implicitly 1
+    stride_oc,
+    stride_od,
+    stride_oh,
+    KERNEL_SIZE: tl.constexpr,  # window edge, shared by the three pooled axes
+    STRIDE: tl.constexpr,
+    PADDING: tl.constexpr,
+    BLOCK_OW: tl.constexpr,
+):
+    pid_ow = tl.program_id(0)  # tile index along the output W axis
+    pid_ncdoh = tl.program_id(1)  # flattened (n, c, od, oh), oh fastest
+
+    oh = pid_ncdoh % OH
+    tmp = pid_ncdoh // OH
+    od = tmp % OD
+    tmp2 = tmp // OD
+    c = tmp2 % C
+    n = tmp2 // C
+
+    ow_start = pid_ow * BLOCK_OW
+    ow_offsets = ow_start + tl.arange(0, BLOCK_OW)
+    mask_ow = ow_offsets < OW
+
+    base_x = x_ptr + n.to(tl.int64) * stride_xn + c.to(tl.int64) * stride_xc
+
+    acc = tl.zeros((BLOCK_OW,), dtype=tl.float32)  # float32 accumulator
+
+    for kd in range(KERNEL_SIZE):
+        d_in = od * STRIDE + kd - PADDING
+        d_valid = (d_in >= 0) & (d_in < D)
+        if d_valid:  # scalar test: padded depth taps add nothing
+            for kh in range(KERNEL_SIZE):
+                h_in = oh * STRIDE + kh - PADDING
+                h_valid = (h_in >= 0) & (h_in < H)
+                if h_valid:  # scalar test: padded row taps add nothing
+                    row_base = (
+                        base_x
+                        + d_in.to(tl.int64) * stride_xd
+                        + h_in.to(tl.int64) * stride_xh
+                    )
+                    for kw in range(KERNEL_SIZE):
+                        w_in = ow_offsets * STRIDE + kw - PADDING
+                        w_valid = (w_in >= 0) & (w_in < W)
+                        mask = mask_ow & w_valid  # masked lanes load as 0
+                        ptrs = row_base + w_in
+                        vals = tl.load(ptrs, mask=mask, other=0.0)
+                        acc += vals.to(tl.float32)
+
+    inv_count = 1.0 / (KERNEL_SIZE * KERNEL_SIZE * KERNEL_SIZE)  # full window volume
+    acc = acc * inv_count
+
+    out_base = (
+        out_ptr
+        + n.to(tl.int64) * stride_on
+        + c.to(tl.int64) * stride_oc
+        + od.to(tl.int64) * stride_od
+        + oh.to(tl.int64) * stride_oh
+    )
+    tl.store(out_base + ow_offsets, acc.to(out_ptr.dtype.element_ty), mask=mask_ow)
+
+
+class Model(nn.Module):
+    def __init__(self, kernel_size: int, stride: int = None, padding: int = 0):
+        super().__init__()
+        self.kernel_size = kernel_size
+        self.stride = stride if stride is not None else kernel_size
+        self.padding = padding
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Average-pool the last three axes of a 5-D tensor with a cubic window."""
+        N, C, D, H, W = x.shape
+        # The kernel indexes the W axis with an implicit element stride of 1.
+        x = x.contiguous()
+        OD = (D + 2 * self.padding - self.kernel_size) // self.stride + 1
+        OH = (H + 2 * self.padding - self.kernel_size) // self.stride + 1
+        OW = (W + 2 * self.padding - self.kernel_size) // self.stride + 1
+        output = torch.empty(N, C, OD, OH, OW, device=x.device, dtype=x.dtype)
+
+        grid = lambda META: (
+            triton.cdiv(OW, META["BLOCK_OW"]),
+            N * C * OD * OH,
+        )
+        avg_pool3d_kernel[grid](
+            x,
+            output,
+            C,
+            D,
+            H,
+            W,
+            OD,
+            OH,
+            OW,
+            x.stride(0),
+            x.stride(1),
+            x.stride(2),
+            x.stride(3),
+            output.stride(0),
+            output.stride(1),
+            output.stride(2),
+            output.stride(3),
+            KERNEL_SIZE=self.kernel_size,
+            STRIDE=self.stride,
+            PADDING=self.padding,
+        )
+        return output
+
+
+batch_size = 16  # get_inputs(): tensor dim 0
+channels = 32  # get_inputs(): tensor dim 1
+depth = 128  # get_inputs(): tensor dim 2
+height = 128  # get_inputs(): tensor dim 3
+width = 256  # get_inputs(): tensor dim 4
+kernel_size = 3  # this and the values below are returned by get_init_inputs()
+stride = 2
+padding = 1
+
+
+def get_inputs():
+    shape = (batch_size, channels, depth, height, width)  # module-level sizes
+    return [torch.rand(shape, dtype=torch.bfloat16)]
+
+
+def get_init_inputs():  # positional arguments for Model.__init__, in signature order
+    return [kernel_size, stride, padding]
diff --git a/backends/triton/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py
new file mode 100644
index 0000000..e5088dd
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py
@@ -0,0 +1,103 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def get_reduction_configs():
+    # Single autotune candidate; BLOCK_R / BLOCK_N are the kernel's tile sizes.
+    tile = {"BLOCK_R": 64, "BLOCK_N": 128}
+    return [triton.Config(tile, num_warps=4, num_stages=2)]
+
+
+@triton.autotune(
+    configs=get_reduction_configs(),
+    key=["R", "C"],  # autotune cache is keyed on the reduced and kept extents
+)
+@triton.jit
+def sum_reduce_kernel(
+    x_ptr,
+    out_ptr,
+    B,  # unused in the body
+    R,  # extent of the reduced axis
+    C,  # extent of the kept axis
+    stride_xb,
+    stride_xr,
+    stride_xc: tl.constexpr,
+    stride_ob,
+    stride_oc,
+    BLOCK_R: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    pid_n = tl.program_id(0)  # column-tile index
+    pid_b = tl.program_id(1)  # batch index
+
+    x_batch_offset = pid_b.to(tl.int64) * stride_xb  # int64 batch offset
+    col_start = pid_n * BLOCK_N
+
+    acc = tl.zeros((BLOCK_N,), dtype=tl.float32)  # float32 accumulator per column
+
+    x_desc = tl.make_tensor_descriptor(
+        base=x_ptr + x_batch_offset,
+        shape=(R, C),
+        strides=(stride_xr, stride_xc),
+        block_shape=(BLOCK_R, BLOCK_N),
+    )
+    for off_r in range(0, R, BLOCK_R):
+        tile = x_desc.load([off_r, col_start])  # NOTE(review): assumes OOB reads are 0
+        acc += tl.sum(tile.to(tl.float32), axis=0)
+    offs_c = col_start + tl.arange(0, BLOCK_N)
+    out_offset = pid_b.to(tl.int64) * stride_ob
+    out_ptrs = out_ptr + out_offset + offs_c * stride_oc
+    # Cast to the output's element type instead of a hard-coded bfloat16.
+    tl.store(out_ptrs, acc.to(out_ptr.dtype.element_ty), mask=offs_c < C)
+
+
+class Model(nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Sum a 3-D tensor over axis 1, keeping that axis with size 1."""
+        if self.dim != 1:
+            raise NotImplementedError("This kernel only supports reduction over dim=1")
+        x = x.contiguous()
+        B, R, C = x.shape
+        output = torch.empty((B, 1, C), device=x.device, dtype=x.dtype)
+
+        grid = lambda META: (triton.cdiv(C, META["BLOCK_N"]), B)
+        sum_reduce_kernel[grid](
+            x,
+            output,
+            B,
+            R,
+            C,
+            x.stride(0),
+            x.stride(1),
+            x.stride(2),
+            output.stride(0),
+            output.stride(2),
+        )
+        return output
+
+
+batch_size = 128  # get_inputs(): tensor dim 0
+dim1 = 4096  # get_inputs(): tensor dim 1
+dim2 = 4095  # get_inputs(): tensor dim 2
+reduce_dim = 1  # returned by get_init_inputs() as Model's dim
+
+
+def get_inputs():
+    shape = (batch_size, dim1, dim2)  # module-level sizes
+    return [torch.rand(shape, dtype=torch.bfloat16)]
+
+
+def get_init_inputs():  # the single positional argument (dim) for Model.__init__
+    return [reduce_dim]
diff --git a/backends/triton/cpu/KernelBench/level1/48_Mean_reduction_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/48_Mean_reduction_over_a_dimension.py
new file mode 100644
index 0000000..7b43864
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/48_Mean_reduction_over_a_dimension.py
@@ -0,0 +1,106 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def get_reduction_configs():
+    # Single autotune candidate; BLOCK_R / BLOCK_N are the kernel's tile sizes.
+    tile = {"BLOCK_R": 64, "BLOCK_N": 128}
+    return [triton.Config(tile, num_warps=4, num_stages=2)]
+
+
+@triton.autotune(
+    configs=get_reduction_configs(),
+    key=["R", "C"],  # autotune cache is keyed on the reduced and kept extents
+)
+@triton.jit
+def mean_reduce_kernel(
+    x_ptr,
+    out_ptr,
+    B,  # unused in the body
+    R,  # extent of the reduced axis; also the divisor of the mean
+    C,  # extent of the kept axis
+    stride_xb,
+    stride_xr,
+    stride_xc: tl.constexpr,
+    stride_ob,
+    stride_oc,
+    BLOCK_R: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    pid_n = tl.program_id(0)  # column-tile index
+    pid_b = tl.program_id(1)  # batch index
+
+    x_batch_offset = pid_b.to(tl.int64) * stride_xb  # int64 batch offset
+    col_start = pid_n * BLOCK_N
+
+    acc = tl.zeros((BLOCK_N,), dtype=tl.float32)  # float32 accumulator per column
+
+    x_desc = tl.make_tensor_descriptor(
+        base=x_ptr + x_batch_offset,
+        shape=(R, C),
+        strides=(stride_xr, stride_xc),
+        block_shape=(BLOCK_R, BLOCK_N),
+    )
+    for off_r in range(0, R, BLOCK_R):
+        tile = x_desc.load([off_r, col_start])  # NOTE(review): assumes OOB reads are 0
+        acc += tl.sum(tile.to(tl.float32), axis=0)
+    inv_dim = 1.0 / R
+    result = acc * inv_dim
+
+    offs_c = col_start + tl.arange(0, BLOCK_N)
+    out_offset = pid_b.to(tl.int64) * stride_ob
+    tl.store(
+        out_ptr + out_offset + offs_c * stride_oc,
+        result.to(out_ptr.dtype.element_ty),  # output's own type, not fixed bfloat16
+        mask=offs_c < C,
+    )
+
+
+class Model(nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Average a 3-D tensor over axis 1 with ``mean_reduce_kernel``.
+
+        Returns a ``(B, C)`` tensor allocated with the dtype and device of ``x``.
+        """
+        assert self.dim == 1, "This kernel only supports reduction over dim=1"
+        x = x.contiguous()
+        B, R, C = x.shape
+        out = torch.empty((B, C), device=x.device, dtype=x.dtype)
+
+        grid = lambda META: (triton.cdiv(C, META["BLOCK_N"]), B)
+        mean_reduce_kernel[grid](
+            x,
+            out,
+            B,
+            R,
+            C,
+            *x.stride(),  # stride_xb, stride_xr, stride_xc
+            *out.stride(),  # stride_ob, stride_oc
+        )
+        return out
+
+
+batch_size = 128  # get_inputs(): tensor dim 0
+dim1 = 4096  # get_inputs(): tensor dim 1
+dim2 = 4095  # get_inputs(): tensor dim 2
+
+
+def get_inputs():
+    shape = (batch_size, dim1, dim2)  # module-level sizes
+    return [torch.rand(shape, dtype=torch.bfloat16)]
+
+
+def get_init_inputs():  # Model(dim=1), the only value its forward() assertion accepts
+    return [1]
diff --git a/backends/triton/cpu/KernelBench/level1/49_Max_reduction_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/49_Max_reduction_over_a_dimension.py
new file mode 100644
index 0000000..a82f882
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/49_Max_reduction_over_a_dimension.py
@@ -0,0 +1,105 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_N": 64, "BLOCK_K": 64}, num_warps=8, num_stages=2),
+    ],
+    key=["DIM1", "DIM2"],
+)
+@triton.jit
+def max_reduce_dim1_kernel(
+    x_ptr,
+    out_ptr,
+    BATCH: tl.constexpr,  # unused in the body
+    DIM1: tl.constexpr,  # extent of the reduced axis
+    DIM2: tl.constexpr,  # extent of the kept axis
+    stride_xb: tl.constexpr,
+    stride_xd1: tl.constexpr,
+    stride_xd2: tl.constexpr,
+    stride_ob: tl.constexpr,
+    stride_od2: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    pid_batch = tl.program_id(0)  # batch index
+    pid_n = tl.program_id(1)  # tile index along the kept axis
+
+    n_start = pid_n * BLOCK_N
+    n_offs = n_start + tl.arange(0, BLOCK_N)
+    n_mask = n_offs < DIM2
+
+    batch_offset = pid_batch.to(tl.int64) * stride_xb  # int64 batch offset
+
+    acc = tl.full((BLOCK_N,), value=float("-inf"), dtype=tl.float32)  # running max
+
+    for k_start in range(0, DIM1, BLOCK_K):
+        k_offs = k_start + tl.arange(0, BLOCK_K)
+        k_mask = k_offs < DIM1
+
+        ptrs = (
+            x_ptr
+            + batch_offset
+            + k_offs[:, None] * stride_xd1
+            + n_offs[None, :] * stride_xd2
+        )
+        mask = k_mask[:, None] & n_mask[None, :]  # masked elements load as -inf
+        tile = tl.load(ptrs, mask=mask, other=float("-inf"))
+
+        tile_max = tl.max(tile.to(tl.float32), axis=0)  # reduce the BLOCK_K axis
+        acc = tl.maximum(acc, tile_max)
+
+    out_ptrs = out_ptr + pid_batch.to(tl.int64) * stride_ob + n_offs * stride_od2
+    tl.store(out_ptrs, acc, mask=n_mask)  # float32 accumulator stored without a cast
+
+
+class Model(nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Take the maximum of a 3-D tensor over axis 1; returns shape (dim0, dim2)."""
+        if self.dim != 1:
+            raise NotImplementedError("This kernel only supports reduction over dim=1")
+        x = x.contiguous()
+        batch_size, dim1, dim2 = x.shape
+        output = torch.empty((batch_size, dim2), device=x.device, dtype=torch.float32)
+
+        grid = lambda META: (batch_size, triton.cdiv(dim2, META["BLOCK_N"]))
+        max_reduce_dim1_kernel[grid](
+            x,
+            output,
+            batch_size,
+            dim1,
+            dim2,
+            x.stride(0),
+            x.stride(1),
+            x.stride(2),
+            output.stride(0),
+            output.stride(1),
+        )
+        return output.to(x.dtype)  # kernel writes float32; cast back to the input dtype
+
+
+batch_size = 128  # get_inputs(): tensor dim 0
+dim1 = 4096  # get_inputs(): tensor dim 1
+dim2 = 4095  # get_inputs(): tensor dim 2
+
+
+def get_inputs():
+    shape = (batch_size, dim1, dim2)  # module-level sizes
+    return [torch.rand(shape, dtype=torch.bfloat16)]
+
+
+def get_init_inputs():  # Model(dim=1)
+    return [1]
diff --git a/backends/triton/cpu/KernelBench/level1/4_Matrix_vector_multiplication_.py b/backends/triton/cpu/KernelBench/level1/4_Matrix_vector_multiplication_.py
new file mode 100644
index 0000000..4573a0e
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/4_Matrix_vector_multiplication_.py
@@ -0,0 +1,76 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_K": 512}, num_warps=4, num_stages=3),
+    ],
+    key=["K"],
+)
+@triton.jit
+def _gemv_kernel(
+    a_ptr,  # matrix; each row is read with unit element stride
+    b_ptr,  # vector of K elements, read with unit element stride
+    c_ptr,  # output; one element is written per program
+    K,
+    stride_am,  # element stride between consecutive rows of the matrix
+    BLOCK_K: tl.constexpr,
+):
+    row = tl.program_id(0)
+    a_row_ptr = a_ptr + row.to(tl.int64) * stride_am  # int64: row * stride can pass 2**31
+
+    acc = tl.zeros((BLOCK_K,), dtype=tl.float32)  # per-lane partial sums in float32
+
+    for k in range(0, K, BLOCK_K):
+        offs_k = k + tl.arange(0, BLOCK_K)
+        mask = offs_k < K  # tail lanes beyond K load as 0
+        a_vals = tl.load(a_row_ptr + offs_k, mask=mask, other=0.0)
+        b_vals = tl.load(b_ptr + offs_k, mask=mask, other=0.0)
+        acc += a_vals.to(tl.float32) * b_vals.to(tl.float32)
+
+    result = tl.sum(acc)  # collapse the lanes into the row's dot product
+    tl.store(c_ptr + row, result.to(c_ptr.dtype.element_ty))  # output's own element type
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        """Multiply ``A`` (M, K) by ``B`` flattened to a vector; returns (M, 1)."""
+        # The kernel reads rows of A and all of B with unit element stride.
+        A = A.contiguous()
+        B_flat = B.reshape(-1).contiguous()  # reshape also accepts non-contiguous B
+        M, K = A.shape
+        C = torch.empty(M, device=A.device, dtype=A.dtype)
+        _gemv_kernel[(M,)](
+            A,
+            B_flat,
+            C,
+            K,
+            A.stride(0),
+        )
+        return C.view(M, 1)
+
+
+M = 256 * 8  # get_inputs(): rows of A
+K = 131072 * 8  # get_inputs(): columns of A and rows of B
+
+
+def get_inputs():
+    # A is drawn before B, so the random stream is consumed in the same order.
+    shapes = ((M, K), (K, 1))
+    return [torch.rand(s, dtype=torch.bfloat16) for s in shapes]
+
+
+def get_init_inputs():  # Model.__init__ takes no arguments
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.py
new file mode 100644
index 0000000..a2e0bcb
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.py
@@ -0,0 +1,232 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def swizzle_tile(
+    tile_id,
+    M,
+    N,
+    K,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    # Grouped ordering: tile ids are split into bands of GROUP_SIZE_M tile rows.
+    tiles_per_group = GROUP_SIZE_M * tl.cdiv(N, BLOCK_N)
+    first_row = (tile_id // tiles_per_group) * GROUP_SIZE_M
+    # The trailing group may hold fewer than GROUP_SIZE_M tile rows.
+    rows_here = tl.minimum(GROUP_SIZE_M, tl.cdiv(M, BLOCK_M) - first_row)
+    row_tile = first_row + (tile_id % rows_here)
+    col_tile = (tile_id % tiles_per_group) // rows_here
+    return row_tile, col_tile
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+    ],
+    key=["M", "N", "K"],
+)
+@triton.jit
+def conv2d_implicit_gemm_kernel(  # conv2d + bias computed as an implicit GEMM
+    x_ptr,  # input, addressed through stride_x{n,c,h,w}
+    w_ptr,  # weights described below as a (K, N) matrix
+    bias_ptr,  # N bias values, read with unit stride
+    out_ptr,  # output, addressed through stride_o{n,c,h,w}
+    M,  # GEMM rows; the caller in this file passes N_batch * OH * OW
+    N,  # GEMM columns; the caller in this file passes C_out
+    K,  # reduction length; the caller in this file passes C_in * KH * KW
+    OH,
+    OW,
+    H,
+    W,
+    stride_conv_h,  # convolution stride (not a memory stride)
+    stride_conv_w,
+    pad_h,
+    pad_w,
+    stride_xn,
+    stride_xc,
+    stride_xh,
+    stride_xw,
+    stride_on,
+    stride_oc,
+    stride_oh,
+    stride_ow,
+    stride_wk,
+    stride_wn: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+):
+    pid = tl.program_id(0)  # 1-D launch; mapped to an (M-tile, N-tile) pair below
+    pid_m, pid_n = swizzle_tile(pid, M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, GROUP_SIZE_M)
+
+    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    # Split each flat row index into (batch, output row, output column).
+    ohow = OH * OW
+    n_idx = (offs_m // ohow).to(tl.int64)  # int64 before multiplying by batch stride
+    rem = offs_m % ohow
+    oh_idx = rem // OW
+    ow_idx = rem % OW
+
+    mask_m = offs_m < M
+    mask_n = offs_n < N
+
+    w_desc = tl.make_tensor_descriptor(
+        base=w_ptr,
+        shape=(K, N),
+        strides=(stride_wk, stride_wn),
+        block_shape=(BLOCK_K, BLOCK_N),
+    )
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)  # fp32 accumulator
+
+    for k0 in range(0, K, BLOCK_K):
+        offs_k = k0 + tl.arange(0, BLOCK_K)
+        # Split the flat K index into (kh, kw, cin); cin varies fastest.
+        kh = offs_k // (KW * C_IN)
+        kw = (offs_k // C_IN) % KW
+        cin = offs_k % C_IN
+        # Input coordinates touched by each (output pixel, filter tap) pair.
+        ih = oh_idx[:, None] * stride_conv_h + kh[None, :] - pad_h
+        iw = ow_idx[:, None] * stride_conv_w + kw[None, :] - pad_w
+        # Taps outside the image, past K, or in rows past M are masked to 0.
+        valid = (ih >= 0) & (ih < H) & (iw >= 0) & (iw < W) & (offs_k[None, :] < K)
+        valid = valid & mask_m[:, None]
+
+        x_ptrs = (
+            x_ptr
+            + n_idx[:, None] * stride_xn
+            + cin[None, :].to(tl.int64) * stride_xc
+            + ih.to(tl.int64) * stride_xh
+            + iw.to(tl.int64) * stride_xw
+        )
+
+        x_tile = tl.load(x_ptrs, mask=valid, other=0.0)
+        w_tile = w_desc.load([k0, pid_n * BLOCK_N])
+
+        acc = tl.dot(x_tile, w_tile, acc)
+    bias_vals = tl.load(bias_ptr + offs_n, mask=mask_n, other=0.0)
+    acc += bias_vals[None, :]  # broadcast the per-channel bias over all rows
+
+    out_ptrs = (
+        out_ptr
+        + n_idx[:, None] * stride_on
+        + offs_n[None, :].to(tl.int64) * stride_oc
+        + oh_idx[:, None].to(tl.int64) * stride_oh
+        + ow_idx[:, None].to(tl.int64) * stride_ow
+    )
+    tl.store(out_ptrs, acc.to(tl.bfloat16), mask=mask_m[:, None] & mask_n[None, :])
+
+
+class Model(nn.Module):  # 11x11, stride-4, pad-2 conv (3 -> 96 channels) via Triton
+    def __init__(self, num_classes=1000):
+        super().__init__()
+        self.conv1 = nn.Conv2d(
+            in_channels=3, out_channels=96, kernel_size=11, stride=4, padding=2
+        )
+        self._packed = False  # num_classes is accepted but not used
+
+    def _pack_weights(self):
+        w = self.conv1.weight.data
+        b = self.conv1.bias.data
+        device = w.device
+        self.w_packed = (
+            w.permute(2, 3, 1, 0)  # OIHW -> (KH, KW, C_in, C_out)
+            .contiguous()
+            .reshape(-1, w.shape[0])  # rows indexed by (kh, kw, cin), cin fastest
+            .to(device=device, dtype=torch.bfloat16)
+            .contiguous()
+        )
+        self.bias_packed = b.to(device=device, dtype=torch.bfloat16).contiguous()
+        self._packed = True  # packed once; later weight updates are not re-packed
+
+    def forward(self, x):
+        if not self._packed:
+            self._pack_weights()
+
+        x = x.to(dtype=torch.bfloat16).contiguous()
+
+        N_batch, C_in, H, W_in = x.shape
+        C_out = self.w_packed.shape[1]
+        KH, KW = self.conv1.kernel_size  # read from the layer, not re-hard-coded
+        stride_h, stride_w = self.conv1.stride
+        pad_h, pad_w = self.conv1.padding
+        OH = (H + 2 * pad_h - KH) // stride_h + 1
+        OW = (W_in + 2 * pad_w - KW) // stride_w + 1
+        # GEMM view: M output pixels x N output channels, reduced over K taps.
+        M = N_batch * OH * OW
+        N = C_out
+        K = C_in * KH * KW
+
+        output = torch.empty(
+            (N_batch, C_out, OH, OW), device=x.device, dtype=torch.bfloat16
+        )
+
+        grid = lambda META: (
+            triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),
+        )
+
+        conv2d_implicit_gemm_kernel[grid](
+            x,
+            self.w_packed,
+            self.bias_packed,
+            output,
+            M,
+            N,
+            K,
+            OH,
+            OW,
+            H,
+            W_in,
+            stride_h,
+            stride_w,
+            pad_h,
+            pad_w,
+            x.stride(0),
+            x.stride(1),
+            x.stride(2),
+            x.stride(3),
+            output.stride(0),
+            output.stride(1),
+            output.stride(2),
+            output.stride(3),
+            self.w_packed.stride(0),
+            self.w_packed.stride(1),
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+        )
+
+        return output
+
+
+batch_size = 256
+num_classes = 1000  # forwarded to Model, which does not use it
+
+
+def get_inputs():
+    return [torch.rand((batch_size, 3, 224, 224), dtype=torch.bfloat16)]
+
+
+def get_init_inputs():
+    return [num_classes]
diff --git a/backends/triton/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py
new file mode 100644
index 0000000..1b9d31d
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py
@@ -0,0 +1,98 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_N": 64, "grf_mode": "128"}, num_warps=4, num_stages=2),
+    ],
+    key=["D1", "D2"],
+)
+@triton.jit
+def argmax_dim1_kernel(  # argmax over dim 1; each program owns BLOCK_N columns
+    x_ptr,
+    out_ptr,
+    D1: tl.constexpr,  # length of the reduced dimension
+    D2: tl.constexpr,  # number of columns per batch entry
+    stride_b,
+    stride_d1,
+    stride_d2,
+    stride_ob,
+    stride_od2,
+    BLOCK_N: tl.constexpr,
+    grf_mode: tl.constexpr,  # supplied by the config; not referenced in the body
+):
+    pid_n = tl.program_id(0)  # column-tile index
+    pid_b = tl.program_id(1)  # batch index
+
+    col_start = pid_n * BLOCK_N
+    cols = col_start + tl.arange(0, BLOCK_N)
+    col_mask = cols < D2
+    # Running maximum and its index, one lane per column.
+    max_val = tl.full((BLOCK_N,), -float("inf"), dtype=tl.float32)
+    max_idx = tl.zeros((BLOCK_N,), dtype=tl.int32)
+
+    batch_offset = pid_b.to(tl.int64) * stride_b
+    col_offsets = cols.to(tl.int64) * stride_d2
+    base = x_ptr + batch_offset + col_offsets
+
+    for k in tl.range(0, D1):
+        val = tl.load(base + k * stride_d1, mask=col_mask, other=-float("inf")).to(
+            tl.float32
+        )
+        update = val > max_val  # strict '>' keeps the first index on ties
+        max_val = tl.where(update, val, max_val)
+        max_idx = tl.where(update, k, max_idx)
+
+    out_ptrs = out_ptr + pid_b.to(tl.int64) * stride_ob + cols.to(tl.int64) * stride_od2
+    tl.store(out_ptrs, max_idx.to(tl.int64), mask=col_mask)
+
+
+class Model(nn.Module):
+    def __init__(self, dim=1):
+        super().__init__()
+        try:
+            self.dim = int(dim)
+        except (ValueError, TypeError):
+            self.dim = 1  # a non-convertible `dim` silently becomes 1
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # NOTE(review): self.dim is not read; dim 1 of a 3-D `x` is always reduced.
+        n_batch, n_reduce, n_cols = x.shape
+        result = torch.empty((n_batch, n_cols), device=x.device, dtype=torch.int64)
+        grid = lambda META: (triton.cdiv(n_cols, META["BLOCK_N"]), n_batch)
+        argmax_dim1_kernel[grid](
+            x,
+            result,
+            n_reduce,
+            n_cols,
+            x.stride(0),
+            x.stride(1),
+            x.stride(2),
+            result.stride(0),
+            result.stride(1),
+        )
+
+        return result
+
+
+batch_size = 128
+dim1 = 4096  # reduced dimension
+dim2 = 4095  # not a multiple of the kernel's BLOCK_N (64), so the tail mask is hit
+
+
+def get_inputs():
+    # One tensor of shape (batch_size, dim1, dim2) in torch's default dtype.
+    return [torch.rand(batch_size, dim1, dim2)]
+
+
+def get_init_inputs():
+    return [1]  # the `dim` argument for Model
diff --git a/backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py
new file mode 100644
index 0000000..15dc776
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py
@@ -0,0 +1,119 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_D2": 128, "BLOCK_K": 32, "warp_size": 16}, num_warps=4, num_stages=2
+        ),
+    ],
+    key=["D1", "D2"],
+)
+@triton.jit
+def argmin_kernel(  # argmin over dim 1; each program owns one (batch, D2-tile) pair
+    x_ptr,
+    out_ptr,
+    B,  # not referenced in the body
+    D1,  # length of the reduced dimension
+    D2,  # number of columns per batch entry
+    stride_b,
+    stride_d1,
+    stride_d2,
+    out_stride_b,
+    out_stride_d2,
+    BLOCK_D2: tl.constexpr,
+    BLOCK_K: tl.constexpr,  # rows of the reduced dimension loaded per iteration
+    warp_size: tl.constexpr,  # supplied by the config; not referenced in the body
+):
+    pid = tl.program_id(0)  # flat id over batch x D2 tiles
+    num_d2_blocks = tl.cdiv(D2, BLOCK_D2)
+    batch_idx = pid // num_d2_blocks
+    d2_block_idx = pid % num_d2_blocks
+
+    d2_start = d2_block_idx * BLOCK_D2
+    d2_offs = d2_start + tl.arange(0, BLOCK_D2)
+    d2_mask = d2_offs < D2
+
+    base = x_ptr + batch_idx.to(tl.int64) * stride_b
+    # Running minimum and its index, one lane per column.
+    min_val = tl.full([BLOCK_D2], float("inf"), dtype=tl.float32)
+    min_idx = tl.zeros([BLOCK_D2], dtype=tl.int32)
+
+    k_offs_base = tl.arange(0, BLOCK_K)
+
+    for k_start in tl.range(0, D1, BLOCK_K):
+        k_offs = k_start + k_offs_base
+        k_mask = k_offs < D1
+        ptrs = (
+            base
+            + k_offs[:, None].to(tl.int64) * stride_d1
+            + d2_offs[None, :] * stride_d2
+        )
+        mask = k_mask[:, None] & d2_mask[None, :]
+        tile = tl.load(ptrs, mask=mask, other=float("inf")).to(tl.float32)
+        # Column-wise minimum of this (BLOCK_K, BLOCK_D2) tile.
+        tile_min = tl.min(tile, axis=0)
+        # Strict '<' keeps the index from an earlier tile on ties.
+        update = tile_min < min_val
+        # Smallest row index in the tile that attains tile_min; others become D1.
+        k_indices = k_offs[:, None]
+        large_k = tl.full([1], D1, dtype=tl.int32)
+        k_masked = tl.where(tile == tile_min[None, :], k_indices, large_k)
+        tile_argmin = tl.min(k_masked, axis=0)
+
+        min_idx = tl.where(update, tile_argmin, min_idx)
+        min_val = tl.where(update, tile_min, min_val)
+
+    out_ptrs = out_ptr + batch_idx.to(tl.int64) * out_stride_b + d2_offs * out_stride_d2
+    tl.store(out_ptrs, min_idx.to(tl.int64), mask=d2_mask)
+
+
+class Model(nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # NOTE(review): self.dim is not read; dim 1 of a 3-D `x` is always reduced.
+        n_batch, n_reduce, n_cols = x.shape
+        result = torch.empty((n_batch, n_cols), device=x.device, dtype=torch.int64)
+
+        grid = lambda META: (n_batch * triton.cdiv(n_cols, META["BLOCK_D2"]),)
+        argmin_kernel[grid](
+            x,
+            result,
+            n_batch,
+            n_reduce,
+            n_cols,
+            x.stride(0),
+            x.stride(1),
+            x.stride(2),
+            result.stride(0),
+            result.stride(1),
+        )
+
+        return result
+
+
+batch_size = 128
+dim1 = 4096  # reduced dimension
+dim2 = 4095  # not a multiple of the kernel's BLOCK_D2 (128)
+dim = 1
+
+
+def get_inputs():
+    # One tensor of shape (batch_size, dim1, dim2) in torch's default dtype.
+    return [torch.rand(batch_size, dim1, dim2)]
+
+
+def get_init_inputs():
+    return [dim]
diff --git a/backends/triton/cpu/KernelBench/level1/53_Min_reduction_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/53_Min_reduction_over_a_dimension.py
new file mode 100644
index 0000000..b4fc20a
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/53_Min_reduction_over_a_dimension.py
@@ -0,0 +1,107 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_D1": 64, "BLOCK_D2": 256, "warp_size": 16},
+            num_warps=4,
+            num_stages=6,
+        ),
+    ],
+    key=["D1", "D2"],
+)
+@triton.jit
+def min_reduction_kernel(  # min over dim 1; each program owns one (D2-tile, batch)
+    x_ptr,
+    out_ptr,
+    B,  # not referenced in the body
+    D1,  # length of the reduced dimension
+    D2,  # number of columns per batch entry
+    stride_xb,
+    stride_xd1,
+    stride_xd2,
+    stride_ob,
+    stride_od2,
+    BLOCK_D1: tl.constexpr,  # rows of the reduced dimension loaded per iteration
+    BLOCK_D2: tl.constexpr,
+    warp_size: tl.constexpr,  # supplied by the config; not referenced in the body
+):
+    pid_d2 = tl.program_id(0)  # column-tile index
+    pid_b = tl.program_id(1)  # batch index
+
+    d2_start = pid_d2 * BLOCK_D2
+    d2_offs = d2_start + tl.arange(0, BLOCK_D2)
+    d2_mask = d2_offs < D2
+
+    batch_offset = pid_b.to(tl.int64) * stride_xb
+    base = x_ptr + batch_offset
+
+    acc = tl.full((BLOCK_D2,), value=float("inf"), dtype=tl.float32)  # running min
+
+    for d1_start in range(0, D1, BLOCK_D1):
+        d1_offs = d1_start + tl.arange(0, BLOCK_D1)
+        mask = (d1_offs[:, None] < D1) & d2_mask[None, :]
+        ptrs = base + d1_offs[:, None] * stride_xd1 + d2_offs[None, :] * stride_xd2
+        tile = tl.load(ptrs, mask=mask, other=float("inf")).to(tl.float32)
+        tile_min = tl.min(tile, axis=0)  # masked entries are +inf and never win
+        acc = tl.minimum(acc, tile_min)
+
+    out_ptrs = out_ptr + pid_b.to(tl.int64) * stride_ob + d2_offs * stride_od2
+    tl.store(out_ptrs, acc.to(out_ptr.dtype.element_ty), mask=d2_mask)  # no fp16 trip
+
+
+class Model(nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # NOTE(review): self.dim is not read; dim 1 of `x` is always reduced.
+        n_batch, n_reduce, n_cols = x.shape[0], x.shape[1], x.shape[2]
+
+        result = torch.empty((n_batch, n_cols), device=x.device, dtype=x.dtype)
+
+        def grid(META):
+            # One program per (column tile, batch entry) pair.
+            n_col_tiles = triton.cdiv(n_cols, META["BLOCK_D2"])
+            return (n_col_tiles, n_batch)
+
+        min_reduction_kernel[grid](
+            x,
+            result,
+            n_batch,
+            n_reduce,
+            n_cols,
+            x.stride(0),
+            x.stride(1),
+            x.stride(2),
+            result.stride(0),
+            result.stride(1),
+        )
+
+        # Shape (n_batch, n_cols), allocated with the dtype of x.
+        return result
+
+
+batch_size = 128
+dim1 = 4096  # reduced dimension
+dim2 = 4095  # not a multiple of the kernel's BLOCK_D2 (256)
+
+
+def get_inputs():
+    # One tensor of shape (batch_size, dim1, dim2) in torch's default dtype.
+    return [torch.rand(batch_size, dim1, dim2)]
+
+
+def get_init_inputs():
+    return [1]  # the `dim` argument for Model
diff --git a/backends/triton/cpu/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.py
new file mode 100644
index 0000000..a6a0552
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.py
@@ -0,0 +1,252 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def _to_triple(x):
+    # An int is repeated for all three dims; anything else is converted to a tuple.
+    triple = (x,) * 3 if isinstance(x, int) else tuple(x)
+    return triple
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {
+                "BLOCK_M": 64,
+                "BLOCK_N": 64,
+                "BLOCK_K": 16,
+                "GROUP_M": 8,
+                "grf_mode": "256",
+            },
+            num_warps=4,
+            num_stages=2,
+        ),
+    ],
+    key=["C_out", "K_FUSED"],
+)
+@triton.jit
+def _conv3d_fused_k(  # 3-D convolution as one GEMM over a fused (kd, kh, kw, c) axis
+    x_ptr,  # input; channels are addressed with stride 1 (see c_off below)
+    w_ptr,  # weights read as a row-major (K_FUSED, C_out) matrix
+    y_ptr,  # output; channels are addressed with stride 1 (see the store)
+    N_batch,
+    C_out,
+    OD,
+    OH,
+    OW,
+    sxn,  # x strides: batch, depth, height, width
+    sxd,
+    sxh,
+    sxw,
+    syn,  # y strides: batch, depth, height, width
+    syd,
+    syh,
+    syw,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    KD: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+    K_FUSED: tl.constexpr,  # the caller in this file passes KD * KH * KW * C_in
+    grf_mode: tl.constexpr,  # supplied by the config; not referenced in the body
+):
+    pid = tl.program_id(0)
+    M_total = N_batch * OD * OH * OW  # one GEMM row per output voxel
+    num_pid_m = tl.cdiv(M_total, BLOCK_M)
+    num_pid_n = tl.cdiv(C_out, BLOCK_N)
+    num_pid_in_group = GROUP_M * num_pid_n  # grouped tile order, GROUP_M rows per band
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_M)
+    pid_in_group = pid % num_pid_in_group
+    pid_m = first_pid_m + (pid_in_group % group_size_m)
+    pid_n = pid_in_group // group_size_m
+
+    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_k = tl.arange(0, BLOCK_K)
+    # Split each flat row index into (batch, od, oh, ow).
+    ohw = OH * OW
+    odhw = OD * ohw
+    n_idx = offs_m // odhw
+    rem = offs_m % odhw
+    od_idx = rem // ohw
+    rem2 = rem % ohw
+    oh_idx = rem2 // OW
+    ow_idx = rem2 % OW
+
+    mask_m = offs_m < M_total
+    # Address of the window origin: output coords are used directly as input coords.
+    x_spatial_base = (
+        x_ptr + n_idx.to(tl.int64) * sxn + od_idx * sxd + oh_idx * sxh + ow_idx * sxw
+    )
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)  # fp32 accumulator
+
+    w_bp = tl.make_block_ptr(
+        base=w_ptr,
+        shape=(K_FUSED, C_out),
+        strides=(C_out, 1),
+        offsets=(0, pid_n * BLOCK_N),
+        block_shape=(BLOCK_K, BLOCK_N),
+        order=(1, 0),
+    )
+
+    for k0 in range(0, K_FUSED, BLOCK_K):
+        k_idx = k0 + offs_k
+        c_off = k_idx % C_IN  # channel varies fastest in the fused index
+        spatial_idx = k_idx // C_IN
+        kw_off = spatial_idx % KW
+        kh_off = (spatial_idx // KW) % KH
+        kd_off = spatial_idx // (KW * KH)
+        k_input_off = kd_off * sxd + kh_off * sxh + kw_off * sxw + c_off
+
+        x_tile = tl.load(
+            x_spatial_base[:, None] + k_input_off[None, :],
+            mask=mask_m[:, None] & (k_idx[None, :] < K_FUSED),
+            other=0.0,
+        )
+        w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+        acc = tl.dot(x_tile, w_tile, acc)
+        w_bp = tl.advance(w_bp, (BLOCK_K, 0))  # step to the next K slab of weights
+
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    mask_n = offs_n < C_out
+    y_ptrs = (
+        y_ptr + n_idx.to(tl.int64) * syn + od_idx * syd + oh_idx * syh + ow_idx * syw
+    )
+    tl.store(
+        y_ptrs[:, None] + offs_n[None, :],
+        acc.to(tl.float16),
+        mask=mask_m[:, None] & mask_n[None, :],
+    )
+
+
+class Model(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,  # NOTE(review): stride/padding/dilation/groups are never read
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=False,
+    ):
+        super().__init__()
+        ks = _to_triple(kernel_size)
+        self.weight = nn.Parameter(torch.empty(out_channels, in_channels, *ks))
+        if bias:
+            self.bias = nn.Parameter(torch.empty(out_channels))
+        else:
+            self.bias = None
+        nn.init.kaiming_uniform_(self.weight, a=5**0.5)
+        if self.bias is not None:
+            fan_in = in_channels * ks[0] * ks[1] * ks[2]
+            bound = 1 / fan_in**0.5
+            nn.init.uniform_(self.bias, -bound, bound)
+        self._packed = False  # weights are repacked lazily on the first forward()
+
+    def _pack_weights(self):
+        # One-time repack: (C_out, C_in, KD, KH, KW) -> fp16 (KD*KH*KW*C_in, C_out).
+        C_out, C_in, KD, KH, KW = self.weight.data.shape
+        k_fused = KD * KH * KW * C_in
+        taps_last = self.weight.data.permute(2, 3, 4, 1, 0)
+        fused = taps_last.reshape(k_fused, C_out).contiguous()
+        self.w_fused = fused.to(dtype=torch.float16)
+        # Geometry cached for forward().
+        self._KD, self._KH, self._KW = KD, KH, KW
+        self._C_in = C_in
+        self._C_out = C_out
+        self._K_fused = k_fused
+        # NOTE(review): never reset, so later weight updates are not re-packed.
+        self._packed = True
+
+    def forward(self, x):
+        if not self._packed:
+            self._pack_weights()
+
+        B, C_in, D_in, H_in, W_in = x.shape
+        KD, KH, KW = self._KD, self._KH, self._KW
+        C_out = self._C_out
+        # Output extent with no padding and unit stride along each spatial axis.
+        OD, OH, OW = D_in - KD + 1, H_in - KH + 1, W_in - KW + 1
+        M_total = B * OD * OH * OW
+
+        x_fp16 = x.to(dtype=torch.float16)
+        x_cl = x_fp16.contiguous(memory_format=torch.channels_last_3d)
+        y = torch.empty(
+            B,
+            C_out,
+            OD,
+            OH,
+            OW,
+            device=x.device,
+            dtype=torch.float16,
+            memory_format=torch.channels_last_3d,
+        )
+
+        sxn, _, sxd, sxh, sxw = x_cl.stride()  # channel stride is not passed on
+        syn, _, syd, syh, syw = y.stride()
+
+        def grid(meta):
+            tiles_m = triton.cdiv(M_total, meta["BLOCK_M"])
+            tiles_n = triton.cdiv(C_out, meta["BLOCK_N"])
+            return (tiles_m * tiles_n,)
+
+        _conv3d_fused_k[grid](
+            x_cl,
+            self.w_fused,
+            y,
+            B,
+            C_out,
+            OD,
+            OH,
+            OW,
+            sxn,
+            sxd,
+            sxh,
+            sxw,
+            syn,
+            syd,
+            syh,
+            syw,
+            KD=KD,
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+            K_FUSED=self._K_fused,
+        )
+
+        if self.bias is not None:
+            y = y + self.bias.to(dtype=torch.float16).view(1, -1, 1, 1, 1)
+        return y
+
+
+batch_size = 16
+in_channels = 3
+out_channels = 64
+kernel_size = 3
+depth = 64
+width = 64
+height = 64
+
+
+def get_inputs():
+    # Axis order after channels is (depth, width, height); all three are equal here.
+    return [torch.rand(batch_size, in_channels, depth, width, height)]
+
+
+def get_init_inputs():
+    return [in_channels, out_channels, kernel_size]
diff --git a/backends/triton/cpu/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.py
new file mode 100644
index 0000000..5618624
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.py
@@ -0,0 +1,214 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_OW": 64, "BLOCK_N": 64, "BLOCK_K": 64, "grf_mode": "256"},
+            num_warps=4,
+            num_stages=2,
+        ),
+    ],
+    key=["H", "W", "C_IN", "C_out", "OH", "OW"],
+)
+@triton.jit
+def _conv2d_spatial_bf16_prepack(  # conv2d: one output-row segment per program
+    x_ptr,  # input, addressed as a (pixel, channel) array with row stride C_IN
+    w_ptr,  # weights, addressed through stride_w{kh,kw,ci,co}
+    y_ptr,  # output, addressed as a (pixel, channel) array with row stride C_out
+    N_batch,  # not referenced in the body
+    H,
+    W,
+    C_out,
+    OH,
+    OW,
+    stride_wkh,
+    stride_wkw,
+    stride_wci,
+    stride_wco,
+    BLOCK_OW: tl.constexpr,  # output columns per program
+    BLOCK_N: tl.constexpr,  # output channels per program
+    BLOCK_K: tl.constexpr,  # input channels per inner iteration
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+    grf_mode: tl.constexpr,  # supplied by the config; not referenced in the body
+):
+    n = tl.program_id(0)  # batch index
+    oh = tl.program_id(1)  # output row
+    pid_ow_n = tl.program_id(2)  # fused (output-column tile, channel tile) index
+
+    num_ow_tiles = tl.cdiv(OW, BLOCK_OW)
+    pid_ow = pid_ow_n % num_ow_tiles
+    pid_n = pid_ow_n // num_ow_tiles
+
+    ow0 = pid_ow * BLOCK_OW
+    HW = H * W
+
+    acc = tl.zeros((BLOCK_OW, BLOCK_N), dtype=tl.float32)  # fp32 accumulator
+
+    for kh in range(KH):
+        for kw in range(KW):
+            x_row_start = n * HW + (oh + kh) * W + (ow0 + kw)  # first pixel of this tap
+            x_valid_rows = W - (ow0 + kw)  # pixels left in the current input row
+            # Shape ends at the input row's end, so later pixels load as zero.
+            x_bp = tl.make_block_ptr(
+                base=x_ptr,
+                shape=(x_row_start + x_valid_rows, C_IN),
+                strides=(C_IN, 1),
+                offsets=(x_row_start, 0),
+                block_shape=(BLOCK_OW, BLOCK_K),
+                order=(1, 0),
+            )
+            # (C_IN, C_out) weight slice for the current (kh, kw) tap.
+            w_bp = tl.make_block_ptr(
+                base=w_ptr + kh * stride_wkh + kw * stride_wkw,
+                shape=(C_IN, C_out),
+                strides=(stride_wci, stride_wco),
+                offsets=(0, pid_n * BLOCK_N),
+                block_shape=(BLOCK_K, BLOCK_N),
+                order=(1, 0),
+            )
+
+            for c0 in range(0, C_IN, BLOCK_K):
+                x_tile = tl.load(x_bp, boundary_check=(0, 1), padding_option="zero")
+                w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+                acc = tl.dot(x_tile, w_tile, acc)
+                x_bp = tl.advance(x_bp, (0, BLOCK_K))
+                w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+
+    OHOW = OH * OW
+    y_row_start = n * OHOW + oh * OW + ow0
+    y_valid_rows = OW - ow0  # the store is clipped at the end of the output row
+    y_bp = tl.make_block_ptr(
+        base=y_ptr,
+        shape=(y_row_start + y_valid_rows, C_out),
+        strides=(C_out, 1),
+        offsets=(y_row_start, pid_n * BLOCK_N),
+        block_shape=(BLOCK_OW, BLOCK_N),
+        order=(1, 0),
+    )
+    tl.store(y_bp, acc.to(tl.bfloat16), boundary_check=(0, 1))
+
+
+def _ensure_xpu_bf16(x):
+    # Despite the name, only the dtype is touched; the device is left as it is.
+    needs_cast = x.dtype != torch.bfloat16
+    return x.to(dtype=torch.bfloat16) if needs_cast else x
+
+
+class Model(nn.Module):  # conv2d (no padding, unit stride) through a Triton kernel
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv2d = nn.Conv2d(
+            in_channels,
+            out_channels,
+            (kernel_size, kernel_size),
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+        self._packed = False
+        self._cached_x_ptr = None  # unused; kept so existing attribute reads work
+        self._x_nhwc = None  # unused; kept so existing attribute reads work
+
+    def _pack_weights(self):
+        w = self.conv2d.weight.data.detach()
+        self.w_hwio = _ensure_xpu_bf16(w.permute(2, 3, 1, 0).contiguous())  # OIHW->HWIO
+        self._packed = True
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if not self._packed:
+            self._pack_weights()
+
+        # Convert the input on every call. Caching the converted tensor by
+        # x.data_ptr() goes stale when x is updated in place or when a new
+        # tensor is allocated at a reused address, giving results for old data.
+        # NOTE(review): stride/padding/dilation/groups/bias are not applied here.
+        x_bf16 = _ensure_xpu_bf16(x)
+        x_cl = x_bf16.contiguous(memory_format=torch.channels_last)
+        x_nhwc = x_cl.permute(0, 2, 3, 1)
+
+        N_batch = x.shape[0]
+        C_in = x.shape[1]
+        H = x.shape[2]
+        W = x.shape[3]
+        KH, KW = self.conv2d.kernel_size
+        C_out = self.conv2d.out_channels
+        OH = H - KH + 1
+        OW = W - KW + 1
+
+        y = torch.empty(
+            (N_batch, C_out, OH, OW),
+            device=x_nhwc.device,
+            dtype=torch.bfloat16,
+            memory_format=torch.channels_last,
+        )
+        y_nhwc = y.permute(0, 2, 3, 1)  # NHWC view sharing y's storage
+
+        w = self.w_hwio
+        swkh, swkw, swci, swco = w.stride()
+
+        def grid(meta):
+            num_ow = triton.cdiv(OW, meta["BLOCK_OW"])
+            num_n = triton.cdiv(C_out, meta["BLOCK_N"])
+            return (N_batch, OH, num_ow * num_n)
+
+        _conv2d_spatial_bf16_prepack[grid](
+            x_nhwc,
+            w,
+            y_nhwc,
+            N_batch,
+            H,
+            W,
+            C_out,
+            OH,
+            OW,
+            swkh,
+            swkw,
+            swci,
+            swco,
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+        )
+
+        return y
+
+
+batch_size = 8
+height = 512
+width = 1024
+in_channels = 64
+out_channels = 128
+kernel_size = 3
+
+
+def get_inputs():
+    # One NCHW tensor in torch's default dtype.
+    return [torch.rand(batch_size, in_channels, height, width)]
+
+
+def get_init_inputs():
+    return [in_channels, out_channels, kernel_size]
diff --git a/backends/triton/cpu/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py
new file mode 100644
index 0000000..0c35718
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py
@@ -0,0 +1,205 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_OW": 64, "BLOCK_N": 64, "BLOCK_K": 64, "grf_mode": "128"},
+            num_warps=4,
+            num_stages=2,
+        ),
+    ],
+    key=["H", "W", "C_IN", "C_out", "OH", "OW"],
+)
+@triton.jit
+def _conv2d_spatial_tiled(  # conv2d: one output-row segment per program
+    x_ptr,  # input, addressed as a (pixel, channel) array with row stride C_IN
+    w_ptr,  # weights, addressed through stride_w{kh,kw,ci,co}
+    y_ptr,  # output, addressed as a (pixel, channel) array with row stride C_out
+    N_batch,  # not referenced in the body
+    H,
+    W,
+    C_out,
+    OH,
+    OW,
+    stride_wkh,
+    stride_wkw,
+    stride_wci,
+    stride_wco,
+    BLOCK_OW: tl.constexpr,  # output columns per program
+    BLOCK_N: tl.constexpr,  # output channels per program
+    BLOCK_K: tl.constexpr,  # input channels per inner iteration
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+    grf_mode: tl.constexpr,  # supplied by the config; not referenced in the body
+):
+    n = tl.program_id(0)  # batch index
+    oh = tl.program_id(1)  # output row
+    pid_ow_n = tl.program_id(2)  # fused (output-column tile, channel tile) index
+
+    num_ow_tiles = tl.cdiv(OW, BLOCK_OW)
+    pid_ow = pid_ow_n % num_ow_tiles
+    pid_n = pid_ow_n // num_ow_tiles
+
+    ow0 = pid_ow * BLOCK_OW
+    HW = H * W
+    OHOW = OH * OW
+
+    acc = tl.zeros((BLOCK_OW, BLOCK_N), dtype=tl.float32)  # fp32 accumulator
+
+    for kh in range(KH):
+        for kw in range(KW):
+            x_row_start = n * HW + (oh + kh) * W + (ow0 + kw)  # first pixel of this tap
+            x_valid_rows = W - (ow0 + kw)  # pixels left in the current input row
+            # Shape ends at the input row's end, so later pixels load as zero.
+            x_bp = tl.make_block_ptr(
+                base=x_ptr,
+                shape=(x_row_start + x_valid_rows, C_IN),
+                strides=(C_IN, 1),
+                offsets=(x_row_start, 0),
+                block_shape=(BLOCK_OW, BLOCK_K),
+                order=(1, 0),
+            )
+            # (C_IN, C_out) weight slice for the current (kh, kw) tap.
+            w_bp = tl.make_block_ptr(
+                base=w_ptr + kh * stride_wkh + kw * stride_wkw,
+                shape=(C_IN, C_out),
+                strides=(stride_wci, stride_wco),
+                offsets=(0, pid_n * BLOCK_N),
+                block_shape=(BLOCK_K, BLOCK_N),
+                order=(1, 0),
+            )
+
+            for c0 in range(0, C_IN, BLOCK_K):
+                x_tile = tl.load(x_bp, boundary_check=(0, 1), padding_option="zero")
+                w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+                acc = tl.dot(x_tile, w_tile, acc, input_precision="ieee")
+                x_bp = tl.advance(x_bp, (0, BLOCK_K))
+                w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+
+    y_row_start = n * OHOW + oh * OW + ow0
+    y_valid_rows = OW - ow0  # the store is clipped at the end of the output row
+    y_bp = tl.make_block_ptr(
+        base=y_ptr,
+        shape=(y_row_start + y_valid_rows, C_out),
+        strides=(C_out, 1),
+        offsets=(y_row_start, pid_n * BLOCK_N),
+        block_shape=(BLOCK_OW, BLOCK_N),
+        order=(1, 0),
+    )
+    tl.store(y_bp, acc.to(tl.float16), boundary_check=(0, 1))
+
+
+class Model(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: tuple,
+        stride: tuple = (1, 1),
+        padding: tuple = (0, 0),
+        dilation: tuple = (1, 1),
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super().__init__()
+        self.conv2d = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+        self._w_hwio = None
+        self._moved = False
+
+    def _move_params_once(self):
+        # OIHW -> HWIO, then cast to fp16 unless the weights already are fp16.
+        hwio = self.conv2d.weight.data.permute(2, 3, 1, 0).contiguous()
+        if hwio.dtype != torch.float16:
+            hwio = hwio.to(dtype=torch.float16)
+        self._w_hwio, self._moved = hwio, True
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if not self._moved:
+            self._move_params_once()
+
+        input_dtype = x.dtype
+        # The kernel works on fp16 data; other dtypes are cast in and back out.
+        x_half = x if input_dtype == torch.float16 else x.to(dtype=torch.float16)
+
+        x_cl = x_half.contiguous(memory_format=torch.channels_last)
+        x_nhwc = x_cl.permute(0, 2, 3, 1)
+
+        N, C_in, H, W = x_cl.shape
+        w = self._w_hwio
+        KH, KW, _, C_out = w.shape
+        OH, OW = H - KH + 1, W - KW + 1
+
+        y = torch.empty(
+            (N, C_out, OH, OW),
+            device=x_half.device,
+            dtype=torch.float16,
+            memory_format=torch.channels_last,
+        )
+        y_nhwc = y.permute(0, 2, 3, 1)
+
+        swkh, swkw, swci, swco = w.stride()
+
+        def grid(meta):
+            num_ow = triton.cdiv(OW, meta["BLOCK_OW"])
+            num_n = triton.cdiv(C_out, meta["BLOCK_N"])
+            return (N, OH, num_ow * num_n)
+
+        _conv2d_spatial_tiled[grid](
+            x_nhwc,
+            w,
+            y_nhwc,
+            N,
+            H,
+            W,
+            C_out,
+            OH,
+            OW,
+            swkh,
+            swkw,
+            swci,
+            swco,
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+        )
+
+        # NOTE(review): stride/padding/dilation/groups/bias of self.conv2d are not
+        # applied above; the output extent assumes no padding and unit stride.
+        return y if input_dtype == torch.float16 else y.to(dtype=input_dtype)
+
+
+batch_size = 8
+in_channels = 64
+out_channels = 128
+kernel_size = (5, 7)
+height = 512
+width = 256
+
+
+def get_inputs():
+    # One NCHW tensor in torch's default dtype.
+    return [torch.rand(batch_size, in_channels, height, width)]
+
+
+def get_init_inputs():
+    return [in_channels, out_channels, kernel_size]
diff --git a/backends/triton/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py
new file mode 100644
index 0000000..1206eac
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py
@@ -0,0 +1,229 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {
+                "BLOCK_OW": 64,
+                "BLOCK_N": 64,
+                "BLOCK_K": 64,
+                "GROUP_SIZE_M": 8,
+                "grf_mode": "256",
+            },
+            num_warps=4,
+            num_stages=2,
+        ),
+    ],
+    key=["H", "W", "C_IN", "C_out", "OH", "OW"],
+)
+@triton.jit
+def _conv_transpose2d_swizzled_v2(
+    x_ptr,  # input, read as a row-major (pixels, C_IN) matrix
+    w_ptr,  # weights, indexed [kh, kw, ci, co] through the stride_w* arguments
+    y_ptr,  # output, written as a row-major (pixels, C_out) matrix
+    N_batch,  # not referenced in the body; the image index is grid axis 1
+    H,  # input height and width as stored (the host passes the padded sizes)
+    W,
+    C_out,
+    OH,
+    OW,
+    stride_wkh,
+    stride_wkw,
+    stride_wci,
+    stride_wco,
+    BLOCK_OW: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    grf_mode: tl.constexpr,  # not referenced in the body
+):
+    n = tl.program_id(1)  # image index within the batch
+    pid = tl.program_id(0)  # linear index over (M tile, C_out tile) pairs
+    # An M tile is one output row (oh) crossed with one BLOCK_OW-wide column tile.
+    num_ow_tiles = tl.cdiv(OW, BLOCK_OW)
+    num_n_tiles = tl.cdiv(C_out, BLOCK_N)
+    num_m_tiles = OH * num_ow_tiles
+    # Decode pid group by group (GROUP_SIZE_M M tiles each) into (m_idx, pid_n).
+    group_id = pid // (GROUP_SIZE_M * num_n_tiles)
+    first_m_in_group = group_id * GROUP_SIZE_M
+    group_size = min(num_m_tiles - first_m_in_group, GROUP_SIZE_M)  # short last group
+    local_id = pid % (group_size * num_n_tiles)
+    m_local = local_id // num_n_tiles
+    pid_n = local_id % num_n_tiles  # C_out tile varies fastest inside an M tile
+    m_idx = first_m_in_group + m_local  # equals pid // num_n_tiles in a full group
+
+    oh = m_idx // num_ow_tiles
+    pid_ow = m_idx % num_ow_tiles
+    ow0 = pid_ow * BLOCK_OW  # first output column of this tile
+
+    HW = H * W  # pixels per input image
+    OHOW = OH * OW  # pixels per output image
+
+    acc = tl.zeros((BLOCK_OW, BLOCK_N), dtype=tl.float32)  # float32 accumulator
+    # Sum over kernel taps: output (oh, ow) reads input pixel (oh + kh, ow + kw).
+    for kh in range(KH):
+        for kw in range(KW):
+            x_row_start = n * HW + (oh + kh) * W + (ow0 + kw)
+            x_valid_rows = W - (ow0 + kw)  # pixels left in this input row
+            # shape[0] stops at the end of the input row, so later rows load as zero.
+            x_bp = tl.make_block_ptr(
+                base=x_ptr,
+                shape=(x_row_start + x_valid_rows, C_IN),
+                strides=(C_IN, 1),
+                offsets=(x_row_start, 0),
+                block_shape=(BLOCK_OW, BLOCK_K),
+                order=(1, 0),
+            )
+
+            w_bp = tl.make_block_ptr(
+                base=w_ptr + kh * stride_wkh + kw * stride_wkw,
+                shape=(C_IN, C_out),
+                strides=(stride_wci, stride_wco),
+                offsets=(0, pid_n * BLOCK_N),
+                block_shape=(BLOCK_K, BLOCK_N),
+                order=(1, 0),
+            )
+            # Reduce over input channels in BLOCK_K-wide slices.
+            for c0 in range(0, C_IN, BLOCK_K):
+                x_tile = tl.load(x_bp, boundary_check=(0, 1), padding_option="zero")
+                w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+                acc = tl.dot(x_tile, w_tile, acc)
+                x_bp = tl.advance(x_bp, (0, BLOCK_K))
+                w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+    # Same clipping on the store side: shape[0] stops at the end of output row oh.
+    y_row_start = n * OHOW + oh * OW + ow0
+    y_valid_rows = OW - ow0
+    y_bp = tl.make_block_ptr(
+        base=y_ptr,
+        shape=(y_row_start + y_valid_rows, C_out),
+        strides=(C_out, 1),
+        offsets=(y_row_start, pid_n * BLOCK_N),
+        block_shape=(BLOCK_OW, BLOCK_N),
+        order=(1, 0),
+    )
+    tl.store(y_bp, acc.to(tl.float16), boundary_check=(0, 1))  # output is float16
+
+
+class Model(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        groups=1,
+        bias=False,
+    ):
+        super().__init__()
+        self.conv_transpose2d = nn.ConvTranspose2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+        self._packed = False
+        self._cached_x_ptr = None
+
+    def _pack_weights(self):
+        weight = self.conv_transpose2d.weight.data.detach()
+        self.w_hwio = (
+            weight.flip(2, 3).permute(2, 3, 0, 1).to(dtype=torch.float16).contiguous()
+        )
+        self.KH = weight.shape[2]
+        self.KW = weight.shape[3]
+        self.C_in_val = weight.shape[0]
+        self.C_out_val = weight.shape[1]
+        self._packed = True
+
+    def forward(self, x):
+        if not self._packed:
+            self._pack_weights()
+
+        x = x.to(dtype=torch.float16)
+        N, C_in, H, W_in = x.shape
+        KH, KW = self.KH, self.KW
+        C_out = self.C_out_val
+
+        if self._cached_x_ptr != x.data_ptr():
+            x_pad = F.pad(x, (KW - 1, KW - 1, KH - 1, KH - 1))
+            x_pad_cl = x_pad.contiguous(memory_format=torch.channels_last)
+            self._x_nhwc = x_pad_cl.permute(0, 2, 3, 1)
+            self._cached_x_ptr = x.data_ptr()
+            self._H_pad = H + 2 * (KH - 1)
+            self._W_pad = W_in + 2 * (KW - 1)
+            self._OH = H + KH - 1
+            self._OW = W_in + KW - 1
+            self._N = N
+            self._y = torch.empty(
+                (N, C_out, self._OH, self._OW),
+                device=x.device,
+                dtype=torch.float16,
+                memory_format=torch.channels_last,
+            )
+            self._y_nhwc = self._y.permute(0, 2, 3, 1)
+
+        OH, OW = self._OH, self._OW
+        H_pad, W_pad = self._H_pad, self._W_pad
+
+        def grid(META):
+            num_ow = triton.cdiv(OW, META["BLOCK_OW"])
+            num_n = triton.cdiv(C_out, META["BLOCK_N"])
+            return (OH * num_ow * num_n, N)
+
+        _conv_transpose2d_swizzled_v2[grid](
+            self._x_nhwc,
+            self.w_hwio,
+            self._y_nhwc,
+            N,
+            H_pad,
+            W_pad,
+            C_out,
+            OH,
+            OW,
+            self.w_hwio.stride(0),
+            self.w_hwio.stride(1),
+            self.w_hwio.stride(2),
+            self.w_hwio.stride(3),
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+        )
+
+        return self._y
+
+
+batch_size = 8
+in_channels = 64
+out_channels = 64
+kernel_size = 3  # int, so nn.ConvTranspose2d uses a square 3 x 3 kernel
+height = 1024
+width = 1024
+
+
+def get_inputs():  # presumably the forward() arguments used by the harness
+    x = torch.rand(batch_size, in_channels, height, width)  # (N, C, H, W)
+    return [x]
+
+
+def get_init_inputs():  # matches Model's first three constructor parameters
+    return [in_channels, out_channels, kernel_size]
diff --git a/backends/triton/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py
new file mode 100644
index 0000000..2c22089
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py
@@ -0,0 +1,238 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_W": 16, "BLOCK_OC": 16}, num_warps=4, num_stages=3),
+    ],
+    key=["W_out", "C_out"],
+)
+@triton.jit
+def _conv_transpose3d_v2(
+    x_ptr,  # input; channel stride is taken to be 1 (block strides (sx_w, 1))
+    w_ptr,  # weights, read as contiguous (KD * KH * KW, C_IN, C_out)
+    out_ptr,  # output; channel stride taken to be 1, values stored as float16
+    D_in: tl.constexpr,
+    H_in: tl.constexpr,
+    W_in: tl.constexpr,
+    C_out: tl.constexpr,
+    D_out: tl.constexpr,
+    H_out: tl.constexpr,
+    W_out: tl.constexpr,
+    sx_b,  # x strides for batch, depth, height, width
+    sx_d,
+    sx_h,
+    sx_w,
+    so_b,  # out strides for batch, depth, height, width
+    so_d,
+    so_h,
+    so_w,
+    BLOCK_W: tl.constexpr,
+    BLOCK_OC: tl.constexpr,
+    KD: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+    BLOCK_K: tl.constexpr,  # input-channel slice per tl.dot; set by the caller
+):
+    pid_w = tl.program_id(0)  # tile index along output width
+    pid_bdh = tl.program_id(1)  # flattened (batch, d_out, h_out)
+    pid_oc = tl.program_id(2)  # tile index along output channels
+    # Unflatten pid_bdh into batch, output depth and output height.
+    b = pid_bdh // (D_out * H_out)
+    rem = pid_bdh % (D_out * H_out)
+    d_out = rem // H_out
+    h_out = rem % H_out
+
+    acc = tl.zeros((BLOCK_W, BLOCK_OC), dtype=tl.float32)
+    x_batch_base = x_ptr + b.to(tl.int64) * sx_b  # 64-bit batch offset
+    # out[d, h, w] accumulates x[d - kd, h - kh, w - kw] @ weight[kd, kh, kw].
+    for kd in range(KD):
+        d_in = d_out - kd
+        d_ok = (d_in >= 0) & (d_in < D_in)
+        if d_ok:  # skip taps that fall outside the input depth
+            for kh in range(KH):
+                h_in = h_out - kh
+                h_ok = (h_in >= 0) & (h_in < H_in)
+                if h_ok:  # skip taps that fall outside the input height
+                    x_dh_base = x_batch_base + d_in * sx_d + h_in * sx_h
+
+                    for kw in range(KW):
+                        w_in_start = pid_w * BLOCK_W - kw  # negative near the left edge
+                        kidx = kd * KH * KW + kh * KW + kw  # flat index of this tap
+                        # NOTE(review): out-of-range w rows rely on boundary_check.
+                        x_bp = tl.make_block_ptr(
+                            base=x_dh_base,
+                            shape=(W_in, C_IN),
+                            strides=(sx_w, 1),
+                            offsets=(w_in_start, 0),
+                            block_shape=(BLOCK_W, BLOCK_K),
+                            order=(1, 0),
+                        )
+                        w_bp = tl.make_block_ptr(
+                            base=w_ptr + kidx * C_IN * C_out,
+                            shape=(C_IN, C_out),
+                            strides=(C_out, 1),
+                            offsets=(0, pid_oc * BLOCK_OC),
+                            block_shape=(BLOCK_K, BLOCK_OC),
+                            order=(1, 0),
+                        )
+                        # Reduce over input channels in BLOCK_K-wide slices.
+                        for _c0 in range(0, C_IN, BLOCK_K):
+                            x_tile = tl.load(
+                                x_bp, boundary_check=(0, 1), padding_option="zero"
+                            )
+                            w_tile = tl.load(
+                                w_bp, boundary_check=(0, 1), padding_option="zero"
+                            )
+                            acc += tl.dot(x_tile, w_tile)
+                            x_bp = tl.advance(x_bp, (0, BLOCK_K))
+                            w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+    # Store the (BLOCK_W, BLOCK_OC) tile of output row (b, d_out, h_out).
+    out_bp = tl.make_block_ptr(
+        base=out_ptr + b.to(tl.int64) * so_b + d_out * so_d + h_out * so_h,
+        shape=(W_out, C_out),
+        strides=(so_w, 1),
+        offsets=(pid_w * BLOCK_W, pid_oc * BLOCK_OC),
+        block_shape=(BLOCK_W, BLOCK_OC),
+        order=(1, 0),
+    )
+    tl.store(out_bp, acc.to(tl.float16), boundary_check=(0, 1))
+
+
+class Model(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=(1, 1, 1),
+        padding=(0, 0, 0),
+        output_padding=(0, 0, 0),
+        groups=1,
+        bias=False,
+    ):
+        super().__init__()
+        if isinstance(kernel_size, int):
+            kernel_size = (kernel_size, kernel_size, kernel_size)
+        self.deconv3d = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self._packed = False
+        self._y_buf = None
+
+    def _pack_weights(self):
+        w = self.deconv3d.weight.data
+        KD, KH, KW = self.kernel_size
+        self.weight_packed = (
+            w.permute(2, 3, 4, 0, 1)
+            .reshape(KD * KH * KW, self.in_channels, self.out_channels)
+            .contiguous()
+            .to(dtype=torch.float16)
+        )
+        self._packed = True
+
+    def forward(self, x):
+        if not self._packed:
+            self._pack_weights()
+
+        device = x.device
+        B, C_in, D_in, H_in, W_in = x.shape
+        KD, KH, KW = self.kernel_size
+
+        D_out = D_in + KD - 1
+        H_out = H_in + KH - 1
+        W_out = W_in + KW - 1
+
+        x_cl = x.to(dtype=torch.float16).contiguous(
+            memory_format=torch.channels_last_3d
+        )
+
+        expected_shape = (B, self.out_channels, D_out, H_out, W_out)
+        if (
+            self._y_buf is None
+            or self._y_buf.shape != expected_shape
+            or self._y_buf.device != device
+        ):
+            self._y_buf = torch.empty(
+                *expected_shape,
+                device=device,
+                dtype=torch.float16,
+            ).contiguous(memory_format=torch.channels_last_3d)
+        output = self._y_buf
+
+        sx = x_cl.stride()
+        so = output.stride()
+
+        BLOCK_K = 32
+
+        grid = lambda META: (
+            triton.cdiv(W_out, META["BLOCK_W"]),
+            B * D_out * H_out,
+            triton.cdiv(self.out_channels, META["BLOCK_OC"]),
+        )
+
+        _conv_transpose3d_v2[grid](
+            x_cl,
+            self.weight_packed,
+            output,
+            D_in,
+            H_in,
+            W_in,
+            self.out_channels,
+            D_out,
+            H_out,
+            W_out,
+            sx[0],
+            sx[2],
+            sx[3],
+            sx[4],
+            so[0],
+            so[2],
+            so[3],
+            so[4],
+            KD=KD,
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+            BLOCK_K=BLOCK_K,
+        )
+
+        return output
+
+
+batch_size = 16
+in_channels = 32
+out_channels = 16
+kernel_size = (3, 5, 7)  # unpacked by Model as (KD, KH, KW)
+depth_in = 16
+height_in = 32
+width_in = 64
+
+
+def get_inputs():  # presumably the forward() arguments used by the harness
+    x = torch.rand(batch_size, in_channels, depth_in, height_in, width_in)
+    return [x]
+
+
+def get_init_inputs():  # matches Model's first three constructor parameters
+    return [in_channels, out_channels, kernel_size]
diff --git a/backends/triton/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py
new file mode 100644
index 0000000..14a217f
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py
@@ -0,0 +1,246 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {
+                "BLOCK_M": 64,
+                "BLOCK_N": 64,
+                "BLOCK_K": 16,
+                "GROUP_M": 8,
+                "grf_mode": "256",
+            },
+            num_warps=4,
+            num_stages=2,
+        ),
+    ],
+    key=["C_out", "K_FUSED"],
+)
+@triton.jit
+def _conv3d_fused_k(
+    x_ptr,  # input; channel stride is taken to be 1 (c_off is added unscaled)
+    w_ptr,  # weights, read as contiguous (K_FUSED, C_out)
+    y_ptr,  # output; channel stride taken to be 1, values stored as float16
+    N_batch,
+    C_out,
+    OD,
+    OH,
+    OW,
+    sxn,  # x strides for batch and the three spatial dims
+    sxd,
+    sxh,
+    sxw,
+    syn,  # y strides for batch and the three spatial dims
+    syd,
+    syh,
+    syw,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    KD: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+    K_FUSED: tl.constexpr,  # reduction length; the caller passes KD * KH * KW * C_IN
+    grf_mode: tl.constexpr,  # not referenced in the body
+):
+    pid = tl.program_id(0)
+    M_total = N_batch * OD * OH * OW  # GEMM rows: one per output position
+    num_pid_m = tl.cdiv(M_total, BLOCK_M)
+    num_pid_n = tl.cdiv(C_out, BLOCK_N)
+    num_pid_in_group = GROUP_M * num_pid_n  # programs per group of GROUP_M row tiles
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_M)  # short last group
+    pid_in_group = pid % num_pid_in_group
+    pid_m = first_pid_m + (pid_in_group % group_size_m)  # row tile varies fastest
+    pid_n = pid_in_group // group_size_m
+
+    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_k = tl.arange(0, BLOCK_K)
+    # Unflatten each row index into (batch, od, oh, ow).
+    ohw = OH * OW
+    odhw = OD * ohw
+    n_idx = offs_m // odhw
+    rem = offs_m % odhw
+    od_idx = rem // ohw
+    rem2 = rem % ohw
+    oh_idx = rem2 // OW
+    ow_idx = rem2 % OW
+
+    mask_m = offs_m < M_total  # rows past the end of the last row tile
+    # Per-row address of channel 0 at input position (od, oh, ow).
+    x_spatial_base = (
+        x_ptr + n_idx.to(tl.int64) * sxn + od_idx * sxd + oh_idx * sxh + ow_idx * sxw
+    )
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+
+    w_bp = tl.make_block_ptr(
+        base=w_ptr,
+        shape=(K_FUSED, C_out),
+        strides=(C_out, 1),
+        offsets=(0, pid_n * BLOCK_N),
+        block_shape=(BLOCK_K, BLOCK_N),
+        order=(1, 0),
+    )
+    # K is the fused (kd, kh, kw, c) axis with the channel varying fastest.
+    for k0 in range(0, K_FUSED, BLOCK_K):
+        k_idx = k0 + offs_k
+        c_off = k_idx % C_IN
+        spatial_idx = k_idx // C_IN
+        kw_off = spatial_idx % KW
+        kh_off = (spatial_idx // KW) % KH
+        kd_off = spatial_idx // (KW * KH)
+        k_input_off = kd_off * sxd + kh_off * sxh + kw_off * sxw + c_off
+        # Gather the (BLOCK_M, BLOCK_K) input tile; masked-off entries read as 0.
+        x_tile = tl.load(
+            x_spatial_base[:, None] + k_input_off[None, :],
+            mask=mask_m[:, None] & (k_idx[None, :] < K_FUSED),
+            other=0.0,
+        )
+        w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+        acc = tl.dot(x_tile, w_tile, acc)
+        w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    mask_n = offs_n < C_out
+    y_ptrs = (
+        y_ptr + n_idx.to(tl.int64) * syn + od_idx * syd + oh_idx * syh + ow_idx * syw
+    )
+    tl.store(
+        y_ptrs[:, None] + offs_n[None, :],
+        acc.to(tl.float16),
+        mask=mask_m[:, None] & mask_n[None, :],
+    )
+
+
+class Model(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=False,
+    ):
+        super(Model, self).__init__()
+        ks = (kernel_size, kernel_size, 1)
+        self.weight = nn.Parameter(torch.empty(out_channels, in_channels, *ks))
+        if bias:
+            self.bias = nn.Parameter(torch.empty(out_channels))
+        else:
+            self.bias = None
+        nn.init.kaiming_uniform_(self.weight, a=5**0.5)
+        if self.bias is not None:
+            fan_in = in_channels * ks[0] * ks[1] * ks[2]
+            bound = 1 / fan_in**0.5
+            nn.init.uniform_(self.bias, -bound, bound)
+        self._packed = False
+
+    def _pack_weights(self):
+        w = self.weight.data
+        C_out, C_in, KD, KH, KW = w.shape
+        self.w_fused = (
+            w.permute(2, 3, 4, 1, 0)
+            .reshape(KD * KH * KW * C_in, C_out)
+            .contiguous()
+            .to(dtype=torch.float16)
+        )
+        self._KD, self._KH, self._KW = KD, KH, KW
+        self._C_in = C_in
+        self._C_out = C_out
+        self._K_fused = KD * KH * KW * C_in
+        self._packed = True
+
+    def forward(self, x):
+        if not self._packed:
+            self._pack_weights()
+
+        B, C_in, D_in, H_in, W_in = x.shape
+        KD, KH, KW = self._KD, self._KH, self._KW
+        C_out = self._C_out
+        OD = D_in - KD + 1
+        OH = H_in - KH + 1
+        OW = W_in - KW + 1
+
+        x_fp16 = x.to(dtype=torch.float16)
+        x_cl = x_fp16.contiguous(memory_format=torch.channels_last_3d)
+        y = torch.empty(
+            B,
+            C_out,
+            OD,
+            OH,
+            OW,
+            device=x.device,
+            dtype=torch.float16,
+            memory_format=torch.channels_last_3d,
+        )
+
+        sx = x_cl.stride()
+        sy = y.stride()
+
+        M_total = B * OD * OH * OW
+        grid = lambda meta: (
+            triton.cdiv(M_total, meta["BLOCK_M"]) * triton.cdiv(C_out, meta["BLOCK_N"]),
+        )
+
+        _conv3d_fused_k[grid](
+            x_cl,
+            self.w_fused,
+            y,
+            B,
+            C_out,
+            OD,
+            OH,
+            OW,
+            sx[0],
+            sx[2],
+            sx[3],
+            sx[4],
+            sy[0],
+            sy[2],
+            sy[3],
+            sy[4],
+            KD=KD,
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+            K_FUSED=self._K_fused,
+        )
+
+        if self.bias is not None:
+            y = y + self.bias.view(1, -1, 1, 1, 1)
+        return y
+
+
+batch_size = 16
+in_channels = 3
+out_channels = 64
+kernel_size = 3  # Model builds a (3, 3, 1) kernel from this
+height = 256
+width = 256
+depth = 10
+
+
+def get_inputs():  # presumably the forward() arguments used by the harness
+    x = torch.rand(batch_size, in_channels, height, width, depth)  # read as D, H, W
+    return [x]
+
+
+def get_init_inputs():  # matches Model's first three constructor parameters
+    return [in_channels, out_channels, kernel_size]
diff --git a/backends/triton/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py b/backends/triton/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py
new file mode 100644
index 0000000..fe569a8
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py
@@ -0,0 +1,63 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024 * 2}, num_warps=4, num_stages=2),
+    ],
+    key=["n_elements"],
+)
+@triton.jit
+def scalar_mul_kernel(
+    input_ptr,
+    output_ptr,
+    scalar,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+    x = tl.load(input_ptr + offsets, mask=mask)
+    result = x * scalar
+    tl.store(output_ptr + offsets, result.to(tl.bfloat16), mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A: torch.Tensor, s) -> torch.Tensor:
+        A = A.contiguous()
+        output = torch.empty_like(A)
+        n_elements = A.numel()
+        if isinstance(s, torch.Tensor):
+            scalar_val = s.item()
+        else:
+            scalar_val = float(s)
+        grid = lambda META: (triton.cdiv(n_elements, META["BLOCK_SIZE"]),)
+        scalar_mul_kernel[grid](A, output, scalar_val, n_elements)
+        return output
+
+
+M = 16384 * 4  # 65536 rows
+N = 4096 * 4  # 16384 columns; M * N = 2**30 elements
+
+
+def get_inputs():  # presumably the forward() arguments used by the harness
+    A = torch.rand(M, N, dtype=torch.bfloat16)
+    s = 3.14  # plain Python float; forward() also accepts a tensor scalar
+    return [A, s]
+
+
+def get_init_inputs():  # Model() takes no constructor arguments
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py
new file mode 100644
index 0000000..529f8ee
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py
@@ -0,0 +1,252 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def _to_triple(x):
+    if isinstance(x, int):
+        return (x, x, x)
+    return tuple(x)
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {
+                "BLOCK_M": 64,
+                "BLOCK_N": 64,
+                "BLOCK_K": 16,
+                "GROUP_M": 8,
+                "grf_mode": "256",
+            },
+            num_warps=4,
+            num_stages=2,
+        ),
+    ],
+    key=["C_out", "K_FUSED"],
+)
+@triton.jit
+def _conv3d_fused_k(
+    x_ptr,  # input; channel stride is taken to be 1 (c_off is added unscaled)
+    w_ptr,  # weights, read as contiguous (K_FUSED, C_out)
+    y_ptr,  # output; channel stride taken to be 1, values stored as float16
+    N_batch,
+    C_out,
+    OD,
+    OH,
+    OW,
+    sxn,  # x strides for batch and the three spatial dims
+    sxd,
+    sxh,
+    sxw,
+    syn,  # y strides for batch and the three spatial dims
+    syd,
+    syh,
+    syw,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    KD: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+    K_FUSED: tl.constexpr,  # reduction length; the caller passes KD * KH * KW * C_IN
+    grf_mode: tl.constexpr,  # not referenced in the body
+):
+    pid = tl.program_id(0)
+    M_total = N_batch * OD * OH * OW  # GEMM rows: one per output position
+    num_pid_m = tl.cdiv(M_total, BLOCK_M)
+    num_pid_n = tl.cdiv(C_out, BLOCK_N)
+    num_pid_in_group = GROUP_M * num_pid_n  # programs per group of GROUP_M row tiles
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_M)  # short last group
+    pid_in_group = pid % num_pid_in_group
+    pid_m = first_pid_m + (pid_in_group % group_size_m)  # row tile varies fastest
+    pid_n = pid_in_group // group_size_m
+
+    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_k = tl.arange(0, BLOCK_K)
+    # Unflatten each row index into (batch, od, oh, ow).
+    ohw = OH * OW
+    odhw = OD * ohw
+    n_idx = offs_m // odhw
+    rem = offs_m % odhw
+    od_idx = rem // ohw
+    rem2 = rem % ohw
+    oh_idx = rem2 // OW
+    ow_idx = rem2 % OW
+
+    mask_m = offs_m < M_total  # rows past the end of the last row tile
+    # Per-row address of channel 0 at input position (od, oh, ow).
+    x_spatial_base = (
+        x_ptr + n_idx.to(tl.int64) * sxn + od_idx * sxd + oh_idx * sxh + ow_idx * sxw
+    )
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+
+    w_bp = tl.make_block_ptr(
+        base=w_ptr,
+        shape=(K_FUSED, C_out),
+        strides=(C_out, 1),
+        offsets=(0, pid_n * BLOCK_N),
+        block_shape=(BLOCK_K, BLOCK_N),
+        order=(1, 0),
+    )
+    # K is the fused (kd, kh, kw, c) axis with the channel varying fastest.
+    for k0 in range(0, K_FUSED, BLOCK_K):
+        k_idx = k0 + offs_k
+        c_off = k_idx % C_IN
+        spatial_idx = k_idx // C_IN
+        kw_off = spatial_idx % KW
+        kh_off = (spatial_idx // KW) % KH
+        kd_off = spatial_idx // (KW * KH)
+        k_input_off = kd_off * sxd + kh_off * sxh + kw_off * sxw + c_off
+        # Gather the (BLOCK_M, BLOCK_K) input tile; masked-off entries read as 0.
+        x_tile = tl.load(
+            x_spatial_base[:, None] + k_input_off[None, :],
+            mask=mask_m[:, None] & (k_idx[None, :] < K_FUSED),
+            other=0.0,
+        )
+        w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+        acc = tl.dot(x_tile, w_tile, acc)
+        w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    mask_n = offs_n < C_out
+    y_ptrs = (
+        y_ptr + n_idx.to(tl.int64) * syn + od_idx * syd + oh_idx * syh + ow_idx * syw
+    )
+    tl.store(
+        y_ptrs[:, None] + offs_n[None, :],
+        acc.to(tl.float16),
+        mask=mask_m[:, None] & mask_n[None, :],
+    )
+
+
+class Model(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=False,
+    ):
+        super(Model, self).__init__()
+        ks = _to_triple(kernel_size)
+        self.weight = nn.Parameter(torch.empty(out_channels, in_channels, *ks))
+        if bias:
+            self.bias = nn.Parameter(torch.empty(out_channels))
+        else:
+            self.bias = None
+        nn.init.kaiming_uniform_(self.weight, a=5**0.5)
+        if self.bias is not None:
+            fan_in = in_channels * ks[0] * ks[1] * ks[2]
+            bound = 1 / fan_in**0.5
+            nn.init.uniform_(self.bias, -bound, bound)
+        self._packed = False
+
+    def _pack_weights(self):
+        w = self.weight.data
+        C_out, C_in, KD, KH, KW = w.shape
+        self.w_fused = (
+            w.permute(2, 3, 4, 1, 0)
+            .reshape(KD * KH * KW * C_in, C_out)
+            .contiguous()
+            .to(dtype=torch.float16)
+        )
+        self._KD, self._KH, self._KW = KD, KH, KW
+        self._C_in = C_in
+        self._C_out = C_out
+        self._K_fused = KD * KH * KW * C_in
+        self._packed = True
+
+    def forward(self, x):
+        if not self._packed:
+            self._pack_weights()
+
+        B, C_in, D_in, H_in, W_in = x.shape
+        KD, KH, KW = self._KD, self._KH, self._KW
+        C_out = self._C_out
+        OD = D_in - KD + 1
+        OH = H_in - KH + 1
+        OW = W_in - KW + 1
+
+        x_fp16 = x.to(dtype=torch.float16)
+        x_cl = x_fp16.contiguous(memory_format=torch.channels_last_3d)
+        y = torch.empty(
+            B,
+            C_out,
+            OD,
+            OH,
+            OW,
+            device=x.device,
+            dtype=torch.float16,
+            memory_format=torch.channels_last_3d,
+        )
+
+        sx = x_cl.stride()
+        sy = y.stride()
+
+        M_total = B * OD * OH * OW
+        grid = lambda meta: (
+            triton.cdiv(M_total, meta["BLOCK_M"]) * triton.cdiv(C_out, meta["BLOCK_N"]),
+        )
+
+        _conv3d_fused_k[grid](
+            x_cl,
+            self.w_fused,
+            y,
+            B,
+            C_out,
+            OD,
+            OH,
+            OW,
+            sx[0],
+            sx[2],
+            sx[3],
+            sx[4],
+            sy[0],
+            sy[2],
+            sy[3],
+            sy[4],
+            KD=KD,
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+            K_FUSED=self._K_fused,
+        )
+
+        if self.bias is not None:
+            y = y + self.bias.view(1, -1, 1, 1, 1)
+        return y
+
+
+batch_size = 16
+in_channels = 3
+out_channels = 64
+kernel_size = (3, 5, 7)  # becomes the weight's (KD, KH, KW) dims
+width = 64
+height = 64
+depth = 64
+
+
+def get_inputs():  # presumably the forward() arguments used by the harness
+    x = torch.rand(batch_size, in_channels, width, height, depth)  # read as D, H, W
+    return [x]
+
+
+def get_init_inputs():  # matches Model's first three constructor parameters
+    return [in_channels, out_channels, kernel_size]
diff --git a/backends/triton/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py
new file mode 100644
index 0000000..d7a5e6d
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py
@@ -0,0 +1,243 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_OW": 64, "BLOCK_N": 64, "BLOCK_K": 16, "grf_mode": "256"},
+            num_warps=4,
+            num_stages=3,
+        ),
+    ],
+    key=["D", "H", "W", "C_IN", "C_OUT", "OD", "OH", "OW"],
+)
+@triton.jit
+def _conv3d_spatial_tiled(  # stride-1 3D conv, zero-padded by PAD, channels innermost
+    x_ptr,  # input; rows along W are C_IN elements apart, channels are unit-stride
+    w_ptr,  # weights indexed [kd, kh, kw, ci, co] through the sw_* strides
+    y_ptr,  # output; rows along OW are C_OUT elements apart, channels are unit-stride
+    N_batch,  # not read in the body; the batch index comes from program_id(0)
+    D,
+    H,
+    W,
+    OD,
+    OH,
+    OW,
+    sx_n,
+    sx_d,
+    sx_h,
+    sw_kd,
+    sw_kh,
+    sw_kw,
+    sw_ci,
+    sw_co,
+    sy_n,
+    sy_d,
+    sy_h,
+    BLOCK_OW: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    KD: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    PAD: tl.constexpr,  # implicit zero padding on each side of D, H and W
+    C_IN: tl.constexpr,
+    C_OUT: tl.constexpr,
+    grf_mode: tl.constexpr,  # not read in the body; set by the autotune config
+):
+    n = tl.program_id(0)
+    pid_dh = tl.program_id(1)
+    pid_wn = tl.program_id(2)
+    # Axis 1 enumerates (od, oh); axis 2 packs (C_OUT tile, OW tile), OW tile fastest.
+    od = pid_dh // OH
+    oh = pid_dh % OH
+
+    num_ow_tiles = tl.cdiv(OW, BLOCK_OW)
+    pid_ow = pid_wn % num_ow_tiles
+    pid_n = pid_wn // num_ow_tiles
+    ow0 = pid_ow * BLOCK_OW
+
+    acc = tl.zeros((BLOCK_OW, BLOCK_N), dtype=tl.float32)
+
+    x_n_base = x_ptr + n.to(tl.int64) * sx_n
+    # Taps whose input depth or row lies outside the tensor are skipped entirely.
+    for kd in range(KD):
+        d_in = od + kd - PAD
+        d_ok = (d_in >= 0) & (d_in < D)
+        if d_ok:
+            for kh in range(KH):
+                h_in = oh + kh - PAD
+                h_ok = (h_in >= 0) & (h_in < H)
+                if h_ok:
+                    x_dh_base = x_n_base + d_in * sx_d + h_in * sx_h
+                    # Columns outside [0, W) rely on the checked load's zero fill.
+                    for kw in range(KW):
+                        w_start = ow0 + kw - PAD
+
+                        x_bp = tl.make_block_ptr(
+                            base=x_dh_base,
+                            shape=(W, C_IN),
+                            strides=(C_IN, 1),
+                            offsets=(w_start, 0),
+                            block_shape=(BLOCK_OW, BLOCK_K),
+                            order=(1, 0),
+                        )
+
+                        w_bp = tl.make_block_ptr(
+                            base=w_ptr + kd * sw_kd + kh * sw_kh + kw * sw_kw,
+                            shape=(C_IN, C_OUT),
+                            strides=(sw_ci, sw_co),
+                            offsets=(0, pid_n * BLOCK_N),
+                            block_shape=(BLOCK_K, BLOCK_N),
+                            order=(1, 0),
+                        )
+
+                        for c0 in range(0, C_IN, BLOCK_K):
+                            x_tile = tl.load(
+                                x_bp, boundary_check=(0, 1), padding_option="zero"
+                            )
+                            w_tile = tl.load(
+                                w_bp, boundary_check=(0, 1), padding_option="zero"
+                            )
+                            acc = tl.dot(x_tile, w_tile, acc)
+                            x_bp = tl.advance(x_bp, (0, BLOCK_K))
+                            w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+    # Store the tile at column ow0 of output row (n, od, oh).
+    y_dh_base = y_ptr + n.to(tl.int64) * sy_n + od * sy_d + oh * sy_h
+    # The view spans the full row (OW), so columns >= OW of the last tile are dropped.
+    y_bp = tl.make_block_ptr(
+        base=y_dh_base,
+        shape=(OW, C_OUT),
+        strides=(C_OUT, 1),
+        offsets=(ow0, pid_n * BLOCK_N),
+        block_shape=(BLOCK_OW, BLOCK_N),
+        order=(1, 0),
+    )
+    tl.store(y_bp, acc.to(tl.float16), boundary_check=(0, 1))
+
+
+class Model(nn.Module):  # ConvTranspose3d forward computed by _conv3d_spatial_tiled
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        output_padding: int = 0,
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose3d = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size=(kernel_size, kernel_size, kernel_size),
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+        self._packed = False  # set once _pack_weights() has built self._w_dhwio
+        self._ks = kernel_size
+        self._pad = padding
+
+    def _pack_weights(self):  # one-time fp16 repack of the reference module's weights
+        w = self.conv_transpose3d.weight.data
+        self._w_dhwio = (  # taps flipped on dims 2-4, then laid out (kd, kh, kw, dim0, dim1)
+            w.flip(2, 3, 4).permute(2, 3, 4, 0, 1).to(dtype=torch.float16).contiguous()
+        )
+        self._packed = True  # never reset here, so later weight changes are not repacked
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:  # result is float16
+        if not self._packed:
+            self._pack_weights()
+        if x.dtype != torch.float16:
+            x = x.to(dtype=torch.float16)
+        # NOTE(review): stride, output_padding, groups and bias are not read below.
+        x = x.to(memory_format=torch.channels_last_3d)
+        x_ndhwc = x.permute(0, 2, 3, 4, 1)
+
+        N_b, C_in, D, H, W = x.shape
+        C_out = self._w_dhwio.shape[4]
+        KS = self._ks
+        PAD = KS - 1 - self._pad  # handed to the kernel as its per-side zero padding
+        # Output extent of a stride-1 convolution over an input padded by PAD per side.
+        OD = D + 2 * PAD - KS + 1
+        OH = H + 2 * PAD - KS + 1
+        OW = W + 2 * PAD - KS + 1
+
+        conv_out = torch.empty(
+            (N_b, C_out, OD, OH, OW),
+            device=x.device,
+            dtype=torch.float16,
+            memory_format=torch.channels_last_3d,
+        )
+        conv_ndhwc = conv_out.permute(0, 2, 3, 4, 1)
+
+        def conv_grid(meta):  # (batch, OD*OH rows, OW tiles * C_out tiles)
+            return (
+                N_b,
+                OD * OH,
+                triton.cdiv(OW, meta["BLOCK_OW"]) * triton.cdiv(C_out, meta["BLOCK_N"]),
+            )
+
+        _conv3d_spatial_tiled[conv_grid](
+            x_ndhwc,
+            self._w_dhwio,
+            conv_ndhwc,
+            N_b,
+            D,
+            H,
+            W,
+            OD,
+            OH,
+            OW,
+            x_ndhwc.stride(0),
+            x_ndhwc.stride(1),
+            x_ndhwc.stride(2),
+            self._w_dhwio.stride(0),
+            self._w_dhwio.stride(1),
+            self._w_dhwio.stride(2),
+            self._w_dhwio.stride(3),
+            self._w_dhwio.stride(4),
+            conv_ndhwc.stride(0),
+            conv_ndhwc.stride(1),
+            conv_ndhwc.stride(2),
+            KD=KS,
+            KH=KS,
+            KW=KS,
+            PAD=PAD,
+            C_IN=C_in,
+            C_OUT=C_out,
+        )
+        # conv_out shares storage with the conv_ndhwc view the kernel wrote into.
+        return conv_out
+
+
+batch_size = 8  # benchmark problem size consumed by the two helpers below
+in_channels = 48
+out_channels = 48
+kernel_size = 3  # a single int; Model.__init__ expands it to a cubic kernel
+depth = 64
+height = 64
+width = 64
+
+
+def get_inputs():  # one (batch, channels, depth, height, width) uniform [0, 1) tensor
+    x = torch.rand(batch_size, in_channels, depth, height, width)
+    return [x]
+
+
+def get_init_inputs():  # matches Model.__init__'s first three parameters
+    return [in_channels, out_channels, kernel_size]
diff --git a/backends/triton/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py
new file mode 100644
index 0000000..0c35718
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py
@@ -0,0 +1,205 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_OW": 64, "BLOCK_N": 64, "BLOCK_K": 64, "grf_mode": "128"},
+            num_warps=4,
+            num_stages=2,
+        ),
+    ],
+    key=["H", "W", "C_IN", "C_out", "OH", "OW"],
+)
+@triton.jit
+def _conv2d_spatial_tiled(  # stride-1 2D conv; x and y are flat (row, channel) matrices
+    x_ptr,  # input rows ordered (n, h, w), C_IN unit-stride channels per row
+    w_ptr,  # weights indexed [kh, kw, ci, co] through the stride_w* arguments
+    y_ptr,  # output rows ordered (n, oh, ow), C_out unit-stride channels per row
+    N_batch,  # not read in the body; the batch index comes from program_id(0)
+    H,
+    W,
+    C_out,
+    OH,
+    OW,
+    stride_wkh,
+    stride_wkw,
+    stride_wci,
+    stride_wco,
+    BLOCK_OW: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+    grf_mode: tl.constexpr,  # not read in the body; set by the autotune config
+):
+    n = tl.program_id(0)
+    oh = tl.program_id(1)
+    pid_ow_n = tl.program_id(2)
+    # Axis 2 packs (C_out tile, OW tile), with the OW tile varying fastest.
+    num_ow_tiles = tl.cdiv(OW, BLOCK_OW)
+    pid_ow = pid_ow_n % num_ow_tiles
+    pid_n = pid_ow_n // num_ow_tiles
+
+    ow0 = pid_ow * BLOCK_OW
+    HW = H * W
+    OHOW = OH * OW
+
+    acc = tl.zeros((BLOCK_OW, BLOCK_N), dtype=tl.float32)
+
+    for kh in range(KH):
+        for kw in range(KW):
+            x_row_start = n * HW + (oh + kh) * W + (ow0 + kw)
+            x_valid_rows = W - (ow0 + kw)
+            # The row bound ends with this input row, so reads past it load as zero.
+            x_bp = tl.make_block_ptr(
+                base=x_ptr,
+                shape=(x_row_start + x_valid_rows, C_IN),
+                strides=(C_IN, 1),
+                offsets=(x_row_start, 0),
+                block_shape=(BLOCK_OW, BLOCK_K),
+                order=(1, 0),
+            )
+
+            w_bp = tl.make_block_ptr(
+                base=w_ptr + kh * stride_wkh + kw * stride_wkw,
+                shape=(C_IN, C_out),
+                strides=(stride_wci, stride_wco),
+                offsets=(0, pid_n * BLOCK_N),
+                block_shape=(BLOCK_K, BLOCK_N),
+                order=(1, 0),
+            )
+
+            for c0 in range(0, C_IN, BLOCK_K):
+                x_tile = tl.load(x_bp, boundary_check=(0, 1), padding_option="zero")
+                w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+                acc = tl.dot(x_tile, w_tile, acc, input_precision="ieee")
+                x_bp = tl.advance(x_bp, (0, BLOCK_K))
+                w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+    # Same bound for the store: tile rows beyond this output row are not written.
+    y_row_start = n * OHOW + oh * OW + ow0
+    y_valid_rows = OW - ow0
+    y_bp = tl.make_block_ptr(
+        base=y_ptr,
+        shape=(y_row_start + y_valid_rows, C_out),
+        strides=(C_out, 1),
+        offsets=(y_row_start, pid_n * BLOCK_N),
+        block_shape=(BLOCK_OW, BLOCK_N),
+        order=(1, 0),
+    )
+    tl.store(y_bp, acc.to(tl.float16), boundary_check=(0, 1))
+
+
+class Model(nn.Module):  # Conv2d forward computed by _conv2d_spatial_tiled in fp16
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: tuple,
+        stride: tuple = (1, 1),
+        padding: tuple = (0, 0),
+        dilation: tuple = (1, 1),
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv2d = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+        self._w_hwio = None  # fp16 weights in (KH, KW, dim1, dim0) order, built lazily
+        self._moved = False
+
+    def _move_params_once(self):  # builds self._w_hwio; _moved is never reset afterwards
+        w = self.conv2d.weight.data
+        self._w_hwio = w.permute(2, 3, 1, 0).contiguous()
+        if self._w_hwio.dtype != torch.float16:
+            self._w_hwio = self._w_hwio.to(dtype=torch.float16)
+        self._moved = True
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:  # result has x's original dtype
+        if not self._moved:
+            self._move_params_once()
+        # NOTE(review): stride, padding, dilation, groups and bias are not read below.
+        input_dtype = x.dtype
+        if x.dtype != torch.float16:
+            x = x.to(dtype=torch.float16)
+
+        x_cl = x.contiguous(memory_format=torch.channels_last)
+        x_nhwc = x_cl.permute(0, 2, 3, 1)
+
+        N, C_in, H, W = x_cl.shape
+        KH, KW, _, C_out = self._w_hwio.shape
+        OH = H - KH + 1  # unpadded, stride-1 output extent
+        OW = W - KW + 1
+
+        y = torch.empty(
+            (N, C_out, OH, OW),
+            device=x.device,
+            dtype=torch.float16,
+            memory_format=torch.channels_last,
+        )
+        y_nhwc = y.permute(0, 2, 3, 1)
+
+        w = self._w_hwio
+
+        def grid(meta):  # (batch, output rows, OW tiles * C_out tiles)
+            num_ow = triton.cdiv(OW, meta["BLOCK_OW"])
+            num_n = triton.cdiv(C_out, meta["BLOCK_N"])
+            return (N, OH, num_ow * num_n)
+
+        _conv2d_spatial_tiled[grid](
+            x_nhwc,
+            w,
+            y_nhwc,
+            N,
+            H,
+            W,
+            C_out,
+            OH,
+            OW,
+            w.stride(0),
+            w.stride(1),
+            w.stride(2),
+            w.stride(3),
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+        )
+        # The kernel writes fp16; convert back when the caller passed another dtype.
+        if input_dtype != torch.float16:
+            return y.to(dtype=input_dtype)
+        return y
+
+
+batch_size = 8  # benchmark problem size consumed by the two helpers below
+in_channels = 64
+out_channels = 128
+kernel_size = (5, 7)  # (KH, KW) as unpacked from the packed weight in Model.forward
+height = 512
+width = 256
+
+
+def get_inputs():  # one (batch, channels, height, width) uniform [0, 1) tensor
+    x = torch.rand(batch_size, in_channels, height, width)
+    return [x]
+
+
+def get_init_inputs():  # matches Model.__init__'s first three parameters
+    return [in_channels, out_channels, kernel_size]
diff --git a/backends/triton/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py
new file mode 100644
index 0000000..760a8fa
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py
@@ -0,0 +1,201 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 16, "grf_mode": "128"},
+            num_warps=4,
+            num_stages=2,
+        ),
+    ],
+    key=["M_total", "C_out", "C_IN"],
+)
+@triton.jit
+def _conv2d_flat_kernel(  # stride-1 2D conv as a tiled matmul over flat (n, oh, ow) rows
+    x_ptr,  # input addressed via stride_x{n,h,w}; channels are read with unit stride
+    w_ptr,  # weights indexed [kh, kw, ci, co] through the stride_w* arguments
+    y_ptr,  # output viewed as an (M_total, C_out) row-major matrix
+    M_total,
+    C_out,
+    OH,
+    OW,
+    stride_xn,
+    stride_xh,
+    stride_xw,
+    stride_wkh,
+    stride_wkw,
+    stride_wci,
+    stride_wco,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+    grf_mode: tl.constexpr,  # not read in the body; set by the autotune config
+):
+    pid_m = tl.program_id(0)
+    pid_n = tl.program_id(1)
+
+    nhw = OH * OW
+    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_k = tl.arange(0, BLOCK_K)
+    # Split each flat output-row index into (n, oh, ow).
+    n_idx = offs_m // nhw
+    rem = offs_m % nhw
+    oh_idx = rem // OW
+    ow_idx = rem % OW
+
+    mask_m = offs_m < M_total
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    # Address of channel 0 at tap (0, 0) for every row of the tile.
+    x_base = x_ptr + n_idx * stride_xn + oh_idx * stride_xh + ow_idx * stride_xw
+
+    for kh in range(KH):
+        for kw in range(KW):
+            x_kh_kw = x_base + kh * stride_xh + kw * stride_xw
+
+            w_bp = tl.make_block_ptr(
+                base=w_ptr + kh * stride_wkh + kw * stride_wkw,
+                shape=(C_IN, C_out),
+                strides=(stride_wci, stride_wco),
+                offsets=(0, pid_n * BLOCK_N),
+                block_shape=(BLOCK_K, BLOCK_N),
+                order=(1, 0),
+            )
+
+            for c0 in range(0, C_IN, BLOCK_K):
+                k_idx = c0 + offs_k
+                # Rows past M_total and channels past C_IN contribute zeros.
+                x_tile = tl.load(
+                    x_kh_kw[:, None] + k_idx[None, :],
+                    mask=mask_m[:, None] & (k_idx[None, :] < C_IN),
+                    other=0.0,
+                )
+
+                w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+                acc = tl.dot(x_tile, w_tile, acc)
+                w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+    # The checked store drops tile rows >= M_total and columns >= C_out.
+    y_bp = tl.make_block_ptr(
+        base=y_ptr,
+        shape=(M_total, C_out),
+        strides=(C_out, 1),
+        offsets=(pid_m * BLOCK_M, pid_n * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    tl.store(y_bp, acc.to(tl.float16), boundary_check=(0, 1))
+
+
+class Model(nn.Module):  # Conv2d forward computed by _conv2d_flat_kernel in fp16
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=False,
+    ):
+        super(Model, self).__init__()
+        self.conv2d = nn.Conv2d(
+            in_channels,
+            out_channels,
+            (kernel_size, kernel_size),
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+        self._packed = False
+        self._y_buf = None  # legacy attribute; forward() no longer caches its output
+
+    def _pack_weights(self):  # one-time fp16 repack to (KH, KW, dim1, dim0) order
+        w = self.conv2d.weight.data
+        self.w_hwio = w.to(dtype=torch.float16).permute(2, 3, 1, 0).contiguous()
+        self._packed = True
+
+    def forward(self, x):  # returns a freshly allocated fp16 channels_last tensor
+        if not self._packed:
+            self._pack_weights()
+        # NOTE(review): stride, padding, dilation, groups and bias are not applied here.
+        x = x.to(dtype=torch.float16)
+
+        N_batch, C_in, H, W_dim = x.shape
+        KH, KW = self.conv2d.kernel_size
+        C_out = self.conv2d.out_channels
+        OH = H - KH + 1
+        OW = W_dim - KW + 1
+        M_total = N_batch * OH * OW
+
+        x_cl = x.contiguous(memory_format=torch.channels_last)
+        x_nhwc = x_cl.permute(0, 2, 3, 1)
+
+        # Allocate the output per call.  Handing out one cached buffer made every
+        # earlier result alias the latest one (and pinned the first call's device).
+        y = torch.empty(
+            (N_batch, C_out, OH, OW),
+            device=x.device,
+            dtype=torch.float16,
+            memory_format=torch.channels_last,
+        )
+        y_nhwc = y.permute(0, 2, 3, 1)
+
+        grid = lambda META: (
+            triton.cdiv(M_total, META["BLOCK_M"]),
+            triton.cdiv(C_out, META["BLOCK_N"]),
+        )
+
+        _conv2d_flat_kernel[grid](
+            x_nhwc,
+            self.w_hwio,
+            y_nhwc,
+            M_total,
+            C_out,
+            OH,
+            OW,
+            x_nhwc.stride(0),
+            x_nhwc.stride(1),
+            x_nhwc.stride(2),
+            self.w_hwio.stride(0),
+            self.w_hwio.stride(1),
+            self.w_hwio.stride(2),
+            self.w_hwio.stride(3),
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+        )
+
+        return y
+
+
+batch_size = 16  # benchmark problem size consumed by the two helpers below
+in_channels = 16
+out_channels = 128
+kernel_size = 3  # a single int; Model.__init__ expands it to a square kernel
+width = 1024
+height = 1024
+
+
+def get_inputs():  # one (batch, channels, height, width) uniform [0, 1) tensor
+    x = torch.rand(batch_size, in_channels, height, width)
+    return [x]
+
+
+def get_init_inputs():  # matches Model.__init__'s first three parameters
+    return [in_channels, out_channels, kernel_size]
diff --git a/backends/triton/cpu/KernelBench/level1/64_conv_transposed_1D.py b/backends/triton/cpu/KernelBench/level1/64_conv_transposed_1D.py
new file mode 100644
index 0000000..c9f1851
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/64_conv_transposed_1D.py
@@ -0,0 +1,195 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16, "GROUP_M": 8},
+            num_warps=4,
+            num_stages=2,
+        ),
+    ],
+    key=["C_IN", "C_out", "OL"],
+)
+@triton.jit
+def _conv_transpose1d_gemm(  # stride-1 transposed 1D conv over flat (n, ol) output rows
+    x_ptr,  # input addressed via sxn / sxl; channels are read with unit stride
+    w_ptr,  # weights indexed [k, ci, co] through swk / swci / swco
+    y_ptr,  # output addressed via syn / syl; channels are written with unit stride
+    N_batch,
+    L_in,
+    C_out,
+    OL,
+    sxn,
+    sxl,
+    swk,
+    swci,
+    swco,
+    syn,
+    syl,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    K_SIZE: tl.constexpr,
+    C_IN: tl.constexpr,
+):
+    pid = tl.program_id(0)  # 1-D launch; remapped to (pid_m, pid_n) in GROUP_M-row groups
+    M_total = N_batch * OL
+    num_pid_m = tl.cdiv(M_total, BLOCK_M)
+    num_pid_n = tl.cdiv(C_out, BLOCK_N)
+    num_pid_in_group = GROUP_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_M)
+    pid_in_group = pid % num_pid_in_group
+    pid_m = first_pid_m + (pid_in_group % group_size_m)
+    pid_n = pid_in_group // group_size_m
+
+    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    offs_k = tl.arange(0, BLOCK_K)
+    # Split each flat output-row index into (batch, output position).
+    n_idx = offs_m // OL
+    ol_idx = offs_m % OL
+
+    mask_m = offs_m < M_total
+    mask_n = offs_n < C_out
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    x_base = x_ptr + n_idx * sxn
+    # Output position ol accumulates x[ol - k] @ w[k] over every tap k.
+    for k in range(K_SIZE):
+        il = ol_idx - k
+        valid = (il >= 0) & (il < L_in) & mask_m  # taps reaching outside x add zero
+        x_ptrs = x_base + il * sxl
+
+        w_bp = tl.make_block_ptr(
+            base=w_ptr + k * swk,
+            shape=(C_IN, C_out),
+            strides=(swci, swco),
+            offsets=(0, pid_n * BLOCK_N),
+            block_shape=(BLOCK_K, BLOCK_N),
+            order=(1, 0),
+        )
+
+        for c0 in range(0, C_IN, BLOCK_K):
+            k_idx = c0 + offs_k
+            x_tile = tl.load(
+                x_ptrs[:, None] + k_idx[None, :],
+                mask=valid[:, None] & (k_idx[None, :] < C_IN),
+                other=0.0,
+            )
+            w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+            acc = tl.dot(x_tile, w_tile, acc)
+            w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+    # The accumulator is cast to y's element type and stored with a row/column mask.
+    y_ptrs = y_ptr + n_idx * syn + ol_idx * syl
+    tl.store(
+        y_ptrs[:, None] + offs_n[None, :],
+        acc.to(y_ptr.dtype.element_ty),
+        mask=mask_m[:, None] & mask_n[None, :],
+    )
+
+
+class Model(nn.Module):  # ConvTranspose1d forward computed by _conv_transpose1d_gemm
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        groups=1,
+        bias=False,
+    ):
+        super(Model, self).__init__()
+        self.conv1d_transpose = nn.ConvTranspose1d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+        self._packed = False  # set once _pack_weights() has run
+
+    def _pack_weights(self):  # caches a (K, dim0, dim1) copy of the weight; never refreshed
+        self._w_kio = self.conv1d_transpose.weight.permute(2, 0, 1).contiguous()
+        self._K = self.conv1d_transpose.weight.shape[2]
+        self._C_out = self.conv1d_transpose.weight.shape[1]
+        self._packed = True
+
+    def forward(self, x):  # returns a contiguous (B, C_out, OL) tensor in x's dtype
+        if not self._packed:
+            self._pack_weights()
+        # NOTE(review): stride, padding, output_padding and groups are not read below.
+        B, C_in, L_in = x.shape
+        K = self._K
+        C_out = self._C_out
+        OL = L_in + K - 1
+        # Length-major copies so the kernel can index channels with unit stride.
+        x_nlc = x.permute(0, 2, 1).contiguous()
+        w_kio = self._w_kio
+        y_nlc = torch.empty((B, OL, C_out), device=x.device, dtype=x.dtype)
+
+        sxn, sxl, _ = x_nlc.stride()
+        swk, swci, swco = w_kio.stride()
+        syn, syl, _ = y_nlc.stride()
+
+        M_total = B * OL
+        grid = lambda meta: (
+            triton.cdiv(M_total, meta["BLOCK_M"]) * triton.cdiv(C_out, meta["BLOCK_N"]),
+        )
+
+        _conv_transpose1d_gemm[grid](
+            x_nlc,
+            w_kio,
+            y_nlc,
+            B,
+            L_in,
+            C_out,
+            OL,
+            sxn,
+            sxl,
+            swk,
+            swci,
+            swco,
+            syn,
+            syl,
+            K_SIZE=K,
+            C_IN=C_in,
+        )
+        # Back to channel-major; the bias, when present, is added outside the kernel.
+        result = y_nlc.permute(0, 2, 1).contiguous()
+        if self.conv1d_transpose.bias is not None:
+            result = result + self.conv1d_transpose.bias.view(1, -1, 1)
+        return result
+
+
+batch_size = 64  # benchmark problem size consumed by the two helpers below
+in_channels = 128
+out_channels = 128
+kernel_size = 3
+length = 65536
+
+
+def get_inputs():  # one (batch, channels, length) uniform [0, 1) tensor
+    x = torch.rand(batch_size, in_channels, length)
+    return [x]
+
+
+def get_init_inputs():  # matches Model.__init__'s first three parameters
+    return [in_channels, out_channels, kernel_size]
diff --git a/backends/triton/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py
new file mode 100644
index 0000000..81d1992
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py
@@ -0,0 +1,229 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {
+                "BLOCK_OW": 64,
+                "BLOCK_N": 64,
+                "BLOCK_K": 64,
+                "GROUP_SIZE_M": 8,
+                "grf_mode": "256",
+            },
+            num_warps=4,
+            num_stages=2,
+        ),
+    ],
+    key=["H", "W", "C_IN", "C_out", "OH", "OW"],
+)
+@triton.jit
+def _conv_transpose2d_swizzled_v2(  # stride-1 2D conv over flat (row, channel) matrices
+    x_ptr,  # input rows ordered (n, h, w), C_IN unit-stride channels per row
+    w_ptr,  # weights indexed [kh, kw, ci, co] through the stride_w* arguments
+    y_ptr,  # output rows ordered (n, oh, ow), C_out unit-stride channels per row
+    N_batch,  # not read in the body; the batch index comes from program_id(1)
+    H,
+    W,
+    C_out,
+    OH,
+    OW,
+    stride_wkh,
+    stride_wkw,
+    stride_wci,
+    stride_wco,
+    BLOCK_OW: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    grf_mode: tl.constexpr,  # not read in the body; set by the autotune config
+):
+    n = tl.program_id(1)
+    pid = tl.program_id(0)
+    # Axis 0 is remapped in groups of GROUP_SIZE_M (oh, OW-tile) rows; axis 1 is batch.
+    num_ow_tiles = tl.cdiv(OW, BLOCK_OW)
+    num_n_tiles = tl.cdiv(C_out, BLOCK_N)
+    num_m_tiles = OH * num_ow_tiles
+
+    group_id = pid // (GROUP_SIZE_M * num_n_tiles)
+    first_m_in_group = group_id * GROUP_SIZE_M
+    group_size = min(num_m_tiles - first_m_in_group, GROUP_SIZE_M)
+    local_id = pid % (group_size * num_n_tiles)
+    m_local = local_id // num_n_tiles
+    pid_n = local_id % num_n_tiles
+    m_idx = first_m_in_group + m_local
+    # In a short last group local_id is a rotated offset; tiles still map one-to-one.
+    oh = m_idx // num_ow_tiles
+    pid_ow = m_idx % num_ow_tiles
+    ow0 = pid_ow * BLOCK_OW
+
+    HW = H * W
+    OHOW = OH * OW
+
+    acc = tl.zeros((BLOCK_OW, BLOCK_N), dtype=tl.float32)
+
+    for kh in range(KH):
+        for kw in range(KW):
+            x_row_start = n * HW + (oh + kh) * W + (ow0 + kw)
+            x_valid_rows = W - (ow0 + kw)
+            # The row bound ends with this input row, so reads past it load as zero.
+            x_bp = tl.make_block_ptr(
+                base=x_ptr,
+                shape=(x_row_start + x_valid_rows, C_IN),
+                strides=(C_IN, 1),
+                offsets=(x_row_start, 0),
+                block_shape=(BLOCK_OW, BLOCK_K),
+                order=(1, 0),
+            )
+
+            w_bp = tl.make_block_ptr(
+                base=w_ptr + kh * stride_wkh + kw * stride_wkw,
+                shape=(C_IN, C_out),
+                strides=(stride_wci, stride_wco),
+                offsets=(0, pid_n * BLOCK_N),
+                block_shape=(BLOCK_K, BLOCK_N),
+                order=(1, 0),
+            )
+
+            for c0 in range(0, C_IN, BLOCK_K):
+                x_tile = tl.load(x_bp, boundary_check=(0, 1), padding_option="zero")
+                w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+                acc = tl.dot(x_tile, w_tile, acc)
+                x_bp = tl.advance(x_bp, (0, BLOCK_K))
+                w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+    # Same bound for the store: tile rows beyond this output row are not written.
+    y_row_start = n * OHOW + oh * OW + ow0
+    y_valid_rows = OW - ow0
+    y_bp = tl.make_block_ptr(
+        base=y_ptr,
+        shape=(y_row_start + y_valid_rows, C_out),
+        strides=(C_out, 1),
+        offsets=(y_row_start, pid_n * BLOCK_N),
+        block_shape=(BLOCK_OW, BLOCK_N),
+        order=(1, 0),
+    )
+    tl.store(y_bp, acc.to(tl.float16), boundary_check=(0, 1))
+
+
+class Model(nn.Module):  # ConvTranspose2d forward as flip + zero-pad + Triton conv (fp16)
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        groups=1,
+        bias=False,
+    ):
+        super().__init__()
+        self.conv_transpose2d = nn.ConvTranspose2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+        self._packed = False
+        self._cached_x_ptr = None  # legacy attribute; forward() no longer consults it
+
+    def _pack_weights(self):  # one-time fp16 repack: flip taps, lay out (KH, KW, dim0, dim1)
+        weight = self.conv_transpose2d.weight.data.detach()
+        self.w_hwio = (
+            weight.flip(2, 3).permute(2, 3, 0, 1).to(dtype=torch.float16).contiguous()
+        )
+        self.KH = weight.shape[2]
+        self.KW = weight.shape[3]
+        self.C_in_val = weight.shape[0]
+        self.C_out_val = weight.shape[1]
+        self._packed = True
+
+    def forward(self, x):  # returns a freshly allocated fp16 channels_last tensor
+        if not self._packed:
+            self._pack_weights()
+        # NOTE(review): stride, padding, output_padding, groups and bias are not applied.
+        x = x.to(dtype=torch.float16)
+        N, C_in, H, W_in = x.shape
+        KH, KW = self.KH, self.KW
+        C_out = self.C_out_val
+
+        # Pad and allocate on every call.  The previous data_ptr()-keyed cache was
+        # unsound: the fp16 copy above is a temporary whose address the allocator
+        # can reuse, so a different input could hit the cache and silently get the
+        # stale padded input, and every call returned the same output buffer.
+        # Zero-pad by K - 1 per side so the kernel's plain conv covers every tap.
+        x_pad = F.pad(x, (KW - 1, KW - 1, KH - 1, KH - 1))
+        x_pad_cl = x_pad.contiguous(memory_format=torch.channels_last)
+        x_nhwc = x_pad_cl.permute(0, 2, 3, 1)
+        # Padded input extent, and the stride-1 output extent computed over it.
+        H_pad = H + 2 * (KH - 1)
+        W_pad = W_in + 2 * (KW - 1)
+        OH = H + KH - 1
+        OW = W_in + KW - 1
+        y = torch.empty(
+            (N, C_out, OH, OW),
+            device=x.device,
+            dtype=torch.float16,
+            memory_format=torch.channels_last,
+        )
+        y_nhwc = y.permute(0, 2, 3, 1)
+
+        def grid(META):
+            num_ow = triton.cdiv(OW, META["BLOCK_OW"])
+            num_n = triton.cdiv(C_out, META["BLOCK_N"])
+            return (OH * num_ow * num_n, N)
+
+        _conv_transpose2d_swizzled_v2[grid](
+            x_nhwc,
+            self.w_hwio,
+            y_nhwc,
+            N,
+            H_pad,
+            W_pad,
+            C_out,
+            OH,
+            OW,
+            self.w_hwio.stride(0),
+            self.w_hwio.stride(1),
+            self.w_hwio.stride(2),
+            self.w_hwio.stride(3),
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+        )
+
+        return y
+
+
+batch_size = 8  # benchmark problem size consumed by the two helpers below
+in_channels = 64
+out_channels = 64
+kernel_size = (3, 7)  # passed straight through to nn.ConvTranspose2d by Model.__init__
+width = 512
+height = 512
+
+
+def get_inputs():  # one (batch, channels, height, width) uniform [0, 1) tensor
+    x = torch.rand(batch_size, in_channels, height, width)
+    return [x]
+
+
+def get_init_inputs():  # matches Model.__init__'s first three parameters
+    return [in_channels, out_channels, kernel_size]
diff --git a/backends/triton/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py
new file mode 100644
index 0000000..656518f
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py
@@ -0,0 +1,242 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def _to_triple(x):
+    """Return ``(x, x, x)`` for an int, otherwise ``tuple(x)``."""
+    triple = (x,) * 3 if isinstance(x, int) else tuple(x)
+    return triple
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16, "grf_mode": "256"},
+            num_warps=4,
+            num_stages=2,
+        ),
+    ],
+    key=["C_out", "K_FUSED"],
+)
+@triton.jit
+def _conv3d_fused_k(
+    x_ptr,  # input; addressed with a unit stride on the channel axis
+    w_ptr,  # weights, read as a row-major (K_FUSED, C_out) matrix
+    y_ptr,  # output; stored as float16 with a unit stride on the channel axis
+    N_batch,
+    C_out,
+    OD,
+    OH,
+    OW,
+    sxn,  # input strides: batch, depth, height, width
+    sxd,
+    sxh,
+    sxw,
+    syn,  # output strides: batch, depth, height, width
+    syd,
+    syh,
+    syw,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    KD: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+    K_FUSED: tl.constexpr,  # reduction length; caller passes KD * KH * KW * C_in
+    grf_mode: tl.constexpr,  # not referenced in the kernel body
+):
+    pid_m = tl.program_id(0)  # tile over flattened (n, od, oh, ow) output rows
+    pid_n = tl.program_id(1)  # tile over output channels
+
+    M_total = N_batch * OD * OH * OW
+    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_k = tl.arange(0, BLOCK_K)
+
+    ohw = OH * OW  # split each flat row index into (n, od, oh, ow)
+    odhw = OD * ohw
+    n_idx = offs_m // odhw
+    rem = offs_m % odhw
+    od_idx = rem // ohw
+    rem2 = rem % ohw
+    oh_idx = rem2 // OW
+    ow_idx = rem2 % OW
+
+    mask_m = offs_m < M_total  # rows beyond the last output position
+
+    x_spatial_base = (
+        x_ptr + n_idx.to(tl.int64) * sxn + od_idx * sxd + oh_idx * sxh + ow_idx * sxw
+    )  # per-row address of the window origin at channel 0
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)  # float32 accumulator
+
+    w_bp = tl.make_block_ptr(
+        base=w_ptr,
+        shape=(K_FUSED, C_out),
+        strides=(C_out, 1),
+        offsets=(0, pid_n * BLOCK_N),
+        block_shape=(BLOCK_K, BLOCK_N),
+        order=(1, 0),
+    )
+
+    for k0 in range(0, K_FUSED, BLOCK_K):
+        k_idx = k0 + offs_k  # fused index = ((kd * KH + kh) * KW + kw) * C_IN + c
+        c_off = k_idx % C_IN
+        spatial_idx = k_idx // C_IN
+        kw_off = spatial_idx % KW
+        kh_off = (spatial_idx // KW) % KH
+        kd_off = spatial_idx // (KW * KH)
+        k_input_off = kd_off * sxd + kh_off * sxh + kw_off * sxw + c_off
+
+        x_tile = tl.load(
+            x_spatial_base[:, None] + k_input_off[None, :],
+            mask=mask_m[:, None] & (k_idx[None, :] < K_FUSED),  # masks the K tail too
+            other=0.0,
+        )
+        w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+        acc = tl.dot(x_tile, w_tile, acc)
+        w_bp = tl.advance(w_bp, (BLOCK_K, 0))  # next BLOCK_K rows of the weights
+
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    mask_n = offs_n < C_out
+    y_ptrs = (
+        y_ptr + n_idx.to(tl.int64) * syn + od_idx * syd + oh_idx * syh + ow_idx * syw
+    )
+    tl.store(
+        y_ptrs[:, None] + offs_n[None, :],
+        acc.to(tl.float16),
+        mask=mask_m[:, None] & mask_n[None, :],
+    )
+
+
+class Model(nn.Module):
+    """3D convolution (stride 1, no padding or dilation, one group) in Triton.
+
+    The result is float16; without a bias ``forward`` returns an internal buffer
+    that later calls with the same output shape and device overwrite.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=(1, 1, 1),
+        padding=(0, 0, 0),
+        dilation=(1, 1, 1),
+        groups=1,
+        bias=False,
+    ):
+        super().__init__()
+        ks = _to_triple(kernel_size)
+        # forward() hard-codes these settings; reject anything else rather than
+        # silently returning the result of a different convolution.
+        conv_cfg = (_to_triple(stride), _to_triple(padding), _to_triple(dilation))
+        if conv_cfg != ((1, 1, 1), (0, 0, 0), (1, 1, 1)) or groups != 1:
+            raise NotImplementedError(
+                "only stride=1, padding=0, dilation=1, groups=1 are supported"
+            )
+        self.weight = nn.Parameter(torch.empty(out_channels, in_channels, *ks))
+        self.bias = nn.Parameter(torch.empty(out_channels)) if bias else None
+        nn.init.kaiming_uniform_(self.weight, a=5**0.5)
+        if self.bias is not None:
+            fan_in = in_channels * ks[0] * ks[1] * ks[2]
+            bound = 1 / fan_in**0.5
+            nn.init.uniform_(self.bias, -bound, bound)
+        self._packed = False
+        self._y_buf = None
+
+    def _pack_weights(self):
+        """Cache the weights as a float16 (KD * KH * KW * C_in, C_out) matrix."""
+        w = self.weight.data
+        C_out, C_in, KD, KH, KW = w.shape
+        self.w_fused = (
+            w.permute(2, 3, 4, 1, 0)
+            .reshape(KD * KH * KW * C_in, C_out)
+            .contiguous()
+            .to(dtype=torch.float16)
+        )
+        self._KD, self._KH, self._KW = KD, KH, KW
+        self._C_in, self._C_out = C_in, C_out
+        self._K_fused = KD * KH * KW * C_in
+        self._packed = True
+
+    def forward(self, x):
+        """Convolve ``x`` (B, C_in, D, H, W) and return a float16 tensor."""
+        if not self._packed:
+            self._pack_weights()
+
+        B, C_in, D_in, H_in, W_in = x.shape
+        if C_in != self._C_in:
+            raise ValueError(f"expected {self._C_in} input channels, got {C_in}")
+        KD, KH, KW = self._KD, self._KH, self._KW
+        C_out = self._C_out
+        OD, OH, OW = D_in - KD + 1, H_in - KH + 1, W_in - KW + 1
+        x_fp16 = x.to(dtype=torch.float16)
+        x_cl = x_fp16.contiguous(memory_format=torch.channels_last_3d)
+
+        out_shape = (B, C_out, OD, OH, OW)
+        y = self._y_buf
+        # Reallocate on a device change as well, not only on a shape change.
+        if y is None or y.shape != out_shape or y.device != x.device:
+            y = self._y_buf = torch.empty(
+                out_shape,
+                device=x.device,
+                dtype=torch.float16,
+                memory_format=torch.channels_last_3d,
+            )
+
+        sx, sy = x_cl.stride(), y.stride()
+        M_total = B * OD * OH * OW
+        grid = lambda meta: (
+            triton.cdiv(M_total, meta["BLOCK_M"]),
+            triton.cdiv(C_out, meta["BLOCK_N"]),
+        )
+
+        _conv3d_fused_k[grid](
+            x_cl,
+            self.w_fused,
+            y,
+            B,
+            C_out,
+            OD,
+            OH,
+            OW,
+            *(sx[0], sx[2], sx[3], sx[4]),  # batch, depth, height, width strides
+            *(sy[0], sy[2], sy[3], sy[4]),
+            KD=KD,
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+            K_FUSED=self._K_fused,
+        )
+
+        if self.bias is not None:
+            y = y + self.bias.to(dtype=torch.float16).view(1, -1, 1, 1, 1)
+        return y
+
+
+batch_size = 8  # first dimension of the tensor built by get_inputs()
+in_channels = 3  # second dimension of that tensor; also in get_init_inputs()
+out_channels = 64  # returned by get_init_inputs()
+kernel_size = (3, 5, 7)  # returned by get_init_inputs()
+depth = 16  # third dimension of the get_inputs() tensor
+height = 128  # fourth dimension of the get_inputs() tensor
+width = 128  # last dimension of the get_inputs() tensor
+
+
+def get_inputs():
+    """Return a one-element list with a random (batch, channels, D, H, W) tensor."""
+    return [torch.rand(batch_size, in_channels, depth, height, width)]
+
+
+def get_init_inputs():
+    return [in_channels, out_channels, kernel_size]  # presumably Model(...) args
diff --git a/backends/triton/cpu/KernelBench/level1/67_conv_standard_1D.py b/backends/triton/cpu/KernelBench/level1/67_conv_standard_1D.py
new file mode 100644
index 0000000..7abf077
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/67_conv_standard_1D.py
@@ -0,0 +1,167 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_OL": 64, "BLOCK_N": 128}, num_warps=4, num_stages=3),
+    ],
+    key=["OL", "OC"],
+)
+@triton.jit
+def _conv1d_kernel(
+    x_ptr,  # input; positions along the length axis use a unit stride
+    w_ptr,  # weights, indexed as [tap, in_channel, out_channel]
+    y_ptr,  # output; stored as float16 with a unit stride along the length
+    OL,
+    OC,
+    stride_xn,
+    stride_xc,
+    stride_wk,
+    stride_wi,
+    stride_wo,
+    stride_yn,
+    stride_yoc,
+    C_IN: tl.constexpr,
+    BLOCK_K: tl.constexpr,  # channel tile; there is no loop over channel tiles
+    KERNEL_SIZE: tl.constexpr,
+    BLOCK_OL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    n = tl.program_id(0)  # batch index
+    pid_ol = tl.program_id(1)  # tile over output positions
+    pid_n = tl.program_id(2)  # tile over output channels
+
+    ol_start = pid_ol * BLOCK_OL
+    offs_ol = ol_start + tl.arange(0, BLOCK_OL)
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+    mask_ol = offs_ol < OL
+    mask_n = offs_n < OC
+
+    acc = tl.zeros((BLOCK_OL, BLOCK_N), dtype=tl.float32)  # float32 accumulator
+
+    offs_k = tl.arange(0, BLOCK_K)
+    x_batch = x_ptr + n.to(tl.int64) * stride_xn
+
+    for k in range(KERNEL_SIZE):
+        in_pos = offs_ol + k  # input position read by tap k for each output position
+
+        x_addrs = x_batch + offs_k[None, :].to(tl.int64) * stride_xc + in_pos[:, None]
+        x_tile = tl.load(
+            x_addrs, mask=mask_ol[:, None] & (offs_k[None, :] < C_IN), other=0.0
+        )  # (BLOCK_OL, BLOCK_K) tile; channels >= C_IN read as zero
+
+        w_bp = tl.make_block_ptr(
+            base=w_ptr + k * stride_wk,  # (C_IN, OC) weight slice of tap k
+            shape=(C_IN, OC),
+            strides=(stride_wi, stride_wo),
+            offsets=(0, pid_n * BLOCK_N),
+            block_shape=(BLOCK_K, BLOCK_N),
+            order=(1, 0),
+        )
+        w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+
+        acc = tl.dot(x_tile, w_tile, acc)
+
+    y_batch = y_ptr + n.to(tl.int64) * stride_yn
+    y_addrs = y_batch + offs_n[None, :].to(tl.int64) * stride_yoc + offs_ol[:, None]
+    tl.store(y_addrs, acc.to(tl.float16), mask=mask_ol[:, None] & mask_n[None, :])
+
+
+class Model(nn.Module):
+    """1D convolution (float16 result); non-default settings use ``nn.Conv1d``."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=False,
+    ):
+        super().__init__()
+        self.conv1d = nn.Conv1d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+        self._packed = False
+
+    def _pack_weights(self):
+        """Cache the weights as a contiguous float16 (K, C_in, C_out) tensor."""
+        w = self.conv1d.weight.data
+        self.w_kio = w.permute(2, 1, 0).contiguous().to(dtype=torch.float16)
+        self._packed = True
+
+    def forward(self, x):
+        """Convolve ``x`` of shape (B, C_in, L) and return a float16 tensor."""
+        conv = self.conv1d
+        cfg = (conv.stride, conv.padding, conv.dilation, conv.groups)
+        if cfg != ((1,), (0,), (1,), 1):
+            # The kernel only handles the default settings; PyTorch does the rest.
+            return conv(x.to(conv.weight.dtype)).to(torch.float16)
+        if not self._packed:
+            self._pack_weights()
+
+        x = x.to(dtype=torch.float16).contiguous()
+        B, C_in, L_in = x.shape
+        C_out, _, K = conv.weight.shape
+        OL = L_in - K + 1
+        BLOCK_K = triton.next_power_of_2(max(16, C_in))
+        y = torch.empty((B, C_out, OL), device=x.device, dtype=torch.float16)
+
+        grid = lambda META: (
+            B,
+            triton.cdiv(OL, META["BLOCK_OL"]),
+            triton.cdiv(C_out, META["BLOCK_N"]),
+        )
+        _conv1d_kernel[grid](
+            x,
+            self.w_kio,
+            y,
+            OL,
+            C_out,
+            *x.stride()[:2],
+            *self.w_kio.stride(),
+            *y.stride()[:2],
+            C_IN=C_in,
+            BLOCK_K=BLOCK_K,
+            KERNEL_SIZE=K,
+        )
+        if conv.bias is not None:
+            # The kernel has no bias term; add it here so bias=True is honoured.
+            y = y + conv.bias.to(dtype=torch.float16).view(1, -1, 1)
+        return y
+
+
+batch_size = 32  # first dimension of the tensor built by get_inputs()
+in_channels = 64  # second dimension of that tensor; also in get_init_inputs()
+out_channels = 128  # returned by get_init_inputs()
+kernel_size = 3  # returned by get_init_inputs()
+length = 131072  # last dimension of the get_inputs() tensor
+
+
+def get_inputs():
+    """Return a one-element list with a random (batch, channels, length) tensor."""
+    return [torch.rand(batch_size, in_channels, length)]
+
+
+def get_init_inputs():
+    return [in_channels, out_channels, kernel_size]  # presumably Model(...) args
diff --git a/backends/triton/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py
new file mode 100644
index 0000000..9bcf102
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py
@@ -0,0 +1,239 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_W": 16, "BLOCK_OC": 16}, num_warps=4, num_stages=3),
+    ],
+    key=["W_out", "C_out"],
+)
+@triton.jit
+def _conv_transpose3d_v2(
+    x_ptr,  # input; read as (W_in, C_IN) rows with a unit channel stride
+    w_ptr,  # weights; one row-major (C_IN, C_out) matrix per kernel tap
+    out_ptr,  # output; stored as float16 with a unit channel stride
+    D_in: tl.constexpr,
+    H_in: tl.constexpr,
+    W_in: tl.constexpr,
+    C_out: tl.constexpr,
+    D_out: tl.constexpr,
+    H_out: tl.constexpr,
+    W_out: tl.constexpr,
+    sx_b,  # input strides: batch, depth, height, width
+    sx_d,
+    sx_h,
+    sx_w,
+    so_b,  # output strides: batch, depth, height, width
+    so_d,
+    so_h,
+    so_w,
+    BLOCK_W: tl.constexpr,
+    BLOCK_OC: tl.constexpr,
+    KD: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    pid_w = tl.program_id(0)  # tile along the output width
+    pid_bdh = tl.program_id(1)  # flattened (batch, d_out, h_out)
+    pid_oc = tl.program_id(2)  # tile over output channels
+
+    b = pid_bdh // (D_out * H_out)
+    rem = pid_bdh % (D_out * H_out)
+    d_out = rem // H_out
+    h_out = rem % H_out
+
+    acc = tl.zeros((BLOCK_W, BLOCK_OC), dtype=tl.float32)  # float32 accumulator
+    x_batch_base = x_ptr + b.to(tl.int64) * sx_b
+
+    for kd in range(KD):
+        d_in = d_out - kd  # input depth that reaches d_out through tap kd
+        d_ok = (d_in >= 0) & (d_in < D_in)
+        if d_ok:  # taps outside the input depth are skipped
+            for kh in range(KH):
+                h_in = h_out - kh
+                h_ok = (h_in >= 0) & (h_in < H_in)
+                if h_ok:
+                    x_dh_base = x_batch_base + d_in * sx_d + h_in * sx_h
+
+                    for kw in range(KW):
+                        w_in_start = pid_w * BLOCK_W - kw  # negative when pid_w == 0, kw > 0
+                        kidx = kd * KH * KW + kh * KW + kw  # flat tap index
+
+                        x_bp = tl.make_block_ptr(
+                            base=x_dh_base,
+                            shape=(W_in, C_IN),
+                            strides=(sx_w, 1),
+                            offsets=(w_in_start, 0),
+                            block_shape=(BLOCK_W, BLOCK_K),
+                            order=(1, 0),
+                        )
+                        w_bp = tl.make_block_ptr(
+                            base=w_ptr + kidx * C_IN * C_out,
+                            shape=(C_IN, C_out),
+                            strides=(C_out, 1),
+                            offsets=(0, pid_oc * BLOCK_OC),
+                            block_shape=(BLOCK_K, BLOCK_OC),
+                            order=(1, 0),
+                        )
+
+                        for _c0 in range(0, C_IN, BLOCK_K):
+                            x_tile = tl.load(
+                                x_bp, boundary_check=(0, 1), padding_option="zero"
+                            )  # NOTE(review): assumes rows outside [0, W_in) read as 0
+                            w_tile = tl.load(
+                                w_bp, boundary_check=(0, 1), padding_option="zero"
+                            )
+                            acc += tl.dot(x_tile, w_tile)
+                            x_bp = tl.advance(x_bp, (0, BLOCK_K))
+                            w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+
+    out_bp = tl.make_block_ptr(
+        base=out_ptr + b.to(tl.int64) * so_b + d_out * so_d + h_out * so_h,
+        shape=(W_out, C_out),
+        strides=(so_w, 1),
+        offsets=(pid_w * BLOCK_W, pid_oc * BLOCK_OC),
+        block_shape=(BLOCK_W, BLOCK_OC),
+        order=(1, 0),
+    )
+    tl.store(out_bp, acc.to(tl.float16), boundary_check=(0, 1))
+
+
+class Model(nn.Module):
+    """Transposed 3D convolution returning float16.
+
+    Non-default stride, padding, output_padding or groups settings are
+    computed by the wrapped ``nn.ConvTranspose3d`` instead of being ignored.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=(1, 1, 1),
+        padding=(0, 0, 0),
+        output_padding=(0, 0, 0),
+        groups=1,
+        bias=False,
+    ):
+        super().__init__()
+        if isinstance(kernel_size, int):
+            kernel_size = (kernel_size,) * 3
+        self.deconv3d = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self._packed = False
+        self._y_buf = None
+
+    def _pack_weights(self):
+        """Cache the weights as a float16 (KD * KH * KW, C_in, C_out) tensor."""
+        w = self.deconv3d.weight.data
+        KD, KH, KW = self.kernel_size
+        self.weight_packed = (
+            w.permute(2, 3, 4, 0, 1)
+            .reshape(KD * KH * KW, self.in_channels, self.out_channels)
+            .contiguous()
+            .to(dtype=torch.float16)
+        )
+        self._packed = True
+
+    def forward(self, x):
+        """Apply the transposed convolution to ``x`` (B, C_in, D, H, W)."""
+        deconv = self.deconv3d
+        cfg = (deconv.stride, deconv.padding, deconv.output_padding, deconv.groups)
+        if cfg != ((1, 1, 1), (0, 0, 0), (0, 0, 0), 1):
+            # The kernel only handles the default settings; PyTorch does the rest.
+            return deconv(x.to(deconv.weight.dtype)).to(torch.float16)
+        if not self._packed:
+            self._pack_weights()
+
+        B, C_in, D_in, H_in, W_in = x.shape
+        KD, KH, KW = self.kernel_size
+        D_out, H_out, W_out = D_in + KD - 1, H_in + KH - 1, W_in + KW - 1
+
+        x_fp16 = x.to(dtype=torch.float16)
+        x_cl = x_fp16.contiguous(memory_format=torch.channels_last_3d)
+
+        shape = (B, self.out_channels, D_out, H_out, W_out)
+        output = self._y_buf
+        if output is None or output.shape != shape or output.device != x.device:
+            # Allocate directly in channels-last-3d rather than converting afterwards.
+            output = self._y_buf = torch.empty(
+                shape,
+                device=x.device,
+                dtype=torch.float16,
+                memory_format=torch.channels_last_3d,
+            )
+
+        sx, so = x_cl.stride(), output.stride()
+        grid = lambda META: (
+            triton.cdiv(W_out, META["BLOCK_W"]),
+            B * D_out * H_out,
+            triton.cdiv(self.out_channels, META["BLOCK_OC"]),
+        )
+
+        _conv_transpose3d_v2[grid](
+            x_cl,
+            self.weight_packed,
+            output,
+            D_in,
+            H_in,
+            W_in,
+            self.out_channels,
+            D_out,
+            H_out,
+            W_out,
+            *(sx[0], sx[2], sx[3], sx[4]),  # batch, depth, height, width strides
+            *(so[0], so[2], so[3], so[4]),
+            KD=KD,
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+            BLOCK_K=32,
+        )
+
+        if deconv.bias is not None:
+            # The kernel has no bias term; add it here so bias=True is honoured.
+            output = output + deconv.bias.to(dtype=torch.float16).view(1, -1, 1, 1, 1)
+        return output
+
+
+batch_size = 16  # first dimension of the tensor built by get_inputs()
+in_channels = 32  # second dimension of that tensor; also in get_init_inputs()
+out_channels = 64  # returned by get_init_inputs()
+kernel_depth = 3  # first entry of the kernel tuple in get_init_inputs()
+kernel_width = 5  # second entry of that tuple
+kernel_height = 5  # third entry of that tuple
+depth = 64  # third dimension of the get_inputs() tensor
+width = 64  # fourth dimension of the get_inputs() tensor (placed before height)
+height = 64  # last dimension of the get_inputs() tensor
+
+
+def get_inputs():
+    """Return a one-element list with a random 5-D tensor (width before height)."""
+    return [torch.rand(batch_size, in_channels, depth, width, height)]
+
+
+def get_init_inputs():
+    return [in_channels, out_channels, (kernel_depth, kernel_width, kernel_height)]  # presumably Model(...) args
diff --git a/backends/triton/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py
new file mode 100644
index 0000000..233c9b5
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py
@@ -0,0 +1,229 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {
+                "BLOCK_OW": 64,
+                "BLOCK_N": 64,
+                "BLOCK_K": 64,
+                "GROUP_SIZE_M": 8,
+                "grf_mode": "256",
+            },
+            num_warps=4,
+            num_stages=2,
+        ),
+    ],
+    key=["H", "W", "C_IN", "C_out", "OH", "OW"],
+)
+@triton.jit
+def _conv_transpose2d_swizzled_v2(
+    x_ptr,  # input, read as a dense row-major (N * H * W, C_IN) matrix
+    w_ptr,  # weights indexed [kh, kw, c_in, c_out] through the strides below
+    y_ptr,  # output, written as a dense (N * OH * OW, C_out) float16 matrix
+    N_batch,
+    H,  # height/width of the stored input; the caller passes the padded sizes
+    W,
+    C_out,
+    OH,
+    OW,
+    stride_wkh,
+    stride_wkw,
+    stride_wci,
+    stride_wco,
+    BLOCK_OW: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    grf_mode: tl.constexpr,  # not referenced in the kernel body
+):
+    n = tl.program_id(1)  # batch index
+    pid = tl.program_id(0)  # flat index over (output row, width tile, channel tile)
+
+    num_ow_tiles = tl.cdiv(OW, BLOCK_OW)
+    num_n_tiles = tl.cdiv(C_out, BLOCK_N)
+    num_m_tiles = OH * num_ow_tiles
+
+    group_id = pid // (GROUP_SIZE_M * num_n_tiles)  # GROUP_SIZE_M m-tiles per group
+    first_m_in_group = group_id * GROUP_SIZE_M
+    group_size = min(num_m_tiles - first_m_in_group, GROUP_SIZE_M)  # short tail group
+    local_id = pid % (group_size * num_n_tiles)
+    m_local = local_id // num_n_tiles
+    pid_n = local_id % num_n_tiles
+    m_idx = first_m_in_group + m_local
+
+    oh = m_idx // num_ow_tiles  # output row handled by this program
+    pid_ow = m_idx % num_ow_tiles
+    ow0 = pid_ow * BLOCK_OW  # first output column of the tile
+
+    HW = H * W
+    OHOW = OH * OW
+
+    acc = tl.zeros((BLOCK_OW, BLOCK_N), dtype=tl.float32)  # float32 accumulator
+
+    for kh in range(KH):
+        for kw in range(KW):
+            x_row_start = n * HW + (oh + kh) * W + (ow0 + kw)  # flat (n, h, w) row
+            x_valid_rows = W - (ow0 + kw)  # rows left before the image row ends
+
+            x_bp = tl.make_block_ptr(
+                base=x_ptr,
+                shape=(x_row_start + x_valid_rows, C_IN),  # clip at the image-row end
+                strides=(C_IN, 1),
+                offsets=(x_row_start, 0),
+                block_shape=(BLOCK_OW, BLOCK_K),
+                order=(1, 0),
+            )
+
+            w_bp = tl.make_block_ptr(
+                base=w_ptr + kh * stride_wkh + kw * stride_wkw,
+                shape=(C_IN, C_out),
+                strides=(stride_wci, stride_wco),
+                offsets=(0, pid_n * BLOCK_N),
+                block_shape=(BLOCK_K, BLOCK_N),
+                order=(1, 0),
+            )
+
+            for c0 in range(0, C_IN, BLOCK_K):
+                x_tile = tl.load(x_bp, boundary_check=(0, 1), padding_option="zero")
+                w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+                acc = tl.dot(x_tile, w_tile, acc)
+                x_bp = tl.advance(x_bp, (0, BLOCK_K))
+                w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+
+    y_row_start = n * OHOW + oh * OW + ow0  # flat (n, oh, ow) row of the tile start
+    y_valid_rows = OW - ow0
+    y_bp = tl.make_block_ptr(
+        base=y_ptr,
+        shape=(y_row_start + y_valid_rows, C_out),  # clip at the output-row end
+        strides=(C_out, 1),
+        offsets=(y_row_start, pid_n * BLOCK_N),
+        block_shape=(BLOCK_OW, BLOCK_N),
+        order=(1, 0),
+    )
+    tl.store(y_bp, acc.to(tl.float16), boundary_check=(0, 1))
+
+
+class Model(nn.Module):
+    """Transposed 2D convolution returning float16.
+
+    Non-default stride, padding, output_padding or groups settings are
+    computed by the wrapped ``nn.ConvTranspose2d`` instead of being ignored.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        groups=1,
+        bias=False,
+    ):
+        super().__init__()
+        self.conv_transpose2d = nn.ConvTranspose2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+        self._packed = False
+
+    def _pack_weights(self):
+        """Cache the spatially flipped weights as float16 (KH, KW, C_in, C_out)."""
+        weight = self.conv_transpose2d.weight.data.detach()
+        self.w_hwio = (
+            weight.flip(2, 3).permute(2, 3, 0, 1).to(dtype=torch.float16).contiguous()
+        )
+        self.C_in_val, self.C_out_val = weight.shape[:2]
+        self.KH, self.KW = weight.shape[2:]
+        self._packed = True
+
+    def forward(self, x):
+        """Apply the transposed convolution to ``x`` of shape (N, C_in, H, W)."""
+        deconv = self.conv_transpose2d
+        cfg = (deconv.stride, deconv.padding, deconv.output_padding, deconv.groups)
+        if cfg != ((1, 1), (0, 0), (0, 0), 1):
+            return deconv(x.to(deconv.weight.dtype)).to(torch.float16)
+        if not self._packed:
+            self._pack_weights()
+
+        x = x.to(dtype=torch.float16)
+        N, C_in, H, W_in = x.shape
+        KH, KW = self.KH, self.KW
+        C_out = self.C_out_val
+        OH, OW = H + KH - 1, W_in + KW - 1
+
+        # Pad on every call.  Keying a cache on x.data_ptr() is unsafe: the
+        # allocator can give a new tensor the address of a freed one, and in-place
+        # updates keep the address, so such a cache would serve stale activations.
+        x_pad = F.pad(x, (KW - 1, KW - 1, KH - 1, KH - 1))
+        x_pad = x_pad.contiguous(memory_format=torch.channels_last)
+        y = torch.empty(
+            (N, C_out, OH, OW),
+            device=x.device,
+            dtype=torch.float16,
+            memory_format=torch.channels_last,
+        )
+
+        def grid(META):
+            num_ow = triton.cdiv(OW, META["BLOCK_OW"])
+            num_n = triton.cdiv(C_out, META["BLOCK_N"])
+            return (OH * num_ow * num_n, N)
+
+        _conv_transpose2d_swizzled_v2[grid](
+            x_pad.permute(0, 2, 3, 1),
+            self.w_hwio,
+            y.permute(0, 2, 3, 1),
+            N,
+            H + 2 * (KH - 1),  # padded height
+            W_in + 2 * (KW - 1),  # padded width
+            C_out,
+            OH,
+            OW,
+            *self.w_hwio.stride(),
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+        )
+        if deconv.bias is not None:
+            # The kernel has no bias term; add it here so bias=True is honoured.
+            y = y + deconv.bias.to(dtype=torch.float16).view(1, -1, 1, 1)
+        return y
+
+
+batch_size = 64  # first dimension of the tensor built by get_inputs()
+in_channels = 64  # second dimension of that tensor; also in get_init_inputs()
+out_channels = 128  # returned by get_init_inputs()
+kernel_size = (3, 5)  # returned by get_init_inputs()
+height = 128  # third dimension of the get_inputs() tensor
+width = 256  # last dimension of the get_inputs() tensor
+
+
+def get_inputs():
+    """Return a one-element list with a random (batch, channels, H, W) tensor."""
+    return [torch.rand(batch_size, in_channels, height, width)]
+
+
+def get_init_inputs():
+    return [in_channels, out_channels, kernel_size]  # presumably Model(...) args
diff --git a/backends/triton/cpu/KernelBench/level1/6_Matmul_with_large_K_dimension_.py b/backends/triton/cpu/KernelBench/level1/6_Matmul_with_large_K_dimension_.py
new file mode 100644
index 0000000..2dd9faa
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/6_Matmul_with_large_K_dimension_.py
@@ -0,0 +1,133 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def _configs():
+    """Return the autotuning candidates (currently a single configuration)."""
+    # Tile sizes and grouping factor handed to the kernel as meta-parameters.
+    meta = {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_SIZE_M": 4}
+    candidates = [
+        triton.Config(meta, num_warps=8, num_stages=3),
+    ]
+    return candidates
+
+
+@triton.autotune(configs=_configs(), key=["M", "N", "K"])
+@triton.jit
+def _matmul_kernel(
+    a_ptr,  # (M, K) left operand
+    b_ptr,  # (K, N) right operand
+    c_ptr,  # (M, N) result; its element type is the type the accumulator is cast to
+    M,
+    N,
+    K,
+    stride_am,
+    stride_ak: tl.constexpr,
+    stride_bk,
+    stride_bn: tl.constexpr,
+    stride_cm,
+    stride_cn: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    pid = tl.program_id(0)  # flat index over all (BLOCK_M, BLOCK_N) output tiles
+
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+
+    group_id = pid // num_pid_in_group  # a group spans GROUP_SIZE_M rows of tiles
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M)  # short tail
+
+    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    a_desc = tl.make_tensor_descriptor(
+        base=a_ptr,
+        shape=(M, K),
+        strides=(stride_am, stride_ak),
+        block_shape=(BLOCK_M, BLOCK_K),
+    )
+    b_desc = tl.make_tensor_descriptor(
+        base=b_ptr,
+        shape=(K, N),
+        strides=(stride_bk, stride_bn),
+        block_shape=(BLOCK_K, BLOCK_N),
+    )
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)  # float32 accumulator
+    for off_k in range(0, K, BLOCK_K):
+        a_tile = a_desc.load([pid_m * BLOCK_M, off_k])
+        b_tile = b_desc.load([off_k, pid_n * BLOCK_N])
+        acc += tl.dot(a_tile, b_tile)
+    c_desc = tl.make_tensor_descriptor(
+        base=c_ptr,
+        shape=(M, N),
+        strides=(stride_cm, stride_cn),
+        block_shape=(BLOCK_M, BLOCK_N),
+    )
+    c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty))
+
+
+def _matmul_triton(A, B):
+    """Return ``A @ B`` for 2-D ``A`` (M, K) and ``B`` (K, N) in ``A``'s dtype."""
+    M, K = A.shape
+    K2, N = B.shape
+    if K != K2:
+        # Without this check the kernel would read B with the wrong K extent.
+        raise ValueError(f"inner dimensions differ: A is {M}x{K}, B is {K2}x{N}")
+
+    A = A.contiguous()
+    B = B.contiguous()
+    C = torch.empty((M, N), device=A.device, dtype=A.dtype)
+
+    grid = lambda META: (
+        triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),
+    )
+
+    _matmul_kernel[grid](
+        A,
+        B,
+        C,
+        M,
+        N,
+        K,
+        *A.stride(),
+        *B.stride(),
+        *C.stride(),
+    )
+    return C
+
+
+class Model(nn.Module):
+    """Multiply two 2-D tensors with the Triton kernel wrapped by _matmul_triton."""
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        product = _matmul_triton(A, B)
+        return product
+
+
+M = 256  # rows of A in get_inputs()
+N = 256  # columns of B in get_inputs()
+K = 131072 * 4  # shared inner dimension of A and B in get_inputs()
+
+
+def get_inputs():
+    """Return random bfloat16 operands ``[A (M, K), B (K, N)]``."""
+    shapes = ((M, K), (K, N))
+    return [torch.rand(*shape, dtype=torch.bfloat16) for shape in shapes]
+
+
+def get_init_inputs():
+    return []  # no values to pass on; Model.__init__ here takes no arguments
diff --git a/backends/triton/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py
new file mode 100644
index 0000000..9418854
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py
@@ -0,0 +1,234 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_W": 16, "BLOCK_OC": 32}, num_warps=4, num_stages=3),
+    ],
+    key=["W_out", "C_out"],
+)
+@triton.jit
+def _conv_transpose3d_v2(
+    x_ptr,
+    w_ptr,
+    out_ptr,
+    D_in: tl.constexpr,
+    H_in: tl.constexpr,
+    W_in: tl.constexpr,
+    C_out: tl.constexpr,
+    D_out: tl.constexpr,
+    H_out: tl.constexpr,
+    W_out: tl.constexpr,
+    sx_b,
+    sx_d,
+    sx_h,
+    sx_w,
+    so_b,
+    so_d,
+    so_h,
+    so_w,
+    BLOCK_W: tl.constexpr,
+    BLOCK_OC: tl.constexpr,
+    K: tl.constexpr,
+    C_IN: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    """Compute one (BLOCK_W, BLOCK_OC) output tile of a transposed 3-D conv.
+
+    The index mapping is ``in = out - k`` on every spatial axis, i.e. the
+    kernel contains no stride, padding or dilation terms.
+
+    Grid layout:
+        axis 0: tile index along the output width,
+        axis 1: flattened (batch, d_out, h_out),
+        axis 2: tile index along the output channels.
+
+    Args:
+        x_ptr: Input; addressed with strides ``sx_b/sx_d/sx_h/sx_w`` and a
+            channel stride of 1 (see the block pointer strides below).
+        w_ptr: Weights addressed as ``K*K*K`` consecutive row-major
+            (C_IN, C_out) matrices, one per (kd, kh, kw) tap.
+        out_ptr: Output; addressed with strides ``so_b/so_d/so_h/so_w`` and a
+            channel stride of 1. Values are stored as float16.
+        D_in, H_in, W_in: Input spatial extents.
+        C_out: Number of output channels.
+        D_out, H_out, W_out: Output spatial extents.
+        BLOCK_W, BLOCK_OC: Output tile sizes (width x output channels).
+        K: Kernel extent, used for all three spatial axes.
+        C_IN: Number of input channels (the reduction dimension).
+        BLOCK_K: Reduction tile size along the input channels.
+    """
+    pid_w = tl.program_id(0)
+    pid_bdh = tl.program_id(1)
+    pid_oc = tl.program_id(2)
+
+    # Unflatten grid axis 1 into (batch, output depth, output height).
+    b = pid_bdh // (D_out * H_out)
+    rem = pid_bdh % (D_out * H_out)
+    d_out = rem // H_out
+    h_out = rem % H_out
+
+    # Accumulate in float32 regardless of the storage dtype.
+    acc = tl.zeros((BLOCK_W, BLOCK_OC), dtype=tl.float32)
+    # 64-bit batch offset so the base pointer arithmetic does not use int32.
+    x_batch_base = x_ptr + b.to(tl.int64) * sx_b
+
+    for kd in range(K):
+        # Input plane feeding d_out through tap kd; skip taps outside the input.
+        d_in = d_out - kd
+        d_ok = (d_in >= 0) & (d_in < D_in)
+        if d_ok:
+            for kh in range(K):
+                h_in = h_out - kh
+                h_ok = (h_in >= 0) & (h_in < H_in)
+                if h_ok:
+                    x_dh_base = x_batch_base + d_in * sx_d + h_in * sx_h
+
+                    for kw in range(K):
+                        # First input column of this tile for tap kw. This is
+                        # negative for the first tile when kw > 0; presumably
+                        # boundary_check zero-fills those rows - TODO confirm.
+                        w_in_start = pid_w * BLOCK_W - kw
+                        # Flat index of the (kd, kh, kw) tap in the weights.
+                        kidx = kd * K * K + kh * K + kw
+
+                        # (W_in, C_IN) view of one input row of the volume.
+                        x_bp = tl.make_block_ptr(
+                            base=x_dh_base,
+                            shape=(W_in, C_IN),
+                            strides=(sx_w, 1),
+                            offsets=(w_in_start, 0),
+                            block_shape=(BLOCK_W, BLOCK_K),
+                            order=(1, 0),
+                        )
+                        # (C_IN, C_out) weight matrix of the current tap.
+                        w_bp = tl.make_block_ptr(
+                            base=w_ptr + kidx * C_IN * C_out,
+                            shape=(C_IN, C_out),
+                            strides=(C_out, 1),
+                            offsets=(0, pid_oc * BLOCK_OC),
+                            block_shape=(BLOCK_K, BLOCK_OC),
+                            order=(1, 0),
+                        )
+
+                        # Reduce over the input channels in BLOCK_K chunks.
+                        for _c0 in range(0, C_IN, BLOCK_K):
+                            x_tile = tl.load(
+                                x_bp, boundary_check=(0, 1), padding_option="zero"
+                            )
+                            w_tile = tl.load(
+                                w_bp, boundary_check=(0, 1), padding_option="zero"
+                            )
+                            acc += tl.dot(x_tile, w_tile)
+                            x_bp = tl.advance(x_bp, (0, BLOCK_K))
+                            w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+
+    # (W_out, C_out) view of the output row owned by this program.
+    out_bp = tl.make_block_ptr(
+        base=out_ptr + b.to(tl.int64) * so_b + d_out * so_d + h_out * so_h,
+        shape=(W_out, C_out),
+        strides=(so_w, 1),
+        offsets=(pid_w * BLOCK_W, pid_oc * BLOCK_OC),
+        block_shape=(BLOCK_W, BLOCK_OC),
+        order=(1, 0),
+    )
+    # The output dtype is fixed to float16 here; partial tiles are clipped.
+    tl.store(out_bp, acc.to(tl.float16), boundary_check=(0, 1))
+
+
+class Model(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        dilation=1,
+        groups=1,
+        bias=False,
+    ):
+        super().__init__()
+        self.conv_transpose3d = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self._packed = False
+        self._y_buf = None
+
+    def _pack_weights(self):
+        w = self.conv_transpose3d.weight.data
+        K = self.kernel_size
+        self.weight_packed = (
+            w.permute(2, 3, 4, 0, 1)
+            .reshape(K * K * K, self.in_channels, self.out_channels)
+            .contiguous()
+            .to(dtype=torch.float16)
+        )
+        self._packed = True
+
+    def forward(self, x):
+        if not self._packed:
+            self._pack_weights()
+
+        device = x.device
+        B, C_in, D_in, H_in, W_in = x.shape
+        K = self.kernel_size
+
+        D_out = D_in + K - 1
+        H_out = H_in + K - 1
+        W_out = W_in + K - 1
+
+        x_cl = x.to(dtype=torch.float16).contiguous(
+            memory_format=torch.channels_last_3d
+        )
+
+        expected_shape = (B, self.out_channels, D_out, H_out, W_out)
+        if (
+            self._y_buf is None
+            or self._y_buf.shape != expected_shape
+            or self._y_buf.device != device
+        ):
+            self._y_buf = torch.empty(
+                *expected_shape,
+                device=device,
+                dtype=torch.float16,
+            ).contiguous(memory_format=torch.channels_last_3d)
+        output = self._y_buf
+
+        sx = x_cl.stride()
+        so = output.stride()
+
+        BLOCK_K = 16
+
+        grid = lambda META: (
+            triton.cdiv(W_out, META["BLOCK_W"]),
+            B * D_out * H_out,
+            triton.cdiv(self.out_channels, META["BLOCK_OC"]),
+        )
+
+        _conv_transpose3d_v2[grid](
+            x_cl,
+            self.weight_packed,
+            output,
+            D_in,
+            H_in,
+            W_in,
+            self.out_channels,
+            D_out,
+            H_out,
+            W_out,
+            sx[0],
+            sx[2],
+            sx[3],
+            sx[4],
+            so[0],
+            so[2],
+            so[3],
+            so[4],
+            K=K,
+            C_IN=C_in,
+            BLOCK_K=BLOCK_K,
+        )
+
+        return output
+
+
+batch_size = 8
+in_channels = 48
+out_channels = 24
+kernel_size = 3
+depth = 96
+height = 96
+width = 96
+
+
+def get_inputs():
+    x = torch.rand(batch_size, in_channels, depth, height, width)
+    return [x]
+
+
+def get_init_inputs():
+    return [in_channels, out_channels, kernel_size]
diff --git a/backends/triton/cpu/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py
new file mode 100644
index 0000000..e93ce1a
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py
@@ -0,0 +1,229 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {
+                "BLOCK_OW": 64,
+                "BLOCK_N": 32,
+                "BLOCK_K": 32,
+                "GROUP_SIZE_M": 8,
+                "grf_mode": "128",
+            },
+            num_warps=4,
+            num_stages=3,
+        ),
+    ],
+    key=["H", "W", "C_IN", "C_out", "OH", "OW"],
+)
+@triton.jit
+def _conv_transpose2d_swizzled_v2(
+    x_ptr,
+    w_ptr,
+    y_ptr,
+    N_batch,
+    H,
+    W,
+    C_out,
+    OH,
+    OW,
+    stride_wkh,
+    stride_wkw,
+    stride_wci,
+    stride_wco,
+    BLOCK_OW: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    grf_mode: tl.constexpr,
+):
+    """Compute one (BLOCK_OW, BLOCK_N) output tile as a sum of per-tap GEMMs.
+
+    For every kernel tap (kh, kw) the program multiplies the input rows
+    starting at ``(oh + kh, ow0 + kw)`` with the tap's (C_IN, C_out) weight
+    matrix and accumulates in float32. The kernel itself performs no padding
+    or weight flipping; the caller is expected to supply an input whose
+    ``H``/``W`` already include any padding.
+
+    Grid layout:
+        axis 0: flattened (output row, output-width tile, output-channel tile),
+        axis 1: batch index.
+
+    Args:
+        x_ptr: Input addressed as a dense row-major matrix with rows
+            ``n * H * W + h * W + w`` and ``C_IN`` columns.
+        w_ptr: Weights addressed via ``stride_wkh/wkw/wci/wco``.
+        y_ptr: Output addressed as a dense row-major matrix with rows
+            ``n * OH * OW + oh * OW + ow`` and ``C_out`` columns; stored as
+            float16.
+        N_batch: Not referenced in the kernel body.
+        H, W: Spatial extents of the input as laid out in memory.
+        C_out: Number of output channels.
+        OH, OW: Output spatial extents.
+        BLOCK_OW, BLOCK_N, BLOCK_K: Tile sizes (width, out channels, reduction).
+        KH, KW: Kernel extents.
+        C_IN: Number of input channels.
+        GROUP_SIZE_M: Number of M-tiles per scheduling group.
+        grf_mode: Not referenced in the kernel body.
+    """
+    n = tl.program_id(1)
+    pid = tl.program_id(0)
+
+    num_ow_tiles = tl.cdiv(OW, BLOCK_OW)
+    num_n_tiles = tl.cdiv(C_out, BLOCK_N)
+    # An "M-tile" is one (output row, output-width tile) pair.
+    num_m_tiles = OH * num_ow_tiles
+
+    # Map the linear program id to an (M-tile, N-tile) pair, handing out
+    # M-tiles in groups of GROUP_SIZE_M (the last group may be smaller).
+    group_id = pid // (GROUP_SIZE_M * num_n_tiles)
+    first_m_in_group = group_id * GROUP_SIZE_M
+    group_size = min(num_m_tiles - first_m_in_group, GROUP_SIZE_M)
+    local_id = pid % (group_size * num_n_tiles)
+    m_local = local_id // num_n_tiles
+    pid_n = local_id % num_n_tiles
+    m_idx = first_m_in_group + m_local
+
+    # Split the M-tile index into output row and starting output column.
+    oh = m_idx // num_ow_tiles
+    pid_ow = m_idx % num_ow_tiles
+    ow0 = pid_ow * BLOCK_OW
+
+    HW = H * W
+    OHOW = OH * OW
+
+    acc = tl.zeros((BLOCK_OW, BLOCK_N), dtype=tl.float32)
+
+    for kh in range(KH):
+        for kw in range(KW):
+            # Flattened (n, h, w) row index of the first input pixel.
+            x_row_start = n * HW + (oh + kh) * W + (ow0 + kw)
+            # Pixels left in the current input row from that column on.
+            x_valid_rows = W - (ow0 + kw)
+
+            # The row bound ends at the current input row's last pixel, so
+            # boundary_check zero-fills tile rows that would spill past it.
+            x_bp = tl.make_block_ptr(
+                base=x_ptr,
+                shape=(x_row_start + x_valid_rows, C_IN),
+                strides=(C_IN, 1),
+                offsets=(x_row_start, 0),
+                block_shape=(BLOCK_OW, BLOCK_K),
+                order=(1, 0),
+            )
+
+            # (C_IN, C_out) weight matrix of tap (kh, kw).
+            w_bp = tl.make_block_ptr(
+                base=w_ptr + kh * stride_wkh + kw * stride_wkw,
+                shape=(C_IN, C_out),
+                strides=(stride_wci, stride_wco),
+                offsets=(0, pid_n * BLOCK_N),
+                block_shape=(BLOCK_K, BLOCK_N),
+                order=(1, 0),
+            )
+
+            # Reduce over the input channels; c0 only drives the trip count.
+            for c0 in range(0, C_IN, BLOCK_K):
+                x_tile = tl.load(x_bp, boundary_check=(0, 1), padding_option="zero")
+                w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+                acc = tl.dot(x_tile, w_tile, acc)
+                x_bp = tl.advance(x_bp, (0, BLOCK_K))
+                w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+
+    # Same row-bounding trick for the output: clip the tile at the row end.
+    y_row_start = n * OHOW + oh * OW + ow0
+    y_valid_rows = OW - ow0
+    y_bp = tl.make_block_ptr(
+        base=y_ptr,
+        shape=(y_row_start + y_valid_rows, C_out),
+        strides=(C_out, 1),
+        offsets=(y_row_start, pid_n * BLOCK_N),
+        block_shape=(BLOCK_OW, BLOCK_N),
+        order=(1, 0),
+    )
+    tl.store(y_bp, acc.to(tl.float16), boundary_check=(0, 1))
+
+
+class Model(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        groups=1,
+        bias=False,
+    ):
+        super().__init__()
+        self.conv_transpose2d = nn.ConvTranspose2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+        self._packed = False
+        self._cached_x_ptr = None
+
+    def _pack_weights(self):
+        weight = self.conv_transpose2d.weight.data.detach()
+        self.w_hwio = (
+            weight.flip(2, 3).permute(2, 3, 0, 1).to(dtype=torch.float16).contiguous()
+        )
+        self.KH = weight.shape[2]
+        self.KW = weight.shape[3]
+        self.C_in_val = weight.shape[0]
+        self.C_out_val = weight.shape[1]
+        self._packed = True
+
+    def forward(self, x):
+        if not self._packed:
+            self._pack_weights()
+
+        x = x.to(dtype=torch.float16)
+        N, C_in, H, W_in = x.shape
+        KH, KW = self.KH, self.KW
+        C_out = self.C_out_val
+
+        if self._cached_x_ptr != x.data_ptr():
+            x_pad = F.pad(x, (KW - 1, KW - 1, KH - 1, KH - 1))
+            x_pad_cl = x_pad.contiguous(memory_format=torch.channels_last)
+            self._x_nhwc = x_pad_cl.permute(0, 2, 3, 1)
+            self._cached_x_ptr = x.data_ptr()
+            self._H_pad = H + 2 * (KH - 1)
+            self._W_pad = W_in + 2 * (KW - 1)
+            self._OH = H + KH - 1
+            self._OW = W_in + KW - 1
+            self._N = N
+            self._y = torch.empty(
+                (N, C_out, self._OH, self._OW),
+                device=x.device,
+                dtype=torch.float16,
+                memory_format=torch.channels_last,
+            )
+            self._y_nhwc = self._y.permute(0, 2, 3, 1)
+
+        OH, OW = self._OH, self._OW
+        H_pad, W_pad = self._H_pad, self._W_pad
+
+        def grid(META):
+            num_ow = triton.cdiv(OW, META["BLOCK_OW"])
+            num_n = triton.cdiv(C_out, META["BLOCK_N"])
+            return (OH * num_ow * num_n, N)
+
+        _conv_transpose2d_swizzled_v2[grid](
+            self._x_nhwc,
+            self.w_hwio,
+            self._y_nhwc,
+            N,
+            H_pad,
+            W_pad,
+            C_out,
+            OH,
+            OW,
+            self.w_hwio.stride(0),
+            self.w_hwio.stride(1),
+            self.w_hwio.stride(2),
+            self.w_hwio.stride(3),
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+        )
+
+        return self._y
+
+
+batch_size = 8
+in_channels = 32
+out_channels = 32
+kernel_size = 3
+height = 512
+width = 1024
+
+
+def get_inputs():
+    x = torch.rand(batch_size, in_channels, height, width)
+    return [x]
+
+
+def get_init_inputs():
+    return [in_channels, out_channels, kernel_size]
diff --git a/backends/triton/cpu/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py b/backends/triton/cpu/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py
new file mode 100644
index 0000000..8509131
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py
@@ -0,0 +1,289 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_OW": 64, "BLOCK_OH": 2}, num_warps=4, num_stages=2),
+    ],
+    key=["D_out", "H_out", "W_out"],
+)
+@triton.jit
+def _conv_transpose3d_v2(
+    x_ptr,
+    w_ptr,
+    out_ptr,
+    D_in: tl.constexpr,
+    H_in: tl.constexpr,
+    W_in: tl.constexpr,
+    D_out: tl.constexpr,
+    H_out: tl.constexpr,
+    W_out: tl.constexpr,
+    C_in_pg: tl.constexpr,
+    C_out_pg: tl.constexpr,
+    groups: tl.constexpr,
+    kD: tl.constexpr,
+    kH: tl.constexpr,
+    kW: tl.constexpr,
+    stride_d: tl.constexpr,
+    stride_h: tl.constexpr,
+    stride_w: tl.constexpr,
+    pad_d: tl.constexpr,
+    pad_h: tl.constexpr,
+    pad_w: tl.constexpr,
+    sx_b,
+    sx_c,
+    sx_d,
+    sx_h,
+    sx_w,
+    sw_ci,
+    sw_kd,
+    sw_kh,
+    sw_kw,
+    so_b,
+    so_c,
+    so_d,
+    so_h,
+    so_w,
+    BLOCK_OW: tl.constexpr,
+    BLOCK_OH: tl.constexpr,
+):
+    """Grouped, strided, padded transposed 3-D convolution (gather form).
+
+    Each program handles one (batch, group) pair, one output depth and a
+    BLOCK_OH x BLOCK_OW patch of output positions, producing all ``C_out_pg``
+    output channels of the group. For every output position it gathers the
+    input elements ``i`` satisfying ``o + pad - k == i * stride``.
+
+    Grid layout:
+        axis 0: flattened (batch, group),
+        axis 1: output depth,
+        axis 2: flattened (output-height tile, output-width tile).
+
+    Args:
+        x_ptr: Input, addressed with strides ``sx_b/sx_c/sx_d/sx_h/sx_w``.
+        w_ptr: Weights, addressed with ``sw_ci/sw_kd/sw_kh/sw_kw`` and a unit
+            stride over the per-group output channel.
+        out_ptr: Output, addressed with ``so_b/so_c/so_d/so_h/so_w``; stored
+            as float16.
+        D_in, H_in, W_in: Input spatial extents.
+        D_out, H_out, W_out: Output spatial extents.
+        C_in_pg, C_out_pg: Input / output channels per group. ``C_out_pg`` is
+            used as a ``tl.arange`` extent.
+        groups: Number of groups.
+        kD, kH, kW: Kernel extents.
+        stride_d, stride_h, stride_w: Convolution strides.
+        pad_d, pad_h, pad_w: Convolution paddings.
+        BLOCK_OW, BLOCK_OH: Output patch size handled per program.
+    """
+    pid_bg = tl.program_id(0)
+    pid_d = tl.program_id(1)
+    pid_hw = tl.program_id(2)
+
+    # Split grid axis 0 into batch index and group index.
+    b_idx = pid_bg // groups
+    g = pid_bg % groups
+
+    od = pid_d
+
+    # Split grid axis 2 into the height tile and the width tile.
+    num_ow_tiles = tl.cdiv(W_out, BLOCK_OW)
+    oh_tile = pid_hw // num_ow_tiles
+    ow_tile = pid_hw % num_ow_tiles
+
+    oh0 = oh_tile * BLOCK_OH
+    ow0 = ow_tile * BLOCK_OW
+
+    ow_offs = ow0 + tl.arange(0, BLOCK_OW)
+    ow_mask = ow_offs < W_out
+    # Local output-channel indices within the group.
+    oc_offs = tl.arange(0, C_out_pg)
+
+    for oh_local in range(BLOCK_OH):
+        oh = oh0 + oh_local
+        if oh < H_out:
+            # One float32 accumulator row per output column, per channel.
+            acc = tl.zeros((BLOCK_OW, C_out_pg), dtype=tl.float32)
+
+            for kd_idx in range(kD):
+                # A tap contributes only if (od + pad - k) is a non-negative
+                # multiple of the stride and the quotient is inside the input.
+                tmp_d = od + pad_d - kd_idx
+                d_ok = (tmp_d >= 0) & (tmp_d % stride_d == 0)
+                if d_ok:
+                    id_val = tmp_d // stride_d
+                    if id_val < D_in:
+                        for kh_idx in range(kH):
+                            tmp_h = oh + pad_h - kh_idx
+                            h_ok = (tmp_h >= 0) & (tmp_h % stride_h == 0)
+                            if h_ok:
+                                ih_val = tmp_h // stride_h
+                                if ih_val < H_in:
+                                    # Base address of the input row (all
+                                    # offsets widened to int64).
+                                    x_dh = (
+                                        x_ptr
+                                        + b_idx.to(tl.int64) * sx_b
+                                        + id_val.to(tl.int64) * sx_d
+                                        + ih_val.to(tl.int64) * sx_h
+                                    )
+                                    w_kdkh = w_ptr + kd_idx * sw_kd + kh_idx * sw_kh
+
+                                    for kw_idx in range(kW):
+                                        # Same validity test along W, done as
+                                        # a per-lane mask instead of a branch.
+                                        tmp_w = ow_offs + pad_w - kw_idx
+                                        w_ok = (tmp_w >= 0) & (tmp_w % stride_w == 0)
+                                        iw_vals = tmp_w // stride_w
+                                        valid = (
+                                            ow_mask
+                                            & w_ok
+                                            & (iw_vals >= 0)
+                                            & (iw_vals < W_in)
+                                        )
+
+                                        w_k = w_kdkh + kw_idx * sw_kw
+
+                                        for ic_local in range(C_in_pg):
+                                            # Global input channel of this group.
+                                            ic = g * C_in_pg + ic_local
+                                            x_val = tl.load(
+                                                x_dh
+                                                + ic.to(tl.int64) * sx_c
+                                                + iw_vals.to(tl.int64) * sx_w,
+                                                mask=valid,
+                                                other=0.0,
+                                            ).to(tl.float32)
+                                            # Output channels are read with
+                                            # unit stride (no stride argument).
+                                            w_vals = tl.load(
+                                                w_k
+                                                + ic.to(tl.int64) * sw_ci
+                                                + oc_offs.to(tl.int64)
+                                            ).to(tl.float32)
+                                            # Outer product: columns x channels.
+                                            acc += x_val[:, None] * w_vals[None, :]
+
+            # Map local channels to global output channels and write the row.
+            oc_global = g * C_out_pg + oc_offs
+            out_off = (
+                out_ptr
+                + b_idx.to(tl.int64) * so_b
+                + oc_global[None, :].to(tl.int64) * so_c
+                + od.to(tl.int64) * so_d
+                + oh.to(tl.int64) * so_h
+                + ow_offs[:, None].to(tl.int64) * so_w
+            )
+            tl.store(out_off, acc.to(tl.float16), mask=ow_mask[:, None])
+
+
+class Model(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=(1, 1, 1),
+        padding=(0, 0, 0),
+        output_padding=(0, 0, 0),
+        groups=1,
+        bias=False,
+    ):
+        super().__init__()
+        self.conv_transpose3d = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        if isinstance(kernel_size, int):
+            kernel_size = (kernel_size,) * 3
+        if isinstance(stride, int):
+            stride = (stride,) * 3
+        if isinstance(padding, int):
+            padding = (padding,) * 3
+        if isinstance(output_padding, int):
+            output_padding = (output_padding,) * 3
+        self.kernel_size_t = kernel_size
+        self.stride_t = stride
+        self.padding_t = padding
+        self.output_padding_t = output_padding
+        self.groups = groups
+        self._packed = False
+
+    def _pack_weights(self):
+        w = self.conv_transpose3d.weight.data  # (C_in, C_out_pg, kD, kH, kW)
+        w = w.permute(0, 2, 3, 4, 1).contiguous()  # (C_in, kD, kH, kW, C_out_pg)
+        self.weight_xpu = w.to(dtype=torch.float16)
+        self._packed = True
+
+    def forward(self, x):
+        if not self._packed:
+            self._pack_weights()
+        x = x.to(dtype=torch.float16).contiguous()
+
+        B = x.shape[0]
+        D_in, H_in, W_in = x.shape[2], x.shape[3], x.shape[4]
+        C_in_pg = self.in_channels // self.groups
+        C_out_pg = self.out_channels // self.groups
+        kD, kH, kW = self.kernel_size_t
+        sD, sH, sW = self.stride_t
+        pD, pH, pW = self.padding_t
+        opD, opH, opW = self.output_padding_t
+
+        D_out = (D_in - 1) * sD - 2 * pD + kD + opD
+        H_out = (H_in - 1) * sH - 2 * pH + kH + opH
+        W_out = (W_in - 1) * sW - 2 * pW + kW + opW
+
+        output = torch.empty(
+            (B, self.out_channels, D_out, H_out, W_out),
+            device=x.device,
+            dtype=torch.float16,
+        )
+
+        w = self.weight_xpu
+        grid = lambda META: (
+            B * self.groups,
+            D_out,
+            triton.cdiv(H_out, META["BLOCK_OH"]) * triton.cdiv(W_out, META["BLOCK_OW"]),
+        )
+
+        _conv_transpose3d_v2[grid](
+            x,
+            w,
+            output,
+            D_in,
+            H_in,
+            W_in,
+            D_out,
+            H_out,
+            W_out,
+            C_in_pg,
+            C_out_pg,
+            self.groups,
+            kD,
+            kH,
+            kW,
+            sD,
+            sH,
+            sW,
+            pD,
+            pH,
+            pW,
+            x.stride(0),
+            x.stride(1),
+            x.stride(2),
+            x.stride(3),
+            x.stride(4),
+            w.stride(0),
+            w.stride(1),
+            w.stride(2),
+            w.stride(3),
+            output.stride(0),
+            output.stride(1),
+            output.stride(2),
+            output.stride(3),
+            output.stride(4),
+        )
+        return output
+
+
+batch_size = 8
+in_channels = 32
+out_channels = 32
+kernel_size = (3, 5, 7)
+depth = 12
+height = 24
+width = 48
+stride = (2, 2, 2)
+padding = (1, 2, 3)
+output_padding = (1, 1, 1)
+groups = 4
+
+
+def get_inputs():
+    x = torch.rand(batch_size, in_channels, depth, height, width)
+    return [x]
+
+
+def get_init_inputs():
+    return [
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        output_padding,
+        groups,
+    ]
diff --git a/backends/triton/cpu/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py b/backends/triton/cpu/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py
new file mode 100644
index 0000000..91671df
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py
@@ -0,0 +1,309 @@
+# ruff: noqa: E731, A001, E402
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+# Auto-merged kernel + test for: 73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.log
+# Generated by merge script
+
+# kernel.py
+
+import torch
+import triton
+import triton.language as tl
+
+
+# -----------------------------------------------------------------------------
+# Triton kernel: naive gather-style transposed 3D conv. Each program computes a
+# BLOCK_SZ-long vector of flattened output positions for one (batch, output
+# channel) pair. Supports groups; loaded values are cast to FP32 to accumulate.
+# -----------------------------------------------------------------------------
+@triton.jit
+def _conv_transpose3d_kernel(
+    inp_ptr,  # pointer to input tensor
+    w_ptr,  # pointer to weight tensor
+    out_ptr,  # pointer to output tensor
+    # tensor dimensions
+    B,
+    C_in,
+    Di,
+    Hi,
+    Wi,
+    C_out,
+    Do,
+    Ho,
+    Wo,
+    # kernel size
+    kD,
+    kH,
+    kW,
+    # strides
+    sd,
+    sh,
+    sw,
+    # paddings
+    pd,
+    ph,
+    pw,
+    # grouping
+    groups,
+    # compile-time tile size
+    BLOCK_SZ: tl.constexpr,
+):
+    """Compute BLOCK_SZ output elements of a grouped transposed 3-D convolution.
+
+    All three tensors are addressed with flat row-major offsets computed from
+    the dimension arguments, i.e. the kernel takes no stride arguments and
+    treats input (B, C_in, Di, Hi, Wi), weight
+    (C_in, C_out // groups, kD, kH, kW) and output (B, C_out, Do, Ho, Wo) as
+    densely packed.
+
+    Grid layout:
+        axis 0: batch index,
+        axis 1: output channel,
+        axis 2: block of BLOCK_SZ flattened (od, oh, ow) positions.
+    """
+    # block indices
+    b = tl.program_id(0)  # batch index
+    oc = tl.program_id(1)  # output-channel index
+    sb_id = tl.program_id(2)  # spatial-block index
+
+    # each lane handles one element of a BLOCK_SZ-vector along the flattened
+    # (Do*Ho*Wo) dimension
+    offs = tl.arange(0, BLOCK_SZ)
+    flat_sp = sb_id * BLOCK_SZ + offs  # flattened output index
+    mask_sp = flat_sp < (Do * Ho * Wo)  # guard lanes past the last element
+
+    # decode flattened spatial index into od, oh, ow
+    od = flat_sp // (Ho * Wo)
+    rem = flat_sp % (Ho * Wo)
+    oh = rem // Wo
+    ow = rem % Wo
+
+    # FP32 accumulator
+    acc = tl.zeros((BLOCK_SZ,), dtype=tl.float32)
+
+    # channels per group
+    out_ch_per_grp = C_out // groups
+    in_ch_per_grp = C_in // groups
+
+    # which group this oc belongs to, its index inside the group, and the first
+    # input channel of that group
+    grp = oc // out_ch_per_grp
+    oc_local = oc % out_ch_per_grp
+    in_start = grp * in_ch_per_grp
+
+    # iterate over the input channels in this group and over kD, kH, kW
+    for ic_loc in range(in_ch_per_grp):
+        c_in = in_start + ic_loc
+        for kd in range(kD):
+            # invert the transposed-conv mapping in D: the tap contributes only
+            # when (od + pd - kd) is an exact multiple of the stride and the
+            # quotient lies inside the input
+            id_num = od + pd - kd
+            id = id_num // sd
+            mask_id = (id_num == id * sd) & (id >= 0) & (id < Di)
+            for kh in range(kH):
+                # invert in H
+                ih_num = oh + ph - kh
+                ih = ih_num // sh
+                mask_ih = (ih_num == ih * sh) & (ih >= 0) & (ih < Hi)
+                for kw in range(kW):
+                    # invert in W
+                    iw_num = ow + pw - kw
+                    iw = iw_num // sw
+                    mask_iw = (iw_num == iw * sw) & (iw >= 0) & (iw < Wi)
+
+                    # combine all masks, plus the original spatial mask
+                    m = mask_sp & mask_id & mask_ih & mask_iw
+
+                    # vectorized offsets into the flattened input
+                    inp_offset = (((b * C_in + c_in) * Di + id) * Hi + ih) * Wi + iw
+                    # masked load of the input; masked-off lanes read 0.0
+                    inp_val = tl.load(inp_ptr + inp_offset, mask=m, other=0.0).to(
+                        tl.float32
+                    )
+
+                    # scalar offset into the flattened weight
+                    w_offset = (
+                        ((c_in * out_ch_per_grp + oc_local) * kD + kd) * kH + kh
+                    ) * kW + kw
+                    w_val = tl.load(w_ptr + w_offset).to(tl.float32)
+
+                    acc += inp_val * w_val
+
+    # write-back: flatten (b, oc, od, oh, ow), cast to the output element type
+    # and store with mask_sp
+    out_offset = (((b * C_out + oc) * Do + od) * Ho + oh) * Wo + ow
+    out_val = acc.to(out_ptr.dtype.element_ty)
+    tl.store(out_ptr + out_offset, out_val, mask=mask_sp)
+
+
+# -----------------------------------------------------------------------------
+# Python wrapper: only does checks, allocates output, and launches Triton kernel.
+# -----------------------------------------------------------------------------
+def kernel_function(
+    input_tensor, weight, bias, stride, padding, output_padding, groups
+):
+    # --- Argument checks
+    assert isinstance(input_tensor, torch.Tensor), "input must be a Tensor"
+    assert isinstance(weight, torch.Tensor), "weight must be a Tensor"
+    assert bias is None, "this kernel does not support bias"
+    assert isinstance(stride, (int, tuple)), "stride must be int or tuple"
+    assert isinstance(padding, (int, tuple)), "padding must be int or tuple"
+    assert isinstance(groups, int) and groups >= 1, "groups must be a positive int"
+
+    # --- Shapes and device/dtype
+    B, C_in, Di, Hi, Wi = input_tensor.shape
+    Cin_w, Cout_grp, kD, kH, kW = weight.shape
+    assert Cin_w == C_in, "weight.shape[0] must match input channels"
+    C_out = Cout_grp * groups
+
+    # unify stride / padding / output_padding
+    if isinstance(stride, int):
+        sd = sh = sw = stride
+    else:
+        sd, sh, sw = stride
+    if isinstance(padding, int):
+        pd = ph = pw = padding
+    else:
+        pd, ph, pw = padding
+    if isinstance(output_padding, int):
+        opd = oph = opw = output_padding
+    else:
+        opd, oph, opw = output_padding
+
+    # compute output D/H/W for ConvTranspose3d
+    Do = (Di - 1) * sd - 2 * pd + kD + opd
+    Ho = (Hi - 1) * sh - 2 * ph + kH + oph
+    Wo = (Wi - 1) * sw - 2 * pw + kW + opw
+
+    # allocate output (bfloat16)
+    out = torch.empty(
+        (B, C_out, Do, Ho, Wo), device=input_tensor.device, dtype=input_tensor.dtype
+    )
+
+    # choose a block size for the flattened spatial dimension
+    BLOCK = 256
+    grid = (B, C_out, triton.cdiv(Do * Ho * Wo, BLOCK))
+
+    # launch
+    _conv_transpose3d_kernel[grid](
+        input_tensor,
+        weight,
+        out,
+        B,
+        C_in,
+        Di,
+        Hi,
+        Wi,
+        C_out,
+        Do,
+        Ho,
+        Wo,
+        kD,
+        kH,
+        kW,
+        sd,
+        sh,
+        sw,
+        pd,
+        ph,
+        pw,
+        groups,
+        BLOCK,  # compile‐time constant
+    )
+    return out[
+        :, :, : Do - opd, : Ho - oph, : Wo - opw
+    ]  # slice to remove output_padding area
+
+
+# ------------------------------
+# KernelBench-compatible Model
+# ------------------------------
+
+
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Performs a 3D transposed convolution operation with asymmetric input and square kernel.
+    The input is padded before the convolution.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Size of the square convolution kernel.
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        output_padding: int = 0,
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        # Keep the same init signature as the reference Model, but route compute to Triton kernel.
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.output_padding = output_padding
+        self.groups = groups
+        self.use_bias = bias
+
+        # Kernel wrapper constraints
+        assert not self.use_bias, (
+            "this kernel does not support bias (bias must be False / None)"
+        )
+        # output_padding is now handled in kernel_function
+        assert isinstance(groups, int) and groups >= 1, "groups must be a positive int"
+        assert in_channels % groups == 0, "in_channels must be divisible by groups"
+        assert out_channels % groups == 0, "out_channels must be divisible by groups"
+
+        kD = kH = kW = kernel_size
+        # Match nn.ConvTranspose3d parameter shapes:
+        # weight: (in_channels, out_channels/groups, kD, kH, kW)
+        self.weight = nn.Parameter(
+            torch.empty(in_channels, out_channels // groups, kD, kH, kW)
+        )
+        self.bias = None
+
+        # Reasonable init
+        nn.init.kaiming_uniform_(self.weight, a=5**0.5)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Performs the 3D transposed convolution.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, depth, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, depth_out, height_out, width_out).
+        """
+        return kernel_function(
+            x,
+            self.weight,
+            self.bias,
+            self.stride,
+            self.padding,
+            self.output_padding,
+            self.groups,
+        )
+
+
+# Test code
+batch_size = 4
+in_channels = 32
+out_channels = 32
+kernel_size = 3
+depth = 32
+height = 64
+width = 128
+stride = 2
+padding = 1
+groups = 4
+
+
+def get_inputs():
+    x = torch.rand(batch_size, in_channels, depth, height, width)
+    return [x]
+
+
+def get_init_inputs():
+    return [in_channels, out_channels, kernel_size, stride, padding, groups]
diff --git a/backends/triton/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py b/backends/triton/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py
new file mode 100644
index 0000000..830134b
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py
@@ -0,0 +1,199 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16, "GROUP_M": 8},
+            num_warps=4,
+            num_stages=2,
+        ),
+    ],
+    key=["C_IN", "C_out", "OL"],
+)
+@triton.jit
+def _conv_transpose1d_dilated_gemm(  # transposed 1D conv as one GEMM per kernel tap
+    x_ptr,  # input, addressed as n * sxn + l * sxl + c (channel stride taken as 1)
+    w_ptr,  # weights, addressed as k * swk + ci * swci + co * swco
+    y_ptr,  # output, addressed as n * syn + ol * syl + co (channel stride taken as 1)
+    N_batch,  # batch size
+    L_in,  # input length
+    C_out,  # number of output channels
+    OL,  # output length
+    sxn,
+    sxl,
+    swk,
+    swci,
+    swco,
+    syn,
+    syl,
+    BLOCK_M: tl.constexpr,  # tile rows: flattened (batch, output position) pairs
+    BLOCK_N: tl.constexpr,  # tile columns: output channels
+    BLOCK_K: tl.constexpr,  # input channels reduced per tl.dot
+    GROUP_M: tl.constexpr,  # row tiles per group in the program-id remapping
+    K_SIZE: tl.constexpr,  # number of kernel taps
+    C_IN: tl.constexpr,  # number of input channels
+    DILATION: tl.constexpr,  # spacing between kernel taps
+):
+    pid = tl.program_id(0)  # 1D launch; remapped to (pid_m, pid_n) below
+    M_total = N_batch * OL
+    num_pid_m = tl.cdiv(M_total, BLOCK_M)
+    num_pid_n = tl.cdiv(C_out, BLOCK_N)
+    num_pid_in_group = GROUP_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_M)  # last group may be short
+    pid_in_group = pid % num_pid_in_group
+    pid_m = first_pid_m + (pid_in_group % group_size_m)
+    pid_n = pid_in_group // group_size_m
+    # Row and column indices of this program's (BLOCK_M, BLOCK_N) output tile.
+    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    offs_k = tl.arange(0, BLOCK_K)
+    # Split each flattened row index into (batch, output position).
+    n_idx = offs_m // OL
+    ol_idx = offs_m % OL
+    # Guard the ragged last tiles in both dimensions.
+    mask_m = offs_m < M_total
+    mask_n = offs_n < C_out
+    # Accumulate in float32; x_base points at each row's batch slice.
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    x_base = x_ptr + n_idx * sxn
+    # Tap k reads input position ol - k * DILATION (no stride or padding term).
+    for k in range(K_SIZE):
+        il = ol_idx - k * DILATION
+        valid = (il >= 0) & (il < L_in) & mask_m
+        x_ptrs = x_base + il * sxl
+        # (C_IN, C_out) weight slice for tap k, walked BLOCK_K rows at a time.
+        w_bp = tl.make_block_ptr(
+            base=w_ptr + k * swk,
+            shape=(C_IN, C_out),
+            strides=(swci, swco),
+            offsets=(0, pid_n * BLOCK_N),
+            block_shape=(BLOCK_K, BLOCK_N),
+            order=(1, 0),
+        )
+        # Reduce over input channels in BLOCK_K-wide chunks.
+        for c0 in range(0, C_IN, BLOCK_K):
+            k_idx = c0 + offs_k
+            x_tile = tl.load(
+                x_ptrs[:, None] + k_idx[None, :],
+                mask=valid[:, None] & (k_idx[None, :] < C_IN),
+                other=0.0,
+            )
+            w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+            acc = tl.dot(x_tile, w_tile, acc)
+            w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+    # Cast to the output element type and write only the in-range part of the tile.
+    y_ptrs = y_ptr + n_idx * syn + ol_idx * syl
+    tl.store(
+        y_ptrs[:, None] + offs_n[None, :],
+        acc.to(y_ptr.dtype.element_ty),
+        mask=mask_m[:, None] & mask_n[None, :],
+    )
+
+
+class Model(nn.Module):
+    """ConvTranspose1d whose stride=1, padding=0 case runs a Triton GEMM kernel."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        bias=False,
+    ):
+        super(Model, self).__init__()
+        self.conv1d_transpose = nn.ConvTranspose1d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias,
+        )
+        self._dilation = self.conv1d_transpose.dilation[0]  # normalized 1-tuple
+        self._packed = False
+        self._pack_key = None
+
+    def _pack_weights(self):
+        """Cache the weight as a contiguous (K, C_in, C_out) tensor."""
+        w = self.conv1d_transpose.weight
+        self._w_kio = w.detach().permute(2, 0, 1).contiguous()
+        self._K, self._C_out = w.shape[2], w.shape[1]
+        self._pack_key = (w._version, w.device, w.dtype)
+        self._packed = True
+
+    def forward(self, x):
+        """Apply the transposed convolution to x of shape (B, C_in, L_in)."""
+        conv = self.conv1d_transpose
+        # Kernel assumes stride=1, padding=0; otherwise use the reference module.
+        if conv.stride[0] != 1 or conv.padding[0] != 0:
+            return conv(x)
+        w = conv.weight
+        # Repack after an in-place weight update, device move, or dtype change.
+        if not self._packed or self._pack_key != (w._version, w.device, w.dtype):
+            self._pack_weights()
+        B, C_in, L_in = x.shape
+        K, C_out = self._K, self._C_out
+        OL = L_in + self._dilation * (K - 1)
+        x_nlc = x.permute(0, 2, 1).contiguous()
+        y_nlc = torch.empty((B, OL, C_out), device=x.device, dtype=x.dtype)
+        grid = lambda meta: (
+            triton.cdiv(B * OL, meta["BLOCK_M"]) * triton.cdiv(C_out, meta["BLOCK_N"]),
+        )
+        _conv_transpose1d_dilated_gemm[grid](
+            x_nlc,
+            self._w_kio,
+            y_nlc,
+            B,
+            L_in,
+            C_out,
+            OL,
+            x_nlc.stride(0),
+            x_nlc.stride(1),
+            self._w_kio.stride(0),
+            self._w_kio.stride(1),
+            self._w_kio.stride(2),
+            y_nlc.stride(0),
+            y_nlc.stride(1),
+            K_SIZE=K,
+            C_IN=C_in,
+            DILATION=self._dilation,
+        )
+        result = y_nlc.permute(0, 2, 1).contiguous()
+        if conv.bias is not None:
+            result = result + conv.bias.view(1, -1, 1)
+        return result
+
+
+batch_size = 32  # get_inputs(): dim 0 of the input tensor
+in_channels = 32  # get_inputs(): dim 1 of the input tensor
+out_channels = 64
+kernel_size = 5
+length = 131072  # get_inputs(): dim 2 (sequence length) of the input tensor
+stride = 1  # stride=1, padding=0 is what Model.forward's OL formula assumes
+padding = 0
+dilation = 3
+
+
+def get_inputs():
+    """Return [x], where x is uniform random of shape (batch, channels, length)."""
+    return [torch.rand(batch_size, in_channels, length)]
+
+
+def get_init_inputs():  # same order as Model.__init__'s leading positional parameters
+    return [in_channels, out_channels, kernel_size, stride, padding, dilation]
diff --git a/backends/triton/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py b/backends/triton/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py
new file mode 100644
index 0000000..69289d0
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py
@@ -0,0 +1,353 @@
+# ruff: noqa: E731, E402
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+# Auto-merged kernel + test for: 75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.log
+# Generated by merge script
+
+# kernel.py
+import torch
+import triton
+import triton.language as tl
+
+# -----------------------------------------------------------------------------
+# Triton kernel: Fused 2D Transposed Convolution
+#
+# Loops over kernel spatial dims and input-channel group in one pass.
+# Accumulates in fp32 for stability, then casts to the output tensor's dtype on store.
+# Supports arbitrary asymmetric kernel sizes, padding, dilation, stride, and groups.
+# -----------------------------------------------------------------------------
+
+
+@triton.jit
+def _conv_transpose2d_kernel(
+    # Pointers
+    x_ptr,
+    w_ptr,
+    out_ptr,
+    total_elements,  # number of output elements; lanes at or beyond it are masked off
+    # Tensor dims (constexpr)
+    batch: tl.constexpr,
+    in_c: tl.constexpr,
+    in_h: tl.constexpr,
+    in_w: tl.constexpr,
+    out_c: tl.constexpr,
+    out_h: tl.constexpr,
+    out_w: tl.constexpr,
+    k_h: tl.constexpr,
+    k_w: tl.constexpr,
+    s_h: tl.constexpr,
+    s_w: tl.constexpr,
+    p_h: tl.constexpr,
+    p_w: tl.constexpr,
+    d_h: tl.constexpr,
+    d_w: tl.constexpr,
+    groups: tl.constexpr,
+    in_c_per_g: tl.constexpr,
+    out_c_per_g: tl.constexpr,
+    # Strides (in elements)
+    x_stride_b: tl.constexpr,
+    x_stride_c: tl.constexpr,
+    x_stride_h: tl.constexpr,
+    x_stride_w: tl.constexpr,
+    w_stride_ic: tl.constexpr,
+    w_stride_oc: tl.constexpr,
+    w_stride_kh: tl.constexpr,
+    w_stride_kw: tl.constexpr,
+    out_stride_b: tl.constexpr,
+    out_stride_c: tl.constexpr,
+    out_stride_h: tl.constexpr,
+    out_stride_w: tl.constexpr,
+    # Block size
+    BLOCK: tl.constexpr,
+):
+    """Compute BLOCK consecutive elements of the flattened (n, co, h, w) output.
+
+    Gather form: each output element sums the (tap, input channel) terms mapping to it.
+    """
+    pid = tl.program_id(0)
+    offs = pid * BLOCK + tl.arange(0, BLOCK)
+    mask_o = offs < total_elements
+
+    # Flattened index -> (n, co, i_out, j_out)
+    nc = out_c * out_h * out_w
+    n = offs // nc
+    rem = offs % nc
+    co = rem // (out_h * out_w)
+    rem2 = rem % (out_h * out_w)
+    i_out = rem2 // out_w
+    j_out = rem2 % out_w
+
+    # FP32 accumulator per lane
+    acc = tl.zeros((BLOCK,), dtype=tl.float32)
+
+    # Group of this output channel, first input channel of that group, and the
+    # output channel's index within the group (the weight's second dimension)
+    group_id = co // out_c_per_g
+    cin_start = group_id * in_c_per_g
+    co_local = co - group_id * out_c_per_g
+
+    # Loop over the kernel's height and width taps
+    for kh_i in range(k_h):
+        # invert transposed conv formula in H dimension:
+        #   i_out = i_in * s_h - p_h + kh * d_h
+        #   => i_in = (i_out + p_h - kh*d_h) / s_h   only valid if divisible
+        in_i_numer = i_out + p_h - kh_i * d_h
+        in_i = in_i_numer // s_h
+        valid_h = (in_i_numer == in_i * s_h) & (in_i >= 0) & (in_i < in_h)
+
+        for kw_j in range(k_w):
+            in_j_numer = j_out + p_w - kw_j * d_w
+            in_j = in_j_numer // s_w
+            valid_w = (in_j_numer == in_j * s_w) & (in_j >= 0) & (in_j < in_w)
+
+            # final per-lane validity mask
+            valid_spatial = mask_o & valid_h & valid_w
+
+            # sum over the input channels of this group
+            for ci in range(in_c_per_g):
+                c_in = cin_start + ci
+                x_off = (
+                    n * x_stride_b
+                    + c_in * x_stride_c
+                    + in_i * x_stride_h
+                    + in_j * x_stride_w
+                )
+                x_val = tl.load(x_ptr + x_off, mask=valid_spatial, other=0.0).to(
+                    tl.float32
+                )
+                w_off = (
+                    c_in * w_stride_ic
+                    + co_local * w_stride_oc
+                    + kh_i * w_stride_kh
+                    + kw_j * w_stride_kw
+                )
+                w_val = tl.load(w_ptr + w_off, mask=mask_o, other=0.0).to(tl.float32)
+                acc += x_val * w_val
+
+    # cast acc to the output tensor's element type and store
+    out_val = acc.to(out_ptr.dtype.element_ty)
+    out_off = (
+        n * out_stride_b
+        + co * out_stride_c
+        + i_out * out_stride_h
+        + j_out * out_stride_w
+    )
+    tl.store(out_ptr + out_off, out_val, mask=mask_o)
+
+
+def kernel_function(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias,
+    stride: tuple,
+    padding: tuple,
+    dilation: tuple,
+    groups: int,
+):
+    """
+    Grouped 2D transposed convolution computed by ``_conv_transpose2d_kernel``.
+
+    Args:
+        x: Input of shape (batch, in_c, in_h, in_w); float16, bfloat16 or float32.
+        weight: Shape (in_c, out_c // groups, k_h, k_w), same dtype as ``x``.
+        bias: Must be None; a bias term is not supported.
+        stride, padding, dilation: (height, width) pairs.
+        groups: Number of channel groups.
+
+    Returns:
+        Tensor of shape (batch, out_c, out_h, out_w) on ``x.device`` in ``x.dtype``.
+    """
+    assert bias is None, "Bias must be None for this kernel"
+    # The kernel upcasts loads to fp32 and casts to the output dtype on store, so
+    # float32 tensors (what get_inputs() and Model.weight hold) are valid too.
+    supported = (torch.bfloat16, torch.float16, torch.float32)
+    assert x.dtype in supported and weight.dtype == x.dtype, (
+        "Inputs must share one dtype: float16, bfloat16 or float32"
+    )
+
+    batch, in_c, in_h, in_w = x.shape
+    in_c_w, out_c_pg, k_h, k_w = weight.shape
+    assert in_c_w == in_c
+    out_c = out_c_pg * groups
+    s_h, s_w = stride
+    p_h, p_w = padding
+    d_h, d_w = dilation
+    in_c_per_g = in_c // groups
+    out_c_per_g = out_c // groups
+
+    # Output extent of a transposed convolution without output_padding.
+    out_h = (in_h - 1) * s_h - 2 * p_h + d_h * (k_h - 1) + 1
+    out_w = (in_w - 1) * s_w - 2 * p_w + d_w * (k_w - 1) + 1
+    out = torch.empty((batch, out_c, out_h, out_w), device=x.device, dtype=x.dtype)
+    total_elems = batch * out_c * out_h * out_w
+
+    # Strides are in elements and forwarded as-is, so x need not be contiguous.
+    x_stride_b, x_stride_c, x_stride_h, x_stride_w = x.stride()
+    w_stride_ic, w_stride_oc, w_stride_kh, w_stride_kw = weight.stride()
+    out_stride_b, out_stride_c, out_stride_h, out_stride_w = out.stride()
+    BLOCK = 256
+    grid = (triton.cdiv(total_elems, BLOCK),)
+    _conv_transpose2d_kernel[grid](
+        # pointers + total
+        x,
+        weight,
+        out,
+        total_elems,
+        # dims
+        batch,
+        in_c,
+        in_h,
+        in_w,
+        out_c,
+        out_h,
+        out_w,
+        k_h,
+        k_w,
+        s_h,
+        s_w,
+        p_h,
+        p_w,
+        d_h,
+        d_w,
+        groups,
+        in_c_per_g,
+        out_c_per_g,
+        # strides
+        x_stride_b,
+        x_stride_c,
+        x_stride_h,
+        x_stride_w,
+        w_stride_ic,
+        w_stride_oc,
+        w_stride_kh,
+        w_stride_kw,
+        out_stride_b,
+        out_stride_c,
+        out_stride_h,
+        out_stride_w,
+        # block size
+        BLOCK,
+    )
+    return out
+
+
+# ------------------------------
+# KernelBench-compatible Model
+# ------------------------------
+
+
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Performs a 2D transposed convolution operation with asymmetric input, asymmetric kernel,
+    grouped, padded, and dilated. The compute is delegated to ``kernel_function``.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (tuple or int): Kernel size (height, width); an int means a square kernel.
+        stride (tuple or int, optional): Stride of the convolution (height, width). Defaults to (1, 1).
+        padding (tuple or int, optional): Padding applied to the input (height, width). Defaults to (0, 0).
+        dilation (tuple or int, optional): Spacing between kernel elements (height, width). Defaults to (1, 1).
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): Must be `False`; the Triton kernel has no bias support
+            (passing `True` fails an assertion). Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: tuple,
+        stride: tuple = (1, 1),
+        padding: tuple = (0, 0),
+        dilation: tuple = (1, 1),
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+
+        def _pair(value):
+            # Normalize an int or a sequence to a tuple; sequences of any length
+            # pass through so the length checks below can reject them.
+            return tuple(value) if isinstance(value, (list, tuple)) else (value, value)
+
+        # Same init signature as the reference Model; compute goes to the Triton kernel.
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.stride = _pair(stride)
+        self.padding = _pair(padding)
+        self.dilation = _pair(dilation)
+        self.groups = groups
+        self.use_bias = bias
+
+        # Kernel wrapper constraints
+        assert not self.use_bias, "Bias must be False / None for this kernel"
+        assert len(self.stride) == 2, "stride must have 2 elements (sh, sw)"
+        assert len(self.padding) == 2, "padding must have 2 elements (ph, pw)"
+        assert len(self.dilation) == 2, "dilation must have 2 elements (dh, dw)"
+        assert isinstance(self.groups, int) and self.groups >= 1, (
+            "groups must be a positive int"
+        )
+        assert in_channels % self.groups == 0, "in_channels must be divisible by groups"
+        assert out_channels % self.groups == 0, (
+            "out_channels must be divisible by groups"
+        )
+
+        # kernel_size may be a single int (square kernel) or a (height, width) pair.
+        k_h, k_w = _pair(kernel_size)
+        # Match nn.ConvTranspose2d parameter shapes:
+        # weight: (in_channels, out_channels/groups, k_h, k_w)
+        self.weight = nn.Parameter(
+            torch.empty(in_channels, out_channels // self.groups, k_h, k_w)
+        )
+        self.bias = None
+
+        # Reasonable init
+        nn.init.kaiming_uniform_(self.weight, a=5**0.5)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Performs the 2D transposed convolution.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
+        """
+        return kernel_function(
+            x,
+            self.weight,
+            self.bias,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+        )
+
+
+batch_size = 16  # get_inputs(): dim 0 of the input tensor
+in_channels = 32  # get_inputs(): dim 1 of the input tensor
+out_channels = 64
+kernel_size = (3, 5)  # (height, width)
+height = 128
+width = 256
+stride = (2, 3)  # (height, width)
+padding = (1, 2)  # (height, width)
+dilation = (2, 1)  # (height, width)
+groups = 4
+
+
+def get_inputs():
+    """Return [x], where x is uniform random of shape (batch, channels, H, W)."""
+    return [torch.rand(batch_size, in_channels, height, width)]
+
+
+def get_init_inputs():  # same order as Model.__init__'s leading positional parameters
+    return [in_channels, out_channels, kernel_size, stride, padding, dilation, groups]
diff --git a/backends/triton/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py b/backends/triton/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py
new file mode 100644
index 0000000..78155ce
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py
@@ -0,0 +1,174 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_OL": 64, "BLOCK_N": 128}, num_warps=4, num_stages=3),
+    ],
+    key=["OL", "OC"],
+)
+@triton.jit
+def _conv1d_kernel(  # strided, dilated 1D convolution as one GEMM per kernel tap
+    x_ptr,  # input, addressed as n * stride_xn + c * stride_xc + l (length stride taken as 1)
+    w_ptr,  # weights, addressed as k * stride_wk + ci * stride_wi + co * stride_wo
+    y_ptr,  # output, addressed as n * stride_yn + co * stride_yoc + ol (length stride taken as 1)
+    OL,  # output length
+    OC,  # number of output channels
+    stride_xn,
+    stride_xc,
+    stride_wk,
+    stride_wi,
+    stride_wo,
+    stride_yn,
+    stride_yoc,
+    C_IN: tl.constexpr,  # number of input channels
+    BLOCK_K: tl.constexpr,  # channel lanes per tl.dot; there is no loop over channels
+    KERNEL_SIZE: tl.constexpr,  # number of kernel taps
+    STRIDE_CONV: tl.constexpr,  # convolution stride
+    DILATION: tl.constexpr,  # spacing between kernel taps
+    BLOCK_OL: tl.constexpr,  # output positions per program
+    BLOCK_N: tl.constexpr,  # output channels per program
+):
+    n = tl.program_id(0)
+    pid_ol = tl.program_id(1)
+    pid_n = tl.program_id(2)
+    # Tile of BLOCK_OL output positions x BLOCK_N output channels for batch n.
+    ol_start = pid_ol * BLOCK_OL
+    offs_ol = ol_start + tl.arange(0, BLOCK_OL)
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    # Guard the ragged last tile in each dimension.
+    mask_ol = offs_ol < OL
+    mask_n = offs_n < OC
+    # Products are accumulated in float32.
+    acc = tl.zeros((BLOCK_OL, BLOCK_N), dtype=tl.float32)
+    # Channels at index >= BLOCK_K are never read; the batch offset is widened to int64.
+    offs_k = tl.arange(0, BLOCK_K)
+    x_batch = x_ptr + n.to(tl.int64) * stride_xn
+    # Tap k reads input position ol * STRIDE_CONV + k * DILATION.
+    for k in range(KERNEL_SIZE):
+        in_pos = offs_ol * STRIDE_CONV + k * DILATION
+        # Gather the (BLOCK_OL, BLOCK_K) input tile; lanes with channel >= C_IN load 0.
+        x_addrs = x_batch + offs_k[None, :].to(tl.int64) * stride_xc + in_pos[:, None]
+        x_tile = tl.load(
+            x_addrs, mask=mask_ol[:, None] & (offs_k[None, :] < C_IN), other=0.0
+        )
+        # (C_IN, OC) weight slice for tap k, zero-padded outside its bounds.
+        w_bp = tl.make_block_ptr(
+            base=w_ptr + k * stride_wk,
+            shape=(C_IN, OC),
+            strides=(stride_wi, stride_wo),
+            offsets=(0, pid_n * BLOCK_N),
+            block_shape=(BLOCK_K, BLOCK_N),
+            order=(1, 0),
+        )
+        w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+        # acc += x_tile @ w_tile
+        acc = tl.dot(x_tile, w_tile, acc)
+    # The result is always stored as float16, whatever y_ptr's element type is.
+    y_batch = y_ptr + n.to(tl.int64) * stride_yn
+    y_addrs = y_batch + offs_n[None, :].to(tl.int64) * stride_yoc + offs_ol[:, None]
+    tl.store(y_addrs, acc.to(tl.float16), mask=mask_ol[:, None] & mask_n[None, :])
+
+
+class Model(nn.Module):
+    """Strided, dilated Conv1d evaluated in float16 on CPU by a Triton kernel."""
+
+    def __init__(
+        self, in_channels, out_channels, kernel_size, stride=1, dilation=1, bias=False
+    ):
+        super().__init__()
+        self.conv1d = nn.Conv1d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            dilation=dilation,
+            bias=bias,
+        )
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size_val = kernel_size
+        self.stride_val = stride
+        self.dilation_val = dilation
+        self._packed = False
+        self._pack_key = None
+
+    def _pack_weights(self):
+        """Cache the weight on CPU as contiguous float16 of shape (K, C_in, C_out)."""
+        w = self.conv1d.weight
+        w_kio = w.detach().permute(2, 1, 0).contiguous()
+        self.w_kio = w_kio.to("cpu", dtype=torch.float16).contiguous()
+        self._pack_key = (w._version, w.device, w.dtype)
+        self._packed = True
+
+    def forward(self, x):
+        """Return conv1d(x) as a float16 CPU tensor of shape (N, OC, OL)."""
+        w = self.conv1d.weight
+        # Repack after an in-place weight update, device move, or dtype change.
+        if not self._packed or self._pack_key != (w._version, w.device, w.dtype):
+            self._pack_weights()
+        # Both calls return x itself when it is already float16, CPU, contiguous.
+        x = x.to("cpu", dtype=torch.float16).contiguous()
+
+        N, IC, L = x.shape
+        OC = self.out_channels
+        KS, S, D = self.kernel_size_val, self.stride_val, self.dilation_val
+        OL = (L - D * (KS - 1) - 1) // S + 1
+        BLOCK_K = triton.next_power_of_2(max(16, IC))
+        y = torch.empty((N, OC, OL), device="cpu", dtype=torch.float16)
+        grid = lambda META: (
+            N,
+            triton.cdiv(OL, META["BLOCK_OL"]),
+            triton.cdiv(OC, META["BLOCK_N"]),
+        )
+        _conv1d_kernel[grid](
+            x,
+            self.w_kio,
+            y,
+            OL,
+            OC,
+            x.stride(0),
+            x.stride(1),
+            self.w_kio.stride(0),
+            self.w_kio.stride(1),
+            self.w_kio.stride(2),
+            y.stride(0),
+            y.stride(1),
+            C_IN=IC,
+            BLOCK_K=BLOCK_K,
+            KERNEL_SIZE=KS,
+            STRIDE_CONV=S,
+            DILATION=D,
+        )
+        # The kernel covers only the weight term, so add the bias here.
+        if self.conv1d.bias is not None:
+            bias = self.conv1d.bias.detach().to("cpu", dtype=y.dtype)
+            y = y + bias.view(1, -1, 1)
+        return y
+
+
+batch_size = 64  # get_inputs(): dim 0 of the input tensor
+in_channels = 64  # get_inputs(): dim 1 of the input tensor
+out_channels = 128
+kernel_size = 3
+length = 524280  # get_inputs(): dim 2 (sequence length) of the input tensor
+stride = 3
+dilation = 4
+
+
+def get_inputs():
+    """Return [x], where x is uniform random of shape (batch, channels, length)."""
+    return [torch.rand(batch_size, in_channels, length)]
+
+
+def get_init_inputs():  # same order as Model.__init__'s leading positional parameters
+    return [in_channels, out_channels, kernel_size, stride, dilation]
diff --git a/backends/triton/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py b/backends/triton/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py
new file mode 100644
index 0000000..9942ff4
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py
@@ -0,0 +1,259 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def _compute_output_size(input_size, kernel_size, stride, padding, dilation):
+    return stride * (input_size - 1) + dilation * (kernel_size - 1) - 2 * padding + 1
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_W": 16, "BLOCK_OC": 32}, num_warps=4, num_stages=4),
+    ],
+    key=["W_act", "C_out"],
+)
+@triton.jit
+def _conv_transpose3d_v5(  # index math below matches stride=2, padding=1, dilation=2
+    x_ptr,  # input; channel stride is taken to be 1 (see x_bp strides)
+    w_ptr,  # weights, addressed as a contiguous (K*K*K, C_IN, C_out) array
+    out_ptr,  # output; channel stride is taken to be 1 (see out_ptrs)
+    D: tl.constexpr,  # D/H/W: input extents
+    H: tl.constexpr,
+    W: tl.constexpr,
+    C_out: tl.constexpr,
+    D_out: tl.constexpr,  # D_out/H_out/W_out are not read in the kernel body
+    H_out: tl.constexpr,
+    W_out: tl.constexpr,
+    D_act: tl.constexpr,  # D_act/H_act/W_act: per-axis counts of positions this kernel computes
+    H_act: tl.constexpr,
+    W_act: tl.constexpr,
+    sx_b,
+    sx_d,
+    sx_h,
+    sx_w,
+    so_b,
+    so_d,
+    so_h,
+    so_w,
+    BLOCK_W: tl.constexpr,
+    BLOCK_OC: tl.constexpr,
+    K: tl.constexpr,  # kernel extent, shared by all three axes
+    C_IN: tl.constexpr,  # input channels; used whole as the tl.dot reduction dimension
+):
+    pid_w = tl.program_id(0)
+    pid_bdh = tl.program_id(1)
+    pid_oc = tl.program_id(2)
+    # Decode the flat program id into (batch, d_idx, h_idx).
+    b = pid_bdh // (D_act * H_act)
+    rem = pid_bdh % (D_act * H_act)
+    d_idx = rem // H_act
+    h_idx = rem % H_act
+    # This program's BLOCK_W x BLOCK_OC tile of (w index, output channel).
+    offs_w_idx = pid_w * BLOCK_W + tl.arange(0, BLOCK_W)
+    offs_oc = pid_oc * BLOCK_OC + tl.arange(0, BLOCK_OC)
+    # Accumulate in float32.
+    acc = tl.zeros((BLOCK_W, BLOCK_OC), dtype=tl.float32)
+    # Widen the batch index so the batch offset is computed in 64 bits.
+    x_batch_base = x_ptr + b.to(tl.int64) * sx_b
+    # Tap k reads input index idx + 1 - k on each axis; out-of-range d/h taps are skipped.
+    for kd in range(K):
+        d_in = d_idx + 1 - kd
+        d_ok = (d_in >= 0) & (d_in < D)
+        if d_ok:
+            for kh in range(K):
+                h_in = h_idx + 1 - kh
+                h_ok = (h_in >= 0) & (h_in < H)
+                if h_ok:
+                    x_dh_base = x_batch_base + d_in * sx_d + h_in * sx_h
+                    # W taps: out-of-range columns are zero-filled by the block load.
+                    for kw in range(K):
+                        w_in_start = pid_w * BLOCK_W + 1 - kw
+                        # (BLOCK_W, C_IN) input tile starting at column w_in_start.
+                        x_bp = tl.make_block_ptr(
+                            base=x_dh_base,
+                            shape=(W, C_IN),
+                            strides=(sx_w, 1),
+                            offsets=(w_in_start, 0),
+                            block_shape=(BLOCK_W, C_IN),
+                            order=(1, 0),
+                        )
+                        x_tile = tl.load(
+                            x_bp, boundary_check=(0,), padding_option="zero"
+                        )
+                        # (C_IN, BLOCK_OC) weight tile for tap (kd, kh, kw).
+                        kidx = kd * K * K + kh * K + kw
+                        w_bp = tl.make_block_ptr(
+                            base=w_ptr + kidx * C_IN * C_out,
+                            shape=(C_IN, C_out),
+                            strides=(C_out, 1),
+                            offsets=(0, pid_oc * BLOCK_OC),
+                            block_shape=(C_IN, BLOCK_OC),
+                            order=(1, 0),
+                        )
+                        w_tile = tl.load(
+                            w_bp, boundary_check=(1,), padding_option="zero"
+                        )
+                        # acc += x_tile @ w_tile
+                        acc += tl.dot(x_tile, w_tile)
+    # Results land on odd output coordinates 2 * idx + 1; nothing else is written here.
+    d_out = 2 * d_idx + 1
+    h_out = 2 * h_idx + 1
+    w_out = 2 * offs_w_idx + 1
+    out_base = out_ptr + b.to(tl.int64) * so_b + d_out * so_d + h_out * so_h
+    out_ptrs = out_base + w_out[:, None] * so_w + offs_oc[None, :]
+    out_mask = (offs_w_idx[:, None] < W_act) & (offs_oc[None, :] < C_out)
+    tl.store(out_ptrs, acc.to(tl.float16), mask=out_mask)
+
+
+@triton.jit
+def _zero_kernel(out_ptr, N, BLOCK: tl.constexpr):
+    """Store float16 zeros at offsets [0, N) from out_ptr, BLOCK per program."""
+    idx = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
+    zeros = tl.zeros((BLOCK,), dtype=tl.float16)
+    tl.store(out_ptr + idx, zeros, mask=idx < N)
+
+
+class Model(nn.Module):
+    """ConvTranspose3d with a Triton fast path for stride=2, padding=1, dilation=2."""
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.has_bias = bias
+        self.conv_transpose3d = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size=(kernel_size, kernel_size, kernel_size),
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias,
+        )
+        self._packed = False
+        self._buf_allocated = False
+
+    def _pack_weights(self):
+        """Cache the weight as a contiguous float16 (K*K*K, C_in, C_out) tensor."""
+        w = self.conv_transpose3d.weight.data
+        K = self.kernel_size
+        self.weight_packed = (
+            w.permute(2, 3, 4, 0, 1)
+            .reshape(K * K * K, self.in_channels, self.out_channels)
+            .contiguous()
+            .to(dtype=torch.float16)
+        )
+        self._packed = True
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return the float16 transposed convolution of x (B, C_in, D, H, W)."""
+        # The kernel's index math (in = idx + 1 - k, out = 2 * idx + 1) only holds
+        # for stride=2, padding=1, dilation=2; defer every other setup to PyTorch.
+        if (self.stride, self.padding, self.dilation) != (2, 1, 2):
+            ref = self.conv_transpose3d
+            return ref(x.to(ref.weight.dtype)).to(torch.float16)
+        if not self._packed:
+            self._pack_weights()
+        B, C_in, D, H, W = x.shape
+        K = self.kernel_size
+        D_out = _compute_output_size(D, K, self.stride, self.padding, self.dilation)
+        H_out = _compute_output_size(H, K, self.stride, self.padding, self.dilation)
+        W_out = _compute_output_size(W, K, self.stride, self.padding, self.dilation)
+        D_act, H_act, W_act = D_out // 2, H_out // 2, W_out // 2
+        x_cl3d = x.to(dtype=torch.float16).contiguous(
+            memory_format=torch.channels_last_3d
+        )
+
+        # Reallocate when any dimension or the device changes, not only the batch.
+        out_shape = (B, self.out_channels, D_out, H_out, W_out)
+        if (
+            not self._buf_allocated
+            or self._output_buf.shape != out_shape
+            or self._output_buf.device != x.device
+        ):
+            self._output_buf = torch.empty(
+                out_shape, device=x.device, dtype=torch.float16
+            ).contiguous(memory_format=torch.channels_last_3d)
+            self._buf_allocated = True
+        # Reused across calls; zeroed because the kernel skips even positions.
+        output = self._output_buf
+        N = output.numel()
+        BLOCK = 1024
+        _zero_kernel[(triton.cdiv(N, BLOCK),)](output, N, BLOCK=BLOCK)
+
+        grid = lambda META: (
+            triton.cdiv(W_act, META["BLOCK_W"]),
+            B * D_act * H_act,
+            triton.cdiv(self.out_channels, META["BLOCK_OC"]),
+        )
+        _conv_transpose3d_v5[grid](
+            x_cl3d,
+            self.weight_packed,
+            output,
+            D,
+            H,
+            W,
+            self.out_channels,
+            D_out,
+            H_out,
+            W_out,
+            D_act,
+            H_act,
+            W_act,
+            x_cl3d.stride(0),
+            x_cl3d.stride(2),
+            x_cl3d.stride(3),
+            x_cl3d.stride(4),
+            output.stride(0),
+            output.stride(2),
+            output.stride(3),
+            output.stride(4),
+            K=K,
+            C_IN=C_in,
+        )
+        if self.conv_transpose3d.bias is not None:
+            bias = self.conv_transpose3d.bias.detach().to(torch.float16)
+            return output + bias.view(1, -1, 1, 1, 1)
+        return output
+
+
+batch_size = 16  # get_inputs(): dim 0 of the input tensor
+in_channels = 32  # get_inputs(): dim 1 of the input tensor
+out_channels = 64
+kernel_size = 3
+depth = 16  # get_inputs(): dims 2-4 are (depth, height, width)
+height = 32
+width = 32
+stride = 2  # stride=2, padding=1, dilation=2 matches _conv_transpose3d_v5's index math
+padding = 1
+dilation = 2
+
+
+def get_inputs():
+    """Return [x], where x is uniform random of shape (batch, channels, D, H, W)."""
+    return [torch.rand(batch_size, in_channels, depth, height, width)]
+
+
+def get_init_inputs():  # same order as Model.__init__'s leading positional parameters
+    return [in_channels, out_channels, kernel_size, stride, padding, dilation]
diff --git a/backends/triton/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py b/backends/triton/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py
new file mode 100644
index 0000000..854892d
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py
@@ -0,0 +1,244 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+
+
+@triton.autotune(  # only one configuration is listed, so there is nothing to choose between
+    configs=[
+        triton.Config(
+            {
+                "BLOCK_OW": 64,  # output positions (along one output row) per tile
+                "BLOCK_N": 32,  # output channels per tile
+                "BLOCK_K": 32,  # input channels consumed per tl.dot step
+                "GROUP_SIZE_M": 8,  # row tiles grouped together by the program-id remapping
+                "grf_mode": "128",  # forwarded to the kernel, where it is never read
+            },
+            num_warps=4,
+            num_stages=3,
+        ),
+    ],
+    key=["H", "W", "C_IN", "C_out", "OH", "OW"],
+)
+@triton.jit
+def _conv_transpose2d_swizzled_v2(  # stride-1 KH x KW correlation over an NHWC input, stored as fp16
+    x_ptr,  # input, addressed as rows of C_IN contiguous values: row = n*H*W + h*W + w
+    w_ptr,  # weights, addressed through the four strides below
+    y_ptr,  # output, addressed as rows of C_out contiguous values: row = n*OH*OW + oh*OW + ow
+    N_batch,  # not read in the body; the batch index comes from program_id(1)
+    H,  # input height as indexed here (the caller in this file passes the padded height)
+    W,  # input width as indexed here (the caller in this file passes the padded width)
+    C_out,
+    OH,
+    OW,
+    stride_wkh,  # weight strides for kh, kw, input channel, output channel
+    stride_wkw,
+    stride_wci,
+    stride_wco,
+    BLOCK_OW: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    grf_mode: tl.constexpr,  # accepted but unused in the body
+):
+    n = tl.program_id(1)  # batch index
+    pid = tl.program_id(0)  # flattened (row tile, channel tile) index, remapped below
+
+    num_ow_tiles = tl.cdiv(OW, BLOCK_OW)
+    num_n_tiles = tl.cdiv(C_out, BLOCK_N)
+    num_m_tiles = OH * num_ow_tiles  # one "m" tile is BLOCK_OW positions of a single output row
+
+    group_id = pid // (GROUP_SIZE_M * num_n_tiles)  # groups hold GROUP_SIZE_M row tiles each
+    first_m_in_group = group_id * GROUP_SIZE_M
+    group_size = min(num_m_tiles - first_m_in_group, GROUP_SIZE_M)  # the last group may be short
+    local_id = pid % (group_size * num_n_tiles)
+    m_local = local_id // num_n_tiles
+    pid_n = local_id % num_n_tiles
+    m_idx = first_m_in_group + m_local
+
+    oh = m_idx // num_ow_tiles  # output row handled by this program
+    pid_ow = m_idx % num_ow_tiles
+    ow0 = pid_ow * BLOCK_OW  # first output column of the tile
+
+    HW = H * W
+    OHOW = OH * OW
+
+    acc = tl.zeros((BLOCK_OW, BLOCK_N), dtype=tl.float32)  # fp32 accumulator
+
+    for kh in range(KH):
+        for kw in range(KW):
+            x_row_start = n * HW + (oh + kh) * W + (ow0 + kw)  # input pixel under tap (kh, kw)
+            x_valid_rows = W - (ow0 + kw)  # pixels left before the end of this input row
+
+            x_bp = tl.make_block_ptr(
+                base=x_ptr,
+                shape=(x_row_start + x_valid_rows, C_IN),  # row bound = end of the current input row
+                strides=(C_IN, 1),
+                offsets=(x_row_start, 0),
+                block_shape=(BLOCK_OW, BLOCK_K),
+                order=(1, 0),
+            )
+
+            w_bp = tl.make_block_ptr(
+                base=w_ptr + kh * stride_wkh + kw * stride_wkw,  # (C_IN, C_out) slice for this tap
+                shape=(C_IN, C_out),
+                strides=(stride_wci, stride_wco),
+                offsets=(0, pid_n * BLOCK_N),
+                block_shape=(BLOCK_K, BLOCK_N),
+                order=(1, 0),
+            )
+
+            for c0 in range(0, C_IN, BLOCK_K):  # walk the input channels in BLOCK_K steps
+                x_tile = tl.load(x_bp, boundary_check=(0, 1), padding_option="zero")
+                w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+                acc = tl.dot(x_tile, w_tile, acc)
+                x_bp = tl.advance(x_bp, (0, BLOCK_K))
+                w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+
+    y_row_start = n * OHOW + oh * OW + ow0
+    y_valid_rows = OW - ow0  # positions left before the end of this output row
+    y_bp = tl.make_block_ptr(
+        base=y_ptr,
+        shape=(y_row_start + y_valid_rows, C_out),  # row bound = end of the current output row
+        strides=(C_out, 1),
+        offsets=(y_row_start, pid_n * BLOCK_N),
+        block_shape=(BLOCK_OW, BLOCK_N),
+        order=(1, 0),
+    )
+    tl.store(y_bp, acc.to(tl.float16), boundary_check=(0, 1))  # result is always written as fp16
+
+
+class Model(nn.Module):
+    """Stride-1 ConvTranspose2d computed in fp16 on CPU by a Triton kernel."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        groups=1,
+        bias=False,
+    ):
+        super().__init__()
+        self.conv_transpose2d = nn.ConvTranspose2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+        self._packed = False
+
+    def _to_pair(self, x):
+        """Return ``x`` as a 2-tuple, duplicating a bare int."""
+        return (x, x) if isinstance(x, int) else tuple(x)
+
+    def _pack_weights(self):
+        """Cache the flipped fp16 weights as (KH, KW, C_in, C_out) plus the pad sizes."""
+        device = torch.device("cpu")
+        weight = self.conv_transpose2d.weight.data.detach()
+        self.w_hwio = (
+            weight.flip(2, 3)
+            .permute(2, 3, 0, 1)
+            .to(device, dtype=torch.float16)
+            .contiguous()
+        )
+        self.KH = weight.shape[2]
+        self.KW = weight.shape[3]
+        self.C_in_val = weight.shape[0]
+        self.C_out_val = weight.shape[1]
+        # Compute padding for flip+pad conversion
+        pad = self._to_pair(self.conv_transpose2d.padding)
+        self._pad_h = self.KH - 1 - pad[0]
+        self._pad_w = self.KW - 1 - pad[1]
+        self._packed = True
+
+    def forward(self, x):
+        """Return the transposed convolution of NCHW ``x`` as an fp16 CPU tensor."""
+        conv = self.conv_transpose2d
+        if conv.stride != (1, 1) or conv.output_padding != (0, 0) or conv.groups != 1:
+            raise NotImplementedError("only stride=1, output_padding=0, groups=1")
+        device = torch.device("cpu")
+        if not self._packed:
+            self._pack_weights()
+
+        x = x.to(device, dtype=torch.float16)
+        N, C_in, H, W_in = x.shape
+        if C_in != self.C_in_val:
+            raise ValueError(f"expected {self.C_in_val} input channels, got {C_in}")
+        KH, KW = self.KH, self.KW
+        C_out = self.C_out_val
+        pad_h, pad_w = self._pad_h, self._pad_w
+
+        # Rebuild the padded NHWC input and the output on every call.  Keying a
+        # cache on x.data_ptr() is unsafe: the allocator can hand the same
+        # address to a different tensor, so stale activations would be reused.
+        x_pad = F.pad(x, (pad_w, pad_w, pad_h, pad_h))
+        x_nhwc = x_pad.contiguous(memory_format=torch.channels_last).permute(0, 2, 3, 1)
+        H_pad, W_pad = H + 2 * pad_h, W_in + 2 * pad_w
+        OH, OW = H_pad - KH + 1, W_pad - KW + 1
+        y_nhwc = torch.empty((N, OH, OW, C_out), device=device, dtype=torch.float16)
+        y = y_nhwc.permute(0, 3, 1, 2)  # NCHW view over channels-last storage
+
+        def grid(META):
+            num_ow = triton.cdiv(OW, META["BLOCK_OW"])
+            num_n = triton.cdiv(C_out, META["BLOCK_N"])
+            return (OH * num_ow * num_n, N)
+
+        _conv_transpose2d_swizzled_v2[grid](
+            x_nhwc,
+            self.w_hwio,
+            y_nhwc,
+            N,
+            H_pad,
+            W_pad,
+            C_out,
+            OH,
+            OW,
+            self.w_hwio.stride(0),
+            self.w_hwio.stride(1),
+            self.w_hwio.stride(2),
+            self.w_hwio.stride(3),
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+        )
+
+        if conv.bias is not None:
+            y = y + conv.bias.detach().to(y).view(1, -1, 1, 1)
+        return y
+
+
+batch_size = 8  # first dimension of the tensor built by get_inputs()
+in_channels = 32  # second dimension of that tensor; also the first constructor argument
+out_channels = 32
+kernel_size = (3, 7)
+height = 512  # third dimension of the input tensor
+width = 1024  # fourth dimension of the input tensor
+stride = (1, 1)
+padding = (1, 3)
+
+
+def get_inputs():  # builds one random 4-D tensor and returns it wrapped in a list
+    x = torch.rand(batch_size, in_channels, height, width)
+    return [x]
+
+
+def get_init_inputs():  # presumably unpacked positionally into Model(...) by the harness; confirm
+    return [in_channels, out_channels, kernel_size, stride, padding]
diff --git a/backends/triton/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py b/backends/triton/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py
new file mode 100644
index 0000000..84f13ff
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py
@@ -0,0 +1,180 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(  # only one configuration is listed
+    configs=[
+        triton.Config({"BLOCK_L": 128}, num_warps=4, num_stages=2),
+    ],
+    key=["N_NONZERO"],
+)
+@triton.jit
+def _conv_transpose1d_kernel(  # writes every second output position, starting at position 1
+    inp_ptr,  # input, addressed as (batch, L_IN, IN_CHANNELS) with channels contiguous
+    w_ptr,  # weights, addressed as (KERNEL_SIZE, IN_CHANNELS, OUT_CHANNELS), contiguous
+    out_ptr,  # output, addressed as (batch, L_OUT, OUT_CHANNELS) with channels contiguous
+    L_IN,
+    L_OUT,
+    N_NONZERO,  # number of output positions this kernel writes per batch element
+    IN_CHANNELS: tl.constexpr,
+    OUT_CHANNELS: tl.constexpr,
+    KERNEL_SIZE: tl.constexpr,
+    BLOCK_L: tl.constexpr,
+    BLOCK_K: tl.constexpr,  # input-channel tile; only one such tile is loaded (no channel loop)
+    BLOCK_OC: tl.constexpr,  # output-channel tile; only one such tile is computed
+):
+    pid = tl.program_id(0)
+    num_l_tiles = tl.cdiv(N_NONZERO, BLOCK_L)
+    batch_idx = pid // num_l_tiles
+    j0 = (pid % num_l_tiles) * BLOCK_L  # first written-position index j of this tile
+
+    batch_off_in = batch_idx.to(tl.int64) * L_IN * IN_CHANNELS  # int64 batch offset
+
+    acc = tl.zeros((BLOCK_L, BLOCK_OC), dtype=tl.float32)
+
+    for k in range(KERNEL_SIZE):
+        i_off = j0 + 1 - k  # hard-coded tap mapping i = j + 1 - k; consistent with stride=2, padding=1, dilation=2
+
+        inp_bp = tl.make_block_ptr(
+            base=inp_ptr + batch_off_in,
+            shape=(L_IN, IN_CHANNELS),
+            strides=(IN_CHANNELS, 1),
+            offsets=(i_off, 0),  # may start below 0 or run past L_IN; handled by the boundary check
+            block_shape=(BLOCK_L, BLOCK_K),
+            order=(1, 0),
+        )
+        w_bp = tl.make_block_ptr(
+            base=w_ptr + k * IN_CHANNELS * OUT_CHANNELS,  # (IN_CHANNELS, OUT_CHANNELS) slice for tap k
+            shape=(IN_CHANNELS, OUT_CHANNELS),
+            strides=(OUT_CHANNELS, 1),
+            offsets=(0, 0),
+            block_shape=(BLOCK_K, BLOCK_OC),
+            order=(1, 0),
+        )
+
+        a = tl.load(inp_bp, boundary_check=(0, 1), padding_option="zero")
+        b = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+        acc = tl.dot(a, b, acc)
+
+    batch_off_out = batch_idx.to(tl.int64) * L_OUT * OUT_CHANNELS + OUT_CHANNELS  # + one row: start at position 1
+    out_bp = tl.make_block_ptr(
+        base=out_ptr + batch_off_out,
+        shape=(N_NONZERO, OUT_CHANNELS),
+        strides=(2 * OUT_CHANNELS, 1),  # row stride of two positions: every other output position
+        offsets=(j0, 0),
+        block_shape=(BLOCK_L, BLOCK_OC),
+        order=(1, 0),
+    )
+    tl.store(out_bp, acc.to(tl.float16), boundary_check=(0, 1))  # result is always written as fp16
+
+
+class Model(nn.Module):
+    """ConvTranspose1d specialised to stride=2, padding=1, dilation=2 (fp16, CPU)."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        bias=False,
+    ):
+        super().__init__()
+        self.conv1d_transpose = nn.ConvTranspose1d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias,
+        )
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.stride_val = stride
+        self.padding_val = padding
+        self.dilation_val = dilation
+        self._packed = False
+
+    def _pack_weights(self):
+        """Cache the weights once as a contiguous fp16 (K, C_in, C_out) tensor."""
+        device = torch.device("cpu")
+        w = self.conv1d_transpose.weight.data.detach()
+        self.weight_kio = w.permute(2, 0, 1).contiguous().to(device, torch.float16)
+        self._packed = True
+
+    def forward(self, x):
+        """Return the transposed convolution of (B, C_in, L) ``x`` as fp16 on CPU."""
+        conv = self.conv1d_transpose
+        if (conv.stride, conv.padding, conv.dilation) != ((2,), (1,), (2,)):
+            raise NotImplementedError("requires stride=2, padding=1, dilation=2")
+        if not self._packed:
+            self._pack_weights()
+
+        device = torch.device("cpu")
+        x = x.to(device, torch.float16)
+        B, IC, L_in = x.shape
+        K, w_ic, OC = self.weight_kio.shape
+        if IC != w_ic or IC > 32 or OC > 64:  # kernel reads a single 32 x 64 tile
+            raise NotImplementedError("requires matching C_in <= 32 and C_out <= 64")
+        L_out = 2 * (L_in - 1) - 2 + 2 * (K - 1) + 1  # stride=2, padding=1, dilation=2
+        N_nonzero = L_out // 2
+
+        # Re-layout the input on every call: a cache keyed on x.data_ptr() can
+        # return stale data once the allocator reuses that address.
+        x_cl = x.permute(0, 2, 1).contiguous()
+        out_shape = (B, L_out, OC)
+        if getattr(self, "_out_shape", None) != out_shape:
+            self._out_cl = torch.zeros(out_shape, device=device, dtype=torch.float16)
+            self._out_view = self._out_cl.permute(0, 2, 1)
+            self._out_shape = out_shape
+
+        grid = lambda META: (B * triton.cdiv(N_nonzero, META["BLOCK_L"]),)
+
+        _conv_transpose1d_kernel[grid](
+            x_cl,
+            self.weight_kio,
+            self._out_cl,
+            L_in,
+            L_out,
+            N_nonzero,
+            IN_CHANNELS=IC,
+            OUT_CHANNELS=OC,
+            KERNEL_SIZE=K,
+            BLOCK_K=32,
+            BLOCK_OC=64,
+        )
+
+        if conv.bias is None:
+            return self._out_view
+        return self._out_view + conv.bias.detach().to(self._out_cl).view(1, -1, 1)
+
+
+batch_size = 16  # first dimension of the tensor built by get_inputs()
+in_channels = 32  # second dimension of that tensor; also the first constructor argument
+out_channels = 64
+kernel_size = 3
+length = 131072  # third dimension of the input tensor
+stride = 2
+padding = 1
+dilation = 2
+
+
+def get_inputs():  # builds one random 3-D tensor and returns it wrapped in a list
+    x = torch.rand(batch_size, in_channels, length)
+    return [x]
+
+
+def get_init_inputs():  # presumably unpacked positionally into Model(...) by the harness; confirm
+    return [in_channels, out_channels, kernel_size, stride, padding, dilation]
diff --git a/backends/triton/cpu/KernelBench/level1/7_Matmul_with_small_K_dimension_.py b/backends/triton/cpu/KernelBench/level1/7_Matmul_with_small_K_dimension_.py
new file mode 100644
index 0000000..dc43e9f
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/7_Matmul_with_small_K_dimension_.py
@@ -0,0 +1,136 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def swizzle_tile(  # maps a flat tile id to (pid_m, pid_n), grouping GROUP_SIZE_M row tiles together
+    tile_id,
+    M,
+    N,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    grid_m = tl.cdiv(M, BLOCK_M)  # number of tiles along M
+    grid_n = tl.cdiv(N, BLOCK_N)  # number of tiles along N
+    width = GROUP_SIZE_M * grid_n  # tile ids covered by one full group
+    group_id = tile_id // width
+    group_size = tl.minimum(GROUP_SIZE_M, grid_m - group_id * GROUP_SIZE_M)  # last group may hold fewer rows
+    pid_m = group_id * GROUP_SIZE_M + (tile_id % group_size)  # row tile inside the group
+    pid_n = (tile_id % width) // group_size  # column tile
+    return pid_m, pid_n
+
+
+@triton.autotune(  # only one configuration is listed
+    configs=[
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+    ],
+    key=["M", "N", "K"],
+)
+@triton.jit
+def _matmul_small_k_kernel(  # C = A @ B; each program computes one BLOCK_M x BLOCK_N tile of C
+    a_ptr,  # A, described below with shape (M, K)
+    b_ptr,  # B, described below with shape (K, N)
+    c_ptr,  # C, described below with shape (M, N); its element type sets the stored dtype
+    M,
+    N,
+    K,
+    stride_am,
+    stride_ak: tl.constexpr,  # inner strides are compile-time constants
+    stride_bk,
+    stride_bn: tl.constexpr,
+    stride_cm,
+    stride_cn: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    pid_m, pid_n = swizzle_tile(pid, M, N, BLOCK_M, BLOCK_N, GROUP_SIZE_M)  # grouped tile ordering
+
+    a_desc = tl.make_tensor_descriptor(
+        base=a_ptr,
+        shape=(M, K),
+        strides=(stride_am, stride_ak),
+        block_shape=(BLOCK_M, BLOCK_K),
+    )
+    b_desc = tl.make_tensor_descriptor(
+        base=b_ptr,
+        shape=(K, N),
+        strides=(stride_bk, stride_bn),
+        block_shape=(BLOCK_K, BLOCK_N),
+    )
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)  # fp32 accumulator
+    for off_k in range(0, K, BLOCK_K):  # walk the reduction dimension in BLOCK_K steps
+        a_tile = a_desc.load([pid_m * BLOCK_M, off_k])
+        b_tile = b_desc.load([off_k, pid_n * BLOCK_N])
+        acc += tl.dot(a_tile, b_tile)
+    c_desc = tl.make_tensor_descriptor(
+        base=c_ptr,
+        shape=(M, N),
+        strides=(stride_cm, stride_cn),
+        block_shape=(BLOCK_M, BLOCK_N),
+    )
+    c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty))  # cast to C's element type
+
+
+class Model(nn.Module):
+    """Matrix product ``A @ B`` computed by ``_matmul_small_k_kernel``."""
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        """Return C = A @ B for 2-D A (M, K) and B (K, N); C takes A's dtype and device."""
+        if A.dim() != 2 or B.dim() != 2 or A.shape[1] != B.shape[0]:
+            raise ValueError(f"cannot multiply {tuple(A.shape)} by {tuple(B.shape)}")
+        A, B = A.contiguous(), B.contiguous()
+        M, K = A.shape
+        N = B.shape[1]
+        C = torch.empty((M, N), device=A.device, dtype=A.dtype)
+
+        grid = lambda META: (
+            triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),
+        )
+
+        _matmul_small_k_kernel[grid](
+            A,
+            B,
+            C,
+            M,
+            N,
+            K,
+            A.stride(0),
+            A.stride(1),
+            B.stride(0),
+            B.stride(1),
+            C.stride(0),
+            C.stride(1),
+        )
+        return C
+
+
+M = 16384 * 2  # rows of A
+N = 16384 * 2  # columns of B
+K = 32 * 2  # shared inner dimension of A and B
+
+
+def get_inputs():  # builds random bfloat16 matrices A (M, K) and B (K, N)
+    A = torch.rand(M, K, dtype=torch.bfloat16)
+    B = torch.rand(K, N, dtype=torch.bfloat16)
+    return [A, B]
+
+
+def get_init_inputs():  # no values are supplied for constructing Model
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py b/backends/triton/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py
new file mode 100644
index 0000000..2494939
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py
@@ -0,0 +1,245 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def _to_pair(x):  # an int becomes (x, x); anything else is converted with tuple(x)
+    return (x, x) if isinstance(x, int) else tuple(x)
+
+
+@triton.autotune(  # only one configuration is listed
+    configs=[
+        triton.Config(
+            {
+                "BLOCK_M": 64,  # output pixels per tile
+                "BLOCK_N": 64,  # output channels per tile
+                "BLOCK_K": 32,  # input channels consumed per tl.dot step
+                "GROUP_M": 8,  # row tiles grouped together by the program-id remapping
+                "grf_mode": "128",  # forwarded to the kernel, where it is never read
+            },
+            num_warps=4,
+            num_stages=3,
+        ),
+    ],
+    key=["C_IN", "C_out", "OH", "OW"],
+)
+@triton.jit
+def _conv2d_dilated_gemm(  # implicit GEMM: rows are (n, oh, ow) output pixels, columns are output channels
+    x_ptr,  # input; channel offsets are added unscaled, i.e. a channel stride of 1 is assumed
+    w_ptr,  # weights, addressed through swkh/swkw/swci/swco
+    y_ptr,  # output; channel offsets are likewise added unscaled
+    N_batch,
+    H_in,
+    W_in,
+    C_out,
+    OH,
+    OW,
+    sxn,  # input strides: batch, height, width
+    sxh,
+    sxw,
+    swkh,  # weight strides: kh, kw, input channel, output channel
+    swkw,
+    swci,
+    swco,
+    syn,  # output strides: batch, height, width
+    syh,
+    syw,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    PAD_H: tl.constexpr,
+    PAD_W: tl.constexpr,
+    DIL_H: tl.constexpr,
+    DIL_W: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+    grf_mode: tl.constexpr,  # accepted but unused in the body
+):
+    pid = tl.program_id(0)
+    M_total = N_batch * OH * OW  # total output pixels = GEMM rows
+    num_pid_m = tl.cdiv(M_total, BLOCK_M)
+    num_pid_n = tl.cdiv(C_out, BLOCK_N)
+    num_pid_in_group = GROUP_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_M)  # last group may be short
+    pid_in_group = pid % num_pid_in_group
+    pid_m = first_pid_m + (pid_in_group % group_size_m)
+    pid_n = pid_in_group // group_size_m
+
+    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    offs_k = tl.arange(0, BLOCK_K)
+
+    ohw = OH * OW
+    n_idx = offs_m // ohw  # decompose the flat pixel index into (n, oh, ow)
+    rem = offs_m % ohw
+    oh_idx = rem // OW
+    ow_idx = rem % OW
+
+    mask_m = offs_m < M_total
+    mask_n = offs_n < C_out
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)  # fp32 accumulator
+    x_base = x_ptr + n_idx.to(tl.int64) * sxn  # int64 batch offset
+
+    for kh in range(KH):
+        for kw in range(KW):
+            ih = oh_idx + kh * DIL_H - PAD_H  # no stride factor: an output step of 1 is assumed
+            iw = ow_idx + kw * DIL_W - PAD_W
+            valid = (ih >= 0) & (ih < H_in) & (iw >= 0) & (iw < W_in) & mask_m  # padding region reads as zero
+            x_hw_ptrs = x_base + ih * sxh + iw * sxw
+
+            w_bp = tl.make_block_ptr(
+                base=w_ptr + kh * swkh + kw * swkw,  # (C_IN, C_out) slice for this tap
+                shape=(C_IN, C_out),
+                strides=(swci, swco),
+                offsets=(0, pid_n * BLOCK_N),
+                block_shape=(BLOCK_K, BLOCK_N),
+                order=(1, 0),
+            )
+
+            for c0 in range(0, C_IN, BLOCK_K):  # walk the input channels in BLOCK_K steps
+                k_idx = c0 + offs_k
+                x_tile = tl.load(
+                    x_hw_ptrs[:, None] + k_idx[None, :],
+                    mask=valid[:, None] & (k_idx[None, :] < C_IN),
+                    other=0.0,
+                )
+                w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+                acc = tl.dot(x_tile, w_tile, acc)
+                w_bp = tl.advance(w_bp, (BLOCK_K, 0))
+
+    y_ptrs = y_ptr + n_idx.to(tl.int64) * syn + oh_idx * syh + ow_idx * syw
+    tl.store(
+        y_ptrs[:, None] + offs_n[None, :],
+        acc.to(y_ptr.dtype.element_ty),  # cast to the output tensor's element type
+        mask=mask_m[:, None] & mask_n[None, :],
+    )
+
+
+class Model(nn.Module):
+    """Dilated, padded stride-1 Conv2d run as an implicit GEMM in Triton."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=(0, 0),
+        dilation=(1, 1),
+        bias=False,
+    ):
+        super(Model, self).__init__()
+        self.conv2d = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias,
+        )
+        self._stride = stride
+        self._padding = _to_pair(padding)
+        self._dilation = _to_pair(dilation)
+        self._packed = False
+
+    def _pack_weights(self):
+        # Cached once as (KH, KW, C_in, C_out); later weight updates are not picked up.
+        w = self.conv2d.weight.data
+        self.w_packed = w.permute(2, 3, 1, 0).contiguous()
+        self._packed = True
+
+    def forward(self, x):
+        """Return conv2d(x) for NCHW ``x``; raises if the module's stride is not 1."""
+        if tuple(self.conv2d.stride) != (1, 1):
+            raise NotImplementedError("Triton conv2d kernel supports stride=1 only")
+        if not self._packed:
+            self._pack_weights()
+
+        B, C_in, H_in, W_in = x.shape
+        C_out = self.conv2d.weight.shape[0]
+        KH, KW = self.conv2d.weight.shape[2], self.conv2d.weight.shape[3]
+        pad_h, pad_w = self._padding
+        dil_h, dil_w = self._dilation
+
+        OH = (H_in + 2 * pad_h - dil_h * (KH - 1) - 1) + 1
+        OW = (W_in + 2 * pad_w - dil_w * (KW - 1) - 1) + 1
+
+        x_cl = x.contiguous(memory_format=torch.channels_last)
+        y = torch.empty(
+            (B, C_out, OH, OW),
+            device=x.device,
+            dtype=x.dtype,
+            memory_format=torch.channels_last,
+        )
+        sx = x_cl.stride()
+        sy = y.stride()
+        sw = self.w_packed.stride()
+
+        M_total = B * OH * OW
+        grid = lambda meta: (
+            triton.cdiv(M_total, meta["BLOCK_M"]) * triton.cdiv(C_out, meta["BLOCK_N"]),
+        )
+        _conv2d_dilated_gemm[grid](
+            x_cl,
+            self.w_packed,
+            y,
+            B,
+            H_in,
+            W_in,
+            C_out,
+            OH,
+            OW,
+            sx[0],
+            sx[2],
+            sx[3],
+            sw[0],
+            sw[1],
+            sw[2],
+            sw[3],
+            sy[0],
+            sy[2],
+            sy[3],
+            PAD_H=pad_h,
+            PAD_W=pad_w,
+            DIL_H=dil_h,
+            DIL_W=dil_w,
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+        )
+        if self.conv2d.bias is not None:
+            y = y + self.conv2d.bias.view(1, -1, 1, 1)
+        return y
+
+
+batch_size = 8  # first dimension of the tensor built by get_inputs()
+in_channels = 32  # second dimension of that tensor; also the first constructor argument
+out_channels = 64
+kernel_size = (5, 9)
+width = 512  # fourth dimension of the input tensor
+height = 512  # third dimension of the input tensor
+stride = 1
+padding = (2, 4)
+dilation = (2, 3)
+
+
+def get_inputs():  # builds one random 4-D tensor and returns it wrapped in a list
+    x = torch.rand(batch_size, in_channels, height, width)
+    return [x]
+
+
+def get_init_inputs():  # presumably unpacked positionally into Model(...) by the harness; confirm
+    return [in_channels, out_channels, kernel_size, stride, padding, dilation]
diff --git a/backends/triton/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py b/backends/triton/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py
new file mode 100644
index 0000000..b282b7b
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py
@@ -0,0 +1,242 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def _to_pair(x):  # an int becomes (x, x); anything else is converted with tuple(x)
+    return (x, x) if isinstance(x, int) else tuple(x)
+
+
+@triton.autotune(  # only one configuration is listed
+    configs=[
+        triton.Config({"BLOCK_W": 16, "BLOCK_OC": 32}, num_warps=4, num_stages=3),
+    ],
+    key=["W_IN", "C_out"],
+)
+@triton.jit
+def _conv_transpose2d_scatter(  # each program scatters one strip of input pixels to its output positions
+    x_ptr,  # input; channel stride of 1 is assumed (block-pointer strides are (sx_w, 1))
+    w_ptr,  # weights, addressed as (KH*KW, C_IN, C_out), contiguous
+    out_ptr,  # output; channel offsets are added unscaled
+    H_IN,
+    W_IN,
+    C_out,
+    OH,
+    OW,
+    sx_b,  # input strides: batch, height, width
+    sx_h,
+    sx_w,
+    so_b,  # output strides: batch, height, width
+    so_h,
+    so_w,
+    BLOCK_W: tl.constexpr,
+    BLOCK_OC: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,  # also used as a block dimension: all input channels are loaded at once
+    STRIDE_H: tl.constexpr,
+    STRIDE_W: tl.constexpr,
+    DIL_H: tl.constexpr,
+    DIL_W: tl.constexpr,
+    PAD_H: tl.constexpr,
+    PAD_W: tl.constexpr,
+):
+    pid_w = tl.program_id(0)  # tile of input columns
+    pid_bh = tl.program_id(1)  # flattened (batch, input row)
+    pid_oc = tl.program_id(2)  # tile of output channels
+
+    b = pid_bh // H_IN
+    ih = pid_bh % H_IN
+
+    offs_iw = pid_w * BLOCK_W + tl.arange(0, BLOCK_W)
+    offs_oc = pid_oc * BLOCK_OC + tl.arange(0, BLOCK_OC)
+
+    x_base = x_ptr + b.to(tl.int64) * sx_b + ih * sx_h  # int64 batch offset
+    x_bp = tl.make_block_ptr(
+        base=x_base,
+        shape=(W_IN, C_IN),
+        strides=(sx_w, 1),
+        offsets=(pid_w * BLOCK_W, 0),
+        block_shape=(BLOCK_W, C_IN),
+        order=(1, 0),
+    )
+    x_tile = tl.load(x_bp, boundary_check=(0,), padding_option="zero")  # loaded once, reused for every tap
+
+    out_batch_base = out_ptr + b.to(tl.int64) * so_b
+    iw_valid = offs_iw < W_IN
+
+    for kh in range(KH):
+        oh = ih * STRIDE_H + kh * DIL_H - PAD_H  # output row hit by input row ih through tap kh
+        if (oh >= 0) & (oh < OH):
+            for kw in range(KW):
+                ow = offs_iw * STRIDE_W + kw * DIL_W - PAD_W  # output columns hit through tap kw
+
+                kidx = kh * KW + kw
+                w_bp = tl.make_block_ptr(
+                    base=w_ptr + kidx * C_IN * C_out,  # (C_IN, C_out) slice for this tap
+                    shape=(C_IN, C_out),
+                    strides=(C_out, 1),
+                    offsets=(0, pid_oc * BLOCK_OC),
+                    block_shape=(C_IN, BLOCK_OC),
+                    order=(1, 0),
+                )
+                w_tile = tl.load(w_bp, boundary_check=(1,), padding_option="zero")
+
+                result = tl.dot(x_tile, w_tile).to(tl.float16)
+
+                out_ptrs = (
+                    out_batch_base + oh * so_h + ow[:, None] * so_w + offs_oc[None, :]
+                )
+                out_mask = (
+                    iw_valid[:, None]
+                    & (offs_oc[None, :] < C_out)
+                    & (ow[:, None] >= 0)
+                    & (ow[:, None] < OW)
+                )
+                tl.store(out_ptrs, result, mask=out_mask)  # plain store, not an accumulation: NOTE(review) taps that share an output element would overwrite each other
+
+
+@triton.jit
+def _zero_kernel(out_ptr, N, BLOCK: tl.constexpr):  # writes fp16 zeros to the first N elements at out_ptr
+    pid = tl.program_id(0)
+    offs = pid * BLOCK + tl.arange(0, BLOCK)  # flat element offsets handled by this program
+    mask = offs < N  # guards the final, partially filled block
+    tl.store(out_ptr + offs, tl.zeros((BLOCK,), dtype=tl.float16), mask=mask)
+
+
+class Model(nn.Module):
+    """ConvTranspose2d via a Triton scatter kernel (fp16) for non-overlapping taps."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        bias=False,
+    ):
+        super().__init__()
+        self.conv_transpose2d = nn.ConvTranspose2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias,
+        )
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _to_pair(kernel_size)
+        self.stride_val = _to_pair(stride)
+        self.padding_val = _to_pair(padding)
+        self.dilation_val = _to_pair(dilation)
+        self._packed = False
+
+    def _pack_weights(self):
+        w = self.conv_transpose2d.weight.data
+        KH, KW = self.kernel_size
+        self.weight_packed = (
+            w.permute(2, 3, 0, 1)
+            .reshape(KH * KW, self.in_channels, self.out_channels)
+            .contiguous()
+            .to(dtype=torch.float16)
+        )
+        self._packed = True
+
+    def forward(self, x):
+        """Return the transposed convolution of NCHW ``x`` in a reused fp16 buffer."""
+        if not self._packed:
+            self._pack_weights()
+
+        device = x.device
+        B, C_in, H_in, W_in = x.shape
+        KH, KW = self.kernel_size
+        SH, SW = self.stride_val
+        PH, PW = self.padding_val
+        DH, DW = self.dilation_val
+        if DH * (KH - 1) >= SH or DW * (KW - 1) >= SW:  # kernel stores, never accumulates
+            raise NotImplementedError("requires dilation * (kernel_size - 1) < stride")
+
+        OH = (H_in - 1) * SH - 2 * PH + DH * (KH - 1) + 1
+        OW = (W_in - 1) * SW - 2 * PW + DW * (KW - 1) + 1
+        out_shape = (B, self.out_channels, OH, OW)
+        x_cl = x.to(dtype=torch.float16).contiguous(memory_format=torch.channels_last)
+
+        # Reallocate whenever the full output shape or device changes, not just B.
+        cached = getattr(self, "_output", None)
+        if cached is None or cached.shape != out_shape or cached.device != device:
+            self._output = torch.empty(
+                out_shape,
+                device=device,
+                dtype=torch.float16,
+                memory_format=torch.channels_last,
+            )
+        output = self._output
+        N_elem = output.numel()
+        _zero_kernel[(triton.cdiv(N_elem, 1024),)](output, N_elem, BLOCK=1024)
+
+        sx = x_cl.stride()
+        so = output.stride()
+        grid = lambda META: (
+            triton.cdiv(W_in, META["BLOCK_W"]),
+            B * H_in,
+            triton.cdiv(self.out_channels, META["BLOCK_OC"]),
+        )
+        _conv_transpose2d_scatter[grid](
+            x_cl,
+            self.weight_packed,
+            output,
+            H_in,
+            W_in,
+            self.out_channels,
+            OH,
+            OW,
+            sx[0],
+            sx[2],
+            sx[3],
+            so[0],
+            so[2],
+            so[3],
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+            STRIDE_H=SH,
+            STRIDE_W=SW,
+            DIL_H=DH,
+            DIL_W=DW,
+            PAD_H=PH,
+            PAD_W=PW,
+        )
+        if self.conv_transpose2d.bias is not None:
+            output = output + self.conv_transpose2d.bias.view(1, -1, 1, 1)
+        return output
+
+
+batch_size = 16  # first dimension of the tensor built by get_inputs()
+in_channels = 32  # second dimension of that tensor; also the first constructor argument
+out_channels = 64
+kernel_size = 3
+height_in = 64  # third dimension of the input tensor
+width_in = 128  # fourth dimension of the input tensor
+stride = 5
+padding = 1
+dilation = 2
+
+
+def get_inputs():  # builds one random 4-D tensor and returns it wrapped in a list
+    x = torch.rand(batch_size, in_channels, height_in, width_in)
+    return [x]
+
+
+def get_init_inputs():  # presumably unpacked positionally into Model(...) by the harness; confirm
+    return [in_channels, out_channels, kernel_size, stride, padding, dilation]
diff --git a/backends/triton/cpu/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.py b/backends/triton/cpu/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.py
new file mode 100644
index 0000000..e8395b3
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.py
@@ -0,0 +1,117 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(  # only one configuration is listed
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=1),
+    ],
+    key=["OHOW"],
+)
+@triton.jit
+def _depthwise_conv2d(  # per-plane KH x KW correlation with no padding and an output step of 1
+    x_ptr,  # input, addressed as (N*C) planes of H*W contiguous values
+    w_ptr,  # weights, addressed as C filters of KH*KW contiguous values
+    y_ptr,  # output, addressed as (N*C) planes of OHOW contiguous values
+    C,
+    H,
+    W,
+    OH,  # not read in the body; rows are recovered from OW and OHOW
+    OW,
+    OHOW,  # output elements per plane; used as the bound for the store mask
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    nc = tl.program_id(1)  # flattened (batch, channel) plane index
+    pid = tl.program_id(0)  # block of output positions inside the plane
+    c = nc % C  # channel, selects the filter
+    nc_i64 = nc.to(tl.int64)  # int64 so plane offsets are computed in 64 bits
+
+    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offs < OHOW
+    oh = offs // OW
+    ow = offs % OW
+
+    x_base = x_ptr + nc_i64 * (H * W)
+    w_base = w_ptr + c * (KH * KW)
+    y_base = y_ptr + nc_i64 * OHOW
+
+    acc = tl.zeros([BLOCK_SIZE], dtype=y_ptr.dtype.element_ty)  # accumulates in the output element type
+    for kh in range(KH):
+        for kw in range(KW):
+            x_val = tl.load(x_base + (oh + kh) * W + (ow + kw), mask=mask, other=0.0)  # input at (oh+kh, ow+kw)
+            w_val = tl.load(w_base + kh * KW + kw)  # scalar filter tap
+            acc += x_val * w_val
+
+    tl.store(y_base + offs, acc, mask=mask)
+
+
+class Model(nn.Module):  # depthwise Conv2d (groups == channels) for stride 1 and zero padding
+    def __init__(
+        self,
+        in_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv2d = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=in_channels,
+            bias=bias,
+        )
+        self._packed = False
+
+    def _pack_weights(self):  # caches the (C, KH, KW) filters once; later weight updates are missed
+        self._dw_w = self.conv2d.weight.squeeze(1).contiguous()
+        self._packed = True
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return the depthwise convolution of NCHW ``x``; raises on stride/padding."""
+        conv = self.conv2d
+        if (conv.stride, conv.padding, conv.dilation) != ((1, 1), (0, 0), (1, 1)):
+            raise NotImplementedError("kernel supports stride=1, padding=0 only")
+        if not self._packed:
+            self._pack_weights()
+        w = self._dw_w.to(dtype=x.dtype)
+        N, C, H, W_dim = x.shape
+        KH, KW = conv.kernel_size
+        OH, OW = H - KH + 1, W_dim - KW + 1
+        OHOW = OH * OW
+        x = x.contiguous()
+        y = torch.empty((N, C, OH, OW), device=x.device, dtype=x.dtype)
+        grid = lambda meta: (triton.cdiv(OHOW, meta["BLOCK_SIZE"]), N * C)
+        _depthwise_conv2d[grid](x, w, y, C, H, W_dim, OH, OW, OHOW, KH=KH, KW=KW)
+        return y if conv.bias is None else y + conv.bias.to(y.dtype).view(1, -1, 1, 1)
+
+
+batch_size = 16  # first dimension of the tensor built by get_inputs()
+in_channels = 64  # second dimension of that tensor; also the first constructor argument
+kernel_size = 3
+width = 512  # fourth dimension of the input tensor
+height = 512  # third dimension of the input tensor
+stride = 1
+padding = 0
+
+
+def get_inputs():  # builds one random 4-D tensor and returns it wrapped in a list
+    x = torch.rand(batch_size, in_channels, height, width)
+    return [x]
+
+
+def get_init_inputs():  # presumably unpacked positionally into Model(...) by the harness; confirm
+    return [in_channels, kernel_size, stride, padding]
diff --git a/backends/triton/cpu/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py
new file mode 100644
index 0000000..6b9c313
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py
@@ -0,0 +1,122 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=1),
+    ],
+    key=["OHOW"],
+)
+@triton.jit
+def _depthwise_conv2d(  # stride-1, unpadded depthwise conv over one (n, c) plane
+    x_ptr,
+    w_ptr,
+    y_ptr,
+    C,
+    H,
+    W,
+    OH,
+    OW,
+    OHOW,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    nc = tl.program_id(1)  # flattened (batch, channel) plane index
+    pid = tl.program_id(0)  # which BLOCK_SIZE chunk of that output plane
+    c = nc % C  # channel picks the filter
+    nc_i64 = nc.to(tl.int64)  # widen before scaling by plane sizes
+
+    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offs < OHOW  # lanes past the end of the plane do nothing
+    oh = offs // OW  # output row
+    ow = offs % OW  # output column
+
+    x_base = x_ptr + nc_i64 * (H * W)
+    w_base = w_ptr + c * (KH * KW)
+    y_base = y_ptr + nc_i64 * OHOW
+
+    acc = tl.zeros([BLOCK_SIZE], dtype=y_ptr.dtype.element_ty)
+    for kh in range(KH):
+        for kw in range(KW):
+            x_val = tl.load(x_base + (oh + kh) * W + (ow + kw), mask=mask, other=0.0)
+            w_val = tl.load(w_base + kh * KW + kw)  # scalar weight of this tap
+            acc += x_val * w_val
+
+    tl.store(y_base + offs, acc, mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv2d = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size=(kernel_size, 1),
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=in_channels,
+            bias=bias,
+        )
+        self.in_channels = in_channels
+        self.kernel_size = kernel_size
+        self._packed = False
+
+    def _pack_weights(self):
+        self._dw_w = self.conv2d.weight.squeeze(1).contiguous()
+        self._packed = True
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if not self._packed:
+            self._pack_weights()
+        w = self._dw_w.to(dtype=x.dtype)
+        N, C, H, W_dim = x.shape
+        KH = self.conv2d.weight.shape[2]
+        KW = self.conv2d.weight.shape[3]
+        OH = H - KH + 1
+        OW = W_dim
+        OHOW = OH * OW
+
+        x = x.contiguous()
+        y = torch.empty((N, C, OH, OW), device=x.device, dtype=x.dtype)
+
+        grid = lambda meta: (triton.cdiv(OHOW, meta["BLOCK_SIZE"]), N * C)
+        _depthwise_conv2d[grid](x, w, y, C, H, W_dim, OH, OW, OHOW, KH=KH, KW=KW)
+        return y
+
+
+batch_size = 64  # benchmark problem size; read by get_inputs()
+in_channels = 8  # used for both the input tensor and Model's constructor
+kernel_size = 3  # Model turns this into a (kernel_size, 1) kernel
+width = 512
+height = 512
+stride = 1  # forwarded to Model by get_init_inputs()
+padding = 0  # forwarded to Model by get_init_inputs()
+dilation = 1  # forwarded to Model by get_init_inputs()
+
+
+def get_inputs():
+    x = torch.rand(batch_size, in_channels, height, width)
+    return [x]
+
+
+def get_init_inputs():  # values listed in the order of Model.__init__'s parameters
+    return [in_channels, kernel_size, stride, padding, dilation]
diff --git a/backends/triton/cpu/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py b/backends/triton/cpu/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py
new file mode 100644
index 0000000..94c2d3c
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py
@@ -0,0 +1,119 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=1),
+    ],
+    key=["OHOW"],
+)
+@triton.jit
+def _depthwise_conv2d(  # stride-1, unpadded depthwise conv over one (n, c) plane
+    x_ptr,
+    w_ptr,
+    y_ptr,
+    C,
+    H,
+    W,
+    OH,
+    OW,
+    OHOW,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    nc = tl.program_id(1)  # flattened (batch, channel) plane index
+    pid = tl.program_id(0)  # which BLOCK_SIZE chunk of that output plane
+    c = nc % C  # channel picks the filter
+    nc_i64 = nc.to(tl.int64)  # widen before scaling by plane sizes
+
+    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offs < OHOW  # lanes past the end of the plane do nothing
+    oh = offs // OW  # output row
+    ow = offs % OW  # output column
+
+    x_base = x_ptr + nc_i64 * (H * W)
+    w_base = w_ptr + c * (KH * KW)
+    y_base = y_ptr + nc_i64 * OHOW
+
+    acc = tl.zeros([BLOCK_SIZE], dtype=y_ptr.dtype.element_ty)
+    for kh in range(KH):
+        for kw in range(KW):
+            x_val = tl.load(x_base + (oh + kh) * W + (ow + kw), mask=mask, other=0.0)
+            w_val = tl.load(w_base + kh * KW + kw)  # scalar weight of this tap
+            acc += x_val * w_val
+
+    tl.store(y_base + offs, acc, mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv2d = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=(kernel_size, kernel_size),
+            stride=stride,
+            padding=padding,
+            groups=in_channels,
+            bias=bias,
+        )
+        self._packed = False
+
+    def _pack_weights(self):
+        self._dw_w = self.conv2d.weight.squeeze(1).contiguous()
+        self._packed = True
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if not self._packed:
+            self._pack_weights()
+        w = self._dw_w.to(dtype=x.dtype)
+        N, C, H, W_dim = x.shape
+        KH = self.conv2d.weight.shape[2]
+        KW = self.conv2d.weight.shape[3]
+        OH = H - KH + 1
+        OW = W_dim - KW + 1
+        OHOW = OH * OW
+
+        x = x.contiguous()
+        y = torch.empty((N, C, OH, OW), device=x.device, dtype=x.dtype)
+
+        grid = lambda meta: (triton.cdiv(OHOW, meta["BLOCK_SIZE"]), N * C)
+        _depthwise_conv2d[grid](x, w, y, C, H, W_dim, OH, OW, OHOW, KH=KH, KW=KW)
+        return y
+
+
+batch_size = 64  # benchmark problem size; read by get_inputs()
+in_channels = 128
+out_channels = 128  # equal to in_channels in this benchmark
+kernel_size = 3
+width_in = 512  # last dimension of the input tensor
+height_in = 256  # third dimension of the input tensor
+stride = 1  # forwarded to Model by get_init_inputs()
+padding = 0  # forwarded to Model by get_init_inputs()
+
+
+def get_inputs():
+    x = torch.rand(batch_size, in_channels, height_in, width_in)
+    return [x]
+
+
+def get_init_inputs():  # values listed in the order of Model.__init__'s parameters
+    return [in_channels, out_channels, kernel_size, stride, padding]
diff --git a/backends/triton/cpu/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py
new file mode 100644
index 0000000..61d7834
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py
@@ -0,0 +1,179 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=1),
+    ],
+    key=["OHOW"],
+)
+@triton.jit
+def _depthwise_conv2d(  # depthwise conv with stride, zero padding and dilation
+    x_ptr,
+    w_ptr,
+    y_ptr,
+    C,
+    H,
+    W,
+    OH,
+    OW,
+    OHOW,
+    stride_h: tl.constexpr,
+    stride_w: tl.constexpr,
+    pad_h: tl.constexpr,
+    pad_w: tl.constexpr,
+    dil_h: tl.constexpr,
+    dil_w: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    nc = tl.program_id(1)  # flattened (batch, channel) plane index
+    pid = tl.program_id(0)  # which BLOCK_SIZE chunk of that output plane
+    c = nc % C  # channel picks the filter
+    nc_i64 = nc.to(tl.int64)  # widen before scaling by plane sizes
+
+    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offs < OHOW  # lanes past the end of the plane do nothing
+    oh = offs // OW  # output row
+    ow = offs % OW  # output column
+
+    x_base = x_ptr + nc_i64 * (H * W)
+    w_base = w_ptr + c * (KH * KW)
+    y_base = y_ptr + nc_i64 * OHOW
+
+    acc = tl.zeros([BLOCK_SIZE], dtype=y_ptr.dtype.element_ty)
+    for kh in range(KH):
+        for kw in range(KW):
+            ih = oh * stride_h + kh * dil_h - pad_h  # input row hit by this tap
+            iw = ow * stride_w + kw * dil_w - pad_w  # input column hit by this tap
+            valid = mask & (ih >= 0) & (ih < H) & (iw >= 0) & (iw < W)
+            x_val = tl.load(x_base + ih * W + iw, mask=valid, other=0.0)  # pad -> 0
+            w_val = tl.load(w_base + kh * KW + kw)  # scalar weight of this tap
+            acc += x_val * w_val
+
+    tl.store(y_base + offs, acc, mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size_h,
+        kernel_size_w,
+        stride_h=1,
+        stride_w=1,
+        padding_h=0,
+        padding_w=0,
+        dilation_h=1,
+        dilation_w=1,
+        groups=1,
+        bias=False,
+    ):
+        super(Model, self).__init__()
+        self.conv2d = nn.Conv2d(
+            in_channels,
+            in_channels,
+            (kernel_size_h, kernel_size_w),
+            stride=(stride_h, stride_w),
+            padding=(padding_h, padding_w),
+            dilation=(dilation_h, dilation_w),
+            groups=in_channels,
+            bias=bias,
+        )
+        self._stride_h = stride_h
+        self._stride_w = stride_w
+        self._pad_h = padding_h
+        self._pad_w = padding_w
+        self._dil_h = dilation_h
+        self._dil_w = dilation_w
+        self._packed = False
+
+    def _pack_weights(self):
+        self._dw_w = self.conv2d.weight.squeeze(1).contiguous()
+        self._packed = True
+
+    def forward(self, x):
+        if not self._packed:
+            self._pack_weights()
+        w = self._dw_w.to(dtype=x.dtype)
+        N, C, H, W_dim = x.shape
+        KH = self.conv2d.weight.shape[2]
+        KW = self.conv2d.weight.shape[3]
+        OH = (H + 2 * self._pad_h - self._dil_h * (KH - 1) - 1) // self._stride_h + 1
+        OW = (
+            W_dim + 2 * self._pad_w - self._dil_w * (KW - 1) - 1
+        ) // self._stride_w + 1
+        OHOW = OH * OW
+
+        x = x.contiguous()
+        y = torch.empty((N, C, OH, OW), device=x.device, dtype=x.dtype)
+
+        grid = lambda meta: (triton.cdiv(OHOW, meta["BLOCK_SIZE"]), N * C)
+        _depthwise_conv2d[grid](
+            x,
+            w,
+            y,
+            C,
+            H,
+            W_dim,
+            OH,
+            OW,
+            OHOW,
+            stride_h=self._stride_h,
+            stride_w=self._stride_w,
+            pad_h=self._pad_h,
+            pad_w=self._pad_w,
+            dil_h=self._dil_h,
+            dil_w=self._dil_w,
+            KH=KH,
+            KW=KW,
+        )
+        return y
+
+
+batch_size = 32  # benchmark problem size; read by get_inputs()
+in_channels = 128
+out_channels = 128
+kernel_size_h = 3
+kernel_size_w = 7
+width = 256
+height = 128
+stride_h = 1
+stride_w = 1
+padding_h = 0
+padding_w = 0
+dilation_h = 1
+dilation_w = 1
+groups = in_channels  # depthwise: one group per channel
+
+
+def get_inputs():
+    x = torch.rand(batch_size, in_channels, height, width)
+    return [x]
+
+
+def get_init_inputs():  # values listed in the order of Model.__init__'s parameters
+    return [
+        in_channels,
+        out_channels,
+        kernel_size_h,
+        kernel_size_w,
+        stride_h,
+        stride_w,
+        padding_h,
+        padding_w,
+        dilation_h,
+        dilation_w,
+        groups,
+    ]
diff --git a/backends/triton/cpu/KernelBench/level1/86_conv_depthwise_separable_2D.py b/backends/triton/cpu/KernelBench/level1/86_conv_depthwise_separable_2D.py
new file mode 100644
index 0000000..0bcdc18
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/86_conv_depthwise_separable_2D.py
@@ -0,0 +1,214 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 16, "grf_mode": "128"},
+            num_warps=4,
+            num_stages=2,
+        ),
+    ],
+    key=["M_total", "C_out", "C_IN"],
+)
+@triton.jit
+def _conv2d_fused_kernel(  # conv as per-tap (pixels x C_IN) @ (C_IN x C_out) sums
+    x_ptr,
+    w_ptr,
+    y_ptr,
+    M_total,
+    C_out,
+    OH,
+    OW,
+    stride_xn,
+    stride_xh,
+    stride_xw,
+    stride_wkh,
+    stride_wkw,
+    stride_wci,
+    stride_wco,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    KH: tl.constexpr,
+    KW: tl.constexpr,
+    C_IN: tl.constexpr,
+    grf_mode: tl.constexpr,  # NOTE(review): never read in the body; confirm purpose
+):
+    pid_m = tl.program_id(0)  # tile of output pixels
+    pid_n = tl.program_id(1)  # tile of output channels
+
+    nhw = OH * OW  # output pixels per batch item
+    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_k = tl.arange(0, BLOCK_K)
+
+    n_idx = offs_m // nhw  # batch index of each output pixel
+    rem = offs_m % nhw
+    oh_idx = rem // OW
+    ow_idx = rem % OW
+
+    mask_m = offs_m < M_total
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+
+    x_base = x_ptr + n_idx * stride_xn + oh_idx * stride_xh + ow_idx * stride_xw
+
+    for kh in range(KH):
+        for kw in range(KW):
+            x_kh_kw = x_base + kh * stride_xh + kw * stride_xw  # shift by the tap
+
+            w_bp = tl.make_block_ptr(
+                base=w_ptr + kh * stride_wkh + kw * stride_wkw,
+                shape=(C_IN, C_out),
+                strides=(stride_wci, stride_wco),
+                offsets=(0, pid_n * BLOCK_N),
+                block_shape=(BLOCK_K, BLOCK_N),
+                order=(1, 0),
+            )
+
+            for c0 in range(0, C_IN, BLOCK_K):
+                k_idx = c0 + offs_k
+
+                x_tile = tl.load(
+                    x_kh_kw[:, None] + k_idx[None, :],  # channel stride taken as 1
+                    mask=mask_m[:, None] & (k_idx[None, :] < C_IN),
+                    other=0.0,
+                )
+
+                w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
+                acc = tl.dot(x_tile, w_tile, acc)
+                w_bp = tl.advance(w_bp, (BLOCK_K, 0))  # next channel slab
+
+    y_bp = tl.make_block_ptr(
+        base=y_ptr,
+        shape=(M_total, C_out),
+        strides=(C_out, 1),  # y is addressed as a dense row-major matrix
+        offsets=(pid_m * BLOCK_M, pid_n * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    tl.store(y_bp, acc.to(tl.float32), boundary_check=(0, 1))
+
+
+class Model(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        bias=False,
+    ):
+        super(Model, self).__init__()
+        self.depthwise = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=in_channels,
+            bias=bias,
+        )
+        self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=bias)
+        self._padding = int(padding)
+        self._packed = False
+        self._y_buf = None
+
+    def _pack_weights(self):
+        dw_w = self.depthwise.weight.data
+        pw_w = self.pointwise.weight.data
+        combined = pw_w * dw_w.transpose(0, 1)
+        self.w_hwio = combined.to(dtype=torch.float32).permute(2, 3, 1, 0).contiguous()
+        self._packed = True
+
+    def forward(self, x):
+        if not self._packed:
+            self._pack_weights()
+
+        x = x.to(dtype=torch.float32)
+
+        B, C_in, H, W_dim = x.shape
+        KH, KW = self.depthwise.kernel_size
+        C_out = self.pointwise.out_channels
+        pad = self._padding
+
+        if pad > 0:
+            x = F.pad(x, (pad, pad, pad, pad))
+
+        H_pad, W_pad = x.shape[2], x.shape[3]
+        OH = H_pad - KH + 1
+        OW = W_pad - KW + 1
+        M_total = B * OH * OW
+
+        x_cl = x.contiguous(memory_format=torch.channels_last)
+        x_nhwc = x_cl.permute(0, 2, 3, 1)
+
+        if self._y_buf is None or self._y_buf.shape != (B, C_out, OH, OW):
+            self._y_buf = torch.empty(
+                (B, C_out, OH, OW),
+                device=x.device,
+                dtype=torch.float32,
+                memory_format=torch.channels_last,
+            )
+        y = self._y_buf
+        y_nhwc = y.permute(0, 2, 3, 1)
+
+        grid = lambda META: (
+            triton.cdiv(M_total, META["BLOCK_M"]),
+            triton.cdiv(C_out, META["BLOCK_N"]),
+        )
+
+        _conv2d_fused_kernel[grid](
+            x_nhwc,
+            self.w_hwio,
+            y_nhwc,
+            M_total,
+            C_out,
+            OH,
+            OW,
+            x_nhwc.stride(0),
+            x_nhwc.stride(1),
+            x_nhwc.stride(2),
+            self.w_hwio.stride(0),
+            self.w_hwio.stride(1),
+            self.w_hwio.stride(2),
+            self.w_hwio.stride(3),
+            KH=KH,
+            KW=KW,
+            C_IN=C_in,
+        )
+
+        return y
+
+
+batch_size = 16  # benchmark problem size; read by get_inputs()
+in_channels = 64
+out_channels = 128
+kernel_size = 3
+width = 512
+height = 512
+stride = 1  # forwarded to Model by get_init_inputs()
+padding = 1  # forwarded to Model by get_init_inputs()
+dilation = 1  # forwarded to Model by get_init_inputs()
+
+
+def get_init_inputs():  # values listed in the order of Model.__init__'s parameters
+    return [in_channels, out_channels, kernel_size, stride, padding, dilation]
+
+
+def get_inputs():
+    x = torch.rand(batch_size, in_channels, height, width)
+    return [x]
diff --git a/backends/triton/cpu/KernelBench/level1/87_conv_pointwise_2D.py b/backends/triton/cpu/KernelBench/level1/87_conv_pointwise_2D.py
new file mode 100644
index 0000000..cf48a00
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/87_conv_pointwise_2D.py
@@ -0,0 +1,147 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _pointwise_gemm(  # per batch item: Y (M x N) = W (M x K) @ X (K x N)
+    x_ptr,
+    w_ptr,
+    y_ptr,
+    M,
+    N,
+    K,
+    stride_xb,
+    stride_xk,
+    stride_wm,
+    stride_wk,
+    stride_yb,
+    stride_ym,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+):
+    pid = tl.program_id(0)  # linear tile id within one batch item
+    bid = tl.program_id(1)  # batch item
+
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    num_pid_in_group = GROUP_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_M)  # last group may be short
+    pid_in_group = pid % num_pid_in_group
+    pid_m = first_pid_m + (pid_in_group % group_size_m)
+    pid_n = pid_in_group // group_size_m
+
+    x_base = x_ptr + bid.to(tl.int64) * stride_xb  # 64-bit batch offset
+    y_base = y_ptr + bid.to(tl.int64) * stride_yb
+
+    W_bp = tl.make_block_ptr(
+        base=w_ptr,
+        shape=(M, K),
+        strides=(stride_wm, stride_wk),
+        offsets=(pid_m * BLOCK_M, 0),
+        block_shape=(BLOCK_M, BLOCK_K),
+        order=(1, 0),
+    )
+    X_bp = tl.make_block_ptr(
+        base=x_base,
+        shape=(K, N),
+        strides=(stride_xk, 1),  # the N axis is read with unit stride
+        offsets=(0, pid_n * BLOCK_N),
+        block_shape=(BLOCK_K, BLOCK_N),
+        order=(1, 0),
+    )
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for _ in range(0, K, BLOCK_K):
+        w = tl.load(W_bp, boundary_check=(0, 1), padding_option="zero")
+        x = tl.load(X_bp, boundary_check=(0, 1), padding_option="zero")
+        acc = tl.dot(w, x, acc)
+        W_bp = tl.advance(W_bp, (0, BLOCK_K))  # step both operands along K
+        X_bp = tl.advance(X_bp, (BLOCK_K, 0))
+
+    Y_bp = tl.make_block_ptr(
+        base=y_base,
+        shape=(M, N),
+        strides=(stride_ym, 1),  # the N axis is written with unit stride
+        offsets=(pid_m * BLOCK_M, pid_n * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    tl.store(Y_bp, acc.to(y_ptr.dtype.element_ty), boundary_check=(0, 1))
+
+
+class Model(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int, bias: bool = False):
+        super(Model, self).__init__()
+        self.conv1d = nn.Conv2d(
+            in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=bias
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        B, C_in, H, W = x.shape
+        C_out = self.conv1d.weight.shape[0]
+        weight = self.conv1d.weight.squeeze(-1).squeeze(-1).contiguous()
+        N_spatial = H * W
+
+        BLOCK_M = 128
+        BLOCK_N = 128
+        BLOCK_K = 64
+        GROUP_M = 8
+
+        out = torch.empty((B, C_out, H, W), device=x.device, dtype=x.dtype)
+
+        grid = (
+            triton.cdiv(C_out, BLOCK_M) * triton.cdiv(N_spatial, BLOCK_N),
+            B,
+        )
+        _pointwise_gemm[grid](
+            x,
+            weight,
+            out,
+            C_out,
+            N_spatial,
+            C_in,
+            x.stride(0),
+            x.stride(1),
+            weight.stride(0),
+            weight.stride(1),
+            out.stride(0),
+            out.stride(1),
+            BLOCK_M=BLOCK_M,
+            BLOCK_N=BLOCK_N,
+            BLOCK_K=BLOCK_K,
+            GROUP_M=GROUP_M,
+            num_warps=8,
+            num_stages=1,
+        )
+
+        if self.conv1d.bias is not None:
+            out = out + self.conv1d.bias.view(1, -1, 1, 1)
+        return out
+
+
+batch_size = 8  # benchmark problem size; read by get_inputs()
+in_channels = 64
+out_channels = 128
+width = 1024
+height = 1024
+
+
+def get_inputs():
+    x = torch.rand(batch_size, in_channels, height, width)
+    return [x]
+
+
+def get_init_inputs():  # values listed in the order of Model.__init__'s parameters
+    return [in_channels, out_channels]
diff --git a/backends/triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py b/backends/triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py
new file mode 100644
index 0000000..bde7264
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py
@@ -0,0 +1,65 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+from triton.language.extra.cpu import libdevice
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+    ],
+    key=["n_elements"],
+)
+@triton.jit
+def _gelu_kernel(  # elementwise 0.5 * x * (1 + tanh(c * (x + 0.044715 * x^3)))
+    x_ptr,
+    out_ptr,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offs < n_elements  # guard the final partial block
+
+    x = tl.load(x_ptr + offs, mask=mask, other=0.0)
+
+    x_f32 = x.to(tl.float32)  # do the math in float32
+    inner = x_f32 + 0.044715 * x_f32 * x_f32 * x_f32
+    t = libdevice.tanh(0.7978845608028654 * inner)  # constant is sqrt(2 / pi)
+    result = (0.5 * x_f32 * (1.0 + t)).to(x.dtype)  # back to the input dtype
+
+    tl.store(out_ptr + offs, result, mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x):
+        x_flat = x.contiguous().view(-1)
+        n_elements = x_flat.numel()
+        output = torch.empty_like(x_flat)
+
+        grid = lambda META: (triton.cdiv(n_elements, META["BLOCK_SIZE"]),)
+        _gelu_kernel[grid](x_flat, output, n_elements)
+
+        return output.view(x.shape)
+
+
+batch_size = 8192  # rows of the tensor built by get_inputs()
+dim = 8192  # columns of the tensor built by get_inputs()
+
+
+def get_inputs():  # one random (batch_size, dim) tensor
+    return [torch.rand(batch_size, dim)]
+
+
+def get_init_inputs():  # Model.__init__ takes no arguments
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/89_cumsum.py b/backends/triton/cpu/KernelBench/level1/89_cumsum.py
new file mode 100644
index 0000000..34e0a1e
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/89_cumsum.py
@@ -0,0 +1,90 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def scan_add_op(a, b):  # combine function handed to tl.associative_scan
+    return a + b
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 256}, num_warps=4, num_stages=1),
+    ],
+    key=["N"],
+)
+@triton.jit
+def cumsum_kernel(  # inclusive prefix sum of one row per program
+    x_ptr,
+    out_ptr,
+    M,
+    N,
+    stride_xm,
+    stride_om,
+    BLOCK_SIZE: tl.constexpr,
+):
+    row_idx = tl.program_id(0)
+    x_row = x_ptr + row_idx * stride_xm
+    o_row = out_ptr + row_idx * stride_om
+
+    running_total = 0.0  # sum of every block already processed
+
+    for block_start in tl.range(0, N, BLOCK_SIZE):
+        offsets = block_start + tl.arange(0, BLOCK_SIZE)  # columns, unit stride
+        mask = offsets < N
+
+        x = tl.load(x_row + offsets, mask=mask, other=0.0).to(tl.float32)
+
+        scanned = tl.associative_scan(x, axis=0, combine_fn=scan_add_op)
+        result = scanned + running_total  # shift by the carry from earlier blocks
+
+        running_total = running_total + tl.sum(x, axis=0)
+
+        tl.store(o_row + offsets, result, mask=mask)
+
+
+def kernel_function(x):
+    M, N = x.shape
+    out = torch.empty_like(x)
+    assert x.stride(1) == 1
+
+    grid = (M,)
+    cumsum_kernel[grid](
+        x,
+        out,
+        M,
+        N,
+        x.stride(0),
+        out.stride(0),
+    )
+    return out
+
+
+batch_size = 32768  # rows of the tensor built by get_inputs()
+input_shape = (32768,)  # trailing shape of that tensor
+dim = 1  # scan dimension handed to Model
+
+
+def get_inputs():  # one random (batch_size, *input_shape) tensor
+    return [torch.rand(batch_size, *input_shape)]
+
+
+def get_init_inputs():  # the single constructor argument of Model
+    return [dim]
+
+
+class Model(nn.Module):
+    def __init__(self, dim):
+        super(Model, self).__init__()
+        self.dim = dim
+
+    def forward(self, x):
+        return kernel_function(x)
diff --git a/backends/triton/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py b/backends/triton/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py
new file mode 100644
index 0000000..ad4bb8a
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py
@@ -0,0 +1,128 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def get_autotune_configs():  # candidate configs for matmul_kernel's autotuner
+    return [
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+    ]
+
+
+@triton.autotune(
+    configs=get_autotune_configs(),
+    key=["M", "N", "K"],
+)
+@triton.jit
+def matmul_kernel(  # C (M x N) = A (M x K) @ B (K x N), one output tile per program
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    M,
+    N,
+    K,
+    stride_am: tl.constexpr,
+    stride_ak: tl.constexpr,
+    stride_bk: tl.constexpr,
+    stride_bn: tl.constexpr,
+    stride_cm: tl.constexpr,
+    stride_cn: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M)  # last group may be short
+    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    a_desc = tl.make_tensor_descriptor(
+        a_ptr,
+        shape=[M, K],
+        strides=[stride_am, stride_ak],
+        block_shape=[BLOCK_M, BLOCK_K],
+    )
+    b_desc = tl.make_tensor_descriptor(
+        b_ptr,
+        shape=[K, N],  # B is described with the K taken from A
+        strides=[stride_bk, stride_bn],
+        block_shape=[BLOCK_K, BLOCK_N],
+    )
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)  # float32 accumulator
+
+    for off_k in range(0, K, BLOCK_K):
+        a = a_desc.load([pid_m * BLOCK_M, off_k])
+        b = b_desc.load([off_k, pid_n * BLOCK_N])
+        acc += tl.dot(a, b)
+
+    c_desc = tl.make_tensor_descriptor(
+        c_ptr,
+        shape=[M, N],
+        strides=[stride_cm, stride_cn],
+        block_shape=[BLOCK_M, BLOCK_N],
+    )
+    c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty))
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        A = A.contiguous()
+        B = B.contiguous()
+        M, K = A.shape
+        _, N = B.shape
+        C = torch.empty((M, N), device=A.device, dtype=A.dtype)
+
+        grid = lambda META: (
+            triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),
+        )
+        matmul_kernel[grid](
+            A,
+            B,
+            C,
+            M,
+            N,
+            K,
+            A.stride(0),
+            A.stride(1),
+            B.stride(0),
+            B.stride(1),
+            C.stride(0),
+            C.stride(1),
+        )
+        return C
+
+
+M = 8205  # rows of A
+K = 2949  # columns of A and rows of B
+N = 5921  # columns of B
+
+
+def get_inputs():
+    A = torch.rand(M, K, dtype=torch.bfloat16)
+    B = torch.rand(K, N, dtype=torch.bfloat16)
+    return [A, B]
+
+
+def get_init_inputs():  # Model.__init__ takes no arguments
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/90_cumprod.py b/backends/triton/cpu/KernelBench/level1/90_cumprod.py
new file mode 100644
index 0000000..e0d9e1e
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/90_cumprod.py
@@ -0,0 +1,94 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _mul_combine(a, b):  # combine function for the product scan and reduction
+    return a * b
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 512}, num_warps=4),
+    ],
+    key=["N"],
+)
+@triton.jit
+def _cumprod_kernel(  # inclusive prefix product of one row per program
+    x_ptr,
+    out_ptr,
+    M,
+    N,
+    stride_m,
+    stride_n,
+    BLOCK_SIZE: tl.constexpr,
+):
+    row = tl.program_id(0)
+
+    running_prod = 1.0  # product of every block already processed
+
+    for col_start in range(0, N, BLOCK_SIZE):
+        cols = col_start + tl.arange(0, BLOCK_SIZE)
+        mask = cols < N
+
+        x = tl.load(  # masked lanes read 1.0 so they leave the product unchanged
+            x_ptr + row.to(tl.int64) * stride_m + cols * stride_n, mask=mask, other=1.0
+        ).to(tl.float32)
+
+        cum = tl.associative_scan(x, 0, _mul_combine)
+
+        cum = cum * running_prod  # scale by the carry from earlier blocks
+
+        tl.store(  # NOTE: out is addressed with the same strides as x
+            out_ptr + row.to(tl.int64) * stride_m + cols * stride_n, cum, mask=mask
+        )
+
+        block_product = tl.reduce(x, 0, _mul_combine)
+        running_prod = running_prod * block_product
+
+
+def cumprod_triton(x, dim):
+    assert dim == 1
+    M, N = x.shape
+    out = torch.empty_like(x)
+
+    grid = (M,)
+    _cumprod_kernel[grid](
+        x,
+        out,
+        M,
+        N,
+        x.stride(0),
+        x.stride(1),
+    )
+    return out
+
+
+class Model(nn.Module):  # cumulative product along a fixed dimension
+    def __init__(self, dim):
+        super(Model, self).__init__()
+        self.dim = dim  # dimension to scan, forwarded to cumprod_triton
+
+    def forward(self, x):
+        return cumprod_triton(x, self.dim)
+
+
+batch_size = 32768  # rows of the tensor built by get_inputs()
+input_shape = (32768,)  # trailing shape of that tensor
+dim = 1  # scan dimension handed to Model
+
+
+def get_inputs():  # one random (batch_size, *input_shape) tensor
+    return [torch.rand(batch_size, *input_shape)]
+
+
+def get_init_inputs():  # the single constructor argument of Model
+    return [dim]
diff --git a/backends/triton/cpu/KernelBench/level1/91_cumsum_reverse.py b/backends/triton/cpu/KernelBench/level1/91_cumsum_reverse.py
new file mode 100644
index 0000000..3d78bb9
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/91_cumsum_reverse.py
@@ -0,0 +1,91 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_N": 256}, num_warps=4),
+    ],
+    key=["N"],
+)
+@triton.jit
+def reverse_cumsum_kernel(  # per-row suffix sums: out[i] = x[i] + ... + x[N-1]
+    x_ptr,
+    out_ptr,
+    M,  # not read inside the kernel body
+    N: tl.constexpr,  # compile-time constant: the kernel is specialised per N
+    stride_xm,
+    stride_xn,
+    stride_om,
+    stride_on,
+    BLOCK_N: tl.constexpr,
+):
+    row = tl.program_id(0)
+    x_base = x_ptr + row.to(tl.int64) * stride_xm
+    out_base = out_ptr + row.to(tl.int64) * stride_om
+
+    # Pass 1: compute total sum of the row
+    total = 0.0
+    for col_start in range(0, N, BLOCK_N):
+        cols = col_start + tl.arange(0, BLOCK_N)
+        mask = cols < N
+        x = tl.load(x_base + cols * stride_xn, mask=mask, other=0.0).to(tl.float32)
+        total = total + tl.sum(x, axis=0)
+    # NOTE(review): total - prefix may lose precision for late columns; confirm.
+    # Pass 2: compute suffix sums (left-to-right)
+    # suffix[i] = total - exclusive_prefix_sum[i]
+    # where exclusive_prefix[i] = running_prefix + cumsum_inc[local_i] - x[local_i]
+    running_prefix = 0.0
+    for col_start in range(0, N, BLOCK_N):
+        cols = col_start + tl.arange(0, BLOCK_N)
+        mask = cols < N
+        x = tl.load(x_base + cols * stride_xn, mask=mask, other=0.0).to(tl.float32)
+        cumsum_inc = tl.cumsum(x, axis=0)  # inclusive prefix sum within the block
+        suffix = total - running_prefix - cumsum_inc + x
+        tl.store(out_base + cols * stride_on, suffix, mask=mask)
+        running_prefix = running_prefix + tl.sum(x, axis=0)
+
+
+class Model(nn.Module):  # launches reverse_cumsum_kernel over the rows of x
+    def __init__(self, dim):
+        super(Model, self).__init__()
+        self.dim = dim  # NOTE(review): stored but never read by forward()
+
+    def forward(self, x):
+        x = x.contiguous()
+        M, N = x.shape  # the unpacking requires a 2-D input
+        output = torch.empty_like(x)
+        # One program per row.
+        grid = (M,)
+        reverse_cumsum_kernel[grid](
+            x,
+            output,
+            M,
+            N,
+            x.stride(0),
+            x.stride(1),
+            output.stride(0),
+            output.stride(1),
+        )
+        return output
+
+
+batch_size = 32768  # leading dimension of the sample input
+input_shape = (32768,)  # remaining dimensions of the sample input
+dim = 1  # value returned by get_init_inputs()
+
+
+def get_inputs():  # one (batch_size, *input_shape) tensor of uniform [0, 1) values
+    return [torch.rand(batch_size, *input_shape)]
+
+
+def get_init_inputs():  # presumably Model's constructor arguments; confirm
+    return [dim]
diff --git a/backends/triton/cpu/KernelBench/level1/92_cumsum_exclusive.py b/backends/triton/cpu/KernelBench/level1/92_cumsum_exclusive.py
new file mode 100644
index 0000000..94e17ba
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/92_cumsum_exclusive.py
@@ -0,0 +1,88 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+    ],
+    key=["N"],
+)
+@triton.jit
+def exclusive_cumsum_kernel(  # per-row exclusive prefix sum: out[i] = x[0] + ... + x[i-1]
+    x_ptr,
+    out_ptr,
+    M,  # not read inside the kernel body
+    N,  # number of columns to scan
+    stride_xm,
+    stride_xn,
+    stride_om,
+    stride_on,
+    BLOCK_SIZE: tl.constexpr,
+):
+    row_idx = tl.program_id(0)
+    # Widen the row index to int64 before multiplying by the row stride.
+    x_row_ptr = x_ptr + row_idx.to(tl.int64) * stride_xm
+    o_row_ptr = out_ptr + row_idx.to(tl.int64) * stride_om
+    # Sum of all earlier blocks of this row, held as a 1-element tensor.
+    running_sum = tl.zeros([1], dtype=tl.float32)
+
+    for col_start in range(0, N, BLOCK_SIZE):
+        col_offsets = col_start + tl.arange(0, BLOCK_SIZE)
+        mask = col_offsets < N
+        # Lanes past N load 0.0, the additive identity.
+        x = tl.load(x_row_ptr + col_offsets * stride_xn, mask=mask, other=0.0).to(
+            tl.float32
+        )
+        # Exclusive scan = inclusive scan minus the element itself.
+        inclusive = tl.cumsum(x, axis=0)
+        exclusive = inclusive - x
+        # Add the carry from the preceding blocks.
+        result = exclusive + running_sum
+
+        tl.store(o_row_ptr + col_offsets * stride_on, result, mask=mask)
+        # Extend the carry with this block's total.
+        running_sum += tl.sum(x, axis=0)
+
+
+class Model(nn.Module):  # launches exclusive_cumsum_kernel over the rows of x
+    def __init__(self, dim):
+        super(Model, self).__init__()
+        self.dim = dim  # NOTE(review): stored but never read by forward()
+
+    def forward(self, x):
+        M, N = x.shape  # the unpacking requires a 2-D input
+        out = torch.empty_like(x)
+        grid = (M,)  # one program per row
+        exclusive_cumsum_kernel[grid](
+            x,
+            out,
+            M,
+            N,
+            x.stride(0),
+            x.stride(1),
+            out.stride(0),  # out gets its own strides, so x need not be contiguous
+            out.stride(1),
+        )
+        return out
+
+
+batch_size = 32768  # leading dimension of the sample input
+input_shape = (32768,)  # remaining dimensions of the sample input
+dim = 1  # value returned by get_init_inputs()
+
+
+def get_inputs():  # one (batch_size, *input_shape) tensor of uniform [0, 1) values
+    return [torch.rand(batch_size, *input_shape)]
+
+
+def get_init_inputs():  # presumably Model's constructor arguments; confirm
+    return [dim]
diff --git a/backends/triton/cpu/KernelBench/level1/93_masked_cumsum.py b/backends/triton/cpu/KernelBench/level1/93_masked_cumsum.py
new file mode 100644
index 0000000..b440482
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/93_masked_cumsum.py
@@ -0,0 +1,89 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def scan_add_op(a, b):  # combine function for tl.associative_scan (addition)
+    return a + b
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 512}, num_warps=4, num_stages=1),
+    ],
+    key=["N"],
+)
+@triton.jit
+def cumsum_kernel(  # per-row inclusive prefix sum; columns must have unit stride
+    x_ptr,
+    out_ptr,
+    M,  # not read inside the kernel body
+    N,  # number of columns to scan
+    stride_xm,
+    stride_om,
+    BLOCK_SIZE: tl.constexpr,
+):
+    row_idx = tl.program_id(0).to(tl.int64)  # int64: row * stride can pass 2**31
+    x_row = x_ptr + row_idx * stride_xm
+    o_row = out_ptr + row_idx * stride_om
+    # Sum of all earlier blocks of this row.
+    running_total = 0.0
+
+    for block_start in tl.range(0, N, BLOCK_SIZE):
+        offsets = block_start + tl.arange(0, BLOCK_SIZE)
+        col_mask = offsets < N
+        # Lanes past N load 0.0, the additive identity.
+        x = tl.load(x_row + offsets, mask=col_mask, other=0.0).to(tl.float32)
+        # Inclusive scan inside the block plus the carry from earlier blocks.
+        scanned = tl.associative_scan(x, axis=0, combine_fn=scan_add_op)
+        result = scanned + running_total
+
+        running_total = running_total + tl.sum(x, axis=0)
+        # Accumulate in float32, store in the output's element type.
+        tl.store(o_row + offsets, result.to(out_ptr.dtype.element_ty), mask=col_mask)
+
+
+class Model(nn.Module):  # cumulative sum of x * mask along each row
+    def __init__(self, dim):
+        super(Model, self).__init__()
+        self.dim = dim  # NOTE(review): stored but never read by forward()
+
+    def forward(self, x, mask):
+        masked = x.float() * mask.float()  # mask applied as a float32 multiply
+        masked = masked.contiguous()  # the kernel assumes unit column stride
+        M, N = masked.shape  # the unpacking requires a 2-D input
+        out = torch.empty_like(masked)  # float32, whatever x's dtype was
+
+        grid = (M,)  # one program per row
+        cumsum_kernel[grid](
+            masked,
+            out,
+            M,
+            N,
+            masked.stride(0),
+            out.stride(0),
+        )
+        return out
+
+
+batch_size = 32768  # leading dimension of the sample input
+input_shape = (32768,)  # remaining dimensions of the sample input
+dim = 1  # value returned by get_init_inputs()
+
+
+def get_inputs():  # sample values plus a random boolean mask of the same shape
+    x = torch.rand(batch_size, *input_shape)
+    mask = torch.randint(0, 2, x.shape).bool()  # randint draws 0 or 1
+    return [x, mask]
+
+
+def get_init_inputs():  # presumably Model's constructor arguments; confirm
+    return [dim]
diff --git a/backends/triton/cpu/KernelBench/level1/94_MSELoss.py b/backends/triton/cpu/KernelBench/level1/94_MSELoss.py
new file mode 100644
index 0000000..5111369
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/94_MSELoss.py
@@ -0,0 +1,107 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def get_autotune_configs():  # single-entry autotune search space
+    configs = [
+        triton.Config(
+            {"BLOCK_SIZE": 1024},
+            num_warps=4,
+            num_stages=1,
+        )
+    ]
+    return configs
+
+
+@triton.autotune(
+    configs=get_autotune_configs(),
+    key=["N_COLS"],
+)
+@triton.jit
+def mse_row_kernel(  # writes sum((pred - target) ** 2) of one row per program
+    pred_ptr,
+    target_ptr,
+    row_sums_ptr,
+    N_ROWS,  # not read inside the kernel body
+    N_COLS,
+    stride_pred_row,
+    stride_pred_col,
+    stride_target_row,
+    stride_target_col,
+    BLOCK_SIZE: tl.constexpr,
+):
+    row_idx = tl.program_id(0)
+    # Widen to int64 first: a 32-bit row * stride wraps past 2**31 elements.
+    pred_row_start = pred_ptr + row_idx.to(tl.int64) * stride_pred_row
+    target_row_start = target_ptr + row_idx.to(tl.int64) * stride_target_row
+    # Running sum of squared differences for this row.
+    acc = 0.0
+
+    for col_start in tl.range(0, N_COLS, BLOCK_SIZE):
+        cols = col_start + tl.arange(0, BLOCK_SIZE)
+        mask = cols < N_COLS
+        # Masked lanes load 0.0 on both sides, so their difference is 0.
+        pred_vals = tl.load(
+            pred_row_start + cols * stride_pred_col, mask=mask, other=0.0
+        ).to(tl.float32)
+        target_vals = tl.load(
+            target_row_start + cols * stride_target_col, mask=mask, other=0.0
+        ).to(tl.float32)
+
+        diff = pred_vals - target_vals
+        acc += tl.sum(diff * diff, axis=0)
+
+    tl.store(row_sums_ptr + row_idx, acc)
+
+
+class Model(nn.Module):  # mean squared error over all elements of a 2-D input
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, predictions, targets):
+        predictions = predictions.contiguous()
+        targets = targets.contiguous()
+
+        N_ROWS, N_COLS = predictions.shape  # the unpacking requires a 2-D input
+        # One float32 partial sum per row, filled in by the kernel.
+        row_sums = torch.empty(N_ROWS, device=predictions.device, dtype=torch.float32)
+
+        grid = (N_ROWS,)  # one program per row
+        mse_row_kernel[grid](
+            predictions,
+            targets,
+            row_sums,
+            N_ROWS,
+            N_COLS,
+            predictions.stride(0),
+            predictions.stride(1),
+            targets.stride(0),
+            targets.stride(1),
+        )
+        # Finish the reduction in PyTorch: total squared error / element count.
+        return torch.sum(row_sums) / (N_ROWS * N_COLS)
+
+
+batch_size = 32768  # leading dimension of the sample inputs
+input_shape = (32768,)  # remaining dimensions of the sample inputs
+dim = 1  # not referenced elsewhere in this file
+
+
+def get_inputs():  # [predictions, targets], both (batch_size, *input_shape)
+    scale = torch.rand(())  # random 0-d scale factor applied to the predictions
+    return [
+        torch.rand(batch_size, *input_shape) * scale,
+        torch.rand(batch_size, *input_shape),
+    ]
+
+
+def get_init_inputs():  # Model.__init__ takes no arguments
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py b/backends/triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py
new file mode 100644
index 0000000..6a9f8e8
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py
@@ -0,0 +1,107 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def _ce_configs():  # 3 BLOCK_N x 3 num_warps x 2 warp_size = 18 autotune configs
+    configs = []
+    for BN in [1024, 2048, 4096]:
+        for nw in [4, 8, 16]:
+            for ws in [16, 32]:
+                configs.append(  # warp_size is passed as a kernel meta-parameter
+                    triton.Config({"BLOCK_N": BN, "warp_size": ws}, num_warps=nw)
+                )
+    return configs
+
+
+@triton.autotune(configs=_ce_configs(), key=["N"])
+@triton.jit
+def _cross_entropy_online_kernel(  # per-row loss: logsumexp(logits) - logits[target]
+    logits_ptr,
+    targets_ptr,
+    losses_ptr,
+    N,  # number of columns (classes) per row
+    stride_lm,
+    stride_ln,
+    BLOCK_N: tl.constexpr,
+    warp_size: tl.constexpr,  # supplied by the autotune configs; not read in the body
+):
+    row = tl.program_id(0)
+    row_off = row.to(tl.int64) * stride_lm
+    # exp(x) is evaluated as exp2(x * LOG2E); log(x) as log2(x) * LN2.
+    LOG2E: tl.constexpr = 1.4426950408889634
+    LN2: tl.constexpr = 0.6931471805599453
+    # Single-pass state: max seen so far and sum of exp(x - max) seen so far.
+    running_max = -float("inf")
+    running_sum = 0.0
+
+    for start in range(0, N, BLOCK_N):
+        col_offs = start + tl.arange(0, BLOCK_N)
+        mask = col_offs < N
+        x = tl.load(  # masked lanes load -inf, so they add exp2(-inf) = 0
+            logits_ptr + row_off + col_offs * stride_ln, mask=mask, other=-float("inf")
+        ).to(tl.float32)
+
+        block_max = tl.max(x, axis=0)
+        new_max = tl.maximum(running_max, block_max)
+        # Rescale the old sum to the new maximum, then add this block's terms.
+        running_sum = running_sum * tl.math.exp2(
+            (running_max - new_max) * LOG2E
+        ) + tl.sum(tl.math.exp2((x - new_max) * LOG2E), axis=0)
+        running_max = new_max
+
+    log_sum_exp = tl.math.log2(running_sum) * LN2  # log of the rescaled sum
+    # NOTE(review): the target index is used without a range check; confirm.
+    target = tl.load(targets_ptr + row).to(tl.int64)
+    target_logit = tl.load(logits_ptr + row_off + target * stride_ln).to(tl.float32)
+
+    loss = -target_logit + running_max + log_sum_exp
+    tl.store(losses_ptr + row, loss)
+
+
+class Model(nn.Module):  # mean of the per-row cross-entropy losses
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, predictions, targets):
+        predictions = predictions.contiguous()
+        targets = targets.contiguous()  # read by the kernel as targets_ptr + row
+
+        M, N = predictions.shape  # the unpacking requires a 2-D input
+        losses = torch.empty(M, device=predictions.device, dtype=torch.float32)
+
+        grid = (M,)  # one program per row
+        _cross_entropy_online_kernel[grid](
+            predictions,
+            targets,
+            losses,
+            N,
+            predictions.stride(0),
+            predictions.stride(1),
+        )
+
+        return losses.mean()
+
+
+batch_size = 32768  # leading dimension of the sample inputs
+num_classes = 4096  # exclusive upper bound of the sampled target indices
+input_shape = (num_classes,)  # remaining dimensions of the sample logits
+dim = 1  # not referenced elsewhere in this file
+
+
+def get_inputs():  # [logits, integer targets]
+    return [
+        torch.rand(batch_size, *input_shape),
+        torch.randint(0, num_classes, (batch_size,)),
+    ]
+
+
+def get_init_inputs():  # Model.__init__ takes no arguments
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/96_HuberLoss.py b/backends/triton/cpu/KernelBench/level1/96_HuberLoss.py
new file mode 100644
index 0000000..30e4887
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/96_HuberLoss.py
@@ -0,0 +1,97 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 4096}, num_warps=4),
+    ],
+    key=["n_cols"],
+)
+@triton.jit
+def smooth_l1_row_kernel(  # writes one row's summed Huber loss (threshold 1.0)
+    predictions_ptr,
+    targets_ptr,
+    row_sums_ptr,
+    n_cols,
+    stride_pred,  # row stride; columns are read with unit stride
+    stride_targ,  # row stride; columns are read with unit stride
+    BLOCK_SIZE: tl.constexpr,
+):
+    row_idx = tl.program_id(0)
+    # Widen to int64 first: a 32-bit row * stride wraps past 2**31 elements.
+    pred_row_start = row_idx.to(tl.int64) * stride_pred
+    targ_row_start = row_idx.to(tl.int64) * stride_targ
+    # Running loss total for this row.
+    row_sum = 0.0
+
+    for col_start in range(0, n_cols, BLOCK_SIZE):
+        col_offsets = col_start + tl.arange(0, BLOCK_SIZE)
+        mask = col_offsets < n_cols
+        # Masked lanes load 0.0 on both sides, so their loss term is 0.
+        pred = tl.load(
+            predictions_ptr + pred_row_start + col_offsets, mask=mask, other=0.0
+        ).to(tl.float32)
+        targ = tl.load(
+            targets_ptr + targ_row_start + col_offsets, mask=mask, other=0.0
+        ).to(tl.float32)
+        # Quadratic below the threshold of 1.0, linear above it.
+        diff = pred - targ
+        abs_diff = tl.abs(diff)
+        loss = tl.where(abs_diff < 1.0, 0.5 * diff * diff, abs_diff - 0.5)
+
+        row_sum += tl.sum(loss, axis=0)
+
+    tl.store(row_sums_ptr + row_idx, row_sum.to(tl.float32))
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, predictions, targets):
+        """Mean Huber loss with a threshold of 1.0 over all elements."""
+        n_elements = predictions.numel()
+        # Treat the input as (rows, cols): a 1-D tensor is a single row and
+        # trailing dims are flattened. Launching one program per element of
+        # a 1-D input would make every program but the first read past the end.
+        n_rows = predictions.shape[0] if predictions.ndim > 1 else 1
+        n_cols = n_elements // n_rows if n_rows else 0
+        predictions = predictions.contiguous().view(n_rows, n_cols)
+        targets = targets.contiguous().view(n_rows, n_cols)
+
+        row_sums = torch.empty(n_rows, device=predictions.device, dtype=torch.float32)
+        smooth_l1_row_kernel[(n_rows,)](
+            predictions,
+            targets,
+            row_sums,
+            n_cols,
+            predictions.stride(0),
+            targets.stride(0),
+        )
+        return row_sums.sum() / n_elements
+
+
+batch_size = 32768  # leading dimension of the sample inputs
+input_shape = (32768,)  # remaining dimensions of the sample inputs
+dim = 1  # not referenced elsewhere in this file
+
+
+def get_inputs():  # [predictions, targets], both (batch_size, *input_shape)
+    scale = torch.rand(())  # random 0-d scale factor applied to the predictions
+    return [
+        torch.rand(batch_size, *input_shape) * scale,
+        torch.rand(batch_size, *input_shape),
+    ]
+
+
+def get_init_inputs():  # Model.__init__ takes no arguments
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/97_ScaledDotProductAttention.py b/backends/triton/cpu/KernelBench/level1/97_ScaledDotProductAttention.py
new file mode 100644
index 0000000..dc20d99
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/97_ScaledDotProductAttention.py
@@ -0,0 +1,280 @@
+# ruff: noqa: E731, E741
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import math
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64}, num_warps=16, num_stages=2
+        ),
+    ],
+    key=["SEQ_LEN", "HEAD_DIM"],
+)
+@triton.jit
+def _qk_gemm_kernel(  # S[b] = (Q[b] @ K[b]^T) * scale, one output tile per program
+    Q_ptr,
+    K_ptr,
+    S_ptr,
+    stride_qb,
+    stride_qm,
+    stride_qd,
+    stride_kb,
+    stride_kn,
+    stride_kd,
+    stride_sb,
+    stride_sm,
+    stride_sn,
+    SEQ_LEN,
+    HEAD_DIM,
+    scale,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    pid_b = tl.program_id(0)
+    pid_tile = tl.program_id(1)
+    # Split the flat tile index into (row tile, column tile) of S.
+    num_n = tl.cdiv(SEQ_LEN, BLOCK_N)
+    pid_m = pid_tile // num_n
+    pid_n = pid_tile % num_n
+
+    q_base = Q_ptr + pid_b.to(tl.int64) * stride_qb
+    k_base = K_ptr + pid_b.to(tl.int64) * stride_kb
+
+    Q_bp = tl.make_block_ptr(
+        base=q_base,
+        shape=(SEQ_LEN, HEAD_DIM),
+        strides=(stride_qm, stride_qd),
+        offsets=(pid_m * BLOCK_M, 0),
+        block_shape=(BLOCK_M, BLOCK_K),
+        order=(1, 0),
+    )
+    KT_bp = tl.make_block_ptr(  # K viewed as (HEAD_DIM, SEQ_LEN) by swapping strides
+        base=k_base,
+        shape=(HEAD_DIM, SEQ_LEN),
+        strides=(stride_kd, stride_kn),
+        offsets=(0, pid_n * BLOCK_N),
+        block_shape=(BLOCK_K, BLOCK_N),
+        order=(1, 0),
+    )
+    # Pad out-of-bounds elements with zeros so partial tiles add nothing.
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for _ in range(0, HEAD_DIM, BLOCK_K):
+        q = tl.load(Q_bp, boundary_check=(0, 1), padding_option="zero")
+        k_t = tl.load(KT_bp, boundary_check=(0, 1), padding_option="zero")
+        acc = tl.dot(q, k_t, acc)
+        Q_bp = tl.advance(Q_bp, (0, BLOCK_K))
+        KT_bp = tl.advance(KT_bp, (BLOCK_K, 0))
+
+    acc = acc * scale
+
+    s_base = S_ptr + pid_b.to(tl.int64) * stride_sb
+    S_bp = tl.make_block_ptr(
+        base=s_base,
+        shape=(SEQ_LEN, SEQ_LEN),
+        strides=(stride_sm, stride_sn),
+        offsets=(pid_m * BLOCK_M, pid_n * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    tl.store(S_bp, acc.to(S_ptr.dtype.element_ty), boundary_check=(0, 1))
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_K": 64, "BLOCK_N": 128}, num_warps=8, num_stages=3
+        ),
+    ],
+    key=["SEQ_LEN", "HEAD_DIM"],
+)
+@triton.jit
+def _fused_softmax_pv_kernel(  # O[b] = softmax(S[b], axis=-1) @ V[b], one tile per program
+    S_ptr,
+    V_ptr,
+    O_ptr,
+    stride_sb,
+    stride_sm,
+    stride_sn,
+    stride_vb,
+    stride_vn,
+    stride_vd,
+    stride_ob,
+    stride_om,
+    stride_od,
+    SEQ_LEN,
+    HEAD_DIM,
+    BLOCK_M: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    pid_b = tl.program_id(0)
+    pid_tile = tl.program_id(1)
+    # Split the flat tile index into (row tile, column tile) of the output.
+    num_n = tl.cdiv(HEAD_DIM, BLOCK_N)
+    pid_m = pid_tile // num_n
+    pid_n = pid_tile % num_n
+
+    s_base = S_ptr + pid_b.to(tl.int64) * stride_sb
+    v_base = V_ptr + pid_b.to(tl.int64) * stride_vb
+    off_m = pid_m * BLOCK_M
+
+    LOG2E = 1.4426950408889634
+    # Streaming-softmax state: row max, row sum of exponentials, weighted V sum.
+    m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
+    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+
+    for off_k in range(0, SEQ_LEN, BLOCK_K):
+        S_bp = tl.make_block_ptr(
+            base=s_base,
+            shape=(SEQ_LEN, SEQ_LEN),
+            strides=(stride_sm, stride_sn),
+            offsets=(off_m, off_k),
+            block_shape=(BLOCK_M, BLOCK_K),
+            order=(1, 0),
+        )
+        s = tl.load(S_bp, boundary_check=(0, 1), padding_option="zero").to(tl.float32)
+        # Key columns past SEQ_LEN must not take part in the softmax.
+        k_valid = (off_k + tl.arange(0, BLOCK_K)) < SEQ_LEN
+        s = tl.where(k_valid[None, :], s, float("-inf"))
+        m_new = tl.maximum(m_i, tl.max(s, axis=1))
+        alpha = tl.math.exp2((m_i - m_new) * LOG2E)
+        exp_s = tl.math.exp2((s - m_new[:, None]) * LOG2E)
+        l_i = alpha * l_i + tl.sum(exp_s, axis=1)
+        acc = acc * alpha[:, None]
+        m_i = m_new
+        # Zero-padded V rows pair with zero weights, so they add nothing.
+        V_bp = tl.make_block_ptr(
+            base=v_base,
+            shape=(SEQ_LEN, HEAD_DIM),
+            strides=(stride_vn, stride_vd),
+            offsets=(off_k, pid_n * BLOCK_N),
+            block_shape=(BLOCK_K, BLOCK_N),
+            order=(1, 0),
+        )
+        v = tl.load(V_bp, boundary_check=(0, 1), padding_option="zero")
+        acc = tl.dot(exp_s.to(v.dtype), v, acc)
+
+    acc = acc / l_i[:, None]
+
+    o_base = O_ptr + pid_b.to(tl.int64) * stride_ob
+    O_bp = tl.make_block_ptr(
+        base=o_base,
+        shape=(SEQ_LEN, HEAD_DIM),
+        strides=(stride_om, stride_od),
+        offsets=(off_m, pid_n * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    tl.store(O_bp, acc.to(O_ptr.dtype.element_ty), boundary_check=(0, 1))
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self._s_buf = None  # cached float32 score scratch, shape (B*H, S, S)
+        self._o_buf = None  # no longer used; kept so the attribute still exists
+
+    def forward(self, Q, K, V):
+        """Compute softmax(Q @ K^T / sqrt(D)) @ V for (B, H, S, D) inputs.
+
+        The result is a newly allocated (B, H, S, D) tensor with Q's dtype,
+        so it stays valid across later calls.
+        """
+        B, H, S, D = Q.shape
+        scale = 1.0 / math.sqrt(D)
+        device = Q.device
+        BH = B * H
+
+        Q = Q.reshape(BH, S, D).contiguous()
+        K = K.reshape(BH, S, D).contiguous()
+        V = V.reshape(BH, S, D).contiguous()
+
+        # The score matrix is internal scratch, so reusing it is safe.
+        if (
+            self._s_buf is None
+            or self._s_buf.shape != (BH, S, S)
+            or self._s_buf.device != device
+        ):
+            self._s_buf = torch.empty(BH, S, S, device=device, dtype=torch.float32)
+        S_mat = self._s_buf
+
+        grid1 = lambda META: (
+            BH,
+            triton.cdiv(S, META["BLOCK_M"]) * triton.cdiv(S, META["BLOCK_N"]),
+        )
+        _qk_gemm_kernel[grid1](
+            Q,
+            K,
+            S_mat,
+            Q.stride(0),
+            Q.stride(1),
+            Q.stride(2),
+            K.stride(0),
+            K.stride(1),
+            K.stride(2),
+            S_mat.stride(0),
+            S_mat.stride(1),
+            S_mat.stride(2),
+            S,
+            D,
+            scale,
+        )
+
+        # The output is returned to the caller, so allocate it per call:
+        # handing out a view of a cached buffer would let the next forward()
+        # overwrite a result the caller still holds.
+        O = torch.empty(BH, S, D, device=device, dtype=Q.dtype)
+
+        grid2 = lambda META: (
+            BH,
+            triton.cdiv(S, META["BLOCK_M"]) * triton.cdiv(D, META["BLOCK_N"]),
+        )
+        _fused_softmax_pv_kernel[grid2](
+            S_mat,
+            V,
+            O,
+            S_mat.stride(0),
+            S_mat.stride(1),
+            S_mat.stride(2),
+            V.stride(0),
+            V.stride(1),
+            V.stride(2),
+            O.stride(0),
+            O.stride(1),
+            O.stride(2),
+            S,
+            D,
+        )
+
+        result = O.reshape(B, H, S, D)
+
+        return result
+
+
+batch_size = 32  # first dimension of Q, K and V
+num_heads = 32  # second dimension of Q, K and V
+sequence_length = 512  # third dimension of Q, K and V
+embedding_dimension = 1024  # last dimension of Q, K and V
+
+
+def get_inputs():  # three independent uniform [0, 1) tensors of identical shape
+    Q = torch.rand(batch_size, num_heads, sequence_length, embedding_dimension)
+    K = torch.rand(batch_size, num_heads, sequence_length, embedding_dimension)
+    V = torch.rand(batch_size, num_heads, sequence_length, embedding_dimension)
+    return [Q, K, V]
+
+
+def get_init_inputs():  # Model.__init__ takes no arguments
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py b/backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py
new file mode 100644
index 0000000..c0ed1ab
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py
@@ -0,0 +1,103 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+batch_size = 8192 * 2  # leading dimension of the sample inputs
+input_shape = (8192 * 2,)  # remaining dimensions of the sample inputs
+dim = 1  # not referenced elsewhere in this file
+
+
+def get_kl_div_configs():  # single-entry autotune search space
+    configs = [
+        triton.Config(
+            {"BLOCK_SIZE": 4096},
+            num_warps=4,
+            num_stages=1,
+        )
+    ]
+    return configs
+
+
+@triton.autotune(
+    configs=get_kl_div_configs(),
+    key=["n_cols"],
+)
+@triton.jit
+def kl_div_row_kernel(  # writes sum(target * log(target / pred)) of one row
+    pred_ptr,
+    target_ptr,
+    out_ptr,
+    n_cols,
+    stride_pred_row,  # row stride; columns are read with unit stride
+    stride_target_row,  # row stride; columns are read with unit stride
+    BLOCK_SIZE: tl.constexpr,
+):
+    row_idx = tl.program_id(0)
+    pred_row_start = pred_ptr + row_idx.to(tl.int64) * stride_pred_row
+    target_row_start = target_ptr + row_idx.to(tl.int64) * stride_target_row
+    # int64 above: a 32-bit row * stride wraps past 2**31 elements.
+    acc = 0.0
+
+    for col_start in tl.range(0, n_cols, BLOCK_SIZE):
+        col_offsets = col_start + tl.arange(0, BLOCK_SIZE)
+        mask = col_offsets < n_cols
+        # Masked lanes load pred = 1.0 and target = 0.0, which contribute 0.
+        pred_vals = tl.load(pred_row_start + col_offsets, mask=mask, other=1.0).to(
+            tl.float32
+        )
+        target_vals = tl.load(target_row_start + col_offsets, mask=mask, other=0.0).to(
+            tl.float32
+        )
+        # Terms with target == 0 are defined as 0 instead of 0 * log(0).
+        kl_vals = tl.where(
+            target_vals > 0, target_vals * tl.log(target_vals / pred_vals), 0.0
+        )
+        acc += tl.sum(kl_vals, axis=0)
+
+    tl.store(out_ptr + row_idx, acc)
+
+
+def kernel_function(predictions: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
+    """Return sum(targets * log(targets / predictions)) / batch size."""
+    # The kernel assumes unit column stride. Copying non-contiguous inputs
+    # (a no-op otherwise) replaces an assert that `python -O` would strip.
+    predictions = predictions.contiguous()
+    targets = targets.contiguous()
+    B, N = predictions.shape
+    row_sums = torch.empty(B, device=predictions.device, dtype=torch.float32)
+    kl_div_row_kernel[(B,)](
+        predictions,
+        targets,
+        row_sums,
+        N,
+        predictions.stride(0),
+        targets.stride(0),
+    )
+    return row_sums.sum() / B
+
+
+class Model(nn.Module):  # nn.Module wrapper around kernel_function
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, predictions, targets):
+        return kernel_function(predictions, targets)
+
+
+def get_inputs():  # [predictions, targets]; each row is softmax-normalised
+    scale = torch.rand(())  # random 0-d scale applied before the softmax
+    return [
+        (torch.rand(batch_size, *input_shape) * scale).softmax(dim=-1),
+        torch.rand(batch_size, *input_shape).softmax(dim=-1),
+    ]
+
+
+def get_init_inputs():  # Model.__init__ takes no arguments
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/99_TripletMarginLoss.py b/backends/triton/cpu/KernelBench/level1/99_TripletMarginLoss.py
new file mode 100644
index 0000000..237d264
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/99_TripletMarginLoss.py
@@ -0,0 +1,127 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_K": 256}, num_warps=4, num_stages=2),
+    ],
+    key=["D"],
+)
+@triton.jit
+def _triplet_margin_loss_kernel(  # adds max(d(a, p) - d(a, n) + margin, 0) per row
+    anchor_ptr,
+    positive_ptr,
+    negative_ptr,
+    loss_ptr,  # single accumulator updated with atomic_add by every program
+    B,
+    D,
+    stride_ab,
+    stride_ad,
+    stride_pb,
+    stride_pd,
+    stride_nb,
+    stride_nd,
+    margin,
+    eps,
+    BLOCK_K: tl.constexpr,
+):
+    row = tl.program_id(0)
+    if row >= B:
+        return
+
+    base_a = row.to(tl.int64) * stride_ab
+    base_p = row.to(tl.int64) * stride_pb
+    base_n = row.to(tl.int64) * stride_nb
+    # Running sums of squared differences for the two distances.
+    sum_pos_sq = 0.0
+    sum_neg_sq = 0.0
+
+    for k_start in range(0, D, BLOCK_K):
+        offs_k = k_start + tl.arange(0, BLOCK_K)
+        mask_k = offs_k < D
+
+        a = tl.load(
+            anchor_ptr + base_a + offs_k * stride_ad, mask=mask_k, other=0.0
+        ).to(tl.float32)
+        p = tl.load(
+            positive_ptr + base_p + offs_k * stride_pd, mask=mask_k, other=0.0
+        ).to(tl.float32)
+        n = tl.load(
+            negative_ptr + base_n + offs_k * stride_nd, mask=mask_k, other=0.0
+        ).to(tl.float32)
+        # Zero the padded lanes explicitly: they load 0 but would still add eps.
+        diff_pos = tl.where(mask_k, a - p + eps, 0.0)
+        diff_neg = tl.where(mask_k, a - n + eps, 0.0)
+        sum_pos_sq += tl.sum(diff_pos * diff_pos, axis=0)
+        sum_neg_sq += tl.sum(diff_neg * diff_neg, axis=0)
+
+    d_pos = tl.sqrt(sum_pos_sq)
+    d_neg = tl.sqrt(sum_neg_sq)
+
+    loss_val = tl.maximum(d_pos - d_neg + margin, 0.0)
+
+    tl.atomic_add(loss_ptr, loss_val, sem="relaxed")
+
+
+class Model(nn.Module):  # batch mean of the per-row triplet margin loss
+    def __init__(self, margin=1.0):
+        super(Model, self).__init__()
+        self.margin = margin
+        self.eps = 1e-6  # added to each element-wise difference inside the kernel
+
+    def forward(self, anchor, positive, negative):
+        device = anchor.device
+        B, D = anchor.shape  # the unpacking requires a 2-D input
+
+        anchor = anchor.contiguous()
+        positive = positive.contiguous()
+        negative = negative.contiguous()
+        # Zero-initialised 0-d accumulator: the kernel adds to it atomically.
+        loss_accum = torch.zeros((), device=device, dtype=torch.float32)
+
+        grid = (B,)  # one program per row
+        _triplet_margin_loss_kernel[grid](
+            anchor,
+            positive,
+            negative,
+            loss_accum,
+            B,
+            D,
+            anchor.stride(0),
+            anchor.stride(1),
+            positive.stride(0),
+            positive.stride(1),
+            negative.stride(0),
+            negative.stride(1),
+            self.margin,
+            self.eps,
+        )
+
+        return loss_accum / B
+
+
+batch_size = 32768  # leading dimension of the sample inputs
+input_shape = (8192,)  # remaining dimensions of the sample inputs
+dim = 1  # not referenced elsewhere in this file
+
+
+def get_inputs():  # [anchor, positive, negative], all (batch_size, *input_shape)
+    scale = torch.rand(())  # random 0-d scale factor applied to the anchor
+    return [
+        torch.rand(batch_size, *input_shape) * scale,
+        torch.rand(batch_size, *input_shape),
+        torch.rand(batch_size, *input_shape),
+    ]
+
+
+def get_init_inputs():  # presumably the margin passed to Model(margin); confirm
+    return [1.0]
diff --git a/backends/triton/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py b/backends/triton/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py
new file mode 100644
index 0000000..152975e
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py
@@ -0,0 +1,141 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def swizzle_tile(  # map a flat tile id to (pid_m, pid_n), GROUP_SIZE_M tile rows at a time
+    tile_id,
+    M,
+    N,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    grid_m = tl.cdiv(M, BLOCK_M)  # number of tile rows
+    grid_n = tl.cdiv(N, BLOCK_N)  # number of tile columns
+    width = GROUP_SIZE_M * grid_n  # tiles in one full group
+    group_id = tile_id // width
+    group_size = tl.minimum(GROUP_SIZE_M, grid_m - group_id * GROUP_SIZE_M)  # last group may be short
+    pid_m = group_id * GROUP_SIZE_M + (tile_id % group_size)
+    pid_n = (tile_id % width) // group_size
+    return pid_m, pid_n
+
+
+def _configs():  # single-entry autotune search space
+    return [
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
+            num_warps=16,
+            num_stages=2,
+        ),
+    ]
+
+
+@triton.autotune(configs=_configs(), key=["M", "N", "K"])
+@triton.jit
+def _matmul_kernel(  # C = A @ B, one (BLOCK_M, BLOCK_N) output tile per program
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    M,
+    N,
+    K,
+    stride_am,
+    stride_ak: tl.constexpr,
+    stride_bk,
+    stride_bn: tl.constexpr,
+    stride_cm,
+    stride_cn: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    pid_m, pid_n = swizzle_tile(pid, M, N, BLOCK_M, BLOCK_N, GROUP_SIZE_M)
+    # Tensor descriptors for A (M, K) and B (K, N), loaded tile by tile below.
+    a_desc = tl.make_tensor_descriptor(
+        base=a_ptr,
+        shape=(M, K),
+        strides=(stride_am, stride_ak),
+        block_shape=(BLOCK_M, BLOCK_K),
+    )
+    b_desc = tl.make_tensor_descriptor(
+        base=b_ptr,
+        shape=(K, N),
+        strides=(stride_bk, stride_bn),
+        block_shape=(BLOCK_K, BLOCK_N),
+    )
+    # Element offsets of this program's output tile; accumulate in float32.
+    off_m = pid_m * BLOCK_M
+    off_n = pid_n * BLOCK_N
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    # Step through K in BLOCK_K slices, adding each partial product.
+    for off_k in range(0, K, BLOCK_K):
+        a_tile = a_desc.load([off_m, off_k])
+        b_tile = b_desc.load([off_k, off_n])
+        acc += tl.dot(a_tile, b_tile)
+    c_desc = tl.make_tensor_descriptor(
+        base=c_ptr,
+        shape=(M, N),
+        strides=(stride_cm, stride_cn),
+        block_shape=(BLOCK_M, BLOCK_N),
+    )
+    c_desc.store([off_m, off_n], acc.to(c_ptr.type.element_ty))  # cast to C's dtype
+
+
+class Model(nn.Module):  # C = A @ B computed by _matmul_kernel
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, A, B):
+        device = A.device
+        A = A.contiguous()
+        B = B.contiguous()
+
+        M_out, K = A.shape  # the unpacking requires a 2-D A
+        N_out = B.shape[1]  # NOTE(review): B.shape[0] is not checked against K
+
+        C = torch.empty((M_out, N_out), device=device, dtype=A.dtype)
+        # One program per output tile; the tile shape comes from the config.
+        grid = lambda META: (
+            triton.cdiv(M_out, META["BLOCK_M"]) * triton.cdiv(N_out, META["BLOCK_N"]),
+        )
+
+        _matmul_kernel[grid](
+            A,
+            B,
+            C,
+            M_out,
+            N_out,
+            K,
+            A.stride(0),
+            A.stride(1),
+            B.stride(0),
+            B.stride(1),
+            C.stride(0),
+            C.stride(1),
+        )
+        return C
+
+
+M = 16384 * 2  # long dimension of the sample matrices
+N = 16 * 2  # short dimension; it is the shared inner dimension of A @ B
+
+
+def get_inputs():  # A is (M, N) and B is (N, M), both bfloat16
+    A = torch.rand(M, N, dtype=torch.bfloat16)
+    B = torch.rand(N, M, dtype=torch.bfloat16)
+    return [A, B]
+
+
+def get_init_inputs():  # Model.__init__ takes no arguments
+    return []
diff --git a/problems/specs/KernelBench/level1/100_HingeLoss.yaml b/problems/specs/KernelBench/level1/100_HingeLoss.yaml
index 8f21a60..441ff9c 100644
--- a/problems/specs/KernelBench/level1/100_HingeLoss.yaml
+++ b/problems/specs/KernelBench/level1/100_HingeLoss.yaml
@@ -15,3 +15,10 @@ ci:
     dims:
       BATCH_SIZE: 64
       INPUT_DIM: 64
+
+simple-cpu:
+  - params: [predictions, targets]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 1024
+      INPUT_DIM: 1024
diff --git a/problems/specs/KernelBench/level1/10_3D_tensor_matrix_multiplication.yaml b/problems/specs/KernelBench/level1/10_3D_tensor_matrix_multiplication.yaml
index a72791f..43eabc9 100644
--- a/problems/specs/KernelBench/level1/10_3D_tensor_matrix_multiplication.yaml
+++ b/problems/specs/KernelBench/level1/10_3D_tensor_matrix_multiplication.yaml
@@ -16,6 +16,16 @@ ci:
       L: 24
     flop: "2*N*M*L*K"
 
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      N: 4
+      M: 32
+      K: 64
+      L: 24
+    flop: "2*N*M*L*K"
+
 bench-gpu:
   - params: [A, B]
     dtype: float16
diff --git a/problems/specs/KernelBench/level1/11_4D_tensor_matrix_multiplication.yaml b/problems/specs/KernelBench/level1/11_4D_tensor_matrix_multiplication.yaml
index e82b215..d0c5522 100644
--- a/problems/specs/KernelBench/level1/11_4D_tensor_matrix_multiplication.yaml
+++ b/problems/specs/KernelBench/level1/11_4D_tensor_matrix_multiplication.yaml
@@ -17,6 +17,17 @@ ci:
       K: 24
     flop: "2*B*I*J*K*L"
 
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      B: 2
+      I: 32
+      J: 64
+      L: 32
+      K: 24
+    flop: "2*B*I*J*K*L"
+
 bench-gpu:
   - params: [A, B]
     dtype: float16
diff --git a/problems/specs/KernelBench/level1/12_Matmul_with_diagonal_matrices_.yaml b/problems/specs/KernelBench/level1/12_Matmul_with_diagonal_matrices_.yaml
index d5f9f78..1e7a076 100644
--- a/problems/specs/KernelBench/level1/12_Matmul_with_diagonal_matrices_.yaml
+++ b/problems/specs/KernelBench/level1/12_Matmul_with_diagonal_matrices_.yaml
@@ -14,6 +14,14 @@ ci:
       N: 128
     flop: "2*M*N"
 
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      M: 128
+      N: 128
+    flop: "2*M*N"
+
 bench-gpu:
   - params: [A, B]
     dtype: float16
diff --git a/problems/specs/KernelBench/level1/13_Matmul_for_symmetric_matrices.yaml b/problems/specs/KernelBench/level1/13_Matmul_for_symmetric_matrices.yaml
index 66a59c3..9808093 100644
--- a/problems/specs/KernelBench/level1/13_Matmul_for_symmetric_matrices.yaml
+++ b/problems/specs/KernelBench/level1/13_Matmul_for_symmetric_matrices.yaml
@@ -15,3 +15,9 @@ ci:
     dtype: float32
     dims:
       N: 64
+
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      N: 64
diff --git a/problems/specs/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.yaml b/problems/specs/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.yaml
index 7369aff..b29c1f6 100644
--- a/problems/specs/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.yaml
+++ b/problems/specs/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.yaml
@@ -15,3 +15,9 @@ ci:
     dtype: float32
     dims:
       N: 64
+
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      N: 64
diff --git a/problems/specs/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.yaml b/problems/specs/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.yaml
index cafb725..f6043d4 100644
--- a/problems/specs/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.yaml
+++ b/problems/specs/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.yaml
@@ -15,3 +15,9 @@ ci:
     dtype: float32
     dims:
       M: 64
+
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      M: 64
diff --git a/problems/specs/KernelBench/level1/16_Matmul_with_transposed_A.yaml b/problems/specs/KernelBench/level1/16_Matmul_with_transposed_A.yaml
index e665dcc..3c22b31 100644
--- a/problems/specs/KernelBench/level1/16_Matmul_with_transposed_A.yaml
+++ b/problems/specs/KernelBench/level1/16_Matmul_with_transposed_A.yaml
@@ -15,6 +15,15 @@ ci:
       K: 256
     flop: "2*M*N*K"
 
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      M: 64
+      N: 128
+      K: 256
+    flop: "2*M*N*K"
+
 bench-gpu:
   - params: [A, B]
     dtype: float16
diff --git a/problems/specs/KernelBench/level1/17_Matmul_with_transposed_B.yaml b/problems/specs/KernelBench/level1/17_Matmul_with_transposed_B.yaml
index 9d984be..1e1e095 100644
--- a/problems/specs/KernelBench/level1/17_Matmul_with_transposed_B.yaml
+++ b/problems/specs/KernelBench/level1/17_Matmul_with_transposed_B.yaml
@@ -15,6 +15,15 @@ ci:
       K: 256
     flop: "2*M*N*K"
 
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      M: 64
+      N: 128
+      K: 256
+    flop: "2*M*N*K"
+
 bench-gpu:
   - params: [A, B]
     dtype: float16
diff --git a/problems/specs/KernelBench/level1/18_Matmul_with_transposed_both.yaml b/problems/specs/KernelBench/level1/18_Matmul_with_transposed_both.yaml
index 0ec07fb..d684c28 100644
--- a/problems/specs/KernelBench/level1/18_Matmul_with_transposed_both.yaml
+++ b/problems/specs/KernelBench/level1/18_Matmul_with_transposed_both.yaml
@@ -15,6 +15,15 @@ ci:
       K: 256
     flop: "2*N*M*K"
 
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      M: 64
+      N: 128
+      K: 256
+    flop: "2*N*M*K"
+
 bench-gpu:
   - params: [A, B]
     dtype: float16
diff --git a/problems/specs/KernelBench/level1/19_ReLU.yaml b/problems/specs/KernelBench/level1/19_ReLU.yaml
index 9fc614e..3ccc889 100644
--- a/problems/specs/KernelBench/level1/19_ReLU.yaml
+++ b/problems/specs/KernelBench/level1/19_ReLU.yaml
@@ -10,6 +10,13 @@ ci:
       BATCH: 128
       DIM: 512
 
+simple-cpu:
+  - params: [X]
+    dtype: float32
+    dims:
+      BATCH: 128
+      DIM: 512
+
 bench-cpu:
   - params: [X]
     dtype: bfloat16
diff --git a/problems/specs/KernelBench/level1/1_Square_matrix_multiplication_.yaml b/problems/specs/KernelBench/level1/1_Square_matrix_multiplication_.yaml
index 3a0fc9c..2f5de5b 100644
--- a/problems/specs/KernelBench/level1/1_Square_matrix_multiplication_.yaml
+++ b/problems/specs/KernelBench/level1/1_Square_matrix_multiplication_.yaml
@@ -12,6 +12,12 @@ ci:
     dims:
       N: 128
 
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      N: 128
+
 bench-cpu:
   - params: [A, B]
     dtype: float32
diff --git a/problems/specs/KernelBench/level1/20_LeakyReLU.yaml b/problems/specs/KernelBench/level1/20_LeakyReLU.yaml
index 7071370..bed06d7 100644
--- a/problems/specs/KernelBench/level1/20_LeakyReLU.yaml
+++ b/problems/specs/KernelBench/level1/20_LeakyReLU.yaml
@@ -11,6 +11,14 @@ ci:
       DIM: 512
     flop: "2*BATCH*DIM"
 
+simple-cpu:
+  - params: [X]
+    dtype: float32
+    dims:
+      BATCH: 128
+      DIM: 512
+    flop: "2*BATCH*DIM"
+
 bench-gpu:
   - params: [X]
     dtype: float16
diff --git a/problems/specs/KernelBench/level1/21_Sigmoid.yaml b/problems/specs/KernelBench/level1/21_Sigmoid.yaml
index 0bbba8e..7f40ac7 100644
--- a/problems/specs/KernelBench/level1/21_Sigmoid.yaml
+++ b/problems/specs/KernelBench/level1/21_Sigmoid.yaml
@@ -11,3 +11,10 @@ ci:
     dims:
       BATCH_SIZE: 128
       DIM: 512
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 128
+      DIM: 512
diff --git a/problems/specs/KernelBench/level1/22_Tanh.yaml b/problems/specs/KernelBench/level1/22_Tanh.yaml
index 0bbba8e..7f40ac7 100644
--- a/problems/specs/KernelBench/level1/22_Tanh.yaml
+++ b/problems/specs/KernelBench/level1/22_Tanh.yaml
@@ -11,3 +11,10 @@ ci:
     dims:
       BATCH_SIZE: 128
       DIM: 512
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 128
+      DIM: 512
diff --git a/problems/specs/KernelBench/level1/23_Softmax.yaml b/problems/specs/KernelBench/level1/23_Softmax.yaml
index 0bbba8e..ef98da3 100644
--- a/problems/specs/KernelBench/level1/23_Softmax.yaml
+++ b/problems/specs/KernelBench/level1/23_Softmax.yaml
@@ -11,3 +11,10 @@ ci:
     dims:
       BATCH_SIZE: 128
       DIM: 512
+
+simple-cpu:
+  - params: [x]
+    dtype: bfloat16
+    dims:
+      BATCH_SIZE: 128
+      DIM: 512
diff --git a/problems/specs/KernelBench/level1/24_LogSoftmax.yaml b/problems/specs/KernelBench/level1/24_LogSoftmax.yaml
index 0bbba8e..7f40ac7 100644
--- a/problems/specs/KernelBench/level1/24_LogSoftmax.yaml
+++ b/problems/specs/KernelBench/level1/24_LogSoftmax.yaml
@@ -11,3 +11,10 @@ ci:
     dims:
       BATCH_SIZE: 128
       DIM: 512
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 128
+      DIM: 512
diff --git a/problems/specs/KernelBench/level1/25_Swish.yaml b/problems/specs/KernelBench/level1/25_Swish.yaml
index 0bbba8e..7f40ac7 100644
--- a/problems/specs/KernelBench/level1/25_Swish.yaml
+++ b/problems/specs/KernelBench/level1/25_Swish.yaml
@@ -11,3 +11,10 @@ ci:
     dims:
       BATCH_SIZE: 128
       DIM: 512
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 128
+      DIM: 512
diff --git a/problems/specs/KernelBench/level1/26_GELU_.yaml b/problems/specs/KernelBench/level1/26_GELU_.yaml
index 0bbba8e..7f40ac7 100644
--- a/problems/specs/KernelBench/level1/26_GELU_.yaml
+++ b/problems/specs/KernelBench/level1/26_GELU_.yaml
@@ -11,3 +11,10 @@ ci:
     dims:
       BATCH_SIZE: 128
       DIM: 512
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 128
+      DIM: 512
diff --git a/problems/specs/KernelBench/level1/27_SELU_.yaml b/problems/specs/KernelBench/level1/27_SELU_.yaml
index 0bbba8e..7f40ac7 100644
--- a/problems/specs/KernelBench/level1/27_SELU_.yaml
+++ b/problems/specs/KernelBench/level1/27_SELU_.yaml
@@ -11,3 +11,10 @@ ci:
     dims:
       BATCH_SIZE: 128
       DIM: 512
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 128
+      DIM: 512
diff --git a/problems/specs/KernelBench/level1/28_HardSigmoid.yaml b/problems/specs/KernelBench/level1/28_HardSigmoid.yaml
index 0bbba8e..7f40ac7 100644
--- a/problems/specs/KernelBench/level1/28_HardSigmoid.yaml
+++ b/problems/specs/KernelBench/level1/28_HardSigmoid.yaml
@@ -11,3 +11,10 @@ ci:
     dims:
       BATCH_SIZE: 128
       DIM: 512
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 128
+      DIM: 512
diff --git a/problems/specs/KernelBench/level1/29_Softplus.yaml b/problems/specs/KernelBench/level1/29_Softplus.yaml
index 0bbba8e..7f40ac7 100644
--- a/problems/specs/KernelBench/level1/29_Softplus.yaml
+++ b/problems/specs/KernelBench/level1/29_Softplus.yaml
@@ -11,3 +11,10 @@ ci:
     dims:
       BATCH_SIZE: 128
       DIM: 512
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 128
+      DIM: 512
diff --git a/problems/specs/KernelBench/level1/2_Standard_matrix_multiplication_.yaml b/problems/specs/KernelBench/level1/2_Standard_matrix_multiplication_.yaml
index 0ba0f91..f4c1f66 100644
--- a/problems/specs/KernelBench/level1/2_Standard_matrix_multiplication_.yaml
+++ b/problems/specs/KernelBench/level1/2_Standard_matrix_multiplication_.yaml
@@ -14,6 +14,14 @@ ci:
       N: 128
       K: 256
 
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      M: 64
+      N: 128
+      K: 256
+
 bench-cpu:
   - params: [A, B]
     dtype: bfloat16
diff --git a/problems/specs/KernelBench/level1/30_Softsign.yaml b/problems/specs/KernelBench/level1/30_Softsign.yaml
index 0bbba8e..7f40ac7 100644
--- a/problems/specs/KernelBench/level1/30_Softsign.yaml
+++ b/problems/specs/KernelBench/level1/30_Softsign.yaml
@@ -11,3 +11,10 @@ ci:
     dims:
       BATCH_SIZE: 128
       DIM: 512
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 128
+      DIM: 512
diff --git a/problems/specs/KernelBench/level1/31_ELU.yaml b/problems/specs/KernelBench/level1/31_ELU.yaml
index 451772e..092c0ed 100644
--- a/problems/specs/KernelBench/level1/31_ELU.yaml
+++ b/problems/specs/KernelBench/level1/31_ELU.yaml
@@ -13,3 +13,11 @@ ci:
       BATCH_SIZE: 128
       DIM: 512
       ALPHA: 1.0
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 128
+      DIM: 512
+      ALPHA: 1.0
diff --git a/problems/specs/KernelBench/level1/32_HardTanh.yaml b/problems/specs/KernelBench/level1/32_HardTanh.yaml
index 0bbba8e..7f40ac7 100644
--- a/problems/specs/KernelBench/level1/32_HardTanh.yaml
+++ b/problems/specs/KernelBench/level1/32_HardTanh.yaml
@@ -11,3 +11,10 @@ ci:
     dims:
       BATCH_SIZE: 128
       DIM: 512
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 128
+      DIM: 512
diff --git a/problems/specs/KernelBench/level1/33_BatchNorm.yaml b/problems/specs/KernelBench/level1/33_BatchNorm.yaml
index 2c34dfb..2287cb0 100644
--- a/problems/specs/KernelBench/level1/33_BatchNorm.yaml
+++ b/problems/specs/KernelBench/level1/33_BatchNorm.yaml
@@ -14,3 +14,12 @@ ci:
       FEATURES: 4
       DIM1: 64
       DIM2: 64
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      FEATURES: 4
+      DIM1: 64
+      DIM2: 64
diff --git a/problems/specs/KernelBench/level1/34_InstanceNorm.yaml b/problems/specs/KernelBench/level1/34_InstanceNorm.yaml
index 2c34dfb..2287cb0 100644
--- a/problems/specs/KernelBench/level1/34_InstanceNorm.yaml
+++ b/problems/specs/KernelBench/level1/34_InstanceNorm.yaml
@@ -14,3 +14,12 @@ ci:
       FEATURES: 4
       DIM1: 64
       DIM2: 64
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      FEATURES: 4
+      DIM1: 64
+      DIM2: 64
diff --git a/problems/specs/KernelBench/level1/35_GroupNorm_.yaml b/problems/specs/KernelBench/level1/35_GroupNorm_.yaml
index da07568..f1f03fa 100644
--- a/problems/specs/KernelBench/level1/35_GroupNorm_.yaml
+++ b/problems/specs/KernelBench/level1/35_GroupNorm_.yaml
@@ -16,3 +16,13 @@ ci:
       NUM_GROUPS: 4
       DIM1: 32
       DIM2: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      FEATURES: 8
+      NUM_GROUPS: 4
+      DIM1: 32
+      DIM2: 32
diff --git a/problems/specs/KernelBench/level1/36_RMSNorm_.yaml b/problems/specs/KernelBench/level1/36_RMSNorm_.yaml
index 2c34dfb..5a5ed85 100644
--- a/problems/specs/KernelBench/level1/36_RMSNorm_.yaml
+++ b/problems/specs/KernelBench/level1/36_RMSNorm_.yaml
@@ -14,3 +14,12 @@ ci:
       FEATURES: 4
       DIM1: 64
       DIM2: 64
+
+simple-cpu:
+  - params: [x]
+    dtype: bfloat16
+    dims:
+      BATCH_SIZE: 2
+      FEATURES: 4
+      DIM1: 64
+      DIM2: 64
diff --git a/problems/specs/KernelBench/level1/37_FrobeniusNorm_.yaml b/problems/specs/KernelBench/level1/37_FrobeniusNorm_.yaml
index 72d91ec..3cbc627 100644
--- a/problems/specs/KernelBench/level1/37_FrobeniusNorm_.yaml
+++ b/problems/specs/KernelBench/level1/37_FrobeniusNorm_.yaml
@@ -13,3 +13,12 @@ ci:
       FEATURES: 4
       DIM1: 64
       DIM2: 64
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      FEATURES: 4
+      DIM1: 64
+      DIM2: 64
diff --git a/problems/specs/KernelBench/level1/38_L1Norm_.yaml b/problems/specs/KernelBench/level1/38_L1Norm_.yaml
index 0bbba8e..7f40ac7 100644
--- a/problems/specs/KernelBench/level1/38_L1Norm_.yaml
+++ b/problems/specs/KernelBench/level1/38_L1Norm_.yaml
@@ -11,3 +11,10 @@ ci:
     dims:
       BATCH_SIZE: 128
       DIM: 512
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 128
+      DIM: 512
diff --git a/problems/specs/KernelBench/level1/39_L2Norm_.yaml b/problems/specs/KernelBench/level1/39_L2Norm_.yaml
index 0bbba8e..7f40ac7 100644
--- a/problems/specs/KernelBench/level1/39_L2Norm_.yaml
+++ b/problems/specs/KernelBench/level1/39_L2Norm_.yaml
@@ -11,3 +11,10 @@ ci:
     dims:
       BATCH_SIZE: 128
       DIM: 512
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 128
+      DIM: 512
diff --git a/problems/specs/KernelBench/level1/3_Batched_matrix_multiplication.yaml b/problems/specs/KernelBench/level1/3_Batched_matrix_multiplication.yaml
index 14c26dd..5c11555 100644
--- a/problems/specs/KernelBench/level1/3_Batched_matrix_multiplication.yaml
+++ b/problems/specs/KernelBench/level1/3_Batched_matrix_multiplication.yaml
@@ -15,6 +15,15 @@ ci:
       N: 128
       K: 128
 
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      BATCH: 2
+      M: 64
+      N: 128
+      K: 128
+
 bench-gpu:
   - params: [A, B]
     dtype: float16
diff --git a/problems/specs/KernelBench/level1/40_LayerNorm.yaml b/problems/specs/KernelBench/level1/40_LayerNorm.yaml
index 2fbd50a..1b16337 100644
--- a/problems/specs/KernelBench/level1/40_LayerNorm.yaml
+++ b/problems/specs/KernelBench/level1/40_LayerNorm.yaml
@@ -15,3 +15,13 @@ ci:
       DIM1: 64
       DIM2: 64
       NORMALIZED_SHAPE: [4, 64, 64] # TODO: bind these to other dims
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      FEATURES: 4
+      DIM1: 64
+      DIM2: 64
+      NORMALIZED_SHAPE: [4, 64, 64] # TODO: bind these to other dims
diff --git a/problems/specs/KernelBench/level1/41_Max_Pooling_1D.yaml b/problems/specs/KernelBench/level1/41_Max_Pooling_1D.yaml
index 49ef0fe..34612d9 100644
--- a/problems/specs/KernelBench/level1/41_Max_Pooling_1D.yaml
+++ b/problems/specs/KernelBench/level1/41_Max_Pooling_1D.yaml
@@ -22,3 +22,16 @@ ci:
       PADDING: 1
       DILATION: 3
       RETURN_INDICES: false
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      FEATURES: 24
+      SEQUENCE_LENGTH: 64
+      KERNEL_SIZE: 2
+      STRIDE: 1
+      PADDING: 1
+      DILATION: 3
+      RETURN_INDICES: false
diff --git a/problems/specs/KernelBench/level1/42_Max_Pooling_2D.yaml b/problems/specs/KernelBench/level1/42_Max_Pooling_2D.yaml
index 95debcf..7f444a4 100644
--- a/problems/specs/KernelBench/level1/42_Max_Pooling_2D.yaml
+++ b/problems/specs/KernelBench/level1/42_Max_Pooling_2D.yaml
@@ -21,3 +21,16 @@ ci:
       STRIDE: 1
       PADDING: 1
       DILATION: 1
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      CHANNELS: 8
+      HEIGHT: 32
+      WIDTH: 32
+      KERNEL_SIZE: 2
+      STRIDE: 1
+      PADDING: 1
+      DILATION: 1
diff --git a/problems/specs/KernelBench/level1/43_Max_Pooling_3D.yaml b/problems/specs/KernelBench/level1/43_Max_Pooling_3D.yaml
index 2c34dfb..a50374c 100644
--- a/problems/specs/KernelBench/level1/43_Max_Pooling_3D.yaml
+++ b/problems/specs/KernelBench/level1/43_Max_Pooling_3D.yaml
@@ -1,6 +1,6 @@
 inputs:
   x:
-    shape: [BATCH_SIZE, FEATURES, DIM1, DIM2]
+    shape: [BATCH_SIZE, FEATURES, DIM1, DIM2, DIM3]
     dtype: inherit
 
 inits:
@@ -12,5 +12,16 @@ ci:
     dims:
       BATCH_SIZE: 2
       FEATURES: 4
-      DIM1: 64
-      DIM2: 64
+      DIM1: 16
+      DIM2: 16
+      DIM3: 16
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      FEATURES: 4
+      DIM1: 16
+      DIM2: 16
+      DIM3: 16
diff --git a/problems/specs/KernelBench/level1/44_Average_Pooling_1D.yaml b/problems/specs/KernelBench/level1/44_Average_Pooling_1D.yaml
index f4577e5..268a02e 100644
--- a/problems/specs/KernelBench/level1/44_Average_Pooling_1D.yaml
+++ b/problems/specs/KernelBench/level1/44_Average_Pooling_1D.yaml
@@ -18,3 +18,14 @@ ci:
       KERNEL_SIZE: 2
       STRIDE: 1
       PADDING: 1
+
+simple-cpu:
+  - params: [x]
+    dtype: bfloat16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 32
+      INPUT_LENGTH: 128
+      KERNEL_SIZE: 2
+      STRIDE: 1
+      PADDING: 1
diff --git a/problems/specs/KernelBench/level1/45_Average_Pooling_2D.yaml b/problems/specs/KernelBench/level1/45_Average_Pooling_2D.yaml
index 7ae2029..e63de37 100644
--- a/problems/specs/KernelBench/level1/45_Average_Pooling_2D.yaml
+++ b/problems/specs/KernelBench/level1/45_Average_Pooling_2D.yaml
@@ -15,3 +15,13 @@ ci:
       HEIGHT: 128
       WIDTH: 128
       KERNEL_SIZE: 3
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      CHANNELS: 16
+      HEIGHT: 128
+      WIDTH: 128
+      KERNEL_SIZE: 3
diff --git a/problems/specs/KernelBench/level1/46_Average_Pooling_3D.yaml b/problems/specs/KernelBench/level1/46_Average_Pooling_3D.yaml
index 748cf0b..c13a4ba 100644
--- a/problems/specs/KernelBench/level1/46_Average_Pooling_3D.yaml
+++ b/problems/specs/KernelBench/level1/46_Average_Pooling_3D.yaml
@@ -20,3 +20,16 @@ ci:
       KERNEL_SIZE: 3
       STRIDE: 2
       PADDING: 1
+
+simple-cpu:
+  - params: [x]
+    dtype: float32 # float32 is used because float16 fails with: "avg_pool3d_out_frame" not implemented for 'Half'
+    dims:
+      BATCH_SIZE: 2
+      CHANNELS: 8
+      DEPTH: 16
+      HEIGHT: 16
+      WIDTH: 32
+      KERNEL_SIZE: 3
+      STRIDE: 2
+      PADDING: 1
diff --git a/problems/specs/KernelBench/level1/47_Sum_reduction_over_a_dimension.yaml b/problems/specs/KernelBench/level1/47_Sum_reduction_over_a_dimension.yaml
index bf5fbda..34c285c 100644
--- a/problems/specs/KernelBench/level1/47_Sum_reduction_over_a_dimension.yaml
+++ b/problems/specs/KernelBench/level1/47_Sum_reduction_over_a_dimension.yaml
@@ -14,3 +14,12 @@ ci:
       DIM1: 64
       DIM2: 63
       REDUCE_DIM: 1
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      DIM1: 64
+      DIM2: 63
+      REDUCE_DIM: 1
diff --git a/problems/specs/KernelBench/level1/48_Mean_reduction_over_a_dimension.yaml b/problems/specs/KernelBench/level1/48_Mean_reduction_over_a_dimension.yaml
index bf5fbda..34c285c 100644
--- a/problems/specs/KernelBench/level1/48_Mean_reduction_over_a_dimension.yaml
+++ b/problems/specs/KernelBench/level1/48_Mean_reduction_over_a_dimension.yaml
@@ -14,3 +14,12 @@ ci:
       DIM1: 64
       DIM2: 63
       REDUCE_DIM: 1
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      DIM1: 64
+      DIM2: 63
+      REDUCE_DIM: 1
diff --git a/problems/specs/KernelBench/level1/49_Max_reduction_over_a_dimension.yaml b/problems/specs/KernelBench/level1/49_Max_reduction_over_a_dimension.yaml
index bf5fbda..34c285c 100644
--- a/problems/specs/KernelBench/level1/49_Max_reduction_over_a_dimension.yaml
+++ b/problems/specs/KernelBench/level1/49_Max_reduction_over_a_dimension.yaml
@@ -14,3 +14,12 @@ ci:
       DIM1: 64
       DIM2: 63
       REDUCE_DIM: 1
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      DIM1: 64
+      DIM2: 63
+      REDUCE_DIM: 1
diff --git a/problems/specs/KernelBench/level1/4_Matrix_vector_multiplication_.yaml b/problems/specs/KernelBench/level1/4_Matrix_vector_multiplication_.yaml
index 1536270..d3afc05 100644
--- a/problems/specs/KernelBench/level1/4_Matrix_vector_multiplication_.yaml
+++ b/problems/specs/KernelBench/level1/4_Matrix_vector_multiplication_.yaml
@@ -15,6 +15,15 @@ ci:
       K: 256
     flop: "2*M*N*K"
 
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      M: 64
+      N: 1
+      K: 256
+    flop: "2*M*N*K"
+
 bench-gpu:
   - params: [A, B]
     dtype: float16
diff --git a/problems/specs/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.yaml b/problems/specs/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.yaml
index 11d119c..33930dc 100644
--- a/problems/specs/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.yaml
+++ b/problems/specs/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.yaml
@@ -15,3 +15,13 @@ ci:
       IN_CHANNELS: 3
       HEIGHT: 64
       WIDTH: 64
+
+simple-cpu:
+  - params: [x]
+    dtype: bfloat16
+    dims:
+      BATCH_SIZE: 2
+      NUM_CLASSES: 10
+      IN_CHANNELS: 3
+      HEIGHT: 64
+      WIDTH: 64
diff --git a/problems/specs/KernelBench/level1/51_Argmax_over_a_dimension.yaml b/problems/specs/KernelBench/level1/51_Argmax_over_a_dimension.yaml
index 917cfb3..8b78339 100644
--- a/problems/specs/KernelBench/level1/51_Argmax_over_a_dimension.yaml
+++ b/problems/specs/KernelBench/level1/51_Argmax_over_a_dimension.yaml
@@ -14,3 +14,12 @@ ci:
       DIM1: 64
       DIM2: 63
       ARGMAX_DIM: 1
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      DIM1: 64
+      DIM2: 63
+      ARGMAX_DIM: 1
diff --git a/problems/specs/KernelBench/level1/52_Argmin_over_a_dimension.yaml b/problems/specs/KernelBench/level1/52_Argmin_over_a_dimension.yaml
index fe08c21..e20221d 100644
--- a/problems/specs/KernelBench/level1/52_Argmin_over_a_dimension.yaml
+++ b/problems/specs/KernelBench/level1/52_Argmin_over_a_dimension.yaml
@@ -14,3 +14,12 @@ ci:
       DIM1: 64
       DIM2: 63
       ARGMIN_DIM: 1
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      DIM1: 64
+      DIM2: 63
+      ARGMIN_DIM: 1
diff --git a/problems/specs/KernelBench/level1/53_Min_reduction_over_a_dimension.yaml b/problems/specs/KernelBench/level1/53_Min_reduction_over_a_dimension.yaml
index bf5fbda..34c285c 100644
--- a/problems/specs/KernelBench/level1/53_Min_reduction_over_a_dimension.yaml
+++ b/problems/specs/KernelBench/level1/53_Min_reduction_over_a_dimension.yaml
@@ -14,3 +14,12 @@ ci:
       DIM1: 64
       DIM2: 63
       REDUCE_DIM: 1
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      DIM1: 64
+      DIM2: 63
+      REDUCE_DIM: 1
diff --git a/problems/specs/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.yaml b/problems/specs/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.yaml
index bead1f0..4503c24 100644
--- a/problems/specs/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.yaml
+++ b/problems/specs/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.yaml
@@ -19,3 +19,15 @@ ci:
       DEPTH: 16
       WIDTH: 16
       HEIGHT: 16
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 3
+      OUT_CHANNELS: 16
+      KERNEL_SIZE: 3
+      DEPTH: 16
+      WIDTH: 16
+      HEIGHT: 16
diff --git a/problems/specs/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.yaml b/problems/specs/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.yaml
index 33793c6..febf771 100644
--- a/problems/specs/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.yaml
+++ b/problems/specs/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.yaml
@@ -18,3 +18,14 @@ ci:
       KERNEL_SIZE: 3
       HEIGHT: 32
       WIDTH: 64
+
+simple-cpu:
+  - params: [x]
+    dtype: bfloat16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 8
+      OUT_CHANNELS: 16
+      KERNEL_SIZE: 3
+      HEIGHT: 32
+      WIDTH: 64
diff --git a/problems/specs/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.yaml
index 16234e5..f057fd4 100644
--- a/problems/specs/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.yaml
+++ b/problems/specs/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.yaml
@@ -18,3 +18,15 @@ ci:
       KERNEL_SIZE: [5, 7]
       HEIGHT: 64
       WIDTH: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 8
+      OUT_CHANNELS: 16
+      KERNEL_SIZE: [5, 7]
+      HEIGHT: 64
+      WIDTH: 32
+    atol: 9.e-4
diff --git a/problems/specs/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.yaml b/problems/specs/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.yaml
index a1a2267..495f86f 100644
--- a/problems/specs/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.yaml
+++ b/problems/specs/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.yaml
@@ -18,3 +18,14 @@ ci:
       KERNEL_SIZE: 3
       HEIGHT: 64
       WIDTH: 64
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 16
+      OUT_CHANNELS: 16
+      KERNEL_SIZE: 3
+      HEIGHT: 64
+      WIDTH: 64
diff --git a/problems/specs/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.yaml
index f289f4d..500b1cf 100644
--- a/problems/specs/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.yaml
+++ b/problems/specs/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.yaml
@@ -19,3 +19,15 @@ ci:
       DEPTH_IN: 4
       HEIGHT_IN: 32
       WIDTH_IN: 64
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 4
+      OUT_CHANNELS: 8
+      KERNEL_SIZE: [3, 5, 7]
+      DEPTH_IN: 4
+      HEIGHT_IN: 32
+      WIDTH_IN: 64
diff --git a/problems/specs/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.yaml b/problems/specs/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.yaml
index 1aea06b..4d58a3e 100644
--- a/problems/specs/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.yaml
+++ b/problems/specs/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.yaml
@@ -19,3 +19,15 @@ ci:
       HEIGHT: 32
       WIDTH: 32
       DEPTH: 5
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 3
+      OUT_CHANNELS: 16
+      KERNEL_SIZE: 3
+      HEIGHT: 32
+      WIDTH: 32
+      DEPTH: 5
diff --git a/problems/specs/KernelBench/level1/5_Matrix_scalar_multiplication.yaml b/problems/specs/KernelBench/level1/5_Matrix_scalar_multiplication.yaml
index 782dc63..d2ecbbe 100644
--- a/problems/specs/KernelBench/level1/5_Matrix_scalar_multiplication.yaml
+++ b/problems/specs/KernelBench/level1/5_Matrix_scalar_multiplication.yaml
@@ -15,6 +15,15 @@ ci:
       UNIT: 1
     flop: "M*N"
 
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      M: 64
+      N: 32
+      UNIT: 1
+    flop: "M*N"
+
 bench-gpu:
   - params: [A, B]
     dtype: float16
diff --git a/problems/specs/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.yaml
index eb04ba7..2db656a 100644
--- a/problems/specs/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.yaml
+++ b/problems/specs/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.yaml
@@ -19,3 +19,15 @@ ci:
       WIDTH: 16
       HEIGHT: 16
       DEPTH: 16
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 3
+      OUT_CHANNELS: 8
+      KERNEL_SIZE: [3, 5, 7]
+      WIDTH: 16
+      HEIGHT: 16
+      DEPTH: 16
diff --git a/problems/specs/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.yaml b/problems/specs/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.yaml
index 9a032a4..351451e 100644
--- a/problems/specs/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.yaml
+++ b/problems/specs/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.yaml
@@ -19,3 +19,15 @@ ci:
       DEPTH: 16
       HEIGHT: 16
       WIDTH: 16
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 24
+      OUT_CHANNELS: 24
+      KERNEL_SIZE: 3
+      DEPTH: 16
+      HEIGHT: 16
+      WIDTH: 16
diff --git a/problems/specs/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.yaml
index 1b2a193..0b957f9 100644
--- a/problems/specs/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.yaml
+++ b/problems/specs/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.yaml
@@ -18,3 +18,14 @@ ci:
       KERNEL_SIZE: [5, 9]
       HEIGHT: 32
       WIDTH: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 4
+      OUT_CHANNELS: 8
+      KERNEL_SIZE: [5, 9]
+      HEIGHT: 32
+      WIDTH: 32
diff --git a/problems/specs/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.yaml b/problems/specs/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.yaml
index e91e4bc..34790ad 100644
--- a/problems/specs/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.yaml
+++ b/problems/specs/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.yaml
@@ -18,3 +18,14 @@ ci:
       KERNEL_SIZE: 3
       HEIGHT: 32
       WIDTH: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 4
+      OUT_CHANNELS: 16
+      KERNEL_SIZE: 3
+      HEIGHT: 32
+      WIDTH: 32
diff --git a/problems/specs/KernelBench/level1/64_conv_transposed_1D.yaml b/problems/specs/KernelBench/level1/64_conv_transposed_1D.yaml
index 577f23f..aea674d 100644
--- a/problems/specs/KernelBench/level1/64_conv_transposed_1D.yaml
+++ b/problems/specs/KernelBench/level1/64_conv_transposed_1D.yaml
@@ -17,3 +17,13 @@ ci:
       OUT_CHANNELS: 8
       KERNEL_SIZE: 3
       LENGTH: 128
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 8
+      OUT_CHANNELS: 8
+      KERNEL_SIZE: 3
+      LENGTH: 128
diff --git a/problems/specs/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.yaml
index aaed64b..e1941d6 100644
--- a/problems/specs/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.yaml
+++ b/problems/specs/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.yaml
@@ -18,3 +18,14 @@ ci:
       KERNEL_SIZE: [3, 7]
       HEIGHT: 32
       WIDTH: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 8
+      OUT_CHANNELS: 8
+      KERNEL_SIZE: [3, 7]
+      HEIGHT: 32
+      WIDTH: 32
diff --git a/problems/specs/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.yaml
index db69bd0..f2c5be3 100644
--- a/problems/specs/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.yaml
+++ b/problems/specs/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.yaml
@@ -19,3 +19,15 @@ ci:
       DEPTH: 4
       HEIGHT: 32
       WIDTH: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 3
+      OUT_CHANNELS: 8
+      KERNEL_SIZE: [3, 5, 7]
+      DEPTH: 4
+      HEIGHT: 32
+      WIDTH: 32
diff --git a/problems/specs/KernelBench/level1/67_conv_standard_1D.yaml b/problems/specs/KernelBench/level1/67_conv_standard_1D.yaml
index 4ad5147..c68f735 100644
--- a/problems/specs/KernelBench/level1/67_conv_standard_1D.yaml
+++ b/problems/specs/KernelBench/level1/67_conv_standard_1D.yaml
@@ -17,3 +17,13 @@ ci:
       OUT_CHANNELS: 16
       KERNEL_SIZE: 3
       LENGTH: 128
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 8
+      OUT_CHANNELS: 16
+      KERNEL_SIZE: 3
+      LENGTH: 128
diff --git a/problems/specs/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.yaml
index 3cc2804..06e0d22 100644
--- a/problems/specs/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.yaml
+++ b/problems/specs/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.yaml
@@ -19,3 +19,15 @@ ci:
       DEPTH: 32
       WIDTH: 32
       HEIGHT: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 2
+      OUT_CHANNELS: 4
+      KERNEL_SIZE: [3, 5, 5]
+      DEPTH: 32
+      WIDTH: 32
+      HEIGHT: 32
diff --git a/problems/specs/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.yaml
index 3ffa589..5b33b8b 100644
--- a/problems/specs/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.yaml
+++ b/problems/specs/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.yaml
@@ -18,3 +18,14 @@ ci:
       KERNEL_SIZE: [3, 5]
       HEIGHT_IN: 16
       WIDTH_IN: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 4
+      OUT_CHANNELS: 8
+      KERNEL_SIZE: [3, 5]
+      HEIGHT_IN: 16
+      WIDTH_IN: 32
diff --git a/problems/specs/KernelBench/level1/6_Matmul_with_large_K_dimension_.yaml b/problems/specs/KernelBench/level1/6_Matmul_with_large_K_dimension_.yaml
index 09435cc..73aa753 100644
--- a/problems/specs/KernelBench/level1/6_Matmul_with_large_K_dimension_.yaml
+++ b/problems/specs/KernelBench/level1/6_Matmul_with_large_K_dimension_.yaml
@@ -15,6 +15,15 @@ ci:
       K: 512
     flop: "2*M*N*K"
 
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      M: 8
+      N: 16
+      K: 512
+    flop: "2*M*N*K"
+
 bench-gpu:
   - params: [A, B]
     dtype: float16
diff --git a/problems/specs/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.yaml b/problems/specs/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.yaml
index a8920c9..0c210cb 100644
--- a/problems/specs/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.yaml
+++ b/problems/specs/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.yaml
@@ -19,3 +19,15 @@ ci:
       DEPTH: 24
       HEIGHT: 24
       WIDTH: 24
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 12
+      OUT_CHANNELS: 6
+      KERNEL_SIZE: 3
+      DEPTH: 24
+      HEIGHT: 24
+      WIDTH: 24
diff --git a/problems/specs/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.yaml b/problems/specs/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.yaml
index cef0e24..618cfda 100644
--- a/problems/specs/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.yaml
+++ b/problems/specs/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.yaml
@@ -18,3 +18,14 @@ ci:
       KERNEL_SIZE: 3
       HEIGHT_IN: 16
       WIDTH_IN: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 8
+      OUT_CHANNELS: 8
+      KERNEL_SIZE: 3
+      HEIGHT_IN: 16
+      WIDTH_IN: 32
diff --git a/problems/specs/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.yaml b/problems/specs/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.yaml
index 99d8322..9fca6f0 100644
--- a/problems/specs/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.yaml
+++ b/problems/specs/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.yaml
@@ -27,3 +27,19 @@ ci:
       DEPTH: 12
       HEIGHT: 24
       WIDTH: 48
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 4
+      OUT_CHANNELS: 4
+      KERNEL_SIZE: [3, 5, 7]
+      STRIDE: [2, 2, 2]
+      PADDING: [1, 2, 3]
+      OUTPUT_PADDING: [1, 1, 1]
+      GROUPS: 2
+      DEPTH: 12
+      HEIGHT: 24
+      WIDTH: 48
diff --git a/problems/specs/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.yaml b/problems/specs/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.yaml
index d6c9c1b..422ac63 100644
--- a/problems/specs/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.yaml
+++ b/problems/specs/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.yaml
@@ -25,3 +25,18 @@ ci:
       DEPTH: 8
       HEIGHT: 16
       WIDTH: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 4
+      OUT_CHANNELS: 4
+      KERNEL_SIZE: 3
+      STRIDE: 2
+      PADDING: 1
+      GROUPS: 4
+      DEPTH: 8
+      HEIGHT: 16
+      WIDTH: 32
diff --git a/problems/specs/KernelBench/level1/74_conv_transposed_1D_dilated.yaml b/problems/specs/KernelBench/level1/74_conv_transposed_1D_dilated.yaml
index e6812f2..1ea4a8d 100644
--- a/problems/specs/KernelBench/level1/74_conv_transposed_1D_dilated.yaml
+++ b/problems/specs/KernelBench/level1/74_conv_transposed_1D_dilated.yaml
@@ -23,3 +23,16 @@ ci:
       PADDING: 0
       DILATION: 3
       LENGTH: 128
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 4
+      OUT_CHANNELS: 8
+      KERNEL_SIZE: 5
+      STRIDE: 1
+      PADDING: 0
+      DILATION: 3
+      LENGTH: 128
diff --git a/problems/specs/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.yaml b/problems/specs/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.yaml
index 8240af2..29fe1f7 100644
--- a/problems/specs/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.yaml
+++ b/problems/specs/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.yaml
@@ -26,3 +26,18 @@ ci:
       GROUPS: 2
       HEIGHT: 16
       WIDTH: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 4
+      OUT_CHANNELS: 8
+      KERNEL_SIZE: [3, 5]
+      STRIDE: [2, 3]
+      PADDING: [1, 2]
+      DILATION: [2, 1]
+      GROUPS: 2
+      HEIGHT: 16
+      WIDTH: 32
diff --git a/problems/specs/KernelBench/level1/76_conv_standard_1D_dilated_strided__.yaml b/problems/specs/KernelBench/level1/76_conv_standard_1D_dilated_strided__.yaml
index 8b5f74f..a012eed 100644
--- a/problems/specs/KernelBench/level1/76_conv_standard_1D_dilated_strided__.yaml
+++ b/problems/specs/KernelBench/level1/76_conv_standard_1D_dilated_strided__.yaml
@@ -21,3 +21,15 @@ ci:
       STRIDE: 3
       DILATION: 4
       LENGTH: 128
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 4
+      OUT_CHANNELS: 8
+      KERNEL_SIZE: 3
+      STRIDE: 3
+      DILATION: 4
+      LENGTH: 128
diff --git a/problems/specs/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.yaml b/problems/specs/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.yaml
index 4be5a3a..1be100e 100644
--- a/problems/specs/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.yaml
+++ b/problems/specs/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.yaml
@@ -25,3 +25,18 @@ ci:
       DEPTH: 8
       HEIGHT: 16
       WIDTH: 16
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 4
+      OUT_CHANNELS: 8
+      KERNEL_SIZE: 3
+      STRIDE: 2
+      PADDING: 1
+      DILATION: 2
+      DEPTH: 8
+      HEIGHT: 16
+      WIDTH: 16
diff --git a/problems/specs/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.yaml b/problems/specs/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.yaml
index d909e97..fae17dd 100644
--- a/problems/specs/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.yaml
+++ b/problems/specs/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.yaml
@@ -22,3 +22,16 @@ ci:
       PADDING: [1, 3]
       HEIGHT: 16
       WIDTH: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 8
+      OUT_CHANNELS: 8
+      KERNEL_SIZE: [3, 7]
+      STRIDE: [1, 1]
+      PADDING: [1, 3]
+      HEIGHT: 16
+      WIDTH: 32
diff --git a/problems/specs/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.yaml b/problems/specs/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.yaml
index 45f21bd..c12e5c1 100644
--- a/problems/specs/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.yaml
+++ b/problems/specs/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.yaml
@@ -23,3 +23,16 @@ ci:
       PADDING: 1
       DILATION: 2
       LENGTH: 128
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 4
+      OUT_CHANNELS: 8
+      KERNEL_SIZE: 3
+      STRIDE: 2
+      PADDING: 1
+      DILATION: 2
+      LENGTH: 128
diff --git a/problems/specs/KernelBench/level1/7_Matmul_with_small_K_dimension_.yaml b/problems/specs/KernelBench/level1/7_Matmul_with_small_K_dimension_.yaml
index 2c55dd2..05b84d0 100644
--- a/problems/specs/KernelBench/level1/7_Matmul_with_small_K_dimension_.yaml
+++ b/problems/specs/KernelBench/level1/7_Matmul_with_small_K_dimension_.yaml
@@ -15,6 +15,15 @@ ci:
       K: 16
     flop: "2*M*N*K"
 
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      M: 256
+      N: 256
+      K: 16
+    flop: "2*M*N*K"
+
 bench-gpu:
   - params: [A, B]
     dtype: float16
diff --git a/problems/specs/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.yaml b/problems/specs/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.yaml
index f431aeb..f5ace12 100644
--- a/problems/specs/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.yaml
+++ b/problems/specs/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.yaml
@@ -24,3 +24,17 @@ ci:
       DILATION: [2, 3]
       HEIGHT: 32
       WIDTH: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 4
+      OUT_CHANNELS: 32
+      KERNEL_SIZE: [5, 9]
+      STRIDE: 1
+      PADDING: [2, 4]
+      DILATION: [2, 3]
+      HEIGHT: 32
+      WIDTH: 32
diff --git a/problems/specs/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.yaml b/problems/specs/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.yaml
index 5b811a1..36d01d2 100644
--- a/problems/specs/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.yaml
+++ b/problems/specs/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.yaml
@@ -24,3 +24,17 @@ ci:
       DILATION: 2
       HEIGHT_IN: 16
       WIDTH_IN: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float16
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 4
+      OUT_CHANNELS: 8
+      KERNEL_SIZE: 3
+      STRIDE: 5
+      PADDING: 1
+      DILATION: 2
+      HEIGHT_IN: 16
+      WIDTH_IN: 32
diff --git a/problems/specs/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.yaml b/problems/specs/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.yaml
index eef3c22..6c50a10 100644
--- a/problems/specs/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.yaml
+++ b/problems/specs/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.yaml
@@ -20,3 +20,15 @@ ci:
       PADDING: 0
       HEIGHT: 32
       WIDTH: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 16
+      KERNEL_SIZE: 3
+      STRIDE: 1
+      PADDING: 0
+      HEIGHT: 32
+      WIDTH: 32
diff --git a/problems/specs/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.yaml
index 0bd69f3..ef40d11 100644
--- a/problems/specs/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.yaml
+++ b/problems/specs/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.yaml
@@ -22,3 +22,16 @@ ci:
       DILATION: 1
       HEIGHT: 32
       WIDTH: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 8
+      KERNEL_SIZE: 3
+      STRIDE: 1
+      PADDING: 0
+      DILATION: 1
+      HEIGHT: 32
+      WIDTH: 32
diff --git a/problems/specs/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.yaml b/problems/specs/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.yaml
index e07b710..e7e9990 100644
--- a/problems/specs/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.yaml
+++ b/problems/specs/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.yaml
@@ -22,3 +22,16 @@ ci:
       PADDING: 0
       HEIGHT_IN: 16
       WIDTH_IN: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 8
+      OUT_CHANNELS: 8
+      KERNEL_SIZE: 3
+      STRIDE: 1
+      PADDING: 0
+      HEIGHT_IN: 16
+      WIDTH_IN: 32
diff --git a/problems/specs/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.yaml
index 8c96917..8a3ea09 100644
--- a/problems/specs/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.yaml
+++ b/problems/specs/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.yaml
@@ -34,3 +34,22 @@ ci:
       GROUPS: 16
       HEIGHT: 16
       WIDTH: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 4
+      OUT_CHANNELS: 4
+      KERNEL_SIZE_H: 3
+      KERNEL_SIZE_W: 7
+      STRIDE_H: 1
+      STRIDE_W: 1
+      PADDING_H: 0
+      PADDING_W: 0
+      DILATION_H: 1
+      DILATION_W: 1
+      GROUPS: 16
+      HEIGHT: 16
+      WIDTH: 32
diff --git a/problems/specs/KernelBench/level1/86_conv_depthwise_separable_2D.yaml b/problems/specs/KernelBench/level1/86_conv_depthwise_separable_2D.yaml
index dd6bd97..08e7f67 100644
--- a/problems/specs/KernelBench/level1/86_conv_depthwise_separable_2D.yaml
+++ b/problems/specs/KernelBench/level1/86_conv_depthwise_separable_2D.yaml
@@ -25,3 +25,18 @@ ci:
       DILATION: 1
       HEIGHT: 32
       WIDTH: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    memory_format: channels_last
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 4
+      OUT_CHANNELS: 8
+      KERNEL_SIZE: 3
+      STRIDE: 1
+      PADDING: 1
+      DILATION: 1
+      HEIGHT: 32
+      WIDTH: 32
diff --git a/problems/specs/KernelBench/level1/87_conv_pointwise_2D.yaml b/problems/specs/KernelBench/level1/87_conv_pointwise_2D.yaml
index 9b2a376..6429cde 100644
--- a/problems/specs/KernelBench/level1/87_conv_pointwise_2D.yaml
+++ b/problems/specs/KernelBench/level1/87_conv_pointwise_2D.yaml
@@ -16,3 +16,13 @@ ci:
       OUT_CHANNELS: 8
       HEIGHT: 32
       WIDTH: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2
+      IN_CHANNELS: 4
+      OUT_CHANNELS: 8
+      HEIGHT: 32
+      WIDTH: 32
diff --git a/problems/specs/KernelBench/level1/88_MinGPTNewGelu.yaml b/problems/specs/KernelBench/level1/88_MinGPTNewGelu.yaml
index 273d44b..287029f 100644
--- a/problems/specs/KernelBench/level1/88_MinGPTNewGelu.yaml
+++ b/problems/specs/KernelBench/level1/88_MinGPTNewGelu.yaml
@@ -11,3 +11,10 @@ ci:
     dims:
       BATCH_SIZE: 32
       DIM: 32
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 32
+      DIM: 32
diff --git a/problems/specs/KernelBench/level1/89_cumsum.yaml b/problems/specs/KernelBench/level1/89_cumsum.yaml
index 82de8c7..d026c98 100644
--- a/problems/specs/KernelBench/level1/89_cumsum.yaml
+++ b/problems/specs/KernelBench/level1/89_cumsum.yaml
@@ -13,3 +13,11 @@ ci:
       BATCH_SIZE: 64
       INPUT_DIM: 64
       SCAN_DIM: 1
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 64
+      INPUT_DIM: 64
+      SCAN_DIM: 1
diff --git a/problems/specs/KernelBench/level1/8_Matmul_with_irregular_shapes_.yaml b/problems/specs/KernelBench/level1/8_Matmul_with_irregular_shapes_.yaml
index 4f0553d..1068de4 100644
--- a/problems/specs/KernelBench/level1/8_Matmul_with_irregular_shapes_.yaml
+++ b/problems/specs/KernelBench/level1/8_Matmul_with_irregular_shapes_.yaml
@@ -15,6 +15,15 @@ ci:
       K: 19
     flop: "2*M*N*K"
 
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      M: 60
+      N: 37
+      K: 19
+    flop: "2*M*N*K"
+
 bench-gpu:
   - params: [A, B]
     dtype: float16
diff --git a/problems/specs/KernelBench/level1/90_cumprod.yaml b/problems/specs/KernelBench/level1/90_cumprod.yaml
index 82de8c7..d026c98 100644
--- a/problems/specs/KernelBench/level1/90_cumprod.yaml
+++ b/problems/specs/KernelBench/level1/90_cumprod.yaml
@@ -13,3 +13,11 @@ ci:
       BATCH_SIZE: 64
       INPUT_DIM: 64
       SCAN_DIM: 1
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 64
+      INPUT_DIM: 64
+      SCAN_DIM: 1
diff --git a/problems/specs/KernelBench/level1/91_cumsum_reverse.yaml b/problems/specs/KernelBench/level1/91_cumsum_reverse.yaml
index 82de8c7..d026c98 100644
--- a/problems/specs/KernelBench/level1/91_cumsum_reverse.yaml
+++ b/problems/specs/KernelBench/level1/91_cumsum_reverse.yaml
@@ -13,3 +13,11 @@ ci:
       BATCH_SIZE: 64
       INPUT_DIM: 64
       SCAN_DIM: 1
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 64
+      INPUT_DIM: 64
+      SCAN_DIM: 1
diff --git a/problems/specs/KernelBench/level1/92_cumsum_exclusive.yaml b/problems/specs/KernelBench/level1/92_cumsum_exclusive.yaml
index 82de8c7..d026c98 100644
--- a/problems/specs/KernelBench/level1/92_cumsum_exclusive.yaml
+++ b/problems/specs/KernelBench/level1/92_cumsum_exclusive.yaml
@@ -13,3 +13,11 @@ ci:
       BATCH_SIZE: 64
       INPUT_DIM: 64
       SCAN_DIM: 1
+
+simple-cpu:
+  - params: [x]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 64
+      INPUT_DIM: 64
+      SCAN_DIM: 1
diff --git a/problems/specs/KernelBench/level1/93_masked_cumsum.yaml b/problems/specs/KernelBench/level1/93_masked_cumsum.yaml
index d296d36..e9b0dc5 100644
--- a/problems/specs/KernelBench/level1/93_masked_cumsum.yaml
+++ b/problems/specs/KernelBench/level1/93_masked_cumsum.yaml
@@ -17,6 +17,14 @@ ci:
       INPUT_DIM: 64
       SCAN_DIM: 1
 
+simple-cpu:
+  - params: [x, mask]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 64
+      INPUT_DIM: 64
+      SCAN_DIM: 1
+
 bench-gpu:
   - params: [x, mask]
     dtype: float16
diff --git a/problems/specs/KernelBench/level1/94_MSELoss.yaml b/problems/specs/KernelBench/level1/94_MSELoss.yaml
index d6893b3..59ba064 100644
--- a/problems/specs/KernelBench/level1/94_MSELoss.yaml
+++ b/problems/specs/KernelBench/level1/94_MSELoss.yaml
@@ -15,3 +15,10 @@ ci:
     dims:
       BATCH_SIZE: 64
       INPUT_DIM: 64
+
+simple-cpu:
+  - params: [predictions, targets]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 64
+      INPUT_DIM: 64
diff --git a/problems/specs/KernelBench/level1/95_CrossEntropyLoss.yaml b/problems/specs/KernelBench/level1/95_CrossEntropyLoss.yaml
index 1925d32..acc89da 100644
--- a/problems/specs/KernelBench/level1/95_CrossEntropyLoss.yaml
+++ b/problems/specs/KernelBench/level1/95_CrossEntropyLoss.yaml
@@ -15,3 +15,10 @@ ci:
     dims:
       BATCH_SIZE: 64
       NUM_CLASSES: 8
+
+simple-cpu:
+  - params: [predictions, targets]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 64
+      NUM_CLASSES: 8
diff --git a/problems/specs/KernelBench/level1/96_HuberLoss.yaml b/problems/specs/KernelBench/level1/96_HuberLoss.yaml
index d6893b3..59ba064 100644
--- a/problems/specs/KernelBench/level1/96_HuberLoss.yaml
+++ b/problems/specs/KernelBench/level1/96_HuberLoss.yaml
@@ -15,3 +15,10 @@ ci:
     dims:
       BATCH_SIZE: 64
       INPUT_DIM: 64
+
+simple-cpu:
+  - params: [predictions, targets]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 64
+      INPUT_DIM: 64
diff --git a/problems/specs/KernelBench/level1/97_ScaledDotProductAttention.yaml b/problems/specs/KernelBench/level1/97_ScaledDotProductAttention.yaml
index 701f51b..f0255ec 100644
--- a/problems/specs/KernelBench/level1/97_ScaledDotProductAttention.yaml
+++ b/problems/specs/KernelBench/level1/97_ScaledDotProductAttention.yaml
@@ -19,3 +19,12 @@ ci:
       NUM_HEADS: 8
       SEQUENCE_LENGTH: 16
       EMBEDDING_DIMENSION: 32
+
+simple-cpu:
+  - params: [Q, K, V]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 32
+      NUM_HEADS: 32
+      SEQUENCE_LENGTH: 64
+      EMBEDDING_DIMENSION: 128
diff --git a/problems/specs/KernelBench/level1/98_KLDivLoss.yaml b/problems/specs/KernelBench/level1/98_KLDivLoss.yaml
index 63efbdc..72bb8e2 100644
--- a/problems/specs/KernelBench/level1/98_KLDivLoss.yaml
+++ b/problems/specs/KernelBench/level1/98_KLDivLoss.yaml
@@ -16,3 +16,10 @@ ci:
     dims:
       BATCH_SIZE: 64
       INPUT_DIM: 64
+
+simple-cpu:
+  - params: [predictions, targets]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 64
+      INPUT_DIM: 64
diff --git a/problems/specs/KernelBench/level1/99_TripletMarginLoss.yaml b/problems/specs/KernelBench/level1/99_TripletMarginLoss.yaml
index 114af17..0ab92c1 100644
--- a/problems/specs/KernelBench/level1/99_TripletMarginLoss.yaml
+++ b/problems/specs/KernelBench/level1/99_TripletMarginLoss.yaml
@@ -20,3 +20,11 @@ ci:
       BATCH_SIZE: 64
       INPUT_DIM: 8
       MARGIN: 1.0
+
+simple-cpu:
+  - params: [anchor, positive, negative]
+    dtype: float32
+    dims:
+      BATCH_SIZE: 2048
+      INPUT_DIM: 512
+      MARGIN: 1.0
diff --git a/problems/specs/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.yaml b/problems/specs/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.yaml
index 94ced01..9a25d89 100644
--- a/problems/specs/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.yaml
+++ b/problems/specs/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.yaml
@@ -14,6 +14,14 @@ ci:
       N: 16
     flop: "2*M*M*N"
 
+simple-cpu:
+  - params: [A, B]
+    dtype: bfloat16
+    dims:
+      M: 256
+      N: 16
+    flop: "2*M*M*N"
+
 bench-gpu:
   - params: [A, B]
     dtype: float16
diff --git a/pyproject.toml b/pyproject.toml
index d0c35b1..ba285e0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -115,7 +115,7 @@ override-dependencies = [
 torch = { index = "pytorch" }
 pytorch-triton-xpu = { index = "pytorch" }
 pytorch-triton = { index = "pytorch" }
-triton = { git = "https://github.com/triton-lang/triton-cpu.git", rev = "270e696" }
+triton = { git = "https://github.com/triton-lang/triton-cpu.git", rev = "eece2e9" }
 lighthouse = { git = "https://github.com/llvm/lighthouse", rev = "456475d" }
 mlir-python-bindings = { index = "eudsl" }
 

From cf5193f1d6ee04651c38f73e7665be5fa11d9712 Mon Sep 17 00:00:00 2001
From: Julian Oppermann <julian.oppermann@intel.com>
Date: Wed, 27 May 2026 08:53:57 -0700
Subject: [PATCH 2/3] Make blocks smaller, inputs larger, to ensure kernels
 launch as multiple programs

---
 backends/triton/cpu/KernelBench/level1/100_HingeLoss.py  | 4 +++-
 .../level1/10_3D_tensor_matrix_multiplication.py         | 4 +---
 .../level1/11_4D_tensor_matrix_multiplication.py         | 4 +---
 .../level1/12_Matmul_with_diagonal_matrices_.py          | 4 +++-
 .../level1/13_Matmul_for_symmetric_matrices.py           | 4 +---
 .../level1/14_Matmul_for_upper_triangular_matrices.py    | 4 +---
 .../level1/15_Matmul_for_lower_triangular_matrices.py    | 4 +---
 .../KernelBench/level1/16_Matmul_with_transposed_A.py    | 4 +---
 .../KernelBench/level1/17_Matmul_with_transposed_B.py    | 4 +---
 .../KernelBench/level1/18_Matmul_with_transposed_both.py | 4 +---
 backends/triton/cpu/KernelBench/level1/19_ReLU.py        | 2 +-
 .../level1/1_Square_matrix_multiplication_.py            | 4 +---
 backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py   | 4 +++-
 backends/triton/cpu/KernelBench/level1/21_Sigmoid.py     | 4 +++-
 backends/triton/cpu/KernelBench/level1/22_Tanh.py        | 4 +++-
 backends/triton/cpu/KernelBench/level1/23_Softmax.py     | 4 +++-
 backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py  | 4 +++-
 backends/triton/cpu/KernelBench/level1/25_Swish.py       | 4 +++-
 backends/triton/cpu/KernelBench/level1/26_GELU_.py       | 2 +-
 backends/triton/cpu/KernelBench/level1/27_SELU_.py       | 4 +++-
 backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py | 4 +++-
 backends/triton/cpu/KernelBench/level1/29_Softplus.py    | 4 +++-
 .../level1/2_Standard_matrix_multiplication_.py          | 4 +---
 backends/triton/cpu/KernelBench/level1/30_Softsign.py    | 4 +++-
 backends/triton/cpu/KernelBench/level1/31_ELU.py         | 4 +++-
 backends/triton/cpu/KernelBench/level1/32_HardTanh.py    | 4 +++-
 backends/triton/cpu/KernelBench/level1/33_BatchNorm.py   | 4 +---
 .../triton/cpu/KernelBench/level1/34_InstanceNorm.py     | 4 +++-
 backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py  | 8 ++++++--
 backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py    | 4 +++-
 .../triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py   | 6 ++++--
 backends/triton/cpu/KernelBench/level1/38_L1Norm_.py     | 4 +++-
 backends/triton/cpu/KernelBench/level1/39_L2Norm_.py     | 4 +++-
 .../level1/3_Batched_matrix_multiplication.py            | 4 +---
 backends/triton/cpu/KernelBench/level1/40_LayerNorm.py   | 2 +-
 .../triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py   | 4 +++-
 .../triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py   | 4 +++-
 .../triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py   | 4 +++-
 .../cpu/KernelBench/level1/44_Average_Pooling_1D.py      | 4 +++-
 .../cpu/KernelBench/level1/45_Average_Pooling_2D.py      | 4 +++-
 .../cpu/KernelBench/level1/46_Average_Pooling_3D.py      | 4 +++-
 .../level1/47_Sum_reduction_over_a_dimension.py          | 4 +++-
 .../level1/48_Mean_reduction_over_a_dimension.py         | 4 +++-
 .../level1/49_Max_reduction_over_a_dimension.py          | 4 +++-
 .../level1/4_Matrix_vector_multiplication_.py            | 4 +++-
 .../50_conv_standard_2D__square_input__square_kernel.py  | 4 +---
 .../cpu/KernelBench/level1/51_Argmax_over_a_dimension.py | 4 +++-
 .../cpu/KernelBench/level1/52_Argmin_over_a_dimension.py | 2 +-
 .../level1/53_Min_reduction_over_a_dimension.py          | 4 +---
 .../54_conv_standard_3D__square_input__square_kernel.py  | 6 ++----
 ..._conv_standard_2D__asymmetric_input__square_kernel.py | 4 +---
 ...v_standard_2D__asymmetric_input__asymmetric_kernel.py | 4 +---
 ...57_conv_transposed_2D__square_input__square_kernel.py | 6 ++----
 ...transposed_3D__asymmetric_input__asymmetric_kernel.py | 4 +++-
 ..._conv_standard_3D__asymmetric_input__square_kernel.py | 6 ++----
 .../KernelBench/level1/5_Matrix_scalar_multiplication.py | 4 +++-
 ..._conv_standard_3D__square_input__asymmetric_kernel.py | 6 ++----
 ...61_conv_transposed_3D__square_input__square_kernel.py | 4 +---
 ..._conv_standard_2D__square_input__asymmetric_kernel.py | 4 +---
 .../63_conv_standard_2D__square_input__square_kernel.py  | 4 +---
 .../cpu/KernelBench/level1/64_conv_transposed_1D.py      | 4 +---
 ...onv_transposed_2D__square_input__asymmetric_kernel.py | 6 ++----
 ...v_standard_3D__asymmetric_input__asymmetric_kernel.py | 4 +---
 .../triton/cpu/KernelBench/level1/67_conv_standard_1D.py | 4 +++-
 ...onv_transposed_3D__square_input__asymmetric_kernel.py | 4 +++-
 ...transposed_2D__asymmetric_input__asymmetric_kernel.py | 6 ++----
 .../level1/6_Matmul_with_large_K_dimension_.py           | 4 +---
 ...onv_transposed_3D__asymmetric_input__square_kernel.py | 4 +++-
 ...onv_transposed_2D__asymmetric_input__square_kernel.py | 2 --
 ..._input_asymmetric_kernel___strided_padded_grouped_.py | 4 +++-
 ...etric_input_square_kernel__strided_padded__grouped.py | 2 +-
 .../KernelBench/level1/74_conv_transposed_1D_dilated.py  | 4 +---
 ...ric_kernel_strided__grouped____padded____dilated__.py | 2 +-
 .../level1/76_conv_standard_1D_dilated_strided__.py      | 4 +++-
 ...put_square_kernel___padded____dilated____strided__.py | 6 ++++--
 ...d_2D_asymmetric_input_asymmetric_kernel___padded__.py | 2 --
 ...put_square_kernel___padded____strided____dilated__.py | 4 +++-
 .../level1/7_Matmul_with_small_K_dimension_.py           | 4 +---
 ...uare_input_asymmetric_kernel___dilated____padded__.py | 6 ++----
 ...put_square_kernel___dilated____padded____strided__.py | 6 ++++--
 .../82_conv_depthwise_2D_square_input_square_kernel.py   | 4 +++-
 ...3_conv_depthwise_2D_square_input_asymmetric_kernel.py | 4 +++-
 ...4_conv_depthwise_2D_asymmetric_input_square_kernel.py | 4 +++-
 ...nv_depthwise_2D_asymmetric_input_asymmetric_kernel.py | 4 +++-
 .../KernelBench/level1/86_conv_depthwise_separable_2D.py | 4 +---
 .../cpu/KernelBench/level1/87_conv_pointwise_2D.py       | 8 +++-----
 .../triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py    | 4 +++-
 backends/triton/cpu/KernelBench/level1/89_cumsum.py      | 4 +++-
 .../level1/8_Matmul_with_irregular_shapes_.py            | 4 +---
 backends/triton/cpu/KernelBench/level1/90_cumprod.py     | 2 +-
 .../triton/cpu/KernelBench/level1/91_cumsum_reverse.py   | 2 +-
 .../triton/cpu/KernelBench/level1/92_cumsum_exclusive.py | 4 +++-
 .../triton/cpu/KernelBench/level1/93_masked_cumsum.py    | 4 +++-
 backends/triton/cpu/KernelBench/level1/94_MSELoss.py     | 4 +---
 .../triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py | 9 +--------
 backends/triton/cpu/KernelBench/level1/96_HuberLoss.py   | 2 +-
 .../KernelBench/level1/97_ScaledDotProductAttention.py   | 4 ++--
 backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py   | 4 +---
 .../cpu/KernelBench/level1/99_TripletMarginLoss.py       | 4 +++-
 .../level1/9_Tall_skinny_matrix_multiplication_.py       | 4 +---
 .../level1/10_3D_tensor_matrix_multiplication.yaml       | 8 ++++----
 .../level1/11_4D_tensor_matrix_multiplication.yaml       | 6 +++---
 .../level1/13_Matmul_for_symmetric_matrices.yaml         | 2 +-
 .../level1/14_Matmul_for_upper_triangular_matrices.yaml  | 2 +-
 .../level1/15_Matmul_for_lower_triangular_matrices.yaml  | 2 +-
 .../level1/1_Square_matrix_multiplication_.yaml          | 2 +-
 .../level1/2_Standard_matrix_multiplication_.yaml        | 6 +++---
 .../level1/47_Sum_reduction_over_a_dimension.yaml        | 4 ++--
 .../level1/48_Mean_reduction_over_a_dimension.yaml       | 4 ++--
 .../level1/4_Matrix_vector_multiplication_.yaml          | 4 ++--
 .../KernelBench/level1/51_Argmax_over_a_dimension.yaml   | 4 ++--
 .../KernelBench/level1/52_Argmin_over_a_dimension.yaml   | 4 ++--
 .../level1/53_Min_reduction_over_a_dimension.yaml        | 4 ++--
 ...54_conv_standard_3D__square_input__square_kernel.yaml | 2 +-
 ...onv_standard_3D__asymmetric_input__square_kernel.yaml | 2 +-
 .../level1/5_Matrix_scalar_multiplication.yaml           | 4 ++--
 .../specs/KernelBench/level1/67_conv_standard_1D.yaml    | 2 +-
 .../level1/6_Matmul_with_large_K_dimension_.yaml         | 4 ++--
 .../level1/7_Matmul_with_small_K_dimension_.yaml         | 6 +++---
 .../specs/KernelBench/level1/87_conv_pointwise_2D.yaml   | 4 ++--
 .../level1/8_Matmul_with_irregular_shapes_.yaml          | 6 +++---
 .../level1/9_Tall_skinny_matrix_multiplication_.yaml     | 4 ++--
 122 files changed, 257 insertions(+), 240 deletions(-)

diff --git a/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py b/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py
index 1903395..a7c936c 100644
--- a/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py
+++ b/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["D"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py b/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py
index b6629b8..a94792e 100644
--- a/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py
+++ b/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py
@@ -13,9 +13,7 @@
 def _configs():
     return [
         triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-            num_warps=32,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
         ),
     ]
 
diff --git a/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py b/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py
index 10f3601..99f96b2 100644
--- a/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py
+++ b/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py
@@ -32,9 +32,7 @@ def swizzle_tile(
 def get_autotune_configs():
     return [
         triton.Config(
-            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64, "GROUP_SIZE_M": 4},
-            num_warps=32,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
         ),
     ]
 
diff --git a/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py b/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py
index 499ab2a..7a1a6c0 100644
--- a/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py
+++ b/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_M": 64, "BLOCK_N": 128}, num_warps=32, num_stages=2),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 32},
+        ),
     ],
     key=["N", "M"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py b/backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py
index 13dc6df..c47d878 100644
--- a/backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py
+++ b/backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py
@@ -32,9 +32,7 @@ def swizzle_tile(
 def _configs():
     return [
         triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-            num_warps=32,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
         ),
     ]
 
diff --git a/backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py b/backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py
index 184f008..3e05bc4 100644
--- a/backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py
+++ b/backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py
@@ -34,9 +34,7 @@ def swizzle_tile(
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-            num_warps=32,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
         ),
     ],
     key=["M", "N", "K"],
diff --git a/backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py b/backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py
index 8b755b7..9dde830 100644
--- a/backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py
+++ b/backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py
@@ -32,9 +32,7 @@ def swizzle_tile(
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-            num_warps=32,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
         ),
     ],
     key=["M"],
diff --git a/backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py b/backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py
index 1df1ce2..348ff28 100644
--- a/backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py
+++ b/backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py
@@ -13,9 +13,7 @@
 def get_autotune_configs():
     return [
         triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-            num_warps=32,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
         ),
     ]
 
diff --git a/backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py b/backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py
index c7b9f5f..f88ca8d 100644
--- a/backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py
+++ b/backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py
@@ -13,9 +13,7 @@
 def get_autotune_configs():
     return [
         triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-            num_warps=32,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
         ),
     ]
 
diff --git a/backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py b/backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py
index 54c94c6..b4c40df 100644
--- a/backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py
+++ b/backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py
@@ -13,9 +13,7 @@
 def _configs():
     return [
         triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-            num_warps=32,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
         ),
     ]
 
diff --git a/backends/triton/cpu/KernelBench/level1/19_ReLU.py b/backends/triton/cpu/KernelBench/level1/19_ReLU.py
index 1eed9d7..b7be751 100644
--- a/backends/triton/cpu/KernelBench/level1/19_ReLU.py
+++ b/backends/triton/cpu/KernelBench/level1/19_ReLU.py
@@ -13,7 +13,7 @@
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_SIZE": 4096, "NUM_PROGRAMS": 256}, num_warps=8, num_stages=3
+            {"BLOCK_SIZE": 32, "NUM_PROGRAMS": 64},
         ),
     ],
     key=["n_elements"],
diff --git a/backends/triton/cpu/KernelBench/level1/1_Square_matrix_multiplication_.py b/backends/triton/cpu/KernelBench/level1/1_Square_matrix_multiplication_.py
index 09c940e..80f240d 100644
--- a/backends/triton/cpu/KernelBench/level1/1_Square_matrix_multiplication_.py
+++ b/backends/triton/cpu/KernelBench/level1/1_Square_matrix_multiplication_.py
@@ -13,9 +13,7 @@
 def _configs():
     return [
         triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-            num_warps=32,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
         ),
     ]
 
diff --git a/backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py b/backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py
index 8002e4f..12f9a28 100644
--- a/backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py
+++ b/backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["n_elements"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/21_Sigmoid.py b/backends/triton/cpu/KernelBench/level1/21_Sigmoid.py
index 7240e65..a6b8f44 100644
--- a/backends/triton/cpu/KernelBench/level1/21_Sigmoid.py
+++ b/backends/triton/cpu/KernelBench/level1/21_Sigmoid.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["N"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/22_Tanh.py b/backends/triton/cpu/KernelBench/level1/22_Tanh.py
index f5f6074..e261dfc 100644
--- a/backends/triton/cpu/KernelBench/level1/22_Tanh.py
+++ b/backends/triton/cpu/KernelBench/level1/22_Tanh.py
@@ -15,7 +15,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["n_elements"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/23_Softmax.py b/backends/triton/cpu/KernelBench/level1/23_Softmax.py
index 5e7ba1b..c813a14 100644
--- a/backends/triton/cpu/KernelBench/level1/23_Softmax.py
+++ b/backends/triton/cpu/KernelBench/level1/23_Softmax.py
@@ -12,7 +12,9 @@
 
 def _softmax_configs():
     return [
-        triton.Config({"BLOCK_N": 2048}, num_warps=8, num_stages=3),
+        triton.Config(
+            {"BLOCK_N": 32},
+        ),
     ]
 
 
diff --git a/backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py b/backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py
index c5e5f35..e314810 100644
--- a/backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py
+++ b/backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_N": 2048, "warp_size": 32}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_N": 32, "warp_size": 32},
+        ),
     ],
     key=["N"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/25_Swish.py b/backends/triton/cpu/KernelBench/level1/25_Swish.py
index 0a682dd..14ea27a 100644
--- a/backends/triton/cpu/KernelBench/level1/25_Swish.py
+++ b/backends/triton/cpu/KernelBench/level1/25_Swish.py
@@ -19,7 +19,9 @@ def _sigmoid_exp2(x):
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["n_elements"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/26_GELU_.py b/backends/triton/cpu/KernelBench/level1/26_GELU_.py
index 0d52473..777e424 100644
--- a/backends/triton/cpu/KernelBench/level1/26_GELU_.py
+++ b/backends/triton/cpu/KernelBench/level1/26_GELU_.py
@@ -13,7 +13,7 @@
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_SIZE": 4096, "NUM_PROGS": 160}, num_warps=4, num_stages=2
+            {"BLOCK_SIZE": 32, "NUM_PROGS": 16},
         ),
     ],
     key=["n_elements"],
diff --git a/backends/triton/cpu/KernelBench/level1/27_SELU_.py b/backends/triton/cpu/KernelBench/level1/27_SELU_.py
index 1d8a3a7..3eb2540 100644
--- a/backends/triton/cpu/KernelBench/level1/27_SELU_.py
+++ b/backends/triton/cpu/KernelBench/level1/27_SELU_.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=3),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["n_elements"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py b/backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py
index 6bde4bb..2d9d71b 100644
--- a/backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py
+++ b/backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["N"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/29_Softplus.py b/backends/triton/cpu/KernelBench/level1/29_Softplus.py
index d561bab..1829f87 100644
--- a/backends/triton/cpu/KernelBench/level1/29_Softplus.py
+++ b/backends/triton/cpu/KernelBench/level1/29_Softplus.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["n_elements"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py b/backends/triton/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py
index 761dc6e..e002715 100644
--- a/backends/triton/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py
+++ b/backends/triton/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py
@@ -13,9 +13,7 @@
 def _configs():
     return [
         triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-            num_warps=32,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
         ),
     ]
 
diff --git a/backends/triton/cpu/KernelBench/level1/30_Softsign.py b/backends/triton/cpu/KernelBench/level1/30_Softsign.py
index 6290f9a..574b897 100644
--- a/backends/triton/cpu/KernelBench/level1/30_Softsign.py
+++ b/backends/triton/cpu/KernelBench/level1/30_Softsign.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["n_elements"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/31_ELU.py b/backends/triton/cpu/KernelBench/level1/31_ELU.py
index c9e0a26..0397929 100644
--- a/backends/triton/cpu/KernelBench/level1/31_ELU.py
+++ b/backends/triton/cpu/KernelBench/level1/31_ELU.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["n_elements"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/32_HardTanh.py b/backends/triton/cpu/KernelBench/level1/32_HardTanh.py
index c5fa253..12e4fb8 100644
--- a/backends/triton/cpu/KernelBench/level1/32_HardTanh.py
+++ b/backends/triton/cpu/KernelBench/level1/32_HardTanh.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["n_elements"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/33_BatchNorm.py b/backends/triton/cpu/KernelBench/level1/33_BatchNorm.py
index 4b2eab4..9f8b17f 100644
--- a/backends/triton/cpu/KernelBench/level1/33_BatchNorm.py
+++ b/backends/triton/cpu/KernelBench/level1/33_BatchNorm.py
@@ -71,7 +71,7 @@ def _bn_stats_kernel(
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 4096, "warp_size": 32}, num_warps=8),
+        triton.Config({"BLOCK_SIZE": 32, "warp_size": 32}, num_warps=8),
     ],
     key=["total_elements"],
 )
@@ -153,7 +153,6 @@ def forward(self, x):
                 stride_c,
                 B=B,
                 BLOCK_HW=8192,
-                num_warps=8,
             )
             _bn_stats_kernel[(C,)](
                 self._partial_sum,
@@ -166,7 +165,6 @@ def forward(self, x):
                 self.eps,
                 B=B,
                 BLOCK_B=triton.next_power_of_2(B),
-                num_warps=4,
             )
         else:
             inv_std = 1.0 / torch.sqrt(self.running_var + self.eps)
diff --git a/backends/triton/cpu/KernelBench/level1/34_InstanceNorm.py b/backends/triton/cpu/KernelBench/level1/34_InstanceNorm.py
index 4149704..17923cd 100644
--- a/backends/triton/cpu/KernelBench/level1/34_InstanceNorm.py
+++ b/backends/triton/cpu/KernelBench/level1/34_InstanceNorm.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 2048, "warp_size": 32}, num_warps=8, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32, "warp_size": 32},
+        ),
     ],
     key=["N"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py b/backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py
index 4fab4bb..326cfab 100644
--- a/backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py
+++ b/backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_HW": 4096, "warp_size": 32}, num_warps=8, num_stages=2),
+        triton.Config(
+            {"BLOCK_HW": 32, "warp_size": 32},
+        ),
     ],
     key=["HW", "channels_per_group"],
 )
@@ -62,7 +64,9 @@ def group_norm_stats_kernel(
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_HW": 4096, "warp_size": 32}, num_warps=8, num_stages=2),
+        triton.Config(
+            {"BLOCK_HW": 32, "warp_size": 32},
+        ),
     ],
     key=["HW"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py b/backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py
index de1949b..ae56e24 100644
--- a/backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py
+++ b/backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_S": 64}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_S": 32},
+        ),
     ],
     key=["S", "F"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py b/backends/triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py
index d83afe2..ca79b72 100644
--- a/backends/triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py
+++ b/backends/triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py
@@ -9,7 +9,7 @@
 import triton
 import triton.language as tl
 
-REDUCE_BLOCK: tl.constexpr = 8192
+REDUCE_BLOCK: tl.constexpr = 128
 
 
 @triton.jit
@@ -48,7 +48,9 @@ def _reduce_kernel(
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 2048}, num_warps=8, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["N"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/38_L1Norm_.py b/backends/triton/cpu/KernelBench/level1/38_L1Norm_.py
index 4a3a396..30f0892 100644
--- a/backends/triton/cpu/KernelBench/level1/38_L1Norm_.py
+++ b/backends/triton/cpu/KernelBench/level1/38_L1Norm_.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=8, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["N"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/39_L2Norm_.py b/backends/triton/cpu/KernelBench/level1/39_L2Norm_.py
index ecd571f..4a5d86b 100644
--- a/backends/triton/cpu/KernelBench/level1/39_L2Norm_.py
+++ b/backends/triton/cpu/KernelBench/level1/39_L2Norm_.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["N"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py b/backends/triton/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py
index 9380f52..8e07a2f 100644
--- a/backends/triton/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py
+++ b/backends/triton/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py
@@ -13,9 +13,7 @@
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-            num_warps=32,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
         ),
     ],
     key=["M", "N", "K"],
diff --git a/backends/triton/cpu/KernelBench/level1/40_LayerNorm.py b/backends/triton/cpu/KernelBench/level1/40_LayerNorm.py
index e0954b2..2b0491e 100644
--- a/backends/triton/cpu/KernelBench/level1/40_LayerNorm.py
+++ b/backends/triton/cpu/KernelBench/level1/40_LayerNorm.py
@@ -13,7 +13,7 @@
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_SIZE": 1024, "warp_size": 32}, num_warps=16, num_stages=2
+            {"BLOCK_SIZE": 32, "warp_size": 32},
         ),
     ],
     key=["N"],
diff --git a/backends/triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py b/backends/triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py
index 1d71cbb..9771174 100644
--- a/backends/triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py
+++ b/backends/triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 256}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["output_length"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py b/backends/triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py
index 0d5aa8e..573edd8 100644
--- a/backends/triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py
+++ b/backends/triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_H": 1, "BLOCK_W": 256}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_H": 1, "BLOCK_W": 32},
+        ),
     ],
     key=["OH", "OW"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py b/backends/triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py
index 84f6dc5..0b313a6 100644
--- a/backends/triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py
+++ b/backends/triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_OW": 32}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_OW": 32},
+        ),
     ],
     key=["OW"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/44_Average_Pooling_1D.py b/backends/triton/cpu/KernelBench/level1/44_Average_Pooling_1D.py
index 564bf7f..21234fb 100644
--- a/backends/triton/cpu/KernelBench/level1/44_Average_Pooling_1D.py
+++ b/backends/triton/cpu/KernelBench/level1/44_Average_Pooling_1D.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 256}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["output_length", "kernel_size"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/45_Average_Pooling_2D.py b/backends/triton/cpu/KernelBench/level1/45_Average_Pooling_2D.py
index 0e9e33f..3c5e3b1 100644
--- a/backends/triton/cpu/KernelBench/level1/45_Average_Pooling_2D.py
+++ b/backends/triton/cpu/KernelBench/level1/45_Average_Pooling_2D.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_OW": 16}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_OW": 16},
+        ),
     ],
     key=["OW"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/46_Average_Pooling_3D.py b/backends/triton/cpu/KernelBench/level1/46_Average_Pooling_3D.py
index f52c515..aa98269 100644
--- a/backends/triton/cpu/KernelBench/level1/46_Average_Pooling_3D.py
+++ b/backends/triton/cpu/KernelBench/level1/46_Average_Pooling_3D.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_OW": 64}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_OW": 32},
+        ),
     ],
     key=["OW"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py
index e5088dd..ad7fed5 100644
--- a/backends/triton/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py
+++ b/backends/triton/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py
@@ -12,7 +12,9 @@
 
 def get_reduction_configs():
     return [
-        triton.Config({"BLOCK_R": 64, "BLOCK_N": 128}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_R": 32, "BLOCK_N": 64},
+        ),
     ]
 
 
diff --git a/backends/triton/cpu/KernelBench/level1/48_Mean_reduction_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/48_Mean_reduction_over_a_dimension.py
index 7b43864..d1d0151 100644
--- a/backends/triton/cpu/KernelBench/level1/48_Mean_reduction_over_a_dimension.py
+++ b/backends/triton/cpu/KernelBench/level1/48_Mean_reduction_over_a_dimension.py
@@ -12,7 +12,9 @@
 
 def get_reduction_configs():
     return [
-        triton.Config({"BLOCK_R": 64, "BLOCK_N": 128}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_R": 32, "BLOCK_N": 64},
+        ),
     ]
 
 
diff --git a/backends/triton/cpu/KernelBench/level1/49_Max_reduction_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/49_Max_reduction_over_a_dimension.py
index a82f882..469246e 100644
--- a/backends/triton/cpu/KernelBench/level1/49_Max_reduction_over_a_dimension.py
+++ b/backends/triton/cpu/KernelBench/level1/49_Max_reduction_over_a_dimension.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_N": 64, "BLOCK_K": 64}, num_warps=8, num_stages=2),
+        triton.Config(
+            {"BLOCK_N": 32, "BLOCK_K": 64},
+        ),
     ],
     key=["DIM1", "DIM2"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/4_Matrix_vector_multiplication_.py b/backends/triton/cpu/KernelBench/level1/4_Matrix_vector_multiplication_.py
index 4573a0e..5407588 100644
--- a/backends/triton/cpu/KernelBench/level1/4_Matrix_vector_multiplication_.py
+++ b/backends/triton/cpu/KernelBench/level1/4_Matrix_vector_multiplication_.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_K": 512}, num_warps=4, num_stages=3),
+        triton.Config(
+            {"BLOCK_K": 32},
+        ),
     ],
     key=["K"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.py
index a2e0bcb..5fcaf7d 100644
--- a/backends/triton/cpu/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.py
@@ -34,9 +34,7 @@ def swizzle_tile(
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-            num_warps=32,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
         ),
     ],
     key=["M", "N", "K"],
diff --git a/backends/triton/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py
index 1b9d31d..4b7b467 100644
--- a/backends/triton/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py
+++ b/backends/triton/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_N": 64, "grf_mode": "128"}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_N": 32, "grf_mode": "128"},
+        ),
     ],
     key=["D1", "D2"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py
index 15dc776..7a8bb0c 100644
--- a/backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py
+++ b/backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py
@@ -13,7 +13,7 @@
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_D2": 128, "BLOCK_K": 32, "warp_size": 16}, num_warps=4, num_stages=2
+            {"BLOCK_D2": 128, "BLOCK_K": 32, "warp_size": 16},
         ),
     ],
     key=["D1", "D2"],
diff --git a/backends/triton/cpu/KernelBench/level1/53_Min_reduction_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/53_Min_reduction_over_a_dimension.py
index b4fc20a..75ecb5e 100644
--- a/backends/triton/cpu/KernelBench/level1/53_Min_reduction_over_a_dimension.py
+++ b/backends/triton/cpu/KernelBench/level1/53_Min_reduction_over_a_dimension.py
@@ -13,9 +13,7 @@
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_D1": 64, "BLOCK_D2": 256, "warp_size": 16},
-            num_warps=4,
-            num_stages=6,
+            {"BLOCK_D1": 32, "BLOCK_D2": 64, "warp_size": 16},
         ),
     ],
     key=["D1", "D2"],
diff --git a/backends/triton/cpu/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.py
index a6a0552..0df02f0 100644
--- a/backends/triton/cpu/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.py
@@ -20,14 +20,12 @@ def _to_triple(x):
     configs=[
         triton.Config(
             {
-                "BLOCK_M": 64,
-                "BLOCK_N": 64,
+                "BLOCK_M": 32,
+                "BLOCK_N": 32,
                 "BLOCK_K": 16,
                 "GROUP_M": 8,
                 "grf_mode": "256",
             },
-            num_warps=4,
-            num_stages=2,
         ),
     ],
     key=["C_out", "K_FUSED"],
diff --git a/backends/triton/cpu/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.py
index 5618624..3cbfdcf 100644
--- a/backends/triton/cpu/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.py
@@ -13,9 +13,7 @@
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_OW": 64, "BLOCK_N": 64, "BLOCK_K": 64, "grf_mode": "256"},
-            num_warps=4,
-            num_stages=2,
+            {"BLOCK_OW": 64, "BLOCK_N": 32, "BLOCK_K": 32, "grf_mode": "256"},
         ),
     ],
     key=["H", "W", "C_IN", "C_out", "OH", "OW"],
diff --git a/backends/triton/cpu/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py
index 0c35718..0455e06 100644
--- a/backends/triton/cpu/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py
@@ -13,9 +13,7 @@
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_OW": 64, "BLOCK_N": 64, "BLOCK_K": 64, "grf_mode": "128"},
-            num_warps=4,
-            num_stages=2,
+            {"BLOCK_OW": 64, "BLOCK_N": 32, "BLOCK_K": 32, "grf_mode": "128"},
         ),
     ],
     key=["H", "W", "C_IN", "C_out", "OH", "OW"],
diff --git a/backends/triton/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py
index 1206eac..849dcc7 100644
--- a/backends/triton/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py
@@ -16,13 +16,11 @@
         triton.Config(
             {
                 "BLOCK_OW": 64,
-                "BLOCK_N": 64,
-                "BLOCK_K": 64,
+                "BLOCK_N": 32,
+                "BLOCK_K": 32,
                 "GROUP_SIZE_M": 8,
                 "grf_mode": "256",
             },
-            num_warps=4,
-            num_stages=2,
         ),
     ],
     key=["H", "W", "C_IN", "C_out", "OH", "OW"],
diff --git a/backends/triton/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py
index 2c22089..fc34ee8 100644
--- a/backends/triton/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_W": 16, "BLOCK_OC": 16}, num_warps=4, num_stages=3),
+        triton.Config(
+            {"BLOCK_W": 16, "BLOCK_OC": 16},
+        ),
     ],
     key=["W_out", "C_out"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py
index 14a217f..288e4b1 100644
--- a/backends/triton/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py
@@ -14,14 +14,12 @@
     configs=[
         triton.Config(
             {
-                "BLOCK_M": 64,
-                "BLOCK_N": 64,
+                "BLOCK_M": 32,
+                "BLOCK_N": 32,
                 "BLOCK_K": 16,
                 "GROUP_M": 8,
                 "grf_mode": "256",
             },
-            num_warps=4,
-            num_stages=2,
         ),
     ],
     key=["C_out", "K_FUSED"],
diff --git a/backends/triton/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py b/backends/triton/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py
index fe569a8..0b1463a 100644
--- a/backends/triton/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py
+++ b/backends/triton/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024 * 2}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["n_elements"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py
index 529f8ee..d23dd44 100644
--- a/backends/triton/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py
@@ -20,14 +20,12 @@ def _to_triple(x):
     configs=[
         triton.Config(
             {
-                "BLOCK_M": 64,
-                "BLOCK_N": 64,
+                "BLOCK_M": 32,
+                "BLOCK_N": 32,
                 "BLOCK_K": 16,
                 "GROUP_M": 8,
                 "grf_mode": "256",
             },
-            num_warps=4,
-            num_stages=2,
         ),
     ],
     key=["C_out", "K_FUSED"],
diff --git a/backends/triton/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py
index d7a5e6d..93214fb 100644
--- a/backends/triton/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py
@@ -13,9 +13,7 @@
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_OW": 64, "BLOCK_N": 64, "BLOCK_K": 16, "grf_mode": "256"},
-            num_warps=4,
-            num_stages=3,
+            {"BLOCK_OW": 64, "BLOCK_N": 32, "BLOCK_K": 16, "grf_mode": "256"},
         ),
     ],
     key=["D", "H", "W", "C_IN", "C_OUT", "OD", "OH", "OW"],
diff --git a/backends/triton/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py
index 0c35718..0455e06 100644
--- a/backends/triton/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py
@@ -13,9 +13,7 @@
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_OW": 64, "BLOCK_N": 64, "BLOCK_K": 64, "grf_mode": "128"},
-            num_warps=4,
-            num_stages=2,
+            {"BLOCK_OW": 64, "BLOCK_N": 32, "BLOCK_K": 32, "grf_mode": "128"},
         ),
     ],
     key=["H", "W", "C_IN", "C_out", "OH", "OW"],
diff --git a/backends/triton/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py
index 760a8fa..da19242 100644
--- a/backends/triton/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py
@@ -13,9 +13,7 @@
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 16, "grf_mode": "128"},
-            num_warps=4,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16, "grf_mode": "128"},
         ),
     ],
     key=["M_total", "C_out", "C_IN"],
diff --git a/backends/triton/cpu/KernelBench/level1/64_conv_transposed_1D.py b/backends/triton/cpu/KernelBench/level1/64_conv_transposed_1D.py
index c9f1851..adda572 100644
--- a/backends/triton/cpu/KernelBench/level1/64_conv_transposed_1D.py
+++ b/backends/triton/cpu/KernelBench/level1/64_conv_transposed_1D.py
@@ -13,9 +13,7 @@
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16, "GROUP_M": 8},
-            num_warps=4,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16, "GROUP_M": 8},
         ),
     ],
     key=["C_IN", "C_out", "OL"],
diff --git a/backends/triton/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py
index 81d1992..1e834ce 100644
--- a/backends/triton/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py
@@ -16,13 +16,11 @@
         triton.Config(
             {
                 "BLOCK_OW": 64,
-                "BLOCK_N": 64,
-                "BLOCK_K": 64,
+                "BLOCK_N": 32,
+                "BLOCK_K": 32,
                 "GROUP_SIZE_M": 8,
                 "grf_mode": "256",
             },
-            num_warps=4,
-            num_stages=2,
         ),
     ],
     key=["H", "W", "C_IN", "C_out", "OH", "OW"],
diff --git a/backends/triton/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py
index 656518f..7f8eff7 100644
--- a/backends/triton/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py
@@ -19,9 +19,7 @@ def _to_triple(x):
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16, "grf_mode": "256"},
-            num_warps=4,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16, "grf_mode": "256"},
         ),
     ],
     key=["C_out", "K_FUSED"],
diff --git a/backends/triton/cpu/KernelBench/level1/67_conv_standard_1D.py b/backends/triton/cpu/KernelBench/level1/67_conv_standard_1D.py
index 7abf077..9a6ba31 100644
--- a/backends/triton/cpu/KernelBench/level1/67_conv_standard_1D.py
+++ b/backends/triton/cpu/KernelBench/level1/67_conv_standard_1D.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_OL": 64, "BLOCK_N": 128}, num_warps=4, num_stages=3),
+        triton.Config(
+            {"BLOCK_OL": 64, "BLOCK_N": 128},
+        ),
     ],
     key=["OL", "OC"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py
index 9bcf102..d2a7ff6 100644
--- a/backends/triton/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_W": 16, "BLOCK_OC": 16}, num_warps=4, num_stages=3),
+        triton.Config(
+            {"BLOCK_W": 16, "BLOCK_OC": 16},
+        ),
     ],
     key=["W_out", "C_out"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py
index 233c9b5..3693da9 100644
--- a/backends/triton/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py
@@ -16,13 +16,11 @@
         triton.Config(
             {
                 "BLOCK_OW": 64,
-                "BLOCK_N": 64,
-                "BLOCK_K": 64,
+                "BLOCK_N": 32,
+                "BLOCK_K": 32,
                 "GROUP_SIZE_M": 8,
                 "grf_mode": "256",
             },
-            num_warps=4,
-            num_stages=2,
         ),
     ],
     key=["H", "W", "C_IN", "C_out", "OH", "OW"],
diff --git a/backends/triton/cpu/KernelBench/level1/6_Matmul_with_large_K_dimension_.py b/backends/triton/cpu/KernelBench/level1/6_Matmul_with_large_K_dimension_.py
index 2dd9faa..055a1a6 100644
--- a/backends/triton/cpu/KernelBench/level1/6_Matmul_with_large_K_dimension_.py
+++ b/backends/triton/cpu/KernelBench/level1/6_Matmul_with_large_K_dimension_.py
@@ -13,9 +13,7 @@
 def _configs():
     return [
         triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_SIZE_M": 4},
-            num_warps=8,
-            num_stages=3,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
         ),
     ]
 
diff --git a/backends/triton/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py
index 9418854..a5ee485 100644
--- a/backends/triton/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_W": 16, "BLOCK_OC": 32}, num_warps=4, num_stages=3),
+        triton.Config(
+            {"BLOCK_W": 16, "BLOCK_OC": 32},
+        ),
     ],
     key=["W_out", "C_out"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py
index e93ce1a..5e8915f 100644
--- a/backends/triton/cpu/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py
@@ -21,8 +21,6 @@
                 "GROUP_SIZE_M": 8,
                 "grf_mode": "128",
             },
-            num_warps=4,
-            num_stages=3,
         ),
     ],
     key=["H", "W", "C_IN", "C_out", "OH", "OW"],
diff --git a/backends/triton/cpu/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py b/backends/triton/cpu/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py
index 8509131..e2b6816 100644
--- a/backends/triton/cpu/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py
+++ b/backends/triton/cpu/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_OW": 64, "BLOCK_OH": 2}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_OW": 64, "BLOCK_OH": 2},
+        ),
     ],
     key=["D_out", "H_out", "W_out"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py b/backends/triton/cpu/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py
index 91671df..19f2d5d 100644
--- a/backends/triton/cpu/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py
+++ b/backends/triton/cpu/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py
@@ -167,7 +167,7 @@ def kernel_function(
     )
 
     # choose a block size for the flattened spatial dimension
-    BLOCK = 256
+    BLOCK = 32
     grid = (B, C_out, triton.cdiv(Do * Ho * Wo, BLOCK))
 
     # launch
diff --git a/backends/triton/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py b/backends/triton/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py
index 830134b..fd7a5db 100644
--- a/backends/triton/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py
+++ b/backends/triton/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py
@@ -13,9 +13,7 @@
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16, "GROUP_M": 8},
-            num_warps=4,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16, "GROUP_M": 8},
         ),
     ],
     key=["C_IN", "C_out", "OL"],
diff --git a/backends/triton/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py b/backends/triton/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py
index 69289d0..3c89b3f 100644
--- a/backends/triton/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py
+++ b/backends/triton/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py
@@ -185,7 +185,7 @@ def kernel_function(
     out_stride_b, out_stride_c, out_stride_h, out_stride_w = out.stride()
 
     # 6) Launch parameters
-    BLOCK = 256
+    BLOCK = 32
     grid = (triton.cdiv(total_elems, BLOCK),)
 
     # 7) Launch kernel
diff --git a/backends/triton/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py b/backends/triton/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py
index 78155ce..d67ec91 100644
--- a/backends/triton/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py
+++ b/backends/triton/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_OL": 64, "BLOCK_N": 128}, num_warps=4, num_stages=3),
+        triton.Config(
+            {"BLOCK_OL": 32, "BLOCK_N": 64},
+        ),
     ],
     key=["OL", "OC"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py b/backends/triton/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py
index 9942ff4..31acbda 100644
--- a/backends/triton/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py
+++ b/backends/triton/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py
@@ -16,7 +16,9 @@ def _compute_output_size(input_size, kernel_size, stride, padding, dilation):
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_W": 16, "BLOCK_OC": 32}, num_warps=4, num_stages=4),
+        triton.Config(
+            {"BLOCK_W": 16, "BLOCK_OC": 32},
+        ),
     ],
     key=["W_act", "C_out"],
 )
@@ -197,7 +199,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
         output = self._output_buf
         N = output.numel()
-        BLOCK = 1024
+        BLOCK = 32
         _zero_kernel[(triton.cdiv(N, BLOCK),)](output, N, BLOCK=BLOCK)
 
         sx = x_cl3d.stride()
diff --git a/backends/triton/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py b/backends/triton/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py
index 854892d..00e4f51 100644
--- a/backends/triton/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py
+++ b/backends/triton/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py
@@ -21,8 +21,6 @@
                 "GROUP_SIZE_M": 8,
                 "grf_mode": "128",
             },
-            num_warps=4,
-            num_stages=3,
         ),
     ],
     key=["H", "W", "C_IN", "C_out", "OH", "OW"],
diff --git a/backends/triton/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py b/backends/triton/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py
index 84f13ff..02186e7 100644
--- a/backends/triton/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py
+++ b/backends/triton/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_L": 128}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_L": 32},
+        ),
     ],
     key=["N_NONZERO"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/7_Matmul_with_small_K_dimension_.py b/backends/triton/cpu/KernelBench/level1/7_Matmul_with_small_K_dimension_.py
index dc43e9f..7dedcbe 100644
--- a/backends/triton/cpu/KernelBench/level1/7_Matmul_with_small_K_dimension_.py
+++ b/backends/triton/cpu/KernelBench/level1/7_Matmul_with_small_K_dimension_.py
@@ -32,9 +32,7 @@ def swizzle_tile(
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64, "GROUP_SIZE_M": 4},
-            num_warps=32,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
         ),
     ],
     key=["M", "N", "K"],
diff --git a/backends/triton/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py b/backends/triton/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py
index 2494939..6d0a219 100644
--- a/backends/triton/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py
+++ b/backends/triton/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py
@@ -18,14 +18,12 @@ def _to_pair(x):
     configs=[
         triton.Config(
             {
-                "BLOCK_M": 64,
-                "BLOCK_N": 64,
+                "BLOCK_M": 32,
+                "BLOCK_N": 32,
                 "BLOCK_K": 32,
                 "GROUP_M": 8,
                 "grf_mode": "128",
             },
-            num_warps=4,
-            num_stages=3,
         ),
     ],
     key=["C_IN", "C_out", "OH", "OW"],
diff --git a/backends/triton/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py b/backends/triton/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py
index b282b7b..bfc9432 100644
--- a/backends/triton/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py
+++ b/backends/triton/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py
@@ -16,7 +16,9 @@ def _to_pair(x):
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_W": 16, "BLOCK_OC": 32}, num_warps=4, num_stages=3),
+        triton.Config(
+            {"BLOCK_W": 16, "BLOCK_OC": 32},
+        ),
     ],
     key=["W_IN", "C_out"],
 )
@@ -179,7 +181,7 @@ def forward(self, x):
 
         output = self._output
         N_elem = output.numel()
-        _zero_kernel[(triton.cdiv(N_elem, 1024),)](output, N_elem, BLOCK=1024)
+        _zero_kernel[(triton.cdiv(N_elem, 32),)](output, N_elem, BLOCK=32)
 
         sx = x_cl.stride()
         so = output.stride()
diff --git a/backends/triton/cpu/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.py b/backends/triton/cpu/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.py
index e8395b3..94660ed 100644
--- a/backends/triton/cpu/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=1),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["OHOW"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py
index 6b9c313..3190fb2 100644
--- a/backends/triton/cpu/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=1),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["OHOW"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py b/backends/triton/cpu/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py
index 94c2d3c..ec92ac7 100644
--- a/backends/triton/cpu/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=1),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["OHOW"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py
index 61d7834..c19d1ab 100644
--- a/backends/triton/cpu/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py
+++ b/backends/triton/cpu/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=1),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["OHOW"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/86_conv_depthwise_separable_2D.py b/backends/triton/cpu/KernelBench/level1/86_conv_depthwise_separable_2D.py
index 0bcdc18..c97f8bb 100644
--- a/backends/triton/cpu/KernelBench/level1/86_conv_depthwise_separable_2D.py
+++ b/backends/triton/cpu/KernelBench/level1/86_conv_depthwise_separable_2D.py
@@ -14,9 +14,7 @@
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 16, "grf_mode": "128"},
-            num_warps=4,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16, "grf_mode": "128"},
         ),
     ],
     key=["M_total", "C_out", "C_IN"],
diff --git a/backends/triton/cpu/KernelBench/level1/87_conv_pointwise_2D.py b/backends/triton/cpu/KernelBench/level1/87_conv_pointwise_2D.py
index cf48a00..e6c8cbc 100644
--- a/backends/triton/cpu/KernelBench/level1/87_conv_pointwise_2D.py
+++ b/backends/triton/cpu/KernelBench/level1/87_conv_pointwise_2D.py
@@ -94,9 +94,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         weight = self.conv1d.weight.squeeze(-1).squeeze(-1).contiguous()
         N_spatial = H * W
 
-        BLOCK_M = 128
-        BLOCK_N = 128
-        BLOCK_K = 64
+        BLOCK_M = 32
+        BLOCK_N = 32
+        BLOCK_K = 16
         GROUP_M = 8
 
         out = torch.empty((B, C_out, H, W), device=x.device, dtype=x.dtype)
@@ -122,8 +122,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             BLOCK_N=BLOCK_N,
             BLOCK_K=BLOCK_K,
             GROUP_M=GROUP_M,
-            num_warps=8,
-            num_stages=1,
         )
 
         if self.conv1d.bias is not None:
diff --git a/backends/triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py b/backends/triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py
index bde7264..e1f54dd 100644
--- a/backends/triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py
+++ b/backends/triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py
@@ -13,7 +13,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["n_elements"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/89_cumsum.py b/backends/triton/cpu/KernelBench/level1/89_cumsum.py
index 34e0a1e..b5e3ce0 100644
--- a/backends/triton/cpu/KernelBench/level1/89_cumsum.py
+++ b/backends/triton/cpu/KernelBench/level1/89_cumsum.py
@@ -17,7 +17,9 @@ def scan_add_op(a, b):
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 256}, num_warps=4, num_stages=1),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["N"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py b/backends/triton/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py
index ad4bb8a..15b7b38 100644
--- a/backends/triton/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py
+++ b/backends/triton/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py
@@ -13,9 +13,7 @@
 def get_autotune_configs():
     return [
         triton.Config(
-            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-            num_warps=32,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
         ),
     ]
 
diff --git a/backends/triton/cpu/KernelBench/level1/90_cumprod.py b/backends/triton/cpu/KernelBench/level1/90_cumprod.py
index e0d9e1e..4f9057f 100644
--- a/backends/triton/cpu/KernelBench/level1/90_cumprod.py
+++ b/backends/triton/cpu/KernelBench/level1/90_cumprod.py
@@ -17,7 +17,7 @@ def _mul_combine(a, b):
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 512}, num_warps=4),
+        triton.Config({"BLOCK_SIZE": 32}, num_warps=4),
     ],
     key=["N"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/91_cumsum_reverse.py b/backends/triton/cpu/KernelBench/level1/91_cumsum_reverse.py
index 3d78bb9..b9905e5 100644
--- a/backends/triton/cpu/KernelBench/level1/91_cumsum_reverse.py
+++ b/backends/triton/cpu/KernelBench/level1/91_cumsum_reverse.py
@@ -12,7 +12,7 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_N": 256}, num_warps=4),
+        triton.Config({"BLOCK_N": 32}, num_warps=4),
     ],
     key=["N"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/92_cumsum_exclusive.py b/backends/triton/cpu/KernelBench/level1/92_cumsum_exclusive.py
index 94e17ba..eba7c00 100644
--- a/backends/triton/cpu/KernelBench/level1/92_cumsum_exclusive.py
+++ b/backends/triton/cpu/KernelBench/level1/92_cumsum_exclusive.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["N"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/93_masked_cumsum.py b/backends/triton/cpu/KernelBench/level1/93_masked_cumsum.py
index b440482..cdc00e1 100644
--- a/backends/triton/cpu/KernelBench/level1/93_masked_cumsum.py
+++ b/backends/triton/cpu/KernelBench/level1/93_masked_cumsum.py
@@ -17,7 +17,9 @@ def scan_add_op(a, b):
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 512}, num_warps=4, num_stages=1),
+        triton.Config(
+            {"BLOCK_SIZE": 32},
+        ),
     ],
     key=["N"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/94_MSELoss.py b/backends/triton/cpu/KernelBench/level1/94_MSELoss.py
index 5111369..51e21e6 100644
--- a/backends/triton/cpu/KernelBench/level1/94_MSELoss.py
+++ b/backends/triton/cpu/KernelBench/level1/94_MSELoss.py
@@ -13,9 +13,7 @@
 def get_autotune_configs():
     configs = [
         triton.Config(
-            {"BLOCK_SIZE": 1024},
-            num_warps=4,
-            num_stages=1,
+            {"BLOCK_SIZE": 32},
         )
     ]
     return configs
diff --git a/backends/triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py b/backends/triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py
index 6a9f8e8..dca8570 100644
--- a/backends/triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py
+++ b/backends/triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py
@@ -11,13 +11,7 @@
 
 
 def _ce_configs():
-    configs = []
-    for BN in [1024, 2048, 4096]:
-        for nw in [4, 8, 16]:
-            for ws in [16, 32]:
-                configs.append(
-                    triton.Config({"BLOCK_N": BN, "warp_size": ws}, num_warps=nw)
-                )
+    configs = [triton.Config({"BLOCK_N": 32})]
     return configs
 
 
@@ -31,7 +25,6 @@ def _cross_entropy_online_kernel(
     stride_lm,
     stride_ln,
     BLOCK_N: tl.constexpr,
-    warp_size: tl.constexpr,
 ):
     row = tl.program_id(0)
     row_off = row.to(tl.int64) * stride_lm
diff --git a/backends/triton/cpu/KernelBench/level1/96_HuberLoss.py b/backends/triton/cpu/KernelBench/level1/96_HuberLoss.py
index 30e4887..9f09d74 100644
--- a/backends/triton/cpu/KernelBench/level1/96_HuberLoss.py
+++ b/backends/triton/cpu/KernelBench/level1/96_HuberLoss.py
@@ -12,7 +12,7 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_SIZE": 4096}, num_warps=4),
+        triton.Config({"BLOCK_SIZE": 32}, num_warps=4),
     ],
     key=["n_cols"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/97_ScaledDotProductAttention.py b/backends/triton/cpu/KernelBench/level1/97_ScaledDotProductAttention.py
index dc20d99..a378b24 100644
--- a/backends/triton/cpu/KernelBench/level1/97_ScaledDotProductAttention.py
+++ b/backends/triton/cpu/KernelBench/level1/97_ScaledDotProductAttention.py
@@ -15,7 +15,7 @@
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64}, num_warps=16, num_stages=2
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32},
         ),
     ],
     key=["SEQ_LEN", "HEAD_DIM"],
@@ -93,7 +93,7 @@ def _qk_gemm_kernel(
 @triton.autotune(
     configs=[
         triton.Config(
-            {"BLOCK_M": 64, "BLOCK_K": 64, "BLOCK_N": 128}, num_warps=8, num_stages=3
+            {"BLOCK_M": 32, "BLOCK_K": 32, "BLOCK_N": 128},
         ),
     ],
     key=["SEQ_LEN", "HEAD_DIM"],
diff --git a/backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py b/backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py
index c0ed1ab..a5b876f 100644
--- a/backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py
+++ b/backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py
@@ -17,9 +17,7 @@
 def get_kl_div_configs():
     configs = [
         triton.Config(
-            {"BLOCK_SIZE": 4096},
-            num_warps=4,
-            num_stages=1,
+            {"BLOCK_SIZE": 32},
         )
     ]
     return configs
diff --git a/backends/triton/cpu/KernelBench/level1/99_TripletMarginLoss.py b/backends/triton/cpu/KernelBench/level1/99_TripletMarginLoss.py
index 237d264..2d0538c 100644
--- a/backends/triton/cpu/KernelBench/level1/99_TripletMarginLoss.py
+++ b/backends/triton/cpu/KernelBench/level1/99_TripletMarginLoss.py
@@ -12,7 +12,9 @@
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_K": 256}, num_warps=4, num_stages=2),
+        triton.Config(
+            {"BLOCK_K": 32},
+        ),
     ],
     key=["D"],
 )
diff --git a/backends/triton/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py b/backends/triton/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py
index 152975e..e5605d7 100644
--- a/backends/triton/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py
+++ b/backends/triton/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py
@@ -32,9 +32,7 @@ def swizzle_tile(
 def _configs():
     return [
         triton.Config(
-            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
-            num_warps=16,
-            num_stages=2,
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
         ),
     ]
 
diff --git a/problems/specs/KernelBench/level1/10_3D_tensor_matrix_multiplication.yaml b/problems/specs/KernelBench/level1/10_3D_tensor_matrix_multiplication.yaml
index 43eabc9..2f7f86a 100644
--- a/problems/specs/KernelBench/level1/10_3D_tensor_matrix_multiplication.yaml
+++ b/problems/specs/KernelBench/level1/10_3D_tensor_matrix_multiplication.yaml
@@ -20,10 +20,10 @@ simple-cpu:
   - params: [A, B]
     dtype: bfloat16
     dims:
-      N: 4
-      M: 32
-      K: 64
-      L: 24
+      N: 64
+      M: 256
+      K: 128
+      L: 96
     flop: "2*N*M*L*K"
 
 bench-gpu:
diff --git a/problems/specs/KernelBench/level1/11_4D_tensor_matrix_multiplication.yaml b/problems/specs/KernelBench/level1/11_4D_tensor_matrix_multiplication.yaml
index d0c5522..81822b0 100644
--- a/problems/specs/KernelBench/level1/11_4D_tensor_matrix_multiplication.yaml
+++ b/problems/specs/KernelBench/level1/11_4D_tensor_matrix_multiplication.yaml
@@ -22,10 +22,10 @@ simple-cpu:
     dtype: bfloat16
     dims:
       B: 2
-      I: 32
+      I: 128
       J: 64
-      L: 32
-      K: 24
+      L: 128
+      K: 96
     flop: "2*B*I*J*K*L"
 
 bench-gpu:
diff --git a/problems/specs/KernelBench/level1/13_Matmul_for_symmetric_matrices.yaml b/problems/specs/KernelBench/level1/13_Matmul_for_symmetric_matrices.yaml
index 9808093..3034011 100644
--- a/problems/specs/KernelBench/level1/13_Matmul_for_symmetric_matrices.yaml
+++ b/problems/specs/KernelBench/level1/13_Matmul_for_symmetric_matrices.yaml
@@ -20,4 +20,4 @@ simple-cpu:
   - params: [A, B]
     dtype: bfloat16
     dims:
-      N: 64
+      N: 128
diff --git a/problems/specs/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.yaml b/problems/specs/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.yaml
index b29c1f6..9247081 100644
--- a/problems/specs/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.yaml
+++ b/problems/specs/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.yaml
@@ -20,4 +20,4 @@ simple-cpu:
   - params: [A, B]
     dtype: bfloat16
     dims:
-      N: 64
+      N: 128
diff --git a/problems/specs/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.yaml b/problems/specs/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.yaml
index f6043d4..6ec276c 100644
--- a/problems/specs/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.yaml
+++ b/problems/specs/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.yaml
@@ -20,4 +20,4 @@ simple-cpu:
   - params: [A, B]
     dtype: bfloat16
     dims:
-      M: 64
+      M: 128
diff --git a/problems/specs/KernelBench/level1/1_Square_matrix_multiplication_.yaml b/problems/specs/KernelBench/level1/1_Square_matrix_multiplication_.yaml
index 2f5de5b..fef7c31 100644
--- a/problems/specs/KernelBench/level1/1_Square_matrix_multiplication_.yaml
+++ b/problems/specs/KernelBench/level1/1_Square_matrix_multiplication_.yaml
@@ -16,7 +16,7 @@ simple-cpu:
   - params: [A, B]
     dtype: bfloat16
     dims:
-      N: 128
+      N: 256
 
 bench-cpu:
   - params: [A, B]
diff --git a/problems/specs/KernelBench/level1/2_Standard_matrix_multiplication_.yaml b/problems/specs/KernelBench/level1/2_Standard_matrix_multiplication_.yaml
index f4c1f66..3e2b28a 100644
--- a/problems/specs/KernelBench/level1/2_Standard_matrix_multiplication_.yaml
+++ b/problems/specs/KernelBench/level1/2_Standard_matrix_multiplication_.yaml
@@ -18,9 +18,9 @@ simple-cpu:
   - params: [A, B]
     dtype: bfloat16
     dims:
-      M: 64
-      N: 128
-      K: 256
+      M: 256
+      N: 1024
+      K: 512
 
 bench-cpu:
   - params: [A, B]
diff --git a/problems/specs/KernelBench/level1/47_Sum_reduction_over_a_dimension.yaml b/problems/specs/KernelBench/level1/47_Sum_reduction_over_a_dimension.yaml
index 34c285c..4e1aa59 100644
--- a/problems/specs/KernelBench/level1/47_Sum_reduction_over_a_dimension.yaml
+++ b/problems/specs/KernelBench/level1/47_Sum_reduction_over_a_dimension.yaml
@@ -20,6 +20,6 @@ simple-cpu:
     dtype: float32
     dims:
       BATCH_SIZE: 2
-      DIM1: 64
-      DIM2: 63
+      DIM1: 128
+      DIM2: 127
       REDUCE_DIM: 1
diff --git a/problems/specs/KernelBench/level1/48_Mean_reduction_over_a_dimension.yaml b/problems/specs/KernelBench/level1/48_Mean_reduction_over_a_dimension.yaml
index 34c285c..4e1aa59 100644
--- a/problems/specs/KernelBench/level1/48_Mean_reduction_over_a_dimension.yaml
+++ b/problems/specs/KernelBench/level1/48_Mean_reduction_over_a_dimension.yaml
@@ -20,6 +20,6 @@ simple-cpu:
     dtype: float32
     dims:
       BATCH_SIZE: 2
-      DIM1: 64
-      DIM2: 63
+      DIM1: 128
+      DIM2: 127
       REDUCE_DIM: 1
diff --git a/problems/specs/KernelBench/level1/4_Matrix_vector_multiplication_.yaml b/problems/specs/KernelBench/level1/4_Matrix_vector_multiplication_.yaml
index d3afc05..36bce41 100644
--- a/problems/specs/KernelBench/level1/4_Matrix_vector_multiplication_.yaml
+++ b/problems/specs/KernelBench/level1/4_Matrix_vector_multiplication_.yaml
@@ -19,9 +19,9 @@ simple-cpu:
   - params: [A, B]
     dtype: bfloat16
     dims:
-      M: 64
+      M: 128
       N: 1
-      K: 256
+      K: 512
     flop: "2*M*N*K"
 
 bench-gpu:
diff --git a/problems/specs/KernelBench/level1/51_Argmax_over_a_dimension.yaml b/problems/specs/KernelBench/level1/51_Argmax_over_a_dimension.yaml
index 8b78339..340dc21 100644
--- a/problems/specs/KernelBench/level1/51_Argmax_over_a_dimension.yaml
+++ b/problems/specs/KernelBench/level1/51_Argmax_over_a_dimension.yaml
@@ -20,6 +20,6 @@ simple-cpu:
     dtype: float32
     dims:
       BATCH_SIZE: 2
-      DIM1: 64
-      DIM2: 63
+      DIM1: 256
+      DIM2: 255
       ARGMAX_DIM: 1
diff --git a/problems/specs/KernelBench/level1/52_Argmin_over_a_dimension.yaml b/problems/specs/KernelBench/level1/52_Argmin_over_a_dimension.yaml
index e20221d..f11d837 100644
--- a/problems/specs/KernelBench/level1/52_Argmin_over_a_dimension.yaml
+++ b/problems/specs/KernelBench/level1/52_Argmin_over_a_dimension.yaml
@@ -20,6 +20,6 @@ simple-cpu:
     dtype: float32
     dims:
       BATCH_SIZE: 2
-      DIM1: 64
-      DIM2: 63
+      DIM1: 256
+      DIM2: 255
       ARGMIN_DIM: 1
diff --git a/problems/specs/KernelBench/level1/53_Min_reduction_over_a_dimension.yaml b/problems/specs/KernelBench/level1/53_Min_reduction_over_a_dimension.yaml
index 34c285c..8075c38 100644
--- a/problems/specs/KernelBench/level1/53_Min_reduction_over_a_dimension.yaml
+++ b/problems/specs/KernelBench/level1/53_Min_reduction_over_a_dimension.yaml
@@ -20,6 +20,6 @@ simple-cpu:
     dtype: float32
     dims:
       BATCH_SIZE: 2
-      DIM1: 64
-      DIM2: 63
+      DIM1: 256
+      DIM2: 255
       REDUCE_DIM: 1
diff --git a/problems/specs/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.yaml b/problems/specs/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.yaml
index 4503c24..a8e7ec9 100644
--- a/problems/specs/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.yaml
+++ b/problems/specs/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.yaml
@@ -26,7 +26,7 @@ simple-cpu:
     dims:
       BATCH_SIZE: 2
       IN_CHANNELS: 3
-      OUT_CHANNELS: 16
+      OUT_CHANNELS: 64
       KERNEL_SIZE: 3
       DEPTH: 16
       WIDTH: 16
diff --git a/problems/specs/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.yaml b/problems/specs/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.yaml
index 4d58a3e..9e1aa1c 100644
--- a/problems/specs/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.yaml
+++ b/problems/specs/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.yaml
@@ -26,7 +26,7 @@ simple-cpu:
     dims:
       BATCH_SIZE: 2
       IN_CHANNELS: 3
-      OUT_CHANNELS: 16
+      OUT_CHANNELS: 64
       KERNEL_SIZE: 3
       HEIGHT: 32
       WIDTH: 32
diff --git a/problems/specs/KernelBench/level1/5_Matrix_scalar_multiplication.yaml b/problems/specs/KernelBench/level1/5_Matrix_scalar_multiplication.yaml
index d2ecbbe..99d0101 100644
--- a/problems/specs/KernelBench/level1/5_Matrix_scalar_multiplication.yaml
+++ b/problems/specs/KernelBench/level1/5_Matrix_scalar_multiplication.yaml
@@ -19,8 +19,8 @@ simple-cpu:
   - params: [A, B]
     dtype: bfloat16
     dims:
-      M: 64
-      N: 32
+      M: 256
+      N: 128
       UNIT: 1
     flop: "M*N"
 
diff --git a/problems/specs/KernelBench/level1/67_conv_standard_1D.yaml b/problems/specs/KernelBench/level1/67_conv_standard_1D.yaml
index c68f735..f87c7d2 100644
--- a/problems/specs/KernelBench/level1/67_conv_standard_1D.yaml
+++ b/problems/specs/KernelBench/level1/67_conv_standard_1D.yaml
@@ -22,7 +22,7 @@ simple-cpu:
   - params: [x]
     dtype: float16
     dims:
-      BATCH_SIZE: 2
+      BATCH_SIZE: 8
       IN_CHANNELS: 8
       OUT_CHANNELS: 16
       KERNEL_SIZE: 3
diff --git a/problems/specs/KernelBench/level1/6_Matmul_with_large_K_dimension_.yaml b/problems/specs/KernelBench/level1/6_Matmul_with_large_K_dimension_.yaml
index 73aa753..ff5ccde 100644
--- a/problems/specs/KernelBench/level1/6_Matmul_with_large_K_dimension_.yaml
+++ b/problems/specs/KernelBench/level1/6_Matmul_with_large_K_dimension_.yaml
@@ -19,8 +19,8 @@ simple-cpu:
   - params: [A, B]
     dtype: bfloat16
     dims:
-      M: 8
-      N: 16
+      M: 64
+      N: 32
       K: 512
     flop: "2*M*N*K"
 
diff --git a/problems/specs/KernelBench/level1/7_Matmul_with_small_K_dimension_.yaml b/problems/specs/KernelBench/level1/7_Matmul_with_small_K_dimension_.yaml
index 05b84d0..855daf5 100644
--- a/problems/specs/KernelBench/level1/7_Matmul_with_small_K_dimension_.yaml
+++ b/problems/specs/KernelBench/level1/7_Matmul_with_small_K_dimension_.yaml
@@ -19,9 +19,9 @@ simple-cpu:
   - params: [A, B]
     dtype: bfloat16
     dims:
-      M: 256
-      N: 256
-      K: 16
+      M: 512
+      N: 512
+      K: 32
     flop: "2*M*N*K"
 
 bench-gpu:
diff --git a/problems/specs/KernelBench/level1/87_conv_pointwise_2D.yaml b/problems/specs/KernelBench/level1/87_conv_pointwise_2D.yaml
index 6429cde..3ea00ab 100644
--- a/problems/specs/KernelBench/level1/87_conv_pointwise_2D.yaml
+++ b/problems/specs/KernelBench/level1/87_conv_pointwise_2D.yaml
@@ -24,5 +24,5 @@ simple-cpu:
       BATCH_SIZE: 2
       IN_CHANNELS: 4
       OUT_CHANNELS: 8
-      HEIGHT: 32
-      WIDTH: 32
+      HEIGHT: 64
+      WIDTH: 64
diff --git a/problems/specs/KernelBench/level1/8_Matmul_with_irregular_shapes_.yaml b/problems/specs/KernelBench/level1/8_Matmul_with_irregular_shapes_.yaml
index 1068de4..a05e921 100644
--- a/problems/specs/KernelBench/level1/8_Matmul_with_irregular_shapes_.yaml
+++ b/problems/specs/KernelBench/level1/8_Matmul_with_irregular_shapes_.yaml
@@ -19,9 +19,9 @@ simple-cpu:
   - params: [A, B]
     dtype: bfloat16
     dims:
-      M: 60
-      N: 37
-      K: 19
+      M: 607
+      N: 372
+      K: 191
     flop: "2*M*N*K"
 
 bench-gpu:
diff --git a/problems/specs/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.yaml b/problems/specs/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.yaml
index 9a25d89..c990777 100644
--- a/problems/specs/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.yaml
+++ b/problems/specs/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.yaml
@@ -18,8 +18,8 @@ simple-cpu:
   - params: [A, B]
     dtype: bfloat16
     dims:
-      M: 256
-      N: 16
+      M: 1024
+      N: 32
     flop: "2*M*M*N"
 
 bench-gpu:

From 45f21cf2758ae9bed857f18323bb8b3752390ac3 Mon Sep 17 00:00:00 2001
From: Marcin Spoczynski <marcin@spoczynski.com>
Date: Sat, 30 May 2026 22:55:02 -0700
Subject: [PATCH 3/3] Fix temporary mlir install

---
 pyproject.toml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ba285e0..6294db8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "AI-bench"
 dynamic = ["version"]
-requires-python = ">=3.10"
+requires-python = ">=3.10,<3.13"
 description = "AI kernel benchmarking harness"
 readme = "README.md"
 license = {text = "MIT"}
@@ -61,7 +61,11 @@ cuda = [
   "helion==1.0.0",
 ]
 mlir = [
-  "lighthouse[ingress_torch_mlir]",
+  # torch-mlir publishes Linux-only wheels for cp310-cp312, so restrict the
+  # extra to where it can resolve; this keeps universal `uv lock` from failing
+  # on python>=3.13 / non-Linux splits. The pinned wheel is cp312 x86_64 only.
+  "lighthouse; sys_platform == 'linux' and python_version < '3.13'",
+  "torch-mlir @ https://github.com/llvm/torch-mlir-release/releases/download/dev-wheels/torch_mlir-20260331.768-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl ; sys_platform == 'linux' and platform_machine == 'x86_64' and python_version == '3.12'",
   "ml-dtypes",
 ]
 # If triton-cpu build fails with a stale LLVM path, clear caches: