From 7f9e6f4ed5ffc22e9f4d16f559c663dca302e03b Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Tue, 26 May 2026 14:26:51 -0700 Subject: [PATCH 1/3] Add LLM-generated level 1 kernels for Triton CPU Co-authored-by: Marcin Spoczynski --- .../cpu/KernelBench/level1/100_HingeLoss.py | 81 ++++ .../10_3D_tensor_matrix_multiplication.py | 135 +++++++ .../11_4D_tensor_matrix_multiplication.py | 155 ++++++++ .../12_Matmul_with_diagonal_matrices_.py | 95 +++++ .../13_Matmul_for_symmetric_matrices.py | 151 ++++++++ ...14_Matmul_for_upper_triangular_matrices.py | 153 ++++++++ ...15_Matmul_for_lower_triangular_matrices.py | 146 ++++++++ .../level1/16_Matmul_with_transposed_A.py | 132 +++++++ .../level1/17_Matmul_with_transposed_B.py | 137 +++++++ .../level1/18_Matmul_with_transposed_both.py | 133 +++++++ .../triton/cpu/KernelBench/level1/19_ReLU.py | 63 ++++ .../level1/1_Square_matrix_multiplication_.py | 134 ++++--- .../cpu/KernelBench/level1/20_LeakyReLU.py | 69 ++++ .../cpu/KernelBench/level1/21_Sigmoid.py | 64 ++++ .../triton/cpu/KernelBench/level1/22_Tanh.py | 67 ++++ .../cpu/KernelBench/level1/23_Softmax.py | 101 +++++ .../cpu/KernelBench/level1/24_LogSoftmax.py | 94 +++++ .../triton/cpu/KernelBench/level1/25_Swish.py | 65 ++++ .../triton/cpu/KernelBench/level1/26_GELU_.py | 64 ++++ .../triton/cpu/KernelBench/level1/27_SELU_.py | 64 ++++ .../cpu/KernelBench/level1/28_HardSigmoid.py | 62 +++ .../cpu/KernelBench/level1/29_Softplus.py | 61 +++ .../2_Standard_matrix_multiplication_.py | 124 ++++++ .../cpu/KernelBench/level1/30_Softsign.py | 58 +++ .../triton/cpu/KernelBench/level1/31_ELU.py | 69 ++++ .../cpu/KernelBench/level1/32_HardTanh.py | 59 +++ .../cpu/KernelBench/level1/33_BatchNorm.py | 202 ++++++++++ .../cpu/KernelBench/level1/34_InstanceNorm.py | 87 +++++ .../cpu/KernelBench/level1/35_GroupNorm_.py | 183 +++++++++ .../cpu/KernelBench/level1/36_RMSNorm_.py | 109 ++++++ .../KernelBench/level1/37_FrobeniusNorm_.py | 110 ++++++ 
.../cpu/KernelBench/level1/38_L1Norm_.py | 88 +++++ .../cpu/KernelBench/level1/39_L2Norm_.py | 84 +++++ .../level1/3_Batched_matrix_multiplication.py | 147 ++++++++ .../cpu/KernelBench/level1/40_LayerNorm.py | 111 ++++++ .../KernelBench/level1/41_Max_Pooling_1D.py | 121 ++++++ .../KernelBench/level1/42_Max_Pooling_2D.py | 127 +++++++ .../KernelBench/level1/43_Max_Pooling_3D.py | 172 +++++++++ .../level1/44_Average_Pooling_1D.py | 125 +++++++ .../level1/45_Average_Pooling_2D.py | 115 ++++++ .../level1/46_Average_Pooling_3D.py | 157 ++++++++ .../47_Sum_reduction_over_a_dimension.py | 103 +++++ .../48_Mean_reduction_over_a_dimension.py | 106 ++++++ .../49_Max_reduction_over_a_dimension.py | 105 ++++++ .../level1/4_Matrix_vector_multiplication_.py | 76 ++++ ...tandard_2D__square_input__square_kernel.py | 232 ++++++++++++ .../level1/51_Argmax_over_a_dimension.py | 98 +++++ .../level1/52_Argmin_over_a_dimension.py | 119 ++++++ .../53_Min_reduction_over_a_dimension.py | 107 ++++++ ...tandard_3D__square_input__square_kernel.py | 252 +++++++++++++ ...ard_2D__asymmetric_input__square_kernel.py | 214 +++++++++++ ...2D__asymmetric_input__asymmetric_kernel.py | 205 ++++++++++ ...nsposed_2D__square_input__square_kernel.py | 229 ++++++++++++ ...3D__asymmetric_input__asymmetric_kernel.py | 238 ++++++++++++ ...ard_3D__asymmetric_input__square_kernel.py | 246 ++++++++++++ .../level1/5_Matrix_scalar_multiplication.py | 63 ++++ ...ard_3D__square_input__asymmetric_kernel.py | 252 +++++++++++++ ...nsposed_3D__square_input__square_kernel.py | 243 ++++++++++++ ...ard_2D__square_input__asymmetric_kernel.py | 205 ++++++++++ ...tandard_2D__square_input__square_kernel.py | 201 ++++++++++ .../level1/64_conv_transposed_1D.py | 195 ++++++++++ ...sed_2D__square_input__asymmetric_kernel.py | 229 ++++++++++++ ...3D__asymmetric_input__asymmetric_kernel.py | 242 ++++++++++++ .../KernelBench/level1/67_conv_standard_1D.py | 167 +++++++++ ...sed_3D__square_input__asymmetric_kernel.py | 239 ++++++++++++ 
...2D__asymmetric_input__asymmetric_kernel.py | 229 ++++++++++++ .../6_Matmul_with_large_K_dimension_.py | 133 +++++++ ...sed_3D__asymmetric_input__square_kernel.py | 234 ++++++++++++ ...sed_2D__asymmetric_input__square_kernel.py | 229 ++++++++++++ ...metric_kernel___strided_padded_grouped_.py | 289 ++++++++++++++ ..._square_kernel__strided_padded__grouped.py | 309 +++++++++++++++ .../level1/74_conv_transposed_1D_dilated.py | 199 ++++++++++ ...strided__grouped____padded____dilated__.py | 353 ++++++++++++++++++ .../76_conv_standard_1D_dilated_strided__.py | 174 +++++++++ ...kernel___padded____dilated____strided__.py | 259 +++++++++++++ ...tric_input_asymmetric_kernel___padded__.py | 244 ++++++++++++ ...kernel___padded____strided____dilated__.py | 180 +++++++++ .../7_Matmul_with_small_K_dimension_.py | 136 +++++++ ...asymmetric_kernel___dilated____padded__.py | 245 ++++++++++++ ...kernel___dilated____padded____strided__.py | 242 ++++++++++++ ...depthwise_2D_square_input_square_kernel.py | 117 ++++++ ...hwise_2D_square_input_asymmetric_kernel.py | 122 ++++++ ...hwise_2D_asymmetric_input_square_kernel.py | 119 ++++++ ...e_2D_asymmetric_input_asymmetric_kernel.py | 179 +++++++++ .../level1/86_conv_depthwise_separable_2D.py | 214 +++++++++++ .../level1/87_conv_pointwise_2D.py | 147 ++++++++ .../KernelBench/level1/88_MinGPTNewGelu.py | 65 ++++ .../cpu/KernelBench/level1/89_cumsum.py | 90 +++++ .../level1/8_Matmul_with_irregular_shapes_.py | 128 +++++++ .../cpu/KernelBench/level1/90_cumprod.py | 94 +++++ .../KernelBench/level1/91_cumsum_reverse.py | 91 +++++ .../KernelBench/level1/92_cumsum_exclusive.py | 88 +++++ .../KernelBench/level1/93_masked_cumsum.py | 89 +++++ .../cpu/KernelBench/level1/94_MSELoss.py | 107 ++++++ .../KernelBench/level1/95_CrossEntropyLoss.py | 107 ++++++ .../cpu/KernelBench/level1/96_HuberLoss.py | 97 +++++ .../level1/97_ScaledDotProductAttention.py | 280 ++++++++++++++ .../cpu/KernelBench/level1/98_KLDivLoss.py | 103 +++++ 
.../level1/99_TripletMarginLoss.py | 127 +++++++ .../9_Tall_skinny_matrix_multiplication_.py | 141 +++++++ .../KernelBench/level1/100_HingeLoss.yaml | 7 + .../10_3D_tensor_matrix_multiplication.yaml | 10 + .../11_4D_tensor_matrix_multiplication.yaml | 11 + .../12_Matmul_with_diagonal_matrices_.yaml | 8 + .../13_Matmul_for_symmetric_matrices.yaml | 6 + ..._Matmul_for_upper_triangular_matrices.yaml | 6 + ..._Matmul_for_lower_triangular_matrices.yaml | 6 + .../level1/16_Matmul_with_transposed_A.yaml | 9 + .../level1/17_Matmul_with_transposed_B.yaml | 9 + .../18_Matmul_with_transposed_both.yaml | 9 + .../specs/KernelBench/level1/19_ReLU.yaml | 7 + .../1_Square_matrix_multiplication_.yaml | 6 + .../KernelBench/level1/20_LeakyReLU.yaml | 8 + .../specs/KernelBench/level1/21_Sigmoid.yaml | 7 + .../specs/KernelBench/level1/22_Tanh.yaml | 7 + .../specs/KernelBench/level1/23_Softmax.yaml | 7 + .../KernelBench/level1/24_LogSoftmax.yaml | 7 + .../specs/KernelBench/level1/25_Swish.yaml | 7 + .../specs/KernelBench/level1/26_GELU_.yaml | 7 + .../specs/KernelBench/level1/27_SELU_.yaml | 7 + .../KernelBench/level1/28_HardSigmoid.yaml | 7 + .../specs/KernelBench/level1/29_Softplus.yaml | 7 + .../2_Standard_matrix_multiplication_.yaml | 8 + .../specs/KernelBench/level1/30_Softsign.yaml | 7 + problems/specs/KernelBench/level1/31_ELU.yaml | 8 + .../specs/KernelBench/level1/32_HardTanh.yaml | 7 + .../KernelBench/level1/33_BatchNorm.yaml | 9 + .../KernelBench/level1/34_InstanceNorm.yaml | 9 + .../KernelBench/level1/35_GroupNorm_.yaml | 10 + .../specs/KernelBench/level1/36_RMSNorm_.yaml | 9 + .../KernelBench/level1/37_FrobeniusNorm_.yaml | 9 + .../specs/KernelBench/level1/38_L1Norm_.yaml | 7 + .../specs/KernelBench/level1/39_L2Norm_.yaml | 7 + .../3_Batched_matrix_multiplication.yaml | 9 + .../KernelBench/level1/40_LayerNorm.yaml | 10 + .../KernelBench/level1/41_Max_Pooling_1D.yaml | 13 + .../KernelBench/level1/42_Max_Pooling_2D.yaml | 13 + .../KernelBench/level1/43_Max_Pooling_3D.yaml | 
17 +- .../level1/44_Average_Pooling_1D.yaml | 11 + .../level1/45_Average_Pooling_2D.yaml | 10 + .../level1/46_Average_Pooling_3D.yaml | 13 + .../47_Sum_reduction_over_a_dimension.yaml | 9 + .../48_Mean_reduction_over_a_dimension.yaml | 9 + .../49_Max_reduction_over_a_dimension.yaml | 9 + .../4_Matrix_vector_multiplication_.yaml | 9 + ...ndard_2D__square_input__square_kernel.yaml | 10 + .../level1/51_Argmax_over_a_dimension.yaml | 9 + .../level1/52_Argmin_over_a_dimension.yaml | 9 + .../53_Min_reduction_over_a_dimension.yaml | 9 + ...ndard_3D__square_input__square_kernel.yaml | 12 + ...d_2D__asymmetric_input__square_kernel.yaml | 11 + ...__asymmetric_input__asymmetric_kernel.yaml | 12 + ...posed_2D__square_input__square_kernel.yaml | 11 + ...__asymmetric_input__asymmetric_kernel.yaml | 12 + ...d_3D__asymmetric_input__square_kernel.yaml | 12 + .../5_Matrix_scalar_multiplication.yaml | 9 + ...d_3D__square_input__asymmetric_kernel.yaml | 12 + ...posed_3D__square_input__square_kernel.yaml | 12 + ...d_2D__square_input__asymmetric_kernel.yaml | 11 + ...ndard_2D__square_input__square_kernel.yaml | 11 + .../level1/64_conv_transposed_1D.yaml | 10 + ...d_2D__square_input__asymmetric_kernel.yaml | 11 + ...__asymmetric_input__asymmetric_kernel.yaml | 12 + .../level1/67_conv_standard_1D.yaml | 10 + ...d_3D__square_input__asymmetric_kernel.yaml | 12 + ...__asymmetric_input__asymmetric_kernel.yaml | 11 + .../6_Matmul_with_large_K_dimension_.yaml | 9 + ...d_3D__asymmetric_input__square_kernel.yaml | 12 + ...d_2D__asymmetric_input__square_kernel.yaml | 11 + ...tric_kernel___strided_padded_grouped_.yaml | 16 + ...quare_kernel__strided_padded__grouped.yaml | 15 + .../level1/74_conv_transposed_1D_dilated.yaml | 13 + ...rided__grouped____padded____dilated__.yaml | 15 + ...76_conv_standard_1D_dilated_strided__.yaml | 12 + ...rnel___padded____dilated____strided__.yaml | 15 + ...ic_input_asymmetric_kernel___padded__.yaml | 13 + ...rnel___padded____strided____dilated__.yaml | 13 + 
.../7_Matmul_with_small_K_dimension_.yaml | 9 + ...ymmetric_kernel___dilated____padded__.yaml | 14 + ...rnel___dilated____padded____strided__.yaml | 14 + ...pthwise_2D_square_input_square_kernel.yaml | 12 + ...ise_2D_square_input_asymmetric_kernel.yaml | 13 + ...ise_2D_asymmetric_input_square_kernel.yaml | 13 + ...2D_asymmetric_input_asymmetric_kernel.yaml | 19 + .../86_conv_depthwise_separable_2D.yaml | 15 + .../level1/87_conv_pointwise_2D.yaml | 10 + .../KernelBench/level1/88_MinGPTNewGelu.yaml | 7 + .../specs/KernelBench/level1/89_cumsum.yaml | 8 + .../8_Matmul_with_irregular_shapes_.yaml | 9 + .../specs/KernelBench/level1/90_cumprod.yaml | 8 + .../KernelBench/level1/91_cumsum_reverse.yaml | 8 + .../level1/92_cumsum_exclusive.yaml | 8 + .../KernelBench/level1/93_masked_cumsum.yaml | 8 + .../specs/KernelBench/level1/94_MSELoss.yaml | 7 + .../level1/95_CrossEntropyLoss.yaml | 7 + .../KernelBench/level1/96_HuberLoss.yaml | 7 + .../level1/97_ScaledDotProductAttention.yaml | 9 + .../KernelBench/level1/98_KLDivLoss.yaml | 7 + .../level1/99_TripletMarginLoss.yaml | 8 + .../9_Tall_skinny_matrix_multiplication_.yaml | 8 + pyproject.toml | 2 +- 201 files changed, 15733 insertions(+), 56 deletions(-) create mode 100644 backends/triton/cpu/KernelBench/level1/100_HingeLoss.py create mode 100644 backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py create mode 100644 backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py create mode 100644 backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py create mode 100644 backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py create mode 100644 backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py create mode 100644 backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py create mode 100644 backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py create mode 100644 
backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py create mode 100644 backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py create mode 100644 backends/triton/cpu/KernelBench/level1/19_ReLU.py create mode 100644 backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py create mode 100644 backends/triton/cpu/KernelBench/level1/21_Sigmoid.py create mode 100644 backends/triton/cpu/KernelBench/level1/22_Tanh.py create mode 100644 backends/triton/cpu/KernelBench/level1/23_Softmax.py create mode 100644 backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py create mode 100644 backends/triton/cpu/KernelBench/level1/25_Swish.py create mode 100644 backends/triton/cpu/KernelBench/level1/26_GELU_.py create mode 100644 backends/triton/cpu/KernelBench/level1/27_SELU_.py create mode 100644 backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py create mode 100644 backends/triton/cpu/KernelBench/level1/29_Softplus.py create mode 100644 backends/triton/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py create mode 100644 backends/triton/cpu/KernelBench/level1/30_Softsign.py create mode 100644 backends/triton/cpu/KernelBench/level1/31_ELU.py create mode 100644 backends/triton/cpu/KernelBench/level1/32_HardTanh.py create mode 100644 backends/triton/cpu/KernelBench/level1/33_BatchNorm.py create mode 100644 backends/triton/cpu/KernelBench/level1/34_InstanceNorm.py create mode 100644 backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py create mode 100644 backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py create mode 100644 backends/triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py create mode 100644 backends/triton/cpu/KernelBench/level1/38_L1Norm_.py create mode 100644 backends/triton/cpu/KernelBench/level1/39_L2Norm_.py create mode 100644 backends/triton/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py create mode 100644 backends/triton/cpu/KernelBench/level1/40_LayerNorm.py create mode 100644 
backends/triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py create mode 100644 backends/triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py create mode 100644 backends/triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py create mode 100644 backends/triton/cpu/KernelBench/level1/44_Average_Pooling_1D.py create mode 100644 backends/triton/cpu/KernelBench/level1/45_Average_Pooling_2D.py create mode 100644 backends/triton/cpu/KernelBench/level1/46_Average_Pooling_3D.py create mode 100644 backends/triton/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py create mode 100644 backends/triton/cpu/KernelBench/level1/48_Mean_reduction_over_a_dimension.py create mode 100644 backends/triton/cpu/KernelBench/level1/49_Max_reduction_over_a_dimension.py create mode 100644 backends/triton/cpu/KernelBench/level1/4_Matrix_vector_multiplication_.py create mode 100644 backends/triton/cpu/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py create mode 100644 backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py create mode 100644 backends/triton/cpu/KernelBench/level1/53_Min_reduction_over_a_dimension.py create mode 100644 backends/triton/cpu/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py create mode 100644 
backends/triton/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py create mode 100644 backends/triton/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/64_conv_transposed_1D.py create mode 100644 backends/triton/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/67_conv_standard_1D.py create mode 100644 backends/triton/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/6_Matmul_with_large_K_dimension_.py create mode 100644 backends/triton/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py create mode 100644 backends/triton/cpu/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py create mode 100644 backends/triton/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py create mode 100644 
backends/triton/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py create mode 100644 backends/triton/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py create mode 100644 backends/triton/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py create mode 100644 backends/triton/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py create mode 100644 backends/triton/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py create mode 100644 backends/triton/cpu/KernelBench/level1/7_Matmul_with_small_K_dimension_.py create mode 100644 backends/triton/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py create mode 100644 backends/triton/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py create mode 100644 backends/triton/cpu/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py create mode 100644 backends/triton/cpu/KernelBench/level1/86_conv_depthwise_separable_2D.py create mode 100644 backends/triton/cpu/KernelBench/level1/87_conv_pointwise_2D.py create mode 100644 backends/triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py create mode 100644 backends/triton/cpu/KernelBench/level1/89_cumsum.py create mode 100644 backends/triton/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py create mode 100644 backends/triton/cpu/KernelBench/level1/90_cumprod.py create mode 100644 
backends/triton/cpu/KernelBench/level1/91_cumsum_reverse.py create mode 100644 backends/triton/cpu/KernelBench/level1/92_cumsum_exclusive.py create mode 100644 backends/triton/cpu/KernelBench/level1/93_masked_cumsum.py create mode 100644 backends/triton/cpu/KernelBench/level1/94_MSELoss.py create mode 100644 backends/triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py create mode 100644 backends/triton/cpu/KernelBench/level1/96_HuberLoss.py create mode 100644 backends/triton/cpu/KernelBench/level1/97_ScaledDotProductAttention.py create mode 100644 backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py create mode 100644 backends/triton/cpu/KernelBench/level1/99_TripletMarginLoss.py create mode 100644 backends/triton/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py diff --git a/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py b/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py new file mode 100644 index 0000000..1903395 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py @@ -0,0 +1,81 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + ], + key=["D"], +) +@triton.jit +def _hinge_loss_kernel( + pred_ptr, + targ_ptr, + out_ptr, + B, + D, + stride_pb, + BLOCK_SIZE: tl.constexpr, +): + row_idx = tl.program_id(0) + row_start = row_idx * stride_pb + + acc = 0.0 + + for col_start in range(0, D, BLOCK_SIZE): + cols = col_start + tl.arange(0, BLOCK_SIZE) + mask = cols < D + + targ = tl.load(targ_ptr + cols, mask=mask, other=0.0).to(tl.float32) + pred = tl.load(pred_ptr + row_start + cols, mask=mask, other=0.0).to(tl.float32) + + hinge = tl.maximum(1.0 - pred * targ, 0.0) + acc += 
tl.sum(hinge, axis=0) + + tl.store(out_ptr + row_idx, acc.to(tl.float32)) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, predictions, targets): + B, D = predictions.shape + row_sums = torch.empty(B, device=predictions.device, dtype=torch.float32) + + grid = (B,) + _hinge_loss_kernel[grid]( + predictions, + targets, + row_sums, + B, + D, + predictions.stride(0), + ) + + return row_sums.sum() / (B * D) + + +batch_size = 32768 +input_shape = (32768,) +dim = 1 + + +def get_inputs(): + return [ + torch.rand(batch_size, *input_shape), + torch.randint(0, 2, (batch_size,)).float() * 2 - 1, + ] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py b/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py new file mode 100644 index 0000000..b6629b8 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py @@ -0,0 +1,135 @@ +# ruff: noqa: E731, E741 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +def _configs(): + return [ + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, + num_warps=32, + num_stages=2, + ), + ] + + +@triton.autotune(configs=_configs(), key=["M", "N", "K"]) +@triton.jit +def _matmul_kernel( + a_ptr, + b_ptr, + c_ptr, + M, + N, + K, + stride_am: tl.constexpr, + stride_ak: tl.constexpr, + stride_bk: tl.constexpr, + stride_bn: tl.constexpr, + stride_cm: tl.constexpr, + stride_cn: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + pid = tl.program_id(0) + + num_pid_m = tl.cdiv(M, BLOCK_M) + num_pid_n = tl.cdiv(N, BLOCK_N) + num_pid_in_group = 
GROUP_SIZE_M * num_pid_n + + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M) + + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + a_desc = tl.make_tensor_descriptor( + base=a_ptr, + shape=(M, K), + strides=(stride_am, stride_ak), + block_shape=(BLOCK_M, BLOCK_K), + ) + b_desc = tl.make_tensor_descriptor( + base=b_ptr, + shape=(K, N), + strides=(stride_bk, stride_bn), + block_shape=(BLOCK_K, BLOCK_N), + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for off_k in range(0, K, BLOCK_K): + a_tile = a_desc.load([pid_m * BLOCK_M, off_k]) + b_tile = b_desc.load([off_k, pid_n * BLOCK_N]) + acc += tl.dot(a_tile, b_tile) + c_desc = tl.make_tensor_descriptor( + base=c_ptr, + shape=(M, N), + strides=(stride_cm, stride_cn), + block_shape=(BLOCK_M, BLOCK_N), + ) + c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty)) + + +N = 16 +M = 1024 +K = 2048 +L = 768 + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, A, B): + batch, m, k = A.shape + _, l = B.shape + + a = A.to(torch.bfloat16).contiguous() + b = B.to(torch.bfloat16).contiguous() + + a_flat = a.reshape(batch * m, k) + total_m = batch * m + + c_flat = torch.empty((total_m, l), device=a.device, dtype=torch.bfloat16) + + def grid(META): + return ( + triton.cdiv(total_m, META["BLOCK_M"]) * triton.cdiv(l, META["BLOCK_N"]), + ) + + _matmul_kernel[grid]( + a_flat, + b, + c_flat, + total_m, + l, + k, + a_flat.stride(0), + a_flat.stride(1), + b.stride(0), + b.stride(1), + c_flat.stride(0), + c_flat.stride(1), + ) + + return c_flat.reshape(batch, m, l) + + +def get_inputs(): + A = torch.rand(N, M, K, dtype=torch.bfloat16) + B = torch.rand(K, L, dtype=torch.bfloat16) + return [A, B] + + +def get_init_inputs(): + return [] diff --git 
a/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py b/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py new file mode 100644 index 0000000..10f3601 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py @@ -0,0 +1,155 @@ +# ruff: noqa: E731, E741 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.jit +def swizzle_tile( + tile_id, + M, + N, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + grid_m = tl.cdiv(M, BLOCK_M) + grid_n = tl.cdiv(N, BLOCK_N) + width = GROUP_SIZE_M * grid_n + group_id = tile_id // width + group_size = tl.minimum(GROUP_SIZE_M, grid_m - group_id * GROUP_SIZE_M) + pid_m = group_id * GROUP_SIZE_M + ((tile_id % width) % group_size) + pid_n = (tile_id % width) // group_size + return pid_m, pid_n + + +def get_autotune_configs(): + return [ + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64, "GROUP_SIZE_M": 4}, + num_warps=32, + num_stages=2, + ), + ] + + +@triton.autotune( + configs=get_autotune_configs(), + key=["M", "N", "K"], +) +@triton.jit +def _gemm_kernel( + a_ptr, + b_ptr, + c_ptr, + M, + N, + K, + stride_am: tl.constexpr, + stride_ak: tl.constexpr, + stride_bk: tl.constexpr, + stride_bn: tl.constexpr, + stride_cm: tl.constexpr, + stride_cn: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + pid = tl.program_id(0) + pid_m, pid_n = swizzle_tile(pid, M, N, BLOCK_M, BLOCK_N, GROUP_SIZE_M) + + a_desc = tl.make_tensor_descriptor( + base=a_ptr, + shape=(M, K), + strides=(stride_am, stride_ak), + block_shape=(BLOCK_M, BLOCK_K), + ) + b_desc = tl.make_tensor_descriptor( + base=b_ptr, + shape=(K, N), + 
strides=(stride_bk, stride_bn), + block_shape=(BLOCK_K, BLOCK_N), + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for off_k in range(0, K, BLOCK_K): + a_block = a_desc.load([pid_m * BLOCK_M, off_k]) + b_block = b_desc.load([off_k, pid_n * BLOCK_N]) + acc += tl.dot(a_block, b_block) + c_desc = tl.make_tensor_descriptor( + base=c_ptr, + shape=(M, N), + strides=(stride_cm, stride_cn), + block_shape=(BLOCK_M, BLOCK_N), + ) + c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty)) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, A, B): + b_dim, i_dim, j_dim, l_dim = A.shape + k_dim = B.shape[1] + + A_flat = A.contiguous().view(-1, l_dim) + if A_flat.dtype != torch.bfloat16: + A_flat = A_flat.to(torch.bfloat16) + B_fp16 = B.contiguous() + if B_fp16.dtype != torch.bfloat16: + B_fp16 = B_fp16.to(torch.bfloat16) + + M = A_flat.shape[0] + N = k_dim + K = l_dim + + C_2d = torch.empty((M, N), device=A.device, dtype=torch.bfloat16) + + grid = lambda META: ( + triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]), + ) + + _gemm_kernel[grid]( + A_flat, + B_fp16, + C_2d, + M, + N, + K, + A_flat.stride(0), + A_flat.stride(1), + B_fp16.stride(0), + B_fp16.stride(1), + C_2d.stride(0), + C_2d.stride(1), + ) + + result = C_2d.view(b_dim, i_dim, j_dim, k_dim) + if A.dtype != torch.bfloat16: + result = result.to(A.dtype) + return result + + +b = 8 +i = 256 +j = 512 +l = 256 +k = 768 + + +def get_inputs(): + A = torch.rand(b, i, j, l, dtype=torch.bfloat16) + B = torch.rand(l, k, dtype=torch.bfloat16) + return [A, B] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py b/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py new file mode 100644 index 0000000..499ab2a --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py @@ -0,0 +1,95 @@ +# ruff: 
noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_M": 64, "BLOCK_N": 128}, num_warps=32, num_stages=2), + ], + key=["N", "M"], +) +@triton.jit +def _diag_matmul_kernel( + a_ptr, + b_ptr, + c_ptr, + N, + M, + stride_bn, + stride_bm, + stride_cn, + stride_cm, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, +): + pid_n = tl.program_id(0) + pid_m = tl.program_id(1) + + offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + + mask_n = offs_n < N + mask_m = offs_m < M + + a_vals = tl.load(a_ptr + offs_n, mask=mask_n, other=0.0) + + b_ptrs = b_ptr + offs_n[:, None] * stride_bn + offs_m[None, :] * stride_bm + mask = mask_n[:, None] & mask_m[None, :] + b_vals = tl.load(b_ptrs, mask=mask, other=0.0) + + c_vals = a_vals[:, None].to(tl.float32) * b_vals.to(tl.float32) + + c_ptrs = c_ptr + offs_n[:, None] * stride_cn + offs_m[None, :] * stride_cm + tl.store(c_ptrs, c_vals.to(tl.bfloat16), mask=mask) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, A, B): + N = A.shape[0] + M = B.shape[1] + + C = torch.empty((N, M), device=A.device, dtype=A.dtype) + + grid = lambda META: ( + triton.cdiv(N, META["BLOCK_N"]), + triton.cdiv(M, META["BLOCK_M"]), + ) + + _diag_matmul_kernel[grid]( + A, + B, + C, + N, + M, + B.stride(0), + B.stride(1), + C.stride(0), + C.stride(1), + ) + + return C + + +M = 4096 +N = 4096 + + +def get_inputs(): + A = torch.rand(N, dtype=torch.bfloat16) + B = torch.rand(N, M, dtype=torch.bfloat16) + return [A, B] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py 
b/backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py new file mode 100644 index 0000000..13dc6df --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py @@ -0,0 +1,151 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.jit +def swizzle_tile( + tile_id, + M, + N, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + grid_m = tl.cdiv(M, BLOCK_M) + grid_n = tl.cdiv(N, BLOCK_N) + width = GROUP_SIZE_M * grid_n + group_id = tile_id // width + group_size = tl.minimum(GROUP_SIZE_M, grid_m - group_id * GROUP_SIZE_M) + pid_m = group_id * GROUP_SIZE_M + (tile_id % group_size) + pid_n = (tile_id % width) // group_size + return pid_m, pid_n + + +def _configs(): + return [ + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, + num_warps=32, + num_stages=2, + ), + ] + + +@triton.autotune(configs=_configs(), key=["M", "N", "K"]) +@triton.jit +def _matmul_kernel( + a_ptr, + b_ptr, + c_ptr, + M, + N, + K, + stride_am: tl.constexpr, + stride_ak: tl.constexpr, + stride_bk: tl.constexpr, + stride_bn: tl.constexpr, + stride_cm: tl.constexpr, + stride_cn: tl.constexpr, + DIVISIBLE_M: tl.constexpr, + DIVISIBLE_N: tl.constexpr, + DIVISIBLE_K: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + pid = tl.program_id(0) + pid_m, pid_n = swizzle_tile(pid, M, N, BLOCK_M, BLOCK_N, GROUP_SIZE_M) + + a_desc = tl.make_tensor_descriptor( + base=a_ptr, + shape=(M, K), + strides=(stride_am, stride_ak), + block_shape=(BLOCK_M, BLOCK_K), + ) + b_desc = tl.make_tensor_descriptor( + base=b_ptr, + shape=(K, N), + strides=(stride_bk, stride_bn), + 
block_shape=(BLOCK_K, BLOCK_N), + ) + + off_m = pid_m * BLOCK_M + off_n = pid_n * BLOCK_N + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + + for off_k in range(0, K, BLOCK_K): + a_tile = a_desc.load([off_m, off_k]) + b_tile = b_desc.load([off_k, off_n]) + acc += tl.dot(a_tile, b_tile) + c_desc = tl.make_tensor_descriptor( + base=c_ptr, + shape=(M, N), + strides=(stride_cm, stride_cn), + block_shape=(BLOCK_M, BLOCK_N), + ) + c_desc.store([off_m, off_n], acc.to(c_ptr.type.element_ty)) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, A, B): + if A.dtype != torch.bfloat16: + A = A.to(torch.bfloat16) + if B.dtype != torch.bfloat16: + B = B.to(torch.bfloat16) + if not A.is_contiguous(): + A = A.contiguous() + if not B.is_contiguous(): + B = B.contiguous() + + M, K = A.shape + N = B.shape[1] + C = torch.empty((M, N), device=A.device, dtype=torch.bfloat16) + + grid = lambda META: ( + triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]), + ) + _matmul_kernel[grid]( + A, + B, + C, + M, + N, + K, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(1), + C.stride(0), + C.stride(1), + DIVISIBLE_M=(M % 256 == 0), + DIVISIBLE_N=(N % 128 == 0), + DIVISIBLE_K=(K % 32 == 0), + ) + return C + + +N = 4096 + + +def get_inputs(): + A = torch.rand(N, N, dtype=torch.bfloat16) + A = (A + A.T) / 2 + B = torch.rand(N, N, dtype=torch.bfloat16) + B = (B + B.T) / 2 + return [A, B] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py b/backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py new file mode 100644 index 0000000..184f008 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py @@ -0,0 +1,153 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: 
Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.jit +def swizzle_tile( + tile_id, + M, + N, + K, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + grid_m = tl.cdiv(M, BLOCK_M) + grid_n = tl.cdiv(N, BLOCK_N) + width = GROUP_SIZE_M * grid_n + group_id = tile_id // width + group_size = tl.minimum(GROUP_SIZE_M, grid_m - group_id * GROUP_SIZE_M) + pid_m = group_id * GROUP_SIZE_M + (tile_id % group_size) + pid_n = (tile_id % width) // group_size + return pid_m, pid_n + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, + num_warps=32, + num_stages=2, + ), + ], + key=["M", "N", "K"], +) +@triton.jit +def _triu_matmul_kernel( + a_ptr, + b_ptr, + c_ptr, + M, + N, + K, + stride_am: tl.constexpr, + stride_ak: tl.constexpr, + stride_bk: tl.constexpr, + stride_bn: tl.constexpr, + stride_cm: tl.constexpr, + stride_cn: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + pid = tl.program_id(0) + pid_m, pid_n = swizzle_tile(pid, M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, GROUP_SIZE_M) + + off_m = pid_m * BLOCK_M + off_n = pid_n * BLOCK_N + + # Skip tiles entirely below the diagonal (output is upper triangular) + if off_m >= off_n + BLOCK_N: + return + + # K-loop trimming: A[i,k]=0 for k < i and B[k,j]=0 for k > j (both upper tri) + # Effective K range: [off_m, min(off_n + BLOCK_N, K)) + k_start = (off_m // BLOCK_K) * BLOCK_K + k_end_raw = off_n + BLOCK_N + k_end = k_end_raw if k_end_raw < K else K + + a_desc = tl.make_tensor_descriptor( + base=a_ptr, + shape=(M, K), + strides=(stride_am, stride_ak), + block_shape=(BLOCK_M, BLOCK_K), + ) + b_desc = tl.make_tensor_descriptor( + base=b_ptr, + shape=(K, N), + strides=(stride_bk, stride_bn), + block_shape=(BLOCK_K, BLOCK_N), + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N),
dtype=tl.float32) + + for k_offset in range(k_start, k_end, BLOCK_K): + a_block = a_desc.load([off_m, k_offset]) + b_block = b_desc.load([k_offset, off_n]) + acc += tl.dot(a_block, b_block) + # Apply triu mask + row_idx = off_m + tl.arange(0, BLOCK_M) + col_idx = off_n + tl.arange(0, BLOCK_N) + triu_mask = row_idx[:, None] <= col_idx[None, :] + acc = tl.where(triu_mask, acc, 0.0) + + c_desc = tl.make_tensor_descriptor( + base=c_ptr, + shape=(M, N), + strides=(stride_cm, stride_cn), + block_shape=(BLOCK_M, BLOCK_N), + ) + c_desc.store([off_m, off_n], acc.to(c_ptr.type.element_ty)) + + +class Model(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, A, B): + M, K = A.shape + N = B.shape[1] + C = torch.zeros((M, N), device=A.device, dtype=A.dtype) + + grid = lambda META: ( + triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]), + ) + + _triu_matmul_kernel[grid]( + A, + B, + C, + M, + N, + K, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(1), + C.stride(0), + C.stride(1), + ) + return C + + +N = 4096 + + +def get_inputs(): + A = torch.triu(torch.rand(N, N, dtype=torch.bfloat16)) + B = torch.triu(torch.rand(N, N, dtype=torch.bfloat16)) + return [A, B] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py b/backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py new file mode 100644 index 0000000..8b755b7 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py @@ -0,0 +1,146 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.jit +def swizzle_tile( + tile_id, + M, + N, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + 
GROUP_SIZE_M: tl.constexpr, +): + grid_m = tl.cdiv(M, BLOCK_M) + grid_n = tl.cdiv(N, BLOCK_N) + width = GROUP_SIZE_M * grid_n + group_id = tile_id // width + group_size = tl.minimum(GROUP_SIZE_M, grid_m - group_id * GROUP_SIZE_M) + pid_m = group_id * GROUP_SIZE_M + (tile_id % group_size) + pid_n = (tile_id % width) // group_size + return pid_m, pid_n + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, + num_warps=32, + num_stages=2, + ), + ], + key=["M"], +) +@triton.jit +def tril_matmul_kernel( + a_ptr, + b_ptr, + c_ptr, + M, + stride_am: tl.constexpr, + stride_ak: tl.constexpr, + stride_bk: tl.constexpr, + stride_bn: tl.constexpr, + stride_cm, + stride_cn, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + pid = tl.program_id(0) + pid_m, pid_n = swizzle_tile(pid, M, M, BLOCK_M, BLOCK_N, GROUP_SIZE_M) + + off_m = pid_m * BLOCK_M + off_n = pid_n * BLOCK_N + + # Skip tiles entirely in the upper triangle + if off_n > off_m + BLOCK_M - 1: + return + + # K-range optimization for triangular matrices: + # A is lower triangular: A[i,k]=0 for k>i, so max useful K = off_m + BLOCK_M + # B is lower triangular: B[k,j]=0 for j>k, so min useful K = off_n + # Align to BLOCK_K boundaries + k_start = (off_n // BLOCK_K) * BLOCK_K + k_end_raw = off_m + BLOCK_M + k_end = tl.minimum(k_end_raw, M) + + a_desc = tl.make_tensor_descriptor( + base=a_ptr, + shape=(M, M), + strides=(stride_am, stride_ak), + block_shape=(BLOCK_M, BLOCK_K), + ) + b_desc = tl.make_tensor_descriptor( + base=b_ptr, + shape=(M, M), + strides=(stride_bk, stride_bn), + block_shape=(BLOCK_K, BLOCK_N), + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + + for k in range(k_start, k_end, BLOCK_K): + a_tile = a_desc.load([off_m, k]) + b_tile = b_desc.load([k, off_n]) + acc += tl.dot(a_tile, b_tile) + # Apply tril mask + row_idx = off_m + tl.arange(0, BLOCK_M) + col_idx = off_n + 
tl.arange(0, BLOCK_N) + tril_mask = row_idx[:, None] >= col_idx[None, :] + acc = tl.where(tril_mask, acc, 0.0) + + # Store using raw pointers + bounds_mask = (row_idx[:, None] < M) & (col_idx[None, :] < M) + c_ptrs = c_ptr + row_idx[:, None] * stride_cm + col_idx[None, :] * stride_cn + tl.store(c_ptrs, acc.to(c_ptr.type.element_ty), mask=bounds_mask & tril_mask) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, A, B): + M = A.shape[0] + C = torch.zeros(M, M, device=A.device, dtype=A.dtype) + + grid = lambda META: ( + triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(M, META["BLOCK_N"]), + ) + tril_matmul_kernel[grid]( + A, + B, + C, + M, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(1), + C.stride(0), + C.stride(1), + ) + return C + + +M = 4096 + + +def get_inputs(): + A = torch.rand(M, M, dtype=torch.bfloat16) + B = torch.rand(M, M, dtype=torch.bfloat16) + A = torch.tril(A) + B = torch.tril(B) + return [A, B] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py b/backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py new file mode 100644 index 0000000..1df1ce2 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py @@ -0,0 +1,132 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +def get_autotune_configs(): + return [ + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, + num_warps=32, + num_stages=2, + ), + ] + + +@triton.autotune( + configs=get_autotune_configs(), + key=["M", "N", "K"], +) +@triton.jit +def _matmul_at_kernel( + A_ptr, + B_ptr, + C_ptr, + M, + N, + K, + stride_ak: tl.constexpr, + stride_am: 
tl.constexpr, + stride_bk: tl.constexpr, + stride_bn: tl.constexpr, + stride_cm: tl.constexpr, + stride_cn: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + pid = tl.program_id(0) + + num_pid_m = tl.cdiv(M, BLOCK_M) + num_pid_n = tl.cdiv(N, BLOCK_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M) + + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + # A is [K, M] — load [BLOCK_K, BLOCK_M] tiles, transpose in register + A_desc = tl.make_tensor_descriptor( + base=A_ptr, + shape=(K, M), + strides=(stride_ak, stride_am), + block_shape=(BLOCK_K, BLOCK_M), + ) + + # B is [K, N] — load [BLOCK_K, BLOCK_N] tiles + B_desc = tl.make_tensor_descriptor( + base=B_ptr, + shape=(K, N), + strides=(stride_bk, stride_bn), + block_shape=(BLOCK_K, BLOCK_N), + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for off_k in range(0, K, BLOCK_K): + A_tile = A_desc.load([off_k, pid_m * BLOCK_M]) + B_tile = B_desc.load([off_k, pid_n * BLOCK_N]) + acc += tl.dot(A_tile.T, B_tile) + C_desc = tl.make_tensor_descriptor( + base=C_ptr, + shape=(M, N), + strides=(stride_cm, stride_cn), + block_shape=(BLOCK_M, BLOCK_N), + ) + C_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(C_ptr.type.element_ty)) + + +class Model(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: + K, M = A.shape + _, N = B.shape + A = A.contiguous() + B = B.contiguous() + C = torch.empty((M, N), device=A.device, dtype=A.dtype) + + def grid(META): + return (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),) + + _matmul_at_kernel[grid]( + A, + B, + C, + M, + N, + K, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(1), + C.stride(0), 
+ C.stride(1), + ) + return C + + +M = 1024 * 2 +K = 4096 * 2 +N = 2048 * 2 + + +def get_inputs(): + A = torch.rand(K, M, dtype=torch.bfloat16) + B = torch.rand(K, N, dtype=torch.bfloat16) + return [A, B] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py b/backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py new file mode 100644 index 0000000..c7b9f5f --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py @@ -0,0 +1,137 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +def get_autotune_configs(): + return [ + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, + num_warps=32, + num_stages=2, + ), + ] + + +@triton.autotune( + configs=get_autotune_configs(), + key=["M", "N", "K"], +) +@triton.jit +def _matmul_bt_kernel( + A_ptr, + B_ptr, + C_ptr, + M, + N, + K, + stride_am: tl.constexpr, + stride_ak: tl.constexpr, + stride_bn: tl.constexpr, + stride_bk: tl.constexpr, + stride_cm: tl.constexpr, + stride_cn: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + pid = tl.program_id(0) + + num_pid_m = tl.cdiv(M, BLOCK_M) + num_pid_n = tl.cdiv(N, BLOCK_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M) + + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + # A is [M, K] — load [BLOCK_M, BLOCK_K] tiles in natural layout + a_desc = tl.make_tensor_descriptor( + A_ptr, + shape=[M, K], + 
strides=[stride_am, stride_ak], + block_shape=[BLOCK_M, BLOCK_K], + ) + + # B is [N, K] — load [BLOCK_N, BLOCK_K] tiles in natural layout, then transpose + b_desc = tl.make_tensor_descriptor( + B_ptr, + shape=[N, K], + strides=[stride_bn, stride_bk], + block_shape=[BLOCK_N, BLOCK_K], + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for off_k in range(0, K, BLOCK_K): + A_tile = a_desc.load([pid_m * BLOCK_M, off_k]) + B_tile = b_desc.load([pid_n * BLOCK_N, off_k]) + + # Transpose B in-register: [BLOCK_N, BLOCK_K] -> [BLOCK_K, BLOCK_N] + acc += tl.dot(A_tile, B_tile.T) + c_desc = tl.make_tensor_descriptor( + C_ptr, + shape=[M, N], + strides=[stride_cm, stride_cn], + block_shape=[BLOCK_M, BLOCK_N], + ) + c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(C_ptr.type.element_ty)) + + +class Model(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: + M, K = A.shape + N, _ = B.shape + + A = A.contiguous() + B = B.contiguous() + + C = torch.empty((M, N), device=A.device, dtype=A.dtype) + + def grid(META): + return (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),) + + _matmul_bt_kernel[grid]( + A, + B, + C, + M, + N, + K, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(1), + C.stride(0), + C.stride(1), + ) + + return C + + +M = 1024 * 2 +K = 4096 * 2 +N = 2048 * 2 + + +def get_inputs(): + A = torch.rand(M, K, dtype=torch.bfloat16) + B = torch.rand(N, K, dtype=torch.bfloat16) + return [A, B] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py b/backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py new file mode 100644 index 0000000..54c94c6 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py @@ -0,0 +1,133 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: 
Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +def _configs(): + return [ + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, + num_warps=32, + num_stages=2, + ), + ] + + +@triton.autotune(configs=_configs(), key=["M", "N", "K"]) +@triton.jit +def _matmul_tt_kernel( + A_ptr, + B_ptr, + C_ptr, + M, + N, + K, + stride_ak: tl.constexpr, + stride_am: tl.constexpr, + stride_bn: tl.constexpr, + stride_bk: tl.constexpr, + stride_cm: tl.constexpr, + stride_cn: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + pid = tl.program_id(0) + + num_pid_m = tl.cdiv(M, BLOCK_M) + num_pid_n = tl.cdiv(N, BLOCK_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M) + + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + # A is (K, M): load (BLOCK_K, BLOCK_M), transpose in-register + a_desc = tl.make_tensor_descriptor( + A_ptr, + shape=[K, M], + strides=[stride_ak, stride_am], + block_shape=[BLOCK_K, BLOCK_M], + ) + + # B is (N, K): load (BLOCK_N, BLOCK_K), transpose in-register + b_desc = tl.make_tensor_descriptor( + B_ptr, + shape=[N, K], + strides=[stride_bn, stride_bk], + block_shape=[BLOCK_N, BLOCK_K], + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for off_k in range(0, K, BLOCK_K): + a = a_desc.load([off_k, pid_m * BLOCK_M]) + b = b_desc.load([pid_n * BLOCK_N, off_k]) + + acc += tl.dot(a.T, b.T) + c_desc = tl.make_tensor_descriptor( + C_ptr, + shape=[M, N], + strides=[stride_cm, stride_cn], + block_shape=[BLOCK_M, BLOCK_N], + ) + c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(C_ptr.type.element_ty)) + + +class 
Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: + K, M = A.shape + N, _ = B.shape + + A = A.contiguous() + B = B.contiguous() + + C = torch.empty((M, N), device=A.device, dtype=A.dtype) + + def grid(META): + return (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),) + + _matmul_tt_kernel[grid]( + A, + B, + C, + M, + N, + K, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(1), + C.stride(0), + C.stride(1), + ) + + return C + + +M = 1024 * 2 +K = 4096 * 2 +N = 2048 * 2 + + +def get_inputs(): + A = torch.rand(K, M, dtype=torch.bfloat16) + B = torch.rand(N, K, dtype=torch.bfloat16) + return [A, B] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/19_ReLU.py b/backends/triton/cpu/KernelBench/level1/19_ReLU.py new file mode 100644 index 0000000..1eed9d7 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/19_ReLU.py @@ -0,0 +1,63 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_SIZE": 4096, "NUM_PROGRAMS": 256}, num_warps=8, num_stages=3 + ), + ], + key=["n_elements"], +) +@triton.jit +def relu_kernel_persistent( + x_ptr, + output_ptr, + n_elements, + BLOCK_SIZE: tl.constexpr, + NUM_PROGRAMS: tl.constexpr, +): + pid = tl.program_id(0) + num_blocks = tl.cdiv(n_elements, BLOCK_SIZE) + + for block_id in tl.range(pid, num_blocks, NUM_PROGRAMS): + block_start = block_id * BLOCK_SIZE + offsets = block_start + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + x = tl.load(x_ptr + offsets, mask=mask) + output = tl.maximum(x, 0.0) + tl.store(output_ptr + offsets, output, mask=mask) + + +class Model(nn.Module): + 
def __init__(self): + super(Model, self).__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + output = torch.empty_like(x) + n_elements = x.numel() + grid = lambda META: (META["NUM_PROGRAMS"],) + relu_kernel_persistent[grid](x, output, n_elements) + return output + + +batch_size = 4096 +dim = 393216 + + +def get_inputs(): + x = torch.rand(batch_size, dim, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/1_Square_matrix_multiplication_.py b/backends/triton/cpu/KernelBench/level1/1_Square_matrix_multiplication_.py index 1582894..09c940e 100644 --- a/backends/triton/cpu/KernelBench/level1/1_Square_matrix_multiplication_.py +++ b/backends/triton/cpu/KernelBench/level1/1_Square_matrix_multiplication_.py @@ -1,5 +1,6 @@ # ruff: noqa: E731 -# Example Triton CPU kernel +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation # Status: Experimental / uncurated # Expectation: Correctness-first, performance not representative @@ -9,13 +10,17 @@ import triton.language as tl -@triton.autotune( - configs=[ - triton.Config({"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32}), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64}), - ], - key=["M", "N", "K"], # autotune per problem size -) +def _configs(): + return [ + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, + num_warps=32, + num_stages=2, + ), + ] + + +@triton.autotune(configs=_configs(), key=["M", "N", "K"]) @triton.jit def _matmul_kernel( a_ptr, @@ -24,70 +29,95 @@ def _matmul_kernel( M, N, K, + stride_am, + stride_ak: tl.constexpr, + stride_bk, + stride_bn: tl.constexpr, + stride_cm, + stride_cn: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, ): + pid = tl.program_id(0) + num_pid_m = tl.cdiv(M, BLOCK_M) + num_pid_n = tl.cdiv(N, BLOCK_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // 
num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + a_desc = tl.make_tensor_descriptor( - base=a_ptr, shape=(M, K), strides=(K, 1), block_shape=(BLOCK_M, BLOCK_K) + base=a_ptr, + shape=(M, K), + strides=(stride_am, stride_ak), + block_shape=(BLOCK_M, BLOCK_K), ) b_desc = tl.make_tensor_descriptor( - base=b_ptr, shape=(K, N), strides=(N, 1), block_shape=(BLOCK_K, BLOCK_N) - ) - c_desc = tl.make_tensor_descriptor( - base=c_ptr, shape=(M, N), strides=(N, 1), block_shape=(BLOCK_M, BLOCK_N) + base=b_ptr, + shape=(K, N), + strides=(stride_bk, stride_bn), + block_shape=(BLOCK_K, BLOCK_N), ) - m = tl.program_id(0) * BLOCK_M - n = tl.program_id(1) * BLOCK_N acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) - for k in range(0, K, BLOCK_K): - a = a_desc.load((m, k)) - b = b_desc.load((k, n)) - acc = tl.dot(a, b, acc) + for off_k in range(0, K, BLOCK_K): + a_tile = a_desc.load([pid_m * BLOCK_M, off_k]) + b_tile = b_desc.load([off_k, pid_n * BLOCK_N]) + acc += tl.dot(a_tile, b_tile) + c_desc = tl.make_tensor_descriptor( + base=c_ptr, + shape=(M, N), + strides=(stride_cm, stride_cn), + block_shape=(BLOCK_M, BLOCK_N), + ) + c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty)) - c_desc.store((m, n), acc) +class Model(nn.Module): + def __init__(self): + super().__init__() -def _kernel_function_cpu(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: - assert isinstance(A, torch.Tensor) and isinstance(B, torch.Tensor) - assert A.device.type == "cpu" and B.device.type == "cpu", "A and B must be on CPU" - assert A.is_floating_point() and B.is_floating_point(), ( - "A and B must be floating point tensors" - ) - assert A.dtype == B.dtype, f"dtype mismatch: {A.dtype} vs {B.dtype}" + def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: + M, K = A.shape + K2, 
N = B.shape - orig_dtype = A.dtype + A = A.contiguous() + B = B.contiguous() - M, K = A.shape - K2, N = B.shape - assert K == K2, f"Incompatible K dimensions: {K} vs {K2}" + C = torch.empty((M, N), device=A.device, dtype=A.dtype) - C32 = torch.empty((M, N), device=A.device, dtype=torch.float32) + def grid(META): + return (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),) - # Autotuned grid: depends on BLOCK_M/BLOCK_N chosen by config - grid = lambda META: ( - triton.cdiv(M, META["BLOCK_M"]), - triton.cdiv(N, META["BLOCK_N"]), - ) + _matmul_kernel[grid]( + A, + B, + C, + M, + N, + K, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(1), + C.stride(0), + C.stride(1), + ) + return C - _matmul_kernel[grid]( - A, - B, - C32, - M, - N, - K, - ) - return C32.to(orig_dtype) +N = 2048 * 2 -class Model(nn.Module): - """KernelBench-compatible wrapper""" +def get_inputs(): + A = torch.rand(N, N, dtype=torch.bfloat16) + B = torch.rand(N, N, dtype=torch.bfloat16) + return [A, B] - def __init__(self, *args, **kwargs): - super(Model, self).__init__() - def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: - return _kernel_function_cpu(A, B) +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py b/backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py new file mode 100644 index 0000000..8002e4f --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py @@ -0,0 +1,69 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + ], + key=["n_elements"], +) +@triton.jit +def leaky_relu_kernel( + x_ptr, + output_ptr, + neg_slope_ptr, + n_elements, + 
BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + + neg_slope = tl.load(neg_slope_ptr) + + x = tl.load(x_ptr + offsets, mask=mask) + out = tl.where(x >= 0, x, x * neg_slope) + tl.store(output_ptr + offsets, out, mask=mask) + + +class Model(nn.Module): + def __init__(self, negative_slope: float = 0.01): + super(Model, self).__init__() + self.negative_slope = negative_slope + + def forward(self, x: torch.Tensor) -> torch.Tensor: + output = torch.empty_like(x) + n_elements = x.numel() + neg_slope_t = torch.tensor( + [self.negative_slope], dtype=x.dtype, device=x.device + ) + grid = lambda META: (triton.cdiv(n_elements, META["BLOCK_SIZE"]),) + leaky_relu_kernel[grid]( + x, + output, + neg_slope_t, + n_elements, + ) + return output + + +batch_size = 4096 +dim = 393216 + + +def get_inputs(): + x = torch.rand(batch_size, dim, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/21_Sigmoid.py b/backends/triton/cpu/KernelBench/level1/21_Sigmoid.py new file mode 100644 index 0000000..7240e65 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/21_Sigmoid.py @@ -0,0 +1,64 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + ], + key=["N"], +) +@triton.jit +def _sigmoid_kernel( + x_ptr, + out_ptr, + N, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offsets < N + + x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + + inv_ln2 = 1.4426950408889634 + e = tl.math.exp2((-x) * inv_ln2) + y = 
1.0 / (1.0 + e) + # Store in the output buffer's element type, not a fixed bfloat16. + tl.store(out_ptr + offsets, y.to(out_ptr.type.element_ty), mask=mask) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_flat = x.contiguous().view(-1) + N = x_flat.numel() + out_flat = torch.empty_like(x_flat) + + grid = lambda META: (triton.cdiv(N, META["BLOCK_SIZE"]),) + _sigmoid_kernel[grid](x_flat, out_flat, N) + + return out_flat.view_as(x) + + +batch_size = 4096 +dim = 393216 + + +def get_inputs(): + x = torch.rand(batch_size, dim, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/22_Tanh.py b/backends/triton/cpu/KernelBench/level1/22_Tanh.py new file mode 100644 index 0000000..f5f6074 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/22_Tanh.py @@ -0,0 +1,67 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + +batch_size = 4096 +dim = 393216 + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + ], + key=["n_elements"], +) +@triton.jit +def _tanh_kernel( + x_ptr, + out_ptr, + n_elements, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + + x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + + # tanh(x) = 2*sigmoid(2x) - 1 + # sigmoid(z) = 1/(1 + exp2(-z * log2(e))) + inv_ln2: tl.constexpr = 1.4426950408889634 + z = 2.0 * x + e = tl.math.exp2((-z) * inv_ln2) + sig = 1.0 / (1.0 + e) + result = 2.0 * sig - 1.0 + # Store in the output buffer's element type, not a fixed bfloat16. + tl.store(out_ptr + offsets, result.to(out_ptr.type.element_ty), mask=mask) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x:
torch.Tensor) -> torch.Tensor: + x_flat = x.contiguous().view(-1) + n_elements = x_flat.numel() + output_flat = torch.empty_like(x_flat) + + grid = lambda META: (triton.cdiv(n_elements, META["BLOCK_SIZE"]),) + _tanh_kernel[grid](x_flat, output_flat, n_elements) + + return output_flat.view(x.shape) + + +def get_inputs(): + x = torch.rand(batch_size, dim, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/23_Softmax.py b/backends/triton/cpu/KernelBench/level1/23_Softmax.py new file mode 100644 index 0000000..5e7ba1b --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/23_Softmax.py @@ -0,0 +1,101 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +def _softmax_configs(): + return [ + triton.Config({"BLOCK_N": 2048}, num_warps=8, num_stages=3), + ] + + +@triton.autotune(configs=_softmax_configs(), key=["N"]) +@triton.jit +def _softmax_kernel( + inp_ptr, + out_ptr, + M, + N, + stride_im, + stride_in, + stride_om, + stride_on, + BLOCK_N: tl.constexpr, +): + pid_m = tl.program_id(0) + row_inp = inp_ptr + pid_m.to(tl.int64) * stride_im # 64-bit row offset + row_out = out_ptr + pid_m.to(tl.int64) * stride_om # 64-bit row offset + + LOG2E: tl.constexpr = 1.4426950408889634 + + # Pass 1: Online max + sum_exp + row_max = -float("inf") + row_sum = 0.0 + for start in range(0, N, BLOCK_N): + offs = start + tl.arange(0, BLOCK_N) + mask = offs < N + x = tl.load(row_inp + offs * stride_in, mask=mask, other=-float("inf")).to( + tl.float32 + ) + block_max = tl.max(x, axis=0) + new_max = tl.maximum(row_max, block_max) + row_sum = row_sum * tl.math.exp2((row_max - new_max) * LOG2E) + tl.sum( + tl.math.exp2((x - new_max) * LOG2E), axis=0 + ) + row_max = new_max + + inv_sum = 1.0 / row_sum + + # Pass 2: normalize and store + for start
in range(0, N, BLOCK_N): + offs = start + tl.arange(0, BLOCK_N) + mask = offs < N + x = tl.load(row_inp + offs * stride_in, mask=mask, other=-float("inf")).to( + tl.float32 + ) + e = tl.math.exp2((x - row_max) * LOG2E) + y = (e * inv_sum).to(out_ptr.type.element_ty) # match output dtype + tl.store(row_out + offs * stride_on, y, mask=mask) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.contiguous() + M, N = x.shape + out = torch.empty_like(x) + + grid = (M,) + _softmax_kernel[grid]( + x, + out, + M, + N, + x.stride(0), + x.stride(1), + out.stride(0), + out.stride(1), + ) + return out + + +batch_size = 4096 +dim = 393216 + + +def get_inputs(): + x = torch.rand(batch_size, dim, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py b/backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py new file mode 100644 index 0000000..c5e5f35 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py @@ -0,0 +1,94 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_N": 2048, "warp_size": 32}, num_warps=4, num_stages=2), + ], + key=["N"], +) +@triton.jit +def _logsoftmax_kernel( + inp_ptr, + out_ptr, + M, + N, + stride_im, + stride_om, + BLOCK_N: tl.constexpr, + warp_size: tl.constexpr, +): + pid_m = tl.program_id(0) + row_inp = inp_ptr + pid_m.to(tl.int64) * stride_im + row_out = out_ptr + pid_m.to(tl.int64) * stride_om + + LOG2E = 1.4426950408889634 + LN2 = 0.6931471805599453 + + m = -float("inf") + s = 0.0 + + for start in range(0, N, BLOCK_N): + offs = start + tl.arange(0, BLOCK_N) + mask = offs < N + x =
tl.load(row_inp + offs, mask=mask, other=-float("inf")).to(tl.float32) + block_max = tl.max(x, axis=0) + m_new = tl.maximum(m, block_max) + s = s * tl.math.exp2((m - m_new) * LOG2E) + tl.sum( + tl.math.exp2((x - m_new) * LOG2E), axis=0 + ) + m = m_new + + log_s = tl.math.log2(s) * LN2 + + for start in range(0, N, BLOCK_N): + offs = start + tl.arange(0, BLOCK_N) + mask = offs < N + x = tl.load(row_inp + offs, mask=mask, other=-float("inf")).to(tl.float32) + y = x - m - log_s + tl.store(row_out + offs, y.to(tl.bfloat16), mask=mask) + + +class Model(nn.Module): + def __init__(self, dim: int = 1): + super(Model, self).__init__() + self.dim = dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.contiguous() + + M, N = x.shape + out = torch.empty_like(x) + + grid = (M,) + _logsoftmax_kernel[grid]( + x, + out, + M, + N, + x.stride(0), + out.stride(0), + ) + return out + + +batch_size = 4096 +dim = 393216 + + +def get_inputs(): + x = torch.rand(batch_size, dim, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/25_Swish.py b/backends/triton/cpu/KernelBench/level1/25_Swish.py new file mode 100644 index 0000000..0a682dd --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/25_Swish.py @@ -0,0 +1,65 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.jit +def _sigmoid_exp2(x): + inv_ln2 = 1.4426950408889634 + e = tl.math.exp2((-x) * inv_ln2) + return 1.0 / (1.0 + e) + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + ], + key=["n_elements"], +) +@triton.jit +def swish_kernel( + x_ptr, + output_ptr, + n_elements, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(0) + offsets = pid * 
BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + x = tl.load(x_ptr + offsets, mask=mask, other=0.0) + x_f32 = x.to(tl.float32) + sig = _sigmoid_exp2(x_f32) + result = x_f32 * sig + tl.store(output_ptr + offsets, result.to(tl.bfloat16), mask=mask) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + output = torch.empty_like(x) + n_elements = x.numel() + grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) + swish_kernel[grid](x, output, n_elements) + return output + + +batch_size = 4096 +dim = 393216 + + +def get_inputs(): + x = torch.rand(batch_size, dim, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/26_GELU_.py b/backends/triton/cpu/KernelBench/level1/26_GELU_.py new file mode 100644 index 0000000..0d52473 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/26_GELU_.py @@ -0,0 +1,64 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_SIZE": 4096, "NUM_PROGS": 160}, num_warps=4, num_stages=2 + ), + ], + key=["n_elements"], +) +@triton.jit +def gelu_persistent_kernel( + x_ptr, + out_ptr, + n_elements, + BLOCK_SIZE: tl.constexpr, + NUM_PROGS: tl.constexpr, +): + pid = tl.program_id(0) + num_tiles = tl.cdiv(n_elements, BLOCK_SIZE) + + for tile_id in range(pid, num_tiles, NUM_PROGS): + offsets = tile_id * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + + x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + out = 0.5 * x * (1.0 + tl.math.erf(x * 0.70710678118654752440)) + tl.store(out_ptr + offsets, out.to(tl.bfloat16), mask=mask) + 
+ +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_flat = x.contiguous().view(-1) + out_flat = torch.empty_like(x_flat) + n_elements = x_flat.numel() + grid = lambda META: (META["NUM_PROGS"],) + gelu_persistent_kernel[grid](x_flat, out_flat, n_elements) + return out_flat.view_as(x) + + +batch_size = 4096 +dim = 393216 + + +def get_inputs(): + x = torch.rand(batch_size, dim, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/27_SELU_.py b/backends/triton/cpu/KernelBench/level1/27_SELU_.py new file mode 100644 index 0000000..1d8a3a7 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/27_SELU_.py @@ -0,0 +1,64 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=3), + ], + key=["n_elements"], +) +@triton.jit +def selu_kernel( + x_ptr, + out_ptr, + n_elements, + alpha, + scale, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + + x = tl.load(x_ptr + offsets, mask=mask, other=0.0) + x_f32 = x.to(tl.float32) + + result = tl.where(x_f32 > 0.0, scale * x_f32, scale * alpha * (tl.exp(x_f32) - 1.0)) + + tl.store(out_ptr + offsets, result.to(tl.bfloat16), mask=mask) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + self.alpha = 1.6732632423543772848170429916717 + self.scale = 1.0507009873554804934193349852946 + + def forward(self, x: torch.Tensor) -> torch.Tensor: + n_elements = x.numel() + output = torch.empty_like(x) + grid = lambda META: (triton.cdiv(n_elements,
META["BLOCK_SIZE"]),) + selu_kernel[grid](x, output, n_elements, self.alpha, self.scale) + return output + + +batch_size = 4096 +dim = 393216 + + +def get_inputs(): + x = torch.rand(batch_size, dim, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py b/backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py new file mode 100644 index 0000000..6bde4bb --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py @@ -0,0 +1,62 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + ], + key=["N"], +) +@triton.jit +def hardsigmoid_kernel( + x_ptr, + out_ptr, + N, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offsets < N + + x = tl.load(x_ptr + offsets, mask=mask, other=0.0) + x_f32 = x.to(tl.float32) + + result = x_f32 * (1.0 / 6.0) + 0.5 + result = tl.maximum(result, 0.0) + result = tl.minimum(result, 1.0) + + tl.store(out_ptr + offsets, result.to(x.dtype), mask=mask) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + N = x.numel() + grid = lambda META: (triton.cdiv(N, META["BLOCK_SIZE"]),) + hardsigmoid_kernel[grid](x, out, N) + return out + + +batch_size = 4096 +dim = 393216 + + +def get_inputs(): + x = torch.rand(batch_size, dim, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/29_Softplus.py b/backends/triton/cpu/KernelBench/level1/29_Softplus.py new file mode 
100644 index 0000000..d561bab --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/29_Softplus.py @@ -0,0 +1,61 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + ], + key=["n_elements"], +) +@triton.jit +def softplus_kernel( + x_ptr, + output_ptr, + n_elements, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + + x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + + # softplus(x) = log(1 + exp(x)), with threshold for numerical stability + THRESHOLD: tl.constexpr = 20.0 + result = tl.where(x > THRESHOLD, x, tl.math.log(1.0 + tl.exp(x))) + + tl.store(output_ptr + offsets, result.to(tl.bfloat16), mask=mask) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + output = torch.empty_like(x) + n_elements = x.numel() + grid = lambda META: (triton.cdiv(n_elements, META["BLOCK_SIZE"]),) + softplus_kernel[grid](x, output, n_elements) + return output + + +batch_size = 4096 +dim = 393216 + + +def get_inputs(): + x = torch.rand(batch_size, dim, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py b/backends/triton/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py new file mode 100644 index 0000000..761dc6e --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py @@ -0,0 +1,124 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# 
Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +def _configs(): + return [ + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, + num_warps=32, + num_stages=2, + ), + ] + + +@triton.autotune(configs=_configs(), key=["M", "N", "K"]) +@triton.jit +def _matmul_kernel( + a_ptr, + b_ptr, + c_ptr, + M, + N, + K, + stride_am: tl.constexpr, + stride_ak: tl.constexpr, + stride_bk: tl.constexpr, + stride_bn: tl.constexpr, + stride_cm: tl.constexpr, + stride_cn: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + pid = tl.program_id(0) + + num_pid_m = tl.cdiv(M, BLOCK_M) + num_pid_n = tl.cdiv(N, BLOCK_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + a_desc = tl.make_tensor_descriptor( + base=a_ptr, + shape=(M, K), + strides=(stride_am, stride_ak), + block_shape=(BLOCK_M, BLOCK_K), + ) + b_desc = tl.make_tensor_descriptor( + base=b_ptr, + shape=(K, N), + strides=(stride_bk, stride_bn), + block_shape=(BLOCK_K, BLOCK_N), + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for off_k in range(0, K, BLOCK_K): + a_tile = a_desc.load([pid_m * BLOCK_M, off_k]) + b_tile = b_desc.load([off_k, pid_n * BLOCK_N]) + acc += tl.dot(a_tile, b_tile) + c_desc = tl.make_tensor_descriptor( + base=c_ptr, + shape=(M, N), + strides=(stride_cm, stride_cn), + block_shape=(BLOCK_M, BLOCK_N), + ) + c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty)) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, A: 
torch.Tensor, B: torch.Tensor) -> torch.Tensor: + A_fp16 = A.to(torch.bfloat16).contiguous() + B_fp16 = B.to(torch.bfloat16).contiguous() + M, K = A_fp16.shape + N = B_fp16.shape[1] + C = torch.empty((M, N), device=A.device, dtype=torch.bfloat16) + + grid = lambda META: ( + triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]), + ) + _matmul_kernel[grid]( + A_fp16, + B_fp16, + C, + M, + N, + K, + A_fp16.stride(0), + A_fp16.stride(1), + B_fp16.stride(0), + B_fp16.stride(1), + C.stride(0), + C.stride(1), + ) + return C + + +M = 1024 * 2 +K = 4096 * 2 +N = 2048 * 2 + + +def get_inputs(): + A = torch.rand(M, K, dtype=torch.bfloat16) + B = torch.rand(K, N, dtype=torch.bfloat16) + return [A, B] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/30_Softsign.py b/backends/triton/cpu/KernelBench/level1/30_Softsign.py new file mode 100644 index 0000000..6290f9a --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/30_Softsign.py @@ -0,0 +1,58 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + ], + key=["n_elements"], +) +@triton.jit +def softsign_kernel( + x_ptr, + output_ptr, + n_elements, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + x = tl.load(x_ptr + offsets, mask=mask) + x_f32 = x.to(tl.float32) + abs_x = tl.abs(x_f32) + result = x_f32 / (1.0 + abs_x) + tl.store(output_ptr + offsets, result.to(x.dtype), mask=mask) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + output = torch.empty_like(x) 
+ n_elements = x.numel() + grid = lambda META: (triton.cdiv(n_elements, META["BLOCK_SIZE"]),) + softsign_kernel[grid](x, output, n_elements) + return output + + +batch_size = 4096 +dim = 393216 + + +def get_inputs(): + x = torch.rand(batch_size, dim, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/31_ELU.py b/backends/triton/cpu/KernelBench/level1/31_ELU.py new file mode 100644 index 0000000..c9e0a26 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/31_ELU.py @@ -0,0 +1,69 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + ], + key=["n_elements"], +) +@triton.jit +def elu_kernel( + x_ptr, + out_ptr, + alpha, + n_elements, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + + x = tl.load(x_ptr + offsets, mask=mask, other=0.0) + x_f32 = x.to(tl.float32) + + inv_ln2: tl.constexpr = 1.4426950408889634 + exp_x = tl.math.exp2(x_f32 * inv_ln2) + neg_branch = alpha * (exp_x - 1.0) + + result = tl.where(x_f32 > 0.0, x_f32, neg_branch) + + tl.store(out_ptr + offsets, result.to(x.dtype), mask=mask) + + +class Model(nn.Module): + def __init__(self, alpha=1.0): + super(Model, self).__init__() + try: + self.alpha = float(alpha) + except (ValueError, TypeError): + self.alpha = 1.0 + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + n_elements = x.numel() + grid = lambda META: (triton.cdiv(n_elements, META["BLOCK_SIZE"]),) + elu_kernel[grid](x, out, self.alpha, n_elements) + return out + + +batch_size = 4096 +dim = 393216 + + +def 
get_inputs(): + x = torch.rand(batch_size, dim, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [1.0] diff --git a/backends/triton/cpu/KernelBench/level1/32_HardTanh.py b/backends/triton/cpu/KernelBench/level1/32_HardTanh.py new file mode 100644 index 0000000..c5fa253 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/32_HardTanh.py @@ -0,0 +1,59 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + ], + key=["n_elements"], +) +@triton.jit +def hardtanh_kernel( + x_ptr, + out_ptr, + n_elements, + min_val: tl.constexpr, + max_val: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + x = tl.load(x_ptr + offsets, mask=mask) + x = tl.maximum(x, min_val) + x = tl.minimum(x, max_val) + tl.store(out_ptr + offsets, x, mask=mask) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + n_elements = x.numel() + grid = lambda META: (triton.cdiv(n_elements, META["BLOCK_SIZE"]),) + hardtanh_kernel[grid](x, out, n_elements, -1.0, 1.0) + return out + + +batch_size = 4096 +dim = 393216 + + +def get_inputs(): + x = torch.rand(batch_size, dim, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/33_BatchNorm.py b/backends/triton/cpu/KernelBench/level1/33_BatchNorm.py new file mode 100644 index 0000000..4b2eab4 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/33_BatchNorm.py @@ -0,0 +1,202 @@ +# ruff: noqa: E731 +# 
AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.jit +def _bn_reduce_kernel( + x_ptr, + partial_sum_ptr, + partial_sq_ptr, + C, + HW, + stride_b, + stride_c, + B: tl.constexpr, + BLOCK_HW: tl.constexpr, +): + pid_c = tl.program_id(0) + pid_b = tl.program_id(1) + base = pid_b.to(tl.int64) * stride_b + pid_c.to(tl.int64) * stride_c + acc_sum = 0.0 + acc_sq = 0.0 + for hw_start in range(0, HW, BLOCK_HW): + offs = hw_start + tl.arange(0, BLOCK_HW) + mask = offs < HW + x = tl.load(x_ptr + base + offs, mask=mask, other=0.0).to(tl.float32) + acc_sum += tl.sum(x, axis=0) + acc_sq += tl.sum(x * x, axis=0) + out_idx = pid_c * B + pid_b + tl.store(partial_sum_ptr + out_idx, acc_sum) + tl.store(partial_sq_ptr + out_idx, acc_sq) + + +@triton.jit +def _bn_stats_kernel( + partial_sum_ptr, + partial_sq_ptr, + scale_ptr, + shift_ptr, + weight_ptr, + bias_ptr, + inv_count, + eps, + B: tl.constexpr, + BLOCK_B: tl.constexpr, +): + pid_c = tl.program_id(0) + offs_b = tl.arange(0, BLOCK_B) + mask = offs_b < B + s = tl.load(partial_sum_ptr + pid_c * B + offs_b, mask=mask, other=0.0) + sq = tl.load(partial_sq_ptr + pid_c * B + offs_b, mask=mask, other=0.0) + total_sum = tl.sum(s, axis=0) + total_sq = tl.sum(sq, axis=0) + mean_val = total_sum * inv_count + var_val = tl.maximum(total_sq * inv_count - mean_val * mean_val, 0.0) + w = tl.load(weight_ptr + pid_c) + bi = tl.load(bias_ptr + pid_c) + inv_std = 1.0 / tl.sqrt(var_val + eps) + scale_val = w * inv_std + shift_val = bi - mean_val * scale_val + tl.store(scale_ptr + pid_c, scale_val) + tl.store(shift_ptr + pid_c, shift_val) + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 4096, "warp_size": 32}, num_warps=8), + ], + key=["total_elements"], +) +@triton.jit +def _bn_normalize_flat_kernel( + x_ptr, + out_ptr,
scale_ptr, + shift_ptr, + C, + HW, + total_elements, + BLOCK_SIZE: tl.constexpr, + warp_size: tl.constexpr, +): + pid = tl.program_id(0) + start = pid * BLOCK_SIZE + offs = start + tl.arange(0, BLOCK_SIZE) + mask = offs < total_elements + c = (offs // HW) % C + x = tl.load(x_ptr + offs, mask=mask, other=0.0).to(tl.float32) + scale = tl.load(scale_ptr + c, mask=mask, other=0.0) + shift = tl.load(shift_ptr + c, mask=mask, other=0.0) + y = x * scale + shift + tl.store(out_ptr + offs, y.to(tl.float32), mask=mask) + + +class Model(nn.Module): + def __init__(self, num_features: int): + super().__init__() + self.num_features = num_features + self.weight = nn.Parameter(torch.ones(num_features)) + self.bias = nn.Parameter(torch.zeros(num_features)) + self.register_buffer("running_mean", torch.zeros(num_features)) + self.register_buffer("running_var", torch.ones(num_features)) + self.eps = 1e-5 + self.momentum = 0.1 + self._moved = False + self._bufs_ready = False + + def _move_params(self, device): + self.weight.data = self.weight.data.to(device, dtype=torch.float32).contiguous() + self.bias.data = self.bias.data.to(device, dtype=torch.float32).contiguous() + self.running_mean = self.running_mean.to( + device, dtype=torch.float32 + ).contiguous() + self.running_var = self.running_var.to(device, dtype=torch.float32).contiguous() + self._moved = True + + def _alloc_bufs(self, B, C, device): + self._partial_sum = torch.empty((C, B), device=device, dtype=torch.float32) + self._partial_sq = torch.empty((C, B), device=device, dtype=torch.float32) + self._scale = torch.empty(C, device=device, dtype=torch.float32) + self._shift = torch.empty(C, device=device, dtype=torch.float32) + self._bufs_ready = True + + def forward(self, x): + device = x.device + if not self._moved: + self._move_params(device) + x = x.to(dtype=torch.float32).contiguous() + B, C, H, W = x.shape + HW = H * W + stride_b = x.stride(0) + stride_c = x.stride(1) + total_elements = B * C * HW + + if not
self._bufs_ready or self._partial_sum.shape != (C, B): + self._alloc_bufs(B, C, device) + + if self.training: + _bn_reduce_kernel[(C, B)]( + x, + self._partial_sum, + self._partial_sq, + C, + HW, + stride_b, + stride_c, + B=B, + BLOCK_HW=8192, + num_warps=8, + ) + _bn_stats_kernel[(C,)]( + self._partial_sum, + self._partial_sq, + self._scale, + self._shift, + self.weight, + self.bias, + 1.0 / (B * HW), + self.eps, + B=B, + BLOCK_B=triton.next_power_of_2(B), + num_warps=4, + ) + else: + inv_std = 1.0 / torch.sqrt(self.running_var + self.eps) + self._scale.copy_(self.weight * inv_std) + self._shift.copy_(self.bias - self.running_mean * self._scale) + + out = torch.empty_like(x) + grid = lambda META: (triton.cdiv(total_elements, META["BLOCK_SIZE"]),) + _bn_normalize_flat_kernel[grid]( + x, + out, + self._scale, + self._shift, + C, + HW, + total_elements, + ) + return out + + +batch_size = 64 +features = 64 +dim1 = 512 +dim2 = 512 + + +def get_inputs(): + x = torch.rand(batch_size, features, dim1, dim2, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [features] diff --git a/backends/triton/cpu/KernelBench/level1/34_InstanceNorm.py b/backends/triton/cpu/KernelBench/level1/34_InstanceNorm.py new file mode 100644 index 0000000..4149704 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/34_InstanceNorm.py @@ -0,0 +1,87 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 2048, "warp_size": 32}, num_warps=8, num_stages=2), + ], + key=["N"], +) +@triton.jit +def instance_norm_kernel( + x_ptr, + out_ptr, + N, + eps, + BLOCK_SIZE: tl.constexpr, + warp_size: tl.constexpr, +): + pid = tl.program_id(0) + base = pid.to(tl.int64) * N + + sum_acc = tl.zeros((BLOCK_SIZE,), dtype=tl.float32) +
sq_acc = tl.zeros((BLOCK_SIZE,), dtype=tl.float32) + for start in range(0, N, BLOCK_SIZE): + offs = start + tl.arange(0, BLOCK_SIZE) + mask = offs < N + x = tl.load(x_ptr + base + offs, mask=mask, other=0.0).to(tl.float32) + sum_acc += x + sq_acc += x * x + + total_sum = tl.sum(sum_acc, axis=0) + total_sq = tl.sum(sq_acc, axis=0) + mean = total_sum / N + var = tl.maximum(total_sq / N - mean * mean, 0.0) + inv_std = 1.0 / tl.sqrt(var + eps) + + for start in range(0, N, BLOCK_SIZE): + offs = start + tl.arange(0, BLOCK_SIZE) + mask = offs < N + x = tl.load(x_ptr + base + offs, mask=mask, other=0.0).to(tl.float32) + out = (x - mean) * inv_std + tl.store(out_ptr + base + offs, out.to(tl.bfloat16), mask=mask) + + +class Model(nn.Module): + def __init__(self, num_features: int): + super(Model, self).__init__() + self.num_features = num_features + self.eps = 1e-5 + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, C, H, W = x.shape + N = H * W + x_flat = x.contiguous().view(B * C, N) + out = torch.empty_like(x_flat) + grid = (B * C,) + instance_norm_kernel[grid]( + x_flat, + out, + N, + self.eps, + ) + return out.view(B, C, H, W) + + +batch_size = 16 +features = 64 +dim1 = 256 +dim2 = 256 + + +def get_inputs(): + x = torch.rand(batch_size, features, dim1, dim2, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [features] diff --git a/backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py b/backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py new file mode 100644 index 0000000..4fab4bb --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py @@ -0,0 +1,183 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_HW": 4096, "warp_size": 32},
num_warps=8, num_stages=2), + ], + key=["HW", "channels_per_group"], +) +@triton.jit +def group_norm_stats_kernel( + x_ptr, + mean_ptr, + invstd_ptr, + C: tl.constexpr, + HW: tl.constexpr, + num_groups: tl.constexpr, + eps: tl.constexpr, + channels_per_group: tl.constexpr, + BLOCK_HW: tl.constexpr, + warp_size: tl.constexpr, +): + pid = tl.program_id(0) + batch_idx = pid // num_groups + group_idx = pid % num_groups + + channel_start = group_idx * channels_per_group + batch_offset = batch_idx.to(tl.int64) * C * HW + group_elems = channels_per_group * HW + + sum_val = 0.0 + sq_val = 0.0 + + for c in range(channels_per_group): + c_offset = (channel_start + c).to(tl.int64) * HW + base = batch_offset + c_offset + for hw_start in range(0, HW, BLOCK_HW): + offs = hw_start + tl.arange(0, BLOCK_HW) + mask = offs < HW + x_val = tl.load(x_ptr + base + offs.to(tl.int64), mask=mask, other=0.0).to( + tl.float32 + ) + sum_val += tl.sum(x_val, axis=0) + sq_val += tl.sum(x_val * x_val, axis=0) + + mean = sum_val / group_elems + variance = tl.maximum(sq_val / group_elems - mean * mean, 0.0) + inv_std = 1.0 / tl.sqrt(variance + eps) + + tl.store(mean_ptr + pid, mean) + tl.store(invstd_ptr + pid, inv_std) + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_HW": 4096, "warp_size": 32}, num_warps=8, num_stages=2), + ], + key=["HW"], +) +@triton.jit +def group_norm_apply_kernel( + x_ptr, + y_ptr, + mean_ptr, + invstd_ptr, + weight_ptr, + bias_ptr, + C: tl.constexpr, + HW: tl.constexpr, + num_groups: tl.constexpr, + channels_per_group: tl.constexpr, + BLOCK_HW: tl.constexpr, + warp_size: tl.constexpr, +): + pid = tl.program_id(0) + batch_idx = pid // C + channel_idx = pid % C + group_idx = channel_idx // channels_per_group + + stats_idx = batch_idx * num_groups + group_idx + mean = tl.load(mean_ptr + stats_idx) + inv_std = tl.load(invstd_ptr + stats_idx) + w = tl.load(weight_ptr + channel_idx).to(tl.float32) + b = tl.load(bias_ptr + channel_idx).to(tl.float32) + + scale = inv_std * w + shift = b -
mean * scale + + batch_offset = batch_idx.to(tl.int64) * C * HW + c_offset = channel_idx.to(tl.int64) * HW + base = batch_offset + c_offset + + for hw_start in range(0, HW, BLOCK_HW): + offs = hw_start + tl.arange(0, BLOCK_HW) + mask = offs < HW + x_val = tl.load(x_ptr + base + offs.to(tl.int64), mask=mask, other=0.0).to( + tl.float32 + ) + normed = x_val * scale + shift + tl.store(y_ptr + base + offs.to(tl.int64), normed.to(tl.bfloat16), mask=mask) + + +class Model(nn.Module): + def __init__(self, num_features: int, num_groups: int): + super(Model, self).__init__() + self.gn = nn.GroupNorm(num_groups=num_groups, num_channels=num_features) + self.num_features = num_features + self.num_groups = num_groups + self._packed = False + + def _pack_weights(self, device): + self.weight_packed = self.gn.weight.data.to(device).contiguous() + self.bias_packed = self.gn.bias.data.to(device).contiguous() + self._packed = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + device = x.device + x = x.contiguous() + if not self._packed: + self._pack_weights(device) + + N, C, H, W_dim = x.shape + HW = H * W_dim + channels_per_group = C // self.num_groups + y = torch.empty_like(x) + eps = self.gn.eps + + mean_buf = torch.empty(N * self.num_groups, device=device, dtype=torch.float32) + invstd_buf = torch.empty( + N * self.num_groups, device=device, dtype=torch.float32 + ) + + stats_grid = (N * self.num_groups,) + group_norm_stats_kernel[stats_grid]( + x, + mean_buf, + invstd_buf, + C, + HW, + self.num_groups, + eps, + channels_per_group, + ) + + apply_grid = (N * C,) + group_norm_apply_kernel[apply_grid]( + x, + y, + mean_buf, + invstd_buf, + self.weight_packed, + self.bias_packed, + C, + HW, + self.num_groups, + channels_per_group, + ) + return y + + +batch_size = 16 +features = 64 +num_groups = 8 +dim1 = 256 +dim2 = 256 + + +def get_inputs(): + x = torch.rand(batch_size, features, dim1, dim2, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return 
[features, num_groups] diff --git a/backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py b/backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py new file mode 100644 index 0000000..de1949b --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py @@ -0,0 +1,109 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_S": 64}, num_warps=4, num_stages=2), + ], + key=["S", "F"], +) +@triton.jit +def rms_norm_kernel( + x_ptr, + out_ptr, + eps_ptr, + S: tl.constexpr, + F: tl.constexpr, + stride_batch, + stride_feat, + BLOCK_S: tl.constexpr, +): + pid = tl.program_id(0) + num_s_blocks = tl.cdiv(S, BLOCK_S) + batch_id = pid // num_s_blocks + spatial_block_id = pid % num_s_blocks + + s_start = spatial_block_id * BLOCK_S + s_offs = s_start + tl.arange(0, BLOCK_S) + s_mask = s_offs < S + + batch_offset = batch_id.to(tl.int64) * stride_batch + eps = tl.load(eps_ptr) + + sum_sq = tl.zeros((BLOCK_S,), dtype=tl.float32) + for f in tl.static_range(F): + x_offs = batch_offset + f * stride_feat + s_offs + x_val = tl.load(x_ptr + x_offs, mask=s_mask, other=0.0) + x_fp32 = x_val.to(tl.float32) + sum_sq += x_fp32 * x_fp32 + + mean_sq = sum_sq / F + rms = tl.sqrt(mean_sq + eps.to(tl.float32)) + + for f in tl.static_range(F): + x_offs = batch_offset + f * stride_feat + s_offs + x_val = tl.load(x_ptr + x_offs, mask=s_mask, other=0.0) + out_val = x_val.to(tl.float32) / rms + tl.store(out_ptr + x_offs, out_val.to(tl.bfloat16), mask=s_mask) + + +class Model(nn.Module): + def __init__(self, num_features: int, eps: float = 1e-5): + super(Model, self).__init__() + self.num_features = num_features + self.eps = eps + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if x.dtype != 
torch.bfloat16: + x = x.to(torch.bfloat16) + x = x.contiguous() + + B = x.shape[0] + F = x.shape[1] + spatial = 1 + for i in range(2, x.dim()): + spatial *= x.shape[i] + + x_flat = x.view(B, F, spatial) + out_flat = torch.empty_like(x_flat) + + stride_batch = F * spatial + stride_feat = spatial + + eps_t = torch.tensor([self.eps], dtype=torch.float32, device=x.device) + grid = lambda META: (B * triton.cdiv(spatial, META["BLOCK_S"]),) + rms_norm_kernel[grid]( + x_flat, + out_flat, + eps_t, + spatial, + F, + stride_batch, + stride_feat, + ) + + return out_flat.view_as(x) + + +batch_size = 16 +features = 64 +dim1 = 256 +dim2 = 256 + + +def get_inputs(): + x = torch.rand(batch_size, features, dim1, dim2, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [features] diff --git a/backends/triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py b/backends/triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py new file mode 100644 index 0000000..d83afe2 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py @@ -0,0 +1,110 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + +REDUCE_BLOCK: tl.constexpr = 8192 + + +@triton.jit +def _partial_sum_sq_kernel( + x_ptr, + out_ptr, + N, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offsets < N + x = tl.load(x_ptr + offsets, mask=mask, other=0.0) + x_fp32 = x.to(tl.float32) + partial_sum = tl.sum(x_fp32 * x_fp32) + tl.store(out_ptr + pid, partial_sum) + + +@triton.jit +def _reduce_kernel( + partial_sums_ptr, + inv_norm_ptr, + num_partial, + REDUCE_BLOCK: tl.constexpr, +): + acc = tl.zeros([REDUCE_BLOCK], dtype=tl.float32) + for start in range(0, num_partial, REDUCE_BLOCK): + offsets = 
start + tl.arange(0, REDUCE_BLOCK) + mask = offsets < num_partial + vals = tl.load(partial_sums_ptr + offsets, mask=mask, other=0.0) + acc += vals + total = tl.sum(acc) + inv_norm = tl.rsqrt(total) + tl.store(inv_norm_ptr, inv_norm) + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 2048}, num_warps=8, num_stages=2), + ], + key=["N"], +) +@triton.jit +def _normalize_kernel( + x_ptr, + out_ptr, + inv_norm_ptr, + N, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(0) + inv_norm = tl.load(inv_norm_ptr) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offsets < N + x = tl.load(x_ptr + offsets, mask=mask, other=0.0) + result = x.to(tl.float32) * inv_norm + tl.store(out_ptr + offsets, result.to(tl.bfloat16), mask=mask) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + original_shape = x.shape + x_flat = x.contiguous().view(-1) + N = x_flat.numel() + + num_blocks = triton.cdiv(N, 8192) + partial_sums = torch.empty(num_blocks, device=x.device, dtype=torch.float32) + + _partial_sum_sq_kernel[(num_blocks,)](x_flat, partial_sums, N, BLOCK_SIZE=8192) + + inv_norm = torch.empty(1, device=x.device, dtype=torch.float32) + _reduce_kernel[(1,)]( + partial_sums, inv_norm, num_blocks, REDUCE_BLOCK=REDUCE_BLOCK + ) + + output_flat = torch.empty_like(x_flat) + grid_norm = lambda META: (triton.cdiv(N, META["BLOCK_SIZE"]),) + _normalize_kernel[grid_norm](x_flat, output_flat, inv_norm, N) + + return output_flat.view(original_shape) + + +batch_size = 16 +features = 64 +dim1 = 256 +dim2 = 256 + + +def get_inputs(): + x = torch.rand(batch_size, features, dim1, dim2, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/38_L1Norm_.py b/backends/triton/cpu/KernelBench/level1/38_L1Norm_.py new file mode 100644 index 0000000..4a3a396 --- /dev/null +++ 
b/backends/triton/cpu/KernelBench/level1/38_L1Norm_.py
@@ -0,0 +1,103 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=8, num_stages=2),
+    ],
+    key=["N"],
+)
+@triton.jit
+def l1_norm_kernel(
+    x_ptr,
+    out_ptr,
+    M,
+    N,
+    stride_xm,
+    stride_xn,
+    stride_om,
+    stride_on,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """Divide each row of x by the mean absolute value of that row.
+
+    One program handles one row (program_id(0) is the row index) and walks
+    it twice in BLOCK_SIZE-wide chunks: first to reduce, then to scale.
+    Math is done in float32; results are cast to out_ptr's element type.
+    """
+    row = tl.program_id(0)
+
+    col_offsets = tl.arange(0, BLOCK_SIZE)
+    # Row offsets are computed in int64 before being added to the pointers.
+    row_x = x_ptr + row.to(tl.int64) * stride_xm
+    row_o = out_ptr + row.to(tl.int64) * stride_om
+
+    # Phase 1: compute sum(abs(x)) for this row
+    abs_sum = 0.0
+    for col_start in range(0, N, BLOCK_SIZE):
+        cols = col_start + col_offsets
+        mask = cols < N
+        x = tl.load(row_x + cols * stride_xn, mask=mask, other=0.0).to(tl.float32)
+        abs_sum += tl.sum(tl.abs(x), axis=0)
+
+    # mean = sum(abs(x)) / N
+    mean_val = abs_sum / N
+
+    # Phase 2: normalize x / mean
+    for col_start in range(0, N, BLOCK_SIZE):
+        cols = col_start + col_offsets
+        mask = cols < N
+        x = tl.load(row_x + cols * stride_xn, mask=mask, other=0.0).to(tl.float32)
+        out = x / mean_val
+        # Cast to the output buffer's element type rather than a hard-coded
+        # bfloat16, so float16/float32 outputs are not rounded through bf16.
+        tl.store(row_o + cols * stride_on, out.to(out_ptr.dtype.element_ty), mask=mask)
+
+
+class Model(nn.Module):
+    """Row-wise x / mean(|x|) for a 2-D tensor, computed by l1_norm_kernel."""
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return a tensor like x with each row divided by its mean |x|.
+
+        x must be 2-D (its shape is unpacked as M, N); one program per row.
+        """
+        M, N = x.shape
+        out = torch.empty_like(x)
+        grid = (M,)
+        l1_norm_kernel[grid](
+            x,
+            out,
+            M,
+            N,
+            x.stride(0),
+            x.stride(1),
+            out.stride(0),
+            out.stride(1),
+        )
+        return out
+
+
+batch_size = 4096
+dim = 16384
+
+
+def get_inputs():
+    x = torch.rand(batch_size, dim, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs():
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/39_L2Norm_.py
b/backends/triton/cpu/KernelBench/level1/39_L2Norm_.py
new file mode 100644
index 0000000..ecd571f
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/39_L2Norm_.py
@@ -0,0 +1,103 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
+    ],
+    key=["N"],
+)
+@triton.jit
+def l2_norm_kernel(
+    x_ptr,
+    out_ptr,
+    M,
+    N,
+    stride_m,
+    stride_n,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """Scale each row of x by 1 / sqrt(sum(x * x) + 1e-12).
+
+    One program handles one row. x and out are addressed with the same
+    stride_m / stride_n, so both buffers must share one layout. Math is done
+    in float32; results are cast to out_ptr's element type on store.
+    """
+    row = tl.program_id(0)
+    # Widen to int64 first: a 32-bit row * stride_m product can wrap for
+    # very large tensors.
+    row_start = row.to(tl.int64) * stride_m
+
+    sum_sq = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
+    for off in range(0, N, BLOCK_SIZE):
+        cols = off + tl.arange(0, BLOCK_SIZE)
+        mask = cols < N
+        x = tl.load(x_ptr + row_start + cols * stride_n, mask=mask, other=0.0).to(
+            tl.float32
+        )
+        sum_sq += x * x
+
+    norm_sq = tl.sum(sum_sq, axis=0)
+    inv_norm = 1.0 / tl.sqrt(norm_sq + 1e-12)
+
+    for off in range(0, N, BLOCK_SIZE):
+        cols = off + tl.arange(0, BLOCK_SIZE)
+        mask = cols < N
+        x = tl.load(x_ptr + row_start + cols * stride_n, mask=mask, other=0.0).to(
+            tl.float32
+        )
+        out = x * inv_norm
+        # Store in the output buffer's element type, not a fixed bfloat16.
+        tl.store(
+            out_ptr + row_start + cols * stride_n,
+            out.to(out_ptr.dtype.element_ty),
+            mask=mask,
+        )
+
+
+class Model(nn.Module):
+    """Row-wise L2 normalization of a 2-D tensor via l2_norm_kernel."""
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return a tensor like x with each row scaled to (near) unit L2 norm.
+
+        x must be 2-D; it is made contiguous so x and out share strides.
+        """
+        x = x.contiguous()
+        M, N = x.shape
+        out = torch.empty_like(x)
+        grid = (M,)
+        l2_norm_kernel[grid](
+            x,
+            out,
+            M,
+            N,
+            x.stride(0),
+            x.stride(1),
+        )
+        return out
+
+
+batch_size = 4096
+dim = 16384
+
+
+def get_inputs():
+    x = torch.rand(batch_size, dim, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs():
+    return []
diff --git a/backends/triton/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py
b/backends/triton/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py new file mode 100644 index 0000000..9380f52 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py @@ -0,0 +1,147 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, + num_warps=32, + num_stages=2, + ), + ], + key=["M", "N", "K"], +) +@triton.jit +def _batched_matmul_kernel( + A_ptr, + B_ptr, + C_ptr, + M, + N, + K, + stride_ab: tl.constexpr, + stride_am: tl.constexpr, + stride_ak: tl.constexpr, + stride_bb: tl.constexpr, + stride_bk: tl.constexpr, + stride_bn: tl.constexpr, + stride_cb: tl.constexpr, + stride_cm: tl.constexpr, + stride_cn: tl.constexpr, + BATCH: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + pid = tl.program_id(0) + + num_pid_m = tl.cdiv(M, BLOCK_M) + num_pid_n = tl.cdiv(N, BLOCK_N) + num_tiles_mn = num_pid_m * num_pid_n + + batch_id = pid // num_tiles_mn + tile_id = pid % num_tiles_mn + + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = tile_id // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + ((tile_id % num_pid_in_group) % group_size_m) + pid_n = (tile_id % num_pid_in_group) // group_size_m + + batch_offset_a = batch_id.to(tl.int64) * stride_ab + batch_offset_b = batch_id.to(tl.int64) * stride_bb + batch_offset_c = batch_id.to(tl.int64) * stride_cb + + a_desc = tl.make_tensor_descriptor( + base=A_ptr + batch_offset_a, + shape=(M, K), + strides=(stride_am, stride_ak), + block_shape=(BLOCK_M, 
BLOCK_K), + ) + + b_desc = tl.make_tensor_descriptor( + base=B_ptr + batch_offset_b, + shape=(K, N), + strides=(stride_bk, stride_bn), + block_shape=(BLOCK_K, BLOCK_N), + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for off_k in range(0, K, BLOCK_K): + a = a_desc.load([pid_m * BLOCK_M, off_k]) + b = b_desc.load([off_k, pid_n * BLOCK_N]) + acc += tl.dot(a, b) + c_desc = tl.make_tensor_descriptor( + base=C_ptr + batch_offset_c, + shape=(M, N), + strides=(stride_cm, stride_cn), + block_shape=(BLOCK_M, BLOCK_N), + ) + c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(C_ptr.type.element_ty)) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: + BATCH, M, K = A.shape + _, _, N = B.shape + + A = A.contiguous() + B = B.contiguous() + C = torch.empty((BATCH, M, N), device=A.device, dtype=A.dtype) + + def grid(META): + return ( + BATCH + * triton.cdiv(M, META["BLOCK_M"]) + * triton.cdiv(N, META["BLOCK_N"]), + ) + + _batched_matmul_kernel[grid]( + A, + B, + C, + M, + N, + K, + A.stride(0), + A.stride(1), + A.stride(2), + B.stride(0), + B.stride(1), + B.stride(2), + C.stride(0), + C.stride(1), + C.stride(2), + BATCH=BATCH, + ) + return C + + +batch_size = 128 +m = 128 * 4 +k = 256 * 4 +n = 512 * 4 + + +def get_inputs(): + A = torch.rand(batch_size, m, k, dtype=torch.bfloat16) + B = torch.rand(batch_size, k, n, dtype=torch.bfloat16) + return [A, B] + + +def get_init_inputs(): + return []
diff --git a/backends/triton/cpu/KernelBench/level1/40_LayerNorm.py b/backends/triton/cpu/KernelBench/level1/40_LayerNorm.py
new file mode 100644
index 0000000..e0954b2
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/40_LayerNorm.py
@@ -0,0 +1,130 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_SIZE": 1024, "warp_size": 32}, num_warps=16, num_stages=2
+        ),
+    ],
+    key=["N"],
+)
+@triton.jit
+def _layer_norm_best_kernel(
+    X_ptr,
+    Y_ptr,
+    W_ptr,
+    B_ptr,
+    M,
+    N,
+    eps,
+    BLOCK_SIZE: tl.constexpr,
+    warp_size: tl.constexpr,
+):
+    """Layer-normalize one length-N row of X and write it to Y.
+
+    One program handles one row; rows are assumed densely packed (row
+    stride N, element stride 1). Statistics are accumulated in float32 from
+    sum(x) and sum(x * x); W and B are the per-column scale and shift.
+    """
+    row = tl.program_id(0)
+    row_start = row.to(tl.int64) * N
+
+    _sum = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
+    _sum_sq = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
+    for off in range(0, N, BLOCK_SIZE):
+        cols = off + tl.arange(0, BLOCK_SIZE)
+        mask = cols < N
+        x = tl.load(X_ptr + row_start + cols, mask=mask, other=0.0).to(tl.float32)
+        _sum += x
+        _sum_sq += x * x
+    mean = tl.sum(_sum, axis=0) / N
+    var = tl.sum(_sum_sq, axis=0) / N - mean * mean
+    # E[x^2] - mean^2 can round slightly below zero for near-constant rows;
+    # clamp so sqrt(var + eps) cannot see a negative argument.
+    var = tl.maximum(var, 0.0)
+    rstd = 1.0 / tl.sqrt(var + eps)
+
+    for off in range(0, N, BLOCK_SIZE):
+        cols = off + tl.arange(0, BLOCK_SIZE)
+        mask = cols < N
+        x = tl.load(X_ptr + row_start + cols, mask=mask, other=0.0).to(tl.float32)
+        w = tl.load(W_ptr + cols, mask=mask, other=1.0).to(tl.float32)
+        b = tl.load(B_ptr + cols, mask=mask, other=0.0).to(tl.float32)
+        y = (x - mean) * rstd * w + b
+        # Store in Y's own element type instead of a hard-coded bfloat16.
+        tl.store(Y_ptr + row_start + cols, y.to(Y_ptr.dtype.element_ty), mask=mask)
+
+
+class Model(nn.Module):
+    """LayerNorm over the trailing normalized_shape dims via a Triton kernel."""
+
+    def __init__(self, normalized_shape: tuple):
+        super(Model, self).__init__()
+        self.ln = nn.LayerNorm(normalized_shape=normalized_shape)
+        self._moved = False
+
+    def _move_params(self, device):
+        """Cache flat bfloat16 copies of ln.weight / ln.bias on device.
+
+        NOTE: the copies are not refreshed if ln's parameters change later.
+        """
+        self.w_flat = (
+            self.ln.weight.data.to(device, dtype=torch.bfloat16).contiguous().flatten()
+        )
+        self.b_flat = (
+            self.ln.bias.data.to(device, dtype=torch.bfloat16).contiguous().flatten()
+        )
+        self._eps = self.ln.eps
+        self._norm_n = 1
+        for s in self.ln.normalized_shape:
+            self._norm_n *= s
+        self._moved = True
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return layer-normalized x with the same shape and dtype as x."""
+        # Rebuild the cache when x lives on a different device than the
+        # cached parameters, not just on the very first call.
+        if not self._moved or self.w_flat.device != x.device:
+            self._move_params(x.device)
+        x = x.contiguous()
+        orig_shape = x.shape
+        N = self._norm_n
+        M = x.numel() // N
+        x_flat = x.view(M, N)
+        y_flat = torch.empty_like(x_flat)
+        grid = (M,)
+        _layer_norm_best_kernel[grid](
+            x_flat,
+            y_flat,
+            self.w_flat,
+            self.b_flat,
+            M,
+            N,
+            self._eps,
+        )
+        return y_flat.view(orig_shape)
+
+
+batch_size = 16
+features = 64
+dim1 = 256
+dim2 = 256
+
+
+def get_inputs():
+    x = torch.rand(batch_size, features, dim1, dim2, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs(*args, **kwargs):
+    return [(features, dim1, dim2)]
diff --git a/backends/triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py b/backends/triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py new file mode 100644 index 0000000..1d71cbb --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py @@ -0,0 +1,121 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 256}, num_warps=4, num_stages=2), + ], + key=["output_length"], +) +@triton.jit +def maxpool1d_kernel( + input_ptr, + output_ptr, + seq_length, + output_length, + num_channels, + stride_b, + stride_c, + stride_out_b, + stride_out_c, + KERNEL_SIZE: tl.constexpr, + STRIDE: tl.constexpr, + PADDING: tl.constexpr, + DILATION: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + pid_bc = tl.program_id(0) + pid_o = tl.program_id(1) + + b = pid_bc // num_channels + c = pid_bc % num_channels + + o_start = pid_o * BLOCK_SIZE + o_offsets = o_start + tl.arange(0, BLOCK_SIZE) + o_mask = o_offsets < output_length + + base_in = input_ptr + b * stride_b + c * stride_c + running_max = tl.full([BLOCK_SIZE], value=-float("inf"), dtype=tl.float32) + + for k in range(KERNEL_SIZE): + inp_idx = o_offsets * STRIDE + k * DILATION - PADDING + valid = (inp_idx >= 0) & (inp_idx < seq_length) & o_mask + vals =
tl.load(base_in + inp_idx, mask=valid, other=-float("inf")) + running_max = tl.maximum(running_max, vals.to(tl.float32)) + + base_out = output_ptr + b * stride_out_b + c * stride_out_c + tl.store(base_out + o_offsets, running_max.to(tl.bfloat16), mask=o_mask) + + +class Model(nn.Module): + def __init__( + self, + kernel_size: int, + stride: int = None, + padding: int = 0, + dilation: int = 1, + return_indices: bool = False, + ): + super(Model, self).__init__() + self.kernel_size = kernel_size + self.stride = stride if stride is not None else kernel_size + self.padding = padding + self.dilation = dilation + + def forward(self, x: torch.Tensor) -> torch.Tensor: + device = x.device + B, C, L = x.shape + x = x.to(device).contiguous() + + output_length = ( + L + 2 * self.padding - self.dilation * (self.kernel_size - 1) - 1 + ) // self.stride + 1 + output = torch.empty((B, C, output_length), device=device, dtype=x.dtype) + + grid = lambda META: (B * C, triton.cdiv(output_length, META["BLOCK_SIZE"])) + maxpool1d_kernel[grid]( + x, + output, + L, + output_length, + C, + x.stride(0), + x.stride(1), + output.stride(0), + output.stride(1), + KERNEL_SIZE=self.kernel_size, + STRIDE=self.stride, + PADDING=self.padding, + DILATION=self.dilation, + ) + return output + + +batch_size = 64 +features = 192 +sequence_length = 65536 + +kernel_size = 8 +stride = 1 +padding = 4 +dilation = 3 + +return_indices = False + + +def get_inputs(): + x = torch.rand(batch_size, features, sequence_length, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [kernel_size, stride, padding, dilation, return_indices] diff --git a/backends/triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py b/backends/triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py new file mode 100644 index 0000000..0d5aa8e --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py @@ -0,0 +1,127 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate 
implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_H": 1, "BLOCK_W": 256}, num_warps=4, num_stages=2), + ], + key=["OH", "OW"], +) +@triton.jit +def _maxpool2d_kernel( + x_ptr, + out_ptr, + H, + W, + OH, + OW, + BLOCK_H: tl.constexpr, + BLOCK_W: tl.constexpr, + KERNEL_SIZE: tl.constexpr, + STRIDE: tl.constexpr, + PADDING: tl.constexpr, + DILATION: tl.constexpr, + IS_BF16: tl.constexpr, +): + pid_bc = tl.program_id(0) + pid_oh = tl.program_id(1) + pid_ow = tl.program_id(2) + + x_base = x_ptr + pid_bc.to(tl.int64) * H * W + out_base = out_ptr + pid_bc.to(tl.int64) * OH * OW + + oh_offsets = pid_oh * BLOCK_H + tl.arange(0, BLOCK_H) + ow_offsets = pid_ow * BLOCK_W + tl.arange(0, BLOCK_W) + + max_vals = tl.full((BLOCK_H, BLOCK_W), float("-inf"), dtype=tl.float32) + + for kh in range(KERNEL_SIZE): + ih = oh_offsets * STRIDE - PADDING + kh * DILATION + valid_h = (ih >= 0) & (ih < H) & (oh_offsets < OH) + for kw in range(KERNEL_SIZE): + iw = ow_offsets * STRIDE - PADDING + kw * DILATION + valid_w = (iw >= 0) & (iw < W) & (ow_offsets < OW) + mask = valid_h[:, None] & valid_w[None, :] + ptrs = x_base + ih[:, None] * W + iw[None, :] + vals = tl.load(ptrs, mask=mask, other=float("-inf")).to(tl.float32) + max_vals = tl.maximum(max_vals, vals) + + out_mask = (oh_offsets < OH)[:, None] & (ow_offsets < OW)[None, :] + out_ptrs = out_base + oh_offsets[:, None] * OW + ow_offsets[None, :] + if IS_BF16: + tl.store(out_ptrs, max_vals.to(tl.bfloat16), mask=out_mask) + else: + tl.store(out_ptrs, max_vals, mask=out_mask) + + +def maxpool2d(x, kernel_size, stride, padding, dilation): + B, C, H, W = x.shape + OH = (H + 2 * padding - dilation * (kernel_size - 1) - 1) // stride + 1 + OW = (W + 2 * padding - dilation * (kernel_size - 1) - 1) // stride + 1 + + x = x.contiguous() + out = 
torch.empty((B, C, OH, OW), device=x.device, dtype=x.dtype) + + is_bf16 = x.dtype == torch.bfloat16 + + grid = lambda META: ( + B * C, + triton.cdiv(OH, META["BLOCK_H"]), + triton.cdiv(OW, META["BLOCK_W"]), + ) + + _maxpool2d_kernel[grid]( + x, + out, + H, + W, + OH, + OW, + KERNEL_SIZE=kernel_size, + STRIDE=stride, + PADDING=padding, + DILATION=dilation, + IS_BF16=is_bf16, + ) + + return out + + +class Model(nn.Module): + def __init__(self, kernel_size: int, stride: int, padding: int, dilation: int): + super().__init__() + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return maxpool2d(x, self.kernel_size, self.stride, self.padding, self.dilation) + + +batch_size = 32 +channels = 64 +height = 512 +width = 512 +kernel_size = 4 +stride = 1 +padding = 1 +dilation = 1 + + +def get_inputs(): + x = torch.rand(batch_size, channels, height, width, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [kernel_size, stride, padding, dilation] diff --git a/backends/triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py b/backends/triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py new file mode 100644 index 0000000..84f6dc5 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py @@ -0,0 +1,172 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_OW": 32}, num_warps=4, num_stages=2), + ], + key=["OW"], +) +@triton.jit +def _maxpool3d_kernel( + x_ptr, + out_ptr, + C, + D, + H, + W, + OD, + OH, + OW, + stride_xb, + stride_xc, + stride_xd, + stride_xh, + stride_xw, + stride_ob, + stride_oc, + stride_od, + stride_oh, + stride_ow, + 
POOL_STRIDE: tl.constexpr, + PAD: tl.constexpr, + DIL: tl.constexpr, + KS: tl.constexpr, + BLOCK_OW: tl.constexpr, +): + pid_ow = tl.program_id(0) + pid_dh = tl.program_id(1) + pid_bc = tl.program_id(2) + + b = pid_bc // C + c = pid_bc % C + od = pid_dh // OH + oh = pid_dh % OH + + ow_start = pid_ow * BLOCK_OW + ow_offs = ow_start + tl.arange(0, BLOCK_OW) + mask_ow = ow_offs < OW + + base = x_ptr + b.to(tl.int64) * stride_xb + c.to(tl.int64) * stride_xc + + d_base = od * POOL_STRIDE - PAD + h_base = oh * POOL_STRIDE - PAD + w_bases = ow_offs * POOL_STRIDE - PAD + + max_val = tl.full([BLOCK_OW], float("-inf"), dtype=tl.float32) + + for kd in range(KS): + d_in = d_base + kd * DIL + d_valid = (d_in >= 0) & (d_in < D) + if d_valid: + for kh in range(KS): + h_in = h_base + kh * DIL + h_valid = (h_in >= 0) & (h_in < H) + if h_valid: + dh_offset = d_in * stride_xd + h_in * stride_xh + for kw in range(KS): + w_in = w_bases + kw * DIL + w_valid = (w_in >= 0) & (w_in < W) + ptrs = base + dh_offset + w_in * stride_xw + valid_mask = mask_ow & w_valid + val = tl.load(ptrs, mask=valid_mask, other=float("-inf")) + max_val = tl.maximum(max_val, val.to(tl.float32)) + + out_base = out_ptr + b.to(tl.int64) * stride_ob + c.to(tl.int64) * stride_oc + out_base += od * stride_od + oh * stride_oh + out_ptrs = out_base + ow_offs * stride_ow + tl.store(out_ptrs, max_val.to(tl.bfloat16), mask=mask_ow) + + +def maxpool3d_triton(x, kernel_size, stride_pool, padding, dilation): + B, C, D, H, W = x.shape + OD = (D + 2 * padding - dilation * (kernel_size - 1) - 1) // stride_pool + 1 + OH = (H + 2 * padding - dilation * (kernel_size - 1) - 1) // stride_pool + 1 + OW = (W + 2 * padding - dilation * (kernel_size - 1) - 1) // stride_pool + 1 + + out = torch.empty(B, C, OD, OH, OW, device=x.device, dtype=x.dtype) + + grid = lambda META: ( + triton.cdiv(OW, META["BLOCK_OW"]), + OD * OH, + B * C, + ) + + _maxpool3d_kernel[grid]( + x, + out, + C, + D, + H, + W, + OD, + OH, + OW, + x.stride(0), + 
x.stride(1), + x.stride(2), + x.stride(3), + x.stride(4), + out.stride(0), + out.stride(1), + out.stride(2), + out.stride(3), + out.stride(4), + POOL_STRIDE=stride_pool, + PAD=padding, + DIL=dilation, + KS=kernel_size, + ) + + return out + + +class Model(nn.Module): + def __init__( + self, + kernel_size: int, + stride: int = None, + padding: int = 0, + dilation: int = 1, + return_indices: bool = False, + ceil_mode: bool = False, + ): + super(Model, self).__init__() + self.kernel_size = kernel_size + self.stride_pool = stride if stride is not None else kernel_size + self.padding = padding + self.dilation = dilation + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return maxpool3d_triton( + x, self.kernel_size, self.stride_pool, self.padding, self.dilation + ) + + +batch_size = 16 +channels = 32 +dim1 = 128 +dim2 = 128 +dim3 = 128 +kernel_size = 3 +stride = 2 +padding = 1 +dilation = 3 + + +def get_inputs(): + x = torch.rand(batch_size, channels, dim1, dim2, dim3, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [kernel_size, stride, padding, dilation] diff --git a/backends/triton/cpu/KernelBench/level1/44_Average_Pooling_1D.py b/backends/triton/cpu/KernelBench/level1/44_Average_Pooling_1D.py new file mode 100644 index 0000000..564bf7f --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/44_Average_Pooling_1D.py @@ -0,0 +1,125 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 256}, num_warps=4, num_stages=2), + ], + key=["output_length", "kernel_size"], +) +@triton.jit +def avg_pool1d_kernel( + input_ptr, + output_ptr, + input_length, + output_length, + kernel_size, + stride, + padding, + stride_b, + stride_c, + stride_l, + 
out_stride_b, + out_stride_c, + out_stride_l, + BLOCK_SIZE: tl.constexpr, +): + pid_bc = tl.program_id(0) + pid_l = tl.program_id(1) + + offs = pid_l * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offs < output_length + + acc = tl.zeros((BLOCK_SIZE,), dtype=tl.float32) + + for k in range(kernel_size): + inp_pos = offs * stride - padding + k + inp_mask = mask & (inp_pos >= 0) & (inp_pos < input_length) + inp_ptr = input_ptr + pid_bc.to(tl.int64) * stride_c + inp_pos * stride_l + val = tl.load(inp_ptr, mask=inp_mask, other=0.0).to(tl.float32) + acc += val + + inv_kernel_size = 1.0 / kernel_size + acc = acc * inv_kernel_size + + out_ptr = output_ptr + pid_bc.to(tl.int64) * out_stride_c + offs * out_stride_l + tl.store(out_ptr, acc.to(tl.bfloat16), mask=mask) + + +def kernel_function( + x: torch.Tensor, kernel_size: int, stride: int, padding: int +) -> torch.Tensor: + batch_size, in_channels, input_length = x.shape + output_length = (input_length + 2 * padding - kernel_size) // stride + 1 + + if x.dtype != torch.bfloat16: + x = x.to(torch.bfloat16) + x = x.contiguous() + + output = torch.empty( + (batch_size, in_channels, output_length), + device=x.device, + dtype=torch.bfloat16, + ) + + total_bc = batch_size * in_channels + + x_flat = x.view(total_bc, input_length) + out_flat = output.view(total_bc, output_length) + + grid = lambda META: (total_bc, triton.cdiv(output_length, META["BLOCK_SIZE"])) + + avg_pool1d_kernel[grid]( + x_flat, + out_flat, + input_length, + output_length, + kernel_size, + stride, + padding, + x_flat.stride(0), + x_flat.stride(0), + x_flat.stride(1), + out_flat.stride(0), + out_flat.stride(0), + out_flat.stride(1), + ) + + return output + + +batch_size = 64 +in_channels = 128 +input_length = 65536 +kernel_size = 8 +stride = 1 +padding = 4 + + +class Model(nn.Module): + def __init__(self, kernel_size: int, stride: int = 1, padding: int = 0): + super(Model, self).__init__() + self.kernel_size = kernel_size + self.stride = stride + self.padding = 
padding + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return kernel_function(x, self.kernel_size, self.stride, self.padding) + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, input_length, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [kernel_size, stride, padding]
diff --git a/backends/triton/cpu/KernelBench/level1/45_Average_Pooling_2D.py b/backends/triton/cpu/KernelBench/level1/45_Average_Pooling_2D.py
new file mode 100644
index 0000000..0e9e33f
--- /dev/null
+++ b/backends/triton/cpu/KernelBench/level1/45_Average_Pooling_2D.py
@@ -0,0 +1,135 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_OW": 16}, num_warps=4, num_stages=2),
+    ],
+    key=["OW"],
+)
+@triton.jit
+def avg_pool2d_kernel(
+    input_ptr,
+    output_ptr,
+    H,
+    W,
+    OW,
+    in_stride_nc,
+    out_stride_nc,
+    KERNEL_SIZE: tl.constexpr,
+    POOL_STRIDE: tl.constexpr,
+    PADDING: tl.constexpr,
+    BLOCK_OW: tl.constexpr,
+    OUTPUT_BF16: tl.constexpr,
+):
+    """Average-pool one output row tile of one (n, c) plane.
+
+    Grid axes are (plane, output row, tile of BLOCK_OW output columns). Each
+    plane is read as a dense H x W image. Taps that land in the implicit zero
+    padding add 0 but still count in the KERNEL_SIZE**2 divisor.
+    """
+    pid_nc = tl.program_id(0)
+    pid_oh = tl.program_id(1)
+    pid_ow_tile = tl.program_id(2)
+
+    ow_offsets = pid_ow_tile * BLOCK_OW + tl.arange(0, BLOCK_OW)
+    ow_mask = ow_offsets < OW
+
+    in_base = input_ptr + pid_nc.to(tl.int64) * in_stride_nc
+    out_base = output_ptr + pid_nc.to(tl.int64) * out_stride_nc
+
+    acc = tl.zeros((BLOCK_OW,), dtype=tl.float32)
+
+    # Window origin in input coordinates; negative inside the padding.
+    h_start = pid_oh * POOL_STRIDE - PADDING
+    w_start = ow_offsets * POOL_STRIDE - PADDING
+
+    for kh in range(KERNEL_SIZE):
+        h_in = h_start + kh
+        h_valid = (h_in >= 0) & (h_in < H)
+        row_offset = h_in * W
+        for kw in range(KERNEL_SIZE):
+            w_in = w_start + kw
+            # Only read taps that lie inside the H x W plane.
+            tap_mask = ow_mask & h_valid & (w_in >= 0) & (w_in < W)
+            vals = tl.load(in_base + row_offset + w_in, mask=tap_mask, other=0.0)
+            acc += vals.to(tl.float32)
+
+    inv_area = 1.0 / (KERNEL_SIZE * KERNEL_SIZE)
+    result = acc * inv_area
+
+    out_offset = pid_oh * OW + ow_offsets
+    if OUTPUT_BF16:
+        tl.store(out_base + out_offset, result.to(tl.bfloat16), mask=ow_mask)
+    else:
+        tl.store(out_base + out_offset, result, mask=ow_mask)
+
+
+class Model(nn.Module):
+    """2-D average pooling over an (N, C, H, W) tensor via avg_pool2d_kernel."""
+
+    def __init__(self, kernel_size: int, stride: int = None, padding: int = 0):
+        super(Model, self).__init__()
+        self.kernel_size = kernel_size
+        self.stride = stride if stride is not None else kernel_size
+        self.padding = padding
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return the pooled (N, C, OH, OW) tensor in x's dtype."""
+        N, C, H, W = x.shape
+        x = x.contiguous()
+
+        OH = (H + 2 * self.padding - self.kernel_size) // self.stride + 1
+        OW = (W + 2 * self.padding - self.kernel_size) // self.stride + 1
+
+        output = torch.empty(N, C, OH, OW, device=x.device, dtype=x.dtype)
+
+        NC = N * C
+        in_stride_nc = H * W
+        out_stride_nc = OH * OW
+        is_bf16 = x.dtype == torch.bfloat16
+
+        grid = lambda META: (NC, OH, triton.cdiv(OW, META["BLOCK_OW"]))
+
+        # PADDING must reach the kernel: OH/OW above already include it, so
+        # without it the kernel would index past the plane.
+        avg_pool2d_kernel[grid](
+            x,
+            output,
+            H,
+            W,
+            OW,
+            in_stride_nc,
+            out_stride_nc,
+            KERNEL_SIZE=self.kernel_size,
+            POOL_STRIDE=self.stride,
+            PADDING=self.padding,
+            OUTPUT_BF16=is_bf16,
+        )
+
+        return output
+
+
+batch_size = 16
+channels = 64
+height = 2048
+width = 2048
+kernel_size = 11
+
+
+def get_inputs():
+    x = torch.rand(batch_size, channels, height, width, dtype=torch.bfloat16)
+    return [x]
+
+
+def get_init_inputs():
+    return [kernel_size]
diff --git a/backends/triton/cpu/KernelBench/level1/46_Average_Pooling_3D.py b/backends/triton/cpu/KernelBench/level1/46_Average_Pooling_3D.py new file mode 100644 index 0000000..f52c515 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/46_Average_Pooling_3D.py @@ -0,0 +1,157 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_OW": 64}, num_warps=4, num_stages=2),
+ ], + key=["OW"], +) +@triton.jit +def avg_pool3d_kernel( + x_ptr, + out_ptr, + C, + D, + H, + W, + OD, + OH, + OW, + stride_xn, + stride_xc, + stride_xd, + stride_xh, + stride_on, + stride_oc, + stride_od, + stride_oh, + KERNEL_SIZE: tl.constexpr, + STRIDE: tl.constexpr, + PADDING: tl.constexpr, + BLOCK_OW: tl.constexpr, +): + pid_ow = tl.program_id(0) + pid_ncdoh = tl.program_id(1) + + oh = pid_ncdoh % OH + tmp = pid_ncdoh // OH + od = tmp % OD + tmp2 = tmp // OD + c = tmp2 % C + n = tmp2 // C + + ow_start = pid_ow * BLOCK_OW + ow_offsets = ow_start + tl.arange(0, BLOCK_OW) + mask_ow = ow_offsets < OW + + base_x = x_ptr + n.to(tl.int64) * stride_xn + c.to(tl.int64) * stride_xc + + acc = tl.zeros((BLOCK_OW,), dtype=tl.float32) + + for kd in range(KERNEL_SIZE): + d_in = od * STRIDE + kd - PADDING + d_valid = (d_in >= 0) & (d_in < D) + if d_valid: + for kh in range(KERNEL_SIZE): + h_in = oh * STRIDE + kh - PADDING + h_valid = (h_in >= 0) & (h_in < H) + if h_valid: + row_base = ( + base_x + + d_in.to(tl.int64) * stride_xd + + h_in.to(tl.int64) * stride_xh + ) + for kw in range(KERNEL_SIZE): + w_in = ow_offsets * STRIDE + kw - PADDING + w_valid = (w_in >= 0) & (w_in < W) + mask = mask_ow & w_valid + ptrs = row_base + w_in + vals = tl.load(ptrs, mask=mask, other=0.0) + acc += vals.to(tl.float32) + + inv_count = 1.0 / (KERNEL_SIZE * KERNEL_SIZE * KERNEL_SIZE) + acc = acc * inv_count + + out_base = ( + out_ptr + + n.to(tl.int64) * stride_on + + c.to(tl.int64) * stride_oc + + od.to(tl.int64) * stride_od + + oh.to(tl.int64) * stride_oh + ) + tl.store(out_base + ow_offsets, acc.to(tl.bfloat16), mask=mask_ow) + + +class Model(nn.Module): + def __init__(self, kernel_size: int, stride: int = None, padding: int = 0): + super(Model, self).__init__() + self.kernel_size = kernel_size + self.stride = stride if stride is not None else kernel_size + self.padding = padding + + def forward(self, x: torch.Tensor) -> torch.Tensor: + N, C, D, H, W = x.shape + OD = (D + 2 * self.padding - 
self.kernel_size) // self.stride + 1 + OH = (H + 2 * self.padding - self.kernel_size) // self.stride + 1 + OW = (W + 2 * self.padding - self.kernel_size) // self.stride + 1 + + output = torch.empty(N, C, OD, OH, OW, device=x.device, dtype=x.dtype) + + grid = lambda META: ( + triton.cdiv(OW, META["BLOCK_OW"]), + N * C * OD * OH, + ) + + avg_pool3d_kernel[grid]( + x, + output, + C, + D, + H, + W, + OD, + OH, + OW, + x.stride(0), + x.stride(1), + x.stride(2), + x.stride(3), + output.stride(0), + output.stride(1), + output.stride(2), + output.stride(3), + KERNEL_SIZE=self.kernel_size, + STRIDE=self.stride, + PADDING=self.padding, + ) + + return output + + +batch_size = 16 +channels = 32 +depth = 128 +height = 128 +width = 256 +kernel_size = 3 +stride = 2 +padding = 1 + + +def get_inputs(): + x = torch.rand(batch_size, channels, depth, height, width, dtype=torch.bfloat16) + return [x] + + +def get_init_inputs(): + return [kernel_size, stride, padding] diff --git a/backends/triton/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py new file mode 100644 index 0000000..e5088dd --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py @@ -0,0 +1,103 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +def get_reduction_configs(): + return [ + triton.Config({"BLOCK_R": 64, "BLOCK_N": 128}, num_warps=4, num_stages=2), + ] + + +@triton.autotune( + configs=get_reduction_configs(), + key=["R", "C"], +) +@triton.jit +def sum_reduce_kernel( + x_ptr, + out_ptr, + B, + R, + C, + stride_xb, + stride_xr, + stride_xc: tl.constexpr, + stride_ob, + stride_oc, + BLOCK_R: tl.constexpr, + BLOCK_N: tl.constexpr, +): + pid_n = 
class Model(nn.Module):
    """Sum a 3-D tensor over its middle dimension, keeping that dimension.

    ``sum_reduce_kernel`` only implements the reduction over dimension 1 of a
    ``(B, R, C)`` tensor; the result has shape ``(B, 1, C)``.
    """

    def __init__(self, dim: int):
        """Remember the requested reduction dimension.

        Args:
            dim: Dimension to reduce. Only ``1`` (or its negative alias ``-2``
                for a 3-D input) is supported by ``forward``.
        """
        super().__init__()
        self.dim = dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` summed over dimension 1 with shape ``(B, 1, C)``.

        Raises:
            ValueError: If ``x`` is not 3-D or ``self.dim`` is not dimension 1.
        """
        if x.dim() != 3:
            raise ValueError(f"expected a 3-D input, got a {x.dim()}-D tensor")
        # The kernel always reduces the middle axis; refuse any other request
        # instead of silently returning a reduction over the wrong dimension.
        if self.dim not in (1, -2):
            raise ValueError(
                f"this kernel only supports reduction over dim=1, got dim={self.dim}"
            )

        x = x.contiguous()

        B, R, C = x.shape
        output = torch.empty((B, 1, C), device=x.device, dtype=x.dtype)

        # Axis 0 tiles the C columns, axis 1 enumerates the batch.
        grid = lambda META: (triton.cdiv(C, META["BLOCK_N"]), B)

        sum_reduce_kernel[grid](
            x,
            output,
            B,
            R,
            C,
            x.stride(0),
            x.stride(1),
            x.stride(2),
            output.stride(0),
            output.stride(2),
        )

        return output
@triton.autotune(
    configs=get_reduction_configs(),
    key=["R", "C"],
)
@triton.jit
def mean_reduce_kernel(
    x_ptr,
    out_ptr,
    B,
    R,
    C,
    stride_xb,
    stride_xr,
    stride_xc: tl.constexpr,
    stride_ob,
    stride_oc,
    BLOCK_R: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    """Average one tile of BLOCK_N columns of a (B, R, C) tensor over R.

    Program axis 0 selects the column tile and axis 1 the batch entry.
    Column sums are accumulated in float32, scaled by ``1 / R`` and stored as
    bfloat16. ``B`` is not referenced in the body.
    """
    pid_n = tl.program_id(0)
    pid_b = tl.program_id(1)

    # Widen before multiplying so the batch offset is computed in 64 bits.
    x_batch_offset = pid_b.to(tl.int64) * stride_xb
    col_start = pid_n * BLOCK_N

    acc = tl.zeros((BLOCK_N,), dtype=tl.float32)

    # 2-D (R, C) view of the current batch entry, read in
    # (BLOCK_R, BLOCK_N) tiles.
    x_desc = tl.make_tensor_descriptor(
        base=x_ptr + x_batch_offset,
        shape=(R, C),
        strides=(stride_xr, stride_xc),
        block_shape=(BLOCK_R, BLOCK_N),
    )
    # NOTE(review): rows past R in the final tile would be added into the sum,
    # so this relies on out-of-range descriptor loads reading as zero whenever
    # R is not a multiple of BLOCK_R -- TODO confirm for this backend.
    for off_r in range(0, R, BLOCK_R):
        tile = x_desc.load([off_r, col_start])
        acc += tl.sum(tile.to(tl.float32), axis=0)
    inv_dim = 1.0 / R
    result = acc * inv_dim

    offs_c = col_start + tl.arange(0, BLOCK_N)
    out_offset = pid_b.to(tl.int64) * stride_ob
    # Columns at or beyond C in the last tile are masked out of the store.
    tl.store(
        out_ptr + out_offset + offs_c * stride_oc,
        result.to(tl.bfloat16),
        mask=offs_c < C,
    )
@triton.autotune(
    configs=[
        triton.Config({"BLOCK_N": 64, "BLOCK_K": 64}, num_warps=8, num_stages=2),
    ],
    key=["DIM1", "DIM2"],
)
@triton.jit
def max_reduce_dim1_kernel(
    x_ptr,
    out_ptr,
    BATCH: tl.constexpr,
    DIM1: tl.constexpr,
    DIM2: tl.constexpr,
    stride_xb: tl.constexpr,
    stride_xd1: tl.constexpr,
    stride_xd2: tl.constexpr,
    stride_ob: tl.constexpr,
    stride_od2: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
):
    """Column-wise maximum over dimension 1 of a (BATCH, DIM1, DIM2) tensor.

    Program axis 0 picks the batch entry and axis 1 a tile of BLOCK_N
    columns. The running maximum is kept and stored in float32.
    """
    batch_id = tl.program_id(0)
    tile_id = tl.program_id(1)

    cols = tile_id * BLOCK_N + tl.arange(0, BLOCK_N)
    cols_ok = cols < DIM2

    # 64-bit batch offset, applied to the base pointer once.
    x_batch = x_ptr + batch_id.to(tl.int64) * stride_xb

    running = tl.full((BLOCK_N,), value=float("-inf"), dtype=tl.float32)

    for row0 in range(0, DIM1, BLOCK_K):
        rows = row0 + tl.arange(0, BLOCK_K)
        rows_ok = rows < DIM1

        # Masked-out lanes read as -inf so they never win the maximum.
        block = tl.load(
            x_batch + rows[:, None] * stride_xd1 + cols[None, :] * stride_xd2,
            mask=rows_ok[:, None] & cols_ok[None, :],
            other=float("-inf"),
        )
        running = tl.maximum(running, tl.max(block.to(tl.float32), axis=0))

    tl.store(
        out_ptr + batch_id.to(tl.int64) * stride_ob + cols * stride_od2,
        running,
        mask=cols_ok,
    )
@triton.autotune(
    configs=[
        triton.Config({"BLOCK_K": 512}, num_warps=4, num_stages=3),
    ],
    key=["K"],
)
@triton.jit
def _gemv_kernel(
    a_ptr,
    b_ptr,
    c_ptr,
    K,
    stride_am,
    BLOCK_K: tl.constexpr,
):
    """Compute one element of ``c = A @ b``; each program handles one row of A.

    Columns of A and elements of b are addressed with an implicit stride of
    one element. Products are accumulated per lane in float32, reduced once at
    the end, and stored as bfloat16.
    """
    row = tl.program_id(0)
    # Widen the row index first: ``row * stride_am`` in 32-bit arithmetic
    # overflows once A holds more than 2**31 - 1 elements.
    a_row_ptr = a_ptr + row.to(tl.int64) * stride_am

    acc = tl.zeros((BLOCK_K,), dtype=tl.float32)

    for k in range(0, K, BLOCK_K):
        offs_k = k + tl.arange(0, BLOCK_K)
        mask = offs_k < K
        # Out-of-range lanes of the last chunk contribute 0 to the dot product.
        a_vals = tl.load(a_row_ptr + offs_k, mask=mask, other=0.0)
        b_vals = tl.load(b_ptr + offs_k, mask=mask, other=0.0)
        acc += a_vals.to(tl.float32) * b_vals.to(tl.float32)

    result = tl.sum(acc)
    tl.store(c_ptr + row, result.to(tl.bfloat16))
@triton.jit
def swizzle_tile(
    tile_id,
    M,
    N,
    K,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """Map a linear tile id to a ``(row_tile, col_tile)`` pair in grouped order.

    Tiles are visited in groups of up to GROUP_SIZE_M row tiles; the last
    group is shortened to the row tiles that remain. ``K`` and ``BLOCK_K``
    are accepted but not used.
    """
    tiles_m = tl.cdiv(M, BLOCK_M)
    tiles_n = tl.cdiv(N, BLOCK_N)
    tiles_per_group = GROUP_SIZE_M * tiles_n

    group = tile_id // tiles_per_group
    first_row_tile = group * GROUP_SIZE_M
    rows_in_group = tl.minimum(GROUP_SIZE_M, tiles_m - first_row_tile)

    row_tile = first_row_tile + (tile_id % rows_in_group)
    col_tile = (tile_id % tiles_per_group) // rows_in_group
    return row_tile, col_tile
class Model(nn.Module):
    """Single 11x11 / stride-4 / pad-2 convolution run as an implicit GEMM.

    Weights are packed once into a ``(KH * KW * C_in, C_out)`` bfloat16 matrix
    whose row index is ``(kh * KW + kw) * C_in + c``, the ordering
    ``conv2d_implicit_gemm_kernel`` decodes.
    """

    def __init__(self, num_classes=1000):
        """Create the convolution layer; ``num_classes`` is accepted but unused."""
        super().__init__()
        self.conv1 = nn.Conv2d(
            in_channels=3, out_channels=96, kernel_size=11, stride=4, padding=2
        )
        self._packed = False

    def _pack_weights(self):
        """Snapshot weight and bias as bfloat16 in the layout the kernel reads.

        NOTE(review): the snapshot is taken once; later changes to
        ``conv1`` parameters are not picked up.
        """
        w = self.conv1.weight.data
        b = self.conv1.bias.data
        # (C_out, C_in, KH, KW) -> (KH, KW, C_in, C_out) -> (KH*KW*C_in, C_out)
        self.w_packed = (
            w.permute(2, 3, 1, 0)
            .contiguous()
            .reshape(-1, w.shape[0])
            .to(device=w.device, dtype=torch.bfloat16)
            .contiguous()
        )
        self.bias_packed = b.to(device=w.device, dtype=torch.bfloat16).contiguous()
        self._packed = True

    def forward(self, x):
        """Convolve an ``(N, C_in, H, W)`` input; returns bfloat16 ``(N, C_out, OH, OW)``.

        Raises:
            ValueError: If ``x`` is not 4-D or its channel count differs from
                ``conv1.in_channels``.
        """
        if x.dim() != 4:
            raise ValueError(f"expected a 4-D (N, C, H, W) input, got {x.dim()}-D")
        if x.shape[1] != self.conv1.in_channels:
            # A mismatch would make K disagree with the packed weight matrix.
            raise ValueError(
                f"expected {self.conv1.in_channels} input channels, got {x.shape[1]}"
            )

        if not self._packed:
            self._pack_weights()

        x = x.to(dtype=torch.bfloat16).contiguous()

        N_batch, C_in, H, W_in = x.shape
        C_out = self.w_packed.shape[1]
        # Read the geometry from the layer so it cannot drift from the weights.
        KH, KW = self.conv1.kernel_size
        stride_h, stride_w = self.conv1.stride
        pad_h, pad_w = self.conv1.padding
        OH = (H + 2 * pad_h - KH) // stride_h + 1
        OW = (W_in + 2 * pad_w - KW) // stride_w + 1

        # Implicit GEMM sizes: one row per output pixel, one column per channel.
        M = N_batch * OH * OW
        N = C_out
        K = C_in * KH * KW

        output = torch.empty(
            (N_batch, C_out, OH, OW), device=x.device, dtype=torch.bfloat16
        )

        grid = lambda META: (
            triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),
        )

        conv2d_implicit_gemm_kernel[grid](
            x,
            self.w_packed,
            self.bias_packed,
            output,
            M,
            N,
            K,
            OH,
            OW,
            H,
            W_in,
            stride_h,
            stride_w,
            pad_h,
            pad_w,
            x.stride(0),
            x.stride(1),
            x.stride(2),
            x.stride(3),
            output.stride(0),
            output.stride(1),
            output.stride(2),
            output.stride(3),
            self.w_packed.stride(0),
            self.w_packed.stride(1),
            KH=KH,
            KW=KW,
            C_IN=C_in,
        )

        return output
@triton.autotune(
    configs=[
        triton.Config({"BLOCK_N": 64, "grf_mode": "128"}, num_warps=4, num_stages=2),
    ],
    key=["D1", "D2"],
)
@triton.jit
def argmax_dim1_kernel(
    x_ptr,
    out_ptr,
    D1: tl.constexpr,
    D2: tl.constexpr,
    stride_b,
    stride_d1,
    stride_d2,
    stride_ob,
    stride_od2,
    BLOCK_N: tl.constexpr,
    grf_mode: tl.constexpr,
):
    """Index of the maximum along dimension 1 for one tile of BLOCK_N columns.

    Program axis 0 selects the column tile and axis 1 the batch entry.
    Values are compared in float32 and indices are written as int64.
    ``grf_mode`` is supplied by the autotune config but not read in the body.
    """
    pid_n = tl.program_id(0)
    pid_b = tl.program_id(1)

    col_start = pid_n * BLOCK_N
    cols = col_start + tl.arange(0, BLOCK_N)
    col_mask = cols < D2

    # Running best value / index per column; index 0 is the initial answer.
    max_val = tl.full((BLOCK_N,), -float("inf"), dtype=tl.float32)
    max_idx = tl.zeros((BLOCK_N,), dtype=tl.int32)

    # Batch and column offsets are widened to 64 bits before scaling.
    batch_offset = pid_b.to(tl.int64) * stride_b
    col_offsets = cols.to(tl.int64) * stride_d2
    base = x_ptr + batch_offset + col_offsets

    # One row of the tile per iteration.
    for k in tl.range(0, D1):
        val = tl.load(base + k * stride_d1, mask=col_mask, other=-float("inf")).to(
            tl.float32
        )
        # Strict ">" keeps the earliest index on ties.
        # NOTE(review): a NaN never compares greater, so NaN inputs are never
        # selected -- confirm this matches the reference argmax semantics.
        update = val > max_val
        max_val = tl.where(update, val, max_val)
        max_idx = tl.where(update, k, max_idx)

    out_ptrs = out_ptr + pid_b.to(tl.int64) * stride_ob + cols.to(tl.int64) * stride_od2
    tl.store(out_ptrs, max_idx.to(tl.int64), mask=col_mask)
get_init_inputs(): + return [1] diff --git a/backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py new file mode 100644 index 0000000..15dc776 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py @@ -0,0 +1,119 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_D2": 128, "BLOCK_K": 32, "warp_size": 16}, num_warps=4, num_stages=2 + ), + ], + key=["D1", "D2"], +) +@triton.jit +def argmin_kernel( + x_ptr, + out_ptr, + B, + D1, + D2, + stride_b, + stride_d1, + stride_d2, + out_stride_b, + out_stride_d2, + BLOCK_D2: tl.constexpr, + BLOCK_K: tl.constexpr, + warp_size: tl.constexpr, +): + pid = tl.program_id(0) + num_d2_blocks = tl.cdiv(D2, BLOCK_D2) + batch_idx = pid // num_d2_blocks + d2_block_idx = pid % num_d2_blocks + + d2_start = d2_block_idx * BLOCK_D2 + d2_offs = d2_start + tl.arange(0, BLOCK_D2) + d2_mask = d2_offs < D2 + + base = x_ptr + batch_idx.to(tl.int64) * stride_b + + min_val = tl.full([BLOCK_D2], float("inf"), dtype=tl.float32) + min_idx = tl.zeros([BLOCK_D2], dtype=tl.int32) + + k_offs_base = tl.arange(0, BLOCK_K) + + for k_start in tl.range(0, D1, BLOCK_K): + k_offs = k_start + k_offs_base + k_mask = k_offs < D1 + ptrs = ( + base + + k_offs[:, None].to(tl.int64) * stride_d1 + + d2_offs[None, :] * stride_d2 + ) + mask = k_mask[:, None] & d2_mask[None, :] + tile = tl.load(ptrs, mask=mask, other=float("inf")).to(tl.float32) + + tile_min = tl.min(tile, axis=0) + + update = tile_min < min_val + + k_indices = k_offs[:, None] + large_k = tl.full([1], D1, dtype=tl.int32) + k_masked = tl.where(tile == tile_min[None, :], k_indices, large_k) + 
@triton.autotune(
    configs=[
        triton.Config(
            {"BLOCK_D1": 64, "BLOCK_D2": 256, "warp_size": 16},
            num_warps=4,
            num_stages=6,
        ),
    ],
    key=["D1", "D2"],
)
@triton.jit
def min_reduction_kernel(
    x_ptr,
    out_ptr,
    B,
    D1,
    D2,
    stride_xb,
    stride_xd1,
    stride_xd2,
    stride_ob,
    stride_od2,
    BLOCK_D1: tl.constexpr,
    BLOCK_D2: tl.constexpr,
    warp_size: tl.constexpr,
):
    """Column-wise minimum over dimension 1 of a (B, D1, D2) tensor.

    Program axis 0 selects a tile of BLOCK_D2 columns and axis 1 the batch
    entry. The running minimum is kept in float32 and converted to the output
    buffer's element type on store. ``B`` and ``warp_size`` are not read in
    the body.
    """
    pid_d2 = tl.program_id(0)
    pid_b = tl.program_id(1)

    d2_start = pid_d2 * BLOCK_D2
    d2_offs = d2_start + tl.arange(0, BLOCK_D2)
    d2_mask = d2_offs < D2

    # 64-bit batch offset, applied to the base pointer once.
    batch_offset = pid_b.to(tl.int64) * stride_xb
    base = x_ptr + batch_offset

    acc = tl.full((BLOCK_D2,), value=float("inf"), dtype=tl.float32)

    for d1_start in range(0, D1, BLOCK_D1):
        d1_offs = d1_start + tl.arange(0, BLOCK_D1)
        mask = (d1_offs[:, None] < D1) & d2_mask[None, :]
        ptrs = base + d1_offs[:, None] * stride_xd1 + d2_offs[None, :] * stride_xd2
        # Masked-out lanes read as +inf so they never win the minimum.
        tile = tl.load(ptrs, mask=mask, other=float("inf")).to(tl.float32)
        tile_min = tl.min(tile, axis=0)
        acc = tl.minimum(acc, tile_min)

    out_ptrs = out_ptr + pid_b.to(tl.int64) * stride_ob + d2_offs * stride_od2
    # Convert straight to the output element type.  Going through a fixed
    # float16 first would round float32 results to half precision.
    tl.store(out_ptrs, acc.to(out_ptr.dtype.element_ty), mask=d2_mask)
representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +def _to_triple(x): + if isinstance(x, int): + return (x, x, x) + return tuple(x) + + +@triton.autotune( + configs=[ + triton.Config( + { + "BLOCK_M": 64, + "BLOCK_N": 64, + "BLOCK_K": 16, + "GROUP_M": 8, + "grf_mode": "256", + }, + num_warps=4, + num_stages=2, + ), + ], + key=["C_out", "K_FUSED"], +) +@triton.jit +def _conv3d_fused_k( + x_ptr, + w_ptr, + y_ptr, + N_batch, + C_out, + OD, + OH, + OW, + sxn, + sxd, + sxh, + sxw, + syn, + syd, + syh, + syw, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, + KD: tl.constexpr, + KH: tl.constexpr, + KW: tl.constexpr, + C_IN: tl.constexpr, + K_FUSED: tl.constexpr, + grf_mode: tl.constexpr, +): + pid = tl.program_id(0) + M_total = N_batch * OD * OH * OW + num_pid_m = tl.cdiv(M_total, BLOCK_M) + num_pid_n = tl.cdiv(C_out, BLOCK_N) + num_pid_in_group = GROUP_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_M + group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_M) + pid_in_group = pid % num_pid_in_group + pid_m = first_pid_m + (pid_in_group % group_size_m) + pid_n = pid_in_group // group_size_m + + offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_k = tl.arange(0, BLOCK_K) + + ohw = OH * OW + odhw = OD * ohw + n_idx = offs_m // odhw + rem = offs_m % odhw + od_idx = rem // ohw + rem2 = rem % ohw + oh_idx = rem2 // OW + ow_idx = rem2 % OW + + mask_m = offs_m < M_total + + x_spatial_base = ( + x_ptr + n_idx.to(tl.int64) * sxn + od_idx * sxd + oh_idx * sxh + ow_idx * sxw + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + + w_bp = tl.make_block_ptr( + base=w_ptr, + shape=(K_FUSED, C_out), + strides=(C_out, 1), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_K, BLOCK_N), + order=(1, 0), + ) + + for k0 in range(0, K_FUSED, BLOCK_K): + k_idx = k0 + offs_k + c_off = k_idx % C_IN + spatial_idx = k_idx // C_IN + kw_off = 
class Model(nn.Module):
    """Stride-1, unpadded 3-D convolution executed by ``_conv3d_fused_k``.

    Weights are packed once into a ``(KD * KH * KW * C_in, C_out)`` float16
    matrix whose row index is ``((kd * KH + kh) * KW + kw) * C_in + c``.

    NOTE(review): the result is returned in float16 regardless of the input
    dtype -- confirm callers expect that.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        dilation=1,
        groups=1,
        bias=False,
    ):
        """Create the weight (and optional bias) parameters.

        Raises:
            NotImplementedError: If ``stride``, ``padding``, ``dilation`` or
                ``groups`` differ from their defaults. ``forward`` computes
                output sizes as ``in - k + 1`` and has no support for them.
        """
        super().__init__()
        if (
            _to_triple(stride) != (1, 1, 1)
            or _to_triple(padding) != (0, 0, 0)
            or _to_triple(dilation) != (1, 1, 1)
            or groups != 1
        ):
            raise NotImplementedError(
                "only stride=1, padding=0, dilation=1, groups=1 are supported"
            )
        ks = _to_triple(kernel_size)
        self.weight = nn.Parameter(torch.empty(out_channels, in_channels, *ks))
        if bias:
            self.bias = nn.Parameter(torch.empty(out_channels))
        else:
            self.bias = None
        nn.init.kaiming_uniform_(self.weight, a=5**0.5)
        if self.bias is not None:
            fan_in = in_channels * ks[0] * ks[1] * ks[2]
            bound = 1 / fan_in**0.5
            nn.init.uniform_(self.bias, -bound, bound)
        self._packed = False

    def _pack_weights(self):
        """Snapshot the weights as float16 in the layout the kernel reads.

        NOTE(review): the snapshot is taken once; later changes to
        ``self.weight`` are not picked up.
        """
        w = self.weight.data
        C_out, C_in, KD, KH, KW = w.shape
        # (C_out, C_in, KD, KH, KW) -> (KD, KH, KW, C_in, C_out) -> 2-D matrix
        self.w_fused = (
            w.permute(2, 3, 4, 1, 0)
            .reshape(KD * KH * KW * C_in, C_out)
            .contiguous()
            .to(dtype=torch.float16)
        )
        self._KD, self._KH, self._KW = KD, KH, KW
        self._C_in = C_in
        self._C_out = C_out
        self._K_fused = KD * KH * KW * C_in
        self._packed = True

    def forward(self, x):
        """Convolve an ``(N, C_in, D, H, W)`` input; returns a float16 tensor.

        Raises:
            ValueError: If ``x`` is not 5-D or its channel count differs from
                the weights.
        """
        if not self._packed:
            self._pack_weights()

        if x.dim() != 5:
            raise ValueError(f"expected a 5-D (N, C, D, H, W) input, got {x.dim()}-D")
        B, C_in, D_in, H_in, W_in = x.shape
        if C_in != self._C_in:
            # The kernel decodes K with C_IN; a mismatch would pair the wrong
            # weights with the wrong inputs.
            raise ValueError(f"expected {self._C_in} input channels, got {C_in}")

        KD, KH, KW = self._KD, self._KH, self._KW
        C_out = self._C_out
        OD = D_in - KD + 1
        OH = H_in - KH + 1
        OW = W_in - KW + 1

        # Channels-last puts the channel axis innermost, which the kernel
        # assumes when it adds the channel index without a stride.
        x_fp16 = x.to(dtype=torch.float16)
        x_cl = x_fp16.contiguous(memory_format=torch.channels_last_3d)
        y = torch.empty(
            B,
            C_out,
            OD,
            OH,
            OW,
            device=x.device,
            dtype=torch.float16,
            memory_format=torch.channels_last_3d,
        )

        sx = x_cl.stride()
        sy = y.stride()

        M_total = B * OD * OH * OW
        grid = lambda meta: (
            triton.cdiv(M_total, meta["BLOCK_M"]) * triton.cdiv(C_out, meta["BLOCK_N"]),
        )

        _conv3d_fused_k[grid](
            x_cl,
            self.w_fused,
            y,
            B,
            C_out,
            OD,
            OH,
            OW,
            sx[0],
            sx[2],
            sx[3],
            sx[4],
            sy[0],
            sy[2],
            sy[3],
            sy[4],
            KD=KD,
            KH=KH,
            KW=KW,
            C_IN=C_in,
            K_FUSED=self._K_fused,
        )

        if self.bias is not None:
            y = y + self.bias.to(dtype=torch.float16).view(1, -1, 1, 1, 1)
        return y
class Model(nn.Module):
    """Stride-1, unpadded 2-D convolution run by ``_conv2d_spatial_bf16_prepack``.

    Inputs are converted to bfloat16 NHWC on every call and weights are packed
    once into ``(KH, KW, C_in, C_out)`` bfloat16. The result is a bfloat16
    ``(N, C_out, OH, OW)`` tensor in channels-last memory format.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = False,
    ):
        """Create the reference ``nn.Conv2d`` that owns the parameters.

        Raises:
            NotImplementedError: If ``stride``, ``padding``, ``dilation`` or
                ``groups`` differ from their defaults. ``forward`` computes
                output sizes as ``in - k + 1`` and has no support for them.
        """
        super().__init__()
        self.conv2d = nn.Conv2d(
            in_channels,
            out_channels,
            (kernel_size, kernel_size),
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )
        conv = self.conv2d
        if (
            conv.stride != (1, 1)
            or conv.padding not in ((0, 0), "valid")
            or conv.dilation != (1, 1)
            or conv.groups != 1
        ):
            raise NotImplementedError(
                "only stride=1, padding=0, dilation=1, groups=1 are supported"
            )
        self._packed = False

    def _pack_weights(self):
        """Snapshot the weights as bfloat16 in ``(KH, KW, C_in, C_out)`` order.

        NOTE(review): the snapshot is taken once; later changes to the
        ``conv2d`` weights are not picked up.
        """
        w = self.conv2d.weight.data.detach()
        self.w_hwio = w.permute(2, 3, 1, 0).contiguous().to(dtype=torch.bfloat16)
        self._packed = True

    def _to_nhwc_bf16(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` as a bfloat16 ``(N, H, W, C)`` view over NHWC storage.

        The conversion is redone on every call. Caching it by ``data_ptr()``
        would hand back stale data after an in-place update, or when a new
        tensor reuses the same address.
        """
        x_cl = x.to(dtype=torch.bfloat16).contiguous(memory_format=torch.channels_last)
        return x_cl.permute(0, 2, 3, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Convolve an ``(N, C_in, H, W)`` input.

        Raises:
            ValueError: If ``x`` is not 4-D or its channel count differs from
                ``conv2d.in_channels``.
        """
        if x.dim() != 4:
            raise ValueError(f"expected a 4-D (N, C, H, W) input, got {x.dim()}-D")
        if x.shape[1] != self.conv2d.in_channels:
            raise ValueError(
                f"expected {self.conv2d.in_channels} input channels, got {x.shape[1]}"
            )

        if not self._packed:
            self._pack_weights()

        x_nhwc = self._to_nhwc_bf16(x)

        N_batch, C_in, H, W = x.shape
        KH, KW = self.conv2d.kernel_size
        C_out = self.conv2d.out_channels
        OH = H - KH + 1
        OW = W - KW + 1

        y = torch.empty(
            (N_batch, C_out, OH, OW),
            device=x_nhwc.device,
            dtype=torch.bfloat16,
            memory_format=torch.channels_last,
        )
        # View over the same storage; the kernel writes through it into ``y``.
        y_nhwc = y.permute(0, 2, 3, 1)

        w = self.w_hwio
        swkh, swkw, swci, swco = w.stride()

        def grid(meta):
            # Axis 2 fuses the output-width tile and the output-channel tile.
            num_ow = triton.cdiv(OW, meta["BLOCK_OW"])
            num_n = triton.cdiv(C_out, meta["BLOCK_N"])
            return (N_batch, OH, num_ow * num_n)

        _conv2d_spatial_bf16_prepack[grid](
            x_nhwc,
            w,
            y_nhwc,
            N_batch,
            H,
            W,
            C_out,
            OH,
            OW,
            swkh,
            swkw,
            swci,
            swco,
            KH=KH,
            KW=KW,
            C_IN=C_in,
        )

        # The kernel takes no bias argument, so add the bias here when the
        # layer was created with one.
        if self.conv2d.bias is not None:
            y = y + self.conv2d.bias.detach().to(dtype=torch.bfloat16).view(1, -1, 1, 1)

        return y
@triton.autotune(
    configs=[
        triton.Config(
            {"BLOCK_OW": 64, "BLOCK_N": 64, "BLOCK_K": 64, "grf_mode": "128"},
            num_warps=4,
            num_stages=2,
        ),
    ],
    key=["H", "W", "C_IN", "C_out", "OH", "OW"],
)
@triton.jit
def _conv2d_spatial_tiled(
    x_ptr,
    w_ptr,
    y_ptr,
    N_batch,
    H,
    W,
    C_out,
    OH,
    OW,
    stride_wkh,
    stride_wkw,
    stride_wci,
    stride_wco,
    BLOCK_OW: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
    KH: tl.constexpr,
    KW: tl.constexpr,
    C_IN: tl.constexpr,
    grf_mode: tl.constexpr,
):
    """Compute a (BLOCK_OW, BLOCK_N) output tile of a stride-1, unpadded conv.

    Program axis 0 is the batch index, axis 1 the output row and axis 2 fuses
    the output-width tile with the output-channel tile. The input and output
    are addressed as 2-D row-major matrices with one row per pixel and one
    column per channel (strides ``(C_IN, 1)`` and ``(C_out, 1)``), i.e. the
    pointers are presumably densely packed NHWC buffers -- verify against the
    caller. ``N_batch`` and ``grf_mode`` are not read in the body.
    """
    n = tl.program_id(0)
    oh = tl.program_id(1)
    pid_ow_n = tl.program_id(2)

    # Split the fused axis: width tile varies fastest, channel tile slowest.
    num_ow_tiles = tl.cdiv(OW, BLOCK_OW)
    pid_ow = pid_ow_n % num_ow_tiles
    pid_n = pid_ow_n // num_ow_tiles

    ow0 = pid_ow * BLOCK_OW
    HW = H * W
    OHOW = OH * OW

    acc = tl.zeros((BLOCK_OW, BLOCK_N), dtype=tl.float32)

    # One GEMM per filter tap: (BLOCK_OW pixels x C_IN) @ (C_IN x BLOCK_N).
    for kh in range(KH):
        for kw in range(KW):
            # Flat pixel index of input position (n, oh + kh, ow0 + kw).
            x_row_start = n * HW + (oh + kh) * W + (ow0 + kw)
            # Pixels left in this input row from that position onwards.
            x_valid_rows = W - (ow0 + kw)

            # The row extent ends at the end of the current input row, so the
            # boundary check zero-fills lanes that would run into the next row.
            x_bp = tl.make_block_ptr(
                base=x_ptr,
                shape=(x_row_start + x_valid_rows, C_IN),
                strides=(C_IN, 1),
                offsets=(x_row_start, 0),
                block_shape=(BLOCK_OW, BLOCK_K),
                order=(1, 0),
            )

            # (C_IN, C_out) weight slice for this tap.
            w_bp = tl.make_block_ptr(
                base=w_ptr + kh * stride_wkh + kw * stride_wkw,
                shape=(C_IN, C_out),
                strides=(stride_wci, stride_wco),
                offsets=(0, pid_n * BLOCK_N),
                block_shape=(BLOCK_K, BLOCK_N),
                order=(1, 0),
            )

            # ``c0`` only counts channel chunks; the block pointers are
            # advanced explicitly at the end of each iteration.
            for c0 in range(0, C_IN, BLOCK_K):
                x_tile = tl.load(x_bp, boundary_check=(0, 1), padding_option="zero")
                w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero")
                acc = tl.dot(x_tile, w_tile, acc, input_precision="ieee")
                x_bp = tl.advance(x_bp, (0, BLOCK_K))
                w_bp = tl.advance(w_bp, (BLOCK_K, 0))

    # Flat pixel index of output position (n, oh, ow0); the row extent again
    # ends at the end of the output row so excess lanes are not written.
    y_row_start = n * OHOW + oh * OW + ow0
    y_valid_rows = OW - ow0
    y_bp = tl.make_block_ptr(
        base=y_ptr,
        shape=(y_row_start + y_valid_rows, C_out),
        strides=(C_out, 1),
        offsets=(y_row_start, pid_n * BLOCK_N),
        block_shape=(BLOCK_OW, BLOCK_N),
        order=(1, 0),
    )
    tl.store(y_bp, acc.to(tl.float16), boundary_check=(0, 1))
triton.cdiv(C_out, meta["BLOCK_N"]) + return (N, OH, num_ow * num_n) + + _conv2d_spatial_tiled[grid]( + x_nhwc, + w, + y_nhwc, + N, + H, + W, + C_out, + OH, + OW, + w.stride(0), + w.stride(1), + w.stride(2), + w.stride(3), + KH=KH, + KW=KW, + C_IN=C_in, + ) + + if input_dtype != torch.float16: + return y.to(dtype=input_dtype) + return y + + +batch_size = 8 +in_channels = 64 +out_channels = 128 +kernel_size = (5, 7) +height = 512 +width = 256 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, height, width) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size] diff --git a/backends/triton/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py new file mode 100644 index 0000000..1206eac --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py @@ -0,0 +1,229 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import torch.nn.functional as F +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config( + { + "BLOCK_OW": 64, + "BLOCK_N": 64, + "BLOCK_K": 64, + "GROUP_SIZE_M": 8, + "grf_mode": "256", + }, + num_warps=4, + num_stages=2, + ), + ], + key=["H", "W", "C_IN", "C_out", "OH", "OW"], +) +@triton.jit +def _conv_transpose2d_swizzled_v2( + x_ptr, + w_ptr, + y_ptr, + N_batch, + H, + W, + C_out, + OH, + OW, + stride_wkh, + stride_wkw, + stride_wci, + stride_wco, + BLOCK_OW: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + KH: tl.constexpr, + KW: tl.constexpr, + C_IN: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + grf_mode: tl.constexpr, +): + n = tl.program_id(1) + pid = tl.program_id(0) + + num_ow_tiles = 
tl.cdiv(OW, BLOCK_OW) + num_n_tiles = tl.cdiv(C_out, BLOCK_N) + num_m_tiles = OH * num_ow_tiles + + group_id = pid // (GROUP_SIZE_M * num_n_tiles) + first_m_in_group = group_id * GROUP_SIZE_M + group_size = min(num_m_tiles - first_m_in_group, GROUP_SIZE_M) + local_id = pid % (group_size * num_n_tiles) + m_local = local_id // num_n_tiles + pid_n = local_id % num_n_tiles + m_idx = first_m_in_group + m_local + + oh = m_idx // num_ow_tiles + pid_ow = m_idx % num_ow_tiles + ow0 = pid_ow * BLOCK_OW + + HW = H * W + OHOW = OH * OW + + acc = tl.zeros((BLOCK_OW, BLOCK_N), dtype=tl.float32) + + for kh in range(KH): + for kw in range(KW): + x_row_start = n * HW + (oh + kh) * W + (ow0 + kw) + x_valid_rows = W - (ow0 + kw) + + x_bp = tl.make_block_ptr( + base=x_ptr, + shape=(x_row_start + x_valid_rows, C_IN), + strides=(C_IN, 1), + offsets=(x_row_start, 0), + block_shape=(BLOCK_OW, BLOCK_K), + order=(1, 0), + ) + + w_bp = tl.make_block_ptr( + base=w_ptr + kh * stride_wkh + kw * stride_wkw, + shape=(C_IN, C_out), + strides=(stride_wci, stride_wco), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_K, BLOCK_N), + order=(1, 0), + ) + + for c0 in range(0, C_IN, BLOCK_K): + x_tile = tl.load(x_bp, boundary_check=(0, 1), padding_option="zero") + w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero") + acc = tl.dot(x_tile, w_tile, acc) + x_bp = tl.advance(x_bp, (0, BLOCK_K)) + w_bp = tl.advance(w_bp, (BLOCK_K, 0)) + + y_row_start = n * OHOW + oh * OW + ow0 + y_valid_rows = OW - ow0 + y_bp = tl.make_block_ptr( + base=y_ptr, + shape=(y_row_start + y_valid_rows, C_out), + strides=(C_out, 1), + offsets=(y_row_start, pid_n * BLOCK_N), + block_shape=(BLOCK_OW, BLOCK_N), + order=(1, 0), + ) + tl.store(y_bp, acc.to(tl.float16), boundary_check=(0, 1)) + + +class Model(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + groups=1, + bias=False, + ): + super().__init__() + self.conv_transpose2d = 
nn.ConvTranspose2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + bias=bias, + ) + self._packed = False + self._cached_x_ptr = None + + def _pack_weights(self): + weight = self.conv_transpose2d.weight.data.detach() + self.w_hwio = ( + weight.flip(2, 3).permute(2, 3, 0, 1).to(dtype=torch.float16).contiguous() + ) + self.KH = weight.shape[2] + self.KW = weight.shape[3] + self.C_in_val = weight.shape[0] + self.C_out_val = weight.shape[1] + self._packed = True + + def forward(self, x): + if not self._packed: + self._pack_weights() + + x = x.to(dtype=torch.float16) + N, C_in, H, W_in = x.shape + KH, KW = self.KH, self.KW + C_out = self.C_out_val + + if self._cached_x_ptr != x.data_ptr(): + x_pad = F.pad(x, (KW - 1, KW - 1, KH - 1, KH - 1)) + x_pad_cl = x_pad.contiguous(memory_format=torch.channels_last) + self._x_nhwc = x_pad_cl.permute(0, 2, 3, 1) + self._cached_x_ptr = x.data_ptr() + self._H_pad = H + 2 * (KH - 1) + self._W_pad = W_in + 2 * (KW - 1) + self._OH = H + KH - 1 + self._OW = W_in + KW - 1 + self._N = N + self._y = torch.empty( + (N, C_out, self._OH, self._OW), + device=x.device, + dtype=torch.float16, + memory_format=torch.channels_last, + ) + self._y_nhwc = self._y.permute(0, 2, 3, 1) + + OH, OW = self._OH, self._OW + H_pad, W_pad = self._H_pad, self._W_pad + + def grid(META): + num_ow = triton.cdiv(OW, META["BLOCK_OW"]) + num_n = triton.cdiv(C_out, META["BLOCK_N"]) + return (OH * num_ow * num_n, N) + + _conv_transpose2d_swizzled_v2[grid]( + self._x_nhwc, + self.w_hwio, + self._y_nhwc, + N, + H_pad, + W_pad, + C_out, + OH, + OW, + self.w_hwio.stride(0), + self.w_hwio.stride(1), + self.w_hwio.stride(2), + self.w_hwio.stride(3), + KH=KH, + KW=KW, + C_IN=C_in, + ) + + return self._y + + +batch_size = 8 +in_channels = 64 +out_channels = 64 +kernel_size = 3 +height = 1024 +width = 1024 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, height, width) + return 
[x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size] diff --git a/backends/triton/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py new file mode 100644 index 0000000..2c22089 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py @@ -0,0 +1,238 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_W": 16, "BLOCK_OC": 16}, num_warps=4, num_stages=3), + ], + key=["W_out", "C_out"], +) +@triton.jit +def _conv_transpose3d_v2( + x_ptr, + w_ptr, + out_ptr, + D_in: tl.constexpr, + H_in: tl.constexpr, + W_in: tl.constexpr, + C_out: tl.constexpr, + D_out: tl.constexpr, + H_out: tl.constexpr, + W_out: tl.constexpr, + sx_b, + sx_d, + sx_h, + sx_w, + so_b, + so_d, + so_h, + so_w, + BLOCK_W: tl.constexpr, + BLOCK_OC: tl.constexpr, + KD: tl.constexpr, + KH: tl.constexpr, + KW: tl.constexpr, + C_IN: tl.constexpr, + BLOCK_K: tl.constexpr, +): + pid_w = tl.program_id(0) + pid_bdh = tl.program_id(1) + pid_oc = tl.program_id(2) + + b = pid_bdh // (D_out * H_out) + rem = pid_bdh % (D_out * H_out) + d_out = rem // H_out + h_out = rem % H_out + + acc = tl.zeros((BLOCK_W, BLOCK_OC), dtype=tl.float32) + x_batch_base = x_ptr + b.to(tl.int64) * sx_b + + for kd in range(KD): + d_in = d_out - kd + d_ok = (d_in >= 0) & (d_in < D_in) + if d_ok: + for kh in range(KH): + h_in = h_out - kh + h_ok = (h_in >= 0) & (h_in < H_in) + if h_ok: + x_dh_base = x_batch_base + d_in * sx_d + h_in * sx_h + + for kw in range(KW): + w_in_start = pid_w * BLOCK_W - kw + kidx = kd * KH * KW + kh * 
KW + kw + + x_bp = tl.make_block_ptr( + base=x_dh_base, + shape=(W_in, C_IN), + strides=(sx_w, 1), + offsets=(w_in_start, 0), + block_shape=(BLOCK_W, BLOCK_K), + order=(1, 0), + ) + w_bp = tl.make_block_ptr( + base=w_ptr + kidx * C_IN * C_out, + shape=(C_IN, C_out), + strides=(C_out, 1), + offsets=(0, pid_oc * BLOCK_OC), + block_shape=(BLOCK_K, BLOCK_OC), + order=(1, 0), + ) + + for _c0 in range(0, C_IN, BLOCK_K): + x_tile = tl.load( + x_bp, boundary_check=(0, 1), padding_option="zero" + ) + w_tile = tl.load( + w_bp, boundary_check=(0, 1), padding_option="zero" + ) + acc += tl.dot(x_tile, w_tile) + x_bp = tl.advance(x_bp, (0, BLOCK_K)) + w_bp = tl.advance(w_bp, (BLOCK_K, 0)) + + out_bp = tl.make_block_ptr( + base=out_ptr + b.to(tl.int64) * so_b + d_out * so_d + h_out * so_h, + shape=(W_out, C_out), + strides=(so_w, 1), + offsets=(pid_w * BLOCK_W, pid_oc * BLOCK_OC), + block_shape=(BLOCK_W, BLOCK_OC), + order=(1, 0), + ) + tl.store(out_bp, acc.to(tl.float16), boundary_check=(0, 1)) + + +class Model(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=(1, 1, 1), + padding=(0, 0, 0), + output_padding=(0, 0, 0), + groups=1, + bias=False, + ): + super().__init__() + if isinstance(kernel_size, int): + kernel_size = (kernel_size, kernel_size, kernel_size) + self.deconv3d = nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + bias=bias, + ) + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self._packed = False + self._y_buf = None + + def _pack_weights(self): + w = self.deconv3d.weight.data + KD, KH, KW = self.kernel_size + self.weight_packed = ( + w.permute(2, 3, 4, 0, 1) + .reshape(KD * KH * KW, self.in_channels, self.out_channels) + .contiguous() + .to(dtype=torch.float16) + ) + self._packed = True + + def forward(self, x): + if not self._packed: + self._pack_weights() + + device = 
x.device + B, C_in, D_in, H_in, W_in = x.shape + KD, KH, KW = self.kernel_size + + D_out = D_in + KD - 1 + H_out = H_in + KH - 1 + W_out = W_in + KW - 1 + + x_cl = x.to(dtype=torch.float16).contiguous( + memory_format=torch.channels_last_3d + ) + + expected_shape = (B, self.out_channels, D_out, H_out, W_out) + if ( + self._y_buf is None + or self._y_buf.shape != expected_shape + or self._y_buf.device != device + ): + self._y_buf = torch.empty( + *expected_shape, + device=device, + dtype=torch.float16, + ).contiguous(memory_format=torch.channels_last_3d) + output = self._y_buf + + sx = x_cl.stride() + so = output.stride() + + BLOCK_K = 32 + + grid = lambda META: ( + triton.cdiv(W_out, META["BLOCK_W"]), + B * D_out * H_out, + triton.cdiv(self.out_channels, META["BLOCK_OC"]), + ) + + _conv_transpose3d_v2[grid]( + x_cl, + self.weight_packed, + output, + D_in, + H_in, + W_in, + self.out_channels, + D_out, + H_out, + W_out, + sx[0], + sx[2], + sx[3], + sx[4], + so[0], + so[2], + so[3], + so[4], + KD=KD, + KH=KH, + KW=KW, + C_IN=C_in, + BLOCK_K=BLOCK_K, + ) + + return output + + +batch_size = 16 +in_channels = 32 +out_channels = 16 +kernel_size = (3, 5, 7) +depth_in = 16 +height_in = 32 +width_in = 64 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, depth_in, height_in, width_in) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size] diff --git a/backends/triton/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py new file mode 100644 index 0000000..14a217f --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py @@ -0,0 +1,246 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import 
torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config( + { + "BLOCK_M": 64, + "BLOCK_N": 64, + "BLOCK_K": 16, + "GROUP_M": 8, + "grf_mode": "256", + }, + num_warps=4, + num_stages=2, + ), + ], + key=["C_out", "K_FUSED"], +) +@triton.jit +def _conv3d_fused_k( + x_ptr, + w_ptr, + y_ptr, + N_batch, + C_out, + OD, + OH, + OW, + sxn, + sxd, + sxh, + sxw, + syn, + syd, + syh, + syw, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, + KD: tl.constexpr, + KH: tl.constexpr, + KW: tl.constexpr, + C_IN: tl.constexpr, + K_FUSED: tl.constexpr, + grf_mode: tl.constexpr, +): + pid = tl.program_id(0) + M_total = N_batch * OD * OH * OW + num_pid_m = tl.cdiv(M_total, BLOCK_M) + num_pid_n = tl.cdiv(C_out, BLOCK_N) + num_pid_in_group = GROUP_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_M + group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_M) + pid_in_group = pid % num_pid_in_group + pid_m = first_pid_m + (pid_in_group % group_size_m) + pid_n = pid_in_group // group_size_m + + offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_k = tl.arange(0, BLOCK_K) + + ohw = OH * OW + odhw = OD * ohw + n_idx = offs_m // odhw + rem = offs_m % odhw + od_idx = rem // ohw + rem2 = rem % ohw + oh_idx = rem2 // OW + ow_idx = rem2 % OW + + mask_m = offs_m < M_total + + x_spatial_base = ( + x_ptr + n_idx.to(tl.int64) * sxn + od_idx * sxd + oh_idx * sxh + ow_idx * sxw + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + + w_bp = tl.make_block_ptr( + base=w_ptr, + shape=(K_FUSED, C_out), + strides=(C_out, 1), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_K, BLOCK_N), + order=(1, 0), + ) + + for k0 in range(0, K_FUSED, BLOCK_K): + k_idx = k0 + offs_k + c_off = k_idx % C_IN + spatial_idx = k_idx // C_IN + kw_off = spatial_idx % KW + kh_off = (spatial_idx // KW) % KH + kd_off = spatial_idx // (KW * KH) + k_input_off = kd_off 
* sxd + kh_off * sxh + kw_off * sxw + c_off + + x_tile = tl.load( + x_spatial_base[:, None] + k_input_off[None, :], + mask=mask_m[:, None] & (k_idx[None, :] < K_FUSED), + other=0.0, + ) + w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero") + acc = tl.dot(x_tile, w_tile, acc) + w_bp = tl.advance(w_bp, (BLOCK_K, 0)) + + offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + mask_n = offs_n < C_out + y_ptrs = ( + y_ptr + n_idx.to(tl.int64) * syn + od_idx * syd + oh_idx * syh + ow_idx * syw + ) + tl.store( + y_ptrs[:, None] + offs_n[None, :], + acc.to(tl.float16), + mask=mask_m[:, None] & mask_n[None, :], + ) + + +class Model(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False, + ): + super(Model, self).__init__() + ks = (kernel_size, kernel_size, 1) + self.weight = nn.Parameter(torch.empty(out_channels, in_channels, *ks)) + if bias: + self.bias = nn.Parameter(torch.empty(out_channels)) + else: + self.bias = None + nn.init.kaiming_uniform_(self.weight, a=5**0.5) + if self.bias is not None: + fan_in = in_channels * ks[0] * ks[1] * ks[2] + bound = 1 / fan_in**0.5 + nn.init.uniform_(self.bias, -bound, bound) + self._packed = False + + def _pack_weights(self): + w = self.weight.data + C_out, C_in, KD, KH, KW = w.shape + self.w_fused = ( + w.permute(2, 3, 4, 1, 0) + .reshape(KD * KH * KW * C_in, C_out) + .contiguous() + .to(dtype=torch.float16) + ) + self._KD, self._KH, self._KW = KD, KH, KW + self._C_in = C_in + self._C_out = C_out + self._K_fused = KD * KH * KW * C_in + self._packed = True + + def forward(self, x): + if not self._packed: + self._pack_weights() + + B, C_in, D_in, H_in, W_in = x.shape + KD, KH, KW = self._KD, self._KH, self._KW + C_out = self._C_out + OD = D_in - KD + 1 + OH = H_in - KH + 1 + OW = W_in - KW + 1 + + x_fp16 = x.to(dtype=torch.float16) + x_cl = x_fp16.contiguous(memory_format=torch.channels_last_3d) + y = torch.empty( + B, + C_out, + OD, 
+ OH, + OW, + device=x.device, + dtype=torch.float16, + memory_format=torch.channels_last_3d, + ) + + sx = x_cl.stride() + sy = y.stride() + + M_total = B * OD * OH * OW + grid = lambda meta: ( + triton.cdiv(M_total, meta["BLOCK_M"]) * triton.cdiv(C_out, meta["BLOCK_N"]), + ) + + _conv3d_fused_k[grid]( + x_cl, + self.w_fused, + y, + B, + C_out, + OD, + OH, + OW, + sx[0], + sx[2], + sx[3], + sx[4], + sy[0], + sy[2], + sy[3], + sy[4], + KD=KD, + KH=KH, + KW=KW, + C_IN=C_in, + K_FUSED=self._K_fused, + ) + + if self.bias is not None: + y = y + self.bias.view(1, -1, 1, 1, 1) + return y + + +batch_size = 16 +in_channels = 3 +out_channels = 64 +kernel_size = 3 +height = 256 +width = 256 +depth = 10 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, height, width, depth) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size] diff --git a/backends/triton/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py b/backends/triton/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py new file mode 100644 index 0000000..fe569a8 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py @@ -0,0 +1,63 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 1024 * 2}, num_warps=4, num_stages=2), + ], + key=["n_elements"], +) +@triton.jit +def scalar_mul_kernel( + input_ptr, + output_ptr, + scalar, + n_elements, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + x = tl.load(input_ptr + offsets, mask=mask) + result = x * scalar + tl.store(output_ptr + offsets, result.to(tl.bfloat16), mask=mask) + + +class 
Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, A: torch.Tensor, s) -> torch.Tensor: + A = A.contiguous() + output = torch.empty_like(A) + n_elements = A.numel() + if isinstance(s, torch.Tensor): + scalar_val = s.item() + else: + scalar_val = float(s) + grid = lambda META: (triton.cdiv(n_elements, META["BLOCK_SIZE"]),) + scalar_mul_kernel[grid](A, output, scalar_val, n_elements) + return output + + +M = 16384 * 4 +N = 4096 * 4 + + +def get_inputs(): + A = torch.rand(M, N, dtype=torch.bfloat16) + s = 3.14 + return [A, s] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py new file mode 100644 index 0000000..529f8ee --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py @@ -0,0 +1,252 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +def _to_triple(x): + if isinstance(x, int): + return (x, x, x) + return tuple(x) + + +@triton.autotune( + configs=[ + triton.Config( + { + "BLOCK_M": 64, + "BLOCK_N": 64, + "BLOCK_K": 16, + "GROUP_M": 8, + "grf_mode": "256", + }, + num_warps=4, + num_stages=2, + ), + ], + key=["C_out", "K_FUSED"], +) +@triton.jit +def _conv3d_fused_k( + x_ptr, + w_ptr, + y_ptr, + N_batch, + C_out, + OD, + OH, + OW, + sxn, + sxd, + sxh, + sxw, + syn, + syd, + syh, + syw, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, + KD: tl.constexpr, + KH: tl.constexpr, + KW: tl.constexpr, + C_IN: tl.constexpr, + K_FUSED: tl.constexpr, + grf_mode: tl.constexpr, +): + pid = 
tl.program_id(0) + M_total = N_batch * OD * OH * OW + num_pid_m = tl.cdiv(M_total, BLOCK_M) + num_pid_n = tl.cdiv(C_out, BLOCK_N) + num_pid_in_group = GROUP_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_M + group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_M) + pid_in_group = pid % num_pid_in_group + pid_m = first_pid_m + (pid_in_group % group_size_m) + pid_n = pid_in_group // group_size_m + + offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_k = tl.arange(0, BLOCK_K) + + ohw = OH * OW + odhw = OD * ohw + n_idx = offs_m // odhw + rem = offs_m % odhw + od_idx = rem // ohw + rem2 = rem % ohw + oh_idx = rem2 // OW + ow_idx = rem2 % OW + + mask_m = offs_m < M_total + + x_spatial_base = ( + x_ptr + n_idx.to(tl.int64) * sxn + od_idx * sxd + oh_idx * sxh + ow_idx * sxw + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + + w_bp = tl.make_block_ptr( + base=w_ptr, + shape=(K_FUSED, C_out), + strides=(C_out, 1), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_K, BLOCK_N), + order=(1, 0), + ) + + for k0 in range(0, K_FUSED, BLOCK_K): + k_idx = k0 + offs_k + c_off = k_idx % C_IN + spatial_idx = k_idx // C_IN + kw_off = spatial_idx % KW + kh_off = (spatial_idx // KW) % KH + kd_off = spatial_idx // (KW * KH) + k_input_off = kd_off * sxd + kh_off * sxh + kw_off * sxw + c_off + + x_tile = tl.load( + x_spatial_base[:, None] + k_input_off[None, :], + mask=mask_m[:, None] & (k_idx[None, :] < K_FUSED), + other=0.0, + ) + w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero") + acc = tl.dot(x_tile, w_tile, acc) + w_bp = tl.advance(w_bp, (BLOCK_K, 0)) + + offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + mask_n = offs_n < C_out + y_ptrs = ( + y_ptr + n_idx.to(tl.int64) * syn + od_idx * syd + oh_idx * syh + ow_idx * syw + ) + tl.store( + y_ptrs[:, None] + offs_n[None, :], + acc.to(tl.float16), + mask=mask_m[:, None] & mask_n[None, :], + ) + + +class Model(nn.Module): + def __init__( + self, + in_channels, + 
out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False, + ): + super(Model, self).__init__() + ks = _to_triple(kernel_size) + self.weight = nn.Parameter(torch.empty(out_channels, in_channels, *ks)) + if bias: + self.bias = nn.Parameter(torch.empty(out_channels)) + else: + self.bias = None + nn.init.kaiming_uniform_(self.weight, a=5**0.5) + if self.bias is not None: + fan_in = in_channels * ks[0] * ks[1] * ks[2] + bound = 1 / fan_in**0.5 + nn.init.uniform_(self.bias, -bound, bound) + self._packed = False + + def _pack_weights(self): + w = self.weight.data + C_out, C_in, KD, KH, KW = w.shape + self.w_fused = ( + w.permute(2, 3, 4, 1, 0) + .reshape(KD * KH * KW * C_in, C_out) + .contiguous() + .to(dtype=torch.float16) + ) + self._KD, self._KH, self._KW = KD, KH, KW + self._C_in = C_in + self._C_out = C_out + self._K_fused = KD * KH * KW * C_in + self._packed = True + + def forward(self, x): + if not self._packed: + self._pack_weights() + + B, C_in, D_in, H_in, W_in = x.shape + KD, KH, KW = self._KD, self._KH, self._KW + C_out = self._C_out + OD = D_in - KD + 1 + OH = H_in - KH + 1 + OW = W_in - KW + 1 + + x_fp16 = x.to(dtype=torch.float16) + x_cl = x_fp16.contiguous(memory_format=torch.channels_last_3d) + y = torch.empty( + B, + C_out, + OD, + OH, + OW, + device=x.device, + dtype=torch.float16, + memory_format=torch.channels_last_3d, + ) + + sx = x_cl.stride() + sy = y.stride() + + M_total = B * OD * OH * OW + grid = lambda meta: ( + triton.cdiv(M_total, meta["BLOCK_M"]) * triton.cdiv(C_out, meta["BLOCK_N"]), + ) + + _conv3d_fused_k[grid]( + x_cl, + self.w_fused, + y, + B, + C_out, + OD, + OH, + OW, + sx[0], + sx[2], + sx[3], + sx[4], + sy[0], + sy[2], + sy[3], + sy[4], + KD=KD, + KH=KH, + KW=KW, + C_IN=C_in, + K_FUSED=self._K_fused, + ) + + if self.bias is not None: + y = y + self.bias.view(1, -1, 1, 1, 1) + return y + + +batch_size = 16 +in_channels = 3 +out_channels = 64 +kernel_size = (3, 5, 7) +width = 64 +height = 64 +depth 
= 64 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, width, height, depth) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size] diff --git a/backends/triton/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py new file mode 100644 index 0000000..d7a5e6d --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py @@ -0,0 +1,243 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_OW": 64, "BLOCK_N": 64, "BLOCK_K": 16, "grf_mode": "256"}, + num_warps=4, + num_stages=3, + ), + ], + key=["D", "H", "W", "C_IN", "C_OUT", "OD", "OH", "OW"], +) +@triton.jit +def _conv3d_spatial_tiled( + x_ptr, + w_ptr, + y_ptr, + N_batch, + D, + H, + W, + OD, + OH, + OW, + sx_n, + sx_d, + sx_h, + sw_kd, + sw_kh, + sw_kw, + sw_ci, + sw_co, + sy_n, + sy_d, + sy_h, + BLOCK_OW: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + KD: tl.constexpr, + KH: tl.constexpr, + KW: tl.constexpr, + PAD: tl.constexpr, + C_IN: tl.constexpr, + C_OUT: tl.constexpr, + grf_mode: tl.constexpr, +): + n = tl.program_id(0) + pid_dh = tl.program_id(1) + pid_wn = tl.program_id(2) + + od = pid_dh // OH + oh = pid_dh % OH + + num_ow_tiles = tl.cdiv(OW, BLOCK_OW) + pid_ow = pid_wn % num_ow_tiles + pid_n = pid_wn // num_ow_tiles + ow0 = pid_ow * BLOCK_OW + + acc = tl.zeros((BLOCK_OW, BLOCK_N), dtype=tl.float32) + + x_n_base = x_ptr + n.to(tl.int64) * sx_n + + for kd in range(KD): + d_in = od + kd - PAD + d_ok = (d_in >= 0) & (d_in < D) + if d_ok: + for kh in range(KH): + h_in = oh + kh - PAD 
+ h_ok = (h_in >= 0) & (h_in < H) + if h_ok: + x_dh_base = x_n_base + d_in * sx_d + h_in * sx_h + + for kw in range(KW): + w_start = ow0 + kw - PAD + + x_bp = tl.make_block_ptr( + base=x_dh_base, + shape=(W, C_IN), + strides=(C_IN, 1), + offsets=(w_start, 0), + block_shape=(BLOCK_OW, BLOCK_K), + order=(1, 0), + ) + + w_bp = tl.make_block_ptr( + base=w_ptr + kd * sw_kd + kh * sw_kh + kw * sw_kw, + shape=(C_IN, C_OUT), + strides=(sw_ci, sw_co), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_K, BLOCK_N), + order=(1, 0), + ) + + for c0 in range(0, C_IN, BLOCK_K): + x_tile = tl.load( + x_bp, boundary_check=(0, 1), padding_option="zero" + ) + w_tile = tl.load( + w_bp, boundary_check=(0, 1), padding_option="zero" + ) + acc = tl.dot(x_tile, w_tile, acc) + x_bp = tl.advance(x_bp, (0, BLOCK_K)) + w_bp = tl.advance(w_bp, (BLOCK_K, 0)) + + y_dh_base = y_ptr + n.to(tl.int64) * sy_n + od * sy_d + oh * sy_h + y_valid = OW - ow0 + y_bp = tl.make_block_ptr( + base=y_dh_base, + shape=(y_valid, C_OUT), + strides=(C_OUT, 1), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_OW, BLOCK_N), + order=(1, 0), + ) + tl.store(y_bp, acc.to(tl.float16), boundary_check=(0, 1)) + + +class Model(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + padding: int = 0, + output_padding: int = 0, + groups: int = 1, + bias: bool = False, + ): + super(Model, self).__init__() + self.conv_transpose3d = nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size=(kernel_size, kernel_size, kernel_size), + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + bias=bias, + ) + self._packed = False + self._ks = kernel_size + self._pad = padding + + def _pack_weights(self): + w = self.conv_transpose3d.weight.data + self._w_dhwio = ( + w.flip(2, 3, 4).permute(2, 3, 4, 0, 1).to(dtype=torch.float16).contiguous() + ) + self._packed = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not 
self._packed: + self._pack_weights() + if x.dtype != torch.float16: + x = x.to(dtype=torch.float16) + + x = x.to(memory_format=torch.channels_last_3d) + x_ndhwc = x.permute(0, 2, 3, 4, 1) + + N_b, C_in, D, H, W = x.shape + C_out = self._w_dhwio.shape[4] + KS = self._ks + PAD = KS - 1 - self._pad + + OD = D + 2 * PAD - KS + 1 + OH = H + 2 * PAD - KS + 1 + OW = W + 2 * PAD - KS + 1 + + conv_out = torch.empty( + (N_b, C_out, OD, OH, OW), + device=x.device, + dtype=torch.float16, + memory_format=torch.channels_last_3d, + ) + conv_ndhwc = conv_out.permute(0, 2, 3, 4, 1) + + def conv_grid(meta): + return ( + N_b, + OD * OH, + triton.cdiv(OW, meta["BLOCK_OW"]) * triton.cdiv(C_out, meta["BLOCK_N"]), + ) + + _conv3d_spatial_tiled[conv_grid]( + x_ndhwc, + self._w_dhwio, + conv_ndhwc, + N_b, + D, + H, + W, + OD, + OH, + OW, + x_ndhwc.stride(0), + x_ndhwc.stride(1), + x_ndhwc.stride(2), + self._w_dhwio.stride(0), + self._w_dhwio.stride(1), + self._w_dhwio.stride(2), + self._w_dhwio.stride(3), + self._w_dhwio.stride(4), + conv_ndhwc.stride(0), + conv_ndhwc.stride(1), + conv_ndhwc.stride(2), + KD=KS, + KH=KS, + KW=KS, + PAD=PAD, + C_IN=C_in, + C_OUT=C_out, + ) + + return conv_out + + +batch_size = 8 +in_channels = 48 +out_channels = 48 +kernel_size = 3 +depth = 64 +height = 64 +width = 64 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, depth, height, width) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size] diff --git a/backends/triton/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py new file mode 100644 index 0000000..0c35718 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py @@ -0,0 +1,205 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# 
Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_OW": 64, "BLOCK_N": 64, "BLOCK_K": 64, "grf_mode": "128"}, + num_warps=4, + num_stages=2, + ), + ], + key=["H", "W", "C_IN", "C_out", "OH", "OW"], +) +@triton.jit +def _conv2d_spatial_tiled( + x_ptr, + w_ptr, + y_ptr, + N_batch, + H, + W, + C_out, + OH, + OW, + stride_wkh, + stride_wkw, + stride_wci, + stride_wco, + BLOCK_OW: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + KH: tl.constexpr, + KW: tl.constexpr, + C_IN: tl.constexpr, + grf_mode: tl.constexpr, +): + n = tl.program_id(0) + oh = tl.program_id(1) + pid_ow_n = tl.program_id(2) + + num_ow_tiles = tl.cdiv(OW, BLOCK_OW) + pid_ow = pid_ow_n % num_ow_tiles + pid_n = pid_ow_n // num_ow_tiles + + ow0 = pid_ow * BLOCK_OW + HW = H * W + OHOW = OH * OW + + acc = tl.zeros((BLOCK_OW, BLOCK_N), dtype=tl.float32) + + for kh in range(KH): + for kw in range(KW): + x_row_start = n * HW + (oh + kh) * W + (ow0 + kw) + x_valid_rows = W - (ow0 + kw) + + x_bp = tl.make_block_ptr( + base=x_ptr, + shape=(x_row_start + x_valid_rows, C_IN), + strides=(C_IN, 1), + offsets=(x_row_start, 0), + block_shape=(BLOCK_OW, BLOCK_K), + order=(1, 0), + ) + + w_bp = tl.make_block_ptr( + base=w_ptr + kh * stride_wkh + kw * stride_wkw, + shape=(C_IN, C_out), + strides=(stride_wci, stride_wco), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_K, BLOCK_N), + order=(1, 0), + ) + + for c0 in range(0, C_IN, BLOCK_K): + x_tile = tl.load(x_bp, boundary_check=(0, 1), padding_option="zero") + w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero") + acc = tl.dot(x_tile, w_tile, acc, input_precision="ieee") + x_bp = tl.advance(x_bp, (0, BLOCK_K)) + w_bp = tl.advance(w_bp, (BLOCK_K, 0)) + + y_row_start = n * OHOW + oh * OW + ow0 + y_valid_rows = OW - ow0 + y_bp = tl.make_block_ptr( + base=y_ptr, + shape=(y_row_start 
+ y_valid_rows, C_out), + strides=(C_out, 1), + offsets=(y_row_start, pid_n * BLOCK_N), + block_shape=(BLOCK_OW, BLOCK_N), + order=(1, 0), + ) + tl.store(y_bp, acc.to(tl.float16), boundary_check=(0, 1)) + + +class Model(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: tuple, + stride: tuple = (1, 1), + padding: tuple = (0, 0), + dilation: tuple = (1, 1), + groups: int = 1, + bias: bool = False, + ): + super(Model, self).__init__() + self.conv2d = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + ) + self._w_hwio = None + self._moved = False + + def _move_params_once(self): + w = self.conv2d.weight.data + self._w_hwio = w.permute(2, 3, 1, 0).contiguous() + if self._w_hwio.dtype != torch.float16: + self._w_hwio = self._w_hwio.to(dtype=torch.float16) + self._moved = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not self._moved: + self._move_params_once() + + input_dtype = x.dtype + if x.dtype != torch.float16: + x = x.to(dtype=torch.float16) + + x_cl = x.contiguous(memory_format=torch.channels_last) + x_nhwc = x_cl.permute(0, 2, 3, 1) + + N, C_in, H, W = x_cl.shape + KH, KW, _, C_out = self._w_hwio.shape + OH = H - KH + 1 + OW = W - KW + 1 + + y = torch.empty( + (N, C_out, OH, OW), + device=x.device, + dtype=torch.float16, + memory_format=torch.channels_last, + ) + y_nhwc = y.permute(0, 2, 3, 1) + + w = self._w_hwio + + def grid(meta): + num_ow = triton.cdiv(OW, meta["BLOCK_OW"]) + num_n = triton.cdiv(C_out, meta["BLOCK_N"]) + return (N, OH, num_ow * num_n) + + _conv2d_spatial_tiled[grid]( + x_nhwc, + w, + y_nhwc, + N, + H, + W, + C_out, + OH, + OW, + w.stride(0), + w.stride(1), + w.stride(2), + w.stride(3), + KH=KH, + KW=KW, + C_IN=C_in, + ) + + if input_dtype != torch.float16: + return y.to(dtype=input_dtype) + return y + + +batch_size = 8 +in_channels = 64 +out_channels = 128 +kernel_size = (5, 7) +height = 
512 +width = 256 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, height, width) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size] diff --git a/backends/triton/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py new file mode 100644 index 0000000..760a8fa --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py @@ -0,0 +1,201 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 16, "grf_mode": "128"}, + num_warps=4, + num_stages=2, + ), + ], + key=["M_total", "C_out", "C_IN"], +) +@triton.jit +def _conv2d_flat_kernel( + x_ptr, + w_ptr, + y_ptr, + M_total, + C_out, + OH, + OW, + stride_xn, + stride_xh, + stride_xw, + stride_wkh, + stride_wkw, + stride_wci, + stride_wco, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + KH: tl.constexpr, + KW: tl.constexpr, + C_IN: tl.constexpr, + grf_mode: tl.constexpr, +): + pid_m = tl.program_id(0) + pid_n = tl.program_id(1) + + nhw = OH * OW + offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_k = tl.arange(0, BLOCK_K) + + n_idx = offs_m // nhw + rem = offs_m % nhw + oh_idx = rem // OW + ow_idx = rem % OW + + mask_m = offs_m < M_total + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + + x_base = x_ptr + n_idx * stride_xn + oh_idx * stride_xh + ow_idx * stride_xw + + for kh in range(KH): + for kw in range(KW): + x_kh_kw = x_base + kh * stride_xh + kw * stride_xw + + w_bp = tl.make_block_ptr( + base=w_ptr + kh * stride_wkh + kw * stride_wkw, + 
shape=(C_IN, C_out), + strides=(stride_wci, stride_wco), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_K, BLOCK_N), + order=(1, 0), + ) + + for c0 in range(0, C_IN, BLOCK_K): + k_idx = c0 + offs_k + + x_tile = tl.load( + x_kh_kw[:, None] + k_idx[None, :], + mask=mask_m[:, None] & (k_idx[None, :] < C_IN), + other=0.0, + ) + + w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero") + acc = tl.dot(x_tile, w_tile, acc) + w_bp = tl.advance(w_bp, (BLOCK_K, 0)) + + y_bp = tl.make_block_ptr( + base=y_ptr, + shape=(M_total, C_out), + strides=(C_out, 1), + offsets=(pid_m * BLOCK_M, pid_n * BLOCK_N), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0), + ) + tl.store(y_bp, acc.to(tl.float16), boundary_check=(0, 1)) + + +class Model(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False, + ): + super(Model, self).__init__() + self.conv2d = nn.Conv2d( + in_channels, + out_channels, + (kernel_size, kernel_size), + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + ) + self._packed = False + self._y_buf = None + + def _pack_weights(self): + w = self.conv2d.weight.data + self.w_hwio = w.to(dtype=torch.float16).permute(2, 3, 1, 0).contiguous() + self._packed = True + + def forward(self, x): + if not self._packed: + self._pack_weights() + + x = x.to(dtype=torch.float16) + + N_batch, C_in, H, W_dim = x.shape + KH, KW = self.conv2d.kernel_size + C_out = self.conv2d.out_channels + OH = H - KH + 1 + OW = W_dim - KW + 1 + M_total = N_batch * OH * OW + + x_cl = x.contiguous(memory_format=torch.channels_last) + x_nhwc = x_cl.permute(0, 2, 3, 1) + + if self._y_buf is None or self._y_buf.shape != (N_batch, C_out, OH, OW): + self._y_buf = torch.empty( + (N_batch, C_out, OH, OW), + device=x.device, + dtype=torch.float16, + memory_format=torch.channels_last, + ) + y = self._y_buf + y_nhwc = y.permute(0, 2, 3, 1) + + grid = lambda META: ( + 
triton.cdiv(M_total, META["BLOCK_M"]), + triton.cdiv(C_out, META["BLOCK_N"]), + ) + + _conv2d_flat_kernel[grid]( + x_nhwc, + self.w_hwio, + y_nhwc, + M_total, + C_out, + OH, + OW, + x_nhwc.stride(0), + x_nhwc.stride(1), + x_nhwc.stride(2), + self.w_hwio.stride(0), + self.w_hwio.stride(1), + self.w_hwio.stride(2), + self.w_hwio.stride(3), + KH=KH, + KW=KW, + C_IN=C_in, + ) + + return y + + +batch_size = 16 +in_channels = 16 +out_channels = 128 +kernel_size = 3 +width = 1024 +height = 1024 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, height, width) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size] diff --git a/backends/triton/cpu/KernelBench/level1/64_conv_transposed_1D.py b/backends/triton/cpu/KernelBench/level1/64_conv_transposed_1D.py new file mode 100644 index 0000000..c9f1851 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/64_conv_transposed_1D.py @@ -0,0 +1,195 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16, "GROUP_M": 8}, + num_warps=4, + num_stages=2, + ), + ], + key=["C_IN", "C_out", "OL"], +) +@triton.jit +def _conv_transpose1d_gemm( + x_ptr, + w_ptr, + y_ptr, + N_batch, + L_in, + C_out, + OL, + sxn, + sxl, + swk, + swci, + swco, + syn, + syl, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, + K_SIZE: tl.constexpr, + C_IN: tl.constexpr, +): + pid = tl.program_id(0) + M_total = N_batch * OL + num_pid_m = tl.cdiv(M_total, BLOCK_M) + num_pid_n = tl.cdiv(C_out, BLOCK_N) + num_pid_in_group = GROUP_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_M + group_size_m = 
tl.minimum(num_pid_m - first_pid_m, GROUP_M) + pid_in_group = pid % num_pid_in_group + pid_m = first_pid_m + (pid_in_group % group_size_m) + pid_n = pid_in_group // group_size_m + + offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + offs_k = tl.arange(0, BLOCK_K) + + n_idx = offs_m // OL + ol_idx = offs_m % OL + + mask_m = offs_m < M_total + mask_n = offs_n < C_out + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + x_base = x_ptr + n_idx * sxn + + for k in range(K_SIZE): + il = ol_idx - k + valid = (il >= 0) & (il < L_in) & mask_m + x_ptrs = x_base + il * sxl + + w_bp = tl.make_block_ptr( + base=w_ptr + k * swk, + shape=(C_IN, C_out), + strides=(swci, swco), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_K, BLOCK_N), + order=(1, 0), + ) + + for c0 in range(0, C_IN, BLOCK_K): + k_idx = c0 + offs_k + x_tile = tl.load( + x_ptrs[:, None] + k_idx[None, :], + mask=valid[:, None] & (k_idx[None, :] < C_IN), + other=0.0, + ) + w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero") + acc = tl.dot(x_tile, w_tile, acc) + w_bp = tl.advance(w_bp, (BLOCK_K, 0)) + + y_ptrs = y_ptr + n_idx * syn + ol_idx * syl + tl.store( + y_ptrs[:, None] + offs_n[None, :], + acc.to(y_ptr.dtype.element_ty), + mask=mask_m[:, None] & mask_n[None, :], + ) + + +class Model(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + groups=1, + bias=False, + ): + super(Model, self).__init__() + self.conv1d_transpose = nn.ConvTranspose1d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + bias=bias, + ) + self._packed = False + + def _pack_weights(self): + self._w_kio = self.conv1d_transpose.weight.permute(2, 0, 1).contiguous() + self._K = self.conv1d_transpose.weight.shape[2] + self._C_out = self.conv1d_transpose.weight.shape[1] + self._packed = True + + def forward(self, x): 
+ if not self._packed: + self._pack_weights() + + B, C_in, L_in = x.shape + K = self._K + C_out = self._C_out + OL = L_in + K - 1 + + x_nlc = x.permute(0, 2, 1).contiguous() + w_kio = self._w_kio + y_nlc = torch.empty((B, OL, C_out), device=x.device, dtype=x.dtype) + + sxn, sxl, _ = x_nlc.stride() + swk, swci, swco = w_kio.stride() + syn, syl, _ = y_nlc.stride() + + M_total = B * OL + grid = lambda meta: ( + triton.cdiv(M_total, meta["BLOCK_M"]) * triton.cdiv(C_out, meta["BLOCK_N"]), + ) + + _conv_transpose1d_gemm[grid]( + x_nlc, + w_kio, + y_nlc, + B, + L_in, + C_out, + OL, + sxn, + sxl, + swk, + swci, + swco, + syn, + syl, + K_SIZE=K, + C_IN=C_in, + ) + + result = y_nlc.permute(0, 2, 1).contiguous() + if self.conv1d_transpose.bias is not None: + result = result + self.conv1d_transpose.bias.view(1, -1, 1) + return result + + +batch_size = 64 +in_channels = 128 +out_channels = 128 +kernel_size = 3 +length = 65536 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, length) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size] diff --git a/backends/triton/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py new file mode 100644 index 0000000..81d1992 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py @@ -0,0 +1,229 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import torch.nn.functional as F +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config( + { + "BLOCK_OW": 64, + "BLOCK_N": 64, + "BLOCK_K": 64, + "GROUP_SIZE_M": 8, + "grf_mode": "256", + }, + num_warps=4, + num_stages=2, + ), + ], + 
key=["H", "W", "C_IN", "C_out", "OH", "OW"], +) +@triton.jit +def _conv_transpose2d_swizzled_v2( + x_ptr, + w_ptr, + y_ptr, + N_batch, + H, + W, + C_out, + OH, + OW, + stride_wkh, + stride_wkw, + stride_wci, + stride_wco, + BLOCK_OW: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + KH: tl.constexpr, + KW: tl.constexpr, + C_IN: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + grf_mode: tl.constexpr, +): + n = tl.program_id(1) + pid = tl.program_id(0) + + num_ow_tiles = tl.cdiv(OW, BLOCK_OW) + num_n_tiles = tl.cdiv(C_out, BLOCK_N) + num_m_tiles = OH * num_ow_tiles + + group_id = pid // (GROUP_SIZE_M * num_n_tiles) + first_m_in_group = group_id * GROUP_SIZE_M + group_size = min(num_m_tiles - first_m_in_group, GROUP_SIZE_M) + local_id = pid % (group_size * num_n_tiles) + m_local = local_id // num_n_tiles + pid_n = local_id % num_n_tiles + m_idx = first_m_in_group + m_local + + oh = m_idx // num_ow_tiles + pid_ow = m_idx % num_ow_tiles + ow0 = pid_ow * BLOCK_OW + + HW = H * W + OHOW = OH * OW + + acc = tl.zeros((BLOCK_OW, BLOCK_N), dtype=tl.float32) + + for kh in range(KH): + for kw in range(KW): + x_row_start = n * HW + (oh + kh) * W + (ow0 + kw) + x_valid_rows = W - (ow0 + kw) + + x_bp = tl.make_block_ptr( + base=x_ptr, + shape=(x_row_start + x_valid_rows, C_IN), + strides=(C_IN, 1), + offsets=(x_row_start, 0), + block_shape=(BLOCK_OW, BLOCK_K), + order=(1, 0), + ) + + w_bp = tl.make_block_ptr( + base=w_ptr + kh * stride_wkh + kw * stride_wkw, + shape=(C_IN, C_out), + strides=(stride_wci, stride_wco), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_K, BLOCK_N), + order=(1, 0), + ) + + for c0 in range(0, C_IN, BLOCK_K): + x_tile = tl.load(x_bp, boundary_check=(0, 1), padding_option="zero") + w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero") + acc = tl.dot(x_tile, w_tile, acc) + x_bp = tl.advance(x_bp, (0, BLOCK_K)) + w_bp = tl.advance(w_bp, (BLOCK_K, 0)) + + y_row_start = n * OHOW + oh * OW + ow0 + y_valid_rows = OW - ow0 + y_bp = 
tl.make_block_ptr( + base=y_ptr, + shape=(y_row_start + y_valid_rows, C_out), + strides=(C_out, 1), + offsets=(y_row_start, pid_n * BLOCK_N), + block_shape=(BLOCK_OW, BLOCK_N), + order=(1, 0), + ) + tl.store(y_bp, acc.to(tl.float16), boundary_check=(0, 1)) + + +class Model(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + groups=1, + bias=False, + ): + super().__init__() + self.conv_transpose2d = nn.ConvTranspose2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + bias=bias, + ) + self._packed = False + self._cached_x_ptr = None + + def _pack_weights(self): + weight = self.conv_transpose2d.weight.data.detach() + self.w_hwio = ( + weight.flip(2, 3).permute(2, 3, 0, 1).to(dtype=torch.float16).contiguous() + ) + self.KH = weight.shape[2] + self.KW = weight.shape[3] + self.C_in_val = weight.shape[0] + self.C_out_val = weight.shape[1] + self._packed = True + + def forward(self, x): + if not self._packed: + self._pack_weights() + + x = x.to(dtype=torch.float16) + N, C_in, H, W_in = x.shape + KH, KW = self.KH, self.KW + C_out = self.C_out_val + + if self._cached_x_ptr != x.data_ptr(): + x_pad = F.pad(x, (KW - 1, KW - 1, KH - 1, KH - 1)) + x_pad_cl = x_pad.contiguous(memory_format=torch.channels_last) + self._x_nhwc = x_pad_cl.permute(0, 2, 3, 1) + self._cached_x_ptr = x.data_ptr() + self._H_pad = H + 2 * (KH - 1) + self._W_pad = W_in + 2 * (KW - 1) + self._OH = H + KH - 1 + self._OW = W_in + KW - 1 + self._N = N + self._y = torch.empty( + (N, C_out, self._OH, self._OW), + device=x.device, + dtype=torch.float16, + memory_format=torch.channels_last, + ) + self._y_nhwc = self._y.permute(0, 2, 3, 1) + + OH, OW = self._OH, self._OW + H_pad, W_pad = self._H_pad, self._W_pad + + def grid(META): + num_ow = triton.cdiv(OW, META["BLOCK_OW"]) + num_n = triton.cdiv(C_out, META["BLOCK_N"]) + return (OH * num_ow * num_n, N) 
+ + _conv_transpose2d_swizzled_v2[grid]( + self._x_nhwc, + self.w_hwio, + self._y_nhwc, + N, + H_pad, + W_pad, + C_out, + OH, + OW, + self.w_hwio.stride(0), + self.w_hwio.stride(1), + self.w_hwio.stride(2), + self.w_hwio.stride(3), + KH=KH, + KW=KW, + C_IN=C_in, + ) + + return self._y + + +batch_size = 8 +in_channels = 64 +out_channels = 64 +kernel_size = (3, 7) +width = 512 +height = 512 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, height, width) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size] diff --git a/backends/triton/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py new file mode 100644 index 0000000..656518f --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py @@ -0,0 +1,242 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +def _to_triple(x): + if isinstance(x, int): + return (x, x, x) + return tuple(x) + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16, "grf_mode": "256"}, + num_warps=4, + num_stages=2, + ), + ], + key=["C_out", "K_FUSED"], +) +@triton.jit +def _conv3d_fused_k( + x_ptr, + w_ptr, + y_ptr, + N_batch, + C_out, + OD, + OH, + OW, + sxn, + sxd, + sxh, + sxw, + syn, + syd, + syh, + syw, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + KD: tl.constexpr, + KH: tl.constexpr, + KW: tl.constexpr, + C_IN: tl.constexpr, + K_FUSED: tl.constexpr, + grf_mode: tl.constexpr, +): + pid_m = tl.program_id(0) + pid_n = tl.program_id(1) + + M_total = N_batch * OD * OH * OW + offs_m = pid_m * 
BLOCK_M + tl.arange(0, BLOCK_M) + offs_k = tl.arange(0, BLOCK_K) + + ohw = OH * OW + odhw = OD * ohw + n_idx = offs_m // odhw + rem = offs_m % odhw + od_idx = rem // ohw + rem2 = rem % ohw + oh_idx = rem2 // OW + ow_idx = rem2 % OW + + mask_m = offs_m < M_total + + x_spatial_base = ( + x_ptr + n_idx.to(tl.int64) * sxn + od_idx * sxd + oh_idx * sxh + ow_idx * sxw + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + + w_bp = tl.make_block_ptr( + base=w_ptr, + shape=(K_FUSED, C_out), + strides=(C_out, 1), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_K, BLOCK_N), + order=(1, 0), + ) + + for k0 in range(0, K_FUSED, BLOCK_K): + k_idx = k0 + offs_k + c_off = k_idx % C_IN + spatial_idx = k_idx // C_IN + kw_off = spatial_idx % KW + kh_off = (spatial_idx // KW) % KH + kd_off = spatial_idx // (KW * KH) + k_input_off = kd_off * sxd + kh_off * sxh + kw_off * sxw + c_off + + x_tile = tl.load( + x_spatial_base[:, None] + k_input_off[None, :], + mask=mask_m[:, None] & (k_idx[None, :] < K_FUSED), + other=0.0, + ) + w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero") + acc = tl.dot(x_tile, w_tile, acc) + w_bp = tl.advance(w_bp, (BLOCK_K, 0)) + + offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + mask_n = offs_n < C_out + y_ptrs = ( + y_ptr + n_idx.to(tl.int64) * syn + od_idx * syd + oh_idx * syh + ow_idx * syw + ) + tl.store( + y_ptrs[:, None] + offs_n[None, :], + acc.to(tl.float16), + mask=mask_m[:, None] & mask_n[None, :], + ) + + +class Model(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=(1, 1, 1), + padding=(0, 0, 0), + dilation=(1, 1, 1), + groups=1, + bias=False, + ): + super(Model, self).__init__() + ks = _to_triple(kernel_size) + self.weight = nn.Parameter(torch.empty(out_channels, in_channels, *ks)) + if bias: + self.bias = nn.Parameter(torch.empty(out_channels)) + else: + self.bias = None + nn.init.kaiming_uniform_(self.weight, a=5**0.5) + if self.bias is not None: + fan_in = in_channels * 
ks[0] * ks[1] * ks[2] + bound = 1 / fan_in**0.5 + nn.init.uniform_(self.bias, -bound, bound) + self._packed = False + self._y_buf = None + + def _pack_weights(self): + w = self.weight.data + C_out, C_in, KD, KH, KW = w.shape + self.w_fused = ( + w.permute(2, 3, 4, 1, 0) + .reshape(KD * KH * KW * C_in, C_out) + .contiguous() + .to(dtype=torch.float16) + ) + self._KD, self._KH, self._KW = KD, KH, KW + self._C_in = C_in + self._C_out = C_out + self._K_fused = KD * KH * KW * C_in + self._packed = True + + def forward(self, x): + if not self._packed: + self._pack_weights() + + B, C_in, D_in, H_in, W_in = x.shape + KD, KH, KW = self._KD, self._KH, self._KW + C_out = self._C_out + OD = D_in - KD + 1 + OH = H_in - KH + 1 + OW = W_in - KW + 1 + + x_fp16 = x.to(dtype=torch.float16) + x_cl = x_fp16.contiguous(memory_format=torch.channels_last_3d) + + if self._y_buf is None or self._y_buf.shape != (B, C_out, OD, OH, OW): + self._y_buf = torch.empty( + B, + C_out, + OD, + OH, + OW, + device=x.device, + dtype=torch.float16, + memory_format=torch.channels_last_3d, + ) + y = self._y_buf + + sx = x_cl.stride() + sy = y.stride() + + M_total = B * OD * OH * OW + grid = lambda meta: ( + triton.cdiv(M_total, meta["BLOCK_M"]), + triton.cdiv(C_out, meta["BLOCK_N"]), + ) + + _conv3d_fused_k[grid]( + x_cl, + self.w_fused, + y, + B, + C_out, + OD, + OH, + OW, + sx[0], + sx[2], + sx[3], + sx[4], + sy[0], + sy[2], + sy[3], + sy[4], + KD=KD, + KH=KH, + KW=KW, + C_IN=C_in, + K_FUSED=self._K_fused, + ) + + if self.bias is not None: + y = y + self.bias.to(dtype=torch.float16).view(1, -1, 1, 1, 1) + return y + + +batch_size = 8 +in_channels = 3 +out_channels = 64 +kernel_size = (3, 5, 7) +depth = 16 +height = 128 +width = 128 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, depth, height, width) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size] diff --git a/backends/triton/cpu/KernelBench/level1/67_conv_standard_1D.py 
b/backends/triton/cpu/KernelBench/level1/67_conv_standard_1D.py new file mode 100644 index 0000000..7abf077 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/67_conv_standard_1D.py @@ -0,0 +1,167 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_OL": 64, "BLOCK_N": 128}, num_warps=4, num_stages=3), + ], + key=["OL", "OC"], +) +@triton.jit +def _conv1d_kernel( + x_ptr, + w_ptr, + y_ptr, + OL, + OC, + stride_xn, + stride_xc, + stride_wk, + stride_wi, + stride_wo, + stride_yn, + stride_yoc, + C_IN: tl.constexpr, + BLOCK_K: tl.constexpr, + KERNEL_SIZE: tl.constexpr, + BLOCK_OL: tl.constexpr, + BLOCK_N: tl.constexpr, +): + n = tl.program_id(0) + pid_ol = tl.program_id(1) + pid_n = tl.program_id(2) + + ol_start = pid_ol * BLOCK_OL + offs_ol = ol_start + tl.arange(0, BLOCK_OL) + offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + + mask_ol = offs_ol < OL + mask_n = offs_n < OC + + acc = tl.zeros((BLOCK_OL, BLOCK_N), dtype=tl.float32) + + offs_k = tl.arange(0, BLOCK_K) + x_batch = x_ptr + n.to(tl.int64) * stride_xn + + for k in range(KERNEL_SIZE): + in_pos = offs_ol + k + + x_addrs = x_batch + offs_k[None, :].to(tl.int64) * stride_xc + in_pos[:, None] + x_tile = tl.load( + x_addrs, mask=mask_ol[:, None] & (offs_k[None, :] < C_IN), other=0.0 + ) + + w_bp = tl.make_block_ptr( + base=w_ptr + k * stride_wk, + shape=(C_IN, OC), + strides=(stride_wi, stride_wo), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_K, BLOCK_N), + order=(1, 0), + ) + w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero") + + acc = tl.dot(x_tile, w_tile, acc) + + y_batch = y_ptr + n.to(tl.int64) * stride_yn + y_addrs = y_batch + offs_n[None, :].to(tl.int64) * stride_yoc + offs_ol[:, 
None] + tl.store(y_addrs, acc.to(tl.float16), mask=mask_ol[:, None] & mask_n[None, :]) + + +class Model(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False, + ): + super(Model, self).__init__() + self.conv1d = nn.Conv1d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + ) + self._packed = False + + def _pack_weights(self): + w = self.conv1d.weight.data + self.w_kio = w.permute(2, 1, 0).contiguous().to(dtype=torch.float16) + self._packed = True + + def forward(self, x): + if not self._packed: + self._pack_weights() + + x = x.to(dtype=torch.float16) + if not x.is_contiguous(): + x = x.contiguous() + + B, C_in, L_in = x.shape + K = self.conv1d.weight.shape[2] + C_out = self.conv1d.weight.shape[0] + OL = L_in - K + 1 + + BLOCK_K = triton.next_power_of_2(max(16, C_in)) + + y = torch.empty((B, C_out, OL), device=x.device, dtype=torch.float16) + + grid = lambda META: ( + B, + triton.cdiv(OL, META["BLOCK_OL"]), + triton.cdiv(C_out, META["BLOCK_N"]), + ) + + _conv1d_kernel[grid]( + x, + self.w_kio, + y, + OL, + C_out, + x.stride(0), + x.stride(1), + self.w_kio.stride(0), + self.w_kio.stride(1), + self.w_kio.stride(2), + y.stride(0), + y.stride(1), + C_IN=C_in, + BLOCK_K=BLOCK_K, + KERNEL_SIZE=K, + ) + + return y + + +batch_size = 32 +in_channels = 64 +out_channels = 128 +kernel_size = 3 +length = 131072 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, length) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size] diff --git a/backends/triton/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py new file mode 100644 index 0000000..9bcf102 --- /dev/null +++ 
b/backends/triton/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py @@ -0,0 +1,239 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_W": 16, "BLOCK_OC": 16}, num_warps=4, num_stages=3), + ], + key=["W_out", "C_out"], +) +@triton.jit +def _conv_transpose3d_v2( + x_ptr, + w_ptr, + out_ptr, + D_in: tl.constexpr, + H_in: tl.constexpr, + W_in: tl.constexpr, + C_out: tl.constexpr, + D_out: tl.constexpr, + H_out: tl.constexpr, + W_out: tl.constexpr, + sx_b, + sx_d, + sx_h, + sx_w, + so_b, + so_d, + so_h, + so_w, + BLOCK_W: tl.constexpr, + BLOCK_OC: tl.constexpr, + KD: tl.constexpr, + KH: tl.constexpr, + KW: tl.constexpr, + C_IN: tl.constexpr, + BLOCK_K: tl.constexpr, +): + pid_w = tl.program_id(0) + pid_bdh = tl.program_id(1) + pid_oc = tl.program_id(2) + + b = pid_bdh // (D_out * H_out) + rem = pid_bdh % (D_out * H_out) + d_out = rem // H_out + h_out = rem % H_out + + acc = tl.zeros((BLOCK_W, BLOCK_OC), dtype=tl.float32) + x_batch_base = x_ptr + b.to(tl.int64) * sx_b + + for kd in range(KD): + d_in = d_out - kd + d_ok = (d_in >= 0) & (d_in < D_in) + if d_ok: + for kh in range(KH): + h_in = h_out - kh + h_ok = (h_in >= 0) & (h_in < H_in) + if h_ok: + x_dh_base = x_batch_base + d_in * sx_d + h_in * sx_h + + for kw in range(KW): + w_in_start = pid_w * BLOCK_W - kw + kidx = kd * KH * KW + kh * KW + kw + + x_bp = tl.make_block_ptr( + base=x_dh_base, + shape=(W_in, C_IN), + strides=(sx_w, 1), + offsets=(w_in_start, 0), + block_shape=(BLOCK_W, BLOCK_K), + order=(1, 0), + ) + w_bp = tl.make_block_ptr( + base=w_ptr + kidx * C_IN * C_out, + shape=(C_IN, C_out), + strides=(C_out, 1), + offsets=(0, pid_oc * BLOCK_OC), + block_shape=(BLOCK_K, BLOCK_OC), + 
order=(1, 0), + ) + + for _c0 in range(0, C_IN, BLOCK_K): + x_tile = tl.load( + x_bp, boundary_check=(0, 1), padding_option="zero" + ) + w_tile = tl.load( + w_bp, boundary_check=(0, 1), padding_option="zero" + ) + acc += tl.dot(x_tile, w_tile) + x_bp = tl.advance(x_bp, (0, BLOCK_K)) + w_bp = tl.advance(w_bp, (BLOCK_K, 0)) + + out_bp = tl.make_block_ptr( + base=out_ptr + b.to(tl.int64) * so_b + d_out * so_d + h_out * so_h, + shape=(W_out, C_out), + strides=(so_w, 1), + offsets=(pid_w * BLOCK_W, pid_oc * BLOCK_OC), + block_shape=(BLOCK_W, BLOCK_OC), + order=(1, 0), + ) + tl.store(out_bp, acc.to(tl.float16), boundary_check=(0, 1)) + + +class Model(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=(1, 1, 1), + padding=(0, 0, 0), + output_padding=(0, 0, 0), + groups=1, + bias=False, + ): + super().__init__() + if isinstance(kernel_size, int): + kernel_size = (kernel_size, kernel_size, kernel_size) + self.deconv3d = nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + bias=bias, + ) + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self._packed = False + self._y_buf = None + + def _pack_weights(self): + w = self.deconv3d.weight.data + KD, KH, KW = self.kernel_size + self.weight_packed = ( + w.permute(2, 3, 4, 0, 1) + .reshape(KD * KH * KW, self.in_channels, self.out_channels) + .contiguous() + .to(dtype=torch.float16) + ) + self._packed = True + + def forward(self, x): + if not self._packed: + self._pack_weights() + + B, C_in, D_in, H_in, W_in = x.shape + KD, KH, KW = self.kernel_size + + D_out = D_in + KD - 1 + H_out = H_in + KH - 1 + W_out = W_in + KW - 1 + + x_cl = x.to(dtype=torch.float16).contiguous( + memory_format=torch.channels_last_3d + ) + + expected_shape = (B, self.out_channels, D_out, H_out, W_out) + if ( + self._y_buf is None + or self._y_buf.shape != expected_shape 
+ or self._y_buf.device != x.device + ): + self._y_buf = torch.empty( + *expected_shape, + device=x.device, + dtype=torch.float16, + ).contiguous(memory_format=torch.channels_last_3d) + output = self._y_buf + + sx = x_cl.stride() + so = output.stride() + + BLOCK_K = 32 + + grid = lambda META: ( + triton.cdiv(W_out, META["BLOCK_W"]), + B * D_out * H_out, + triton.cdiv(self.out_channels, META["BLOCK_OC"]), + ) + + _conv_transpose3d_v2[grid]( + x_cl, + self.weight_packed, + output, + D_in, + H_in, + W_in, + self.out_channels, + D_out, + H_out, + W_out, + sx[0], + sx[2], + sx[3], + sx[4], + so[0], + so[2], + so[3], + so[4], + KD=KD, + KH=KH, + KW=KW, + C_IN=C_in, + BLOCK_K=BLOCK_K, + ) + + return output + + +batch_size = 16 +in_channels = 32 +out_channels = 64 +kernel_depth = 3 +kernel_width = 5 +kernel_height = 5 +depth = 64 +width = 64 +height = 64 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, depth, width, height) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, (kernel_depth, kernel_width, kernel_height)] diff --git a/backends/triton/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py new file mode 100644 index 0000000..233c9b5 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py @@ -0,0 +1,229 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import torch.nn.functional as F +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config( + { + "BLOCK_OW": 64, + "BLOCK_N": 64, + "BLOCK_K": 64, + "GROUP_SIZE_M": 8, + "grf_mode": "256", + }, + num_warps=4, + num_stages=2, + ), + ], + key=["H", 
"W", "C_IN", "C_out", "OH", "OW"], +) +@triton.jit +def _conv_transpose2d_swizzled_v2( + x_ptr, + w_ptr, + y_ptr, + N_batch, + H, + W, + C_out, + OH, + OW, + stride_wkh, + stride_wkw, + stride_wci, + stride_wco, + BLOCK_OW: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + KH: tl.constexpr, + KW: tl.constexpr, + C_IN: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + grf_mode: tl.constexpr, +): + n = tl.program_id(1) + pid = tl.program_id(0) + + num_ow_tiles = tl.cdiv(OW, BLOCK_OW) + num_n_tiles = tl.cdiv(C_out, BLOCK_N) + num_m_tiles = OH * num_ow_tiles + + group_id = pid // (GROUP_SIZE_M * num_n_tiles) + first_m_in_group = group_id * GROUP_SIZE_M + group_size = min(num_m_tiles - first_m_in_group, GROUP_SIZE_M) + local_id = pid % (group_size * num_n_tiles) + m_local = local_id // num_n_tiles + pid_n = local_id % num_n_tiles + m_idx = first_m_in_group + m_local + + oh = m_idx // num_ow_tiles + pid_ow = m_idx % num_ow_tiles + ow0 = pid_ow * BLOCK_OW + + HW = H * W + OHOW = OH * OW + + acc = tl.zeros((BLOCK_OW, BLOCK_N), dtype=tl.float32) + + for kh in range(KH): + for kw in range(KW): + x_row_start = n * HW + (oh + kh) * W + (ow0 + kw) + x_valid_rows = W - (ow0 + kw) + + x_bp = tl.make_block_ptr( + base=x_ptr, + shape=(x_row_start + x_valid_rows, C_IN), + strides=(C_IN, 1), + offsets=(x_row_start, 0), + block_shape=(BLOCK_OW, BLOCK_K), + order=(1, 0), + ) + + w_bp = tl.make_block_ptr( + base=w_ptr + kh * stride_wkh + kw * stride_wkw, + shape=(C_IN, C_out), + strides=(stride_wci, stride_wco), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_K, BLOCK_N), + order=(1, 0), + ) + + for c0 in range(0, C_IN, BLOCK_K): + x_tile = tl.load(x_bp, boundary_check=(0, 1), padding_option="zero") + w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero") + acc = tl.dot(x_tile, w_tile, acc) + x_bp = tl.advance(x_bp, (0, BLOCK_K)) + w_bp = tl.advance(w_bp, (BLOCK_K, 0)) + + y_row_start = n * OHOW + oh * OW + ow0 + y_valid_rows = OW - ow0 + y_bp = 
class Model(nn.Module):
    """Transposed 2D convolution (stride 1, no padding) run through a Triton kernel.

    The input is zero-padded by ``K - 1`` on every spatial side and the weights
    are spatially flipped, which is the layout the
    ``_conv_transpose2d_swizzled_v2`` kernel consumes (NHWC activations,
    ``(KH, KW, C_in, C_out)`` weights).

    Only ``stride=1``, ``padding=0``, ``output_padding=0`` and ``groups=1`` are
    implemented; any other value raises ``NotImplementedError`` instead of
    silently returning a wrong result.  Computation happens in float16 and the
    result is a float16 tensor in channels-last memory format.

    Notes:
        * The output buffer is reused between calls that produce the same
          output shape on the same device, so a tensor returned by ``forward``
          is overwritten by the next call.
        * Weights are packed once, on the first ``forward``; later changes to
          the parameter are not picked up.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        output_padding=0,
        groups=1,
        bias=False,
    ):
        super().__init__()
        self.conv_transpose2d = nn.ConvTranspose2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            output_padding=output_padding,
            groups=groups,
            bias=bias,
        )
        conv = self.conv_transpose2d
        # The kernel hard-codes OH = H + KH - 1 (stride 1, no padding, one group).
        if (
            tuple(conv.stride) != (1, 1)
            or tuple(conv.padding) != (0, 0)
            or tuple(conv.output_padding) != (0, 0)
            or conv.groups != 1
        ):
            raise NotImplementedError(
                "Triton conv_transpose2d kernel only supports stride=1, "
                "padding=0, output_padding=0 and groups=1"
            )
        self._packed = False
        self._y = None
        self._y_nhwc = None

    def _pack_weights(self):
        """Cache the weights as a flipped, float16 ``(KH, KW, C_in, C_out)`` tensor."""
        weight = self.conv_transpose2d.weight.detach()
        self.w_hwio = (
            weight.flip(2, 3).permute(2, 3, 0, 1).to(dtype=torch.float16).contiguous()
        )
        self.KH = weight.shape[2]
        self.KW = weight.shape[3]
        self.C_in_val = weight.shape[0]
        self.C_out_val = weight.shape[1]
        self._packed = True

    def _pack_input(self, x):
        """Return ``x`` as a zero-padded, contiguous float16 NHWC tensor.

        The tensor is rebuilt on every call.  A cache keyed on ``data_ptr()``
        is not safe here: the float16 copy is a temporary whose address can be
        reused for a different input, which would make the cache return stale
        data.
        """
        x = x.to(dtype=torch.float16)
        pad_h = self.KH - 1
        pad_w = self.KW - 1
        x_pad = F.pad(x, (pad_w, pad_w, pad_h, pad_h))
        return x_pad.contiguous(memory_format=torch.channels_last).permute(0, 2, 3, 1)

    def _add_bias(self, output):
        """Add the per-channel bias (if the layer has one) to ``output`` in place."""
        bias = self.conv_transpose2d.bias
        if bias is not None:
            output.add_(
                bias.detach()
                .to(device=output.device, dtype=output.dtype)
                .view(1, -1, 1, 1)
            )
        return output

    def forward(self, x):
        """Compute the transposed convolution of a ``(N, C_in, H, W)`` input.

        Returns a float16 tensor of shape ``(N, C_out, H + KH - 1, W + KW - 1)``.

        Raises:
            ValueError: if the channel count of ``x`` does not match the weights.
        """
        if not self._packed:
            self._pack_weights()

        N, C_in, H, W_in = x.shape
        if C_in != self.C_in_val:
            raise ValueError(
                f"expected input with {self.C_in_val} channels, got {C_in}"
            )
        KH, KW = self.KH, self.KW
        C_out = self.C_out_val

        x_nhwc = self._pack_input(x)
        H_pad = H + 2 * (KH - 1)
        W_pad = W_in + 2 * (KW - 1)
        OH = H + KH - 1
        OW = W_in + KW - 1

        out_shape = (N, C_out, OH, OW)
        if (
            self._y is None
            or self._y.shape != out_shape
            or self._y.device != x.device
        ):
            self._y = torch.empty(
                out_shape,
                device=x.device,
                dtype=torch.float16,
                memory_format=torch.channels_last,
            )
            # NHWC view over the same storage; this is what the kernel writes to.
            self._y_nhwc = self._y.permute(0, 2, 3, 1)

        def grid(META):
            num_ow = triton.cdiv(OW, META["BLOCK_OW"])
            num_n = triton.cdiv(C_out, META["BLOCK_N"])
            return (OH * num_ow * num_n, N)

        _conv_transpose2d_swizzled_v2[grid](
            x_nhwc,
            self.w_hwio,
            self._y_nhwc,
            N,
            H_pad,
            W_pad,
            C_out,
            OH,
            OW,
            self.w_hwio.stride(0),
            self.w_hwio.stride(1),
            self.w_hwio.stride(2),
            self.w_hwio.stride(3),
            KH=KH,
            KW=KW,
            C_IN=C_in,
        )

        return self._add_bias(self._y)
first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + a_desc = tl.make_tensor_descriptor( + base=a_ptr, + shape=(M, K), + strides=(stride_am, stride_ak), + block_shape=(BLOCK_M, BLOCK_K), + ) + b_desc = tl.make_tensor_descriptor( + base=b_ptr, + shape=(K, N), + strides=(stride_bk, stride_bn), + block_shape=(BLOCK_K, BLOCK_N), + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for off_k in range(0, K, BLOCK_K): + a_tile = a_desc.load([pid_m * BLOCK_M, off_k]) + b_tile = b_desc.load([off_k, pid_n * BLOCK_N]) + acc += tl.dot(a_tile, b_tile) + c_desc = tl.make_tensor_descriptor( + base=c_ptr, + shape=(M, N), + strides=(stride_cm, stride_cn), + block_shape=(BLOCK_M, BLOCK_N), + ) + c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty)) + + +def _matmul_triton(A, B): + M, K = A.shape + K2, N = B.shape + + A = A.contiguous() + B = B.contiguous() + + C = torch.empty((M, N), device=A.device, dtype=A.dtype) + + grid = lambda META: ( + triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]), + ) + + _matmul_kernel[grid]( + A, + B, + C, + M, + N, + K, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(1), + C.stride(0), + C.stride(1), + ) + return C + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: + return _matmul_triton(A, B) + + +M = 256 +N = 256 +K = 131072 * 4 + + +def get_inputs(): + A = torch.rand(M, K, dtype=torch.bfloat16) + B = torch.rand(K, N, dtype=torch.bfloat16) + return [A, B] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py new file mode 100644 index 0000000..9418854 --- /dev/null +++ 
b/backends/triton/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py @@ -0,0 +1,234 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_W": 16, "BLOCK_OC": 32}, num_warps=4, num_stages=3), + ], + key=["W_out", "C_out"], +) +@triton.jit +def _conv_transpose3d_v2( + x_ptr, + w_ptr, + out_ptr, + D_in: tl.constexpr, + H_in: tl.constexpr, + W_in: tl.constexpr, + C_out: tl.constexpr, + D_out: tl.constexpr, + H_out: tl.constexpr, + W_out: tl.constexpr, + sx_b, + sx_d, + sx_h, + sx_w, + so_b, + so_d, + so_h, + so_w, + BLOCK_W: tl.constexpr, + BLOCK_OC: tl.constexpr, + K: tl.constexpr, + C_IN: tl.constexpr, + BLOCK_K: tl.constexpr, +): + pid_w = tl.program_id(0) + pid_bdh = tl.program_id(1) + pid_oc = tl.program_id(2) + + b = pid_bdh // (D_out * H_out) + rem = pid_bdh % (D_out * H_out) + d_out = rem // H_out + h_out = rem % H_out + + acc = tl.zeros((BLOCK_W, BLOCK_OC), dtype=tl.float32) + x_batch_base = x_ptr + b.to(tl.int64) * sx_b + + for kd in range(K): + d_in = d_out - kd + d_ok = (d_in >= 0) & (d_in < D_in) + if d_ok: + for kh in range(K): + h_in = h_out - kh + h_ok = (h_in >= 0) & (h_in < H_in) + if h_ok: + x_dh_base = x_batch_base + d_in * sx_d + h_in * sx_h + + for kw in range(K): + w_in_start = pid_w * BLOCK_W - kw + kidx = kd * K * K + kh * K + kw + + x_bp = tl.make_block_ptr( + base=x_dh_base, + shape=(W_in, C_IN), + strides=(sx_w, 1), + offsets=(w_in_start, 0), + block_shape=(BLOCK_W, BLOCK_K), + order=(1, 0), + ) + w_bp = tl.make_block_ptr( + base=w_ptr + kidx * C_IN * C_out, + shape=(C_IN, C_out), + strides=(C_out, 1), + offsets=(0, pid_oc * BLOCK_OC), + block_shape=(BLOCK_K, BLOCK_OC), + order=(1, 0), + ) + + for _c0 in range(0, C_IN, 
class Model(nn.Module):
    """Transposed 3D convolution (cubic kernel, stride 1, no padding) via Triton.

    Activations are handed to ``_conv_transpose3d_v2`` in channels-last-3d
    layout and the weights as a float16 ``(K*K*K, C_in, C_out)`` tensor.

    Only a cubic kernel with ``stride=1``, ``padding=0``, ``output_padding=0``,
    ``dilation=1`` and ``groups=1`` is implemented; anything else raises
    ``NotImplementedError`` instead of silently returning a wrong result.

    Notes:
        * The output buffer is reused between calls that produce the same
          output shape on the same device, so a tensor returned by ``forward``
          is overwritten by the next call.
        * Weights are packed once, on the first ``forward``; later changes to
          the parameter are not picked up.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        output_padding=0,
        dilation=1,
        groups=1,
        bias=False,
    ):
        super().__init__()
        self.conv_transpose3d = nn.ConvTranspose3d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            output_padding=output_padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )
        conv = self.conv_transpose3d
        k_d, k_h, k_w = conv.kernel_size
        # The kernel uses a single K for all three axes and computes
        # D_out = D_in + K - 1, i.e. stride 1 / no padding / no dilation.
        if (
            not (k_d == k_h == k_w)
            or tuple(conv.stride) != (1, 1, 1)
            or tuple(conv.padding) != (0, 0, 0)
            or tuple(conv.output_padding) != (0, 0, 0)
            or tuple(conv.dilation) != (1, 1, 1)
            or conv.groups != 1
        ):
            raise NotImplementedError(
                "Triton conv_transpose3d kernel only supports a cubic kernel with "
                "stride=1, padding=0, output_padding=0, dilation=1 and groups=1"
            )
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self._K = k_d  # scalar edge length, valid for int and tuple kernel_size
        self._packed = False
        self._y_buf = None

    def _pack_weights(self):
        """Cache the weights as a float16 ``(K*K*K, C_in, C_out)`` tensor."""
        w = self.conv_transpose3d.weight.detach()
        K = self._K
        self.weight_packed = (
            w.permute(2, 3, 4, 0, 1)
            .reshape(K * K * K, self.in_channels, self.out_channels)
            .contiguous()
            .to(dtype=torch.float16)
        )
        self._packed = True

    def _add_bias(self, output):
        """Add the per-channel bias (if the layer has one) to ``output`` in place."""
        bias = self.conv_transpose3d.bias
        if bias is not None:
            output.add_(
                bias.detach()
                .to(device=output.device, dtype=output.dtype)
                .view(1, -1, 1, 1, 1)
            )
        return output

    def forward(self, x):
        """Compute the transposed convolution of a ``(B, C_in, D, H, W)`` input.

        Returns a float16 tensor of shape
        ``(B, C_out, D + K - 1, H + K - 1, W + K - 1)``.
        """
        if not self._packed:
            self._pack_weights()

        device = x.device
        B, C_in, D_in, H_in, W_in = x.shape
        K = self._K

        D_out = D_in + K - 1
        H_out = H_in + K - 1
        W_out = W_in + K - 1

        x_cl = x.to(dtype=torch.float16).contiguous(
            memory_format=torch.channels_last_3d
        )

        expected_shape = (B, self.out_channels, D_out, H_out, W_out)
        if (
            self._y_buf is None
            or self._y_buf.shape != expected_shape
            or self._y_buf.device != device
        ):
            # Allocate directly in channels-last-3d instead of allocating a
            # default-layout tensor and copying it into a second buffer.
            self._y_buf = torch.empty(
                expected_shape,
                device=device,
                dtype=torch.float16,
                memory_format=torch.channels_last_3d,
            )
        output = self._y_buf

        sx = x_cl.stride()
        so = output.stride()

        BLOCK_K = 16

        def grid(META):
            return (
                triton.cdiv(W_out, META["BLOCK_W"]),
                B * D_out * H_out,
                triton.cdiv(self.out_channels, META["BLOCK_OC"]),
            )

        # Channel strides (index 1) are not passed: the kernel addresses
        # channels with unit stride.
        _conv_transpose3d_v2[grid](
            x_cl,
            self.weight_packed,
            output,
            D_in,
            H_in,
            W_in,
            self.out_channels,
            D_out,
            H_out,
            W_out,
            sx[0],
            sx[2],
            sx[3],
            sx[4],
            so[0],
            so[2],
            so[3],
            so[4],
            K=K,
            C_IN=C_in,
            BLOCK_K=BLOCK_K,
        )

        return self._add_bias(output)
BLOCK_OW: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + KH: tl.constexpr, + KW: tl.constexpr, + C_IN: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + grf_mode: tl.constexpr, +): + n = tl.program_id(1) + pid = tl.program_id(0) + + num_ow_tiles = tl.cdiv(OW, BLOCK_OW) + num_n_tiles = tl.cdiv(C_out, BLOCK_N) + num_m_tiles = OH * num_ow_tiles + + group_id = pid // (GROUP_SIZE_M * num_n_tiles) + first_m_in_group = group_id * GROUP_SIZE_M + group_size = min(num_m_tiles - first_m_in_group, GROUP_SIZE_M) + local_id = pid % (group_size * num_n_tiles) + m_local = local_id // num_n_tiles + pid_n = local_id % num_n_tiles + m_idx = first_m_in_group + m_local + + oh = m_idx // num_ow_tiles + pid_ow = m_idx % num_ow_tiles + ow0 = pid_ow * BLOCK_OW + + HW = H * W + OHOW = OH * OW + + acc = tl.zeros((BLOCK_OW, BLOCK_N), dtype=tl.float32) + + for kh in range(KH): + for kw in range(KW): + x_row_start = n * HW + (oh + kh) * W + (ow0 + kw) + x_valid_rows = W - (ow0 + kw) + + x_bp = tl.make_block_ptr( + base=x_ptr, + shape=(x_row_start + x_valid_rows, C_IN), + strides=(C_IN, 1), + offsets=(x_row_start, 0), + block_shape=(BLOCK_OW, BLOCK_K), + order=(1, 0), + ) + + w_bp = tl.make_block_ptr( + base=w_ptr + kh * stride_wkh + kw * stride_wkw, + shape=(C_IN, C_out), + strides=(stride_wci, stride_wco), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_K, BLOCK_N), + order=(1, 0), + ) + + for c0 in range(0, C_IN, BLOCK_K): + x_tile = tl.load(x_bp, boundary_check=(0, 1), padding_option="zero") + w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero") + acc = tl.dot(x_tile, w_tile, acc) + x_bp = tl.advance(x_bp, (0, BLOCK_K)) + w_bp = tl.advance(w_bp, (BLOCK_K, 0)) + + y_row_start = n * OHOW + oh * OW + ow0 + y_valid_rows = OW - ow0 + y_bp = tl.make_block_ptr( + base=y_ptr, + shape=(y_row_start + y_valid_rows, C_out), + strides=(C_out, 1), + offsets=(y_row_start, pid_n * BLOCK_N), + block_shape=(BLOCK_OW, BLOCK_N), + order=(1, 0), + ) + tl.store(y_bp, 
class Model(nn.Module):
    """Transposed 2D convolution (stride 1, no padding) run through a Triton kernel.

    The input is zero-padded by ``K - 1`` on every spatial side and the weights
    are spatially flipped, which is the layout the
    ``_conv_transpose2d_swizzled_v2`` kernel consumes (NHWC activations,
    ``(KH, KW, C_in, C_out)`` weights).

    Only ``stride=1``, ``padding=0``, ``output_padding=0`` and ``groups=1`` are
    implemented; any other value raises ``NotImplementedError`` instead of
    silently returning a wrong result.  Computation happens in float16 and the
    result is a float16 tensor in channels-last memory format.

    Notes:
        * The output buffer is reused between calls that produce the same
          output shape on the same device, so a tensor returned by ``forward``
          is overwritten by the next call.
        * Weights are packed once, on the first ``forward``; later changes to
          the parameter are not picked up.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        output_padding=0,
        groups=1,
        bias=False,
    ):
        super().__init__()
        self.conv_transpose2d = nn.ConvTranspose2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            output_padding=output_padding,
            groups=groups,
            bias=bias,
        )
        layer = self.conv_transpose2d
        # The kernel hard-codes OH = H + KH - 1 (stride 1, no padding, one group).
        supported = (
            tuple(layer.stride) == (1, 1)
            and tuple(layer.padding) == (0, 0)
            and tuple(layer.output_padding) == (0, 0)
            and layer.groups == 1
        )
        if not supported:
            raise NotImplementedError(
                "Triton conv_transpose2d kernel only supports stride=1, "
                "padding=0, output_padding=0 and groups=1"
            )
        self._packed = False
        self._y = None
        self._y_nhwc = None

    def _pack_weights(self):
        """Cache the weights as a flipped, float16 ``(KH, KW, C_in, C_out)`` tensor."""
        weight = self.conv_transpose2d.weight.detach()
        self.w_hwio = (
            weight.flip(2, 3).permute(2, 3, 0, 1).to(dtype=torch.float16).contiguous()
        )
        self.KH = weight.shape[2]
        self.KW = weight.shape[3]
        self.C_in_val = weight.shape[0]
        self.C_out_val = weight.shape[1]
        self._packed = True

    def _pack_input(self, x):
        """Return ``x`` as a zero-padded, contiguous float16 NHWC tensor.

        The tensor is rebuilt on every call.  A cache keyed on ``data_ptr()``
        is not safe here: the float16 copy is a temporary whose address can be
        reused for a different input, which would make the cache return stale
        data.
        """
        x = x.to(dtype=torch.float16)
        pad_h = self.KH - 1
        pad_w = self.KW - 1
        x_pad = F.pad(x, (pad_w, pad_w, pad_h, pad_h))
        return x_pad.contiguous(memory_format=torch.channels_last).permute(0, 2, 3, 1)

    def _add_bias(self, output):
        """Add the per-channel bias (if the layer has one) to ``output`` in place."""
        bias = self.conv_transpose2d.bias
        if bias is not None:
            output.add_(
                bias.detach()
                .to(device=output.device, dtype=output.dtype)
                .view(1, -1, 1, 1)
            )
        return output

    def forward(self, x):
        """Compute the transposed convolution of a ``(N, C_in, H, W)`` input.

        Returns a float16 tensor of shape ``(N, C_out, H + KH - 1, W + KW - 1)``.

        Raises:
            ValueError: if the channel count of ``x`` does not match the weights.
        """
        if not self._packed:
            self._pack_weights()

        N, C_in, H, W_in = x.shape
        if C_in != self.C_in_val:
            raise ValueError(
                f"expected input with {self.C_in_val} channels, got {C_in}"
            )
        KH, KW = self.KH, self.KW
        C_out = self.C_out_val

        x_nhwc = self._pack_input(x)
        H_pad = H + 2 * (KH - 1)
        W_pad = W_in + 2 * (KW - 1)
        OH = H + KH - 1
        OW = W_in + KW - 1

        out_shape = (N, C_out, OH, OW)
        if (
            self._y is None
            or self._y.shape != out_shape
            or self._y.device != x.device
        ):
            self._y = torch.empty(
                out_shape,
                device=x.device,
                dtype=torch.float16,
                memory_format=torch.channels_last,
            )
            # NHWC view over the same storage; this is what the kernel writes to.
            self._y_nhwc = self._y.permute(0, 2, 3, 1)

        def grid(META):
            num_ow = triton.cdiv(OW, META["BLOCK_OW"])
            num_n = triton.cdiv(C_out, META["BLOCK_N"])
            return (OH * num_ow * num_n, N)

        _conv_transpose2d_swizzled_v2[grid](
            x_nhwc,
            self.w_hwio,
            self._y_nhwc,
            N,
            H_pad,
            W_pad,
            C_out,
            OH,
            OW,
            self.w_hwio.stride(0),
            self.w_hwio.stride(1),
            self.w_hwio.stride(2),
            self.w_hwio.stride(3),
            KH=KH,
            KW=KW,
            C_IN=C_in,
        )

        return self._add_bias(self._y)
class Model(nn.Module):
    """Grouped, strided, padded transposed 3D convolution run through Triton.

    Parameters mirror ``nn.ConvTranspose3d``; ``kernel_size``, ``stride``,
    ``padding`` and ``output_padding`` may each be an int or a 3-element
    sequence.  Computation happens in float16 and ``forward`` returns a newly
    allocated float16 tensor.  A bias, when enabled, is added after the kernel
    has run.

    Note: weights are packed once, on the first ``forward``; later changes to
    the parameter are not picked up.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=(1, 1, 1),
        padding=(0, 0, 0),
        output_padding=(0, 0, 0),
        groups=1,
        bias=False,
    ):
        super().__init__()
        self.conv_transpose3d = nn.ConvTranspose3d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            output_padding=output_padding,
            groups=groups,
            bias=bias,
        )

        def _as_triple(value):
            # ints are broadcast to all three axes; any sequence becomes a tuple
            return (value,) * 3 if isinstance(value, int) else tuple(value)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size_t = _as_triple(kernel_size)
        self.stride_t = _as_triple(stride)
        self.padding_t = _as_triple(padding)
        self.output_padding_t = _as_triple(output_padding)
        self.groups = groups
        self._packed = False

    def _pack_weights(self):
        """Cache the weights as float16 ``(C_in, kD, kH, kW, C_out_per_group)``."""
        w = self.conv_transpose3d.weight.detach()  # (C_in, C_out_pg, kD, kH, kW)
        w = w.permute(0, 2, 3, 4, 1).contiguous()  # (C_in, kD, kH, kW, C_out_pg)
        self.weight_xpu = w.to(dtype=torch.float16)
        self._packed = True

    def _add_bias(self, output):
        """Add the per-channel bias (if the layer has one) to ``output`` in place."""
        bias = self.conv_transpose3d.bias
        if bias is not None:
            output.add_(
                bias.detach()
                .to(device=output.device, dtype=output.dtype)
                .view(1, -1, 1, 1, 1)
            )
        return output

    def forward(self, x):
        """Compute the transposed convolution of a ``(B, C_in, D, H, W)`` input.

        Each output extent is ``(in - 1) * stride - 2 * padding + kernel +
        output_padding``.
        """
        if not self._packed:
            self._pack_weights()
        x = x.to(dtype=torch.float16).contiguous()

        B = x.shape[0]
        D_in, H_in, W_in = x.shape[2], x.shape[3], x.shape[4]
        C_in_pg = self.in_channels // self.groups
        C_out_pg = self.out_channels // self.groups
        kD, kH, kW = self.kernel_size_t
        sD, sH, sW = self.stride_t
        pD, pH, pW = self.padding_t
        opD, opH, opW = self.output_padding_t

        D_out = (D_in - 1) * sD - 2 * pD + kD + opD
        H_out = (H_in - 1) * sH - 2 * pH + kH + opH
        W_out = (W_in - 1) * sW - 2 * pW + kW + opW

        output = torch.empty(
            (B, self.out_channels, D_out, H_out, W_out),
            device=x.device,
            dtype=torch.float16,
        )

        w = self.weight_xpu

        def grid(META):
            return (
                B * self.groups,
                D_out,
                triton.cdiv(H_out, META["BLOCK_OH"])
                * triton.cdiv(W_out, META["BLOCK_OW"]),
            )

        # Only the first four weight strides are passed; the kernel addresses
        # the last (output-channel) axis with unit stride.
        _conv_transpose3d_v2[grid](
            x,
            w,
            output,
            D_in,
            H_in,
            W_in,
            D_out,
            H_out,
            W_out,
            C_in_pg,
            C_out_pg,
            self.groups,
            kD,
            kH,
            kW,
            sD,
            sH,
            sW,
            pD,
            pH,
            pW,
            x.stride(0),
            x.stride(1),
            x.stride(2),
            x.stride(3),
            x.stride(4),
            w.stride(0),
            w.stride(1),
            w.stride(2),
            w.stride(3),
            output.stride(0),
            output.stride(1),
            output.stride(2),
            output.stride(3),
            output.stride(4),
        )
        return self._add_bias(output)
+# ----------------------------------------------------------------------------- +@triton.jit +def _conv_transpose3d_kernel( + inp_ptr, # pointer to input tensor + w_ptr, # pointer to weight tensor + out_ptr, # pointer to output tensor + # tensor dimensions + B, + C_in, + Di, + Hi, + Wi, + C_out, + Do, + Ho, + Wo, + # kernel size + kD, + kH, + kW, + # strides + sd, + sh, + sw, + # paddings + pd, + ph, + pw, + # grouping + groups, + # compile‐time tile size + BLOCK_SZ: tl.constexpr, +): + # block indices + b = tl.program_id(0) # batch index + oc = tl.program_id(1) # output‐channel index + sb_id = tl.program_id(2) # spatial‐block index + + # each thread handles one element of a BLOCK_SZ‐vector along the flattened + # (Do*Ho*Wo) dimension + offs = tl.arange(0, BLOCK_SZ) + flat_sp = sb_id * BLOCK_SZ + offs # flattened output index + mask_sp = flat_sp < (Do * Ho * Wo) # guard OOB + + # decode flattened spatial into od, oh, ow + od = flat_sp // (Ho * Wo) + rem = flat_sp % (Ho * Wo) + oh = rem // Wo + ow = rem % Wo + + # FP32 accumulator + acc = tl.zeros((BLOCK_SZ,), dtype=tl.float32) + + # channels per group + out_ch_per_grp = C_out // groups + in_ch_per_grp = C_in // groups + + # which group this oc belongs to, and local oc + grp = oc // out_ch_per_grp + oc_local = oc % out_ch_per_grp + in_start = grp * in_ch_per_grp + + # iterate over the input‐channels in this group and over kD,kH,kW + for ic_loc in range(in_ch_per_grp): + c_in = in_start + ic_loc + for kd in range(kD): + # invert the transpose‐conv mapping in D + id_num = od + pd - kd + id = id_num // sd + mask_id = (id_num == id * sd) & (id >= 0) & (id < Di) + for kh in range(kH): + # invert in H + ih_num = oh + ph - kh + ih = ih_num // sh + mask_ih = (ih_num == ih * sh) & (ih >= 0) & (ih < Hi) + for kw in range(kW): + # invert in W + iw_num = ow + pw - kw + iw = iw_num // sw + mask_iw = (iw_num == iw * sw) & (iw >= 0) & (iw < Wi) + + # combine all masks, plus the original spatial‐mask + m = mask_sp & mask_id & 
def kernel_function(
    input_tensor, weight, bias, stride, padding, output_padding, groups
):
    """Validate arguments, allocate the output and launch ``_conv_transpose3d_kernel``.

    Args:
        input_tensor: tensor of shape ``(B, C_in, Di, Hi, Wi)``.
        weight: tensor of shape ``(C_in, C_out // groups, kD, kH, kW)``.
        bias: must be ``None``; bias is not supported.
        stride, padding, output_padding: an int or a 3-tuple each.
        groups: positive int.

    Returns:
        A tensor with ``input_tensor``'s dtype.  The kernel writes a buffer
        sized with ``output_padding`` included, and the trailing
        ``output_padding`` elements of every spatial axis are then sliced off.
        The result is therefore a view with spatial extents
        ``(in - 1) * stride - 2 * padding + kernel``.

    Raises:
        AssertionError: if an argument has an unsupported type or value.
    """
    # --- Argument checks
    assert isinstance(input_tensor, torch.Tensor), "input must be a Tensor"
    assert isinstance(weight, torch.Tensor), "weight must be a Tensor"
    assert bias is None, "this kernel does not support bias"
    assert isinstance(stride, (int, tuple)), "stride must be int or tuple"
    assert isinstance(padding, (int, tuple)), "padding must be int or tuple"
    assert isinstance(output_padding, (int, tuple)), (
        "output_padding must be int or tuple"
    )
    assert isinstance(groups, int) and groups >= 1, "groups must be a positive int"

    # --- Shapes and device/dtype
    B, C_in, Di, Hi, Wi = input_tensor.shape
    Cin_w, Cout_grp, kD, kH, kW = weight.shape
    assert Cin_w == C_in, "weight.shape[0] must match input channels"
    C_out = Cout_grp * groups

    # The kernel computes flat row-major offsets from the shapes alone, so
    # both operands must be densely packed (no-op when they already are).
    input_tensor = input_tensor.contiguous()
    weight = weight.contiguous()

    # unify stride / padding / output_padding
    if isinstance(stride, int):
        sd = sh = sw = stride
    else:
        sd, sh, sw = stride
    if isinstance(padding, int):
        pd = ph = pw = padding
    else:
        pd, ph, pw = padding
    if isinstance(output_padding, int):
        opd = oph = opw = output_padding
    else:
        opd, oph, opw = output_padding

    # compute output D/H/W for ConvTranspose3d
    Do = (Di - 1) * sd - 2 * pd + kD + opd
    Ho = (Hi - 1) * sh - 2 * ph + kH + oph
    Wo = (Wi - 1) * sw - 2 * pw + kW + opw

    # allocate output with the same dtype and device as the input
    out = torch.empty(
        (B, C_out, Do, Ho, Wo), device=input_tensor.device, dtype=input_tensor.dtype
    )

    # choose a block size for the flattened spatial dimension
    BLOCK = 256
    grid = (B, C_out, triton.cdiv(Do * Ho * Wo, BLOCK))

    # launch
    _conv_transpose3d_kernel[grid](
        input_tensor,
        weight,
        out,
        B,
        C_in,
        Di,
        Hi,
        Wi,
        C_out,
        Do,
        Ho,
        Wo,
        kD,
        kH,
        kW,
        sd,
        sh,
        sw,
        pd,
        ph,
        pw,
        groups,
        BLOCK,  # compile-time constant
    )
    return out[
        :, :, : Do - opd, : Ho - oph, : Wo - opw
    ]  # slice to remove output_padding area
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + padding: int = 0, + output_padding: int = 0, + groups: int = 1, + bias: bool = False, + ): + super(Model, self).__init__() + # Keep the same init signature as the reference Model, but route compute to Triton kernel. + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.output_padding = output_padding + self.groups = groups + self.use_bias = bias + + # Kernel wrapper constraints + assert not self.use_bias, ( + "this kernel does not support bias (bias must be False / None)" + ) + # output_padding is now handled in kernel_function + assert isinstance(groups, int) and groups >= 1, "groups must be a positive int" + assert in_channels % groups == 0, "in_channels must be divisible by groups" + assert out_channels % groups == 0, "out_channels must be divisible by groups" + + kD = kH = kW = kernel_size + # Match nn.ConvTranspose3d parameter shapes: + # weight: (in_channels, out_channels/groups, kD, kH, kW) + self.weight = nn.Parameter( + torch.empty(in_channels, out_channels // groups, kD, kH, kW) + ) + self.bias = None + + # Reasonable init + nn.init.kaiming_uniform_(self.weight, a=5**0.5) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Performs the 3D transposed convolution. + + Args: + x (torch.Tensor): Input tensor of shape (batch_size, in_channels, depth, height, width). + + Returns: + torch.Tensor: Output tensor of shape (batch_size, out_channels, depth_out, height_out, width_out). 
+ """ + return kernel_function( + x, + self.weight, + self.bias, + self.stride, + self.padding, + self.output_padding, + self.groups, + ) + + +# Test code +batch_size = 4 +in_channels = 32 +out_channels = 32 +kernel_size = 3 +depth = 32 +height = 64 +width = 128 +stride = 2 +padding = 1 +groups = 4 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, depth, height, width) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size, stride, padding, groups] diff --git a/backends/triton/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py b/backends/triton/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py new file mode 100644 index 0000000..830134b --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py @@ -0,0 +1,199 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16, "GROUP_M": 8}, + num_warps=4, + num_stages=2, + ), + ], + key=["C_IN", "C_out", "OL"], +) +@triton.jit +def _conv_transpose1d_dilated_gemm( + x_ptr, + w_ptr, + y_ptr, + N_batch, + L_in, + C_out, + OL, + sxn, + sxl, + swk, + swci, + swco, + syn, + syl, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, + K_SIZE: tl.constexpr, + C_IN: tl.constexpr, + DILATION: tl.constexpr, +): + pid = tl.program_id(0) + M_total = N_batch * OL + num_pid_m = tl.cdiv(M_total, BLOCK_M) + num_pid_n = tl.cdiv(C_out, BLOCK_N) + num_pid_in_group = GROUP_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_M + group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_M) + pid_in_group = pid % num_pid_in_group + pid_m = first_pid_m + 
(pid_in_group % group_size_m) + pid_n = pid_in_group // group_size_m + + offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + offs_k = tl.arange(0, BLOCK_K) + + n_idx = offs_m // OL + ol_idx = offs_m % OL + + mask_m = offs_m < M_total + mask_n = offs_n < C_out + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + x_base = x_ptr + n_idx * sxn + + for k in range(K_SIZE): + il = ol_idx - k * DILATION + valid = (il >= 0) & (il < L_in) & mask_m + x_ptrs = x_base + il * sxl + + w_bp = tl.make_block_ptr( + base=w_ptr + k * swk, + shape=(C_IN, C_out), + strides=(swci, swco), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_K, BLOCK_N), + order=(1, 0), + ) + + for c0 in range(0, C_IN, BLOCK_K): + k_idx = c0 + offs_k + x_tile = tl.load( + x_ptrs[:, None] + k_idx[None, :], + mask=valid[:, None] & (k_idx[None, :] < C_IN), + other=0.0, + ) + w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero") + acc = tl.dot(x_tile, w_tile, acc) + w_bp = tl.advance(w_bp, (BLOCK_K, 0)) + + y_ptrs = y_ptr + n_idx * syn + ol_idx * syl + tl.store( + y_ptrs[:, None] + offs_n[None, :], + acc.to(y_ptr.dtype.element_ty), + mask=mask_m[:, None] & mask_n[None, :], + ) + + +class Model(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + bias=False, + ): + super(Model, self).__init__() + self.conv1d_transpose = nn.ConvTranspose1d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias, + ) + self._dilation = dilation + self._packed = False + + def _pack_weights(self): + self._w_kio = self.conv1d_transpose.weight.permute(2, 0, 1).contiguous() + self._K = self.conv1d_transpose.weight.shape[2] + self._C_out = self.conv1d_transpose.weight.shape[1] + self._packed = True + + def forward(self, x): + if not self._packed: + self._pack_weights() + + B, C_in, L_in = x.shape + K = self._K + C_out = self._C_out + OL = 
L_in + self._dilation * (K - 1) + + x_nlc = x.permute(0, 2, 1).contiguous() + w_kio = self._w_kio + y_nlc = torch.empty((B, OL, C_out), device=x.device, dtype=x.dtype) + + sxn, sxl, _ = x_nlc.stride() + swk, swci, swco = w_kio.stride() + syn, syl, _ = y_nlc.stride() + + M_total = B * OL + grid = lambda meta: ( + triton.cdiv(M_total, meta["BLOCK_M"]) * triton.cdiv(C_out, meta["BLOCK_N"]), + ) + + _conv_transpose1d_dilated_gemm[grid]( + x_nlc, + w_kio, + y_nlc, + B, + L_in, + C_out, + OL, + sxn, + sxl, + swk, + swci, + swco, + syn, + syl, + K_SIZE=K, + C_IN=C_in, + DILATION=self._dilation, + ) + + result = y_nlc.permute(0, 2, 1).contiguous() + if self.conv1d_transpose.bias is not None: + result = result + self.conv1d_transpose.bias.view(1, -1, 1) + return result + + +batch_size = 32 +in_channels = 32 +out_channels = 64 +kernel_size = 5 +length = 131072 +stride = 1 +padding = 0 +dilation = 3 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, length) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size, stride, padding, dilation] diff --git a/backends/triton/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py b/backends/triton/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py new file mode 100644 index 0000000..69289d0 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py @@ -0,0 +1,353 @@ +# ruff: noqa: E731, E402 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +# Auto-merged kernel + test for: 75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.log +# Generated by merge script + +# kernel.py 
+import torch +import triton +import triton.language as tl + +# ----------------------------------------------------------------------------- +# Triton kernel: Fused 2D Transposed Convolution +# +# Loops over kernel spatial dims and input‐channel group in one pass. +# Accumulates in fp32 for stability, then casts to the output tensor's dtype on store. +# Supports arbitrary asymmetric kernel sizes, padding, dilation, stride, and groups. +# ----------------------------------------------------------------------------- + + +@triton.jit +def _conv_transpose2d_kernel( + # Pointers + x_ptr, + w_ptr, + out_ptr, + total_elements, + # Tensor dims (constexpr) + batch: tl.constexpr, + in_c: tl.constexpr, + in_h: tl.constexpr, + in_w: tl.constexpr, + out_c: tl.constexpr, + out_h: tl.constexpr, + out_w: tl.constexpr, + k_h: tl.constexpr, + k_w: tl.constexpr, + s_h: tl.constexpr, + s_w: tl.constexpr, + p_h: tl.constexpr, + p_w: tl.constexpr, + d_h: tl.constexpr, + d_w: tl.constexpr, + groups: tl.constexpr, + in_c_per_g: tl.constexpr, + out_c_per_g: tl.constexpr, + # Strides (in elements) + x_stride_b: tl.constexpr, + x_stride_c: tl.constexpr, + x_stride_h: tl.constexpr, + x_stride_w: tl.constexpr, + w_stride_ic: tl.constexpr, + w_stride_oc: tl.constexpr, + w_stride_kh: tl.constexpr, + w_stride_kw: tl.constexpr, + out_stride_b: tl.constexpr, + out_stride_c: tl.constexpr, + out_stride_h: tl.constexpr, + out_stride_w: tl.constexpr, + # Block size + BLOCK: tl.constexpr, +): + """ + Each program computes BLOCK flattened output elements. 
+ """ + + pid = tl.program_id(0) + offs = pid * BLOCK + tl.arange(0, BLOCK) + mask_o = offs < total_elements + + # Flattened index -> (n, co, i_out, j_out) + nc = out_c * out_h * out_w + n = offs // nc + rem = offs % nc + co = rem // (out_h * out_w) + rem2 = rem % (out_h * out_w) + i_out = rem2 // out_w + j_out = rem2 % out_w + + # FP32 accumulator per lane + acc = tl.zeros((BLOCK,), dtype=tl.float32) + + # Which group & local output‐channel + group_id = co // out_c_per_g + cin_start = group_id * in_c_per_g + co_local = co - group_id * out_c_per_g + + # Loop over kernel H×W + for kh_i in range(k_h): + # invert transposed conv formula in H dimension: + # i_out = i_in * s_h - p_h + kh * d_h + # => i_in = (i_out + p_h - kh*d_h) / s_h only valid if divisible + in_i_numer = i_out + p_h - kh_i * d_h + in_i = in_i_numer // s_h + valid_h = (in_i_numer == in_i * s_h) & (in_i >= 0) & (in_i < in_h) + + for kw_j in range(k_w): + in_j_numer = j_out + p_w - kw_j * d_w + in_j = in_j_numer // s_w + valid_w = (in_j_numer == in_j * s_w) & (in_j >= 0) & (in_j < in_w) + + # final per‐lane validity mask + valid_spatial = mask_o & valid_h & valid_w + + # sum over input‐channels in this group + for ci in range(in_c_per_g): + c_in = cin_start + ci + x_off = ( + n * x_stride_b + + c_in * x_stride_c + + in_i * x_stride_h + + in_j * x_stride_w + ) + x_val = tl.load(x_ptr + x_off, mask=valid_spatial, other=0.0).to( + tl.float32 + ) + w_off = ( + c_in * w_stride_ic + + co_local * w_stride_oc + + kh_i * w_stride_kh + + kw_j * w_stride_kw + ) + w_val = tl.load(w_ptr + w_off, mask=mask_o, other=0.0).to(tl.float32) + acc += x_val * w_val + + # cast acc to the output tensor's dtype and store + out_val = acc.to(out_ptr.dtype.element_ty) + out_off = ( + n * out_stride_b + + co * out_stride_c + + i_out * out_stride_h + + j_out * out_stride_w + ) + tl.store(out_ptr + out_off, out_val, mask=mask_o) + + +def kernel_function( + x: torch.Tensor, + weight: torch.Tensor, + bias, + stride: tuple, + padding: tuple, + dilation: 
tuple, + groups: int, +): + """ + Wrapper for grouped 2D transposed convolution using Triton. + """ + + # 1) Validate arguments + assert bias is None, "Bias must be None for this kernel" + assert x.dtype in (torch.bfloat16, torch.float16) and weight.dtype == x.dtype, ( + "Inputs must be float16 or bfloat16" + ) + + # 2) Shapes + batch, in_c, in_h, in_w = x.shape + # PyTorch conv_transpose2d weight: [in_c, out_c/groups, k_h, k_w] + in_c_w, out_c_pg, k_h, k_w = weight.shape + assert in_c_w == in_c + out_c = out_c_pg * groups + + s_h, s_w = stride + p_h, p_w = padding + d_h, d_w = dilation + in_c_per_g = in_c // groups + out_c_per_g = out_c // groups + + # compute output H, W + out_h = (in_h - 1) * s_h - 2 * p_h + d_h * (k_h - 1) + 1 + out_w = (in_w - 1) * s_w - 2 * p_w + d_w * (k_w - 1) + 1 + + # 3) Allocate output + out = torch.empty((batch, out_c, out_h, out_w), device=x.device, dtype=x.dtype) + + # 4) Flattened element count + total_elems = batch * out_c * out_h * out_w + + # 5) Strides (in elements) + x_stride_b, x_stride_c, x_stride_h, x_stride_w = x.stride() + w_stride_ic, w_stride_oc, w_stride_kh, w_stride_kw = weight.stride() + out_stride_b, out_stride_c, out_stride_h, out_stride_w = out.stride() + + # 6) Launch parameters + BLOCK = 256 + grid = (triton.cdiv(total_elems, BLOCK),) + + # 7) Launch kernel + _conv_transpose2d_kernel[grid]( + # pointers + total + x, + weight, + out, + total_elems, + # dims + batch, + in_c, + in_h, + in_w, + out_c, + out_h, + out_w, + k_h, + k_w, + s_h, + s_w, + p_h, + p_w, + d_h, + d_w, + groups, + in_c_per_g, + out_c_per_g, + # strides + x_stride_b, + x_stride_c, + x_stride_h, + x_stride_w, + w_stride_ic, + w_stride_oc, + w_stride_kh, + w_stride_kw, + out_stride_b, + out_stride_c, + out_stride_h, + out_stride_w, + # block size + BLOCK, + ) + return out + + +# ------------------------------ +# KernelBench-compatible Model +# ------------------------------ + + +import torch.nn as nn + + +class Model(nn.Module): + """ + 
Performs a 2D transposed convolution operation with asymmetric input, asymmetric kernel, + grouped, padded, and dilated. + + Args: + in_channels (int): Number of channels in the input tensor. + out_channels (int): Number of channels produced by the convolution. + kernel_size (tuple): Size of the convolution kernel (height, width). + stride (tuple, optional): Stride of the convolution (height, width). Defaults to (1, 1). + padding (tuple, optional): Padding applied to the input (height, width). Defaults to (0, 0). + dilation (tuple, optional): Spacing between kernel elements (height, width). Defaults to (1, 1). + groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1. + bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: tuple, + stride: tuple = (1, 1), + padding: tuple = (0, 0), + dilation: tuple = (1, 1), + groups: int = 1, + bias: bool = False, + ): + super(Model, self).__init__() + # Keep the same init signature as the reference Model, but route compute to Triton kernel. 
+ self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = ( + tuple(stride) if isinstance(stride, (list, tuple)) else (stride, stride) + ) + self.padding = ( + tuple(padding) if isinstance(padding, (list, tuple)) else (padding, padding) + ) + self.dilation = ( + tuple(dilation) + if isinstance(dilation, (list, tuple)) + else (dilation, dilation) + ) + self.groups = groups + self.use_bias = bias + + # Kernel wrapper constraints + assert not self.use_bias, "Bias must be False / None for this kernel" + assert len(self.stride) == 2, "stride must have 2 elements (sh, sw)" + assert len(self.padding) == 2, "padding must have 2 elements (ph, pw)" + assert len(self.dilation) == 2, "dilation must have 2 elements (dh, dw)" + assert isinstance(self.groups, int) and self.groups >= 1, ( + "groups must be a positive int" + ) + assert in_channels % self.groups == 0, "in_channels must be divisible by groups" + assert out_channels % self.groups == 0, ( + "out_channels must be divisible by groups" + ) + + k_h, k_w = kernel_size + # Match nn.ConvTranspose2d parameter shapes: + # weight: (in_channels, out_channels/groups, k_h, k_w) + self.weight = nn.Parameter( + torch.empty(in_channels, out_channels // self.groups, k_h, k_w) + ) + self.bias = None + + # Reasonable init + nn.init.kaiming_uniform_(self.weight, a=5**0.5) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Performs the 2D transposed convolution. + + Args: + x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width). + + Returns: + torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out). 
+ """ + return kernel_function( + x, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.groups, + ) + + +batch_size = 16 +in_channels = 32 +out_channels = 64 +kernel_size = (3, 5) +height = 128 +width = 256 +stride = (2, 3) +padding = (1, 2) +dilation = (2, 1) +groups = 4 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, height, width) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size, stride, padding, dilation, groups] diff --git a/backends/triton/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py b/backends/triton/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py new file mode 100644 index 0000000..78155ce --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py @@ -0,0 +1,174 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_OL": 64, "BLOCK_N": 128}, num_warps=4, num_stages=3), + ], + key=["OL", "OC"], +) +@triton.jit +def _conv1d_kernel( + x_ptr, + w_ptr, + y_ptr, + OL, + OC, + stride_xn, + stride_xc, + stride_wk, + stride_wi, + stride_wo, + stride_yn, + stride_yoc, + C_IN: tl.constexpr, + BLOCK_K: tl.constexpr, + KERNEL_SIZE: tl.constexpr, + STRIDE_CONV: tl.constexpr, + DILATION: tl.constexpr, + BLOCK_OL: tl.constexpr, + BLOCK_N: tl.constexpr, +): + n = tl.program_id(0) + pid_ol = tl.program_id(1) + pid_n = tl.program_id(2) + + ol_start = pid_ol * BLOCK_OL + offs_ol = ol_start + tl.arange(0, BLOCK_OL) + offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + + mask_ol = offs_ol < OL + mask_n = offs_n < OC + + acc = tl.zeros((BLOCK_OL, BLOCK_N), dtype=tl.float32) + + offs_k = tl.arange(0, BLOCK_K) + x_batch = x_ptr + 
n.to(tl.int64) * stride_xn + + for k in range(KERNEL_SIZE): + in_pos = offs_ol * STRIDE_CONV + k * DILATION + + x_addrs = x_batch + offs_k[None, :].to(tl.int64) * stride_xc + in_pos[:, None] + x_tile = tl.load( + x_addrs, mask=mask_ol[:, None] & (offs_k[None, :] < C_IN), other=0.0 + ) + + w_bp = tl.make_block_ptr( + base=w_ptr + k * stride_wk, + shape=(C_IN, OC), + strides=(stride_wi, stride_wo), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_K, BLOCK_N), + order=(1, 0), + ) + w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero") + + acc = tl.dot(x_tile, w_tile, acc) + + y_batch = y_ptr + n.to(tl.int64) * stride_yn + y_addrs = y_batch + offs_n[None, :].to(tl.int64) * stride_yoc + offs_ol[:, None] + tl.store(y_addrs, acc.to(tl.float16), mask=mask_ol[:, None] & mask_n[None, :]) + + +class Model(nn.Module): + def __init__( + self, in_channels, out_channels, kernel_size, stride=1, dilation=1, bias=False + ): + super().__init__() + self.conv1d = nn.Conv1d( + in_channels, + out_channels, + kernel_size, + stride=stride, + dilation=dilation, + bias=bias, + ) + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size_val = kernel_size + self.stride_val = stride + self.dilation_val = dilation + self._packed = False + + def _pack_weights(self): + w = self.conv1d.weight.data + w_kio = w.permute(2, 1, 0).contiguous() + self.w_kio = w_kio.to("cpu", dtype=torch.float16).contiguous() + self._packed = True + + def forward(self, x): + if not self._packed: + self._pack_weights() + + if x.device.type != "cpu" or x.dtype != torch.float16: + x = x.to("cpu", dtype=torch.float16) + if not x.is_contiguous(): + x = x.contiguous() + + N = x.shape[0] + IC = x.shape[1] + L = x.shape[2] + OC = self.out_channels + KS = self.kernel_size_val + S = self.stride_val + D = self.dilation_val + OL = (L - D * (KS - 1) - 1) // S + 1 + + BLOCK_K = triton.next_power_of_2(max(16, IC)) + + y = torch.empty((N, OC, OL), device="cpu", dtype=torch.float16) + + grid 
= lambda META: ( + N, + triton.cdiv(OL, META["BLOCK_OL"]), + triton.cdiv(OC, META["BLOCK_N"]), + ) + + _conv1d_kernel[grid]( + x, + self.w_kio, + y, + OL, + OC, + x.stride(0), + x.stride(1), + self.w_kio.stride(0), + self.w_kio.stride(1), + self.w_kio.stride(2), + y.stride(0), + y.stride(1), + C_IN=IC, + BLOCK_K=BLOCK_K, + KERNEL_SIZE=KS, + STRIDE_CONV=S, + DILATION=D, + ) + + return y + + +batch_size = 64 +in_channels = 64 +out_channels = 128 +kernel_size = 3 +length = 524280 +stride = 3 +dilation = 4 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, length) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size, stride, dilation] diff --git a/backends/triton/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py b/backends/triton/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py new file mode 100644 index 0000000..9942ff4 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py @@ -0,0 +1,259 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +def _compute_output_size(input_size, kernel_size, stride, padding, dilation): + return (input_size - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + 1 + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_W": 16, "BLOCK_OC": 32}, num_warps=4, num_stages=4), + ], + key=["W_act", "C_out"], +) +@triton.jit +def _conv_transpose3d_v5( + x_ptr, + w_ptr, + out_ptr, + D: tl.constexpr, + H: tl.constexpr, + W: tl.constexpr, + C_out: tl.constexpr, + D_out: tl.constexpr, + H_out: tl.constexpr, + W_out: tl.constexpr, + D_act: tl.constexpr, 
+ H_act: tl.constexpr, + W_act: tl.constexpr, + sx_b, + sx_d, + sx_h, + sx_w, + so_b, + so_d, + so_h, + so_w, + BLOCK_W: tl.constexpr, + BLOCK_OC: tl.constexpr, + K: tl.constexpr, + C_IN: tl.constexpr, +): + pid_w = tl.program_id(0) + pid_bdh = tl.program_id(1) + pid_oc = tl.program_id(2) + + b = pid_bdh // (D_act * H_act) + rem = pid_bdh % (D_act * H_act) + d_idx = rem // H_act + h_idx = rem % H_act + + offs_w_idx = pid_w * BLOCK_W + tl.arange(0, BLOCK_W) + offs_oc = pid_oc * BLOCK_OC + tl.arange(0, BLOCK_OC) + + acc = tl.zeros((BLOCK_W, BLOCK_OC), dtype=tl.float32) + + x_batch_base = x_ptr + b.to(tl.int64) * sx_b + + for kd in range(K): + d_in = d_idx + 1 - kd + d_ok = (d_in >= 0) & (d_in < D) + if d_ok: + for kh in range(K): + h_in = h_idx + 1 - kh + h_ok = (h_in >= 0) & (h_in < H) + if h_ok: + x_dh_base = x_batch_base + d_in * sx_d + h_in * sx_h + + for kw in range(K): + w_in_start = pid_w * BLOCK_W + 1 - kw + + x_bp = tl.make_block_ptr( + base=x_dh_base, + shape=(W, C_IN), + strides=(sx_w, 1), + offsets=(w_in_start, 0), + block_shape=(BLOCK_W, C_IN), + order=(1, 0), + ) + x_tile = tl.load( + x_bp, boundary_check=(0,), padding_option="zero" + ) + + kidx = kd * K * K + kh * K + kw + w_bp = tl.make_block_ptr( + base=w_ptr + kidx * C_IN * C_out, + shape=(C_IN, C_out), + strides=(C_out, 1), + offsets=(0, pid_oc * BLOCK_OC), + block_shape=(C_IN, BLOCK_OC), + order=(1, 0), + ) + w_tile = tl.load( + w_bp, boundary_check=(1,), padding_option="zero" + ) + + acc += tl.dot(x_tile, w_tile) + + d_out = 2 * d_idx + 1 + h_out = 2 * h_idx + 1 + w_out = 2 * offs_w_idx + 1 + out_base = out_ptr + b.to(tl.int64) * so_b + d_out * so_d + h_out * so_h + out_ptrs = out_base + w_out[:, None] * so_w + offs_oc[None, :] + out_mask = (offs_w_idx[:, None] < W_act) & (offs_oc[None, :] < C_out) + tl.store(out_ptrs, acc.to(tl.float16), mask=out_mask) + + +@triton.jit +def _zero_kernel(out_ptr, N, BLOCK: tl.constexpr): + pid = tl.program_id(0) + offs = pid * BLOCK + tl.arange(0, BLOCK) + mask = 
offs < N + tl.store(out_ptr + offs, tl.zeros((BLOCK,), dtype=tl.float16), mask=mask) + + +class Model(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + padding: int = 0, + dilation: int = 1, + bias: bool = False, + ): + super(Model, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.has_bias = bias + self.conv_transpose3d = nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size=(kernel_size, kernel_size, kernel_size), + stride=stride, + padding=padding, + dilation=dilation, + bias=bias, + ) + self._packed = False + self._buf_allocated = False + + def _pack_weights(self): + w = self.conv_transpose3d.weight.data + K = self.kernel_size + self.weight_packed = ( + w.permute(2, 3, 4, 0, 1) + .reshape(K * K * K, self.in_channels, self.out_channels) + .contiguous() + .to(dtype=torch.float16) + ) + self._packed = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not self._packed: + self._pack_weights() + + device = x.device + B, C_in, D, H, W = x.shape + K = self.kernel_size + + D_out = _compute_output_size(D, K, self.stride, self.padding, self.dilation) + H_out = _compute_output_size(H, K, self.stride, self.padding, self.dilation) + W_out = _compute_output_size(W, K, self.stride, self.padding, self.dilation) + + D_act = D_out // 2 + H_act = H_out // 2 + W_act = W_out // 2 + + x_cl3d = x.to(dtype=torch.float16).contiguous( + memory_format=torch.channels_last_3d + ) + + if not self._buf_allocated or self._output_buf.shape[0] != B: + self._output_buf = torch.empty( + B, + self.out_channels, + D_out, + H_out, + W_out, + device=device, + dtype=torch.float16, + ).contiguous(memory_format=torch.channels_last_3d) + self._buf_allocated = True + + output = self._output_buf + N = output.numel() + BLOCK = 1024 + _zero_kernel[(triton.cdiv(N, 
BLOCK),)](output, N, BLOCK=BLOCK) + + sx = x_cl3d.stride() + so = output.stride() + + grid = lambda META: ( + triton.cdiv(W_act, META["BLOCK_W"]), + B * D_act * H_act, + triton.cdiv(self.out_channels, META["BLOCK_OC"]), + ) + + _conv_transpose3d_v5[grid]( + x_cl3d, + self.weight_packed, + output, + D, + H, + W, + self.out_channels, + D_out, + H_out, + W_out, + D_act, + H_act, + W_act, + sx[0], + sx[2], + sx[3], + sx[4], + so[0], + so[2], + so[3], + so[4], + K=K, + C_IN=C_in, + ) + + return output + + +batch_size = 16 +in_channels = 32 +out_channels = 64 +kernel_size = 3 +depth = 16 +height = 32 +width = 32 +stride = 2 +padding = 1 +dilation = 2 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, depth, height, width) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size, stride, padding, dilation] diff --git a/backends/triton/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py b/backends/triton/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py new file mode 100644 index 0000000..854892d --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py @@ -0,0 +1,244 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import torch.nn.functional as F +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config( + { + "BLOCK_OW": 64, + "BLOCK_N": 32, + "BLOCK_K": 32, + "GROUP_SIZE_M": 8, + "grf_mode": "128", + }, + num_warps=4, + num_stages=3, + ), + ], + key=["H", "W", "C_IN", "C_out", "OH", "OW"], +) +@triton.jit +def _conv_transpose2d_swizzled_v2( + x_ptr, + w_ptr, + y_ptr, + N_batch, + H, + W, + C_out, + OH, + OW, + stride_wkh, + stride_wkw, + 
stride_wci, + stride_wco, + BLOCK_OW: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + KH: tl.constexpr, + KW: tl.constexpr, + C_IN: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + grf_mode: tl.constexpr, +): + n = tl.program_id(1) + pid = tl.program_id(0) + + num_ow_tiles = tl.cdiv(OW, BLOCK_OW) + num_n_tiles = tl.cdiv(C_out, BLOCK_N) + num_m_tiles = OH * num_ow_tiles + + group_id = pid // (GROUP_SIZE_M * num_n_tiles) + first_m_in_group = group_id * GROUP_SIZE_M + group_size = min(num_m_tiles - first_m_in_group, GROUP_SIZE_M) + local_id = pid % (group_size * num_n_tiles) + m_local = local_id // num_n_tiles + pid_n = local_id % num_n_tiles + m_idx = first_m_in_group + m_local + + oh = m_idx // num_ow_tiles + pid_ow = m_idx % num_ow_tiles + ow0 = pid_ow * BLOCK_OW + + HW = H * W + OHOW = OH * OW + + acc = tl.zeros((BLOCK_OW, BLOCK_N), dtype=tl.float32) + + for kh in range(KH): + for kw in range(KW): + x_row_start = n * HW + (oh + kh) * W + (ow0 + kw) + x_valid_rows = W - (ow0 + kw) + + x_bp = tl.make_block_ptr( + base=x_ptr, + shape=(x_row_start + x_valid_rows, C_IN), + strides=(C_IN, 1), + offsets=(x_row_start, 0), + block_shape=(BLOCK_OW, BLOCK_K), + order=(1, 0), + ) + + w_bp = tl.make_block_ptr( + base=w_ptr + kh * stride_wkh + kw * stride_wkw, + shape=(C_IN, C_out), + strides=(stride_wci, stride_wco), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_K, BLOCK_N), + order=(1, 0), + ) + + for c0 in range(0, C_IN, BLOCK_K): + x_tile = tl.load(x_bp, boundary_check=(0, 1), padding_option="zero") + w_tile = tl.load(w_bp, boundary_check=(0, 1), padding_option="zero") + acc = tl.dot(x_tile, w_tile, acc) + x_bp = tl.advance(x_bp, (0, BLOCK_K)) + w_bp = tl.advance(w_bp, (BLOCK_K, 0)) + + y_row_start = n * OHOW + oh * OW + ow0 + y_valid_rows = OW - ow0 + y_bp = tl.make_block_ptr( + base=y_ptr, + shape=(y_row_start + y_valid_rows, C_out), + strides=(C_out, 1), + offsets=(y_row_start, pid_n * BLOCK_N), + block_shape=(BLOCK_OW, BLOCK_N), + order=(1, 0), + ) 
+ tl.store(y_bp, acc.to(tl.float16), boundary_check=(0, 1)) + + +class Model(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + groups=1, + bias=False, + ): + super().__init__() + self.conv_transpose2d = nn.ConvTranspose2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + bias=bias, + ) + self._packed = False + self._cached_x_ptr = None + + def _to_pair(self, x): + return (x, x) if isinstance(x, int) else tuple(x) + + def _pack_weights(self): + device = torch.device("cpu") + weight = self.conv_transpose2d.weight.data.detach() + self.w_hwio = ( + weight.flip(2, 3) + .permute(2, 3, 0, 1) + .to(device, dtype=torch.float16) + .contiguous() + ) + self.KH = weight.shape[2] + self.KW = weight.shape[3] + self.C_in_val = weight.shape[0] + self.C_out_val = weight.shape[1] + # Compute padding for flip+pad conversion + pad = self._to_pair(self.conv_transpose2d.padding) + self._pad_h = self.KH - 1 - pad[0] + self._pad_w = self.KW - 1 - pad[1] + self._packed = True + + def forward(self, x): + device = torch.device("cpu") + if not self._packed: + self._pack_weights() + + x = x.to(device, dtype=torch.float16) + N, C_in, H, W_in = x.shape + KH, KW = self.KH, self.KW + C_out = self.C_out_val + pad_h, pad_w = self._pad_h, self._pad_w + + if self._cached_x_ptr != x.data_ptr(): + x_pad = F.pad(x, (pad_w, pad_w, pad_h, pad_h)) + x_pad_cl = x_pad.contiguous(memory_format=torch.channels_last) + self._x_nhwc = x_pad_cl.permute(0, 2, 3, 1) + self._cached_x_ptr = x.data_ptr() + self._H_pad = H + 2 * pad_h + self._W_pad = W_in + 2 * pad_w + self._OH = H + 2 * pad_h - KH + 1 + self._OW = W_in + 2 * pad_w - KW + 1 + self._N = N + self._y = torch.empty( + (N, C_out, self._OH, self._OW), + device=device, + dtype=torch.float16, + memory_format=torch.channels_last, + ) + self._y_nhwc = self._y.permute(0, 2, 3, 1) + + OH, OW = self._OH, 
self._OW + H_pad, W_pad = self._H_pad, self._W_pad + + def grid(META): + num_ow = triton.cdiv(OW, META["BLOCK_OW"]) + num_n = triton.cdiv(C_out, META["BLOCK_N"]) + return (OH * num_ow * num_n, N) + + _conv_transpose2d_swizzled_v2[grid]( + self._x_nhwc, + self.w_hwio, + self._y_nhwc, + N, + H_pad, + W_pad, + C_out, + OH, + OW, + self.w_hwio.stride(0), + self.w_hwio.stride(1), + self.w_hwio.stride(2), + self.w_hwio.stride(3), + KH=KH, + KW=KW, + C_IN=C_in, + ) + + return self._y + + +batch_size = 8 +in_channels = 32 +out_channels = 32 +kernel_size = (3, 7) +height = 512 +width = 1024 +stride = (1, 1) +padding = (1, 3) + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, height, width) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels, kernel_size, stride, padding] diff --git a/backends/triton/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py b/backends/triton/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py new file mode 100644 index 0000000..84f13ff --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py @@ -0,0 +1,180 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_L": 128}, num_warps=4, num_stages=2), + ], + key=["N_NONZERO"], +) +@triton.jit +def _conv_transpose1d_kernel( + inp_ptr, + w_ptr, + out_ptr, + L_IN, + L_OUT, + N_NONZERO, + IN_CHANNELS: tl.constexpr, + OUT_CHANNELS: tl.constexpr, + KERNEL_SIZE: tl.constexpr, + BLOCK_L: tl.constexpr, + BLOCK_K: tl.constexpr, + BLOCK_OC: tl.constexpr, +): + pid = 
class Model(nn.Module):
    """Transposed 1-D convolution backed by ``_conv_transpose1d_kernel``.

    The kernel only supports ``stride=2, padding=1, dilation=2``.  With that
    geometry every contribution lands on an odd output position
    (``o = 2 * (i + k) - 1``), so the kernel writes the odd positions only and
    the even ones stay zero.  It also reads one ``BLOCK_K``-wide slice of input
    channels and writes one ``BLOCK_OC``-wide slice of output channels without
    looping over channel tiles.  Configurations outside those limits are
    rejected at construction time instead of silently producing wrong values.

    Computation runs on CPU in float16.  The packed weights are built lazily
    on the first ``forward`` and then reused.  The returned tensor is a view of
    a buffer that is reused between calls with the same output shape.
    """

    # Channel tile sizes handed to the kernel.
    _BLOCK_K = 32
    _BLOCK_OC = 64
    # (stride, padding, dilation) assumed by the kernel's index arithmetic.
    _SUPPORTED_GEOMETRY = (2, 1, 2)

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        dilation=1,
        bias=False,
    ):
        """Create the reference layer and validate the supported configuration.

        Raises:
            NotImplementedError: if the geometry or channel counts are outside
                what the kernel can compute correctly.
        """
        super().__init__()
        if (stride, padding, dilation) != self._SUPPORTED_GEOMETRY:
            raise NotImplementedError(
                "kernel only supports stride=2, padding=1, dilation=2; got "
                f"stride={stride!r}, padding={padding!r}, dilation={dilation!r}"
            )
        if in_channels > self._BLOCK_K or out_channels > self._BLOCK_OC:
            raise NotImplementedError(
                f"kernel handles at most {self._BLOCK_K} input and "
                f"{self._BLOCK_OC} output channels; got {in_channels} and "
                f"{out_channels}"
            )
        self.conv1d_transpose = nn.ConvTranspose1d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride_val = stride
        self.padding_val = padding
        self.dilation_val = dilation
        self._packed = False

    def _pack_weights(self):
        """Cache the weights as a contiguous float16 ``(K, C_in, C_out)`` tensor."""
        device = torch.device("cpu")
        w = self.conv1d_transpose.weight.data.detach()
        self.weight_kio = w.permute(2, 0, 1).contiguous().to(device, torch.float16)
        self._packed = True

    def forward(self, x):
        """Apply the transposed convolution to ``x`` of shape ``(B, C_in, L_in)``.

        Returns:
            A float16 tensor of shape ``(B, C_out, L_out)``.
        """
        if not self._packed:
            self._pack_weights()

        device = torch.device("cpu")
        x = x.to(device, torch.float16)
        B, IC, L_in = x.shape
        L_out = (
            (L_in - 1) * self.stride_val
            - 2 * self.padding_val
            + self.dilation_val * (self.kernel_size - 1)
            + 1
        )
        N_nonzero = L_out // 2

        # Re-layout on every call.  Caching this by data_ptr() is unsafe: the
        # float16 copy above is a temporary, so a later call can get a new
        # tensor at the same address with different contents.
        x_cl = x.permute(0, 2, 1).contiguous()

        out_shape = (B, L_out, self.out_channels)
        if getattr(self, "_out_shape", None) != out_shape:
            # Zero-filled once: the kernel overwrites every odd position on
            # each call and never touches the even ones.
            self._out_cl = torch.zeros(out_shape, device=device, dtype=torch.float16)
            self._out_view = self._out_cl.permute(0, 2, 1)
            self._out_shape = out_shape

        grid = lambda META: (B * triton.cdiv(N_nonzero, META["BLOCK_L"]),)

        _conv_transpose1d_kernel[grid](
            x_cl,
            self.weight_kio,
            self._out_cl,
            L_in,
            L_out,
            N_nonzero,
            IN_CHANNELS=IC,
            OUT_CHANNELS=self.out_channels,
            KERNEL_SIZE=self.kernel_size,
            BLOCK_K=self._BLOCK_K,
            BLOCK_OC=self._BLOCK_OC,
        )

        bias = self.conv1d_transpose.bias
        if bias is not None:
            # Added out of place so the zero-initialised buffer stays clean.
            return self._out_view + bias.detach().to(device, torch.float16).view(1, -1, 1)
        return self._out_view
@triton.autotune(
    configs=[
        triton.Config(
            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64, "GROUP_SIZE_M": 4},
            num_warps=32,
            num_stages=2,
        ),
    ],
    key=["M", "N", "K"],
)
@triton.jit
def _matmul_small_k_kernel(
    a_ptr,
    b_ptr,
    c_ptr,
    M,
    N,
    K,
    stride_am,
    stride_ak: tl.constexpr,
    stride_bk,
    stride_bn: tl.constexpr,
    stride_cm,
    stride_cn: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    # Each program computes one (BLOCK_M, BLOCK_N) tile of C = A @ B, where A
    # is (M, K), B is (K, N) and C is (M, N), addressed through the strides
    # passed in.  The launch grid is 1-D with one program per output tile.

    # Map the linear program id to a (row-tile, column-tile) pair using the
    # grouped ordering implemented by swizzle_tile.
    pid = tl.program_id(0)
    pid_m, pid_n = swizzle_tile(pid, M, N, BLOCK_M, BLOCK_N, GROUP_SIZE_M)

    # Descriptors for the two operands, loaded one block at a time.
    # NOTE(review): tiles that run past M, N or K rely on the descriptor's
    # out-of-bounds behaviour; confirm against the backend in use.
    a_desc = tl.make_tensor_descriptor(
        base=a_ptr,
        shape=(M, K),
        strides=(stride_am, stride_ak),
        block_shape=(BLOCK_M, BLOCK_K),
    )
    b_desc = tl.make_tensor_descriptor(
        base=b_ptr,
        shape=(K, N),
        strides=(stride_bk, stride_bn),
        block_shape=(BLOCK_K, BLOCK_N),
    )

    # Accumulate in float32 regardless of the input element type.
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for off_k in range(0, K, BLOCK_K):
        a_tile = a_desc.load([pid_m * BLOCK_M, off_k])
        b_tile = b_desc.load([off_k, pid_n * BLOCK_N])
        acc += tl.dot(a_tile, b_tile)
    c_desc = tl.make_tensor_descriptor(
        base=c_ptr,
        shape=(M, N),
        strides=(stride_cm, stride_cn),
        block_shape=(BLOCK_M, BLOCK_N),
    )
    # Cast the accumulator to C's element type before storing the tile.
    c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty))
+ M, K = A.shape + _, N = B.shape + + C = torch.empty((M, N), device=A.device, dtype=A.dtype) + + grid = lambda META: ( + triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]), + ) + + _matmul_small_k_kernel[grid]( + A, + B, + C, + M, + N, + K, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(1), + C.stride(0), + C.stride(1), + ) + return C + + +M = 16384 * 2 +N = 16384 * 2 +K = 32 * 2 + + +def get_inputs(): + A = torch.rand(M, K, dtype=torch.bfloat16) + B = torch.rand(K, N, dtype=torch.bfloat16) + return [A, B] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py b/backends/triton/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py new file mode 100644 index 0000000..2494939 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py @@ -0,0 +1,245 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +def _to_pair(x): + return (x, x) if isinstance(x, int) else tuple(x) + + +@triton.autotune( + configs=[ + triton.Config( + { + "BLOCK_M": 64, + "BLOCK_N": 64, + "BLOCK_K": 32, + "GROUP_M": 8, + "grf_mode": "128", + }, + num_warps=4, + num_stages=3, + ), + ], + key=["C_IN", "C_out", "OH", "OW"], +) +@triton.jit +def _conv2d_dilated_gemm( + x_ptr, + w_ptr, + y_ptr, + N_batch, + H_in, + W_in, + C_out, + OH, + OW, + sxn, + sxh, + sxw, + swkh, + swkw, + swci, + swco, + syn, + syh, + syw, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, + PAD_H: tl.constexpr, + PAD_W: tl.constexpr, + DIL_H: tl.constexpr, + DIL_W: tl.constexpr, + KH: 
class Model(nn.Module):
    """Dilated, padded 2-D convolution computed by ``_conv2d_dilated_gemm``.

    The kernel evaluates the convolution at unit stride.  Larger strides are
    handled by computing that dense result and keeping every ``stride``-th row
    and column, which matches a strided convolution exactly but does more work
    than strictly needed.

    Weights are repacked to ``(KH, KW, C_in, C_out)`` on the first ``forward``
    and the packed copy is reused afterwards.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=(0, 0),
        dilation=(1, 1),
        bias=False,
    ):
        """Build the reference ``nn.Conv2d`` and record its geometry as pairs."""
        super(Model, self).__init__()
        self.conv2d = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )
        self._stride = _to_pair(stride)
        self._padding = _to_pair(padding)
        self._dilation = _to_pair(dilation)
        self._packed = False

    def _pack_weights(self):
        """Cache the weights as a contiguous ``(KH, KW, C_in, C_out)`` tensor."""
        w = self.conv2d.weight.data
        self.w_packed = w.permute(2, 3, 1, 0).contiguous()
        self._packed = True

    def forward(self, x):
        """Convolve ``x`` of shape ``(B, C_in, H, W)``.

        Returns:
            A tensor of shape ``(B, C_out, OH, OW)`` with the dtype of ``x``.
        """
        if not self._packed:
            self._pack_weights()

        B, C_in, H_in, W_in = x.shape
        C_out = self.conv2d.weight.shape[0]
        KH, KW = self.conv2d.weight.shape[2], self.conv2d.weight.shape[3]
        pad_h, pad_w = self._padding
        dil_h, dil_w = self._dilation
        stride_h, stride_w = self._stride

        # Output extent at unit stride; the stride is applied after the kernel.
        OH = (H_in + 2 * pad_h - dil_h * (KH - 1) - 1) + 1
        OW = (W_in + 2 * pad_w - dil_w * (KW - 1) - 1) + 1

        x_cl = x.contiguous(memory_format=torch.channels_last)
        y = torch.empty(
            B,
            C_out,
            OH,
            OW,
            device=x.device,
            dtype=x.dtype,
            memory_format=torch.channels_last,
        )

        sx = x_cl.stride()
        sy = y.stride()
        sw = self.w_packed.stride()

        M_total = B * OH * OW
        grid = lambda meta: (
            triton.cdiv(M_total, meta["BLOCK_M"]) * triton.cdiv(C_out, meta["BLOCK_N"]),
        )

        _conv2d_dilated_gemm[grid](
            x_cl,
            self.w_packed,
            y,
            B,
            H_in,
            W_in,
            C_out,
            OH,
            OW,
            sx[0],
            sx[2],
            sx[3],
            sw[0],
            sw[1],
            sw[2],
            sw[3],
            sy[0],
            sy[2],
            sy[3],
            PAD_H=pad_h,
            PAD_W=pad_w,
            DIL_H=dil_h,
            DIL_W=dil_w,
            KH=KH,
            KW=KW,
            C_IN=C_in,
        )

        if stride_h != 1 or stride_w != 1:
            # A strided convolution equals the unit-stride result sampled at
            # multiples of the stride.
            y = y[:, :, ::stride_h, ::stride_w].contiguous(
                memory_format=torch.channels_last
            )

        if self.conv2d.bias is not None:
            y = y + self.conv2d.bias.view(1, -1, 1, 1)
        return y
dilation] diff --git a/backends/triton/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py b/backends/triton/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py new file mode 100644 index 0000000..b282b7b --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py @@ -0,0 +1,242 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +def _to_pair(x): + return (x, x) if isinstance(x, int) else tuple(x) + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_W": 16, "BLOCK_OC": 32}, num_warps=4, num_stages=3), + ], + key=["W_IN", "C_out"], +) +@triton.jit +def _conv_transpose2d_scatter( + x_ptr, + w_ptr, + out_ptr, + H_IN, + W_IN, + C_out, + OH, + OW, + sx_b, + sx_h, + sx_w, + so_b, + so_h, + so_w, + BLOCK_W: tl.constexpr, + BLOCK_OC: tl.constexpr, + KH: tl.constexpr, + KW: tl.constexpr, + C_IN: tl.constexpr, + STRIDE_H: tl.constexpr, + STRIDE_W: tl.constexpr, + DIL_H: tl.constexpr, + DIL_W: tl.constexpr, + PAD_H: tl.constexpr, + PAD_W: tl.constexpr, +): + pid_w = tl.program_id(0) + pid_bh = tl.program_id(1) + pid_oc = tl.program_id(2) + + b = pid_bh // H_IN + ih = pid_bh % H_IN + + offs_iw = pid_w * BLOCK_W + tl.arange(0, BLOCK_W) + offs_oc = pid_oc * BLOCK_OC + tl.arange(0, BLOCK_OC) + + x_base = x_ptr + b.to(tl.int64) * sx_b + ih * sx_h + x_bp = tl.make_block_ptr( + base=x_base, + shape=(W_IN, C_IN), + strides=(sx_w, 1), + offsets=(pid_w * BLOCK_W, 0), + block_shape=(BLOCK_W, C_IN), + order=(1, 0), + ) + x_tile = tl.load(x_bp, boundary_check=(0,), padding_option="zero") + + out_batch_base = out_ptr + b.to(tl.int64) * so_b + 
class Model(nn.Module):
    """Transposed 2-D convolution computed by ``_conv_transpose2d_scatter``.

    The kernel writes each kernel tap's contribution with a plain store, not
    an accumulation.  That is only correct when no two (input position, tap)
    pairs map to the same output element, so ``forward`` rejects geometries
    where taps overlap.

    Computation runs in float16.  Packed weights are built on the first
    ``forward`` and reused.  The output buffer is cached and reused while the
    output shape and device stay the same.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        dilation=1,
        bias=False,
    ):
        """Build the reference layer and record its geometry as ``(h, w)`` pairs."""
        super().__init__()
        self.conv_transpose2d = nn.ConvTranspose2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = _to_pair(kernel_size)
        self.stride_val = _to_pair(stride)
        self.padding_val = _to_pair(padding)
        self.dilation_val = _to_pair(dilation)
        self._packed = False

    @staticmethod
    def _taps_overlap(kernel, stride, dilation, extent):
        """Return True if two taps can hit the same output index on one axis.

        Input ``i`` with tap ``k`` lands on ``i * stride + k * dilation``
        (before padding is removed).  Two distinct pairs collide exactly when
        some tap distance ``d`` in ``1..kernel-1`` makes ``d * dilation`` a
        multiple of ``stride`` and the resulting input shift fits inside
        ``extent`` input positions.
        """
        return any(
            (d * dilation) % stride == 0 and (d * dilation) // stride < extent
            for d in range(1, kernel)
        )

    def _pack_weights(self):
        """Cache weights as a contiguous float16 ``(KH*KW, C_in, C_out)`` tensor."""
        w = self.conv_transpose2d.weight.data
        KH, KW = self.kernel_size
        self.weight_packed = (
            w.permute(2, 3, 0, 1)
            .reshape(KH * KW, self.in_channels, self.out_channels)
            .contiguous()
            .to(dtype=torch.float16)
        )
        self._packed = True

    def forward(self, x):
        """Apply the transposed convolution to ``x`` of shape ``(B, C_in, H, W)``.

        Returns:
            A tensor of shape ``(B, C_out, OH, OW)``.

        Raises:
            NotImplementedError: if kernel taps would overlap in the output.
        """
        if not self._packed:
            self._pack_weights()

        device = x.device
        B, C_in, H_in, W_in = x.shape
        KH, KW = self.kernel_size
        SH, SW = self.stride_val
        PH, PW = self.padding_val
        DH, DW = self.dilation_val

        if self._taps_overlap(KH, SH, DH, H_in) or self._taps_overlap(KW, SW, DW, W_in):
            raise NotImplementedError(
                "kernel taps overlap for this stride/dilation/kernel size; the "
                "store-based scatter kernel would overwrite contributions"
            )

        OH = (H_in - 1) * SH - 2 * PH + DH * (KH - 1) + 1
        OW = (W_in - 1) * SW - 2 * PW + DW * (KW - 1) + 1

        x_cl = x.to(dtype=torch.float16).contiguous(memory_format=torch.channels_last)

        # Reallocate whenever any output dimension or the device changes, not
        # only the batch size.
        out_shape = (B, self.out_channels, OH, OW)
        cached = getattr(self, "_output", None)
        if cached is None or tuple(cached.shape) != out_shape or cached.device != device:
            self._output = torch.empty(
                B,
                self.out_channels,
                OH,
                OW,
                device=device,
                dtype=torch.float16,
            ).contiguous(memory_format=torch.channels_last)

        output = self._output
        N_elem = output.numel()
        # Positions no tap reaches must read as zero, so clear the reused
        # buffer before scattering.
        _zero_kernel[(triton.cdiv(N_elem, 1024),)](output, N_elem, BLOCK=1024)

        sx = x_cl.stride()
        so = output.stride()

        grid = lambda META: (
            triton.cdiv(W_in, META["BLOCK_W"]),
            B * H_in,
            triton.cdiv(self.out_channels, META["BLOCK_OC"]),
        )

        _conv_transpose2d_scatter[grid](
            x_cl,
            self.weight_packed,
            output,
            H_in,
            W_in,
            self.out_channels,
            OH,
            OW,
            sx[0],
            sx[2],
            sx[3],
            so[0],
            so[2],
            so[3],
            KH=KH,
            KW=KW,
            C_IN=C_in,
            STRIDE_H=SH,
            STRIDE_W=SW,
            DIL_H=DH,
            DIL_W=DW,
            PAD_H=PH,
            PAD_W=PW,
        )

        if self.conv_transpose2d.bias is not None:
            output = output + self.conv_transpose2d.bias.view(1, -1, 1, 1)

        return output
class Model(nn.Module):
    """Depthwise 2-D convolution (one square filter per channel).

    The ``_depthwise_conv2d`` kernel computes an unpadded, unit-stride
    convolution.  ``forward`` adds the remaining ``nn.Conv2d`` semantics around
    it: zero padding is applied to the input beforehand, a stride is applied by
    subsampling the dense result, and the bias (if any) is added at the end.
    """

    def __init__(
        self,
        in_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        bias: bool = False,
    ):
        """Build the reference depthwise ``nn.Conv2d`` that owns the parameters."""
        super(Model, self).__init__()
        self.conv2d = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            groups=in_channels,
            bias=bias,
        )
        self._packed = False

    def _pack_weights(self):
        """Cache the filters as a contiguous ``(C, KH, KW)`` tensor."""
        self._dw_w = self.conv2d.weight.squeeze(1).contiguous()
        self._packed = True

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Convolve ``x`` of shape ``(N, C, H, W)`` channel by channel.

        Raises:
            NotImplementedError: if the layer was built with a string padding
                mode.
        """
        if not self._packed:
            self._pack_weights()

        conv = self.conv2d
        if isinstance(conv.padding, str):
            raise NotImplementedError("string padding modes are not supported")
        pad_h, pad_w = conv.padding
        stride_h, stride_w = conv.stride

        if pad_h or pad_w:
            # The kernel has no bounds handling for padding, so pad explicitly.
            x = nn.functional.pad(x, (pad_w, pad_w, pad_h, pad_h))

        w = self._dw_w.to(dtype=x.dtype)
        N, C, H, W_dim = x.shape
        KH = conv.weight.shape[2]
        KW = conv.weight.shape[3]
        # Dense (unit-stride) output extent of the padded input.
        OH = H - KH + 1
        OW = W_dim - KW + 1
        OHOW = OH * OW

        x = x.contiguous()
        y = torch.empty((N, C, OH, OW), device=x.device, dtype=x.dtype)

        grid = lambda meta: (triton.cdiv(OHOW, meta["BLOCK_SIZE"]), N * C)
        _depthwise_conv2d[grid](x, w, y, C, H, W_dim, OH, OW, OHOW, KH=KH, KW=KW)

        if stride_h != 1 or stride_w != 1:
            # A strided convolution is the dense result sampled every
            # `stride` rows and columns.
            y = y[:, :, ::stride_h, ::stride_w].contiguous()
        if conv.bias is not None:
            y = y + conv.bias.view(1, -1, 1, 1).to(dtype=y.dtype)
        return y
class Model(nn.Module):
    """Depthwise 2-D convolution with a ``(kernel_size, 1)`` filter per channel.

    The ``_depthwise_conv2d`` kernel computes an unpadded, unit-stride,
    undilated convolution.  ``forward`` applies zero padding beforehand,
    applies the stride by subsampling the dense result, and adds the bias.
    Dilation cannot be emulated this way, so a non-unit dilation with a kernel
    taller than one row is rejected at construction.
    """

    def __init__(
        self,
        in_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        dilation: int = 1,
        bias: bool = False,
    ):
        """Build the reference layer.

        Raises:
            NotImplementedError: if ``dilation`` would change the result
                (height dilation other than 1 with ``kernel_size > 1``).
        """
        super(Model, self).__init__()
        self.conv2d = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size=(kernel_size, 1),
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=in_channels,
            bias=bias,
        )
        # The filter is one column wide, so only height dilation matters.
        if self.conv2d.dilation[0] != 1 and kernel_size > 1:
            raise NotImplementedError(
                "the depthwise kernel does not support dilation != 1"
            )
        self.in_channels = in_channels
        self.kernel_size = kernel_size
        self._packed = False

    def _pack_weights(self):
        """Cache the filters as a contiguous ``(C, KH, KW)`` tensor."""
        self._dw_w = self.conv2d.weight.squeeze(1).contiguous()
        self._packed = True

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Convolve ``x`` of shape ``(N, C, H, W)`` channel by channel.

        Raises:
            NotImplementedError: if the layer was built with a string padding
                mode.
        """
        if not self._packed:
            self._pack_weights()

        conv = self.conv2d
        if isinstance(conv.padding, str):
            raise NotImplementedError("string padding modes are not supported")
        pad_h, pad_w = conv.padding
        stride_h, stride_w = conv.stride

        if pad_h or pad_w:
            # The kernel has no bounds handling for padding, so pad explicitly.
            x = nn.functional.pad(x, (pad_w, pad_w, pad_h, pad_h))

        w = self._dw_w.to(dtype=x.dtype)
        N, C, H, W_dim = x.shape
        KH = conv.weight.shape[2]
        KW = conv.weight.shape[3]
        # Dense (unit-stride) output extent; KW is 1, so OW equals W_dim.
        OH = H - KH + 1
        OW = W_dim - KW + 1
        OHOW = OH * OW

        x = x.contiguous()
        y = torch.empty((N, C, OH, OW), device=x.device, dtype=x.dtype)

        grid = lambda meta: (triton.cdiv(OHOW, meta["BLOCK_SIZE"]), N * C)
        _depthwise_conv2d[grid](x, w, y, C, H, W_dim, OH, OW, OHOW, KH=KH, KW=KW)

        if stride_h != 1 or stride_w != 1:
            y = y[:, :, ::stride_h, ::stride_w].contiguous()
        if conv.bias is not None:
            y = y + conv.bias.view(1, -1, 1, 1).to(dtype=y.dtype)
        return y
class Model(nn.Module):
    """Depthwise 2-D convolution with ``groups=in_channels``.

    The ``_depthwise_conv2d`` kernel computes an unpadded, unit-stride
    convolution with one filter per channel.  ``forward`` supplies the rest of
    the ``nn.Conv2d`` semantics:

    * ``out_channels`` that is a multiple of ``in_channels`` is handled by
      repeating each input channel once per filter that reads it;
    * zero padding is applied to the input beforehand;
    * a stride is applied by subsampling the dense result;
    * the bias, if present, is added at the end.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        bias: bool = False,
    ):
        """Build the reference grouped ``nn.Conv2d`` that owns the parameters."""
        super(Model, self).__init__()
        self.conv2d = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=(kernel_size, kernel_size),
            stride=stride,
            padding=padding,
            groups=in_channels,
            bias=bias,
        )
        self._packed = False

    def _pack_weights(self):
        """Cache the filters as a contiguous ``(C_out, KH, KW)`` tensor."""
        self._dw_w = self.conv2d.weight.squeeze(1).contiguous()
        self._packed = True

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Convolve ``x`` of shape ``(N, C_in, H, W)``.

        Raises:
            NotImplementedError: if the layer was built with a string padding
                mode.
        """
        if not self._packed:
            self._pack_weights()

        conv = self.conv2d
        if isinstance(conv.padding, str):
            raise NotImplementedError("string padding modes are not supported")
        pad_h, pad_w = conv.padding
        stride_h, stride_w = conv.stride

        # With groups == in_channels, output channel o reads input channel
        # o // multiplier; repeating channels makes that mapping one-to-one.
        multiplier = conv.out_channels // conv.in_channels
        if multiplier != 1:
            x = x.repeat_interleave(multiplier, dim=1)
        if pad_h or pad_w:
            x = nn.functional.pad(x, (pad_w, pad_w, pad_h, pad_h))

        w = self._dw_w.to(dtype=x.dtype)
        N, C, H, W_dim = x.shape
        KH = conv.weight.shape[2]
        KW = conv.weight.shape[3]
        # Dense (unit-stride) output extent of the padded input.
        OH = H - KH + 1
        OW = W_dim - KW + 1
        OHOW = OH * OW

        x = x.contiguous()
        y = torch.empty((N, C, OH, OW), device=x.device, dtype=x.dtype)

        grid = lambda meta: (triton.cdiv(OHOW, meta["BLOCK_SIZE"]), N * C)
        _depthwise_conv2d[grid](x, w, y, C, H, W_dim, OH, OW, OHOW, KH=KH, KW=KW)

        if stride_h != 1 or stride_w != 1:
            y = y[:, :, ::stride_h, ::stride_w].contiguous()
        if conv.bias is not None:
            y = y + conv.bias.view(1, -1, 1, 1).to(dtype=y.dtype)
        return y
class Model(nn.Module):
    """Depthwise 2-D convolution with per-axis kernel, stride, padding, dilation.

    The reference layer is always built with ``in_channels`` outputs and
    ``groups=in_channels``; the ``out_channels`` and ``groups`` arguments are
    accepted but not used.  The ``_depthwise_conv2d`` kernel handles stride,
    padding and dilation itself; the bias, if present, is added afterwards.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size_h,
        kernel_size_w,
        stride_h=1,
        stride_w=1,
        padding_h=0,
        padding_w=0,
        dilation_h=1,
        dilation_w=1,
        groups=1,
        bias=False,
    ):
        """Build the reference layer and remember the per-axis geometry."""
        super(Model, self).__init__()
        self.conv2d = nn.Conv2d(
            in_channels,
            in_channels,
            (kernel_size_h, kernel_size_w),
            stride=(stride_h, stride_w),
            padding=(padding_h, padding_w),
            dilation=(dilation_h, dilation_w),
            groups=in_channels,
            bias=bias,
        )
        self._stride_h = stride_h
        self._stride_w = stride_w
        self._pad_h = padding_h
        self._pad_w = padding_w
        self._dil_h = dilation_h
        self._dil_w = dilation_w
        self._packed = False

    def _pack_weights(self):
        """Cache the filters as a contiguous ``(C, KH, KW)`` tensor."""
        self._dw_w = self.conv2d.weight.squeeze(1).contiguous()
        self._packed = True

    def forward(self, x):
        """Convolve ``x`` of shape ``(N, C, H, W)`` channel by channel."""
        if not self._packed:
            self._pack_weights()
        w = self._dw_w.to(dtype=x.dtype)
        N, C, H, W_dim = x.shape
        KH = self.conv2d.weight.shape[2]
        KW = self.conv2d.weight.shape[3]
        # Standard convolution output-size formula, one axis at a time.
        OH = (H + 2 * self._pad_h - self._dil_h * (KH - 1) - 1) // self._stride_h + 1
        OW = (
            W_dim + 2 * self._pad_w - self._dil_w * (KW - 1) - 1
        ) // self._stride_w + 1
        OHOW = OH * OW

        x = x.contiguous()
        y = torch.empty((N, C, OH, OW), device=x.device, dtype=x.dtype)

        grid = lambda meta: (triton.cdiv(OHOW, meta["BLOCK_SIZE"]), N * C)
        _depthwise_conv2d[grid](
            x,
            w,
            y,
            C,
            H,
            W_dim,
            OH,
            OW,
            OHOW,
            stride_h=self._stride_h,
            stride_w=self._stride_w,
            pad_h=self._pad_h,
            pad_w=self._pad_w,
            dil_h=self._dil_h,
            dil_w=self._dil_w,
            KH=KH,
            KW=KW,
        )

        bias = self.conv2d.bias
        if bias is not None:
            # The kernel never reads the bias, so apply it here.
            y = y + bias.view(1, -1, 1, 1).to(dtype=y.dtype)
        return y
def get_init_inputs():
    """Return the positional constructor arguments for ``Model``.

    The order is: channels, kernel size, stride, padding, dilation (each as
    an H/W pair) and finally ``groups``, all read from the module-level
    benchmark constants.
    """
    channel_args = [in_channels, out_channels]
    kernel_args = [kernel_size_h, kernel_size_w]
    stride_args = [stride_h, stride_w]
    padding_args = [padding_h, padding_w]
    dilation_args = [dilation_h, dilation_w]
    return (
        channel_args
        + kernel_args
        + stride_args
        + padding_args
        + dilation_args
        + [groups]
    )
class Model(nn.Module):
    """Depthwise-separable 2-D convolution fused into one dense convolution.

    The depthwise and pointwise weights are multiplied into a single
    ``(KH, KW, C_in, C_out)`` filter bank, and ``_conv2d_fused_kernel`` runs
    one unit-stride convolution over the zero-padded input.  On top of that:

    * a stride is applied by subsampling the dense result;
    * the two biases are folded into one per-output-channel bias
      (``W_pw @ b_dw + b_pw``) and added at the end;
    * non-unit dilation is rejected, because the kernel reads adjacent taps.

    Computation runs in float32.  The packed weights are built on the first
    ``forward`` and reused.  The result is written into a buffer that is
    reused while the dense output shape is unchanged.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        dilation=1,
        bias=False,
    ):
        """Build the two reference layers.

        Raises:
            NotImplementedError: if ``dilation`` is not 1 on both axes.
        """
        super(Model, self).__init__()
        self.depthwise = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=in_channels,
            bias=bias,
        )
        self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=bias)
        if tuple(self.depthwise.dilation) != (1, 1):
            raise NotImplementedError("the fused kernel does not support dilation != 1")
        self._padding = int(padding)
        self._packed = False
        self._y_buf = None

    def _pack_weights(self):
        """Fuse and cache the weights (and biases, when present)."""
        dw_w = self.depthwise.weight.data
        pw_w = self.pointwise.weight.data
        # (C_out, C_in, 1, 1) * (1, C_in, KH, KW) -> (C_out, C_in, KH, KW)
        combined = pw_w * dw_w.transpose(0, 1)
        self.w_hwio = combined.to(dtype=torch.float32).permute(2, 3, 1, 0).contiguous()

        if self.pointwise.bias is not None:
            # pointwise(depthwise(x) + b_dw) = pointwise(depthwise(x))
            #                                  + W_pw @ b_dw + b_pw
            pw_matrix = pw_w.flatten(1).to(dtype=torch.float32)
            dw_bias = self.depthwise.bias.data.to(dtype=torch.float32)
            pw_bias = self.pointwise.bias.data.to(dtype=torch.float32)
            self._fused_bias = pw_matrix @ dw_bias + pw_bias
        else:
            self._fused_bias = None
        self._packed = True

    def forward(self, x):
        """Apply the separable convolution to ``x`` of shape ``(B, C_in, H, W)``.

        Returns:
            A float32 tensor of shape ``(B, C_out, OH, OW)``.
        """
        if not self._packed:
            self._pack_weights()

        x = x.to(dtype=torch.float32)

        B, C_in, H, W_dim = x.shape
        KH, KW = self.depthwise.kernel_size
        C_out = self.pointwise.out_channels
        pad = self._padding

        if pad > 0:
            x = F.pad(x, (pad, pad, pad, pad))

        H_pad, W_pad = x.shape[2], x.shape[3]
        # Dense (unit-stride) output extent of the padded input.
        OH = H_pad - KH + 1
        OW = W_pad - KW + 1
        M_total = B * OH * OW

        x_cl = x.contiguous(memory_format=torch.channels_last)
        x_nhwc = x_cl.permute(0, 2, 3, 1)

        if self._y_buf is None or self._y_buf.shape != (B, C_out, OH, OW):
            self._y_buf = torch.empty(
                (B, C_out, OH, OW),
                device=x.device,
                dtype=torch.float32,
                memory_format=torch.channels_last,
            )
        y = self._y_buf
        y_nhwc = y.permute(0, 2, 3, 1)

        grid = lambda META: (
            triton.cdiv(M_total, META["BLOCK_M"]),
            triton.cdiv(C_out, META["BLOCK_N"]),
        )

        _conv2d_fused_kernel[grid](
            x_nhwc,
            self.w_hwio,
            y_nhwc,
            M_total,
            C_out,
            OH,
            OW,
            x_nhwc.stride(0),
            x_nhwc.stride(1),
            x_nhwc.stride(2),
            self.w_hwio.stride(0),
            self.w_hwio.stride(1),
            self.w_hwio.stride(2),
            self.w_hwio.stride(3),
            KH=KH,
            KW=KW,
            C_IN=C_in,
        )

        out = y
        stride_h, stride_w = self.depthwise.stride
        if stride_h != 1 or stride_w != 1:
            # The pointwise stage is 1x1, so striding the fused dense result
            # matches striding the depthwise stage.
            out = out[:, :, ::stride_h, ::stride_w]
        if self._fused_bias is not None:
            out = out + self._fused_bias.view(1, -1, 1, 1)
        return out
GROUP_M: tl.constexpr, +): + pid = tl.program_id(0) + bid = tl.program_id(1) + + num_pid_m = tl.cdiv(M, BLOCK_M) + num_pid_n = tl.cdiv(N, BLOCK_N) + num_pid_in_group = GROUP_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_M + group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_M) + pid_in_group = pid % num_pid_in_group + pid_m = first_pid_m + (pid_in_group % group_size_m) + pid_n = pid_in_group // group_size_m + + x_base = x_ptr + bid.to(tl.int64) * stride_xb + y_base = y_ptr + bid.to(tl.int64) * stride_yb + + W_bp = tl.make_block_ptr( + base=w_ptr, + shape=(M, K), + strides=(stride_wm, stride_wk), + offsets=(pid_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_K), + order=(1, 0), + ) + X_bp = tl.make_block_ptr( + base=x_base, + shape=(K, N), + strides=(stride_xk, 1), + offsets=(0, pid_n * BLOCK_N), + block_shape=(BLOCK_K, BLOCK_N), + order=(1, 0), + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for _ in range(0, K, BLOCK_K): + w = tl.load(W_bp, boundary_check=(0, 1), padding_option="zero") + x = tl.load(X_bp, boundary_check=(0, 1), padding_option="zero") + acc = tl.dot(w, x, acc) + W_bp = tl.advance(W_bp, (0, BLOCK_K)) + X_bp = tl.advance(X_bp, (BLOCK_K, 0)) + + Y_bp = tl.make_block_ptr( + base=y_base, + shape=(M, N), + strides=(stride_ym, 1), + offsets=(pid_m * BLOCK_M, pid_n * BLOCK_N), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0), + ) + tl.store(Y_bp, acc.to(y_ptr.dtype.element_ty), boundary_check=(0, 1)) + + +class Model(nn.Module): + def __init__(self, in_channels: int, out_channels: int, bias: bool = False): + super(Model, self).__init__() + self.conv1d = nn.Conv2d( + in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=bias + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, C_in, H, W = x.shape + C_out = self.conv1d.weight.shape[0] + weight = self.conv1d.weight.squeeze(-1).squeeze(-1).contiguous() + N_spatial = H * W + + BLOCK_M = 128 + BLOCK_N = 128 + BLOCK_K = 64 + 
GROUP_M = 8 + + out = torch.empty((B, C_out, H, W), device=x.device, dtype=x.dtype) + + grid = ( + triton.cdiv(C_out, BLOCK_M) * triton.cdiv(N_spatial, BLOCK_N), + B, + ) + _pointwise_gemm[grid]( + x, + weight, + out, + C_out, + N_spatial, + C_in, + x.stride(0), + x.stride(1), + weight.stride(0), + weight.stride(1), + out.stride(0), + out.stride(1), + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + BLOCK_K=BLOCK_K, + GROUP_M=GROUP_M, + num_warps=8, + num_stages=1, + ) + + if self.conv1d.bias is not None: + out = out + self.conv1d.bias.view(1, -1, 1, 1) + return out + + +batch_size = 8 +in_channels = 64 +out_channels = 128 +width = 1024 +height = 1024 + + +def get_inputs(): + x = torch.rand(batch_size, in_channels, height, width) + return [x] + + +def get_init_inputs(): + return [in_channels, out_channels] diff --git a/backends/triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py b/backends/triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py new file mode 100644 index 0000000..bde7264 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py @@ -0,0 +1,65 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl +from triton.language.extra.cpu import libdevice + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + ], + key=["n_elements"], +) +@triton.jit +def _gelu_kernel( + x_ptr, + out_ptr, + n_elements, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(0) + offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offs < n_elements + + x = tl.load(x_ptr + offs, mask=mask, other=0.0) + + x_f32 = x.to(tl.float32) + inner = x_f32 + 0.044715 * x_f32 * x_f32 * x_f32 + t = libdevice.tanh(0.7978845608028654 * inner) + result = (0.5 * x_f32 * (1.0 + t)).to(x.dtype) + + tl.store(out_ptr 
+ offs, result, mask=mask) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x): + x_flat = x.contiguous().view(-1) + n_elements = x_flat.numel() + output = torch.empty_like(x_flat) + + grid = lambda META: (triton.cdiv(n_elements, META["BLOCK_SIZE"]),) + _gelu_kernel[grid](x_flat, output, n_elements) + + return output.view(x.shape) + + +batch_size = 8192 +dim = 8192 + + +def get_inputs(): + return [torch.rand(batch_size, dim)] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/89_cumsum.py b/backends/triton/cpu/KernelBench/level1/89_cumsum.py new file mode 100644 index 0000000..34e0a1e --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/89_cumsum.py @@ -0,0 +1,90 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.jit +def scan_add_op(a, b): + return a + b + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 256}, num_warps=4, num_stages=1), + ], + key=["N"], +) +@triton.jit +def cumsum_kernel( + x_ptr, + out_ptr, + M, + N, + stride_xm, + stride_om, + BLOCK_SIZE: tl.constexpr, +): + row_idx = tl.program_id(0) + x_row = x_ptr + row_idx * stride_xm + o_row = out_ptr + row_idx * stride_om + + running_total = 0.0 + + for block_start in tl.range(0, N, BLOCK_SIZE): + offsets = block_start + tl.arange(0, BLOCK_SIZE) + mask = offsets < N + + x = tl.load(x_row + offsets, mask=mask, other=0.0).to(tl.float32) + + scanned = tl.associative_scan(x, axis=0, combine_fn=scan_add_op) + result = scanned + running_total + + running_total = running_total + tl.sum(x, axis=0) + + tl.store(o_row + offsets, result, mask=mask) + + +def kernel_function(x): + M, N = x.shape + out = torch.empty_like(x) + assert x.stride(1) == 1 + + 
grid = (M,) + cumsum_kernel[grid]( + x, + out, + M, + N, + x.stride(0), + out.stride(0), + ) + return out + + +batch_size = 32768 +input_shape = (32768,) +dim = 1 + + +def get_inputs(): + return [torch.rand(batch_size, *input_shape)] + + +def get_init_inputs(): + return [dim] + + +class Model(nn.Module): + def __init__(self, dim): + super(Model, self).__init__() + self.dim = dim + + def forward(self, x): + return kernel_function(x) diff --git a/backends/triton/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py b/backends/triton/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py new file mode 100644 index 0000000..ad4bb8a --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py @@ -0,0 +1,128 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +def get_autotune_configs(): + return [ + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, + num_warps=32, + num_stages=2, + ), + ] + + +@triton.autotune( + configs=get_autotune_configs(), + key=["M", "N", "K"], +) +@triton.jit +def matmul_kernel( + a_ptr, + b_ptr, + c_ptr, + M, + N, + K, + stride_am: tl.constexpr, + stride_ak: tl.constexpr, + stride_bk: tl.constexpr, + stride_bn: tl.constexpr, + stride_cm: tl.constexpr, + stride_cn: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + pid = tl.program_id(0) + num_pid_m = tl.cdiv(M, BLOCK_M) + num_pid_n = tl.cdiv(N, BLOCK_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n 
= (pid % num_pid_in_group) // group_size_m + + a_desc = tl.make_tensor_descriptor( + a_ptr, + shape=[M, K], + strides=[stride_am, stride_ak], + block_shape=[BLOCK_M, BLOCK_K], + ) + b_desc = tl.make_tensor_descriptor( + b_ptr, + shape=[K, N], + strides=[stride_bk, stride_bn], + block_shape=[BLOCK_K, BLOCK_N], + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + + for off_k in range(0, K, BLOCK_K): + a = a_desc.load([pid_m * BLOCK_M, off_k]) + b = b_desc.load([off_k, pid_n * BLOCK_N]) + acc += tl.dot(a, b) + + c_desc = tl.make_tensor_descriptor( + c_ptr, + shape=[M, N], + strides=[stride_cm, stride_cn], + block_shape=[BLOCK_M, BLOCK_N], + ) + c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty)) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: + A = A.contiguous() + B = B.contiguous() + M, K = A.shape + _, N = B.shape + C = torch.empty((M, N), device=A.device, dtype=A.dtype) + + grid = lambda META: ( + triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]), + ) + matmul_kernel[grid]( + A, + B, + C, + M, + N, + K, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(1), + C.stride(0), + C.stride(1), + ) + return C + + +M = 8205 +K = 2949 +N = 5921 + + +def get_inputs(): + A = torch.rand(M, K, dtype=torch.bfloat16) + B = torch.rand(K, N, dtype=torch.bfloat16) + return [A, B] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/90_cumprod.py b/backends/triton/cpu/KernelBench/level1/90_cumprod.py new file mode 100644 index 0000000..e0d9e1e --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/90_cumprod.py @@ -0,0 +1,94 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import 
import triton
import triton.language as tl


@triton.jit
def _mul_combine(a, b):
    """Binary combine step (multiplication) shared by the scan and the reduction."""
    return a * b


@triton.autotune(
    configs=[
        triton.Config({"BLOCK_SIZE": 512}, num_warps=4),
    ],
    key=["N"],
)
@triton.jit
def _cumprod_kernel(
    x_ptr,
    out_ptr,
    M,
    N,
    stride_m,
    stride_n,
    BLOCK_SIZE: tl.constexpr,
):
    """Inclusive cumulative product along each row; one program per row.

    The row is walked in chunks of ``BLOCK_SIZE`` columns and
    ``running_prod`` carries the product of all earlier chunks forward.

    NOTE: ``stride_m`` / ``stride_n`` are applied to BOTH ``x_ptr`` and
    ``out_ptr``, so the caller must pass two buffers with the same layout.
    ``M`` is accepted but not read by the kernel body.
    """
    row = tl.program_id(0)
    # 64-bit row offset so large tensors do not wrap the pointer arithmetic.
    row_off = row.to(tl.int64) * stride_m

    running_prod = 1.0

    for col_start in range(0, N, BLOCK_SIZE):
        cols = col_start + tl.arange(0, BLOCK_SIZE)
        mask = cols < N

        # Padding lanes load 1.0, the multiplicative identity.
        x = tl.load(x_ptr + row_off + cols * stride_n, mask=mask, other=1.0).to(
            tl.float32
        )

        cum = tl.associative_scan(x, 0, _mul_combine)
        cum = cum * running_prod

        tl.store(out_ptr + row_off + cols * stride_n, cum, mask=mask)

        block_product = tl.reduce(x, 0, _mul_combine)
        running_prod = running_prod * block_product


def cumprod_triton(x, dim):
    """Return the inclusive cumulative product of ``x`` along ``dim``.

    Args:
        x: Input tensor with at least one dimension.
        dim: Axis to scan. Any valid axis (including negative indices) is
            accepted; ``dim == 1`` on a 2-D tensor behaves as before.

    Returns:
        A tensor with the shape and dtype of ``x``.
    """
    if x.numel() == 0:
        return torch.empty_like(x)

    # Bring the scanned axis last and flatten the rest into rows.
    moved = x.movedim(dim, -1)
    N = moved.shape[-1]
    # The kernel addresses input and output with ONE set of strides, so both
    # buffers are forced to the same contiguous layout here. (Previously a
    # non-dense ``x`` made ``empty_like`` return a differently-strided output.)
    rows = moved.reshape(-1, N).contiguous()
    M = rows.shape[0]
    out = torch.empty_like(rows)

    grid = (M,)
    _cumprod_kernel[grid](
        rows,
        out,
        M,
        N,
        rows.stride(0),
        rows.stride(1),
    )
    return out.view(moved.shape).movedim(-1, dim)


class Model(nn.Module):
    """Cumulative product along a fixed dimension, computed by the Triton kernel."""

    def __init__(self, dim):
        super(Model, self).__init__()
        self.dim = dim

    def forward(self, x):
        """Return the cumulative product of ``x`` along ``self.dim``."""
        return cumprod_triton(x, self.dim)


batch_size = 32768
input_shape = (32768,)
dim = 1


def get_inputs():
    """Return the benchmark input: one random ``(batch_size, *input_shape)`` tensor."""
    return [torch.rand(batch_size, *input_shape)]


def get_init_inputs():
    """Return the constructor arguments for ``Model``."""
    return [dim]


# [patch metadata] diff --git a/backends/triton/cpu/KernelBench/level1/91_cumsum_reverse.py b/backends/triton/cpu/KernelBench/level1/91_cumsum_reverse.py
# [patch metadata] new file mode 100644
# [patch metadata] index 0000000..3d78bb9
# [patch metadata] --- /dev/null
# [patch metadata] +++ b/backends/triton/cpu/KernelBench/level1/91_cumsum_reverse.py
# [patch metadata] @@ -0,0 +1,91 @@
# ruff: noqa: E731
# AUTOGENERATED KERNEL (LLM)
# Source: LLM-generated candidate implementation
# Status: Experimental / uncurated
# Expectation: Correctness-first, performance not representative

import torch
import torch.nn as nn
import triton
import triton.language as tl


@triton.autotune(
    configs=[
        triton.Config({"BLOCK_N": 256}, num_warps=4),
    ],
    key=["N"],
)
@triton.jit
def reverse_cumsum_kernel(
    x_ptr,
    out_ptr,
    M,
    N: tl.constexpr,
    stride_xm,
    stride_xn,
    stride_om,
    stride_on,
    BLOCK_N: tl.constexpr,
):
    """Suffix (right-to-left) cumulative sum of each row; one program per row.

    Two left-to-right passes are used: the first accumulates the row total,
    the second emits ``total - exclusive_prefix`` for every element.
    ``M`` is accepted but not read by the kernel body.
    """
    row = tl.program_id(0)
    x_base = x_ptr + row.to(tl.int64) * stride_xm
    out_base = out_ptr + row.to(tl.int64) * stride_om

    # Pass 1: compute total sum of the row
    total = 0.0
    for col_start in range(0, N, BLOCK_N):
        cols = col_start + tl.arange(0, BLOCK_N)
        mask = cols < N
        x = tl.load(x_base + cols * stride_xn, mask=mask, other=0.0).to(tl.float32)
        total = total + tl.sum(x, axis=0)

    # Pass 2: compute suffix sums (left-to-right)
    # suffix[i] = total - exclusive_prefix_sum[i]
    # where exclusive_prefix[i] = running_prefix + cumsum_inc[local_i] - x[local_i]
    running_prefix = 0.0
    for col_start in range(0, N, BLOCK_N):
        cols = col_start + tl.arange(0, BLOCK_N)
        mask = cols < N
        x = tl.load(x_base + cols * stride_xn, mask=mask, other=0.0).to(tl.float32)
        cumsum_inc = tl.cumsum(x, axis=0)
        suffix = total - running_prefix - cumsum_inc + x
        tl.store(out_base + cols * stride_on, suffix, mask=mask)
        running_prefix = running_prefix + tl.sum(x, axis=0)


class Model(nn.Module):
    """Reverse cumulative sum along a fixed dimension."""

    def __init__(self, dim):
        super(Model, self).__init__()
        self.dim = dim

    def forward(self, x):
        """Return the reverse cumulative sum of ``x`` along ``self.dim``.

        The scanned axis is moved last and the remaining axes are flattened
        into rows, so ``self.dim`` is honoured for inputs of any rank
        (previously the scan always ran over axis 1 of a 2-D tensor).
        """
        if x.numel() == 0:
            return torch.empty_like(x)

        moved = x.movedim(self.dim, -1)
        N = moved.shape[-1]
        rows = moved.reshape(-1, N).contiguous()
        M = rows.shape[0]
        output = torch.empty_like(rows)

        grid = (M,)
        reverse_cumsum_kernel[grid](
            rows,
            output,
            M,
            N,
            rows.stride(0),
            rows.stride(1),
            output.stride(0),
            output.stride(1),
        )
        return output.view(moved.shape).movedim(-1, self.dim)


batch_size = 32768
input_shape = (32768,)
dim = 1


def get_inputs():
    """Return the benchmark input: one random ``(batch_size, *input_shape)`` tensor."""
    return [torch.rand(batch_size, *input_shape)]


def get_init_inputs():
    """Return the constructor arguments for ``Model``."""
    return [dim]


# [patch metadata] diff --git a/backends/triton/cpu/KernelBench/level1/92_cumsum_exclusive.py b/backends/triton/cpu/KernelBench/level1/92_cumsum_exclusive.py
# [patch metadata] new file mode 100644
# [patch metadata] index 0000000..94e17ba
# [patch metadata] --- /dev/null
# [patch metadata] +++
# [patch metadata] b/backends/triton/cpu/KernelBench/level1/92_cumsum_exclusive.py
# [patch metadata] @@ -0,0 +1,88 @@
# ruff: noqa: E731
# AUTOGENERATED KERNEL (LLM)
# Source: LLM-generated candidate implementation
# Status: Experimental / uncurated
# Expectation: Correctness-first, performance not representative

import torch
import torch.nn as nn
import triton
import triton.language as tl


@triton.autotune(
    configs=[
        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2),
    ],
    key=["N"],
)
@triton.jit
def exclusive_cumsum_kernel(
    x_ptr,
    out_ptr,
    M,
    N,
    stride_xm,
    stride_xn,
    stride_om,
    stride_on,
    BLOCK_SIZE: tl.constexpr,
):
    """Exclusive cumulative sum of each row; one program per row.

    ``out[row, j]`` is the sum of ``x[row, :j]`` (so ``out[row, 0] == 0``).
    Input and output are addressed with their own row/column strides.
    ``M`` is accepted but not read by the kernel body.
    """
    row_idx = tl.program_id(0)

    x_row_ptr = x_ptr + row_idx.to(tl.int64) * stride_xm
    o_row_ptr = out_ptr + row_idx.to(tl.int64) * stride_om

    # Sum of all columns handled by earlier chunks of this row.
    running_sum = tl.zeros([1], dtype=tl.float32)

    for col_start in range(0, N, BLOCK_SIZE):
        col_offsets = col_start + tl.arange(0, BLOCK_SIZE)
        mask = col_offsets < N

        x = tl.load(x_row_ptr + col_offsets * stride_xn, mask=mask, other=0.0).to(
            tl.float32
        )

        # Inclusive scan minus the element itself gives the exclusive scan.
        inclusive = tl.cumsum(x, axis=0)
        exclusive = inclusive - x

        result = exclusive + running_sum

        tl.store(o_row_ptr + col_offsets * stride_on, result, mask=mask)

        running_sum += tl.sum(x, axis=0)


class Model(nn.Module):
    """Exclusive cumulative sum along a fixed dimension."""

    def __init__(self, dim):
        super(Model, self).__init__()
        self.dim = dim

    def forward(self, x):
        """Return the exclusive cumulative sum of ``x`` along ``self.dim``.

        The scanned axis is moved last and the remaining axes are flattened
        into rows, so ``self.dim`` is honoured for inputs of any rank
        (previously the scan always ran over axis 1 of a 2-D tensor). For a
        2-D input with ``dim == 1`` the launch is unchanged.
        """
        if x.numel() == 0:
            return torch.empty_like(x)

        moved = x.movedim(self.dim, -1)
        N = moved.shape[-1]
        rows = moved.reshape(-1, N)
        M = rows.shape[0]
        out = torch.empty_like(rows)

        grid = (M,)
        exclusive_cumsum_kernel[grid](
            rows,
            out,
            M,
            N,
            rows.stride(0),
            rows.stride(1),
            out.stride(0),
            out.stride(1),
        )
        # reshape (not view): ``out`` mirrors the strides of ``rows`` and
        # may therefore be non-contiguous.
        return out.reshape(moved.shape).movedim(-1, self.dim)


batch_size = 32768
input_shape = (32768,)
dim = 1


def get_inputs():
    """Return the benchmark input: one random ``(batch_size, *input_shape)`` tensor."""
    return [torch.rand(batch_size, *input_shape)]


def get_init_inputs():
    """Return the constructor arguments for ``Model``."""
    return [dim]


# [patch metadata] diff --git a/backends/triton/cpu/KernelBench/level1/93_masked_cumsum.py b/backends/triton/cpu/KernelBench/level1/93_masked_cumsum.py
# [patch metadata] new file mode 100644
# [patch metadata] index 0000000..b440482
# [patch metadata] --- /dev/null
# [patch metadata] +++
# [patch metadata] b/backends/triton/cpu/KernelBench/level1/93_masked_cumsum.py
# [patch metadata] @@ -0,0 +1,89 @@
# ruff: noqa: E731
# AUTOGENERATED KERNEL (LLM)
# Source: LLM-generated candidate implementation
# Status: Experimental / uncurated
# Expectation: Correctness-first, performance not representative

import torch
import torch.nn as nn
import triton
import triton.language as tl


@triton.jit
def scan_add_op(a, b):
    """Binary combine step (addition) for ``tl.associative_scan``."""
    return a + b


@triton.autotune(
    configs=[
        triton.Config({"BLOCK_SIZE": 512}, num_warps=4, num_stages=1),
    ],
    key=["N"],
)
@triton.jit
def cumsum_kernel(
    x_ptr,
    out_ptr,
    M,
    N,
    stride_xm,
    stride_om,
    BLOCK_SIZE: tl.constexpr,
):
    """Inclusive cumulative sum of each row; one program per row.

    Columns are addressed with unit stride (no column stride is passed), so
    the caller must supply buffers whose last dimension is contiguous.
    ``M`` is accepted but not read by the kernel body.
    """
    row_idx = tl.program_id(0)
    # 64-bit row offsets: ``row_idx * stride`` in 32-bit wraps once the
    # tensor holds more than 2**31 elements.
    x_row = x_ptr + row_idx.to(tl.int64) * stride_xm
    o_row = out_ptr + row_idx.to(tl.int64) * stride_om

    running_total = 0.0

    for block_start in tl.range(0, N, BLOCK_SIZE):
        offsets = block_start + tl.arange(0, BLOCK_SIZE)
        col_mask = offsets < N

        x = tl.load(x_row + offsets, mask=col_mask, other=0.0).to(tl.float32)

        scanned = tl.associative_scan(x, axis=0, combine_fn=scan_add_op)
        result = scanned + running_total

        # Carry this chunk's total into the next chunk.
        running_total = running_total + tl.sum(x, axis=0)

        tl.store(o_row + offsets, result.to(out_ptr.dtype.element_ty), mask=col_mask)


class Model(nn.Module):
    """Cumulative sum of ``x * mask`` along a fixed dimension."""

    def __init__(self, dim):
        super(Model, self).__init__()
        self.dim = dim

    def forward(self, x, mask):
        """Return the cumulative sum of the masked input along ``self.dim``.

        Args:
            x: Input tensor.
            mask: Tensor multiplied element-wise with ``x`` (both are cast to
                float32 first), so zero entries drop the matching element.

        Returns:
            A float32 tensor with the shape of ``x * mask``.
        """
        masked = x.float() * mask.float()
        if masked.numel() == 0:
            return torch.empty_like(masked)

        # Move the scanned axis last and flatten the rest into rows so that
        # ``self.dim`` is honoured (previously axis 1 of a 2-D tensor was
        # always scanned). ``contiguous`` gives the unit column stride the
        # kernel relies on.
        moved = masked.movedim(self.dim, -1)
        N = moved.shape[-1]
        rows = moved.reshape(-1, N).contiguous()
        M = rows.shape[0]
        out = torch.empty_like(rows)

        grid = (M,)
        cumsum_kernel[grid](
            rows,
            out,
            M,
            N,
            rows.stride(0),
            out.stride(0),
        )
        return out.view(moved.shape).movedim(-1, self.dim)


batch_size = 32768
input_shape = (32768,)
dim = 1


def get_inputs():
    """Return the benchmark inputs: a random tensor and a random boolean mask."""
    x = torch.rand(batch_size, *input_shape)
    mask = torch.randint(0, 2, x.shape).bool()
    return [x, mask]


def get_init_inputs():
    """Return the constructor arguments for ``Model``."""
    return [dim]


# [patch metadata] diff --git a/backends/triton/cpu/KernelBench/level1/94_MSELoss.py b/backends/triton/cpu/KernelBench/level1/94_MSELoss.py
# [patch metadata] new file mode 100644
# [patch metadata] index
# [patch metadata] 0000000..5111369
# [patch metadata] --- /dev/null
# [patch metadata] +++ b/backends/triton/cpu/KernelBench/level1/94_MSELoss.py
# [patch metadata] @@ -0,0 +1,107 @@
# ruff: noqa: E731
# AUTOGENERATED KERNEL (LLM)
# Source: LLM-generated candidate implementation
# Status: Experimental / uncurated
# Expectation: Correctness-first, performance not representative

import torch
import torch.nn as nn
import triton
import triton.language as tl


def get_autotune_configs():
    """Return the autotune candidates for ``mse_row_kernel`` (a single config)."""
    configs = [
        triton.Config(
            {"BLOCK_SIZE": 1024},
            num_warps=4,
            num_stages=1,
        )
    ]
    return configs


@triton.autotune(
    configs=get_autotune_configs(),
    key=["N_COLS"],
)
@triton.jit
def mse_row_kernel(
    pred_ptr,
    target_ptr,
    row_sums_ptr,
    N_ROWS,
    N_COLS,
    stride_pred_row,
    stride_pred_col,
    stride_target_row,
    stride_target_col,
    BLOCK_SIZE: tl.constexpr,
):
    """Write the sum of squared differences of one row to ``row_sums_ptr[row]``.

    One program per row; the row is walked in chunks of ``BLOCK_SIZE``
    columns. ``N_ROWS`` is accepted but not read by the kernel body.
    """
    row_idx = tl.program_id(0)

    # 64-bit row offsets: ``row_idx * stride`` in 32-bit wraps once the
    # tensor holds more than 2**31 elements.
    pred_row_start = pred_ptr + row_idx.to(tl.int64) * stride_pred_row
    target_row_start = target_ptr + row_idx.to(tl.int64) * stride_target_row

    acc = 0.0

    for col_start in tl.range(0, N_COLS, BLOCK_SIZE):
        cols = col_start + tl.arange(0, BLOCK_SIZE)
        mask = cols < N_COLS

        # Padding lanes load 0 on both sides, so they contribute 0.
        pred_vals = tl.load(
            pred_row_start + cols * stride_pred_col, mask=mask, other=0.0
        ).to(tl.float32)
        target_vals = tl.load(
            target_row_start + cols * stride_target_col, mask=mask, other=0.0
        ).to(tl.float32)

        diff = pred_vals - target_vals
        acc += tl.sum(diff * diff, axis=0)

    tl.store(row_sums_ptr + row_idx, acc)


class Model(nn.Module):
    """Mean squared error over all elements of two 2-D tensors."""

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets):
        """Return the mean of ``(predictions - targets) ** 2`` as a scalar tensor.

        Both inputs are unpacked as ``(rows, cols)``; ``targets`` is read
        with the row count and column count of ``predictions``.
        """
        predictions = predictions.contiguous()
        targets = targets.contiguous()

        N_ROWS, N_COLS = predictions.shape

        # One partial sum per row; the final reduction is done by torch.
        row_sums = torch.empty(N_ROWS, device=predictions.device, dtype=torch.float32)

        grid = (N_ROWS,)
        mse_row_kernel[grid](
            predictions,
            targets,
            row_sums,
            N_ROWS,
            N_COLS,
            predictions.stride(0),
            predictions.stride(1),
            targets.stride(0),
            targets.stride(1),
        )

        return torch.sum(row_sums) / (N_ROWS * N_COLS)


batch_size = 32768
input_shape = (32768,)
dim = 1


def get_inputs():
    """Return the benchmark inputs: randomly scaled predictions and random targets."""
    scale = torch.rand(())
    return [
        torch.rand(batch_size, *input_shape) * scale,
        torch.rand(batch_size, *input_shape),
    ]


def get_init_inputs():
    """Return the constructor arguments for ``Model`` (none)."""
    return []


# [patch metadata] diff --git a/backends/triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py b/backends/triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py
# [patch metadata] new file mode 100644
# [patch metadata] index 0000000..6a9f8e8
# [patch metadata] --- /dev/null
# [patch metadata] +++ b/backends/triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py
# [patch metadata] @@ -0,0 +1,107 @@
# ruff: noqa: E731
# AUTOGENERATED KERNEL (LLM)
# Source: LLM-generated candidate implementation
# Status: Experimental / uncurated
# Expectation: Correctness-first, performance not representative

import torch
import torch.nn as nn
import triton
import triton.language as tl


def _ce_configs():
    """Return the autotune grid: 3 block sizes x 3 warp counts x 2 ``warp_size`` values."""
    configs = []
    for BN in [1024, 2048, 4096]:
        for nw in [4, 8, 16]:
            for ws in [16, 32]:
                configs.append(
                    triton.Config({"BLOCK_N": BN, "warp_size": ws}, num_warps=nw)
                )
    return configs


@triton.autotune(configs=_ce_configs(), key=["N"])
@triton.jit
def _cross_entropy_online_kernel(
    logits_ptr,
    targets_ptr,
    losses_ptr,
    N,
    stride_lm,
    stride_ln,
    BLOCK_N: tl.constexpr,
    warp_size: tl.constexpr,
):
    """Write the cross-entropy loss of one row of logits to ``losses_ptr[row]``.

    The log-sum-exp is computed in a single streaming pass: a running
    maximum is kept and the running sum is rescaled whenever it grows.

    NOTE(review): ``warp_size`` is not read anywhere in this body; it is
    only supplied by the autotune configs - confirm whether the backend
    consumes it.
    """
    row = tl.program_id(0)
    row_off = row.to(tl.int64) * stride_lm

    # exp(x) == exp2(x * LOG2E) and log(x) == log2(x) * LN2.
    LOG2E: tl.constexpr = 1.4426950408889634
    LN2: tl.constexpr = 0.6931471805599453

    running_max = -float("inf")
    running_sum = 0.0

    for start in range(0, N, BLOCK_N):
        col_offs = start + tl.arange(0, BLOCK_N)
        mask = col_offs < N
        # Padding lanes load -inf so they contribute exp(-inf) == 0.
        x = tl.load(
            logits_ptr + row_off + col_offs * stride_ln, mask=mask, other=-float("inf")
        ).to(tl.float32)

        block_max = tl.max(x, axis=0)
        new_max = tl.maximum(running_max, block_max)

        # Rescale the old sum to the new maximum, then add this chunk.
        running_sum = running_sum * tl.math.exp2(
            (running_max - new_max) * LOG2E
        ) + tl.sum(tl.math.exp2((x - new_max) * LOG2E), axis=0)
        running_max = new_max

    log_sum_exp = tl.math.log2(running_sum) * LN2

    target = tl.load(targets_ptr + row).to(tl.int64)
    target_logit = tl.load(logits_ptr + row_off + target * stride_ln).to(tl.float32)

    # loss = logsumexp(row) - row[target]
    loss = -target_logit + running_max + log_sum_exp
    tl.store(losses_ptr + row, loss)


class Model(nn.Module):
    """Mean cross-entropy loss over the rows of a 2-D logits tensor."""

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets):
        """Return the mean per-row cross-entropy as a scalar tensor.

        Args:
            predictions: 2-D logits, unpacked as ``(M, N)``.
            targets: One class index per row; it is used directly as a
                column offset, so it must lie in ``[0, N)``.
        """
        predictions = predictions.contiguous()
        targets = targets.contiguous()

        M, N = predictions.shape
        losses = torch.empty(M, device=predictions.device, dtype=torch.float32)

        grid = (M,)
        _cross_entropy_online_kernel[grid](
            predictions,
            targets,
            losses,
            N,
            predictions.stride(0),
            predictions.stride(1),
        )

        return losses.mean()


batch_size = 32768
num_classes = 4096
input_shape = (num_classes,)
dim = 1


def get_inputs():
    """Return the benchmark inputs: random logits and random class indices."""
    return [
        torch.rand(batch_size, *input_shape),
        torch.randint(0, num_classes, (batch_size,)),
    ]


def get_init_inputs():
    """Return the constructor arguments for ``Model`` (none)."""
    return []


# [patch metadata] diff --git a/backends/triton/cpu/KernelBench/level1/96_HuberLoss.py b/backends/triton/cpu/KernelBench/level1/96_HuberLoss.py
# [patch metadata] new file mode 100644
# [patch metadata] index 0000000..30e4887
# [patch metadata] --- /dev/null
# [patch metadata] +++ b/backends/triton/cpu/KernelBench/level1/96_HuberLoss.py
# [patch metadata] @@ -0,0 +1,97 @@
# ruff: noqa: E731
# AUTOGENERATED KERNEL (LLM)
# Source: LLM-generated candidate implementation
# Status: Experimental / uncurated
# Expectation: Correctness-first, performance not representative

import torch
import torch.nn as nn
import triton
import triton.language as tl


@triton.autotune(
    configs=[
        triton.Config({"BLOCK_SIZE": 4096}, num_warps=4),
    ],
    key=["n_cols"],
)
@triton.jit
def smooth_l1_row_kernel(
    predictions_ptr,
    targets_ptr,
    row_sums_ptr,
    n_cols,
    stride_pred,
    stride_targ,
    BLOCK_SIZE: tl.constexpr,
):
    """Write the summed smooth-L1 loss of one row to ``row_sums_ptr[row]``.

    Per element the loss is ``0.5 * d * d`` when ``|d| < 1`` and
    ``|d| - 0.5`` otherwise, with ``d = prediction - target``. Columns are
    addressed with unit stride, so both inputs must be row-contiguous.
    """
    row_idx = tl.program_id(0)

    # 64-bit row offsets: ``row_idx * stride`` in 32-bit wraps once the
    # tensor holds more than 2**31 elements.
    pred_row_start = row_idx.to(tl.int64) * stride_pred
    targ_row_start = row_idx.to(tl.int64) * stride_targ

    row_sum = 0.0

    for col_start in range(0, n_cols, BLOCK_SIZE):
        col_offsets = col_start + tl.arange(0, BLOCK_SIZE)
        mask = col_offsets < n_cols

        # Padding lanes load 0 on both sides -> diff 0 -> loss 0.
        pred = tl.load(
            predictions_ptr + pred_row_start + col_offsets, mask=mask, other=0.0
        ).to(tl.float32)
        targ = tl.load(
            targets_ptr + targ_row_start + col_offsets, mask=mask, other=0.0
        ).to(tl.float32)

        diff = pred - targ
        abs_diff = tl.abs(diff)
        loss = tl.where(abs_diff < 1.0, 0.5 * diff * diff, abs_diff - 0.5)

        row_sum += tl.sum(loss, axis=0)

    tl.store(row_sums_ptr + row_idx, row_sum.to(tl.float32))


class Model(nn.Module):
    """Mean smooth-L1 loss over all elements of two equally shaped tensors."""

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets):
        """Return the mean smooth-L1 loss as a scalar tensor.

        The inputs are treated as a row-major ``(n_rows, n_cols)`` matrix:
        the leading dimension gives the rows and all trailing dimensions are
        folded into the columns; inputs with fewer than two dimensions form
        a single row. ``targets`` is assumed to have the same shape as
        ``predictions``.
        """
        predictions = predictions.contiguous()
        targets = targets.contiguous()

        n_elements = predictions.numel()
        # Previously a 1-D input launched ``numel`` rows of ``numel`` columns
        # each (reading far out of bounds) and an input with more than two
        # dimensions only visited ``shape[1]`` elements per row.
        n_rows = predictions.shape[0] if predictions.ndim > 1 else 1
        n_cols = n_elements // n_rows if n_rows > 0 else 0

        row_sums = torch.empty(n_rows, device=predictions.device, dtype=torch.float32)

        if n_rows > 0:
            grid = (n_rows,)
            # Both tensors are contiguous, so the row stride equals n_cols.
            smooth_l1_row_kernel[grid](
                predictions,
                targets,
                row_sums,
                n_cols,
                n_cols,
                n_cols,
            )

        return row_sums.sum() / n_elements


batch_size = 32768
input_shape = (32768,)
dim = 1


def get_inputs():
    """Return the benchmark inputs: randomly scaled predictions and random targets."""
    scale = torch.rand(())
    return [
        torch.rand(batch_size, *input_shape) * scale,
        torch.rand(batch_size, *input_shape),
    ]


def get_init_inputs():
    """Return the constructor arguments for ``Model`` (none)."""
    return []


# [patch metadata] diff --git a/backends/triton/cpu/KernelBench/level1/97_ScaledDotProductAttention.py b/backends/triton/cpu/KernelBench/level1/97_ScaledDotProductAttention.py
# [patch metadata] new file mode 100644
# [patch metadata] index 0000000..dc20d99
# [patch metadata] --- /dev/null
# [patch metadata] +++ b/backends/triton/cpu/KernelBench/level1/97_ScaledDotProductAttention.py
# [patch metadata] @@ -0,0 +1,280 @@
# ruff: noqa: E731, E741
# AUTOGENERATED KERNEL (LLM)
# Source: LLM-generated candidate implementation
# Status: Experimental / uncurated
# Expectation: Correctness-first, performance not representative

import math

import torch
import torch.nn as nn
import triton
import triton.language as tl


@triton.autotune(
    configs=[
        triton.Config(
            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64}, num_warps=16, num_stages=2
        ),
    ],
    key=["SEQ_LEN", "HEAD_DIM"],
)
@triton.jit
def _qk_gemm_kernel(
    Q_ptr,
    K_ptr,
    S_ptr,
    stride_qb,
    stride_qm,
    stride_qd,
    stride_kb,
    stride_kn,
    stride_kd,
    stride_sb,
    stride_sm,
    stride_sn,
    SEQ_LEN,
    HEAD_DIM,
    scale,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
):
    """Compute one ``BLOCK_M x BLOCK_N`` tile of ``S = scale * Q @ K^T``.

    Grid axis 0 selects the batch entry, axis 1 the output tile (row-major
    over tiles). ``K`` is read through a transposed block pointer, so no
    explicit transpose is materialised.
    """
    pid_b = tl.program_id(0)
    pid_tile = tl.program_id(1)

    num_n = tl.cdiv(SEQ_LEN, BLOCK_N)
    pid_m = pid_tile // num_n
    pid_n = pid_tile % num_n

    q_base = Q_ptr + pid_b.to(tl.int64) * stride_qb
    k_base = K_ptr + pid_b.to(tl.int64) * stride_kb

    Q_bp = tl.make_block_ptr(
        base=q_base,
        shape=(SEQ_LEN, HEAD_DIM),
        strides=(stride_qm, stride_qd),
        offsets=(pid_m * BLOCK_M, 0),
        block_shape=(BLOCK_M, BLOCK_K),
        order=(1, 0),
    )
    KT_bp = tl.make_block_ptr(
        base=k_base,
        shape=(HEAD_DIM, SEQ_LEN),
        strides=(stride_kd, stride_kn),
        offsets=(0, pid_n * BLOCK_N),
        block_shape=(BLOCK_K, BLOCK_N),
        order=(1, 0),
    )

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for _ in range(0, HEAD_DIM, BLOCK_K):
        # Zero padding keeps out-of-range lanes from polluting the dot
        # product when HEAD_DIM / SEQ_LEN are not multiples of the tile.
        q = tl.load(Q_bp, boundary_check=(0, 1), padding_option="zero")
        k_t = tl.load(KT_bp, boundary_check=(0, 1), padding_option="zero")
        acc = tl.dot(q, k_t, acc)
        Q_bp = tl.advance(Q_bp, (0, BLOCK_K))
        KT_bp = tl.advance(KT_bp, (BLOCK_K, 0))

    acc = acc * scale

    s_base = S_ptr + pid_b.to(tl.int64) * stride_sb
    S_bp = tl.make_block_ptr(
        base=s_base,
        shape=(SEQ_LEN, SEQ_LEN),
        strides=(stride_sm, stride_sn),
        offsets=(pid_m * BLOCK_M, pid_n * BLOCK_N),
        block_shape=(BLOCK_M, BLOCK_N),
        order=(1, 0),
    )
    tl.store(S_bp, acc.to(S_ptr.dtype.element_ty), boundary_check=(0, 1))


@triton.autotune(
    configs=[
        triton.Config(
            {"BLOCK_M": 64, "BLOCK_K": 64, "BLOCK_N": 128}, num_warps=8, num_stages=3
        ),
    ],
    key=["SEQ_LEN", "HEAD_DIM"],
)
@triton.jit
def _fused_softmax_pv_kernel(
    S_ptr,
    V_ptr,
    O_ptr,
    stride_sb,
    stride_sm,
    stride_sn,
    stride_vb,
    stride_vn,
    stride_vd,
    stride_ob,
    stride_om,
    stride_od,
    SEQ_LEN,
    HEAD_DIM,
    BLOCK_M: tl.constexpr,
    BLOCK_K: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    """Compute one ``BLOCK_M x BLOCK_N`` tile of ``O = softmax(S, dim=-1) @ V``.

    The softmax is evaluated online: the key axis is streamed in chunks of
    ``BLOCK_K`` while a running row maximum ``m_i`` and row sum ``l_i`` are
    kept and the accumulator is rescaled whenever the maximum grows.
    """
    pid_b = tl.program_id(0)
    pid_tile = tl.program_id(1)

    num_n = tl.cdiv(HEAD_DIM, BLOCK_N)
    pid_m = pid_tile // num_n
    pid_n = pid_tile % num_n

    s_base = S_ptr + pid_b.to(tl.int64) * stride_sb
    v_base = V_ptr + pid_b.to(tl.int64) * stride_vb
    off_m = pid_m * BLOCK_M

    # exp(x) == exp2(x * LOG2E)
    LOG2E = 1.4426950408889634

    m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

    for off_k in range(0, SEQ_LEN, BLOCK_K):
        S_bp = tl.make_block_ptr(
            base=s_base,
            shape=(SEQ_LEN, SEQ_LEN),
            strides=(stride_sm, stride_sn),
            offsets=(off_m, off_k),
            block_shape=(BLOCK_M, BLOCK_K),
            order=(1, 0),
        )
        s = tl.load(S_bp, boundary_check=(0, 1), padding_option="zero").to(tl.float32)
        # Key columns past SEQ_LEN must not take part in the softmax: force
        # them to -inf so that they contribute exp(-inf) == 0.
        offs_k = off_k + tl.arange(0, BLOCK_K)
        s = tl.where(offs_k[None, :] < SEQ_LEN, s, float("-inf"))

        chunk_max = tl.max(s, axis=1)
        m_new = tl.maximum(m_i, chunk_max)
        # alpha rescales everything accumulated under the old maximum.
        alpha = tl.math.exp2((m_i - m_new) * LOG2E)
        exp_s = tl.math.exp2((s - m_new[:, None]) * LOG2E)
        chunk_sum = tl.sum(exp_s, axis=1)
        l_i = alpha * l_i + chunk_sum
        acc = acc * alpha[:, None]
        m_i = m_new

        V_bp = tl.make_block_ptr(
            base=v_base,
            shape=(SEQ_LEN, HEAD_DIM),
            strides=(stride_vn, stride_vd),
            offsets=(off_k, pid_n * BLOCK_N),
            block_shape=(BLOCK_K, BLOCK_N),
            order=(1, 0),
        )
        v = tl.load(V_bp, boundary_check=(0, 1), padding_option="zero")
        acc = tl.dot(exp_s.to(v.dtype), v, acc)

    # Final softmax normalisation.
    acc = acc / l_i[:, None]

    o_base = O_ptr + pid_b.to(tl.int64) * stride_ob
    O_bp = tl.make_block_ptr(
        base=o_base,
        shape=(SEQ_LEN, HEAD_DIM),
        strides=(stride_om, stride_od),
        offsets=(off_m, pid_n * BLOCK_N),
        block_shape=(BLOCK_M, BLOCK_N),
        order=(1, 0),
    )
    tl.store(O_bp, acc.to(O_ptr.dtype.element_ty), boundary_check=(0, 1))


class Model(nn.Module):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(D)) V``.

    Runs as two kernels: a batched GEMM that materialises the scaled score
    matrix, followed by a fused softmax + ``P @ V`` kernel.
    """

    def __init__(self):
        super().__init__()
        # Scratch buffer for the (B*H, S, S) float32 score matrix, reused
        # across calls with matching shape and device.
        self._s_buf = None

    def forward(self, Q, K, V):
        """Return the attention output for 4-D ``Q``, ``K`` and ``V``.

        Args:
            Q: Tensor unpacked as ``(B, H, S, D)``.
            K: Tensor reshaped to ``(B * H, S, D)``.
            V: Tensor reshaped to ``(B * H, S, D)``.

        Returns:
            A freshly allocated tensor of shape ``(B, H, S, D)`` with the
            dtype of ``Q``.
        """
        B, H, S, D = Q.shape
        scale = 1.0 / math.sqrt(D)

        device = Q.device
        BH = B * H

        Q = Q.reshape(BH, S, D).contiguous()
        K = K.reshape(BH, S, D).contiguous()
        V = V.reshape(BH, S, D).contiguous()

        if (
            self._s_buf is None
            or self._s_buf.shape != (BH, S, S)
            or self._s_buf.device != device
        ):
            self._s_buf = torch.empty(BH, S, S, device=device, dtype=torch.float32)
        S_mat = self._s_buf

        grid1 = lambda META: (
            BH,
            triton.cdiv(S, META["BLOCK_M"]) * triton.cdiv(S, META["BLOCK_N"]),
        )
        _qk_gemm_kernel[grid1](
            Q,
            K,
            S_mat,
            Q.stride(0),
            Q.stride(1),
            Q.stride(2),
            K.stride(0),
            K.stride(1),
            K.stride(2),
            S_mat.stride(0),
            S_mat.stride(1),
            S_mat.stride(2),
            S,
            D,
            scale,
        )

        # The output is allocated per call. Handing out a view of a cached
        # buffer would let the next forward() overwrite tensors that were
        # already returned to the caller.
        O = torch.empty(BH, S, D, device=device, dtype=Q.dtype)

        grid2 = lambda META: (
            BH,
            triton.cdiv(S, META["BLOCK_M"]) * triton.cdiv(D, META["BLOCK_N"]),
        )
        _fused_softmax_pv_kernel[grid2](
            S_mat,
            V,
            O,
            S_mat.stride(0),
            S_mat.stride(1),
            S_mat.stride(2),
            V.stride(0),
            V.stride(1),
            V.stride(2),
            O.stride(0),
            O.stride(1),
            O.stride(2),
            S,
            D,
        )

        return O.reshape(B, H, S, D)


batch_size = 32
num_heads = 32
sequence_length = 512
embedding_dimension = 1024


def get_inputs():
    """Return the benchmark inputs: random ``Q``, ``K`` and ``V`` tensors."""
    Q = torch.rand(batch_size, num_heads, sequence_length, embedding_dimension)
    K = torch.rand(batch_size, num_heads, sequence_length, embedding_dimension)
    V = torch.rand(batch_size, num_heads, sequence_length, embedding_dimension)
    return [Q, K, V]


def get_init_inputs():
    """Return the constructor arguments for ``Model`` (none)."""
    return []


# [patch metadata] diff --git a/backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py b/backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py
# [patch metadata] new file mode 100644
# [patch metadata] index
0000000..c0ed1ab --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py @@ -0,0 +1,103 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + +batch_size = 8192 * 2 +input_shape = (8192 * 2,) +dim = 1 + + +def get_kl_div_configs(): + configs = [ + triton.Config( + {"BLOCK_SIZE": 4096}, + num_warps=4, + num_stages=1, + ) + ] + return configs + + +@triton.autotune( + configs=get_kl_div_configs(), + key=["n_cols"], +) +@triton.jit +def kl_div_row_kernel( + pred_ptr, + target_ptr, + out_ptr, + n_cols, + stride_pred_row, + stride_target_row, + BLOCK_SIZE: tl.constexpr, +): + row_idx = tl.program_id(0) + pred_row_start = pred_ptr + row_idx * stride_pred_row + target_row_start = target_ptr + row_idx * stride_target_row + + acc = 0.0 + + for col_start in tl.range(0, n_cols, BLOCK_SIZE): + col_offsets = col_start + tl.arange(0, BLOCK_SIZE) + mask = col_offsets < n_cols + + pred_vals = tl.load(pred_row_start + col_offsets, mask=mask, other=1.0).to( + tl.float32 + ) + target_vals = tl.load(target_row_start + col_offsets, mask=mask, other=0.0).to( + tl.float32 + ) + + kl_vals = tl.where( + target_vals > 0, target_vals * tl.log(target_vals / pred_vals), 0.0 + ) + acc += tl.sum(kl_vals, axis=0) + + tl.store(out_ptr + row_idx, acc) + + +def kernel_function(predictions: torch.Tensor, targets: torch.Tensor) -> torch.Tensor: + assert predictions.is_contiguous() and targets.is_contiguous() + B, N = predictions.shape + + row_sums = torch.empty(B, device=predictions.device, dtype=torch.float32) + + grid = (B,) + kl_div_row_kernel[grid]( + predictions, + targets, + row_sums, + N, + predictions.stride(0), + targets.stride(0), + ) + + return row_sums.sum() / B + + +class Model(nn.Module): + def __init__(self): + super(Model, 
self).__init__() + + def forward(self, predictions, targets): + return kernel_function(predictions, targets) + + +def get_inputs(): + scale = torch.rand(()) + return [ + (torch.rand(batch_size, *input_shape) * scale).softmax(dim=-1), + torch.rand(batch_size, *input_shape).softmax(dim=-1), + ] + + +def get_init_inputs(): + return [] diff --git a/backends/triton/cpu/KernelBench/level1/99_TripletMarginLoss.py b/backends/triton/cpu/KernelBench/level1/99_TripletMarginLoss.py new file mode 100644 index 0000000..237d264 --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/99_TripletMarginLoss.py @@ -0,0 +1,127 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_K": 256}, num_warps=4, num_stages=2), + ], + key=["D"], +) +@triton.jit +def _triplet_margin_loss_kernel( + anchor_ptr, + positive_ptr, + negative_ptr, + loss_ptr, + B, + D, + stride_ab, + stride_ad, + stride_pb, + stride_pd, + stride_nb, + stride_nd, + margin, + eps, + BLOCK_K: tl.constexpr, +): + row = tl.program_id(0) + if row >= B: + return + + base_a = row.to(tl.int64) * stride_ab + base_p = row.to(tl.int64) * stride_pb + base_n = row.to(tl.int64) * stride_nb + + sum_pos_sq = 0.0 + sum_neg_sq = 0.0 + + for k_start in range(0, D, BLOCK_K): + offs_k = k_start + tl.arange(0, BLOCK_K) + mask_k = offs_k < D + + a = tl.load( + anchor_ptr + base_a + offs_k * stride_ad, mask=mask_k, other=0.0 + ).to(tl.float32) + p = tl.load( + positive_ptr + base_p + offs_k * stride_pd, mask=mask_k, other=0.0 + ).to(tl.float32) + n = tl.load( + negative_ptr + base_n + offs_k * stride_nd, mask=mask_k, other=0.0 + ).to(tl.float32) + + diff_pos = a - p + eps + diff_neg = a - n + eps + sum_pos_sq += tl.sum(diff_pos * diff_pos, axis=0) + 
sum_neg_sq += tl.sum(diff_neg * diff_neg, axis=0) + + d_pos = tl.sqrt(sum_pos_sq) + d_neg = tl.sqrt(sum_neg_sq) + + loss_val = tl.maximum(d_pos - d_neg + margin, 0.0) + + tl.atomic_add(loss_ptr, loss_val, sem="relaxed") + + +class Model(nn.Module): + def __init__(self, margin=1.0): + super(Model, self).__init__() + self.margin = margin + self.eps = 1e-6 + + def forward(self, anchor, positive, negative): + device = anchor.device + B, D = anchor.shape + + anchor = anchor.contiguous() + positive = positive.contiguous() + negative = negative.contiguous() + + loss_accum = torch.zeros((), device=device, dtype=torch.float32) + + grid = (B,) + _triplet_margin_loss_kernel[grid]( + anchor, + positive, + negative, + loss_accum, + B, + D, + anchor.stride(0), + anchor.stride(1), + positive.stride(0), + positive.stride(1), + negative.stride(0), + negative.stride(1), + self.margin, + self.eps, + ) + + return loss_accum / B + + +batch_size = 32768 +input_shape = (8192,) +dim = 1 + + +def get_inputs(): + scale = torch.rand(()) + return [ + torch.rand(batch_size, *input_shape) * scale, + torch.rand(batch_size, *input_shape), + torch.rand(batch_size, *input_shape), + ] + + +def get_init_inputs(): + return [1.0] diff --git a/backends/triton/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py b/backends/triton/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py new file mode 100644 index 0000000..152975e --- /dev/null +++ b/backends/triton/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py @@ -0,0 +1,141 @@ +# ruff: noqa: E731 +# AUTOGENERATED KERNEL (LLM) +# Source: LLM-generated candidate implementation +# Status: Experimental / uncurated +# Expectation: Correctness-first, performance not representative + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +@triton.jit +def swizzle_tile( + tile_id, + M, + N, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + grid_m = tl.cdiv(M, 
BLOCK_M) + grid_n = tl.cdiv(N, BLOCK_N) + width = GROUP_SIZE_M * grid_n + group_id = tile_id // width + group_size = tl.minimum(GROUP_SIZE_M, grid_m - group_id * GROUP_SIZE_M) + pid_m = group_id * GROUP_SIZE_M + (tile_id % group_size) + pid_n = (tile_id % width) // group_size + return pid_m, pid_n + + +def _configs(): + return [ + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, + num_warps=16, + num_stages=2, + ), + ] + + +@triton.autotune(configs=_configs(), key=["M", "N", "K"]) +@triton.jit +def _matmul_kernel( + a_ptr, + b_ptr, + c_ptr, + M, + N, + K, + stride_am, + stride_ak: tl.constexpr, + stride_bk, + stride_bn: tl.constexpr, + stride_cm, + stride_cn: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + pid = tl.program_id(0) + pid_m, pid_n = swizzle_tile(pid, M, N, BLOCK_M, BLOCK_N, GROUP_SIZE_M) + + a_desc = tl.make_tensor_descriptor( + base=a_ptr, + shape=(M, K), + strides=(stride_am, stride_ak), + block_shape=(BLOCK_M, BLOCK_K), + ) + b_desc = tl.make_tensor_descriptor( + base=b_ptr, + shape=(K, N), + strides=(stride_bk, stride_bn), + block_shape=(BLOCK_K, BLOCK_N), + ) + + off_m = pid_m * BLOCK_M + off_n = pid_n * BLOCK_N + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + + for off_k in range(0, K, BLOCK_K): + a_tile = a_desc.load([off_m, off_k]) + b_tile = b_desc.load([off_k, off_n]) + acc += tl.dot(a_tile, b_tile) + c_desc = tl.make_tensor_descriptor( + base=c_ptr, + shape=(M, N), + strides=(stride_cm, stride_cn), + block_shape=(BLOCK_M, BLOCK_N), + ) + c_desc.store([off_m, off_n], acc.to(c_ptr.type.element_ty)) + + +class Model(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, A, B): + device = A.device + A = A.contiguous() + B = B.contiguous() + + M_out, K = A.shape + N_out = B.shape[1] + + C = torch.empty((M_out, N_out), device=device, dtype=A.dtype) + + grid = lambda META: ( + triton.cdiv(M_out, 
META["BLOCK_M"]) * triton.cdiv(N_out, META["BLOCK_N"]), + ) + + _matmul_kernel[grid]( + A, + B, + C, + M_out, + N_out, + K, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(1), + C.stride(0), + C.stride(1), + ) + return C + + +M = 16384 * 2 +N = 16 * 2 + + +def get_inputs(): + A = torch.rand(M, N, dtype=torch.bfloat16) + B = torch.rand(N, M, dtype=torch.bfloat16) + return [A, B] + + +def get_init_inputs(): + return [] diff --git a/problems/specs/KernelBench/level1/100_HingeLoss.yaml b/problems/specs/KernelBench/level1/100_HingeLoss.yaml index 8f21a60..441ff9c 100644 --- a/problems/specs/KernelBench/level1/100_HingeLoss.yaml +++ b/problems/specs/KernelBench/level1/100_HingeLoss.yaml @@ -15,3 +15,10 @@ ci: dims: BATCH_SIZE: 64 INPUT_DIM: 64 + +simple-cpu: + - params: [predictions, targets] + dtype: float32 + dims: + BATCH_SIZE: 1024 + INPUT_DIM: 1024 diff --git a/problems/specs/KernelBench/level1/10_3D_tensor_matrix_multiplication.yaml b/problems/specs/KernelBench/level1/10_3D_tensor_matrix_multiplication.yaml index a72791f..43eabc9 100644 --- a/problems/specs/KernelBench/level1/10_3D_tensor_matrix_multiplication.yaml +++ b/problems/specs/KernelBench/level1/10_3D_tensor_matrix_multiplication.yaml @@ -16,6 +16,16 @@ ci: L: 24 flop: "2*N*M*L*K" +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + N: 4 + M: 32 + K: 64 + L: 24 + flop: "2*N*M*L*K" + bench-gpu: - params: [A, B] dtype: float16 diff --git a/problems/specs/KernelBench/level1/11_4D_tensor_matrix_multiplication.yaml b/problems/specs/KernelBench/level1/11_4D_tensor_matrix_multiplication.yaml index e82b215..d0c5522 100644 --- a/problems/specs/KernelBench/level1/11_4D_tensor_matrix_multiplication.yaml +++ b/problems/specs/KernelBench/level1/11_4D_tensor_matrix_multiplication.yaml @@ -17,6 +17,17 @@ ci: K: 24 flop: "2*B*I*J*K*L" +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + B: 2 + I: 32 + J: 64 + L: 32 + K: 24 + flop: "2*B*I*J*K*L" + bench-gpu: - params: [A, B] dtype: float16 diff --git 
a/problems/specs/KernelBench/level1/12_Matmul_with_diagonal_matrices_.yaml b/problems/specs/KernelBench/level1/12_Matmul_with_diagonal_matrices_.yaml index d5f9f78..1e7a076 100644 --- a/problems/specs/KernelBench/level1/12_Matmul_with_diagonal_matrices_.yaml +++ b/problems/specs/KernelBench/level1/12_Matmul_with_diagonal_matrices_.yaml @@ -14,6 +14,14 @@ ci: N: 128 flop: "2*M*N" +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + M: 128 + N: 128 + flop: "2*M*N" + bench-gpu: - params: [A, B] dtype: float16 diff --git a/problems/specs/KernelBench/level1/13_Matmul_for_symmetric_matrices.yaml b/problems/specs/KernelBench/level1/13_Matmul_for_symmetric_matrices.yaml index 66a59c3..9808093 100644 --- a/problems/specs/KernelBench/level1/13_Matmul_for_symmetric_matrices.yaml +++ b/problems/specs/KernelBench/level1/13_Matmul_for_symmetric_matrices.yaml @@ -15,3 +15,9 @@ ci: dtype: float32 dims: N: 64 + +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + N: 64 diff --git a/problems/specs/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.yaml b/problems/specs/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.yaml index 7369aff..b29c1f6 100644 --- a/problems/specs/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.yaml +++ b/problems/specs/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.yaml @@ -15,3 +15,9 @@ ci: dtype: float32 dims: N: 64 + +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + N: 64 diff --git a/problems/specs/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.yaml b/problems/specs/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.yaml index cafb725..f6043d4 100644 --- a/problems/specs/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.yaml +++ b/problems/specs/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.yaml @@ -15,3 +15,9 @@ ci: dtype: float32 dims: M: 64 + +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + M: 64 diff --git 
a/problems/specs/KernelBench/level1/16_Matmul_with_transposed_A.yaml b/problems/specs/KernelBench/level1/16_Matmul_with_transposed_A.yaml index e665dcc..3c22b31 100644 --- a/problems/specs/KernelBench/level1/16_Matmul_with_transposed_A.yaml +++ b/problems/specs/KernelBench/level1/16_Matmul_with_transposed_A.yaml @@ -15,6 +15,15 @@ ci: K: 256 flop: "2*M*N*K" +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + M: 64 + N: 128 + K: 256 + flop: "2*M*N*K" + bench-gpu: - params: [A, B] dtype: float16 diff --git a/problems/specs/KernelBench/level1/17_Matmul_with_transposed_B.yaml b/problems/specs/KernelBench/level1/17_Matmul_with_transposed_B.yaml index 9d984be..1e1e095 100644 --- a/problems/specs/KernelBench/level1/17_Matmul_with_transposed_B.yaml +++ b/problems/specs/KernelBench/level1/17_Matmul_with_transposed_B.yaml @@ -15,6 +15,15 @@ ci: K: 256 flop: "2*M*N*K" +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + M: 64 + N: 128 + K: 256 + flop: "2*M*N*K" + bench-gpu: - params: [A, B] dtype: float16 diff --git a/problems/specs/KernelBench/level1/18_Matmul_with_transposed_both.yaml b/problems/specs/KernelBench/level1/18_Matmul_with_transposed_both.yaml index 0ec07fb..d684c28 100644 --- a/problems/specs/KernelBench/level1/18_Matmul_with_transposed_both.yaml +++ b/problems/specs/KernelBench/level1/18_Matmul_with_transposed_both.yaml @@ -15,6 +15,15 @@ ci: K: 256 flop: "2*N*M*K" +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + M: 64 + N: 128 + K: 256 + flop: "2*N*M*K" + bench-gpu: - params: [A, B] dtype: float16 diff --git a/problems/specs/KernelBench/level1/19_ReLU.yaml b/problems/specs/KernelBench/level1/19_ReLU.yaml index 9fc614e..3ccc889 100644 --- a/problems/specs/KernelBench/level1/19_ReLU.yaml +++ b/problems/specs/KernelBench/level1/19_ReLU.yaml @@ -10,6 +10,13 @@ ci: BATCH: 128 DIM: 512 +simple-cpu: + - params: [X] + dtype: float32 + dims: + BATCH: 128 + DIM: 512 + bench-cpu: - params: [X] dtype: bfloat16 diff --git 
a/problems/specs/KernelBench/level1/1_Square_matrix_multiplication_.yaml b/problems/specs/KernelBench/level1/1_Square_matrix_multiplication_.yaml index 3a0fc9c..2f5de5b 100644 --- a/problems/specs/KernelBench/level1/1_Square_matrix_multiplication_.yaml +++ b/problems/specs/KernelBench/level1/1_Square_matrix_multiplication_.yaml @@ -12,6 +12,12 @@ ci: dims: N: 128 +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + N: 128 + bench-cpu: - params: [A, B] dtype: float32 diff --git a/problems/specs/KernelBench/level1/20_LeakyReLU.yaml b/problems/specs/KernelBench/level1/20_LeakyReLU.yaml index 7071370..bed06d7 100644 --- a/problems/specs/KernelBench/level1/20_LeakyReLU.yaml +++ b/problems/specs/KernelBench/level1/20_LeakyReLU.yaml @@ -11,6 +11,14 @@ ci: DIM: 512 flop: "2*BATCH*DIM" +simple-cpu: + - params: [X] + dtype: float32 + dims: + BATCH: 128 + DIM: 512 + flop: "2*BATCH*DIM" + bench-gpu: - params: [X] dtype: float16 diff --git a/problems/specs/KernelBench/level1/21_Sigmoid.yaml b/problems/specs/KernelBench/level1/21_Sigmoid.yaml index 0bbba8e..7f40ac7 100644 --- a/problems/specs/KernelBench/level1/21_Sigmoid.yaml +++ b/problems/specs/KernelBench/level1/21_Sigmoid.yaml @@ -11,3 +11,10 @@ ci: dims: BATCH_SIZE: 128 DIM: 512 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 128 + DIM: 512 diff --git a/problems/specs/KernelBench/level1/22_Tanh.yaml b/problems/specs/KernelBench/level1/22_Tanh.yaml index 0bbba8e..7f40ac7 100644 --- a/problems/specs/KernelBench/level1/22_Tanh.yaml +++ b/problems/specs/KernelBench/level1/22_Tanh.yaml @@ -11,3 +11,10 @@ ci: dims: BATCH_SIZE: 128 DIM: 512 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 128 + DIM: 512 diff --git a/problems/specs/KernelBench/level1/23_Softmax.yaml b/problems/specs/KernelBench/level1/23_Softmax.yaml index 0bbba8e..ef98da3 100644 --- a/problems/specs/KernelBench/level1/23_Softmax.yaml +++ b/problems/specs/KernelBench/level1/23_Softmax.yaml @@ -11,3 +11,10 @@ ci: 
dims: BATCH_SIZE: 128 DIM: 512 + +simple-cpu: + - params: [x] + dtype: bfloat16 + dims: + BATCH_SIZE: 128 + DIM: 512 diff --git a/problems/specs/KernelBench/level1/24_LogSoftmax.yaml b/problems/specs/KernelBench/level1/24_LogSoftmax.yaml index 0bbba8e..7f40ac7 100644 --- a/problems/specs/KernelBench/level1/24_LogSoftmax.yaml +++ b/problems/specs/KernelBench/level1/24_LogSoftmax.yaml @@ -11,3 +11,10 @@ ci: dims: BATCH_SIZE: 128 DIM: 512 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 128 + DIM: 512 diff --git a/problems/specs/KernelBench/level1/25_Swish.yaml b/problems/specs/KernelBench/level1/25_Swish.yaml index 0bbba8e..7f40ac7 100644 --- a/problems/specs/KernelBench/level1/25_Swish.yaml +++ b/problems/specs/KernelBench/level1/25_Swish.yaml @@ -11,3 +11,10 @@ ci: dims: BATCH_SIZE: 128 DIM: 512 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 128 + DIM: 512 diff --git a/problems/specs/KernelBench/level1/26_GELU_.yaml b/problems/specs/KernelBench/level1/26_GELU_.yaml index 0bbba8e..7f40ac7 100644 --- a/problems/specs/KernelBench/level1/26_GELU_.yaml +++ b/problems/specs/KernelBench/level1/26_GELU_.yaml @@ -11,3 +11,10 @@ ci: dims: BATCH_SIZE: 128 DIM: 512 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 128 + DIM: 512 diff --git a/problems/specs/KernelBench/level1/27_SELU_.yaml b/problems/specs/KernelBench/level1/27_SELU_.yaml index 0bbba8e..7f40ac7 100644 --- a/problems/specs/KernelBench/level1/27_SELU_.yaml +++ b/problems/specs/KernelBench/level1/27_SELU_.yaml @@ -11,3 +11,10 @@ ci: dims: BATCH_SIZE: 128 DIM: 512 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 128 + DIM: 512 diff --git a/problems/specs/KernelBench/level1/28_HardSigmoid.yaml b/problems/specs/KernelBench/level1/28_HardSigmoid.yaml index 0bbba8e..7f40ac7 100644 --- a/problems/specs/KernelBench/level1/28_HardSigmoid.yaml +++ b/problems/specs/KernelBench/level1/28_HardSigmoid.yaml @@ -11,3 +11,10 @@ ci: dims: 
BATCH_SIZE: 128 DIM: 512 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 128 + DIM: 512 diff --git a/problems/specs/KernelBench/level1/29_Softplus.yaml b/problems/specs/KernelBench/level1/29_Softplus.yaml index 0bbba8e..7f40ac7 100644 --- a/problems/specs/KernelBench/level1/29_Softplus.yaml +++ b/problems/specs/KernelBench/level1/29_Softplus.yaml @@ -11,3 +11,10 @@ ci: dims: BATCH_SIZE: 128 DIM: 512 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 128 + DIM: 512 diff --git a/problems/specs/KernelBench/level1/2_Standard_matrix_multiplication_.yaml b/problems/specs/KernelBench/level1/2_Standard_matrix_multiplication_.yaml index 0ba0f91..f4c1f66 100644 --- a/problems/specs/KernelBench/level1/2_Standard_matrix_multiplication_.yaml +++ b/problems/specs/KernelBench/level1/2_Standard_matrix_multiplication_.yaml @@ -14,6 +14,14 @@ ci: N: 128 K: 256 +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + M: 64 + N: 128 + K: 256 + bench-cpu: - params: [A, B] dtype: bfloat16 diff --git a/problems/specs/KernelBench/level1/30_Softsign.yaml b/problems/specs/KernelBench/level1/30_Softsign.yaml index 0bbba8e..7f40ac7 100644 --- a/problems/specs/KernelBench/level1/30_Softsign.yaml +++ b/problems/specs/KernelBench/level1/30_Softsign.yaml @@ -11,3 +11,10 @@ ci: dims: BATCH_SIZE: 128 DIM: 512 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 128 + DIM: 512 diff --git a/problems/specs/KernelBench/level1/31_ELU.yaml b/problems/specs/KernelBench/level1/31_ELU.yaml index 451772e..092c0ed 100644 --- a/problems/specs/KernelBench/level1/31_ELU.yaml +++ b/problems/specs/KernelBench/level1/31_ELU.yaml @@ -13,3 +13,11 @@ ci: BATCH_SIZE: 128 DIM: 512 ALPHA: 1.0 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 128 + DIM: 512 + ALPHA: 1.0 diff --git a/problems/specs/KernelBench/level1/32_HardTanh.yaml b/problems/specs/KernelBench/level1/32_HardTanh.yaml index 0bbba8e..7f40ac7 100644 --- 
a/problems/specs/KernelBench/level1/32_HardTanh.yaml +++ b/problems/specs/KernelBench/level1/32_HardTanh.yaml @@ -11,3 +11,10 @@ ci: dims: BATCH_SIZE: 128 DIM: 512 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 128 + DIM: 512 diff --git a/problems/specs/KernelBench/level1/33_BatchNorm.yaml b/problems/specs/KernelBench/level1/33_BatchNorm.yaml index 2c34dfb..2287cb0 100644 --- a/problems/specs/KernelBench/level1/33_BatchNorm.yaml +++ b/problems/specs/KernelBench/level1/33_BatchNorm.yaml @@ -14,3 +14,12 @@ ci: FEATURES: 4 DIM1: 64 DIM2: 64 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + FEATURES: 4 + DIM1: 64 + DIM2: 64 diff --git a/problems/specs/KernelBench/level1/34_InstanceNorm.yaml b/problems/specs/KernelBench/level1/34_InstanceNorm.yaml index 2c34dfb..2287cb0 100644 --- a/problems/specs/KernelBench/level1/34_InstanceNorm.yaml +++ b/problems/specs/KernelBench/level1/34_InstanceNorm.yaml @@ -14,3 +14,12 @@ ci: FEATURES: 4 DIM1: 64 DIM2: 64 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + FEATURES: 4 + DIM1: 64 + DIM2: 64 diff --git a/problems/specs/KernelBench/level1/35_GroupNorm_.yaml b/problems/specs/KernelBench/level1/35_GroupNorm_.yaml index da07568..f1f03fa 100644 --- a/problems/specs/KernelBench/level1/35_GroupNorm_.yaml +++ b/problems/specs/KernelBench/level1/35_GroupNorm_.yaml @@ -16,3 +16,13 @@ ci: NUM_GROUPS: 4 DIM1: 32 DIM2: 32 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + FEATURES: 8 + NUM_GROUPS: 4 + DIM1: 32 + DIM2: 32 diff --git a/problems/specs/KernelBench/level1/36_RMSNorm_.yaml b/problems/specs/KernelBench/level1/36_RMSNorm_.yaml index 2c34dfb..5a5ed85 100644 --- a/problems/specs/KernelBench/level1/36_RMSNorm_.yaml +++ b/problems/specs/KernelBench/level1/36_RMSNorm_.yaml @@ -14,3 +14,12 @@ ci: FEATURES: 4 DIM1: 64 DIM2: 64 + +simple-cpu: + - params: [x] + dtype: bfloat16 + dims: + BATCH_SIZE: 2 + FEATURES: 4 + DIM1: 64 + DIM2: 64 diff --git 
a/problems/specs/KernelBench/level1/37_FrobeniusNorm_.yaml b/problems/specs/KernelBench/level1/37_FrobeniusNorm_.yaml index 72d91ec..3cbc627 100644 --- a/problems/specs/KernelBench/level1/37_FrobeniusNorm_.yaml +++ b/problems/specs/KernelBench/level1/37_FrobeniusNorm_.yaml @@ -13,3 +13,12 @@ ci: FEATURES: 4 DIM1: 64 DIM2: 64 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + FEATURES: 4 + DIM1: 64 + DIM2: 64 diff --git a/problems/specs/KernelBench/level1/38_L1Norm_.yaml b/problems/specs/KernelBench/level1/38_L1Norm_.yaml index 0bbba8e..7f40ac7 100644 --- a/problems/specs/KernelBench/level1/38_L1Norm_.yaml +++ b/problems/specs/KernelBench/level1/38_L1Norm_.yaml @@ -11,3 +11,10 @@ ci: dims: BATCH_SIZE: 128 DIM: 512 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 128 + DIM: 512 diff --git a/problems/specs/KernelBench/level1/39_L2Norm_.yaml b/problems/specs/KernelBench/level1/39_L2Norm_.yaml index 0bbba8e..7f40ac7 100644 --- a/problems/specs/KernelBench/level1/39_L2Norm_.yaml +++ b/problems/specs/KernelBench/level1/39_L2Norm_.yaml @@ -11,3 +11,10 @@ ci: dims: BATCH_SIZE: 128 DIM: 512 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 128 + DIM: 512 diff --git a/problems/specs/KernelBench/level1/3_Batched_matrix_multiplication.yaml b/problems/specs/KernelBench/level1/3_Batched_matrix_multiplication.yaml index 14c26dd..5c11555 100644 --- a/problems/specs/KernelBench/level1/3_Batched_matrix_multiplication.yaml +++ b/problems/specs/KernelBench/level1/3_Batched_matrix_multiplication.yaml @@ -15,6 +15,15 @@ ci: N: 128 K: 128 +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + BATCH: 2 + M: 64 + N: 128 + K: 128 + bench-gpu: - params: [A, B] dtype: float16 diff --git a/problems/specs/KernelBench/level1/40_LayerNorm.yaml b/problems/specs/KernelBench/level1/40_LayerNorm.yaml index 2fbd50a..1b16337 100644 --- a/problems/specs/KernelBench/level1/40_LayerNorm.yaml +++ 
b/problems/specs/KernelBench/level1/40_LayerNorm.yaml @@ -15,3 +15,13 @@ ci: DIM1: 64 DIM2: 64 NORMALIZED_SHAPE: [4, 64, 64] # TODO: bind these to other dims + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + FEATURES: 4 + DIM1: 64 + DIM2: 64 + NORMALIZED_SHAPE: [4, 64, 64] # TODO: bind these to other dims diff --git a/problems/specs/KernelBench/level1/41_Max_Pooling_1D.yaml b/problems/specs/KernelBench/level1/41_Max_Pooling_1D.yaml index 49ef0fe..34612d9 100644 --- a/problems/specs/KernelBench/level1/41_Max_Pooling_1D.yaml +++ b/problems/specs/KernelBench/level1/41_Max_Pooling_1D.yaml @@ -22,3 +22,16 @@ ci: PADDING: 1 DILATION: 3 RETURN_INDICES: false + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + FEATURES: 24 + SEQUENCE_LENGTH: 64 + KERNEL_SIZE: 2 + STRIDE: 1 + PADDING: 1 + DILATION: 3 + RETURN_INDICES: false diff --git a/problems/specs/KernelBench/level1/42_Max_Pooling_2D.yaml b/problems/specs/KernelBench/level1/42_Max_Pooling_2D.yaml index 95debcf..7f444a4 100644 --- a/problems/specs/KernelBench/level1/42_Max_Pooling_2D.yaml +++ b/problems/specs/KernelBench/level1/42_Max_Pooling_2D.yaml @@ -21,3 +21,16 @@ ci: STRIDE: 1 PADDING: 1 DILATION: 1 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + CHANNELS: 8 + HEIGHT: 32 + WIDTH: 32 + KERNEL_SIZE: 2 + STRIDE: 1 + PADDING: 1 + DILATION: 1 diff --git a/problems/specs/KernelBench/level1/43_Max_Pooling_3D.yaml b/problems/specs/KernelBench/level1/43_Max_Pooling_3D.yaml index 2c34dfb..a50374c 100644 --- a/problems/specs/KernelBench/level1/43_Max_Pooling_3D.yaml +++ b/problems/specs/KernelBench/level1/43_Max_Pooling_3D.yaml @@ -1,6 +1,6 @@ inputs: x: - shape: [BATCH_SIZE, FEATURES, DIM1, DIM2] + shape: [BATCH_SIZE, FEATURES, DIM1, DIM2, DIM3] dtype: inherit inits: @@ -12,5 +12,16 @@ ci: dims: BATCH_SIZE: 2 FEATURES: 4 - DIM1: 64 - DIM2: 64 + DIM1: 16 + DIM2: 16 + DIM3: 16 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + 
FEATURES: 4 + DIM1: 16 + DIM2: 16 + DIM3: 16 diff --git a/problems/specs/KernelBench/level1/44_Average_Pooling_1D.yaml b/problems/specs/KernelBench/level1/44_Average_Pooling_1D.yaml index f4577e5..268a02e 100644 --- a/problems/specs/KernelBench/level1/44_Average_Pooling_1D.yaml +++ b/problems/specs/KernelBench/level1/44_Average_Pooling_1D.yaml @@ -18,3 +18,14 @@ ci: KERNEL_SIZE: 2 STRIDE: 1 PADDING: 1 + +simple-cpu: + - params: [x] + dtype: bfloat16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 32 + INPUT_LENGTH: 128 + KERNEL_SIZE: 2 + STRIDE: 1 + PADDING: 1 diff --git a/problems/specs/KernelBench/level1/45_Average_Pooling_2D.yaml b/problems/specs/KernelBench/level1/45_Average_Pooling_2D.yaml index 7ae2029..e63de37 100644 --- a/problems/specs/KernelBench/level1/45_Average_Pooling_2D.yaml +++ b/problems/specs/KernelBench/level1/45_Average_Pooling_2D.yaml @@ -15,3 +15,13 @@ ci: HEIGHT: 128 WIDTH: 128 KERNEL_SIZE: 3 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + CHANNELS: 16 + HEIGHT: 128 + WIDTH: 128 + KERNEL_SIZE: 3 diff --git a/problems/specs/KernelBench/level1/46_Average_Pooling_3D.yaml b/problems/specs/KernelBench/level1/46_Average_Pooling_3D.yaml index 748cf0b..c13a4ba 100644 --- a/problems/specs/KernelBench/level1/46_Average_Pooling_3D.yaml +++ b/problems/specs/KernelBench/level1/46_Average_Pooling_3D.yaml @@ -20,3 +20,16 @@ ci: KERNEL_SIZE: 3 STRIDE: 2 PADDING: 1 + +simple-cpu: + - params: [x] + dtype: float32 # "avg_pool3d_out_frame" not implemented for 'Half' + dims: + BATCH_SIZE: 2 + CHANNELS: 8 + DEPTH: 16 + HEIGHT: 16 + WIDTH: 32 + KERNEL_SIZE: 3 + STRIDE: 2 + PADDING: 1 diff --git a/problems/specs/KernelBench/level1/47_Sum_reduction_over_a_dimension.yaml b/problems/specs/KernelBench/level1/47_Sum_reduction_over_a_dimension.yaml index bf5fbda..34c285c 100644 --- a/problems/specs/KernelBench/level1/47_Sum_reduction_over_a_dimension.yaml +++ b/problems/specs/KernelBench/level1/47_Sum_reduction_over_a_dimension.yaml @@ -14,3 +14,12 @@ ci: 
DIM1: 64 DIM2: 63 REDUCE_DIM: 1 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + DIM1: 64 + DIM2: 63 + REDUCE_DIM: 1 diff --git a/problems/specs/KernelBench/level1/48_Mean_reduction_over_a_dimension.yaml b/problems/specs/KernelBench/level1/48_Mean_reduction_over_a_dimension.yaml index bf5fbda..34c285c 100644 --- a/problems/specs/KernelBench/level1/48_Mean_reduction_over_a_dimension.yaml +++ b/problems/specs/KernelBench/level1/48_Mean_reduction_over_a_dimension.yaml @@ -14,3 +14,12 @@ ci: DIM1: 64 DIM2: 63 REDUCE_DIM: 1 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + DIM1: 64 + DIM2: 63 + REDUCE_DIM: 1 diff --git a/problems/specs/KernelBench/level1/49_Max_reduction_over_a_dimension.yaml b/problems/specs/KernelBench/level1/49_Max_reduction_over_a_dimension.yaml index bf5fbda..34c285c 100644 --- a/problems/specs/KernelBench/level1/49_Max_reduction_over_a_dimension.yaml +++ b/problems/specs/KernelBench/level1/49_Max_reduction_over_a_dimension.yaml @@ -14,3 +14,12 @@ ci: DIM1: 64 DIM2: 63 REDUCE_DIM: 1 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + DIM1: 64 + DIM2: 63 + REDUCE_DIM: 1 diff --git a/problems/specs/KernelBench/level1/4_Matrix_vector_multiplication_.yaml b/problems/specs/KernelBench/level1/4_Matrix_vector_multiplication_.yaml index 1536270..d3afc05 100644 --- a/problems/specs/KernelBench/level1/4_Matrix_vector_multiplication_.yaml +++ b/problems/specs/KernelBench/level1/4_Matrix_vector_multiplication_.yaml @@ -15,6 +15,15 @@ ci: K: 256 flop: "2*M*N*K" +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + M: 64 + N: 1 + K: 256 + flop: "2*M*N*K" + bench-gpu: - params: [A, B] dtype: float16 diff --git a/problems/specs/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.yaml b/problems/specs/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.yaml index 11d119c..33930dc 100644 --- 
a/problems/specs/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.yaml +++ b/problems/specs/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.yaml @@ -15,3 +15,13 @@ ci: IN_CHANNELS: 3 HEIGHT: 64 WIDTH: 64 + +simple-cpu: + - params: [x] + dtype: bfloat16 + dims: + BATCH_SIZE: 2 + NUM_CLASSES: 10 + IN_CHANNELS: 3 + HEIGHT: 64 + WIDTH: 64 diff --git a/problems/specs/KernelBench/level1/51_Argmax_over_a_dimension.yaml b/problems/specs/KernelBench/level1/51_Argmax_over_a_dimension.yaml index 917cfb3..8b78339 100644 --- a/problems/specs/KernelBench/level1/51_Argmax_over_a_dimension.yaml +++ b/problems/specs/KernelBench/level1/51_Argmax_over_a_dimension.yaml @@ -14,3 +14,12 @@ ci: DIM1: 64 DIM2: 63 ARGMAX_DIM: 1 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + DIM1: 64 + DIM2: 63 + ARGMAX_DIM: 1 diff --git a/problems/specs/KernelBench/level1/52_Argmin_over_a_dimension.yaml b/problems/specs/KernelBench/level1/52_Argmin_over_a_dimension.yaml index fe08c21..e20221d 100644 --- a/problems/specs/KernelBench/level1/52_Argmin_over_a_dimension.yaml +++ b/problems/specs/KernelBench/level1/52_Argmin_over_a_dimension.yaml @@ -14,3 +14,12 @@ ci: DIM1: 64 DIM2: 63 ARGMIN_DIM: 1 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + DIM1: 64 + DIM2: 63 + ARGMIN_DIM: 1 diff --git a/problems/specs/KernelBench/level1/53_Min_reduction_over_a_dimension.yaml b/problems/specs/KernelBench/level1/53_Min_reduction_over_a_dimension.yaml index bf5fbda..34c285c 100644 --- a/problems/specs/KernelBench/level1/53_Min_reduction_over_a_dimension.yaml +++ b/problems/specs/KernelBench/level1/53_Min_reduction_over_a_dimension.yaml @@ -14,3 +14,12 @@ ci: DIM1: 64 DIM2: 63 REDUCE_DIM: 1 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + DIM1: 64 + DIM2: 63 + REDUCE_DIM: 1 diff --git a/problems/specs/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.yaml 
b/problems/specs/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.yaml index bead1f0..4503c24 100644 --- a/problems/specs/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.yaml +++ b/problems/specs/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.yaml @@ -19,3 +19,15 @@ ci: DEPTH: 16 WIDTH: 16 HEIGHT: 16 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 3 + OUT_CHANNELS: 16 + KERNEL_SIZE: 3 + DEPTH: 16 + WIDTH: 16 + HEIGHT: 16 diff --git a/problems/specs/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.yaml b/problems/specs/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.yaml index 33793c6..febf771 100644 --- a/problems/specs/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.yaml +++ b/problems/specs/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.yaml @@ -18,3 +18,14 @@ ci: KERNEL_SIZE: 3 HEIGHT: 32 WIDTH: 64 + +simple-cpu: + - params: [x] + dtype: bfloat16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 8 + OUT_CHANNELS: 16 + KERNEL_SIZE: 3 + HEIGHT: 32 + WIDTH: 64 diff --git a/problems/specs/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.yaml index 16234e5..f057fd4 100644 --- a/problems/specs/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.yaml +++ b/problems/specs/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.yaml @@ -18,3 +18,15 @@ ci: KERNEL_SIZE: [5, 7] HEIGHT: 64 WIDTH: 32 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 8 + OUT_CHANNELS: 16 + KERNEL_SIZE: [5, 7] + HEIGHT: 64 + WIDTH: 32 + atol: 9.e-4 diff --git a/problems/specs/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.yaml 
b/problems/specs/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.yaml index a1a2267..495f86f 100644 --- a/problems/specs/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.yaml +++ b/problems/specs/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.yaml @@ -18,3 +18,14 @@ ci: KERNEL_SIZE: 3 HEIGHT: 64 WIDTH: 64 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 16 + OUT_CHANNELS: 16 + KERNEL_SIZE: 3 + HEIGHT: 64 + WIDTH: 64 diff --git a/problems/specs/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.yaml index f289f4d..500b1cf 100644 --- a/problems/specs/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.yaml +++ b/problems/specs/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.yaml @@ -19,3 +19,15 @@ ci: DEPTH_IN: 4 HEIGHT_IN: 32 WIDTH_IN: 64 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 4 + OUT_CHANNELS: 8 + KERNEL_SIZE: [3, 5, 7] + DEPTH_IN: 4 + HEIGHT_IN: 32 + WIDTH_IN: 64 diff --git a/problems/specs/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.yaml b/problems/specs/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.yaml index 1aea06b..4d58a3e 100644 --- a/problems/specs/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.yaml +++ b/problems/specs/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.yaml @@ -19,3 +19,15 @@ ci: HEIGHT: 32 WIDTH: 32 DEPTH: 5 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 3 + OUT_CHANNELS: 16 + KERNEL_SIZE: 3 + HEIGHT: 32 + WIDTH: 32 + DEPTH: 5 diff --git a/problems/specs/KernelBench/level1/5_Matrix_scalar_multiplication.yaml 
b/problems/specs/KernelBench/level1/5_Matrix_scalar_multiplication.yaml index 782dc63..d2ecbbe 100644 --- a/problems/specs/KernelBench/level1/5_Matrix_scalar_multiplication.yaml +++ b/problems/specs/KernelBench/level1/5_Matrix_scalar_multiplication.yaml @@ -15,6 +15,15 @@ ci: UNIT: 1 flop: "M*N" +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + M: 64 + N: 32 + UNIT: 1 + flop: "M*N" + bench-gpu: - params: [A, B] dtype: float16 diff --git a/problems/specs/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.yaml index eb04ba7..2db656a 100644 --- a/problems/specs/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.yaml +++ b/problems/specs/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.yaml @@ -19,3 +19,15 @@ ci: WIDTH: 16 HEIGHT: 16 DEPTH: 16 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 3 + OUT_CHANNELS: 8 + KERNEL_SIZE: [3, 5, 7] + WIDTH: 16 + HEIGHT: 16 + DEPTH: 16 diff --git a/problems/specs/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.yaml b/problems/specs/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.yaml index 9a032a4..351451e 100644 --- a/problems/specs/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.yaml +++ b/problems/specs/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.yaml @@ -19,3 +19,15 @@ ci: DEPTH: 16 HEIGHT: 16 WIDTH: 16 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 24 + OUT_CHANNELS: 24 + KERNEL_SIZE: 3 + DEPTH: 16 + HEIGHT: 16 + WIDTH: 16 diff --git a/problems/specs/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.yaml index 1b2a193..0b957f9 100644 --- 
a/problems/specs/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.yaml +++ b/problems/specs/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.yaml @@ -18,3 +18,14 @@ ci: KERNEL_SIZE: [5, 9] HEIGHT: 32 WIDTH: 32 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 4 + OUT_CHANNELS: 8 + KERNEL_SIZE: [5, 9] + HEIGHT: 32 + WIDTH: 32 diff --git a/problems/specs/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.yaml b/problems/specs/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.yaml index e91e4bc..34790ad 100644 --- a/problems/specs/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.yaml +++ b/problems/specs/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.yaml @@ -18,3 +18,14 @@ ci: KERNEL_SIZE: 3 HEIGHT: 32 WIDTH: 32 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 4 + OUT_CHANNELS: 16 + KERNEL_SIZE: 3 + HEIGHT: 32 + WIDTH: 32 diff --git a/problems/specs/KernelBench/level1/64_conv_transposed_1D.yaml b/problems/specs/KernelBench/level1/64_conv_transposed_1D.yaml index 577f23f..aea674d 100644 --- a/problems/specs/KernelBench/level1/64_conv_transposed_1D.yaml +++ b/problems/specs/KernelBench/level1/64_conv_transposed_1D.yaml @@ -17,3 +17,13 @@ ci: OUT_CHANNELS: 8 KERNEL_SIZE: 3 LENGTH: 128 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 8 + OUT_CHANNELS: 8 + KERNEL_SIZE: 3 + LENGTH: 128 diff --git a/problems/specs/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.yaml index aaed64b..e1941d6 100644 --- a/problems/specs/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.yaml +++ b/problems/specs/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.yaml @@ -18,3 +18,14 @@ ci: 
KERNEL_SIZE: [3, 7] HEIGHT: 32 WIDTH: 32 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 8 + OUT_CHANNELS: 8 + KERNEL_SIZE: [3, 7] + HEIGHT: 32 + WIDTH: 32 diff --git a/problems/specs/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.yaml index db69bd0..f2c5be3 100644 --- a/problems/specs/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.yaml +++ b/problems/specs/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.yaml @@ -19,3 +19,15 @@ ci: DEPTH: 4 HEIGHT: 32 WIDTH: 32 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 3 + OUT_CHANNELS: 8 + KERNEL_SIZE: [3, 5, 7] + DEPTH: 4 + HEIGHT: 32 + WIDTH: 32 diff --git a/problems/specs/KernelBench/level1/67_conv_standard_1D.yaml b/problems/specs/KernelBench/level1/67_conv_standard_1D.yaml index 4ad5147..c68f735 100644 --- a/problems/specs/KernelBench/level1/67_conv_standard_1D.yaml +++ b/problems/specs/KernelBench/level1/67_conv_standard_1D.yaml @@ -17,3 +17,13 @@ ci: OUT_CHANNELS: 16 KERNEL_SIZE: 3 LENGTH: 128 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 8 + OUT_CHANNELS: 16 + KERNEL_SIZE: 3 + LENGTH: 128 diff --git a/problems/specs/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.yaml index 3cc2804..06e0d22 100644 --- a/problems/specs/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.yaml +++ b/problems/specs/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.yaml @@ -19,3 +19,15 @@ ci: DEPTH: 32 WIDTH: 32 HEIGHT: 32 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 2 + OUT_CHANNELS: 4 + KERNEL_SIZE: [3, 5, 5] + DEPTH: 32 + WIDTH: 32 
+ HEIGHT: 32 diff --git a/problems/specs/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.yaml index 3ffa589..5b33b8b 100644 --- a/problems/specs/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.yaml +++ b/problems/specs/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.yaml @@ -18,3 +18,14 @@ ci: KERNEL_SIZE: [3, 5] HEIGHT_IN: 16 WIDTH_IN: 32 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 4 + OUT_CHANNELS: 8 + KERNEL_SIZE: [3, 5] + HEIGHT_IN: 16 + WIDTH_IN: 32 diff --git a/problems/specs/KernelBench/level1/6_Matmul_with_large_K_dimension_.yaml b/problems/specs/KernelBench/level1/6_Matmul_with_large_K_dimension_.yaml index 09435cc..73aa753 100644 --- a/problems/specs/KernelBench/level1/6_Matmul_with_large_K_dimension_.yaml +++ b/problems/specs/KernelBench/level1/6_Matmul_with_large_K_dimension_.yaml @@ -15,6 +15,15 @@ ci: K: 512 flop: "2*M*N*K" +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + M: 8 + N: 16 + K: 512 + flop: "2*M*N*K" + bench-gpu: - params: [A, B] dtype: float16 diff --git a/problems/specs/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.yaml b/problems/specs/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.yaml index a8920c9..0c210cb 100644 --- a/problems/specs/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.yaml +++ b/problems/specs/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.yaml @@ -19,3 +19,15 @@ ci: DEPTH: 24 HEIGHT: 24 WIDTH: 24 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 12 + OUT_CHANNELS: 6 + KERNEL_SIZE: 3 + DEPTH: 24 + HEIGHT: 24 + WIDTH: 24 diff --git a/problems/specs/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.yaml 
b/problems/specs/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.yaml index cef0e24..618cfda 100644 --- a/problems/specs/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.yaml +++ b/problems/specs/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.yaml @@ -18,3 +18,14 @@ ci: KERNEL_SIZE: 3 HEIGHT_IN: 16 WIDTH_IN: 32 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 8 + OUT_CHANNELS: 8 + KERNEL_SIZE: 3 + HEIGHT_IN: 16 + WIDTH_IN: 32 diff --git a/problems/specs/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.yaml b/problems/specs/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.yaml index 99d8322..9fca6f0 100644 --- a/problems/specs/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.yaml +++ b/problems/specs/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.yaml @@ -27,3 +27,19 @@ ci: DEPTH: 12 HEIGHT: 24 WIDTH: 48 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 4 + OUT_CHANNELS: 4 + KERNEL_SIZE: [3, 5, 7] + STRIDE: [2, 2, 2] + PADDING: [1, 2, 3] + OUTPUT_PADDING: [1, 1, 1] + GROUPS: 2 + DEPTH: 12 + HEIGHT: 24 + WIDTH: 48 diff --git a/problems/specs/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.yaml b/problems/specs/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.yaml index d6c9c1b..422ac63 100644 --- a/problems/specs/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.yaml +++ b/problems/specs/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.yaml @@ -25,3 +25,18 @@ ci: DEPTH: 8 HEIGHT: 16 WIDTH: 32 + +simple-cpu: + - params: 
[x] + dtype: float32 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 4 + OUT_CHANNELS: 4 + KERNEL_SIZE: 3 + STRIDE: 2 + PADDING: 1 + GROUPS: 4 + DEPTH: 8 + HEIGHT: 16 + WIDTH: 32 diff --git a/problems/specs/KernelBench/level1/74_conv_transposed_1D_dilated.yaml b/problems/specs/KernelBench/level1/74_conv_transposed_1D_dilated.yaml index e6812f2..1ea4a8d 100644 --- a/problems/specs/KernelBench/level1/74_conv_transposed_1D_dilated.yaml +++ b/problems/specs/KernelBench/level1/74_conv_transposed_1D_dilated.yaml @@ -23,3 +23,16 @@ ci: PADDING: 0 DILATION: 3 LENGTH: 128 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 4 + OUT_CHANNELS: 8 + KERNEL_SIZE: 5 + STRIDE: 1 + PADDING: 0 + DILATION: 3 + LENGTH: 128 diff --git a/problems/specs/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.yaml b/problems/specs/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.yaml index 8240af2..29fe1f7 100644 --- a/problems/specs/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.yaml +++ b/problems/specs/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.yaml @@ -26,3 +26,18 @@ ci: GROUPS: 2 HEIGHT: 16 WIDTH: 32 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 4 + OUT_CHANNELS: 8 + KERNEL_SIZE: [3, 5] + STRIDE: [2, 3] + PADDING: [1, 2] + DILATION: [2, 1] + GROUPS: 2 + HEIGHT: 16 + WIDTH: 32 diff --git a/problems/specs/KernelBench/level1/76_conv_standard_1D_dilated_strided__.yaml b/problems/specs/KernelBench/level1/76_conv_standard_1D_dilated_strided__.yaml index 8b5f74f..a012eed 100644 --- a/problems/specs/KernelBench/level1/76_conv_standard_1D_dilated_strided__.yaml +++ b/problems/specs/KernelBench/level1/76_conv_standard_1D_dilated_strided__.yaml @@ -21,3 +21,15 @@ 
ci: STRIDE: 3 DILATION: 4 LENGTH: 128 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 4 + OUT_CHANNELS: 8 + KERNEL_SIZE: 3 + STRIDE: 3 + DILATION: 4 + LENGTH: 128 diff --git a/problems/specs/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.yaml b/problems/specs/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.yaml index 4be5a3a..1be100e 100644 --- a/problems/specs/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.yaml +++ b/problems/specs/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.yaml @@ -25,3 +25,18 @@ ci: DEPTH: 8 HEIGHT: 16 WIDTH: 16 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 4 + OUT_CHANNELS: 8 + KERNEL_SIZE: 3 + STRIDE: 2 + PADDING: 1 + DILATION: 2 + DEPTH: 8 + HEIGHT: 16 + WIDTH: 16 diff --git a/problems/specs/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.yaml b/problems/specs/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.yaml index d909e97..fae17dd 100644 --- a/problems/specs/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.yaml +++ b/problems/specs/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.yaml @@ -22,3 +22,16 @@ ci: PADDING: [1, 3] HEIGHT: 16 WIDTH: 32 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 8 + OUT_CHANNELS: 8 + KERNEL_SIZE: [3, 7] + STRIDE: [1, 1] + PADDING: [1, 3] + HEIGHT: 16 + WIDTH: 32 diff --git a/problems/specs/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.yaml b/problems/specs/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.yaml index 
45f21bd..c12e5c1 100644 --- a/problems/specs/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.yaml +++ b/problems/specs/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.yaml @@ -23,3 +23,16 @@ ci: PADDING: 1 DILATION: 2 LENGTH: 128 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 4 + OUT_CHANNELS: 8 + KERNEL_SIZE: 3 + STRIDE: 2 + PADDING: 1 + DILATION: 2 + LENGTH: 128 diff --git a/problems/specs/KernelBench/level1/7_Matmul_with_small_K_dimension_.yaml b/problems/specs/KernelBench/level1/7_Matmul_with_small_K_dimension_.yaml index 2c55dd2..05b84d0 100644 --- a/problems/specs/KernelBench/level1/7_Matmul_with_small_K_dimension_.yaml +++ b/problems/specs/KernelBench/level1/7_Matmul_with_small_K_dimension_.yaml @@ -15,6 +15,15 @@ ci: K: 16 flop: "2*M*N*K" +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + M: 256 + N: 256 + K: 16 + flop: "2*M*N*K" + bench-gpu: - params: [A, B] dtype: float16 diff --git a/problems/specs/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.yaml b/problems/specs/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.yaml index f431aeb..f5ace12 100644 --- a/problems/specs/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.yaml +++ b/problems/specs/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.yaml @@ -24,3 +24,17 @@ ci: DILATION: [2, 3] HEIGHT: 32 WIDTH: 32 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 4 + OUT_CHANNELS: 32 + KERNEL_SIZE: [5, 9] + STRIDE: 1 + PADDING: [2, 4] + DILATION: [2, 3] + HEIGHT: 32 + WIDTH: 32 diff --git a/problems/specs/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.yaml 
b/problems/specs/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.yaml index 5b811a1..36d01d2 100644 --- a/problems/specs/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.yaml +++ b/problems/specs/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.yaml @@ -24,3 +24,17 @@ ci: DILATION: 2 HEIGHT_IN: 16 WIDTH_IN: 32 + +simple-cpu: + - params: [x] + dtype: float16 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 4 + OUT_CHANNELS: 8 + KERNEL_SIZE: 3 + STRIDE: 5 + PADDING: 1 + DILATION: 2 + HEIGHT_IN: 16 + WIDTH_IN: 32 diff --git a/problems/specs/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.yaml b/problems/specs/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.yaml index eef3c22..6c50a10 100644 --- a/problems/specs/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.yaml +++ b/problems/specs/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.yaml @@ -20,3 +20,15 @@ ci: PADDING: 0 HEIGHT: 32 WIDTH: 32 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 16 + KERNEL_SIZE: 3 + STRIDE: 1 + PADDING: 0 + HEIGHT: 32 + WIDTH: 32 diff --git a/problems/specs/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.yaml index 0bd69f3..ef40d11 100644 --- a/problems/specs/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.yaml +++ b/problems/specs/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.yaml @@ -22,3 +22,16 @@ ci: DILATION: 1 HEIGHT: 32 WIDTH: 32 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 8 + KERNEL_SIZE: 3 + STRIDE: 1 + PADDING: 0 + DILATION: 1 + HEIGHT: 32 + WIDTH: 32 diff --git 
a/problems/specs/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.yaml b/problems/specs/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.yaml index e07b710..e7e9990 100644 --- a/problems/specs/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.yaml +++ b/problems/specs/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.yaml @@ -22,3 +22,16 @@ ci: PADDING: 0 HEIGHT_IN: 16 WIDTH_IN: 32 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 8 + OUT_CHANNELS: 8 + KERNEL_SIZE: 3 + STRIDE: 1 + PADDING: 0 + HEIGHT_IN: 16 + WIDTH_IN: 32 diff --git a/problems/specs/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.yaml b/problems/specs/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.yaml index 8c96917..8a3ea09 100644 --- a/problems/specs/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.yaml +++ b/problems/specs/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.yaml @@ -34,3 +34,22 @@ ci: GROUPS: 16 HEIGHT: 16 WIDTH: 32 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 4 + OUT_CHANNELS: 4 + KERNEL_SIZE_H: 3 + KERNEL_SIZE_W: 7 + STRIDE_H: 1 + STRIDE_W: 1 + PADDING_H: 0 + PADDING_W: 0 + DILATION_H: 1 + DILATION_W: 1 + GROUPS: 16 + HEIGHT: 16 + WIDTH: 32 diff --git a/problems/specs/KernelBench/level1/86_conv_depthwise_separable_2D.yaml b/problems/specs/KernelBench/level1/86_conv_depthwise_separable_2D.yaml index dd6bd97..08e7f67 100644 --- a/problems/specs/KernelBench/level1/86_conv_depthwise_separable_2D.yaml +++ b/problems/specs/KernelBench/level1/86_conv_depthwise_separable_2D.yaml @@ -25,3 +25,18 @@ ci: DILATION: 1 HEIGHT: 32 WIDTH: 32 + +simple-cpu: + - params: [x] + dtype: float32 + memory_format: channels_last + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 4 + OUT_CHANNELS: 8 + KERNEL_SIZE: 3 + STRIDE: 1 + PADDING: 1 + 
DILATION: 1 + HEIGHT: 32 + WIDTH: 32 diff --git a/problems/specs/KernelBench/level1/87_conv_pointwise_2D.yaml b/problems/specs/KernelBench/level1/87_conv_pointwise_2D.yaml index 9b2a376..6429cde 100644 --- a/problems/specs/KernelBench/level1/87_conv_pointwise_2D.yaml +++ b/problems/specs/KernelBench/level1/87_conv_pointwise_2D.yaml @@ -16,3 +16,13 @@ ci: OUT_CHANNELS: 8 HEIGHT: 32 WIDTH: 32 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 2 + IN_CHANNELS: 4 + OUT_CHANNELS: 8 + HEIGHT: 32 + WIDTH: 32 diff --git a/problems/specs/KernelBench/level1/88_MinGPTNewGelu.yaml b/problems/specs/KernelBench/level1/88_MinGPTNewGelu.yaml index 273d44b..287029f 100644 --- a/problems/specs/KernelBench/level1/88_MinGPTNewGelu.yaml +++ b/problems/specs/KernelBench/level1/88_MinGPTNewGelu.yaml @@ -11,3 +11,10 @@ ci: dims: BATCH_SIZE: 32 DIM: 32 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 32 + DIM: 32 diff --git a/problems/specs/KernelBench/level1/89_cumsum.yaml b/problems/specs/KernelBench/level1/89_cumsum.yaml index 82de8c7..d026c98 100644 --- a/problems/specs/KernelBench/level1/89_cumsum.yaml +++ b/problems/specs/KernelBench/level1/89_cumsum.yaml @@ -13,3 +13,11 @@ ci: BATCH_SIZE: 64 INPUT_DIM: 64 SCAN_DIM: 1 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 64 + INPUT_DIM: 64 + SCAN_DIM: 1 diff --git a/problems/specs/KernelBench/level1/8_Matmul_with_irregular_shapes_.yaml b/problems/specs/KernelBench/level1/8_Matmul_with_irregular_shapes_.yaml index 4f0553d..1068de4 100644 --- a/problems/specs/KernelBench/level1/8_Matmul_with_irregular_shapes_.yaml +++ b/problems/specs/KernelBench/level1/8_Matmul_with_irregular_shapes_.yaml @@ -15,6 +15,15 @@ ci: K: 19 flop: "2*M*N*K" +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + M: 60 + N: 37 + K: 19 + flop: "2*M*N*K" + bench-gpu: - params: [A, B] dtype: float16 diff --git a/problems/specs/KernelBench/level1/90_cumprod.yaml 
b/problems/specs/KernelBench/level1/90_cumprod.yaml index 82de8c7..d026c98 100644 --- a/problems/specs/KernelBench/level1/90_cumprod.yaml +++ b/problems/specs/KernelBench/level1/90_cumprod.yaml @@ -13,3 +13,11 @@ ci: BATCH_SIZE: 64 INPUT_DIM: 64 SCAN_DIM: 1 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 64 + INPUT_DIM: 64 + SCAN_DIM: 1 diff --git a/problems/specs/KernelBench/level1/91_cumsum_reverse.yaml b/problems/specs/KernelBench/level1/91_cumsum_reverse.yaml index 82de8c7..d026c98 100644 --- a/problems/specs/KernelBench/level1/91_cumsum_reverse.yaml +++ b/problems/specs/KernelBench/level1/91_cumsum_reverse.yaml @@ -13,3 +13,11 @@ ci: BATCH_SIZE: 64 INPUT_DIM: 64 SCAN_DIM: 1 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 64 + INPUT_DIM: 64 + SCAN_DIM: 1 diff --git a/problems/specs/KernelBench/level1/92_cumsum_exclusive.yaml b/problems/specs/KernelBench/level1/92_cumsum_exclusive.yaml index 82de8c7..d026c98 100644 --- a/problems/specs/KernelBench/level1/92_cumsum_exclusive.yaml +++ b/problems/specs/KernelBench/level1/92_cumsum_exclusive.yaml @@ -13,3 +13,11 @@ ci: BATCH_SIZE: 64 INPUT_DIM: 64 SCAN_DIM: 1 + +simple-cpu: + - params: [x] + dtype: float32 + dims: + BATCH_SIZE: 64 + INPUT_DIM: 64 + SCAN_DIM: 1 diff --git a/problems/specs/KernelBench/level1/93_masked_cumsum.yaml b/problems/specs/KernelBench/level1/93_masked_cumsum.yaml index d296d36..e9b0dc5 100644 --- a/problems/specs/KernelBench/level1/93_masked_cumsum.yaml +++ b/problems/specs/KernelBench/level1/93_masked_cumsum.yaml @@ -17,6 +17,14 @@ ci: INPUT_DIM: 64 SCAN_DIM: 1 +simple-cpu: + - params: [x, mask] + dtype: float32 + dims: + BATCH_SIZE: 64 + INPUT_DIM: 64 + SCAN_DIM: 1 + bench-gpu: - params: [x, mask] dtype: float16 diff --git a/problems/specs/KernelBench/level1/94_MSELoss.yaml b/problems/specs/KernelBench/level1/94_MSELoss.yaml index d6893b3..59ba064 100644 --- a/problems/specs/KernelBench/level1/94_MSELoss.yaml +++ 
b/problems/specs/KernelBench/level1/94_MSELoss.yaml @@ -15,3 +15,10 @@ ci: dims: BATCH_SIZE: 64 INPUT_DIM: 64 + +simple-cpu: + - params: [predictions, targets] + dtype: float32 + dims: + BATCH_SIZE: 64 + INPUT_DIM: 64 diff --git a/problems/specs/KernelBench/level1/95_CrossEntropyLoss.yaml b/problems/specs/KernelBench/level1/95_CrossEntropyLoss.yaml index 1925d32..acc89da 100644 --- a/problems/specs/KernelBench/level1/95_CrossEntropyLoss.yaml +++ b/problems/specs/KernelBench/level1/95_CrossEntropyLoss.yaml @@ -15,3 +15,10 @@ ci: dims: BATCH_SIZE: 64 NUM_CLASSES: 8 + +simple-cpu: + - params: [predictions, targets] + dtype: float32 + dims: + BATCH_SIZE: 64 + NUM_CLASSES: 8 diff --git a/problems/specs/KernelBench/level1/96_HuberLoss.yaml b/problems/specs/KernelBench/level1/96_HuberLoss.yaml index d6893b3..59ba064 100644 --- a/problems/specs/KernelBench/level1/96_HuberLoss.yaml +++ b/problems/specs/KernelBench/level1/96_HuberLoss.yaml @@ -15,3 +15,10 @@ ci: dims: BATCH_SIZE: 64 INPUT_DIM: 64 + +simple-cpu: + - params: [predictions, targets] + dtype: float32 + dims: + BATCH_SIZE: 64 + INPUT_DIM: 64 diff --git a/problems/specs/KernelBench/level1/97_ScaledDotProductAttention.yaml b/problems/specs/KernelBench/level1/97_ScaledDotProductAttention.yaml index 701f51b..f0255ec 100644 --- a/problems/specs/KernelBench/level1/97_ScaledDotProductAttention.yaml +++ b/problems/specs/KernelBench/level1/97_ScaledDotProductAttention.yaml @@ -19,3 +19,12 @@ ci: NUM_HEADS: 8 SEQUENCE_LENGTH: 16 EMBEDDING_DIMENSION: 32 + +simple-cpu: + - params: [Q, K, V] + dtype: float32 + dims: + BATCH_SIZE: 32 + NUM_HEADS: 32 + SEQUENCE_LENGTH: 64 + EMBEDDING_DIMENSION: 128 diff --git a/problems/specs/KernelBench/level1/98_KLDivLoss.yaml b/problems/specs/KernelBench/level1/98_KLDivLoss.yaml index 63efbdc..72bb8e2 100644 --- a/problems/specs/KernelBench/level1/98_KLDivLoss.yaml +++ b/problems/specs/KernelBench/level1/98_KLDivLoss.yaml @@ -16,3 +16,10 @@ ci: dims: BATCH_SIZE: 64 INPUT_DIM: 64 + 
+simple-cpu: + - params: [predictions, targets] + dtype: float32 + dims: + BATCH_SIZE: 64 + INPUT_DIM: 64 diff --git a/problems/specs/KernelBench/level1/99_TripletMarginLoss.yaml b/problems/specs/KernelBench/level1/99_TripletMarginLoss.yaml index 114af17..0ab92c1 100644 --- a/problems/specs/KernelBench/level1/99_TripletMarginLoss.yaml +++ b/problems/specs/KernelBench/level1/99_TripletMarginLoss.yaml @@ -20,3 +20,11 @@ ci: BATCH_SIZE: 64 INPUT_DIM: 8 MARGIN: 1.0 + +simple-cpu: + - params: [anchor, positive, negative] + dtype: float32 + dims: + BATCH_SIZE: 2048 + INPUT_DIM: 512 + MARGIN: 1.0 diff --git a/problems/specs/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.yaml b/problems/specs/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.yaml index 94ced01..9a25d89 100644 --- a/problems/specs/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.yaml +++ b/problems/specs/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.yaml @@ -14,6 +14,14 @@ ci: N: 16 flop: "2*M*M*N" +simple-cpu: + - params: [A, B] + dtype: bfloat16 + dims: + M: 256 + N: 16 + flop: "2*M*M*N" + bench-gpu: - params: [A, B] dtype: float16 diff --git a/pyproject.toml b/pyproject.toml index d0c35b1..ba285e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -115,7 +115,7 @@ override-dependencies = [ torch = { index = "pytorch" } pytorch-triton-xpu = { index = "pytorch" } pytorch-triton = { index = "pytorch" } -triton = { git = "https://github.com/triton-lang/triton-cpu.git", rev = "270e696" } +triton = { git = "https://github.com/triton-lang/triton-cpu.git", rev = "eece2e9" } lighthouse = { git = "https://github.com/llvm/lighthouse", rev = "456475d" } mlir-python-bindings = { index = "eudsl" } From cf5193f1d6ee04651c38f73e7665be5fa11d9712 Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Wed, 27 May 2026 08:53:57 -0700 Subject: [PATCH 2/3] Make blocks smaller, inputs larger, to ensure kernels launch as multiple programs --- 
backends/triton/cpu/KernelBench/level1/100_HingeLoss.py | 4 +++- .../level1/10_3D_tensor_matrix_multiplication.py | 4 +--- .../level1/11_4D_tensor_matrix_multiplication.py | 4 +--- .../level1/12_Matmul_with_diagonal_matrices_.py | 4 +++- .../level1/13_Matmul_for_symmetric_matrices.py | 4 +--- .../level1/14_Matmul_for_upper_triangular_matrices.py | 4 +--- .../level1/15_Matmul_for_lower_triangular_matrices.py | 4 +--- .../KernelBench/level1/16_Matmul_with_transposed_A.py | 4 +--- .../KernelBench/level1/17_Matmul_with_transposed_B.py | 4 +--- .../KernelBench/level1/18_Matmul_with_transposed_both.py | 4 +--- backends/triton/cpu/KernelBench/level1/19_ReLU.py | 2 +- .../level1/1_Square_matrix_multiplication_.py | 4 +--- backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py | 4 +++- backends/triton/cpu/KernelBench/level1/21_Sigmoid.py | 4 +++- backends/triton/cpu/KernelBench/level1/22_Tanh.py | 4 +++- backends/triton/cpu/KernelBench/level1/23_Softmax.py | 4 +++- backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py | 4 +++- backends/triton/cpu/KernelBench/level1/25_Swish.py | 4 +++- backends/triton/cpu/KernelBench/level1/26_GELU_.py | 2 +- backends/triton/cpu/KernelBench/level1/27_SELU_.py | 4 +++- backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py | 4 +++- backends/triton/cpu/KernelBench/level1/29_Softplus.py | 4 +++- .../level1/2_Standard_matrix_multiplication_.py | 4 +--- backends/triton/cpu/KernelBench/level1/30_Softsign.py | 4 +++- backends/triton/cpu/KernelBench/level1/31_ELU.py | 4 +++- backends/triton/cpu/KernelBench/level1/32_HardTanh.py | 4 +++- backends/triton/cpu/KernelBench/level1/33_BatchNorm.py | 4 +--- .../triton/cpu/KernelBench/level1/34_InstanceNorm.py | 4 +++- backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py | 8 ++++++-- backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py | 4 +++- .../triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py | 6 ++++-- backends/triton/cpu/KernelBench/level1/38_L1Norm_.py | 4 +++- 
backends/triton/cpu/KernelBench/level1/39_L2Norm_.py | 4 +++- .../level1/3_Batched_matrix_multiplication.py | 4 +--- backends/triton/cpu/KernelBench/level1/40_LayerNorm.py | 2 +- .../triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py | 4 +++- .../triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py | 4 +++- .../triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py | 4 +++- .../cpu/KernelBench/level1/44_Average_Pooling_1D.py | 4 +++- .../cpu/KernelBench/level1/45_Average_Pooling_2D.py | 4 +++- .../cpu/KernelBench/level1/46_Average_Pooling_3D.py | 4 +++- .../level1/47_Sum_reduction_over_a_dimension.py | 4 +++- .../level1/48_Mean_reduction_over_a_dimension.py | 4 +++- .../level1/49_Max_reduction_over_a_dimension.py | 4 +++- .../level1/4_Matrix_vector_multiplication_.py | 4 +++- .../50_conv_standard_2D__square_input__square_kernel.py | 4 +--- .../cpu/KernelBench/level1/51_Argmax_over_a_dimension.py | 4 +++- .../cpu/KernelBench/level1/52_Argmin_over_a_dimension.py | 2 +- .../level1/53_Min_reduction_over_a_dimension.py | 4 +--- .../54_conv_standard_3D__square_input__square_kernel.py | 6 ++---- ..._conv_standard_2D__asymmetric_input__square_kernel.py | 4 +--- ...v_standard_2D__asymmetric_input__asymmetric_kernel.py | 4 +--- ...57_conv_transposed_2D__square_input__square_kernel.py | 6 ++---- ...transposed_3D__asymmetric_input__asymmetric_kernel.py | 4 +++- ..._conv_standard_3D__asymmetric_input__square_kernel.py | 6 ++---- .../KernelBench/level1/5_Matrix_scalar_multiplication.py | 4 +++- ..._conv_standard_3D__square_input__asymmetric_kernel.py | 6 ++---- ...61_conv_transposed_3D__square_input__square_kernel.py | 4 +--- ..._conv_standard_2D__square_input__asymmetric_kernel.py | 4 +--- .../63_conv_standard_2D__square_input__square_kernel.py | 4 +--- .../cpu/KernelBench/level1/64_conv_transposed_1D.py | 4 +--- ...onv_transposed_2D__square_input__asymmetric_kernel.py | 6 ++---- ...v_standard_3D__asymmetric_input__asymmetric_kernel.py | 4 +--- 
.../triton/cpu/KernelBench/level1/67_conv_standard_1D.py | 4 +++- ...onv_transposed_3D__square_input__asymmetric_kernel.py | 4 +++- ...transposed_2D__asymmetric_input__asymmetric_kernel.py | 6 ++---- .../level1/6_Matmul_with_large_K_dimension_.py | 4 +--- ...onv_transposed_3D__asymmetric_input__square_kernel.py | 4 +++- ...onv_transposed_2D__asymmetric_input__square_kernel.py | 2 -- ..._input_asymmetric_kernel___strided_padded_grouped_.py | 4 +++- ...etric_input_square_kernel__strided_padded__grouped.py | 2 +- .../KernelBench/level1/74_conv_transposed_1D_dilated.py | 4 +--- ...ric_kernel_strided__grouped____padded____dilated__.py | 2 +- .../level1/76_conv_standard_1D_dilated_strided__.py | 4 +++- ...put_square_kernel___padded____dilated____strided__.py | 6 ++++-- ...d_2D_asymmetric_input_asymmetric_kernel___padded__.py | 2 -- ...put_square_kernel___padded____strided____dilated__.py | 4 +++- .../level1/7_Matmul_with_small_K_dimension_.py | 4 +--- ...uare_input_asymmetric_kernel___dilated____padded__.py | 6 ++---- ...put_square_kernel___dilated____padded____strided__.py | 6 ++++-- .../82_conv_depthwise_2D_square_input_square_kernel.py | 4 +++- ...3_conv_depthwise_2D_square_input_asymmetric_kernel.py | 4 +++- ...4_conv_depthwise_2D_asymmetric_input_square_kernel.py | 4 +++- ...nv_depthwise_2D_asymmetric_input_asymmetric_kernel.py | 4 +++- .../KernelBench/level1/86_conv_depthwise_separable_2D.py | 4 +--- .../cpu/KernelBench/level1/87_conv_pointwise_2D.py | 8 +++----- .../triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py | 4 +++- backends/triton/cpu/KernelBench/level1/89_cumsum.py | 4 +++- .../level1/8_Matmul_with_irregular_shapes_.py | 4 +--- backends/triton/cpu/KernelBench/level1/90_cumprod.py | 2 +- .../triton/cpu/KernelBench/level1/91_cumsum_reverse.py | 2 +- .../triton/cpu/KernelBench/level1/92_cumsum_exclusive.py | 4 +++- .../triton/cpu/KernelBench/level1/93_masked_cumsum.py | 4 +++- backends/triton/cpu/KernelBench/level1/94_MSELoss.py | 4 +--- 
.../triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py | 9 +-------- backends/triton/cpu/KernelBench/level1/96_HuberLoss.py | 2 +- .../KernelBench/level1/97_ScaledDotProductAttention.py | 4 ++-- backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py | 4 +--- .../cpu/KernelBench/level1/99_TripletMarginLoss.py | 4 +++- .../level1/9_Tall_skinny_matrix_multiplication_.py | 4 +--- .../level1/10_3D_tensor_matrix_multiplication.yaml | 8 ++++---- .../level1/11_4D_tensor_matrix_multiplication.yaml | 6 +++--- .../level1/13_Matmul_for_symmetric_matrices.yaml | 2 +- .../level1/14_Matmul_for_upper_triangular_matrices.yaml | 2 +- .../level1/15_Matmul_for_lower_triangular_matrices.yaml | 2 +- .../level1/1_Square_matrix_multiplication_.yaml | 2 +- .../level1/2_Standard_matrix_multiplication_.yaml | 6 +++--- .../level1/47_Sum_reduction_over_a_dimension.yaml | 4 ++-- .../level1/48_Mean_reduction_over_a_dimension.yaml | 4 ++-- .../level1/4_Matrix_vector_multiplication_.yaml | 4 ++-- .../KernelBench/level1/51_Argmax_over_a_dimension.yaml | 4 ++-- .../KernelBench/level1/52_Argmin_over_a_dimension.yaml | 4 ++-- .../level1/53_Min_reduction_over_a_dimension.yaml | 4 ++-- ...54_conv_standard_3D__square_input__square_kernel.yaml | 2 +- ...onv_standard_3D__asymmetric_input__square_kernel.yaml | 2 +- .../level1/5_Matrix_scalar_multiplication.yaml | 4 ++-- .../specs/KernelBench/level1/67_conv_standard_1D.yaml | 2 +- .../level1/6_Matmul_with_large_K_dimension_.yaml | 4 ++-- .../level1/7_Matmul_with_small_K_dimension_.yaml | 6 +++--- .../specs/KernelBench/level1/87_conv_pointwise_2D.yaml | 4 ++-- .../level1/8_Matmul_with_irregular_shapes_.yaml | 6 +++--- .../level1/9_Tall_skinny_matrix_multiplication_.yaml | 4 ++-- 122 files changed, 257 insertions(+), 240 deletions(-) diff --git a/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py b/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py index 1903395..a7c936c 100644 --- a/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py +++ 
b/backends/triton/cpu/KernelBench/level1/100_HingeLoss.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["D"], ) diff --git a/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py b/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py index b6629b8..a94792e 100644 --- a/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py +++ b/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py @@ -13,9 +13,7 @@ def _configs(): return [ triton.Config( - {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, - num_warps=32, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, ), ] diff --git a/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py b/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py index 10f3601..99f96b2 100644 --- a/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py +++ b/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py @@ -32,9 +32,7 @@ def swizzle_tile( def get_autotune_configs(): return [ triton.Config( - {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64, "GROUP_SIZE_M": 4}, - num_warps=32, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, ), ] diff --git a/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py b/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py index 499ab2a..7a1a6c0 100644 --- a/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py +++ b/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_M": 64, "BLOCK_N": 128}, num_warps=32, num_stages=2), + triton.Config( + {"BLOCK_M": 32, "BLOCK_N": 32}, + ), ], 
key=["N", "M"], ) diff --git a/backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py b/backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py index 13dc6df..c47d878 100644 --- a/backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py +++ b/backends/triton/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py @@ -32,9 +32,7 @@ def swizzle_tile( def _configs(): return [ triton.Config( - {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, - num_warps=32, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, ), ] diff --git a/backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py b/backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py index 184f008..3e05bc4 100644 --- a/backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py +++ b/backends/triton/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py @@ -34,9 +34,7 @@ def swizzle_tile( @triton.autotune( configs=[ triton.Config( - {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, - num_warps=32, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, ), ], key=["M", "N", "K"], diff --git a/backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py b/backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py index 8b755b7..9dde830 100644 --- a/backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py +++ b/backends/triton/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py @@ -32,9 +32,7 @@ def swizzle_tile( @triton.autotune( configs=[ triton.Config( - {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, - num_warps=32, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, ), ], key=["M"], diff --git 
a/backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py b/backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py index 1df1ce2..348ff28 100644 --- a/backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py +++ b/backends/triton/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py @@ -13,9 +13,7 @@ def get_autotune_configs(): return [ triton.Config( - {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, - num_warps=32, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, ), ] diff --git a/backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py b/backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py index c7b9f5f..f88ca8d 100644 --- a/backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py +++ b/backends/triton/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py @@ -13,9 +13,7 @@ def get_autotune_configs(): return [ triton.Config( - {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, - num_warps=32, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, ), ] diff --git a/backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py b/backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py index 54c94c6..b4c40df 100644 --- a/backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py +++ b/backends/triton/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py @@ -13,9 +13,7 @@ def _configs(): return [ triton.Config( - {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, - num_warps=32, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, ), ] diff --git a/backends/triton/cpu/KernelBench/level1/19_ReLU.py b/backends/triton/cpu/KernelBench/level1/19_ReLU.py index 1eed9d7..b7be751 100644 --- a/backends/triton/cpu/KernelBench/level1/19_ReLU.py +++ b/backends/triton/cpu/KernelBench/level1/19_ReLU.py @@ -13,7 
+13,7 @@ @triton.autotune( configs=[ triton.Config( - {"BLOCK_SIZE": 4096, "NUM_PROGRAMS": 256}, num_warps=8, num_stages=3 + {"BLOCK_SIZE": 32, "NUM_PROGRAMS": 64}, ), ], key=["n_elements"], diff --git a/backends/triton/cpu/KernelBench/level1/1_Square_matrix_multiplication_.py b/backends/triton/cpu/KernelBench/level1/1_Square_matrix_multiplication_.py index 09c940e..80f240d 100644 --- a/backends/triton/cpu/KernelBench/level1/1_Square_matrix_multiplication_.py +++ b/backends/triton/cpu/KernelBench/level1/1_Square_matrix_multiplication_.py @@ -13,9 +13,7 @@ def _configs(): return [ triton.Config( - {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, - num_warps=32, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, ), ] diff --git a/backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py b/backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py index 8002e4f..12f9a28 100644 --- a/backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py +++ b/backends/triton/cpu/KernelBench/level1/20_LeakyReLU.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["n_elements"], ) diff --git a/backends/triton/cpu/KernelBench/level1/21_Sigmoid.py b/backends/triton/cpu/KernelBench/level1/21_Sigmoid.py index 7240e65..a6b8f44 100644 --- a/backends/triton/cpu/KernelBench/level1/21_Sigmoid.py +++ b/backends/triton/cpu/KernelBench/level1/21_Sigmoid.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["N"], ) diff --git a/backends/triton/cpu/KernelBench/level1/22_Tanh.py b/backends/triton/cpu/KernelBench/level1/22_Tanh.py index f5f6074..e261dfc 100644 --- a/backends/triton/cpu/KernelBench/level1/22_Tanh.py +++ b/backends/triton/cpu/KernelBench/level1/22_Tanh.py @@ -15,7 +15,9 @@ @triton.autotune( configs=[ - 
triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["n_elements"], ) diff --git a/backends/triton/cpu/KernelBench/level1/23_Softmax.py b/backends/triton/cpu/KernelBench/level1/23_Softmax.py index 5e7ba1b..c813a14 100644 --- a/backends/triton/cpu/KernelBench/level1/23_Softmax.py +++ b/backends/triton/cpu/KernelBench/level1/23_Softmax.py @@ -12,7 +12,9 @@ def _softmax_configs(): return [ - triton.Config({"BLOCK_N": 2048}, num_warps=8, num_stages=3), + triton.Config( + {"BLOCK_N": 32}, + ), ] diff --git a/backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py b/backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py index c5e5f35..e314810 100644 --- a/backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py +++ b/backends/triton/cpu/KernelBench/level1/24_LogSoftmax.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_N": 2048, "warp_size": 32}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_N": 32, "warp_size": 32}, + ), ], key=["N"], ) diff --git a/backends/triton/cpu/KernelBench/level1/25_Swish.py b/backends/triton/cpu/KernelBench/level1/25_Swish.py index 0a682dd..14ea27a 100644 --- a/backends/triton/cpu/KernelBench/level1/25_Swish.py +++ b/backends/triton/cpu/KernelBench/level1/25_Swish.py @@ -19,7 +19,9 @@ def _sigmoid_exp2(x): @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["n_elements"], ) diff --git a/backends/triton/cpu/KernelBench/level1/26_GELU_.py b/backends/triton/cpu/KernelBench/level1/26_GELU_.py index 0d52473..777e424 100644 --- a/backends/triton/cpu/KernelBench/level1/26_GELU_.py +++ b/backends/triton/cpu/KernelBench/level1/26_GELU_.py @@ -13,7 +13,7 @@ @triton.autotune( configs=[ triton.Config( - {"BLOCK_SIZE": 4096, "NUM_PROGS": 160}, num_warps=4, num_stages=2 + {"BLOCK_SIZE": 32, "NUM_PROGS": 16}, ), ], key=["n_elements"], diff --git 
a/backends/triton/cpu/KernelBench/level1/27_SELU_.py b/backends/triton/cpu/KernelBench/level1/27_SELU_.py index 1d8a3a7..3eb2540 100644 --- a/backends/triton/cpu/KernelBench/level1/27_SELU_.py +++ b/backends/triton/cpu/KernelBench/level1/27_SELU_.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=3), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["n_elements"], ) diff --git a/backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py b/backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py index 6bde4bb..2d9d71b 100644 --- a/backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py +++ b/backends/triton/cpu/KernelBench/level1/28_HardSigmoid.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["N"], ) diff --git a/backends/triton/cpu/KernelBench/level1/29_Softplus.py b/backends/triton/cpu/KernelBench/level1/29_Softplus.py index d561bab..1829f87 100644 --- a/backends/triton/cpu/KernelBench/level1/29_Softplus.py +++ b/backends/triton/cpu/KernelBench/level1/29_Softplus.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["n_elements"], ) diff --git a/backends/triton/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py b/backends/triton/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py index 761dc6e..e002715 100644 --- a/backends/triton/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py +++ b/backends/triton/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py @@ -13,9 +13,7 @@ def _configs(): return [ triton.Config( - {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, - num_warps=32, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, ), ] diff --git 
a/backends/triton/cpu/KernelBench/level1/30_Softsign.py b/backends/triton/cpu/KernelBench/level1/30_Softsign.py index 6290f9a..574b897 100644 --- a/backends/triton/cpu/KernelBench/level1/30_Softsign.py +++ b/backends/triton/cpu/KernelBench/level1/30_Softsign.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["n_elements"], ) diff --git a/backends/triton/cpu/KernelBench/level1/31_ELU.py b/backends/triton/cpu/KernelBench/level1/31_ELU.py index c9e0a26..0397929 100644 --- a/backends/triton/cpu/KernelBench/level1/31_ELU.py +++ b/backends/triton/cpu/KernelBench/level1/31_ELU.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["n_elements"], ) diff --git a/backends/triton/cpu/KernelBench/level1/32_HardTanh.py b/backends/triton/cpu/KernelBench/level1/32_HardTanh.py index c5fa253..12e4fb8 100644 --- a/backends/triton/cpu/KernelBench/level1/32_HardTanh.py +++ b/backends/triton/cpu/KernelBench/level1/32_HardTanh.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["n_elements"], ) diff --git a/backends/triton/cpu/KernelBench/level1/33_BatchNorm.py b/backends/triton/cpu/KernelBench/level1/33_BatchNorm.py index 4b2eab4..9f8b17f 100644 --- a/backends/triton/cpu/KernelBench/level1/33_BatchNorm.py +++ b/backends/triton/cpu/KernelBench/level1/33_BatchNorm.py @@ -71,7 +71,7 @@ def _bn_stats_kernel( @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 4096, "warp_size": 32}, num_warps=8), + triton.Config({"BLOCK_SIZE": 32, "warp_size": 32}, num_warps=8), ], key=["total_elements"], ) @@ -153,7 +153,6 @@ def forward(self, x): stride_c, B=B, BLOCK_HW=8192, - num_warps=8, ) _bn_stats_kernel[(C,)]( self._partial_sum, @@ -166,7 +165,6 @@ def 
forward(self, x): self.eps, B=B, BLOCK_B=triton.next_power_of_2(B), - num_warps=4, ) else: inv_std = 1.0 / torch.sqrt(self.running_var + self.eps) diff --git a/backends/triton/cpu/KernelBench/level1/34_InstanceNorm.py b/backends/triton/cpu/KernelBench/level1/34_InstanceNorm.py index 4149704..17923cd 100644 --- a/backends/triton/cpu/KernelBench/level1/34_InstanceNorm.py +++ b/backends/triton/cpu/KernelBench/level1/34_InstanceNorm.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 2048, "warp_size": 32}, num_warps=8, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32, "warp_size": 32}, + ), ], key=["N"], ) diff --git a/backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py b/backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py index 4fab4bb..326cfab 100644 --- a/backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py +++ b/backends/triton/cpu/KernelBench/level1/35_GroupNorm_.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_HW": 4096, "warp_size": 32}, num_warps=8, num_stages=2), + triton.Config( + {"BLOCK_HW": 32, "warp_size": 32}, + ), ], key=["HW", "channels_per_group"], ) @@ -62,7 +64,9 @@ def group_norm_stats_kernel( @triton.autotune( configs=[ - triton.Config({"BLOCK_HW": 4096, "warp_size": 32}, num_warps=8, num_stages=2), + triton.Config( + {"BLOCK_HW": 32, "warp_size": 32}, + ), ], key=["HW"], ) diff --git a/backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py b/backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py index de1949b..ae56e24 100644 --- a/backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py +++ b/backends/triton/cpu/KernelBench/level1/36_RMSNorm_.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_S": 64}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_S": 32}, + ), ], key=["S", "F"], ) diff --git a/backends/triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py b/backends/triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py index d83afe2..ca79b72 100644 --- 
a/backends/triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py +++ b/backends/triton/cpu/KernelBench/level1/37_FrobeniusNorm_.py @@ -9,7 +9,7 @@ import triton import triton.language as tl -REDUCE_BLOCK: tl.constexpr = 8192 +REDUCE_BLOCK: tl.constexpr = 128 @triton.jit @@ -48,7 +48,9 @@ def _reduce_kernel( @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 2048}, num_warps=8, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["N"], ) diff --git a/backends/triton/cpu/KernelBench/level1/38_L1Norm_.py b/backends/triton/cpu/KernelBench/level1/38_L1Norm_.py index 4a3a396..30f0892 100644 --- a/backends/triton/cpu/KernelBench/level1/38_L1Norm_.py +++ b/backends/triton/cpu/KernelBench/level1/38_L1Norm_.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=8, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["N"], ) diff --git a/backends/triton/cpu/KernelBench/level1/39_L2Norm_.py b/backends/triton/cpu/KernelBench/level1/39_L2Norm_.py index ecd571f..4a5d86b 100644 --- a/backends/triton/cpu/KernelBench/level1/39_L2Norm_.py +++ b/backends/triton/cpu/KernelBench/level1/39_L2Norm_.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["N"], ) diff --git a/backends/triton/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py b/backends/triton/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py index 9380f52..8e07a2f 100644 --- a/backends/triton/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py +++ b/backends/triton/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py @@ -13,9 +13,7 @@ @triton.autotune( configs=[ triton.Config( - {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, - num_warps=32, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, ), ], key=["M", "N", "K"], diff --git 
a/backends/triton/cpu/KernelBench/level1/40_LayerNorm.py b/backends/triton/cpu/KernelBench/level1/40_LayerNorm.py index e0954b2..2b0491e 100644 --- a/backends/triton/cpu/KernelBench/level1/40_LayerNorm.py +++ b/backends/triton/cpu/KernelBench/level1/40_LayerNorm.py @@ -13,7 +13,7 @@ @triton.autotune( configs=[ triton.Config( - {"BLOCK_SIZE": 1024, "warp_size": 32}, num_warps=16, num_stages=2 + {"BLOCK_SIZE": 32, "warp_size": 32}, ), ], key=["N"], diff --git a/backends/triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py b/backends/triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py index 1d71cbb..9771174 100644 --- a/backends/triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py +++ b/backends/triton/cpu/KernelBench/level1/41_Max_Pooling_1D.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 256}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["output_length"], ) diff --git a/backends/triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py b/backends/triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py index 0d5aa8e..573edd8 100644 --- a/backends/triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py +++ b/backends/triton/cpu/KernelBench/level1/42_Max_Pooling_2D.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_H": 1, "BLOCK_W": 256}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_H": 1, "BLOCK_W": 32}, + ), ], key=["OH", "OW"], ) diff --git a/backends/triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py b/backends/triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py index 84f6dc5..0b313a6 100644 --- a/backends/triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py +++ b/backends/triton/cpu/KernelBench/level1/43_Max_Pooling_3D.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_OW": 32}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_OW": 32}, + ), ], key=["OW"], ) diff --git a/backends/triton/cpu/KernelBench/level1/44_Average_Pooling_1D.py 
b/backends/triton/cpu/KernelBench/level1/44_Average_Pooling_1D.py index 564bf7f..21234fb 100644 --- a/backends/triton/cpu/KernelBench/level1/44_Average_Pooling_1D.py +++ b/backends/triton/cpu/KernelBench/level1/44_Average_Pooling_1D.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 256}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["output_length", "kernel_size"], ) diff --git a/backends/triton/cpu/KernelBench/level1/45_Average_Pooling_2D.py b/backends/triton/cpu/KernelBench/level1/45_Average_Pooling_2D.py index 0e9e33f..3c5e3b1 100644 --- a/backends/triton/cpu/KernelBench/level1/45_Average_Pooling_2D.py +++ b/backends/triton/cpu/KernelBench/level1/45_Average_Pooling_2D.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_OW": 16}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_OW": 16}, + ), ], key=["OW"], ) diff --git a/backends/triton/cpu/KernelBench/level1/46_Average_Pooling_3D.py b/backends/triton/cpu/KernelBench/level1/46_Average_Pooling_3D.py index f52c515..aa98269 100644 --- a/backends/triton/cpu/KernelBench/level1/46_Average_Pooling_3D.py +++ b/backends/triton/cpu/KernelBench/level1/46_Average_Pooling_3D.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_OW": 64}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_OW": 32}, + ), ], key=["OW"], ) diff --git a/backends/triton/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py index e5088dd..ad7fed5 100644 --- a/backends/triton/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py +++ b/backends/triton/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py @@ -12,7 +12,9 @@ def get_reduction_configs(): return [ - triton.Config({"BLOCK_R": 64, "BLOCK_N": 128}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_R": 32, "BLOCK_N": 64}, + ), ] diff --git 
a/backends/triton/cpu/KernelBench/level1/48_Mean_reduction_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/48_Mean_reduction_over_a_dimension.py index 7b43864..d1d0151 100644 --- a/backends/triton/cpu/KernelBench/level1/48_Mean_reduction_over_a_dimension.py +++ b/backends/triton/cpu/KernelBench/level1/48_Mean_reduction_over_a_dimension.py @@ -12,7 +12,9 @@ def get_reduction_configs(): return [ - triton.Config({"BLOCK_R": 64, "BLOCK_N": 128}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_R": 32, "BLOCK_N": 64}, + ), ] diff --git a/backends/triton/cpu/KernelBench/level1/49_Max_reduction_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/49_Max_reduction_over_a_dimension.py index a82f882..469246e 100644 --- a/backends/triton/cpu/KernelBench/level1/49_Max_reduction_over_a_dimension.py +++ b/backends/triton/cpu/KernelBench/level1/49_Max_reduction_over_a_dimension.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_N": 64, "BLOCK_K": 64}, num_warps=8, num_stages=2), + triton.Config( + {"BLOCK_N": 32, "BLOCK_K": 64}, + ), ], key=["DIM1", "DIM2"], ) diff --git a/backends/triton/cpu/KernelBench/level1/4_Matrix_vector_multiplication_.py b/backends/triton/cpu/KernelBench/level1/4_Matrix_vector_multiplication_.py index 4573a0e..5407588 100644 --- a/backends/triton/cpu/KernelBench/level1/4_Matrix_vector_multiplication_.py +++ b/backends/triton/cpu/KernelBench/level1/4_Matrix_vector_multiplication_.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_K": 512}, num_warps=4, num_stages=3), + triton.Config( + {"BLOCK_K": 32}, + ), ], key=["K"], ) diff --git a/backends/triton/cpu/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.py index a2e0bcb..5fcaf7d 100644 --- a/backends/triton/cpu/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.py +++ 
b/backends/triton/cpu/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.py @@ -34,9 +34,7 @@ def swizzle_tile( @triton.autotune( configs=[ triton.Config( - {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, - num_warps=32, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, ), ], key=["M", "N", "K"], diff --git a/backends/triton/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py index 1b9d31d..4b7b467 100644 --- a/backends/triton/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py +++ b/backends/triton/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_N": 64, "grf_mode": "128"}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_N": 32, "grf_mode": "128"}, + ), ], key=["D1", "D2"], ) diff --git a/backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py index 15dc776..7a8bb0c 100644 --- a/backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py +++ b/backends/triton/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py @@ -13,7 +13,7 @@ @triton.autotune( configs=[ triton.Config( - {"BLOCK_D2": 128, "BLOCK_K": 32, "warp_size": 16}, num_warps=4, num_stages=2 + {"BLOCK_D2": 128, "BLOCK_K": 32, "warp_size": 16}, ), ], key=["D1", "D2"], diff --git a/backends/triton/cpu/KernelBench/level1/53_Min_reduction_over_a_dimension.py b/backends/triton/cpu/KernelBench/level1/53_Min_reduction_over_a_dimension.py index b4fc20a..75ecb5e 100644 --- a/backends/triton/cpu/KernelBench/level1/53_Min_reduction_over_a_dimension.py +++ b/backends/triton/cpu/KernelBench/level1/53_Min_reduction_over_a_dimension.py @@ -13,9 +13,7 @@ @triton.autotune( configs=[ triton.Config( - {"BLOCK_D1": 64, "BLOCK_D2": 256, "warp_size": 16}, - num_warps=4, - num_stages=6, + {"BLOCK_D1": 32, "BLOCK_D2": 
64, "warp_size": 16}, ), ], key=["D1", "D2"], diff --git a/backends/triton/cpu/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.py index a6a0552..0df02f0 100644 --- a/backends/triton/cpu/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.py @@ -20,14 +20,12 @@ def _to_triple(x): configs=[ triton.Config( { - "BLOCK_M": 64, - "BLOCK_N": 64, + "BLOCK_M": 32, + "BLOCK_N": 32, "BLOCK_K": 16, "GROUP_M": 8, "grf_mode": "256", }, - num_warps=4, - num_stages=2, ), ], key=["C_out", "K_FUSED"], diff --git a/backends/triton/cpu/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.py index 5618624..3cbfdcf 100644 --- a/backends/triton/cpu/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.py @@ -13,9 +13,7 @@ @triton.autotune( configs=[ triton.Config( - {"BLOCK_OW": 64, "BLOCK_N": 64, "BLOCK_K": 64, "grf_mode": "256"}, - num_warps=4, - num_stages=2, + {"BLOCK_OW": 64, "BLOCK_N": 32, "BLOCK_K": 32, "grf_mode": "256"}, ), ], key=["H", "W", "C_IN", "C_out", "OH", "OW"], diff --git a/backends/triton/cpu/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py index 0c35718..0455e06 100644 --- a/backends/triton/cpu/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py @@ -13,9 +13,7 @@ @triton.autotune( configs=[ triton.Config( - {"BLOCK_OW": 64, "BLOCK_N": 64, "BLOCK_K": 64, 
"grf_mode": "128"}, - num_warps=4, - num_stages=2, + {"BLOCK_OW": 64, "BLOCK_N": 32, "BLOCK_K": 32, "grf_mode": "128"}, ), ], key=["H", "W", "C_IN", "C_out", "OH", "OW"], diff --git a/backends/triton/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py index 1206eac..849dcc7 100644 --- a/backends/triton/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py @@ -16,13 +16,11 @@ triton.Config( { "BLOCK_OW": 64, - "BLOCK_N": 64, - "BLOCK_K": 64, + "BLOCK_N": 32, + "BLOCK_K": 32, "GROUP_SIZE_M": 8, "grf_mode": "256", }, - num_warps=4, - num_stages=2, ), ], key=["H", "W", "C_IN", "C_out", "OH", "OW"], diff --git a/backends/triton/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py index 2c22089..fc34ee8 100644 --- a/backends/triton/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_W": 16, "BLOCK_OC": 16}, num_warps=4, num_stages=3), + triton.Config( + {"BLOCK_W": 16, "BLOCK_OC": 16}, + ), ], key=["W_out", "C_out"], ) diff --git a/backends/triton/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py index 14a217f..288e4b1 100644 --- a/backends/triton/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py @@ -14,14 +14,12 @@ configs=[ triton.Config( { - 
"BLOCK_M": 64, - "BLOCK_N": 64, + "BLOCK_M": 32, + "BLOCK_N": 32, "BLOCK_K": 16, "GROUP_M": 8, "grf_mode": "256", }, - num_warps=4, - num_stages=2, ), ], key=["C_out", "K_FUSED"], diff --git a/backends/triton/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py b/backends/triton/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py index fe569a8..0b1463a 100644 --- a/backends/triton/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py +++ b/backends/triton/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024 * 2}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["n_elements"], ) diff --git a/backends/triton/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py index 529f8ee..d23dd44 100644 --- a/backends/triton/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py @@ -20,14 +20,12 @@ def _to_triple(x): configs=[ triton.Config( { - "BLOCK_M": 64, - "BLOCK_N": 64, + "BLOCK_M": 32, + "BLOCK_N": 32, "BLOCK_K": 16, "GROUP_M": 8, "grf_mode": "256", }, - num_warps=4, - num_stages=2, ), ], key=["C_out", "K_FUSED"], diff --git a/backends/triton/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py index d7a5e6d..93214fb 100644 --- a/backends/triton/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py @@ -13,9 +13,7 @@ @triton.autotune( configs=[ triton.Config( - {"BLOCK_OW": 64, "BLOCK_N": 64, "BLOCK_K": 16, "grf_mode": "256"}, - num_warps=4, - num_stages=3, + 
{"BLOCK_OW": 64, "BLOCK_N": 32, "BLOCK_K": 16, "grf_mode": "256"}, ), ], key=["D", "H", "W", "C_IN", "C_OUT", "OD", "OH", "OW"], diff --git a/backends/triton/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py index 0c35718..0455e06 100644 --- a/backends/triton/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py @@ -13,9 +13,7 @@ @triton.autotune( configs=[ triton.Config( - {"BLOCK_OW": 64, "BLOCK_N": 64, "BLOCK_K": 64, "grf_mode": "128"}, - num_warps=4, - num_stages=2, + {"BLOCK_OW": 64, "BLOCK_N": 32, "BLOCK_K": 32, "grf_mode": "128"}, ), ], key=["H", "W", "C_IN", "C_out", "OH", "OW"], diff --git a/backends/triton/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py index 760a8fa..da19242 100644 --- a/backends/triton/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py @@ -13,9 +13,7 @@ @triton.autotune( configs=[ triton.Config( - {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 16, "grf_mode": "128"}, - num_warps=4, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16, "grf_mode": "128"}, ), ], key=["M_total", "C_out", "C_IN"], diff --git a/backends/triton/cpu/KernelBench/level1/64_conv_transposed_1D.py b/backends/triton/cpu/KernelBench/level1/64_conv_transposed_1D.py index c9f1851..adda572 100644 --- a/backends/triton/cpu/KernelBench/level1/64_conv_transposed_1D.py +++ b/backends/triton/cpu/KernelBench/level1/64_conv_transposed_1D.py @@ -13,9 +13,7 @@ @triton.autotune( configs=[ triton.Config( - {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16, "GROUP_M": 8}, - num_warps=4, - 
num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16, "GROUP_M": 8}, ), ], key=["C_IN", "C_out", "OL"], diff --git a/backends/triton/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py index 81d1992..1e834ce 100644 --- a/backends/triton/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py @@ -16,13 +16,11 @@ triton.Config( { "BLOCK_OW": 64, - "BLOCK_N": 64, - "BLOCK_K": 64, + "BLOCK_N": 32, + "BLOCK_K": 32, "GROUP_SIZE_M": 8, "grf_mode": "256", }, - num_warps=4, - num_stages=2, ), ], key=["H", "W", "C_IN", "C_out", "OH", "OW"], diff --git a/backends/triton/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py index 656518f..7f8eff7 100644 --- a/backends/triton/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py @@ -19,9 +19,7 @@ def _to_triple(x): @triton.autotune( configs=[ triton.Config( - {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16, "grf_mode": "256"}, - num_warps=4, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16, "grf_mode": "256"}, ), ], key=["C_out", "K_FUSED"], diff --git a/backends/triton/cpu/KernelBench/level1/67_conv_standard_1D.py b/backends/triton/cpu/KernelBench/level1/67_conv_standard_1D.py index 7abf077..9a6ba31 100644 --- a/backends/triton/cpu/KernelBench/level1/67_conv_standard_1D.py +++ b/backends/triton/cpu/KernelBench/level1/67_conv_standard_1D.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_OL": 64, "BLOCK_N": 128}, num_warps=4, num_stages=3), + triton.Config( + {"BLOCK_OL": 64, 
"BLOCK_N": 128}, + ), ], key=["OL", "OC"], ) diff --git a/backends/triton/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py index 9bcf102..d2a7ff6 100644 --- a/backends/triton/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_W": 16, "BLOCK_OC": 16}, num_warps=4, num_stages=3), + triton.Config( + {"BLOCK_W": 16, "BLOCK_OC": 16}, + ), ], key=["W_out", "C_out"], ) diff --git a/backends/triton/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py index 233c9b5..3693da9 100644 --- a/backends/triton/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py @@ -16,13 +16,11 @@ triton.Config( { "BLOCK_OW": 64, - "BLOCK_N": 64, - "BLOCK_K": 64, + "BLOCK_N": 32, + "BLOCK_K": 32, "GROUP_SIZE_M": 8, "grf_mode": "256", }, - num_warps=4, - num_stages=2, ), ], key=["H", "W", "C_IN", "C_out", "OH", "OW"], diff --git a/backends/triton/cpu/KernelBench/level1/6_Matmul_with_large_K_dimension_.py b/backends/triton/cpu/KernelBench/level1/6_Matmul_with_large_K_dimension_.py index 2dd9faa..055a1a6 100644 --- a/backends/triton/cpu/KernelBench/level1/6_Matmul_with_large_K_dimension_.py +++ b/backends/triton/cpu/KernelBench/level1/6_Matmul_with_large_K_dimension_.py @@ -13,9 +13,7 @@ def _configs(): return [ triton.Config( - {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_SIZE_M": 4}, - num_warps=8, - num_stages=3, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, ), ] diff 
--git a/backends/triton/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py index 9418854..a5ee485 100644 --- a/backends/triton/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_W": 16, "BLOCK_OC": 32}, num_warps=4, num_stages=3), + triton.Config( + {"BLOCK_W": 16, "BLOCK_OC": 32}, + ), ], key=["W_out", "C_out"], ) diff --git a/backends/triton/cpu/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py b/backends/triton/cpu/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py index e93ce1a..5e8915f 100644 --- a/backends/triton/cpu/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py @@ -21,8 +21,6 @@ "GROUP_SIZE_M": 8, "grf_mode": "128", }, - num_warps=4, - num_stages=3, ), ], key=["H", "W", "C_IN", "C_out", "OH", "OW"], diff --git a/backends/triton/cpu/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py b/backends/triton/cpu/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py index 8509131..e2b6816 100644 --- a/backends/triton/cpu/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py +++ b/backends/triton/cpu/KernelBench/level1/72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_OW": 64, "BLOCK_OH": 2}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_OW": 64, "BLOCK_OH": 2}, + ), ], key=["D_out", "H_out", 
"W_out"], ) diff --git a/backends/triton/cpu/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py b/backends/triton/cpu/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py index 91671df..19f2d5d 100644 --- a/backends/triton/cpu/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py +++ b/backends/triton/cpu/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py @@ -167,7 +167,7 @@ def kernel_function( ) # choose a block size for the flattened spatial dimension - BLOCK = 256 + BLOCK = 32 grid = (B, C_out, triton.cdiv(Do * Ho * Wo, BLOCK)) # launch diff --git a/backends/triton/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py b/backends/triton/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py index 830134b..fd7a5db 100644 --- a/backends/triton/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py +++ b/backends/triton/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py @@ -13,9 +13,7 @@ @triton.autotune( configs=[ triton.Config( - {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16, "GROUP_M": 8}, - num_warps=4, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16, "GROUP_M": 8}, ), ], key=["C_IN", "C_out", "OL"], diff --git a/backends/triton/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py b/backends/triton/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py index 69289d0..3c89b3f 100644 --- a/backends/triton/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py +++ b/backends/triton/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py @@ -185,7 +185,7 @@ def kernel_function( 
out_stride_b, out_stride_c, out_stride_h, out_stride_w = out.stride() # 6) Launch parameters - BLOCK = 256 + BLOCK = 32 grid = (triton.cdiv(total_elems, BLOCK),) # 7) Launch kernel diff --git a/backends/triton/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py b/backends/triton/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py index 78155ce..d67ec91 100644 --- a/backends/triton/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py +++ b/backends/triton/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_OL": 64, "BLOCK_N": 128}, num_warps=4, num_stages=3), + triton.Config( + {"BLOCK_OL": 32, "BLOCK_N": 64}, + ), ], key=["OL", "OC"], ) diff --git a/backends/triton/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py b/backends/triton/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py index 9942ff4..31acbda 100644 --- a/backends/triton/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py +++ b/backends/triton/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py @@ -16,7 +16,9 @@ def _compute_output_size(input_size, kernel_size, stride, padding, dilation): @triton.autotune( configs=[ - triton.Config({"BLOCK_W": 16, "BLOCK_OC": 32}, num_warps=4, num_stages=4), + triton.Config( + {"BLOCK_W": 16, "BLOCK_OC": 32}, + ), ], key=["W_act", "C_out"], ) @@ -197,7 +199,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: output = self._output_buf N = output.numel() - BLOCK = 1024 + BLOCK = 32 _zero_kernel[(triton.cdiv(N, BLOCK),)](output, N, BLOCK=BLOCK) sx = x_cl3d.stride() diff --git a/backends/triton/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py 
b/backends/triton/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py index 854892d..00e4f51 100644 --- a/backends/triton/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py +++ b/backends/triton/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py @@ -21,8 +21,6 @@ "GROUP_SIZE_M": 8, "grf_mode": "128", }, - num_warps=4, - num_stages=3, ), ], key=["H", "W", "C_IN", "C_out", "OH", "OW"], diff --git a/backends/triton/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py b/backends/triton/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py index 84f13ff..02186e7 100644 --- a/backends/triton/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py +++ b/backends/triton/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_L": 128}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_L": 32}, + ), ], key=["N_NONZERO"], ) diff --git a/backends/triton/cpu/KernelBench/level1/7_Matmul_with_small_K_dimension_.py b/backends/triton/cpu/KernelBench/level1/7_Matmul_with_small_K_dimension_.py index dc43e9f..7dedcbe 100644 --- a/backends/triton/cpu/KernelBench/level1/7_Matmul_with_small_K_dimension_.py +++ b/backends/triton/cpu/KernelBench/level1/7_Matmul_with_small_K_dimension_.py @@ -32,9 +32,7 @@ def swizzle_tile( @triton.autotune( configs=[ triton.Config( - {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64, "GROUP_SIZE_M": 4}, - num_warps=32, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, ), ], key=["M", "N", "K"], diff --git 
a/backends/triton/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py b/backends/triton/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py index 2494939..6d0a219 100644 --- a/backends/triton/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py +++ b/backends/triton/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py @@ -18,14 +18,12 @@ def _to_pair(x): configs=[ triton.Config( { - "BLOCK_M": 64, - "BLOCK_N": 64, + "BLOCK_M": 32, + "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_M": 8, "grf_mode": "128", }, - num_warps=4, - num_stages=3, ), ], key=["C_IN", "C_out", "OH", "OW"], diff --git a/backends/triton/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py b/backends/triton/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py index b282b7b..bfc9432 100644 --- a/backends/triton/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py +++ b/backends/triton/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py @@ -16,7 +16,9 @@ def _to_pair(x): @triton.autotune( configs=[ - triton.Config({"BLOCK_W": 16, "BLOCK_OC": 32}, num_warps=4, num_stages=3), + triton.Config( + {"BLOCK_W": 16, "BLOCK_OC": 32}, + ), ], key=["W_IN", "C_out"], ) @@ -179,7 +181,7 @@ def forward(self, x): output = self._output N_elem = output.numel() - _zero_kernel[(triton.cdiv(N_elem, 1024),)](output, N_elem, BLOCK=1024) + _zero_kernel[(triton.cdiv(N_elem, 32),)](output, N_elem, BLOCK=32) sx = x_cl.stride() so = output.stride() diff --git a/backends/triton/cpu/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.py 
b/backends/triton/cpu/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.py index e8395b3..94660ed 100644 --- a/backends/triton/cpu/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=1), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["OHOW"], ) diff --git a/backends/triton/cpu/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py index 6b9c313..3190fb2 100644 --- a/backends/triton/cpu/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=1), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["OHOW"], ) diff --git a/backends/triton/cpu/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py b/backends/triton/cpu/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py index 94c2d3c..ec92ac7 100644 --- a/backends/triton/cpu/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=1), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["OHOW"], ) diff --git a/backends/triton/cpu/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py b/backends/triton/cpu/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py index 61d7834..c19d1ab 100644 --- 
a/backends/triton/cpu/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py +++ b/backends/triton/cpu/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=1), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["OHOW"], ) diff --git a/backends/triton/cpu/KernelBench/level1/86_conv_depthwise_separable_2D.py b/backends/triton/cpu/KernelBench/level1/86_conv_depthwise_separable_2D.py index 0bcdc18..c97f8bb 100644 --- a/backends/triton/cpu/KernelBench/level1/86_conv_depthwise_separable_2D.py +++ b/backends/triton/cpu/KernelBench/level1/86_conv_depthwise_separable_2D.py @@ -14,9 +14,7 @@ @triton.autotune( configs=[ triton.Config( - {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 16, "grf_mode": "128"}, - num_warps=4, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16, "grf_mode": "128"}, ), ], key=["M_total", "C_out", "C_IN"], diff --git a/backends/triton/cpu/KernelBench/level1/87_conv_pointwise_2D.py b/backends/triton/cpu/KernelBench/level1/87_conv_pointwise_2D.py index cf48a00..e6c8cbc 100644 --- a/backends/triton/cpu/KernelBench/level1/87_conv_pointwise_2D.py +++ b/backends/triton/cpu/KernelBench/level1/87_conv_pointwise_2D.py @@ -94,9 +94,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: weight = self.conv1d.weight.squeeze(-1).squeeze(-1).contiguous() N_spatial = H * W - BLOCK_M = 128 - BLOCK_N = 128 - BLOCK_K = 64 + BLOCK_M = 32 + BLOCK_N = 32 + BLOCK_K = 16 GROUP_M = 8 out = torch.empty((B, C_out, H, W), device=x.device, dtype=x.dtype) @@ -122,8 +122,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K, GROUP_M=GROUP_M, - num_warps=8, - num_stages=1, ) if self.conv1d.bias is not None: diff --git a/backends/triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py b/backends/triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py index bde7264..e1f54dd 100644 --- 
a/backends/triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py +++ b/backends/triton/cpu/KernelBench/level1/88_MinGPTNewGelu.py @@ -13,7 +13,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["n_elements"], ) diff --git a/backends/triton/cpu/KernelBench/level1/89_cumsum.py b/backends/triton/cpu/KernelBench/level1/89_cumsum.py index 34e0a1e..b5e3ce0 100644 --- a/backends/triton/cpu/KernelBench/level1/89_cumsum.py +++ b/backends/triton/cpu/KernelBench/level1/89_cumsum.py @@ -17,7 +17,9 @@ def scan_add_op(a, b): @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 256}, num_warps=4, num_stages=1), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["N"], ) diff --git a/backends/triton/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py b/backends/triton/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py index ad4bb8a..15b7b38 100644 --- a/backends/triton/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py +++ b/backends/triton/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py @@ -13,9 +13,7 @@ def get_autotune_configs(): return [ triton.Config( - {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, - num_warps=32, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, ), ] diff --git a/backends/triton/cpu/KernelBench/level1/90_cumprod.py b/backends/triton/cpu/KernelBench/level1/90_cumprod.py index e0d9e1e..4f9057f 100644 --- a/backends/triton/cpu/KernelBench/level1/90_cumprod.py +++ b/backends/triton/cpu/KernelBench/level1/90_cumprod.py @@ -17,7 +17,7 @@ def _mul_combine(a, b): @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 512}, num_warps=4), + triton.Config({"BLOCK_SIZE": 32}, num_warps=4), ], key=["N"], ) diff --git a/backends/triton/cpu/KernelBench/level1/91_cumsum_reverse.py b/backends/triton/cpu/KernelBench/level1/91_cumsum_reverse.py index 3d78bb9..b9905e5 100644 --- 
a/backends/triton/cpu/KernelBench/level1/91_cumsum_reverse.py +++ b/backends/triton/cpu/KernelBench/level1/91_cumsum_reverse.py @@ -12,7 +12,7 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_N": 256}, num_warps=4), + triton.Config({"BLOCK_N": 32}, num_warps=4), ], key=["N"], ) diff --git a/backends/triton/cpu/KernelBench/level1/92_cumsum_exclusive.py b/backends/triton/cpu/KernelBench/level1/92_cumsum_exclusive.py index 94e17ba..eba7c00 100644 --- a/backends/triton/cpu/KernelBench/level1/92_cumsum_exclusive.py +++ b/backends/triton/cpu/KernelBench/level1/92_cumsum_exclusive.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["N"], ) diff --git a/backends/triton/cpu/KernelBench/level1/93_masked_cumsum.py b/backends/triton/cpu/KernelBench/level1/93_masked_cumsum.py index b440482..cdc00e1 100644 --- a/backends/triton/cpu/KernelBench/level1/93_masked_cumsum.py +++ b/backends/triton/cpu/KernelBench/level1/93_masked_cumsum.py @@ -17,7 +17,9 @@ def scan_add_op(a, b): @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 512}, num_warps=4, num_stages=1), + triton.Config( + {"BLOCK_SIZE": 32}, + ), ], key=["N"], ) diff --git a/backends/triton/cpu/KernelBench/level1/94_MSELoss.py b/backends/triton/cpu/KernelBench/level1/94_MSELoss.py index 5111369..51e21e6 100644 --- a/backends/triton/cpu/KernelBench/level1/94_MSELoss.py +++ b/backends/triton/cpu/KernelBench/level1/94_MSELoss.py @@ -13,9 +13,7 @@ def get_autotune_configs(): configs = [ triton.Config( - {"BLOCK_SIZE": 1024}, - num_warps=4, - num_stages=1, + {"BLOCK_SIZE": 32}, ) ] return configs diff --git a/backends/triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py b/backends/triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py index 6a9f8e8..dca8570 100644 --- a/backends/triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py +++ b/backends/triton/cpu/KernelBench/level1/95_CrossEntropyLoss.py @@ 
-11,13 +11,7 @@ def _ce_configs(): - configs = [] - for BN in [1024, 2048, 4096]: - for nw in [4, 8, 16]: - for ws in [16, 32]: - configs.append( - triton.Config({"BLOCK_N": BN, "warp_size": ws}, num_warps=nw) - ) + configs = [triton.Config({"BLOCK_N": 32})] return configs @@ -31,7 +25,6 @@ def _cross_entropy_online_kernel( stride_lm, stride_ln, BLOCK_N: tl.constexpr, - warp_size: tl.constexpr, ): row = tl.program_id(0) row_off = row.to(tl.int64) * stride_lm diff --git a/backends/triton/cpu/KernelBench/level1/96_HuberLoss.py b/backends/triton/cpu/KernelBench/level1/96_HuberLoss.py index 30e4887..9f09d74 100644 --- a/backends/triton/cpu/KernelBench/level1/96_HuberLoss.py +++ b/backends/triton/cpu/KernelBench/level1/96_HuberLoss.py @@ -12,7 +12,7 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_SIZE": 4096}, num_warps=4), + triton.Config({"BLOCK_SIZE": 32}, num_warps=4), ], key=["n_cols"], ) diff --git a/backends/triton/cpu/KernelBench/level1/97_ScaledDotProductAttention.py b/backends/triton/cpu/KernelBench/level1/97_ScaledDotProductAttention.py index dc20d99..a378b24 100644 --- a/backends/triton/cpu/KernelBench/level1/97_ScaledDotProductAttention.py +++ b/backends/triton/cpu/KernelBench/level1/97_ScaledDotProductAttention.py @@ -15,7 +15,7 @@ @triton.autotune( configs=[ triton.Config( - {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64}, num_warps=16, num_stages=2 + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32}, ), ], key=["SEQ_LEN", "HEAD_DIM"], @@ -93,7 +93,7 @@ def _qk_gemm_kernel( @triton.autotune( configs=[ triton.Config( - {"BLOCK_M": 64, "BLOCK_K": 64, "BLOCK_N": 128}, num_warps=8, num_stages=3 + {"BLOCK_M": 32, "BLOCK_K": 32, "BLOCK_N": 128}, ), ], key=["SEQ_LEN", "HEAD_DIM"], diff --git a/backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py b/backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py index c0ed1ab..a5b876f 100644 --- a/backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py +++ b/backends/triton/cpu/KernelBench/level1/98_KLDivLoss.py @@ 
-17,9 +17,7 @@ def get_kl_div_configs(): configs = [ triton.Config( - {"BLOCK_SIZE": 4096}, - num_warps=4, - num_stages=1, + {"BLOCK_SIZE": 32}, ) ] return configs diff --git a/backends/triton/cpu/KernelBench/level1/99_TripletMarginLoss.py b/backends/triton/cpu/KernelBench/level1/99_TripletMarginLoss.py index 237d264..2d0538c 100644 --- a/backends/triton/cpu/KernelBench/level1/99_TripletMarginLoss.py +++ b/backends/triton/cpu/KernelBench/level1/99_TripletMarginLoss.py @@ -12,7 +12,9 @@ @triton.autotune( configs=[ - triton.Config({"BLOCK_K": 256}, num_warps=4, num_stages=2), + triton.Config( + {"BLOCK_K": 32}, + ), ], key=["D"], ) diff --git a/backends/triton/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py b/backends/triton/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py index 152975e..e5605d7 100644 --- a/backends/triton/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py +++ b/backends/triton/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py @@ -32,9 +32,7 @@ def swizzle_tile( def _configs(): return [ triton.Config( - {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, - num_warps=16, - num_stages=2, + {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_SIZE_M": 4}, ), ] diff --git a/problems/specs/KernelBench/level1/10_3D_tensor_matrix_multiplication.yaml b/problems/specs/KernelBench/level1/10_3D_tensor_matrix_multiplication.yaml index 43eabc9..2f7f86a 100644 --- a/problems/specs/KernelBench/level1/10_3D_tensor_matrix_multiplication.yaml +++ b/problems/specs/KernelBench/level1/10_3D_tensor_matrix_multiplication.yaml @@ -20,10 +20,10 @@ simple-cpu: - params: [A, B] dtype: bfloat16 dims: - N: 4 - M: 32 - K: 64 - L: 24 + N: 64 + M: 256 + K: 128 + L: 96 flop: "2*N*M*L*K" bench-gpu: diff --git a/problems/specs/KernelBench/level1/11_4D_tensor_matrix_multiplication.yaml b/problems/specs/KernelBench/level1/11_4D_tensor_matrix_multiplication.yaml index d0c5522..81822b0 100644 --- 
a/problems/specs/KernelBench/level1/11_4D_tensor_matrix_multiplication.yaml +++ b/problems/specs/KernelBench/level1/11_4D_tensor_matrix_multiplication.yaml @@ -22,10 +22,10 @@ simple-cpu: dtype: bfloat16 dims: B: 2 - I: 32 + I: 128 J: 64 - L: 32 - K: 24 + L: 128 + K: 96 flop: "2*B*I*J*K*L" bench-gpu: diff --git a/problems/specs/KernelBench/level1/13_Matmul_for_symmetric_matrices.yaml b/problems/specs/KernelBench/level1/13_Matmul_for_symmetric_matrices.yaml index 9808093..3034011 100644 --- a/problems/specs/KernelBench/level1/13_Matmul_for_symmetric_matrices.yaml +++ b/problems/specs/KernelBench/level1/13_Matmul_for_symmetric_matrices.yaml @@ -20,4 +20,4 @@ simple-cpu: - params: [A, B] dtype: bfloat16 dims: - N: 64 + N: 128 diff --git a/problems/specs/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.yaml b/problems/specs/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.yaml index b29c1f6..9247081 100644 --- a/problems/specs/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.yaml +++ b/problems/specs/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.yaml @@ -20,4 +20,4 @@ simple-cpu: - params: [A, B] dtype: bfloat16 dims: - N: 64 + N: 128 diff --git a/problems/specs/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.yaml b/problems/specs/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.yaml index f6043d4..6ec276c 100644 --- a/problems/specs/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.yaml +++ b/problems/specs/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.yaml @@ -20,4 +20,4 @@ simple-cpu: - params: [A, B] dtype: bfloat16 dims: - M: 64 + M: 128 diff --git a/problems/specs/KernelBench/level1/1_Square_matrix_multiplication_.yaml b/problems/specs/KernelBench/level1/1_Square_matrix_multiplication_.yaml index 2f5de5b..fef7c31 100644 --- a/problems/specs/KernelBench/level1/1_Square_matrix_multiplication_.yaml +++ b/problems/specs/KernelBench/level1/1_Square_matrix_multiplication_.yaml @@ 
-16,7 +16,7 @@ simple-cpu: - params: [A, B] dtype: bfloat16 dims: - N: 128 + N: 256 bench-cpu: - params: [A, B] diff --git a/problems/specs/KernelBench/level1/2_Standard_matrix_multiplication_.yaml b/problems/specs/KernelBench/level1/2_Standard_matrix_multiplication_.yaml index f4c1f66..3e2b28a 100644 --- a/problems/specs/KernelBench/level1/2_Standard_matrix_multiplication_.yaml +++ b/problems/specs/KernelBench/level1/2_Standard_matrix_multiplication_.yaml @@ -18,9 +18,9 @@ simple-cpu: - params: [A, B] dtype: bfloat16 dims: - M: 64 - N: 128 - K: 256 + M: 256 + N: 1024 + K: 512 bench-cpu: - params: [A, B] diff --git a/problems/specs/KernelBench/level1/47_Sum_reduction_over_a_dimension.yaml b/problems/specs/KernelBench/level1/47_Sum_reduction_over_a_dimension.yaml index 34c285c..4e1aa59 100644 --- a/problems/specs/KernelBench/level1/47_Sum_reduction_over_a_dimension.yaml +++ b/problems/specs/KernelBench/level1/47_Sum_reduction_over_a_dimension.yaml @@ -20,6 +20,6 @@ simple-cpu: dtype: float32 dims: BATCH_SIZE: 2 - DIM1: 64 - DIM2: 63 + DIM1: 128 + DIM2: 127 REDUCE_DIM: 1 diff --git a/problems/specs/KernelBench/level1/48_Mean_reduction_over_a_dimension.yaml b/problems/specs/KernelBench/level1/48_Mean_reduction_over_a_dimension.yaml index 34c285c..4e1aa59 100644 --- a/problems/specs/KernelBench/level1/48_Mean_reduction_over_a_dimension.yaml +++ b/problems/specs/KernelBench/level1/48_Mean_reduction_over_a_dimension.yaml @@ -20,6 +20,6 @@ simple-cpu: dtype: float32 dims: BATCH_SIZE: 2 - DIM1: 64 - DIM2: 63 + DIM1: 128 + DIM2: 127 REDUCE_DIM: 1 diff --git a/problems/specs/KernelBench/level1/4_Matrix_vector_multiplication_.yaml b/problems/specs/KernelBench/level1/4_Matrix_vector_multiplication_.yaml index d3afc05..36bce41 100644 --- a/problems/specs/KernelBench/level1/4_Matrix_vector_multiplication_.yaml +++ b/problems/specs/KernelBench/level1/4_Matrix_vector_multiplication_.yaml @@ -19,9 +19,9 @@ simple-cpu: - params: [A, B] dtype: bfloat16 dims: - M: 64 + M: 128 N: 1 - 
K: 256 + K: 512 flop: "2*M*N*K" bench-gpu: diff --git a/problems/specs/KernelBench/level1/51_Argmax_over_a_dimension.yaml b/problems/specs/KernelBench/level1/51_Argmax_over_a_dimension.yaml index 8b78339..340dc21 100644 --- a/problems/specs/KernelBench/level1/51_Argmax_over_a_dimension.yaml +++ b/problems/specs/KernelBench/level1/51_Argmax_over_a_dimension.yaml @@ -20,6 +20,6 @@ simple-cpu: dtype: float32 dims: BATCH_SIZE: 2 - DIM1: 64 - DIM2: 63 + DIM1: 256 + DIM2: 255 ARGMAX_DIM: 1 diff --git a/problems/specs/KernelBench/level1/52_Argmin_over_a_dimension.yaml b/problems/specs/KernelBench/level1/52_Argmin_over_a_dimension.yaml index e20221d..f11d837 100644 --- a/problems/specs/KernelBench/level1/52_Argmin_over_a_dimension.yaml +++ b/problems/specs/KernelBench/level1/52_Argmin_over_a_dimension.yaml @@ -20,6 +20,6 @@ simple-cpu: dtype: float32 dims: BATCH_SIZE: 2 - DIM1: 64 - DIM2: 63 + DIM1: 256 + DIM2: 255 ARGMIN_DIM: 1 diff --git a/problems/specs/KernelBench/level1/53_Min_reduction_over_a_dimension.yaml b/problems/specs/KernelBench/level1/53_Min_reduction_over_a_dimension.yaml index 34c285c..8075c38 100644 --- a/problems/specs/KernelBench/level1/53_Min_reduction_over_a_dimension.yaml +++ b/problems/specs/KernelBench/level1/53_Min_reduction_over_a_dimension.yaml @@ -20,6 +20,6 @@ simple-cpu: dtype: float32 dims: BATCH_SIZE: 2 - DIM1: 64 - DIM2: 63 + DIM1: 256 + DIM2: 255 REDUCE_DIM: 1 diff --git a/problems/specs/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.yaml b/problems/specs/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.yaml index 4503c24..a8e7ec9 100644 --- a/problems/specs/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.yaml +++ b/problems/specs/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.yaml @@ -26,7 +26,7 @@ simple-cpu: dims: BATCH_SIZE: 2 IN_CHANNELS: 3 - OUT_CHANNELS: 16 + OUT_CHANNELS: 64 KERNEL_SIZE: 3 DEPTH: 16 WIDTH: 16 diff --git 
a/problems/specs/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.yaml b/problems/specs/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.yaml index 4d58a3e..9e1aa1c 100644 --- a/problems/specs/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.yaml +++ b/problems/specs/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.yaml @@ -26,7 +26,7 @@ simple-cpu: dims: BATCH_SIZE: 2 IN_CHANNELS: 3 - OUT_CHANNELS: 16 + OUT_CHANNELS: 64 KERNEL_SIZE: 3 HEIGHT: 32 WIDTH: 32 diff --git a/problems/specs/KernelBench/level1/5_Matrix_scalar_multiplication.yaml b/problems/specs/KernelBench/level1/5_Matrix_scalar_multiplication.yaml index d2ecbbe..99d0101 100644 --- a/problems/specs/KernelBench/level1/5_Matrix_scalar_multiplication.yaml +++ b/problems/specs/KernelBench/level1/5_Matrix_scalar_multiplication.yaml @@ -19,8 +19,8 @@ simple-cpu: - params: [A, B] dtype: bfloat16 dims: - M: 64 - N: 32 + M: 256 + N: 128 UNIT: 1 flop: "M*N" diff --git a/problems/specs/KernelBench/level1/67_conv_standard_1D.yaml b/problems/specs/KernelBench/level1/67_conv_standard_1D.yaml index c68f735..f87c7d2 100644 --- a/problems/specs/KernelBench/level1/67_conv_standard_1D.yaml +++ b/problems/specs/KernelBench/level1/67_conv_standard_1D.yaml @@ -22,7 +22,7 @@ simple-cpu: - params: [x] dtype: float16 dims: - BATCH_SIZE: 2 + BATCH_SIZE: 8 IN_CHANNELS: 8 OUT_CHANNELS: 16 KERNEL_SIZE: 3 diff --git a/problems/specs/KernelBench/level1/6_Matmul_with_large_K_dimension_.yaml b/problems/specs/KernelBench/level1/6_Matmul_with_large_K_dimension_.yaml index 73aa753..ff5ccde 100644 --- a/problems/specs/KernelBench/level1/6_Matmul_with_large_K_dimension_.yaml +++ b/problems/specs/KernelBench/level1/6_Matmul_with_large_K_dimension_.yaml @@ -19,8 +19,8 @@ simple-cpu: - params: [A, B] dtype: bfloat16 dims: - M: 8 - N: 16 + M: 64 + N: 32 K: 512 flop: "2*M*N*K" diff --git 
a/problems/specs/KernelBench/level1/7_Matmul_with_small_K_dimension_.yaml b/problems/specs/KernelBench/level1/7_Matmul_with_small_K_dimension_.yaml index 05b84d0..855daf5 100644 --- a/problems/specs/KernelBench/level1/7_Matmul_with_small_K_dimension_.yaml +++ b/problems/specs/KernelBench/level1/7_Matmul_with_small_K_dimension_.yaml @@ -19,9 +19,9 @@ simple-cpu: - params: [A, B] dtype: bfloat16 dims: - M: 256 - N: 256 - K: 16 + M: 512 + N: 512 + K: 32 flop: "2*M*N*K" bench-gpu: diff --git a/problems/specs/KernelBench/level1/87_conv_pointwise_2D.yaml b/problems/specs/KernelBench/level1/87_conv_pointwise_2D.yaml index 6429cde..3ea00ab 100644 --- a/problems/specs/KernelBench/level1/87_conv_pointwise_2D.yaml +++ b/problems/specs/KernelBench/level1/87_conv_pointwise_2D.yaml @@ -24,5 +24,5 @@ simple-cpu: BATCH_SIZE: 2 IN_CHANNELS: 4 OUT_CHANNELS: 8 - HEIGHT: 32 - WIDTH: 32 + HEIGHT: 64 + WIDTH: 64 diff --git a/problems/specs/KernelBench/level1/8_Matmul_with_irregular_shapes_.yaml b/problems/specs/KernelBench/level1/8_Matmul_with_irregular_shapes_.yaml index 1068de4..a05e921 100644 --- a/problems/specs/KernelBench/level1/8_Matmul_with_irregular_shapes_.yaml +++ b/problems/specs/KernelBench/level1/8_Matmul_with_irregular_shapes_.yaml @@ -19,9 +19,9 @@ simple-cpu: - params: [A, B] dtype: bfloat16 dims: - M: 60 - N: 37 - K: 19 + M: 607 + N: 372 + K: 191 flop: "2*M*N*K" bench-gpu: diff --git a/problems/specs/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.yaml b/problems/specs/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.yaml index 9a25d89..c990777 100644 --- a/problems/specs/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.yaml +++ b/problems/specs/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.yaml @@ -18,8 +18,8 @@ simple-cpu: - params: [A, B] dtype: bfloat16 dims: - M: 256 - N: 16 + M: 1024 + N: 32 flop: "2*M*M*N" bench-gpu: From 45f21cf2758ae9bed857f18323bb8b3752390ac3 Mon Sep 17 00:00:00 2001 From: Marcin Spoczynski Date: Sat, 30 May 
2026 22:55:02 -0700 Subject: [PATCH 3/3] Fix temporary mlir install --- pyproject.toml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ba285e0..6294db8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "AI-bench" dynamic = ["version"] -requires-python = ">=3.10" +requires-python = ">=3.10,<3.13" description = "AI kernel benchmarking harness" readme = "README.md" license = {text = "MIT"} @@ -61,7 +61,11 @@ cuda = [ "helion==1.0.0", ] mlir = [ - "lighthouse[ingress_torch_mlir]", + # torch-mlir publishes Linux-only wheels for cp310-cp312, so restrict the + # extra to where it can actually resolve/install. This keeps universal `uv + # lock` resolution from failing on the python>=3.13 / non-Linux splits. + "lighthouse; sys_platform == 'linux' and python_version < '3.13'", + "torch-mlir @ https://github.com/llvm/torch-mlir-release/releases/download/dev-wheels/torch_mlir-20260331.768-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl ; sys_platform == 'linux' and platform_machine == 'x86_64' and python_version == '3.12'", "ml-dtypes", ] # If triton-cpu build fails with a stale LLVM path, clear caches: