From 72c75bf1e0bfdda093736518968ac4402cb5eae2 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk@intel.com>
Date: Tue, 7 Apr 2026 17:15:24 +0200
Subject: [PATCH 1/7] L1 MLIR CPU kernels

---
 .../cpu/KernelBench/level1/100_HingeLoss.py   | 22 +++++++
 .../10_3D_tensor_matrix_multiplication.py     | 29 ++++++++
 .../11_4D_tensor_matrix_multiplication.py     | 37 +++++++++++
 .../13_Matmul_for_symmetric_matrices.py       | 29 ++++++++
 ...14_Matmul_for_upper_triangular_matrices.py | 29 ++++++++
 ...15_Matmul_for_lower_triangular_matrices.py | 29 ++++++++
 .../level1/16_Matmul_with_transposed_A.py     | 29 ++++++++
 .../level1/17_Matmul_with_transposed_B.py     | 29 ++++++++
 .../level1/18_Matmul_with_transposed_both.py  | 29 ++++++++
 .../mlir/cpu/KernelBench/level1/19_ReLU.py    | 28 ++++++++
 .../cpu/KernelBench/level1/20_LeakyReLU.py    | 35 ++++++++++
 .../mlir/cpu/KernelBench/level1/21_Sigmoid.py | 28 ++++++++
 .../mlir/cpu/KernelBench/level1/22_Tanh.py    | 28 ++++++++
 .../mlir/cpu/KernelBench/level1/23_Softmax.py | 28 ++++++++
 .../cpu/KernelBench/level1/24_LogSoftmax.py   | 29 ++++++++
 .../mlir/cpu/KernelBench/level1/25_Swish.py   | 28 ++++++++
 .../mlir/cpu/KernelBench/level1/26_GELU_.py   | 28 ++++++++
 .../mlir/cpu/KernelBench/level1/27_SELU_.py   | 28 ++++++++
 .../cpu/KernelBench/level1/28_HardSigmoid.py  | 28 ++++++++
 .../cpu/KernelBench/level1/29_Softplus.py     | 28 ++++++++
 .../2_Standard_matrix_multiplication_.py      | 29 ++++++++
 .../cpu/KernelBench/level1/30_Softsign.py     | 28 ++++++++
 .../mlir/cpu/KernelBench/level1/31_ELU.py     | 36 ++++++++++
 .../cpu/KernelBench/level1/32_HardTanh.py     | 29 ++++++++
 .../cpu/KernelBench/level1/33_BatchNorm.py    | 35 ++++++++++
 .../cpu/KernelBench/level1/35_GroupNorm_.py   | 36 ++++++++++
 .../cpu/KernelBench/level1/36_RMSNorm_.py     | 41 ++++++++++++
 .../KernelBench/level1/37_FrobeniusNorm_.py   | 32 +++++++++
 .../mlir/cpu/KernelBench/level1/38_L1Norm_.py | 31 +++++++++
 .../mlir/cpu/KernelBench/level1/39_L2Norm_.py | 34 ++++++++++
 .../level1/3_Batched_matrix_multiplication.py | 29 ++++++++
 .../cpu/KernelBench/level1/40_LayerNorm.py    | 35 ++++++++++
 .../KernelBench/level1/41_Max_Pooling_1D.py   | 52 +++++++++++++++
 .../KernelBench/level1/42_Max_Pooling_2D.py   | 40 +++++++++++
 .../level1/44_Average_Pooling_1D.py           | 39 +++++++++++
 .../level1/45_Average_Pooling_2D.py           | 39 +++++++++++
 .../level1/46_Average_Pooling_3D.py           | 39 +++++++++++
 .../47_Sum_reduction_over_a_dimension.py      | 35 ++++++++++
 .../48_Mean_reduction_over_a_dimension.py     | 35 ++++++++++
 .../49_Max_reduction_over_a_dimension.py      | 35 ++++++++++
 .../level1/4_Matrix_vector_multiplication_.py | 29 ++++++++
 ...tandard_2D__square_input__square_kernel.py | 19 ++++++
 .../level1/51_Argmax_over_a_dimension.py      | 35 ++++++++++
 .../level1/52_Argmin_over_a_dimension.py      | 35 ++++++++++
 .../53_Min_reduction_over_a_dimension.py      | 35 ++++++++++
 ...tandard_3D__square_input__square_kernel.py | 58 ++++++++++++++++
 ...ard_2D__asymmetric_input__square_kernel.py | 58 ++++++++++++++++
 ...2D__asymmetric_input__asymmetric_kernel.py | 58 ++++++++++++++++
 ...nsposed_2D__square_input__square_kernel.py | 58 ++++++++++++++++
 ...3D__asymmetric_input__asymmetric_kernel.py | 58 ++++++++++++++++
 ...ard_3D__asymmetric_input__square_kernel.py | 58 ++++++++++++++++
 .../level1/5_Matrix_scalar_multiplication.py  | 29 ++++++++
 ...ard_3D__square_input__asymmetric_kernel.py | 58 ++++++++++++++++
 ...nsposed_3D__square_input__square_kernel.py | 58 ++++++++++++++++
 ...ard_2D__square_input__asymmetric_kernel.py | 58 ++++++++++++++++
 ...tandard_2D__square_input__square_kernel.py | 58 ++++++++++++++++
 .../level1/64_conv_transposed_1D.py           | 58 ++++++++++++++++
 ...sed_2D__square_input__asymmetric_kernel.py | 58 ++++++++++++++++
 ...3D__asymmetric_input__asymmetric_kernel.py | 58 ++++++++++++++++
 .../KernelBench/level1/67_conv_standard_1D.py | 58 ++++++++++++++++
 ...sed_3D__square_input__asymmetric_kernel.py | 59 +++++++++++++++++
 ...2D__asymmetric_input__asymmetric_kernel.py | 61 +++++++++++++++++
 .../6_Matmul_with_large_K_dimension_.py       | 29 ++++++++
 ...sed_3D__asymmetric_input__square_kernel.py | 62 +++++++++++++++++
 ...sed_2D__asymmetric_input__square_kernel.py | 58 ++++++++++++++++
 ..._square_kernel__strided_padded__grouped.py | 57 ++++++++++++++++
 .../level1/74_conv_transposed_1D_dilated.py   | 55 ++++++++++++++++
 ...strided__grouped____padded____dilated__.py | 59 +++++++++++++++++
 .../76_conv_standard_1D_dilated_strided__.py  | 52 +++++++++++++++
 ...kernel___padded____dilated____strided__.py | 56 ++++++++++++++++
 ...tric_input_asymmetric_kernel___padded__.py | 52 +++++++++++++++
 ...kernel___padded____strided____dilated__.py | 56 ++++++++++++++++
 .../7_Matmul_with_small_K_dimension_.py       | 29 ++++++++
 ...asymmetric_kernel___dilated____padded__.py | 55 ++++++++++++++++
 ...kernel___dilated____padded____strided__.py | 55 ++++++++++++++++
 ...depthwise_2D_square_input_square_kernel.py | 51 ++++++++++++++
 ...hwise_2D_square_input_asymmetric_kernel.py | 54 +++++++++++++++
 ...hwise_2D_asymmetric_input_square_kernel.py | 53 +++++++++++++++
 ...e_2D_asymmetric_input_asymmetric_kernel.py | 66 +++++++++++++++++++
 .../level1/86_conv_depthwise_separable_2D.py  | 59 +++++++++++++++++
 .../level1/87_conv_pointwise_2D.py            | 36 ++++++++++
 .../KernelBench/level1/88_MinGPTNewGelu.py    | 32 +++++++++
 .../level1/8_Matmul_with_irregular_shapes_.py | 29 ++++++++
 .../mlir/cpu/KernelBench/level1/94_MSELoss.py | 22 +++++++
 .../level1/97_ScaledDotProductAttention.py    | 18 +++++
 .../cpu/KernelBench/level1/98_KLDivLoss.py    | 24 +++++++
 .../9_Tall_skinny_matrix_multiplication_.py   | 29 ++++++++
 87 files changed, 3549 insertions(+)
 create mode 100644 backends/mlir/cpu/KernelBench/level1/100_HingeLoss.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/19_ReLU.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/20_LeakyReLU.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/21_Sigmoid.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/22_Tanh.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/23_Softmax.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/24_LogSoftmax.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/25_Swish.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/26_GELU_.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/27_SELU_.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/28_HardSigmoid.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/29_Softplus.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/30_Softsign.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/31_ELU.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/32_HardTanh.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/33_BatchNorm.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/35_GroupNorm_.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/36_RMSNorm_.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/37_FrobeniusNorm_.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/38_L1Norm_.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/39_L2Norm_.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/40_LayerNorm.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/41_Max_Pooling_1D.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/42_Max_Pooling_2D.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/44_Average_Pooling_1D.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/45_Average_Pooling_2D.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/46_Average_Pooling_3D.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/48_Mean_reduction_over_a_dimension.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/49_Max_reduction_over_a_dimension.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/4_Matrix_vector_multiplication_.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/53_Min_reduction_over_a_dimension.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/64_conv_transposed_1D.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/67_conv_standard_1D.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/6_Matmul_with_large_K_dimension_.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/7_Matmul_with_small_K_dimension_.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/86_conv_depthwise_separable_2D.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/87_conv_pointwise_2D.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/88_MinGPTNewGelu.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/94_MSELoss.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/97_ScaledDotProductAttention.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/98_KLDivLoss.py
 create mode 100644 backends/mlir/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py

diff --git a/backends/mlir/cpu/KernelBench/level1/100_HingeLoss.py b/backends/mlir/cpu/KernelBench/level1/100_HingeLoss.py
new file mode 100644
index 0000000..a9d8d70
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/100_HingeLoss.py
@@ -0,0 +1,22 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    A model that computes Hinge Loss for binary classification tasks.
+
+    Parameters:
+        None
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, predictions, targets):
+        return torch.mean(torch.clamp(1 - predictions * targets, min=0))
diff --git a/backends/mlir/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py b/backends/mlir/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py
new file mode 100644
index 0000000..ee77323
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Performs 3D tensor-matrix multiplication.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A, B):
+        """
+        Performs 3D tensor-matrix multiplication.
+
+        Args:
+            A (torch.Tensor): Input 3D tensor of shape (N, M, K).
+            B (torch.Tensor): Input matrix of shape (K, L).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (N, M, L), resulting from the multiplication of A and B along the last dimension of A.
+        """
+        return torch.matmul(A, B)
diff --git a/backends/mlir/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py b/backends/mlir/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py
new file mode 100644
index 0000000..1f852eb
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py
@@ -0,0 +1,37 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Performs 4D tensor-matrix multiplication:
+        C[b, i, j, k] = sum_l A[b, i, j, l] * B[l, k]
+
+    Args:
+        A (torch.Tensor): Input 4D tensor of shape (b, i, j, l)
+        B (torch.Tensor): Input matrix of shape (l, k)
+
+    Returns:
+        torch.Tensor: Output 4D tensor of shape (b, i, j, k)
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A, B):
+        """
+        Performs the 4D tensor-matrix multiplication.
+
+        Args:
+            A (torch.Tensor): Input 4D tensor of shape (b, i, j, l)
+            B (torch.Tensor): Input matrix of shape (l, k)
+
+        Returns:
+            torch.Tensor: Output 4D tensor of shape (b, i, j, k)
+        """
+        return torch.einsum("bijl,lk->bijk", A, B)
diff --git a/backends/mlir/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py b/backends/mlir/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py
new file mode 100644
index 0000000..020b4d0
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/13_Matmul_for_symmetric_matrices.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a single matrix multiplication (C = A * B) with A and B being symmetric matrices.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A, B):
+        """
+        Performs matrix multiplication of two symmetric matrices.
+
+        Args:
+            A (torch.Tensor): Input matrix A, shape (N, N), symmetric.
+            B (torch.Tensor): Input matrix B, shape (N, N), symmetric.
+
+        Returns:
+            torch.Tensor: Output matrix C, shape (N, N).
+        """
+        return torch.matmul(A, B)
diff --git a/backends/mlir/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py b/backends/mlir/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py
new file mode 100644
index 0000000..74177a8
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/14_Matmul_for_upper_triangular_matrices.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs matrix multiplication (C = A * B) for upper triangular matrices.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A, B):
+        """
+        Performs matrix multiplication for upper triangular matrices.
+
+        Args:
+            A (torch.Tensor): Upper triangular matrix of shape (N, N).
+            B (torch.Tensor): Upper triangular matrix of shape (N, N).
+
+        Returns:
+            torch.Tensor: The product of A and B, also an upper triangular matrix of shape (N, N).
+        """
+        return torch.triu(torch.matmul(A, B))
diff --git a/backends/mlir/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py b/backends/mlir/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py
new file mode 100644
index 0000000..9ff9c99
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/15_Matmul_for_lower_triangular_matrices.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a matrix multiplication (C = A * B) where A and B are lower triangular matrices.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A, B):
+        """
+        Performs matrix multiplication of lower triangular matrices A and B.
+
+        Args:
+            A (torch.Tensor): Lower triangular matrix of shape (N, N).
+            B (torch.Tensor): Lower triangular matrix of shape (N, N).
+
+        Returns:
+            torch.Tensor: The lower triangular part of the product of A and B, of shape (N, N).
+        """
+        return torch.tril(torch.matmul(A, B))
diff --git a/backends/mlir/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py b/backends/mlir/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py
new file mode 100644
index 0000000..4bf0703
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/16_Matmul_with_transposed_A.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a single matrix multiplication with A transposed (C = A.T * B)
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        """
+        Performs matrix multiplication of the transpose of A with B.
+
+        Args:
+            A: Input tensor of shape (K, M); it is transposed before the multiplication.
+            B: Input tensor of shape (K, N).
+
+        Returns:
+            Output tensor of shape (M, N).
+        """
+        return torch.matmul(A.T, B)
diff --git a/backends/mlir/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py b/backends/mlir/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py
new file mode 100644
index 0000000..89f5581
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/17_Matmul_with_transposed_B.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a single matrix multiplication with B transposed (C = A * B.T)
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        """
+        Performs matrix multiplication of A with the transpose of B.
+
+        Args:
+            A: Input tensor of shape (M, K).
+            B: Input tensor of shape (N, K); it is transposed before the multiplication.
+
+        Returns:
+            Output tensor of shape (M, N).
+        """
+        return torch.matmul(A, B.T)
diff --git a/backends/mlir/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py b/backends/mlir/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py
new file mode 100644
index 0000000..cf600b1
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/18_Matmul_with_transposed_both.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a single matrix multiplication with both inputs transposed (C = A.T * B.T)
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        """
+        Performs matrix multiplication of the transpose of A with the transpose of B.
+
+        Args:
+            A: Input tensor of shape (K, M); it is transposed before the multiplication.
+            B: Input tensor of shape (N, K); it is transposed before the multiplication.
+
+        Returns:
+            Output tensor of shape (M, N).
+        """
+        return torch.matmul(A.T, B.T)
diff --git a/backends/mlir/cpu/KernelBench/level1/19_ReLU.py b/backends/mlir/cpu/KernelBench/level1/19_ReLU.py
new file mode 100644
index 0000000..543de5e
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/19_ReLU.py
@@ -0,0 +1,28 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a ReLU activation.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies ReLU activation to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of any shape.
+
+        Returns:
+            torch.Tensor: Output tensor with ReLU applied, same shape as input.
+        """
+        return torch.relu(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/20_LeakyReLU.py b/backends/mlir/cpu/KernelBench/level1/20_LeakyReLU.py
new file mode 100644
index 0000000..401a201
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/20_LeakyReLU.py
@@ -0,0 +1,35 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a LeakyReLU activation.
+    """
+
+    def __init__(self, negative_slope: float = 0.01):
+        """
+        Initializes the LeakyReLU module.
+
+        Args:
+            negative_slope (float, optional): The negative slope of the activation function. Defaults to 0.01.
+        """
+        super(Model, self).__init__()
+        self.negative_slope = negative_slope
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies LeakyReLU activation to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of any shape.
+
+        Returns:
+            torch.Tensor: Output tensor with LeakyReLU applied, same shape as input.
+        """
+        return torch.nn.functional.leaky_relu(x, negative_slope=self.negative_slope)
diff --git a/backends/mlir/cpu/KernelBench/level1/21_Sigmoid.py b/backends/mlir/cpu/KernelBench/level1/21_Sigmoid.py
new file mode 100644
index 0000000..ae9268d
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/21_Sigmoid.py
@@ -0,0 +1,28 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a Sigmoid activation.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies Sigmoid activation to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of any shape.
+
+        Returns:
+            torch.Tensor: Output tensor with Sigmoid applied, same shape as input.
+        """
+        return torch.sigmoid(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/22_Tanh.py b/backends/mlir/cpu/KernelBench/level1/22_Tanh.py
new file mode 100644
index 0000000..b1a73aa
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/22_Tanh.py
@@ -0,0 +1,28 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a Tanh activation.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies Tanh activation to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of any shape.
+
+        Returns:
+            torch.Tensor: Output tensor with Tanh applied, same shape as input.
+        """
+        return torch.tanh(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/23_Softmax.py b/backends/mlir/cpu/KernelBench/level1/23_Softmax.py
new file mode 100644
index 0000000..19c503f
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/23_Softmax.py
@@ -0,0 +1,28 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a Softmax activation.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies Softmax activation to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, num_features).
+
+        Returns:
+            torch.Tensor: Output tensor with Softmax applied, same shape as input.
+        """
+        return torch.softmax(x, dim=1)
diff --git a/backends/mlir/cpu/KernelBench/level1/24_LogSoftmax.py b/backends/mlir/cpu/KernelBench/level1/24_LogSoftmax.py
new file mode 100644
index 0000000..d709b98
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/24_LogSoftmax.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a LogSoftmax activation.
+    """
+
+    def __init__(self, dim: int = 1):
+        super(Model, self).__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies LogSoftmax activation to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, num_features); LogSoftmax is applied along dimension `self.dim`.
+
+        Returns:
+            torch.Tensor: Output tensor with LogSoftmax applied, same shape as input.
+        """
+        return torch.log_softmax(x, dim=self.dim)
diff --git a/backends/mlir/cpu/KernelBench/level1/25_Swish.py b/backends/mlir/cpu/KernelBench/level1/25_Swish.py
new file mode 100644
index 0000000..150c740
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/25_Swish.py
@@ -0,0 +1,28 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a Swish activation.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies Swish activation to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of any shape.
+
+        Returns:
+            torch.Tensor: Output tensor with Swish applied, same shape as input.
+        """
+        return x * torch.sigmoid(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/26_GELU_.py b/backends/mlir/cpu/KernelBench/level1/26_GELU_.py
new file mode 100644
index 0000000..e05a57e
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/26_GELU_.py
@@ -0,0 +1,28 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a GELU activation.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies GELU activation to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of any shape.
+
+        Returns:
+            torch.Tensor: Output tensor with GELU applied, same shape as input.
+        """
+        return torch.nn.functional.gelu(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/27_SELU_.py b/backends/mlir/cpu/KernelBench/level1/27_SELU_.py
new file mode 100644
index 0000000..9311d09
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/27_SELU_.py
@@ -0,0 +1,28 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    SELU activation; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies torch.selu to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of any shape.
+
+        Returns:
+            torch.Tensor: Output tensor with SELU applied, same shape as input.
+        """
+        return torch.selu(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/28_HardSigmoid.py b/backends/mlir/cpu/KernelBench/level1/28_HardSigmoid.py
new file mode 100644
index 0000000..38ef66f
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/28_HardSigmoid.py
@@ -0,0 +1,28 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    HardSigmoid activation; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies torch.nn.functional.hardsigmoid to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of any shape.
+
+        Returns:
+            torch.Tensor: Output tensor with HardSigmoid applied, same shape as input.
+        """
+        return torch.nn.functional.hardsigmoid(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/29_Softplus.py b/backends/mlir/cpu/KernelBench/level1/29_Softplus.py
new file mode 100644
index 0000000..e28ec9c
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/29_Softplus.py
@@ -0,0 +1,28 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Softplus activation; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies torch.nn.functional.softplus to the input tensor with its default arguments.
+
+        Args:
+            x (torch.Tensor): Input tensor of any shape.
+
+        Returns:
+            torch.Tensor: Output tensor with Softplus applied, same shape as input.
+        """
+        return torch.nn.functional.softplus(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py b/backends/mlir/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py
new file mode 100644
index 0000000..d934710
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/2_Standard_matrix_multiplication_.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Single matrix multiplication (C = A @ B); the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        """
+        Computes the matrix product torch.matmul(A, B).
+
+        Args:
+            A: Input tensor of shape (M, K).
+            B: Input tensor of shape (K, N).
+
+        Returns:
+            Output tensor of shape (M, N).
+        """
+        return torch.matmul(A, B)
diff --git a/backends/mlir/cpu/KernelBench/level1/30_Softsign.py b/backends/mlir/cpu/KernelBench/level1/30_Softsign.py
new file mode 100644
index 0000000..40a5db5
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/30_Softsign.py
@@ -0,0 +1,28 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Softsign activation, x / (1 + |x|); the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Computes Softsign, x / (1 + |x|), elementwise on the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of any shape.
+
+        Returns:
+            torch.Tensor: x / (1 + torch.abs(x)), same shape as input.
+        """
+        return x / (1 + torch.abs(x))
diff --git a/backends/mlir/cpu/KernelBench/level1/31_ELU.py b/backends/mlir/cpu/KernelBench/level1/31_ELU.py
new file mode 100644
index 0000000..03ec22f
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/31_ELU.py
@@ -0,0 +1,36 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    ELU activation with a configurable alpha; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self, alpha: float = 1.0):
+        """
+        Stores the alpha value that forward() passes to F.elu.
+
+        Args:
+            alpha (float, optional): The alpha parameter forwarded to F.elu. Defaults to 1.0.
+        """
+        super(Model, self).__init__()
+        self.alpha = alpha
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies F.elu to the input tensor using the stored alpha.
+
+        Args:
+            x (torch.Tensor): Input tensor of any shape.
+
+        Returns:
+            torch.Tensor: Output tensor with ELU applied, same shape as input.
+        """
+        return F.elu(x, alpha=self.alpha)
diff --git a/backends/mlir/cpu/KernelBench/level1/32_HardTanh.py b/backends/mlir/cpu/KernelBench/level1/32_HardTanh.py
new file mode 100644
index 0000000..43b2220
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/32_HardTanh.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    HardTanh activation with fixed bounds [-1.0, 1.0]; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies F.hardtanh with min_val=-1.0 and max_val=1.0 to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of any shape.
+
+        Returns:
+            torch.Tensor: Output tensor with HardTanh applied, same shape as input.
+        """
+        return F.hardtanh(x, min_val=-1.0, max_val=1.0)
diff --git a/backends/mlir/cpu/KernelBench/level1/33_BatchNorm.py b/backends/mlir/cpu/KernelBench/level1/33_BatchNorm.py
new file mode 100644
index 0000000..f912872
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/33_BatchNorm.py
@@ -0,0 +1,35 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Batch Normalization via nn.BatchNorm2d; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self, num_features: int):
+        """
+        Creates the nn.BatchNorm2d layer with its default settings.
+
+        Args:
+            num_features (int): Number of channels, passed to nn.BatchNorm2d as num_features.
+        """
+        super(Model, self).__init__()
+        self.bn = nn.BatchNorm2d(num_features=num_features)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the nn.BatchNorm2d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, num_features, height, width); nn.BatchNorm2d takes 4D input.
+
+        Returns:
+            torch.Tensor: Output tensor with Batch Normalization applied, same shape as input.
+        """
+        return self.bn(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/35_GroupNorm_.py b/backends/mlir/cpu/KernelBench/level1/35_GroupNorm_.py
new file mode 100644
index 0000000..f4e694d
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/35_GroupNorm_.py
@@ -0,0 +1,36 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Group Normalization via nn.GroupNorm; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self, num_features: int, num_groups: int):
+        """
+        Creates the nn.GroupNorm layer with its default settings.
+
+        Args:
+            num_features (int): Number of channels, passed to nn.GroupNorm as num_channels.
+            num_groups (int): Number of groups to divide the channels into.
+        """
+        super(Model, self).__init__()
+        self.gn = nn.GroupNorm(num_groups=num_groups, num_channels=num_features)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the nn.GroupNorm layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, num_features, *).
+
+        Returns:
+            torch.Tensor: Output tensor with Group Normalization applied, same shape as input.
+        """
+        return self.gn(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/36_RMSNorm_.py b/backends/mlir/cpu/KernelBench/level1/36_RMSNorm_.py
new file mode 100644
index 0000000..be95ac6
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/36_RMSNorm_.py
@@ -0,0 +1,41 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    RMS Normalization over dim 1, without learnable scale; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self, num_features: int, eps: float = 1e-5):
+        """
+        Stores the configuration of the RMSNorm layer.
+
+        Args:
+            num_features (int): Number of features in the input tensor. Stored on the module; forward() does not read it.
+            eps (float, optional): Small value added to the mean of squares before the square root, keeping the divisor non-zero. Defaults to 1e-5.
+        """
+        super(Model, self).__init__()
+        self.num_features = num_features
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Divides x by sqrt(mean(x**2, dim=1) + eps).
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, num_features, *).
+
+        Returns:
+            torch.Tensor: Output tensor with RMS Normalization applied, same shape as input.
+        """
+        # RMS along dim 1; eps goes inside the sqrt and keepdim=True lets rms broadcast against x
+        rms = torch.sqrt(torch.mean(x**2, dim=1, keepdim=True) + self.eps)
+
+        # Normalize the input by dividing by the RMS
+        return x / rms
diff --git a/backends/mlir/cpu/KernelBench/level1/37_FrobeniusNorm_.py b/backends/mlir/cpu/KernelBench/level1/37_FrobeniusNorm_.py
new file mode 100644
index 0000000..61b72d2
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/37_FrobeniusNorm_.py
@@ -0,0 +1,32 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Frobenius norm normalization; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self):
+        """
+        Initializes the module; it holds no parameters or configuration.
+        """
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Divides x by its Frobenius norm; torch.norm is called without dim, so the norm covers the whole tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of arbitrary shape.
+
+        Returns:
+            torch.Tensor: Output tensor with Frobenius norm normalization applied, same shape as input.
+        """
+        norm = torch.norm(x, p="fro")
+        return x / norm
diff --git a/backends/mlir/cpu/KernelBench/level1/38_L1Norm_.py b/backends/mlir/cpu/KernelBench/level1/38_L1Norm_.py
new file mode 100644
index 0000000..8cb87ce
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/38_L1Norm_.py
@@ -0,0 +1,31 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    L1-style normalization by the mean absolute value along dim 1; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self):
+        """
+        Initializes the module; it holds no parameters or configuration.
+        """
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Divides x by the mean of |x| along dim 1 (the mean, not the sum, of absolute values).
+
+        Args:
+            x (torch.Tensor): Input tensor with at least 2 dimensions; dim 1 is the one reduced.
+
+        Returns:
+            torch.Tensor: x / mean(|x|, dim=1, keepdim=True), same shape as input.
+        """
+        return x / torch.mean(torch.abs(x), dim=1, keepdim=True)
diff --git a/backends/mlir/cpu/KernelBench/level1/39_L2Norm_.py b/backends/mlir/cpu/KernelBench/level1/39_L2Norm_.py
new file mode 100644
index 0000000..cdf92e8
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/39_L2Norm_.py
@@ -0,0 +1,34 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    L2 normalization along dim 1; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self):
+        """
+        Initializes the L2Norm layer.
+
+        Takes no arguments: the normalization dimension is not configurable
+        and is fixed to dim=1 in forward().
+        """
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Divides x by its L2 norm computed along dim 1 (keepdim=True, so the norm broadcasts against x).
+
+        Args:
+            x (torch.Tensor): Input tensor with at least 2 dimensions; dim 1 is the one reduced.
+
+        Returns:
+            torch.Tensor: Output tensor with L2 normalization applied, same shape as input.
+        """
+        return x / torch.norm(x, p=2, dim=1, keepdim=True)
diff --git a/backends/mlir/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py b/backends/mlir/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py
new file mode 100644
index 0000000..d46f4e4
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/3_Batched_matrix_multiplication.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Batched matrix multiplication (C[i] = A[i] @ B[i]) via torch.bmm; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        """
+        Computes torch.bmm(A, B), one matrix product per batch entry.
+
+        Args:
+            A: Input tensor of shape (batch_size, m, k).
+            B: Input tensor of shape (batch_size, k, n).
+
+        Returns:
+            C: Output tensor of shape (batch_size, m, n).
+        """
+        return torch.bmm(A, B)
diff --git a/backends/mlir/cpu/KernelBench/level1/40_LayerNorm.py b/backends/mlir/cpu/KernelBench/level1/40_LayerNorm.py
new file mode 100644
index 0000000..28ec282
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/40_LayerNorm.py
@@ -0,0 +1,35 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Layer Normalization via nn.LayerNorm; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self, normalized_shape: tuple):
+        """
+        Creates the nn.LayerNorm layer with its default settings.
+
+        Args:
+            normalized_shape (tuple): Shape of the trailing input dimensions to normalize over, passed to nn.LayerNorm.
+        """
+        super(Model, self).__init__()
+        self.ln = nn.LayerNorm(normalized_shape=normalized_shape)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the nn.LayerNorm layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (*, normalized_shape).
+
+        Returns:
+            torch.Tensor: Output tensor with Layer Normalization applied, same shape as input.
+        """
+        return self.ln(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/41_Max_Pooling_1D.py b/backends/mlir/cpu/KernelBench/level1/41_Max_Pooling_1D.py
new file mode 100644
index 0000000..4875a12
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/41_Max_Pooling_1D.py
@@ -0,0 +1,52 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Max Pooling 1D via nn.MaxPool1d; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(
+        self,
+        kernel_size: int,
+        stride: int = None,
+        padding: int = 0,
+        dilation: int = 1,
+        return_indices: bool = False,
+    ):
+        """
+        Creates the nn.MaxPool1d layer; every argument is forwarded to it unchanged.
+
+        Args:
+            kernel_size (int): Size of the window to take a max over.
+            stride (int, optional): Stride of the window. Defaults to None (same as kernel_size).
+            padding (int, optional): Implicit negative-infinity padding added on both sides (per nn.MaxPool1d). Defaults to 0.
+            dilation (int, optional): Spacing between kernel elements. Defaults to 1.
+            return_indices (bool, optional): Whether to return the indices of the maximum values. Defaults to False.
+        """
+        super(Model, self).__init__()
+        self.maxpool = nn.MaxPool1d(
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            return_indices=return_indices,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the nn.MaxPool1d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, num_features, sequence_length).
+
+        Returns:
+            torch.Tensor: Pooled tensor of shape (batch_size, num_features, output_sequence_length). NOTE: with return_indices=True, nn.MaxPool1d returns an (output, indices) tuple instead.
+        """
+        return self.maxpool(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/42_Max_Pooling_2D.py b/backends/mlir/cpu/KernelBench/level1/42_Max_Pooling_2D.py
new file mode 100644
index 0000000..48e1c7a
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/42_Max_Pooling_2D.py
@@ -0,0 +1,40 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Max Pooling 2D via nn.MaxPool2d; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self, kernel_size: int, stride: int, padding: int, dilation: int):
+        """
+        Creates the nn.MaxPool2d layer; every argument is forwarded to it unchanged.
+
+        Args:
+            kernel_size (int): Size of the pooling window.
+            stride (int): Stride of the pooling window.
+            padding (int): Implicit padding added on both sides of each spatial dimension before pooling.
+            dilation (int): Spacing between kernel elements.
+        """
+        super(Model, self).__init__()
+        self.maxpool = nn.MaxPool2d(
+            kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the nn.MaxPool2d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor after Max Pooling 2D, shape (batch_size, channels, pooled_height, pooled_width).
+        """
+        return self.maxpool(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/44_Average_Pooling_1D.py b/backends/mlir/cpu/KernelBench/level1/44_Average_Pooling_1D.py
new file mode 100644
index 0000000..b6d9415
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/44_Average_Pooling_1D.py
@@ -0,0 +1,39 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    1D Average Pooling via nn.AvgPool1d; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self, kernel_size: int, stride: int = 1, padding: int = 0):
+        """
+        Creates the nn.AvgPool1d layer; every argument is forwarded to it unchanged.
+
+        Args:
+            kernel_size (int): Size of the pooling window.
+            stride (int, optional): Stride of the pooling operation. Defaults to 1.
+            padding (int, optional): Padding applied to the input tensor. Defaults to 0.
+        """
+        super(Model, self).__init__()
+        self.avg_pool = nn.AvgPool1d(
+            kernel_size=kernel_size, stride=stride, padding=padding
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the nn.AvgPool1d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, input_length).
+
+        Returns:
+            torch.Tensor: Output tensor with 1D Average Pooling applied, shape (batch_size, in_channels, output_length).
+        """
+        return self.avg_pool(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/45_Average_Pooling_2D.py b/backends/mlir/cpu/KernelBench/level1/45_Average_Pooling_2D.py
new file mode 100644
index 0000000..e2dd15b
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/45_Average_Pooling_2D.py
@@ -0,0 +1,39 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    2D Average Pooling via nn.AvgPool2d; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self, kernel_size: int, stride: int = None, padding: int = 0):
+        """
+        Creates the nn.AvgPool2d layer; every argument is forwarded to it unchanged.
+
+        Args:
+            kernel_size (int): Size of the pooling window.
+            stride (int, optional): Stride of the pooling operation. Defaults to None (same as kernel_size).
+            padding (int, optional): Padding applied to the input tensor. Defaults to 0.
+        """
+        super(Model, self).__init__()
+        self.avg_pool = nn.AvgPool2d(
+            kernel_size=kernel_size, stride=stride, padding=padding
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the nn.AvgPool2d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor with Average Pooling applied, shape (batch_size, channels, pooled_height, pooled_width).
+        """
+        return self.avg_pool(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/46_Average_Pooling_3D.py b/backends/mlir/cpu/KernelBench/level1/46_Average_Pooling_3D.py
new file mode 100644
index 0000000..81c272b
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/46_Average_Pooling_3D.py
@@ -0,0 +1,39 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    3D Average Pooling via nn.AvgPool3d; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self, kernel_size: int, stride: int = None, padding: int = 0):
+        """
+        Creates the nn.AvgPool3d layer; every argument is forwarded to it unchanged.
+
+        Args:
+            kernel_size (int): Size of the kernel to apply pooling.
+            stride (int, optional): Stride of the pooling operation. Defaults to None, which uses the kernel size.
+            padding (int, optional): Padding to apply before pooling. Defaults to 0.
+        """
+        super(Model, self).__init__()
+        self.avg_pool = nn.AvgPool3d(
+            kernel_size=kernel_size, stride=stride, padding=padding
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the nn.AvgPool3d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, channels, depth, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor with Average Pooling applied, shape depends on kernel_size, stride and padding.
+        """
+        return self.avg_pool(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py b/backends/mlir/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py
new file mode 100644
index 0000000..fe0ecf2
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/47_Sum_reduction_over_a_dimension.py
@@ -0,0 +1,35 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Sum reduction over a configurable dimension, keeping that dimension; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self, dim: int):
+        """
+        Stores the dimension that forward() reduces over.
+
+        Args:
+            dim (int): Dimension to reduce over.
+        """
+        super(Model, self).__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Computes torch.sum over self.dim with keepdim=True.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (..., dim, ...).
+
+        Returns:
+            torch.Tensor: Output tensor after sum reduction, shape (..., 1, ...).
+        """
+        return torch.sum(x, dim=self.dim, keepdim=True)
diff --git a/backends/mlir/cpu/KernelBench/level1/48_Mean_reduction_over_a_dimension.py b/backends/mlir/cpu/KernelBench/level1/48_Mean_reduction_over_a_dimension.py
new file mode 100644
index 0000000..17ae10b
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/48_Mean_reduction_over_a_dimension.py
@@ -0,0 +1,35 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Mean reduction over a configurable dimension; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self, dim: int):
+        """
+        Stores the dimension that forward() reduces over.
+
+        Args:
+            dim (int): The dimension to reduce over.
+        """
+        super(Model, self).__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Computes torch.mean over self.dim (keepdim is left at its default, so the dimension is dropped).
+
+        Args:
+            x (torch.Tensor): Input tensor of arbitrary shape.
+
+        Returns:
+            torch.Tensor: Output tensor with reduced dimension. The shape of the output is the same as the input except for the reduced dimension which is removed.
+        """
+        return torch.mean(x, dim=self.dim)
diff --git a/backends/mlir/cpu/KernelBench/level1/49_Max_reduction_over_a_dimension.py b/backends/mlir/cpu/KernelBench/level1/49_Max_reduction_over_a_dimension.py
new file mode 100644
index 0000000..4f72447
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/49_Max_reduction_over_a_dimension.py
@@ -0,0 +1,35 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Max reduction over a configurable dimension; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self, dim: int):
+        """
+        Stores the dimension that forward() reduces over.
+
+        Args:
+            dim (int): The dimension to reduce over.
+        """
+        super(Model, self).__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Computes torch.max over self.dim and keeps only the values ([0]); the indices are discarded.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            torch.Tensor: Maximum values along self.dim, with that dimension removed.
+        """
+        return torch.max(x, dim=self.dim)[0]
diff --git a/backends/mlir/cpu/KernelBench/level1/4_Matrix_vector_multiplication_.py b/backends/mlir/cpu/KernelBench/level1/4_Matrix_vector_multiplication_.py
new file mode 100644
index 0000000..ccc66c7
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/4_Matrix_vector_multiplication_.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Matrix-vector multiplication (C = A @ B); the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        """
+        Computes torch.matmul(A, B) with B given as a single-column matrix.
+
+        Args:
+            A: Input matrix of shape (M, K).
+            B: Input vector of shape (K, 1).
+
+        Returns:
+            Output vector of shape (M, 1).
+        """
+        return torch.matmul(A, B)
diff --git a/backends/mlir/cpu/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.py b/backends/mlir/cpu/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.py
new file mode 100644
index 0000000..f2e7f4c
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/50_conv_standard_2D__square_input__square_kernel.py
@@ -0,0 +1,19 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):  # single nn.Conv2d layer, wrapped in torch.compile with the ai_bench MLIR cpu_backend
+    def __init__(self, num_classes=1000):  # NOTE: num_classes is accepted but never used
+        super(Model, self).__init__()
+        self.conv1 = nn.Conv2d(  # 3 -> 96 channels, kernel_size 11, stride 4, padding 2
+            in_channels=3, out_channels=96, kernel_size=11, stride=4, padding=2
+        )
+
+    def forward(self, x):  # x needs 3 input channels to match in_channels=3
+        x = self.conv1(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py b/backends/mlir/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py
new file mode 100644
index 0000000..ef6542f
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/51_Argmax_over_a_dimension.py
@@ -0,0 +1,35 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Argmax over a configurable dimension; the class is wrapped in torch.compile with the ai_bench MLIR cpu_backend.
+    """
+
+    def __init__(self, dim: int):
+        """
+        Stores the dimension that forward() takes the argmax over.
+
+        Args:
+            dim (int): The dimension to perform argmax over.
+        """
+        super(Model, self).__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Computes torch.argmax over self.dim.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            torch.Tensor: Indices of the maximum values along self.dim, with that dimension removed.
+        """
+        return torch.argmax(x, dim=self.dim)
diff --git a/backends/mlir/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py b/backends/mlir/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py
new file mode 100644
index 0000000..e499ede
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/52_Argmin_over_a_dimension.py
@@ -0,0 +1,35 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # dynamic=False: specialize on concrete input shapes (no dynamic-shape tracing)
+class Model(nn.Module):
+    """
+    Model that returns the indices of the minimum values along a fixed dimension.
+    """
+
+    def __init__(self, dim: int):
+        """
+        Stores the dimension that `forward` reduces over.
+
+        Args:
+            dim (int): Dimension along which to find the minimum value.
+        """
+        super(Model, self).__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Computes `torch.argmin` of the input along `self.dim`.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            torch.Tensor: Indices of the minimum values; the reduced dimension is removed (keepdim is not set).
+        """
+        return torch.argmin(x, dim=self.dim)
diff --git a/backends/mlir/cpu/KernelBench/level1/53_Min_reduction_over_a_dimension.py b/backends/mlir/cpu/KernelBench/level1/53_Min_reduction_over_a_dimension.py
new file mode 100644
index 0000000..41fe386
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/53_Min_reduction_over_a_dimension.py
@@ -0,0 +1,35 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # dynamic=False: specialize on concrete input shapes (no dynamic-shape tracing)
+class Model(nn.Module):
+    """
+    Model that returns the minimum values of a tensor along a fixed dimension.
+    """
+
+    def __init__(self, dim: int):
+        """
+        Stores the dimension that `forward` reduces over.
+
+        Args:
+            dim (int): The dimension to reduce over.
+        """
+        super(Model, self).__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies min reduction over `self.dim` and returns only the values.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            torch.Tensor: Minimum values along the dimension (reduced dimension removed); the indices are discarded.
+        """
+        return torch.min(x, dim=self.dim)[0]  # torch.min(dim=...) yields (values, indices); keep values
diff --git a/backends/mlir/cpu/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.py b/backends/mlir/cpu/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.py
new file mode 100644
index 0000000..622104e
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/54_conv_standard_3D__square_input__square_kernel.py
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # dynamic=False: specialize on concrete input shapes (no dynamic-shape tracing)
+class Model(nn.Module):
+    """
+    Performs a standard 3D convolution operation with square input and a square (cubic) kernel.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Edge length of the kernel; the same value is used for all three spatial axes.
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        dilation (int, optional): Spacing between kernel elements. Defaults to 1.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv3d = nn.Conv3d(
+            in_channels,
+            out_channels,
+            (kernel_size, kernel_size, kernel_size),  # expand the int to an explicit 3-tuple
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies `self.conv3d` to the input.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, depth, width, height).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, depth_out, width_out, height_out).
+        """
+        return self.conv3d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.py b/backends/mlir/cpu/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.py
new file mode 100644
index 0000000..76d43ad
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/55_conv_standard_2D__asymmetric_input__square_kernel.py
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # dynamic=False: specialize on concrete input shapes (no dynamic-shape tracing)
+class Model(nn.Module):
+    """
+    Performs a standard 2D convolution operation with an asymmetric input and a square kernel.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Edge length of the square kernel (used for both height and width).
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        dilation (int, optional): Spacing between kernel elements. Defaults to 1.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv2d = nn.Conv2d(
+            in_channels,
+            out_channels,
+            (kernel_size, kernel_size),  # expand the int to an explicit 2-tuple
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies `self.conv2d` to the input.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
+        """
+        return self.conv2d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py b/backends/mlir/cpu/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py
new file mode 100644
index 0000000..05f078b
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # dynamic=False: specialize on concrete input shapes (no dynamic-shape tracing)
+class Model(nn.Module):
+    """
+    Performs a standard 2D convolution operation with asymmetric input and kernel sizes.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (tuple): Tuple of two integers representing the height and width of the convolution kernel.
+        stride (tuple, optional): Tuple of two integers representing the stride in the height and width dimensions. Defaults to (1, 1).
+        padding (tuple, optional): Tuple of two integers representing the padding in the height and width dimensions. Defaults to (0, 0).
+        dilation (tuple, optional): Tuple of two integers representing the dilation in the height and width dimensions. Defaults to (1, 1).
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: tuple,
+        stride: tuple = (1, 1),
+        padding: tuple = (0, 0),
+        dilation: tuple = (1, 1),
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv2d = nn.Conv2d(  # every argument is forwarded to nn.Conv2d unchanged
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies `self.conv2d` to the input.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
+        """
+        return self.conv2d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py b/backends/mlir/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py
new file mode 100644
index 0000000..616a764
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/57_conv_transposed_2D__square_input__square_kernel.py
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # dynamic=False: specialize on concrete input shapes (no dynamic-shape tracing)
+class Model(nn.Module):
+    """
+    Performs a transposed 2D convolution with square input and square kernel.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Edge length of the square kernel (passed to the layer as a single int).
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        output_padding (int, optional): Additional size added to one side of the output shape. Defaults to 0.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        output_padding: int = 0,
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose2d = nn.ConvTranspose2d(  # all arguments forwarded unchanged
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies `self.conv_transpose2d` to the input.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
+        """
+        return self.conv_transpose2d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py b/backends/mlir/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py
new file mode 100644
index 0000000..93f46c6
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # dynamic=False: specialize on concrete input shapes (no dynamic-shape tracing)
+class Model(nn.Module):
+    """
+    Performs a transposed 3D convolution operation with asymmetric input and kernel sizes.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (tuple): Tuple of 3 integers representing the kernel size in the form (depth, height, width).
+        stride (tuple, optional): Tuple of 3 integers representing the stride in the form (depth, height, width). Defaults to (1, 1, 1).
+        padding (tuple, optional): Tuple of 3 integers representing the padding in the form (depth, height, width). Defaults to (0, 0, 0).
+        output_padding (tuple, optional): Tuple of 3 integers representing the output padding in the form (depth, height, width). Defaults to (0, 0, 0).
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: tuple,
+        stride: tuple = (1, 1, 1),
+        padding: tuple = (0, 0, 0),
+        output_padding: tuple = (0, 0, 0),
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose3d = nn.ConvTranspose3d(  # all arguments forwarded unchanged
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies `self.conv_transpose3d` to the input.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, depth_in, height_in, width_in).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, depth_out, height_out, width_out).
+        """
+        return self.conv_transpose3d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py b/backends/mlir/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py
new file mode 100644
index 0000000..c209371
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/59_conv_standard_3D__asymmetric_input__square_kernel.py
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # dynamic=False: specialize on concrete input shapes (no dynamic-shape tracing)
+class Model(nn.Module):
+    """
+    Performs a standard 3D convolution operation with an asymmetric input and a square kernel.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Edge length of the square kernel; the 3D kernel built is (kernel_size, kernel_size, 1).
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        dilation (int, optional): Spacing between kernel elements. Defaults to 1.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv3d = nn.Conv3d(
+            in_channels,
+            out_channels,
+            (kernel_size, kernel_size, 1),  # square over the first two spatial axes, extent 1 on the last
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies `self.conv3d` to the input.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width, depth).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out, depth_out).
+        """
+        return self.conv3d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py b/backends/mlir/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py
new file mode 100644
index 0000000..dbc7b7f
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/5_Matrix_scalar_multiplication.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # dynamic=False: specialize on concrete input shapes (no dynamic-shape tracing)
+class Model(nn.Module):
+    """
+    Stateless model that multiplies a matrix by a scalar (C = A * s).
+    """
+
+    def __init__(self):  # no parameters or buffers are registered
+        super(Model, self).__init__()
+
+    def forward(self, A: torch.Tensor, s: float) -> torch.Tensor:
+        """
+        Multiplies every element of A by the scalar s.
+
+        Args:
+            A: Input matrix of shape (M, N)
+            s: Scalar value
+
+        Returns:
+            C: Resulting matrix of shape (M, N)
+        """
+        return A * s
diff --git a/backends/mlir/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py b/backends/mlir/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py
new file mode 100644
index 0000000..18f6116
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/60_conv_standard_3D__square_input__asymmetric_kernel.py
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # dynamic=False: specialize on concrete input shapes (no dynamic-shape tracing)
+class Model(nn.Module):
+    """
+    Performs a standard 3D convolution operation with a square input and an asymmetric kernel.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (tuple): Size of the convolution kernel (kernel_width, kernel_height, kernel_depth).
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int or tuple, optional): Padding applied to the input. Defaults to 0.
+        dilation (int or tuple, optional): Spacing between kernel elements. Defaults to 1.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: tuple,
+        stride: int = 1,
+        padding: int = 0,  # annotated int, but the docstring also allows a tuple; forwarded as-is
+        dilation: int = 1,  # annotated int, but the docstring also allows a tuple; forwarded as-is
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv3d = nn.Conv3d(  # every argument is forwarded to nn.Conv3d unchanged
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies `self.conv3d` to the input.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, width, height, depth).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, width_out, height_out, depth_out).
+        """
+        return self.conv3d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py b/backends/mlir/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py
new file mode 100644
index 0000000..613047f
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/61_conv_transposed_3D__square_input__square_kernel.py
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # dynamic=False: specialize on concrete input shapes (no dynamic-shape tracing)
+class Model(nn.Module):
+    """
+    Performs a transposed 3D convolution with square input and a square (cubic) kernel.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Edge length of the kernel; the same value is used for all three spatial axes.
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        output_padding (int, optional): Additional size added to one side of the output shape. Defaults to 0.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        output_padding: int = 0,
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose3d = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size=(kernel_size, kernel_size, kernel_size),  # expand the int to a 3-tuple
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies `self.conv_transpose3d` to the input.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, depth, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, depth_out, height_out, width_out).
+        """
+        return self.conv_transpose3d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py b/backends/mlir/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py
new file mode 100644
index 0000000..9209564
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/62_conv_standard_2D__square_input__asymmetric_kernel.py
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # dynamic=False: specialize on concrete input shapes (no dynamic-shape tracing)
+class Model(nn.Module):
+    """
+    Performs a standard 2D convolution operation with a square input and an asymmetric kernel.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (tuple): Size of the convolution kernel (height, width).
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int or tuple, optional): Padding applied to the input. Defaults to 0.
+        dilation (int or tuple, optional): Spacing between kernel elements. Defaults to 1.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: tuple,
+        stride: int = 1,
+        padding: int = 0,  # annotated int, but the docstring also allows a tuple; forwarded as-is
+        dilation: int = 1,  # annotated int, but the docstring also allows a tuple; forwarded as-is
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv2d = nn.Conv2d(  # every argument is forwarded to nn.Conv2d unchanged
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies `self.conv2d` to the input.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
+        """
+        return self.conv2d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py b/backends/mlir/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py
new file mode 100644
index 0000000..e27dd51
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/63_conv_standard_2D__square_input__square_kernel.py
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # dynamic=False: specialize on concrete input shapes (no dynamic-shape tracing)
+class Model(nn.Module):
+    """
+    Performs a standard 2D convolution operation with a square input and square kernel.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Edge length of the square kernel (used for both height and width).
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        dilation (int, optional): Spacing between kernel elements. Defaults to 1.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv2d = nn.Conv2d(
+            in_channels,
+            out_channels,
+            (kernel_size, kernel_size),  # expand the int to an explicit 2-tuple
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies `self.conv2d` to the input.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
+        """
+        return self.conv2d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/64_conv_transposed_1D.py b/backends/mlir/cpu/KernelBench/level1/64_conv_transposed_1D.py
new file mode 100644
index 0000000..f52a20d
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/64_conv_transposed_1D.py
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # dynamic=False: specialize on concrete input shapes (no dynamic-shape tracing)
+class Model(nn.Module):
+    """
+    Performs a transposed 1D convolution operation.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Size of the convolution kernel.
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        output_padding (int, optional): Additional size added to one side of the output shape. Defaults to 0.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        output_padding: int = 0,
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv1d_transpose = nn.ConvTranspose1d(  # all arguments forwarded unchanged
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies `self.conv1d_transpose` to the input.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, length).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, length_out).
+        """
+        return self.conv1d_transpose(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py b/backends/mlir/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py
new file mode 100644
index 0000000..7ac4f0c
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/65_conv_transposed_2D__square_input__asymmetric_kernel.py
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # dynamic=False: specialize on concrete input shapes (no dynamic-shape tracing)
+class Model(nn.Module):
+    """
+    Performs a transposed 2D convolution with a square input and an asymmetric kernel.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (tuple): Size of the convolution kernel (height, width).
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int or tuple, optional): Padding applied to the input. Defaults to 0.
+        output_padding (int or tuple, optional): Additional size added to one side of the output shape. Defaults to 0.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: tuple,
+        stride: int = 1,
+        padding: int = 0,  # annotated int, but the docstring also allows a tuple; forwarded as-is
+        output_padding: int = 0,  # annotated int, but the docstring also allows a tuple; forwarded as-is
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose2d = nn.ConvTranspose2d(  # all arguments forwarded unchanged
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies `self.conv_transpose2d` to the input.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
+        """
+        return self.conv_transpose2d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py b/backends/mlir/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py
new file mode 100644
index 0000000..02ed451
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Standard 3D convolution with asymmetric input and kernel sizes, implemented as a single nn.Conv3d layer.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (tuple): Size of the convolution kernel in the form (kernel_size_d, kernel_size_h, kernel_size_w).
+        stride (tuple, optional): Stride of the convolution in the form (stride_d, stride_h, stride_w). Defaults to (1, 1, 1).
+        padding (tuple, optional): Padding applied to the input in the form (padding_d, padding_h, padding_w). Defaults to (0, 0, 0).
+        dilation (tuple, optional): Spacing between kernel elements in the form (dilation_d, dilation_h, dilation_w). Defaults to (1, 1, 1).
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: tuple,
+        stride: tuple = (1, 1, 1),
+        padding: tuple = (0, 0, 0),
+        dilation: tuple = (1, 1, 1),
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv3d = nn.Conv3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the wrapped nn.Conv3d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, depth, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, depth_out, height_out, width_out).
+        """
+        return self.conv3d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/67_conv_standard_1D.py b/backends/mlir/cpu/KernelBench/level1/67_conv_standard_1D.py
new file mode 100644
index 0000000..9388422
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/67_conv_standard_1D.py
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Standard 1D convolution implemented as a single nn.Conv1d layer.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Size of the convolution kernel.
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        dilation (int, optional): Spacing between kernel elements. Defaults to 1.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv1d = nn.Conv1d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the wrapped nn.Conv1d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, length).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, length_out).
+        """
+        return self.conv1d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py b/backends/mlir/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py
new file mode 100644
index 0000000..07a19b2
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/68_conv_transposed_3D__square_input__asymmetric_kernel.py
@@ -0,0 +1,59 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Transposed 3D convolution with a square input and an asymmetric kernel, implemented as a single nn.ConvTranspose3d layer.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (tuple): Size of the convolution kernel (kernel_depth, kernel_width, kernel_height), forwarded unchanged
+                             to nn.ConvTranspose3d; kernel_width == kernel_height is expected but not checked here.
+        stride (tuple, optional): Stride of the convolution. Defaults to (1, 1, 1).
+        padding (tuple, optional): Padding applied to the input. Defaults to (0, 0, 0).
+        output_padding (tuple, optional): Additional size added to one side of the output shape. Defaults to (0, 0, 0).
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: tuple,
+        stride: tuple = (1, 1, 1),
+        padding: tuple = (0, 0, 0),
+        output_padding: tuple = (0, 0, 0),
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose3d = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the wrapped nn.ConvTranspose3d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, depth, width, height).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, depth_out, width_out, height_out).
+        """
+        return self.conv_transpose3d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py b/backends/mlir/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py
new file mode 100644
index 0000000..1943e1e
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py
@@ -0,0 +1,61 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Transposed 2D convolution with asymmetric input and kernel size, implemented as a single nn.ConvTranspose2d layer.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (tuple): Tuple of integers representing the kernel size (height, width).
+        stride (tuple, optional): Tuple of integers representing the stride of the convolution. Defaults to (1, 1).
+        padding (tuple, optional): Tuple of integers representing the padding applied to the input. Defaults to (0, 0).
+        output_padding (tuple, optional): Tuple of integers representing the additional size added to one side of the output shape. Defaults to (0, 0).
+        dilation (tuple, optional): Tuple of integers representing the spacing between kernel elements. Defaults to (1, 1).
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: tuple,
+        stride: tuple = (1, 1),
+        padding: tuple = (0, 0),
+        output_padding: tuple = (0, 0),
+        dilation: tuple = (1, 1),
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose2d = nn.ConvTranspose2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the wrapped nn.ConvTranspose2d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height_in, width_in).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
+        """
+        return self.conv_transpose2d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/6_Matmul_with_large_K_dimension_.py b/backends/mlir/cpu/KernelBench/level1/6_Matmul_with_large_K_dimension_.py
new file mode 100644
index 0000000..b9b1953
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/6_Matmul_with_large_K_dimension_.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a single matrix multiplication (C = A @ B, via torch.matmul) with a large K dimension
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        """
+        Computes the matrix product of A and B with torch.matmul.
+
+        Args:
+            A: Input tensor of shape (M, K)
+            B: Input tensor of shape (K, N)
+
+        Returns:
+            Output tensor of shape (M, N)
+        """
+        return torch.matmul(A, B)
diff --git a/backends/mlir/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py b/backends/mlir/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py
new file mode 100644
index 0000000..56ae2be
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/70_conv_transposed_3D__asymmetric_input__square_kernel.py
@@ -0,0 +1,62 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Transposed 3D convolution with asymmetric input and a square kernel, implemented as a single nn.ConvTranspose3d layer.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Size of the square convolution kernel; expanded to (kernel_size, kernel_size, kernel_size).
+        stride (int or tuple, optional): Stride of the convolution. Defaults to 1.
+        padding (int or tuple, optional): Padding applied to the input. Defaults to 0.
+        output_padding (int or tuple, optional): Additional size added to one side of each dimension in the output shape.
+                                                  Defaults to 0.
+        dilation (int or tuple, optional): Spacing between kernel elements. Defaults to 1.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        output_padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose3d = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            (kernel_size, kernel_size, kernel_size),
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the wrapped nn.ConvTranspose3d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, depth, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, depth_out, height_out, width_out).
+        """
+        return self.conv_transpose3d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py b/backends/mlir/cpu/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py
new file mode 100644
index 0000000..6eaf0d9
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/71_conv_transposed_2D__asymmetric_input__square_kernel.py
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Transposed 2D convolution with asymmetric input and a square kernel, implemented as a single nn.ConvTranspose2d layer.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Size of the square convolution kernel.
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        output_padding (int, optional): Additional size added to one side of the output shape. Defaults to 0.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        output_padding: int = 0,
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose2d = nn.ConvTranspose2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the wrapped nn.ConvTranspose2d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height_in, width_in).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
+        """
+        return self.conv_transpose2d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py b/backends/mlir/cpu/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py
new file mode 100644
index 0000000..bcd1368
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py
@@ -0,0 +1,57 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    3D transposed convolution with asymmetric input and a square kernel, implemented as a single nn.ConvTranspose3d layer.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Size of the square convolution kernel; expanded to (kernel_size, kernel_size, kernel_size).
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        output_padding (int, optional): Accepted but not forwarded to nn.ConvTranspose3d, so it currently has no effect (NOTE(review): confirm this is intended). Defaults to 0.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        output_padding: int = 0,
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose3d = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size=(kernel_size, kernel_size, kernel_size),
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the wrapped nn.ConvTranspose3d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, depth, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, depth_out, height_out, width_out).
+        """
+        return self.conv_transpose3d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py b/backends/mlir/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py
new file mode 100644
index 0000000..983b0cd
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/74_conv_transposed_1D_dilated.py
@@ -0,0 +1,55 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Transposed 1D convolution, optionally with dilation, implemented as a single nn.ConvTranspose1d layer.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Size of the convolution kernel.
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        dilation (int, optional): Spacing between kernel elements. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv1d_transpose = nn.ConvTranspose1d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the wrapped nn.ConvTranspose1d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, length).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, length_out).
+        """
+        return self.conv1d_transpose(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py b/backends/mlir/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py
new file mode 100644
index 0000000..f7d48f7
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py
@@ -0,0 +1,59 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    2D transposed convolution with asymmetric input and asymmetric kernel, implemented as a single
+    nn.ConvTranspose2d layer; stride, padding, dilation, and groups are forwarded to that layer.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (tuple): Size of the convolution kernel (height, width).
+        stride (tuple, optional): Stride of the convolution (height, width). Defaults to (1, 1).
+        padding (tuple, optional): Padding applied to the input (height, width). Defaults to (0, 0).
+        dilation (tuple, optional): Spacing between kernel elements (height, width). Defaults to (1, 1).
+        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: tuple,
+        stride: tuple = (1, 1),
+        padding: tuple = (0, 0),
+        dilation: tuple = (1, 1),
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose2d = nn.ConvTranspose2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the wrapped nn.ConvTranspose2d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
+        """
+        return self.conv_transpose2d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py b/backends/mlir/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py
new file mode 100644
index 0000000..aff84f6
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/76_conv_standard_1D_dilated_strided__.py
@@ -0,0 +1,52 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Standard 1D convolution, potentially dilated and strided, implemented as a single nn.Conv1d layer.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Size of the convolution kernel.
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        dilation (int, optional): Spacing between kernel elements. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        dilation: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv1d = nn.Conv1d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            dilation=dilation,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the wrapped nn.Conv1d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, length).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, length_out).
+        """
+        return self.conv1d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py b/backends/mlir/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py
new file mode 100644
index 0000000..b59aa5a
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py
@@ -0,0 +1,56 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    3D transposed convolution with square input and square kernel, implemented as a single
+    nn.ConvTranspose3d layer; stride, padding, and dilation are forwarded to that layer.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Size of the square convolution kernel; expanded to (kernel_size, kernel_size, kernel_size).
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        dilation (int, optional): Spacing between kernel elements. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose3d = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size=(kernel_size, kernel_size, kernel_size),
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the wrapped nn.ConvTranspose3d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, depth, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, depth_out, height_out, width_out).
+        """
+        return self.conv_transpose3d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py b/backends/mlir/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py
new file mode 100644
index 0000000..9b2e7f0
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py
@@ -0,0 +1,52 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    2D transposed convolution with asymmetric input and kernel and optional padding, implemented as a single nn.ConvTranspose2d layer.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (tuple): Size of the convolution kernel (height, width).
+        stride (tuple, optional): Stride of the convolution (height, width). Defaults to (1, 1).
+        padding (tuple, optional): Padding applied to the input (height, width). Defaults to (0, 0).
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: tuple,
+        stride: tuple = (1, 1),
+        padding: tuple = (0, 0),
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose2d = nn.ConvTranspose2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the wrapped nn.ConvTranspose2d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
+        """
+        return self.conv_transpose2d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py b/backends/mlir/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py
new file mode 100644
index 0000000..9424508
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py
@@ -0,0 +1,56 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Transposed 1D convolution implemented as a single nn.ConvTranspose1d layer.
+    Supports padding, striding, and dilation.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Size of the convolution kernel.
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        dilation (int, optional): Spacing between kernel elements. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv1d_transpose = nn.ConvTranspose1d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the wrapped nn.ConvTranspose1d layer to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, length).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, length_out).
+        """
+        return self.conv1d_transpose(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/7_Matmul_with_small_K_dimension_.py b/backends/mlir/cpu/KernelBench/level1/7_Matmul_with_small_K_dimension_.py
new file mode 100644
index 0000000..32ee22b
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/7_Matmul_with_small_K_dimension_.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Parameter-free module computing one matrix product, C = torch.matmul(A, B), for the small-K-dimension benchmark.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        """
+        Returns `torch.matmul(A, B)`.
+
+        Args:
+            A: Input tensor of shape (M, K).
+            B: Input tensor of shape (K, N).
+
+        Returns:
+            Output tensor of shape (M, N).
+        """
+        return torch.matmul(A, B)
diff --git a/backends/mlir/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py b/backends/mlir/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py
new file mode 100644
index 0000000..590f93c
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py
@@ -0,0 +1,55 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Wraps a single `nn.Conv2d` for the square-input, asymmetric-kernel benchmark, with dilation and padding.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (tuple): Size of the convolution kernel (height, width).
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (tuple, optional): Padding applied to the input (top/bottom, left/right). Defaults to (0, 0).
+        dilation (tuple, optional): Spacing between kernel elements (height, width). Defaults to (1, 1).
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: tuple,
+        stride: int = 1,
+        padding: tuple = (0, 0),
+        dilation: tuple = (1, 1),
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv2d = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the wrapped `nn.Conv2d` to `x`.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
+        """
+        return self.conv2d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py b/backends/mlir/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py
new file mode 100644
index 0000000..19efd37
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py
@@ -0,0 +1,55 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Wraps a single `nn.ConvTranspose2d` with a square kernel for the asymmetric-input benchmark, supporting dilation, padding, and stride.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Size of the convolution kernel (square, e.g., 3 for a 3x3 kernel).
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        dilation (int, optional): Spacing between kernel elements. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose2d = nn.ConvTranspose2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the wrapped `nn.ConvTranspose2d` to `x`.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height_in, width_in).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
+        """
+        return self.conv_transpose2d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.py b/backends/mlir/cpu/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.py
new file mode 100644
index 0000000..159c97f
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/82_conv_depthwise_2D_square_input_square_kernel.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Depthwise 2D convolution with a square kernel: an `nn.Conv2d` with `groups=in_channels` and as many output as input channels.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor (and in the output).
+        kernel_size (int): Size of the convolution kernel.
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv2d = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=in_channels,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the depthwise `nn.Conv2d` to `x`.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, in_channels, height_out, width_out).
+        """
+        return self.conv2d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py b/backends/mlir/cpu/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py
new file mode 100644
index 0000000..0e89225
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/83_conv_depthwise_2D_square_input_asymmetric_kernel.py
@@ -0,0 +1,54 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Depthwise 2D convolution (`groups=in_channels`) with an asymmetric `(kernel_size, 1)` kernel.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor (and in the output).
+        kernel_size (int): Height of the convolution kernel; the kernel width is fixed to 1.
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        dilation (int, optional): Spacing between kernel elements. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv2d = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size=(kernel_size, 1),
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=in_channels,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the depthwise `nn.Conv2d` to `x`.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, in_channels, height_out, width_out).
+        """
+        return self.conv2d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py b/backends/mlir/cpu/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py
new file mode 100644
index 0000000..0cbbcef
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/84_conv_depthwise_2D_asymmetric_input_square_kernel.py
@@ -0,0 +1,53 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Grouped 2D convolution with `groups=in_channels` and a square `(kernel_size, kernel_size)` kernel.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor; also used as the number of groups.
+        out_channels (int): Number of channels produced by the convolution; `nn.Conv2d` requires it to be divisible by `groups`.
+        kernel_size (int): Size of the square convolution kernel.
+        stride (int, optional): Stride of the convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input. Defaults to 0.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv2d = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=(kernel_size, kernel_size),
+            stride=stride,
+            padding=padding,
+            groups=in_channels,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the grouped `nn.Conv2d` to `x`.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height_in, width_in).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
+        """
+        return self.conv2d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py b/backends/mlir/cpu/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py
new file mode 100644
index 0000000..f96a0fa
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py
@@ -0,0 +1,66 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Depthwise 2D convolution (`groups=in_channels`, `in_channels` outputs) with an asymmetric kernel, for the asymmetric-input benchmark.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor; also the number of output channels and groups.
+        out_channels (int): Accepted but not used; the convolution always produces `in_channels` channels.
+        kernel_size_h (int): Height of the convolution kernel.
+        kernel_size_w (int): Width of the convolution kernel.
+        stride_h (int, optional): Stride of the convolution in height dimension. Defaults to 1.
+        stride_w (int, optional): Stride of the convolution in width dimension. Defaults to 1.
+        padding_h (int, optional): Padding applied to the input in height dimension. Defaults to 0.
+        padding_w (int, optional): Padding applied to the input in width dimension. Defaults to 0.
+        dilation_h (int, optional): Spacing between kernel elements in height dimension. Defaults to 1.
+        dilation_w (int, optional): Spacing between kernel elements in width dimension. Defaults to 1.
+        groups (int, optional): Accepted but not used; the convolution always uses `groups=in_channels`. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size_h: int,
+        kernel_size_w: int,
+        stride_h: int = 1,
+        stride_w: int = 1,
+        padding_h: int = 0,
+        padding_w: int = 0,
+        dilation_h: int = 1,
+        dilation_w: int = 1,
+        groups: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.conv2d = nn.Conv2d(
+            in_channels,
+            in_channels,
+            (kernel_size_h, kernel_size_w),
+            stride=(stride_h, stride_w),
+            padding=(padding_h, padding_w),
+            dilation=(dilation_h, dilation_w),
+            groups=in_channels,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the depthwise `nn.Conv2d` to `x`.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, in_channels, height_out, width_out).
+        """
+        return self.conv2d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/86_conv_depthwise_separable_2D.py b/backends/mlir/cpu/KernelBench/level1/86_conv_depthwise_separable_2D.py
new file mode 100644
index 0000000..10a2c6b
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/86_conv_depthwise_separable_2D.py
@@ -0,0 +1,59 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Depthwise-separable 2D convolution: a depthwise `nn.Conv2d` (`groups=in_channels`) followed by a 1x1 pointwise `nn.Conv2d`.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the pointwise convolution.
+        kernel_size (int): Size of the depthwise convolution kernel.
+        stride (int, optional): Stride of the depthwise convolution. Defaults to 1.
+        padding (int, optional): Padding applied to the input of the depthwise convolution. Defaults to 0.
+        dilation (int, optional): Spacing between depthwise kernel elements. Defaults to 1.
+        bias (bool, optional): If `True`, adds a learnable bias to both convolutions. Defaults to `False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        bias: bool = False,
+    ):
+        super(Model, self).__init__()
+        self.depthwise = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=in_channels,
+            bias=bias,
+        )
+        self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=bias)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the depthwise convolution and then the pointwise convolution.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
+        """
+        x = self.depthwise(x)
+        x = self.pointwise(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level1/87_conv_pointwise_2D.py b/backends/mlir/cpu/KernelBench/level1/87_conv_pointwise_2D.py
new file mode 100644
index 0000000..7a8d04b
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/87_conv_pointwise_2D.py
@@ -0,0 +1,36 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Pointwise (1x1 kernel) 2D convolution. NOTE: the layer is stored as `self.conv1d` although it is an `nn.Conv2d`.
+
+    Args:
+        in_channels (int): Number of channels in the input tensor.
+        out_channels (int): Number of channels produced by the convolution.
+        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
+    """
+
+    def __init__(self, in_channels: int, out_channels: int, bias: bool = False):
+        super(Model, self).__init__()
+        self.conv1d = nn.Conv2d(
+            in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=bias
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the 1x1 `nn.Conv2d` to `x`.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, height, width).
+        """
+        return self.conv1d(x)
diff --git a/backends/mlir/cpu/KernelBench/level1/88_MinGPTNewGelu.py b/backends/mlir/cpu/KernelBench/level1/88_MinGPTNewGelu.py
new file mode 100644
index 0000000..b59a188
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/88_MinGPTNewGelu.py
@@ -0,0 +1,32 @@
+import math
+
+import torch
+import torch.nn as nn
+
+# From https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Tanh-based GELU (as in the Google BERT repo, identical to OpenAI GPT): 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3))).
+    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x):
+        return (
+            0.5
+            * x
+            * (
+                1.0
+                + torch.tanh(
+                    math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
+                )
+            )
+        )
diff --git a/backends/mlir/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py b/backends/mlir/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py
new file mode 100644
index 0000000..4674914
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/8_Matmul_with_irregular_shapes_.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Parameter-free module computing one matrix product, C = torch.matmul(A, B), for the irregular-shapes benchmark.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        """
+        Returns `torch.matmul(A, B)`.
+
+        Args:
+            A: Input tensor with shape (M, K).
+            B: Input tensor with shape (K, N).
+
+        Returns:
+            C: Output tensor with shape (M, N).
+        """
+        return torch.matmul(A, B)
diff --git a/backends/mlir/cpu/KernelBench/level1/94_MSELoss.py b/backends/mlir/cpu/KernelBench/level1/94_MSELoss.py
new file mode 100644
index 0000000..a516ad3
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/94_MSELoss.py
@@ -0,0 +1,22 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Computes the Mean Squared Error loss: the mean of the squared element-wise differences.
+
+    The module takes no constructor parameters; `forward(predictions, targets)` returns
+    `torch.mean((predictions - targets) ** 2)`.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, predictions, targets):
+        return torch.mean((predictions - targets) ** 2)
diff --git a/backends/mlir/cpu/KernelBench/level1/97_ScaledDotProductAttention.py b/backends/mlir/cpu/KernelBench/level1/97_ScaledDotProductAttention.py
new file mode 100644
index 0000000..6819027
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/97_ScaledDotProductAttention.py
@@ -0,0 +1,18 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(
+        self, Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor
+    ) -> torch.Tensor:
+        out = torch.nn.functional.scaled_dot_product_attention(Q, K, V)  # only Q, K, V are passed; every other argument keeps its PyTorch default
+        return out
diff --git a/backends/mlir/cpu/KernelBench/level1/98_KLDivLoss.py b/backends/mlir/cpu/KernelBench/level1/98_KLDivLoss.py
new file mode 100644
index 0000000..5e39723
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/98_KLDivLoss.py
@@ -0,0 +1,24 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Computes the Kullback-Leibler divergence between two distributions with `reduction="batchmean"`.
+
+    The module takes no constructor parameters; `predictions` is passed through `torch.log`
+    before being handed to `torch.nn.functional.kl_div` together with `targets`.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, predictions, targets):
+        return torch.nn.functional.kl_div(
+            torch.log(predictions), targets, reduction="batchmean"
+        )
diff --git a/backends/mlir/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py b/backends/mlir/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py
new file mode 100644
index 0000000..f0cb5c7
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level1/9_Tall_skinny_matrix_multiplication_.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Parameter-free module computing one matrix product, C = torch.matmul(A, B), where one of the matrices is tall and skinny (M >> N or N >> M)
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A, B):
+        """
+        Returns `torch.matmul(A, B)`.
+
+        Args:
+            A (torch.Tensor): Input matrix of shape (M, K) or (K, M) where M >> N or N >> M.
+            B (torch.Tensor): Input matrix of shape (K, N) or (N, K) where M >> N or N >> M.
+
+        Returns:
+            torch.Tensor: Output matrix of shape (M, N) or (N, M)
+        """
+        return torch.matmul(A, B)

From 8608860a382b357b3ffac58dee84ab83c0e1564b Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk@intel.com>
Date: Tue, 7 Apr 2026 23:12:58 +0200
Subject: [PATCH 2/7] MLIR CPU pipeline

---
 ai_bench/mlir/__init__.py     |  2 ++
 ai_bench/mlir/cpu_pipeline.py | 63 +++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)
 create mode 100644 ai_bench/mlir/cpu_pipeline.py

diff --git a/ai_bench/mlir/__init__.py b/ai_bench/mlir/__init__.py
index 01c0a60..750c1bf 100644
--- a/ai_bench/mlir/__init__.py
+++ b/ai_bench/mlir/__init__.py
@@ -1,5 +1,7 @@
 from .compile import cpu_backend
+from .cpu_pipeline import cpu_pipeline
 
 __all__ = [
     "cpu_backend",
+    "cpu_pipeline",
 ]
diff --git a/ai_bench/mlir/cpu_pipeline.py b/ai_bench/mlir/cpu_pipeline.py
new file mode 100644
index 0000000..50f0673
--- /dev/null
+++ b/ai_bench/mlir/cpu_pipeline.py
@@ -0,0 +1,63 @@
+import lighthouse.schedule as lh_schedule
+import lighthouse.transform as lh_transform
+from mlir import ir
+from mlir.dialects import transform
+from mlir.dialects.transform import structured
+from mlir.passmanager import PassManager
+
+
+def cpu_pipeline(module: ir.Module) -> ir.Module:
+    """Lower `module` in place to the LLVM dialect through a fixed CPU pass pipeline and return the same object."""
+    pm = PassManager("builtin.module", module.context)
+    # pm.add("print-ir")
+    pm.add("func.func(llvm-request-c-wrappers)")  # Use standard C interface wrappers for functions.
+    pm.run(module.operation)
+
+    # Decompose complex Linalg ops into simpler ones (only linalg.softmax is active; the conv variants are commented out).
+    ctx = module.context
+    with ctx, ir.Location.unknown(context=ctx):
+        with lh_schedule.schedule_boilerplate() as (sched, named_seq):
+            # ops = lh_transform.match_op(named_seq.bodyTarget, "linalg.conv_2d_nchw_fchw")
+            # structured.structured_decompose(transform.any_op_t(), ops)
+            # ops = lh_transform.match_op(named_seq.bodyTarget, "linalg.conv_3d_ncdhw_fcdhw")
+            # structured.structured_decompose(transform.any_op_t(), ops)
+            softmax_ops = lh_transform.match_op(named_seq.bodyTarget, "linalg.softmax")
+            structured.structured_decompose_interface(transform.any_op_t(), softmax_ops)
+            transform.yield_()
+    sched.body.operations[0].apply(module)  # Apply the first op of the schedule body built above to `module`.
+
+    # Bufferize (including function boundaries), then clean up with CSE and canonicalization.
+    pm = PassManager("builtin.module", module.context)
+    pm.add("eliminate-empty-tensors")
+    pm.add(
+        "one-shot-bufferize{function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries}"
+    )
+    pm.add("drop-equivalent-buffer-results")
+    pm.add("buffer-deallocation-pipeline")
+    pm.add("convert-bufferization-to-memref")
+    pm.add("cse")
+    pm.add("canonicalize")
+
+    # Lower to LLVM: Linalg goes to loops first, then the remaining dialects are converted below.
+    pm.add("convert-linalg-to-loops")
+    # pm.add("print-ir")
+    pm.add("math-expand-ops")
+    pm.add("expand-strided-metadata")
+    pm.add("canonicalize")
+
+    pm.add("convert-vector-to-scf")
+    pm.add("lower-affine")
+    pm.add("convert-scf-to-cf")
+    pm.add("convert-vector-to-llvm")
+    pm.add("convert-math-to-libm")
+    pm.add("convert-to-llvm")
+    pm.add("reconcile-unrealized-casts")
+
+    # Cleanup.
+    pm.add("cse")
+    pm.add("canonicalize")
+    # pm.add("print-ir")
+
+    pm.run(module.operation)  # Runs the bufferization, lowering, and cleanup passes queued on this second pass manager.
+
+    return module

From 8aa3ca8d0605cfc04a65250e7477e9d0f4509549 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk@intel.com>
Date: Tue, 7 Apr 2026 23:40:42 +0200
Subject: [PATCH 3/7] L2 MLIR CPU kernels

---
 .../100_ConvTranspose3d_Clamp_Min_Divide.py   | 37 ++++++++++++
 ...vTranspose2d_MaxPool_Hardtanh_Mean_Tanh.py | 42 ++++++++++++++
 ...pose2d_BatchNorm_Tanh_MaxPool_GroupNorm.py | 40 +++++++++++++
 .../level2/12_Gemm_Multiply_LeakyReLU.py      | 25 ++++++++
 ...anspose3d_Mean_Add_Softmax_Tanh_Scaling.py | 40 +++++++++++++
 .../level2/14_Gemm_Divide_Sum_Scaling.py      | 31 ++++++++++
 .../15_ConvTranspose3d_BatchNorm_Subtract.py  | 35 +++++++++++
 ...nvTranspose2d_Mish_Add_Hardtanh_Scaling.py | 42 ++++++++++++++
 .../19_ConvTranspose2d_GELU_GroupNorm.py      | 28 +++++++++
 .../level2/1_Conv2D_ReLU_BiasAdd.py           | 24 ++++++++
 ...3d_Sum_ResidualAdd_Multiply_ResidualAdd.py | 44 ++++++++++++++
 .../21_Conv2d_Add_Scale_Sigmoid_GroupNorm.py  | 36 ++++++++++++
 .../level2/23_Conv3d_GroupNorm_Mean.py        | 30 ++++++++++
 .../level2/24_Conv3d_Min_Softmax.py           | 31 ++++++++++
 .../level2/25_Conv2d_Min_Tanh_Tanh.py         | 26 +++++++++
 .../26_ConvTranspose3d_Add_HardSwish.py       | 47 +++++++++++++++
 .../27_Conv3d_HardSwish_GroupNorm_Mean.py     | 30 ++++++++++
 .../KernelBench/level2/29_Matmul_Mish_Mish.py | 23 ++++++++
 ...se2d_BiasAdd_Clamp_Scaling_Clamp_Divide.py | 45 ++++++++++++++
 .../level2/30_Gemm_GroupNorm_Hardtanh.py      | 33 +++++++++++
 .../level2/31_Conv2d_Min_Add_Multiply.py      | 35 +++++++++++
 .../level2/32_Conv2d_Scaling_Min.py           | 30 ++++++++++
 .../level2/33_Gemm_Scale_BatchNorm.py         | 26 +++++++++
 ..._ConvTranspose3d_LayerNorm_GELU_Scaling.py | 50 ++++++++++++++++
 ..._Conv2d_Subtract_HardSwish_MaxPool_Mish.py | 29 ++++++++++
 .../36_ConvTranspose2d_Min_Sum_GELU_Add.py    | 39 +++++++++++++
 .../level2/37_Matmul_Swish_Sum_GroupNorm.py   | 32 ++++++++++
 ...nspose3d_AvgPool_Clamp_Softmax_Multiply.py | 58 +++++++++++++++++++
 .../level2/39_Gemm_Scale_BatchNorm.py         | 25 ++++++++
 ...vTranspose3d_Sum_LayerNorm_AvgPool_GELU.py | 47 +++++++++++++++
 .../level2/40_Matmul_Scaling_ResidualAdd.py   | 39 +++++++++++++
 .../level2/41_Gemm_BatchNorm_GELU_ReLU.py     | 31 ++++++++++
 ...ltiply_GlobalAvgPool_GlobalAvgPool_Mean.py | 42 ++++++++++++++
 ...6_Conv2d_Subtract_Tanh_Subtract_AvgPool.py | 36 ++++++++++++
 .../KernelBench/level2/47_Conv3d_Mish_Tanh.py | 32 ++++++++++
 ...48_Conv3d_Scaling_Tanh_Multiply_Sigmoid.py | 29 ++++++++++
 .../49_ConvTranspose3d_Softmax_Sigmoid.py     | 49 ++++++++++++++++
 .../KernelBench/level2/4_Conv2d_Mish_Mish.py  | 23 ++++++++
 ...spose3d_Scaling_AvgPool_BiasAdd_Scaling.py | 41 +++++++++++++
 .../level2/53_Gemm_Scaling_Hardtanh_GELU.py   | 29 ++++++++++
 .../54_Conv2d_Multiply_LeakyReLU_GELU.py      | 26 +++++++++
 .../level2/55_Matmul_MaxPool_Sum_Scale.py     | 33 +++++++++++
 .../level2/56_Matmul_Sigmoid_Sum.py           | 30 ++++++++++
 .../level2/57_Conv2d_ReLU_HardSwish.py        | 23 ++++++++
 .../level2/59_Matmul_Swish_Scaling.py         | 24 ++++++++
 .../level2/5_ConvTranspose2d_Subtract_Tanh.py | 40 +++++++++++++
 ...nvTranspose3d_Swish_GroupNorm_HardSwish.py | 45 ++++++++++++++
 .../61_ConvTranspose3d_ReLU_GroupNorm.py      | 34 +++++++++++
 .../62_Matmul_GroupNorm_LeakyReLU_Sum.py      | 37 ++++++++++++
 .../KernelBench/level2/63_Gemm_ReLU_Divide.py | 24 ++++++++
 .../level2/65_Conv2d_AvgPool_Sigmoid_Sum.py   | 25 ++++++++
 .../level2/66_Matmul_Dropout_Softmax.py       | 31 ++++++++++
 .../level2/67_Conv2d_GELU_GlobalAvgPool.py    | 30 ++++++++++
 .../level2/68_Matmul_Min_Subtract.py          | 24 ++++++++
 .../level2/69_Conv2d_HardSwish_ReLU.py        | 30 ++++++++++
 .../6_Conv3d_Softmax_MaxPool_MaxPool.py       | 32 ++++++++++
 .../70_Gemm_Sigmoid_Scaling_ResidualAdd.py    | 35 +++++++++++
 .../level2/71_Conv2d_Divide_LeakyReLU.py      | 24 ++++++++
 ...nvTranspose3d_BatchNorm_AvgPool_AvgPool.py | 32 ++++++++++
 .../level2/73_Conv2d_BatchNorm_Scaling.py     | 25 ++++++++
 ...pose3d_LeakyReLU_Multiply_LeakyReLU_Max.py | 45 ++++++++++++++
 .../level2/75_Gemm_GroupNorm_Min_BiasAdd.py   | 26 +++++++++
 .../KernelBench/level2/76_Gemm_Add_ReLU.py    | 30 ++++++++++
 ...anspose3d_Scale_BatchNorm_GlobalAvgPool.py | 36 ++++++++++++
 .../level2/78_ConvTranspose3d_Max_Max_Sum.py  | 28 +++++++++
 ...v3d_ReLU_LeakyReLU_GELU_Sigmoid_BiasAdd.py | 27 +++++++++
 .../level2/80_Gemm_Max_Subtract_GELU.py       | 32 ++++++++++
 .../81_Gemm_Swish_Divide_Clamp_Tanh_Clamp.py  | 32 ++++++++++
 .../82_Conv2d_Tanh_Scaling_BiasAdd_Max.py     | 41 +++++++++++++
 .../83_Conv3d_GroupNorm_Min_Clamp_Dropout.py  | 40 +++++++++++++
 .../84_Gemm_BatchNorm_Scaling_Softmax.py      | 35 +++++++++++
 ...85_Conv2d_GroupNorm_Scale_MaxPool_Clamp.py | 46 +++++++++++++++
 .../level2/86_Matmul_Divide_GELU.py           | 30 ++++++++++
 .../87_Conv2d_Subtract_Subtract_Mish.py       | 28 +++++++++
 .../88_Gemm_GroupNorm_Swish_Multiply_Swish.py | 32 ++++++++++
 ...se3d_MaxPool_Softmax_Subtract_Swish_Max.py | 56 ++++++++++++++++++
 ...3d_Divide_Max_GlobalAvgPool_BiasAdd_Sum.py | 41 +++++++++++++
 .../90_Conv3d_LeakyReLU_Sum_Clamp_GELU.py     | 26 +++++++++
 ...spose2d_Softmax_BiasAdd_Scaling_Sigmoid.py | 44 ++++++++++++++
 ...3_ConvTranspose2d_Add_Min_GELU_Multiply.py | 31 ++++++++++
 ...94_Gemm_BiasAdd_Hardtanh_Mish_GroupNorm.py | 35 +++++++++++
 .../95_Matmul_Add_Swish_Tanh_GELU_Hardtanh.py | 27 +++++++++
 ...pose3d_Multiply_Max_GlobalAvgPool_Clamp.py | 42 ++++++++++++++
 ...7_Matmul_BatchNorm_BiasAdd_Divide_Swish.py | 36 ++++++++++++
 .../98_Matmul_AvgPool_GELU_Scale_Max.py       | 34 +++++++++++
 .../level2/99_Matmul_GELU_Softmax.py          | 23 ++++++++
 .../level2/9_Matmul_Subtract_Multiply_ReLU.py | 26 +++++++++
 87 files changed, 2944 insertions(+)
 create mode 100644 backends/mlir/cpu/KernelBench/level2/100_ConvTranspose3d_Clamp_Min_Divide.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/10_ConvTranspose2d_MaxPool_Hardtanh_Mean_Tanh.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/11_ConvTranspose2d_BatchNorm_Tanh_MaxPool_GroupNorm.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/12_Gemm_Multiply_LeakyReLU.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/13_ConvTranspose3d_Mean_Add_Softmax_Tanh_Scaling.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/14_Gemm_Divide_Sum_Scaling.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/15_ConvTranspose3d_BatchNorm_Subtract.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/16_ConvTranspose2d_Mish_Add_Hardtanh_Scaling.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/19_ConvTranspose2d_GELU_GroupNorm.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/1_Conv2D_ReLU_BiasAdd.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/20_ConvTranspose3d_Sum_ResidualAdd_Multiply_ResidualAdd.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/21_Conv2d_Add_Scale_Sigmoid_GroupNorm.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/23_Conv3d_GroupNorm_Mean.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/24_Conv3d_Min_Softmax.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/25_Conv2d_Min_Tanh_Tanh.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/26_ConvTranspose3d_Add_HardSwish.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/27_Conv3d_HardSwish_GroupNorm_Mean.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/29_Matmul_Mish_Mish.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/2_ConvTranspose2d_BiasAdd_Clamp_Scaling_Clamp_Divide.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/30_Gemm_GroupNorm_Hardtanh.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/31_Conv2d_Min_Add_Multiply.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/32_Conv2d_Scaling_Min.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/33_Gemm_Scale_BatchNorm.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/34_ConvTranspose3d_LayerNorm_GELU_Scaling.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/35_Conv2d_Subtract_HardSwish_MaxPool_Mish.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/36_ConvTranspose2d_Min_Sum_GELU_Add.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/37_Matmul_Swish_Sum_GroupNorm.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/38_ConvTranspose3d_AvgPool_Clamp_Softmax_Multiply.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/39_Gemm_Scale_BatchNorm.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/3_ConvTranspose3d_Sum_LayerNorm_AvgPool_GELU.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/40_Matmul_Scaling_ResidualAdd.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/41_Gemm_BatchNorm_GELU_ReLU.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/44_ConvTranspose2d_Multiply_GlobalAvgPool_GlobalAvgPool_Mean.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/46_Conv2d_Subtract_Tanh_Subtract_AvgPool.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/47_Conv3d_Mish_Tanh.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/48_Conv3d_Scaling_Tanh_Multiply_Sigmoid.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/49_ConvTranspose3d_Softmax_Sigmoid.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/4_Conv2d_Mish_Mish.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/50_ConvTranspose3d_Scaling_AvgPool_BiasAdd_Scaling.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/53_Gemm_Scaling_Hardtanh_GELU.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/54_Conv2d_Multiply_LeakyReLU_GELU.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/55_Matmul_MaxPool_Sum_Scale.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/56_Matmul_Sigmoid_Sum.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/57_Conv2d_ReLU_HardSwish.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/59_Matmul_Swish_Scaling.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/5_ConvTranspose2d_Subtract_Tanh.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/60_ConvTranspose3d_Swish_GroupNorm_HardSwish.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/61_ConvTranspose3d_ReLU_GroupNorm.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/62_Matmul_GroupNorm_LeakyReLU_Sum.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/63_Gemm_ReLU_Divide.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/65_Conv2d_AvgPool_Sigmoid_Sum.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/66_Matmul_Dropout_Softmax.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/67_Conv2d_GELU_GlobalAvgPool.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/68_Matmul_Min_Subtract.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/69_Conv2d_HardSwish_ReLU.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/6_Conv3d_Softmax_MaxPool_MaxPool.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/70_Gemm_Sigmoid_Scaling_ResidualAdd.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/71_Conv2d_Divide_LeakyReLU.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/72_ConvTranspose3d_BatchNorm_AvgPool_AvgPool.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/73_Conv2d_BatchNorm_Scaling.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/74_ConvTranspose3d_LeakyReLU_Multiply_LeakyReLU_Max.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/75_Gemm_GroupNorm_Min_BiasAdd.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/76_Gemm_Add_ReLU.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/77_ConvTranspose3d_Scale_BatchNorm_GlobalAvgPool.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/78_ConvTranspose3d_Max_Max_Sum.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/7_Conv3d_ReLU_LeakyReLU_GELU_Sigmoid_BiasAdd.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/80_Gemm_Max_Subtract_GELU.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/81_Gemm_Swish_Divide_Clamp_Tanh_Clamp.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/82_Conv2d_Tanh_Scaling_BiasAdd_Max.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/83_Conv3d_GroupNorm_Min_Clamp_Dropout.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/84_Gemm_BatchNorm_Scaling_Softmax.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/85_Conv2d_GroupNorm_Scale_MaxPool_Clamp.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/86_Matmul_Divide_GELU.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/87_Conv2d_Subtract_Subtract_Mish.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/88_Gemm_GroupNorm_Swish_Multiply_Swish.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/89_ConvTranspose3d_MaxPool_Softmax_Subtract_Swish_Max.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/8_Conv3d_Divide_Max_GlobalAvgPool_BiasAdd_Sum.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/90_Conv3d_LeakyReLU_Sum_Clamp_GELU.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/91_ConvTranspose2d_Softmax_BiasAdd_Scaling_Sigmoid.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/93_ConvTranspose2d_Add_Min_GELU_Multiply.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/94_Gemm_BiasAdd_Hardtanh_Mish_GroupNorm.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/95_Matmul_Add_Swish_Tanh_GELU_Hardtanh.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/96_ConvTranspose3d_Multiply_Max_GlobalAvgPool_Clamp.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/97_Matmul_BatchNorm_BiasAdd_Divide_Swish.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/98_Matmul_AvgPool_GELU_Scale_Max.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/99_Matmul_GELU_Softmax.py
 create mode 100644 backends/mlir/cpu/KernelBench/level2/9_Matmul_Subtract_Multiply_ReLU.py

diff --git a/backends/mlir/cpu/KernelBench/level2/100_ConvTranspose3d_Clamp_Min_Divide.py b/backends/mlir/cpu/KernelBench/level2/100_ConvTranspose3d_Clamp_Min_Divide.py
new file mode 100644
index 0000000..c6ed29e
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/100_ConvTranspose3d_Clamp_Min_Divide.py
@@ -0,0 +1,37 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static-shape compile (dynamic=False) with the ai_bench MLIR CPU backend
+class Model(nn.Module):
+    """
+    Transposed 3D convolution, then a lower-bound-only clamp at ``min_value``,
+    then element-wise division by ``divisor``.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        min_value,  # lower bound handed to torch.clamp in forward
+        divisor,  # denominator applied to the clamped output in forward
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose3d(
+            in_channels, out_channels, kernel_size, stride=stride, padding=padding
+        )
+        self.min_value = min_value
+        self.divisor = divisor
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = torch.clamp(x, min=self.min_value)  # only ``min`` is passed: no upper bound
+        x = x / self.divisor
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/10_ConvTranspose2d_MaxPool_Hardtanh_Mean_Tanh.py b/backends/mlir/cpu/KernelBench/level2/10_ConvTranspose2d_MaxPool_Hardtanh_Mean_Tanh.py
new file mode 100644
index 0000000..2f006fe
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/10_ConvTranspose2d_MaxPool_Hardtanh_Mean_Tanh.py
@@ -0,0 +1,42 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static-shape compile (dynamic=False) with the ai_bench MLIR CPU backend
+class Model(nn.Module):
+    """
+    Transposed 2D convolution -> max pooling -> Hardtanh -> mean over dims (2, 3) with keepdim=True -> tanh.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        maxpool_kernel_size,
+        maxpool_stride,
+        hardtanh_min,
+        hardtanh_max,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose2d(
+            in_channels, out_channels, kernel_size, stride=stride, padding=padding
+        )
+        self.maxpool = nn.MaxPool2d(
+            kernel_size=maxpool_kernel_size, stride=maxpool_stride
+        )
+        self.hardtanh = nn.Hardtanh(min_val=hardtanh_min, max_val=hardtanh_max)
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = self.maxpool(x)
+        x = self.hardtanh(x)  # clip to [hardtanh_min, hardtanh_max]
+        x = torch.mean(x, dim=(2, 3), keepdim=True)  # dims 2 and 3 collapse to size 1
+        x = torch.tanh(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/11_ConvTranspose2d_BatchNorm_Tanh_MaxPool_GroupNorm.py b/backends/mlir/cpu/KernelBench/level2/11_ConvTranspose2d_BatchNorm_Tanh_MaxPool_GroupNorm.py
new file mode 100644
index 0000000..4825bc2
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/11_ConvTranspose2d_BatchNorm_Tanh_MaxPool_GroupNorm.py
@@ -0,0 +1,40 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static-shape compile (dynamic=False) with the ai_bench MLIR CPU backend
+class Model(nn.Module):
+    """
+    Transposed 2D convolution -> BatchNorm2d -> tanh -> 2x2 max pooling (stride 2) -> GroupNorm with ``num_groups`` groups.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        groups,  # NOTE(review): accepted but never used in this class
+        num_groups,  # number of groups for the final GroupNorm
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose2d(
+            in_channels, out_channels, kernel_size, stride=stride, padding=padding
+        )
+        self.batch_norm = nn.BatchNorm2d(out_channels)
+        self.tanh = nn.Tanh()
+        self.max_pool = nn.MaxPool2d(kernel_size=2, stride=2)  # pool size is hard-coded
+        self.group_norm = nn.GroupNorm(num_groups=num_groups, num_channels=out_channels)
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = self.batch_norm(x)
+        x = self.tanh(x)
+        x = self.max_pool(x)
+        x = self.group_norm(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/12_Gemm_Multiply_LeakyReLU.py b/backends/mlir/cpu/KernelBench/level2/12_Gemm_Multiply_LeakyReLU.py
new file mode 100644
index 0000000..0412a84
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/12_Gemm_Multiply_LeakyReLU.py
@@ -0,0 +1,25 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static-shape compile (dynamic=False) with the ai_bench MLIR CPU backend
+class Model(nn.Module):
+    """
+    Linear layer (the "Gemm"), multiplication by ``multiplier``, then LeakyReLU with slope ``negative_slope``.
+    """
+
+    def __init__(self, in_features, out_features, multiplier, negative_slope):
+        super(Model, self).__init__()
+        self.gemm = nn.Linear(in_features, out_features)
+        self.multiplier = multiplier  # stored as a plain attribute, not an nn.Parameter
+        self.leaky_relu = nn.LeakyReLU(negative_slope)
+
+    def forward(self, x):
+        x = self.gemm(x)
+        x = x * self.multiplier
+        x = self.leaky_relu(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/13_ConvTranspose3d_Mean_Add_Softmax_Tanh_Scaling.py b/backends/mlir/cpu/KernelBench/level2/13_ConvTranspose3d_Mean_Add_Softmax_Tanh_Scaling.py
new file mode 100644
index 0000000..972da54
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/13_ConvTranspose3d_Mean_Add_Softmax_Tanh_Scaling.py
@@ -0,0 +1,40 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static-shape compile (dynamic=False) with the ai_bench MLIR CPU backend
+class Model(nn.Module):
+    """
+    Model that applies, in order:
+    1. Transposed 3D convolution
+    2. Mean over dim 2 with keepdim=True (depth, for a (B, C, D, H, W) tensor)
+    3. Addition of a learnable bias of shape (1, out_channels, 1, 1, 1)
+    4. Softmax over dim 1 (channels)
+    5. Tanh activation
+    6. Multiplication by ``scaling_factor``
+    """
+
+    def __init__(
+        self, in_channels, out_channels, kernel_size, stride, padding, scaling_factor
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose3d(
+            in_channels, out_channels, kernel_size, stride=stride, padding=padding
+        )
+        self.bias = nn.Parameter(
+            torch.randn(1, out_channels, 1, 1, 1)
+        )  # One value per output channel, broadcast over the other dims
+        self.scaling_factor = scaling_factor
+
+    def forward(self, x):
+        x = self.conv_transpose(x)  # (B, C, D, H, W), assuming batched 5-D input
+        x = x.mean(dim=2, keepdim=True)  # Reduce dim 2 (D) to size 1
+        x = x + self.bias  # Per-channel bias add via broadcasting
+        x = torch.softmax(x, dim=1)  # Softmax over channels
+        x = torch.tanh(x)  # Nonlinearity
+        x = x * self.scaling_factor  # Scaling
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/14_Gemm_Divide_Sum_Scaling.py b/backends/mlir/cpu/KernelBench/level2/14_Gemm_Divide_Sum_Scaling.py
new file mode 100644
index 0000000..cf64e34
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/14_Gemm_Divide_Sum_Scaling.py
@@ -0,0 +1,31 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static-shape compile (dynamic=False) with the ai_bench MLIR CPU backend
+class Model(nn.Module):
+    """
+    Matrix multiplication with a learnable weight, division by the constant 2, sum over dim 1, and scaling.
+    """
+
+    def __init__(self, input_size, hidden_size, scaling_factor):
+        super(Model, self).__init__()
+        self.weight = nn.Parameter(torch.randn(hidden_size, input_size))
+        self.scaling_factor = scaling_factor
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, input_size).
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, 1); the hidden dimension is summed away with keepdim=True.
+        """
+        x = torch.matmul(x, self.weight.T)  # Gemm: (batch_size, hidden_size)
+        x = x / 2  # Divide by a hard-coded constant
+        x = torch.sum(x, dim=1, keepdim=True)  # Sum over hidden dim -> (batch_size, 1)
+        x = x * self.scaling_factor  # Scaling
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/15_ConvTranspose3d_BatchNorm_Subtract.py b/backends/mlir/cpu/KernelBench/level2/15_ConvTranspose3d_BatchNorm_Subtract.py
new file mode 100644
index 0000000..27578d0
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/15_ConvTranspose3d_BatchNorm_Subtract.py
@@ -0,0 +1,35 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static-shape compile (dynamic=False) with the ai_bench MLIR CPU backend
+class Model(nn.Module):
+    """
+    Transposed 3D convolution, BatchNorm3d, then subtraction of the mean taken over dims (2, 3, 4).
+    """
+
+    def __init__(
+        self, in_channels, out_channels, kernel_size, stride, padding, bias=True
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            bias=bias,  # forwarded to the convolution only
+        )
+        self.batch_norm = nn.BatchNorm3d(out_channels)
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = self.batch_norm(x)
+        x = x - torch.mean(
+            x, dim=(2, 3, 4), keepdim=True
+        )  # keepdim=True keeps the mean broadcastable against x
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/16_ConvTranspose2d_Mish_Add_Hardtanh_Scaling.py b/backends/mlir/cpu/KernelBench/level2/16_ConvTranspose2d_Mish_Add_Hardtanh_Scaling.py
new file mode 100644
index 0000000..0e155d7
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/16_ConvTranspose2d_Mish_Add_Hardtanh_Scaling.py
@@ -0,0 +1,42 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static-shape compile (dynamic=False) with the ai_bench MLIR CPU backend
+class Model(nn.Module):
+    """
+    Transposed 2D convolution, Mish activation, addition of ``add_value``,
+    Hardtanh with fixed bounds [-1, 1], and multiplication by ``scale``.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        output_padding,
+        add_value,
+        scale,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose2d(
+            in_channels, out_channels, kernel_size, stride, padding, output_padding
+        )  # stride, padding and output_padding are passed positionally
+        self.add_value = add_value
+        self.scale = scale
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = torch.nn.functional.mish(x)  # Mish activation
+        x = x + self.add_value
+        x = torch.nn.functional.hardtanh(
+            x, min_val=-1, max_val=1
+        )  # Hardtanh; the bounds are hard-coded, not constructor arguments
+        x = x * self.scale  # Scaling
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/19_ConvTranspose2d_GELU_GroupNorm.py b/backends/mlir/cpu/KernelBench/level2/19_ConvTranspose2d_GELU_GroupNorm.py
new file mode 100644
index 0000000..2b45748
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/19_ConvTranspose2d_GELU_GroupNorm.py
@@ -0,0 +1,28 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static-shape compile (dynamic=False) with the ai_bench MLIR CPU backend
+class Model(nn.Module):
+    """
+    Transposed 2D convolution, GELU, then GroupNorm with ``num_groups`` groups. NOTE(review): ``groups`` is accepted but unused.
+    """
+
+    def __init__(
+        self, in_channels, out_channels, kernel_size, stride, groups, num_groups
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose2d(
+            in_channels, out_channels, kernel_size, stride=stride
+        )  # ``groups`` is not forwarded to the convolution
+        self.group_norm = nn.GroupNorm(num_groups=num_groups, num_channels=out_channels)
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = torch.nn.functional.gelu(x)
+        x = self.group_norm(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/1_Conv2D_ReLU_BiasAdd.py b/backends/mlir/cpu/KernelBench/level2/1_Conv2D_ReLU_BiasAdd.py
new file mode 100644
index 0000000..c23ca81
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/1_Conv2D_ReLU_BiasAdd.py
@@ -0,0 +1,24 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static-shape compile (dynamic=False) with the ai_bench MLIR CPU backend
+class Model(nn.Module):
+    """
+    2D convolution, ReLU, then addition of a separate learnable bias of shape ``bias_shape``.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, bias_shape):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+        self.bias = nn.Parameter(torch.randn(bias_shape))  # extra bias, applied after ReLU
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = torch.relu(x)
+        x = x + self.bias  # bias_shape must broadcast against the conv output
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/20_ConvTranspose3d_Sum_ResidualAdd_Multiply_ResidualAdd.py b/backends/mlir/cpu/KernelBench/level2/20_ConvTranspose3d_Sum_ResidualAdd_Multiply_ResidualAdd.py
new file mode 100644
index 0000000..6e2f4a3
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/20_ConvTranspose3d_Sum_ResidualAdd_Multiply_ResidualAdd.py
@@ -0,0 +1,44 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static-shape compile (dynamic=False) with the ai_bench MLIR CPU backend
+class Model(nn.Module):
+    """
+    3D transposed convolution followed by a learnable bias add, then a residual add,
+    a multiplication, and another residual add, all against a detached copy of the conv output.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        output_padding,
+        bias_shape,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+        )
+        self.bias = nn.Parameter(torch.randn(bias_shape))
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        original_x = x.clone().detach()  # copy cut off from autograd
+        x = x + self.bias  # the "sum" step: broadcast bias add
+        x = x + original_x  # first residual add
+        x = x * original_x  # element-wise multiply by the conv output
+        x = x + original_x  # second residual add
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/21_Conv2d_Add_Scale_Sigmoid_GroupNorm.py b/backends/mlir/cpu/KernelBench/level2/21_Conv2d_Add_Scale_Sigmoid_GroupNorm.py
new file mode 100644
index 0000000..6215dc6
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/21_Conv2d_Add_Scale_Sigmoid_GroupNorm.py
@@ -0,0 +1,36 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static-shape compile (dynamic=False) with the ai_bench MLIR CPU backend
+class Model(nn.Module):
+    """
+    2D convolution, addition of a learnable bias, multiplication by a learnable scale, sigmoid, then GroupNorm.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        num_groups,
+        bias_shape,
+        scale_shape,
+    ):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+        self.bias = nn.Parameter(torch.randn(bias_shape))  # added after the convolution
+        self.scale = nn.Parameter(torch.randn(scale_shape))  # multiplied after the bias add
+        self.group_norm = nn.GroupNorm(num_groups, out_channels)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = x + self.bias
+        x = x * self.scale
+        x = torch.sigmoid(x)
+        x = self.group_norm(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/23_Conv3d_GroupNorm_Mean.py b/backends/mlir/cpu/KernelBench/level2/23_Conv3d_GroupNorm_Mean.py
new file mode 100644
index 0000000..d43bcb0
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/23_Conv3d_GroupNorm_Mean.py
@@ -0,0 +1,30 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static-shape compile (dynamic=False) with the ai_bench MLIR CPU backend
+class Model(nn.Module):
+    """
+    3D convolution, Group Normalization, then the mean over every dimension except the batch dimension.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, num_groups):
+        super(Model, self).__init__()
+        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size)
+        self.group_norm = nn.GroupNorm(num_groups, out_channels)
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, D, H, W).
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size,); dims 1-4 are reduced without keepdim.
+        """
+        x = self.conv(x)
+        x = self.group_norm(x)
+        x = x.mean(dim=[1, 2, 3, 4])  # Compute mean across all dimensions except batch
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/24_Conv3d_Min_Softmax.py b/backends/mlir/cpu/KernelBench/level2/24_Conv3d_Min_Softmax.py
new file mode 100644
index 0000000..c91ea04
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/24_Conv3d_Min_Softmax.py
@@ -0,0 +1,31 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static-shape compile (dynamic=False) with the ai_bench MLIR CPU backend
+class Model(nn.Module):
+    """
+    3D convolution, minimum along the constructor-supplied dimension ``dim`` (that dimension is dropped),
+    and then softmax over dim 1 of the reduced tensor.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, dim):
+        super(Model, self).__init__()
+        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size)
+        self.dim = dim  # dimension of the conv output that forward reduces with min
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, D, H, W)
+        Returns:
+            torch.Tensor: Conv output with dimension ``self.dim`` removed, e.g. (batch_size, out_channels, H', W') for dim=2
+        """
+        x = self.conv(x)
+        x = torch.min(x, dim=self.dim)[0]  # [0] keeps the values and drops the indices
+        x = torch.softmax(x, dim=1)  # dim 1 of the reduced tensor (channels when self.dim > 1)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/25_Conv2d_Min_Tanh_Tanh.py b/backends/mlir/cpu/KernelBench/level2/25_Conv2d_Min_Tanh_Tanh.py
new file mode 100644
index 0000000..5b97edc
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/25_Conv2d_Min_Tanh_Tanh.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static-shape compile (dynamic=False) with the ai_bench MLIR CPU backend
+class Model(nn.Module):
+    """
+    2D convolution, minimum over dim 1 (kept as a size-1 dimension), then tanh applied twice.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = torch.min(x, dim=1, keepdim=True)[
+            0
+        ]  # Minimum along the channel dimension; [0] keeps values, drops indices
+        x = torch.tanh(x)
+        x = torch.tanh(x)  # second tanh is intentional (see the class docstring)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/26_ConvTranspose3d_Add_HardSwish.py b/backends/mlir/cpu/KernelBench/level2/26_ConvTranspose3d_Add_HardSwish.py
new file mode 100644
index 0000000..7bda6f1
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/26_ConvTranspose3d_Add_HardSwish.py
@@ -0,0 +1,47 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static-shape compile (dynamic=False) with the ai_bench MLIR CPU backend
+class Model(nn.Module):
+    """
+    3D transposed convolution, addition of a second input tensor, then ``x * hardswish(x)`` (not plain HardSwish).
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        output_padding,
+        bias_shape,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+        )
+        self.bias = nn.Parameter(torch.randn(bias_shape))  # NOTE(review): never read in forward
+
+    def forward(self, x, add_input):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, D, H, W).
+            add_input (torch.Tensor): Tensor added to the transposed-convolution output; must broadcast against it.
+        Returns:
+            torch.Tensor: ``y * hardswish(y)`` where ``y = conv_transpose(x) + add_input``.
+        """
+        x = self.conv_transpose(x)
+        x = x + add_input
+        x = x * torch.nn.functional.hardswish(x)  # input is multiplied by its own hardswish
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/27_Conv3d_HardSwish_GroupNorm_Mean.py b/backends/mlir/cpu/KernelBench/level2/27_Conv3d_HardSwish_GroupNorm_Mean.py
new file mode 100644
index 0000000..e225ed7
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/27_Conv3d_HardSwish_GroupNorm_Mean.py
@@ -0,0 +1,30 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static-shape compile (dynamic=False) with the ai_bench MLIR CPU backend
+class Model(nn.Module):
+    """
+    Model that applies, in order:
+    1. Conv3d (``bias`` is forwarded to the convolution)
+    2. HardSwish activation
+    3. GroupNorm with ``num_groups`` groups (default 4)
+    4. Mean over dims 2, 3 and 4, without keepdim
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, num_groups=4, bias=True):
+        super(Model, self).__init__()
+        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, bias=bias)
+        self.group_norm = nn.GroupNorm(num_groups, out_channels)
+
+    def forward(self, x):
+        x = self.conv(x)  # (B, C, D, H, W), assuming batched 5-D input
+        x = F.hardswish(x)  # Nonlinear activation
+        x = self.group_norm(x)  # Group normalization
+        x = torch.mean(x, dim=[2, 3, 4])  # Mean over spatial dims -> (B, C)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/29_Matmul_Mish_Mish.py b/backends/mlir/cpu/KernelBench/level2/29_Matmul_Mish_Mish.py
new file mode 100644
index 0000000..318d5d7
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/29_Matmul_Mish_Mish.py
@@ -0,0 +1,23 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Linear layer followed by two back-to-back Mish activations, compiled for the ai_bench MLIR CPU backend.
+    """
+
+    def __init__(self, in_features, out_features):
+        super(Model, self).__init__()
+        self.linear = nn.Linear(in_features, out_features)
+
+    def forward(self, x):
+        x = self.linear(x)  # last dim: in_features -> out_features
+        x = torch.nn.functional.mish(x)
+        x = torch.nn.functional.mish(x)  # Mish again, on the activated values
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/2_ConvTranspose2d_BiasAdd_Clamp_Scaling_Clamp_Divide.py b/backends/mlir/cpu/KernelBench/level2/2_ConvTranspose2d_BiasAdd_Clamp_Scaling_Clamp_Divide.py
new file mode 100644
index 0000000..76a2d3b
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/2_ConvTranspose2d_BiasAdd_Clamp_Scaling_Clamp_Divide.py
@@ -0,0 +1,45 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    ConvTranspose2d, then a separate learnable bias add, clamp to [0, 1], multiply by scaling_factor, clamp to [0, 1] again, and divide by scaling_factor.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        output_padding,
+        bias_shape,
+        scaling_factor,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+        )
+        self.bias = nn.Parameter(torch.randn(bias_shape))  # extra learnable bias
+        self.scaling_factor = scaling_factor  # stored as given, not a Parameter
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = x + self.bias  # bias_shape must broadcast against the conv output
+        x = torch.clamp(x, min=0.0, max=1.0)
+        x = x * self.scaling_factor
+        x = torch.clamp(x, min=0.0, max=1.0)  # second clamp acts on scaled values
+        x = x / self.scaling_factor  # undo the scaling after clamping
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/30_Gemm_GroupNorm_Hardtanh.py b/backends/mlir/cpu/KernelBench/level2/30_Gemm_GroupNorm_Hardtanh.py
new file mode 100644
index 0000000..9ac11e2
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/30_Gemm_GroupNorm_Hardtanh.py
@@ -0,0 +1,33 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    GEMM (nn.Linear), then GroupNorm over the output features, then HardTanh with the configured bounds.
+    """
+
+    def __init__(
+        self, in_features, out_features, num_groups, hardtanh_min, hardtanh_max
+    ):
+        super(Model, self).__init__()
+        self.gemm = nn.Linear(in_features, out_features)
+        self.group_norm = nn.GroupNorm(num_groups, out_features)
+        self.hardtanh = nn.Hardtanh(min_val=hardtanh_min, max_val=hardtanh_max)
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_features).
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_features), clamped to [hardtanh_min, hardtanh_max].
+        """
+        x = self.gemm(x)
+        x = self.group_norm(x)  # out_features serve as the grouped channel dim
+        x = self.hardtanh(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/31_Conv2d_Min_Add_Multiply.py b/backends/mlir/cpu/KernelBench/level2/31_Conv2d_Min_Add_Multiply.py
new file mode 100644
index 0000000..5cdd12c
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/31_Conv2d_Min_Add_Multiply.py
@@ -0,0 +1,35 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Conv2d, then an elementwise minimum with constant_value, then a learnable bias add, then multiplication by scaling_factor.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        constant_value,
+        bias_shape,
+        scaling_factor,
+    ):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+        self.constant_value = constant_value  # stored as given, not a Parameter
+        self.bias = nn.Parameter(torch.randn(bias_shape))  # extra learnable bias
+        self.scaling_factor = scaling_factor  # stored as given, not a Parameter
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = torch.min(x, torch.tensor(self.constant_value))  # tensor built per call
+        x = x + self.bias  # bias_shape must broadcast against the conv output
+        x = x * self.scaling_factor
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/32_Conv2d_Scaling_Min.py b/backends/mlir/cpu/KernelBench/level2/32_Conv2d_Scaling_Min.py
new file mode 100644
index 0000000..cc20ce8
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/32_Conv2d_Scaling_Min.py
@@ -0,0 +1,30 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a convolution, scales the output, and then takes the minimum across the channel dimension.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, scale_factor):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+        self.scale_factor = scale_factor  # stored as given, not a Parameter
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, 1, H_out, W_out); dim 1 is reduced by the min and kept with size 1, H_out/W_out are the convolution's output sizes.
+        """
+        x = self.conv(x)
+        x = x * self.scale_factor
+        x = torch.min(x, dim=1, keepdim=True)[0]  # [0] = values; indices dropped
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/33_Gemm_Scale_BatchNorm.py b/backends/mlir/cpu/KernelBench/level2/33_Gemm_Scale_BatchNorm.py
new file mode 100644
index 0000000..03d0b6b
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/33_Gemm_Scale_BatchNorm.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    GEMM (nn.Linear), then an elementwise multiply by a learnable scale tensor,
+    and then BatchNorm1d over the out_features dimension.
+    """
+
+    def __init__(self, in_features, out_features, scale_shape, eps=1e-5, momentum=0.1):
+        super(Model, self).__init__()
+        self.gemm = nn.Linear(in_features, out_features)
+        self.scale = nn.Parameter(torch.randn(scale_shape))  # learnable, random init
+        self.bn = nn.BatchNorm1d(out_features, eps=eps, momentum=momentum)
+
+    def forward(self, x):
+        x = self.gemm(x)
+        x = x * self.scale  # scale_shape must broadcast against the GEMM output
+        x = self.bn(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/34_ConvTranspose3d_LayerNorm_GELU_Scaling.py b/backends/mlir/cpu/KernelBench/level2/34_ConvTranspose3d_LayerNorm_GELU_Scaling.py
new file mode 100644
index 0000000..775512d
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/34_ConvTranspose3d_LayerNorm_GELU_Scaling.py
@@ -0,0 +1,50 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a 3D transposed convolution, layer normalization, GELU activation, and multiplication by a fixed scaling factor.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        bias=True,
+        eps=1e-5,
+        scaling_factor=1.0,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            bias=bias,
+        )
+        self.layer_norm = nn.LayerNorm(out_channels, eps=eps)  # acts on the LAST dim
+        self.scaling_factor = scaling_factor  # stored as given, not a Parameter
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, D, H, W).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, D', H', W').
+        """
+        x = self.conv_transpose(x)
+        x = self.layer_norm(x)  # NOTE(review): needs last dim W' == out_channels
+        x = torch.nn.functional.gelu(x)
+        x = x * self.scaling_factor
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/35_Conv2d_Subtract_HardSwish_MaxPool_Mish.py b/backends/mlir/cpu/KernelBench/level2/35_Conv2d_Subtract_HardSwish_MaxPool_Mish.py
new file mode 100644
index 0000000..7261a49
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/35_Conv2d_Subtract_HardSwish_MaxPool_Mish.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Conv2d, then subtraction of subtract_value, then HardSwish, MaxPool2d, and finally Mish, applied in that order.
+    """
+
+    def __init__(
+        self, in_channels, out_channels, kernel_size, subtract_value, pool_kernel_size
+    ):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+        self.subtract_value = subtract_value  # stored as given, not a Parameter
+        self.pool = nn.MaxPool2d(pool_kernel_size)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = x - self.subtract_value
+        x = torch.nn.functional.hardswish(x)
+        x = self.pool(x)  # spatial downsampling happens before the final Mish
+        x = torch.nn.functional.mish(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/36_ConvTranspose2d_Min_Sum_GELU_Add.py b/backends/mlir/cpu/KernelBench/level2/36_ConvTranspose2d_Min_Sum_GELU_Add.py
new file mode 100644
index 0000000..8ecf374
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/36_ConvTranspose2d_Min_Sum_GELU_Add.py
@@ -0,0 +1,39 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    ConvTranspose2d, then min over dim 1, sum over dim 2 (both with keepdim), GELU, and a learnable bias add.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        output_padding,
+        bias_shape,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose2d(
+            in_channels, out_channels, kernel_size, stride, padding, output_padding
+        )
+        self.bias = nn.Parameter(torch.randn(bias_shape))  # extra learnable bias
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = torch.min(x, dim=1, keepdim=True)[
+            0
+        ]  # Keep the min values (index 0), drop the indices; dim 1 becomes size 1
+        x = torch.sum(x, dim=2, keepdim=True)  # Sum over dim 2, kept with size 1
+        x = torch.nn.functional.gelu(x)  # GELU activation
+        x = x + self.bias  # bias_shape must broadcast against the reduced tensor
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/37_Matmul_Swish_Sum_GroupNorm.py b/backends/mlir/cpu/KernelBench/level2/37_Matmul_Swish_Sum_GroupNorm.py
new file mode 100644
index 0000000..03ec171
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/37_Matmul_Swish_Sum_GroupNorm.py
@@ -0,0 +1,32 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    A model that applies a linear layer, Swish (x * sigmoid(x)), a learnable bias add, and GroupNorm over the output features.
+    """
+
+    def __init__(self, in_features, out_features, num_groups, bias_shape):
+        super(Model, self).__init__()
+        self.matmul = nn.Linear(in_features, out_features)
+        self.bias = nn.Parameter(torch.randn(bias_shape))  # added after the Swish
+        self.group_norm = nn.GroupNorm(num_groups, out_features)
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_features).
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_features).
+        """
+        x = self.matmul(x)
+        x = torch.sigmoid(x) * x  # Swish activation
+        x = x + self.bias  # bias_shape must broadcast against the linear output
+        x = self.group_norm(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/38_ConvTranspose3d_AvgPool_Clamp_Softmax_Multiply.py b/backends/mlir/cpu/KernelBench/level2/38_ConvTranspose3d_AvgPool_Clamp_Softmax_Multiply.py
new file mode 100644
index 0000000..a821c1e
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/38_ConvTranspose3d_AvgPool_Clamp_Softmax_Multiply.py
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs, in this order: average pooling, 3D transposed convolution,
+    clamping, softmax over the flattened spatial dims, and a per-channel learnable scale.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        output_padding,
+        pool_kernel_size,
+        clamp_min,
+        clamp_max,
+    ):
+        super(Model, self).__init__()
+        self.avg_pool = nn.AvgPool3d(pool_kernel_size)
+        self.conv_transpose = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+        )
+        self.clamp_min = clamp_min
+        self.clamp_max = clamp_max
+        self.scale = nn.Parameter(torch.ones(1, out_channels, 1, 1, 1))  # init to 1
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, depth, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, D', H', W'), the spatial sizes left by the pooling and transposed convolution.
+        """
+        x = self.avg_pool(x)  # pooling runs BEFORE the transposed convolution
+        x = self.conv_transpose(x)
+        x = torch.clamp(x, self.clamp_min, self.clamp_max)
+        b, c, d, h, w = x.shape
+        x = x.view(b, c, -1)  # flatten spatial dims
+        x = torch.softmax(x, dim=2)  # normalizes over all d*h*w positions
+        x = x.view(b, c, d, h, w)  # restore the 5-D layout
+        x = x * self.scale  # one scale value per output channel, broadcast
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/39_Gemm_Scale_BatchNorm.py b/backends/mlir/cpu/KernelBench/level2/39_Gemm_Scale_BatchNorm.py
new file mode 100644
index 0000000..87b120a
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/39_Gemm_Scale_BatchNorm.py
@@ -0,0 +1,25 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Linear layer, then an elementwise multiply by a learnable scale tensor, then BatchNorm1d over the out_features dimension.
+    """
+
+    def __init__(self, in_features, out_features, scale_shape, eps=1e-5, momentum=0.1):
+        super(Model, self).__init__()
+        self.gemm = nn.Linear(in_features, out_features)
+        self.scale = nn.Parameter(torch.randn(scale_shape))  # learnable, random init
+        self.bn = nn.BatchNorm1d(out_features, eps=eps, momentum=momentum)
+
+    def forward(self, x):
+        x = self.gemm(x)
+        x = x * self.scale  # scale_shape must broadcast against the GEMM output
+        x = self.bn(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/3_ConvTranspose3d_Sum_LayerNorm_AvgPool_GELU.py b/backends/mlir/cpu/KernelBench/level2/3_ConvTranspose3d_Sum_LayerNorm_AvgPool_GELU.py
new file mode 100644
index 0000000..282588d
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/3_ConvTranspose3d_Sum_LayerNorm_AvgPool_GELU.py
@@ -0,0 +1,47 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a 3D transposed convolution, adds a learnable sum_weight, then applies layer normalization, average pooling, and GELU.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        output_padding,
+        sum_weight,
+        norm_shape,
+        pool_kernel_size,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+        )
+        self.sum_weight = nn.Parameter(torch.tensor(sum_weight))  # learnable addend
+        self.norm = nn.LayerNorm(norm_shape)  # normalizes the trailing dim(s)
+        self.avg_pool = nn.AvgPool3d(kernel_size=pool_kernel_size)
+        self.gelu = nn.GELU()
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = x + self.sum_weight  # the "sum" step: adds the learnable value
+        x = self.norm(x)  # NOTE(review): trailing dims must match norm_shape
+        x = self.avg_pool(x)
+        x = self.gelu(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/40_Matmul_Scaling_ResidualAdd.py b/backends/mlir/cpu/KernelBench/level2/40_Matmul_Scaling_ResidualAdd.py
new file mode 100644
index 0000000..1a22ddc
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/40_Matmul_Scaling_ResidualAdd.py
@@ -0,0 +1,39 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    A model that applies a linear layer, scales the result, and adds back a detached copy of the unscaled result.
+
+    Args:
+        in_features (int): Number of input features.
+        out_features (int): Number of output features.
+        scaling_factor (float): Scaling factor to apply after matrix multiplication.
+    """
+
+    def __init__(self, in_features, out_features, scaling_factor):
+        super(Model, self).__init__()
+        self.matmul = nn.Linear(in_features, out_features)
+        self.scaling_factor = scaling_factor  # stored as given, not a Parameter
+
+    def forward(self, x):
+        """
+        Compute linear(x) * scaling_factor + linear(x), with the second term detached.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_features).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_features).
+        """
+        x = self.matmul(x)
+        original_x = x.clone().detach()  # residual copy, cut from autograd
+        x = x * self.scaling_factor
+        x = x + original_x  # gradients flow only through the scaled branch
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/41_Gemm_BatchNorm_GELU_ReLU.py b/backends/mlir/cpu/KernelBench/level2/41_Gemm_BatchNorm_GELU_ReLU.py
new file mode 100644
index 0000000..69cbec3
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/41_Gemm_BatchNorm_GELU_ReLU.py
@@ -0,0 +1,31 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a GEMM (nn.Linear), BatchNorm1d, GELU, and ReLU in sequence.
+    """
+
+    def __init__(self, in_features, out_features):
+        super(Model, self).__init__()
+        self.gemm = nn.Linear(in_features, out_features)
+        self.batch_norm = nn.BatchNorm1d(out_features)  # default eps and momentum
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_features).
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_features); non-negative because ReLU is applied last.
+        """
+        x = self.gemm(x)
+        x = self.batch_norm(x)
+        x = torch.nn.functional.gelu(x)
+        x = torch.relu(x)  # zeroes the negative values GELU lets through
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/44_ConvTranspose2d_Multiply_GlobalAvgPool_GlobalAvgPool_Mean.py b/backends/mlir/cpu/KernelBench/level2/44_ConvTranspose2d_Multiply_GlobalAvgPool_GlobalAvgPool_Mean.py
new file mode 100644
index 0000000..ca856c5
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/44_ConvTranspose2d_Multiply_GlobalAvgPool_GlobalAvgPool_Mean.py
@@ -0,0 +1,42 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a transposed convolution, multiplies by a scalar, applies global average pooling,
+    and then a second global average pooling over the same (already size-1) dims.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        output_padding,
+        multiplier,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+        )
+        self.multiplier = multiplier  # stored as given, not a Parameter
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = x * self.multiplier
+        x = torch.mean(x, dim=[2, 3], keepdim=True)  # First global average pooling
+        x = torch.mean(x, dim=[2, 3], keepdim=True)  # Second: dims 2, 3 are size 1
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/46_Conv2d_Subtract_Tanh_Subtract_AvgPool.py b/backends/mlir/cpu/KernelBench/level2/46_Conv2d_Subtract_Tanh_Subtract_AvgPool.py
new file mode 100644
index 0000000..480264a
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/46_Conv2d_Subtract_Tanh_Subtract_AvgPool.py
@@ -0,0 +1,36 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Conv2d, subtract subtract1_value, tanh, subtract subtract2_value, then AvgPool2d, applied in that order.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        subtract1_value,
+        subtract2_value,
+        kernel_size_pool,
+    ):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+        self.subtract1_value = subtract1_value  # applied before tanh
+        self.subtract2_value = subtract2_value  # applied after tanh
+        self.avgpool = nn.AvgPool2d(kernel_size_pool)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = x - self.subtract1_value
+        x = torch.tanh(x)
+        x = x - self.subtract2_value
+        x = self.avgpool(x)  # pooling is the final step
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/47_Conv3d_Mish_Tanh.py b/backends/mlir/cpu/KernelBench/level2/47_Conv3d_Mish_Tanh.py
new file mode 100644
index 0000000..1dfa9ef
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/47_Conv3d_Mish_Tanh.py
@@ -0,0 +1,32 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Conv3d followed by Mish and then Tanh, both applied elementwise to the convolution output.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
+        super(Model, self).__init__()
+        self.conv = nn.Conv3d(
+            in_channels, out_channels, kernel_size, stride=stride, padding=padding
+        )
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, D, H, W).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, D', H', W').
+        """
+        x = self.conv(x)
+        x = torch.nn.functional.mish(x)
+        x = torch.tanh(x)  # applied to the Mish output, not the raw conv output
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/48_Conv3d_Scaling_Tanh_Multiply_Sigmoid.py b/backends/mlir/cpu/KernelBench/level2/48_Conv3d_Scaling_Tanh_Multiply_Sigmoid.py
new file mode 100644
index 0000000..5a71b0e
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/48_Conv3d_Scaling_Tanh_Multiply_Sigmoid.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a 3D convolution, multiplies by a learnable scaling_factor, applies tanh, multiplies by a learnable `bias` tensor, and applies sigmoid.
+    """
+
+    def __init__(
+        self, in_channels, out_channels, kernel_size, scaling_factor, bias_shape
+    ):
+        super(Model, self).__init__()
+        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size)
+        self.scaling_factor = nn.Parameter(torch.randn(bias_shape))  # arg value unused
+        self.bias = nn.Parameter(torch.randn(bias_shape))
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = x * self.scaling_factor
+        x = torch.tanh(x)
+        x = x * self.bias  # despite its name, `bias` is multiplied, not added
+        x = torch.sigmoid(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/49_ConvTranspose3d_Softmax_Sigmoid.py b/backends/mlir/cpu/KernelBench/level2/49_ConvTranspose3d_Softmax_Sigmoid.py
new file mode 100644
index 0000000..1fd8be5
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/49_ConvTranspose3d_Softmax_Sigmoid.py
@@ -0,0 +1,49 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a 3D transposed convolution, applies Softmax over dim 1, and then an elementwise Sigmoid.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        output_padding,
+        bias=True,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            bias=bias,
+        )
+        self.softmax = nn.Softmax(dim=1)  # dim 1 = channels for a batched 5-D input
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, D, H, W).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, D', H', W'), with D', H', W' set by the transposed convolution.
+        """
+        x = self.conv_transpose(x)
+        x = self.softmax(x)
+        x = self.sigmoid(x)  # sigmoid of the softmax probabilities
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/4_Conv2d_Mish_Mish.py b/backends/mlir/cpu/KernelBench/level2/4_Conv2d_Mish_Mish.py
new file mode 100644
index 0000000..cfaac8e
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/4_Conv2d_Mish_Mish.py
@@ -0,0 +1,23 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Conv2d followed by two back-to-back Mish activations, compiled for the ai_bench MLIR CPU backend.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = torch.nn.functional.mish(x)
+        x = torch.nn.functional.mish(x)  # Mish again, on the activated values
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/50_ConvTranspose3d_Scaling_AvgPool_BiasAdd_Scaling.py b/backends/mlir/cpu/KernelBench/level2/50_ConvTranspose3d_Scaling_AvgPool_BiasAdd_Scaling.py
new file mode 100644
index 0000000..19c9ac7
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/50_ConvTranspose3d_Scaling_AvgPool_BiasAdd_Scaling.py
@@ -0,0 +1,41 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a 3D transposed convolution, a learnable scaling, average pooling, a learnable bias addition, and a second learnable scaling.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        scale1,
+        scale2,
+        bias_shape,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose3d(
+            in_channels, out_channels, kernel_size, stride=stride, padding=padding
+        )
+        self.scale1 = nn.Parameter(torch.tensor(scale1))  # learnable, init = scale1
+        self.avg_pool = nn.AvgPool3d(kernel_size=2)  # pool size is fixed at 2
+        self.bias = nn.Parameter(torch.randn(bias_shape))  # extra learnable bias
+        self.scale2 = nn.Parameter(torch.tensor(scale2))  # learnable, init = scale2
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = x * self.scale1
+        x = self.avg_pool(x)
+        x = x + self.bias  # bias_shape must broadcast against the pooled tensor
+        x = x * self.scale2
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/53_Gemm_Scaling_Hardtanh_GELU.py b/backends/mlir/cpu/KernelBench/level2/53_Gemm_Scaling_Hardtanh_GELU.py
new file mode 100644
index 0000000..30fcca2
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/53_Gemm_Scaling_Hardtanh_GELU.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a GEMM (nn.Linear), multiplies by scaling_factor, clamps with Hardtanh, and applies GELU.
+    """
+
+    def __init__(
+        self, in_features, out_features, scaling_factor, hardtanh_min, hardtanh_max
+    ):
+        super(Model, self).__init__()
+        self.gemm = nn.Linear(in_features, out_features)
+        self.scaling_factor = scaling_factor  # stored as given, not a Parameter
+        self.hardtanh = nn.Hardtanh(min_val=hardtanh_min, max_val=hardtanh_max)
+        self.gelu = nn.GELU()
+
+    def forward(self, x):
+        x = self.gemm(x)
+        x = x * self.scaling_factor
+        x = self.hardtanh(x)  # clamp to [hardtanh_min, hardtanh_max]
+        x = self.gelu(x)  # GELU sees only the clamped range
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/54_Conv2d_Multiply_LeakyReLU_GELU.py b/backends/mlir/cpu/KernelBench/level2/54_Conv2d_Multiply_LeakyReLU_GELU.py
new file mode 100644
index 0000000..4e3f8f5
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/54_Conv2d_Multiply_LeakyReLU_GELU.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a convolution, multiplies by a learnable tensor of shape multiplier_shape, applies LeakyReLU, and then GELU.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, multiplier_shape):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+        self.multiplier = nn.Parameter(torch.randn(multiplier_shape))
+        self.leaky_relu = nn.LeakyReLU()  # default negative slope
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = x * self.multiplier  # multiplier_shape must broadcast against conv output
+        x = self.leaky_relu(x)
+        x = torch.nn.functional.gelu(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/55_Matmul_MaxPool_Sum_Scale.py b/backends/mlir/cpu/KernelBench/level2/55_Matmul_MaxPool_Sum_Scale.py
new file mode 100644
index 0000000..8bc78dd
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/55_Matmul_MaxPool_Sum_Scale.py
@@ -0,0 +1,33 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a linear layer, 1D max pooling over the features, a sum over dim 1, and scaling.
+    """
+
+    def __init__(self, in_features, out_features, kernel_size, scale_factor):
+        super(Model, self).__init__()
+        self.matmul = nn.Linear(in_features, out_features)
+        self.max_pool = nn.MaxPool1d(kernel_size)
+        self.scale_factor = scale_factor
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_features).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size,); the pooled features are summed away.
+        """
+        x = self.matmul(x)
+        x = self.max_pool(x.unsqueeze(1)).squeeze(1)  # add a dim at index 1 for MaxPool1d, then drop it
+        x = torch.sum(x, dim=1)
+        x = x * self.scale_factor
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/56_Matmul_Sigmoid_Sum.py b/backends/mlir/cpu/KernelBench/level2/56_Matmul_Sigmoid_Sum.py
new file mode 100644
index 0000000..4b0f5c3
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/56_Matmul_Sigmoid_Sum.py
@@ -0,0 +1,30 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that applies a linear layer, applies sigmoid, and sums the result over dim 1 (keeping that dim).
+    """
+
+    def __init__(self, input_size, hidden_size):
+        super(Model, self).__init__()
+        self.linear = nn.Linear(input_size, hidden_size)
+
+    def forward(self, x):
+        """
+        Args:
+            x: Input tensor of shape (batch_size, input_size).
+
+        Returns:
+            Output tensor of shape (batch_size, 1).
+        """
+        x = self.linear(x)
+        x = torch.sigmoid(x)
+        x = torch.sum(x, dim=1, keepdim=True)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/57_Conv2d_ReLU_HardSwish.py b/backends/mlir/cpu/KernelBench/level2/57_Conv2d_ReLU_HardSwish.py
new file mode 100644
index 0000000..950fef9
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/57_Conv2d_ReLU_HardSwish.py
@@ -0,0 +1,23 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a convolution, applies ReLU, and applies a hand-written HardSwish activation.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = torch.relu(x)
+        x = x * torch.clamp((x + 3) / 6, 0, 1)  # HardSwish, written out with primitive ops
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/59_Matmul_Swish_Scaling.py b/backends/mlir/cpu/KernelBench/level2/59_Matmul_Swish_Scaling.py
new file mode 100644
index 0000000..55fe1b2
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/59_Matmul_Swish_Scaling.py
@@ -0,0 +1,24 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that applies a linear layer, a Swish activation, and scales the result by `scaling_factor`.
+    """
+
+    def __init__(self, in_features, out_features, scaling_factor):
+        super(Model, self).__init__()
+        self.matmul = nn.Linear(in_features, out_features)
+        self.scaling_factor = scaling_factor
+
+    def forward(self, x):
+        x = self.matmul(x)
+        x = x * torch.sigmoid(x)  # Swish activation: x * sigmoid(x)
+        x = x * self.scaling_factor
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/5_ConvTranspose2d_Subtract_Tanh.py b/backends/mlir/cpu/KernelBench/level2/5_ConvTranspose2d_Subtract_Tanh.py
new file mode 100644
index 0000000..6717c75
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/5_ConvTranspose2d_Subtract_Tanh.py
@@ -0,0 +1,40 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a transposed convolution, subtracts a learnable bias of shape `bias_shape`, and applies tanh.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        bias_shape,
+        stride=2,
+        padding=1,
+        output_padding=1,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+        )
+        self.bias = nn.Parameter(torch.randn(bias_shape))
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = x - self.bias
+        x = torch.tanh(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/60_ConvTranspose3d_Swish_GroupNorm_HardSwish.py b/backends/mlir/cpu/KernelBench/level2/60_ConvTranspose3d_Swish_GroupNorm_HardSwish.py
new file mode 100644
index 0000000..b5f7a22
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/60_ConvTranspose3d_Swish_GroupNorm_HardSwish.py
@@ -0,0 +1,45 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a 3D transposed convolution, applies Swish activation,
+    group normalization over `groups` groups, and then HardSwish activation.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        groups,
+        eps,
+        bias=True,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            bias=bias,
+        )
+        self.group_norm = nn.GroupNorm(
+            num_groups=groups, num_channels=out_channels, eps=eps
+        )
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = torch.sigmoid(x) * x  # Swish activation: sigmoid(x) * x
+        x = self.group_norm(x)
+        x = torch.nn.functional.hardswish(x)  # HardSwish activation
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/61_ConvTranspose3d_ReLU_GroupNorm.py b/backends/mlir/cpu/KernelBench/level2/61_ConvTranspose3d_ReLU_GroupNorm.py
new file mode 100644
index 0000000..dd722fe
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/61_ConvTranspose3d_ReLU_GroupNorm.py
@@ -0,0 +1,34 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a transposed 3D convolution, applies ReLU, and then applies group normalization.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, groups, bias=False):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose3d(
+            in_channels, out_channels, kernel_size, bias=bias
+        )
+        self.relu = nn.ReLU()
+        self.group_norm = nn.GroupNorm(num_groups=groups, num_channels=out_channels)
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, D, H, W).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, D', H', W'); spatial sizes are set by the transposed convolution.
+        """
+        x = self.conv_transpose(x)
+        x = self.relu(x)
+        x = self.group_norm(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/62_Matmul_GroupNorm_LeakyReLU_Sum.py b/backends/mlir/cpu/KernelBench/level2/62_Matmul_GroupNorm_LeakyReLU_Sum.py
new file mode 100644
index 0000000..8cc3a9e
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/62_Matmul_GroupNorm_LeakyReLU_Sum.py
@@ -0,0 +1,37 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    A model that performs a linear layer, group normalization, leaky ReLU activation, and an element-wise sum of the result with itself.
+    """
+
+    def __init__(
+        self, input_size, hidden_size, num_groups, eps=1e-5, negative_slope=0.01
+    ):
+        super(Model, self).__init__()
+        self.fc = nn.Linear(input_size, hidden_size)
+        self.gn = nn.GroupNorm(num_groups=num_groups, num_channels=hidden_size, eps=eps)
+        self.leaky_relu = nn.LeakyReLU(negative_slope=negative_slope)
+
+    def forward(self, x):
+        """
+        Performs the forward pass of the model.
+
+        Args:
+            x: Input tensor of shape (batch_size, input_size).
+
+        Returns:
+            Output tensor of shape (batch_size, hidden_size).
+        """
+        x = self.fc(x)
+        x = self.gn(x)
+        x = self.leaky_relu(x)
+        x = x + x  # element-wise sum with itself, i.e. doubles the activation
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/63_Gemm_ReLU_Divide.py b/backends/mlir/cpu/KernelBench/level2/63_Gemm_ReLU_Divide.py
new file mode 100644
index 0000000..2937c37
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/63_Gemm_ReLU_Divide.py
@@ -0,0 +1,24 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that applies a linear layer, applies ReLU, and divides by the constant `divisor`.
+    """
+
+    def __init__(self, in_features, out_features, divisor):
+        super(Model, self).__init__()
+        self.linear = nn.Linear(in_features, out_features)
+        self.divisor = divisor
+
+    def forward(self, x):
+        x = self.linear(x)
+        x = torch.relu(x)
+        x = x / self.divisor
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/65_Conv2d_AvgPool_Sigmoid_Sum.py b/backends/mlir/cpu/KernelBench/level2/65_Conv2d_AvgPool_Sigmoid_Sum.py
new file mode 100644
index 0000000..dabbc27
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/65_Conv2d_AvgPool_Sigmoid_Sum.py
@@ -0,0 +1,25 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    This model performs a convolution, average pooling, applies sigmoid, and sums the result over dims 1, 2 and 3.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, pool_kernel_size):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+        self.avg_pool = nn.AvgPool2d(pool_kernel_size)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.avg_pool(x)
+        x = torch.sigmoid(x)
+        x = torch.sum(x, dim=[1, 2, 3])  # Sum over the channel and both spatial dimensions
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/66_Matmul_Dropout_Softmax.py b/backends/mlir/cpu/KernelBench/level2/66_Matmul_Dropout_Softmax.py
new file mode 100644
index 0000000..64f94db
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/66_Matmul_Dropout_Softmax.py
@@ -0,0 +1,31 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    A model that applies a linear layer, applies dropout with probability `dropout_p`, and then applies softmax.
+    """
+
+    def __init__(self, in_features, out_features, dropout_p):
+        super(Model, self).__init__()
+        self.matmul = nn.Linear(in_features, out_features)
+        self.dropout = nn.Dropout(dropout_p)
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_features).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_features).
+        """
+        x = self.matmul(x)
+        x = self.dropout(x)
+        x = torch.softmax(x, dim=1)  # Softmax over features
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/67_Conv2d_GELU_GlobalAvgPool.py b/backends/mlir/cpu/KernelBench/level2/67_Conv2d_GELU_GlobalAvgPool.py
new file mode 100644
index 0000000..9ffcf10
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/67_Conv2d_GELU_GlobalAvgPool.py
@@ -0,0 +1,30 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a convolution, applies GELU, and then performs global average pooling.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+
+    def forward(self, x):
+        """
+        Args:
+            x: Input tensor of shape (batch_size, in_channels, height, width)
+        Returns:
+            Output tensor of shape (batch_size, out_channels)
+        """
+        x = self.conv(x)
+        x = torch.nn.functional.gelu(x)
+        x = torch.nn.functional.adaptive_avg_pool2d(x, 1)  # pool each channel down to 1x1
+        x = x.squeeze(-1).squeeze(-1)  # drop the two singleton spatial dims
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/68_Matmul_Min_Subtract.py b/backends/mlir/cpu/KernelBench/level2/68_Matmul_Min_Subtract.py
new file mode 100644
index 0000000..eedfd43
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/68_Matmul_Min_Subtract.py
@@ -0,0 +1,24 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that applies a linear layer, takes the element-wise minimum with a learnable constant, and subtracts that constant.
+    """
+
+    def __init__(self, in_features, out_features, constant):
+        super(Model, self).__init__()
+        self.linear = nn.Linear(in_features, out_features)
+        self.constant = nn.Parameter(torch.tensor(constant))
+
+    def forward(self, x):
+        x = self.linear(x)
+        x = torch.min(x, self.constant)
+        x = x - self.constant
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/69_Conv2d_HardSwish_ReLU.py b/backends/mlir/cpu/KernelBench/level2/69_Conv2d_HardSwish_ReLU.py
new file mode 100644
index 0000000..0448963
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/69_Conv2d_HardSwish_ReLU.py
@@ -0,0 +1,30 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a convolution, applies HardSwish, and then ReLU.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_channels, H_out, W_out); spatial sizes are set by the convolution.
+        """
+        x = self.conv(x)
+        x = torch.nn.functional.hardswish(x)
+        x = torch.relu(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/6_Conv3d_Softmax_MaxPool_MaxPool.py b/backends/mlir/cpu/KernelBench/level2/6_Conv3d_Softmax_MaxPool_MaxPool.py
new file mode 100644
index 0000000..db7e68c
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/6_Conv3d_Softmax_MaxPool_MaxPool.py
@@ -0,0 +1,32 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a 3D convolution, applies Softmax over dim 1, and performs two max pooling operations.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, pool_kernel_size):
+        super(Model, self).__init__()
+        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size)
+        self.pool1 = nn.MaxPool3d(pool_kernel_size)
+        self.pool2 = nn.MaxPool3d(pool_kernel_size)
+
+    def forward(self, x):
+        """
+        Args:
+            x: Input tensor of shape (batch_size, in_channels, depth, height, width)
+        Returns:
+            Output tensor of shape (batch_size, out_channels, depth', height', width') where depth', height', width' are the dimensions after convolution and pooling.
+        """
+        x = self.conv(x)
+        x = torch.softmax(x, dim=1)
+        x = self.pool1(x)
+        x = self.pool2(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/70_Gemm_Sigmoid_Scaling_ResidualAdd.py b/backends/mlir/cpu/KernelBench/level2/70_Gemm_Sigmoid_Scaling_ResidualAdd.py
new file mode 100644
index 0000000..2a2e140
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/70_Gemm_Sigmoid_Scaling_ResidualAdd.py
@@ -0,0 +1,35 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model implementing the pattern "Gemm_Sigmoid_Scaling_ResidualAdd": y = gemm(x); out = sigmoid(y) * scaling_factor + y.
+    """
+
+    def __init__(self, input_size, hidden_size, scaling_factor):
+        super(Model, self).__init__()
+        self.gemm = nn.Linear(input_size, hidden_size)
+        self.scaling_factor = scaling_factor
+
+    def forward(self, x):
+        """
+        Forward pass of the model.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, input_size).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, hidden_size).
+        """
+        x = self.gemm(x)
+        original_x = x  # keep the GEMM output for the residual add below
+        x = torch.sigmoid(x)
+        x = x * self.scaling_factor
+        x = x + original_x
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/71_Conv2d_Divide_LeakyReLU.py b/backends/mlir/cpu/KernelBench/level2/71_Conv2d_Divide_LeakyReLU.py
new file mode 100644
index 0000000..91e61f2
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/71_Conv2d_Divide_LeakyReLU.py
@@ -0,0 +1,24 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a convolution, divides by the constant `divisor`, and applies LeakyReLU (negative_slope=0.01).
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, divisor):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+        self.divisor = divisor
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = x / self.divisor
+        x = torch.nn.functional.leaky_relu(x, negative_slope=0.01)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/72_ConvTranspose3d_BatchNorm_AvgPool_AvgPool.py b/backends/mlir/cpu/KernelBench/level2/72_ConvTranspose3d_BatchNorm_AvgPool_AvgPool.py
new file mode 100644
index 0000000..846be2d
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/72_ConvTranspose3d_BatchNorm_AvgPool_AvgPool.py
@@ -0,0 +1,32 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    A model that performs a 3D transposed convolution, followed by batch normalization
+    and two average pooling layers (kernel size 2 each). `bias_shape` is accepted but never used.
+    """
+
+    def __init__(
+        self, in_channels, out_channels, kernel_size, stride, padding, bias_shape
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose3d(
+            in_channels, out_channels, kernel_size, stride=stride, padding=padding
+        )
+        self.batch_norm = nn.BatchNorm3d(out_channels)
+        self.avg_pool1 = nn.AvgPool3d(kernel_size=2)
+        self.avg_pool2 = nn.AvgPool3d(kernel_size=2)
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = self.batch_norm(x)
+        x = self.avg_pool1(x)
+        x = self.avg_pool2(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/73_Conv2d_BatchNorm_Scaling.py b/backends/mlir/cpu/KernelBench/level2/73_Conv2d_BatchNorm_Scaling.py
new file mode 100644
index 0000000..9d1317f
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/73_Conv2d_BatchNorm_Scaling.py
@@ -0,0 +1,25 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a convolution, applies Batch Normalization, and scales the output by `scaling_factor`.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, scaling_factor):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+        self.bn = nn.BatchNorm2d(out_channels)
+        self.scaling_factor = scaling_factor
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        x = x * self.scaling_factor
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/74_ConvTranspose3d_LeakyReLU_Multiply_LeakyReLU_Max.py b/backends/mlir/cpu/KernelBench/level2/74_ConvTranspose3d_LeakyReLU_Multiply_LeakyReLU_Max.py
new file mode 100644
index 0000000..b0e8345
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/74_ConvTranspose3d_LeakyReLU_Multiply_LeakyReLU_Max.py
@@ -0,0 +1,45 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a 3D transposed convolution, applies LeakyReLU (slope 0.2), multiplies by a learnable parameter,
+    applies the same LeakyReLU again, and performs a max pooling operation (kernel size 2).
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        output_padding,
+        multiplier_shape,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+        )
+        self.multiplier = nn.Parameter(torch.randn(multiplier_shape))
+        self.leaky_relu = nn.LeakyReLU(negative_slope=0.2)  # shared by both activations in forward
+        self.max_pool = nn.MaxPool3d(kernel_size=2)
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = self.leaky_relu(x)
+        x = x * self.multiplier
+        x = self.leaky_relu(x)
+        x = self.max_pool(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/75_Gemm_GroupNorm_Min_BiasAdd.py b/backends/mlir/cpu/KernelBench/level2/75_Gemm_GroupNorm_Min_BiasAdd.py
new file mode 100644
index 0000000..c6f6fab
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/75_Gemm_GroupNorm_Min_BiasAdd.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a GEMM, Group Normalization, a minimum reduction over dim 1, and addition of a learnable bias.
+    """
+
+    def __init__(self, in_features, out_features, num_groups, bias_shape):
+        super(Model, self).__init__()
+        self.gemm = nn.Linear(in_features, out_features)
+        self.group_norm = nn.GroupNorm(num_groups, out_features)
+        self.bias = nn.Parameter(torch.randn(bias_shape))
+
+    def forward(self, x):
+        x = self.gemm(x)
+        x = self.group_norm(x)
+        x = torch.min(x, dim=1, keepdim=True)[0]  # [0] keeps the min values, not the indices
+        x = x + self.bias
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/76_Gemm_Add_ReLU.py b/backends/mlir/cpu/KernelBench/level2/76_Gemm_Add_ReLU.py
new file mode 100644
index 0000000..4cb7f4e
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/76_Gemm_Add_ReLU.py
@@ -0,0 +1,30 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a bias-free matrix multiplication, adds a separate learnable bias term, and applies ReLU.
+    """
+
+    def __init__(self, in_features, out_features, bias_shape):
+        super(Model, self).__init__()
+        self.gemm = nn.Linear(in_features, out_features, bias=False)
+        self.bias = nn.Parameter(torch.randn(bias_shape))
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor with shape (batch_size, in_features).
+        Returns:
+            torch.Tensor: Output tensor with shape (batch_size, out_features).
+        """
+        x = self.gemm(x)
+        x = x + self.bias
+        x = torch.relu(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/77_ConvTranspose3d_Scale_BatchNorm_GlobalAvgPool.py b/backends/mlir/cpu/KernelBench/level2/77_ConvTranspose3d_Scale_BatchNorm_GlobalAvgPool.py
new file mode 100644
index 0000000..1244369
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/77_ConvTranspose3d_Scale_BatchNorm_GlobalAvgPool.py
@@ -0,0 +1,36 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a 3D transposed convolution, scales the output by `scale_factor`, applies batch normalization,
+    and then performs global average pooling down to a (1, 1, 1) spatial size.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        scale_factor,
+        eps=1e-5,
+        momentum=0.1,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose3d(in_channels, out_channels, kernel_size)
+        self.scale_factor = scale_factor
+        self.batch_norm = nn.BatchNorm3d(out_channels, eps=eps, momentum=momentum)
+        self.global_avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = x * self.scale_factor
+        x = self.batch_norm(x)
+        x = self.global_avg_pool(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/78_ConvTranspose3d_Max_Max_Sum.py b/backends/mlir/cpu/KernelBench/level2/78_ConvTranspose3d_Max_Max_Sum.py
new file mode 100644
index 0000000..58019e7
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/78_ConvTranspose3d_Max_Max_Sum.py
@@ -0,0 +1,28 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a 3D transposed convolution, followed by two max pooling layers (kernel sizes 2 and 3) and a sum over dim 1.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose3d(
+            in_channels, out_channels, kernel_size, stride=stride, padding=padding
+        )
+        self.max_pool1 = nn.MaxPool3d(kernel_size=2)
+        self.max_pool2 = nn.MaxPool3d(kernel_size=3)
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = self.max_pool1(x)
+        x = self.max_pool2(x)
+        x = torch.sum(x, dim=1, keepdim=True)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/7_Conv3d_ReLU_LeakyReLU_GELU_Sigmoid_BiasAdd.py b/backends/mlir/cpu/KernelBench/level2/7_Conv3d_ReLU_LeakyReLU_GELU_Sigmoid_BiasAdd.py
new file mode 100644
index 0000000..e0dc07a
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/7_Conv3d_ReLU_LeakyReLU_GELU_Sigmoid_BiasAdd.py
@@ -0,0 +1,27 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a 3D convolution, applies ReLU, LeakyReLU, GELU and Sigmoid activations in sequence, and then adds a learnable bias.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, bias_shape):
+        super(Model, self).__init__()
+        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size)
+        self.bias = nn.Parameter(torch.randn(bias_shape))
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = torch.relu(x)
+        x = torch.nn.functional.leaky_relu(x, negative_slope=0.01)
+        x = torch.nn.functional.gelu(x)
+        x = torch.sigmoid(x)
+        x = x + self.bias
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/80_Gemm_Max_Subtract_GELU.py b/backends/mlir/cpu/KernelBench/level2/80_Gemm_Max_Subtract_GELU.py
new file mode 100644
index 0000000..3751114
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/80_Gemm_Max_Subtract_GELU.py
@@ -0,0 +1,32 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a GEMM, followed by a max reduction over `max_dim`, subtraction of the mean over dim 1, and GELU activation.
+    """
+
+    def __init__(self, in_features, out_features, max_dim):
+        super(Model, self).__init__()
+        self.gemm = nn.Linear(in_features, out_features)
+        self.max_dim = max_dim
+
+    def forward(self, x):
+        """
+        Args:
+            x: Input tensor of shape (batch_size, in_features)
+
+        Returns:
+            Output tensor shaped like the GEMM output but with size 1 along `max_dim`, e.g. (batch_size, 1) for max_dim=1
+        """
+        x = self.gemm(x)
+        x = torch.max(x, dim=self.max_dim, keepdim=True).values
+        x = x - x.mean(dim=1, keepdim=True)  # NOTE: if max_dim == 1, dim 1 has size 1 here, so this yields zeros
+        x = torch.nn.functional.gelu(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/81_Gemm_Swish_Divide_Clamp_Tanh_Clamp.py b/backends/mlir/cpu/KernelBench/level2/81_Gemm_Swish_Divide_Clamp_Tanh_Clamp.py
new file mode 100644
index 0000000..0c71fb5
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/81_Gemm_Swish_Divide_Clamp_Tanh_Clamp.py
@@ -0,0 +1,32 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a gemm, swish, divide-by-2, clamp, tanh, and clamp operations.
+    """
+
+    def __init__(self, in_features, out_features, bias=True):
+        super(Model, self).__init__()
+        self.gemm = nn.Linear(in_features, out_features, bias=bias)
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_features).
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_features).
+        """
+        x = self.gemm(x)
+        x = x * torch.sigmoid(x)  # Swish activation
+        x = x / 2.0
+        x = torch.clamp(x, min=-1.0, max=1.0)  # Clamp between -1 and 1
+        x = torch.tanh(x)  # Tanh activation
+        x = torch.clamp(x, min=-1.0, max=1.0)  # Clamp between -1 and 1 (tanh output already lies in this range)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/82_Conv2d_Tanh_Scaling_BiasAdd_Max.py b/backends/mlir/cpu/KernelBench/level2/82_Conv2d_Tanh_Scaling_BiasAdd_Max.py
new file mode 100644
index 0000000..ebfeca8
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/82_Conv2d_Tanh_Scaling_BiasAdd_Max.py
@@ -0,0 +1,41 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    A model that performs a convolution, applies tanh, scaling, adds a bias term, and then max-pools.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        scaling_factor,
+        bias_shape,
+        pool_kernel_size,
+    ):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+        self.scaling_factor = scaling_factor
+        self.bias = nn.Parameter(torch.randn(bias_shape))
+        self.max_pool = nn.MaxPool2d(pool_kernel_size)
+
+    def forward(self, x):
+        # Convolution
+        x = self.conv(x)
+        # Tanh activation
+        x = torch.tanh(x)
+        # Scaling
+        x = x * self.scaling_factor
+        # Bias addition
+        x = x + self.bias
+        # Max-pooling
+        x = self.max_pool(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/83_Conv3d_GroupNorm_Min_Clamp_Dropout.py b/backends/mlir/cpu/KernelBench/level2/83_Conv3d_GroupNorm_Min_Clamp_Dropout.py
new file mode 100644
index 0000000..c8b7be8
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/83_Conv3d_GroupNorm_Min_Clamp_Dropout.py
@@ -0,0 +1,59 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a 3D convolution, applies Group Normalization, minimum, clamp, and dropout.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        groups,
+        min_value,
+        max_value,
+        dropout_p,
+    ):
+        """
+        Args:
+            in_channels: Number of input channels of the convolution.
+            out_channels: Number of output channels of the convolution.
+            kernel_size: Kernel size of the convolution.
+            groups: Number of groups used by GroupNorm.
+            min_value: Upper bound for the minimum op and lower clamp bound.
+            max_value: Upper clamp bound.
+            dropout_p: Dropout probability.
+        """
+        super(Model, self).__init__()
+        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size)
+        self.norm = nn.GroupNorm(groups, out_channels)
+        self.dropout = nn.Dropout(dropout_p)
+        # Keep the bounds on the instance so forward() honours the constructor
+        # arguments instead of reading the module-level defaults below.
+        self.min_value = min_value
+        self.max_value = max_value
+
+    def forward(self, x):
+        """
+        Applies conv -> group norm -> min(x, min_value) -> clamp -> dropout.
+        """
+        x = self.conv(x)
+        x = self.norm(x)
+        x = torch.min(x, torch.tensor(self.min_value, device=x.device))
+        x = torch.clamp(x, min=self.min_value, max=self.max_value)
+        x = self.dropout(x)
+        return x
+
+
+# Module-level defaults; Model no longer reads these, they are kept only so
+# existing imports of these names keep working.
+min_value = 0.0
+max_value = 1.0
diff --git a/backends/mlir/cpu/KernelBench/level2/84_Gemm_BatchNorm_Scaling_Softmax.py b/backends/mlir/cpu/KernelBench/level2/84_Gemm_BatchNorm_Scaling_Softmax.py
new file mode 100644
index 0000000..a5c8052
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/84_Gemm_BatchNorm_Scaling_Softmax.py
@@ -0,0 +1,35 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a matrix multiplication (Gemm), Batch Normalization, scaling, and Softmax.
+    """
+
+    def __init__(
+        self, in_features, out_features, bn_eps=1e-5, bn_momentum=0.1, scale_shape=(1,)
+    ):
+        super(Model, self).__init__()
+        self.gemm = nn.Linear(in_features, out_features)
+        self.bn = nn.BatchNorm1d(out_features, eps=bn_eps, momentum=bn_momentum)
+        self.scale = nn.Parameter(torch.ones(scale_shape))
+        self.softmax = nn.Softmax(dim=1)
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_features).
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_features).
+        """
+        x = self.gemm(x)
+        x = self.bn(x)
+        x = self.scale * x
+        x = self.softmax(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/85_Conv2d_GroupNorm_Scale_MaxPool_Clamp.py b/backends/mlir/cpu/KernelBench/level2/85_Conv2d_GroupNorm_Scale_MaxPool_Clamp.py
new file mode 100644
index 0000000..ddc9723
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/85_Conv2d_GroupNorm_Scale_MaxPool_Clamp.py
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs convolution, group normalization, scaling, max pooling, and clamping.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        num_groups,
+        scale_shape,
+        maxpool_kernel_size,
+        clamp_min,
+        clamp_max,
+    ):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+        self.group_norm = nn.GroupNorm(num_groups, out_channels)
+        self.scale = nn.Parameter(torch.ones(scale_shape))
+        self.maxpool = nn.MaxPool2d(kernel_size=maxpool_kernel_size)
+        self.clamp_min = clamp_min
+        self.clamp_max = clamp_max
+
+    def forward(self, x):
+        """
+        Args:
+            x: Input tensor of shape (batch_size, in_channels, height, width).
+        Returns:
+            Output tensor of shape (batch_size, out_channels, height', width').
+        """
+        x = self.conv(x)
+        x = self.group_norm(x)
+        x = x * self.scale
+        x = self.maxpool(x)
+        x = torch.clamp(x, self.clamp_min, self.clamp_max)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/86_Matmul_Divide_GELU.py b/backends/mlir/cpu/KernelBench/level2/86_Matmul_Divide_GELU.py
new file mode 100644
index 0000000..33260ca
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/86_Matmul_Divide_GELU.py
@@ -0,0 +1,30 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    A model that performs a matrix multiplication, divides by a scalar, and applies GELU activation.
+    """
+
+    def __init__(self, input_size, output_size, divisor):
+        super(Model, self).__init__()
+        self.linear = nn.Linear(input_size, output_size)
+        self.divisor = divisor
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, input_size).
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, output_size).
+        """
+        x = self.linear(x)
+        x = x / self.divisor
+        x = torch.nn.functional.gelu(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/87_Conv2d_Subtract_Subtract_Mish.py b/backends/mlir/cpu/KernelBench/level2/87_Conv2d_Subtract_Subtract_Mish.py
new file mode 100644
index 0000000..102fe72
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/87_Conv2d_Subtract_Subtract_Mish.py
@@ -0,0 +1,28 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a convolution, subtracts two values, applies Mish activation.
+    """
+
+    def __init__(
+        self, in_channels, out_channels, kernel_size, subtract_value_1, subtract_value_2
+    ):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
+        self.subtract_value_1 = subtract_value_1
+        self.subtract_value_2 = subtract_value_2
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = x - self.subtract_value_1
+        x = x - self.subtract_value_2
+        x = torch.nn.functional.mish(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/88_Gemm_GroupNorm_Swish_Multiply_Swish.py b/backends/mlir/cpu/KernelBench/level2/88_Gemm_GroupNorm_Swish_Multiply_Swish.py
new file mode 100644
index 0000000..0ec9875
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/88_Gemm_GroupNorm_Swish_Multiply_Swish.py
@@ -0,0 +1,32 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs GEMM, GroupNorm, Swish, Multiply, and Swish operations.
+    """
+
+    def __init__(self, in_features, out_features, num_groups, multiply_weight_shape):
+        super(Model, self).__init__()
+        self.gemm = nn.Linear(in_features, out_features)
+        self.group_norm = nn.GroupNorm(num_groups, out_features)
+        self.multiply_weight = nn.Parameter(torch.randn(multiply_weight_shape))
+
+    def forward(self, x):
+        # (batch_size, in_features) -> (batch_size, out_features)
+        x = self.gemm(x)
+        # (batch_size, out_features) -> (batch_size, out_features)
+        x = self.group_norm(x)
+        # (batch_size, out_features) -> (batch_size, out_features)
+        x = x * torch.sigmoid(x)
+        # (batch_size, out_features) -> (batch_size, out_features)
+        x = x * self.multiply_weight
+        # (batch_size, out_features) -> (batch_size, out_features)
+        x = x * torch.sigmoid(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/89_ConvTranspose3d_MaxPool_Softmax_Subtract_Swish_Max.py b/backends/mlir/cpu/KernelBench/level2/89_ConvTranspose3d_MaxPool_Softmax_Subtract_Swish_Max.py
new file mode 100644
index 0000000..8741b65
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/89_ConvTranspose3d_MaxPool_Softmax_Subtract_Swish_Max.py
@@ -0,0 +1,56 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    A model that performs a sequence of operations:
+        - ConvTranspose3d
+        - MaxPool3d
+        - Softmax
+        - Subtract
+        - Swish
+        - Max
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        output_padding,
+        pool_kernel_size,
+        pool_stride,
+        pool_padding,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+        )
+        self.max_pool = nn.MaxPool3d(
+            kernel_size=pool_kernel_size, stride=pool_stride, padding=pool_padding
+        )
+        self.subtract = nn.Parameter(
+            torch.randn(out_channels)
+        )  # Assuming subtraction is element-wise across channels
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = self.max_pool(x)
+        x = torch.softmax(x, dim=1)  # Apply softmax across channels (dim=1)
+        x = x - self.subtract.view(1, -1, 1, 1, 1)  # Subtract across channels
+        x = torch.sigmoid(x) * x  # Swish activation
+        x = torch.max(x, dim=1)[0]  # Max pooling across channels
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/8_Conv3d_Divide_Max_GlobalAvgPool_BiasAdd_Sum.py b/backends/mlir/cpu/KernelBench/level2/8_Conv3d_Divide_Max_GlobalAvgPool_BiasAdd_Sum.py
new file mode 100644
index 0000000..1075c1e
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/8_Conv3d_Divide_Max_GlobalAvgPool_BiasAdd_Sum.py
@@ -0,0 +1,41 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a 3D convolution, divides by a constant, applies max pooling,
+    global average pooling, adds a bias term, and sums along a specific dimension.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        divisor,
+        pool_size,
+        bias_shape,
+        sum_dim,
+    ):
+        super(Model, self).__init__()
+        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size)
+        self.divisor = divisor
+        self.max_pool = nn.MaxPool3d(pool_size)
+        self.global_avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
+        self.bias = nn.Parameter(torch.randn(bias_shape))
+        self.sum_dim = sum_dim
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = x / self.divisor
+        x = self.max_pool(x)
+        x = self.global_avg_pool(x)
+        x = x + self.bias
+        x = torch.sum(x, dim=self.sum_dim)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/90_Conv3d_LeakyReLU_Sum_Clamp_GELU.py b/backends/mlir/cpu/KernelBench/level2/90_Conv3d_LeakyReLU_Sum_Clamp_GELU.py
new file mode 100644
index 0000000..cf15c8d
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/90_Conv3d_LeakyReLU_Sum_Clamp_GELU.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a 3D convolution, applies LeakyReLU, sums with a tensor, clamps, and applies GELU activation.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, sum_tensor_shape):
+        super(Model, self).__init__()
+        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size)
+        self.sum_tensor = nn.Parameter(torch.randn(sum_tensor_shape))
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = torch.nn.functional.leaky_relu(x, negative_slope=0.2)
+        x = x + self.sum_tensor
+        x = torch.clamp(x, min=-1.0, max=1.0)
+        x = torch.nn.functional.gelu(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/91_ConvTranspose2d_Softmax_BiasAdd_Scaling_Sigmoid.py b/backends/mlir/cpu/KernelBench/level2/91_ConvTranspose2d_Softmax_BiasAdd_Scaling_Sigmoid.py
new file mode 100644
index 0000000..dd87bae
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/91_ConvTranspose2d_Softmax_BiasAdd_Scaling_Sigmoid.py
@@ -0,0 +1,44 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a transposed convolution, applies softmax, adds a bias term, scales the result, and applies sigmoid.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        output_padding,
+        bias_shape,
+        scaling_factor,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+        )
+        self.bias = nn.Parameter(torch.randn(bias_shape))
+        self.scaling_factor = scaling_factor
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = torch.softmax(x, dim=1)
+        x = x + self.bias
+        x = x * self.scaling_factor
+        x = torch.sigmoid(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/93_ConvTranspose2d_Add_Min_GELU_Multiply.py b/backends/mlir/cpu/KernelBench/level2/93_ConvTranspose2d_Add_Min_GELU_Multiply.py
new file mode 100644
index 0000000..e151c40
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/93_ConvTranspose2d_Add_Min_GELU_Multiply.py
@@ -0,0 +1,31 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a transposed convolution, adds a value, takes the minimum, applies GELU, and multiplies by a value.
+    """
+
+    def __init__(
+        self, in_channels, out_channels, kernel_size, stride, add_value, multiply_value
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose2d(
+            in_channels, out_channels, kernel_size, stride=stride
+        )
+        self.add_value = add_value
+        self.multiply_value = multiply_value
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = x + self.add_value
+        x = torch.min(x, torch.tensor(0.0, device=x.device))
+        x = torch.nn.functional.gelu(x)
+        x = x * self.multiply_value
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/94_Gemm_BiasAdd_Hardtanh_Mish_GroupNorm.py b/backends/mlir/cpu/KernelBench/level2/94_Gemm_BiasAdd_Hardtanh_Mish_GroupNorm.py
new file mode 100644
index 0000000..be5f84a
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/94_Gemm_BiasAdd_Hardtanh_Mish_GroupNorm.py
@@ -0,0 +1,35 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    A model that performs GEMM, BiasAdd, Hardtanh, Mish, and GroupNorm operations in sequence.
+    """
+
+    def __init__(self, in_features, out_features, bias_shape, num_groups):
+        super(Model, self).__init__()
+        self.gemm = nn.Linear(in_features, out_features)
+        self.bias = nn.Parameter(torch.randn(bias_shape))
+        self.hardtanh = nn.Hardtanh()
+        self.mish = nn.Mish()
+        self.groupnorm = nn.GroupNorm(num_groups=num_groups, num_channels=out_features)
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_features).
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, out_features).
+        """
+        x = self.gemm(x)
+        x = x + self.bias
+        x = self.hardtanh(x)
+        x = self.mish(x)
+        x = self.groupnorm(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/95_Matmul_Add_Swish_Tanh_GELU_Hardtanh.py b/backends/mlir/cpu/KernelBench/level2/95_Matmul_Add_Swish_Tanh_GELU_Hardtanh.py
new file mode 100644
index 0000000..69831a8
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/95_Matmul_Add_Swish_Tanh_GELU_Hardtanh.py
@@ -0,0 +1,27 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a matrix multiplication, adds a value, applies Swish, Tanh, GELU, and Hardtanh activation functions.
+    """
+
+    def __init__(self, in_features, out_features, add_value_shape):
+        super(Model, self).__init__()
+        self.matmul = nn.Linear(in_features, out_features)
+        self.add_value = nn.Parameter(torch.randn(add_value_shape))
+
+    def forward(self, x):
+        x = self.matmul(x)
+        x = x + self.add_value
+        x = torch.sigmoid(x) * x  # Swish
+        x = torch.tanh(x)
+        x = torch.nn.functional.gelu(x)  # GELU
+        x = torch.nn.functional.hardtanh(x, min_val=-1, max_val=1)  # Hardtanh
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/96_ConvTranspose3d_Multiply_Max_GlobalAvgPool_Clamp.py b/backends/mlir/cpu/KernelBench/level2/96_ConvTranspose3d_Multiply_Max_GlobalAvgPool_Clamp.py
new file mode 100644
index 0000000..efb6fd2
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/96_ConvTranspose3d_Multiply_Max_GlobalAvgPool_Clamp.py
@@ -0,0 +1,42 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a transposed 3D convolution, multiplies by a scalar, applies max pooling,
+    global average pooling, and clamps the output.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        scale,
+        maxpool_kernel_size,
+    ):
+        super(Model, self).__init__()
+        self.conv_transpose = nn.ConvTranspose3d(
+            in_channels, out_channels, kernel_size, stride=stride, padding=padding
+        )
+        self.scale = scale
+        self.maxpool = nn.MaxPool3d(kernel_size=maxpool_kernel_size)
+        self.global_avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
+        self.clamp_min = 0
+        self.clamp_max = 1
+
+    def forward(self, x):
+        x = self.conv_transpose(x)
+        x = x * self.scale
+        x = self.maxpool(x)
+        x = self.global_avg_pool(x)
+        x = torch.clamp(x, min=self.clamp_min, max=self.clamp_max)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/97_Matmul_BatchNorm_BiasAdd_Divide_Swish.py b/backends/mlir/cpu/KernelBench/level2/97_Matmul_BatchNorm_BiasAdd_Divide_Swish.py
new file mode 100644
index 0000000..35fa50a
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/97_Matmul_BatchNorm_BiasAdd_Divide_Swish.py
@@ -0,0 +1,36 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a matrix multiplication, batch normalization, bias addition, division, and Swish activation.
+    """
+
+    def __init__(
+        self,
+        in_features,
+        out_features,
+        bn_eps=1e-5,
+        bn_momentum=0.1,
+        bias_shape=(1,),
+        divide_value=1.0,
+    ):
+        super(Model, self).__init__()
+        self.matmul = nn.Linear(in_features, out_features)
+        self.bn = nn.BatchNorm1d(out_features, eps=bn_eps, momentum=bn_momentum)
+        self.bias = nn.Parameter(torch.randn(bias_shape))
+        self.divide_value = divide_value
+
+    def forward(self, x):
+        x = self.matmul(x)
+        x = self.bn(x)
+        x = x + self.bias
+        x = x / self.divide_value
+        x = x * torch.sigmoid(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/98_Matmul_AvgPool_GELU_Scale_Max.py b/backends/mlir/cpu/KernelBench/level2/98_Matmul_AvgPool_GELU_Scale_Max.py
new file mode 100644
index 0000000..cecc94d
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/98_Matmul_AvgPool_GELU_Scale_Max.py
@@ -0,0 +1,34 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    A model implementing the pattern "Matmul_AvgPool_GELU_Scale_Max".
+    """
+
+    def __init__(self, in_features, out_features, pool_kernel_size, scale_factor):
+        super(Model, self).__init__()
+        self.matmul = nn.Linear(in_features, out_features)
+        self.avg_pool = nn.AvgPool1d(kernel_size=pool_kernel_size)
+        self.scale_factor = scale_factor
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_features).
+
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size,), the per-row maximum.
+        """
+        x = self.matmul(x)
+        x = self.avg_pool(x.unsqueeze(1)).squeeze(1)
+        x = torch.nn.functional.gelu(x)
+        x = x * self.scale_factor
+        x = torch.max(x, dim=1).values
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/99_Matmul_GELU_Softmax.py b/backends/mlir/cpu/KernelBench/level2/99_Matmul_GELU_Softmax.py
new file mode 100644
index 0000000..68344a0
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/99_Matmul_GELU_Softmax.py
@@ -0,0 +1,23 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Simple model that performs a matrix multiplication, applies GELU, and then applies Softmax.
+    """
+
+    def __init__(self, in_features, out_features):
+        super(Model, self).__init__()
+        self.linear = nn.Linear(in_features, out_features)
+
+    def forward(self, x):
+        x = self.linear(x)
+        x = torch.nn.functional.gelu(x)
+        x = torch.nn.functional.softmax(x, dim=1)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level2/9_Matmul_Subtract_Multiply_ReLU.py b/backends/mlir/cpu/KernelBench/level2/9_Matmul_Subtract_Multiply_ReLU.py
new file mode 100644
index 0000000..c3a0c6b
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level2/9_Matmul_Subtract_Multiply_ReLU.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    Model that performs a matrix multiplication, subtraction, multiplication, and ReLU activation.
+    """
+
+    def __init__(self, in_features, out_features, subtract_value, multiply_value):
+        super(Model, self).__init__()
+        self.linear = nn.Linear(in_features, out_features)
+        self.subtract_value = subtract_value
+        self.multiply_value = multiply_value
+
+    def forward(self, x):
+        x = self.linear(x)
+        x = x - self.subtract_value
+        x = x * self.multiply_value
+        x = torch.relu(x)
+        return x

From 481a14cc45ddcb09e4fba5046df4c2583f220e66 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk@intel.com>
Date: Wed, 8 Apr 2026 12:39:47 +0200
Subject: [PATCH 4/7] Rename module

---
 ai_bench/mlir/__init__.py                      | 2 +-
 ai_bench/mlir/{cpu_pipeline.py => pipeline.py} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename ai_bench/mlir/{cpu_pipeline.py => pipeline.py} (100%)

diff --git a/ai_bench/mlir/__init__.py b/ai_bench/mlir/__init__.py
index 750c1bf..f27d0e0 100644
--- a/ai_bench/mlir/__init__.py
+++ b/ai_bench/mlir/__init__.py
@@ -1,5 +1,5 @@
 from .compile import cpu_backend
-from .cpu_pipeline import cpu_pipeline
+from .pipeline import cpu_pipeline
 
 __all__ = [
     "cpu_backend",
diff --git a/ai_bench/mlir/cpu_pipeline.py b/ai_bench/mlir/pipeline.py
similarity index 100%
rename from ai_bench/mlir/cpu_pipeline.py
rename to ai_bench/mlir/pipeline.py

From 684578fcb190eda0ba2ddeeecea2b2e2800f634a Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk@intel.com>
Date: Wed, 8 Apr 2026 12:56:47 +0200
Subject: [PATCH 5/7] Setup MLIR shared libs

---
 infra/scripts/ci-cpu-run-kernel-bench.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/infra/scripts/ci-cpu-run-kernel-bench.sh b/infra/scripts/ci-cpu-run-kernel-bench.sh
index bc797dd..f4e3995 100755
--- a/infra/scripts/ci-cpu-run-kernel-bench.sh
+++ b/infra/scripts/ci-cpu-run-kernel-bench.sh
@@ -94,6 +94,8 @@ if [[ "${BENCH_BACKEND}" == "${BENCH_BACKEND_TORCH_COMPILE}" ]]; then
 fi
 if [[ "${BENCH_BACKEND}" == "${BENCH_BACKEND_MLIR}" ]]; then
   BENCH_FLAGS="${BENCH_FLAGS} --mlir"
+  MLIR_PACKAGE_PATH=$(${AI_BENCH_UV} run python -c "import mlir; print(mlir.__path__[0])")
+  export AIBENCH_MLIR_LIB_PATH=${MLIR_PACKAGE_PATH}/_mlir_libs/libmlir_c_runner_utils.so
 fi
 
 ${AI_BENCH_UV} run ai-bench ${BENCH_FLAGS}

From fcb01ddf1e7f4a7303e225f633ded4f72d5e8afa Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk@intel.com>
Date: Wed, 8 Apr 2026 13:00:03 +0200
Subject: [PATCH 6/7] Simplify pipeline

---
 ai_bench/mlir/pipeline.py | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/ai_bench/mlir/pipeline.py b/ai_bench/mlir/pipeline.py
index 50f0673..7bb19fc 100644
--- a/ai_bench/mlir/pipeline.py
+++ b/ai_bench/mlir/pipeline.py
@@ -1,33 +1,13 @@
-import lighthouse.schedule as lh_schedule
-import lighthouse.transform as lh_transform
 from mlir import ir
-from mlir.dialects import transform
-from mlir.dialects.transform import structured
 from mlir.passmanager import PassManager
 
 
 def cpu_pipeline(module: ir.Module) -> ir.Module:
     # Use standard C interface wrappers for functions.
     pm = PassManager("builtin.module", module.context)
-    # pm.add("print-ir")
     pm.add("func.func(llvm-request-c-wrappers)")
-    pm.run(module.operation)
-
-    # Decompose complex Linalg ops into simpler ones.
-    ctx = module.context
-    with ctx, ir.Location.unknown(context=ctx):
-        with lh_schedule.schedule_boilerplate() as (sched, named_seq):
-            # ops = lh_transform.match_op(named_seq.bodyTarget, "linalg.conv_2d_nchw_fchw")
-            # structured.structured_decompose(transform.any_op_t(), ops)
-            # ops = lh_transform.match_op(named_seq.bodyTarget, "linalg.conv_3d_ncdhw_fcdhw")
-            # structured.structured_decompose(transform.any_op_t(), ops)
-            softmax_ops = lh_transform.match_op(named_seq.bodyTarget, "linalg.softmax")
-            structured.structured_decompose_interface(transform.any_op_t(), softmax_ops)
-            transform.yield_()
-    sched.body.operations[0].apply(module)
 
     # Bufferize.
-    pm = PassManager("builtin.module", module.context)
     pm.add("eliminate-empty-tensors")
     pm.add(
         "one-shot-bufferize{function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries}"
@@ -40,7 +20,6 @@ def cpu_pipeline(module: ir.Module) -> ir.Module:
 
     # Lower to LLVM.
     pm.add("convert-linalg-to-loops")
-    # pm.add("print-ir")
     pm.add("math-expand-ops")
     pm.add("expand-strided-metadata")
     pm.add("canonicalize")
@@ -56,7 +35,6 @@ def cpu_pipeline(module: ir.Module) -> ir.Module:
     # Cleanup
     pm.add("cse")
     pm.add("canonicalize")
-    # pm.add("print-ir")
 
     pm.run(module.operation)
 

From 9b6a1bf85775e2df7382b2cc3e372a852fe8aa8e Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk@intel.com>
Date: Wed, 8 Apr 2026 17:00:19 +0200
Subject: [PATCH 7/7] Docs

---
 ai_bench/mlir/pipeline.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/ai_bench/mlir/pipeline.py b/ai_bench/mlir/pipeline.py
index 7bb19fc..caccff4 100644
--- a/ai_bench/mlir/pipeline.py
+++ b/ai_bench/mlir/pipeline.py
@@ -3,6 +3,21 @@
 
 
 def cpu_pipeline(module: ir.Module) -> ir.Module:
+    """
+    The default lowering pipeline for CPU.
+    Lowers MLIR ops within the module to MLIR LLVM IR dialect.
+
+    The pipeline focuses on enabling end-to-end lowering for various
+    generic kernel modules.
+
+    Performance is currently secondary and not representative.
+
+    Args:
+        module: MLIR module coming from PyTorch importer.
+    Returns:
+        The input MLIR module, with its IR lowered in-place.
+    """
+
     # Use standard C interface wrappers for functions.
     pm = PassManager("builtin.module", module.context)
     pm.add("func.func(llvm-request-c-wrappers)")
@@ -36,6 +51,7 @@ def cpu_pipeline(module: ir.Module) -> ir.Module:
     pm.add("cse")
     pm.add("canonicalize")
 
+    # IR is transformed in-place.
     pm.run(module.operation)
 
     return module