diff --git a/backends/mlir/cpu/KernelBench/level3/13_DenseNet121TransitionLayer.py b/backends/mlir/cpu/KernelBench/level3/13_DenseNet121TransitionLayer.py
new file mode 100644
index 0000000..c871c6a
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/13_DenseNet121TransitionLayer.py
@@ -0,0 +1,31 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static shapes, ai_bench MLIR CPU backend
+class Model(nn.Module):
+    def __init__(self, num_input_features: int, num_output_features: int):
+        """
+        :param num_input_features: Channel count of the input (BatchNorm2d and Conv2d input width)
+        :param num_output_features: Channel count produced by the 1x1 convolution
+        """
+        super(Model, self).__init__()
+        self.transition = nn.Sequential(
+            nn.BatchNorm2d(num_input_features),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(
+                num_input_features, num_output_features, kernel_size=1, bias=False
+            ),
+            nn.AvgPool2d(kernel_size=2, stride=2),  # 2x2 window, stride 2
+        )
+
+    def forward(self, x):
+        """
+        :param x: Input tensor of shape (batch_size, num_input_features, height, width)
+        :return: Tensor of shape (batch_size, num_output_features, height // 2, width // 2)
+        """
+        return self.transition(x)
diff --git a/backends/mlir/cpu/KernelBench/level3/14_DenseNet121DenseBlock.py b/backends/mlir/cpu/KernelBench/level3/14_DenseNet121DenseBlock.py
new file mode 100644
index 0000000..a00659d
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/14_DenseNet121DenseBlock.py
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static shapes, ai_bench MLIR CPU backend
+class Model(nn.Module):
+    def __init__(self, num_layers: int, num_input_features: int, growth_rate: int):
+        """
+        :param num_layers: The number of layers in the dense block
+        :param num_input_features: The number of input feature maps
+        :param growth_rate: Channels produced by each layer (appended to the running feature map)
+        """
+        super(Model, self).__init__()
+        layers = []
+        for i in range(num_layers):  # layer i consumes the input plus i earlier outputs
+            layers.append(
+                self._make_layer(num_input_features + i * growth_rate, growth_rate)
+            )
+        self.layers = nn.ModuleList(layers)
+
+    def _make_layer(self, in_features: int, growth_rate: int):
+        """
+        Create one BatchNorm -> ReLU -> 3x3 Conv (padding=1, no bias) -> Dropout layer.
+        """
+        return nn.Sequential(
+            nn.BatchNorm2d(in_features),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(in_features, growth_rate, kernel_size=3, padding=1, bias=False),
+            nn.Dropout(0.0),  # p=0.0, so this never drops anything
+        )
+
+    def forward(self, x):
+        """
+        :param x: Input tensor of shape (batch_size, num_input_features, height, width)
+        :return: Tensor of shape (batch_size, num_input_features + num_layers * growth_rate, height, width)
+        """
+        features = [x]  # original input followed by every layer's output
+        for layer in self.layers:
+            new_feature = layer(x)
+            features.append(new_feature)
+            x = torch.cat(features, 1)  # Next layer's input: everything so far, on the channel axis
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level3/17_SqueezeNetFireModule.py b/backends/mlir/cpu/KernelBench/level3/17_SqueezeNetFireModule.py
new file mode 100644
index 0000000..dcd91c4
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/17_SqueezeNetFireModule.py
@@ -0,0 +1,45 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static shapes, ai_bench MLIR CPU backend
+class Model(nn.Module):
+    def __init__(
+        self, in_channels, squeeze_channels, expand1x1_channels, expand3x3_channels
+    ):
+        """
+        :param in_channels: Number of input channels
+        :param squeeze_channels: Output channels of the 1x1 squeeze conv (input to both expand convs)
+        :param expand1x1_channels: Number of output channels for the 1x1 expand layer
+        :param expand3x3_channels: Number of output channels for the 3x3 expand layer
+        """
+        super(Model, self).__init__()
+
+        self.squeeze = nn.Conv2d(in_channels, squeeze_channels, kernel_size=1)
+        self.squeeze_activation = nn.ReLU(inplace=True)
+
+        self.expand1x1 = nn.Conv2d(squeeze_channels, expand1x1_channels, kernel_size=1)
+        self.expand1x1_activation = nn.ReLU(inplace=True)
+
+        self.expand3x3 = nn.Conv2d(
+            squeeze_channels, expand3x3_channels, kernel_size=3, padding=1
+        )  # padding=1 keeps height and width equal to the 1x1 branch
+        self.expand3x3_activation = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        """
+        :param x: Input tensor, shape (batch_size, in_channels, height, width)
+        :return: Output tensor, shape (batch_size, expand1x1_channels + expand3x3_channels, height, width)
+        """
+        x = self.squeeze_activation(self.squeeze(x))  # shared input of both expand branches
+        return torch.cat(
+            [
+                self.expand1x1_activation(self.expand1x1(x)),
+                self.expand3x3_activation(self.expand3x3(x)),
+            ],
+            1,  # concatenate the two branches along the channel axis
+        )
diff --git a/backends/mlir/cpu/KernelBench/level3/18_SqueezeNet.py b/backends/mlir/cpu/KernelBench/level3/18_SqueezeNet.py
new file mode 100644
index 0000000..b111971
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/18_SqueezeNet.py
@@ -0,0 +1,85 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+class FireModule(nn.Module):
+    def __init__(
+        self, in_channels, squeeze_channels, expand1x1_channels, expand3x3_channels
+    ):
+        """
+        :param in_channels: Number of input channels
+        :param squeeze_channels: Output channels of the 1x1 squeeze conv (input to both expand convs)
+        :param expand1x1_channels: Number of output channels for the 1x1 expand layer
+        :param expand3x3_channels: Number of output channels for the 3x3 expand layer
+        """
+        super(FireModule, self).__init__()
+
+        self.squeeze = nn.Conv2d(in_channels, squeeze_channels, kernel_size=1)
+        self.squeeze_activation = nn.ReLU(inplace=True)
+
+        self.expand1x1 = nn.Conv2d(squeeze_channels, expand1x1_channels, kernel_size=1)
+        self.expand1x1_activation = nn.ReLU(inplace=True)
+
+        self.expand3x3 = nn.Conv2d(
+            squeeze_channels, expand3x3_channels, kernel_size=3, padding=1
+        )  # padding=1 keeps height and width equal to the 1x1 branch
+        self.expand3x3_activation = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        """
+        :param x: Input tensor, shape (batch_size, in_channels, height, width)
+        :return: Output tensor, shape (batch_size, expand1x1_channels + expand3x3_channels, height, width)
+        """
+        x = self.squeeze_activation(self.squeeze(x))  # shared input of both expand branches
+        return torch.cat(
+            [
+                self.expand1x1_activation(self.expand1x1(x)),
+                self.expand3x3_activation(self.expand3x3(x)),
+            ],
+            1,  # concatenate the two branches along the channel axis
+        )
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static shapes, ai_bench MLIR CPU backend
+class Model(nn.Module):
+    def __init__(self, num_classes=1000):
+        """
+        :param num_classes: Number of output classes (output channels of the 1x1 classifier conv)
+        """
+        super(Model, self).__init__()
+
+        self.features = nn.Sequential(
+            nn.Conv2d(3, 96, kernel_size=7, stride=2),
+            nn.ReLU(inplace=True),
+            nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
+            FireModule(96, 16, 64, 64),  # -> 64 + 64 = 128 channels
+            FireModule(128, 16, 64, 64),  # -> 128 channels
+            FireModule(128, 32, 128, 128),  # -> 256 channels
+            nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
+            FireModule(256, 32, 128, 128),  # -> 256 channels
+            FireModule(256, 48, 192, 192),  # -> 384 channels
+            FireModule(384, 48, 192, 192),  # -> 384 channels
+            FireModule(384, 64, 256, 256),  # -> 512 channels
+            nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
+            FireModule(512, 64, 256, 256),  # -> 512 channels, matches the classifier conv
+        )
+
+        self.classifier = nn.Sequential(
+            nn.Dropout(p=0.0),  # p=0.0, so this never drops anything
+            nn.Conv2d(512, num_classes, kernel_size=1),
+            nn.ReLU(inplace=True),
+            nn.AdaptiveAvgPool2d((1, 1)),  # collapse the spatial dims to 1x1
+        )
+
+    def forward(self, x):
+        """
+        :param x: Input tensor, shape (batch_size, 3, height, width)
+        :return: Output tensor, shape (batch_size, num_classes)
+        """
+        x = self.features(x)
+        x = self.classifier(x)
+        return torch.flatten(x, 1)  # (batch_size, num_classes, 1, 1) -> (batch_size, num_classes)
diff --git a/backends/mlir/cpu/KernelBench/level3/1_MLP.py b/backends/mlir/cpu/KernelBench/level3/1_MLP.py
new file mode 100644
index 0000000..61574ed
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/1_MLP.py
@@ -0,0 +1,36 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static shapes, ai_bench MLIR CPU backend
+class Model(nn.Module):
+    def __init__(self, input_size, layer_sizes, output_size):
+        """
+        :param input_size: The number of input features
+        :param layer_sizes: Sizes of the hidden layers, in order; each gets a Linear followed by ReLU
+        :param output_size: The number of output features (final Linear, no activation)
+        """
+        super(Model, self).__init__()
+
+        layers = []
+        current_input_size = input_size  # input width of the next Linear to be added
+
+        for layer_size in layer_sizes:
+            layers.append(nn.Linear(current_input_size, layer_size))
+            layers.append(nn.ReLU())
+            current_input_size = layer_size
+
+        layers.append(nn.Linear(current_input_size, output_size))  # output layer, no ReLU
+
+        self.network = nn.Sequential(*layers)
+
+    def forward(self, x):
+        """
+        :param x: The input tensor, shape (batch_size, input_size)
+        :return: The output tensor, shape (batch_size, output_size)
+        """
+        return self.network(x)
diff --git a/backends/mlir/cpu/KernelBench/level3/21_EfficientNetMBConv.py b/backends/mlir/cpu/KernelBench/level3/21_EfficientNetMBConv.py
new file mode 100644
index 0000000..85e8a4e
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/21_EfficientNetMBConv.py
@@ -0,0 +1,79 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static shapes, ai_bench MLIR CPU backend
+class Model(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, stride, expand_ratio):
+        """
+        MBConv block: optional 1x1 expansion, depthwise conv, 1x1 projection.
+
+        :param in_channels: Number of input channels.
+        :param out_channels: Number of output channels.
+        :param kernel_size: Kernel size for the depthwise convolution.
+        :param stride: Stride for the depthwise convolution.
+        :param expand_ratio: Multiplier giving hidden_dim = in_channels * expand_ratio; 1 skips the expansion conv.
+        """
+        super(Model, self).__init__()
+
+        self.use_residual = stride == 1 and in_channels == out_channels  # skip-add only when shapes match
+        hidden_dim = in_channels * expand_ratio
+
+        if expand_ratio != 1:  # expand_conv is only defined when it actually widens the input
+            self.expand_conv = nn.Sequential(
+                nn.Conv2d(
+                    in_channels,
+                    hidden_dim,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                    bias=False,
+                ),
+                nn.BatchNorm2d(hidden_dim),
+                nn.ReLU6(inplace=True),
+            )
+
+        self.depthwise_conv = nn.Sequential(
+            nn.Conv2d(
+                hidden_dim,
+                hidden_dim,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=(kernel_size - 1) // 2,
+                groups=hidden_dim,  # one filter group per channel, i.e. depthwise
+                bias=False,
+            ),
+            nn.BatchNorm2d(hidden_dim),
+            nn.ReLU6(inplace=True),
+        )
+
+        self.project_conv = nn.Sequential(
+            nn.Conv2d(
+                hidden_dim, out_channels, kernel_size=1, stride=1, padding=0, bias=False
+            ),
+            nn.BatchNorm2d(out_channels),  # no activation after the projection
+        )
+
+    def forward(self, x):
+        """
+        Forward pass of the MBConv block.
+
+        :param x: The input tensor, shape (batch_size, in_channels, H, W)
+        :return: The output tensor, shape (batch_size, out_channels, H', W')
+        """
+        identity = x
+
+        if hasattr(self, "expand_conv"):  # attribute exists only when expand_ratio != 1
+            x = self.expand_conv(x)
+
+        x = self.depthwise_conv(x)
+        x = self.project_conv(x)
+
+        if self.use_residual:
+            x += identity  # in-place residual add
+
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level3/25_ShuffleNetUnit.py b/backends/mlir/cpu/KernelBench/level3/25_ShuffleNetUnit.py
new file mode 100644
index 0000000..af8c23e
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/25_ShuffleNetUnit.py
@@ -0,0 +1,126 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static shapes, ai_bench MLIR CPU backend
+class Model(nn.Module):
+    def __init__(self, in_channels, out_channels, groups=3):
+        """
+        ShuffleNet unit implementation.
+
+        :param in_channels: Number of input channels.
+        :param out_channels: Number of output channels (must be a multiple of 4).
+        :param groups: Number of groups for the 1x1 group convolutions and the channel shuffle.
+        """
+        super(Model, self).__init__()
+
+        # The bottleneck width is out_channels // 4, so out_channels must be a multiple of 4
+        assert out_channels % 4 == 0
+        mid_channels = out_channels // 4
+
+        # First 1x1 group convolution
+        self.conv1 = nn.Conv2d(
+            in_channels,
+            mid_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=groups,
+            bias=False,
+        )
+        self.bn1 = nn.BatchNorm2d(mid_channels)
+
+        # Depthwise 3x3 convolution
+        self.conv2 = nn.Conv2d(
+            mid_channels,
+            mid_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            groups=mid_channels,
+            bias=False,
+        )
+        self.bn2 = nn.BatchNorm2d(mid_channels)
+
+        # Second 1x1 group convolution
+        self.conv3 = nn.Conv2d(
+            mid_channels,
+            out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=groups,
+            bias=False,
+        )
+        self.bn3 = nn.BatchNorm2d(out_channels)
+
+        # Shuffle operation
+        self.shuffle = ChannelShuffle(groups)
+
+        # Identity shortcut when channel counts match; otherwise a 1x1 conv + BatchNorm projection
+        if in_channels == out_channels:
+            self.shortcut = nn.Sequential()
+        else:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(
+                    in_channels,
+                    out_channels,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                    bias=False,
+                ),
+                nn.BatchNorm2d(out_channels),
+            )
+
+    def forward(self, x):
+        """
+        Forward pass for ShuffleNet unit.
+
+        :param x: Input tensor, shape (batch_size, in_channels, height, width)
+        :return: Output tensor, shape (batch_size, out_channels, height, width)
+        """
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))  # no ReLU after the depthwise conv
+        out = self.shuffle(out)
+        out = F.relu(self.bn3(self.conv3(out)))
+
+        out += self.shortcut(x)  # in-place residual add, applied after the final ReLU
+        return out
+
+
+class ChannelShuffle(nn.Module):
+    def __init__(self, groups):
+        """
+        Channel shuffle operation.
+
+        :param groups: Number of groups for shuffling.
+        """
+        super(ChannelShuffle, self).__init__()
+        self.groups = groups
+
+    def forward(self, x):
+        """
+        Interleave channels across groups via reshape -> transpose -> reshape.
+
+        :param x: Input tensor, shape (batch_size, channels, height, width)
+        :return: Output tensor, shape (batch_size, channels, height, width)
+        """
+        batch_size, channels, height, width = x.size()
+        channels_per_group = channels // self.groups  # the view below fails unless channels % groups == 0
+
+        # Split the channel axis into (groups, channels_per_group)
+        x = x.view(batch_size, self.groups, channels_per_group, height, width)
+
+        # Swap the two channel sub-axes; contiguous() so the result can be viewed again
+        x = x.transpose(1, 2).contiguous()
+
+        # Merge the sub-axes back into a single channel axis
+        x = x.view(batch_size, -1, height, width)
+
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level3/27_RegNet.py b/backends/mlir/cpu/KernelBench/level3/27_RegNet.py
new file mode 100644
index 0000000..164ceca
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/27_RegNet.py
@@ -0,0 +1,62 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static shapes, ai_bench MLIR CPU backend
+class Model(nn.Module):
+    def __init__(self, input_channels, stages, block_widths, output_classes):
+        """
+        :param input_channels: int, Number of input channels for the first layer
+        :param stages: int, Number of stages in the RegNet architecture
+        :param block_widths: List[int], output channels of stage i at index i; the classifier reads block_widths[-1]
+        :param output_classes: int, Number of output classes for classification
+        """
+        super(Model, self).__init__()
+
+        self.stages = stages
+        self.block_widths = block_widths
+
+        layers = []
+        current_channels = input_channels
+
+        # Construct the stages; NOTE(review): presumably len(block_widths) == stages — confirm with callers
+        for i in range(stages):
+            layers.append(self._make_stage(current_channels, block_widths[i]))
+            current_channels = block_widths[i]
+
+        self.feature_extractor = nn.Sequential(*layers)
+
+        # Final fully connected layer for classification
+        self.fc = nn.Linear(block_widths[-1], output_classes)
+
+    def _make_stage(self, in_channels, out_channels):
+        """
+        Create one stage: two 3x3 Conv -> BatchNorm -> ReLU groups followed by a 2x2 max pool.
+        :param in_channels: int, number of input channels
+        :param out_channels: int, number of output channels
+        :return: nn.Sequential block with convolutional layers
+        """
+        return nn.Sequential(
+            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU(),
+            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU(),
+            nn.MaxPool2d(kernel_size=2, stride=2),  # 2x2 window, stride 2
+        )
+
+    def forward(self, x):
+        """
+        Forward pass through the RegNet model.
+        :param x: torch.Tensor of shape (batch_size, input_channels, height, width)
+        :return: torch.Tensor of shape (batch_size, output_classes)
+        """
+        x = self.feature_extractor(x)
+        x = torch.mean(x, dim=[2, 3])  # Global Average Pooling over height and width
+        x = self.fc(x)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level3/28_VisionTransformer.py b/backends/mlir/cpu/KernelBench/level3/28_VisionTransformer.py
new file mode 100644
index 0000000..18dd297
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/28_VisionTransformer.py
@@ -0,0 +1,91 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static shapes, ai_bench MLIR CPU backend
+class Model(nn.Module):
+    def __init__(
+        self,
+        image_size,
+        patch_size,
+        num_classes,
+        dim,
+        depth,
+        heads,
+        mlp_dim,
+        channels=3,
+        dropout=0.1,
+        emb_dropout=0.1,
+    ):
+        """
+        Vision Transformer (ViT) model.
+
+        :param image_size: The size of the input image (assumed to be square).
+        :param patch_size: The size of each patch (assumed to be square).
+        :param num_classes: The number of output classes.
+        :param dim: The dimensionality of the embedding space.
+        :param depth: The number of transformer layers.
+        :param heads: The number of attention heads.
+        :param mlp_dim: Feed-forward width of each transformer layer and hidden width of the MLP head.
+        :param channels: The number of channels in the input image (default is 3 for RGB).
+        :param dropout: Dropout rate used by the transformer layers and by the MLP head.
+        :param emb_dropout: Dropout rate applied to the embedded patches.
+        """
+        super(Model, self).__init__()
+
+        assert image_size % patch_size == 0, (
+            "Image dimensions must be divisible by the patch size."
+        )
+        num_patches = (image_size // patch_size) ** 2
+        patch_dim = channels * patch_size**2
+
+        self.patch_size = patch_size
+        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))  # +1 for the class token
+        self.patch_to_embedding = nn.Linear(patch_dim, dim)
+        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
+        self.dropout = nn.Dropout(emb_dropout)
+
+        self.transformer = nn.TransformerEncoder(
+            nn.TransformerEncoderLayer(  # NOTE(review): batch_first is left at its default (False)
+                d_model=dim, nhead=heads, dim_feedforward=mlp_dim, dropout=dropout
+            ),
+            num_layers=depth,
+        )
+
+        self.to_cls_token = nn.Identity()
+        self.mlp_head = nn.Sequential(
+            nn.Linear(dim, mlp_dim),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(mlp_dim, num_classes),
+        )
+
+    def forward(self, img):
+        """
+        Forward pass of the Vision Transformer.
+
+        :param img: The input image tensor, shape (batch_size, channels, image_size, image_size).
+        :return: The output tensor, shape (batch_size, num_classes).
+        """
+        p = self.patch_size
+
+        x = (
+            img.unfold(2, p, p)
+            .unfold(3, p, p)  # now (batch, channels, H/p, W/p, p, p)
+            .reshape(img.shape[0], -1, p * p * img.shape[1])  # reshaped directly, with no permute
+        )
+        x = self.patch_to_embedding(x)
+
+        cls_tokens = self.cls_token.expand(img.shape[0], -1, -1)  # one class token per batch item
+        x = torch.cat((cls_tokens, x), dim=1)  # class token goes first on the token axis
+        x += self.pos_embedding  # in-place add, broadcast over the batch
+        x = self.dropout(x)
+
+        x = self.transformer(x)  # NOTE(review): x is (batch, tokens, dim) but the encoder is not batch_first — confirm
+
+        x = self.to_cls_token(x[:, 0])  # keep index 0 of dim 1 (where the class token was inserted)
+        return self.mlp_head(x)
diff --git a/backends/mlir/cpu/KernelBench/level3/2_ShallowWideMLP.py b/backends/mlir/cpu/KernelBench/level3/2_ShallowWideMLP.py
new file mode 100644
index 0000000..32a34d1
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/2_ShallowWideMLP.py
@@ -0,0 +1,36 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static shapes, ai_bench MLIR CPU backend
+class Model(nn.Module):
+    def __init__(self, input_size, hidden_layer_sizes, output_size):
+        """
+        :param input_size: The number of input features
+        :param hidden_layer_sizes: Sizes of the hidden layers, in order; each gets a Linear followed by ReLU
+        :param output_size: The number of output features (final Linear, no activation)
+        """
+        super(Model, self).__init__()
+
+        layers = []
+        current_input_size = input_size  # input width of the next Linear to be added
+
+        for hidden_size in hidden_layer_sizes:
+            layers.append(nn.Linear(current_input_size, hidden_size))
+            layers.append(nn.ReLU())
+            current_input_size = hidden_size
+
+        layers.append(nn.Linear(current_input_size, output_size))  # output layer, no ReLU
+
+        self.network = nn.Sequential(*layers)
+
+    def forward(self, x):
+        """
+        :param x: The input tensor, shape (batch_size, input_size)
+        :return: The output tensor, shape (batch_size, output_size)
+        """
+        return self.network(x)
diff --git a/backends/mlir/cpu/KernelBench/level3/31_VisionAttention.py b/backends/mlir/cpu/KernelBench/level3/31_VisionAttention.py
new file mode 100644
index 0000000..537d715
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/31_VisionAttention.py
@@ -0,0 +1,32 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static shapes, ai_bench MLIR CPU backend
+class Model(nn.Module):
+    def __init__(self, embed_dim, num_heads):
+        """
+        Attention Block using Multihead Self-Attention.
+        :param embed_dim: Embedding dimension (must equal the channel count C of the input)
+        :param num_heads: Number of attention heads
+        """
+        super(Model, self).__init__()
+        self.attn = nn.MultiheadAttention(embed_dim, num_heads)
+        self.norm = nn.LayerNorm(embed_dim)
+
+    def forward(self, x):
+        """
+        Self-attention over the H * W spatial positions, then residual add and LayerNorm.
+        :param x: Input tensor of shape (B, C, H, W)
+        :return: Output tensor of the same shape (B, C, H, W)
+        """
+        B, C, H, W = x.shape
+        x = x.view(B, C, H * W).permute(2, 0, 1)  # (seq_len, batch_size, embed_dim)
+        attn_output, _ = self.attn(x, x, x)  # query = key = value; attention weights are discarded
+        x = self.norm(attn_output + x)  # (seq_len, batch_size, embed_dim)
+        x = x.permute(1, 2, 0).view(B, C, H, W)  # back to (B, C, H, W)
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level3/33_VanillaRNN.py b/backends/mlir/cpu/KernelBench/level3/33_VanillaRNN.py
new file mode 100644
index 0000000..5baab18
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/33_VanillaRNN.py
@@ -0,0 +1,49 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static shapes, ai_bench MLIR CPU backend
+class Model(nn.Module):
+    def __init__(self, input_size: int, hidden_size: int, output_size: int):
+        """
+        Initialize the Vanilla RNN model.
+
+        :param input_size: The number of input features (int).
+        :param hidden_size: The size of the hidden state (int).
+        :param output_size: The number of output features (int).
+        """
+        super(Model, self).__init__()
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.output_size = output_size
+        self.hidden = torch.randn((batch_size, hidden_size))  # batch_size is the module-level global defined after this class
+
+        # Define the RNN cell components (concatenated input+hidden to hidden, and hidden to output)
+        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)  # Input to hidden
+        self.h2o = nn.Linear(hidden_size, output_size)  # Hidden to output
+        self.tanh = nn.Tanh()  # Activation function for hidden state
+
+    def forward(self, x: torch.Tensor, initial_hidden=None) -> torch.Tensor:
+        """
+        Run one RNN step; the hidden state is kept on the module between calls.
+
+        :param x: Input tensor of shape (batch_size, input_size).
+        :param initial_hidden: Optional tensor of shape (batch_size, hidden_size) copied into self.hidden first.
+        :return: Output tensor of shape (batch_size, output_size); the new hidden state is stored in self.hidden.
+        """
+        if initial_hidden is not None:
+            self.hidden.copy_(initial_hidden)  # in-place overwrite of the stored state
+        self.hidden = self.hidden.to(x.device)
+        combined = torch.cat(
+            (x, self.hidden), dim=1
+        )  # Concatenate input and hidden state
+        self.hidden = self.tanh(self.i2h(combined))  # Update hidden state
+        output = self.h2o(self.hidden)  # Compute output
+        return output
+
+
+batch_size = 256  # read by Model.__init__ to size the initial hidden state
diff --git a/backends/mlir/cpu/KernelBench/level3/34_VanillaRNNHidden.py b/backends/mlir/cpu/KernelBench/level3/34_VanillaRNNHidden.py
new file mode 100644
index 0000000..13ddb88
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/34_VanillaRNNHidden.py
@@ -0,0 +1,49 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static shapes, ai_bench MLIR CPU backend
+class Model(nn.Module):
+    def __init__(self, input_size: int, hidden_size: int, output_size: int):
+        """
+        Initialize the Vanilla RNN model.
+
+        :param input_size: The number of input features (int).
+        :param hidden_size: The size of the hidden state (int).
+        :param output_size: The number of output features (int).
+        """
+        super(Model, self).__init__()
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.output_size = output_size
+
+        # Define the RNN cell components (concatenated input+hidden to hidden, and hidden to output)
+        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)  # Input to hidden
+        self.h2o = nn.Linear(hidden_size, output_size)  # Hidden to output
+        self.tanh = nn.Tanh()  # Activation function for hidden state
+
+    def forward(self, x: torch.Tensor, h0: torch.Tensor) -> torch.Tensor:
+        """
+        Unroll the RNN cell over every time step of x.
+
+        :param x: Input tensor of shape (seq_len, batch_size, input_size)
+        :param h0: Initial hidden state tensor of shape (batch_size, hidden_size)
+        :return: Output tensor of shape (seq_len, batch_size, output_size)
+        """
+        seq_len, batch_size, _ = x.size()
+        hidden = h0.to(x.device)
+        outputs = []  # one (batch_size, output_size) tensor per time step
+
+        for t in range(seq_len):
+            combined = torch.cat(
+                (x[t], hidden), dim=1
+            )  # Concatenate input and hidden state
+            hidden = self.tanh(self.i2h(combined))  # Update hidden state
+            output = self.h2o(hidden)  # Compute output
+            outputs.append(output)
+
+        return torch.stack(outputs, dim=0)  # (seq_len, batch_size, output_size)
diff --git a/backends/mlir/cpu/KernelBench/level3/35_LSTM.py b/backends/mlir/cpu/KernelBench/level3/35_LSTM.py
new file mode 100644
index 0000000..85d1ad5
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/35_LSTM.py
@@ -0,0 +1,57 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)  # static shapes, ai_bench MLIR CPU backend
+class Model(nn.Module):
+    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.0):
+        """
+        Initialize the LSTM model.
+
+        :param input_size: The number of expected features in the input `x`
+        :param hidden_size: The number of features in the hidden state `h`
+        :param num_layers: Number of recurrent layers
+        :param output_size: The number of output features
+        :param dropout: If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer
+        """
+        super(Model, self).__init__()
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.lstm = nn.LSTM(
+            input_size,
+            hidden_size,
+            num_layers,
+            batch_first=True,  # inputs and outputs are (batch, seq, feature)
+            dropout=dropout,
+            bidirectional=False,
+        )
+        self.fc = nn.Linear(hidden_size, output_size)
+
+    def forward(self, x, h0=None, c0=None):
+        """
+        Forward pass through the LSTM model.
+
+        :param x: The input tensor, shape (batch_size, sequence_length, input_size)
+        :param h0: Optional initial hidden state (num_layers, batch_size, hidden_size); random if omitted
+        :param c0: Optional initial cell state (num_layers, batch_size, hidden_size); random if omitted
+        :return: The output tensor, shape (batch_size, output_size)
+        """
+        batch_size = x.size(0)
+
+        if h0 is None:  # a fresh random state is drawn on every call
+            h0 = torch.randn(
+                self.num_layers, batch_size, self.hidden_size, device=x.device
+            )
+        if c0 is None:  # a fresh random state is drawn on every call
+            c0 = torch.randn(
+                self.num_layers, batch_size, self.hidden_size, device=x.device
+            )
+
+        out, _ = self.lstm(x, (h0, c0))  # out: (batch_size, seq_length, hidden_size)
+        out = self.fc(out[:, -1, :])  # last time step only -> (batch_size, output_size)
+
+        return out
diff --git a/backends/mlir/cpu/KernelBench/level3/36_LSTMHn.py b/backends/mlir/cpu/KernelBench/level3/36_LSTMHn.py
new file mode 100644
index 0000000..bc33b52
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/36_LSTMHn.py
@@ -0,0 +1,49 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.0):
+        """
+        Initialize the LSTM model.
+
+        :param input_size: The number of expected features in the input `x`
+        :param hidden_size: The number of features in the hidden state `h`
+        :param num_layers: Number of recurrent layers
+        :param output_size: The number of output features
+        :param dropout: If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to `dropout`
+        """
+        super(Model, self).__init__()
+        # Stacked, unidirectional LSTM taking batch-first input; no state is created here
+        self.lstm = nn.LSTM(
+            input_size,
+            hidden_size,
+            num_layers,
+            batch_first=True,
+            dropout=dropout,
+            bidirectional=False,
+        )
+        self.fc = nn.Linear(hidden_size, output_size)
+
+    def forward(self, x, h0, c0):
+        """
+        Run the LSTM from the given initial state and return the final hidden state.
+
+        :param x: The input tensor, shape (batch_size, sequence_length, input_size)
+        :param h0: Initial hidden state, shape (num_layers, batch_size, hidden_size)
+        :param c0: Initial cell state, shape (num_layers, batch_size, hidden_size)
+        :return: state[0], i.e. h_n from nn.LSTM, shape (num_layers, batch_size, hidden_size)
+        """
+        out, state = self.lstm(
+            x, (h0, c0)
+        )  # out: tensor of shape (batch_size, seq_length, hidden_size)
+
+        # NOTE(review): the projection below is computed but its result is never returned
+        out = self.fc(out[:, -1, :])  # out: tensor of shape (batch_size, output_size)
+
+        return state[0]
diff --git a/backends/mlir/cpu/KernelBench/level3/37_LSTMCn.py b/backends/mlir/cpu/KernelBench/level3/37_LSTMCn.py
new file mode 100644
index 0000000..f94d39a
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/37_LSTMCn.py
@@ -0,0 +1,49 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.0):
+        """
+        Initialize the LSTM model.
+
+        :param input_size: The number of expected features in the input `x`
+        :param hidden_size: The number of features in the hidden state `h`
+        :param num_layers: Number of recurrent layers
+        :param output_size: The number of output features
+        :param dropout: If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to `dropout`
+        """
+        super(Model, self).__init__()
+        # Stacked, unidirectional LSTM taking batch-first input; no state is created here
+        self.lstm = nn.LSTM(
+            input_size,
+            hidden_size,
+            num_layers,
+            batch_first=True,
+            dropout=dropout,
+            bidirectional=False,
+        )
+        self.fc = nn.Linear(hidden_size, output_size)
+
+    def forward(self, x, h0, c0):
+        """
+        Run the LSTM from the given initial state and return the final cell state.
+
+        :param x: The input tensor, shape (batch_size, sequence_length, input_size)
+        :param h0: Initial hidden state, shape (num_layers, batch_size, hidden_size)
+        :param c0: Initial cell state, shape (num_layers, batch_size, hidden_size)
+        :return: state[1], i.e. c_n from nn.LSTM, shape (num_layers, batch_size, hidden_size)
+        """
+        out, state = self.lstm(
+            x, (h0, c0)
+        )  # out: tensor of shape (batch_size, seq_length, hidden_size)
+
+        # NOTE(review): the projection below is computed but its result is never returned
+        out = self.fc(out[:, -1, :])  # out: tensor of shape (batch_size, output_size)
+
+        return state[1]
diff --git a/backends/mlir/cpu/KernelBench/level3/38_LSTMBidirectional.py b/backends/mlir/cpu/KernelBench/level3/38_LSTMBidirectional.py
new file mode 100644
index 0000000..39c7746
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/38_LSTMBidirectional.py
@@ -0,0 +1,48 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.0):
+        """
+        Initialize the LSTM model.
+
+        :param input_size: The number of expected features in the input `x`
+        :param hidden_size: The number of features in the hidden state `h`
+        :param num_layers: Number of recurrent layers
+        :param output_size: The number of output features
+        :param dropout: If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to `dropout`
+        """
+        super(Model, self).__init__()
+        # Bidirectional LSTM: its output is 2 * hidden_size wide, which is what fc consumes
+        self.lstm = nn.LSTM(
+            input_size,
+            hidden_size,
+            num_layers,
+            batch_first=True,
+            dropout=dropout,
+            bidirectional=True,
+        )
+        self.fc = nn.Linear(hidden_size * 2, output_size)
+
+    def forward(self, x, h0, c0):
+        """
+        Run the bidirectional LSTM and project the last time step of its output.
+
+        :param x: The input tensor, shape (batch_size, sequence_length, input_size)
+        :param h0: Initial hidden state, shape (num_layers * 2, batch_size, hidden_size)
+        :param c0: Initial cell state, shape (num_layers * 2, batch_size, hidden_size)
+        :return: The output tensor, shape (batch_size, output_size)
+        """
+        out, hn = self.lstm(
+            x, (h0, c0)
+        )  # out: (batch_size, seq_length, hidden_size * 2); hn is the (h_n, c_n) pair
+
+        out = self.fc(out[:, -1, :])  # out: tensor of shape (batch_size, output_size)
+
+        return out
diff --git a/backends/mlir/cpu/KernelBench/level3/39_GRU.py b/backends/mlir/cpu/KernelBench/level3/39_GRU.py
new file mode 100644
index 0000000..8631126
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/39_GRU.py
@@ -0,0 +1,42 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    def __init__(
+        self, input_size, hidden_size, num_layers=3, bias=True, batch_first=False
+    ):
+        """
+        :param input_size: The number of expected features in the input x
+        :param hidden_size: The number of features in the hidden state h
+        :param num_layers: Number of recurrent layers (default: 3)
+        :param bias: If False, then the layer does not use bias weights b_ih and b_hh (default: True)
+        :param batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) (default: False)
+        """
+        super(Model, self).__init__()
+
+        self.gru = nn.GRU(
+            input_size,
+            hidden_size,
+            num_layers,
+            bias,
+            batch_first,
+            dropout=0,
+            bidirectional=False,
+        )
+
+    def forward(self, x, h0):
+        """
+        Run the GRU over x starting from h0 and return only the per-step output.
+
+        :param x: The input tensor, shape (seq_len, batch_size, input_size) if batch_first=False, otherwise (batch_size, seq_len, input_size)
+        :param h0: The initial hidden state for the input sequence, shape (num_layers, batch_size, hidden_size)
+        :return: output, the output features (h_t) from the last layer of the GRU, for each t, shape (seq_len, batch_size, hidden_size) if batch_first=False, otherwise (batch_size, seq_len, hidden_size); h_n is discarded
+        """
+        output, h_n = self.gru(x, h0)
+        return output
diff --git a/backends/mlir/cpu/KernelBench/level3/3_DeepNarrowMLP.py b/backends/mlir/cpu/KernelBench/level3/3_DeepNarrowMLP.py
new file mode 100644
index 0000000..32a34d1
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/3_DeepNarrowMLP.py
@@ -0,0 +1,36 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    def __init__(self, input_size, hidden_layer_sizes, output_size):
+        """
+        Build an MLP: one Linear + ReLU pair per hidden size, then a final Linear.
+
+        :param input_size: The number of input features
+        :param hidden_layer_sizes: A list of ints containing the sizes of each hidden layer
+        :param output_size: The number of output features
+        """
+        super(Model, self).__init__()
+        layers = []
+        current_input_size = input_size
+
+        for hidden_size in hidden_layer_sizes:
+            layers.append(nn.Linear(current_input_size, hidden_size))
+            layers.append(nn.ReLU())
+            current_input_size = hidden_size
+
+        layers.append(nn.Linear(current_input_size, output_size))
+        self.network = nn.Sequential(*layers)
+
+    def forward(self, x):
+        """
+        :param x: The input tensor, shape (batch_size, input_size)
+        :return: The output tensor, shape (batch_size, output_size)
+        """
+        return self.network(x)
diff --git a/backends/mlir/cpu/KernelBench/level3/40_GRUHidden.py b/backends/mlir/cpu/KernelBench/level3/40_GRUHidden.py
new file mode 100644
index 0000000..34aae11
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/40_GRUHidden.py
@@ -0,0 +1,42 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    def __init__(
+        self, input_size, hidden_size, num_layers=3, bias=True, batch_first=False
+    ):
+        """
+        :param input_size: The number of expected features in the input x
+        :param hidden_size: The number of features in the hidden state h
+        :param num_layers: Number of recurrent layers (default: 3)
+        :param bias: If False, then the layer does not use bias weights b_ih and b_hh (default: True)
+        :param batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) (default: False)
+        """
+        super(Model, self).__init__()
+
+        self.gru = nn.GRU(
+            input_size,
+            hidden_size,
+            num_layers,
+            bias,
+            batch_first,
+            dropout=0,
+            bidirectional=False,
+        )
+
+    def forward(self, x, h0):
+        """
+        Run the GRU over x starting from h0 and return only the final hidden state.
+
+        :param x: The input tensor, shape (seq_len, batch_size, input_size) if batch_first=False, otherwise (batch_size, seq_len, input_size)
+        :param h0: The initial hidden state for the input sequence, shape (num_layers, batch_size, hidden_size)
+        :return: h_n, the hidden state for t = seq_len, shape (num_layers, batch_size, hidden_size); the per-step output is discarded
+        """
+        output, h_n = self.gru(x, h0)
+        return h_n
diff --git a/backends/mlir/cpu/KernelBench/level3/41_GRUBidirectional.py b/backends/mlir/cpu/KernelBench/level3/41_GRUBidirectional.py
new file mode 100644
index 0000000..cf17962
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/41_GRUBidirectional.py
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    def __init__(
+        self, input_size, hidden_size, num_layers=3, bias=True, batch_first=False
+    ):
+        """
+        :param input_size: The number of expected features in the input x
+        :param hidden_size: The number of features in the hidden state h
+        :param num_layers: Number of recurrent layers (default: 3)
+        :param bias: If False, then the layer does not use bias weights b_ih and b_hh (default: True)
+        :param batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) (default: False)
+        """
+        super(Model, self).__init__()
+        self.gru = nn.GRU(
+            input_size,
+            hidden_size,
+            num_layers,
+            bias,
+            batch_first,
+            dropout=0,
+            bidirectional=True,
+        )
+        # NOTE(review): forward() never reads self.h0, and sizing it relies on the module-level batch_size
+        self.h0 = torch.randn((num_layers * 2, batch_size, hidden_size))
+
+    def forward(self, x, h0):
+        """
+        Run the bidirectional GRU over x starting from h0 and return only the per-step output.
+
+        :param x: The input tensor, shape (seq_len, batch_size, input_size) if batch_first=False, otherwise (batch_size, seq_len, input_size)
+        :param h0: The initial hidden state for the input sequence, shape (num_layers * 2, batch_size, hidden_size)
+        :return: output, the output features (h_t) from the last layer of the GRU, for each t, shape (seq_len, batch_size, 2 * hidden_size) if batch_first=False, otherwise (batch_size, seq_len, 2 * hidden_size); h_n is discarded
+        """
+        output, h_n = self.gru(x, h0)
+        return output
+
+
+batch_size = 10
diff --git a/backends/mlir/cpu/KernelBench/level3/42_GRUBidirectionalHidden.py b/backends/mlir/cpu/KernelBench/level3/42_GRUBidirectionalHidden.py
new file mode 100644
index 0000000..6e0b0b9
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/42_GRUBidirectionalHidden.py
@@ -0,0 +1,42 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    def __init__(
+        self, input_size, hidden_size, num_layers=3, bias=True, batch_first=False
+    ):
+        """
+        :param input_size: The number of expected features in the input x
+        :param hidden_size: The number of features in the hidden state h
+        :param num_layers: Number of recurrent layers (default: 3)
+        :param bias: If False, then the layer does not use bias weights b_ih and b_hh (default: True)
+        :param batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) (default: False)
+        """
+        super(Model, self).__init__()
+
+        self.gru = nn.GRU(
+            input_size,
+            hidden_size,
+            num_layers,
+            bias,
+            batch_first,
+            dropout=0,
+            bidirectional=True,
+        )
+
+    def forward(self, x, h0):
+        """
+        Run the bidirectional GRU over x starting from h0 and return only the final hidden state.
+
+        :param x: The input tensor, shape (seq_len, batch_size, input_size) if batch_first=False, otherwise (batch_size, seq_len, input_size)
+        :param h0: The initial hidden state for the input sequence, shape (num_layers * 2, batch_size, hidden_size)
+        :return: h_n, the hidden state for t = seq_len, shape (num_layers * 2, batch_size, hidden_size); the per-step output is discarded
+        """
+        output, h_n = self.gru(x, h0)
+        return h_n
diff --git a/backends/mlir/cpu/KernelBench/level3/43_MinGPTCausalAttention.py b/backends/mlir/cpu/KernelBench/level3/43_MinGPTCausalAttention.py
new file mode 100644
index 0000000..61f89a3
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/43_MinGPTCausalAttention.py
@@ -0,0 +1,70 @@
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+# From https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    A vanilla multi-head masked self-attention layer with a projection at the end.
+    It is possible to use torch.nn.MultiheadAttention here but I am including an
+    explicit implementation here to show that there is nothing too scary here.
+    """
+
+    def __init__(self, n_embd, n_head, attn_pdrop, resid_pdrop, max_seqlen):
+        super().__init__()
+        assert n_embd % n_head == 0
+        # key, query, value projections for all heads, but in a batch
+        self.c_attn = nn.Linear(n_embd, 3 * n_embd)
+        # output projection
+        self.c_proj = nn.Linear(n_embd, n_embd)
+        # regularization
+        self.attn_dropout = nn.Dropout(attn_pdrop)
+        self.resid_dropout = nn.Dropout(resid_pdrop)
+        # lower-triangular causal mask, sized for max_seqlen positions, so forward needs T <= max_seqlen
+        self.register_buffer(
+            "bias",
+            torch.tril(torch.ones(max_seqlen, max_seqlen)).view(
+                1, 1, max_seqlen, max_seqlen
+            ),
+        )
+        self.n_head = n_head
+        self.n_embd = n_embd
+
+    def forward(self, x):
+        """Apply causal multi-head self-attention to x of shape (B, T, C); returns (B, T, C)."""
+        B, T, C = (
+            x.size()
+        )  # batch size, sequence length, embedding dimensionality (n_embd)
+        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+        k = k.view(B, T, self.n_head, C // self.n_head).transpose(
+            1, 2
+        )  # (B, nh, T, hs)
+        q = q.view(B, T, self.n_head, C // self.n_head).transpose(
+            1, 2
+        )  # (B, nh, T, hs)
+        v = v.view(B, T, self.n_head, C // self.n_head).transpose(
+            1, 2
+        )  # (B, nh, T, hs)
+
+        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
+        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
+        att = F.softmax(att, dim=-1)
+        att = self.attn_dropout(att)
+        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+        y = (
+            y.transpose(1, 2).contiguous().view(B, T, C)
+        )  # re-assemble all head outputs side by side
+
+        # output projection
+        y = self.resid_dropout(self.c_proj(y))
+        return y
diff --git a/backends/mlir/cpu/KernelBench/level3/44_MiniGPTBlock.py b/backends/mlir/cpu/KernelBench/level3/44_MiniGPTBlock.py
new file mode 100644
index 0000000..6effe48
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/44_MiniGPTBlock.py
@@ -0,0 +1,120 @@
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import ai_bench.mlir
+
+# From https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
+
+
+class NewGELU(nn.Module):
+    """
+    Tanh-based form of the GELU activation, as currently used in the Google BERT
+    repo (identical to OpenAI GPT).
+    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        """
+        Return 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))), elementwise.
+
+        :param x: The input tensor
+        :return: A tensor with the same shape as x
+        """
+        cubic_term = x + 0.044715 * torch.pow(x, 3.0)
+        gate = 1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * cubic_term)
+        return 0.5 * x * gate
+
+
+class CausalSelfAttention(nn.Module):
+    """
+    A vanilla multi-head masked self-attention layer with a projection at the end.
+    It is possible to use torch.nn.MultiheadAttention here but I am including an
+    explicit implementation here to show that there is nothing too scary here.
+    """
+
+    def __init__(self, n_embd, n_head, attn_pdrop, resid_pdrop, max_seqlen):
+        super().__init__()
+        assert n_embd % n_head == 0
+        # key, query, value projections for all heads, but in a batch
+        self.c_attn = nn.Linear(n_embd, 3 * n_embd)
+        # output projection
+        self.c_proj = nn.Linear(n_embd, n_embd)
+        # regularization
+        self.attn_dropout = nn.Dropout(attn_pdrop)
+        self.resid_dropout = nn.Dropout(resid_pdrop)
+        # lower-triangular causal mask, sized for max_seqlen positions, so forward needs T <= max_seqlen
+        self.register_buffer(
+            "bias",
+            torch.tril(torch.ones(max_seqlen, max_seqlen)).view(
+                1, 1, max_seqlen, max_seqlen
+            ),
+        )
+        self.n_head = n_head
+        self.n_embd = n_embd
+
+    def forward(self, x):
+        """Apply causal multi-head self-attention to x of shape (B, T, C); returns (B, T, C)."""
+        B, T, C = (
+            x.size()
+        )  # batch size, sequence length, embedding dimensionality (n_embd)
+        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+        k = k.view(B, T, self.n_head, C // self.n_head).transpose(
+            1, 2
+        )  # (B, nh, T, hs)
+        q = q.view(B, T, self.n_head, C // self.n_head).transpose(
+            1, 2
+        )  # (B, nh, T, hs)
+        v = v.view(B, T, self.n_head, C // self.n_head).transpose(
+            1, 2
+        )  # (B, nh, T, hs)
+
+        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
+        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
+        att = F.softmax(att, dim=-1)
+        att = self.attn_dropout(att)
+        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+        y = (
+            y.transpose(1, 2).contiguous().view(B, T, C)
+        )  # re-assemble all head outputs side by side
+
+        # output projection
+        y = self.resid_dropout(self.c_proj(y))
+        return y
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """An unassuming Transformer block: LayerNorm then attention, LayerNorm then MLP, each added back to x."""
+
+    def __init__(self, n_embd, n_head, attn_pdrop, resid_pdrop, max_seqlen):
+        super().__init__()
+        self.ln_1 = nn.LayerNorm(n_embd)
+        self.attn = CausalSelfAttention(
+            n_embd, n_head, attn_pdrop, resid_pdrop, max_seqlen
+        )
+        self.ln_2 = nn.LayerNorm(n_embd)
+        self.mlp = nn.ModuleDict(
+            dict(
+                c_fc=nn.Linear(n_embd, 4 * n_embd),
+                c_proj=nn.Linear(4 * n_embd, n_embd),
+                act=NewGELU(),
+                dropout=nn.Dropout(resid_pdrop),
+            )
+        )
+        m = self.mlp
+        self.mlpf = lambda x: m.dropout(m.c_proj(m.act(m.c_fc(x))))  # MLP forward: c_fc -> act -> c_proj -> dropout
+
+    def forward(self, x):
+        x = x + self.attn(self.ln_1(x))  # attention sub-layer output added to its own input
+        x = x + self.mlpf(self.ln_2(x))  # MLP sub-layer output added to its own input
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level3/46_NetVladWithGhostClusters.py b/backends/mlir/cpu/KernelBench/level3/46_NetVladWithGhostClusters.py
new file mode 100644
index 0000000..6ab2b14
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/46_NetVladWithGhostClusters.py
@@ -0,0 +1,91 @@
+# Copyright 2018 Antoine Miech All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS-IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Code modified from here
+https://github.com/albanie/collaborative-experts/blob/master/model/net_vlad.py
+"""
+
+import math
+
+import torch
+import torch as th
+import torch.nn as nn
+import torch.nn.functional as F
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    def __init__(self, cluster_size, feature_size, ghost_clusters):
+        super(Model, self).__init__()
+        self.feature_size = feature_size
+        self.cluster_size = cluster_size
+        self.ghost_clusters = ghost_clusters
+
+        init_sc = 1 / math.sqrt(feature_size)
+        clusters = cluster_size + ghost_clusters
+
+        # The `clusters` weights are the `(w,b)` in the paper
+        self.clusters = nn.Parameter(init_sc * th.randn(feature_size, clusters))
+        self.batch_norm = nn.BatchNorm1d(clusters)
+        # The `clusters2` weights are the visual words `c_k` in the paper
+        self.clusters2 = nn.Parameter(init_sc * th.randn(1, feature_size, cluster_size))
+        self.out_dim = self.cluster_size * feature_size
+
+    def forward(self, x, mask=None):
+        """Aggregates feature maps into a fixed size representation.  In the following
+        notation, B = batch_size, N = num_features, K = num_clusters, D = feature_size.
+
+        Args:
+            x (th.Tensor): B x N x D
+            mask: Not read by this method.
+
+        Returns:
+            (th.Tensor): B x DK
+        """
+        max_sample = x.size()[1]
+        x = x.view(-1, self.feature_size)  # B x N x D -> BN x D
+
+        if x.device != self.clusters.device:
+            msg = f"x.device {x.device} != cluster.device {self.clusters.device}"
+            raise ValueError(msg)
+
+        assignment = th.matmul(x, self.clusters)  # (BN x D) x (D x (K+G)) -> BN x (K+G)
+        assignment = self.batch_norm(assignment)
+
+        assignment = F.softmax(assignment, dim=1)  # BN x (K+G) -> BN x (K+G)
+        # remove ghost assignments
+        assignment = assignment[:, : self.cluster_size]
+        assignment = assignment.view(-1, max_sample, self.cluster_size)  # -> B x N x K
+        a_sum = th.sum(assignment, dim=1, keepdim=True)  # B x N x K -> B x 1 x K
+        a = a_sum * self.clusters2
+
+        assignment = assignment.transpose(1, 2)  # B x N x K -> B x K x N
+
+        x = x.view(-1, max_sample, self.feature_size)  # BN x D -> B x N x D
+        vlad = th.matmul(assignment, x)  # (B x K x N) x (B x N x D) -> B x K x D
+        vlad = vlad.transpose(1, 2)  # -> B x D x K
+        vlad = vlad - a
+
+        # L2 intra norm (F.normalize defaults to dim=1, the D axis of B x D x K)
+        vlad = F.normalize(vlad)
+
+        # flattening + L2 norm
+        vlad = vlad.reshape(-1, self.cluster_size * self.feature_size)  # -> B x DK
+        vlad = F.normalize(vlad)
+        return vlad  # B x DK
diff --git a/backends/mlir/cpu/KernelBench/level3/47_NetVladNoGhostClusters.py b/backends/mlir/cpu/KernelBench/level3/47_NetVladNoGhostClusters.py
new file mode 100644
index 0000000..6ab2b14
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/47_NetVladNoGhostClusters.py
@@ -0,0 +1,91 @@
+# Copyright 2018 Antoine Miech All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS-IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Code modified from here
+https://github.com/albanie/collaborative-experts/blob/master/model/net_vlad.py
+"""
+
+import math
+
+import torch
+import torch as th
+import torch.nn as nn
+import torch.nn.functional as F
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    def __init__(self, cluster_size, feature_size, ghost_clusters):
+        super(Model, self).__init__()
+        self.feature_size = feature_size
+        self.cluster_size = cluster_size
+        self.ghost_clusters = ghost_clusters
+
+        init_sc = 1 / math.sqrt(feature_size)
+        clusters = cluster_size + ghost_clusters
+
+        # The `clusters` weights are the `(w,b)` in the paper
+        self.clusters = nn.Parameter(init_sc * th.randn(feature_size, clusters))
+        self.batch_norm = nn.BatchNorm1d(clusters)
+        # The `clusters2` weights are the visual words `c_k` in the paper
+        self.clusters2 = nn.Parameter(init_sc * th.randn(1, feature_size, cluster_size))
+        self.out_dim = self.cluster_size * feature_size
+
+    def forward(self, x, mask=None):
+        """Aggregates feature maps into a fixed size representation.  In the following
+        notation, B = batch_size, N = num_features, K = num_clusters, D = feature_size.
+
+        Args:
+            x (th.Tensor): B x N x D
+            mask: Not read by this method.
+
+        Returns:
+            (th.Tensor): B x DK
+        """
+        max_sample = x.size()[1]
+        x = x.view(-1, self.feature_size)  # B x N x D -> BN x D
+
+        if x.device != self.clusters.device:
+            msg = f"x.device {x.device} != cluster.device {self.clusters.device}"
+            raise ValueError(msg)
+
+        assignment = th.matmul(x, self.clusters)  # (BN x D) x (D x (K+G)) -> BN x (K+G)
+        assignment = self.batch_norm(assignment)
+
+        assignment = F.softmax(assignment, dim=1)  # BN x (K+G) -> BN x (K+G)
+        # remove ghost assignments
+        assignment = assignment[:, : self.cluster_size]
+        assignment = assignment.view(-1, max_sample, self.cluster_size)  # -> B x N x K
+        a_sum = th.sum(assignment, dim=1, keepdim=True)  # B x N x K -> B x 1 x K
+        a = a_sum * self.clusters2
+
+        assignment = assignment.transpose(1, 2)  # B x N x K -> B x K x N
+
+        x = x.view(-1, max_sample, self.feature_size)  # BN x D -> B x N x D
+        vlad = th.matmul(assignment, x)  # (B x K x N) x (B x N x D) -> B x K x D
+        vlad = vlad.transpose(1, 2)  # -> B x D x K
+        vlad = vlad - a
+
+        # L2 intra norm (F.normalize defaults to dim=1, the D axis of B x D x K)
+        vlad = F.normalize(vlad)
+
+        # flattening + L2 norm
+        vlad = vlad.reshape(-1, self.cluster_size * self.feature_size)  # -> B x DK
+        vlad = F.normalize(vlad)
+        return vlad  # B x DK
diff --git a/backends/mlir/cpu/KernelBench/level3/4_LeNet5.py b/backends/mlir/cpu/KernelBench/level3/4_LeNet5.py
new file mode 100644
index 0000000..0125db0
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/4_LeNet5.py
@@ -0,0 +1,56 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    def __init__(self, num_classes):
+        """
+        LeNet-5 architecture implementation in PyTorch.
+
+        :param num_classes: The number of output classes.
+        """
+        super(Model, self).__init__()
+
+        # Convolutional layers (no padding is requested, so each 5x5 conv trims 4 from height and width)
+        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1)
+        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1)
+
+        # Fully connected layers; 16 * 5 * 5 is the flattened conv output for a 32x32 input
+        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
+        self.fc2 = nn.Linear(in_features=120, out_features=84)
+        self.fc3 = nn.Linear(in_features=84, out_features=num_classes)
+
+    def forward(self, x):
+        """
+        Forward pass of the LeNet-5 model.
+
+        :param x: The input tensor, shape (batch_size, 1, 32, 32)
+        :return: The output tensor, shape (batch_size, num_classes)
+        """
+        # First convolutional layer with ReLU activation and max pooling: (B, 1, 32, 32) -> (B, 6, 14, 14)
+        x = F.relu(self.conv1(x))
+        x = F.max_pool2d(x, kernel_size=2, stride=2)
+
+        # Second convolutional layer with ReLU activation and max pooling: (B, 6, 14, 14) -> (B, 16, 5, 5)
+        x = F.relu(self.conv2(x))
+        x = F.max_pool2d(x, kernel_size=2, stride=2)
+
+        # Flatten the output for the fully connected layers: (B, 16, 5, 5) -> (B, 400)
+        x = x.view(-1, 16 * 5 * 5)
+
+        # First fully connected layer with ReLU activation
+        x = F.relu(self.fc1(x))
+
+        # Second fully connected layer with ReLU activation
+        x = F.relu(self.fc2(x))
+
+        # Final fully connected layer
+        x = self.fc3(x)
+
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level3/50_ReLUSelfAttention.py b/backends/mlir/cpu/KernelBench/level3/50_ReLUSelfAttention.py
new file mode 100644
index 0000000..4789b1b
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/50_ReLUSelfAttention.py
@@ -0,0 +1,88 @@
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import ai_bench.mlir
+
+# From https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
+
+
+class NewGELU(nn.Module):
+    """
+    Tanh-based GELU approximation as used in Google BERT / OpenAI GPT (https://arxiv.org/abs/1606.08415).
+    Computes 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3))) on the input tensor.
+    """
+
+    def __init__(self):
+        super(NewGELU, self).__init__()
+
+    def forward(self, x):
+        return (
+            0.5
+            * x
+            * (
+                1.0
+                + torch.tanh(
+                    math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
+                )
+            )
+        )
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    """
+    A multi-head causally masked self-attention layer that applies ReLU to the scores instead of Softmax.
+    NOTE: the output projection ``c_proj`` is constructed in ``__init__`` but ``forward`` does not
+    apply it; ``forward`` returns the re-assembled per-head outputs directly.
+    """
+
+    def __init__(self, n_embd, n_head, max_seqlen):
+        super().__init__()
+        assert n_embd % n_head == 0  # forward() gives each head n_embd // n_head channels
+        # key, query, value projections for all heads, but in a batch
+        self.c_attn = nn.Linear(n_embd, 3 * n_embd)
+        # output projection (created here, not used by forward())
+        self.c_proj = nn.Linear(n_embd, n_embd)
+        # causal lower-triangular mask, registered as buffer "bias" with shape (1, 1, max_seqlen, max_seqlen)
+        self.register_buffer(
+            "bias",
+            torch.tril(torch.ones(max_seqlen, max_seqlen)).view(
+                1, 1, max_seqlen, max_seqlen
+            ),
+        )
+        self.n_head = n_head
+        self.n_embd = n_embd
+
+    def forward(self, x):
+        B, T, C = (
+            x.size()
+        )  # batch size, sequence length, embedding dimensionality (n_embd)
+
+        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+        k = k.view(B, T, self.n_head, C // self.n_head).transpose(
+            1, 2
+        )  # (B, nh, T, hs)
+        q = q.view(B, T, self.n_head, C // self.n_head).transpose(
+            1, 2
+        )  # (B, nh, T, hs)
+        v = v.view(B, T, self.n_head, C // self.n_head).transpose(
+            1, 2
+        )  # (B, nh, T, hs)
+
+        # scaled scores (1 / sqrt(hs)): (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T); mask uses the top-left T x T of "bias"
+        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
+        att = F.relu(att)  # masked (-inf) and negative scores become 0; no softmax normalization
+
+        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+        y = (
+            y.transpose(1, 2).contiguous().view(B, T, C)
+        )  # re-assemble all head outputs side by side
+
+        return y  # shape (B, T, C); self.c_proj is not applied
diff --git a/backends/mlir/cpu/KernelBench/level3/5_AlexNet.py b/backends/mlir/cpu/KernelBench/level3/5_AlexNet.py
new file mode 100644
index 0000000..fe92f94
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/5_AlexNet.py
@@ -0,0 +1,96 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    def __init__(self, num_classes=1000):
+        """
+        :param num_classes: The number of output classes (default is 1000 for ImageNet)
+        """
+        super(Model, self).__init__()
+
+        # First convolutional layer: 11x11 conv (stride 4), in-place ReLU, 3x3 / stride-2 max pool
+        self.conv1 = nn.Conv2d(
+            in_channels=3, out_channels=96, kernel_size=11, stride=4, padding=2
+        )
+        self.relu1 = nn.ReLU(inplace=True)
+        self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2)
+
+        # Second convolutional layer: 5x5 conv (padding 2), in-place ReLU, 3x3 / stride-2 max pool
+        self.conv2 = nn.Conv2d(
+            in_channels=96, out_channels=256, kernel_size=5, padding=2
+        )
+        self.relu2 = nn.ReLU(inplace=True)
+        self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2)
+
+        # Third convolutional layer: 3x3 conv (padding 1) + in-place ReLU, no pooling
+        self.conv3 = nn.Conv2d(
+            in_channels=256, out_channels=384, kernel_size=3, padding=1
+        )
+        self.relu3 = nn.ReLU(inplace=True)
+
+        # Fourth convolutional layer: 3x3 conv (padding 1) + in-place ReLU, no pooling
+        self.conv4 = nn.Conv2d(
+            in_channels=384, out_channels=384, kernel_size=3, padding=1
+        )
+        self.relu4 = nn.ReLU(inplace=True)
+
+        # Fifth convolutional layer: 3x3 conv (padding 1), in-place ReLU, 3x3 / stride-2 max pool
+        self.conv5 = nn.Conv2d(
+            in_channels=384, out_channels=256, kernel_size=3, padding=1
+        )
+        self.relu5 = nn.ReLU(inplace=True)
+        self.maxpool3 = nn.MaxPool2d(kernel_size=3, stride=2)
+
+        # Fully connected layers; fc1 expects 256 * 6 * 6 flattened features
+        self.fc1 = nn.Linear(in_features=256 * 6 * 6, out_features=4096)
+        self.relu6 = nn.ReLU(inplace=True)
+        self.dropout1 = nn.Dropout(p=0.0)  # p=0.0, so nothing is dropped
+
+        self.fc2 = nn.Linear(in_features=4096, out_features=4096)
+        self.relu7 = nn.ReLU(inplace=True)
+        self.dropout2 = nn.Dropout(p=0.0)  # p=0.0, so nothing is dropped
+
+        self.fc3 = nn.Linear(in_features=4096, out_features=num_classes)
+
+    def forward(self, x):
+        """
+        :param x: The input tensor, shape (batch_size, 3, 224, 224)
+        :return: The output tensor, shape (batch_size, num_classes)
+        """
+        x = self.conv1(x)
+        x = self.relu1(x)
+        x = self.maxpool1(x)
+
+        x = self.conv2(x)
+        x = self.relu2(x)
+        x = self.maxpool2(x)
+
+        x = self.conv3(x)
+        x = self.relu3(x)
+
+        x = self.conv4(x)
+        x = self.relu4(x)
+
+        x = self.conv5(x)
+        x = self.relu5(x)
+        x = self.maxpool3(x)  # (B, 256, 6, 6) for the documented 224x224 input
+
+        x = torch.flatten(x, 1)  # keep the batch dim, flatten the rest for fc1
+
+        x = self.fc1(x)
+        x = self.relu6(x)
+        x = self.dropout1(x)
+
+        x = self.fc2(x)
+        x = self.relu7(x)
+        x = self.dropout2(x)
+
+        x = self.fc3(x)
+
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level3/6_GoogleNetInceptionModule.py b/backends/mlir/cpu/KernelBench/level3/6_GoogleNetInceptionModule.py
new file mode 100644
index 0000000..8068586
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/6_GoogleNetInceptionModule.py
@@ -0,0 +1,57 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    def __init__(
+        self, in_channels, out_1x1, reduce_3x3, out_3x3, reduce_5x5, out_5x5, pool_proj
+    ):
+        """
+        :param in_channels: Number of input channels
+        :param out_1x1: Number of output channels for the 1x1 convolution
+        :param reduce_3x3: Number of output channels for the 1x1 reduction before 3x3 convolution
+        :param out_3x3: Number of output channels for the 3x3 convolution
+        :param reduce_5x5: Number of output channels for the 1x1 reduction before 5x5 convolution
+        :param out_5x5: Number of output channels for the 5x5 convolution
+        :param pool_proj: Number of output channels for the pooling projection
+        """
+        super(Model, self).__init__()
+
+        # 1x1 convolution branch
+        self.branch1x1 = nn.Conv2d(in_channels, out_1x1, kernel_size=1)
+
+        # 3x3 convolution branch: 1x1 reduction, then 3x3 conv (padding=1); no activation in between
+        self.branch3x3 = nn.Sequential(
+            nn.Conv2d(in_channels, reduce_3x3, kernel_size=1),
+            nn.Conv2d(reduce_3x3, out_3x3, kernel_size=3, padding=1),
+        )
+
+        # 5x5 convolution branch: 1x1 reduction, then 5x5 conv (padding=2); no activation in between
+        self.branch5x5 = nn.Sequential(
+            nn.Conv2d(in_channels, reduce_5x5, kernel_size=1),
+            nn.Conv2d(reduce_5x5, out_5x5, kernel_size=5, padding=2),
+        )
+
+        # Max pooling branch: 3x3 max pool (stride 1, padding 1), then 1x1 projection
+        self.branch_pool = nn.Sequential(
+            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
+            nn.Conv2d(in_channels, pool_proj, kernel_size=1),
+        )
+
+    def forward(self, x):
+        """
+        :param x: Input tensor, shape (batch_size, in_channels, height, width)
+        :return: Output tensor, shape (batch_size, out_1x1 + out_3x3 + out_5x5 + pool_proj, height, width)
+        """
+        branch1x1 = self.branch1x1(x)
+        branch3x3 = self.branch3x3(x)
+        branch5x5 = self.branch5x5(x)
+        branch_pool = self.branch_pool(x)
+
+        outputs = [branch1x1, branch3x3, branch5x5, branch_pool]
+        return torch.cat(outputs, 1)  # concatenate the four branches along dim 1 (channels)
diff --git a/backends/mlir/cpu/KernelBench/level3/7_GoogleNetInceptionV1.py b/backends/mlir/cpu/KernelBench/level3/7_GoogleNetInceptionV1.py
new file mode 100644
index 0000000..39f6e90
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/7_GoogleNetInceptionV1.py
@@ -0,0 +1,120 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import ai_bench.mlir
+
+
+class InceptionModule(nn.Module):
+    def __init__(
+        self, in_channels, out_1x1, reduce_3x3, out_3x3, reduce_5x5, out_5x5, pool_proj
+    ):
+        """
+        :param in_channels: Number of input channels
+        :param out_1x1: Number of output channels for the 1x1 convolution
+        :param reduce_3x3: Number of output channels for the 1x1 reduction before 3x3 convolution
+        :param out_3x3: Number of output channels for the 3x3 convolution
+        :param reduce_5x5: Number of output channels for the 1x1 reduction before 5x5 convolution
+        :param out_5x5: Number of output channels for the 5x5 convolution
+        :param pool_proj: Number of output channels for the pooling projection
+        """
+        super(InceptionModule, self).__init__()
+
+        # 1x1 convolution branch
+        self.branch1x1 = nn.Conv2d(in_channels, out_1x1, kernel_size=1)
+
+        # 3x3 convolution branch: 1x1 reduction, then 3x3 conv (padding=1); no activation in between
+        self.branch3x3 = nn.Sequential(
+            nn.Conv2d(in_channels, reduce_3x3, kernel_size=1),
+            nn.Conv2d(reduce_3x3, out_3x3, kernel_size=3, padding=1),
+        )
+
+        # 5x5 convolution branch: 1x1 reduction, then 5x5 conv (padding=2); no activation in between
+        self.branch5x5 = nn.Sequential(
+            nn.Conv2d(in_channels, reduce_5x5, kernel_size=1),
+            nn.Conv2d(reduce_5x5, out_5x5, kernel_size=5, padding=2),
+        )
+
+        # Max pooling branch: 3x3 max pool (stride 1, padding 1), then 1x1 projection
+        self.branch_pool = nn.Sequential(
+            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
+            nn.Conv2d(in_channels, pool_proj, kernel_size=1),
+        )
+
+    def forward(self, x):
+        """
+        :param x: Input tensor, shape (batch_size, in_channels, height, width)
+        :return: Output tensor, shape (batch_size, out_1x1 + out_3x3 + out_5x5 + pool_proj, height, width)
+        """
+        branch1x1 = self.branch1x1(x)
+        branch3x3 = self.branch3x3(x)
+        branch5x5 = self.branch5x5(x)
+        branch_pool = self.branch_pool(x)
+
+        outputs = [branch1x1, branch3x3, branch5x5, branch_pool]
+        return torch.cat(outputs, 1)  # concatenate the four branches along dim 1 (channels)
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    def __init__(self, num_classes=1000):
+        """
+        :param num_classes: Number of output classes (out_features of the final linear layer)
+        """
+        super(Model, self).__init__()
+
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
+        self.maxpool1 = nn.MaxPool2d(3, stride=2, padding=1)
+        self.conv2 = nn.Conv2d(64, 64, kernel_size=1)
+        self.conv3 = nn.Conv2d(64, 192, kernel_size=3, padding=1)
+        self.maxpool2 = nn.MaxPool2d(3, stride=2, padding=1)
+
+        self.inception3a = InceptionModule(192, 64, 96, 128, 16, 32, 32)  # -> 256 channels
+        self.inception3b = InceptionModule(256, 128, 128, 192, 32, 96, 64)  # -> 480 channels
+        self.maxpool3 = nn.MaxPool2d(3, stride=2, padding=1)
+
+        self.inception4a = InceptionModule(480, 192, 96, 208, 16, 48, 64)  # -> 512 channels
+        self.inception4b = InceptionModule(512, 160, 112, 224, 24, 64, 64)  # -> 512 channels
+        self.inception4c = InceptionModule(512, 128, 128, 256, 24, 64, 64)  # -> 512 channels
+        self.inception4d = InceptionModule(512, 112, 144, 288, 32, 64, 64)  # -> 528 channels
+        self.inception4e = InceptionModule(528, 256, 160, 320, 32, 128, 128)  # -> 832 channels
+        self.maxpool4 = nn.MaxPool2d(3, stride=2, padding=1)
+
+        self.inception5a = InceptionModule(832, 256, 160, 320, 32, 128, 128)  # -> 832 channels
+        self.inception5b = InceptionModule(832, 384, 192, 384, 48, 128, 128)  # -> 1024 channels
+
+        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+        self.dropout = nn.Dropout(0.0)  # p=0.0, so nothing is dropped
+        self.fc = nn.Linear(1024, num_classes)  # 1024 = inception5b output channels after 1x1 pooling
+
+    def forward(self, x):
+        """
+        :param x: Input tensor, shape (batch_size, 3, height, width)
+        :return: Output tensor, shape (batch_size, num_classes)
+        """
+        x = self.maxpool1(F.relu(self.conv1(x)))
+        x = F.relu(self.conv2(x))
+        x = self.maxpool2(F.relu(self.conv3(x)))
+
+        x = self.inception3a(x)
+        x = self.inception3b(x)
+        x = self.maxpool3(x)
+
+        x = self.inception4a(x)
+        x = self.inception4b(x)
+        x = self.inception4c(x)
+        x = self.inception4d(x)
+        x = self.inception4e(x)
+        x = self.maxpool4(x)
+
+        x = self.inception5a(x)
+        x = self.inception5b(x)
+
+        x = self.avgpool(x)  # (B, 1024, 1, 1)
+        x = torch.flatten(x, 1)  # (B, 1024)
+        x = self.dropout(x)
+        x = self.fc(x)
+
+        return x
diff --git a/backends/mlir/cpu/KernelBench/level3/8_ResNetBasicBlock.py b/backends/mlir/cpu/KernelBench/level3/8_ResNetBasicBlock.py
new file mode 100644
index 0000000..521bcac
--- /dev/null
+++ b/backends/mlir/cpu/KernelBench/level3/8_ResNetBasicBlock.py
@@ -0,0 +1,67 @@
+import torch
+import torch.nn as nn
+
+import ai_bench.mlir
+
+
+@torch.compile(
+    dynamic=False, backend=ai_bench.mlir.cpu_backend(ai_bench.mlir.cpu_pipeline)
+)
+class Model(nn.Module):
+    expansion = 1
+
+    def __init__(self, in_channels, out_channels, stride=1):
+        """
+        :param in_channels: Number of input channels
+        :param out_channels: Number of output channels
+        :param stride: Stride for the first convolutional layer (also used by the shortcut conv)
+        :ivar downsample: 1x1 conv + BatchNorm shortcut; always constructed, not a constructor argument
+        """
+        super(Model, self).__init__()
+        self.conv1 = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            bias=False,
+        )
+        self.bn1 = nn.BatchNorm2d(out_channels)
+        self.relu = nn.ReLU(inplace=True)  # one module reused for both activations in forward()
+        self.conv2 = nn.Conv2d(
+            out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False
+        )
+        self.bn2 = nn.BatchNorm2d(out_channels)
+        self.downsample = nn.Sequential(
+            nn.Conv2d(
+                in_channels,
+                out_channels * self.expansion,
+                kernel_size=1,
+                stride=stride,
+                bias=False,
+            ),
+            nn.BatchNorm2d(out_channels * self.expansion),
+        )
+        self.stride = stride
+
+    def forward(self, x):
+        """
+        :param x: Input tensor, shape (batch_size, in_channels, height, width)
+        :return: Output tensor with out_channels channels; height/width are reduced when stride > 1
+        """
+        identity = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        if self.downsample is not None:  # always true here: __init__ builds downsample unconditionally
+            identity = self.downsample(x)
+
+        out += identity  # in-place residual add of the projected shortcut
+        out = self.relu(out)
+
+        return out