From dc4a3d5b4e7ae46e713c488f3b155fe1a47b6200 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm26340.ftw0.facebook.com>
Date: Wed, 18 Mar 2026 07:24:40 -0700
Subject: [PATCH] [ET-VK][conv2d_dw] Extract depthwise dispatch into
 Conv2dDW.cpp with device-based tile selection

Profiling showed depthwise conv2d is 5-15x slower on Mali GPUs vs Adreno due to
register pressure from the 4x2 output tile (17 vec4 registers per thread).
Benchmarking confirmed that reducing the tile to 1x1 (7 vec4 registers) gives
4-15x speedup on Mali with no regression on Adreno.

This change extracts depthwise conv2d dispatch logic from Convolution.cpp into a
new Conv2dDW.cpp (following the Conv2dPW.cpp pattern), and adds device-based
tile size selection: b1x1 on Mali, b4x2 (current default) on Adreno.

Differential Revision: [D97058158](https://our.internmc.facebook.com/intern/diff/D97058158/)

[ghstack-poisoned]
---
 .../graph/ops/glsl/conv2d_dw_output_tile.yaml |   3 +
 .../runtime/graph/ops/impl/Conv2dDW.cpp       | 327 +++++++++++++
 .../runtime/graph/ops/impl/Convolution.cpp    | 178 ++-----
 .../runtime/graph/ops/impl/Convolution.h      |  18 +
 .../test/custom_ops/impl/TestConv2dDw.cpp     | 348 ++++++++++++++
 backends/vulkan/test/custom_ops/targets.bzl   |   1 +
 .../vulkan/test/custom_ops/test_conv2d_dw.cpp | 435 ++++++++++++++++++
 7 files changed, 1158 insertions(+), 152 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Conv2dDW.cpp
 create mode 100644 backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp
 create mode 100644 backends/vulkan/test/custom_ops/test_conv2d_dw.cpp

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
index 9cf6c22c6ca..87aa86154ee 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
@@ -25,3 +25,6 @@ conv2d_dw_output_tile:
     - NAME: conv2d_dw_output_tile_5x5_clamp
       OPERATOR: clamp(X, A, B)
       TILE_SIZE: 5
+    - NAME: conv2d_dw_output_tile_3x3_b1x1
+      BATCH_SIZE_X: 1
+      BATCH_SIZE_Y: 1
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2dDW.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv2dDW.cpp
new file mode 100644
index 00000000000..a9d8483b2e2
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Conv2dDW.cpp
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Convolution.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
+
+namespace vkcompute {
+
+//
+// Weight prepack
+//
+
+ValueRef prepack_dw_weights(ComputeGraph& graph, const ValueRef vref) {
+  const auto original_sizes = graph.sizes_of(vref);
+
+  int64_t out_channels_padded =
+      utils::align_up_4(utils::val_at(-4, original_sizes));
+  int64_t height = utils::val_at(-2, original_sizes);
+  int64_t width = utils::val_at(-1, original_sizes);
+
+  const std::vector<int64_t> final_sizes = {
+      4, out_channels_padded / 4, height * width};
+
+  ValueRef v = graph.add_tensor(
+      final_sizes,
+      graph.dtype_of(vref),
+      utils::kTexture2D,
+      utils::kChannelsPacked);
+
+  std::string kernel_name = "conv2d_dw_prepack_weights";
+  add_dtype_suffix(kernel_name, graph.dtype_of(v));
+  add_dtype_suffix(kernel_name, graph.get_staging_dtype_for(vref));
+
+  const auto original_sizes_pc =
+      utils::make_ivec4(original_sizes, /*reverse = */ true);
+  graph.prepack_nodes().emplace_back(new PrepackNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      graph.create_global_wg_size(v),
+      graph.create_local_wg_size(v),
+      vref,
+      v,
+      {},
+      // Specialization constants
+      {graph.packed_dim_of(v)},
+      {graph.sizes_pc_of(v),
+       PushConstantDataInfo(&original_sizes_pc, sizeof(original_sizes_pc))}));
+
+  return v;
+}
+
+//
+// Shader selection
+//
+
+std::string pick_conv2d_dw_shader(
+    ComputeGraph& graph,
+    const ValueRef weight_data,
+    const ValueRef out,
+    const bool stride_equals_dilation,
+    const bool clamp_out) {
+  std::string kernel_name = "conv2d_dw";
+  kernel_name.reserve(kShaderNameReserve);
+
+  const auto& weight_sizes = graph.get_tref(weight_data)->sizes;
+  const bool is_3x3 = weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3;
+  const bool is_5x5 = weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5;
+
+  if (!stride_equals_dilation) {
+    kernel_name += "_sned";
+  }
+
+  if (is_3x3) {
+    kernel_name += "_output_tile_3x3";
+    if (stride_equals_dilation && graph.device_is_mali()) {
+      kernel_name += "_b1x1";
+    }
+  } else if (is_5x5) {
+    kernel_name += "_output_tile_5x5";
+  }
+
+  if (clamp_out) {
+    kernel_name += "_clamp";
+  }
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  return kernel_name;
+}
+
+//
+// Workgroup size
+//
+
+utils::uvec3 conv2d_dw_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
+  const ValueRef out = args.at(0).refs.at(0);
+
+  const bool uses_output_tile =
+      shader.kernel_name.find("_output_tile") != std::string::npos;
+
+  if (uses_output_tile) {
+    const bool is_sned = shader.kernel_name.find("_sned") != std::string::npos;
+
+    const utils::uvec3 image_extents = graph->create_global_wg_size(out);
+
+    if (is_sned) {
+      // sned output_tile shaders: no batch division, just flatten W*H
+      return {image_extents[0] * image_extents[1], image_extents[2], 1};
+    }
+
+    // stride==dilation output_tile shaders: apply batch division
+    uint32_t batch_x = 4u;
+    uint32_t batch_y = 2u;
+    if (shader.kernel_name.find("_b1x1") != std::string::npos) {
+      batch_x = 1u;
+      batch_y = 1u;
+    }
+
+    uint32_t scaled_x = utils::div_up(image_extents[0], batch_x);
+    uint32_t scaled_y = utils::div_up(image_extents[1], batch_y);
+    return {scaled_x * scaled_y, image_extents[2], 1};
+  }
+
+  // Base conv2d_dw shader: fully linearized dispatch
+  const utils::uvec3 base_extents = graph->create_global_wg_size(out);
+  return {base_extents[0] * base_extents[1] * base_extents[2], 1, 1};
+}
+
+utils::uvec3 conv2d_dw_local_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)graph;
+  (void)shader;
+  (void)global_workgroup_size;
+  (void)args;
+  (void)resize_args;
+  return {64, 1, 1};
+}
+
+//
+// Dispatch node
+//
+
+struct Conv2dDWParams final {
+  utils::ivec2 overlay_region;
+  int in_group_size;
+};
+
+struct OutputParams final {
+  float out_min;
+  float out_max;
+};
+
+void add_conv2d_dw_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef arg_weight,
+    const ValueRef arg_bias,
+    const ValueRef weight_data,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation,
+    const ValueRef out,
+    const std::string& kernel_name,
+    const Kernel2dParams& kernel_params,
+    const Conv2dDWParams& extra_params,
+    const OutputParams& out_params) {
+  vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);
+
+  vkapi::ParamsBindList param_buffers;
+  std::vector<PushConstantDataInfo> push_constants;
+
+  const bool uses_output_tile =
+      kernel_name.find("_output_tile") != std::string::npos;
+
+  if (uses_output_tile) {
+    const utils::ivec4 kernel_param_size_stride = {
+        kernel_params.kernel_size[0],
+        kernel_params.kernel_size[1],
+        kernel_params.stride[0],
+        kernel_params.stride[1]};
+
+    const utils::ivec4 kernel_param_pad_dial = {
+        kernel_params.padding[0],
+        kernel_params.padding[1],
+        kernel_params.dilation[0],
+        kernel_params.dilation[1]};
+
+    push_constants = {
+        graph.logical_limits_pc_of(out),
+        graph.sizes_pc_of(in),
+        PushConstantDataInfo(
+            &kernel_param_size_stride, sizeof(kernel_param_size_stride)),
+        PushConstantDataInfo(
+            &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)),
+        PushConstantDataInfo(
+            &extra_params, sizeof(extra_params), sizeof(utils::ivec4)),
+        PushConstantDataInfo(&out_params, sizeof(out_params)),
+    };
+  } else {
+    param_buffers = {
+        graph.logical_limits_ubo(out),
+        graph.sizes_ubo(in),
+        graph.create_params_buffer(kernel_params),
+        graph.create_params_buffer(extra_params),
+        graph.create_params_buffer(out_params),
+    };
+  }
+
+  // transposed is always false for depthwise, output_padding unused
+  ValueRef transposed_ref = graph.add_scalar(false);
+  ValueRef output_padding = graph.add_none();
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      shader,
+      conv2d_dw_global_wg_size,
+      conv2d_dw_local_wg_size,
+      // Inputs and Outputs
+      {{out, vkapi::kWrite}, {{in, arg_weight, arg_bias}, vkapi::kRead}},
+      // Shader params buffers
+      param_buffers,
+      // Push Constants
+      push_constants,
+      // Specialization Constants
+      {},
+      // Resize Args
+      {weight_data, stride, padding, dilation, transposed_ref, output_padding},
+      // Resizing Logic
+      resize_conv2d_node));
+}
+
+//
+// High level operator impl
+//
+
+void conv2d_dw_impl(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef weight_data,
+    const ValueRef bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation,
+    const ValueRef out,
+    const bool clamp_out,
+    const float out_min_val,
+    const float out_max_val) {
+  ValueRef arg_weight = prepack_dw_weights(graph, weight_data);
+  ValueRef arg_bias = prepack_biases(
+      graph,
+      bias,
+      weight_data,
+      /* transposed = */ false,
+      /* storage_type = */ utils::kTexture2D,
+      /* memory_layout = */ utils::kWidthPacked);
+
+  const std::vector<int64_t> in_sizes = graph.sizes_of(in);
+  if (in_sizes.at(0) > 1) {
+    VK_THROW("conv2d: input batch size > 1 is not supported yet!");
+  }
+
+  check_conv_args(graph, in, out);
+
+  Kernel2dParams kernel_params = create_kernel2d_params(
+      graph,
+      weight_data,
+      /*kernel_size_only = */ false,
+      stride,
+      padding,
+      dilation);
+
+  const bool stride_equals_dilation =
+      (kernel_params.stride[0] == kernel_params.dilation[0] &&
+       kernel_params.stride[1] == kernel_params.dilation[1]);
+
+  const auto& overlay_region = utils::make_ivec2({
+      kernel_params.kernel_size[0] +
+          (kernel_params.kernel_size[0] - 1) * (kernel_params.dilation[0] - 1),
+      kernel_params.kernel_size[1] +
+          (kernel_params.kernel_size[1] - 1) * (kernel_params.dilation[1] - 1),
+  });
+  const auto weight_sizes = graph.sizes_of(weight_data);
+  const int32_t in_group_size =
+      utils::safe_downcast<int32_t>(utils::align_up_4(weight_sizes.at(1)));
+  Conv2dDWParams extra_params = {overlay_region, in_group_size};
+
+  OutputParams out_params = {out_min_val, out_max_val};
+
+  std::string kernel_name = pick_conv2d_dw_shader(
+      graph, weight_data, out, stride_equals_dilation, clamp_out);
+
+  add_conv2d_dw_node(
+      graph,
+      in,
+      arg_weight,
+      arg_bias,
+      weight_data,
+      stride,
+      padding,
+      dilation,
+      out,
+      kernel_name,
+      kernel_params,
+      extra_params,
+      out_params);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index f5dd576ba54..077ce285cfc 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -139,18 +139,6 @@ vkapi::ShaderInfo get_conv2d_shader(
   switch (method) {
     case Conv2dMethod::Depthwise:
       kernel_name = "conv2d_dw";
-      if (!prepack_weights) {
-        if (!stride_equals_dilation) {
-          kernel_name += "_sned";
-        }
-        const auto& weight_sizes = graph.get_tref(weight)->sizes;
-        if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) {
-          kernel_name += "_output_tile_3x3";
-        }
-        if (weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5) {
-          kernel_name += "_output_tile_5x5";
-        }
-      }
       break;
     case Conv2dMethod::Pointwise:
       if (prepack_weights) {
@@ -296,17 +284,6 @@ Conv2dMethod get_conv2d_method(
   return Conv2dMethod::SlidingWindow;
 }
 
-utils::uvec2 get_conv2d_dw_dispatch_divisor(
-    const std::vector<int64_t>& weight_sizes) {
-  if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) {
-    return {4u, 2u};
-  }
-  if (weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5) {
-    return {4u, 2u};
-  }
-  return {4u, 2u};
-}
-
 utils::uvec3 create_conv2d_global_wg_size(
     ComputeGraph& graph,
     const Conv2dMethod method,
@@ -319,14 +296,6 @@ utils::uvec3 create_conv2d_global_wg_size(
         utils::div_up(image_extents[0u], 1u),
         utils::div_up(image_extents[1u], 4u),
         image_extents[2u]};
-  } else if (method == Conv2dMethod::Depthwise && stride_equals_dilation) {
-    const utils::uvec3 image_extents = graph.create_global_wg_size(out);
-    const utils::uvec2 div =
-        get_conv2d_dw_dispatch_divisor(graph.get_tref(weight_data)->sizes);
-    return {
-        utils::div_up(image_extents[0], div[0]),
-        utils::div_up(image_extents[1], div[1]),
-        image_extents[2]};
   } else {
     return graph.create_global_wg_size(out);
   }
@@ -343,10 +312,7 @@ utils::uvec3 conv2d_global_wg_size(
 
   // Determine method from shader name
   Conv2dMethod method;
-  if (shader.kernel_name.find("conv2d_dw") != std::string::npos) {
-    method = Conv2dMethod::Depthwise;
-  } else if (
-      shader.kernel_name.find("conv2d_pw") != std::string::npos ||
+  if (shader.kernel_name.find("conv2d_pw") != std::string::npos ||
       (shader.kernel_name.find("conv2d") != std::string::npos &&
        shader.kernel_name.find("conv_transpose2d") == std::string::npos)) {
     // Check if it's pointwise by examining weight sizes
@@ -369,21 +335,7 @@ utils::uvec3 conv2d_global_wg_size(
   utils::uvec3 wg_size = create_conv2d_global_wg_size(
       *graph, method, out, weight_data, stride_equals_dilation);
 
-  if (method == Conv2dMethod::Depthwise) {
-    // The output_tile shaders (conv2d_dw_output_tile,
-    // conv2d_dw_sned_output_tile) use a 2D dispatch: (x_tile, y_tile) packed
-    // into glb_x, channel in glb_y. The base conv2d_dw shader uses a 1D
-    // dispatch: all (x, y, channel) packed into glb_x. For the base shader, we
-    // must use {W*H*C_packed, 1, 1}.
-    const bool uses_output_tile =
-        shader.kernel_name.find("_output_tile") != std::string::npos;
-    if (uses_output_tile) {
-      wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1};
-    } else {
-      const utils::uvec3 base_extents = graph->create_global_wg_size(out);
-      wg_size = {base_extents[0] * base_extents[1] * base_extents[2], 1, 1};
-    }
-  } else if (method == Conv2dMethod::Pointwise) {
+  if (method == Conv2dMethod::Pointwise) {
     wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1};
 
     if (shader.kernel_name.find("s1p0") != std::string::npos) {
@@ -406,10 +358,7 @@ utils::uvec3 conv2d_local_wg_size(
 
   // Determine method from shader name
   Conv2dMethod method;
-  if (shader.kernel_name.find("conv2d_dw") != std::string::npos) {
-    method = Conv2dMethod::Depthwise;
-  } else if (
-      shader.kernel_name.find("conv2d_pw") != std::string::npos ||
+  if (shader.kernel_name.find("conv2d_pw") != std::string::npos ||
       (shader.kernel_name.find("conv2d") != std::string::npos &&
        shader.kernel_name.find("conv_transpose2d") == std::string::npos)) {
     method = Conv2dMethod::Pointwise;
@@ -427,8 +376,6 @@ utils::uvec3 conv2d_local_wg_size(
       local_wg_size_y = 2;
     }
     return {64 / local_wg_size_y, local_wg_size_y, 1};
-  } else if (method == Conv2dMethod::Depthwise) {
-    return {64, 1, 1};
   } else {
     return graph->create_local_wg_size(global_workgroup_size);
   }
@@ -499,6 +446,21 @@ void add_conv2d_node(
         out_max_val);
   }
 
+  if (method == Conv2dMethod::Depthwise) {
+    return conv2d_dw_impl(
+        graph,
+        in,
+        weight_data,
+        bias,
+        stride,
+        padding,
+        dilation,
+        out,
+        clamp_out,
+        out_min_val,
+        out_max_val);
+  }
+
   ValueRef arg_weight = prepack_weights(graph, weight_data, method);
   ValueRef arg_bias = prepack_biases(
       graph,
@@ -547,101 +509,13 @@ void add_conv2d_node(
       stride_equals_dilation,
       stride_1_padding_0);
 
-  utils::uvec3 wg_size = create_conv2d_global_wg_size(
-      graph, method, out, weight_data, stride_equals_dilation);
-
-  utils::uvec3 local_wg_size;
-  if (method == Conv2dMethod::Depthwise || method == Conv2dMethod::Pointwise) {
-    wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1};
-  }
-
-  if (method == Conv2dMethod::Pointwise) {
-    uint32_t local_wg_size_y = 1;
-    if (wg_size[1] % 8 == 0) {
-      local_wg_size_y = 8;
-    } else if (wg_size[1] % 4 == 0) {
-      local_wg_size_y = 4;
-    } else if (wg_size[1] % 2 == 0) {
-      local_wg_size_y = 2;
-    }
-    local_wg_size = {64 / local_wg_size_y, local_wg_size_y, 1};
-  } else if (method == Conv2dMethod::Depthwise) {
-    local_wg_size = {64, 1, 1};
-  } else {
-    local_wg_size = graph.create_local_wg_size(wg_size);
-  }
-
-  vkapi::ParamsBindList param_buffers;
-  std::vector<PushConstantDataInfo> push_constants;
-  if (method == Conv2dMethod::Pointwise) {
-    const utils::ivec4 kernel_param_stride_pad = {
-        kernel_params.stride[0],
-        kernel_params.stride[1],
-        kernel_params.padding[0],
-        kernel_params.padding[1],
-    };
-
-    struct Conv2dPWParams final {
-      int in_group_size;
-      int dummy_padding;
-      OutputParams out_params;
-    } param{extra_params.in_group_size, 0, out_params};
-
-    push_constants = {
-        graph.logical_limits_pc_of(out),
-        PushConstantDataInfo(
-            &kernel_param_stride_pad, sizeof(kernel_param_stride_pad)),
-        PushConstantDataInfo(&param, sizeof(param)),
-    };
-  } else if (method == Conv2dMethod::Depthwise) {
-    // output_tile variants use push constants; the base conv2d_dw shader uses
-    // UBOs. Distinguish by checking if "_output_tile" is in the shader name.
-    const bool uses_output_tile =
-        shader.kernel_name.find("_output_tile") != std::string::npos;
-
-    if (uses_output_tile) {
-      const utils::ivec4 kernel_param_size_stride = {
-          kernel_params.kernel_size[0],
-          kernel_params.kernel_size[1],
-          kernel_params.stride[0],
-          kernel_params.stride[1]};
-
-      const utils::ivec4 kernel_param_pad_dial = {
-          kernel_params.padding[0],
-          kernel_params.padding[1],
-          kernel_params.dilation[0],
-          kernel_params.dilation[1]};
-
-      push_constants = {
-          graph.logical_limits_pc_of(out),
-          graph.sizes_pc_of(in),
-          PushConstantDataInfo(
-              &kernel_param_size_stride, sizeof(kernel_param_size_stride)),
-          PushConstantDataInfo(
-              &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)),
-          PushConstantDataInfo(
-              &extra_params, sizeof(extra_params), sizeof(utils::ivec4)),
-          PushConstantDataInfo(&out_params, sizeof(out_params)),
-      };
-    } else {
-      // Base conv2d_dw shader uses UBOs, same as SlidingWindow case
-      param_buffers = {
-          graph.logical_limits_ubo(out),
-          graph.sizes_ubo(in),
-          graph.create_params_buffer(kernel_params),
-          graph.create_params_buffer(extra_params),
-          graph.create_params_buffer(out_params),
-      };
-    }
-  } else {
-    param_buffers = {
-        graph.logical_limits_ubo(out),
-        graph.sizes_ubo(in),
-        graph.create_params_buffer(kernel_params),
-        graph.create_params_buffer(extra_params),
-        graph.create_params_buffer(out_params),
-    };
-  }
+  vkapi::ParamsBindList param_buffers = {
+      graph.logical_limits_ubo(out),
+      graph.sizes_ubo(in),
+      graph.create_params_buffer(kernel_params),
+      graph.create_params_buffer(extra_params),
+      graph.create_params_buffer(out_params),
+  };
 
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
@@ -653,7 +527,7 @@ void add_conv2d_node(
       // Shader params buffers
       param_buffers,
       // Push Constants
-      push_constants,
+      {},
       // Specialization Constants
       {utils::safe_downcast<int32_t>(groups_val)},
       // Resize Args
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.h b/backends/vulkan/runtime/graph/ops/impl/Convolution.h
index f1768a89875..f49e7efcfe7 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.h
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.h
@@ -38,4 +38,22 @@ void conv2d_pw_impl(
     const float out_min_val,
     const float out_max_val);
 
+void conv2d_dw_impl(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef weight_data,
+    const ValueRef bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation,
+    const ValueRef out,
+    const bool clamp_out,
+    const float out_min_val,
+    const float out_max_val);
+
+void resize_conv2d_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args);
+
 } // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp b/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp
new file mode 100644
index 00000000000..f7454b6b93a
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Convolution.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+//
+// Local copies of Conv2dDW internals, extended with impl_selector support.
+// These mirror the logic in Conv2dDW.cpp but allow forcing a specific tile size
+// variant via the impl_selector string.
+//
+
+struct Conv2dDWParams final {
+  utils::ivec2 overlay_region;
+  int in_group_size;
+};
+
+struct OutputParams final {
+  float out_min;
+  float out_max;
+};
+
+static std::string pick_conv2d_dw_shader_with_selector(
+    ComputeGraph& graph,
+    const ValueRef weight_data,
+    const ValueRef out,
+    const bool stride_equals_dilation,
+    const bool clamp_out,
+    const std::string& impl_selector) {
+  std::string kernel_name = "conv2d_dw";
+  kernel_name.reserve(40);
+
+  const auto& weight_sizes = graph.get_tref(weight_data)->sizes;
+  const bool is_3x3 = weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3;
+  const bool is_5x5 = weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5;
+
+  if (!stride_equals_dilation) {
+    kernel_name += "_sned";
+  }
+
+  if (is_3x3) {
+    kernel_name += "_output_tile_3x3";
+    if (impl_selector == "b1x1") {
+      kernel_name += "_b1x1";
+    } else if (impl_selector == "b4x2") {
+      // b4x2 is the default (no suffix)
+    } else {
+      // Auto-selection: use b1x1 on Mali
+      if (stride_equals_dilation && graph.device_is_mali()) {
+        kernel_name += "_b1x1";
+      }
+    }
+  } else if (is_5x5) {
+    kernel_name += "_output_tile_5x5";
+    // No b1x1 variant for 5x5; impl_selector is ignored for batch size
+  }
+
+  if (clamp_out) {
+    kernel_name += "_clamp";
+  }
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  return kernel_name;
+}
+
+static utils::uvec3 conv2d_dw_global_wg_size_fn(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
+  const ValueRef out = args.at(0).refs.at(0);
+
+  const bool uses_output_tile =
+      shader.kernel_name.find("_output_tile") != std::string::npos;
+
+  if (uses_output_tile) {
+    const bool is_sned = shader.kernel_name.find("_sned") != std::string::npos;
+    const utils::uvec3 image_extents = graph->create_global_wg_size(out);
+
+    if (is_sned) {
+      return {image_extents[0] * image_extents[1], image_extents[2], 1};
+    }
+
+    uint32_t batch_x = 4u;
+    uint32_t batch_y = 2u;
+    if (shader.kernel_name.find("_b1x1") != std::string::npos) {
+      batch_x = 1u;
+      batch_y = 1u;
+    }
+
+    uint32_t scaled_x = utils::div_up(image_extents[0], batch_x);
+    uint32_t scaled_y = utils::div_up(image_extents[1], batch_y);
+    return {scaled_x * scaled_y, image_extents[2], 1};
+  }
+
+  const utils::uvec3 base_extents = graph->create_global_wg_size(out);
+  return {base_extents[0] * base_extents[1] * base_extents[2], 1, 1};
+}
+
+static utils::uvec3 conv2d_dw_local_wg_size_fn(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)graph;
+  (void)shader;
+  (void)global_workgroup_size;
+  (void)args;
+  (void)resize_args;
+  return {64, 1, 1};
+}
+
+static ValueRef prepack_dw_weights(ComputeGraph& graph, const ValueRef vref) {
+  const auto original_sizes = graph.sizes_of(vref);
+
+  int64_t out_channels_padded =
+      utils::align_up_4(utils::val_at(-4, original_sizes));
+  int64_t height = utils::val_at(-2, original_sizes);
+  int64_t width = utils::val_at(-1, original_sizes);
+
+  const std::vector<int64_t> final_sizes = {
+      4, out_channels_padded / 4, height * width};
+
+  ValueRef v = graph.add_tensor(
+      final_sizes,
+      graph.dtype_of(vref),
+      utils::kTexture2D,
+      utils::kChannelsPacked);
+
+  std::string kernel_name = "conv2d_dw_prepack_weights";
+  add_dtype_suffix(kernel_name, graph.dtype_of(v));
+  add_dtype_suffix(kernel_name, graph.get_staging_dtype_for(vref));
+
+  const auto original_sizes_pc =
+      utils::make_ivec4(original_sizes, /*reverse=*/true);
+  graph.prepack_nodes().emplace_back(new PrepackNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      graph.create_global_wg_size(v),
+      graph.create_local_wg_size(v),
+      vref,
+      v,
+      {},
+      {graph.packed_dim_of(v)},
+      {graph.sizes_pc_of(v),
+       PushConstantDataInfo(&original_sizes_pc, sizeof(original_sizes_pc))}));
+
+  return v;
+}
+
+static void conv2d_dw_with_selector(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef weight_data,
+    const ValueRef bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation,
+    const ValueRef out,
+    const std::string& impl_selector) {
+  ValueRef arg_weight = prepack_dw_weights(graph, weight_data);
+  ValueRef arg_bias = prepack_biases(
+      graph,
+      bias,
+      weight_data,
+      /*transposed=*/false,
+      /*storage_type=*/utils::kTexture2D,
+      /*memory_layout=*/utils::kWidthPacked);
+
+  check_conv_args(graph, in, out);
+
+  Kernel2dParams kernel_params = create_kernel2d_params(
+      graph,
+      weight_data,
+      /*kernel_size_only=*/false,
+      stride,
+      padding,
+      dilation);
+
+  const bool stride_equals_dilation =
+      (kernel_params.stride[0] == kernel_params.dilation[0] &&
+       kernel_params.stride[1] == kernel_params.dilation[1]);
+
+  const auto& overlay_region = utils::make_ivec2({
+      kernel_params.kernel_size[0] +
+          (kernel_params.kernel_size[0] - 1) * (kernel_params.dilation[0] - 1),
+      kernel_params.kernel_size[1] +
+          (kernel_params.kernel_size[1] - 1) * (kernel_params.dilation[1] - 1),
+  });
+  const auto weight_sizes = graph.sizes_of(weight_data);
+  const int32_t in_group_size =
+      utils::safe_downcast<int32_t>(utils::align_up_4(weight_sizes.at(1)));
+  Conv2dDWParams extra_params = {overlay_region, in_group_size};
+
+  OutputParams out_params = {
+      std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()};
+
+  std::string kernel_name = pick_conv2d_dw_shader_with_selector(
+      graph,
+      weight_data,
+      out,
+      stride_equals_dilation,
+      /*clamp_out=*/false,
+      impl_selector);
+
+  vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);
+
+  vkapi::ParamsBindList param_buffers;
+  std::vector<PushConstantDataInfo> push_constants;
+
+  const bool uses_output_tile =
+      kernel_name.find("_output_tile") != std::string::npos;
+
+  if (uses_output_tile) {
+    const utils::ivec4 kernel_param_size_stride = {
+        kernel_params.kernel_size[0],
+        kernel_params.kernel_size[1],
+        kernel_params.stride[0],
+        kernel_params.stride[1]};
+
+    const utils::ivec4 kernel_param_pad_dial = {
+        kernel_params.padding[0],
+        kernel_params.padding[1],
+        kernel_params.dilation[0],
+        kernel_params.dilation[1]};
+
+    push_constants = {
+        graph.logical_limits_pc_of(out),
+        graph.sizes_pc_of(in),
+        PushConstantDataInfo(
+            &kernel_param_size_stride, sizeof(kernel_param_size_stride)),
+        PushConstantDataInfo(
+            &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)),
+        PushConstantDataInfo(
+            &extra_params, sizeof(extra_params), sizeof(utils::ivec4)),
+        PushConstantDataInfo(&out_params, sizeof(out_params)),
+    };
+  } else {
+    param_buffers = {
+        graph.logical_limits_ubo(out),
+        graph.sizes_ubo(in),
+        graph.create_params_buffer(kernel_params),
+        graph.create_params_buffer(extra_params),
+        graph.create_params_buffer(out_params),
+    };
+  }
+
+  ValueRef transposed_ref = graph.add_scalar(false);
+  ValueRef output_padding = graph.add_none();
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      shader,
+      conv2d_dw_global_wg_size_fn,
+      conv2d_dw_local_wg_size_fn,
+      {{out, vkapi::kWrite}, {{in, arg_weight, arg_bias}, vkapi::kRead}},
+      param_buffers,
+      push_constants,
+      {},
+      {weight_data, stride, padding, dilation, transposed_ref, output_padding},
+      resize_conv2d_node));
+}
+
+void test_conv2d_dw(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  // args[0] = input [N, C, H, W]
+  // args[1] = weight [C, 1, K_h, K_w] (constant)
+  // args[2] = bias (constant, or none)
+  // args[3] = stride_h (int)
+  // args[4] = stride_w (int)
+  // args[5] = padding_h (int)
+  // args[6] = padding_w (int)
+  // args[7] = dilation_h (int)
+  // args[8] = dilation_w (int)
+  // args[9] = impl_selector (string)
+  // args[10] = output
+  const ValueRef input = args.at(0);
+  const ValueRef weight = args.at(1);
+  const ValueRef bias = args.at(2);
+  const int64_t stride_h = graph.extract_scalar<int64_t>(args.at(3));
+  const int64_t stride_w = graph.extract_scalar<int64_t>(args.at(4));
+  const int64_t padding_h = graph.extract_scalar<int64_t>(args.at(5));
+  const int64_t padding_w = graph.extract_scalar<int64_t>(args.at(6));
+  const int64_t dilation_h = graph.extract_scalar<int64_t>(args.at(7));
+  const int64_t dilation_w = graph.extract_scalar<int64_t>(args.at(8));
+  const std::string impl_selector = graph.extract_string(args.at(9));
+  const ValueRef out = args.at(10);
+
+  ValueRef stride =
+      graph.add_scalar_list<int64_t>(std::vector<int64_t>{stride_h, stride_w});
+  ValueRef padding = graph.add_scalar_list<int64_t>(
+      std::vector<int64_t>{padding_h, padding_w});
+  ValueRef dilation = graph.add_scalar_list<int64_t>(
+      std::vector<int64_t>{dilation_h, dilation_w});
+
+  if (impl_selector.empty()) {
+    // Auto-selection: delegate to aten.convolution.default
+    const int64_t channels = graph.sizes_of(input).at(1);
+    ValueRef transposed = graph.add_scalar<bool>(false);
+    ValueRef output_padding =
+        graph.add_scalar_list<int64_t>(std::vector<int64_t>{0, 0});
+    ValueRef groups = graph.add_scalar<int64_t>(channels);
+
+    VK_GET_OP_FN("aten.convolution.default")
+    (graph,
+     {input,
+      weight,
+      bias,
+      stride,
+      padding,
+      dilation,
+      transposed,
+      output_padding,
+      groups,
+      out});
+  } else {
+    // Forced variant: build the dispatch directly with impl_selector
+    conv2d_dw_with_selector(
+        graph,
+        input,
+        weight,
+        bias,
+        stride,
+        padding,
+        dilation,
+        out,
+        impl_selector);
+  }
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(test_etvk.test_conv2d_dw.default, test_conv2d_dw);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl
index 471b2530910..84432bce30b 100644
--- a/backends/vulkan/test/custom_ops/targets.bzl
+++ b/backends/vulkan/test/custom_ops/targets.bzl
@@ -101,3 +101,4 @@ def define_common_targets(is_fbcode = False):
     define_custom_op_test_binary("test_q8ta_conv2d_transposed")
     define_custom_op_test_binary("test_mm")
     define_custom_op_test_binary("test_conv2d_pw")
+    define_custom_op_test_binary("test_conv2d_dw")
diff --git a/backends/vulkan/test/custom_ops/test_conv2d_dw.cpp b/backends/vulkan/test/custom_ops/test_conv2d_dw.cpp
new file mode 100644
index 00000000000..9c48c320d62
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/test_conv2d_dw.cpp
@@ -0,0 +1,435 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <iostream>
+#include <vector>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+#include "conv2d_utils.h"
+#include "utils.h"
+
+using namespace executorch::vulkan::prototyping;
+using namespace vkcompute;
+
+static constexpr int64_t kRefDimSizeLimit = 64;
+
+struct InputDims {
+  int64_t N;
+  int64_t C;
+  int64_t H;
+  int64_t W;
+
+  InputDims(int64_t n, int64_t c, int64_t h, int64_t w)
+      : N(n), C(c), H(h), W(w) {}
+};
+
+struct Conv2dDwConfig {
+  InputDims dims;
+  KernelSize kernel;
+  Stride stride;
+  Padding padding;
+  Dilation dilation;
+  bool has_bias;
+};
+
+static int64_t calc_out_size(
+    int64_t in_size,
+    int64_t kernel_size,
+    int64_t stride,
+    int64_t padding,
+    int64_t dilation) {
+  return (in_size + 2 * padding - dilation * (kernel_size - 1) - 1) / stride +
+      1;
+}
+
+static TestCase create_conv2d_dw_test_case(
+    const Conv2dDwConfig& config,
+    vkapi::ScalarType dtype,
+    utils::StorageType storage_type,
+    utils::GPUMemoryLayout memory_layout,
+    const std::string& impl_selector = "") {
+  TestCase test_case;
+
+  bool is_perf = config.dims.C > kRefDimSizeLimit ||
+      config.dims.H > kRefDimSizeLimit || config.dims.W > kRefDimSizeLimit;
+
+  std::string prefix = is_perf ? "PERF" : "ACCU";
+  std::string storage_str = storage_type_abbrev(storage_type);
+  std::string layout_str = layout_abbrev(memory_layout);
+  std::string dtype_str = (dtype == vkapi::kHalf) ? "f16" : "f32";
+  std::string bias_str = config.has_bias ? "+bias" : "";
+
+  int64_t H_out = calc_out_size(
+      config.dims.H,
+      config.kernel.h,
+      config.stride.h,
+      config.padding.h,
+      config.dilation.h);
+  int64_t W_out = calc_out_size(
+      config.dims.W,
+      config.kernel.w,
+      config.stride.w,
+      config.padding.w,
+      config.dilation.w);
+
+  std::string shape = "[" + std::to_string(config.dims.N) + "," +
+      std::to_string(config.dims.C) + "," + std::to_string(config.dims.H) +
+      "," + std::to_string(config.dims.W) + "] k" +
+      std::to_string(config.kernel.h) + "x" + std::to_string(config.kernel.w) +
+      " s" + std::to_string(config.stride.h) + " p" +
+      std::to_string(config.padding.h) + " d" +
+      std::to_string(config.dilation.h) + "->[" +
+      std::to_string(config.dims.N) + "," + std::to_string(config.dims.C) +
+      "," + std::to_string(H_out) + "," + std::to_string(W_out) + "]";
+
+  std::string selector_str =
+      impl_selector.empty() ? "" : " [" + impl_selector + "]";
+
+  std::string name = prefix + "  conv2d_dw" + bias_str + " " + shape + "  " +
+      storage_str + "(" + layout_str + ") " + dtype_str + selector_str;
+
+  test_case.set_name(name);
+  test_case.set_operator_name("test_etvk.test_conv2d_dw.default");
+
+  // Input tensor [N, C, H, W]
+  ValueSpec input(
+      {config.dims.N, config.dims.C, config.dims.H, config.dims.W},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::RANDOM);
+
+  // Weight tensor [C, 1, K_h, K_w] - constant
+  ValueSpec weight(
+      {config.dims.C, 1, config.kernel.h, config.kernel.w},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::RANDOM);
+  weight.set_constant(true);
+
+  test_case.add_input_spec(input);
+  test_case.add_input_spec(weight);
+
+  // Bias (or none)
+  if (config.has_bias) {
+    ValueSpec bias(
+        {config.dims.C},
+        dtype,
+        storage_type,
+        memory_layout,
+        DataGenType::RANDOM);
+    bias.set_constant(true);
+    test_case.add_input_spec(bias);
+  } else {
+    ValueSpec none_bias(static_cast<int32_t>(0));
+    none_bias.set_none(true);
+    test_case.add_input_spec(none_bias);
+  }
+
+  // stride_h, stride_w, padding_h, padding_w, dilation_h, dilation_w
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.stride.h)));
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.stride.w)));
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.padding.h)));
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.padding.w)));
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.dilation.h)));
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.dilation.w)));
+
+  // impl_selector string
+  test_case.add_input_spec(ValueSpec::make_string(impl_selector));
+
+  // Output tensor [N, C, H_out, W_out]
+  ValueSpec output(
+      {config.dims.N, config.dims.C, H_out, W_out},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::ZEROS);
+  test_case.add_output_spec(output);
+
+  if (dtype == vkapi::kHalf) {
+    test_case.set_abs_tolerance(1e-1f);
+    test_case.set_rel_tolerance(1e-2f);
+  } else {
+    test_case.set_abs_tolerance(1e-3f);
+    test_case.set_rel_tolerance(1e-3f);
+  }
+
+  test_case.set_shader_filter({"nchw_to", "to_nchw", "view_copy"});
+
+  return test_case;
+}
+
+// Reference implementation for depthwise conv2d
+static void conv2d_dw_reference_impl(TestCase& test_case) {
+  const ValueSpec& input = test_case.inputs()[0];
+  const ValueSpec& weight = test_case.inputs()[1];
+  const ValueSpec& bias_spec = test_case.inputs()[2];
+  ValueSpec& output = test_case.outputs()[0];
+
+  if (input.dtype != vkapi::kFloat) {
+    throw std::invalid_argument("Reference only supports float");
+  }
+
+  auto input_sizes = input.get_tensor_sizes();
+  auto weight_sizes = weight.get_tensor_sizes();
+  auto output_sizes = output.get_tensor_sizes();
+
+  int64_t N = input_sizes[0];
+  int64_t C = input_sizes[1];
+  int64_t H_in = input_sizes[2];
+  int64_t W_in = input_sizes[3];
+  int64_t K_h = weight_sizes[2];
+  int64_t K_w = weight_sizes[3];
+  int64_t H_out = output_sizes[2];
+  int64_t W_out = output_sizes[3];
+
+  int64_t stride_h = test_case.inputs()[3].get_int_value();
+  int64_t stride_w = test_case.inputs()[4].get_int_value();
+  int64_t padding_h = test_case.inputs()[5].get_int_value();
+  int64_t padding_w = test_case.inputs()[6].get_int_value();
+  int64_t dilation_h = test_case.inputs()[7].get_int_value();
+  int64_t dilation_w = test_case.inputs()[8].get_int_value();
+
+  auto& input_data = input.get_float_data();
+  auto& weight_data = weight.get_float_data();
+  auto& ref_data = output.get_ref_float_data();
+  ref_data.resize(N * C * H_out * W_out, 0.0f);
+
+  for (int64_t n = 0; n < N; ++n) {
+    for (int64_t c = 0; c < C; ++c) {
+      for (int64_t oh = 0; oh < H_out; ++oh) {
+        for (int64_t ow = 0; ow < W_out; ++ow) {
+          float sum = 0.0f;
+          for (int64_t kh = 0; kh < K_h; ++kh) {
+            for (int64_t kw = 0; kw < K_w; ++kw) {
+              int64_t ih = oh * stride_h - padding_h + kh * dilation_h;
+              int64_t iw = ow * stride_w - padding_w + kw * dilation_w;
+              if (ih >= 0 && ih < H_in && iw >= 0 && iw < W_in) {
+                float in_val = input_data
+                    [n * (C * H_in * W_in) + c * (H_in * W_in) + ih * W_in +
+                     iw];
+                // weight is [C, 1, K_h, K_w]
+                float w_val = weight_data[c * (K_h * K_w) + kh * K_w + kw];
+                sum += in_val * w_val;
+              }
+            }
+          }
+          if (!bias_spec.is_none()) {
+            auto& bias_data = bias_spec.get_float_data();
+            sum += bias_data[c];
+          }
+          ref_data
+              [n * (C * H_out * W_out) + c * (H_out * W_out) + oh * W_out +
+               ow] = sum;
+        }
+      }
+    }
+  }
+}
+
+static std::vector<TestCase> generate_conv2d_dw_test_cases() {
+  std::vector<TestCase> test_cases;
+
+  std::vector<utils::StorageType> storage_types = {utils::kTexture3D};
+  utils::GPUMemoryLayout layout = utils::kChannelsPacked;
+
+  // Accuracy shapes (small enough for float reference validation)
+  std::vector<Conv2dDwConfig> accuracy_configs = {
+      {InputDims(1, 8, 16, 16),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 8, 16, 16),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       true},
+      {InputDims(1, 8, 16, 16),
+       KernelSize(5, 5),
+       Stride(1, 1),
+       Padding(2, 2),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 8, 16, 16),
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      // Non-multiple-of-4 channels
+      {InputDims(1, 11, 16, 16),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 3, 16, 16),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+  };
+
+  // EdgeTAM depthwise shapes (from profiling data)
+  std::vector<Conv2dDwConfig> perf_configs = {
+      // Backbone stem and early stages
+      {InputDims(1, 24, 512, 512),
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 48, 256, 256),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 48, 256, 256),
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 96, 128, 128),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 96, 128, 128),
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 192, 64, 64),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 192, 64, 64),
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 384, 32, 32),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      // 5x5 kernels
+      {InputDims(1, 48, 256, 256),
+       KernelSize(5, 5),
+       Stride(1, 1),
+       Padding(2, 2),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 96, 128, 128),
+       KernelSize(5, 5),
+       Stride(1, 1),
+       Padding(2, 2),
+       Dilation(1, 1),
+       false},
+      // FPN/Neck
+      {InputDims(1, 256, 256, 256),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 256, 128, 128),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+  };
+
+  // Generate accuracy test cases (float only)
+  for (const auto& config : accuracy_configs) {
+    for (auto st : storage_types) {
+      test_cases.push_back(
+          create_conv2d_dw_test_case(config, vkapi::kFloat, st, layout));
+    }
+  }
+
+  // Generate performance test cases (float and half)
+  for (const auto& config : perf_configs) {
+    std::vector<vkapi::ScalarType> dtypes = {vkapi::kFloat, vkapi::kHalf};
+    for (auto dtype : dtypes) {
+      for (auto st : storage_types) {
+        // Auto-selection (empty impl_selector)
+        test_cases.push_back(
+            create_conv2d_dw_test_case(config, dtype, st, layout));
+
+        // Force b4x2 variant
+        test_cases.push_back(
+            create_conv2d_dw_test_case(config, dtype, st, layout, "b4x2"));
+
+        // Force b1x1 variant (only for 3x3 kernels; for 5x5 it falls back
+        // to default, but we still generate it to test the fallback path)
+        test_cases.push_back(
+            create_conv2d_dw_test_case(config, dtype, st, layout, "b1x1"));
+      }
+    }
+  }
+
+  return test_cases;
+}
+
+static int64_t conv2d_dw_flop_calculator(const TestCase& test_case) {
+  auto input_sizes = test_case.inputs()[0].get_tensor_sizes();
+  auto weight_sizes = test_case.inputs()[1].get_tensor_sizes();
+  auto output_sizes = test_case.outputs()[0].get_tensor_sizes();
+
+  int64_t N = output_sizes[0];
+  int64_t C = output_sizes[1];
+  int64_t H_out = output_sizes[2];
+  int64_t W_out = output_sizes[3];
+  int64_t K_h = weight_sizes[2];
+  int64_t K_w = weight_sizes[3];
+
+  // Each output element: K_h * K_w multiplies + (K_h * K_w - 1) adds
+  return 2 * N * C * H_out * W_out * K_h * K_w;
+}
+
+static void reference_impl(TestCase& test_case) {
+  conv2d_dw_reference_impl(test_case);
+}
+
+int main(int argc, char* argv[]) {
+  set_debugging(false);
+  set_print_output(false);
+  set_print_latencies(false);
+  set_use_gpu_timestamps(true);
+
+  print_performance_header();
+  std::cout << "Depthwise Conv2d Benchmark" << std::endl;
+  print_separator();
+
+  ReferenceComputeFunc ref_fn = reference_impl;
+
+  auto results = execute_test_cases(
+      generate_conv2d_dw_test_cases,
+      conv2d_dw_flop_calculator,
+      "Conv2dDW",
+      3,
+      10,
+      ref_fn);
+
+  return 0;
+}