From 33a2016a113edcfb979ba999a219085e97ba3f9c Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm26340.ftw0.facebook.com>
Date: Wed, 18 Mar 2026 07:37:25 -0700
Subject: [PATCH 1/3] [ET-VK] Fix staging buffer allocation to check all memory
 types for HOST_CACHED

Pull Request resolved: https://github.com/pytorch/executorch/pull/18291

`test_host_cached_available()` only checked `memoryTypes[0]` to determine if
HOST_CACHED memory was available. On Pixel devices, `memoryTypes[0]` is
DEVICE_LOCAL without HOST_CACHED, so the function incorrectly returned
`SEQUENTIAL_WRITE_BIT`. This caused DEVICE_TO_HOST staging buffers to be
allocated in write-combining (uncached) memory, making CPU reads during
COPY_OUTPUTS ~170x slower than necessary (~40ms vs ~237us on S24).

The fix iterates over all memory types to correctly detect HOST_CACHED support.

On-device profiling of edgetam_first_frame_fp16_vulkan.pte confirms the fix:
- Pixel 8 Pro COPY_OUTPUTS: 40ms -> 6.3ms (-84%)
- Pixel 9 Pro XL COPY_OUTPUTS: 40ms -> 2.5ms (-94%)
- Pixel 8 Pro Method::execute: 492ms -> 464ms (-5.7%)
- Pixel 9 Pro XL Method::execute: 445ms -> 411ms (-7.6%)
ghstack-source-id: 353941146
@exported-using-ghexport

Differential Revision: [D97058156](https://our.internmc.facebook.com/intern/diff/D97058156/)
---
 backends/vulkan/runtime/vk_api/memory/Allocator.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp
index f5abf3e6d0c..f36b2b0c09e 100644
--- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp
+++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp
@@ -16,13 +16,12 @@ VmaAllocationCreateFlags test_host_cached_available(
   VkPhysicalDeviceMemoryProperties mem_props;
   vkGetPhysicalDeviceMemoryProperties(physical_device, &mem_props);
 
-  VkMemoryPropertyFlags const flags = mem_props.memoryTypes->propertyFlags;
-
-  bool const host_visible = flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
-  bool const host_cached = flags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
-
-  if (host_visible && host_cached) {
-    return VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
+  for (uint32_t i = 0; i < mem_props.memoryTypeCount; i++) {
+    VkMemoryPropertyFlags flags = mem_props.memoryTypes[i].propertyFlags;
+    if ((flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) &&
+        (flags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT)) {
+      return VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
+    }
   }
 
   return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;

From 4fc8516ecf7dc53fde4c3a4972e47c19f8203082 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm26340.ftw0.facebook.com>
Date: Wed, 18 Mar 2026 07:37:27 -0700
Subject: [PATCH 2/3] [ET-VK][conv2d] Re-implement pointwise conv2d with tiled
 compute and blocked weight packing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pull Request resolved: https://github.com/pytorch/executorch/pull/18292

Profiling EdgeTAM on Adreno shows pointwise 1×1 convolutions are a dominant
bottleneck. This diff re-implements the stride=1, padding=0 pointwise path
using the same tiled matmul approach as the recently landed linear shader
rewrite.

The new `conv2d_pw_tiled` shader reuses the shared linear tiled infrastructure
(FPInputTile, FPWeightTile, FPOutTile, fp_accumulate_with_fp_weight, packed
weight tile loading) with custom input/output tile load/store functions that
map flat spatial indices to channels-packed texture3d coordinates.

Weight packing uses the same 4OC×4IC blocked format as linear via the
`pack_fp_linear_weight` shader. Dispatch uses DynamicDispatchNode for correct
workgroup size updates during graph resizing.

Only the stride=1, padding=0 pointwise path is changed; the general conv2d_pw
shader for arbitrary stride/padding is left unchanged.

EdgeTAM first frame on Samsung S25 (Adreno 830): 208 ms → 196 ms (~6%).

Authored with Claude.
ghstack-source-id: 353941147
@exported-using-ghexport

Differential Revision: [D96756792](https://our.internmc.facebook.com/intern/diff/D96756792/)
---
 .../graph/ops/glsl/conv2d_pw_tiled.glsl       | 158 +++++++++++
 .../graph/ops/glsl/conv2d_pw_tiled.yaml       |  20 ++
 .../runtime/graph/ops/impl/Conv2dPW.cpp       | 255 +++++++++++++++++
 .../runtime/graph/ops/impl/Convolution.cpp    |  18 ++
 .../runtime/graph/ops/impl/Convolution.h      |  41 +++
 .../test/custom_ops/impl/TestConv2dPw.cpp     |  61 ++++
 backends/vulkan/test/custom_ops/targets.bzl   |   1 +
 .../vulkan/test/custom_ops/test_conv2d_pw.cpp | 268 ++++++++++++++++++
 8 files changed, 822 insertions(+)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.yaml
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Conv2dPW.cpp
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Convolution.h
 create mode 100644 backends/vulkan/test/custom_ops/impl/TestConv2dPw.cpp
 create mode 100644 backends/vulkan/test/custom_ops/test_conv2d_pw.cpp

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.glsl
new file mode 100644
index 00000000000..aee920bd84a
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.glsl
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}
+
+#define TILE_M4 ${TILE_M4}
+#define TILE_K4 ${TILE_K4}
+#define TILE_N4 ${TILE_N4}
+
+#define TILE_M ${TILE_M}
+#define TILE_K ${TILE_K4 * 4}
+#define TILE_N ${TILE_N4 * 4}
+
+${define_required_extensions(STORAGE, DTYPE)}
+
+layout(std430) buffer;
+
+#include "common.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "t_weight_packed", DTYPE, "texture2d")}
+${layout_declare_tensor(B, "r", "t_bias", DTYPE, "texture2d")}
+
+${layout_declare_ubo(B, "ivec4", "in_sizes")}
+${layout_declare_ubo(B, "ivec4", "out_sizes")}
+
+layout(push_constant) uniform restrict Block {
+  int stride_h;
+  int stride_w;
+  int padding_h;
+  int padding_w;
+  float out_min;
+  float out_max;
+};
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "stride_1_padding_0", "0")}
+${layout_declare_spec_const(C, "int", "activation_type", "0")}
+
+#include "linear_fp_input_tile.glslh"
+#include "linear_fp_packed_weight_tile_load.glslh"
+#include "linear_fp_output_tile_fp_compute.glslh"
+
+void load_input_tile_with_checks(
+    out FPInputTile tile,
+    const int k4_start,
+    const int m_start,
+    const int K4,
+    const int M,
+    const int W_out,
+    const int W_in,
+    const int H_in) {
+  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+    [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) {
+      if (k4_start + k4 < K4 && m_start + m < M) {
+        if (stride_1_padding_0 != 0) {
+          const int spatial = m_start + m;
+          tile.data[m][k4] =
+              texelFetch(t_in, ivec3(spatial % W_out, spatial / W_out, k4_start + k4), 0);
+        } else {
+          const int out_spatial = m_start + m;
+          const int out_x = out_spatial % W_out;
+          const int out_y = out_spatial / W_out;
+          const int in_x = out_x * stride_w - padding_w;
+          const int in_y = out_y * stride_h - padding_h;
+          if (in_x >= 0 && in_x < W_in && in_y >= 0 && in_y < H_in) {
+            tile.data[m][k4] =
+                texelFetch(t_in, ivec3(in_x, in_y, k4_start + k4), 0);
+          } else {
+            tile.data[m][k4] = VEC4_T(0.0);
+          }
+        }
+      } else {
+        tile.data[m][k4] = VEC4_T(0.0);
+      }
+    }
+  }
+}
+
+void store_output_tile_with_checks(
+    const FPOutTile out_tile,
+    const int n4_start,
+    const int m_start,
+    const int N4,
+    const int M,
+    const int W_out) {
+  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+    [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) {
+      if (m_start + m < M && n4_start + n4 < N4) {
+        const int spatial = m_start + m;
+        VEC4_T texel = out_tile.data[m][n4];
+        if (activation_type == 1) {
+          texel = max(texel, VEC4_T(0.0));
+        } else if (activation_type == 2) {
+          texel = clamp(texel, VEC4_T(out_min), VEC4_T(out_max));
+        }
+        imageStore(t_out, ivec3(spatial % W_out, spatial / W_out, n4_start + n4), texel);
+      }
+    }
+  }
+}
+
+void main() {
+  const int tile_idx_n = int(gl_GlobalInvocationID.x);
+  const int tile_idx_m = int(gl_GlobalInvocationID.y);
+
+  const int n4_start = tile_idx_n * TILE_N4;
+  const int m_start = tile_idx_m * TILE_M;
+
+  const int W_in = in_sizes.x;
+  const int H_in = in_sizes.y;
+  const int K = in_sizes.z;
+  const int K4 = div_up_4(K);
+
+  const int W_out = out_sizes.x;
+  const int H_out = out_sizes.y;
+  const int M = W_out * H_out;
+  const int N = out_sizes.z;
+  const int N4 = div_up_4(N);
+
+  if (n4_start >= N4 || m_start >= M) {
+    return;
+  }
+
+  FPOutTile out_tile;
+  initialize(out_tile);
+
+  FPInputTile in_tile;
+  FPWeightTile w_tile;
+
+  for (int k4 = 0; k4 < K4; k4++) {
+    load_input_tile_with_checks(in_tile, k4, m_start, K4, M, W_out, W_in, H_in);
+    load_packed_weight_tile_with_checks(w_tile, n4_start, k4, 0, N4, K4);
+    fp_accumulate_with_fp_weight(out_tile, in_tile, w_tile);
+  }
+
+  // Apply bias
+  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+    [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) {
+      if (n4_start + n4 < N4) {
+        out_tile.data[m][n4] +=
+            texelFetch(t_bias, ivec2(n4_start + n4, 0), 0);
+      }
+    }
+  }
+
+  store_output_tile_with_checks(out_tile, n4_start, m_start, N4, M, W_out);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.yaml
new file mode 100644
index 00000000000..037dfc35c89
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.yaml
@@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+conv2d_pw_tiled:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: texture3d
+    TILE_M4: 1
+    TILE_K4: 1
+    TILE_N4: 1
+    TILE_M: 4
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: float
+      - VALUE: half
+  shader_variants:
+    - NAME: conv2d_pw_tiled
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2dPW.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv2dPW.cpp
new file mode 100644
index 00000000000..2863d80aa0e
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Conv2dPW.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Convolution.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+//
+// Shader dispatch utilities
+//
+
+void resize_conv2d_pw_tiled_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef self = args.at(1).refs.at(0);
+
+  std::vector<int64_t> self_sizes = graph->sizes_of(self);
+  TensorRefPtr weight_ref = graph->get_tref(extra_args.at(0));
+  const auto& weight_sizes = weight_ref->sizes;
+
+  const auto stride_list = graph->get_int_list(extra_args.at(1));
+  const auto padding_list = graph->get_int_list(extra_args.at(2));
+
+  const int64_t stride_h = stride_list->at(0);
+  const int64_t stride_w = stride_list->at(1);
+  const int64_t padding_h = padding_list->at(0);
+  const int64_t padding_w = padding_list->at(1);
+
+  const int64_t in_h = self_sizes.at(self_sizes.size() - 2);
+  const int64_t in_w = self_sizes.at(self_sizes.size() - 1);
+
+  // For 1x1 kernel with dilation=1: out = (in + 2*padding - 1) / stride + 1
+  const int64_t out_h = (in_h + 2 * padding_h - 1) / stride_h + 1;
+  const int64_t out_w = (in_w + 2 * padding_w - 1) / stride_w + 1;
+
+  std::vector<int64_t> new_out_sizes = self_sizes;
+  new_out_sizes.at(self_sizes.size() - 3) = weight_sizes.at(0);
+  new_out_sizes.at(self_sizes.size() - 2) = out_h;
+  new_out_sizes.at(self_sizes.size() - 1) = out_w;
+
+  graph->virtual_resize(out, new_out_sizes);
+}
+
+vkapi::ShaderInfo pick_conv2d_pw_tiled_shader(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
+  const ValueRef out = args.at(0).refs.at(0);
+
+  std::string kernel_name = "conv2d_pw_tiled";
+  kernel_name.reserve(kShaderNameReserve);
+  add_dtype_suffix(kernel_name, graph->dtype_of(out));
+  return VK_KERNEL_FROM_STR(kernel_name);
+}
+
+utils::uvec3 pick_conv2d_pw_tiled_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  (void)resize_args;
+  const ValueRef out = args.at(0).refs.at(0);
+  uint32_t W = graph->size_at<uint32_t>(-1, out);
+  uint32_t H = graph->size_at<uint32_t>(-2, out);
+  uint32_t C_out = graph->size_at<uint32_t>(-3, out);
+  uint32_t M = H * W;
+  uint32_t N4 = utils::div_up_4(C_out);
+  // TILE_N4=1, TILE_M=4
+  return {N4, utils::div_up(M, 4u), 1};
+}
+
+//
+// Prepack nodes
+//
+
+struct PackParams {
+  int32_t N;
+  int32_t K;
+  int32_t B;
+  int32_t is_transposed;
+};
+
+ValueRef prepack_conv2d_pw_weight(
+    ComputeGraph& graph,
+    const ValueRef weight_data) {
+  const std::vector<int64_t> weight_sizes = graph.sizes_of(weight_data);
+  const int64_t N = weight_sizes.at(0); // C_out
+  const int64_t K = weight_sizes.at(1); // C_in
+  const int64_t N4 = utils::div_up(N, int64_t(4));
+  const int64_t K4 = utils::div_up(K, int64_t(4));
+
+  const int64_t output_height = K4;
+  const int64_t output_width = N4 * 4 * 4;
+
+  utils::StorageType weight_storage = utils::kTexture2D;
+  uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim();
+  if (output_width / 4 > max_extent ||
+      static_cast<uint32_t>(output_height) > max_extent) {
+    weight_storage = utils::kBuffer;
+  }
+
+  ValueRef packed_weight = graph.add_tensor(
+      {output_height, output_width},
+      graph.dtype_of(weight_data),
+      weight_storage,
+      utils::kWidthPacked);
+
+  utils::uvec3 global_wg_size = {
+      utils::safe_downcast<uint32_t>(N4),
+      utils::safe_downcast<uint32_t>(K4),
+      1u};
+
+  PackParams pack_params{
+      utils::safe_downcast<int32_t>(N), utils::safe_downcast<int32_t>(K), 1, 1};
+
+  std::string pack_kernel_name = "pack_fp_linear_weight";
+  add_storage_type_suffix(pack_kernel_name, weight_storage);
+  add_dtype_suffix(pack_kernel_name, graph.dtype_of(weight_data));
+
+  graph.prepack_nodes().emplace_back(new PrepackNode(
+      graph,
+      VK_KERNEL_FROM_STR(pack_kernel_name),
+      global_wg_size,
+      graph.create_local_wg_size(global_wg_size),
+      weight_data,
+      packed_weight,
+      {},
+      {},
+      {PushConstantDataInfo(&pack_params, sizeof(PackParams))}));
+
+  return packed_weight;
+}
+
+//
+// Dispatch nodes
+//
+
+void add_conv2d_pw_tiled_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef packed_weight,
+    const ValueRef packed_bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef out,
+    const ValueRef weight_data,
+    const bool clamp_out,
+    const float out_min_val,
+    const float out_max_val) {
+  int32_t stride_h, stride_w, padding_h, padding_w;
+  {
+    const auto stride_list = graph.get_int_list(stride);
+    const auto padding_list = graph.get_int_list(padding);
+    stride_h = utils::safe_downcast<int32_t>(stride_list->at(0));
+    stride_w = utils::safe_downcast<int32_t>(stride_list->at(1));
+    padding_h = utils::safe_downcast<int32_t>(padding_list->at(0));
+    padding_w = utils::safe_downcast<int32_t>(padding_list->at(1));
+  }
+
+  bool s1p0 =
+      stride_h == 1 && stride_w == 1 && padding_h == 0 && padding_w == 0;
+
+  utils::ivec4 stride_padding{stride_h, stride_w, padding_h, padding_w};
+
+  struct ClampParams final {
+    float out_min;
+    float out_max;
+  };
+  ClampParams clamp_params{out_min_val, out_max_val};
+
+  ValueRef clamp_out_ref = graph.add_scalar(clamp_out);
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      pick_conv2d_pw_tiled_shader,
+      pick_conv2d_pw_tiled_global_wg_size,
+      pick_hw_square_wg_size,
+      // Inputs and Outputs
+      {{out, vkapi::kWrite}, {{in, packed_weight, packed_bias}, vkapi::kRead}},
+      // Shader params buffers
+      {graph.sizes_ubo(in), graph.sizes_ubo(out)},
+      // Push Constants
+      {PushConstantDataInfo(&stride_padding, sizeof(stride_padding)),
+       PushConstantDataInfo(&clamp_params, sizeof(clamp_params))},
+      // Specialization Constants
+      // activation_type: 0=none, 1=relu, 2=clamp
+      {s1p0 ? 1 : 0, clamp_out ? 2 : 0},
+      // Resize Args
+      {weight_data, stride, padding, clamp_out_ref},
+      // Resizing Logic
+      resize_conv2d_pw_tiled_node));
+}
+
+//
+// High level operator impl
+//
+
+void conv2d_pw_impl(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef weight_data,
+    const ValueRef bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef out,
+    const bool transposed_val,
+    const bool clamp_out,
+    const float out_min_val,
+    const float out_max_val) {
+  ValueRef packed_weight = prepack_conv2d_pw_weight(graph, weight_data);
+
+  ValueRef packed_bias = prepack_biases(
+      graph,
+      bias,
+      weight_data,
+      transposed_val,
+      utils::kTexture2D,
+      utils::kWidthPacked);
+
+  check_conv_args(graph, in, out);
+
+  const std::vector<int64_t> in_sizes = graph.sizes_of(in);
+  if (in_sizes.at(0) > 1) {
+    VK_THROW("conv2d: input batch size > 1 is not supported yet!");
+  }
+
+  add_conv2d_pw_tiled_node(
+      graph,
+      in,
+      packed_weight,
+      packed_bias,
+      stride,
+      padding,
+      out,
+      weight_data,
+      clamp_out,
+      out_min_val,
+      out_max_val);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index 2bf3f8f726d..f5dd576ba54 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -6,6 +6,8 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Convolution.h>
+
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
@@ -481,6 +483,22 @@ void add_conv2d_node(
   const Conv2dMethod method =
       get_conv2d_method(graph, weight_data, groups_val, transposed_val);
 
+  // Use tiled path for all pointwise conv2d
+  if (method == Conv2dMethod::Pointwise) {
+    return conv2d_pw_impl(
+        graph,
+        in,
+        weight_data,
+        bias,
+        stride,
+        padding,
+        out,
+        transposed_val,
+        clamp_out,
+        out_min_val,
+        out_max_val);
+  }
+
   ValueRef arg_weight = prepack_weights(graph, weight_data, method);
   ValueRef arg_bias = prepack_biases(
       graph,
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.h b/backends/vulkan/runtime/graph/ops/impl/Convolution.h
new file mode 100644
index 00000000000..f1768a89875
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+
+namespace vkcompute {
+
+ValueRef prepack_biases(
+    ComputeGraph& graph,
+    const ValueRef vref,
+    const ValueRef weight,
+    const bool transposed,
+    const utils::StorageType storage_type,
+    const utils::GPUMemoryLayout memory_layout);
+
+void check_conv_args(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef out);
+
+void conv2d_pw_impl(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef weight_data,
+    const ValueRef bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef out,
+    const bool transposed_val,
+    const bool clamp_out,
+    const float out_min_val,
+    const float out_max_val);
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/impl/TestConv2dPw.cpp b/backends/vulkan/test/custom_ops/impl/TestConv2dPw.cpp
new file mode 100644
index 00000000000..7c640d7a1ac
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/impl/TestConv2dPw.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+
+namespace vkcompute {
+
+void test_conv2d_pw(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  // args[0] = input [N, C_in, H, W]
+  // args[1] = weight [C_out, C_in, 1, 1] (constant)
+  // args[2] = bias (constant, or none)
+  // args[3] = impl_selector (string)
+  // args[4] = output [N, C_out, H, W]
+  const ValueRef input = args.at(0);
+  const ValueRef weight = args.at(1);
+  const ValueRef bias = args.at(2);
+  const ValueRef impl_selector_str = args.at(3);
+  const ValueRef out = args.at(4);
+
+  std::string impl_selector = graph.extract_string(impl_selector_str);
+  (void)impl_selector; // Reserved for future use
+
+  // Create fixed pointwise conv parameters
+  ValueRef stride = graph.add_scalar_list<int64_t>(std::vector<int64_t>{1, 1});
+  ValueRef padding = graph.add_scalar_list<int64_t>(std::vector<int64_t>{0, 0});
+  ValueRef dilation =
+      graph.add_scalar_list<int64_t>(std::vector<int64_t>{1, 1});
+  ValueRef transposed = graph.add_scalar<bool>(false);
+  ValueRef output_padding =
+      graph.add_scalar_list<int64_t>(std::vector<int64_t>{0, 0});
+  ValueRef groups = graph.add_scalar<int64_t>(1);
+
+  // Call aten.convolution.default with all 10 args:
+  // input, weight, bias, stride, padding, dilation, transposed,
+  // output_padding, groups, output
+  VK_GET_OP_FN("aten.convolution.default")
+  (graph,
+   {input,
+    weight,
+    bias,
+    stride,
+    padding,
+    dilation,
+    transposed,
+    output_padding,
+    groups,
+    out});
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(test_etvk.test_conv2d_pw.default, test_conv2d_pw);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl
index fef8994718f..471b2530910 100644
--- a/backends/vulkan/test/custom_ops/targets.bzl
+++ b/backends/vulkan/test/custom_ops/targets.bzl
@@ -100,3 +100,4 @@ def define_common_targets(is_fbcode = False):
     define_custom_op_test_binary("test_q8ta_linear")
     define_custom_op_test_binary("test_q8ta_conv2d_transposed")
     define_custom_op_test_binary("test_mm")
+    define_custom_op_test_binary("test_conv2d_pw")
diff --git a/backends/vulkan/test/custom_ops/test_conv2d_pw.cpp b/backends/vulkan/test/custom_ops/test_conv2d_pw.cpp
new file mode 100644
index 00000000000..1e8eec2a6c2
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/test_conv2d_pw.cpp
@@ -0,0 +1,268 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <iostream>
+#include <vector>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+#include "utils.h"
+
+using namespace executorch::vulkan::prototyping;
+using namespace vkcompute;
+
+static constexpr int64_t kRefDimSizeLimit = 64;
+
+struct Conv2dPwConfig {
+  int64_t N;
+  int64_t C_in;
+  int64_t C_out;
+  int64_t H;
+  int64_t W;
+  bool has_bias;
+};
+
+static TestCase create_conv2d_pw_test_case(
+    const Conv2dPwConfig& config,
+    vkapi::ScalarType dtype,
+    utils::StorageType storage_type,
+    utils::GPUMemoryLayout memory_layout) {
+  TestCase test_case;
+
+  bool is_perf = config.C_in > kRefDimSizeLimit ||
+      config.C_out > kRefDimSizeLimit || config.H > kRefDimSizeLimit ||
+      config.W > kRefDimSizeLimit;
+
+  std::string prefix = is_perf ? "PERF" : "ACCU";
+  std::string storage_str = storage_type_abbrev(storage_type);
+  std::string layout_str = layout_abbrev(memory_layout);
+  std::string dtype_str = (dtype == vkapi::kHalf) ? "f16" : "f32";
+  std::string bias_str = config.has_bias ? "+bias" : "";
+
+  std::string shape = "[" + std::to_string(config.N) + "," +
+      std::to_string(config.C_in) + "," + std::to_string(config.H) + "," +
+      std::to_string(config.W) + "]->[" + std::to_string(config.N) + "," +
+      std::to_string(config.C_out) + "," + std::to_string(config.H) + "," +
+      std::to_string(config.W) + "]";
+
+  std::string name = prefix + "  conv2d_pw" + bias_str + " " + shape + "  " +
+      storage_str + "(" + layout_str + ") " + dtype_str;
+
+  test_case.set_name(name);
+  test_case.set_operator_name("test_etvk.test_conv2d_pw.default");
+
+  // Input tensor [N, C_in, H, W]
+  ValueSpec input(
+      {config.N, config.C_in, config.H, config.W},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::RANDOM);
+
+  // Weight tensor [C_out, C_in, 1, 1] - constant
+  ValueSpec weight(
+      {config.C_out, config.C_in, 1, 1},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::RANDOM);
+  weight.set_constant(true);
+
+  test_case.add_input_spec(input);
+  test_case.add_input_spec(weight);
+
+  // Bias (or none)
+  if (config.has_bias) {
+    ValueSpec bias(
+        {config.C_out},
+        dtype,
+        storage_type,
+        memory_layout,
+        DataGenType::RANDOM);
+    bias.set_constant(true);
+    test_case.add_input_spec(bias);
+  } else {
+    ValueSpec none_bias(static_cast<int32_t>(0));
+    none_bias.set_none(true);
+    test_case.add_input_spec(none_bias);
+  }
+
+  // impl_selector
+  ValueSpec impl_selector_spec = ValueSpec::make_string("default");
+  test_case.add_input_spec(impl_selector_spec);
+
+  // Output tensor [N, C_out, H, W]
+  ValueSpec output(
+      {config.N, config.C_out, config.H, config.W},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::ZEROS);
+  test_case.add_output_spec(output);
+
+  if (dtype == vkapi::kHalf) {
+    test_case.set_abs_tolerance(1e-1f);
+    test_case.set_rel_tolerance(1e-2f);
+  } else {
+    test_case.set_abs_tolerance(1e-3f);
+    test_case.set_rel_tolerance(1e-3f);
+  }
+
+  test_case.set_shader_filter({"nchw_to", "to_nchw", "view_copy"});
+
+  return test_case;
+}
+
+// Reference implementation: pointwise conv2d is essentially a matmul
+// output[n][c_out][h][w] = bias[c_out] +
+//   sum_over_c_in(input[n][c_in][h][w] * weight[c_out][c_in][0][0])
+static void conv2d_pw_reference_impl(TestCase& test_case) {
+  // input[0], weight[1], bias[2], impl_selector[3]
+  const ValueSpec& input = test_case.inputs()[0];
+  const ValueSpec& weight = test_case.inputs()[1];
+  const ValueSpec& bias_spec = test_case.inputs()[2];
+  ValueSpec& output = test_case.outputs()[0];
+
+  if (input.dtype != vkapi::kFloat) {
+    throw std::invalid_argument("Reference only supports float");
+  }
+
+  auto input_sizes = input.get_tensor_sizes();
+  auto weight_sizes = weight.get_tensor_sizes();
+
+  int64_t N = input_sizes[0];
+  int64_t C_in = input_sizes[1];
+  int64_t H = input_sizes[2];
+  int64_t W = input_sizes[3];
+  int64_t C_out = weight_sizes[0];
+
+  auto& input_data = input.get_float_data();
+  auto& weight_data = weight.get_float_data();
+  auto& ref_data = output.get_ref_float_data();
+  ref_data.resize(N * C_out * H * W, 0.0f);
+
+  for (int64_t n = 0; n < N; ++n) {
+    for (int64_t co = 0; co < C_out; ++co) {
+      for (int64_t h = 0; h < H; ++h) {
+        for (int64_t w = 0; w < W; ++w) {
+          float sum = 0.0f;
+          for (int64_t ci = 0; ci < C_in; ++ci) {
+            float in_val =
+                input_data[n * (C_in * H * W) + ci * (H * W) + h * W + w];
+            // weight is [C_out, C_in, 1, 1]
+            float w_val = weight_data[co * C_in + ci];
+            sum += in_val * w_val;
+          }
+          if (!bias_spec.is_none()) {
+            auto& bias_data = bias_spec.get_float_data();
+            sum += bias_data[co];
+          }
+          ref_data[n * (C_out * H * W) + co * (H * W) + h * W + w] = sum;
+        }
+      }
+    }
+  }
+}
+
+static std::vector<TestCase> generate_conv2d_pw_test_cases() {
+  std::vector<TestCase> test_cases;
+
+  // Conv2d shaders are texture-only and require channels-packed layout
+  std::vector<utils::StorageType> storage_types = {utils::kTexture3D};
+  utils::GPUMemoryLayout layout = utils::kChannelsPacked;
+
+  // Accuracy shapes (small enough for float reference validation)
+  std::vector<Conv2dPwConfig> accuracy_configs = {
+      {1, 16, 32, 8, 8, false},
+      {1, 32, 16, 8, 8, false},
+      {1, 16, 32, 8, 8, true},
+      {1, 48, 96, 16, 16, false},
+      {1, 96, 48, 16, 16, false},
+      // Non-multiple-of-4 channels
+      {1, 13, 27, 8, 8, false},
+      {1, 33, 17, 8, 8, false},
+  };
+
+  // EdgeTAM performance shapes
+  std::vector<Conv2dPwConfig> perf_configs = {
+      // EdgeTAM backbone stages
+      {1, 48, 96, 256, 256, false},
+      {1, 96, 48, 256, 256, false},
+      {1, 96, 192, 128, 128, false},
+      {1, 192, 96, 128, 128, false},
+      {1, 192, 384, 64, 64, false},
+      {1, 384, 192, 64, 64, false},
+      {1, 384, 768, 32, 32, false},
+      {1, 768, 384, 32, 32, false},
+      // EdgeTAM FPN/Neck
+      {1, 48, 256, 256, 256, false},
+      {1, 256, 32, 256, 256, false},
+      {1, 96, 256, 128, 128, false},
+      {1, 256, 64, 128, 128, false},
+  };
+
+  // Generate accuracy test cases (float only)
+  for (const auto& config : accuracy_configs) {
+    for (auto st : storage_types) {
+      test_cases.push_back(
+          create_conv2d_pw_test_case(config, vkapi::kFloat, st, layout));
+    }
+  }
+
+  // Generate performance test cases (float and half)
+  for (const auto& config : perf_configs) {
+    std::vector<vkapi::ScalarType> dtypes = {vkapi::kFloat, vkapi::kHalf};
+    for (auto dtype : dtypes) {
+      for (auto st : storage_types) {
+        test_cases.push_back(
+            create_conv2d_pw_test_case(config, dtype, st, layout));
+      }
+    }
+  }
+
+  return test_cases;
+}
+
+static int64_t conv2d_pw_flop_calculator(const TestCase& test_case) {
+  auto input_sizes = test_case.inputs()[0].get_tensor_sizes();
+  auto weight_sizes = test_case.inputs()[1].get_tensor_sizes();
+
+  int64_t N = input_sizes[0];
+  int64_t C_in = input_sizes[1];
+  int64_t H = input_sizes[2];
+  int64_t W = input_sizes[3];
+  int64_t C_out = weight_sizes[0];
+
+  return 2 * N * C_out * C_in * H * W;
+}
+
+static void reference_impl(TestCase& test_case) {
+  conv2d_pw_reference_impl(test_case);
+}
+
+int main(int argc, char* argv[]) {
+  set_debugging(false);
+  set_print_output(false);
+  set_print_latencies(false);
+  set_use_gpu_timestamps(true);
+
+  print_performance_header();
+  std::cout << "Pointwise Conv2d (1x1) Benchmark" << std::endl;
+  print_separator();
+
+  ReferenceComputeFunc ref_fn = reference_impl;
+
+  auto results = execute_test_cases(
+      generate_conv2d_pw_test_cases,
+      conv2d_pw_flop_calculator,
+      "Conv2dPW",
+      3,
+      10,
+      ref_fn);
+
+  return 0;
+}

From 69b3f8fef0600ebafe7231d945f60f37f82188d1 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm26340.ftw0.facebook.com>
Date: Wed, 18 Mar 2026 07:39:50 -0700
Subject: [PATCH 3/3] [ET-VK][conv2d_dw] Extract depthwise dispatch into
 Conv2dDW.cpp with device-based tile selection

Pull Request resolved: https://github.com/pytorch/executorch/pull/18293

Profiling showed depthwise conv2d is 5-15x slower on Mali GPUs vs Adreno due to
register pressure from the 4x2 output tile (17 vec4 registers per thread).
Benchmarking confirmed that reducing the tile to 1x1 (7 vec4 registers) gives
4-15x speedup on Mali with no regression on Adreno.

This change extracts depthwise conv2d dispatch logic from Convolution.cpp into a
new Conv2dDW.cpp (following the Conv2dPW.cpp pattern), and adds device-based
tile size selection: b1x1 on Mali, b4x2 (current default) on Adreno.
ghstack-source-id: 353940602
@exported-using-ghexport

Differential Revision: [D97058158](https://our.internmc.facebook.com/intern/diff/D97058158/)
---
 .../graph/ops/glsl/conv2d_dw_output_tile.yaml |   3 +
 .../runtime/graph/ops/impl/Conv2dDW.cpp       | 327 +++++++++++++
 .../runtime/graph/ops/impl/Convolution.cpp    | 178 ++-----
 .../runtime/graph/ops/impl/Convolution.h      |  18 +
 .../test/custom_ops/impl/TestConv2dDw.cpp     | 348 ++++++++++++++
 backends/vulkan/test/custom_ops/targets.bzl   |   1 +
 .../vulkan/test/custom_ops/test_conv2d_dw.cpp | 435 ++++++++++++++++++
 7 files changed, 1158 insertions(+), 152 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Conv2dDW.cpp
 create mode 100644 backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp
 create mode 100644 backends/vulkan/test/custom_ops/test_conv2d_dw.cpp

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
index 9cf6c22c6ca..87aa86154ee 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
@@ -25,3 +25,6 @@ conv2d_dw_output_tile:
     - NAME: conv2d_dw_output_tile_5x5_clamp
       OPERATOR: clamp(X, A, B)
       TILE_SIZE: 5
+    - NAME: conv2d_dw_output_tile_3x3_b1x1
+      BATCH_SIZE_X: 1
+      BATCH_SIZE_Y: 1
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2dDW.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv2dDW.cpp
new file mode 100644
index 00000000000..a9d8483b2e2
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Conv2dDW.cpp
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Convolution.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
+
+namespace vkcompute {
+
+//
+// Weight prepack
+//
+
+ValueRef prepack_dw_weights(ComputeGraph& graph, const ValueRef vref) {
+  const auto original_sizes = graph.sizes_of(vref);
+
+  int64_t out_channels_padded =
+      utils::align_up_4(utils::val_at(-4, original_sizes));
+  int64_t height = utils::val_at(-2, original_sizes);
+  int64_t width = utils::val_at(-1, original_sizes);
+
+  const std::vector<int64_t> final_sizes = {
+      4, out_channels_padded / 4, height * width};
+
+  ValueRef v = graph.add_tensor(
+      final_sizes,
+      graph.dtype_of(vref),
+      utils::kTexture2D,
+      utils::kChannelsPacked);
+
+  std::string kernel_name = "conv2d_dw_prepack_weights";
+  add_dtype_suffix(kernel_name, graph.dtype_of(v));
+  add_dtype_suffix(kernel_name, graph.get_staging_dtype_for(vref));
+
+  const auto original_sizes_pc =
+      utils::make_ivec4(original_sizes, /*reverse = */ true);
+  graph.prepack_nodes().emplace_back(new PrepackNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      graph.create_global_wg_size(v),
+      graph.create_local_wg_size(v),
+      vref,
+      v,
+      {},
+      // Specialization constants
+      {graph.packed_dim_of(v)},
+      {graph.sizes_pc_of(v),
+       PushConstantDataInfo(&original_sizes_pc, sizeof(original_sizes_pc))}));
+
+  return v;
+}
+
+//
+// Shader selection
+//
+
+std::string pick_conv2d_dw_shader(
+    ComputeGraph& graph,
+    const ValueRef weight_data,
+    const ValueRef out,
+    const bool stride_equals_dilation,
+    const bool clamp_out) {
+  std::string kernel_name = "conv2d_dw";
+  kernel_name.reserve(kShaderNameReserve);
+
+  const auto& weight_sizes = graph.get_tref(weight_data)->sizes;
+  const bool is_3x3 = weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3;
+  const bool is_5x5 = weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5;
+
+  if (!stride_equals_dilation) {
+    kernel_name += "_sned";
+  }
+
+  if (is_3x3) {
+    kernel_name += "_output_tile_3x3";
+    if (stride_equals_dilation && graph.device_is_mali()) {
+      kernel_name += "_b1x1";
+    }
+  } else if (is_5x5) {
+    kernel_name += "_output_tile_5x5";
+  }
+
+  if (clamp_out) {
+    kernel_name += "_clamp";
+  }
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  return kernel_name;
+}
+
+//
+// Workgroup size
+//
+
+utils::uvec3 conv2d_dw_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
+  const ValueRef out = args.at(0).refs.at(0);
+
+  const bool uses_output_tile =
+      shader.kernel_name.find("_output_tile") != std::string::npos;
+
+  if (uses_output_tile) {
+    const bool is_sned = shader.kernel_name.find("_sned") != std::string::npos;
+
+    const utils::uvec3 image_extents = graph->create_global_wg_size(out);
+
+    if (is_sned) {
+      // sned output_tile shaders: no batch division, just flatten W*H
+      return {image_extents[0] * image_extents[1], image_extents[2], 1};
+    }
+
+    // stride==dilation output_tile shaders: apply batch division
+    uint32_t batch_x = 4u;
+    uint32_t batch_y = 2u;
+    if (shader.kernel_name.find("_b1x1") != std::string::npos) {
+      batch_x = 1u;
+      batch_y = 1u;
+    }
+
+    uint32_t scaled_x = utils::div_up(image_extents[0], batch_x);
+    uint32_t scaled_y = utils::div_up(image_extents[1], batch_y);
+    return {scaled_x * scaled_y, image_extents[2], 1};
+  }
+
+  // Base conv2d_dw shader: fully linearized dispatch
+  const utils::uvec3 base_extents = graph->create_global_wg_size(out);
+  return {base_extents[0] * base_extents[1] * base_extents[2], 1, 1};
+}
+
+utils::uvec3 conv2d_dw_local_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)graph;
+  (void)shader;
+  (void)global_workgroup_size;
+  (void)args;
+  (void)resize_args;
+  return {64, 1, 1};
+}
+
+//
+// Dispatch node
+//
+
+struct Conv2dDWParams final {
+  utils::ivec2 overlay_region;
+  int in_group_size;
+};
+
+struct OutputParams final {
+  float out_min;
+  float out_max;
+};
+
+void add_conv2d_dw_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef arg_weight,
+    const ValueRef arg_bias,
+    const ValueRef weight_data,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation,
+    const ValueRef out,
+    const std::string& kernel_name,
+    const Kernel2dParams& kernel_params,
+    const Conv2dDWParams& extra_params,
+    const OutputParams& out_params) {
+  vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);
+
+  vkapi::ParamsBindList param_buffers;
+  std::vector<PushConstantDataInfo> push_constants;
+
+  const bool uses_output_tile =
+      kernel_name.find("_output_tile") != std::string::npos;
+
+  if (uses_output_tile) {
+    const utils::ivec4 kernel_param_size_stride = {
+        kernel_params.kernel_size[0],
+        kernel_params.kernel_size[1],
+        kernel_params.stride[0],
+        kernel_params.stride[1]};
+
+    const utils::ivec4 kernel_param_pad_dial = {
+        kernel_params.padding[0],
+        kernel_params.padding[1],
+        kernel_params.dilation[0],
+        kernel_params.dilation[1]};
+
+    push_constants = {
+        graph.logical_limits_pc_of(out),
+        graph.sizes_pc_of(in),
+        PushConstantDataInfo(
+            &kernel_param_size_stride, sizeof(kernel_param_size_stride)),
+        PushConstantDataInfo(
+            &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)),
+        PushConstantDataInfo(
+            &extra_params, sizeof(extra_params), sizeof(utils::ivec4)),
+        PushConstantDataInfo(&out_params, sizeof(out_params)),
+    };
+  } else {
+    param_buffers = {
+        graph.logical_limits_ubo(out),
+        graph.sizes_ubo(in),
+        graph.create_params_buffer(kernel_params),
+        graph.create_params_buffer(extra_params),
+        graph.create_params_buffer(out_params),
+    };
+  }
+
+  // transposed is always false for depthwise, output_padding unused
+  ValueRef transposed_ref = graph.add_scalar(false);
+  ValueRef output_padding = graph.add_none();
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      shader,
+      conv2d_dw_global_wg_size,
+      conv2d_dw_local_wg_size,
+      // Inputs and Outputs
+      {{out, vkapi::kWrite}, {{in, arg_weight, arg_bias}, vkapi::kRead}},
+      // Shader params buffers
+      param_buffers,
+      // Push Constants
+      push_constants,
+      // Specialization Constants
+      {},
+      // Resize Args
+      {weight_data, stride, padding, dilation, transposed_ref, output_padding},
+      // Resizing Logic
+      resize_conv2d_node));
+}
+
+//
+// High level operator impl
+//
+
+void conv2d_dw_impl(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef weight_data,
+    const ValueRef bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation,
+    const ValueRef out,
+    const bool clamp_out,
+    const float out_min_val,
+    const float out_max_val) {
+  ValueRef arg_weight = prepack_dw_weights(graph, weight_data);
+  ValueRef arg_bias = prepack_biases(
+      graph,
+      bias,
+      weight_data,
+      /* transposed = */ false,
+      /* storage_type = */ utils::kTexture2D,
+      /* memory_layout = */ utils::kWidthPacked);
+
+  const std::vector<int64_t> in_sizes = graph.sizes_of(in);
+  if (in_sizes.at(0) > 1) {
+    VK_THROW("conv2d: input batch size > 1 is not supported yet!");
+  }
+
+  check_conv_args(graph, in, out);
+
+  Kernel2dParams kernel_params = create_kernel2d_params(
+      graph,
+      weight_data,
+      /*kernel_size_only = */ false,
+      stride,
+      padding,
+      dilation);
+
+  const bool stride_equals_dilation =
+      (kernel_params.stride[0] == kernel_params.dilation[0] &&
+       kernel_params.stride[1] == kernel_params.dilation[1]);
+
+  const auto& overlay_region = utils::make_ivec2({
+      kernel_params.kernel_size[0] +
+          (kernel_params.kernel_size[0] - 1) * (kernel_params.dilation[0] - 1),
+      kernel_params.kernel_size[1] +
+          (kernel_params.kernel_size[1] - 1) * (kernel_params.dilation[1] - 1),
+  });
+  const auto weight_sizes = graph.sizes_of(weight_data);
+  const int32_t in_group_size =
+      utils::safe_downcast<int32_t>(utils::align_up_4(weight_sizes.at(1)));
+  Conv2dDWParams extra_params = {overlay_region, in_group_size};
+
+  OutputParams out_params = {out_min_val, out_max_val};
+
+  std::string kernel_name = pick_conv2d_dw_shader(
+      graph, weight_data, out, stride_equals_dilation, clamp_out);
+
+  add_conv2d_dw_node(
+      graph,
+      in,
+      arg_weight,
+      arg_bias,
+      weight_data,
+      stride,
+      padding,
+      dilation,
+      out,
+      kernel_name,
+      kernel_params,
+      extra_params,
+      out_params);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index f5dd576ba54..077ce285cfc 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -139,18 +139,6 @@ vkapi::ShaderInfo get_conv2d_shader(
   switch (method) {
     case Conv2dMethod::Depthwise:
       kernel_name = "conv2d_dw";
-      if (!prepack_weights) {
-        if (!stride_equals_dilation) {
-          kernel_name += "_sned";
-        }
-        const auto& weight_sizes = graph.get_tref(weight)->sizes;
-        if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) {
-          kernel_name += "_output_tile_3x3";
-        }
-        if (weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5) {
-          kernel_name += "_output_tile_5x5";
-        }
-      }
       break;
     case Conv2dMethod::Pointwise:
       if (prepack_weights) {
@@ -296,17 +284,6 @@ Conv2dMethod get_conv2d_method(
   return Conv2dMethod::SlidingWindow;
 }
 
-utils::uvec2 get_conv2d_dw_dispatch_divisor(
-    const std::vector<int64_t>& weight_sizes) {
-  if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) {
-    return {4u, 2u};
-  }
-  if (weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5) {
-    return {4u, 2u};
-  }
-  return {4u, 2u};
-}
-
 utils::uvec3 create_conv2d_global_wg_size(
     ComputeGraph& graph,
     const Conv2dMethod method,
@@ -319,14 +296,6 @@ utils::uvec3 create_conv2d_global_wg_size(
         utils::div_up(image_extents[0u], 1u),
         utils::div_up(image_extents[1u], 4u),
         image_extents[2u]};
-  } else if (method == Conv2dMethod::Depthwise && stride_equals_dilation) {
-    const utils::uvec3 image_extents = graph.create_global_wg_size(out);
-    const utils::uvec2 div =
-        get_conv2d_dw_dispatch_divisor(graph.get_tref(weight_data)->sizes);
-    return {
-        utils::div_up(image_extents[0], div[0]),
-        utils::div_up(image_extents[1], div[1]),
-        image_extents[2]};
   } else {
     return graph.create_global_wg_size(out);
   }
@@ -343,10 +312,7 @@ utils::uvec3 conv2d_global_wg_size(
 
   // Determine method from shader name
   Conv2dMethod method;
-  if (shader.kernel_name.find("conv2d_dw") != std::string::npos) {
-    method = Conv2dMethod::Depthwise;
-  } else if (
-      shader.kernel_name.find("conv2d_pw") != std::string::npos ||
+  if (shader.kernel_name.find("conv2d_pw") != std::string::npos ||
       (shader.kernel_name.find("conv2d") != std::string::npos &&
        shader.kernel_name.find("conv_transpose2d") == std::string::npos)) {
     // Check if it's pointwise by examining weight sizes
@@ -369,21 +335,7 @@ utils::uvec3 conv2d_global_wg_size(
   utils::uvec3 wg_size = create_conv2d_global_wg_size(
       *graph, method, out, weight_data, stride_equals_dilation);
 
-  if (method == Conv2dMethod::Depthwise) {
-    // The output_tile shaders (conv2d_dw_output_tile,
-    // conv2d_dw_sned_output_tile) use a 2D dispatch: (x_tile, y_tile) packed
-    // into glb_x, channel in glb_y. The base conv2d_dw shader uses a 1D
-    // dispatch: all (x, y, channel) packed into glb_x. For the base shader, we
-    // must use {W*H*C_packed, 1, 1}.
-    const bool uses_output_tile =
-        shader.kernel_name.find("_output_tile") != std::string::npos;
-    if (uses_output_tile) {
-      wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1};
-    } else {
-      const utils::uvec3 base_extents = graph->create_global_wg_size(out);
-      wg_size = {base_extents[0] * base_extents[1] * base_extents[2], 1, 1};
-    }
-  } else if (method == Conv2dMethod::Pointwise) {
+  if (method == Conv2dMethod::Pointwise) {
     wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1};
 
     if (shader.kernel_name.find("s1p0") != std::string::npos) {
@@ -406,10 +358,7 @@ utils::uvec3 conv2d_local_wg_size(
 
   // Determine method from shader name
   Conv2dMethod method;
-  if (shader.kernel_name.find("conv2d_dw") != std::string::npos) {
-    method = Conv2dMethod::Depthwise;
-  } else if (
-      shader.kernel_name.find("conv2d_pw") != std::string::npos ||
+  if (shader.kernel_name.find("conv2d_pw") != std::string::npos ||
       (shader.kernel_name.find("conv2d") != std::string::npos &&
        shader.kernel_name.find("conv_transpose2d") == std::string::npos)) {
     method = Conv2dMethod::Pointwise;
@@ -427,8 +376,6 @@ utils::uvec3 conv2d_local_wg_size(
       local_wg_size_y = 2;
     }
     return {64 / local_wg_size_y, local_wg_size_y, 1};
-  } else if (method == Conv2dMethod::Depthwise) {
-    return {64, 1, 1};
   } else {
     return graph->create_local_wg_size(global_workgroup_size);
   }
@@ -499,6 +446,21 @@ void add_conv2d_node(
         out_max_val);
   }
 
+  if (method == Conv2dMethod::Depthwise) {
+    return conv2d_dw_impl(
+        graph,
+        in,
+        weight_data,
+        bias,
+        stride,
+        padding,
+        dilation,
+        out,
+        clamp_out,
+        out_min_val,
+        out_max_val);
+  }
+
   ValueRef arg_weight = prepack_weights(graph, weight_data, method);
   ValueRef arg_bias = prepack_biases(
       graph,
@@ -547,101 +509,13 @@ void add_conv2d_node(
       stride_equals_dilation,
       stride_1_padding_0);
 
-  utils::uvec3 wg_size = create_conv2d_global_wg_size(
-      graph, method, out, weight_data, stride_equals_dilation);
-
-  utils::uvec3 local_wg_size;
-  if (method == Conv2dMethod::Depthwise || method == Conv2dMethod::Pointwise) {
-    wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1};
-  }
-
-  if (method == Conv2dMethod::Pointwise) {
-    uint32_t local_wg_size_y = 1;
-    if (wg_size[1] % 8 == 0) {
-      local_wg_size_y = 8;
-    } else if (wg_size[1] % 4 == 0) {
-      local_wg_size_y = 4;
-    } else if (wg_size[1] % 2 == 0) {
-      local_wg_size_y = 2;
-    }
-    local_wg_size = {64 / local_wg_size_y, local_wg_size_y, 1};
-  } else if (method == Conv2dMethod::Depthwise) {
-    local_wg_size = {64, 1, 1};
-  } else {
-    local_wg_size = graph.create_local_wg_size(wg_size);
-  }
-
-  vkapi::ParamsBindList param_buffers;
-  std::vector<PushConstantDataInfo> push_constants;
-  if (method == Conv2dMethod::Pointwise) {
-    const utils::ivec4 kernel_param_stride_pad = {
-        kernel_params.stride[0],
-        kernel_params.stride[1],
-        kernel_params.padding[0],
-        kernel_params.padding[1],
-    };
-
-    struct Conv2dPWParams final {
-      int in_group_size;
-      int dummy_padding;
-      OutputParams out_params;
-    } param{extra_params.in_group_size, 0, out_params};
-
-    push_constants = {
-        graph.logical_limits_pc_of(out),
-        PushConstantDataInfo(
-            &kernel_param_stride_pad, sizeof(kernel_param_stride_pad)),
-        PushConstantDataInfo(&param, sizeof(param)),
-    };
-  } else if (method == Conv2dMethod::Depthwise) {
-    // output_tile variants use push constants; the base conv2d_dw shader uses
-    // UBOs. Distinguish by checking if "_output_tile" is in the shader name.
-    const bool uses_output_tile =
-        shader.kernel_name.find("_output_tile") != std::string::npos;
-
-    if (uses_output_tile) {
-      const utils::ivec4 kernel_param_size_stride = {
-          kernel_params.kernel_size[0],
-          kernel_params.kernel_size[1],
-          kernel_params.stride[0],
-          kernel_params.stride[1]};
-
-      const utils::ivec4 kernel_param_pad_dial = {
-          kernel_params.padding[0],
-          kernel_params.padding[1],
-          kernel_params.dilation[0],
-          kernel_params.dilation[1]};
-
-      push_constants = {
-          graph.logical_limits_pc_of(out),
-          graph.sizes_pc_of(in),
-          PushConstantDataInfo(
-              &kernel_param_size_stride, sizeof(kernel_param_size_stride)),
-          PushConstantDataInfo(
-              &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)),
-          PushConstantDataInfo(
-              &extra_params, sizeof(extra_params), sizeof(utils::ivec4)),
-          PushConstantDataInfo(&out_params, sizeof(out_params)),
-      };
-    } else {
-      // Base conv2d_dw shader uses UBOs, same as SlidingWindow case
-      param_buffers = {
-          graph.logical_limits_ubo(out),
-          graph.sizes_ubo(in),
-          graph.create_params_buffer(kernel_params),
-          graph.create_params_buffer(extra_params),
-          graph.create_params_buffer(out_params),
-      };
-    }
-  } else {
-    param_buffers = {
-        graph.logical_limits_ubo(out),
-        graph.sizes_ubo(in),
-        graph.create_params_buffer(kernel_params),
-        graph.create_params_buffer(extra_params),
-        graph.create_params_buffer(out_params),
-    };
-  }
+  vkapi::ParamsBindList param_buffers = {
+      graph.logical_limits_ubo(out),
+      graph.sizes_ubo(in),
+      graph.create_params_buffer(kernel_params),
+      graph.create_params_buffer(extra_params),
+      graph.create_params_buffer(out_params),
+  };
 
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
@@ -653,7 +527,7 @@ void add_conv2d_node(
       // Shader params buffers
       param_buffers,
       // Push Constants
-      push_constants,
+      {},
       // Specialization Constants
       {utils::safe_downcast<int32_t>(groups_val)},
       // Resize Args
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.h b/backends/vulkan/runtime/graph/ops/impl/Convolution.h
index f1768a89875..f49e7efcfe7 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.h
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.h
@@ -38,4 +38,22 @@ void conv2d_pw_impl(
     const float out_min_val,
     const float out_max_val);
 
+void conv2d_dw_impl(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef weight_data,
+    const ValueRef bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation,
+    const ValueRef out,
+    const bool clamp_out,
+    const float out_min_val,
+    const float out_max_val);
+
+void resize_conv2d_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args);
+
 } // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp b/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp
new file mode 100644
index 00000000000..f7454b6b93a
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Convolution.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+//
+// Local copies of Conv2dDW internals, extended with impl_selector support.
+// These mirror the logic in Conv2dDW.cpp but allow forcing a specific tile size
+// variant via the impl_selector string.
+//
+
+struct Conv2dDWParams final {
+  utils::ivec2 overlay_region;
+  int in_group_size;
+};
+
+struct OutputParams final {
+  float out_min;
+  float out_max;
+};
+
+static std::string pick_conv2d_dw_shader_with_selector(
+    ComputeGraph& graph,
+    const ValueRef weight_data,
+    const ValueRef out,
+    const bool stride_equals_dilation,
+    const bool clamp_out,
+    const std::string& impl_selector) {
+  std::string kernel_name = "conv2d_dw";
+  kernel_name.reserve(40);
+
+  const auto& weight_sizes = graph.get_tref(weight_data)->sizes;
+  const bool is_3x3 = weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3;
+  const bool is_5x5 = weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5;
+
+  if (!stride_equals_dilation) {
+    kernel_name += "_sned";
+  }
+
+  if (is_3x3) {
+    kernel_name += "_output_tile_3x3";
+    if (impl_selector == "b1x1") {
+      kernel_name += "_b1x1";
+    } else if (impl_selector == "b4x2") {
+      // b4x2 is the default (no suffix)
+    } else {
+      // Auto-selection: use b1x1 on Mali
+      if (stride_equals_dilation && graph.device_is_mali()) {
+        kernel_name += "_b1x1";
+      }
+    }
+  } else if (is_5x5) {
+    kernel_name += "_output_tile_5x5";
+    // No b1x1 variant for 5x5; impl_selector is ignored for batch size
+  }
+
+  if (clamp_out) {
+    kernel_name += "_clamp";
+  }
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  return kernel_name;
+}
+
+static utils::uvec3 conv2d_dw_global_wg_size_fn(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
+  const ValueRef out = args.at(0).refs.at(0);
+
+  const bool uses_output_tile =
+      shader.kernel_name.find("_output_tile") != std::string::npos;
+
+  if (uses_output_tile) {
+    const bool is_sned = shader.kernel_name.find("_sned") != std::string::npos;
+    const utils::uvec3 image_extents = graph->create_global_wg_size(out);
+
+    if (is_sned) {
+      return {image_extents[0] * image_extents[1], image_extents[2], 1};
+    }
+
+    uint32_t batch_x = 4u;
+    uint32_t batch_y = 2u;
+    if (shader.kernel_name.find("_b1x1") != std::string::npos) {
+      batch_x = 1u;
+      batch_y = 1u;
+    }
+
+    uint32_t scaled_x = utils::div_up(image_extents[0], batch_x);
+    uint32_t scaled_y = utils::div_up(image_extents[1], batch_y);
+    return {scaled_x * scaled_y, image_extents[2], 1};
+  }
+
+  const utils::uvec3 base_extents = graph->create_global_wg_size(out);
+  return {base_extents[0] * base_extents[1] * base_extents[2], 1, 1};
+}
+
+static utils::uvec3 conv2d_dw_local_wg_size_fn(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)graph;
+  (void)shader;
+  (void)global_workgroup_size;
+  (void)args;
+  (void)resize_args;
+  return {64, 1, 1};
+}
+
+static ValueRef prepack_dw_weights(ComputeGraph& graph, const ValueRef vref) {
+  const auto original_sizes = graph.sizes_of(vref);
+
+  int64_t out_channels_padded =
+      utils::align_up_4(utils::val_at(-4, original_sizes));
+  int64_t height = utils::val_at(-2, original_sizes);
+  int64_t width = utils::val_at(-1, original_sizes);
+
+  const std::vector<int64_t> final_sizes = {
+      4, out_channels_padded / 4, height * width};
+
+  ValueRef v = graph.add_tensor(
+      final_sizes,
+      graph.dtype_of(vref),
+      utils::kTexture2D,
+      utils::kChannelsPacked);
+
+  std::string kernel_name = "conv2d_dw_prepack_weights";
+  add_dtype_suffix(kernel_name, graph.dtype_of(v));
+  add_dtype_suffix(kernel_name, graph.get_staging_dtype_for(vref));
+
+  const auto original_sizes_pc =
+      utils::make_ivec4(original_sizes, /*reverse=*/true);
+  graph.prepack_nodes().emplace_back(new PrepackNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      graph.create_global_wg_size(v),
+      graph.create_local_wg_size(v),
+      vref,
+      v,
+      {},
+      {graph.packed_dim_of(v)},
+      {graph.sizes_pc_of(v),
+       PushConstantDataInfo(&original_sizes_pc, sizeof(original_sizes_pc))}));
+
+  return v;
+}
+
+static void conv2d_dw_with_selector(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef weight_data,
+    const ValueRef bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation,
+    const ValueRef out,
+    const std::string& impl_selector) {
+  ValueRef arg_weight = prepack_dw_weights(graph, weight_data);
+  ValueRef arg_bias = prepack_biases(
+      graph,
+      bias,
+      weight_data,
+      /*transposed=*/false,
+      /*storage_type=*/utils::kTexture2D,
+      /*memory_layout=*/utils::kWidthPacked);
+
+  check_conv_args(graph, in, out);
+
+  Kernel2dParams kernel_params = create_kernel2d_params(
+      graph,
+      weight_data,
+      /*kernel_size_only=*/false,
+      stride,
+      padding,
+      dilation);
+
+  const bool stride_equals_dilation =
+      (kernel_params.stride[0] == kernel_params.dilation[0] &&
+       kernel_params.stride[1] == kernel_params.dilation[1]);
+
+  const auto& overlay_region = utils::make_ivec2({
+      kernel_params.kernel_size[0] +
+          (kernel_params.kernel_size[0] - 1) * (kernel_params.dilation[0] - 1),
+      kernel_params.kernel_size[1] +
+          (kernel_params.kernel_size[1] - 1) * (kernel_params.dilation[1] - 1),
+  });
+  const auto weight_sizes = graph.sizes_of(weight_data);
+  const int32_t in_group_size =
+      utils::safe_downcast<int32_t>(utils::align_up_4(weight_sizes.at(1)));
+  Conv2dDWParams extra_params = {overlay_region, in_group_size};
+
+  OutputParams out_params = {
+      std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()};
+
+  std::string kernel_name = pick_conv2d_dw_shader_with_selector(
+      graph,
+      weight_data,
+      out,
+      stride_equals_dilation,
+      /*clamp_out=*/false,
+      impl_selector);
+
+  vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);
+
+  vkapi::ParamsBindList param_buffers;
+  std::vector<PushConstantDataInfo> push_constants;
+
+  const bool uses_output_tile =
+      kernel_name.find("_output_tile") != std::string::npos;
+
+  if (uses_output_tile) {
+    const utils::ivec4 kernel_param_size_stride = {
+        kernel_params.kernel_size[0],
+        kernel_params.kernel_size[1],
+        kernel_params.stride[0],
+        kernel_params.stride[1]};
+
+    const utils::ivec4 kernel_param_pad_dial = {
+        kernel_params.padding[0],
+        kernel_params.padding[1],
+        kernel_params.dilation[0],
+        kernel_params.dilation[1]};
+
+    push_constants = {
+        graph.logical_limits_pc_of(out),
+        graph.sizes_pc_of(in),
+        PushConstantDataInfo(
+            &kernel_param_size_stride, sizeof(kernel_param_size_stride)),
+        PushConstantDataInfo(
+            &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)),
+        PushConstantDataInfo(
+            &extra_params, sizeof(extra_params), sizeof(utils::ivec4)),
+        PushConstantDataInfo(&out_params, sizeof(out_params)),
+    };
+  } else {
+    param_buffers = {
+        graph.logical_limits_ubo(out),
+        graph.sizes_ubo(in),
+        graph.create_params_buffer(kernel_params),
+        graph.create_params_buffer(extra_params),
+        graph.create_params_buffer(out_params),
+    };
+  }
+
+  ValueRef transposed_ref = graph.add_scalar(false);
+  ValueRef output_padding = graph.add_none();
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      shader,
+      conv2d_dw_global_wg_size_fn,
+      conv2d_dw_local_wg_size_fn,
+      {{out, vkapi::kWrite}, {{in, arg_weight, arg_bias}, vkapi::kRead}},
+      param_buffers,
+      push_constants,
+      {},
+      {weight_data, stride, padding, dilation, transposed_ref, output_padding},
+      resize_conv2d_node));
+}
+
+void test_conv2d_dw(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  // args[0] = input [N, C, H, W]
+  // args[1] = weight [C, 1, K_h, K_w] (constant)
+  // args[2] = bias (constant, or none)
+  // args[3] = stride_h (int)
+  // args[4] = stride_w (int)
+  // args[5] = padding_h (int)
+  // args[6] = padding_w (int)
+  // args[7] = dilation_h (int)
+  // args[8] = dilation_w (int)
+  // args[9] = impl_selector (string)
+  // args[10] = output
+  const ValueRef input = args.at(0);
+  const ValueRef weight = args.at(1);
+  const ValueRef bias = args.at(2);
+  const int64_t stride_h = graph.extract_scalar<int64_t>(args.at(3));
+  const int64_t stride_w = graph.extract_scalar<int64_t>(args.at(4));
+  const int64_t padding_h = graph.extract_scalar<int64_t>(args.at(5));
+  const int64_t padding_w = graph.extract_scalar<int64_t>(args.at(6));
+  const int64_t dilation_h = graph.extract_scalar<int64_t>(args.at(7));
+  const int64_t dilation_w = graph.extract_scalar<int64_t>(args.at(8));
+  const std::string impl_selector = graph.extract_string(args.at(9));
+  const ValueRef out = args.at(10);
+
+  ValueRef stride =
+      graph.add_scalar_list<int64_t>(std::vector<int64_t>{stride_h, stride_w});
+  ValueRef padding = graph.add_scalar_list<int64_t>(
+      std::vector<int64_t>{padding_h, padding_w});
+  ValueRef dilation = graph.add_scalar_list<int64_t>(
+      std::vector<int64_t>{dilation_h, dilation_w});
+
+  if (impl_selector.empty()) {
+    // Auto-selection: delegate to aten.convolution.default
+    const int64_t channels = graph.sizes_of(input).at(1);
+    ValueRef transposed = graph.add_scalar<bool>(false);
+    ValueRef output_padding =
+        graph.add_scalar_list<int64_t>(std::vector<int64_t>{0, 0});
+    ValueRef groups = graph.add_scalar<int64_t>(channels);
+
+    VK_GET_OP_FN("aten.convolution.default")
+    (graph,
+     {input,
+      weight,
+      bias,
+      stride,
+      padding,
+      dilation,
+      transposed,
+      output_padding,
+      groups,
+      out});
+  } else {
+    // Forced variant: build the dispatch directly with impl_selector
+    conv2d_dw_with_selector(
+        graph,
+        input,
+        weight,
+        bias,
+        stride,
+        padding,
+        dilation,
+        out,
+        impl_selector);
+  }
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(test_etvk.test_conv2d_dw.default, test_conv2d_dw);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl
index 471b2530910..84432bce30b 100644
--- a/backends/vulkan/test/custom_ops/targets.bzl
+++ b/backends/vulkan/test/custom_ops/targets.bzl
@@ -101,3 +101,4 @@ def define_common_targets(is_fbcode = False):
     define_custom_op_test_binary("test_q8ta_conv2d_transposed")
     define_custom_op_test_binary("test_mm")
     define_custom_op_test_binary("test_conv2d_pw")
+    define_custom_op_test_binary("test_conv2d_dw")
diff --git a/backends/vulkan/test/custom_ops/test_conv2d_dw.cpp b/backends/vulkan/test/custom_ops/test_conv2d_dw.cpp
new file mode 100644
index 00000000000..9c48c320d62
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/test_conv2d_dw.cpp
@@ -0,0 +1,435 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <iostream>
+#include <vector>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+#include "conv2d_utils.h"
+#include "utils.h"
+
+using namespace executorch::vulkan::prototyping;
+using namespace vkcompute;
+
+static constexpr int64_t kRefDimSizeLimit = 64;
+
+struct InputDims {
+  int64_t N;
+  int64_t C;
+  int64_t H;
+  int64_t W;
+
+  InputDims(int64_t n, int64_t c, int64_t h, int64_t w)
+      : N(n), C(c), H(h), W(w) {}
+};
+
+struct Conv2dDwConfig {
+  InputDims dims;
+  KernelSize kernel;
+  Stride stride;
+  Padding padding;
+  Dilation dilation;
+  bool has_bias;
+};
+
+static int64_t calc_out_size(
+    int64_t in_size,
+    int64_t kernel_size,
+    int64_t stride,
+    int64_t padding,
+    int64_t dilation) {
+  return (in_size + 2 * padding - dilation * (kernel_size - 1) - 1) / stride +
+      1;
+}
+
+static TestCase create_conv2d_dw_test_case(
+    const Conv2dDwConfig& config,
+    vkapi::ScalarType dtype,
+    utils::StorageType storage_type,
+    utils::GPUMemoryLayout memory_layout,
+    const std::string& impl_selector = "") {
+  TestCase test_case;
+
+  bool is_perf = config.dims.C > kRefDimSizeLimit ||
+      config.dims.H > kRefDimSizeLimit || config.dims.W > kRefDimSizeLimit;
+
+  std::string prefix = is_perf ? "PERF" : "ACCU";
+  std::string storage_str = storage_type_abbrev(storage_type);
+  std::string layout_str = layout_abbrev(memory_layout);
+  std::string dtype_str = (dtype == vkapi::kHalf) ? "f16" : "f32";
+  std::string bias_str = config.has_bias ? "+bias" : "";
+
+  int64_t H_out = calc_out_size(
+      config.dims.H,
+      config.kernel.h,
+      config.stride.h,
+      config.padding.h,
+      config.dilation.h);
+  int64_t W_out = calc_out_size(
+      config.dims.W,
+      config.kernel.w,
+      config.stride.w,
+      config.padding.w,
+      config.dilation.w);
+
+  std::string shape = "[" + std::to_string(config.dims.N) + "," +
+      std::to_string(config.dims.C) + "," + std::to_string(config.dims.H) +
+      "," + std::to_string(config.dims.W) + "] k" +
+      std::to_string(config.kernel.h) + "x" + std::to_string(config.kernel.w) +
+      " s" + std::to_string(config.stride.h) + " p" +
+      std::to_string(config.padding.h) + " d" +
+      std::to_string(config.dilation.h) + "->[" +
+      std::to_string(config.dims.N) + "," + std::to_string(config.dims.C) +
+      "," + std::to_string(H_out) + "," + std::to_string(W_out) + "]";
+
+  std::string selector_str =
+      impl_selector.empty() ? "" : " [" + impl_selector + "]";
+
+  std::string name = prefix + "  conv2d_dw" + bias_str + " " + shape + "  " +
+      storage_str + "(" + layout_str + ") " + dtype_str + selector_str;
+
+  test_case.set_name(name);
+  test_case.set_operator_name("test_etvk.test_conv2d_dw.default");
+
+  // Input tensor [N, C, H, W]
+  ValueSpec input(
+      {config.dims.N, config.dims.C, config.dims.H, config.dims.W},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::RANDOM);
+
+  // Weight tensor [C, 1, K_h, K_w] - constant
+  ValueSpec weight(
+      {config.dims.C, 1, config.kernel.h, config.kernel.w},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::RANDOM);
+  weight.set_constant(true);
+
+  test_case.add_input_spec(input);
+  test_case.add_input_spec(weight);
+
+  // Bias (or none)
+  if (config.has_bias) {
+    ValueSpec bias(
+        {config.dims.C},
+        dtype,
+        storage_type,
+        memory_layout,
+        DataGenType::RANDOM);
+    bias.set_constant(true);
+    test_case.add_input_spec(bias);
+  } else {
+    ValueSpec none_bias(static_cast<int32_t>(0));
+    none_bias.set_none(true);
+    test_case.add_input_spec(none_bias);
+  }
+
+  // stride_h, stride_w, padding_h, padding_w, dilation_h, dilation_w
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.stride.h)));
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.stride.w)));
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.padding.h)));
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.padding.w)));
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.dilation.h)));
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.dilation.w)));
+
+  // impl_selector string
+  test_case.add_input_spec(ValueSpec::make_string(impl_selector));
+
+  // Output tensor [N, C, H_out, W_out]
+  ValueSpec output(
+      {config.dims.N, config.dims.C, H_out, W_out},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::ZEROS);
+  test_case.add_output_spec(output);
+
+  if (dtype == vkapi::kHalf) {
+    test_case.set_abs_tolerance(1e-1f);
+    test_case.set_rel_tolerance(1e-2f);
+  } else {
+    test_case.set_abs_tolerance(1e-3f);
+    test_case.set_rel_tolerance(1e-3f);
+  }
+
+  test_case.set_shader_filter({"nchw_to", "to_nchw", "view_copy"});
+
+  return test_case;
+}
+
+// Reference implementation for depthwise conv2d
+static void conv2d_dw_reference_impl(TestCase& test_case) {
+  const ValueSpec& input = test_case.inputs()[0];
+  const ValueSpec& weight = test_case.inputs()[1];
+  const ValueSpec& bias_spec = test_case.inputs()[2];
+  ValueSpec& output = test_case.outputs()[0];
+
+  if (input.dtype != vkapi::kFloat) {
+    throw std::invalid_argument("Reference only supports float");
+  }
+
+  auto input_sizes = input.get_tensor_sizes();
+  auto weight_sizes = weight.get_tensor_sizes();
+  auto output_sizes = output.get_tensor_sizes();
+
+  int64_t N = input_sizes[0];
+  int64_t C = input_sizes[1];
+  int64_t H_in = input_sizes[2];
+  int64_t W_in = input_sizes[3];
+  int64_t K_h = weight_sizes[2];
+  int64_t K_w = weight_sizes[3];
+  int64_t H_out = output_sizes[2];
+  int64_t W_out = output_sizes[3];
+
+  int64_t stride_h = test_case.inputs()[3].get_int_value();
+  int64_t stride_w = test_case.inputs()[4].get_int_value();
+  int64_t padding_h = test_case.inputs()[5].get_int_value();
+  int64_t padding_w = test_case.inputs()[6].get_int_value();
+  int64_t dilation_h = test_case.inputs()[7].get_int_value();
+  int64_t dilation_w = test_case.inputs()[8].get_int_value();
+
+  auto& input_data = input.get_float_data();
+  auto& weight_data = weight.get_float_data();
+  auto& ref_data = output.get_ref_float_data();
+  ref_data.resize(N * C * H_out * W_out, 0.0f);
+
+  for (int64_t n = 0; n < N; ++n) {
+    for (int64_t c = 0; c < C; ++c) {
+      for (int64_t oh = 0; oh < H_out; ++oh) {
+        for (int64_t ow = 0; ow < W_out; ++ow) {
+          float sum = 0.0f;
+          for (int64_t kh = 0; kh < K_h; ++kh) {
+            for (int64_t kw = 0; kw < K_w; ++kw) {
+              int64_t ih = oh * stride_h - padding_h + kh * dilation_h;
+              int64_t iw = ow * stride_w - padding_w + kw * dilation_w;
+              if (ih >= 0 && ih < H_in && iw >= 0 && iw < W_in) {
+                float in_val = input_data
+                    [n * (C * H_in * W_in) + c * (H_in * W_in) + ih * W_in +
+                     iw];
+                // weight is [C, 1, K_h, K_w]
+                float w_val = weight_data[c * (K_h * K_w) + kh * K_w + kw];
+                sum += in_val * w_val;
+              }
+            }
+          }
+          if (!bias_spec.is_none()) {
+            auto& bias_data = bias_spec.get_float_data();
+            sum += bias_data[c];
+          }
+          ref_data
+              [n * (C * H_out * W_out) + c * (H_out * W_out) + oh * W_out +
+               ow] = sum;
+        }
+      }
+    }
+  }
+}
+
+static std::vector<TestCase> generate_conv2d_dw_test_cases() {
+  std::vector<TestCase> test_cases;
+
+  std::vector<utils::StorageType> storage_types = {utils::kTexture3D};
+  utils::GPUMemoryLayout layout = utils::kChannelsPacked;
+
+  // Accuracy shapes (small enough for float reference validation)
+  std::vector<Conv2dDwConfig> accuracy_configs = {
+      {InputDims(1, 8, 16, 16),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 8, 16, 16),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       true},
+      {InputDims(1, 8, 16, 16),
+       KernelSize(5, 5),
+       Stride(1, 1),
+       Padding(2, 2),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 8, 16, 16),
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      // Non-multiple-of-4 channels
+      {InputDims(1, 11, 16, 16),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 3, 16, 16),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+  };
+
+  // EdgeTAM depthwise shapes (from profiling data)
+  std::vector<Conv2dDwConfig> perf_configs = {
+      // Backbone stem and early stages
+      {InputDims(1, 24, 512, 512),
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 48, 256, 256),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 48, 256, 256),
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 96, 128, 128),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 96, 128, 128),
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 192, 64, 64),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 192, 64, 64),
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 384, 32, 32),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      // 5x5 kernels
+      {InputDims(1, 48, 256, 256),
+       KernelSize(5, 5),
+       Stride(1, 1),
+       Padding(2, 2),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 96, 128, 128),
+       KernelSize(5, 5),
+       Stride(1, 1),
+       Padding(2, 2),
+       Dilation(1, 1),
+       false},
+      // FPN/Neck
+      {InputDims(1, 256, 256, 256),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 256, 128, 128),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+  };
+
+  // Generate accuracy test cases (float only)
+  for (const auto& config : accuracy_configs) {
+    for (auto st : storage_types) {
+      test_cases.push_back(
+          create_conv2d_dw_test_case(config, vkapi::kFloat, st, layout));
+    }
+  }
+
+  // Generate performance test cases (float and half)
+  for (const auto& config : perf_configs) {
+    std::vector<vkapi::ScalarType> dtypes = {vkapi::kFloat, vkapi::kHalf};
+    for (auto dtype : dtypes) {
+      for (auto st : storage_types) {
+        // Auto-selection (empty impl_selector)
+        test_cases.push_back(
+            create_conv2d_dw_test_case(config, dtype, st, layout));
+
+        // Force b4x2 variant
+        test_cases.push_back(
+            create_conv2d_dw_test_case(config, dtype, st, layout, "b4x2"));
+
+        // Force b1x1 variant (only for 3x3 kernels; for 5x5 it falls back
+        // to default, but we still generate it to test the fallback path)
+        test_cases.push_back(
+            create_conv2d_dw_test_case(config, dtype, st, layout, "b1x1"));
+      }
+    }
+  }
+
+  return test_cases;
+}
+
+static int64_t conv2d_dw_flop_calculator(const TestCase& test_case) {
+  auto input_sizes = test_case.inputs()[0].get_tensor_sizes();
+  auto weight_sizes = test_case.inputs()[1].get_tensor_sizes();
+  auto output_sizes = test_case.outputs()[0].get_tensor_sizes();
+
+  int64_t N = output_sizes[0];
+  int64_t C = output_sizes[1];
+  int64_t H_out = output_sizes[2];
+  int64_t W_out = output_sizes[3];
+  int64_t K_h = weight_sizes[2];
+  int64_t K_w = weight_sizes[3];
+
+  // Each output element: K_h * K_w multiplies + (K_h * K_w - 1) adds
+  return 2 * N * C * H_out * W_out * K_h * K_w;
+}
+
+static void reference_impl(TestCase& test_case) {
+  conv2d_dw_reference_impl(test_case);
+}
+
+int main(int argc, char* argv[]) {
+  set_debugging(false);
+  set_print_output(false);
+  set_print_latencies(false);
+  set_use_gpu_timestamps(true);
+
+  print_performance_header();
+  std::cout << "Depthwise Conv2d Benchmark" << std::endl;
+  print_separator();
+
+  ReferenceComputeFunc ref_fn = reference_impl;
+
+  auto results = execute_test_cases(
+      generate_conv2d_dw_test_cases,
+      conv2d_dw_flop_calculator,
+      "Conv2dDW",
+      3,
+      10,
+      ref_fn);
+
+  return 0;
+}