From 8c951f4d528ca5892fab6e208a07abc511089ef4 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm1479.ncg0.facebook.com>
Date: Thu, 12 Feb 2026 19:57:23 -0800
Subject: [PATCH 1/4] [ET-VK] Layout-flexible impl of quantized binary

Pull Request resolved: https://github.com/pytorch/executorch/pull/17391

This refactors the quantized binary add operator to support all PackedInt8 memory layouts (4W, 4C, 4W4C, 4H4W, 4C1W) instead of being hardcoded to 4W4C. The shader is rewritten to use the block indexing framework (BlockConfig, block_int8x4_load/store) and BufferMetadata for layout-agnostic tensor access, replacing the previous linear dispatch that assumed 4W4C ordering.

Key changes:
- Renames shader from binary_q8ta_q8ta_q8to to q8ta_binary, and op from add_q8ta_q8ta_q8to to q8ta_add
- Shader now uses contiguous_block_idx_to_tensor4d_idx_with_block_config for dispatch and generated load/store functions for layout-flexible int8x4 access
- C++ dispatch uses pick_linear_global_wg_with_block_config and passes BufferMetadata UBOs for output and both inputs, plus hashed_layout specialization constants
- Moves the test operator into a separate TestQ8taBinary.cpp file that parameterizes on GPUMemoryLayout, testing all 5 layouts
- Updates op_registry to accept PACKED_INT8_BUFFER (all layouts) instead of just PACKED_INT8_4W4C_BUFFER

This diff was authored with Claude.
ghstack-source-id: 340983074
@exported-using-ghexport

Differential Revision: [D93000170](https://our.internmc.facebook.com/intern/diff/D93000170/)
---
 .github/workflows/pull.yml                    |   2 +-
 backends/vulkan/custom_ops_lib.py             |  10 +-
 backends/vulkan/op_registry.py                |  18 +-
 backends/vulkan/patterns/quantized_binary.py  |   2 +-
 .../graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl |  76 ------
 .../runtime/graph/ops/glsl/q8ta_binary.glsl   |  91 +++++++
 ...y_q8ta_q8ta_q8to.yaml => q8ta_binary.yaml} |  11 +-
 .../{QuantizedBinary.cpp => Q8taBinary.cpp}   | 141 +++-------
 .../runtime/graph/ops/impl/Q8taBinary.h       |  33 +++
 .../vulkan/test/custom_ops/CMakeLists.txt     |   2 +-
 .../test/custom_ops/impl/TestQ8taBinary.cpp   |  88 ++++++
 backends/vulkan/test/custom_ops/targets.bzl   |   2 +-
 ...q8ta_q8to_add.cpp => test_q8ta_binary.cpp} | 254 ++++++++++++------
 13 files changed, 446 insertions(+), 284 deletions(-)
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.glsl
 rename backends/vulkan/runtime/graph/ops/glsl/{binary_q8ta_q8ta_q8to.yaml => q8ta_binary.yaml} (61%)
 rename backends/vulkan/runtime/graph/ops/impl/{QuantizedBinary.cpp => Q8taBinary.cpp} (53%)
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h
 create mode 100644 backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
 rename backends/vulkan/test/custom_ops/{q8ta_q8ta_q8to_add.cpp => test_q8ta_binary.cpp} (52%)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index eb09a1c8aa2..e3bdce67dc1 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -1136,7 +1136,7 @@ jobs:
         ./cmake-out/backends/vulkan/test/custom_ops/choose_qparams_per_row
         ./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_qdq
         ./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_clone
-        ./cmake-out/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add
+        ./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_binary
 
         # "Classic" Operator tests
         PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_op.sh --build
diff --git a/backends/vulkan/custom_ops_lib.py b/backends/vulkan/custom_ops_lib.py
index 1f4c962fdb3..f2e4482c9b9 100644
--- a/backends/vulkan/custom_ops_lib.py
+++ b/backends/vulkan/custom_ops_lib.py
@@ -564,11 +564,11 @@ def apply_rotary_emb_impl(
 apply_rotary_emb_op = getattr(getattr(torch.ops, namespace), name)
 
 ########################
-## add_q8ta_q8ta_q8to ##
+##      q8ta_add      ##
 ########################
 
 
-def add_q8ta_q8ta_q8to_impl(
+def q8ta_add_impl(
     input_a: torch.Tensor,
     input_b: torch.Tensor,
     input_a_scale: float,
@@ -598,12 +598,12 @@ def add_q8ta_q8ta_q8to_impl(
     return quantized_result
 
 
-name = "add_q8ta_q8ta_q8to"
+name = "q8ta_add"
 lib.define(
     f"{name}(Tensor input_a, Tensor input_b, float input_a_scale, int input_a_zero_point, float input_b_scale, int input_b_zero_point, float output_scale, int output_zero_point, float alpha) -> Tensor"
 )
-lib.impl(name, add_q8ta_q8ta_q8to_impl, "CompositeExplicitAutograd")
-add_q8ta_q8ta_q8to_op = getattr(getattr(torch.ops, namespace), name)
+lib.impl(name, q8ta_add_impl, "CompositeExplicitAutograd")
+q8ta_add_op = getattr(getattr(torch.ops, namespace), name)
 
 #############################
 ## select_as_symint ##
diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index 2b40976e4a0..b58d6407308 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -501,19 +501,24 @@ def register_torchao_choose_qparams_affine():
 
 
 # =============================================================================
-# QuantizedBinary.cpp
+# Q8taBinary.cpp
 # =============================================================================
 
 
-@update_features(exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default)
-def register_add_q8ta_q8ta_q8to():
+@update_features(exir_ops.edge.et_vk.q8ta_add.default)
+def register_q8ta_add():
     return OpFeatures(
-        inputs_storage=utils.PACKED_INT8_4W4C_BUFFER,
+        inputs_storage=utils.PACKED_INT8_BUFFER,
         supports_resize=False,
         supports_prepacking=True,
     )
 
 
+# =============================================================================
+# Reduce.cpp
+# =============================================================================
+
+
 def get_dims_reduced(node: torch.fx.Node) -> Union[int, List[int]]:
     ndim = utils.ndim_of(node.args[0])
     assert ndim is not None
@@ -623,11 +628,6 @@ def pick_storage_for_reduce(node: torch.fx.Node):
     return inputs_storage, outputs_storage
 
 
-# =============================================================================
-# Reduce.cpp
-# =============================================================================
-
-
 @update_features(
     [
         exir_ops.edge.aten.mean.dim,
diff --git a/backends/vulkan/patterns/quantized_binary.py b/backends/vulkan/patterns/quantized_binary.py
index da4985b931d..9a18f148736 100644
--- a/backends/vulkan/patterns/quantized_binary.py
+++ b/backends/vulkan/patterns/quantized_binary.py
@@ -133,7 +133,7 @@ def make_add_q8ta_q8ta_q8to_custom_op(
         exir_ops.edge.aten.add.Tensor,
         exir_ops.edge.aten.add_.Tensor,
     }:
-        op_target = exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default
+        op_target = exir_ops.edge.et_vk.q8ta_add.default
     else:
         # For future binary operations, add more mappings here
         raise NotImplementedError(
diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl
deleted file mode 100644
index c5dac0d6571..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-${define_required_extensions("buffer", DTYPE)}
-
-#define PRECISION ${PRECISION}
-
-#define NAME ${VARIANT_NAME}
-
-#define VEC4_T ${texel_load_type(DTYPE, "buffer")}
-#define T ${texel_load_component_type(DTYPE, "buffer")}
-
-$if IO_STORAGE == "buffer":
-  #define PACKED_INT8_OUTPUT_BUFFER
-  #define PACKED_INT8_INPUT_BUFFER
-
-#define op(X, Y) ${OPERATOR}
-
-layout(std430) buffer;
-
-#include "indexing.glslh"
-#include "common.glslh"
-
-${layout_declare_tensor(B, "w", "t_packed_int8_out", "int", IO_STORAGE, is_scalar_array=False)}
-${layout_declare_tensor(B, "r", "t_packed_int8_in_a", "int", IO_STORAGE, is_scalar_array=False)}
-${layout_declare_tensor(B, "r", "t_packed_int8_in_b", "int", IO_STORAGE, is_scalar_array=False)}
-
-${layout_declare_ubo(B, "ivec4", "out_sizes")}
-
-layout(push_constant) uniform restrict Block {
-  float input_a_scale;
-  int input_a_zp;
-  float input_b_scale;
-  int input_b_zp;
-  float output_inv_scale;
-  int output_zp;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-void main() {
-  const int tid = int(gl_GlobalInvocationID.x);
-
-  const int W4 = div_up_4(out_sizes.x);
-  const int H = out_sizes.y;
-  const int C4 = div_up_4(out_sizes.z);
-  const int N = out_sizes.w;
-
-  if (tid >= W4 * H * C4 * N) {
-    return;
-  }
-
-  const ivec4 in_block_1 = t_packed_int8_in_a[tid];
-  const ivec4 in_block_2 = t_packed_int8_in_b[tid];
-
-  ivec4 out_block = ivec4(pack_into_int32(ivec4(output_zp)));
-
-  for (int row = 0; row < 4; row++) {
-    vec4 in_texel_1 = unpack_and_dequantize(
-        in_block_1[row], input_a_scale, input_a_zp);
-    vec4 in_texel_2 = unpack_and_dequantize(
-        in_block_2[row], input_b_scale, input_b_zp);
-
-    vec4 out_texel = op(in_texel_1, in_texel_2);
-    out_block[row] = quantize_and_pack(out_texel, output_inv_scale, output_zp);
-  }
-
-  t_packed_int8_out[tid] = out_block;
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.glsl
new file mode 100644
index 00000000000..60f437fbdce
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.glsl
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+${define_active_storage_type("buffer")}
+
+#define op(X, Y) ${OPERATOR}
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+#include "common.glslh"
+#include "block_indexing.glslh"
+#include "block_int8x4_load.glslh"
+#include "block_int8x4_store.glslh"
+
+// Output buffer: packed int8x4 values
+${layout_declare_tensor(B, "w", "t_out", "int", "buffer")}
+// Input buffers: packed int8x4 values
+${layout_declare_tensor(B, "r", "t_in_a", "int", "buffer")}
+${layout_declare_tensor(B, "r", "t_in_b", "int", "buffer")}
+
+// Metadata for output and input tensors
+${layout_declare_ubo(B, "BufferMetadata", "out_meta")}
+${layout_declare_ubo(B, "BufferMetadata", "in_a_meta")}
+${layout_declare_ubo(B, "BufferMetadata", "in_b_meta")}
+
+layout(push_constant) uniform restrict Block {
+  float input_a_scale;
+  int input_a_zp;
+  float input_b_scale;
+  int input_b_zp;
+  float output_inv_scale;
+  int output_zp;
+};
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")}
+${layout_declare_spec_const(C, "int", "in_layout", "CONTIG_LAYOUT_INT")}
+${layout_declare_spec_const(C, "int", "block_config", "0")}
+
+// Generate loading functions for input buffers
+define_load_int8x4_buffer_fns(t_in_a)
+define_load_int8x4_buffer_fns(t_in_b)
+
+// Generate storing functions for output buffer
+define_store_int8x4_buffer_fns(t_out)
+
+void main() { // Applies op() to one int8x4 block of the output per invocation
+  // Linear dispatch: the x global invocation ID is the contiguous block index
+  const uint contig_block_idx = gl_GlobalInvocationID.x;
+  TensorIndex4D tidx = contiguous_block_idx_to_tensor4d_idx_with_block_config(
+      out_meta, contig_block_idx, block_config);
+
+  if (out_of_bounds(tidx, out_meta)) { // indices outside the output are skipped
+    return;
+  }
+
+  const int block_outer_dim = get_block_outer_dim(block_config);
+
+  // Load one block from each input; both loads share the in_layout constant
+  ivec4 in_block_a = load_int8x4_block_from_t_in_a(
+      in_a_meta, tidx, in_layout, block_outer_dim);
+  ivec4 in_block_b = load_int8x4_block_from_t_in_b(
+      in_b_meta, tidx, in_layout, block_outer_dim);
+
+  ivec4 out_block; // every component is assigned by the loop below
+
+  for (int row = 0; row < 4; row++) { // one packed int of the block per row
+    vec4 in_texel_a = unpack_and_dequantize(
+        in_block_a[row], input_a_scale, input_a_zp);
+    vec4 in_texel_b = unpack_and_dequantize(
+        in_block_b[row], input_b_scale, input_b_zp);
+
+    vec4 out_texel = op(in_texel_a, in_texel_b);
+    out_block[row] = quantize_and_pack(out_texel, output_inv_scale, output_zp);
+  }
+
+  // Store the requantized block using the output's own out_layout constant
+  store_int8x4_block_to_t_out(
+      out_meta, tidx, out_layout, block_outer_dim, out_block);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml
similarity index 61%
rename from backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml
rename to backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml
index e19ed8839eb..2060f7e42ba 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml
@@ -4,16 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-binary_q8ta_q8ta_q8to:
+q8ta_binary:
   parameter_names_with_default_values:
     OPERATOR: X + Y
-    NDIM: 3
-    DTYPE: float
-    PACKING: C_packed
-    IO_STORAGE: buffer
-  generate_variant_forall:
-    IO_STORAGE:
-      - VALUE: buffer
   shader_variants:
-    - NAME: add_q8ta_q8ta_q8to
+    - NAME: q8ta_add_buffer
       OPERATOR: X + Y
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
similarity index 53%
rename from backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp
rename to backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
index 99b5880c2eb..af934b9b521 100644
--- a/backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
@@ -9,38 +9,15 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/QuantizeDequantize.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
 
-//
-// Shader dispatch utilities
-//
-
-utils::uvec3 pick_q8ta_q8ta_q8to_binary_global_wg_size(
-    ComputeGraph* graph,
-    const vkapi::ShaderInfo& shader,
-    const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& resize_args) {
-  const ValueRef packed_int8_output = args.at(0).refs.at(0);
-
-  const uint32_t W = graph->size_at<uint32_t>(-1, packed_int8_output);
-  const uint32_t H = graph->size_at<uint32_t>(-2, packed_int8_output);
-  const uint32_t C = graph->size_at<uint32_t>(-3, packed_int8_output);
-
-  const uint32_t W4 = utils::div_up_4(W);
-  const uint32_t C4 = utils::div_up_4(C);
-
-  return {W4 * H * C4, 1, 1};
-}
-
 //
 // Dispatch nodes
 //
 
-void add_q8ta_q8ta_q8to_binary_node(
+void add_q8ta_binary_node(
     ComputeGraph& graph,
     const ValueRef packed_int8_input_a,
     const ValueRef packed_int8_input_b,
@@ -53,6 +30,23 @@ void add_q8ta_q8ta_q8to_binary_node(
     const ValueRef alpha,
     const ValueRef packed_int8_output,
     const std::string& op_name) {
+  // All tensors must share a packed dim and packed-dim block size: the shader
+  // uses one block config (from the output) and one `in_layout` (from input A)
+  // for both inputs. NOTE(review): confirm B's full layout matches A's.
+  const api::PackedDimInfo& output_info =
+      graph.packed_dim_info_of(packed_int8_output);
+  const api::PackedDimInfo& input_a_info =
+      graph.packed_dim_info_of(packed_int8_input_a);
+  const api::PackedDimInfo& input_b_info =
+      graph.packed_dim_info_of(packed_int8_input_b);
+
+  VK_CHECK_COND(input_a_info.packed_dim == output_info.packed_dim);
+  VK_CHECK_COND(input_b_info.packed_dim == output_info.packed_dim);
+  VK_CHECK_COND(
+      input_a_info.packed_dim_block_size == output_info.packed_dim_block_size);
+  VK_CHECK_COND(
+      input_b_info.packed_dim_block_size == output_info.packed_dim_block_size);
+
   float input_a_scale_val = graph.extract_scalar<float>(input_a_scale);
   int32_t input_a_zp_val = graph.extract_scalar<int32_t>(input_a_zp);
   float input_b_scale_val = graph.extract_scalar<float>(input_b_scale);
@@ -68,11 +62,15 @@ void add_q8ta_q8ta_q8to_binary_node(
     alpha_val = graph.extract_scalar<float>(alpha);
   }
 
-  std::string kernel_name = op_name + "_q8ta_q8ta_q8to";
+  std::string kernel_name = "q8ta_" + op_name;
   add_storage_type_suffix(
       kernel_name, graph.storage_type_of(packed_int8_output));
 
-  vkapi::ParamsBindList param_buffers = {graph.sizes_ubo(packed_int8_output)};
+  // Pass metadata for output and input tensors
+  vkapi::ParamsBindList param_buffers;
+  param_buffers.append(graph.buffer_meta_ubo(packed_int8_output));
+  param_buffers.append(graph.buffer_meta_ubo(packed_int8_input_a));
+  param_buffers.append(graph.buffer_meta_ubo(packed_int8_input_b));
 
   std::vector<PushConstantDataInfo> push_constants = {
       PushConstantDataInfo(&input_a_scale_val, sizeof(input_a_scale_val)),
@@ -84,11 +82,19 @@ void add_q8ta_q8ta_q8to_binary_node(
       PushConstantDataInfo(&alpha_val, sizeof(alpha_val)),
   };
 
+  // Create block config for output tensor: inner_dim = output's packed_dim
+  const BlockConfig block_config =
+      create_block_config_for_tensor(graph, packed_int8_output);
+
+  // Cast block config to ValueRef for pick_linear_global_wg_with_block_config
+  const ValueRef block_config_ref =
+      static_cast<ValueRef>(block_config.as_packed_int());
+
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      pick_q8ta_q8ta_q8to_binary_global_wg_size,
-      default_pick_local_wg_size,
+      pick_linear_global_wg_with_block_config,
+      pick_square_local_wg_with_block_config,
       // Inputs and Outputs
       {{packed_int8_output, vkapi::kWrite},
        {{packed_int8_input_a, packed_int8_input_b}, vkapi::kRead}},
@@ -97,9 +103,11 @@ void add_q8ta_q8ta_q8to_binary_node(
       // Push Constants
       push_constants,
       // Specialization Constants
-      {},
+      {graph.hashed_layout_of(packed_int8_output),
+       graph.hashed_layout_of(packed_int8_input_a),
+       block_config.as_packed_int()},
       // Resize args
-      {},
+      {block_config_ref},
       // Resizing Logic
       nullptr));
 }
@@ -108,9 +116,7 @@ void add_q8ta_q8ta_q8to_binary_node(
 // High level operator impl
 //
 
-void add_q8ta_q8ta_q8to(
-    ComputeGraph& graph,
-    const std::vector<ValueRef>& args) {
+void q8ta_add(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   int32_t idx = 0;
   const ValueRef packed_int8_input_a = args.at(idx++);
   const ValueRef packed_int8_input_b = args.at(idx++);
@@ -123,7 +129,7 @@ void add_q8ta_q8ta_q8to(
   const ValueRef alpha = args.at(idx++);
   const ValueRef packed_int8_output = args.at(idx++);
 
-  add_q8ta_q8ta_q8to_binary_node(
+  add_q8ta_binary_node(
       graph,
       packed_int8_input_a,
       packed_int8_input_b,
@@ -138,73 +144,8 @@ void add_q8ta_q8ta_q8to(
       "add");
 }
 
-//
-// Test operators
-//
-
-void add_q8ta_q8ta_q8to_test(
-    ComputeGraph& graph,
-    const std::vector<ValueRef>& args) {
-  int32_t idx = 0;
-  const ValueRef fp_input_a = args.at(idx++);
-  const ValueRef fp_input_b = args.at(idx++);
-  const ValueRef input_a_scale = args.at(idx++);
-  const ValueRef input_a_zp = args.at(idx++);
-  const ValueRef input_b_scale = args.at(idx++);
-  const ValueRef input_b_zp = args.at(idx++);
-  const ValueRef output_scale = args.at(idx++);
-  const ValueRef output_zp = args.at(idx++);
-  const ValueRef alpha = args.at(idx++);
-  const ValueRef fp_output = args.at(idx++);
-
-  TmpTensor packed_int8_input_a(
-      &graph,
-      graph.sizes_of(fp_input_a),
-      vkapi::kInt8x4,
-      utils::kBuffer,
-      utils::kPackedInt8_4W4C);
-
-  TmpTensor packed_int8_input_b(
-      &graph,
-      graph.sizes_of(fp_input_b),
-      vkapi::kInt8x4,
-      utils::kBuffer,
-      utils::kPackedInt8_4W4C);
-
-  TmpTensor packed_int8_output(
-      &graph,
-      graph.sizes_of(fp_output),
-      vkapi::kInt8x4,
-      utils::kBuffer,
-      utils::kPackedInt8_4W4C);
-
-  add_quantize_and_pack_4w4c_node(
-      graph, fp_input_a, input_a_scale, input_a_zp, packed_int8_input_a);
-
-  add_quantize_and_pack_4w4c_node(
-      graph, fp_input_b, input_b_scale, input_b_zp, packed_int8_input_b);
-
-  std::vector<ValueRef> add_args = {
-      packed_int8_input_a,
-      packed_int8_input_b,
-      input_a_scale,
-      input_a_zp,
-      input_b_scale,
-      input_b_zp,
-      output_scale,
-      output_zp,
-      alpha,
-      packed_int8_output};
-
-  add_q8ta_q8ta_q8to(graph, add_args);
-
-  add_unpack_4w4c_and_dequantize_node(
-      graph, packed_int8_output, output_scale, output_zp, fp_output);
-}
-
 REGISTER_OPERATORS {
-  VK_REGISTER_OP(et_vk.add_q8ta_q8ta_q8to.default, add_q8ta_q8ta_q8to);
-  VK_REGISTER_OP(et_vk.add_q8ta_q8ta_q8to.test, add_q8ta_q8ta_q8to_test);
+  VK_REGISTER_OP(et_vk.q8ta_add.default, q8ta_add);
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h
new file mode 100644
index 00000000000..512849762cb
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+
+namespace vkcompute {
+
+//
+// Binary operations for int8x4 tensors
+//
+
+void add_q8ta_binary_node(
+    ComputeGraph& graph,
+    const ValueRef packed_int8_input_a,
+    const ValueRef packed_int8_input_b,
+    const ValueRef input_a_scale,
+    const ValueRef input_a_zp,
+    const ValueRef input_b_scale,
+    const ValueRef input_b_zp,
+    const ValueRef output_scale,
+    const ValueRef output_zp,
+    const ValueRef alpha,
+    const ValueRef packed_int8_output,
+    const std::string& op_name);
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt
index 0121c84bb5b..d17ab94d194 100644
--- a/backends/vulkan/test/custom_ops/CMakeLists.txt
+++ b/backends/vulkan/test/custom_ops/CMakeLists.txt
@@ -99,8 +99,8 @@ if(TARGET vulkan_backend)
   add_operator_prototype(choose_qparams_per_row)
   add_operator_prototype(test_q8ta_qdq)
   add_operator_prototype(test_q8ta_clone)
+  add_operator_prototype(test_q8ta_binary)
   add_operator_prototype(test_q8ta_conv2d)
   add_operator_prototype(test_q8ta_conv2d_pw)
   add_operator_prototype(test_q8ta_conv2d_dw)
-  add_operator_prototype(q8ta_q8ta_q8to_add)
 endif()
diff --git a/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp b/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
new file mode 100644
index 00000000000..53f8859b581
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.h>
+
+namespace vkcompute {
+
+void q8ta_add_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  int32_t idx = 0; // args are consumed positionally, in the order read below
+  const ValueRef fp_input_a = args.at(idx++);
+  const ValueRef fp_input_b = args.at(idx++);
+  const ValueRef input_a_scale = args.at(idx++);
+  const ValueRef input_a_zp = args.at(idx++);
+  const ValueRef input_b_scale = args.at(idx++);
+  const ValueRef input_b_zp = args.at(idx++);
+  const ValueRef output_scale = args.at(idx++);
+  const ValueRef output_zp = args.at(idx++);
+  const ValueRef alpha = args.at(idx++);
+  const ValueRef quant_layout_int = args.at(idx++);
+  const ValueRef fp_output = args.at(idx++);
+
+  // The quantized layout arrives as an int scalar; the enum cast is unchecked
+  int32_t layout_value = graph.extract_scalar<int32_t>(quant_layout_int);
+  utils::GPUMemoryLayout quant_layout =
+      static_cast<utils::GPUMemoryLayout>(layout_value);
+
+  // Temporary int8x4 buffer tensors, all created with the requested layout
+  TmpTensor packed_int8_input_a(
+      &graph,
+      graph.sizes_of(fp_input_a),
+      vkapi::kInt8x4,
+      utils::kBuffer,
+      quant_layout);
+
+  TmpTensor packed_int8_input_b(
+      &graph,
+      graph.sizes_of(fp_input_b),
+      vkapi::kInt8x4,
+      utils::kBuffer,
+      quant_layout);
+
+  TmpTensor packed_int8_output(
+      &graph,
+      graph.sizes_of(fp_output),
+      vkapi::kInt8x4,
+      utils::kBuffer,
+      quant_layout);
+
+  // Quantize each FP input into its int8x4 temporary
+  add_q8ta_quantize_node(
+      graph, fp_input_a, input_a_scale, input_a_zp, packed_int8_input_a);
+
+  add_q8ta_quantize_node(
+      graph, fp_input_b, input_b_scale, input_b_zp, packed_int8_input_b);
+
+  // Binary add: int8x4 -> int8x4 (inputs and output share quant_layout)
+  add_q8ta_binary_node(
+      graph,
+      packed_int8_input_a,
+      packed_int8_input_b,
+      input_a_scale,
+      input_a_zp,
+      input_b_scale,
+      input_b_zp,
+      output_scale,
+      output_zp,
+      alpha,
+      packed_int8_output,
+      "add");
+
+  // Dequantize the int8x4 result into the FP output tensor
+  add_q8ta_dequantize_node(
+      graph, packed_int8_output, output_scale, output_zp, fp_output);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(et_vk.q8ta_add.test, q8ta_add_test);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl
index 63423ed410f..73b1e343bbe 100644
--- a/backends/vulkan/test/custom_ops/targets.bzl
+++ b/backends/vulkan/test/custom_ops/targets.bzl
@@ -93,7 +93,7 @@ def define_common_targets(is_fbcode = False):
     define_custom_op_test_binary("q4gsw_linear")
     define_custom_op_test_binary("test_q8ta_qdq")
     define_custom_op_test_binary("test_q8ta_clone")
+    define_custom_op_test_binary("test_q8ta_binary")
     define_custom_op_test_binary("test_q8ta_conv2d")
     define_custom_op_test_binary("test_q8ta_conv2d_pw")
     define_custom_op_test_binary("test_q8ta_conv2d_dw")
-    define_custom_op_test_binary("q8ta_q8ta_q8to_add")
diff --git a/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp b/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
similarity index 52%
rename from backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp
rename to backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
index eb8e6908060..1100eb4d5f0 100644
--- a/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp
+++ b/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
@@ -6,49 +6,57 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+#include <algorithm>
+#include <cmath>
 #include <iostream>
 #include <vector>
 #include "utils.h"
 
 using namespace executorch::vulkan::prototyping;
 
-// Utility function to create a test case for quantized add operation
-TestCase create_quantized_add_test_case(
-    const std::vector<int64_t>& sizes,
+static constexpr int64_t kRefDimSizeLimit = 512;
+
+// Configuration for one q8ta binary test (see create_test_case_from_config)
+struct Q8taBinaryConfig {
+  std::vector<int64_t> shape; // Tensor shape (can be any dimensionality)
+  std::string test_case_name = "placeholder"; // prefix of the reported name
+  std::string op_name = "q8ta_add"; // expands to "et_vk.<op_name>.test"
+};
+
+// Utility function to create a test case from a Q8taBinaryConfig
+TestCase create_test_case_from_config(
+    const Q8taBinaryConfig& config,
     utils::StorageType storage_type,
-    vkapi::ScalarType input_dtype) {
+    vkapi::ScalarType input_dtype,
+    utils::GPUMemoryLayout fp_memory_layout,
+    utils::GPUMemoryLayout quant_layout) {
   TestCase test_case;
 
   // Create a descriptive name for the test case
-  std::string size_str = "";
-  for (size_t i = 0; i < sizes.size(); ++i) {
-    size_str += std::to_string(sizes[i]);
-    if (i < sizes.size() - 1)
-      size_str += "x";
-  }
-
-  std::string storage_str =
-      (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer";
-  std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half";
-
-  std::string test_name =
-      "QuantizedAdd_" + size_str + "_" + storage_str + "_" + dtype_str;
+  std::string shape_str = shape_string(config.shape);
+  std::string test_name = config.test_case_name + "  I=" + shape_str + "  " +
+      repr_str(utils::kBuffer, quant_layout);
   test_case.set_name(test_name);
 
   // Set the operator name for the test case
-  test_case.set_operator_name("et_vk.add_q8ta_q8ta_q8to.test");
-
-  utils::GPUMemoryLayout io_memory_layout = storage_type == utils::kBuffer
-      ? utils::kWidthPacked
-      : utils::kChannelsPacked;
+  std::string operator_name = "et_vk." + config.op_name + ".test";
+  test_case.set_operator_name(operator_name);
 
   // Input tensor A (float/half)
   ValueSpec input_a(
-      sizes, input_dtype, storage_type, io_memory_layout, DataGenType::RANDOM);
+      config.shape,
+      input_dtype,
+      storage_type,
+      fp_memory_layout,
+      DataGenType::RANDOM);
 
   // Input tensor B (float/half)
   ValueSpec input_b(
-      sizes, input_dtype, storage_type, io_memory_layout, DataGenType::RANDOM);
+      config.shape,
+      input_dtype,
+      storage_type,
+      fp_memory_layout,
+      DataGenType::RANDOM);
 
   // Quantization parameters for input A
   float input_a_scale_val = 0.007843; // 2/255 approximately
@@ -75,11 +83,19 @@ TestCase create_quantized_add_test_case(
   float alpha_val = 1.0f;
   ValueSpec alpha(alpha_val);
 
+  // Quantized layout as integer
+  int32_t quant_layout_int = static_cast<int32_t>(quant_layout);
+  ValueSpec quant_layout_spec(quant_layout_int);
+
   // Output tensor (float/half)
   ValueSpec output(
-      sizes, input_dtype, storage_type, io_memory_layout, DataGenType::ZEROS);
+      config.shape,
+      input_dtype,
+      storage_type,
+      fp_memory_layout,
+      DataGenType::ZEROS);
 
-  // Add all specs to test case for q8ta_q8ta_q8to add operation
+  // Add all specs to test case for q8ta add operation
   test_case.add_input_spec(input_a);
   test_case.add_input_spec(input_b);
   test_case.add_input_spec(input_a_scale);
@@ -89,50 +105,124 @@ TestCase create_quantized_add_test_case(
   test_case.add_input_spec(output_scale);
   test_case.add_input_spec(output_zero_point);
   test_case.add_input_spec(alpha);
+  test_case.add_input_spec(quant_layout_spec);
 
   test_case.add_output_spec(output);
 
   test_case.set_abs_tolerance(output_scale_val + 1e-4f);
 
+  // Use layout-only filter to focus on the binary operation
+  test_case.set_shader_filter({
+      "nchw_to",
+      "to_nchw",
+      "q8ta_quantize",
+      "q8ta_dequantize",
+  });
+
   return test_case;
 }
 
-// Generate test cases for quantized add operation
-std::vector<TestCase> generate_quantized_add_test_cases() {
+// Generate easy test cases for q8ta_add operation (for debugging)
+std::vector<TestCase> generate_q8ta_add_easy_cases() {
   std::vector<TestCase> test_cases;
 
-  // Define different input size configurations
-  std::vector<std::vector<int64_t>> size_configs = {
-      {3, 32, 32}, // Small square
-      {8, 64, 64}, // Medium square
-      {16, 16, 16}, // 3D cube
-      {8, 32, 16}, // 3D rectangular
-      {7, 7, 13}, // Irregular sizes
+  // Single simple configuration for debugging
+  Q8taBinaryConfig config = {
+      {1, 16, 16, 16}, // shape: [N, C, H, W]
+      "ACCU", // test_case_name
   };
 
-  // Storage types to test
-  std::vector<utils::StorageType> storage_types = {
-      utils::kTexture3D, utils::kBuffer};
+  // Quantized memory layouts to test
+  std::vector<utils::GPUMemoryLayout> quant_layouts = {
+      utils::kPackedInt8_4W,
+      utils::kPackedInt8_4C,
+      utils::kPackedInt8_4W4C,
+      utils::kPackedInt8_4H4W,
+      utils::kPackedInt8_4C1W,
+  };
+
+  for (const auto& quant_layout : quant_layouts) {
+    test_cases.push_back(create_test_case_from_config(
+        config,
+        /*storage_type=*/utils::kBuffer,
+        /*input_dtype=*/vkapi::kFloat,
+        /*fp_memory_layout=*/utils::kWidthPacked,
+        quant_layout));
+  }
+
+  return test_cases;
+}
+
+// Generate test cases for q8ta_add operation
+std::vector<TestCase> generate_q8ta_add_test_cases() {
+  std::vector<TestCase> test_cases;
+
+  // Shapes to test
+  std::vector<std::vector<int64_t>> shapes = {
+      // Small test cases for correctness
+      {1, 3, 16, 16},
+      {1, 8, 32, 32},
+      {1, 16, 24, 24},
+      {1, 32, 12, 12},
+      {1, 1, 64, 64},
+      {1, 3, 64, 64},
+      {1, 4, 16, 16},
+
+      // Different tensor sizes
+      {1, 8, 20, 20},
+      {1, 16, 14, 14},
+      {1, 8, 28, 28},
+
+      // Odd tensor sizes
+      {1, 3, 15, 15},
+      {1, 13, 31, 31},
+      {1, 17, 23, 23},
+
+      // Performance test cases (larger tensors)
+      {1, 64, 128, 128},
+      {1, 32, 64, 64},
+      {1, 128, 56, 56},
+      {1, 128, 128, 128},
+  };
 
-  // Data types to test
-  std::vector<vkapi::ScalarType> data_types = {vkapi::kFloat};
+  // Quantized memory layouts to test
+  std::vector<utils::GPUMemoryLayout> quant_layouts = {
+      utils::kPackedInt8_4W,
+      utils::kPackedInt8_4C,
+      utils::kPackedInt8_4W4C,
+      utils::kPackedInt8_4H4W,
+      utils::kPackedInt8_4C1W,
+  };
 
-  // Generate test cases for each combination
-  for (const auto& sizes : size_configs) {
-    for (const auto& storage_type : storage_types) {
-      for (const auto& data_type : data_types) {
-        test_cases.push_back(
-            create_quantized_add_test_case(sizes, storage_type, data_type));
+  // Generate all combinations
+  for (const auto& shape : shapes) {
+    // Generate test case name prefix from shape dimensions
+    std::string prefix = "ACCU";
+    for (const auto& dim : shape) {
+      if (dim > kRefDimSizeLimit) {
+        prefix = "PERF";
+        break;
       }
     }
+
+    Q8taBinaryConfig config;
+    config.shape = shape;
+    config.test_case_name = prefix;
+    for (const auto& quant_layout : quant_layouts) {
+      test_cases.push_back(create_test_case_from_config(
+          config,
+          /*storage_type=*/utils::kBuffer,
+          /*input_dtype=*/vkapi::kFloat,
+          /*fp_memory_layout=*/utils::kWidthPacked,
+          quant_layout));
+    }
   }
 
   return test_cases;
 }
 
 // Reference implementation for quantized add operation
-void add_q8ta_q8ta_q8to_reference_impl(TestCase& test_case) {
-  // Extract input specifications
+void q8ta_add_reference_impl(TestCase& test_case) {
   int32_t idx = 0;
   const ValueSpec& input_a_spec = test_case.inputs()[idx++];
   const ValueSpec& input_b_spec = test_case.inputs()[idx++];
@@ -143,13 +233,29 @@ void add_q8ta_q8ta_q8to_reference_impl(TestCase& test_case) {
   const ValueSpec& output_scale_spec = test_case.inputs()[idx++];
   const ValueSpec& output_zero_point_spec = test_case.inputs()[idx++];
   const ValueSpec& alpha_spec = test_case.inputs()[idx++];
+  const ValueSpec& quant_layout_spec = test_case.inputs()[idx++];
+  (void)quant_layout_spec; // Not used in reference implementation
 
-  // Extract output specification (mutable reference)
+  // Extract output specification
   ValueSpec& output_spec = test_case.outputs()[0];
 
   // Get tensor dimensions
   auto input_sizes = input_a_spec.get_tensor_sizes();
-  int64_t num_elements = input_a_spec.numel();
+
+  // Calculate total number of elements
+  int64_t num_elements = 1;
+  for (const auto& dim : input_sizes) {
+    num_elements *= dim;
+  }
+
+  // Throw for large tensors so the slow reference computation is not run
+  for (const auto& dim : input_sizes) {
+    if (dim > kRefDimSizeLimit) {
+      throw std::invalid_argument(
+          "One or more dimensions exceed the allowed limit for reference "
+          "implementation.");
+    }
+  }
 
   if (input_a_spec.dtype != vkapi::kFloat) {
     throw std::invalid_argument("Unsupported dtype");
@@ -208,50 +314,36 @@ void add_q8ta_q8ta_q8to_reference_impl(TestCase& test_case) {
   }
 }
 
-void reference_impl(TestCase& test_case) {
-  add_q8ta_q8ta_q8to_reference_impl(test_case);
-}
-
-// Custom FLOP calculator for quantized add operation
-int64_t quantized_add_flop_calculator(const TestCase& test_case) {
-  // Calculate total elements from the first input tensor
-  int64_t total_elements = 1;
-  if (!test_case.empty() && test_case.num_inputs() > 0 &&
-      test_case.inputs()[0].is_tensor()) {
-    const auto& sizes = test_case.inputs()[0].get_tensor_sizes();
-    for (int64_t size : sizes) {
-      total_elements *= size;
-    }
-  }
-
-  // Quantized add operation includes:
-  // - 2 quantizations (float to int8)
-  // - 2 dequantizations (int8 to float)
-  // - 1 addition
-  // For simplicity, we count this as 1 FLOP per element (the addition)
-  return total_elements;
-}
-
 int main(int argc, char* argv[]) {
   set_debugging(false);
   set_print_output(false);
+#ifdef DEBUG_MODE
+  set_print_latencies(false);
+#else
   set_print_latencies(false);
+#endif
   set_use_gpu_timestamps(true);
 
   print_performance_header();
-  std::cout << "Quantized Add Operation (q8ta_q8ta_q8to) Prototyping Framework"
-            << std::endl;
+  std::cout << "Q8TA Binary Add Operation Prototyping Framework" << std::endl;
   print_separator();
 
-  ReferenceComputeFunc ref_fn = reference_impl;
+  ReferenceComputeFunc ref_fn = q8ta_add_reference_impl;
 
-  // Execute test cases using the new framework with custom FLOP calculator
   auto results = execute_test_cases(
-      generate_quantized_add_test_cases,
-      quantized_add_flop_calculator,
-      "QuantizedAddQ8taQ8taQ8to",
+#ifdef DEBUG_MODE
+      generate_q8ta_add_easy_cases,
+#else
+      generate_q8ta_add_test_cases,
+#endif
+      "Q8taBinaryAdd",
+#ifdef DEBUG_MODE
       0,
       1,
+#else
+      3,
+      10,
+#endif
       ref_fn);
 
   return 0;

From 3b459797d8a3634fedfe0b1ade18a48b7fd71cfb Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm1479.ncg0.facebook.com>
Date: Thu, 12 Feb 2026 23:49:03 -0800
Subject: [PATCH 2/4] [ET-VK] Add nchw_to_int8x4_buffer shader for prepacking
 int8 staging data

Pull Request resolved: https://github.com/pytorch/executorch/pull/17392

This adds a GLSL compute shader and supporting C++ dispatch logic to transfer
int8 tensor data from a staging buffer (in NCHW contiguous order) to a GPU
buffer in any PackedInt8 layout (4W, 4C, 4W4C, 4H4W, 4C1W).

Previously there was no prepack path for kInt8x4 tensors, so constant int8
tensors (TensorRef inputs) could not be transferred to GPU buffers. This is
needed to support constant quantized weights in q8ta operators.

The shader uses texel-level dispatch where each thread writes one texel (one
int32 = 4 packed int8 values). It decomposes the texel index into block-space
coordinates using BufferMetadata strides and hashed_layout dim_order, then reads
the corresponding int8 bytes from the staging buffer (interpreted as int32 for
device compatibility, avoiding the need for 8-bit buffer support).

New files:
- nchw_to_int8x4_buffer.glsl: Compute shader handling all PackedInt8 layouts
- nchw_to_int8x4_buffer.yaml: Shader variant config
- Q8taStaging.h/cpp: C++ dispatch function creating the PrepackNode

Modified files:
- Staging.cpp: Routes kInt8x4 tensors in prepack_op() to the new function
- TestQ8taBinary.cpp: Prepacks TensorRef inputs before quantization
- test_q8ta_binary.cpp: Adds const_b test cases for constant tensor B inputs

This diff was authored with Claude.
ghstack-source-id: 341022576
@exported-using-ghexport

Differential Revision: [D93000169](https://our.internmc.facebook.com/intern/diff/D93000169/)
---
 backends/vulkan/op_registry.py                |  1 -
 .../runtime/api/containers/StagingBuffer.cpp  |  6 +-
 .../runtime/graph/ops/glsl/indexing.glslh     | 61 ++++++++++-----
 .../graph/ops/glsl/nchw_to_int8x4_buffer.glsl | 78 +++++++++++++++++++
 .../graph/ops/glsl/nchw_to_int8x4_buffer.yaml | 11 +++
 .../runtime/graph/ops/impl/Q8taStaging.cpp    | 49 ++++++++++++
 .../runtime/graph/ops/impl/Q8taStaging.h      | 20 +++++
 .../vulkan/runtime/graph/ops/impl/Staging.cpp |  4 +
 .../graph/ops/utils/ShaderNameUtils.cpp       |  3 +
 .../test/custom_ops/impl/TestQ8taBinary.cpp   | 30 ++++---
 .../test/custom_ops/test_q8ta_binary.cpp      | 48 +++++++++---
 11 files changed, 268 insertions(+), 43 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.yaml
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index b58d6407308..55a92335bc7 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -510,7 +510,6 @@ def register_q8ta_add():
     return OpFeatures(
         inputs_storage=utils.PACKED_INT8_BUFFER,
         supports_resize=False,
-        supports_prepacking=True,
     )
 
 
diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.cpp b/backends/vulkan/runtime/api/containers/StagingBuffer.cpp
index 499f0b43d05..53ec9c17eae 100644
--- a/backends/vulkan/runtime/api/containers/StagingBuffer.cpp
+++ b/backends/vulkan/runtime/api/containers/StagingBuffer.cpp
@@ -136,8 +136,12 @@ StagingBuffer::StagingBuffer(
     const vkapi::CopyDirection direction)
     : context_p_(context_p),
       dtype_(get_staging_dtype(context_p, dtype)),
+      // For 8-bit types, align numel to the next multiple of 4. Devices that
+      // lack 8-bit storage buffer support will interpret the data as int32, so
+      // the buffer size must be a multiple of 4 bytes.
       vulkan_buffer_(context_p_->adapter_ptr()->vma().create_staging_buffer(
-          element_size(dtype_) * numel,
+          element_size(dtype_) *
+              (element_size(dtype_) == 1 ? utils::align_up_4(numel) : numel),
           direction)),
       mapped_data_(nullptr) {}
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh
index 37c47795214..51cda9a3d1d 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh
+++ b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh
@@ -334,19 +334,30 @@ TensorIndex linear_idx_to_tensor_idx(
 /*
  * Convert a linear texel index to a TensorIndex4D.
  *
- * This function is used for texel-based dispatch where each thread handles
- * one packed texel (4 elements along the packed dimension). The texel index
- * is decomposed using the dim_order and strides from the tensor's layout.
+ * This is the inverse of tensor4d_idx_to_texel_idx. It handles both
+ * single-packed layouts (outer_block_size == 1) and block-packed layouts
+ * (e.g., 4W4C where outer_block_size > 1).
  *
- * The strides in BufferMetadata should already be in texel space (with packed
- * dimension size divided by 4).
+ * The approach mirrors tensor4d_idx_to_texel_idx by decomposing the problem
+ * into a block level and an intra-block level, in four steps:
+ *   1. Decompose texel_idx into block_idx and intra-block texel offset
+ *   2. Decompose block_idx into block-space tensor coordinates using strides
+ *   3. Convert block-space coordinates to element-space by multiplying by
+ *      block sizes
+ *   4. Add the intra-block outer-dimension offset
+ *
+ * For single-packed layouts (outer_block_size == 1, inner_dim == outer_dim),
+ * texels_per_block == 1, so block_idx == texel_idx and intra_block_texel == 0.
+ * The only effective multiplication is tidx[inner_dim] *= inner_block_size
+ * (i.e., *= 4), matching the previous single-packed behavior.
  *
  * Parameters:
- *   meta: BufferMetadata with tensor sizes and texel-space strides
+ *   meta: BufferMetadata with block-space strides
  *   texel_idx: Linear index into packed texels (0 to num_texels-1)
  *   hashed_layout: Packed layout info containing dim_order and packed_dim
  *
- * Returns: TensorIndex4D with logical tensor coordinates (packed dim is base of 4-element block)
+ * Returns: TensorIndex4D with logical tensor coordinates (packed dims are
+ *          base of their respective blocks)
  */
 TensorIndex4D texel_idx_to_tensor4d_idx(
     const BufferMetadata meta,
@@ -354,25 +365,35 @@ TensorIndex4D texel_idx_to_tensor4d_idx(
     const int hashed_layout) {
   TensorIndex4D tidx;
 
-  const int packed_dim = get_packed_dim(hashed_layout);
+  const int inner_dim = get_packed_dim(hashed_layout);
+  const int outer_dim = get_outer_packed_dim(hashed_layout);
+  const int inner_block_size = get_packed_dim_block_size(hashed_layout);
+  const int outer_block_size = get_outer_packed_dim_block_size(hashed_layout);
 
-  // Decompose texel_idx using dim_order from hashed_layout and strides from meta
-  // Iterate from slowest-varying dimension (d=3) to fastest (d=0)
-  // This follows the pattern of linear_idx_to_tensor_idx in indexing.glslh
+  // Number of texels per block: each block has inner_block_size *
+  // outer_block_size elements, and each texel holds 4 elements
+  const int texels_per_block = (inner_block_size * outer_block_size) / 4;
+
+  // Decompose texel_idx into block_idx and intra-block texel offset
+  const uint block_idx = texel_idx / texels_per_block;
+  const int intra_block_texel = int(texel_idx % texels_per_block);
+
+  // Decompose block_idx into block-space tensor coordinates using dim_order
+  // and strides. Iterate from slowest-varying (d=3) to fastest (d=0).
+  uint remaining = block_idx;
   [[unroll]] for (int d = 3; d >= 0; d--) {
-    // Get dim index from hashed_layout's dim_order (bits 0-15)
     int dim_idx = extract_4b(hashed_layout, d);
-
-    // Get stride for this dimension from BufferMetadata
     uint dim_stride = meta.strides[0][dim_idx];
-
-    // Compute coordinate for this dimension
-    tidx.data[dim_idx] = int(texel_idx / dim_stride);
-    texel_idx = texel_idx % dim_stride;
+    tidx.data[dim_idx] = int(remaining / dim_stride);
+    remaining = remaining % dim_stride;
   }
 
-  // Convert packed dimension from texel index to element index
-  tidx.data[packed_dim] *= 4;
+  // Convert block-space coordinates to element-space
+  tidx.data[inner_dim] *= inner_block_size;
+  tidx.data[outer_dim] *= outer_block_size;
+
+  // Add intra-block outer-dimension offset
+  tidx.data[outer_dim] += intra_block_texel;
 
   return tidx;
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl
new file mode 100644
index 00000000000..d8f7bdabe53
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+${define_active_storage_type("buffer")}
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+
+// Output buffer: packed int8x4 values (each int32 contains 4 packed int8)
+${layout_declare_tensor(B, "w", "t_outp", "int", "buffer")}
+// Input staging buffer: raw int8 data interpreted as int32 for device compat
+${layout_declare_tensor(B, "r", "nchw_in", "int", "buffer")}
+
+// Metadata for output tensor
+${layout_declare_ubo(B, "BufferMetadata", "outp")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}
+
+void main() {
+  const uint texel_idx = gl_GlobalInvocationID.x;
+  const uint num_texels = numel(outp) / 4;
+  if (texel_idx >= num_texels) {
+    return;
+  }
+
+  const int inner_dim = get_packed_dim(outp_layout);
+  const int outer_dim = get_outer_packed_dim(outp_layout);
+
+  const TensorIndex4D tidx =
+      texel_idx_to_tensor4d_idx(outp, texel_idx, outp_layout);
+
+  // Bounds check on outer dimension
+  if (tidx.data[outer_dim] >= int(outp.sizes[0][outer_dim])) {
+    return;
+  }
+
+  // Tensor sizes in WHCN order for NCHW contiguous index computation
+  const uint W = outp.sizes[0][0];
+  const uint H = outp.sizes[0][1];
+  const uint C = outp.sizes[0][2];
+
+  // Pack 4 int8 values along inner dimension into one int32
+  int packed = 0;
+  [[unroll]] for (int i = 0; i < 4; ++i) {
+    const int elem_inner = tidx.data[inner_dim] + i;
+    if (elem_inner < int(outp.sizes[0][inner_dim])) {
+      // Build element coordinates
+      ivec4 elem = tidx.data;
+      elem[inner_dim] = elem_inner;
+
+      // Compute NCHW contiguous index: w + h*W + c*H*W + n*C*H*W
+      const uint nchw_idx = uint(elem[0]) + uint(elem[1]) * W +
+                            uint(elem[2]) * H * W + uint(elem[3]) * C * H * W;
+
+      // Read int8 from staging buffer (each int32 contains 4 bytes)
+      const uint int_idx = nchw_idx >> 2;
+      const uint byte_pos = nchw_idx & 3;
+      const int staging_val = nchw_in[int_idx];
+      const int byte_val = (staging_val >> (byte_pos * 8)) & 0xFF;
+
+      packed |= (byte_val << (i * 8));
+    }
+  }
+
+  t_outp[texel_idx] = packed;
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.yaml
new file mode 100644
index 00000000000..514ada71f63
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.yaml
@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+nchw_to_int8x4_buffer:
+  parameter_names_with_default_values:
+    DTYPE: int
+  shader_variants:
+    - NAME: nchw_to_int8x4_buffer
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp
new file mode 100644
index 00000000000..8dc3f8156f8
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+void add_staging_to_int8x4_buffer_node(
+    ComputeGraph& graph,
+    const ValueRef tensor_data,
+    const ValueRef tensor) {
+  VK_CHECK_COND(graph.dtype_of(tensor) == vkapi::kInt8x4);
+
+  std::string kernel_name = "nchw_to_int8x4_buffer";
+
+  vkapi::ParamsBindList param_buffers;
+  param_buffers.append(graph.buffer_meta_ubo(tensor));
+
+  // One thread per texel (each texel = one int32 = 4 packed int8).
+  // Use padded_numel to account for dimension padding in packed int8 layouts
+  // (e.g., kPackedInt8_4C with C=3 pads to C=4).
+  uint32_t num_texels =
+      utils::safe_downcast<uint32_t>(graph.padded_numel_of(tensor) / 4);
+  utils::uvec3 global_wg_size = {num_texels, 1, 1};
+  utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size);
+
+  graph.prepack_nodes().emplace_back(new PrepackNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      global_wg_size,
+      local_wg_size,
+      // Input and Output
+      tensor_data,
+      tensor,
+      // Parameter Buffers
+      param_buffers,
+      // Specialization Constants
+      {graph.hashed_layout_of(tensor)}));
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h b/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h
new file mode 100644
index 00000000000..40386551e36
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+
+namespace vkcompute {
+
+void add_staging_to_int8x4_buffer_node(
+    ComputeGraph& graph,
+    const ValueRef tensor_data,
+    const ValueRef tensor);
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
index 9dc4d0a58f8..adcad9f9817 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
@@ -12,6 +12,7 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>
 
@@ -327,6 +328,9 @@ ValueRef prepack_int4_linear_weight_transposed_interleaved(
 }
 
 void prepack_op(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  if (graph.dtype_of(args[1]) == vkapi::kInt8x4) {
+    return add_staging_to_int8x4_buffer_node(graph, args[0], args[1]);
+  }
   return add_prepack_standard_node(graph, args[0], args[1]);
 }
 
diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp
index 231e6d0c7f6..59a9d79a6e3 100644
--- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp
+++ b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp
@@ -64,6 +64,9 @@ void add_dtype_suffix(std::string& kernel_name, const vkapi::ScalarType dtype) {
     case vkapi::kUInt64:
       kernel_name += "_uint64";
       break;
+    case vkapi::kInt8x4:
+      kernel_name += "_int32";
+      break;
     default:
       break;
   }
diff --git a/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp b/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
index 53f8859b581..f5214221359 100644
--- a/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
+++ b/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
@@ -10,13 +10,14 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h>
 
 namespace vkcompute {
 
 void q8ta_add_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   int32_t idx = 0;
-  const ValueRef fp_input_a = args.at(idx++);
-  const ValueRef fp_input_b = args.at(idx++);
+  ValueRef fp_input_a = args.at(idx++);
+  ValueRef input_b = args.at(idx++);
   const ValueRef input_a_scale = args.at(idx++);
   const ValueRef input_a_zp = args.at(idx++);
   const ValueRef input_b_scale = args.at(idx++);
@@ -32,6 +33,10 @@ void q8ta_add_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   utils::GPUMemoryLayout quant_layout =
       static_cast<utils::GPUMemoryLayout>(layout_value);
 
+  // Check if input_b is a pre-quantized int8 TensorRef
+  bool input_b_is_int8 =
+      graph.val_is_tref(input_b) && graph.dtype_of(input_b) == vkapi::kChar;
+
   // Create temporary tensors for quantized data with the specified layout
   TmpTensor packed_int8_input_a(
       &graph,
@@ -40,12 +45,8 @@ void q8ta_add_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
       utils::kBuffer,
       quant_layout);
 
-  TmpTensor packed_int8_input_b(
-      &graph,
-      graph.sizes_of(fp_input_b),
-      vkapi::kInt8x4,
-      utils::kBuffer,
-      quant_layout);
+  ValueRef packed_int8_input_b = graph.add_tensor(
+      graph.sizes_of(input_b), vkapi::kInt8x4, utils::kBuffer, quant_layout);
 
   TmpTensor packed_int8_output(
       &graph,
@@ -54,12 +55,19 @@ void q8ta_add_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
       utils::kBuffer,
       quant_layout);
 
-  // Quantize: FP -> int8x4 with specified layout
+  // Quantize input A: FP -> int8x4
   add_q8ta_quantize_node(
       graph, fp_input_a, input_a_scale, input_a_zp, packed_int8_input_a);
 
-  add_q8ta_quantize_node(
-      graph, fp_input_b, input_b_scale, input_b_zp, packed_int8_input_b);
+  if (input_b_is_int8) {
+    // Input B is a pre-quantized int8 TensorRef; prepack directly into packed
+    // int8x4 format
+    add_staging_to_int8x4_buffer_node(graph, input_b, packed_int8_input_b);
+  } else {
+    // Input B is a float tensor; quantize at runtime
+    add_q8ta_quantize_node(
+        graph, input_b, input_b_scale, input_b_zp, packed_int8_input_b);
+  }
 
   // Binary add: int8x4 -> int8x4 (same layout for all tensors)
   add_q8ta_binary_node(
diff --git a/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp b/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
index 1100eb4d5f0..86725ca8fb8 100644
--- a/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
+++ b/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
@@ -29,13 +29,17 @@ TestCase create_test_case_from_config(
     utils::StorageType storage_type,
     vkapi::ScalarType input_dtype,
     utils::GPUMemoryLayout fp_memory_layout,
-    utils::GPUMemoryLayout quant_layout) {
+    utils::GPUMemoryLayout quant_layout,
+    bool const_b = false) {
   TestCase test_case;
 
   // Create a descriptive name for the test case
   std::string shape_str = shape_string(config.shape);
   std::string test_name = config.test_case_name + "  I=" + shape_str + "  " +
       repr_str(utils::kBuffer, quant_layout);
+  if (const_b) {
+    test_name += "  const_b";
+  }
   test_case.set_name(test_name);
 
   // Set the operator name for the test case
@@ -50,13 +54,16 @@ TestCase create_test_case_from_config(
       fp_memory_layout,
       DataGenType::RANDOM);
 
-  // Input tensor B (float/half)
+  // Input tensor B (float/half, or pre-quantized int8 for const_b)
   ValueSpec input_b(
       config.shape,
-      input_dtype,
+      const_b ? vkapi::kChar : input_dtype,
       storage_type,
       fp_memory_layout,
-      DataGenType::RANDOM);
+      const_b ? DataGenType::RANDINT8 : DataGenType::RANDOM);
+  if (const_b) {
+    input_b.set_constant(true);
+  }
 
   // Quantization parameters for input A
   float input_a_scale_val = 0.007843; // 2/255 approximately
@@ -148,6 +155,13 @@ std::vector<TestCase> generate_q8ta_add_easy_cases() {
         /*input_dtype=*/vkapi::kFloat,
         /*fp_memory_layout=*/utils::kWidthPacked,
         quant_layout));
+    test_cases.push_back(create_test_case_from_config(
+        config,
+        /*storage_type=*/utils::kBuffer,
+        /*input_dtype=*/vkapi::kFloat,
+        /*fp_memory_layout=*/utils::kWidthPacked,
+        quant_layout,
+        /*const_b=*/true));
   }
 
   return test_cases;
@@ -215,6 +229,13 @@ std::vector<TestCase> generate_q8ta_add_test_cases() {
           /*input_dtype=*/vkapi::kFloat,
           /*fp_memory_layout=*/utils::kWidthPacked,
           quant_layout));
+      test_cases.push_back(create_test_case_from_config(
+          config,
+          /*storage_type=*/utils::kBuffer,
+          /*input_dtype=*/vkapi::kFloat,
+          /*fp_memory_layout=*/utils::kWidthPacked,
+          quant_layout,
+          /*const_b=*/true));
     }
   }
 
@@ -261,9 +282,10 @@ void q8ta_add_reference_impl(TestCase& test_case) {
     throw std::invalid_argument("Unsupported dtype");
   }
 
+  bool input_b_is_int8 = (input_b_spec.dtype == vkapi::kChar);
+
   // Get raw data pointers
   auto& input_a_data = input_a_spec.get_float_data();
-  auto& input_b_data = input_b_spec.get_float_data();
 
   const float input_a_scale = input_a_scale_spec.get_float_value();
   const int32_t input_a_zero_point = input_a_zero_point_spec.get_int_value();
@@ -284,11 +306,17 @@ void q8ta_add_reference_impl(TestCase& test_case) {
     quant_a_f = std::min(std::max(quant_a_f, -128.0f), 127.0f);
     int8_t quantized_a = static_cast<int8_t>(quant_a_f);
 
-    // Quantize input B to int8
-    float quant_b_f =
-        std::round(input_b_data[i] / input_b_scale) + input_b_zero_point;
-    quant_b_f = std::min(std::max(quant_b_f, -128.0f), 127.0f);
-    int8_t quantized_b = static_cast<int8_t>(quant_b_f);
+    // Get quantized input B (either from pre-quantized int8 or by quantizing)
+    int8_t quantized_b;
+    if (input_b_is_int8) {
+      quantized_b = input_b_spec.get_int8_data()[i];
+    } else {
+      float quant_b_f =
+          std::round(input_b_spec.get_float_data()[i] / input_b_scale) +
+          input_b_zero_point;
+      quant_b_f = std::min(std::max(quant_b_f, -128.0f), 127.0f);
+      quantized_b = static_cast<int8_t>(quant_b_f);
+    }
 
     // Dequantize both inputs to a common scale for addition
     float dequant_a =

From 10e8070e5bc555146566974af652072261233a53 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm1479.ncg0.facebook.com>
Date: Thu, 12 Feb 2026 23:49:05 -0800
Subject: [PATCH 3/4] [ET-VK][qconv2d][ez] Don't use im2col path for general
 convs

Pull Request resolved: https://github.com/pytorch/executorch/pull/17393

This removes the dynamic dispatch logic in q8ta_conv2d() that selected between the im2col and general convolution paths. The function now unconditionally uses q8ta_conv2d_general(). This simplifies the dispatch since the im2col path selection will be handled upstream by the pattern matcher routing to specialized ops (q8ta_conv2d_pw, q8ta_conv2d_dw, etc.) instead of being decided at runtime.
ghstack-source-id: 341022577
@exported-using-ghexport

Differential Revision: [D93000164](https://our.internmc.facebook.com/intern/diff/D93000164/)
---
 .../runtime/graph/ops/impl/Q8taConv2d.cpp     | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
index d3fe1afd906..4f047d414f8 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
@@ -401,24 +401,7 @@ void q8ta_conv2d_general(
 }
 
 void q8ta_conv2d(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  // Index into args to extract values needed for dispatch decision
-  const ValueRef packed_int8_input = args.at(0);
-  const ValueRef kernel_size = args.at(9);
-  const ValueRef groups = args.at(13);
-
-  const int32_t groups_val = graph.get_int(groups);
-  const int64_t IC = graph.size_at<int64_t>(-3, packed_int8_input);
-
-  const int64_t K_h = graph.get_int_list(kernel_size)->at(0);
-  const int64_t K_w = graph.get_int_list(kernel_size)->at(1);
-
-  // Use im2col path when: non-grouped, input channels multiple of 4, small
-  // kernel
-  if (groups_val == 1 && IC % 4 == 0 && K_h <= 3 && K_w <= 3) {
-    q8ta_conv2d_im2col(graph, args);
-  } else {
-    q8ta_conv2d_general(graph, args);
-  }
+  q8ta_conv2d_general(graph, args);
 }
 
 REGISTER_OPERATORS {

From 97926c306018b29598cc016b6c72674afc66758f Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm1479.ncg0.facebook.com>
Date: Thu, 12 Feb 2026 23:49:06 -0800
Subject: [PATCH 4/4] [ET-VK][profiling] Add additional profiling blocks

Pull Request resolved: https://github.com/pytorch/executorch/pull/17394

This adds fine-grained ET_EVENT_TRACER profiling blocks (guarded by the ET_EVENT_TRACER_ENABLED macro) to the Vulkan backend's execute() method in VulkanBackend.cpp. Previously, only GPU shader timestamps were logged. Now the following phases are individually traced: ETVK_COPY_INPUTS (host-to-GPU input transfer), ETVK_RESIZE (graph resize propagation; recorded only when a resize is actually propagated), ETVK_COMPUTE_GRAPH_EXECUTE (GPU compute dispatch), ETVK_COPY_OUTPUTS (GPU-to-host output transfer), and ETVK_EXECUTE (overall delegate execution). The GPU shader timestamp extraction is also moved to occur right after compute_graph->execute() completes rather than at the end of the function, so it falls within the ETVK_EXECUTE span but outside the ETVK_COMPUTE_GRAPH_EXECUTE and ETVK_COPY_OUTPUTS sub-spans.
ghstack-source-id: 341022578
@exported-using-ghexport

Differential Revision: [D93000163](https://our.internmc.facebook.com/intern/diff/D93000163/)
---
 backends/vulkan/runtime/VulkanBackend.cpp | 82 +++++++++++++++++++----
 1 file changed, 69 insertions(+), 13 deletions(-)

diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp
index 261585c381b..fbca5af5100 100644
--- a/backends/vulkan/runtime/VulkanBackend.cpp
+++ b/backends/vulkan/runtime/VulkanBackend.cpp
@@ -641,6 +641,21 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
 
     const size_t num_inputs = compute_graph->inputs().size();
     bool should_propagate_resize = false;
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracer* event_tracer = context.event_tracer();
+    runtime::EventTracerEntry overall_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_EXECUTE",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracerEntry copy_inputs_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_COPY_INPUTS",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
     for (size_t i = 0; i < num_inputs; i++) {
       const ValueRef iref = compute_graph->inputs()[i].value;
       if (compute_graph->val_is_tensor(iref)) {
@@ -669,13 +684,61 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
             compute_graph->get_val_type(iref));
       }
     }
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(
+        event_tracer, copy_inputs_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
 
     if (should_propagate_resize || compute_graph->has_data_dependent_shapes()) {
+#ifdef ET_EVENT_TRACER_ENABLED
+      runtime::EventTracerEntry resize_event_tracer_entry =
+          event_tracer_start_profiling_delegate(
+              event_tracer,
+              "ETVK_RESIZE",
+              /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
       compute_graph->propagate_resize();
+#ifdef ET_EVENT_TRACER_ENABLED
+      event_tracer_end_profiling_delegate(
+          event_tracer, resize_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
     }
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracerEntry execute_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_COMPUTE_GRAPH_EXECUTE",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
     compute_graph->execute();
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(
+        event_tracer, execute_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    compute_graph->context()->querypool().extract_results();
+    for (const auto& r :
+         compute_graph->context()->querypool().get_shader_timestamp_data()) {
+      std::string event_name = "{" + r.kernel_name +
+          ", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}";
+      event_tracer_log_profiling_delegate(
+          event_tracer,
+          event_name.c_str(),
+          /* delegate_debug_id = */ -1,
+          r.start_time_ns,
+          r.end_time_ns);
+    }
+#endif // ET_EVENT_TRACER_ENABLED
+
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracerEntry copy_outputs_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_COPY_OUTPUTS",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
     for (size_t i = 0; i < compute_graph->outputs().size(); i++) {
       const size_t o = i + num_inputs;
       const ValueRef oref = compute_graph->outputs()[i].value;
@@ -701,21 +764,14 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
             compute_graph->get_val_type(oref));
       }
     }
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(
+        event_tracer, copy_outputs_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
 
 #ifdef ET_EVENT_TRACER_ENABLED
-    runtime::EventTracer* event_tracer = context.event_tracer();
-    compute_graph->context()->querypool().extract_results();
-    for (const auto& r :
-         compute_graph->context()->querypool().get_shader_timestamp_data()) {
-      std::string event_name = "{" + r.kernel_name +
-          ", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}";
-      event_tracer_log_profiling_delegate(
-          event_tracer,
-          event_name.c_str(),
-          /* delegate_debug_id = */ -1,
-          r.start_time_ns,
-          r.end_time_ns);
-    }
+    event_tracer_end_profiling_delegate(
+        event_tracer, overall_event_tracer_entry);
 #endif // ET_EVENT_TRACER_ENABLED
 
     return Error::Ok;