From ba9b8da912cdf1e0c5c57ec4120fd372c022c0e7 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm1479.ncg0.facebook.com>
Date: Thu, 12 Feb 2026 23:49:03 -0800
Subject: [PATCH 1/3] [ET-VK] Add nchw_to_int8x4_buffer shader for prepacking
 int8 staging data

Pull Request resolved: https://github.com/pytorch/executorch/pull/17392

This adds a GLSL compute shader and supporting C++ dispatch logic to transfer
int8 tensor data from a staging buffer (in NCHW contiguous order) to a GPU
buffer in any PackedInt8 layout (4W, 4C, 4W4C, 4H4W, 4C1W).

Previously there was no prepack path for kInt8x4 tensors, so constant int8
tensors (TensorRef inputs) could not be transferred to GPU buffers. This is
needed to support constant quantized weights in q8ta operators.

The shader uses texel-level dispatch where each thread writes one texel (one
int32 = 4 packed int8 values). It decomposes the texel index into block-space
coordinates using BufferMetadata strides and hashed_layout dim_order, then reads
the corresponding int8 bytes from the staging buffer (interpreted as int32 for
device compatibility, avoiding the need for 8-bit buffer support).

New files:
- nchw_to_int8x4_buffer.glsl: Compute shader handling all PackedInt8 layouts
- nchw_to_int8x4_buffer.yaml: Shader variant config
- Q8taStaging.h/cpp: C++ dispatch function creating the PrepackNode

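Modified files:
- op_registry.py: Drops supports_prepacking=True from the q8ta_add registration
- StagingBuffer.cpp: Rounds 8-bit staging buffer sizes up to a multiple of 4
  bytes so the data can be read as int32
- indexing.glslh: Extends texel_idx_to_tensor4d_idx() to block-packed layouts
- Staging.cpp: Routes kInt8x4 tensors in prepack_op() to the new function
- ShaderNameUtils.cpp: Maps kInt8x4 to the "_int32" shader dtype suffix
- TestQ8taBinary.cpp: Prepacks input B directly when it is an int8 TensorRef
  instead of quantizing it at runtime
- test_q8ta_binary.cpp: Adds const_b test cases for constant tensor B inputs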

This diff was authored with Claude.
ghstack-source-id: 341022576
@exported-using-ghexport

Differential Revision: [D93000169](https://our.internmc.facebook.com/intern/diff/D93000169/)
---
 backends/vulkan/op_registry.py                |  1 -
 .../runtime/api/containers/StagingBuffer.cpp  |  6 +-
 .../runtime/graph/ops/glsl/indexing.glslh     | 61 ++++++++++-----
 .../graph/ops/glsl/nchw_to_int8x4_buffer.glsl | 78 +++++++++++++++++++
 .../graph/ops/glsl/nchw_to_int8x4_buffer.yaml | 11 +++
 .../runtime/graph/ops/impl/Q8taStaging.cpp    | 49 ++++++++++++
 .../runtime/graph/ops/impl/Q8taStaging.h      | 20 +++++
 .../vulkan/runtime/graph/ops/impl/Staging.cpp |  4 +
 .../graph/ops/utils/ShaderNameUtils.cpp       |  3 +
 .../test/custom_ops/impl/TestQ8taBinary.cpp   | 30 ++++---
 .../test/custom_ops/test_q8ta_binary.cpp      | 48 +++++++++---
 11 files changed, 268 insertions(+), 43 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.yaml
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index b58d6407308..55a92335bc7 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -510,7 +510,6 @@ def register_q8ta_add():
     return OpFeatures(
         inputs_storage=utils.PACKED_INT8_BUFFER,
         supports_resize=False,
-        supports_prepacking=True,
     )
 
 
diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.cpp b/backends/vulkan/runtime/api/containers/StagingBuffer.cpp
index 499f0b43d05..53ec9c17eae 100644
--- a/backends/vulkan/runtime/api/containers/StagingBuffer.cpp
+++ b/backends/vulkan/runtime/api/containers/StagingBuffer.cpp
@@ -136,8 +136,12 @@ StagingBuffer::StagingBuffer(
     const vkapi::CopyDirection direction)
     : context_p_(context_p),
       dtype_(get_staging_dtype(context_p, dtype)),
+      // For 8-bit types, align numel to the next multiple of 4. Devices that
+      // lack 8-bit storage buffer support will interpret the data as int32, so
+      // the buffer size must be a multiple of 4 bytes.
       vulkan_buffer_(context_p_->adapter_ptr()->vma().create_staging_buffer(
-          element_size(dtype_) * numel,
+          element_size(dtype_) *
+              (element_size(dtype_) == 1 ? utils::align_up_4(numel) : numel),
           direction)),
       mapped_data_(nullptr) {}
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh
index 37c47795214..51cda9a3d1d 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh
+++ b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh
@@ -334,19 +334,30 @@ TensorIndex linear_idx_to_tensor_idx(
 /*
  * Convert a linear texel index to a TensorIndex4D.
  *
- * This function is used for texel-based dispatch where each thread handles
- * one packed texel (4 elements along the packed dimension). The texel index
- * is decomposed using the dim_order and strides from the tensor's layout.
+ * This is the inverse of tensor4d_idx_to_texel_idx. It handles both
+ * single-packed layouts (outer_block_size == 1) and block-packed layouts
+ * (e.g., 4W4C where outer_block_size > 1).
  *
- * The strides in BufferMetadata should already be in texel space (with packed
- * dimension size divided by 4).
+ * The approach mirrors tensor4d_idx_to_texel_idx by decomposing the problem
+ * into the following steps:
+ *   1. Decompose texel_idx into block_idx and intra-block texel offset
+ *   2. Decompose block_idx into block-space tensor coordinates using strides
+ *   3. Convert block-space coordinates to element-space by multiplying by
+ *      block sizes
+ *   4. Add the intra-block outer-dimension offset
+ *
+ * For single-packed layouts (outer_block_size == 1, inner_dim == outer_dim),
+ * texels_per_block == 1, so block_idx == texel_idx and intra_block_texel == 0.
+ * The only effective multiplication is tidx[inner_dim] *= inner_block_size
+ * (i.e., *= 4), matching the previous single-packed behavior.
  *
  * Parameters:
- *   meta: BufferMetadata with tensor sizes and texel-space strides
+ *   meta: BufferMetadata with block-space strides
  *   texel_idx: Linear index into packed texels (0 to num_texels-1)
  *   hashed_layout: Packed layout info containing dim_order and packed_dim
  *
- * Returns: TensorIndex4D with logical tensor coordinates (packed dim is base of 4-element block)
+ * Returns: TensorIndex4D with logical tensor coordinates (packed dims are
+ *          base of their respective blocks)
  */
 TensorIndex4D texel_idx_to_tensor4d_idx(
     const BufferMetadata meta,
@@ -354,25 +365,35 @@ TensorIndex4D texel_idx_to_tensor4d_idx(
     const int hashed_layout) {
   TensorIndex4D tidx;
 
-  const int packed_dim = get_packed_dim(hashed_layout);
+  const int inner_dim = get_packed_dim(hashed_layout);
+  const int outer_dim = get_outer_packed_dim(hashed_layout);
+  const int inner_block_size = get_packed_dim_block_size(hashed_layout);
+  const int outer_block_size = get_outer_packed_dim_block_size(hashed_layout);
 
-  // Decompose texel_idx using dim_order from hashed_layout and strides from meta
-  // Iterate from slowest-varying dimension (d=3) to fastest (d=0)
-  // This follows the pattern of linear_idx_to_tensor_idx in indexing.glslh
+  // Number of texels per block: each block has inner_block_size *
+  // outer_block_size elements, and each texel holds 4 elements
+  const int texels_per_block = (inner_block_size * outer_block_size) / 4;
+
+  // Decompose texel_idx into block_idx and intra-block texel offset
+  const uint block_idx = texel_idx / texels_per_block;
+  const int intra_block_texel = int(texel_idx % texels_per_block);
+
+  // Decompose block_idx into block-space tensor coordinates using dim_order
+  // and strides. Iterate from slowest-varying (d=3) to fastest (d=0).
+  uint remaining = block_idx;
   [[unroll]] for (int d = 3; d >= 0; d--) {
-    // Get dim index from hashed_layout's dim_order (bits 0-15)
     int dim_idx = extract_4b(hashed_layout, d);
-
-    // Get stride for this dimension from BufferMetadata
     uint dim_stride = meta.strides[0][dim_idx];
-
-    // Compute coordinate for this dimension
-    tidx.data[dim_idx] = int(texel_idx / dim_stride);
-    texel_idx = texel_idx % dim_stride;
+    tidx.data[dim_idx] = int(remaining / dim_stride);
+    remaining = remaining % dim_stride;
   }
 
-  // Convert packed dimension from texel index to element index
-  tidx.data[packed_dim] *= 4;
+  // Convert block-space coordinates to element-space
+  tidx.data[inner_dim] *= inner_block_size;
+  tidx.data[outer_dim] *= outer_block_size;
+
+  // Add intra-block outer-dimension offset
+  tidx.data[outer_dim] += intra_block_texel;
 
   return tidx;
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl
new file mode 100644
index 00000000000..d8f7bdabe53
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+${define_active_storage_type("buffer")}
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+
+// Output buffer: packed int8x4 values (each int32 contains 4 packed int8)
+${layout_declare_tensor(B, "w", "t_outp", "int", "buffer")}
+// Input staging buffer: raw int8 data interpreted as int32 for device compat
+${layout_declare_tensor(B, "r", "nchw_in", "int", "buffer")}
+
+// Metadata for output tensor
+${layout_declare_ubo(B, "BufferMetadata", "outp")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}
+
+void main() {
+  const uint texel_idx = gl_GlobalInvocationID.x; // one output texel per thread
+  const uint num_texels = numel(outp) / 4;
+  if (texel_idx >= num_texels) {
+    return;
+  }
+
+  const int inner_dim = get_packed_dim(outp_layout);
+  const int outer_dim = get_outer_packed_dim(outp_layout);
+
+  const TensorIndex4D tidx =
+      texel_idx_to_tensor4d_idx(outp, texel_idx, outp_layout);
+
+  // Texel lies past the outer-dim extent: exit without writing t_outp
+  if (tidx.data[outer_dim] >= int(outp.sizes[0][outer_dim])) {
+    return;
+  }
+
+  // Tensor sizes in WHCN order for NCHW contiguous index computation
+  const uint W = outp.sizes[0][0];
+  const uint H = outp.sizes[0][1];
+  const uint C = outp.sizes[0][2];
+
+  // Pack 4 int8 values along inner dim into one int32 (skipped lanes stay 0)
+  int packed = 0;
+  [[unroll]] for (int i = 0; i < 4; ++i) {
+    const int elem_inner = tidx.data[inner_dim] + i;
+    if (elem_inner < int(outp.sizes[0][inner_dim])) {
+      // Element coordinates: texel base with the inner dim advanced by i
+      ivec4 elem = tidx.data;
+      elem[inner_dim] = elem_inner;
+
+      // Compute NCHW contiguous index: w + h*W + c*H*W + n*C*H*W
+      const uint nchw_idx = uint(elem[0]) + uint(elem[1]) * W +
+                            uint(elem[2]) * H * W + uint(elem[3]) * C * H * W;
+
+      // Byte (nchw_idx & 3) of word (nchw_idx >> 2), least-significant first
+      const uint int_idx = nchw_idx >> 2;
+      const uint byte_pos = nchw_idx & 3;
+      const int staging_val = nchw_in[int_idx];
+      const int byte_val = (staging_val >> (byte_pos * 8)) & 0xFF;
+      // Place the byte in lane i (bits 8*i .. 8*i+7) of the texel
+      packed |= (byte_val << (i * 8));
+    }
+  }
+
+  t_outp[texel_idx] = packed;
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.yaml
new file mode 100644
index 00000000000..514ada71f63
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.yaml
@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+nchw_to_int8x4_buffer:
+  parameter_names_with_default_values:
+    DTYPE: int
+  shader_variants:
+    - NAME: nchw_to_int8x4_buffer
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp
new file mode 100644
index 00000000000..8dc3f8156f8
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+void add_staging_to_int8x4_buffer_node(
+    ComputeGraph& graph,
+    const ValueRef tensor_data,
+    const ValueRef tensor) {
+  VK_CHECK_COND(graph.dtype_of(tensor) == vkapi::kInt8x4);
+  // Adds a PrepackNode that runs the shader below from tensor_data to tensor.
+  std::string kernel_name = "nchw_to_int8x4_buffer";
+  // The destination tensor's buffer metadata is the only UBO passed along.
+  vkapi::ParamsBindList param_buffers;
+  param_buffers.append(graph.buffer_meta_ubo(tensor));
+
+  // One thread per texel (each texel = one int32 = 4 packed int8).
+  // Use padded_numel to account for dimension padding in packed int8 layouts
+  // (e.g., kPackedInt8_4C with C=3 pads to C=4).
+  uint32_t num_texels =
+      utils::safe_downcast<uint32_t>(graph.padded_numel_of(tensor) / 4);
+  utils::uvec3 global_wg_size = {num_texels, 1, 1};
+  utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size);
+
+  graph.prepack_nodes().emplace_back(new PrepackNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      global_wg_size,
+      local_wg_size,
+      // Input (source data) and Output (packed int8x4 tensor)
+      tensor_data,
+      tensor,
+      // Parameter Buffers
+      param_buffers,
+      // Specialization Constants: hashed layout of the destination tensor
+      {graph.hashed_layout_of(tensor)}));
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h b/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h
new file mode 100644
index 00000000000..40386551e36
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+
+namespace vkcompute {
+
+void add_staging_to_int8x4_buffer_node(
+    ComputeGraph& graph,
+    const ValueRef tensor_data,
+    const ValueRef tensor);
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
index 9dc4d0a58f8..adcad9f9817 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
@@ -12,6 +12,7 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>
 
@@ -327,6 +328,9 @@ ValueRef prepack_int4_linear_weight_transposed_interleaved(
 }
 
 void prepack_op(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  if (graph.dtype_of(args[1]) == vkapi::kInt8x4) {
+    return add_staging_to_int8x4_buffer_node(graph, args[0], args[1]);
+  }
   return add_prepack_standard_node(graph, args[0], args[1]);
 }
 
diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp
index 231e6d0c7f6..59a9d79a6e3 100644
--- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp
+++ b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp
@@ -64,6 +64,9 @@ void add_dtype_suffix(std::string& kernel_name, const vkapi::ScalarType dtype) {
     case vkapi::kUInt64:
       kernel_name += "_uint64";
       break;
+    case vkapi::kInt8x4:
+      kernel_name += "_int32";
+      break;
     default:
       break;
   }
diff --git a/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp b/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
index 53f8859b581..f5214221359 100644
--- a/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
+++ b/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
@@ -10,13 +10,14 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h>
 
 namespace vkcompute {
 
 void q8ta_add_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   int32_t idx = 0;
-  const ValueRef fp_input_a = args.at(idx++);
-  const ValueRef fp_input_b = args.at(idx++);
+  const ValueRef fp_input_a = args.at(idx++);
+  const ValueRef input_b = args.at(idx++);
   const ValueRef input_a_scale = args.at(idx++);
   const ValueRef input_a_zp = args.at(idx++);
   const ValueRef input_b_scale = args.at(idx++);
@@ -32,6 +33,10 @@ void q8ta_add_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   utils::GPUMemoryLayout quant_layout =
       static_cast<utils::GPUMemoryLayout>(layout_value);
 
+  // Check if input_b is a pre-quantized int8 TensorRef
+  bool input_b_is_int8 =
+      graph.val_is_tref(input_b) && graph.dtype_of(input_b) == vkapi::kChar;
+
   // Create temporary tensors for quantized data with the specified layout
   TmpTensor packed_int8_input_a(
       &graph,
@@ -40,12 +45,8 @@ void q8ta_add_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
       utils::kBuffer,
       quant_layout);
 
-  TmpTensor packed_int8_input_b(
-      &graph,
-      graph.sizes_of(fp_input_b),
-      vkapi::kInt8x4,
-      utils::kBuffer,
-      quant_layout);
+  ValueRef packed_int8_input_b = graph.add_tensor(
+      graph.sizes_of(input_b), vkapi::kInt8x4, utils::kBuffer, quant_layout);
 
   TmpTensor packed_int8_output(
       &graph,
@@ -54,12 +55,19 @@ void q8ta_add_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
       utils::kBuffer,
       quant_layout);
 
-  // Quantize: FP -> int8x4 with specified layout
+  // Quantize input A: FP -> int8x4
   add_q8ta_quantize_node(
       graph, fp_input_a, input_a_scale, input_a_zp, packed_int8_input_a);
 
-  add_q8ta_quantize_node(
-      graph, fp_input_b, input_b_scale, input_b_zp, packed_int8_input_b);
+  if (input_b_is_int8) {
+    // Input B is a pre-quantized int8 TensorRef; prepack directly into packed
+    // int8x4 format
+    add_staging_to_int8x4_buffer_node(graph, input_b, packed_int8_input_b);
+  } else {
+    // Input B is a float tensor; quantize at runtime
+    add_q8ta_quantize_node(
+        graph, input_b, input_b_scale, input_b_zp, packed_int8_input_b);
+  }
 
   // Binary add: int8x4 -> int8x4 (same layout for all tensors)
   add_q8ta_binary_node(
diff --git a/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp b/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
index 1100eb4d5f0..86725ca8fb8 100644
--- a/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
+++ b/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
@@ -29,13 +29,17 @@ TestCase create_test_case_from_config(
     utils::StorageType storage_type,
     vkapi::ScalarType input_dtype,
     utils::GPUMemoryLayout fp_memory_layout,
-    utils::GPUMemoryLayout quant_layout) {
+    utils::GPUMemoryLayout quant_layout,
+    bool const_b = false) {
   TestCase test_case;
 
   // Create a descriptive name for the test case
   std::string shape_str = shape_string(config.shape);
   std::string test_name = config.test_case_name + "  I=" + shape_str + "  " +
       repr_str(utils::kBuffer, quant_layout);
+  if (const_b) {
+    test_name += "  const_b";
+  }
   test_case.set_name(test_name);
 
   // Set the operator name for the test case
@@ -50,13 +54,16 @@ TestCase create_test_case_from_config(
       fp_memory_layout,
       DataGenType::RANDOM);
 
-  // Input tensor B (float/half)
+  // Input tensor B (float/half, or pre-quantized int8 for const_b)
   ValueSpec input_b(
       config.shape,
-      input_dtype,
+      const_b ? vkapi::kChar : input_dtype,
       storage_type,
       fp_memory_layout,
-      DataGenType::RANDOM);
+      const_b ? DataGenType::RANDINT8 : DataGenType::RANDOM);
+  if (const_b) {
+    input_b.set_constant(true);
+  }
 
   // Quantization parameters for input A
   float input_a_scale_val = 0.007843; // 2/255 approximately
@@ -148,6 +155,13 @@ std::vector<TestCase> generate_q8ta_add_easy_cases() {
         /*input_dtype=*/vkapi::kFloat,
         /*fp_memory_layout=*/utils::kWidthPacked,
         quant_layout));
+    test_cases.push_back(create_test_case_from_config(
+        config,
+        /*storage_type=*/utils::kBuffer,
+        /*input_dtype=*/vkapi::kFloat,
+        /*fp_memory_layout=*/utils::kWidthPacked,
+        quant_layout,
+        /*const_b=*/true));
   }
 
   return test_cases;
@@ -215,6 +229,13 @@ std::vector<TestCase> generate_q8ta_add_test_cases() {
           /*input_dtype=*/vkapi::kFloat,
           /*fp_memory_layout=*/utils::kWidthPacked,
           quant_layout));
+      test_cases.push_back(create_test_case_from_config(
+          config,
+          /*storage_type=*/utils::kBuffer,
+          /*input_dtype=*/vkapi::kFloat,
+          /*fp_memory_layout=*/utils::kWidthPacked,
+          quant_layout,
+          /*const_b=*/true));
     }
   }
 
@@ -261,9 +282,10 @@ void q8ta_add_reference_impl(TestCase& test_case) {
     throw std::invalid_argument("Unsupported dtype");
   }
 
+  bool input_b_is_int8 = (input_b_spec.dtype == vkapi::kChar);
+
   // Get raw data pointers
   auto& input_a_data = input_a_spec.get_float_data();
-  auto& input_b_data = input_b_spec.get_float_data();
 
   const float input_a_scale = input_a_scale_spec.get_float_value();
   const int32_t input_a_zero_point = input_a_zero_point_spec.get_int_value();
@@ -284,11 +306,17 @@ void q8ta_add_reference_impl(TestCase& test_case) {
     quant_a_f = std::min(std::max(quant_a_f, -128.0f), 127.0f);
     int8_t quantized_a = static_cast<int8_t>(quant_a_f);
 
-    // Quantize input B to int8
-    float quant_b_f =
-        std::round(input_b_data[i] / input_b_scale) + input_b_zero_point;
-    quant_b_f = std::min(std::max(quant_b_f, -128.0f), 127.0f);
-    int8_t quantized_b = static_cast<int8_t>(quant_b_f);
+    // Get quantized input B (either from pre-quantized int8 or by quantizing)
+    int8_t quantized_b;
+    if (input_b_is_int8) {
+      quantized_b = input_b_spec.get_int8_data()[i];
+    } else {
+      float quant_b_f =
+          std::round(input_b_spec.get_float_data()[i] / input_b_scale) +
+          input_b_zero_point;
+      quant_b_f = std::min(std::max(quant_b_f, -128.0f), 127.0f);
+      quantized_b = static_cast<int8_t>(quant_b_f);
+    }
 
     // Dequantize both inputs to a common scale for addition
     float dequant_a =

From c3e20547cb7ada4fab4d23432ebec3860b6d3156 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm1479.ncg0.facebook.com>
Date: Thu, 12 Feb 2026 23:49:05 -0800
Subject: [PATCH 2/3] [ET-VK][qconv2d][ez] Don't use im2col path for general
 convs

Pull Request resolved: https://github.com/pytorch/executorch/pull/17393

This removes the dynamic dispatch logic in q8ta_conv2d() that selected between the im2col and general convolution paths. The function now unconditionally uses q8ta_conv2d_general(). This simplifies the dispatch since the im2col path selection will be handled upstream by the pattern matcher routing to specialized ops (q8ta_conv2d_pw, q8ta_conv2d_dw, etc.) instead of being decided at runtime.
ghstack-source-id: 341022577
@exported-using-ghexport

Differential Revision: [D93000164](https://our.internmc.facebook.com/intern/diff/D93000164/)
---
 .../runtime/graph/ops/impl/Q8taConv2d.cpp     | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
index d3fe1afd906..4f047d414f8 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
@@ -401,24 +401,7 @@ void q8ta_conv2d_general(
 }
 
 void q8ta_conv2d(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  // Index into args to extract values needed for dispatch decision
-  const ValueRef packed_int8_input = args.at(0);
-  const ValueRef kernel_size = args.at(9);
-  const ValueRef groups = args.at(13);
-
-  const int32_t groups_val = graph.get_int(groups);
-  const int64_t IC = graph.size_at<int64_t>(-3, packed_int8_input);
-
-  const int64_t K_h = graph.get_int_list(kernel_size)->at(0);
-  const int64_t K_w = graph.get_int_list(kernel_size)->at(1);
-
-  // Use im2col path when: non-grouped, input channels multiple of 4, small
-  // kernel
-  if (groups_val == 1 && IC % 4 == 0 && K_h <= 3 && K_w <= 3) {
-    q8ta_conv2d_im2col(graph, args);
-  } else {
-    q8ta_conv2d_general(graph, args);
-  }
+  q8ta_conv2d_general(graph, args);
 }
 
 REGISTER_OPERATORS {

From 6f2270c88cf4e5b80d5bbcebe0af09a798b593e2 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm1479.ncg0.facebook.com>
Date: Thu, 12 Feb 2026 23:49:06 -0800
Subject: [PATCH 3/3] [ET-VK][profiling] Add additional profiling blocks

Pull Request resolved: https://github.com/pytorch/executorch/pull/17394

This adds fine-grained ET_EVENT_TRACER profiling blocks to the Vulkan backend's execute() method in VulkanBackend.cpp. Previously, only GPU shader timestamps were logged. Now the following phases are individually traced: ETVK_COPY_INPUTS (host-to-GPU input transfer), ETVK_RESIZE (graph resize propagation), ETVK_COMPUTE_GRAPH_EXECUTE (GPU compute dispatch), ETVK_COPY_OUTPUTS (GPU-to-host output transfer), and ETVK_EXECUTE (overall delegate execution). The GPU shader timestamp extraction is also moved to occur right after execute() completes rather than at the end of the function, so it falls within the ETVK_EXECUTE span.
ghstack-source-id: 341022578
@exported-using-ghexport

Differential Revision: [D93000163](https://our.internmc.facebook.com/intern/diff/D93000163/)
---
 backends/vulkan/runtime/VulkanBackend.cpp | 82 +++++++++++++++++++----
 1 file changed, 69 insertions(+), 13 deletions(-)

diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp
index 261585c381b..fbca5af5100 100644
--- a/backends/vulkan/runtime/VulkanBackend.cpp
+++ b/backends/vulkan/runtime/VulkanBackend.cpp
@@ -641,6 +641,21 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
 
     const size_t num_inputs = compute_graph->inputs().size();
     bool should_propagate_resize = false;
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracer* event_tracer = context.event_tracer();
+    // Outer span; it is ended just before the final return of this method.
+    runtime::EventTracerEntry overall_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_EXECUTE",
+            /* delegate_debug_id = */ -1);
+    // Nested span covering only the input copy loop that follows.
+    runtime::EventTracerEntry copy_inputs_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_COPY_INPUTS",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
     for (size_t i = 0; i < num_inputs; i++) {
       const ValueRef iref = compute_graph->inputs()[i].value;
       if (compute_graph->val_is_tensor(iref)) {
@@ -669,13 +684,61 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
             compute_graph->get_val_type(iref));
       }
     }
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(
+        event_tracer, copy_inputs_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
 
     if (should_propagate_resize || compute_graph->has_data_dependent_shapes()) {
+#ifdef ET_EVENT_TRACER_ENABLED
+      runtime::EventTracerEntry resize_event_tracer_entry =
+          event_tracer_start_profiling_delegate(
+              event_tracer,
+              "ETVK_RESIZE",
+              /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
       compute_graph->propagate_resize();
+#ifdef ET_EVENT_TRACER_ENABLED
+      event_tracer_end_profiling_delegate(
+          event_tracer, resize_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
     }
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracerEntry execute_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_COMPUTE_GRAPH_EXECUTE",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
     compute_graph->execute();
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(
+        event_tracer, execute_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    compute_graph->context()->querypool().extract_results();
+    for (const auto& r :
+         compute_graph->context()->querypool().get_shader_timestamp_data()) {
+      std::string event_name = "{" + r.kernel_name +
+          ", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}";
+      event_tracer_log_profiling_delegate(
+          event_tracer,
+          event_name.c_str(),
+          /* delegate_debug_id = */ -1,
+          r.start_time_ns,
+          r.end_time_ns);
+    }
+#endif // ET_EVENT_TRACER_ENABLED
+
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracerEntry copy_outputs_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_COPY_OUTPUTS",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
     for (size_t i = 0; i < compute_graph->outputs().size(); i++) {
       const size_t o = i + num_inputs;
       const ValueRef oref = compute_graph->outputs()[i].value;
@@ -701,21 +764,14 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
             compute_graph->get_val_type(oref));
       }
     }
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(
+        event_tracer, copy_outputs_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
 
 #ifdef ET_EVENT_TRACER_ENABLED
-    runtime::EventTracer* event_tracer = context.event_tracer();
-    compute_graph->context()->querypool().extract_results();
-    for (const auto& r :
-         compute_graph->context()->querypool().get_shader_timestamp_data()) {
-      std::string event_name = "{" + r.kernel_name +
-          ", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}";
-      event_tracer_log_profiling_delegate(
-          event_tracer,
-          event_name.c_str(),
-          /* delegate_debug_id = */ -1,
-          r.start_time_ns,
-          r.end_time_ns);
-    }
+    event_tracer_end_profiling_delegate(
+        event_tracer, overall_event_tracer_entry);
 #endif // ET_EVENT_TRACER_ENABLED
 
     return Error::Ok;