pytorch · SS-JIA · Feb 13, 2026 · Feb 13, 2026 · Feb 13, 2026 · Feb 13, 2026
diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
@@ -510,7 +510,6 @@ def register_q8ta_add():
     return OpFeatures(
         inputs_storage=utils.PACKED_INT8_BUFFER,
         supports_resize=False,
-        supports_prepacking=True,
     )
 
 

diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp
@@ -641,6 +641,21 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
 
     const size_t num_inputs = compute_graph->inputs().size();
     bool should_propagate_resize = false;
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracer* event_tracer = context.event_tracer();
+    runtime::EventTracerEntry overall_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_EXECUTE",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracerEntry copy_inputs_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_COPY_INPUTS",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
     for (size_t i = 0; i < num_inputs; i++) {
       const ValueRef iref = compute_graph->inputs()[i].value;
       if (compute_graph->val_is_tensor(iref)) {
@@ -669,13 +684,61 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
             compute_graph->get_val_type(iref));
       }
     }
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(
+        event_tracer, copy_inputs_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
 
     if (should_propagate_resize || compute_graph->has_data_dependent_shapes()) {
+#ifdef ET_EVENT_TRACER_ENABLED
+      runtime::EventTracerEntry resize_event_tracer_entry =
+          event_tracer_start_profiling_delegate(
+              event_tracer,
+              "ETVK_RESIZE",
+              /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
       compute_graph->propagate_resize();
+#ifdef ET_EVENT_TRACER_ENABLED
+      event_tracer_end_profiling_delegate(
+          event_tracer, resize_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
     }
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracerEntry execute_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_COMPUTE_GRAPH_EXECUTE",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
     compute_graph->execute();
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(
+        event_tracer, execute_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    compute_graph->context()->querypool().extract_results();
+    for (const auto& r :
+         compute_graph->context()->querypool().get_shader_timestamp_data()) {
+      std::string event_name = "{" + r.kernel_name +
+          ", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}";
+      event_tracer_log_profiling_delegate(
+          event_tracer,
+          event_name.c_str(),
+          /* delegate_debug_id = */ -1,
+          r.start_time_ns,
+          r.end_time_ns);
+    }
+#endif // ET_EVENT_TRACER_ENABLED
+
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracerEntry copy_outputs_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_COPY_OUTPUTS",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
     for (size_t i = 0; i < compute_graph->outputs().size(); i++) {
       const size_t o = i + num_inputs;
       const ValueRef oref = compute_graph->outputs()[i].value;
@@ -701,21 +764,14 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
             compute_graph->get_val_type(oref));
       }
     }
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(
+        event_tracer, copy_outputs_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
 
 #ifdef ET_EVENT_TRACER_ENABLED
-    runtime::EventTracer* event_tracer = context.event_tracer();
-    compute_graph->context()->querypool().extract_results();
-    for (const auto& r :
-         compute_graph->context()->querypool().get_shader_timestamp_data()) {
-      std::string event_name = "{" + r.kernel_name +
-          ", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}";
-      event_tracer_log_profiling_delegate(
-          event_tracer,
-          event_name.c_str(),
-          /* delegate_debug_id = */ -1,
-          r.start_time_ns,
-          r.end_time_ns);
-    }
+    event_tracer_end_profiling_delegate(
+        event_tracer, overall_event_tracer_entry);
 #endif // ET_EVENT_TRACER_ENABLED
 
     return Error::Ok;

diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.cpp b/backends/vulkan/runtime/api/containers/StagingBuffer.cpp
@@ -136,8 +136,12 @@ StagingBuffer::StagingBuffer(
     const vkapi::CopyDirection direction)
     : context_p_(context_p),
       dtype_(get_staging_dtype(context_p, dtype)),
+      // For 8-bit types, align numel to the next multiple of 4. Devices that
+      // lack 8-bit storage buffer support will interpret the data as int32, so
+      // the buffer size must be a multiple of 4 bytes.
       vulkan_buffer_(context_p_->adapter_ptr()->vma().create_staging_buffer(
-          element_size(dtype_) * numel,
+          element_size(dtype_) *
+              (element_size(dtype_) == 1 ? utils::align_up_4(numel) : numel),
           direction)),
       mapped_data_(nullptr) {}
 

diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh
@@ -334,45 +334,66 @@ TensorIndex linear_idx_to_tensor_idx(
 /*
  * Convert a linear texel index to a TensorIndex4D.
  *
- * This function is used for texel-based dispatch where each thread handles
- * one packed texel (4 elements along the packed dimension). The texel index
- * is decomposed using the dim_order and strides from the tensor's layout.
+ * This is the inverse of tensor4d_idx_to_texel_idx. It handles both
+ * single-packed layouts (outer_block_size == 1) and block-packed layouts
+ * (e.g., 4W4C where outer_block_size > 1).
  *
- * The strides in BufferMetadata should already be in texel space (with packed
- * dimension size divided by 4).
+ * The approach mirrors tensor4d_idx_to_texel_idx, working at block level and
+ * then at intra-block level, in four steps:
+ *   1. Decompose texel_idx into block_idx and intra-block texel offset
+ *   2. Decompose block_idx into block-space tensor coordinates using strides
+ *   3. Convert block-space coordinates to element-space by multiplying by
+ *      block sizes
+ *   4. Add the intra-block outer-dimension offset
+ *
+ * For single-packed layouts (outer_block_size == 1, inner_dim == outer_dim),
+ * texels_per_block == 1, so block_idx == texel_idx and intra_block_texel == 0.
+ * The only effective multiplication is tidx[inner_dim] *= inner_block_size
+ * (i.e., *= 4), matching the previous single-packed behavior.
  *
  * Parameters:
- *   meta: BufferMetadata with tensor sizes and texel-space strides
+ *   meta: BufferMetadata with block-space strides
  *   texel_idx: Linear index into packed texels (0 to num_texels-1)
  *   hashed_layout: Packed layout info containing dim_order and packed_dim
  *
- * Returns: TensorIndex4D with logical tensor coordinates (packed dim is base of 4-element block)
+ * Returns: TensorIndex4D with logical tensor coordinates (packed dims are
+ *          base of their respective blocks)
  */
 TensorIndex4D texel_idx_to_tensor4d_idx(
     const BufferMetadata meta,
     uint texel_idx,
     const int hashed_layout) {
   TensorIndex4D tidx;
 
-  const int packed_dim = get_packed_dim(hashed_layout);
+  const int inner_dim = get_packed_dim(hashed_layout);
+  const int outer_dim = get_outer_packed_dim(hashed_layout);
+  const int inner_block_size = get_packed_dim_block_size(hashed_layout);
+  const int outer_block_size = get_outer_packed_dim_block_size(hashed_layout);
 
-  // Decompose texel_idx using dim_order from hashed_layout and strides from meta
-  // Iterate from slowest-varying dimension (d=3) to fastest (d=0)
-  // This follows the pattern of linear_idx_to_tensor_idx in indexing.glslh
+  // Texels per block: a block holds inner_block_size * outer_block_size
+  // elements at 4 elements per texel (integer division; 0 if product < 4)
+  const int texels_per_block = (inner_block_size * outer_block_size) / 4;
+
+  // Decompose texel_idx into block_idx and intra-block texel offset
+  const uint block_idx = texel_idx / texels_per_block;
+  const int intra_block_texel = int(texel_idx % texels_per_block);
+
+  // Decompose block_idx into block-space tensor coordinates using dim_order
+  // and strides. Iterate from slowest-varying (d=3) to fastest (d=0).
+  uint remaining = block_idx;
   [[unroll]] for (int d = 3; d >= 0; d--) {
-    // Get dim index from hashed_layout's dim_order (bits 0-15)
     int dim_idx = extract_4b(hashed_layout, d);
-
-    // Get stride for this dimension from BufferMetadata
     uint dim_stride = meta.strides[0][dim_idx];
-
-    // Compute coordinate for this dimension
-    tidx.data[dim_idx] = int(texel_idx / dim_stride);
-    texel_idx = texel_idx % dim_stride;
+    tidx.data[dim_idx] = int(remaining / dim_stride);
+    remaining = remaining % dim_stride;
   }
 
-  // Convert packed dimension from texel index to element index
-  tidx.data[packed_dim] *= 4;
+  // Convert block-space coordinates to element-space
+  tidx.data[inner_dim] *= inner_block_size;
+  tidx.data[outer_dim] *= outer_block_size;
+
+  // Step along the outer dim by this texel's position within its block
+  tidx.data[outer_dim] += intra_block_texel;
 
   return tidx;
 }

diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+${define_active_storage_type("buffer")}
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+
+// Output buffer: packed int8x4 values (each int32 contains 4 packed int8)
+${layout_declare_tensor(B, "w", "t_outp", "int", "buffer")}
+// Input staging buffer: raw int8 data interpreted as int32 for device compat
+${layout_declare_tensor(B, "r", "nchw_in", "int", "buffer")}
+
+// Metadata for output tensor
+${layout_declare_ubo(B, "BufferMetadata", "outp")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}
+
+void main() {
+  const uint texel_idx = gl_GlobalInvocationID.x;
+  // One invocation per packed int32 texel; surplus invocations exit early
+  if (texel_idx >= numel(outp) / 4) {
+    return;
+  }
+
+  const int inner_dim = get_packed_dim(outp_layout);
+  const int outer_dim = get_outer_packed_dim(outp_layout);
+  const int inner_size = int(outp.sizes[0][inner_dim]);
+  const int outer_size = int(outp.sizes[0][outer_dim]);
+
+  // Tensor coordinates that this texel index maps to
+  const TensorIndex4D base_idx =
+      texel_idx_to_tensor4d_idx(outp, texel_idx, outp_layout);
+
+  // Skip texels positioned at or beyond the end of the outer dimension
+  if (base_idx.data[outer_dim] >= outer_size) {
+    return;
+  }
+
+  // Strides of a contiguous NCHW buffer, built from the WHCN-ordered sizes
+  const uint h_stride = outp.sizes[0][0];
+  const uint c_stride = outp.sizes[0][1] * h_stride;
+  const uint n_stride = outp.sizes[0][2] * c_stride;
+
+  // Gather up to 4 int8 values along the inner dimension into one int32.
+  // Lanes at or beyond the end of the inner dimension stay zero.
+  int packed = 0;
+  [[unroll]] for (int lane = 0; lane < 4; ++lane) {
+    ivec4 elem = base_idx.data;
+    elem[inner_dim] += lane;
+    if (elem[inner_dim] < inner_size) {
+      // Contiguous NCHW index: w + h*W + c*H*W + n*C*H*W
+      const uint nchw_idx = uint(elem[0]) + uint(elem[1]) * h_stride +
+          uint(elem[2]) * c_stride + uint(elem[3]) * n_stride;
+
+      // The staging buffer is declared as int32, so pick the word that
+      // holds the byte and then shift that byte down to the low 8 bits
+      const int staging_word = nchw_in[nchw_idx >> 2];
+      const uint bit_shift = (nchw_idx & 3) * 8;
+      const int byte_val = (staging_word >> bit_shift) & 0xFF;
+      packed |= (byte_val << (lane * 8));
+    }
+  }
+
+  t_outp[texel_idx] = packed;
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.yaml
@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+nchw_to_int8x4_buffer:
+  parameter_names_with_default_values:
+    DTYPE: int
+  shader_variants:
+    - NAME: nchw_to_int8x4_buffer
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
@@ -401,24 +401,7 @@ void q8ta_conv2d_general(
 }
 
 void q8ta_conv2d(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  // Index into args to extract values needed for dispatch decision
-  const ValueRef packed_int8_input = args.at(0);
-  const ValueRef kernel_size = args.at(9);
-  const ValueRef groups = args.at(13);
-
-  const int32_t groups_val = graph.get_int(groups);
-  const int64_t IC = graph.size_at<int64_t>(-3, packed_int8_input);
-
-  const int64_t K_h = graph.get_int_list(kernel_size)->at(0);
-  const int64_t K_w = graph.get_int_list(kernel_size)->at(1);
-
-  // Use im2col path when: non-grouped, input channels multiple of 4, small
-  // kernel
-  if (groups_val == 1 && IC % 4 == 0 && K_h <= 3 && K_w <= 3) {
-    q8ta_conv2d_im2col(graph, args);
-  } else {
-    q8ta_conv2d_general(graph, args);
-  }
+  q8ta_conv2d_general(graph, args);
 }
 
 REGISTER_OPERATORS {

diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+// Records a prepack node that runs the "nchw_to_int8x4_buffer" shader with
+// `tensor_data` as its input and the kInt8x4 `tensor` as its output.
+void add_staging_to_int8x4_buffer_node(
+    ComputeGraph& graph,
+    const ValueRef tensor_data,
+    const ValueRef tensor) {
+  // Only destinations with the packed int8x4 dtype are accepted.
+  VK_CHECK_COND(graph.dtype_of(tensor) == vkapi::kInt8x4);
+
+  std::string shader_str = "nchw_to_int8x4_buffer";
+
+  vkapi::ParamsBindList ubos;
+  ubos.append(graph.buffer_meta_ubo(tensor));
+
+  // Dispatch one invocation per int32 texel, i.e. per 4 packed int8 values.
+  // The padded element count is used so that layouts which pad a dimension
+  // (e.g., kPackedInt8_4C with C=3 pads to C=4) are covered in full.
+  const uint32_t texel_count =
+      utils::safe_downcast<uint32_t>(graph.padded_numel_of(tensor) / 4);
+  const utils::uvec3 global_size = {texel_count, 1, 1};
+  const utils::uvec3 local_size = graph.create_local_wg_size(global_size);
+
+  graph.prepack_nodes().emplace_back(new PrepackNode(
+      graph,
+      VK_KERNEL_FROM_STR(shader_str),
+      global_size,
+      local_size,
+      tensor_data, // input
+      tensor, // output
+      ubos, // parameter buffers
+      {graph.hashed_layout_of(tensor)})); // specialization constants
+}
+
+} // namespace vkcompute