pytorch · SS-JIA · Feb 13, 2026 · Feb 13, 2026 · Feb 13, 2026 · Feb 13, 2026
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
@@ -1136,7 +1136,7 @@ jobs:
         ./cmake-out/backends/vulkan/test/custom_ops/choose_qparams_per_row
         ./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_qdq
         ./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_clone
-        ./cmake-out/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add
+        ./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_binary
 
         # "Classic" Operator tests
         PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_op.sh --build

diff --git a/backends/vulkan/custom_ops_lib.py b/backends/vulkan/custom_ops_lib.py
@@ -564,11 +564,11 @@ def apply_rotary_emb_impl(
 apply_rotary_emb_op = getattr(getattr(torch.ops, namespace), name)
 
 ########################
-## add_q8ta_q8ta_q8to ##
+## q8ta_add ##
 ########################
 
 
-def add_q8ta_q8ta_q8to_impl(
+def q8ta_add_impl(
     input_a: torch.Tensor,
     input_b: torch.Tensor,
     input_a_scale: float,
@@ -598,12 +598,12 @@ def add_q8ta_q8ta_q8to_impl(
     return quantized_result
 
 
-name = "add_q8ta_q8ta_q8to"
+name = "q8ta_add"
 lib.define(
     f"{name}(Tensor input_a, Tensor input_b, float input_a_scale, int input_a_zero_point, float input_b_scale, int input_b_zero_point, float output_scale, int output_zero_point, float alpha) -> Tensor"
 )
-lib.impl(name, add_q8ta_q8ta_q8to_impl, "CompositeExplicitAutograd")
-add_q8ta_q8ta_q8to_op = getattr(getattr(torch.ops, namespace), name)
+lib.impl(name, q8ta_add_impl, "CompositeExplicitAutograd")
+q8ta_add_op = getattr(getattr(torch.ops, namespace), name)
 
 #############################
 ## select_as_symint ##

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
@@ -501,19 +501,23 @@ def register_torchao_choose_qparams_affine():
 
 
 # =============================================================================
-# QuantizedBinary.cpp
+# Q8taBinary.cpp
 # =============================================================================
 
 
-@update_features(exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default)
-def register_add_q8ta_q8ta_q8to():
+@update_features(exir_ops.edge.et_vk.q8ta_add.default)
+def register_q8ta_add():
     return OpFeatures(
-        inputs_storage=utils.PACKED_INT8_4W4C_BUFFER,
+        inputs_storage=utils.PACKED_INT8_BUFFER,
         supports_resize=False,
-        supports_prepacking=True,
     )
 
 
+# =============================================================================
+# Reduce.cpp
+# =============================================================================
+
+
 def get_dims_reduced(node: torch.fx.Node) -> Union[int, List[int]]:
     ndim = utils.ndim_of(node.args[0])
     assert ndim is not None
@@ -623,11 +627,6 @@ def pick_storage_for_reduce(node: torch.fx.Node):
     return inputs_storage, outputs_storage
 
 
-# =============================================================================
-# Reduce.cpp
-# =============================================================================
-
-
 @update_features(
     [
         exir_ops.edge.aten.mean.dim,
@@ -778,6 +777,9 @@ def register_q8ta_conv_pw_op():
             utils.NO_STORAGE,  # groups (non tensor)
             utils.NO_STORAGE,  # original OC count (non tensor)
         ],
+        outputs_storage=[
+            utils.PACKED_INT8_CHANNELS_PACKED_BUFFER,
+        ],
         supports_resize=False,
         supports_prepacking=True,
     )
@@ -792,7 +794,7 @@ def register_q8ta_conv_pw_op():
 def register_q8ta_conv2d_ops():
     return OpFeatures(
         inputs_storage=[
-            utils.PACKED_INT8_4W4C_BUFFER,  # input
+            utils.PACKED_INT8_4C1W_BUFFER,  # input
             utils.NO_STORAGE,  # input_scale (non tensor)
             utils.NO_STORAGE,  # input_zero_point (non tensor)
             utils.NO_STORAGE,  # weight (prepacked)
@@ -808,6 +810,9 @@ def register_q8ta_conv2d_ops():
             utils.NO_STORAGE,  # groups (non tensor)
             utils.NO_STORAGE,  # original OC count (non tensor)
         ],
+        outputs_storage=[
+            utils.PACKED_INT8_CHANNELS_PACKED_BUFFER,
+        ],
         supports_resize=False,
         supports_prepacking=True,
     )

diff --git a/backends/vulkan/patterns/quantized_binary.py b/backends/vulkan/patterns/quantized_binary.py
@@ -133,7 +133,7 @@ def make_add_q8ta_q8ta_q8to_custom_op(
         exir_ops.edge.aten.add.Tensor,
         exir_ops.edge.aten.add_.Tensor,
     }:
-        op_target = exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default
+        op_target = exir_ops.edge.et_vk.q8ta_add.default
     else:
         # For future binary operations, add more mappings here
         raise NotImplementedError(

diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp
@@ -641,6 +641,21 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
 
     const size_t num_inputs = compute_graph->inputs().size();
     bool should_propagate_resize = false;
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracer* event_tracer = context.event_tracer();
+    runtime::EventTracerEntry overall_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_EXECUTE",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracerEntry copy_inputs_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_COPY_INPUTS",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
     for (size_t i = 0; i < num_inputs; i++) {
       const ValueRef iref = compute_graph->inputs()[i].value;
       if (compute_graph->val_is_tensor(iref)) {
@@ -669,13 +684,61 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
             compute_graph->get_val_type(iref));
       }
     }
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(
+        event_tracer, copy_inputs_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
 
     if (should_propagate_resize || compute_graph->has_data_dependent_shapes()) {
+#ifdef ET_EVENT_TRACER_ENABLED
+      runtime::EventTracerEntry resize_event_tracer_entry =
+          event_tracer_start_profiling_delegate(
+              event_tracer,
+              "ETVK_RESIZE",
+              /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
       compute_graph->propagate_resize();
+#ifdef ET_EVENT_TRACER_ENABLED
+      event_tracer_end_profiling_delegate(
+          event_tracer, resize_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
     }
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracerEntry execute_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_COMPUTE_GRAPH_EXECUTE",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
     compute_graph->execute();
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(
+        event_tracer, execute_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    compute_graph->context()->querypool().extract_results();
+    for (const auto& r :
+         compute_graph->context()->querypool().get_shader_timestamp_data()) {
+      std::string event_name = "{" + r.kernel_name +
+          ", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}";
+      event_tracer_log_profiling_delegate(
+          event_tracer,
+          event_name.c_str(),
+          /* delegate_debug_id = */ -1,
+          r.start_time_ns,
+          r.end_time_ns);
+    }
+#endif // ET_EVENT_TRACER_ENABLED
+
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracerEntry copy_outputs_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_COPY_OUTPUTS",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
     for (size_t i = 0; i < compute_graph->outputs().size(); i++) {
       const size_t o = i + num_inputs;
       const ValueRef oref = compute_graph->outputs()[i].value;
@@ -701,21 +764,14 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
             compute_graph->get_val_type(oref));
       }
     }
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(
+        event_tracer, copy_outputs_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
 
 #ifdef ET_EVENT_TRACER_ENABLED
-    runtime::EventTracer* event_tracer = context.event_tracer();
-    compute_graph->context()->querypool().extract_results();
-    for (const auto& r :
-         compute_graph->context()->querypool().get_shader_timestamp_data()) {
-      std::string event_name = "{" + r.kernel_name +
-          ", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}";
-      event_tracer_log_profiling_delegate(
-          event_tracer,
-          event_name.c_str(),
-          /* delegate_debug_id = */ -1,
-          r.start_time_ns,
-          r.end_time_ns);
-    }
+    event_tracer_end_profiling_delegate(
+        event_tracer, overall_event_tracer_entry);
 #endif // ET_EVENT_TRACER_ENABLED
 
     return Error::Ok;

diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.cpp b/backends/vulkan/runtime/api/containers/StagingBuffer.cpp
@@ -136,8 +136,12 @@ StagingBuffer::StagingBuffer(
     const vkapi::CopyDirection direction)
     : context_p_(context_p),
       dtype_(get_staging_dtype(context_p, dtype)),
+      // For 8-bit types, align numel to the next multiple of 4. Devices that
+      // lack 8-bit storage buffer support will interpret the data as int32, so
+      // the buffer size must be a multiple of 4 bytes.
       vulkan_buffer_(context_p_->adapter_ptr()->vma().create_staging_buffer(
-          element_size(dtype_) * numel,
+          element_size(dtype_) *
+              (element_size(dtype_) == 1 ? utils::align_up_4(numel) : numel),
           direction)),
       mapped_data_(nullptr) {}
 

diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl