From 57c5a41af8a55d30da5d2a3051cc5498f1379a71 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm1479.ncg0.facebook.com>
Date: Thu, 12 Feb 2026 23:49:05 -0800
Subject: [PATCH 1/2] [ET-VK][qconv2d][ez] Don't use im2col path for general
 convs

Pull Request resolved: https://github.com/pytorch/executorch/pull/17393

This removes the runtime dispatch logic in q8ta_conv2d() that selected between the im2col and general convolution paths; the function now unconditionally calls q8ta_conv2d_general(). Previously the im2col path was chosen when groups == 1, the input channel count was a multiple of 4, and both kernel dimensions were at most 3. This simplifies the dispatch, since im2col path selection will be handled upstream by the pattern matcher routing to specialized ops (q8ta_conv2d_pw, q8ta_conv2d_dw, etc.) instead of being decided at runtime.
ghstack-source-id: 341022577
@exported-using-ghexport

Differential Revision: [D93000164](https://our.internmc.facebook.com/intern/diff/D93000164/)
---
 .../runtime/graph/ops/impl/Q8taConv2d.cpp     | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
index d3fe1afd906..4f047d414f8 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
@@ -401,24 +401,7 @@ void q8ta_conv2d_general(
 }
 
 void q8ta_conv2d(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  // Index into args to extract values needed for dispatch decision
-  const ValueRef packed_int8_input = args.at(0);
-  const ValueRef kernel_size = args.at(9);
-  const ValueRef groups = args.at(13);
-
-  const int32_t groups_val = graph.get_int(groups);
-  const int64_t IC = graph.size_at<int64_t>(-3, packed_int8_input);
-
-  const int64_t K_h = graph.get_int_list(kernel_size)->at(0);
-  const int64_t K_w = graph.get_int_list(kernel_size)->at(1);
-
-  // Use im2col path when: non-grouped, input channels multiple of 4, small
-  // kernel
-  if (groups_val == 1 && IC % 4 == 0 && K_h <= 3 && K_w <= 3) {
-    q8ta_conv2d_im2col(graph, args);
-  } else {
-    q8ta_conv2d_general(graph, args);
-  }
+  q8ta_conv2d_general(graph, args);
 }
 
 REGISTER_OPERATORS {

From 0d64247af32db0aeb4fc4a5313e6594e20462970 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm1479.ncg0.facebook.com>
Date: Thu, 12 Feb 2026 23:49:06 -0800
Subject: [PATCH 2/2] [ET-VK][profiling] Add additional profiling blocks

Pull Request resolved: https://github.com/pytorch/executorch/pull/17394

This adds fine-grained event tracer profiling blocks (compiled only when ET_EVENT_TRACER_ENABLED is defined) to the Vulkan backend's execute() method in VulkanBackend.cpp. Previously, only GPU shader timestamps were logged. Now the following phases are individually traced: ETVK_COPY_INPUTS (host-to-GPU input transfer), ETVK_RESIZE (graph resize propagation; recorded only when propagate_resize() is actually called), ETVK_COMPUTE_GRAPH_EXECUTE (GPU compute dispatch), ETVK_COPY_OUTPUTS (GPU-to-host output transfer), and ETVK_EXECUTE (overall delegate execution). The GPU shader timestamp extraction is also moved to occur right after compute_graph->execute() completes rather than at the end of the function, so it falls within the ETVK_EXECUTE span, between the ETVK_COMPUTE_GRAPH_EXECUTE and ETVK_COPY_OUTPUTS spans.
ghstack-source-id: 341022578
@exported-using-ghexport

Differential Revision: [D93000163](https://our.internmc.facebook.com/intern/diff/D93000163/)
---
 backends/vulkan/runtime/VulkanBackend.cpp | 82 +++++++++++++++++++----
 1 file changed, 69 insertions(+), 13 deletions(-)

diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp
index 261585c381b..fbca5af5100 100644
--- a/backends/vulkan/runtime/VulkanBackend.cpp
+++ b/backends/vulkan/runtime/VulkanBackend.cpp
@@ -641,6 +641,21 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
 
     const size_t num_inputs = compute_graph->inputs().size();
     bool should_propagate_resize = false;
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracer* event_tracer = context.event_tracer();
+    runtime::EventTracerEntry overall_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_EXECUTE",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracerEntry copy_inputs_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_COPY_INPUTS",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
     for (size_t i = 0; i < num_inputs; i++) {
       const ValueRef iref = compute_graph->inputs()[i].value;
       if (compute_graph->val_is_tensor(iref)) {
@@ -669,13 +684,61 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
             compute_graph->get_val_type(iref));
       }
     }
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(
+        event_tracer, copy_inputs_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
 
     if (should_propagate_resize || compute_graph->has_data_dependent_shapes()) {
+#ifdef ET_EVENT_TRACER_ENABLED
+      runtime::EventTracerEntry resize_event_tracer_entry =
+          event_tracer_start_profiling_delegate(
+              event_tracer,
+              "ETVK_RESIZE",
+              /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
       compute_graph->propagate_resize();
+#ifdef ET_EVENT_TRACER_ENABLED
+      event_tracer_end_profiling_delegate(
+          event_tracer, resize_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
     }
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracerEntry execute_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_COMPUTE_GRAPH_EXECUTE",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
     compute_graph->execute();
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(
+        event_tracer, execute_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    compute_graph->context()->querypool().extract_results();
+    for (const auto& r :
+         compute_graph->context()->querypool().get_shader_timestamp_data()) {
+      std::string event_name = "{" + r.kernel_name +
+          ", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}";
+      event_tracer_log_profiling_delegate(
+          event_tracer,
+          event_name.c_str(),
+          /* delegate_debug_id = */ -1,
+          r.start_time_ns,
+          r.end_time_ns);
+    }
+#endif // ET_EVENT_TRACER_ENABLED
+
+#ifdef ET_EVENT_TRACER_ENABLED
+    runtime::EventTracerEntry copy_outputs_event_tracer_entry =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "ETVK_COPY_OUTPUTS",
+            /* delegate_debug_id = */ -1);
+#endif // ET_EVENT_TRACER_ENABLED
     for (size_t i = 0; i < compute_graph->outputs().size(); i++) {
       const size_t o = i + num_inputs;
       const ValueRef oref = compute_graph->outputs()[i].value;
@@ -701,21 +764,14 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
             compute_graph->get_val_type(oref));
       }
     }
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(
+        event_tracer, copy_outputs_event_tracer_entry);
+#endif // ET_EVENT_TRACER_ENABLED
 
 #ifdef ET_EVENT_TRACER_ENABLED
-    runtime::EventTracer* event_tracer = context.event_tracer();
-    compute_graph->context()->querypool().extract_results();
-    for (const auto& r :
-         compute_graph->context()->querypool().get_shader_timestamp_data()) {
-      std::string event_name = "{" + r.kernel_name +
-          ", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}";
-      event_tracer_log_profiling_delegate(
-          event_tracer,
-          event_name.c_str(),
-          /* delegate_debug_id = */ -1,
-          r.start_time_ns,
-          r.end_time_ns);
-    }
+    event_tracer_end_profiling_delegate(
+        event_tracer, overall_event_tracer_entry);
 #endif // ET_EVENT_TRACER_ENABLED
 
     return Error::Ok;