From 57c5a41af8a55d30da5d2a3051cc5498f1379a71 Mon Sep 17 00:00:00 2001 From: ssjia Date: Thu, 12 Feb 2026 23:49:05 -0800 Subject: [PATCH 1/2] [ET-VK][qconv2d][ez] Don't use im2col path for general convs Pull Request resolved: https://github.com/pytorch/executorch/pull/17393 This removes the dynamic dispatch logic in q8ta_conv2d() that selected between the im2col and general convolution paths. The function now unconditionally uses q8ta_conv2d_general(). This simplifies the dispatch since the im2col path selection will be handled upstream by the pattern matcher routing to specialized ops (q8ta_conv2d_pw, q8ta_conv2d_dw, etc.) instead of being decided at runtime. ghstack-source-id: 341022577 @exported-using-ghexport Differential Revision: [D93000164](https://our.internmc.facebook.com/intern/diff/D93000164/) --- .../runtime/graph/ops/impl/Q8taConv2d.cpp | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp index d3fe1afd906..4f047d414f8 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp @@ -401,24 +401,7 @@ void q8ta_conv2d_general( } void q8ta_conv2d(ComputeGraph& graph, const std::vector<ValueRef>& args) { - // Index into args to extract values needed for dispatch decision - const ValueRef packed_int8_input = args.at(0); - const ValueRef kernel_size = args.at(9); - const ValueRef groups = args.at(13); - - const int32_t groups_val = graph.get_int(groups); - const int64_t IC = graph.size_at(-3, packed_int8_input); - - const int64_t K_h = graph.get_int_list(kernel_size)->at(0); - const int64_t K_w = graph.get_int_list(kernel_size)->at(1); - - // Use im2col path when: non-grouped, input channels multiple of 4, small - // kernel - if (groups_val == 1 && IC % 4 == 0 && K_h <= 3 && K_w <= 3) { - q8ta_conv2d_im2col(graph, args); - } else { - q8ta_conv2d_general(graph, 
args); - } + q8ta_conv2d_general(graph, args); } REGISTER_OPERATORS { From 0d64247af32db0aeb4fc4a5313e6594e20462970 Mon Sep 17 00:00:00 2001 From: ssjia Date: Thu, 12 Feb 2026 23:49:06 -0800 Subject: [PATCH 2/2] [ET-VK][profiling] Add additional profiling blocks Pull Request resolved: https://github.com/pytorch/executorch/pull/17394 This adds fine-grained ET_EVENT_TRACER profiling blocks to the Vulkan backend's execute() method in VulkanBackend.cpp. Previously, only GPU shader timestamps were logged. Now the following phases are individually traced: ETVK_COPY_INPUTS (host-to-GPU input transfer), ETVK_RESIZE (graph resize propagation), ETVK_COMPUTE_GRAPH_EXECUTE (GPU compute dispatch), ETVK_COPY_OUTPUTS (GPU-to-host output transfer), and ETVK_EXECUTE (overall delegate execution). The GPU shader timestamp extraction is also moved to occur right after execute() completes rather than at the end of the function, so it falls within the ETVK_EXECUTE span. ghstack-source-id: 341022578 @exported-using-ghexport Differential Revision: [D93000163](https://our.internmc.facebook.com/intern/diff/D93000163/) --- backends/vulkan/runtime/VulkanBackend.cpp | 82 +++++++++++++++++++---- 1 file changed, 69 insertions(+), 13 deletions(-) diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index 261585c381b..fbca5af5100 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -641,6 +641,21 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface { const size_t num_inputs = compute_graph->inputs().size(); bool should_propagate_resize = false; +#ifdef ET_EVENT_TRACER_ENABLED + runtime::EventTracer* event_tracer = context.event_tracer(); + runtime::EventTracerEntry overall_event_tracer_entry = + event_tracer_start_profiling_delegate( + event_tracer, + "ETVK_EXECUTE", + /* delegate_debug_id = */ -1); +#endif // ET_EVENT_TRACER_ENABLED +#ifdef ET_EVENT_TRACER_ENABLED + 
runtime::EventTracerEntry copy_inputs_event_tracer_entry = + event_tracer_start_profiling_delegate( + event_tracer, + "ETVK_COPY_INPUTS", + /* delegate_debug_id = */ -1); +#endif // ET_EVENT_TRACER_ENABLED for (size_t i = 0; i < num_inputs; i++) { const ValueRef iref = compute_graph->inputs()[i].value; if (compute_graph->val_is_tensor(iref)) { @@ -669,13 +684,61 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface { compute_graph->get_val_type(iref)); } } +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_end_profiling_delegate( + event_tracer, copy_inputs_event_tracer_entry); +#endif // ET_EVENT_TRACER_ENABLED if (should_propagate_resize || compute_graph->has_data_dependent_shapes()) { +#ifdef ET_EVENT_TRACER_ENABLED + runtime::EventTracerEntry resize_event_tracer_entry = + event_tracer_start_profiling_delegate( + event_tracer, + "ETVK_RESIZE", + /* delegate_debug_id = */ -1); +#endif // ET_EVENT_TRACER_ENABLED compute_graph->propagate_resize(); +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_end_profiling_delegate( + event_tracer, resize_event_tracer_entry); +#endif // ET_EVENT_TRACER_ENABLED } +#ifdef ET_EVENT_TRACER_ENABLED + runtime::EventTracerEntry execute_event_tracer_entry = + event_tracer_start_profiling_delegate( + event_tracer, + "ETVK_COMPUTE_GRAPH_EXECUTE", + /* delegate_debug_id = */ -1); +#endif // ET_EVENT_TRACER_ENABLED compute_graph->execute(); +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_end_profiling_delegate( + event_tracer, execute_event_tracer_entry); +#endif // ET_EVENT_TRACER_ENABLED +#ifdef ET_EVENT_TRACER_ENABLED + compute_graph->context()->querypool().extract_results(); + for (const auto& r : + compute_graph->context()->querypool().get_shader_timestamp_data()) { + std::string event_name = "{" + r.kernel_name + + ", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}"; + event_tracer_log_profiling_delegate( + event_tracer, + event_name.c_str(), + /* delegate_debug_id = */ -1, + r.start_time_ns, + r.end_time_ns); 
+ } +#endif // ET_EVENT_TRACER_ENABLED + +#ifdef ET_EVENT_TRACER_ENABLED + runtime::EventTracerEntry copy_outputs_event_tracer_entry = + event_tracer_start_profiling_delegate( + event_tracer, + "ETVK_COPY_OUTPUTS", + /* delegate_debug_id = */ -1); +#endif // ET_EVENT_TRACER_ENABLED for (size_t i = 0; i < compute_graph->outputs().size(); i++) { const size_t o = i + num_inputs; const ValueRef oref = compute_graph->outputs()[i].value; @@ -701,21 +764,14 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface { compute_graph->get_val_type(oref)); } } +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_end_profiling_delegate( + event_tracer, copy_outputs_event_tracer_entry); +#endif // ET_EVENT_TRACER_ENABLED #ifdef ET_EVENT_TRACER_ENABLED - runtime::EventTracer* event_tracer = context.event_tracer(); - compute_graph->context()->querypool().extract_results(); - for (const auto& r : - compute_graph->context()->querypool().get_shader_timestamp_data()) { - std::string event_name = "{" + r.kernel_name + - ", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}"; - event_tracer_log_profiling_delegate( - event_tracer, - event_name.c_str(), - /* delegate_debug_id = */ -1, - r.start_time_ns, - r.end_time_ns); - } + event_tracer_end_profiling_delegate( + event_tracer, overall_event_tracer_entry); #endif // ET_EVENT_TRACER_ENABLED return Error::Ok;