From c8f3cfe06ef167e9989a89da6712b527802b5f08 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 5 Jun 2025 11:02:49 -0700 Subject: [PATCH] [ET-VK][ez] Fix Vulkan Validation layer errors due to consecutive command buffer encoding ## Changes * In `VulkanBackend.cpp` do not call `encode_execute()` during model load if the model compile spec specifies `requires_dynamic_shapes` as true * In test files, do not call `encode_execute()` if `propagate_resize()` is subsequently called. ## Motivation Recently, it was discovered that a command buffer re-encode was required to update push constant values. This means that for dynamic shapes to work correctly, `encode_execute()` must be called after updating tensor sizes. As a result, `propagate_resize()` now calls `encode_execute()` internally. This results in scenarios where `encode_execute()` is called once during model load, then again right before the first inference during `propagate_resize()`, without actually executing the command buffer in-between. This causes Validation layer errors like ``` UNASSIGNED-CoreValidation-DrawState-InvalidImageLayout(ERROR / SPEC): msgNum: 1303270965 - Validation Error: [ UNASSIGNED-CoreValidation-DrawState-InvalidImageLayout ] Object 0: handle = 0x24086224ec0, type = VK_OBJECT_TYPE_COMMAND_BUFFER; Object 1: handle = 0x88d2b500000000e2, type = VK_OBJECT_TYPE_IMAGE; | MessageID = 0x4dae5635 | vkQueueSubmit(): pSubmits[0].pCommandBuffers[0] command buffer VkCommandBuffer 0x24086224ec0[] expects VkImage 0x88d2b500000000e2[] (subresource: aspectMask VK_IMAGE_ASPECT_COLOR_BIT array layer 0, mip level 0) to be in layout VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL--instead, current layout is VK_IMAGE_LAYOUT_UNDEFINED. Objects: 2 [0] 0x24086224ec0, type: 6, name: NULL [1] 0x88d2b500000000e2, type: 10, name: NULL UNASSIGNED-CoreValidation-DrawState-InvalidImageLayout(ERROR / SPEC): msgNum: 1303270965 - Validation Error: [ UNASSIGNED-CoreValidation-DrawState-InvalidImageLayout ] Object 0: handle = 0x24086224ec0, type = VK_OBJECT_TYPE_COMMAND_BUFFER; Object 1: handle = 0x6caffc00000000e3, type = VK_OBJECT_TYPE_IMAGE; | MessageID = 0x4dae5635 | vkQueueSubmit(): pSubmits[0].pCommandBuffers[0] command buffer VkCommandBuffer 0x24086224ec0[] expects VkImage 0x6caffc00000000e3[] (subresource: aspectMask VK_IMAGE_ASPECT_COLOR_BIT array layer 0, mip level 0) to be in layout VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL--instead, current layout is VK_IMAGE_LAYOUT_UNDEFINED. Objects: 2 [0] 0x24086224ec0, type: 6, name: NULL [1] 0x6caffc00000000e3, type: 10, name: NULL ``` because the last access information of image/buffer resources are inaccurate during the second command buffer encoding, since the first command buffer never executed. ## Perf Impact * Performance improvement for first inference of dynamic shape models if actual tensor sizes are much smaller than maximum possible sizes * No impact for non-dynamic shape models Differential Revision: [D76047203](https://our.internmc.facebook.com/intern/diff/D76047203/) [ghstack-poisoned] --- backends/vulkan/runtime/VulkanBackend.cpp | 23 +++++++++++++++---- .../vulkan/runtime/VulkanDelegateHeader.cpp | 4 ++++ .../vulkan/runtime/VulkanDelegateHeader.h | 3 +++ backends/vulkan/runtime/graph/GraphConfig.cpp | 2 ++ backends/vulkan/runtime/graph/GraphConfig.h | 3 +++ .../test/op_tests/utils/gen_computegraph.py | 3 ++- .../vulkan/test/vulkan_compute_api_test.cpp | 6 ----- 7 files changed, 33 insertions(+), 11 deletions(-) diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index 75726ae0892..9b3818c2ed0 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -30,6 +30,8 @@ #include #include +#include + namespace executorch { namespace backends { namespace vulkan { @@ -140,6 +142,14 @@ GraphConfig get_graph_config(ArrayRef& compile_specs) { config.set_memory_layout_override(memory_layout); } + if (strcmp(spec.key, "require_dynamic_shapes") == 0) { + ET_CHECK_MSG(value_size == sizeof(uint8_t), "Unexpected value size!"); + bool value = getBool(value_data); + + if (value) { + config.expect_dynamic_shapes = true; + } + } } #ifdef ET_EVENT_TRACER_ENABLED config.enable_querypool = true; @@ -500,9 +510,12 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface { compute_graph->encode_prepack(); compute_graph->prepack(); - // TODO(ssjia): remove this once we can batch compile compute pipelines - // during prepare(). - compute_graph->encode_execute(); + // If dynamic shapes are not expected, then the command buffer only needs to + // be encoded once. Otherwise, wait until the first inference to encode the + // the command buffer, when actual input shapes are known. + if (!compute_graph->graphconfig().expect_dynamic_shapes) { + compute_graph->encode_execute(); + } return Error::Ok; } @@ -574,7 +587,9 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface { // constants are updated and DynamicDispatchNode can update the compute // shader, global workgroup size, and local workgroup size to perform the // model inference. - if (should_propagate_resize) { + if (should_propagate_resize || + (compute_graph->graphconfig().expect_dynamic_shapes && + compute_graph->execute_count() == 0u)) { compute_graph->propagate_resize(); } diff --git a/backends/vulkan/runtime/VulkanDelegateHeader.cpp b/backends/vulkan/runtime/VulkanDelegateHeader.cpp index 81fd0bbc953..2a235144342 100644 --- a/backends/vulkan/runtime/VulkanDelegateHeader.cpp +++ b/backends/vulkan/runtime/VulkanDelegateHeader.cpp @@ -60,6 +60,10 @@ uint32_t getUInt16LE(const uint8_t* data) { return (uint32_t)data[0] | ((uint32_t)data[1] << 8); } +bool getBool(const uint8_t* data) { + return data[0] != 0; +} + bool VulkanDelegateHeader::is_valid() const { if (header_size < kExpectedSize) { return false; diff --git a/backends/vulkan/runtime/VulkanDelegateHeader.h b/backends/vulkan/runtime/VulkanDelegateHeader.h index 0fc163bbe3c..722f01cbb75 100644 --- a/backends/vulkan/runtime/VulkanDelegateHeader.h +++ b/backends/vulkan/runtime/VulkanDelegateHeader.h @@ -19,6 +19,9 @@ uint64_t getUInt64LE(const uint8_t* data); uint32_t getUInt32LE(const uint8_t* data); uint32_t getUInt16LE(const uint8_t* data); +// Bool is serialized as a single byte +bool getBool(const uint8_t* data); + struct VulkanDelegateHeader { bool is_valid() const; diff --git a/backends/vulkan/runtime/graph/GraphConfig.cpp b/backends/vulkan/runtime/graph/GraphConfig.cpp index 887b46c002a..20606be6a96 100644 --- a/backends/vulkan/runtime/graph/GraphConfig.cpp +++ b/backends/vulkan/runtime/graph/GraphConfig.cpp @@ -63,6 +63,8 @@ GraphConfig::GraphConfig() { enable_local_wg_size_override = false; local_wg_size_override = {}; + + expect_dynamic_shapes = false; } void GraphConfig::set_storage_type_override(utils::StorageType storage_type) { diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h index aa7df2cb413..df2d6d6f2e1 100644 --- a/backends/vulkan/runtime/graph/GraphConfig.h +++ b/backends/vulkan/runtime/graph/GraphConfig.h @@ -33,6 +33,9 @@ struct GraphConfig final { bool enable_local_wg_size_override; utils::uvec3 local_wg_size_override; + // Whether or not the ComputeGraph should expect input shapes to be dynamic + bool expect_dynamic_shapes; + // Generate a default graph config with pre-configured settings explicit GraphConfig(); diff --git a/backends/vulkan/test/op_tests/utils/gen_computegraph.py b/backends/vulkan/test/op_tests/utils/gen_computegraph.py index b24879f660a..b428db67ef9 100644 --- a/backends/vulkan/test/op_tests/utils/gen_computegraph.py +++ b/backends/vulkan/test/op_tests/utils/gen_computegraph.py @@ -618,7 +618,8 @@ def gen_graph_build_code(self, include_declarations: bool = True) -> str: graph_build += f"{self.graph}{self.dot}prepare();\n" graph_build += f"{self.graph}{self.dot}encode_prepack();\n" graph_build += f"{self.graph}{self.dot}prepack();\n" - graph_build += f"{self.graph}{self.dot}encode_execute();\n" + if not self.include_io: + graph_build += f"{self.graph}{self.dot}encode_execute();\n" graph_build += "\n" return graph_build diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 05cec2d2257..6750d8f9b06 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -1536,7 +1536,6 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); graph.prepare(); - graph.encode_execute(); // +3: shared memory allocations for tensors expected_vma_allocation_count += 3; @@ -1743,7 +1742,6 @@ TEST(VulkanComputeGraphTest, test_large_graph) { out.staging = graph.set_output_tensor(out.value); graph.prepare(); - graph.encode_execute(); auto build_end_time = std::chrono::system_clock::now(); @@ -1820,7 +1818,6 @@ void test_clone( out.staging = graph.set_output_tensor(out.value); graph.prepare(); - graph.encode_execute(); fill_vtensor(graph, a, 0.0f, /*iota = */ true); @@ -2955,7 +2952,6 @@ void test_grid_priors( graph.prepare(); graph.encode_prepack(); graph.prepack(); - graph.encode_execute(); vTensorPtr t_in = graph.get_tensor(in.value); vTensorPtr t_out = graph.get_tensor(out.value); @@ -3058,7 +3054,6 @@ void test_transpose_view_mm( graph.prepare(); graph.encode_prepack(); graph.prepack(); - graph.encode_execute(); for (int i = 1; i < 4; i++) { float val_mat1 = i; @@ -3125,7 +3120,6 @@ void test_to_copy() { graph.prepare(); graph.encode_prepack(); graph.prepack(); - graph.encode_execute(); graph.propagate_resize(); graph.execute();