From 5f2f881e9607083b33c710e0e0627615677e2acd Mon Sep 17 00:00:00 2001 From: leaiss Date: Thu, 28 May 2026 10:08:06 -0700 Subject: [PATCH 1/2] =?UTF-8?q?perf(vk=5Fnative):=20chain=20renderer=20atl?= =?UTF-8?q?as-blit=20=E2=86=92=20pre-DP=20submit=20via=20semaphore?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eliminates the first of two CPU stalls per frame in the in-process vk_native compositor path (Tier 3 #10, partial). Before this change every frame did: 1. renderer.draw() → vkQueueSubmit + vkQueueWaitIdle ← CPU stall A 2. compositor pre-DP cmd buffer → vkQueueSubmit + vkQueueWaitIdle ← CPU stall B 3. display_processor.process_atlas() → DP self-submits internally Submissions 1 and 2 are on the same queue (vk->main_queue) and #2 consumes atlas_image written by #1. The waitIdle at end of (1) is doing two jobs: (a) cross-submission memory dependency, (b) cmd-buffer free safety. Both can be expressed correctly with a binary semaphore + fence instead, without ever blocking the CPU. What changed - Renderer now owns a binary VkSemaphore (frame_done_sem) and a VkFence (frame_done_fence). draw()'s vkQueueSubmit signals both, defers the cmd-buffer free to the next call (gated on the fence), and skips the vkQueueWaitIdle entirely. Fence is created in the signaled state so the first draw() doesn't stall on a non-existent prior submit. - New accessor comp_vk_native_renderer_take_frame_done_semaphore() returns the signaled semaphore handle to a single downstream consumer per frame, and clears the pending flag so subsequent submits in the same frame don't double-wait. - comp_vk_native_compositor.c grows a static vk_native_wire_renderer_wait() helper that's called at every per-frame vkQueueSubmit site — the helper takes the pending semaphore (if any) and wires it into the submit info at wait-stage TRANSFER | FRAGMENT_SHADER | COLOR_ATTACHMENT_OUTPUT. Only one submit per frame ever sees a non-NULL handle, so the wiring is correct regardless of which branch (dp_self_submits / not, target path / shared-image path, fallback / weave) actually executes. What didn't change - The vkQueueWaitIdle *after* the pre-DP submit (stall B above) stays. Eliminating it requires the DP vtable to accept a wait-semaphore on its self-submit — a separate plug-in-coordinated change deferred to Tier 3 #10 part 2. - The vk_native_capture_atlas_to_png debug helper still uses vkQueueSubmit + vkQueueWaitIdle for its one-shot CPU readback — not per-frame and not worth the extra state. - Renderer's resize() path keeps vkDeviceWaitIdle. A new guard at the start of draw() drains any unclaimed pending signal (would-be binary-semaphore double-signal UB) so resize can safely interleave with the chain. Validation - Builds clean on Windows (scripts/build_windows.bat build). - Cross-platform code: macOS + Android builds will run via CI. - Hardware perf measurement deferred — emulator hits its Vulkan-device- extension wall before the frame loop runs, and Windows test apps default to service mode which routes around in-process vk_native. The change is correctness-equivalent to the prior waitIdle pattern (same execution ordering on the queue, same memory-visibility guarantees via the wait-stage mask). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../vk_native/comp_vk_native_compositor.c | 50 ++++++++ .../vk_native/comp_vk_native_renderer.c | 118 +++++++++++++++++- .../vk_native/comp_vk_native_renderer.h | 21 ++++ 3 files changed, 186 insertions(+), 3 deletions(-) diff --git a/src/xrt/compositor/vk_native/comp_vk_native_compositor.c b/src/xrt/compositor/vk_native/comp_vk_native_compositor.c index 1b58a99f1..f058bd204 100644 --- a/src/xrt/compositor/vk_native/comp_vk_native_compositor.c +++ b/src/xrt/compositor/vk_native/comp_vk_native_compositor.c @@ -606,6 +606,41 @@ vk_comp(struct xrt_compositor *xc) return (struct comp_vk_native_compositor *)xc; } +/* + * If the renderer signaled a frame-done semaphore on its most recent draw() + * submit and no consumer has taken it yet this frame, wire it into the given + * VkSubmitInfo so this submit waits on it instead of the caller issuing a + * vkQueueWaitIdle between submits. The semaphore handle and wait-stage mask + * are written to the caller-owned storage (must outlive vkQueueSubmit). + * + * When the renderer didn't submit this frame (zero-copy path) or a prior + * caller already consumed it, leaves waitSemaphoreCount at 0 — safe no-op. + */ +static inline void +vk_native_wire_renderer_wait(struct comp_vk_native_compositor *c, + VkSubmitInfo *si, + VkSemaphore *out_sem_storage, + VkPipelineStageFlags *out_stage_storage) +{ + uint64_t sem_u64 = comp_vk_native_renderer_take_frame_done_semaphore(c->renderer); + if (sem_u64 == 0) { + return; + } + *out_sem_storage = (VkSemaphore)(uintptr_t)sem_u64; + // Renderer's atlas is read by every downstream pre-DP / DP / fallback + // path either as a sampled texture (FRAGMENT_SHADER), a copy/blit + // source (TRANSFER), or sampled into a render pass (COLOR_ATTACHMENT + // indirectly). Waiting at the union of those stages is correct and + // only costs scheduling — not a CPU stall. + *out_stage_storage = + VK_PIPELINE_STAGE_TRANSFER_BIT | + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + si->waitSemaphoreCount = 1; + si->pWaitSemaphores = out_sem_storage; + si->pWaitDstStageMask = out_stage_storage; +} + /* * * xrt_compositor member functions @@ -2185,6 +2220,9 @@ vk_compositor_layer_commit(struct xrt_compositor *xc, xrt_graphics_sync_handle_t .commandBufferCount = 1, .pCommandBuffers = &cmd, }; + VkSemaphore pre_wait_sem = VK_NULL_HANDLE; + VkPipelineStageFlags pre_wait_stage = 0; + vk_native_wire_renderer_wait(c, &pre_si, &pre_wait_sem, &pre_wait_stage); res = vk->vkQueueSubmit(vk->main_queue->queue, 1, &pre_si, VK_NULL_HANDLE); if (res == VK_SUCCESS) { vk->vkQueueWaitIdle(vk->main_queue->queue); @@ -2220,6 +2258,9 @@ vk_compositor_layer_commit(struct xrt_compositor *xc, xrt_graphics_sync_handle_t .commandBufferCount = 1, .pCommandBuffers = &cmd, }; + VkSemaphore post_wait_sem = VK_NULL_HANDLE; + VkPipelineStageFlags post_wait_stage = 0; + vk_native_wire_renderer_wait(c, &submit_info, &post_wait_sem, &post_wait_stage); res = vk->vkQueueSubmit(vk->main_queue->queue, 1, &submit_info, VK_NULL_HANDLE); if (res == VK_SUCCESS) { vk->vkQueueWaitIdle(vk->main_queue->queue); @@ -2244,6 +2285,9 @@ vk_compositor_layer_commit(struct xrt_compositor *xc, xrt_graphics_sync_handle_t .commandBufferCount = 1, .pCommandBuffers = &cmd, }; + VkSemaphore fb_wait_sem = VK_NULL_HANDLE; + VkPipelineStageFlags fb_wait_stage = 0; + vk_native_wire_renderer_wait(c, &submit_info, &fb_wait_sem, &fb_wait_stage); res = vk->vkQueueSubmit(vk->main_queue->queue, 1, &submit_info, VK_NULL_HANDLE); if (res == VK_SUCCESS) { vk->vkQueueWaitIdle(vk->main_queue->queue); @@ -2388,6 +2432,9 @@ vk_compositor_layer_commit(struct xrt_compositor *xc, xrt_graphics_sync_handle_t .commandBufferCount = 1, .pCommandBuffers = &cmd, }; + VkSemaphore tgt_pre_wait_sem = VK_NULL_HANDLE; + VkPipelineStageFlags tgt_pre_wait_stage = 0; + vk_native_wire_renderer_wait(c, &pre_si, &tgt_pre_wait_sem, &tgt_pre_wait_stage); res = vk->vkQueueSubmit(vk->main_queue->queue, 1, &pre_si, VK_NULL_HANDLE); if (res == VK_SUCCESS) { vk->vkQueueWaitIdle(vk->main_queue->queue); @@ -2467,6 +2514,9 @@ vk_compositor_layer_commit(struct xrt_compositor *xc, xrt_graphics_sync_handle_t .commandBufferCount = 1, .pCommandBuffers = &cmd, }; + VkSemaphore tgt_wait_sem = VK_NULL_HANDLE; + VkPipelineStageFlags tgt_wait_stage = 0; + vk_native_wire_renderer_wait(c, &submit_info, &tgt_wait_sem, &tgt_wait_stage); res = vk->vkQueueSubmit(vk->main_queue->queue, 1, &submit_info, VK_NULL_HANDLE); if (res == VK_SUCCESS) { diff --git a/src/xrt/compositor/vk_native/comp_vk_native_renderer.c b/src/xrt/compositor/vk_native/comp_vk_native_renderer.c index 82cbc245f..6142a54dc 100644 --- a/src/xrt/compositor/vk_native/comp_vk_native_renderer.c +++ b/src/xrt/compositor/vk_native/comp_vk_native_renderer.c @@ -75,6 +75,27 @@ struct comp_vk_native_renderer //! When true, clear the atlas to alpha=0 (transparent) instead of //! opaque black, so app alpha<1 regions survive to the present (issue #392). bool transparent_background; + + //! Binary semaphore signaled at end of draw()'s queue submit so the + //! compositor's downstream pre-DP submit can chain without a CPU + //! waitIdle between them. Single-use per draw — take_frame_done_semaphore() + //! returns this handle and clears @ref signal_pending so subsequent + //! submits in the same frame don't double-wait. + VkSemaphore frame_done_sem; + + //! Fence signaled alongside @ref frame_done_sem. Waited at the start of + //! the next draw() to enforce CPU-side back-pressure and to know when + //! @ref pending_cmd is safe to free. + VkFence frame_done_fence; + + //! Cmd buffer in flight from the previous draw(). NULL on first call. + //! Freed at the start of the next draw() once @ref frame_done_fence + //! signals, or at destroy() time after vkDeviceWaitIdle. + VkCommandBuffer pending_cmd; + + //! True after draw() signals @ref frame_done_sem and until a downstream + //! submit takes it via take_frame_done_semaphore(). + bool signal_pending; }; static void @@ -234,6 +255,35 @@ comp_vk_native_renderer_create(struct comp_vk_native_compositor *c, return xret; } + // Per-frame sync primitives for the combined-submit chain + // (renderer.draw() signals → compositor's pre-DP submit waits, no + // vkQueueWaitIdle between them). The fence starts signaled so the + // first draw() can free a (non-existent) previous cmd buffer and + // vkResetFences without an initial wait stall. + VkSemaphoreCreateInfo sem_ci = {.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO}; + res = vk->vkCreateSemaphore(vk->device, &sem_ci, NULL, &r->frame_done_sem); + if (res != VK_SUCCESS) { + U_LOG_E("Failed to create renderer frame-done semaphore: %d", res); + destroy_atlas_resources(r); + vk->vkDestroyCommandPool(vk->device, r->cmd_pool, NULL); + free(r); + return XRT_ERROR_VULKAN; + } + + VkFenceCreateInfo fence_ci = { + .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, + .flags = VK_FENCE_CREATE_SIGNALED_BIT, + }; + res = vk->vkCreateFence(vk->device, &fence_ci, NULL, &r->frame_done_fence); + if (res != VK_SUCCESS) { + U_LOG_E("Failed to create renderer frame-done fence: %d", res); + vk->vkDestroySemaphore(vk->device, r->frame_done_sem, NULL); + destroy_atlas_resources(r); + vk->vkDestroyCommandPool(vk->device, r->cmd_pool, NULL); + free(r); + return XRT_ERROR_VULKAN; + } + *out_renderer = r; return XRT_SUCCESS; } @@ -250,6 +300,18 @@ comp_vk_native_renderer_destroy(struct comp_vk_native_renderer **renderer_ptr) vk->vkDeviceWaitIdle(vk->device); + if (r->pending_cmd != VK_NULL_HANDLE) { + vk->vkFreeCommandBuffers(vk->device, r->cmd_pool, 1, &r->pending_cmd); + r->pending_cmd = VK_NULL_HANDLE; + } + + if (r->frame_done_fence != VK_NULL_HANDLE) { + vk->vkDestroyFence(vk->device, r->frame_done_fence, NULL); + } + if (r->frame_done_sem != VK_NULL_HANDLE) { + vk->vkDestroySemaphore(vk->device, r->frame_done_sem, NULL); + } + destroy_atlas_resources(r); if (r->cmd_pool != VK_NULL_HANDLE) { @@ -306,6 +368,35 @@ comp_vk_native_renderer_draw(struct comp_vk_native_renderer *r, (void)left_eye; (void)right_eye; + // Wait for the previous frame's submit to finish so we can safely + // free its cmd buffer and reuse the per-frame fence. Fence starts + // signaled at create time, so the first call returns immediately. + vk->vkWaitForFences(vk->device, 1, &r->frame_done_fence, VK_TRUE, UINT64_MAX); + vk->vkResetFences(vk->device, 1, &r->frame_done_fence); + + if (r->pending_cmd != VK_NULL_HANDLE) { + vk->vkFreeCommandBuffers(vk->device, r->cmd_pool, 1, &r->pending_cmd); + r->pending_cmd = VK_NULL_HANDLE; + } + + // Defensive: if the previous frame's frame-done semaphore was + // signaled but never consumed by a downstream submit (e.g. that + // submit failed, or resize() drained the GPU without going through + // the compositor's frame loop), drain it now via a no-op + // wait-submit so we don't double-signal a binary semaphore on the + // vkQueueSubmit below — that's undefined behavior per Vulkan spec. + if (r->signal_pending) { + VkPipelineStageFlags drain_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + VkSubmitInfo drain = { + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, + .waitSemaphoreCount = 1, + .pWaitSemaphores = &r->frame_done_sem, + .pWaitDstStageMask = &drain_stage, + }; + vk->vkQueueSubmit(vk->main_queue->queue, 1, &drain, VK_NULL_HANDLE); + r->signal_pending = false; + } + VkCommandBufferAllocateInfo alloc_info = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, .commandPool = r->cmd_pool, @@ -460,21 +551,32 @@ comp_vk_native_renderer_draw(struct comp_vk_native_renderer *r, vk->vkEndCommandBuffer(cmd); + // Signal the frame-done semaphore on submit so the compositor's + // pre-DP submit (which reads from atlas_image) can chain after us + // without a CPU vkQueueWaitIdle. The fence catches the same event + // CPU-side so the next draw() can safely free this cmd buffer. VkSubmitInfo submit_info = { .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, .commandBufferCount = 1, .pCommandBuffers = &cmd, + .signalSemaphoreCount = 1, + .pSignalSemaphores = &r->frame_done_sem, }; - res = vk->vkQueueSubmit(vk->main_queue->queue, 1, &submit_info, VK_NULL_HANDLE); + res = vk->vkQueueSubmit(vk->main_queue->queue, 1, &submit_info, r->frame_done_fence); if (res != VK_SUCCESS) { U_LOG_E("Failed to submit renderer commands: %d", res); vk->vkFreeCommandBuffers(vk->device, r->cmd_pool, 1, &cmd); + // Re-signal the fence so the next draw() doesn't deadlock waiting + // on a fence that vkQueueSubmit refused to associate with a batch. + vk->vkResetFences(vk->device, 1, &r->frame_done_fence); + VkSubmitInfo signal_only = {.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO}; + vk->vkQueueSubmit(vk->main_queue->queue, 1, &signal_only, r->frame_done_fence); return XRT_ERROR_VULKAN; } - vk->vkQueueWaitIdle(vk->main_queue->queue); - vk->vkFreeCommandBuffers(vk->device, r->cmd_pool, 1, &cmd); + r->pending_cmd = cmd; + r->signal_pending = true; return XRT_SUCCESS; } @@ -700,3 +802,13 @@ comp_vk_native_renderer_set_transparent(struct comp_vk_native_renderer *r, bool { r->transparent_background = transparent_background; } + +uint64_t +comp_vk_native_renderer_take_frame_done_semaphore(struct comp_vk_native_renderer *r) +{ + if (!r->signal_pending) { + return 0; + } + r->signal_pending = false; + return (uint64_t)(uintptr_t)r->frame_done_sem; +} diff --git a/src/xrt/compositor/vk_native/comp_vk_native_renderer.h b/src/xrt/compositor/vk_native/comp_vk_native_renderer.h index 51d6efcd6..628b07ef1 100644 --- a/src/xrt/compositor/vk_native/comp_vk_native_renderer.h +++ b/src/xrt/compositor/vk_native/comp_vk_native_renderer.h @@ -251,6 +251,27 @@ void comp_vk_native_renderer_set_transparent(struct comp_vk_native_renderer *renderer, bool transparent_background); +/*! + * Take ownership of the binary semaphore signaled by the most recent + * @ref comp_vk_native_renderer_draw submit. The compositor passes this + * as a wait semaphore on its downstream pre-DP submit, replacing the + * previous vkQueueWaitIdle between renderer and compositor submits. + * + * Returns VK_NULL_HANDLE (cast to uint64_t) when there is no pending + * signal — e.g. on the zero-copy path where draw() was skipped, or when + * a previous caller in the same frame already consumed it. + * + * Single-consumer per draw() call. Caller is responsible for waiting on + * the returned handle at an appropriate pipeline stage. + * + * @param renderer The renderer. + * @return VkSemaphore as uint64_t, or 0 (VK_NULL_HANDLE) when none pending. + * + * @ingroup comp_vk_native + */ +uint64_t +comp_vk_native_renderer_take_frame_done_semaphore(struct comp_vk_native_renderer *renderer); + #ifdef __cplusplus } #endif From cb182adf43704df77ceb0a69550ffd557c9640c2 Mon Sep 17 00:00:00 2001 From: leaiss Date: Thu, 28 May 2026 11:27:00 -0700 Subject: [PATCH 2/2] fix(vk_native renderer): defer fence reset to avoid deadlock on alloc-fail path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bug-fixes surfaced during self-audit of the previous commit: 1. **Fence-reset/deadlock race.** The previous commit reset frame_done_fence at the *top* of draw(), then proceeded to vkAllocateCommandBuffers. If the alloc failed (or any early return between the reset and the main submit fired), the fence was left in the *unsignaled* state with no submit to ever signal it. The next draw() would block on vkWaitForFences forever — runtime hang on the next frame after any one-off transient failure. Fix: defer the vkResetFences call to just before the main vkQueueSubmit. All early-return paths now leave the fence in its previous (signaled) state, so the next draw() proceeds normally. 2. **Drain-failure double-signal risk.** The semaphore-drain block at the top of draw() set signal_pending = false unconditionally, even if the no-op drain submit returned non-VK_SUCCESS. With signal_pending cleared, the next main submit would queue a signal operation on a binary semaphore that may still have its previous signal pending — undefined behavior per Vulkan spec. Fix: only clear signal_pending when vkQueueSubmit returns VK_SUCCESS. On failure, log + return XRT_ERROR_VULKAN so the caller knows the renderer isn't in a usable state this frame. Builds clean on Windows. The original change (#360) passed CI on Windows + macOS + Android; this commit re-pushes for re-validation. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../vk_native/comp_vk_native_renderer.c | 33 ++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/xrt/compositor/vk_native/comp_vk_native_renderer.c b/src/xrt/compositor/vk_native/comp_vk_native_renderer.c index 6142a54dc..03e792e83 100644 --- a/src/xrt/compositor/vk_native/comp_vk_native_renderer.c +++ b/src/xrt/compositor/vk_native/comp_vk_native_renderer.c @@ -369,10 +369,14 @@ comp_vk_native_renderer_draw(struct comp_vk_native_renderer *r, (void)right_eye; // Wait for the previous frame's submit to finish so we can safely - // free its cmd buffer and reuse the per-frame fence. Fence starts - // signaled at create time, so the first call returns immediately. + // free its cmd buffer and (later) reuse the per-frame fence. Fence + // starts signaled at create time so the first call returns + // immediately. We DON'T reset the fence here — that's deferred + // until just before the main vkQueueSubmit so any early-return + // failure (cmd-buffer alloc fails, drain fails, etc.) leaves the + // fence in its signaled state. Otherwise the next draw() would + // vkWaitForFences forever on a fence nothing ever signals. vk->vkWaitForFences(vk->device, 1, &r->frame_done_fence, VK_TRUE, UINT64_MAX); - vk->vkResetFences(vk->device, 1, &r->frame_done_fence); if (r->pending_cmd != VK_NULL_HANDLE) { vk->vkFreeCommandBuffers(vk->device, r->cmd_pool, 1, &r->pending_cmd); @@ -385,6 +389,8 @@ comp_vk_native_renderer_draw(struct comp_vk_native_renderer *r, // the compositor's frame loop), drain it now via a no-op // wait-submit so we don't double-signal a binary semaphore on the // vkQueueSubmit below — that's undefined behavior per Vulkan spec. + // Only clear signal_pending if the drain actually went through; + // otherwise the next signaling submit would risk the double-signal UB. if (r->signal_pending) { VkPipelineStageFlags drain_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; VkSubmitInfo drain = { @@ -393,8 +399,13 @@ comp_vk_native_renderer_draw(struct comp_vk_native_renderer *r, .pWaitSemaphores = &r->frame_done_sem, .pWaitDstStageMask = &drain_stage, }; - vk->vkQueueSubmit(vk->main_queue->queue, 1, &drain, VK_NULL_HANDLE); - r->signal_pending = false; + VkResult drain_res = vk->vkQueueSubmit(vk->main_queue->queue, 1, &drain, VK_NULL_HANDLE); + if (drain_res == VK_SUCCESS) { + r->signal_pending = false; + } else { + U_LOG_E("Failed to drain stuck frame-done semaphore: %d", drain_res); + return XRT_ERROR_VULKAN; + } } VkCommandBufferAllocateInfo alloc_info = { @@ -555,6 +566,12 @@ comp_vk_native_renderer_draw(struct comp_vk_native_renderer *r, // pre-DP submit (which reads from atlas_image) can chain after us // without a CPU vkQueueWaitIdle. The fence catches the same event // CPU-side so the next draw() can safely free this cmd buffer. + // + // Reset the fence here (not at the top of draw) so any early-return + // failure above leaves the fence in its previous signaled state + // rather than deadlocking the next draw() on a fence nothing signals. + vk->vkResetFences(vk->device, 1, &r->frame_done_fence); + VkSubmitInfo submit_info = { .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, .commandBufferCount = 1, @@ -567,9 +584,9 @@ comp_vk_native_renderer_draw(struct comp_vk_native_renderer *r, if (res != VK_SUCCESS) { U_LOG_E("Failed to submit renderer commands: %d", res); vk->vkFreeCommandBuffers(vk->device, r->cmd_pool, 1, &cmd); - // Re-signal the fence so the next draw() doesn't deadlock waiting - // on a fence that vkQueueSubmit refused to associate with a batch. - vk->vkResetFences(vk->device, 1, &r->frame_done_fence); + // Re-signal the fence with a no-op submit so the next draw() + // doesn't deadlock waiting on a fence that vkQueueSubmit + // refused to associate with a batch. VkSubmitInfo signal_only = {.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO}; vk->vkQueueSubmit(vk->main_queue->queue, 1, &signal_only, r->frame_done_fence); return XRT_ERROR_VULKAN;