diff --git a/src/viz/AGENTS.md b/src/viz/AGENTS.md index 67735dedd..ec80d7b2b 100644 --- a/src/viz/AGENTS.md +++ b/src/viz/AGENTS.md @@ -16,27 +16,44 @@ single sub-module. Each sub-module is its own static library with its own sibling `_tests/` directory: - **`viz/core/`** — foundational types + Vulkan/CUDA infrastructure. - Library: `viz_core`. Today: `VkContext`, `VizBuffer`, `Pose3D`, `Fov`, - `Resolution`, `ViewInfo`, `PixelFormat`, `RenderTarget`, `FrameSync`, - `HostImage`, `DeviceImage`. `HostImage` / `DeviceImage` are the - symmetric pair of owning 2D pixel buffers (CPU bytes vs CUDA-Vulkan - interop) — both expose `VizBuffer view()` so generic helpers branch - on `VizBuffer::space`. Math types (`glm::vec3`, `glm::quat`, - `glm::mat4`) come from GLM 1.0.1 (FetchContent in - `deps/third_party/`); use `glm::value_ptr(mat)` to get a raw `float*` - for Vulkan / CUDA upload (POD-equivalent layout, no copy). - CUDA-Vulkan interop requires CUDA Toolkit at link time - (`CUDAToolkit::cudart`). `VkContext::init()` matches the current - CUDA device to the chosen Vulkan physical device by UUID — every - viz_core type can assume CUDA and Vulkan are talking to the same - GPU without re-doing the match. -- **`viz/layers/`** — `LayerBase` and concrete layers (`QuadLayer`, etc.). - Library: `viz_layers` (INTERFACE / header-only today; promoted to - STATIC when the first concrete layer ships). Depends on `viz_core`. - Test-only fixture layers (`ClearRectLayer`, future `ColoredQuadLayer`) - live in `viz/layers_tests/cpp/inc/viz/layers/testing/` and are exposed - via the `viz::layers_testing` static library — used by other test - binaries (e.g. `viz_session_tests`) to compose into a `VizSession`. + Library: `viz_core`. Today: `VkContext`, `VizBuffer`, `Pose3D`, + `Fov`, `Resolution`, `ViewInfo`, `PixelFormat`, `RenderTarget`, + `FrameSync`, `HostImage`, `DeviceImage`. `HostImage` owns CPU bytes + and exposes a `VizBuffer view()`; `DeviceImage` owns CUDA-Vulkan + interop memory (VkImage + cudaArray_t) plus one timeline semaphore + (`cuda_done_writing`) that layers expose to the compositor for + fine-grained producer→consumer sync. The reverse direction (consumer + done reading) is handled at the layer level by buffering enough + slots that the producer never targets a still-in-flight read. + Consumed via discrete accessors — no `view()` because `cudaArray_t` + is opaque tiled memory, not a CUDA device pointer. `VizBuffer` + covers linear pointer-backed memory (CPU bytes / CUDA device + pointer; exposes `__cuda_array_interface__` / `__array_interface__` + in Python). Math types (`glm::vec3`, `glm::quat`, `glm::mat4`) come + from GLM 1.0.1 (FetchContent in `deps/third_party/`); use + `glm::value_ptr(mat)` to get a raw `float*` for Vulkan / CUDA upload + (POD-equivalent layout, no copy). CUDA-Vulkan interop requires CUDA + Toolkit at link time (`CUDAToolkit::cudart`). `VkContext::init()` + matches the current CUDA device to the chosen Vulkan physical + device by UUID — every viz_core type can assume CUDA and Vulkan + are talking to the same GPU without re-doing the match. +- **`viz/layers/`** — `LayerBase` and concrete layers. Library: + `viz_layers` (STATIC). Depends on `viz_core` + `viz_shaders`. Today: + `QuadLayer` (textured fullscreen quad, CUDA-fed via a 3-slot + mailbox of `DeviceImage`s; producer side is `submit(VizBuffer)` and + the renderer always samples the most recently published slot). + Pipelines built per-layer using the driver-side `VkPipelineCache` + from `VkContext`. **Deferred:** a zero-copy `acquire`/`release` + variant for producers that can write directly into a tiled + `cudaArray_t` (NVDEC, custom CUDA kernels). The mailbox internals + already track slot ownership, so the addition is local to + `QuadLayer`; revisit when a real producer demands it. Open design + questions are captured under "Future: zero-copy acquire/release" + in `DESIGN.md`. + Test-only fixture layers (`ClearRectLayer`, `ThrowingLayer`) live in + `viz/layers_tests/cpp/inc/viz/layers/testing/` and are exposed via + the `viz::layers_testing` static library — used by `viz_session_tests` + to compose into a `VizSession`. - **`viz/session/`** — `VizSession`, `VizCompositor`, `FrameInfo`, `FrameTimingStats`, `SessionState`, display backends (today: offscreen only; window/XR added by their respective backends). Library: @@ -75,11 +92,10 @@ Build paths that ship viz (the wheel CI on Linux + Windows) pass When `BUILD_VIZ=ON` the build machine must have: - **Vulkan headers + loader**: `libvulkan-dev` on Linux, LunarG SDK on Windows. -- **CUDA Toolkit** (cudart for link, nvcc not strictly required today - but expected to be needed for kernels in M3b+): apt - `nvidia-cuda-toolkit` or the official NVIDIA installer / CI action - (`Jimver/cuda-toolkit`). The wheel excludes `libcuda.so.1` — - consumers supply it via NVIDIA driver. +- **CUDA Toolkit** (cudart for link; nvcc once we ship CUDA kernels): + apt `nvidia-cuda-toolkit` or the official NVIDIA installer / + CI action (`Jimver/cuda-toolkit`). The wheel excludes + `libcuda.so.1` — consumers supply it via NVIDIA driver. - **glslangValidator** for shader compilation: `glslang-tools` apt package on Linux, `brew install glslang` on macOS, ships with the Vulkan SDK on Windows. diff --git a/src/viz/core/cpp/device_image.cpp b/src/viz/core/cpp/device_image.cpp index a02053b58..d6b53246f 100644 --- a/src/viz/core/cpp/device_image.cpp +++ b/src/viz/core/cpp/device_image.cpp @@ -7,11 +7,9 @@ #include #include -// Posix close() lives in on Linux/macOS; Windows uses _close() -// from . The fd-close path is unreachable at runtime on Windows -// (vkGetMemoryFdKHR isn't available there — import_to_cuda throws before -// memory_fd_ is ever assigned), but the code still has to compile under -// MSVC for the experimental Windows build. +// Posix close() vs Windows _close() shim — the fd-close path is +// dead on Windows (vkGetMemoryFdKHR isn't available there) but +// still has to compile under MSVC. #ifdef _WIN32 # include namespace @@ -68,14 +66,20 @@ uint32_t find_memory_type(VkPhysicalDevice physical_device, uint32_t type_bits, throw std::runtime_error("DeviceImage: no Vulkan memory type matching requested properties"); } -VkFormat to_vk_format(PixelFormat format) +// Storage-side Vulkan format for the underlying VkImage / VkDeviceMemory. +// We keep the storage UNORM and create a separate SRGB sampling view +// (image is created with VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) so: +// - CUDA writes raw bytes (no implicit gamma transform). +// - Vulkan samples through the SRGB view → sampler decodes +// sRGB -> linear. +// - Fragment writes linear -> sRGB encode at the attachment. +// Net effect: arbitrary RGBA byte values round-trip exactly through +// CUDA -> Vulkan -> readback. +VkFormat to_vk_storage_format(PixelFormat format) { switch (format) { case PixelFormat::kRGBA8: - // UNORM (not SRGB) so CUDA writes round-trip without an - // implicit sRGB encode on the Vulkan side. Color management - // is the layer's concern. return VK_FORMAT_R8G8B8A8_UNORM; case PixelFormat::kD32F: return VK_FORMAT_D32_SFLOAT; @@ -83,6 +87,18 @@ VkFormat to_vk_format(PixelFormat format) throw std::runtime_error("DeviceImage: unsupported PixelFormat"); } +VkFormat to_vk_view_format(PixelFormat format) +{ + switch (format) + { + case PixelFormat::kRGBA8: + return VK_FORMAT_R8G8B8A8_SRGB; + case PixelFormat::kD32F: + return VK_FORMAT_D32_SFLOAT; + } + throw std::runtime_error("DeviceImage: unsupported PixelFormat"); +} + cudaChannelFormatDesc to_cuda_format(PixelFormat format) { switch (format) @@ -107,13 +123,21 @@ std::unique_ptr DeviceImage::create(const VkContext& ctx, Resolutio { throw std::invalid_argument("DeviceImage: resolution must be non-zero"); } + if (format != PixelFormat::kRGBA8) + { + // kD32F is reserved for ProjectionLayer's depth path. The + // CUDA-Vulkan interop contract for a depth image (sample + // semantics, layout transitions, color-space view) is not + // worked out yet, so refuse to half-build it. + throw std::invalid_argument("DeviceImage: only PixelFormat::kRGBA8 is supported"); + } std::unique_ptr img(new DeviceImage(ctx, resolution, format)); img->init(); return img; } DeviceImage::DeviceImage(const VkContext& ctx, Resolution resolution, PixelFormat format) - : ctx_(&ctx), resolution_(resolution), format_(format), vk_format_(to_vk_format(format)) + : ctx_(&ctx), resolution_(resolution), format_(format), vk_format_(to_vk_view_format(format)) { } @@ -129,6 +153,7 @@ void DeviceImage::init() create_vk_image_with_external_memory(); create_vk_image_view(); import_to_cuda(); + create_interop_semaphores(); transition_to_shader_read(); } catch (...) @@ -140,23 +165,24 @@ void DeviceImage::init() void DeviceImage::destroy() { - // Pin CUDA device for this thread so the CUDA frees below land on - // the right device even if destroy() runs on a thread that never - // ran VkContext::init(). Best-effort — destructor must not throw. + // Pin CUDA device on the destroying thread (best-effort; we + // can't throw out of a destructor). if (ctx_ != nullptr && ctx_->cuda_device_id() >= 0) { (void)cudaSetDevice(ctx_->cuda_device_id()); } - // CUDA side first; CUDA holds a dup'd handle on the underlying - // memory, so the VkDeviceMemory must outlive the CUDA mapping. - // cudaDeviceSynchronize ensures any caller-issued async CUDA work - // (e.g. cudaMemcpy2DToArrayAsync) has retired before we free the - // array — otherwise CUDA may UAF its own staging. - if (cuda_mipmapped_array_ != nullptr || cuda_external_memory_ != nullptr) + // CUDA side first — VkDeviceMemory must outlive the CUDA + // mapping. Sync drains any caller-issued async work first. + if (cuda_mipmapped_array_ != nullptr || cuda_external_memory_ != nullptr || cuda_cuda_done_writing_ != nullptr) { (void)cudaDeviceSynchronize(); } + if (cuda_cuda_done_writing_ != nullptr) + { + (void)cudaDestroyExternalSemaphore(cuda_cuda_done_writing_); + cuda_cuda_done_writing_ = nullptr; + } if (cuda_mipmapped_array_ != nullptr) { (void)cudaFreeMipmappedArray(cuda_mipmapped_array_); @@ -170,9 +196,8 @@ void DeviceImage::destroy() } if (memory_fd_ >= 0) { - // CUDA dups the fd internally on import, so we close our copy. - // If import failed before our explicit close, fd_ may still - // hold our copy — close it here. + // CUDA dup'd the fd on import; close ours. Also handles the + // import-failed-before-close case. close_fd(memory_fd_); memory_fd_ = -1; } @@ -189,6 +214,11 @@ void DeviceImage::destroy() // Wait for all GPU work to retire before tearing down Vulkan // resources. (void)vkDeviceWaitIdle(device); + if (cuda_done_writing_ != VK_NULL_HANDLE) + { + vkDestroySemaphore(device, cuda_done_writing_, nullptr); + cuda_done_writing_ = VK_NULL_HANDLE; + } if (command_pool_ != VK_NULL_HANDLE) { vkDestroyCommandPool(device, command_pool_, nullptr); @@ -212,18 +242,6 @@ void DeviceImage::destroy() current_layout_ = VK_IMAGE_LAYOUT_UNDEFINED; } -VizBuffer DeviceImage::view() const noexcept -{ - VizBuffer b; - b.data = static_cast(cuda_array_); - b.width = resolution_.width; - b.height = resolution_.height; - b.format = format_; - b.pitch = static_cast(resolution_.width) * bytes_per_pixel(format_); - b.space = MemorySpace::kDevice; - return b; -} - void DeviceImage::create_vk_image_with_external_memory() { const VkDevice device = ctx_->device(); @@ -239,13 +257,16 @@ void DeviceImage::create_vk_image_with_external_memory() info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; info.pNext = &ext_image_info; info.imageType = VK_IMAGE_TYPE_2D; - info.format = vk_format_; + // Storage in linear-space format (UNORM); we'll attach the SRGB + // view in create_vk_image_view(). VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT + // is what allows view format != image format among compatible + // formats (UNORM <-> SRGB are in the same compatibility class). + info.flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; + info.format = to_vk_storage_format(format_); info.extent = { resolution_.width, resolution_.height, 1 }; - info.mipLevels = 1; // Single level — when minification moiré shows up in - // XR distance views, expose mipLevels via Config and - // generate the chain via vkCmdBlitImage pre-render. - // Anisotropic filtering on the sampler is the cheaper - // first line of defense. + info.mipLevels = 1; // Single level. If XR distance views show + // moiré, expose mipLevels via Config and + // generate via vkCmdBlitImage pre-render. info.arrayLayers = 1; info.samples = VK_SAMPLE_COUNT_1_BIT; info.tiling = VK_IMAGE_TILING_OPTIMAL; @@ -258,10 +279,8 @@ void DeviceImage::create_vk_image_with_external_memory() VkMemoryRequirements reqs; vkGetImageMemoryRequirements(device, image_, &reqs); - // Memory backing the image: device-local + exportable as POSIX fd. - // No VkMemoryDedicatedAllocateInfo / cudaExternalMemoryDedicated — - // a generic allocation works for sampled 2D images and avoids the - // dedicated-allocation extension wiring. + // Device-local + exportable as POSIX fd. Generic allocation + // (no VkMemoryDedicatedAllocateInfo) suffices for sampled 2D. VkExportMemoryAllocateInfo export_info{}; export_info.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO; export_info.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; @@ -314,10 +333,8 @@ void DeviceImage::create_vk_image_view() void DeviceImage::import_to_cuda() { - // cudaSetDevice is per-host-thread; VkContext set it on the init - // thread, but DeviceImage::create() may run on a different one. - // Re-pin here so cudaImportExternalMemory / GetMappedMipmappedArray - // talk to the same physical GPU as Vulkan. + // cudaSetDevice is per-host-thread; VkContext sets it on the + // init thread, re-pin here for worker-thread create() callers. check_cuda(cudaSetDevice(ctx_->cuda_device_id()), "cudaSetDevice"); VkMemoryRequirements reqs; @@ -347,6 +364,76 @@ void DeviceImage::import_to_cuda() check_cuda(cudaGetMipmappedArrayLevel(&cuda_array_, cuda_mipmapped_array_, 0), "cudaGetMipmappedArrayLevel"); } +void DeviceImage::create_interop_semaphores() +{ + const VkDevice device = ctx_->device(); + + auto vkGetSemaphoreFdKHR = + reinterpret_cast(vkGetDeviceProcAddr(device, "vkGetSemaphoreFdKHR")); + if (vkGetSemaphoreFdKHR == nullptr) + { + throw std::runtime_error( + "DeviceImage: vkGetSemaphoreFdKHR not available " + "(VK_KHR_external_semaphore_fd not enabled?)"); + } + + // Timeline semaphore (initial value 0) exported via OPAQUE_FD and + // imported into CUDA. CUDA dups the fd internally; we close ours + // after the import. + VkSemaphoreTypeCreateInfo type_info{}; + type_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO; + type_info.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE; + type_info.initialValue = 0; + + VkExportSemaphoreCreateInfo export_info{}; + export_info.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO; + export_info.pNext = &type_info; + export_info.handleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; + + VkSemaphoreCreateInfo info{}; + info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + info.pNext = &export_info; + check_vk(vkCreateSemaphore(device, &info, nullptr, &cuda_done_writing_), "vkCreateSemaphore"); + + int fd = -1; + VkSemaphoreGetFdInfoKHR fd_info{}; + fd_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR; + fd_info.semaphore = cuda_done_writing_; + fd_info.handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; + check_vk(vkGetSemaphoreFdKHR(device, &fd_info, &fd), "vkGetSemaphoreFdKHR"); + + cudaExternalSemaphoreHandleDesc ext_desc{}; + ext_desc.type = cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd; + ext_desc.handle.fd = fd; + const cudaError_t err = cudaImportExternalSemaphore(&cuda_cuda_done_writing_, &ext_desc); + if (err != cudaSuccess) + { + close_fd(fd); + throw std::runtime_error(std::string("DeviceImage: cudaImportExternalSemaphore(cuda_done_writing) failed: ") + + cudaGetErrorString(err)); + } + close_fd(fd); +} + +void DeviceImage::cuda_signal_write_done(cudaStream_t stream) +{ + // Reserve, signal, commit on success. Failed signal leaves _value_ + // at the last successfully signaled value (consumer keeps a valid + // wait target; failed frame is dropped). Single producer per + // DeviceImage → reserved is always > _value_, so a release store + // suffices. + const uint64_t reserved = cuda_done_writing_next_.fetch_add(1, std::memory_order_acq_rel) + 1; + cudaExternalSemaphoreSignalParams params{}; + params.params.fence.value = reserved; + const cudaError_t err = cudaSignalExternalSemaphoresAsync(&cuda_cuda_done_writing_, ¶ms, 1, stream); + if (err != cudaSuccess) + { + throw std::runtime_error(std::string("DeviceImage: cudaSignalExternalSemaphoresAsync(cuda_done_writing) failed: ") + + cudaGetErrorString(err)); + } + cuda_done_writing_value_.store(reserved, std::memory_order_release); +} + void DeviceImage::transition_to_shader_read() { if (current_layout_ == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) diff --git a/src/viz/core/cpp/inc/viz/core/device_image.hpp b/src/viz/core/cpp/inc/viz/core/device_image.hpp index 5fd82e364..a98c8876c 100644 --- a/src/viz/core/cpp/inc/viz/core/device_image.hpp +++ b/src/viz/core/cpp/inc/viz/core/device_image.hpp @@ -3,10 +3,12 @@ #pragma once -#include +#include // PixelFormat — used in API signatures #include #include +#include +#include #include #include @@ -19,12 +21,21 @@ class VkContext; // (optimal tiling, sampled + transfer-dst); the backing VkDeviceMemory // is exported via VK_KHR_external_memory_fd and imported into CUDA as // a cudaArray_t. CUDA writes via cuda_array(); Vulkan samples via -// vk_image(). Symmetric counterpart to HostImage; both expose -// VizBuffer view() so helpers branch on VizBuffer::space. +// vk_image(). // -// Synchronization is heavyweight today (cudaDeviceSynchronize + -// vkQueueWaitIdle); paired acquire / release semaphores arrive with -// QuadLayer. CUDA/Vulkan device matching is handled by VkContext. +// Conceptually paired with HostImage (CPU bytes vs GPU interop bytes), +// but they don't share a view() return type: a cudaArray_t is opaque +// tiled GPU memory and is NOT a CUDA device pointer, so wrapping it +// as a VizBuffer would lie about that type's contract. Callers consume +// DeviceImage via discrete accessors instead. +// +// Producer→consumer synchronization is one-way: a Vulkan timeline +// semaphore exported to CUDA. CUDA increments cuda_done_writing +// after filling; Vulkan waits for the latest known value before +// sampling. The reverse direction is the producer's problem to solve +// at a higher level (e.g. QuadLayer's mailbox owns enough buffers +// that producer writes never collide with in-flight Vulkan reads). +// CUDA / Vulkan device matching is handled by VkContext. class DeviceImage { public: @@ -40,11 +51,6 @@ class DeviceImage DeviceImage(DeviceImage&&) = delete; DeviceImage& operator=(DeviceImage&&) = delete; - // VizBuffer view (kDevice). `data` is the cudaArray_t cast to - // void*; it's an opaque CUDA handle, not a raw device pointer — - // use cuda_array() with cudaMemcpy2DToArrayAsync to write. - VizBuffer view() const noexcept; - // CUDA write target. Lifetime tied to this DeviceImage. cudaArray_t cuda_array() const noexcept { @@ -66,6 +72,29 @@ class DeviceImage return vk_format_; } + // Timeline semaphore handle. Vulkan waits on this with the + // value returned by cuda_done_writing_value() before sampling. + VkSemaphore cuda_done_writing() const noexcept + { + return cuda_done_writing_; + } + + // Latest value CUDA has signaled successfully. Vulkan uses this + // as the wait target. Advanced by cuda_signal_write_done() only + // after the underlying cudaSignalExternalSemaphoresAsync returns + // success, so a failed signal never poisons the timeline. + uint64_t cuda_done_writing_value() const noexcept + { + return cuda_done_writing_value_.load(std::memory_order_acquire); + } + + // CUDA-side primitive. Reserves the next monotonic value, queues + // the signal on `stream`, and commits the value on success. + // Throws std::runtime_error on cuda*Async failure; failure leaves + // the public counter un-advanced so the next call is consistent + // with the GPU's actual semaphore state. + void cuda_signal_write_done(cudaStream_t stream); + Resolution resolution() const noexcept { return resolution_; @@ -88,6 +117,7 @@ class DeviceImage void create_vk_image_with_external_memory(); void create_vk_image_view(); void import_to_cuda(); + void create_interop_semaphores(); void run_one_shot_layout_transition(VkImageLayout old_layout, VkImageLayout new_layout, @@ -113,6 +143,16 @@ class DeviceImage cudaExternalMemory_t cuda_external_memory_ = nullptr; cudaMipmappedArray_t cuda_mipmapped_array_ = nullptr; cudaArray_t cuda_array_ = nullptr; // Level-0 view, non-owning. + + // Producer→consumer timeline semaphore exported via + // VK_KHR_external_semaphore_fd and imported into CUDA. Two atomic + // counters (next reservation, last committed) so a failed + // cudaSignal can't leave the public value pointing at something + // that was never signaled. + VkSemaphore cuda_done_writing_ = VK_NULL_HANDLE; + cudaExternalSemaphore_t cuda_cuda_done_writing_ = nullptr; + std::atomic cuda_done_writing_next_{ 0 }; + std::atomic cuda_done_writing_value_{ 0 }; }; } // namespace viz diff --git a/src/viz/core/cpp/inc/viz/core/viz_buffer.hpp b/src/viz/core/cpp/inc/viz/core/viz_buffer.hpp index c36f0aec4..12a66213a 100644 --- a/src/viz/core/cpp/inc/viz/core/viz_buffer.hpp +++ b/src/viz/core/cpp/inc/viz/core/viz_buffer.hpp @@ -31,10 +31,12 @@ enum class MemorySpace // Lightweight, non-owning reference to a 2D pixel buffer. // -// Carries no ownership: it does not allocate or free memory. For Mode B -// submission (acquire/release), the layer owns the underlying interop -// buffer; VizBuffer is a view into it. For host readback, HostImage owns -// the bytes and exposes a VizBuffer view via HostImage::view(). +// Carries no ownership: it does not allocate or free memory. Producers +// fill VizBuffer with a pointer to memory they own (CUDA device buffer, +// host array) and pass it to QuadLayer::submit(); the layer copies the +// pixels into one of its internal interop slots. For host readback, +// HostImage owns the bytes and exposes a VizBuffer view via +// HostImage::view(). // // In Python, VizBuffer with space == kDevice exposes // __cuda_array_interface__ so CuPy can wrap it zero-copy. Host buffers diff --git a/src/viz/core/cpp/inc/viz/core/vk_context.hpp b/src/viz/core/cpp/inc/viz/core/vk_context.hpp index 5f12c8690..a4014084b 100644 --- a/src/viz/core/cpp/inc/viz/core/vk_context.hpp +++ b/src/viz/core/cpp/inc/viz/core/vk_context.hpp @@ -44,13 +44,9 @@ struct PhysicalDeviceInfo // // VkContext owns the Vulkan handles and tears them down on destruction. // -// CUDA-Vulkan device matching: as part of init(), VkContext queries -// the chosen physical device's UUID and calls cudaSetDevice() on the -// matching CUDA device. This is required for CUDA-Vulkan interop -// (cudaImportExternalMemory) to succeed on multi-GPU machines, and -// it makes VkContext the single chokepoint for "which GPU is Televiz -// on" — every viz_core type that touches CUDA can assume the current -// CUDA device matches the Vulkan one. +// init() also matches the current CUDA device to the chosen Vulkan +// physical device by UUID, so every viz_core type that touches CUDA +// can assume the two APIs are on the same GPU. class VkContext { public: @@ -104,13 +100,14 @@ class VkContext uint32_t queue_family_index() const noexcept; VkQueue queue() const noexcept; - // CUDA device id matched to the chosen Vulkan physical device by - // UUID at init() time. Useful for callers that need to ensure - // their thread is on the right CUDA device before issuing CUDA - // calls — cudaSetDevice is per-host-thread, so a CudaTexture / - // DeviceImage created on a worker thread must call - // cudaSetDevice(ctx.cuda_device_id()) before any CUDA API. Returns - // -1 before init() has run. + // Process-wide VkPipelineCache for driver-side compiled-state + // reuse across pipeline creations. VK_NULL_HANDLE before init(). + VkPipelineCache pipeline_cache() const noexcept; + + // CUDA device id matched to the chosen Vulkan physical device. + // Layers created on worker threads should + // cudaSetDevice(ctx.cuda_device_id()) before any CUDA call — + // cudaSetDevice is per-host-thread. Returns -1 before init(). int cuda_device_id() const noexcept; // Enumerates all Vulkan-capable physical devices and returns their @@ -127,6 +124,7 @@ class VkContext void select_physical_device(const Config& config); void create_logical_device(const Config& config); void match_cuda_device_to_vulkan(); + void create_pipeline_cache(); bool initialized_ = false; bool validation_enabled_ = false; @@ -135,6 +133,7 @@ class VkContext VkDevice device_ = VK_NULL_HANDLE; uint32_t queue_family_index_ = UINT32_MAX; VkQueue queue_ = VK_NULL_HANDLE; + VkPipelineCache pipeline_cache_ = VK_NULL_HANDLE; int cuda_device_id_ = -1; }; diff --git a/src/viz/core/cpp/vk_context.cpp b/src/viz/core/cpp/vk_context.cpp index d2184d038..9ff3d91cc 100644 --- a/src/viz/core/cpp/vk_context.cpp +++ b/src/viz/core/cpp/vk_context.cpp @@ -192,6 +192,7 @@ void VkContext::init(const Config& config) select_physical_device(config); create_logical_device(config); match_cuda_device_to_vulkan(); + create_pipeline_cache(); initialized_ = true; } catch (...) @@ -203,6 +204,12 @@ void VkContext::init(const Config& config) void VkContext::destroy() { + // Destroy device-owned objects (pipeline cache) before the device. + if (pipeline_cache_ != VK_NULL_HANDLE && device_ != VK_NULL_HANDLE) + { + vkDestroyPipelineCache(device_, pipeline_cache_, nullptr); + pipeline_cache_ = VK_NULL_HANDLE; + } if (device_ != VK_NULL_HANDLE) { vkDestroyDevice(device_, nullptr); @@ -216,6 +223,7 @@ void VkContext::destroy() physical_device_ = VK_NULL_HANDLE; queue_ = VK_NULL_HANDLE; queue_family_index_ = UINT32_MAX; + pipeline_cache_ = VK_NULL_HANDLE; cuda_device_id_ = -1; validation_enabled_ = false; initialized_ = false; @@ -251,6 +259,11 @@ VkQueue VkContext::queue() const noexcept return queue_; } +VkPipelineCache VkContext::pipeline_cache() const noexcept +{ + return pipeline_cache_; +} + int VkContext::cuda_device_id() const noexcept { return cuda_device_id_; @@ -395,10 +408,16 @@ void VkContext::create_logical_device(const Config& config) } VkPhysicalDeviceFeatures device_features{}; - // No special features needed yet; extend as the renderer requires them. + + // Enable the Vulkan 1.2 timeline semaphore feature so DeviceImage + // can use VK_SEMAPHORE_TYPE_TIMELINE for CUDA-Vulkan interop. + VkPhysicalDeviceVulkan12Features features12{}; + features12.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; + features12.timelineSemaphore = VK_TRUE; VkDeviceCreateInfo device_info{}; device_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; + device_info.pNext = &features12; device_info.queueCreateInfoCount = 1; device_info.pQueueCreateInfos = &queue_info; device_info.enabledExtensionCount = static_cast(extensions.size()); @@ -414,15 +433,25 @@ void VkContext::create_logical_device(const Config& config) vkGetDeviceQueue(device_, queue_family_index_, 0, &queue_); } +void VkContext::create_pipeline_cache() +{ + // Empty cache; the driver populates it as pipelines are created. + // Not persisted across runs — purely in-process reuse. + VkPipelineCacheCreateInfo info{}; + info.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO; + const VkResult result = vkCreatePipelineCache(device_, &info, nullptr, &pipeline_cache_); + if (result != VK_SUCCESS) + { + throw std::runtime_error("vkCreatePipelineCache failed: VkResult=" + std::to_string(result)); + } +} + void VkContext::match_cuda_device_to_vulkan() { - // Read the Vulkan physical device's UUID, then find the CUDA - // device with the same UUID and make it current. Required so - // every viz_core type that imports Vulkan memory into CUDA - // (e.g. DeviceImage::cudaImportExternalMemory) operates on the - // same physical GPU. On multi-GPU machines Vulkan and CUDA - // default to different devices and interop fails with - // cudaErrorUnknown. + // Find the CUDA device whose UUID matches the chosen Vulkan + // physical device and make it current. Required so CUDA-Vulkan + // interop on multi-GPU machines doesn't pick a different GPU + // than Vulkan. VkPhysicalDeviceIDProperties id_props{}; id_props.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES; VkPhysicalDeviceProperties2 props2{}; diff --git a/src/viz/core_tests/cpp/CMakeLists.txt b/src/viz/core_tests/cpp/CMakeLists.txt index 656aff6c6..76cec20e6 100644 --- a/src/viz/core_tests/cpp/CMakeLists.txt +++ b/src/viz/core_tests/cpp/CMakeLists.txt @@ -3,6 +3,19 @@ cmake_minimum_required(VERSION 3.20) +# Header-only INTERFACE library exposing test_helpers.hpp: +# is_gpu_available(), shared_vk_context(), GpuFixture. Used by all +# viz_*_tests so the probes have one canonical implementation. +add_library(viz_test_support INTERFACE) +target_include_directories(viz_test_support INTERFACE + $ +) +target_link_libraries(viz_test_support INTERFACE + viz::core + Catch2::Catch2 +) +add_library(viz::test_support ALIAS viz_test_support) + add_executable(viz_core_tests test_device_image.cpp test_frame_sync.cpp @@ -15,7 +28,7 @@ add_executable(viz_core_tests ) target_link_libraries(viz_core_tests PRIVATE - viz::core + viz::test_support Catch2::Catch2WithMain ) diff --git a/src/viz/core_tests/cpp/test_device_image.cpp b/src/viz/core_tests/cpp/test_device_image.cpp index fcdb23af0..789eb92fd 100644 --- a/src/viz/core_tests/cpp/test_device_image.cpp +++ b/src/viz/core_tests/cpp/test_device_image.cpp @@ -2,15 +2,13 @@ // SPDX-License-Identifier: Apache-2.0 // [gpu] tests for DeviceImage: verify Vulkan handle creation, the -// CUDA cudaArray_t is usable from a cudaMemcpy2DToArray call, the -// VizBuffer view exposes the right shape with MemorySpace::kDevice, -// and a round-trip copy preserves pixel values. +// CUDA cudaArray_t is usable from a cudaMemcpy2DToArray call, and a +// round-trip copy preserves pixel values. #include "test_helpers.hpp" #include #include -#include #include #include @@ -19,10 +17,8 @@ #include using viz::DeviceImage; -using viz::MemorySpace; using viz::PixelFormat; using viz::Resolution; -using viz::VizBuffer; namespace { @@ -55,7 +51,11 @@ TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage creates valid Vulkan + C REQUIRE(img != nullptr); CHECK(img->vk_image() != VK_NULL_HANDLE); CHECK(img->vk_image_view() != VK_NULL_HANDLE); - CHECK(img->vk_format() == VK_FORMAT_R8G8B8A8_UNORM); + // vk_format() returns the SRGB sampling view format. Storage is + // UNORM (CUDA writes raw bytes), but sampling decodes through SRGB + // so arbitrary byte values round-trip exactly through the + // sRGB->linear->sRGB pipeline. See device_image.cpp comments. + CHECK(img->vk_format() == VK_FORMAT_R8G8B8A8_SRGB); CHECK(img->cuda_array() != nullptr); CHECK(img->resolution().width == 64); CHECK(img->resolution().height == 64); @@ -78,20 +78,6 @@ TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage destroy is idempotent", img->destroy(); } -TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage::view exposes a kDevice VizBuffer", "[gpu][device_image]") -{ - auto img = DeviceImage::create(vk, Resolution{ 16, 16 }, PixelFormat::kRGBA8); - const VizBuffer v = img->view(); - CHECK(v.space == MemorySpace::kDevice); - CHECK(v.width == 16); - CHECK(v.height == 16); - CHECK(v.format == PixelFormat::kRGBA8); - CHECK(v.pitch == static_cast(16) * 4); - // .data points at the cudaArray_t handle (NOT a raw device pointer); - // sanity that it's non-null when the image is alive. - CHECK(v.data != nullptr); -} - TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage round-trip preserves pixel pattern", "[gpu][device_image]") { constexpr uint32_t kSide = 64; @@ -105,9 +91,8 @@ TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage round-trip preserves pix cudaMemcpyHostToDevice) == cudaSuccess); REQUIRE(cudaDeviceSynchronize() == cudaSuccess); - // Read it back via CUDA — verifies the data took. (M3b's milestone - // test will read back via Vulkan sampling; that's the real round- - // trip but requires a graphics pipeline we don't have yet.) + // Read back via CUDA — verifies the data made it. The Vulkan- + // sampling round-trip is covered by viz_session_tests. std::vector dst(kBytes); REQUIRE(cudaMemcpy2DFromArray(dst.data(), kSide * 4, img->cuda_array(), 0, 0, kSide * 4, kSide, cudaMemcpyDeviceToHost) == cudaSuccess); diff --git a/src/viz/core_tests/cpp/test_helpers.hpp b/src/viz/core_tests/cpp/test_helpers.hpp index 04ea2b416..8e76f558f 100644 --- a/src/viz/core_tests/cpp/test_helpers.hpp +++ b/src/viz/core_tests/cpp/test_helpers.hpp @@ -9,26 +9,76 @@ #include #include +#include +#include +#include namespace viz::testing { -// True iff a Televiz-suitable Vulkan device is reachable. Cached after -// the first call. [gpu] tests should SKIP when this is false so CI -// runners without a suitable GPU report skipped rather than failed. +// True iff at least one GPU is reachable from BOTH Vulkan AND CUDA +// with matching UUIDs — the same constraint VkContext::init() enforces. +// [gpu] tests SKIP when this returns false so CI runners without a +// suitable GPU report skipped rather than failed. Cached after the +// first call. inline bool is_gpu_available() { static const bool cached = []() -> bool { - const auto devices = viz::VkContext::enumerate_physical_devices(); - for (const auto& info : devices) + VkApplicationInfo app{}; + app.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; + app.apiVersion = VK_API_VERSION_1_2; + VkInstanceCreateInfo ic{}; + ic.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; + ic.pApplicationInfo = &app; + VkInstance instance = VK_NULL_HANDLE; + if (vkCreateInstance(&ic, nullptr, &instance) != VK_SUCCESS) { - if (info.meets_requirements) + return false; + } + uint32_t count = 0; + vkEnumeratePhysicalDevices(instance, &count, nullptr); + std::vector devs(count); + if (count > 0) + { + vkEnumeratePhysicalDevices(instance, &count, devs.data()); + } + + int cuda_count = 0; + if (cudaGetDeviceCount(&cuda_count) != cudaSuccess) + { + cuda_count = 0; + } + + bool match = false; + for (VkPhysicalDevice vk_dev : devs) + { + VkPhysicalDeviceIDProperties id{}; + id.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES; + VkPhysicalDeviceProperties2 p2{}; + p2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + p2.pNext = &id; + vkGetPhysicalDeviceProperties2(vk_dev, &p2); + for (int ci = 0; ci < cuda_count; ++ci) + { + cudaDeviceProp prop{}; + if (cudaGetDeviceProperties(&prop, ci) != cudaSuccess) + { + continue; + } + if (std::memcmp(prop.uuid.bytes, id.deviceUUID, VK_UUID_SIZE) == 0) + { + match = true; + break; + } + } + if (match) { - return true; + break; } } - return false; + vkDestroyInstance(instance, nullptr); + return match; }(); return cached; } diff --git a/src/viz/layers/cpp/CMakeLists.txt b/src/viz/layers/cpp/CMakeLists.txt index bf695673c..02d09ad17 100644 --- a/src/viz/layers/cpp/CMakeLists.txt +++ b/src/viz/layers/cpp/CMakeLists.txt @@ -3,25 +3,25 @@ cmake_minimum_required(VERSION 3.20) -# LayerBase abstract interface and (later) concrete layer types -# (QuadLayer, ProjectionLayer, OverlayLayer). Header-only today — promoted -# to STATIC when the first .cpp lands. Test-only helper layers -# (e.g. ColoredQuadLayer) live in viz/layers_tests/, not here. -add_library(viz_layers INTERFACE) - -target_sources(viz_layers - INTERFACE - $ +# LayerBase abstract interface and concrete layer types. Promoted to +# STATIC now that QuadLayer ships. Test-only helper layers +# (ClearRectLayer, ThrowingLayer) live in viz/layers_tests/, not here. +add_library(viz_layers STATIC + quad_layer.cpp + inc/viz/layers/layer_base.hpp + inc/viz/layers/quad_layer.hpp ) target_include_directories(viz_layers - INTERFACE + PUBLIC $ ) target_link_libraries(viz_layers - INTERFACE + PUBLIC viz::core + PRIVATE + viz::shaders ) # Aliased as viz::layers. diff --git a/src/viz/layers/cpp/inc/viz/layers/layer_base.hpp b/src/viz/layers/cpp/inc/viz/layers/layer_base.hpp index d2a9407ab..355d3fbf1 100644 --- a/src/viz/layers/cpp/inc/viz/layers/layer_base.hpp +++ b/src/viz/layers/cpp/inc/viz/layers/layer_base.hpp @@ -51,6 +51,30 @@ class LayerBase // draws into; const so layers cannot modify the target. virtual void record(VkCommandBuffer cmd, const std::vector& views, const RenderTarget& target) = 0; + // Per-frame wait wiring for layers that synchronize against CUDA + // (or other external) producers via a Vulkan timeline semaphore. + // VizCompositor concatenates these across all visible layers and + // feeds them into vkQueueSubmit (with a chained + // VkTimelineSemaphoreSubmitInfo for the values). + // Default: empty (no external sync). + // + // No signal semaphores: layers that need producer↔consumer + // ping-pong solve it at the layer level (e.g. QuadLayer's mailbox + // owns enough buffers that producer writes never collide with + // in-flight Vulkan reads, so the compositor never has to signal + // back to the producer). + struct WaitSemaphore + { + VkSemaphore semaphore = VK_NULL_HANDLE; + uint64_t value = 0; + VkPipelineStageFlags wait_stage = 0; + }; + + virtual std::vector get_wait_semaphores() const + { + return {}; + } + const std::string& name() const noexcept; // Visibility flag is atomic so it can be toggled from any thread (UI diff --git a/src/viz/layers/cpp/inc/viz/layers/quad_layer.hpp b/src/viz/layers/cpp/inc/viz/layers/quad_layer.hpp new file mode 100644 index 000000000..b320ee0fa --- /dev/null +++ b/src/viz/layers/cpp/inc/viz/layers/quad_layer.hpp @@ -0,0 +1,155 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace viz +{ + +class VkContext; + +// QuadLayer: renders a CUDA-fed 2D texture as a fullscreen quad. +// +// Owns kSlotCount=3 DeviceImages plus the graphics-pipeline state to +// sample any of them (one VkSampler, one VkPipeline, one descriptor +// set per slot). The slots form a mailbox: +// +// submit() picks a "free" slot (one that is neither the most recent +// publish nor the slot the renderer is currently sampling), runs +// cudaMemcpyAsync into it, signals cuda_done_writing, and atomic- +// exchanges the "latest" pointer to it. The previous "latest" slot +// becomes free. +// +// record() atomic-exchanges "latest" into "in_use" (taking it for +// this frame's draw); the previous "in_use" slot becomes free. The +// draw waits on cuda_done_writing of the slot it just took. +// +// Net result: producer can submit at any rate. The renderer always +// samples the most recently completed publish, and there is always +// at least one slot free for the producer to write — it never +// collides with a buffer the renderer is currently sampling. +// +// Correctness depends on VizCompositor::render() being synchronous +// (frame_sync_->wait() at end of frame). Multi-frame-in-flight +// would require in_use_ to become per-in-flight-frame. +// +// Memory cost: ~width*height*bpp*3 bytes (e.g. 24 MB at 1080p RGBA8). +// +// Fullscreen-blit / kRGBA8 only. Placement transforms and other +// formats land with the XR backend. +class QuadLayer : public LayerBase +{ +public: + static constexpr uint32_t kSlotCount = 3; + + struct Config + { + std::string name = "QuadLayer"; + Resolution resolution{}; + PixelFormat format = PixelFormat::kRGBA8; + }; + + // Builds the 3 DeviceImages + pipeline up front. Throws + // std::invalid_argument on bad config; std::runtime_error on + // Vulkan / CUDA failure. + QuadLayer(const VkContext& ctx, VkRenderPass render_pass, Config config); + + ~QuadLayer() override; + void destroy(); + + // Threading contract: submit() is the producer side; record() (+ + // get_wait_semaphores) is the consumer side. They may run on + // separate threads. Multiple concurrent producers on the same + // QuadLayer are NOT supported — use one QuadLayer per producer. + // + // src.space must be kDevice and src dimensions/format must match + // the layer. The wait/copy/signal sequence runs on `stream` + // (default: the default stream); pass the producer's stream so + // the signal lands after the producer's prior writes on the same + // stream. + // + // Throws std::invalid_argument on validation failure; + // std::runtime_error on CUDA failure; + // std::logic_error if called after destroy(). + void submit(const VizBuffer& src, cudaStream_t stream = 0); + + // Binds pipeline + per-slot descriptor + draws a 3-vertex + // fullscreen quad. Skips the draw if no frame has been published + // yet (kSlotNone — render target keeps its clear value). + void record(VkCommandBuffer cmd, const std::vector& views, const RenderTarget& target) override; + + // Layer-side timeline wait: VizCompositor waits on this slot's + // cuda_done_writing before the fragment shader samples it. + std::vector get_wait_semaphores() const override; + + Resolution resolution() const noexcept; + PixelFormat format() const noexcept; + + // Diagnostic accessor; nullptr for slots beyond kSlotCount. + const DeviceImage* device_image(uint32_t slot) const noexcept; + +private: + void init(); + + void create_sampler(); + void create_descriptor_set_layout(); + void create_pipeline_layout(); + void create_pipeline(); + void create_descriptor_pool(); + void allocate_descriptor_sets(); + void update_descriptor_sets(); + + // Mailbox slot allocation. submit() picks one of these states + // and atomically takes ownership; record() atomically promotes + // a freshly-published slot to `in_use_`. + static constexpr uint8_t kSlotNone = 0xFF; + + // Picks a slot that is neither latest_ nor in_use_, in + // 0..kSlotCount-1. Returns a value < kSlotCount. + uint8_t pick_free_slot(uint8_t latest, uint8_t in_use) const noexcept; + + const VkContext* ctx_ = nullptr; + VkRenderPass render_pass_ = VK_NULL_HANDLE; // borrowed from compositor + Config config_; + + // One DeviceImage per mailbox slot. + std::array, kSlotCount> slots_; + + VkSampler sampler_ = VK_NULL_HANDLE; + VkDescriptorSetLayout descriptor_set_layout_ = VK_NULL_HANDLE; + VkPipelineLayout pipeline_layout_ = VK_NULL_HANDLE; + VkPipeline pipeline_ = VK_NULL_HANDLE; + + VkDescriptorPool descriptor_pool_ = VK_NULL_HANDLE; + // One descriptor set per slot, each binding the corresponding + // DeviceImage's sRGB view. record() picks the one for in_use_. + std::array descriptor_sets_{}; + + // Mailbox state. Both atomic so producer and renderer can + // touch them without locks. + // + // latest_: most recently published slot. submit() stores + // here on success; record() exchanges it into + // in_use_ at frame start. kSlotNone before the + // first submit(). + // in_use_: slot the renderer is currently drawing from. + // kSlotNone before the first frame that finds a + // published slot. record() updates this. + std::atomic latest_{ kSlotNone }; + std::atomic in_use_{ kSlotNone }; +}; + +} // namespace viz diff --git a/src/viz/layers/cpp/quad_layer.cpp b/src/viz/layers/cpp/quad_layer.cpp new file mode 100644 index 000000000..8b861dea0 --- /dev/null +++ b/src/viz/layers/cpp/quad_layer.cpp @@ -0,0 +1,517 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace viz +{ + +namespace +{ + +void check_vk(VkResult result, const char* what) +{ + if (result != VK_SUCCESS) + { + throw std::runtime_error(std::string("QuadLayer: ") + what + " failed: VkResult=" + std::to_string(result)); + } +} + +void check_cuda(cudaError_t result, const char* what) +{ + if (result != cudaSuccess) + { + throw std::runtime_error(std::string("QuadLayer: ") + what + " failed: " + cudaGetErrorString(result)); + } +} + +VkShaderModule create_shader_module(VkDevice device, const unsigned char* spv, size_t size) +{ + VkShaderModuleCreateInfo info{}; + info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + info.codeSize = size; + info.pCode = reinterpret_cast(spv); + VkShaderModule mod = VK_NULL_HANDLE; + check_vk(vkCreateShaderModule(device, &info, nullptr, &mod), "vkCreateShaderModule"); + return mod; +} + +// Once destroy() has run, slots_[0] is the canonical "alive" signal +// (it's the first thing init() builds and the last thing destroy() +// resets). Throwing logic_error converts use-after-destroy from a +// silent null-deref into a clean failure callers can catch in tests. +void require_alive(const std::unique_ptr& slot0, const char* what) +{ + if (!slot0) + { + throw std::logic_error(std::string("QuadLayer::") + what + " called after destroy()"); + } +} + +} // namespace + +QuadLayer::QuadLayer(const VkContext& ctx, VkRenderPass render_pass, Config config) + : LayerBase(config.name), ctx_(&ctx), render_pass_(render_pass), config_(std::move(config)) +{ + // Cheap-first config checks, then argument shape, then context + // state. Tests can exercise each path by varying just the + // relevant argument with an uninitialized VkContext. + if (config_.format != PixelFormat::kRGBA8) + { + // textured_quad samples color; depth (kD32F) would create a + // depth-aspect view that can't be sampled as color. + throw std::invalid_argument("QuadLayer: only PixelFormat::kRGBA8 is supported"); + } + if (config_.resolution.width == 0 || config_.resolution.height == 0) + { + throw std::invalid_argument("QuadLayer: resolution must be non-zero"); + } + if (render_pass == VK_NULL_HANDLE) + { + throw std::invalid_argument("QuadLayer: render_pass must be non-null"); + } + if (!ctx.is_initialized()) + { + throw std::invalid_argument("QuadLayer: VkContext is not initialized"); + } + init(); +} + +QuadLayer::~QuadLayer() +{ + destroy(); +} + +void QuadLayer::init() +{ + try + { + for (auto& slot : slots_) + { + slot = DeviceImage::create(*ctx_, config_.resolution, config_.format); + } + create_sampler(); + create_descriptor_set_layout(); + create_pipeline_layout(); + create_pipeline(); + create_descriptor_pool(); + allocate_descriptor_sets(); + update_descriptor_sets(); + } + catch (...) + { + destroy(); + throw; + } +} + +void QuadLayer::destroy() +{ + if (ctx_ == nullptr) + { + return; + } + const VkDevice device = ctx_->device(); + if (device == VK_NULL_HANDLE) + { + for (auto& slot : slots_) + { + slot.reset(); + } + return; + } + if (descriptor_pool_ != VK_NULL_HANDLE) + { + // descriptor_sets_ are freed implicitly with the pool. + vkDestroyDescriptorPool(device, descriptor_pool_, nullptr); + descriptor_pool_ = VK_NULL_HANDLE; + descriptor_sets_.fill(VK_NULL_HANDLE); + } + if (pipeline_ != VK_NULL_HANDLE) + { + vkDestroyPipeline(device, pipeline_, nullptr); + pipeline_ = VK_NULL_HANDLE; + } + if (pipeline_layout_ != VK_NULL_HANDLE) + { + vkDestroyPipelineLayout(device, pipeline_layout_, nullptr); + pipeline_layout_ = VK_NULL_HANDLE; + } + if (descriptor_set_layout_ != VK_NULL_HANDLE) + { + vkDestroyDescriptorSetLayout(device, descriptor_set_layout_, nullptr); + descriptor_set_layout_ = VK_NULL_HANDLE; + } + if (sampler_ != VK_NULL_HANDLE) + { + vkDestroySampler(device, sampler_, nullptr); + sampler_ = VK_NULL_HANDLE; + } + for (auto& slot : slots_) + { + slot.reset(); + } + latest_.store(kSlotNone, std::memory_order_release); + in_use_.store(kSlotNone, std::memory_order_release); +} + +Resolution QuadLayer::resolution() const noexcept +{ + return config_.resolution; +} + +PixelFormat QuadLayer::format() const noexcept +{ + return config_.format; +} + +const DeviceImage* QuadLayer::device_image(uint32_t slot) const noexcept +{ + if (slot >= kSlotCount) + { + return nullptr; + } + return slots_[slot].get(); +} + +uint8_t QuadLayer::pick_free_slot(uint8_t latest, uint8_t in_use) const noexcept +{ + // With kSlotCount=3, at most 2 slots are "claimed" (latest + + // in_use). At least one of {0, 1, 2} is always free. + static_assert(kSlotCount == 3, "pick_free_slot assumes 3 slots"); + for (uint8_t i = 0; i < kSlotCount; ++i) + { + if (i != latest && i != in_use) + { + return i; + } + } + return 0; // unreachable for kSlotCount >= 2 +} + +void QuadLayer::submit(const VizBuffer& src, cudaStream_t stream) +{ + require_alive(slots_[0], "submit"); + if (src.space != MemorySpace::kDevice) + { + throw std::invalid_argument("QuadLayer::submit: src must be MemorySpace::kDevice"); + } + if (src.width != config_.resolution.width || src.height != config_.resolution.height) + { + throw std::invalid_argument("QuadLayer::submit: src dimensions do not match layer resolution"); + } + if (src.format != config_.format) + { + throw std::invalid_argument("QuadLayer::submit: src format does not match layer format"); + } + if (src.data == nullptr) + { + throw std::invalid_argument("QuadLayer::submit: src.data is null"); + } + + // Pick a free slot — neither the most recent publish nor the + // slot the renderer is currently using. With 3 slots there's + // always one free, so this is wait-free. + const uint8_t latest = latest_.load(std::memory_order_acquire); + const uint8_t in_use = in_use_.load(std::memory_order_acquire); + const uint8_t slot = pick_free_slot(latest, in_use); + DeviceImage& image = *slots_[slot]; + + check_cuda(cudaSetDevice(ctx_->cuda_device_id()), "cudaSetDevice"); + // Async copy on `stream`. Caller's prior work on the same stream + // is naturally ordered before this; signal lands after the copy + // completes on the GPU. + const size_t row_bytes = static_cast(src.width) * bytes_per_pixel(src.format); + const size_t src_pitch = (src.pitch == 0) ? row_bytes : src.pitch; + check_cuda(cudaMemcpy2DToArrayAsync(image.cuda_array(), 0, 0, src.data, src_pitch, row_bytes, src.height, + cudaMemcpyDeviceToDevice, stream), + "cudaMemcpy2DToArrayAsync"); + image.cuda_signal_write_done(stream); + + // Publish. The renderer's next record() will atomic-exchange + // this into in_use_; the previous latest_ slot becomes free. + // memory_order_release pairs with the renderer's acquire load. + latest_.store(slot, std::memory_order_release); +} + +void QuadLayer::record(VkCommandBuffer cmd, const std::vector& /*views*/, const RenderTarget& target) +{ + require_alive(slots_[0], "record"); + + // Promote latest_ to in_use_. The previous in_use_ slot becomes + // free for the next submit(). If no frame has been published yet + // (latest_ == kSlotNone), we leave in_use_ as-is — if it's also + // kSlotNone, we skip the draw and the framebuffer keeps its + // clear value. + const uint8_t latest = latest_.load(std::memory_order_acquire); + if (latest != kSlotNone) + { + in_use_.store(latest, std::memory_order_release); + } + const uint8_t cur = in_use_.load(std::memory_order_acquire); + if (cur == kSlotNone) + { + return; + } + + const Resolution res = target.resolution(); + + VkViewport viewport{}; + viewport.x = 0.0f; + viewport.y = 0.0f; + viewport.width = static_cast(res.width); + viewport.height = static_cast(res.height); + viewport.minDepth = 0.0f; + viewport.maxDepth = 1.0f; + vkCmdSetViewport(cmd, 0, 1, &viewport); + + VkRect2D scissor{}; + scissor.offset = { 0, 0 }; + scissor.extent = { res.width, res.height }; + vkCmdSetScissor(cmd, 0, 1, &scissor); + + vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_); + vkCmdBindDescriptorSets( + cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout_, 0, 1, &descriptor_sets_[cur], 0, nullptr); + + // 3 vertices, no vertex buffer — vertex shader emits a fullscreen + // triangle from gl_VertexIndex. + vkCmdDraw(cmd, 3, 1, 0, 0); +} + +std::vector QuadLayer::get_wait_semaphores() const +{ + // VizCompositor calls record() first (which promotes latest_ -> + // in_use_), then this. So in_use_ is the slot the draw will + // sample, and that's what we need the GPU to wait on. + const uint8_t cur = in_use_.load(std::memory_order_acquire); + if (cur == kSlotNone || !slots_[cur]) + { + return {}; + } + const DeviceImage& image = *slots_[cur]; + const uint64_t value = image.cuda_done_writing_value(); + if (value == 0) + { + return {}; + } + return { + WaitSemaphore{ + image.cuda_done_writing(), + value, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + }, + }; +} + +void QuadLayer::create_sampler() +{ + VkSamplerCreateInfo info{}; + info.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; + info.magFilter = VK_FILTER_LINEAR; + info.minFilter = VK_FILTER_LINEAR; + info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; + info.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; + info.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; + info.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; + info.anisotropyEnable = VK_FALSE; // enable later when XR distance views need it + info.maxAnisotropy = 1.0f; + info.borderColor = VK_BORDER_COLOR_INT_OPAQUE_BLACK; + info.unnormalizedCoordinates = VK_FALSE; + info.compareEnable = VK_FALSE; + info.compareOp = VK_COMPARE_OP_ALWAYS; + info.minLod = 0.0f; + info.maxLod = 0.0f; + check_vk(vkCreateSampler(ctx_->device(), &info, nullptr, &sampler_), "vkCreateSampler"); +} + +void QuadLayer::create_descriptor_set_layout() +{ + VkDescriptorSetLayoutBinding binding{}; + binding.binding = 0; + binding.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + binding.descriptorCount = 1; + binding.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; + binding.pImmutableSamplers = nullptr; + + VkDescriptorSetLayoutCreateInfo info{}; + info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + info.bindingCount = 1; + info.pBindings = &binding; + check_vk(vkCreateDescriptorSetLayout(ctx_->device(), &info, nullptr, &descriptor_set_layout_), + "vkCreateDescriptorSetLayout"); +} + +void QuadLayer::create_pipeline_layout() +{ + VkPipelineLayoutCreateInfo info{}; + info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + info.setLayoutCount = 1; + info.pSetLayouts = &descriptor_set_layout_; + info.pushConstantRangeCount = 0; + check_vk(vkCreatePipelineLayout(ctx_->device(), &info, nullptr, &pipeline_layout_), "vkCreatePipelineLayout"); +} + +void QuadLayer::create_pipeline() +{ + const VkDevice device = ctx_->device(); + + VkShaderModule vert = + create_shader_module(device, viz::shaders::kTexturedQuadVertSpv, viz::shaders::kTexturedQuadVertSpvSize); + VkShaderModule frag = + create_shader_module(device, viz::shaders::kTexturedQuadFragSpv, viz::shaders::kTexturedQuadFragSpvSize); + + // RAII: shader modules are only needed during pipeline creation. + struct ShaderGuard + { + VkDevice device; + VkShaderModule vert; + VkShaderModule frag; + ~ShaderGuard() + { + if (vert != VK_NULL_HANDLE) + { + vkDestroyShaderModule(device, vert, nullptr); + } + if (frag != VK_NULL_HANDLE) + { + vkDestroyShaderModule(device, frag, nullptr); + } + } + } guard{ device, vert, frag }; + + VkPipelineShaderStageCreateInfo stages[2]{}; + stages[0].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + stages[0].stage = VK_SHADER_STAGE_VERTEX_BIT; + stages[0].module = vert; + stages[0].pName = "main"; + stages[1].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + stages[1].stage = VK_SHADER_STAGE_FRAGMENT_BIT; + stages[1].module = frag; + stages[1].pName = "main"; + + VkPipelineVertexInputStateCreateInfo vertex_input{}; + vertex_input.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + + VkPipelineInputAssemblyStateCreateInfo input_assembly{}; + input_assembly.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; + input_assembly.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + + // Viewport / scissor are dynamic so one pipeline works across + // resolutions. + VkPipelineViewportStateCreateInfo viewport_state{}; + viewport_state.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + viewport_state.viewportCount = 1; + viewport_state.scissorCount = 1; + + VkPipelineRasterizationStateCreateInfo rasterizer{}; + rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + rasterizer.polygonMode = VK_POLYGON_MODE_FILL; + rasterizer.cullMode = VK_CULL_MODE_NONE; + rasterizer.frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE; + rasterizer.lineWidth = 1.0f; + + VkPipelineMultisampleStateCreateInfo multisample{}; + multisample.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + multisample.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + + // Depth disabled — fullscreen blits don't need it. + VkPipelineDepthStencilStateCreateInfo depth_stencil{}; + depth_stencil.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + depth_stencil.depthTestEnable = VK_FALSE; + depth_stencil.depthWriteEnable = VK_FALSE; + + VkPipelineColorBlendAttachmentState blend_attachment{}; + blend_attachment.blendEnable = VK_FALSE; + blend_attachment.colorWriteMask = + VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + + VkPipelineColorBlendStateCreateInfo color_blend{}; + color_blend.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + color_blend.attachmentCount = 1; + color_blend.pAttachments = &blend_attachment; + + const VkDynamicState dynamic_states[] = { VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR }; + VkPipelineDynamicStateCreateInfo dynamic{}; + dynamic.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; + dynamic.dynamicStateCount = sizeof(dynamic_states) / sizeof(dynamic_states[0]); + dynamic.pDynamicStates = dynamic_states; + + VkGraphicsPipelineCreateInfo info{}; + info.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + info.stageCount = 2; + info.pStages = stages; + info.pVertexInputState = &vertex_input; + info.pInputAssemblyState = &input_assembly; + info.pViewportState = &viewport_state; + info.pRasterizationState = &rasterizer; + info.pMultisampleState = &multisample; + info.pDepthStencilState = &depth_stencil; + info.pColorBlendState = &color_blend; + info.pDynamicState = &dynamic; + info.layout = pipeline_layout_; + info.renderPass = render_pass_; + info.subpass = 0; + + check_vk(vkCreateGraphicsPipelines(device, ctx_->pipeline_cache(), 1, &info, nullptr, &pipeline_), + "vkCreateGraphicsPipelines"); +} + +void QuadLayer::create_descriptor_pool() +{ + VkDescriptorPoolSize pool_size{}; + pool_size.type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + pool_size.descriptorCount = kSlotCount; + + VkDescriptorPoolCreateInfo info{}; + info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + info.maxSets = kSlotCount; + info.poolSizeCount = 1; + info.pPoolSizes = &pool_size; + check_vk(vkCreateDescriptorPool(ctx_->device(), &info, nullptr, &descriptor_pool_), "vkCreateDescriptorPool"); +} + +void QuadLayer::allocate_descriptor_sets() +{ + std::array layouts{}; + layouts.fill(descriptor_set_layout_); + + VkDescriptorSetAllocateInfo info{}; + info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + info.descriptorPool = descriptor_pool_; + info.descriptorSetCount = kSlotCount; + info.pSetLayouts = layouts.data(); + check_vk(vkAllocateDescriptorSets(ctx_->device(), &info, descriptor_sets_.data()), "vkAllocateDescriptorSets"); +} + +void QuadLayer::update_descriptor_sets() +{ + // One write per slot, each pointing at the slot's own image view. + std::array image_infos{}; + std::array writes{}; + for (uint32_t i = 0; i < kSlotCount; ++i) + { + image_infos[i].sampler = sampler_; + image_infos[i].imageView = slots_[i]->vk_image_view(); + image_infos[i].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + + writes[i].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writes[i].dstSet = descriptor_sets_[i]; + writes[i].dstBinding = 0; + writes[i].dstArrayElement = 0; + writes[i].descriptorCount = 1; + writes[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + writes[i].pImageInfo = &image_infos[i]; + } + vkUpdateDescriptorSets(ctx_->device(), kSlotCount, writes.data(), 0, nullptr); +} + +} // namespace viz diff --git a/src/viz/layers_tests/cpp/CMakeLists.txt b/src/viz/layers_tests/cpp/CMakeLists.txt index 0ba68076d..b34de50b8 100644 --- a/src/viz/layers_tests/cpp/CMakeLists.txt +++ b/src/viz/layers_tests/cpp/CMakeLists.txt @@ -25,14 +25,18 @@ target_link_libraries(viz_layers_testing add_library(viz::layers_testing ALIAS viz_layers_testing) -# Test executable for the layers_testing fixtures themselves. +# Test executable for the layers_testing fixtures themselves and for +# the production layers in viz::layers (QuadLayer, ...). add_executable(viz_layers_tests test_clear_rect_layer.cpp + test_quad_layer.cpp test_throwing_layer.cpp ) target_link_libraries(viz_layers_tests PRIVATE viz::layers_testing + viz::layers + viz::test_support Catch2::Catch2WithMain ) diff --git a/src/viz/layers_tests/cpp/test_quad_layer.cpp b/src/viz/layers_tests/cpp/test_quad_layer.cpp new file mode 100644 index 000000000..ed540e11a --- /dev/null +++ b/src/viz/layers_tests/cpp/test_quad_layer.cpp @@ -0,0 +1,317 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +// Tests for QuadLayer: config validation (unit-level) and pipeline / +// CUDA-Vulkan interop (gpu-level). End-to-end fill+render+readback +// lives in viz_session_tests where the full VizSession pipeline is +// available. + +#include "test_helpers.hpp" + +#include +#include +#include +#include +#include + +#include +#include + +using viz::DeviceImage; +using viz::PixelFormat; +using viz::QuadLayer; +using viz::RenderTarget; +using viz::Resolution; +using viz::VizBuffer; +using viz::VkContext; + +using viz::testing::is_gpu_available; + +// The arg-shape checks (format, resolution, render_pass) run before +// the VkContext::is_initialized() check, so these unit tests can +// exercise each rejection path with a default-constructed VkContext. +// +// Per-test ordering: a test passes a config that's valid for every +// earlier check and triggers only the named check. + +TEST_CASE("QuadLayer ctor rejects non-RGBA8 pixel format", "[unit][quad_layer]") +{ + VkContext ctx; + QuadLayer::Config cfg; + cfg.resolution = { 64, 64 }; + cfg.format = PixelFormat::kD32F; + CHECK_THROWS_AS(QuadLayer(ctx, VK_NULL_HANDLE, cfg), std::invalid_argument); +} + +TEST_CASE("QuadLayer ctor rejects zero dimensions", "[unit][quad_layer]") +{ + VkContext ctx; + QuadLayer::Config cfg; + cfg.resolution = { 0, 64 }; + CHECK_THROWS_AS(QuadLayer(ctx, VK_NULL_HANDLE, cfg), std::invalid_argument); +} + +TEST_CASE("QuadLayer ctor rejects null render pass", "[unit][quad_layer]") +{ + VkContext ctx; + QuadLayer::Config cfg; + cfg.resolution = { 64, 64 }; + CHECK_THROWS_AS(QuadLayer(ctx, VK_NULL_HANDLE, cfg), std::invalid_argument); +} + +TEST_CASE("QuadLayer creates valid Vulkan + CUDA handles for every mailbox slot", "[gpu][quad_layer]") +{ + if (!is_gpu_available()) + { + SKIP("No Vulkan-capable GPU available"); + } + VkContext ctx; + ctx.init({}); + auto target = RenderTarget::create(ctx, RenderTarget::Config{ Resolution{ 64, 64 } }); + + QuadLayer::Config cfg; + cfg.resolution = { 64, 64 }; + QuadLayer layer(ctx, target->render_pass(), cfg); + + CHECK(layer.name() == "QuadLayer"); + CHECK(layer.is_visible()); + CHECK(layer.resolution().width == 64); + CHECK(layer.resolution().height == 64); + CHECK(layer.format() == PixelFormat::kRGBA8); + for (uint32_t i = 0; i < QuadLayer::kSlotCount; ++i) + { + REQUIRE(layer.device_image(i) != nullptr); + CHECK(layer.device_image(i)->vk_image() != VK_NULL_HANDLE); + CHECK(layer.device_image(i)->cuda_array() != nullptr); + } + // Out-of-range slot returns nullptr without crashing. + CHECK(layer.device_image(QuadLayer::kSlotCount) == nullptr); +} + +TEST_CASE("QuadLayer destroy is idempotent", "[gpu][quad_layer]") +{ + if (!is_gpu_available()) + { + SKIP("No Vulkan-capable GPU available"); + } + VkContext ctx; + ctx.init({}); + auto target = RenderTarget::create(ctx, RenderTarget::Config{ Resolution{ 32, 32 } }); + + QuadLayer::Config cfg; + cfg.resolution = { 32, 32 }; + QuadLayer layer(ctx, target->render_pass(), cfg); + + layer.destroy(); + layer.destroy(); // second call must be a no-op +} + +TEST_CASE("QuadLayer::submit throws after destroy", "[gpu][quad_layer]") +{ + if (!is_gpu_available()) + { + SKIP("No Vulkan-capable GPU available"); + } + VkContext ctx; + ctx.init({}); + auto target = RenderTarget::create(ctx, RenderTarget::Config{ Resolution{ 32, 32 } }); + + QuadLayer::Config cfg; + cfg.resolution = { 32, 32 }; + QuadLayer layer(ctx, target->render_pass(), cfg); + layer.destroy(); + + // submit must throw cleanly rather than dereferencing the + // released slot DeviceImages / pipeline. + viz::VizBuffer src{}; + src.width = 32; + src.height = 32; + src.format = PixelFormat::kRGBA8; + src.space = viz::MemorySpace::kDevice; + src.data = reinterpret_cast(uintptr_t{ 0x1 }); // never dereferenced + CHECK_THROWS_AS(layer.submit(src), std::logic_error); +} + +TEST_CASE("QuadLayer::submit rejects mismatched dimensions / format / space", "[gpu][quad_layer]") +{ + if (!is_gpu_available()) + { + SKIP("No Vulkan-capable GPU available"); + } + VkContext ctx; + ctx.init({}); + auto target = RenderTarget::create(ctx, RenderTarget::Config{ Resolution{ 64, 64 } }); + + QuadLayer::Config cfg; + cfg.resolution = { 64, 64 }; + QuadLayer layer(ctx, target->render_pass(), cfg); + + // Allocate a small CUDA buffer to point at — content is irrelevant + // because the validation rejects the descriptor before any memcpy. + void* dev_ptr = nullptr; + REQUIRE(cudaMalloc(&dev_ptr, 64 * 64 * 4) == cudaSuccess); + struct CudaFreeGuard + { + void* p; + ~CudaFreeGuard() + { + cudaFree(p); + } + } guard{ dev_ptr }; + + SECTION("kHost rejected") + { + VizBuffer src{}; + src.data = dev_ptr; + src.width = 64; + src.height = 64; + src.format = PixelFormat::kRGBA8; + src.space = viz::MemorySpace::kHost; + CHECK_THROWS_AS(layer.submit(src), std::invalid_argument); + } + SECTION("dimension mismatch rejected") + { + VizBuffer src{}; + src.data = dev_ptr; + src.width = 32; + src.height = 64; + src.format = PixelFormat::kRGBA8; + src.space = viz::MemorySpace::kDevice; + CHECK_THROWS_AS(layer.submit(src), std::invalid_argument); + } + SECTION("null data rejected") + { + VizBuffer src{}; + src.data = nullptr; + src.width = 64; + src.height = 64; + src.format = PixelFormat::kRGBA8; + src.space = viz::MemorySpace::kDevice; + CHECK_THROWS_AS(layer.submit(src), std::invalid_argument); + } +} + +TEST_CASE("QuadLayer submit accepts a non-default CUDA stream", "[gpu][quad_layer]") +{ + if (!is_gpu_available()) + { + SKIP("No Vulkan-capable GPU available"); + } + VkContext ctx; + ctx.init({}); + auto target = RenderTarget::create(ctx, RenderTarget::Config{ Resolution{ 32, 32 } }); + + QuadLayer::Config cfg; + cfg.resolution = { 32, 32 }; + QuadLayer layer(ctx, target->render_pass(), cfg); + + cudaStream_t stream = nullptr; + REQUIRE(cudaStreamCreate(&stream) == cudaSuccess); + struct StreamGuard + { + cudaStream_t s; + ~StreamGuard() + { + cudaStreamDestroy(s); + } + } guard{ stream }; + + void* dev_ptr = nullptr; + REQUIRE(cudaMalloc(&dev_ptr, static_cast(32) * 32 * 4) == cudaSuccess); + struct CudaFree + { + void* p; + ~CudaFree() + { + cudaFree(p); + } + } cuda_free{ dev_ptr }; + + viz::VizBuffer src{}; + src.data = dev_ptr; + src.width = 32; + src.height = 32; + src.format = PixelFormat::kRGBA8; + src.pitch = static_cast(32) * 4; + src.space = viz::MemorySpace::kDevice; + REQUIRE_NOTHROW(layer.submit(src, stream)); + REQUIRE(cudaStreamSynchronize(stream) == cudaSuccess); +} + +TEST_CASE("QuadLayer back-to-back submits cycle through mailbox slots", "[gpu][quad_layer]") +{ + if (!is_gpu_available()) + { + SKIP("No Vulkan-capable GPU available"); + } + VkContext ctx; + ctx.init({}); + auto target = RenderTarget::create(ctx, RenderTarget::Config{ Resolution{ 32, 32 } }); + + QuadLayer::Config cfg; + cfg.resolution = { 32, 32 }; + QuadLayer layer(ctx, target->render_pass(), cfg); + + void* dev_ptr = nullptr; + REQUIRE(cudaMalloc(&dev_ptr, static_cast(32) * 32 * 4) == cudaSuccess); + struct CudaFree + { + void* p; + ~CudaFree() + { + cudaFree(p); + } + } cuda_free{ dev_ptr }; + + viz::VizBuffer src{}; + src.data = dev_ptr; + src.width = 32; + src.height = 32; + src.format = PixelFormat::kRGBA8; + src.pitch = static_cast(32) * 4; + src.space = viz::MemorySpace::kDevice; + + // Without an intervening render(), in_use_ stays kSlotNone, so + // every submit() is free to pick any slot that isn't latest_. + // We expect each submit's cuda_done_writing counter to advance + // monotonically on whichever slot it landed on. + uint64_t total_signals_before = 0; + for (uint32_t i = 0; i < QuadLayer::kSlotCount; ++i) + { + total_signals_before += layer.device_image(i)->cuda_done_writing_value(); + } + constexpr uint32_t kSubmits = 8; + for (uint32_t i = 0; i < kSubmits; ++i) + { + REQUIRE_NOTHROW(layer.submit(src)); + } + REQUIRE(cudaDeviceSynchronize() == cudaSuccess); + + uint64_t total_signals_after = 0; + for (uint32_t i = 0; i < QuadLayer::kSlotCount; ++i) + { + total_signals_after += layer.device_image(i)->cuda_done_writing_value(); + } + CHECK(total_signals_after - total_signals_before == kSubmits); +} + +TEST_CASE("QuadLayer visibility toggle is independent of pipeline state", "[gpu][quad_layer]") +{ + if (!is_gpu_available()) + { + SKIP("No Vulkan-capable GPU available"); + } + VkContext ctx; + ctx.init({}); + auto target = RenderTarget::create(ctx, RenderTarget::Config{ Resolution{ 32, 32 } }); + + QuadLayer::Config cfg; + cfg.resolution = { 32, 32 }; + QuadLayer layer(ctx, target->render_pass(), cfg); + + REQUIRE(layer.is_visible()); + layer.set_visible(false); + CHECK_FALSE(layer.is_visible()); + layer.set_visible(true); + CHECK(layer.is_visible()); +} diff --git a/src/viz/session/cpp/inc/viz/session/viz_session.hpp b/src/viz/session/cpp/inc/viz/session/viz_session.hpp index 9b66ed5ce..1a4be519c 100644 --- a/src/viz/session/cpp/inc/viz/session/viz_session.hpp +++ b/src/viz/session/cpp/inc/viz/session/viz_session.hpp @@ -157,6 +157,10 @@ class VizSession uint32_t get_vk_queue_family_index() const noexcept; VkRenderPass get_render_pass() const noexcept; + // The VkContext driving this session, used by layers that build + // their own pipelines. nullptr before create() / after destroy(). + const VkContext* get_vk_context() const noexcept; + private: explicit VizSession(const Config& config); void init(); diff --git a/src/viz/session/cpp/viz_compositor.cpp b/src/viz/session/cpp/viz_compositor.cpp index b8361ffbc..9c2a6a76d 100644 --- a/src/viz/session/cpp/viz_compositor.cpp +++ b/src/viz/session/cpp/viz_compositor.cpp @@ -205,17 +205,29 @@ void VizCompositor::render(const std::vector& layers, const std::vec rp.clearValueCount = static_cast(clears.size()); rp.pClearValues = clears.data(); - vkCmdBeginRenderPass(command_buffer_, &rp, VK_SUBPASS_CONTENTS_INLINE); - - // Layer dispatch: insertion order; skip invisible. + // Snapshot the visible-layer set ONCE per frame. is_visible() is + // an atomic flag; sampling it twice across record / wait-collect + // would let a mid-frame toggle record draws but skip the + // matching cuda_done_writing wait (or vice versa), which would + // race the producer's CUDA copy. + std::vector visible_layers; + visible_layers.reserve(layers.size()); for (LayerBase* layer : layers) { if (layer != nullptr && layer->is_visible()) { - layer->record(command_buffer_, views, *render_target_); + visible_layers.push_back(layer); } } + vkCmdBeginRenderPass(command_buffer_, &rp, VK_SUBPASS_CONTENTS_INLINE); + + // Layer dispatch: insertion order, only the snapshotted visible set. + for (LayerBase* layer : visible_layers) + { + layer->record(command_buffer_, views, *render_target_); + } + vkCmdEndRenderPass(command_buffer_); check_vk(vkEndCommandBuffer(command_buffer_), "vkEndCommandBuffer"); @@ -225,16 +237,46 @@ void VizCompositor::render(const std::vector& layers, const std::vec // frame and the next render() doesn't deadlock on wait(). frame_sync_->reset(); + // Collect layer-provided wait timeline semaphores. Each visible + // layer contributes; flatten into the arrays vkQueueSubmit + // expects (with a chained VkTimelineSemaphoreSubmitInfo for the + // per-semaphore counter values). + std::vector wait_semaphores; + std::vector wait_values; + std::vector wait_stages; + for (LayerBase* layer : visible_layers) + { + for (const auto& w : layer->get_wait_semaphores()) + { + if (w.semaphore != VK_NULL_HANDLE) + { + wait_semaphores.push_back(w.semaphore); + wait_values.push_back(w.value); + wait_stages.push_back(w.wait_stage); + } + } + } + + VkTimelineSemaphoreSubmitInfo timeline{}; + timeline.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO; + timeline.waitSemaphoreValueCount = static_cast(wait_values.size()); + timeline.pWaitSemaphoreValues = wait_values.empty() ? nullptr : wait_values.data(); + VkSubmitInfo submit{}; submit.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submit.pNext = &timeline; submit.commandBufferCount = 1; submit.pCommandBuffers = &command_buffer_; + submit.waitSemaphoreCount = static_cast(wait_semaphores.size()); + submit.pWaitSemaphores = wait_semaphores.empty() ? nullptr : wait_semaphores.data(); + submit.pWaitDstStageMask = wait_stages.empty() ? nullptr : wait_stages.data(); submit_or_signal_fence(submit, "vkQueueSubmit"); // Wait for completion before returning so readback / next frame sees // a consistent state. With 1 frame in flight this is the natural // synchronization point; multi-buffered swapchain rendering moves - // this wait to the start of the next frame. + // this wait to the start of the next frame. QuadLayer's mailbox + // depends on this — see quad_layer.hpp. frame_sync_->wait(); } diff --git a/src/viz/session/cpp/viz_session.cpp b/src/viz/session/cpp/viz_session.cpp index 521b8d6ba..d9e6eea6e 100644 --- a/src/viz/session/cpp/viz_session.cpp +++ b/src/viz/session/cpp/viz_session.cpp @@ -18,8 +18,7 @@ void check_offscreen_only(DisplayMode mode, const char* what) { throw std::runtime_error(std::string("VizSession: ") + what + " is not implemented for the requested DisplayMode " - "(only kOffscreen ships in this milestone; " - "kWindow / kXr arrive with their respective backends)"); + "(only kOffscreen is currently supported)"); } } @@ -259,4 +258,9 @@ VkRenderPass VizSession::get_render_pass() const noexcept return compositor_ ? compositor_->render_pass() : VK_NULL_HANDLE; } +const VkContext* VizSession::get_vk_context() const noexcept +{ + return ctx_ptr_; +} + } // namespace viz diff --git a/src/viz/session_tests/cpp/CMakeLists.txt b/src/viz/session_tests/cpp/CMakeLists.txt index f2ac2ccac..31734b870 100644 --- a/src/viz/session_tests/cpp/CMakeLists.txt +++ b/src/viz/session_tests/cpp/CMakeLists.txt @@ -5,12 +5,15 @@ cmake_minimum_required(VERSION 3.20) add_executable(viz_session_tests test_offscreen_render.cpp + test_quad_milestone.cpp test_viz_session.cpp ) target_link_libraries(viz_session_tests PRIVATE viz::session + viz::layers viz::layers_testing + viz::test_support Catch2::Catch2WithMain ) diff --git a/src/viz/session_tests/cpp/test_offscreen_render.cpp b/src/viz/session_tests/cpp/test_offscreen_render.cpp index 9dc2ca171..f25d46d95 100644 --- a/src/viz/session_tests/cpp/test_offscreen_render.cpp +++ b/src/viz/session_tests/cpp/test_offscreen_render.cpp @@ -11,6 +11,8 @@ // VizSession + LayerBase dispatch + readback path. Everything that ships // in this milestone, exercised in one test. +#include "test_helpers.hpp" + #include #include #include @@ -27,27 +29,10 @@ using viz::VizSession; using viz::testing::ClearRectLayer; using viz::testing::ThrowingLayer; -namespace -{ +using viz::testing::is_gpu_available; -// Same NVIDIA-driver-leak workaround as viz_core_tests: any [gpu] test -// that creates a VkContext should check this first and SKIP if the -// runner has no suitable GPU. -bool gpu_available() +namespace { - static const bool cached = []() - { - for (const auto& info : viz::VkContext::enumerate_physical_devices()) - { - if (info.meets_requirements) - { - return true; - } - } - return false; - }(); - return cached; -} // RGBA8 byte at (x, y) in a tightly-packed row-major framebuffer. struct Rgba @@ -69,7 +54,7 @@ Rgba pixel_at(const HostImage& img, uint32_t x, uint32_t y) TEST_CASE("Offscreen session renders layer pixels through to readback", "[gpu][viz_session]") { - if (!gpu_available()) + if (!is_gpu_available()) { SKIP("No Vulkan-capable GPU available"); } @@ -139,7 +124,7 @@ TEST_CASE("Offscreen session renders layer pixels through to readback", "[gpu][v TEST_CASE("Hidden layer does not contribute to the framebuffer", "[gpu][viz_session]") { - if (!gpu_available()) + if (!is_gpu_available()) { SKIP("No Vulkan-capable GPU available"); } @@ -180,7 +165,7 @@ TEST_CASE("Hidden layer does not contribute to the framebuffer", "[gpu][viz_sess TEST_CASE("Multiple frames advance frame_index and avoid leaking sync state", "[gpu][viz_session]") { - if (!gpu_available()) + if (!is_gpu_available()) { SKIP("No Vulkan-capable GPU available"); } @@ -202,7 +187,7 @@ TEST_CASE("Multiple frames advance frame_index and avoid leaking sync state", "[ TEST_CASE("Session recovers from a layer that throws and renders the next frame", "[gpu][viz_session]") { - if (!gpu_available()) + if (!is_gpu_available()) { SKIP("No Vulkan-capable GPU available"); } @@ -241,7 +226,7 @@ TEST_CASE("Session recovers from a layer that throws and renders the next frame" TEST_CASE("Layer that throws does not corrupt the layer registry", "[gpu][viz_session]") { - if (!gpu_available()) + if (!is_gpu_available()) { SKIP("No Vulkan-capable GPU available"); } @@ -275,7 +260,7 @@ TEST_CASE("Layer that throws does not corrupt the layer registry", "[gpu][viz_se TEST_CASE("begin_frame / end_frame must be paired", "[gpu][viz_session]") { - if (!gpu_available()) + if (!is_gpu_available()) { SKIP("No Vulkan-capable GPU available"); } diff --git a/src/viz/session_tests/cpp/test_quad_milestone.cpp b/src/viz/session_tests/cpp/test_quad_milestone.cpp new file mode 100644 index 000000000..3f04581dc --- /dev/null +++ b/src/viz/session_tests/cpp/test_quad_milestone.cpp @@ -0,0 +1,482 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +// End-to-end CUDA-Vulkan interop through VizSession: producer writes +// pixels into a caller-owned CUDA buffer, QuadLayer::submit() copies +// them into one of the mailbox slots, the next render() samples that +// slot, readback_to_host() pulls the framebuffer back out. +// +// Pattern: 4 quadrants of {0, 255}-only RGBA — exact through any +// sRGB / UNORM gamma curve because the curve endpoints map to +// themselves. A separate midtone test covers the gamma round-trip. + +#include "test_helpers.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +using viz::DisplayMode; +using viz::HostImage; +using viz::PixelFormat; +using viz::QuadLayer; +using viz::Resolution; +using viz::VizSession; + +using viz::testing::is_gpu_available; + +namespace +{ + +// 4 quadrants, each a different {0, 255}-only color. Round-trip-exact +// through Vulkan's sRGB attachment encoding because both endpoints of +// the gamma curve (0 and 255) map to themselves. +// +// top-left = red, top-right = green, bottom-left = blue, +// bottom-right = white. +struct Rgba +{ + uint8_t r, g, b, a; +}; + +Rgba quadrant_color(uint32_t x, uint32_t y, uint32_t w, uint32_t h) +{ + const bool right = x >= w / 2; + const bool bottom = y >= h / 2; + if (!right && !bottom) + { + return { 255, 0, 0, 255 }; + } + if (right && !bottom) + { + return { 0, 255, 0, 255 }; + } + if (!right && bottom) + { + return { 0, 0, 255, 255 }; + } + return { 255, 255, 255, 255 }; +} + +std::vector build_host_pattern(uint32_t side) +{ + std::vector px(static_cast(side) * side); + for (uint32_t y = 0; y < side; ++y) + { + for (uint32_t x = 0; x < side; ++x) + { + px[static_cast(y) * side + x] = quadrant_color(x, y, side, side); + } + } + return px; +} + +Rgba pixel_at(const HostImage& img, uint32_t x, uint32_t y) +{ + const size_t i = (static_cast(y) * img.resolution().width + x) * 4; + const uint8_t* p = img.data() + i; + return Rgba{ p[0], p[1], p[2], p[3] }; +} + +// Asserts the readback contains the 4-quadrant pattern at the four +// quadrant centers. Centers (kSide/4, kSide/4) etc. are deep inside +// each color region, far from any rasterization-edge ambiguity. +void check_quadrant_pattern(const HostImage& image, uint32_t side) +{ + const Rgba top_left = pixel_at(image, side / 4, side / 4); + CHECK(top_left.r == 255); + CHECK(top_left.g == 0); + CHECK(top_left.b == 0); + CHECK(top_left.a == 255); + + const Rgba top_right = pixel_at(image, 3 * side / 4, side / 4); + CHECK(top_right.r == 0); + CHECK(top_right.g == 255); + CHECK(top_right.b == 0); + CHECK(top_right.a == 255); + + const Rgba bottom_left = pixel_at(image, side / 4, 3 * side / 4); + CHECK(bottom_left.r == 0); + CHECK(bottom_left.g == 0); + CHECK(bottom_left.b == 255); + CHECK(bottom_left.a == 255); + + const Rgba bottom_right = pixel_at(image, 3 * side / 4, 3 * side / 4); + CHECK(bottom_right.r == 255); + CHECK(bottom_right.g == 255); + CHECK(bottom_right.b == 255); + CHECK(bottom_right.a == 255); +} + +// RAII wrapper that frees the cudaMalloc'd device pointer on scope exit. +struct CudaFreeGuard +{ + void* p; + ~CudaFreeGuard() + { + cudaFree(p); + } +}; + +} // namespace + +TEST_CASE("QuadLayer submit() round-trips CUDA pixels to readback", "[gpu][quad_layer][milestone]") +{ + if (!is_gpu_available()) + { + SKIP("No Vulkan-capable GPU available"); + } + + constexpr uint32_t kSide = 64; + + VizSession::Config cfg{}; + cfg.mode = DisplayMode::kOffscreen; + cfg.window_width = kSide; + cfg.window_height = kSide; + + auto session = VizSession::create(cfg); + REQUIRE(session != nullptr); + REQUIRE(session->get_state() == viz::SessionState::kReady); + + const auto* ctx = session->get_vk_context(); + REQUIRE(ctx != nullptr); + const VkRenderPass render_pass = session->get_render_pass(); + REQUIRE(render_pass != VK_NULL_HANDLE); + + QuadLayer::Config layer_cfg; + layer_cfg.name = "milestone_quad_mode_a"; + layer_cfg.resolution = { kSide, kSide }; + auto* layer = session->add_layer(*ctx, render_pass, layer_cfg); + REQUIRE(layer != nullptr); + + // Stage the pattern in a caller-owned cudaMalloc'd buffer — this + // mirrors how a real Mode A consumer (camera decoder, NN renderer) + // hands data to submit(). + const auto host_pattern = build_host_pattern(kSide); + void* device_ptr = nullptr; + REQUIRE(cudaMalloc(&device_ptr, host_pattern.size() * sizeof(Rgba)) == cudaSuccess); + CudaFreeGuard guard{ device_ptr }; + REQUIRE(cudaMemcpy(device_ptr, host_pattern.data(), host_pattern.size() * sizeof(Rgba), cudaMemcpyHostToDevice) == + cudaSuccess); + REQUIRE(cudaDeviceSynchronize() == cudaSuccess); + + viz::VizBuffer src{}; + src.data = device_ptr; + src.width = kSide; + src.height = kSide; + src.format = PixelFormat::kRGBA8; + src.pitch = static_cast(kSide) * 4; + src.space = viz::MemorySpace::kDevice; + layer->submit(src); + + const auto info = session->render(); + CHECK(info.frame_index == 0); + CHECK(info.resolution.width == kSide); + CHECK(info.resolution.height == kSide); + + const auto image = session->readback_to_host(); + REQUIRE(image.resolution().width == kSide); + REQUIRE(image.resolution().height == kSide); + check_quadrant_pattern(image, kSide); +} + +TEST_CASE("QuadLayer multi-frame submit/render/readback loop stays correct", "[gpu][quad_layer][milestone]") +{ + if (!is_gpu_available()) + { + SKIP("No Vulkan-capable GPU available"); + } + + constexpr uint32_t kSide = 64; + constexpr int kFrames = 16; + + VizSession::Config cfg{}; + cfg.mode = DisplayMode::kOffscreen; + cfg.window_width = kSide; + cfg.window_height = kSide; + + auto session = VizSession::create(cfg); + REQUIRE(session != nullptr); + const auto* ctx = session->get_vk_context(); + REQUIRE(ctx != nullptr); + + QuadLayer::Config layer_cfg; + layer_cfg.name = "milestone_quad_multiframe"; + layer_cfg.resolution = { kSide, kSide }; + auto* layer = session->add_layer(*ctx, session->get_render_pass(), layer_cfg); + REQUIRE(layer != nullptr); + + void* device_ptr = nullptr; + REQUIRE(cudaMalloc(&device_ptr, static_cast(kSide) * kSide * 4) == cudaSuccess); + CudaFreeGuard guard{ device_ptr }; + + // Each frame fills with a different solid-color palette entry + // (channels in {0, 255} for sRGB-exact round-trip). With the + // 3-slot mailbox the producer's submit and the renderer's draw + // pipeline naturally; frame N's readback must still contain + // frame N's color, not a stale or torn frame. + const std::array palette = { { + { 255, 0, 0, 255 }, + { 0, 255, 0, 255 }, + { 0, 0, 255, 255 }, + { 255, 255, 255, 255 }, + } }; + + std::vector host_buf(static_cast(kSide) * kSide); + for (int frame = 0; frame < kFrames; ++frame) + { + const Rgba expected = palette[frame % palette.size()]; + std::fill(host_buf.begin(), host_buf.end(), expected); + REQUIRE(cudaMemcpy(device_ptr, host_buf.data(), host_buf.size() * sizeof(Rgba), cudaMemcpyHostToDevice) == + cudaSuccess); + + viz::VizBuffer src{}; + src.data = device_ptr; + src.width = kSide; + src.height = kSide; + src.format = PixelFormat::kRGBA8; + src.pitch = static_cast(kSide) * 4; + src.space = viz::MemorySpace::kDevice; + layer->submit(src); + + session->render(); + + const auto image = session->readback_to_host(); + const auto sample = pixel_at(image, kSide / 2, kSide / 2); + CHECK(sample.r == expected.r); + CHECK(sample.g == expected.g); + CHECK(sample.b == expected.b); + CHECK(sample.a == expected.a); + } +} + +TEST_CASE("QuadLayer round-trips midtone RGBA values exactly", "[gpu][quad_layer][milestone]") +{ + // The {0, 255}-only round-trip tests don't exercise the sRGB + // color-space round-trip — those endpoints map to themselves + // through any gamma curve. Mid-range bytes are only exact when + // the storage UNORM image is sampled through an SRGB view + // (decode at sample) and the SRGB color attachment encodes on + // write. Net of decode+encode is identity. + if (!is_gpu_available()) + { + SKIP("No Vulkan-capable GPU available"); + } + + constexpr uint32_t kSide = 64; + VizSession::Config cfg{}; + cfg.mode = DisplayMode::kOffscreen; + cfg.window_width = kSide; + cfg.window_height = kSide; + + auto session = VizSession::create(cfg); + REQUIRE(session != nullptr); + const auto* ctx = session->get_vk_context(); + REQUIRE(ctx != nullptr); + + QuadLayer::Config layer_cfg; + layer_cfg.name = "milestone_quad_midtone"; + layer_cfg.resolution = { kSide, kSide }; + auto* layer = session->add_layer(*ctx, session->get_render_pass(), layer_cfg); + REQUIRE(layer != nullptr); + + // A non-trivial midtone (~50% gray, mixed channels). With the + // wrong color-space wiring this would shift by ~5–20% per channel. + constexpr Rgba kExpected = { 64, 128, 200, 255 }; + + std::vector host_buf(static_cast(kSide) * kSide, kExpected); + void* device_ptr = nullptr; + REQUIRE(cudaMalloc(&device_ptr, host_buf.size() * sizeof(Rgba)) == cudaSuccess); + CudaFreeGuard guard{ device_ptr }; + REQUIRE(cudaMemcpy(device_ptr, host_buf.data(), host_buf.size() * sizeof(Rgba), cudaMemcpyHostToDevice) == + cudaSuccess); + REQUIRE(cudaDeviceSynchronize() == cudaSuccess); + + viz::VizBuffer src{}; + src.data = device_ptr; + src.width = kSide; + src.height = kSide; + src.format = PixelFormat::kRGBA8; + src.pitch = static_cast(kSide) * 4; + src.space = viz::MemorySpace::kDevice; + layer->submit(src); + + session->render(); + + const auto image = session->readback_to_host(); + const auto sample = pixel_at(image, kSide / 2, kSide / 2); + // Round-trip should be exact; allow ±1 LSB for any quantization + // edge case on the Vulkan->host blit path. + CHECK(std::abs(int(sample.r) - int(kExpected.r)) <= 1); + CHECK(std::abs(int(sample.g) - int(kExpected.g)) <= 1); + CHECK(std::abs(int(sample.b) - int(kExpected.b)) <= 1); + CHECK(sample.a == kExpected.a); +} + +TEST_CASE("QuadLayer with no submit yet renders the clear color", "[gpu][quad_layer][milestone]") +{ + // Pins the kSlotNone short-circuit in record() / get_wait_semaphores(). + if (!is_gpu_available()) + { + SKIP("No Vulkan-capable GPU available"); + } + + constexpr uint32_t kSide = 64; + VizSession::Config cfg{}; + cfg.mode = DisplayMode::kOffscreen; + cfg.window_width = kSide; + cfg.window_height = kSide; + // Distinctive non-default clear so a coincidental black draw can't pass. + cfg.clear_color[0] = 0.0f; + cfg.clear_color[1] = 1.0f; + cfg.clear_color[2] = 0.0f; + cfg.clear_color[3] = 1.0f; + + auto session = VizSession::create(cfg); + REQUIRE(session != nullptr); + const auto* ctx = session->get_vk_context(); + REQUIRE(ctx != nullptr); + + QuadLayer::Config layer_cfg; + layer_cfg.name = "milestone_quad_no_submit"; + layer_cfg.resolution = { kSide, kSide }; + auto* layer = session->add_layer(*ctx, session->get_render_pass(), layer_cfg); + REQUIRE(layer != nullptr); + + session->render(); + const auto image = session->readback_to_host(); + const auto sample = pixel_at(image, kSide / 2, kSide / 2); + CHECK(sample.r == 0); + CHECK(sample.g == 255); + CHECK(sample.b == 0); + CHECK(sample.a == 255); +} + +TEST_CASE("QuadLayer re-renders the same publish when no new submit arrives", "[gpu][quad_layer][milestone]") +{ + // Pins: record() keeps in_use_ stable across frames if latest_ doesn't change. + if (!is_gpu_available()) + { + SKIP("No Vulkan-capable GPU available"); + } + + constexpr uint32_t kSide = 64; + VizSession::Config cfg{}; + cfg.mode = DisplayMode::kOffscreen; + cfg.window_width = kSide; + cfg.window_height = kSide; + + auto session = VizSession::create(cfg); + REQUIRE(session != nullptr); + const auto* ctx = session->get_vk_context(); + REQUIRE(ctx != nullptr); + + QuadLayer::Config layer_cfg; + layer_cfg.name = "milestone_quad_resubmit_none"; + layer_cfg.resolution = { kSide, kSide }; + auto* layer = session->add_layer(*ctx, session->get_render_pass(), layer_cfg); + REQUIRE(layer != nullptr); + + constexpr Rgba kColor = { 255, 0, 255, 255 }; + std::vector host_buf(static_cast(kSide) * kSide, kColor); + void* device_ptr = nullptr; + REQUIRE(cudaMalloc(&device_ptr, host_buf.size() * sizeof(Rgba)) == cudaSuccess); + CudaFreeGuard guard{ device_ptr }; + REQUIRE(cudaMemcpy(device_ptr, host_buf.data(), host_buf.size() * sizeof(Rgba), cudaMemcpyHostToDevice) == + cudaSuccess); + + viz::VizBuffer src{}; + src.data = device_ptr; + src.width = kSide; + src.height = kSide; + src.format = PixelFormat::kRGBA8; + src.pitch = static_cast(kSide) * 4; + src.space = viz::MemorySpace::kDevice; + layer->submit(src); + + for (int i = 0; i < 2; ++i) + { + session->render(); + const auto image = session->readback_to_host(); + const auto sample = pixel_at(image, kSide / 2, kSide / 2); + CHECK(sample.r == kColor.r); + CHECK(sample.g == kColor.g); + CHECK(sample.b == kColor.b); + CHECK(sample.a == kColor.a); + } +} + +TEST_CASE("QuadLayer fast producer: render samples only the latest publish", "[gpu][quad_layer][milestone]") +{ + // Pins the core mailbox guarantee — intermediate publishes are dropped. + if (!is_gpu_available()) + { + SKIP("No Vulkan-capable GPU available"); + } + + constexpr uint32_t kSide = 64; + VizSession::Config cfg{}; + cfg.mode = DisplayMode::kOffscreen; + cfg.window_width = kSide; + cfg.window_height = kSide; + + auto session = VizSession::create(cfg); + REQUIRE(session != nullptr); + const auto* ctx = session->get_vk_context(); + REQUIRE(ctx != nullptr); + + QuadLayer::Config layer_cfg; + layer_cfg.name = "milestone_quad_fast_producer"; + layer_cfg.resolution = { kSide, kSide }; + auto* layer = session->add_layer(*ctx, session->get_render_pass(), layer_cfg); + REQUIRE(layer != nullptr); + + void* device_ptr = nullptr; + REQUIRE(cudaMalloc(&device_ptr, static_cast(kSide) * kSide * 4) == cudaSuccess); + CudaFreeGuard guard{ device_ptr }; + + // Five back-to-back submits, no intervening render. The last one must win. + const std::array palette = { { + { 255, 0, 0, 255 }, + { 0, 255, 0, 255 }, + { 0, 0, 255, 255 }, + { 255, 255, 0, 255 }, + { 0, 255, 255, 255 }, + } }; + + std::vector host_buf(static_cast(kSide) * kSide); + for (const auto& color : palette) + { + std::fill(host_buf.begin(), host_buf.end(), color); + REQUIRE(cudaMemcpy(device_ptr, host_buf.data(), host_buf.size() * sizeof(Rgba), cudaMemcpyHostToDevice) == + cudaSuccess); + + viz::VizBuffer src{}; + src.data = device_ptr; + src.width = kSide; + src.height = kSide; + src.format = PixelFormat::kRGBA8; + src.pitch = static_cast(kSide) * 4; + src.space = viz::MemorySpace::kDevice; + layer->submit(src); + } + + session->render(); + const auto image = session->readback_to_host(); + const auto sample = pixel_at(image, kSide / 2, kSide / 2); + const auto expected = palette.back(); + CHECK(sample.r == expected.r); + CHECK(sample.g == expected.g); + CHECK(sample.b == expected.b); + CHECK(sample.a == expected.a); +} diff --git a/src/viz/shaders/cpp/textured_quad.frag b/src/viz/shaders/cpp/textured_quad.frag index 29b92d6c9..00243763b 100644 --- a/src/viz/shaders/cpp/textured_quad.frag +++ b/src/viz/shaders/cpp/textured_quad.frag @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // Samples a combined image sampler at descriptor binding 0. Used by -// QuadLayer (ships in M3b) to display a CUDA-fed texture as a quad. +// QuadLayer to display a CUDA-fed texture as a fullscreen quad. #version 450