diff --git a/src/viz/AGENTS.md b/src/viz/AGENTS.md
index 67735dedd..ec80d7b2b 100644
--- a/src/viz/AGENTS.md
+++ b/src/viz/AGENTS.md
@@ -16,27 +16,44 @@ single sub-module. Each sub-module is its own static library with its own
 sibling `<sub-module>_tests/` directory:
 
 - **`viz/core/`** — foundational types + Vulkan/CUDA infrastructure.
-  Library: `viz_core`. Today: `VkContext`, `VizBuffer`, `Pose3D`, `Fov`,
-  `Resolution`, `ViewInfo`, `PixelFormat`, `RenderTarget`, `FrameSync`,
-  `HostImage`, `DeviceImage`. `HostImage` / `DeviceImage` are the
-  symmetric pair of owning 2D pixel buffers (CPU bytes vs CUDA-Vulkan
-  interop) — both expose `VizBuffer view()` so generic helpers branch
-  on `VizBuffer::space`. Math types (`glm::vec3`, `glm::quat`,
-  `glm::mat4`) come from GLM 1.0.1 (FetchContent in
-  `deps/third_party/`); use `glm::value_ptr(mat)` to get a raw `float*`
-  for Vulkan / CUDA upload (POD-equivalent layout, no copy).
-  CUDA-Vulkan interop requires CUDA Toolkit at link time
-  (`CUDAToolkit::cudart`). `VkContext::init()` matches the current
-  CUDA device to the chosen Vulkan physical device by UUID — every
-  viz_core type can assume CUDA and Vulkan are talking to the same
-  GPU without re-doing the match.
-- **`viz/layers/`** — `LayerBase` and concrete layers (`QuadLayer`, etc.).
-  Library: `viz_layers` (INTERFACE / header-only today; promoted to
-  STATIC when the first concrete layer ships). Depends on `viz_core`.
-  Test-only fixture layers (`ClearRectLayer`, future `ColoredQuadLayer`)
-  live in `viz/layers_tests/cpp/inc/viz/layers/testing/` and are exposed
-  via the `viz::layers_testing` static library — used by other test
-  binaries (e.g. `viz_session_tests`) to compose into a `VizSession`.
+  Library: `viz_core`. Today: `VkContext`, `VizBuffer`, `Pose3D`,
+  `Fov`, `Resolution`, `ViewInfo`, `PixelFormat`, `RenderTarget`,
+  `FrameSync`, `HostImage`, `DeviceImage`. `HostImage` owns CPU bytes
+  and exposes a `VizBuffer view()`; `DeviceImage` owns CUDA-Vulkan
+  interop memory (VkImage + cudaArray_t) plus one timeline semaphore
+  (`cuda_done_writing`) that layers expose to the compositor for
+  fine-grained producer→consumer sync. The reverse direction (consumer
+  done reading) is handled at the layer level by buffering enough
+  slots that the producer never targets a still-in-flight read.
+  Consumed via discrete accessors — no `view()` because `cudaArray_t`
+  is opaque tiled memory, not a CUDA device pointer. `VizBuffer`
+  covers linear pointer-backed memory (CPU bytes / CUDA device
+  pointer; exposes `__cuda_array_interface__` / `__array_interface__`
+  in Python). Math types (`glm::vec3`, `glm::quat`, `glm::mat4`) come
+  from GLM 1.0.1 (FetchContent in `deps/third_party/`); use
+  `glm::value_ptr(mat)` to get a raw `float*` for Vulkan / CUDA upload
+  (POD-equivalent layout, no copy). CUDA-Vulkan interop requires CUDA
+  Toolkit at link time (`CUDAToolkit::cudart`). `VkContext::init()`
+  matches the current CUDA device to the chosen Vulkan physical
+  device by UUID — every viz_core type can assume CUDA and Vulkan
+  are talking to the same GPU without re-doing the match.
+- **`viz/layers/`** — `LayerBase` and concrete layers. Library:
+  `viz_layers` (STATIC). Depends on `viz_core` + `viz_shaders`. Today:
+  `QuadLayer` (textured fullscreen quad, CUDA-fed via a 3-slot
+  mailbox of `DeviceImage`s; producer side is `submit(VizBuffer)` and
+  the renderer always samples the most recently published slot).
+  Pipelines built per-layer using the driver-side `VkPipelineCache`
+  from `VkContext`. **Deferred:** a zero-copy `acquire`/`release`
+  variant for producers that can write directly into a tiled
+  `cudaArray_t` (NVDEC, custom CUDA kernels). The mailbox internals
+  already track slot ownership, so the addition is local to
+  `QuadLayer`; revisit when a real producer demands it. Open design
+  questions are captured under "Future: zero-copy acquire/release"
+  in `DESIGN.md`.
+  Test-only fixture layers (`ClearRectLayer`, `ThrowingLayer`) live in
+  `viz/layers_tests/cpp/inc/viz/layers/testing/` and are exposed via
+  the `viz::layers_testing` static library — used by `viz_session_tests`
+  to compose into a `VizSession`.
 - **`viz/session/`** — `VizSession`, `VizCompositor`, `FrameInfo`,
   `FrameTimingStats`, `SessionState`, display backends (today: offscreen
   only; window/XR added by their respective backends). Library:
@@ -75,11 +92,10 @@ Build paths that ship viz (the wheel CI on Linux + Windows) pass
 When `BUILD_VIZ=ON` the build machine must have:
 - **Vulkan headers + loader**: `libvulkan-dev` on Linux, LunarG SDK on
   Windows.
-- **CUDA Toolkit** (cudart for link, nvcc not strictly required today
-  but expected to be needed for kernels in M3b+): apt
-  `nvidia-cuda-toolkit` or the official NVIDIA installer / CI action
-  (`Jimver/cuda-toolkit`). The wheel excludes `libcuda.so.1` —
-  consumers supply it via NVIDIA driver.
+- **CUDA Toolkit** (cudart for link; nvcc once we ship CUDA kernels):
+  apt `nvidia-cuda-toolkit` or the official NVIDIA installer /
+  CI action (`Jimver/cuda-toolkit`). The wheel excludes
+  `libcuda.so.1` — consumers supply it via NVIDIA driver.
 - **glslangValidator** for shader compilation: `glslang-tools` apt
   package on Linux, `brew install glslang` on macOS, ships with the
   Vulkan SDK on Windows.
diff --git a/src/viz/core/cpp/device_image.cpp b/src/viz/core/cpp/device_image.cpp
index a02053b58..d6b53246f 100644
--- a/src/viz/core/cpp/device_image.cpp
+++ b/src/viz/core/cpp/device_image.cpp
@@ -7,11 +7,9 @@
 #include <stdexcept>
 #include <string>
 
-// Posix close() lives in <unistd.h> on Linux/macOS; Windows uses _close()
-// from <io.h>. The fd-close path is unreachable at runtime on Windows
-// (vkGetMemoryFdKHR isn't available there — import_to_cuda throws before
-// memory_fd_ is ever assigned), but the code still has to compile under
-// MSVC for the experimental Windows build.
+// Posix close() vs Windows _close() shim — the fd-close path is
+// dead on Windows (vkGetMemoryFdKHR isn't available there) but
+// still has to compile under MSVC.
 #ifdef _WIN32
 #    include <io.h>
 namespace
@@ -68,14 +66,20 @@ uint32_t find_memory_type(VkPhysicalDevice physical_device, uint32_t type_bits,
     throw std::runtime_error("DeviceImage: no Vulkan memory type matching requested properties");
 }
 
-VkFormat to_vk_format(PixelFormat format)
+// Storage-side Vulkan format for the underlying VkImage / VkDeviceMemory.
+// We keep the storage UNORM and create a separate SRGB sampling view
+// (image is created with VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) so:
+//   - CUDA writes raw bytes (no implicit gamma transform).
+//   - Vulkan samples through the SRGB view → sampler decodes
+//     sRGB -> linear.
+//   - Fragment writes linear -> sRGB encode at the attachment.
+// Net effect: arbitrary RGBA byte values round-trip exactly through
+// CUDA -> Vulkan -> readback.
+VkFormat to_vk_storage_format(PixelFormat format)
 {
     switch (format)
     {
     case PixelFormat::kRGBA8:
-        // UNORM (not SRGB) so CUDA writes round-trip without an
-        // implicit sRGB encode on the Vulkan side. Color management
-        // is the layer's concern.
         return VK_FORMAT_R8G8B8A8_UNORM;
     case PixelFormat::kD32F:
         return VK_FORMAT_D32_SFLOAT;
@@ -83,6 +87,18 @@ VkFormat to_vk_format(PixelFormat format)
     throw std::runtime_error("DeviceImage: unsupported PixelFormat");
 }
 
+VkFormat to_vk_view_format(PixelFormat format)
+{
+    switch (format)
+    {
+    case PixelFormat::kRGBA8:
+        return VK_FORMAT_R8G8B8A8_SRGB;
+    case PixelFormat::kD32F:
+        return VK_FORMAT_D32_SFLOAT;
+    }
+    throw std::runtime_error("DeviceImage: unsupported PixelFormat");
+}
+
 cudaChannelFormatDesc to_cuda_format(PixelFormat format)
 {
     switch (format)
@@ -107,13 +123,21 @@ std::unique_ptr<DeviceImage> DeviceImage::create(const VkContext& ctx, Resolutio
     {
         throw std::invalid_argument("DeviceImage: resolution must be non-zero");
     }
+    if (format != PixelFormat::kRGBA8)
+    {
+        // kD32F is reserved for ProjectionLayer's depth path. The
+        // CUDA-Vulkan interop contract for a depth image (sample
+        // semantics, layout transitions, color-space view) is not
+        // worked out yet, so refuse to half-build it.
+        throw std::invalid_argument("DeviceImage: only PixelFormat::kRGBA8 is supported");
+    }
     std::unique_ptr<DeviceImage> img(new DeviceImage(ctx, resolution, format));
     img->init();
     return img;
 }
 
 DeviceImage::DeviceImage(const VkContext& ctx, Resolution resolution, PixelFormat format)
-    : ctx_(&ctx), resolution_(resolution), format_(format), vk_format_(to_vk_format(format))
+    : ctx_(&ctx), resolution_(resolution), format_(format), vk_format_(to_vk_view_format(format))
 {
 }
 
@@ -129,6 +153,7 @@ void DeviceImage::init()
         create_vk_image_with_external_memory();
         create_vk_image_view();
         import_to_cuda();
+        create_interop_semaphores();
         transition_to_shader_read();
     }
     catch (...)
@@ -140,23 +165,24 @@ void DeviceImage::init()
 
 void DeviceImage::destroy()
 {
-    // Pin CUDA device for this thread so the CUDA frees below land on
-    // the right device even if destroy() runs on a thread that never
-    // ran VkContext::init(). Best-effort — destructor must not throw.
+    // Pin CUDA device on the destroying thread (best-effort; we
+    // can't throw out of a destructor).
     if (ctx_ != nullptr && ctx_->cuda_device_id() >= 0)
     {
         (void)cudaSetDevice(ctx_->cuda_device_id());
     }
 
-    // CUDA side first; CUDA holds a dup'd handle on the underlying
-    // memory, so the VkDeviceMemory must outlive the CUDA mapping.
-    // cudaDeviceSynchronize ensures any caller-issued async CUDA work
-    // (e.g. cudaMemcpy2DToArrayAsync) has retired before we free the
-    // array — otherwise CUDA may UAF its own staging.
-    if (cuda_mipmapped_array_ != nullptr || cuda_external_memory_ != nullptr)
+    // CUDA side first — VkDeviceMemory must outlive the CUDA
+    // mapping. Sync drains any caller-issued async work first.
+    if (cuda_mipmapped_array_ != nullptr || cuda_external_memory_ != nullptr || cuda_cuda_done_writing_ != nullptr)
     {
         (void)cudaDeviceSynchronize();
     }
+    if (cuda_cuda_done_writing_ != nullptr)
+    {
+        (void)cudaDestroyExternalSemaphore(cuda_cuda_done_writing_);
+        cuda_cuda_done_writing_ = nullptr;
+    }
     if (cuda_mipmapped_array_ != nullptr)
     {
         (void)cudaFreeMipmappedArray(cuda_mipmapped_array_);
@@ -170,9 +196,8 @@ void DeviceImage::destroy()
     }
     if (memory_fd_ >= 0)
     {
-        // CUDA dups the fd internally on import, so we close our copy.
-        // If import failed before our explicit close, fd_ may still
-        // hold our copy — close it here.
+        // CUDA dup'd the fd on import; close ours. Also handles the
+        // import-failed-before-close case.
         close_fd(memory_fd_);
         memory_fd_ = -1;
     }
@@ -189,6 +214,11 @@ void DeviceImage::destroy()
     // Wait for all GPU work to retire before tearing down Vulkan
     // resources.
     (void)vkDeviceWaitIdle(device);
+    if (cuda_done_writing_ != VK_NULL_HANDLE)
+    {
+        vkDestroySemaphore(device, cuda_done_writing_, nullptr);
+        cuda_done_writing_ = VK_NULL_HANDLE;
+    }
     if (command_pool_ != VK_NULL_HANDLE)
     {
         vkDestroyCommandPool(device, command_pool_, nullptr);
@@ -212,18 +242,6 @@ void DeviceImage::destroy()
     current_layout_ = VK_IMAGE_LAYOUT_UNDEFINED;
 }
 
-VizBuffer DeviceImage::view() const noexcept
-{
-    VizBuffer b;
-    b.data = static_cast<void*>(cuda_array_);
-    b.width = resolution_.width;
-    b.height = resolution_.height;
-    b.format = format_;
-    b.pitch = static_cast<size_t>(resolution_.width) * bytes_per_pixel(format_);
-    b.space = MemorySpace::kDevice;
-    return b;
-}
-
 void DeviceImage::create_vk_image_with_external_memory()
 {
     const VkDevice device = ctx_->device();
@@ -239,13 +257,16 @@ void DeviceImage::create_vk_image_with_external_memory()
     info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
     info.pNext = &ext_image_info;
     info.imageType = VK_IMAGE_TYPE_2D;
-    info.format = vk_format_;
+    // Storage in linear-space format (UNORM); we'll attach the SRGB
+    // view in create_vk_image_view(). VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT
+    // is what allows view format != image format among compatible
+    // formats (UNORM <-> SRGB are in the same compatibility class).
+    info.flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT;
+    info.format = to_vk_storage_format(format_);
     info.extent = { resolution_.width, resolution_.height, 1 };
-    info.mipLevels = 1; // Single level — when minification moiré shows up in
-                        // XR distance views, expose mipLevels via Config and
-                        // generate the chain via vkCmdBlitImage pre-render.
-                        // Anisotropic filtering on the sampler is the cheaper
-                        // first line of defense.
+    info.mipLevels = 1; // Single level. If XR distance views show
+                        // moiré, expose mipLevels via Config and
+                        // generate via vkCmdBlitImage pre-render.
     info.arrayLayers = 1;
     info.samples = VK_SAMPLE_COUNT_1_BIT;
     info.tiling = VK_IMAGE_TILING_OPTIMAL;
@@ -258,10 +279,8 @@ void DeviceImage::create_vk_image_with_external_memory()
     VkMemoryRequirements reqs;
     vkGetImageMemoryRequirements(device, image_, &reqs);
 
-    // Memory backing the image: device-local + exportable as POSIX fd.
-    // No VkMemoryDedicatedAllocateInfo / cudaExternalMemoryDedicated —
-    // a generic allocation works for sampled 2D images and avoids the
-    // dedicated-allocation extension wiring.
+    // Device-local + exportable as POSIX fd. Generic allocation
+    // (no VkMemoryDedicatedAllocateInfo) suffices for sampled 2D.
     VkExportMemoryAllocateInfo export_info{};
     export_info.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO;
     export_info.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT;
@@ -314,10 +333,8 @@ void DeviceImage::create_vk_image_view()
 
 void DeviceImage::import_to_cuda()
 {
-    // cudaSetDevice is per-host-thread; VkContext set it on the init
-    // thread, but DeviceImage::create() may run on a different one.
-    // Re-pin here so cudaImportExternalMemory / GetMappedMipmappedArray
-    // talk to the same physical GPU as Vulkan.
+    // cudaSetDevice is per-host-thread; VkContext sets it on the
+    // init thread, re-pin here for worker-thread create() callers.
     check_cuda(cudaSetDevice(ctx_->cuda_device_id()), "cudaSetDevice");
 
     VkMemoryRequirements reqs;
@@ -347,6 +364,76 @@ void DeviceImage::import_to_cuda()
     check_cuda(cudaGetMipmappedArrayLevel(&cuda_array_, cuda_mipmapped_array_, 0), "cudaGetMipmappedArrayLevel");
 }
 
+void DeviceImage::create_interop_semaphores()
+{
+    const VkDevice device = ctx_->device();
+
+    auto vkGetSemaphoreFdKHR =
+        reinterpret_cast<PFN_vkGetSemaphoreFdKHR>(vkGetDeviceProcAddr(device, "vkGetSemaphoreFdKHR"));
+    if (vkGetSemaphoreFdKHR == nullptr)
+    {
+        throw std::runtime_error(
+            "DeviceImage: vkGetSemaphoreFdKHR not available "
+            "(VK_KHR_external_semaphore_fd not enabled?)");
+    }
+
+    // Timeline semaphore (initial value 0) exported via OPAQUE_FD and
+    // imported into CUDA. CUDA dups the fd internally; we close ours
+    // after the import.
+    VkSemaphoreTypeCreateInfo type_info{};
+    type_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO;
+    type_info.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE;
+    type_info.initialValue = 0;
+
+    VkExportSemaphoreCreateInfo export_info{};
+    export_info.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO;
+    export_info.pNext = &type_info;
+    export_info.handleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT;
+
+    VkSemaphoreCreateInfo info{};
+    info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+    info.pNext = &export_info;
+    check_vk(vkCreateSemaphore(device, &info, nullptr, &cuda_done_writing_), "vkCreateSemaphore");
+
+    int fd = -1;
+    VkSemaphoreGetFdInfoKHR fd_info{};
+    fd_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR;
+    fd_info.semaphore = cuda_done_writing_;
+    fd_info.handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT;
+    check_vk(vkGetSemaphoreFdKHR(device, &fd_info, &fd), "vkGetSemaphoreFdKHR");
+
+    cudaExternalSemaphoreHandleDesc ext_desc{};
+    ext_desc.type = cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd;
+    ext_desc.handle.fd = fd;
+    const cudaError_t err = cudaImportExternalSemaphore(&cuda_cuda_done_writing_, &ext_desc);
+    if (err != cudaSuccess)
+    {
+        close_fd(fd);
+        throw std::runtime_error(std::string("DeviceImage: cudaImportExternalSemaphore(cuda_done_writing) failed: ") +
+                                 cudaGetErrorString(err));
+    }
+    close_fd(fd);
+}
+
+void DeviceImage::cuda_signal_write_done(cudaStream_t stream)
+{
+    // Reserve, signal, commit on success. Failed signal leaves _value_
+    // at the last successfully signaled value (consumer keeps a valid
+    // wait target; failed frame is dropped). Single producer per
+    // DeviceImage → reserved is always > _value_, so a release store
+    // suffices.
+    const uint64_t reserved = cuda_done_writing_next_.fetch_add(1, std::memory_order_acq_rel) + 1;
+    cudaExternalSemaphoreSignalParams params{};
+    params.params.fence.value = reserved;
+    const cudaError_t err = cudaSignalExternalSemaphoresAsync(&cuda_cuda_done_writing_, &params, 1, stream);
+    if (err != cudaSuccess)
+    {
+        throw std::runtime_error(std::string("DeviceImage: cudaSignalExternalSemaphoresAsync(cuda_done_writing) failed: ") +
+                                 cudaGetErrorString(err));
+    }
+    cuda_done_writing_value_.store(reserved, std::memory_order_release);
+}
+
 void DeviceImage::transition_to_shader_read()
 {
     if (current_layout_ == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)
diff --git a/src/viz/core/cpp/inc/viz/core/device_image.hpp b/src/viz/core/cpp/inc/viz/core/device_image.hpp
index 5fd82e364..a98c8876c 100644
--- a/src/viz/core/cpp/inc/viz/core/device_image.hpp
+++ b/src/viz/core/cpp/inc/viz/core/device_image.hpp
@@ -3,10 +3,12 @@
 
 #pragma once
 
-#include <viz/core/viz_buffer.hpp>
+#include <viz/core/viz_buffer.hpp> // PixelFormat — used in API signatures
 #include <viz/core/viz_types.hpp>
 #include <vulkan/vulkan.h>
 
+#include <atomic>
+#include <cstdint>
 #include <cuda_runtime.h>
 #include <memory>
 
@@ -19,12 +21,21 @@ class VkContext;
 // (optimal tiling, sampled + transfer-dst); the backing VkDeviceMemory
 // is exported via VK_KHR_external_memory_fd and imported into CUDA as
 // a cudaArray_t. CUDA writes via cuda_array(); Vulkan samples via
-// vk_image(). Symmetric counterpart to HostImage; both expose
-// VizBuffer view() so helpers branch on VizBuffer::space.
+// vk_image().
 //
-// Synchronization is heavyweight today (cudaDeviceSynchronize +
-// vkQueueWaitIdle); paired acquire / release semaphores arrive with
-// QuadLayer. CUDA/Vulkan device matching is handled by VkContext.
+// Conceptually paired with HostImage (CPU bytes vs GPU interop bytes),
+// but they don't share a view() return type: a cudaArray_t is opaque
+// tiled GPU memory and is NOT a CUDA device pointer, so wrapping it
+// as a VizBuffer would lie about that type's contract. Callers consume
+// DeviceImage via discrete accessors instead.
+//
+// Producer→consumer synchronization is one-way: a Vulkan timeline
+// semaphore exported to CUDA. CUDA increments cuda_done_writing
+// after filling; Vulkan waits for the latest known value before
+// sampling. The reverse direction is the producer's problem to solve
+// at a higher level (e.g. QuadLayer's mailbox owns enough buffers
+// that producer writes never collide with in-flight Vulkan reads).
+// CUDA / Vulkan device matching is handled by VkContext.
 class DeviceImage
 {
 public:
@@ -40,11 +51,6 @@ class DeviceImage
     DeviceImage(DeviceImage&&) = delete;
     DeviceImage& operator=(DeviceImage&&) = delete;
 
-    // VizBuffer view (kDevice). `data` is the cudaArray_t cast to
-    // void*; it's an opaque CUDA handle, not a raw device pointer —
-    // use cuda_array() with cudaMemcpy2DToArrayAsync to write.
-    VizBuffer view() const noexcept;
-
     // CUDA write target. Lifetime tied to this DeviceImage.
     cudaArray_t cuda_array() const noexcept
     {
@@ -66,6 +72,29 @@ class DeviceImage
         return vk_format_;
     }
 
+    // Timeline semaphore handle. Vulkan waits on this with the
+    // value returned by cuda_done_writing_value() before sampling.
+    VkSemaphore cuda_done_writing() const noexcept
+    {
+        return cuda_done_writing_;
+    }
+
+    // Latest value CUDA has signaled successfully. Vulkan uses this
+    // as the wait target. Advanced by cuda_signal_write_done() only
+    // after the underlying cudaSignalExternalSemaphoresAsync returns
+    // success, so a failed signal never poisons the timeline.
+    uint64_t cuda_done_writing_value() const noexcept
+    {
+        return cuda_done_writing_value_.load(std::memory_order_acquire);
+    }
+
+    // CUDA-side primitive. Reserves the next monotonic value, queues
+    // the signal on `stream`, and commits the value on success.
+    // Throws std::runtime_error on cuda*Async failure; failure leaves
+    // the public counter un-advanced so the next call is consistent
+    // with the GPU's actual semaphore state.
+    void cuda_signal_write_done(cudaStream_t stream);
+
     Resolution resolution() const noexcept
     {
         return resolution_;
@@ -88,6 +117,7 @@ class DeviceImage
     void create_vk_image_with_external_memory();
     void create_vk_image_view();
     void import_to_cuda();
+    void create_interop_semaphores();
 
     void run_one_shot_layout_transition(VkImageLayout old_layout,
                                         VkImageLayout new_layout,
@@ -113,6 +143,16 @@ class DeviceImage
     cudaExternalMemory_t cuda_external_memory_ = nullptr;
     cudaMipmappedArray_t cuda_mipmapped_array_ = nullptr;
     cudaArray_t cuda_array_ = nullptr; // Level-0 view, non-owning.
+
+    // Producer→consumer timeline semaphore exported via
+    // VK_KHR_external_semaphore_fd and imported into CUDA. Two atomic
+    // counters (next reservation, last committed) so a failed
+    // cudaSignal can't leave the public value pointing at something
+    // that was never signaled.
+    VkSemaphore cuda_done_writing_ = VK_NULL_HANDLE;
+    cudaExternalSemaphore_t cuda_cuda_done_writing_ = nullptr;
+    std::atomic<uint64_t> cuda_done_writing_next_{ 0 };
+    std::atomic<uint64_t> cuda_done_writing_value_{ 0 };
 };
 
 } // namespace viz
diff --git a/src/viz/core/cpp/inc/viz/core/viz_buffer.hpp b/src/viz/core/cpp/inc/viz/core/viz_buffer.hpp
index c36f0aec4..12a66213a 100644
--- a/src/viz/core/cpp/inc/viz/core/viz_buffer.hpp
+++ b/src/viz/core/cpp/inc/viz/core/viz_buffer.hpp
@@ -31,10 +31,12 @@ enum class MemorySpace
 
 // Lightweight, non-owning reference to a 2D pixel buffer.
 //
-// Carries no ownership: it does not allocate or free memory. For Mode B
-// submission (acquire/release), the layer owns the underlying interop
-// buffer; VizBuffer is a view into it. For host readback, HostImage owns
-// the bytes and exposes a VizBuffer view via HostImage::view().
+// Carries no ownership: it does not allocate or free memory. Producers
+// fill VizBuffer with a pointer to memory they own (CUDA device buffer,
+// host array) and pass it to QuadLayer::submit(); the layer copies the
+// pixels into one of its internal interop slots. For host readback,
+// HostImage owns the bytes and exposes a VizBuffer view via
+// HostImage::view().
 //
 // In Python, VizBuffer with space == kDevice exposes
 // __cuda_array_interface__ so CuPy can wrap it zero-copy. Host buffers
diff --git a/src/viz/core/cpp/inc/viz/core/vk_context.hpp b/src/viz/core/cpp/inc/viz/core/vk_context.hpp
index 5f12c8690..a4014084b 100644
--- a/src/viz/core/cpp/inc/viz/core/vk_context.hpp
+++ b/src/viz/core/cpp/inc/viz/core/vk_context.hpp
@@ -44,13 +44,9 @@ struct PhysicalDeviceInfo
 //
 // VkContext owns the Vulkan handles and tears them down on destruction.
 //
-// CUDA-Vulkan device matching: as part of init(), VkContext queries
-// the chosen physical device's UUID and calls cudaSetDevice() on the
-// matching CUDA device. This is required for CUDA-Vulkan interop
-// (cudaImportExternalMemory) to succeed on multi-GPU machines, and
-// it makes VkContext the single chokepoint for "which GPU is Televiz
-// on" — every viz_core type that touches CUDA can assume the current
-// CUDA device matches the Vulkan one.
+// init() also matches the current CUDA device to the chosen Vulkan
+// physical device by UUID, so every viz_core type that touches CUDA
+// can assume the two APIs are on the same GPU.
 class VkContext
 {
 public:
@@ -104,13 +100,14 @@ class VkContext
     uint32_t queue_family_index() const noexcept;
     VkQueue queue() const noexcept;
 
-    // CUDA device id matched to the chosen Vulkan physical device by
-    // UUID at init() time. Useful for callers that need to ensure
-    // their thread is on the right CUDA device before issuing CUDA
-    // calls — cudaSetDevice is per-host-thread, so a CudaTexture /
-    // DeviceImage created on a worker thread must call
-    // cudaSetDevice(ctx.cuda_device_id()) before any CUDA API. Returns
-    // -1 before init() has run.
+    // Process-wide VkPipelineCache for driver-side compiled-state
+    // reuse across pipeline creations. VK_NULL_HANDLE before init().
+    VkPipelineCache pipeline_cache() const noexcept;
+
+    // CUDA device id matched to the chosen Vulkan physical device.
+    // Layers created on worker threads should
+    // cudaSetDevice(ctx.cuda_device_id()) before any CUDA call —
+    // cudaSetDevice is per-host-thread. Returns -1 before init().
     int cuda_device_id() const noexcept;
 
     // Enumerates all Vulkan-capable physical devices and returns their
@@ -127,6 +124,7 @@ class VkContext
     void select_physical_device(const Config& config);
     void create_logical_device(const Config& config);
     void match_cuda_device_to_vulkan();
+    void create_pipeline_cache();
 
     bool initialized_ = false;
     bool validation_enabled_ = false;
@@ -135,6 +133,7 @@ class VkContext
     VkDevice device_ = VK_NULL_HANDLE;
     uint32_t queue_family_index_ = UINT32_MAX;
     VkQueue queue_ = VK_NULL_HANDLE;
+    VkPipelineCache pipeline_cache_ = VK_NULL_HANDLE;
     int cuda_device_id_ = -1;
 };
 
diff --git a/src/viz/core/cpp/vk_context.cpp b/src/viz/core/cpp/vk_context.cpp
index d2184d038..9ff3d91cc 100644
--- a/src/viz/core/cpp/vk_context.cpp
+++ b/src/viz/core/cpp/vk_context.cpp
@@ -192,6 +192,7 @@ void VkContext::init(const Config& config)
         select_physical_device(config);
         create_logical_device(config);
         match_cuda_device_to_vulkan();
+        create_pipeline_cache();
         initialized_ = true;
     }
     catch (...)
@@ -203,6 +204,12 @@ void VkContext::init(const Config& config)
 
 void VkContext::destroy()
 {
+    // Destroy device-owned objects (pipeline cache) before the device.
+    if (pipeline_cache_ != VK_NULL_HANDLE && device_ != VK_NULL_HANDLE)
+    {
+        vkDestroyPipelineCache(device_, pipeline_cache_, nullptr);
+        pipeline_cache_ = VK_NULL_HANDLE;
+    }
     if (device_ != VK_NULL_HANDLE)
     {
         vkDestroyDevice(device_, nullptr);
@@ -216,6 +223,7 @@ void VkContext::destroy()
     physical_device_ = VK_NULL_HANDLE;
     queue_ = VK_NULL_HANDLE;
     queue_family_index_ = UINT32_MAX;
+    pipeline_cache_ = VK_NULL_HANDLE;
     cuda_device_id_ = -1;
     validation_enabled_ = false;
     initialized_ = false;
@@ -251,6 +259,11 @@ VkQueue VkContext::queue() const noexcept
     return queue_;
 }
 
+VkPipelineCache VkContext::pipeline_cache() const noexcept
+{
+    return pipeline_cache_;
+}
+
 int VkContext::cuda_device_id() const noexcept
 {
     return cuda_device_id_;
@@ -395,10 +408,16 @@ void VkContext::create_logical_device(const Config& config)
     }
 
     VkPhysicalDeviceFeatures device_features{};
-    // No special features needed yet; extend as the renderer requires them.
+
+    // Enable the Vulkan 1.2 timeline semaphore feature so DeviceImage
+    // can use VK_SEMAPHORE_TYPE_TIMELINE for CUDA-Vulkan interop.
+    VkPhysicalDeviceVulkan12Features features12{};
+    features12.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
+    features12.timelineSemaphore = VK_TRUE;
 
     VkDeviceCreateInfo device_info{};
     device_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
+    device_info.pNext = &features12;
     device_info.queueCreateInfoCount = 1;
     device_info.pQueueCreateInfos = &queue_info;
     device_info.enabledExtensionCount = static_cast<uint32_t>(extensions.size());
@@ -414,15 +433,25 @@ void VkContext::create_logical_device(const Config& config)
     vkGetDeviceQueue(device_, queue_family_index_, 0, &queue_);
 }
 
+void VkContext::create_pipeline_cache()
+{
+    // Empty cache; the driver populates it as pipelines are created.
+    // Not persisted across runs — purely in-process reuse.
+    VkPipelineCacheCreateInfo info{};
+    info.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO;
+    const VkResult result = vkCreatePipelineCache(device_, &info, nullptr, &pipeline_cache_);
+    if (result != VK_SUCCESS)
+    {
+        throw std::runtime_error("vkCreatePipelineCache failed: VkResult=" + std::to_string(result));
+    }
+}
+
 void VkContext::match_cuda_device_to_vulkan()
 {
-    // Read the Vulkan physical device's UUID, then find the CUDA
-    // device with the same UUID and make it current. Required so
-    // every viz_core type that imports Vulkan memory into CUDA
-    // (e.g. DeviceImage::cudaImportExternalMemory) operates on the
-    // same physical GPU. On multi-GPU machines Vulkan and CUDA
-    // default to different devices and interop fails with
-    // cudaErrorUnknown.
+    // Find the CUDA device whose UUID matches the chosen Vulkan
+    // physical device and make it current. Required so CUDA-Vulkan
+    // interop on multi-GPU machines doesn't pick a different GPU
+    // than Vulkan.
     VkPhysicalDeviceIDProperties id_props{};
     id_props.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES;
     VkPhysicalDeviceProperties2 props2{};
diff --git a/src/viz/core_tests/cpp/CMakeLists.txt b/src/viz/core_tests/cpp/CMakeLists.txt
index 656aff6c6..76cec20e6 100644
--- a/src/viz/core_tests/cpp/CMakeLists.txt
+++ b/src/viz/core_tests/cpp/CMakeLists.txt
@@ -3,6 +3,19 @@
 
 cmake_minimum_required(VERSION 3.20)
 
+# Header-only INTERFACE library exposing test_helpers.hpp:
+# is_gpu_available(), shared_vk_context(), GpuFixture. Used by all
+# viz_*_tests so the probes have one canonical implementation.
+add_library(viz_test_support INTERFACE)
+target_include_directories(viz_test_support INTERFACE
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+)
+target_link_libraries(viz_test_support INTERFACE
+    viz::core
+    Catch2::Catch2
+)
+add_library(viz::test_support ALIAS viz_test_support)
+
 add_executable(viz_core_tests
     test_device_image.cpp
     test_frame_sync.cpp
@@ -15,7 +28,7 @@ add_executable(viz_core_tests
 )
 
 target_link_libraries(viz_core_tests PRIVATE
-    viz::core
+    viz::test_support
     Catch2::Catch2WithMain
 )
 
diff --git a/src/viz/core_tests/cpp/test_device_image.cpp b/src/viz/core_tests/cpp/test_device_image.cpp
index fcdb23af0..789eb92fd 100644
--- a/src/viz/core_tests/cpp/test_device_image.cpp
+++ b/src/viz/core_tests/cpp/test_device_image.cpp
@@ -2,15 +2,13 @@
 // SPDX-License-Identifier: Apache-2.0
 
 // [gpu] tests for DeviceImage: verify Vulkan handle creation, the
-// CUDA cudaArray_t is usable from a cudaMemcpy2DToArray call, the
-// VizBuffer view exposes the right shape with MemorySpace::kDevice,
-// and a round-trip copy preserves pixel values.
+// CUDA cudaArray_t is usable from a cudaMemcpy2DToArray call, and a
+// round-trip copy preserves pixel values.
 
 #include "test_helpers.hpp"
 
 #include <catch2/catch_test_macros.hpp>
 #include <viz/core/device_image.hpp>
-#include <viz/core/viz_buffer.hpp>
 #include <viz/core/vk_context.hpp>
 
 #include <cstdint>
@@ -19,10 +17,8 @@
 #include <vector>
 
 using viz::DeviceImage;
-using viz::MemorySpace;
 using viz::PixelFormat;
 using viz::Resolution;
-using viz::VizBuffer;
 
 namespace
 {
@@ -55,7 +51,11 @@ TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage creates valid Vulkan + C
     REQUIRE(img != nullptr);
     CHECK(img->vk_image() != VK_NULL_HANDLE);
     CHECK(img->vk_image_view() != VK_NULL_HANDLE);
-    CHECK(img->vk_format() == VK_FORMAT_R8G8B8A8_UNORM);
+    // vk_format() returns the SRGB sampling view format. Storage is
+    // UNORM (CUDA writes raw bytes), but sampling decodes through SRGB
+    // so arbitrary byte values round-trip exactly through the
+    // sRGB->linear->sRGB pipeline. See device_image.cpp comments.
+    CHECK(img->vk_format() == VK_FORMAT_R8G8B8A8_SRGB);
     CHECK(img->cuda_array() != nullptr);
     CHECK(img->resolution().width == 64);
     CHECK(img->resolution().height == 64);
@@ -78,20 +78,6 @@ TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage destroy is idempotent",
     img->destroy();
 }
 
-TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage::view exposes a kDevice VizBuffer", "[gpu][device_image]")
-{
-    auto img = DeviceImage::create(vk, Resolution{ 16, 16 }, PixelFormat::kRGBA8);
-    const VizBuffer v = img->view();
-    CHECK(v.space == MemorySpace::kDevice);
-    CHECK(v.width == 16);
-    CHECK(v.height == 16);
-    CHECK(v.format == PixelFormat::kRGBA8);
-    CHECK(v.pitch == static_cast<size_t>(16) * 4);
-    // .data points at the cudaArray_t handle (NOT a raw device pointer);
-    // sanity that it's non-null when the image is alive.
-    CHECK(v.data != nullptr);
-}
-
 TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage round-trip preserves pixel pattern", "[gpu][device_image]")
 {
     constexpr uint32_t kSide = 64;
@@ -105,9 +91,8 @@ TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage round-trip preserves pix
                                 cudaMemcpyHostToDevice) == cudaSuccess);
     REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
 
-    // Read it back via CUDA — verifies the data took. (M3b's milestone
-    // test will read back via Vulkan sampling; that's the real round-
-    // trip but requires a graphics pipeline we don't have yet.)
+    // Read back via CUDA — verifies the data made it. The Vulkan-
+    // sampling round-trip is covered by viz_session_tests.
     std::vector<uint8_t> dst(kBytes);
     REQUIRE(cudaMemcpy2DFromArray(dst.data(), kSide * 4, img->cuda_array(), 0, 0, kSide * 4, kSide,
                                   cudaMemcpyDeviceToHost) == cudaSuccess);
diff --git a/src/viz/core_tests/cpp/test_helpers.hpp b/src/viz/core_tests/cpp/test_helpers.hpp
index 04ea2b416..8e76f558f 100644
--- a/src/viz/core_tests/cpp/test_helpers.hpp
+++ b/src/viz/core_tests/cpp/test_helpers.hpp
@@ -9,26 +9,76 @@
 
 #include <cstdint>
 #include <cstdlib>
+#include <cstring>
+#include <cuda_runtime.h>
+#include <vector>
 
 namespace viz::testing
 {
 
-// True iff a Televiz-suitable Vulkan device is reachable. Cached after
-// the first call. [gpu] tests should SKIP when this is false so CI
-// runners without a suitable GPU report skipped rather than failed.
+// True iff at least one GPU is reachable from BOTH Vulkan AND CUDA
+// with matching UUIDs — the same constraint VkContext::init() enforces.
+// [gpu] tests SKIP when this returns false so CI runners without a
+// suitable GPU report skipped rather than failed. Cached after the
+// first call.
 inline bool is_gpu_available()
 {
     static const bool cached = []() -> bool
     {
-        const auto devices = viz::VkContext::enumerate_physical_devices();
-        for (const auto& info : devices)
+        VkApplicationInfo app{};
+        app.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
+        app.apiVersion = VK_API_VERSION_1_2;
+        VkInstanceCreateInfo ic{};
+        ic.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
+        ic.pApplicationInfo = &app;
+        VkInstance instance = VK_NULL_HANDLE;
+        if (vkCreateInstance(&ic, nullptr, &instance) != VK_SUCCESS)
         {
-            if (info.meets_requirements)
+            return false;
+        }
+        uint32_t count = 0;
+        vkEnumeratePhysicalDevices(instance, &count, nullptr);
+        std::vector<VkPhysicalDevice> devs(count);
+        if (count > 0)
+        {
+            vkEnumeratePhysicalDevices(instance, &count, devs.data());
+        }
+
+        int cuda_count = 0;
+        if (cudaGetDeviceCount(&cuda_count) != cudaSuccess)
+        {
+            cuda_count = 0;
+        }
+
+        bool match = false;
+        for (VkPhysicalDevice vk_dev : devs)
+        {
+            VkPhysicalDeviceIDProperties id{};
+            id.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES;
+            VkPhysicalDeviceProperties2 p2{};
+            p2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
+            p2.pNext = &id;
+            vkGetPhysicalDeviceProperties2(vk_dev, &p2);
+            for (int ci = 0; ci < cuda_count; ++ci)
+            {
+                cudaDeviceProp prop{};
+                if (cudaGetDeviceProperties(&prop, ci) != cudaSuccess)
+                {
+                    continue;
+                }
+                if (std::memcmp(prop.uuid.bytes, id.deviceUUID, VK_UUID_SIZE) == 0)
+                {
+                    match = true;
+                    break;
+                }
+            }
+            if (match)
             {
-                return true;
+                break;
             }
         }
-        return false;
+        vkDestroyInstance(instance, nullptr);
+        return match;
     }();
     return cached;
 }
diff --git a/src/viz/layers/cpp/CMakeLists.txt b/src/viz/layers/cpp/CMakeLists.txt
index bf695673c..02d09ad17 100644
--- a/src/viz/layers/cpp/CMakeLists.txt
+++ b/src/viz/layers/cpp/CMakeLists.txt
@@ -3,25 +3,25 @@
 
 cmake_minimum_required(VERSION 3.20)
 
-# LayerBase abstract interface and (later) concrete layer types
-# (QuadLayer, ProjectionLayer, OverlayLayer). Header-only today — promoted
-# to STATIC when the first .cpp lands. Test-only helper layers
-# (e.g. ColoredQuadLayer) live in viz/layers_tests/, not here.
-add_library(viz_layers INTERFACE)
-
-target_sources(viz_layers
-    INTERFACE
-        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/inc/viz/layers/layer_base.hpp>
+# LayerBase abstract interface and concrete layer types. Promoted to
+# STATIC now that QuadLayer ships. Test-only helper layers
+# (ClearRectLayer, ThrowingLayer) live in viz/layers_tests/, not here.
+add_library(viz_layers STATIC
+    quad_layer.cpp
+    inc/viz/layers/layer_base.hpp
+    inc/viz/layers/quad_layer.hpp
 )
 
 target_include_directories(viz_layers
-    INTERFACE
+    PUBLIC
         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/inc>
 )
 
 target_link_libraries(viz_layers
-    INTERFACE
+    PUBLIC
         viz::core
+    PRIVATE
+        viz::shaders
 )
 
 # Aliased as viz::layers.
diff --git a/src/viz/layers/cpp/inc/viz/layers/layer_base.hpp b/src/viz/layers/cpp/inc/viz/layers/layer_base.hpp
index d2a9407ab..355d3fbf1 100644
--- a/src/viz/layers/cpp/inc/viz/layers/layer_base.hpp
+++ b/src/viz/layers/cpp/inc/viz/layers/layer_base.hpp
@@ -51,6 +51,30 @@ class LayerBase
     //           draws into; const so layers cannot modify the target.
     virtual void record(VkCommandBuffer cmd, const std::vector<ViewInfo>& views, const RenderTarget& target) = 0;
 
+    // Per-frame wait wiring for layers that synchronize against CUDA
+    // (or other external) producers via a Vulkan timeline semaphore.
+    // VizCompositor concatenates these across all visible layers and
+    // feeds them into vkQueueSubmit (with a chained
+    // VkTimelineSemaphoreSubmitInfo for the values).
+    // Default: empty (no external sync).
+    //
+    // No signal semaphores: layers that need producer↔consumer
+    // ping-pong solve it at the layer level (e.g. QuadLayer's mailbox
+    // owns enough buffers that producer writes never collide with
+    // in-flight Vulkan reads, so the compositor never has to signal
+    // back to the producer).
+    struct WaitSemaphore
+    {
+        VkSemaphore semaphore = VK_NULL_HANDLE;
+        uint64_t value = 0;
+        VkPipelineStageFlags wait_stage = 0;
+    };
+
+    virtual std::vector<WaitSemaphore> get_wait_semaphores() const
+    {
+        return {};
+    }
+
     const std::string& name() const noexcept;
 
     // Visibility flag is atomic so it can be toggled from any thread (UI
diff --git a/src/viz/layers/cpp/inc/viz/layers/quad_layer.hpp b/src/viz/layers/cpp/inc/viz/layers/quad_layer.hpp
new file mode 100644
index 000000000..b320ee0fa
--- /dev/null
+++ b/src/viz/layers/cpp/inc/viz/layers/quad_layer.hpp
@@ -0,0 +1,155 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <viz/core/device_image.hpp>
+#include <viz/core/viz_buffer.hpp>
+#include <viz/core/viz_types.hpp>
+#include <viz/layers/layer_base.hpp>
+#include <vulkan/vulkan.h>
+
+#include <array>
+#include <atomic>
+#include <cstdint>
+#include <cuda_runtime.h>
+#include <memory>
+#include <string>
+
+namespace viz
+{
+
+class VkContext;
+
+// QuadLayer: renders a CUDA-fed 2D texture as a fullscreen quad.
+//
+// Owns kSlotCount=3 DeviceImages plus the graphics-pipeline state to
+// sample any of them (one VkSampler, one VkPipeline, one descriptor
+// set per slot). The slots form a mailbox:
+//
+//   submit() picks a "free" slot (one that is neither the most recent
+//   publish nor the slot the renderer is currently sampling), runs
+//   cudaMemcpyAsync into it, signals cuda_done_writing, and atomic-
+//   exchanges the "latest" pointer to it. The previous "latest" slot
+//   becomes free.
+//
+//   record() atomic-exchanges "latest" into "in_use" (taking it for
+//   this frame's draw); the previous "in_use" slot becomes free. The
+//   draw waits on cuda_done_writing of the slot it just took.
+//
+// Net result: producer can submit at any rate. The renderer always
+// samples the most recently completed publish, and there is always
+// at least one slot free for the producer to write — it never
+// collides with a buffer the renderer is currently sampling.
+//
+// Correctness depends on VizCompositor::render() being synchronous
+// (frame_sync_->wait() at end of frame). Multi-frame-in-flight
+// would require in_use_ to become per-in-flight-frame.
+//
+// Memory cost: ~width*height*bpp*3 bytes (e.g. 24 MB at 1080p RGBA8).
+//
+// Fullscreen-blit / kRGBA8 only. Placement transforms and other
+// formats land with the XR backend.
+class QuadLayer : public LayerBase
+{
+public:
+    static constexpr uint32_t kSlotCount = 3;
+
+    struct Config
+    {
+        std::string name = "QuadLayer";
+        Resolution resolution{};
+        PixelFormat format = PixelFormat::kRGBA8;
+    };
+
+    // Builds the 3 DeviceImages + pipeline up front. Throws
+    // std::invalid_argument on bad config; std::runtime_error on
+    // Vulkan / CUDA failure.
+    QuadLayer(const VkContext& ctx, VkRenderPass render_pass, Config config);
+
+    ~QuadLayer() override;
+    void destroy();
+
+    // Threading contract: submit() is the producer side; record() (+
+    // get_wait_semaphores) is the consumer side. They may run on
+    // separate threads. Multiple concurrent producers on the same
+    // QuadLayer are NOT supported — use one QuadLayer per producer.
+    //
+    // src.space must be kDevice and src dimensions/format must match
+    // the layer. The wait/copy/signal sequence runs on `stream`
+    // (default: the default stream); pass the producer's stream so
+    // the signal lands after the producer's prior writes on the same
+    // stream.
+    //
+    // Throws std::invalid_argument on validation failure;
+    // std::runtime_error on CUDA failure;
+    // std::logic_error if called after destroy().
+    void submit(const VizBuffer& src, cudaStream_t stream = 0);
+
+    // Binds pipeline + per-slot descriptor + draws a 3-vertex
+    // fullscreen quad. Skips the draw if no frame has been published
+    // yet (kSlotNone — render target keeps its clear value).
+    void record(VkCommandBuffer cmd, const std::vector<ViewInfo>& views, const RenderTarget& target) override;
+
+    // Layer-side timeline wait: VizCompositor waits on this slot's
+    // cuda_done_writing before the fragment shader samples it.
+    std::vector<LayerBase::WaitSemaphore> get_wait_semaphores() const override;
+
+    Resolution resolution() const noexcept;
+    PixelFormat format() const noexcept;
+
+    // Diagnostic accessor; nullptr for slots beyond kSlotCount.
+    const DeviceImage* device_image(uint32_t slot) const noexcept;
+
+private:
+    void init();
+
+    void create_sampler();
+    void create_descriptor_set_layout();
+    void create_pipeline_layout();
+    void create_pipeline();
+    void create_descriptor_pool();
+    void allocate_descriptor_sets();
+    void update_descriptor_sets();
+
+    // Mailbox slot allocation. submit() picks one of these states
+    // and atomically takes ownership; record() atomically promotes
+    // a freshly-published slot to `in_use_`.
+    static constexpr uint8_t kSlotNone = 0xFF;
+
+    // Picks a slot that is neither latest_ nor in_use_, in
+    // 0..kSlotCount-1. Returns a value < kSlotCount.
+    uint8_t pick_free_slot(uint8_t latest, uint8_t in_use) const noexcept;
+
+    const VkContext* ctx_ = nullptr;
+    VkRenderPass render_pass_ = VK_NULL_HANDLE; // borrowed from compositor
+    Config config_;
+
+    // One DeviceImage per mailbox slot.
+    std::array<std::unique_ptr<DeviceImage>, kSlotCount> slots_;
+
+    VkSampler sampler_ = VK_NULL_HANDLE;
+    VkDescriptorSetLayout descriptor_set_layout_ = VK_NULL_HANDLE;
+    VkPipelineLayout pipeline_layout_ = VK_NULL_HANDLE;
+    VkPipeline pipeline_ = VK_NULL_HANDLE;
+
+    VkDescriptorPool descriptor_pool_ = VK_NULL_HANDLE;
+    // One descriptor set per slot, each binding the corresponding
+    // DeviceImage's sRGB view. record() picks the one for in_use_.
+    std::array<VkDescriptorSet, kSlotCount> descriptor_sets_{};
+
+    // Mailbox state. Both atomic so producer and renderer can
+    // touch them without locks.
+    //
+    //   latest_:    most recently published slot. submit() stores
+    //               here on success; record() exchanges it into
+    //               in_use_ at frame start. kSlotNone before the
+    //               first submit().
+    //   in_use_:    slot the renderer is currently drawing from.
+    //               kSlotNone before the first frame that finds a
+    //               published slot. record() updates this.
+    std::atomic<uint8_t> latest_{ kSlotNone };
+    std::atomic<uint8_t> in_use_{ kSlotNone };
+};
+
+} // namespace viz
diff --git a/src/viz/layers/cpp/quad_layer.cpp b/src/viz/layers/cpp/quad_layer.cpp
new file mode 100644
index 000000000..8b861dea0
--- /dev/null
+++ b/src/viz/layers/cpp/quad_layer.cpp
@@ -0,0 +1,517 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include <viz/core/render_target.hpp>
+#include <viz/core/vk_context.hpp>
+#include <viz/layers/quad_layer.hpp>
+#include <viz/shaders/textured_quad.frag.spv.h>
+#include <viz/shaders/textured_quad.vert.spv.h>
+
+#include <cuda_runtime.h>
+#include <stdexcept>
+#include <string>
+
+namespace viz
+{
+
+namespace
+{
+
+void check_vk(VkResult result, const char* what)
+{
+    if (result != VK_SUCCESS)
+    {
+        throw std::runtime_error(std::string("QuadLayer: ") + what + " failed: VkResult=" + std::to_string(result));
+    }
+}
+
+void check_cuda(cudaError_t result, const char* what)
+{
+    if (result != cudaSuccess)
+    {
+        throw std::runtime_error(std::string("QuadLayer: ") + what + " failed: " + cudaGetErrorString(result));
+    }
+}
+
+VkShaderModule create_shader_module(VkDevice device, const unsigned char* spv, size_t size)
+{
+    VkShaderModuleCreateInfo info{};
+    info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
+    info.codeSize = size;
+    info.pCode = reinterpret_cast<const uint32_t*>(spv);
+    VkShaderModule mod = VK_NULL_HANDLE;
+    check_vk(vkCreateShaderModule(device, &info, nullptr, &mod), "vkCreateShaderModule");
+    return mod;
+}
+
+// Once destroy() has run, slots_[0] is the canonical "alive" signal
+// (it's the first thing init() builds and the last thing destroy()
+// resets). Throwing logic_error converts use-after-destroy from a
+// silent null-deref into a clean failure callers can catch in tests.
+void require_alive(const std::unique_ptr<DeviceImage>& slot0, const char* what)
+{
+    if (!slot0)
+    {
+        throw std::logic_error(std::string("QuadLayer::") + what + " called after destroy()");
+    }
+}
+
+} // namespace
+
+QuadLayer::QuadLayer(const VkContext& ctx, VkRenderPass render_pass, Config config)
+    : LayerBase(config.name), ctx_(&ctx), render_pass_(render_pass), config_(std::move(config))
+{
+    // Cheap-first config checks, then argument shape, then context
+    // state. Tests can exercise each path by varying just the
+    // relevant argument with an uninitialized VkContext.
+    if (config_.format != PixelFormat::kRGBA8)
+    {
+        // textured_quad samples color; depth (kD32F) would create a
+        // depth-aspect view that can't be sampled as color.
+        throw std::invalid_argument("QuadLayer: only PixelFormat::kRGBA8 is supported");
+    }
+    if (config_.resolution.width == 0 || config_.resolution.height == 0)
+    {
+        throw std::invalid_argument("QuadLayer: resolution must be non-zero");
+    }
+    if (render_pass == VK_NULL_HANDLE)
+    {
+        throw std::invalid_argument("QuadLayer: render_pass must be non-null");
+    }
+    if (!ctx.is_initialized())
+    {
+        throw std::invalid_argument("QuadLayer: VkContext is not initialized");
+    }
+    init();
+}
+
+QuadLayer::~QuadLayer()
+{
+    destroy();
+}
+
+void QuadLayer::init()
+{
+    try
+    {
+        for (auto& slot : slots_)
+        {
+            slot = DeviceImage::create(*ctx_, config_.resolution, config_.format);
+        }
+        create_sampler();
+        create_descriptor_set_layout();
+        create_pipeline_layout();
+        create_pipeline();
+        create_descriptor_pool();
+        allocate_descriptor_sets();
+        update_descriptor_sets();
+    }
+    catch (...)
+    {
+        destroy();
+        throw;
+    }
+}
+
+void QuadLayer::destroy()
+{
+    if (ctx_ == nullptr)
+    {
+        return;
+    }
+    const VkDevice device = ctx_->device();
+    if (device == VK_NULL_HANDLE)
+    {
+        for (auto& slot : slots_)
+        {
+            slot.reset();
+        }
+        return;
+    }
+    if (descriptor_pool_ != VK_NULL_HANDLE)
+    {
+        // descriptor_sets_ are freed implicitly with the pool.
+        vkDestroyDescriptorPool(device, descriptor_pool_, nullptr);
+        descriptor_pool_ = VK_NULL_HANDLE;
+        descriptor_sets_.fill(VK_NULL_HANDLE);
+    }
+    if (pipeline_ != VK_NULL_HANDLE)
+    {
+        vkDestroyPipeline(device, pipeline_, nullptr);
+        pipeline_ = VK_NULL_HANDLE;
+    }
+    if (pipeline_layout_ != VK_NULL_HANDLE)
+    {
+        vkDestroyPipelineLayout(device, pipeline_layout_, nullptr);
+        pipeline_layout_ = VK_NULL_HANDLE;
+    }
+    if (descriptor_set_layout_ != VK_NULL_HANDLE)
+    {
+        vkDestroyDescriptorSetLayout(device, descriptor_set_layout_, nullptr);
+        descriptor_set_layout_ = VK_NULL_HANDLE;
+    }
+    if (sampler_ != VK_NULL_HANDLE)
+    {
+        vkDestroySampler(device, sampler_, nullptr);
+        sampler_ = VK_NULL_HANDLE;
+    }
+    for (auto& slot : slots_)
+    {
+        slot.reset();
+    }
+    latest_.store(kSlotNone, std::memory_order_release);
+    in_use_.store(kSlotNone, std::memory_order_release);
+}
+
+Resolution QuadLayer::resolution() const noexcept
+{
+    return config_.resolution;
+}
+
+PixelFormat QuadLayer::format() const noexcept
+{
+    return config_.format;
+}
+
+const DeviceImage* QuadLayer::device_image(uint32_t slot) const noexcept
+{
+    if (slot >= kSlotCount)
+    {
+        return nullptr;
+    }
+    return slots_[slot].get();
+}
+
+uint8_t QuadLayer::pick_free_slot(uint8_t latest, uint8_t in_use) const noexcept
+{
+    // With kSlotCount=3, at most 2 slots are "claimed" (latest +
+    // in_use). At least one of {0, 1, 2} is always free.
+    static_assert(kSlotCount == 3, "pick_free_slot assumes 3 slots");
+    for (uint8_t i = 0; i < kSlotCount; ++i)
+    {
+        if (i != latest && i != in_use)
+        {
+            return i;
+        }
+    }
+    return 0; // unreachable for kSlotCount >= 2
+}
+
+void QuadLayer::submit(const VizBuffer& src, cudaStream_t stream)
+{
+    require_alive(slots_[0], "submit");
+    if (src.space != MemorySpace::kDevice)
+    {
+        throw std::invalid_argument("QuadLayer::submit: src must be MemorySpace::kDevice");
+    }
+    if (src.width != config_.resolution.width || src.height != config_.resolution.height)
+    {
+        throw std::invalid_argument("QuadLayer::submit: src dimensions do not match layer resolution");
+    }
+    if (src.format != config_.format)
+    {
+        throw std::invalid_argument("QuadLayer::submit: src format does not match layer format");
+    }
+    if (src.data == nullptr)
+    {
+        throw std::invalid_argument("QuadLayer::submit: src.data is null");
+    }
+
+    // Pick a free slot — neither the most recent publish nor the
+    // slot the renderer is currently using. With 3 slots there's
+    // always one free, so this is wait-free.
+    const uint8_t latest = latest_.load(std::memory_order_acquire);
+    const uint8_t in_use = in_use_.load(std::memory_order_acquire);
+    const uint8_t slot = pick_free_slot(latest, in_use);
+    DeviceImage& image = *slots_[slot];
+
+    check_cuda(cudaSetDevice(ctx_->cuda_device_id()), "cudaSetDevice");
+    // Async copy on `stream`. Caller's prior work on the same stream
+    // is naturally ordered before this; signal lands after the copy
+    // completes on the GPU.
+    const size_t row_bytes = static_cast<size_t>(src.width) * bytes_per_pixel(src.format);
+    const size_t src_pitch = (src.pitch == 0) ? row_bytes : src.pitch;
+    check_cuda(cudaMemcpy2DToArrayAsync(image.cuda_array(), 0, 0, src.data, src_pitch, row_bytes, src.height,
+                                        cudaMemcpyDeviceToDevice, stream),
+               "cudaMemcpy2DToArrayAsync");
+    image.cuda_signal_write_done(stream);
+
+    // Publish. The renderer's next record() will atomic-exchange
+    // this into in_use_; the previous latest_ slot becomes free.
+    // memory_order_release pairs with the renderer's acquire load.
+    latest_.store(slot, std::memory_order_release);
+}
+
+void QuadLayer::record(VkCommandBuffer cmd, const std::vector<ViewInfo>& /*views*/, const RenderTarget& target)
+{
+    require_alive(slots_[0], "record");
+
+    // Promote latest_ to in_use_. The previous in_use_ slot becomes
+    // free for the next submit(). If no frame has been published yet
+    // (latest_ == kSlotNone), we leave in_use_ as-is — if it's also
+    // kSlotNone, we skip the draw and the framebuffer keeps its
+    // clear value.
+    const uint8_t latest = latest_.load(std::memory_order_acquire);
+    if (latest != kSlotNone)
+    {
+        in_use_.store(latest, std::memory_order_release);
+    }
+    const uint8_t cur = in_use_.load(std::memory_order_acquire);
+    if (cur == kSlotNone)
+    {
+        return;
+    }
+
+    const Resolution res = target.resolution();
+
+    VkViewport viewport{};
+    viewport.x = 0.0f;
+    viewport.y = 0.0f;
+    viewport.width = static_cast<float>(res.width);
+    viewport.height = static_cast<float>(res.height);
+    viewport.minDepth = 0.0f;
+    viewport.maxDepth = 1.0f;
+    vkCmdSetViewport(cmd, 0, 1, &viewport);
+
+    VkRect2D scissor{};
+    scissor.offset = { 0, 0 };
+    scissor.extent = { res.width, res.height };
+    vkCmdSetScissor(cmd, 0, 1, &scissor);
+
+    vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_);
+    vkCmdBindDescriptorSets(
+        cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout_, 0, 1, &descriptor_sets_[cur], 0, nullptr);
+
+    // 3 vertices, no vertex buffer — vertex shader emits a fullscreen
+    // triangle from gl_VertexIndex.
+    vkCmdDraw(cmd, 3, 1, 0, 0);
+}
+
+std::vector<LayerBase::WaitSemaphore> QuadLayer::get_wait_semaphores() const
+{
+    // VizCompositor calls record() first (which promotes latest_ ->
+    // in_use_), then this. So in_use_ is the slot the draw will
+    // sample, and that's what we need the GPU to wait on.
+    const uint8_t cur = in_use_.load(std::memory_order_acquire);
+    if (cur == kSlotNone || !slots_[cur])
+    {
+        return {};
+    }
+    const DeviceImage& image = *slots_[cur];
+    const uint64_t value = image.cuda_done_writing_value();
+    if (value == 0)
+    {
+        return {};
+    }
+    return {
+        WaitSemaphore{
+            image.cuda_done_writing(),
+            value,
+            VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
+        },
+    };
+}
+
+void QuadLayer::create_sampler()
+{
+    VkSamplerCreateInfo info{};
+    info.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
+    info.magFilter = VK_FILTER_LINEAR;
+    info.minFilter = VK_FILTER_LINEAR;
+    info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
+    info.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+    info.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+    info.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+    info.anisotropyEnable = VK_FALSE; // enable later when XR distance views need it
+    info.maxAnisotropy = 1.0f;
+    info.borderColor = VK_BORDER_COLOR_INT_OPAQUE_BLACK;
+    info.unnormalizedCoordinates = VK_FALSE;
+    info.compareEnable = VK_FALSE;
+    info.compareOp = VK_COMPARE_OP_ALWAYS;
+    info.minLod = 0.0f;
+    info.maxLod = 0.0f;
+    check_vk(vkCreateSampler(ctx_->device(), &info, nullptr, &sampler_), "vkCreateSampler");
+}
+
+void QuadLayer::create_descriptor_set_layout()
+{
+    VkDescriptorSetLayoutBinding binding{};
+    binding.binding = 0;
+    binding.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+    binding.descriptorCount = 1;
+    binding.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
+    binding.pImmutableSamplers = nullptr;
+
+    VkDescriptorSetLayoutCreateInfo info{};
+    info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
+    info.bindingCount = 1;
+    info.pBindings = &binding;
+    check_vk(vkCreateDescriptorSetLayout(ctx_->device(), &info, nullptr, &descriptor_set_layout_),
+             "vkCreateDescriptorSetLayout");
+}
+
+void QuadLayer::create_pipeline_layout()
+{
+    VkPipelineLayoutCreateInfo info{};
+    info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
+    info.setLayoutCount = 1;
+    info.pSetLayouts = &descriptor_set_layout_;
+    info.pushConstantRangeCount = 0;
+    check_vk(vkCreatePipelineLayout(ctx_->device(), &info, nullptr, &pipeline_layout_), "vkCreatePipelineLayout");
+}
+
+void QuadLayer::create_pipeline()
+{
+    const VkDevice device = ctx_->device();
+
+    VkShaderModule vert =
+        create_shader_module(device, viz::shaders::kTexturedQuadVertSpv, viz::shaders::kTexturedQuadVertSpvSize);
+    VkShaderModule frag =
+        create_shader_module(device, viz::shaders::kTexturedQuadFragSpv, viz::shaders::kTexturedQuadFragSpvSize);
+
+    // RAII: shader modules are only needed during pipeline creation.
+    struct ShaderGuard
+    {
+        VkDevice device;
+        VkShaderModule vert;
+        VkShaderModule frag;
+        ~ShaderGuard()
+        {
+            if (vert != VK_NULL_HANDLE)
+            {
+                vkDestroyShaderModule(device, vert, nullptr);
+            }
+            if (frag != VK_NULL_HANDLE)
+            {
+                vkDestroyShaderModule(device, frag, nullptr);
+            }
+        }
+    } guard{ device, vert, frag };
+
+    VkPipelineShaderStageCreateInfo stages[2]{};
+    stages[0].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+    stages[0].stage = VK_SHADER_STAGE_VERTEX_BIT;
+    stages[0].module = vert;
+    stages[0].pName = "main";
+    stages[1].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+    stages[1].stage = VK_SHADER_STAGE_FRAGMENT_BIT;
+    stages[1].module = frag;
+    stages[1].pName = "main";
+
+    VkPipelineVertexInputStateCreateInfo vertex_input{};
+    vertex_input.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO;
+
+    VkPipelineInputAssemblyStateCreateInfo input_assembly{};
+    input_assembly.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
+    input_assembly.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
+
+    // Viewport / scissor are dynamic so one pipeline works across
+    // resolutions.
+    VkPipelineViewportStateCreateInfo viewport_state{};
+    viewport_state.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO;
+    viewport_state.viewportCount = 1;
+    viewport_state.scissorCount = 1;
+
+    VkPipelineRasterizationStateCreateInfo rasterizer{};
+    rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO;
+    rasterizer.polygonMode = VK_POLYGON_MODE_FILL;
+    rasterizer.cullMode = VK_CULL_MODE_NONE;
+    rasterizer.frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE;
+    rasterizer.lineWidth = 1.0f;
+
+    VkPipelineMultisampleStateCreateInfo multisample{};
+    multisample.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO;
+    multisample.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT;
+
+    // Depth disabled — fullscreen blits don't need it.
+    VkPipelineDepthStencilStateCreateInfo depth_stencil{};
+    depth_stencil.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO;
+    depth_stencil.depthTestEnable = VK_FALSE;
+    depth_stencil.depthWriteEnable = VK_FALSE;
+
+    VkPipelineColorBlendAttachmentState blend_attachment{};
+    blend_attachment.blendEnable = VK_FALSE;
+    blend_attachment.colorWriteMask =
+        VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT;
+
+    VkPipelineColorBlendStateCreateInfo color_blend{};
+    color_blend.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO;
+    color_blend.attachmentCount = 1;
+    color_blend.pAttachments = &blend_attachment;
+
+    const VkDynamicState dynamic_states[] = { VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR };
+    VkPipelineDynamicStateCreateInfo dynamic{};
+    dynamic.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO;
+    dynamic.dynamicStateCount = sizeof(dynamic_states) / sizeof(dynamic_states[0]);
+    dynamic.pDynamicStates = dynamic_states;
+
+    VkGraphicsPipelineCreateInfo info{};
+    info.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO;
+    info.stageCount = 2;
+    info.pStages = stages;
+    info.pVertexInputState = &vertex_input;
+    info.pInputAssemblyState = &input_assembly;
+    info.pViewportState = &viewport_state;
+    info.pRasterizationState = &rasterizer;
+    info.pMultisampleState = &multisample;
+    info.pDepthStencilState = &depth_stencil;
+    info.pColorBlendState = &color_blend;
+    info.pDynamicState = &dynamic;
+    info.layout = pipeline_layout_;
+    info.renderPass = render_pass_;
+    info.subpass = 0;
+
+    check_vk(vkCreateGraphicsPipelines(device, ctx_->pipeline_cache(), 1, &info, nullptr, &pipeline_),
+             "vkCreateGraphicsPipelines");
+}
+
+void QuadLayer::create_descriptor_pool()
+{
+    VkDescriptorPoolSize pool_size{};
+    pool_size.type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+    pool_size.descriptorCount = kSlotCount;
+
+    VkDescriptorPoolCreateInfo info{};
+    info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
+    info.maxSets = kSlotCount;
+    info.poolSizeCount = 1;
+    info.pPoolSizes = &pool_size;
+    check_vk(vkCreateDescriptorPool(ctx_->device(), &info, nullptr, &descriptor_pool_), "vkCreateDescriptorPool");
+}
+
+void QuadLayer::allocate_descriptor_sets()
+{
+    std::array<VkDescriptorSetLayout, kSlotCount> layouts{};
+    layouts.fill(descriptor_set_layout_);
+
+    VkDescriptorSetAllocateInfo info{};
+    info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
+    info.descriptorPool = descriptor_pool_;
+    info.descriptorSetCount = kSlotCount;
+    info.pSetLayouts = layouts.data();
+    check_vk(vkAllocateDescriptorSets(ctx_->device(), &info, descriptor_sets_.data()), "vkAllocateDescriptorSets");
+}
+
+void QuadLayer::update_descriptor_sets()
+{
+    // One write per slot, each pointing at the slot's own image view.
+    std::array<VkDescriptorImageInfo, kSlotCount> image_infos{};
+    std::array<VkWriteDescriptorSet, kSlotCount> writes{};
+    for (uint32_t i = 0; i < kSlotCount; ++i)
+    {
+        image_infos[i].sampler = sampler_;
+        image_infos[i].imageView = slots_[i]->vk_image_view();
+        image_infos[i].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+
+        writes[i].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+        writes[i].dstSet = descriptor_sets_[i];
+        writes[i].dstBinding = 0;
+        writes[i].dstArrayElement = 0;
+        writes[i].descriptorCount = 1;
+        writes[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+        writes[i].pImageInfo = &image_infos[i];
+    }
+    vkUpdateDescriptorSets(ctx_->device(), kSlotCount, writes.data(), 0, nullptr);
+}
+
+} // namespace viz
diff --git a/src/viz/layers_tests/cpp/CMakeLists.txt b/src/viz/layers_tests/cpp/CMakeLists.txt
index 0ba68076d..b34de50b8 100644
--- a/src/viz/layers_tests/cpp/CMakeLists.txt
+++ b/src/viz/layers_tests/cpp/CMakeLists.txt
@@ -25,14 +25,18 @@ target_link_libraries(viz_layers_testing
 
 add_library(viz::layers_testing ALIAS viz_layers_testing)
 
-# Test executable for the layers_testing fixtures themselves.
+# Test executable for the layers_testing fixtures themselves and for
+# the production layers in viz::layers (QuadLayer, ...).
 add_executable(viz_layers_tests
     test_clear_rect_layer.cpp
+    test_quad_layer.cpp
     test_throwing_layer.cpp
 )
 
 target_link_libraries(viz_layers_tests PRIVATE
     viz::layers_testing
+    viz::layers
+    viz::test_support
     Catch2::Catch2WithMain
 )
 
diff --git a/src/viz/layers_tests/cpp/test_quad_layer.cpp b/src/viz/layers_tests/cpp/test_quad_layer.cpp
new file mode 100644
index 000000000..ed540e11a
--- /dev/null
+++ b/src/viz/layers_tests/cpp/test_quad_layer.cpp
@@ -0,0 +1,317 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// Tests for QuadLayer: config validation (unit-level) and pipeline /
+// CUDA-Vulkan interop (gpu-level). End-to-end fill+render+readback
+// lives in viz_session_tests where the full VizSession pipeline is
+// available.
+
+#include "test_helpers.hpp"
+
+#include <catch2/catch_test_macros.hpp>
+#include <viz/core/render_target.hpp>
+#include <viz/core/viz_buffer.hpp>
+#include <viz/core/vk_context.hpp>
+#include <viz/layers/quad_layer.hpp>
+
+#include <cuda_runtime.h>
+#include <stdexcept>
+
+using viz::DeviceImage;
+using viz::PixelFormat;
+using viz::QuadLayer;
+using viz::RenderTarget;
+using viz::Resolution;
+using viz::VizBuffer;
+using viz::VkContext;
+
+using viz::testing::is_gpu_available;
+
+// The arg-shape checks (format, resolution, render_pass) run before
+// the VkContext::is_initialized() check, so these unit tests can
+// exercise each rejection path with a default-constructed VkContext.
+//
+// Per-test ordering: a test passes a config that's valid for every
+// earlier check and triggers only the named check.
+
+TEST_CASE("QuadLayer ctor rejects non-RGBA8 pixel format", "[unit][quad_layer]")
+{
+    VkContext ctx;
+    QuadLayer::Config cfg;
+    cfg.resolution = { 64, 64 };
+    cfg.format = PixelFormat::kD32F;
+    CHECK_THROWS_AS(QuadLayer(ctx, VK_NULL_HANDLE, cfg), std::invalid_argument);
+}
+
+TEST_CASE("QuadLayer ctor rejects zero dimensions", "[unit][quad_layer]")
+{
+    VkContext ctx;
+    QuadLayer::Config cfg;
+    cfg.resolution = { 0, 64 };
+    CHECK_THROWS_AS(QuadLayer(ctx, VK_NULL_HANDLE, cfg), std::invalid_argument);
+}
+
+TEST_CASE("QuadLayer ctor rejects null render pass", "[unit][quad_layer]")
+{
+    VkContext ctx;
+    QuadLayer::Config cfg;
+    cfg.resolution = { 64, 64 };
+    CHECK_THROWS_AS(QuadLayer(ctx, VK_NULL_HANDLE, cfg), std::invalid_argument);
+}
+
+TEST_CASE("QuadLayer creates valid Vulkan + CUDA handles for every mailbox slot", "[gpu][quad_layer]")
+{
+    if (!is_gpu_available())
+    {
+        SKIP("No Vulkan-capable GPU available");
+    }
+    VkContext ctx;
+    ctx.init({});
+    auto target = RenderTarget::create(ctx, RenderTarget::Config{ Resolution{ 64, 64 } });
+
+    QuadLayer::Config cfg;
+    cfg.resolution = { 64, 64 };
+    QuadLayer layer(ctx, target->render_pass(), cfg);
+
+    CHECK(layer.name() == "QuadLayer");
+    CHECK(layer.is_visible());
+    CHECK(layer.resolution().width == 64);
+    CHECK(layer.resolution().height == 64);
+    CHECK(layer.format() == PixelFormat::kRGBA8);
+    for (uint32_t i = 0; i < QuadLayer::kSlotCount; ++i)
+    {
+        REQUIRE(layer.device_image(i) != nullptr);
+        CHECK(layer.device_image(i)->vk_image() != VK_NULL_HANDLE);
+        CHECK(layer.device_image(i)->cuda_array() != nullptr);
+    }
+    // Out-of-range slot returns nullptr without crashing.
+    CHECK(layer.device_image(QuadLayer::kSlotCount) == nullptr);
+}
+
+TEST_CASE("QuadLayer destroy is idempotent", "[gpu][quad_layer]")
+{
+    if (!is_gpu_available())
+    {
+        SKIP("No Vulkan-capable GPU available");
+    }
+    VkContext ctx;
+    ctx.init({});
+    auto target = RenderTarget::create(ctx, RenderTarget::Config{ Resolution{ 32, 32 } });
+
+    QuadLayer::Config cfg;
+    cfg.resolution = { 32, 32 };
+    QuadLayer layer(ctx, target->render_pass(), cfg);
+
+    layer.destroy();
+    layer.destroy(); // second call must be a no-op
+}
+
+TEST_CASE("QuadLayer::submit throws after destroy", "[gpu][quad_layer]")
+{
+    if (!is_gpu_available())
+    {
+        SKIP("No Vulkan-capable GPU available");
+    }
+    VkContext ctx;
+    ctx.init({});
+    auto target = RenderTarget::create(ctx, RenderTarget::Config{ Resolution{ 32, 32 } });
+
+    QuadLayer::Config cfg;
+    cfg.resolution = { 32, 32 };
+    QuadLayer layer(ctx, target->render_pass(), cfg);
+    layer.destroy();
+
+    // submit must throw cleanly rather than dereferencing the
+    // released slot DeviceImages / pipeline.
+    viz::VizBuffer src{};
+    src.width = 32;
+    src.height = 32;
+    src.format = PixelFormat::kRGBA8;
+    src.space = viz::MemorySpace::kDevice;
+    src.data = reinterpret_cast<void*>(uintptr_t{ 0x1 }); // never dereferenced
+    CHECK_THROWS_AS(layer.submit(src), std::logic_error);
+}
+
+TEST_CASE("QuadLayer::submit rejects mismatched dimensions / format / space", "[gpu][quad_layer]")
+{
+    if (!is_gpu_available())
+    {
+        SKIP("No Vulkan-capable GPU available");
+    }
+    VkContext ctx;
+    ctx.init({});
+    auto target = RenderTarget::create(ctx, RenderTarget::Config{ Resolution{ 64, 64 } });
+
+    QuadLayer::Config cfg;
+    cfg.resolution = { 64, 64 };
+    QuadLayer layer(ctx, target->render_pass(), cfg);
+
+    // Allocate a small CUDA buffer to point at — content is irrelevant
+    // because the validation rejects the descriptor before any memcpy.
+    void* dev_ptr = nullptr;
+    REQUIRE(cudaMalloc(&dev_ptr, 64 * 64 * 4) == cudaSuccess);
+    struct CudaFreeGuard
+    {
+        void* p;
+        ~CudaFreeGuard()
+        {
+            cudaFree(p);
+        }
+    } guard{ dev_ptr };
+
+    SECTION("kHost rejected")
+    {
+        VizBuffer src{};
+        src.data = dev_ptr;
+        src.width = 64;
+        src.height = 64;
+        src.format = PixelFormat::kRGBA8;
+        src.space = viz::MemorySpace::kHost;
+        CHECK_THROWS_AS(layer.submit(src), std::invalid_argument);
+    }
+    SECTION("dimension mismatch rejected")
+    {
+        VizBuffer src{};
+        src.data = dev_ptr;
+        src.width = 32;
+        src.height = 64;
+        src.format = PixelFormat::kRGBA8;
+        src.space = viz::MemorySpace::kDevice;
+        CHECK_THROWS_AS(layer.submit(src), std::invalid_argument);
+    }
+    SECTION("null data rejected")
+    {
+        VizBuffer src{};
+        src.data = nullptr;
+        src.width = 64;
+        src.height = 64;
+        src.format = PixelFormat::kRGBA8;
+        src.space = viz::MemorySpace::kDevice;
+        CHECK_THROWS_AS(layer.submit(src), std::invalid_argument);
+    }
+}
+
+TEST_CASE("QuadLayer submit accepts a non-default CUDA stream", "[gpu][quad_layer]")
+{
+    if (!is_gpu_available())
+    {
+        SKIP("No Vulkan-capable GPU available");
+    }
+    VkContext ctx;
+    ctx.init({});
+    auto target = RenderTarget::create(ctx, RenderTarget::Config{ Resolution{ 32, 32 } });
+
+    QuadLayer::Config cfg;
+    cfg.resolution = { 32, 32 };
+    QuadLayer layer(ctx, target->render_pass(), cfg);
+
+    cudaStream_t stream = nullptr;
+    REQUIRE(cudaStreamCreate(&stream) == cudaSuccess);
+    struct StreamGuard
+    {
+        cudaStream_t s;
+        ~StreamGuard()
+        {
+            cudaStreamDestroy(s);
+        }
+    } guard{ stream };
+
+    void* dev_ptr = nullptr;
+    REQUIRE(cudaMalloc(&dev_ptr, static_cast<size_t>(32) * 32 * 4) == cudaSuccess);
+    struct CudaFree
+    {
+        void* p;
+        ~CudaFree()
+        {
+            cudaFree(p);
+        }
+    } cuda_free{ dev_ptr };
+
+    viz::VizBuffer src{};
+    src.data = dev_ptr;
+    src.width = 32;
+    src.height = 32;
+    src.format = PixelFormat::kRGBA8;
+    src.pitch = static_cast<size_t>(32) * 4;
+    src.space = viz::MemorySpace::kDevice;
+    REQUIRE_NOTHROW(layer.submit(src, stream));
+    REQUIRE(cudaStreamSynchronize(stream) == cudaSuccess);
+}
+
+TEST_CASE("QuadLayer back-to-back submits cycle through mailbox slots", "[gpu][quad_layer]")
+{
+    if (!is_gpu_available())
+    {
+        SKIP("No Vulkan-capable GPU available");
+    }
+    VkContext ctx;
+    ctx.init({});
+    auto target = RenderTarget::create(ctx, RenderTarget::Config{ Resolution{ 32, 32 } });
+
+    QuadLayer::Config cfg;
+    cfg.resolution = { 32, 32 };
+    QuadLayer layer(ctx, target->render_pass(), cfg);
+
+    void* dev_ptr = nullptr;
+    REQUIRE(cudaMalloc(&dev_ptr, static_cast<size_t>(32) * 32 * 4) == cudaSuccess);
+    struct CudaFree
+    {
+        void* p;
+        ~CudaFree()
+        {
+            cudaFree(p);
+        }
+    } cuda_free{ dev_ptr };
+
+    viz::VizBuffer src{};
+    src.data = dev_ptr;
+    src.width = 32;
+    src.height = 32;
+    src.format = PixelFormat::kRGBA8;
+    src.pitch = static_cast<size_t>(32) * 4;
+    src.space = viz::MemorySpace::kDevice;
+
+    // Without an intervening render(), in_use_ stays kSlotNone, so
+    // every submit() is free to pick any slot that isn't latest_.
+    // We expect each submit's cuda_done_writing counter to advance
+    // monotonically on whichever slot it landed on.
+    uint64_t total_signals_before = 0;
+    for (uint32_t i = 0; i < QuadLayer::kSlotCount; ++i)
+    {
+        total_signals_before += layer.device_image(i)->cuda_done_writing_value();
+    }
+    constexpr uint32_t kSubmits = 8;
+    for (uint32_t i = 0; i < kSubmits; ++i)
+    {
+        REQUIRE_NOTHROW(layer.submit(src));
+    }
+    REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+    uint64_t total_signals_after = 0;
+    for (uint32_t i = 0; i < QuadLayer::kSlotCount; ++i)
+    {
+        total_signals_after += layer.device_image(i)->cuda_done_writing_value();
+    }
+    CHECK(total_signals_after - total_signals_before == kSubmits);
+}
+
+TEST_CASE("QuadLayer visibility toggle is independent of pipeline state", "[gpu][quad_layer]")
+{
+    if (!is_gpu_available())
+    {
+        SKIP("No Vulkan-capable GPU available");
+    }
+    VkContext ctx;
+    ctx.init({});
+    auto target = RenderTarget::create(ctx, RenderTarget::Config{ Resolution{ 32, 32 } });
+
+    QuadLayer::Config cfg;
+    cfg.resolution = { 32, 32 };
+    QuadLayer layer(ctx, target->render_pass(), cfg);
+
+    REQUIRE(layer.is_visible());
+    layer.set_visible(false);
+    CHECK_FALSE(layer.is_visible());
+    layer.set_visible(true);
+    CHECK(layer.is_visible());
+}
diff --git a/src/viz/session/cpp/inc/viz/session/viz_session.hpp b/src/viz/session/cpp/inc/viz/session/viz_session.hpp
index 9b66ed5ce..1a4be519c 100644
--- a/src/viz/session/cpp/inc/viz/session/viz_session.hpp
+++ b/src/viz/session/cpp/inc/viz/session/viz_session.hpp
@@ -157,6 +157,10 @@ class VizSession
     uint32_t get_vk_queue_family_index() const noexcept;
     VkRenderPass get_render_pass() const noexcept;
 
+    // The VkContext driving this session, used by layers that build
+    // their own pipelines. nullptr before create() / after destroy().
+    const VkContext* get_vk_context() const noexcept;
+
 private:
     explicit VizSession(const Config& config);
     void init();
diff --git a/src/viz/session/cpp/viz_compositor.cpp b/src/viz/session/cpp/viz_compositor.cpp
index b8361ffbc..9c2a6a76d 100644
--- a/src/viz/session/cpp/viz_compositor.cpp
+++ b/src/viz/session/cpp/viz_compositor.cpp
@@ -205,17 +205,29 @@ void VizCompositor::render(const std::vector<LayerBase*>& layers, const std::vec
     rp.clearValueCount = static_cast<uint32_t>(clears.size());
     rp.pClearValues = clears.data();
 
-    vkCmdBeginRenderPass(command_buffer_, &rp, VK_SUBPASS_CONTENTS_INLINE);
-
-    // Layer dispatch: insertion order; skip invisible.
+    // Snapshot the visible-layer set ONCE per frame. is_visible() is
+    // an atomic flag; sampling it twice across record / wait-collect
+    // would let a mid-frame toggle record draws but skip the
+    // matching cuda_done_writing wait (or vice versa), which would
+    // race the producer's CUDA copy.
+    std::vector<LayerBase*> visible_layers;
+    visible_layers.reserve(layers.size());
     for (LayerBase* layer : layers)
     {
         if (layer != nullptr && layer->is_visible())
         {
-            layer->record(command_buffer_, views, *render_target_);
+            visible_layers.push_back(layer);
         }
     }
 
+    vkCmdBeginRenderPass(command_buffer_, &rp, VK_SUBPASS_CONTENTS_INLINE);
+
+    // Layer dispatch: insertion order, only the snapshotted visible set.
+    for (LayerBase* layer : visible_layers)
+    {
+        layer->record(command_buffer_, views, *render_target_);
+    }
+
     vkCmdEndRenderPass(command_buffer_);
     check_vk(vkEndCommandBuffer(command_buffer_), "vkEndCommandBuffer");
 
@@ -225,16 +237,46 @@ void VizCompositor::render(const std::vector<LayerBase*>& layers, const std::vec
     // frame and the next render() doesn't deadlock on wait().
     frame_sync_->reset();
 
+    // Collect layer-provided wait timeline semaphores. Each visible
+    // layer contributes; flatten into the arrays vkQueueSubmit
+    // expects (with a chained VkTimelineSemaphoreSubmitInfo for the
+    // per-semaphore counter values).
+    std::vector<VkSemaphore> wait_semaphores;
+    std::vector<uint64_t> wait_values;
+    std::vector<VkPipelineStageFlags> wait_stages;
+    for (LayerBase* layer : visible_layers)
+    {
+        for (const auto& w : layer->get_wait_semaphores())
+        {
+            if (w.semaphore != VK_NULL_HANDLE)
+            {
+                wait_semaphores.push_back(w.semaphore);
+                wait_values.push_back(w.value);
+                wait_stages.push_back(w.wait_stage);
+            }
+        }
+    }
+
+    VkTimelineSemaphoreSubmitInfo timeline{};
+    timeline.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO;
+    timeline.waitSemaphoreValueCount = static_cast<uint32_t>(wait_values.size());
+    timeline.pWaitSemaphoreValues = wait_values.empty() ? nullptr : wait_values.data();
+
     VkSubmitInfo submit{};
     submit.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    submit.pNext = &timeline;
     submit.commandBufferCount = 1;
     submit.pCommandBuffers = &command_buffer_;
+    submit.waitSemaphoreCount = static_cast<uint32_t>(wait_semaphores.size());
+    submit.pWaitSemaphores = wait_semaphores.empty() ? nullptr : wait_semaphores.data();
+    submit.pWaitDstStageMask = wait_stages.empty() ? nullptr : wait_stages.data();
     submit_or_signal_fence(submit, "vkQueueSubmit");
 
     // Wait for completion before returning so readback / next frame sees
     // a consistent state. With 1 frame in flight this is the natural
     // synchronization point; multi-buffered swapchain rendering moves
-    // this wait to the start of the next frame.
+    // this wait to the start of the next frame. QuadLayer's mailbox
+    // depends on this — see quad_layer.hpp.
     frame_sync_->wait();
 }
 
diff --git a/src/viz/session/cpp/viz_session.cpp b/src/viz/session/cpp/viz_session.cpp
index 521b8d6ba..d9e6eea6e 100644
--- a/src/viz/session/cpp/viz_session.cpp
+++ b/src/viz/session/cpp/viz_session.cpp
@@ -18,8 +18,7 @@ void check_offscreen_only(DisplayMode mode, const char* what)
     {
         throw std::runtime_error(std::string("VizSession: ") + what +
                                  " is not implemented for the requested DisplayMode "
-                                 "(only kOffscreen ships in this milestone; "
-                                 "kWindow / kXr arrive with their respective backends)");
+                                 "(only kOffscreen is currently supported)");
     }
 }
 
@@ -259,4 +258,9 @@ VkRenderPass VizSession::get_render_pass() const noexcept
     return compositor_ ? compositor_->render_pass() : VK_NULL_HANDLE;
 }
 
+const VkContext* VizSession::get_vk_context() const noexcept
+{
+    return ctx_ptr_;
+}
+
 } // namespace viz
diff --git a/src/viz/session_tests/cpp/CMakeLists.txt b/src/viz/session_tests/cpp/CMakeLists.txt
index f2ac2ccac..31734b870 100644
--- a/src/viz/session_tests/cpp/CMakeLists.txt
+++ b/src/viz/session_tests/cpp/CMakeLists.txt
@@ -5,12 +5,15 @@ cmake_minimum_required(VERSION 3.20)
 
 add_executable(viz_session_tests
     test_offscreen_render.cpp
+    test_quad_milestone.cpp
     test_viz_session.cpp
 )
 
 target_link_libraries(viz_session_tests PRIVATE
     viz::session
+    viz::layers
     viz::layers_testing
+    viz::test_support
     Catch2::Catch2WithMain
 )
 
diff --git a/src/viz/session_tests/cpp/test_offscreen_render.cpp b/src/viz/session_tests/cpp/test_offscreen_render.cpp
index 9dc2ca171..f25d46d95 100644
--- a/src/viz/session_tests/cpp/test_offscreen_render.cpp
+++ b/src/viz/session_tests/cpp/test_offscreen_render.cpp
@@ -11,6 +11,8 @@
 // VizSession + LayerBase dispatch + readback path. Everything that ships
 // in this milestone, exercised in one test.
 
+#include "test_helpers.hpp"
+
 #include <catch2/catch_test_macros.hpp>
 #include <viz/core/host_image.hpp>
 #include <viz/core/vk_context.hpp>
@@ -27,27 +29,10 @@ using viz::VizSession;
 using viz::testing::ClearRectLayer;
 using viz::testing::ThrowingLayer;
 
-namespace
-{
+using viz::testing::is_gpu_available;
 
-// Same NVIDIA-driver-leak workaround as viz_core_tests: any [gpu] test
-// that creates a VkContext should check this first and SKIP if the
-// runner has no suitable GPU.
-bool gpu_available()
+namespace
 {
-    static const bool cached = []()
-    {
-        for (const auto& info : viz::VkContext::enumerate_physical_devices())
-        {
-            if (info.meets_requirements)
-            {
-                return true;
-            }
-        }
-        return false;
-    }();
-    return cached;
-}
 
 // RGBA8 byte at (x, y) in a tightly-packed row-major framebuffer.
 struct Rgba
@@ -69,7 +54,7 @@ Rgba pixel_at(const HostImage& img, uint32_t x, uint32_t y)
 
 TEST_CASE("Offscreen session renders layer pixels through to readback", "[gpu][viz_session]")
 {
-    if (!gpu_available())
+    if (!is_gpu_available())
     {
         SKIP("No Vulkan-capable GPU available");
     }
@@ -139,7 +124,7 @@ TEST_CASE("Offscreen session renders layer pixels through to readback", "[gpu][v
 
 TEST_CASE("Hidden layer does not contribute to the framebuffer", "[gpu][viz_session]")
 {
-    if (!gpu_available())
+    if (!is_gpu_available())
     {
         SKIP("No Vulkan-capable GPU available");
     }
@@ -180,7 +165,7 @@ TEST_CASE("Hidden layer does not contribute to the framebuffer", "[gpu][viz_sess
 
 TEST_CASE("Multiple frames advance frame_index and avoid leaking sync state", "[gpu][viz_session]")
 {
-    if (!gpu_available())
+    if (!is_gpu_available())
     {
         SKIP("No Vulkan-capable GPU available");
     }
@@ -202,7 +187,7 @@ TEST_CASE("Multiple frames advance frame_index and avoid leaking sync state", "[
 
 TEST_CASE("Session recovers from a layer that throws and renders the next frame", "[gpu][viz_session]")
 {
-    if (!gpu_available())
+    if (!is_gpu_available())
     {
         SKIP("No Vulkan-capable GPU available");
     }
@@ -241,7 +226,7 @@ TEST_CASE("Session recovers from a layer that throws and renders the next frame"
 
 TEST_CASE("Layer that throws does not corrupt the layer registry", "[gpu][viz_session]")
 {
-    if (!gpu_available())
+    if (!is_gpu_available())
     {
         SKIP("No Vulkan-capable GPU available");
     }
@@ -275,7 +260,7 @@ TEST_CASE("Layer that throws does not corrupt the layer registry", "[gpu][viz_se
 
 TEST_CASE("begin_frame / end_frame must be paired", "[gpu][viz_session]")
 {
-    if (!gpu_available())
+    if (!is_gpu_available())
     {
         SKIP("No Vulkan-capable GPU available");
     }
diff --git a/src/viz/session_tests/cpp/test_quad_milestone.cpp b/src/viz/session_tests/cpp/test_quad_milestone.cpp
new file mode 100644
index 000000000..3f04581dc
--- /dev/null
+++ b/src/viz/session_tests/cpp/test_quad_milestone.cpp
@@ -0,0 +1,482 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// End-to-end CUDA-Vulkan interop through VizSession: producer writes
+// pixels into a caller-owned CUDA buffer, QuadLayer::submit() copies
+// them into one of the mailbox slots, the next render() samples that
+// slot, readback_to_host() pulls the framebuffer back out.
+//
+// Pattern: 4 quadrants of {0, 255}-only RGBA — exact through any
+// sRGB / UNORM gamma curve because the curve endpoints map to
+// themselves. A separate midtone test covers the gamma round-trip.
+
+#include "test_helpers.hpp"
+
+#include <catch2/catch_test_macros.hpp>
+#include <viz/core/host_image.hpp>
+#include <viz/core/viz_buffer.hpp>
+#include <viz/core/vk_context.hpp>
+#include <viz/layers/quad_layer.hpp>
+#include <viz/session/viz_session.hpp>
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstdlib>
+#include <cuda_runtime.h>
+#include <vector>
+
+using viz::DisplayMode;
+using viz::HostImage;
+using viz::PixelFormat;
+using viz::QuadLayer;
+using viz::Resolution;
+using viz::VizSession;
+
+using viz::testing::is_gpu_available;
+
+namespace
+{
+
+// 4 quadrants, each a different {0, 255}-only color. Round-trip-exact
+// through Vulkan's sRGB attachment encoding because both endpoints of
+// the gamma curve (0 and 255) map to themselves.
+//
+//   top-left = red, top-right = green, bottom-left = blue,
+//   bottom-right = white.
+struct Rgba
+{
+    uint8_t r, g, b, a;
+};
+
+Rgba quadrant_color(uint32_t x, uint32_t y, uint32_t w, uint32_t h)
+{
+    const bool right = x >= w / 2;
+    const bool bottom = y >= h / 2;
+    if (!right && !bottom)
+    {
+        return { 255, 0, 0, 255 };
+    }
+    if (right && !bottom)
+    {
+        return { 0, 255, 0, 255 };
+    }
+    if (!right && bottom)
+    {
+        return { 0, 0, 255, 255 };
+    }
+    return { 255, 255, 255, 255 };
+}
+
+std::vector<Rgba> build_host_pattern(uint32_t side)
+{
+    std::vector<Rgba> px(static_cast<size_t>(side) * side);
+    for (uint32_t y = 0; y < side; ++y)
+    {
+        for (uint32_t x = 0; x < side; ++x)
+        {
+            px[static_cast<size_t>(y) * side + x] = quadrant_color(x, y, side, side);
+        }
+    }
+    return px;
+}
+
+Rgba pixel_at(const HostImage& img, uint32_t x, uint32_t y)
+{
+    const size_t i = (static_cast<size_t>(y) * img.resolution().width + x) * 4;
+    const uint8_t* p = img.data() + i;
+    return Rgba{ p[0], p[1], p[2], p[3] };
+}
+
+// Asserts the readback contains the 4-quadrant pattern at the four
+// quadrant centers. Centers (kSide/4, kSide/4) etc. are deep inside
+// each color region, far from any rasterization-edge ambiguity.
+void check_quadrant_pattern(const HostImage& image, uint32_t side)
+{
+    const Rgba top_left = pixel_at(image, side / 4, side / 4);
+    CHECK(top_left.r == 255);
+    CHECK(top_left.g == 0);
+    CHECK(top_left.b == 0);
+    CHECK(top_left.a == 255);
+
+    const Rgba top_right = pixel_at(image, 3 * side / 4, side / 4);
+    CHECK(top_right.r == 0);
+    CHECK(top_right.g == 255);
+    CHECK(top_right.b == 0);
+    CHECK(top_right.a == 255);
+
+    const Rgba bottom_left = pixel_at(image, side / 4, 3 * side / 4);
+    CHECK(bottom_left.r == 0);
+    CHECK(bottom_left.g == 0);
+    CHECK(bottom_left.b == 255);
+    CHECK(bottom_left.a == 255);
+
+    const Rgba bottom_right = pixel_at(image, 3 * side / 4, 3 * side / 4);
+    CHECK(bottom_right.r == 255);
+    CHECK(bottom_right.g == 255);
+    CHECK(bottom_right.b == 255);
+    CHECK(bottom_right.a == 255);
+}
+
+// RAII wrapper that frees the cudaMalloc'd device pointer on scope exit.
+struct CudaFreeGuard
+{
+    void* p;
+    ~CudaFreeGuard()
+    {
+        cudaFree(p);
+    }
+};
+
+} // namespace
+
+TEST_CASE("QuadLayer submit() round-trips CUDA pixels to readback", "[gpu][quad_layer][milestone]")
+{
+    if (!is_gpu_available())
+    {
+        SKIP("No Vulkan-capable GPU available");
+    }
+
+    constexpr uint32_t kSide = 64;
+
+    VizSession::Config cfg{};
+    cfg.mode = DisplayMode::kOffscreen;
+    cfg.window_width = kSide;
+    cfg.window_height = kSide;
+
+    auto session = VizSession::create(cfg);
+    REQUIRE(session != nullptr);
+    REQUIRE(session->get_state() == viz::SessionState::kReady);
+
+    const auto* ctx = session->get_vk_context();
+    REQUIRE(ctx != nullptr);
+    const VkRenderPass render_pass = session->get_render_pass();
+    REQUIRE(render_pass != VK_NULL_HANDLE);
+
+    QuadLayer::Config layer_cfg;
+    layer_cfg.name = "milestone_quad_mode_a";
+    layer_cfg.resolution = { kSide, kSide };
+    auto* layer = session->add_layer<QuadLayer>(*ctx, render_pass, layer_cfg);
+    REQUIRE(layer != nullptr);
+
+    // Stage the pattern in a caller-owned cudaMalloc'd buffer — this
+    // mirrors how a real Mode A consumer (camera decoder, NN renderer)
+    // hands data to submit().
+    const auto host_pattern = build_host_pattern(kSide);
+    void* device_ptr = nullptr;
+    REQUIRE(cudaMalloc(&device_ptr, host_pattern.size() * sizeof(Rgba)) == cudaSuccess);
+    CudaFreeGuard guard{ device_ptr };
+    REQUIRE(cudaMemcpy(device_ptr, host_pattern.data(), host_pattern.size() * sizeof(Rgba), cudaMemcpyHostToDevice) ==
+            cudaSuccess);
+    REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+    viz::VizBuffer src{};
+    src.data = device_ptr;
+    src.width = kSide;
+    src.height = kSide;
+    src.format = PixelFormat::kRGBA8;
+    src.pitch = static_cast<size_t>(kSide) * 4;
+    src.space = viz::MemorySpace::kDevice;
+    layer->submit(src);
+
+    const auto info = session->render();
+    CHECK(info.frame_index == 0);
+    CHECK(info.resolution.width == kSide);
+    CHECK(info.resolution.height == kSide);
+
+    const auto image = session->readback_to_host();
+    REQUIRE(image.resolution().width == kSide);
+    REQUIRE(image.resolution().height == kSide);
+    check_quadrant_pattern(image, kSide);
+}
+
+TEST_CASE("QuadLayer multi-frame submit/render/readback loop stays correct", "[gpu][quad_layer][milestone]")
+{
+    if (!is_gpu_available())
+    {
+        SKIP("No Vulkan-capable GPU available");
+    }
+
+    constexpr uint32_t kSide = 64;
+    constexpr int kFrames = 16;
+
+    VizSession::Config cfg{};
+    cfg.mode = DisplayMode::kOffscreen;
+    cfg.window_width = kSide;
+    cfg.window_height = kSide;
+
+    auto session = VizSession::create(cfg);
+    REQUIRE(session != nullptr);
+    const auto* ctx = session->get_vk_context();
+    REQUIRE(ctx != nullptr);
+
+    QuadLayer::Config layer_cfg;
+    layer_cfg.name = "milestone_quad_multiframe";
+    layer_cfg.resolution = { kSide, kSide };
+    auto* layer = session->add_layer<QuadLayer>(*ctx, session->get_render_pass(), layer_cfg);
+    REQUIRE(layer != nullptr);
+
+    void* device_ptr = nullptr;
+    REQUIRE(cudaMalloc(&device_ptr, static_cast<size_t>(kSide) * kSide * 4) == cudaSuccess);
+    CudaFreeGuard guard{ device_ptr };
+
+    // Each frame fills with a different solid-color palette entry
+    // (channels in {0, 255} for sRGB-exact round-trip). With the
+    // 3-slot mailbox the producer's submit and the renderer's draw
+    // pipeline naturally; frame N's readback must still contain
+    // frame N's color, not a stale or torn frame.
+    const std::array<Rgba, 4> palette = { {
+        { 255, 0, 0, 255 },
+        { 0, 255, 0, 255 },
+        { 0, 0, 255, 255 },
+        { 255, 255, 255, 255 },
+    } };
+
+    std::vector<Rgba> host_buf(static_cast<size_t>(kSide) * kSide);
+    for (int frame = 0; frame < kFrames; ++frame)
+    {
+        const Rgba expected = palette[frame % palette.size()];
+        std::fill(host_buf.begin(), host_buf.end(), expected);
+        REQUIRE(cudaMemcpy(device_ptr, host_buf.data(), host_buf.size() * sizeof(Rgba), cudaMemcpyHostToDevice) ==
+                cudaSuccess);
+
+        viz::VizBuffer src{};
+        src.data = device_ptr;
+        src.width = kSide;
+        src.height = kSide;
+        src.format = PixelFormat::kRGBA8;
+        src.pitch = static_cast<size_t>(kSide) * 4;
+        src.space = viz::MemorySpace::kDevice;
+        layer->submit(src);
+
+        session->render();
+
+        const auto image = session->readback_to_host();
+        const auto sample = pixel_at(image, kSide / 2, kSide / 2);
+        CHECK(sample.r == expected.r);
+        CHECK(sample.g == expected.g);
+        CHECK(sample.b == expected.b);
+        CHECK(sample.a == expected.a);
+    }
+}
+
+TEST_CASE("QuadLayer round-trips midtone RGBA values exactly", "[gpu][quad_layer][milestone]")
+{
+    // The {0, 255}-only round-trip tests don't exercise the sRGB
+    // color-space round-trip — those endpoints map to themselves
+    // through any gamma curve. Mid-range bytes are only exact when
+    // the storage UNORM image is sampled through an SRGB view
+    // (decode at sample) and the SRGB color attachment encodes on
+    // write. Net of decode+encode is identity.
+    if (!is_gpu_available())
+    {
+        SKIP("No Vulkan-capable GPU available");
+    }
+
+    constexpr uint32_t kSide = 64;
+    VizSession::Config cfg{};
+    cfg.mode = DisplayMode::kOffscreen;
+    cfg.window_width = kSide;
+    cfg.window_height = kSide;
+
+    auto session = VizSession::create(cfg);
+    REQUIRE(session != nullptr);
+    const auto* ctx = session->get_vk_context();
+    REQUIRE(ctx != nullptr);
+
+    QuadLayer::Config layer_cfg;
+    layer_cfg.name = "milestone_quad_midtone";
+    layer_cfg.resolution = { kSide, kSide };
+    auto* layer = session->add_layer<QuadLayer>(*ctx, session->get_render_pass(), layer_cfg);
+    REQUIRE(layer != nullptr);
+
+    // A non-trivial midtone (~50% gray, mixed channels). With the
+    // wrong color-space wiring this would shift by ~5–20% per channel.
+    constexpr Rgba kExpected = { 64, 128, 200, 255 };
+
+    std::vector<Rgba> host_buf(static_cast<size_t>(kSide) * kSide, kExpected);
+    void* device_ptr = nullptr;
+    REQUIRE(cudaMalloc(&device_ptr, host_buf.size() * sizeof(Rgba)) == cudaSuccess);
+    CudaFreeGuard guard{ device_ptr };
+    REQUIRE(cudaMemcpy(device_ptr, host_buf.data(), host_buf.size() * sizeof(Rgba), cudaMemcpyHostToDevice) ==
+            cudaSuccess);
+    REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+    viz::VizBuffer src{};
+    src.data = device_ptr;
+    src.width = kSide;
+    src.height = kSide;
+    src.format = PixelFormat::kRGBA8;
+    src.pitch = static_cast<size_t>(kSide) * 4;
+    src.space = viz::MemorySpace::kDevice;
+    layer->submit(src);
+
+    session->render();
+
+    const auto image = session->readback_to_host();
+    const auto sample = pixel_at(image, kSide / 2, kSide / 2);
+    // Round-trip should be exact; allow ±1 LSB for any quantization
+    // edge case on the Vulkan->host blit path.
+    CHECK(std::abs(int(sample.r) - int(kExpected.r)) <= 1);
+    CHECK(std::abs(int(sample.g) - int(kExpected.g)) <= 1);
+    CHECK(std::abs(int(sample.b) - int(kExpected.b)) <= 1);
+    CHECK(sample.a == kExpected.a);
+}
+
+TEST_CASE("QuadLayer with no submit yet renders the clear color", "[gpu][quad_layer][milestone]")
+{
+    // Pins the kSlotNone short-circuit in record() / get_wait_semaphores().
+    if (!is_gpu_available())
+    {
+        SKIP("No Vulkan-capable GPU available");
+    }
+
+    constexpr uint32_t kSide = 64;
+    VizSession::Config cfg{};
+    cfg.mode = DisplayMode::kOffscreen;
+    cfg.window_width = kSide;
+    cfg.window_height = kSide;
+    // Distinctive non-default clear so a coincidental black draw can't pass.
+    cfg.clear_color[0] = 0.0f;
+    cfg.clear_color[1] = 1.0f;
+    cfg.clear_color[2] = 0.0f;
+    cfg.clear_color[3] = 1.0f;
+
+    auto session = VizSession::create(cfg);
+    REQUIRE(session != nullptr);
+    const auto* ctx = session->get_vk_context();
+    REQUIRE(ctx != nullptr);
+
+    QuadLayer::Config layer_cfg;
+    layer_cfg.name = "milestone_quad_no_submit";
+    layer_cfg.resolution = { kSide, kSide };
+    auto* layer = session->add_layer<QuadLayer>(*ctx, session->get_render_pass(), layer_cfg);
+    REQUIRE(layer != nullptr);
+
+    session->render();
+    const auto image = session->readback_to_host();
+    const auto sample = pixel_at(image, kSide / 2, kSide / 2);
+    CHECK(sample.r == 0);
+    CHECK(sample.g == 255);
+    CHECK(sample.b == 0);
+    CHECK(sample.a == 255);
+}
+
+TEST_CASE("QuadLayer re-renders the same publish when no new submit arrives", "[gpu][quad_layer][milestone]")
+{
+    // Pins: record() keeps in_use_ stable across frames if latest_ doesn't change.
+    if (!is_gpu_available())
+    {
+        SKIP("No Vulkan-capable GPU available");
+    }
+
+    constexpr uint32_t kSide = 64;
+    VizSession::Config cfg{};
+    cfg.mode = DisplayMode::kOffscreen;
+    cfg.window_width = kSide;
+    cfg.window_height = kSide;
+
+    auto session = VizSession::create(cfg);
+    REQUIRE(session != nullptr);
+    const auto* ctx = session->get_vk_context();
+    REQUIRE(ctx != nullptr);
+
+    QuadLayer::Config layer_cfg;
+    layer_cfg.name = "milestone_quad_resubmit_none";
+    layer_cfg.resolution = { kSide, kSide };
+    auto* layer = session->add_layer<QuadLayer>(*ctx, session->get_render_pass(), layer_cfg);
+    REQUIRE(layer != nullptr);
+
+    constexpr Rgba kColor = { 255, 0, 255, 255 };
+    std::vector<Rgba> host_buf(static_cast<size_t>(kSide) * kSide, kColor);
+    void* device_ptr = nullptr;
+    REQUIRE(cudaMalloc(&device_ptr, host_buf.size() * sizeof(Rgba)) == cudaSuccess);
+    CudaFreeGuard guard{ device_ptr };
+    REQUIRE(cudaMemcpy(device_ptr, host_buf.data(), host_buf.size() * sizeof(Rgba), cudaMemcpyHostToDevice) ==
+            cudaSuccess);
+
+    viz::VizBuffer src{};
+    src.data = device_ptr;
+    src.width = kSide;
+    src.height = kSide;
+    src.format = PixelFormat::kRGBA8;
+    src.pitch = static_cast<size_t>(kSide) * 4;
+    src.space = viz::MemorySpace::kDevice;
+    layer->submit(src);
+
+    for (int i = 0; i < 2; ++i)
+    {
+        session->render();
+        const auto image = session->readback_to_host();
+        const auto sample = pixel_at(image, kSide / 2, kSide / 2);
+        CHECK(sample.r == kColor.r);
+        CHECK(sample.g == kColor.g);
+        CHECK(sample.b == kColor.b);
+        CHECK(sample.a == kColor.a);
+    }
+}
+
+TEST_CASE("QuadLayer fast producer: render samples only the latest publish", "[gpu][quad_layer][milestone]")
+{
+    // Pins the core mailbox guarantee — intermediate publishes are dropped.
+    if (!is_gpu_available())
+    {
+        SKIP("No Vulkan-capable GPU available");
+    }
+
+    constexpr uint32_t kSide = 64;
+    VizSession::Config cfg{};
+    cfg.mode = DisplayMode::kOffscreen;
+    cfg.window_width = kSide;
+    cfg.window_height = kSide;
+
+    auto session = VizSession::create(cfg);
+    REQUIRE(session != nullptr);
+    const auto* ctx = session->get_vk_context();
+    REQUIRE(ctx != nullptr);
+
+    QuadLayer::Config layer_cfg;
+    layer_cfg.name = "milestone_quad_fast_producer";
+    layer_cfg.resolution = { kSide, kSide };
+    auto* layer = session->add_layer<QuadLayer>(*ctx, session->get_render_pass(), layer_cfg);
+    REQUIRE(layer != nullptr);
+
+    void* device_ptr = nullptr;
+    REQUIRE(cudaMalloc(&device_ptr, static_cast<size_t>(kSide) * kSide * 4) == cudaSuccess);
+    CudaFreeGuard guard{ device_ptr };
+
+    // Five back-to-back submits, no intervening render. The last one must win.
+    const std::array<Rgba, 5> palette = { {
+        { 255, 0, 0, 255 },
+        { 0, 255, 0, 255 },
+        { 0, 0, 255, 255 },
+        { 255, 255, 0, 255 },
+        { 0, 255, 255, 255 },
+    } };
+
+    std::vector<Rgba> host_buf(static_cast<size_t>(kSide) * kSide);
+    for (const auto& color : palette)
+    {
+        std::fill(host_buf.begin(), host_buf.end(), color);
+        REQUIRE(cudaMemcpy(device_ptr, host_buf.data(), host_buf.size() * sizeof(Rgba), cudaMemcpyHostToDevice) ==
+                cudaSuccess);
+
+        viz::VizBuffer src{};
+        src.data = device_ptr;
+        src.width = kSide;
+        src.height = kSide;
+        src.format = PixelFormat::kRGBA8;
+        src.pitch = static_cast<size_t>(kSide) * 4;
+        src.space = viz::MemorySpace::kDevice;
+        layer->submit(src);
+    }
+
+    session->render();
+    const auto image = session->readback_to_host();
+    const auto sample = pixel_at(image, kSide / 2, kSide / 2);
+    const auto expected = palette.back();
+    CHECK(sample.r == expected.r);
+    CHECK(sample.g == expected.g);
+    CHECK(sample.b == expected.b);
+    CHECK(sample.a == expected.a);
+}
diff --git a/src/viz/shaders/cpp/textured_quad.frag b/src/viz/shaders/cpp/textured_quad.frag
index 29b92d6c9..00243763b 100644
--- a/src/viz/shaders/cpp/textured_quad.frag
+++ b/src/viz/shaders/cpp/textured_quad.frag
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 // Samples a combined image sampler at descriptor binding 0. Used by
-// QuadLayer (ships in M3b) to display a CUDA-fed texture as a quad.
+// QuadLayer to display a CUDA-fed texture as a fullscreen quad.
 
 #version 450