From c991a64a7e8cc95146d9d6076ced7457b5f98875 Mon Sep 17 00:00:00 2001
From: Farbod Motlagh <fmotlagh@nvidia.com>
Date: Fri, 1 May 2026 10:31:31 -0700
Subject: [PATCH 01/10] Add televiz shaders + DeviceImage CUDA-Vulkan interop
 (m3a)

- viz/shaders/: glslang-compiled SPIR-V embedded as constexpr arrays;
  ships textured_quad.{vert,frag} for the upcoming QuadLayer.
- viz/core/device_image: VkImage backed by external memory, imported
  into CUDA as cudaArray_t. Symmetric pair to HostImage.
- viz/core/vk_context: pin CUDA device to Vulkan physical device by
  UUID at init() so interop types can assume same-GPU operation.
- CI: add CUDA Toolkit + glslang-tools to build-ubuntu and sanitizer.
- Tests: viz_shaders_tests, viz_core_tests DeviceImage round-trip;
  all unit tests pass under ASAN+UBSAN.

Signed-off-by: Farbod Motlagh <fmotlagh@nvidia.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/workflows/build-ubuntu.yml            |  23 +-
 deps/README.md                                |  18 +-
 src/viz/AGENTS.md                             |  36 +-
 src/viz/CMakeLists.txt                        |   5 +
 src/viz/core/cpp/CMakeLists.txt               |  11 +-
 src/viz/core/cpp/device_image.cpp             | 378 ++++++++++++++++++
 .../core/cpp/inc/viz/core/device_image.hpp    | 124 +++++-
 src/viz/core/cpp/inc/viz/core/vk_context.hpp  |   9 +
 src/viz/core/cpp/vk_context.cpp               |  49 +++
 src/viz/core_tests/cpp/CMakeLists.txt         |   1 +
 src/viz/core_tests/cpp/test_device_image.cpp  | 117 ++++++
 src/viz/shaders/CMakeLists.txt                |   9 +
 src/viz/shaders/cpp/CMakeLists.txt            |  66 +++
 src/viz/shaders/cpp/compile_shader.cmake      |  56 +++
 src/viz/shaders/cpp/textured_quad.frag        |  17 +
 src/viz/shaders/cpp/textured_quad.vert        |  16 +
 src/viz/shaders_tests/CMakeLists.txt          |   7 +
 src/viz/shaders_tests/cpp/CMakeLists.txt      |  17 +
 .../shaders_tests/cpp/test_shader_blobs.cpp   |  44 ++
 19 files changed, 972 insertions(+), 31 deletions(-)
 create mode 100644 src/viz/core/cpp/device_image.cpp
 create mode 100644 src/viz/core_tests/cpp/test_device_image.cpp
 create mode 100644 src/viz/shaders/CMakeLists.txt
 create mode 100644 src/viz/shaders/cpp/CMakeLists.txt
 create mode 100644 src/viz/shaders/cpp/compile_shader.cmake
 create mode 100644 src/viz/shaders/cpp/textured_quad.frag
 create mode 100644 src/viz/shaders/cpp/textured_quad.vert
 create mode 100644 src/viz/shaders_tests/CMakeLists.txt
 create mode 100644 src/viz/shaders_tests/cpp/CMakeLists.txt
 create mode 100644 src/viz/shaders_tests/cpp/test_shader_blobs.cpp

diff --git a/.github/workflows/build-ubuntu.yml b/.github/workflows/build-ubuntu.yml
index eafb25592..0a47318a6 100644
--- a/.github/workflows/build-ubuntu.yml
+++ b/.github/workflows/build-ubuntu.yml
@@ -39,13 +39,25 @@ jobs:
     - name: Install Apt dependencies
       run: |
         sudo apt-get update
-        sudo apt-get install -y build-essential cmake libx11-dev clang-format-14 ccache libvulkan-dev
+        sudo apt-get install -y build-essential cmake libx11-dev clang-format-14 ccache libvulkan-dev glslang-tools
 
     - name: Install patchelf (Release only)
       if: ${{ matrix.build_type == 'Release' }}
       run: |
         sudo apt-get update
         sudo apt-get install -y patchelf
+
+    # CUDA toolkit needed at link time for viz_core (cuda_texture +
+    # device_image link libcudart). The wheel itself excludes
+    # libcuda.so.1 via auditwheel; runtime CUDA is the consumer's
+    # responsibility.
+    - name: Install CUDA toolkit
+      uses: Jimver/cuda-toolkit@v0.2.19
+      with:
+        cuda: '12.4.0'
+        method: 'network'
+        sub-packages: '["nvcc", "cudart"]'
+
     - name: Setup CloudXR SDK
       id: cloudxr-sdk
       uses: ./.github/actions/setup-cloudxr-sdk
@@ -262,7 +274,14 @@ jobs:
     - name: Install Apt dependencies
       run: |
         sudo apt-get update
-        sudo apt-get install -y build-essential cmake libx11-dev clang-format-14 ccache libvulkan-dev
+        sudo apt-get install -y build-essential cmake libx11-dev clang-format-14 ccache libvulkan-dev glslang-tools
+
+    - name: Install CUDA toolkit
+      uses: Jimver/cuda-toolkit@v0.2.19
+      with:
+        cuda: '12.4.0'
+        method: 'network'
+        sub-packages: '["nvcc", "cudart"]'
 
     - name: Cache ccache
       uses: actions/cache@v5
diff --git a/deps/README.md b/deps/README.md
index f004d29db..3cb4f3496 100644
--- a/deps/README.md
+++ b/deps/README.md
@@ -54,14 +54,24 @@ machine and are located via `find_package`.
 - **Min version**: 1.2 (Televiz checks `VK_API_VERSION_1_2` at device select time).
 - **License**: Apache 2.0 (loader); per-vendor for ICD drivers.
 
-### CUDA Toolkit (optional, used by examples)
+### CUDA Toolkit
 - **Locator**: `find_package(CUDAToolkit REQUIRED)`
-- **Required by**: `examples/camera_streamer/` only today; `viz` adds CUDA
-  dependency once CUDA-Vulkan interop lands; CI must have CUDA installed for
-  GPU test runner to exercise these paths.
+- **Required by**: `viz/core/` (`CudaTexture`, `DeviceImage` link to
+  `CUDA::cudart`) when `BUILD_VIZ=ON`; also `examples/camera_streamer/`.
 - **Min version**: 12.0
+- **CI**: installed via `Jimver/cuda-toolkit@v0.2.x` action with
+  `nvcc` + `cudart` sub-packages.
 - **License**: NVIDIA EULA
 
+### glslangValidator (shader compiler)
+- **Locator**: `find_program(GLSLANG_VALIDATOR glslangValidator REQUIRED)`
+- **Required by**: `viz/shaders/` to compile `.vert` / `.frag` GLSL
+  into SPIR-V at build time.
+- **Linux**: `apt-get install glslang-tools`
+- **Windows**: ships with the LunarG Vulkan SDK
+- **macOS**: `brew install glslang`
+- **License**: BSD-3 / Khronos
+
 ## Third-Party Dependencies
 
 ### OpenXR SDK
diff --git a/src/viz/AGENTS.md b/src/viz/AGENTS.md
index 8219ce243..67735dedd 100644
--- a/src/viz/AGENTS.md
+++ b/src/viz/AGENTS.md
@@ -17,11 +17,19 @@ sibling `<sub-module>_tests/` directory:
 
 - **`viz/core/`** — foundational types + Vulkan/CUDA infrastructure.
   Library: `viz_core`. Today: `VkContext`, `VizBuffer`, `Pose3D`, `Fov`,
-  `Resolution`, `ViewInfo`, `PixelFormat`, `RenderTarget`, `FrameSync`.
-  Future: `cuda_texture`. Math types (`glm::vec3`, `glm::quat`,
+  `Resolution`, `ViewInfo`, `PixelFormat`, `RenderTarget`, `FrameSync`,
+  `HostImage`, `DeviceImage`. `HostImage` / `DeviceImage` are the
+  symmetric pair of owning 2D pixel buffers (CPU bytes vs CUDA-Vulkan
+  interop) — both expose `VizBuffer view()` so generic helpers branch
+  on `VizBuffer::space`. Math types (`glm::vec3`, `glm::quat`,
   `glm::mat4`) come from GLM 1.0.1 (FetchContent in
   `deps/third_party/`); use `glm::value_ptr(mat)` to get a raw `float*`
   for Vulkan / CUDA upload (POD-equivalent layout, no copy).
+  CUDA-Vulkan interop requires CUDA Toolkit at link time
+  (`CUDAToolkit::cudart`). `VkContext::init()` matches the current
+  CUDA device to the chosen Vulkan physical device by UUID — every
+  viz_core type can assume CUDA and Vulkan are talking to the same
+  GPU without re-doing the match.
 - **`viz/layers/`** — `LayerBase` and concrete layers (`QuadLayer`, etc.).
   Library: `viz_layers` (INTERFACE / header-only today; promoted to
   STATIC when the first concrete layer ships). Depends on `viz_core`.
@@ -39,11 +47,18 @@ sibling `<sub-module>_tests/` directory:
   frame loop, type conversion). Library: `viz_xr`. **Optional** behind
   `BUILD_VIZ_XR`. Depends on `viz_core` + OpenXR.
 - **`viz/python/`** — pybind11 module `_viz`, exposed as `isaacteleop.viz`.
-- **`viz/shaders/`** — GLSL → SPIR-V at build time.
+- **`viz/shaders/`** — GLSL → SPIR-V at build time. Library: `viz_shaders`
+  (INTERFACE — exposes generated headers `viz/shaders/<name>.spv.h`,
+  each containing an `inline constexpr alignas(uint32_t) unsigned char`
+  byte array + a `Size` constant). Compilation runs `glslangValidator`
+  (system-installed; CI gets `glslang-tools` apt package). Add new
+  shader programs by dropping `<name>.vert` / `<name>.frag` in
+  `viz/shaders/cpp/` and calling `compile_shader(<name>.vert kVarName)`
+  in the local CMakeLists.
 
 Test directories follow the same per-module pattern:
 `viz/core_tests/`, `viz/layers_tests/`, `viz/session_tests/`,
-`viz/xr_tests/`.
+`viz/shaders_tests/`, `viz/xr_tests/`.
 
 `src/viz/CMakeLists.txt` is an **orchestrator only** — it adds the
 sub-module sub-directories. Sub-module `CMakeLists.txt` files build the
@@ -57,8 +72,17 @@ Build paths that ship viz (the wheel CI on Linux + Windows) pass
 `-DBUILD_VIZ=ON` explicitly. Lean Dockerfiles
 (`examples/teleop_ros2/Dockerfile`) get viz-free builds for free.
 
-When `BUILD_VIZ=ON` you must have Vulkan headers + loader installed:
-`libvulkan-dev` on Linux, LunarG SDK on Windows.
+When `BUILD_VIZ=ON` the build machine must have:
+- **Vulkan headers + loader**: `libvulkan-dev` on Linux, LunarG SDK on
+  Windows.
+- **CUDA Toolkit** (cudart for link, nvcc not strictly required today
+  but expected to be needed for kernels in M3b+): apt
+  `nvidia-cuda-toolkit` or the official NVIDIA installer / CI action
+  (`Jimver/cuda-toolkit`). The wheel excludes `libcuda.so.1` —
+  consumers supply it via NVIDIA driver.
+- **glslangValidator** for shader compilation: `glslang-tools` apt
+  package on Linux, `brew install glslang` on macOS, ships with the
+  Vulkan SDK on Windows.
 
 ## Code conventions
 
diff --git a/src/viz/CMakeLists.txt b/src/viz/CMakeLists.txt
index 18bf7f6c6..642e080b2 100644
--- a/src/viz/CMakeLists.txt
+++ b/src/viz/CMakeLists.txt
@@ -31,6 +31,10 @@ add_subdirectory(layers)
 # layer that drives the per-frame loop and manages the layer registry.
 add_subdirectory(session)
 
+# Shaders: GLSL sources compiled to SPIR-V at build time and embedded
+# as C arrays. Header-only INTERFACE library viz::shaders.
+add_subdirectory(shaders)
+
 # Python bindings (stub today; pybind11 module added later).
 if(BUILD_PYTHON_BINDINGS)
     add_subdirectory(python)
@@ -41,4 +45,5 @@ if(BUILD_TESTING)
     add_subdirectory(core_tests)
     add_subdirectory(layers_tests)
     add_subdirectory(session_tests)
+    add_subdirectory(shaders_tests)
 endif()
diff --git a/src/viz/core/cpp/CMakeLists.txt b/src/viz/core/cpp/CMakeLists.txt
index 2591443b3..4116624a9 100644
--- a/src/viz/core/cpp/CMakeLists.txt
+++ b/src/viz/core/cpp/CMakeLists.txt
@@ -3,12 +3,18 @@
 
 cmake_minimum_required(VERSION 3.20)
 
-# Vulkan is required for vk_context.
+# Vulkan is always required for vk_context / render_target / etc.
 find_package(Vulkan REQUIRED)
 
+# CUDA Toolkit provides the CUDA-Vulkan interop runtime APIs
+# (cudaImportExternalMemory, cudaExternalMemoryGetMappedMipmappedArray)
+# used by cuda_texture / device_image. Required when BUILD_VIZ=ON.
+find_package(CUDAToolkit REQUIRED)
+
 # Foundational types and Vulkan/CUDA primitives shared by all viz sub-modules.
-# Static lib; consumed in-tree via viz::viz_core.
+# Static lib; consumed in-tree via viz::core.
 add_library(viz_core STATIC
+    device_image.cpp
     frame_sync.cpp
     render_target.cpp
     vk_context.cpp
@@ -30,6 +36,7 @@ target_link_libraries(viz_core
     PUBLIC
         Vulkan::Vulkan
         glm::glm
+        CUDA::cudart
 )
 
 # Aliased as viz::core (consumers say viz::core, not viz::viz_core).
diff --git a/src/viz/core/cpp/device_image.cpp b/src/viz/core/cpp/device_image.cpp
new file mode 100644
index 000000000..4cfe156e1
--- /dev/null
+++ b/src/viz/core/cpp/device_image.cpp
@@ -0,0 +1,378 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include <viz/core/device_image.hpp>
+#include <viz/core/vk_context.hpp>
+
+#include <stdexcept>
+#include <string>
+#include <unistd.h>
+
+namespace viz
+{
+
+namespace
+{
+
+void check_vk(VkResult result, const char* what)
+{
+    if (result != VK_SUCCESS)
+    {
+        throw std::runtime_error(std::string("DeviceImage: ") + what + " failed: VkResult=" + std::to_string(result));
+    }
+}
+
+void check_cuda(cudaError_t result, const char* what)
+{
+    if (result != cudaSuccess)
+    {
+        throw std::runtime_error(std::string("DeviceImage: ") + what + " failed: " + cudaGetErrorString(result));
+    }
+}
+
+uint32_t find_memory_type(VkPhysicalDevice physical_device, uint32_t type_bits, VkMemoryPropertyFlags properties)
+{
+    VkPhysicalDeviceMemoryProperties mem_props;
+    vkGetPhysicalDeviceMemoryProperties(physical_device, &mem_props);
+    for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i)
+    {
+        if ((type_bits & (1u << i)) != 0 && (mem_props.memoryTypes[i].propertyFlags & properties) == properties)
+        {
+            return i;
+        }
+    }
+    throw std::runtime_error("DeviceImage: no Vulkan memory type matching requested properties");
+}
+
+VkFormat to_vk_format(PixelFormat format)
+{
+    switch (format)
+    {
+    case PixelFormat::kRGBA8:
+        // UNORM (not SRGB) so CUDA writes round-trip without an
+        // implicit sRGB encode on the Vulkan side. Color management
+        // is the layer's concern.
+        return VK_FORMAT_R8G8B8A8_UNORM;
+    case PixelFormat::kD32F:
+        return VK_FORMAT_D32_SFLOAT;
+    }
+    throw std::runtime_error("DeviceImage: unsupported PixelFormat");
+}
+
+cudaChannelFormatDesc to_cuda_format(PixelFormat format)
+{
+    switch (format)
+    {
+    case PixelFormat::kRGBA8:
+        return cudaCreateChannelDesc<uchar4>();
+    case PixelFormat::kD32F:
+        return cudaCreateChannelDesc<float>();
+    }
+    throw std::runtime_error("DeviceImage: unsupported PixelFormat");
+}
+
+} // namespace
+
+std::unique_ptr<DeviceImage> DeviceImage::create(const VkContext& ctx, Resolution resolution, PixelFormat format)
+{
+    if (!ctx.is_initialized())
+    {
+        throw std::invalid_argument("DeviceImage: VkContext is not initialized");
+    }
+    if (resolution.width == 0 || resolution.height == 0)
+    {
+        throw std::invalid_argument("DeviceImage: resolution must be non-zero");
+    }
+    std::unique_ptr<DeviceImage> img(new DeviceImage(ctx, resolution, format));
+    img->init();
+    return img;
+}
+
+DeviceImage::DeviceImage(const VkContext& ctx, Resolution resolution, PixelFormat format)
+    : ctx_(&ctx), resolution_(resolution), format_(format), vk_format_(to_vk_format(format))
+{
+}
+
+DeviceImage::~DeviceImage()
+{
+    destroy();
+}
+
+void DeviceImage::init()
+{
+    try
+    {
+        create_vk_image_with_external_memory();
+        create_vk_image_view();
+        import_to_cuda();
+        transition_to_shader_read();
+    }
+    catch (...)
+    {
+        destroy();
+        throw;
+    }
+}
+
+void DeviceImage::destroy()
+{
+    // CUDA side first; CUDA holds a dup'd handle on the underlying
+    // memory, so the VkDeviceMemory must outlive the CUDA mapping.
+    if (cuda_mipmapped_array_ != nullptr)
+    {
+        // Failure here is best-effort cleanup; we don't throw from dtor.
+        (void)cudaFreeMipmappedArray(cuda_mipmapped_array_);
+        cuda_mipmapped_array_ = nullptr;
+        cuda_array_ = nullptr;
+    }
+    if (cuda_external_memory_ != nullptr)
+    {
+        (void)cudaDestroyExternalMemory(cuda_external_memory_);
+        cuda_external_memory_ = nullptr;
+    }
+    if (memory_fd_ >= 0)
+    {
+        // CUDA dups the fd internally on import, so we close our copy.
+        // If import failed before our explicit close, fd_ may still
+        // hold our copy — close it here.
+        ::close(memory_fd_);
+        memory_fd_ = -1;
+    }
+
+    if (ctx_ == nullptr)
+    {
+        return;
+    }
+    const VkDevice device = ctx_->device();
+    if (device == VK_NULL_HANDLE)
+    {
+        return;
+    }
+    if (command_pool_ != VK_NULL_HANDLE)
+    {
+        vkDestroyCommandPool(device, command_pool_, nullptr);
+        command_pool_ = VK_NULL_HANDLE;
+    }
+    if (image_view_ != VK_NULL_HANDLE)
+    {
+        vkDestroyImageView(device, image_view_, nullptr);
+        image_view_ = VK_NULL_HANDLE;
+    }
+    if (image_ != VK_NULL_HANDLE)
+    {
+        vkDestroyImage(device, image_, nullptr);
+        image_ = VK_NULL_HANDLE;
+    }
+    if (memory_ != VK_NULL_HANDLE)
+    {
+        vkFreeMemory(device, memory_, nullptr);
+        memory_ = VK_NULL_HANDLE;
+    }
+    current_layout_ = VK_IMAGE_LAYOUT_UNDEFINED;
+}
+
+VizBuffer DeviceImage::view() const noexcept
+{
+    VizBuffer b;
+    b.data = static_cast<void*>(cuda_array_);
+    b.width = resolution_.width;
+    b.height = resolution_.height;
+    b.format = format_;
+    b.pitch = static_cast<size_t>(resolution_.width) * bytes_per_pixel(format_);
+    b.space = MemorySpace::kDevice;
+    return b;
+}
+
+void DeviceImage::create_vk_image_with_external_memory()
+{
+    const VkDevice device = ctx_->device();
+
+    // Image with external-memory export flag. Optimal tiling — CUDA
+    // accesses the image via cudaArray_t, not raw memory, so opaque
+    // GPU layout is fine.
+    VkExternalMemoryImageCreateInfo ext_image_info{};
+    ext_image_info.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO;
+    ext_image_info.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT;
+
+    VkImageCreateInfo info{};
+    info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
+    info.pNext = &ext_image_info;
+    info.imageType = VK_IMAGE_TYPE_2D;
+    info.format = vk_format_;
+    info.extent = { resolution_.width, resolution_.height, 1 };
+    info.mipLevels = 1; // Single level — when minification moiré shows up in
+                        // XR distance views, expose mipLevels via Config and
+                        // generate the chain via vkCmdBlitImage pre-render.
+                        // Anisotropic filtering on the sampler is the cheaper
+                        // first line of defense.
+    info.arrayLayers = 1;
+    info.samples = VK_SAMPLE_COUNT_1_BIT;
+    info.tiling = VK_IMAGE_TILING_OPTIMAL;
+    info.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
+    info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+    info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+
+    check_vk(vkCreateImage(device, &info, nullptr, &image_), "vkCreateImage");
+
+    VkMemoryRequirements reqs;
+    vkGetImageMemoryRequirements(device, image_, &reqs);
+
+    // Memory backing the image: device-local + exportable as POSIX fd.
+    // No VkMemoryDedicatedAllocateInfo / cudaExternalMemoryDedicated —
+    // a generic allocation works for sampled 2D images and avoids the
+    // dedicated-allocation extension wiring.
+    VkExportMemoryAllocateInfo export_info{};
+    export_info.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO;
+    export_info.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT;
+
+    VkMemoryAllocateInfo alloc{};
+    alloc.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+    alloc.pNext = &export_info;
+    alloc.allocationSize = reqs.size;
+    alloc.memoryTypeIndex =
+        find_memory_type(ctx_->physical_device(), reqs.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+    check_vk(vkAllocateMemory(device, &alloc, nullptr, &memory_), "vkAllocateMemory");
+    check_vk(vkBindImageMemory(device, image_, memory_, 0), "vkBindImageMemory");
+
+    auto vkGetMemoryFdKHR = reinterpret_cast<PFN_vkGetMemoryFdKHR>(vkGetDeviceProcAddr(device, "vkGetMemoryFdKHR"));
+    if (vkGetMemoryFdKHR == nullptr)
+    {
+        throw std::runtime_error(
+            "DeviceImage: vkGetMemoryFdKHR not available "
+            "(VK_KHR_external_memory_fd not enabled?)");
+    }
+    VkMemoryGetFdInfoKHR fd_info{};
+    fd_info.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR;
+    fd_info.memory = memory_;
+    fd_info.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT;
+    check_vk(vkGetMemoryFdKHR(device, &fd_info, &memory_fd_), "vkGetMemoryFdKHR");
+
+    // Used only for transition_to_*; tiny pool, default flags.
+    VkCommandPoolCreateInfo pool_info{};
+    pool_info.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
+    pool_info.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
+    pool_info.queueFamilyIndex = ctx_->queue_family_index();
+    check_vk(vkCreateCommandPool(device, &pool_info, nullptr, &command_pool_), "vkCreateCommandPool");
+}
+
+void DeviceImage::create_vk_image_view()
+{
+    VkImageViewCreateInfo info{};
+    info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
+    info.image = image_;
+    info.viewType = VK_IMAGE_VIEW_TYPE_2D;
+    info.format = vk_format_;
+    info.subresourceRange.aspectMask =
+        (format_ == PixelFormat::kD32F) ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_COLOR_BIT;
+    info.subresourceRange.baseMipLevel = 0;
+    info.subresourceRange.levelCount = 1;
+    info.subresourceRange.baseArrayLayer = 0;
+    info.subresourceRange.layerCount = 1;
+    check_vk(vkCreateImageView(ctx_->device(), &info, nullptr, &image_view_), "vkCreateImageView");
+}
+
+void DeviceImage::import_to_cuda()
+{
+    VkMemoryRequirements reqs;
+    vkGetImageMemoryRequirements(ctx_->device(), image_, &reqs);
+
+    cudaExternalMemoryHandleDesc ext_desc{};
+    ext_desc.type = cudaExternalMemoryHandleTypeOpaqueFd;
+    ext_desc.handle.fd = memory_fd_;
+    ext_desc.size = reqs.size;
+    ext_desc.flags = 0;
+
+    check_cuda(cudaImportExternalMemory(&cuda_external_memory_, &ext_desc), "cudaImportExternalMemory");
+
+    // CUDA dup'd the fd internally; close ours so we don't double-free.
+    ::close(memory_fd_);
+    memory_fd_ = -1;
+
+    cudaExternalMemoryMipmappedArrayDesc array_desc{};
+    array_desc.offset = 0;
+    array_desc.formatDesc = to_cuda_format(format_);
+    array_desc.extent = make_cudaExtent(resolution_.width, resolution_.height, 0);
+    array_desc.flags = cudaArrayColorAttachment;
+    array_desc.numLevels = 1;
+
+    check_cuda(cudaExternalMemoryGetMappedMipmappedArray(&cuda_mipmapped_array_, cuda_external_memory_, &array_desc),
+               "cudaExternalMemoryGetMappedMipmappedArray");
+    check_cuda(cudaGetMipmappedArrayLevel(&cuda_array_, cuda_mipmapped_array_, 0), "cudaGetMipmappedArrayLevel");
+}
+
+void DeviceImage::transition_to_shader_read()
+{
+    if (current_layout_ == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)
+    {
+        return;
+    }
+    run_one_shot_layout_transition(current_layout_, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+                                   VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT,
+                                   VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);
+    current_layout_ = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+}
+
+void DeviceImage::transition_to_transfer_dst()
+{
+    if (current_layout_ == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL)
+    {
+        return;
+    }
+    run_one_shot_layout_transition(current_layout_, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_ACCESS_SHADER_READ_BIT,
+                                   VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
+                                   VK_PIPELINE_STAGE_TRANSFER_BIT);
+    current_layout_ = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+}
+
+void DeviceImage::run_one_shot_layout_transition(VkImageLayout old_layout,
+                                                 VkImageLayout new_layout,
+                                                 VkAccessFlags src_access,
+                                                 VkAccessFlags dst_access,
+                                                 VkPipelineStageFlags src_stage,
+                                                 VkPipelineStageFlags dst_stage)
+{
+    const VkDevice device = ctx_->device();
+
+    VkCommandBufferAllocateInfo alloc{};
+    alloc.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+    alloc.commandPool = command_pool_;
+    alloc.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+    alloc.commandBufferCount = 1;
+    VkCommandBuffer cmd = VK_NULL_HANDLE;
+    check_vk(vkAllocateCommandBuffers(device, &alloc, &cmd), "vkAllocateCommandBuffers(transition)");
+
+    VkCommandBufferBeginInfo begin{};
+    begin.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+    begin.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+    check_vk(vkBeginCommandBuffer(cmd, &begin), "vkBeginCommandBuffer(transition)");
+
+    VkImageMemoryBarrier barrier{};
+    barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+    barrier.oldLayout = old_layout;
+    barrier.newLayout = new_layout;
+    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    barrier.image = image_;
+    barrier.subresourceRange.aspectMask =
+        (format_ == PixelFormat::kD32F) ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_COLOR_BIT;
+    barrier.subresourceRange.baseMipLevel = 0;
+    barrier.subresourceRange.levelCount = 1;
+    barrier.subresourceRange.baseArrayLayer = 0;
+    barrier.subresourceRange.layerCount = 1;
+    barrier.srcAccessMask = src_access;
+    barrier.dstAccessMask = dst_access;
+    vkCmdPipelineBarrier(cmd, src_stage, dst_stage, 0, 0, nullptr, 0, nullptr, 1, &barrier);
+
+    check_vk(vkEndCommandBuffer(cmd), "vkEndCommandBuffer(transition)");
+
+    VkSubmitInfo submit{};
+    submit.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    submit.commandBufferCount = 1;
+    submit.pCommandBuffers = &cmd;
+    check_vk(vkQueueSubmit(ctx_->queue(), 1, &submit, VK_NULL_HANDLE), "vkQueueSubmit(transition)");
+    check_vk(vkQueueWaitIdle(ctx_->queue()), "vkQueueWaitIdle(transition)");
+
+    vkFreeCommandBuffers(device, command_pool_, 1, &cmd);
+}
+
+} // namespace viz
diff --git a/src/viz/core/cpp/inc/viz/core/device_image.hpp b/src/viz/core/cpp/inc/viz/core/device_image.hpp
index c29537c0b..5fd82e364 100644
--- a/src/viz/core/cpp/inc/viz/core/device_image.hpp
+++ b/src/viz/core/cpp/inc/viz/core/device_image.hpp
@@ -3,26 +3,116 @@
 
 #pragma once
 
+#include <viz/core/viz_buffer.hpp>
+#include <viz/core/viz_types.hpp>
+#include <vulkan/vulkan.h>
+
+#include <cuda_runtime.h>
+#include <memory>
+
 namespace viz
 {
 
-// Owning device-side 2D pixel buffer with CUDA-Vulkan interop. The
-// symmetric counterpart to HostImage: HostImage owns CPU bytes,
-// DeviceImage will own:
-//   - VkImage / VkBuffer + VkDeviceMemory exported via
-//     VK_KHR_external_memory_fd
-//   - cudaExternalMemory_t imported from that fd, plus the CUDA device
-//     pointer derived from it
-//   - paired cudaExternalSemaphore_t / VkSemaphore for acquire / release
-//     synchronization
-//
-// Returned by Televiz's mode-B submission path (acquire / release) when
-// Televiz allocates the interop buffer for a layer to write into.
+class VkContext;
+
+// Owning CUDA-Vulkan interop image. Vulkan allocates the VkImage
+// (optimal tiling, sampled + transfer-dst); the backing VkDeviceMemory
+// is exported via VK_KHR_external_memory_fd and imported into CUDA as
+// a cudaArray_t. CUDA writes via cuda_array(); Vulkan samples via
+// vk_image(). Symmetric counterpart to HostImage; both expose
+// VizBuffer view() so helpers branch on VizBuffer::space.
 //
-// Intentionally only forward-declared in this milestone — the
-// implementation ships alongside CUDA-Vulkan interop. Callers may pass
-// pointers / references to DeviceImage between modules but cannot
-// instantiate one until then.
-class DeviceImage;
+// Synchronization is heavyweight today (cudaDeviceSynchronize +
+// vkQueueWaitIdle); paired acquire / release semaphores arrive with
+// QuadLayer. CUDA/Vulkan device matching is handled by VkContext.
+class DeviceImage
+{
+public:
+    // Throws std::invalid_argument on bad config; std::runtime_error
+    // on Vulkan or CUDA failure. Pre-initialized.
+    static std::unique_ptr<DeviceImage> create(const VkContext& ctx, Resolution resolution, PixelFormat format);
+
+    ~DeviceImage();
+    void destroy();
+
+    DeviceImage(const DeviceImage&) = delete;
+    DeviceImage& operator=(const DeviceImage&) = delete;
+    DeviceImage(DeviceImage&&) = delete;
+    DeviceImage& operator=(DeviceImage&&) = delete;
+
+    // VizBuffer view (kDevice). `data` is the cudaArray_t cast to
+    // void*; it's an opaque CUDA handle, not a raw device pointer —
+    // use cuda_array() with cudaMemcpy2DToArrayAsync to write.
+    VizBuffer view() const noexcept;
+
+    // CUDA write target. Lifetime tied to this DeviceImage.
+    cudaArray_t cuda_array() const noexcept
+    {
+        return cuda_array_;
+    }
+
+    // Image lives in VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL after
+    // init; transition_to_*() below moves it back and forth.
+    VkImage vk_image() const noexcept
+    {
+        return image_;
+    }
+    VkImageView vk_image_view() const noexcept
+    {
+        return image_view_;
+    }
+    VkFormat vk_format() const noexcept
+    {
+        return vk_format_;
+    }
+
+    Resolution resolution() const noexcept
+    {
+        return resolution_;
+    }
+    PixelFormat format() const noexcept
+    {
+        return format_;
+    }
+
+    // Synchronous one-shot layout transitions (vkQueueSubmit +
+    // vkQueueWaitIdle). For tests / one-shot uploads — production
+    // layers record their own barriers in render commands.
+    void transition_to_shader_read();
+    void transition_to_transfer_dst();
+
+private:
+    explicit DeviceImage(const VkContext& ctx, Resolution resolution, PixelFormat format);
+    void init();
+
+    void create_vk_image_with_external_memory();
+    void create_vk_image_view();
+    void import_to_cuda();
+
+    void run_one_shot_layout_transition(VkImageLayout old_layout,
+                                        VkImageLayout new_layout,
+                                        VkAccessFlags src_access,
+                                        VkAccessFlags dst_access,
+                                        VkPipelineStageFlags src_stage,
+                                        VkPipelineStageFlags dst_stage);
+
+    const VkContext* ctx_ = nullptr;
+    Resolution resolution_{};
+    PixelFormat format_ = PixelFormat::kRGBA8;
+    VkFormat vk_format_ = VK_FORMAT_R8G8B8A8_UNORM;
+    VkImageLayout current_layout_ = VK_IMAGE_LAYOUT_UNDEFINED;
+
+    VkImage image_ = VK_NULL_HANDLE;
+    VkDeviceMemory memory_ = VK_NULL_HANDLE;
+    VkImageView image_view_ = VK_NULL_HANDLE;
+    VkCommandPool command_pool_ = VK_NULL_HANDLE; // For layout transitions only.
+
+    // CUDA dup's the fd internally on import; we close ours after.
+    int memory_fd_ = -1;
+
+    cudaExternalMemory_t cuda_external_memory_ = nullptr;
+    cudaMipmappedArray_t cuda_mipmapped_array_ = nullptr;
+    cudaArray_t cuda_array_ = nullptr; // Level-0 view, non-owning.
+};
 
 } // namespace viz
diff --git a/src/viz/core/cpp/inc/viz/core/vk_context.hpp b/src/viz/core/cpp/inc/viz/core/vk_context.hpp
index e960f30ef..d1402e58a 100644
--- a/src/viz/core/cpp/inc/viz/core/vk_context.hpp
+++ b/src/viz/core/cpp/inc/viz/core/vk_context.hpp
@@ -43,6 +43,14 @@ struct PhysicalDeviceInfo
 //   - A queue family with graphics + compute + transfer flags
 //
 // VkContext owns the Vulkan handles and tears them down on destruction.
+//
+// CUDA-Vulkan device matching: as part of init(), VkContext queries
+// the chosen physical device's UUID and calls cudaSetDevice() on the
+// matching CUDA device. This is required for CUDA-Vulkan interop
+// (cudaImportExternalMemory) to succeed on multi-GPU machines, and
+// it makes VkContext the single chokepoint for "which GPU is Televiz
+// on" — every viz_core type that touches CUDA can assume the current
+// CUDA device matches the Vulkan one.
 class VkContext
 {
 public:
@@ -109,6 +117,7 @@ class VkContext
     void create_instance(const Config& config);
     void select_physical_device(const Config& config);
     void create_logical_device(const Config& config);
+    void match_cuda_device_to_vulkan();
 
     bool initialized_ = false;
     bool validation_enabled_ = false;
diff --git a/src/viz/core/cpp/vk_context.cpp b/src/viz/core/cpp/vk_context.cpp
index 1046f1d8e..773329634 100644
--- a/src/viz/core/cpp/vk_context.cpp
+++ b/src/viz/core/cpp/vk_context.cpp
@@ -5,6 +5,7 @@
 
 #include <algorithm>
 #include <cstring>
+#include <cuda_runtime.h>
 #include <iostream>
 #include <stdexcept>
 #include <string>
@@ -190,6 +191,7 @@ void VkContext::init(const Config& config)
         create_instance(config);
         select_physical_device(config);
         create_logical_device(config);
+        match_cuda_device_to_vulkan();
         initialized_ = true;
     }
     catch (...)
@@ -406,6 +408,53 @@ void VkContext::create_logical_device(const Config& config)
     vkGetDeviceQueue(device_, queue_family_index_, 0, &queue_);
 }
 
+void VkContext::match_cuda_device_to_vulkan()
+{
+    // Read the Vulkan physical device's UUID, then find the CUDA
+    // device with the same UUID and make it current. Required so
+    // every viz_core type that imports Vulkan memory into CUDA
+    // (e.g. DeviceImage::cudaImportExternalMemory) operates on the
+    // same physical GPU. On multi-GPU machines Vulkan and CUDA
+    // default to different devices and interop fails with
+    // cudaErrorUnknown.
+    VkPhysicalDeviceIDProperties id_props{};
+    id_props.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES;
+    VkPhysicalDeviceProperties2 props2{};
+    props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
+    props2.pNext = &id_props;
+    vkGetPhysicalDeviceProperties2(physical_device_, &props2);
+
+    int cuda_count = 0;
+    cudaError_t err = cudaGetDeviceCount(&cuda_count);
+    if (err != cudaSuccess || cuda_count == 0)
+    {
+        throw std::runtime_error(
+            "VkContext: no CUDA devices visible — CUDA-Vulkan interop requires "
+            "a working CUDA driver");
+    }
+    for (int i = 0; i < cuda_count; ++i)
+    {
+        cudaDeviceProp prop{};
+        err = cudaGetDeviceProperties(&prop, i);
+        if (err != cudaSuccess)
+        {
+            continue;
+        }
+        if (std::memcmp(prop.uuid.bytes, id_props.deviceUUID, VK_UUID_SIZE) == 0)
+        {
+            err = cudaSetDevice(i);
+            if (err != cudaSuccess)
+            {
+                throw std::runtime_error(std::string("VkContext: cudaSetDevice failed: ") + cudaGetErrorString(err));
+            }
+            return;
+        }
+    }
+    throw std::runtime_error(
+        "VkContext: no CUDA device matches the Vulkan physical device's UUID — "
+        "CUDA-Vulkan interop requires same-GPU operation");
+}
+
 std::vector<PhysicalDeviceInfo> VkContext::enumerate_physical_devices()
 {
     std::vector<PhysicalDeviceInfo> result;
diff --git a/src/viz/core_tests/cpp/CMakeLists.txt b/src/viz/core_tests/cpp/CMakeLists.txt
index c9d031d2e..656aff6c6 100644
--- a/src/viz/core_tests/cpp/CMakeLists.txt
+++ b/src/viz/core_tests/cpp/CMakeLists.txt
@@ -4,6 +4,7 @@
 cmake_minimum_required(VERSION 3.20)
 
 add_executable(viz_core_tests
+    test_device_image.cpp
     test_frame_sync.cpp
     test_host_image.cpp
     test_render_target.cpp
diff --git a/src/viz/core_tests/cpp/test_device_image.cpp b/src/viz/core_tests/cpp/test_device_image.cpp
new file mode 100644
index 000000000..fcdb23af0
--- /dev/null
+++ b/src/viz/core_tests/cpp/test_device_image.cpp
@@ -0,0 +1,117 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// [gpu] tests for DeviceImage: verify Vulkan handle creation, the
+// CUDA cudaArray_t is usable from a cudaMemcpy2DToArray call, the
+// VizBuffer view exposes the right shape with MemorySpace::kDevice,
+// and a round-trip copy preserves pixel values.
+
+#include "test_helpers.hpp"
+
+#include <catch2/catch_test_macros.hpp>
+#include <viz/core/device_image.hpp>
+#include <viz/core/viz_buffer.hpp>
+#include <viz/core/vk_context.hpp>
+
+#include <cstdint>
+#include <cuda_runtime.h>
+#include <stdexcept>
+#include <vector>
+
+using viz::DeviceImage;
+using viz::MemorySpace;
+using viz::PixelFormat;
+using viz::Resolution;
+using viz::VizBuffer;
+
+namespace
+{
+
+// Generate a deterministic gradient pattern. Channel R = column / W *
+// 255, G = row / H * 255, B = column XOR row, A = 255. Easy to spot
+// in a debugger and reproducible across test runs.
+std::vector<uint8_t> make_gradient(uint32_t w, uint32_t h)
+{
+    std::vector<uint8_t> px(static_cast<size_t>(w) * h * 4);
+    for (uint32_t y = 0; y < h; ++y)
+    {
+        for (uint32_t x = 0; x < w; ++x)
+        {
+            const size_t i = (static_cast<size_t>(y) * w + x) * 4;
+            px[i + 0] = static_cast<uint8_t>((x * 255u) / (w - 1u));
+            px[i + 1] = static_cast<uint8_t>((y * 255u) / (h - 1u));
+            px[i + 2] = static_cast<uint8_t>((x ^ y) & 0xff);
+            px[i + 3] = 255;
+        }
+    }
+    return px;
+}
+
+} // namespace
+
+TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage creates valid Vulkan + CUDA handles", "[gpu][device_image]")
+{
+    auto img = DeviceImage::create(vk, Resolution{ 64, 64 }, PixelFormat::kRGBA8);
+    REQUIRE(img != nullptr);
+    CHECK(img->vk_image() != VK_NULL_HANDLE);
+    CHECK(img->vk_image_view() != VK_NULL_HANDLE);
+    CHECK(img->vk_format() == VK_FORMAT_R8G8B8A8_UNORM);
+    CHECK(img->cuda_array() != nullptr);
+    CHECK(img->resolution().width == 64);
+    CHECK(img->resolution().height == 64);
+    CHECK(img->format() == PixelFormat::kRGBA8);
+}
+
+TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage::create rejects zero dimensions", "[gpu][device_image]")
+{
+    CHECK_THROWS_AS(DeviceImage::create(vk, Resolution{ 0, 64 }, PixelFormat::kRGBA8), std::invalid_argument);
+    CHECK_THROWS_AS(DeviceImage::create(vk, Resolution{ 64, 0 }, PixelFormat::kRGBA8), std::invalid_argument);
+}
+
+TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage destroy is idempotent", "[gpu][device_image]")
+{
+    auto img = DeviceImage::create(vk, Resolution{ 32, 32 }, PixelFormat::kRGBA8);
+    img->destroy();
+    CHECK(img->vk_image() == VK_NULL_HANDLE);
+    CHECK(img->vk_image_view() == VK_NULL_HANDLE);
+    CHECK(img->cuda_array() == nullptr);
+    img->destroy();
+}
+
+TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage::view exposes a kDevice VizBuffer", "[gpu][device_image]")
+{
+    auto img = DeviceImage::create(vk, Resolution{ 16, 16 }, PixelFormat::kRGBA8);
+    const VizBuffer v = img->view();
+    CHECK(v.space == MemorySpace::kDevice);
+    CHECK(v.width == 16);
+    CHECK(v.height == 16);
+    CHECK(v.format == PixelFormat::kRGBA8);
+    CHECK(v.pitch == static_cast<size_t>(16) * 4);
+    // .data points at the cudaArray_t handle (NOT a raw device pointer);
+    // sanity that it's non-null when the image is alive.
+    CHECK(v.data != nullptr);
+}
+
+TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage round-trip preserves pixel pattern", "[gpu][device_image]")
+{
+    constexpr uint32_t kSide = 64;
+    constexpr size_t kBytes = static_cast<size_t>(kSide) * kSide * 4;
+
+    auto img = DeviceImage::create(vk, Resolution{ kSide, kSide }, PixelFormat::kRGBA8);
+
+    // Write a gradient via CUDA into the array.
+    const auto src = make_gradient(kSide, kSide);
+    REQUIRE(cudaMemcpy2DToArray(img->cuda_array(), 0, 0, src.data(), kSide * 4, kSide * 4, kSide,
+                                cudaMemcpyHostToDevice) == cudaSuccess);
+    REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+    // Read it back via CUDA — verifies the data took. (M3b's milestone
+    // test will read back via Vulkan sampling; that's the real round-
+    // trip but requires a graphics pipeline we don't have yet.)
+    std::vector<uint8_t> dst(kBytes);
+    REQUIRE(cudaMemcpy2DFromArray(dst.data(), kSide * 4, img->cuda_array(), 0, 0, kSide * 4, kSide,
+                                  cudaMemcpyDeviceToHost) == cudaSuccess);
+    REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+    CHECK(dst == src);
+}
diff --git a/src/viz/shaders/CMakeLists.txt b/src/viz/shaders/CMakeLists.txt
new file mode 100644
index 000000000..636447b71
--- /dev/null
+++ b/src/viz/shaders/CMakeLists.txt
@@ -0,0 +1,9 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.20)
+
+# GLSL shader sources compiled to SPIR-V at build time and embedded as
+# C arrays in generated headers. Consumers link viz::shaders to get the
+# headers on their include path.
+add_subdirectory(cpp)
diff --git a/src/viz/shaders/cpp/CMakeLists.txt b/src/viz/shaders/cpp/CMakeLists.txt
new file mode 100644
index 000000000..0f934aaf7
--- /dev/null
+++ b/src/viz/shaders/cpp/CMakeLists.txt
@@ -0,0 +1,66 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.20)
+
+# glslangValidator is part of the Vulkan SDK / glslang-tools package.
+# We don't FetchContent it — assume it's installed (libvulkan-dev pulls
+# it in on Ubuntu, brew install glslang on macOS).
+find_program(GLSLANG_VALIDATOR glslangValidator REQUIRED)
+message(STATUS "Using glslangValidator: ${GLSLANG_VALIDATOR}")
+
+set(SHADERS_GEN_DIR "${CMAKE_CURRENT_BINARY_DIR}/inc/viz/shaders")
+file(MAKE_DIRECTORY "${SHADERS_GEN_DIR}")
+
+# compile_shader(<glsl_path> <var_name>): adds custom commands that
+# run glslangValidator + the byte-emit helper, and registers the
+# generated header so it's a build-time dependency for any consumer
+# of viz_shaders.
+function(compile_shader GLSL_PATH VAR_NAME)
+    get_filename_component(GLSL_NAME "${GLSL_PATH}" NAME)
+    set(SPV_PATH "${CMAKE_CURRENT_BINARY_DIR}/${GLSL_NAME}.spv")
+    set(HEADER_PATH "${SHADERS_GEN_DIR}/${GLSL_NAME}.spv.h")
+
+    add_custom_command(
+        OUTPUT "${SPV_PATH}"
+        COMMAND ${GLSLANG_VALIDATOR} -V "${CMAKE_CURRENT_SOURCE_DIR}/${GLSL_PATH}" -o "${SPV_PATH}"
+        DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${GLSL_PATH}"
+        COMMENT "Compiling shader ${GLSL_NAME} -> SPIR-V"
+        VERBATIM
+    )
+
+    add_custom_command(
+        OUTPUT "${HEADER_PATH}"
+        COMMAND ${CMAKE_COMMAND}
+            -DSPV_PATH=${SPV_PATH}
+            -DHEADER_PATH=${HEADER_PATH}
+            -DVAR_NAME=${VAR_NAME}
+            -P "${CMAKE_CURRENT_SOURCE_DIR}/compile_shader.cmake"
+        DEPENDS "${SPV_PATH}" "${CMAKE_CURRENT_SOURCE_DIR}/compile_shader.cmake"
+        COMMENT "Embedding ${GLSL_NAME}.spv -> ${VAR_NAME}"
+        VERBATIM
+    )
+
+    set_property(GLOBAL APPEND PROPERTY VIZ_SHADER_HEADERS "${HEADER_PATH}")
+endfunction()
+
+# Shader programs:
+#   textured_quad  — fullscreen quad sampling a combined image sampler.
+#                    Used by QuadLayer to display a CUDA-fed texture.
+compile_shader(textured_quad.vert  kTexturedQuadVertSpv)
+compile_shader(textured_quad.frag  kTexturedQuadFragSpv)
+
+# INTERFACE library exposing the generated headers + a phony custom
+# target that ensures the headers exist before any consumer compiles.
+get_property(_SHADER_HEADERS GLOBAL PROPERTY VIZ_SHADER_HEADERS)
+add_custom_target(viz_shaders_generate ALL DEPENDS ${_SHADER_HEADERS})
+
+add_library(viz_shaders INTERFACE)
+target_include_directories(viz_shaders
+    INTERFACE
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/inc>
+)
+add_dependencies(viz_shaders viz_shaders_generate)
+
+# Aliased as viz::shaders.
+add_library(viz::shaders ALIAS viz_shaders)
diff --git a/src/viz/shaders/cpp/compile_shader.cmake b/src/viz/shaders/cpp/compile_shader.cmake
new file mode 100644
index 000000000..32e27d4ca
--- /dev/null
+++ b/src/viz/shaders/cpp/compile_shader.cmake
@@ -0,0 +1,56 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Helper script invoked from add_custom_command to convert a SPIR-V
+# binary into a C header containing an inline constexpr byte array.
+# Driven by command-line variables: SPV_PATH, HEADER_PATH, VAR_NAME.
+#
+# Output (HEADER_PATH) looks like:
+#   #pragma once
+#   #include <cstddef>
+#   #include <cstdint>
+#   namespace viz::shaders {
+#   inline constexpr alignas(uint32_t) unsigned char kVarName[] = {
+#       0x03, 0x02, 0x23, 0x07, ...
+#   };
+#   inline constexpr size_t kVarNameSize = sizeof(kVarName);
+#   } // namespace viz::shaders
+
+if(NOT DEFINED SPV_PATH OR NOT DEFINED HEADER_PATH OR NOT DEFINED VAR_NAME)
+    message(FATAL_ERROR "compile_shader.cmake requires SPV_PATH, HEADER_PATH, VAR_NAME")
+endif()
+
+file(READ "${SPV_PATH}" SPV_CONTENT HEX)
+string(LENGTH "${SPV_CONTENT}" SPV_HEX_LEN)
+math(EXPR SPV_BYTE_LEN "${SPV_HEX_LEN} / 2")
+if(SPV_BYTE_LEN EQUAL 0)
+    message(FATAL_ERROR "compile_shader.cmake: ${SPV_PATH} is empty")
+endif()
+
+# Format every two hex chars as 0xab,. Single long line — SPIR-V binaries
+# are small (~KB) and compilers handle long initializer lines fine.
+string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1, " SPV_BYTES "${SPV_CONTENT}")
+
+set(HEADER_CONTENT
+"// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// AUTO-GENERATED FROM ${SPV_PATH} BY compile_shader.cmake. DO NOT EDIT.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace viz::shaders
+{
+
+alignas(uint32_t) inline constexpr unsigned char ${VAR_NAME}[] = {
+    ${SPV_BYTES}
+};
+inline constexpr size_t ${VAR_NAME}Size = sizeof(${VAR_NAME});
+
+} // namespace viz::shaders
+")
+
+file(WRITE "${HEADER_PATH}" "${HEADER_CONTENT}")
diff --git a/src/viz/shaders/cpp/textured_quad.frag b/src/viz/shaders/cpp/textured_quad.frag
new file mode 100644
index 000000000..29b92d6c9
--- /dev/null
+++ b/src/viz/shaders/cpp/textured_quad.frag
@@ -0,0 +1,17 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// Samples a combined image sampler at descriptor binding 0. Used by
+// QuadLayer (ships in M3b) to display a CUDA-fed texture as a quad.
+
+#version 450
+
+layout(set = 0, binding = 0) uniform sampler2D u_texture;
+
+layout(location = 0) in vec2 v_uv;
+layout(location = 0) out vec4 out_color;
+
+void main()
+{
+    out_color = texture(u_texture, v_uv);
+}
diff --git a/src/viz/shaders/cpp/textured_quad.vert b/src/viz/shaders/cpp/textured_quad.vert
new file mode 100644
index 000000000..52cbe5eec
--- /dev/null
+++ b/src/viz/shaders/cpp/textured_quad.vert
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// Fullscreen-quad vertex shader that also forwards UVs for sampling.
+// Uses the same gl_VertexIndex trick as solid_color.vert: 3 vertices
+// → covers the screen, no vertex buffer needed.
+
+#version 450
+
+layout(location = 0) out vec2 v_uv;
+
+void main()
+{
+    v_uv = vec2((gl_VertexIndex << 1) & 2, gl_VertexIndex & 2);
+    gl_Position = vec4(v_uv * 2.0 - 1.0, 0.0, 1.0);
+}
diff --git a/src/viz/shaders_tests/CMakeLists.txt b/src/viz/shaders_tests/CMakeLists.txt
new file mode 100644
index 000000000..a25478921
--- /dev/null
+++ b/src/viz/shaders_tests/CMakeLists.txt
@@ -0,0 +1,7 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.20)
+
+# Build C++ tests
+add_subdirectory(cpp)
diff --git a/src/viz/shaders_tests/cpp/CMakeLists.txt b/src/viz/shaders_tests/cpp/CMakeLists.txt
new file mode 100644
index 000000000..66159f706
--- /dev/null
+++ b/src/viz/shaders_tests/cpp/CMakeLists.txt
@@ -0,0 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.20)
+
+add_executable(viz_shaders_tests
+    test_shader_blobs.cpp
+)
+
+target_link_libraries(viz_shaders_tests PRIVATE
+    viz::shaders
+    Catch2::Catch2WithMain
+)
+
+message(STATUS "viz_shaders_tests target enabled with Catch2")
+
+catch_discover_tests(viz_shaders_tests ADD_TAGS_AS_LABELS)
diff --git a/src/viz/shaders_tests/cpp/test_shader_blobs.cpp b/src/viz/shaders_tests/cpp/test_shader_blobs.cpp
new file mode 100644
index 000000000..8637cf6e2
--- /dev/null
+++ b/src/viz/shaders_tests/cpp/test_shader_blobs.cpp
@@ -0,0 +1,44 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// Smoke tests for the embedded SPIR-V blobs: verify the CMake
+// glslangValidator + byte-embed pipeline produced something that
+// looks like real SPIR-V. Doesn't run the shaders — just sanity-checks
+// the bytes.
+
+#include <catch2/catch_test_macros.hpp>
+#include <viz/shaders/textured_quad.frag.spv.h>
+#include <viz/shaders/textured_quad.vert.spv.h>
+
+#include <cstdint>
+#include <cstring>
+
+namespace
+{
+
+// SPIR-V magic word, little-endian.
+constexpr uint32_t kSpvMagic = 0x07230203;
+
+// Returns the first uint32 of `bytes` as a little-endian word.
+uint32_t first_word_le(const unsigned char* bytes)
+{
+    uint32_t w = 0;
+    std::memcpy(&w, bytes, sizeof(w));
+    return w;
+}
+
+} // namespace
+
+TEST_CASE("textured_quad.vert.spv blob is non-empty and starts with SPIR-V magic", "[unit][shaders]")
+{
+    REQUIRE(viz::shaders::kTexturedQuadVertSpvSize >= 4);
+    REQUIRE(viz::shaders::kTexturedQuadVertSpvSize % 4 == 0);
+    CHECK(first_word_le(viz::shaders::kTexturedQuadVertSpv) == kSpvMagic);
+}
+
+TEST_CASE("textured_quad.frag.spv blob is non-empty and starts with SPIR-V magic", "[unit][shaders]")
+{
+    REQUIRE(viz::shaders::kTexturedQuadFragSpvSize >= 4);
+    REQUIRE(viz::shaders::kTexturedQuadFragSpvSize % 4 == 0);
+    CHECK(first_word_le(viz::shaders::kTexturedQuadFragSpv) == kSpvMagic);
+}

From b2cc3e7188160e2bd13580fc2c74034b264b48f3 Mon Sep 17 00:00:00 2001
From: Farbod Motlagh <fmotlagh@nvidia.com>
Date: Fri, 1 May 2026 11:32:55 -0700
Subject: [PATCH 02/10] DeviceImage robustness: cmd-buffer RAII, destroy sync,
 per-thread cuda device

- run_one_shot_layout_transition: wrap submit/wait in an RAII guard so
  the command buffer is freed on every exit path (otherwise a queue
  submit failure leaks one cmd per retry).
- DeviceImage::destroy: cudaDeviceSynchronize before CUDA frees and
  vkDeviceWaitIdle before Vulkan teardown, so async work submitted by
  the caller has retired before the resources go away.
- VkContext stores the matched CUDA device id and exposes it via
  cuda_device_id(); DeviceImage::import_to_cuda + ::destroy now call
  cudaSetDevice on the current thread before any CUDA API. cudaSetDevice
  is per-host-thread, so this protects users who create a DeviceImage
  on a worker thread.

All 37 unit + 28 GPU tests pass; unit tests also pass under ASAN+UBSAN.

Signed-off-by: Farbod Motlagh <fmotlagh@nvidia.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/viz/core/cpp/device_image.cpp            | 42 ++++++++++++++++++--
 src/viz/core/cpp/inc/viz/core/vk_context.hpp | 10 +++++
 src/viz/core/cpp/vk_context.cpp              |  7 ++++
 3 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/src/viz/core/cpp/device_image.cpp b/src/viz/core/cpp/device_image.cpp
index 4cfe156e1..4bc80e249 100644
--- a/src/viz/core/cpp/device_image.cpp
+++ b/src/viz/core/cpp/device_image.cpp
@@ -116,11 +116,25 @@ void DeviceImage::init()
 
 void DeviceImage::destroy()
 {
+    // Pin CUDA device for this thread so the CUDA frees below land on
+    // the right device even if destroy() runs on a thread that never
+    // ran VkContext::init(). Best-effort — destructor must not throw.
+    if (ctx_ != nullptr && ctx_->cuda_device_id() >= 0)
+    {
+        (void)cudaSetDevice(ctx_->cuda_device_id());
+    }
+
     // CUDA side first; CUDA holds a dup'd handle on the underlying
     // memory, so the VkDeviceMemory must outlive the CUDA mapping.
+    // cudaDeviceSynchronize ensures any caller-issued async CUDA work
+    // (e.g. cudaMemcpy2DToArrayAsync) has retired before we free the
+    // array — otherwise CUDA may UAF its own staging.
+    if (cuda_mipmapped_array_ != nullptr || cuda_external_memory_ != nullptr)
+    {
+        (void)cudaDeviceSynchronize();
+    }
     if (cuda_mipmapped_array_ != nullptr)
     {
-        // Failure here is best-effort cleanup; we don't throw from dtor.
         (void)cudaFreeMipmappedArray(cuda_mipmapped_array_);
         cuda_mipmapped_array_ = nullptr;
         cuda_array_ = nullptr;
@@ -148,6 +162,9 @@ void DeviceImage::destroy()
     {
         return;
     }
+    // Wait for all GPU work to retire before tearing down Vulkan
+    // resources.
+    (void)vkDeviceWaitIdle(device);
     if (command_pool_ != VK_NULL_HANDLE)
     {
         vkDestroyCommandPool(device, command_pool_, nullptr);
@@ -273,6 +290,12 @@ void DeviceImage::create_vk_image_view()
 
 void DeviceImage::import_to_cuda()
 {
+    // cudaSetDevice is per-host-thread; VkContext set it on the init
+    // thread, but DeviceImage::create() may run on a different one.
+    // Re-pin here so cudaImportExternalMemory / GetMappedMipmappedArray
+    // talk to the same physical GPU as Vulkan.
+    check_cuda(cudaSetDevice(ctx_->cuda_device_id()), "cudaSetDevice");
+
     VkMemoryRequirements reqs;
     vkGetImageMemoryRequirements(ctx_->device(), image_, &reqs);
 
@@ -341,6 +364,21 @@ void DeviceImage::run_one_shot_layout_transition(VkImageLayout old_layout,
     VkCommandBuffer cmd = VK_NULL_HANDLE;
     check_vk(vkAllocateCommandBuffers(device, &alloc, &cmd), "vkAllocateCommandBuffers(transition)");
 
+    // RAII: free the command buffer on every exit path (including
+    // exceptions from the check_vk calls below). The pool would
+    // eventually reclaim it on destroy(), but a retry loop after a
+    // transient queue submit failure would leak one cmd per attempt.
+    struct CmdGuard
+    {
+        VkDevice device;
+        VkCommandPool pool;
+        VkCommandBuffer cmd;
+        ~CmdGuard()
+        {
+            vkFreeCommandBuffers(device, pool, 1, &cmd);
+        }
+    } guard{ device, command_pool_, cmd };
+
     VkCommandBufferBeginInfo begin{};
     begin.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
     begin.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
@@ -371,8 +409,6 @@ void DeviceImage::run_one_shot_layout_transition(VkImageLayout old_layout,
     submit.pCommandBuffers = &cmd;
     check_vk(vkQueueSubmit(ctx_->queue(), 1, &submit, VK_NULL_HANDLE), "vkQueueSubmit(transition)");
     check_vk(vkQueueWaitIdle(ctx_->queue()), "vkQueueWaitIdle(transition)");
-
-    vkFreeCommandBuffers(device, command_pool_, 1, &cmd);
 }
 
 } // namespace viz
diff --git a/src/viz/core/cpp/inc/viz/core/vk_context.hpp b/src/viz/core/cpp/inc/viz/core/vk_context.hpp
index d1402e58a..5f12c8690 100644
--- a/src/viz/core/cpp/inc/viz/core/vk_context.hpp
+++ b/src/viz/core/cpp/inc/viz/core/vk_context.hpp
@@ -104,6 +104,15 @@ class VkContext
     uint32_t queue_family_index() const noexcept;
     VkQueue queue() const noexcept;
 
+    // CUDA device id matched to the chosen Vulkan physical device by
+    // UUID at init() time. Useful for callers that need to ensure
+    // their thread is on the right CUDA device before issuing CUDA
+    // calls — cudaSetDevice is per-host-thread, so a CudaTexture /
+    // DeviceImage created on a worker thread must call
+    // cudaSetDevice(ctx.cuda_device_id()) before any CUDA API. Returns
+    // -1 before init() has run.
+    int cuda_device_id() const noexcept;
+
     // Enumerates all Vulkan-capable physical devices and returns their
     // properties. Useful for picking a specific GPU index on multi-GPU
     // machines before calling init().
@@ -126,6 +135,7 @@ class VkContext
     VkDevice device_ = VK_NULL_HANDLE;
     uint32_t queue_family_index_ = UINT32_MAX;
     VkQueue queue_ = VK_NULL_HANDLE;
+    int cuda_device_id_ = -1;
 };
 
 } // namespace viz
diff --git a/src/viz/core/cpp/vk_context.cpp b/src/viz/core/cpp/vk_context.cpp
index 773329634..d2184d038 100644
--- a/src/viz/core/cpp/vk_context.cpp
+++ b/src/viz/core/cpp/vk_context.cpp
@@ -216,6 +216,7 @@ void VkContext::destroy()
     physical_device_ = VK_NULL_HANDLE;
     queue_ = VK_NULL_HANDLE;
     queue_family_index_ = UINT32_MAX;
+    cuda_device_id_ = -1;
     validation_enabled_ = false;
     initialized_ = false;
 }
@@ -250,6 +251,11 @@ VkQueue VkContext::queue() const noexcept
     return queue_;
 }
 
+int VkContext::cuda_device_id() const noexcept
+{
+    return cuda_device_id_;
+}
+
 void VkContext::create_instance(const Config& config)
 {
     VkApplicationInfo app_info{};
@@ -447,6 +453,7 @@ void VkContext::match_cuda_device_to_vulkan()
             {
                 throw std::runtime_error(std::string("VkContext: cudaSetDevice failed: ") + cudaGetErrorString(err));
             }
+            cuda_device_id_ = i;
             return;
         }
     }

From 202f869125f5e19db586ce8c88cbc753c9f1400b Mon Sep 17 00:00:00 2001
From: Farbod Motlagh <fmotlagh@nvidia.com>
Date: Fri, 1 May 2026 13:24:19 -0700
Subject: [PATCH 03/10] ci: fix CUDA install on arm64 runners (replace Jimver
 action)

Jimver/cuda-toolkit@v0.2.19 hardcodes /x86_64/ in the NVIDIA repo URL,
which fails on the ubuntu-22.04-arm matrix entry (cuda-nvcc-12-4 /
cuda-cudart-12-4 packages don't exist for that arch on that path).

Replace with a small composite action (.github/actions/setup-cuda)
that picks /x86_64/ or /sbsa/ based on dpkg --print-architecture and
installs cuda-nvcc-* + cuda-cudart-dev-* via apt. Used in both
build-ubuntu and test-viz-sanitizers.

Signed-off-by: Farbod Motlagh <fmotlagh@nvidia.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/actions/setup-cuda/action.yml | 42 +++++++++++++++++++++++++++
 .github/workflows/build-ubuntu.yml    | 20 ++++---------
 2 files changed, 48 insertions(+), 14 deletions(-)
 create mode 100644 .github/actions/setup-cuda/action.yml

diff --git a/.github/actions/setup-cuda/action.yml b/.github/actions/setup-cuda/action.yml
new file mode 100644
index 000000000..3dc9590fa
--- /dev/null
+++ b/.github/actions/setup-cuda/action.yml
@@ -0,0 +1,42 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+name: Setup CUDA Toolkit
+description: Install CUDA Toolkit (cudart + nvcc) via NVIDIA's apt repository.
+  Architecture-aware (x86_64 / sbsa) so it works on both ubuntu-22.04 and
+  ubuntu-22.04-arm runners. Replaces the Jimver/cuda-toolkit action, which
+  hardcodes /x86_64/ in the repo URL and fails on ARM runners.
+
+inputs:
+  cuda-version:
+    description: 'CUDA major-minor version (e.g. "12-4" for 12.4)'
+    required: false
+    default: '12-4'
+
+runs:
+  using: composite
+  steps:
+    - name: Install CUDA toolkit (apt)
+      shell: bash
+      run: |
+        set -euo pipefail
+        ARCH=$(dpkg --print-architecture)
+        case "$ARCH" in
+          amd64) CUDA_ARCH=x86_64 ;;
+          arm64) CUDA_ARCH=sbsa ;;
+          *) echo "setup-cuda: unsupported arch '$ARCH'" >&2; exit 1 ;;
+        esac
+        REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${CUDA_ARCH}"
+        TMPDIR=$(mktemp -d)
+        wget -q -O "${TMPDIR}/cuda-keyring.deb" "${REPO_URL}/cuda-keyring_1.1-1_all.deb"
+        sudo dpkg -i "${TMPDIR}/cuda-keyring.deb"
+        rm -rf "${TMPDIR}"
+        sudo apt-get update -y
+        sudo apt-get install -y --no-install-recommends \
+          cuda-nvcc-${{ inputs.cuda-version }} \
+          cuda-cudart-dev-${{ inputs.cuda-version }}
+        # Expose to subsequent steps. CMake's find_package(CUDAToolkit)
+        # picks up CUDA_PATH or finds nvcc via PATH.
+        CUDA_VERSION_DOTTED="$(echo '${{ inputs.cuda-version }}' | tr '-' '.')"
+        echo "CUDA_PATH=/usr/local/cuda-${CUDA_VERSION_DOTTED}" >> "$GITHUB_ENV"
+        echo "/usr/local/cuda-${CUDA_VERSION_DOTTED}/bin" >> "$GITHUB_PATH"
diff --git a/.github/workflows/build-ubuntu.yml b/.github/workflows/build-ubuntu.yml
index 0a47318a6..8c9e76f95 100644
--- a/.github/workflows/build-ubuntu.yml
+++ b/.github/workflows/build-ubuntu.yml
@@ -47,16 +47,12 @@ jobs:
         sudo apt-get update
         sudo apt-get install -y patchelf
 
-    # CUDA toolkit needed at link time for viz_core (cuda_texture +
-    # device_image link libcudart). The wheel itself excludes
-    # libcuda.so.1 via auditwheel; runtime CUDA is the consumer's
-    # responsibility.
+    # CUDA toolkit needed at link time for viz_core (device_image
+    # links libcudart). The wheel itself excludes libcuda.so.1 via
+    # auditwheel; runtime CUDA is the consumer's responsibility.
+    # Custom action handles both x86_64 and ARM64 (sbsa) repos.
     - name: Install CUDA toolkit
-      uses: Jimver/cuda-toolkit@v0.2.19
-      with:
-        cuda: '12.4.0'
-        method: 'network'
-        sub-packages: '["nvcc", "cudart"]'
+      uses: ./.github/actions/setup-cuda
 
     - name: Setup CloudXR SDK
       id: cloudxr-sdk
@@ -277,11 +273,7 @@ jobs:
         sudo apt-get install -y build-essential cmake libx11-dev clang-format-14 ccache libvulkan-dev glslang-tools
 
     - name: Install CUDA toolkit
-      uses: Jimver/cuda-toolkit@v0.2.19
-      with:
-        cuda: '12.4.0'
-        method: 'network'
-        sub-packages: '["nvcc", "cudart"]'
+      uses: ./.github/actions/setup-cuda
 
     - name: Cache ccache
       uses: actions/cache@v5

From e09ac9c8a02b6601b5c6a70d71eef97561c2d21e Mon Sep 17 00:00:00 2001
From: Farbod Motlagh <fmotlagh@nvidia.com>
Date: Fri, 1 May 2026 13:31:39 -0700
Subject: [PATCH 04/10] ci: disable BUILD_VIZ on Windows runner (no CUDA
 installed)

viz_core links CUDAToolkit::cudart, but the experimental Windows CI
runner doesn't have CUDA installed. With BUILD_VIZ=ON we hit
"Could not find nvcc" at find_package(CUDAToolkit) time. No Windows-XR
consumer for viz today, so flip it OFF for now and add CUDA install
to build-windows.yml when we have a real reason to ship viz on Windows.

Signed-off-by: Farbod Motlagh <fmotlagh@nvidia.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/workflows/build-windows.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-windows.yml b/.github/workflows/build-windows.yml
index 0ef005806..a72de2fed 100644
--- a/.github/workflows/build-windows.yml
+++ b/.github/workflows/build-windows.yml
@@ -74,6 +74,9 @@ jobs:
       # sccache does not work with VSBuild, so we use Ninja generator here.
       # -G Ninja would pickup mingw64 by default on Windows, so we explicitly set the compiler to cl.exe
       # Force embedded debug info (Z7) to avoid PDB contention whenever debug info is generated.
+      # BUILD_VIZ is OFF on Windows: viz_core links CUDAToolkit (cudart) and the Windows
+      # runner doesn't have CUDA installed. No Windows-XR consumer today, so flip it back on
+      # only when we add CUDA install to this job.
       run: >
         cmake -B build -G Ninja
         -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
@@ -84,7 +87,7 @@ jobs:
         -DCMAKE_CXX_COMPILER="cl.exe"
         -DCMAKE_MSVC_DEBUG_INFORMATION_FORMAT=Embedded
         -DBUILD_PLUGIN_OAK_CAMERA=ON
-        -DBUILD_VIZ=ON
+        -DBUILD_VIZ=OFF
 
     - name: Build
       run: cmake --build build --parallel

From c2aa6fb6e2d39a5cbf1bd450a5a37601e4224583 Mon Sep 17 00:00:00 2001
From: Farbod Motlagh <fmotlagh@nvidia.com>
Date: Fri, 1 May 2026 13:36:58 -0700
Subject: [PATCH 05/10] ci: install CUDA on Windows runner (re-enable
 BUILD_VIZ)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Jimver/cuda-toolkit works correctly on Windows — the /x86_64/
hardcoding bug only bites on Linux ARM. Runs NVIDIA's silent
network installer and sets CUDA_PATH so find_package(CUDAToolkit)
succeeds. Re-enables BUILD_VIZ=ON for the experimental Windows job.

Signed-off-by: Farbod Motlagh <fmotlagh@nvidia.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/workflows/build-windows.yml | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build-windows.yml b/.github/workflows/build-windows.yml
index a72de2fed..a9cd2d93a 100644
--- a/.github/workflows/build-windows.yml
+++ b/.github/workflows/build-windows.yml
@@ -69,14 +69,23 @@ jobs:
         version: 1.3.290.0
         cache: true
 
+    # CUDA Toolkit needed at link time for viz_core (device_image links
+    # cudart). Jimver action works correctly on Windows (the arm64
+    # /x86_64/ hardcoding bug is Linux-only); runs NVIDIA's silent
+    # network installer and sets CUDA_PATH so find_package(CUDAToolkit)
+    # locates the install.
+    - name: Install CUDA Toolkit
+      uses: Jimver/cuda-toolkit@v0.2.19
+      with:
+        cuda: '12.4.0'
+        method: 'network'
+        sub-packages: '["nvcc", "cudart"]'
+
     - name: Configure CMake
       # Note:
       # sccache does not work with VSBuild, so we use Ninja generator here.
       # -G Ninja would pickup mingw64 by default on Windows, so we explicitly set the compiler to cl.exe
       # Force embedded debug info (Z7) to avoid PDB contention whenever debug info is generated.
-      # BUILD_VIZ is OFF on Windows: viz_core links CUDAToolkit (cudart) and the Windows
-      # runner doesn't have CUDA installed. No Windows-XR consumer today, so flip it back on
-      # only when we add CUDA install to this job.
       run: >
         cmake -B build -G Ninja
         -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
@@ -87,7 +96,7 @@ jobs:
         -DCMAKE_CXX_COMPILER="cl.exe"
         -DCMAKE_MSVC_DEBUG_INFORMATION_FORMAT=Embedded
         -DBUILD_PLUGIN_OAK_CAMERA=ON
-        -DBUILD_VIZ=OFF
+        -DBUILD_VIZ=ON
 
     - name: Build
       run: cmake --build build --parallel

From 2203957fca935412622b2cc9c5fe2c1f6c7f1f5e Mon Sep 17 00:00:00 2001
From: Farbod Motlagh <fmotlagh@nvidia.com>
Date: Fri, 1 May 2026 13:46:01 -0700
Subject: [PATCH 06/10] ci: bundle libcudart.so.12 in viz-tests artifact

GPU runners have the NVIDIA driver (libcuda.so.1) but not the CUDA
Toolkit (libcudart.so.12). After M3a's CUDA dependency landed, viz
test binaries fail to load on the GPU runners with "cannot open shared
object file: libcudart.so.12".

Bundle libcudart.so.12 from the build host's CUDA install into the
viz-tests-* artifact, then point LD_LIBRARY_PATH at the artifact dir
when running tests. Same pattern as auditwheel for the Python wheel.

Signed-off-by: Farbod Motlagh <fmotlagh@nvidia.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/workflows/build-ubuntu.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/.github/workflows/build-ubuntu.yml b/.github/workflows/build-ubuntu.yml
index 8c9e76f95..213c19b7b 100644
--- a/.github/workflows/build-ubuntu.yml
+++ b/.github/workflows/build-ubuntu.yml
@@ -119,6 +119,16 @@ jobs:
           echo "No viz test binaries found under build/src/viz/"
           exit 1
         fi
+        # Bundle libcudart.so.12 alongside the binaries so test-viz-gpu
+        # runners (which have the NVIDIA driver but not the CUDA Toolkit)
+        # can resolve it via LD_LIBRARY_PATH=. without installing the
+        # toolkit. Mirrors the auditwheel approach used for the wheel.
+        CUDART_PATH="${CUDA_PATH:-/usr/local/cuda}/lib64/libcudart.so.12"
+        if [[ ! -f "$CUDART_PATH" ]]; then
+          echo "libcudart.so.12 not found at $CUDART_PATH"
+          exit 1
+        fi
+        cp -v "$CUDART_PATH" viz-tests-pkg/
         tar -cvzf viz-tests.tar.gz -C viz-tests-pkg .
 
     - name: Upload viz test binaries (Release, py3.11 only)
@@ -208,6 +218,10 @@ jobs:
       run: |
         set -euo pipefail
         cd viz-tests
+        # libcudart.so.12 is bundled in the artifact next to the binaries —
+        # GPU runners have the NVIDIA driver (libcuda.so.1) but not the
+        # full CUDA Toolkit, so cudart has to come with us.
+        export LD_LIBRARY_PATH="${PWD}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
         # Run [gpu]-tagged tests for every viz_*_tests binary. The build-ubuntu
         # job already covered [unit] tests on a CPU-only runner; here we focus
         # on the GPU paths that genuinely need Vulkan/CUDA at runtime.

From 604c707bb414e728a48a7f65bb7cae0985bee125 Mon Sep 17 00:00:00 2001
From: Farbod Motlagh <fmotlagh@nvidia.com>
Date: Fri, 1 May 2026 13:56:15 -0700
Subject: [PATCH 07/10] ci: install CUDA on GPU runner instead of bundling
 libcudart

Reverses the bundle-libcudart-in-artifact hack from 2203957f. Run the
same setup-cuda composite action on the test-viz-gpu job: NVIDIA's
apt postinst registers /etc/ld.so.conf.d/cuda-12-4.conf so libcudart
lands on the standard ld.so search path with no LD_LIBRARY_PATH or
artifact gymnastics needed. Symmetric with build-ubuntu, and we'd
need cudart on the runner anyway for richer GPU tests in M3b+.

Signed-off-by: Farbod Motlagh <fmotlagh@nvidia.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/workflows/build-ubuntu.yml | 31 ++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/build-ubuntu.yml b/.github/workflows/build-ubuntu.yml
index 213c19b7b..337d896c1 100644
--- a/.github/workflows/build-ubuntu.yml
+++ b/.github/workflows/build-ubuntu.yml
@@ -119,16 +119,6 @@ jobs:
           echo "No viz test binaries found under build/src/viz/"
           exit 1
         fi
-        # Bundle libcudart.so.12 alongside the binaries so test-viz-gpu
-        # runners (which have the NVIDIA driver but not the CUDA Toolkit)
-        # can resolve it via LD_LIBRARY_PATH=. without installing the
-        # toolkit. Mirrors the auditwheel approach used for the wheel.
-        CUDART_PATH="${CUDA_PATH:-/usr/local/cuda}/lib64/libcudart.so.12"
-        if [[ ! -f "$CUDART_PATH" ]]; then
-          echo "libcudart.so.12 not found at $CUDART_PATH"
-          exit 1
-        fi
-        cp -v "$CUDART_PATH" viz-tests-pkg/
         tar -cvzf viz-tests.tar.gz -C viz-tests-pkg .
 
     - name: Upload viz test binaries (Release, py3.11 only)
@@ -202,6 +192,14 @@ jobs:
         arch: ['x64', 'arm64']
 
     steps:
+    # Sparse checkout so the local composite action below
+    # (.github/actions/setup-cuda) is available; we don't need source.
+    - name: Checkout (sparse — actions only)
+      uses: actions/checkout@v6
+      with:
+        sparse-checkout: .github
+        sparse-checkout-cone-mode: false
+
     - name: Download viz test binaries
       uses: actions/download-artifact@v7
       with:
@@ -214,14 +212,19 @@ jobs:
         tar -xzvf viz-tests.tar.gz -C viz-tests
         chmod +x viz-tests/viz_*_tests
 
+    # GPU runners have the NVIDIA driver (libcuda.so.1) but not the
+    # CUDA Toolkit. viz_core links libcudart.so.12 (toolkit-side) so
+    # we need the toolkit installed here too. The cuda-cudart apt
+    # package's postinst drops /etc/ld.so.conf.d/cuda-*.conf, so
+    # libcudart lands on the standard ld.so search path — no
+    # LD_LIBRARY_PATH gymnastics needed.
+    - name: Install CUDA toolkit
+      uses: ./.github/actions/setup-cuda
+
     - name: Run viz [gpu] tests
       run: |
         set -euo pipefail
         cd viz-tests
-        # libcudart.so.12 is bundled in the artifact next to the binaries —
-        # GPU runners have the NVIDIA driver (libcuda.so.1) but not the
-        # full CUDA Toolkit, so cudart has to come with us.
-        export LD_LIBRARY_PATH="${PWD}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
         # Run [gpu]-tagged tests for every viz_*_tests binary. The build-ubuntu
         # job already covered [unit] tests on a CPU-only runner; here we focus
         # on the GPU paths that genuinely need Vulkan/CUDA at runtime.

From feb11ae6ee61cbfc965d2257c4f806b42fc6d5bb Mon Sep 17 00:00:00 2001
From: Farbod Motlagh <fmotlagh@nvidia.com>
Date: Fri, 1 May 2026 13:58:37 -0700
Subject: [PATCH 08/10] device_image: portable close_fd shim so Windows builds
 compile
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

device_image.cpp included <unistd.h> for ::close on the fd returned
by vkGetMemoryFdKHR. MSVC has no <unistd.h>; build-windows fails with
"Cannot open include file: 'unistd.h'".

Wrap close in a tiny shim: <io.h>+_close on _WIN32, <unistd.h>+close
elsewhere. The whole fd path is unreachable at runtime on Windows
(vkGetMemoryFdKHR returns nullptr on that platform → import_to_cuda
throws before memory_fd_ is set), but we still need a clean compile.

Signed-off-by: Farbod Motlagh <fmotlagh@nvidia.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/viz/core/cpp/device_image.cpp | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/src/viz/core/cpp/device_image.cpp b/src/viz/core/cpp/device_image.cpp
index 4bc80e249..a02053b58 100644
--- a/src/viz/core/cpp/device_image.cpp
+++ b/src/viz/core/cpp/device_image.cpp
@@ -6,7 +6,31 @@
 
 #include <stdexcept>
 #include <string>
-#include <unistd.h>
+
+// Posix close() lives in <unistd.h> on Linux/macOS; Windows uses _close()
+// from <io.h>. The fd-close path is unreachable at runtime on Windows
+// (vkGetMemoryFdKHR isn't available there — import_to_cuda throws before
+// memory_fd_ is ever assigned), but the code still has to compile under
+// MSVC for the experimental Windows build.
+#ifdef _WIN32
+#    include <io.h>
+namespace
+{
+inline int close_fd(int fd) noexcept
+{
+    return ::_close(fd);
+}
+} // namespace
+#else
+#    include <unistd.h>
+namespace
+{
+inline int close_fd(int fd) noexcept
+{
+    return ::close(fd);
+}
+} // namespace
+#endif
 
 namespace viz
 {
@@ -149,7 +173,7 @@ void DeviceImage::destroy()
         // CUDA dups the fd internally on import, so we close our copy.
         // If import failed before our explicit close, fd_ may still
         // hold our copy — close it here.
-        ::close(memory_fd_);
+        close_fd(memory_fd_);
         memory_fd_ = -1;
     }
 
@@ -308,7 +332,7 @@ void DeviceImage::import_to_cuda()
     check_cuda(cudaImportExternalMemory(&cuda_external_memory_, &ext_desc), "cudaImportExternalMemory");
 
     // CUDA dup'd the fd internally; close ours so we don't double-free.
-    ::close(memory_fd_);
+    close_fd(memory_fd_);
     memory_fd_ = -1;
 
     cudaExternalMemoryMipmappedArrayDesc array_desc{};

From 4eb4db18b3998c65f4dd76735e9f5acc34295b33 Mon Sep 17 00:00:00 2001
From: Farbod Motlagh <fmotlagh@nvidia.com>
Date: Fri, 1 May 2026 15:06:31 -0700
Subject: [PATCH 09/10] viz_core: static-link cudart so artifacts are
 self-contained
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Switch CUDA::cudart -> CUDA::cudart_static. Makes the wheel and
viz_*_tests artifacts self-contained:
- Wheel: _viz.so has no dynamic libcudart.so.12 dep so auditwheel
  bundles nothing CUDA-related (release artifact stays clean).
- Test artifacts: run on GPU runners that have only the NVIDIA driver
  (libcuda.so.1). The self-hosted runner's sudo policy disallows apt
  installs from a job step, so we can't install the toolkit there.
- Drops the back-and-forth between bundling libcudart in the artifact
  vs. installing CUDA on the GPU runner — neither is needed now.

Build host still needs the CUDA Toolkit for libcudart_static.a;
setup-cuda already covers that on build-ubuntu / test-viz-sanitizers.

Tradeoff: ~3 MB binary growth per consumer. Safe today because
viz_core is the only CUDA-using component in the codebase.

Signed-off-by: Farbod Motlagh <fmotlagh@nvidia.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/viz/core/cpp/CMakeLists.txt | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/viz/core/cpp/CMakeLists.txt b/src/viz/core/cpp/CMakeLists.txt
index 4116624a9..6962bc5ca 100644
--- a/src/viz/core/cpp/CMakeLists.txt
+++ b/src/viz/core/cpp/CMakeLists.txt
@@ -8,7 +8,17 @@ find_package(Vulkan REQUIRED)
 
 # CUDA Toolkit provides the CUDA-Vulkan interop runtime APIs
 # (cudaImportExternalMemory, cudaExternalMemoryGetMappedMipmappedArray)
-# used by cuda_texture / device_image. Required when BUILD_VIZ=ON.
+# used by device_image. Required when BUILD_VIZ=ON.
+#
+# We link CUDA::cudart_static (not CUDA::cudart) so consumers of viz_core
+# don't have a runtime libcudart.so.12 dependency:
+#   - The wheel ships a fully self-contained _viz.so. auditwheel sees no
+#     libcudart symbols to bundle.
+#   - viz_*_tests artifacts run on GPU runners that have only the NVIDIA
+#     driver (libcuda.so.1) installed; static cudart sidesteps the
+#     "install the toolkit on every runner" problem.
+# Tradeoff: ~3 MB binary growth per consumer. Safe because viz_core is
+# the only CUDA-using component in this codebase today.
 find_package(CUDAToolkit REQUIRED)
 
 # Foundational types and Vulkan/CUDA primitives shared by all viz sub-modules.
@@ -36,7 +46,7 @@ target_link_libraries(viz_core
     PUBLIC
         Vulkan::Vulkan
         glm::glm
-        CUDA::cudart
+        CUDA::cudart_static
 )
 
 # Aliased as viz::core (consumers say viz::core, not viz::viz_core).

From 630870d8c5e1641b9d53dc8dc89c6f4ec6d4f18f Mon Sep 17 00:00:00 2001
From: Farbod Motlagh <fmotlagh@nvidia.com>
Date: Fri, 1 May 2026 15:25:45 -0700
Subject: [PATCH 10/10] ci: drop setup-cuda from test-viz-gpu (cudart now
 static-linked)

Now that viz_core links libcudart_static (4eb4db18), the GPU runner
no longer needs the CUDA Toolkit. The previous setup-cuda step was
failing on the self-hosted runner anyway (sudo policy), and is now
unnecessary. Test binaries depend only on the NVIDIA driver
(libcuda.so.1) which is already present on the runner.

Signed-off-by: Farbod Motlagh <fmotlagh@nvidia.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/workflows/build-ubuntu.yml | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/build-ubuntu.yml b/.github/workflows/build-ubuntu.yml
index 337d896c1..58b17402e 100644
--- a/.github/workflows/build-ubuntu.yml
+++ b/.github/workflows/build-ubuntu.yml
@@ -192,14 +192,6 @@ jobs:
         arch: ['x64', 'arm64']
 
     steps:
-    # Sparse checkout so the local composite action below
-    # (.github/actions/setup-cuda) is available; we don't need source.
-    - name: Checkout (sparse — actions only)
-      uses: actions/checkout@v6
-      with:
-        sparse-checkout: .github
-        sparse-checkout-cone-mode: false
-
     - name: Download viz test binaries
       uses: actions/download-artifact@v7
       with:
@@ -212,14 +204,9 @@ jobs:
         tar -xzvf viz-tests.tar.gz -C viz-tests
         chmod +x viz-tests/viz_*_tests
 
-    # GPU runners have the NVIDIA driver (libcuda.so.1) but not the
-    # CUDA Toolkit. viz_core links libcudart.so.12 (toolkit-side) so
-    # we need the toolkit installed here too. The cuda-cudart apt
-    # package's postinst drops /etc/ld.so.conf.d/cuda-*.conf, so
-    # libcudart lands on the standard ld.so search path — no
-    # LD_LIBRARY_PATH gymnastics needed.
-    - name: Install CUDA toolkit
-      uses: ./.github/actions/setup-cuda
+    # No CUDA Toolkit install on the GPU runner — viz_core static-links
+    # libcudart_static, so the binaries only need the NVIDIA driver
+    # (libcuda.so.1) which the runner already has.
 
     - name: Run viz [gpu] tests
       run: |