diff --git a/.github/actions/setup-cuda/action.yml b/.github/actions/setup-cuda/action.yml
new file mode 100644
index 000000000..3dc9590fa
--- /dev/null
+++ b/.github/actions/setup-cuda/action.yml
@@ -0,0 +1,42 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+name: Setup CUDA Toolkit
+description: Install CUDA Toolkit (cudart + nvcc) via NVIDIA's apt repository.
+  Architecture-aware (x86_64 / sbsa) so it works on both ubuntu-22.04 and
+  ubuntu-22.04-arm runners. Replaces the Jimver/cuda-toolkit action, which
+  hardcodes /x86_64/ in the repo URL and fails on ARM runners.
+
+inputs:
+  cuda-version:
+    description: 'CUDA major-minor version (e.g. "12-4" for 12.4)'
+    required: false
+    default: '12-4'
+
+runs:
+  using: composite
+  steps:
+    - name: Install CUDA toolkit (apt)
+      shell: bash
+      run: |
+        set -euo pipefail
+        ARCH=$(dpkg --print-architecture)
+        case "$ARCH" in
+          amd64) CUDA_ARCH=x86_64 ;;
+          arm64) CUDA_ARCH=sbsa ;;
+          *) echo "setup-cuda: unsupported arch '$ARCH'" >&2; exit 1 ;;
+        esac
+        REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${CUDA_ARCH}"
+        TMPDIR=$(mktemp -d)
+        wget -q -O "${TMPDIR}/cuda-keyring.deb" "${REPO_URL}/cuda-keyring_1.1-1_all.deb"
+        sudo dpkg -i "${TMPDIR}/cuda-keyring.deb"
+        rm -rf "${TMPDIR}"
+        sudo apt-get update -y
+        sudo apt-get install -y --no-install-recommends \
+          cuda-nvcc-${{ inputs.cuda-version }} \
+          cuda-cudart-dev-${{ inputs.cuda-version }}
+        # Expose to subsequent steps. CMake's find_package(CUDAToolkit)
+        # picks up CUDA_PATH or finds nvcc via PATH.
+        CUDA_VERSION_DOTTED="$(echo '${{ inputs.cuda-version }}' | tr '-' '.')"
+        echo "CUDA_PATH=/usr/local/cuda-${CUDA_VERSION_DOTTED}" >> "$GITHUB_ENV"
+        echo "/usr/local/cuda-${CUDA_VERSION_DOTTED}/bin" >> "$GITHUB_PATH"
diff --git a/.github/workflows/build-ubuntu.yml b/.github/workflows/build-ubuntu.yml
index eafb25592..58b17402e 100644
--- a/.github/workflows/build-ubuntu.yml
+++ b/.github/workflows/build-ubuntu.yml
@@ -39,13 +39,21 @@ jobs:
     - name: Install Apt dependencies
       run: |
         sudo apt-get update
-        sudo apt-get install -y build-essential cmake libx11-dev clang-format-14 ccache libvulkan-dev
+        sudo apt-get install -y build-essential cmake libx11-dev clang-format-14 ccache libvulkan-dev glslang-tools
 
     - name: Install patchelf (Release only)
       if: ${{ matrix.build_type == 'Release' }}
       run: |
         sudo apt-get update
         sudo apt-get install -y patchelf
+
+    # CUDA toolkit needed at link time for viz_core (device_image
+    # links libcudart). The wheel itself excludes libcuda.so.1 via
+    # auditwheel; runtime CUDA is the consumer's responsibility.
+    # Custom action handles both x86_64 and ARM64 (sbsa) repos.
+    - name: Install CUDA toolkit
+      uses: ./.github/actions/setup-cuda
+
     - name: Setup CloudXR SDK
       id: cloudxr-sdk
       uses: ./.github/actions/setup-cloudxr-sdk
@@ -196,6 +204,10 @@ jobs:
         tar -xzvf viz-tests.tar.gz -C viz-tests
         chmod +x viz-tests/viz_*_tests
 
+    # No CUDA Toolkit install on the GPU runner — viz_core static-links
+    # libcudart_static, so the binaries only need the NVIDIA driver
+    # (libcuda.so.1) which the runner already has.
+
     - name: Run viz [gpu] tests
       run: |
         set -euo pipefail
@@ -262,7 +274,10 @@ jobs:
     - name: Install Apt dependencies
       run: |
         sudo apt-get update
-        sudo apt-get install -y build-essential cmake libx11-dev clang-format-14 ccache libvulkan-dev
+        sudo apt-get install -y build-essential cmake libx11-dev clang-format-14 ccache libvulkan-dev glslang-tools
+
+    - name: Install CUDA toolkit
+      uses: ./.github/actions/setup-cuda
 
     - name: Cache ccache
       uses: actions/cache@v5
diff --git a/.github/workflows/build-windows.yml b/.github/workflows/build-windows.yml
index 0ef005806..a9cd2d93a 100644
--- a/.github/workflows/build-windows.yml
+++ b/.github/workflows/build-windows.yml
@@ -69,6 +69,18 @@ jobs:
         version: 1.3.290.0
         cache: true
 
+    # CUDA Toolkit needed at link time for viz_core (device_image links
+    # cudart). Jimver action works correctly on Windows (the arm64
+    # /x86_64/ hardcoding bug is Linux-only); runs NVIDIA's silent
+    # network installer and sets CUDA_PATH so find_package(CUDAToolkit)
+    # locates the install.
+    - name: Install CUDA Toolkit
+      uses: Jimver/cuda-toolkit@v0.2.19
+      with:
+        cuda: '12.4.0'
+        method: 'network'
+        sub-packages: '["nvcc", "cudart"]'
+
     - name: Configure CMake
       # Note:
       # sccache does not work with VSBuild, so we use Ninja generator here.
diff --git a/deps/README.md b/deps/README.md
index f004d29db..3cb4f3496 100644
--- a/deps/README.md
+++ b/deps/README.md
@@ -54,14 +54,24 @@ machine and are located via `find_package`.
 - **Min version**: 1.2 (Televiz checks `VK_API_VERSION_1_2` at device select time).
 - **License**: Apache 2.0 (loader); per-vendor for ICD drivers.
 
-### CUDA Toolkit (optional, used by examples)
+### CUDA Toolkit
 - **Locator**: `find_package(CUDAToolkit REQUIRED)`
-- **Required by**: `examples/camera_streamer/` only today; `viz` adds CUDA
-  dependency once CUDA-Vulkan interop lands; CI must have CUDA installed for
-  GPU test runner to exercise these paths.
+- **Required by**: `viz/core/` (`CudaTexture`, `DeviceImage` link to
+  `CUDA::cudart`) when `BUILD_VIZ=ON`; also `examples/camera_streamer/`.
 - **Min version**: 12.0
+- **CI**: installed via `Jimver/cuda-toolkit@v0.2.x` action with
+  `nvcc` + `cudart` sub-packages.
 - **License**: NVIDIA EULA
 
+### glslangValidator (shader compiler)
+- **Locator**: `find_program(GLSLANG_VALIDATOR glslangValidator REQUIRED)`
+- **Required by**: `viz/shaders/` to compile `.vert` / `.frag` GLSL
+  into SPIR-V at build time.
+- **Linux**: `apt-get install glslang-tools`
+- **Windows**: ships with the LunarG Vulkan SDK
+- **macOS**: `brew install glslang`
+- **License**: BSD-3 / Khronos
+
 ## Third-Party Dependencies
 
 ### OpenXR SDK
diff --git a/src/viz/AGENTS.md b/src/viz/AGENTS.md
index 8219ce243..67735dedd 100644
--- a/src/viz/AGENTS.md
+++ b/src/viz/AGENTS.md
@@ -17,11 +17,19 @@ sibling `<sub-module>_tests/` directory:
 
 - **`viz/core/`** — foundational types + Vulkan/CUDA infrastructure.
   Library: `viz_core`. Today: `VkContext`, `VizBuffer`, `Pose3D`, `Fov`,
-  `Resolution`, `ViewInfo`, `PixelFormat`, `RenderTarget`, `FrameSync`.
-  Future: `cuda_texture`. Math types (`glm::vec3`, `glm::quat`,
+  `Resolution`, `ViewInfo`, `PixelFormat`, `RenderTarget`, `FrameSync`,
+  `HostImage`, `DeviceImage`. `HostImage` / `DeviceImage` are the
+  symmetric pair of owning 2D pixel buffers (CPU bytes vs CUDA-Vulkan
+  interop) — both expose `VizBuffer view()` so generic helpers branch
+  on `VizBuffer::space`. Math types (`glm::vec3`, `glm::quat`,
   `glm::mat4`) come from GLM 1.0.1 (FetchContent in
   `deps/third_party/`); use `glm::value_ptr(mat)` to get a raw `float*`
   for Vulkan / CUDA upload (POD-equivalent layout, no copy).
+  CUDA-Vulkan interop requires CUDA Toolkit at link time
+  (`CUDAToolkit::cudart`). `VkContext::init()` matches the current
+  CUDA device to the chosen Vulkan physical device by UUID — every
+  viz_core type can assume CUDA and Vulkan are talking to the same
+  GPU without re-doing the match.
 - **`viz/layers/`** — `LayerBase` and concrete layers (`QuadLayer`, etc.).
   Library: `viz_layers` (INTERFACE / header-only today; promoted to
   STATIC when the first concrete layer ships). Depends on `viz_core`.
@@ -39,11 +47,18 @@ sibling `<sub-module>_tests/` directory:
   frame loop, type conversion). Library: `viz_xr`. **Optional** behind
   `BUILD_VIZ_XR`. Depends on `viz_core` + OpenXR.
 - **`viz/python/`** — pybind11 module `_viz`, exposed as `isaacteleop.viz`.
-- **`viz/shaders/`** — GLSL → SPIR-V at build time.
+- **`viz/shaders/`** — GLSL → SPIR-V at build time. Library: `viz_shaders`
+  (INTERFACE — exposes generated headers `viz/shaders/<name>.spv.h`,
+  each containing an `inline constexpr alignas(uint32_t) unsigned char`
+  byte array + a `Size` constant). Compilation runs `glslangValidator`
+  (system-installed; CI gets `glslang-tools` apt package). Add new
+  shader programs by dropping `<name>.vert` / `<name>.frag` in
+  `viz/shaders/cpp/` and calling `compile_shader(<name>.vert kVarName)`
+  in the local CMakeLists.
 
 Test directories follow the same per-module pattern:
 `viz/core_tests/`, `viz/layers_tests/`, `viz/session_tests/`,
-`viz/xr_tests/`.
+`viz/shaders_tests/`, `viz/xr_tests/`.
 
 `src/viz/CMakeLists.txt` is an **orchestrator only** — it adds the
 sub-module sub-directories. Sub-module `CMakeLists.txt` files build the
@@ -57,8 +72,17 @@ Build paths that ship viz (the wheel CI on Linux + Windows) pass
 `-DBUILD_VIZ=ON` explicitly. Lean Dockerfiles
 (`examples/teleop_ros2/Dockerfile`) get viz-free builds for free.
 
-When `BUILD_VIZ=ON` you must have Vulkan headers + loader installed:
-`libvulkan-dev` on Linux, LunarG SDK on Windows.
+When `BUILD_VIZ=ON` the build machine must have:
+- **Vulkan headers + loader**: `libvulkan-dev` on Linux, LunarG SDK on
+  Windows.
+- **CUDA Toolkit** (cudart for link, nvcc not strictly required today
+  but expected to be needed for kernels in M3b+): apt
+  `nvidia-cuda-toolkit` or the official NVIDIA installer / CI action
+  (`Jimver/cuda-toolkit`). The wheel excludes `libcuda.so.1` —
+  consumers supply it via NVIDIA driver.
+- **glslangValidator** for shader compilation: `glslang-tools` apt
+  package on Linux, `brew install glslang` on macOS, ships with the
+  Vulkan SDK on Windows.
 
 ## Code conventions
 
diff --git a/src/viz/CMakeLists.txt b/src/viz/CMakeLists.txt
index 18bf7f6c6..642e080b2 100644
--- a/src/viz/CMakeLists.txt
+++ b/src/viz/CMakeLists.txt
@@ -31,6 +31,10 @@ add_subdirectory(layers)
 # layer that drives the per-frame loop and manages the layer registry.
 add_subdirectory(session)
 
+# Shaders: GLSL sources compiled to SPIR-V at build time and embedded
+# as C arrays. Header-only INTERFACE library viz::shaders.
+add_subdirectory(shaders)
+
 # Python bindings (stub today; pybind11 module added later).
 if(BUILD_PYTHON_BINDINGS)
     add_subdirectory(python)
@@ -41,4 +45,5 @@ if(BUILD_TESTING)
     add_subdirectory(core_tests)
     add_subdirectory(layers_tests)
     add_subdirectory(session_tests)
+    add_subdirectory(shaders_tests)
 endif()
diff --git a/src/viz/core/cpp/CMakeLists.txt b/src/viz/core/cpp/CMakeLists.txt
index 2591443b3..6962bc5ca 100644
--- a/src/viz/core/cpp/CMakeLists.txt
+++ b/src/viz/core/cpp/CMakeLists.txt
@@ -3,12 +3,28 @@
 
 cmake_minimum_required(VERSION 3.20)
 
-# Vulkan is required for vk_context.
+# Vulkan is always required for vk_context / render_target / etc.
 find_package(Vulkan REQUIRED)
 
+# CUDA Toolkit provides the CUDA-Vulkan interop runtime APIs
+# (cudaImportExternalMemory, cudaExternalMemoryGetMappedMipmappedArray)
+# used by device_image. Required when BUILD_VIZ=ON.
+#
+# We link CUDA::cudart_static (not CUDA::cudart) so consumers of viz_core
+# don't have a runtime libcudart.so.12 dependency:
+#   - The wheel ships a fully self-contained _viz.so. auditwheel sees no
+#     libcudart symbols to bundle.
+#   - viz_*_tests artifacts run on GPU runners that have only the NVIDIA
+#     driver (libcuda.so.1) installed; static cudart sidesteps the
+#     "install the toolkit on every runner" problem.
+# Tradeoff: ~3 MB binary growth per consumer. Safe because viz_core is
+# the only CUDA-using component in this codebase today.
+find_package(CUDAToolkit REQUIRED)
+
 # Foundational types and Vulkan/CUDA primitives shared by all viz sub-modules.
-# Static lib; consumed in-tree via viz::viz_core.
+# Static lib; consumed in-tree via viz::core.
 add_library(viz_core STATIC
+    device_image.cpp
     frame_sync.cpp
     render_target.cpp
     vk_context.cpp
@@ -30,6 +46,7 @@ target_link_libraries(viz_core
     PUBLIC
         Vulkan::Vulkan
         glm::glm
+        CUDA::cudart_static
 )
 
 # Aliased as viz::core (consumers say viz::core, not viz::viz_core).
diff --git a/src/viz/core/cpp/device_image.cpp b/src/viz/core/cpp/device_image.cpp
new file mode 100644
index 000000000..a02053b58
--- /dev/null
+++ b/src/viz/core/cpp/device_image.cpp
@@ -0,0 +1,438 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include <viz/core/device_image.hpp>
+#include <viz/core/vk_context.hpp>
+
+#include <stdexcept>
+#include <string>
+
+// Posix close() lives in <unistd.h> on Linux/macOS; Windows uses _close()
+// from <io.h>. The fd-close path is unreachable at runtime on Windows
+// (vkGetMemoryFdKHR isn't available there — import_to_cuda throws before
+// memory_fd_ is ever assigned), but the code still has to compile under
+// MSVC for the experimental Windows build.
+#ifdef _WIN32
+#    include <io.h>
+namespace
+{
+inline int close_fd(int fd) noexcept
+{
+    return ::_close(fd);
+}
+} // namespace
+#else
+#    include <unistd.h>
+namespace
+{
+inline int close_fd(int fd) noexcept
+{
+    return ::close(fd);
+}
+} // namespace
+#endif
+
+namespace viz
+{
+
+namespace
+{
+
+void check_vk(VkResult result, const char* what)
+{
+    if (result != VK_SUCCESS)
+    {
+        throw std::runtime_error(std::string("DeviceImage: ") + what + " failed: VkResult=" + std::to_string(result));
+    }
+}
+
+void check_cuda(cudaError_t result, const char* what)
+{
+    if (result != cudaSuccess)
+    {
+        throw std::runtime_error(std::string("DeviceImage: ") + what + " failed: " + cudaGetErrorString(result));
+    }
+}
+
+uint32_t find_memory_type(VkPhysicalDevice physical_device, uint32_t type_bits, VkMemoryPropertyFlags properties)
+{
+    VkPhysicalDeviceMemoryProperties mem_props;
+    vkGetPhysicalDeviceMemoryProperties(physical_device, &mem_props);
+    for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i)
+    {
+        if ((type_bits & (1u << i)) != 0 && (mem_props.memoryTypes[i].propertyFlags & properties) == properties)
+        {
+            return i;
+        }
+    }
+    throw std::runtime_error("DeviceImage: no Vulkan memory type matching requested properties");
+}
+
+VkFormat to_vk_format(PixelFormat format)
+{
+    switch (format)
+    {
+    case PixelFormat::kRGBA8:
+        // UNORM (not SRGB) so CUDA writes round-trip without an
+        // implicit sRGB encode on the Vulkan side. Color management
+        // is the layer's concern.
+        return VK_FORMAT_R8G8B8A8_UNORM;
+    case PixelFormat::kD32F:
+        return VK_FORMAT_D32_SFLOAT;
+    }
+    throw std::runtime_error("DeviceImage: unsupported PixelFormat");
+}
+
+cudaChannelFormatDesc to_cuda_format(PixelFormat format)
+{
+    switch (format)
+    {
+    case PixelFormat::kRGBA8:
+        return cudaCreateChannelDesc<uchar4>();
+    case PixelFormat::kD32F:
+        return cudaCreateChannelDesc<float>();
+    }
+    throw std::runtime_error("DeviceImage: unsupported PixelFormat");
+}
+
+} // namespace
+
+std::unique_ptr<DeviceImage> DeviceImage::create(const VkContext& ctx, Resolution resolution, PixelFormat format)
+{
+    if (!ctx.is_initialized())
+    {
+        throw std::invalid_argument("DeviceImage: VkContext is not initialized");
+    }
+    if (resolution.width == 0 || resolution.height == 0)
+    {
+        throw std::invalid_argument("DeviceImage: resolution must be non-zero");
+    }
+    std::unique_ptr<DeviceImage> img(new DeviceImage(ctx, resolution, format));
+    img->init();
+    return img;
+}
+
+DeviceImage::DeviceImage(const VkContext& ctx, Resolution resolution, PixelFormat format)
+    : ctx_(&ctx), resolution_(resolution), format_(format), vk_format_(to_vk_format(format))
+{
+}
+
+DeviceImage::~DeviceImage()
+{
+    destroy();
+}
+
+void DeviceImage::init()
+{
+    try
+    {
+        create_vk_image_with_external_memory();
+        create_vk_image_view();
+        import_to_cuda();
+        transition_to_shader_read();
+    }
+    catch (...)
+    {
+        destroy();
+        throw;
+    }
+}
+
+void DeviceImage::destroy()
+{
+    // Pin CUDA device for this thread so the CUDA frees below land on
+    // the right device even if destroy() runs on a thread that never
+    // ran VkContext::init(). Best-effort — destructor must not throw.
+    if (ctx_ != nullptr && ctx_->cuda_device_id() >= 0)
+    {
+        (void)cudaSetDevice(ctx_->cuda_device_id());
+    }
+
+    // CUDA side first; CUDA holds a dup'd handle on the underlying
+    // memory, so the VkDeviceMemory must outlive the CUDA mapping.
+    // cudaDeviceSynchronize ensures any caller-issued async CUDA work
+    // (e.g. cudaMemcpy2DToArrayAsync) has retired before we free the
+    // array — otherwise CUDA may UAF its own staging.
+    if (cuda_mipmapped_array_ != nullptr || cuda_external_memory_ != nullptr)
+    {
+        (void)cudaDeviceSynchronize();
+    }
+    if (cuda_mipmapped_array_ != nullptr)
+    {
+        (void)cudaFreeMipmappedArray(cuda_mipmapped_array_);
+        cuda_mipmapped_array_ = nullptr;
+        cuda_array_ = nullptr;
+    }
+    if (cuda_external_memory_ != nullptr)
+    {
+        (void)cudaDestroyExternalMemory(cuda_external_memory_);
+        cuda_external_memory_ = nullptr;
+    }
+    if (memory_fd_ >= 0)
+    {
+        // CUDA dups the fd internally on import, so we close our copy.
+        // If import failed before our explicit close, fd_ may still
+        // hold our copy — close it here.
+        close_fd(memory_fd_);
+        memory_fd_ = -1;
+    }
+
+    if (ctx_ == nullptr)
+    {
+        return;
+    }
+    const VkDevice device = ctx_->device();
+    if (device == VK_NULL_HANDLE)
+    {
+        return;
+    }
+    // Wait for all GPU work to retire before tearing down Vulkan
+    // resources.
+    (void)vkDeviceWaitIdle(device);
+    if (command_pool_ != VK_NULL_HANDLE)
+    {
+        vkDestroyCommandPool(device, command_pool_, nullptr);
+        command_pool_ = VK_NULL_HANDLE;
+    }
+    if (image_view_ != VK_NULL_HANDLE)
+    {
+        vkDestroyImageView(device, image_view_, nullptr);
+        image_view_ = VK_NULL_HANDLE;
+    }
+    if (image_ != VK_NULL_HANDLE)
+    {
+        vkDestroyImage(device, image_, nullptr);
+        image_ = VK_NULL_HANDLE;
+    }
+    if (memory_ != VK_NULL_HANDLE)
+    {
+        vkFreeMemory(device, memory_, nullptr);
+        memory_ = VK_NULL_HANDLE;
+    }
+    current_layout_ = VK_IMAGE_LAYOUT_UNDEFINED;
+}
+
+VizBuffer DeviceImage::view() const noexcept
+{
+    VizBuffer b;
+    b.data = static_cast<void*>(cuda_array_);
+    b.width = resolution_.width;
+    b.height = resolution_.height;
+    b.format = format_;
+    b.pitch = static_cast<size_t>(resolution_.width) * bytes_per_pixel(format_);
+    b.space = MemorySpace::kDevice;
+    return b;
+}
+
+void DeviceImage::create_vk_image_with_external_memory()
+{
+    const VkDevice device = ctx_->device();
+
+    // Image with external-memory export flag. Optimal tiling — CUDA
+    // accesses the image via cudaArray_t, not raw memory, so opaque
+    // GPU layout is fine.
+    VkExternalMemoryImageCreateInfo ext_image_info{};
+    ext_image_info.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO;
+    ext_image_info.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT;
+
+    VkImageCreateInfo info{};
+    info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
+    info.pNext = &ext_image_info;
+    info.imageType = VK_IMAGE_TYPE_2D;
+    info.format = vk_format_;
+    info.extent = { resolution_.width, resolution_.height, 1 };
+    info.mipLevels = 1; // Single level — when minification moiré shows up in
+                        // XR distance views, expose mipLevels via Config and
+                        // generate the chain via vkCmdBlitImage pre-render.
+                        // Anisotropic filtering on the sampler is the cheaper
+                        // first line of defense.
+    info.arrayLayers = 1;
+    info.samples = VK_SAMPLE_COUNT_1_BIT;
+    info.tiling = VK_IMAGE_TILING_OPTIMAL;
+    info.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
+    info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+    info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+
+    check_vk(vkCreateImage(device, &info, nullptr, &image_), "vkCreateImage");
+
+    VkMemoryRequirements reqs;
+    vkGetImageMemoryRequirements(device, image_, &reqs);
+
+    // Memory backing the image: device-local + exportable as POSIX fd.
+    // No VkMemoryDedicatedAllocateInfo / cudaExternalMemoryDedicated —
+    // a generic allocation works for sampled 2D images and avoids the
+    // dedicated-allocation extension wiring.
+    VkExportMemoryAllocateInfo export_info{};
+    export_info.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO;
+    export_info.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT;
+
+    VkMemoryAllocateInfo alloc{};
+    alloc.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+    alloc.pNext = &export_info;
+    alloc.allocationSize = reqs.size;
+    alloc.memoryTypeIndex =
+        find_memory_type(ctx_->physical_device(), reqs.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+    check_vk(vkAllocateMemory(device, &alloc, nullptr, &memory_), "vkAllocateMemory");
+    check_vk(vkBindImageMemory(device, image_, memory_, 0), "vkBindImageMemory");
+
+    auto vkGetMemoryFdKHR = reinterpret_cast<PFN_vkGetMemoryFdKHR>(vkGetDeviceProcAddr(device, "vkGetMemoryFdKHR"));
+    if (vkGetMemoryFdKHR == nullptr)
+    {
+        throw std::runtime_error(
+            "DeviceImage: vkGetMemoryFdKHR not available "
+            "(VK_KHR_external_memory_fd not enabled?)");
+    }
+    VkMemoryGetFdInfoKHR fd_info{};
+    fd_info.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR;
+    fd_info.memory = memory_;
+    fd_info.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT;
+    check_vk(vkGetMemoryFdKHR(device, &fd_info, &memory_fd_), "vkGetMemoryFdKHR");
+
+    // Used only for transition_to_*; tiny pool, default flags.
+    VkCommandPoolCreateInfo pool_info{};
+    pool_info.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
+    pool_info.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
+    pool_info.queueFamilyIndex = ctx_->queue_family_index();
+    check_vk(vkCreateCommandPool(device, &pool_info, nullptr, &command_pool_), "vkCreateCommandPool");
+}
+
+void DeviceImage::create_vk_image_view()
+{
+    VkImageViewCreateInfo info{};
+    info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
+    info.image = image_;
+    info.viewType = VK_IMAGE_VIEW_TYPE_2D;
+    info.format = vk_format_;
+    info.subresourceRange.aspectMask =
+        (format_ == PixelFormat::kD32F) ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_COLOR_BIT;
+    info.subresourceRange.baseMipLevel = 0;
+    info.subresourceRange.levelCount = 1;
+    info.subresourceRange.baseArrayLayer = 0;
+    info.subresourceRange.layerCount = 1;
+    check_vk(vkCreateImageView(ctx_->device(), &info, nullptr, &image_view_), "vkCreateImageView");
+}
+
+void DeviceImage::import_to_cuda()
+{
+    // cudaSetDevice is per-host-thread; VkContext set it on the init
+    // thread, but DeviceImage::create() may run on a different one.
+    // Re-pin here so cudaImportExternalMemory / GetMappedMipmappedArray
+    // talk to the same physical GPU as Vulkan.
+    check_cuda(cudaSetDevice(ctx_->cuda_device_id()), "cudaSetDevice");
+
+    VkMemoryRequirements reqs;
+    vkGetImageMemoryRequirements(ctx_->device(), image_, &reqs);
+
+    cudaExternalMemoryHandleDesc ext_desc{};
+    ext_desc.type = cudaExternalMemoryHandleTypeOpaqueFd;
+    ext_desc.handle.fd = memory_fd_;
+    ext_desc.size = reqs.size;
+    ext_desc.flags = 0;
+
+    check_cuda(cudaImportExternalMemory(&cuda_external_memory_, &ext_desc), "cudaImportExternalMemory");
+
+    // CUDA dup'd the fd internally; close ours so we don't double-free.
+    close_fd(memory_fd_);
+    memory_fd_ = -1;
+
+    cudaExternalMemoryMipmappedArrayDesc array_desc{};
+    array_desc.offset = 0;
+    array_desc.formatDesc = to_cuda_format(format_);
+    array_desc.extent = make_cudaExtent(resolution_.width, resolution_.height, 0);
+    array_desc.flags = cudaArrayColorAttachment;
+    array_desc.numLevels = 1;
+
+    check_cuda(cudaExternalMemoryGetMappedMipmappedArray(&cuda_mipmapped_array_, cuda_external_memory_, &array_desc),
+               "cudaExternalMemoryGetMappedMipmappedArray");
+    check_cuda(cudaGetMipmappedArrayLevel(&cuda_array_, cuda_mipmapped_array_, 0), "cudaGetMipmappedArrayLevel");
+}
+
+void DeviceImage::transition_to_shader_read()
+{
+    if (current_layout_ == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)
+    {
+        return;
+    }
+    run_one_shot_layout_transition(current_layout_, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+                                   VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT,
+                                   VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);
+    current_layout_ = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+}
+
+void DeviceImage::transition_to_transfer_dst()
+{
+    if (current_layout_ == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL)
+    {
+        return;
+    }
+    run_one_shot_layout_transition(current_layout_, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_ACCESS_SHADER_READ_BIT,
+                                   VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
+                                   VK_PIPELINE_STAGE_TRANSFER_BIT);
+    current_layout_ = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+}
+
+void DeviceImage::run_one_shot_layout_transition(VkImageLayout old_layout,
+                                                 VkImageLayout new_layout,
+                                                 VkAccessFlags src_access,
+                                                 VkAccessFlags dst_access,
+                                                 VkPipelineStageFlags src_stage,
+                                                 VkPipelineStageFlags dst_stage)
+{
+    const VkDevice device = ctx_->device();
+
+    VkCommandBufferAllocateInfo alloc{};
+    alloc.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+    alloc.commandPool = command_pool_;
+    alloc.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+    alloc.commandBufferCount = 1;
+    VkCommandBuffer cmd = VK_NULL_HANDLE;
+    check_vk(vkAllocateCommandBuffers(device, &alloc, &cmd), "vkAllocateCommandBuffers(transition)");
+
+    // RAII: free the command buffer on every exit path (including
+    // exceptions from the check_vk calls below). The pool would
+    // eventually reclaim it on destroy(), but a retry loop after a
+    // transient queue submit failure would leak one cmd per attempt.
+    struct CmdGuard
+    {
+        VkDevice device;
+        VkCommandPool pool;
+        VkCommandBuffer cmd;
+        ~CmdGuard()
+        {
+            vkFreeCommandBuffers(device, pool, 1, &cmd);
+        }
+    } guard{ device, command_pool_, cmd };
+
+    VkCommandBufferBeginInfo begin{};
+    begin.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+    begin.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+    check_vk(vkBeginCommandBuffer(cmd, &begin), "vkBeginCommandBuffer(transition)");
+
+    VkImageMemoryBarrier barrier{};
+    barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+    barrier.oldLayout = old_layout;
+    barrier.newLayout = new_layout;
+    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    barrier.image = image_;
+    barrier.subresourceRange.aspectMask =
+        (format_ == PixelFormat::kD32F) ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_COLOR_BIT;
+    barrier.subresourceRange.baseMipLevel = 0;
+    barrier.subresourceRange.levelCount = 1;
+    barrier.subresourceRange.baseArrayLayer = 0;
+    barrier.subresourceRange.layerCount = 1;
+    barrier.srcAccessMask = src_access;
+    barrier.dstAccessMask = dst_access;
+    vkCmdPipelineBarrier(cmd, src_stage, dst_stage, 0, 0, nullptr, 0, nullptr, 1, &barrier);
+
+    check_vk(vkEndCommandBuffer(cmd), "vkEndCommandBuffer(transition)");
+
+    VkSubmitInfo submit{};
+    submit.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    submit.commandBufferCount = 1;
+    submit.pCommandBuffers = &cmd;
+    check_vk(vkQueueSubmit(ctx_->queue(), 1, &submit, VK_NULL_HANDLE), "vkQueueSubmit(transition)");
+    check_vk(vkQueueWaitIdle(ctx_->queue()), "vkQueueWaitIdle(transition)");
+}
+
+} // namespace viz
diff --git a/src/viz/core/cpp/inc/viz/core/device_image.hpp b/src/viz/core/cpp/inc/viz/core/device_image.hpp
index c29537c0b..5fd82e364 100644
--- a/src/viz/core/cpp/inc/viz/core/device_image.hpp
+++ b/src/viz/core/cpp/inc/viz/core/device_image.hpp
@@ -3,26 +3,116 @@
 
 #pragma once
 
+#include <viz/core/viz_buffer.hpp>
+#include <viz/core/viz_types.hpp>
+#include <vulkan/vulkan.h>
+
+#include <cuda_runtime.h>
+#include <memory>
+
 namespace viz
 {
 
-// Owning device-side 2D pixel buffer with CUDA-Vulkan interop. The
-// symmetric counterpart to HostImage: HostImage owns CPU bytes,
-// DeviceImage will own:
-//   - VkImage / VkBuffer + VkDeviceMemory exported via
-//     VK_KHR_external_memory_fd
-//   - cudaExternalMemory_t imported from that fd, plus the CUDA device
-//     pointer derived from it
-//   - paired cudaExternalSemaphore_t / VkSemaphore for acquire / release
-//     synchronization
-//
-// Returned by Televiz's mode-B submission path (acquire / release) when
-// Televiz allocates the interop buffer for a layer to write into.
+class VkContext;
+
+// Owning CUDA-Vulkan interop image. Vulkan allocates the VkImage
+// (optimal tiling, sampled + transfer-dst); the backing VkDeviceMemory
+// is exported via VK_KHR_external_memory_fd and imported into CUDA as
+// a cudaArray_t. CUDA writes via cuda_array(); Vulkan samples via
+// vk_image(). Symmetric counterpart to HostImage; both expose
+// VizBuffer view() so helpers branch on VizBuffer::space.
 //
-// Intentionally only forward-declared in this milestone — the
-// implementation ships alongside CUDA-Vulkan interop. Callers may pass
-// pointers / references to DeviceImage between modules but cannot
-// instantiate one until then.
-class DeviceImage;
+// Synchronization is heavyweight today (cudaDeviceSynchronize +
+// vkQueueWaitIdle); paired acquire / release semaphores arrive with
+// QuadLayer. CUDA/Vulkan device matching is handled by VkContext.
+class DeviceImage
+{
+public:
+    // Throws std::invalid_argument on bad config; std::runtime_error
+    // on Vulkan or CUDA failure. Pre-initialized.
+    static std::unique_ptr<DeviceImage> create(const VkContext& ctx, Resolution resolution, PixelFormat format);
+
+    ~DeviceImage();
+    void destroy();
+
+    DeviceImage(const DeviceImage&) = delete;
+    DeviceImage& operator=(const DeviceImage&) = delete;
+    DeviceImage(DeviceImage&&) = delete;
+    DeviceImage& operator=(DeviceImage&&) = delete;
+
+    // VizBuffer view (kDevice). `data` is the cudaArray_t cast to
+    // void*; it's an opaque CUDA handle, not a raw device pointer —
+    // use cuda_array() with cudaMemcpy2DToArrayAsync to write.
+    VizBuffer view() const noexcept;
+
+    // CUDA write target. Lifetime tied to this DeviceImage.
+    cudaArray_t cuda_array() const noexcept
+    {
+        return cuda_array_;
+    }
+
+    // Image lives in VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL after
+    // init; transition_to_*() below moves it back and forth.
+    VkImage vk_image() const noexcept
+    {
+        return image_;
+    }
+    VkImageView vk_image_view() const noexcept
+    {
+        return image_view_;
+    }
+    VkFormat vk_format() const noexcept
+    {
+        return vk_format_;
+    }
+
+    Resolution resolution() const noexcept
+    {
+        return resolution_;
+    }
+    PixelFormat format() const noexcept
+    {
+        return format_;
+    }
+
+    // Synchronous one-shot layout transitions (vkQueueSubmit +
+    // vkQueueWaitIdle). For tests / one-shot uploads — production
+    // layers record their own barriers in render commands.
+    void transition_to_shader_read();
+    void transition_to_transfer_dst();
+
+private:
+    explicit DeviceImage(const VkContext& ctx, Resolution resolution, PixelFormat format);
+    void init();
+
+    void create_vk_image_with_external_memory();
+    void create_vk_image_view();
+    void import_to_cuda();
+
+    void run_one_shot_layout_transition(VkImageLayout old_layout,
+                                        VkImageLayout new_layout,
+                                        VkAccessFlags src_access,
+                                        VkAccessFlags dst_access,
+                                        VkPipelineStageFlags src_stage,
+                                        VkPipelineStageFlags dst_stage);
+
+    const VkContext* ctx_ = nullptr;
+    Resolution resolution_{};
+    PixelFormat format_ = PixelFormat::kRGBA8;
+    VkFormat vk_format_ = VK_FORMAT_R8G8B8A8_UNORM;
+    VkImageLayout current_layout_ = VK_IMAGE_LAYOUT_UNDEFINED;
+
+    VkImage image_ = VK_NULL_HANDLE;
+    VkDeviceMemory memory_ = VK_NULL_HANDLE;
+    VkImageView image_view_ = VK_NULL_HANDLE;
+    VkCommandPool command_pool_ = VK_NULL_HANDLE; // For layout transitions only.
+
+    // CUDA dup's the fd internally on import; we close ours after.
+    int memory_fd_ = -1;
+
+    cudaExternalMemory_t cuda_external_memory_ = nullptr;
+    cudaMipmappedArray_t cuda_mipmapped_array_ = nullptr;
+    cudaArray_t cuda_array_ = nullptr; // Level-0 view, non-owning.
+};
 
 } // namespace viz
diff --git a/src/viz/core/cpp/inc/viz/core/vk_context.hpp b/src/viz/core/cpp/inc/viz/core/vk_context.hpp
index e960f30ef..5f12c8690 100644
--- a/src/viz/core/cpp/inc/viz/core/vk_context.hpp
+++ b/src/viz/core/cpp/inc/viz/core/vk_context.hpp
@@ -43,6 +43,14 @@ struct PhysicalDeviceInfo
 //   - A queue family with graphics + compute + transfer flags
 //
 // VkContext owns the Vulkan handles and tears them down on destruction.
+//
+// CUDA-Vulkan device matching: as part of init(), VkContext queries
+// the chosen physical device's UUID and calls cudaSetDevice() on the
+// matching CUDA device. This is required for CUDA-Vulkan interop
+// (cudaImportExternalMemory) to succeed on multi-GPU machines, and
+// it makes VkContext the single chokepoint for "which GPU is Televiz
+// on" — every viz_core type that touches CUDA can assume the current
+// CUDA device matches the Vulkan one.
 class VkContext
 {
 public:
@@ -96,6 +104,15 @@ class VkContext
     uint32_t queue_family_index() const noexcept;
     VkQueue queue() const noexcept;
 
+    // CUDA device id matched to the chosen Vulkan physical device by
+    // UUID at init() time. Useful for callers that need to ensure
+    // their thread is on the right CUDA device before issuing CUDA
+    // calls — cudaSetDevice is per-host-thread, so a CudaTexture /
+    // DeviceImage created on a worker thread must call
+    // cudaSetDevice(ctx.cuda_device_id()) before any CUDA API. Returns
+    // -1 before init() has run.
+    int cuda_device_id() const noexcept;
+
     // Enumerates all Vulkan-capable physical devices and returns their
     // properties. Useful for picking a specific GPU index on multi-GPU
     // machines before calling init().
@@ -109,6 +126,7 @@ class VkContext
     void create_instance(const Config& config);
     void select_physical_device(const Config& config);
     void create_logical_device(const Config& config);
+    void match_cuda_device_to_vulkan();
 
     bool initialized_ = false;
     bool validation_enabled_ = false;
@@ -117,6 +135,7 @@ class VkContext
     VkDevice device_ = VK_NULL_HANDLE;
     uint32_t queue_family_index_ = UINT32_MAX;
     VkQueue queue_ = VK_NULL_HANDLE;
+    int cuda_device_id_ = -1;
 };
 
 } // namespace viz
diff --git a/src/viz/core/cpp/vk_context.cpp b/src/viz/core/cpp/vk_context.cpp
index 1046f1d8e..d2184d038 100644
--- a/src/viz/core/cpp/vk_context.cpp
+++ b/src/viz/core/cpp/vk_context.cpp
@@ -5,6 +5,7 @@
 
 #include <algorithm>
 #include <cstring>
+#include <cuda_runtime.h>
 #include <iostream>
 #include <stdexcept>
 #include <string>
@@ -190,6 +191,7 @@ void VkContext::init(const Config& config)
         create_instance(config);
         select_physical_device(config);
         create_logical_device(config);
+        match_cuda_device_to_vulkan();
         initialized_ = true;
     }
     catch (...)
@@ -214,6 +216,7 @@ void VkContext::destroy()
     physical_device_ = VK_NULL_HANDLE;
     queue_ = VK_NULL_HANDLE;
     queue_family_index_ = UINT32_MAX;
+    cuda_device_id_ = -1;
     validation_enabled_ = false;
     initialized_ = false;
 }
@@ -248,6 +251,11 @@ VkQueue VkContext::queue() const noexcept
     return queue_;
 }
 
+int VkContext::cuda_device_id() const noexcept
+{
+    return cuda_device_id_;
+}
+
 void VkContext::create_instance(const Config& config)
 {
     VkApplicationInfo app_info{};
@@ -406,6 +414,54 @@ void VkContext::create_logical_device(const Config& config)
     vkGetDeviceQueue(device_, queue_family_index_, 0, &queue_);
 }
 
+void VkContext::match_cuda_device_to_vulkan()
+{
+    // Read the Vulkan physical device's UUID, then find the CUDA
+    // device with the same UUID and make it current. Required so
+    // every viz_core type that imports Vulkan memory into CUDA
+    // (e.g. DeviceImage::cudaImportExternalMemory) operates on the
+    // same physical GPU. On multi-GPU machines Vulkan and CUDA
+    // default to different devices and interop fails with
+    // cudaErrorUnknown.
+    VkPhysicalDeviceIDProperties id_props{};
+    id_props.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES;
+    VkPhysicalDeviceProperties2 props2{};
+    props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
+    props2.pNext = &id_props;
+    vkGetPhysicalDeviceProperties2(physical_device_, &props2);
+
+    int cuda_count = 0;
+    cudaError_t err = cudaGetDeviceCount(&cuda_count);
+    if (err != cudaSuccess || cuda_count == 0)
+    {
+        throw std::runtime_error(
+            "VkContext: no CUDA devices visible — CUDA-Vulkan interop requires "
+            "a working CUDA driver");
+    }
+    for (int i = 0; i < cuda_count; ++i)
+    {
+        cudaDeviceProp prop{};
+        err = cudaGetDeviceProperties(&prop, i);
+        if (err != cudaSuccess)
+        {
+            continue;
+        }
+        if (std::memcmp(prop.uuid.bytes, id_props.deviceUUID, VK_UUID_SIZE) == 0)
+        {
+            err = cudaSetDevice(i);
+            if (err != cudaSuccess)
+            {
+                throw std::runtime_error(std::string("VkContext: cudaSetDevice failed: ") + cudaGetErrorString(err));
+            }
+            cuda_device_id_ = i;
+            return;
+        }
+    }
+    throw std::runtime_error(
+        "VkContext: no CUDA device matches the Vulkan physical device's UUID — "
+        "CUDA-Vulkan interop requires same-GPU operation");
+}
+
 std::vector<PhysicalDeviceInfo> VkContext::enumerate_physical_devices()
 {
     std::vector<PhysicalDeviceInfo> result;
diff --git a/src/viz/core_tests/cpp/CMakeLists.txt b/src/viz/core_tests/cpp/CMakeLists.txt
index c9d031d2e..656aff6c6 100644
--- a/src/viz/core_tests/cpp/CMakeLists.txt
+++ b/src/viz/core_tests/cpp/CMakeLists.txt
@@ -4,6 +4,7 @@
 cmake_minimum_required(VERSION 3.20)
 
 add_executable(viz_core_tests
+    test_device_image.cpp
     test_frame_sync.cpp
     test_host_image.cpp
     test_render_target.cpp
diff --git a/src/viz/core_tests/cpp/test_device_image.cpp b/src/viz/core_tests/cpp/test_device_image.cpp
new file mode 100644
index 000000000..fcdb23af0
--- /dev/null
+++ b/src/viz/core_tests/cpp/test_device_image.cpp
@@ -0,0 +1,117 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// [gpu] tests for DeviceImage: verify Vulkan handle creation, the
+// CUDA cudaArray_t is usable from a cudaMemcpy2DToArray call, the
+// VizBuffer view exposes the right shape with MemorySpace::kDevice,
+// and a round-trip copy preserves pixel values.
+
+#include "test_helpers.hpp"
+
+#include <catch2/catch_test_macros.hpp>
+#include <viz/core/device_image.hpp>
+#include <viz/core/viz_buffer.hpp>
+#include <viz/core/vk_context.hpp>
+
+#include <cstdint>
+#include <cuda_runtime.h>
+#include <stdexcept>
+#include <vector>
+
+using viz::DeviceImage;
+using viz::MemorySpace;
+using viz::PixelFormat;
+using viz::Resolution;
+using viz::VizBuffer;
+
+namespace
+{
+
+// Generate a deterministic gradient pattern. Channel R = column / W *
+// 255, G = row / H * 255, B = column XOR row, A = 255. Easy to spot
+// in a debugger and reproducible across test runs.
+std::vector<uint8_t> make_gradient(uint32_t w, uint32_t h)
+{
+    std::vector<uint8_t> px(static_cast<size_t>(w) * h * 4);
+    for (uint32_t y = 0; y < h; ++y)
+    {
+        for (uint32_t x = 0; x < w; ++x)
+        {
+            const size_t i = (static_cast<size_t>(y) * w + x) * 4;
+            px[i + 0] = static_cast<uint8_t>((x * 255u) / (w - 1u));
+            px[i + 1] = static_cast<uint8_t>((y * 255u) / (h - 1u));
+            px[i + 2] = static_cast<uint8_t>((x ^ y) & 0xff);
+            px[i + 3] = 255;
+        }
+    }
+    return px;
+}
+
+} // namespace
+
+TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage creates valid Vulkan + CUDA handles", "[gpu][device_image]")
+{
+    auto img = DeviceImage::create(vk, Resolution{ 64, 64 }, PixelFormat::kRGBA8);
+    REQUIRE(img != nullptr);
+    CHECK(img->vk_image() != VK_NULL_HANDLE);
+    CHECK(img->vk_image_view() != VK_NULL_HANDLE);
+    CHECK(img->vk_format() == VK_FORMAT_R8G8B8A8_UNORM);
+    CHECK(img->cuda_array() != nullptr);
+    CHECK(img->resolution().width == 64);
+    CHECK(img->resolution().height == 64);
+    CHECK(img->format() == PixelFormat::kRGBA8);
+}
+
+TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage::create rejects zero dimensions", "[gpu][device_image]")
+{
+    CHECK_THROWS_AS(DeviceImage::create(vk, Resolution{ 0, 64 }, PixelFormat::kRGBA8), std::invalid_argument);
+    CHECK_THROWS_AS(DeviceImage::create(vk, Resolution{ 64, 0 }, PixelFormat::kRGBA8), std::invalid_argument);
+}
+
+TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage destroy is idempotent", "[gpu][device_image]")
+{
+    auto img = DeviceImage::create(vk, Resolution{ 32, 32 }, PixelFormat::kRGBA8);
+    img->destroy();
+    CHECK(img->vk_image() == VK_NULL_HANDLE);
+    CHECK(img->vk_image_view() == VK_NULL_HANDLE);
+    CHECK(img->cuda_array() == nullptr);
+    img->destroy();
+}
+
+TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage::view exposes a kDevice VizBuffer", "[gpu][device_image]")
+{
+    auto img = DeviceImage::create(vk, Resolution{ 16, 16 }, PixelFormat::kRGBA8);
+    const VizBuffer v = img->view();
+    CHECK(v.space == MemorySpace::kDevice);
+    CHECK(v.width == 16);
+    CHECK(v.height == 16);
+    CHECK(v.format == PixelFormat::kRGBA8);
+    CHECK(v.pitch == static_cast<size_t>(16) * 4);
+    // .data points at the cudaArray_t handle (NOT a raw device pointer);
+    // sanity that it's non-null when the image is alive.
+    CHECK(v.data != nullptr);
+}
+
+TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage round-trip preserves pixel pattern", "[gpu][device_image]")
+{
+    constexpr uint32_t kSide = 64;
+    constexpr size_t kBytes = static_cast<size_t>(kSide) * kSide * 4;
+
+    auto img = DeviceImage::create(vk, Resolution{ kSide, kSide }, PixelFormat::kRGBA8);
+
+    // Write a gradient via CUDA into the array.
+    const auto src = make_gradient(kSide, kSide);
+    REQUIRE(cudaMemcpy2DToArray(img->cuda_array(), 0, 0, src.data(), kSide * 4, kSide * 4, kSide,
+                                cudaMemcpyHostToDevice) == cudaSuccess);
+    REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+    // Read it back via CUDA — verifies the data took. (M3b's milestone
+    // test will read back via Vulkan sampling; that's the real round-
+    // trip but requires a graphics pipeline we don't have yet.)
+    std::vector<uint8_t> dst(kBytes);
+    REQUIRE(cudaMemcpy2DFromArray(dst.data(), kSide * 4, img->cuda_array(), 0, 0, kSide * 4, kSide,
+                                  cudaMemcpyDeviceToHost) == cudaSuccess);
+    REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+    CHECK(dst == src);
+}
diff --git a/src/viz/shaders/CMakeLists.txt b/src/viz/shaders/CMakeLists.txt
new file mode 100644
index 000000000..636447b71
--- /dev/null
+++ b/src/viz/shaders/CMakeLists.txt
@@ -0,0 +1,9 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.20)
+
+# GLSL shader sources compiled to SPIR-V at build time and embedded as
+# C arrays in generated headers. Consumers link viz::shaders to get the
+# headers on their include path.
+add_subdirectory(cpp)
diff --git a/src/viz/shaders/cpp/CMakeLists.txt b/src/viz/shaders/cpp/CMakeLists.txt
new file mode 100644
index 000000000..0f934aaf7
--- /dev/null
+++ b/src/viz/shaders/cpp/CMakeLists.txt
@@ -0,0 +1,66 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.20)
+
+# glslangValidator is part of the Vulkan SDK / glslang-tools package.
+# We don't FetchContent it — assume it's installed (libvulkan-dev pulls
+# it in on Ubuntu, brew install glslang on macOS).
+find_program(GLSLANG_VALIDATOR glslangValidator REQUIRED)
+message(STATUS "Using glslangValidator: ${GLSLANG_VALIDATOR}")
+
+set(SHADERS_GEN_DIR "${CMAKE_CURRENT_BINARY_DIR}/inc/viz/shaders")
+file(MAKE_DIRECTORY "${SHADERS_GEN_DIR}")
+
+# compile_shader(<glsl_path> <var_name>): adds custom commands that
+# run glslangValidator + the byte-emit helper, and registers the
+# generated header so it's a build-time dependency for any consumer
+# of viz_shaders.
+function(compile_shader GLSL_PATH VAR_NAME)
+    get_filename_component(GLSL_NAME "${GLSL_PATH}" NAME)
+    set(SPV_PATH "${CMAKE_CURRENT_BINARY_DIR}/${GLSL_NAME}.spv")
+    set(HEADER_PATH "${SHADERS_GEN_DIR}/${GLSL_NAME}.spv.h")
+
+    add_custom_command(
+        OUTPUT "${SPV_PATH}"
+        COMMAND ${GLSLANG_VALIDATOR} -V "${CMAKE_CURRENT_SOURCE_DIR}/${GLSL_PATH}" -o "${SPV_PATH}"
+        DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${GLSL_PATH}"
+        COMMENT "Compiling shader ${GLSL_NAME} -> SPIR-V"
+        VERBATIM
+    )
+
+    add_custom_command(
+        OUTPUT "${HEADER_PATH}"
+        COMMAND ${CMAKE_COMMAND}
+            -DSPV_PATH=${SPV_PATH}
+            -DHEADER_PATH=${HEADER_PATH}
+            -DVAR_NAME=${VAR_NAME}
+            -P "${CMAKE_CURRENT_SOURCE_DIR}/compile_shader.cmake"
+        DEPENDS "${SPV_PATH}" "${CMAKE_CURRENT_SOURCE_DIR}/compile_shader.cmake"
+        COMMENT "Embedding ${GLSL_NAME}.spv -> ${VAR_NAME}"
+        VERBATIM
+    )
+
+    set_property(GLOBAL APPEND PROPERTY VIZ_SHADER_HEADERS "${HEADER_PATH}")
+endfunction()
+
+# Shader programs:
+#   textured_quad  — fullscreen quad sampling a combined image sampler.
+#                    Used by QuadLayer to display a CUDA-fed texture.
+compile_shader(textured_quad.vert  kTexturedQuadVertSpv)
+compile_shader(textured_quad.frag  kTexturedQuadFragSpv)
+
+# INTERFACE library exposing the generated headers + a phony custom
+# target that ensures the headers exist before any consumer compiles.
+get_property(_SHADER_HEADERS GLOBAL PROPERTY VIZ_SHADER_HEADERS)
+add_custom_target(viz_shaders_generate ALL DEPENDS ${_SHADER_HEADERS})
+
+add_library(viz_shaders INTERFACE)
+target_include_directories(viz_shaders
+    INTERFACE
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/inc>
+)
+add_dependencies(viz_shaders viz_shaders_generate)
+
+# Aliased as viz::shaders.
+add_library(viz::shaders ALIAS viz_shaders)
diff --git a/src/viz/shaders/cpp/compile_shader.cmake b/src/viz/shaders/cpp/compile_shader.cmake
new file mode 100644
index 000000000..32e27d4ca
--- /dev/null
+++ b/src/viz/shaders/cpp/compile_shader.cmake
@@ -0,0 +1,56 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Helper script invoked from add_custom_command to convert a SPIR-V
+# binary into a C header containing an inline constexpr byte array.
+# Driven by command-line variables: SPV_PATH, HEADER_PATH, VAR_NAME.
+#
+# Output (HEADER_PATH) looks like:
+#   #pragma once
+#   #include <cstddef>
+#   #include <cstdint>
+#   namespace viz::shaders {
+#   inline constexpr alignas(uint32_t) unsigned char kVarName[] = {
+#       0x03, 0x02, 0x23, 0x07, ...
+#   };
+#   inline constexpr size_t kVarNameSize = sizeof(kVarName);
+#   } // namespace viz::shaders
+
+if(NOT DEFINED SPV_PATH OR NOT DEFINED HEADER_PATH OR NOT DEFINED VAR_NAME)
+    message(FATAL_ERROR "compile_shader.cmake requires SPV_PATH, HEADER_PATH, VAR_NAME")
+endif()
+
+file(READ "${SPV_PATH}" SPV_CONTENT HEX)
+string(LENGTH "${SPV_CONTENT}" SPV_HEX_LEN)
+math(EXPR SPV_BYTE_LEN "${SPV_HEX_LEN} / 2")
+if(SPV_BYTE_LEN EQUAL 0)
+    message(FATAL_ERROR "compile_shader.cmake: ${SPV_PATH} is empty")
+endif()
+
+# Format every two hex chars as 0xab,. Single long line — SPIR-V binaries
+# are small (~KB) and compilers handle long initializer lines fine.
+string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1, " SPV_BYTES "${SPV_CONTENT}")
+
+set(HEADER_CONTENT
+"// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// AUTO-GENERATED FROM ${SPV_PATH} BY compile_shader.cmake. DO NOT EDIT.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace viz::shaders
+{
+
+alignas(uint32_t) inline constexpr unsigned char ${VAR_NAME}[] = {
+    ${SPV_BYTES}
+};
+inline constexpr size_t ${VAR_NAME}Size = sizeof(${VAR_NAME});
+
+} // namespace viz::shaders
+")
+
+file(WRITE "${HEADER_PATH}" "${HEADER_CONTENT}")
diff --git a/src/viz/shaders/cpp/textured_quad.frag b/src/viz/shaders/cpp/textured_quad.frag
new file mode 100644
index 000000000..29b92d6c9
--- /dev/null
+++ b/src/viz/shaders/cpp/textured_quad.frag
@@ -0,0 +1,17 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// Samples a combined image sampler at descriptor binding 0. Used by
+// QuadLayer (ships in M3b) to display a CUDA-fed texture as a quad.
+
+#version 450
+
+layout(set = 0, binding = 0) uniform sampler2D u_texture;
+
+layout(location = 0) in vec2 v_uv;
+layout(location = 0) out vec4 out_color;
+
+void main()
+{
+    out_color = texture(u_texture, v_uv);
+}
diff --git a/src/viz/shaders/cpp/textured_quad.vert b/src/viz/shaders/cpp/textured_quad.vert
new file mode 100644
index 000000000..52cbe5eec
--- /dev/null
+++ b/src/viz/shaders/cpp/textured_quad.vert
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// Fullscreen-quad vertex shader that also forwards UVs for sampling.
+// Uses the same gl_VertexIndex trick as solid_color.vert: 3 vertices
+// → covers the screen, no vertex buffer needed.
+
+#version 450
+
+layout(location = 0) out vec2 v_uv;
+
+void main()
+{
+    v_uv = vec2((gl_VertexIndex << 1) & 2, gl_VertexIndex & 2);
+    gl_Position = vec4(v_uv * 2.0 - 1.0, 0.0, 1.0);
+}
diff --git a/src/viz/shaders_tests/CMakeLists.txt b/src/viz/shaders_tests/CMakeLists.txt
new file mode 100644
index 000000000..a25478921
--- /dev/null
+++ b/src/viz/shaders_tests/CMakeLists.txt
@@ -0,0 +1,7 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.20)
+
+# Build C++ tests
+add_subdirectory(cpp)
diff --git a/src/viz/shaders_tests/cpp/CMakeLists.txt b/src/viz/shaders_tests/cpp/CMakeLists.txt
new file mode 100644
index 000000000..66159f706
--- /dev/null
+++ b/src/viz/shaders_tests/cpp/CMakeLists.txt
@@ -0,0 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.20)
+
+add_executable(viz_shaders_tests
+    test_shader_blobs.cpp
+)
+
+target_link_libraries(viz_shaders_tests PRIVATE
+    viz::shaders
+    Catch2::Catch2WithMain
+)
+
+message(STATUS "viz_shaders_tests target enabled with Catch2")
+
+catch_discover_tests(viz_shaders_tests ADD_TAGS_AS_LABELS)
diff --git a/src/viz/shaders_tests/cpp/test_shader_blobs.cpp b/src/viz/shaders_tests/cpp/test_shader_blobs.cpp
new file mode 100644
index 000000000..8637cf6e2
--- /dev/null
+++ b/src/viz/shaders_tests/cpp/test_shader_blobs.cpp
@@ -0,0 +1,44 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// Smoke tests for the embedded SPIR-V blobs: verify the CMake
+// glslangValidator + byte-embed pipeline produced something that
+// looks like real SPIR-V. Doesn't run the shaders — just sanity-checks
+// the bytes.
+
+#include <catch2/catch_test_macros.hpp>
+#include <viz/shaders/textured_quad.frag.spv.h>
+#include <viz/shaders/textured_quad.vert.spv.h>
+
+#include <cstdint>
+#include <cstring>
+
+namespace
+{
+
+// SPIR-V magic word, little-endian.
+constexpr uint32_t kSpvMagic = 0x07230203;
+
+// Returns the first uint32 of `bytes` as a little-endian word.
+uint32_t first_word_le(const unsigned char* bytes)
+{
+    uint32_t w = 0;
+    std::memcpy(&w, bytes, sizeof(w));
+    return w;
+}
+
+} // namespace
+
+TEST_CASE("textured_quad.vert.spv blob is non-empty and starts with SPIR-V magic", "[unit][shaders]")
+{
+    REQUIRE(viz::shaders::kTexturedQuadVertSpvSize >= 4);
+    REQUIRE(viz::shaders::kTexturedQuadVertSpvSize % 4 == 0);
+    CHECK(first_word_le(viz::shaders::kTexturedQuadVertSpv) == kSpvMagic);
+}
+
+TEST_CASE("textured_quad.frag.spv blob is non-empty and starts with SPIR-V magic", "[unit][shaders]")
+{
+    REQUIRE(viz::shaders::kTexturedQuadFragSpvSize >= 4);
+    REQUIRE(viz::shaders::kTexturedQuadFragSpvSize % 4 == 0);
+    CHECK(first_word_le(viz::shaders::kTexturedQuadFragSpv) == kSpvMagic);
+}