diff --git a/.github/actions/setup-cuda/action.yml b/.github/actions/setup-cuda/action.yml new file mode 100644 index 000000000..3dc9590fa --- /dev/null +++ b/.github/actions/setup-cuda/action.yml @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +name: Setup CUDA Toolkit +description: Install CUDA Toolkit (cudart + nvcc) via NVIDIA's apt repository. + Architecture-aware (x86_64 / sbsa) so it works on both ubuntu-22.04 and + ubuntu-22.04-arm runners. Replaces the Jimver/cuda-toolkit action, which + hardcodes /x86_64/ in the repo URL and fails on ARM runners. + +inputs: + cuda-version: + description: 'CUDA major-minor version (e.g. "12-4" for 12.4)' + required: false + default: '12-4' + +runs: + using: composite + steps: + - name: Install CUDA toolkit (apt) + shell: bash + run: | + set -euo pipefail + ARCH=$(dpkg --print-architecture) + case "$ARCH" in + amd64) CUDA_ARCH=x86_64 ;; + arm64) CUDA_ARCH=sbsa ;; + *) echo "setup-cuda: unsupported arch '$ARCH'" >&2; exit 1 ;; + esac + REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${CUDA_ARCH}" + TMPDIR=$(mktemp -d) + wget -q -O "${TMPDIR}/cuda-keyring.deb" "${REPO_URL}/cuda-keyring_1.1-1_all.deb" + sudo dpkg -i "${TMPDIR}/cuda-keyring.deb" + rm -rf "${TMPDIR}" + sudo apt-get update -y + sudo apt-get install -y --no-install-recommends \ + cuda-nvcc-${{ inputs.cuda-version }} \ + cuda-cudart-dev-${{ inputs.cuda-version }} + # Expose to subsequent steps. CMake's find_package(CUDAToolkit) + # picks up CUDA_PATH or finds nvcc via PATH. + CUDA_VERSION_DOTTED="$(echo '${{ inputs.cuda-version }}' | tr '-' '.')" + echo "CUDA_PATH=/usr/local/cuda-${CUDA_VERSION_DOTTED}" >> "$GITHUB_ENV" + echo "/usr/local/cuda-${CUDA_VERSION_DOTTED}/bin" >> "$GITHUB_PATH" diff --git a/.github/workflows/build-ubuntu.yml b/.github/workflows/build-ubuntu.yml index eafb25592..58b17402e 100644 --- a/.github/workflows/build-ubuntu.yml +++ b/.github/workflows/build-ubuntu.yml @@ -39,13 +39,21 @@ jobs: - name: Install Apt dependencies run: | sudo apt-get update - sudo apt-get install -y build-essential cmake libx11-dev clang-format-14 ccache libvulkan-dev + sudo apt-get install -y build-essential cmake libx11-dev clang-format-14 ccache libvulkan-dev glslang-tools - name: Install patchelf (Release only) if: ${{ matrix.build_type == 'Release' }} run: | sudo apt-get update sudo apt-get install -y patchelf + + # CUDA toolkit needed at link time for viz_core (device_image + # links libcudart). The wheel itself excludes libcuda.so.1 via + # auditwheel; runtime CUDA is the consumer's responsibility. + # Custom action handles both x86_64 and ARM64 (sbsa) repos. + - name: Install CUDA toolkit + uses: ./.github/actions/setup-cuda + - name: Setup CloudXR SDK id: cloudxr-sdk uses: ./.github/actions/setup-cloudxr-sdk @@ -196,6 +204,10 @@ jobs: tar -xzvf viz-tests.tar.gz -C viz-tests chmod +x viz-tests/viz_*_tests + # No CUDA Toolkit install on the GPU runner — viz_core static-links + # libcudart_static, so the binaries only need the NVIDIA driver + # (libcuda.so.1) which the runner already has. + - name: Run viz [gpu] tests run: | set -euo pipefail @@ -262,7 +274,10 @@ jobs: - name: Install Apt dependencies run: | sudo apt-get update - sudo apt-get install -y build-essential cmake libx11-dev clang-format-14 ccache libvulkan-dev + sudo apt-get install -y build-essential cmake libx11-dev clang-format-14 ccache libvulkan-dev glslang-tools + + - name: Install CUDA toolkit + uses: ./.github/actions/setup-cuda - name: Cache ccache uses: actions/cache@v5 diff --git a/.github/workflows/build-windows.yml b/.github/workflows/build-windows.yml index 0ef005806..a9cd2d93a 100644 --- a/.github/workflows/build-windows.yml +++ b/.github/workflows/build-windows.yml @@ -69,6 +69,18 @@ jobs: version: 1.3.290.0 cache: true + # CUDA Toolkit needed at link time for viz_core (device_image links + # cudart). Jimver action works correctly on Windows (the arm64 + # /x86_64/ hardcoding bug is Linux-only); runs NVIDIA's silent + # network installer and sets CUDA_PATH so find_package(CUDAToolkit) + # locates the install. + - name: Install CUDA Toolkit + uses: Jimver/cuda-toolkit@v0.2.19 + with: + cuda: '12.4.0' + method: 'network' + sub-packages: '["nvcc", "cudart"]' + - name: Configure CMake # Note: # sccache does not work with VSBuild, so we use Ninja generator here. diff --git a/deps/README.md b/deps/README.md index f004d29db..3cb4f3496 100644 --- a/deps/README.md +++ b/deps/README.md @@ -54,14 +54,24 @@ machine and are located via `find_package`. - **Min version**: 1.2 (Televiz checks `VK_API_VERSION_1_2` at device select time). - **License**: Apache 2.0 (loader); per-vendor for ICD drivers. -### CUDA Toolkit (optional, used by examples) +### CUDA Toolkit - **Locator**: `find_package(CUDAToolkit REQUIRED)` -- **Required by**: `examples/camera_streamer/` only today; `viz` adds CUDA - dependency once CUDA-Vulkan interop lands; CI must have CUDA installed for - GPU test runner to exercise these paths. +- **Required by**: `viz/core/` (`CudaTexture`, `DeviceImage` link to + `CUDA::cudart`) when `BUILD_VIZ=ON`; also `examples/camera_streamer/`. - **Min version**: 12.0 +- **CI**: installed via `Jimver/cuda-toolkit@v0.2.x` action with + `nvcc` + `cudart` sub-packages. - **License**: NVIDIA EULA +### glslangValidator (shader compiler) +- **Locator**: `find_program(GLSLANG_VALIDATOR glslangValidator REQUIRED)` +- **Required by**: `viz/shaders/` to compile `.vert` / `.frag` GLSL + into SPIR-V at build time. +- **Linux**: `apt-get install glslang-tools` +- **Windows**: ships with the LunarG Vulkan SDK +- **macOS**: `brew install glslang` +- **License**: BSD-3 / Khronos + ## Third-Party Dependencies ### OpenXR SDK diff --git a/src/viz/AGENTS.md b/src/viz/AGENTS.md index 8219ce243..67735dedd 100644 --- a/src/viz/AGENTS.md +++ b/src/viz/AGENTS.md @@ -17,11 +17,19 @@ sibling `_tests/` directory: - **`viz/core/`** — foundational types + Vulkan/CUDA infrastructure. Library: `viz_core`. Today: `VkContext`, `VizBuffer`, `Pose3D`, `Fov`, - `Resolution`, `ViewInfo`, `PixelFormat`, `RenderTarget`, `FrameSync`. - Future: `cuda_texture`. Math types (`glm::vec3`, `glm::quat`, + `Resolution`, `ViewInfo`, `PixelFormat`, `RenderTarget`, `FrameSync`, + `HostImage`, `DeviceImage`. `HostImage` / `DeviceImage` are the + symmetric pair of owning 2D pixel buffers (CPU bytes vs CUDA-Vulkan + interop) — both expose `VizBuffer view()` so generic helpers branch + on `VizBuffer::space`. Math types (`glm::vec3`, `glm::quat`, `glm::mat4`) come from GLM 1.0.1 (FetchContent in `deps/third_party/`); use `glm::value_ptr(mat)` to get a raw `float*` for Vulkan / CUDA upload (POD-equivalent layout, no copy). + CUDA-Vulkan interop requires CUDA Toolkit at link time + (`CUDAToolkit::cudart`). `VkContext::init()` matches the current + CUDA device to the chosen Vulkan physical device by UUID — every + viz_core type can assume CUDA and Vulkan are talking to the same + GPU without re-doing the match. - **`viz/layers/`** — `LayerBase` and concrete layers (`QuadLayer`, etc.). Library: `viz_layers` (INTERFACE / header-only today; promoted to STATIC when the first concrete layer ships). Depends on `viz_core`. @@ -39,11 +47,18 @@ sibling `_tests/` directory: frame loop, type conversion). Library: `viz_xr`. **Optional** behind `BUILD_VIZ_XR`. Depends on `viz_core` + OpenXR. - **`viz/python/`** — pybind11 module `_viz`, exposed as `isaacteleop.viz`. -- **`viz/shaders/`** — GLSL → SPIR-V at build time. +- **`viz/shaders/`** — GLSL → SPIR-V at build time. Library: `viz_shaders` + (INTERFACE — exposes generated headers `viz/shaders/.spv.h`, + each containing an `inline constexpr alignas(uint32_t) unsigned char` + byte array + a `Size` constant). Compilation runs `glslangValidator` + (system-installed; CI gets `glslang-tools` apt package). Add new + shader programs by dropping `.vert` / `.frag` in + `viz/shaders/cpp/` and calling `compile_shader(.vert kVarName)` + in the local CMakeLists. Test directories follow the same per-module pattern: `viz/core_tests/`, `viz/layers_tests/`, `viz/session_tests/`, -`viz/xr_tests/`. +`viz/shaders_tests/`, `viz/xr_tests/`. `src/viz/CMakeLists.txt` is an **orchestrator only** — it adds the sub-module sub-directories. Sub-module `CMakeLists.txt` files build the @@ -57,8 +72,17 @@ Build paths that ship viz (the wheel CI on Linux + Windows) pass `-DBUILD_VIZ=ON` explicitly. Lean Dockerfiles (`examples/teleop_ros2/Dockerfile`) get viz-free builds for free. -When `BUILD_VIZ=ON` you must have Vulkan headers + loader installed: -`libvulkan-dev` on Linux, LunarG SDK on Windows. +When `BUILD_VIZ=ON` the build machine must have: +- **Vulkan headers + loader**: `libvulkan-dev` on Linux, LunarG SDK on + Windows. +- **CUDA Toolkit** (cudart for link, nvcc not strictly required today + but expected to be needed for kernels in M3b+): apt + `nvidia-cuda-toolkit` or the official NVIDIA installer / CI action + (`Jimver/cuda-toolkit`). The wheel excludes `libcuda.so.1` — + consumers supply it via NVIDIA driver. +- **glslangValidator** for shader compilation: `glslang-tools` apt + package on Linux, `brew install glslang` on macOS, ships with the + Vulkan SDK on Windows. ## Code conventions diff --git a/src/viz/CMakeLists.txt b/src/viz/CMakeLists.txt index 18bf7f6c6..642e080b2 100644 --- a/src/viz/CMakeLists.txt +++ b/src/viz/CMakeLists.txt @@ -31,6 +31,10 @@ add_subdirectory(layers) # layer that drives the per-frame loop and manages the layer registry. add_subdirectory(session) +# Shaders: GLSL sources compiled to SPIR-V at build time and embedded +# as C arrays. Header-only INTERFACE library viz::shaders. +add_subdirectory(shaders) + # Python bindings (stub today; pybind11 module added later). if(BUILD_PYTHON_BINDINGS) add_subdirectory(python) @@ -41,4 +45,5 @@ if(BUILD_TESTING) add_subdirectory(core_tests) add_subdirectory(layers_tests) add_subdirectory(session_tests) + add_subdirectory(shaders_tests) endif() diff --git a/src/viz/core/cpp/CMakeLists.txt b/src/viz/core/cpp/CMakeLists.txt index 2591443b3..6962bc5ca 100644 --- a/src/viz/core/cpp/CMakeLists.txt +++ b/src/viz/core/cpp/CMakeLists.txt @@ -3,12 +3,28 @@ cmake_minimum_required(VERSION 3.20) -# Vulkan is required for vk_context. +# Vulkan is always required for vk_context / render_target / etc. find_package(Vulkan REQUIRED) +# CUDA Toolkit provides the CUDA-Vulkan interop runtime APIs +# (cudaImportExternalMemory, cudaExternalMemoryGetMappedMipmappedArray) +# used by device_image. Required when BUILD_VIZ=ON. +# +# We link CUDA::cudart_static (not CUDA::cudart) so consumers of viz_core +# don't have a runtime libcudart.so.12 dependency: +# - The wheel ships a fully self-contained _viz.so. auditwheel sees no +# libcudart symbols to bundle. +# - viz_*_tests artifacts run on GPU runners that have only the NVIDIA +# driver (libcuda.so.1) installed; static cudart sidesteps the +# "install the toolkit on every runner" problem. +# Tradeoff: ~3 MB binary growth per consumer. Safe because viz_core is +# the only CUDA-using component in this codebase today. +find_package(CUDAToolkit REQUIRED) + # Foundational types and Vulkan/CUDA primitives shared by all viz sub-modules. -# Static lib; consumed in-tree via viz::viz_core. +# Static lib; consumed in-tree via viz::core. add_library(viz_core STATIC + device_image.cpp frame_sync.cpp render_target.cpp vk_context.cpp @@ -30,6 +46,7 @@ target_link_libraries(viz_core PUBLIC Vulkan::Vulkan glm::glm + CUDA::cudart_static ) # Aliased as viz::core (consumers say viz::core, not viz::viz_core). diff --git a/src/viz/core/cpp/device_image.cpp b/src/viz/core/cpp/device_image.cpp new file mode 100644 index 000000000..a02053b58 --- /dev/null +++ b/src/viz/core/cpp/device_image.cpp @@ -0,0 +1,438 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include +#include + +// Posix close() lives in on Linux/macOS; Windows uses _close() +// from . The fd-close path is unreachable at runtime on Windows +// (vkGetMemoryFdKHR isn't available there — import_to_cuda throws before +// memory_fd_ is ever assigned), but the code still has to compile under +// MSVC for the experimental Windows build. +#ifdef _WIN32 +# include +namespace +{ +inline int close_fd(int fd) noexcept +{ + return ::_close(fd); +} +} // namespace +#else +# include +namespace +{ +inline int close_fd(int fd) noexcept +{ + return ::close(fd); +} +} // namespace +#endif + +namespace viz +{ + +namespace +{ + +void check_vk(VkResult result, const char* what) +{ + if (result != VK_SUCCESS) + { + throw std::runtime_error(std::string("DeviceImage: ") + what + " failed: VkResult=" + std::to_string(result)); + } +} + +void check_cuda(cudaError_t result, const char* what) +{ + if (result != cudaSuccess) + { + throw std::runtime_error(std::string("DeviceImage: ") + what + " failed: " + cudaGetErrorString(result)); + } +} + +uint32_t find_memory_type(VkPhysicalDevice physical_device, uint32_t type_bits, VkMemoryPropertyFlags properties) +{ + VkPhysicalDeviceMemoryProperties mem_props; + vkGetPhysicalDeviceMemoryProperties(physical_device, &mem_props); + for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) + { + if ((type_bits & (1u << i)) != 0 && (mem_props.memoryTypes[i].propertyFlags & properties) == properties) + { + return i; + } + } + throw std::runtime_error("DeviceImage: no Vulkan memory type matching requested properties"); +} + +VkFormat to_vk_format(PixelFormat format) +{ + switch (format) + { + case PixelFormat::kRGBA8: + // UNORM (not SRGB) so CUDA writes round-trip without an + // implicit sRGB encode on the Vulkan side. Color management + // is the layer's concern. + return VK_FORMAT_R8G8B8A8_UNORM; + case PixelFormat::kD32F: + return VK_FORMAT_D32_SFLOAT; + } + throw std::runtime_error("DeviceImage: unsupported PixelFormat"); +} + +cudaChannelFormatDesc to_cuda_format(PixelFormat format) +{ + switch (format) + { + case PixelFormat::kRGBA8: + return cudaCreateChannelDesc(); + case PixelFormat::kD32F: + return cudaCreateChannelDesc(); + } + throw std::runtime_error("DeviceImage: unsupported PixelFormat"); +} + +} // namespace + +std::unique_ptr DeviceImage::create(const VkContext& ctx, Resolution resolution, PixelFormat format) +{ + if (!ctx.is_initialized()) + { + throw std::invalid_argument("DeviceImage: VkContext is not initialized"); + } + if (resolution.width == 0 || resolution.height == 0) + { + throw std::invalid_argument("DeviceImage: resolution must be non-zero"); + } + std::unique_ptr img(new DeviceImage(ctx, resolution, format)); + img->init(); + return img; +} + +DeviceImage::DeviceImage(const VkContext& ctx, Resolution resolution, PixelFormat format) + : ctx_(&ctx), resolution_(resolution), format_(format), vk_format_(to_vk_format(format)) +{ +} + +DeviceImage::~DeviceImage() +{ + destroy(); +} + +void DeviceImage::init() +{ + try + { + create_vk_image_with_external_memory(); + create_vk_image_view(); + import_to_cuda(); + transition_to_shader_read(); + } + catch (...) + { + destroy(); + throw; + } +} + +void DeviceImage::destroy() +{ + // Pin CUDA device for this thread so the CUDA frees below land on + // the right device even if destroy() runs on a thread that never + // ran VkContext::init(). Best-effort — destructor must not throw. + if (ctx_ != nullptr && ctx_->cuda_device_id() >= 0) + { + (void)cudaSetDevice(ctx_->cuda_device_id()); + } + + // CUDA side first; CUDA holds a dup'd handle on the underlying + // memory, so the VkDeviceMemory must outlive the CUDA mapping. + // cudaDeviceSynchronize ensures any caller-issued async CUDA work + // (e.g. cudaMemcpy2DToArrayAsync) has retired before we free the + // array — otherwise CUDA may UAF its own staging. + if (cuda_mipmapped_array_ != nullptr || cuda_external_memory_ != nullptr) + { + (void)cudaDeviceSynchronize(); + } + if (cuda_mipmapped_array_ != nullptr) + { + (void)cudaFreeMipmappedArray(cuda_mipmapped_array_); + cuda_mipmapped_array_ = nullptr; + cuda_array_ = nullptr; + } + if (cuda_external_memory_ != nullptr) + { + (void)cudaDestroyExternalMemory(cuda_external_memory_); + cuda_external_memory_ = nullptr; + } + if (memory_fd_ >= 0) + { + // CUDA dups the fd internally on import, so we close our copy. + // If import failed before our explicit close, fd_ may still + // hold our copy — close it here. + close_fd(memory_fd_); + memory_fd_ = -1; + } + + if (ctx_ == nullptr) + { + return; + } + const VkDevice device = ctx_->device(); + if (device == VK_NULL_HANDLE) + { + return; + } + // Wait for all GPU work to retire before tearing down Vulkan + // resources. + (void)vkDeviceWaitIdle(device); + if (command_pool_ != VK_NULL_HANDLE) + { + vkDestroyCommandPool(device, command_pool_, nullptr); + command_pool_ = VK_NULL_HANDLE; + } + if (image_view_ != VK_NULL_HANDLE) + { + vkDestroyImageView(device, image_view_, nullptr); + image_view_ = VK_NULL_HANDLE; + } + if (image_ != VK_NULL_HANDLE) + { + vkDestroyImage(device, image_, nullptr); + image_ = VK_NULL_HANDLE; + } + if (memory_ != VK_NULL_HANDLE) + { + vkFreeMemory(device, memory_, nullptr); + memory_ = VK_NULL_HANDLE; + } + current_layout_ = VK_IMAGE_LAYOUT_UNDEFINED; +} + +VizBuffer DeviceImage::view() const noexcept +{ + VizBuffer b; + b.data = static_cast(cuda_array_); + b.width = resolution_.width; + b.height = resolution_.height; + b.format = format_; + b.pitch = static_cast(resolution_.width) * bytes_per_pixel(format_); + b.space = MemorySpace::kDevice; + return b; +} + +void DeviceImage::create_vk_image_with_external_memory() +{ + const VkDevice device = ctx_->device(); + + // Image with external-memory export flag. Optimal tiling — CUDA + // accesses the image via cudaArray_t, not raw memory, so opaque + // GPU layout is fine. + VkExternalMemoryImageCreateInfo ext_image_info{}; + ext_image_info.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO; + ext_image_info.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; + + VkImageCreateInfo info{}; + info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; + info.pNext = &ext_image_info; + info.imageType = VK_IMAGE_TYPE_2D; + info.format = vk_format_; + info.extent = { resolution_.width, resolution_.height, 1 }; + info.mipLevels = 1; // Single level — when minification moiré shows up in + // XR distance views, expose mipLevels via Config and + // generate the chain via vkCmdBlitImage pre-render. + // Anisotropic filtering on the sampler is the cheaper + // first line of defense. + info.arrayLayers = 1; + info.samples = VK_SAMPLE_COUNT_1_BIT; + info.tiling = VK_IMAGE_TILING_OPTIMAL; + info.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT; + info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + + check_vk(vkCreateImage(device, &info, nullptr, &image_), "vkCreateImage"); + + VkMemoryRequirements reqs; + vkGetImageMemoryRequirements(device, image_, &reqs); + + // Memory backing the image: device-local + exportable as POSIX fd. + // No VkMemoryDedicatedAllocateInfo / cudaExternalMemoryDedicated — + // a generic allocation works for sampled 2D images and avoids the + // dedicated-allocation extension wiring. + VkExportMemoryAllocateInfo export_info{}; + export_info.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO; + export_info.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; + + VkMemoryAllocateInfo alloc{}; + alloc.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + alloc.pNext = &export_info; + alloc.allocationSize = reqs.size; + alloc.memoryTypeIndex = + find_memory_type(ctx_->physical_device(), reqs.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); + check_vk(vkAllocateMemory(device, &alloc, nullptr, &memory_), "vkAllocateMemory"); + check_vk(vkBindImageMemory(device, image_, memory_, 0), "vkBindImageMemory"); + + auto vkGetMemoryFdKHR = reinterpret_cast(vkGetDeviceProcAddr(device, "vkGetMemoryFdKHR")); + if (vkGetMemoryFdKHR == nullptr) + { + throw std::runtime_error( + "DeviceImage: vkGetMemoryFdKHR not available " + "(VK_KHR_external_memory_fd not enabled?)"); + } + VkMemoryGetFdInfoKHR fd_info{}; + fd_info.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; + fd_info.memory = memory_; + fd_info.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; + check_vk(vkGetMemoryFdKHR(device, &fd_info, &memory_fd_), "vkGetMemoryFdKHR"); + + // Used only for transition_to_*; tiny pool, default flags. + VkCommandPoolCreateInfo pool_info{}; + pool_info.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + pool_info.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; + pool_info.queueFamilyIndex = ctx_->queue_family_index(); + check_vk(vkCreateCommandPool(device, &pool_info, nullptr, &command_pool_), "vkCreateCommandPool"); +} + +void DeviceImage::create_vk_image_view() +{ + VkImageViewCreateInfo info{}; + info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + info.image = image_; + info.viewType = VK_IMAGE_VIEW_TYPE_2D; + info.format = vk_format_; + info.subresourceRange.aspectMask = + (format_ == PixelFormat::kD32F) ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_COLOR_BIT; + info.subresourceRange.baseMipLevel = 0; + info.subresourceRange.levelCount = 1; + info.subresourceRange.baseArrayLayer = 0; + info.subresourceRange.layerCount = 1; + check_vk(vkCreateImageView(ctx_->device(), &info, nullptr, &image_view_), "vkCreateImageView"); +} + +void DeviceImage::import_to_cuda() +{ + // cudaSetDevice is per-host-thread; VkContext set it on the init + // thread, but DeviceImage::create() may run on a different one. + // Re-pin here so cudaImportExternalMemory / GetMappedMipmappedArray + // talk to the same physical GPU as Vulkan. + check_cuda(cudaSetDevice(ctx_->cuda_device_id()), "cudaSetDevice"); + + VkMemoryRequirements reqs; + vkGetImageMemoryRequirements(ctx_->device(), image_, &reqs); + + cudaExternalMemoryHandleDesc ext_desc{}; + ext_desc.type = cudaExternalMemoryHandleTypeOpaqueFd; + ext_desc.handle.fd = memory_fd_; + ext_desc.size = reqs.size; + ext_desc.flags = 0; + + check_cuda(cudaImportExternalMemory(&cuda_external_memory_, &ext_desc), "cudaImportExternalMemory"); + + // CUDA dup'd the fd internally; close ours so we don't double-free. + close_fd(memory_fd_); + memory_fd_ = -1; + + cudaExternalMemoryMipmappedArrayDesc array_desc{}; + array_desc.offset = 0; + array_desc.formatDesc = to_cuda_format(format_); + array_desc.extent = make_cudaExtent(resolution_.width, resolution_.height, 0); + array_desc.flags = cudaArrayColorAttachment; + array_desc.numLevels = 1; + + check_cuda(cudaExternalMemoryGetMappedMipmappedArray(&cuda_mipmapped_array_, cuda_external_memory_, &array_desc), + "cudaExternalMemoryGetMappedMipmappedArray"); + check_cuda(cudaGetMipmappedArrayLevel(&cuda_array_, cuda_mipmapped_array_, 0), "cudaGetMipmappedArrayLevel"); +} + +void DeviceImage::transition_to_shader_read() +{ + if (current_layout_ == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) + { + return; + } + run_one_shot_layout_transition(current_layout_, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT); + current_layout_ = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; +} + +void DeviceImage::transition_to_transfer_dst() +{ + if (current_layout_ == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) + { + return; + } + run_one_shot_layout_transition(current_layout_, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_ACCESS_SHADER_READ_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT); + current_layout_ = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; +} + +void DeviceImage::run_one_shot_layout_transition(VkImageLayout old_layout, + VkImageLayout new_layout, + VkAccessFlags src_access, + VkAccessFlags dst_access, + VkPipelineStageFlags src_stage, + VkPipelineStageFlags dst_stage) +{ + const VkDevice device = ctx_->device(); + + VkCommandBufferAllocateInfo alloc{}; + alloc.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + alloc.commandPool = command_pool_; + alloc.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + alloc.commandBufferCount = 1; + VkCommandBuffer cmd = VK_NULL_HANDLE; + check_vk(vkAllocateCommandBuffers(device, &alloc, &cmd), "vkAllocateCommandBuffers(transition)"); + + // RAII: free the command buffer on every exit path (including + // exceptions from the check_vk calls below). The pool would + // eventually reclaim it on destroy(), but a retry loop after a + // transient queue submit failure would leak one cmd per attempt. + struct CmdGuard + { + VkDevice device; + VkCommandPool pool; + VkCommandBuffer cmd; + ~CmdGuard() + { + vkFreeCommandBuffers(device, pool, 1, &cmd); + } + } guard{ device, command_pool_, cmd }; + + VkCommandBufferBeginInfo begin{}; + begin.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + begin.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + check_vk(vkBeginCommandBuffer(cmd, &begin), "vkBeginCommandBuffer(transition)"); + + VkImageMemoryBarrier barrier{}; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.oldLayout = old_layout; + barrier.newLayout = new_layout; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = image_; + barrier.subresourceRange.aspectMask = + (format_ == PixelFormat::kD32F) ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_COLOR_BIT; + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + barrier.srcAccessMask = src_access; + barrier.dstAccessMask = dst_access; + vkCmdPipelineBarrier(cmd, src_stage, dst_stage, 0, 0, nullptr, 0, nullptr, 1, &barrier); + + check_vk(vkEndCommandBuffer(cmd), "vkEndCommandBuffer(transition)"); + + VkSubmitInfo submit{}; + submit.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submit.commandBufferCount = 1; + submit.pCommandBuffers = &cmd; + check_vk(vkQueueSubmit(ctx_->queue(), 1, &submit, VK_NULL_HANDLE), "vkQueueSubmit(transition)"); + check_vk(vkQueueWaitIdle(ctx_->queue()), "vkQueueWaitIdle(transition)"); +} + +} // namespace viz diff --git a/src/viz/core/cpp/inc/viz/core/device_image.hpp b/src/viz/core/cpp/inc/viz/core/device_image.hpp index c29537c0b..5fd82e364 100644 --- a/src/viz/core/cpp/inc/viz/core/device_image.hpp +++ b/src/viz/core/cpp/inc/viz/core/device_image.hpp @@ -3,26 +3,116 @@ #pragma once +#include +#include +#include + +#include +#include + namespace viz { -// Owning device-side 2D pixel buffer with CUDA-Vulkan interop. The -// symmetric counterpart to HostImage: HostImage owns CPU bytes, -// DeviceImage will own: -// - VkImage / VkBuffer + VkDeviceMemory exported via -// VK_KHR_external_memory_fd -// - cudaExternalMemory_t imported from that fd, plus the CUDA device -// pointer derived from it -// - paired cudaExternalSemaphore_t / VkSemaphore for acquire / release -// synchronization -// -// Returned by Televiz's mode-B submission path (acquire / release) when -// Televiz allocates the interop buffer for a layer to write into. +class VkContext; + +// Owning CUDA-Vulkan interop image. Vulkan allocates the VkImage +// (optimal tiling, sampled + transfer-dst); the backing VkDeviceMemory +// is exported via VK_KHR_external_memory_fd and imported into CUDA as +// a cudaArray_t. CUDA writes via cuda_array(); Vulkan samples via +// vk_image(). Symmetric counterpart to HostImage; both expose +// VizBuffer view() so helpers branch on VizBuffer::space. // -// Intentionally only forward-declared in this milestone — the -// implementation ships alongside CUDA-Vulkan interop. Callers may pass -// pointers / references to DeviceImage between modules but cannot -// instantiate one until then. -class DeviceImage; +// Synchronization is heavyweight today (cudaDeviceSynchronize + +// vkQueueWaitIdle); paired acquire / release semaphores arrive with +// QuadLayer. CUDA/Vulkan device matching is handled by VkContext. +class DeviceImage +{ +public: + // Throws std::invalid_argument on bad config; std::runtime_error + // on Vulkan or CUDA failure. Pre-initialized. + static std::unique_ptr create(const VkContext& ctx, Resolution resolution, PixelFormat format); + + ~DeviceImage(); + void destroy(); + + DeviceImage(const DeviceImage&) = delete; + DeviceImage& operator=(const DeviceImage&) = delete; + DeviceImage(DeviceImage&&) = delete; + DeviceImage& operator=(DeviceImage&&) = delete; + + // VizBuffer view (kDevice). `data` is the cudaArray_t cast to + // void*; it's an opaque CUDA handle, not a raw device pointer — + // use cuda_array() with cudaMemcpy2DToArrayAsync to write. + VizBuffer view() const noexcept; + + // CUDA write target. Lifetime tied to this DeviceImage. + cudaArray_t cuda_array() const noexcept + { + return cuda_array_; + } + + // Image lives in VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL after + // init; transition_to_*() below moves it back and forth. + VkImage vk_image() const noexcept + { + return image_; + } + VkImageView vk_image_view() const noexcept + { + return image_view_; + } + VkFormat vk_format() const noexcept + { + return vk_format_; + } + + Resolution resolution() const noexcept + { + return resolution_; + } + PixelFormat format() const noexcept + { + return format_; + } + + // Synchronous one-shot layout transitions (vkQueueSubmit + + // vkQueueWaitIdle). For tests / one-shot uploads — production + // layers record their own barriers in render commands. + void transition_to_shader_read(); + void transition_to_transfer_dst(); + +private: + explicit DeviceImage(const VkContext& ctx, Resolution resolution, PixelFormat format); + void init(); + + void create_vk_image_with_external_memory(); + void create_vk_image_view(); + void import_to_cuda(); + + void run_one_shot_layout_transition(VkImageLayout old_layout, + VkImageLayout new_layout, + VkAccessFlags src_access, + VkAccessFlags dst_access, + VkPipelineStageFlags src_stage, + VkPipelineStageFlags dst_stage); + + const VkContext* ctx_ = nullptr; + Resolution resolution_{}; + PixelFormat format_ = PixelFormat::kRGBA8; + VkFormat vk_format_ = VK_FORMAT_R8G8B8A8_UNORM; + VkImageLayout current_layout_ = VK_IMAGE_LAYOUT_UNDEFINED; + + VkImage image_ = VK_NULL_HANDLE; + VkDeviceMemory memory_ = VK_NULL_HANDLE; + VkImageView image_view_ = VK_NULL_HANDLE; + VkCommandPool command_pool_ = VK_NULL_HANDLE; // For layout transitions only. + + // CUDA dup's the fd internally on import; we close ours after. + int memory_fd_ = -1; + + cudaExternalMemory_t cuda_external_memory_ = nullptr; + cudaMipmappedArray_t cuda_mipmapped_array_ = nullptr; + cudaArray_t cuda_array_ = nullptr; // Level-0 view, non-owning. +}; } // namespace viz diff --git a/src/viz/core/cpp/inc/viz/core/vk_context.hpp b/src/viz/core/cpp/inc/viz/core/vk_context.hpp index e960f30ef..5f12c8690 100644 --- a/src/viz/core/cpp/inc/viz/core/vk_context.hpp +++ b/src/viz/core/cpp/inc/viz/core/vk_context.hpp @@ -43,6 +43,14 @@ struct PhysicalDeviceInfo // - A queue family with graphics + compute + transfer flags // // VkContext owns the Vulkan handles and tears them down on destruction. +// +// CUDA-Vulkan device matching: as part of init(), VkContext queries +// the chosen physical device's UUID and calls cudaSetDevice() on the +// matching CUDA device. This is required for CUDA-Vulkan interop +// (cudaImportExternalMemory) to succeed on multi-GPU machines, and +// it makes VkContext the single chokepoint for "which GPU is Televiz +// on" — every viz_core type that touches CUDA can assume the current +// CUDA device matches the Vulkan one. class VkContext { public: @@ -96,6 +104,15 @@ class VkContext uint32_t queue_family_index() const noexcept; VkQueue queue() const noexcept; + // CUDA device id matched to the chosen Vulkan physical device by + // UUID at init() time. Useful for callers that need to ensure + // their thread is on the right CUDA device before issuing CUDA + // calls — cudaSetDevice is per-host-thread, so a CudaTexture / + // DeviceImage created on a worker thread must call + // cudaSetDevice(ctx.cuda_device_id()) before any CUDA API. Returns + // -1 before init() has run. + int cuda_device_id() const noexcept; + // Enumerates all Vulkan-capable physical devices and returns their // properties. Useful for picking a specific GPU index on multi-GPU // machines before calling init(). @@ -109,6 +126,7 @@ class VkContext void create_instance(const Config& config); void select_physical_device(const Config& config); void create_logical_device(const Config& config); + void match_cuda_device_to_vulkan(); bool initialized_ = false; bool validation_enabled_ = false; @@ -117,6 +135,7 @@ class VkContext VkDevice device_ = VK_NULL_HANDLE; uint32_t queue_family_index_ = UINT32_MAX; VkQueue queue_ = VK_NULL_HANDLE; + int cuda_device_id_ = -1; }; } // namespace viz diff --git a/src/viz/core/cpp/vk_context.cpp b/src/viz/core/cpp/vk_context.cpp index 1046f1d8e..d2184d038 100644 --- a/src/viz/core/cpp/vk_context.cpp +++ b/src/viz/core/cpp/vk_context.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -190,6 +191,7 @@ void VkContext::init(const Config& config) create_instance(config); select_physical_device(config); create_logical_device(config); + match_cuda_device_to_vulkan(); initialized_ = true; } catch (...) @@ -214,6 +216,7 @@ void VkContext::destroy() physical_device_ = VK_NULL_HANDLE; queue_ = VK_NULL_HANDLE; queue_family_index_ = UINT32_MAX; + cuda_device_id_ = -1; validation_enabled_ = false; initialized_ = false; } @@ -248,6 +251,11 @@ VkQueue VkContext::queue() const noexcept return queue_; } +int VkContext::cuda_device_id() const noexcept +{ + return cuda_device_id_; +} + void VkContext::create_instance(const Config& config) { VkApplicationInfo app_info{}; @@ -406,6 +414,54 @@ void VkContext::create_logical_device(const Config& config) vkGetDeviceQueue(device_, queue_family_index_, 0, &queue_); } +void VkContext::match_cuda_device_to_vulkan() +{ + // Read the Vulkan physical device's UUID, then find the CUDA + // device with the same UUID and make it current. Required so + // every viz_core type that imports Vulkan memory into CUDA + // (e.g. DeviceImage::cudaImportExternalMemory) operates on the + // same physical GPU. On multi-GPU machines Vulkan and CUDA + // default to different devices and interop fails with + // cudaErrorUnknown. + VkPhysicalDeviceIDProperties id_props{}; + id_props.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES; + VkPhysicalDeviceProperties2 props2{}; + props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + props2.pNext = &id_props; + vkGetPhysicalDeviceProperties2(physical_device_, &props2); + + int cuda_count = 0; + cudaError_t err = cudaGetDeviceCount(&cuda_count); + if (err != cudaSuccess || cuda_count == 0) + { + throw std::runtime_error( + "VkContext: no CUDA devices visible — CUDA-Vulkan interop requires " + "a working CUDA driver"); + } + for (int i = 0; i < cuda_count; ++i) + { + cudaDeviceProp prop{}; + err = cudaGetDeviceProperties(&prop, i); + if (err != cudaSuccess) + { + continue; + } + if (std::memcmp(prop.uuid.bytes, id_props.deviceUUID, VK_UUID_SIZE) == 0) + { + err = cudaSetDevice(i); + if (err != cudaSuccess) + { + throw std::runtime_error(std::string("VkContext: cudaSetDevice failed: ") + cudaGetErrorString(err)); + } + cuda_device_id_ = i; + return; + } + } + throw std::runtime_error( + "VkContext: no CUDA device matches the Vulkan physical device's UUID — " + "CUDA-Vulkan interop requires same-GPU operation"); +} + std::vector VkContext::enumerate_physical_devices() { std::vector result; diff --git a/src/viz/core_tests/cpp/CMakeLists.txt b/src/viz/core_tests/cpp/CMakeLists.txt index c9d031d2e..656aff6c6 100644 --- a/src/viz/core_tests/cpp/CMakeLists.txt +++ b/src/viz/core_tests/cpp/CMakeLists.txt @@ -4,6 +4,7 @@ cmake_minimum_required(VERSION 3.20) add_executable(viz_core_tests + test_device_image.cpp test_frame_sync.cpp test_host_image.cpp test_render_target.cpp diff --git a/src/viz/core_tests/cpp/test_device_image.cpp b/src/viz/core_tests/cpp/test_device_image.cpp new file mode 100644 index 000000000..fcdb23af0 --- /dev/null +++ b/src/viz/core_tests/cpp/test_device_image.cpp @@ -0,0 +1,117 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +// [gpu] tests for DeviceImage: verify Vulkan handle creation, the +// CUDA cudaArray_t is usable from a cudaMemcpy2DToArray call, the +// VizBuffer view exposes the right shape with MemorySpace::kDevice, +// and a round-trip copy preserves pixel values. + +#include "test_helpers.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include + +using viz::DeviceImage; +using viz::MemorySpace; +using viz::PixelFormat; +using viz::Resolution; +using viz::VizBuffer; + +namespace +{ + +// Generate a deterministic gradient pattern. Channel R = column / W * +// 255, G = row / H * 255, B = column XOR row, A = 255. Easy to spot +// in a debugger and reproducible across test runs. +std::vector make_gradient(uint32_t w, uint32_t h) +{ + std::vector px(static_cast(w) * h * 4); + for (uint32_t y = 0; y < h; ++y) + { + for (uint32_t x = 0; x < w; ++x) + { + const size_t i = (static_cast(y) * w + x) * 4; + px[i + 0] = static_cast((x * 255u) / (w - 1u)); + px[i + 1] = static_cast((y * 255u) / (h - 1u)); + px[i + 2] = static_cast((x ^ y) & 0xff); + px[i + 3] = 255; + } + } + return px; +} + +} // namespace + +TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage creates valid Vulkan + CUDA handles", "[gpu][device_image]") +{ + auto img = DeviceImage::create(vk, Resolution{ 64, 64 }, PixelFormat::kRGBA8); + REQUIRE(img != nullptr); + CHECK(img->vk_image() != VK_NULL_HANDLE); + CHECK(img->vk_image_view() != VK_NULL_HANDLE); + CHECK(img->vk_format() == VK_FORMAT_R8G8B8A8_UNORM); + CHECK(img->cuda_array() != nullptr); + CHECK(img->resolution().width == 64); + CHECK(img->resolution().height == 64); + CHECK(img->format() == PixelFormat::kRGBA8); +} + +TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage::create rejects zero dimensions", "[gpu][device_image]") +{ + CHECK_THROWS_AS(DeviceImage::create(vk, Resolution{ 0, 64 }, PixelFormat::kRGBA8), std::invalid_argument); + CHECK_THROWS_AS(DeviceImage::create(vk, Resolution{ 64, 0 }, PixelFormat::kRGBA8), std::invalid_argument); +} + +TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage destroy is idempotent", "[gpu][device_image]") +{ + auto img = DeviceImage::create(vk, Resolution{ 32, 32 }, PixelFormat::kRGBA8); + img->destroy(); + CHECK(img->vk_image() == VK_NULL_HANDLE); + CHECK(img->vk_image_view() == VK_NULL_HANDLE); + CHECK(img->cuda_array() == nullptr); + img->destroy(); +} + +TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage::view exposes a kDevice VizBuffer", "[gpu][device_image]") +{ + auto img = DeviceImage::create(vk, Resolution{ 16, 16 }, PixelFormat::kRGBA8); + const VizBuffer v = img->view(); + CHECK(v.space == MemorySpace::kDevice); + CHECK(v.width == 16); + CHECK(v.height == 16); + CHECK(v.format == PixelFormat::kRGBA8); + CHECK(v.pitch == static_cast(16) * 4); + // .data points at the cudaArray_t handle (NOT a raw device pointer); + // sanity that it's non-null when the image is alive. + CHECK(v.data != nullptr); +} + +TEST_CASE_METHOD(viz::testing::GpuFixture, "DeviceImage round-trip preserves pixel pattern", "[gpu][device_image]") +{ + constexpr uint32_t kSide = 64; + constexpr size_t kBytes = static_cast(kSide) * kSide * 4; + + auto img = DeviceImage::create(vk, Resolution{ kSide, kSide }, PixelFormat::kRGBA8); + + // Write a gradient via CUDA into the array. + const auto src = make_gradient(kSide, kSide); + REQUIRE(cudaMemcpy2DToArray(img->cuda_array(), 0, 0, src.data(), kSide * 4, kSide * 4, kSide, + cudaMemcpyHostToDevice) == cudaSuccess); + REQUIRE(cudaDeviceSynchronize() == cudaSuccess); + + // Read it back via CUDA — verifies the data took. (M3b's milestone + // test will read back via Vulkan sampling; that's the real round- + // trip but requires a graphics pipeline we don't have yet.) + std::vector dst(kBytes); + REQUIRE(cudaMemcpy2DFromArray(dst.data(), kSide * 4, img->cuda_array(), 0, 0, kSide * 4, kSide, + cudaMemcpyDeviceToHost) == cudaSuccess); + REQUIRE(cudaDeviceSynchronize() == cudaSuccess); + + CHECK(dst == src); +} diff --git a/src/viz/shaders/CMakeLists.txt b/src/viz/shaders/CMakeLists.txt new file mode 100644 index 000000000..636447b71 --- /dev/null +++ b/src/viz/shaders/CMakeLists.txt @@ -0,0 +1,9 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.20) + +# GLSL shader sources compiled to SPIR-V at build time and embedded as +# C arrays in generated headers. Consumers link viz::shaders to get the +# headers on their include path. +add_subdirectory(cpp) diff --git a/src/viz/shaders/cpp/CMakeLists.txt b/src/viz/shaders/cpp/CMakeLists.txt new file mode 100644 index 000000000..0f934aaf7 --- /dev/null +++ b/src/viz/shaders/cpp/CMakeLists.txt @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.20) + +# glslangValidator is part of the Vulkan SDK / glslang-tools package. +# We don't FetchContent it — assume it's installed (libvulkan-dev pulls +# it in on Ubuntu, brew install glslang on macOS). +find_program(GLSLANG_VALIDATOR glslangValidator REQUIRED) +message(STATUS "Using glslangValidator: ${GLSLANG_VALIDATOR}") + +set(SHADERS_GEN_DIR "${CMAKE_CURRENT_BINARY_DIR}/inc/viz/shaders") +file(MAKE_DIRECTORY "${SHADERS_GEN_DIR}") + +# compile_shader( ): adds custom commands that +# run glslangValidator + the byte-emit helper, and registers the +# generated header so it's a build-time dependency for any consumer +# of viz_shaders. +function(compile_shader GLSL_PATH VAR_NAME) + get_filename_component(GLSL_NAME "${GLSL_PATH}" NAME) + set(SPV_PATH "${CMAKE_CURRENT_BINARY_DIR}/${GLSL_NAME}.spv") + set(HEADER_PATH "${SHADERS_GEN_DIR}/${GLSL_NAME}.spv.h") + + add_custom_command( + OUTPUT "${SPV_PATH}" + COMMAND ${GLSLANG_VALIDATOR} -V "${CMAKE_CURRENT_SOURCE_DIR}/${GLSL_PATH}" -o "${SPV_PATH}" + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${GLSL_PATH}" + COMMENT "Compiling shader ${GLSL_NAME} -> SPIR-V" + VERBATIM + ) + + add_custom_command( + OUTPUT "${HEADER_PATH}" + COMMAND ${CMAKE_COMMAND} + -DSPV_PATH=${SPV_PATH} + -DHEADER_PATH=${HEADER_PATH} + -DVAR_NAME=${VAR_NAME} + -P "${CMAKE_CURRENT_SOURCE_DIR}/compile_shader.cmake" + DEPENDS "${SPV_PATH}" "${CMAKE_CURRENT_SOURCE_DIR}/compile_shader.cmake" + COMMENT "Embedding ${GLSL_NAME}.spv -> ${VAR_NAME}" + VERBATIM + ) + + set_property(GLOBAL APPEND PROPERTY VIZ_SHADER_HEADERS "${HEADER_PATH}") +endfunction() + +# Shader programs: +# textured_quad — fullscreen quad sampling a combined image sampler. +# Used by QuadLayer to display a CUDA-fed texture. +compile_shader(textured_quad.vert kTexturedQuadVertSpv) +compile_shader(textured_quad.frag kTexturedQuadFragSpv) + +# INTERFACE library exposing the generated headers + a phony custom +# target that ensures the headers exist before any consumer compiles. +get_property(_SHADER_HEADERS GLOBAL PROPERTY VIZ_SHADER_HEADERS) +add_custom_target(viz_shaders_generate ALL DEPENDS ${_SHADER_HEADERS}) + +add_library(viz_shaders INTERFACE) +target_include_directories(viz_shaders + INTERFACE + $ +) +add_dependencies(viz_shaders viz_shaders_generate) + +# Aliased as viz::shaders. +add_library(viz::shaders ALIAS viz_shaders) diff --git a/src/viz/shaders/cpp/compile_shader.cmake b/src/viz/shaders/cpp/compile_shader.cmake new file mode 100644 index 000000000..32e27d4ca --- /dev/null +++ b/src/viz/shaders/cpp/compile_shader.cmake @@ -0,0 +1,56 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Helper script invoked from add_custom_command to convert a SPIR-V +# binary into a C header containing an inline constexpr byte array. +# Driven by command-line variables: SPV_PATH, HEADER_PATH, VAR_NAME. +# +# Output (HEADER_PATH) looks like: +# #pragma once +# #include +# #include +# namespace viz::shaders { +# inline constexpr alignas(uint32_t) unsigned char kVarName[] = { +# 0x03, 0x02, 0x23, 0x07, ... +# }; +# inline constexpr size_t kVarNameSize = sizeof(kVarName); +# } // namespace viz::shaders + +if(NOT DEFINED SPV_PATH OR NOT DEFINED HEADER_PATH OR NOT DEFINED VAR_NAME) + message(FATAL_ERROR "compile_shader.cmake requires SPV_PATH, HEADER_PATH, VAR_NAME") +endif() + +file(READ "${SPV_PATH}" SPV_CONTENT HEX) +string(LENGTH "${SPV_CONTENT}" SPV_HEX_LEN) +math(EXPR SPV_BYTE_LEN "${SPV_HEX_LEN} / 2") +if(SPV_BYTE_LEN EQUAL 0) + message(FATAL_ERROR "compile_shader.cmake: ${SPV_PATH} is empty") +endif() + +# Format every two hex chars as 0xab,. Single long line — SPIR-V binaries +# are small (~KB) and compilers handle long initializer lines fine. +string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1, " SPV_BYTES "${SPV_CONTENT}") + +set(HEADER_CONTENT +"// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// AUTO-GENERATED FROM ${SPV_PATH} BY compile_shader.cmake. DO NOT EDIT. + +#pragma once + +#include +#include + +namespace viz::shaders +{ + +alignas(uint32_t) inline constexpr unsigned char ${VAR_NAME}[] = { + ${SPV_BYTES} +}; +inline constexpr size_t ${VAR_NAME}Size = sizeof(${VAR_NAME}); + +} // namespace viz::shaders +") + +file(WRITE "${HEADER_PATH}" "${HEADER_CONTENT}") diff --git a/src/viz/shaders/cpp/textured_quad.frag b/src/viz/shaders/cpp/textured_quad.frag new file mode 100644 index 000000000..29b92d6c9 --- /dev/null +++ b/src/viz/shaders/cpp/textured_quad.frag @@ -0,0 +1,17 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +// Samples a combined image sampler at descriptor binding 0. Used by +// QuadLayer (ships in M3b) to display a CUDA-fed texture as a quad. + +#version 450 + +layout(set = 0, binding = 0) uniform sampler2D u_texture; + +layout(location = 0) in vec2 v_uv; +layout(location = 0) out vec4 out_color; + +void main() +{ + out_color = texture(u_texture, v_uv); +} diff --git a/src/viz/shaders/cpp/textured_quad.vert b/src/viz/shaders/cpp/textured_quad.vert new file mode 100644 index 000000000..52cbe5eec --- /dev/null +++ b/src/viz/shaders/cpp/textured_quad.vert @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +// Fullscreen-quad vertex shader that also forwards UVs for sampling. +// Uses the same gl_VertexIndex trick as solid_color.vert: 3 vertices +// → covers the screen, no vertex buffer needed. + +#version 450 + +layout(location = 0) out vec2 v_uv; + +void main() +{ + v_uv = vec2((gl_VertexIndex << 1) & 2, gl_VertexIndex & 2); + gl_Position = vec4(v_uv * 2.0 - 1.0, 0.0, 1.0); +} diff --git a/src/viz/shaders_tests/CMakeLists.txt b/src/viz/shaders_tests/CMakeLists.txt new file mode 100644 index 000000000..a25478921 --- /dev/null +++ b/src/viz/shaders_tests/CMakeLists.txt @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.20) + +# Build C++ tests +add_subdirectory(cpp) diff --git a/src/viz/shaders_tests/cpp/CMakeLists.txt b/src/viz/shaders_tests/cpp/CMakeLists.txt new file mode 100644 index 000000000..66159f706 --- /dev/null +++ b/src/viz/shaders_tests/cpp/CMakeLists.txt @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.20) + +add_executable(viz_shaders_tests + test_shader_blobs.cpp +) + +target_link_libraries(viz_shaders_tests PRIVATE + viz::shaders + Catch2::Catch2WithMain +) + +message(STATUS "viz_shaders_tests target enabled with Catch2") + +catch_discover_tests(viz_shaders_tests ADD_TAGS_AS_LABELS) diff --git a/src/viz/shaders_tests/cpp/test_shader_blobs.cpp b/src/viz/shaders_tests/cpp/test_shader_blobs.cpp new file mode 100644 index 000000000..8637cf6e2 --- /dev/null +++ b/src/viz/shaders_tests/cpp/test_shader_blobs.cpp @@ -0,0 +1,44 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +// Smoke tests for the embedded SPIR-V blobs: verify the CMake +// glslangValidator + byte-embed pipeline produced something that +// looks like real SPIR-V. Doesn't run the shaders — just sanity-checks +// the bytes. + +#include +#include +#include + +#include +#include + +namespace +{ + +// SPIR-V magic word, little-endian. +constexpr uint32_t kSpvMagic = 0x07230203; + +// Returns the first uint32 of `bytes` as a little-endian word. +uint32_t first_word_le(const unsigned char* bytes) +{ + uint32_t w = 0; + std::memcpy(&w, bytes, sizeof(w)); + return w; +} + +} // namespace + +TEST_CASE("textured_quad.vert.spv blob is non-empty and starts with SPIR-V magic", "[unit][shaders]") +{ + REQUIRE(viz::shaders::kTexturedQuadVertSpvSize >= 4); + REQUIRE(viz::shaders::kTexturedQuadVertSpvSize % 4 == 0); + CHECK(first_word_le(viz::shaders::kTexturedQuadVertSpv) == kSpvMagic); +} + +TEST_CASE("textured_quad.frag.spv blob is non-empty and starts with SPIR-V magic", "[unit][shaders]") +{ + REQUIRE(viz::shaders::kTexturedQuadFragSpvSize >= 4); + REQUIRE(viz::shaders::kTexturedQuadFragSpvSize % 4 == 0); + CHECK(first_word_le(viz::shaders::kTexturedQuadFragSpv) == kSpvMagic); +}