diff --git a/cmake/cuda_targets.cmake b/cmake/cuda_targets.cmake index 30637bf2..ca3ae05f 100644 --- a/cmake/cuda_targets.cmake +++ b/cmake/cuda_targets.cmake @@ -22,7 +22,8 @@ if(NVMOLKIT_CUDA_TARGET_MODE STREQUAL "native") "NVMOLKIT_CUDA_TARGET_MODE=native: Using native CUDA architecture for fast local builds" ) elseif(NVMOLKIT_CUDA_TARGET_MODE STREQUAL "full") - set(_nvmolkit_cuda_arch_list "70-real;75-real;80-real;86-real;90-real;90") + set(_nvmolkit_cuda_arch_list + "70-real;75-real;80-real;86-real;89-real;90-real") if(DEFINED CUDAToolkit_VERSION) string(REPLACE "." ";" _cuda_version_list "${CUDAToolkit_VERSION}") list(GET _cuda_version_list 0 _cuda_major) @@ -40,6 +41,13 @@ elseif(NVMOLKIT_CUDA_TARGET_MODE STREQUAL "full") message( STATUS "CUDA < 12.8 detected, Blackwell (100-real) arch not enabled") endif() + if(_cuda_version_num GREATER_EQUAL 1209) + list(APPEND _nvmolkit_cuda_arch_list "120") + message( + STATUS + "CUDA >= 12.9 detected, enabling Blackwell (120 + PTX) for forward compatibility" + ) + endif() endif() set(CMAKE_CUDA_ARCHITECTURES "${_nvmolkit_cuda_arch_list}") @@ -110,7 +118,7 @@ if(CMAKE_CUDA_ARCHITECTURES STREQUAL "native") message(FATAL_ERROR "Failed to build detect_cuda_arch.cu") endif() # _native_cc will be something like "86" - foreach(cc IN ITEMS 80 86 89 90) + foreach(cc IN ITEMS 80 86 89 90 100 120) if(_native_cc STREQUAL "${cc}") add_definitions(-DNVMOLKIT_CUDA_CC_${cc}=1) else() @@ -118,7 +126,7 @@ if(CMAKE_CUDA_ARCHITECTURES STREQUAL "native") endif() endforeach() else() - foreach(cc IN ITEMS 80 86 89 90) + foreach(cc IN ITEMS 80 86 89 90 100 120) string(REPLACE ";" " " _cuda_arch_str "${CMAKE_CUDA_ARCHITECTURES}") string(REGEX MATCH "(^| )${cc}(-real)?( |$)" _match "${_cuda_arch_str}") if(_match) diff --git a/src/similarity_kernels.cu b/src/similarity_kernels.cu index 4d0c6cb6..01b81880 100644 --- a/src/similarity_kernels.cu +++ b/src/similarity_kernels.cu @@ -30,9 +30,10 @@ namespace { //! Detects if we can use BMMA tensor operations for the given device compute capability, //! taking into account compile-time targets. bool supportsTensorOps(const int major, const int minor) { - if (major != 8 && major != 9) { - return false; // BMMA instructions are only available on Ampere, Ada, Hopper. - // TODO: Check if blackwell accepts BMMA instructions. + // BMMA m16n8k256 .b1 {.and,.xor}.popc is supported on sm_80+ per PTX ISA, including Blackwell. + // We explicitly support Ampere/Ada (8.x), Hopper (9.0), and Blackwell sm_100 / sm_120. + if (major != 8 && major != 9 && major != 10 && major != 12) { + return false; } // Now we do compile time checks. Account for forward compatibilty, @@ -51,6 +52,13 @@ bool supportsTensorOps(const int major, const int minor) { if (NVMOLKIT_CUDA_CC_90 && major == 9) { return true; } + // Blackwell builds are per-arch (-real); each macro only matches its exact SM. + if (NVMOLKIT_CUDA_CC_100 && major == 10 && minor == 0) { + return true; + } + if (NVMOLKIT_CUDA_CC_120 && major == 12 && minor == 0) { + return true; + } return false; } @@ -371,7 +379,7 @@ __global__ void tanimotoCrossSimilarityKernel(const cuda::std::span similarities, const size_t offset) { -#if __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 1000 +#if __CUDA_ARCH__ >= 800 crossSimilarityKernelTensorOp similarities, const size_t offset) { -#if __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 1000 +#if __CUDA_ARCH__ >= 800 crossSimilarityKernelTensorOp= 90) - return 2048; // Hopper+ + return 2048; // Hopper if (sm == 80) return 2048; // A100 if (sm >= 86)