Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions cmake/cuda_targets.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ if(NVMOLKIT_CUDA_TARGET_MODE STREQUAL "native")
"NVMOLKIT_CUDA_TARGET_MODE=native: Using native CUDA architecture for fast local builds"
)
elseif(NVMOLKIT_CUDA_TARGET_MODE STREQUAL "full")
set(_nvmolkit_cuda_arch_list "70-real;75-real;80-real;86-real;90-real;90")
set(_nvmolkit_cuda_arch_list
"70-real;75-real;80-real;86-real;89-real;90-real")
Comment thread
scal444 marked this conversation as resolved.
if(DEFINED CUDAToolkit_VERSION)
string(REPLACE "." ";" _cuda_version_list "${CUDAToolkit_VERSION}")
list(GET _cuda_version_list 0 _cuda_major)
Expand All @@ -40,6 +41,13 @@ elseif(NVMOLKIT_CUDA_TARGET_MODE STREQUAL "full")
message(
STATUS "CUDA < 12.8 detected, Blackwell (100-real) arch not enabled")
endif()
if(_cuda_version_num GREATER_EQUAL 1209)
list(APPEND _nvmolkit_cuda_arch_list "120")
message(
STATUS
"CUDA >= 12.9 detected, enabling Blackwell (120 + PTX) for forward compatibility"
)
endif()
endif()

set(CMAKE_CUDA_ARCHITECTURES "${_nvmolkit_cuda_arch_list}")
Expand Down Expand Up @@ -110,15 +118,15 @@ if(CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
message(FATAL_ERROR "Failed to build detect_cuda_arch.cu")
endif()
# _native_cc will be something like "86"
foreach(cc IN ITEMS 80 86 89 90)
foreach(cc IN ITEMS 80 86 89 90 100 120)
if(_native_cc STREQUAL "${cc}")
add_definitions(-DNVMOLKIT_CUDA_CC_${cc}=1)
else()
add_definitions(-DNVMOLKIT_CUDA_CC_${cc}=0)
endif()
endforeach()
else()
foreach(cc IN ITEMS 80 86 89 90)
foreach(cc IN ITEMS 80 86 89 90 100 120)
string(REPLACE ";" " " _cuda_arch_str "${CMAKE_CUDA_ARCHITECTURES}")
string(REGEX MATCH "(^| )${cc}(-real)?( |$)" _match "${_cuda_arch_str}")
if(_match)
Expand Down
18 changes: 13 additions & 5 deletions src/similarity_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,10 @@ namespace {
//! Detects if we can use BMMA tensor operations for the given device compute capability,
//! taking into account compile-time targets.
bool supportsTensorOps(const int major, const int minor) {
if (major != 8 && major != 9) {
return false; // BMMA instructions are only available on Ampere, Ada, Hopper.
// TODO: Check if blackwell accepts BMMA instructions.
// BMMA m16n8k256 .b1 {.and,.xor}.popc is supported on sm_80+ per PTX ISA, including Blackwell.
// We explicitly support Ampere/Ada (8.x), Hopper (9.0), and Blackwell sm_100 / sm_120.
if (major != 8 && major != 9 && major != 10 && major != 12) {
return false;
}

// Now we do compile time checks. Account for forward compatibilty,
Expand All @@ -51,6 +52,13 @@ bool supportsTensorOps(const int major, const int minor) {
if (NVMOLKIT_CUDA_CC_90 && major == 9) {
return true;
}
// Blackwell builds are per-arch (-real); each macro only matches its exact SM.
if (NVMOLKIT_CUDA_CC_100 && major == 10 && minor == 0) {
return true;
}
if (NVMOLKIT_CUDA_CC_120 && major == 12 && minor == 0) {
return true;
}
return false;
}

Expand Down Expand Up @@ -371,7 +379,7 @@ __global__ void tanimotoCrossSimilarityKernel(const cuda::std::span<const kThrea
const size_t elementsPerMolecule,
cuda::std::span<double> similarities,
const size_t offset) {
#if __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 1000
#if __CUDA_ARCH__ >= 800
crossSimilarityKernelTensorOp<SimilarityType::Tanimoto,
kThreadReductionType,
double,
Expand Down Expand Up @@ -593,7 +601,7 @@ __global__ void cosineCrossSimilarityKernel(const cuda::std::span<const kThreadR
const size_t elementsPerMolecule,
cuda::std::span<double> similarities,
const size_t offset) {
#if __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 1000
#if __CUDA_ARCH__ >= 800
crossSimilarityKernelTensorOp<SimilarityType::Cosine,
kThreadReductionType,
double,
Expand Down
6 changes: 5 additions & 1 deletion src/substruct/substruct_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,12 @@ __host__ __device__ constexpr int getComputeCapability() {

/// Max threads per SM for each compute capability
__host__ __device__ constexpr int getMaxThreadsPerSM(int sm) {
if (sm == 120)
return 1536; // Consumer Blackwell (sm_120)
if (sm == 100)
return 2048; // Datacenter Blackwell (sm_100)
if (sm >= 90)
return 2048; // Hopper+
return 2048; // Hopper
if (sm == 80)
return 2048; // A100
if (sm >= 86)
Expand Down
Loading