diff --git a/cmake/cuda_targets.cmake b/cmake/cuda_targets.cmake
index 30637bf2..ca3ae05f 100644
--- a/cmake/cuda_targets.cmake
+++ b/cmake/cuda_targets.cmake
@@ -22,7 +22,8 @@ if(NVMOLKIT_CUDA_TARGET_MODE STREQUAL "native")
       "NVMOLKIT_CUDA_TARGET_MODE=native: Using native CUDA architecture for fast local builds"
   )
 elseif(NVMOLKIT_CUDA_TARGET_MODE STREQUAL "full")
-  set(_nvmolkit_cuda_arch_list "70-real;75-real;80-real;86-real;90-real;90")
+  set(_nvmolkit_cuda_arch_list
+      "70-real;75-real;80-real;86-real;89-real;90-real")
   if(DEFINED CUDAToolkit_VERSION)
     string(REPLACE "." ";" _cuda_version_list "${CUDAToolkit_VERSION}")
     list(GET _cuda_version_list 0 _cuda_major)
@@ -40,6 +41,13 @@ elseif(NVMOLKIT_CUDA_TARGET_MODE STREQUAL "full")
       message(
         STATUS "CUDA < 12.8 detected, Blackwell (100-real) arch not enabled")
     endif()
+    if(_cuda_version_num GREATER_EQUAL 1209)
+      list(APPEND _nvmolkit_cuda_arch_list "120")
+      message(
+        STATUS
+          "CUDA >= 12.9 detected, enabling Blackwell (120 + PTX) for forward compatibility"
+      )
+    endif()
   endif()
 
   set(CMAKE_CUDA_ARCHITECTURES "${_nvmolkit_cuda_arch_list}")
@@ -110,7 +118,7 @@ if(CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
     message(FATAL_ERROR "Failed to build detect_cuda_arch.cu")
   endif()
   # _native_cc will be something like "86"
-  foreach(cc IN ITEMS 80 86 89 90)
+  foreach(cc IN ITEMS 80 86 89 90 100 120)
     if(_native_cc STREQUAL "${cc}")
       add_definitions(-DNVMOLKIT_CUDA_CC_${cc}=1)
     else()
@@ -118,7 +126,7 @@ if(CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
     endif()
   endforeach()
 else()
-  foreach(cc IN ITEMS 80 86 89 90)
+  foreach(cc IN ITEMS 80 86 89 90 100 120)
     string(REPLACE ";" " " _cuda_arch_str "${CMAKE_CUDA_ARCHITECTURES}")
     string(REGEX MATCH "(^| )${cc}(-real)?( |$)" _match "${_cuda_arch_str}")
     if(_match)
diff --git a/src/similarity_kernels.cu b/src/similarity_kernels.cu
index 4d0c6cb6..01b81880 100644
--- a/src/similarity_kernels.cu
+++ b/src/similarity_kernels.cu
@@ -30,9 +30,10 @@ namespace {
 //! Detects if we can use BMMA tensor operations for the given device compute capability,
 //! taking into account compile-time targets.
 bool supportsTensorOps(const int major, const int minor) {
-  if (major != 8 && major != 9) {
-    return false;  // BMMA instructions are only available on Ampere, Ada, Hopper.
-    // TODO: Check if blackwell accepts BMMA instructions.
+  // BMMA m16n8k256 .b1 {.and,.xor}.popc is supported on sm_80+ per PTX ISA, including Blackwell.
+  // We explicitly support Ampere/Ada (8.x), Hopper (9.0), and Blackwell sm_100 / sm_120.
+  if (major != 8 && major != 9 && major != 10 && major != 12) {
+    return false;
   }
 
   // Now we do compile time checks. Account for forward compatibilty,
@@ -51,6 +52,13 @@ bool supportsTensorOps(const int major, const int minor) {
   if (NVMOLKIT_CUDA_CC_90 && major == 9) {
     return true;
   }
+  // Blackwell builds are per-arch (-real); each macro only matches its exact SM.
+  if (NVMOLKIT_CUDA_CC_100 && major == 10 && minor == 0) {
+    return true;
+  }
+  if (NVMOLKIT_CUDA_CC_120 && major == 12 && minor == 0) {
+    return true;
+  }
   return false;
 }
 
@@ -371,7 +379,7 @@ __global__ void tanimotoCrossSimilarityKernel(const cuda::std::span<const kThrea
                                               const size_t                                      elementsPerMolecule,
                                               cuda::std::span<double>                           similarities,
                                               const size_t                                      offset) {
-#if __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 1000
+#if __CUDA_ARCH__ >= 800
   crossSimilarityKernelTensorOp<SimilarityType::Tanimoto,
                                 kThreadReductionType,
                                 double,
@@ -593,7 +601,7 @@ __global__ void cosineCrossSimilarityKernel(const cuda::std::span<const kThreadR
                                             const size_t                                      elementsPerMolecule,
                                             cuda::std::span<double>                           similarities,
                                             const size_t                                      offset) {
-#if __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 1000
+#if __CUDA_ARCH__ >= 800
   crossSimilarityKernelTensorOp<SimilarityType::Cosine,
                                 kThreadReductionType,
                                 double,
diff --git a/src/substruct/substruct_kernels.cu b/src/substruct/substruct_kernels.cu
index b91afb36..332544e4 100644
--- a/src/substruct/substruct_kernels.cu
+++ b/src/substruct/substruct_kernels.cu
@@ -85,8 +85,12 @@ __host__ __device__ constexpr int getComputeCapability() {
 
 /// Max threads per SM for each compute capability
 __host__ __device__ constexpr int getMaxThreadsPerSM(int sm) {
+  if (sm == 120)
+    return 1536;  // Consumer Blackwell (sm_120)
+  if (sm == 100)
+    return 2048;  // Datacenter Blackwell (sm_100)
   if (sm >= 90)
-    return 2048;  // Hopper+
+    return 2048;  // Hopper
   if (sm == 80)
     return 2048;  // A100
   if (sm >= 86)