From b04b7377a925eea27222e14d80c23ae0417e539b Mon Sep 17 00:00:00 2001
From: Kangyan Zhou <zky314343421@gmail.com>
Date: Thu, 21 May 2026 12:01:27 +0800
Subject: [PATCH] [CI] Revert cutlass-dsl to 4.5.0 and add GPU-family-aware
 libs cleanup

PR #25576 bumped nvidia-cutlass-dsl[cu13] from 4.5.0 to 4.5.1. The bump
turned a latent file-level conflict between -libs-base and -libs-cu13
(both installed by the additive [cu13] extra) into a hard GPUModuleOp
TypeError on H100: -libs-cu13's pybind11 binding changed to the new
MLIR-style signature (operation: object) without a matching bump to the
Python wrapper in nvidia-cutlass-dsl, so loading -libs-cu13's .so makes
the wrapper's old-style super().__init__() call fail.

Two changes:

1. Revert the version bump (4.5.1 -> 4.5.0). At 4.5.0 both .so files
   expose a compatible binding, so the same coexistence no longer crashes.
   This removes the active TypeError on H100 and on the CUDA-13 Docker
   image for non-Blackwell users.

2. Add fix_cutlass_dsl_libs() to ci_install_dependency.sh, called from
   main() after download_flashinfer_cache. Even at 4.5.0 the conflict is
   only silent, not gone, so the function keeps a single libs package
   per GPU family to avoid two independent regressions:

     Blackwell (IS_BLACKWELL=1, CU13):
       Purge -libs-base, force-reinstall -libs-cu13 so its files take
       precedence. -libs-base is CUDA-12.9-built and lacks the sm_110
       arch alias that GB300/B200 need at cutlass import time.

     Non-Blackwell CU13 (H100, H200):
       Purge -libs-cu13, force-reinstall -libs-base. -libs-cu13 carries
       a CUDBG_EXCEPTION_WARP_ILLEGAL_ADDRESS regression in LoRA CUDA-
       graph capture on sm_90 (#25743 / reverted by #25756).

     Non-CU13: no-op (only -libs-base ever installed).
---
 python/pyproject.toml                    |  2 +-
 scripts/ci/cuda/ci_install_dependency.sh | 53 ++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index 376db0878620..3f3b7a03911f 100755
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -38,7 +38,7 @@ dependencies = [
   "ninja",
   "easydict",  # Required by remote model code (e.g. DeepSeek-OCR) loaded via trust_remote_code; validated by transformers 5.4+ check_imports
   "numpy",
-  "nvidia-cutlass-dsl[cu13]==4.5.1",
+  "nvidia-cutlass-dsl[cu13]==4.5.0",
   "nvidia-ml-py",
   "openai-harmony==0.0.4",
   "openai==2.6.1",
diff --git a/scripts/ci/cuda/ci_install_dependency.sh b/scripts/ci/cuda/ci_install_dependency.sh
index 485d5b63c8cd..36cda9fc4802 100755
--- a/scripts/ci/cuda/ci_install_dependency.sh
+++ b/scripts/ci/cuda/ci_install_dependency.sh
@@ -335,6 +335,58 @@ download_flashinfer_cache() {
     mark_step_done "${FUNCNAME[0]}"
 }
 
+fix_cutlass_dsl_libs() {
+    # Keep exactly one nvidia-cutlass-dsl libs wheel on CUDA-13 runners.
+    #
+    # nvidia-cutlass-dsl[cu13] has additive extras on PyPI: both -libs-base AND
+    # -libs-cu13 are installed when [cu13] is requested. They write to the same
+    # site-packages paths with conflicting content. At 4.5.0 the two .so files
+    # expose a compatible Python binding so the conflict is silent; at 4.5.1+
+    # -libs-cu13's binding changed without a wrapper bump and the conflict
+    # surfaces as a GPUModuleOp TypeError at kernel-compile time
+    # (see vllm-project/vllm#40082).
+    #
+    # The two .so files also target different GPU families, so pick per runner:
+    #   Blackwell (IS_BLACKWELL=1): keep -libs-cu13. It provides the sm_110
+    #     arch alias that the CUDA-12.9-built -libs-base wheel lacks.
+    #   Other CU13 (H100, H200, …): keep -libs-base. Keeping only -libs-cu13
+    #     causes a CUDBG_EXCEPTION_WARP_ILLEGAL_ADDRESS regression in LoRA
+    #     CUDA-graph capture on H100 (#25743).
+    #   Non-CU13: only -libs-base is installed (no [cu13] extra), so no-op.
+    #
+    # Globals read: CU_MAJOR, IS_BLACKWELL, PIP_CMD, PIP_INSTALL_SUFFIX,
+    #   PIP_UNINSTALL_CMD, PIP_UNINSTALL_SUFFIX.
+    # Reads python/pyproject.toml relative to the current working directory.
+    if [ "$CU_MAJOR" != "13" ]; then
+        return
+    fi
+
+    # Function-local so the detected pin does not leak into the caller's scope.
+    local version runner keep purge
+    version=$(grep -Po -m1 'nvidia-cutlass-dsl(\[[^]]+\])?==\K[0-9A-Za-z\.\-]+' python/pyproject.toml || echo "")
+    if [ -z "$version" ]; then
+        # Diagnostics go to stderr; skipping here is a deliberate best-effort.
+        echo "WARNING: could not detect nvidia-cutlass-dsl version from pyproject.toml; skipping libs fix" >&2
+        return
+    fi
+
+    if [ "$IS_BLACKWELL" = "1" ]; then
+        runner="Blackwell" keep="cu13" purge="base"
+    else
+        runner="non-Blackwell CU13" keep="base" purge="cu13"
+    fi
+
+    echo "fix_cutlass_dsl_libs: ${runner} runner — purging -libs-${purge}, reinstalling -libs-${keep}==${version}"
+    # A failing uninstall is ignored (|| true); the install below is not.
+    # PIP_* expansions are unquoted, so multi-word values split into arguments.
+    $PIP_UNINSTALL_CMD "nvidia-cutlass-dsl-libs-${purge}" $PIP_UNINSTALL_SUFFIX || true
+    # Force-reinstall the kept wheel so its files take precedence on disk.
+    $PIP_CMD install --force-reinstall --no-deps "nvidia-cutlass-dsl-libs-${keep}==${version}" $PIP_INSTALL_SUFFIX
+
+    # Reached only after the swap above; the early returns skip this marker.
+    mark_step_done "${FUNCNAME[0]}"
+}
+
 stabilize_flashinfer_jit_paths() {
     # In venv mode, FlashInfer JIT writes build.ninja with hardcoded -isystem
     # paths. Per-job venvs get unique paths, but the JIT cache is shared on the
@@ -488,6 +540,7 @@ main() {
     install_sglang_kernel
     install_sglang_router
     download_flashinfer_cache
+    fix_cutlass_dsl_libs
     stabilize_flashinfer_jit_paths
     install_extra_deps
     install_test_tools