From b04b7377a925eea27222e14d80c23ae0417e539b Mon Sep 17 00:00:00 2001
From: Kangyan Zhou
Date: Thu, 21 May 2026 12:01:27 +0800
Subject: [PATCH] [CI] Revert cutlass-dsl to 4.5.0 and add GPU-family-aware libs cleanup

PR #25576 bumped nvidia-cutlass-dsl[cu13] from 4.5.0 to 4.5.1. The bump
exposed a latent file-level conflict between -libs-base and -libs-cu13
(both written by the additive [cu13] extra) as a hard GPUModuleOp
TypeError on H100: -libs-cu13's pybind11 binding changed to the new
MLIR-style signature `(operation: object)` without a matching bump to the
Python wrapper in nvidia-cutlass-dsl, so loading -libs-cu13's .so makes
the wrapper's old-style super().__init__() call fail.

Two changes:

1. Revert the version bump (4.5.1 -> 4.5.0). At 4.5.0 both .so files
   expose a compatible binding, so the same coexistence no longer
   crashes. This removes the active TypeError on H100 and on the CUDA-13
   Docker image for non-Blackwell users.

2. Add fix_cutlass_dsl_libs() to ci_install_dependency.sh, called from
   main() after download_flashinfer_cache. The function picks the right
   libs package per GPU family even at 4.5.0 to avoid two independent
   regressions that the silent conflict could still hit:

   Blackwell (IS_BLACKWELL=1, CU13):
     Purge -libs-base, force-reinstall -libs-cu13 so its files take
     precedence. -libs-base is CUDA-12.9-built and lacks the sm_110 arch
     alias that GB300/B200 need at cutlass import time.

   Non-Blackwell CU13 (H100, H200):
     Purge -libs-cu13, force-reinstall -libs-base. -libs-cu13 carries a
     CUDBG_EXCEPTION_WARP_ILLEGAL_ADDRESS regression in LoRA CUDA-graph
     capture on sm_90 (#25743 / reverted by #25756).

   Non-CU13:
     no-op (only -libs-base ever installed).

---
 python/pyproject.toml                    |  2 +-
 scripts/ci/cuda/ci_install_dependency.sh | 55 ++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index 376db0878620..3f3b7a03911f 100755
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -38,7 +38,7 @@ dependencies = [
   "ninja",
   "easydict", # Required by remote model code (e.g. DeepSeek-OCR) loaded via trust_remote_code; validated by transformers 5.4+ check_imports
   "numpy",
-  "nvidia-cutlass-dsl[cu13]==4.5.1",
+  "nvidia-cutlass-dsl[cu13]==4.5.0",
   "nvidia-ml-py",
   "openai-harmony==0.0.4",
   "openai==2.6.1",
diff --git a/scripts/ci/cuda/ci_install_dependency.sh b/scripts/ci/cuda/ci_install_dependency.sh
index 485d5b63c8cd..36cda9fc4802 100755
--- a/scripts/ci/cuda/ci_install_dependency.sh
+++ b/scripts/ci/cuda/ci_install_dependency.sh
@@ -335,6 +335,60 @@ download_flashinfer_cache() {
     mark_step_done "${FUNCNAME[0]}"
 }
 
+fix_cutlass_dsl_libs() {
+    # nvidia-cutlass-dsl[cu13] has additive extras on PyPI: both -libs-base AND
+    # -libs-cu13 are installed when [cu13] is requested. They write to the same
+    # site-packages paths with conflicting content. At 4.5.0 the two .so files
+    # expose a compatible Python binding so the conflict is silent; at 4.5.1+
+    # -libs-cu13's binding changed without a wrapper bump and the conflict
+    # surfaces as a GPUModuleOp TypeError at kernel-compile time
+    # (see vllm-project/vllm#40082).
+    #
+    # Independent of the API mismatch, the two .so files target different GPU
+    # families. Pick the right one per runner:
+    #
+    #   Blackwell (IS_BLACKWELL=1, CU13):
+    #     -libs-cu13 must win. It provides the sm_110 arch alias that the
+    #     CUDA-12.9-built -libs-base wheel lacks. Remove -libs-base and
+    #     force-reinstall -libs-cu13 so its files take precedence.
+    #
+    #   Non-Blackwell CU13 (H100, H200, …):
+    #     -libs-base must win. Keeping only -libs-cu13 causes a
+    #     CUDBG_EXCEPTION_WARP_ILLEGAL_ADDRESS regression in LoRA CUDA-graph
+    #     capture on H100 (#25743). Remove -libs-cu13 and force-reinstall
+    #     -libs-base so the original bindings are intact.
+    #
+    #   Non-CU13:
+    #     Only -libs-base is installed (no [cu13] extra) — no conflict.
+    #
+    if [ "$CU_MAJOR" != "13" ]; then
+        return
+    fi
+
+    # Function-local so the detected version does not leak into global scope.
+    local CUTLASS_DSL_VERSION
+    CUTLASS_DSL_VERSION=$(grep -Po -m1 'nvidia-cutlass-dsl(\[[^]]+\])?==\K[0-9A-Za-z\.\-]+' python/pyproject.toml || echo "")
+    if [ -z "$CUTLASS_DSL_VERSION" ]; then
+        echo "WARNING: could not detect nvidia-cutlass-dsl version from pyproject.toml; skipping libs fix"
+        return
+    fi
+
+    if [ "$IS_BLACKWELL" = "1" ]; then
+        # Blackwell: -libs-cu13 must win for sm_110 support.
+        echo "fix_cutlass_dsl_libs: Blackwell runner — purging -libs-base, reinstalling -libs-cu13==${CUTLASS_DSL_VERSION}"
+        $PIP_UNINSTALL_CMD nvidia-cutlass-dsl-libs-base $PIP_UNINSTALL_SUFFIX || true
+        $PIP_CMD install --force-reinstall --no-deps "nvidia-cutlass-dsl-libs-cu13==${CUTLASS_DSL_VERSION}" $PIP_INSTALL_SUFFIX
+    else
+        # Non-Blackwell CU13 (H100, H200): -libs-base must win to avoid LoRA
+        # CUDA-graph regression and GPUModuleOp TypeError.
+        echo "fix_cutlass_dsl_libs: non-Blackwell CU13 runner — purging -libs-cu13, reinstalling -libs-base==${CUTLASS_DSL_VERSION}"
+        $PIP_UNINSTALL_CMD nvidia-cutlass-dsl-libs-cu13 $PIP_UNINSTALL_SUFFIX || true
+        $PIP_CMD install --force-reinstall --no-deps "nvidia-cutlass-dsl-libs-base==${CUTLASS_DSL_VERSION}" $PIP_INSTALL_SUFFIX
+    fi
+
+    mark_step_done "${FUNCNAME[0]}"
+}
+
 stabilize_flashinfer_jit_paths() {
     # In venv mode, FlashInfer JIT writes build.ninja with hardcoded -isystem
     # paths. Per-job venvs get unique paths, but the JIT cache is shared on the
@@ -488,6 +542,7 @@ main() {
     install_sglang_kernel
     install_sglang_router
     download_flashinfer_cache
+    fix_cutlass_dsl_libs
     stabilize_flashinfer_jit_paths
     install_extra_deps
     install_test_tools