lightseekorg · key4ng · May 17, 2026 · May 17, 2026 · gemini-code-assist · May 17, 2026
@@ -1,5 +1,5 @@
 name: 'Setup TRT-LLM Backend'
-description: 'Create Python venv, restore/save TRT-LLM wheel cache, and install TRT-LLM.'
+description: 'Create Python venv and install TensorRT-LLM pre-release wheel from PyPI.'
 
 runs:
   using: 'composite'
@@ -8,20 +8,6 @@ runs:
       shell: bash
       run: bash scripts/ci_setup_python_venv.sh
 
-    - name: Restore TRT-LLM wheel cache
-      id: trtllm-cache
-      uses: actions/cache/restore@v4
-      with:
-        path: /tmp/trtllm-wheel
-        key: trtllm-wheel-${{ runner.os }}-${{ hashFiles('scripts/ci_install_trtllm.sh') }}
-
     - name: Install TRT-LLM
       shell: bash
       run: bash scripts/ci_install_trtllm.sh
-
-    - name: Save TRT-LLM wheel cache
-      if: steps.trtllm-cache.outputs.cache-hit != 'true'
-      uses: actions/cache/save@v4
-      with:
-        path: /tmp/trtllm-wheel
-        key: trtllm-wheel-${{ runner.os }}-${{ hashFiles('scripts/ci_install_trtllm.sh') }}
@@ -1,11 +1,11 @@
 #!/bin/bash
-# Install TensorRT-LLM from source with gRPC support for CI
+# Install TensorRT-LLM pre-release wheel from PyPI for CI.
 #
-# gRPC server support (PR #11037) is not yet in a pip release,
-# so we build from source (main branch) which compiles the C++
-# extensions properly and includes the gRPC serve command.
-#
-# Cache version: 5 — rebuild from latest TRT-LLM main for Harmony tool calling fixes
+# As of 1.3.0rc14 (released 2026-05-07), the gRPC serve command from PR #11037
+# and the Harmony parser fixes (#12045, #12467) referenced by SMG #801 are all
+# included in the published pre-release wheel. We install it directly from PyPI
+# instead of building TensorRT-LLM from source, which saves ~30 min of CMake
+# compile time per CI run. See git history for the previous source-build logic.
 #
 # Prerequisites (expected on k8s-runner-gpu nodes):
 #   - NVIDIA driver 580+ (CUDA 13)
@@ -16,97 +16,19 @@
 
 set -euo pipefail
 
+TRTLLM_VERSION="1.3.0rc14"
 NCCL_VERSION_CONSTRAINT="nvidia-nccl-cu13>=2.28.9,<=2.29.2"
 
 # Activate venv if it exists
 if [ -f ".venv/bin/activate" ]; then
     source .venv/bin/activate
 fi
 
-# ── Check for cached wheel FIRST ─────────────────────────────────────────────
-# This allows us to skip heavy build dependencies when wheel is already cached
-TRTLLM_WHEEL_CACHE="/tmp/trtllm-wheel"
-mkdir -p "$TRTLLM_WHEEL_CACHE"
-CACHED_WHEEL=$(find "$TRTLLM_WHEEL_CACHE" -name "tensorrt_llm*.whl" 2>/dev/null | head -1 || true)
-
-if [ -n "$CACHED_WHEEL" ] && [ -f "$CACHED_WHEEL" ]; then
-    echo "=== Found cached TRT-LLM wheel: $CACHED_WHEEL ==="
-    echo "=== Installing runtime dependencies only (skipping build deps) ==="
-
-    # ── Runtime dependencies only ────────────────────────────────────────────
-    export DEBIAN_FRONTEND=noninteractive
-    sudo dpkg --configure -a --force-confnew 2>/dev/null || true
-
-    # Add NVIDIA apt repository if needed
-    if ! dpkg -l cuda-keyring 2>/dev/null | grep -q '^ii'; then
-        echo "Setting up NVIDIA apt repository..."
-        curl -fsSL -o /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-        sudo dpkg -i /tmp/cuda-keyring.deb
-        rm -f /tmp/cuda-keyring.deb
-    fi
-
-    sudo apt-get update
-    # Runtime deps: wheel links against CUDA 13 + TensorRT libs
-    sudo apt-get install -y libopenmpi-dev libnvinfer10 cuda-toolkit-13-0
-
-    # ── CUDA runtime setup ───────────────────────────────────────────────────
-    if [ -d "/usr/local/cuda-13.0" ]; then
-        export CUDA_HOME="/usr/local/cuda-13.0"
-    else
-        export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
-    fi
-    export PATH="$CUDA_HOME/bin:$PATH"
-    export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH:-}"
-
-    # ── Install pip and NCCL runtime ─────────────────────────────────────────
-    pip install --upgrade pip
-    pip install --no-cache-dir "$NCCL_VERSION_CONSTRAINT"
-
-    # ── Install cached wheel ─────────────────────────────────────────────────
-    # Use --extra-index-url for cu130 torch so pip resolves torch 2.10+cu130
-    # (cuda-bindings==13.x) instead of the default PyPI torch (cuda-bindings==12.9.4),
-    # which conflicts with tensorrt-llm's cuda-python>=13 requirement.
-    echo "Installing cached wheel..."
-    pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cu130 "$CACHED_WHEEL"
-
-    # ── Setup LD_LIBRARY_PATH ────────────────────────────────────────────────
-    SITE_PACKAGES=$(python3 -c "import site; print(site.getsitepackages()[0])")
-    NVIDIA_LIB_DIRS=$(find "$SITE_PACKAGES/nvidia" -name "lib" -type d 2>/dev/null | sort -u | paste -sd':')
-    if [ -n "$NVIDIA_LIB_DIRS" ]; then
-        export LD_LIBRARY_PATH="${NVIDIA_LIB_DIRS}:${LD_LIBRARY_PATH:-}"
-    fi
-
-    TRTLLM_LIB_DIR=$(find "$SITE_PACKAGES" -path "*/tensorrt_llm/libs" -type d 2>/dev/null | head -1)
-    if [ -n "$TRTLLM_LIB_DIR" ]; then
-        export LD_LIBRARY_PATH="${TRTLLM_LIB_DIR}:${LD_LIBRARY_PATH:-}"
-    fi
-
-    # Persist LD_LIBRARY_PATH for subsequent CI steps
-    if [ -n "${GITHUB_ENV:-}" ]; then
-        echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> "$GITHUB_ENV"
-    fi
-
-    # ── Verification ─────────────────────────────────────────────────────────
-    echo "=== TensorRT-LLM verification ==="
-    python3 -c "import tensorrt_llm; print(f'TensorRT-LLM version: {tensorrt_llm.__version__}')"
-    python3 -c "from tensorrt_llm.commands.serve import main; print('gRPC serve command: available')"
-    echo "Verifying gRPC serve command..."
-    python3 -m tensorrt_llm.commands.serve serve --help 2>&1 | head -20 || echo "WARNING: serve --help failed"
-
-    echo "TensorRT-LLM installation complete (from cache)"
-    exit 0
-fi
-
-# ══════════════════════════════════════════════════════════════════════════════
-# No cached wheel - full build required
-# ══════════════════════════════════════════════════════════════════════════════
-echo "=== No cached wheel found, building from source ==="
-
-# ── System dependencies (full build) ─────────────────────────────────────────
+# ── Runtime system dependencies ──────────────────────────────────────────────
 export DEBIAN_FRONTEND=noninteractive
 sudo dpkg --configure -a --force-confnew 2>/dev/null || true
 
-# Add NVIDIA CUDA/TensorRT apt repository (needed for libnvinfer-dev, tensorrt-dev)
+# Add NVIDIA apt repository if needed
 if ! dpkg -l cuda-keyring 2>/dev/null | grep -q '^ii'; then
     echo "Setting up NVIDIA apt repository..."
     curl -fsSL -o /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
@@ -115,216 +37,38 @@ if ! dpkg -l cuda-keyring 2>/dev/null | grep -q '^ii'; then
 fi
 
 sudo apt-get update
-# Full build deps: runtime + dev headers + build tools
-sudo apt-get install -y libopenmpi-dev git-lfs libnvinfer10 libnvinfer-dev tensorrt-dev cuda-toolkit-13-0 cmake
-
-# ── Fabric Manager for multi-GPU NCCL communication ───────────────────────────
-# Required for H100 with NVSwitch - the k8s GPU runners should have it pre-installed
-# Just try to start it if it's not running (don't try to install - causes dpkg errors)
-echo "Checking Fabric Manager status for multi-GPU support..."
-if command -v nv-fabricmanager &>/dev/null || [ -f /usr/bin/nv-fabricmanager ]; then
-    sudo systemctl start nvidia-fabricmanager 2>/dev/null || true
-    sudo systemctl status nvidia-fabricmanager --no-pager 2>/dev/null || echo "INFO: Fabric Manager not running (may not be needed for this GPU type)"
-else
-    echo "INFO: Fabric Manager not installed (may not be needed for this GPU type)"
-fi
+# Runtime deps: wheel links against CUDA 13 + TensorRT libs
+sudo apt-get install -y libopenmpi-dev libnvinfer10 cuda-toolkit-13-0
 
-# ── CUDA setup ───────────────────────────────────────────────────────────────
-# Prefer /usr/local/cuda-13.0 if it exists, otherwise fall back to /usr/local/cuda
+# ── CUDA runtime setup ───────────────────────────────────────────────────────
 if [ -d "/usr/local/cuda-13.0" ]; then
     export CUDA_HOME="/usr/local/cuda-13.0"
 else
     export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
 fi
-# Re-activate venv first, then add CUDA to PATH so it takes precedence
-if [ -f ".venv/bin/activate" ]; then
-    source .venv/bin/activate
-fi
 export PATH="$CUDA_HOME/bin:$PATH"
 export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH:-}"
-export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH:-}"
+export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
-export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH:-}"
+export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
 
-# Debug: print what CUDA we actually have
-echo "=== CUDA diagnostics ==="
-echo "CUDA_HOME=$CUDA_HOME"
-echo "PATH=$PATH"
-ls -la "$CUDA_HOME/bin/nvcc" 2>/dev/null || echo "WARNING: nvcc not at $CUDA_HOME/bin/nvcc"
-find /usr/local -name "nvcc" -type f 2>/dev/null || echo "WARNING: nvcc not found anywhere in /usr/local"
-which nvcc 2>/dev/null || echo "WARNING: nvcc not on PATH"
-nvcc --version 2>/dev/null || echo "WARNING: nvcc --version failed"
-nvidia-smi 2>/dev/null | head -4 || echo "WARNING: nvidia-smi not found"
-echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-<unset>}"
-python3 --version
-echo "=== end CUDA diagnostics ==="
-
-# ── TensorRT symlinks (for CMake to find TensorRT) ──────────────────────────
-sudo mkdir -p /usr/local/tensorrt
-sudo ln -sf /usr/include/x86_64-linux-gnu /usr/local/tensorrt/include
-sudo ln -sf /usr/lib/x86_64-linux-gnu /usr/local/tensorrt/lib
-
+# ── Install pip and NCCL runtime ─────────────────────────────────────────────
 pip install --upgrade pip
+pip install --no-cache-dir "$NCCL_VERSION_CONSTRAINT"
 
-# ── Clone TensorRT-LLM ──────────────────────────────────────────────────────
-TRTLLM_DIR="/tmp/tensorrt-llm-src"
-if [ ! -d "$TRTLLM_DIR" ]; then
-    echo "Cloning TensorRT-LLM main branch..."
-    git clone --depth 1 https://github.com/NVIDIA/TensorRT-LLM.git "$TRTLLM_DIR"
-fi
-
-cd "$TRTLLM_DIR"
-git lfs install --force
-git lfs pull
-
-# ── Install TensorRT-LLM Python requirements ─────────────────────────────────
-# Install nvidia-cutlass first - provides cutlass_library module needed during CMake configure
-# This is cleaner than relying on CMake's FetchContent which installs to user site-packages
-pip install --no-cache-dir nvidia-cutlass
-
-if [ -f "requirements-dev.txt" ]; then
-    echo "Installing TensorRT-LLM build requirements..."
-    pip install --no-cache-dir -r requirements-dev.txt
-fi
-
-# ── NCCL setup ──────────────────────────────────────────────────────────────
-# build_wheel.py runs pip install internally which can change the NCCL version.
-# Copy headers+libs to a fixed directory that pip can't overwrite, and point
-# NCCL_ROOT there for CMake.
-pip install --no-cache-dir --force-reinstall "$NCCL_VERSION_CONSTRAINT"
-
+# ── Install TensorRT-LLM pre-release wheel from NVIDIA's index ───────────────
+# PyPI only hosts the source tarball for tensorrt-llm — installing from there
+# would trigger a full source build. The pre-built linux_x86_64 wheels live on
+# https://pypi.nvidia.com, which we add as an extra index.
+#
+# The cu130 torch index is also needed so pip resolves torch 2.10+cu130
+# (cuda-bindings==13.x) instead of the default PyPI torch (cuda-bindings==12.9.4),
+# which conflicts with tensorrt-llm's cuda-python>=13 requirement.
+echo "Installing tensorrt-llm==${TRTLLM_VERSION} from pypi.nvidia.com..."
+pip install --no-cache-dir --pre \
+    --extra-index-url https://pypi.nvidia.com \
+    --extra-index-url https://download.pytorch.org/whl/cu130 \
+    "tensorrt-llm==${TRTLLM_VERSION}"
+
+# ── Setup LD_LIBRARY_PATH ────────────────────────────────────────────────────
 SITE_PACKAGES=$(python3 -c "import site; print(site.getsitepackages()[0])")
-NCCL_PIP_ROOT="$SITE_PACKAGES/nvidia/nccl"
-
-# Copy to a stable location that pip won't touch
-NCCL_ROOT="/tmp/nccl-stable"
-rm -rf "$NCCL_ROOT"
-mkdir -p "$NCCL_ROOT/include" "$NCCL_ROOT/lib"
-cp -a "$NCCL_PIP_ROOT/include/"* "$NCCL_ROOT/include/"
-cp -a "$NCCL_PIP_ROOT/lib/"* "$NCCL_ROOT/lib/"
-# Create libnccl.so symlink — pip only ships libnccl.so.2
-if [ -f "$NCCL_ROOT/lib/libnccl.so.2" ] && [ ! -e "$NCCL_ROOT/lib/libnccl.so" ]; then
-    ln -s libnccl.so.2 "$NCCL_ROOT/lib/libnccl.so"
-fi
-
-echo "=== NCCL diagnostics ==="
-echo "NCCL_ROOT=$NCCL_ROOT (stable copy, immune to pip downgrades)"
-ls -la "$NCCL_ROOT/include/" 2>/dev/null | head -5
-ls -la "$NCCL_ROOT/lib/" 2>/dev/null | head -5
-grep "NCCL_MAJOR\|NCCL_MINOR" "$NCCL_ROOT/include/nccl.h" 2>/dev/null | head -3
-echo "=== end NCCL diagnostics ==="
-
-# Symlink stable NCCL header to system path for other tools that look there
-sudo ln -sf "$NCCL_ROOT/include/nccl.h" /usr/include/nccl.h
-
-# ── Patch FindTensorRT.cmake ─────────────────────────────────────────────────
-# CMake needs to find TensorRT in system paths
-CMAKE_FILE="cpp/cmake/modules/FindTensorRT.cmake"
-if [ -f "$CMAKE_FILE" ]; then
-    echo "Patching FindTensorRT.cmake for system paths..."
-    python3 <<'PYTHON_EOF'
-import pathlib
-import re
-import sys
-
-cmake_file = sys.argv[1] if len(sys.argv) > 1 else "cpp/cmake/modules/FindTensorRT.cmake"
-p = pathlib.Path(cmake_file)
-text = p.read_text()
-
-# Add system paths to CMAKE_FIND_ROOT_PATH
-if '/usr/local/tensorrt' not in text or 'list(APPEND CMAKE_FIND_ROOT_PATH' not in text:
-    text = text.replace(
-        'set(TensorRT_WELL_KNOWN_ROOT /usr/local/tensorrt)',
-        'set(TensorRT_WELL_KNOWN_ROOT /usr/local/tensorrt)\nlist(APPEND CMAKE_FIND_ROOT_PATH /usr/local/tensorrt /usr)',
-    )
-
-# Patch find_path for NvInfer.h to include system paths
-text = re.sub(
-    r'(find_path\(\s*TensorRT_INCLUDE_DIR\s+NAMES\s+NvInfer\.h\s+PATHS\s+\$\{TensorRT_WELL_KNOWN_ROOT\}/include)',
-    r'\1 /usr/include/x86_64-linux-gnu',
-    text,
-)
-
-# Add system library paths to find_library calls (matches installation guide)
-text = re.sub(
-    r'(find_library\([^)]*PATHS\s+\$\{TensorRT_WELL_KNOWN_ROOT\}/lib)(\s*\))',
-    r'\1 /usr/lib/x86_64-linux-gnu\2',
-    text,
-    flags=re.DOTALL,
-)
-
-# Add NO_CMAKE_FIND_ROOT_PATH to find_path and find_library calls
-for pattern in [r'(find_path\([^)]*)\)', r'(find_library\([^)]*)\)']:
-    for match in re.finditer(pattern, text, re.DOTALL):
-        block = match.group(0)
-        if 'TensorRT' in block and 'NO_CMAKE_FIND_ROOT_PATH' not in block:
-            patched = block[:-1] + '\n  NO_CMAKE_FIND_ROOT_PATH)'
-            text = text.replace(block, patched)
-
-p.write_text(text)
-print('FindTensorRT.cmake patched')
-PYTHON_EOF
-fi
-
-# ── Patch FindNCCL.cmake ─────────────────────────────────────────────────────
-# The upstream FindNCCL.cmake doesn't use NCCL_ROOT hint at all!
-# We need to add PATHS ${NCCL_ROOT}/lib and NO_CMAKE_FIND_ROOT_PATH
-NCCL_CMAKE_FILE="cpp/cmake/modules/FindNCCL.cmake"
-if [ -f "$NCCL_CMAKE_FILE" ]; then
-    echo "Patching FindNCCL.cmake to use NCCL_ROOT hint..."
-    python3 <<'PYTHON_EOF'
-import pathlib
-
-p = pathlib.Path("cpp/cmake/modules/FindNCCL.cmake")
-text = p.read_text()
-
-# Replace simple find_library/find_path calls with ones that use NCCL_ROOT hint
-# Original: find_library(NCCL_LIBRARY NAMES nccl)
-# Patched:  find_library(NCCL_LIBRARY NAMES nccl PATHS ${NCCL_ROOT}/lib NO_CMAKE_FIND_ROOT_PATH)
-
-# The pip nvidia-nccl-cu13 package has libnccl.so.2 directly in NCCL_ROOT, not in lib/
-text = text.replace(
-    'find_library(NCCL_LIBRARY NAMES nccl)',
-    'find_library(NCCL_LIBRARY NAMES nccl PATHS ${NCCL_ROOT} ${NCCL_ROOT}/lib NO_CMAKE_FIND_ROOT_PATH)'
-)
-
-text = text.replace(
-    'find_library(NCCL_STATIC_LIBRARY NAMES nccl_static)',
-    'find_library(NCCL_STATIC_LIBRARY NAMES nccl_static PATHS ${NCCL_ROOT} ${NCCL_ROOT}/lib NO_CMAKE_FIND_ROOT_PATH)'
-)
-
-text = text.replace(
-    'find_path(NCCL_INCLUDE_DIR NAMES nccl.h)',
-    'find_path(NCCL_INCLUDE_DIR NAMES nccl.h PATHS ${NCCL_ROOT}/include NO_CMAKE_FIND_ROOT_PATH)'
-)
-
-p.write_text(text)
-print('FindNCCL.cmake patched to use NCCL_ROOT hint')
-PYTHON_EOF
-fi
-
-# ── Build TensorRT-LLM ───────────────────────────────────────────────────────
-echo "=== Building TensorRT-LLM from source (this may take a while)... ==="
-
-python3 scripts/build_wheel.py \
-    --cuda_architectures "90-real" \
-    --trt_root /usr/local/tensorrt \
-    --nccl_root "$NCCL_ROOT" \
-    --install \
-    --no-venv \
-    -j "$(nproc)" \
-    -D "ENABLE_UCX=OFF" \
-    --clean
-
-# Return to repo dir
-cd -
-
-# Cache the built wheel for future runs
-mkdir -p "$TRTLLM_WHEEL_CACHE"
-BUILT_WHEEL=$(find "$TRTLLM_DIR/build" -name "tensorrt_llm*.whl" 2>/dev/null | head -1)
-if [ -n "$BUILT_WHEEL" ]; then
-    cp "$BUILT_WHEEL" "$TRTLLM_WHEEL_CACHE/"
-    echo "Cached wheel to: $TRTLLM_WHEEL_CACHE/$(basename "$BUILT_WHEEL")"
-fi
-
-# ── Add pip-installed NVIDIA libraries to LD_LIBRARY_PATH ────────────────────
 NVIDIA_LIB_DIRS=$(find "$SITE_PACKAGES/nvidia" -name "lib" -type d 2>/dev/null | sort -u | paste -sd':')
 if [ -n "$NVIDIA_LIB_DIRS" ]; then
     export LD_LIBRARY_PATH="${NVIDIA_LIB_DIRS}:${LD_LIBRARY_PATH:-}"
-    export LD_LIBRARY_PATH="${NVIDIA_LIB_DIRS}:${LD_LIBRARY_PATH:-}"
+    export LD_LIBRARY_PATH="${NVIDIA_LIB_DIRS}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
-    export LD_LIBRARY_PATH="${NVIDIA_LIB_DIRS}:${LD_LIBRARY_PATH:-}"
+    export LD_LIBRARY_PATH="${NVIDIA_LIB_DIRS}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
@@ -344,9 +88,7 @@ fi
 echo "=== TensorRT-LLM verification ==="
 python3 -c "import tensorrt_llm; print(f'TensorRT-LLM version: {tensorrt_llm.__version__}')"
 python3 -c "from tensorrt_llm.commands.serve import main; print('gRPC serve command: available')"
-
-# Smoke-test: verify the serve command can parse --help without crashing
 echo "Verifying gRPC serve command..."
 python3 -m tensorrt_llm.commands.serve serve --help 2>&1 | head -20 || echo "WARNING: serve --help failed"
-python3 -m tensorrt_llm.commands.serve serve --help 2>&1 | head -20 || echo "WARNING: serve --help failed"
+python3 -m tensorrt_llm.commands.serve serve --help 2>&1 | head -20
-python3 -m tensorrt_llm.commands.serve serve --help 2>&1 | head -20 || echo "WARNING: serve --help failed"
+python3 -m tensorrt_llm.commands.serve serve --help 2>&1 | head -20
 
-echo "TensorRT-LLM installation complete (built from source)"
+echo "TensorRT-LLM installation complete (from PyPI)"