diff --git a/.github/workflows/build-wheel-cuda-hip.yaml b/.github/workflows/build-wheel-cuda-hip.yaml index ee5e0198..80c6c553 100644 --- a/.github/workflows/build-wheel-cuda-hip.yaml +++ b/.github/workflows/build-wheel-cuda-hip.yaml @@ -371,8 +371,22 @@ jobs: # Build only with Python 3.10 for abi3 wheel (compatible with all Python 3.10+) pyver: ["3.10"] cuda: ["12.8.1", "13.2.0"] + include: + # Per-CUDA-version GPU architecture lists, mirroring the upstream + # llama.cpp *release* default (ggml/src/ggml-cuda/CMakeLists.txt, + # GGML_NATIVE=OFF branch). Without this, build.py falls back to + # CMake's "native" (only GPUs present on the build machine), which is + # not portable. Applies to both x86_64 and arm64 hosts (sm_XX is the GPU + # arch, independent of host CPU). 121a needs CUDA >= 12.9. + - cuda: "12.8.1" + cuda_arch: "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real;120a-real" + - cuda: "13.2.0" + cuda_arch: "75-virtual;80-virtual;86-real;89-real;120a-real;121a-real" env: CUDAVER: ${{ matrix.cuda }} + # CUDA target architectures (PTX vs SASS, per-arch GPU map and rationale): + # see the detailed reference in the build_wheels_cuda_windows job below. + CUDA_ARCHITECTURES: ${{ matrix.cuda_arch }} steps: - name: Free Disk Space (Ubuntu) @@ -595,11 +609,73 @@ jobs: # Build only with Python 3.10 for abi3 wheel (compatible with all Python 3.10+) pyver: ["3.10"] cuda: ["12.8.1", "13.2.0"] + include: + # Per-CUDA-version GPU architecture lists, mirroring the upstream + # llama.cpp *release* default (ggml/src/ggml-cuda/CMakeLists.txt, + # GGML_NATIVE=OFF branch). See the CUDA_ARCHITECTURES note below. + # CUDA 12.x still supports Maxwell/Pascal/Volta; 121a needs CUDA >= 12.9 + # so it is NOT included for 12.8. + - cuda: "12.8.1" + cuda_arch: "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real;120a-real" + # CUDA 13 dropped Maxwell/Pascal/Volta; 121a-real is available. 
+ - cuda: "13.2.0" + cuda_arch: "75-virtual;80-virtual;86-real;89-real;120a-real;121a-real" defaults: run: shell: bash env: CUDAVER: ${{ matrix.cuda }} + # ---------------------------------------------------------------------- + # CUDA target architectures (canonical reference; the Linux job points here) + # ---------------------------------------------------------------------- + # Why pin this: the Windows runner has only 4 vCPUs, so CUDA compile time + # scales with the number of code-gen outputs. build.py used to + # auto-detect archs via `nvcc --list-gpu-arch` -> 12 *bare* archs for + # CUDA 13, and a bare arch tells CMake to emit BOTH SASS and PTX, i.e. + # 24 code-gen outputs per .cu file. Across ~138 .cu files that blows past + # GitHub's 6-hour job limit. The lists below produce only ~6-8 outputs. + # + # PTX vs SASS (what the suffixes mean): + # * -real = SASS: native machine code (cubin) for exactly that + # GPU arch. Loads and runs directly, fully optimized. + # Does NOT run on any other arch. + # * -virtual = PTX: a forward-compatible virtual ISA. The driver + # JIT-compiles it to SASS on first launch (then caches + # it). PTX built for compute_X can JIT onto any GPU + # with compute capability >= X (never older). + # * (bare) = both SASS and PTX (twice the work; what auto-detect + # produced, hence the slowness). + # + # Runtime kernel selection: matching SASS -> use directly; else compatible + # PTX -> JIT; else load fails (CUDA error 209, no kernel image). So at + # least one -virtual entry is REQUIRED for any GPU we do not ship SASS for. 
+ # + # These lists mirror the upstream llama.cpp *release* default + # (ggml/src/ggml-cuda/CMakeLists.txt, GGML_NATIVE=OFF branch): + # + # arch kind GPUs runtime + # ---- ---- -------------------------- -------------------------- + # 50 PTX Maxwell (GTX 9xx) JIT (CUDA 12 only) + # 61 PTX Pascal (GTX 10xx, P40) JIT (CUDA 12 only) + # 70 PTX Volta (V100) JIT (CUDA 12 only) + # 75 PTX Turing (RTX 20xx, T4) JIT + # 80 PTX Ampere DC (A100) JIT; also forward-covers + # Hopper(90) & Blackwell DC + # (100/103) via JIT + # 86 SASS Ampere (RTX 30xx, A10/A40) native + # 89 SASS Ada (RTX 40xx, L4/L40) native + # 120a SASS Blackwell (RTX 50xx) native (CUDA >= 12.8) + # 121a SASS Blackwell variant native (CUDA >= 12.9) + # + # Notes: + # * CUDA 13 dropped Maxwell/Pascal/Volta, so 50/61/70 are omitted there; + # the CUDA-13 floor is sm_75 (Turing) via the 75-virtual PTX. + # * Native SASS is shipped only for mainstream consumer cards; datacenter + # parts (A100/Hopper/Blackwell-DC) run via JIT from the 80-virtual PTX. + # Add 90-real / 100-real if native datacenter perf is desired. + # * Local builds (CUDA_ARCHITECTURES unset) instead use CMake's "native" for + # SASS targeting only the build machine's GPUs -- see scripts/build.py. + CUDA_ARCHITECTURES: ${{ matrix.cuda_arch }} steps: - name: Clone @@ -654,8 +730,17 @@ jobs: python -m build --wheel - # On Windows, we use delvewheel for wheel repair - python -m delvewheel repair --exclude nvcuda.dll dist/*.whl -w dist + # On Windows, we use delvewheel for wheel repair. + # delvewheel must locate the CUDA redistributable DLLs (cudart64_*, + # cublas64_*, ...) to graft them into the wheel. CUDA 13 relocated + # these from %CUDA_PATH%\bin to %CUDA_PATH%\bin\x64 on Windows, while + # CUDA 12 keeps them in %CUDA_PATH%\bin. The toolkit action only puts + # %CUDA_PATH%\bin on PATH, so without this delvewheel fails with e.g. + # "Unable to find library: cublas64_13.dll". Add both dirs explicitly + # (semicolon = Windows path separator for --add-path). 
+ python -m delvewheel repair \ + --add-path "$CUDA_PATH/bin/x64;$CUDA_PATH/bin" \ + --exclude nvcuda.dll dist/*.whl -w dist echo "CUDA_VERSION=$cuda_version" >> $GITHUB_ENV diff --git a/README.md b/README.md index 290d2389..70fc7cca 100644 --- a/README.md +++ b/README.md @@ -108,9 +108,10 @@ Before pip installing `xllamacpp`, please ensure your system meets the following - Requires ARMv8-A or later architecture - For best performance, build from source if your CPU supports advanced instruction sets (e.g., SVE) -- **CUDA (Linux)**: - - Requires glibc 2.35 or later +- **CUDA (Linux/Windows)**: + - Requires glibc 2.35 or later (Linux) - Compatible NVIDIA GPU with appropriate drivers (CUDA 12.8 or 13.2) + - See [CUDA GPU Architecture Coverage](#cuda-gpu-architecture-coverage) below for the list of supported GPUs per CUDA version - **ROCm (Linux)**: - Requires glibc 2.35 or later @@ -122,6 +123,53 @@ Before pip installing `xllamacpp`, please ensure your system meets the following - Linux users may need distro packages and the LunarG SDK - macOS Intel is supported via Vulkan; Apple Silicon Vulkan is not supported in this project +## CUDA GPU Architecture Coverage + +The prebuilt CUDA wheels are compiled for a curated set of NVIDIA GPU architectures +that mirrors the upstream llama.cpp *release* default (`ggml/src/ggml-cuda/CMakeLists.txt`, +`GGML_NATIVE=OFF` branch). The set differs between CUDA versions because newer toolkits +drop older architectures and add newer ones. + +Each architecture is shipped in one of two forms: + +- **SASS** (`-real`): native machine code (cubin) for exactly that GPU architecture. + It loads and runs directly with full optimization, but only on that specific architecture. +- **PTX** (`-virtual`): a forward-compatible virtual ISA. The driver JIT-compiles it to + SASS on first launch (then caches it). PTX built for `compute_X` can JIT onto any GPU + with compute capability `>= X` (never older). 
+ +At runtime CUDA picks matching SASS if available; otherwise it JIT-compiles compatible PTX; +otherwise the load fails (CUDA error 209, "no kernel image"). This is why at least one PTX +(`-virtual`) entry is always shipped, so GPUs without native SASS still run via JIT. + +| Arch | Form | GPUs | CUDA 12.8 wheel | CUDA 13.2 wheel | +|:-----|:-----|:-----|:---------------:|:---------------:| +| `50` | PTX | Maxwell (GTX 9xx) | ✅ JIT | — | +| `61` | PTX | Pascal (GTX 10xx, P40) | ✅ JIT | — | +| `70` | PTX | Volta (V100) | ✅ JIT | — | +| `75` | PTX | Turing (RTX 20xx, T4) | ✅ JIT | ✅ JIT | +| `80` | PTX | Ampere DC (A100); also forward-covers Hopper (90) and Blackwell DC (100/103) via JIT | ✅ JIT | ✅ JIT | +| `86` | SASS | Ampere (RTX 30xx, A10/A40) | ✅ native | ✅ native | +| `89` | SASS | Ada (RTX 40xx, L4/L40) | ✅ native | ✅ native | +| `120a` | SASS | Blackwell (RTX 50xx) | ✅ native | ✅ native | +| `121a` | SASS | Blackwell variant | — | ✅ native | + +Notes: + +- **CUDA 13 dropped Maxwell/Pascal/Volta**, so `50`/`61`/`70` are omitted there; the CUDA 13 + floor is `sm_75` (Turing) via the `75-virtual` PTX. +- **`121a` requires CUDA >= 12.9**, so it is only present in the CUDA 13.2 wheel. +- **Native SASS is shipped only for mainstream consumer cards.** Datacenter parts + (A100/Hopper/Blackwell-DC) run via JIT from the `80-virtual` PTX, which works but incurs a + one-time JIT compile on first launch. +- **Local source builds** (where `CUDA_ARCHITECTURES` is unset) instead use CMake's `native`, + producing SASS only for the GPUs installed on the build machine — see + `scripts/build.py`. If you need native datacenter performance, build from source or set + `CUDA_ARCHITECTURES` to a list that includes `90-real` / `100-real`. + +The same architecture lists apply to both Linux (x86_64 and arm64) and Windows CUDA wheels, +since the `sm_XX` value is the GPU architecture and is independent of the host CPU. 
+ ## Build from Source ### (Optional) Preparation diff --git a/scripts/build.py b/scripts/build.py index a415c32c..b0c65005 100755 --- a/scripts/build.py +++ b/scripts/build.py @@ -5,7 +5,6 @@ import os import platform -import re import shlex import shutil import subprocess @@ -44,33 +43,6 @@ def split_cmake_args(value: str) -> list[str]: return parts -def detect_cuda_architectures() -> str: - log("=== Detecting supported GPU architectures ===") - try: - output = subprocess.check_output( - ["nvcc", "--list-gpu-arch"], text=True, stderr=subprocess.STDOUT - ) - except (OSError, subprocess.CalledProcessError) as exc: - raise SystemExit( - "CUDA_ARCHITECTURES is not set and `nvcc --list-gpu-arch` failed. " - "Install CUDA tools or set CUDA_ARCHITECTURES explicitly." - ) from exc - - print(output, end="", flush=True) - archs: set[int] = set() - for line in output.splitlines(): - match = re.fullmatch(r"(?:sm|compute)_(\d+)", line.strip()) - if match: - arch = int(match.group(1)) - if arch >= 70: - archs.add(arch) - - if not archs: - raise SystemExit("nvcc did not report any CUDA architectures >= 70") - - return ";".join("120a" if arch == 120 else str(arch) for arch in sorted(archs)) - - def hip_compiler() -> str: try: hip_root = subprocess.check_output( @@ -134,7 +106,16 @@ def build_llamacpp() -> None: if env_is_set("XLLAMACPP_BUILD_CUDA"): log("Building for CUDA") - cuda_archs = os.environ.get("CUDA_ARCHITECTURES") or detect_cuda_architectures() + # CI pipelines pin CUDA_ARCHITECTURES to a curated list to keep build + # times under the runner limit (a few -real archs + PTX fallbacks). + # + # When unset (i.e. a user building locally for their own use), fall back + # to CMake's "native" keyword. Per the CMake docs this detects the GPUs + # actually installed on the build machine and compiles SASS only for + # those architectures. 
That keeps the build fast and produces fully + # arch-optimized code for the local hardware -- at the cost of a binary + # that is not portable to other GPU architectures. + cuda_archs = os.environ.get("CUDA_ARCHITECTURES") or "native" log(f"Using CUDA architectures: {cuda_archs}") cmake_args.extend( [ diff --git a/scripts/download_models.py b/scripts/download_models.py index 5c38ddad..00909f1f 100644 --- a/scripts/download_models.py +++ b/scripts/download_models.py @@ -13,7 +13,10 @@ """ import hashlib +import http.client import os +import socket +import ssl import sys import threading import time @@ -30,8 +33,20 @@ READ_CHUNK_SIZE = 1 << 23 # 8 MB — buffer size for network reads / disk writes. MAX_HTTP_RETRIES = 3 RATE_LIMIT_BASE_SLEEP = 30.0 +NETWORK_ERROR_BASE_SLEEP = 2.0 REQUEST_START_DELAY = 1.0 +# Transient connection-level errors worth retrying (network blips, dropped TLS +# connections, slow servers). These are distinct from HTTP status errors. +TRANSIENT_NETWORK_ERRORS = ( + ssl.SSLError, + socket.timeout, + TimeoutError, + ConnectionError, + http.client.IncompleteRead, + http.client.RemoteDisconnected, +) + @dataclass(frozen=True) class RetryDelay: @@ -140,7 +155,7 @@ def _sleep_before_request() -> None: def open_url_with_retries(req: urllib.request.Request, timeout: int, description: str): - """Open a URL, retrying HTTP 429 with backoff.""" + """Open a URL, retrying HTTP 429 and transient network errors with backoff.""" request_timeout = timeout for attempt in range(MAX_HTTP_RETRIES + 1): try: @@ -163,6 +178,25 @@ def open_url_with_retries(req: urllib.request.Request, timeout: int, description flush=True, ) time.sleep(sleep_for) + except (urllib.error.URLError,) + TRANSIENT_NETWORK_ERRORS as err: + # urllib wraps SSL/connection errors in URLError(reason=...). + reason = getattr(err, "reason", err) + if not isinstance(err, TRANSIENT_NETWORK_ERRORS) and not isinstance( + reason, TRANSIENT_NETWORK_ERRORS + ): + # A non-transient URLError (e.g. 
DNS failure): don't retry. + raise + if attempt == MAX_HTTP_RETRIES: + raise + + sleep_for = NETWORK_ERROR_BASE_SLEEP * (2**attempt) + print( + f"Network error while requesting {description}: {reason}; " + f"sleeping {sleep_for:.0f}s before retry {attempt + 1}/" + f"{MAX_HTTP_RETRIES}...", + flush=True, + ) + time.sleep(sleep_for) def get_file_info(url: str) -> tuple[str, int] | None: @@ -294,11 +328,33 @@ def download_one( dest.unlink() print(f"Downloading {name} ({CHUNKS_PER_FILE} connections)...", flush=True) - try: - download_file(url, dest, CHUNKS_PER_FILE, chunk_pool) - except Exception as e: - dest.unlink(missing_ok=True) - raise RuntimeError(f"{name}: download failed -- {e}") + last_err: Exception | None = None + for attempt in range(MAX_HTTP_RETRIES + 1): + try: + download_file(url, dest, CHUNKS_PER_FILE, chunk_pool) + last_err = None + break + except urllib.error.HTTPError: + # HTTP-status errors (404, etc.) are not transient; fail fast. + dest.unlink(missing_ok=True) + raise RuntimeError(f"{name}: download failed -- {sys.exc_info()[1]}") + except Exception as e: + # Connection-level / streaming errors (SSL EOF, reset, timeout): + # the body read may have died mid-stream, so retry the whole file. + last_err = e + dest.unlink(missing_ok=True) + if attempt == MAX_HTTP_RETRIES: + break + sleep_for = NETWORK_ERROR_BASE_SLEEP * (2**attempt) + print( + f" {name}: download error ({e}); " + f"retrying in {sleep_for:.0f}s " + f"({attempt + 1}/{MAX_HTTP_RETRIES})...", + flush=True, + ) + time.sleep(sleep_for) + if last_err is not None: + raise RuntimeError(f"{name}: download failed -- {last_err}") actual = sha256_file(dest) if actual != expected_sha: