Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 87 additions & 2 deletions .github/workflows/build-wheel-cuda-hip.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -371,8 +371,22 @@ jobs:
# Build only with Python 3.10 for abi3 wheel (compatible with all Python 3.10+)
pyver: ["3.10"]
cuda: ["12.8.1", "13.2.0"]
include:
# Per-CUDA-version GPU architecture lists, mirroring the upstream
# llama.cpp *release* default (ggml/src/ggml-cuda/CMakeLists.txt,
# GGML_NATIVE=OFF branch). Without this, build.py falls back to
# CMake's "native" (SASS only for the GPUs installed on the build
# machine), which is not portable and so unsuitable for a published
# wheel. Applies to both x86_64 and arm64 hosts (sm_XX is the GPU
# arch, independent of host CPU). 121a needs CUDA >= 12.9.
- cuda: "12.8.1"
cuda_arch: "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real;120a-real"
- cuda: "13.2.0"
cuda_arch: "75-virtual;80-virtual;86-real;89-real;120a-real;121a-real"
env:
CUDAVER: ${{ matrix.cuda }}
# CUDA target architectures (PTX vs SASS, per-arch GPU map and rationale):
# see the detailed reference in the build_wheels_cuda_windows job below.
CUDA_ARCHITECTURES: ${{ matrix.cuda_arch }}

steps:
- name: Free Disk Space (Ubuntu)
Expand Down Expand Up @@ -595,11 +609,73 @@ jobs:
# Build only with Python 3.10 for abi3 wheel (compatible with all Python 3.10+)
pyver: ["3.10"]
cuda: ["12.8.1", "13.2.0"]
include:
# Per-CUDA-version GPU architecture lists, mirroring the upstream
# llama.cpp *release* default (ggml/src/ggml-cuda/CMakeLists.txt,
# GGML_NATIVE=OFF branch). See the CUDA_ARCHITECTURES note below.
# CUDA 12.x still supports Maxwell/Pascal/Volta; 121a needs CUDA >= 12.9
# so it is NOT included for 12.8.
- cuda: "12.8.1"
cuda_arch: "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real;120a-real"
# CUDA 13 dropped Maxwell/Pascal/Volta; 121a-real is available.
- cuda: "13.2.0"
cuda_arch: "75-virtual;80-virtual;86-real;89-real;120a-real;121a-real"
defaults:
run:
shell: bash
env:
CUDAVER: ${{ matrix.cuda }}
# ----------------------------------------------------------------------
# CUDA target architectures (canonical reference; the Linux job points here)
# ----------------------------------------------------------------------
# Why pin this: the Windows runner has only 4 vCPUs, and CUDA compile time
# scales with the number of code-gen outputs. build.py used to
# auto-detect archs via `nvcc --list-gpu-arch` -> 12 *bare* archs for
# CUDA 13, and a bare arch tells CMake to emit BOTH SASS and PTX, i.e.
# 24 code-gen outputs per .cu file. Across ~138 .cu files that blew past
# GitHub's 6-hour job limit. The pinned lists (matrix.cuda_arch) produce
# only 6 (CUDA 13) or 8 (CUDA 12) outputs per file.
#
# PTX vs SASS (what the suffixes mean):
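# * <arch>-real = SASS: native machine code (cubin) for that GPU arch.
# Loads and runs directly, fully optimized. Only
# binary-compatible with GPUs of the same major version
# and an equal or higher minor version (e.g. 86 also
# loads on 89); arch-specific "a" targets (120a, 121a)
# run on exactly that arch only.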
# * <arch>-virtual = PTX: a forward-compatible virtual ISA. The driver
# JIT-compiles it to SASS on first launch (then caches
# it). PTX built for compute_X can JIT onto any GPU
# with compute capability >= X (never older).
# * <arch> = both SASS and PTX (twice the work; what auto-detect
# produced, hence the slowness).
#
# Runtime kernel selection: matching SASS -> use directly; else compatible
# PTX -> JIT; else load fails (CUDA error 209, no kernel image). So at
# least one -virtual entry is REQUIRED for any GPU we do not ship SASS for.
#
# These lists mirror the upstream llama.cpp *release* default
# (ggml/src/ggml-cuda/CMakeLists.txt, GGML_NATIVE=OFF branch):
#
# arch kind GPUs runtime
# ---- ---- -------------------------- --------------------------
# 50 PTX Maxwell (GTX 9xx) JIT (CUDA 12 only)
# 61 PTX Pascal (GTX 10xx, P40) JIT (CUDA 12 only)
# 70 PTX Volta (V100) JIT (CUDA 12 only)
# 75 PTX Turing (RTX 20xx, T4) JIT
# 80 PTX Ampere DC (A100) JIT; also forward-covers
# Hopper(90) & Blackwell DC
# (100/103) via JIT
# 86 SASS Ampere (RTX 30xx, A10/A40) native
# 89 SASS Ada (RTX 40xx, L4/L40) native
# 120a SASS Blackwell (RTX 50xx) native (CUDA >= 12.8)
# 121a SASS Blackwell variant native (CUDA >= 12.9)
#
# Notes:
# * CUDA 13 dropped Maxwell/Pascal/Volta, so 50/61/70 are omitted there;
# the CUDA-13 floor is sm_75 (Turing) via the 75-virtual PTX.
# * Native SASS is shipped only for mainstream consumer cards; datacenter
# parts (A100/Hopper/Blackwell-DC) run via JIT from the 80-virtual PTX.
# Add 90-real / 100-real if native datacenter perf is desired.
# * Local builds (CUDA_ARCHITECTURES unset) instead use CMake's "native",
# i.e. SASS only for the GPUs installed on the build machine -- see
# scripts/build.py.
CUDA_ARCHITECTURES: ${{ matrix.cuda_arch }}

steps:
- name: Clone
Expand Down Expand Up @@ -654,8 +730,17 @@ jobs:

python -m build --wheel

# On Windows, we use delvewheel for wheel repair
python -m delvewheel repair --exclude nvcuda.dll dist/*.whl -w dist
# On Windows, we use delvewheel for wheel repair.
# delvewheel must locate the CUDA redistributable DLLs (cudart64_*,
# cublas64_*, ...) to graft them into the wheel. CUDA 13 relocated
# these from <CUDA_PATH>\bin to <CUDA_PATH>\bin\x64 on Windows, while
# CUDA 12 keeps them in <CUDA_PATH>\bin. The toolkit action only puts
# \bin on PATH, so without this delvewheel fails with e.g.
# "Unable to find library: cublas64_13.dll". Add both dirs explicitly
# (semicolon = Windows path separator for --add-path).
python -m delvewheel repair \
--add-path "$CUDA_PATH/bin/x64;$CUDA_PATH/bin" \
--exclude nvcuda.dll dist/*.whl -w dist

echo "CUDA_VERSION=$cuda_version" >> $GITHUB_ENV

Expand Down
52 changes: 50 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,10 @@ Before pip installing `xllamacpp`, please ensure your system meets the following
- Requires ARMv8-A or later architecture
- For best performance, build from source if your CPU supports advanced instruction sets (e.g., SVE)

- **CUDA (Linux)**:
- Requires glibc 2.35 or later
- **CUDA (Linux/Windows)**:
- Requires glibc 2.35 or later (Linux)
- Compatible NVIDIA GPU with appropriate drivers (CUDA 12.8 or 13.2)
- See [CUDA GPU Architecture Coverage](#cuda-gpu-architecture-coverage) below for the list of supported GPUs per CUDA version

- **ROCm (Linux)**:
- Requires glibc 2.35 or later
Expand All @@ -122,6 +123,53 @@ Before pip installing `xllamacpp`, please ensure your system meets the following
- Linux users may need distro packages and the LunarG SDK
- macOS Intel is supported via Vulkan; Apple Silicon Vulkan is not supported in this project

## CUDA GPU Architecture Coverage

The prebuilt CUDA wheels are compiled for a curated set of NVIDIA GPU architectures
that mirrors the upstream llama.cpp *release* default (`ggml/src/ggml-cuda/CMakeLists.txt`,
`GGML_NATIVE=OFF` branch). The set differs between CUDA versions because newer toolkits
drop older architectures and add newer ones.

Each architecture is shipped in one of two forms:

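- **SASS** (`-real`): native machine code (cubin) for that GPU architecture.
It loads and runs directly with full optimization, but only on GPUs of the same major
compute capability with an equal or higher minor version (e.g. `86` SASS also loads on `89`);
architecture-specific `a` targets (`120a`, `121a`) run only on exactly that architecture.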
- **PTX** (`-virtual`): a forward-compatible virtual ISA. The driver JIT-compiles it to
SASS on first launch (then caches it). PTX built for `compute_X` can JIT onto any GPU
with compute capability `>= X` (never older).

At runtime CUDA picks matching SASS if available; otherwise it JIT-compiles compatible PTX;
otherwise the load fails (CUDA error 209, "no kernel image"). This is why at least one PTX
(`-virtual`) entry is always shipped, so GPUs without native SASS still run via JIT.

| Arch | Form | GPUs | CUDA 12.8 wheel | CUDA 13.2 wheel |
|:-----|:-----|:-----|:---------------:|:---------------:|
| `50` | PTX | Maxwell (GTX 9xx) | ✅ JIT | — |
| `61` | PTX | Pascal (GTX 10xx, P40) | ✅ JIT | — |
| `70` | PTX | Volta (V100) | ✅ JIT | — |
| `75` | PTX | Turing (RTX 20xx, T4) | ✅ JIT | ✅ JIT |
| `80` | PTX | Ampere DC (A100); also forward-covers Hopper (90) and Blackwell DC (100/103) via JIT | ✅ JIT | ✅ JIT |
| `86` | SASS | Ampere (RTX 30xx, A10/A40) | ✅ native | ✅ native |
| `89` | SASS | Ada (RTX 40xx, L4/L40) | ✅ native | ✅ native |
| `120a` | SASS | Blackwell (RTX 50xx) | ✅ native | ✅ native |
| `121a` | SASS | Blackwell variant | — | ✅ native |

Notes:

- **CUDA 13 dropped Maxwell/Pascal/Volta**, so `50`/`61`/`70` are omitted there; the CUDA 13
floor is `sm_75` (Turing) via the `75-virtual` PTX.
- **`121a` requires CUDA >= 12.9**, so it is only present in the CUDA 13.2 wheel.
- **Native SASS is shipped only for mainstream consumer cards.** Datacenter parts
(A100/Hopper/Blackwell-DC) run via JIT from the `80-virtual` PTX, which works but incurs a
one-time JIT compile on first launch.
- **Local source builds** (where `CUDA_ARCHITECTURES` is unset) instead use CMake's `native`,
producing SASS only for the GPUs installed on the build machine — see `scripts/build.py`.
The resulting binary is not portable to other GPU architectures. If you need native
datacenter performance, build from source on the target machine, or set
`CUDA_ARCHITECTURES` with `90-real` / `100-real` added to the architecture list.

The same architecture lists apply to both Linux (x86_64 and arm64) and Windows CUDA wheels,
since the `sm_XX` value is the GPU architecture and is independent of the host CPU.

## Build from Source

### (Optional) Preparation
Expand Down
39 changes: 10 additions & 29 deletions scripts/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import os
import platform
import re
import shlex
import shutil
import subprocess
Expand Down Expand Up @@ -44,33 +43,6 @@ def split_cmake_args(value: str) -> list[str]:
return parts


def detect_cuda_architectures() -> str:
log("=== Detecting supported GPU architectures ===")
try:
output = subprocess.check_output(
["nvcc", "--list-gpu-arch"], text=True, stderr=subprocess.STDOUT
)
except (OSError, subprocess.CalledProcessError) as exc:
raise SystemExit(
"CUDA_ARCHITECTURES is not set and `nvcc --list-gpu-arch` failed. "
"Install CUDA tools or set CUDA_ARCHITECTURES explicitly."
) from exc

print(output, end="", flush=True)
archs: set[int] = set()
for line in output.splitlines():
match = re.fullmatch(r"(?:sm|compute)_(\d+)", line.strip())
if match:
arch = int(match.group(1))
if arch >= 70:
archs.add(arch)

if not archs:
raise SystemExit("nvcc did not report any CUDA architectures >= 70")

return ";".join("120a" if arch == 120 else str(arch) for arch in sorted(archs))


def hip_compiler() -> str:
try:
hip_root = subprocess.check_output(
Expand Down Expand Up @@ -134,7 +106,16 @@ def build_llamacpp() -> None:

if env_is_set("XLLAMACPP_BUILD_CUDA"):
log("Building for CUDA")
cuda_archs = os.environ.get("CUDA_ARCHITECTURES") or detect_cuda_architectures()
# CI pipelines pin CUDA_ARCHITECTURES to a curated list to keep build
# times under the runner limit (a few -real archs + PTX fallbacks).
#
# When unset (i.e. a user building locally for their own use), fall back
# to CMake's "native" keyword. Per the CMake docs this detects the GPUs
# actually installed on the build machine and compiles SASS only for
# those architectures. That keeps the build fast and produces fully
# arch-optimized code for the local hardware -- at the cost of a binary
# that is not portable to other GPU architectures.
cuda_archs = os.environ.get("CUDA_ARCHITECTURES") or "native"
log(f"Using CUDA architectures: {cuda_archs}")
cmake_args.extend(
[
Expand Down
68 changes: 62 additions & 6 deletions scripts/download_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@
"""

import hashlib
import http.client
import os
import socket
import ssl
import sys
import threading
import time
Expand All @@ -30,8 +33,20 @@
READ_CHUNK_SIZE = 1 << 23 # 8 MB — buffer size for network reads / disk writes.
MAX_HTTP_RETRIES = 3
RATE_LIMIT_BASE_SLEEP = 30.0
NETWORK_ERROR_BASE_SLEEP = 2.0
REQUEST_START_DELAY = 1.0

# Transient connection-level errors worth retrying (network blips, dropped TLS
# connections, slow servers). These are distinct from HTTP status errors.
TRANSIENT_NETWORK_ERRORS = (
ssl.SSLError,
socket.timeout,
TimeoutError,
ConnectionError,
http.client.IncompleteRead,
http.client.RemoteDisconnected,
)


@dataclass(frozen=True)
class RetryDelay:
Expand Down Expand Up @@ -140,7 +155,7 @@ def _sleep_before_request() -> None:


def open_url_with_retries(req: urllib.request.Request, timeout: int, description: str):
"""Open a URL, retrying HTTP 429 with backoff."""
"""Open a URL, retrying HTTP 429 and transient network errors with backoff."""
request_timeout = timeout
for attempt in range(MAX_HTTP_RETRIES + 1):
try:
Expand All @@ -163,6 +178,25 @@ def open_url_with_retries(req: urllib.request.Request, timeout: int, description
flush=True,
)
time.sleep(sleep_for)
except (urllib.error.URLError,) + TRANSIENT_NETWORK_ERRORS as err:
# urllib wraps SSL/connection errors in URLError(reason=...).
reason = getattr(err, "reason", err)
if not isinstance(err, TRANSIENT_NETWORK_ERRORS) and not isinstance(
reason, TRANSIENT_NETWORK_ERRORS
):
# A non-transient URLError (e.g. DNS failure): don't retry.
raise
if attempt == MAX_HTTP_RETRIES:
raise

sleep_for = NETWORK_ERROR_BASE_SLEEP * (2**attempt)
print(
f"Network error while requesting {description}: {reason}; "
f"sleeping {sleep_for:.0f}s before retry {attempt + 1}/"
f"{MAX_HTTP_RETRIES}...",
flush=True,
)
time.sleep(sleep_for)


def get_file_info(url: str) -> tuple[str, int] | None:
Expand Down Expand Up @@ -294,11 +328,33 @@ def download_one(
dest.unlink()

print(f"Downloading {name} ({CHUNKS_PER_FILE} connections)...", flush=True)
try:
download_file(url, dest, CHUNKS_PER_FILE, chunk_pool)
except Exception as e:
dest.unlink(missing_ok=True)
raise RuntimeError(f"{name}: download failed -- {e}")
last_err: Exception | None = None
for attempt in range(MAX_HTTP_RETRIES + 1):
try:
download_file(url, dest, CHUNKS_PER_FILE, chunk_pool)
last_err = None
break
except urllib.error.HTTPError:
# HTTP-status errors (404, etc.) are not transient; fail fast.
dest.unlink(missing_ok=True)
raise RuntimeError(f"{name}: download failed -- {sys.exc_info()[1]}")
except Exception as e:
# Connection-level / streaming errors (SSL EOF, reset, timeout):
# the body read may have died mid-stream, so retry the whole file.
last_err = e
dest.unlink(missing_ok=True)
if attempt == MAX_HTTP_RETRIES:
break
sleep_for = NETWORK_ERROR_BASE_SLEEP * (2**attempt)
print(
f" {name}: download error ({e}); "
f"retrying in {sleep_for:.0f}s "
f"({attempt + 1}/{MAX_HTTP_RETRIES})...",
flush=True,
)
time.sleep(sleep_for)
if last_err is not None:
raise RuntimeError(f"{name}: download failed -- {last_err}")

actual = sha256_file(dest)
if actual != expected_sha:
Expand Down
Loading