xorbitsai · codingl2k1 · Jun 11, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 11, 2026
diff --git a/.github/workflows/build-wheel-cuda-hip.yaml b/.github/workflows/build-wheel-cuda-hip.yaml
@@ -371,8 +371,22 @@ jobs:
         # Build only with Python 3.10 for abi3 wheel (compatible with all Python 3.10+)
         pyver: ["3.10"]
         cuda: ["12.8.1", "13.2.0"]
+        include:
+          # Per-CUDA-version GPU architecture lists, mirroring the upstream
+          # llama.cpp *release* default (ggml/src/ggml-cuda/CMakeLists.txt,
+          # GGML_NATIVE=OFF branch). Without this, build.py falls back to
+          # CMake's "native" (only GPUs detected on the build machine), which
+          # is not portable. Applies to both x86_64 and arm64 hosts (sm_XX is
+          # the GPU arch, independent of host CPU). 121a needs CUDA >= 12.9.
+          - cuda: "12.8.1"
+            cuda_arch: "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real;120a-real"
+          - cuda: "13.2.0"
+            cuda_arch: "75-virtual;80-virtual;86-real;89-real;120a-real;121a-real"
     env:
       CUDAVER: ${{ matrix.cuda }}
+      # CUDA target architectures (PTX vs SASS, per-arch GPU map and rationale):
+      # see the detailed reference in the build_wheels_cuda_windows job below.
+      CUDA_ARCHITECTURES: ${{ matrix.cuda_arch }}
 
     steps:
       - name: Free Disk Space (Ubuntu)
@@ -595,11 +609,73 @@ jobs:
         # Build only with Python 3.10 for abi3 wheel (compatible with all Python 3.10+)
         pyver: ["3.10"]
         cuda: ["12.8.1", "13.2.0"]
+        include:
+          # Per-CUDA-version GPU architecture lists, mirroring the upstream
+          # llama.cpp *release* default (ggml/src/ggml-cuda/CMakeLists.txt,
+          # GGML_NATIVE=OFF branch). See the CUDA_ARCHITECTURES note below.
+          # CUDA 12.x still supports Maxwell/Pascal/Volta; 121a needs CUDA >= 12.9
+          # so it is NOT included for 12.8.
+          - cuda: "12.8.1"
+            cuda_arch: "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real;120a-real"
+          # CUDA 13 dropped Maxwell/Pascal/Volta; 121a-real is available.
+          - cuda: "13.2.0"
+            cuda_arch: "75-virtual;80-virtual;86-real;89-real;120a-real;121a-real"
     defaults:
       run:
         shell: bash
     env:
       CUDAVER: ${{ matrix.cuda }}
+      # ----------------------------------------------------------------------
+      # CUDA target architectures (canonical reference; the Linux job points here)
+      # ----------------------------------------------------------------------
+      # Why pin this: the Windows runner has only 4 vCPUs, so CUDA compile time
+      # scales with the number of code-gen outputs. build.py used to
+      # auto-detect archs via `nvcc --list-gpu-arch` -> 12 *bare* archs for
+      # CUDA 13, and a bare arch tells CMake to emit BOTH SASS and PTX, i.e.
+      # 24 code-gen outputs per .cu file. Across ~138 .cu files that blew past
+      # GitHub's 6-hour job limit. The lists below produce only ~6-8 outputs.
+      #
+      # PTX vs SASS (what the suffixes mean):
+      #   * <arch>-real    = SASS:  native machine code (cubin) for exactly that
+      #                      GPU arch. Loads and runs directly, fully optimized.
+      #                      Does NOT run on any other arch.
+      #   * <arch>-virtual = PTX:   a forward-compatible virtual ISA. The driver
+      #                      JIT-compiles it to SASS on first launch (then caches
+      #                      it). PTX built for compute_X can JIT onto any GPU
+      #                      with compute capability >= X (never older).
+      #   * <arch>         = both SASS and PTX (twice the work; what auto-detect
+      #                      produced, hence the slowness).
+      #
+      # Runtime kernel selection: matching SASS -> use directly; else compatible
+      # PTX -> JIT; else load fails (CUDA error 209, no kernel image). So at
+      # least one -virtual entry is REQUIRED for any GPU we do not ship SASS for.
+      #
+      # These lists mirror the upstream llama.cpp *release* default
+      # (ggml/src/ggml-cuda/CMakeLists.txt, GGML_NATIVE=OFF branch):
+      #
+      #   arch   kind   GPUs                         runtime
+      #   ----   ----   --------------------------   --------------------------
+      #   50     PTX    Maxwell (GTX 9xx)            JIT      (CUDA 12 only)
+      #   61     PTX    Pascal  (GTX 10xx, P40)      JIT      (CUDA 12 only)
+      #   70     PTX    Volta   (V100)               JIT      (CUDA 12 only)
+      #   75     PTX    Turing  (RTX 20xx, T4)       JIT
+      #   80     PTX    Ampere DC (A100)             JIT; also forward-covers
+      #                                              Hopper(90) & Blackwell DC
+      #                                              (100/103) via JIT
+      #   86     SASS   Ampere  (RTX 30xx, A10/A40)  native
+      #   89     SASS   Ada     (RTX 40xx, L4/L40)   native
+      #   120a   SASS   Blackwell (RTX 50xx)         native   (CUDA >= 12.8)
+      #   121a   SASS   Blackwell variant            native   (CUDA >= 12.9)
+      #
+      # Notes:
+      #   * CUDA 13 dropped Maxwell/Pascal/Volta, so 50/61/70 are omitted there;
+      #     the CUDA-13 floor is sm_75 (Turing) via the 75-virtual PTX.
+      #   * Native SASS is shipped only for mainstream consumer cards; datacenter
+      #     parts (A100/Hopper/Blackwell-DC) run via JIT from the 80-virtual PTX.
+      #     Add 90-real / 100-real if native datacenter perf is desired.
+      #   * Local builds (CUDA_ARCHITECTURES unset) instead use CMake's "native":
+      #     SASS only for the GPUs on the build machine -- see scripts/build.py.
+      CUDA_ARCHITECTURES: ${{ matrix.cuda_arch }}
 
     steps:
       - name: Clone
@@ -654,8 +730,17 @@ jobs:
 
           python -m build --wheel
 
-          # On Windows, we use delvewheel for wheel repair
-          python -m delvewheel repair --exclude nvcuda.dll dist/*.whl -w dist
+          # On Windows, we use delvewheel for wheel repair.
+          # delvewheel must locate the CUDA redistributable DLLs (cudart64_*,
+          # cublas64_*, ...) to graft them into the wheel. CUDA 13 relocated
+          # these from <CUDA_PATH>\bin to <CUDA_PATH>\bin\x64 on Windows, while
+          # CUDA 12 keeps them in <CUDA_PATH>\bin. The toolkit action only puts
+          # \bin on PATH, so without this delvewheel fails with e.g.
+          # "Unable to find library: cublas64_13.dll". Add both dirs explicitly
+          # (semicolon = Windows path separator for --add-path).
+          python -m delvewheel repair \
+            --add-path "$CUDA_PATH/bin/x64;$CUDA_PATH/bin" \
+            --exclude nvcuda.dll dist/*.whl -w dist
 
           echo "CUDA_VERSION=$cuda_version" >> $GITHUB_ENV
 

diff --git a/README.md b/README.md
@@ -108,9 +108,10 @@ Before pip installing `xllamacpp`, please ensure your system meets the following
   - Requires ARMv8-A or later architecture
   - For best performance, build from source if your CPU supports advanced instruction sets (e.g., SVE)
 
-- **CUDA (Linux)**:
-  - Requires glibc 2.35 or later
+- **CUDA (Linux/Windows)**:
+  - Requires glibc 2.35 or later (Linux)
   - Compatible NVIDIA GPU with appropriate drivers (CUDA 12.8 or 13.2)
+  - See [CUDA GPU Architecture Coverage](#cuda-gpu-architecture-coverage) below for the list of supported GPUs per CUDA version
 
 - **ROCm (Linux)**:
   - Requires glibc 2.35 or later
@@ -122,6 +123,53 @@ Before pip installing `xllamacpp`, please ensure your system meets the following
   - Linux users may need distro packages and the LunarG SDK
   - macOS Intel is supported via Vulkan; Apple Silicon Vulkan is not supported in this project
 
+## CUDA GPU Architecture Coverage
+
+The prebuilt CUDA wheels are compiled for a curated set of NVIDIA GPU architectures
+that mirrors the upstream llama.cpp *release* default (`ggml/src/ggml-cuda/CMakeLists.txt`,
+`GGML_NATIVE=OFF` branch). The set differs between CUDA versions because newer toolkits
+drop older architectures and add newer ones.
+
+Each architecture is shipped in one of two forms:
+
+- **SASS** (`-real`): native machine code (cubin) for exactly that GPU architecture.
+  It loads and runs directly with full optimization, but only on that specific architecture.
+- **PTX** (`-virtual`): a forward-compatible virtual ISA. The driver JIT-compiles it to
+  SASS on first launch (then caches it). PTX built for `compute_X` can JIT onto any GPU
+  with compute capability `>= X` (never older).
+
+At runtime CUDA picks matching SASS if available; otherwise it JIT-compiles compatible PTX;
+otherwise the load fails (CUDA error 209, "no kernel image"). This is why at least one PTX
+(`-virtual`) entry is always shipped, so GPUs without native SASS still run via JIT.
+
+| Arch | Form | GPUs | CUDA 12.8 wheel | CUDA 13.2 wheel |
+|:-----|:-----|:-----|:---------------:|:---------------:|
+| `50`   | PTX  | Maxwell (GTX 9xx)            | ✅ JIT  | — |
+| `61`   | PTX  | Pascal (GTX 10xx, P40)       | ✅ JIT  | — |
+| `70`   | PTX  | Volta (V100)                 | ✅ JIT  | — |
+| `75`   | PTX  | Turing (RTX 20xx, T4)        | ✅ JIT  | ✅ JIT  |
+| `80`   | PTX  | Ampere DC (A100); also forward-covers Hopper (90) and Blackwell DC (100/103) via JIT | ✅ JIT  | ✅ JIT  |
+| `86`   | SASS | Ampere (RTX 30xx, A10/A40)   | ✅ native | ✅ native |
+| `89`   | SASS | Ada (RTX 40xx, L4/L40)       | ✅ native | ✅ native |
+| `120a` | SASS | Blackwell (RTX 50xx)         | ✅ native | ✅ native |
+| `121a` | SASS | Blackwell variant            | —       | ✅ native |
+
+Notes:
+
+- **CUDA 13 dropped Maxwell/Pascal/Volta**, so `50`/`61`/`70` are omitted there; the CUDA 13
+  floor is `sm_75` (Turing) via the `75-virtual` PTX.
+- **`121a` requires CUDA >= 12.9**, so it is only present in the CUDA 13.2 wheel.
+- **Native SASS is shipped only for mainstream consumer cards.** Datacenter parts
+  (A100/Hopper/Blackwell-DC) run via JIT from the `80-virtual` PTX, which works but incurs a
+  one-time JIT compile on first launch.
+- **Local source builds** (where `CUDA_ARCHITECTURES` is unset) instead use CMake's `native`,
+  producing native SASS only for the GPUs detected on the build machine — see
+  `scripts/build.py`. If you need native datacenter performance, build from source on that
+  hardware or set `CUDA_ARCHITECTURES` to a list that includes `90-real` / `100-real`.
+
+The same architecture lists apply to both Linux (x86_64 and arm64) and Windows CUDA wheels,
+since the `sm_XX` value is the GPU architecture and is independent of the host CPU.
+
 ## Build from Source
 
 ### (Optional) Preparation

diff --git a/scripts/build.py b/scripts/build.py
@@ -5,7 +5,6 @@
 
 import os
 import platform
-import re
 import shlex
 import shutil
 import subprocess
@@ -44,33 +43,6 @@ def split_cmake_args(value: str) -> list[str]:
     return parts
 
 
-def detect_cuda_architectures() -> str:
-    log("=== Detecting supported GPU architectures ===")
-    try:
-        output = subprocess.check_output(
-            ["nvcc", "--list-gpu-arch"], text=True, stderr=subprocess.STDOUT
-        )
-    except (OSError, subprocess.CalledProcessError) as exc:
-        raise SystemExit(
-            "CUDA_ARCHITECTURES is not set and `nvcc --list-gpu-arch` failed. "
-            "Install CUDA tools or set CUDA_ARCHITECTURES explicitly."
-        ) from exc
-
-    print(output, end="", flush=True)
-    archs: set[int] = set()
-    for line in output.splitlines():
-        match = re.fullmatch(r"(?:sm|compute)_(\d+)", line.strip())
-        if match:
-            arch = int(match.group(1))
-            if arch >= 70:
-                archs.add(arch)
-
-    if not archs:
-        raise SystemExit("nvcc did not report any CUDA architectures >= 70")
-
-    return ";".join("120a" if arch == 120 else str(arch) for arch in sorted(archs))
-
-
 def hip_compiler() -> str:
     try:
         hip_root = subprocess.check_output(
@@ -134,7 +106,16 @@ def build_llamacpp() -> None:
 
     if env_is_set("XLLAMACPP_BUILD_CUDA"):
         log("Building for CUDA")
-        cuda_archs = os.environ.get("CUDA_ARCHITECTURES") or detect_cuda_architectures()
+        # CI pipelines pin CUDA_ARCHITECTURES to a curated list to keep build
+        # times under the runner limit (a few -real archs + PTX fallbacks).
+        #
+        # When unset (i.e. a user building locally for their own use), fall back
+        # to CMake's "native" keyword. Per the CMake docs this detects the GPUs
+        # actually installed on the build machine and compiles SASS only for
+        # those architectures. That keeps the build fast and produces fully
+        # arch-optimized code for the local hardware -- at the cost of a binary
+        # that is not portable to other GPU architectures.
+        cuda_archs = os.environ.get("CUDA_ARCHITECTURES") or "native"
         log(f"Using CUDA architectures: {cuda_archs}")
         cmake_args.extend(
             [

diff --git a/scripts/download_models.py b/scripts/download_models.py
@@ -13,7 +13,10 @@
 """
 
 import hashlib
+import http.client
 import os
+import socket
+import ssl
 import sys
 import threading
 import time
@@ -30,8 +33,20 @@
 READ_CHUNK_SIZE = 1 << 23  # 8 MB — buffer size for network reads / disk writes.
 MAX_HTTP_RETRIES = 3
 RATE_LIMIT_BASE_SLEEP = 30.0
+NETWORK_ERROR_BASE_SLEEP = 2.0
 REQUEST_START_DELAY = 1.0
 
+# Transient connection-level errors worth retrying (network blips, dropped TLS
+# connections, slow servers). These are distinct from HTTP status errors.
+TRANSIENT_NETWORK_ERRORS = (
+    ssl.SSLError,
+    socket.timeout,
+    TimeoutError,
+    ConnectionError,
+    http.client.IncompleteRead,
+    http.client.RemoteDisconnected,
+)
+
 
 @dataclass(frozen=True)
 class RetryDelay:
@@ -140,7 +155,7 @@ def _sleep_before_request() -> None:
 
 
 def open_url_with_retries(req: urllib.request.Request, timeout: int, description: str):
-    """Open a URL, retrying HTTP 429 with backoff."""
+    """Open a URL, retrying HTTP 429 and transient network errors with backoff."""
     request_timeout = timeout
     for attempt in range(MAX_HTTP_RETRIES + 1):
         try:
@@ -163,6 +178,25 @@ def open_url_with_retries(req: urllib.request.Request, timeout: int, description
                 flush=True,
             )
             time.sleep(sleep_for)
+        except (urllib.error.URLError,) + TRANSIENT_NETWORK_ERRORS as err:
+            # urllib wraps SSL/connection errors in URLError(reason=...).
+            reason = getattr(err, "reason", err)
+            if not isinstance(err, TRANSIENT_NETWORK_ERRORS) and not isinstance(
+                reason, TRANSIENT_NETWORK_ERRORS
+            ):
+                # A non-transient URLError (e.g. DNS failure): don't retry.
+                raise
+            if attempt == MAX_HTTP_RETRIES:
+                raise
+
+            sleep_for = NETWORK_ERROR_BASE_SLEEP * (2**attempt)
+            print(
+                f"Network error while requesting {description}: {reason}; "
+                f"sleeping {sleep_for:.0f}s before retry {attempt + 1}/"
+                f"{MAX_HTTP_RETRIES}...",
+                flush=True,
+            )
+            time.sleep(sleep_for)
 
 
 def get_file_info(url: str) -> tuple[str, int] | None:
@@ -294,11 +328,33 @@ def download_one(
             dest.unlink()
 
     print(f"Downloading {name} ({CHUNKS_PER_FILE} connections)...", flush=True)
-    try:
-        download_file(url, dest, CHUNKS_PER_FILE, chunk_pool)
-    except Exception as e:
-        dest.unlink(missing_ok=True)
-        raise RuntimeError(f"{name}: download failed -- {e}")
+    last_err: Exception | None = None
+    for attempt in range(MAX_HTTP_RETRIES + 1):
+        try:
+            download_file(url, dest, CHUNKS_PER_FILE, chunk_pool)
+            last_err = None
+            break
+        except urllib.error.HTTPError:
+            # HTTP-status errors (404, etc.) are not transient; fail fast.
+            dest.unlink(missing_ok=True)
+            raise RuntimeError(f"{name}: download failed -- {sys.exc_info()[1]}")
+        except Exception as e:
+            # Any other error is assumed transient (SSL EOF, reset, timeout):
+            # the body read may have died mid-stream, so retry the whole file.
+            last_err = e
+            dest.unlink(missing_ok=True)
+            if attempt == MAX_HTTP_RETRIES:
+                break
+            sleep_for = NETWORK_ERROR_BASE_SLEEP * (2**attempt)
+            print(
+                f"  {name}: download error ({e}); "
+                f"retrying in {sleep_for:.0f}s "
+                f"({attempt + 1}/{MAX_HTTP_RETRIES})...",
+                flush=True,
+            )
+            time.sleep(sleep_for)
+    if last_err is not None:
+        raise RuntimeError(f"{name}: download failed -- {last_err}")
 
     actual = sha256_file(dest)
     if actual != expected_sha: