diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index bf27ffe47..2a664f153 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -12,12 +12,35 @@ on:
   # event=tag` + `type=semver` rules below.
   release:
     types: [published]
-  # Build + push the rolling `:cuda12` tag on every main merge so the public
-  # image tracks main. The metadata-action `enable=` rule below gates the
-  # moving tag on `github.ref == refs/heads/main`, and the build step's
-  # `push:` condition includes push events on main.
+  # Build + push the rolling `:cuda12` / `:rocm` tags when an image-affecting
+  # file changes on main, so the public images track main without a ~2h
+  # rebuild on every unrelated commit (docs, harness, server tweaks that
+  # don't reach the image). Same paths as the PR guard below, plus the other
+  # trees Dockerfile.rocm COPYs. The metadata-action `enable=` rule gates the
+  # moving tag on `github.ref == refs/heads/main`; the build step's `push:`
+  # condition includes push events on main.
   push:
     branches: [main]
+    paths:
+      - Dockerfile
+      - Dockerfile.rocm
+      - docker-bake.hcl
+      - .dockerignore
+      - .github/workflows/docker.yml
+      - server/CMakeLists.txt
+      - server/src/**
+      - server/test/**
+      - server/include/**
+      - server/scripts/**
+      - server/deps/**
+      - server/hip_compat/**
+      - server/share/**
+      - server/pyproject.toml
+      - share/model_cards/**
+      - optimizations/pflash/**
+      - optimizations/megakernel/pyproject.toml
+      - pyproject.toml
+      - uv.lock
   # Build-only CI guard on PRs that touch the docker surface. We never push
   # from a PR — even if we wanted to, GITHUB_TOKEN on PRs from forks lacks
   # `packages:write`. The point is to catch Dockerfile / bake-file / arch-
@@ -25,6 +48,7 @@ on:
   pull_request:
     paths:
       - Dockerfile
+      - Dockerfile.rocm
       - docker-bake.hcl
       - .dockerignore
       - .github/workflows/docker.yml
@@ -76,7 +100,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        variant: [cuda12]
+        variant: [cuda12, rocm]
     steps:
       - name: Free runner disk space
         # The default ubuntu-latest image keeps ~25 GB of preinstalled
diff --git a/Dockerfile b/Dockerfile
index 06355d6df..8f624b200 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -148,15 +148,15 @@ ARG BUILD_TIME=""
 RUN apt-get update && apt-get install -y --no-install-recommends \
         ca-certificates \
         curl \
-        docker.io \
         libgomp1 \
         pciutils \
     && rm -rf /var/lib/apt/lists/*
 
 # uv manages Python 3.12 (required by the workspace) and resolves the
 # lucebox-dflash + pflash members declared in pyproject.toml.
-RUN curl -LsSf https://astral.sh/uv/install.sh \
-        | env UV_INSTALL_DIR=/usr/local/bin UV_NO_MODIFY_PATH=1 INSTALLER_NO_MODIFY_PATH=1 sh
+# The pinned uv binaries are copied from the official image instead of using
+# `curl | sh`, so the version is fixed and no remote installer runs at build time.
+COPY --from=ghcr.io/astral-sh/uv:0.11.2 /uv /uvx /usr/local/bin/
 
 # Install Python to a world-readable location, not /root/.local/share/uv/
 # (the default). The container runs as the host UID for bind-mount sanity
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
new file mode 100644
index 000000000..4b0787c6c
--- /dev/null
+++ b/Dockerfile.rocm
@@ -0,0 +1,215 @@
+# syntax=docker/dockerfile:1.7
+
+# ─── Stage 1: builder ───────────────────────────────────────────────────────
+# ROCm/HIP sibling of the CUDA Dockerfile. ROCM_VERSION / UBUNTU_VERSION /
+# DFLASH_HIP_ARCHES are build args so the same Dockerfile can be repinned.
+#   • lucebox-hub:rocm  — ROCm 6.4.x, gfx1151 (+ optional fat gfx list)
+# See docker-bake.hcl for the canonical invocation.
+#
+# NOTE: gfx1151 (Strix Halo / Ryzen AI MAX) needs ROCm >= 6.4.1. The default
+# stays on 6.4.1 because the 7.2.x stack has shown intermittent problems on
+# Strix Halo. The flip side: against a ROCm 7.x HOST driver the 6.4.x
+# userspace can segfault at model load (seen on gfx1151 + host ROCm 7.2.2:
+# SIGSEGV in backend creation, bogus 1.28 TB VRAM report) — in that case
+# rebuild with ROCM_VERSION=7.2.2 to match the host. Rule of thumb: keep the
+# base's major version aligned with the host driver.
+ARG ROCM_VERSION=6.4.1
+ARG UBUNTU_VERSION=22.04
+FROM rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION} AS builder
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Fat-binary HIP arch list, semicolon-separated. Each gfx target adds build
+# time + kernel code the same way CUDA arches do, so narrow this to your host
+# for fast local builds:  DFLASH_HIP_ARCHES=gfx1151 docker buildx bake rocm-local
+# dflash-supported gfx targets (each provides the feature set dflash assumes,
+# equivalent to CUDA sm_75+; pre-RDNA/CDNA1 parts are excluded):
+#   gfx1151  Strix Halo / Ryzen AI MAX+ 395  (the lucebox appliance iGPU)
+#   gfx1100  RDNA3        RX 7900 XTX / W7900
+#   gfx1200  RDNA4        RX 9070
+#   gfx942   CDNA3        MI300X / MI300A
+#   gfx90a   CDNA2        MI200 / MI250
+ARG DFLASH_HIP_ARCHES="gfx1151"
+
+# ROCm toolchain on PATH (hipcc + amdclang). The rocm/dev-ubuntu base installs
+# ROCm at /opt/rocm; ROCM_PATH lets server/CMakeLists.txt resolve the HIP rpath
+# and rocwmma header roots (server/CMakeLists.txt:33-41).
+ENV ROCM_PATH=/opt/rocm
+ENV PATH=/opt/rocm/bin:/opt/rocm/lib/llvm/bin:${PATH}
+
+# Unlike the CUDA image there is NO driver-stub symlink step: the ROCm base
+# ships the real HIP runtime libs (libamdhip64.so etc.) and the host kernel
+# driver (/dev/kfd, /dev/dri) is wired in at run time via --device.
+# hipblas/rocblas: ggml's HIP backend hard-requires them
+# (deps/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt does find_package(hipblas)
+# for its BLAS matmul path). The rocm/dev-ubuntu base ships the HIP toolchain
+# but NOT the math libs, so they are installed explicitly from the ROCm apt
+# repo the base image already configures.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        cmake \
+        curl \
+        git \
+        git-lfs \
+        hipblas-dev \
+        libcurl4-openssl-dev \
+        ninja-build \
+        pkg-config \
+        python3 \
+        rocblas-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /src
+
+# COPY ordering mirrors the CUDA Dockerfile: C++ build inputs first so the
+# (slow) HIP build layer stays cached across Python-only edits.
+COPY server/CMakeLists.txt /src/server/CMakeLists.txt
+COPY server/include /src/server/include
+COPY server/src /src/server/src
+COPY server/test /src/server/test
+COPY server/hip_compat /src/server/hip_compat
+COPY server/deps /src/server/deps
+# status.html: dflash_server's POST_BUILD copies server/share/status.html into
+# build/share/. Without this COPY the build links then dies on the missing file.
+COPY server/share /src/server/share
+
+# Submodules must be populated on the host before `docker build` (.git/ is
+# .dockerignore'd so they cannot be fetched inside the image).
+RUN test -f /src/server/deps/llama.cpp/ggml/CMakeLists.txt \
+    || (echo "ERROR: server/deps/llama.cpp submodule not initialised. Run on host:" >&2 \
+        && echo "       git submodule update --init --recursive" >&2 \
+        && exit 1)
+
+# Configure + build for HIP. DFLASH27B_GPU_BACKEND=hip selects the ggml-hip
+# backend (server/CMakeLists.txt:70-78); DFLASH27B_HIP_ARCHITECTURES pins the
+# gfx list. Block-Sparse-Attention is a CUDA-only kernel set, so BSA is OFF for
+# HIP; FA_ALL_QUANTS OFF keeps the fattn build tractable (matches the lucebox
+# native HIP build). CMAKE_BUILD_WITH_INSTALL_RPATH bakes the $ORIGIN-relative
+# rpath (incl. ggml-hip + ROCm lib) so the binaries find their .so files after
+# the COPY into the runtime stage.
+RUN cmake -S /src/server -B /src/server/build \
+        -G Ninja \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
+        -DDFLASH27B_GPU_BACKEND=hip \
+        -DDFLASH27B_HIP_ARCHITECTURES="${DFLASH_HIP_ARCHES}" \
+        -DDFLASH27B_FA_ALL_QUANTS=OFF \
+        -DDFLASH27B_ENABLE_BSA=OFF \
+    && cmake --build /src/server/build --target test_dflash dflash_server test_server_unit --parallel
+
+# Keep only what the runtime stage needs: dflash_server, test_dflash,
+# test_server_unit and the lib*.so* files under deps/ ($ORIGIN/deps/... rpath).
+# NOTE(review): this also deletes build/share (status.html) — confirm unused at runtime.
+RUN cd /src/server/build \
+    && find . -mindepth 1 -maxdepth 1 \
+            ! -name test_dflash ! -name dflash_server ! -name test_server_unit ! -name deps -exec rm -rf {} + \
+    && find deps -mindepth 1 -type f ! -name 'lib*.so*' -delete \
+    && find deps -depth -type d -empty -delete
+
+# Python sources, workspace manifests, lockfile, READMEs (same as CUDA stage).
+COPY pyproject.toml uv.lock README.md /src/
+COPY server/pyproject.toml server/README.md /src/server/
+COPY server/scripts /src/server/scripts
+COPY optimizations/pflash /src/optimizations/pflash
+COPY optimizations/megakernel /src/optimizations/megakernel
+
+# ─── Stage 2: runtime ───────────────────────────────────────────────────────
+# Runtime reuses the ROCm base so the HIP runtime libs (libamdhip64,
+# libhsa-runtime64, librocm-core, …) the binaries link against are present and
+# version-matched to the builder. This makes a larger image than a minimal
+# runtime would; slimming to `ubuntu:${UBUNTU_VERSION}` + the AMD repo's
+# `rocm-language-runtime` meta-package is a follow-up optimisation (the build
+# stage is unaffected, so it can be done independently).
+FROM rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION} AS runtime
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Image identity baked in for /props.build (git_sha / image_tag / build_time),
+# wired from docker-bake.hcl. Empty in non-bake builds → JSON null at /props.
+ARG GIT_SHA=""
+ARG IMAGE_TAG=""
+ARG BUILD_TIME=""
+
+ENV ROCM_PATH=/opt/rocm
+ENV PATH=/opt/rocm/bin:${PATH}
+
+# hipblas/rocblas runtime libs: the dflash binaries link ggml-hip against
+# them (see builder stage). Their .so files must be present at runtime for
+# the server to load.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+        hipblas \
+        libgomp1 \
+        pciutils \
+        rocblas \
+    && rm -rf /var/lib/apt/lists/*
+
+# uv (pinned) manages Python 3.12 + resolves the lucebox-dflash + pflash
+# workspace. Copied from the official image rather than `curl | sh`, so the
+# version is fixed and no remote installer script runs at build time.
+COPY --from=ghcr.io/astral-sh/uv:0.11.2 /uv /uvx /usr/local/bin/
+
+ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python \
+    UV_TOOL_DIR=/opt/uv/tools
+
+WORKDIR /opt/lucebox-hub
+
+COPY --from=builder /src/pyproject.toml /src/uv.lock /src/README.md /opt/lucebox-hub/
+COPY --from=builder /src/optimizations/pflash /opt/lucebox-hub/optimizations/pflash
+COPY --from=builder /src/optimizations/megakernel/pyproject.toml \
+                   /src/optimizations/megakernel/README.md \
+                   /opt/lucebox-hub/optimizations/megakernel/
+
+COPY --from=builder /src/server/scripts /opt/lucebox-hub/server/scripts
+COPY --from=builder /src/server/pyproject.toml /src/server/README.md \
+                   /opt/lucebox-hub/server/
+COPY --from=builder /src/server/build /opt/lucebox-hub/server/build
+
+COPY share/model_cards /opt/lucebox-hub/share/model_cards
+RUN mkdir -p /opt/lucebox-hub/server/share \
+    && ln -s /opt/lucebox-hub/share/model_cards \
+             /opt/lucebox-hub/server/share/model_cards
+
+RUN test -x /opt/lucebox-hub/server/build/test_dflash \
+    && test -x /opt/lucebox-hub/server/build/dflash_server \
+    && test -x /opt/lucebox-hub/server/build/test_server_unit \
+    && test -f /opt/lucebox-hub/server/share/model_cards/qwen3.6-27b.json \
+    && chmod +x /opt/lucebox-hub/server/scripts/entrypoint.sh
+
+RUN printf '%s\n%s\n%s\n' "$GIT_SHA" "$IMAGE_TAG" "$BUILD_TIME" \
+        > /opt/lucebox-hub/IMAGE_INFO
+
+# Register the ggml lib dirs with ld.so. HIP names its backend lib dir
+# `ggml-hip` (server/CMakeLists.txt:74), where the CUDA image uses `ggml-cuda`.
+RUN printf '%s\n%s\n' \
+        /opt/lucebox-hub/server/build/deps/llama.cpp/ggml/src \
+        /opt/lucebox-hub/server/build/deps/llama.cpp/ggml/src/ggml-hip \
+        > /etc/ld.so.conf.d/lucebox-ggml.conf \
+    && ldconfig
+
+ENV UV_LINK_MODE=hardlink \
+    UV_NO_CACHE=1
+RUN uv sync --no-dev --frozen --no-editable 2>/dev/null \
+    || uv sync --no-dev --frozen --no-editable
+
+RUN chmod -R a+rX /opt/lucebox-hub/.venv /opt/lucebox-hub /opt/uv
+
+# Models live in server/models/ — bind-mount or volume them in.
+# ROCm run example (note: AMD uses --device, not --gpus):
+#   docker run --rm --device /dev/kfd --device /dev/dri \
+#       --group-add video --group-add render \
+#       --security-opt seccomp=unconfined -p 8000:8080 \
+#       -v "$PWD/server/models:/opt/lucebox-hub/server/models" \
+#       ghcr.io/luce-org/lucebox-hub:rocm
+VOLUME ["/opt/lucebox-hub/server/models"]
+
+ENV DFLASH_HOST=0.0.0.0 \
+    DFLASH_PORT=8080 \
+    DFLASH_BIN=/opt/lucebox-hub/server/build/test_dflash \
+    DFLASH_SERVER_BIN=/opt/lucebox-hub/server/build/dflash_server
+
+EXPOSE 8080
+
+ENTRYPOINT ["/opt/lucebox-hub/server/scripts/entrypoint.sh"]
diff --git a/Makefile b/Makefile
index 49a2700eb..189a00cb8 100644
--- a/Makefile
+++ b/Makefile
@@ -51,7 +51,7 @@ build:  ## Build lucebox-hub:cuda12 locally via docker buildx bake.
 
 .PHONY: serve
 serve:  ## Run the local image, foreground. Models bind-mounted from $(MODELS_DIR).
-	docker run --rm --gpus all -p 8080:8080 \
+	docker run --rm --gpus all -p 8000:8080 \
 		-v $(MODELS_DIR):/opt/lucebox-hub/server/models:ro \
 		--name lucebox-gemma \
 		$(IMAGE) serve
diff --git a/README.md b/README.md
index 70da77c89..8e0bff048 100644
--- a/README.md
+++ b/README.md
@@ -157,28 +157,51 @@ Prebuilt images on GHCR track `main`. No CUDA toolkit or build needed. Pull the
 
 <table>
 <tr>
-<td width="50%" valign="middle">
+<td width="38%" valign="middle">
 
-```bash
-# NVIDIA (CUDA 12+)
-docker run --rm --gpus all -p 8000:8080 \
-  -v "$PWD/server/models:/opt/lucebox-hub/server/models" \
-  ghcr.io/luce-org/lucebox-hub:cuda12
-```
+| GPU | Image tag |
+|-----|-----------|
+| NVIDIA (CUDA 12+) | `:cuda12` |
+| AMD (ROCm 6+) | `:rocm` |
 
 Drop a GGUF model target into `server/models/` first, then
 `:8000/v1/chat/completions`. Full tutorial in the
 [Docker blog](https://lucebox.com/blog/docker).
 
 </td>
-<td width="50%" valign="middle">
+<td width="62%" valign="middle">
 
-<a href="https://lucebox.com/blog/docker"><img src="assets/docker.png" alt="Lucebox prebuilt Docker images" width="100%" /></a>
+<a href="https://lucebox.com/blog/docker"><img src="assets/docker.png" alt="Lucebox prebuilt Docker images for NVIDIA and AMD" width="100%" /></a>
 
 </td>
 </tr>
 </table>
 
+**Install and run:**
+
+```bash
+# 1. Pull the image for your GPU
+docker pull ghcr.io/luce-org/lucebox-hub:cuda12   # NVIDIA
+docker pull ghcr.io/luce-org/lucebox-hub:rocm     # AMD
+
+# 2. Download a target model into server/models/
+hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf \
+  --local-dir server/models/
+
+# 3a. NVIDIA (CUDA 12+)
+docker run --rm --gpus all -p 8000:8080 \
+  -v "$PWD/server/models:/opt/lucebox-hub/server/models" \
+  ghcr.io/luce-org/lucebox-hub:cuda12
+
+# 3b. AMD (ROCm 6+, Strix Halo / RX 7900)
+docker run --rm --device /dev/kfd --device /dev/dri \
+  --group-add video --group-add render --security-opt seccomp=unconfined \
+  -p 8000:8080 -v "$PWD/server/models:/opt/lucebox-hub/server/models" \
+  ghcr.io/luce-org/lucebox-hub:rocm
+```
+
+Then send requests to `http://localhost:8000/v1/chat/completions` (OpenAI-compatible API).
+
 ## Run the Server
 
 Default: Qwen 3.6-27B Q4_K_M target + Lucebox Q4_K_M DFlash drafter on RTX 3090. DDTree budget=22, TQ3_0 KV cache, sliding FA window 2048. OpenAI-compatible HTTP on `:8000`.
diff --git a/assets/docker.png b/assets/docker.png
index bc9f3b7f6..2f3b75d1d 100644
Binary files a/assets/docker.png and b/assets/docker.png differ
diff --git a/docker-bake.hcl b/docker-bake.hcl
index 22149fd4a..8b377674b 100644
--- a/docker-bake.hcl
+++ b/docker-bake.hcl
@@ -44,6 +44,21 @@ variable "TAG"      { default = "" }
 # multiple arches.
 variable "DFLASH_CUDA_ARCHES" { default = "75;80;86;89;90;120" }
 
+# Fat-binary HIP/gfx arch list for the rocm variant (semicolon-separated).
+# Default is gfx1151 (Strix Halo, the lucebox appliance iGPU) only, to keep the
+# build tractable. Widen for a broadly-runnable released image, e.g.:
+#   DFLASH_HIP_ARCHES="gfx1151;gfx1100;gfx1200;gfx942;gfx90a" docker buildx bake rocm
+# (gfx1151 Strix Halo, gfx1100 RX7900/RDNA3, gfx1200 RDNA4, gfx942 MI300,
+# gfx90a MI200.)
+variable "DFLASH_HIP_ARCHES" { default = "gfx1151" }
+
+# ROCm base-image tag for the rocm variant. gfx1151 needs >= 6.4.1. Default
+# stays 6.4.1 (7.2.x has shown intermittent problems on Strix Halo), but on a
+# ROCm 7.x HOST driver the 6.4.x userspace can segfault at model load — set
+# ROCM_VERSION=7.2.2 there. Keep the base aligned with the host driver (see
+# Dockerfile.rocm).
+variable "ROCM_VERSION" { default = "6.4.1" }
+
 # Image identity stamped into /opt/lucebox-hub/IMAGE_INFO at build time and
 # surfaced under /props.build at runtime (git_sha, image_tag, build_time).
 # CI sets all three from the workflow context; local builds get a best-
@@ -73,6 +88,12 @@ group "default" {
     targets = ["cuda12-local"]
 }
 
+# Build every published variant locally (cuda + rocm). CI builds these as a
+# matrix; this group is the local equivalent for a full two-image build.
+group "all" {
+    targets = ["cuda12-local", "rocm-local"]
+}
+
 # CI integration. docker/metadata-action in .github/workflows/docker.yml
 # emits a bake-file that defines a `docker-metadata-action` target carrying
 # tags + labels derived from the ref. Both build targets inherit from it.
@@ -108,3 +129,29 @@ target "cuda12-local" {
     inherits = ["_cuda12-base"]
     tags = image_tags("cuda12")
 }
+
+# ── ROCm / HIP ───────────────────────────────────────────────────────────────
+# AMD GPU build from Dockerfile.rocm: gfx1151 (Strix Halo) by default, widen via
+# DFLASH_HIP_ARCHES for a broadly-runnable image. Block-Sparse-Attention is
+# CUDA-only and disabled in this variant (see Dockerfile.rocm).
+target "_rocm-base" {
+    context    = "."
+    dockerfile = "Dockerfile.rocm"
+    args = {
+        ROCM_VERSION      = ROCM_VERSION
+        UBUNTU_VERSION    = "22.04"  # fixed here; matches Dockerfile.rocm's ARG default
+        DFLASH_HIP_ARCHES = DFLASH_HIP_ARCHES
+        GIT_SHA           = GIT_SHA
+        IMAGE_TAG         = IMAGE_TAG
+        BUILD_TIME        = BUILD_TIME
+    }
+}
+
+target "rocm" {
+    inherits = ["_rocm-base", "docker-metadata-action"]  # CI: tags + labels come from metadata-action
+}
+
+target "rocm-local" {
+    inherits = ["_rocm-base"]
+    tags = image_tags("rocm")  # local build: tags from image_tags() (defined outside this hunk)
+}
diff --git a/harness/benchmarks/generation_benchmark.py b/harness/benchmarks/generation_benchmark.py
index f99c903e6..2ca51a272 100755
--- a/harness/benchmarks/generation_benchmark.py
+++ b/harness/benchmarks/generation_benchmark.py
@@ -353,7 +353,7 @@ def cmd_run(args: argparse.Namespace) -> int:
         "name": args.name,
         "url": args.url,
         "model": args.model,
-        "created_at": dt.datetime.now(dt.timezone.utc).isoformat(),
+        "created_at": dt.datetime.now(dt.UTC).isoformat(),
         "prompts": str(Path(args.prompts)),
         "max_tokens": args.max_tokens,
         "temperature": args.temperature,
@@ -428,7 +428,7 @@ def cmd_compare(args: argparse.Namespace) -> int:
         summary["mean_speedup"] = None
 
     report = {
-        "created_at": dt.datetime.now(dt.timezone.utc).isoformat(),
+        "created_at": dt.datetime.now(dt.UTC).isoformat(),
         "baseline_report": str(Path(args.baseline)),
         "candidate_report": str(Path(args.candidate)),
         "summary": summary,
diff --git a/harness/client_test_runner.py b/harness/client_test_runner.py
index ebeed0fdf..2140b998f 100755
--- a/harness/client_test_runner.py
+++ b/harness/client_test_runner.py
@@ -22,8 +22,6 @@
 import signal
 import socket
 import subprocess
-import sys
-import tempfile
 import time
 import urllib.error
 import urllib.request
@@ -32,7 +30,6 @@
 from pathlib import Path
 from typing import Any
 
-
 ROOT = Path(__file__).resolve().parent.parent
 DEFAULT_WORK_DIR = ROOT / ".harness-work"
 MODEL = "luce-dflash"
@@ -982,7 +979,7 @@ def wait_http(base_url: str, proc: subprocess.Popen | None = None, timeout: int
             status, _body, _elapsed = http_json("GET", base_url + "/health", timeout=2)
             if status == 200:
                 return True
-        except (urllib.error.URLError, TimeoutError, ConnectionResetError, socket.timeout):
+        except (urllib.error.URLError, TimeoutError, ConnectionResetError):
             pass
         time.sleep(1)
     return False
@@ -1552,7 +1549,6 @@ def _score_he_response(text: str, entry_point: str, gold_test: str) -> tuple[boo
     Returns (correct, detail_str).
     """
     import subprocess as _sp
-    import tempfile as _tmp
 
     think_end = text.rfind("</think>")
     answer_text = text[think_end + len("</think>"):] if think_end >= 0 else text
@@ -1883,7 +1879,7 @@ def cmd_bench(args: argparse.Namespace) -> int:
         if status != 200:
             print(f"[bench] WARNING: server health check returned {status}", flush=True)
     except Exception as exc:
-        raise SystemExit(f"[bench] cannot reach server at {base_url}/health: {exc}")
+        raise SystemExit(f"[bench] cannot reach server at {base_url}/health: {exc}") from exc
 
     print(f"[bench] url={base_url}  model={model}  suites={','.join(selected)}", flush=True)
 
@@ -1903,7 +1899,7 @@ def cmd_bench(args: argparse.Namespace) -> int:
     }
 
     # Final summary
-    print(f"\n[bench] === SUMMARY ===", flush=True)
+    print("\n[bench] === SUMMARY ===", flush=True)
     print(f"{'Suite':>8s}  {'OK':>5s}  {'Wall':>7s}  {'TTFT':>7s}  {'Pf tok/s':>9s}  "
           f"{'Out tok/s':>10s}  {'Out tok':>8s}  {'Score':>10s}", flush=True)
     for suite, s in all_suites.items():
diff --git a/harness/clients/llamacpp_compat_proxy.py b/harness/clients/llamacpp_compat_proxy.py
index 9f8ce785e..ad6922bf4 100644
--- a/harness/clients/llamacpp_compat_proxy.py
+++ b/harness/clients/llamacpp_compat_proxy.py
@@ -378,7 +378,7 @@ class Handler(BaseHTTPRequestHandler):
     max_tokens_cap = 0
 
     def log_message(self, fmt, *args):
-        print("[%s] %s" % (self.log_date_time_string(), fmt % args), flush=True)
+        print(f"[{self.log_date_time_string()}] {fmt % args}", flush=True)
 
     def send_json(self, status: int, obj: dict):
         data = json.dumps(obj).encode("utf-8")
diff --git a/harness/clients/session_inject_proxy.py b/harness/clients/session_inject_proxy.py
index 8cebab81e..b07a34e3f 100755
--- a/harness/clients/session_inject_proxy.py
+++ b/harness/clients/session_inject_proxy.py
@@ -17,13 +17,11 @@
 from __future__ import annotations
 
 import argparse
+import http.client
 import json
 import os
-import socket
-import threading
 from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
 from urllib.parse import urlparse
-import http.client
 
 
 class Handler(BaseHTTPRequestHandler):
@@ -73,7 +71,7 @@ def _relay_response(self, resp: http.client.HTTPResponse):
                     self.wfile.write(b"0\r\n\r\n")
                     self.wfile.flush()
                     break
-                size = "%X\r\n" % len(chunk)
+                size = f"{len(chunk):X}\r\n"
                 self.wfile.write(size.encode("ascii"))
                 self.wfile.write(chunk)
                 self.wfile.write(b"\r\n")
diff --git a/harness/clients/summarize_backend_pair.py b/harness/clients/summarize_backend_pair.py
index cd01274af..5062cac77 100755
--- a/harness/clients/summarize_backend_pair.py
+++ b/harness/clients/summarize_backend_pair.py
@@ -8,7 +8,6 @@
 import sys
 from pathlib import Path
 
-
 LUCEBOX_DONE_RE = re.compile(r"(?:chat|responses|messages) DONE .*? in=(?P<prompt>\d+) out=(?P<out>\d+)")
 LUCEBOX_DECODE_RE = re.compile(r"decode=[^(]*\((?P<tps>[0-9.]+)tok/s\)")
 LUCEBOX_OVERALL_RE = re.compile(r"\s(?P<tps>[0-9.]+) tok/s\s+finish=")
diff --git a/pyproject.toml b/pyproject.toml
index b68870fef..56ae2bf4f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,13 +19,11 @@ dev = ["pytest>=8", "mypy>=1.10,<2", "ruff>=0.14,<1"]
 [tool.ruff]
 target-version = "py312"
 line-length = 100
-# Staged lint adoption for the Python surfaces touched by the Docker,
-# benchmark, autotune, and host-CLI stack. Untouched server-adjacent scripts and
-# vendored dependencies stay outside this gate until they are cleaned up.
-#
-# No Python package at the repo root to lint yet; workspace members add
-# their own include lists. The pyproject-driven ruff config still applies.
-include = []
+# Staged lint adoption: the host-CLI / harness tooling is gated now. The
+# server-internal and optimization Python (server/scripts, optimizations/*)
+# carries pre-existing style debt and is added to `include` as it is cleaned
+# up. Vendored deps stay excluded permanently (extend-exclude below).
+include = ["harness/**/*.py", "scripts/**/*.py"]
 extend-exclude = [
     "dflash/deps",
     "megakernel",
@@ -33,7 +31,10 @@ extend-exclude = [
 ]
 
 [tool.ruff.lint]
-select = ["E", "F", "I", "UP", "B"]
+# Correctness + imports + modernization + bugbear. All pycodestyle `E` rules
+# (not only line-length / E7xx style) are staged out for now so the gate stays
+# green without reflowing existing code; re-add `E` once the tracked surfaces are clean.
+select = ["F", "I", "UP", "B"]
 
 [tool.uv]
 package = false
@@ -64,8 +65,3 @@ torch = { index = "pytorch-cu128" }
 name = "pytorch-cu128"
 url = "https://download.pytorch.org/whl/cu128"
 explicit = true
-
-[dependency-groups]
-dev = [
-    "pytest>=9.0.3",
-]
diff --git a/scripts/build_image.sh b/scripts/build_image.sh
index 38bd34e03..054fd8850 100755
--- a/scripts/build_image.sh
+++ b/scripts/build_image.sh
@@ -4,6 +4,8 @@
 #
 # Tagging:
 #   * Untagged tree    → lucebox-hub:cuda12 (moving)
+#                      → lucebox-hub:<short-sha>-cuda12   (pinned, from
+#                        `git describe --always`)
 #   * Tagged `lucebox-v0.3.0` (clean checkout):
 #                      → lucebox-hub:cuda12
 #                      → lucebox-hub:0.3.0-cuda12
diff --git a/uv.lock b/uv.lock
index 3f5f64aa9..fee8de0df 100644
--- a/uv.lock
+++ b/uv.lock
@@ -480,11 +480,6 @@ megakernel = [
     { name = "qwen35-megakernel-bf16" },
 ]
 
-[package.dev-dependencies]
-dev = [
-    { name = "pytest" },
-]
-
 [package.metadata]
 requires-dist = [
     { name = "lucebox-dflash", virtual = "server" },
@@ -496,9 +491,6 @@ requires-dist = [
 ]
 provides-extras = ["megakernel", "dev"]
 
-[package.metadata.requires-dev]
-dev = [{ name = "pytest", specifier = ">=9.0.3" }]
-
 [[package]]
 name = "markdown-it-py"
 version = "4.2.0"