diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index bf27ffe47..2a664f153 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -12,12 +12,30 @@ on: # event=tag` + `type=semver` rules below. release: types: [published] - # Build + push the rolling `:cuda12` tag on every main merge so the public - # image tracks main. The metadata-action `enable=` rule below gates the - # moving tag on `github.ref == refs/heads/main`, and the build step's - # `push:` condition includes push events on main. + # Build + push the rolling `:cuda12` / `:rocm` tags when an image-affecting + # file changes on main, so the public images track main without a ~2h + # rebuild on every unrelated commit (docs, harness, server tweaks that + # don't reach the image). Same paths as the PR guard below. The + # metadata-action `enable=` rule gates the moving tag on + # `github.ref == refs/heads/main`; the build step's `push:` condition + # includes push events on main. push: branches: [main] + paths: + - Dockerfile + - Dockerfile.rocm + - docker-bake.hcl + - .dockerignore + - .github/workflows/docker.yml + - server/CMakeLists.txt + - server/src/** + - server/test/** + - server/include/** + - server/scripts/** + - server/deps/** + - server/pyproject.toml + - pyproject.toml + - uv.lock # Build-only CI guard on PRs that touch the docker surface. We never push # from a PR — even if we wanted to, GITHUB_TOKEN on PRs from forks lacks # `packages:write`. The point is to catch Dockerfile / bake-file / arch- @@ -25,6 +43,7 @@ on: pull_request: paths: - Dockerfile + - Dockerfile.rocm - docker-bake.hcl - .dockerignore - .github/workflows/docker.yml @@ -76,7 +95,7 @@ jobs: strategy: fail-fast: false matrix: - variant: [cuda12] + variant: [cuda12, rocm] steps: - name: Free runner disk space # The default ubuntu-latest image keeps ~25 GB of preinstalled diff --git a/Dockerfile b/Dockerfile index 06355d6df..8f624b200 100644 --- a/Dockerfile +++ b/Dockerfile @@ -148,15 +148,15 @@ ARG BUILD_TIME="" RUN apt-get update && apt-get install -y --no-install-recommends \ ca-certificates \ curl \ - docker.io \ libgomp1 \ pciutils \ && rm -rf /var/lib/apt/lists/* # uv manages Python 3.12 (required by the workspace) and resolves the # lucebox-dflash + pflash members declared in pyproject.toml. -RUN curl -LsSf https://astral.sh/uv/install.sh \ - | env UV_INSTALL_DIR=/usr/local/bin UV_NO_MODIFY_PATH=1 INSTALLER_NO_MODIFY_PATH=1 sh +# uv (pinned) copied from the official image rather than `curl | sh`, so the +# version is fixed and no remote installer script runs at build time. +COPY --from=ghcr.io/astral-sh/uv:0.11.2 /uv /uvx /usr/local/bin/ # Install Python to a world-readable location, not /root/.local/share/uv/ # (the default). The container runs as the host UID for bind-mount sanity diff --git a/Dockerfile.rocm b/Dockerfile.rocm new file mode 100644 index 000000000..4b0787c6c --- /dev/null +++ b/Dockerfile.rocm @@ -0,0 +1,215 @@ +# syntax=docker/dockerfile:1.7 + +# ─── Stage 1: builder ─────────────────────────────────────────────────────── +# ROCm/HIP sibling of the CUDA Dockerfile. ROCM_VERSION / UBUNTU_VERSION / +# DFLASH_HIP_ARCHES are build args so the same Dockerfile can be repinned. +# • lucebox-hub:rocm — ROCm 6.4.x, gfx1151 (+ optional fat gfx list) +# See docker-bake.hcl for the canonical invocation. +# +# NOTE: gfx1151 (Strix Halo / Ryzen AI MAX) needs ROCm >= 6.4.1. The default +# stays on 6.4.1 because the 7.2.x stack has shown intermittent problems on +# Strix Halo. The flip side: against a ROCm 7.x HOST driver the 6.4.x +# userspace can segfault at model load (seen on gfx1151 + host ROCm 7.2.2: +# SIGSEGV in backend creation, bogus 1.28 TB VRAM report) — in that case +# rebuild with ROCM_VERSION=7.2.2 to match the host. Rule of thumb: keep the +# base's major version aligned with the host driver. +ARG ROCM_VERSION=6.4.1 +ARG UBUNTU_VERSION=22.04 +FROM rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION} AS builder + +ARG DEBIAN_FRONTEND=noninteractive + +# Fat-binary HIP arch list, semicolon-separated. Each gfx target adds build +# time + kernel code the same way CUDA arches do, so narrow this to your host +# for fast local builds: DFLASH_HIP_ARCHES=gfx1151 docker buildx bake rocm-local +# dflash-supported gfx targets (all have the sm_75+ equivalent features dflash +# assumes; pre-RDNA/CDNA1 parts are excluded): +# gfx1151 Strix Halo / Ryzen AI MAX+ 395 (the lucebox appliance iGPU) +# gfx1100 RDNA3 RX 7900 XTX / W7900 +# gfx1200 RDNA4 RX 9070 +# gfx942 CDNA3 MI300X / MI300A +# gfx90a CDNA2 MI200 / MI250 +ARG DFLASH_HIP_ARCHES="gfx1151" + +# ROCm toolchain on PATH (hipcc + amdclang). The rocm/dev-ubuntu base installs +# ROCm at /opt/rocm; ROCM_PATH lets server/CMakeLists.txt resolve the HIP rpath +# and rocwmma header roots (server/CMakeLists.txt:33-41). +ENV ROCM_PATH=/opt/rocm +ENV PATH=/opt/rocm/bin:/opt/rocm/lib/llvm/bin:${PATH} + +# Unlike the CUDA image there is NO driver-stub symlink step: the ROCm base +# ships the real HIP runtime libs (libamdhip64.so etc.) and the host kernel +# driver (/dev/kfd, /dev/dri) is wired in at run time via --device. +# hipblas/rocblas: ggml's HIP backend hard-requires them +# (deps/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt does find_package(hipblas) +# for its BLAS matmul path). The rocm/dev-ubuntu base ships the HIP toolchain +# but NOT the math libs, so they are installed explicitly from the ROCm apt +# repo the base image already configures. +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + git \ + git-lfs \ + hipblas-dev \ + libcurl4-openssl-dev \ + ninja-build \ + pkg-config \ + python3 \ + rocblas-dev \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /src + +# COPY ordering mirrors the CUDA Dockerfile: C++ build inputs first so the +# (slow) HIP build layer stays cached across Python-only edits. +COPY server/CMakeLists.txt /src/server/CMakeLists.txt +COPY server/include /src/server/include +COPY server/src /src/server/src +COPY server/test /src/server/test +COPY server/hip_compat /src/server/hip_compat +COPY server/deps /src/server/deps +# status.html: dflash_server's POST_BUILD copies server/share/status.html into +# build/share/. Without this COPY the build links then dies on the missing file. +COPY server/share /src/server/share + +# Submodules must be populated on the host before `docker build` (.git/ is +# .dockerignore'd so they cannot be fetched inside the image). +RUN test -f /src/server/deps/llama.cpp/ggml/CMakeLists.txt \ + || (echo "ERROR: server/deps/llama.cpp submodule not initialised. Run on host:" >&2 \ + && echo " git submodule update --init --recursive" >&2 \ + && exit 1) + +# Configure + build for HIP. DFLASH27B_GPU_BACKEND=hip selects the ggml-hip +# backend (server/CMakeLists.txt:70-78); DFLASH27B_HIP_ARCHITECTURES pins the +# gfx list. Block-Sparse-Attention is a CUDA-only kernel set, so BSA is OFF for +# HIP; FA_ALL_QUANTS OFF keeps the fattn build tractable (matches the lucebox +# native HIP build). CMAKE_BUILD_WITH_INSTALL_RPATH bakes the $ORIGIN-relative +# rpath (incl. ggml-hip + ROCm lib) so the binaries find their .so files after +# the COPY into the runtime stage. +RUN cmake -S /src/server -B /src/server/build \ + -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ + -DDFLASH27B_GPU_BACKEND=hip \ + -DDFLASH27B_HIP_ARCHITECTURES="${DFLASH_HIP_ARCHES}" \ + -DDFLASH27B_FA_ALL_QUANTS=OFF \ + -DDFLASH27B_ENABLE_BSA=OFF \ + && cmake --build /src/server/build --target test_dflash dflash_server test_server_unit --parallel + +# Prune the build tree to only what the runtime stage needs: the native server, +# test_dflash, test_server_unit, and the ggml shared libs their embedded rpath +# ($ORIGIN/deps/...) looks up. +RUN cd /src/server/build \ + && find . -mindepth 1 -maxdepth 1 \ + ! -name test_dflash ! -name dflash_server ! -name test_server_unit ! -name deps -exec rm -rf {} + \ + && find deps -mindepth 1 -type f ! -name 'lib*.so*' -delete \ + && find deps -depth -type d -empty -delete + +# Python sources, workspace manifests, lockfile, READMEs (same as CUDA stage). +COPY pyproject.toml uv.lock README.md /src/ +COPY server/pyproject.toml server/README.md /src/server/ +COPY server/scripts /src/server/scripts +COPY optimizations/pflash /src/optimizations/pflash +COPY optimizations/megakernel /src/optimizations/megakernel + +# ─── Stage 2: runtime ─────────────────────────────────────────────────────── +# Runtime reuses the ROCm base so the HIP runtime libs (libamdhip64, +# libhsa-runtime64, librocm-core, …) the binaries link against are present and +# version-matched to the builder. This makes a larger image than a minimal +# runtime would; slimming to `ubuntu:${UBUNTU_VERSION}` + the AMD repo's +# `rocm-language-runtime` meta-package is a follow-up optimisation (the build +# stage is unaffected, so it can be done independently). +FROM rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION} AS runtime + +ARG DEBIAN_FRONTEND=noninteractive + +# Image identity baked in for /props.build (git_sha / image_tag / build_time), +# wired from docker-bake.hcl. Empty in non-bake builds → JSON null at /props. +ARG GIT_SHA="" +ARG IMAGE_TAG="" +ARG BUILD_TIME="" + +ENV ROCM_PATH=/opt/rocm +ENV PATH=/opt/rocm/bin:${PATH} + +# hipblas/rocblas runtime libs: the dflash binaries link ggml-hip against +# them (see builder stage). Their .so files must be present at runtime for +# the server to load. +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + hipblas \ + libgomp1 \ + pciutils \ + rocblas \ + && rm -rf /var/lib/apt/lists/* + +# uv (pinned) manages Python 3.12 + resolves the lucebox-dflash + pflash +# workspace. Copied from the official image rather than `curl | sh`, so the +# version is fixed and no remote installer script runs at build time. +COPY --from=ghcr.io/astral-sh/uv:0.11.2 /uv /uvx /usr/local/bin/ + +ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python \ + UV_TOOL_DIR=/opt/uv/tools + +WORKDIR /opt/lucebox-hub + +COPY --from=builder /src/pyproject.toml /src/uv.lock /src/README.md /opt/lucebox-hub/ +COPY --from=builder /src/optimizations/pflash /opt/lucebox-hub/optimizations/pflash +COPY --from=builder /src/optimizations/megakernel/pyproject.toml \ + /src/optimizations/megakernel/README.md \ + /opt/lucebox-hub/optimizations/megakernel/ + +COPY --from=builder /src/server/scripts /opt/lucebox-hub/server/scripts +COPY --from=builder /src/server/pyproject.toml /src/server/README.md \ + /opt/lucebox-hub/server/ +COPY --from=builder /src/server/build /opt/lucebox-hub/server/build + +COPY share/model_cards /opt/lucebox-hub/share/model_cards +RUN mkdir -p /opt/lucebox-hub/server/share \ + && ln -s /opt/lucebox-hub/share/model_cards \ + /opt/lucebox-hub/server/share/model_cards + +RUN test -x /opt/lucebox-hub/server/build/test_dflash \ + && test -x /opt/lucebox-hub/server/build/dflash_server \ + && test -x /opt/lucebox-hub/server/build/test_server_unit \ + && test -f /opt/lucebox-hub/server/share/model_cards/qwen3.6-27b.json \ + && chmod +x /opt/lucebox-hub/server/scripts/entrypoint.sh + +RUN printf '%s\n%s\n%s\n' "$GIT_SHA" "$IMAGE_TAG" "$BUILD_TIME" \ + > /opt/lucebox-hub/IMAGE_INFO + +# Register the ggml lib dirs with ld.so. HIP names its backend lib dir +# `ggml-hip` (server/CMakeLists.txt:74), where the CUDA image uses `ggml-cuda`. +RUN printf '%s\n%s\n' \ + /opt/lucebox-hub/server/build/deps/llama.cpp/ggml/src \ + /opt/lucebox-hub/server/build/deps/llama.cpp/ggml/src/ggml-hip \ + > /etc/ld.so.conf.d/lucebox-ggml.conf \ + && ldconfig + +ENV UV_LINK_MODE=hardlink \ + UV_NO_CACHE=1 +RUN uv sync --no-dev --frozen --no-editable 2>/dev/null \ + || uv sync --no-dev --frozen --no-editable + +RUN chmod -R a+rX /opt/lucebox-hub/.venv /opt/lucebox-hub /opt/uv + +# Models live in server/models/ — bind-mount or volume them in. +# ROCm run example (note: AMD uses --device, not --gpus): +# docker run --rm --device /dev/kfd --device /dev/dri \ +# --group-add video --group-add render \ +# --security-opt seccomp=unconfined -p 8080:8080 \ +# -v "$PWD/server/models:/opt/lucebox-hub/server/models" \ +# ghcr.io/luce-org/lucebox-hub:rocm +VOLUME ["/opt/lucebox-hub/server/models"] + +ENV DFLASH_HOST=0.0.0.0 \ + DFLASH_PORT=8080 \ + DFLASH_BIN=/opt/lucebox-hub/server/build/test_dflash \ + DFLASH_SERVER_BIN=/opt/lucebox-hub/server/build/dflash_server + +EXPOSE 8080 + +ENTRYPOINT ["/opt/lucebox-hub/server/scripts/entrypoint.sh"] diff --git a/Makefile b/Makefile index 49a2700eb..189a00cb8 100644 --- a/Makefile +++ b/Makefile @@ -51,7 +51,7 @@ build: ## Build lucebox-hub:cuda12 locally via docker buildx bake. .PHONY: serve serve: ## Run the local image, foreground. Models bind-mounted from $(MODELS_DIR). - docker run --rm --gpus all -p 8080:8080 \ + docker run --rm --gpus all -p 8000:8080 \ -v $(MODELS_DIR):/opt/lucebox-hub/server/models:ro \ --name lucebox-gemma \ $(IMAGE) serve diff --git a/README.md b/README.md index 70da77c89..8e0bff048 100644 --- a/README.md +++ b/README.md @@ -157,28 +157,51 @@ Prebuilt images on GHCR track `main`. No CUDA toolkit or build needed. Pull the - -
+ -```bash -# NVIDIA (CUDA 12+) -docker run --rm --gpus all -p 8000:8080 \ - -v "$PWD/server/models:/opt/lucebox-hub/server/models" \ - ghcr.io/luce-org/lucebox-hub:cuda12 -``` +| GPU | Image tag | +|-----|-----------| +| NVIDIA (CUDA 12+) | `:cuda12` | +| AMD (ROCm 6+) | `:rocm` | Drop a GGUF model target into `server/models/` first, then `:8000/v1/chat/completions`. Full tutorial in the [Docker blog](https://lucebox.com/blog/docker). + -Lucebox prebuilt Docker images +Lucebox prebuilt Docker images for NVIDIA and AMD
+**Install and run:** + +```bash +# 1. Pull the image for your GPU +docker pull ghcr.io/luce-org/lucebox-hub:cuda12 # NVIDIA +docker pull ghcr.io/luce-org/lucebox-hub:rocm # AMD + +# 2. Download a target model into server/models/ +hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf \ + --local-dir server/models/ + +# 3a. NVIDIA (CUDA 12+) +docker run --rm --gpus all -p 8000:8080 \ + -v "$PWD/server/models:/opt/lucebox-hub/server/models" \ + ghcr.io/luce-org/lucebox-hub:cuda12 + +# 3b. AMD (ROCm 6+, Strix Halo / RX 7900) +docker run --rm --device /dev/kfd --device /dev/dri \ + --group-add video --group-add render --security-opt seccomp=unconfined \ + -p 8000:8080 -v "$PWD/server/models:/opt/lucebox-hub/server/models" \ + ghcr.io/luce-org/lucebox-hub:rocm +``` + +Then hit `:8000/v1/chat/completions` (OpenAI-compatible). + ## Run the Server Default: Qwen 3.6-27B Q4_K_M target + Lucebox Q4_K_M DFlash drafter on RTX 3090. DDTree budget=22, TQ3_0 KV cache, sliding FA window 2048. OpenAI-compatible HTTP on `:8000`. diff --git a/assets/docker.png b/assets/docker.png index bc9f3b7f6..2f3b75d1d 100644 Binary files a/assets/docker.png and b/assets/docker.png differ diff --git a/docker-bake.hcl b/docker-bake.hcl index 22149fd4a..8b377674b 100644 --- a/docker-bake.hcl +++ b/docker-bake.hcl @@ -44,6 +44,21 @@ variable "TAG" { default = "" } # multiple arches. variable "DFLASH_CUDA_ARCHES" { default = "75;80;86;89;90;120" } +# Fat-binary HIP/gfx arch list for the rocm variant (semicolon-separated). +# Default is gfx1151 (Strix Halo, the lucebox appliance iGPU) only, to keep the +# build tractable. Widen for a broadly-runnable released image, e.g.: +# DFLASH_HIP_ARCHES="gfx1151;gfx1100;gfx1200;gfx942;gfx90a" docker buildx bake rocm +# (gfx1151 Strix Halo, gfx1100 RX7900/RDNA3, gfx1200 RDNA4, gfx942 MI300, +# gfx90a MI200.) +variable "DFLASH_HIP_ARCHES" { default = "gfx1151" } + +# ROCm base-image tag for the rocm variant. gfx1151 needs >= 6.4.1. Default +# stays 6.4.1 (7.2.x has shown intermittent problems on Strix Halo), but on a +# ROCm 7.x HOST driver the 6.4.x userspace can segfault at model load — set +# ROCM_VERSION=7.2.2 there. Keep the base aligned with the host driver (see +# Dockerfile.rocm). +variable "ROCM_VERSION" { default = "6.4.1" } + # Image identity stamped into /opt/lucebox-hub/IMAGE_INFO at build time and # surfaced under /props.build at runtime (git_sha, image_tag, build_time). # CI sets all three from the workflow context; local builds get a best- @@ -73,6 +88,12 @@ group "default" { targets = ["cuda12-local"] } +# Build every published variant locally (cuda + rocm). CI builds these as a +# matrix; this group is the local equivalent for a full two-image build. +group "all" { + targets = ["cuda12-local", "rocm-local"] +} + # CI integration. docker/metadata-action in .github/workflows/docker.yml # emits a bake-file that defines a `docker-metadata-action` target carrying # tags + labels derived from the ref. Both build targets inherit from it. @@ -108,3 +129,29 @@ target "cuda12-local" { inherits = ["_cuda12-base"] tags = image_tags("cuda12") } + +# ── ROCm / HIP ─────────────────────────────────────────────────────────────── +# AMD GPU build from Dockerfile.rocm: gfx1151 (Strix Halo) by default, widen via +# DFLASH_HIP_ARCHES for a broadly-runnable image. Block-Sparse-Attention is +# CUDA-only and disabled in this variant (see Dockerfile.rocm). +target "_rocm-base" { + context = "." + dockerfile = "Dockerfile.rocm" + args = { + ROCM_VERSION = ROCM_VERSION + UBUNTU_VERSION = "22.04" + DFLASH_HIP_ARCHES = DFLASH_HIP_ARCHES + GIT_SHA = GIT_SHA + IMAGE_TAG = IMAGE_TAG + BUILD_TIME = BUILD_TIME + } +} + +target "rocm" { + inherits = ["_rocm-base", "docker-metadata-action"] +} + +target "rocm-local" { + inherits = ["_rocm-base"] + tags = image_tags("rocm") +} diff --git a/harness/benchmarks/generation_benchmark.py b/harness/benchmarks/generation_benchmark.py index f99c903e6..2ca51a272 100755 --- a/harness/benchmarks/generation_benchmark.py +++ b/harness/benchmarks/generation_benchmark.py @@ -353,7 +353,7 @@ def cmd_run(args: argparse.Namespace) -> int: "name": args.name, "url": args.url, "model": args.model, - "created_at": dt.datetime.now(dt.timezone.utc).isoformat(), + "created_at": dt.datetime.now(dt.UTC).isoformat(), "prompts": str(Path(args.prompts)), "max_tokens": args.max_tokens, "temperature": args.temperature, @@ -428,7 +428,7 @@ def cmd_compare(args: argparse.Namespace) -> int: summary["mean_speedup"] = None report = { - "created_at": dt.datetime.now(dt.timezone.utc).isoformat(), + "created_at": dt.datetime.now(dt.UTC).isoformat(), "baseline_report": str(Path(args.baseline)), "candidate_report": str(Path(args.candidate)), "summary": summary, diff --git a/harness/client_test_runner.py b/harness/client_test_runner.py index ebeed0fdf..2140b998f 100755 --- a/harness/client_test_runner.py +++ b/harness/client_test_runner.py @@ -22,8 +22,6 @@ import signal import socket import subprocess -import sys -import tempfile import time import urllib.error import urllib.request @@ -32,7 +30,6 @@ from pathlib import Path from typing import Any - ROOT = Path(__file__).resolve().parent.parent DEFAULT_WORK_DIR = ROOT / ".harness-work" MODEL = "luce-dflash" @@ -982,7 +979,7 @@ def wait_http(base_url: str, proc: subprocess.Popen | None = None, timeout: int status, _body, _elapsed = http_json("GET", base_url + "/health", timeout=2) if status == 200: return True - except (urllib.error.URLError, TimeoutError, ConnectionResetError, socket.timeout): + except (urllib.error.URLError, TimeoutError, ConnectionResetError): pass time.sleep(1) return False @@ -1552,7 +1549,6 @@ def _score_he_response(text: str, entry_point: str, gold_test: str) -> tuple[boo Returns (correct, detail_str). """ import subprocess as _sp - import tempfile as _tmp think_end = text.rfind("") answer_text = text[think_end + len(""):] if think_end >= 0 else text @@ -1883,7 +1879,7 @@ def cmd_bench(args: argparse.Namespace) -> int: if status != 200: print(f"[bench] WARNING: server health check returned {status}", flush=True) except Exception as exc: - raise SystemExit(f"[bench] cannot reach server at {base_url}/health: {exc}") + raise SystemExit(f"[bench] cannot reach server at {base_url}/health: {exc}") from exc print(f"[bench] url={base_url} model={model} suites={','.join(selected)}", flush=True) @@ -1903,7 +1899,7 @@ def cmd_bench(args: argparse.Namespace) -> int: } # Final summary - print(f"\n[bench] === SUMMARY ===", flush=True) + print("\n[bench] === SUMMARY ===", flush=True) print(f"{'Suite':>8s} {'OK':>5s} {'Wall':>7s} {'TTFT':>7s} {'Pf tok/s':>9s} " f"{'Out tok/s':>10s} {'Out tok':>8s} {'Score':>10s}", flush=True) for suite, s in all_suites.items(): diff --git a/harness/clients/llamacpp_compat_proxy.py b/harness/clients/llamacpp_compat_proxy.py index 9f8ce785e..ad6922bf4 100644 --- a/harness/clients/llamacpp_compat_proxy.py +++ b/harness/clients/llamacpp_compat_proxy.py @@ -378,7 +378,7 @@ class Handler(BaseHTTPRequestHandler): max_tokens_cap = 0 def log_message(self, fmt, *args): - print("[%s] %s" % (self.log_date_time_string(), fmt % args), flush=True) + print(f"[{self.log_date_time_string()}] {fmt % args}", flush=True) def send_json(self, status: int, obj: dict): data = json.dumps(obj).encode("utf-8") diff --git a/harness/clients/session_inject_proxy.py b/harness/clients/session_inject_proxy.py index 8cebab81e..b07a34e3f 100755 --- a/harness/clients/session_inject_proxy.py +++ b/harness/clients/session_inject_proxy.py @@ -17,13 +17,11 @@ from __future__ import annotations import argparse +import http.client import json import os -import socket -import threading from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer from urllib.parse import urlparse -import http.client class Handler(BaseHTTPRequestHandler): @@ -73,7 +71,7 @@ def _relay_response(self, resp: http.client.HTTPResponse): self.wfile.write(b"0\r\n\r\n") self.wfile.flush() break - size = "%X\r\n" % len(chunk) + size = f"{len(chunk):X}\r\n" self.wfile.write(size.encode("ascii")) self.wfile.write(chunk) self.wfile.write(b"\r\n") diff --git a/harness/clients/summarize_backend_pair.py b/harness/clients/summarize_backend_pair.py index cd01274af..5062cac77 100755 --- a/harness/clients/summarize_backend_pair.py +++ b/harness/clients/summarize_backend_pair.py @@ -8,7 +8,6 @@ import sys from pathlib import Path - LUCEBOX_DONE_RE = re.compile(r"(?:chat|responses|messages) DONE .*? in=(?P\d+) out=(?P\d+)") LUCEBOX_DECODE_RE = re.compile(r"decode=[^(]*\((?P[0-9.]+)tok/s\)") LUCEBOX_OVERALL_RE = re.compile(r"\s(?P[0-9.]+) tok/s\s+finish=") diff --git a/pyproject.toml b/pyproject.toml index b68870fef..56ae2bf4f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,13 +19,11 @@ dev = ["pytest>=8", "mypy>=1.10,<2", "ruff>=0.14,<1"] [tool.ruff] target-version = "py312" line-length = 100 -# Staged lint adoption for the Python surfaces touched by the Docker, -# benchmark, autotune, and host-CLI stack. Untouched server-adjacent scripts and -# vendored dependencies stay outside this gate until they are cleaned up. -# -# No Python package at the repo root to lint yet; workspace members add -# their own include lists. The pyproject-driven ruff config still applies. -include = [] +# Staged lint adoption: the host-CLI / harness tooling is gated now. The +# server-internal and optimization Python (server/scripts, optimizations/*) +# carries pre-existing style debt and is added to `include` as it is cleaned +# up. Vendored deps stay excluded permanently (extend-exclude below). +include = ["harness/**/*.py", "scripts/**/*.py"] extend-exclude = [ "dflash/deps", "megakernel", @@ -33,7 +31,10 @@ extend-exclude = [ ] [tool.ruff.lint] -select = ["E", "F", "I", "UP", "B"] +# Correctness + imports + modernization + bugbear. Line-length / style (E5xx, +# E7xx) is intentionally staged out for now so the gate stays green without +# reflowing existing code; tighten once the tracked surfaces are clean. +select = ["F", "I", "UP", "B"] [tool.uv] package = false @@ -64,8 +65,3 @@ torch = { index = "pytorch-cu128" } name = "pytorch-cu128" url = "https://download.pytorch.org/whl/cu128" explicit = true - -[dependency-groups] -dev = [ - "pytest>=9.0.3", -] diff --git a/scripts/build_image.sh b/scripts/build_image.sh index 38bd34e03..054fd8850 100755 --- a/scripts/build_image.sh +++ b/scripts/build_image.sh @@ -4,6 +4,8 @@ # # Tagging: # * Untagged tree → lucebox-hub:cuda12 (moving) +# → lucebox-hub:-cuda12 (pinned, from +# `git describe --always`) # * Tagged `lucebox-v0.3.0` (clean checkout): # → lucebox-hub:cuda12 # → lucebox-hub:0.3.0-cuda12 diff --git a/uv.lock b/uv.lock index 3f5f64aa9..fee8de0df 100644 --- a/uv.lock +++ b/uv.lock @@ -480,11 +480,6 @@ megakernel = [ { name = "qwen35-megakernel-bf16" }, ] -[package.dev-dependencies] -dev = [ - { name = "pytest" }, -] - [package.metadata] requires-dist = [ { name = "lucebox-dflash", virtual = "server" }, @@ -496,9 +491,6 @@ requires-dist = [ ] provides-extras = ["megakernel", "dev"] -[package.metadata.requires-dev] -dev = [{ name = "pytest", specifier = ">=9.0.3" }] - [[package]] name = "markdown-it-py" version = "4.2.0"