From 2a77be5749c6a6c4904c0c26d79c0cef45c18da7 Mon Sep 17 00:00:00 2001
From: quantumaikr
Date: Sat, 11 Apr 2026 23:21:47 +0900
Subject: [PATCH] feat(cli): quantcpp client (SSE streaming) + serve discoverability
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The HTTP server already supported OpenAI-compatible SSE streaming
(controlled by `"stream": true` in the request body) but it wasn't
discoverable from the CLI. This PR makes it explicit and easy to use.

New: `quantcpp client PROMPT [--url ...] [--no-stream]`
  - Sends a chat completion to a running quantcpp serve endpoint
  - Default mode is streaming (SSE) — tokens print as they arrive
  - --no-stream falls back to a single JSON response
  - Stdlib only (urllib) — no extra dependencies

Improved: `quantcpp serve` startup output
  - Now prints all three endpoints (chat/completions, models, health)
  - Shows curl examples for both streaming and non-streaming modes
  - Shows OpenAI Python SDK snippet for drop-in usage

Verified end-to-end: server streams token-by-token; client decodes
SSE chunks correctly; --no-stream returns single JSON.

README (EN/KO) and guide CTA updated to mention `quantcpp client` and
the streaming/non-streaming choice.

Version: 0.12.0 → 0.12.1.

Co-Authored-By: Claude Opus 4.6 (1M context) --- README.ko.md | 11 +-- README.md | 11 +-- bindings/python/pyproject.toml | 2 +- bindings/python/quantcpp/__init__.py | 2 +- bindings/python/quantcpp/cli.py | 103 ++++++++++++++++++++++++++- site/index.html | 2 +- 6 files changed, 117 insertions(+), 14 deletions(-) diff --git a/README.ko.md b/README.ko.md index df108b9..0c69f90 100644 --- a/README.ko.md +++ b/README.ko.md @@ -28,13 +28,14 @@ ```bash pip install quantcpp -quantcpp pull llama3.2:1b # HuggingFace에서 다운로드 -quantcpp run llama3.2:1b # 대화형 채팅 -quantcpp serve llama3.2:1b -p 8080 # OpenAI 호환 HTTP 서버 -quantcpp list # 캐시된 모델 목록 +quantcpp pull llama3.2:1b # HuggingFace에서 다운로드 +quantcpp run llama3.2:1b # 대화형 채팅 +quantcpp serve llama3.2:1b -p 8080 # OpenAI 호환 HTTP 서버 (SSE 스트리밍) +quantcpp client "안녕" # 스트리밍 클라이언트 → :8080 서버 +quantcpp list # 캐시된 모델 목록 ``` -짧은 별칭: `smollm2:135m`, `qwen3.5:0.8b`, `llama3.2:1b`. `run`/`serve` 첫 실행 시 자동 다운로드. `serve`는 OpenAI 호환 `POST /v1/chat/completions` 엔드포인트를 8080 포트에 제공합니다. +짧은 별칭: `smollm2:135m`, `qwen3.5:0.8b`, `llama3.2:1b`. `run`/`serve` 첫 실행 시 자동 다운로드. `serve`는 OpenAI 호환 `POST /v1/chat/completions` 엔드포인트를 8080 포트에 제공합니다 — 클라이언트가 `"stream": true`를 보내면 SSE 토큰 단위 스트리밍, 생략하면 단일 JSON 응답. 내장 `quantcpp client`는 두 모드 모두 지원 (기본: 스트리밍, `--no-stream`: 단일 응답). 
**한 줄 질문:** ```bash diff --git a/README.md b/README.md index 23944dd..703c293 100644 --- a/README.md +++ b/README.md @@ -41,13 +41,14 @@ ```bash pip install quantcpp -quantcpp pull llama3.2:1b # download from HuggingFace -quantcpp run llama3.2:1b # interactive chat -quantcpp serve llama3.2:1b -p 8080 # OpenAI-compatible HTTP server -quantcpp list # show cached models +quantcpp pull llama3.2:1b # download from HuggingFace +quantcpp run llama3.2:1b # interactive chat +quantcpp serve llama3.2:1b -p 8080 # OpenAI-compatible HTTP server (SSE streaming) +quantcpp client "Hi" # streaming client → server on :8080 +quantcpp list # show cached models ``` -Short aliases: `smollm2:135m`, `qwen3.5:0.8b`, `llama3.2:1b`. Auto-pulls on first `run`/`serve`. The `serve` subcommand exposes `POST /v1/chat/completions` (OpenAI-compatible) on port 8080. +Short aliases: `smollm2:135m`, `qwen3.5:0.8b`, `llama3.2:1b`. Auto-pulls on first `run`/`serve`. The `serve` subcommand exposes `POST /v1/chat/completions` (OpenAI-compatible) on port 8080 — clients pass `"stream": true` for SSE streaming, or omit it for a single JSON response. Built-in `quantcpp client` supports both modes (default: streaming, `--no-stream` for single response). 
**One-shot question:** ```bash diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml index 8b3ebf1..e59f239 100644 --- a/bindings/python/pyproject.toml +++ b/bindings/python/pyproject.toml @@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta" [project] name = "quantcpp" -version = "0.12.0" +version = "0.12.1" description = "Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)" readme = "README.md" license = { text = "Apache-2.0" } diff --git a/bindings/python/quantcpp/__init__.py b/bindings/python/quantcpp/__init__.py index f9127f4..ff092e3 100644 --- a/bindings/python/quantcpp/__init__.py +++ b/bindings/python/quantcpp/__init__.py @@ -15,7 +15,7 @@ from importlib.metadata import version as _pkg_version __version__ = _pkg_version("quantcpp") except Exception: - __version__ = "0.12.0" # fallback for editable / source-tree imports + __version__ = "0.12.1" # fallback for editable / source-tree imports import os import sys diff --git a/bindings/python/quantcpp/cli.py b/bindings/python/quantcpp/cli.py index 9a14d70..a9506d1 100644 --- a/bindings/python/quantcpp/cli.py +++ b/bindings/python/quantcpp/cli.py @@ -195,10 +195,92 @@ def cmd_serve(args): return 2 cmd = [binary, model_path, "-p", str(args.port), "-j", str(args.threads)] - print(f"quant serve {os.path.basename(model_path)} on :{args.port}", file=sys.stderr) + print(f"quantcpp serve {os.path.basename(model_path)} on :{args.port}", file=sys.stderr) + print("", file=sys.stderr) + print("OpenAI-compatible endpoints:", file=sys.stderr) + print(f" POST http://localhost:{args.port}/v1/chat/completions", file=sys.stderr) + print(f" GET http://localhost:{args.port}/v1/models", file=sys.stderr) + print(f" GET http://localhost:{args.port}/health", file=sys.stderr) + print("", file=sys.stderr) + print("Streaming (SSE — token-by-token):", file=sys.stderr) + print(f" curl -N http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr) + print(" -H 
'Content-Type: application/json' \\", file=sys.stderr) + print(' -d \'{"messages":[{"role":"user","content":"Hi"}],"stream":true}\'', + file=sys.stderr) + print("", file=sys.stderr) + print("Non-streaming (single JSON response):", file=sys.stderr) + print(f" curl http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr) + print(" -H 'Content-Type: application/json' \\", file=sys.stderr) + print(' -d \'{"messages":[{"role":"user","content":"Hi"}]}\'', + file=sys.stderr) + print("", file=sys.stderr) + print("OpenAI Python SDK works as-is:", file=sys.stderr) + print(f" client = OpenAI(base_url='http://localhost:{args.port}/v1', api_key='none')", + file=sys.stderr) + print(" client.chat.completions.create(model='quantcpp', messages=[...], stream=True)", + file=sys.stderr) + print("", file=sys.stderr) os.execvp(cmd[0], cmd) +def cmd_client(args): + """Send a chat request to a running quantcpp serve endpoint. + + Default mode is streaming (SSE) — tokens print as they arrive. + Use --no-stream for a single JSON response. 
+ """ + import json as _json + import urllib.request + + url = args.url.rstrip("/") + "/v1/chat/completions" + payload = { + "model": args.model_name, + "messages": [{"role": "user", "content": args.prompt}], + "max_tokens": args.max_tokens, + "temperature": args.temperature, + "stream": not args.no_stream, + } + body = _json.dumps(payload).encode() + req = urllib.request.Request( + url, data=body, + headers={ + "Content-Type": "application/json", + "User-Agent": "quantcpp-client", + }, + ) + + try: + with urllib.request.urlopen(req) as resp: + if args.no_stream: + data = _json.loads(resp.read()) + print(data["choices"][0]["message"]["content"]) + return 0 + + # SSE stream — parse `data: {...}\n\n` chunks + for line in resp: + line = line.decode("utf-8", errors="replace").rstrip() + if not line.startswith("data:"): + continue + payload_str = line[5:].strip() + if payload_str == "[DONE]": + break + try: + chunk = _json.loads(payload_str) + delta = chunk["choices"][0]["delta"].get("content", "") + if delta: + print(delta, end="", flush=True) + except Exception: + pass + print() + return 0 + except urllib.error.URLError as e: + print(f"connection failed: {e}", file=sys.stderr) + print(f" Is the server running on {args.url}?", file=sys.stderr) + print(f" Start it with: quantcpp serve llama3.2:1b -p {args.url.rsplit(':', 1)[-1].rstrip('/')}", + file=sys.stderr) + return 1 + + def cmd_chat_default(args): """Backwards-compatible default: auto-download Llama-3.2-1B and chat.""" args.model = args.model or "Llama-3.2-1B" @@ -222,6 +304,7 @@ def main(): list List cached and available models run MODEL [PROMPT] Chat with a model (auto-pulls if needed) serve MODEL Start OpenAI-compatible HTTP server + client PROMPT Send a request to a running serve (default: SSE streaming) examples: quantcpp pull llama3.2:1b @@ -229,6 +312,9 @@ def main(): quantcpp run llama3.2:1b quantcpp run llama3.2:1b "What is gravity?" 
quantcpp serve llama3.2:1b --port 8080 + quantcpp client "What is gravity?" # streams from :8080 + quantcpp client "Hi" --url http://localhost:8081 + quantcpp client "Hi" --no-stream # single JSON response backwards-compat (no subcommand): quantcpp # default chat with Llama-3.2-1B @@ -261,6 +347,19 @@ def main(): p_serve.add_argument("-p", "--port", type=int, default=8080) p_serve.add_argument("-j", "--threads", type=int, default=4) + # client + p_client = sub.add_parser("client", + help="Send a chat request to a running quantcpp serve endpoint") + p_client.add_argument("prompt", help="Question to send") + p_client.add_argument("--url", default="http://localhost:8080", + help="Server URL (default: http://localhost:8080)") + p_client.add_argument("--model-name", "-m", default="quantcpp", + help="Model name in the request body (server ignores)") + p_client.add_argument("-n", "--max-tokens", type=int, default=256) + p_client.add_argument("-t", "--temperature", type=float, default=0.7) + p_client.add_argument("--no-stream", action="store_true", + help="Disable SSE streaming (single JSON response)") + # Backwards-compat: top-level args for direct chat parser.add_argument("prompt", nargs="*", default=None, help="(default mode) question to ask") @@ -280,6 +379,8 @@ def main(): return cmd_run(args) if args.command == "serve": return cmd_serve(args) + if args.command == "client": + return cmd_client(args) # No subcommand → backwards-compat default chat return cmd_chat_default(args) diff --git a/site/index.html b/site/index.html index 745caf2..94a95b8 100644 --- a/site/index.html +++ b/site/index.html @@ -736,7 +736,7 @@

Try It Yourself

quantcpp pull llama3.2:1b quantcpp run llama3.2:1b quantcpp serve llama3.2:1b -p 8080 -quantcpp list +quantcpp client "Hi" # SSE streaming
Python API