From 2a77be5749c6a6c4904c0c26d79c0cef45c18da7 Mon Sep 17 00:00:00 2001
From: quantumaikr <hi@quantumai.kr>
Date: Sat, 11 Apr 2026 23:21:47 +0900
Subject: [PATCH] feat(cli): quantcpp client (SSE streaming) + serve
 discoverability
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The HTTP server already supported OpenAI-compatible SSE streaming
(controlled by `"stream": true` in the request body) but it wasn't
discoverable from the CLI. This PR makes it explicit and easy to use.

New: `quantcpp client PROMPT [--url ...] [--no-stream]`
- Sends a chat completion to a running quantcpp serve endpoint
- Default mode is streaming (SSE) — tokens print as they arrive
- --no-stream falls back to a single JSON response
- Stdlib only (urllib) — no extra dependencies

Improved: `quantcpp serve` startup output
- Now prints all three endpoints (chat/completions, models, health)
- Shows curl examples for both streaming and non-streaming modes
- Shows OpenAI Python SDK snippet for drop-in usage

Verified end-to-end: server streams token-by-token; client decodes
SSE chunks correctly; --no-stream returns single JSON.

README (EN/KO) and guide CTA updated to mention `quantcpp client`
and the streaming/non-streaming choice.

Version: 0.12.0 → 0.12.1.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 README.ko.md                         |  11 +--
 README.md                            |  11 +--
 bindings/python/pyproject.toml       |   2 +-
 bindings/python/quantcpp/__init__.py |   2 +-
 bindings/python/quantcpp/cli.py      | 103 ++++++++++++++++++++++++++-
 site/index.html                      |   2 +-
 6 files changed, 117 insertions(+), 14 deletions(-)

diff --git a/README.ko.md b/README.ko.md
index df108b9..0c69f90 100644
--- a/README.ko.md
+++ b/README.ko.md
@@ -28,13 +28,14 @@
 ```bash
 pip install quantcpp
 
-quantcpp pull llama3.2:1b           # HuggingFace에서 다운로드
-quantcpp run llama3.2:1b            # 대화형 채팅
-quantcpp serve llama3.2:1b -p 8080  # OpenAI 호환 HTTP 서버
-quantcpp list                       # 캐시된 모델 목록
+quantcpp pull llama3.2:1b               # HuggingFace에서 다운로드
+quantcpp run llama3.2:1b                # 대화형 채팅
+quantcpp serve llama3.2:1b -p 8080      # OpenAI 호환 HTTP 서버 (SSE 스트리밍)
+quantcpp client "안녕"                   # 스트리밍 클라이언트 → :8080 서버
+quantcpp list                           # 캐시된 모델 목록
 ```
 
-짧은 별칭: `smollm2:135m`, `qwen3.5:0.8b`, `llama3.2:1b`. `run`/`serve` 첫 실행 시 자동 다운로드. `serve`는 OpenAI 호환 `POST /v1/chat/completions` 엔드포인트를 8080 포트에 제공합니다.
+짧은 별칭: `smollm2:135m`, `qwen3.5:0.8b`, `llama3.2:1b`. `run`/`serve` 첫 실행 시 자동 다운로드. `serve`는 OpenAI 호환 `POST /v1/chat/completions` 엔드포인트를 8080 포트에 제공합니다 — 클라이언트가 `"stream": true`를 보내면 SSE 토큰 단위 스트리밍, 생략하면 단일 JSON 응답. 내장 `quantcpp client`는 두 모드 모두 지원 (기본: 스트리밍, `--no-stream`: 단일 응답).
 
 **한 줄 질문:**
 ```bash
diff --git a/README.md b/README.md
index 23944dd..703c293 100644
--- a/README.md
+++ b/README.md
@@ -41,13 +41,14 @@
 ```bash
 pip install quantcpp
 
-quantcpp pull llama3.2:1b           # download from HuggingFace
-quantcpp run llama3.2:1b            # interactive chat
-quantcpp serve llama3.2:1b -p 8080  # OpenAI-compatible HTTP server
-quantcpp list                       # show cached models
+quantcpp pull llama3.2:1b               # download from HuggingFace
+quantcpp run llama3.2:1b                # interactive chat
+quantcpp serve llama3.2:1b -p 8080      # OpenAI-compatible HTTP server (SSE streaming)
+quantcpp client "Hi"                    # streaming client → server on :8080
+quantcpp list                           # show cached models
 ```
 
-Short aliases: `smollm2:135m`, `qwen3.5:0.8b`, `llama3.2:1b`. Auto-pulls on first `run`/`serve`. The `serve` subcommand exposes `POST /v1/chat/completions` (OpenAI-compatible) on port 8080.
+Short aliases: `smollm2:135m`, `qwen3.5:0.8b`, `llama3.2:1b`. Auto-pulls on first `run`/`serve`. The `serve` subcommand exposes `POST /v1/chat/completions` (OpenAI-compatible) on port 8080 — clients pass `"stream": true` for SSE streaming, or omit it for a single JSON response. Built-in `quantcpp client` supports both modes (default: streaming, `--no-stream` for single response).
 
 **One-shot question:**
 ```bash
diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml
index 8b3ebf1..e59f239 100644
--- a/bindings/python/pyproject.toml
+++ b/bindings/python/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "quantcpp"
-version = "0.12.0"
+version = "0.12.1"
 description = "Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)"
 readme = "README.md"
 license = { text = "Apache-2.0" }
diff --git a/bindings/python/quantcpp/__init__.py b/bindings/python/quantcpp/__init__.py
index f9127f4..ff092e3 100644
--- a/bindings/python/quantcpp/__init__.py
+++ b/bindings/python/quantcpp/__init__.py
@@ -15,7 +15,7 @@
     from importlib.metadata import version as _pkg_version
     __version__ = _pkg_version("quantcpp")
 except Exception:
-    __version__ = "0.12.0"  # fallback for editable / source-tree imports
+    __version__ = "0.12.1"  # fallback for editable / source-tree imports
 
 import os
 import sys
diff --git a/bindings/python/quantcpp/cli.py b/bindings/python/quantcpp/cli.py
index 9a14d70..a9506d1 100644
--- a/bindings/python/quantcpp/cli.py
+++ b/bindings/python/quantcpp/cli.py
@@ -195,10 +195,92 @@ def cmd_serve(args):
         return 2
 
     cmd = [binary, model_path, "-p", str(args.port), "-j", str(args.threads)]
-    print(f"quant serve {os.path.basename(model_path)} on :{args.port}", file=sys.stderr)
+    print(f"quantcpp serve {os.path.basename(model_path)} on :{args.port}", file=sys.stderr)
+    print("", file=sys.stderr)
+    print("OpenAI-compatible endpoints:", file=sys.stderr)
+    print(f"  POST http://localhost:{args.port}/v1/chat/completions", file=sys.stderr)
+    print(f"  GET  http://localhost:{args.port}/v1/models", file=sys.stderr)
+    print(f"  GET  http://localhost:{args.port}/health", file=sys.stderr)
+    print("", file=sys.stderr)
+    print("Streaming (SSE — token-by-token):", file=sys.stderr)
+    print(f"  curl -N http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr)
+    print("    -H 'Content-Type: application/json' \\", file=sys.stderr)
+    print('    -d \'{"messages":[{"role":"user","content":"Hi"}],"stream":true}\'',
+          file=sys.stderr)
+    print("", file=sys.stderr)
+    print("Non-streaming (single JSON response):", file=sys.stderr)
+    print(f"  curl http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr)
+    print("    -H 'Content-Type: application/json' \\", file=sys.stderr)
+    print('    -d \'{"messages":[{"role":"user","content":"Hi"}]}\'',
+          file=sys.stderr)
+    print("", file=sys.stderr)
+    print("OpenAI Python SDK works as-is:", file=sys.stderr)
+    print(f"  client = OpenAI(base_url='http://localhost:{args.port}/v1', api_key='none')",
+          file=sys.stderr)
+    print("  client.chat.completions.create(model='quantcpp', messages=[...], stream=True)",
+          file=sys.stderr)
+    print("", file=sys.stderr)
     os.execvp(cmd[0], cmd)
 
 
+def cmd_client(args):
+    """Send a chat request to a running quantcpp serve endpoint.
+
+    Streams SSE tokens as they arrive; --no-stream prints one JSON reply.
+    Returns 0 on success, 1 on a connection or HTTP error.
+    """
+    import json as _json
+    import urllib.error
+    import urllib.parse
+    import urllib.request
+
+    url = args.url.rstrip("/") + "/v1/chat/completions"
+    payload = {
+        "model": args.model_name,
+        "messages": [{"role": "user", "content": args.prompt}],
+        "max_tokens": args.max_tokens,
+        "temperature": args.temperature,
+        "stream": not args.no_stream,
+    }
+    req = urllib.request.Request(
+        url, data=_json.dumps(payload).encode(),
+        headers={"Content-Type": "application/json", "User-Agent": "quantcpp-client"},
+    )
+
+    try:
+        with urllib.request.urlopen(req) as resp:
+            if args.no_stream:
+                print(_json.loads(resp.read())["choices"][0]["message"]["content"])
+                return 0
+            for line in resp:  # SSE stream — parse `data: {...}\n\n` chunks
+                line = line.decode("utf-8", errors="replace").rstrip()
+                if not line.startswith("data:"):
+                    continue
+                payload_str = line[5:].strip()
+                if payload_str == "[DONE]":
+                    break
+                try:
+                    delta = _json.loads(payload_str)["choices"][0]["delta"].get("content")
+                except (ValueError, LookupError, TypeError, AttributeError):
+                    continue  # skip malformed chunks but keep streaming
+                if delta:
+                    print(delta, end="", flush=True)
+            print()
+            return 0
+    except urllib.error.HTTPError as e:  # subclass of URLError, so handle it first
+        print(f"server returned HTTP {e.code}: {e.reason}", file=sys.stderr)
+        return 1
+    except urllib.error.URLError as e:
+        try:
+            port = urllib.parse.urlsplit(args.url).port or 8080
+        except ValueError:  # malformed port in --url; use serve's default
+            port = 8080
+        print(f"connection failed: {e}", file=sys.stderr)
+        print(f"  Is the server running on {args.url}?", file=sys.stderr)
+        print(f"  Start it with: quantcpp serve llama3.2:1b -p {port}", file=sys.stderr)
+        return 1
+
+
 def cmd_chat_default(args):
     """Backwards-compatible default: auto-download Llama-3.2-1B and chat."""
     args.model = args.model or "Llama-3.2-1B"
@@ -222,6 +304,7 @@ def main():
   list                  List cached and available models
   run MODEL [PROMPT]    Chat with a model (auto-pulls if needed)
   serve MODEL           Start OpenAI-compatible HTTP server
+  client PROMPT         Send a request to a running serve (default: SSE streaming)
 
 examples:
   quantcpp pull llama3.2:1b
@@ -229,6 +312,9 @@ def main():
   quantcpp run llama3.2:1b
   quantcpp run llama3.2:1b "What is gravity?"
   quantcpp serve llama3.2:1b --port 8080
+  quantcpp client "What is gravity?"                  # streams from :8080
+  quantcpp client "Hi" --url http://localhost:8081
+  quantcpp client "Hi" --no-stream                    # single JSON response
 
 backwards-compat (no subcommand):
   quantcpp                          # default chat with Llama-3.2-1B
@@ -261,6 +347,19 @@ def main():
     p_serve.add_argument("-p", "--port", type=int, default=8080)
     p_serve.add_argument("-j", "--threads", type=int, default=4)
 
+    # client
+    p_client = sub.add_parser("client",
+        help="Send a chat request to a running quantcpp serve endpoint")
+    p_client.add_argument("prompt", help="Question to send")
+    p_client.add_argument("--url", default="http://localhost:8080",
+                          help="Server URL (default: http://localhost:8080)")
+    p_client.add_argument("--model-name", "-m", default="quantcpp",
+                          help="Model name in the request body (server ignores)")
+    p_client.add_argument("-n", "--max-tokens", type=int, default=256)
+    p_client.add_argument("-t", "--temperature", type=float, default=0.7)
+    p_client.add_argument("--no-stream", action="store_true",
+                          help="Disable SSE streaming (single JSON response)")
+
     # Backwards-compat: top-level args for direct chat
     parser.add_argument("prompt", nargs="*", default=None,
                         help="(default mode) question to ask")
@@ -280,6 +379,8 @@ def main():
         return cmd_run(args)
     if args.command == "serve":
         return cmd_serve(args)
+    if args.command == "client":
+        return cmd_client(args)
 
     # No subcommand → backwards-compat default chat
     return cmd_chat_default(args)
diff --git a/site/index.html b/site/index.html
index 745caf2..94a95b8 100644
--- a/site/index.html
+++ b/site/index.html
@@ -736,7 +736,7 @@ <h2 style="margin-bottom:1rem" data-i18n="cta.title">Try It Yourself</h2>
 quantcpp pull llama3.2:1b
 quantcpp run llama3.2:1b
 quantcpp serve llama3.2:1b -p 8080
-quantcpp list</code></pre>
+quantcpp client "Hi"   # SSE streaming</code></pre>
       </div>
       <div>
         <div style="font-size:.75rem;color:var(--text2);margin-bottom:.3rem;font-weight:600" data-i18n="cta.label.python">Python API</div>