Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions README.ko.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,14 @@
```bash
pip install quantcpp

quantcpp pull llama3.2:1b # HuggingFace에서 다운로드
quantcpp run llama3.2:1b # 대화형 채팅
quantcpp serve llama3.2:1b -p 8080 # OpenAI 호환 HTTP 서버
quantcpp list # 캐시된 모델 목록
quantcpp pull llama3.2:1b # HuggingFace에서 다운로드
quantcpp run llama3.2:1b # 대화형 채팅
quantcpp serve llama3.2:1b -p 8080 # OpenAI 호환 HTTP 서버 (SSE 스트리밍)
quantcpp client "안녕" # 스트리밍 클라이언트 → :8080 서버
quantcpp list # 캐시된 모델 목록
```

짧은 별칭: `smollm2:135m`, `qwen3.5:0.8b`, `llama3.2:1b`. `run`/`serve` 첫 실행 시 자동 다운로드. `serve`는 OpenAI 호환 `POST /v1/chat/completions` 엔드포인트를 8080 포트에 제공합니다.
짧은 별칭: `smollm2:135m`, `qwen3.5:0.8b`, `llama3.2:1b`. `run`/`serve` 첫 실행 시 자동 다운로드. `serve`는 OpenAI 호환 `POST /v1/chat/completions` 엔드포인트를 8080 포트에 제공합니다 — 클라이언트가 `"stream": true`를 보내면 SSE 토큰 단위 스트리밍, 생략하면 단일 JSON 응답. 내장 `quantcpp client`는 두 모드 모두 지원 (기본: 스트리밍, `--no-stream`: 단일 응답).

**한 줄 질문:**
```bash
Expand Down
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,14 @@
```bash
pip install quantcpp

quantcpp pull llama3.2:1b # download from HuggingFace
quantcpp run llama3.2:1b # interactive chat
quantcpp serve llama3.2:1b -p 8080 # OpenAI-compatible HTTP server
quantcpp list # show cached models
quantcpp pull llama3.2:1b # download from HuggingFace
quantcpp run llama3.2:1b # interactive chat
quantcpp serve llama3.2:1b -p 8080 # OpenAI-compatible HTTP server (SSE streaming)
quantcpp client "Hi" # streaming client → server on :8080
quantcpp list # show cached models
```

Short aliases: `smollm2:135m`, `qwen3.5:0.8b`, `llama3.2:1b`. Auto-pulls on first `run`/`serve`. The `serve` subcommand exposes `POST /v1/chat/completions` (OpenAI-compatible) on port 8080.
Short aliases: `smollm2:135m`, `qwen3.5:0.8b`, `llama3.2:1b`. Auto-pulls on first `run`/`serve`. The `serve` subcommand exposes `POST /v1/chat/completions` (OpenAI-compatible) on port 8080 — clients pass `"stream": true` for SSE streaming, or omit it for a single JSON response. Built-in `quantcpp client` supports both modes (default: streaming, `--no-stream` for single response).

**One-shot question:**
```bash
Expand Down
2 changes: 1 addition & 1 deletion bindings/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "quantcpp"
version = "0.12.0"
version = "0.12.1"
description = "Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)"
readme = "README.md"
license = { text = "Apache-2.0" }
Expand Down
2 changes: 1 addition & 1 deletion bindings/python/quantcpp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from importlib.metadata import version as _pkg_version
__version__ = _pkg_version("quantcpp")
except Exception:
__version__ = "0.12.0" # fallback for editable / source-tree imports
__version__ = "0.12.1" # fallback for editable / source-tree imports

import os
import sys
Expand Down
103 changes: 102 additions & 1 deletion bindings/python/quantcpp/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,10 +195,92 @@ def cmd_serve(args):
return 2

cmd = [binary, model_path, "-p", str(args.port), "-j", str(args.threads)]
print(f"quant serve {os.path.basename(model_path)} on :{args.port}", file=sys.stderr)
print(f"quantcpp serve {os.path.basename(model_path)} on :{args.port}", file=sys.stderr)
print("", file=sys.stderr)
print("OpenAI-compatible endpoints:", file=sys.stderr)
print(f" POST http://localhost:{args.port}/v1/chat/completions", file=sys.stderr)
print(f" GET http://localhost:{args.port}/v1/models", file=sys.stderr)
print(f" GET http://localhost:{args.port}/health", file=sys.stderr)
print("", file=sys.stderr)
print("Streaming (SSE — token-by-token):", file=sys.stderr)
print(f" curl -N http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr)
print(" -H 'Content-Type: application/json' \\", file=sys.stderr)
print(' -d \'{"messages":[{"role":"user","content":"Hi"}],"stream":true}\'',
file=sys.stderr)
print("", file=sys.stderr)
print("Non-streaming (single JSON response):", file=sys.stderr)
print(f" curl http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr)
print(" -H 'Content-Type: application/json' \\", file=sys.stderr)
print(' -d \'{"messages":[{"role":"user","content":"Hi"}]}\'',
file=sys.stderr)
print("", file=sys.stderr)
print("OpenAI Python SDK works as-is:", file=sys.stderr)
print(f" client = OpenAI(base_url='http://localhost:{args.port}/v1', api_key='none')",
file=sys.stderr)
print(" client.chat.completions.create(model='quantcpp', messages=[...], stream=True)",
file=sys.stderr)
print("", file=sys.stderr)
os.execvp(cmd[0], cmd)


def cmd_client(args):
    """Send a chat request to a running ``quantcpp serve`` endpoint.

    Default mode is streaming (SSE) — tokens print as they arrive.
    Use --no-stream for a single JSON response.

    Returns 0 on success, 1 when the server cannot be reached.
    """
    import json as _json
    # Explicit import: urllib.error is otherwise reachable only as a CPython
    # implementation detail of importing urllib.request — don't rely on it.
    import urllib.error
    import urllib.request

    url = args.url.rstrip("/") + "/v1/chat/completions"
    payload = {
        "model": args.model_name,
        "messages": [{"role": "user", "content": args.prompt}],
        "max_tokens": args.max_tokens,
        "temperature": args.temperature,
        "stream": not args.no_stream,
    }
    body = _json.dumps(payload).encode()
    req = urllib.request.Request(
        url, data=body,
        headers={
            "Content-Type": "application/json",
            "User-Agent": "quantcpp-client",
        },
    )

    try:
        with urllib.request.urlopen(req) as resp:
            if args.no_stream:
                # Single JSON response — print the full assistant message.
                data = _json.loads(resp.read())
                print(data["choices"][0]["message"]["content"])
                return 0

            # SSE stream — parse `data: {...}\n\n` chunks line by line.
            for raw in resp:
                line = raw.decode("utf-8", errors="replace").rstrip()
                if not line.startswith("data:"):
                    continue
                payload_str = line[5:].strip()
                if payload_str == "[DONE]":  # OpenAI-style stream terminator
                    break
                try:
                    chunk = _json.loads(payload_str)
                    delta = chunk["choices"][0]["delta"].get("content", "")
                    if delta:
                        print(delta, end="", flush=True)
                except (ValueError, KeyError, IndexError, TypeError):
                    # Skip malformed or partial chunks instead of aborting
                    # the whole stream (ValueError covers JSONDecodeError).
                    pass
            print()
            return 0
    except urllib.error.URLError as e:
        print(f"connection failed: {e}", file=sys.stderr)
        print(f" Is the server running on {args.url}?", file=sys.stderr)
        # Best-effort port hint for the suggested serve command; fall back
        # to the default 8080 when the URL carries no explicit port
        # (the old rsplit produced garbage like '//localhost' in that case).
        tail = args.url.rsplit(":", 1)[-1].rstrip("/")
        port_hint = tail if tail.isdigit() else "8080"
        print(f" Start it with: quantcpp serve llama3.2:1b -p {port_hint}",
              file=sys.stderr)
        return 1


def cmd_chat_default(args):
"""Backwards-compatible default: auto-download Llama-3.2-1B and chat."""
args.model = args.model or "Llama-3.2-1B"
Expand All @@ -222,13 +304,17 @@ def main():
list List cached and available models
run MODEL [PROMPT] Chat with a model (auto-pulls if needed)
serve MODEL Start OpenAI-compatible HTTP server
client PROMPT Send a request to a running serve (default: SSE streaming)

examples:
quantcpp pull llama3.2:1b
quantcpp list
quantcpp run llama3.2:1b
quantcpp run llama3.2:1b "What is gravity?"
quantcpp serve llama3.2:1b --port 8080
quantcpp client "What is gravity?" # streams from :8080
quantcpp client "Hi" --url http://localhost:8081
quantcpp client "Hi" --no-stream # single JSON response

backwards-compat (no subcommand):
quantcpp # default chat with Llama-3.2-1B
Expand Down Expand Up @@ -261,6 +347,19 @@ def main():
p_serve.add_argument("-p", "--port", type=int, default=8080)
p_serve.add_argument("-j", "--threads", type=int, default=4)

# client
p_client = sub.add_parser("client",
help="Send a chat request to a running quantcpp serve endpoint")
p_client.add_argument("prompt", help="Question to send")
p_client.add_argument("--url", default="http://localhost:8080",
help="Server URL (default: http://localhost:8080)")
p_client.add_argument("--model-name", "-m", default="quantcpp",
help="Model name in the request body (server ignores)")
p_client.add_argument("-n", "--max-tokens", type=int, default=256)
p_client.add_argument("-t", "--temperature", type=float, default=0.7)
p_client.add_argument("--no-stream", action="store_true",
help="Disable SSE streaming (single JSON response)")

# Backwards-compat: top-level args for direct chat
parser.add_argument("prompt", nargs="*", default=None,
help="(default mode) question to ask")
Expand All @@ -280,6 +379,8 @@ def main():
return cmd_run(args)
if args.command == "serve":
return cmd_serve(args)
if args.command == "client":
return cmd_client(args)

# No subcommand → backwards-compat default chat
return cmd_chat_default(args)
Expand Down
2 changes: 1 addition & 1 deletion site/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -736,7 +736,7 @@ <h2 style="margin-bottom:1rem" data-i18n="cta.title">Try It Yourself</h2>
quantcpp pull llama3.2:1b
quantcpp run llama3.2:1b
quantcpp serve llama3.2:1b -p 8080
quantcpp list</code></pre>
quantcpp client "Hi" # SSE streaming</code></pre>
</div>
<div>
<div style="font-size:.75rem;color:var(--text2);margin-bottom:.3rem;font-weight:600" data-i18n="cta.label.python">Python API</div>
Expand Down
Loading