diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt index d42762f35..c8158cf7a 100644 --- a/server/CMakeLists.txt +++ b/server/CMakeLists.txt @@ -217,6 +217,7 @@ add_library(dflash_common STATIC src/draft/draft_gguf_loader.cpp src/draft/draft_safetensors_loader.cpp src/draft/draft_graph.cpp + src/qwen3/anchor_scan.cpp src/qwen3/qwen3_drafter.cpp src/qwen3/qwen3_loader.cpp src/qwen3/qwen3_graph.cpp @@ -576,6 +577,52 @@ if(DFLASH27B_TESTS) target_link_libraries(test_bandit_integration PRIVATE dflash_common) add_test(NAME bandit_integration COMMAND test_bandit_integration) endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_early_exit_score_range.cpp") + add_executable(test_drafter_early_exit_score_range + test/test_drafter_early_exit_score_range.cpp) + target_include_directories(test_drafter_early_exit_score_range PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/common) + add_test(NAME test_drafter_early_exit_score_range + COMMAND test_drafter_early_exit_score_range) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_regime_router.cpp") + add_executable(test_regime_router + test/test_regime_router.cpp) + target_include_directories(test_regime_router PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/common) + add_test(NAME regime_router + COMMAND test_regime_router) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_anchor_transitive.cpp") + add_executable(test_anchor_transitive + test/test_anchor_transitive.cpp + src/qwen3/anchor_scan.cpp) + target_include_directories(test_anchor_transitive PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/qwen3) + add_test(NAME test_anchor_transitive + COMMAND test_anchor_transitive) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_warm_path_regression.cpp") + add_executable(test_drafter_warm_path_regression + test/test_drafter_warm_path_regression.cpp) + target_include_directories(test_drafter_warm_path_regression PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/common) + add_test(NAME 
test_drafter_warm_path_regression + COMMAND test_drafter_warm_path_regression) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_tail_capture_guard.cpp") + # GREEN phase: built with TAIL_GUARD_USE_NEW_FORMULA — must pass after Bug #42 fix. + add_executable(test_drafter_tail_capture_guard + test/test_drafter_tail_capture_guard.cpp) + target_compile_definitions(test_drafter_tail_capture_guard PRIVATE + TAIL_GUARD_USE_NEW_FORMULA) + add_test(NAME test_drafter_tail_capture_guard + COMMAND test_drafter_tail_capture_guard) + # RED phase binary: same source WITHOUT the fix flag — documents the bug. + add_executable(test_drafter_tail_capture_guard_red + test/test_drafter_tail_capture_guard.cpp) + # No TAIL_GUARD_USE_NEW_FORMULA — uses old (buggy) guard, expected to FAIL. + endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_draft_vs_reference.cpp") add_executable(test_draft_vs_reference test/test_draft_vs_reference.cpp) target_link_libraries(test_draft_vs_reference PRIVATE dflash_common) diff --git a/server/README.md b/server/README.md index c66703e80..f4640b5b8 100644 --- a/server/README.md +++ b/server/README.md @@ -306,6 +306,8 @@ tokens) is the path to bring code recall to the same ratio as prose. ## Quick start +> **Looking for the prebuilt Docker image?** See [Quick start](../README.md#quick-start) in the top-level README — `ghcr.io/luce-org/lucebox-hub:cuda12` ships the dflash daemon and Python server, with all weights bind-mountable from the host. The instructions below are for building dflash from source (kernel development, custom arch lists, non-Docker hosts). + ```bash git clone --recurse-submodules https://github.com/Luce-Org/lucebox-hub cd lucebox-hub/dflash diff --git a/server/scripts/bench_agent.py b/server/scripts/bench_agent.py deleted file mode 100644 index 6953746c6..000000000 --- a/server/scripts/bench_agent.py +++ /dev/null @@ -1,506 +0,0 @@ -""" -Agentic-workload benchmark — simulates Codex / Claude Code style requests. 
- -Where ``bench_llm.py`` exercises 40-250 token prompts (and skips anything -``> 3500`` tokens) and reports decode tok/s only, real Codex / Claude Code -clients send 5K-30K input tokens (system prompt + tool definitions + -conversation history + tool-call file dumps) and the user-visible cost is -prefill-dominated. This bench: - - - Builds prompts from a real Codex system prompt fixture + a SWE-bench - Verified row (problem_statement + patch/test_patch as synthesised - "tool result" file context) padded to three length buckets: - ~2K, ~8K, ~24K tokens. - - Reports AR vs DFlash for each bucket: prefill_s, decode tok/s, TTFT, - total latency, AL — and BOTH the decode-only speedup (today's - RESULTS.md definition) and the user-visible total-latency speedup. - - Parses the per-stage [timing] block from ``test_dflash`` for - root-cause attribution. - -Usage: - python3 scripts/bench_agent.py # all buckets, n=5 each - python3 scripts/bench_agent.py --bucket 8k # one bucket - python3 scripts/bench_agent.py --n-sample 1 --bucket 2k # smoke - -Same env vars as ``bench_llm.py``: ``DFLASH_TARGET``, ``DFLASH_DRAFT``, -``DFLASH_BIN``, ``DFLASH_BIN_AR``, ``DFLASH_TOKENIZER``. 
-""" -import argparse -import json -import os -import re -import struct -import subprocess -import tempfile -import time -from pathlib import Path - -ROOT = Path(__file__).resolve().parent.parent -BIN_SUFFIX = ".exe" if os.name == "nt" else "" -TARGET = os.environ.get( - "DFLASH_TARGET", - str(ROOT / "models" / "Qwen3.6-27B-Q4_K_M.gguf"), -) -_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q4_k_m.gguf" -_LOCAL_DRAFT_ROOT = ROOT / "models" / "draft" -DRAFT = None -TEST_DFLASH = os.environ.get("DFLASH_BIN", str(ROOT / "build" / f"test_dflash{BIN_SUFFIX}")) -TEST_GENERATE = os.environ.get("DFLASH_BIN_AR", str(ROOT / "build" / f"test_generate{BIN_SUFFIX}")) -TOKENIZER = os.environ.get("DFLASH_TOKENIZER", "Qwen/Qwen3.5-27B") -TMPDIR = Path(tempfile.gettempdir()) / "dflash_bench" -TMPDIR.mkdir(parents=True, exist_ok=True) - -FIX_DIR = ROOT / "scripts" / "fixtures" -SWE_PARQUET = FIX_DIR / "swe_bench" / "swe_bench_verified.parquet" -SYS_PROMPT_SMALL = FIX_DIR / "agent_prompts" / "codex_gpt52_codex.md" # ~1694 tok -SYS_PROMPT_LARGE = FIX_DIR / "agent_prompts" / "codex_gpt52.md" # ~4756 tok - -N_GEN = 256 -BUDGET = 22 - -# Prompt buckets — target token counts (hit within ±20%). 
-BUCKETS = { - "2k": {"target": 2048, "sys": SYS_PROMPT_SMALL}, - "8k": {"target": 8192, "sys": SYS_PROMPT_LARGE}, - "24k": {"target": 24576, "sys": SYS_PROMPT_LARGE}, -} - - -# ── shared with bench_llm.py (intentionally duplicated to keep it standalone) ── -def _find_draft_model(root: Path): - if root.is_file(): - return str(root) - if not root.is_dir(): - return None - for pattern in ("dflash-draft-*.gguf", "*.gguf", "model.safetensors"): - matches = sorted(root.rglob(pattern)) - if matches: - return str(matches[0]) - return None - - -def _resolve_draft() -> str: - env = os.environ.get("DFLASH_DRAFT") - if env: - found = _find_draft_model(Path(env)) - if found: - return found - raise FileNotFoundError(f"DFLASH_DRAFT does not point to a draft GGUF/safetensors: {env}") - for c in (_LOCAL_DRAFT_FILE, _LOCAL_DRAFT_ROOT): - found = _find_draft_model(c) - if found: - return found - raise FileNotFoundError( - f"DFlash draft not found. Set DFLASH_DRAFT or place a file under {_LOCAL_DRAFT_ROOT}" - ) - - -def _require_file(path: str, label: str): - if not Path(path).is_file(): - raise FileNotFoundError(f"{label} not found: {path}") - - -def _run_timed(cmd, timeout: int, label: str): - """Run a subprocess, return (CompletedProcess, wall_seconds).""" - t0 = time.perf_counter() - r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) - wall_s = time.perf_counter() - t0 - if r.returncode != 0: - tail = (r.stderr or r.stdout or "").strip()[-2000:] - raise RuntimeError(f"{label} exited {r.returncode}: {tail}") - return r, wall_s - - -def tokenize_to_file(tok, text: str, path: Path) -> int: - ids = tok.encode(text, add_special_tokens=False) - with open(path, "wb") as f: - for t in ids: - f.write(struct.pack(" int: - pad = 64 - return ((n_prompt + n_gen + pad + 255) // 256) * 256 - - -# ── parsing ──────────────────────────────────────────────────────────────── -_RE_DECODE_TPS = re.compile(r"(\d+(?:\.\d+)?)\s+tok/s") -_RE_DF_PREFILL = 
re.compile(r"\[prefill\]\s+(\d+)\s+tokens\s+in\s+(\d+(?:\.\d+)?)\s*s") -_RE_AL = re.compile(r"avg commit/step=(\d+(?:\.\d+)?)") -_RE_TIMING_LINE = re.compile(r"^\s+([a-z_]+)\s+(\d+(?:\.\d+)?)\s*$") - - -def _parse_dflash(stdout: str) -> dict: - """Parse test_dflash stdout into {prefill_s, decode_tps, al, stages{}}.""" - m_pf = _RE_DF_PREFILL.search(stdout) - m_al = _RE_AL.search(stdout) - # decode tps line is the LAST tok/s in the file ("[dflash] generated ... -> X tok/s") - matches = list(_RE_DECODE_TPS.finditer(stdout)) - if not (m_pf and m_al and matches): - raise RuntimeError(f"test_dflash parse failed:\n{stdout[-1500:]}") - decode_tps = float(matches[-1].group(1)) - - # Per-stage timing block (lines like " draft_compute 14.70") - stages = {} - in_block = False - for line in stdout.splitlines(): - if line.startswith("[timing]"): - in_block = True - continue - if in_block: - if line.startswith("[") or line.startswith("---"): - # " ----- sum 132.20" ends block; "[dflash] generated…" too - if "----- sum" in line: - m = re.search(r"sum\s+(\d+(?:\.\d+)?)", line) - if m: - stages["sum"] = float(m.group(1)) - in_block = False - continue - if line.startswith("["): - in_block = False - continue - m = _RE_TIMING_LINE.match(line) - if m: - stages[m.group(1)] = float(m.group(2)) - - return { - "prefill_s": float(m_pf.group(2)), - "n_prompt_seen": int(m_pf.group(1)), - "decode_tps": decode_tps, - "al": float(m_al.group(1)), - "stages": stages, - } - - -def _parse_ar(stdout: str) -> dict: - """Parse test_generate stdout → {decode_tps, n_gen}.""" - m = re.search(r"\[gen\]\s+(\d+)\s+new tokens in\s+(\d+(?:\.\d+)?)\s*s\s*->\s*(\d+(?:\.\d+)?)\s+tok/s", - stdout) - if not m: - raise RuntimeError(f"test_generate parse failed:\n{stdout[-1500:]}") - return { - "decode_tps": float(m.group(3)), - "decode_s": float(m.group(2)), - "n_gen": int(m.group(1)), - } - - -# ── runners ──────────────────────────────────────────────────────────────── -def run_ar(path: Path, n_gen: int): - 
out_bin = TMPDIR / "ar_out.bin" - r, wall_s = _run_timed( - [TEST_GENERATE, TARGET, str(path), str(n_gen), str(out_bin)], - timeout=600, label="test_generate", - ) - p = _parse_ar(r.stdout) - p["wall_s"] = wall_s - return p - - -def run_df(path: Path, n_prompt: int, n_gen: int, budget: int = None): - if budget is None: - budget = BUDGET - max_ctx = _auto_max_ctx(n_prompt, n_gen) - out_bin = TMPDIR / "df_out.bin" - r, wall_s = _run_timed( - [ - TEST_DFLASH, TARGET, DRAFT, str(path), str(n_gen), str(out_bin), - "--fast-rollback", "--ddtree", - f"--ddtree-budget={budget}", f"--max-ctx={max_ctx}", - ], - timeout=900, label="test_dflash", - ) - p = _parse_dflash(r.stdout) - p["wall_s"] = wall_s - p["max_ctx"] = max_ctx - return p - - -# ── prompt construction ──────────────────────────────────────────────────── -def _load_swe_rows(): - import pyarrow.parquet as pq - t = pq.read_table(str(SWE_PARQUET)) - return t.to_pandas() - - -def _agent_user_message(row: dict, file_blocks_chars: int) -> str: - """Synthesise a Codex/Claude-Code style user turn. - - Structure mimics what an agent client actually sends after a few tool - calls have run (read_file results pasted into history). - """ - repo = row["repo"] - iid = row["instance_id"] - problem = row["problem_statement"] or "" - patch = row["patch"] or "" - test_patch = row["test_patch"] or "" - hints = row["hints_text"] or "" - - # Build a pool of "file content" — real code from the repo's patch + - # test_patch, repeated if needed to hit the target. This is the same - # shape Codex would have after a few read_file calls. - pool = "\n\n".join(p for p in (patch, test_patch, hints) if p) - if not pool: - pool = problem - # Repeat to reach target byte count. Padding is real code from the same - # repo, just chunked into multiple blocks so it looks - # like several read_file calls. 
- chunks = [] - chunk_size = max(2000, file_blocks_chars // 6) - cur = 0 - idx = 1 - while cur < file_blocks_chars: - seg = pool[(cur % max(1, len(pool))) : (cur % max(1, len(pool))) + chunk_size] - if not seg: - seg = pool[:chunk_size] - chunks.append( - f"\n{seg}\n" - ) - cur += len(seg) - idx += 1 - - file_blocks = "\n\n".join(chunks) - return ( - f"Repository: {repo}\n" - f"Instance: {iid}\n\n" - f"## Issue\n{problem}\n\n" - f"## Context I gathered\n" - f"I ran `read_file` on the relevant modules. Their contents are:\n\n" - f"{file_blocks}\n\n" - f"## Task\n" - f"Investigate the bug and reply with a single tool call to `apply_patch` " - f"that fixes it. Keep the patch minimal." - ) - - -def build_prompt(tok, sys_prompt_path: Path, row, target_tokens: int) -> tuple: - """Build a chat-templated prompt that hits ``target_tokens`` ±20%. - - Returns (text, n_tokens). - """ - sys_text = sys_prompt_path.read_text(encoding="utf-8") - # Iteratively grow file_blocks_chars until we hit target. - # Empirically ~3.5 chars/token for Qwen on code. - sys_tokens = len(tok.encode(sys_text, add_special_tokens=False)) - overhead = 200 # chat template + scaffolding - target_user_tokens = max(256, target_tokens - sys_tokens - overhead) - chars = max(1024, target_user_tokens * 4) - - for _ in range(6): - user_text = _agent_user_message(row, chars) - msgs = [ - {"role": "system", "content": sys_text}, - {"role": "user", "content": user_text}, - ] - # Try chat template; fall back to plain concat if tokenizer has none. 
- try: - text = tok.apply_chat_template( - msgs, tokenize=False, add_generation_prompt=True, - enable_thinking=False, - ) - except Exception: - text = sys_text + "\n\n" + user_text + "\n\n" - n = len(tok.encode(text, add_special_tokens=False)) - if abs(n - target_tokens) / target_tokens < 0.20: - return text, n - # binary search-ish: scale chars by ratio - chars = max(512, int(chars * (target_tokens / max(1, n)))) - return text, n - - -def select_rows_for_bucket(df, target_tokens, n_sample, seed=42): - """Pick rows whose problem_statement is small enough that we can grow to - target without truncating the issue itself.""" - # Just shuffle — _agent_user_message handles padding to any size. - return df.sample(n=n_sample, random_state=seed).to_dict("records") - - -# ── main bench loop ──────────────────────────────────────────────────────── -def main(): - global DRAFT, BUDGET - - p = argparse.ArgumentParser(description="DFlash agentic-workload benchmark") - p.add_argument("--budget", type=int, default=BUDGET) - p.add_argument("--n-sample", type=int, default=5, - help="prompts per bucket (default 5)") - p.add_argument("--bucket", choices=list(BUCKETS) + ["all"], default="all") - p.add_argument("--n-gen", type=int, default=N_GEN) - p.add_argument("--out", type=str, default=str(TMPDIR / "bench_agent_results.json")) - p.add_argument("--skip-ar", action="store_true", - help="skip the AR baseline (useful for budget sweeps)") - args = p.parse_args() - BUDGET = args.budget - - DRAFT = _resolve_draft() - _require_file(TARGET, "target GGUF") - _require_file(TEST_DFLASH, "test_dflash binary") - if not args.skip_ar: - _require_file(TEST_GENERATE, "test_generate binary") - _require_file(str(SWE_PARQUET), "SWE-bench Verified parquet") - _require_file(str(SYS_PROMPT_SMALL), "small Codex system prompt fixture") - _require_file(str(SYS_PROMPT_LARGE), "large Codex system prompt fixture") - - print(f"[bench-agent] target = {TARGET}", flush=True) - print(f"[bench-agent] draft = {DRAFT}", 
flush=True) - print(f"[bench-agent] tokenizer = {TOKENIZER}", flush=True) - print(f"[bench-agent] budget = {BUDGET} n_gen = {args.n_gen} n_sample = {args.n_sample}", - flush=True) - - from transformers import AutoTokenizer - tok = AutoTokenizer.from_pretrained(TOKENIZER, trust_remote_code=True) - - df = _load_swe_rows() - print(f"[bench-agent] loaded {len(df)} SWE-bench Verified rows", flush=True) - - bucket_keys = list(BUCKETS) if args.bucket == "all" else [args.bucket] - results = {} - load_s_estimate = None # calibrated from first DFlash run - - for bk in bucket_keys: - cfg = BUCKETS[bk] - target = cfg["target"] - sys_path = cfg["sys"] - rows = select_rows_for_bucket(df, target, args.n_sample, seed=42) - - print(f"\n[bench-agent] === bucket {bk} (target ~{target} tok, n={len(rows)}) ===", - flush=True) - per_prompt = [] - for i, row in enumerate(rows): - text, n = build_prompt(tok, sys_path, row, target) - path = TMPDIR / f"agent_{bk}_{i:02d}.bin" - tokenize_to_file(tok, text, path) - - try: - df_res = run_df(path, n, args.n_gen) - except Exception as e: - print(f" [{i+1:02d}/{len(rows)}] n={n:5d} DFlash FAILED: {e}", - flush=True) - continue - - # calibrate load_s from the first DFlash run if not done yet - if load_s_estimate is None: - load_s_estimate = max( - 0.0, - df_res["wall_s"] - df_res["prefill_s"] - - args.n_gen / max(1e-6, df_res["decode_tps"]) - ) - print(f" [calibration] estimated model_load_s = {load_s_estimate:.2f}", - flush=True) - - ar_res = None - if not args.skip_ar: - try: - ar_res = run_ar(path, args.n_gen) - except Exception as e: - print(f" [{i+1:02d}/{len(rows)}] n={n:5d} AR FAILED: {e}", - flush=True) - - entry = { - "bucket": bk, "i": i, "instance_id": row["instance_id"], - "n_prompt": n, "n_gen": args.n_gen, - "df": df_res, "ar": ar_res, "load_s_est": load_s_estimate, - } - - # Derived numbers (per-prompt, AR vs DFlash) - df_decode_s = args.n_gen / max(1e-6, df_res["decode_tps"]) - df_total_s = df_res["prefill_s"] + df_decode_s - 
df_ttft_s = df_res["prefill_s"] + 1.0 / max(1e-6, df_res["decode_tps"]) - entry["df_total_s"] = df_total_s - entry["df_ttft_s"] = df_ttft_s - - line = ( - f" [{i+1:02d}/{len(rows)}] n={n:5d} " - f"DF prefill={df_res['prefill_s']:6.2f}s ({n/df_res['prefill_s']:6.1f} tok/s) " - f"decode={df_res['decode_tps']:6.2f} tok/s " - f"AL={df_res['al']:5.2f} " - f"TTFT={df_ttft_s:6.2f}s total={df_total_s:6.2f}s" - ) - if ar_res is not None: - ar_decode_s = args.n_gen / max(1e-6, ar_res["decode_tps"]) - ar_prefill_s = max(0.0, ar_res["wall_s"] - load_s_estimate - ar_decode_s) - ar_ttft_s = ar_prefill_s + 1.0 / max(1e-6, ar_res["decode_tps"]) - ar_total_s = ar_prefill_s + ar_decode_s - entry["ar_prefill_s_est"] = ar_prefill_s - entry["ar_total_s"] = ar_total_s - entry["ar_ttft_s"] = ar_ttft_s - entry["speedup_decode"] = df_res["decode_tps"] / max(1e-6, ar_res["decode_tps"]) - entry["speedup_total"] = ar_total_s / max(1e-6, df_total_s) - line += ( - f" || AR prefill≈{ar_prefill_s:6.2f}s " - f"decode={ar_res['decode_tps']:5.2f} tok/s " - f"total={ar_total_s:6.2f}s " - f"speedup decode={entry['speedup_decode']:.2f}x " - f"total={entry['speedup_total']:.2f}x" - ) - print(line, flush=True) - per_prompt.append(entry) - - # bucket aggregates - if per_prompt: - def _mean(xs): return sum(xs) / len(xs) if xs else 0.0 - agg = { - "n_samples": len(per_prompt), - "n_prompt_mean": _mean([e["n_prompt"] for e in per_prompt]), - "df_prefill_s_mean": _mean([e["df"]["prefill_s"] for e in per_prompt]), - "df_decode_tps_mean": _mean([e["df"]["decode_tps"] for e in per_prompt]), - "df_al_mean": _mean([e["df"]["al"] for e in per_prompt]), - "df_ttft_s_mean": _mean([e["df_ttft_s"] for e in per_prompt]), - "df_total_s_mean": _mean([e["df_total_s"] for e in per_prompt]), - "stages_mean": { - k: _mean([e["df"]["stages"].get(k, 0.0) for e in per_prompt]) - for k in sorted({k for e in per_prompt for k in e["df"]["stages"]}) - }, - } - ar_entries = [e for e in per_prompt if e.get("ar") is not None] - if 
ar_entries: - agg["ar_prefill_s_est_mean"] = _mean([e["ar_prefill_s_est"] for e in ar_entries]) - agg["ar_decode_tps_mean"] = _mean([e["ar"]["decode_tps"] for e in ar_entries]) - agg["ar_ttft_s_mean"] = _mean([e["ar_ttft_s"] for e in ar_entries]) - agg["ar_total_s_mean"] = _mean([e["ar_total_s"] for e in ar_entries]) - agg["speedup_decode_mean"] = _mean([e["speedup_decode"] for e in ar_entries]) - agg["speedup_total_mean"] = _mean([e["speedup_total"] for e in ar_entries]) - results[bk] = {"per_prompt": per_prompt, "agg": agg} - - print(f"\n [{bk}] mean: n={agg['n_prompt_mean']:.0f} " - f"DF prefill={agg['df_prefill_s_mean']:.2f}s " - f"decode={agg['df_decode_tps_mean']:.2f} tok/s " - f"AL={agg['df_al_mean']:.2f} " - f"TTFT={agg['df_ttft_s_mean']:.2f}s " - f"total={agg['df_total_s_mean']:.2f}s", - flush=True) - if ar_entries: - print(f" [{bk}] mean: AR prefill≈{agg['ar_prefill_s_est_mean']:.2f}s " - f"decode={agg['ar_decode_tps_mean']:.2f} tok/s " - f"total={agg['ar_total_s_mean']:.2f}s " - f"|| speedup decode={agg['speedup_decode_mean']:.2f}x " - f"TOTAL={agg['speedup_total_mean']:.2f}x", - flush=True) - - # ── final comparison vs RESULTS.md headline ───────────────────────── - print("\n[bench-agent] === COMPARISON vs RESULTS.md HumanEval headline ===") - print(f"{'Bucket':>8s} {'n_tok':>6s} {'AR tps':>7s} {'DF tps':>7s} " - f"{'AL':>5s} {'TTFT':>7s} {'Total':>7s} {'sp_dec':>7s} {'sp_tot':>7s}") - print(f"{'HumanEv':>8s} {' ~120':>6s} {' 37.78':>7s} {'129.52':>7s} " - f"{' 8.31':>5s} {' --':>7s} {' --':>7s} {' 3.43x':>7s} {' --':>7s} " - f"(from RESULTS.md)") - for bk, r in results.items(): - a = r["agg"] - sp_d = a.get("speedup_decode_mean", 0.0) - sp_t = a.get("speedup_total_mean", 0.0) - ar_tps = a.get("ar_decode_tps_mean", 0.0) - print(f"{bk:>8s} {a['n_prompt_mean']:6.0f} {ar_tps:7.2f} " - f"{a['df_decode_tps_mean']:7.2f} {a['df_al_mean']:5.2f} " - f"{a['df_ttft_s_mean']:6.2f}s {a['df_total_s_mean']:6.2f}s " - f"{sp_d:6.2f}x {sp_t:6.2f}x") - - # write JSON 
(per-prompt and agg) - out_path = Path(args.out) - out_path.parent.mkdir(parents=True, exist_ok=True) - with open(out_path, "w") as f: - json.dump(results, f, indent=2, default=str) - print(f"\n[bench-agent] wrote {out_path}", flush=True) - - -if __name__ == "__main__": - main() diff --git a/server/scripts/bench_agent_loop.py b/server/scripts/bench_agent_loop.py deleted file mode 100644 index 3f4a668b3..000000000 --- a/server/scripts/bench_agent_loop.py +++ /dev/null @@ -1,190 +0,0 @@ -"""B.6: agent-loop bench using real Claude Code session messages. - -Extracts the first N user-text turns from a session JSONL, replays them -sequentially through the dflash server, and reports per-turn latency -under two configs: prefix-cache enabled vs disabled. - -Usage: - python3 dflash/scripts/bench_agent_loop.py [--turns N] [--session PATH] - -Default session = most recent JSONL under -~/.claude/projects/-home-peppi-Dev-lucebox-hub/. - -Each turn's user text is the real human prompt from the session. Assistant -replies are generated by the dflash server (small max_tokens to keep -the bench fast); the synthesized history grows turn-by-turn. - -Compares cold (--prefix-cache-slots=0) vs warm (--prefix-cache-slots=4). -Reports total wall time, per-turn latency, and per-turn ratio. 
-""" -import argparse -import json -import os -import signal -import subprocess -import sys -import time -import urllib.error -import urllib.request -from pathlib import Path - -ROOT = Path(__file__).resolve().parent.parent.parent -TARGET = Path.home() / "models/qwen3.6-27b/Qwen3.6-27B-UD-Q4_K_XL.gguf" -DRAFT = Path.home() / "models/qwen3.6-27b-dflash" -SERVER_BIN = ROOT / "dflash/build/dflash_server" -SESSION_DIR = Path.home() / ".claude/projects/-home-peppi-Dev-lucebox-hub" - - -def extract_user_turns(jsonl_path: Path, limit: int) -> list[str]: - """Pull the first `limit` user-text messages from a Claude Code session.""" - turns = [] - with open(jsonl_path) as f: - for ln in f: - try: - rec = json.loads(ln) - except json.JSONDecodeError: - continue - if rec.get("type") != "user": - continue - msg = rec.get("message", {}) - content = msg.get("content", "") - if isinstance(content, str) and content.strip() and not content.startswith("<"): - # Skip command-name records (they start with etc). 
- turns.append(content.strip()) - if len(turns) >= limit: - break - return turns - - -def chat_post(port: int, payload: dict, timeout=600) -> str: - body = json.dumps(payload).encode() - req = urllib.request.Request( - f"http://127.0.0.1:{port}/v1/chat/completions", - data=body, headers={"Content-Type": "application/json"}) - resp = urllib.request.urlopen(req, timeout=timeout) - data = json.loads(resp.read()) - return data["choices"][0]["message"]["content"] - - -def wait_server_up(port: int, proc: subprocess.Popen, timeout=180) -> bool: - deadline = time.time() + timeout - while time.time() < deadline: - if proc.poll() is not None: - return False - try: - urllib.request.urlopen(f"http://127.0.0.1:{port}/v1/models", timeout=1).read() - return True - except (urllib.error.URLError, ConnectionResetError, TimeoutError): - time.sleep(1) - return False - - -def run_config(label: str, port: int, slots: int, user_turns: list[str], - max_tokens: int, log_path: Path) -> list[float]: - """Spin up server with --prefix-cache-slots=slots, replay turns, return latencies.""" - log_f = open(log_path, "w") - proc = subprocess.Popen( - [str(SERVER_BIN), str(TARGET), - "--draft", str(DRAFT), - "--max-ctx", "4096", "--port", str(port), - "--prefix-cache-slots", str(slots)], - stdout=log_f, stderr=subprocess.STDOUT, bufsize=1) - - if not wait_server_up(port, proc): - log_f.close() - out = log_path.read_text()[-1500:] - proc.send_signal(signal.SIGINT) - try: proc.wait(timeout=10) - except subprocess.TimeoutExpired: proc.kill() - raise RuntimeError(f"{label}: server didn't come up\n{out}") - - print(f"\n--- {label} (slots={slots}) ---", flush=True) - - history = [] - SYSTEM = "You are a precise coding assistant for the lucebox-hub repo. Answer concisely." 
- latencies = [] - try: - for i, user_text in enumerate(user_turns): - history.append({"role": "user", "content": user_text}) - msgs = [{"role": "system", "content": SYSTEM}, *history] - payload = {"model": "luce-dflash", "messages": msgs, - "max_tokens": max_tokens, "stream": False} - t0 = time.time() - try: - reply = chat_post(port, payload, timeout=300) - except Exception as e: - print(f" turn {i+1}: ERROR {e}") - latencies.append(float("nan")) - continue - dt = time.time() - t0 - latencies.append(dt) - history.append({"role": "assistant", "content": reply}) - print(f" turn {i+1}: {dt:.2f}s reply={reply[:50]!r}", flush=True) - finally: - proc.send_signal(signal.SIGINT) - try: proc.wait(timeout=10) - except subprocess.TimeoutExpired: proc.kill() - log_f.close() - - return latencies - - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("--turns", type=int, default=5, - help="Number of user turns to replay") - ap.add_argument("--max-tokens", type=int, default=8, - help="max_tokens per response (kept small to bound bench time)") - ap.add_argument("--session", type=Path, default=None, - help="Path to session JSONL; default = most recent under " - f"{SESSION_DIR}") - args = ap.parse_args() - - if not TARGET.exists() or not SERVER_BIN.exists(): - print(f"SKIP: prereqs missing (target={TARGET.exists()} bin={SERVER_BIN.exists()})") - return 0 - - if args.session: - session = args.session - else: - candidates = sorted(SESSION_DIR.glob("*.jsonl"), - key=lambda p: p.stat().st_mtime, reverse=True) - if not candidates: - print(f"No session JSONL under {SESSION_DIR}") - return 1 - session = candidates[0] - print(f"Session: {session.name}", flush=True) - - user_turns = extract_user_turns(session, args.turns) - if len(user_turns) < args.turns: - print(f"Only got {len(user_turns)} turns") - print(f"Extracted {len(user_turns)} user turns:") - for i, t in enumerate(user_turns): - print(f" [{i+1}] {t[:80]!r}{'...' 
if len(t)>80 else ''}") - - # Cold config: cache disabled (slots=0) → every turn re-prefills full history - cold = run_config("COLD (cache disabled)", port=18290, slots=0, - user_turns=user_turns, max_tokens=args.max_tokens, - log_path=Path("/tmp/bench_cold.log")) - - # Warm config: cache enabled (slots=4) → multi-point inline-snap - warm = run_config("WARM (cache enabled)", port=18291, slots=4, - user_turns=user_turns, max_tokens=args.max_tokens, - log_path=Path("/tmp/bench_warm.log")) - - print("\n=== Per-turn latency ===", flush=True) - print(f"{'turn':>4} {'cold':>8} {'warm':>8} {'speedup':>8}") - total_cold = total_warm = 0.0 - for i, (c, w) in enumerate(zip(cold, warm), start=1): - speedup = (c / w) if (w and w > 0) else float("nan") - print(f"{i:>4} {c:>8.2f} {w:>8.2f} {speedup:>7.2f}x") - total_cold += c; total_warm += w - overall = total_cold / total_warm if total_warm else float("nan") - print(f"\ntotal_cold={total_cold:.2f}s total_warm={total_warm:.2f}s " - f"overall speedup={overall:.2f}x") - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/server/scripts/bench_daemon.py b/server/scripts/bench_daemon.py deleted file mode 100644 index 1f7ea80bc..000000000 --- a/server/scripts/bench_daemon.py +++ /dev/null @@ -1,140 +0,0 @@ -"""Daemon-mode HE bench. Hits /v1/chat/completions with the same 10 HE -prompts as bench_he.py and reports mean tok/s. - -Streams the response and reports two numbers per prompt: - - * wall — total HTTP time (tokenize + prefill + decode + HTTP / JSON) - * decode — first-token → last-token elapsed, matching bench_he.py's - tok/s (excludes prefill + setup) - -Compare `decode` against bench_he.py to verify the C++ decode path is as -fast under the daemon as under a one-shot test_dflash invocation. 
- -Start the server first (same config the published numbers use): - DFLASH27B_KV_TQ3=1 ./build/dflash_server models/Qwen3.6-27B-Q4_K_M.gguf \\ - --ddtree --ddtree-budget 22 --max-ctx 16384 --port 8000 - -Then: - python3 scripts/bench_daemon.py --url http://localhost:8000 --n-gen 256 -""" -import argparse -import json -import time -import urllib.request -from pathlib import Path -import sys - -# Reuse the exact same 10 HE prompts bench_he.py uses. -sys.path.insert(0, str(Path(__file__).resolve().parent)) -from bench_he import PROMPTS - - -def run(url: str, prompt: str, n_gen: int) -> tuple[int, float, float]: - """POST to /v1/chat/completions with stream=true. Return (n_tok, wall_secs, - decode_secs) where decode_secs starts at the first streamed token (after - prefill) and ends at the last token.""" - body = json.dumps({ - "model": "luce-dflash", - "messages": [{"role": "user", "content": prompt}], - "max_tokens": n_gen, - "stream": True, - }).encode() - req = urllib.request.Request( - url + "/v1/chat/completions", - data=body, - headers={"Content-Type": "application/json", - "Accept": "text/event-stream"}, - ) - t0 = time.perf_counter() - t_first = 0.0 - t_last = 0.0 - n_tok = 0 - with urllib.request.urlopen(req, timeout=600) as r: - for raw in r: - line = raw.decode("utf-8", errors="replace").rstrip() - if not line.startswith("data:"): - continue - payload = line[5:].strip() - if payload == "[DONE]": - break - try: - chunk = json.loads(payload) - except json.JSONDecodeError: - continue - choices = chunk.get("choices") or [] - if not choices: - continue - delta = choices[0].get("delta") or {} - # Count tokens by content / reasoning deltas. Tool-call deltas - # aren't counted — they arrive as a single final chunk. 
- if delta.get("content") or delta.get("reasoning_content"): - if n_tok == 0: - t_first = time.perf_counter() - n_tok += 1 - t_last = time.perf_counter() - wall = time.perf_counter() - t0 - decode = (t_last - t_first) if n_tok > 1 else 0.0 - return n_tok, wall, decode - - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("--url", default="http://localhost:8000", - help="Base URL of the running server (no /v1 suffix)") - ap.add_argument("--n-gen", type=int, default=256) - ap.add_argument("--warmup", action="store_true", - help="Run the first prompt once before timing to discard " - "cold-start effects (model is already resident, but " - "the first request allocates the decode VMM chunks).") - args = ap.parse_args() - - if args.warmup: - print("[bench] warmup...", flush=True) - run(args.url, PROMPTS[0][1], args.n_gen) - - print(f"[bench] daemon API n_gen={args.n_gen} url={args.url}", flush=True) - print(f"{'prompt':28s} {'n_tok':>5s} {'wall_s':>7s} {'dec_s':>7s} " - f"{'wall_tps':>9s} {'dec_tps':>9s}") - print("-" * 72) - wall_tps_list: list[float] = [] - dec_tps_list: list[float] = [] - total_tok = 0 - total_wall = 0.0 - total_decode = 0.0 - for name, text in PROMPTS: - try: - n_tok, wall, decode = run(args.url, text, args.n_gen) - except Exception as e: - print(f" {name:26s} FAILED: {e}", flush=True) - continue - if n_tok == 0: - print(f" {name:26s} {n_tok:5d} {wall:7.2f} -- -- -- " - " (empty — daemon likely OOM'd)", flush=True) - continue - wall_tps = n_tok / wall - dec_tps = (n_tok - 1) / decode if decode > 0 else 0.0 - wall_tps_list.append(wall_tps) - if dec_tps > 0: - dec_tps_list.append(dec_tps) - total_decode += decode - total_tok += n_tok - total_wall += wall - print(f" {name:26s} {n_tok:5d} {wall:7.2f} {decode:7.2f} " - f"{wall_tps:9.2f} {dec_tps:9.2f}", flush=True) - - print("-" * 72) - if wall_tps_list: - print(f"wall tok/s mean: {sum(wall_tps_list)/len(wall_tps_list):7.2f} " - f"(HTTP + tokenize + prefill + decode)") - if dec_tps_list: - 
print(f"decode tok/s mean: {sum(dec_tps_list)/len(dec_tps_list):7.2f} " - f"(first-token → last-token, matches bench_he.py's number)") - agg_dec = (total_tok - len(dec_tps_list)) / total_decode if total_decode > 0 else 0.0 - print(f"decode tok/s aggregate:{agg_dec:7.2f}") - print(f"decode tok/s range: {min(dec_tps_list):.2f} - {max(dec_tps_list):.2f}") - else: - print("no successful runs") - - -if __name__ == "__main__": - main() diff --git a/server/scripts/bench_he.py b/server/scripts/bench_he.py deleted file mode 100644 index a4da24f6d..000000000 --- a/server/scripts/bench_he.py +++ /dev/null @@ -1,463 +0,0 @@ -""" -Bench DFlash test_dflash over multiple HumanEval-style prompts to get a stable -average acceptance length. Single-prompt measurements are noisy — z-lab's 8.09 -AL on humaneval is averaged over 164 samples. - -Usage on lucebox: - python3 bench_he.py # run all 10 prompts with --fast-rollback - python3 bench_he.py --mode batched # run without --fast-rollback for A/B -""" -import argparse -import os -import re -import struct -import subprocess -import sys -import tempfile -from pathlib import Path - -from placement.backend_device import apply_backend_visible_devices -from placement.test_dflash_args import TestDflashLaunchArgs - - -ROOT = Path(__file__).resolve().parent.parent -BIN_SUFFIX = ".exe" if os.name == "nt" else "" -TARGET = os.environ.get( - "DFLASH_TARGET", - str(ROOT / "models" / "Qwen3.6-27B-Q4_K_M.gguf"), -) -_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q4_k_m.gguf" -_LOCAL_DRAFT_ROOT = ROOT / "models" / "draft" -DRAFT = None -TEST_DFLASH = os.environ.get( - "DFLASH_BIN", - str(ROOT / "build" / f"test_dflash{BIN_SUFFIX}"), -) -TMPDIR = Path(tempfile.gettempdir()) / "dflash_bench" -TMPDIR.mkdir(parents=True, exist_ok=True) - -PROMPTS = [ - # (name, source_code) - ( - "has_close_elements", - "from typing import List\n\n" - "def has_close_elements(numbers: List[float], threshold: float) -> bool:\n" - ' """Check if in given 
list of numbers, are any two numbers closer to each other than\n' - " given threshold.\n" - " >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n" - " False\n" - " >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n" - " True\n" - ' """\n' - " for", - ), - ( - "separate_paren_groups", - "from typing import List\n\n" - "def separate_paren_groups(paren_string: str) -> List[str]:\n" - ' """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n' - " separate those group into separate strings and return the list of those.\n" - " Separate groups are balanced (each open brace is properly closed) and not nested within each other\n" - " Ignore any spaces in the input string.\n" - " >>> separate_paren_groups('( ) (( )) (( )( ))')\n" - " ['()', '(())', '(()())']\n" - ' """\n' - " result = []\n" - " current_string = []\n" - " current_depth = 0\n" - " for", - ), - ( - "truncate_number", - "def truncate_number(number: float) -> float:\n" - ' """ Given a positive floating point number, it can be decomposed into\n' - " and integer part (largest integer smaller than given number) and decimals\n" - " (leftover part always smaller than 1).\n" - "\n" - " Return the decimal part of the number.\n" - " >>> truncate_number(3.5)\n" - " 0.5\n" - ' """\n' - " return", - ), - ( - "below_zero", - "from typing import List\n\n" - "def below_zero(operations: List[int]) -> bool:\n" - ' """ You\'re given a list of deposit and withdrawal operations on a bank account that starts with\n' - " zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n" - " at that point function should return True. 
Otherwise it should return False.\n" - " >>> below_zero([1, 2, 3])\n" - " False\n" - " >>> below_zero([1, 2, -4, 5])\n" - " True\n" - ' """\n' - " balance = 0\n" - " for op in", - ), - ( - "mean_absolute_deviation", - "from typing import List\n\n" - "def mean_absolute_deviation(numbers: List[float]) -> float:\n" - ' """ For a given list of input numbers, calculate Mean Absolute Deviation\n' - " around the mean of this dataset.\n" - " Mean Absolute Deviation is the average absolute difference between each\n" - " element and a centerpoint (mean in this case):\n" - " MAD = average | x - x_mean |\n" - " >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n" - " 1.0\n" - ' """\n' - " mean =", - ), - ( - "intersperse", - "from typing import List\n\n" - "def intersperse(numbers: List[int], delimeter: int) -> List[int]:\n" - " \"\"\" Insert a number 'delimeter' between every two consecutive elements of input list `numbers'\n" - " >>> intersperse([], 4)\n" - " []\n" - " >>> intersperse([1, 2, 3], 4)\n" - " [1, 4, 2, 4, 3]\n" - ' """\n' - " result = []\n" - " for i, n in", - ), - ( - "parse_nested_parens", - "from typing import List\n\n" - "def parse_nested_parens(paren_string: str) -> List[int]:\n" - ' """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.\n' - " For each of the group, output the deepest level of nesting of parentheses.\n" - " E.g. 
(()()) has maximum two levels of nesting while ((())) has three.\n" - " >>> parse_nested_parens('(()()) ((())) () ((())()())')\n" - " [2, 3, 1, 3]\n" - ' """\n' - " def parse_paren_group(s):\n" - " depth = 0\n" - " max_depth = 0\n" - " for c in", - ), - ( - "filter_by_substring", - "from typing import List\n\n" - "def filter_by_substring(strings: List[str], substring: str) -> List[str]:\n" - ' """ Filter an input list of strings only for ones that contain given substring\n' - " >>> filter_by_substring([], 'a')\n" - " []\n" - " >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')\n" - " ['abc', 'bacd', 'array']\n" - ' """\n' - " return", - ), - ( - "sum_product", - "from typing import List, Tuple\n\n" - "def sum_product(numbers: List[int]) -> Tuple[int, int]:\n" - ' """ For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list.\n' - " Empty sum should be equal to 0 and empty product should be equal to 1.\n" - " >>> sum_product([])\n" - " (0, 1)\n" - " >>> sum_product([1, 2, 3, 4])\n" - " (10, 24)\n" - ' """\n' - " s = 0\n" - " p = 1\n" - " for n in", - ), - ( - "rolling_max", - "from typing import List\n\n" - "def rolling_max(numbers: List[int]) -> List[int]:\n" - ' """ From a given list of integers, generate a list of rolling maximum element found until given moment\n' - " in the sequence.\n" - " >>> rolling_max([1, 2, 3, 2, 3, 4, 2])\n" - " [1, 2, 3, 3, 3, 4, 4]\n" - ' """\n' - " result = []\n" - " running_max = None\n" - " for n in numbers:\n" - " if running_max is", - ), -] - - -def _find_draft_file(root: Path) -> str | None: - if root.is_file(): - return str(root) if root.suffix in (".safetensors", ".gguf") else None - if not root.is_dir(): - return None - for pattern in ("dflash-draft-*.gguf", "*.gguf", "model.safetensors"): - matches = sorted(root.rglob(pattern)) - if matches: - return str(matches[0]) - return None - - -def _resolve_draft() -> str: - env = os.environ.get("DFLASH_DRAFT") - if env: - 
found = _find_draft_file(Path(env)) - if found: - return found - raise FileNotFoundError(f"DFLASH_DRAFT does not point to a draft file: {env}") - - for candidate in (_LOCAL_DRAFT_FILE, _LOCAL_DRAFT_ROOT): - found = _find_draft_file(candidate) - if found: - return found - - raise FileNotFoundError( - "draft model file not found. Expected one of:\n" - f" - {_LOCAL_DRAFT_FILE}\n" - "Download it as documented in the README, or set DFLASH_DRAFT to an explicit .safetensors/.gguf file or directory." - ) - - -def _require_file(path: str, label: str): - if not Path(path).is_file(): - raise FileNotFoundError(f"{label} not found: {path}") - - -def _tokenizer_slug(tokenizer_id: str) -> str: - """Filesystem-safe slug for tokenizer cache keying.""" - return re.sub(r"[^A-Za-z0-9._-]+", "_", tokenizer_id) - - -def _prompt_path(i: int, tokenizer_slug: str) -> Path: - return TMPDIR / f"he_prompt_{tokenizer_slug}_{i:02d}.bin" - - -def tokenize_prompt(prompt: str, out_path: Path, tokenizer) -> int: - ids = tokenizer.encode(prompt, add_special_tokens=False) - with open(out_path, "wb") as f: - for tid in ids: - f.write(struct.pack(" dict: - out_bin = TMPDIR / "he_bench_out.bin" - cmd = [ - TEST_DFLASH, TARGET, DRAFT, str(prompt_path), str(n_gen), str(out_bin), - ] - if fast_rollback: - cmd.append("--fast-rollback") - if ddtree_budget is not None: - cmd.append("--ddtree") - cmd.append(f"--ddtree-budget={ddtree_budget}") - if ddtree_temp is not None: - cmd.append(f"--ddtree-temp={ddtree_temp}") - if ddtree_no_chain_seed: - cmd.append("--ddtree-no-chain-seed") - if extra_args: - cmd.extend(extra_args) - env = os.environ.copy() - if extra_env: - env.update(extra_env) - r = subprocess.run(cmd, capture_output=True, text=True, timeout=600, env=env) - if r.returncode != 0: - print("STDERR:", r.stderr[-2000:]) - raise RuntimeError(f"test_dflash exited {r.returncode}") - - # Parse output. 
The target layer-split harness prints both prefill and - # decode lines, so avoid the older "first tok/s wins" regexp there. - out = r.stdout - m_prefill = re.search( - r"\[target-split\] prefill tokens=(\d+) time=(\d+(?:\.\d+)?) s speed=(\d+(?:\.\d+)?) tok/s", - out, - ) - m_decode_split = re.search( - r"\[target-split-dflash\] decode tokens=(\d+) time=(\d+(?:\.\d+)?) s speed=(\d+(?:\.\d+)?) tok/s", - out, - ) - m_decode_default = re.search( - r"\[dflash\] generated \d+ tokens in \d+(?:\.\d+)? s\s+->\s+(\d+(?:\.\d+)?) tok/s", - out, - ) - m_tps = re.search(r"(\d+(?:\.\d+)?)\s+tok/s", out) - m_commit = re.search(r"avg commit/step=(\d+(?:\.\d+)?)", out) - m_accept = re.search(r"accepted=(\d+)/(\d+) \((\d+(?:\.\d+)?)%", out) - m_steps = re.search(r"(\d+) draft steps", out) - if not ((m_decode_split or m_decode_default or m_tps) and m_commit and m_accept and m_steps): - print("STDOUT tail:", out[-2000:]) - raise RuntimeError("failed to parse output") - if m_decode_split: - tok_s = float(m_decode_split.group(3)) - elif m_decode_default: - tok_s = float(m_decode_default.group(1)) - else: - tok_s = float(m_tps.group(1)) - return { - "tok_s": tok_s, - "prefill_tok_s": float(m_prefill.group(3)) if m_prefill else None, - "commit_per_step": float(m_commit.group(1)), - "accepted": int(m_accept.group(1)), - "total_draft_pos": int(m_accept.group(2)), - "pct": float(m_accept.group(3)), - "steps": int(m_steps.group(1)), - } - - -def main(): - global DRAFT - DRAFT = _resolve_draft() - _require_file(TARGET, "target GGUF") - _require_file(TEST_DFLASH, "test_dflash binary") - - ap = argparse.ArgumentParser() - ap.add_argument("--n-gen", type=int, default=128) - ap.add_argument("--mode", choices=["fast", "batched"], default="fast") - ap.add_argument("--skip-tokenize", action="store_true") - ap.add_argument("--ddtree-budget", type=int, default=None, - help="Enable DDTree mode with this node budget (e.g. 
15, 32, 64)") - ap.add_argument("--ddtree-temp", type=float, default=None, - help="Sharpen draft logits with this temperature (T<1 widens top-1/top-2 gap)") - ap.add_argument("--ddtree-no-chain-seed", action="store_true", - help="Use paper's pure best-first (no chain pre-seed)") - ap.add_argument("--draft-feature-mirror", action="store_true", - help="Use the draft-side target feature mirror path") - ap.add_argument("--peer-access", action="store_true", - help="Prefer CUDA P2P memcpy between GPUs when available (else host-staged copy)") - ap.add_argument("--target-gpu", type=int, default=None, - help="Visible CUDA device id for the target backend") - ap.add_argument("--draft-gpu", type=int, default=None, - help="Visible CUDA device id for the draft backend") - ap.add_argument("--target-gpus", default=None, - help="Comma-separated target GPU ids for the layer-split harness") - ap.add_argument("--target-layer-split", default=None, - help="Comma-separated layer split weights matching --target-gpus") - ap.add_argument("--target-split-load-draft", action="store_true", - help="Load the draft alongside the target layer-split harness") - ap.add_argument("--target-split-dflash", action="store_true", - help="Run chain DFlash decode through the target layer-split harness") - ap.add_argument("--draft-ipc-bin", default=None, - help="Path to a different-backend test_dflash used as the remote draft daemon") - ap.add_argument("--draft-ipc-gpu", type=int, default=None, - help="GPU id passed to the remote draft daemon") - ap.add_argument("--draft-ipc-work-dir", default=None, - help="Work directory for host-file IPC with the remote draft daemon") - ap.add_argument("--draft-ipc-ring-cap", type=int, default=None, - help="Feature-ring capacity for the remote draft daemon") - ap.add_argument("--max-ctx", type=int, default=None, - help="Forward --max-ctx=N to test_dflash") - ap.add_argument("--prefill-ubatch", type=int, default=None, - help="Set DFLASH27B_PREFILL_UBATCH for target split 
prefill") - ap.add_argument("--cuda-visible-devices", default=None, - help="Optional CUDA_VISIBLE_DEVICES override for test_dflash") - ap.add_argument("--target-tokenizer", - default=os.environ.get("DFLASH_TOKENIZER", "Qwen/Qwen3.5-27B"), - help="HuggingFace tokenizer repo for the target. Defaults to " - "$DFLASH_TOKENIZER, then Qwen/Qwen3.5-27B. Override for " - "Qwen3.6 or other variants, e.g. " - "--target-tokenizer Qwen/Qwen3.6-27B") - args = ap.parse_args() - - # Tokenized prompts are cached at TMPDIR/he_prompt__NN.bin so - # different --target-tokenizer values never collide. Without the slug, - # `--skip-tokenize` after a prior run with a different tokenizer would - # silently feed the wrong token IDs to the bench. - tok_slug = _tokenizer_slug(args.target_tokenizer) - - print(f"[bench] target = {TARGET}") - print(f"[bench] draft = {DRAFT}") - print(f"[bench] bin = {TEST_DFLASH}") - print(f"[bench] tmp = {TMPDIR}") - print(f"[bench] tokenizer = {args.target_tokenizer}") - - if not args.skip_tokenize: - print(f"[bench] tokenizing prompts via HF…") - from transformers import AutoTokenizer - tok = AutoTokenizer.from_pretrained(args.target_tokenizer, trust_remote_code=True) - for i, (name, p) in enumerate(PROMPTS): - path = _prompt_path(i, tok_slug) - n = tokenize_prompt(p, path, tok) - print(f" [{i:02d}] {name:26s} {n:4d} tokens") - else: - if not _prompt_path(0, tok_slug).exists(): - sys.exit( - f"[error] --skip-tokenize requested but no cache for " - f"tokenizer={args.target_tokenizer!r} (looked for " - f"{_prompt_path(0, tok_slug)}). 
Drop --skip-tokenize to " - f"tokenize fresh, or pass --target-tokenizer matching a " - f"previous run.") - print(f"[bench] skipping tokenize (reusing {_prompt_path(0, tok_slug).parent})") - - print(f"\n[bench] mode={args.mode} n_gen={args.n_gen}") - print(f"{'prompt':28s} {'steps':>6s} {'AL':>6s} {'pct%':>6s} {'prefill':>8s} {'decode':>8s}") - print("-" * 72) - - extra_args = TestDflashLaunchArgs( - draft_feature_mirror=args.draft_feature_mirror, - peer_access=args.peer_access, - target_gpu=args.target_gpu, - draft_gpu=args.draft_gpu, - target_gpus=args.target_gpus, - target_layer_split=args.target_layer_split, - target_split_load_draft=args.target_split_load_draft, - target_split_dflash=args.target_split_dflash, - draft_ipc_bin=args.draft_ipc_bin, - draft_ipc_gpu=args.draft_ipc_gpu, - draft_ipc_work_dir=args.draft_ipc_work_dir, - draft_ipc_ring_cap=args.draft_ipc_ring_cap, - max_ctx=args.max_ctx, - ).to_cli_args() - - extra_env = {} - if args.cuda_visible_devices: - extra_env = apply_backend_visible_devices( - "cuda", - visible_devices=args.cuda_visible_devices, - base_env=extra_env, - ) - if args.prefill_ubatch is not None: - extra_env["DFLASH27B_PREFILL_UBATCH"] = str(args.prefill_ubatch) - - results = [] - for i, (name, _) in enumerate(PROMPTS): - path = _prompt_path(i, tok_slug) - try: - r = run_test_dflash(path, args.n_gen, - fast_rollback=(args.mode == "fast" and not args.target_split_dflash), - ddtree_budget=args.ddtree_budget, - ddtree_temp=args.ddtree_temp, - ddtree_no_chain_seed=args.ddtree_no_chain_seed, - extra_args=extra_args, - extra_env=extra_env) - except Exception as e: - print(f" [{i:02d}] {name:26s} FAILED: {e}") - continue - results.append((name, r)) - prefill_s = f"{r['prefill_tok_s']:8.2f}" if r["prefill_tok_s"] is not None else f"{'n/a':>8s}" - print( - f" {name:26s} {r['steps']:6d} {r['commit_per_step']:6.2f} " - f"{r['pct']:6.1f} {prefill_s} {r['tok_s']:8.2f}" - ) - - if not results: - print("no successful runs") - sys.exit(1) - - n = 
len(results) - mean_al = sum(r["commit_per_step"] for _, r in results) / n - mean_tps = sum(r["tok_s"] for _, r in results) / n - mean_pct = sum(r["pct"] for _, r in results) / n - prefill_vals = [r["prefill_tok_s"] for _, r in results if r["prefill_tok_s"] is not None] - mean_prefill = sum(prefill_vals) / len(prefill_vals) if prefill_vals else None - - print("-" * 72) - prefill_s = f"{mean_prefill:8.2f}" if mean_prefill is not None else f"{'n/a':>8s}" - print(f"{'MEAN':28s} {'':6s} {mean_al:6.2f} {mean_pct:6.1f} {prefill_s} {mean_tps:8.2f}") - print() - print(f"commit/step range: {min(r['commit_per_step'] for _,r in results):.2f} - " - f"{max(r['commit_per_step'] for _,r in results):.2f}") - print(f"tok/s range: {min(r['tok_s'] for _,r in results):.1f} - " - f"{max(r['tok_s'] for _,r in results):.1f}") - - -if __name__ == "__main__": - main() diff --git a/server/scripts/bench_he_http.py b/server/scripts/bench_he_http.py deleted file mode 100644 index 5da3b1d8b..000000000 --- a/server/scripts/bench_he_http.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python3 -"""HumanEval-style bench through dflash_server HTTP API. Measures end-to-end -decode tok/s on code-completion prompts. Server-internal [gemma4-spec] log -captures the true decode speed + acceptance rate per request.""" -import json, time, urllib.request, sys - -PROMPTS = [ - ("has_close_elements", - "from typing import List\n\n" - "def has_close_elements(numbers: List[float], threshold: float) -> bool:\n" - ' """Check if in given list of numbers, are any two numbers closer to each other than\n' - " given threshold.\n" - " >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n" - " False\n" - ' """\n for'), - ("separate_paren_groups", - "from typing import List\n\n" - "def separate_paren_groups(paren_string: str) -> List[str]:\n" - ' """ Separate groups of nested parentheses into list of strings. 
"""\n' - " result = []\n current = ''\n depth = 0\n for"), - ("truncate_number", - "def truncate_number(number: float) -> float:\n" - ' """ Return the decimal part of a floating-point number. """\n' - " return"), - ("below_zero", - "from typing import List\n\n" - "def below_zero(operations: List[int]) -> bool:\n" - ' """ Detect if balance falls below zero. """\n' - " balance = 0\n for op in operations:\n"), - ("mean_absolute_deviation", - "from typing import List\n\n" - "def mean_absolute_deviation(numbers: List[float]) -> float:\n" - ' """ For a given list of input numbers, calculate Mean Absolute Deviation. """\n' - " mean ="), - ("intersperse", - "from typing import List\n\n" - "def intersperse(numbers: List[int], delimeter: int) -> List[int]:\n" - ' """ Insert delimiter between consecutive elements. """\n' - " if not numbers:\n return []\n result ="), - ("parse_nested_parens", - "from typing import List\n\n" - "def parse_nested_parens(paren_string: str) -> List[int]:\n" - ' """ For each group return deepest level of nesting. """\n' - " def parse(s):\n depth = max_depth ="), - ("filter_by_substring", - "from typing import List\n\n" - "def filter_by_substring(strings: List[str], substring: str) -> List[str]:\n" - ' """ Filter strings that contain substring. """\n' - " return"), - ("sum_product", - "from typing import List, Tuple\n\n" - "def sum_product(numbers: List[int]) -> Tuple[int, int]:\n" - ' """ Return tuple (sum, product). Empty list -> (0,1). """\n' - " s ="), - ("rolling_max", - "from typing import List\n\n" - "def rolling_max(numbers: List[int]) -> List[int]:\n" - ' """ Rolling maximum of running list. 
"""\n' - " result = []\n cur = -10**18\n for n in numbers:\n"), -] - -URL = sys.argv[1] if len(sys.argv) > 1 else "http://127.0.0.1:18080/v1/chat/completions" -MAX_TOKENS = int(sys.argv[2]) if len(sys.argv) > 2 else 96 - -total_tok = 0 -total_dt = 0.0 -print(f"[bench] url={URL} max_tokens={MAX_TOKENS} n_prompts={len(PROMPTS)}") -for name, prompt in PROMPTS: - body = { - "model": "dflash", - "messages": [{"role": "user", "content": prompt}], - "max_tokens": MAX_TOKENS, "temperature": 0, - } - req = urllib.request.Request(URL, data=json.dumps(body).encode(), - headers={"Content-Type": "application/json"}) - t = time.time() - try: - with urllib.request.urlopen(req, timeout=120) as resp: - r = json.loads(resp.read().decode()) - except Exception as e: - print(f" {name:30s} FAIL {e}") - continue - dt = time.time() - t - n = r["usage"]["completion_tokens"] - total_tok += n; total_dt += dt - print(f" {name:30s} N={n:3d} dt={dt:.3f}s tok/s={n/dt:6.2f}") - -print(f"\n[bench] total tokens={total_tok} total dt={total_dt:.2f}s") -print(f"[bench] avg tok/s = {total_tok/total_dt:.2f}") diff --git a/server/scripts/bench_llm.py b/server/scripts/bench_llm.py deleted file mode 100644 index 91ba498c5..000000000 --- a/server/scripts/bench_llm.py +++ /dev/null @@ -1,466 +0,0 @@ -""" -10 prompts per dataset, AR + DFlash per prompt. - - python3 scripts/bench_llm.py - -Paths resolve from the repo root by default. 
Override with env vars: - DFLASH_TARGET path to target Qwen3.6-27B-Q4_K_M.gguf (or 3.5) - DFLASH_DRAFT path to DFlash draft GGUF or model.safetensors - DFLASH_BIN path to build/test_dflash - DFLASH_BIN_AR path to build/test_generate - DFLASH_TOKENIZER HF tokenizer repo (default Qwen/Qwen3.5-27B; matches run.py) -""" -import argparse -import json -import os -import re -import struct -import subprocess -import tempfile -from pathlib import Path - -ROOT = Path(__file__).resolve().parent.parent -BIN_SUFFIX = ".exe" if os.name == "nt" else "" -TARGET = os.environ.get( - "DFLASH_TARGET", - str(ROOT / "models" / "Qwen3.6-27B-Q4_K_M.gguf"), -) -_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q4_k_m.gguf" -_LOCAL_DRAFT_ROOT = ROOT / "models" / "draft" -DRAFT = None -TEST_DFLASH = os.environ.get("DFLASH_BIN", str(ROOT / "build" / f"test_dflash{BIN_SUFFIX}")) -TEST_GENERATE = os.environ.get("DFLASH_BIN_AR", str(ROOT / "build" / f"test_generate{BIN_SUFFIX}")) -TOKENIZER = os.environ.get("DFLASH_TOKENIZER", "Qwen/Qwen3.5-27B") -TMPDIR = Path(tempfile.gettempdir()) / "dflash_bench" -TMPDIR.mkdir(parents=True, exist_ok=True) - -N_GEN = 256 -BUDGET = 22 # default; overridden by --budget CLI arg -N_SAMPLE = 10 - -def _gsm_gold(x): - """Extract numeric answer after #### from GSM8K answer field.""" - ans = x["answer"] - idx = ans.rfind("####") - if idx >= 0: - return ans[idx + 4:].strip().replace(",", "") - return ans.strip() - - -BENCHES = [ - ("HumanEval", "openai_humaneval", None, "test", lambda x: x["prompt"], None, N_GEN), - ("GSM8K", "gsm8k", "main", "test", lambda x: f"Question: {x['question']}\nAnswer: ", _gsm_gold, 1024), - ("Math500", "HuggingFaceH4/MATH-500", None, "test", lambda x: f"Problem: {x['problem']}\nSolution: Put your final answer in \\boxed{{}}.\n", lambda x: x["answer"], 2048), -] - - -def _find_draft_model(root: Path) -> str | None: - if root.is_file(): - return str(root) - if not root.is_dir(): - return None - for pattern in 
("dflash-draft-*.gguf", "*.gguf", "model.safetensors"): - matches = sorted(root.rglob(pattern)) - if matches: - return str(matches[0]) - return None - - -def _resolve_draft() -> str: - env = os.environ.get("DFLASH_DRAFT") - if env: - found = _find_draft_model(Path(env)) - if found: - return found - raise FileNotFoundError(f"DFLASH_DRAFT does not point to a DFlash draft GGUF or model.safetensors: {env}") - - for candidate in (_LOCAL_DRAFT_FILE, _LOCAL_DRAFT_ROOT): - found = _find_draft_model(candidate) - if found: - return found - - raise FileNotFoundError( - "DFlash draft GGUF or model.safetensors not found. Expected one of:\n" - f" - {_LOCAL_DRAFT_FILE}\n" - "Download it as documented in the README, or set DFLASH_DRAFT to an explicit file or directory." - ) - - -def _require_file(path: str, label: str): - if not Path(path).is_file(): - raise FileNotFoundError(f"{label} not found: {path}") - - -def _run_checked(cmd, timeout: int, label: str) -> subprocess.CompletedProcess: - r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) - if r.returncode != 0: - tail = (r.stderr or r.stdout or "").strip()[-2000:] - raise RuntimeError(f"{label} exited {r.returncode}: {tail}") - return r - - -def tokenize(tok, p, path: Path): - ids = tok.encode(p, add_special_tokens=False) - with open(path, "wb") as f: - for t in ids: - f.write(struct.pack("20× prefill time (32K prompt + --kv-q4 + - # max_ctx=131072 → 1035s vs 38s at max_ctx=32768). See scripts/run.py. 
- pad = 64 # covers q_len=16 + ddtree verify overhead with margin - return ((n_prompt + n_gen + pad + 255) // 256) * 256 - - -def run_df(path: Path, n_prompt, n_gen: int = N_GEN): - max_ctx = _auto_max_ctx(n_prompt, n_gen) - out_bin = TMPDIR / f"df_out.bin" - r = _run_checked( - [ - TEST_DFLASH, - TARGET, - DRAFT, - str(path), - str(n_gen), - str(out_bin), - "--fast-rollback", - "--ddtree", - f"--ddtree-budget={BUDGET}", - f"--max-ctx={max_ctx}", - ], - timeout=300, - label="test_dflash", - ) - tps = re.search(r"(\d+(?:\.\d+)?)\s+tok/s", r.stdout) - al = re.search(r"avg commit/step=(\d+(?:\.\d+)?)", r.stdout) - if not (tps and al): - raise RuntimeError(f"test_dflash output parse failed: {r.stdout[-1500:]}") - return float(tps.group(1)), float(al.group(1)), out_bin - - -def _read_ids(path: Path): - """Read a binary file of packed int32 token IDs.""" - data = path.read_bytes() - return list(struct.unpack(f"<{len(data)//4}i", data)) - - -def _extract_boxed(text: str) -> str | None: - """Extract the last \\boxed{...} from a string, handling nested braces.""" - results = [] - i = 0 - while i < len(text): - idx = text.find("\\boxed{", i) - if idx == -1: - break - start = idx + len("\\boxed{") - depth = 1 - j = start - while j < len(text) and depth > 0: - if text[j] == "{": - depth += 1 - elif text[j] == "}": - depth -= 1 - j += 1 - if depth == 0: - results.append(text[start:j-1].strip()) - i = j - return results[-1] if results else None - - -def _normalize_math(s: str) -> str: - """Normalize a math answer string for comparison.""" - if s is None: - return "" - s = s.strip() - if s.startswith("$") and s.endswith("$"): - s = s[1:-1].strip() - # Strip currency $ (e.g. 
"$18" → "18") - if re.match(r'^\$\d', s): - s = s[1:] - s = re.sub(r"\\text\s*\{([^}]*)\}", r"\1", s) - s = re.sub(r"\\mathrm\s*\{([^}]*)\}", r"\1", s) - for cmd in [r"\left", r"\right", r"\displaystyle", r"\tfrac", r"\dfrac"]: - s = s.replace(cmd, "") - for unit in [" cm", " m", " km", " kg", " g", " s", " ms", - " degrees", " degree", "°", " inches", " feet", - " square units", " units", " dollars"]: - if s.lower().rstrip(".").endswith(unit): - s = s[:len(s) - len(unit) - (1 if s.endswith(".") else 0)] - s = re.sub(r"\s+", " ", s).strip() - s = s.rstrip(".,") - return s - - -def _math_equiv(pred: str, gold: str) -> bool: - """Check if two math answers are equivalent.""" - if pred is None or gold is None: - return False - p = _normalize_math(pred) - g = _normalize_math(gold) - if p == g: - return True - p_c = re.sub(r"\s*\\frac", r"\\frac", p) - g_c = re.sub(r"\s*\\frac", r"\\frac", g) - if p_c == g_c: - return True - try: - pf = float(p.replace(",", "")) - gf = float(g.replace(",", "")) - return abs(pf - gf) < 1e-6 - except (ValueError, TypeError): - pass - mixed_pat = re.compile(r"^(\d+)\s*\\frac\s*\{(\d+)\}\s*\{(\d+)\}$") - for s, other in [(p, g), (g, p)]: - m = mixed_pat.match(s) - if m: - try: - val = float(m.group(1)) + float(m.group(2)) / float(m.group(3)) - oval = float(other.replace(",", "")) - if abs(val - oval) < 1e-6: - return True - except (ValueError, ZeroDivisionError): - pass - frac_pat = re.compile(r"\\?frac\s*\{([^}]+)\}\s*\{([^}]+)\}") - for s, other in [(p, g), (g, p)]: - m = frac_pat.search(s) - if m: - try: - val = float(m.group(1)) / float(m.group(2)) - oval = float(other.replace(",", "")) - if abs(val - oval) < 1e-6: - return True - except (ValueError, ZeroDivisionError): - pass - return False - - -def score_math(output_bin: Path, gold_answer: str, tok) -> tuple[bool, str]: - """Score a Math500 output against the gold answer. 
Returns (correct, detail_str).""" - ids = _read_ids(output_bin) - text = tok.decode(ids) - - think_end = text.rfind("") - answer_text = text[think_end + len(""):] if think_end >= 0 else text - - pred = _extract_boxed(answer_text) - - # Fallback: "the answer is **X**" patterns - if pred is None: - bold_pattern = re.compile( - r'(?:answer\s+is|there\s+are|result\s+is|equals?|=)\s*\*\*(.+?)\*\*', - re.IGNORECASE) - m = bold_pattern.search(answer_text) - if m: - pred = m.group(1).strip().rstrip(".") - - # Fallback: last $...$ expression - if pred is None: - matches = re.findall(r'\$([^$]+)\$', answer_text) - if matches: - pred = matches[-1].strip() - - correct = _math_equiv(pred, gold_answer) - pred_short = (pred[:60] + "…") if pred and len(pred) > 60 else pred - gold_short = (gold_answer[:60] + "…") if len(gold_answer) > 60 else gold_answer - if correct: - detail = f"🎯 {pred_short}" - elif pred: - detail = f"✗ pred={pred_short} gold={gold_short}" - else: - detail = f"✗ no answer found, gold={gold_short}" - return correct, detail - - -def score_gsm(output_bin: Path, gold_answer: str, tok) -> tuple[bool, str]: - """Score a GSM8K output against the gold numeric answer. 
Returns (correct, detail_str).""" - ids = _read_ids(output_bin) - text = tok.decode(ids) - - think_end = text.rfind("") - answer_text = text[think_end + len(""):] if think_end >= 0 else text - - pred = None - - # \boxed{} - boxed = _extract_boxed(answer_text) - if boxed: - cleaned = boxed.replace(",", "").replace("$", "").strip() - if re.match(r'^[+-]?\d+\.?\d*$', cleaned): - pred = cleaned - - # #### - if pred is None: - m = re.search(r'####\s*\$?([+-]?\d[\d,]*\.?\d*)', answer_text) - if m: - pred = m.group(1).replace(",", "") - - # "the answer is **X**" - if pred is None: - m = re.search( - r'(?:answer\s+is|result\s+is|equals?|there\s+are|we\s+get)\s*\*?\*?\$?([+-]?\d[\d,]*\.?\d*)', - answer_text, re.IGNORECASE) - if m: - pred = m.group(1).replace(",", "") - - # **** or **$** - if pred is None: - m = re.search(r'\*\*\$?([+-]?\d[\d,]*\.?\d*)\*\*', answer_text) - if m: - pred = m.group(1).replace(",", "") - - # Last standalone number - if pred is None: - nums = re.findall(r'(? str: - # Instruct/thinking models require the chat template. Feeding a raw - # prompt makes the model ramble and never emit a scorable answer - # (issue #191: Math500 scored 0/10). Always apply the template when the - # tokenizer has one; --no-thinking only toggles Qwen's block. 
- if not getattr(tok, "chat_template", None): - return raw_prompt - return tok.apply_chat_template( - [{"role": "user", "content": raw_prompt}], - tokenize=False, add_generation_prompt=True, - enable_thinking=not args.no_thinking, - ) - - results = {} - for name, ds_name, cfg, split, extract, gold_extract, gen in BENCHES: - if bench_filter and name not in bench_filter: - continue - print(f"\n[bench] ==== {name} (n={N_SAMPLE}, n_gen={gen}) ====", flush=True) - ds = load_dataset(ds_name, cfg, split=split) - ds_selected = ds.shuffle(seed=42).select(range(N_SAMPLE)) - prompt_list = [extract(s) for s in ds_selected] - gold_list = [gold_extract(s) for s in ds_selected] if gold_extract else [None] * len(prompt_list) - - ar_tps, df_tps, df_al = [], [], [] - n_score_correct, n_scored = 0, 0 - for i, (p, gold) in enumerate(zip(prompt_list, gold_list)): - path = TMPDIR / f"b_{name}_{i:02d}.bin" - n = tokenize(tok, _wrap_prompt(p), path) - if n == 0 or n > 3500: - continue - try: - ar = run_ar(path, gen) - df, al, df_bin = run_df(path, n, gen) - except Exception as e: - print(f" [{i+1:02d}/{N_SAMPLE}] n_tok={n:4d} FAILED: {e}", flush=True) - continue - - score_detail = "" - if gold is not None: - if name == "GSM8K": - correct, score_detail = score_gsm(df_bin, gold, tok) - else: - correct, score_detail = score_math(df_bin, gold, tok) - n_scored += 1 - if correct: - n_score_correct += 1 - score_detail = f" {score_detail}" - - if ar > 0: - ar_tps.append(ar) - if df > 0: - df_tps.append(df) - df_al.append(al) - print(f" [{i+1:02d}/{N_SAMPLE}] n_tok={n:4d} AR={ar:6.2f} DFlash={df:7.2f} AL={al:5.2f}{score_detail}", flush=True) - ar_m = sum(ar_tps) / len(ar_tps) if ar_tps else 0 - df_m = sum(df_tps) / len(df_tps) if df_tps else 0 - al_m = sum(df_al) / len(df_al) if df_al else 0 - score_str = f"{n_score_correct}/{n_scored}" if n_scored else "" - results[name] = {"ar": ar_m, "dflash": df_m, "al": al_m, - "speedup": df_m / ar_m if ar_m else 0, - "score": score_str} - summary = f" {name} 
mean: AR={ar_m:.2f} DFlash={df_m:.2f} AL={al_m:.2f} {results[name]['speedup']:.2f}x" - if score_str: - summary += f" score={score_str} ({n_score_correct/n_scored*100:.0f}%)" - print(summary, flush=True) - - print("\n[bench] === SUMMARY ===") - print(f"{'Task':12s} {'AR':>8s} {'DFlash':>8s} {'AL':>6s} {'Speedup':>8s} {'Score':>8s}") - for name, r in results.items(): - print(f"{name:12s} {r['ar']:8.2f} {r['dflash']:8.2f} {r['al']:6.2f} {r['speedup']:7.2f}x {r.get('score',''):>8s}") - - out_json = TMPDIR / "bench_llm_results.json" - with open(out_json, "w") as f: - json.dump(results, f, indent=2) - print(f"[bench] wrote {out_json}", flush=True) - - -if __name__ == "__main__": - main() diff --git a/server/scripts/bench_server.py b/server/scripts/bench_server.py deleted file mode 100644 index 3e607d920..000000000 --- a/server/scripts/bench_server.py +++ /dev/null @@ -1,502 +0,0 @@ -#!/usr/bin/env python3 -"""HTTP server benchmark — exercises the C++ dflash_server with the same -workloads as bench_llm.py (short LLM prompts) and bench_agent.py (long -agentic prompts), but over HTTP via /v1/chat/completions streaming. - -This answers: "does the C++ server perform as well as the raw CLI binaries?" 
- -Workloads: - he — 10 HumanEval code-completion prompts (same as bench_daemon.py) - gsm8k — 10 GSM8K math word problems - math500 — 10 MATH-500 problems (2048 max_tokens) - agent — SWE-bench Verified at 2K / 8K / 24K token buckets - -Usage: - # Start C++ server first: - ./dflash/build/dflash_server dflash/models/Qwen3-0.6B-BF16.gguf --port 9099 - - # Run all workloads: - python3 dflash/scripts/bench_server.py --url http://localhost:9099 - - # Run specific workloads: - python3 dflash/scripts/bench_server.py --url http://localhost:9099 --workload he gsm8k - - # Quick smoke test (1 prompt per workload): - python3 dflash/scripts/bench_server.py --url http://localhost:9099 --n-sample 1 -""" -import argparse -import json -import re -import sys -import time -import urllib.request -import urllib.error -from pathlib import Path - -# Allow importing bench_he for its PROMPTS list. -sys.path.insert(0, str(Path(__file__).resolve().parent)) -from bench_llm import _extract_boxed, _normalize_math, _math_equiv - -N_SAMPLE = 10 -N_GEN_DEFAULT = 256 -N_GEN_MATH = 2048 - - -# ── HTTP streaming client ───────────────────────────────────────────────── - -def stream_chat(url: str, messages: list[dict], max_tokens: int, - temperature: float = 0.0, timeout: float = 600.0, - thinking: bool = False) -> dict: - """POST /v1/chat/completions with stream=True. - - Returns dict with: - n_tok, wall_s, ttft_s, decode_s, decode_tps, wall_tps, - text, usage (if server returns it in final chunk). 
- """ - body = { - "model": "dflash", - "messages": messages, - "max_tokens": max_tokens, - "temperature": temperature, - "stream": True, - } - if thinking: - body["thinking"] = {"type": "enabled", "budget_tokens": 4096} - - data = json.dumps(body).encode() - req = urllib.request.Request( - url.rstrip("/") + "/v1/chat/completions", - data=data, - headers={"Content-Type": "application/json", - "Accept": "text/event-stream"}, - ) - t0 = time.perf_counter() - t_first = 0.0 - t_last = 0.0 - n_tok = 0 - text_parts = [] - usage = None - - with urllib.request.urlopen(req, timeout=timeout) as r: - for raw in r: - line = raw.decode("utf-8", errors="replace").rstrip() - if not line.startswith("data:"): - continue - payload = line[5:].strip() - if payload == "[DONE]": - break - try: - chunk = json.loads(payload) - except json.JSONDecodeError: - continue - # Extract usage from final chunk if present. - if chunk.get("usage"): - usage = chunk["usage"] - choices = chunk.get("choices") or [] - if not choices: - continue - delta = choices[0].get("delta") or {} - content = delta.get("content") or "" - reasoning = delta.get("reasoning_content") or "" - if content or reasoning: - if n_tok == 0: - t_first = time.perf_counter() - n_tok += 1 - t_last = time.perf_counter() - if content: - text_parts.append(content) - - wall_s = time.perf_counter() - t0 - ttft_s = (t_first - t0) if n_tok > 0 else wall_s - decode_s = (t_last - t_first) if n_tok > 1 else 0.0 - decode_tps = (n_tok - 1) / decode_s if decode_s > 0 else 0.0 - wall_tps = n_tok / wall_s if wall_s > 0 else 0.0 - - return { - "n_tok": n_tok, - "wall_s": wall_s, - "ttft_s": ttft_s, - "decode_s": decode_s, - "decode_tps": decode_tps, - "wall_tps": wall_tps, - "text": "".join(text_parts), - "usage": usage, - } - - -# ── Workload: HumanEval ────────────────────────────────────────────────── - -def workload_he(url: str, n_sample: int, n_gen: int, **_kw): - """HumanEval code-completion prompts (same 10 as bench_he.py).""" - from bench_he 
import PROMPTS - prompts = PROMPTS[:n_sample] - results = [] - for name, text in prompts: - msgs = [{"role": "user", "content": text}] - try: - r = stream_chat(url, msgs, n_gen) - results.append({"name": name, **r}) - _print_row(name, r) - except Exception as e: - print(f" {name:28s} FAILED: {e}", flush=True) - return results - - -# ── Workload: GSM8K ────────────────────────────────────────────────────── - -def _load_gsm8k(n_sample: int): - from datasets import load_dataset - ds = load_dataset("gsm8k", "main", split="test") - n_sample = min(n_sample, len(ds)) - ds = ds.shuffle(seed=42).select(range(n_sample)) - return [ - {"name": f"gsm8k_{i:02d}", - "prompt": f"Question: {row['question']}\nAnswer: "} - for i, row in enumerate(ds) - ] - - -def workload_gsm8k(url: str, n_sample: int, n_gen: int, **_kw): - rows = _load_gsm8k(n_sample) - results = [] - for row in rows: - msgs = [{"role": "user", "content": row["prompt"]}] - try: - r = stream_chat(url, msgs, n_gen) - results.append({"name": row["name"], **r}) - _print_row(row["name"], r) - except Exception as e: - print(f" {row['name']:28s} FAILED: {e}", flush=True) - return results - - -# ── Workload: Math500 ──────────────────────────────────────────────────── - -def _load_math500(n_sample: int): - from datasets import load_dataset - ds = load_dataset("HuggingFaceH4/MATH-500", split="test") - n_sample = min(n_sample, len(ds)) - ds = ds.shuffle(seed=42).select(range(n_sample)) - return [ - {"name": f"math_{i:02d}", - "prompt": f"Problem: {row['problem']}\nSolution: Put your final answer in \\boxed{{}}.\n", - "answer": row["answer"]} - for i, row in enumerate(ds) - ] - - -def _score_math_text(text: str, gold_answer: str) -> tuple[bool, str]: - """Score a math response text against the gold answer. - - Extracts \\boxed{} answers (after for thinking models), - with fallbacks for **bold** and $...$ patterns. - Returns (correct, detail_str). 
- """ - think_end = text.rfind("") - answer_text = text[think_end + len(""):] if think_end >= 0 else text - - pred = _extract_boxed(answer_text) - if not pred: - pred = _extract_boxed(text) - - # Fallback: "the answer is **X**" patterns - if pred is None: - bold_pattern = re.compile( - r'(?:answer\s+is|there\s+are|result\s+is|equals?|=)\s*\*\*(.+?)\*\*', - re.IGNORECASE) - m = bold_pattern.search(answer_text) - if m: - pred = m.group(1).strip().rstrip(".") - - # Fallback: last $...$ expression - if pred is None: - matches = re.findall(r'\$([^$]+)\$', answer_text) - if matches: - pred = matches[-1].strip() - - correct = _math_equiv(pred, gold_answer) - pred_short = (pred[:60] + "…") if pred and len(pred) > 60 else pred - gold_short = (gold_answer[:60] + "…") if len(gold_answer) > 60 else gold_answer - if correct: - detail = f"🎯 {pred_short}" - elif pred: - detail = f"✗ pred={pred_short} gold={gold_short}" - else: - detail = f"✗ no answer found, gold={gold_short}" - return correct, detail - - -def workload_math500(url: str, n_sample: int, n_gen: int, thinking: bool = False, **_kw): - rows = _load_math500(n_sample) - gen = max(n_gen, N_GEN_MATH) # Math needs longer generation - results = [] - n_correct, n_scored = 0, 0 - for row in rows: - msgs = [{"role": "user", "content": row["prompt"]}] - try: - r = stream_chat(url, msgs, gen, thinking=thinking) - correct, detail = _score_math_text(r["text"], row["answer"]) - r["correct"] = correct - r["score_detail"] = detail - n_scored += 1 - if correct: - n_correct += 1 - results.append({"name": row["name"], **r}) - _print_row(row["name"], r, score=detail) - except Exception as e: - print(f" {row['name']:28s} FAILED: {e}", flush=True) - if n_scored: - pct = n_correct / n_scored * 100 - print(f"\n accuracy: {n_correct}/{n_scored} ({pct:.0f}%)") - return results - - -# ── Workload: Agent (SWE-bench) ────────────────────────────────────────── - -FIX_DIR = Path(__file__).resolve().parent / "fixtures" -SWE_PARQUET = FIX_DIR / 
"swe_bench" / "swe_bench_verified.parquet" -SYS_PROMPT_SMALL = FIX_DIR / "agent_prompts" / "codex_gpt52_codex.md" -SYS_PROMPT_LARGE = FIX_DIR / "agent_prompts" / "codex_gpt52.md" - -AGENT_BUCKETS = { - "2k": {"target_chars": 6000, "sys": SYS_PROMPT_SMALL}, - "8k": {"target_chars": 24000, "sys": SYS_PROMPT_LARGE}, - "24k": {"target_chars": 72000, "sys": SYS_PROMPT_LARGE}, -} - - -def _build_agent_user_msg(row: dict, target_chars: int) -> str: - """Build a Codex-style user message padded to ~target_chars.""" - repo = row.get("repo", "unknown/repo") - iid = row.get("instance_id", "unknown") - problem = row.get("problem_statement", "") or "" - patch = row.get("patch", "") or "" - test_patch = row.get("test_patch", "") or "" - hints = row.get("hints_text", "") or "" - - pool = "\n\n".join(p for p in (patch, test_patch, hints) if p) - if not pool: - pool = problem - - chunks = [] - chunk_size = max(2000, target_chars // 6) - cur = 0 - idx = 1 - while cur < target_chars: - offset = cur % max(1, len(pool)) - seg = pool[offset:offset + chunk_size] - if not seg: - seg = pool[:chunk_size] - chunks.append( - f'\n{seg}\n' - ) - cur += len(seg) - idx += 1 - - file_blocks = "\n\n".join(chunks) - return ( - f"Repository: {repo}\nInstance: {iid}\n\n" - f"## Issue\n{problem}\n\n" - f"## Context I gathered\n" - f"I ran `read_file` on the relevant modules:\n\n" - f"{file_blocks}\n\n" - f"## Task\nInvestigate the bug and reply with a single tool call " - f"to `apply_patch` that fixes it. Keep the patch minimal." 
- ) - - -def workload_agent(url: str, n_sample: int, n_gen: int, bucket: str = "all", **_kw): - if not SWE_PARQUET.is_file(): - print(f" SKIP: SWE-bench parquet not found at {SWE_PARQUET}", flush=True) - return [] - - import pyarrow.parquet as pq - df = pq.read_table(str(SWE_PARQUET)).to_pandas() - - bucket_keys = list(AGENT_BUCKETS) if bucket == "all" else [bucket] - all_results = [] - - for bk in bucket_keys: - cfg = AGENT_BUCKETS[bk] - sys_path = cfg["sys"] - if not sys_path.is_file(): - print(f" SKIP bucket {bk}: system prompt not found at {sys_path}", flush=True) - continue - - sys_text = sys_path.read_text(encoding="utf-8") - rows = df.sample(n=min(n_sample, len(df)), random_state=42).to_dict("records") - - print(f"\n --- bucket {bk} (target ~{cfg['target_chars']} chars, n={len(rows)}) ---") - for i, row in enumerate(rows): - name = f"agent_{bk}_{i:02d}" - user_msg = _build_agent_user_msg(row, cfg["target_chars"]) - msgs = [ - {"role": "system", "content": sys_text}, - {"role": "user", "content": user_msg}, - ] - try: - r = stream_chat(url, msgs, n_gen) - r["bucket"] = bk - r["instance_id"] = row.get("instance_id", "") - all_results.append({"name": name, **r}) - _print_row(name, r) - except Exception as e: - print(f" {name:28s} FAILED: {e}", flush=True) - - return all_results - - -# ── Output formatting ───────────────────────────────────────────────────── - -def _print_header(): - print(f" {'prompt':28s} {'n_tok':>5s} {'wall_s':>7s} {'ttft_s':>7s} " - f"{'dec_s':>7s} {'dec_tps':>8s} {'wall_tps':>9s}") - print(" " + "-" * 80) - - -def _print_row(name: str, r: dict, score: str = ""): - n = r["n_tok"] - suffix = f" {score}" if score else "" - if n == 0: - print(f" {name:28s} {n:5d} {r['wall_s']:7.2f} -- -- -- --{suffix}", - flush=True) - return - print(f" {name:28s} {n:5d} {r['wall_s']:7.2f} {r['ttft_s']:7.3f} " - f"{r['decode_s']:7.2f} {r['decode_tps']:8.2f} {r['wall_tps']:9.2f}{suffix}", - flush=True) - - -def _print_summary(label: str, results: list[dict]): 
- if not results: - return - valid = [r for r in results if r["n_tok"] > 0] - if not valid: - print(f"\n [{label}] no successful runs") - return - - def _mean(xs): - return sum(xs) / len(xs) if xs else 0.0 - - n = len(valid) - wall_tps = _mean([r["wall_tps"] for r in valid]) - dec_tps_list = [r["decode_tps"] for r in valid if r["decode_tps"] > 0] - dec_tps = _mean(dec_tps_list) if dec_tps_list else 0.0 - ttft = _mean([r["ttft_s"] for r in valid]) - wall = _mean([r["wall_s"] for r in valid]) - tok = _mean([r["n_tok"] for r in valid]) - - print(f"\n [{label}] {n} prompts — mean: " - f"n_tok={tok:.0f} TTFT={ttft:.3f}s " - f"decode={dec_tps:.2f} tok/s " - f"wall={wall_tps:.2f} tok/s " - f"total={wall:.2f}s") - if dec_tps_list: - print(f" [{label}] decode tok/s range: " - f"{min(dec_tps_list):.2f} - {max(dec_tps_list):.2f}") - - -# ── Main ────────────────────────────────────────────────────────────────── - -WORKLOADS = { - "he": ("HumanEval (code completion)", workload_he), - "gsm8k": ("GSM8K (math word problems)", workload_gsm8k), - "math500": ("MATH-500 (hard math)", workload_math500), - "agent": ("SWE-bench agent (2K/8K/24K)", workload_agent), -} - - -def main(): - ap = argparse.ArgumentParser( - description="HTTP server benchmark — exercises dflash_server with " - "bench_llm + bench_agent workloads over /v1/chat/completions") - ap.add_argument("--url", default="http://localhost:9099", - help="Server base URL (default: http://localhost:9099)") - ap.add_argument("--workload", nargs="+", choices=list(WORKLOADS) + ["all"], - default=["all"], - help="Which workloads to run (default: all)") - ap.add_argument("--n-sample", type=int, default=N_SAMPLE, - help=f"Prompts per workload (default: {N_SAMPLE})") - ap.add_argument("--n-gen", type=int, default=N_GEN_DEFAULT, - help=f"Max output tokens (default: {N_GEN_DEFAULT})") - ap.add_argument("--agent-bucket", choices=["2k", "8k", "24k", "all"], - default="all", help="Agent bucket filter (default: all)") - 
ap.add_argument("--warmup", action="store_true", - help="Run one warmup request before timing") - ap.add_argument("--thinking", action="store_true", - help="Enable thinking/reasoning mode") - ap.add_argument("--out", type=str, default=None, - help="Write JSON results to this file") - args = ap.parse_args() - - # Validate server is reachable. - try: - urllib.request.urlopen(args.url.rstrip("/") + "/health", timeout=5) - except Exception as e: - print(f"ERROR: server not reachable at {args.url}: {e}", file=sys.stderr) - sys.exit(1) - - if args.warmup: - print("[bench-server] warmup...", flush=True) - stream_chat(args.url, [{"role": "user", "content": "Hi"}], 16) - - wl_keys = list(WORKLOADS) if "all" in args.workload else args.workload - - print("=" * 88) - print(f" HTTP Server Benchmark — {args.url}") - print(f" workloads: {', '.join(wl_keys)} n_sample={args.n_sample} n_gen={args.n_gen}") - print("=" * 88) - - all_results = {} - for wk in wl_keys: - label, fn = WORKLOADS[wk] - print(f"\n{'─' * 88}") - print(f" {label}") - print(f"{'─' * 88}") - _print_header() - try: - results = fn(url=args.url, n_sample=args.n_sample, n_gen=args.n_gen, - bucket=args.agent_bucket, thinking=args.thinking) - except ImportError as e: - print(f" SKIP {wk}: missing dependency — {e}", flush=True) - results = [] - except FileNotFoundError as e: - print(f" SKIP {wk}: {e}", flush=True) - results = [] - - all_results[wk] = results - _print_summary(wk, results) - - # Final summary - print(f"\n{'=' * 88}") - print(" SUMMARY") - print(f"{'=' * 88}") - print(f" {'Workload':12s} {'N':>3s} {'TTFT':>7s} {'dec_tps':>8s} " - f"{'wall_tps':>9s} {'wall_s':>7s}") - print(" " + "-" * 55) - for wk in wl_keys: - results = all_results.get(wk, []) - valid = [r for r in results if r.get("n_tok", 0) > 0] - if not valid: - print(f" {wk:12s} {'--':>3s} {'--':>7s} {'--':>8s} {'--':>9s} {'--':>7s}") - continue - n = len(valid) - - def _m(key): - vals = [r[key] for r in valid if r.get(key, 0) > 0] - return sum(vals) 
/ len(vals) if vals else 0.0 - - print(f" {wk:12s} {n:3d} {_m('ttft_s'):7.3f} {_m('decode_tps'):8.2f} " - f"{_m('wall_tps'):9.2f} {_m('wall_s'):7.2f}") - - if args.out: - out_path = Path(args.out) - out_path.parent.mkdir(parents=True, exist_ok=True) - with open(out_path, "w") as f: - json.dump(all_results, f, indent=2, default=str) - print(f"\n Wrote {out_path}") - - print() - - -if __name__ == "__main__": - main() diff --git a/server/scripts/fixtures/agent_cases/cases.json b/server/scripts/fixtures/agent_cases/cases.json new file mode 100644 index 000000000..c4f3a5be0 --- /dev/null +++ b/server/scripts/fixtures/agent_cases/cases.json @@ -0,0 +1,30 @@ +{ + "schema": "lucebox-bench-cases-v1", + "source": "Agent-style probes: real codex agent system prompt + coding-task user message. Tests whether the model produces agent-shaped output (tool calls, code blocks, apply_patch envelopes) given a realistic coding-agent context. Complement to --area forge (which exercises tool-calling reliability with mocked scenarios).", + "cases": [ + { + "id": "codex-mini-read-task", + "kind": "agent-prompt", + "system_prompt_file": "codex_gpt5_codex.md", + "user_message": "Read the file dflash/src/server/http_server.cpp and summarize what it does in 2-3 sentences. Don't actually open the file — describe what tool you would use and what arguments." + }, + { + "id": "codex-mini-apply-patch", + "kind": "agent-prompt", + "system_prompt_file": "codex_gpt52_codex.md", + "user_message": "The function `qwen35_decode` in dflash/src/qwen35/qwen35_backend.cpp has a bug on the force-close path: it doesn't reset `budget_close_started` between requests. Show me the apply_patch envelope to fix it." + }, + { + "id": "codex-large-explore", + "kind": "agent-prompt", + "system_prompt_file": "codex_apply_patch.md", + "user_message": "Which files in this repository handle authentication? List the steps you would take to find them." 
+ }, + { + "id": "codex-medium-test", + "kind": "agent-prompt", + "system_prompt_file": "codex_gpt52.md", + "user_message": "Write a unit test for a function `parse_thinking_budget(s: str) -> int` that returns the integer budget from strings like 'high', 'low', '4096'. Use pytest. Use apply_patch to add the test file." + } + ] +} diff --git a/server/scripts/test_server_integration.py b/server/scripts/test_server_integration.py index 08da6cedf..c22121089 100644 --- a/server/scripts/test_server_integration.py +++ b/server/scripts/test_server_integration.py @@ -490,7 +490,12 @@ def test_thinking_disabled_by_default(self): @pytest.mark.slow def test_thinking_enabled_via_chat_template_kwargs(self): - """Enabling thinking should produce reasoning_content.""" + """Enabling thinking must route reasoning into reasoning_content, + not leak it into content. Regression guard for the Qwen3.6/Laguna + pre-opened- bug: the chat template appends `` to the + prompt suffix, so the model emits reasoning directly with no + opening tag. If the renderer→emitter wiring drops, reasoning_content + stays empty and the raw reasoning text appears in content.""" r = post_json("/v1/chat/completions", { "model": MODEL_NAME, "messages": [{"role": "user", "content": "What is 15 * 17?"}], @@ -500,13 +505,26 @@ def test_thinking_enabled_via_chat_template_kwargs(self): }) assert r.status_code == 200 msg = r.json()["choices"][0]["message"] - assert msg["content"] - # With thinking enabled, model may produce reasoning_content - # (not guaranteed for short prompts, so we just check it doesn't crash) + reasoning = msg.get("reasoning_content") or "" + content = msg.get("content") or "" + assert reasoning, ( + f"reasoning_content empty with enable_thinking=True — " + f"renderer→emitter wiring likely broken. 
content={content[:200]!r}" + ) + assert "" not in reasoning and "" not in reasoning, ( + f"raw think tags leaked into reasoning_content: {reasoning[:200]!r}" + ) + assert "" not in content and "" not in content, ( + f"think tags leaked into content channel: {content[:200]!r}" + ) + assert content, "content channel empty — model never closed " @pytest.mark.slow def test_thinking_enabled_via_reasoning_effort(self): - """OpenAI Responses-style reasoning.effort field.""" + """OpenAI Responses-style reasoning.effort=high must also route + reasoning to reasoning_content. Same regression class as above + but reached through a different request shape (effort→template + kwargs translation in http_server.cpp).""" r = post_json("/v1/chat/completions", { "model": MODEL_NAME, "messages": [{"role": "user", "content": "What is 15 * 17?"}], @@ -516,7 +534,15 @@ def test_thinking_enabled_via_reasoning_effort(self): }) assert r.status_code == 200 msg = r.json()["choices"][0]["message"] - assert msg["content"] + reasoning = msg.get("reasoning_content") or "" + content = msg.get("content") or "" + assert reasoning, ( + f"reasoning_content empty with reasoning.effort=high — " + f"renderer→emitter wiring likely broken. 
content={content[:200]!r}" + ) + assert "" not in reasoning and "" not in reasoning + assert "" not in content and "" not in content + assert content # ═══════════════════════════════════════════════════════════════════ @@ -871,3 +897,243 @@ def test_stop_no_match(self): content = r.json()["choices"][0]["message"]["content"] # Should produce some output since stop didn't match assert len(content) > 0 + + +# ═══════════════════════════════════════════════════════════════════ +# /props introspection — parity with dflash/scripts/server.py:1221 +# ═══════════════════════════════════════════════════════════════════ + +class TestProps: + """Mirrors the Python server's /props shape so cross-server consumers + (autotune, dashboards, snapshot/profile) see a stable contract.""" + + def _fetch(self): + r = requests.get(f"{SERVER_URL}/props", timeout=10) + assert r.status_code == 200, f"/props returned {r.status_code}" + return r.json() + + def test_top_level_keys_present(self): + body = self._fetch() + expected = { + "default_generation_settings", "model_alias", "model_path", + "build_info", "speculative_mode", "server", "model", "runtime", + "reasoning", "speculative", "sampling", "pflash", "prefix_cache", + "full_cache", "tool_replay", "daemon", "api", "capabilities", + } + missing = expected - set(body.keys()) + assert not missing, f"/props missing top-level keys: {missing}" + + def test_server_block_shape(self): + srv = self._fetch()["server"] + assert srv["name"] == "luce-dflash" + assert "version" in srv + assert isinstance(srv["props_schema"], int) + + def test_speculative_mode_consistency(self): + body = self._fetch() + mode = body["speculative_mode"] + assert mode in {"off", "dflash", "pflash"} + if mode == "dflash": + assert body["speculative"]["enabled"] is True + assert body["pflash"]["enabled"] is False + elif mode == "pflash": + assert body["pflash"]["enabled"] is True + else: + assert body["speculative"]["enabled"] is False + assert body["pflash"]["enabled"] is 
False + + def test_runtime_backend_value(self): + rt = self._fetch()["runtime"] + assert rt["backend"] in {"cuda", "hip", "cpu"} + assert isinstance(rt["fa_window"], int) + assert rt["kv_cache_k"] + assert rt["kv_cache_v"] + + def test_capabilities_match_arch(self): + body = self._fetch() + caps = body["capabilities"] + # Reasoning + speculative + tools all flip together with arch family. + if caps["reasoning_supported"]: + assert caps["speculative_supported"] is True + assert caps["tools_supported"] is True + assert "medium" in body["reasoning"]["supported_efforts"] + + def test_api_endpoint_registry(self): + endpoints = self._fetch()["api"]["endpoints"] + # Every endpoint the test suite hits must be in the registry. + required = { + "GET /health", "GET /props", "GET /v1/models", + "POST /v1/chat/completions", "POST /v1/messages", + "POST /v1/messages/count_tokens", "POST /v1/responses", + } + assert required.issubset(set(endpoints)), \ + f"/props missing endpoints: {required - set(endpoints)}" + + def test_prefix_cache_stats_shape(self): + pc = self._fetch()["prefix_cache"] + for key in ("capacity", "in_use", "lifetime_hits"): + assert key in pc, f"prefix_cache missing {key}" + assert isinstance(pc[key], int) + + def test_tool_replay_stats_shape(self): + tr = self._fetch()["tool_replay"] + for key in ("max_entries", "max_bytes", "current_entries", "current_bytes"): + assert key in tr, f"tool_replay missing {key}" + + +# ═══════════════════════════════════════════════════════════════════ +# /v1/messages/count_tokens — Anthropic count_tokens parity +# ═══════════════════════════════════════════════════════════════════ + +class TestCountTokens: + def test_simple_count(self): + body = { + "model": MODEL_NAME, + "messages": [{"role": "user", "content": "Hello, world."}], + } + r = post_json("/v1/messages/count_tokens", body, timeout=10) + assert r.status_code == 200 + payload = r.json() + assert "input_tokens" in payload + assert isinstance(payload["input_tokens"], 
int) + assert payload["input_tokens"] > 0 + + def test_count_scales_with_message_length(self): + short = {"model": MODEL_NAME, + "messages": [{"role": "user", "content": "hi"}]} + long = {"model": MODEL_NAME, + "messages": [{"role": "user", "content": "word " * 200}]} + r_short = post_json("/v1/messages/count_tokens", short, timeout=10).json() + r_long = post_json("/v1/messages/count_tokens", long, timeout=10).json() + assert r_long["input_tokens"] > r_short["input_tokens"] + + def test_count_with_system_block(self): + body = { + "model": MODEL_NAME, + "system": "You are a helpful assistant.", + "messages": [{"role": "user", "content": "Hi"}], + } + r = post_json("/v1/messages/count_tokens", body, timeout=10) + assert r.status_code == 200 + assert r.json()["input_tokens"] > 0 + + def test_count_does_not_generate(self): + """count_tokens must be fast — no generation. <1s budget vs many + seconds for a real generation.""" + body = { + "model": MODEL_NAME, + "messages": [{"role": "user", "content": "What is 1+1?"}], + } + t0 = time.monotonic() + r = post_json("/v1/messages/count_tokens", body, timeout=10) + elapsed = time.monotonic() - t0 + assert r.status_code == 200 + # 1s is generous; real bound is dominated by tokenizer + HTTP RTT. + assert elapsed < 1.0, f"count_tokens took {elapsed:.2f}s (expected <1s)" + + +# ═══════════════════════════════════════════════════════════════════ +# Thinking-budget envelope — finish_details emission +# ═══════════════════════════════════════════════════════════════════ + +class TestThinkingBudget: + """Verifies the response includes a `finish_details` block when the + request opted in via `thinking: {type: "enabled"}`. Mirrors + docs/specs/thinking-budget.md:43-58. + + Level 1 phase-1/phase-2 reprompt is now wired up: when the model + fails to emit within --think-max-tokens, the server force- + closes via a synthetic "\\n\\nFinal answer: " reprompt and + runs phase-2 for the remaining budget. 
close_kind reflects the path + taken ("natural" for self-close, "hard" for force-close). + """ + + @pytest.mark.slow + def test_finish_details_present_when_thinking_opted_in(self): + body = { + "model": MODEL_NAME, + "messages": [{"role": "user", "content": "What is 2+2? Answer in one word."}], + "max_tokens": 256, + "thinking": {"type": "enabled"}, + "temperature": 0, + } + r = post_json("/v1/chat/completions", body) + assert r.status_code == 200 + choice = r.json()["choices"][0] + assert "finish_details" in choice, \ + "finish_details missing despite thinking:{type:enabled}" + fd = choice["finish_details"] + assert fd["close_kind"] in {"natural", "hard"} + assert isinstance(fd["thinking_tokens"], int) + assert isinstance(fd["content_tokens"], int) + assert isinstance(fd["total_tokens"], int) + # Invariant: the two sub-counts sum to the total. + assert fd["thinking_tokens"] + fd["content_tokens"] == fd["total_tokens"] + + def test_finish_details_absent_when_thinking_not_opted_in(self): + body = { + "model": MODEL_NAME, + "messages": [{"role": "user", "content": "Say hi"}], + "max_tokens": 16, + "temperature": 0, + } + r = post_json("/v1/chat/completions", body) + assert r.status_code == 200 + choice = r.json()["choices"][0] + assert "finish_details" not in choice, \ + "finish_details should only appear when thinking is opted in" + + @pytest.mark.slow + def test_close_kind_natural_when_model_self_closes(self): + """An easy prompt with a generous budget should let the model emit + well within --think-max-tokens, producing close_kind="natural" + (no phase-2 reprompt fires).""" + body = { + "model": MODEL_NAME, + "messages": [{"role": "user", "content": + "What is 2+2? 
Answer in one word."}], + "max_tokens": 4096, + "thinking": {"type": "enabled"}, + "temperature": 0, + } + r = post_json("/v1/chat/completions", body) + assert r.status_code == 200 + fd = r.json()["choices"][0]["finish_details"] + assert fd["close_kind"] == "natural", \ + f"expected natural close, got {fd['close_kind']}" + assert fd["content_tokens"] >= 0 + # Phase-2 did not fire — content_tokens stays 0 when the model + # self-closes (all generated tokens are reasoning + content interleaved + # via the emitter on the phase-1 stream). + assert fd["content_tokens"] == 0 + assert fd["thinking_tokens"] == fd["total_tokens"] + + @pytest.mark.skipif( + os.environ.get("THINK_MAX_TOKENS_LOW") != "1", + reason="requires server started with very low --think-max-tokens " + "(set THINK_MAX_TOKENS_LOW=1 to enable when the running " + "server was launched with e.g. --think-max-tokens 32)", + ) + @pytest.mark.slow + def test_close_kind_hard_on_phase2_trigger(self): + """A think-heavy prompt with a deliberately tiny --think-max-tokens + should trigger phase-2: the model can't finish reasoning in time, + the server force-closes and runs a Final-answer reprompt.""" + body = { + "model": MODEL_NAME, + "messages": [{"role": "user", "content": + "Reason step by step about the following: list the " + "first 20 prime numbers, then explain why each is " + "prime, then compute their sum. 
Be thorough."}], + "max_tokens": 4096, + "thinking": {"type": "enabled"}, + "temperature": 0, + } + r = post_json("/v1/chat/completions", body) + assert r.status_code == 200 + fd = r.json()["choices"][0]["finish_details"] + assert fd["close_kind"] == "hard", \ + f"expected hard close, got {fd['close_kind']}" + assert fd["thinking_tokens"] > 0 + assert fd["content_tokens"] > 0 + assert fd["thinking_tokens"] + fd["content_tokens"] == fd["total_tokens"] diff --git a/server/src/common/gguf_inspect.cpp b/server/src/common/gguf_inspect.cpp index 95cc30c41..f8319f941 100644 --- a/server/src/common/gguf_inspect.cpp +++ b/server/src/common/gguf_inspect.cpp @@ -1,9 +1,14 @@ #include "gguf_inspect.h" #include "gguf.h" +#include +#include #include #include +#include #include +#include +#include namespace dflash::common { @@ -36,4 +41,292 @@ GgufModelInfo inspect_gguf_model_info(const char * path) { return info; } +// ─── SHA-256 (RFC 6234) ───────────────────────────────────────────────── +// +// Self-contained mini-implementation so we don't pull in OpenSSL just for +// one hash. Performance is "fine" — hashing a 17 GB GGUF takes ~30s on a +// fast NVMe, which is comparable to the per-file numbers `sha256sum` gets. +// We sidecar the result so this only happens on the first server start +// after a model is downloaded. 
+ +namespace { + +struct Sha256Ctx { + uint32_t state[8]; + uint64_t bit_len; + uint8_t buf[64]; + size_t buf_len; +}; + +inline uint32_t rotr32(uint32_t x, uint32_t n) { + return (x >> n) | (x << (32 - n)); +} + +void sha256_init(Sha256Ctx & c) { + c.state[0] = 0x6a09e667u; + c.state[1] = 0xbb67ae85u; + c.state[2] = 0x3c6ef372u; + c.state[3] = 0xa54ff53au; + c.state[4] = 0x510e527fu; + c.state[5] = 0x9b05688cu; + c.state[6] = 0x1f83d9abu; + c.state[7] = 0x5be0cd19u; + c.bit_len = 0; + c.buf_len = 0; +} + +void sha256_compress(Sha256Ctx & c, const uint8_t * block) { + static const uint32_t K[64] = { + 0x428a2f98u,0x71374491u,0xb5c0fbcfu,0xe9b5dba5u,0x3956c25bu,0x59f111f1u,0x923f82a4u,0xab1c5ed5u, + 0xd807aa98u,0x12835b01u,0x243185beu,0x550c7dc3u,0x72be5d74u,0x80deb1feu,0x9bdc06a7u,0xc19bf174u, + 0xe49b69c1u,0xefbe4786u,0x0fc19dc6u,0x240ca1ccu,0x2de92c6fu,0x4a7484aau,0x5cb0a9dcu,0x76f988dau, + 0x983e5152u,0xa831c66du,0xb00327c8u,0xbf597fc7u,0xc6e00bf3u,0xd5a79147u,0x06ca6351u,0x14292967u, + 0x27b70a85u,0x2e1b2138u,0x4d2c6dfcu,0x53380d13u,0x650a7354u,0x766a0abbu,0x81c2c92eu,0x92722c85u, + 0xa2bfe8a1u,0xa81a664bu,0xc24b8b70u,0xc76c51a3u,0xd192e819u,0xd6990624u,0xf40e3585u,0x106aa070u, + 0x19a4c116u,0x1e376c08u,0x2748774cu,0x34b0bcb5u,0x391c0cb3u,0x4ed8aa4au,0x5b9cca4fu,0x682e6ff3u, + 0x748f82eeu,0x78a5636fu,0x84c87814u,0x8cc70208u,0x90befffau,0xa4506cebu,0xbef9a3f7u,0xc67178f2u + }; + uint32_t w[64]; + for (int i = 0; i < 16; ++i) { + w[i] = (uint32_t(block[i*4+0]) << 24) | (uint32_t(block[i*4+1]) << 16) | + (uint32_t(block[i*4+2]) << 8 ) | uint32_t(block[i*4+3]); + } + for (int i = 16; i < 64; ++i) { + uint32_t s0 = rotr32(w[i-15], 7) ^ rotr32(w[i-15], 18) ^ (w[i-15] >> 3); + uint32_t s1 = rotr32(w[i-2], 17) ^ rotr32(w[i-2], 19) ^ (w[i-2] >> 10); + w[i] = w[i-16] + s0 + w[i-7] + s1; + } + uint32_t a = c.state[0], b = c.state[1], cc = c.state[2], d = c.state[3]; + uint32_t e = c.state[4], f = c.state[5], g = c.state[6], h = c.state[7]; + for (int i = 0; i < 64; ++i) { 
+ uint32_t S1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25); + uint32_t ch = (e & f) ^ ((~e) & g); + uint32_t t1 = h + S1 + ch + K[i] + w[i]; + uint32_t S0 = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22); + uint32_t mj = (a & b) ^ (a & cc) ^ (b & cc); + uint32_t t2 = S0 + mj; + h = g; g = f; f = e; e = d + t1; + d = cc; cc = b; b = a; a = t1 + t2; + } + c.state[0] += a; c.state[1] += b; c.state[2] += cc; c.state[3] += d; + c.state[4] += e; c.state[5] += f; c.state[6] += g; c.state[7] += h; +} + +void sha256_update(Sha256Ctx & c, const uint8_t * data, size_t len) { + c.bit_len += uint64_t(len) * 8; + if (c.buf_len) { + size_t take = std::min(size_t(64) - c.buf_len, len); + std::memcpy(c.buf + c.buf_len, data, take); + c.buf_len += take; + data += take; + len -= take; + if (c.buf_len == 64) { + sha256_compress(c, c.buf); + c.buf_len = 0; + } + } + while (len >= 64) { + sha256_compress(c, data); + data += 64; + len -= 64; + } + if (len) { + std::memcpy(c.buf, data, len); + c.buf_len = len; + } +} + +std::string sha256_final(Sha256Ctx & c) { + uint64_t bits = c.bit_len; + c.buf[c.buf_len++] = 0x80; + if (c.buf_len > 56) { + std::memset(c.buf + c.buf_len, 0, 64 - c.buf_len); + sha256_compress(c, c.buf); + c.buf_len = 0; + } + std::memset(c.buf + c.buf_len, 0, 56 - c.buf_len); + for (int i = 7; i >= 0; --i) { + c.buf[56 + i] = uint8_t(bits & 0xff); + bits >>= 8; + } + sha256_compress(c, c.buf); + + static const char * hex = "0123456789abcdef"; + std::string out; + out.resize(64); + for (int i = 0; i < 8; ++i) { + uint32_t v = c.state[i]; + for (int j = 0; j < 4; ++j) { + uint8_t byte = uint8_t((v >> (24 - j * 8)) & 0xff); + out[i*8 + j*2 + 0] = hex[byte >> 4]; + out[i*8 + j*2 + 1] = hex[byte & 0x0f]; + } + } + return out; +} + +std::string sha256_of_file(const std::string & path) { + std::ifstream f(path, std::ios::binary); + if (!f) return {}; + Sha256Ctx c; + sha256_init(c); + // 4 MiB read buffer: empirically best throughput on NVMe without + // gulping the page 
cache. std::vector heap-allocates so we don't + // blow the C++ thread stack. + constexpr size_t BUF = 4 * 1024 * 1024; + std::vector buf(BUF); + while (f) { + f.read(reinterpret_cast(buf.data()), BUF); + std::streamsize got = f.gcount(); + if (got > 0) sha256_update(c, buf.data(), size_t(got)); + } + return sha256_final(c); +} + +// Map LLAMA_FTYPE_* int → operator-friendly tag (Q4_K_M, IQ4_XS, BF16, …). +// Kept inline so we don't pull in llama.h here — those enum values are part +// of the GGUF on-disk format and won't change without a format bump. +const char * llama_ftype_name(int32_t v) { + switch (v) { + case 0: return "F32"; + case 1: return "F16"; + case 2: return "Q4_0"; + case 3: return "Q4_1"; + case 7: return "Q8_0"; + case 8: return "Q5_0"; + case 9: return "Q5_1"; + case 10: return "Q2_K"; + case 11: return "Q3_K_S"; + case 12: return "Q3_K_M"; + case 13: return "Q3_K_L"; + case 14: return "Q4_K_S"; + case 15: return "Q4_K_M"; + case 16: return "Q5_K_S"; + case 17: return "Q5_K_M"; + case 18: return "Q6_K"; + case 19: return "IQ2_XXS"; + case 20: return "IQ2_XS"; + case 21: return "Q2_K_S"; + case 22: return "IQ3_XS"; + case 23: return "IQ3_XXS"; + case 24: return "IQ1_S"; + case 25: return "IQ4_NL"; + case 26: return "IQ3_S"; + case 27: return "IQ3_M"; + case 28: return "IQ2_S"; + case 29: return "IQ2_M"; + case 30: return "IQ4_XS"; + case 31: return "IQ1_M"; + case 32: return "BF16"; + case 36: return "TQ1_0"; + case 37: return "TQ2_0"; + case 38: return "MXFP4_MOE"; + case 39: return "NVFP4"; + case 40: return "Q1_0"; + case 1024: return "GUESSED"; + default: return ""; + } +} + +bool read_sidecar_sha(const std::string & path, std::string & out) { + std::ifstream f(path + ".sha256"); + if (!f) return false; + std::string s; + f >> s; // tolerate ` filename\n` (sha256sum format) — we only want the first token + if (s.size() != 64) return false; + for (char c : s) { + bool hex = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f'); + if (!hex) return 
false; + } + out = std::move(s); + return true; +} + +void write_sidecar_sha(const std::string & path, const std::string & sha) { + // Best-effort. If the directory isn't writable (read-only mount, model + // dir owned by another user), we just skip — the in-memory hash is + // already what /props will report this run. + std::ofstream f(path + ".sha256"); + if (!f) return; + f << sha << "\n"; +} + +} // namespace + +GgufMetadata read_gguf_metadata(const std::string & path, + bool compute_sha256) { + GgufMetadata m; + m.path = path; + + struct stat st{}; + if (::stat(path.c_str(), &st) == 0) { + m.size_bytes = int64_t(st.st_size); + } + + gguf_init_params gip{}; + gip.no_alloc = true; + gip.ctx = nullptr; + gguf_context * gctx = gguf_init_from_file(path.c_str(), gip); + if (!gctx) { + // No GGUF header → bail. Still report path/size if we got them. + return m; + } + m.ok = true; + + auto get_str = [&](const char * key, std::string & out) { + int64_t id = gguf_find_key(gctx, key); + if (id < 0) return; + const char * v = gguf_get_val_str(gctx, id); + if (v) out = v; + }; + auto get_u32 = [&](const char * key, int32_t & out) { + int64_t id = gguf_find_key(gctx, key); + if (id < 0) return; + out = int32_t(gguf_get_val_u32(gctx, id)); + }; + + get_str("general.architecture", m.general_architecture); + get_str("general.name", m.general_name); + get_u32("general.file_type", m.file_type); + get_u32("general.quantization_version", m.quantization_version); + if (m.file_type >= 0) { + const char * name = llama_ftype_name(m.file_type); + if (name) m.file_type_name = name; + } + + if (!m.general_architecture.empty()) { + const std::string a = m.general_architecture; + get_u32((a + ".block_count").c_str(), m.block_count); + get_u32((a + ".embedding_length").c_str(), m.embedding_length); + get_u32((a + ".context_length").c_str(), m.context_length); + // vocab_size: prefer the explicit .vocab_size key. 
Fall back + // to the tokenizer token array length (the canonical source on + // models that don't write the redundant key). + get_u32((a + ".vocab_size").c_str(), m.vocab_size); + } + if (m.vocab_size < 0) { + int64_t toks_id = gguf_find_key(gctx, "tokenizer.ggml.tokens"); + if (toks_id >= 0) { + m.vocab_size = int32_t(gguf_get_arr_n(gctx, toks_id)); + } + } + + gguf_free(gctx); + + if (compute_sha256) { + std::string cached; + if (read_sidecar_sha(path, cached)) { + m.sha256 = std::move(cached); + } else { + std::string hash = sha256_of_file(path); + if (!hash.empty()) { + m.sha256 = hash; + write_sidecar_sha(path, hash); + } + } + } + + return m; +} + } // namespace dflash::common diff --git a/server/src/common/gguf_inspect.h b/server/src/common/gguf_inspect.h index 11c11379e..6e7a15827 100644 --- a/server/src/common/gguf_inspect.h +++ b/server/src/common/gguf_inspect.h @@ -5,6 +5,7 @@ #pragma once +#include #include namespace dflash::common { @@ -18,4 +19,43 @@ struct GgufModelInfo { // Returns info with arch="" and n_layer=-1 on failure. GgufModelInfo inspect_gguf_model_info(const char * path); +// Richer GGUF identity captured at server startup and re-emitted at /props. +// All header values are best-effort: missing keys leave the corresponding +// field at the listed default (empty string or -1). `ok` is false only if +// the file itself couldn't be opened (path missing, not a GGUF, etc.). +// +// The intent is "exactly what binary + GGUF + quant + sha256 is loaded"; +// any field the file doesn't carry stays at the default so consumers can +// distinguish "not in GGUF" (-1) from "0" (legitimately zero). +struct GgufMetadata { + bool ok = false; // false: open failed, all other fields ignorable + std::string path; // absolute filesystem path passed in + int64_t size_bytes = -1; // file size (-1 if stat failed) + std::string sha256; // lowercase hex sha256 (empty if not computed) + + // Header fields (`general.*` + `.*`). All optional. 
+ std::string general_architecture; // raw value of "general.architecture" + std::string general_name; // "general.name" (display string) + int32_t file_type = -1; // "general.file_type" (LLAMA_FTYPE_* int) + std::string file_type_name; // decoded LLAMA_FTYPE_* (e.g. "Q4_K_M", "IQ4_XS") + int32_t quantization_version = -1; // "general.quantization_version" + + int32_t block_count = -1; // ".block_count" + int32_t embedding_length = -1; // ".embedding_length" + int32_t context_length = -1; // ".context_length" + int32_t vocab_size = -1; // ".vocab_size" (or tokenizer.ggml.tokens length) +}; + +// Read GGUF identity for /props. Set `compute_sha256` to hash the file (slow, +// O(size) — multi-GB GGUFs take ~30s on a fast SSD). When false, `sha256` +// stays empty. The header read is cheap (no weight load). +// +// When `compute_sha256` is true and a sidecar file `.sha256` exists, +// its first 64-hex-char token is trusted as the file's sha256 and the file +// is not re-hashed. After a successful hash, the result is written to the +// sidecar so subsequent restarts skip the rehash. Sidecar I/O failures are +// non-fatal — the in-memory hash still gets returned. 
+GgufMetadata read_gguf_metadata(const std::string & path, + bool compute_sha256); + } // namespace dflash::common diff --git a/server/src/common/layer_split_backend.cpp b/server/src/common/layer_split_backend.cpp index e45cbc104..e9e8b4c2e 100644 --- a/server/src/common/layer_split_backend.cpp +++ b/server/src/common/layer_split_backend.cpp @@ -57,8 +57,7 @@ GenerateResult LayerSplitBackend::run_from_state(const GenerateRequest & req, result.error = "context"; return result; } - if (req.do_sample && req.sampler.needs_logit_processing() && - !adapter_->supports_cpu_sampling()) { + if (req.do_sample && req.sampler.temp > 0.0f) { result.error = "sampling_unsupported"; return result; } diff --git a/server/src/common/layer_split_backend.h b/server/src/common/layer_split_backend.h index e85b5ad6a..76c336fa2 100644 --- a/server/src/common/layer_split_backend.h +++ b/server/src/common/layer_split_backend.h @@ -31,7 +31,6 @@ class LayerSplitAdapter { virtual bool decode_ar(int last_tok, int committed, int n_gen, std::vector & out_tokens, const DaemonIO & io) = 0; - virtual bool supports_cpu_sampling() const { return false; } virtual bool can_dflash_decode() const { return false; } virtual bool decode_dflash(const std::vector & prompt, diff --git a/server/src/common/model_backend.h b/server/src/common/model_backend.h index b808d0c39..088b354d7 100644 --- a/server/src/common/model_backend.h +++ b/server/src/common/model_backend.h @@ -102,6 +102,10 @@ struct GenerateRequest { const std::vector * hint_tokens = nullptr; // Optional thinking-budget hook — see BudgetHook docs above. BudgetHook budget_hook; + // Per-request override for target spec-decode verify fa_window. Set by + // http_server when pflash compresses, so verify sees the entire compressed + // prompt (not just the last cfg_.fa_window positions). Zero = no override. + int fa_window_override = 0; // Common retry knob. 
Upper layers set this after a speculative decode // path returns success but emits no tokens, so each backend can route the // retry through its existing AR path without copying retry policy. @@ -251,6 +255,10 @@ struct ModelBackend { std::string drafter_path; // GGUF path (for lazy-load) int drafter_gpu = 0; // backend-local GPU for PFlash drafter bool skip_park = false; // true on >=32GB GPUs + // Per-request transitive-cascade override (-1 = use env default). + // 0 = off (agentic path: suppress cascade to avoid anchor bloat). + // 1 = on (retrieval path: full expansion, same as today). + int use_transitive = -1; DraftResidencyAction residency_action = DraftResidencyAction::KeepLoaded; }; diff --git a/server/src/draft/draft_gguf_loader.cpp b/server/src/draft/draft_gguf_loader.cpp index fbec7263b..73a9c17bd 100644 --- a/server/src/draft/draft_gguf_loader.cpp +++ b/server/src/draft/draft_gguf_loader.cpp @@ -349,6 +349,63 @@ bool load_draft_gguf(const std::string & path, gguf_free(gctx); + // Structural defense: derive scalar dims from weight tensor shapes and + // assert against GGUF-declared metadata (Bug #2 class prevention). + // All draft layers have wq/wk (no deltanet mix), so use layer 0. + // wq is plain Q-only (no gate), so ne[1] = n_head * head_dim. + // fc is [n_target_layers*n_embd, n_embd], so ne[0] = n_target_layers*n_embd. 
+ { + const DraftLayer & L0 = out.layers[0]; + const int64_t derived_q_dim = L0.wq->ne[1]; + const int64_t derived_kv_dim = L0.wk->ne[1]; + const int64_t expected_q_dim = (int64_t)out.n_head * out.head_dim; + const int64_t expected_kv_dim = (int64_t)out.n_head_kv * out.head_dim; + if (derived_q_dim != expected_q_dim) { + char buf[256]; + std::snprintf(buf, sizeof(buf), + "draft GGUF shape mismatch: blk.0.attn_q.weight->ne[1]=%lld " + "!= n_head*head_dim=%d*%d=%lld", + (long long)derived_q_dim, + out.n_head, out.head_dim, (long long)expected_q_dim); + set_last_error(buf); + return false; + } + if (derived_kv_dim != expected_kv_dim) { + char buf[256]; + std::snprintf(buf, sizeof(buf), + "draft GGUF shape mismatch: blk.0.attn_k.weight->ne[1]=%lld " + "!= n_head_kv*head_dim=%d*%d=%lld", + (long long)derived_kv_dim, + out.n_head_kv, out.head_dim, (long long)expected_kv_dim); + set_last_error(buf); + return false; + } + const int64_t derived_n_embd = L0.wq->ne[0]; + if (derived_n_embd != (int64_t)out.n_embd) { + char buf[256]; + std::snprintf(buf, sizeof(buf), + "draft GGUF shape mismatch: blk.0.attn_q.weight->ne[0]=%lld != n_embd=%d", + (long long)derived_n_embd, out.n_embd); + set_last_error(buf); + return false; + } + // fc: [n_target_layers*n_embd, n_embd] — check fc->ne[0] against derived expectation + if (out.n_target_layers > 0) { + const int64_t derived_fc_in = out.fc->ne[0]; + const int64_t expected_fc_in = (int64_t)out.n_target_layers * out.n_embd; + if (derived_fc_in != expected_fc_in) { + char buf[256]; + std::snprintf(buf, sizeof(buf), + "draft GGUF shape mismatch: dflash.fc.weight->ne[0]=%lld " + "!= n_target_layers*n_embd=%d*%d=%lld", + (long long)derived_fc_in, + out.n_target_layers, out.n_embd, (long long)expected_fc_in); + set_last_error(buf); + return false; + } + } + } + char summary[192]; std::snprintf(summary, sizeof(summary), "draft GGUF loaded: %" PRId64 " tensors, %.2f GiB on GPU", diff --git a/server/src/gemma4/gemma4_graph.cpp 
b/server/src/gemma4/gemma4_graph.cpp index bc2276555..bf8f8ce7c 100644 --- a/server/src/gemma4/gemma4_graph.cpp +++ b/server/src/gemma4/gemma4_graph.cpp @@ -515,14 +515,13 @@ bool build_gemma4_layer_step( return ggml_gallocr_alloc_graph(sg.alloc, sg.gf); } -bool compute_gemma4_split_projection( +bool compute_gemma4_split_argmax( ggml_backend_t backend, const Gemma4Weights & w, ggml_tensor * act, int token_offset, int n_tokens, - std::vector * out_argmax, - std::vector * out_logits) { + std::vector & out_argmax) { ggml_init_params ip{}; ip.mem_size = ggml_tensor_overhead() * 64 + ggml_graph_overhead() + 1024 * 1024; ip.no_alloc = true; @@ -540,17 +539,9 @@ bool compute_gemma4_split_projection( cur = ggml_tanh(ctx, cur); cur = ggml_scale(ctx, cur, w.final_logit_softcap); } - ggml_tensor * logits = cur; - ggml_tensor * argmax = nullptr; - if (out_logits) { - ggml_set_output(logits); - ggml_build_forward_expand(gf, logits); - } - if (out_argmax) { - argmax = ggml_argmax(ctx, logits); - ggml_set_output(argmax); - ggml_build_forward_expand(gf, argmax); - } + cur = ggml_argmax(ctx, cur); + ggml_set_output(cur); + ggml_build_forward_expand(gf, cur); ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); if (!alloc || !ggml_gallocr_alloc_graph(alloc, gf)) { @@ -563,32 +554,14 @@ bool compute_gemma4_split_projection( ggml_free(ctx); return false; } - if (out_argmax) { - out_argmax->resize((size_t)n_tokens); - ggml_backend_tensor_get(argmax, out_argmax->data(), 0, - sizeof(int32_t) * (size_t)n_tokens); - } - if (out_logits) { - out_logits->resize((size_t)w.n_vocab * (size_t)n_tokens); - ggml_backend_tensor_get(logits, out_logits->data(), 0, - sizeof(float) * (size_t)w.n_vocab * (size_t)n_tokens); - } + out_argmax.resize((size_t)n_tokens); + ggml_backend_tensor_get(cur, out_argmax.data(), 0, + sizeof(int32_t) * (size_t)n_tokens); ggml_gallocr_free(alloc); ggml_free(ctx); return true; } -bool compute_gemma4_split_argmax( - ggml_backend_t 
backend, - const Gemma4Weights & w, - ggml_tensor * act, - int token_offset, - int n_tokens, - std::vector & out_argmax) { - return compute_gemma4_split_projection( - backend, w, act, token_offset, n_tokens, &out_argmax, nullptr); -} - bool gemma4_step( ggml_backend_t backend, const Gemma4Weights & w, diff --git a/server/src/gemma4/gemma4_layer_split_adapter.cpp b/server/src/gemma4/gemma4_layer_split_adapter.cpp index b212cdd4c..4e7c6a877 100644 --- a/server/src/gemma4/gemma4_layer_split_adapter.cpp +++ b/server/src/gemma4/gemma4_layer_split_adapter.cpp @@ -146,10 +146,7 @@ bool Gemma4LayerSplitAdapter::init() { } void Gemma4LayerSplitAdapter::begin_request(const GenerateRequest & req) { - sampler_ = req.sampler; - if (req.do_sample && sampler_.seed != 0) { - sampler_rng_.seed(sampler_.seed); - } + (void)req; } void Gemma4LayerSplitAdapter::reset_request_state() { @@ -157,14 +154,12 @@ void Gemma4LayerSplitAdapter::reset_request_state() { shard.cache.cur_pos = 0; shard.cache.last_tok = -1; } - prefill_last_logits_.clear(); } bool Gemma4LayerSplitAdapter::run_forward( const std::vector & tokens, int base_pos, - int & last_tok, - std::vector * logits_out) { + int & last_tok) { if (shards_.empty() || tokens.empty()) return false; const Gemma4Weights & ref = shards_.front().weights; const int hidden = ref.n_embd; @@ -347,9 +342,9 @@ bool Gemma4LayerSplitAdapter::run_forward( std::vector argmax; Gemma4LayerSplitShard & last = shards_.back(); - const bool ok = compute_gemma4_split_projection( + const bool ok = compute_gemma4_split_argmax( last.backend, last.weights, act_in, - n_tokens_total - 1, 1, &argmax, logits_out); + n_tokens_total - 1, 1, argmax); activation_buffer_free(orig); activation_pair_free(acts); if (!ok || argmax.empty()) return false; @@ -364,7 +359,7 @@ bool Gemma4LayerSplitAdapter::run_forward( bool Gemma4LayerSplitAdapter::prefill(const std::vector & prompt, int base_pos, int & last_tok) { - return run_forward(prompt, base_pos, last_tok, 
&prefill_last_logits_); + return run_forward(prompt, base_pos, last_tok); } bool Gemma4LayerSplitAdapter::decode_ar( @@ -377,13 +372,6 @@ bool Gemma4LayerSplitAdapter::decode_ar( if (shards_.empty()) return false; const auto & w = shards_.front().weights; - const int vocab = w.n_vocab; - std::vector logits_buf; - if (sampler_.needs_logit_processing()) { - if ((int)prefill_last_logits_.size() != vocab) return false; - last_tok = sample_logits(prefill_last_logits_.data(), vocab, sampler_, - out_tokens, sampler_rng_); - } out_tokens.push_back(last_tok); io.emit(last_tok); if (io.cancelled) { @@ -399,16 +387,7 @@ bool Gemma4LayerSplitAdapter::decode_ar( for (int i = 1; i < n_gen; ++i) { std::vector one(1, last_tok); int next_tok = -1; - logits_buf.clear(); - if (!run_forward(one, committed - 1, next_tok, - sampler_.needs_logit_processing() ? &logits_buf : nullptr)) { - return false; - } - if (sampler_.needs_logit_processing()) { - if ((int)logits_buf.size() != vocab) return false; - next_tok = sample_logits(logits_buf.data(), vocab, sampler_, - out_tokens, sampler_rng_); - } + if (!run_forward(one, committed - 1, next_tok)) return false; last_tok = next_tok; out_tokens.push_back(last_tok); io.emit(last_tok); @@ -482,7 +461,6 @@ bool Gemma4LayerSplitAdapter::snapshot_save(int slot) { } snap.cur_pos = snap_pos; snap.last_tok = shards_.front().cache.last_tok; - snap.prefill_last_logits = prefill_last_logits_; return true; } @@ -494,7 +472,6 @@ void Gemma4LayerSplitAdapter::snapshot_free(int slot) { } snap.cur_pos = 0; snap.last_tok = -1; - snap.prefill_last_logits.clear(); if (snap.shards.size() != shards_.size()) snap.shards.resize(shards_.size()); } @@ -505,7 +482,6 @@ bool Gemma4LayerSplitAdapter::snapshot_used(int slot) const { } const auto & snap = snapshots_[(size_t)slot]; if (snap.cur_pos <= 0 || snap.shards.size() != shards_.size()) return false; - if (snap.prefill_last_logits.empty()) return false; for (const auto & ss : snap.shards) { if (!ss.ctx) return false; 
} @@ -545,7 +521,6 @@ bool Gemma4LayerSplitAdapter::snapshot_restore(int slot) { shards_[i].cache.cur_pos = snap.cur_pos; shards_[i].cache.last_tok = snap.last_tok; } - prefill_last_logits_ = snap.prefill_last_logits; return true; } diff --git a/server/src/gemma4/gemma4_layer_split_adapter.h b/server/src/gemma4/gemma4_layer_split_adapter.h index b4238fd63..3a3050f7e 100644 --- a/server/src/gemma4/gemma4_layer_split_adapter.h +++ b/server/src/gemma4/gemma4_layer_split_adapter.h @@ -30,7 +30,6 @@ struct Gemma4LayerSplitSnapshot { int cur_pos = 0; int32_t last_tok = -1; std::vector shards; - std::vector prefill_last_logits; }; class Gemma4LayerSplitAdapter : public LayerSplitAdapter { @@ -53,7 +52,6 @@ class Gemma4LayerSplitAdapter : public LayerSplitAdapter { bool decode_ar(int last_tok, int committed, int n_gen, std::vector & out_tokens, const DaemonIO & io) override; - bool supports_cpu_sampling() const override { return true; } bool snapshot_save(int slot) override; void snapshot_free(int slot) override; @@ -68,17 +66,13 @@ class Gemma4LayerSplitAdapter : public LayerSplitAdapter { private: bool run_forward(const std::vector & tokens, int base_pos, - int & last_tok, - std::vector * logits_out = nullptr); + int & last_tok); Gemma4LayerSplitAdapterConfig cfg_; std::vector shards_; std::vector snapshot_backends_; std::vector snapshots_; static constexpr int PREFIX_SLOTS = ModelBackend::kMaxSlots; - SamplerCfg sampler_; - std::mt19937_64 sampler_rng_{std::random_device{}()}; - std::vector prefill_last_logits_; }; void free_gemma4_layer_split_shards(std::vector & shards); diff --git a/server/src/qwen3/qwen3_backend.cpp b/server/src/qwen3/qwen3_backend.cpp index 253886978..b42aac96e 100644 --- a/server/src/qwen3/qwen3_backend.cpp +++ b/server/src/qwen3/qwen3_backend.cpp @@ -952,7 +952,9 @@ ModelBackend::CompressResult Qwen3Backend::compress(const CompressRequest & req) } result.compressed_ids = drafter_score_and_compress( - drafter_ctx_, req.input_ids, req.keep_ratio); 
+ drafter_ctx_, req.input_ids, req.keep_ratio, + /*chunk_size=*/32, /*n_lookahead=*/8, /*pool_kernel=*/13, + req.use_transitive); result.ok = true; if (req.residency_action == DraftResidencyAction::ReleaseAfterUse) { diff --git a/server/src/qwen3/qwen3_graph.cpp b/server/src/qwen3/qwen3_graph.cpp index a23bcefb3..c2715a356 100644 --- a/server/src/qwen3/qwen3_graph.cpp +++ b/server/src/qwen3/qwen3_graph.cpp @@ -5,23 +5,10 @@ // buffers. Sliding-window flash-attention via ggml-cuda's tensor-core // `flash_attn_ext` keeps attention cost linear in S. // -// **Algorithmic note vs blog**: -// The blog stack is Liu Q-hook tail scoring + FlashPrefill block-sparse FA. -// The Liu Q-hook is implemented with a NoPE fix: by default (DFLASH_FP_NOPE_TAIL=1) -// the tail score uses pre-RoPE K/Q, removing the RoPE distance decay that -// buries early-position needle chunks and was causing NIAH failures. -// Set DFLASH_FP_NOPE_TAIL=0 to revert to post-RoPE scoring. The block-sparse FA is replaced -// with a sliding-window approximation here because (a) ggml-cuda's -// `flash_attn_ext` already gives tensor-core speed inside the ubatch -// graph, and (b) our own block-sparse CUDA kernel needs a tensor-core -// rewrite (mma.sync.aligned) to actually beat ggml's FA — see -// `src/flashprefill_kernels.cu` for the (slow) scalar reference path. -// At S=140K with W=512 sliding window the NIAH magic key still propagates -// through 28 layers and is recovered in the kept tokens, so this -// approximation passes the actual e2e correctness check the user cares -// about. The block-sparse FA upgrade remains the next deliverable for -// "match the article algorithmically", but is functionally equivalent -// for the deployed perf budget today. +// Tail score uses pre-RoPE K/Q (DFLASH_FP_NOPE_TAIL=1 default) to remove +// distance decay that buries early-position needle chunks (NIAH fix). 
+// Block-sparse FA replaced by sliding-window via ggml-cuda flash_attn_ext; +// BSA upgrade tracked in flashprefill_kernels.cu. // // Memory at S=140K, B=1, H=16, Hk=8, D=128, hidden=1024, ff=3072: // weights ~1.5 GB @@ -35,6 +22,7 @@ #include "qwen3_drafter_model.h" #include "internal.h" #include "flashprefill.h" +#include "../common/score_range.h" #include "device_runtime.h" @@ -249,13 +237,30 @@ bool forward_qwen3_drafter_model( } running_max.assign((size_t)n_lookahead * S, -INFINITY); + // Pre-compute score range to skip K_norope alloc for non-scoring layers. + // At S=128K this trims ~5.6 GB (21 × 268 MB); see test_drafter_warm_path_regression. + static const int score_layers_pre = []() -> int { + const char * e = std::getenv("PFLASH_DRAFTER_SCORE_LAYERS"); + if (e) { int v = std::atoi(e); if (v > 0) return v; } + return -1; + }(); + static const int early_exit_pre = []() -> int { + const char * e = std::getenv("PFLASH_DRAFTER_EARLY_EXIT_N"); + if (e) { int v = std::atoi(e); if (v > 0) return v; } + return -1; + }(); + const int fwd_layer_limit_pre = (early_exit_pre > 0 && early_exit_pre < w.n_layer) + ? early_exit_pre : w.n_layer; + const ScoreRange pre_range = compute_score_range(w.n_layer, score_layers_pre, fwd_layer_limit_pre); + const int score_layer_start_pre = pre_range.start; + const int n_score_layers = pre_range.count(); + PersBuf hidden_buf, pos_buf, mask_tail_buf, Q_buf, attn_out_buf; std::vector K_curr_v((size_t)w.n_layer); std::vector V_curr_v((size_t)w.n_layer); std::vector Q_last_v((size_t)w.n_layer); - // NoPE: pre-RoPE K (full sequence) and Q tail; allocated only when nope_tail. - std::vector K_norope_v(nope_tail ? (size_t)w.n_layer : 0); - std::vector Q_norope_v(nope_tail ? (size_t)w.n_layer : 0); + std::vector K_norope_v(nope_tail ? (size_t)n_score_layers : 0); + std::vector Q_norope_v(nope_tail ? 
(size_t)n_score_layers : 0); auto cleanup_all = [&]() { free_pers(hidden_buf); free_pers(pos_buf); @@ -294,9 +299,10 @@ bool forward_qwen3_drafter_model( cleanup_all(); return false; } - if (nope_tail) { - if (!make_pers(w.backend, half_type, 3, d_kv, K_norope_v[il]) || - !make_pers(w.backend, GGML_TYPE_F32, 3, d_ql, Q_norope_v[il])) { + if (nope_tail && il >= score_layer_start_pre && il < fwd_layer_limit_pre) { + const int si = il - score_layer_start_pre; + if (!make_pers(w.backend, half_type, 3, d_kv, K_norope_v[si]) || + !make_pers(w.backend, GGML_TYPE_F32, 3, d_ql, Q_norope_v[si])) { set_last_error("forward_qwen3: K_norope/Q_norope alloc failed at layer " + std::to_string(il)); cleanup_all(); return false; @@ -372,7 +378,10 @@ bool forward_qwen3_drafter_model( double t_b_warm = 0.0, t_b_setup = 0.0, t_b_alloc = 0.0, t_b_copy_in = 0.0, t_b_norm = 0.0, t_compute_b = 0.0, t_b_copy_out = 0.0; double t_fp = 0.0; - for (int il = 0; il < w.n_layer; ++il) { + const int fwd_layer_limit = (early_exit_pre > 0 && early_exit_pre < w.n_layer) + ? early_exit_pre : w.n_layer; + + for (int il = 0; il < fwd_layer_limit; ++il) { const auto & L = w.layers[il]; const bool debug_first_layer = (il == 0 && std::getenv("DFLASH_FP_DEBUG_LAYER0") != nullptr); @@ -411,19 +420,22 @@ bool forward_qwen3_drafter_model( ggml_tensor * Q = ggml_mul_mat(gA, L.wq, h_norm); Q = ggml_reshape_3d(gA, Q, D, H, cl); - Q = ggml_rms_norm(gA, Q, eps); - Q = ggml_mul(gA, Q, L.q_norm); - // NoPE: capture pre-RoPE Q tail so the tail scorer is not biased by distance. - if (nope_tail) { + if (L.q_norm) { + Q = ggml_rms_norm(gA, Q, eps); + Q = ggml_mul(gA, Q, L.q_norm); + } + // NoPE: capture pre-RoPE Q tail (only for layers that will be scored). 
+ if (nope_tail && il >= score_layer_start_pre) { + const int si = il - score_layer_start_pre; const int tail_lo_nr = S - n_lookahead; - if (tail_lo_nr >= cs && tail_lo_nr < cs + cl) { + if (tail_lo_nr >= cs && tail_lo_nr + n_lookahead <= cs + cl) { const int local_lo_nr = tail_lo_nr - cs; ggml_tensor * Q_prenrope_tail = ggml_view_3d( gA, Q, D, H, n_lookahead, Q->nb[1], Q->nb[2], (size_t)local_lo_nr * Q->nb[2]); ggml_build_forward_expand(gfA, - ggml_cpy(gA, Q_prenrope_tail, Q_norope_v[il].t)); + ggml_cpy(gA, Q_prenrope_tail, Q_norope_v[si].t)); } } Q = ggml_rope_ext(gA, Q, pos_chunk, nullptr, D, @@ -432,12 +444,15 @@ bool forward_qwen3_drafter_model( ggml_tensor * K = ggml_mul_mat(gA, L.wk, h_norm); K = ggml_reshape_3d(gA, K, D, Hk, cl); - K = ggml_rms_norm(gA, K, eps); - K = ggml_mul(gA, K, L.k_norm); - // NoPE: save pre-RoPE K chunk alongside K_curr_v. - if (nope_tail) { - const size_t kn_esz = ggml_element_size(K_norope_v[il].t); - ggml_tensor * Kn_dst = ggml_view_3d(gA, K_norope_v[il].t, D, Hk, cl, + if (L.k_norm) { + K = ggml_rms_norm(gA, K, eps); + K = ggml_mul(gA, K, L.k_norm); + } + // NoPE: save pre-RoPE K chunk (only for layers that will be scored). + if (nope_tail && il >= score_layer_start_pre) { + const int si = il - score_layer_start_pre; + const size_t kn_esz = ggml_element_size(K_norope_v[si].t); + ggml_tensor * Kn_dst = ggml_view_3d(gA, K_norope_v[si].t, D, Hk, cl, kn_esz * D, kn_esz * D * Hk, (size_t)cs * kn_esz * D * Hk); ggml_build_forward_expand(gfA, ggml_cpy(gA, K, Kn_dst)); @@ -466,7 +481,7 @@ bool forward_qwen3_drafter_model( // Copy Q tail to Q_last_v[il] in the chunk that contains the tail. 
const int tail_lo = S - n_lookahead; - if (tail_lo >= cs && tail_lo < cs + cl) { + if (tail_lo >= cs && tail_lo + n_lookahead <= cs + cl) { int local_lo = tail_lo - cs; ggml_tensor * Q_tail_local = ggml_view_3d( gA, Q, D, H, n_lookahead, @@ -707,12 +722,12 @@ bool forward_qwen3_drafter_model( } #endif - if (il == 0 || il == w.n_layer - 1) { + if (il == 0 || il == fwd_layer_limit - 1) { std::fprintf(stderr, "[qwen3-0.6b-fp] layer %d/%d done " "(A_setup=%.3fs A_alloc=%.3fs A_compute=%.3fs FP=%.3fs " "B_warm=%.3fs B_setup=%.3fs B_alloc=%.3fs B_copy_in=%.3fs B_norm=%.3fs B_compute=%.3fs B_copy_out=%.3fs)\n", - il + 1, w.n_layer, + il + 1, fwd_layer_limit, t_a_setup, t_a_alloc, t_compute_a, t_fp, t_b_warm, t_b_setup, t_b_alloc, t_b_copy_in, t_b_norm, t_compute_b, t_b_copy_out); std::fflush(stderr); @@ -724,19 +739,28 @@ bool forward_qwen3_drafter_model( auto t_fwd_end = std::chrono::steady_clock::now(); double t_fwd = std::chrono::duration(t_fwd_end - t_total_start).count(); - // Tail attention scoring (unchanged from previous impl). + // Tail attention scoring. + // score_layers_pre / compute_score_range already determined the range before + // allocation (to size K_norope_v correctly). Re-use that result here. + // score_layer_start_pre == score_layer_start by construction (same formula, + // same env vars, same fwd_layer_limit_pre == fwd_layer_limit). + const int score_layer_start = score_layer_start_pre; + const int score_layer_end = fwd_layer_limit; + std::vector probs_h((size_t)S * n_lookahead * H); auto t_score_start = std::chrono::steady_clock::now(); - for (int il = 0; il < w.n_layer; ++il) { + for (int il = score_layer_start; il < score_layer_end; ++il) { ggml_init_params ip{}; ip.mem_size = ggml_tensor_overhead() * 32 + ggml_graph_overhead() + 16 * 1024; ip.no_alloc = true; ggml_context * gctx = ggml_init(ip); + // K_norope_v / Q_norope_v are indexed from score_layer_start_pre. 
+ const int si = il - score_layer_start_pre; ggml_tensor * K_f32 = ggml_new_tensor_3d(gctx, GGML_TYPE_F32, D, Hk, S); ggml_tensor * K_cast = ggml_cpy(gctx, - nope_tail ? K_norope_v[il].t : K_curr_v[il].t, K_f32); + nope_tail ? K_norope_v[si].t : K_curr_v[il].t, K_f32); ggml_tensor * K_perm = ggml_cont(gctx, ggml_permute(gctx, K_cast, 0, 2, 1, 3)); ggml_tensor * K_score = K_perm; @@ -749,7 +773,7 @@ bool forward_qwen3_drafter_model( } ggml_tensor * Q_tail_perm = ggml_cont(gctx, ggml_permute(gctx, - nope_tail ? Q_norope_v[il].t : Q_last_v[il].t, + nope_tail ? Q_norope_v[si].t : Q_last_v[il].t, 0, 2, 1, 3)); ggml_tensor * attn_score = ggml_mul_mat(gctx, K_score, Q_tail_perm); ggml_tensor * probs = ggml_soft_max_ext(gctx, attn_score, mask_tail_buf.t, @@ -796,8 +820,9 @@ bool forward_qwen3_drafter_model( double t_score = std::chrono::duration(t_total_end - t_score_start).count(); std::fprintf(stderr, "[qwen3-0.6b-fp] forward %.2fs (S=%d, A_setup=%.2fs A_alloc=%.2fs A_compute=%.2fs FP=%.2fs B_warm=%.2fs B_setup=%.2fs B_alloc=%.2fs B_copy_in=%.2fs B_norm=%.2fs B_compute=%.2fs B_copy_out=%.2fs) " - "tail-score %.2fs total %.2fs\n", - t_fwd, S, t_a_setup, t_a_alloc, t_compute_a, t_fp, t_b_warm, t_b_setup, t_b_alloc, t_b_copy_in, t_b_norm, t_compute_b, t_b_copy_out, t_score, t_fwd + t_score); + "tail-score %.2fs (layers %d-%d) total %.2fs\n", + t_fwd, S, t_a_setup, t_a_alloc, t_compute_a, t_fp, t_b_warm, t_b_setup, t_b_alloc, t_b_copy_in, t_b_norm, t_compute_b, t_b_copy_out, + t_score, score_layer_start, score_layer_end - 1, t_fwd + t_score); std::fflush(stderr); cleanup_all(); diff --git a/server/src/qwen3/qwen3_loader.cpp b/server/src/qwen3/qwen3_loader.cpp index ed38ee106..b7b35a85e 100644 --- a/server/src/qwen3/qwen3_loader.cpp +++ b/server/src/qwen3/qwen3_loader.cpp @@ -133,6 +133,18 @@ bool load_qwen3_drafter_model(const std::string & path, out.head_dim = (int)get_u32(gctx, "qwen3.attention.key_length", 128); out.rope_theta = get_f32(gctx, "qwen3.rope.freq_base", 
1000000.0f); + // Detect weight quant type from blk.0.attn_q.weight; support BF16 and Q8_0. + ggml_type wtype = GGML_TYPE_BF16; + { + int64_t tidx = gguf_find_tensor(gctx, "blk.0.attn_q.weight"); + if (tidx >= 0) { + wtype = gguf_get_tensor_type(gctx, tidx); + } + } + std::fprintf(stderr, "[qwen3-0.6b] detected weight type: %s\n", + wtype == GGML_TYPE_Q8_0 ? "Q8_0" : "BF16"); + std::fflush(stderr); + // Compute total tensor metadata size for context allocation. const int n_layer = out.n_layer; const int n_tensors_per_layer = 11; diff --git a/server/src/qwen35/c2_gate.h b/server/src/qwen35/c2_gate.h new file mode 100644 index 000000000..51c644e2c --- /dev/null +++ b/server/src/qwen35/c2_gate.h @@ -0,0 +1,31 @@ +// C2 gate predicate — pure function, no GPU/model deps. +// Extracted from qwen35_backend.cpp for testability. +// +// Reasoning: when pflash compresses a 128K prompt to ~11K tokens, the +// target KV at decode time = 11K (small). T_target is fast (small KV), +// T_draft ≈ constant. r = T_draft/T_target ≈ 1, so spec-decode does NOT +// win over AR. Empirical: D_composition 128K: AR=27.5 tok/s, spec=5.74 tok/s. +// Gate correctly blocks spec-decode when eff_fa_window > 2*fa_window_cfg. +#pragma once + +namespace dflash::common { + +// Returns true if spec-decode should be attempted. +// fa_window_override: 0 = no pflash; else = compressed_prompt_size + 256 +// fa_window_cfg : cfg_.fa_window (default 2048) +// kv_committed : KV position after prefill (unused; kept for future use) +// +// Gate: permit spec-decode when eff_fa_window <= 2 * fa_window_cfg. +// For uncompressed (override==0): always permit. +// For pflash-compressed: permit only when compressed_size <= 3840 tokens. +// At compressed_size > 3840, target KV is large enough that AR is faster +// than spec-decode (empirically: D_composition 128K AR=27.5 vs spec=5.74 tok/s). 
+inline bool c2_spec_decode_permitted(int fa_window_override, + int fa_window_cfg, + int kv_committed) { + (void)kv_committed; + return (fa_window_override == 0) + || (fa_window_override <= 2 * fa_window_cfg); +} + +} // namespace dflash::common diff --git a/server/src/qwen35/gguf_target_loader.cpp b/server/src/qwen35/gguf_target_loader.cpp index 116ddafc0..8628eb3ab 100644 --- a/server/src/qwen35/gguf_target_loader.cpp +++ b/server/src/qwen35/gguf_target_loader.cpp @@ -38,10 +38,7 @@ // ssm_out.weight [inner, hidden] Q5_K // ffn_gate/up/down (same as full-attn) // -// This loader reads the file via ggml's built-in GGUF API, which returns a -// ggml_context pre-populated with tensors. We then wire that context onto -// the CUDA backend (via ggml_backend_alloc_ctx_tensors) and copy each -// tensor's bytes from the mmap'd file. +// Loads via ggml GGUF API; tensors copied from mmap to CUDA backend. #include "internal.h" #include "common/layer_split_utils.h" @@ -738,6 +735,51 @@ bool load_target_gguf_partial(const std::string & path, gguf_free(gctx); + // Structural defense: derive scalar dims from weight tensor shapes and + // assert against GGUF-declared metadata. Catches stale/zero dw_ or w_ + // scalars before they silently corrupt graph-build (Bug #2 class). + // Uses the first full-attention layer (il = fai-1) because deltanet + // layers don't carry wq/wk. wq packs Q+gate so ne[1] = n_head*kl*2. 
+ { + const int fa_il = out.full_attention_interval - 1; // first full-attn layer + const TargetLayer & fa = out.layers[(size_t)fa_il]; + if (fa.wq && fa.wk) { + const int64_t derived_q_dim = fa.wq->ne[1]; // n_head * head_dim * 2 + const int64_t derived_kv_dim = fa.wk->ne[1]; // n_head_kv * head_dim + const int64_t expected_q_dim = (int64_t)out.n_head * out.n_embd_head_k * 2; + const int64_t expected_kv_dim = (int64_t)out.n_head_kv * out.n_embd_head_k; + if (derived_q_dim != expected_q_dim) { + char buf[256]; + std::snprintf(buf, sizeof(buf), + "GGUF shape mismatch: blk.%d.attn_q.weight->ne[1]=%lld " + "!= n_head*head_dim*2=%d*%d*2=%lld", + fa_il, (long long)derived_q_dim, + out.n_head, out.n_embd_head_k, (long long)expected_q_dim); + set_last_error(buf); + return false; + } + if (derived_kv_dim != expected_kv_dim) { + char buf[256]; + std::snprintf(buf, sizeof(buf), + "GGUF shape mismatch: blk.%d.attn_k.weight->ne[1]=%lld " + "!= n_head_kv*head_dim=%d*%d=%lld", + fa_il, (long long)derived_kv_dim, + out.n_head_kv, out.n_embd_head_k, (long long)expected_kv_dim); + set_last_error(buf); + return false; + } + const int64_t derived_n_embd = fa.wq->ne[0]; // input dim = n_embd + if (derived_n_embd != (int64_t)out.n_embd) { + char buf[256]; + std::snprintf(buf, sizeof(buf), + "GGUF shape mismatch: blk.%d.attn_q.weight->ne[0]=%lld != n_embd=%d", + fa_il, (long long)derived_n_embd, out.n_embd); + set_last_error(buf); + return false; + } + } + } + if (tok_embd_off == 0 || tok_embd_type == GGML_TYPE_COUNT) { set_last_error("token_embd.weight not found or invalid type"); return false; diff --git a/server/src/qwen35/layer_split_forward.cpp b/server/src/qwen35/layer_split_forward.cpp index 5fd774cc0..d1ab66587 100644 --- a/server/src/qwen35/layer_split_forward.cpp +++ b/server/src/qwen35/layer_split_forward.cpp @@ -17,7 +17,7 @@ namespace dflash::common { -bool compute_target_split_projection( +bool compute_target_split_argmax( StepGraph & sg, const TargetWeights & w, 
ggml_backend_t backend, @@ -26,8 +26,7 @@ bool compute_target_split_projection( int n_tokens, int hidden, int vocab, - std::vector * argmax_out, - std::vector * logits_out) { + std::vector & argmax_out) { step_graph_free(sg); ggml_init_params ip{}; ip.mem_size = 256 * 1024 * 1024; @@ -44,51 +43,24 @@ bool compute_target_split_projection( ggml_tensor * logits = ggml_mul_mat(sg.ctx, w.output, normed); ggml_set_name(logits, "target_split_logits"); sg.logits = logits; - if (argmax_out) { - sg.argmax_tokens = ggml_argmax(sg.ctx, logits); - ggml_set_name(sg.argmax_tokens, "target_split_argmax"); - ggml_set_output(sg.argmax_tokens); - } - if (logits_out) { - ggml_set_output(sg.logits); - } + sg.argmax_tokens = ggml_argmax(sg.ctx, logits); + ggml_set_name(sg.argmax_tokens, "target_split_argmax"); + ggml_set_output(sg.argmax_tokens); sg.gf = ggml_new_graph_custom(sg.ctx, 1024, false); - if (argmax_out) ggml_build_forward_expand(sg.gf, sg.argmax_tokens); - if (logits_out) ggml_build_forward_expand(sg.gf, sg.logits); + ggml_build_forward_expand(sg.gf, sg.argmax_tokens); if (!sg.alloc) { sg.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); } if (!ggml_gallocr_alloc_graph(sg.alloc, sg.gf)) return false; auto st = ggml_backend_graph_compute(backend, sg.gf); if (st != GGML_STATUS_SUCCESS) return false; - if (argmax_out) { - argmax_out->assign((size_t)n_tokens, 0); - ggml_backend_tensor_get(sg.argmax_tokens, argmax_out->data(), 0, - sizeof(int32_t) * (size_t)n_tokens); - } - if (logits_out) { - logits_out->assign((size_t)vocab * (size_t)n_tokens, 0.0f); - ggml_backend_tensor_get(sg.logits, logits_out->data(), 0, - sizeof(float) * (size_t)vocab * (size_t)n_tokens); - } + (void)vocab; + argmax_out.assign((size_t)n_tokens, 0); + ggml_backend_tensor_get(sg.argmax_tokens, argmax_out.data(), 0, + sizeof(int32_t) * (size_t)n_tokens); return true; } -bool compute_target_split_argmax( - StepGraph & sg, - const TargetWeights & w, - ggml_backend_t backend, - 
ggml_tensor * act, - int token_offset, - int n_tokens, - int hidden, - int vocab, - std::vector & argmax_out) { - return compute_target_split_projection( - sg, w, backend, act, token_offset, n_tokens, hidden, vocab, - &argmax_out, nullptr); -} - bool run_qwen35_layer_split_forward( std::vector & shards, const TargetWeights & embed_source, @@ -236,10 +208,9 @@ bool run_qwen35_layer_split_forward( const bool need_all_argmax = argmax_out != nullptr; const int argmax_offset = need_all_argmax ? 0 : (n_tokens_total - 1); const int argmax_count = need_all_argmax ? n_tokens_total : 1; - const bool ok = compute_target_split_projection( + const bool ok = compute_target_split_argmax( final_sg, last_shard.weights, last_shard.backend, act_in, - argmax_offset, argmax_count, hidden, vocab, - &argmax_tokens, logits_out); + argmax_offset, argmax_count, hidden, vocab, argmax_tokens); step_graph_destroy(final_sg); activation_pair_free(acts); if (!ok) return false; @@ -249,6 +220,7 @@ bool run_qwen35_layer_split_forward( shard.cache.last_tok = last_tok; } if (argmax_out) *argmax_out = std::move(argmax_tokens); + if (logits_out) logits_out->clear(); return true; } diff --git a/server/src/qwen35/layer_split_forward.h b/server/src/qwen35/layer_split_forward.h index bb01bff09..c04680fe4 100644 --- a/server/src/qwen35/layer_split_forward.h +++ b/server/src/qwen35/layer_split_forward.h @@ -32,18 +32,6 @@ bool compute_target_split_argmax( int vocab, std::vector & argmax_out); -bool compute_target_split_projection( - StepGraph & sg, - const TargetWeights & w, - ggml_backend_t backend, - ggml_tensor * act, - int token_offset, - int n_tokens, - int hidden, - int vocab, - std::vector * argmax_out, - std::vector * logits_out); - // Run a full forward pass through all shards, writing K/V into each shard's // cache. Returns the argmax of the last token in `last_tok`. // Optionally captures features into `feature_ring` / remote draft. 
diff --git a/server/src/qwen35/qwen35_dflash_target.h b/server/src/qwen35/qwen35_dflash_target.h index 6a72e48b5..69e134f1c 100644 --- a/server/src/qwen35/qwen35_dflash_target.h +++ b/server/src/qwen35/qwen35_dflash_target.h @@ -53,6 +53,11 @@ class Qwen35DFlashTarget : public DFlashTarget { int mask_token_id() const override; const std::vector & capture_layer_ids() const override; + // Per-call override for the verify-time flash-attention window. Used by + // do_spec_decode to widen the window when pflash compression has shrunk + // the prompt — see GenerateRequest.fa_window_override. + void set_fa_window(int fa) { fa_window_ = fa; } + private: TargetWeights & w_; TargetCache & cache_; diff --git a/server/src/qwen35/qwen35_layer_split_adapter.cpp b/server/src/qwen35/qwen35_layer_split_adapter.cpp index 51f68378c..a911f169b 100644 --- a/server/src/qwen35/qwen35_layer_split_adapter.cpp +++ b/server/src/qwen35/qwen35_layer_split_adapter.cpp @@ -86,7 +86,6 @@ bool Qwen35LayerSplitAdapter::init() { for (auto & slot : prefix_snapshots_) { slot.resize(shards_.size()); } - snapshot_prefill_logits_.resize(PREFIX_SLOTS); draft_feature_snapshots_.resize(PREFIX_SLOTS); return true; @@ -172,7 +171,6 @@ void Qwen35LayerSplitAdapter::begin_request(const GenerateRequest & req) { void Qwen35LayerSplitAdapter::reset_request_state() { for (auto & shard : shards_) reset_target_cache(shard.cache); - prefill_last_logits_.clear(); } int Qwen35LayerSplitAdapter::prefill_chunk_tokens() const { @@ -196,8 +194,7 @@ bool Qwen35LayerSplitAdapter::prefill(const std::vector & prompt, shards_, shards_.front().weights, prompt, base_pos, ubatch, last_tok, cfg_.kq_stride_pad, /*fa_window=*/0, (cfg_.run_dflash && !remote_draft_.active()) ? &feature_ring_ : nullptr, - /*argmax_out=*/nullptr, - &prefill_last_logits_, + /*argmax_out=*/nullptr, /*logits_out=*/nullptr, cfg_.run_dflash ? 
&remote_draft_ : nullptr); } @@ -222,8 +219,6 @@ bool Qwen35LayerSplitAdapter::snapshot_save(int slot) { return false; } } - if (snapshot_prefill_logits_.size() != (size_t)PREFIX_SLOTS) return false; - snapshot_prefill_logits_[(size_t)slot] = prefill_last_logits_; if (!snapshot_draft_features(slot)) { snapshot_free(slot); return false; @@ -236,9 +231,6 @@ void Qwen35LayerSplitAdapter::snapshot_free(int slot) { for (auto & snap : prefix_snapshots_[(size_t)slot]) { free_prefix_snapshot(snap); } - if (snapshot_prefill_logits_.size() == (size_t)PREFIX_SLOTS) { - snapshot_prefill_logits_[(size_t)slot].clear(); - } free_draft_feature_snapshot(slot); } @@ -249,10 +241,6 @@ bool Qwen35LayerSplitAdapter::snapshot_used(int slot) const { for (const auto & snap : snaps) { if (!snap.ctx) return false; } - if (snapshot_prefill_logits_.size() != (size_t)PREFIX_SLOTS || - snapshot_prefill_logits_[(size_t)slot].empty()) { - return false; - } if (cfg_.run_dflash && cfg_.draft_path) { if (draft_feature_snapshots_.size() != (size_t)PREFIX_SLOTS) return false; const auto & draft_snap = draft_feature_snapshots_[(size_t)slot]; @@ -277,8 +265,6 @@ bool Qwen35LayerSplitAdapter::snapshot_restore(int slot) { return false; } } - if (snapshot_prefill_logits_.size() != (size_t)PREFIX_SLOTS) return false; - prefill_last_logits_ = snapshot_prefill_logits_[(size_t)slot]; if (!restore_draft_features(slot)) return false; return true; } @@ -395,22 +381,14 @@ bool Qwen35LayerSplitAdapter::decode_ar( std::vector & out_tokens, const DaemonIO & io) { if (n_gen <= 0) return true; - const auto & w = shards_.front().weights; - const int vocab = w.n_vocab; - std::vector logits_buf; - if (sampler_.needs_logit_processing()) { - if ((int)prefill_last_logits_.size() != vocab) return false; - last_tok = sample_logits(prefill_last_logits_.data(), vocab, sampler_, - out_tokens, sampler_rng_); - } out_tokens.push_back(last_tok); io.emit(last_tok); if (io.cancelled) { io.emit(-1); return true; } - if 
(is_eos_tok(last_tok, w)) { + if (is_eos_tok(last_tok, shards_.front().weights)) { io.emit(-1); return true; } @@ -419,24 +397,16 @@ bool Qwen35LayerSplitAdapter::decode_ar( for (int i = 1; i < n_gen; ++i) { std::vector one(1, last_tok); int next_tok = -1; - logits_buf.clear(); if (!run_qwen35_layer_split_forward( shards_, shards_.front().weights, one, committed, 1, next_tok, cfg_.kq_stride_pad, cfg_.fa_window, - cfg_.run_dflash ? &feature_ring_ : nullptr, - /*argmax_out=*/nullptr, - sampler_.needs_logit_processing() ? &logits_buf : nullptr)) { + cfg_.run_dflash ? &feature_ring_ : nullptr)) { return false; } - if (sampler_.needs_logit_processing()) { - if ((int)logits_buf.size() != vocab) return false; - next_tok = sample_logits(logits_buf.data(), vocab, sampler_, - out_tokens, sampler_rng_); - } out_tokens.push_back(next_tok); io.emit(next_tok); if (io.cancelled) break; - if (is_eos_tok(next_tok, w)) break; + if (is_eos_tok(next_tok, shards_.front().weights)) break; last_tok = next_tok; ++committed; } @@ -445,7 +415,7 @@ bool Qwen35LayerSplitAdapter::decode_ar( } bool Qwen35LayerSplitAdapter::can_dflash_decode() const { - return cfg_.run_dflash && cfg_.draft_path && !sampler_.needs_logit_processing(); + return cfg_.run_dflash && cfg_.draft_path && sampler_.temp == 0.0f; } bool Qwen35LayerSplitAdapter::decode_dflash( @@ -533,7 +503,6 @@ void Qwen35LayerSplitAdapter::shutdown() { for (auto & snap : slot) free_prefix_snapshot(snap); } prefix_snapshots_.clear(); - snapshot_prefill_logits_.clear(); draft_feature_snapshots_.clear(); auto shard_metas = layer_split_shard_metas(shards_); free_layer_split_snapshot_backends(shard_metas, snapshot_backends_); diff --git a/server/src/qwen35/qwen35_layer_split_adapter.h b/server/src/qwen35/qwen35_layer_split_adapter.h index 9565cf6e6..bc778e8a8 100644 --- a/server/src/qwen35/qwen35_layer_split_adapter.h +++ b/server/src/qwen35/qwen35_layer_split_adapter.h @@ -57,7 +57,6 @@ class Qwen35LayerSplitAdapter : public LayerSplitAdapter 
{ bool decode_ar(int last_tok, int committed, int n_gen, std::vector & out_tokens, const DaemonIO & io) override; - bool supports_cpu_sampling() const override { return true; } bool can_dflash_decode() const override; bool decode_dflash(const std::vector & prompt, int base_pos, @@ -102,7 +101,6 @@ class Qwen35LayerSplitAdapter : public LayerSplitAdapter { bool pflash_drafter_loaded_ = false; static constexpr int PREFIX_SLOTS = ModelBackend::kMaxSlots; std::vector> prefix_snapshots_; - std::vector> snapshot_prefill_logits_; std::vector snapshot_backends_; struct DraftFeatureSnapshot { int cur_pos = 0; @@ -118,7 +116,6 @@ class Qwen35LayerSplitAdapter : public LayerSplitAdapter { SamplerCfg sampler_; std::mt19937_64 sampler_rng_{std::random_device{}()}; std::unique_ptr dflash_target_; - std::vector prefill_last_logits_; }; } // namespace dflash::common diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp index 363c9e9e6..c5dd5ed21 100644 --- a/server/test/test_server_unit.cpp +++ b/server/test/test_server_unit.cpp @@ -23,6 +23,7 @@ #include "placement/placement_config.h" #include "common/layer_split_backend.h" #include "common/layer_split_utils.h" +#include "qwen35/c2_gate.h" #include "placement/draft_residency.h" #include @@ -2696,6 +2697,58 @@ static void test_generate_result_accept_rate_zero_when_no_spec_decode() { TEST_ASSERT(r.accept_rate == 0.0f); } +// ═══════════════════════════════════════════════════════════════════════ +// C2 gate: c2_spec_decode_permitted() unit tests +// +// Gate logic: permit spec-decode when eff_fa_window <= 2*fa_window_cfg. +// eff_fa_window = fa_window_override when set, else fa_window_cfg. 
+// +// Empirical validation (Round 5 bench): +// - D_composition 128K: effective_in=10988, eff_fa_window=11244 > 4096 +// → gate BLOCKS spec-decode → AR at 27.5 tok/s (correct — spec at 5.74) +// - D_composition short: eff_fa_window <= 4096 → gate permits spec-decode +// ═══════════════════════════════════════════════════════════════════════ + +static void test_c2_gate_no_override_always_permits() { + // fa_window_override == 0 → no pflash, always spec-decode permitted. + TEST_ASSERT(dflash::common::c2_spec_decode_permitted(0, 2048, 1)); + TEST_ASSERT(dflash::common::c2_spec_decode_permitted(0, 2048, 4096)); + TEST_ASSERT(dflash::common::c2_spec_decode_permitted(0, 2048, 131072)); +} + +static void test_c2_gate_128k_compressed_blocks_spec() { + // Round 5 D 128K: effective_in=10988, fa_window_override=11244. + // 11244 > 2*2048=4096 → gate correctly BLOCKS spec-decode (AR wins empirically). + int fa_window_cfg = 2048; + int compressed_size = 10988; + int fa_window_override = compressed_size + 256; // = 11244 + TEST_ASSERT(!dflash::common::c2_spec_decode_permitted( + fa_window_override, fa_window_cfg, compressed_size)); +} + +static void test_c2_gate_65k_compressed_blocks_spec() { + // D 65K cell: effective_in≈5383, fa_window_override≈5639 > 4096 → blocks. + int compressed_size = 5383; + int fa_window_override = compressed_size + 256; + TEST_ASSERT(!dflash::common::c2_spec_decode_permitted( + fa_window_override, 2048, compressed_size)); +} + +static void test_c2_gate_small_compressed_permits_spec() { + // Small compressed KV (override <= 2*fa_window): spec-decode permitted. + // fa_window_override=3000 <= 4096 → permit + TEST_ASSERT(dflash::common::c2_spec_decode_permitted(3000, 2048, 2744)); + // fa_window_override=4096 == 2*2048 → permit (at boundary) + TEST_ASSERT(dflash::common::c2_spec_decode_permitted(4096, 2048, 3840)); +} + +static void test_c2_gate_boundary_at_2x_fa_window() { + // At exactly 2*fa_window_cfg: permit (<=). 
+ TEST_ASSERT(dflash::common::c2_spec_decode_permitted(4096, 2048, 3840)); + // At 2*fa_window_cfg + 1: block. + TEST_ASSERT(!dflash::common::c2_spec_decode_permitted(4097, 2048, 3841)); +} + int main() { std::fprintf(stderr, "══════════════════════════════════════════\n"); std::fprintf(stderr, " Server Unit Tests\n"); @@ -2867,6 +2920,13 @@ int main() { RUN_TEST(test_generate_result_accept_rate_in_usage_anthropic); RUN_TEST(test_generate_result_accept_rate_zero_when_no_spec_decode); + std::fprintf(stderr, "\n── C2 gate (spec-decode gate) ──\n"); + RUN_TEST(test_c2_gate_no_override_always_permits); + RUN_TEST(test_c2_gate_128k_compressed_blocks_spec); + RUN_TEST(test_c2_gate_65k_compressed_blocks_spec); + RUN_TEST(test_c2_gate_small_compressed_permits_spec); + RUN_TEST(test_c2_gate_boundary_at_2x_fa_window); + std::fprintf(stderr, "\n══════════════════════════════════════════\n"); std::fprintf(stderr, " Results: %d assertions, %d failures\n", test_count, test_failures);