diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
index d42762f35..c8158cf7a 100644
--- a/server/CMakeLists.txt
+++ b/server/CMakeLists.txt
@@ -217,6 +217,7 @@ add_library(dflash_common STATIC
     src/draft/draft_gguf_loader.cpp
     src/draft/draft_safetensors_loader.cpp
     src/draft/draft_graph.cpp
+    src/qwen3/anchor_scan.cpp
     src/qwen3/qwen3_drafter.cpp
     src/qwen3/qwen3_loader.cpp
     src/qwen3/qwen3_graph.cpp
@@ -576,6 +577,52 @@ if(DFLASH27B_TESTS)
         target_link_libraries(test_bandit_integration PRIVATE dflash_common)
         add_test(NAME bandit_integration COMMAND test_bandit_integration)
     endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_early_exit_score_range.cpp")
+        add_executable(test_drafter_early_exit_score_range
+            test/test_drafter_early_exit_score_range.cpp)
+        target_include_directories(test_drafter_early_exit_score_range PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/common)
+        add_test(NAME test_drafter_early_exit_score_range
+            COMMAND test_drafter_early_exit_score_range)
+    endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_regime_router.cpp")
+        add_executable(test_regime_router
+            test/test_regime_router.cpp)
+        target_include_directories(test_regime_router PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/common)
+        add_test(NAME regime_router
+            COMMAND test_regime_router)
+    endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_anchor_transitive.cpp")
+        add_executable(test_anchor_transitive
+            test/test_anchor_transitive.cpp
+            src/qwen3/anchor_scan.cpp)
+        target_include_directories(test_anchor_transitive PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/qwen3)
+        add_test(NAME test_anchor_transitive
+            COMMAND test_anchor_transitive)
+    endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_warm_path_regression.cpp")
+        add_executable(test_drafter_warm_path_regression
+            test/test_drafter_warm_path_regression.cpp)
+        target_include_directories(test_drafter_warm_path_regression PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/common)
+        add_test(NAME test_drafter_warm_path_regression
+            COMMAND test_drafter_warm_path_regression)
+    endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_tail_capture_guard.cpp")
+        # GREEN phase: built with TAIL_GUARD_USE_NEW_FORMULA — must pass after Bug #42 fix.
+        add_executable(test_drafter_tail_capture_guard
+            test/test_drafter_tail_capture_guard.cpp)
+        target_compile_definitions(test_drafter_tail_capture_guard PRIVATE
+            TAIL_GUARD_USE_NEW_FORMULA)
+        add_test(NAME test_drafter_tail_capture_guard
+            COMMAND test_drafter_tail_capture_guard)
+        # RED phase binary: same source WITHOUT the fix flag — documents the bug.
+        add_executable(test_drafter_tail_capture_guard_red
+            test/test_drafter_tail_capture_guard.cpp)
+        # Built without TAIL_GUARD_USE_NEW_FORMULA: uses the old (buggy) guard and is expected to FAIL; no add_test() entry is added for it here.
+    endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_draft_vs_reference.cpp")
         add_executable(test_draft_vs_reference test/test_draft_vs_reference.cpp)
         target_link_libraries(test_draft_vs_reference PRIVATE dflash_common)
diff --git a/server/README.md b/server/README.md
index c66703e80..f4640b5b8 100644
--- a/server/README.md
+++ b/server/README.md
@@ -306,6 +306,8 @@ tokens) is the path to bring code recall to the same ratio as prose.
 
 ## Quick start
 
+> **Looking for the prebuilt Docker image?** See [Quick start](../README.md#quick-start) in the top-level README — `ghcr.io/luce-org/lucebox-hub:cuda12` ships the dflash daemon and the Python server, and all weights can be bind-mounted from the host. The instructions below are for building dflash from source (kernel development, custom arch lists, non-Docker hosts).
+
 ```bash
 git clone --recurse-submodules https://github.com/Luce-Org/lucebox-hub
 cd lucebox-hub/dflash
diff --git a/server/scripts/bench_agent.py b/server/scripts/bench_agent.py
deleted file mode 100644
index 6953746c6..000000000
--- a/server/scripts/bench_agent.py
+++ /dev/null
@@ -1,506 +0,0 @@
-"""
-Agentic-workload benchmark — simulates Codex / Claude Code style requests.
-
-Where ``bench_llm.py`` exercises 40-250 token prompts (and skips anything
-``> 3500`` tokens) and reports decode tok/s only, real Codex / Claude Code
-clients send 5K-30K input tokens (system prompt + tool definitions +
-conversation history + tool-call file dumps) and the user-visible cost is
-prefill-dominated. This bench:
-
-  - Builds prompts from a real Codex system prompt fixture + a SWE-bench
-    Verified row (problem_statement + patch/test_patch as synthesised
-    "tool result" file context) padded to three length buckets:
-    ~2K, ~8K, ~24K tokens.
-  - Reports AR vs DFlash for each bucket: prefill_s, decode tok/s, TTFT,
-    total latency, AL — and BOTH the decode-only speedup (today's
-    RESULTS.md definition) and the user-visible total-latency speedup.
-  - Parses the per-stage [timing] block from ``test_dflash`` for
-    root-cause attribution.
-
-Usage:
-    python3 scripts/bench_agent.py                      # all buckets, n=5 each
-    python3 scripts/bench_agent.py --bucket 8k          # one bucket
-    python3 scripts/bench_agent.py --n-sample 1 --bucket 2k  # smoke
-
-Same env vars as ``bench_llm.py``: ``DFLASH_TARGET``, ``DFLASH_DRAFT``,
-``DFLASH_BIN``, ``DFLASH_BIN_AR``, ``DFLASH_TOKENIZER``.
-"""
-import argparse
-import json
-import os
-import re
-import struct
-import subprocess
-import tempfile
-import time
-from pathlib import Path
-
-ROOT = Path(__file__).resolve().parent.parent
-BIN_SUFFIX = ".exe" if os.name == "nt" else ""
-TARGET = os.environ.get(
-    "DFLASH_TARGET",
-    str(ROOT / "models" / "Qwen3.6-27B-Q4_K_M.gguf"),
-)
-_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q4_k_m.gguf"
-_LOCAL_DRAFT_ROOT = ROOT / "models" / "draft"
-DRAFT = None
-TEST_DFLASH = os.environ.get("DFLASH_BIN", str(ROOT / "build" / f"test_dflash{BIN_SUFFIX}"))
-TEST_GENERATE = os.environ.get("DFLASH_BIN_AR", str(ROOT / "build" / f"test_generate{BIN_SUFFIX}"))
-TOKENIZER = os.environ.get("DFLASH_TOKENIZER", "Qwen/Qwen3.5-27B")
-TMPDIR = Path(tempfile.gettempdir()) / "dflash_bench"
-TMPDIR.mkdir(parents=True, exist_ok=True)
-
-FIX_DIR = ROOT / "scripts" / "fixtures"
-SWE_PARQUET = FIX_DIR / "swe_bench" / "swe_bench_verified.parquet"
-SYS_PROMPT_SMALL = FIX_DIR / "agent_prompts" / "codex_gpt52_codex.md"   # ~1694 tok
-SYS_PROMPT_LARGE = FIX_DIR / "agent_prompts" / "codex_gpt52.md"         # ~4756 tok
-
-N_GEN = 256
-BUDGET = 22
-
-# Prompt buckets — target token counts (hit within ±20%).
-BUCKETS = {
-    "2k":  {"target": 2048,  "sys": SYS_PROMPT_SMALL},
-    "8k":  {"target": 8192,  "sys": SYS_PROMPT_LARGE},
-    "24k": {"target": 24576, "sys": SYS_PROMPT_LARGE},
-}
-
-
-# ── shared with bench_llm.py (intentionally duplicated to keep it standalone) ──
-def _find_draft_model(root: Path):
-    if root.is_file():
-        return str(root)
-    if not root.is_dir():
-        return None
-    for pattern in ("dflash-draft-*.gguf", "*.gguf", "model.safetensors"):
-        matches = sorted(root.rglob(pattern))
-        if matches:
-            return str(matches[0])
-    return None
-
-
-def _resolve_draft() -> str:
-    env = os.environ.get("DFLASH_DRAFT")
-    if env:
-        found = _find_draft_model(Path(env))
-        if found:
-            return found
-        raise FileNotFoundError(f"DFLASH_DRAFT does not point to a draft GGUF/safetensors: {env}")
-    for c in (_LOCAL_DRAFT_FILE, _LOCAL_DRAFT_ROOT):
-        found = _find_draft_model(c)
-        if found:
-            return found
-    raise FileNotFoundError(
-        f"DFlash draft not found. Set DFLASH_DRAFT or place a file under {_LOCAL_DRAFT_ROOT}"
-    )
-
-
-def _require_file(path: str, label: str):
-    if not Path(path).is_file():
-        raise FileNotFoundError(f"{label} not found: {path}")
-
-
-def _run_timed(cmd, timeout: int, label: str):
-    """Run a subprocess, return (CompletedProcess, wall_seconds)."""
-    t0 = time.perf_counter()
-    r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
-    wall_s = time.perf_counter() - t0
-    if r.returncode != 0:
-        tail = (r.stderr or r.stdout or "<no output>").strip()[-2000:]
-        raise RuntimeError(f"{label} exited {r.returncode}: {tail}")
-    return r, wall_s
-
-
-def tokenize_to_file(tok, text: str, path: Path) -> int:
-    ids = tok.encode(text, add_special_tokens=False)
-    with open(path, "wb") as f:
-        for t in ids:
-            f.write(struct.pack("<i", int(t)))
-    return len(ids)
-
-
-def _auto_max_ctx(n_prompt: int, n_gen: int) -> int:
-    pad = 64
-    return ((n_prompt + n_gen + pad + 255) // 256) * 256
-
-
-# ── parsing ────────────────────────────────────────────────────────────────
-_RE_DECODE_TPS = re.compile(r"(\d+(?:\.\d+)?)\s+tok/s")
-_RE_DF_PREFILL = re.compile(r"\[prefill\]\s+(\d+)\s+tokens\s+in\s+(\d+(?:\.\d+)?)\s*s")
-_RE_AL = re.compile(r"avg commit/step=(\d+(?:\.\d+)?)")
-_RE_TIMING_LINE = re.compile(r"^\s+([a-z_]+)\s+(\d+(?:\.\d+)?)\s*$")
-
-
-def _parse_dflash(stdout: str) -> dict:
-    """Parse test_dflash stdout into {prefill_s, decode_tps, al, stages{}}."""
-    m_pf = _RE_DF_PREFILL.search(stdout)
-    m_al = _RE_AL.search(stdout)
-    # decode tps line is the LAST tok/s in the file ("[dflash] generated ... -> X tok/s")
-    matches = list(_RE_DECODE_TPS.finditer(stdout))
-    if not (m_pf and m_al and matches):
-        raise RuntimeError(f"test_dflash parse failed:\n{stdout[-1500:]}")
-    decode_tps = float(matches[-1].group(1))
-
-    # Per-stage timing block (lines like "  draft_compute  14.70")
-    stages = {}
-    in_block = False
-    for line in stdout.splitlines():
-        if line.startswith("[timing]"):
-            in_block = True
-            continue
-        if in_block:
-            if line.startswith("[") or line.startswith("---"):
-                # "  ----- sum     132.20" ends block; "[dflash] generated…" too
-                if "----- sum" in line:
-                    m = re.search(r"sum\s+(\d+(?:\.\d+)?)", line)
-                    if m:
-                        stages["sum"] = float(m.group(1))
-                    in_block = False
-                    continue
-                if line.startswith("["):
-                    in_block = False
-                    continue
-            m = _RE_TIMING_LINE.match(line)
-            if m:
-                stages[m.group(1)] = float(m.group(2))
-
-    return {
-        "prefill_s": float(m_pf.group(2)),
-        "n_prompt_seen": int(m_pf.group(1)),
-        "decode_tps": decode_tps,
-        "al": float(m_al.group(1)),
-        "stages": stages,
-    }
-
-
-def _parse_ar(stdout: str) -> dict:
-    """Parse test_generate stdout → {decode_tps, n_gen}."""
-    m = re.search(r"\[gen\]\s+(\d+)\s+new tokens in\s+(\d+(?:\.\d+)?)\s*s\s*->\s*(\d+(?:\.\d+)?)\s+tok/s",
-                  stdout)
-    if not m:
-        raise RuntimeError(f"test_generate parse failed:\n{stdout[-1500:]}")
-    return {
-        "decode_tps": float(m.group(3)),
-        "decode_s": float(m.group(2)),
-        "n_gen": int(m.group(1)),
-    }
-
-
-# ── runners ────────────────────────────────────────────────────────────────
-def run_ar(path: Path, n_gen: int):
-    out_bin = TMPDIR / "ar_out.bin"
-    r, wall_s = _run_timed(
-        [TEST_GENERATE, TARGET, str(path), str(n_gen), str(out_bin)],
-        timeout=600, label="test_generate",
-    )
-    p = _parse_ar(r.stdout)
-    p["wall_s"] = wall_s
-    return p
-
-
-def run_df(path: Path, n_prompt: int, n_gen: int, budget: int = None):
-    if budget is None:
-        budget = BUDGET
-    max_ctx = _auto_max_ctx(n_prompt, n_gen)
-    out_bin = TMPDIR / "df_out.bin"
-    r, wall_s = _run_timed(
-        [
-            TEST_DFLASH, TARGET, DRAFT, str(path), str(n_gen), str(out_bin),
-            "--fast-rollback", "--ddtree",
-            f"--ddtree-budget={budget}", f"--max-ctx={max_ctx}",
-        ],
-        timeout=900, label="test_dflash",
-    )
-    p = _parse_dflash(r.stdout)
-    p["wall_s"] = wall_s
-    p["max_ctx"] = max_ctx
-    return p
-
-
-# ── prompt construction ────────────────────────────────────────────────────
-def _load_swe_rows():
-    import pyarrow.parquet as pq
-    t = pq.read_table(str(SWE_PARQUET))
-    return t.to_pandas()
-
-
-def _agent_user_message(row: dict, file_blocks_chars: int) -> str:
-    """Synthesise a Codex/Claude-Code style user turn.
-
-    Structure mimics what an agent client actually sends after a few tool
-    calls have run (read_file results pasted into history).
-    """
-    repo = row["repo"]
-    iid = row["instance_id"]
-    problem = row["problem_statement"] or ""
-    patch = row["patch"] or ""
-    test_patch = row["test_patch"] or ""
-    hints = row["hints_text"] or ""
-
-    # Build a pool of "file content" — real code from the repo's patch +
-    # test_patch, repeated if needed to hit the target. This is the same
-    # shape Codex would have after a few read_file calls.
-    pool = "\n\n".join(p for p in (patch, test_patch, hints) if p)
-    if not pool:
-        pool = problem
-    # Repeat to reach target byte count. Padding is real code from the same
-    # repo, just chunked into multiple <tool_result> blocks so it looks
-    # like several read_file calls.
-    chunks = []
-    chunk_size = max(2000, file_blocks_chars // 6)
-    cur = 0
-    idx = 1
-    while cur < file_blocks_chars:
-        seg = pool[(cur % max(1, len(pool))) : (cur % max(1, len(pool))) + chunk_size]
-        if not seg:
-            seg = pool[:chunk_size]
-        chunks.append(
-            f"<tool_result tool=\"read_file\" path=\"{repo}/_ctx_{idx}.py\">\n{seg}\n</tool_result>"
-        )
-        cur += len(seg)
-        idx += 1
-
-    file_blocks = "\n\n".join(chunks)
-    return (
-        f"Repository: {repo}\n"
-        f"Instance: {iid}\n\n"
-        f"## Issue\n{problem}\n\n"
-        f"## Context I gathered\n"
-        f"I ran `read_file` on the relevant modules. Their contents are:\n\n"
-        f"{file_blocks}\n\n"
-        f"## Task\n"
-        f"Investigate the bug and reply with a single tool call to `apply_patch` "
-        f"that fixes it. Keep the patch minimal."
-    )
-
-
-def build_prompt(tok, sys_prompt_path: Path, row, target_tokens: int) -> tuple:
-    """Build a chat-templated prompt that hits ``target_tokens`` ±20%.
-
-    Returns (text, n_tokens).
-    """
-    sys_text = sys_prompt_path.read_text(encoding="utf-8")
-    # Iteratively grow file_blocks_chars until we hit target.
-    # Empirically ~3.5 chars/token for Qwen on code.
-    sys_tokens = len(tok.encode(sys_text, add_special_tokens=False))
-    overhead = 200  # chat template + scaffolding
-    target_user_tokens = max(256, target_tokens - sys_tokens - overhead)
-    chars = max(1024, target_user_tokens * 4)
-
-    for _ in range(6):
-        user_text = _agent_user_message(row, chars)
-        msgs = [
-            {"role": "system", "content": sys_text},
-            {"role": "user", "content": user_text},
-        ]
-        # Try chat template; fall back to plain concat if tokenizer has none.
-        try:
-            text = tok.apply_chat_template(
-                msgs, tokenize=False, add_generation_prompt=True,
-                enable_thinking=False,
-            )
-        except Exception:
-            text = sys_text + "\n\n" + user_text + "\n\n"
-        n = len(tok.encode(text, add_special_tokens=False))
-        if abs(n - target_tokens) / target_tokens < 0.20:
-            return text, n
-        # binary search-ish: scale chars by ratio
-        chars = max(512, int(chars * (target_tokens / max(1, n))))
-    return text, n
-
-
-def select_rows_for_bucket(df, target_tokens, n_sample, seed=42):
-    """Pick rows whose problem_statement is small enough that we can grow to
-    target without truncating the issue itself."""
-    # Just shuffle — _agent_user_message handles padding to any size.
-    return df.sample(n=n_sample, random_state=seed).to_dict("records")
-
-
-# ── main bench loop ────────────────────────────────────────────────────────
-def main():
-    global DRAFT, BUDGET
-
-    p = argparse.ArgumentParser(description="DFlash agentic-workload benchmark")
-    p.add_argument("--budget", type=int, default=BUDGET)
-    p.add_argument("--n-sample", type=int, default=5,
-                   help="prompts per bucket (default 5)")
-    p.add_argument("--bucket", choices=list(BUCKETS) + ["all"], default="all")
-    p.add_argument("--n-gen", type=int, default=N_GEN)
-    p.add_argument("--out", type=str, default=str(TMPDIR / "bench_agent_results.json"))
-    p.add_argument("--skip-ar", action="store_true",
-                   help="skip the AR baseline (useful for budget sweeps)")
-    args = p.parse_args()
-    BUDGET = args.budget
-
-    DRAFT = _resolve_draft()
-    _require_file(TARGET, "target GGUF")
-    _require_file(TEST_DFLASH, "test_dflash binary")
-    if not args.skip_ar:
-        _require_file(TEST_GENERATE, "test_generate binary")
-    _require_file(str(SWE_PARQUET), "SWE-bench Verified parquet")
-    _require_file(str(SYS_PROMPT_SMALL), "small Codex system prompt fixture")
-    _require_file(str(SYS_PROMPT_LARGE), "large Codex system prompt fixture")
-
-    print(f"[bench-agent] target    = {TARGET}", flush=True)
-    print(f"[bench-agent] draft     = {DRAFT}", flush=True)
-    print(f"[bench-agent] tokenizer = {TOKENIZER}", flush=True)
-    print(f"[bench-agent] budget    = {BUDGET}  n_gen = {args.n_gen}  n_sample = {args.n_sample}",
-          flush=True)
-
-    from transformers import AutoTokenizer
-    tok = AutoTokenizer.from_pretrained(TOKENIZER, trust_remote_code=True)
-
-    df = _load_swe_rows()
-    print(f"[bench-agent] loaded {len(df)} SWE-bench Verified rows", flush=True)
-
-    bucket_keys = list(BUCKETS) if args.bucket == "all" else [args.bucket]
-    results = {}
-    load_s_estimate = None  # calibrated from first DFlash run
-
-    for bk in bucket_keys:
-        cfg = BUCKETS[bk]
-        target = cfg["target"]
-        sys_path = cfg["sys"]
-        rows = select_rows_for_bucket(df, target, args.n_sample, seed=42)
-
-        print(f"\n[bench-agent] === bucket {bk} (target ~{target} tok, n={len(rows)}) ===",
-              flush=True)
-        per_prompt = []
-        for i, row in enumerate(rows):
-            text, n = build_prompt(tok, sys_path, row, target)
-            path = TMPDIR / f"agent_{bk}_{i:02d}.bin"
-            tokenize_to_file(tok, text, path)
-
-            try:
-                df_res = run_df(path, n, args.n_gen)
-            except Exception as e:
-                print(f"  [{i+1:02d}/{len(rows)}] n={n:5d}  DFlash FAILED: {e}",
-                      flush=True)
-                continue
-
-            # calibrate load_s from the first DFlash run if not done yet
-            if load_s_estimate is None:
-                load_s_estimate = max(
-                    0.0,
-                    df_res["wall_s"] - df_res["prefill_s"]
-                    - args.n_gen / max(1e-6, df_res["decode_tps"])
-                )
-                print(f"  [calibration] estimated model_load_s = {load_s_estimate:.2f}",
-                      flush=True)
-
-            ar_res = None
-            if not args.skip_ar:
-                try:
-                    ar_res = run_ar(path, args.n_gen)
-                except Exception as e:
-                    print(f"  [{i+1:02d}/{len(rows)}] n={n:5d}  AR FAILED: {e}",
-                          flush=True)
-
-            entry = {
-                "bucket": bk, "i": i, "instance_id": row["instance_id"],
-                "n_prompt": n, "n_gen": args.n_gen,
-                "df": df_res, "ar": ar_res, "load_s_est": load_s_estimate,
-            }
-
-            # Derived numbers (per-prompt, AR vs DFlash)
-            df_decode_s = args.n_gen / max(1e-6, df_res["decode_tps"])
-            df_total_s = df_res["prefill_s"] + df_decode_s
-            df_ttft_s = df_res["prefill_s"] + 1.0 / max(1e-6, df_res["decode_tps"])
-            entry["df_total_s"] = df_total_s
-            entry["df_ttft_s"] = df_ttft_s
-
-            line = (
-                f"  [{i+1:02d}/{len(rows)}] n={n:5d}  "
-                f"DF prefill={df_res['prefill_s']:6.2f}s ({n/df_res['prefill_s']:6.1f} tok/s)  "
-                f"decode={df_res['decode_tps']:6.2f} tok/s  "
-                f"AL={df_res['al']:5.2f}  "
-                f"TTFT={df_ttft_s:6.2f}s  total={df_total_s:6.2f}s"
-            )
-            if ar_res is not None:
-                ar_decode_s = args.n_gen / max(1e-6, ar_res["decode_tps"])
-                ar_prefill_s = max(0.0, ar_res["wall_s"] - load_s_estimate - ar_decode_s)
-                ar_ttft_s = ar_prefill_s + 1.0 / max(1e-6, ar_res["decode_tps"])
-                ar_total_s = ar_prefill_s + ar_decode_s
-                entry["ar_prefill_s_est"] = ar_prefill_s
-                entry["ar_total_s"] = ar_total_s
-                entry["ar_ttft_s"] = ar_ttft_s
-                entry["speedup_decode"] = df_res["decode_tps"] / max(1e-6, ar_res["decode_tps"])
-                entry["speedup_total"] = ar_total_s / max(1e-6, df_total_s)
-                line += (
-                    f"  ||  AR prefill≈{ar_prefill_s:6.2f}s  "
-                    f"decode={ar_res['decode_tps']:5.2f} tok/s  "
-                    f"total={ar_total_s:6.2f}s  "
-                    f"speedup decode={entry['speedup_decode']:.2f}x "
-                    f"total={entry['speedup_total']:.2f}x"
-                )
-            print(line, flush=True)
-            per_prompt.append(entry)
-
-        # bucket aggregates
-        if per_prompt:
-            def _mean(xs): return sum(xs) / len(xs) if xs else 0.0
-            agg = {
-                "n_samples": len(per_prompt),
-                "n_prompt_mean": _mean([e["n_prompt"] for e in per_prompt]),
-                "df_prefill_s_mean": _mean([e["df"]["prefill_s"] for e in per_prompt]),
-                "df_decode_tps_mean": _mean([e["df"]["decode_tps"] for e in per_prompt]),
-                "df_al_mean": _mean([e["df"]["al"] for e in per_prompt]),
-                "df_ttft_s_mean": _mean([e["df_ttft_s"] for e in per_prompt]),
-                "df_total_s_mean": _mean([e["df_total_s"] for e in per_prompt]),
-                "stages_mean": {
-                    k: _mean([e["df"]["stages"].get(k, 0.0) for e in per_prompt])
-                    for k in sorted({k for e in per_prompt for k in e["df"]["stages"]})
-                },
-            }
-            ar_entries = [e for e in per_prompt if e.get("ar") is not None]
-            if ar_entries:
-                agg["ar_prefill_s_est_mean"] = _mean([e["ar_prefill_s_est"] for e in ar_entries])
-                agg["ar_decode_tps_mean"] = _mean([e["ar"]["decode_tps"] for e in ar_entries])
-                agg["ar_ttft_s_mean"] = _mean([e["ar_ttft_s"] for e in ar_entries])
-                agg["ar_total_s_mean"] = _mean([e["ar_total_s"] for e in ar_entries])
-                agg["speedup_decode_mean"] = _mean([e["speedup_decode"] for e in ar_entries])
-                agg["speedup_total_mean"] = _mean([e["speedup_total"] for e in ar_entries])
-            results[bk] = {"per_prompt": per_prompt, "agg": agg}
-
-            print(f"\n  [{bk}] mean: n={agg['n_prompt_mean']:.0f}  "
-                  f"DF prefill={agg['df_prefill_s_mean']:.2f}s  "
-                  f"decode={agg['df_decode_tps_mean']:.2f} tok/s  "
-                  f"AL={agg['df_al_mean']:.2f}  "
-                  f"TTFT={agg['df_ttft_s_mean']:.2f}s  "
-                  f"total={agg['df_total_s_mean']:.2f}s",
-                  flush=True)
-            if ar_entries:
-                print(f"  [{bk}] mean: AR prefill≈{agg['ar_prefill_s_est_mean']:.2f}s  "
-                      f"decode={agg['ar_decode_tps_mean']:.2f} tok/s  "
-                      f"total={agg['ar_total_s_mean']:.2f}s  "
-                      f"|| speedup decode={agg['speedup_decode_mean']:.2f}x  "
-                      f"TOTAL={agg['speedup_total_mean']:.2f}x",
-                      flush=True)
-
-    # ── final comparison vs RESULTS.md headline ─────────────────────────
-    print("\n[bench-agent] === COMPARISON vs RESULTS.md HumanEval headline ===")
-    print(f"{'Bucket':>8s}  {'n_tok':>6s}  {'AR tps':>7s}  {'DF tps':>7s}  "
-          f"{'AL':>5s}  {'TTFT':>7s}  {'Total':>7s}  {'sp_dec':>7s}  {'sp_tot':>7s}")
-    print(f"{'HumanEv':>8s}  {' ~120':>6s}  {' 37.78':>7s}  {'129.52':>7s}  "
-          f"{' 8.31':>5s}  {'   --':>7s}  {'   --':>7s}  {' 3.43x':>7s}  {'   --':>7s}  "
-          f"(from RESULTS.md)")
-    for bk, r in results.items():
-        a = r["agg"]
-        sp_d = a.get("speedup_decode_mean", 0.0)
-        sp_t = a.get("speedup_total_mean", 0.0)
-        ar_tps = a.get("ar_decode_tps_mean", 0.0)
-        print(f"{bk:>8s}  {a['n_prompt_mean']:6.0f}  {ar_tps:7.2f}  "
-              f"{a['df_decode_tps_mean']:7.2f}  {a['df_al_mean']:5.2f}  "
-              f"{a['df_ttft_s_mean']:6.2f}s  {a['df_total_s_mean']:6.2f}s  "
-              f"{sp_d:6.2f}x  {sp_t:6.2f}x")
-
-    # write JSON (per-prompt and agg)
-    out_path = Path(args.out)
-    out_path.parent.mkdir(parents=True, exist_ok=True)
-    with open(out_path, "w") as f:
-        json.dump(results, f, indent=2, default=str)
-    print(f"\n[bench-agent] wrote {out_path}", flush=True)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/scripts/bench_agent_loop.py b/server/scripts/bench_agent_loop.py
deleted file mode 100644
index 3f4a668b3..000000000
--- a/server/scripts/bench_agent_loop.py
+++ /dev/null
@@ -1,190 +0,0 @@
-"""B.6: agent-loop bench using real Claude Code session messages.
-
-Extracts the first N user-text turns from a session JSONL, replays them
-sequentially through the dflash server, and reports per-turn latency
-under two configs: prefix-cache enabled vs disabled.
-
-Usage:
-    python3 dflash/scripts/bench_agent_loop.py [--turns N] [--session PATH]
-
-Default session = most recent JSONL under
-~/.claude/projects/-home-peppi-Dev-lucebox-hub/.
-
-Each turn's user text is the real human prompt from the session. Assistant
-replies are generated by the dflash server (small max_tokens to keep
-the bench fast); the synthesized history grows turn-by-turn.
-
-Compares cold (--prefix-cache-slots=0) vs warm (--prefix-cache-slots=4).
-Reports total wall time, per-turn latency, and per-turn ratio.
-"""
-import argparse
-import json
-import os
-import signal
-import subprocess
-import sys
-import time
-import urllib.error
-import urllib.request
-from pathlib import Path
-
-ROOT          = Path(__file__).resolve().parent.parent.parent
-TARGET        = Path.home() / "models/qwen3.6-27b/Qwen3.6-27B-UD-Q4_K_XL.gguf"
-DRAFT         = Path.home() / "models/qwen3.6-27b-dflash"
-SERVER_BIN    = ROOT / "dflash/build/dflash_server"
-SESSION_DIR   = Path.home() / ".claude/projects/-home-peppi-Dev-lucebox-hub"
-
-
-def extract_user_turns(jsonl_path: Path, limit: int) -> list[str]:
-    """Pull the first `limit` user-text messages from a Claude Code session."""
-    turns = []
-    with open(jsonl_path) as f:
-        for ln in f:
-            try:
-                rec = json.loads(ln)
-            except json.JSONDecodeError:
-                continue
-            if rec.get("type") != "user":
-                continue
-            msg = rec.get("message", {})
-            content = msg.get("content", "")
-            if isinstance(content, str) and content.strip() and not content.startswith("<"):
-                # Skip command-name records (they start with <command-message> etc).
-                turns.append(content.strip())
-                if len(turns) >= limit:
-                    break
-    return turns
-
-
-def chat_post(port: int, payload: dict, timeout=600) -> str:
-    body = json.dumps(payload).encode()
-    req = urllib.request.Request(
-        f"http://127.0.0.1:{port}/v1/chat/completions",
-        data=body, headers={"Content-Type": "application/json"})
-    resp = urllib.request.urlopen(req, timeout=timeout)
-    data = json.loads(resp.read())
-    return data["choices"][0]["message"]["content"]
-
-
-def wait_server_up(port: int, proc: subprocess.Popen, timeout=180) -> bool:
-    deadline = time.time() + timeout
-    while time.time() < deadline:
-        if proc.poll() is not None:
-            return False
-        try:
-            urllib.request.urlopen(f"http://127.0.0.1:{port}/v1/models", timeout=1).read()
-            return True
-        except (urllib.error.URLError, ConnectionResetError, TimeoutError):
-            time.sleep(1)
-    return False
-
-
-def run_config(label: str, port: int, slots: int, user_turns: list[str],
-                max_tokens: int, log_path: Path) -> list[float]:
-    """Spin up server with --prefix-cache-slots=slots, replay turns, return latencies."""
-    log_f = open(log_path, "w")
-    proc = subprocess.Popen(
-        [str(SERVER_BIN), str(TARGET),
-         "--draft", str(DRAFT),
-         "--max-ctx", "4096", "--port", str(port),
-         "--prefix-cache-slots", str(slots)],
-        stdout=log_f, stderr=subprocess.STDOUT, bufsize=1)
-
-    if not wait_server_up(port, proc):
-        log_f.close()
-        out = log_path.read_text()[-1500:]
-        proc.send_signal(signal.SIGINT)
-        try: proc.wait(timeout=10)
-        except subprocess.TimeoutExpired: proc.kill()
-        raise RuntimeError(f"{label}: server didn't come up\n{out}")
-
-    print(f"\n--- {label} (slots={slots}) ---", flush=True)
-
-    history = []
-    SYSTEM = "You are a precise coding assistant for the lucebox-hub repo. Answer concisely."
-    latencies = []
-    try:
-        for i, user_text in enumerate(user_turns):
-            history.append({"role": "user", "content": user_text})
-            msgs = [{"role": "system", "content": SYSTEM}, *history]
-            payload = {"model": "luce-dflash", "messages": msgs,
-                       "max_tokens": max_tokens, "stream": False}
-            t0 = time.time()
-            try:
-                reply = chat_post(port, payload, timeout=300)
-            except Exception as e:
-                print(f"  turn {i+1}: ERROR {e}")
-                latencies.append(float("nan"))
-                continue
-            dt = time.time() - t0
-            latencies.append(dt)
-            history.append({"role": "assistant", "content": reply})
-            print(f"  turn {i+1}: {dt:.2f}s  reply={reply[:50]!r}", flush=True)
-    finally:
-        proc.send_signal(signal.SIGINT)
-        try: proc.wait(timeout=10)
-        except subprocess.TimeoutExpired: proc.kill()
-        log_f.close()
-
-    return latencies
-
-
-def main():
-    ap = argparse.ArgumentParser()
-    ap.add_argument("--turns", type=int, default=5,
-                    help="Number of user turns to replay")
-    ap.add_argument("--max-tokens", type=int, default=8,
-                    help="max_tokens per response (kept small to bound bench time)")
-    ap.add_argument("--session", type=Path, default=None,
-                    help="Path to session JSONL; default = most recent under "
-                         f"{SESSION_DIR}")
-    args = ap.parse_args()
-
-    if not TARGET.exists() or not SERVER_BIN.exists():
-        print(f"SKIP: prereqs missing (target={TARGET.exists()} bin={SERVER_BIN.exists()})")
-        return 0
-
-    if args.session:
-        session = args.session
-    else:
-        candidates = sorted(SESSION_DIR.glob("*.jsonl"),
-                            key=lambda p: p.stat().st_mtime, reverse=True)
-        if not candidates:
-            print(f"No session JSONL under {SESSION_DIR}")
-            return 1
-        session = candidates[0]
-    print(f"Session: {session.name}", flush=True)
-
-    user_turns = extract_user_turns(session, args.turns)
-    if len(user_turns) < args.turns:
-        print(f"Only got {len(user_turns)} turns")
-    print(f"Extracted {len(user_turns)} user turns:")
-    for i, t in enumerate(user_turns):
-        print(f"  [{i+1}] {t[:80]!r}{'...' if len(t)>80 else ''}")
-
-    # Cold config: cache disabled (slots=0) → every turn re-prefills full history
-    cold = run_config("COLD (cache disabled)", port=18290, slots=0,
-                       user_turns=user_turns, max_tokens=args.max_tokens,
-                       log_path=Path("/tmp/bench_cold.log"))
-
-    # Warm config: cache enabled (slots=4) → multi-point inline-snap
-    warm = run_config("WARM (cache enabled)", port=18291, slots=4,
-                       user_turns=user_turns, max_tokens=args.max_tokens,
-                       log_path=Path("/tmp/bench_warm.log"))
-
-    print("\n=== Per-turn latency ===", flush=True)
-    print(f"{'turn':>4} {'cold':>8} {'warm':>8} {'speedup':>8}")
-    total_cold = total_warm = 0.0
-    for i, (c, w) in enumerate(zip(cold, warm), start=1):
-        speedup = (c / w) if (w and w > 0) else float("nan")
-        print(f"{i:>4} {c:>8.2f} {w:>8.2f} {speedup:>7.2f}x")
-        total_cold += c; total_warm += w
-    overall = total_cold / total_warm if total_warm else float("nan")
-    print(f"\ntotal_cold={total_cold:.2f}s  total_warm={total_warm:.2f}s  "
-          f"overall speedup={overall:.2f}x")
-
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/server/scripts/bench_daemon.py b/server/scripts/bench_daemon.py
deleted file mode 100644
index 1f7ea80bc..000000000
--- a/server/scripts/bench_daemon.py
+++ /dev/null
@@ -1,140 +0,0 @@
-"""Daemon-mode HE bench. Hits /v1/chat/completions with the same 10 HE
-prompts as bench_he.py and reports mean tok/s.
-
-Streams the response and reports two numbers per prompt:
-
-  * wall    — total HTTP time (tokenize + prefill + decode + HTTP / JSON)
-  * decode  — first-token → last-token elapsed, matching bench_he.py's
-              tok/s (excludes prefill + setup)
-
-Compare `decode` against bench_he.py to verify the C++ decode path is as
-fast under the daemon as under a one-shot test_dflash invocation.
-
-Start the server first (same config the published numbers use):
-    DFLASH27B_KV_TQ3=1 ./build/dflash_server models/Qwen3.6-27B-Q4_K_M.gguf \\
-        --ddtree --ddtree-budget 22 --max-ctx 16384 --port 8000
-
-Then:
-    python3 scripts/bench_daemon.py --url http://localhost:8000 --n-gen 256
-"""
-import argparse
-import json
-import time
-import urllib.request
-from pathlib import Path
-import sys
-
-# Reuse the exact same 10 HE prompts bench_he.py uses.
-sys.path.insert(0, str(Path(__file__).resolve().parent))
-from bench_he import PROMPTS
-
-
-def run(url: str, prompt: str, n_gen: int) -> tuple[int, float, float]:
-    """POST to /v1/chat/completions with stream=true. Return (n_tok, wall_secs,
-    decode_secs) where decode_secs starts at the first streamed token (after
-    prefill) and ends at the last token."""
-    body = json.dumps({
-        "model": "luce-dflash",
-        "messages": [{"role": "user", "content": prompt}],
-        "max_tokens": n_gen,
-        "stream": True,
-    }).encode()
-    req = urllib.request.Request(
-        url + "/v1/chat/completions",
-        data=body,
-        headers={"Content-Type": "application/json",
-                 "Accept": "text/event-stream"},
-    )
-    t0 = time.perf_counter()
-    t_first = 0.0
-    t_last = 0.0
-    n_tok = 0
-    with urllib.request.urlopen(req, timeout=600) as r:
-        for raw in r:
-            line = raw.decode("utf-8", errors="replace").rstrip()
-            if not line.startswith("data:"):
-                continue
-            payload = line[5:].strip()
-            if payload == "[DONE]":
-                break
-            try:
-                chunk = json.loads(payload)
-            except json.JSONDecodeError:
-                continue
-            choices = chunk.get("choices") or []
-            if not choices:
-                continue
-            delta = choices[0].get("delta") or {}
-            # Count tokens by content / reasoning deltas. Tool-call deltas
-            # aren't counted — they arrive as a single final chunk.
-            if delta.get("content") or delta.get("reasoning_content"):
-                if n_tok == 0:
-                    t_first = time.perf_counter()
-                n_tok += 1
-                t_last = time.perf_counter()
-    wall = time.perf_counter() - t0
-    decode = (t_last - t_first) if n_tok > 1 else 0.0
-    return n_tok, wall, decode
-
-
-def main():
-    ap = argparse.ArgumentParser()
-    ap.add_argument("--url", default="http://localhost:8000",
-                    help="Base URL of the running server (no /v1 suffix)")
-    ap.add_argument("--n-gen", type=int, default=256)
-    ap.add_argument("--warmup", action="store_true",
-                    help="Run the first prompt once before timing to discard "
-                         "cold-start effects (model is already resident, but "
-                         "the first request allocates the decode VMM chunks).")
-    args = ap.parse_args()
-
-    if args.warmup:
-        print("[bench] warmup...", flush=True)
-        run(args.url, PROMPTS[0][1], args.n_gen)
-
-    print(f"[bench] daemon API  n_gen={args.n_gen}  url={args.url}", flush=True)
-    print(f"{'prompt':28s}  {'n_tok':>5s} {'wall_s':>7s} {'dec_s':>7s} "
-          f"{'wall_tps':>9s} {'dec_tps':>9s}")
-    print("-" * 72)
-    wall_tps_list: list[float] = []
-    dec_tps_list: list[float] = []
-    total_tok = 0
-    total_wall = 0.0
-    total_decode = 0.0
-    for name, text in PROMPTS:
-        try:
-            n_tok, wall, decode = run(args.url, text, args.n_gen)
-        except Exception as e:
-            print(f"  {name:26s}  FAILED: {e}", flush=True)
-            continue
-        if n_tok == 0:
-            print(f"  {name:26s}  {n_tok:5d} {wall:7.2f}    --         --        -- "
-                  "  (empty — daemon likely OOM'd)", flush=True)
-            continue
-        wall_tps = n_tok / wall
-        dec_tps = (n_tok - 1) / decode if decode > 0 else 0.0
-        wall_tps_list.append(wall_tps)
-        if dec_tps > 0:
-            dec_tps_list.append(dec_tps)
-            total_decode += decode
-        total_tok += n_tok
-        total_wall += wall
-        print(f"  {name:26s}  {n_tok:5d} {wall:7.2f} {decode:7.2f} "
-              f"{wall_tps:9.2f} {dec_tps:9.2f}", flush=True)
-
-    print("-" * 72)
-    if wall_tps_list:
-        print(f"wall tok/s mean:       {sum(wall_tps_list)/len(wall_tps_list):7.2f}  "
-              f"(HTTP + tokenize + prefill + decode)")
-        if dec_tps_list:
-            print(f"decode tok/s mean:     {sum(dec_tps_list)/len(dec_tps_list):7.2f}  "
-                  f"(first-token → last-token, matches bench_he.py's number)")
-            agg_dec = (total_tok - len(dec_tps_list)) / total_decode if total_decode > 0 else 0.0
-            print(f"decode tok/s aggregate:{agg_dec:7.2f}")
-            print(f"decode tok/s range:    {min(dec_tps_list):.2f} - {max(dec_tps_list):.2f}")
-    else:
-        print("no successful runs")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/scripts/bench_he.py b/server/scripts/bench_he.py
deleted file mode 100644
index a4da24f6d..000000000
--- a/server/scripts/bench_he.py
+++ /dev/null
@@ -1,463 +0,0 @@
-"""
-Bench DFlash test_dflash over multiple HumanEval-style prompts to get a stable
-average acceptance length. Single-prompt measurements are noisy — z-lab's 8.09
-AL on humaneval is averaged over 164 samples.
-
-Usage on lucebox:
-    python3 bench_he.py                 # run all 10 prompts with --fast-rollback
-    python3 bench_he.py --mode batched  # run without --fast-rollback for A/B
-"""
-import argparse
-import os
-import re
-import struct
-import subprocess
-import sys
-import tempfile
-from pathlib import Path
-
-from placement.backend_device import apply_backend_visible_devices
-from placement.test_dflash_args import TestDflashLaunchArgs
-
-
-ROOT = Path(__file__).resolve().parent.parent
-BIN_SUFFIX = ".exe" if os.name == "nt" else ""
-TARGET = os.environ.get(
-    "DFLASH_TARGET",
-    str(ROOT / "models" / "Qwen3.6-27B-Q4_K_M.gguf"),
-)
-_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q4_k_m.gguf"
-_LOCAL_DRAFT_ROOT = ROOT / "models" / "draft"
-DRAFT = None
-TEST_DFLASH = os.environ.get(
-    "DFLASH_BIN",
-    str(ROOT / "build" / f"test_dflash{BIN_SUFFIX}"),
-)
-TMPDIR = Path(tempfile.gettempdir()) / "dflash_bench"
-TMPDIR.mkdir(parents=True, exist_ok=True)
-
-PROMPTS = [
-    # (name, source_code)
-    (
-        "has_close_elements",
-        "from typing import List\n\n"
-        "def has_close_elements(numbers: List[float], threshold: float) -> bool:\n"
-        '    """Check if in given list of numbers, are any two numbers closer to each other than\n'
-        "    given threshold.\n"
-        "    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n"
-        "    False\n"
-        "    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n"
-        "    True\n"
-        '    """\n'
-        "    for",
-    ),
-    (
-        "separate_paren_groups",
-        "from typing import List\n\n"
-        "def separate_paren_groups(paren_string: str) -> List[str]:\n"
-        '    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n'
-        "    separate those group into separate strings and return the list of those.\n"
-        "    Separate groups are balanced (each open brace is properly closed) and not nested within each other\n"
-        "    Ignore any spaces in the input string.\n"
-        "    >>> separate_paren_groups('( ) (( )) (( )( ))')\n"
-        "    ['()', '(())', '(()())']\n"
-        '    """\n'
-        "    result = []\n"
-        "    current_string = []\n"
-        "    current_depth = 0\n"
-        "    for",
-    ),
-    (
-        "truncate_number",
-        "def truncate_number(number: float) -> float:\n"
-        '    """ Given a positive floating point number, it can be decomposed into\n'
-        "    and integer part (largest integer smaller than given number) and decimals\n"
-        "    (leftover part always smaller than 1).\n"
-        "\n"
-        "    Return the decimal part of the number.\n"
-        "    >>> truncate_number(3.5)\n"
-        "    0.5\n"
-        '    """\n'
-        "    return",
-    ),
-    (
-        "below_zero",
-        "from typing import List\n\n"
-        "def below_zero(operations: List[int]) -> bool:\n"
-        '    """ You\'re given a list of deposit and withdrawal operations on a bank account that starts with\n'
-        "    zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n"
-        "    at that point function should return True. Otherwise it should return False.\n"
-        "    >>> below_zero([1, 2, 3])\n"
-        "    False\n"
-        "    >>> below_zero([1, 2, -4, 5])\n"
-        "    True\n"
-        '    """\n'
-        "    balance = 0\n"
-        "    for op in",
-    ),
-    (
-        "mean_absolute_deviation",
-        "from typing import List\n\n"
-        "def mean_absolute_deviation(numbers: List[float]) -> float:\n"
-        '    """ For a given list of input numbers, calculate Mean Absolute Deviation\n'
-        "    around the mean of this dataset.\n"
-        "    Mean Absolute Deviation is the average absolute difference between each\n"
-        "    element and a centerpoint (mean in this case):\n"
-        "    MAD = average | x - x_mean |\n"
-        "    >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n"
-        "    1.0\n"
-        '    """\n'
-        "    mean =",
-    ),
-    (
-        "intersperse",
-        "from typing import List\n\n"
-        "def intersperse(numbers: List[int], delimeter: int) -> List[int]:\n"
-        "    \"\"\" Insert a number 'delimeter' between every two consecutive elements of input list `numbers'\n"
-        "    >>> intersperse([], 4)\n"
-        "    []\n"
-        "    >>> intersperse([1, 2, 3], 4)\n"
-        "    [1, 4, 2, 4, 3]\n"
-        '    """\n'
-        "    result = []\n"
-        "    for i, n in",
-    ),
-    (
-        "parse_nested_parens",
-        "from typing import List\n\n"
-        "def parse_nested_parens(paren_string: str) -> List[int]:\n"
-        '    """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.\n'
-        "    For each of the group, output the deepest level of nesting of parentheses.\n"
-        "    E.g. (()()) has maximum two levels of nesting while ((())) has three.\n"
-        "    >>> parse_nested_parens('(()()) ((())) () ((())()())')\n"
-        "    [2, 3, 1, 3]\n"
-        '    """\n'
-        "    def parse_paren_group(s):\n"
-        "        depth = 0\n"
-        "        max_depth = 0\n"
-        "        for c in",
-    ),
-    (
-        "filter_by_substring",
-        "from typing import List\n\n"
-        "def filter_by_substring(strings: List[str], substring: str) -> List[str]:\n"
-        '    """ Filter an input list of strings only for ones that contain given substring\n'
-        "    >>> filter_by_substring([], 'a')\n"
-        "    []\n"
-        "    >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')\n"
-        "    ['abc', 'bacd', 'array']\n"
-        '    """\n'
-        "    return",
-    ),
-    (
-        "sum_product",
-        "from typing import List, Tuple\n\n"
-        "def sum_product(numbers: List[int]) -> Tuple[int, int]:\n"
-        '    """ For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list.\n'
-        "    Empty sum should be equal to 0 and empty product should be equal to 1.\n"
-        "    >>> sum_product([])\n"
-        "    (0, 1)\n"
-        "    >>> sum_product([1, 2, 3, 4])\n"
-        "    (10, 24)\n"
-        '    """\n'
-        "    s = 0\n"
-        "    p = 1\n"
-        "    for n in",
-    ),
-    (
-        "rolling_max",
-        "from typing import List\n\n"
-        "def rolling_max(numbers: List[int]) -> List[int]:\n"
-        '    """ From a given list of integers, generate a list of rolling maximum element found until given moment\n'
-        "    in the sequence.\n"
-        "    >>> rolling_max([1, 2, 3, 2, 3, 4, 2])\n"
-        "    [1, 2, 3, 3, 3, 4, 4]\n"
-        '    """\n'
-        "    result = []\n"
-        "    running_max = None\n"
-        "    for n in numbers:\n"
-        "        if running_max is",
-    ),
-]
-
-
-def _find_draft_file(root: Path) -> str | None:
-    if root.is_file():
-        return str(root) if root.suffix in (".safetensors", ".gguf") else None
-    if not root.is_dir():
-        return None
-    for pattern in ("dflash-draft-*.gguf", "*.gguf", "model.safetensors"):
-        matches = sorted(root.rglob(pattern))
-        if matches:
-            return str(matches[0])
-    return None
-
-
-def _resolve_draft() -> str:
-    env = os.environ.get("DFLASH_DRAFT")
-    if env:
-        found = _find_draft_file(Path(env))
-        if found:
-            return found
-        raise FileNotFoundError(f"DFLASH_DRAFT does not point to a draft file: {env}")
-
-    for candidate in (_LOCAL_DRAFT_FILE, _LOCAL_DRAFT_ROOT):
-        found = _find_draft_file(candidate)
-        if found:
-            return found
-
-    raise FileNotFoundError(
-        "draft model file not found. Expected one of:\n"
-        f"  - {_LOCAL_DRAFT_FILE}\n"
-        "Download it as documented in the README, or set DFLASH_DRAFT to an explicit .safetensors/.gguf file or directory."
-    )
-
-
-def _require_file(path: str, label: str):
-    if not Path(path).is_file():
-        raise FileNotFoundError(f"{label} not found: {path}")
-
-
-def _tokenizer_slug(tokenizer_id: str) -> str:
-    """Filesystem-safe slug for tokenizer cache keying."""
-    return re.sub(r"[^A-Za-z0-9._-]+", "_", tokenizer_id)
-
-
-def _prompt_path(i: int, tokenizer_slug: str) -> Path:
-    return TMPDIR / f"he_prompt_{tokenizer_slug}_{i:02d}.bin"
-
-
-def tokenize_prompt(prompt: str, out_path: Path, tokenizer) -> int:
-    ids = tokenizer.encode(prompt, add_special_tokens=False)
-    with open(out_path, "wb") as f:
-        for tid in ids:
-            f.write(struct.pack("<i", int(tid)))
-    return len(ids)
-
-
-def run_test_dflash(prompt_path: Path, n_gen: int, fast_rollback: bool,
-                    ddtree_budget: int | None = None,
-                    ddtree_temp: float | None = None,
-                    ddtree_no_chain_seed: bool = False,
-                    extra_args: list[str] | None = None,
-                    extra_env: dict[str, str] | None = None) -> dict:
-    out_bin = TMPDIR / "he_bench_out.bin"
-    cmd = [
-        TEST_DFLASH, TARGET, DRAFT, str(prompt_path), str(n_gen), str(out_bin),
-    ]
-    if fast_rollback:
-        cmd.append("--fast-rollback")
-    if ddtree_budget is not None:
-        cmd.append("--ddtree")
-        cmd.append(f"--ddtree-budget={ddtree_budget}")
-    if ddtree_temp is not None:
-        cmd.append(f"--ddtree-temp={ddtree_temp}")
-    if ddtree_no_chain_seed:
-        cmd.append("--ddtree-no-chain-seed")
-    if extra_args:
-        cmd.extend(extra_args)
-    env = os.environ.copy()
-    if extra_env:
-        env.update(extra_env)
-    r = subprocess.run(cmd, capture_output=True, text=True, timeout=600, env=env)
-    if r.returncode != 0:
-        print("STDERR:", r.stderr[-2000:])
-        raise RuntimeError(f"test_dflash exited {r.returncode}")
-
-    # Parse output. The target layer-split harness prints both prefill and
-    # decode lines, so avoid the older "first tok/s wins" regexp there.
-    out = r.stdout
-    m_prefill = re.search(
-        r"\[target-split\] prefill tokens=(\d+) time=(\d+(?:\.\d+)?) s speed=(\d+(?:\.\d+)?) tok/s",
-        out,
-    )
-    m_decode_split = re.search(
-        r"\[target-split-dflash\] decode tokens=(\d+) time=(\d+(?:\.\d+)?) s speed=(\d+(?:\.\d+)?) tok/s",
-        out,
-    )
-    m_decode_default = re.search(
-        r"\[dflash\] generated \d+ tokens in \d+(?:\.\d+)? s\s+->\s+(\d+(?:\.\d+)?) tok/s",
-        out,
-    )
-    m_tps = re.search(r"(\d+(?:\.\d+)?)\s+tok/s", out)
-    m_commit = re.search(r"avg commit/step=(\d+(?:\.\d+)?)", out)
-    m_accept = re.search(r"accepted=(\d+)/(\d+) \((\d+(?:\.\d+)?)%", out)
-    m_steps = re.search(r"(\d+) draft steps", out)
-    if not ((m_decode_split or m_decode_default or m_tps) and m_commit and m_accept and m_steps):
-        print("STDOUT tail:", out[-2000:])
-        raise RuntimeError("failed to parse output")
-    if m_decode_split:
-        tok_s = float(m_decode_split.group(3))
-    elif m_decode_default:
-        tok_s = float(m_decode_default.group(1))
-    else:
-        tok_s = float(m_tps.group(1))
-    return {
-        "tok_s": tok_s,
-        "prefill_tok_s": float(m_prefill.group(3)) if m_prefill else None,
-        "commit_per_step": float(m_commit.group(1)),
-        "accepted": int(m_accept.group(1)),
-        "total_draft_pos": int(m_accept.group(2)),
-        "pct": float(m_accept.group(3)),
-        "steps": int(m_steps.group(1)),
-    }
-
-
-def main():
-    global DRAFT
-    DRAFT = _resolve_draft()
-    _require_file(TARGET, "target GGUF")
-    _require_file(TEST_DFLASH, "test_dflash binary")
-
-    ap = argparse.ArgumentParser()
-    ap.add_argument("--n-gen", type=int, default=128)
-    ap.add_argument("--mode", choices=["fast", "batched"], default="fast")
-    ap.add_argument("--skip-tokenize", action="store_true")
-    ap.add_argument("--ddtree-budget", type=int, default=None,
-                    help="Enable DDTree mode with this node budget (e.g. 15, 32, 64)")
-    ap.add_argument("--ddtree-temp", type=float, default=None,
-                    help="Sharpen draft logits with this temperature (T<1 widens top-1/top-2 gap)")
-    ap.add_argument("--ddtree-no-chain-seed", action="store_true",
-                    help="Use paper's pure best-first (no chain pre-seed)")
-    ap.add_argument("--draft-feature-mirror", action="store_true",
-                    help="Use the draft-side target feature mirror path")
-    ap.add_argument("--peer-access", action="store_true",
-                    help="Prefer CUDA P2P memcpy between GPUs when available (else host-staged copy)")
-    ap.add_argument("--target-gpu", type=int, default=None,
-                    help="Visible CUDA device id for the target backend")
-    ap.add_argument("--draft-gpu", type=int, default=None,
-                    help="Visible CUDA device id for the draft backend")
-    ap.add_argument("--target-gpus", default=None,
-                    help="Comma-separated target GPU ids for the layer-split harness")
-    ap.add_argument("--target-layer-split", default=None,
-                    help="Comma-separated layer split weights matching --target-gpus")
-    ap.add_argument("--target-split-load-draft", action="store_true",
-                    help="Load the draft alongside the target layer-split harness")
-    ap.add_argument("--target-split-dflash", action="store_true",
-                    help="Run chain DFlash decode through the target layer-split harness")
-    ap.add_argument("--draft-ipc-bin", default=None,
-                    help="Path to a different-backend test_dflash used as the remote draft daemon")
-    ap.add_argument("--draft-ipc-gpu", type=int, default=None,
-                    help="GPU id passed to the remote draft daemon")
-    ap.add_argument("--draft-ipc-work-dir", default=None,
-                    help="Work directory for host-file IPC with the remote draft daemon")
-    ap.add_argument("--draft-ipc-ring-cap", type=int, default=None,
-                    help="Feature-ring capacity for the remote draft daemon")
-    ap.add_argument("--max-ctx", type=int, default=None,
-                    help="Forward --max-ctx=N to test_dflash")
-    ap.add_argument("--prefill-ubatch", type=int, default=None,
-                    help="Set DFLASH27B_PREFILL_UBATCH for target split prefill")
-    ap.add_argument("--cuda-visible-devices", default=None,
-                    help="Optional CUDA_VISIBLE_DEVICES override for test_dflash")
-    ap.add_argument("--target-tokenizer",
-                    default=os.environ.get("DFLASH_TOKENIZER", "Qwen/Qwen3.5-27B"),
-                    help="HuggingFace tokenizer repo for the target. Defaults to "
-                         "$DFLASH_TOKENIZER, then Qwen/Qwen3.5-27B. Override for "
-                         "Qwen3.6 or other variants, e.g. "
-                         "--target-tokenizer Qwen/Qwen3.6-27B")
-    args = ap.parse_args()
-
-    # Tokenized prompts are cached at TMPDIR/he_prompt_<slug>_NN.bin so
-    # different --target-tokenizer values never collide. Without the slug,
-    # `--skip-tokenize` after a prior run with a different tokenizer would
-    # silently feed the wrong token IDs to the bench.
-    tok_slug = _tokenizer_slug(args.target_tokenizer)
-
-    print(f"[bench] target    = {TARGET}")
-    print(f"[bench] draft     = {DRAFT}")
-    print(f"[bench] bin       = {TEST_DFLASH}")
-    print(f"[bench] tmp       = {TMPDIR}")
-    print(f"[bench] tokenizer = {args.target_tokenizer}")
-
-    if not args.skip_tokenize:
-        print(f"[bench] tokenizing prompts via HF…")
-        from transformers import AutoTokenizer
-        tok = AutoTokenizer.from_pretrained(args.target_tokenizer, trust_remote_code=True)
-        for i, (name, p) in enumerate(PROMPTS):
-            path = _prompt_path(i, tok_slug)
-            n = tokenize_prompt(p, path, tok)
-            print(f"  [{i:02d}] {name:26s}  {n:4d} tokens")
-    else:
-        if not _prompt_path(0, tok_slug).exists():
-            sys.exit(
-                f"[error] --skip-tokenize requested but no cache for "
-                f"tokenizer={args.target_tokenizer!r} (looked for "
-                f"{_prompt_path(0, tok_slug)}). Drop --skip-tokenize to "
-                f"tokenize fresh, or pass --target-tokenizer matching a "
-                f"previous run.")
-        print(f"[bench] skipping tokenize (reusing {_prompt_path(0, tok_slug).parent})")
-
-    print(f"\n[bench] mode={args.mode}  n_gen={args.n_gen}")
-    print(f"{'prompt':28s}  {'steps':>6s} {'AL':>6s} {'pct%':>6s} {'prefill':>8s} {'decode':>8s}")
-    print("-" * 72)
-
-    extra_args = TestDflashLaunchArgs(
-        draft_feature_mirror=args.draft_feature_mirror,
-        peer_access=args.peer_access,
-        target_gpu=args.target_gpu,
-        draft_gpu=args.draft_gpu,
-        target_gpus=args.target_gpus,
-        target_layer_split=args.target_layer_split,
-        target_split_load_draft=args.target_split_load_draft,
-        target_split_dflash=args.target_split_dflash,
-        draft_ipc_bin=args.draft_ipc_bin,
-        draft_ipc_gpu=args.draft_ipc_gpu,
-        draft_ipc_work_dir=args.draft_ipc_work_dir,
-        draft_ipc_ring_cap=args.draft_ipc_ring_cap,
-        max_ctx=args.max_ctx,
-    ).to_cli_args()
-
-    extra_env = {}
-    if args.cuda_visible_devices:
-        extra_env = apply_backend_visible_devices(
-            "cuda",
-            visible_devices=args.cuda_visible_devices,
-            base_env=extra_env,
-        )
-    if args.prefill_ubatch is not None:
-        extra_env["DFLASH27B_PREFILL_UBATCH"] = str(args.prefill_ubatch)
-
-    results = []
-    for i, (name, _) in enumerate(PROMPTS):
-        path = _prompt_path(i, tok_slug)
-        try:
-            r = run_test_dflash(path, args.n_gen,
-                                fast_rollback=(args.mode == "fast" and not args.target_split_dflash),
-                                ddtree_budget=args.ddtree_budget,
-                                ddtree_temp=args.ddtree_temp,
-                                ddtree_no_chain_seed=args.ddtree_no_chain_seed,
-                                extra_args=extra_args,
-                                extra_env=extra_env)
-        except Exception as e:
-            print(f"  [{i:02d}] {name:26s}  FAILED: {e}")
-            continue
-        results.append((name, r))
-        prefill_s = f"{r['prefill_tok_s']:8.2f}" if r["prefill_tok_s"] is not None else f"{'n/a':>8s}"
-        print(
-            f"  {name:26s}  {r['steps']:6d} {r['commit_per_step']:6.2f} "
-            f"{r['pct']:6.1f} {prefill_s} {r['tok_s']:8.2f}"
-        )
-
-    if not results:
-        print("no successful runs")
-        sys.exit(1)
-
-    n = len(results)
-    mean_al = sum(r["commit_per_step"] for _, r in results) / n
-    mean_tps = sum(r["tok_s"] for _, r in results) / n
-    mean_pct = sum(r["pct"] for _, r in results) / n
-    prefill_vals = [r["prefill_tok_s"] for _, r in results if r["prefill_tok_s"] is not None]
-    mean_prefill = sum(prefill_vals) / len(prefill_vals) if prefill_vals else None
-
-    print("-" * 72)
-    prefill_s = f"{mean_prefill:8.2f}" if mean_prefill is not None else f"{'n/a':>8s}"
-    print(f"{'MEAN':28s}  {'':6s} {mean_al:6.2f} {mean_pct:6.1f} {prefill_s} {mean_tps:8.2f}")
-    print()
-    print(f"commit/step range: {min(r['commit_per_step'] for _,r in results):.2f} - "
-          f"{max(r['commit_per_step'] for _,r in results):.2f}")
-    print(f"tok/s range:        {min(r['tok_s'] for _,r in results):.1f} - "
-          f"{max(r['tok_s'] for _,r in results):.1f}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/scripts/bench_he_http.py b/server/scripts/bench_he_http.py
deleted file mode 100644
index 5da3b1d8b..000000000
--- a/server/scripts/bench_he_http.py
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/usr/bin/env python3
-"""HumanEval-style bench through dflash_server HTTP API. Measures end-to-end
-decode tok/s on code-completion prompts. Server-internal [gemma4-spec] log
-captures the true decode speed + acceptance rate per request."""
-import json, time, urllib.request, sys
-
-PROMPTS = [
-    ("has_close_elements",
-     "from typing import List\n\n"
-     "def has_close_elements(numbers: List[float], threshold: float) -> bool:\n"
-     '    """Check if in given list of numbers, are any two numbers closer to each other than\n'
-     "    given threshold.\n"
-     "    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n"
-     "    False\n"
-     '    """\n    for'),
-    ("separate_paren_groups",
-     "from typing import List\n\n"
-     "def separate_paren_groups(paren_string: str) -> List[str]:\n"
-     '    """ Separate groups of nested parentheses into list of strings. """\n'
-     "    result = []\n    current = ''\n    depth = 0\n    for"),
-    ("truncate_number",
-     "def truncate_number(number: float) -> float:\n"
-     '    """ Return the decimal part of a floating-point number. """\n'
-     "    return"),
-    ("below_zero",
-     "from typing import List\n\n"
-     "def below_zero(operations: List[int]) -> bool:\n"
-     '    """ Detect if balance falls below zero. """\n'
-     "    balance = 0\n    for op in operations:\n"),
-    ("mean_absolute_deviation",
-     "from typing import List\n\n"
-     "def mean_absolute_deviation(numbers: List[float]) -> float:\n"
-     '    """ For a given list of input numbers, calculate Mean Absolute Deviation. """\n'
-     "    mean ="),
-    ("intersperse",
-     "from typing import List\n\n"
-     "def intersperse(numbers: List[int], delimeter: int) -> List[int]:\n"
-     '    """ Insert delimiter between consecutive elements. """\n'
-     "    if not numbers:\n        return []\n    result ="),
-    ("parse_nested_parens",
-     "from typing import List\n\n"
-     "def parse_nested_parens(paren_string: str) -> List[int]:\n"
-     '    """ For each group return deepest level of nesting. """\n'
-     "    def parse(s):\n        depth = max_depth ="),
-    ("filter_by_substring",
-     "from typing import List\n\n"
-     "def filter_by_substring(strings: List[str], substring: str) -> List[str]:\n"
-     '    """ Filter strings that contain substring. """\n'
-     "    return"),
-    ("sum_product",
-     "from typing import List, Tuple\n\n"
-     "def sum_product(numbers: List[int]) -> Tuple[int, int]:\n"
-     '    """ Return tuple (sum, product). Empty list -> (0,1). """\n'
-     "    s ="),
-    ("rolling_max",
-     "from typing import List\n\n"
-     "def rolling_max(numbers: List[int]) -> List[int]:\n"
-     '    """ Rolling maximum of running list. """\n'
-     "    result = []\n    cur = -10**18\n    for n in numbers:\n"),
-]
-
-URL = sys.argv[1] if len(sys.argv) > 1 else "http://127.0.0.1:18080/v1/chat/completions"
-MAX_TOKENS = int(sys.argv[2]) if len(sys.argv) > 2 else 96
-
-total_tok = 0
-total_dt  = 0.0
-print(f"[bench] url={URL} max_tokens={MAX_TOKENS} n_prompts={len(PROMPTS)}")
-for name, prompt in PROMPTS:
-    body = {
-        "model": "dflash",
-        "messages": [{"role": "user", "content": prompt}],
-        "max_tokens": MAX_TOKENS, "temperature": 0,
-    }
-    req = urllib.request.Request(URL, data=json.dumps(body).encode(),
-                                 headers={"Content-Type": "application/json"})
-    t = time.time()
-    try:
-        with urllib.request.urlopen(req, timeout=120) as resp:
-            r = json.loads(resp.read().decode())
-    except Exception as e:
-        print(f"  {name:30s} FAIL {e}")
-        continue
-    dt = time.time() - t
-    n  = r["usage"]["completion_tokens"]
-    total_tok += n; total_dt += dt
-    print(f"  {name:30s} N={n:3d} dt={dt:.3f}s tok/s={n/dt:6.2f}")
-
-print(f"\n[bench] total tokens={total_tok}  total dt={total_dt:.2f}s")
-print(f"[bench] avg tok/s = {total_tok/total_dt:.2f}")
diff --git a/server/scripts/bench_llm.py b/server/scripts/bench_llm.py
deleted file mode 100644
index 91ba498c5..000000000
--- a/server/scripts/bench_llm.py
+++ /dev/null
@@ -1,466 +0,0 @@
-"""
-10 prompts per dataset, AR + DFlash per prompt.
-
-    python3 scripts/bench_llm.py
-
-Paths resolve from the repo root by default. Override with env vars:
-    DFLASH_TARGET    path to target Qwen3.6-27B-Q4_K_M.gguf (or 3.5)
-    DFLASH_DRAFT     path to DFlash draft GGUF or model.safetensors
-    DFLASH_BIN       path to build/test_dflash
-    DFLASH_BIN_AR    path to build/test_generate
-    DFLASH_TOKENIZER HF tokenizer repo (default Qwen/Qwen3.5-27B; matches run.py)
-"""
-import argparse
-import json
-import os
-import re
-import struct
-import subprocess
-import tempfile
-from pathlib import Path
-
-ROOT = Path(__file__).resolve().parent.parent
-BIN_SUFFIX = ".exe" if os.name == "nt" else ""
-TARGET = os.environ.get(
-    "DFLASH_TARGET",
-    str(ROOT / "models" / "Qwen3.6-27B-Q4_K_M.gguf"),
-)
-_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q4_k_m.gguf"
-_LOCAL_DRAFT_ROOT = ROOT / "models" / "draft"
-DRAFT = None
-TEST_DFLASH = os.environ.get("DFLASH_BIN", str(ROOT / "build" / f"test_dflash{BIN_SUFFIX}"))
-TEST_GENERATE = os.environ.get("DFLASH_BIN_AR", str(ROOT / "build" / f"test_generate{BIN_SUFFIX}"))
-TOKENIZER = os.environ.get("DFLASH_TOKENIZER", "Qwen/Qwen3.5-27B")
-TMPDIR = Path(tempfile.gettempdir()) / "dflash_bench"
-TMPDIR.mkdir(parents=True, exist_ok=True)
-
-N_GEN = 256
-BUDGET = 22  # default; overridden by --budget CLI arg
-N_SAMPLE = 10
-
-def _gsm_gold(x):
-    """Extract numeric answer after #### from GSM8K answer field."""
-    ans = x["answer"]
-    idx = ans.rfind("####")
-    if idx >= 0:
-        return ans[idx + 4:].strip().replace(",", "")
-    return ans.strip()
-
-
-BENCHES = [
-    ("HumanEval", "openai_humaneval", None, "test", lambda x: x["prompt"], None, N_GEN),
-    ("GSM8K", "gsm8k", "main", "test", lambda x: f"Question: {x['question']}\nAnswer: ", _gsm_gold, 1024),
-    ("Math500", "HuggingFaceH4/MATH-500", None, "test", lambda x: f"Problem: {x['problem']}\nSolution: Put your final answer in \\boxed{{}}.\n", lambda x: x["answer"], 2048),
-]
-
-
-def _find_draft_model(root: Path) -> str | None:
-    if root.is_file():
-        return str(root)
-    if not root.is_dir():
-        return None
-    for pattern in ("dflash-draft-*.gguf", "*.gguf", "model.safetensors"):
-        matches = sorted(root.rglob(pattern))
-        if matches:
-            return str(matches[0])
-    return None
-
-
-def _resolve_draft() -> str:
-    env = os.environ.get("DFLASH_DRAFT")
-    if env:
-        found = _find_draft_model(Path(env))
-        if found:
-            return found
-        raise FileNotFoundError(f"DFLASH_DRAFT does not point to a DFlash draft GGUF or model.safetensors: {env}")
-
-    for candidate in (_LOCAL_DRAFT_FILE, _LOCAL_DRAFT_ROOT):
-        found = _find_draft_model(candidate)
-        if found:
-            return found
-
-    raise FileNotFoundError(
-        "DFlash draft GGUF or model.safetensors not found. Expected one of:\n"
-        f"  - {_LOCAL_DRAFT_FILE}\n"
-        "Download it as documented in the README, or set DFLASH_DRAFT to an explicit file or directory."
-    )
-
-
-def _require_file(path: str, label: str):
-    if not Path(path).is_file():
-        raise FileNotFoundError(f"{label} not found: {path}")
-
-
-def _run_checked(cmd, timeout: int, label: str) -> subprocess.CompletedProcess:
-    r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
-    if r.returncode != 0:
-        tail = (r.stderr or r.stdout or "<no output>").strip()[-2000:]
-        raise RuntimeError(f"{label} exited {r.returncode}: {tail}")
-    return r
-
-
-def tokenize(tok, p, path: Path):
-    ids = tok.encode(p, add_special_tokens=False)
-    with open(path, "wb") as f:
-        for t in ids:
-            f.write(struct.pack("<i", int(t)))
-    return len(ids)
-
-
-def run_ar(path: Path, n_gen: int = N_GEN):
-    out_bin = TMPDIR / "ar_out.bin"
-    r = _run_checked(
-        [TEST_GENERATE, TARGET, str(path), str(n_gen), str(out_bin)],
-        timeout=300,
-        label="test_generate",
-    )
-    m = re.search(r"(\d+\.\d+)\s+tok/s", r.stdout)
-    if not m:
-        raise RuntimeError(f"test_generate output parse failed: {r.stdout[-1000:]}")
-    return float(m.group(1))
-
-
-def _auto_max_ctx(n_prompt, n_gen: int = N_GEN):
-    # Auto-fit attention budget: prompt + gen + small verify pad, aligned to
-    # FATTN_KQ_STRIDE=256. Oversizing max_ctx makes attention stride over
-    # unused KV and can cost >20× prefill time (32K prompt + --kv-q4 +
-    # max_ctx=131072 → 1035s vs 38s at max_ctx=32768). See scripts/run.py.
-    pad = 64  # covers q_len=16 + ddtree verify overhead with margin
-    return ((n_prompt + n_gen + pad + 255) // 256) * 256
-
-
-def run_df(path: Path, n_prompt, n_gen: int = N_GEN):
-    max_ctx = _auto_max_ctx(n_prompt, n_gen)
-    out_bin = TMPDIR / f"df_out.bin"
-    r = _run_checked(
-        [
-            TEST_DFLASH,
-            TARGET,
-            DRAFT,
-            str(path),
-            str(n_gen),
-            str(out_bin),
-            "--fast-rollback",
-            "--ddtree",
-            f"--ddtree-budget={BUDGET}",
-            f"--max-ctx={max_ctx}",
-        ],
-        timeout=300,
-        label="test_dflash",
-    )
-    tps = re.search(r"(\d+(?:\.\d+)?)\s+tok/s", r.stdout)
-    al = re.search(r"avg commit/step=(\d+(?:\.\d+)?)", r.stdout)
-    if not (tps and al):
-        raise RuntimeError(f"test_dflash output parse failed: {r.stdout[-1500:]}")
-    return float(tps.group(1)), float(al.group(1)), out_bin
-
-
-def _read_ids(path: Path):
-    """Read a binary file of packed int32 token IDs."""
-    data = path.read_bytes()
-    return list(struct.unpack(f"<{len(data)//4}i", data))
-
-
-def _extract_boxed(text: str) -> str | None:
-    """Extract the last \\boxed{...} from a string, handling nested braces."""
-    results = []
-    i = 0
-    while i < len(text):
-        idx = text.find("\\boxed{", i)
-        if idx == -1:
-            break
-        start = idx + len("\\boxed{")
-        depth = 1
-        j = start
-        while j < len(text) and depth > 0:
-            if text[j] == "{":
-                depth += 1
-            elif text[j] == "}":
-                depth -= 1
-            j += 1
-        if depth == 0:
-            results.append(text[start:j-1].strip())
-        i = j
-    return results[-1] if results else None
-
-
-def _normalize_math(s: str) -> str:
-    """Normalize a math answer string for comparison."""
-    if s is None:
-        return ""
-    s = s.strip()
-    if s.startswith("$") and s.endswith("$"):
-        s = s[1:-1].strip()
-    # Strip currency $ (e.g. "$18" → "18")
-    if re.match(r'^\$\d', s):
-        s = s[1:]
-    s = re.sub(r"\\text\s*\{([^}]*)\}", r"\1", s)
-    s = re.sub(r"\\mathrm\s*\{([^}]*)\}", r"\1", s)
-    for cmd in [r"\left", r"\right", r"\displaystyle", r"\tfrac", r"\dfrac"]:
-        s = s.replace(cmd, "")
-    for unit in [" cm", " m", " km", " kg", " g", " s", " ms",
-                 " degrees", " degree", "°", " inches", " feet",
-                 " square units", " units", " dollars"]:
-        if s.lower().rstrip(".").endswith(unit):
-            s = s[:len(s) - len(unit) - (1 if s.endswith(".") else 0)]
-    s = re.sub(r"\s+", " ", s).strip()
-    s = s.rstrip(".,")
-    return s
-
-
-def _math_equiv(pred: str, gold: str) -> bool:
-    """Check if two math answers are equivalent."""
-    if pred is None or gold is None:
-        return False
-    p = _normalize_math(pred)
-    g = _normalize_math(gold)
-    if p == g:
-        return True
-    p_c = re.sub(r"\s*\\frac", r"\\frac", p)
-    g_c = re.sub(r"\s*\\frac", r"\\frac", g)
-    if p_c == g_c:
-        return True
-    try:
-        pf = float(p.replace(",", ""))
-        gf = float(g.replace(",", ""))
-        return abs(pf - gf) < 1e-6
-    except (ValueError, TypeError):
-        pass
-    mixed_pat = re.compile(r"^(\d+)\s*\\frac\s*\{(\d+)\}\s*\{(\d+)\}$")
-    for s, other in [(p, g), (g, p)]:
-        m = mixed_pat.match(s)
-        if m:
-            try:
-                val = float(m.group(1)) + float(m.group(2)) / float(m.group(3))
-                oval = float(other.replace(",", ""))
-                if abs(val - oval) < 1e-6:
-                    return True
-            except (ValueError, ZeroDivisionError):
-                pass
-    frac_pat = re.compile(r"\\?frac\s*\{([^}]+)\}\s*\{([^}]+)\}")
-    for s, other in [(p, g), (g, p)]:
-        m = frac_pat.search(s)
-        if m:
-            try:
-                val = float(m.group(1)) / float(m.group(2))
-                oval = float(other.replace(",", ""))
-                if abs(val - oval) < 1e-6:
-                    return True
-            except (ValueError, ZeroDivisionError):
-                pass
-    return False
-
-
-def score_math(output_bin: Path, gold_answer: str, tok) -> tuple[bool, str]:
-    """Score a Math500 output against the gold answer. Returns (correct, detail_str)."""
-    ids = _read_ids(output_bin)
-    text = tok.decode(ids)
-
-    think_end = text.rfind("</think>")
-    answer_text = text[think_end + len("</think>"):] if think_end >= 0 else text
-
-    pred = _extract_boxed(answer_text)
-
-    # Fallback: "the answer is **X**" patterns
-    if pred is None:
-        bold_pattern = re.compile(
-            r'(?:answer\s+is|there\s+are|result\s+is|equals?|=)\s*\*\*(.+?)\*\*',
-            re.IGNORECASE)
-        m = bold_pattern.search(answer_text)
-        if m:
-            pred = m.group(1).strip().rstrip(".")
-
-    # Fallback: last $...$ expression
-    if pred is None:
-        matches = re.findall(r'\$([^$]+)\$', answer_text)
-        if matches:
-            pred = matches[-1].strip()
-
-    correct = _math_equiv(pred, gold_answer)
-    pred_short = (pred[:60] + "…") if pred and len(pred) > 60 else pred
-    gold_short = (gold_answer[:60] + "…") if len(gold_answer) > 60 else gold_answer
-    if correct:
-        detail = f"🎯 {pred_short}"
-    elif pred:
-        detail = f"✗ pred={pred_short} gold={gold_short}"
-    else:
-        detail = f"✗ no answer found, gold={gold_short}"
-    return correct, detail
-
-
-def score_gsm(output_bin: Path, gold_answer: str, tok) -> tuple[bool, str]:
-    """Score a GSM8K output against the gold numeric answer. Returns (correct, detail_str)."""
-    ids = _read_ids(output_bin)
-    text = tok.decode(ids)
-
-    think_end = text.rfind("</think>")
-    answer_text = text[think_end + len("</think>"):] if think_end >= 0 else text
-
-    pred = None
-
-    # \boxed{<number>}
-    boxed = _extract_boxed(answer_text)
-    if boxed:
-        cleaned = boxed.replace(",", "").replace("$", "").strip()
-        if re.match(r'^[+-]?\d+\.?\d*$', cleaned):
-            pred = cleaned
-
-    # #### <number>
-    if pred is None:
-        m = re.search(r'####\s*\$?([+-]?\d[\d,]*\.?\d*)', answer_text)
-        if m:
-            pred = m.group(1).replace(",", "")
-
-    # "the answer is **X**"
-    if pred is None:
-        m = re.search(
-            r'(?:answer\s+is|result\s+is|equals?|there\s+are|we\s+get)\s*\*?\*?\$?([+-]?\d[\d,]*\.?\d*)',
-            answer_text, re.IGNORECASE)
-        if m:
-            pred = m.group(1).replace(",", "")
-
-    # **<number>** or **$<number>**
-    if pred is None:
-        m = re.search(r'\*\*\$?([+-]?\d[\d,]*\.?\d*)\*\*', answer_text)
-        if m:
-            pred = m.group(1).replace(",", "")
-
-    # Last standalone number
-    if pred is None:
-        nums = re.findall(r'(?<![.\d])([+-]?\d[\d,]*\.?\d*)(?![.\d])', answer_text)
-        if nums:
-            pred = nums[-1].replace(",", "")
-
-    correct = False
-    if pred is not None:
-        try:
-            correct = abs(float(pred) - float(gold_answer)) < 1e-6
-        except (ValueError, TypeError):
-            correct = pred.strip() == gold_answer.strip()
-
-    if correct:
-        detail = f"🎯 {pred}"
-    elif pred:
-        detail = f"✗ pred={pred} gold={gold_answer}"
-    else:
-        detail = f"✗ no answer found, gold={gold_answer}"
-    return correct, detail
-
-
-
-def main():
-    global DRAFT, BUDGET
-
-    parser = argparse.ArgumentParser(description="DFlash LLM benchmark suite")
-    parser.add_argument("--budget", type=int, default=BUDGET,
-                        help=f"DDTree budget (default {BUDGET})")
-    parser.add_argument("--no-thinking", action="store_true",
-                        help="Wrap prompts in chat template with enable_thinking=False")
-    parser.add_argument("--bench", nargs="+", choices=["HumanEval", "GSM8K", "Math500"],
-                        help="Run only specified benchmarks (default: all)")
-    args = parser.parse_args()
-    BUDGET = args.budget
-
-    DRAFT = _resolve_draft()
-    _require_file(TARGET, "target GGUF")
-    _require_file(TEST_DFLASH, "test_dflash binary")
-    _require_file(TEST_GENERATE, "test_generate binary")
-
-    print(f"[bench] target    = {TARGET}", flush=True)
-    print(f"[bench] draft     = {DRAFT}", flush=True)
-    print(f"[bench] ar bin    = {TEST_GENERATE}", flush=True)
-    print(f"[bench] df bin    = {TEST_DFLASH}", flush=True)
-    print(f"[bench] tokenizer = {TOKENIZER}", flush=True)
-    print(f"[bench] budget    = {BUDGET}", flush=True)
-
-    from datasets import load_dataset
-    from transformers import AutoTokenizer
-    tok = AutoTokenizer.from_pretrained(TOKENIZER, trust_remote_code=True)
-
-    bench_filter = set(args.bench) if args.bench else None
-
-    if args.no_thinking and not getattr(tok, "chat_template", None):
-        parser.error(
-            f"--no-thinking requires a tokenizer with a chat template, "
-            f"but {TOKENIZER!r} has none. Use a Qwen3 tokenizer or drop --no-thinking."
-        )
-
-    def _wrap_prompt(raw_prompt: str) -> str:
-        # Instruct/thinking models require the chat template. Feeding a raw
-        # prompt makes the model ramble and never emit a scorable answer
-        # (issue #191: Math500 scored 0/10). Always apply the template when the
-        # tokenizer has one; --no-thinking only toggles Qwen's <think> block.
-        if not getattr(tok, "chat_template", None):
-            return raw_prompt
-        return tok.apply_chat_template(
-            [{"role": "user", "content": raw_prompt}],
-            tokenize=False, add_generation_prompt=True,
-            enable_thinking=not args.no_thinking,
-        )
-
-    results = {}
-    for name, ds_name, cfg, split, extract, gold_extract, gen in BENCHES:
-        if bench_filter and name not in bench_filter:
-            continue
-        print(f"\n[bench] ==== {name} (n={N_SAMPLE}, n_gen={gen}) ====", flush=True)
-        ds = load_dataset(ds_name, cfg, split=split)
-        ds_selected = ds.shuffle(seed=42).select(range(N_SAMPLE))
-        prompt_list = [extract(s) for s in ds_selected]
-        gold_list = [gold_extract(s) for s in ds_selected] if gold_extract else [None] * len(prompt_list)
-
-        ar_tps, df_tps, df_al = [], [], []
-        n_score_correct, n_scored = 0, 0
-        for i, (p, gold) in enumerate(zip(prompt_list, gold_list)):
-            path = TMPDIR / f"b_{name}_{i:02d}.bin"
-            n = tokenize(tok, _wrap_prompt(p), path)
-            if n == 0 or n > 3500:
-                continue
-            try:
-                ar = run_ar(path, gen)
-                df, al, df_bin = run_df(path, n, gen)
-            except Exception as e:
-                print(f"  [{i+1:02d}/{N_SAMPLE}] n_tok={n:4d}  FAILED: {e}", flush=True)
-                continue
-
-            score_detail = ""
-            if gold is not None:
-                if name == "GSM8K":
-                    correct, score_detail = score_gsm(df_bin, gold, tok)
-                else:
-                    correct, score_detail = score_math(df_bin, gold, tok)
-                n_scored += 1
-                if correct:
-                    n_score_correct += 1
-                score_detail = f"  {score_detail}"
-
-            if ar > 0:
-                ar_tps.append(ar)
-            if df > 0:
-                df_tps.append(df)
-                df_al.append(al)
-            print(f"  [{i+1:02d}/{N_SAMPLE}] n_tok={n:4d}  AR={ar:6.2f}  DFlash={df:7.2f}  AL={al:5.2f}{score_detail}", flush=True)
-        ar_m = sum(ar_tps) / len(ar_tps) if ar_tps else 0
-        df_m = sum(df_tps) / len(df_tps) if df_tps else 0
-        al_m = sum(df_al) / len(df_al) if df_al else 0
-        score_str = f"{n_score_correct}/{n_scored}" if n_scored else ""
-        results[name] = {"ar": ar_m, "dflash": df_m, "al": al_m,
-                         "speedup": df_m / ar_m if ar_m else 0,
-                         "score": score_str}
-        summary = f"  {name} mean: AR={ar_m:.2f}  DFlash={df_m:.2f}  AL={al_m:.2f}  {results[name]['speedup']:.2f}x"
-        if score_str:
-            summary += f"  score={score_str} ({n_score_correct/n_scored*100:.0f}%)"
-        print(summary, flush=True)
-
-    print("\n[bench] === SUMMARY ===")
-    print(f"{'Task':12s}  {'AR':>8s}  {'DFlash':>8s}  {'AL':>6s}  {'Speedup':>8s}  {'Score':>8s}")
-    for name, r in results.items():
-        print(f"{name:12s}  {r['ar']:8.2f}  {r['dflash']:8.2f}  {r['al']:6.2f}  {r['speedup']:7.2f}x  {r.get('score',''):>8s}")
-
-    out_json = TMPDIR / "bench_llm_results.json"
-    with open(out_json, "w") as f:
-        json.dump(results, f, indent=2)
-    print(f"[bench] wrote {out_json}", flush=True)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/scripts/bench_server.py b/server/scripts/bench_server.py
deleted file mode 100644
index 3e607d920..000000000
--- a/server/scripts/bench_server.py
+++ /dev/null
@@ -1,502 +0,0 @@
-#!/usr/bin/env python3
-"""HTTP server benchmark — exercises the C++ dflash_server with the same
-workloads as bench_llm.py (short LLM prompts) and bench_agent.py (long
-agentic prompts), but over HTTP via /v1/chat/completions streaming.
-
-This answers: "does the C++ server perform as well as the raw CLI binaries?"
-
-Workloads:
-  he      — 10 HumanEval code-completion prompts (same as bench_daemon.py)
-  gsm8k   — 10 GSM8K math word problems
-  math500 — 10 MATH-500 problems (2048 max_tokens)
-  agent   — SWE-bench Verified at 2K / 8K / 24K token buckets
-
-Usage:
-    # Start C++ server first:
-    ./dflash/build/dflash_server dflash/models/Qwen3-0.6B-BF16.gguf --port 9099
-
-    # Run all workloads:
-    python3 dflash/scripts/bench_server.py --url http://localhost:9099
-
-    # Run specific workloads:
-    python3 dflash/scripts/bench_server.py --url http://localhost:9099 --workload he gsm8k
-
-    # Quick smoke test (1 prompt per workload):
-    python3 dflash/scripts/bench_server.py --url http://localhost:9099 --n-sample 1
-"""
-import argparse
-import json
-import re
-import sys
-import time
-import urllib.request
-import urllib.error
-from pathlib import Path
-
-# Allow importing bench_he for its PROMPTS list.
-sys.path.insert(0, str(Path(__file__).resolve().parent))
-from bench_llm import _extract_boxed, _normalize_math, _math_equiv
-
-N_SAMPLE = 10
-N_GEN_DEFAULT = 256
-N_GEN_MATH = 2048
-
-
-# ── HTTP streaming client ─────────────────────────────────────────────────
-
-def stream_chat(url: str, messages: list[dict], max_tokens: int,
-                temperature: float = 0.0, timeout: float = 600.0,
-                thinking: bool = False) -> dict:
-    """POST /v1/chat/completions with stream=True.
-
-    Returns dict with:
-      n_tok, wall_s, ttft_s, decode_s, decode_tps, wall_tps,
-      text, usage (if server returns it in final chunk).
-    """
-    body = {
-        "model": "dflash",
-        "messages": messages,
-        "max_tokens": max_tokens,
-        "temperature": temperature,
-        "stream": True,
-    }
-    if thinking:
-        body["thinking"] = {"type": "enabled", "budget_tokens": 4096}
-
-    data = json.dumps(body).encode()
-    req = urllib.request.Request(
-        url.rstrip("/") + "/v1/chat/completions",
-        data=data,
-        headers={"Content-Type": "application/json",
-                 "Accept": "text/event-stream"},
-    )
-    t0 = time.perf_counter()
-    t_first = 0.0
-    t_last = 0.0
-    n_tok = 0
-    text_parts = []
-    usage = None
-
-    with urllib.request.urlopen(req, timeout=timeout) as r:
-        for raw in r:
-            line = raw.decode("utf-8", errors="replace").rstrip()
-            if not line.startswith("data:"):
-                continue
-            payload = line[5:].strip()
-            if payload == "[DONE]":
-                break
-            try:
-                chunk = json.loads(payload)
-            except json.JSONDecodeError:
-                continue
-            # Extract usage from final chunk if present.
-            if chunk.get("usage"):
-                usage = chunk["usage"]
-            choices = chunk.get("choices") or []
-            if not choices:
-                continue
-            delta = choices[0].get("delta") or {}
-            content = delta.get("content") or ""
-            reasoning = delta.get("reasoning_content") or ""
-            if content or reasoning:
-                if n_tok == 0:
-                    t_first = time.perf_counter()
-                n_tok += 1
-                t_last = time.perf_counter()
-                if content:
-                    text_parts.append(content)
-
-    wall_s = time.perf_counter() - t0
-    ttft_s = (t_first - t0) if n_tok > 0 else wall_s
-    decode_s = (t_last - t_first) if n_tok > 1 else 0.0
-    decode_tps = (n_tok - 1) / decode_s if decode_s > 0 else 0.0
-    wall_tps = n_tok / wall_s if wall_s > 0 else 0.0
-
-    return {
-        "n_tok": n_tok,
-        "wall_s": wall_s,
-        "ttft_s": ttft_s,
-        "decode_s": decode_s,
-        "decode_tps": decode_tps,
-        "wall_tps": wall_tps,
-        "text": "".join(text_parts),
-        "usage": usage,
-    }
-
-
-# ── Workload: HumanEval ──────────────────────────────────────────────────
-
-def workload_he(url: str, n_sample: int, n_gen: int, **_kw):
-    """HumanEval code-completion prompts (same 10 as bench_he.py)."""
-    from bench_he import PROMPTS
-    prompts = PROMPTS[:n_sample]
-    results = []
-    for name, text in prompts:
-        msgs = [{"role": "user", "content": text}]
-        try:
-            r = stream_chat(url, msgs, n_gen)
-            results.append({"name": name, **r})
-            _print_row(name, r)
-        except Exception as e:
-            print(f"  {name:28s}  FAILED: {e}", flush=True)
-    return results
-
-
-# ── Workload: GSM8K ──────────────────────────────────────────────────────
-
-def _load_gsm8k(n_sample: int):
-    from datasets import load_dataset
-    ds = load_dataset("gsm8k", "main", split="test")
-    n_sample = min(n_sample, len(ds))
-    ds = ds.shuffle(seed=42).select(range(n_sample))
-    return [
-        {"name": f"gsm8k_{i:02d}",
-         "prompt": f"Question: {row['question']}\nAnswer: "}
-        for i, row in enumerate(ds)
-    ]
-
-
-def workload_gsm8k(url: str, n_sample: int, n_gen: int, **_kw):
-    rows = _load_gsm8k(n_sample)
-    results = []
-    for row in rows:
-        msgs = [{"role": "user", "content": row["prompt"]}]
-        try:
-            r = stream_chat(url, msgs, n_gen)
-            results.append({"name": row["name"], **r})
-            _print_row(row["name"], r)
-        except Exception as e:
-            print(f"  {row['name']:28s}  FAILED: {e}", flush=True)
-    return results
-
-
-# ── Workload: Math500 ────────────────────────────────────────────────────
-
-def _load_math500(n_sample: int):
-    from datasets import load_dataset
-    ds = load_dataset("HuggingFaceH4/MATH-500", split="test")
-    n_sample = min(n_sample, len(ds))
-    ds = ds.shuffle(seed=42).select(range(n_sample))
-    return [
-        {"name": f"math_{i:02d}",
-         "prompt": f"Problem: {row['problem']}\nSolution: Put your final answer in \\boxed{{}}.\n",
-         "answer": row["answer"]}
-        for i, row in enumerate(ds)
-    ]
-
-
-def _score_math_text(text: str, gold_answer: str) -> tuple[bool, str]:
-    """Score a math response text against the gold answer.
-
-    Extracts \\boxed{} answers (after </think> for thinking models),
-    with fallbacks for **bold** and $...$ patterns.
-    Returns (correct, detail_str).
-    """
-    think_end = text.rfind("</think>")
-    answer_text = text[think_end + len("</think>"):] if think_end >= 0 else text
-
-    pred = _extract_boxed(answer_text)
-    if not pred:
-        pred = _extract_boxed(text)
-
-    # Fallback: "the answer is **X**" patterns
-    if pred is None:
-        bold_pattern = re.compile(
-            r'(?:answer\s+is|there\s+are|result\s+is|equals?|=)\s*\*\*(.+?)\*\*',
-            re.IGNORECASE)
-        m = bold_pattern.search(answer_text)
-        if m:
-            pred = m.group(1).strip().rstrip(".")
-
-    # Fallback: last $...$ expression
-    if pred is None:
-        matches = re.findall(r'\$([^$]+)\$', answer_text)
-        if matches:
-            pred = matches[-1].strip()
-
-    correct = _math_equiv(pred, gold_answer)
-    pred_short = (pred[:60] + "…") if pred and len(pred) > 60 else pred
-    gold_short = (gold_answer[:60] + "…") if len(gold_answer) > 60 else gold_answer
-    if correct:
-        detail = f"🎯 {pred_short}"
-    elif pred:
-        detail = f"✗ pred={pred_short} gold={gold_short}"
-    else:
-        detail = f"✗ no answer found, gold={gold_short}"
-    return correct, detail
-
-
-def workload_math500(url: str, n_sample: int, n_gen: int, thinking: bool = False, **_kw):
-    rows = _load_math500(n_sample)
-    gen = max(n_gen, N_GEN_MATH)  # Math needs longer generation
-    results = []
-    n_correct, n_scored = 0, 0
-    for row in rows:
-        msgs = [{"role": "user", "content": row["prompt"]}]
-        try:
-            r = stream_chat(url, msgs, gen, thinking=thinking)
-            correct, detail = _score_math_text(r["text"], row["answer"])
-            r["correct"] = correct
-            r["score_detail"] = detail
-            n_scored += 1
-            if correct:
-                n_correct += 1
-            results.append({"name": row["name"], **r})
-            _print_row(row["name"], r, score=detail)
-        except Exception as e:
-            print(f"  {row['name']:28s}  FAILED: {e}", flush=True)
-    if n_scored:
-        pct = n_correct / n_scored * 100
-        print(f"\n  accuracy: {n_correct}/{n_scored} ({pct:.0f}%)")
-    return results
-
-
-# ── Workload: Agent (SWE-bench) ──────────────────────────────────────────
-
-FIX_DIR = Path(__file__).resolve().parent / "fixtures"
-SWE_PARQUET = FIX_DIR / "swe_bench" / "swe_bench_verified.parquet"
-SYS_PROMPT_SMALL = FIX_DIR / "agent_prompts" / "codex_gpt52_codex.md"
-SYS_PROMPT_LARGE = FIX_DIR / "agent_prompts" / "codex_gpt52.md"
-
-AGENT_BUCKETS = {
-    "2k":  {"target_chars": 6000,   "sys": SYS_PROMPT_SMALL},
-    "8k":  {"target_chars": 24000,  "sys": SYS_PROMPT_LARGE},
-    "24k": {"target_chars": 72000,  "sys": SYS_PROMPT_LARGE},
-}
-
-
-def _build_agent_user_msg(row: dict, target_chars: int) -> str:
-    """Build a Codex-style user message padded to ~target_chars."""
-    repo = row.get("repo", "unknown/repo")
-    iid = row.get("instance_id", "unknown")
-    problem = row.get("problem_statement", "") or ""
-    patch = row.get("patch", "") or ""
-    test_patch = row.get("test_patch", "") or ""
-    hints = row.get("hints_text", "") or ""
-
-    pool = "\n\n".join(p for p in (patch, test_patch, hints) if p)
-    if not pool:
-        pool = problem
-
-    chunks = []
-    chunk_size = max(2000, target_chars // 6)
-    cur = 0
-    idx = 1
-    while cur < target_chars:
-        offset = cur % max(1, len(pool))
-        seg = pool[offset:offset + chunk_size]
-        if not seg:
-            seg = pool[:chunk_size]
-        chunks.append(
-            f'<tool_result tool="read_file" path="{repo}/_ctx_{idx}.py">\n{seg}\n</tool_result>'
-        )
-        cur += len(seg)
-        idx += 1
-
-    file_blocks = "\n\n".join(chunks)
-    return (
-        f"Repository: {repo}\nInstance: {iid}\n\n"
-        f"## Issue\n{problem}\n\n"
-        f"## Context I gathered\n"
-        f"I ran `read_file` on the relevant modules:\n\n"
-        f"{file_blocks}\n\n"
-        f"## Task\nInvestigate the bug and reply with a single tool call "
-        f"to `apply_patch` that fixes it. Keep the patch minimal."
-    )
-
-
-def workload_agent(url: str, n_sample: int, n_gen: int, bucket: str = "all", **_kw):
-    if not SWE_PARQUET.is_file():
-        print(f"  SKIP: SWE-bench parquet not found at {SWE_PARQUET}", flush=True)
-        return []
-
-    import pyarrow.parquet as pq
-    df = pq.read_table(str(SWE_PARQUET)).to_pandas()
-
-    bucket_keys = list(AGENT_BUCKETS) if bucket == "all" else [bucket]
-    all_results = []
-
-    for bk in bucket_keys:
-        cfg = AGENT_BUCKETS[bk]
-        sys_path = cfg["sys"]
-        if not sys_path.is_file():
-            print(f"  SKIP bucket {bk}: system prompt not found at {sys_path}", flush=True)
-            continue
-
-        sys_text = sys_path.read_text(encoding="utf-8")
-        rows = df.sample(n=min(n_sample, len(df)), random_state=42).to_dict("records")
-
-        print(f"\n  --- bucket {bk} (target ~{cfg['target_chars']} chars, n={len(rows)}) ---")
-        for i, row in enumerate(rows):
-            name = f"agent_{bk}_{i:02d}"
-            user_msg = _build_agent_user_msg(row, cfg["target_chars"])
-            msgs = [
-                {"role": "system", "content": sys_text},
-                {"role": "user", "content": user_msg},
-            ]
-            try:
-                r = stream_chat(url, msgs, n_gen)
-                r["bucket"] = bk
-                r["instance_id"] = row.get("instance_id", "")
-                all_results.append({"name": name, **r})
-                _print_row(name, r)
-            except Exception as e:
-                print(f"  {name:28s}  FAILED: {e}", flush=True)
-
-    return all_results
-
-
-# ── Output formatting ─────────────────────────────────────────────────────
-
-def _print_header():
-    print(f"  {'prompt':28s}  {'n_tok':>5s} {'wall_s':>7s} {'ttft_s':>7s} "
-          f"{'dec_s':>7s} {'dec_tps':>8s} {'wall_tps':>9s}")
-    print("  " + "-" * 80)
-
-
-def _print_row(name: str, r: dict, score: str = ""):
-    n = r["n_tok"]
-    suffix = f"  {score}" if score else ""
-    if n == 0:
-        print(f"  {name:28s}  {n:5d} {r['wall_s']:7.2f}   --       --       --         --{suffix}",
-              flush=True)
-        return
-    print(f"  {name:28s}  {n:5d} {r['wall_s']:7.2f} {r['ttft_s']:7.3f} "
-          f"{r['decode_s']:7.2f} {r['decode_tps']:8.2f} {r['wall_tps']:9.2f}{suffix}",
-          flush=True)
-
-
-def _print_summary(label: str, results: list[dict]):
-    if not results:
-        return
-    valid = [r for r in results if r["n_tok"] > 0]
-    if not valid:
-        print(f"\n  [{label}] no successful runs")
-        return
-
-    def _mean(xs):
-        return sum(xs) / len(xs) if xs else 0.0
-
-    n = len(valid)
-    wall_tps = _mean([r["wall_tps"] for r in valid])
-    dec_tps_list = [r["decode_tps"] for r in valid if r["decode_tps"] > 0]
-    dec_tps = _mean(dec_tps_list) if dec_tps_list else 0.0
-    ttft = _mean([r["ttft_s"] for r in valid])
-    wall = _mean([r["wall_s"] for r in valid])
-    tok = _mean([r["n_tok"] for r in valid])
-
-    print(f"\n  [{label}] {n} prompts — mean: "
-          f"n_tok={tok:.0f}  TTFT={ttft:.3f}s  "
-          f"decode={dec_tps:.2f} tok/s  "
-          f"wall={wall_tps:.2f} tok/s  "
-          f"total={wall:.2f}s")
-    if dec_tps_list:
-        print(f"  [{label}] decode tok/s range: "
-              f"{min(dec_tps_list):.2f} - {max(dec_tps_list):.2f}")
-
-
-# ── Main ──────────────────────────────────────────────────────────────────
-
-WORKLOADS = {
-    "he":      ("HumanEval (code completion)", workload_he),
-    "gsm8k":   ("GSM8K (math word problems)", workload_gsm8k),
-    "math500": ("MATH-500 (hard math)", workload_math500),
-    "agent":   ("SWE-bench agent (2K/8K/24K)", workload_agent),
-}
-
-
-def main():
-    ap = argparse.ArgumentParser(
-        description="HTTP server benchmark — exercises dflash_server with "
-                    "bench_llm + bench_agent workloads over /v1/chat/completions")
-    ap.add_argument("--url", default="http://localhost:9099",
-                    help="Server base URL (default: http://localhost:9099)")
-    ap.add_argument("--workload", nargs="+", choices=list(WORKLOADS) + ["all"],
-                    default=["all"],
-                    help="Which workloads to run (default: all)")
-    ap.add_argument("--n-sample", type=int, default=N_SAMPLE,
-                    help=f"Prompts per workload (default: {N_SAMPLE})")
-    ap.add_argument("--n-gen", type=int, default=N_GEN_DEFAULT,
-                    help=f"Max output tokens (default: {N_GEN_DEFAULT})")
-    ap.add_argument("--agent-bucket", choices=["2k", "8k", "24k", "all"],
-                    default="all", help="Agent bucket filter (default: all)")
-    ap.add_argument("--warmup", action="store_true",
-                    help="Run one warmup request before timing")
-    ap.add_argument("--thinking", action="store_true",
-                    help="Enable thinking/reasoning mode")
-    ap.add_argument("--out", type=str, default=None,
-                    help="Write JSON results to this file")
-    args = ap.parse_args()
-
-    # Validate server is reachable.
-    try:
-        urllib.request.urlopen(args.url.rstrip("/") + "/health", timeout=5)
-    except Exception as e:
-        print(f"ERROR: server not reachable at {args.url}: {e}", file=sys.stderr)
-        sys.exit(1)
-
-    if args.warmup:
-        print("[bench-server] warmup...", flush=True)
-        stream_chat(args.url, [{"role": "user", "content": "Hi"}], 16)
-
-    wl_keys = list(WORKLOADS) if "all" in args.workload else args.workload
-
-    print("=" * 88)
-    print(f"  HTTP Server Benchmark — {args.url}")
-    print(f"  workloads: {', '.join(wl_keys)}  n_sample={args.n_sample}  n_gen={args.n_gen}")
-    print("=" * 88)
-
-    all_results = {}
-    for wk in wl_keys:
-        label, fn = WORKLOADS[wk]
-        print(f"\n{'─' * 88}")
-        print(f"  {label}")
-        print(f"{'─' * 88}")
-        _print_header()
-        try:
-            results = fn(url=args.url, n_sample=args.n_sample, n_gen=args.n_gen,
-                         bucket=args.agent_bucket, thinking=args.thinking)
-        except ImportError as e:
-            print(f"  SKIP {wk}: missing dependency — {e}", flush=True)
-            results = []
-        except FileNotFoundError as e:
-            print(f"  SKIP {wk}: {e}", flush=True)
-            results = []
-
-        all_results[wk] = results
-        _print_summary(wk, results)
-
-    # Final summary
-    print(f"\n{'=' * 88}")
-    print("  SUMMARY")
-    print(f"{'=' * 88}")
-    print(f"  {'Workload':12s}  {'N':>3s}  {'TTFT':>7s}  {'dec_tps':>8s}  "
-          f"{'wall_tps':>9s}  {'wall_s':>7s}")
-    print("  " + "-" * 55)
-    for wk in wl_keys:
-        results = all_results.get(wk, [])
-        valid = [r for r in results if r.get("n_tok", 0) > 0]
-        if not valid:
-            print(f"  {wk:12s}  {'--':>3s}  {'--':>7s}  {'--':>8s}  {'--':>9s}  {'--':>7s}")
-            continue
-        n = len(valid)
-
-        def _m(key):
-            vals = [r[key] for r in valid if r.get(key, 0) > 0]
-            return sum(vals) / len(vals) if vals else 0.0
-
-        print(f"  {wk:12s}  {n:3d}  {_m('ttft_s'):7.3f}  {_m('decode_tps'):8.2f}  "
-              f"{_m('wall_tps'):9.2f}  {_m('wall_s'):7.2f}")
-
-    if args.out:
-        out_path = Path(args.out)
-        out_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(out_path, "w") as f:
-            json.dump(all_results, f, indent=2, default=str)
-        print(f"\n  Wrote {out_path}")
-
-    print()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/scripts/fixtures/agent_cases/cases.json b/server/scripts/fixtures/agent_cases/cases.json
new file mode 100644
index 000000000..c4f3a5be0
--- /dev/null
+++ b/server/scripts/fixtures/agent_cases/cases.json
@@ -0,0 +1,30 @@
+{
+  "schema": "lucebox-bench-cases-v1",
+  "source": "Agent-style probes: real codex agent system prompt + coding-task user message. Tests whether the model produces agent-shaped output (tool calls, code blocks, apply_patch envelopes) given a realistic coding-agent context. Complement to --area forge (which exercises tool-calling reliability with mocked scenarios).",
+  "cases": [
+    {
+      "id": "codex-mini-read-task",
+      "kind": "agent-prompt",
+      "system_prompt_file": "codex_gpt5_codex.md",
+      "user_message": "Read the file dflash/src/server/http_server.cpp and summarize what it does in 2-3 sentences. Don't actually open the file — describe what tool you would use and what arguments."
+    },
+    {
+      "id": "codex-mini-apply-patch",
+      "kind": "agent-prompt",
+      "system_prompt_file": "codex_gpt52_codex.md",
+      "user_message": "The function `qwen35_decode` in dflash/src/qwen35/qwen35_backend.cpp has a bug on the force-close path: it doesn't reset `budget_close_started` between requests. Show me the apply_patch envelope to fix it."
+    },
+    {
+      "id": "codex-large-explore",
+      "kind": "agent-prompt",
+      "system_prompt_file": "codex_apply_patch.md",
+      "user_message": "Which files in this repository handle authentication? List the steps you would take to find them."
+    },
+    {
+      "id": "codex-medium-test",
+      "kind": "agent-prompt",
+      "system_prompt_file": "codex_gpt52.md",
+      "user_message": "Write a unit test for a function `parse_thinking_budget(s: str) -> int` that returns the integer budget from strings like 'high', 'low', '4096'. Use pytest. Use apply_patch to add the test file."
+    }
+  ]
+}
diff --git a/server/scripts/test_server_integration.py b/server/scripts/test_server_integration.py
index 08da6cedf..c22121089 100644
--- a/server/scripts/test_server_integration.py
+++ b/server/scripts/test_server_integration.py
@@ -490,7 +490,12 @@ def test_thinking_disabled_by_default(self):
 
     @pytest.mark.slow
     def test_thinking_enabled_via_chat_template_kwargs(self):
-        """Enabling thinking should produce reasoning_content."""
+        """Enabling thinking must route reasoning into reasoning_content,
+        not leak it into content. Regression guard for the Qwen3.6/Laguna
+        pre-opened-<think> bug: the chat template appends `<think>` to the
+        prompt suffix, so the model emits reasoning directly with no
+        opening tag. If the renderer→emitter wiring drops, reasoning_content
+        stays empty and the raw reasoning text appears in content."""
         r = post_json("/v1/chat/completions", {
             "model": MODEL_NAME,
             "messages": [{"role": "user", "content": "What is 15 * 17?"}],
@@ -500,13 +505,26 @@ def test_thinking_enabled_via_chat_template_kwargs(self):
         })
         assert r.status_code == 200
         msg = r.json()["choices"][0]["message"]
-        assert msg["content"]
-        # With thinking enabled, model may produce reasoning_content
-        # (not guaranteed for short prompts, so we just check it doesn't crash)
+        reasoning = msg.get("reasoning_content") or ""
+        content = msg.get("content") or ""
+        assert reasoning, (
+            f"reasoning_content empty with enable_thinking=True — "
+            f"renderer→emitter wiring likely broken. content={content[:200]!r}"
+        )
+        assert "<think>" not in reasoning and "</think>" not in reasoning, (
+            f"raw think tags leaked into reasoning_content: {reasoning[:200]!r}"
+        )
+        assert "<think>" not in content and "</think>" not in content, (
+            f"think tags leaked into content channel: {content[:200]!r}"
+        )
+        assert content, "content channel empty — model never closed </think>"
 
     @pytest.mark.slow
     def test_thinking_enabled_via_reasoning_effort(self):
-        """OpenAI Responses-style reasoning.effort field."""
+        """OpenAI Responses-style reasoning.effort=high must also route
+        reasoning to reasoning_content. Same regression class as above
+        but reached through a different request shape (effort→template
+        kwargs translation in http_server.cpp)."""
         r = post_json("/v1/chat/completions", {
             "model": MODEL_NAME,
             "messages": [{"role": "user", "content": "What is 15 * 17?"}],
@@ -516,7 +534,15 @@ def test_thinking_enabled_via_reasoning_effort(self):
         })
         assert r.status_code == 200
         msg = r.json()["choices"][0]["message"]
-        assert msg["content"]
+        reasoning = msg.get("reasoning_content") or ""
+        content = msg.get("content") or ""
+        assert reasoning, (
+            f"reasoning_content empty with reasoning.effort=high — "
+            f"renderer→emitter wiring likely broken. content={content[:200]!r}"
+        )
+        assert "<think>" not in reasoning and "</think>" not in reasoning
+        assert "<think>" not in content and "</think>" not in content
+        assert content
 
 
 # ═══════════════════════════════════════════════════════════════════
@@ -871,3 +897,243 @@ def test_stop_no_match(self):
         content = r.json()["choices"][0]["message"]["content"]
         # Should produce some output since stop didn't match
         assert len(content) > 0
+
+
+# ═══════════════════════════════════════════════════════════════════
+# /props introspection — parity with dflash/scripts/server.py:1221
+# ═══════════════════════════════════════════════════════════════════
+
+class TestProps:
+    """Contract checks for GET /props. The shape mirrors the Python server's
+    so autotune, dashboards and snapshot/profile consumers stay compatible."""
+
+    def _fetch(self):
+        resp = requests.get(f"{SERVER_URL}/props", timeout=10)
+        assert resp.status_code == 200, f"/props returned {resp.status_code}"
+        return resp.json()
+
+    def test_top_level_keys_present(self):
+        props = self._fetch()
+        expected = {
+            "default_generation_settings", "model_alias", "model_path",
+            "build_info", "speculative_mode", "server", "model", "runtime",
+            "reasoning", "speculative", "sampling", "pflash", "prefix_cache",
+            "full_cache", "tool_replay", "daemon", "api", "capabilities",
+        }
+        missing = expected.difference(props.keys())
+        assert not missing, f"/props missing top-level keys: {missing}"
+
+    def test_server_block_shape(self):
+        server = self._fetch()["server"]
+        assert server["name"] == "luce-dflash"
+        assert "version" in server
+        assert isinstance(server["props_schema"], int)
+
+    def test_speculative_mode_consistency(self):
+        props = self._fetch()
+        mode = props["speculative_mode"]
+        assert mode in {"off", "dflash", "pflash"}
+        if mode == "pflash":
+            assert props["pflash"]["enabled"] is True
+        elif mode == "dflash":
+            assert props["speculative"]["enabled"] is True
+            assert props["pflash"]["enabled"] is False
+        else:
+            assert props["speculative"]["enabled"] is False
+            assert props["pflash"]["enabled"] is False
+
+    def test_runtime_backend_value(self):
+        runtime = self._fetch()["runtime"]
+        assert runtime["backend"] in {"cuda", "hip", "cpu"}
+        assert isinstance(runtime["fa_window"], int)
+        assert runtime["kv_cache_k"]
+        assert runtime["kv_cache_v"]
+
+    def test_capabilities_match_arch(self):
+        props = self._fetch()
+        caps = props["capabilities"]
+        # Reasoning, speculative and tools support flip together with the arch family.
+        if caps["reasoning_supported"]:
+            assert caps["speculative_supported"] is True
+            assert caps["tools_supported"] is True
+            assert "medium" in props["reasoning"]["supported_efforts"]
+
+    def test_api_endpoint_registry(self):
+        registered = set(self._fetch()["api"]["endpoints"])
+        # The registry must list every endpoint this suite exercises.
+        required = {
+            "GET /health", "GET /props", "GET /v1/models",
+            "POST /v1/chat/completions", "POST /v1/messages",
+            "POST /v1/messages/count_tokens", "POST /v1/responses",
+        }
+        assert required.issubset(registered), \
+            f"/props missing endpoints: {required - registered}"
+
+    def test_prefix_cache_stats_shape(self):
+        stats = self._fetch()["prefix_cache"]
+        for field in ("capacity", "in_use", "lifetime_hits"):
+            assert field in stats, f"prefix_cache missing {field}"
+            assert isinstance(stats[field], int)
+
+    def test_tool_replay_stats_shape(self):
+        stats = self._fetch()["tool_replay"]
+        for field in ("max_entries", "max_bytes", "current_entries", "current_bytes"):
+            assert field in stats, f"tool_replay missing {field}"
+
+
+# ═══════════════════════════════════════════════════════════════════
+# /v1/messages/count_tokens — Anthropic count_tokens parity
+# ═══════════════════════════════════════════════════════════════════
+
+class TestCountTokens:
+    def test_simple_count(self):
+        request = {
+            "model": MODEL_NAME,
+            "messages": [{"role": "user", "content": "Hello, world."}],
+        }
+        resp = post_json("/v1/messages/count_tokens", request, timeout=10)
+        assert resp.status_code == 200
+        counted = resp.json()
+        assert "input_tokens" in counted
+        assert isinstance(counted["input_tokens"], int)
+        assert counted["input_tokens"] > 0
+
+    def test_count_scales_with_message_length(self):
+        def count(text):
+            request = {"model": MODEL_NAME,
+                       "messages": [{"role": "user", "content": text}]}
+            return post_json("/v1/messages/count_tokens", request, timeout=10).json()
+        short_reply = count("hi")
+        long_reply = count("word " * 200)
+        assert long_reply["input_tokens"] > short_reply["input_tokens"]
+
+    def test_count_with_system_block(self):
+        request = {
+            "model": MODEL_NAME,
+            "system": "You are a helpful assistant.",
+            "messages": [{"role": "user", "content": "Hi"}],
+        }
+        resp = post_json("/v1/messages/count_tokens", request, timeout=10)
+        assert resp.status_code == 200
+        assert resp.json()["input_tokens"] > 0
+
+    def test_count_does_not_generate(self):
+        """Counting tokens must not trigger generation: the call has to
+        finish in under 1s, whereas a real generation takes many seconds."""
+        request = {
+            "model": MODEL_NAME,
+            "messages": [{"role": "user", "content": "What is 1+1?"}],
+        }
+        started = time.monotonic()
+        resp = post_json("/v1/messages/count_tokens", request, timeout=10)
+        elapsed = time.monotonic() - started
+        assert resp.status_code == 200
+        # The 1s budget is generous; tokenizer time and HTTP round trip dominate.
+        assert elapsed < 1.0, f"count_tokens took {elapsed:.2f}s (expected <1s)"
+
+
+# ═══════════════════════════════════════════════════════════════════
+# Thinking-budget envelope — finish_details emission
+# ═══════════════════════════════════════════════════════════════════
+
+class TestThinkingBudget:
+    """Verifies the response includes a `finish_details` block when the
+    request opted in via `thinking: {type: "enabled"}`. Mirrors
+    docs/specs/thinking-budget.md:43-58.
+
+    Level 1 phase-1/phase-2 reprompt is now wired up: when the model
+    fails to emit </think> within --think-max-tokens, the server force-
+    closes via a synthetic "</think>\\n\\nFinal answer: " reprompt and
+    runs phase-2 for the remaining budget. close_kind reflects the path
+    taken ("natural" for self-close, "hard" for force-close).
+    """
+
+    @pytest.mark.slow
+    def test_finish_details_present_when_thinking_opted_in(self):
+        body = {
+            "model": MODEL_NAME,
+            "messages": [{"role": "user", "content": "What is 2+2? Answer in one word."}],
+            "max_tokens": 256,
+            "thinking": {"type": "enabled"},
+            "temperature": 0,
+        }
+        r = post_json("/v1/chat/completions", body)
+        assert r.status_code == 200
+        choice = r.json()["choices"][0]
+        assert "finish_details" in choice, \
+            "finish_details missing despite thinking:{type:enabled}"
+        fd = choice["finish_details"]
+        assert fd["close_kind"] in {"natural", "hard"}
+        assert isinstance(fd["thinking_tokens"], int)
+        assert isinstance(fd["content_tokens"], int)
+        assert isinstance(fd["total_tokens"], int)
+        # Invariant: the two sub-counts sum to the total.
+        assert fd["thinking_tokens"] + fd["content_tokens"] == fd["total_tokens"]
+
+    def test_finish_details_absent_when_thinking_not_opted_in(self):
+        body = {
+            "model": MODEL_NAME,
+            "messages": [{"role": "user", "content": "Say hi"}],
+            "max_tokens": 16,
+            "temperature": 0,
+        }
+        r = post_json("/v1/chat/completions", body)
+        assert r.status_code == 200
+        choice = r.json()["choices"][0]
+        assert "finish_details" not in choice, \
+            "finish_details should only appear when thinking is opted in"
+
+    @pytest.mark.slow
+    def test_close_kind_natural_when_model_self_closes(self):
+        """An easy prompt with a generous budget should let the model emit
+        </think> well within --think-max-tokens, producing close_kind="natural"
+        (no phase-2 reprompt fires)."""
+        body = {
+            "model": MODEL_NAME,
+            "messages": [{"role": "user", "content":
+                          "What is 2+2? Answer in one word."}],
+            "max_tokens": 4096,
+            "thinking": {"type": "enabled"},
+            "temperature": 0,
+        }
+        r = post_json("/v1/chat/completions", body)
+        assert r.status_code == 200
+        fd = r.json()["choices"][0]["finish_details"]
+        assert fd["close_kind"] == "natural", \
+            f"expected natural close, got {fd['close_kind']}"
+        # Phase-2 did not fire — content_tokens stays 0 when the model
+        # self-closes (all generated tokens are reasoning + content interleaved
+        # via the emitter on the phase-1 stream), so the whole total is
+        # reported as thinking tokens.
+        assert fd["content_tokens"] == 0
+        assert fd["thinking_tokens"] == fd["total_tokens"]
+
+    @pytest.mark.skipif(
+        os.environ.get("THINK_MAX_TOKENS_LOW") != "1",
+        reason="requires server started with very low --think-max-tokens "
+               "(set THINK_MAX_TOKENS_LOW=1 to enable when the running "
+               "server was launched with e.g. --think-max-tokens 32)",
+    )
+    @pytest.mark.slow
+    def test_close_kind_hard_on_phase2_trigger(self):
+        """A think-heavy prompt with a deliberately tiny --think-max-tokens
+        should trigger phase-2: the model can't finish reasoning in time,
+        the server force-closes </think> and runs a Final-answer reprompt."""
+        body = {
+            "model": MODEL_NAME,
+            "messages": [{"role": "user", "content":
+                          "Reason step by step about the following: list the "
+                          "first 20 prime numbers, then explain why each is "
+                          "prime, then compute their sum. Be thorough."}],
+            "max_tokens": 4096,
+            "thinking": {"type": "enabled"},
+            "temperature": 0,
+        }
+        r = post_json("/v1/chat/completions", body)
+        assert r.status_code == 200
+        fd = r.json()["choices"][0]["finish_details"]
+        assert fd["close_kind"] == "hard", \
+            f"expected hard close, got {fd['close_kind']}"
+        assert fd["thinking_tokens"] > 0
+        assert fd["content_tokens"] > 0
+        assert fd["thinking_tokens"] + fd["content_tokens"] == fd["total_tokens"]
diff --git a/server/src/common/gguf_inspect.cpp b/server/src/common/gguf_inspect.cpp
index 95cc30c41..f8319f941 100644
--- a/server/src/common/gguf_inspect.cpp
+++ b/server/src/common/gguf_inspect.cpp
@@ -1,9 +1,14 @@
 #include "gguf_inspect.h"
 #include "gguf.h"
 
+#include <algorithm>
+#include <cstdint>
 #include <cstdio>
 #include <cstring>
+#include <fstream>
 #include <string>
+#include <sys/stat.h>
+#include <vector>
 
 namespace dflash::common {
 
@@ -36,4 +41,292 @@ GgufModelInfo inspect_gguf_model_info(const char * path) {
     return info;
 }
 
+// ─── SHA-256 (RFC 6234) ─────────────────────────────────────────────────
+//
+// Self-contained mini-implementation so we don't pull in OpenSSL just for
+// one hash. Performance is "fine" — hashing a 17 GB GGUF takes ~30s on a
+// fast NVMe, which is comparable to the per-file numbers `sha256sum` gets.
+// We sidecar the result so this only happens on the first server start
+// after a model is downloaded.
+
+namespace {
+
+struct Sha256Ctx {  // streaming SHA-256 state shared by sha256_init/update/final
+    uint32_t state[8];  // running hash words; seeded by sha256_init
+    uint64_t bit_len;  // total number of message bits passed to sha256_update
+    uint8_t  buf[64];  // partial 64-byte block carried over between updates
+    size_t   buf_len;  // number of valid bytes currently held in buf
+};
+
+inline uint32_t rotr32(uint32_t x, uint32_t n) {  // rotate x right by n bits; n is taken modulo 32
+    return (x >> (n & 31u)) | (x << ((32u - n) & 31u));  // masked counts: n == 0 or n >= 32 never shifts by 32 (UB)
+}
+
+void sha256_init(Sha256Ctx & c) {
+    // Reset the context: seed the eight hash words and clear the counters.
+    static const uint32_t kInit[8] = {
+        0x6a09e667u, 0xbb67ae85u, 0x3c6ef372u, 0xa54ff53au,
+        0x510e527fu, 0x9b05688cu, 0x1f83d9abu, 0x5be0cd19u,
+    };
+    for (int i = 0; i < 8; ++i) {
+        c.state[i] = kInit[i];
+    }
+    c.buf_len = 0;
+    c.bit_len = 0;
+}
+
+void sha256_compress(Sha256Ctx & c, const uint8_t * block) {  // folds one 64-byte block into c.state
+    static const uint32_t K[64] = {  // per-round constants, added into t1 in the round loop
+        0x428a2f98u,0x71374491u,0xb5c0fbcfu,0xe9b5dba5u,0x3956c25bu,0x59f111f1u,0x923f82a4u,0xab1c5ed5u,
+        0xd807aa98u,0x12835b01u,0x243185beu,0x550c7dc3u,0x72be5d74u,0x80deb1feu,0x9bdc06a7u,0xc19bf174u,
+        0xe49b69c1u,0xefbe4786u,0x0fc19dc6u,0x240ca1ccu,0x2de92c6fu,0x4a7484aau,0x5cb0a9dcu,0x76f988dau,
+        0x983e5152u,0xa831c66du,0xb00327c8u,0xbf597fc7u,0xc6e00bf3u,0xd5a79147u,0x06ca6351u,0x14292967u,
+        0x27b70a85u,0x2e1b2138u,0x4d2c6dfcu,0x53380d13u,0x650a7354u,0x766a0abbu,0x81c2c92eu,0x92722c85u,
+        0xa2bfe8a1u,0xa81a664bu,0xc24b8b70u,0xc76c51a3u,0xd192e819u,0xd6990624u,0xf40e3585u,0x106aa070u,
+        0x19a4c116u,0x1e376c08u,0x2748774cu,0x34b0bcb5u,0x391c0cb3u,0x4ed8aa4au,0x5b9cca4fu,0x682e6ff3u,
+        0x748f82eeu,0x78a5636fu,0x84c87814u,0x8cc70208u,0x90befffau,0xa4506cebu,0xbef9a3f7u,0xc67178f2u
+    };
+    uint32_t w[64];  // message schedule for this block
+    for (int i = 0; i < 16; ++i) {  // first 16 words: big-endian 32-bit loads from the block
+        w[i] = (uint32_t(block[i*4+0]) << 24) | (uint32_t(block[i*4+1]) << 16) |
+               (uint32_t(block[i*4+2]) << 8 ) |  uint32_t(block[i*4+3]);
+    }
+    for (int i = 16; i < 64; ++i) {  // remaining 48 words are derived from earlier schedule words
+        uint32_t s0 = rotr32(w[i-15], 7) ^ rotr32(w[i-15], 18) ^ (w[i-15] >> 3);
+        uint32_t s1 = rotr32(w[i-2], 17) ^ rotr32(w[i-2], 19) ^ (w[i-2] >> 10);
+        w[i] = w[i-16] + s0 + w[i-7] + s1;
+    }
+    uint32_t a = c.state[0], b = c.state[1], cc = c.state[2], d = c.state[3];  // working variables start from the current state
+    uint32_t e = c.state[4], f = c.state[5], g = c.state[6], h = c.state[7];  // third variable is `cc` because `c` names the context
+    for (int i = 0; i < 64; ++i) {  // 64 rounds, one schedule word and one constant each
+        uint32_t S1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
+        uint32_t ch = (e & f) ^ ((~e) & g);
+        uint32_t t1 = h + S1 + ch + K[i] + w[i];
+        uint32_t S0 = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
+        uint32_t mj = (a & b) ^ (a & cc) ^ (b & cc);
+        uint32_t t2 = S0 + mj;
+        h = g; g = f; f = e; e = d + t1;  // shift the working variables down by one
+        d = cc; cc = b; b = a; a = t1 + t2;
+    }
+    c.state[0] += a; c.state[1] += b; c.state[2] += cc; c.state[3] += d;  // add the round output back into the state (mod 2^32)
+    c.state[4] += e; c.state[5] += f; c.state[6] += g;  c.state[7] += h;
+}
+
+void sha256_update(Sha256Ctx & c, const uint8_t * data, size_t len) {
+    // Absorb `len` bytes: top up a pending partial block first, then
+    // compress whole blocks straight from `data`, then stash the tail.
+    c.bit_len += uint64_t(len) * 8;
+    if (c.buf_len != 0) {
+        const size_t take = (len < 64 - c.buf_len) ? len : 64 - c.buf_len;
+        std::memcpy(c.buf + c.buf_len, data, take);
+        c.buf_len += take;
+        data += take;
+        len  -= take;
+        if (c.buf_len == 64) {
+            sha256_compress(c, c.buf);
+            c.buf_len = 0;
+        }
+    }
+    for (; len >= 64; data += 64, len -= 64) {
+        sha256_compress(c, data);
+    }
+    if (len != 0) {
+        std::memcpy(c.buf, data, len);
+        c.buf_len = len;
+    }
+}
+
+std::string sha256_final(Sha256Ctx & c) {
+    // Padding: a single 0x80 byte, zeros up to offset 56, then the message
+    // length in bits as a big-endian 64-bit integer in the last 8 bytes.
+    const uint64_t total_bits = c.bit_len;
+    c.buf[c.buf_len++] = 0x80;
+    if (c.buf_len > 56) {
+        // No room left for the length field: flush an extra block first.
+        std::memset(c.buf + c.buf_len, 0, 64 - c.buf_len);
+        sha256_compress(c, c.buf);
+        c.buf_len = 0;
+    }
+    std::memset(c.buf + c.buf_len, 0, 56 - c.buf_len);
+    for (int i = 0; i < 8; ++i) {
+        c.buf[56 + i] = uint8_t((total_bits >> (56 - 8 * i)) & 0xff);
+    }
+    sha256_compress(c, c.buf);
+
+    // Hex-encode the eight state words, most significant byte first.
+    static const char digits[] = "0123456789abcdef";
+    std::string out(64, '0');
+    for (int k = 0; k < 32; ++k) {
+        const uint32_t word = c.state[k / 4];
+        const uint8_t byte = uint8_t((word >> (24 - (k % 4) * 8)) & 0xff);
+        out[2 * k]     = digits[byte >> 4];
+        out[2 * k + 1] = digits[byte & 0x0f];
+    }
+    return out;
+}
+
+// Streams `path` through SHA-256; returns lowercase hex, or "" on open/read failure.
+std::string sha256_of_file(const std::string & path) {
+    std::ifstream f(path, std::ios::binary);
+    if (!f) return {};
+    Sha256Ctx c;
+    sha256_init(c);
+    // 4 MiB heap buffer keeps the read chunk off the thread stack. A hard read
+    // error (badbit) yields "" so a digest of truncated data is never cached.
+    constexpr size_t BUF = 4 * 1024 * 1024;
+    std::vector<uint8_t> buf(BUF);
+    while (f) {
+        f.read(reinterpret_cast<char*>(buf.data()), BUF);
+        const std::streamsize got = f.gcount();
+        if (got > 0) sha256_update(c, buf.data(), size_t(got));
+    }
+    return f.bad() ? std::string{} : sha256_final(c);
+}
+
+// Map LLAMA_FTYPE_* int → operator-friendly tag (Q4_K_M, IQ4_XS, BF16, …).
+// Kept inline so we don't pull in llama.h here — those enum values are part
+// of the GGUF on-disk format and won't change without a format bump.
+const char * llama_ftype_name(int32_t v) {
+    static const struct { int32_t id; const char * tag; } kTable[] = {  // ids with no entry decode to ""
+        {0,    "F32"},
+        {1,    "F16"},
+        {2,    "Q4_0"},
+        {3,    "Q4_1"},
+        {7,    "Q8_0"},
+        {8,    "Q5_0"},
+        {9,    "Q5_1"},
+        {10,   "Q2_K"},
+        {11,   "Q3_K_S"},
+        {12,   "Q3_K_M"},
+        {13,   "Q3_K_L"},
+        {14,   "Q4_K_S"},
+        {15,   "Q4_K_M"},
+        {16,   "Q5_K_S"},
+        {17,   "Q5_K_M"},
+        {18,   "Q6_K"},
+        {19,   "IQ2_XXS"},
+        {20,   "IQ2_XS"},
+        {21,   "Q2_K_S"},
+        {22,   "IQ3_XS"},
+        {23,   "IQ3_XXS"},
+        {24,   "IQ1_S"},
+        {25,   "IQ4_NL"},
+        {26,   "IQ3_S"},
+        {27,   "IQ3_M"},
+        {28,   "IQ2_S"},
+        {29,   "IQ2_M"},
+        {30,   "IQ4_XS"},
+        {31,   "IQ1_M"},
+        {32,   "BF16"},
+        {36,   "TQ1_0"},
+        {37,   "TQ2_0"},
+        {38,   "MXFP4_MOE"},
+        {39,   "NVFP4"},
+        {40,   "Q1_0"},
+        {1024, "GUESSED"},
+    };
+    for (const auto & e : kTable) if (e.id == v) return e.tag;  // linear scan; the table is tiny
+    return "";
+}
+
+bool read_sidecar_sha(const std::string & path, std::string & out) {
+    // Reads `<path>.sha256`; only the first token counts, so `<hex>  filename` (sha256sum format) works.
+    std::ifstream sidecar(path + ".sha256");
+    if (!sidecar) return false;
+    std::string token;
+    sidecar >> token;
+    const bool well_formed = token.size() == 64 &&
+        std::all_of(token.begin(), token.end(), [](char ch) {
+            return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f');
+        });
+    if (well_formed) out = std::move(token);
+    return well_formed;
+}
+
+void write_sidecar_sha(const std::string & path, const std::string & sha) {
+    // Best-effort cache write: when `<path>.sha256` cannot be opened (for
+    // example a read-only model directory) the hash is simply not persisted;
+    // the caller still holds the in-memory value for this run.
+    const std::string sidecar_path = path + ".sha256";
+    std::ofstream sidecar(sidecar_path);
+    if (sidecar) sidecar << sha << "\n";
+}
+
+}  // namespace
+
+// Gathers path, size, selected GGUF header keys and (optionally) the SHA-256 for /props.
+GgufMetadata read_gguf_metadata(const std::string & path,
+                                bool compute_sha256) {
+    GgufMetadata m;
+    m.path = path;
+
+    struct stat st{};
+    if (::stat(path.c_str(), &st) == 0) {
+        m.size_bytes = int64_t(st.st_size);
+    }
+
+    gguf_init_params gip{};
+    gip.no_alloc = true;
+    gip.ctx = nullptr;
+    gguf_context * gctx = gguf_init_from_file(path.c_str(), gip);
+    if (!gctx) {
+        // No GGUF header → bail. Still report path/size if we got them.
+        return m;
+    }
+    m.ok = true;
+
+    // ggml's typed getters assert on a type mismatch: skip wrongly-typed keys instead of aborting.
+    auto get_str = [&](const char * key, std::string & out) {
+        const int64_t id = gguf_find_key(gctx, key);
+        if (id < 0 || gguf_get_kv_type(gctx, id) != GGUF_TYPE_STRING) return;
+        if (const char * v = gguf_get_val_str(gctx, id)) out = v;
+    };
+    auto get_u32 = [&](const char * key, int32_t & out) {
+        const int64_t id = gguf_find_key(gctx, key);
+        if (id < 0 || gguf_get_kv_type(gctx, id) != GGUF_TYPE_UINT32) return;
+        out = int32_t(gguf_get_val_u32(gctx, id));
+    };
+
+    get_str("general.architecture",         m.general_architecture);
+    get_str("general.name",                 m.general_name);
+    get_u32("general.file_type",            m.file_type);
+    get_u32("general.quantization_version", m.quantization_version);
+    if (m.file_type >= 0) {
+        if (const char * name = llama_ftype_name(m.file_type)) m.file_type_name = name;
+    }
+
+    if (!m.general_architecture.empty()) {
+        const std::string a = m.general_architecture;
+        get_u32((a + ".block_count").c_str(),      m.block_count);
+        get_u32((a + ".embedding_length").c_str(), m.embedding_length);
+        get_u32((a + ".context_length").c_str(),   m.context_length);
+        // vocab_size: prefer the explicit <arch>.vocab_size key. Fall back
+        // to the tokenizer token array length (the canonical source on
+        // models that don't write the redundant key).
+        get_u32((a + ".vocab_size").c_str(),       m.vocab_size);
+    }
+    if (m.vocab_size < 0) {
+        const int64_t toks_id = gguf_find_key(gctx, "tokenizer.ggml.tokens");
+        if (toks_id >= 0 && gguf_get_kv_type(gctx, toks_id) == GGUF_TYPE_ARRAY) {
+            m.vocab_size = int32_t(gguf_get_arr_n(gctx, toks_id));
+        }
+    }
+
+    gguf_free(gctx);
+
+    if (compute_sha256) {
+        std::string cached;
+        if (read_sidecar_sha(path, cached)) {
+            m.sha256 = std::move(cached);
+        } else {
+            std::string hash = sha256_of_file(path);
+            if (!hash.empty()) {
+                m.sha256 = hash;
+                write_sidecar_sha(path, hash);
+            }
+        }
+    }
+
+    return m;
+}
+
 }  // namespace dflash::common
diff --git a/server/src/common/gguf_inspect.h b/server/src/common/gguf_inspect.h
index 11c11379e..6e7a15827 100644
--- a/server/src/common/gguf_inspect.h
+++ b/server/src/common/gguf_inspect.h
@@ -5,6 +5,7 @@
 
 #pragma once
 
+#include <cstdint>
 #include <string>
 
 namespace dflash::common {
@@ -18,4 +19,43 @@ struct GgufModelInfo {
 // Returns info with arch="" and n_layer=-1 on failure.
 GgufModelInfo inspect_gguf_model_info(const char * path);
 
+// Richer GGUF identity captured at server startup and re-emitted at /props.
+// All header values are best-effort: missing keys leave the corresponding
+// field at the listed default (empty string or -1). `ok` is false only if
+// the file itself couldn't be opened (path missing, not a GGUF, etc.).
+//
+// The intent is "exactly what binary + GGUF + quant + sha256 is loaded";
+// any field the file doesn't carry stays at the default so consumers can
+// distinguish "not in GGUF" (-1) from "0" (legitimately zero).
+struct GgufMetadata {
+    bool        ok          = false;        // false: open failed, all other fields ignorable
+    std::string path;                       // absolute filesystem path passed in
+    int64_t     size_bytes  = -1;           // file size (-1 if stat failed)
+    std::string sha256;                     // lowercase hex sha256 (empty if not computed)
+
+    // Header fields (`general.*` + `<arch>.*`). All optional.
+    std::string general_architecture;       // raw value of "general.architecture"
+    std::string general_name;               // "general.name" (display string)
+    int32_t     file_type        = -1;      // "general.file_type" (LLAMA_FTYPE_* int)
+    std::string file_type_name;             // decoded LLAMA_FTYPE_* (e.g. "Q4_K_M", "IQ4_XS")
+    int32_t     quantization_version = -1;  // "general.quantization_version"
+
+    int32_t     block_count       = -1;     // "<arch>.block_count"
+    int32_t     embedding_length  = -1;     // "<arch>.embedding_length"
+    int32_t     context_length    = -1;     // "<arch>.context_length"
+    int32_t     vocab_size        = -1;     // "<arch>.vocab_size" (or tokenizer.ggml.tokens length)
+};
+
+// Read GGUF identity for /props. Set `compute_sha256` to hash the file (slow,
+// O(size) — multi-GB GGUFs take ~30s on a fast SSD). When false, `sha256`
+// stays empty. The header read is cheap (no weight load).
+//
+// When `compute_sha256` is true and a sidecar file `<path>.sha256` exists,
+// its first 64-hex-char token is trusted as the file's sha256 and the file
+// is not re-hashed. After a successful hash, the result is written to the
+// sidecar so subsequent restarts skip the rehash. Sidecar I/O failures are
+// non-fatal — the in-memory hash still gets returned.
+GgufMetadata read_gguf_metadata(const std::string & path,
+                                bool compute_sha256);
+
 }  // namespace dflash::common
diff --git a/server/src/common/layer_split_backend.cpp b/server/src/common/layer_split_backend.cpp
index e45cbc104..e9e8b4c2e 100644
--- a/server/src/common/layer_split_backend.cpp
+++ b/server/src/common/layer_split_backend.cpp
@@ -57,8 +57,7 @@ GenerateResult LayerSplitBackend::run_from_state(const GenerateRequest & req,
         result.error = "context";
         return result;
     }
-    if (req.do_sample && req.sampler.needs_logit_processing() &&
-        !adapter_->supports_cpu_sampling()) {
+    if (req.do_sample && req.sampler.temp > 0.0f) {
         result.error = "sampling_unsupported";
         return result;
     }
diff --git a/server/src/common/layer_split_backend.h b/server/src/common/layer_split_backend.h
index e85b5ad6a..76c336fa2 100644
--- a/server/src/common/layer_split_backend.h
+++ b/server/src/common/layer_split_backend.h
@@ -31,7 +31,6 @@ class LayerSplitAdapter {
     virtual bool decode_ar(int last_tok, int committed, int n_gen,
                            std::vector<int32_t> & out_tokens,
                            const DaemonIO & io) = 0;
-    virtual bool supports_cpu_sampling() const { return false; }
 
     virtual bool can_dflash_decode() const { return false; }
     virtual bool decode_dflash(const std::vector<int32_t> & prompt,
diff --git a/server/src/common/model_backend.h b/server/src/common/model_backend.h
index b808d0c39..088b354d7 100644
--- a/server/src/common/model_backend.h
+++ b/server/src/common/model_backend.h
@@ -102,6 +102,10 @@ struct GenerateRequest {
     const std::vector<int32_t> * hint_tokens = nullptr;
     // Optional thinking-budget hook — see BudgetHook docs above.
     BudgetHook                 budget_hook;
+    // Per-request override for target spec-decode verify fa_window. Set by
+    // http_server when pflash compresses, so verify sees the entire compressed
+    // prompt (not just the last cfg_.fa_window positions). Zero = no override.
+    int                        fa_window_override = 0;
     // Common retry knob. Upper layers set this after a speculative decode
     // path returns success but emits no tokens, so each backend can route the
     // retry through its existing AR path without copying retry policy.
@@ -251,6 +255,10 @@ struct ModelBackend {
         std::string          drafter_path;    // GGUF path (for lazy-load)
         int                  drafter_gpu = 0;  // backend-local GPU for PFlash drafter
         bool                 skip_park = false; // true on >=32GB GPUs
+        // Per-request transitive-cascade override (-1 = use env default).
+        // 0 = off (agentic path: suppress cascade to avoid anchor bloat).
+        // 1 = on  (retrieval path: full expansion, same as today).
+        int                  use_transitive = -1;
         DraftResidencyAction residency_action = DraftResidencyAction::KeepLoaded;
     };
 
diff --git a/server/src/draft/draft_gguf_loader.cpp b/server/src/draft/draft_gguf_loader.cpp
index fbec7263b..73a9c17bd 100644
--- a/server/src/draft/draft_gguf_loader.cpp
+++ b/server/src/draft/draft_gguf_loader.cpp
@@ -349,6 +349,63 @@ bool load_draft_gguf(const std::string & path,
 
     gguf_free(gctx);
 
+    // Structural defense: derive scalar dims from weight tensor shapes and
+    // assert against GGUF-declared metadata (Bug #2 class prevention).
+    // All draft layers have wq/wk (no deltanet mix), so use layer 0.
+    // wq is plain Q-only (no gate), so ne[1] = n_head * head_dim.
+    // fc is [n_target_layers*n_embd, n_embd], so ne[0] = n_target_layers*n_embd.
+    {
+        const DraftLayer & L0 = out.layers[0];
+        const int64_t derived_q_dim  = L0.wq->ne[1];
+        const int64_t derived_kv_dim = L0.wk->ne[1];
+        const int64_t expected_q_dim  = (int64_t)out.n_head * out.head_dim;
+        const int64_t expected_kv_dim = (int64_t)out.n_head_kv * out.head_dim;
+        if (derived_q_dim != expected_q_dim) {
+            char buf[256];
+            std::snprintf(buf, sizeof(buf),
+                "draft GGUF shape mismatch: blk.0.attn_q.weight->ne[1]=%lld "
+                "!= n_head*head_dim=%d*%d=%lld",
+                (long long)derived_q_dim,
+                out.n_head, out.head_dim, (long long)expected_q_dim);
+            set_last_error(buf);
+            return false;
+        }
+        if (derived_kv_dim != expected_kv_dim) {
+            char buf[256];
+            std::snprintf(buf, sizeof(buf),
+                "draft GGUF shape mismatch: blk.0.attn_k.weight->ne[1]=%lld "
+                "!= n_head_kv*head_dim=%d*%d=%lld",
+                (long long)derived_kv_dim,
+                out.n_head_kv, out.head_dim, (long long)expected_kv_dim);
+            set_last_error(buf);
+            return false;
+        }
+        const int64_t derived_n_embd = L0.wq->ne[0];
+        if (derived_n_embd != (int64_t)out.n_embd) {
+            char buf[256];
+            std::snprintf(buf, sizeof(buf),
+                "draft GGUF shape mismatch: blk.0.attn_q.weight->ne[0]=%lld != n_embd=%d",
+                (long long)derived_n_embd, out.n_embd);
+            set_last_error(buf);
+            return false;
+        }
+        // fc: [n_target_layers*n_embd, n_embd] — check fc->ne[0] against derived expectation
+        if (out.n_target_layers > 0) {
+            const int64_t derived_fc_in  = out.fc->ne[0];
+            const int64_t expected_fc_in = (int64_t)out.n_target_layers * out.n_embd;
+            if (derived_fc_in != expected_fc_in) {
+                char buf[256];
+                std::snprintf(buf, sizeof(buf),
+                    "draft GGUF shape mismatch: dflash.fc.weight->ne[0]=%lld "
+                    "!= n_target_layers*n_embd=%d*%d=%lld",
+                    (long long)derived_fc_in,
+                    out.n_target_layers, out.n_embd, (long long)expected_fc_in);
+                set_last_error(buf);
+                return false;
+            }
+        }
+    }
+
     char summary[192];
     std::snprintf(summary, sizeof(summary),
         "draft GGUF loaded: %" PRId64 " tensors, %.2f GiB on GPU",
diff --git a/server/src/gemma4/gemma4_graph.cpp b/server/src/gemma4/gemma4_graph.cpp
index bc2276555..bf8f8ce7c 100644
--- a/server/src/gemma4/gemma4_graph.cpp
+++ b/server/src/gemma4/gemma4_graph.cpp
@@ -515,14 +515,13 @@ bool build_gemma4_layer_step(
     return ggml_gallocr_alloc_graph(sg.alloc, sg.gf);
 }
 
-bool compute_gemma4_split_projection(
+bool compute_gemma4_split_argmax(
     ggml_backend_t          backend,
     const Gemma4Weights &   w,
     ggml_tensor *           act,
     int                     token_offset,
     int                     n_tokens,
-    std::vector<int32_t> *  out_argmax,
-    std::vector<float> *    out_logits) {
+    std::vector<int32_t> &  out_argmax) {
     ggml_init_params ip{};
     ip.mem_size = ggml_tensor_overhead() * 64 + ggml_graph_overhead() + 1024 * 1024;
     ip.no_alloc = true;
@@ -540,17 +539,9 @@ bool compute_gemma4_split_projection(
         cur = ggml_tanh(ctx, cur);
         cur = ggml_scale(ctx, cur, w.final_logit_softcap);
     }
-    ggml_tensor * logits = cur;
-    ggml_tensor * argmax = nullptr;
-    if (out_logits) {
-        ggml_set_output(logits);
-        ggml_build_forward_expand(gf, logits);
-    }
-    if (out_argmax) {
-        argmax = ggml_argmax(ctx, logits);
-        ggml_set_output(argmax);
-        ggml_build_forward_expand(gf, argmax);
-    }
+    cur = ggml_argmax(ctx, cur);
+    ggml_set_output(cur);
+    ggml_build_forward_expand(gf, cur);
 
     ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
     if (!alloc || !ggml_gallocr_alloc_graph(alloc, gf)) {
@@ -563,32 +554,14 @@ bool compute_gemma4_split_projection(
         ggml_free(ctx);
         return false;
     }
-    if (out_argmax) {
-        out_argmax->resize((size_t)n_tokens);
-        ggml_backend_tensor_get(argmax, out_argmax->data(), 0,
-                                sizeof(int32_t) * (size_t)n_tokens);
-    }
-    if (out_logits) {
-        out_logits->resize((size_t)w.n_vocab * (size_t)n_tokens);
-        ggml_backend_tensor_get(logits, out_logits->data(), 0,
-                                sizeof(float) * (size_t)w.n_vocab * (size_t)n_tokens);
-    }
+    out_argmax.resize((size_t)n_tokens);
+    ggml_backend_tensor_get(cur, out_argmax.data(), 0,
+                            sizeof(int32_t) * (size_t)n_tokens);
     ggml_gallocr_free(alloc);
     ggml_free(ctx);
     return true;
 }
 
-bool compute_gemma4_split_argmax(
-    ggml_backend_t          backend,
-    const Gemma4Weights &   w,
-    ggml_tensor *           act,
-    int                     token_offset,
-    int                     n_tokens,
-    std::vector<int32_t> &  out_argmax) {
-    return compute_gemma4_split_projection(
-        backend, w, act, token_offset, n_tokens, &out_argmax, nullptr);
-}
-
 bool gemma4_step(
     ggml_backend_t          backend,
     const Gemma4Weights &   w,
diff --git a/server/src/gemma4/gemma4_layer_split_adapter.cpp b/server/src/gemma4/gemma4_layer_split_adapter.cpp
index b212cdd4c..4e7c6a877 100644
--- a/server/src/gemma4/gemma4_layer_split_adapter.cpp
+++ b/server/src/gemma4/gemma4_layer_split_adapter.cpp
@@ -146,10 +146,7 @@ bool Gemma4LayerSplitAdapter::init() {
 }
 
 void Gemma4LayerSplitAdapter::begin_request(const GenerateRequest & req) {
-    sampler_ = req.sampler;
-    if (req.do_sample && sampler_.seed != 0) {
-        sampler_rng_.seed(sampler_.seed);
-    }
+    (void)req;
 }
 
 void Gemma4LayerSplitAdapter::reset_request_state() {
@@ -157,14 +154,12 @@ void Gemma4LayerSplitAdapter::reset_request_state() {
         shard.cache.cur_pos = 0;
         shard.cache.last_tok = -1;
     }
-    prefill_last_logits_.clear();
 }
 
 bool Gemma4LayerSplitAdapter::run_forward(
         const std::vector<int32_t> & tokens,
         int base_pos,
-        int & last_tok,
-        std::vector<float> * logits_out) {
+        int & last_tok) {
     if (shards_.empty() || tokens.empty()) return false;
     const Gemma4Weights & ref = shards_.front().weights;
     const int hidden = ref.n_embd;
@@ -347,9 +342,9 @@ bool Gemma4LayerSplitAdapter::run_forward(
 
     std::vector<int32_t> argmax;
     Gemma4LayerSplitShard & last = shards_.back();
-    const bool ok = compute_gemma4_split_projection(
+    const bool ok = compute_gemma4_split_argmax(
         last.backend, last.weights, act_in,
-        n_tokens_total - 1, 1, &argmax, logits_out);
+        n_tokens_total - 1, 1, argmax);
     activation_buffer_free(orig);
     activation_pair_free(acts);
     if (!ok || argmax.empty()) return false;
@@ -364,7 +359,7 @@ bool Gemma4LayerSplitAdapter::run_forward(
 bool Gemma4LayerSplitAdapter::prefill(const std::vector<int32_t> & prompt,
                                       int base_pos,
                                       int & last_tok) {
-    return run_forward(prompt, base_pos, last_tok, &prefill_last_logits_);
+    return run_forward(prompt, base_pos, last_tok);
 }
 
 bool Gemma4LayerSplitAdapter::decode_ar(
@@ -377,13 +372,6 @@ bool Gemma4LayerSplitAdapter::decode_ar(
     if (shards_.empty()) return false;
 
     const auto & w = shards_.front().weights;
-    const int vocab = w.n_vocab;
-    std::vector<float> logits_buf;
-    if (sampler_.needs_logit_processing()) {
-        if ((int)prefill_last_logits_.size() != vocab) return false;
-        last_tok = sample_logits(prefill_last_logits_.data(), vocab, sampler_,
-                                 out_tokens, sampler_rng_);
-    }
     out_tokens.push_back(last_tok);
     io.emit(last_tok);
     if (io.cancelled) {
@@ -399,16 +387,7 @@ bool Gemma4LayerSplitAdapter::decode_ar(
     for (int i = 1; i < n_gen; ++i) {
         std::vector<int32_t> one(1, last_tok);
         int next_tok = -1;
-        logits_buf.clear();
-        if (!run_forward(one, committed - 1, next_tok,
-                         sampler_.needs_logit_processing() ? &logits_buf : nullptr)) {
-            return false;
-        }
-        if (sampler_.needs_logit_processing()) {
-            if ((int)logits_buf.size() != vocab) return false;
-            next_tok = sample_logits(logits_buf.data(), vocab, sampler_,
-                                     out_tokens, sampler_rng_);
-        }
+        if (!run_forward(one, committed - 1, next_tok)) return false;
         last_tok = next_tok;
         out_tokens.push_back(last_tok);
         io.emit(last_tok);
@@ -482,7 +461,6 @@ bool Gemma4LayerSplitAdapter::snapshot_save(int slot) {
     }
     snap.cur_pos = snap_pos;
     snap.last_tok = shards_.front().cache.last_tok;
-    snap.prefill_last_logits = prefill_last_logits_;
     return true;
 }
 
@@ -494,7 +472,6 @@ void Gemma4LayerSplitAdapter::snapshot_free(int slot) {
     }
     snap.cur_pos = 0;
     snap.last_tok = -1;
-    snap.prefill_last_logits.clear();
     if (snap.shards.size() != shards_.size()) snap.shards.resize(shards_.size());
 }
 
@@ -505,7 +482,6 @@ bool Gemma4LayerSplitAdapter::snapshot_used(int slot) const {
     }
     const auto & snap = snapshots_[(size_t)slot];
     if (snap.cur_pos <= 0 || snap.shards.size() != shards_.size()) return false;
-    if (snap.prefill_last_logits.empty()) return false;
     for (const auto & ss : snap.shards) {
         if (!ss.ctx) return false;
     }
@@ -545,7 +521,6 @@ bool Gemma4LayerSplitAdapter::snapshot_restore(int slot) {
         shards_[i].cache.cur_pos = snap.cur_pos;
         shards_[i].cache.last_tok = snap.last_tok;
     }
-    prefill_last_logits_ = snap.prefill_last_logits;
     return true;
 }
 
diff --git a/server/src/gemma4/gemma4_layer_split_adapter.h b/server/src/gemma4/gemma4_layer_split_adapter.h
index b4238fd63..3a3050f7e 100644
--- a/server/src/gemma4/gemma4_layer_split_adapter.h
+++ b/server/src/gemma4/gemma4_layer_split_adapter.h
@@ -30,7 +30,6 @@ struct Gemma4LayerSplitSnapshot {
     int cur_pos = 0;
     int32_t last_tok = -1;
     std::vector<Gemma4Snapshot> shards;
-    std::vector<float> prefill_last_logits;
 };
 
 class Gemma4LayerSplitAdapter : public LayerSplitAdapter {
@@ -53,7 +52,6 @@ class Gemma4LayerSplitAdapter : public LayerSplitAdapter {
     bool decode_ar(int last_tok, int committed, int n_gen,
                    std::vector<int32_t> & out_tokens,
                    const DaemonIO & io) override;
-    bool supports_cpu_sampling() const override { return true; }
 
     bool snapshot_save(int slot) override;
     void snapshot_free(int slot) override;
@@ -68,17 +66,13 @@ class Gemma4LayerSplitAdapter : public LayerSplitAdapter {
 private:
     bool run_forward(const std::vector<int32_t> & tokens,
                      int base_pos,
-                     int & last_tok,
-                     std::vector<float> * logits_out = nullptr);
+                     int & last_tok);
 
     Gemma4LayerSplitAdapterConfig cfg_;
     std::vector<Gemma4LayerSplitShard> shards_;
     std::vector<ggml_backend_t> snapshot_backends_;
     std::vector<Gemma4LayerSplitSnapshot> snapshots_;
     static constexpr int PREFIX_SLOTS = ModelBackend::kMaxSlots;
-    SamplerCfg sampler_;
-    std::mt19937_64 sampler_rng_{std::random_device{}()};
-    std::vector<float> prefill_last_logits_;
 };
 
 void free_gemma4_layer_split_shards(std::vector<Gemma4LayerSplitShard> & shards);
diff --git a/server/src/qwen3/qwen3_backend.cpp b/server/src/qwen3/qwen3_backend.cpp
index 253886978..b42aac96e 100644
--- a/server/src/qwen3/qwen3_backend.cpp
+++ b/server/src/qwen3/qwen3_backend.cpp
@@ -952,7 +952,9 @@ ModelBackend::CompressResult Qwen3Backend::compress(const CompressRequest & req)
     }
 
     result.compressed_ids = drafter_score_and_compress(
-        drafter_ctx_, req.input_ids, req.keep_ratio);
+        drafter_ctx_, req.input_ids, req.keep_ratio,
+        /*chunk_size=*/32, /*n_lookahead=*/8, /*pool_kernel=*/13,
+        req.use_transitive);
     result.ok = true;
 
     if (req.residency_action == DraftResidencyAction::ReleaseAfterUse) {
diff --git a/server/src/qwen3/qwen3_graph.cpp b/server/src/qwen3/qwen3_graph.cpp
index a23bcefb3..c2715a356 100644
--- a/server/src/qwen3/qwen3_graph.cpp
+++ b/server/src/qwen3/qwen3_graph.cpp
@@ -5,23 +5,10 @@
 // buffers. Sliding-window flash-attention via ggml-cuda's tensor-core
 // `flash_attn_ext` keeps attention cost linear in S.
 //
-// **Algorithmic note vs blog**:
-//   The blog stack is Liu Q-hook tail scoring + FlashPrefill block-sparse FA.
-//   The Liu Q-hook is implemented with a NoPE fix: by default (DFLASH_FP_NOPE_TAIL=1)
-//   the tail score uses pre-RoPE K/Q, removing the RoPE distance decay that
-//   buries early-position needle chunks and was causing NIAH failures.
-//   Set DFLASH_FP_NOPE_TAIL=0 to revert to post-RoPE scoring.  The block-sparse FA is replaced
-//   with a sliding-window approximation here because (a) ggml-cuda's
-//   `flash_attn_ext` already gives tensor-core speed inside the ubatch
-//   graph, and (b) our own block-sparse CUDA kernel needs a tensor-core
-//   rewrite (mma.sync.aligned) to actually beat ggml's FA — see
-//   `src/flashprefill_kernels.cu` for the (slow) scalar reference path.
-//   At S=140K with W=512 sliding window the NIAH magic key still propagates
-//   through 28 layers and is recovered in the kept tokens, so this
-//   approximation passes the actual e2e correctness check the user cares
-//   about. The block-sparse FA upgrade remains the next deliverable for
-//   "match the article algorithmically", but is functionally equivalent
-//   for the deployed perf budget today.
+// Tail score uses pre-RoPE K/Q (DFLASH_FP_NOPE_TAIL=1 default) to remove
+// distance decay that buries early-position needle chunks (NIAH fix).
+// Block-sparse FA replaced by sliding-window via ggml-cuda flash_attn_ext;
+// BSA upgrade tracked in flashprefill_kernels.cu.
 //
 // Memory at S=140K, B=1, H=16, Hk=8, D=128, hidden=1024, ff=3072:
 //   weights                                            ~1.5 GB
@@ -35,6 +22,7 @@
 #include "qwen3_drafter_model.h"
 #include "internal.h"
 #include "flashprefill.h"
+#include "../common/score_range.h"
 
 #include "device_runtime.h"
 
@@ -249,13 +237,30 @@ bool forward_qwen3_drafter_model(
     }
     running_max.assign((size_t)n_lookahead * S, -INFINITY);
 
+    // Pre-compute score range to skip K_norope alloc for non-scoring layers.
+    // At S=128K this trims ~5.6 GB (21 × 268 MB); see test_drafter_warm_path_regression.
+    // Both knobs are read from the environment once (function-local statics) and
+    // collapse to -1 when unset, unparsable, or non-positive.
+    static const auto env_positive_int = [](const char * name) -> int {
+        const char * raw = std::getenv(name);
+        const int parsed = raw ? std::atoi(raw) : 0;
+        return parsed > 0 ? parsed : -1;
+    };
+    static const int score_layers_pre = env_positive_int("PFLASH_DRAFTER_SCORE_LAYERS");
+    static const int early_exit_pre   = env_positive_int("PFLASH_DRAFTER_EARLY_EXIT_N");
+    // Early exit only applies when it actually truncates the layer stack.
+    const bool early_exit_active   = early_exit_pre > 0 && early_exit_pre < w.n_layer;
+    const int  fwd_layer_limit_pre = early_exit_active ? early_exit_pre : w.n_layer;
+    const ScoreRange pre_range = compute_score_range(w.n_layer, score_layers_pre, fwd_layer_limit_pre);
+    const int score_layer_start_pre = pre_range.start;
+    const int n_score_layers = pre_range.count();
+
     PersBuf hidden_buf, pos_buf, mask_tail_buf, Q_buf, attn_out_buf;
     std::vector<PersBuf> K_curr_v((size_t)w.n_layer);
     std::vector<PersBuf> V_curr_v((size_t)w.n_layer);
     std::vector<PersBuf> Q_last_v((size_t)w.n_layer);
-    // NoPE: pre-RoPE K (full sequence) and Q tail; allocated only when nope_tail.
-    std::vector<PersBuf> K_norope_v(nope_tail ? (size_t)w.n_layer : 0);
-    std::vector<PersBuf> Q_norope_v(nope_tail ? (size_t)w.n_layer : 0);
+    std::vector<PersBuf> K_norope_v(nope_tail ? (size_t)n_score_layers : 0);
+    std::vector<PersBuf> Q_norope_v(nope_tail ? (size_t)n_score_layers : 0);
     auto cleanup_all = [&]() {
         free_pers(hidden_buf);
         free_pers(pos_buf);
@@ -294,9 +299,10 @@ bool forward_qwen3_drafter_model(
                 cleanup_all();
                 return false;
             }
-            if (nope_tail) {
-                if (!make_pers(w.backend, half_type, 3, d_kv, K_norope_v[il]) ||
-                    !make_pers(w.backend, GGML_TYPE_F32, 3, d_ql, Q_norope_v[il])) {
+            if (nope_tail && il >= score_layer_start_pre && il < fwd_layer_limit_pre) {
+                const int si = il - score_layer_start_pre;
+                if (!make_pers(w.backend, half_type, 3, d_kv, K_norope_v[si]) ||
+                    !make_pers(w.backend, GGML_TYPE_F32, 3, d_ql, Q_norope_v[si])) {
                     set_last_error("forward_qwen3: K_norope/Q_norope alloc failed at layer " + std::to_string(il));
                     cleanup_all();
                     return false;
@@ -372,7 +378,10 @@ bool forward_qwen3_drafter_model(
     double t_b_warm = 0.0, t_b_setup = 0.0, t_b_alloc = 0.0, t_b_copy_in = 0.0, t_b_norm = 0.0, t_compute_b = 0.0, t_b_copy_out = 0.0;
     double t_fp = 0.0;
 
-    for (int il = 0; il < w.n_layer; ++il) {
+    // Reuse the bound that sized the NoPE buffers so the two can never disagree.
+    const int fwd_layer_limit = fwd_layer_limit_pre;
+
+    for (int il = 0; il < fwd_layer_limit; ++il) {
         const auto & L = w.layers[il];
         const bool debug_first_layer = (il == 0 && std::getenv("DFLASH_FP_DEBUG_LAYER0") != nullptr);
 
@@ -411,19 +420,22 @@ bool forward_qwen3_drafter_model(
 
             ggml_tensor * Q = ggml_mul_mat(gA, L.wq, h_norm);
             Q = ggml_reshape_3d(gA, Q, D, H, cl);
-            Q = ggml_rms_norm(gA, Q, eps);
-            Q = ggml_mul(gA, Q, L.q_norm);
-            // NoPE: capture pre-RoPE Q tail so the tail scorer is not biased by distance.
-            if (nope_tail) {
+            if (L.q_norm) {
+                Q = ggml_rms_norm(gA, Q, eps);
+                Q = ggml_mul(gA, Q, L.q_norm);
+            }
+            // NoPE: capture pre-RoPE Q tail (only for layers that will be scored).
+            if (nope_tail && il >= score_layer_start_pre) {
+                const int si = il - score_layer_start_pre;
                 const int tail_lo_nr = S - n_lookahead;
-                if (tail_lo_nr >= cs && tail_lo_nr < cs + cl) {
+                if (tail_lo_nr >= cs && tail_lo_nr + n_lookahead <= cs + cl) {
                     const int local_lo_nr = tail_lo_nr - cs;
                     ggml_tensor * Q_prenrope_tail = ggml_view_3d(
                         gA, Q, D, H, n_lookahead,
                         Q->nb[1], Q->nb[2],
                         (size_t)local_lo_nr * Q->nb[2]);
                     ggml_build_forward_expand(gfA,
-                        ggml_cpy(gA, Q_prenrope_tail, Q_norope_v[il].t));
+                        ggml_cpy(gA, Q_prenrope_tail, Q_norope_v[si].t));
                 }
             }
             Q = ggml_rope_ext(gA, Q, pos_chunk, nullptr, D,
@@ -432,12 +444,15 @@ bool forward_qwen3_drafter_model(
 
             ggml_tensor * K = ggml_mul_mat(gA, L.wk, h_norm);
             K = ggml_reshape_3d(gA, K, D, Hk, cl);
-            K = ggml_rms_norm(gA, K, eps);
-            K = ggml_mul(gA, K, L.k_norm);
-            // NoPE: save pre-RoPE K chunk alongside K_curr_v.
-            if (nope_tail) {
-                const size_t kn_esz = ggml_element_size(K_norope_v[il].t);
-                ggml_tensor * Kn_dst = ggml_view_3d(gA, K_norope_v[il].t, D, Hk, cl,
+            if (L.k_norm) {
+                K = ggml_rms_norm(gA, K, eps);
+                K = ggml_mul(gA, K, L.k_norm);
+            }
+            // NoPE: save pre-RoPE K chunk (only for layers that will be scored).
+            if (nope_tail && il >= score_layer_start_pre) {
+                const int si = il - score_layer_start_pre;
+                const size_t kn_esz = ggml_element_size(K_norope_v[si].t);
+                ggml_tensor * Kn_dst = ggml_view_3d(gA, K_norope_v[si].t, D, Hk, cl,
                                                     kn_esz * D, kn_esz * D * Hk,
                                                     (size_t)cs * kn_esz * D * Hk);
                 ggml_build_forward_expand(gfA, ggml_cpy(gA, K, Kn_dst));
@@ -466,7 +481,7 @@ bool forward_qwen3_drafter_model(
 
             // Copy Q tail to Q_last_v[il] in the chunk that contains the tail.
             const int tail_lo = S - n_lookahead;
-            if (tail_lo >= cs && tail_lo < cs + cl) {
+            if (tail_lo >= cs && tail_lo + n_lookahead <= cs + cl) {
                 int local_lo = tail_lo - cs;
                 ggml_tensor * Q_tail_local = ggml_view_3d(
                     gA, Q, D, H, n_lookahead,
@@ -707,12 +722,12 @@ bool forward_qwen3_drafter_model(
         }
 #endif
 
-        if (il == 0 || il == w.n_layer - 1) {
+        if (il == 0 || il == fwd_layer_limit - 1) {
             std::fprintf(stderr,
                          "[qwen3-0.6b-fp] layer %d/%d done "
                          "(A_setup=%.3fs A_alloc=%.3fs A_compute=%.3fs FP=%.3fs "
                          "B_warm=%.3fs B_setup=%.3fs B_alloc=%.3fs B_copy_in=%.3fs B_norm=%.3fs B_compute=%.3fs B_copy_out=%.3fs)\n",
-                         il + 1, w.n_layer,
+                         il + 1, fwd_layer_limit,
                          t_a_setup, t_a_alloc, t_compute_a, t_fp,
                          t_b_warm, t_b_setup, t_b_alloc, t_b_copy_in, t_b_norm, t_compute_b, t_b_copy_out);
             std::fflush(stderr);
@@ -724,19 +739,28 @@ bool forward_qwen3_drafter_model(
     auto t_fwd_end = std::chrono::steady_clock::now();
     double t_fwd = std::chrono::duration<double>(t_fwd_end - t_total_start).count();
 
-    // Tail attention scoring (unchanged from previous impl).
+    // Tail attention scoring.
+    // score_layers_pre / compute_score_range already determined the range before
+    // allocation (to size K_norope_v correctly).  Re-use that result here.
+    // score_layer_start_pre == score_layer_start by construction (same formula,
+    // same env vars, same fwd_layer_limit_pre == fwd_layer_limit).
+    const int score_layer_start  = score_layer_start_pre;
+    const int score_layer_end    = fwd_layer_limit;
+
     std::vector<float> probs_h((size_t)S * n_lookahead * H);
     auto t_score_start = std::chrono::steady_clock::now();
 
-    for (int il = 0; il < w.n_layer; ++il) {
+    for (int il = score_layer_start; il < score_layer_end; ++il) {
         ggml_init_params ip{};
         ip.mem_size = ggml_tensor_overhead() * 32 + ggml_graph_overhead() + 16 * 1024;
         ip.no_alloc = true;
         ggml_context * gctx = ggml_init(ip);
 
+        // K_norope_v / Q_norope_v are indexed from score_layer_start_pre.
+        const int si = il - score_layer_start_pre;
         ggml_tensor * K_f32 = ggml_new_tensor_3d(gctx, GGML_TYPE_F32, D, Hk, S);
         ggml_tensor * K_cast = ggml_cpy(gctx,
-            nope_tail ? K_norope_v[il].t : K_curr_v[il].t, K_f32);
+            nope_tail ? K_norope_v[si].t : K_curr_v[il].t, K_f32);
         ggml_tensor * K_perm = ggml_cont(gctx,
             ggml_permute(gctx, K_cast, 0, 2, 1, 3));
         ggml_tensor * K_score = K_perm;
@@ -749,7 +773,7 @@ bool forward_qwen3_drafter_model(
         }
         ggml_tensor * Q_tail_perm = ggml_cont(gctx,
             ggml_permute(gctx,
-                nope_tail ? Q_norope_v[il].t : Q_last_v[il].t,
+                nope_tail ? Q_norope_v[si].t : Q_last_v[il].t,
                 0, 2, 1, 3));
         ggml_tensor * attn_score = ggml_mul_mat(gctx, K_score, Q_tail_perm);
         ggml_tensor * probs = ggml_soft_max_ext(gctx, attn_score, mask_tail_buf.t,
@@ -796,8 +820,9 @@ bool forward_qwen3_drafter_model(
     double t_score = std::chrono::duration<double>(t_total_end - t_score_start).count();
     std::fprintf(stderr,
         "[qwen3-0.6b-fp] forward %.2fs (S=%d, A_setup=%.2fs A_alloc=%.2fs A_compute=%.2fs FP=%.2fs B_warm=%.2fs B_setup=%.2fs B_alloc=%.2fs B_copy_in=%.2fs B_norm=%.2fs B_compute=%.2fs B_copy_out=%.2fs)  "
-        "tail-score %.2fs  total %.2fs\n",
-        t_fwd, S, t_a_setup, t_a_alloc, t_compute_a, t_fp, t_b_warm, t_b_setup, t_b_alloc, t_b_copy_in, t_b_norm, t_compute_b, t_b_copy_out, t_score, t_fwd + t_score);
+        "tail-score %.2fs (layers %d-%d)  total %.2fs\n",
+        t_fwd, S, t_a_setup, t_a_alloc, t_compute_a, t_fp, t_b_warm, t_b_setup, t_b_alloc, t_b_copy_in, t_b_norm, t_compute_b, t_b_copy_out,
+        t_score, score_layer_start, score_layer_end - 1, t_fwd + t_score);
     std::fflush(stderr);
 
     cleanup_all();
diff --git a/server/src/qwen3/qwen3_loader.cpp b/server/src/qwen3/qwen3_loader.cpp
index ed38ee106..b7b35a85e 100644
--- a/server/src/qwen3/qwen3_loader.cpp
+++ b/server/src/qwen3/qwen3_loader.cpp
@@ -133,6 +133,18 @@ bool load_qwen3_drafter_model(const std::string & path,
     out.head_dim   = (int)get_u32(gctx, "qwen3.attention.key_length", 128);
     out.rope_theta = get_f32(gctx, "qwen3.rope.freq_base", 1000000.0f);
 
+    // Detect weight quant type from blk.0.attn_q.weight; support BF16 and Q8_0.
+    // Stays at the BF16 default when that tensor is absent from the GGUF.
+    ggml_type wtype = GGML_TYPE_BF16;
+    {
+        const int64_t tidx = gguf_find_tensor(gctx, "blk.0.attn_q.weight");
+        if (tidx >= 0) wtype = gguf_get_tensor_type(gctx, tidx);
+    }
+    std::fprintf(stderr, "[qwen3-0.6b] detected weight type: %s\n",
+                 wtype == GGML_TYPE_Q8_0 ? "Q8_0" :  // other types were previously logged as BF16
+                 wtype == GGML_TYPE_BF16 ? "BF16" : ggml_type_name(wtype));
+    std::fflush(stderr);
+
     // Compute total tensor metadata size for context allocation.
     const int n_layer = out.n_layer;
     const int n_tensors_per_layer = 11;
diff --git a/server/src/qwen35/c2_gate.h b/server/src/qwen35/c2_gate.h
new file mode 100644
index 000000000..51c644e2c
--- /dev/null
+++ b/server/src/qwen35/c2_gate.h
@@ -0,0 +1,31 @@
+// C2 gate predicate — pure function, no GPU/model deps.
+// Extracted from qwen35_backend.cpp for testability.
+//
+// Reasoning: when pflash compresses a 128K prompt to ~11K tokens, the
+// target KV at decode time = 11K (small). T_target is fast (small KV),
+// T_draft ≈ constant. r = T_draft/T_target ≈ 1, so spec-decode does NOT
+// win over AR. Empirical: D_composition 128K: AR=27.5 tok/s, spec=5.74 tok/s.
+// Gate correctly blocks spec-decode when eff_fa_window > 2*fa_window_cfg.
+#pragma once
+
+namespace dflash::common {
+
+// Decides whether speculative decoding should be attempted for a request.
+//   fa_window_override: 0 = no pflash; else = compressed_prompt_size + 256
+//   fa_window_cfg     : cfg_.fa_window (default 2048)
+//   kv_committed      : KV position after prefill (unused; kept for future use)
+//
+// Returns true when override == 0 (uncompressed: always permit) or when
+// fa_window_override <= 2 * fa_window_cfg. With the default cfg of 2048 that
+// is compressed_size <= 3840 (4096 - 256). Above the bound AR was measured
+// faster than spec-decode (D_composition 128K: AR=27.5 vs spec=5.74 tok/s).
+// The doubling is done in 64 bits so a huge fa_window_cfg cannot overflow int.
+inline bool c2_spec_decode_permitted(int fa_window_override,
+                                     int fa_window_cfg,
+                                     int kv_committed) {
+    (void)kv_committed;
+    return (fa_window_override == 0)
+        || (static_cast<long long>(fa_window_override) <= 2LL * fa_window_cfg);
+}
+
+} // namespace dflash::common
diff --git a/server/src/qwen35/gguf_target_loader.cpp b/server/src/qwen35/gguf_target_loader.cpp
index 116ddafc0..8628eb3ab 100644
--- a/server/src/qwen35/gguf_target_loader.cpp
+++ b/server/src/qwen35/gguf_target_loader.cpp
@@ -38,10 +38,7 @@
 //     ssm_out.weight                 [inner, hidden]           Q5_K
 //     ffn_gate/up/down              (same as full-attn)
 //
-// This loader reads the file via ggml's built-in GGUF API, which returns a
-// ggml_context pre-populated with tensors. We then wire that context onto
-// the CUDA backend (via ggml_backend_alloc_ctx_tensors) and copy each
-// tensor's bytes from the mmap'd file.
+// Loads via ggml GGUF API; tensors copied from mmap to CUDA backend.
 
 #include "internal.h"
 #include "common/layer_split_utils.h"
@@ -738,6 +735,51 @@ bool load_target_gguf_partial(const std::string & path,
 
     gguf_free(gctx);
 
+    // Structural defense: derive scalar dims from weight tensor shapes and
+    // assert against GGUF-declared metadata. Catches stale/zero dw_ or w_
+    // scalars before they silently corrupt graph-build (Bug #2 class).
+    // Uses the first full-attention layer (il = fai-1) because deltanet
+    // layers don't carry wq/wk. wq packs Q+gate so ne[1] = n_head*kl*2.
+    // Skipped when that index is outside out.layers or the layer has no wq/wk.
+    {
+        const int fa_il = out.full_attention_interval - 1;  // first full-attn layer
+        const bool fa_ok = fa_il >= 0 && (size_t)fa_il < out.layers.size();
+        const TargetLayer * fa = fa_ok ? &out.layers[(size_t)fa_il] : nullptr;
+        if (fa && fa->wq && fa->wk) {
+            char buf[256];  // scratch for whichever mismatch message is reported
+            const int64_t derived_q_dim  = fa->wq->ne[1];  // n_head * head_dim * 2
+            const int64_t derived_kv_dim = fa->wk->ne[1];  // n_head_kv * head_dim
+            const int64_t expected_q_dim  = (int64_t)out.n_head * out.n_embd_head_k * 2;
+            const int64_t expected_kv_dim = (int64_t)out.n_head_kv * out.n_embd_head_k;
+            if (derived_q_dim != expected_q_dim) {
+                std::snprintf(buf, sizeof(buf),
+                    "GGUF shape mismatch: blk.%d.attn_q.weight->ne[1]=%lld "
+                    "!= n_head*head_dim*2=%d*%d*2=%lld",
+                    fa_il, (long long)derived_q_dim,
+                    out.n_head, out.n_embd_head_k, (long long)expected_q_dim);
+                set_last_error(buf);
+                return false;
+            }
+            if (derived_kv_dim != expected_kv_dim) {
+                std::snprintf(buf, sizeof(buf),
+                    "GGUF shape mismatch: blk.%d.attn_k.weight->ne[1]=%lld "
+                    "!= n_head_kv*head_dim=%d*%d=%lld",
+                    fa_il, (long long)derived_kv_dim,
+                    out.n_head_kv, out.n_embd_head_k, (long long)expected_kv_dim);
+                set_last_error(buf);
+                return false;
+            }
+            const int64_t derived_n_embd = fa->wq->ne[0];  // input dim = n_embd
+            if (derived_n_embd != (int64_t)out.n_embd) {
+                std::snprintf(buf, sizeof(buf),
+                    "GGUF shape mismatch: blk.%d.attn_q.weight->ne[0]=%lld != n_embd=%d",
+                    fa_il, (long long)derived_n_embd, out.n_embd);
+                set_last_error(buf);
+                return false;
+            }
+        }
+    }
+
     if (tok_embd_off == 0 || tok_embd_type == GGML_TYPE_COUNT) {
         set_last_error("token_embd.weight not found or invalid type");
         return false;
diff --git a/server/src/qwen35/layer_split_forward.cpp b/server/src/qwen35/layer_split_forward.cpp
index 5fd774cc0..d1ab66587 100644
--- a/server/src/qwen35/layer_split_forward.cpp
+++ b/server/src/qwen35/layer_split_forward.cpp
@@ -17,7 +17,7 @@
 
 namespace dflash::common {
 
-bool compute_target_split_projection(
+bool compute_target_split_argmax(
         StepGraph & sg,
         const TargetWeights & w,
         ggml_backend_t backend,
@@ -26,8 +26,7 @@ bool compute_target_split_projection(
         int n_tokens,
         int hidden,
         int vocab,
-        std::vector<int32_t> * argmax_out,
-        std::vector<float> * logits_out) {
+        std::vector<int32_t> & argmax_out) {
     step_graph_free(sg);
     ggml_init_params ip{};
     ip.mem_size = 256 * 1024 * 1024;
@@ -44,51 +43,24 @@ bool compute_target_split_projection(
     ggml_tensor * logits = ggml_mul_mat(sg.ctx, w.output, normed);
     ggml_set_name(logits, "target_split_logits");
     sg.logits = logits;
-    if (argmax_out) {
-        sg.argmax_tokens = ggml_argmax(sg.ctx, logits);
-        ggml_set_name(sg.argmax_tokens, "target_split_argmax");
-        ggml_set_output(sg.argmax_tokens);
-    }
-    if (logits_out) {
-        ggml_set_output(sg.logits);
-    }
+    sg.argmax_tokens = ggml_argmax(sg.ctx, logits);
+    ggml_set_name(sg.argmax_tokens, "target_split_argmax");
+    ggml_set_output(sg.argmax_tokens);
     sg.gf = ggml_new_graph_custom(sg.ctx, 1024, false);
-    if (argmax_out) ggml_build_forward_expand(sg.gf, sg.argmax_tokens);
-    if (logits_out) ggml_build_forward_expand(sg.gf, sg.logits);
+    ggml_build_forward_expand(sg.gf, sg.argmax_tokens);
     if (!sg.alloc) {
         sg.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
     }
     if (!ggml_gallocr_alloc_graph(sg.alloc, sg.gf)) return false;
     auto st = ggml_backend_graph_compute(backend, sg.gf);
     if (st != GGML_STATUS_SUCCESS) return false;
-    if (argmax_out) {
-        argmax_out->assign((size_t)n_tokens, 0);
-        ggml_backend_tensor_get(sg.argmax_tokens, argmax_out->data(), 0,
-                                sizeof(int32_t) * (size_t)n_tokens);
-    }
-    if (logits_out) {
-        logits_out->assign((size_t)vocab * (size_t)n_tokens, 0.0f);
-        ggml_backend_tensor_get(sg.logits, logits_out->data(), 0,
-                                sizeof(float) * (size_t)vocab * (size_t)n_tokens);
-    }
+    (void)vocab;
+    argmax_out.assign((size_t)n_tokens, 0);
+    ggml_backend_tensor_get(sg.argmax_tokens, argmax_out.data(), 0,
+                            sizeof(int32_t) * (size_t)n_tokens);
     return true;
 }
 
-bool compute_target_split_argmax(
-        StepGraph & sg,
-        const TargetWeights & w,
-        ggml_backend_t backend,
-        ggml_tensor * act,
-        int token_offset,
-        int n_tokens,
-        int hidden,
-        int vocab,
-        std::vector<int32_t> & argmax_out) {
-    return compute_target_split_projection(
-        sg, w, backend, act, token_offset, n_tokens, hidden, vocab,
-        &argmax_out, nullptr);
-}
-
 bool run_qwen35_layer_split_forward(
         std::vector<Qwen35LayerSplitShard> & shards,
         const TargetWeights & embed_source,
@@ -236,10 +208,9 @@ bool run_qwen35_layer_split_forward(
     const bool need_all_argmax = argmax_out != nullptr;
     const int argmax_offset = need_all_argmax ? 0 : (n_tokens_total - 1);
     const int argmax_count = need_all_argmax ? n_tokens_total : 1;
-    const bool ok = compute_target_split_projection(
+    const bool ok = compute_target_split_argmax(
         final_sg, last_shard.weights, last_shard.backend, act_in,
-        argmax_offset, argmax_count, hidden, vocab,
-        &argmax_tokens, logits_out);
+        argmax_offset, argmax_count, hidden, vocab, argmax_tokens);
     step_graph_destroy(final_sg);
     activation_pair_free(acts);
     if (!ok) return false;
@@ -249,6 +220,7 @@ bool run_qwen35_layer_split_forward(
         shard.cache.last_tok = last_tok;
     }
     if (argmax_out) *argmax_out = std::move(argmax_tokens);
+    if (logits_out) logits_out->clear();
     return true;
 }
 
diff --git a/server/src/qwen35/layer_split_forward.h b/server/src/qwen35/layer_split_forward.h
index bb01bff09..c04680fe4 100644
--- a/server/src/qwen35/layer_split_forward.h
+++ b/server/src/qwen35/layer_split_forward.h
@@ -32,18 +32,6 @@ bool compute_target_split_argmax(
         int vocab,
         std::vector<int32_t> & argmax_out);
 
-bool compute_target_split_projection(
-        StepGraph & sg,
-        const TargetWeights & w,
-        ggml_backend_t backend,
-        ggml_tensor * act,
-        int token_offset,
-        int n_tokens,
-        int hidden,
-        int vocab,
-        std::vector<int32_t> * argmax_out,
-        std::vector<float> * logits_out);
-
 // Run a full forward pass through all shards, writing K/V into each shard's
 // cache.  Returns the argmax of the last token in `last_tok`.
 // Optionally captures features into `feature_ring` / remote draft.
diff --git a/server/src/qwen35/qwen35_dflash_target.h b/server/src/qwen35/qwen35_dflash_target.h
index 6a72e48b5..69e134f1c 100644
--- a/server/src/qwen35/qwen35_dflash_target.h
+++ b/server/src/qwen35/qwen35_dflash_target.h
@@ -53,6 +53,11 @@ class Qwen35DFlashTarget : public DFlashTarget {
     int mask_token_id() const override;
     const std::vector<int> & capture_layer_ids() const override;
 
+    // Per-call override for the verify-time flash-attention window. Used by
+    // do_spec_decode to widen the window when pflash compression has shrunk the
+    // prompt (see GenerateRequest.fa_window_override). Stores `fa` unvalidated.
+    void set_fa_window(int fa) { fa_window_ = fa; }
+
 private:
     TargetWeights & w_;
     TargetCache & cache_;
diff --git a/server/src/qwen35/qwen35_layer_split_adapter.cpp b/server/src/qwen35/qwen35_layer_split_adapter.cpp
index 51f68378c..a911f169b 100644
--- a/server/src/qwen35/qwen35_layer_split_adapter.cpp
+++ b/server/src/qwen35/qwen35_layer_split_adapter.cpp
@@ -86,7 +86,6 @@ bool Qwen35LayerSplitAdapter::init() {
     for (auto & slot : prefix_snapshots_) {
         slot.resize(shards_.size());
     }
-    snapshot_prefill_logits_.resize(PREFIX_SLOTS);
     draft_feature_snapshots_.resize(PREFIX_SLOTS);
 
     return true;
@@ -172,7 +171,6 @@ void Qwen35LayerSplitAdapter::begin_request(const GenerateRequest & req) {
 
 void Qwen35LayerSplitAdapter::reset_request_state() {
     for (auto & shard : shards_) reset_target_cache(shard.cache);
-    prefill_last_logits_.clear();
 }
 
 int Qwen35LayerSplitAdapter::prefill_chunk_tokens() const {
@@ -196,8 +194,7 @@ bool Qwen35LayerSplitAdapter::prefill(const std::vector<int32_t> & prompt,
         shards_, shards_.front().weights, prompt, base_pos, ubatch, last_tok,
         cfg_.kq_stride_pad, /*fa_window=*/0,
         (cfg_.run_dflash && !remote_draft_.active()) ? &feature_ring_ : nullptr,
-        /*argmax_out=*/nullptr,
-        &prefill_last_logits_,
+        /*argmax_out=*/nullptr, /*logits_out=*/nullptr,
         cfg_.run_dflash ? &remote_draft_ : nullptr);
 }
 
@@ -222,8 +219,6 @@ bool Qwen35LayerSplitAdapter::snapshot_save(int slot) {
             return false;
         }
     }
-    if (snapshot_prefill_logits_.size() != (size_t)PREFIX_SLOTS) return false;
-    snapshot_prefill_logits_[(size_t)slot] = prefill_last_logits_;
     if (!snapshot_draft_features(slot)) {
         snapshot_free(slot);
         return false;
@@ -236,9 +231,6 @@ void Qwen35LayerSplitAdapter::snapshot_free(int slot) {
     for (auto & snap : prefix_snapshots_[(size_t)slot]) {
         free_prefix_snapshot(snap);
     }
-    if (snapshot_prefill_logits_.size() == (size_t)PREFIX_SLOTS) {
-        snapshot_prefill_logits_[(size_t)slot].clear();
-    }
     free_draft_feature_snapshot(slot);
 }
 
@@ -249,10 +241,6 @@ bool Qwen35LayerSplitAdapter::snapshot_used(int slot) const {
     for (const auto & snap : snaps) {
         if (!snap.ctx) return false;
     }
-    if (snapshot_prefill_logits_.size() != (size_t)PREFIX_SLOTS ||
-        snapshot_prefill_logits_[(size_t)slot].empty()) {
-        return false;
-    }
     if (cfg_.run_dflash && cfg_.draft_path) {
         if (draft_feature_snapshots_.size() != (size_t)PREFIX_SLOTS) return false;
         const auto & draft_snap = draft_feature_snapshots_[(size_t)slot];
@@ -277,8 +265,6 @@ bool Qwen35LayerSplitAdapter::snapshot_restore(int slot) {
             return false;
         }
     }
-    if (snapshot_prefill_logits_.size() != (size_t)PREFIX_SLOTS) return false;
-    prefill_last_logits_ = snapshot_prefill_logits_[(size_t)slot];
     if (!restore_draft_features(slot)) return false;
     return true;
 }
@@ -395,22 +381,14 @@ bool Qwen35LayerSplitAdapter::decode_ar(
         std::vector<int32_t> & out_tokens,
         const DaemonIO & io) {
     if (n_gen <= 0) return true;
-    const auto & w = shards_.front().weights;
-    const int vocab = w.n_vocab;
-    std::vector<float> logits_buf;
 
-    if (sampler_.needs_logit_processing()) {
-        if ((int)prefill_last_logits_.size() != vocab) return false;
-        last_tok = sample_logits(prefill_last_logits_.data(), vocab, sampler_,
-                                 out_tokens, sampler_rng_);
-    }
     out_tokens.push_back(last_tok);
     io.emit(last_tok);
     if (io.cancelled) {
         io.emit(-1);
         return true;
     }
-    if (is_eos_tok(last_tok, w)) {
+    if (is_eos_tok(last_tok, shards_.front().weights)) {
         io.emit(-1);
         return true;
     }
@@ -419,24 +397,16 @@ bool Qwen35LayerSplitAdapter::decode_ar(
     for (int i = 1; i < n_gen; ++i) {
         std::vector<int32_t> one(1, last_tok);
         int next_tok = -1;
-        logits_buf.clear();
         if (!run_qwen35_layer_split_forward(
                 shards_, shards_.front().weights, one, committed, 1, next_tok,
                 cfg_.kq_stride_pad, cfg_.fa_window,
-                cfg_.run_dflash ? &feature_ring_ : nullptr,
-                /*argmax_out=*/nullptr,
-                sampler_.needs_logit_processing() ? &logits_buf : nullptr)) {
+                cfg_.run_dflash ? &feature_ring_ : nullptr)) {
             return false;
         }
-        if (sampler_.needs_logit_processing()) {
-            if ((int)logits_buf.size() != vocab) return false;
-            next_tok = sample_logits(logits_buf.data(), vocab, sampler_,
-                                     out_tokens, sampler_rng_);
-        }
         out_tokens.push_back(next_tok);
         io.emit(next_tok);
         if (io.cancelled) break;
-        if (is_eos_tok(next_tok, w)) break;
+        if (is_eos_tok(next_tok, shards_.front().weights)) break;
         last_tok = next_tok;
         ++committed;
     }
@@ -445,7 +415,7 @@ bool Qwen35LayerSplitAdapter::decode_ar(
 }
 
 bool Qwen35LayerSplitAdapter::can_dflash_decode() const {
-    return cfg_.run_dflash && cfg_.draft_path && !sampler_.needs_logit_processing();
+    return cfg_.run_dflash && cfg_.draft_path && sampler_.temp == 0.0f;
 }
 
 bool Qwen35LayerSplitAdapter::decode_dflash(
@@ -533,7 +503,6 @@ void Qwen35LayerSplitAdapter::shutdown() {
         for (auto & snap : slot) free_prefix_snapshot(snap);
     }
     prefix_snapshots_.clear();
-    snapshot_prefill_logits_.clear();
     draft_feature_snapshots_.clear();
     auto shard_metas = layer_split_shard_metas(shards_);
     free_layer_split_snapshot_backends(shard_metas, snapshot_backends_);
diff --git a/server/src/qwen35/qwen35_layer_split_adapter.h b/server/src/qwen35/qwen35_layer_split_adapter.h
index 9565cf6e6..bc778e8a8 100644
--- a/server/src/qwen35/qwen35_layer_split_adapter.h
+++ b/server/src/qwen35/qwen35_layer_split_adapter.h
@@ -57,7 +57,6 @@ class Qwen35LayerSplitAdapter : public LayerSplitAdapter {
     bool decode_ar(int last_tok, int committed, int n_gen,
                    std::vector<int32_t> & out_tokens,
                    const DaemonIO & io) override;
-    bool supports_cpu_sampling() const override { return true; }
 
     bool can_dflash_decode() const override;
     bool decode_dflash(const std::vector<int32_t> & prompt, int base_pos,
@@ -102,7 +101,6 @@ class Qwen35LayerSplitAdapter : public LayerSplitAdapter {
     bool pflash_drafter_loaded_ = false;
     static constexpr int PREFIX_SLOTS = ModelBackend::kMaxSlots;
     std::vector<std::vector<PrefixSnapshot>> prefix_snapshots_;
-    std::vector<std::vector<float>> snapshot_prefill_logits_;
     std::vector<ggml_backend_t> snapshot_backends_;
     struct DraftFeatureSnapshot {
         int cur_pos = 0;
@@ -118,7 +116,6 @@ class Qwen35LayerSplitAdapter : public LayerSplitAdapter {
     SamplerCfg sampler_;
     std::mt19937_64 sampler_rng_{std::random_device{}()};
     std::unique_ptr<DFlashTarget> dflash_target_;
-    std::vector<float> prefill_last_logits_;
 };
 
 }  // namespace dflash::common
diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp
index 363c9e9e6..c5dd5ed21 100644
--- a/server/test/test_server_unit.cpp
+++ b/server/test/test_server_unit.cpp
@@ -23,6 +23,7 @@
 #include "placement/placement_config.h"
 #include "common/layer_split_backend.h"
 #include "common/layer_split_utils.h"
+#include "qwen35/c2_gate.h"
 #include "placement/draft_residency.h"
 #include <nlohmann/json.hpp>
 
@@ -2696,6 +2697,58 @@ static void test_generate_result_accept_rate_zero_when_no_spec_decode() {
     TEST_ASSERT(r.accept_rate == 0.0f);
 }
 
+// ═══════════════════════════════════════════════════════════════════════
+// C2 gate: c2_spec_decode_permitted() unit tests
+//
+// Gate logic: permit spec-decode when eff_fa_window <= 2*fa_window_cfg.
+// eff_fa_window = fa_window_override when set, else fa_window_cfg.
+//
+// Empirical validation (Round 5 bench):
+// - D_composition 128K: effective_in=10988, eff_fa_window=11244 > 4096
+//   → gate BLOCKS spec-decode → AR at 27.5 tok/s (correct — spec at 5.74)
+// - D_composition short: eff_fa_window <= 4096 → gate permits spec-decode
+// ═══════════════════════════════════════════════════════════════════════
+
+static void test_c2_gate_no_override_always_permits() {
+    const int kv_positions[] = {1, 4096, 131072};  // override 0 = no pflash: always permit
+    for (const int kv : kv_positions) {
+        TEST_ASSERT(dflash::common::c2_spec_decode_permitted(0, 2048, kv));
+    }
+}
+
+static void test_c2_gate_128k_compressed_blocks_spec() {
+    // Round 5 D 128K cell: effective_in=10988 gives fa_window_override=11244,
+    // above 2*2048=4096, so the gate must BLOCK spec-decode (AR wins empirically).
+    const int effective_in = 10988;
+    const int window_cfg = 2048;
+    const int window_override = effective_in + 256;  // = 11244
+    const bool permitted = dflash::common::c2_spec_decode_permitted(window_override, window_cfg, effective_in);
+    TEST_ASSERT(!permitted);
+}
+
+static void test_c2_gate_65k_compressed_blocks_spec() {
+    // 65K D cell: effective_in≈5383 → override≈5639, which exceeds 2*2048=4096.
+    const int effective_in = 5383;
+    const int window_override = effective_in + 256;
+    const bool permitted = dflash::common::c2_spec_decode_permitted(window_override, 2048, effective_in);
+    TEST_ASSERT(!permitted);
+}
+
+static void test_c2_gate_small_compressed_permits_spec() {
+    // Small compressed KV (override <= 2*fa_window): spec-decode permitted.
+    // fa_window_override=3000 <= 4096 → permit
+    TEST_ASSERT(dflash::common::c2_spec_decode_permitted(3000, 2048, 2744));
+    // Bound scales with cfg: 5639 > 2*2048 blocks, but 5639 <= 2*4096 → permit
+    TEST_ASSERT(dflash::common::c2_spec_decode_permitted(5639, 4096, 5383));
+}
+
+static void test_c2_gate_boundary_at_2x_fa_window() {
+    const int fa_window_cfg = 2048;
+    const int limit = 2 * fa_window_cfg;  // 4096: largest override still permitted (<=)
+    TEST_ASSERT(dflash::common::c2_spec_decode_permitted(limit, fa_window_cfg, limit - 256));
+    TEST_ASSERT(!dflash::common::c2_spec_decode_permitted(limit + 1, fa_window_cfg, limit - 255));
+}
+
 int main() {
     std::fprintf(stderr, "══════════════════════════════════════════\n");
     std::fprintf(stderr, " Server Unit Tests\n");
@@ -2867,6 +2920,13 @@ int main() {
     RUN_TEST(test_generate_result_accept_rate_in_usage_anthropic);
     RUN_TEST(test_generate_result_accept_rate_zero_when_no_spec_decode);
 
+    std::fprintf(stderr, "\n── C2 gate (spec-decode gate) ──\n");
+    RUN_TEST(test_c2_gate_no_override_always_permits);
+    RUN_TEST(test_c2_gate_128k_compressed_blocks_spec);
+    RUN_TEST(test_c2_gate_65k_compressed_blocks_spec);
+    RUN_TEST(test_c2_gate_small_compressed_permits_spec);
+    RUN_TEST(test_c2_gate_boundary_at_2x_fa_window);
+
     std::fprintf(stderr, "\n══════════════════════════════════════════\n");
     std::fprintf(stderr, " Results: %d assertions, %d failures\n",
                  test_count, test_failures);