diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 46919debe..e870ef385 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -128,8 +128,8 @@ jobs: needs: [uv-workspace] runs-on: [self-hosted, gpu, sm86] timeout-minutes: 30 - # The box has a single physical GPU: serialize GPU jobs across PRs instead - # of letting concurrent runs clobber each other. + # Serialize CUDA jobs across PRs (one RTX 3090). The ROCm job has its + # own group: different physical GPU, no contention. concurrency: group: lucebox3-gpu-runner cancel-in-progress: false @@ -197,15 +197,51 @@ jobs: needs: [uv-workspace] runs-on: [self-hosted, rocm, gfx1151] timeout-minutes: 20 - # Same single box as gpu-tests: serialize GPU jobs across PRs. + # Serialize across PRs per GPU. NOT the same group as the CUDA job: + # the combo box has two distinct GPUs (RTX 3090 + Strix iGPU), and a + # shared group only holds one waiting job, so the Radeon leg was + # chronically displaced ("higher priority waiting request") by every + # new CUDA job entering the queue. concurrency: - group: lucebox3-gpu-runner + group: lucebox3-rocm-runner cancel-in-progress: false steps: - uses: actions/checkout@v4 + - name: KFD health (diagnose instead of hanging) + # rocminfo on a wedged KFD blocks in uninterruptible sleep and eats + # the whole 20-minute job timeout. Probe with a hard timeout first, + # and when it hangs, dump the evidence (D-state holders, dmesg) so + # the job fails in seconds with a diagnosis instead of silently. + run: | + # A wedged KFD puts rocminfo in UNINTERRUPTIBLE sleep: timeout(1) + # cannot kill it and a foreground wait blocks until the job + # timeout. Probe in the background (output to a file so no pipe + # keeps the step alive) and enforce the deadline in the shell. + /opt/rocm/bin/rocminfo > /tmp/rocminfo.out 2>&1 & + PROBE=$! 
+ for i in $(seq 1 15); do + kill -0 $PROBE 2>/dev/null || break + sleep 1 + done + if kill -0 $PROBE 2>/dev/null; then + echo "::error::rocminfo hung (likely D-state) — ROCm/KFD wedged; the box needs a reboot" + echo "--- probe state:" + ps -o pid,stat,wchan:32,comm -p $PROBE || true + echo "--- processes holding /dev/kfd:" + sudo fuser -v /dev/kfd 2>&1 || true + echo "--- D-state processes:" + ps -eo pid,user,stat,wchan:32,comm | awk '$3 ~ /D/' || true + echo "--- recent amdgpu/kfd dmesg:" + sudo dmesg 2>/dev/null | grep -iE "amdgpu|kfd" | tail -15 || true + kill -9 $PROBE 2>/dev/null || true + disown $PROBE 2>/dev/null || true + exit 1 + fi + wait $PROBE && echo "KFD healthy" || { echo "::error::rocminfo exited non-zero"; cat /tmp/rocminfo.out | tail -5; exit 1; } + - name: ROCm smoke (rocminfo sees gfx1151) - run: /opt/rocm/bin/rocminfo | grep -E "Name:|Marketing Name:" | grep -iE "gfx1151|Radeon 8060S" + run: cat /tmp/rocminfo.out | grep -E "Name:|Marketing Name:" | grep -iE "gfx1151|Radeon 8060S" - name: Build + run HIP vector-add on the Radeon 8060S # Self-contained HIP kernel correctness test (no model weights). This is diff --git a/README.md b/README.md index 0856e5375..59d3fdd96 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,10 @@ Each one is self-contained with setup instructions and benchmark notes. Luce Spark MoE expert offload

+

+ Luce KVFlash paged KV cache +

+ --- ## Supported Models & Drafters @@ -276,6 +280,18 @@ DFLASH27B_KV_TQ3=1 \ | `--kv-cache-dir ` | — | Persist prefix cache to disk | | `--kv-cache-budget N` | — | On-disk cache size cap | +**Bounded KV residency (KVFlash)** + +Pages the attention KV cache through a fixed pool of GPU slots; cold 64-token chunks live in host RAM, bit-exact and recallable. Decode speed stops depending on context length and resident KV stays pool-sized at any context. Off by default; works on every model family. Drafter-scored residency is the default on every family: the server finds the Qwen3-0.6B drafter next to the model (or via `--prefill-drafter`) and lazy-loads it as the relevance scorer that decides which chunks stay resident — non-qwen targets (laguna, gemma4) bridge the tokenizer gap by re-tokenizing the context text for the drafter. LRU is the fallback when no drafter is present, or the explicit choice via `--kvflash-policy lru`. Per-model numbers in [Luce KVFlash →](optimizations/kvflash/README.md). + +| Flag / env | Default | Effect | +|---|---|---| +| `--kvflash ` | off | Resident pool size. `auto` sizes from the GPU: half of free VRAM after weights and reserves, at the model's KV density, capped where decode speed stays near the flat optimum (default 16384, override `DFLASH_KVFLASH_MAX_POOL`) and at `--max-ctx`. Explicit values are rounded to 256, clamped to `--max-ctx`, floored at the protected minimum so eviction always has a victim. | +| `--kvflash-policy {drafter,lru}` | `drafter` | Residency policy. `lru` opts out of the drafter probe/load (recency-only paging, no extra VRAM). | +| `--kvflash-tau N` | `64` | Reselect interval floor (drafter policy only); the effective interval grows with history to cap rescore overhead. | +| `DFLASH_KVFLASH=N` | off | Env equivalent of `--kvflash`. | +| `DFLASH_KVFLASH_TAU=N` | `64` | Env equivalent of `--kvflash-tau`. 
| + **Thinking budget** | Flag | Default | Effect | diff --git a/assets/cards/kvflash_card.png b/assets/cards/kvflash_card.png new file mode 100644 index 000000000..1a8af70a3 --- /dev/null +++ b/assets/cards/kvflash_card.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3f810ba8150b818309173d9c003f475b5ff41b8a3e6605772eea7ca086029b2 +size 2231695 diff --git a/optimizations/kvflash/DESIGN.md b/optimizations/kvflash/DESIGN.md new file mode 100644 index 000000000..a8738eb27 --- /dev/null +++ b/optimizations/kvflash/DESIGN.md @@ -0,0 +1,272 @@ +# KVFlash design notes + +Mechanism details and tuning data behind [README.md](README.md); measured +tables in [RESULTS.md](RESULTS.md). + +FlashMemory-style (arXiv 2606.09079) decode-time KV paging for the qwen35 +target, designed to compose with pflash. Goal: the GPU footprint of the +full-attention KV cache is a hard O(pool) constant regardless of logical +context length, with paged-out chunks recallable bit-exact from host. + +## Division of labor with pflash + +pflash and the pager own different resources and compose cleanly: + +| concern | owner | +|---|---| +| which prompt chunks the target ever elaborates | pflash (drafter scores, evict at prefill) | +| which elaborated chunks occupy GPU slots | KvFlashPager (this module) | +| prefill compute sparsity | pflash BSA kernels | +| decode-time KV growth (generated tokens) | KvFlashPager (page out cold generated chunks) | + +pflash keeps the target from reading the huge context; the pager keeps +what the target HAS elaborated inside a fixed VRAM budget and makes every +eviction reversible. The drafter's chunk scores plug into +`KvFlashPager::score_hook` as the residency policy (LRU fallback in the +prototype). + +## Mechanism + +- Cache tensors are allocated at `pool_tokens` (e.g. 1024) instead of + `max_ctx` (e.g. 131072). That allocation delta IS the memory saving: + a mask over a full-size cache would save nothing. 
+- Logical positions map to physical pool slots at 64-token chunk + granularity. The mapping rides the existing step-invariant + `ggml_set_rows` KV append (`kv_write_rows` carries the physical slot; + the `positions` input keeps the logical position for M-RoPE). +- Decode FA spans the whole pool with an EXACT slot-validity mask + (`KvFlashPager::fill_slot_mask`): resident slots 0, free/paged-out -inf. + The host-side mask rebuilds only when the pager epoch moves; the device + upload happens before EVERY compute. That upload is mandatory, not an + optimization: input tensors live in the gallocr compute buffer, whose + regions are reused during graph execution, so a once-uploaded mask is + garbage by the next step (this masqueraded as a "fattn NaN kernel bug" + for a while — all-NaN logits from the second step on; production never + hit it because its prefill refills masks per chunk). `--no-mask` falls + back to maskless + zeroed freed slots (exp(-max) ~ 0, production's + padded-span approximation, measured ~1% argmax flips). +- Page-out copies a chunk's quantized rows (per layer x K/V x head + segments) to a host backing store and zeroes the slots; page-in writes + them back. Quantized bytes + baked-in RoPE means the roundtrip is + bit-exact and relocation is position-independent. +- Eviction protects sinks (first chunk) and the trailing window, mirrors + FlashMemory's always-resident floor (their last-8K + decoded window). + Unlike their sigmoid-threshold fetch (which leaks footprint at 500K, + their §3.3.1), a fixed slot pool is a hard budget by construction. +- DeltaNet/conv recurrent state is fixed-size and never paged. + +## What the prototype verifies (test_kvflash) + +A. Baseline at logical ctx 128K: reference greedy sequence + KV bytes. +B. Relocation proof: same workload in a small pool with SHUFFLED block + placement, teacher-forced — argmax must track the baseline. +C. 
Live paging: pool ≪ prompt+gen, eviction engaged; bit-exact + page_out/page_in roundtrip; decode completes; KV bytes vs A ≥ 90% cut. + +## Reselect (τ-step lookahead) + +`KvFlashPager::reselect()` rebuilds the resident set as the top-pool chunks by +`score_hook` over all materialized chunks (resident or host-backed), +keeping sinks and the trailing window unconditionally. Page-outs run +first so recalls always find free blocks. This is the FlashMemory τ=64 +loop's mechanism; the production caller invokes it every τ decoded +tokens with fresh drafter scores. Verified in test run D: an evicted +chunk recalled by a score flip, decode continues across the residency +change. + +## Measured (lucebox RTX 3090, Qwen3.6-27B Q4_K_M, Q8_0 KV, 2026-06-11) + +All gates PASS (exit 0). 64 timed steps per profile row, junk KV so the +FA span traffic is bandwidth-realistic: + +| config | FA span | ms/step p50 | tok/s | +|---|---|---|---| +| baseline 8K | 8192 | 35.1 | 28.5 | +| baseline 32K | 32768 | 30.1 | 33.1 | +| baseline 128K | 131072 | 45.1 | 22.1 | +| pool 1K @128K logical | 1024 | 25.1 | 39.6 | +| pool 4K @128K logical | 4096 | 25.7 | 38.7 | + +- attn-KV memory: 2304.0 -> 18.0 MiB (99.2% cut); whole cache buffer + 2653.6 -> 217.6 MiB, confirmed by VRAM deltas. +- At 128K-logical decode the pool is 1.8x FASTER than the full cache + (45.1 -> 25.1 ms/step): FA cost is span-bound, the pool caps the span. +- Paging: page_out p50 1.26 ms, page_in p50 0.63 ms per 64-token chunk + (~2.2 MiB, synchronous); 12 evictions over 1200 generated tokens + amortize to ~0.01 ms/token. reselect() recalling with 20 page events + took 21.3 ms — at τ=64 that is ~1% of decode time worst-case. +- Relocation equivalence: 0.83% argmax flips over 1200 teacher-forced + tokens at shuffled placement (gate: ≤1%). 
+- Open harness question: the C-loop (live eviction) measured ~34 ms/step + vs 25 ms for the identical config in the E-loop; suspected interaction + of sustained-load GPU clocks with run ordering, not paging cost (12 + sync page events explain only ~0.01 ms/token). Re-measure under the + production decode loop during integration. + +## Full LSA loop (drafter as Memory Indexer) — measured + +Test run F implements the paper's complete inference paradigm with the +pflash drafter (Qwen3-0.6B, `/opt/lucebox/models/drafter/`) standing in +for the trained indexer: prompt (2048) larger than the pool (1024) so +prefill itself evicts, then every τ=64 decoded tokens the drafter +rescores the full sequence (tail attention = indexer query, chunk means +via `drafter_chunk_scores`), `score_hook` receives the fresh scores, and +`reselect()` repages the pool. + +Measured (RTX 3090, target Qwen3.6-27B Q4_K_M + drafter co-resident): +- 31.2 tok/s with the loop active; 12 rescores over 768 generated tokens +- 43 genuine drafter-driven recalls of previously evicted context +- indexer rescore p50 = 245 ms (full 0.6B re-prefill at ~2-2.8K tokens — + ~12% decode overhead at τ=64; drops to ~ms once the drafter's own KV + is persisted and only the new τ tokens are pushed through it) +- reselect p50 = 7.5 ms + +vs the paper: their indexer is a trained <0.1% projection head (cheaper +queries, backbone-supervised labels); ours is the existing 0.6B drafter +(training-free, already shipped for pflash). Their sigmoid threshold +leaks footprint at scale (their §3.3.1); our fixed pool is a hard cap. + +## Production integration (daemon) + +The pool is wired into the qwen35 backend behind `--kvflash ` +(env `DFLASH_KVFLASH`; rounded to a 256 multiple) + `--kvflash-tau ` +(env `DFLASH_KVFLASH_TAU`, default 64). Pieces: + +- `create_target_cache(..., ctx_alloc)`: attention tensors allocated at + pool capacity; `cache.max_ctx` stays the logical bound. 
+- `do_prefill`: prompts that fit the pool land identity-mapped + (`kvflash_sync_prefill` rebuilds the pager map per request/restore); + LARGER prompts switch to pooled chunked prefill — pager-chunk batches, + slot-mapped set_rows writes, a slot-space mask per chunk, live + eviction. Constant VRAM, linear time (qwen35 only so far). +- `do_ar_decode`: `build_target_step(..., kvflash_mask=true)` keeps the + step-invariant set_rows write active alongside the slot mask; + `kv_write_rows` carries the pool slot; the mask uploads per step; + every τ generated tokens `kvflash_maybe_reselect` rescores + repages. +- Policy is agnostic by construction: `KvFlashScorer` (common/) is the + interface; with no scorer the pager runs pure LRU (zero pflash + dependency). When pflash loads its drafter, `KvFlashDrafterScorer` + (qwen3/) attaches automatically and reselect becomes drafter-driven. +- Spec decode (chain mode) runs ON the pool: verify_batch slot-maps the + draft block via per-token kv_write_rows and builds a slot-space mask + (resident committed positions + causal among draft tokens). Rejected + drafts need no rollback: the pos < base_pos validity rule excludes + their slots until the replay rewrites them. All four spec KV-write + sites (verify, both replays, stall-prefix) route through this one + function. Verified on the daemon: accept_rate 15.4-15.6% pooled vs + 15.3% pool-off (matched avg_commit 3.47 vs 3.45), coherent output + through a mid-generation pool wrap with live eviction. DDTree's + tree-verify is not pool-aware yet and falls back to AR. +- LAYOUT TRAP (cost a day of debugging): kv_write_rows is + [n_tokens, n_head_kv] ne0-major — element (token i, head h) lives at + i + h*n_tokens (ggml_set_rows asserts b->ne[1] == c->ne[0]). A + transposed fill scrambles per-head row targets for every multi-token + write while single-token fills (all entries equal) hide the bug + completely. 
+- Post-generation snapshots are skipped once cur_pos exceeds the pool + (pooled snapshots need page-table serialization; prefill-time + snapshots still work). + +## Production smokes (dflash_server on lucebox 3090, 2026-06-11) + +1. WITHOUT pflash (agnostic LRU): `dflash_server <27B> --kvflash 1024`. + 41-token prompt + 1400 generated = 1441 logical through a 1024-slot + pool (live LRU eviction mid-request). Coherent story end to end, + 36.9 tok/s, clean finish. Second request (per-request pager reset) ok. +2. WITH pflash: `--kvflash 2048 --prefill-compression always + --prefill-threshold 256 --prefill-drafter `. Compression + 1468 -> 60 tokens, then `[kvflash] drafter scorer attached (tau=64)` + automatically; 400 coherent tokens answering from the compressed + context. Same binary, zero pflash-specific configuration on the pool. + +Ops note: the init banner is flushed now, but generally `nohup` + +redirected stdout block-buffers printf output — kill the process (atexit +flush) before concluding a code path didn't run. + +## Quality matrix (synthetic NIAH, needle recall /16, teacher-forced) + +| context | residency | LRU d=10/50/90% | drafter d=10/50/90% | control | +|---|---|---|---|---| +| 8K | 25% | 0 / 0 / 16 | 15 / 15 / 16 | 16/16 | +| 8K | 9% | 0 / 0 / 0 | 15 / 15 / 15 | 16/16 | +| 32K | 25% | 0 / 0 / 16 | 15 / 15 / 16 | 16/16 | +| 32K | 9% | 0 / 0 / 0 | 15 / 15 / 15 | 15-16/16 | +| 256K | 6.25% | 0 (d=0.5); 16/16 in-window | 14 / 15 / 15 | (in-window LRU = control) | + +Drafter-scored residency retains 88-100% of perfect needle recall at every +depth down to 6-9% residency from 8K to the model's native 256K maximum; +recency-only LRU retains zero outside its tail window. 256K logistics on +the RTX 3090: ~6.5 min linear pooled prefill, 4.22 GiB host backing, +~18 GiB VRAM total, 46 s bisected rescore (drafter forward ceiling ~65K +per segment). 
+ +## Tuned defaults (from the matrix) + +- Ship drafter scoring whenever a drafter is available; pure-LRU mode is + recency-only and must be documented as such. +- Pool ~25% of expected context is the conservative default; 9% measured + safe for retrieval-style work. +- tau adapts: rescore costs ~0.11 ms/history-token, so the effective + reselect interval is max(configured tau, history/45), capping rescore + overhead near 15% of decode time. + +## Per-architecture integration + +The pager core is architecture-blind; each backend routes its own KV writes +and masks through it. What differs per arch: + +- **qwen35** (reference): masked set_rows decode, slot-mapped chain-spec + verify, drafter scorer auto-attach. Everything in RESULTS.md. +- **qwen35moe** (Qwen3.6-35B-A3B): inherits the qwen35 path all-GPU. The + Spark hybrid pipelined decode keeps its per-layer cached CUDA graphs: + `pipelined_decode_one_token` takes a `kv_slot`, the cached FA span clamps + to the pool (so the graph stops rebuilding once the window hits pool + size), and the pool span stays MASKLESS like the rest of that path — the + pager zeroes freed blocks (page-out and `zero_free_blocks()` on request + reset), so evicted slots contribute exp(-max) ~ 0, production's own + padded-span approximation. Hybrid spec decode (literal-offset KV writes) + falls back to pipelined AR under kvflash. +- **laguna**: ALL 40 layers pooled (full + SWA share the pager). + `laguna_step` / `laguna_step_hybrid` take a const pager; both masks are + built in SLOT space via `fill_slot_pos` (the causal / sliding-window + conditions evaluate on the position each slot holds). SWA exactness: + `tail_window_chunks >= sliding_window/64 + 1`, so positions inside the + window are never evicted. The per-layer hybrid decode fallback and + NO_KVPAD / PAD_CPY / no_mask ablations are refused under kvflash. 
+- **gemma4**: pools FULL-attention layers only — SWA layers already use + sliding-window ring buffers and KV-reuse layers share their source's + tensors. The full mask is slot-space; the SWA ring path is untouched. + `--fa-window` (sparse full-attn) and kvflash are mutually exclusive. + DFlash spec verify is slot-mapped (gemma4_verify_batch gains set_rows + inputs + the slot-space causal mask; its KV-truncation rejection + semantics map directly onto the pool's validity rule). Measured: + identical acceptance pooled vs full (407/3104 = 13.1%, avg_commit + 3.09, identical text). + +Policy: drafter-scored residency is the default on all four archs. The +server probes for the Qwen3-0.6B next to the model (or --prefill-drafter) +and lazy-loads it at the first reselect; `--kvflash-policy lru` opts out. +qwen35/qwen35moe feed the drafter target ids directly; laguna/gemma4 use +KvFlashCrossTokScorer (detokenize -> re-tokenize -> score -> map back by +char spans; functional but untuned, see RESULTS). `--kvflash auto` sizes +the pool from free VRAM at the model's KV density, capped at the decode +speed knee (16384 default). + +Snapshots on laguna/gemma4 are refused once a chunk has relocated +(page_outs > 0); identity-layout snapshots before that still work. + +## Follow-ups + +Done since the prototype: pooled chunked prefill in the qwen35 daemon +(prompt > pool, eviction during prefill), spec-decode chain verify on the +pool, VRAM-aware auto sizing, cross-tokenizer scoring for laguna/gemma4. + +Open: +1. Drafter KV persistence for the indexer (incremental rescore: push + only the new τ tokens through the drafter; kills the ~240 ms re-prefill). +2. Pooled chunked prefill for laguna/gemma4 (qwen35-only today). +3. Pooled snapshot save/restore (serialize the page table + host store). +4. Async paging on a copy stream (currently synchronous + ggml_backend_tensor_get/set between steps). +5. 
Teacher-forced NIAH harness for non-qwen archs + cross-tok scorer + tuning (tail window, normalization). diff --git a/optimizations/kvflash/README.md b/optimizations/kvflash/README.md new file mode 100644 index 000000000..a54406453 --- /dev/null +++ b/optimizations/kvflash/README.md @@ -0,0 +1,133 @@ +

+ ← lucebox-hub +

+ +

+ +

+ +

Luce KVFlash

+ +

+ Lookahead sparse attention for dflash. Bounded KV residency on one GPU.
+ The attention KV cache lives in a fixed pool of slots; cold 64-token chunks page to host RAM, bit-exact and recallable. + With pflash, its drafter doubles as a Memory Indexer that recalls the context the generation needs next.
+ Qwen3.6-27B Q4_K_M on a single RTX 3090: native 256K context at 38.6 tok/s with 72 MiB of resident KV, + needle recall 88-100% at 6% residency, harness accuracy unchanged (32/32 vs full cache). +

+ +--- + +``` + decode tok/s KV in VRAM (Q8_0) needle (d=10/50/90%) +full cache @ 64K 27.8 1152 MiB 16/16 +full cache @ 128K 19.6 2304 MiB 16/16 +full cache @ 256K 13.1 4608 MiB 16/16 +KVFlash 4K @ 64K 38.6 72 MiB 14/16 +KVFlash 4K @ 128K 38.6 72 MiB 14/16 +KVFlash 4K @ 256K 38.6 72 MiB 15/16 +``` + +Decode speed is flat at any context length (the per-step KV read is pool-sized, +not context-sized), prefill is up to 2.8x faster, and a 256K prompt that costs +4.6 GiB of VRAM as a full cache costs 72 MiB resident + 4.2 GiB of host RAM. +(The full-cache 256K rows are measured, not extrapolated: they fit the 24 GB +card only thanks to Q8_0 KV; with F16 KV the cache alone is 9.2 GiB and 256K +does not fit at all.) + +## Usage + +```bash +dflash_server model.gguf --max-ctx 32768 --kvflash auto # one flag; drafter policy if a drafter is found, else LRU +dflash_server model.gguf --max-ctx 32768 --kvflash auto \ + --prefill-drafter qwen3-0.6b.gguf # drafter-scored residency, explicit drafter path +dflash_server model.gguf --max-ctx 32768 --kvflash 8192 # explicit pool size +``` + +Drafter-scored residency is the DEFAULT policy on every model family: +the server probes for `Qwen3-0.6B-BF16.gguf` next to the model (same +dir, `drafter/`, `draft/`, then `/opt/lucebox/models/drafter/`) and +lazy-loads it on the first reselect; `--prefill-drafter` overrides the +location, prefill compression can stay off either way. Qwen-family +targets feed the drafter their ids directly; laguna and gemma4 bridge +the tokenizer gap with `KvFlashCrossTokScorer` (relevance is a property +of the TEXT, so the target's history is detokenized, re-tokenized for +the drafter, scored, and mapped back to chunk boundaries by character +spans). LRU is the fallback when no drafter is found (the banner says +which policy you got) or the explicit choice via `--kvflash-policy lru`. 
+`auto` sizes the pool from the GPU, not a fixed fraction: half of the +free VRAM left after weights (minus a reserve for compute buffers and +the drafter), converted at the model's KV density, capped where decode +speed stays near the flat optimum (16384 tokens by default, +`DFLASH_KVFLASH_MAX_POOL` to override) and at `--max-ctx`. Bigger pools +mean more resident chunks and fewer forced evictions of useful context; +the cap keeps the per-step KV read small enough that decode stays near +the small-pool speed. + +- `--kvflash <N|auto>`: resident pool size (rounded to 256; clamped to + `--max-ctx`; floored at the protected minimum — 512 for qwen-family and + gemma4, larger on laguna where the SWA window stays resident — so + eviction always has a victim). Env: `DFLASH_KVFLASH`. +- `--kvflash-tau <N>`: reselect interval floor (default 64; the effective + interval grows with history so rescore overhead stays ~15% of decode). + Env: `DFLASH_KVFLASH_TAU`. + +Sizing rule: without a drafter, pool >= prompt + generation headroom +(LRU is recency-only memory — an undersized pool can evict the question +itself). With pflash's drafter attached, 25% of the expected context is a +conservative default and 6-9% is measured safe for retrieval workloads. 
+ +## Model support + +`--kvflash` works on every architecture the daemon serves: + +| arch | models | decode path | policy | notes | +|---|---|---|---|---| +| qwen35 | Qwen3.5/3.6-27B | masked set_rows decode + slot-mapped spec verify | LRU or pflash drafter | reference integration; all RESULTS.md numbers | +| qwen35moe | Qwen3.6-35B-A3B | pipelined hybrid decode (Spark) + all-GPU | LRU or pflash drafter | maskless pool span (zero-row approximation, same as production padding); hybrid spec falls back to AR | +| laguna | Laguna-XS.2 | single-graph hybrid + all-GPU, slot-space full+SWA masks | LRU or drafter (cross-tok, untuned) | pager covers all 40 layers; protected tail >= sliding_window keeps SWA exact | +| gemma4 | Gemma4 26B-A4B / 31B | masked decode + slot-mapped spec verify, slot-space full mask | LRU or drafter (cross-tok, untuned) | pools FULL-attention layers only (SWA layers already ring-buffer) | + +Non-qwen targets use the cross-tokenizer scorer (detokenize target ids, +re-tokenize for the drafter, score, map back by char spans); the +`KvFlashScorer` seam stays open for native indexers. + +## How it works + +- **Pool**: attention KV tensors are allocated at pool size; a pager maps + logical positions to slots at 64-token chunk granularity. Cold chunks + move to a host backing store (~0.6 ms/chunk) and return bit-exact. +- **Mask**: attention spans the pool with a slot-validity mask, uploaded + before every compute. Exact, and free (25.10 vs 25.52 ms/step maskless). +- **Reselect**: every tau decoded tokens the scorer re-ranks all chunks + (resident or host-backed) and `reselect()` repages the pool — the + lookahead loop from FlashMemory (arXiv 2606.09079), with the pflash + drafter standing in for their trained indexer, and a hard capacity cap + their threshold mechanism lacks. 
+- **Spec decode**: chain-mode verify is slot-mapped (per-token + `kv_write_rows` + slot-space mask); rejected drafts need no rollback — + their slots are excluded by the validity rule until rewritten. + Acceptance parity with the full cache (15.4-15.6% vs 15.3%), with or + without the --ddtree configuration (fast rollback only snapshots + DeltaNet state, which is never pooled). +- **Prefill**: prompts larger than the pool prefill in 64-token chunks at + constant VRAM (linear time; 256K in ~5.9 min on the 3090). + +Quality verdict (harness ground truth, base-vs-base control included): +full results in [RESULTS.md](RESULTS.md). Outputs are not guaranteed +byte-identical to the full cache on long generations (the masked kernel +path rounds differently — a different deterministic lineage), but +correctness is identical: 32/32 vs 32/32 across HumanEval, GSM, MATH, and +agent suites. + +## Files + +- `server/src/common/kvflash_pager.h` — pool, page table, host store, reselect +- `server/src/common/kvflash_scorer.h` — chunk-relevance policy interface +- `server/src/qwen3/qwen3_kvflash_scorer.{h,cpp}` — pflash-drafter scorer + (tail attention; bisects on allocation pressure) +- `server/src/qwen35/*` — cache `ctx_alloc`, masked pooled decode, slot-mapped + spec verify, daemon flags +- `server/test/test_kvflash.cpp` — verification suite (A-F), `--niah`, + `--niah256`, `--longab` +- [DESIGN.md](DESIGN.md) — mechanism details and tuning notes diff --git a/optimizations/kvflash/RESULTS.md b/optimizations/kvflash/RESULTS.md new file mode 100644 index 000000000..513412311 --- /dev/null +++ b/optimizations/kvflash/RESULTS.md @@ -0,0 +1,127 @@ +# KVFlash — measured results + +All numbers: single RTX 3090 (24 GB), Qwen3.6-27B Q4_K_M target, Q8_0 KV, +Qwen3-0.6B pflash drafter as the scorer. June 2026, `test_kvflash` + +`dflash_server` + `harness/benchmarks`. 
+ +## End-to-end long-prompt A/B (`--longab`; needle depth 0.25, 240-token timed free run) + +| context | mode | prefill | decode tok/s | needle /16 | KV in VRAM (Q8_0) | +|---|---|---|---|---|---| +| 32K | full | 47.2 s | 32.8 | 16 | 576 MiB | +| 32K | KVFlash 4K | 41.8 s | 29.0 | 15 | 72 MiB | +| 64K | full | 130.6 s | 27.8 | 16 | 1152 MiB | +| 64K | KVFlash 4K | 87.5 s | **38.6** | 14 | **72 MiB** | +| 128K | full | 335.9 s | 19.6 | 16 | 2304 MiB | +| 128K | KVFlash 4K | 177.8 s | **38.6** | 14 | **72 MiB** | +| 256K | full | 999.0 s | 13.1 | 16 | 4608 MiB | +| 256K | KVFlash 4K | **354.9 s** | **38.6** | 15 | **72 MiB** | + +Decode is flat at 38.6 tok/s from 64K to native-max 256K (speedups 1.4x / +2.0x / 2.9x); prefill speedups 1.5x / 1.9x / 2.8x. One drafter rescore per +query: 9-70 s scaling with context (bisected above the drafter's ~65K +single-pass ceiling). + +Note on the 256K full-cache row: it fits the 24 GB card only because the +KV is Q8_0 (~15.3 GiB weights + 4.6 GiB KV ~ 21 GiB, measured, no OOM). +With F16 KV the cache alone is 9.2 GiB and 256K does NOT fit; KVFlash is +indifferent (72 MiB resident either way). + +## Retrieval quality vs residency (synthetic NIAH, teacher-forced /16) + +| context | residency | LRU (d=10/50/90%) | drafter (d=10/50/90%) | full control | +|---|---|---|---|---| +| 8K | 25% | 0 / 0 / 16 | 15 / 15 / 16 | 16/16 | +| 8K | 9% | 0 / 0 / 0 | 15 / 15 / 15 | 16/16 | +| 32K | 25% | 0 / 0 / 16 | 15 / 15 / 16 | 16/16 | +| 32K | 9% | 0 / 0 / 0 | 15 / 15 / 15 | 15-16/16 | +| 256K | 6.25% | 0 (d=0.5); 16/16 in-window | 14 / 15 / 15 | (in-window LRU = control) | + +Drafter-scored residency retains 88-100% of perfect recall at every depth +down to 6-9% residency; recency-only LRU retains zero outside its tail +window (mirrors FlashMemory's Recency-Only ablation). 
+ +## Harness ground truth (pool sized per the heuristic, vs full cache) + +| suite | baseline pass | KVFlash pass | exact text match | +|---|---|---|---| +| HumanEval | 10/10 | **10/10** | 10/10 | +| GSM | 10/10 | **10/10** | 8/10 | +| MATH | 10/10 | **10/10** | 4/10 | +| agent (to 24K prompts) | 6/6 | **6/6** | 2/6 | + +Base-vs-base control: 16/16 byte-identical — the stack is deterministic. +Text drift under KVFlash is the masked decode kernel's different (equally +deterministic) rounding lineage, not noise and not a correctness effect. + +## Spec decode (slot-mapped verify, daemon) + +| config | accept rate | avg_commit | output | +|---|---|---|---| +| qwen35 full cache, 2400 tok | 15.3% | 3.45 | coherent | +| qwen35 KVFlash 2K, 1800 tok | 15.4% | 3.47 | coherent | +| qwen35 KVFlash 2K, 2400 tok (live eviction mid-spec) | 15.6% | 3.49 | coherent | +| qwen35 --ddtree full cache, 600 tok | 13.9% | 3.23 | coherent | +| qwen35 --ddtree KVFlash 2K, 600 tok | 14.6% | 3.33 | coherent | +| gemma4 full cache, 600 tok | 13.1% (407/3104) | 3.09 | coherent | +| gemma4 KVFlash 2K, 600 tok | 13.1% (407/3104) | 3.09 | identical text to full | +| qwen35moe A3B all-GPU --ddtree full cache, 500 tok | 11.5% | 2.84 | coherent | +| qwen35moe A3B all-GPU --ddtree KVFlash 2K, 500 tok | 10.4% | 2.66 | coherent | + +## Microbenchmarks + +- Memory at 128K-logical: attn-KV 2304 -> 18 MiB (99.2%) with a 1K pool; + whole cache buffer 2654 -> 218 MiB, confirmed via VRAM deltas. +- Exact slot mask is free: 25.10 ms/step masked vs 25.52 maskless. +- Paging: page_out p50 1.27 ms / page_in 0.64 ms per 64-token chunk + (~2.2 MiB, synchronous); ~0.01 ms/token amortized at observed rates. +- reselect() repaging 20 chunks: 21.3 ms. +- Relocation equivalence (shuffled physical placement, teacher-forced + 1200 tokens): ~99% argmax agreement; page_out/page_in roundtrip + bit-exact. 
+ +## Multi-architecture smokes (pool 1024, --max-ctx 8192, ~1235 logical tokens, live LRU eviction mid-request, RTX 3090) + +| arch | model | mode | decode tok/s | output | +|---|---|---|---|---| +| qwen35 | Qwen3.6-27B Q4_K_M | all-GPU, masked pool | 37.4 | coherent | +| qwen35moe | Qwen3.6-35B-A3B UD-Q4_K_M | Spark hybrid (9403 hot / 837 cold experts), pipelined decode | 101.6 | coherent | +| laguna | Laguna-XS.2 Q4_K_M | Spark hybrid, single-graph decode, slot-space full+SWA masks | 137.1 | coherent | +| gemma4 | Gemma4 26B-A4B UD-Q4_K_M | all-GPU, slot-space full mask, SWA rings untouched | 119.0 | coherent | + +Gemma4 control on the same build without the flag: 120.2 tok/s, no +kvflash code engaged — the default path is unchanged. + +## Cross-tokenizer scorer (laguna/gemma4) — early result + +Stress A/B on gemma4 26B-A4B (pool 1024, needle at pos ~170, recital +demanded ~1700 generated tokens later, beyond the SWA ring and the pool): +LRU never recites and degenerates into filler repetition; the cross-tok +drafter stays coherent for 1.9K tokens, reaches the recital, and recalls +the correct prefix but not the exact code. Strictly better than LRU, +not yet at the qwen-native scorer's 14-16/16; treat as functional but +untuned (follow-up: teacher-forced NIAH harness for non-qwen archs, +tail-window/normalization tuning). + +## Known limits + +- qwen35moe `--spark` (hybrid expert offload) speculative decode crashes + with a CUDA illegal-memory-access — a pre-existing bug in the hybrid + spec path (`do_hybrid_spec_decode`), independent of KVFlash (it crashes + with the full cache too). It was never exercisable before because no + A3B DFlash draft could be converted; the converter fix in this branch + now loads them, surfacing the crash. Tracked separately; `--spark` + spec falls back to pipelined AR under KVFlash. All-GPU MoE spec decode + (experts resident, no `--spark`) works on the pool — see the spec table. 
+ + +- The harness-only tree-verify graphs (test_dflash) are not pool-aware; + the daemon's spec decode, including the --ddtree configuration (chain + verify + fast rollback), runs fully on the pool. +- Post-generation snapshots are skipped once cur_pos exceeds the pool + (pooled snapshots need page-table serialization). +- Paging is synchronous (copy-stream overlap is a follow-up). +- Memory-dense tasks needing the entire context at once (MRCR-style) are + a paradigm limit shared with FlashMemory; size the pool up for those. +- 512K+ requires RoPE scaling (model native max is 256K) — memory-side + KVFlash already scales (host backing is the only growth). diff --git a/optimizations/kvflash/hero.png b/optimizations/kvflash/hero.png new file mode 100644 index 000000000..3fb3ce50e --- /dev/null +++ b/optimizations/kvflash/hero.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee1577f6ef97b030430041266532d39828749e1ef5868f58a0335955dcad9e7c +size 2255374 diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt index 1ea6fd3fa..05d5add15 100644 --- a/server/CMakeLists.txt +++ b/server/CMakeLists.txt @@ -219,6 +219,7 @@ add_library(dflash_common STATIC src/draft/draft_safetensors_loader.cpp src/draft/draft_graph.cpp src/qwen3/qwen3_drafter.cpp + src/qwen3/qwen3_kvflash_scorer.cpp src/qwen3/qwen3_loader.cpp src/qwen3/qwen3_graph.cpp src/qwen3/qwen3_backend.cpp @@ -724,6 +725,11 @@ if(DFLASH27B_TESTS) target_include_directories(test_generate PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) target_link_libraries(test_generate PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_kvflash.cpp") + add_executable(test_kvflash test/test_kvflash.cpp) + target_include_directories(test_kvflash PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) + target_link_libraries(test_kvflash PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) + endif() if(EXISTS 
"${CMAKE_CURRENT_SOURCE_DIR}/test/test_restore_delta.cpp") add_executable(test_restore_delta test/test_restore_delta.cpp) target_include_directories(test_restore_delta PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) diff --git a/server/scripts/convert_dflash_to_gguf.py b/server/scripts/convert_dflash_to_gguf.py index fae1be7e5..106c04540 100644 --- a/server/scripts/convert_dflash_to_gguf.py +++ b/server/scripts/convert_dflash_to_gguf.py @@ -39,7 +39,14 @@ import gguf # ────────────────────────────────────────────────────────────────────── -# DFlash 27B draft architecture constants +# DFlash draft architecture constants — DEFAULTS ONLY. +# +# These are the qwen35-27B draft's values; they are used as a fallback when +# the source model has no config.json. Any other draft (A3B, gemma, ...) has +# a different head/dim/layer config, so the real scalars are read from the +# source config.json + derived from the tensor shapes in load_arch(). A +# converter that hardcoded these silently produced GGUFs with correct +# weights but 27B metadata, which the strict draft loader then rejected. # ────────────────────────────────────────────────────────────────────── ARCH = "qwen35-dflash-draft" @@ -50,7 +57,7 @@ HEAD_DIM = 128 INTERMEDIATE = 17408 VOCAB = 248320 -N_TARGET_LAYERS = 5 # fc projects 5*hidden -> hidden +N_TARGET_LAYERS = 5 # fc projects N_TARGET_LAYERS*hidden -> hidden ROPE_THETA = 1_000_000.0 RMS_EPS = 1e-6 MASK_TOKEN_ID = 248070 @@ -58,6 +65,89 @@ CTX_LEN = 32768 +def load_arch(safetensors: Path, header: dict) -> dict: + """Resolve the draft's architecture scalars. 
config.json (next to the + safetensors) is authoritative for the transformer hparams; the tensor + shapes are authoritative for the rest, so the result always matches the + weights even when config.json is partial or absent.""" + a = dict(hidden=HIDDEN, n_layer=N_LAYER, n_head=N_HEAD, n_head_kv=N_HEAD_KV, + head_dim=HEAD_DIM, intermediate=INTERMEDIATE, vocab=VOCAB, + n_target_layers=N_TARGET_LAYERS, rope_theta=ROPE_THETA, + rms_eps=RMS_EPS, mask_token_id=MASK_TOKEN_ID, block_size=BLOCK_SIZE, + ctx_len=CTX_LEN) + + cfg_path = safetensors.parent / "config.json" + if cfg_path.exists(): + c = json.loads(cfg_path.read_text()) + def pick(*keys): + for k in keys: + if k in c and c[k] is not None: + return c[k] + return None + for dst, val in ( + ("hidden", pick("hidden_size")), + ("n_layer", pick("num_hidden_layers")), + ("n_head", pick("num_attention_heads")), + ("n_head_kv", pick("num_key_value_heads")), + ("head_dim", pick("head_dim")), + ("intermediate", pick("intermediate_size")), + ("vocab", pick("vocab_size")), + ("rope_theta", pick("rope_theta")), + ("rms_eps", pick("rms_norm_eps")), + ("n_target_layers", pick("n_target_layers", "num_target_layers")), + ("mask_token_id", pick("mask_token_id")), + ("block_size", pick("block_size", "draft_block_size")), + ("ctx_len", pick("max_position_embeddings")), + ): + if val is not None: + a[dst] = val + print(f"[info] read arch from {cfg_path}") + else: + print(f"[warn] no config.json next to safetensors; using 27B defaults") + + # Weights are ground truth — derive/verify from tensor shapes. + def shape_of(st_name): + e = header.get(st_name) + return e["shape"] if e else None + + # hidden absent in config: k-proj is [n_head_kv*head_dim, hidden] -> ne[1]. + k0 = shape_of("layers.0.self_attn.k_proj.weight") + if (not cfg_path.exists()) and k0: + a["hidden"] = k0[1] + # head_dim absent in config: derive from k-proj (n_head_kv * head_dim). 
+ if k0 and a["n_head_kv"]: + derived_hd = k0[0] // a["n_head_kv"] + if not cfg_path.exists() or "head_dim" not in json.loads(cfg_path.read_text() if cfg_path.exists() else "{}"): + a["head_dim"] = derived_hd + # intermediate: ffn gate/up is [intermediate, hidden] — ne[0]. + g0 = shape_of("layers.0.mlp.gate_proj.weight") + if g0: + a["intermediate"] = g0[0] + # n_target_layers: fc.weight is [hidden, n_target*hidden]; ne[0] (the + # larger dim) / hidden is the capture count the loader checks. + fc = shape_of("fc.weight") + if fc and a["hidden"]: + a["n_target_layers"] = max(fc) // a["hidden"] + # n_layer: count the actual blocks present. + n_blocks = 1 + max((int(n.split(".")[1]) for n in header + if n.startswith("layers.") and n.split(".")[1].isdigit()), + default=a["n_layer"] - 1) + a["n_layer"] = n_blocks + + # Consistency check against the k-proj weight. + if k0: + exp_kv = a["n_head_kv"] * a["head_dim"] + if exp_kv != k0[0]: + print(f"[error] config n_head_kv*head_dim={exp_kv} != " + f"k_proj.weight dim {k0[0]}; fix config.json", file=sys.stderr) + sys.exit(1) + print(f"[info] arch: hidden={a['hidden']} n_layer={a['n_layer']} " + f"n_head={a['n_head']} n_head_kv={a['n_head_kv']} " + f"head_dim={a['head_dim']} ff={a['intermediate']} vocab={a['vocab']} " + f"n_target_layers={a['n_target_layers']}") + return a + + # ────────────────────────────────────────────────────────────────────── # Tensor name mapping — DFlash safetensors -> llama.cpp GGUF # ────────────────────────────────────────────────────────────────────── @@ -155,29 +245,30 @@ def main(): n_entries = sum(1 for k in header if k != "__metadata__") print(f"[info] {n_entries} tensor entries") + a = load_arch(args.safetensors, header) + writer = gguf.GGUFWriter(args.out_gguf, ARCH) - # Architecture metadata - writer.add_string("general.name", "Qwen3.5-27B-DFlash-Draft") - writer.add_uint32(f"{ARCH}.context_length", CTX_LEN) - writer.add_uint32(f"{ARCH}.embedding_length", HIDDEN) - 
writer.add_uint32(f"{ARCH}.block_count", N_LAYER) - writer.add_uint32(f"{ARCH}.feed_forward_length", INTERMEDIATE) - writer.add_uint32(f"{ARCH}.attention.head_count", N_HEAD) - writer.add_uint32(f"{ARCH}.attention.head_count_kv", N_HEAD_KV) - # llama.cpp uses key_length / value_length to override the default - # n_embd_head = n_embd / n_head heuristic (DFlash has n_embd=5120 - # but head_dim=128 so n_head*head_dim=4096 != n_embd). - writer.add_uint32(f"{ARCH}.attention.key_length", HEAD_DIM) - writer.add_uint32(f"{ARCH}.attention.value_length", HEAD_DIM) - writer.add_uint32(f"{ARCH}.vocab_size", VOCAB) - writer.add_float32(f"{ARCH}.attention.layer_norm_rms_epsilon", RMS_EPS) - writer.add_float32(f"{ARCH}.rope.freq_base", ROPE_THETA) + # Architecture metadata (resolved from config.json + tensor shapes) + writer.add_string("general.name", f"DFlash-Draft-{a['hidden']}h-{a['n_layer']}L") + writer.add_uint32(f"{ARCH}.context_length", a["ctx_len"]) + writer.add_uint32(f"{ARCH}.embedding_length", a["hidden"]) + writer.add_uint32(f"{ARCH}.block_count", a["n_layer"]) + writer.add_uint32(f"{ARCH}.feed_forward_length", a["intermediate"]) + writer.add_uint32(f"{ARCH}.attention.head_count", a["n_head"]) + writer.add_uint32(f"{ARCH}.attention.head_count_kv", a["n_head_kv"]) + # key_length / value_length override the n_embd/n_head heuristic, which + # is wrong for DFlash drafts (n_head*head_dim != n_embd). 
+ writer.add_uint32(f"{ARCH}.attention.key_length", a["head_dim"]) + writer.add_uint32(f"{ARCH}.attention.value_length", a["head_dim"]) + writer.add_uint32(f"{ARCH}.vocab_size", a["vocab"]) + writer.add_float32(f"{ARCH}.attention.layer_norm_rms_epsilon", a["rms_eps"]) + writer.add_float32(f"{ARCH}.rope.freq_base", a["rope_theta"]) # DFlash-specific hyperparameters - writer.add_uint32(f"{ARCH}.dflash.n_target_layers", N_TARGET_LAYERS) - writer.add_uint32(f"{ARCH}.dflash.block_size", BLOCK_SIZE) - writer.add_uint32(f"{ARCH}.dflash.mask_token_id", MASK_TOKEN_ID) + writer.add_uint32(f"{ARCH}.dflash.n_target_layers", a["n_target_layers"]) + writer.add_uint32(f"{ARCH}.dflash.block_size", a["block_size"]) + writer.add_uint32(f"{ARCH}.dflash.mask_token_id", a["mask_token_id"]) # Walk + add tensors. Sort: dflash.* singletons first, then output_*, # then per-layer in numeric order — keeps the on-disk layout stable. diff --git a/server/src/common/kvflash_pager.h b/server/src/common/kvflash_pager.h new file mode 100644 index 000000000..1b4679db9 --- /dev/null +++ b/server/src/common/kvflash_pager.h @@ -0,0 +1,548 @@ +// KvFlashPager — KVFlash core: a bounded resident pool for the +// full-attention KV cache (see optimizations/kvflash/). +// +// Lookahead-sparse-attention-style (FlashMemory, arXiv 2606.09079) +// decode-time KV residency for the qwen35 target: the cache tensors are +// allocated at POOL size (a fraction of the logical context), and this +// class owns the mapping from logical token positions to physical pool +// slots. Chunks (64 logical tokens) that fall cold are paged out to a +// host backing store and their slots are reused; paged-out chunks remain +// recallable bit-exact. GPU footprint is a hard O(pool) bound regardless +// of logical context length. +// +// Policy-agnostic by design: with no scorer, eviction is LRU over +// unprotected chunks (recency-only memory). 
A KvFlashScorer plugged into +// `score_hook` upgrades eviction and reselect() to relevance-driven +// residency; with pflash enabled, its drafter attaches automatically +// (KvFlashDrafterScorer) and recalls cold context the generation needs. +// +// Correctness notes (why relocating rows is legal): +// * RoPE is baked into K rows at write time from the `positions` input, +// so a row's physical slot is semantically irrelevant. +// * Attention runs over the whole pool with a slot-validity mask +// (resident = 0, free/paged-out = -inf). The mask must be re-uploaded +// before EVERY compute: input tensors live in the gallocr compute +// buffer whose regions are reused during graph execution. +// * Freed slots are additionally zeroed (defense in depth; a zero K row +// contributes exp(-max) ~ 0, the same assumption the production +// stride-256 padded span relies on in maskless mode). +// * The FWHT K-rotation and KV quantization operate per-row; page-out / +// page-in moves raw quantized bytes and is therefore bit-exact. +// +// Scope: full-attention layers only. DeltaNet/conv recurrent state is +// fixed-size, position-dependent in-place state and is never paged. 
+ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dflash::common { + +struct KvFlashConfig { + int chunk_tokens = 64; // logical tokens per page + int pool_tokens = 0; // resident pool capacity (multiple of chunk_tokens) + int sink_chunks = 1; // leading chunks never evicted (attention sinks) + int tail_window_chunks = 4; // trailing chunks never evicted (local window) +}; + +struct KvFlashStats { + int64_t page_outs = 0; + int64_t page_ins = 0; + int64_t host_bytes = 0; // backing store currently held on host + int64_t moved_bytes = 0; // cumulative D2H+H2D traffic +}; + +class KvFlashPager { +public: + // `attn_k` / `attn_v` are the per-full-attention-layer cache tensors, + // each [head_dim, pool_tokens, n_head_kv]. All must share dims/types + // within their K/V group. + // Minimum pool for a config: sinks + trailing window stay resident + // unconditionally, so at least 2 more chunks are required (1 evictable + // victim + the partially filled append head) or eviction deadlocks and + // slot_for() starts failing once the pool fills. 
+ static int min_pool_tokens(const KvFlashConfig & cfg) { + return (cfg.sink_chunks + cfg.tail_window_chunks + 2) * cfg.chunk_tokens; + } + + bool attach(const KvFlashConfig & cfg, + const std::vector & attn_k, + const std::vector & attn_v) { + if (cfg.pool_tokens <= 0 || cfg.pool_tokens % cfg.chunk_tokens != 0) return false; + if (cfg.pool_tokens < min_pool_tokens(cfg)) { + std::fprintf(stderr, + "kvflash: pool %d < minimum %d (%d sink + %d tail chunks must " + "leave an evictable block)\n", + cfg.pool_tokens, min_pool_tokens(cfg), + cfg.sink_chunks, cfg.tail_window_chunks); + return false; + } + if (attn_k.empty() || attn_k.size() != attn_v.size()) return false; + cfg_ = cfg; + attn_k_ = attn_k; + attn_v_ = attn_v; + n_blocks_ = cfg.pool_tokens / cfg.chunk_tokens; + const ggml_tensor * K0 = attn_k[0]; + if ((int)K0->ne[1] < cfg.pool_tokens) return false; + n_head_kv_ = (int)K0->ne[2]; + + // Per-(tensor, head) contiguous segment of chunk_tokens rows. + k_seg_bytes_ = (size_t)cfg.chunk_tokens * K0->nb[1]; + v_seg_bytes_ = (size_t)cfg.chunk_tokens * attn_v[0]->nb[1]; + chunk_bytes_ = (k_seg_bytes_ + v_seg_bytes_) * (size_t)n_head_kv_ * attn_k.size(); + zero_buf_.assign(std::max(k_seg_bytes_, v_seg_bytes_), 0); + + free_blocks_.clear(); + for (int b = n_blocks_ - 1; b >= 0; b--) free_blocks_.push_back(b); + chunks_.clear(); + stats_ = {}; + clock_ = 0; + return true; + } + + // Optional: custom block hand-out order (e.g. shuffled placement in + // relocation tests). `order[i]` = i-th block to hand out. + void set_block_order(const std::vector & order) { + free_blocks_.assign(order.rbegin(), order.rend()); + } + + // Drop all mappings and host backing (new request / cache reset). + // Cumulative stats are kept; the epoch advances so cached masks refill. 
+ void reset() { + chunks_.clear(); + free_blocks_.clear(); + for (int b = n_blocks_ - 1; b >= 0; b--) free_blocks_.push_back(b); + stats_.host_bytes = 0; + cur_chunk_ = 0; + epoch_++; + } + + // Zero every currently-free block. reset() drops mappings but leaves the + // previous request's bytes in place; maskless consumers (the qwen35moe + // pipelined decode reads the whole padded pool span with no slot mask) + // need stale rows to dequantise to ~zero contribution. Masked consumers + // don't need this but it is cheap (pool-sized memset, sub-ms). + void zero_free_blocks() { + for (int b : free_blocks_) zero_block(b); + } + + bool attached() const { return n_blocks_ > 0; } + int pool_tokens() const { return cfg_.pool_tokens; } + int chunk_tokens() const { return cfg_.chunk_tokens; } + + // Optional external relevance score; higher = keep. Falls back to LRU. + std::function score_hook; + + // Allocate slots for [kv_start, kv_start + n_tok) ahead of a forward + // step (evicting LRU/low-score chunks as needed). False — with a + // diagnostic — if the pool has no evictable block left. + bool alloc_span(int kv_start, int n_tok) { + for (int i = 0; i < n_tok; ++i) { + if (slot_for(kv_start + i) < 0) { + std::fprintf(stderr, "[kvflash] no pool slot at pos %d " + "(pool %d exhausted)\n", + kv_start + i, cfg_.pool_tokens); + return false; + } + } + return true; + } + + // Physical pool slot for logical position `pos`. Allocates (and, when + // the pool is full, evicts) at chunk granularity. Call once per + // appended token, in logical order. + int slot_for(int64_t pos) { + const int c = (int)(pos / cfg_.chunk_tokens); + // cur_chunk_ tracks the append head only; a page_in of an older + // chunk must not shrink the protected tail window. It must advance + // BEFORE eviction (so the victim search protects the new tail), but + // a failed allocation must roll it back or the next eviction's tail + // window is computed from a chunk that never materialized. 
+ const int prev_cur_chunk = cur_chunk_; + if (c > cur_chunk_) cur_chunk_ = c; + if ((int)chunks_.size() <= c) chunks_.resize(c + 1); + ChunkState & st = chunks_[c]; + if (st.block < 0) { + if (!ensure_free_block()) { + cur_chunk_ = prev_cur_chunk; + return -1; + } + st.block = free_blocks_.back(); + free_blocks_.pop_back(); + epoch_++; + if (st.on_host) { // recall: restore paged-out bytes + copy_chunk(c, st.block, /*to_host=*/false); + stats_.page_ins++; + stats_.moved_bytes += chunk_bytes_; + } + } + st.last_use = ++clock_; + return st.block * cfg_.chunk_tokens + (int)(pos % cfg_.chunk_tokens); + } + + // Force a chunk out of the pool (host backing + zeroed slots). + bool page_out(int c) { + if (c >= (int)chunks_.size() || chunks_[c].block < 0) return false; + ChunkState & st = chunks_[c]; + if (!st.on_host) { + st.host_data.resize(chunk_bytes_); + stats_.host_bytes += (int64_t)chunk_bytes_; + } + copy_chunk(c, st.block, /*to_host=*/true); + zero_block(st.block); + st.on_host = true; + free_blocks_.push_back(st.block); + st.block = -1; + epoch_++; + stats_.page_outs++; + stats_.moved_bytes += chunk_bytes_; + return true; + } + + // Recall a chunk into the pool (used by reselect / tests). + bool page_in(int c) { + if (c >= (int)chunks_.size() || !chunks_[c].on_host || chunks_[c].block >= 0) return false; + return slot_for((int64_t)c * cfg_.chunk_tokens) >= 0; + } + + bool is_resident(int c) const { + return c < (int)chunks_.size() && chunks_[c].block >= 0; + } + + // True while every materialized chunk still sits in its identity block + // (chunk c in block c, nothing paged out). This is the layout contract + // identity-copy snapshots rely on; it holds from reset() until the + // first eviction of the CURRENT request (cumulative stats do not). 
+ bool is_identity() const { + for (int c = 0; c < (int)chunks_.size(); c++) { + if (chunks_[c].block >= 0 && chunks_[c].block != c) return false; + if (chunks_[c].block < 0 && chunks_[c].on_host) return false; + } + return true; + } + int block_of(int c) const { + return c < (int)chunks_.size() ? chunks_[c].block : -1; + } + + // Const lookup (no alloc / LRU touch): physical slot currently holding + // logical `pos`, or -1 if its chunk is not resident. Callers that may + // need an allocation must use slot_for() beforehand. + int slot_of(int64_t pos) const { + const int c = (int)(pos / cfg_.chunk_tokens); + if (c >= (int)chunks_.size() || chunks_[c].block < 0) return -1; + return chunks_[c].block * cfg_.chunk_tokens + (int)(pos % cfg_.chunk_tokens); + } + + // Logical position held by each pool slot, -1 for free blocks. `dst` + // must hold pool_tokens entries. Lets callers build masks that need + // POSITION semantics in slot space (causal / sliding-window): the + // mask condition is evaluated on dst[slot] instead of the column index. + void fill_slot_pos(int32_t * dst) const { + for (int i = 0; i < cfg_.pool_tokens; i++) dst[i] = -1; + for (int c = 0; c < (int)chunks_.size(); c++) { + if (chunks_[c].block < 0) continue; + int32_t * p = dst + (size_t)chunks_[c].block * cfg_.chunk_tokens; + for (int i = 0; i < cfg_.chunk_tokens; i++) + p[i] = (int32_t)c * cfg_.chunk_tokens + i; + } + } + const KvFlashStats & stats() const { return stats_; } + int resident_blocks() const { return n_blocks_ - (int)free_blocks_.size(); } + int n_chunks() const { return (int)chunks_.size(); } + + // Bumped on every residency change (alloc / page_out / page_in). + // Callers cache the slot mask and refill only when the epoch moves. + uint64_t epoch() const { return epoch_; } + + // F16 slot-validity mask for one query row: 0 for slots belonging to a + // resident chunk, -inf for free / paged-out blocks. `dst` must hold + // pool_tokens entries. 
Used as the FA mask so non-resident slots are + // excluded exactly instead of via the zero-row ~exp(-max) approximation. + void fill_slot_mask(uint16_t * dst) const { + constexpr uint16_t F16_ZERO = 0x0000, F16_NEG_INF = 0xFC00; + for (int i = 0; i < cfg_.pool_tokens; i++) dst[i] = F16_NEG_INF; + for (int c = 0; c < (int)chunks_.size(); c++) { + if (chunks_[c].block < 0) continue; + uint16_t * p = dst + (size_t)chunks_[c].block * cfg_.chunk_tokens; + for (int i = 0; i < cfg_.chunk_tokens; i++) p[i] = F16_ZERO; + } + } + + // Lookahead reselect (FlashMemory τ-step): rebuild the resident set as + // the top-pool chunks by score_hook among ALL known chunks (resident or + // host-backed). Sinks and the trailing window are always kept. Returns + // the number of page events. Call between decode steps. + int reselect() { + if (!score_hook) return 0; + struct Cand { int c; float s; }; + std::vector cands; + for (int c = 0; c < (int)chunks_.size(); c++) { + const ChunkState & st = chunks_[c]; + if (st.block < 0 && !st.on_host) continue; // never materialized + const bool prot = c < cfg_.sink_chunks || + c > cur_chunk_ - 1 - cfg_.tail_window_chunks; + cands.push_back({c, prot ? 
3.4e38f : score_hook(c)}); + } + std::sort(cands.begin(), cands.end(), + [](const Cand & a, const Cand & b) { return a.s > b.s; }); + std::vector want(chunks_.size(), 0); + for (int i = 0; i < (int)cands.size() && i < n_blocks_; i++) want[cands[i].c] = 1; + + int events = 0; + for (int c = 0; c < (int)chunks_.size(); c++) { // out first: frees blocks + if (!want[c] && chunks_[c].block >= 0) { page_out(c); events++; } + } + for (int c = 0; c < (int)chunks_.size(); c++) { + if (want[c] && chunks_[c].block < 0 && chunks_[c].on_host) { + if (page_in(c)) events++; + } + } + return events; + } + +private: + struct ChunkState { + int block = -1; // pool block index, -1 = not resident + bool on_host = false; // backing store holds valid bytes + uint64_t last_use = 0; + std::vector host_data; + }; + + bool ensure_free_block() { + if (!free_blocks_.empty()) return true; + // Victim: unprotected resident chunk with the lowest score + // (score_hook) or the oldest use (LRU fallback). + int victim = -1; + float v_score = 0.f; + uint64_t v_use = 0; + for (int c = 0; c < (int)chunks_.size(); c++) { + if (chunks_[c].block < 0) continue; + if (c < cfg_.sink_chunks) continue; + if (c > cur_chunk_ - 1 - cfg_.tail_window_chunks) continue; + if (score_hook) { + const float s = score_hook(c); + if (victim < 0 || s < v_score) { victim = c; v_score = s; } + } else { + if (victim < 0 || chunks_[c].last_use < v_use) { victim = c; v_use = chunks_[c].last_use; } + } + } + return victim >= 0 && page_out(victim); + } + + // Move one chunk between pool slots and host backing. Segment order is + // fixed (layer-major, K then V, head-minor) so offsets are stable. + void copy_chunk(int c, int block, bool to_host) { + ChunkState & st = chunks_[c]; + uint8_t * p = st.host_data.data(); + for (size_t l = 0; l < attn_k_.size(); l++) { + for (int kv = 0; kv < 2; kv++) { + ggml_tensor * t = kv == 0 ? attn_k_[l] : attn_v_[l]; + const size_t seg = kv == 0 ? 
k_seg_bytes_ : v_seg_bytes_; + for (int h = 0; h < n_head_kv_; h++) { + const size_t off = (size_t)block * cfg_.chunk_tokens * t->nb[1] + (size_t)h * t->nb[2]; + if (to_host) ggml_backend_tensor_get(t, p, off, seg); + else ggml_backend_tensor_set(t, p, off, seg); + p += seg; + } + } + } + } + + void zero_block(int block) { + for (size_t l = 0; l < attn_k_.size(); l++) { + for (int kv = 0; kv < 2; kv++) { + ggml_tensor * t = kv == 0 ? attn_k_[l] : attn_v_[l]; + const size_t seg = kv == 0 ? k_seg_bytes_ : v_seg_bytes_; + for (int h = 0; h < n_head_kv_; h++) { + const size_t off = (size_t)block * cfg_.chunk_tokens * t->nb[1] + (size_t)h * t->nb[2]; + ggml_backend_tensor_set(t, zero_buf_.data(), off, seg); + } + } + } + } + + KvFlashConfig cfg_; + std::vector attn_k_, attn_v_; + std::vector chunks_; + std::vector free_blocks_; + std::vector zero_buf_; + KvFlashStats stats_; + size_t k_seg_bytes_ = 0, v_seg_bytes_ = 0, chunk_bytes_ = 0; + int n_blocks_ = 0, n_head_kv_ = 0, cur_chunk_ = 0; + uint64_t clock_ = 0; + uint64_t epoch_ = 0; +}; + +// ── Shared backend helpers ───────────────────────────────────────────── +// +// Every backend integration needs the same three steps: read the pool size +// from the env, allocate slots ahead of each forward (alloc_span above), +// and build slot-space inputs for the graph. The first and last live here +// so the per-arch code reduces to wiring. + +// VRAM budget for "auto" pool sizing. Backends fill this AFTER the target +// weights are on the GPU and BEFORE the cache is allocated, so free_bytes +// reflects what the pool can actually use. +struct KvFlashAutoBudget { + int64_t free_bytes = 0; // device free memory right now + int64_t reserve_bytes = 0; // compute buffers + (if expected) drafter + int64_t bytes_per_token = 0; // pooled attention KV density for this model + // Decode cost grows with the FA span (= the pool), so cap the auto pool + // where speed stays near the small-pool point. 
Measured on the 27B/3090: + // 1K pool 39.6 tok/s, 4K 38.7; 16K extrapolates to ~31-33, still 1.7-2.4x + // the full cache at 128-256K. Override: DFLASH_KVFLASH_MAX_POOL. + int speed_cap_tokens = 16384; +}; + +// Pool size from DFLASH_KVFLASH for a backend with `cfg` protections: +// 0 = off; otherwise rounded to a 256 multiple, floored at +// min_pool_tokens(cfg) (eviction must keep a victim) and clamped to +// `max_ctx` (a pool larger than the logical context is meaningless), with +// warnings on both adjustments. +// +// The literal value "auto" sizes the pool from the GPU, not from a fixed +// fraction: take half of (free VRAM - reserve), convert to tokens at the +// model's KV density, then cap at the speed point and max_ctx. Big pools +// avoid relevance-crowding (more resident chunks = fewer forced evictions +// of useful context); the speed cap keeps decode near the flat optimum. +// Falls back to max_ctx/4 (scorer expected) or /2 (LRU) when the backend +// supplies no budget. +inline int kvflash_pool_from_env(int max_ctx, const KvFlashConfig & cfg = {}, + bool scorer_expected = false, + const KvFlashAutoBudget & budget = {}) { + const char * env = std::getenv("DFLASH_KVFLASH"); + if (!env) return 0; + int tokens; + if (std::strcmp(env, "auto") == 0) { + int speed_cap = budget.speed_cap_tokens; + if (const char * mp = std::getenv("DFLASH_KVFLASH_MAX_POOL")) { + speed_cap = std::max(256, std::atoi(mp)); + } + if (budget.bytes_per_token > 0 && budget.free_bytes > 0) { + const int64_t usable = + std::max(0, budget.free_bytes - budget.reserve_bytes) / 2; + const int64_t vram_tokens = usable / budget.bytes_per_token; + tokens = (int)std::min(vram_tokens, + std::min(max_ctx, speed_cap)); + std::fprintf(stderr, + "[kvflash] auto pool: %d tokens (free %.1f GiB - reserve %.1f GiB, " + "%.1f KiB/token, caps: speed %d / max_ctx %d)\n", + tokens, budget.free_bytes / 1073741824.0, + budget.reserve_bytes / 1073741824.0, + budget.bytes_per_token / 1024.0, speed_cap, 
max_ctx); + } else { + tokens = max_ctx / (scorer_expected ? 4 : 2); + std::fprintf(stderr, "[kvflash] auto pool: %d tokens (%d%% of max_ctx %d, " + "no VRAM budget supplied)\n", + tokens, scorer_expected ? 25 : 50, max_ctx); + } + } else { + tokens = std::atoi(env); + } + if (tokens <= 0) return 0; + tokens = ((tokens + 255) / 256) * 256; + const int floor_tokens = + ((KvFlashPager::min_pool_tokens(cfg) + 255) / 256) * 256; + if (tokens < floor_tokens) { + std::fprintf(stderr, "[kvflash] requested pool %d < minimum %d " + "(%d sink + %d tail chunks must leave an " + "evictable block); raising\n", + tokens, floor_tokens, cfg.sink_chunks, cfg.tail_window_chunks); + tokens = floor_tokens; + } + if (tokens > max_ctx) { + std::fprintf(stderr, "[kvflash] requested pool %d > max_ctx %d; clamping " + "(raise --max-ctx for a larger pool)\n", + tokens, max_ctx); + tokens = (max_ctx / 256) * 256; + } + return tokens; +} + +// Residency policy from DFLASH_KVFLASH_POLICY (--kvflash-policy): "lru" +// forces recency-only paging (no drafter probe, no scorer); anything else +// (default "drafter") means scored residency when a drafter is available. +inline bool kvflash_policy_is_lru() { + const char * env = std::getenv("DFLASH_KVFLASH_POLICY"); + return env && std::strcmp(env, "lru") == 0; +} + +// Locate the Qwen3-0.6B residency drafter: the explicit override +// (DFLASH_KVFLASH_DRAFTER, set from --prefill-drafter), then the +// well-known locations next to the target model, then the appliance path. +// Returns "" when nothing is readable (callers fall back to LRU, loudly). +inline std::string kvflash_find_drafter(const char * target_path) { + if (kvflash_policy_is_lru()) return ""; + if (const char * dp = std::getenv("DFLASH_KVFLASH_DRAFTER")) return dp; + if (!target_path) return ""; + std::string dir(target_path); + const size_t slash = dir.find_last_of('/'); + dir = (slash == std::string::npos) ? "." 
: dir.substr(0, slash); + const std::string candidates[] = { + dir + "/Qwen3-0.6B-BF16.gguf", + dir + "/drafter/Qwen3-0.6B-BF16.gguf", + dir + "/draft/Qwen3-0.6B-BF16.gguf", + "/opt/lucebox/models/drafter/Qwen3-0.6B-BF16.gguf", + }; + for (const std::string & c : candidates) { + if (std::FILE * f = std::fopen(c.c_str(), "rb")) { + std::fclose(f); + std::fprintf(stderr, "[kvflash] found residency drafter: %s\n", c.c_str()); + return c; + } + } + return ""; +} + +// Slot-space step inputs for masked consumers: the K/V append row for each +// of this step's tokens, plus F32 causal (`mfull`) and sliding-window +// (`mswa`, optional) masks of width `mk_w` whose conditions are evaluated +// on the POSITION each pool slot holds (free slots stay -inf). The caller +// must have alloc_span()'d [kv_start, kv_start + n_tok) first. The pager +// zeroes freed slots, but the mask is what keeps relocation exact. +inline bool kvflash_fill_rows_and_masks( + const KvFlashPager & pager, + int kv_start, int n_tok, int mk_w, int swa_window, + std::vector & rows, + std::vector * mfull, std::vector * mswa) { + rows.resize((size_t)n_tok); + for (int i = 0; i < n_tok; ++i) { + const int s = pager.slot_of(kv_start + i); + if (s < 0) { + std::fprintf(stderr, "[kvflash] no pool slot at pos %d " + "(alloc_span not called?)\n", kv_start + i); + return false; + } + rows[(size_t)i] = s; + } + if (!mfull) return true; + std::vector spos((size_t)pager.pool_tokens(), -1); + pager.fill_slot_pos(spos.data()); + mfull->assign((size_t)mk_w * n_tok, -INFINITY); + if (mswa) mswa->assign((size_t)mk_w * n_tok, -INFINITY); + const int s_hi = std::min(mk_w, (int)spos.size()); + for (int q = 0; q < n_tok; ++q) { + const int abs_q = kv_start + q; + const int win_lo = std::max(0, abs_q - swa_window + 1); + for (int s = 0; s < s_hi; ++s) { + const int p = spos[(size_t)s]; + if (p < 0 || p > abs_q) continue; + (*mfull)[(size_t)q * mk_w + s] = 0.0f; + if (mswa && p >= win_lo) (*mswa)[(size_t)q * mk_w + s] = 0.0f; + 
} + } + return true; +} + +} // namespace dflash::common diff --git a/server/src/common/kvflash_scorer.h b/server/src/common/kvflash_scorer.h new file mode 100644 index 000000000..407d94c6d --- /dev/null +++ b/server/src/common/kvflash_scorer.h @@ -0,0 +1,33 @@ +// KvFlashScorer — pluggable chunk-relevance policy for KvFlashPager. +// +// The pager is policy-agnostic: with no scorer attached it evicts LRU and +// never recalls. A scorer upgrades eviction and reselect() to relevance- +// driven residency (FlashMemory's Memory Indexer role). This interface is +// deliberately dependency-free so the pager runs without pflash, without a +// drafter, and without any model beyond the target. +// +// Implementations: +// - (none) pure LRU + recency, zero dependencies +// - KvFlashDrafterScorer qwen3/qwen3_kvflash_scorer.h — pflash drafter tail +// attention (shared with pflash compression) + +#pragma once + +#include +#include + +namespace dflash::common { + +struct KvFlashScorer { + virtual ~KvFlashScorer() = default; + + // Fill out[c] with a relevance score (higher = keep resident) for each + // chunk_tokens-sized chunk of `ids` (the full token history: prompt + + // generated). Returns false on failure; the caller skips reselect for + // that round and the pager keeps its LRU behavior. + virtual bool score_chunks(const std::vector & ids, + int chunk_tokens, + std::vector & out) = 0; +}; + +} // namespace dflash::common diff --git a/server/src/common/moe_hybrid_ffn_eval.cpp b/server/src/common/moe_hybrid_ffn_eval.cpp index 12a854d37..6d106cfa5 100644 --- a/server/src/common/moe_hybrid_ffn_eval.cpp +++ b/server/src/common/moe_hybrid_ffn_eval.cpp @@ -39,8 +39,17 @@ static ggml_tensor * build_shared_expert_subgraph( ggml_tensor * shared = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s); if (desc.ffn_gate_inp_shexp) { + // The shared-expert gate is a single-row weight (M=1): out[0,n] = sum_k W[k]*inp[k,n]. 
+ // Computing it as ggml_mul_mat routes to cublas, and on the shipped CUDA 12.0 + // cublasLt the M=1 heuristic selects a gemv/split-K reduce algorithm whose kernel + // is ABSENT from the library once N>1 (spec-decode verify/replay batches) — for + // BOTH F32 (cublasSgemm SSS) and F16 (cublasGemmEx HHH splitKreduce). That poisons + // the stream and surfaces as an illegal access in the next op. Compute the gate as + // broadcast elementwise-mul + sum_rows instead: identical math, ggml kernels only, + // no cublas. This is what unblocks single-pass full-batch verify. + ggml_tensor * gate_prod = ggml_mul(ctx, inp, desc.ffn_gate_inp_shexp); ggml_tensor * shared_gate = apply_scale2(ctx, - ggml_mul_mat(ctx, desc.ffn_gate_inp_shexp, inp), desc.ffn_gate_inp_shexp_s); + ggml_sum_rows(ctx, gate_prod), desc.ffn_gate_inp_shexp_s); shared_gate = ggml_sigmoid(ctx, shared_gate); shared = ggml_mul(ctx, shared, shared_gate); } @@ -658,6 +667,57 @@ bool build_cached_hot_batched_graph( return true; } +// Cached batched COLD routed graph (CPU backend, no shared expert). Mirror of +// build_cached_hot_batched_graph for the cold expert stack; used by the mixed +// batched path so spec-decode verify/replay reuse the graph instead of +// rebuilding it every call. 
+static bool build_cached_cold_batched_graph( + CachedHotBatchedGraph & out, + ggml_backend_t cpu_backend, + MoeHybridLayerStorage & storage, + const MoeLayerDesc & desc, + const MoeHybridConfig & cfg, + int n_tokens) { + + out.free(); + out.n_tokens = n_tokens; + const int n_embd = cfg.n_embd; + const int n_used = cfg.n_expert_used; + const int n_ff_exp = cfg.n_ff_exp; + + ggml_init_params ip{}; + ip.mem_size = 128 * 1024 * 1024; + ip.mem_buffer = nullptr; + ip.no_alloc = true; + out.ctx = ggml_init(ip); + if (!out.ctx) return false; + + out.inp = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_embd, n_tokens); + ggml_set_input(out.inp); + out.sel = ggml_new_tensor_2d(out.ctx, GGML_TYPE_I32, n_used, n_tokens); + ggml_set_input(out.sel); + out.wts = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_used, n_tokens); + ggml_set_input(out.wts); + + ggml_tensor * routed = nullptr; + build_batched_routed_graph(out.ctx, + storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold, + desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s, + out.inp, out.sel, out.wts, n_embd, n_ff_exp, n_used, n_tokens, &routed); + if (!routed) { out.free(); return false; } + out.output = routed; + + out.gf = ggml_new_graph_custom(out.ctx, 4096, false); + ggml_set_output(out.output); + ggml_build_forward_expand(out.gf, out.output); + out.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend)); + if (!ggml_gallocr_alloc_graph(out.alloc, out.gf)) { + out.free(); + return false; + } + return true; +} + bool eval_moe_hybrid_ffn_single( ggml_backend_t gpu_backend, const MoeHybridConfig & cfg, @@ -935,6 +995,25 @@ static bool mmq_full_batch_ok(const MoeHybridConfig & cfg, int n_tokens) { return cfg.mmq_safe_full_batch && n_tokens >= min_tokens; } +// Sub-batch size for the reduced-hot-stack routed mul_mat_id. 
The MMQ path +// (n_tokens > 8) illegal-accesses on a REDUCED expert stack for sparse/ +// imbalanced sub-64 batches (a genuine ggml-cuda MMQ mul_mat_id bug, observed +// on sm_86 + gfx1151); the MMVQ-mmid path is stable. Q4_K MMVQ-mmid handles up +// to 8 tokens on CUDA sm_80+ (MMVQ_MAX_BATCH_SIZE) and 4 on AMD. Earlier this +// had to be 1 because the F32 shared-expert gate (cublasSgemm, M=1) also faulted +// at N>1 on the shipped CUDA 12.0 cublasLt; that is now computed cublas-free +// (mul + sum_rows), so sub-batch=8 is safe and validated on sm_86. Default to 8 +// on sm_80+ (CUDA), 1 elsewhere (proven single-token path on unvalidated archs); +// env override tunes per arch without a rebuild. +static int mmq_safe_sub_batch() { + static const int v = [](){ + const char * e = std::getenv("DFLASH_MMQ_SUB_BATCH"); + if (e) return std::max(1, std::atoi(e)); + return (query_gpu_compute_sm() >= 80) ? 8 : 1; + }(); + return v; +} + static bool eval_moe_hybrid_ffn_batched_core( ggml_backend_t gpu_backend, ggml_backend_t cpu_backend, @@ -956,6 +1035,74 @@ static bool eval_moe_hybrid_ffn_batched_core( out.assign((size_t)n_embd * (size_t)n_tokens, 0.0f); if (n_tokens <= 0) return true; + // ── Fast path: cached hot+cold batched graphs (spec-decode verify/replay) ── + // Mixed layers used to rebuild+free their hot and cold ggml graphs on every + // call; that graph churn (not the matmul) dominated the verify FFN time. + // Reuse per-n_tokens cached graphs so steady-state rebuilds nothing. Large + // prefill batches (n_tokens >= kMaxBatchedCache) fall through to the inline + // path below. + if (n_tokens > 0 && n_tokens < MoeHybridLayerStorage::kMaxBatchedCache) { + const int total_slots = n_used * n_tokens; + const int n_hot_stack = storage.gate_up_hot ? (int)storage.gate_up_hot->ne[2] + : storage.gate_hot ? (int)storage.gate_hot->ne[2] : 1; + const int n_cold_stack = std::max(1, (int)(storage.down_cold ? 
storage.down_cold->ne[2] : 1)); + std::vector hot_sel(total_slots); + std::vector hot_wts(total_slots, 0.0f); + std::vector cold_sel(total_slots); + std::vector cold_wts(total_slots, 0.0f); + for (int i = 0; i < total_slots; ++i) { hot_sel[i] = i % n_hot_stack; cold_sel[i] = i % n_cold_stack; } + bool fp_has_cold = false; + for (int i = 0; i < total_slots; ++i) { + const int32_t gid = selected_ids[i]; + if (gid < 0 || gid >= (int32_t)storage.hot_local_by_global.size()) continue; + const int32_t hl = storage.hot_local_by_global[(size_t)gid]; + if (hl >= 0) { hot_sel[i] = hl; hot_wts[i] = selected_weights[i]; } + else { + const int32_t cl = storage.cold_local_by_global[(size_t)gid]; + if (cl >= 0) { cold_sel[i] = cl; cold_wts[i] = selected_weights[i]; fp_has_cold = true; } + } + } + + CachedHotBatchedGraph & hg = storage.hot_batched_mixed[n_tokens]; + const bool hg_ok = (hg.valid() && hg.n_tokens == n_tokens) + || build_cached_hot_batched_graph(hg, gpu_backend, storage, desc, cfg, n_tokens); + CachedHotBatchedGraph * cg = nullptr; + bool cg_ok = true; + if (fp_has_cold) { + cg = &storage.cold_batched_mixed[n_tokens]; + cg_ok = (cg->valid() && cg->n_tokens == n_tokens) + || build_cached_cold_batched_graph(*cg, cpu_backend, storage, desc, cfg, n_tokens); + } + + if (hg_ok && cg_ok) { + // Hot (GPU, async): shared expert + routed hot (zero-weight dummy slots + // keep an all-cold batch's shared-expert contribution). 
+ ggml_backend_tensor_set(hg.inp, cur_host, 0, sizeof(float) * (size_t)n_embd * (size_t)n_tokens); + ggml_backend_tensor_set(hg.sel, hot_sel.data(), 0, sizeof(int32_t) * (size_t)total_slots); + ggml_backend_tensor_set(hg.wts, hot_wts.data(), 0, sizeof(float) * (size_t)total_slots); + ggml_backend_graph_compute_async(gpu_backend, hg.gf); + + std::vector cold_partial; + if (cg) { + cold_partial.assign((size_t)n_embd * (size_t)n_tokens, 0.0f); + ggml_backend_tensor_set(cg->inp, cur_host, 0, sizeof(float) * (size_t)n_embd * (size_t)n_tokens); + ggml_backend_tensor_set(cg->sel, cold_sel.data(), 0, sizeof(int32_t) * (size_t)total_slots); + ggml_backend_tensor_set(cg->wts, cold_wts.data(), 0, sizeof(float) * (size_t)total_slots); + ggml_backend_graph_compute(cpu_backend, cg->gf); // sync; overlaps the async hot GPU graph + ggml_backend_tensor_get(cg->output, cold_partial.data(), 0, sizeof(float) * (size_t)n_embd * (size_t)n_tokens); + } + + ggml_backend_synchronize(gpu_backend); + ggml_backend_tensor_get(hg.output, out.data(), 0, sizeof(float) * (size_t)n_embd * (size_t)n_tokens); + if (cg) { + const size_t ntot = (size_t)n_embd * (size_t)n_tokens; + for (size_t i = 0; i < ntot; ++i) out[i] += cold_partial[i]; + } + return true; + } + // build failed -> fall through to the inline rebuild path + } + // ── Step 1: Partition routing into hot and cold ── // Dummy slots use weight 0.0 and are distributed evenly across all experts // to avoid pathological routing imbalance that triggers OOB in MMQ stream-k. @@ -1175,15 +1322,15 @@ bool eval_moe_hot_only_batched( out.assign((size_t)n_embd * (size_t)n_tokens, 0.0f); if (n_tokens <= 0) return true; - // Workaround for ggml-cuda MMQ mul_mat_id bug on sm_75/gfx1151: when the - // hot stack is smaller than n_expert, slice into <=4-token sub-batches to - // route through the stable MMVQ path. Skipped on sm_80+ where MMQ is safe. 
+ // Workaround for the ggml-cuda MMQ mul_mat_id stream-k fault on a REDUCED + // hot stack (sm_75/gfx1151 AND sm_86): slice sub-64 batches to a size the + // MMVQ-mmid path handles. See mmq_safe_sub_batch(). const int n_hot_stack = storage.gate_up_hot ? (int)storage.gate_up_hot->ne[2] : storage.gate_hot ? (int)storage.gate_hot->ne[2] : 0; - static const int MMQ_SAFE_SUB_BATCH = 4; + const int MMQ_SAFE_SUB_BATCH = mmq_safe_sub_batch(); if (!mmq_full_batch_ok(cfg, n_tokens) - && n_hot_stack > 0 && n_hot_stack < cfg.n_expert && n_tokens > MMQ_SAFE_SUB_BATCH) { + && n_hot_stack > 0 && n_tokens > MMQ_SAFE_SUB_BATCH) { std::vector sub_out; for (int t0 = 0; t0 < n_tokens; t0 += MMQ_SAFE_SUB_BATCH) { const int tc = std::min(MMQ_SAFE_SUB_BATCH, n_tokens - t0); @@ -1234,7 +1381,7 @@ bool eval_moe_hot_only_batched( // ── Slow path: build graph (first call or size mismatch) ── // Try to build and cache for this n_tokens size. // Cache when: sub-batch size (legacy), full stack (all hot), or full-batch safe (sm_80+). - if (mmq_full_batch_ok(cfg, n_tokens) || n_tokens == MMQ_SAFE_SUB_BATCH + if (mmq_full_batch_ok(cfg, n_tokens) || n_tokens <= MMQ_SAFE_SUB_BATCH || (n_hot_stack == 0 || n_hot_stack >= cfg.n_expert)) { if (build_cached_hot_batched_graph(cached, gpu_backend, storage, desc, cfg, n_tokens)) { // Successfully cached — use it immediately @@ -1350,9 +1497,9 @@ bool eval_moe_hybrid_ffn_batched( const int n_hot_stack = storage.gate_up_hot ? (int)storage.gate_up_hot->ne[2] : storage.gate_hot ? 
(int)storage.gate_hot->ne[2] : 0; - static const int MMQ_SAFE_SUB_BATCH = 4; + const int MMQ_SAFE_SUB_BATCH = mmq_safe_sub_batch(); if (!mmq_full_batch_ok(cfg, n_tokens) - && n_hot_stack > 0 && n_hot_stack < cfg.n_expert && n_tokens > MMQ_SAFE_SUB_BATCH) { + && n_hot_stack > 0 && n_tokens > MMQ_SAFE_SUB_BATCH) { const int n_embd = cfg.n_embd; const int n_used = cfg.n_expert_used; out.assign((size_t)n_embd * (size_t)n_tokens, 0.0f); diff --git a/server/src/common/moe_hybrid_storage.cpp b/server/src/common/moe_hybrid_storage.cpp index a8613b02a..4bf027400 100644 --- a/server/src/common/moe_hybrid_storage.cpp +++ b/server/src/common/moe_hybrid_storage.cpp @@ -130,6 +130,9 @@ MoeHybridStorage::~MoeHybridStorage() { for (auto & layer : layers) { layer.hot_graph.free(); layer.cold_graph.free(); + layer.hot_batched_graph.free(); + for (auto & g : layer.hot_batched_mixed) g.free(); + for (auto & g : layer.cold_batched_mixed) g.free(); if (layer.hot_buf) { ggml_backend_buffer_free(layer.hot_buf); layer.hot_buf = nullptr; diff --git a/server/src/common/moe_hybrid_storage.h b/server/src/common/moe_hybrid_storage.h index 3485c69ff..d4a1d47d4 100644 --- a/server/src/common/moe_hybrid_storage.h +++ b/server/src/common/moe_hybrid_storage.h @@ -132,6 +132,17 @@ struct MoeHybridLayerStorage { // Cached batched hot-only graph for prefill sub-batches (n_tokens=4). CachedHotBatchedGraph hot_batched_graph; + + // Per-n_tokens cached graphs for the MIXED (hot+cold) batched path. The + // all-hot path already caches via hot_batched_graph, but the mixed path used + // to rebuild+free its hot AND cold ggml graphs on every call — that churn + // dominated the spec-decode verify cost (many cold-bearing layers x + // sub-batches x steps). Cache per n_tokens (index 1..kMaxBatchedCache-1) so + // steady-state verify/replay rebuilds zero graphs. Large prefill batches + // (n_tokens >= kMaxBatchedCache) keep using the inline build. 
+ static constexpr int kMaxBatchedCache = 9; // covers spec sub-batch n_tokens 1..8 + CachedHotBatchedGraph hot_batched_mixed[kMaxBatchedCache]; + CachedHotBatchedGraph cold_batched_mixed[kMaxBatchedCache]; }; struct MoeHybridStorage { diff --git a/server/src/draft/draft_gguf_loader.cpp b/server/src/draft/draft_gguf_loader.cpp index 39d620ce6..f0dbd8eb9 100644 --- a/server/src/draft/draft_gguf_loader.cpp +++ b/server/src/draft/draft_gguf_loader.cpp @@ -368,19 +368,34 @@ bool load_draft_gguf(const std::string & path, set_last_error(err); return false; } - // fc: [n_target_layers*n_embd, n_embd] — ne[0] = n_target_layers*n_embd. + // fc: [n_capture_layers*n_embd, n_embd] — ne[0] counts the CAPTURE + // layers the fc consumes. Some draft GGUFs (gemma4) store the + // TARGET's layer count in dflash.n_target_layers instead of the + // capture count; per this file's own philosophy the weights are + // ground truth, so when fc disagrees but is an exact multiple of + // n_embd, derive the count from the tensor and warn. Fail only on + // a genuinely inconsistent shape. 
if (out.n_target_layers > 0) { const int64_t derived_fc_in = out.fc->ne[0]; const int64_t expected_fc_in = (int64_t)out.n_target_layers * out.n_embd; if (derived_fc_in != expected_fc_in) { - char buf[256]; - std::snprintf(buf, sizeof(buf), - "GGUF shape mismatch: dflash.fc.weight->ne[0]=%lld " - "!= n_target_layers*n_embd=%d*%d=%lld", - (long long)derived_fc_in, - out.n_target_layers, out.n_embd, (long long)expected_fc_in); - set_last_error(buf); - return false; + if (out.n_embd > 0 && derived_fc_in % out.n_embd == 0) { + const int derived_layers = (int)(derived_fc_in / out.n_embd); + std::fprintf(stderr, + "[draft] dflash.n_target_layers metadata (%d) != " + "fc-derived capture count (%d); using the weights\n", + out.n_target_layers, derived_layers); + out.n_target_layers = derived_layers; + } else { + char buf[256]; + std::snprintf(buf, sizeof(buf), + "GGUF shape mismatch: dflash.fc.weight->ne[0]=%lld " + "!= n_target_layers*n_embd=%d*%d=%lld", + (long long)derived_fc_in, + out.n_target_layers, out.n_embd, (long long)expected_fc_in); + set_last_error(buf); + return false; + } } } } diff --git a/server/src/gemma4/gemma4_backend.cpp b/server/src/gemma4/gemma4_backend.cpp index cfed37494..9e7f131a4 100644 --- a/server/src/gemma4/gemma4_backend.cpp +++ b/server/src/gemma4/gemma4_backend.cpp @@ -6,6 +6,7 @@ #include "gemma4_backend.h" #include "dflash27b.h" +#include "../qwen3/qwen3_kvflash_scorer.h" #include "common/sampler.h" #include "common/io_utils.h" #include "common/dflash_feature_ring.h" @@ -49,11 +50,19 @@ bool Gemma4Backend::init() { return false; } - if (!create_gemma4_cache(backend_, w_, cfg_.device.max_ctx, cache_)) { + kvflash_read_config(); + if (!create_gemma4_cache(backend_, w_, cfg_.device.max_ctx, cache_, + kvflash_tokens_)) { std::fprintf(stderr, "[gemma4] cache alloc failed\n"); return false; } cache_.fa_window = cfg_.fa_window; + if (kvflash_active() && cache_.fa_window > 0) { + std::fprintf(stderr, "[kvflash] --fa-window and --kvflash are 
mutually " + "exclusive full-attention policies\n"); + return false; + } + if (!kvflash_attach()) return false; // Load draft model for speculative decode. if (cfg_.draft_path && !load_decode_draft()) { @@ -117,18 +126,22 @@ bool Gemma4Backend::unpark(const std::string & what) { } // Recreate KV cache - if (!create_gemma4_cache(backend_, w_, cfg_.device.max_ctx, cache_)) { + if (!create_gemma4_cache(backend_, w_, cfg_.device.max_ctx, cache_, + kvflash_tokens_)) { std::fprintf(stderr, "[gemma4] unpark: failed to recreate cache\n"); free_gemma4_weights(w_); return false; } cache_.fa_window = cfg_.fa_window; + if (!kvflash_attach()) return false; + kvflash_drafter_failed_ = false; // fresh VRAM: allow a retry parked_ = false; std::printf("[gemma4] unparked (VRAM restored)\n"); std::fflush(stdout); if (cfg_.draft_path && !draft_parked_ && draft_backend_) { delete dflash_target_; dflash_target_ = new Gemma4DFlashTarget(w_, cache_, backend_); + if (kvflash_active()) dflash_target_->set_kvflash_pager(&kvflash_pager_); } } @@ -138,6 +151,118 @@ bool Gemma4Backend::unpark(const std::string & what) { return true; } +// ── kvflash helpers ──────────────────────────────────────────────────── + +void Gemma4Backend::kvflash_read_config() { + if (std::getenv("DFLASH_KVFLASH")) { + kvflash_drafter_path_ = kvflash_find_drafter(cfg_.model_path); + } + // "auto" sizes from the GPU (weights resident, cache not yet allocated): + // gemma4 pools the FULL-attention layers only (F16 cache); SWA rings are + // fixed-size and excluded from the density. 
+ KvFlashAutoBudget kvf_budget; + { + size_t gpu_free = 0, gpu_total = 0; + if (ggml_backend_dev_t dev = ggml_backend_get_device(backend_)) { + ggml_backend_dev_memory(dev, &gpu_free, &gpu_total); + } + int64_t bpt = 0; + for (int il = 0; il < w_.n_layer; ++il) { + if (!gemma4_has_kv(w_, il) || gemma4_is_swa_layer(w_, il)) continue; + bpt += (int64_t)gemma4_n_head_kv(w_, il) * 2 * + (int64_t)ggml_row_size(GGML_TYPE_F16, gemma4_head_dim(w_, il)); + } + kvf_budget.free_bytes = (int64_t)gpu_free; + kvf_budget.bytes_per_token = bpt; + kvf_budget.reserve_bytes = (int64_t)(1.5 * 1073741824.0) + + (kvflash_drafter_path_.empty() ? 0 : (int64_t)(1.7 * 1073741824.0)); + } + kvflash_tokens_ = kvflash_pool_from_env(cfg_.device.max_ctx, KvFlashConfig{}, + !kvflash_drafter_path_.empty(), + kvf_budget); + if (kvflash_tokens_ > 0) { + const char * tau = std::getenv("DFLASH_KVFLASH_TAU"); + kvflash_tau_ = std::max(1, tau ? std::atoi(tau) : 64); + } +} + +// Drafter rescore + repage (FlashMemory tau loop) with the cross-tokenizer +// scorer: gemma ids are detokenized and re-scored through the Qwen3-0.6B +// drafter. Lazy: the drafter + tokenizers load on the first reselect that +// needs them, never on a request's first tokens. 
+void Gemma4Backend::kvflash_maybe_reselect(int generated) { + if (!kvflash_active() || kvflash_tau_ <= 0) return; + const int tau = std::max(kvflash_tau_, (int)(kvflash_history_.size() / 45)); + if (generated % tau != 0) return; + if (!kvflash_scorer_) { + if (kvflash_drafter_path_.empty() || kvflash_drafter_failed_) return; + if (!drafter_loaded_) { + ggml_backend_synchronize(backend_); + std::fprintf(stderr, "[kvflash] loading drafter for residency scoring: %s\n", + kvflash_drafter_path_.c_str()); + if (!load_drafter(kvflash_drafter_path_, /*gpu_layers=*/999, + cfg_.device.gpu, drafter_ctx_)) { + std::fprintf(stderr, "[kvflash] drafter load failed (%s); staying on " + "LRU residency\n", dflash27b_last_error()); + kvflash_drafter_failed_ = true; + return; + } + drafter_loaded_ = true; + } + kvflash_scorer_ = std::make_unique( + &drafter_ctx_, cfg_.model_path, kvflash_drafter_path_); + std::fprintf(stderr, "[kvflash] cross-tokenizer drafter scorer attached " + "(tau=%d)\n", kvflash_tau_); + } + if (!kvflash_scorer_->score_chunks(kvflash_history_, kvflash_pager_.chunk_tokens(), + kvflash_scores_)) { + return; // scorer failure -> keep LRU behavior this round + } + kvflash_pager_.score_hook = [this](int c) { + return c < (int)kvflash_scores_.size() ? kvflash_scores_[c] : 1e30f; + }; + const int events = kvflash_pager_.reselect(); + kvflash_pager_.score_hook = nullptr; + if (events > 0) { + std::fprintf(stderr, "[kvflash] reselect @gen=%d: %d page events\n", + generated, events); + } +} + +bool Gemma4Backend::kvflash_attach() { + if (!kvflash_active()) return true; + // Pool the FULL-attention layers only; SWA layers ring-buffer natively + // and KV-reuse layers share their source layer's tensors. 
+ std::vector full_k, full_v; + for (int il = 0; il < w_.n_layer; ++il) { + if (cache_.k[(size_t)il] && !gemma4_is_swa_layer(w_, il)) { + full_k.push_back(cache_.k[(size_t)il]); + full_v.push_back(cache_.v[(size_t)il]); + } + } + KvFlashConfig pc; + pc.pool_tokens = kvflash_tokens_; + if (!kvflash_pager_.attach(pc, full_k, full_v)) { + std::fprintf(stderr, "kvflash: pager attach failed (pool=%d, " + "full-attn layers=%zu)\n", + kvflash_tokens_, full_k.size()); + return false; + } + std::printf("[kvflash] resident pool %d tokens over %zu full-attn layers " + "(logical max_ctx %d, SWA ring %d), policy=%s\n", + kvflash_tokens_, full_k.size(), cfg_.device.max_ctx, + cache_.swa_size, + !kvflash_drafter_path_.empty() + ? "drafter/cross-tok (attaches on first reselect)" + : "lru (recency-only: no Qwen3-0.6B drafter found)"); + std::fflush(stdout); + return true; +} + +bool Gemma4Backend::kvflash_alloc_span(int kv_start, int n_tok) { + return !kvflash_active() || kvflash_pager_.alloc_span(kv_start, n_tok); +} + // ── Prefill ──────────────────────────────────────────────────────────── int Gemma4Backend::do_prefill(const std::vector & tokens, @@ -147,6 +272,19 @@ int Gemma4Backend::do_prefill(const std::vector & tokens, const int hidden = w_.n_embd; const int chunk = cfg_.chunk; + if (kvflash_active()) { + // Fresh request: rebuild the pager mapping. Restore paths land the + // prefix identity-mapped and pre-allocate [0, kv_offset) themselves. 
+ if (kv_offset == 0) kvflash_pager_.reset(); + if (kv_offset + n > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) { + std::fprintf(stderr, + "[kvflash] prompt (%d @ offset %d) exceeds pool %d; raise " + "--kvflash or enable pflash compression\n", + n, kv_offset, kvflash_tokens_); + return -1; + } + } + std::vector embed(chunk * hidden); std::vector logits; @@ -168,8 +306,10 @@ int Gemma4Backend::do_prefill(const std::vector & tokens, for (int i = 0; i < len * hidden; ++i) embed[i] *= scale; const int kv_pos = kv_offset + pos; - if (!gemma4_step(backend_, w_, cache_, embed.data(), - tokens.data() + pos, len, kv_pos, logits)) { + if (!kvflash_alloc_span(kv_pos, len) || + !gemma4_step(backend_, w_, cache_, embed.data(), + tokens.data() + pos, len, kv_pos, logits, + kvflash_active() ? &kvflash_pager_ : nullptr)) { std::fprintf(stderr, "[gemma4] prefill step failed at pos=%d\n", kv_pos); return -1; } @@ -194,6 +334,15 @@ int Gemma4Backend::do_prefill(const std::vector & tokens, } } + if (kvflash_active()) { + if (kv_offset == 0) { + kvflash_history_.assign(tokens.begin(), tokens.end()); + } else { + kvflash_history_.resize((size_t)kv_offset, 0); // restored prefix ids unknown + kvflash_history_.insert(kvflash_history_.end(), tokens.begin(), tokens.end()); + } + } + return kv_offset + pos; } @@ -285,8 +434,10 @@ bool Gemma4Backend::do_decode(int committed, int n_gen, float scale = std::sqrt((float)hidden); for (int j = 0; j < hidden; ++j) embed_buf[j] *= scale; - if (!gemma4_step(backend_, w_, cache_, embed_buf.data(), - &tok, 1, committed, logits)) { + if (!kvflash_alloc_span(committed, 1) || + !gemma4_step(backend_, w_, cache_, embed_buf.data(), + &tok, 1, committed, logits, + kvflash_active() ? 
&kvflash_pager_ : nullptr)) { return false; } @@ -308,6 +459,10 @@ bool Gemma4Backend::do_decode(int committed, int n_gen, io.emit(next); committed++; cache_.cur_pos = committed; + if (kvflash_active()) { + kvflash_history_.push_back(next); + kvflash_maybe_reselect((int)out_tokens.size()); + } if (io.cancelled) break; // Check EOS @@ -323,7 +478,8 @@ bool Gemma4Backend::do_spec_decode(int committed, int n_gen, std::vector & out_tokens, const DaemonIO & io, const BudgetHook * budget_hook, - bool * forced_close_out) { + bool * forced_close_out, + float * accept_rate_out) { const int hidden = w_.n_embd; int32_t last_tok = cache_.last_tok; @@ -553,6 +709,12 @@ bool Gemma4Backend::do_spec_decode(int committed, int n_gen, n_draft_steps, n_accept_sum, total_draft_pos, accept_pct, n_draft_steps > 0 ? (double)n_generated / (double)n_draft_steps : 0.0); + // Surface acceptance to the HTTP usage block (was silently 0.0, the + // same reporting-only gap as the layer-split path fixed in PR #321). Zero draft positions report 0.0 rather than NaN/inf. + if (accept_rate_out) { + *accept_rate_out = total_draft_pos > 0 ? (float)(n_accept_sum / (double)total_draft_pos) : 0.0f; + } + io.emit(-1); return true; } @@ -607,7 +769,8 @@ GenerateResult Gemma4Backend::generate_impl(const GenerateRequest & req, result.spec_decode_ran = true; if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io, &req.budget_hook, - &result.budget_forced_close)) { + &result.budget_forced_close, + &result.accept_rate)) { result.error = "spec_decode"; return result; } @@ -624,7 +787,8 @@ GenerateResult Gemma4Backend::generate_impl(const GenerateRequest & req, for (int j = 0; j < hidden; ++j) embed_buf[j] *= scale; if (!gemma4_step(backend_, w_, cache_, embed_buf.data(), - &last_tok, 1, committed - 1, logits)) { + &last_tok, 1, committed - 1, logits, + kvflash_active() ? 
&kvflash_pager_ : nullptr)) { result.error = "first logits"; return result; } @@ -725,6 +889,22 @@ GenerateResult Gemma4Backend::restore_and_generate_impl(int slot, cache_.cur_pos = snap_pos; cache_.last_tok = snap.last_tok; + // kvflash: the restored prefix is identity-mapped; rebuild the pager + // mapping over [0, snap_pos) before the delta prefill extends it. + if (kvflash_active()) { + if (snap_pos > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) { + std::fprintf(stderr, "[kvflash] restored prefix (%d) exceeds pool %d\n", + snap_pos, kvflash_tokens_); + result.error = "kvflash: prompt exceeds resident pool"; + return result; + } + kvflash_pager_.reset(); + if (!kvflash_alloc_span(0, snap_pos)) { + result.error = "kvflash_slot"; + return result; + } + } + // Set up sampler sampler_ = req.sampler; if (req.do_sample && sampler_.seed != 0) { @@ -795,7 +975,8 @@ GenerateResult Gemma4Backend::restore_and_generate_impl(int slot, result.spec_decode_ran = true; if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io, &req.budget_hook, - &result.budget_forced_close)) { + &result.budget_forced_close, + &result.accept_rate)) { result.error = "spec_decode"; return result; } @@ -812,7 +993,8 @@ GenerateResult Gemma4Backend::restore_and_generate_impl(int slot, for (int j = 0; j < hidden; ++j) embed_buf[j] *= scale; if (!gemma4_step(backend_, w_, cache_, embed_buf.data(), - &last_tok, 1, committed - 1, logits)) { + &last_tok, 1, committed - 1, logits, + kvflash_active() ? &kvflash_pager_ : nullptr)) { result.error = "first logits"; return result; } @@ -867,6 +1049,13 @@ GenerateResult Gemma4Backend::restore_and_generate_impl(int slot, bool Gemma4Backend::snapshot_save(int slot) { if (parked_) return false; if (slot < 0 || slot >= PREFIX_SLOTS) return false; + // kvflash: snapshots copy rows [0, snap_pos) assuming identity layout, + // which breaks after the first page-out relocates a chunk. 
+ if (kvflash_active() && !kvflash_pager_.is_identity()) { + std::fprintf(stderr, "[kvflash] snapshot skipped: pool has relocated " + "chunks (page-table serialization not implemented)\n"); + return false; + } auto & snap = snapshots_[slot]; const int n_layer = cache_.n_layer; @@ -1129,6 +1318,7 @@ bool Gemma4Backend::load_decode_draft() { delete dflash_target_; dflash_target_ = new Gemma4DFlashTarget(w_, cache_, backend_); + if (kvflash_active()) dflash_target_->set_kvflash_pager(&kvflash_pager_); draft_parked_ = false; std::printf("[gemma4] spec-decode ready: capture_layers=%d mirror_cap=%d\n", n_capture, mirror_cap); diff --git a/server/src/gemma4/gemma4_backend.h b/server/src/gemma4/gemma4_backend.h index 7898e2359..6295496b9 100644 --- a/server/src/gemma4/gemma4_backend.h +++ b/server/src/gemma4/gemma4_backend.h @@ -12,6 +12,8 @@ #include "gemma4_internal.h" #include "gemma4_dflash_target.h" #include "common/sampler.h" +#include "../common/kvflash_pager.h" +#include "../common/kvflash_scorer.h" #include "../qwen3/qwen3_drafter.h" #include "ggml.h" @@ -99,6 +101,27 @@ class Gemma4Backend : public ModelBackend { static constexpr int PREFIX_SLOTS = 64; Gemma4Snapshot snapshots_[PREFIX_SLOTS]; + // ── kvflash (bounded KV residency; see common/kvflash_pager.h) ── + // Pools the FULL-attention layers only (SWA layers already ring-buffer). + // Drafter-scored residency by default via the cross-tokenizer bridge + // (KvFlashCrossTokScorer: gemma ids are detokenized and re-scored by + // the Qwen3-0.6B drafter); LRU is the fallback when no drafter is + // found or --kvflash-policy lru. 
+ KvFlashPager kvflash_pager_; + std::unique_ptr kvflash_scorer_; + std::vector kvflash_scores_; + std::vector kvflash_history_; // prompt + generated ids + std::string kvflash_drafter_path_; + int kvflash_tokens_ = 0; // 0 = off + int kvflash_tau_ = 64; + bool kvflash_drafter_failed_ = false; + bool kvflash_active() const { return kvflash_tokens_ > 0; } + void kvflash_read_config(); + bool kvflash_attach(); + bool kvflash_alloc_span(int kv_start, int n_tok); + // Drafter rescore + repage every effective-tau generated tokens. + void kvflash_maybe_reselect(int generated); + // Prefill prompt tokens in chunks, return absolute committed position. // kv_offset: starting KV cache position (0 for fresh prefill, snap_pos for restore). int do_prefill(const std::vector & tokens, const DaemonIO & io, @@ -126,7 +149,8 @@ class Gemma4Backend : public ModelBackend { std::vector & out_tokens, const DaemonIO & io, const BudgetHook * budget_hook = nullptr, - bool * forced_close_out = nullptr); + bool * forced_close_out = nullptr, + float * accept_rate_out = nullptr); bool load_decode_draft(); void free_decode_draft(); diff --git a/server/src/gemma4/gemma4_dflash_target.cpp b/server/src/gemma4/gemma4_dflash_target.cpp index aebd0b096..7983ccfb3 100644 --- a/server/src/gemma4/gemma4_dflash_target.cpp +++ b/server/src/gemma4/gemma4_dflash_target.cpp @@ -1,6 +1,7 @@ // Gemma4DFlashTarget — DFlashTarget adapter for Gemma4 iSWA models. #include "gemma4_dflash_target.h" +#include "../common/kvflash_pager.h" #include "dflash27b.h" #include @@ -53,11 +54,16 @@ bool Gemma4DFlashTarget::verify_batch( const float scale = std::sqrt((float)hidden); for (size_t i = 0; i < embed.size(); ++i) embed[i] *= scale; + // kvflash: allocate the verify block's slots up front (may evict). 
+ if (pager_ && !pager_->alloc_span(base_pos, n_tokens)) { + return false; + } + // Run verify (all-token argmax) std::vector argmax_buf; if (!gemma4_verify_batch(backend_, w_, cache_, embed.data(), tokens.data(), n_tokens, base_pos, - argmax_buf)) { + argmax_buf, pager_)) { return false; } diff --git a/server/src/gemma4/gemma4_dflash_target.h b/server/src/gemma4/gemma4_dflash_target.h index 1d12079b0..aeed2feae 100644 --- a/server/src/gemma4/gemma4_dflash_target.h +++ b/server/src/gemma4/gemma4_dflash_target.h @@ -32,6 +32,10 @@ class Gemma4DFlashTarget : public DFlashTarget { int & last_tok, std::vector * all_argmax = nullptr) override; + // kvflash: route verify writes through the pool (slots allocated here, + // slot-space mask inside gemma4_verify_batch). Non-owning. + void set_kvflash_pager(class KvFlashPager * pager) { pager_ = pager; } + bool snapshot_kv() override; bool restore_kv() override; @@ -52,6 +56,7 @@ class Gemma4DFlashTarget : public DFlashTarget { Gemma4Weights & w_; Gemma4Cache & cache_; ggml_backend_t backend_; + class KvFlashPager * pager_ = nullptr; // Capture layer IDs (built once in constructor). std::vector capture_ids_; diff --git a/server/src/gemma4/gemma4_graph.cpp b/server/src/gemma4/gemma4_graph.cpp index 7df5a5a9f..33f60ffb5 100644 --- a/server/src/gemma4/gemma4_graph.cpp +++ b/server/src/gemma4/gemma4_graph.cpp @@ -18,6 +18,7 @@ #include "gemma4_internal.h" #include "common/ggml_graph_precision.h" #include "common/gpu_runtime_compat.h" +#include "../common/kvflash_pager.h" #include "dflash27b.h" #include "flashprefill.h" @@ -249,7 +250,10 @@ static ggml_tensor * build_gemma4_attn_block( ? (kv_start - fa_window) : 0; const int kv_len_raw = is_swa ? 
std::min(kv_start + n_tokens, cache_len) : (kv_start + n_tokens - full_win_start); - const int kv_len = (kv_len_raw + 255) & ~255; // pad to 256 for CUDA FA + // Pad to 256 for CUDA FA, clamped to the tensor's physical capacity + // (kvflash pools allocate full layers below max_ctx; the slot mask keeps + // the clamped span exact). + const int kv_len = std::min((kv_len_raw + 255) & ~255, cache_len); ggml_tensor * Qfa = ggml_permute(ctx, Qcur, 0, 2, 1, 3); Qfa = ggml_cont(ctx, Qfa); @@ -620,8 +624,14 @@ bool gemma4_step( const int32_t * token_ids, int n_tokens, int kv_start, - std::vector & out_logits) + std::vector & out_logits, + const KvFlashPager * kvflash) { + if (kvflash && cache.fa_window > 0) { + std::fprintf(stderr, "gemma4_step: kvflash and fa_window are mutually " + "exclusive full-attention policies\n"); + return false; + } // Allocate graph context. Persistent thread_local arena: rebuilt graphs // land at identical addresses every step, so the ggml-cuda CUDA-graph // cache (keyed on nodes[0], memcmps node properties) can replay the @@ -662,9 +672,18 @@ bool gemma4_step( } // Attention masks (full + SWA) - // Full-attention mask: covers all positions [0, kv_start+n_tokens) + // Full-attention mask: covers all positions [0, kv_start+n_tokens), + // clamped to the full-layer tensor capacity (pool-sized under kvflash) — + // must agree with the FA span clamp in build_gemma4_attn_block. 
+ int full_cap = cache.max_ctx; + for (int il = 0; il < (int)cache.k.size(); ++il) { + if (cache.k[(size_t)il] && !gemma4_is_swa_layer(w, il)) { + full_cap = (int)cache.k[(size_t)il]->ne[1]; + break; + } + } const int kv_len_raw = kv_start + n_tokens; - const int kv_len_padded = (kv_len_raw + 255) & ~255; + const int kv_len_padded = std::min((kv_len_raw + 255) & ~255, full_cap); ggml_tensor * mk_full = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, kv_len_padded, n_tokens, 1, 1); ggml_set_input(mk_full); ggml_tensor * mk_full_f16 = ggml_cast(ctx, mk_full, GGML_TYPE_F16); @@ -768,12 +787,32 @@ bool gemma4_step( std::vector pos((size_t)n_tokens); for (int i = 0; i < n_tokens; ++i) pos[i] = kv_start + i; ggml_backend_tensor_set(pp, pos.data(), 0, ggml_nbytes(pp)); + if (!kvi_full && kvflash) { + std::fprintf(stderr, "gemma4_step: kvflash requires the set_rows path " + "(DFLASH_GEMMA4_NO_KVPAD is incompatible)\n"); + ggml_free(ctx); + return false; + } + std::vector kvf_mfull; // slot-space full mask (kvflash) if (kvi_full) { - // Full layers append at the absolute position; SWA layers at the ring - // slot. Per-token modular indices also land chunks that cross the - // ring wrap boundary correctly (the offset-view path wrote one - // contiguous block). - ggml_backend_tensor_set(kvi_full, pos.data(), 0, ggml_nbytes(kvi_full)); + // Full layers append at the absolute position (or the kvflash pool + // slot); SWA layers at the ring slot. Per-token modular indices also + // land chunks that cross the ring wrap boundary correctly (the + // offset-view path wrote one contiguous block). + if (kvflash) { + // Rows + slot-space full mask in one pass (shared helper; the + // mask is uploaded below where the legacy path builds its own). 
+ std::vector rows; + if (!kvflash_fill_rows_and_masks(*kvflash, kv_start, n_tokens, + kv_len_padded, /*swa_window=*/0, + rows, &kvf_mfull, nullptr)) { + ggml_free(ctx); + return false; + } + ggml_backend_tensor_set(kvi_full, rows.data(), 0, ggml_nbytes(kvi_full)); + } else { + ggml_backend_tensor_set(kvi_full, pos.data(), 0, ggml_nbytes(kvi_full)); + } GGML_ASSERT(swa_size > 0); std::vector ring((size_t)n_tokens); for (int i = 0; i < n_tokens; ++i) ring[i] = (kv_start + i) % swa_size; @@ -785,12 +824,18 @@ bool gemma4_step( ggml_backend_tensor_set(tok_ids, token_ids, 0, (size_t)n_tokens * sizeof(int32_t)); } - // Causal mask (full attention) — padded positions are masked with -inf - std::vector mfull((size_t)kv_len_padded * n_tokens, -INFINITY); - for (int q = 0; q < n_tokens; ++q) { - const int abs_q = kv_start + q; - for (int k = 0; k <= abs_q && k < kv_len_raw; ++k) { - mfull[(size_t)q * kv_len_padded + k] = 0.0f; + // Causal mask (full attention) — padded positions are masked with -inf. + // kvflash: SLOT-space mask already built alongside the append rows. 
+ std::vector mfull; + if (kvflash) { + mfull = std::move(kvf_mfull); + } else { + mfull.assign((size_t)kv_len_padded * n_tokens, -INFINITY); + for (int q = 0; q < n_tokens; ++q) { + const int abs_q = kv_start + q; + for (int k = 0; k <= abs_q && k < kv_len_raw; ++k) { + mfull[(size_t)q * kv_len_padded + k] = 0.0f; + } } } ggml_backend_tensor_set(mk_full, mfull.data(), 0, ggml_nbytes(mk_full)); @@ -844,8 +889,14 @@ bool gemma4_verify_batch( const int32_t * token_ids, int n_tokens, int kv_start, - std::vector & out_argmax) + std::vector & out_argmax, + const KvFlashPager * kvflash) { + if (kvflash && cache.fa_window > 0) { + std::fprintf(stderr, "gemma4_verify_batch: kvflash and fa_window are " + "mutually exclusive\n"); + return false; + } ggml_init_params ip{}; ip.mem_size = ggml_tensor_overhead() * 16384 + ggml_graph_overhead() + 16 * 1024 * 1024; ip.no_alloc = true; @@ -865,9 +916,28 @@ bool gemma4_verify_batch( ggml_set_input(tok_ids); } - // Attention masks (padded) + // kvflash: full-layer writes must go through set_rows to land in pool + // slots; SWA ring rows ride the same mechanism (pos % swa_size). 
+ ggml_tensor * kvi_full = nullptr, * kvi_swa = nullptr; + if (kvflash) { + kvi_full = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens); + ggml_set_input(kvi_full); + kvi_swa = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens); + ggml_set_input(kvi_swa); + } + + // Attention masks (padded; full width clamps to the full-layer tensor + // capacity, which is pool-sized under kvflash — must agree with the FA + // span clamp in build_gemma4_attn_block) + int full_cap = cache.max_ctx; + for (int il = 0; il < (int)cache.k.size(); ++il) { + if (cache.k[(size_t)il] && !gemma4_is_swa_layer(w, il)) { + full_cap = (int)cache.k[(size_t)il]->ne[1]; + break; + } + } const int kv_len_raw = kv_start + n_tokens; - const int kv_len_padded = (kv_len_raw + 255) & ~255; + const int kv_len_padded = std::min((kv_len_raw + 255) & ~255, full_cap); ggml_tensor * mk_full = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, kv_len_padded, n_tokens, 1, 1); ggml_set_input(mk_full); ggml_tensor * mk_full_f16 = ggml_cast(ctx, mk_full, GGML_TYPE_F16); @@ -914,7 +984,8 @@ bool gemma4_verify_batch( } cur = build_gemma4_layer(ctx, gf, w, cache, il, cur, pp, mk_full_f16, mk_swa_f16, pl_input, - kv_start, n_tokens, cap_idx); + kv_start, n_tokens, cap_idx, + kvi_full, kvi_swa); } // Final norm @@ -954,12 +1025,27 @@ bool gemma4_verify_batch( ggml_backend_tensor_set(tok_ids, token_ids, 0, (size_t)n_tokens * sizeof(int32_t)); } - // Masks - std::vector mfull((size_t)kv_len_padded * n_tokens, -INFINITY); - for (int q = 0; q < n_tokens; ++q) { - const int abs_q = kv_start + q; - for (int k = 0; k <= abs_q && k < kv_len_raw; ++k) { - mfull[(size_t)q * kv_len_padded + k] = 0.0f; + // Masks (kvflash: slot-space full mask + slot rows via the shared helper) + std::vector mfull; + if (kvflash) { + std::vector rows; + if (!kvflash_fill_rows_and_masks(*kvflash, kv_start, n_tokens, + kv_len_padded, /*swa_window=*/0, + rows, &mfull, nullptr)) { + ggml_free(ctx); + return false; + } + ggml_backend_tensor_set(kvi_full, rows.data(), 
0, ggml_nbytes(kvi_full)); + std::vector ring((size_t)n_tokens); + for (int i = 0; i < n_tokens; ++i) ring[(size_t)i] = (kv_start + i) % swa_size; + ggml_backend_tensor_set(kvi_swa, ring.data(), 0, ggml_nbytes(kvi_swa)); + } else { + mfull.assign((size_t)kv_len_padded * n_tokens, -INFINITY); + for (int q = 0; q < n_tokens; ++q) { + const int abs_q = kv_start + q; + for (int k = 0; k <= abs_q && k < kv_len_raw; ++k) { + mfull[(size_t)q * kv_len_padded + k] = 0.0f; + } } } ggml_backend_tensor_set(mk_full, mfull.data(), 0, ggml_nbytes(mk_full)); diff --git a/server/src/gemma4/gemma4_internal.h b/server/src/gemma4/gemma4_internal.h index d1e0e9033..800f00101 100644 --- a/server/src/gemma4/gemma4_internal.h +++ b/server/src/gemma4/gemma4_internal.h @@ -188,14 +188,19 @@ struct Gemma4Cache { ggml_backend_buffer_t feat_buf = nullptr; }; +// `ctx_alloc` (kvflash): when > 0 and < max_ctx, FULL-attention layers' K/V +// tensors are allocated at ctx_alloc rows (the resident pool); SWA layers +// keep their sliding-window ring buffers (already bounded). cache.max_ctx +// stays the logical bound. 0 = allocate full layers at max_ctx (default). bool create_gemma4_cache(ggml_backend_t backend, const Gemma4Weights & w, - int max_ctx, Gemma4Cache & out); + int max_ctx, Gemma4Cache & out, int ctx_alloc = 0); bool create_gemma4_cache_partial(ggml_backend_t backend, const Gemma4Weights & w, int max_ctx, int layer_begin, int layer_end, - Gemma4Cache & out); + Gemma4Cache & out, + int ctx_alloc = 0); void free_gemma4_cache(Gemma4Cache & c); // Allocate target_feat ring buffer (call after draft load determines n_capture_layers). @@ -221,6 +226,12 @@ void free_gemma4_snapshot(Gemma4Snapshot & s); // Returns logits for last token. // token_ids: raw token IDs needed for per-layer embedding lookup (may be nullptr // if the model has no per-layer embeddings). +// `kvflash`: optional bounded-residency pager over the FULL-attention KV +// (see common/kvflash_pager.h). 
When set, full-layer append rows come from +// the pager's slot mapping and the full mask is built in SLOT space; SWA +// ring buffers are untouched. The caller must have allocated slots for +// [kv_start, kv_start + n_tokens) via slot_for() beforehand. Requires the +// set_rows path (refused under DFLASH_GEMMA4_NO_KVPAD) and fa_window == 0. bool gemma4_step( ggml_backend_t backend, const Gemma4Weights & w, @@ -229,10 +240,17 @@ bool gemma4_step( const int32_t * token_ids, int n_tokens, int kv_start, - std::vector & out_logits); + std::vector & out_logits, + const class KvFlashPager * kvflash = nullptr); // Verify batch: run forward pass returning argmax for ALL positions. // Used by DFlash speculative decode target. +// `kvflash`: optional bounded-residency pager (caller must alloc_span() +// [kv_start, kv_start+n_tokens) first). Full-layer writes go to pool slots +// via set_rows with a slot-space causal mask; SWA ring writes/masks are +// unchanged. Rejected draft slots hold future positions, so the validity +// rule excludes them until the next verify rewrites them (KV truncation +// semantics, same as the full cache). bool gemma4_verify_batch( ggml_backend_t backend, const Gemma4Weights & w, @@ -241,7 +259,8 @@ bool gemma4_verify_batch( const int32_t * token_ids, int n_tokens, int kv_start, - std::vector & out_argmax); + std::vector & out_argmax, + const class KvFlashPager * kvflash = nullptr); // Project hidden states through lm_head (out_norm + output + softcap + argmax). // Used by DFlash draft to convert draft hidden states to token IDs. 
diff --git a/server/src/gemma4/gemma4_loader.cpp b/server/src/gemma4/gemma4_loader.cpp index 00be4c8a8..c6fbb5c6b 100644 --- a/server/src/gemma4/gemma4_loader.cpp +++ b/server/src/gemma4/gemma4_loader.cpp @@ -475,9 +475,10 @@ void free_gemma4_weights(Gemma4Weights & w) { // ── Cache ────────────────────────────────────────────────────────────── bool create_gemma4_cache(ggml_backend_t backend, const Gemma4Weights & w, - int max_ctx, Gemma4Cache & out) { + int max_ctx, Gemma4Cache & out, int ctx_alloc) { return create_gemma4_cache_partial( - backend, w, max_ctx, /*layer_begin=*/0, /*layer_end=*/w.n_layer, out); + backend, w, max_ctx, /*layer_begin=*/0, /*layer_end=*/w.n_layer, out, + ctx_alloc); } bool create_gemma4_cache_partial(ggml_backend_t backend, @@ -485,7 +486,8 @@ bool create_gemma4_cache_partial(ggml_backend_t backend, int max_ctx, int layer_begin, int layer_end, - Gemma4Cache & out) { + Gemma4Cache & out, + int ctx_alloc) { if (layer_begin < 0) layer_begin = 0; if (layer_end < 0) layer_end = w.n_layer; if (layer_begin > layer_end || layer_end > w.n_layer) return false; @@ -521,6 +523,10 @@ bool create_gemma4_cache_partial(ggml_backend_t backend, const int swa_size = (w.sliding_window > 0 && w.sliding_window < max_ctx) ? w.sliding_window : max_ctx; + // kvflash: FULL-attention layers at pool capacity; SWA ring buffers are + // already bounded and stay at swa_size. + const int full_phys = (ctx_alloc > 0 && ctx_alloc < max_ctx) ? ctx_alloc : max_ctx; + // Determine KV source for each layer int last_kv_layer = -1; for (int il = 0; il < w.n_layer; ++il) { @@ -529,7 +535,7 @@ bool create_gemma4_cache_partial(ggml_backend_t backend, const int D = gemma4_head_dim(w, il); const int Hk = gemma4_n_head_kv(w, il); const bool is_swa = gemma4_is_swa_layer(w, il); - const int cache_len = is_swa ? swa_size : max_ctx; + const int cache_len = is_swa ? 
swa_size : full_phys; if (owned_layer) { out.k[il] = ggml_new_tensor_3d(out.ctx, GGML_TYPE_F16, D, cache_len, Hk); out.v[il] = ggml_new_tensor_3d(out.ctx, GGML_TYPE_F16, D, cache_len, Hk); diff --git a/server/src/internal.h b/server/src/internal.h index 3c9611326..125e9a24e 100644 --- a/server/src/internal.h +++ b/server/src/internal.h @@ -373,6 +373,8 @@ struct TargetCache { void snapshot_ssm_state(TargetCache & c); // Restore the SSM+conv state from the snapshot. void restore_ssm_state(TargetCache & c); +// Allocate rollback snapshot tensors mirroring live ssm/conv state (MoE path). +bool ensure_ssm_snapshot(TargetCache & c, ggml_backend_t backend); // ─── Cross-request prefix snapshot (Phase A) ────────────────────── // @@ -471,12 +473,18 @@ bool restore_target_cache_chain(const PrefixSnapshot * thick, // When prefill_only is true, rollback tensors (snapshots, intermediates) are // skipped — saving ~1.4 GB on 48 DeltaNet layers. Use migrate_prefill_cache() // to promote the cache to a full decode cache after prefill. +// `ctx_alloc` (0 = max_ctx): physical token capacity of the attention KV +// tensors. When smaller than max_ctx, a KvFlashPager maps logical positions to +// pool slots and pages cold chunks to host (bounded KV residency); the +// logical context bound stays max_ctx. Recurrent (DeltaNet) state is +// unaffected. 
bool create_target_cache(const TargetWeights & w, int max_ctx, int max_verify_tokens, ggml_backend_t backend, TargetCache & out, - bool prefill_only = false); + bool prefill_only = false, + int ctx_alloc = 0); bool create_target_cache_partial(const TargetWeights & w, int max_ctx, @@ -486,7 +494,8 @@ bool create_target_cache_partial(const TargetWeights & w, bool prefill_only, int layer_begin, int layer_end, - bool allocate_target_feat); + bool allocate_target_feat, + int ctx_alloc = 0); void free_target_cache(TargetCache & c); diff --git a/server/src/laguna/laguna_backend.cpp b/server/src/laguna/laguna_backend.cpp index ab75ef5a8..9631f7f76 100644 --- a/server/src/laguna/laguna_backend.cpp +++ b/server/src/laguna/laguna_backend.cpp @@ -8,6 +8,7 @@ #include "laguna_backend.h" #include "laguna_internal.h" +#include "qwen3/qwen3_kvflash_scorer.h" #include "dflash27b.h" #include @@ -68,16 +69,130 @@ bool LagunaBackend::init() { cache_.kv_k_type = args_.kv_type; cache_.kv_v_type = args_.kv_type; - if (!create_laguna_target_cache(w_, args_.max_ctx, backend_, cache_)) { + kvflash_read_config(); + if (!create_laguna_target_cache(w_, args_.max_ctx, backend_, cache_, + kvflash_tokens_)) { std::fprintf(stderr, "cache failed: %s\n", dflash27b_last_error()); free_laguna_target_weights(w_); ggml_backend_free(backend_); backend_ = nullptr; return false; } + if (!kvflash_attach()) { + ggml_backend_free(backend_); backend_ = nullptr; + return false; + } + + return true; +} + +// ── kvflash helpers ───────────────────────────────────────────────────── +// Laguna's pager protections: the trailing sliding_window span (+1 chunk +// for the partially filled head) must stay resident so SWA attention stays +// exact under paging. This drives both the pool floor and the attach config. 
+KvFlashConfig LagunaBackend::kvflash_config() const { + KvFlashConfig pc; + pc.tail_window_chunks = + std::max(4, (w_.sliding_window + pc.chunk_tokens - 1) / pc.chunk_tokens + 1); + return pc; +} + +void LagunaBackend::kvflash_read_config() { + if (std::getenv("DFLASH_KVFLASH")) { + kvflash_drafter_path_ = kvflash_find_drafter(args_.target_path.c_str()); + } + // "auto" sizes from the GPU (weights resident, cache not yet allocated): + // laguna pools ALL n_layer layers at the configured KV quant. + KvFlashAutoBudget kvf_budget; + { + size_t gpu_free = 0, gpu_total = 0; + if (ggml_backend_dev_t dev = ggml_backend_get_device(backend_)) { + ggml_backend_dev_memory(dev, &gpu_free, &gpu_total); + } + kvf_budget.free_bytes = (int64_t)gpu_free; + kvf_budget.bytes_per_token = (int64_t)w_.n_layer * w_.n_head_kv * 2 * + (int64_t)ggml_row_size(args_.kv_type, w_.head_dim); + kvf_budget.reserve_bytes = (int64_t)(1.5 * 1073741824.0) + + (kvflash_drafter_path_.empty() ? 0 : (int64_t)(1.7 * 1073741824.0)); + } + kvflash_tokens_ = kvflash_pool_from_env(args_.max_ctx, kvflash_config(), + !kvflash_drafter_path_.empty(), + kvf_budget); + if (kvflash_tokens_ > 0) { + const char * tau = std::getenv("DFLASH_KVFLASH_TAU"); + kvflash_tau_ = std::max(1, tau ? std::atoi(tau) : 64); + } +} + +// Drafter rescore + repage (FlashMemory tau loop) with the cross-tokenizer +// scorer: laguna ids are detokenized and re-scored through the Qwen3-0.6B +// drafter (relevance is text-level, so the tokenizer gap is bridged by +// re-tokenization). Lazy: the drafter + tokenizers load on the first +// reselect that needs them, never on a request's first tokens. 
+void LagunaBackend::kvflash_maybe_reselect(const std::vector & history, + int generated) { + if (!kvflash_active() || kvflash_tau_ <= 0) return; + const int tau = std::max(kvflash_tau_, (int)(history.size() / 45)); + if (generated % tau != 0) return; + if (!kvflash_scorer_) { + if (kvflash_drafter_path_.empty() || kvflash_drafter_failed_) return; + if (!drafter_loaded_) { + ggml_backend_synchronize(backend_); + std::fprintf(stderr, "[kvflash] loading drafter for residency scoring: %s\n", + kvflash_drafter_path_.c_str()); + if (!load_drafter(kvflash_drafter_path_, /*gpu_layers=*/999, + args_.device.gpu, drafter_ctx_)) { + std::fprintf(stderr, "[kvflash] drafter load failed (%s); staying on " + "LRU residency\n", dflash27b_last_error()); + kvflash_drafter_failed_ = true; + return; + } + drafter_loaded_ = true; + } + kvflash_scorer_ = std::make_unique( + &drafter_ctx_, args_.target_path, kvflash_drafter_path_); + std::fprintf(stderr, "[kvflash] cross-tokenizer drafter scorer attached " + "(tau=%d)\n", kvflash_tau_); + } + if (!kvflash_scorer_->score_chunks(history, kvflash_pager_.chunk_tokens(), + kvflash_scores_)) { + return; // scorer failure -> keep LRU behavior this round + } + kvflash_pager_.score_hook = [this](int c) { + return c < (int)kvflash_scores_.size() ? 
kvflash_scores_[c] : 1e30f; + }; + const int events = kvflash_pager_.reselect(); + kvflash_pager_.score_hook = nullptr; + if (events > 0) { + std::fprintf(stderr, "[kvflash] reselect @gen=%d: %d page events\n", + generated, events); + } +} + +bool LagunaBackend::kvflash_attach() { + if (!kvflash_active()) return true; + KvFlashConfig pc = kvflash_config(); + pc.pool_tokens = kvflash_tokens_; + if (!kvflash_pager_.attach(pc, cache_.attn_k, cache_.attn_v)) { + std::fprintf(stderr, "kvflash: pager attach failed (pool=%d)\n", + kvflash_tokens_); + return false; + } + std::printf("[kvflash] resident pool %d tokens (logical max_ctx %d), " + "policy=%s, swa_tail=%d chunks\n", + kvflash_tokens_, args_.max_ctx, + !kvflash_drafter_path_.empty() + ? "drafter/cross-tok (attaches on first reselect)" + : "lru (recency-only: no Qwen3-0.6B drafter found)", + pc.tail_window_chunks); + std::fflush(stdout); return true; } +bool LagunaBackend::kvflash_alloc_span(int kv_start, int n_tok) { + return !kvflash_active() || kvflash_pager_.alloc_span(kv_start, n_tok); +} + void LagunaBackend::print_ready_banner() const { std::printf("[laguna-daemon] ready vocab=%lld eos=%d eot=%d max_ctx=%d kv=%s chunk=%d\n", (long long)w_.embedder.n_vocab, w_.eos_id, w_.eos_chat_id, @@ -107,10 +222,17 @@ bool LagunaBackend::unpark(const std::string & what) { } cache_.kv_k_type = args_.kv_type; cache_.kv_v_type = args_.kv_type; - if (!create_laguna_target_cache(w_, args_.max_ctx, backend_, cache_)) { + if (!create_laguna_target_cache(w_, args_.max_ctx, backend_, cache_, + kvflash_tokens_)) { std::fprintf(stderr, "[unpark] cache: %s\n", dflash27b_last_error()); return false; } + if (!kvflash_attach()) { + free_laguna_target_cache(cache_); + free_laguna_target_weights(w_); + return false; // still parked, resources released + } + kvflash_drafter_failed_ = false; // fresh VRAM: allow a retry target_parked_ = false; std::printf("[unpark] target restored\n"); std::fflush(stdout); } @@ -132,6 +254,13 @@ bool 
LagunaBackend::ensure_slot(int slot) { } bool LagunaBackend::snapshot_save(int slot) { + // kvflash: snapshots copy rows assuming identity layout, which breaks + // after the first page-out relocates a chunk. + if (kvflash_active() && !kvflash_pager_.is_identity()) { + std::fprintf(stderr, "[kvflash] snapshot skipped: pool has relocated " + "chunks (page-table serialization not implemented)\n"); + return false; + } if (!ensure_slot(slot)) return false; if (!laguna_snapshot_save(cache_, snap_backend_, w_.n_layer, w_.n_head_kv, w_.head_dim, snapshots_[slot])) { @@ -189,7 +318,19 @@ GenerateResult LagunaBackend::generate_impl(const GenerateRequest & req, return result; } + // kvflash: prefill rows land identity-mapped, so the prompt must fit the + // pool with one chunk of decode headroom (decode then evicts LRU live). + if (kvflash_active() && + N > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) { + std::fprintf(stderr, "[kvflash] prompt (%d) exceeds pool %d; raise " + "--kvflash\n", N, kvflash_tokens_); + result.error = "kvflash: prompt exceeds resident pool"; + return result; + } + reset_laguna_target_cache(cache_); + if (kvflash_active()) kvflash_pager_.reset(); + const KvFlashPager * kvf = kvflash_active() ? 
&kvflash_pager_ : nullptr; // ── Prefill ── std::vector embed_pf((size_t)N * w_.n_embd); @@ -205,15 +346,23 @@ GenerateResult LagunaBackend::generate_impl(const GenerateRequest & req, for (int c = 0; c < n_chunks && ok; ++c) { const int kv_start = c * args_.chunk; const int n_tok = std::min(args_.chunk, N - c * args_.chunk); - ok = laguna_step(backend_, w_, cache_, + ok = kvflash_alloc_span(kv_start, n_tok) && + laguna_step(backend_, w_, cache_, embed_pf.data() + (size_t)kv_start * w_.n_embd, - n_tok, kv_start, no_mask, last_logits); + n_tok, kv_start, no_mask, last_logits, kvf); } if (!ok) { result.error = "prefill"; return result; } auto t_pf1 = std::chrono::steady_clock::now(); result.prefill_s = std::chrono::duration(t_pf1 - t_pf0).count(); // ── Inline snapshot (if requested) ── + // kvflash: snapshots copy rows [0, snap_pos) assuming identity layout, + // which holds until the first page-out relocates a chunk. + if (kvflash_active() && req.snap_slot >= 0 && + !kvflash_pager_.is_identity()) { + std::fprintf(stderr, "[kvflash] snapshot skipped: pool has relocated " + "chunks (page-table serialization not implemented)\n"); + } else if (req.snap_slot >= 0 && req.snap_pos > 0 && req.snap_pos <= N) { if (ensure_slot(req.snap_slot) && laguna_snapshot_save(cache_, snap_backend_, w_.n_layer, @@ -303,8 +452,10 @@ GenerateResult LagunaBackend::generate_impl(const GenerateRequest & req, } if (!w_.embedder.embed(&next_tok, 1, embed_step.data())) { ok = false; break; } std::vector step_logits; - if (!laguna_step(backend_, w_, cache_, embed_step.data(), 1, - cache_.cur_pos, no_mask, step_logits)) { ok = false; break; } + if (!kvflash_alloc_span(cache_.cur_pos, 1) || + !laguna_step(backend_, w_, cache_, embed_step.data(), 1, + cache_.cur_pos, no_mask, step_logits, kvf)) { ok = false; break; } + kvflash_maybe_reselect(history, s + 1); next_tok = pick(step_logits); } auto t_g1 = std::chrono::steady_clock::now(); @@ -342,6 +493,24 @@ GenerateResult 
LagunaBackend::restore_and_generate_impl(int slot, return result; } + // kvflash: restore lands rows identity-mapped; the full prompt (prefix + + // diff) must fit the pool. Rebuild the pager mapping over the prefix. + if (kvflash_active() && + N > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) { + std::fprintf(stderr, "[kvflash] restore prompt (%d) exceeds pool %d; " + "raise --kvflash\n", N, kvflash_tokens_); + result.error = "kvflash: prompt exceeds resident pool"; + return result; + } + if (kvflash_active()) { + kvflash_pager_.reset(); + if (!kvflash_alloc_span(0, prefix_len)) { + result.error = "kvflash_slot"; + return result; + } + } + const KvFlashPager * kvf = kvflash_active() ? &kvflash_pager_ : nullptr; + // Re-prefill diff tokens (or last cached token when diff is empty). if (prefix_len == N) { if (prefix_len <= 0) { result.error = "empty_diff"; return result; } @@ -363,9 +532,10 @@ GenerateResult LagunaBackend::restore_and_generate_impl(int slot, const int off = c * args_.chunk; const int n_tok = std::min(args_.chunk, diff_n - off); const int starts = kv_start + off; - ok = laguna_step(backend_, w_, cache_, + ok = kvflash_alloc_span(starts, n_tok) && + laguna_step(backend_, w_, cache_, embed_diff.data() + (size_t)off * w_.n_embd, - n_tok, starts, no_mask, last_logits); + n_tok, starts, no_mask, last_logits, kvf); } if (!ok) { result.error = "prefill"; return result; } @@ -437,8 +607,10 @@ GenerateResult LagunaBackend::restore_and_generate_impl(int slot, if (out_io.cancelled) break; if (!w_.embedder.embed(&next_tok, 1, embed_step.data())) { ok = false; break; } std::vector step_logits; - if (!laguna_step(backend_, w_, cache_, embed_step.data(), 1, - cache_.cur_pos, no_mask, step_logits)) { ok = false; break; } + if (!kvflash_alloc_span(cache_.cur_pos, 1) || + !laguna_step(backend_, w_, cache_, embed_step.data(), 1, + cache_.cur_pos, no_mask, step_logits, kvf)) { ok = false; break; } + kvflash_maybe_reselect(history, s + 1); next_tok = 
pick(step_logits); } auto t_g1 = std::chrono::steady_clock::now(); @@ -1085,8 +1257,10 @@ bool LagunaBackend::hybrid_forward_one_token(int32_t tok, int kv_pos, static const bool _nm = (std::getenv("DFLASH_NO_MASK") != nullptr); static std::vector _sg_logits; static std::vector _sg_sel; + if (!kvflash_alloc_span(kv_pos, 1)) return false; if (!laguna_step_hybrid(backend_, w_, cache_, act_cur.data(), 1, kv_pos, _nm, - *moe_hybrid_, _sg_logits, &_sg_sel)) + *moe_hybrid_, _sg_logits, &_sg_sel, + kvflash_active() ? &kvflash_pager_ : nullptr)) return false; // Reactive cache warm + routing observe, POST-compute (off the // single-graph critical path): make each selected expert resident @@ -1128,6 +1302,14 @@ bool LagunaBackend::hybrid_forward_one_token(int32_t tok, int kv_pos, // GPU-resident state for MoE layers GpuResidentState gpu_state; + // The per-layer fallback writes KV at literal view offsets (no set_rows), + // which a kvflash pool cannot express once chunks relocate. + if (kvflash_active()) { + std::fprintf(stderr, "[kvflash] laguna per-layer hybrid decode is not " + "pool-aware; unset DFLASH_LAGUNA_NO_SINGLE_GRAPH\n"); + return false; + } + if (!init_gpu_resident_state(gpu_state, backend_, hidden)) return false; ggml_backend_tensor_set(gpu_state.act_cur, act_cur.data(), 0, sizeof(float) * (size_t)hidden); @@ -1348,7 +1530,25 @@ GenerateResult LagunaBackend::generate_hybrid(const GenerateRequest & req, return result; } + // kvflash: hybrid prefill writes rows identity-mapped (legacy per-layer + // views), so the prompt must fit the pool; the pager mapping is built up + // front and stays identity through prefill (no eviction can trigger). 
+ if (kvflash_active() && + N > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) { + std::fprintf(stderr, "[kvflash] hybrid prompt (%d) exceeds pool %d; " + "raise --kvflash\n", N, kvflash_tokens_); + result.error = "kvflash: prompt exceeds resident pool"; + return result; + } + reset_laguna_target_cache(cache_); + if (kvflash_active()) { + kvflash_pager_.reset(); + if (!kvflash_alloc_span(0, N)) { + result.error = "kvflash_slot"; + return result; + } + } // ── Hybrid Prefill: layer-by-layer pre-FFN + batched hybrid FFN ── const int hidden = w_.n_embd; @@ -1652,6 +1852,7 @@ GenerateResult LagunaBackend::generate_hybrid(const GenerateRequest & req, break; } cache_.cur_pos++; + kvflash_maybe_reselect(history, s + 1); if (req.do_sample) { // For sampling, we need full logits — project from act_cur diff --git a/server/src/laguna/laguna_backend.h b/server/src/laguna/laguna_backend.h index 156c82e6b..881ad1abd 100644 --- a/server/src/laguna/laguna_backend.h +++ b/server/src/laguna/laguna_backend.h @@ -10,6 +10,8 @@ #include "laguna_internal.h" #include "placement/placement_config.h" #include "qwen3_drafter.h" +#include "kvflash_pager.h" +#include "kvflash_scorer.h" #include "../common/moe_hybrid_ffn_eval.h" #include "../common/moe_hybrid_storage.h" #include "../common/moe_hybrid_routing_stats.h" @@ -99,6 +101,34 @@ class LagunaBackend : public ModelBackend { bool ensure_slot(int slot); + // ── kvflash (bounded KV residency; see common/kvflash_pager.h) ── + // Drafter-scored residency by default: the Qwen3-0.6B drafter scores + // chunks through the cross-tokenizer bridge (KvFlashCrossTokScorer — + // relevance is text-level, so the target's ids are detokenized and + // re-tokenized for the drafter). LRU is the fallback when no drafter is + // found or --kvflash-policy lru. The pager covers ALL 40 layers; SWA + // exactness comes from a protected tail >= sliding_window. 
+ KvFlashPager kvflash_pager_; + std::unique_ptr kvflash_scorer_; + std::vector kvflash_scores_; + std::string kvflash_drafter_path_; + int kvflash_tokens_ = 0; // 0 = off + int kvflash_tau_ = 64; + bool kvflash_drafter_failed_ = false; + bool kvflash_active() const { return kvflash_tokens_ > 0; } + // Drafter rescore + repage every effective-tau generated tokens + // (lazy-loads the drafter + cross-tokenizer scorer on first need). + void kvflash_maybe_reselect(const std::vector & history, int generated); + // Pager protections (SWA tail) shared by the floor and attach. + KvFlashConfig kvflash_config() const; + // Read DFLASH_KVFLASH and round/clamp; call before cache creation. + void kvflash_read_config(); + // Attach the pager to the freshly created cache (init / unpark). + bool kvflash_attach(); + // Allocate pool slots for [kv_start, kv_start+n_tok) (evicting LRU as + // needed) ahead of a laguna_step call. False if the pool is exhausted. + bool kvflash_alloc_span(int kv_start, int n_tok); + // Hybrid mode helpers bool init_hybrid_mode(); // Build hot/cold expert storage for `placement` by re-reading expert weights diff --git a/server/src/laguna/laguna_internal.h b/server/src/laguna/laguna_internal.h index ec09b6113..cc37d2051 100644 --- a/server/src/laguna/laguna_internal.h +++ b/server/src/laguna/laguna_internal.h @@ -168,16 +168,21 @@ struct LagunaTargetCache { std::vector attn_v; }; +// `ctx_alloc` (kvflash): when > 0 and < max_ctx, the per-layer K/V tensors +// are allocated at ctx_alloc rows (the resident pool) while cache.max_ctx +// keeps the logical bound. 0 = allocate at max_ctx (default). 
bool create_laguna_target_cache(const LagunaTargetWeights & w, int max_ctx, ggml_backend_t backend, - LagunaTargetCache & out); + LagunaTargetCache & out, + int ctx_alloc = 0); bool create_laguna_target_cache_partial(const LagunaTargetWeights & w, int max_ctx, ggml_backend_t backend, int layer_begin, int layer_end, - LagunaTargetCache & out); + LagunaTargetCache & out, + int ctx_alloc = 0); void free_laguna_target_cache(LagunaTargetCache & c); void reset_laguna_target_cache(LagunaTargetCache & c); @@ -280,6 +285,12 @@ LagunaGraphOutputs build_laguna_graph( // `out_logits` : on success, resized to vocab and filled with last-token // logits when in.output_last_only == true (default in this // helper). +// `kvflash`: optional bounded-residency pager (see common/kvflash_pager.h). +// When set, the K/V append rows come from the pager's slot mapping and both +// masks are built in SLOT space (causal / sliding-window conditions evaluated +// on the position each slot holds). The caller must have allocated slots for +// [kv_start, kv_start + n_tok) via slot_for() beforehand. Requires the +// kv_pad set_rows path (refused otherwise). bool laguna_step( ggml_backend_t backend, const LagunaTargetWeights & w, @@ -288,7 +299,8 @@ bool laguna_step( int n_tok, int kv_start, bool no_mask, - std::vector & out_logits); + std::vector & out_logits, + const class KvFlashPager * kvflash = nullptr); // Forward decl (full definition in common/moe_hybrid_storage.h). 
struct MoeHybridStorage; @@ -306,7 +318,8 @@ bool laguna_step_hybrid( bool no_mask, const MoeHybridStorage & hyb, std::vector & out_logits, - std::vector * out_selected = nullptr); + std::vector * out_selected = nullptr, + const class KvFlashPager * kvflash = nullptr); struct LagunaLayerStepGraph { ggml_context * ctx = nullptr; diff --git a/server/src/laguna/laguna_target_graph.cpp b/server/src/laguna/laguna_target_graph.cpp index 44b1b5cd7..c44d1ee32 100644 --- a/server/src/laguna/laguna_target_graph.cpp +++ b/server/src/laguna/laguna_target_graph.cpp @@ -19,6 +19,7 @@ #include "laguna_internal.h" #include "../common/moe_hybrid_storage.h" +#include "../common/kvflash_pager.h" #include "common/ggml_graph_precision.h" #include "internal.h" #include "dflash27b.h" @@ -44,9 +45,11 @@ static constexpr float LAGUNA_EPS = 1e-6f; bool create_laguna_target_cache(const LagunaTargetWeights & w, int max_ctx, ggml_backend_t backend, - LagunaTargetCache & out) { + LagunaTargetCache & out, + int ctx_alloc) { return create_laguna_target_cache_partial( - w, max_ctx, backend, /*layer_begin=*/0, /*layer_end=*/w.n_layer, out); + w, max_ctx, backend, /*layer_begin=*/0, /*layer_end=*/w.n_layer, out, + ctx_alloc); } bool create_laguna_target_cache_partial(const LagunaTargetWeights & w, @@ -54,7 +57,8 @@ bool create_laguna_target_cache_partial(const LagunaTargetWeights & w, ggml_backend_t backend, int layer_begin, int layer_end, - LagunaTargetCache & out) { + LagunaTargetCache & out, + int ctx_alloc) { if (layer_begin < 0) layer_begin = 0; if (layer_end < 0) layer_end = w.n_layer; if (layer_begin > layer_end || layer_end > w.n_layer) { @@ -62,6 +66,9 @@ bool create_laguna_target_cache_partial(const LagunaTargetWeights & w, return false; } + // kvflash: tensors at pool capacity, logical bound stays max_ctx. + const int ctx_phys = (ctx_alloc > 0 && ctx_alloc < max_ctx) ? 
ctx_alloc : max_ctx; + out.backend = backend; out.max_ctx = max_ctx; out.cur_pos = 0; @@ -88,10 +95,10 @@ bool create_laguna_target_cache_partial(const LagunaTargetWeights & w, if (il < layer_begin || il >= layer_end) continue; char nm[32]; std::snprintf(nm, sizeof(nm), "k_l%d", il); - ggml_tensor * k = ggml_new_tensor_3d(out.base_ctx, k_type, w.head_dim, max_ctx, w.n_head_kv); + ggml_tensor * k = ggml_new_tensor_3d(out.base_ctx, k_type, w.head_dim, ctx_phys, w.n_head_kv); ggml_set_name(k, nm); std::snprintf(nm, sizeof(nm), "v_l%d", il); - ggml_tensor * v = ggml_new_tensor_3d(out.base_ctx, v_type, w.head_dim, max_ctx, w.n_head_kv); + ggml_tensor * v = ggml_new_tensor_3d(out.base_ctx, v_type, w.head_dim, ctx_phys, w.n_head_kv); ggml_set_name(v, nm); out.attn_k[il] = k; out.attn_v[il] = v; @@ -978,8 +985,14 @@ bool laguna_step( int n_tok, int kv_start, bool no_mask, - std::vector & out_logits) + std::vector & out_logits, + const KvFlashPager * kvflash) { + if (kvflash && no_mask) { + std::fprintf(stderr, "laguna_step: kvflash requires masks (slots are " + "relocated; position-implicit masking is invalid)\n"); + return false; + } // Same CUDA-graph-replay treatment as laguna_step_hybrid: persistent // arena (stable node addresses -> stable graph key), stride-padded KV // span, and set_rows K/V append (index is an input, so node properties @@ -1056,6 +1069,25 @@ bool laguna_step( std::vector pos((size_t)n_tok); for (int i = 0; i < n_tok; ++i) pos[i] = kv_start + i; ggml_backend_tensor_set(pp, pos.data(), 0, ggml_nbytes(pp)); + + if (kvflash) { + if (!kvi) { + std::fprintf(stderr, "laguna_step: kvflash requires the kv_pad " + "set_rows path (NO_KVPAD / PAD_CPY are incompatible)\n"); + ggml_free(ctx); + return false; + } + std::vector rows; + std::vector mfull, mswa; + if (!kvflash_fill_rows_and_masks(*kvflash, kv_start, n_tok, mk_w, + w.sliding_window, rows, &mfull, &mswa)) { + ggml_free(ctx); + return false; + } + ggml_backend_tensor_set(kvi, rows.data(), 0, 
ggml_nbytes(kvi)); + ggml_backend_tensor_set(mk_full, mfull.data(), 0, ggml_nbytes(mk_full)); + ggml_backend_tensor_set(mk_swa, mswa.data(), 0, ggml_nbytes(mk_swa)); + } else { if (kvi) { ggml_backend_tensor_set(kvi, pos.data(), 0, ggml_nbytes(kvi)); } @@ -1083,6 +1115,7 @@ bool laguna_step( } ggml_backend_tensor_set(mk_swa, mswa.data(), 0, ggml_nbytes(mk_swa)); } + } if (ggml_backend_graph_compute(backend, gf) != GGML_STATUS_SUCCESS) { std::fprintf(stderr, "laguna_step: graph_compute failed\n"); @@ -1111,8 +1144,14 @@ bool laguna_step_hybrid( bool no_mask, const MoeHybridStorage & hyb, std::vector & out_logits, - std::vector * out_selected) + std::vector * out_selected, + const KvFlashPager * kvflash) { + if (kvflash && no_mask) { + std::fprintf(stderr, "laguna_step_hybrid: kvflash requires masks (slots " + "are relocated; position-implicit masking is invalid)\n"); + return false; + } // Persistent arena: rebuilt graphs land at IDENTICAL addresses every step. // The ggml-cuda CUDA-graph cache is keyed on nodes[0] and memcmps node // properties (incl. 
src data pointers); address stability across steps is @@ -1209,6 +1248,25 @@ bool laguna_step_hybrid( std::vector pos((size_t)n_tok); for (int i = 0; i < n_tok; ++i) pos[i] = kv_start + i; ggml_backend_tensor_set(pp, pos.data(), 0, ggml_nbytes(pp)); + + if (kvflash) { + if (!kvi) { + std::fprintf(stderr, "laguna_step_hybrid: kvflash requires the kv_pad " + "set_rows path (NO_KVPAD / PAD_CPY are incompatible)\n"); + ggml_free(ctx); + return false; + } + std::vector rows; + std::vector mfull, mswa; + if (!kvflash_fill_rows_and_masks(*kvflash, kv_start, n_tok, mk_w, + w.sliding_window, rows, &mfull, &mswa)) { + ggml_free(ctx); + return false; + } + ggml_backend_tensor_set(kvi, rows.data(), 0, ggml_nbytes(kvi)); + ggml_backend_tensor_set(mk_full, mfull.data(), 0, ggml_nbytes(mk_full)); + ggml_backend_tensor_set(mk_swa, mswa.data(), 0, ggml_nbytes(mk_swa)); + } else { if (kvi) { // set_rows row indices = absolute cache positions of this step's tokens ggml_backend_tensor_set(kvi, pos.data(), 0, ggml_nbytes(kvi)); @@ -1232,6 +1290,7 @@ bool laguna_step_hybrid( } ggml_backend_tensor_set(mk_swa, mswa.data(), 0, ggml_nbytes(mk_swa)); } + } // Set ALL residency LUTs in two batched H2D copies from the hot stack mapping. std::vector lutbuf((size_t)n_expert * (size_t)n_moe); diff --git a/server/src/qwen3/qwen3_kvflash_scorer.cpp b/server/src/qwen3/qwen3_kvflash_scorer.cpp new file mode 100644 index 000000000..4dc00c7c9 --- /dev/null +++ b/server/src/qwen3/qwen3_kvflash_scorer.cpp @@ -0,0 +1,210 @@ +#include "qwen3_kvflash_scorer.h" + +#include "qwen3_drafter_model.h" +#include "server/tokenizer.h" + +#include +#include +#include + +namespace dflash::common { + +namespace { + +constexpr int kLookahead = 8; +constexpr int kPoolKernel = 13; +constexpr int kMinSegment = 4096; + +// Tail-attention token scores for `ids`: mean over the lookahead window of +// the drafter's running-max, then AvgPool smoothing. Same math as +// drafter_score_and_compress. 
+bool score_tokens_direct(DrafterContext & ctx, const std::vector<int32_t> & ids,
+                         std::vector<float> & out) {
+    const int S = (int)ids.size();
+    std::vector<float> running_max;
+    if (!forward_qwen3_drafter_model(ctx.weights, ids, kLookahead, running_max)) return false;
+    // The mean below reads running_max[t * S + j] for t < kLookahead: reject a short buffer.
+    if (running_max.size() < (size_t)kLookahead * (size_t)S) return false;
+    std::vector<float> score((size_t)S, 0.0f);
+    for (int j = 0; j < S; j++) {
+        float s = 0.0f;
+        for (int t = 0; t < kLookahead; t++) s += running_max[(size_t)t * S + j];
+        score[j] = s / kLookahead;  // mean over the kLookahead rows
+    }
+    out.assign((size_t)S, 0.0f);
+    const int half = kPoolKernel / 2;
+    for (int j = 0; j < S; j++) {
+        const int lo = std::max(0, j - half), hi = std::min(S - 1, j + half);  // window clipped to [0, S-1]
+        float s = 0.0f;
+        for (int k = lo; k <= hi; k++) s += score[k];
+        out[j] = s / (hi - lo + 1);  // divide by the clipped window length, not kPoolKernel
+    }
+    return true;
+}
+
+void z_normalize(float * v, size_t n) {
+    if (n == 0) return;  // nothing to normalize; also keeps the divisions by n below safe
+    double mean = 0;     // accumulate in double, write back as float
+    for (size_t i = 0; i < n; i++) mean += v[i];
+    mean /= n;
+    double var = 0;      // population variance (divided by n below)
+    for (size_t i = 0; i < n; i++) var += (v[i] - mean) * (v[i] - mean);
+    const float inv = 1.0f / ((float)std::sqrt(var / n) + 1e-6f);  // epsilon keeps a constant input finite
+    for (size_t i = 0; i < n; i++) v[i] = (float)((v[i] - mean) * inv);  // in place
+}
+
+// Score `ids` with allocation-failure resilience: try the full forward;
+// on failure split into two equal halves, score each with the TRUE query
+// tail (last kLookahead ids) appended so relevance stays query-aware, and
+// z-normalize per segment so the merged ranking is comparable. Recursion
+// floor kMinSegment. The drafter's per-call buffers (~10 KB/token) can
+// fail on a fragmented CUDA heap at 32K+ even when total free VRAM is
+// ample; segmented scoring trades exact cross-segment calibration for
+// robustness.
+bool score_tokens_resilient(DrafterContext & ctx, const std::vector & ids, + std::vector & out) { + if (score_tokens_direct(ctx, ids, out)) { + z_normalize(out.data(), out.size()); + return true; + } + const int S = (int)ids.size(); + if (S <= kMinSegment) return false; + + std::fprintf(stderr, "[kvflash-scorer] forward failed at S=%d, bisecting\n", S); + const int mid = S / 2; + std::vector tail(ids.end() - kLookahead, ids.end()); + + std::vector left(ids.begin(), ids.begin() + mid); + left.insert(left.end(), tail.begin(), tail.end()); + std::vector ls; + if (!score_tokens_resilient(ctx, left, ls)) return false; + + std::vector right(ids.begin() + mid, ids.end()); + std::vector rs; + if (!score_tokens_resilient(ctx, right, rs)) return false; + + out.assign((size_t)S, 0.0f); + std::copy(ls.begin(), ls.begin() + mid, out.begin()); // drop tail scores + std::copy(rs.begin(), rs.begin() + (S - mid), out.begin() + mid); + return true; +} + +} // namespace + +bool KvFlashDrafterScorer::score_chunks(const std::vector & ids, + int chunk_tokens, + std::vector & out) { + const int S = (int)ids.size(); + out.clear(); + if (!ctx_ || !ctx_->loaded || S < kLookahead + 1 || chunk_tokens <= 0) return false; + + std::vector score_ids = ids; + if (vocab_clamp_ > 1001) { // fold range must stay positive + for (auto & t : score_ids) { + if (t >= vocab_clamp_) t = 1000 + t % (vocab_clamp_ - 1000); + } + } + + std::vector smooth; + if (!score_tokens_resilient(*ctx_, score_ids, smooth)) return false; + + const int n_chunks = (S + chunk_tokens - 1) / chunk_tokens; + out.assign((size_t)n_chunks, 0.0f); + for (int c = 0; c < n_chunks; c++) { + const int s_ = c * chunk_tokens, e_ = std::min(S, (c + 1) * chunk_tokens); + float m = 0.0f; + for (int j = s_; j < e_; j++) m += smooth[j]; + out[c] = m / std::max(1, e_ - s_); + } + return true; +} + +// ── KvFlashCrossTokScorer ─────────────────────────────────────────────── + +struct KvFlashCrossTokScorer::Toks { + Tokenizer target; + Tokenizer 
drafter; +}; + +KvFlashCrossTokScorer::~KvFlashCrossTokScorer() { delete toks_; } + +bool KvFlashCrossTokScorer::ensure_tokenizers() { + if (toks_) return true; + if (toks_failed_) return false; + auto * t = new Toks(); + if (!t->target.load_from_gguf(target_gguf_.c_str()) || + !t->drafter.load_from_gguf(drafter_gguf_.c_str())) { + std::fprintf(stderr, "[kvflash] cross-tokenizer scorer: tokenizer load " + "failed (%s / %s)\n", + target_gguf_.c_str(), drafter_gguf_.c_str()); + delete t; + toks_failed_ = true; + return false; + } + toks_ = t; + return true; +} + +bool KvFlashCrossTokScorer::score_chunks(const std::vector & ids, + int chunk_tokens, + std::vector & out) { + const int S = (int)ids.size(); + out.clear(); + if (!ctx_ || !ctx_->loaded || S < kLookahead + 1 || chunk_tokens <= 0) return false; + if (!ensure_tokenizers()) return false; + + // 1) Target ids -> text, recording each target token's char end offset. + // Byte-level BPE pieces concatenate exactly, so per-id decode gives + // exact spans; special/template tokens may decode empty (their chunk + // contribution then comes from neighboring text, which is fine). + std::string text; + text.reserve((size_t)S * 4); + std::vector tgt_end((size_t)S); + std::vector one(1); + for (int i = 0; i < S; i++) { + one[0] = ids[(size_t)i]; + text += toks_->target.decode(one); + tgt_end[(size_t)i] = (int32_t)text.size(); + } + + // 2) Text -> drafter ids, with each drafter token's char midpoint. + const std::vector dids = toks_->drafter.encode(text); + const int D = (int)dids.size(); + if (D < kLookahead + 1) return false; + std::vector dmid((size_t)D); + { + size_t pos = 0; + for (int i = 0; i < D; i++) { + one[0] = dids[(size_t)i]; + const size_t len = toks_->drafter.decode(one).size(); + dmid[(size_t)i] = (float)pos + (float)len * 0.5f; + pos += len; + } + } + + // 3) Same tail-attention forward as the same-tokenizer scorer. 
+ std::vector dscore; + if (!score_tokens_resilient(*ctx_, dids, dscore)) return false; + + // 4) Map drafter-token scores onto target chunks by char span: a chunk's + // score is the mean of drafter tokens whose midpoint falls inside the + // chunk's text span. Empty spans (pure template tokens) stay at 0, + // i.e. z-score-neutral. + const int n_chunks = (S + chunk_tokens - 1) / chunk_tokens; + out.assign((size_t)n_chunks, 0.0f); + std::vector counts((size_t)n_chunks, 0); + int d = 0; + for (int c = 0; c < n_chunks; c++) { + const int last_tok_idx = std::min(S, (c + 1) * chunk_tokens) - 1; + const float span_end = (float)tgt_end[(size_t)last_tok_idx]; + while (d < D && dmid[(size_t)d] < span_end) { + out[(size_t)c] += dscore[(size_t)d]; + counts[(size_t)c]++; + d++; + } + if (counts[(size_t)c] > 0) out[(size_t)c] /= (float)counts[(size_t)c]; + } + return true; +} + +} // namespace dflash::common diff --git a/server/src/qwen3/qwen3_kvflash_scorer.h b/server/src/qwen3/qwen3_kvflash_scorer.h new file mode 100644 index 000000000..e0fda5074 --- /dev/null +++ b/server/src/qwen3/qwen3_kvflash_scorer.h @@ -0,0 +1,68 @@ +// KvFlashDrafterScorer — pflash drafter as the KV pager's Memory Indexer. +// +// Scores 64-token chunks with the same Liu Q-hook tail attention that +// pflash compression uses (forward_qwen3_drafter_model), but returns the +// per-chunk relevance scores instead of a compressed token list. The +// DrafterContext is borrowed: the daemon shares its pflash drafter; the +// pager itself never depends on this file (see common/kvflash_scorer.h). + +#pragma once + +#include "kvflash_scorer.h" +#include "qwen3_drafter.h" + +#include + +namespace dflash::common { + +class KvFlashDrafterScorer : public KvFlashScorer { +public: + // `vocab_clamp`: ids >= clamp are folded into the drafter's vocab range + // before scoring. Needed when the target vocabulary is a superset of + // the drafter's (e.g. 
Qwen3.6 target + Qwen3-0.6B drafter); prompt ids + // tokenized for the target may be unembeddable by the drafter. + explicit KvFlashDrafterScorer(DrafterContext * ctx, int32_t vocab_clamp = 100000) + : ctx_(ctx), vocab_clamp_(vocab_clamp) {} + + bool score_chunks(const std::vector & ids, int chunk_tokens, + std::vector & out) override; + +private: + DrafterContext * ctx_; + int32_t vocab_clamp_; +}; + +// KvFlashCrossTokScorer — the same drafter scoring for targets that do NOT +// share the Qwen tokenizer (laguna, gemma4). Relevance is a property of the +// TEXT, so the bridge is re-tokenization: detokenize the target's history +// (its own tokenizer, loaded from the target GGUF), tokenize the text with +// the drafter's tokenizer (from the drafter GGUF), run the same tail- +// attention forward, then map per-drafter-token scores back onto the +// target's chunk boundaries by character spans. Tokenizers are host-only +// and lazy-loaded on first score. +class KvFlashCrossTokScorer : public KvFlashScorer { +public: + KvFlashCrossTokScorer(DrafterContext * ctx, + std::string target_gguf, + std::string drafter_gguf) + : ctx_(ctx), target_gguf_(std::move(target_gguf)), + drafter_gguf_(std::move(drafter_gguf)) {} + ~KvFlashCrossTokScorer() override; + KvFlashCrossTokScorer(const KvFlashCrossTokScorer &) = delete; + KvFlashCrossTokScorer & operator=(const KvFlashCrossTokScorer &) = delete; + + bool score_chunks(const std::vector & ids, int chunk_tokens, + std::vector & out) override; + +private: + bool ensure_tokenizers(); + + DrafterContext * ctx_; + std::string target_gguf_, drafter_gguf_; + // Pimpl to keep server/tokenizer.h out of backend headers. 
+ struct Toks; + Toks * toks_ = nullptr; + bool toks_failed_ = false; +}; + +} // namespace dflash::common diff --git a/server/src/qwen35/graph_builders.cpp b/server/src/qwen35/graph_builders.cpp index f41f94cc0..f6c963870 100644 --- a/server/src/qwen35/graph_builders.cpp +++ b/server/src/qwen35/graph_builders.cpp @@ -2,6 +2,7 @@ #include "ggml-alloc.h" +#include #include namespace dflash::common { @@ -88,7 +89,9 @@ bool build_layer_prefn_step( int n_tokens, bool with_mask, int fa_window, - int kq_stride_pad) { + int kq_stride_pad, + bool kvflash) { + if (kvflash) with_mask = true; // slot-space masking is mandatory on the pool step_graph_free(sg); ggml_init_params ip{}; @@ -109,20 +112,34 @@ bool build_layer_prefn_step( ggml_set_name(sg.positions, "positions"); ggml_set_input(sg.positions); if (with_mask) { - const int max_win_len = cache.max_ctx + n_tokens; + // Mask width follows the PHYSICAL tensor capacity (pool-sized + // under kvflash) so it agrees with the FA span clamp inside + // build_full_attn_block. 
+ int phys_ctx = cache.max_ctx; + for (ggml_tensor * t : cache.attn_k) { + if (t) { phys_ctx = std::min(phys_ctx, (int)t->ne[1]); break; } + } + const int max_win_len = phys_ctx + n_tokens; const int kv_pad = align_up(max_win_len, kq_stride_pad); const int q_pad = align_up(n_tokens, KQ_MASK_PAD); sg.attn_mask = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_F16, kv_pad, q_pad); ggml_set_name(sg.attn_mask, "attn_mask"); ggml_set_input(sg.attn_mask); } + if (kvflash) { + sg.kv_write_rows = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_I64, + n_tokens, w.n_head_kv); + ggml_set_name(sg.kv_write_rows, "kv_write_rows"); + ggml_set_input(sg.kv_write_rows); + } } sg.gf = ggml_new_graph_custom(sg.ctx, 16384, false); QwenLayerPrefnOutputs go = build_qwen35_layer_prefn( sg.ctx, sg.gf, w, cache, layer_idx, sg.inp_embed, sg.positions, sg.attn_mask, - kv_start, n_tokens, fa_window); + kv_start, n_tokens, fa_window, + sg.kv_write_rows); if (!go.residual || !go.post) return false; sg.ffn_residual = go.residual; sg.ffn_post = go.post; @@ -236,7 +253,8 @@ bool build_target_step( int fa_window, bool last_token_logits_only, int kq_stride_pad, - bool capture_moe_router) { + bool capture_moe_router, + bool kvflash_mask) { step_graph_free(sg); // Persistent thread_local arena: rebuilt step graphs land at identical @@ -266,7 +284,13 @@ bool build_target_step( // Use max_ctx for mask allocation so the gallocr buffer never needs to // grow as kv_start increases during generation. The actual mask is // filled only up to kv_start + n_tokens; the excess is don't-care. - const int max_win_len = cache.max_ctx + n_tokens; + // kvflash mode: the physical span is the (smaller) pool capacity of + // the attention tensors, so size the mask from those instead. 
+ int phys_ctx = cache.max_ctx; + for (auto * t : cache.attn_k) { + if (t) { phys_ctx = std::min(phys_ctx, (int)t->ne[1]); break; } + } + const int max_win_len = phys_ctx + n_tokens; const int kv_pad = align_up(max_win_len, kq_stride_pad); const int q_pad = align_up(n_tokens, KQ_MASK_PAD); sg.attn_mask = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_F16, kv_pad, q_pad); @@ -280,8 +304,16 @@ bool build_target_step( // DFLASH_QWEN35_NO_KVPAD=1 restores the legacy cpy append + exact-length // FA span (per-step node properties -> no CUDA-graph replay). static const bool g_no_kvpad = (std::getenv("DFLASH_QWEN35_NO_KVPAD") != nullptr); - const bool use_kv_write_rows = (!g_no_kvpad && n_tokens == 1 && fa_window == 0 && - !with_mask && !capture && !capture_delta_intermediate); + // kvflash_mask: kvflash mode. The mask carries pool slot validity + // (uploaded by the caller before EVERY compute — the input's buffer + // region is reused by graph execution) and set_rows carries per-token + // physical slots, so the slot-mapped write stays active for masked, + // multi-token, and feature-capturing forwards (decode AND spec verify). + const bool use_kv_write_rows = + !g_no_kvpad && !capture_delta_intermediate && + (kvflash_mask + ? (fa_window == 0) + : (n_tokens == 1 && fa_window == 0 && !with_mask && !capture)); if (use_kv_write_rows) { sg.kv_write_rows = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_I64, n_tokens, w.n_head_kv); diff --git a/server/src/qwen35/graph_builders.h b/server/src/qwen35/graph_builders.h index 69a1e89e4..ca11a8169 100644 --- a/server/src/qwen35/graph_builders.h +++ b/server/src/qwen35/graph_builders.h @@ -40,6 +40,10 @@ bool build_layer_step( int fa_window = 0, int kq_stride_pad = KQ_MASK_PAD); +// `kvflash`: pooled mode — KV rows go through a set_rows input +// (sg.kv_write_rows, [n_tokens, n_head_kv] ne0-major slots) and the mask +// (forced on) is sized to the PHYSICAL tensor capacity so the caller can +// fill it in slot space. 
Caller allocates slots and fills rows + mask. bool build_layer_prefn_step( StepGraph & sg, const TargetWeights & w, @@ -50,7 +54,8 @@ bool build_layer_prefn_step( int n_tokens, bool with_mask, int fa_window = 0, - int kq_stride_pad = KQ_MASK_PAD); + int kq_stride_pad = KQ_MASK_PAD, + bool kvflash = false); // Full layer graph for hybrid decode: pre-FFN + MoE FFN + shared + residual in one compute. // Output: sg.hidden_input = layer_output, sg.moe_selected = router selections. @@ -67,6 +72,11 @@ bool build_hybrid_full_layer_step( int kq_stride_pad = KQ_MASK_PAD); // Full target forward: chain mode (all layers, logits + argmax output). +// +// `kvflash_mask`: kvflash pooled mode — keep the set_rows KV write active +// even though a mask is requested (the mask carries pool-slot validity and +// must be re-uploaded by the caller before every compute). Used by both +// single-token decode and multi-token spec verify; requires fa_window == 0. bool build_target_step( StepGraph & sg, const TargetWeights & w, @@ -80,7 +90,8 @@ bool build_target_step( int fa_window = 0, bool last_token_logits_only = false, int kq_stride_pad = KQ_MASK_PAD, - bool capture_moe_router = false); + bool capture_moe_router = false, + bool kvflash_mask = false); // Full target forward: DDTree tree-verify mode. bool build_target_step_tree( diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp index c22b37ed5..4feb08b03 100644 --- a/server/src/qwen35/qwen35_backend.cpp +++ b/server/src/qwen35/qwen35_backend.cpp @@ -10,6 +10,7 @@ #include "common/io_utils.h" #include "common/restore_delta.h" #include "qwen3/qwen3_drafter.h" +#include "qwen3/qwen3_kvflash_scorer.h" #include "ggml-cuda.h" #include "common/snapshot_backend.h" @@ -26,6 +27,8 @@ #include #include +#include "kv_quant.h" + namespace dflash::common { namespace { @@ -215,11 +218,63 @@ bool Qwen35Backend::init() { const int max_verify_tokens = cfg_.ddtree_mode ? 
std::max(dw_.block_size, cfg_.ddtree_budget + 1) : dw_.block_size; + // kvflash (bounded residency): pool size from the env, rounded/floored/ + // clamped by the shared reader (256-stride keeps FA vec-kernel + // eligibility; the floor keeps eviction from deadlocking). + // Drafter-scored residency is the DEFAULT policy: explicit + // --prefill-drafter first, then the well-known locations next to the + // model (Spark's pattern). LRU is the fallback when nothing is found + // (or the explicit choice via --kvflash-policy lru). + if (std::getenv("DFLASH_KVFLASH")) { + kvflash_drafter_path_ = kvflash_find_drafter(cfg_.target_path); + } + // "auto" sizes the pool from the GPU: weights are resident at this + // point and the cache is not yet allocated, so device-free minus a + // reserve (compute buffers + the drafter when expected) is what the + // pool can really use, converted at this model's pooled-KV density. + KvFlashAutoBudget kvf_budget; + { + size_t gpu_free = 0, gpu_total = 0; + if (ggml_backend_dev_t dev = ggml_backend_get_device(target_backend_)) { + ggml_backend_dev_memory(dev, &gpu_free, &gpu_total); + } + ggml_type kv_k = GGML_TYPE_Q8_0, kv_v = GGML_TYPE_Q8_0; + dflash::resolve_kv_types(kv_k, kv_v); + const int n_full = w_.n_layer / w_.full_attention_interval; + kvf_budget.free_bytes = (int64_t)gpu_free; + kvf_budget.bytes_per_token = (int64_t)n_full * w_.n_head_kv * + (int64_t)(ggml_row_size(kv_k, w_.n_embd_head_k) + + ggml_row_size(kv_v, w_.n_embd_head_v)); + kvf_budget.reserve_bytes = (int64_t)(1.5 * 1073741824.0) + + (kvflash_drafter_path_.empty() ? 
0 : (int64_t)(1.7 * 1073741824.0)); + } + kvflash_tokens_ = kvflash_pool_from_env(cfg_.device.max_ctx, KvFlashConfig{}, + !kvflash_drafter_path_.empty(), + kvf_budget); + if (kvflash_tokens_ > 0) { + kvflash_tau_ = std::max(1, env_int_or_default("DFLASH_KVFLASH_TAU", 64)); + } if (!create_target_cache(w_, cfg_.device.max_ctx, max_verify_tokens, target_backend_, cache_, - /*prefill_only=*/true)) { + /*prefill_only=*/true, /*ctx_alloc=*/kvflash_tokens_)) { std::fprintf(stderr, "cache: %s\n", dflash27b_last_error()); return false; } + if (kvflash_active()) { + KvFlashConfig pc; + pc.pool_tokens = kvflash_tokens_; + if (!kvflash_pager_.attach(pc, cache_.attn_k, cache_.attn_v)) { + std::fprintf(stderr, "kvflash: pager attach failed (pool=%d)\n", kvflash_tokens_); + return false; + } + std::printf("[kvflash] resident pool %d tokens (logical max_ctx %d), " + "tau=%d, policy=%s\n", + kvflash_tokens_, cfg_.device.max_ctx, kvflash_tau_, + !kvflash_drafter_path_.empty() + ? "drafter (attaches on first reselect)" + : "lru (recency-only: no Qwen3-0.6B drafter found " + "next to the model or in --prefill-drafter)"); + std::fflush(stdout); + } // Init feature mirror when draft model is available (needed for spec decode). // On single-GPU, this is an F32 conversion buffer; on split-GPU, a cross-device mirror. 
@@ -290,6 +345,7 @@ bool Qwen35Backend::unpark(const std::string & what) { std::fprintf(stderr, "[unpark] target: %s\n", dflash27b_last_error()); return false; } + kvflash_drafter_failed_ = false; // fresh VRAM: allow a retry target_parked_ = false; std::printf("[unpark] target restored\n"); std::fflush(stdout); } @@ -340,6 +396,22 @@ bool Qwen35Backend::unpark(const std::string & what) { bool Qwen35Backend::snapshot_save(int slot) { if (slot < 0 || slot >= PREFIX_SLOTS) return false; + // kvflash: snapshots right-size to cur_pos, which is a LOGICAL position + // that can exceed the physical pool once decode has paged, and they copy + // rows assuming the identity layout, which pooled prefill / eviction + // breaks. Snapshots of pooled state need page-table serialization + // (follow-up); identity-mapped prefill-time snapshots remain valid. + if (kvflash_active() && + (cache_.cur_pos > kvflash_tokens_ || !kvflash_pager_.is_identity())) { + static bool warned = false; + if (!warned) { + std::fprintf(stderr, "[kvflash] snapshot skipped: cur_pos %d exceeds " + "pool %d (pooled snapshots are a follow-up)\n", + cache_.cur_pos, kvflash_tokens_); + warned = true; + } + return false; + } PrefixSnapshot & snap = prefix_snapshots_[slot]; return snapshot_target_cache(w_, cache_, snap_backend_, snap); } @@ -488,6 +560,13 @@ ModelBackend::CompressResult Qwen35Backend::compress(const CompressRequest & req } drafter_loaded_ = true; std::fprintf(stderr, "[compress] drafter ready\n"); + // pflash + kvflash synergy: the drafter doubles as the pool's + // Memory Indexer (tau-step reselect). Pager stays LRU without it. 
+ if (kvflash_active() && !kvflash_scorer_) { + kvflash_scorer_ = std::make_unique(&drafter_ctx_); + std::fprintf(stderr, "[kvflash] drafter scorer attached (tau=%d)\n", + kvflash_tau_); + } } result.compressed_ids = drafter_score_and_compress( @@ -544,6 +623,8 @@ bool Qwen35Backend::handle_compress(const std::string & line, const DaemonIO & i void Qwen35Backend::free_drafter() { if (drafter_loaded_) { + // The kvflash scorer borrows drafter_ctx_; drop it first. + kvflash_scorer_.reset(); // Drafter has its own backend — do a full free (weights + backend) dflash::common::free_drafter(drafter_ctx_); drafter_loaded_ = false; @@ -579,6 +660,10 @@ DFlashTarget * Qwen35Backend::dflash_target() { dflash_target_ = std::make_unique( w_, cache_, target_backend_, sg_, cfg_.kq_stride_pad, cfg_.fa_window); + if (kvflash_active()) { + static_cast(dflash_target_.get()) + ->set_kvflash_pager(&kvflash_pager_); + } } return dflash_target_.get(); } @@ -856,6 +941,32 @@ int Qwen35Backend::do_prefill(const std::vector & tokens, const int prompt_len = (int)tokens.size(); prefill_last_logits_valid_ = false; + // kvflash: a prompt that fits the pool prefills contiguously (identity + // mapping, normal chunking). A LARGER prompt switches to POOLED CHUNKED + // PREFILL: pager-chunk-sized batches whose KV rows are slot-mapped via + // set_rows, with a slot-space mask per chunk and live eviction as the + // pool fills (constant VRAM, linear time). Restore offsets are not + // supported in the pooled path (a relocated prefix cannot be restored + // identity-style in the first place). 
+ const bool kvf_paged = kvflash_active() && + kv_offset + prompt_len > kvflash_tokens_ - kvflash_pager_.chunk_tokens(); + if (kvf_paged && kv_offset != 0) { + std::fprintf(stderr, + "[kvflash] restored prefix (%d) + prompt (%d) exceeds pool %d; " + "pooled prefill requires a fresh request\n", + kv_offset, prompt_len, kvflash_tokens_); + set_last_error("kvflash: restore + pooled prefill unsupported"); + return -1; + } + if (kvf_paged) { + prefill_ubatch = kvflash_pager_.chunk_tokens(); + kvflash_pager_.reset(); + std::printf("[kvflash] pooled prefill: %d tokens through a %d-token pool " + "(%d-token chunks, evicting)\n", + prompt_len, kvflash_tokens_, prefill_ubatch); + std::fflush(stdout); + } + // Skip KV-cache migration when resuming from a snapshot — the cache was // already migrated when the snapshot was taken; re-running migrate would // clobber the restored state. @@ -887,18 +998,39 @@ int Qwen35Backend::do_prefill(const std::vector & tokens, // incl. the user message -> a different user msg restores garbage.) if (snap_slot >= 0 && snap_pos >= 0 && kv_pos <= snap_pos && snap_pos < kv_pos + n_tokens) { - if (kv_pos > kv_offset) { // skip a degenerate short-prefix snapshot + if (kv_pos > kv_offset && !kvf_paged) { // skip degenerate / relocated cache_.cur_pos = kv_pos; if (snapshot_save(snap_slot)) { std::printf("[snap] boundary slot=%d cur_pos=%d (req snap_pos=%d)\n", snap_slot, kv_pos, snap_pos); std::fflush(stdout); } + } else if (kvf_paged) { + std::fprintf(stderr, "[kvflash] boundary snapshot skipped: pooled " + "prefill relocates chunks\n"); } snap_pos = -1; snap_slot = -1; } - const bool with_mask = (cfg_.kq_stride_pad > KQ_MASK_PAD) || (n_tokens > 1); + const bool with_mask = kvf_paged || + (cfg_.kq_stride_pad > KQ_MASK_PAD) || (n_tokens > 1); + + // kvflash pooled prefill: allocate this chunk's slots up front + // (evicting the lowest-priority resident chunk once the pool fills). 
+ std::vector kvf_slots; + if (kvf_paged) { + kvf_slots.resize((size_t)n_tokens); + bool ok = true; + for (int i = 0; i < n_tokens; i++) { + kvf_slots[(size_t)i] = kvflash_pager_.slot_for(kv_pos + i); + if (kvf_slots[(size_t)i] < 0) { ok = false; break; } + } + if (!ok) { + std::fprintf(stderr, "[kvflash] pooled prefill: slot alloc failed @%d\n", kv_pos); + set_last_error("kvflash: no evictable pool block"); + return -1; + } + } // Prefill always uses full attention (fa_window=0) so that all // positions encode the complete context — critical for tool @@ -911,10 +1043,26 @@ int Qwen35Backend::do_prefill(const std::vector & tokens, /*fa_window=*/0, /*last_token_logits_only=*/(start + n_tokens < prompt_len), cfg_.kq_stride_pad, - should_capture_moe_router())) { + should_capture_moe_router(), + /*kvflash_mask=*/kvf_paged)) { std::fprintf(stderr, "prefill build @%d\n", kv_pos); return -1; } + if (kvf_paged) { + if (!sg_.kv_write_rows) { + std::fprintf(stderr, "[kvflash] pooled prefill requires the set_rows path\n"); + return -1; + } + // [n_tokens, n_head_kv] ne0-major (see verify_batch). + std::vector rows((size_t)n_tokens * w_.n_head_kv); + for (int h = 0; h < w_.n_head_kv; h++) { + for (int i = 0; i < n_tokens; i++) { + rows[(size_t)h * n_tokens + i] = kvf_slots[(size_t)i]; + } + } + ggml_backend_tensor_set(sg_.kv_write_rows, rows.data(), 0, + sizeof(int64_t) * rows.size()); + } // Embed if (!w_.embedder.embed(tokens.data() + start, n_tokens, embed_buf.data())) { @@ -936,7 +1084,34 @@ int Qwen35Backend::do_prefill(const std::vector & tokens, sizeof(int32_t) * pos_buf.size()); // Mask — full attention during prefill (no windowing) - if (sg_.attn_mask) { + if (sg_.attn_mask && kvf_paged) { + // Slot-space mask (same recipe as verify_batch): row q attends + // (a) the slots of resident chunks holding positions < kv_pos + // and (b) this chunk's own slots, causally. 
+ constexpr uint16_t F16_ZERO = 0x0000, F16_NEG_INF = 0xFC00; + const size_t kvd = (size_t)sg_.attn_mask->ne[0]; + const int q_pad = (int)sg_.attn_mask->ne[1]; + std::vector mask_buf((size_t)kvd * q_pad, F16_NEG_INF); + const int ct = kvflash_pager_.chunk_tokens(); + for (int c = 0; c < kvflash_pager_.n_chunks(); c++) { + const int blk = kvflash_pager_.block_of(c); + if (blk < 0) continue; + for (int i = 0; i < ct; i++) { + if ((int64_t)c * ct + i >= kv_pos) break; + mask_buf[(size_t)blk * ct + i] = F16_ZERO; + } + } + for (int q = 1; q < n_tokens; q++) { + std::memcpy(mask_buf.data() + (size_t)q * kvd, mask_buf.data(), kvd * 2); + } + for (int q = 0; q < n_tokens; q++) { + for (int i = 0; i <= q; i++) { + mask_buf[(size_t)q * kvd + kvf_slots[(size_t)i]] = F16_ZERO; + } + } + ggml_backend_tensor_set(sg_.attn_mask, mask_buf.data(), 0, + sizeof(uint16_t) * mask_buf.size()); + } else if (sg_.attn_mask) { const int win_start = 0; const int kv_len = kv_pos + n_tokens - win_start; std::vector mask_buf; @@ -979,6 +1154,18 @@ int Qwen35Backend::do_prefill(const std::vector & tokens, start += n_tokens; } + if (kvflash_active()) { + if (kvf_paged) { + // The pager mapping was built live during the pooled prefill; + // only the history / hygiene parts of the sync apply. + kvflash_history_.assign(tokens.begin(), tokens.end()); + kvflash_pager_.zero_free_blocks(); + kvflash_mask_epoch_ = (uint64_t)-1; + } else { + kvflash_sync_prefill(committed, tokens, kv_offset); + } + } + // End-of-prefill snapshot: scoped disk-cache saves (auto/fixed policy) // request snap_pos == prompt end, which never falls inside a chunk so the // boundary branch above cannot fire. 
Taking the snapshot here changes @@ -995,6 +1182,104 @@ int Qwen35Backend::do_prefill(const std::vector & tokens, return committed; } +// ── kvflash helpers ───────────────────────────────────────────────── + +void Qwen35Backend::kvflash_sync_prefill(int committed, + const std::vector & tokens, + int kv_offset) { + // Prefill (and snapshot restore) place rows physically contiguous at + // [0, committed): rebuild the pager mapping identity-style and reset + // the token history to match. + kvflash_pager_.reset(); + for (int p = 0; p < committed; p++) { + const int slot = kvflash_pager_.slot_for(p); + if (slot != p) { + // Cannot happen while prompt <= pool (blocks are handed out in + // order from a freshly reset pager); guard against future + // changes to the hand-out order. + std::fprintf(stderr, "[kvflash] prefill slot mismatch %d != %d\n", slot, p); + } + } + if (kv_offset == 0) { + kvflash_history_.assign(tokens.begin(), tokens.end()); + } else { + kvflash_history_.resize((size_t)kv_offset, 0); // restored prefix ids unknown + kvflash_history_.insert(kvflash_history_.end(), tokens.begin(), tokens.end()); + } + // Slots past the prompt still hold the previous request's rows; the + // maskless qwen35moe pipelined decode reads the whole padded pool span. + kvflash_pager_.zero_free_blocks(); + kvflash_mask_epoch_ = (uint64_t)-1; +} + +void Qwen35Backend::kvflash_upload_mask() { + if (!sg_.attn_mask) return; + const size_t need = (size_t)sg_.attn_mask->ne[0] * sg_.attn_mask->ne[1]; + if (kvflash_mask_buf_.size() != need || kvflash_pager_.epoch() != kvflash_mask_epoch_) { + kvflash_mask_buf_.assign(need, F16_NEG_INF); + kvflash_pager_.fill_slot_mask(kvflash_mask_buf_.data()); // q row 0 + kvflash_mask_epoch_ = kvflash_pager_.epoch(); + } + // Upload before EVERY compute: the input tensor's buffer region is + // reused by graph execution, so a stale upload reads back as garbage. 
+ ggml_backend_tensor_set(sg_.attn_mask, kvflash_mask_buf_.data(), 0, + need * sizeof(uint16_t)); +} + +// Attach the drafter as the residency scorer outside the pflash compress +// path: with `--kvflash --prefill-drafter ` but compression off, the +// drafter would otherwise never load and the pool would silently run +// recency-only LRU. Loads lazily on the first reselect that needs it (and +// re-attaches after a draft-residency release frees the drafter). +void Qwen35Backend::kvflash_ensure_scorer() { + if (kvflash_scorer_ || kvflash_drafter_path_.empty() || kvflash_drafter_failed_) { + return; + } + if (!drafter_loaded_) { + ggml_backend_synchronize(target_backend_); + if (draft_backend_) ggml_backend_synchronize(draft_backend_); + std::fprintf(stderr, "[kvflash] loading drafter for residency scoring: %s\n", + kvflash_drafter_path_.c_str()); + if (!load_drafter(kvflash_drafter_path_, /*gpu_layers=*/999, + cfg_.device.gpu, drafter_ctx_)) { + std::fprintf(stderr, "[kvflash] drafter load failed (%s); staying on " + "LRU residency\n", dflash27b_last_error()); + kvflash_drafter_failed_ = true; + return; + } + drafter_loaded_ = true; + } + kvflash_scorer_ = std::make_unique(&drafter_ctx_); + std::fprintf(stderr, "[kvflash] drafter scorer attached (tau=%d)\n", kvflash_tau_); +} + +void Qwen35Backend::kvflash_maybe_reselect(int generated) { + if (kvflash_tau_ <= 0) return; + // Adaptive tau: a rescore costs ~0.11 ms per history token (full 0.6B + // re-prefill; measured 0.9 s @8K, ~46 s bisected @256K), while decode + // produces ~30 tok/s. Capping rescore overhead at ~15% of decode time + // gives tau ~= history/45. The configured tau is the floor. + const int tau = std::max(kvflash_tau_, (int)(kvflash_history_.size() / 45)); + if (generated % tau != 0) return; + // Lazy-load the drafter only when a rescore is actually due, so the + // first tokens of the first request never pay the load. 
+ if (!kvflash_scorer_) kvflash_ensure_scorer(); + if (!kvflash_scorer_) return; + if (!kvflash_scorer_->score_chunks(kvflash_history_, kvflash_pager_.chunk_tokens(), kvflash_scores_)) { + return; // scorer failure -> keep LRU behavior this round + } + kvflash_pager_.score_hook = [this](int c) { + return c < (int)kvflash_scores_.size() ? kvflash_scores_[c] : 1e30f; + }; + const int events = kvflash_pager_.reselect(); + if (events > 0) { + std::fprintf(stderr, "[kvflash] reselect @gen=%d: %d page events " + "(resident %d/%d blocks)\n", + generated, events, kvflash_pager_.resident_blocks(), + kvflash_tokens_ / kvflash_pager_.chunk_tokens()); + } +} + bool Qwen35Backend::do_ar_decode(int committed, int n_gen, std::vector & out_tokens, const DaemonIO & io, @@ -1127,6 +1412,7 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen, maybe_force_close(first_tok, committed); out_tokens.push_back(first_tok); io.emit(first_tok); + if (kvflash_active()) kvflash_history_.push_back(first_tok); if (IS_EOS_TOK(first_tok, w_)) return true; committed++; cache_.cur_pos = committed; @@ -1141,24 +1427,39 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen, int32_t pos4[4] = {committed, committed, committed, 0}; ggml_backend_tensor_set(sg_.positions, pos4, 0, sizeof(int32_t) * 4); + // kvflash: graph carries a slot-validity mask alongside the + // step-invariant set_rows write; the FA span clamps to the pool. + const bool pool = kvflash_active(); if (!build_target_step(sg_, w_, cache_, target_backend_, /*kv_start=*/committed, /*n_tokens=*/1, - /*with_mask=*/false, /*capture=*/false, + /*with_mask=*/pool, /*capture=*/false, /*capture_delta_intermediate=*/false, /*fa_window=*/0, /*last_token_logits_only=*/false, cfg_.kq_stride_pad, - should_capture_moe_router())) { + should_capture_moe_router(), + /*kvflash_mask=*/pool)) { return false; } - // Fill kv_write_rows with this step's cache slot (committed) for set_rows. 
+ // Fill kv_write_rows with this step's cache slot for set_rows: + // the logical position directly, or its pool slot in kvflash mode. if (sg_.kv_write_rows) { const int n_head_kv = w_.n_head_kv; - std::vector row_vals(n_head_kv, (int64_t)committed); + const int64_t slot = pool ? (int64_t)kvflash_pager_.slot_for(committed) + : (int64_t)committed; + if (pool && slot < 0) { + std::fprintf(stderr, "[kvflash] no pool slot at pos %d " + "(pool %d exhausted)\n", + committed, kvflash_tokens_); + set_last_error("kvflash: no evictable pool block"); + return false; + } + std::vector row_vals(n_head_kv, slot); ggml_backend_tensor_set(sg_.kv_write_rows, row_vals.data(), 0, sizeof(int64_t) * n_head_kv); } + if (pool) kvflash_upload_mask(); auto st = ggml_backend_graph_compute(target_backend_, sg_.gf); if (st != GGML_STATUS_SUCCESS) return false; @@ -1220,6 +1521,10 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen, io.emit(next_tok); committed++; cache_.cur_pos = committed; + if (pool) { + kvflash_history_.push_back(next_tok); + kvflash_maybe_reselect((int)(out_tokens.size() - out_tokens_at_entry)); + } if (io.cancelled) break; if (IS_EOS_TOK(next_tok, w_)) break; @@ -1352,6 +1657,12 @@ bool Qwen35Backend::do_spec_decode(int committed, int n_gen, // - draft model loaded and not parked // - feature mirror initialized // - greedy decoding (no logit processing) — spec decode uses argmax verification + // - kvflash: verify_batch is slot-mapped (Qwen35DFlashTarget pooled + // path), and that covers --ddtree too: in the daemon, ddtree_mode + // configures larger verify intermediates + fast_rollback, whose + // snapshot_kv/restore_kv only touch DeltaNet/conv state (pool- + // neutral); generation runs this same chain loop either way. The + // tree-verify graphs exist only in the test harness (test_dflash). 
const bool can_spec = cfg_.draft_path && !draft_parked_ && (cfg_.remote_draft.enabled() diff --git a/server/src/qwen35/qwen35_backend.h b/server/src/qwen35/qwen35_backend.h index 59a105fc9..0df4df036 100644 --- a/server/src/qwen35/qwen35_backend.h +++ b/server/src/qwen35/qwen35_backend.h @@ -21,6 +21,8 @@ #include "dflash_feature_ring.h" #include "internal.h" // TargetWeights, TargetCache, DraftWeights, PrefixSnapshot #include "qwen3/qwen3_drafter.h" // DrafterContext, load_drafter, free_drafter, drafter_score_and_compress +#include "kvflash_pager.h" // bounded KV residency pool +#include "kvflash_scorer.h" // chunk-relevance policy interface #include "ggml.h" #include "ggml-backend.h" @@ -158,6 +160,40 @@ class Qwen35Backend : public ModelBackend { // ── Configuration ──────────────────────────────────────────────── Qwen35Config cfg_; + // ── kvflash (bounded KV residency, FlashMemory-style) ──────────── + // Active when kvflash_tokens_ > 0 (env DFLASH_KVFLASH / --kvflash): + // attention KV tensors are allocated at pool capacity, logical + // positions map to pool slots via kvflash_pager_, cold chunks page to + // host. Policy-agnostic: with no scorer the pager is LRU; when the + // pflash drafter is loaded it becomes the reselect scorer (every + // kvflash_tau_ decoded tokens). Forces AR decode (no spec). + // Protected: the MoE subclass routes its pipelined decode loops and + // hybrid prefill through the same pager/history/reselect state. 
+ KvFlashPager kvflash_pager_; + std::unique_ptr kvflash_scorer_; + std::vector kvflash_history_; // prompt + generated ids + std::vector kvflash_scores_; // latest chunk scores + std::vector kvflash_mask_buf_; // host mirror of slot mask + std::string kvflash_drafter_path_; // DFLASH_KVFLASH_DRAFTER + uint64_t kvflash_mask_epoch_ = (uint64_t)-1; + int kvflash_tokens_ = 0; // 0 = off + int kvflash_tau_ = 64; + bool kvflash_drafter_failed_ = false; // don't retry a failed load + bool kvflash_active() const { return kvflash_tokens_ > 0; } + // Rebuild pager mapping after (re)prefill: positions [0, committed) + // occupy pool slots identity-mapped (prefill is contiguous). + void kvflash_sync_prefill(int committed, const std::vector & tokens, + int kv_offset); + // Upload the slot-validity mask (host rebuild on epoch change, device + // upload every step — the input's buffer region is reused by compute). + void kvflash_upload_mask(); + // Drafter rescore + reselect every kvflash_tau_ generated tokens. + void kvflash_maybe_reselect(int generated); + // Attach the drafter scorer if a drafter path is configured and the + // scorer is missing (lazy-loads the drafter on first need; also heals + // after a residency release frees it). No-op without a path. 
+ void kvflash_ensure_scorer(); + private: // ── GPU backends ───────────────────────────────────────────────── ggml_backend_t target_backend_ = nullptr; diff --git a/server/src/qwen35/qwen35_dflash_target.cpp b/server/src/qwen35/qwen35_dflash_target.cpp index 65713d1bb..5af4490af 100644 --- a/server/src/qwen35/qwen35_dflash_target.cpp +++ b/server/src/qwen35/qwen35_dflash_target.cpp @@ -5,6 +5,8 @@ #include "step_graph.h" #include "attn_masks.h" +#include + namespace dflash::common { Qwen35DFlashTarget::~Qwen35DFlashTarget() { @@ -33,18 +35,53 @@ bool Qwen35DFlashTarget::verify_batch( if (n_tokens <= 0) return false; const int hidden = w_.n_embd; - const bool need_mask = (kq_stride_pad_ > KQ_MASK_PAD) || (n_tokens > 1); + const bool pool = pager_ != nullptr; + const bool need_mask = pool || (kq_stride_pad_ > KQ_MASK_PAD) || (n_tokens > 1); + + // kvflash: allocate slots for the verify block up front (may evict at + // a chunk boundary; protections keep sinks + the tail window safe). + std::vector slots; + if (pool) { + slots.resize(n_tokens); + for (int i = 0; i < n_tokens; i++) { + slots[i] = pager_->slot_for(base_pos + i); + if (slots[i] < 0) { + std::fprintf(stderr, "verify_batch: pool slot alloc failed @%d\n", base_pos + i); + return false; + } + } + } if (!build_target_step(sg_, w_, cache_, backend_, /*kv_start=*/base_pos, n_tokens, need_mask, /*capture=*/true, /*capture_delta_intermediate=*/false, - fa_window_, + pool ? 
0 : fa_window_, /*last_token_logits_only=*/false, - kq_stride_pad_)) { + kq_stride_pad_, + /*capture_moe_router=*/false, + /*kvflash_mask=*/pool)) { std::fprintf(stderr, "verify_batch: build_target_step failed (base=%d n=%d)\n", base_pos, n_tokens); return false; } + if (pool && !sg_.kv_write_rows) { + std::fprintf(stderr, "verify_batch: kvflash requires set_rows path\n"); + return false; + } + if (pool) { + // kv_write_rows is [n_tokens, n_head_kv] ne0-major: element + // (token i, head h) lives at i + h*n_tokens (set_rows asserts + // b->ne[1] == c->ne[0]). Getting this transposed scrambles + // per-head row targets for every multi-token write. + std::vector rows((size_t)n_tokens * w_.n_head_kv); + for (int h = 0; h < w_.n_head_kv; h++) { + for (int i = 0; i < n_tokens; i++) { + rows[(size_t)h * n_tokens + i] = slots[i]; + } + } + ggml_backend_tensor_set(sg_.kv_write_rows, rows.data(), 0, + sizeof(int64_t) * rows.size()); + } // Embed input tokens and fill positions. std::vector embed((size_t)n_tokens * hidden); @@ -66,8 +103,35 @@ bool Qwen35DFlashTarget::verify_batch( ggml_backend_tensor_set(sg_.positions, pos.data(), 0, sizeof(int32_t) * pos.size()); - // Fill causal attention mask when present. - if (sg_.attn_mask) { + // Fill the attention mask. + if (sg_.attn_mask && pool) { + // Slot-space mask: row q attends (a) slots of committed positions + // (pos < base_pos) of resident chunks — this exactly excludes + // slots holding rejected drafts from earlier rounds — and (b) the + // verify tokens' own slots, causally. 
+ const size_t kvd = (size_t)sg_.attn_mask->ne[0]; + const int q_pad = (int)sg_.attn_mask->ne[1]; + std::vector mask_buf((size_t)kvd * q_pad, F16_NEG_INF); + const int ct = pager_->chunk_tokens(); + for (int c = 0; c < pager_->n_chunks(); c++) { + const int blk = pager_->block_of(c); + if (blk < 0) continue; + for (int i = 0; i < ct; i++) { + if ((int64_t)c * ct + i >= base_pos) break; + mask_buf[(size_t)blk * ct + i] = F16_ZERO; + } + } + for (int q = 1; q < n_tokens; q++) { + std::memcpy(mask_buf.data() + (size_t)q * kvd, mask_buf.data(), kvd * 2); + } + for (int q = 0; q < n_tokens; q++) { + for (int i = 0; i <= q; i++) { + mask_buf[(size_t)q * kvd + slots[i]] = F16_ZERO; + } + } + ggml_backend_tensor_set(sg_.attn_mask, mask_buf.data(), 0, + sizeof(uint16_t) * mask_buf.size()); + } else if (sg_.attn_mask) { const int win_start = (fa_window_ > 0 && base_pos > fa_window_) ? (base_pos - fa_window_) : 0; const int kv_len = base_pos + n_tokens - win_start; diff --git a/server/src/qwen35/qwen35_dflash_target.h b/server/src/qwen35/qwen35_dflash_target.h index 6a72e48b5..17ab8bf95 100644 --- a/server/src/qwen35/qwen35_dflash_target.h +++ b/server/src/qwen35/qwen35_dflash_target.h @@ -10,6 +10,7 @@ #include "internal.h" // TargetWeights, TargetCache, DraftWeights #include "step_graph.h" #include "graph_builders.h" +#include "kvflash_pager.h" #include "ggml.h" #include "ggml-backend.h" @@ -53,6 +54,14 @@ class Qwen35DFlashTarget : public DFlashTarget { int mask_token_id() const override; const std::vector & capture_layer_ids() const override; + // kvflash mode: verify writes are slot-mapped via the pager and the + // attention mask carries slot validity (resident committed positions + // only) plus causal structure among the verify tokens. Rejected draft + // tokens need no explicit rollback: their slots are excluded by the + // pos < base_pos validity rule on the next verify and get rewritten. + // Forces fa_window = 0 (logical windowing is meaningless in slot space). 
+ void set_kvflash_pager(KvFlashPager * pager) { pager_ = pager; } + private: TargetWeights & w_; TargetCache & cache_; @@ -60,6 +69,7 @@ class Qwen35DFlashTarget : public DFlashTarget { StepGraph & sg_; int kq_stride_pad_; int fa_window_; + KvFlashPager * pager_ = nullptr; // Cached vector form of capture layer IDs (built once in constructor). std::vector capture_ids_; diff --git a/server/src/qwen35/qwen35_target_graph.cpp b/server/src/qwen35/qwen35_target_graph.cpp index ed7fbe057..e0f7d8ecd 100644 --- a/server/src/qwen35/qwen35_target_graph.cpp +++ b/server/src/qwen35/qwen35_target_graph.cpp @@ -76,10 +76,11 @@ bool create_target_cache(const TargetWeights & w, int max_verify_tokens, ggml_backend_t backend, TargetCache & out, - bool prefill_only) { + bool prefill_only, + int ctx_alloc) { return create_target_cache_partial(w, max_ctx, max_verify_tokens, backend, out, prefill_only, - 0, w.n_layer, true); + 0, w.n_layer, true, ctx_alloc); } bool create_target_cache_partial(const TargetWeights & w, @@ -90,7 +91,8 @@ bool create_target_cache_partial(const TargetWeights & w, bool prefill_only, int layer_begin, int layer_end, - bool allocate_target_feat) { + bool allocate_target_feat, + int ctx_alloc) { if (layer_begin < 0) layer_begin = 0; if (layer_end < 0 || layer_end > w.n_layer) layer_end = w.n_layer; if (layer_begin > layer_end) { @@ -133,9 +135,14 @@ bool create_target_cache_partial(const TargetWeights & w, const bool needs_256_stride = kv_k_type == GGML_TYPE_TQ3_0 || kv_v_type == GGML_TYPE_TQ3_0; + // kvflash mode: attention tensors are allocated at the (smaller) + // physical pool capacity; logical positions are mapped to pool slots + // by KvFlashPager. The 256-stride rounding applies to whichever capacity + // is in effect. + const int ctx_phys = (ctx_alloc > 0 && ctx_alloc < max_ctx) ? ctx_alloc : max_ctx; const int max_ctx_alloc = needs_256_stride - ? ((max_ctx + 255) / 256) * 256 - : max_ctx; + ? 
((ctx_phys + 255) / 256) * 256 + : ctx_phys; // ── Base context: KV cache + SSM/conv state + target_feat ──────── { @@ -433,6 +440,62 @@ void restore_ssm_state(TargetCache & c) { } } +// Allocate SSM/conv rollback snapshot tensors by mirroring the live recurrent +// state tensors' shapes. The MoE hybrid spec-decode path sets up its DeltaNet +// state in base_buf but never calls migrate_prefill_cache, so without this +// snapshot_ssm_state/restore_ssm_state are silent no-ops (the _snap arrays are +// empty/null) and rejected draft tokens leak permanently into the linear +// recurrent state, collapsing generation. Idempotent: reuses an existing +// rollback_ctx (from a prior request or migrate_prefill_cache). +bool ensure_ssm_snapshot(TargetCache & c, ggml_backend_t backend) { + if (c.rollback_ctx) return true; + const size_t n = c.ssm_state.size(); + if (n == 0) return true; + c.ssm_state_snap.assign(n, nullptr); + c.conv_state_snap.assign(n, nullptr); + + size_t cnt = 0; + for (size_t i = 0; i < n; i++) { + if (c.ssm_state[i]) cnt++; + if (i < c.conv_state.size() && c.conv_state[i]) cnt++; + } + if (cnt == 0) return true; + + ggml_init_params ip{}; + ip.mem_size = (cnt + 8) * ggml_tensor_overhead(); + ip.mem_buffer = nullptr; + ip.no_alloc = true; + c.rollback_ctx = ggml_init(ip); + if (!c.rollback_ctx) { set_last_error("ensure_ssm_snapshot ggml_init failed"); return false; } + + for (size_t i = 0; i < n; i++) { + char name[64]; + if (c.ssm_state[i]) { + ggml_tensor * t = c.ssm_state[i]; + ggml_tensor * sn = ggml_new_tensor(c.rollback_ctx, t->type, ggml_n_dims(t), t->ne); + std::snprintf(name, sizeof(name), "ssm_state_snap_%zu", i); + ggml_set_name(sn, name); + c.ssm_state_snap[i] = sn; + } + if (i < c.conv_state.size() && c.conv_state[i]) { + ggml_tensor * t = c.conv_state[i]; + ggml_tensor * cn = ggml_new_tensor(c.rollback_ctx, t->type, ggml_n_dims(t), t->ne); + std::snprintf(name, sizeof(name), "conv_state_snap_%zu", i); + ggml_set_name(cn, name); + 
c.conv_state_snap[i] = cn; + } + } + + c.rollback_buf = ggml_backend_alloc_ctx_tensors(c.rollback_ctx, backend); + if (!c.rollback_buf) { + set_last_error("ensure_ssm_snapshot alloc_ctx_tensors failed"); + ggml_free(c.rollback_ctx); + c.rollback_ctx = nullptr; + return false; + } + return true; +} + // ─── Helpers ───────────────────────────────────────────────────────── static ggml_tensor * build_swiglu_ffn(ggml_context * ctx, ggml_tensor * cur, diff --git a/server/src/qwen35moe/qwen35moe_backend.cpp b/server/src/qwen35moe/qwen35moe_backend.cpp index 6455eac52..8b40be9fa 100644 --- a/server/src/qwen35moe/qwen35moe_backend.cpp +++ b/server/src/qwen35moe/qwen35moe_backend.cpp @@ -469,6 +469,7 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen, if (is_eos_tok(first_tok, target_weights())) return true; committed++; target_cache().cur_pos = committed; + if (kvflash_active()) kvflash_history_.push_back(first_tok); } // ── Ensure persistent pipelined state (built once, reused) ── @@ -487,11 +488,23 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen, act_cur.data(), 0, sizeof(float) * (size_t)hidden); const auto embed_done = DecodeClock::now(); + // kvflash: physical pool slot for this token's KV rows (may evict). + int kv_slot = -1; + if (kvflash_active()) { + kv_slot = kvflash_pager_.slot_for(committed); + if (kv_slot < 0) { + std::fprintf(stderr, "[kvflash] pipelined decode: no slot at pos %d\n", + committed); + return false; + } + } + PipelinedDecodeTelemetry tel; if (!pipelined_decode_one_token(*pipe_state_, target_backend(), target_weights(), target_cache(), *target_weights().moe_hybrid, committed, cfg_.kq_stride_pad, - hybrid_telemetry_ ? &tel : nullptr)) { + hybrid_telemetry_ ? 
&tel : nullptr, + kv_slot)) { return false; } const auto layers_done = DecodeClock::now(); @@ -563,6 +576,10 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen, io.emit(next_tok); committed++; target_cache().cur_pos = committed; + if (kvflash_active()) { + kvflash_history_.push_back(next_tok); + kvflash_maybe_reselect((int)out_tokens.size()); + } if (io.cancelled) break; if (is_eos_tok(next_tok, target_weights())) break; } @@ -721,6 +738,19 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req, const int prompt_len = (int)req.prompt.size(); const int prefill_chunk = std::min(128, prompt_len); // batch size per GPU compute + // kvflash: hybrid prefill writes rows identity-mapped, so the prompt must + // fit the pool with one chunk of decode headroom (same contract as the + // base do_prefill). + if (kvflash_active() && + prompt_len > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) { + std::fprintf(stderr, + "[kvflash] hybrid prompt (%d) exceeds pool %d; raise --kvflash " + "or enable pflash compression\n", prompt_len, kvflash_tokens_); + result.error = "kvflash: prompt exceeds resident pool"; + cleanup_graphs(); + return result; + } + // Embed all prompt tokens const int n_expert_used = target_weights().n_expert_used; std::vector embed_all((size_t)prompt_len * (size_t)hidden); @@ -957,6 +987,9 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req, int committed = prompt_len; target_cache().cur_pos = committed; + if (kvflash_active()) { + kvflash_sync_prefill(committed, req.prompt, /*kv_offset=*/0); + } auto t_prefill_end = std::chrono::steady_clock::now(); result.prefill_s = std::chrono::duration(t_prefill_end - t_prefill_start).count(); @@ -990,7 +1023,9 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req, if (req.n_gen > 0) { auto t_decode_start = std::chrono::steady_clock::now(); - // Check if hybrid spec-decode is available + // Hybrid spec-decode runs on the 
pool: hybrid_forward_batch is + // slot-mapped (verify and replay both route through it) and the + // recurrent-state rollback is ssm snapshot/restore (pool-neutral). const bool can_hybrid_spec = !req.force_ar_decode && cfg_.draft_path && !is_draft_parked() @@ -1021,7 +1056,8 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req, target_cache().last_tok = first_tok; cleanup_graphs(); - if (!do_hybrid_spec_decode(committed, req.n_gen, result.tokens, out_io)) { + if (!do_hybrid_spec_decode(committed, req.n_gen, result.tokens, out_io, + &result.accept_rate)) { result.error = "hybrid_spec_decode"; return result; } @@ -1057,6 +1093,7 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req, if (!is_eos_tok(first_tok, target_weights())) { committed++; target_cache().cur_pos = committed; + if (kvflash_active()) kvflash_history_.push_back(first_tok); // Pipelined decode loop PipelinedDecodeTelemetry decode_tel_accum{}; @@ -1071,11 +1108,23 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req, ggml_backend_tensor_set_async(target_backend(), pipe_state_->gpu_state.act_cur, act_cur.data(), 0, sizeof(float) * (size_t)hidden); + // kvflash: pool slot for this token's KV rows (may evict) + int kv_slot = -1; + if (kvflash_active()) { + kv_slot = kvflash_pager_.slot_for(committed); + if (kv_slot < 0) { + result.error = "kvflash_slot"; + cleanup_graphs(); + return result; + } + } + PipelinedDecodeTelemetry tel; if (!pipelined_decode_one_token(*pipe_state_, target_backend(), target_weights(), target_cache(), *target_weights().moe_hybrid, committed, cfg_.kq_stride_pad, - hybrid_telemetry_ ? &tel : nullptr)) { + hybrid_telemetry_ ? 
&tel : nullptr, + kv_slot)) { result.error = "decode"; cleanup_graphs(); return result; @@ -1133,6 +1182,10 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req, out_io.emit(next_tok); committed++; target_cache().cur_pos = committed; + if (kvflash_active()) { + kvflash_history_.push_back(next_tok); + kvflash_maybe_reselect((int)result.tokens.size()); + } if (out_io.cancelled) break; if (is_eos_tok(next_tok, target_weights())) break; } @@ -1295,6 +1348,32 @@ GenerateResult Qwen35MoeBackend::restore_and_generate_impl(int slot, return result; } + // kvflash: the restored prefix + delta prefill land identity-mapped, so + // the full prompt must fit the pool (snapshots past the pool are never + // saved, but the delta can still overflow it). + if (kvflash_active() && + prompt_len > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) { + std::fprintf(stderr, + "[kvflash] hybrid restore prompt (%d) exceeds pool %d; raise " + "--kvflash\n", prompt_len, kvflash_tokens_); + result.error = "kvflash: prompt exceeds resident pool"; + out_io.emit(-1); + return result; + } + + // kvflash: the delta prefill below runs the maskless pipelined forward + // over the padded pool span; map the restored prefix identity-style and + // zero stale free slots BEFORE any forward reads them. + if (kvflash_active()) { + kvflash_pager_.reset(); + if (!kvflash_pager_.alloc_span(0, snap_pos)) { + result.error = "kvflash_slot"; + out_io.emit(-1); + return result; + } + kvflash_pager_.zero_free_blocks(); + } + const int hidden = target_weights().n_embd; std::vector act_cur((size_t)hidden); if (prompt_len > snap_pos) { @@ -1314,6 +1393,17 @@ GenerateResult Qwen35MoeBackend::restore_and_generate_impl(int slot, std::chrono::steady_clock::now() - t_prefill_start).count(); } + if (kvflash_active()) { + // Rebuild the pager mapping over the identity-mapped [0, committed). 
+ // With the full prompt available the history carries real ids; + // restore-only generates keep an unknown-prefix history. + if (prompt_len == committed) { + kvflash_sync_prefill(committed, req.prompt, /*kv_offset=*/0); + } else { + kvflash_sync_prefill(committed, {}, /*kv_offset=*/committed); + } + } + if (req.n_gen > 0) { if (target_cache().last_tok < 0) { std::fprintf(stderr, @@ -1457,6 +1547,29 @@ bool Qwen35MoeBackend::hybrid_forward_batch( } } + // kvflash: allocate the block's slots up front (may evict) and build + // the slot-mapped write rows + slot-space mask once; every layer's + // graph gets the same fills (verify and replay both land here, so all + // hybrid-spec KV writes are pool-routed). + const bool kvf = kvflash_active(); + std::vector kvf_rows; + std::vector kvf_mask; + std::vector kvf_slots; + if (kvf) { + if (!kvflash_pager_.alloc_span(base_pos, n_tokens)) return false; + kvf_slots.resize((size_t)n_tokens); + for (int i = 0; i < n_tokens; ++i) { + kvf_slots[(size_t)i] = kvflash_pager_.slot_of(base_pos + i); + } + // [n_tokens, n_head_kv] ne0-major (see verify_batch). 
+ kvf_rows.resize((size_t)n_tokens * target_weights().n_head_kv); + for (int h = 0; h < target_weights().n_head_kv; ++h) { + for (int i = 0; i < n_tokens; ++i) { + kvf_rows[(size_t)h * n_tokens + i] = kvf_slots[(size_t)i]; + } + } + } + // Process layer-by-layer (same as prefill) StepGraph prefn_sg; ggml_gallocr_t ffn_hot_alloc = nullptr; @@ -1466,17 +1579,23 @@ bool Qwen35MoeBackend::hybrid_forward_batch( for (int il = 0; il < n_layer; ++il) { auto & storage = target_weights().moe_hybrid->layers[(size_t)il]; - const bool with_mask = (cfg_.kq_stride_pad > KQ_MASK_PAD) || (n_tokens > 1); + const bool with_mask = kvf || + (cfg_.kq_stride_pad > KQ_MASK_PAD) || (n_tokens > 1); // Build pre-FFN graph (DeltaNet/attention + router) for all tokens step_graph_free(prefn_sg); if (!build_layer_prefn_step(prefn_sg, target_weights(), target_cache(), target_backend(), il, /*kv_start=*/base_pos, n_tokens, - with_mask, /*fa_window=*/0, cfg_.kq_stride_pad)) { + with_mask, /*fa_window=*/0, cfg_.kq_stride_pad, + /*kvflash=*/kvf)) { step_graph_destroy(prefn_sg); if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc); return false; } + if (prefn_sg.kv_write_rows) { + ggml_backend_tensor_set(prefn_sg.kv_write_rows, kvf_rows.data(), 0, + sizeof(int64_t) * kvf_rows.size()); + } // Upload embeddings ggml_backend_tensor_set(prefn_sg.inp_embed, embed_all.data(), 0, @@ -1496,7 +1615,36 @@ bool Qwen35MoeBackend::hybrid_forward_batch( } // Set causal mask - if (prefn_sg.attn_mask) { + if (prefn_sg.attn_mask && kvf) { + // Slot-space mask (verify_batch recipe): committed resident + // positions (< base_pos) plus this block's own slots, causal. + // Built once, reused for every layer's graph. 
+ if (kvf_mask.empty()) { + constexpr uint16_t F16_ZERO = 0x0000, F16_NEG_INF = 0xFC00; + const size_t kvd = (size_t)prefn_sg.attn_mask->ne[0]; + const int q_pad = (int)prefn_sg.attn_mask->ne[1]; + kvf_mask.assign(kvd * q_pad, F16_NEG_INF); + const int ct = kvflash_pager_.chunk_tokens(); + for (int c = 0; c < kvflash_pager_.n_chunks(); c++) { + const int blk = kvflash_pager_.block_of(c); + if (blk < 0) continue; + for (int i = 0; i < ct; i++) { + if ((int64_t)c * ct + i >= base_pos) break; + kvf_mask[(size_t)blk * ct + i] = F16_ZERO; + } + } + for (int q = 1; q < n_tokens; q++) { + std::memcpy(kvf_mask.data() + (size_t)q * kvd, kvf_mask.data(), kvd * 2); + } + for (int q = 0; q < n_tokens; q++) { + for (int i = 0; i <= q; i++) { + kvf_mask[(size_t)q * kvd + kvf_slots[(size_t)i]] = F16_ZERO; + } + } + } + ggml_backend_tensor_set(prefn_sg.attn_mask, kvf_mask.data(), 0, + sizeof(uint16_t) * kvf_mask.size()); + } else if (prefn_sg.attn_mask) { const int kv_len = base_pos + n_tokens; const int kv_pad_override = (int)prefn_sg.attn_mask->ne[0]; std::vector mask_buf; @@ -1542,14 +1690,27 @@ bool Qwen35MoeBackend::hybrid_forward_batch( std::vector ffn_batch_out; bool ffn_ok = false; - if (storage.cold_expert_ids.empty()) { - // All-hot: use batched hot-only path + // Spark expert cache: pull the verify batch's selected cold experts into + // spare GPU slots (LRU) so the batched FFN serves them on-die — the SAME + // residency mechanism the AR pipelined path uses. Without this the verify + // re-evaluated cold experts on the CPU every step, which dominated its FFN + // time (the spec-decode-with-offloading inefficiency). After warmup the + // working set is resident and the CPU cold path is rarely taken. 
+ const int n_route_slots = n_tokens * n_expert_used; + if (storage.cache_slots > 0 && !storage.cold_expert_ids.empty()) { + for (int i = 0; i < n_route_slots; ++i) + dflash::common::moe_hybrid_cache_swap_in(storage, chunk_selected[(size_t)i], target_backend()); + } + const bool routed_all_hot = storage.cold_expert_ids.empty() + || storage.all_routed_are_hot(chunk_selected.data(), n_route_slots); + if (routed_all_hot) { + // All routed experts resident on GPU: fast batched hot-only path. ffn_ok = eval_moe_hot_only_batched( target_backend(), chunk_cfg, chunk_desc, storage, chunk_post.data(), chunk_selected.data(), chunk_weights.data(), n_tokens, ffn_batch_out, nullptr, &ffn_hot_alloc); } else { - // Mixed hot/cold: use hybrid path + // Cache full / residue still cold: hybrid path (remaining cold on CPU). ffn_ok = eval_moe_hybrid_ffn_batched( target_backend(), target_weights().moe_hybrid->cpu_backend, chunk_cfg, chunk_desc, storage, @@ -1619,29 +1780,13 @@ bool Qwen35MoeBackend::hybrid_forward_batch( act_cur.assign(embed_all.data() + (size_t)(n_tokens - 1) * (size_t)hidden, embed_all.data() + (size_t)n_tokens * (size_t)hidden); - // Project ALL tokens to logits and get argmax for each - const int vocab = target_weights().n_vocab; + // Project ALL tokens to logits and argmax ON THE GPU, reading back only + // n_tokens token ids instead of vocab*n_tokens floats. The host logits + // readback + host argmax was a large per-step D2H cost in the verify and + // replay forwards (vocab ~152k x n_tokens x 4B, twice per spec step). 
argmax_out.resize(n_tokens); - StepGraph proj_sg; - ggml_init_params ip{}; - ip.mem_size = 64 * 1024 * 1024; - ip.mem_buffer = nullptr; - ip.no_alloc = true; - proj_sg.ctx = ggml_init(ip); - if (!proj_sg.ctx) return false; - - proj_sg.hidden_input = ggml_new_tensor_2d(proj_sg.ctx, GGML_TYPE_F32, hidden, n_tokens); - ggml_set_input(proj_sg.hidden_input); - proj_sg.gf = ggml_new_graph_custom(proj_sg.ctx, 1024, false); - ggml_tensor * normed = ggml_rms_norm(proj_sg.ctx, proj_sg.hidden_input, target_weights().rms_eps); - normed = ggml_mul(proj_sg.ctx, normed, target_weights().out_norm); - proj_sg.logits = ggml_mul_mat(proj_sg.ctx, target_weights().output, normed); - ggml_set_output(proj_sg.logits); - ggml_build_forward_expand(proj_sg.gf, proj_sg.logits); - proj_sg.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(target_backend())); - if (!ggml_gallocr_alloc_graph(proj_sg.alloc, proj_sg.gf)) { - step_graph_destroy(proj_sg); + if (!build_lm_head_projection_step(proj_sg, target_weights(), target_backend(), n_tokens)) { return false; } ggml_backend_tensor_set(proj_sg.hidden_input, embed_all.data(), 0, @@ -1651,35 +1796,36 @@ bool Qwen35MoeBackend::hybrid_forward_batch( step_graph_destroy(proj_sg); return false; } - - // Read logits and compute argmax per token - std::vector logits_buf((size_t)vocab * (size_t)n_tokens); - ggml_backend_tensor_get(proj_sg.logits, logits_buf.data(), 0, - sizeof(float) * logits_buf.size()); + ggml_backend_tensor_get(proj_sg.argmax_tokens, argmax_out.data(), 0, + sizeof(int32_t) * (size_t)n_tokens); step_graph_destroy(proj_sg); - - for (int t = 0; t < n_tokens; ++t) { - const float * tok_logits = logits_buf.data() + (size_t)t * (size_t)vocab; - int32_t best_id = 0; - float best_val = tok_logits[0]; - for (int j = 1; j < vocab; ++j) { - if (tok_logits[j] > best_val) { - best_val = tok_logits[j]; - best_id = j; - } - } - argmax_out[t] = best_id; - } return true; } bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen, 
std::vector & out_tokens, - const DaemonIO & io) { + const DaemonIO & io, + float * accept_rate_out) { const int hidden = target_weights().n_embd; const int q_len = draft_weights().block_size; if (q_len <= 0) return false; + // Verify width: cap how many draft tokens we actually verify. The batched + // verify's cost is dominated by the distinct experts its tokens touch + // (especially under --spark expert offload, where extra tokens stream extra + // cold experts). Tokens past the realized accept length are wasted, so + // capping the verify to a width above the typical accept length cuts that + // waste at no acceptance cost. Default = full draft block; tune via env. + // Verify-width control (see note above). DFLASH_VERIFY_WIDTH pins a fixed + // width; otherwise the width adapts to the realized accept length so chain + // decoding (low AL) verifies just a few tokens (cheap, especially under + // expert offload) while a high-AL draft still gets enough width. + const int forced_verify_width = [&]{ + const char * e = std::getenv("DFLASH_VERIFY_WIDTH"); + return e ? std::max(1, std::min(q_len, std::atoi(e))) : 0; + }(); + int observed_max_accept = 1; + int32_t last_tok = target_cache().last_tok; std::vector act_cur((size_t)hidden); @@ -1696,10 +1842,22 @@ bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen, int n_draft_steps = 0; int n_accept_sum = 0; + // Allocate DeltaNet rollback snapshot tensors (no-op if already present). + // Without these, snapshot_ssm_state/restore_ssm_state silently do nothing + // and rejected draft tokens leak into the recurrent state, collapsing output. 
+ if (!ensure_ssm_snapshot(target_cache(), target_backend())) { + std::fprintf(stderr, "[hybrid-spec] ensure_ssm_snapshot failed\n"); + step_graph_destroy(draft_sg); + return false; + } + auto t_dec0 = std::chrono::steady_clock::now(); while (n_generated < n_gen) { const int need_commit_budget = n_gen - n_generated; + const int verify_width = forced_verify_width > 0 + ? forced_verify_width + : std::min(q_len, std::max(6, observed_max_accept + 2)); // 1. Build noise input for draft noise_ids[0] = last_tok; @@ -1785,9 +1943,9 @@ bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen, // 4. Verify: snapshot recurrent state, then run ALL draft tokens batched snapshot_ssm_state(target_cache()); - target_tok.resize(q_len); + target_tok.resize(verify_width); bool verify_ok = hybrid_forward_batch( - draft_tok.data(), q_len, committed, + draft_tok.data(), verify_width, committed, act_cur, target_tok, /*capture_features=*/false); if (!verify_ok) { std::fprintf(stderr, "[hybrid-spec] verify failed\n"); @@ -1798,11 +1956,12 @@ bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen, // 5. Acceptance: longest matching prefix int accept_n = 1; - for (int i = 0; i < q_len - 1; i++) { + for (int i = 0; i < verify_width - 1; i++) { if (draft_tok[i + 1] == target_tok[i]) accept_n++; else break; } - int bonus_tok = (accept_n < q_len) ? target_tok[accept_n - 1] : -1; + int bonus_tok = (accept_n < verify_width) ? target_tok[accept_n - 1] : -1; + observed_max_accept = std::max(observed_max_accept, accept_n); int commit_n = accept_n + (bonus_tok >= 0 ? 
1 : 0); if (commit_n > need_commit_budget) { commit_n = need_commit_budget; @@ -1859,6 +2018,10 @@ bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen, const double decode_s = std::chrono::duration(t_dec1 - t_dec0).count(); const int total_draft_pos = std::max(1, n_draft_steps * q_len); const double accept_pct = 100.0 * (double)n_accept_sum / (double)total_draft_pos; + if (accept_rate_out) { + *accept_rate_out = total_draft_pos > 0 + ? (float)((double)n_accept_sum / (double)total_draft_pos) : 0.0f; + } std::fprintf(stderr, "[hybrid-spec] tokens=%d time=%.3f s speed=%.2f tok/s " "steps=%d accepted=%d/%d (%.1f%%) avg_commit=%.2f AL=%.2f\n", n_generated, decode_s, diff --git a/server/src/qwen35moe/qwen35moe_backend.h b/server/src/qwen35moe/qwen35moe_backend.h index ca154e405..d2f711a4c 100644 --- a/server/src/qwen35moe/qwen35moe_backend.h +++ b/server/src/qwen35moe/qwen35moe_backend.h @@ -61,7 +61,8 @@ class Qwen35MoeBackend : public Qwen35Backend { // verify via hybrid forward (layer-by-layer with hot/cold FFN). bool do_hybrid_spec_decode(int committed, int n_gen, std::vector & out_tokens, - const DaemonIO & io); + const DaemonIO & io, + float * accept_rate_out = nullptr); // Run one token through hybrid forward, capturing features at capture layers. // Returns the logits argmax token. Advances committed by 1. 
diff --git a/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp b/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp index 72cb03975..bfd4df479 100644 --- a/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp +++ b/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp @@ -314,12 +314,16 @@ bool pipelined_decode_one_token( MoeHybridStorage & hybrid, int kv_pos, int kq_stride_pad, - PipelinedDecodeTelemetry * tel) { + PipelinedDecodeTelemetry * tel, + int kv_slot) { const int n_layer = state.n_layer; const int n_embd = state.n_embd; const int n_expert_used = state.n_expert_used; ggml_backend_t cpu_be = hybrid.cpu_backend; + // Physical KV row for this token: kvflash pool slot, or the logical + // position itself. positions (RoPE) always carry the logical kv_pos. + const int kv_row = kv_slot >= 0 ? kv_slot : kv_pos; if (tel) { *tel = PipelinedDecodeTelemetry{}; @@ -503,7 +507,12 @@ bool pipelined_decode_one_token( bool attn_cached_ok = false; if (is_attn && !g_no_kvpad) { auto & cpg = state.cached_prefn[(size_t)il]; - const int kv_win_needed = ((kv_pos + 1) + 255) & ~255; + // Clamp the baked FA span to the cache tensor's physical capacity: + // with kvflash the tensors are pool-sized, so the window stops + // growing at the pool (and the cached graph never rebuilds again). 
+ const int kv_phys = (int)cache.attn_k[0]->ne[1]; + const int kv_win_needed = + std::min(((kv_pos + 1) + 255) & ~255, kv_phys); if (!cpg.valid() || cpg.kv_win < kv_win_needed) { if (!build_cached_attn_prefn(cpg, backend, w, cache, il, kv_win_needed, kq_stride_pad)) { @@ -519,7 +528,7 @@ bool pipelined_decode_one_token( ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.act_cur, cpg.inp_embed); int32_t pos4[4] = {kv_pos, kv_pos, kv_pos, 0}; ggml_backend_tensor_set_async(backend, cpg.positions, pos4, 0, sizeof(pos4)); - std::vector row_vals((size_t)w.n_head_kv, (int64_t)kv_pos); + std::vector row_vals((size_t)w.n_head_kv, (int64_t)kv_row); ggml_backend_tensor_set_async(backend, cpg.kv_write_rows, row_vals.data(), 0, sizeof(int64_t) * row_vals.size()); @@ -536,7 +545,16 @@ bool pipelined_decode_one_token( moe_weights_tensor = cpg.moe_weights; } else if (is_attn || !state.cached_prefn[(size_t)il].valid()) { // Attention layer (legacy/fallback) OR failed DeltaNet cache: - // rebuild graph dynamically + // rebuild graph dynamically. The legacy path writes KV at the + // literal view offset kv_pos and cannot express a pool slot — + // refuse instead of corrupting the pool / running off its end. + if (is_attn && kv_slot >= 0) { + std::fprintf(stderr, + "[pipelined] kvflash requires the cached set_rows attn path " + "(layer %d cached-graph build failed)\n", il); + step_graph_destroy(dyn_sg); + return false; + } if (!build_layer_prefn_step(dyn_sg, w, cache, backend, il, kv_pos, /*n_tokens=*/1, /*with_mask=*/false, /*fa_window=*/0, kq_stride_pad)) { diff --git a/server/src/qwen35moe/qwen35moe_pipelined_decode.h b/server/src/qwen35moe/qwen35moe_pipelined_decode.h index ae35c775f..64d3b6bab 100644 --- a/server/src/qwen35moe/qwen35moe_pipelined_decode.h +++ b/server/src/qwen35moe/qwen35moe_pipelined_decode.h @@ -197,14 +197,18 @@ bool init_pipelined_decode_state( // Run one full token through the pipelined decode loop (all n_layer layers). 
// On success, gpu_state.act_cur holds the final hidden state on GPU. // selected_ids_out / weights_out: optional per-layer routing capture for telemetry. +// kv_slot: physical KV row to write (kvflash pool slot); -1 = kv_pos (identity, +// no pool). The FA span clamps to the cache tensor's physical capacity, so +// pool-sized tensors bound the cached-graph window automatically. bool pipelined_decode_one_token( PipelinedDecodeState & state, ggml_backend_t backend, const TargetWeights & w, TargetCache & cache, MoeHybridStorage & hybrid, - int kv_pos, // current KV position + int kv_pos, // current KV position (logical; drives RoPE) int kq_stride_pad, - PipelinedDecodeTelemetry * telemetry = nullptr); + PipelinedDecodeTelemetry * telemetry = nullptr, + int kv_slot = -1); } // namespace dflash::common diff --git a/server/src/server/server_main.cpp b/server/src/server/server_main.cpp index bbe274dbc..36c28e400 100644 --- a/server/src/server/server_main.cpp +++ b/server/src/server/server_main.cpp @@ -408,6 +408,33 @@ int main(int argc, char ** argv) { bargs.fast_rollback = true; } else if (std::strcmp(argv[i], "--ddtree-budget") == 0 && i + 1 < argc) { bargs.ddtree_budget = std::atoi(argv[++i]); + } else if (std::strcmp(argv[i], "--kvflash") == 0 && i + 1 < argc) { + // Bounded KV residency: attention KV lives in a fixed pool of N + // tokens; cold 64-token chunks page to host. Works with or + // without pflash (drafter becomes the reselect scorer when + // loaded; plain LRU otherwise). Forces AR decode. 
+ ++i; + if (std::strcmp(argv[i], "auto") != 0 && std::atoi(argv[i]) <= 0) { + std::fprintf(stderr, "--kvflash expects a positive token count or " + "'auto', got '%s'\n", argv[i]); + return 1; + } + ::setenv("DFLASH_KVFLASH", argv[i], 1); + } else if (std::strcmp(argv[i], "--kvflash-policy") == 0 && i + 1 < argc) { + ++i; + if (std::strcmp(argv[i], "drafter") != 0 && std::strcmp(argv[i], "lru") != 0) { + std::fprintf(stderr, "--kvflash-policy expects 'drafter' or 'lru', got '%s'\n", + argv[i]); + return 1; + } + ::setenv("DFLASH_KVFLASH_POLICY", argv[i], 1); + } else if (std::strcmp(argv[i], "--kvflash-tau") == 0 && i + 1 < argc) { + if (std::atoi(argv[++i]) <= 0) { + std::fprintf(stderr, "--kvflash-tau expects a positive interval, got '%s'\n", + argv[i]); + return 1; + } + ::setenv("DFLASH_KVFLASH_TAU", argv[i], 1); } else if (std::strcmp(argv[i], "--spark") == 0) { spark_autotune = true; } else if (std::strcmp(argv[i], "--spark-slots") == 0 && i + 1 < argc) { @@ -459,6 +486,9 @@ int main(int argc, char ** argv) { sconfig.pflash_keep_ratio = (float)std::atof(argv[++i]); } else if (std::strcmp(argv[i], "--prefill-drafter") == 0 && i + 1 < argc) { sconfig.pflash_drafter_path = argv[++i]; + // kvflash reads this to lazy-attach the drafter as its + // residency scorer even when prefill compression is off. + ::setenv("DFLASH_KVFLASH_DRAFTER", argv[i], 1); } else if (std::strcmp(argv[i], "--prefill-skip-park") == 0) { sconfig.pflash_skip_park = true; } else if (std::strcmp(argv[i], "--prefill-upstream-base") == 0 && i + 1 < argc) { diff --git a/server/test/test_kvflash.cpp b/server/test/test_kvflash.cpp new file mode 100644 index 000000000..3f3634ac6 --- /dev/null +++ b/server/test/test_kvflash.cpp @@ -0,0 +1,1082 @@ +// test_kvflash — verifies KVFlash, the bounded-resident-pool KV cache +// (kvflash_pager.h). +// +// Runs against one loaded qwen35 target: +// +// A baseline: cache at LOGICAL context (default 131072), maskless decode +// (production AR path shape). 
Reference tokens + baseline KV memory. +// B relocation proof: small pool, chunks at SHUFFLED physical blocks, +// explicit pool slot mask, teacher-forced replay of A. Argmax must +// track A (position-independence + mask exactness). +// C paging proof: pool ≪ prompt+gen, live eviction, bit-exact +// page_out/page_in roundtrip, KV bytes vs A. +// D reselect/recall: evicted chunk recalled via score_hook + reselect() +// (the FlashMemory τ-step lookahead machinery); decode continues. +// E performance profile: decode ms/step vs FA span — baseline at +// 8K/32K/128K vs pool 1K/4K at 128K-logical — plus page-event and +// mask-refill microbenchmarks. +// +// Usage: +// test_kvflash [--logical-ctx=N] [--pool-b=N] [--pool-c=N] +// [--prompt=N] [--gen=N] [--skip-profile] [--no-mask] +// modes: (default) verification suite A-F | --niah | --niah256 | --longab + +#include "dflash27b.h" +#include "internal.h" +#include "kvflash_pager.h" +#include "attn_masks.h" +#include "qwen3_drafter.h" +#include "qwen3_kvflash_scorer.h" + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml-cuda.h" + +#include +#include +#include +#include +#include +#include +#include + +using namespace dflash::common; + +namespace { + +double now_ms() { + return std::chrono::duration( + std::chrono::steady_clock::now().time_since_epoch()).count(); +} + +size_t kv_cache_bytes(const TargetCache & c) { + size_t n = 0; + for (auto * t : c.attn_k) if (t) n += ggml_nbytes(t); + for (auto * t : c.attn_v) if (t) n += ggml_nbytes(t); + return n; +} + +size_t vram_used_now() { + size_t free_b = 0, total_b = 0; + ggml_backend_cuda_get_device_memory(0, &free_b, &total_b); + return total_b - free_b; +} + +// Single-token stepper over build_qwen35_graph with explicit control of: +// * kv_write_rows — physical pool slot for the KV append +// * positions — logical position (M-RoPE) +// * span — FA window length (kv_start = span-1 in graph terms) +// * attn_mask — optional 
[align32(span_padded), 32] f16 slot mask +// +// The graph arena and gallocr persist across rebuilds (same trick as +// build_target_step) so identical topology lands at identical addresses +// and the ggml-cuda CUDA-graph cache can replay decode steps. +struct Stepper { + ggml_context * ctx = nullptr; + ggml_cgraph * gf = nullptr; + ggml_gallocr_t alloc = nullptr; + ggml_tensor * inp_embed = nullptr; + ggml_tensor * positions = nullptr; + ggml_tensor * attn_mask = nullptr; + ggml_tensor * kv_write_rows = nullptr; + ggml_tensor * logits = nullptr; + ggml_tensor * argmax_tokens = nullptr; + + const TargetWeights * w = nullptr; + TargetCache * cache = nullptr; + ggml_backend_t backend = nullptr; + int span = 0; + bool with_mask = false; + + std::vector arena; + std::vector embed_buf; + std::vector mask_buf; + uint64_t mask_epoch = (uint64_t)-1; + double mask_fill_ms_total = 0.0; + int mask_fills = 0; + + bool init(const TargetWeights & tw, TargetCache & tc, ggml_backend_t be, + int span_, bool with_mask_) { + w = &tw; cache = &tc; backend = be; + span = span_; with_mask = with_mask_; + embed_buf.resize(tw.n_embd); + arena.resize((size_t)512 * 1024 * 1024); + return build(); + } + + bool build() { + if (ctx) { ggml_free(ctx); ctx = nullptr; } + ggml_init_params ip{}; + ip.mem_size = arena.size(); + ip.mem_buffer = arena.data(); + ip.no_alloc = true; + ctx = ggml_init(ip); + if (!ctx) return false; + + const int hidden = w->n_embd; + inp_embed = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hidden, 1, 1); + ggml_set_input(inp_embed); + positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); + ggml_set_input(positions); + kv_write_rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I64, 1, w->n_head_kv); + ggml_set_input(kv_write_rows); + + attn_mask = nullptr; + if (with_mask) { + // FA span is padded to 256 on the step-invariant path; the mask + // kv dim must cover it. 
+ const int span_padded = std::min(((span + 255) / 256) * 256, + (int)cache->attn_k[0]->ne[1]); + attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, + align_up(span_padded, KQ_MASK_PAD), + align_up(1, KQ_MASK_PAD)); + ggml_set_input(attn_mask); + mask_buf.assign((size_t)attn_mask->ne[0] * attn_mask->ne[1], F16_NEG_INF); + mask_epoch = (uint64_t)-1; + } + + gf = ggml_new_graph_custom(ctx, 16384, false); + + QwenGraphInputs gi{}; + gi.inp_embed = inp_embed; + gi.positions = positions; + gi.attn_mask = attn_mask; + gi.n_tokens = 1; + gi.kv_start = span - 1; + gi.capture_layers = false; + gi.kv_write_rows = kv_write_rows; + + QwenGraphOutputs go = build_qwen35_graph(ctx, gf, *w, *cache, gi); + if (!go.logits) return false; + logits = go.logits; + ggml_set_output(logits); + argmax_tokens = ggml_argmax(ctx, logits); + ggml_set_output(argmax_tokens); + ggml_build_forward_expand(gf, argmax_tokens); + + if (!alloc) alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + return ggml_gallocr_alloc_graph(alloc, gf); + } + + void refresh_mask(const KvFlashPager & pager) { + if (!attn_mask) return; + const double t0 = now_ms(); + if (pager.epoch() != mask_epoch) { + // Host-side rebuild only on residency change. + std::fill(mask_buf.begin(), mask_buf.end(), F16_NEG_INF); + pager.fill_slot_mask(mask_buf.data()); + mask_epoch = pager.epoch(); + mask_fills++; + } + // Upload EVERY step: the compute-buffer region backing this input + // tensor is reused by graph execution, so a stale upload reads as + // garbage (NaN logits) on the next step. Production prefill + // re-uploads its mask before every compute for the same reason. 
+ ggml_backend_tensor_set(attn_mask, mask_buf.data(), 0, + mask_buf.size() * sizeof(uint16_t)); + mask_fill_ms_total += now_ms() - t0; + } + + int32_t step(int32_t tok, int pos, int phys_slot) { + if (!w->embedder.embed(&tok, 1, embed_buf.data())) { + std::fprintf(stderr, "embed failed: tok=%d pos=%d (NaN logits upstream?)\n", tok, pos); + std::exit(1); + } + ggml_backend_tensor_set(inp_embed, embed_buf.data(), 0, + sizeof(float) * embed_buf.size()); + int32_t p4[4] = { pos, pos, pos, 0 }; + ggml_backend_tensor_set(positions, p4, 0, sizeof(int32_t) * 4); + std::vector rows(w->n_head_kv, (int64_t)phys_slot); + ggml_backend_tensor_set(kv_write_rows, rows.data(), 0, + sizeof(int64_t) * rows.size()); + if (ggml_backend_graph_compute(backend, gf) != GGML_STATUS_SUCCESS) { + std::fprintf(stderr, "graph_compute failed pos=%d\n", pos); + std::exit(1); + } + int32_t next = 0; + ggml_backend_tensor_get(argmax_tokens, &next, 0, sizeof(int32_t)); + return next; + } + + void destroy() { + if (alloc) { ggml_gallocr_free(alloc); alloc = nullptr; } + if (ctx) { ggml_free(ctx); ctx = nullptr; } + } +}; + +std::vector make_prompt(int n, int vocab) { + std::vector p(n); + uint64_t s = 0x9E3779B97F4A7C15ull; + // Cap below the drafter vocab too (Qwen3-0.6B ~151K) so the same ids + // are scoreable by the indexer in run F. + const int cap = std::min(vocab, 100000); + for (int i = 0; i < n; i++) { + s = s * 6364136223846793005ull + 1442695040888963407ull; + p[i] = (int32_t)(1000 + (s >> 33) % (uint64_t)(cap / 2)); + } + return p; +} + +// Pooled chunked prefill: 64-token (one pager chunk) batched forwards with +// slot-mapped set_rows writes and a resident+causal mask. This is the +// prompt > pool path: prefill evicts like decode does. Graph is built once +// (fixed topology) and reused for every chunk. 
+struct BatchStepper { + ggml_context * ctx = nullptr; + ggml_cgraph * gf = nullptr; + ggml_gallocr_t alloc = nullptr; + ggml_tensor * inp_embed = nullptr; + ggml_tensor * positions = nullptr; + ggml_tensor * attn_mask = nullptr; + ggml_tensor * kv_write_rows = nullptr; + ggml_tensor * logits = nullptr; + ggml_tensor * argmax_tokens = nullptr; + + const TargetWeights * w = nullptr; + TargetCache * cache = nullptr; + ggml_backend_t backend = nullptr; + int pool = 0; + static constexpr int NB = 64; // tokens per chunk + + std::vector arena; + std::vector embed_buf; + std::vector mask_buf; + + bool init(const TargetWeights & tw, TargetCache & tc, ggml_backend_t be, int pool_) { + w = &tw; cache = &tc; backend = be; pool = pool_; + embed_buf.resize((size_t)tw.n_embd * NB); + arena.resize((size_t)512 * 1024 * 1024); + + ggml_init_params ip{}; + ip.mem_size = arena.size(); + ip.mem_buffer = arena.data(); + ip.no_alloc = true; + ctx = ggml_init(ip); + if (!ctx) return false; + + inp_embed = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, tw.n_embd, NB, 1); + ggml_set_input(inp_embed); + positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4 * NB); + ggml_set_input(positions); + kv_write_rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I64, NB, tw.n_head_kv); + ggml_set_input(kv_write_rows); + attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, + align_up(pool, KQ_MASK_PAD), + align_up(NB, KQ_MASK_PAD)); + ggml_set_input(attn_mask); + mask_buf.assign((size_t)attn_mask->ne[0] * attn_mask->ne[1], F16_NEG_INF); + + gf = ggml_new_graph_custom(ctx, 16384, false); + QwenGraphInputs gi{}; + gi.inp_embed = inp_embed; + gi.positions = positions; + gi.attn_mask = attn_mask; + gi.n_tokens = NB; + gi.kv_start = pool - NB; // span = whole pool + gi.kv_write_rows = kv_write_rows; + gi.last_token_logits_only = true; + QwenGraphOutputs go = build_qwen35_graph(ctx, gf, *w, *cache, gi); + if (!go.logits) return false; + logits = go.logits; + ggml_set_output(logits); + argmax_tokens = ggml_argmax(ctx, logits); 
+ ggml_set_output(argmax_tokens); + ggml_build_forward_expand(gf, argmax_tokens); + alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + return ggml_gallocr_alloc_graph(alloc, gf); + } + + // One 64-token chunk at logical [pos_base, pos_base+64). Allocates the + // chunk's block (evicting if needed), writes slot-mapped, masks + // resident slots + causal-within-chunk. Returns last-token argmax. + int32_t step_chunk(const int32_t * toks, int pos_base, KvFlashPager & pager) { + int slots[NB]; + for (int i = 0; i < NB; i++) slots[i] = pager.slot_for(pos_base + i); + + if (!w->embedder.embed(toks, NB, embed_buf.data())) { + std::fprintf(stderr, "batch embed failed @%d\n", pos_base); + std::exit(1); + } + ggml_backend_tensor_set(inp_embed, embed_buf.data(), 0, + sizeof(float) * embed_buf.size()); + std::vector p4((size_t)4 * NB); + for (int i = 0; i < NB; i++) { + p4[4 * i + 0] = p4[4 * i + 1] = p4[4 * i + 2] = pos_base + i; + p4[4 * i + 3] = 0; + } + ggml_backend_tensor_set(positions, p4.data(), 0, sizeof(int32_t) * p4.size()); + // [n_tokens, n_head_kv] ne0-major: (token i, head h) at i + h*NB. + std::vector rows((size_t)NB * w->n_head_kv); + for (int h = 0; h < w->n_head_kv; h++) { + for (int i = 0; i < NB; i++) { + rows[(size_t)h * NB + i] = slots[i]; + } + } + ggml_backend_tensor_set(kv_write_rows, rows.data(), 0, + sizeof(int64_t) * rows.size()); + + // Mask: per q row, resident slots (excluding this chunk) attendable, + // this chunk's slots causal. Rebuilt + uploaded per chunk. 
+ const size_t kvd = (size_t)attn_mask->ne[0]; + std::fill(mask_buf.begin(), mask_buf.end(), F16_NEG_INF); + pager.fill_slot_mask(mask_buf.data()); // row 0 base + const int this_block = slots[0] - slots[0] % NB; + for (int i = 0; i < NB; i++) mask_buf[(size_t)this_block + i] = F16_NEG_INF; + for (int q = 1; q < NB; q++) { + std::memcpy(mask_buf.data() + (size_t)q * kvd, mask_buf.data(), kvd * 2); + } + for (int q = 0; q < NB; q++) { + for (int i = 0; i <= q; i++) { + mask_buf[(size_t)q * kvd + slots[i]] = F16_ZERO; + } + } + ggml_backend_tensor_set(attn_mask, mask_buf.data(), 0, + mask_buf.size() * sizeof(uint16_t)); + + if (ggml_backend_graph_compute(backend, gf) != GGML_STATUS_SUCCESS) { + std::fprintf(stderr, "batch compute failed @%d\n", pos_base); + std::exit(1); + } + int32_t last = 0; + ggml_backend_tensor_get(argmax_tokens, &last, 0, sizeof(int32_t)); + return last; + } + + void destroy() { + if (alloc) { ggml_gallocr_free(alloc); alloc = nullptr; } + if (ctx) { ggml_free(ctx); ctx = nullptr; } + } +}; + + +int arg_int(int argc, char ** argv, const char * key, int defv) { + const size_t kl = std::strlen(key); + for (int i = 2; i < argc; i++) { + if (std::strncmp(argv[i], key, kl) == 0 && argv[i][kl] == '=') { + return std::atoi(argv[i] + kl + 1); + } + } + return defv; +} + +bool arg_flag(int argc, char ** argv, const char * key) { + for (int i = 2; i < argc; i++) if (std::strcmp(argv[i], key) == 0) return true; + return false; +} + +struct StepTimes { + double p50 = 0, p95 = 0, mean = 0; +}; + +StepTimes summarize(std::vector & ms) { + StepTimes r; + if (ms.empty()) return r; + std::sort(ms.begin(), ms.end()); + r.p50 = ms[ms.size() / 2]; + r.p95 = ms[(size_t)(ms.size() * 0.95)]; + for (double v : ms) r.mean += v; + r.mean /= ms.size(); + return r; +} + +} // namespace + +int main(int argc, char ** argv) { + if (argc < 2) { + std::fprintf(stderr, "usage: %s [--logical-ctx=N] [--pool-b=N] " + "[--pool-c=N] [--prompt=N] [--gen=N] [--skip-profile]\n", 
argv[0]); + return 2; + } + const int logical_ctx = arg_int(argc, argv, "--logical-ctx", 131072); + const int pool_b = arg_int(argc, argv, "--pool-b", 2048); + const int pool_c = arg_int(argc, argv, "--pool-c", 1024); + const int n_prompt = arg_int(argc, argv, "--prompt", 512); + const int n_gen = arg_int(argc, argv, "--gen", 1200); + const bool skip_prof = arg_flag(argc, argv, "--skip-profile"); + // Explicit pool slot mask: exact exclusion of non-resident slots. + // ON by default (requires the per-step re-upload in refresh_mask: the + // mask input's compute-buffer region is clobbered by graph execution). + // --no-mask falls back to the zero-row approximation production's + // padded span uses. + const bool use_mask = !arg_flag(argc, argv, "--no-mask"); + const int total = n_prompt + n_gen; + if (total > pool_b) { + std::fprintf(stderr, "config error: prompt+gen (%d) must fit pool-b (%d)\n", total, pool_b); + return 2; + } + + ggml_backend_t backend = ggml_backend_cuda_init(0); + if (!backend) { std::fprintf(stderr, "cuda init failed\n"); return 1; } + const size_t vram0 = vram_used_now(); + + TargetWeights w; + if (!load_target_gguf(argv[1], backend, w)) { + std::fprintf(stderr, "load: %s\n", dflash27b_last_error()); + return 1; + } + std::printf("[load] weights ok, vram_used=%.1f MiB\n", + (vram_used_now() - vram0) / 1048576.0); + + // ── --longab: end-to-end long-prompt A/B (speed + accuracy) ───── + // For L in {32K, 64K, 128K}: full-cache baseline vs pool-4096 with + // drafter reselect. Measures prefill time, decode tok/s over a + // 240-token free run, and needle recall (depth 0.25, outside both + // the sinks and the LRU window). + if (arg_flag(argc, argv, "--longab")) { + // Drafter loads lazily, pool mode only: the full-cache baseline at + // 256K needs every byte (weights 15.3 GiB + KV 4.6 GiB). 
+ DrafterContext dctx; + KvFlashDrafterScorer scorer(&dctx); + // Single-config mode (one process per config: the CUDA VMM pool + // grows monotonically across large-cache configs and aborts). + const int only_L = arg_int(argc, argv, "--longab-L", 0); + const int only_mode = arg_int(argc, argv, "--longab-mode", -1); // 0=full 1=pool + std::printf("\n%-7s %-10s %-9s %-9s %-9s %-9s %s\n", + "L", "mode", "prefill_s", "rescore_s", "dec_tok/s", "needle", "kv_vram"); + for (int L : { 32768, 65536, 131072, 262144 }) { + if (only_L > 0 && L != only_L) continue; + for (int mode = 0; mode < 2; mode++) { // 0=baseline 1=pool + if (only_mode >= 0 && mode != only_mode) continue; + if (mode == 1 && !dctx.loaded && + !load_drafter("/opt/lucebox/models/drafter/Qwen3-0.6B-BF16.gguf", 0, dctx)) { + std::fprintf(stderr, "drafter load failed\n"); + return 1; + } + const int pool = mode == 0 ? L : 4096; + auto prompt = make_prompt(L, w.n_vocab); + std::vector needle(48); + uint64_t ns = 0xDEADBEEFCAFEull; + for (int i = 0; i < 48; i++) { + ns = ns * 6364136223846793005ull + 1442695040888963407ull; + needle[i] = (int32_t)(1000 + (ns >> 33) % 49000); + } + const int npos = ((int)(0.25 * (L - 512)) / 32) * 32; + for (int i = 0; i < 48; i++) prompt[npos + i] = needle[i]; + + TargetCache cache; + if (!create_target_cache(w, pool, 0, backend, cache, true)) return 1; + const double kv_mib = kv_cache_bytes(cache) / 1048576.0; + KvFlashPager pager; + KvFlashConfig pc; pc.pool_tokens = pool; + if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1; + + double t0 = now_ms(); + BatchStepper bs; + if (!bs.init(w, cache, backend, pool)) return 1; + for (int p = 0; p < L; p += 64) bs.step_chunk(prompt.data() + p, p, pager); + bs.destroy(); + const double prefill_s = (now_ms() - t0) / 1000.0; + + Stepper st; + if (!st.init(w, cache, backend, pool, mode == 1)) return 1; + int32_t next = -1; + for (int i = 0; i < 32; i++) { + const int slot = pager.slot_for(L + i); + st.refresh_mask(pager); + next 
= st.step(needle[i], L + i, slot); + } + double rescore_s = 0; + if (mode == 1) { + std::vector hist = prompt; + hist.insert(hist.end(), needle.begin(), needle.begin() + 32); + std::vector scores; + t0 = now_ms(); + if (scorer.score_chunks(hist, pc.chunk_tokens, scores)) { + pager.score_hook = [&scores](int c) { + return c < (int)scores.size() ? scores[c] : 1e30f; + }; + pager.reselect(); + pager.score_hook = nullptr; + } + rescore_s = (now_ms() - t0) / 1000.0; + } + int match = 0; + for (int i = 0; i < 16; i++) { + if (next == needle[32 + i]) match++; + const int pos = L + 32 + i; + const int slot = pager.slot_for(pos); + st.refresh_mask(pager); + next = st.step(needle[32 + i], pos, slot); + } + t0 = now_ms(); + for (int i = 0; i < 240; i++) { // timed free run + const int pos = L + 48 + i; + const int slot = pager.slot_for(pos); + st.refresh_mask(pager); + next = st.step(next, pos, slot); + } + const double tok_s = 240.0 / ((now_ms() - t0) / 1000.0); + std::printf("%-7d %-10s %-9.1f %-9.1f %-9.1f %d/16 %.0f MiB\n", + L, mode == 0 ? "full" : "pool4096", + prefill_s, rescore_s, tok_s, match, kv_mib); + std::fflush(stdout); + st.destroy(); + free_target_cache(cache); + } + } + if (dctx.loaded) free_drafter(dctx); + free_target_weights(w); + ggml_backend_free(backend); + return 0; + } + + // ── --niah256: native-max-context probe (262144 logical) ──────── + // Pooled configs only: the fixed-span harness makes a full-pool + // control prefill take hours at 256K. The LRU row with the needle + // inside the recency window is the induction control (distance-free). 
+ if (arg_flag(argc, argv, "--niah256")) { + DrafterContext dctx; + if (!load_drafter("/opt/lucebox/models/drafter/Qwen3-0.6B-BF16.gguf", 0, dctx)) { + std::fprintf(stderr, "drafter load failed\n"); + return 1; + } + KvFlashDrafterScorer scorer(&dctx); + const int L = 262144, pool = 16384; // 6.25% residency + struct Cfg { const char * policy; double depth; }; + const Cfg cfgs[] = { + {"lru", 0.97}, // in-window: induction control at 256K + {"lru", 0.50}, + {"drafter", 0.10}, + {"drafter", 0.50}, + {"drafter", 0.90}, + }; + std::printf("\n%-7s %-6s %-8s %-6s %s\n", "L", "pool", "policy", "depth", "match/16"); + for (const Cfg & cfg : cfgs) { + auto prompt = make_prompt(L, w.n_vocab); + std::vector needle(48); + uint64_t ns = 0xDEADBEEFCAFEull; + for (int i = 0; i < 48; i++) { + ns = ns * 6364136223846793005ull + 1442695040888963407ull; + needle[i] = (int32_t)(1000 + (ns >> 33) % 49000); + } + const int npos = ((int)(cfg.depth * (L - 512)) / 32) * 32; + for (int i = 0; i < 48; i++) prompt[npos + i] = needle[i]; + + TargetCache cache; + if (!create_target_cache(w, pool, 0, backend, cache, true)) return 1; + KvFlashPager pager; + KvFlashConfig pc; pc.pool_tokens = pool; + if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1; + + const double t0 = now_ms(); + BatchStepper bs; + if (!bs.init(w, cache, backend, pool)) return 1; + for (int p = 0; p < L; p += 64) bs.step_chunk(prompt.data() + p, p, pager); + bs.destroy(); + std::printf("[256k] prefill %.1f s, host backing %.2f GiB\n", + (now_ms() - t0) / 1000.0, + pager.stats().host_bytes / 1073741824.0); + + Stepper st; + if (!st.init(w, cache, backend, pool, true)) return 1; + int32_t next = -1; + for (int i = 0; i < 32; i++) { + const int slot = pager.slot_for(L + i); + st.refresh_mask(pager); + next = st.step(needle[i], L + i, slot); + } + if (std::strcmp(cfg.policy, "drafter") == 0) { + std::vector hist = prompt; + hist.insert(hist.end(), needle.begin(), needle.begin() + 32); + std::vector scores; + const 
double r0 = now_ms(); + if (!scorer.score_chunks(hist, pc.chunk_tokens, scores)) { + std::printf("[256k] WARN rescore failed\n"); + } else { + std::printf("[256k] rescore %.1f s\n", (now_ms() - r0) / 1000.0); + pager.score_hook = [&scores](int c) { + return c < (int)scores.size() ? scores[c] : 1e30f; + }; + pager.reselect(); + pager.score_hook = nullptr; + } + } + int match = 0; + for (int i = 0; i < 16; i++) { + if (next == needle[32 + i]) match++; + const int pos = L + 32 + i; + const int slot = pager.slot_for(pos); + st.refresh_mask(pager); + next = st.step(needle[32 + i], pos, slot); + } + std::printf("%-7d %-6d %-8s %-6.2f %d/16\n", L, pool, cfg.policy, cfg.depth, match); + std::fflush(stdout); + st.destroy(); + free_target_cache(cache); + } + free_drafter(dctx); + free_target_weights(w); + ggml_backend_free(backend); + return 0; + } + + if (arg_flag(argc, argv, "--niah")) { + DrafterContext dctx; + const bool have_drafter = + load_drafter("/opt/lucebox/models/drafter/Qwen3-0.6B-BF16.gguf", 0, dctx); + if (!have_drafter) std::printf("[niah] drafter unavailable, skipping drafter policy\n"); + KvFlashDrafterScorer scorer(&dctx); + if (have_drafter) { + // Reserve the drafter's compute buffers at max context NOW, + // before target-side cache churn fragments the CUDA pool. + // Without this, 32K rescores OOM late in the sweep and the + // drafter policy silently degrades to LRU. + std::vector warm(33024, 1234); + std::vector tmp; + scorer.score_chunks(warm, 64, tmp); + } + + const int Ls[] = { 8192, 32768 }; + const double depths[] = { 0.10, 0.50, 0.90 }; + std::printf("\n%-7s %-6s %-8s %-6s %s\n", "L", "pool", "policy", "depth", "match/16"); + for (int L : Ls) { + const int pools[] = { L, L / 4, ((L / 10) / 256) * 256 }; + for (int pi = 0; pi < 3; pi++) { + const int pool = pools[pi]; + const char * policies[] = { "lru", "drafter" }; + const int n_pol = (pi == 0) ? 1 : (have_drafter ? 
2 : 1); // full pool: control only + for (int pol = 0; pol < n_pol; pol++) { + for (double depth : depths) { + // Needle: 48 unique-as-a-sequence tokens from the + // filler id range (matched embedding statistics). + // Query = first 32 (longer match = stronger + // induction), score the last 16. + auto prompt = make_prompt(L, w.n_vocab); + std::vector needle(48); + uint64_t ns = 0xDEADBEEFCAFEull; + for (int i = 0; i < 48; i++) { + ns = ns * 6364136223846793005ull + 1442695040888963407ull; + needle[i] = (int32_t)(1000 + (ns >> 33) % 49000); + } + const int npos = ((int)(depth * (L - 512)) / 32) * 32; + for (int i = 0; i < 48; i++) prompt[npos + i] = needle[i]; + + TargetCache cache; + if (!create_target_cache(w, pool, 0, backend, cache, true)) return 1; + KvFlashPager pager; + KvFlashConfig pc; pc.pool_tokens = pool; + if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1; + + BatchStepper bs; + if (!bs.init(w, cache, backend, pool)) return 1; + for (int p = 0; p < L; p += 64) { + bs.step_chunk(prompt.data() + p, p, pager); + } + bs.destroy(); + + Stepper st; + if (!st.init(w, cache, backend, pool, true)) return 1; + int32_t next = -1; + for (int i = 0; i < 32; i++) { // query: needle prefix + const int slot = pager.slot_for(L + i); + st.refresh_mask(pager); + next = st.step(needle[i], L + i, slot); + } + if (pol == 1) { // drafter reselect + std::vector hist = prompt; + hist.insert(hist.end(), needle.begin(), needle.begin() + 32); + std::vector scores; + if (!scorer.score_chunks(hist, pc.chunk_tokens, scores)) { + std::printf("[niah] WARN: rescore failed (L=%d pool=%d)\n", L, pool); + } else { + pager.score_hook = [&scores](int c) { + return c < (int)scores.size() ? scores[c] : 1e30f; + }; + pager.reselect(); + pager.score_hook = nullptr; + } + } + int match = 0; + for (int i = 0; i < 16; i++) { // continuation + if (next == needle[32 + i]) match++; + // Teacher-force ground truth: one miss must not + // cascade; we measure per-position retrieval. 
+ const int pos = L + 32 + i; + const int slot = pager.slot_for(pos); + st.refresh_mask(pager); + next = st.step(needle[32 + i], pos, slot); + } + std::printf("%-7d %-6d %-8s %-6.2f %d/16\n", + L, pool, pi == 0 ? "full" : policies[pol], + depth, match); + std::fflush(stdout); + st.destroy(); + free_target_cache(cache); + } + } + } + } + if (have_drafter) free_drafter(dctx); + free_target_weights(w); + ggml_backend_free(backend); + return 0; + } + + const auto prompt = make_prompt(n_prompt, w.n_vocab); + std::vector tokens_a; + size_t mem_a_kv = 0, mem_a_buf = 0, mem_a_vram = 0; + size_t mem_c_kv = 0, mem_c_buf = 0, mem_c_vram = 0; + int hard_failures = 0; + + // ── Run A: baseline at logical context, maskless ──────────────── + { + const size_t v_before = vram_used_now(); + TargetCache cache; + if (!create_target_cache(w, logical_ctx, 0, backend, cache, /*prefill_only=*/true)) { + std::fprintf(stderr, "cache A: %s\n", dflash27b_last_error()); + return 1; + } + mem_a_kv = kv_cache_bytes(cache); + mem_a_buf = ggml_backend_buffer_get_size(cache.base_buf); + mem_a_vram = vram_used_now() - v_before; + std::printf("[A] logical_ctx=%d kv=%.1f MiB base_buf=%.1f MiB vram_delta=%.1f MiB\n", + logical_ctx, mem_a_kv / 1048576.0, mem_a_buf / 1048576.0, + mem_a_vram / 1048576.0); + + Stepper st; + int32_t next = -1; + const double t0 = now_ms(); + for (int pos = 0; pos < total; pos++) { + // Production-like growing span: rebuild only when the padded + // span crosses a 256 boundary (mirrors do_ar_decode topology). + const int want_span = pos + 1; + if (!st.ctx || ((want_span + 255) / 256) != ((st.span + 255) / 256)) { + st.span = want_span; + if (!st.ctx) { if (!st.init(w, cache, backend, want_span, false)) return 1; } + else if (!st.build()) return 1; + } + const int32_t tok = pos < n_prompt ? 
prompt[pos] + : (tokens_a.push_back(next), next); + next = st.step(tok, pos, pos); + cache.cur_pos = pos + 1; + } + tokens_a.push_back(next); + std::printf("[A] decoded %zu tokens, %.1f tok/s overall\n", + tokens_a.size(), total / ((now_ms() - t0) / 1000.0)); + st.destroy(); + free_target_cache(cache); + } + + // ── Run B: relocation + mask exactness, teacher-forced ────────── + { + TargetCache cache; + if (!create_target_cache(w, pool_b, 0, backend, cache, /*prefill_only=*/true)) { + std::fprintf(stderr, "cache B: %s\n", dflash27b_last_error()); + return 1; + } + KvFlashPager pager; + KvFlashConfig pc; + pc.pool_tokens = pool_b; + if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1; + const int nb = pool_b / pc.chunk_tokens; + std::vector order(nb); + for (int i = 0; i < nb; i++) order[i] = i; + uint64_t s = 12345; + for (int i = nb - 1; i > 0; i--) { + s = s * 6364136223846793005ull + 1442695040888963407ull; + const int j = (int)((s >> 33) % (uint64_t)(i + 1)); + std::swap(order[i], order[j]); + } + pager.set_block_order(order); + + Stepper st; + if (!st.init(w, cache, backend, pool_b, use_mask)) return 1; + int mismatches = 0, first_mismatch = -1; + for (int pos = 0; pos < total; pos++) { + const int32_t tok = pos < n_prompt ? prompt[pos] : tokens_a[pos - n_prompt]; + const int slot = pager.slot_for(pos); + st.refresh_mask(pager); + const int32_t next = st.step(tok, pos, slot); + const int ref_idx = pos - n_prompt + 1; + if (pos >= n_prompt - 1 && ref_idx < (int)tokens_a.size()) { + if (next != tokens_a[ref_idx]) { + mismatches++; + if (first_mismatch < 0) first_mismatch = pos; + } + } + } + const double rate = 100.0 * mismatches / (n_gen + 1); + std::printf("[B] shuffled+masked, pool=%d: %d/%d argmax mismatches (%.2f%%), first at pos %d; " + "mask refills=%d avg=%.3f ms\n", + pool_b, mismatches, n_gen + 1, rate, first_mismatch, + st.mask_fills, st.mask_fills ? 
st.mask_fill_ms_total / st.mask_fills : 0.0); + // Gate at 2%: the flip sources are the maskless zero-row softmax + // mass plus run-to-run fattn nondeterminism; both measured ~1% + // (10-14 flips/1201 across runs), so a 1% gate flaps on noise. + std::printf("%s relocation equivalence (threshold 2%%)\n", rate <= 2.0 ? "PASS" : "FAIL"); + if (rate > 2.0) hard_failures++; + st.destroy(); + free_target_cache(cache); + } + + // ── Run C: live paging + roundtrip; D: reselect recall ────────── + { + const size_t v_before = vram_used_now(); + TargetCache cache; + if (!create_target_cache(w, pool_c, 0, backend, cache, /*prefill_only=*/true)) { + std::fprintf(stderr, "cache C: %s\n", dflash27b_last_error()); + return 1; + } + mem_c_kv = kv_cache_bytes(cache); + mem_c_buf = ggml_backend_buffer_get_size(cache.base_buf); + mem_c_vram = vram_used_now() - v_before; + + KvFlashPager pager; + KvFlashConfig pc; + pc.pool_tokens = pool_c; + if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1; + + Stepper st; + if (!st.init(w, cache, backend, pool_c, use_mask)) return 1; + int32_t next = -1; + for (int pos = 0; pos < n_prompt; pos++) { + const int slot = pager.slot_for(pos); + st.refresh_mask(pager); + next = st.step(prompt[pos], pos, slot); + cache.cur_pos = pos + 1; + } + { // bit-exact roundtrip on chunk 2 + ggml_tensor * t = cache.attn_k[0]; + const size_t seg = (size_t)pc.chunk_tokens * t->nb[1]; + std::vector before(seg), after(seg); + ggml_backend_tensor_get(t, before.data(), + (size_t)pager.block_of(2) * pc.chunk_tokens * t->nb[1], seg); + if (!pager.page_out(2) || !pager.page_in(2)) { + std::fprintf(stderr, "roundtrip paging failed\n"); return 1; + } + ggml_backend_tensor_get(t, after.data(), + (size_t)pager.block_of(2) * pc.chunk_tokens * t->nb[1], seg); + const bool exact = std::memcmp(before.data(), after.data(), seg) == 0; + std::printf("%s page_out/page_in roundtrip bit-exact (chunk 2 -> block %d)\n", + exact ? 
"PASS" : "FAIL", pager.block_of(2)); + if (!exact) hard_failures++; + } + + std::vector tokens_c; + const double t0 = now_ms(); + for (int pos = n_prompt; pos < total; pos++) { + tokens_c.push_back(next); + const int slot = pager.slot_for(pos); + st.refresh_mask(pager); + next = st.step(next, pos, slot); + cache.cur_pos = pos + 1; + } + tokens_c.push_back(next); + const double secs = (now_ms() - t0) / 1000.0; + int agree = 0; + while (agree < (int)tokens_c.size() && agree < (int)tokens_a.size() && + tokens_c[agree] == tokens_a[agree]) agree++; + const auto & ps = pager.stats(); + std::printf("[C] pool=%d masked: decode %.1f tok/s, page_outs=%" PRId64 + " page_ins=%" PRId64 " host=%.1f MiB; baseline agreement %d tokens\n", + pool_c, n_gen / secs, ps.page_outs, ps.page_ins, + ps.host_bytes / 1048576.0, agree); + std::printf("PASS paged decode with eviction (%d evictions)\n", (int)ps.page_outs); + + // ── Run D: τ-style reselect recall ────────────────────────── + { + int victim = -1; // earliest paged-out, non-sink chunk + for (int c = pc.sink_chunks; c < pager.n_chunks(); c++) { + if (!pager.is_resident(c)) { victim = c; break; } + } + if (victim < 0) { + std::printf("FAIL reselect demo: no paged-out chunk found\n"); + hard_failures++; + } else { + // Score injection: the victim becomes the hottest chunk — + // stands in for a drafter rescore flagging recalled context. + pager.score_hook = [&](int c) { return c == victim ? 2.0f : 1.0f / (1 + c); }; + const double r0 = now_ms(); + const int events = pager.reselect(); + const double r_ms = now_ms() - r0; + const bool back = pager.is_resident(victim); + std::printf("%s reselect recalled chunk %d (%d page events, %.2f ms)\n", + back ? 
"PASS" : "FAIL", victim, events, r_ms); + if (!back) hard_failures++; + // decode must continue cleanly after the residency change + pager.score_hook = nullptr; + for (int pos = total; pos < total + 64; pos++) { + const int slot = pager.slot_for(pos); + st.refresh_mask(pager); + next = st.step(next, pos, slot); + } + std::printf("PASS decode continues after reselect (64 tokens)\n"); + } + } + st.destroy(); + free_target_cache(cache); + } + + // ── Run F: full LSA loop — drafter as Memory Indexer ──────────── + // Prompt LARGER than the pool, so prefill itself evicts; then the + // FlashMemory inference paradigm end to end: every τ=64 decoded + // tokens the drafter rescores the whole sequence (tail attention = + // indexer query), score_hook gets the fresh chunk scores, and + // reselect() repages the pool. PASS requires at least one genuine + // drafter-driven recall of a chunk evicted earlier. + { + const char * drafter_path = "/opt/lucebox/models/drafter/Qwen3-0.6B-BF16.gguf"; + DrafterContext dctx; + if (!load_drafter(drafter_path, 0, dctx)) { + std::printf("FAIL indexer run: drafter load failed (%s)\n", dflash27b_last_error()); + hard_failures++; + } else { + const int n_prompt_f = 2048, n_gen_f = 768, pool_f = 1024, tau = 64; + const auto prompt_f = make_prompt(n_prompt_f, w.n_vocab); + TargetCache cache; + if (!create_target_cache(w, pool_f, 0, backend, cache, true)) return 1; + KvFlashPager pager; + KvFlashConfig pc; + pc.pool_tokens = pool_f; + if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1; + KvFlashDrafterScorer scorer(&dctx); // the production indexer plugin + + Stepper st; + if (!st.init(w, cache, backend, pool_f, use_mask)) return 1; + std::vector all_ids = prompt_f; + int32_t next = -1; + for (int pos = 0; pos < n_prompt_f; pos++) { + const int slot = pager.slot_for(pos); + st.refresh_mask(pager); + next = st.step(prompt_f[pos], pos, slot); + } + const int64_t prefill_evictions = pager.stats().page_outs; + + std::vector rescore_ms, 
reselect_ms; + int64_t recalls = 0; + std::vector scores; + const double t0 = now_ms(); + for (int g = 0; g < n_gen_f; g++) { + const int pos = n_prompt_f + g; + if (g % tau == 0) { + double r0 = now_ms(); + if (!scorer.score_chunks(all_ids, pc.chunk_tokens, scores)) { + std::fprintf(stderr, "scorer failed\n"); + std::exit(1); + } + rescore_ms.push_back(now_ms() - r0); + pager.score_hook = [&scores](int c) { + return c < (int)scores.size() ? scores[c] : 1e30f; + }; + r0 = now_ms(); + const int64_t ins_before = pager.stats().page_ins; + pager.reselect(); + reselect_ms.push_back(now_ms() - r0); + recalls += pager.stats().page_ins - ins_before; + } + const int slot = pager.slot_for(pos); + st.refresh_mask(pager); + next = st.step(next, pos, slot); + all_ids.push_back(next); + } + const double secs = (now_ms() - t0) / 1000.0; + StepTimes rs = summarize(rescore_ms), rsel = summarize(reselect_ms); + const auto & ps = pager.stats(); + std::printf("[F] LSA loop: prompt=%d pool=%d gen=%d tau=%d -> %.1f tok/s " + "(prefill evicted %" PRId64 ")\n", + n_prompt_f, pool_f, n_gen_f, tau, n_gen_f / secs, prefill_evictions); + std::printf("[F] indexer rescore p50=%.1f ms (full 0.6B re-prefill, %zu calls); " + "reselect p50=%.2f ms; drafter-driven recalls=%" PRId64 + "; total page_outs=%" PRId64 " page_ins=%" PRId64 "\n", + rs.p50, rescore_ms.size(), rsel.p50, recalls, + ps.page_outs, ps.page_ins); + std::printf("%s LSA loop: drafter-driven recall of evicted context (recalls >= 1)\n", + recalls >= 1 ? 
"PASS" : "FAIL"); + if (recalls < 1) hard_failures++; + st.destroy(); + free_target_cache(cache); + free_drafter(dctx); + } + } + + // ── Run E: performance profile ────────────────────────────────── + if (!skip_prof) { + std::printf("\n=== DECODE PROFILE (64 timed steps each, junk KV, span = FA window) ===\n"); + auto profile = [&](const char * tag, int alloc_ctx, int span, bool masked, + KvFlashPager * pager, int pos_base) { + TargetCache cache; + if (!create_target_cache(w, alloc_ctx, 0, backend, cache, true)) { + std::fprintf(stderr, "cache E(%s): %s\n", tag, dflash27b_last_error()); + std::exit(1); + } + KvFlashPager local; + if (masked && !pager) { + KvFlashConfig pc; pc.pool_tokens = alloc_ctx; + local.attach(pc, cache.attn_k, cache.attn_v); + // mark whole pool resident so the mask is all-zero (worst + // case mask read, no -inf shortcut) + for (int p = 0; p < alloc_ctx; p += 64) local.slot_for(p); + pager = &local; + } + Stepper st; + if (!st.init(w, cache, backend, span, masked)) std::exit(1); + // warmup 8, then time 64 (refresh included: it is part of the + // real per-step cost in masked mode) + int32_t tok = 1000; + for (int i = 0; i < 8; i++) { + if (masked) st.refresh_mask(*pager); + tok = st.step(tok, pos_base + i, (i * 64) % alloc_ctx); + } + std::vector ms; + for (int i = 0; i < 64; i++) { + const double t0 = now_ms(); + if (masked) st.refresh_mask(*pager); + tok = st.step(tok, pos_base + 8 + i, (8 * 64 + i) % alloc_ctx); + ms.push_back(now_ms() - t0); + } + const StepTimes r = summarize(ms); + std::printf("%-28s span=%6d p50=%7.2f ms p95=%7.2f ms mean=%7.2f ms (%5.1f tok/s)\n", + tag, span, r.p50, r.p95, r.mean, 1000.0 / r.mean); + st.destroy(); + free_target_cache(cache); + }; + profile("baseline 8K", 8192, 8192, false, nullptr, 8192 - 72); + profile("baseline 32K", 32768, 32768, false, nullptr, 32768 - 72); + profile("baseline 128K", 131072, 131072, false, nullptr, 131072 - 72); + profile("pool 1K masked (128K logical)", 1024, 1024, true, 
nullptr, 130000); + profile("pool 1K maskless", 1024, 1024, false, nullptr, 130000); + profile("pool 4K masked (128K logical)", 4096, 4096, true, nullptr, 130000); + + // Page-event microbench on a small pool. + { + TargetCache cache; + if (!create_target_cache(w, 1024, 0, backend, cache, true)) std::exit(1); + KvFlashPager pager; + KvFlashConfig pc; pc.pool_tokens = 1024; + pager.attach(pc, cache.attn_k, cache.attn_v); + for (int p = 0; p < 1024; p += 64) pager.slot_for(p); + std::vector out_ms, in_ms; + for (int rep = 0; rep < 32; rep++) { + const int c = 2 + (rep % 8); + double t0 = now_ms(); + pager.page_out(c); + out_ms.push_back(now_ms() - t0); + t0 = now_ms(); + pager.page_in(c); + in_ms.push_back(now_ms() - t0); + } + const StepTimes o = summarize(out_ms), i = summarize(in_ms); + std::printf("page_out: p50=%.2f ms p95=%.2f ms page_in: p50=%.2f ms p95=%.2f ms (per 64-token chunk, %zu KiB)\n", + o.p50, o.p95, i.p50, i.p95, + (size_t)(pager.stats().host_bytes / std::max(1, 8) / 1024)); + free_target_cache(cache); + } + } + + // ── Memory verdict ────────────────────────────────────────────── + const double red_kv = 100.0 * (1.0 - (double)mem_c_kv / (double)mem_a_kv); + std::printf("\n=== KV MEMORY ===\n"); + std::printf("baseline (ctx %6d): kv=%9.1f MiB base_buf=%9.1f MiB vram_delta=%9.1f MiB\n", + logical_ctx, mem_a_kv / 1048576.0, mem_a_buf / 1048576.0, mem_a_vram / 1048576.0); + std::printf("pooled (pool %5d): kv=%9.1f MiB base_buf=%9.1f MiB vram_delta=%9.1f MiB\n", + pool_c, mem_c_kv / 1048576.0, mem_c_buf / 1048576.0, mem_c_vram / 1048576.0); + std::printf("attn-KV reduction: %.1f%%\n", red_kv); + std::printf("%s KV memory reduction >= 90%%\n", red_kv >= 90.0 ? "PASS" : "FAIL"); + if (red_kv < 90.0) hard_failures++; + + free_target_weights(w); + ggml_backend_free(backend); + std::printf("\n%s (%d hard failures)\n", hard_failures == 0 ? "ALL PASS" : "FAILED", hard_failures); + return hard_failures == 0 ? 0 : 1; +}