diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 46919debe..e870ef385 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -128,8 +128,8 @@ jobs:
needs: [uv-workspace]
runs-on: [self-hosted, gpu, sm86]
timeout-minutes: 30
- # The box has a single physical GPU: serialize GPU jobs across PRs instead
- # of letting concurrent runs clobber each other.
+ # Serialize CUDA jobs across PRs (one RTX 3090). The ROCm job has its
+ # own group: different physical GPU, no contention.
concurrency:
group: lucebox3-gpu-runner
cancel-in-progress: false
@@ -197,15 +197,51 @@ jobs:
needs: [uv-workspace]
runs-on: [self-hosted, rocm, gfx1151]
timeout-minutes: 20
- # Same single box as gpu-tests: serialize GPU jobs across PRs.
+ # Serialize across PRs per GPU. NOT the same group as the CUDA job:
+ # the combo box has two distinct GPUs (RTX 3090 + Strix iGPU), and a
+ # shared group only holds one waiting job, so the Radeon leg was
+ # chronically displaced ("higher priority waiting request") by every
+ # new CUDA job entering the queue.
concurrency:
- group: lucebox3-gpu-runner
+ group: lucebox3-rocm-runner
cancel-in-progress: false
steps:
- uses: actions/checkout@v4
+ - name: KFD health (diagnose instead of hanging)
+ # rocminfo on a wedged KFD blocks in uninterruptible sleep and eats
+ # the whole 20-minute job timeout. Probe with a hard timeout first,
+ # and when it hangs, dump the evidence (D-state holders, dmesg) so
+ # the job fails in seconds with a diagnosis instead of silently.
+ run: |
+ # A wedged KFD puts rocminfo in UNINTERRUPTIBLE sleep: timeout(1)
+ # cannot kill it and a foreground wait blocks until the job
+ # timeout. Probe in the background (output to a file so no pipe
+ # keeps the step alive) and enforce the deadline in the shell.
+ /opt/rocm/bin/rocminfo > /tmp/rocminfo.out 2>&1 &
+ PROBE=$!
+ for i in $(seq 1 15); do
+ kill -0 $PROBE 2>/dev/null || break
+ sleep 1
+ done
+ if kill -0 $PROBE 2>/dev/null; then
+ echo "::error::rocminfo hung (likely D-state) — ROCm/KFD wedged; the box needs a reboot"
+ echo "--- probe state:"
+ ps -o pid,stat,wchan:32,comm -p $PROBE || true
+ echo "--- processes holding /dev/kfd:"
+ sudo fuser -v /dev/kfd 2>&1 || true
+ echo "--- D-state processes:"
+ ps -eo pid,user,stat,wchan:32,comm | awk '$3 ~ /D/' || true
+ echo "--- recent amdgpu/kfd dmesg:"
+ sudo dmesg 2>/dev/null | grep -iE "amdgpu|kfd" | tail -15 || true
+ kill -9 $PROBE 2>/dev/null || true
+ disown $PROBE 2>/dev/null || true
+ exit 1
+ fi
+ wait $PROBE && echo "KFD healthy" || { echo "::error::rocminfo exited non-zero"; cat /tmp/rocminfo.out | tail -5; exit 1; }
+
- name: ROCm smoke (rocminfo sees gfx1151)
- run: /opt/rocm/bin/rocminfo | grep -E "Name:|Marketing Name:" | grep -iE "gfx1151|Radeon 8060S"
+ run: cat /tmp/rocminfo.out | grep -E "Name:|Marketing Name:" | grep -iE "gfx1151|Radeon 8060S"
- name: Build + run HIP vector-add on the Radeon 8060S
# Self-contained HIP kernel correctness test (no model weights). This is
diff --git a/README.md b/README.md
index 0856e5375..59d3fdd96 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,10 @@ Each one is self-contained with setup instructions and benchmark notes.
+
+
+
+
---
## Supported Models & Drafters
@@ -276,6 +280,18 @@ DFLASH27B_KV_TQ3=1 \
| `--kv-cache-dir ` | — | Persist prefix cache to disk |
| `--kv-cache-budget N` | — | On-disk cache size cap |
+**Bounded KV residency (KVFlash)**
+
+Pages the attention KV cache through a fixed pool of GPU slots; cold 64-token chunks live in host RAM, bit-exact and recallable. Decode speed stops depending on context length and resident KV stays pool-sized at any context. Off by default; works on every model family. Drafter-scored residency is the default on every family: the server finds the Qwen3-0.6B drafter next to the model (or via `--prefill-drafter`) and lazy-loads it as the relevance scorer that decides which chunks stay resident — non-qwen targets (laguna, gemma4) bridge the tokenizer gap by re-tokenizing the context text for the drafter. LRU is the fallback when no drafter is present, or the explicit choice via `--kvflash-policy lru`. Per-model numbers in [Luce KVFlash →](optimizations/kvflash/README.md).
+
+| Flag / env | Default | Effect |
+|---|---|---|
+| `--kvflash <N\|auto>` | off | Resident pool size. `auto` sizes from the GPU: half of free VRAM after weights and reserves, at the model's KV density, capped where decode speed stays near the flat optimum (default 16384, override `DFLASH_KVFLASH_MAX_POOL`) and at `--max-ctx`. Explicit values are rounded to 256, clamped to `--max-ctx`, floored at the protected minimum so eviction always has a victim. |
+| `--kvflash-policy {drafter,lru}` | `drafter` | Residency policy. `lru` opts out of the drafter probe/load (recency-only paging, no extra VRAM). |
+| `--kvflash-tau N` | `64` | Reselect interval floor (drafter policy only); the effective interval grows with history to cap rescore overhead. |
+| `DFLASH_KVFLASH=N` | off | Env equivalent of `--kvflash`. |
+| `DFLASH_KVFLASH_TAU=N` | `64` | Env equivalent of `--kvflash-tau`. |
+
**Thinking budget**
| Flag | Default | Effect |
diff --git a/assets/cards/kvflash_card.png b/assets/cards/kvflash_card.png
new file mode 100644
index 000000000..1a8af70a3
--- /dev/null
+++ b/assets/cards/kvflash_card.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3f810ba8150b818309173d9c003f475b5ff41b8a3e6605772eea7ca086029b2
+size 2231695
diff --git a/optimizations/kvflash/DESIGN.md b/optimizations/kvflash/DESIGN.md
new file mode 100644
index 000000000..a8738eb27
--- /dev/null
+++ b/optimizations/kvflash/DESIGN.md
@@ -0,0 +1,272 @@
+# KVFlash design notes
+
+Mechanism details and tuning data behind [README.md](README.md); measured
+tables in [RESULTS.md](RESULTS.md).
+
+FlashMemory-style (arXiv 2606.09079) decode-time KV paging for the qwen35
+target, designed to compose with pflash. Goal: the GPU footprint of the
+full-attention KV cache is a hard O(pool) constant regardless of logical
+context length, with paged-out chunks recallable bit-exact from host.
+
+## Division of labor with pflash
+
+pflash and the pager own different resources and compose cleanly:
+
+| concern | owner |
+|---|---|
+| which prompt chunks the target ever elaborates | pflash (drafter scores, evict at prefill) |
+| which elaborated chunks occupy GPU slots | KvFlashPager (this module) |
+| prefill compute sparsity | pflash BSA kernels |
+| decode-time KV growth (generated tokens) | KvFlashPager (page out cold generated chunks) |
+
+pflash keeps the target from reading the huge context; the pager keeps
+what the target HAS elaborated inside a fixed VRAM budget and makes every
+eviction reversible. The drafter's chunk scores plug into
+`KvFlashPager::score_hook` as the residency policy (LRU fallback in the
+prototype).
+
+## Mechanism
+
+- Cache tensors are allocated at `pool_tokens` (e.g. 1024) instead of
+ `max_ctx` (e.g. 131072). That allocation delta IS the memory saving:
+ a mask over a full-size cache would save nothing.
+- Logical positions map to physical pool slots at 64-token chunk
+ granularity. The mapping rides the existing step-invariant
+ `ggml_set_rows` KV append (`kv_write_rows` carries the physical slot;
+ the `positions` input keeps the logical position for M-RoPE).
+- Decode FA spans the whole pool with an EXACT slot-validity mask
+ (`KvFlashPager::fill_slot_mask`): resident slots 0, free/paged-out -inf.
+ The host-side mask rebuilds only when the pager epoch moves; the device
+ upload happens before EVERY compute. That upload is mandatory, not an
+ optimization: input tensors live in the gallocr compute buffer, whose
+ regions are reused during graph execution, so a once-uploaded mask is
+ garbage by the next step (this masqueraded as a "fattn NaN kernel bug"
+ for a while — all-NaN logits from the second step on; production never
+ hit it because its prefill refills masks per chunk). `--no-mask` falls
+ back to maskless + zeroed freed slots (exp(-max) ~ 0, production's
+ padded-span approximation, measured ~1% argmax flips).
+- Page-out copies a chunk's quantized rows (per layer x K/V x head
+ segments) to a host backing store and zeroes the slots; page-in writes
+ them back. Quantized bytes + baked-in RoPE means the roundtrip is
+ bit-exact and relocation is position-independent.
+- Eviction protects sinks (first chunk) and the trailing window, mirroring
+ FlashMemory's always-resident floor (their last-8K + decoded window).
+ Unlike their sigmoid-threshold fetch (which leaks footprint at 500K,
+ their §3.3.1), a fixed slot pool is a hard budget by construction.
+- DeltaNet/conv recurrent state is fixed-size and never paged.
+
+## What the prototype verifies (test_kvflash)
+
+A. Baseline at logical ctx 128K: reference greedy sequence + KV bytes.
+B. Relocation proof: same workload in a small pool with SHUFFLED block
+ placement, teacher-forced — argmax must track the baseline.
+C. Live paging: pool ≪ prompt+gen, eviction engaged; bit-exact
+ page_out/page_in roundtrip; decode completes; KV bytes vs A ≥ 90% cut.
+
+## Reselect (τ-step lookahead)
+
+`KvFlashPager::reselect()` rebuilds the resident set as the top-pool chunks by
+`score_hook` over all materialized chunks (resident or host-backed),
+keeping sinks and the trailing window unconditionally. Page-outs run
+first so recalls always find free blocks. This is the FlashMemory τ=64
+loop's mechanism; the production caller invokes it every τ decoded
+tokens with fresh drafter scores. Verified in test run D: an evicted
+chunk recalled by a score flip, decode continues across the residency
+change.
+
+## Measured (lucebox RTX 3090, Qwen3.6-27B Q4_K_M, Q8_0 KV, 2026-06-11)
+
+All gates PASS (exit 0). 64 timed steps per profile row, junk KV so the
+FA span traffic is bandwidth-realistic:
+
+| config | FA span | ms/step p50 | tok/s |
+|---|---|---|---|
+| baseline 8K | 8192 | 35.1 | 28.5 |
+| baseline 32K | 32768 | 30.1 | 33.1 |
+| baseline 128K | 131072 | 45.1 | 22.1 |
+| pool 1K @128K logical | 1024 | 25.1 | 39.6 |
+| pool 4K @128K logical | 4096 | 25.7 | 38.7 |
+
+- attn-KV memory: 2304.0 -> 18.0 MiB (99.2% cut); whole cache buffer
+ 2653.6 -> 217.6 MiB, confirmed by VRAM deltas.
+- At 128K-logical decode the pool is 1.8x FASTER than the full cache
+ (45.1 -> 25.1 ms/step): FA cost is span-bound, the pool caps the span.
+- Paging: page_out p50 1.26 ms, page_in p50 0.63 ms per 64-token chunk
+ (~2.2 MiB, synchronous); 12 evictions over 1200 generated tokens
+ amortize to ~0.01 ms/token. reselect() recalling with 20 page events
+ took 21.3 ms — at τ=64 that is ~1% of decode time worst-case.
+- Relocation equivalence: 0.83% argmax flips over 1200 teacher-forced
+ tokens at shuffled placement (gate: ≤1%).
+- Open harness question: the C-loop (live eviction) measured ~34 ms/step
+ vs 25 ms for the identical config in the E-loop; suspected interaction
+ of sustained-load GPU clocks with run ordering, not paging cost (12
+ sync page events explain only ~0.01 ms/token). Re-measure under the
+ production decode loop during integration.
+
+## Full LSA loop (drafter as Memory Indexer) — measured
+
+Test run F implements the paper's complete inference paradigm with the
+pflash drafter (Qwen3-0.6B, `/opt/lucebox/models/drafter/`) standing in
+for the trained indexer: prompt (2048) larger than the pool (1024) so
+prefill itself evicts, then every τ=64 decoded tokens the drafter
+rescores the full sequence (tail attention = indexer query, chunk means
+via `drafter_chunk_scores`), `score_hook` receives the fresh scores, and
+`reselect()` repages the pool.
+
+Measured (RTX 3090, target Qwen3.6-27B Q4_K_M + drafter co-resident):
+- 31.2 tok/s with the loop active; 12 rescores over 768 generated tokens
+- 43 genuine drafter-driven recalls of previously evicted context
+- indexer rescore p50 = 245 ms (full 0.6B re-prefill at ~2-2.8K tokens —
+ ~12% decode overhead at τ=64; drops to ~ms once the drafter's own KV
+ is persisted and only the new τ tokens are pushed through it)
+- reselect p50 = 7.5 ms
+
+vs the paper: their indexer is a trained <0.1% projection head (cheaper
+queries, backbone-supervised labels); ours is the existing 0.6B drafter
+(training-free, already shipped for pflash). Their sigmoid threshold
+leaks footprint at scale (their §3.3.1); our fixed pool is a hard cap.
+
+## Production integration (daemon)
+
+The pool is wired into the qwen35 backend behind `--kvflash <N|auto>`
+(env `DFLASH_KVFLASH`; rounded to a 256 multiple) + `--kvflash-tau <N>`
+(env `DFLASH_KVFLASH_TAU`, default 64). Pieces:
+
+- `create_target_cache(..., ctx_alloc)`: attention tensors allocated at
+ pool capacity; `cache.max_ctx` stays the logical bound.
+- `do_prefill`: prompts that fit the pool land identity-mapped
+ (`kvflash_sync_prefill` rebuilds the pager map per request/restore);
+ LARGER prompts switch to pooled chunked prefill — pager-chunk batches,
+ slot-mapped set_rows writes, a slot-space mask per chunk, live
+ eviction. Constant VRAM, linear time (qwen35 only so far).
+- `do_ar_decode`: `build_target_step(..., kvflash_mask=true)` keeps the
+ step-invariant set_rows write active alongside the slot mask;
+ `kv_write_rows` carries the pool slot; the mask uploads per step;
+ every τ generated tokens `kvflash_maybe_reselect` rescores + repages.
+- Policy is agnostic by construction: `KvFlashScorer` (common/) is the
+ interface; with no scorer the pager runs pure LRU (zero pflash
+ dependency). When pflash loads its drafter, `KvFlashDrafterScorer`
+ (qwen3/) attaches automatically and reselect becomes drafter-driven.
+- Spec decode (chain mode) runs ON the pool: verify_batch slot-maps the
+ draft block via per-token kv_write_rows and builds a slot-space mask
+ (resident committed positions + causal among draft tokens). Rejected
+ drafts need no rollback: the pos < base_pos validity rule excludes
+ their slots until the replay rewrites them. All four spec KV-write
+ sites (verify, both replays, stall-prefix) route through this one
+ function. Verified on the daemon: accept_rate 15.4-15.6% pooled vs
+ 15.3% pool-off (matched avg_commit 3.47 vs 3.45), coherent output
+ through a mid-generation pool wrap with live eviction. DDTree's
+ tree-verify is not pool-aware yet and falls back to AR.
+- LAYOUT TRAP (cost a day of debugging): kv_write_rows is
+ [n_tokens, n_head_kv] ne0-major — element (token i, head h) lives at
+ i + h*n_tokens (ggml_set_rows asserts b->ne[1] == c->ne[0]). A
+ transposed fill scrambles per-head row targets for every multi-token
+ write while single-token fills (all entries equal) hide the bug
+ completely.
+- Post-generation snapshots are skipped once cur_pos exceeds the pool
+ (pooled snapshots need page-table serialization; prefill-time
+ snapshots still work).
+
+## Production smokes (dflash_server on lucebox 3090, 2026-06-11)
+
+1. WITHOUT pflash (agnostic LRU): `dflash_server <27B> --kvflash 1024`.
+ 41-token prompt + 1400 generated = 1441 logical through a 1024-slot
+ pool (live LRU eviction mid-request). Coherent story end to end,
+ 36.9 tok/s, clean finish. Second request (per-request pager reset) ok.
+2. WITH pflash: `--kvflash 2048 --prefill-compression always
+ --prefill-threshold 256 --prefill-drafter <drafter.gguf>`. Compression
+ 1468 -> 60 tokens, then `[kvflash] drafter scorer attached (tau=64)`
+ automatically; 400 coherent tokens answering from the compressed
+ context. Same binary, zero pflash-specific configuration on the pool.
+
+Ops note: the init banner is flushed now, but generally `nohup` +
+redirected stdout block-buffers printf output — kill the process (atexit
+flush) before concluding a code path didn't run.
+
+## Quality matrix (synthetic NIAH, needle recall /16, teacher-forced)
+
+| context | residency | LRU d=10/50/90% | drafter d=10/50/90% | control |
+|---|---|---|---|---|
+| 8K | 25% | 0 / 0 / 16 | 15 / 15 / 16 | 16/16 |
+| 8K | 9% | 0 / 0 / 0 | 15 / 15 / 15 | 16/16 |
+| 32K | 25% | 0 / 0 / 16 | 15 / 15 / 16 | 16/16 |
+| 32K | 9% | 0 / 0 / 0 | 15 / 15 / 15 | 15-16/16 |
+| 256K | 6.25% | 0 (d=0.5); 16/16 in-window | 14 / 15 / 15 | (in-window LRU = control) |
+
+Drafter-scored residency retains 88-100% of perfect needle recall at every
+depth down to 6-9% residency from 8K to the model's native 256K maximum;
+recency-only LRU retains zero outside its tail window. 256K logistics on
+the RTX 3090: ~6.5 min linear pooled prefill, 4.22 GiB host backing,
+~18 GiB VRAM total, 46 s bisected rescore (drafter forward ceiling ~65K
+per segment).
+
+## Tuned defaults (from the matrix)
+
+- Ship drafter scoring whenever a drafter is available; pure-LRU mode is
+ recency-only and must be documented as such.
+- Pool ~25% of expected context is the conservative default; 9% measured
+ safe for retrieval-style work.
+- tau adapts: rescore costs ~0.11 ms/history-token, so the effective
+ reselect interval is max(configured tau, history/45), capping rescore
+ overhead near 15% of decode time.
+
+## Per-architecture integration
+
+The pager core is architecture-blind; each backend routes its own KV writes
+and masks through it. What differs per arch:
+
+- **qwen35** (reference): masked set_rows decode, slot-mapped chain-spec
+ verify, drafter scorer auto-attach. Everything in RESULTS.md.
+- **qwen35moe** (Qwen3.6-35B-A3B): inherits the qwen35 path all-GPU. The
+ Spark hybrid pipelined decode keeps its per-layer cached CUDA graphs:
+ `pipelined_decode_one_token` takes a `kv_slot`, the cached FA span clamps
+ to the pool (so the graph stops rebuilding once the window hits pool
+ size), and the pool span stays MASKLESS like the rest of that path — the
+ pager zeroes freed blocks (page-out and `zero_free_blocks()` on request
+ reset), so evicted slots contribute exp(-max) ~ 0, production's own
+ padded-span approximation. Hybrid spec decode (literal-offset KV writes)
+ falls back to pipelined AR under kvflash.
+- **laguna**: ALL 40 layers pooled (full + SWA share the pager).
+ `laguna_step` / `laguna_step_hybrid` take a const pager; both masks are
+ built in SLOT space via `fill_slot_pos` (the causal / sliding-window
+ conditions evaluate on the position each slot holds). SWA exactness:
+ `tail_window_chunks >= sliding_window/64 + 1`, so positions inside the
+ window are never evicted. The per-layer hybrid decode fallback and
+ NO_KVPAD / PAD_CPY / no_mask ablations are refused under kvflash.
+- **gemma4**: pools FULL-attention layers only — SWA layers already use
+ sliding-window ring buffers and KV-reuse layers share their source's
+ tensors. The full mask is slot-space; the SWA ring path is untouched.
+ `--fa-window` (sparse full-attn) and kvflash are mutually exclusive.
+ DFlash spec verify is slot-mapped (gemma4_verify_batch gains set_rows
+ inputs + the slot-space causal mask; its KV-truncation rejection
+ semantics map directly onto the pool's validity rule). Measured:
+ identical acceptance pooled vs full (407/3104 = 13.1%, avg_commit
+ 3.09, identical text).
+
+Policy: drafter-scored residency is the default on all four archs. The
+server probes for the Qwen3-0.6B next to the model (or --prefill-drafter)
+and lazy-loads it at the first reselect; `--kvflash-policy lru` opts out.
+qwen35/qwen35moe feed the drafter target ids directly; laguna/gemma4 use
+KvFlashCrossTokScorer (detokenize -> re-tokenize -> score -> map back by
+char spans; functional but untuned, see RESULTS). `--kvflash auto` sizes
+the pool from free VRAM at the model's KV density, capped at the decode
+speed knee (16384 default).
+
+Snapshots on laguna/gemma4 are refused once a chunk has relocated
+(page_outs > 0); identity-layout snapshots before that still work.
+
+## Follow-ups
+
+Done since the prototype: pooled chunked prefill in the qwen35 daemon
+(prompt > pool, eviction during prefill), spec-decode chain verify on the
+pool, VRAM-aware auto sizing, cross-tokenizer scoring for laguna/gemma4.
+
+Open:
+1. Drafter KV persistence for the indexer (incremental rescore: push
+ only the new τ tokens through the drafter; kills the ~240 ms re-prefill).
+2. Pooled chunked prefill for laguna/gemma4 (qwen35-only today).
+3. Pooled snapshot save/restore (serialize the page table + host store).
+4. Async paging on a copy stream (currently synchronous
+ ggml_backend_tensor_get/set between steps).
+5. Teacher-forced NIAH harness for non-qwen archs + cross-tok scorer
+ tuning (tail window, normalization).
diff --git a/optimizations/kvflash/README.md b/optimizations/kvflash/README.md
new file mode 100644
index 000000000..a54406453
--- /dev/null
+++ b/optimizations/kvflash/README.md
@@ -0,0 +1,133 @@
+
+ ← lucebox-hub
+
+
+
+
+
+
+Luce KVFlash
+
+
+ Lookahead sparse attention for dflash. Bounded KV residency on one GPU.
+ The attention KV cache lives in a fixed pool of slots; cold 64-token chunks page to host RAM, bit-exact and recallable.
+ With pflash, its drafter doubles as a Memory Indexer that recalls the context the generation needs next.
+ Qwen3.6-27B Q4_K_M on a single RTX 3090: native 256K context at 38.6 tok/s with 72 MiB of resident KV,
+ needle recall 88-100% at 6% residency, harness accuracy unchanged (32/32 vs full cache).
+
+
+---
+
+```
+ decode tok/s KV in VRAM (Q8_0) needle /16 (d=25%)
+full cache @ 64K 27.8 1152 MiB 16/16
+full cache @ 128K 19.6 2304 MiB 16/16
+full cache @ 256K 13.1 4608 MiB 16/16
+KVFlash 4K @ 64K 38.6 72 MiB 14/16
+KVFlash 4K @ 128K 38.6 72 MiB 14/16
+KVFlash 4K @ 256K 38.6 72 MiB 15/16
+```
+
+Decode speed is flat at any context length (the per-step KV read is pool-sized,
+not context-sized), prefill is up to 2.8x faster, and a 256K prompt that costs
+4.6 GiB of VRAM as a full cache costs 72 MiB resident + 4.2 GiB of host RAM.
+(The full-cache 256K rows are measured, not extrapolated: they fit the 24 GB
+card only thanks to Q8_0 KV; with F16 KV the cache alone is 9.2 GiB and 256K
+does not fit at all.)
+
+## Usage
+
+```bash
+dflash_server model.gguf --max-ctx 32768 --kvflash auto # one flag; drafter policy if a drafter is found, else LRU
+dflash_server model.gguf --max-ctx 32768 --kvflash auto \
+ --prefill-drafter qwen3-0.6b.gguf # drafter-scored residency
+dflash_server model.gguf --max-ctx 32768 --kvflash 8192 # explicit pool size
+```
+
+Drafter-scored residency is the DEFAULT policy on every model family:
+the server probes for `Qwen3-0.6B-BF16.gguf` next to the model (same
+dir, `drafter/`, `draft/`, then `/opt/lucebox/models/drafter/`) and
+lazy-loads it on the first reselect; `--prefill-drafter` overrides the
+location, prefill compression can stay off either way. Qwen-family
+targets feed the drafter their ids directly; laguna and gemma4 bridge
+the tokenizer gap with `KvFlashCrossTokScorer` (relevance is a property
+of the TEXT, so the target's history is detokenized, re-tokenized for
+the drafter, scored, and mapped back to chunk boundaries by character
+spans). LRU is the fallback when no drafter is found (the banner says
+which policy you got) or the explicit choice via `--kvflash-policy lru`.
+`auto` sizes the pool from the GPU, not a fixed fraction: half of the
+free VRAM left after weights (minus a reserve for compute buffers and
+the drafter), converted at the model's KV density, capped where decode
+speed stays near the flat optimum (16384 tokens by default,
+`DFLASH_KVFLASH_MAX_POOL` to override) and at `--max-ctx`. Bigger pools
+mean more resident chunks and fewer forced evictions of useful context;
+the cap keeps the per-step KV read small enough that decode stays near
+the small-pool speed.
+
+- `--kvflash <N|auto>`: resident pool size (rounded to 256; clamped to
+ `--max-ctx`; floored at the protected minimum — 512 for qwen-family and
+ gemma4, larger on laguna where the SWA window stays resident — so
+ eviction always has a victim). Env: `DFLASH_KVFLASH`.
+- `--kvflash-tau <N>`: reselect interval floor (default 64; the effective
+ interval grows with history so rescore overhead stays ~15% of decode).
+ Env: `DFLASH_KVFLASH_TAU`.
+
+Sizing rule: without a drafter, pool >= prompt + generation headroom
+(LRU is recency-only memory — an undersized pool can evict the question
+itself). With pflash's drafter attached, 25% of the expected context is a
+conservative default and 6-9% is measured safe for retrieval workloads.
+
+## Model support
+
+`--kvflash` works on every architecture the daemon serves:
+
+| arch | models | decode path | policy | notes |
+|---|---|---|---|---|
+| qwen35 | Qwen3.5/3.6-27B | masked set_rows decode + slot-mapped spec verify | LRU or pflash drafter | reference integration; all RESULTS.md numbers |
+| qwen35moe | Qwen3.6-35B-A3B | pipelined hybrid decode (Spark) + all-GPU | LRU or pflash drafter | maskless pool span (zero-row approximation, same as production padding); hybrid spec falls back to AR |
+| laguna | Laguna-XS.2 | single-graph hybrid + all-GPU, slot-space full+SWA masks | LRU or drafter (cross-tok, untuned) | pager covers all 40 layers; protected tail >= sliding_window keeps SWA exact |
+| gemma4 | Gemma4 26B-A4B / 31B | masked decode + slot-mapped spec verify, slot-space full mask | LRU or drafter (cross-tok, untuned) | pools FULL-attention layers only (SWA layers already ring-buffer) |
+
+Non-qwen targets use the cross-tokenizer scorer (detokenize target ids,
+re-tokenize for the drafter, score, map back by char spans); the
+`KvFlashScorer` seam stays open for native indexers.
+
+## How it works
+
+- **Pool**: attention KV tensors are allocated at pool size; a pager maps
+ logical positions to slots at 64-token chunk granularity. Cold chunks
+ move to a host backing store (~0.6 ms/chunk) and return bit-exact.
+- **Mask**: attention spans the pool with a slot-validity mask, uploaded
+ before every compute. Exact, and free (25.10 vs 25.52 ms/step maskless).
+- **Reselect**: every tau decoded tokens the scorer re-ranks all chunks
+ (resident or host-backed) and `reselect()` repages the pool — the
+ lookahead loop from FlashMemory (arXiv 2606.09079), with the pflash
+ drafter standing in for their trained indexer, and a hard capacity cap
+ their threshold mechanism lacks.
+- **Spec decode**: chain-mode verify is slot-mapped (per-token
+ `kv_write_rows` + slot-space mask); rejected drafts need no rollback —
+ their slots are excluded by the validity rule until rewritten.
+ Acceptance parity with the full cache (15.4-15.6% vs 15.3%), with or
+ without the --ddtree configuration (fast rollback only snapshots
+ DeltaNet state, which is never pooled).
+- **Prefill**: prompts larger than the pool prefill in 64-token chunks at
+ constant VRAM (linear time; 256K in ~5.9 min on the 3090).
+
+Quality verdict (harness ground truth, base-vs-base control included):
+full results in [RESULTS.md](RESULTS.md). Outputs are not guaranteed
+byte-identical to the full cache on long generations (the masked kernel
+path rounds differently — a different deterministic lineage), but
+correctness is identical: 32/32 vs 32/32 across HumanEval, GSM, MATH, and
+agent suites.
+
+## Files
+
+- `server/src/common/kvflash_pager.h` — pool, page table, host store, reselect
+- `server/src/common/kvflash_scorer.h` — chunk-relevance policy interface
+- `server/src/qwen3/qwen3_kvflash_scorer.{h,cpp}` — pflash-drafter scorer
+ (tail attention; bisects on allocation pressure)
+- `server/src/qwen35/*` — cache `ctx_alloc`, masked pooled decode, slot-mapped
+ spec verify, daemon flags
+- `server/test/test_kvflash.cpp` — verification suite (A-F), `--niah`,
+ `--niah256`, `--longab`
+- [DESIGN.md](DESIGN.md) — mechanism details and tuning notes
diff --git a/optimizations/kvflash/RESULTS.md b/optimizations/kvflash/RESULTS.md
new file mode 100644
index 000000000..513412311
--- /dev/null
+++ b/optimizations/kvflash/RESULTS.md
@@ -0,0 +1,127 @@
+# KVFlash — measured results
+
+All numbers: single RTX 3090 (24 GB), Qwen3.6-27B Q4_K_M target, Q8_0 KV,
+Qwen3-0.6B pflash drafter as the scorer. June 2026, `test_kvflash` +
+`dflash_server` + `harness/benchmarks`.
+
+## End-to-end long-prompt A/B (`--longab`; needle depth 0.25, 240-token timed free run)
+
+| context | mode | prefill | decode tok/s | needle /16 | KV in VRAM (Q8_0) |
+|---|---|---|---|---|---|
+| 32K | full | 47.2 s | 32.8 | 16 | 576 MiB |
+| 32K | KVFlash 4K | 41.8 s | 29.0 | 15 | 72 MiB |
+| 64K | full | 130.6 s | 27.8 | 16 | 1152 MiB |
+| 64K | KVFlash 4K | 87.5 s | **38.6** | 14 | **72 MiB** |
+| 128K | full | 335.9 s | 19.6 | 16 | 2304 MiB |
+| 128K | KVFlash 4K | 177.8 s | **38.6** | 14 | **72 MiB** |
+| 256K | full | 999.0 s | 13.1 | 16 | 4608 MiB |
+| 256K | KVFlash 4K | **354.9 s** | **38.6** | 15 | **72 MiB** |
+
+Decode is flat at 38.6 tok/s from 64K to native-max 256K (speedups 1.4x /
+2.0x / 2.9x); prefill speedups 1.5x / 1.9x / 2.8x. One drafter rescore per
+query: 9-70 s scaling with context (bisected above the drafter's ~65K
+single-pass ceiling).
+
+Note on the 256K full-cache row: it fits the 24 GB card only because the
+KV is Q8_0 (~15.3 GiB weights + 4.6 GiB KV ~ 21 GiB, measured, no OOM).
+With F16 KV the cache alone is 9.2 GiB and 256K does NOT fit; KVFlash is
+indifferent (72 MiB resident either way).
+
+## Retrieval quality vs residency (synthetic NIAH, teacher-forced /16)
+
+| context | residency | LRU (d=10/50/90%) | drafter (d=10/50/90%) | full control |
+|---|---|---|---|---|
+| 8K | 25% | 0 / 0 / 16 | 15 / 15 / 16 | 16/16 |
+| 8K | 9% | 0 / 0 / 0 | 15 / 15 / 15 | 16/16 |
+| 32K | 25% | 0 / 0 / 16 | 15 / 15 / 16 | 16/16 |
+| 32K | 9% | 0 / 0 / 0 | 15 / 15 / 15 | 15-16/16 |
+| 256K | 6.25% | 0 (d=0.5); 16/16 in-window | 14 / 15 / 15 | (in-window LRU = control) |
+
+Drafter-scored residency retains 88-100% of perfect recall at every depth
+down to 6-9% residency; recency-only LRU retains zero outside its tail
+window (mirrors FlashMemory's Recency-Only ablation).
+
+## Harness ground truth (pool sized per the heuristic, vs full cache)
+
+| suite | baseline pass | KVFlash pass | exact text match |
+|---|---|---|---|
+| HumanEval | 10/10 | **10/10** | 10/10 |
+| GSM | 10/10 | **10/10** | 8/10 |
+| MATH | 10/10 | **10/10** | 4/10 |
+| agent (to 24K prompts) | 6/6 | **6/6** | 2/6 |
+
+Base-vs-base control: 16/16 byte-identical — the stack is deterministic.
+Text drift under KVFlash is the masked decode kernel's different (equally
+deterministic) rounding lineage, not noise and not a correctness effect.
+
+## Spec decode (slot-mapped verify, daemon)
+
+| config | accept rate | avg_commit | output |
+|---|---|---|---|
+| qwen35 full cache, 2400 tok | 15.3% | 3.45 | coherent |
+| qwen35 KVFlash 2K, 1800 tok | 15.4% | 3.47 | coherent |
+| qwen35 KVFlash 2K, 2400 tok (live eviction mid-spec) | 15.6% | 3.49 | coherent |
+| qwen35 --ddtree full cache, 600 tok | 13.9% | 3.23 | coherent |
+| qwen35 --ddtree KVFlash 2K, 600 tok | 14.6% | 3.33 | coherent |
+| gemma4 full cache, 600 tok | 13.1% (407/3104) | 3.09 | coherent |
+| gemma4 KVFlash 2K, 600 tok | 13.1% (407/3104) | 3.09 | identical text to full |
+| qwen35moe A3B all-GPU --ddtree full cache, 500 tok | 11.5% | 2.84 | coherent |
+| qwen35moe A3B all-GPU --ddtree KVFlash 2K, 500 tok | 10.4% | 2.66 | coherent |
+
+## Microbenchmarks
+
+- Memory at 128K-logical: attn-KV 2304 -> 18 MiB (99.2%) with a 1K pool;
+ whole cache buffer 2654 -> 218 MiB, confirmed via VRAM deltas.
+- Exact slot mask is free: 25.10 ms/step masked vs 25.52 maskless.
+- Paging: page_out p50 1.27 ms / page_in 0.64 ms per 64-token chunk
+ (~2.2 MiB, synchronous); ~0.01 ms/token amortized at observed rates.
+- reselect() repaging 20 chunks: 21.3 ms.
+- Relocation equivalence (shuffled physical placement, teacher-forced
+ 1200 tokens): ~99% argmax agreement; page_out/page_in roundtrip
+ bit-exact.
+
+## Multi-architecture smokes (pool 1024, --max-ctx 8192, ~1235 logical tokens, live LRU eviction mid-request, RTX 3090)
+
+| arch | model | mode | decode tok/s | output |
+|---|---|---|---|---|
+| qwen35 | Qwen3.6-27B Q4_K_M | all-GPU, masked pool | 37.4 | coherent |
+| qwen35moe | Qwen3.6-35B-A3B UD-Q4_K_M | Spark hybrid (9403 hot / 837 cold experts), pipelined decode | 101.6 | coherent |
+| laguna | Laguna-XS.2 Q4_K_M | Spark hybrid, single-graph decode, slot-space full+SWA masks | 137.1 | coherent |
+| gemma4 | Gemma4 26B-A4B UD-Q4_K_M | all-GPU, slot-space full mask, SWA rings untouched | 119.0 | coherent |
+
+Gemma4 control on the same build without the flag: 120.2 tok/s, no
+kvflash code engaged — the default path is unchanged.
+
+## Cross-tokenizer scorer (laguna/gemma4) — early result
+
+Stress A/B on gemma4 26B-A4B (pool 1024, needle at pos ~170, recital
+demanded ~1700 generated tokens later, beyond the SWA ring and the pool):
+LRU never recites and degenerates into filler repetition; the cross-tok
+drafter stays coherent for 1.9K tokens, reaches the recital, and recalls
+the correct prefix but not the exact code. Strictly better than LRU,
+not yet at the qwen-native scorer's 14-16/16; treat as functional but
+untuned (follow-up: teacher-forced NIAH harness for non-qwen archs,
+tail-window/normalization tuning).
+
+## Known limits
+
+- qwen35moe `--spark` (hybrid expert offload) speculative decode crashes
+ with a CUDA illegal-memory-access — a pre-existing bug in the hybrid
+ spec path (`do_hybrid_spec_decode`), independent of KVFlash (it crashes
+ with the full cache too). It was never exercisable before because no
+ A3B DFlash draft could be converted; the converter fix in this branch
+ now loads them, surfacing the crash. Tracked separately; `--spark`
+ spec falls back to pipelined AR under KVFlash. All-GPU MoE spec decode
+ (experts resident, no `--spark`) works on the pool — see the spec table.
+
+
+- The harness-only tree-verify graphs (test_dflash) are not pool-aware;
+ the daemon's spec decode, including the --ddtree configuration (chain
+ verify + fast rollback), runs fully on the pool.
+- Post-generation snapshots are skipped once cur_pos exceeds the pool
+ (pooled snapshots need page-table serialization).
+- Paging is synchronous (copy-stream overlap is a follow-up).
+- Memory-dense tasks needing the entire context at once (MRCR-style) are
+ a paradigm limit shared with FlashMemory; size the pool up for those.
+- 512K+ requires RoPE scaling (model native max is 256K) — memory-side
+ KVFlash already scales (host backing is the only growth).
diff --git a/optimizations/kvflash/hero.png b/optimizations/kvflash/hero.png
new file mode 100644
index 000000000..3fb3ce50e
--- /dev/null
+++ b/optimizations/kvflash/hero.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee1577f6ef97b030430041266532d39828749e1ef5868f58a0335955dcad9e7c
+size 2255374
diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
index 1ea6fd3fa..05d5add15 100644
--- a/server/CMakeLists.txt
+++ b/server/CMakeLists.txt
@@ -219,6 +219,7 @@ add_library(dflash_common STATIC
src/draft/draft_safetensors_loader.cpp
src/draft/draft_graph.cpp
src/qwen3/qwen3_drafter.cpp
+ src/qwen3/qwen3_kvflash_scorer.cpp
src/qwen3/qwen3_loader.cpp
src/qwen3/qwen3_graph.cpp
src/qwen3/qwen3_backend.cpp
@@ -724,6 +725,11 @@ if(DFLASH27B_TESTS)
target_include_directories(test_generate PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
target_link_libraries(test_generate PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
endif()
+    # Build the test_kvflash executable only when its source file exists in
+    # this tree; it links the same libraries as the neighbouring test targets.
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_kvflash.cpp")
+        add_executable(test_kvflash test/test_kvflash.cpp)
+        target_include_directories(test_kvflash PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
+        target_link_libraries(test_kvflash PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
+    endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_restore_delta.cpp")
add_executable(test_restore_delta test/test_restore_delta.cpp)
target_include_directories(test_restore_delta PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
diff --git a/server/scripts/convert_dflash_to_gguf.py b/server/scripts/convert_dflash_to_gguf.py
index fae1be7e5..106c04540 100644
--- a/server/scripts/convert_dflash_to_gguf.py
+++ b/server/scripts/convert_dflash_to_gguf.py
@@ -39,7 +39,14 @@
import gguf
# ──────────────────────────────────────────────────────────────────────
-# DFlash 27B draft architecture constants
+# DFlash draft architecture constants — DEFAULTS ONLY.
+#
+# These are the qwen35-27B draft's values; they are used as a fallback when
+# the source model has no config.json. Any other draft (A3B, gemma, ...) has
+# a different head/dim/layer config, so the real scalars are read from the
+# source config.json + derived from the tensor shapes in load_arch(). A
+# converter that hardcoded these silently produced GGUFs with correct
+# weights but 27B metadata, which the strict draft loader then rejected.
# ──────────────────────────────────────────────────────────────────────
ARCH = "qwen35-dflash-draft"
@@ -50,7 +57,7 @@
HEAD_DIM = 128
INTERMEDIATE = 17408
VOCAB = 248320
-N_TARGET_LAYERS = 5 # fc projects 5*hidden -> hidden
+N_TARGET_LAYERS = 5 # fc projects N_TARGET_LAYERS*hidden -> hidden
ROPE_THETA = 1_000_000.0
RMS_EPS = 1e-6
MASK_TOKEN_ID = 248070
@@ -58,6 +65,89 @@
CTX_LEN = 32768
+def load_arch(safetensors: Path, header: dict) -> dict:
+ """Resolve the draft's architecture scalars. config.json (next to the
+ safetensors) is authoritative for the transformer hparams; the tensor
+ shapes are authoritative for the rest, so the result always matches the
+ weights even when config.json is partial or absent."""
+ a = dict(hidden=HIDDEN, n_layer=N_LAYER, n_head=N_HEAD, n_head_kv=N_HEAD_KV,
+ head_dim=HEAD_DIM, intermediate=INTERMEDIATE, vocab=VOCAB,
+ n_target_layers=N_TARGET_LAYERS, rope_theta=ROPE_THETA,
+ rms_eps=RMS_EPS, mask_token_id=MASK_TOKEN_ID, block_size=BLOCK_SIZE,
+ ctx_len=CTX_LEN)
+
+ cfg_path = safetensors.parent / "config.json"
+ if cfg_path.exists():
+ c = json.loads(cfg_path.read_text())
+ def pick(*keys):
+ for k in keys:
+ if k in c and c[k] is not None:
+ return c[k]
+ return None
+ for dst, val in (
+ ("hidden", pick("hidden_size")),
+ ("n_layer", pick("num_hidden_layers")),
+ ("n_head", pick("num_attention_heads")),
+ ("n_head_kv", pick("num_key_value_heads")),
+ ("head_dim", pick("head_dim")),
+ ("intermediate", pick("intermediate_size")),
+ ("vocab", pick("vocab_size")),
+ ("rope_theta", pick("rope_theta")),
+ ("rms_eps", pick("rms_norm_eps")),
+ ("n_target_layers", pick("n_target_layers", "num_target_layers")),
+ ("mask_token_id", pick("mask_token_id")),
+ ("block_size", pick("block_size", "draft_block_size")),
+ ("ctx_len", pick("max_position_embeddings")),
+ ):
+ if val is not None:
+ a[dst] = val
+ print(f"[info] read arch from {cfg_path}")
+ else:
+ print(f"[warn] no config.json next to safetensors; using 27B defaults")
+
+ # Weights are ground truth — derive/verify from tensor shapes.
+ def shape_of(st_name):
+ e = header.get(st_name)
+ return e["shape"] if e else None
+
+ # hidden absent in config: k-proj is [n_head_kv*head_dim, hidden] -> ne[1].
+ k0 = shape_of("layers.0.self_attn.k_proj.weight")
+ if (not cfg_path.exists()) and k0:
+ a["hidden"] = k0[1]
+ # head_dim absent in config: derive from k-proj (n_head_kv * head_dim).
+ if k0 and a["n_head_kv"]:
+ derived_hd = k0[0] // a["n_head_kv"]
+ if not cfg_path.exists() or "head_dim" not in json.loads(cfg_path.read_text() if cfg_path.exists() else "{}"):
+ a["head_dim"] = derived_hd
+ # intermediate: ffn gate/up is [intermediate, hidden] — ne[0].
+ g0 = shape_of("layers.0.mlp.gate_proj.weight")
+ if g0:
+ a["intermediate"] = g0[0]
+ # n_target_layers: fc.weight is [hidden, n_target*hidden]; ne[0] (the
+ # larger dim) / hidden is the capture count the loader checks.
+ fc = shape_of("fc.weight")
+ if fc and a["hidden"]:
+ a["n_target_layers"] = max(fc) // a["hidden"]
+ # n_layer: count the actual blocks present.
+ n_blocks = 1 + max((int(n.split(".")[1]) for n in header
+ if n.startswith("layers.") and n.split(".")[1].isdigit()),
+ default=a["n_layer"] - 1)
+ a["n_layer"] = n_blocks
+
+ # Consistency check against the k-proj weight.
+ if k0:
+ exp_kv = a["n_head_kv"] * a["head_dim"]
+ if exp_kv != k0[0]:
+ print(f"[error] config n_head_kv*head_dim={exp_kv} != "
+ f"k_proj.weight dim {k0[0]}; fix config.json", file=sys.stderr)
+ sys.exit(1)
+ print(f"[info] arch: hidden={a['hidden']} n_layer={a['n_layer']} "
+ f"n_head={a['n_head']} n_head_kv={a['n_head_kv']} "
+ f"head_dim={a['head_dim']} ff={a['intermediate']} vocab={a['vocab']} "
+ f"n_target_layers={a['n_target_layers']}")
+ return a
+
+
# ──────────────────────────────────────────────────────────────────────
# Tensor name mapping — DFlash safetensors -> llama.cpp GGUF
# ──────────────────────────────────────────────────────────────────────
@@ -155,29 +245,30 @@ def main():
n_entries = sum(1 for k in header if k != "__metadata__")
print(f"[info] {n_entries} tensor entries")
+ a = load_arch(args.safetensors, header)
+
writer = gguf.GGUFWriter(args.out_gguf, ARCH)
- # Architecture metadata
- writer.add_string("general.name", "Qwen3.5-27B-DFlash-Draft")
- writer.add_uint32(f"{ARCH}.context_length", CTX_LEN)
- writer.add_uint32(f"{ARCH}.embedding_length", HIDDEN)
- writer.add_uint32(f"{ARCH}.block_count", N_LAYER)
- writer.add_uint32(f"{ARCH}.feed_forward_length", INTERMEDIATE)
- writer.add_uint32(f"{ARCH}.attention.head_count", N_HEAD)
- writer.add_uint32(f"{ARCH}.attention.head_count_kv", N_HEAD_KV)
- # llama.cpp uses key_length / value_length to override the default
- # n_embd_head = n_embd / n_head heuristic (DFlash has n_embd=5120
- # but head_dim=128 so n_head*head_dim=4096 != n_embd).
- writer.add_uint32(f"{ARCH}.attention.key_length", HEAD_DIM)
- writer.add_uint32(f"{ARCH}.attention.value_length", HEAD_DIM)
- writer.add_uint32(f"{ARCH}.vocab_size", VOCAB)
- writer.add_float32(f"{ARCH}.attention.layer_norm_rms_epsilon", RMS_EPS)
- writer.add_float32(f"{ARCH}.rope.freq_base", ROPE_THETA)
+ # Architecture metadata (resolved from config.json + tensor shapes)
+ writer.add_string("general.name", f"DFlash-Draft-{a['hidden']}h-{a['n_layer']}L")
+ writer.add_uint32(f"{ARCH}.context_length", a["ctx_len"])
+ writer.add_uint32(f"{ARCH}.embedding_length", a["hidden"])
+ writer.add_uint32(f"{ARCH}.block_count", a["n_layer"])
+ writer.add_uint32(f"{ARCH}.feed_forward_length", a["intermediate"])
+ writer.add_uint32(f"{ARCH}.attention.head_count", a["n_head"])
+ writer.add_uint32(f"{ARCH}.attention.head_count_kv", a["n_head_kv"])
+ # key_length / value_length override the n_embd/n_head heuristic, which
+ # is wrong for DFlash drafts (n_head*head_dim != n_embd).
+ writer.add_uint32(f"{ARCH}.attention.key_length", a["head_dim"])
+ writer.add_uint32(f"{ARCH}.attention.value_length", a["head_dim"])
+ writer.add_uint32(f"{ARCH}.vocab_size", a["vocab"])
+ writer.add_float32(f"{ARCH}.attention.layer_norm_rms_epsilon", a["rms_eps"])
+ writer.add_float32(f"{ARCH}.rope.freq_base", a["rope_theta"])
# DFlash-specific hyperparameters
- writer.add_uint32(f"{ARCH}.dflash.n_target_layers", N_TARGET_LAYERS)
- writer.add_uint32(f"{ARCH}.dflash.block_size", BLOCK_SIZE)
- writer.add_uint32(f"{ARCH}.dflash.mask_token_id", MASK_TOKEN_ID)
+ writer.add_uint32(f"{ARCH}.dflash.n_target_layers", a["n_target_layers"])
+ writer.add_uint32(f"{ARCH}.dflash.block_size", a["block_size"])
+ writer.add_uint32(f"{ARCH}.dflash.mask_token_id", a["mask_token_id"])
# Walk + add tensors. Sort: dflash.* singletons first, then output_*,
# then per-layer in numeric order — keeps the on-disk layout stable.
diff --git a/server/src/common/kvflash_pager.h b/server/src/common/kvflash_pager.h
new file mode 100644
index 000000000..1b4679db9
--- /dev/null
+++ b/server/src/common/kvflash_pager.h
@@ -0,0 +1,548 @@
+// KvFlashPager — KVFlash core: a bounded resident pool for the
+// full-attention KV cache (see optimizations/kvflash/).
+//
+// Lookahead-sparse-attention-style (FlashMemory, arXiv 2606.09079)
+// decode-time KV residency for the qwen35 target: the cache tensors are
+// allocated at POOL size (a fraction of the logical context), and this
+// class owns the mapping from logical token positions to physical pool
+// slots. Chunks (64 logical tokens) that fall cold are paged out to a
+// host backing store and their slots are reused; paged-out chunks remain
+// recallable bit-exact. GPU footprint is a hard O(pool) bound regardless
+// of logical context length.
+//
+// Policy-agnostic by design: with no scorer, eviction is LRU over
+// unprotected chunks (recency-only memory). A KvFlashScorer plugged into
+// `score_hook` upgrades eviction and reselect() to relevance-driven
+// residency; with pflash enabled, its drafter attaches automatically
+// (KvFlashDrafterScorer) and recalls cold context the generation needs.
+//
+// Correctness notes (why relocating rows is legal):
+// * RoPE is baked into K rows at write time from the `positions` input,
+// so a row's physical slot is semantically irrelevant.
+// * Attention runs over the whole pool with a slot-validity mask
+// (resident = 0, free/paged-out = -inf). The mask must be re-uploaded
+// before EVERY compute: input tensors live in the gallocr compute
+// buffer whose regions are reused during graph execution.
+// * Freed slots are additionally zeroed (defense in depth; a zero K row
+// contributes exp(-max) ~ 0, the same assumption the production
+// stride-256 padded span relies on in maskless mode).
+// * The FWHT K-rotation and KV quantization operate per-row; page-out /
+// page-in moves raw quantized bytes and is therefore bit-exact.
+//
+// Scope: full-attention layers only. DeltaNet/conv recurrent state is
+// fixed-size, position-dependent in-place state and is never paged.
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <functional>
+#include <string>
+#include <vector>
+
+namespace dflash::common {
+
+// Paging geometry and eviction protections for one KvFlashPager.
+struct KvFlashConfig {
+    // Logical tokens per page.
+    int chunk_tokens = 64;
+    // Resident pool capacity (multiple of chunk_tokens); 0 = not configured.
+    int pool_tokens = 0;
+    // Leading chunks never evicted (attention sinks).
+    int sink_chunks = 1;
+    // Trailing chunks never evicted (local window).
+    int tail_window_chunks = 4;
+};
+
+// Paging counters reported by KvFlashPager::stats().
+struct KvFlashStats {
+    // Number of chunks paged out.
+    int64_t page_outs = 0;
+    // Number of chunks recalled from the host backing store.
+    int64_t page_ins = 0;
+    // Backing store currently held on host.
+    int64_t host_bytes = 0;
+    // Cumulative D2H+H2D traffic.
+    int64_t moved_bytes = 0;
+};
+
+class KvFlashPager {
+public:
+    // Minimum pool for a config: sinks + trailing window stay resident
+    // unconditionally, so at least 2 more chunks are required (1 evictable
+    // victim + the partially filled append head) or eviction deadlocks and
+    // slot_for() starts failing once the pool fills.
+    static int min_pool_tokens(const KvFlashConfig & cfg) {
+        return (cfg.sink_chunks + cfg.tail_window_chunks + 2) * cfg.chunk_tokens;
+    }
+
+    // Bind the pager to the cache tensors and (re)initialize its state.
+    // `attn_k` / `attn_v` are the per-full-attention-layer cache tensors,
+    // each [head_dim, pool_tokens, n_head_kv]. All must share dims/types
+    // within their K/V group.
+    //
+    // Returns false, leaving the pager exactly as it was, when the config
+    // is unusable (non-positive chunk or pool size, pool not a multiple of
+    // the chunk size, pool below min_pool_tokens()), when the tensor lists
+    // are empty or of different length, or when the first K tensor has
+    // fewer rows than the pool.
+    bool attach(const KvFlashConfig & cfg,
+                const std::vector<ggml_tensor *> & attn_k,
+                const std::vector<ggml_tensor *> & attn_v) {
+        // chunk_tokens is used as a divisor below: reject it first.
+        if (cfg.chunk_tokens <= 0) return false;
+        if (cfg.pool_tokens <= 0 || cfg.pool_tokens % cfg.chunk_tokens != 0) return false;
+        if (cfg.pool_tokens < min_pool_tokens(cfg)) {
+            std::fprintf(stderr,
+                "kvflash: pool %d < minimum %d (%d sink + %d tail chunks must "
+                "leave an evictable block)\n",
+                cfg.pool_tokens, min_pool_tokens(cfg),
+                cfg.sink_chunks, cfg.tail_window_chunks);
+            return false;
+        }
+        if (attn_k.empty() || attn_k.size() != attn_v.size()) return false;
+        const ggml_tensor * K0 = attn_k[0];
+        const ggml_tensor * V0 = attn_v[0];
+        if ((int)K0->ne[1] < cfg.pool_tokens) return false;
+
+        // Every check passed: mutate state only from here on, so a failed
+        // attach() can never leave attached() reporting true.
+        cfg_       = cfg;
+        attn_k_    = attn_k;
+        attn_v_    = attn_v;
+        n_blocks_  = cfg.pool_tokens / cfg.chunk_tokens;
+        n_head_kv_ = (int)K0->ne[2];
+
+        // Per-(tensor, head) contiguous segment of chunk_tokens rows.
+        k_seg_bytes_ = (size_t)cfg.chunk_tokens * K0->nb[1];
+        v_seg_bytes_ = (size_t)cfg.chunk_tokens * V0->nb[1];
+        chunk_bytes_ = (k_seg_bytes_ + v_seg_bytes_) * (size_t)n_head_kv_ * attn_k.size();
+        zero_buf_.assign(std::max(k_seg_bytes_, v_seg_bytes_), 0);
+
+        // Descending fill: back()/pop_back() hands out block 0 first.
+        free_blocks_.clear();
+        for (int b = n_blocks_ - 1; b >= 0; b--) free_blocks_.push_back(b);
+        chunks_.clear();
+        stats_ = {};
+        clock_ = 0;
+        // Same as reset(): a re-attach must not inherit the previous append
+        // head (it drives the protected tail window), and cached masks
+        // keyed on the epoch must refill.
+        cur_chunk_ = 0;
+        epoch_++;
+        return true;
+    }
+
+    // Optional: custom block hand-out order (e.g. shuffled placement in
+    // relocation tests). `order[i]` = i-th block to hand out. The list is
+    // stored reversed because blocks are taken from the back of the free
+    // list. The contents are not validated here.
+    void set_block_order(const std::vector<int> & order) {
+        free_blocks_.assign(order.rbegin(), order.rend());
+    }
+
+    // Forget every chunk mapping and its host backing (new request / cache
+    // reset). Cumulative counters survive; only host_bytes is cleared. The
+    // epoch is bumped so callers holding a cached mask refill it.
+    void reset() {
+        chunks_.clear();
+        cur_chunk_        = 0;
+        stats_.host_bytes = 0;
+        // Descending refill: the free list is consumed from the back, so
+        // block 0 is handed out first.
+        free_blocks_.clear();
+        for (int b = n_blocks_; b-- > 0; ) free_blocks_.push_back(b);
+        ++epoch_;
+    }
+
+    // Overwrite every block on the free list with zeros. reset() only drops
+    // mappings, so the previous request's bytes are still in the tensors;
+    // maskless consumers (the qwen35moe pipelined decode reads the whole
+    // padded pool span with no slot mask) need stale rows to dequantise to
+    // ~zero contribution. Masked consumers don't need this but it is cheap
+    // (pool-sized memset, sub-ms).
+    void zero_free_blocks() {
+        for (size_t i = 0; i < free_blocks_.size(); ++i) {
+            zero_block(free_blocks_[i]);
+        }
+    }
+
+    // True once attach() has succeeded (the pool has at least one block).
+    bool attached()     const { return n_blocks_ > 0; }
+    // Pool capacity in tokens, as configured at attach().
+    int  pool_tokens()  const { return cfg_.pool_tokens; }
+    // Logical tokens per chunk, as configured at attach().
+    int  chunk_tokens() const { return cfg_.chunk_tokens; }
+
+    // Optional external relevance score: called with a chunk index, returns
+    // its score; higher = keep. When empty, eviction falls back to LRU and
+    // reselect() is a no-op.
+    std::function<float(int)> score_hook;
+
+    // Reserve pool slots for the logical positions [kv_start, kv_start +
+    // n_tok) before a forward step, evicting LRU/low-score chunks as needed.
+    // Stops at the first position that cannot get a slot, prints a
+    // diagnostic and returns false; returns true otherwise (including for
+    // n_tok <= 0, where nothing is allocated).
+    bool alloc_span(int kv_start, int n_tok) {
+        int done = 0;
+        while (done < n_tok) {
+            const int pos = kv_start + done;
+            if (slot_for(pos) < 0) {
+                std::fprintf(stderr, "[kvflash] no pool slot at pos %d "
+                                     "(pool %d exhausted)\n",
+                             pos, cfg_.pool_tokens);
+                return false;
+            }
+            ++done;
+        }
+        return true;
+    }
+
+    // Physical pool slot for logical position `pos`. Allocates (and, when
+    // the pool is full, evicts) at chunk granularity. Call once per
+    // appended token, in logical order.
+    // Returns -1 when `pos` is negative or when no block can be freed.
+    int slot_for(int64_t pos) {
+        // A negative position would produce a negative chunk index /
+        // in-chunk offset and index outside chunks_.
+        if (pos < 0) return -1;
+        const int c = (int)(pos / cfg_.chunk_tokens);
+        // cur_chunk_ tracks the append head only; a page_in of an older
+        // chunk must not shrink the protected tail window. It must advance
+        // BEFORE eviction (so the victim search protects the new tail), but
+        // a failed allocation must roll it back or the next eviction's tail
+        // window is computed from a chunk that never materialized.
+        const int prev_cur_chunk = cur_chunk_;
+        if (c > cur_chunk_) cur_chunk_ = c;
+        if ((int)chunks_.size() <= c) chunks_.resize(c + 1);
+        ChunkState & st = chunks_[c];
+        if (st.block < 0) {
+            if (!ensure_free_block()) {
+                cur_chunk_ = prev_cur_chunk;
+                return -1;
+            }
+            st.block = free_blocks_.back();
+            free_blocks_.pop_back();
+            epoch_++;
+            if (st.on_host) {  // recall: restore paged-out bytes
+                copy_chunk(c, st.block, /*to_host=*/false);
+                stats_.page_ins++;
+                stats_.moved_bytes += chunk_bytes_;
+            }
+        }
+        // Every lookup through slot_for() counts as a use for LRU.
+        st.last_use = ++clock_;
+        return st.block * cfg_.chunk_tokens + (int)(pos % cfg_.chunk_tokens);
+    }
+
+    // Force a chunk out of the pool (host backing + zeroed slots).
+    // Returns false when `c` is out of range or the chunk is not resident.
+    bool page_out(int c) {
+        if (c < 0 || c >= (int)chunks_.size() || chunks_[c].block < 0) return false;
+        ChunkState & st = chunks_[c];
+        if (!st.on_host) {
+            // First page-out of this chunk: allocate its host backing. A
+            // chunk that was recalled earlier reuses the existing buffer.
+            st.host_data.resize(chunk_bytes_);
+            stats_.host_bytes += (int64_t)chunk_bytes_;
+        }
+        copy_chunk(c, st.block, /*to_host=*/true);
+        zero_block(st.block);
+        st.on_host = true;
+        free_blocks_.push_back(st.block);
+        st.block = -1;
+        epoch_++;
+        stats_.page_outs++;
+        stats_.moved_bytes += chunk_bytes_;
+        return true;
+    }
+
+    // Recall a chunk into the pool (used by reselect / tests).
+    // Returns false when `c` is out of range, has no host backing, is
+    // already resident, or no pool block could be obtained.
+    bool page_in(int c) {
+        if (c < 0 || c >= (int)chunks_.size()) return false;
+        if (!chunks_[c].on_host || chunks_[c].block >= 0) return false;
+        // slot_for() on the chunk's first position performs the allocation
+        // and the host-to-pool copy.
+        return slot_for((int64_t)c * cfg_.chunk_tokens) >= 0;
+    }
+
+    // True when chunk `c` currently occupies a pool block. Out-of-range
+    // indices (including negative ones) are reported as not resident.
+    bool is_resident(int c) const {
+        return c >= 0 && c < (int)chunks_.size() && chunks_[c].block >= 0;
+    }
+
+    // Reports whether the pool is still in identity layout: each
+    // materialized chunk c lives in block c and no chunk has been paged
+    // out. This is the layout contract identity-copy snapshots rely on; it
+    // holds from reset() until the first eviction of the CURRENT request
+    // (cumulative stats do not).
+    bool is_identity() const {
+        const int n = (int)chunks_.size();
+        for (int c = 0; c < n; ++c) {
+            const ChunkState & st = chunks_[c];
+            if (st.block < 0) {
+                if (st.on_host) return false;  // paged out
+            } else if (st.block != c) {
+                return false;                  // relocated
+            }
+        }
+        return true;
+    }
+    // Pool block holding chunk `c`, or -1 when the chunk is not resident or
+    // `c` is out of range (including negative).
+    int block_of(int c) const {
+        return (c >= 0 && c < (int)chunks_.size()) ? chunks_[c].block : -1;
+    }
+
+    // Const lookup (no alloc / LRU touch): physical slot currently holding
+    // logical `pos`, or -1 if `pos` is negative or its chunk is not
+    // resident. Callers that may need an allocation must use slot_for()
+    // beforehand.
+    int slot_of(int64_t pos) const {
+        if (pos < 0) return -1;
+        const int c = (int)(pos / cfg_.chunk_tokens);
+        if (c >= (int)chunks_.size() || chunks_[c].block < 0) return -1;
+        return chunks_[c].block * cfg_.chunk_tokens + (int)(pos % cfg_.chunk_tokens);
+    }
+
+    // Writes, for every pool slot, the logical position it holds (-1 for
+    // slots of free blocks). `dst` must hold pool_tokens entries. Callers
+    // use this to build masks that need POSITION semantics in slot space
+    // (causal / sliding-window): the mask condition is evaluated on
+    // dst[slot] instead of the column index.
+    void fill_slot_pos(int32_t * dst) const {
+        std::fill_n(dst, cfg_.pool_tokens, (int32_t)-1);
+        const int n = (int)chunks_.size();
+        for (int c = 0; c < n; ++c) {
+            const int blk = chunks_[c].block;
+            if (blk < 0) continue;
+            const size_t  first_slot = (size_t)blk * cfg_.chunk_tokens;
+            const int32_t first_pos  = (int32_t)c * cfg_.chunk_tokens;
+            for (int i = 0; i < cfg_.chunk_tokens; ++i) {
+                dst[first_slot + i] = first_pos + i;
+            }
+        }
+    }
+    // Paging counters (see KvFlashStats).
+    const KvFlashStats & stats() const {
+        return stats_;
+    }
+    // Number of pool blocks currently handed out.
+    int resident_blocks() const {
+        return n_blocks_ - (int)free_blocks_.size();
+    }
+    // Number of chunk entries tracked so far.
+    int n_chunks() const {
+        return (int)chunks_.size();
+    }
+
+    // Residency version counter: advances on every alloc / page_out /
+    // page_in. Callers cache the slot mask and rebuild it only when this
+    // value has moved.
+    uint64_t epoch() const {
+        return epoch_;
+    }
+
+    // Builds the F16 slot-validity mask for one query row: 0 for the slots
+    // of every resident chunk, -inf for free / paged-out blocks. `dst` must
+    // hold pool_tokens entries. Used as the FA mask so non-resident slots
+    // are excluded exactly instead of via the zero-row ~exp(-max)
+    // approximation.
+    void fill_slot_mask(uint16_t * dst) const {
+        // Raw IEEE half-precision bit patterns.
+        constexpr uint16_t F16_ZERO    = 0x0000;
+        constexpr uint16_t F16_NEG_INF = 0xFC00;
+        std::fill_n(dst, cfg_.pool_tokens, F16_NEG_INF);
+        for (const ChunkState & st : chunks_) {
+            if (st.block < 0) continue;
+            std::fill_n(dst + (size_t)st.block * cfg_.chunk_tokens,
+                        cfg_.chunk_tokens, F16_ZERO);
+        }
+    }
+
+    // Lookahead reselect (FlashMemory τ-step): rebuild the resident set as
+    // the top-pool chunks by score_hook among ALL known chunks (resident or
+    // host-backed). Sinks and the trailing window are always kept. Returns
+    // the number of page events (0 when no score_hook is installed). Call
+    // between decode steps.
+    int reselect() {
+        if (!score_hook) return 0;
+        struct Cand { int c; float s; };
+        std::vector<Cand> cands;
+        cands.reserve(chunks_.size());
+        for (int c = 0; c < (int)chunks_.size(); c++) {
+            const ChunkState & st = chunks_[c];
+            if (st.block < 0 && !st.on_host) continue;  // never materialized
+            const bool prot = c < cfg_.sink_chunks ||
+                              c > cur_chunk_ - 1 - cfg_.tail_window_chunks;
+            // Protected chunks get the top score so they always rank first.
+            float s = 3.4e38f;
+            if (!prot) {
+                s = score_hook(c);
+                // A NaN score would break the strict weak ordering that
+                // std::sort requires; rank such chunks last instead.
+                if (std::isnan(s)) s = -3.4e38f;
+            }
+            cands.push_back({c, s});
+        }
+        std::sort(cands.begin(), cands.end(),
+                  [](const Cand & a, const Cand & b) { return a.s > b.s; });
+        // want[c] != 0 -> chunk c should be resident after this call.
+        std::vector<char> want(chunks_.size(), 0);
+        for (int i = 0; i < (int)cands.size() && i < n_blocks_; i++) want[cands[i].c] = 1;
+
+        int events = 0;
+        for (int c = 0; c < (int)chunks_.size(); c++) {  // out first: frees blocks
+            if (!want[c] && chunks_[c].block >= 0) { page_out(c); events++; }
+        }
+        for (int c = 0; c < (int)chunks_.size(); c++) {
+            if (want[c] && chunks_[c].block < 0 && chunks_[c].on_host) {
+                if (page_in(c)) events++;
+            }
+        }
+        return events;
+    }
+
+private:
+    // Per-chunk bookkeeping, indexed by logical chunk number.
+    struct ChunkState {
+        int      block    = -1;     // pool block index, -1 = not resident
+        bool     on_host  = false;  // backing store holds valid bytes
+        uint64_t last_use = 0;      // clock_ value of the last slot_for() hit
+        std::vector<uint8_t> host_data;  // raw bytes written by copy_chunk()
+    };
+
+    // Guarantees the free list is non-empty, paging one chunk out if it has
+    // to. The victim is the unprotected resident chunk with the lowest
+    // score_hook value, or, without a hook, the one used longest ago.
+    // Sink chunks and the trailing window are never considered. Returns
+    // false when no candidate exists or the page-out fails.
+    bool ensure_free_block() {
+        if (!free_blocks_.empty()) return true;
+
+        // Chunks above this index belong to the protected tail window.
+        const int last_evictable = cur_chunk_ - 1 - cfg_.tail_window_chunks;
+        const int n_known        = (int)chunks_.size();
+
+        int      victim = -1;
+        float    lowest = 0.f;
+        uint64_t oldest = 0;
+        for (int c = std::max(0, cfg_.sink_chunks); c < n_known && c <= last_evictable; ++c) {
+            const ChunkState & st = chunks_[c];
+            if (st.block < 0) continue;  // not resident
+            if (score_hook) {
+                const float s = score_hook(c);
+                if (victim < 0 || s < lowest) {
+                    victim = c;
+                    lowest = s;
+                }
+            } else if (victim < 0 || st.last_use < oldest) {
+                victim = c;
+                oldest = st.last_use;
+            }
+        }
+        if (victim < 0) return false;
+        return page_out(victim);
+    }
+
+    // Transfers chunk `c` between pool block `block` and the chunk's host
+    // buffer (pool -> host when `to_host`, host -> pool otherwise). The
+    // host buffer is walked in a fixed segment order (layer-major, K then
+    // V, head-minor) so offsets are stable across calls.
+    void copy_chunk(int c, int block, bool to_host) {
+        uint8_t * cursor = chunks_[c].host_data.data();
+        const size_t first_row = (size_t)block * cfg_.chunk_tokens;
+        for (size_t l = 0; l < attn_k_.size(); ++l) {
+            ggml_tensor * const tensors[2]   = { attn_k_[l], attn_v_[l] };
+            const size_t        seg_bytes[2] = { k_seg_bytes_, v_seg_bytes_ };
+            for (int kv = 0; kv < 2; ++kv) {
+                ggml_tensor * t = tensors[kv];
+                for (int h = 0; h < n_head_kv_; ++h) {
+                    const size_t off = first_row * t->nb[1] + (size_t)h * t->nb[2];
+                    if (to_host) {
+                        ggml_backend_tensor_get(t, cursor, off, seg_bytes[kv]);
+                    } else {
+                        ggml_backend_tensor_set(t, cursor, off, seg_bytes[kv]);
+                    }
+                    cursor += seg_bytes[kv];
+                }
+            }
+        }
+    }
+
+    // Overwrites every K and V segment of pool block `block` with zeros,
+    // one per-head segment at a time, using the shared zero buffer.
+    void zero_block(int block) {
+        const size_t first_row = (size_t)block * cfg_.chunk_tokens;
+        for (size_t l = 0; l < attn_k_.size(); ++l) {
+            ggml_tensor * const tensors[2]   = { attn_k_[l], attn_v_[l] };
+            const size_t        seg_bytes[2] = { k_seg_bytes_, v_seg_bytes_ };
+            for (int kv = 0; kv < 2; ++kv) {
+                ggml_tensor * t = tensors[kv];
+                for (int h = 0; h < n_head_kv_; ++h) {
+                    const size_t off = first_row * t->nb[1] + (size_t)h * t->nb[2];
+                    ggml_backend_tensor_set(t, zero_buf_.data(), off, seg_bytes[kv]);
+                }
+            }
+        }
+    }
+
+ KvFlashConfig cfg_;
+ std::vector attn_k_, attn_v_;
+ std::vector chunks_;
+ std::vector free_blocks_;
+ std::vector zero_buf_;
+ KvFlashStats stats_;
+ size_t k_seg_bytes_ = 0, v_seg_bytes_ = 0, chunk_bytes_ = 0;
+ int n_blocks_ = 0, n_head_kv_ = 0, cur_chunk_ = 0;
+ uint64_t clock_ = 0;
+ uint64_t epoch_ = 0;
+};
+
+// ── Shared backend helpers ─────────────────────────────────────────────
+//
+// Every backend integration needs the same three steps: read the pool size
+// from the env, allocate slots ahead of each forward (alloc_span above),
+// and build slot-space inputs for the graph. The first and last live here
+// so the per-arch code reduces to wiring.
+
+// Inputs for "auto" pool sizing. Backends fill this AFTER the target
+// weights are on the GPU and BEFORE the cache is allocated, so free_bytes
+// reflects what the pool can actually use.
+struct KvFlashAutoBudget {
+    // Device free memory right now.
+    int64_t free_bytes = 0;
+    // Compute buffers + (if expected) drafter.
+    int64_t reserve_bytes = 0;
+    // Pooled attention KV density for this model.
+    int64_t bytes_per_token = 0;
+    // Decode cost grows with the FA span (= the pool), so cap the auto pool
+    // where speed stays near the small-pool point. Measured on the 27B/3090:
+    // 1K pool 39.6 tok/s, 4K 38.7; 16K extrapolates to ~31-33, still 1.7-2.4x
+    // the full cache at 128-256K. Override: DFLASH_KVFLASH_MAX_POOL.
+    int speed_cap_tokens = 16384;
+};
+
+// Pool size from DFLASH_KVFLASH for a backend with `cfg` protections:
+// 0 = off; otherwise rounded to a 256 multiple, floored at
+// min_pool_tokens(cfg) (eviction must keep a victim) and clamped to
+// `max_ctx` (a pool larger than the logical context is meaningless), with
+// warnings on both adjustments. When `max_ctx` is too small to hold even
+// the minimum pool, KVFlash is disabled (returns 0) with a warning rather
+// than returning a pool that eviction cannot work with.
+//
+// The literal value "auto" sizes the pool from the GPU, not from a fixed
+// fraction: take half of (free VRAM - reserve), convert to tokens at the
+// model's KV density, then cap at the speed point and max_ctx. Big pools
+// avoid relevance-crowding (more resident chunks = fewer forced evictions
+// of useful context); the speed cap keeps decode near the flat optimum.
+// Falls back to max_ctx/4 (scorer expected) or /2 (LRU) when the backend
+// supplies no budget.
+//
+// Returns the pool size in tokens, or 0 when KVFlash is off (variable
+// unset, non-positive / unparsable value, or max_ctx too small).
+inline int kvflash_pool_from_env(int max_ctx, const KvFlashConfig & cfg = {},
+                                 bool scorer_expected = false,
+                                 const KvFlashAutoBudget & budget = {}) {
+    const char * env = std::getenv("DFLASH_KVFLASH");
+    if (!env) return 0;
+    int tokens;
+    if (std::strcmp(env, "auto") == 0) {
+        int speed_cap = budget.speed_cap_tokens;
+        if (const char * mp = std::getenv("DFLASH_KVFLASH_MAX_POOL")) {
+            speed_cap = std::max(256, std::atoi(mp));
+        }
+        if (budget.bytes_per_token > 0 && budget.free_bytes > 0) {
+            // Explicit int64_t: the operands mix int and int64_t.
+            const int64_t usable =
+                std::max<int64_t>(0, budget.free_bytes - budget.reserve_bytes) / 2;
+            const int64_t vram_tokens = usable / budget.bytes_per_token;
+            tokens = (int)std::min<int64_t>(vram_tokens,
+                                            std::min(max_ctx, speed_cap));
+            std::fprintf(stderr,
+                "[kvflash] auto pool: %d tokens (free %.1f GiB - reserve %.1f GiB, "
+                "%.1f KiB/token, caps: speed %d / max_ctx %d)\n",
+                tokens, budget.free_bytes / 1073741824.0,
+                budget.reserve_bytes / 1073741824.0,
+                budget.bytes_per_token / 1024.0, speed_cap, max_ctx);
+        } else {
+            tokens = max_ctx / (scorer_expected ? 4 : 2);
+            std::fprintf(stderr, "[kvflash] auto pool: %d tokens (%d%% of max_ctx %d, "
+                                 "no VRAM budget supplied)\n",
+                         tokens, scorer_expected ? 25 : 50, max_ctx);
+        }
+    } else {
+        tokens = std::atoi(env);
+    }
+    if (tokens <= 0) return 0;
+    tokens = ((tokens + 255) / 256) * 256;
+    const int min_tokens   = KvFlashPager::min_pool_tokens(cfg);
+    const int floor_tokens = ((min_tokens + 255) / 256) * 256;
+    if (tokens < floor_tokens) {
+        std::fprintf(stderr, "[kvflash] requested pool %d < minimum %d "
+                             "(%d sink + %d tail chunks must leave an "
+                             "evictable block); raising\n",
+                     tokens, floor_tokens, cfg.sink_chunks, cfg.tail_window_chunks);
+        tokens = floor_tokens;
+    }
+    if (tokens > max_ctx) {
+        std::fprintf(stderr, "[kvflash] requested pool %d > max_ctx %d; clamping "
+                             "(raise --max-ctx for a larger pool)\n",
+                     tokens, max_ctx);
+        tokens = (max_ctx / 256) * 256;
+        // The clamp can undercut the eviction minimum (or reach 0) when
+        // max_ctx itself is smaller than the minimum pool.
+        if (tokens < min_tokens) {
+            std::fprintf(stderr, "[kvflash] max_ctx %d cannot hold the minimum pool %d; "
+                                 "KVFlash disabled (raise --max-ctx)\n",
+                         max_ctx, min_tokens);
+            return 0;
+        }
+    }
+    return tokens;
+}
+
+// Reads the residency policy from DFLASH_KVFLASH_POLICY (--kvflash-policy).
+// Returns true only for the exact value "lru", which forces recency-only
+// paging (no drafter probe, no scorer). Unset or any other value (default
+// "drafter") means scored residency when a drafter is available.
+inline bool kvflash_policy_is_lru() {
+    const char * policy = std::getenv("DFLASH_KVFLASH_POLICY");
+    if (policy == nullptr) return false;
+    return std::strcmp(policy, "lru") == 0;
+}
+
+// Finds the Qwen3-0.6B residency drafter. Search order: the explicit
+// override (DFLASH_KVFLASH_DRAFTER, set from --prefill-drafter), returned
+// as-is; then the well-known locations next to the target model; then the
+// appliance path. Returns "" when the policy is "lru", when `target_path`
+// is null and no override is set, or when no candidate can be opened
+// (callers fall back to LRU, loudly).
+inline std::string kvflash_find_drafter(const char * target_path) {
+    if (kvflash_policy_is_lru()) return "";
+    const char * override_path = std::getenv("DFLASH_KVFLASH_DRAFTER");
+    if (override_path) return override_path;
+    if (!target_path) return "";
+
+    // Directory part of the target path ("." when it has no '/').
+    const std::string target(target_path);
+    const size_t      slash = target.find_last_of('/');
+    const std::string dir =
+        (slash == std::string::npos) ? std::string(".") : target.substr(0, slash);
+
+    const std::string candidates[] = {
+        dir + "/Qwen3-0.6B-BF16.gguf",
+        dir + "/drafter/Qwen3-0.6B-BF16.gguf",
+        dir + "/draft/Qwen3-0.6B-BF16.gguf",
+        "/opt/lucebox/models/drafter/Qwen3-0.6B-BF16.gguf",
+    };
+    const size_t n_candidates = sizeof(candidates) / sizeof(candidates[0]);
+    for (size_t i = 0; i < n_candidates; ++i) {
+        // A candidate counts as found when it can be opened for reading.
+        std::FILE * f = std::fopen(candidates[i].c_str(), "rb");
+        if (!f) continue;
+        std::fclose(f);
+        std::fprintf(stderr, "[kvflash] found residency drafter: %s\n", candidates[i].c_str());
+        return candidates[i];
+    }
+    return "";
+}
+
+// Slot-space step inputs for masked consumers: the K/V append row for each
+// of this step's tokens, plus F32 causal (`mfull`) and sliding-window
+// (`mswa`, optional) masks of width `mk_w` whose conditions are evaluated
+// on the POSITION each pool slot holds (free slots stay -inf). The caller
+// must have alloc_span()'d [kv_start, kv_start + n_tok) first. The pager
+// zeroes freed slots, but the mask is what keeps relocation exact.
+//
+// Outputs:
+//   rows   resized to n_tok; rows[i] = pool slot holding kv_start + i.
+//   mfull  may be null (then only `rows` is produced). Otherwise n_tok
+//          rows of mk_w entries, row-major: 0 where the slot holds a
+//          position p with 0 <= p <= kv_start + q, -inf elsewhere.
+//   mswa   may be null. Same layout as mfull, additionally requiring
+//          p >= max(0, kv_start + q - swa_window + 1).
+// Columns at or beyond min(mk_w, pool_tokens) are left at -inf.
+// Returns false (with a diagnostic) when any position has no resident
+// slot; `rows` is then only partially filled.
+// NOTE(review): the element types of `rows`, `mfull` and `mswa` are not
+// visible in this view of the file — confirm them against the callers.
+inline bool kvflash_fill_rows_and_masks(
+        const KvFlashPager & pager,
+        int kv_start, int n_tok, int mk_w, int swa_window,
+        std::vector & rows,
+        std::vector * mfull, std::vector * mswa) {
+    rows.resize((size_t)n_tok);
+    for (int i = 0; i < n_tok; ++i) {
+        // Const lookup only: nothing is allocated or evicted here.
+        const int s = pager.slot_of(kv_start + i);
+        if (s < 0) {
+            std::fprintf(stderr, "[kvflash] no pool slot at pos %d "
+                                 "(alloc_span not called?)\n", kv_start + i);
+            return false;
+        }
+        rows[(size_t)i] = s;
+    }
+    if (!mfull) return true;
+    // spos[slot] = logical position held by that slot, -1 for free slots.
+    std::vector spos((size_t)pager.pool_tokens(), -1);
+    pager.fill_slot_pos(spos.data());
+    // Start fully masked; visible entries are opened up below.
+    mfull->assign((size_t)mk_w * n_tok, -INFINITY);
+    if (mswa) mswa->assign((size_t)mk_w * n_tok, -INFINITY);
+    // Never index past the mask width or past the pool.
+    const int s_hi = std::min(mk_w, (int)spos.size());
+    for (int q = 0; q < n_tok; ++q) {
+        const int abs_q = kv_start + q;                          // query's logical position
+        const int win_lo = std::max(0, abs_q - swa_window + 1);  // oldest position in the window
+        for (int s = 0; s < s_hi; ++s) {
+            const int p = spos[(size_t)s];
+            if (p < 0 || p > abs_q) continue;  // free slot, or a future position
+            (*mfull)[(size_t)q * mk_w + s] = 0.0f;
+            if (mswa && p >= win_lo) (*mswa)[(size_t)q * mk_w + s] = 0.0f;
+        }
+    }
+    return true;
+}
+
+} // namespace dflash::common
diff --git a/server/src/common/kvflash_scorer.h b/server/src/common/kvflash_scorer.h
new file mode 100644
index 000000000..407d94c6d
--- /dev/null
+++ b/server/src/common/kvflash_scorer.h
@@ -0,0 +1,33 @@
+// KvFlashScorer — pluggable chunk-relevance policy for KvFlashPager.
+//
+// The pager is policy-agnostic: with no scorer attached it evicts LRU and
+// never recalls. A scorer upgrades eviction and reselect() to relevance-
+// driven residency (FlashMemory's Memory Indexer role). This interface is
+// deliberately dependency-free so the pager runs without pflash, without a
+// drafter, and without any model beyond the target.
+//
+// Implementations:
+// - (none) pure LRU + recency, zero dependencies
+// - KvFlashDrafterScorer qwen3/qwen3_kvflash_scorer.h — pflash drafter tail
+// attention (shared with pflash compression)
+
+#pragma once
+
+#include
+#include
+
+namespace dflash::common {
+
+// Abstract chunk-relevance policy. It declares a single pure-virtual
+// method, so a concrete scorer only has to provide score_chunks().
+struct KvFlashScorer {
+    // Virtual so an implementation can be destroyed through a
+    // KvFlashScorer pointer.
+    virtual ~KvFlashScorer() = default;
+
+    // Fill out[c] with a relevance score (higher = keep resident) for each
+    // chunk_tokens-sized chunk of `ids` (the full token history: prompt +
+    // generated). Returns false on failure; the caller skips reselect for
+    // that round and the pager keeps its LRU behavior.
+    // NOTE(review): the element types of `ids` and `out` are not visible
+    // in this view of the file — confirm them against an implementation.
+    virtual bool score_chunks(const std::vector & ids,
+                              int chunk_tokens,
+                              std::vector & out) = 0;
+};
+
+} // namespace dflash::common
diff --git a/server/src/common/moe_hybrid_ffn_eval.cpp b/server/src/common/moe_hybrid_ffn_eval.cpp
index 12a854d37..6d106cfa5 100644
--- a/server/src/common/moe_hybrid_ffn_eval.cpp
+++ b/server/src/common/moe_hybrid_ffn_eval.cpp
@@ -39,8 +39,17 @@ static ggml_tensor * build_shared_expert_subgraph(
ggml_tensor * shared = apply_scale2(ctx,
ggml_mul_mat(ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s);
if (desc.ffn_gate_inp_shexp) {
+ // The shared-expert gate is a single-row weight (M=1): out[0,n] = sum_k W[k]*inp[k,n].
+ // Computing it as ggml_mul_mat routes to cublas, and on the shipped CUDA 12.0
+ // cublasLt the M=1 heuristic selects a gemv/split-K reduce algorithm whose kernel
+ // is ABSENT from the library once N>1 (spec-decode verify/replay batches) — for
+ // BOTH F32 (cublasSgemm SSS) and F16 (cublasGemmEx HHH splitKreduce). That poisons
+ // the stream and surfaces as an illegal access in the next op. Compute the gate as
+ // broadcast elementwise-mul + sum_rows instead: identical math, ggml kernels only,
+ // no cublas. This is what unblocks single-pass full-batch verify.
+ ggml_tensor * gate_prod = ggml_mul(ctx, inp, desc.ffn_gate_inp_shexp);
ggml_tensor * shared_gate = apply_scale2(ctx,
- ggml_mul_mat(ctx, desc.ffn_gate_inp_shexp, inp), desc.ffn_gate_inp_shexp_s);
+ ggml_sum_rows(ctx, gate_prod), desc.ffn_gate_inp_shexp_s);
shared_gate = ggml_sigmoid(ctx, shared_gate);
shared = ggml_mul(ctx, shared, shared_gate);
}
@@ -658,6 +667,57 @@ bool build_cached_hot_batched_graph(
return true;
}
+// Builds the cached batched COLD routed graph (CPU backend, no shared expert)
+// for one n_tokens size. Mirror of build_cached_hot_batched_graph for the cold
+// expert stack; used by the mixed batched path so spec-decode verify/replay
+// reuse the graph. Returns false if context init, graph build or alloc fails.
+static bool build_cached_cold_batched_graph(
+ CachedHotBatchedGraph & out, // cache slot: reset first, then populated
+ ggml_backend_t cpu_backend, // its default buffer type backs the graph allocation
+ MoeHybridLayerStorage & storage, // supplies the *_cold expert weight stacks
+ const MoeLayerDesc & desc, // supplies the ffn_*_exps_s tensors
+ const MoeHybridConfig & cfg, // n_embd / n_expert_used / n_ff_exp
+ int n_tokens) { // token count this graph is sized for
+
+ out.free(); // release whatever graph the slot held before
+ out.n_tokens = n_tokens;
+ const int n_embd = cfg.n_embd;
+ const int n_used = cfg.n_expert_used;
+ const int n_ff_exp = cfg.n_ff_exp;
+
+ ggml_init_params ip{};
+ ip.mem_size = 128 * 1024 * 1024; // 128 MiB context budget
+ ip.mem_buffer = nullptr;
+ ip.no_alloc = true; // tensor data is placed by the gallocr below, not by ctx
+ out.ctx = ggml_init(ip);
+ if (!out.ctx) return false;
+
+ out.inp = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_embd, n_tokens); // activations
+ ggml_set_input(out.inp);
+ out.sel = ggml_new_tensor_2d(out.ctx, GGML_TYPE_I32, n_used, n_tokens); // expert ids per slot
+ ggml_set_input(out.sel);
+ out.wts = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_used, n_tokens); // routing weight per slot
+ ggml_set_input(out.wts);
+
+ ggml_tensor * routed = nullptr; // stays null when the builder fails
+ build_batched_routed_graph(out.ctx,
+ storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold,
+ desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
+ out.inp, out.sel, out.wts, n_embd, n_ff_exp, n_used, n_tokens, &routed);
+ if (!routed) { out.free(); return false; }
+ out.output = routed;
+
+ out.gf = ggml_new_graph_custom(out.ctx, 4096, false); // up to 4096 nodes, no grads
+ ggml_set_output(out.output);
+ ggml_build_forward_expand(out.gf, out.output);
+ out.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend));
+ if (!ggml_gallocr_alloc_graph(out.alloc, out.gf)) { // allocation failed: drop the partial graph
+ out.free();
+ return false;
+ }
+ return true;
+}
+
bool eval_moe_hybrid_ffn_single(
ggml_backend_t gpu_backend,
const MoeHybridConfig & cfg,
@@ -935,6 +995,25 @@ static bool mmq_full_batch_ok(const MoeHybridConfig & cfg, int n_tokens) {
return cfg.mmq_safe_full_batch && n_tokens >= min_tokens;
}
+// Token count per sub-batch when the routed mul_mat_id runs on a REDUCED hot
+// stack. With n_tokens > 8 ggml-cuda takes the MMQ path, which illegal-accesses
+// on sparse/imbalanced sub-64 batches (a genuine ggml-cuda MMQ mul_mat_id bug,
+// observed on sm_86 + gfx1151); the MMVQ-mmid path is stable. Q4_K MMVQ-mmid
+// covers up to 8 tokens on CUDA sm_80+ (MMVQ_MAX_BATCH_SIZE) and 4 on AMD. The
+// value used to be pinned at 1 because the F32 shared-expert gate (cublasSgemm,
+// M=1) also faulted at N>1 on the shipped CUDA 12.0 cublasLt; that gate is now
+// cublas-free (mul + sum_rows), so 8 is safe and validated on sm_86. Defaults:
+// 8 on sm_80+ (CUDA), 1 elsewhere (proven single-token path on unvalidated
+// archs). DFLASH_MMQ_SUB_BATCH overrides (clamped to >= 1) without a rebuild.
+static int mmq_safe_sub_batch() {
+    static const int cached = []() -> int {
+        const char * env = std::getenv("DFLASH_MMQ_SUB_BATCH");
+        return env ? std::max(1, std::atoi(env))
+                   : ((query_gpu_compute_sm() >= 80) ? 8 : 1);
+    }();
+    return cached;
+}
+
static bool eval_moe_hybrid_ffn_batched_core(
ggml_backend_t gpu_backend,
ggml_backend_t cpu_backend,
@@ -956,6 +1035,74 @@ static bool eval_moe_hybrid_ffn_batched_core(
out.assign((size_t)n_embd * (size_t)n_tokens, 0.0f);
if (n_tokens <= 0) return true;
+ // ── Fast path: cached hot+cold batched graphs (spec-decode verify/replay) ──
+ // Mixed layers used to rebuild+free their hot and cold ggml graphs on every
+ // call; that graph churn (not the matmul) dominated the verify FFN time.
+ // Reuse per-n_tokens cached graphs so steady-state rebuilds nothing. Large
+ // prefill batches (n_tokens >= kMaxBatchedCache) fall through to the inline
+ // path below.
+ if (n_tokens > 0 && n_tokens < MoeHybridLayerStorage::kMaxBatchedCache) {
+ const int total_slots = n_used * n_tokens;
+ const int n_hot_stack = storage.gate_up_hot ? (int)storage.gate_up_hot->ne[2]
+ : storage.gate_hot ? (int)storage.gate_hot->ne[2] : 1;
+ const int n_cold_stack = std::max(1, (int)(storage.down_cold ? storage.down_cold->ne[2] : 1));
+ std::vector hot_sel(total_slots);
+ std::vector hot_wts(total_slots, 0.0f);
+ std::vector cold_sel(total_slots);
+ std::vector cold_wts(total_slots, 0.0f);
+ for (int i = 0; i < total_slots; ++i) { hot_sel[i] = i % n_hot_stack; cold_sel[i] = i % n_cold_stack; }
+ bool fp_has_cold = false;
+ for (int i = 0; i < total_slots; ++i) {
+ const int32_t gid = selected_ids[i];
+ if (gid < 0 || gid >= (int32_t)storage.hot_local_by_global.size()) continue;
+ const int32_t hl = storage.hot_local_by_global[(size_t)gid];
+ if (hl >= 0) { hot_sel[i] = hl; hot_wts[i] = selected_weights[i]; }
+ else {
+ const int32_t cl = storage.cold_local_by_global[(size_t)gid];
+ if (cl >= 0) { cold_sel[i] = cl; cold_wts[i] = selected_weights[i]; fp_has_cold = true; }
+ }
+ }
+
+ CachedHotBatchedGraph & hg = storage.hot_batched_mixed[n_tokens];
+ const bool hg_ok = (hg.valid() && hg.n_tokens == n_tokens)
+ || build_cached_hot_batched_graph(hg, gpu_backend, storage, desc, cfg, n_tokens);
+ CachedHotBatchedGraph * cg = nullptr;
+ bool cg_ok = true;
+ if (fp_has_cold) {
+ cg = &storage.cold_batched_mixed[n_tokens];
+ cg_ok = (cg->valid() && cg->n_tokens == n_tokens)
+ || build_cached_cold_batched_graph(*cg, cpu_backend, storage, desc, cfg, n_tokens);
+ }
+
+ if (hg_ok && cg_ok) {
+ // Hot (GPU, async): shared expert + routed hot (zero-weight dummy slots
+ // keep an all-cold batch's shared-expert contribution).
+ ggml_backend_tensor_set(hg.inp, cur_host, 0, sizeof(float) * (size_t)n_embd * (size_t)n_tokens);
+ ggml_backend_tensor_set(hg.sel, hot_sel.data(), 0, sizeof(int32_t) * (size_t)total_slots);
+ ggml_backend_tensor_set(hg.wts, hot_wts.data(), 0, sizeof(float) * (size_t)total_slots);
+ ggml_backend_graph_compute_async(gpu_backend, hg.gf);
+
+ std::vector cold_partial;
+ if (cg) {
+ cold_partial.assign((size_t)n_embd * (size_t)n_tokens, 0.0f);
+ ggml_backend_tensor_set(cg->inp, cur_host, 0, sizeof(float) * (size_t)n_embd * (size_t)n_tokens);
+ ggml_backend_tensor_set(cg->sel, cold_sel.data(), 0, sizeof(int32_t) * (size_t)total_slots);
+ ggml_backend_tensor_set(cg->wts, cold_wts.data(), 0, sizeof(float) * (size_t)total_slots);
+ ggml_backend_graph_compute(cpu_backend, cg->gf); // sync; overlaps the async hot GPU graph
+ ggml_backend_tensor_get(cg->output, cold_partial.data(), 0, sizeof(float) * (size_t)n_embd * (size_t)n_tokens);
+ }
+
+ ggml_backend_synchronize(gpu_backend);
+ ggml_backend_tensor_get(hg.output, out.data(), 0, sizeof(float) * (size_t)n_embd * (size_t)n_tokens);
+ if (cg) {
+ const size_t ntot = (size_t)n_embd * (size_t)n_tokens;
+ for (size_t i = 0; i < ntot; ++i) out[i] += cold_partial[i];
+ }
+ return true;
+ }
+ // build failed -> fall through to the inline rebuild path
+ }
+
// ── Step 1: Partition routing into hot and cold ──
// Dummy slots use weight 0.0 and are distributed evenly across all experts
// to avoid pathological routing imbalance that triggers OOB in MMQ stream-k.
@@ -1175,15 +1322,15 @@ bool eval_moe_hot_only_batched(
out.assign((size_t)n_embd * (size_t)n_tokens, 0.0f);
if (n_tokens <= 0) return true;
- // Workaround for ggml-cuda MMQ mul_mat_id bug on sm_75/gfx1151: when the
- // hot stack is smaller than n_expert, slice into <=4-token sub-batches to
- // route through the stable MMVQ path. Skipped on sm_80+ where MMQ is safe.
+ // Workaround for the ggml-cuda MMQ mul_mat_id stream-k fault on a REDUCED
+ // hot stack (sm_75/gfx1151 AND sm_86): slice sub-64 batches to a size the
+ // MMVQ-mmid path handles. See mmq_safe_sub_batch().
const int n_hot_stack = storage.gate_up_hot ? (int)storage.gate_up_hot->ne[2]
: storage.gate_hot ? (int)storage.gate_hot->ne[2]
: 0;
- static const int MMQ_SAFE_SUB_BATCH = 4;
+ const int MMQ_SAFE_SUB_BATCH = mmq_safe_sub_batch();
if (!mmq_full_batch_ok(cfg, n_tokens)
- && n_hot_stack > 0 && n_hot_stack < cfg.n_expert && n_tokens > MMQ_SAFE_SUB_BATCH) {
+ && n_hot_stack > 0 && n_tokens > MMQ_SAFE_SUB_BATCH) {
std::vector sub_out;
for (int t0 = 0; t0 < n_tokens; t0 += MMQ_SAFE_SUB_BATCH) {
const int tc = std::min(MMQ_SAFE_SUB_BATCH, n_tokens - t0);
@@ -1234,7 +1381,7 @@ bool eval_moe_hot_only_batched(
// ── Slow path: build graph (first call or size mismatch) ──
// Try to build and cache for this n_tokens size.
// Cache when: sub-batch size (legacy), full stack (all hot), or full-batch safe (sm_80+).
- if (mmq_full_batch_ok(cfg, n_tokens) || n_tokens == MMQ_SAFE_SUB_BATCH
+ if (mmq_full_batch_ok(cfg, n_tokens) || n_tokens <= MMQ_SAFE_SUB_BATCH
|| (n_hot_stack == 0 || n_hot_stack >= cfg.n_expert)) {
if (build_cached_hot_batched_graph(cached, gpu_backend, storage, desc, cfg, n_tokens)) {
// Successfully cached — use it immediately
@@ -1350,9 +1497,9 @@ bool eval_moe_hybrid_ffn_batched(
const int n_hot_stack = storage.gate_up_hot ? (int)storage.gate_up_hot->ne[2]
: storage.gate_hot ? (int)storage.gate_hot->ne[2]
: 0;
- static const int MMQ_SAFE_SUB_BATCH = 4;
+ const int MMQ_SAFE_SUB_BATCH = mmq_safe_sub_batch();
if (!mmq_full_batch_ok(cfg, n_tokens)
- && n_hot_stack > 0 && n_hot_stack < cfg.n_expert && n_tokens > MMQ_SAFE_SUB_BATCH) {
+ && n_hot_stack > 0 && n_tokens > MMQ_SAFE_SUB_BATCH) {
const int n_embd = cfg.n_embd;
const int n_used = cfg.n_expert_used;
out.assign((size_t)n_embd * (size_t)n_tokens, 0.0f);
diff --git a/server/src/common/moe_hybrid_storage.cpp b/server/src/common/moe_hybrid_storage.cpp
index a8613b02a..4bf027400 100644
--- a/server/src/common/moe_hybrid_storage.cpp
+++ b/server/src/common/moe_hybrid_storage.cpp
@@ -130,6 +130,9 @@ MoeHybridStorage::~MoeHybridStorage() {
for (auto & layer : layers) {
layer.hot_graph.free();
layer.cold_graph.free();
+ layer.hot_batched_graph.free();
+ for (auto & g : layer.hot_batched_mixed) g.free();
+ for (auto & g : layer.cold_batched_mixed) g.free();
if (layer.hot_buf) {
ggml_backend_buffer_free(layer.hot_buf);
layer.hot_buf = nullptr;
diff --git a/server/src/common/moe_hybrid_storage.h b/server/src/common/moe_hybrid_storage.h
index 3485c69ff..d4a1d47d4 100644
--- a/server/src/common/moe_hybrid_storage.h
+++ b/server/src/common/moe_hybrid_storage.h
@@ -132,6 +132,17 @@ struct MoeHybridLayerStorage {
// Cached batched hot-only graph for prefill sub-batches (n_tokens=4).
CachedHotBatchedGraph hot_batched_graph;
+
+ // Per-n_tokens cached graphs for the MIXED (hot+cold) batched path. The
+ // all-hot path already caches via hot_batched_graph, but the mixed path used
+ // to rebuild+free its hot AND cold ggml graphs on every call — that churn
+ // dominated the spec-decode verify cost (many cold-bearing layers x
+ // sub-batches x steps). Cache per n_tokens (index 1..kMaxBatchedCache-1) so
+ // steady-state verify/replay rebuilds zero graphs. Large prefill batches
+ // (n_tokens >= kMaxBatchedCache) keep using the inline build.
+ static constexpr int kMaxBatchedCache = 9; // covers spec sub-batch n_tokens 1..8
+ CachedHotBatchedGraph hot_batched_mixed[kMaxBatchedCache];
+ CachedHotBatchedGraph cold_batched_mixed[kMaxBatchedCache];
};
struct MoeHybridStorage {
diff --git a/server/src/draft/draft_gguf_loader.cpp b/server/src/draft/draft_gguf_loader.cpp
index 39d620ce6..f0dbd8eb9 100644
--- a/server/src/draft/draft_gguf_loader.cpp
+++ b/server/src/draft/draft_gguf_loader.cpp
@@ -368,19 +368,34 @@ bool load_draft_gguf(const std::string & path,
set_last_error(err);
return false;
}
- // fc: [n_target_layers*n_embd, n_embd] — ne[0] = n_target_layers*n_embd.
+ // fc: [n_capture_layers*n_embd, n_embd] — ne[0] counts the CAPTURE
+ // layers the fc consumes. Some draft GGUFs (gemma4) store the
+ // TARGET's layer count in dflash.n_target_layers instead of the
+ // capture count; per this file's own philosophy the weights are
+ // ground truth, so when fc disagrees but is an exact multiple of
+ // n_embd, derive the count from the tensor and warn. Fail only on
+ // a genuinely inconsistent shape.
if (out.n_target_layers > 0) {
const int64_t derived_fc_in = out.fc->ne[0];
const int64_t expected_fc_in = (int64_t)out.n_target_layers * out.n_embd;
if (derived_fc_in != expected_fc_in) {
- char buf[256];
- std::snprintf(buf, sizeof(buf),
- "GGUF shape mismatch: dflash.fc.weight->ne[0]=%lld "
- "!= n_target_layers*n_embd=%d*%d=%lld",
- (long long)derived_fc_in,
- out.n_target_layers, out.n_embd, (long long)expected_fc_in);
- set_last_error(buf);
- return false;
+ if (out.n_embd > 0 && derived_fc_in % out.n_embd == 0) {
+ const int derived_layers = (int)(derived_fc_in / out.n_embd);
+ std::fprintf(stderr,
+ "[draft] dflash.n_target_layers metadata (%d) != "
+ "fc-derived capture count (%d); using the weights\n",
+ out.n_target_layers, derived_layers);
+ out.n_target_layers = derived_layers;
+ } else {
+ char buf[256];
+ std::snprintf(buf, sizeof(buf),
+ "GGUF shape mismatch: dflash.fc.weight->ne[0]=%lld "
+ "!= n_target_layers*n_embd=%d*%d=%lld",
+ (long long)derived_fc_in,
+ out.n_target_layers, out.n_embd, (long long)expected_fc_in);
+ set_last_error(buf);
+ return false;
+ }
}
}
}
diff --git a/server/src/gemma4/gemma4_backend.cpp b/server/src/gemma4/gemma4_backend.cpp
index cfed37494..9e7f131a4 100644
--- a/server/src/gemma4/gemma4_backend.cpp
+++ b/server/src/gemma4/gemma4_backend.cpp
@@ -6,6 +6,7 @@
#include "gemma4_backend.h"
#include "dflash27b.h"
+#include "../qwen3/qwen3_kvflash_scorer.h"
#include "common/sampler.h"
#include "common/io_utils.h"
#include "common/dflash_feature_ring.h"
@@ -49,11 +50,19 @@ bool Gemma4Backend::init() {
return false;
}
- if (!create_gemma4_cache(backend_, w_, cfg_.device.max_ctx, cache_)) {
+ kvflash_read_config();
+ if (!create_gemma4_cache(backend_, w_, cfg_.device.max_ctx, cache_,
+ kvflash_tokens_)) {
std::fprintf(stderr, "[gemma4] cache alloc failed\n");
return false;
}
cache_.fa_window = cfg_.fa_window;
+ if (kvflash_active() && cache_.fa_window > 0) {
+ std::fprintf(stderr, "[kvflash] --fa-window and --kvflash are mutually "
+ "exclusive full-attention policies\n");
+ return false;
+ }
+ if (!kvflash_attach()) return false;
// Load draft model for speculative decode.
if (cfg_.draft_path && !load_decode_draft()) {
@@ -117,18 +126,22 @@ bool Gemma4Backend::unpark(const std::string & what) {
}
// Recreate KV cache
- if (!create_gemma4_cache(backend_, w_, cfg_.device.max_ctx, cache_)) {
+ if (!create_gemma4_cache(backend_, w_, cfg_.device.max_ctx, cache_,
+ kvflash_tokens_)) {
std::fprintf(stderr, "[gemma4] unpark: failed to recreate cache\n");
free_gemma4_weights(w_);
return false;
}
cache_.fa_window = cfg_.fa_window;
+ if (!kvflash_attach()) return false;
+ kvflash_drafter_failed_ = false; // fresh VRAM: allow a retry
parked_ = false;
std::printf("[gemma4] unparked (VRAM restored)\n"); std::fflush(stdout);
if (cfg_.draft_path && !draft_parked_ && draft_backend_) {
delete dflash_target_;
dflash_target_ = new Gemma4DFlashTarget(w_, cache_, backend_);
+ if (kvflash_active()) dflash_target_->set_kvflash_pager(&kvflash_pager_);
}
}
@@ -138,6 +151,118 @@ bool Gemma4Backend::unpark(const std::string & what) {
return true;
}
+// ── kvflash helpers ────────────────────────────────────────────────────
+
+void Gemma4Backend::kvflash_read_config() { // sets kvflash_drafter_path_, kvflash_tokens_, kvflash_tau_
+ if (std::getenv("DFLASH_KVFLASH")) { // drafter lookup only when the env var is set
+ kvflash_drafter_path_ = kvflash_find_drafter(cfg_.model_path);
+ }
+ // "auto" sizes from the GPU (weights resident, cache not yet allocated):
+ // gemma4 pools the FULL-attention layers only (F16 cache); SWA rings are
+ // fixed-size and excluded from the density.
+ KvFlashAutoBudget kvf_budget; // sizing inputs handed to kvflash_pool_from_env
+ {
+ size_t gpu_free = 0, gpu_total = 0; // stay 0 when the backend exposes no device
+ if (ggml_backend_dev_t dev = ggml_backend_get_device(backend_)) {
+ ggml_backend_dev_memory(dev, &gpu_free, &gpu_total);
+ }
+ int64_t bpt = 0; // bytes per token, summed over the pooled layers
+ for (int il = 0; il < w_.n_layer; ++il) {
+ if (!gemma4_has_kv(w_, il) || gemma4_is_swa_layer(w_, il)) continue; // no KV or SWA: not pooled
+ bpt += (int64_t)gemma4_n_head_kv(w_, il) * 2 * // x2: presumably K and V — confirm
+ (int64_t)ggml_row_size(GGML_TYPE_F16, gemma4_head_dim(w_, il));
+ }
+ kvf_budget.free_bytes = (int64_t)gpu_free;
+ kvf_budget.bytes_per_token = bpt;
+ kvf_budget.reserve_bytes = (int64_t)(1.5 * 1073741824.0) + // 1.5 GiB always held back
+ (kvflash_drafter_path_.empty() ? 0 : (int64_t)(1.7 * 1073741824.0)); // +1.7 GiB when a drafter path is set
+ }
+ kvflash_tokens_ = kvflash_pool_from_env(cfg_.device.max_ctx, KvFlashConfig{},
+ !kvflash_drafter_path_.empty(),
+ kvf_budget);
+ if (kvflash_tokens_ > 0) { // tau is only read when kvflash is on
+ const char * tau = std::getenv("DFLASH_KVFLASH_TAU");
+ kvflash_tau_ = std::max(1, tau ? std::atoi(tau) : 64); // default 64, never below 1
+ }
+}
+
+// Drafter rescore + repage (FlashMemory tau loop) with the cross-tokenizer
+// scorer: gemma ids are detokenized and re-scored through the Qwen3-0.6B
+// drafter. Lazy: the drafter + tokenizers load on the first reselect that
+// needs them, never on a request's first tokens.
+void Gemma4Backend::kvflash_maybe_reselect(int generated) {
+    if (!kvflash_active() || kvflash_tau_ <= 0) return;
+    const int tau = std::max(kvflash_tau_, (int)(kvflash_history_.size() / 45));  // period grows with history
+    if (generated % tau != 0) return;
+    if (!kvflash_scorer_) {
+        if (kvflash_drafter_path_.empty() || kvflash_drafter_failed_) return;  // no drafter: stay on LRU
+        if (!drafter_loaded_) {
+            ggml_backend_synchronize(backend_);
+            std::fprintf(stderr, "[kvflash] loading drafter for residency scoring: %s\n",
+                         kvflash_drafter_path_.c_str());
+            if (!load_drafter(kvflash_drafter_path_, /*gpu_layers=*/999,
+                              cfg_.device.gpu, drafter_ctx_)) {
+                std::fprintf(stderr, "[kvflash] drafter load failed (%s); staying on "
+                             "LRU residency\n", dflash27b_last_error());
+                kvflash_drafter_failed_ = true;  // sticky: do not retry on every reselect
+                return;
+            }
+            drafter_loaded_ = true;
+        }
+        kvflash_scorer_ = std::make_unique<KvFlashCrossTokScorer>(
+            &drafter_ctx_, cfg_.model_path, kvflash_drafter_path_);
+        std::fprintf(stderr, "[kvflash] cross-tokenizer drafter scorer attached "
+                     "(tau=%d)\n", kvflash_tau_);
+    }
+    if (!kvflash_scorer_->score_chunks(kvflash_history_, kvflash_pager_.chunk_tokens(),
+                                       kvflash_scores_)) {
+        return; // scorer failure -> keep LRU behavior this round
+    }
+    kvflash_pager_.score_hook = [this](int c) {  // chunks past the scored range get 1e30f
+        return c < (int)kvflash_scores_.size() ? kvflash_scores_[c] : 1e30f;
+    };
+    const int events = kvflash_pager_.reselect();
+    kvflash_pager_.score_hook = nullptr;  // hook is installed only for this reselect
+    if (events > 0) {
+        std::fprintf(stderr, "[kvflash] reselect @gen=%d: %d page events\n",
+                     generated, events);
+    }
+}
+
+bool Gemma4Backend::kvflash_attach() {  // binds the pager to the cache; true when kvflash is off
+    if (!kvflash_active()) return true;
+    // Pool the FULL-attention layers only; SWA layers ring-buffer natively
+    // and KV-reuse layers share their source layer's tensors.
+    std::vector<ggml_tensor *> full_k, full_v;
+    for (int il = 0; il < w_.n_layer; ++il) {
+        if (cache_.k[(size_t)il] && !gemma4_is_swa_layer(w_, il)) {  // null k: layer owns no tensor
+            full_k.push_back(cache_.k[(size_t)il]);
+            full_v.push_back(cache_.v[(size_t)il]);
+        }
+    }
+    KvFlashConfig pc;
+    pc.pool_tokens = kvflash_tokens_;
+    if (!kvflash_pager_.attach(pc, full_k, full_v)) {
+        std::fprintf(stderr, "kvflash: pager attach failed (pool=%d, "
+                     "full-attn layers=%zu)\n",
+                     kvflash_tokens_, full_k.size());
+        return false;
+    }
+    std::printf("[kvflash] resident pool %d tokens over %zu full-attn layers "
+                "(logical max_ctx %d, SWA ring %d), policy=%s\n",
+                kvflash_tokens_, full_k.size(), cfg_.device.max_ctx,
+                cache_.swa_size,
+                !kvflash_drafter_path_.empty()
+                    ? "drafter/cross-tok (attaches on first reselect)"
+                    : "lru (recency-only: no Qwen3-0.6B drafter found)");
+    std::fflush(stdout);
+    return true;
+}
+
+bool Gemma4Backend::kvflash_alloc_span(int kv_start, int n_tok) {  // trivially true when kvflash is off
+    return kvflash_active() ? kvflash_pager_.alloc_span(kv_start, n_tok) : true;
+}
+
// ── Prefill ────────────────────────────────────────────────────────────
int Gemma4Backend::do_prefill(const std::vector & tokens,
@@ -147,6 +272,19 @@ int Gemma4Backend::do_prefill(const std::vector & tokens,
const int hidden = w_.n_embd;
const int chunk = cfg_.chunk;
+ if (kvflash_active()) {
+ // Fresh request: rebuild the pager mapping. Restore paths land the
+ // prefix identity-mapped and pre-allocate [0, kv_offset) themselves.
+ if (kv_offset == 0) kvflash_pager_.reset();
+ if (kv_offset + n > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) {
+ std::fprintf(stderr,
+ "[kvflash] prompt (%d @ offset %d) exceeds pool %d; raise "
+ "--kvflash or enable pflash compression\n",
+ n, kv_offset, kvflash_tokens_);
+ return -1;
+ }
+ }
+
std::vector embed(chunk * hidden);
std::vector logits;
@@ -168,8 +306,10 @@ int Gemma4Backend::do_prefill(const std::vector & tokens,
for (int i = 0; i < len * hidden; ++i) embed[i] *= scale;
const int kv_pos = kv_offset + pos;
- if (!gemma4_step(backend_, w_, cache_, embed.data(),
- tokens.data() + pos, len, kv_pos, logits)) {
+ if (!kvflash_alloc_span(kv_pos, len) ||
+ !gemma4_step(backend_, w_, cache_, embed.data(),
+ tokens.data() + pos, len, kv_pos, logits,
+ kvflash_active() ? &kvflash_pager_ : nullptr)) {
std::fprintf(stderr, "[gemma4] prefill step failed at pos=%d\n", kv_pos);
return -1;
}
@@ -194,6 +334,15 @@ int Gemma4Backend::do_prefill(const std::vector & tokens,
}
}
+ if (kvflash_active()) {
+ if (kv_offset == 0) {
+ kvflash_history_.assign(tokens.begin(), tokens.end());
+ } else {
+ kvflash_history_.resize((size_t)kv_offset, 0); // restored prefix ids unknown
+ kvflash_history_.insert(kvflash_history_.end(), tokens.begin(), tokens.end());
+ }
+ }
+
return kv_offset + pos;
}
@@ -285,8 +434,10 @@ bool Gemma4Backend::do_decode(int committed, int n_gen,
float scale = std::sqrt((float)hidden);
for (int j = 0; j < hidden; ++j) embed_buf[j] *= scale;
- if (!gemma4_step(backend_, w_, cache_, embed_buf.data(),
- &tok, 1, committed, logits)) {
+ if (!kvflash_alloc_span(committed, 1) ||
+ !gemma4_step(backend_, w_, cache_, embed_buf.data(),
+ &tok, 1, committed, logits,
+ kvflash_active() ? &kvflash_pager_ : nullptr)) {
return false;
}
@@ -308,6 +459,10 @@ bool Gemma4Backend::do_decode(int committed, int n_gen,
io.emit(next);
committed++;
cache_.cur_pos = committed;
+ if (kvflash_active()) {
+ kvflash_history_.push_back(next);
+ kvflash_maybe_reselect((int)out_tokens.size());
+ }
if (io.cancelled) break;
// Check EOS
@@ -323,7 +478,8 @@ bool Gemma4Backend::do_spec_decode(int committed, int n_gen,
std::vector & out_tokens,
const DaemonIO & io,
const BudgetHook * budget_hook,
- bool * forced_close_out) {
+ bool * forced_close_out,
+ float * accept_rate_out) {
const int hidden = w_.n_embd;
int32_t last_tok = cache_.last_tok;
@@ -553,6 +709,12 @@ bool Gemma4Backend::do_spec_decode(int committed, int n_gen,
n_draft_steps, n_accept_sum, total_draft_pos, accept_pct,
n_draft_steps > 0 ? (double)n_generated / (double)n_draft_steps : 0.0);
+    // Surface acceptance to the HTTP usage block (was silently 0.0, the
+    // same reporting-only gap as the layer-split path fixed in PR #321).
+    if (accept_rate_out) {  // no draft positions -> report 0, not 0/0 = NaN
+        *accept_rate_out = total_draft_pos > 0 ? (float)(n_accept_sum / (double)total_draft_pos) : 0.0f;
+    }
+
io.emit(-1);
return true;
}
@@ -607,7 +769,8 @@ GenerateResult Gemma4Backend::generate_impl(const GenerateRequest & req,
result.spec_decode_ran = true;
if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io,
&req.budget_hook,
- &result.budget_forced_close)) {
+ &result.budget_forced_close,
+ &result.accept_rate)) {
result.error = "spec_decode";
return result;
}
@@ -624,7 +787,8 @@ GenerateResult Gemma4Backend::generate_impl(const GenerateRequest & req,
for (int j = 0; j < hidden; ++j) embed_buf[j] *= scale;
if (!gemma4_step(backend_, w_, cache_, embed_buf.data(),
- &last_tok, 1, committed - 1, logits)) {
+ &last_tok, 1, committed - 1, logits,
+ kvflash_active() ? &kvflash_pager_ : nullptr)) {
result.error = "first logits";
return result;
}
@@ -725,6 +889,22 @@ GenerateResult Gemma4Backend::restore_and_generate_impl(int slot,
cache_.cur_pos = snap_pos;
cache_.last_tok = snap.last_tok;
+ // kvflash: the restored prefix is identity-mapped; rebuild the pager
+ // mapping over [0, snap_pos) before the delta prefill extends it.
+ if (kvflash_active()) {
+ if (snap_pos > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) {
+ std::fprintf(stderr, "[kvflash] restored prefix (%d) exceeds pool %d\n",
+ snap_pos, kvflash_tokens_);
+ result.error = "kvflash: prompt exceeds resident pool";
+ return result;
+ }
+ kvflash_pager_.reset();
+ if (!kvflash_alloc_span(0, snap_pos)) {
+ result.error = "kvflash_slot";
+ return result;
+ }
+ }
+
// Set up sampler
sampler_ = req.sampler;
if (req.do_sample && sampler_.seed != 0) {
@@ -795,7 +975,8 @@ GenerateResult Gemma4Backend::restore_and_generate_impl(int slot,
result.spec_decode_ran = true;
if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io,
&req.budget_hook,
- &result.budget_forced_close)) {
+ &result.budget_forced_close,
+ &result.accept_rate)) {
result.error = "spec_decode";
return result;
}
@@ -812,7 +993,8 @@ GenerateResult Gemma4Backend::restore_and_generate_impl(int slot,
for (int j = 0; j < hidden; ++j) embed_buf[j] *= scale;
if (!gemma4_step(backend_, w_, cache_, embed_buf.data(),
- &last_tok, 1, committed - 1, logits)) {
+ &last_tok, 1, committed - 1, logits,
+ kvflash_active() ? &kvflash_pager_ : nullptr)) {
result.error = "first logits";
return result;
}
@@ -867,6 +1049,13 @@ GenerateResult Gemma4Backend::restore_and_generate_impl(int slot,
bool Gemma4Backend::snapshot_save(int slot) {
if (parked_) return false;
if (slot < 0 || slot >= PREFIX_SLOTS) return false;
+ // kvflash: snapshots copy rows [0, snap_pos) assuming identity layout,
+ // which breaks after the first page-out relocates a chunk.
+ if (kvflash_active() && !kvflash_pager_.is_identity()) {
+ std::fprintf(stderr, "[kvflash] snapshot skipped: pool has relocated "
+ "chunks (page-table serialization not implemented)\n");
+ return false;
+ }
auto & snap = snapshots_[slot];
const int n_layer = cache_.n_layer;
@@ -1129,6 +1318,7 @@ bool Gemma4Backend::load_decode_draft() {
delete dflash_target_;
dflash_target_ = new Gemma4DFlashTarget(w_, cache_, backend_);
+ if (kvflash_active()) dflash_target_->set_kvflash_pager(&kvflash_pager_);
draft_parked_ = false;
std::printf("[gemma4] spec-decode ready: capture_layers=%d mirror_cap=%d\n",
n_capture, mirror_cap);
diff --git a/server/src/gemma4/gemma4_backend.h b/server/src/gemma4/gemma4_backend.h
index 7898e2359..6295496b9 100644
--- a/server/src/gemma4/gemma4_backend.h
+++ b/server/src/gemma4/gemma4_backend.h
@@ -12,6 +12,8 @@
#include "gemma4_internal.h"
#include "gemma4_dflash_target.h"
#include "common/sampler.h"
+#include "../common/kvflash_pager.h"
+#include "../common/kvflash_scorer.h"
#include "../qwen3/qwen3_drafter.h"
#include "ggml.h"
@@ -99,6 +101,27 @@ class Gemma4Backend : public ModelBackend {
static constexpr int PREFIX_SLOTS = 64;
Gemma4Snapshot snapshots_[PREFIX_SLOTS];
+    // ── kvflash (bounded KV residency; see common/kvflash_pager.h) ──
+    // Pools the FULL-attention layers only (SWA layers already ring-buffer).
+    // Drafter-scored residency by default via the cross-tokenizer bridge
+    // (KvFlashCrossTokScorer: gemma ids are detokenized and re-scored by
+    // the Qwen3-0.6B drafter); LRU is the fallback when no drafter is
+    // found or --kvflash-policy lru.
+    KvFlashPager                   kvflash_pager_;
+    std::unique_ptr<KvFlashScorer> kvflash_scorer_;   // created on the first reselect
+    std::vector<float>             kvflash_scores_;   // per-chunk scores from the last rescore
+    std::vector<int32_t>           kvflash_history_;  // prompt + generated ids
+    std::string                    kvflash_drafter_path_;  // empty = no drafter, LRU only
+    int  kvflash_tokens_ = 0;               // 0 = off
+    int  kvflash_tau_    = 64;              // reselect period; env DFLASH_KVFLASH_TAU
+    bool kvflash_drafter_failed_ = false;   // set on load failure, cleared on unpark
+    bool kvflash_active() const { return kvflash_tokens_ > 0; }
+    void kvflash_read_config();
+    bool kvflash_attach();
+    bool kvflash_alloc_span(int kv_start, int n_tok);
+    // Drafter rescore + repage every effective-tau generated tokens.
+    void kvflash_maybe_reselect(int generated);
+
// Prefill prompt tokens in chunks, return absolute committed position.
// kv_offset: starting KV cache position (0 for fresh prefill, snap_pos for restore).
int do_prefill(const std::vector & tokens, const DaemonIO & io,
@@ -126,7 +149,8 @@ class Gemma4Backend : public ModelBackend {
std::vector & out_tokens,
const DaemonIO & io,
const BudgetHook * budget_hook = nullptr,
- bool * forced_close_out = nullptr);
+ bool * forced_close_out = nullptr,
+ float * accept_rate_out = nullptr);
bool load_decode_draft();
void free_decode_draft();
diff --git a/server/src/gemma4/gemma4_dflash_target.cpp b/server/src/gemma4/gemma4_dflash_target.cpp
index aebd0b096..7983ccfb3 100644
--- a/server/src/gemma4/gemma4_dflash_target.cpp
+++ b/server/src/gemma4/gemma4_dflash_target.cpp
@@ -1,6 +1,7 @@
// Gemma4DFlashTarget — DFlashTarget adapter for Gemma4 iSWA models.
#include "gemma4_dflash_target.h"
+#include "../common/kvflash_pager.h"
#include "dflash27b.h"
#include
@@ -53,11 +54,16 @@ bool Gemma4DFlashTarget::verify_batch(
const float scale = std::sqrt((float)hidden);
for (size_t i = 0; i < embed.size(); ++i) embed[i] *= scale;
+ // kvflash: allocate the verify block's slots up front (may evict).
+ if (pager_ && !pager_->alloc_span(base_pos, n_tokens)) {
+ return false;
+ }
+
// Run verify (all-token argmax)
std::vector argmax_buf;
if (!gemma4_verify_batch(backend_, w_, cache_, embed.data(),
tokens.data(), n_tokens, base_pos,
- argmax_buf)) {
+ argmax_buf, pager_)) {
return false;
}
diff --git a/server/src/gemma4/gemma4_dflash_target.h b/server/src/gemma4/gemma4_dflash_target.h
index 1d12079b0..aeed2feae 100644
--- a/server/src/gemma4/gemma4_dflash_target.h
+++ b/server/src/gemma4/gemma4_dflash_target.h
@@ -32,6 +32,10 @@ class Gemma4DFlashTarget : public DFlashTarget {
int & last_tok,
std::vector * all_argmax = nullptr) override;
+ // kvflash: route verify writes through the pool (slots allocated here,
+ // slot-space mask inside gemma4_verify_batch). Non-owning.
+ void set_kvflash_pager(class KvFlashPager * pager) { pager_ = pager; }
+
bool snapshot_kv() override;
bool restore_kv() override;
@@ -52,6 +56,7 @@ class Gemma4DFlashTarget : public DFlashTarget {
Gemma4Weights & w_;
Gemma4Cache & cache_;
ggml_backend_t backend_;
+ class KvFlashPager * pager_ = nullptr;
// Capture layer IDs (built once in constructor).
std::vector capture_ids_;
diff --git a/server/src/gemma4/gemma4_graph.cpp b/server/src/gemma4/gemma4_graph.cpp
index 7df5a5a9f..33f60ffb5 100644
--- a/server/src/gemma4/gemma4_graph.cpp
+++ b/server/src/gemma4/gemma4_graph.cpp
@@ -18,6 +18,7 @@
#include "gemma4_internal.h"
#include "common/ggml_graph_precision.h"
#include "common/gpu_runtime_compat.h"
+#include "../common/kvflash_pager.h"
#include "dflash27b.h"
#include "flashprefill.h"
@@ -249,7 +250,10 @@ static ggml_tensor * build_gemma4_attn_block(
? (kv_start - fa_window) : 0;
const int kv_len_raw = is_swa ? std::min(kv_start + n_tokens, cache_len)
: (kv_start + n_tokens - full_win_start);
- const int kv_len = (kv_len_raw + 255) & ~255; // pad to 256 for CUDA FA
+ // Pad to 256 for CUDA FA, clamped to the tensor's physical capacity
+ // (kvflash pools allocate full layers below max_ctx; the slot mask keeps
+ // the clamped span exact).
+ const int kv_len = std::min((kv_len_raw + 255) & ~255, cache_len);
ggml_tensor * Qfa = ggml_permute(ctx, Qcur, 0, 2, 1, 3);
Qfa = ggml_cont(ctx, Qfa);
@@ -620,8 +624,14 @@ bool gemma4_step(
const int32_t * token_ids,
int n_tokens,
int kv_start,
- std::vector & out_logits)
+ std::vector & out_logits,
+ const KvFlashPager * kvflash)
{
+ if (kvflash && cache.fa_window > 0) {
+ std::fprintf(stderr, "gemma4_step: kvflash and fa_window are mutually "
+ "exclusive full-attention policies\n");
+ return false;
+ }
// Allocate graph context. Persistent thread_local arena: rebuilt graphs
// land at identical addresses every step, so the ggml-cuda CUDA-graph
// cache (keyed on nodes[0], memcmps node properties) can replay the
@@ -662,9 +672,18 @@ bool gemma4_step(
}
// Attention masks (full + SWA)
- // Full-attention mask: covers all positions [0, kv_start+n_tokens)
+ // Full-attention mask: covers all positions [0, kv_start+n_tokens),
+ // clamped to the full-layer tensor capacity (pool-sized under kvflash) —
+ // must agree with the FA span clamp in build_gemma4_attn_block.
+ int full_cap = cache.max_ctx;
+ for (int il = 0; il < (int)cache.k.size(); ++il) {
+ if (cache.k[(size_t)il] && !gemma4_is_swa_layer(w, il)) {
+ full_cap = (int)cache.k[(size_t)il]->ne[1];
+ break;
+ }
+ }
const int kv_len_raw = kv_start + n_tokens;
- const int kv_len_padded = (kv_len_raw + 255) & ~255;
+ const int kv_len_padded = std::min((kv_len_raw + 255) & ~255, full_cap);
ggml_tensor * mk_full = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, kv_len_padded, n_tokens, 1, 1);
ggml_set_input(mk_full);
ggml_tensor * mk_full_f16 = ggml_cast(ctx, mk_full, GGML_TYPE_F16);
@@ -768,12 +787,32 @@ bool gemma4_step(
std::vector pos((size_t)n_tokens);
for (int i = 0; i < n_tokens; ++i) pos[i] = kv_start + i;
ggml_backend_tensor_set(pp, pos.data(), 0, ggml_nbytes(pp));
+ if (!kvi_full && kvflash) {
+ std::fprintf(stderr, "gemma4_step: kvflash requires the set_rows path "
+ "(DFLASH_GEMMA4_NO_KVPAD is incompatible)\n");
+ ggml_free(ctx);
+ return false;
+ }
+ std::vector kvf_mfull; // slot-space full mask (kvflash)
if (kvi_full) {
- // Full layers append at the absolute position; SWA layers at the ring
- // slot. Per-token modular indices also land chunks that cross the
- // ring wrap boundary correctly (the offset-view path wrote one
- // contiguous block).
- ggml_backend_tensor_set(kvi_full, pos.data(), 0, ggml_nbytes(kvi_full));
+ // Full layers append at the absolute position (or the kvflash pool
+ // slot); SWA layers at the ring slot. Per-token modular indices also
+ // land chunks that cross the ring wrap boundary correctly (the
+ // offset-view path wrote one contiguous block).
+ if (kvflash) {
+ // Rows + slot-space full mask in one pass (shared helper; the
+ // mask is uploaded below where the legacy path builds its own).
+ std::vector rows;
+ if (!kvflash_fill_rows_and_masks(*kvflash, kv_start, n_tokens,
+ kv_len_padded, /*swa_window=*/0,
+ rows, &kvf_mfull, nullptr)) {
+ ggml_free(ctx);
+ return false;
+ }
+ ggml_backend_tensor_set(kvi_full, rows.data(), 0, ggml_nbytes(kvi_full));
+ } else {
+ ggml_backend_tensor_set(kvi_full, pos.data(), 0, ggml_nbytes(kvi_full));
+ }
GGML_ASSERT(swa_size > 0);
std::vector ring((size_t)n_tokens);
for (int i = 0; i < n_tokens; ++i) ring[i] = (kv_start + i) % swa_size;
@@ -785,12 +824,18 @@ bool gemma4_step(
ggml_backend_tensor_set(tok_ids, token_ids, 0, (size_t)n_tokens * sizeof(int32_t));
}
- // Causal mask (full attention) — padded positions are masked with -inf
- std::vector mfull((size_t)kv_len_padded * n_tokens, -INFINITY);
- for (int q = 0; q < n_tokens; ++q) {
- const int abs_q = kv_start + q;
- for (int k = 0; k <= abs_q && k < kv_len_raw; ++k) {
- mfull[(size_t)q * kv_len_padded + k] = 0.0f;
+ // Causal mask (full attention) — padded positions are masked with -inf.
+ // kvflash: SLOT-space mask already built alongside the append rows.
+ std::vector mfull;
+ if (kvflash) {
+ mfull = std::move(kvf_mfull);
+ } else {
+ mfull.assign((size_t)kv_len_padded * n_tokens, -INFINITY);
+ for (int q = 0; q < n_tokens; ++q) {
+ const int abs_q = kv_start + q;
+ for (int k = 0; k <= abs_q && k < kv_len_raw; ++k) {
+ mfull[(size_t)q * kv_len_padded + k] = 0.0f;
+ }
}
}
ggml_backend_tensor_set(mk_full, mfull.data(), 0, ggml_nbytes(mk_full));
@@ -844,8 +889,14 @@ bool gemma4_verify_batch(
const int32_t * token_ids,
int n_tokens,
int kv_start,
- std::vector & out_argmax)
+ std::vector & out_argmax,
+ const KvFlashPager * kvflash)
{
+ if (kvflash && cache.fa_window > 0) {
+ std::fprintf(stderr, "gemma4_verify_batch: kvflash and fa_window are "
+ "mutually exclusive\n");
+ return false;
+ }
ggml_init_params ip{};
ip.mem_size = ggml_tensor_overhead() * 16384 + ggml_graph_overhead() + 16 * 1024 * 1024;
ip.no_alloc = true;
@@ -865,9 +916,28 @@ bool gemma4_verify_batch(
ggml_set_input(tok_ids);
}
- // Attention masks (padded)
+ // kvflash: full-layer writes must go through set_rows to land in pool
+ // slots; SWA ring rows ride the same mechanism (pos % swa_size).
+ ggml_tensor * kvi_full = nullptr, * kvi_swa = nullptr;
+ if (kvflash) {
+ kvi_full = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
+ ggml_set_input(kvi_full);
+ kvi_swa = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
+ ggml_set_input(kvi_swa);
+ }
+
+ // Attention masks (padded; full width clamps to the full-layer tensor
+ // capacity, which is pool-sized under kvflash — must agree with the FA
+ // span clamp in build_gemma4_attn_block)
+ int full_cap = cache.max_ctx;
+ for (int il = 0; il < (int)cache.k.size(); ++il) {
+ if (cache.k[(size_t)il] && !gemma4_is_swa_layer(w, il)) {
+ full_cap = (int)cache.k[(size_t)il]->ne[1];
+ break;
+ }
+ }
const int kv_len_raw = kv_start + n_tokens;
- const int kv_len_padded = (kv_len_raw + 255) & ~255;
+ const int kv_len_padded = std::min((kv_len_raw + 255) & ~255, full_cap);
ggml_tensor * mk_full = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, kv_len_padded, n_tokens, 1, 1);
ggml_set_input(mk_full);
ggml_tensor * mk_full_f16 = ggml_cast(ctx, mk_full, GGML_TYPE_F16);
@@ -914,7 +984,8 @@ bool gemma4_verify_batch(
}
cur = build_gemma4_layer(ctx, gf, w, cache, il, cur, pp,
mk_full_f16, mk_swa_f16, pl_input,
- kv_start, n_tokens, cap_idx);
+ kv_start, n_tokens, cap_idx,
+ kvi_full, kvi_swa);
}
// Final norm
@@ -954,12 +1025,27 @@ bool gemma4_verify_batch(
ggml_backend_tensor_set(tok_ids, token_ids, 0, (size_t)n_tokens * sizeof(int32_t));
}
- // Masks
- std::vector mfull((size_t)kv_len_padded * n_tokens, -INFINITY);
- for (int q = 0; q < n_tokens; ++q) {
- const int abs_q = kv_start + q;
- for (int k = 0; k <= abs_q && k < kv_len_raw; ++k) {
- mfull[(size_t)q * kv_len_padded + k] = 0.0f;
+ // Masks (kvflash: slot-space full mask + slot rows via the shared helper)
+ std::vector mfull;
+ if (kvflash) {
+ std::vector rows;
+ if (!kvflash_fill_rows_and_masks(*kvflash, kv_start, n_tokens,
+ kv_len_padded, /*swa_window=*/0,
+ rows, &mfull, nullptr)) {
+ ggml_free(ctx);
+ return false;
+ }
+ ggml_backend_tensor_set(kvi_full, rows.data(), 0, ggml_nbytes(kvi_full));
+ std::vector ring((size_t)n_tokens);
+ for (int i = 0; i < n_tokens; ++i) ring[(size_t)i] = (kv_start + i) % swa_size;
+ ggml_backend_tensor_set(kvi_swa, ring.data(), 0, ggml_nbytes(kvi_swa));
+ } else {
+ mfull.assign((size_t)kv_len_padded * n_tokens, -INFINITY);
+ for (int q = 0; q < n_tokens; ++q) {
+ const int abs_q = kv_start + q;
+ for (int k = 0; k <= abs_q && k < kv_len_raw; ++k) {
+ mfull[(size_t)q * kv_len_padded + k] = 0.0f;
+ }
}
}
ggml_backend_tensor_set(mk_full, mfull.data(), 0, ggml_nbytes(mk_full));
diff --git a/server/src/gemma4/gemma4_internal.h b/server/src/gemma4/gemma4_internal.h
index d1e0e9033..800f00101 100644
--- a/server/src/gemma4/gemma4_internal.h
+++ b/server/src/gemma4/gemma4_internal.h
@@ -188,14 +188,19 @@ struct Gemma4Cache {
ggml_backend_buffer_t feat_buf = nullptr;
};
+// `ctx_alloc` (kvflash): when > 0 and < max_ctx, FULL-attention layers' K/V
+// tensors are allocated at ctx_alloc rows (the resident pool); SWA layers
+// keep their sliding-window ring buffers (already bounded). cache.max_ctx
+// stays the logical bound. 0 = allocate full layers at max_ctx (default).
bool create_gemma4_cache(ggml_backend_t backend, const Gemma4Weights & w,
- int max_ctx, Gemma4Cache & out);
+ int max_ctx, Gemma4Cache & out, int ctx_alloc = 0);
bool create_gemma4_cache_partial(ggml_backend_t backend,
const Gemma4Weights & w,
int max_ctx,
int layer_begin,
int layer_end,
- Gemma4Cache & out);
+ Gemma4Cache & out,
+ int ctx_alloc = 0);
void free_gemma4_cache(Gemma4Cache & c);
// Allocate target_feat ring buffer (call after draft load determines n_capture_layers).
@@ -221,6 +226,12 @@ void free_gemma4_snapshot(Gemma4Snapshot & s);
// Returns logits for last token.
// token_ids: raw token IDs needed for per-layer embedding lookup (may be nullptr
// if the model has no per-layer embeddings).
+// `kvflash`: optional bounded-residency pager over the FULL-attention KV
+// (see common/kvflash_pager.h). When set, full-layer append rows come from
+// the pager's slot mapping and the full mask is built in SLOT space; SWA
+// ring buffers are untouched. The caller must have allocated slots for
+// [kv_start, kv_start + n_tokens) via slot_for() beforehand. Requires the
+// set_rows path (refused under DFLASH_GEMMA4_NO_KVPAD) and fa_window == 0.
bool gemma4_step(
ggml_backend_t backend,
const Gemma4Weights & w,
@@ -229,10 +240,17 @@ bool gemma4_step(
const int32_t * token_ids,
int n_tokens,
int kv_start,
- std::vector & out_logits);
+ std::vector & out_logits,
+ const class KvFlashPager * kvflash = nullptr);
// Verify batch: run forward pass returning argmax for ALL positions.
// Used by DFlash speculative decode target.
+// `kvflash`: optional bounded-residency pager (caller must alloc_span()
+// [kv_start, kv_start+n_tokens) first). Full-layer writes go to pool slots
+// via set_rows with a slot-space causal mask; SWA ring writes/masks are
+// unchanged. Rejected draft slots hold future positions, so the validity
+// rule excludes them until the next verify rewrites them (KV truncation
+// semantics, same as the full cache).
bool gemma4_verify_batch(
ggml_backend_t backend,
const Gemma4Weights & w,
@@ -241,7 +259,8 @@ bool gemma4_verify_batch(
const int32_t * token_ids,
int n_tokens,
int kv_start,
- std::vector & out_argmax);
+ std::vector & out_argmax,
+ const class KvFlashPager * kvflash = nullptr);
// Project hidden states through lm_head (out_norm + output + softcap + argmax).
// Used by DFlash draft to convert draft hidden states to token IDs.
diff --git a/server/src/gemma4/gemma4_loader.cpp b/server/src/gemma4/gemma4_loader.cpp
index 00be4c8a8..c6fbb5c6b 100644
--- a/server/src/gemma4/gemma4_loader.cpp
+++ b/server/src/gemma4/gemma4_loader.cpp
@@ -475,9 +475,10 @@ void free_gemma4_weights(Gemma4Weights & w) {
// ── Cache ──────────────────────────────────────────────────────────────
bool create_gemma4_cache(ggml_backend_t backend, const Gemma4Weights & w,
- int max_ctx, Gemma4Cache & out) {
+ int max_ctx, Gemma4Cache & out, int ctx_alloc) {
return create_gemma4_cache_partial(
- backend, w, max_ctx, /*layer_begin=*/0, /*layer_end=*/w.n_layer, out);
+ backend, w, max_ctx, /*layer_begin=*/0, /*layer_end=*/w.n_layer, out,
+ ctx_alloc);
}
bool create_gemma4_cache_partial(ggml_backend_t backend,
@@ -485,7 +486,8 @@ bool create_gemma4_cache_partial(ggml_backend_t backend,
int max_ctx,
int layer_begin,
int layer_end,
- Gemma4Cache & out) {
+ Gemma4Cache & out,
+ int ctx_alloc) {
if (layer_begin < 0) layer_begin = 0;
if (layer_end < 0) layer_end = w.n_layer;
if (layer_begin > layer_end || layer_end > w.n_layer) return false;
@@ -521,6 +523,10 @@ bool create_gemma4_cache_partial(ggml_backend_t backend,
const int swa_size = (w.sliding_window > 0 && w.sliding_window < max_ctx)
? w.sliding_window : max_ctx;
+ // kvflash: FULL-attention layers at pool capacity; SWA ring buffers are
+ // already bounded and stay at swa_size.
+ const int full_phys = (ctx_alloc > 0 && ctx_alloc < max_ctx) ? ctx_alloc : max_ctx;
+
// Determine KV source for each layer
int last_kv_layer = -1;
for (int il = 0; il < w.n_layer; ++il) {
@@ -529,7 +535,7 @@ bool create_gemma4_cache_partial(ggml_backend_t backend,
const int D = gemma4_head_dim(w, il);
const int Hk = gemma4_n_head_kv(w, il);
const bool is_swa = gemma4_is_swa_layer(w, il);
- const int cache_len = is_swa ? swa_size : max_ctx;
+ const int cache_len = is_swa ? swa_size : full_phys;
if (owned_layer) {
out.k[il] = ggml_new_tensor_3d(out.ctx, GGML_TYPE_F16, D, cache_len, Hk);
out.v[il] = ggml_new_tensor_3d(out.ctx, GGML_TYPE_F16, D, cache_len, Hk);
diff --git a/server/src/internal.h b/server/src/internal.h
index 3c9611326..125e9a24e 100644
--- a/server/src/internal.h
+++ b/server/src/internal.h
@@ -373,6 +373,8 @@ struct TargetCache {
void snapshot_ssm_state(TargetCache & c);
// Restore the SSM+conv state from the snapshot.
void restore_ssm_state(TargetCache & c);
+// Allocate rollback snapshot tensors mirroring live ssm/conv state (MoE path).
+bool ensure_ssm_snapshot(TargetCache & c, ggml_backend_t backend);
// ─── Cross-request prefix snapshot (Phase A) ──────────────────────
//
@@ -471,12 +473,18 @@ bool restore_target_cache_chain(const PrefixSnapshot * thick,
// When prefill_only is true, rollback tensors (snapshots, intermediates) are
// skipped — saving ~1.4 GB on 48 DeltaNet layers. Use migrate_prefill_cache()
// to promote the cache to a full decode cache after prefill.
+// `ctx_alloc` (0 = max_ctx): physical token capacity of the attention KV
+// tensors. When smaller than max_ctx, a KvFlashPager maps logical positions to
+// pool slots and pages cold chunks to host (bounded KV residency); the
+// logical context bound stays max_ctx. Recurrent (DeltaNet) state is
+// unaffected.
bool create_target_cache(const TargetWeights & w,
int max_ctx,
int max_verify_tokens,
ggml_backend_t backend,
TargetCache & out,
- bool prefill_only = false);
+ bool prefill_only = false,
+ int ctx_alloc = 0);
bool create_target_cache_partial(const TargetWeights & w,
int max_ctx,
@@ -486,7 +494,8 @@ bool create_target_cache_partial(const TargetWeights & w,
bool prefill_only,
int layer_begin,
int layer_end,
- bool allocate_target_feat);
+ bool allocate_target_feat,
+ int ctx_alloc = 0);
void free_target_cache(TargetCache & c);
diff --git a/server/src/laguna/laguna_backend.cpp b/server/src/laguna/laguna_backend.cpp
index ab75ef5a8..9631f7f76 100644
--- a/server/src/laguna/laguna_backend.cpp
+++ b/server/src/laguna/laguna_backend.cpp
@@ -8,6 +8,7 @@
#include "laguna_backend.h"
#include "laguna_internal.h"
+#include "qwen3/qwen3_kvflash_scorer.h"
#include "dflash27b.h"
#include
@@ -68,16 +69,130 @@ bool LagunaBackend::init() {
cache_.kv_k_type = args_.kv_type;
cache_.kv_v_type = args_.kv_type;
- if (!create_laguna_target_cache(w_, args_.max_ctx, backend_, cache_)) {
+ kvflash_read_config();
+ if (!create_laguna_target_cache(w_, args_.max_ctx, backend_, cache_,
+ kvflash_tokens_)) {
std::fprintf(stderr, "cache failed: %s\n", dflash27b_last_error());
free_laguna_target_weights(w_);
ggml_backend_free(backend_); backend_ = nullptr;
return false;
}
+ if (!kvflash_attach()) {
+ free_laguna_target_cache(cache_); free_laguna_target_weights(w_); // same cleanup unpark() does on attach failure: cache + weights go before the backend
+ ggml_backend_free(backend_); backend_ = nullptr;
+ return false;
+ }
+ return true;
+}
+
+// ── kvflash helpers ─────────────────────────────────────────────────────
+// Laguna's pager protections: the trailing sliding_window span (+1 chunk
+// for the partially filled head) must stay resident so SWA attention stays
+// exact under paging. This drives both the pool floor and the attach config.
+KvFlashConfig LagunaBackend::kvflash_config() const { // pager config with only the protected-tail size filled in
+ KvFlashConfig pc; // every other field keeps KvFlashConfig's defaults (chunk_tokens is read below)
+ pc.tail_window_chunks =
+ std::max(4, (w_.sliding_window + pc.chunk_tokens - 1) / pc.chunk_tokens + 1); // ceil(sliding_window / chunk_tokens) + 1, never below 4
+ return pc;
+}
+
+void LagunaBackend::kvflash_read_config() { // fills kvflash_drafter_path_, kvflash_tokens_ and (when the pool is on) kvflash_tau_
+ if (std::getenv("DFLASH_KVFLASH")) { // drafter lookup only when the variable is set, whatever its value
+ kvflash_drafter_path_ = kvflash_find_drafter(args_.target_path.c_str());
+ }
+ // "auto" sizes from the GPU (weights resident, cache not yet allocated):
+ // laguna pools ALL n_layer layers at the configured KV quant.
+ KvFlashAutoBudget kvf_budget;
+ {
+ size_t gpu_free = 0, gpu_total = 0; // both stay 0 when the backend reports no device
+ if (ggml_backend_dev_t dev = ggml_backend_get_device(backend_)) {
+ ggml_backend_dev_memory(dev, &gpu_free, &gpu_total);
+ }
+ kvf_budget.free_bytes = (int64_t)gpu_free;
+ kvf_budget.bytes_per_token = (int64_t)w_.n_layer * w_.n_head_kv * 2 *
+ (int64_t)ggml_row_size(args_.kv_type, w_.head_dim); // the factor 2 is presumably K plus V — confirm
+ kvf_budget.reserve_bytes = (int64_t)(1.5 * 1073741824.0) +
+ (kvflash_drafter_path_.empty() ? 0 : (int64_t)(1.7 * 1073741824.0)); // 1.5 GiB always, plus 1.7 GiB when a drafter path was found
+ }
+ kvflash_tokens_ = kvflash_pool_from_env(args_.max_ctx, kvflash_config(),
+ !kvflash_drafter_path_.empty(),
+ kvf_budget);
+ if (kvflash_tokens_ > 0) { // tau is only read when the pool is enabled
+ const char * tau = std::getenv("DFLASH_KVFLASH_TAU");
+ kvflash_tau_ = std::max(1, tau ? std::atoi(tau) : 64); // default 64; unparsable or < 1 values end up as 1
+ }
+}
+
+// Drafter rescore + repage (FlashMemory tau loop) with the cross-tokenizer
+// scorer: laguna ids are detokenized and re-scored through the Qwen3-0.6B
+// drafter (relevance is text-level, so the tokenizer gap is bridged by
+// re-tokenization). Lazy: the drafter + tokenizers load on the first
+// reselect that needs them, never on a request's first tokens.
+void LagunaBackend::kvflash_maybe_reselect(const std::vector & history,
+ int generated) {
+ if (!kvflash_active() || kvflash_tau_ <= 0) return;
+ const int tau = std::max(kvflash_tau_, (int)(history.size() / 45)); // effective interval: configured tau, or history/45 once that is larger
+ if (generated % tau != 0) return; // act only on every tau-th generated token
+ if (!kvflash_scorer_) {
+ if (kvflash_drafter_path_.empty() || kvflash_drafter_failed_) return; // no drafter path, or an earlier load failed: nothing to score with
+ if (!drafter_loaded_) {
+ ggml_backend_synchronize(backend_);
+ std::fprintf(stderr, "[kvflash] loading drafter for residency scoring: %s\n",
+ kvflash_drafter_path_.c_str());
+ if (!load_drafter(kvflash_drafter_path_, /*gpu_layers=*/999,
+ args_.device.gpu, drafter_ctx_)) {
+ std::fprintf(stderr, "[kvflash] drafter load failed (%s); staying on "
+ "LRU residency\n", dflash27b_last_error());
+ kvflash_drafter_failed_ = true; // checked above, so the load is not retried on every interval
+ return;
+ }
+ drafter_loaded_ = true;
+ }
+ kvflash_scorer_ = std::make_unique(
+ &drafter_ctx_, args_.target_path, kvflash_drafter_path_);
+ std::fprintf(stderr, "[kvflash] cross-tokenizer drafter scorer attached "
+ "(tau=%d)\n", kvflash_tau_);
+ }
+ if (!kvflash_scorer_->score_chunks(history, kvflash_pager_.chunk_tokens(),
+ kvflash_scores_)) {
+ return; // scorer failure -> keep LRU behavior this round
+ }
+ kvflash_pager_.score_hook = [this](int c) {
+ return c < (int)kvflash_scores_.size() ? kvflash_scores_[c] : 1e30f; // chunks past the scored range get 1e30f (presumably "keep resident" — confirm against the pager)
+ };
+ const int events = kvflash_pager_.reselect();
+ kvflash_pager_.score_hook = nullptr; // the hook is installed only for the reselect() call above
+ if (events > 0) {
+ std::fprintf(stderr, "[kvflash] reselect @gen=%d: %d page events\n",
+ generated, events);
+ }
+}
+
+bool LagunaBackend::kvflash_attach() { // binds the pager to cache_'s K/V tensors; false only when the pager refuses
+ if (!kvflash_active()) return true; // kvflash off: nothing to attach, and that is not an error
+ KvFlashConfig pc = kvflash_config();
+ pc.pool_tokens = kvflash_tokens_; // pool size computed earlier by kvflash_read_config()
+ if (!kvflash_pager_.attach(pc, cache_.attn_k, cache_.attn_v)) {
+ std::fprintf(stderr, "kvflash: pager attach failed (pool=%d)\n",
+ kvflash_tokens_);
+ return false;
+ }
+ std::printf("[kvflash] resident pool %d tokens (logical max_ctx %d), "
+ "policy=%s, swa_tail=%d chunks\n",
+ kvflash_tokens_, args_.max_ctx,
+ !kvflash_drafter_path_.empty()
+ ? "drafter/cross-tok (attaches on first reselect)"
+ : "lru (recency-only: no Qwen3-0.6B drafter found)",
+ pc.tail_window_chunks);
+ std::fflush(stdout);
return true;
}
+bool LagunaBackend::kvflash_alloc_span(int kv_start, int n_tok) {
+ return !kvflash_active() || kvflash_pager_.alloc_span(kv_start, n_tok); // trivially true when kvflash is off; otherwise the pager's own result
+}
+
void LagunaBackend::print_ready_banner() const {
std::printf("[laguna-daemon] ready vocab=%lld eos=%d eot=%d max_ctx=%d kv=%s chunk=%d\n",
(long long)w_.embedder.n_vocab, w_.eos_id, w_.eos_chat_id,
@@ -107,10 +222,17 @@ bool LagunaBackend::unpark(const std::string & what) {
}
cache_.kv_k_type = args_.kv_type;
cache_.kv_v_type = args_.kv_type;
- if (!create_laguna_target_cache(w_, args_.max_ctx, backend_, cache_)) {
+ if (!create_laguna_target_cache(w_, args_.max_ctx, backend_, cache_,
+ kvflash_tokens_)) {
std::fprintf(stderr, "[unpark] cache: %s\n", dflash27b_last_error());
return false;
}
+ if (!kvflash_attach()) {
+ free_laguna_target_cache(cache_);
+ free_laguna_target_weights(w_);
+ return false; // still parked, resources released
+ }
+ kvflash_drafter_failed_ = false; // fresh VRAM: allow a retry
target_parked_ = false;
std::printf("[unpark] target restored\n"); std::fflush(stdout);
}
@@ -132,6 +254,13 @@ bool LagunaBackend::ensure_slot(int slot) {
}
bool LagunaBackend::snapshot_save(int slot) {
+ // kvflash: snapshots copy rows assuming identity layout, which breaks
+ // after the first page-out relocates a chunk.
+ if (kvflash_active() && !kvflash_pager_.is_identity()) {
+ std::fprintf(stderr, "[kvflash] snapshot skipped: pool has relocated "
+ "chunks (page-table serialization not implemented)\n");
+ return false;
+ }
if (!ensure_slot(slot)) return false;
if (!laguna_snapshot_save(cache_, snap_backend_, w_.n_layer,
w_.n_head_kv, w_.head_dim, snapshots_[slot])) {
@@ -189,7 +318,19 @@ GenerateResult LagunaBackend::generate_impl(const GenerateRequest & req,
return result;
}
+ // kvflash: prefill rows land identity-mapped, so the prompt must fit the
+ // pool with one chunk of decode headroom (decode then evicts LRU live).
+ if (kvflash_active() &&
+ N > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) {
+ std::fprintf(stderr, "[kvflash] prompt (%d) exceeds pool %d less one chunk "
+ "of decode headroom; raise --kvflash\n", N, kvflash_tokens_);
+ result.error = "kvflash: prompt exceeds resident pool";
+ return result;
+ }
+
reset_laguna_target_cache(cache_);
+ if (kvflash_active()) kvflash_pager_.reset();
+ const KvFlashPager * kvf = kvflash_active() ? &kvflash_pager_ : nullptr;
// ── Prefill ──
std::vector embed_pf((size_t)N * w_.n_embd);
@@ -205,15 +346,23 @@ GenerateResult LagunaBackend::generate_impl(const GenerateRequest & req,
for (int c = 0; c < n_chunks && ok; ++c) {
const int kv_start = c * args_.chunk;
const int n_tok = std::min(args_.chunk, N - c * args_.chunk);
- ok = laguna_step(backend_, w_, cache_,
+ ok = kvflash_alloc_span(kv_start, n_tok) &&
+ laguna_step(backend_, w_, cache_,
embed_pf.data() + (size_t)kv_start * w_.n_embd,
- n_tok, kv_start, no_mask, last_logits);
+ n_tok, kv_start, no_mask, last_logits, kvf);
}
if (!ok) { result.error = "prefill"; return result; }
auto t_pf1 = std::chrono::steady_clock::now();
result.prefill_s = std::chrono::duration(t_pf1 - t_pf0).count();
// ── Inline snapshot (if requested) ──
+ // kvflash: snapshots copy rows [0, snap_pos) assuming identity layout,
+ // which holds until the first page-out relocates a chunk.
+ if (kvflash_active() && req.snap_slot >= 0 &&
+ !kvflash_pager_.is_identity()) {
+ std::fprintf(stderr, "[kvflash] snapshot skipped: pool has relocated "
+ "chunks (page-table serialization not implemented)\n");
+ } else
if (req.snap_slot >= 0 && req.snap_pos > 0 && req.snap_pos <= N) {
if (ensure_slot(req.snap_slot) &&
laguna_snapshot_save(cache_, snap_backend_, w_.n_layer,
@@ -303,8 +452,10 @@ GenerateResult LagunaBackend::generate_impl(const GenerateRequest & req,
}
if (!w_.embedder.embed(&next_tok, 1, embed_step.data())) { ok = false; break; }
std::vector step_logits;
- if (!laguna_step(backend_, w_, cache_, embed_step.data(), 1,
- cache_.cur_pos, no_mask, step_logits)) { ok = false; break; }
+ if (!kvflash_alloc_span(cache_.cur_pos, 1) ||
+ !laguna_step(backend_, w_, cache_, embed_step.data(), 1,
+ cache_.cur_pos, no_mask, step_logits, kvf)) { ok = false; break; }
+ kvflash_maybe_reselect(history, s + 1);
next_tok = pick(step_logits);
}
auto t_g1 = std::chrono::steady_clock::now();
@@ -342,6 +493,24 @@ GenerateResult LagunaBackend::restore_and_generate_impl(int slot,
return result;
}
+ // kvflash: restore lands rows identity-mapped; the full prompt (prefix +
+ // diff) must fit the pool less one chunk of decode headroom (checked below).
+ if (kvflash_active() &&
+ N > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) {
+ std::fprintf(stderr, "[kvflash] restore prompt (%d) exceeds pool %d less one "
+ "chunk of decode headroom; raise --kvflash\n", N, kvflash_tokens_);
+ result.error = "kvflash: prompt exceeds resident pool";
+ return result;
+ }
+ if (kvflash_active()) {
+ kvflash_pager_.reset();
+ if (!kvflash_alloc_span(0, prefix_len)) {
+ result.error = "kvflash_slot";
+ return result;
+ }
+ }
+ const KvFlashPager * kvf = kvflash_active() ? &kvflash_pager_ : nullptr;
+
// Re-prefill diff tokens (or last cached token when diff is empty).
if (prefix_len == N) {
if (prefix_len <= 0) { result.error = "empty_diff"; return result; }
@@ -363,9 +532,10 @@ GenerateResult LagunaBackend::restore_and_generate_impl(int slot,
const int off = c * args_.chunk;
const int n_tok = std::min(args_.chunk, diff_n - off);
const int starts = kv_start + off;
- ok = laguna_step(backend_, w_, cache_,
+ ok = kvflash_alloc_span(starts, n_tok) &&
+ laguna_step(backend_, w_, cache_,
embed_diff.data() + (size_t)off * w_.n_embd,
- n_tok, starts, no_mask, last_logits);
+ n_tok, starts, no_mask, last_logits, kvf);
}
if (!ok) { result.error = "prefill"; return result; }
@@ -437,8 +607,10 @@ GenerateResult LagunaBackend::restore_and_generate_impl(int slot,
if (out_io.cancelled) break;
if (!w_.embedder.embed(&next_tok, 1, embed_step.data())) { ok = false; break; }
std::vector step_logits;
- if (!laguna_step(backend_, w_, cache_, embed_step.data(), 1,
- cache_.cur_pos, no_mask, step_logits)) { ok = false; break; }
+ if (!kvflash_alloc_span(cache_.cur_pos, 1) ||
+ !laguna_step(backend_, w_, cache_, embed_step.data(), 1,
+ cache_.cur_pos, no_mask, step_logits, kvf)) { ok = false; break; }
+ kvflash_maybe_reselect(history, s + 1);
next_tok = pick(step_logits);
}
auto t_g1 = std::chrono::steady_clock::now();
@@ -1085,8 +1257,10 @@ bool LagunaBackend::hybrid_forward_one_token(int32_t tok, int kv_pos,
static const bool _nm = (std::getenv("DFLASH_NO_MASK") != nullptr);
static std::vector _sg_logits;
static std::vector _sg_sel;
+ if (!kvflash_alloc_span(kv_pos, 1)) return false;
if (!laguna_step_hybrid(backend_, w_, cache_, act_cur.data(), 1, kv_pos, _nm,
- *moe_hybrid_, _sg_logits, &_sg_sel))
+ *moe_hybrid_, _sg_logits, &_sg_sel,
+ kvflash_active() ? &kvflash_pager_ : nullptr))
return false;
// Reactive cache warm + routing observe, POST-compute (off the
// single-graph critical path): make each selected expert resident
@@ -1128,6 +1302,14 @@ bool LagunaBackend::hybrid_forward_one_token(int32_t tok, int kv_pos,
// GPU-resident state for MoE layers
GpuResidentState gpu_state;
+ // The per-layer fallback writes KV at literal view offsets (no set_rows),
+ // which a kvflash pool cannot express once chunks relocate.
+ if (kvflash_active()) {
+ std::fprintf(stderr, "[kvflash] laguna per-layer hybrid decode is not "
+ "pool-aware; unset DFLASH_LAGUNA_NO_SINGLE_GRAPH\n");
+ return false;
+ }
+
if (!init_gpu_resident_state(gpu_state, backend_, hidden)) return false;
ggml_backend_tensor_set(gpu_state.act_cur, act_cur.data(), 0, sizeof(float) * (size_t)hidden);
@@ -1348,7 +1530,25 @@ GenerateResult LagunaBackend::generate_hybrid(const GenerateRequest & req,
return result;
}
+ // kvflash: hybrid prefill writes rows identity-mapped (legacy per-layer
+ // views), so the prompt must fit the pool less one chunk of headroom; the
+ // mapping is built up front and stays identity through prefill.
+ if (kvflash_active() &&
+ N > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) {
+ std::fprintf(stderr, "[kvflash] hybrid prompt (%d) exceeds pool %d less one "
+ "chunk of decode headroom; raise --kvflash\n", N, kvflash_tokens_);
+ result.error = "kvflash: prompt exceeds resident pool";
+ return result;
+ }
+
reset_laguna_target_cache(cache_);
+ if (kvflash_active()) {
+ kvflash_pager_.reset();
+ if (!kvflash_alloc_span(0, N)) {
+ result.error = "kvflash_slot";
+ return result;
+ }
+ }
// ── Hybrid Prefill: layer-by-layer pre-FFN + batched hybrid FFN ──
const int hidden = w_.n_embd;
@@ -1652,6 +1852,7 @@ GenerateResult LagunaBackend::generate_hybrid(const GenerateRequest & req,
break;
}
cache_.cur_pos++;
+ kvflash_maybe_reselect(history, s + 1);
if (req.do_sample) {
// For sampling, we need full logits — project from act_cur
diff --git a/server/src/laguna/laguna_backend.h b/server/src/laguna/laguna_backend.h
index 156c82e6b..881ad1abd 100644
--- a/server/src/laguna/laguna_backend.h
+++ b/server/src/laguna/laguna_backend.h
@@ -10,6 +10,8 @@
#include "laguna_internal.h"
#include "placement/placement_config.h"
#include "qwen3_drafter.h"
+#include "kvflash_pager.h"
+#include "kvflash_scorer.h"
#include "../common/moe_hybrid_ffn_eval.h"
#include "../common/moe_hybrid_storage.h"
#include "../common/moe_hybrid_routing_stats.h"
@@ -99,6 +101,34 @@ class LagunaBackend : public ModelBackend {
bool ensure_slot(int slot);
+ // ── kvflash (bounded KV residency; see common/kvflash_pager.h) ──
+ // Drafter-scored residency by default: the Qwen3-0.6B drafter scores
+ // chunks through the cross-tokenizer bridge (KvFlashCrossTokScorer —
+ // relevance is text-level, so the target's ids are detokenized and
+ // re-tokenized for the drafter). LRU is the fallback when no drafter is
+ // found or --kvflash-policy lru. The pager covers ALL n_layer layers; SWA
+ // exactness comes from a protected tail >= sliding_window.
+ KvFlashPager kvflash_pager_;
+ std::unique_ptr kvflash_scorer_;
+ std::vector kvflash_scores_;
+ std::string kvflash_drafter_path_;
+ int kvflash_tokens_ = 0; // 0 = off
+ int kvflash_tau_ = 64;
+ bool kvflash_drafter_failed_ = false;
+ bool kvflash_active() const { return kvflash_tokens_ > 0; }
+ // Drafter rescore + repage every effective-tau generated tokens
+ // (lazy-loads the drafter + cross-tokenizer scorer on first need).
+ void kvflash_maybe_reselect(const std::vector & history, int generated);
+ // Pager protections (SWA tail) shared by the floor and attach.
+ KvFlashConfig kvflash_config() const;
+ // Read DFLASH_KVFLASH (pool size, rounded/clamped) and DFLASH_KVFLASH_TAU; call before cache creation.
+ void kvflash_read_config();
+ // Attach the pager to the freshly created cache (init / unpark).
+ bool kvflash_attach();
+ // Allocate pool slots for [kv_start, kv_start+n_tok) (evicting LRU as
+ // needed) ahead of a laguna_step call. False if the pool is exhausted.
+ bool kvflash_alloc_span(int kv_start, int n_tok);
+
// Hybrid mode helpers
bool init_hybrid_mode();
// Build hot/cold expert storage for `placement` by re-reading expert weights
diff --git a/server/src/laguna/laguna_internal.h b/server/src/laguna/laguna_internal.h
index ec09b6113..cc37d2051 100644
--- a/server/src/laguna/laguna_internal.h
+++ b/server/src/laguna/laguna_internal.h
@@ -168,16 +168,21 @@ struct LagunaTargetCache {
std::vector attn_v;
};
+// `ctx_alloc` (kvflash): when > 0 and < max_ctx, the per-layer K/V tensors
+// are allocated at ctx_alloc rows (the resident pool) while cache.max_ctx
+// keeps the logical bound. 0 = allocate at max_ctx (default).
bool create_laguna_target_cache(const LagunaTargetWeights & w,
int max_ctx,
ggml_backend_t backend,
- LagunaTargetCache & out);
+ LagunaTargetCache & out,
+ int ctx_alloc = 0);
bool create_laguna_target_cache_partial(const LagunaTargetWeights & w,
int max_ctx,
ggml_backend_t backend,
int layer_begin,
int layer_end,
- LagunaTargetCache & out);
+ LagunaTargetCache & out,
+ int ctx_alloc = 0);
void free_laguna_target_cache(LagunaTargetCache & c);
void reset_laguna_target_cache(LagunaTargetCache & c);
@@ -280,6 +285,12 @@ LagunaGraphOutputs build_laguna_graph(
// `out_logits` : on success, resized to vocab and filled with last-token
// logits when in.output_last_only == true (default in this
// helper).
+// `kvflash`: optional bounded-residency pager (see common/kvflash_pager.h).
+// When set, the K/V append rows come from the pager's slot mapping and both
+// masks are built in SLOT space (causal / sliding-window conditions evaluated
+// on the position each slot holds). The caller must have allocated slots for
+// [kv_start, kv_start + n_tok) via the pager (e.g. alloc_span()) beforehand. Requires the
+// kv_pad set_rows path (refused otherwise).
bool laguna_step(
ggml_backend_t backend,
const LagunaTargetWeights & w,
@@ -288,7 +299,8 @@ bool laguna_step(
int n_tok,
int kv_start,
bool no_mask,
- std::vector & out_logits);
+ std::vector & out_logits,
+ const class KvFlashPager * kvflash = nullptr);
// Forward decl (full definition in common/moe_hybrid_storage.h).
struct MoeHybridStorage;
@@ -306,7 +318,8 @@ bool laguna_step_hybrid(
bool no_mask,
const MoeHybridStorage & hyb,
std::vector & out_logits,
- std::vector * out_selected = nullptr);
+ std::vector * out_selected = nullptr,
+ const class KvFlashPager * kvflash = nullptr);
struct LagunaLayerStepGraph {
ggml_context * ctx = nullptr;
diff --git a/server/src/laguna/laguna_target_graph.cpp b/server/src/laguna/laguna_target_graph.cpp
index 44b1b5cd7..c44d1ee32 100644
--- a/server/src/laguna/laguna_target_graph.cpp
+++ b/server/src/laguna/laguna_target_graph.cpp
@@ -19,6 +19,7 @@
#include "laguna_internal.h"
#include "../common/moe_hybrid_storage.h"
+#include "../common/kvflash_pager.h"
#include "common/ggml_graph_precision.h"
#include "internal.h"
#include "dflash27b.h"
@@ -44,9 +45,11 @@ static constexpr float LAGUNA_EPS = 1e-6f;
bool create_laguna_target_cache(const LagunaTargetWeights & w,
int max_ctx,
ggml_backend_t backend,
- LagunaTargetCache & out) {
+ LagunaTargetCache & out,
+ int ctx_alloc) {
return create_laguna_target_cache_partial(
- w, max_ctx, backend, /*layer_begin=*/0, /*layer_end=*/w.n_layer, out);
+ w, max_ctx, backend, /*layer_begin=*/0, /*layer_end=*/w.n_layer, out,
+ ctx_alloc);
}
bool create_laguna_target_cache_partial(const LagunaTargetWeights & w,
@@ -54,7 +57,8 @@ bool create_laguna_target_cache_partial(const LagunaTargetWeights & w,
ggml_backend_t backend,
int layer_begin,
int layer_end,
- LagunaTargetCache & out) {
+ LagunaTargetCache & out,
+ int ctx_alloc) {
if (layer_begin < 0) layer_begin = 0;
if (layer_end < 0) layer_end = w.n_layer;
if (layer_begin > layer_end || layer_end > w.n_layer) {
@@ -62,6 +66,9 @@ bool create_laguna_target_cache_partial(const LagunaTargetWeights & w,
return false;
}
+ // kvflash: tensors at pool capacity, logical bound stays max_ctx.
+ const int ctx_phys = (ctx_alloc > 0 && ctx_alloc < max_ctx) ? ctx_alloc : max_ctx;
+
out.backend = backend;
out.max_ctx = max_ctx;
out.cur_pos = 0;
@@ -88,10 +95,10 @@ bool create_laguna_target_cache_partial(const LagunaTargetWeights & w,
if (il < layer_begin || il >= layer_end) continue;
char nm[32];
std::snprintf(nm, sizeof(nm), "k_l%d", il);
- ggml_tensor * k = ggml_new_tensor_3d(out.base_ctx, k_type, w.head_dim, max_ctx, w.n_head_kv);
+ ggml_tensor * k = ggml_new_tensor_3d(out.base_ctx, k_type, w.head_dim, ctx_phys, w.n_head_kv);
ggml_set_name(k, nm);
std::snprintf(nm, sizeof(nm), "v_l%d", il);
- ggml_tensor * v = ggml_new_tensor_3d(out.base_ctx, v_type, w.head_dim, max_ctx, w.n_head_kv);
+ ggml_tensor * v = ggml_new_tensor_3d(out.base_ctx, v_type, w.head_dim, ctx_phys, w.n_head_kv);
ggml_set_name(v, nm);
out.attn_k[il] = k;
out.attn_v[il] = v;
@@ -978,8 +985,14 @@ bool laguna_step(
int n_tok,
int kv_start,
bool no_mask,
- std::vector & out_logits)
+ std::vector & out_logits,
+ const KvFlashPager * kvflash)
{
+ if (kvflash && no_mask) {
+ std::fprintf(stderr, "laguna_step: kvflash requires masks (slots are "
+ "relocated; position-implicit masking is invalid)\n");
+ return false;
+ }
// Same CUDA-graph-replay treatment as laguna_step_hybrid: persistent
// arena (stable node addresses -> stable graph key), stride-padded KV
// span, and set_rows K/V append (index is an input, so node properties
@@ -1056,6 +1069,25 @@ bool laguna_step(
std::vector pos((size_t)n_tok);
for (int i = 0; i < n_tok; ++i) pos[i] = kv_start + i;
ggml_backend_tensor_set(pp, pos.data(), 0, ggml_nbytes(pp));
+
+ if (kvflash) {
+ if (!kvi) {
+ std::fprintf(stderr, "laguna_step: kvflash requires the kv_pad "
+ "set_rows path (NO_KVPAD / PAD_CPY are incompatible)\n");
+ ggml_free(ctx);
+ return false;
+ }
+ std::vector rows;
+ std::vector mfull, mswa;
+ if (!kvflash_fill_rows_and_masks(*kvflash, kv_start, n_tok, mk_w,
+ w.sliding_window, rows, &mfull, &mswa)) {
+ ggml_free(ctx);
+ return false;
+ }
+ ggml_backend_tensor_set(kvi, rows.data(), 0, ggml_nbytes(kvi));
+ ggml_backend_tensor_set(mk_full, mfull.data(), 0, ggml_nbytes(mk_full));
+ ggml_backend_tensor_set(mk_swa, mswa.data(), 0, ggml_nbytes(mk_swa));
+ } else {
if (kvi) {
ggml_backend_tensor_set(kvi, pos.data(), 0, ggml_nbytes(kvi));
}
@@ -1083,6 +1115,7 @@ bool laguna_step(
}
ggml_backend_tensor_set(mk_swa, mswa.data(), 0, ggml_nbytes(mk_swa));
}
+ }
if (ggml_backend_graph_compute(backend, gf) != GGML_STATUS_SUCCESS) {
std::fprintf(stderr, "laguna_step: graph_compute failed\n");
@@ -1111,8 +1144,14 @@ bool laguna_step_hybrid(
bool no_mask,
const MoeHybridStorage & hyb,
std::vector & out_logits,
- std::vector * out_selected)
+ std::vector * out_selected,
+ const KvFlashPager * kvflash)
{
+ if (kvflash && no_mask) {
+ std::fprintf(stderr, "laguna_step_hybrid: kvflash requires masks (slots "
+ "are relocated; position-implicit masking is invalid)\n");
+ return false;
+ }
// Persistent arena: rebuilt graphs land at IDENTICAL addresses every step.
// The ggml-cuda CUDA-graph cache is keyed on nodes[0] and memcmps node
// properties (incl. src data pointers); address stability across steps is
@@ -1209,6 +1248,25 @@ bool laguna_step_hybrid(
std::vector pos((size_t)n_tok);
for (int i = 0; i < n_tok; ++i) pos[i] = kv_start + i;
ggml_backend_tensor_set(pp, pos.data(), 0, ggml_nbytes(pp));
+
+ if (kvflash) {
+ if (!kvi) {
+ std::fprintf(stderr, "laguna_step_hybrid: kvflash requires the kv_pad "
+ "set_rows path (NO_KVPAD / PAD_CPY are incompatible)\n");
+ ggml_free(ctx);
+ return false;
+ }
+ std::vector rows;
+ std::vector mfull, mswa;
+ if (!kvflash_fill_rows_and_masks(*kvflash, kv_start, n_tok, mk_w,
+ w.sliding_window, rows, &mfull, &mswa)) {
+ ggml_free(ctx);
+ return false;
+ }
+ ggml_backend_tensor_set(kvi, rows.data(), 0, ggml_nbytes(kvi));
+ ggml_backend_tensor_set(mk_full, mfull.data(), 0, ggml_nbytes(mk_full));
+ ggml_backend_tensor_set(mk_swa, mswa.data(), 0, ggml_nbytes(mk_swa));
+ } else {
if (kvi) {
// set_rows row indices = absolute cache positions of this step's tokens
ggml_backend_tensor_set(kvi, pos.data(), 0, ggml_nbytes(kvi));
@@ -1232,6 +1290,7 @@ bool laguna_step_hybrid(
}
ggml_backend_tensor_set(mk_swa, mswa.data(), 0, ggml_nbytes(mk_swa));
}
+ }
// Set ALL residency LUTs in two batched H2D copies from the hot stack mapping.
std::vector lutbuf((size_t)n_expert * (size_t)n_moe);
diff --git a/server/src/qwen3/qwen3_kvflash_scorer.cpp b/server/src/qwen3/qwen3_kvflash_scorer.cpp
new file mode 100644
index 000000000..4dc00c7c9
--- /dev/null
+++ b/server/src/qwen3/qwen3_kvflash_scorer.cpp
@@ -0,0 +1,210 @@
+#include "qwen3_kvflash_scorer.h"
+
+#include "qwen3_drafter_model.h"
+#include "server/tokenizer.h"
+
+#include
+#include
+#include
+
+namespace dflash::common {
+
+namespace {
+
+constexpr int kLookahead = 8; // running-max rows averaged per token; also the length of the query tail re-appended when bisecting
+constexpr int kPoolKernel = 13; // width of the centered moving-average window (kPoolKernel / 2 ids on each side)
+constexpr int kMinSegment = 4096; // bisection floor: an input of at most this many ids is never split further
+
+// Tail-attention token scores for `ids`, one per id in `out`: mean over the kLookahead rows of
+// the drafter's running-max, then AvgPool smoothing (window clipped at both ends). Returns false
+// without touching `out` when the forward fails. Same math as drafter_score_and_compress.
+bool score_tokens_direct(DrafterContext & ctx, const std::vector & ids,
+ std::vector & out) {
+ const int S = (int)ids.size();
+ std::vector running_max;
+ if (!forward_qwen3_drafter_model(ctx.weights, ids, kLookahead, running_max)) {
+ return false;
+ }
+ std::vector score((size_t)S, 0.0f);
+ for (int j = 0; j < S; j++) {
+ float s = 0.0f;
+ for (int t = 0; t < kLookahead; t++) s += running_max[(size_t)t * S + j]; // read as kLookahead rows of S entries
+ score[j] = s / kLookahead;
+ }
+ out.assign((size_t)S, 0.0f);
+ const int half = kPoolKernel / 2;
+ for (int j = 0; j < S; j++) {
+ const int lo = std::max(0, j - half), hi = std::min(S - 1, j + half); // window clipped to [0, S - 1]
+ float s = 0.0f;
+ for (int k = lo; k <= hi; k++) s += score[k];
+ out[j] = s / (hi - lo + 1); // divide by the clipped width, not by kPoolKernel
+ }
+ return true;
+}
+
+void z_normalize(float * v, size_t n) {  // in place: (x - mean) / (population stddev + 1e-6)
+    if (!n) return;  // nothing to standardize
+    double mu = 0;   // sums are accumulated in double
+    for (const float * p = v; p != v + n; ++p) mu += *p;
+    mu /= n;
+    double sq = 0;   // sum of squared deviations from the mean
+    for (const float * p = v; p != v + n; ++p) sq += (*p - mu) * (*p - mu);
+    const float scale = 1.0f / ((float)std::sqrt(sq / n) + 1e-6f);  // epsilon keeps a constant input finite
+    for (float * p = v; p != v + n; ++p) *p = (float)((*p - mu) * scale);
+}
+
+// Score `ids` with allocation-failure resilience: try the full forward and
+// z-normalize it; on failure split at S/2 and score each part with the TRUE
+// query tail (last kLookahead ids) so relevance stays query-aware: the left
+// part gets the tail appended, the right part already ends with it. Each
+// part z-normalizes itself so the merged ranking is comparable. Inputs of
+// <= kMinSegment ids are never split (their failure is final). The drafter's
+// per-call buffers (~10 KB/token) can fail on a fragmented CUDA heap at 32K+
+// even with ample free VRAM; this trades exact cross-segment calibration for robustness.
+bool score_tokens_resilient(DrafterContext & ctx, const std::vector & ids,
+ std::vector & out) {
+ if (score_tokens_direct(ctx, ids, out)) {
+ z_normalize(out.data(), out.size());
+ return true;
+ }
+ const int S = (int)ids.size();
+ if (S <= kMinSegment) return false; // recursion floor: report the failure instead of splitting
+
+ std::fprintf(stderr, "[kvflash-scorer] forward failed at S=%d, bisecting\n", S);
+ const int mid = S / 2;
+ std::vector tail(ids.end() - kLookahead, ids.end());
+
+ std::vector left(ids.begin(), ids.begin() + mid);
+ left.insert(left.end(), tail.begin(), tail.end()); // left is scored (and normalized) over mid + kLookahead ids
+ std::vector ls;
+ if (!score_tokens_resilient(ctx, left, ls)) return false;
+
+ std::vector right(ids.begin() + mid, ids.end());
+ std::vector rs;
+ if (!score_tokens_resilient(ctx, right, rs)) return false;
+
+ out.assign((size_t)S, 0.0f);
+ std::copy(ls.begin(), ls.begin() + mid, out.begin()); // drop tail scores
+ std::copy(rs.begin(), rs.begin() + (S - mid), out.begin() + mid); // right part maps 1:1 onto [mid, S)
+ return true;
+}
+
+} // namespace
+
+bool KvFlashDrafterScorer::score_chunks(const std::vector & ids, // out: one mean score per chunk_tokens-sized chunk of ids
+ int chunk_tokens,
+ std::vector & out) {
+ const int S = (int)ids.size();
+ out.clear();
+ if (!ctx_ || !ctx_->loaded || S < kLookahead + 1 || chunk_tokens <= 0) return false; // `out` stays empty on every failure path
+
+ std::vector score_ids = ids;
+ if (vocab_clamp_ > 1001) { // fold modulus (vocab_clamp_ - 1000) must stay positive; otherwise ids pass through unfolded
+ for (auto & t : score_ids) {
+ if (t >= vocab_clamp_) t = 1000 + t % (vocab_clamp_ - 1000); // lands in [1000, vocab_clamp_)
+ }
+ }
+
+ std::vector smooth;
+ if (!score_tokens_resilient(*ctx_, score_ids, smooth)) return false;
+
+ const int n_chunks = (S + chunk_tokens - 1) / chunk_tokens; // ceil(S / chunk_tokens)
+ out.assign((size_t)n_chunks, 0.0f);
+ for (int c = 0; c < n_chunks; c++) {
+ const int s_ = c * chunk_tokens, e_ = std::min(S, (c + 1) * chunk_tokens); // last chunk may be shorter
+ float m = 0.0f;
+ for (int j = s_; j < e_; j++) m += smooth[j];
+ out[c] = m / std::max(1, e_ - s_); // mean of the per-token scores in the chunk
+ }
+ return true;
+}
+
+// ── KvFlashCrossTokScorer ───────────────────────────────────────────────
+
+struct KvFlashCrossTokScorer::Toks { // the tokenizer pair behind toks_, filled by ensure_tokenizers()
+ Tokenizer target; // loaded from target_gguf_; used to decode the target ids into text
+ Tokenizer drafter; // loaded from drafter_gguf_; used to re-encode that text and measure token lengths
+};
+
+KvFlashCrossTokScorer::~KvFlashCrossTokScorer() { delete toks_; } // toks_ is owned here; it is nullptr if the tokenizers never loaded
+
+bool KvFlashCrossTokScorer::ensure_tokenizers() {
+    // Memoized: an earlier success or failure is reported without reloading.
+    if (toks_ != nullptr) return true;
+    if (toks_failed_) return false;
+    Toks * fresh = new Toks();
+    // Short-circuits: the drafter tokenizer is only loaded once the target one succeeded.
+    const bool loaded = fresh->target.load_from_gguf(target_gguf_.c_str()) &&
+                        fresh->drafter.load_from_gguf(drafter_gguf_.c_str());
+    if (loaded) { toks_ = fresh; return true; }
+    std::fprintf(stderr, "[kvflash] cross-tokenizer scorer: tokenizer load "
+                         "failed (%s / %s)\n",
+                 target_gguf_.c_str(), drafter_gguf_.c_str());
+    delete fresh;
+    toks_failed_ = true;  // latch the failure so later calls do not retry
+    return false;
+}
+
+bool KvFlashCrossTokScorer::score_chunks(const std::vector & ids, // out: one score per chunk_tokens-sized chunk of the TARGET ids
+ int chunk_tokens,
+ std::vector & out) {
+ const int S = (int)ids.size();
+ out.clear();
+ if (!ctx_ || !ctx_->loaded || S < kLookahead + 1 || chunk_tokens <= 0) return false; // `out` stays empty on every failure path
+ if (!ensure_tokenizers()) return false; // lazy load; a failed load is remembered and not retried
+
+ // 1) Target ids -> text, recording each target token's char end offset.
+ // Byte-level BPE pieces concatenate exactly, so per-id decode gives
+ // exact spans; special/template tokens may decode empty (their chunk
+ // contribution then comes from neighboring text, which is fine).
+ std::string text;
+ text.reserve((size_t)S * 4);
+ std::vector tgt_end((size_t)S);
+ std::vector one(1); // single-id scratch vector reused for every decode call
+ for (int i = 0; i < S; i++) {
+ one[0] = ids[(size_t)i];
+ text += toks_->target.decode(one);
+ tgt_end[(size_t)i] = (int32_t)text.size(); // cumulative text.size() after token i
+ }
+
+ // 2) Text -> drafter ids, with each drafter token's char midpoint.
+ const std::vector dids = toks_->drafter.encode(text);
+ const int D = (int)dids.size();
+ if (D < kLookahead + 1) return false; // same minimum-length guard, applied to the re-tokenized sequence
+ std::vector dmid((size_t)D);
+ {
+ size_t pos = 0; // running offset: sum of the decoded lengths of the drafter tokens so far
+ for (int i = 0; i < D; i++) {
+ one[0] = dids[(size_t)i];
+ const size_t len = toks_->drafter.decode(one).size();
+ dmid[(size_t)i] = (float)pos + (float)len * 0.5f;
+ pos += len;
+ }
+ }
+
+ // 3) Same tail-attention forward as the same-tokenizer scorer.
+ std::vector dscore;
+ if (!score_tokens_resilient(*ctx_, dids, dscore)) return false;
+
+ // 4) Map drafter-token scores onto target chunks by char span: a chunk's
+ // score is the mean of drafter tokens whose midpoint falls inside the
+ // chunk's text span. Empty spans (pure template tokens) stay at 0,
+ // i.e. z-score-neutral.
+ const int n_chunks = (S + chunk_tokens - 1) / chunk_tokens; // ceil(S / chunk_tokens)
+ out.assign((size_t)n_chunks, 0.0f);
+ std::vector counts((size_t)n_chunks, 0);
+ int d = 0; // cursor into the drafter tokens; it only advances, so each token feeds at most one chunk
+ for (int c = 0; c < n_chunks; c++) {
+ const int last_tok_idx = std::min(S, (c + 1) * chunk_tokens) - 1;
+ const float span_end = (float)tgt_end[(size_t)last_tok_idx]; // end offset of the chunk's last target token
+ while (d < D && dmid[(size_t)d] < span_end) {
+ out[(size_t)c] += dscore[(size_t)d];
+ counts[(size_t)c]++;
+ d++;
+ }
+ if (counts[(size_t)c] > 0) out[(size_t)c] /= (float)counts[(size_t)c];
+ }
+ return true; // drafter tokens whose midpoint is at or past the last span end stay unassigned
+}
+
+} // namespace dflash::common
diff --git a/server/src/qwen3/qwen3_kvflash_scorer.h b/server/src/qwen3/qwen3_kvflash_scorer.h
new file mode 100644
index 000000000..e0fda5074
--- /dev/null
+++ b/server/src/qwen3/qwen3_kvflash_scorer.h
@@ -0,0 +1,68 @@
+// KvFlashDrafterScorer — pflash drafter as the KV pager's Memory Indexer.
+//
+// Scores 64-token chunks with the same Liu Q-hook tail attention that
+// pflash compression uses (forward_qwen3_drafter_model), but returns the
+// per-chunk relevance scores instead of a compressed token list. The
+// DrafterContext is borrowed: the daemon shares its pflash drafter; the
+// pager itself never depends on this file (see common/kvflash_scorer.h).
+
+#pragma once
+
+#include "kvflash_scorer.h"
+#include "qwen3_drafter.h"
+
+#include
+
+namespace dflash::common {
+
+class KvFlashDrafterScorer : public KvFlashScorer {
+public:
+ // `vocab_clamp`: ids >= clamp are folded into the drafter's vocab range
+ // before scoring. Needed when the target vocabulary is a superset of
+ // the drafter's (e.g. Qwen3.6 target + Qwen3-0.6B drafter); prompt ids
+ // tokenized for the target may be unembeddable by the drafter.
+ explicit KvFlashDrafterScorer(DrafterContext * ctx, int32_t vocab_clamp = 100000)
+ : ctx_(ctx), vocab_clamp_(vocab_clamp) {}
+
+ bool score_chunks(const std::vector & ids, int chunk_tokens, // false (with `out` empty) when scoring is not possible
+ std::vector & out) override;
+
+private:
+ DrafterContext * ctx_; // borrowed, never freed by this class
+ int32_t vocab_clamp_; // fold threshold; score_chunks only folds when this is > 1001
+};
+
+// KvFlashCrossTokScorer — the same drafter scoring for targets that do NOT
+// share the Qwen tokenizer (laguna, gemma4). Relevance is a property of the
+// TEXT, so the bridge is re-tokenization: detokenize the target's history
+// (its own tokenizer, loaded from the target GGUF), tokenize the text with
+// the drafter's tokenizer (from the drafter GGUF), run the same tail-
+// attention forward, then map per-drafter-token scores back onto the
+// target's chunk boundaries by character spans. Tokenizers are host-only
+// and lazy-loaded on first score.
+class KvFlashCrossTokScorer : public KvFlashScorer {
+public:
+ KvFlashCrossTokScorer(DrafterContext * ctx, // ctx is stored as-is; the two paths are copied/moved in
+ std::string target_gguf,
+ std::string drafter_gguf)
+ : ctx_(ctx), target_gguf_(std::move(target_gguf)),
+ drafter_gguf_(std::move(drafter_gguf)) {}
+ ~KvFlashCrossTokScorer() override;
+ KvFlashCrossTokScorer(const KvFlashCrossTokScorer &) = delete; // non-copyable: toks_ is a raw pointer deleted in the destructor
+ KvFlashCrossTokScorer & operator=(const KvFlashCrossTokScorer &) = delete;
+
+ bool score_chunks(const std::vector & ids, int chunk_tokens, // false (with `out` empty) when scoring is not possible
+ std::vector & out) override;
+
+private:
+ bool ensure_tokenizers(); // loads both tokenizers on first use; a failure is latched in toks_failed_
+
+ DrafterContext * ctx_; // stored pointer, never freed by this class
+ std::string target_gguf_, drafter_gguf_; // GGUF paths the two tokenizers are loaded from
+ // Pimpl to keep server/tokenizer.h out of backend headers.
+ struct Toks;
+ Toks * toks_ = nullptr; // set only after both tokenizers loaded successfully
+ bool toks_failed_ = false; // set after a failed load so later calls return false without retrying
+};
+
+} // namespace dflash::common
diff --git a/server/src/qwen35/graph_builders.cpp b/server/src/qwen35/graph_builders.cpp
index f41f94cc0..f6c963870 100644
--- a/server/src/qwen35/graph_builders.cpp
+++ b/server/src/qwen35/graph_builders.cpp
@@ -2,6 +2,7 @@
#include "ggml-alloc.h"
+#include
#include
namespace dflash::common {
@@ -88,7 +89,9 @@ bool build_layer_prefn_step(
int n_tokens,
bool with_mask,
int fa_window,
- int kq_stride_pad) {
+ int kq_stride_pad,
+ bool kvflash) {
+ if (kvflash) with_mask = true; // slot-space masking is mandatory on the pool
step_graph_free(sg);
ggml_init_params ip{};
@@ -109,20 +112,34 @@ bool build_layer_prefn_step(
ggml_set_name(sg.positions, "positions");
ggml_set_input(sg.positions);
if (with_mask) {
- const int max_win_len = cache.max_ctx + n_tokens;
+ // Mask width follows the PHYSICAL tensor capacity (pool-sized
+ // under kvflash) so it agrees with the FA span clamp inside
+ // build_full_attn_block.
+ int phys_ctx = cache.max_ctx;
+ for (ggml_tensor * t : cache.attn_k) {
+ if (t) { phys_ctx = std::min(phys_ctx, (int)t->ne[1]); break; }
+ }
+ const int max_win_len = phys_ctx + n_tokens;
const int kv_pad = align_up(max_win_len, kq_stride_pad);
const int q_pad = align_up(n_tokens, KQ_MASK_PAD);
sg.attn_mask = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_F16, kv_pad, q_pad);
ggml_set_name(sg.attn_mask, "attn_mask");
ggml_set_input(sg.attn_mask);
}
+ if (kvflash) {
+ sg.kv_write_rows = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_I64,
+ n_tokens, w.n_head_kv);
+ ggml_set_name(sg.kv_write_rows, "kv_write_rows");
+ ggml_set_input(sg.kv_write_rows);
+ }
}
sg.gf = ggml_new_graph_custom(sg.ctx, 16384, false);
QwenLayerPrefnOutputs go = build_qwen35_layer_prefn(
sg.ctx, sg.gf, w, cache, layer_idx,
sg.inp_embed, sg.positions, sg.attn_mask,
- kv_start, n_tokens, fa_window);
+ kv_start, n_tokens, fa_window,
+ sg.kv_write_rows);
if (!go.residual || !go.post) return false;
sg.ffn_residual = go.residual;
sg.ffn_post = go.post;
@@ -236,7 +253,8 @@ bool build_target_step(
int fa_window,
bool last_token_logits_only,
int kq_stride_pad,
- bool capture_moe_router) {
+ bool capture_moe_router,
+ bool kvflash_mask) {
step_graph_free(sg);
// Persistent thread_local arena: rebuilt step graphs land at identical
@@ -266,7 +284,13 @@ bool build_target_step(
// Use max_ctx for mask allocation so the gallocr buffer never needs to
// grow as kv_start increases during generation. The actual mask is
// filled only up to kv_start + n_tokens; the excess is don't-care.
- const int max_win_len = cache.max_ctx + n_tokens;
+ // kvflash mode: the physical span is the (smaller) pool capacity of
+ // the attention tensors, so size the mask from those instead.
+ int phys_ctx = cache.max_ctx;
+ for (auto * t : cache.attn_k) {
+ if (t) { phys_ctx = std::min(phys_ctx, (int)t->ne[1]); break; }
+ }
+ const int max_win_len = phys_ctx + n_tokens;
const int kv_pad = align_up(max_win_len, kq_stride_pad);
const int q_pad = align_up(n_tokens, KQ_MASK_PAD);
sg.attn_mask = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_F16, kv_pad, q_pad);
@@ -280,8 +304,16 @@ bool build_target_step(
// DFLASH_QWEN35_NO_KVPAD=1 restores the legacy cpy append + exact-length
// FA span (per-step node properties -> no CUDA-graph replay).
static const bool g_no_kvpad = (std::getenv("DFLASH_QWEN35_NO_KVPAD") != nullptr);
- const bool use_kv_write_rows = (!g_no_kvpad && n_tokens == 1 && fa_window == 0 &&
- !with_mask && !capture && !capture_delta_intermediate);
+ // kvflash_mask: kvflash mode. The mask carries pool slot validity
+ // (uploaded by the caller before EVERY compute — the input's buffer
+ // region is reused by graph execution) and set_rows carries per-token
+ // physical slots, so the slot-mapped write stays active for masked,
+ // multi-token, and feature-capturing forwards (decode AND spec verify).
+ const bool use_kv_write_rows =
+ !g_no_kvpad && !capture_delta_intermediate &&
+ (kvflash_mask
+ ? (fa_window == 0)
+ : (n_tokens == 1 && fa_window == 0 && !with_mask && !capture));
if (use_kv_write_rows) {
sg.kv_write_rows = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_I64,
n_tokens, w.n_head_kv);
diff --git a/server/src/qwen35/graph_builders.h b/server/src/qwen35/graph_builders.h
index 69a1e89e4..ca11a8169 100644
--- a/server/src/qwen35/graph_builders.h
+++ b/server/src/qwen35/graph_builders.h
@@ -40,6 +40,10 @@ bool build_layer_step(
int fa_window = 0,
int kq_stride_pad = KQ_MASK_PAD);
+// `kvflash`: pooled mode — KV rows go through a set_rows input
+// (sg.kv_write_rows, [n_tokens, n_head_kv] ne0-major slots) and the mask
+// (forced on) is sized to the PHYSICAL tensor capacity so the caller can
+// fill it in slot space. Caller allocates slots and fills rows + mask.
bool build_layer_prefn_step(
StepGraph & sg,
const TargetWeights & w,
@@ -50,7 +54,8 @@ bool build_layer_prefn_step(
int n_tokens,
bool with_mask,
int fa_window = 0,
- int kq_stride_pad = KQ_MASK_PAD);
+ int kq_stride_pad = KQ_MASK_PAD,
+ bool kvflash = false);
// Full layer graph for hybrid decode: pre-FFN + MoE FFN + shared + residual in one compute.
// Output: sg.hidden_input = layer_output, sg.moe_selected = router selections.
@@ -67,6 +72,11 @@ bool build_hybrid_full_layer_step(
int kq_stride_pad = KQ_MASK_PAD);
// Full target forward: chain mode (all layers, logits + argmax output).
+//
+// `kvflash_mask`: kvflash pooled mode — keep the set_rows KV write active
+// even though a mask is requested (the mask carries pool-slot validity and
+// must be re-uploaded by the caller before every compute). Used by both
+// single-token decode and multi-token spec verify; requires fa_window == 0.
bool build_target_step(
StepGraph & sg,
const TargetWeights & w,
@@ -80,7 +90,8 @@ bool build_target_step(
int fa_window = 0,
bool last_token_logits_only = false,
int kq_stride_pad = KQ_MASK_PAD,
- bool capture_moe_router = false);
+ bool capture_moe_router = false,
+ bool kvflash_mask = false);
// Full target forward: DDTree tree-verify mode.
bool build_target_step_tree(
diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp
index c22b37ed5..4feb08b03 100644
--- a/server/src/qwen35/qwen35_backend.cpp
+++ b/server/src/qwen35/qwen35_backend.cpp
@@ -10,6 +10,7 @@
#include "common/io_utils.h"
#include "common/restore_delta.h"
#include "qwen3/qwen3_drafter.h"
+#include "qwen3/qwen3_kvflash_scorer.h"
#include "ggml-cuda.h"
#include "common/snapshot_backend.h"
@@ -26,6 +27,8 @@
#include
#include
+#include "kv_quant.h"
+
namespace dflash::common {
namespace {
@@ -215,11 +218,63 @@ bool Qwen35Backend::init() {
const int max_verify_tokens = cfg_.ddtree_mode
? std::max(dw_.block_size, cfg_.ddtree_budget + 1)
: dw_.block_size;
+ // kvflash (bounded residency): pool size from the env, rounded/floored/
+ // clamped by the shared reader (256-stride keeps FA vec-kernel
+ // eligibility; the floor keeps eviction from deadlocking).
+ // Drafter-scored residency is the DEFAULT policy: explicit
+ // --prefill-drafter first, then the well-known locations next to the
+ // model (Spark's pattern). LRU is the fallback when nothing is found
+ // (or the explicit choice via --kvflash-policy lru).
+ if (std::getenv("DFLASH_KVFLASH")) {
+ kvflash_drafter_path_ = kvflash_find_drafter(cfg_.target_path);
+ }
+ // "auto" sizes the pool from the GPU: weights are resident at this
+ // point and the cache is not yet allocated, so device-free minus a
+ // reserve (compute buffers + the drafter when expected) is what the
+ // pool can really use, converted at this model's pooled-KV density.
+ KvFlashAutoBudget kvf_budget;
+ {
+ size_t gpu_free = 0, gpu_total = 0;
+ if (ggml_backend_dev_t dev = ggml_backend_get_device(target_backend_)) {
+ ggml_backend_dev_memory(dev, &gpu_free, &gpu_total);
+ }
+ ggml_type kv_k = GGML_TYPE_Q8_0, kv_v = GGML_TYPE_Q8_0;
+ dflash::resolve_kv_types(kv_k, kv_v);
+ const int n_full = w_.n_layer / w_.full_attention_interval;
+ kvf_budget.free_bytes = (int64_t)gpu_free;
+ kvf_budget.bytes_per_token = (int64_t)n_full * w_.n_head_kv *
+ (int64_t)(ggml_row_size(kv_k, w_.n_embd_head_k) +
+ ggml_row_size(kv_v, w_.n_embd_head_v));
+ kvf_budget.reserve_bytes = (int64_t)(1.5 * 1073741824.0) +
+ (kvflash_drafter_path_.empty() ? 0 : (int64_t)(1.7 * 1073741824.0));
+ }
+ kvflash_tokens_ = kvflash_pool_from_env(cfg_.device.max_ctx, KvFlashConfig{},
+ !kvflash_drafter_path_.empty(),
+ kvf_budget);
+ if (kvflash_tokens_ > 0) {
+ kvflash_tau_ = std::max(1, env_int_or_default("DFLASH_KVFLASH_TAU", 64));
+ }
if (!create_target_cache(w_, cfg_.device.max_ctx, max_verify_tokens, target_backend_, cache_,
- /*prefill_only=*/true)) {
+ /*prefill_only=*/true, /*ctx_alloc=*/kvflash_tokens_)) {
std::fprintf(stderr, "cache: %s\n", dflash27b_last_error());
return false;
}
+ if (kvflash_active()) {
+ KvFlashConfig pc;
+ pc.pool_tokens = kvflash_tokens_;
+ if (!kvflash_pager_.attach(pc, cache_.attn_k, cache_.attn_v)) {
+ std::fprintf(stderr, "kvflash: pager attach failed (pool=%d)\n", kvflash_tokens_);
+ return false;
+ }
+ std::printf("[kvflash] resident pool %d tokens (logical max_ctx %d), "
+ "tau=%d, policy=%s\n",
+ kvflash_tokens_, cfg_.device.max_ctx, kvflash_tau_,
+ !kvflash_drafter_path_.empty()
+ ? "drafter (attaches on first reselect)"
+ : "lru (recency-only: no Qwen3-0.6B drafter found "
+ "next to the model or in --prefill-drafter)");
+ std::fflush(stdout);
+ }
// Init feature mirror when draft model is available (needed for spec decode).
// On single-GPU, this is an F32 conversion buffer; on split-GPU, a cross-device mirror.
@@ -290,6 +345,7 @@ bool Qwen35Backend::unpark(const std::string & what) {
std::fprintf(stderr, "[unpark] target: %s\n", dflash27b_last_error());
return false;
}
+ kvflash_drafter_failed_ = false; // fresh VRAM: allow a retry
target_parked_ = false;
std::printf("[unpark] target restored\n"); std::fflush(stdout);
}
@@ -340,6 +396,22 @@ bool Qwen35Backend::unpark(const std::string & what) {
bool Qwen35Backend::snapshot_save(int slot) {
if (slot < 0 || slot >= PREFIX_SLOTS) return false;
+ // Saves the target cache into prefix slot `slot`; false for a bad slot or a refused save.
+ // kvflash: snapshots right-size to cur_pos, a LOGICAL position that can exceed the
+ // physical pool once decode has paged, and copy rows assuming the identity layout,
+ // which pooled prefill / eviction breaks. Pooled state needs page-table serialization
+ // (follow-up); identity-mapped prefill-time snapshots remain valid.
+ if (kvflash_active() &&
+ (cache_.cur_pos > kvflash_tokens_ || !kvflash_pager_.is_identity())) {
+ static bool warned = false; // warn once only; the line reports the state of the first refusal
+ if (!warned) {
+ std::fprintf(stderr, "[kvflash] snapshot skipped: cur_pos %d, pool %d, "
+ "identity=%d (pooled snapshots are a follow-up)\n",
+ cache_.cur_pos, kvflash_tokens_, kvflash_pager_.is_identity() ? 1 : 0); // either condition may have fired
+ warned = true;
+ }
+ return false;
+ }
PrefixSnapshot & snap = prefix_snapshots_[slot];
return snapshot_target_cache(w_, cache_, snap_backend_, snap);
}
@@ -488,6 +560,13 @@ ModelBackend::CompressResult Qwen35Backend::compress(const CompressRequest & req
}
drafter_loaded_ = true;
std::fprintf(stderr, "[compress] drafter ready\n");
+ // pflash + kvflash synergy: the drafter doubles as the pool's
+ // Memory Indexer (tau-step reselect). Pager stays LRU without it.
+ if (kvflash_active() && !kvflash_scorer_) {
+ kvflash_scorer_ = std::make_unique(&drafter_ctx_);
+ std::fprintf(stderr, "[kvflash] drafter scorer attached (tau=%d)\n",
+ kvflash_tau_);
+ }
}
result.compressed_ids = drafter_score_and_compress(
@@ -544,6 +623,8 @@ bool Qwen35Backend::handle_compress(const std::string & line, const DaemonIO & i
void Qwen35Backend::free_drafter() {
if (drafter_loaded_) {
+ // The kvflash scorer borrows drafter_ctx_; drop it first.
+ kvflash_scorer_.reset();
// Drafter has its own backend — do a full free (weights + backend)
dflash::common::free_drafter(drafter_ctx_);
drafter_loaded_ = false;
@@ -579,6 +660,10 @@ DFlashTarget * Qwen35Backend::dflash_target() {
dflash_target_ = std::make_unique(
w_, cache_, target_backend_, sg_,
cfg_.kq_stride_pad, cfg_.fa_window);
+ if (kvflash_active()) {
+ static_cast(dflash_target_.get())
+ ->set_kvflash_pager(&kvflash_pager_);
+ }
}
return dflash_target_.get();
}
@@ -856,6 +941,32 @@ int Qwen35Backend::do_prefill(const std::vector & tokens,
const int prompt_len = (int)tokens.size();
prefill_last_logits_valid_ = false;
+ // kvflash: a prompt that fits the pool prefills contiguously (identity
+ // mapping, normal chunking). A LARGER prompt switches to POOLED CHUNKED
+ // PREFILL: pager-chunk-sized batches whose KV rows are slot-mapped via
+ // set_rows, with a slot-space mask per chunk and live eviction as the
+ // pool fills (constant VRAM, linear time). Restore offsets are not
+ // supported in the pooled path (a relocated prefix cannot be restored
+ // identity-style in the first place).
+ const bool kvf_paged = kvflash_active() &&
+ kv_offset + prompt_len > kvflash_tokens_ - kvflash_pager_.chunk_tokens();
+ if (kvf_paged && kv_offset != 0) {
+ std::fprintf(stderr,
+ "[kvflash] restored prefix (%d) + prompt (%d) exceeds pool %d; "
+ "pooled prefill requires a fresh request\n",
+ kv_offset, prompt_len, kvflash_tokens_);
+ set_last_error("kvflash: restore + pooled prefill unsupported");
+ return -1;
+ }
+ if (kvf_paged) {
+ prefill_ubatch = kvflash_pager_.chunk_tokens();
+ kvflash_pager_.reset();
+ std::printf("[kvflash] pooled prefill: %d tokens through a %d-token pool "
+ "(%d-token chunks, evicting)\n",
+ prompt_len, kvflash_tokens_, prefill_ubatch);
+ std::fflush(stdout);
+ }
+
// Skip KV-cache migration when resuming from a snapshot — the cache was
// already migrated when the snapshot was taken; re-running migrate would
// clobber the restored state.
@@ -887,18 +998,39 @@ int Qwen35Backend::do_prefill(const std::vector & tokens,
// incl. the user message -> a different user msg restores garbage.)
if (snap_slot >= 0 && snap_pos >= 0 &&
kv_pos <= snap_pos && snap_pos < kv_pos + n_tokens) {
- if (kv_pos > kv_offset) { // skip a degenerate short-prefix snapshot
+ if (kv_pos > kv_offset && !kvf_paged) { // skip degenerate / relocated
cache_.cur_pos = kv_pos;
if (snapshot_save(snap_slot)) {
std::printf("[snap] boundary slot=%d cur_pos=%d (req snap_pos=%d)\n",
snap_slot, kv_pos, snap_pos);
std::fflush(stdout);
}
+ } else if (kvf_paged) {
+ std::fprintf(stderr, "[kvflash] boundary snapshot skipped: pooled "
+ "prefill relocates chunks\n");
}
snap_pos = -1;
snap_slot = -1;
}
- const bool with_mask = (cfg_.kq_stride_pad > KQ_MASK_PAD) || (n_tokens > 1);
+ const bool with_mask = kvf_paged ||
+ (cfg_.kq_stride_pad > KQ_MASK_PAD) || (n_tokens > 1);
+
+ // kvflash pooled prefill: allocate this chunk's slots up front
+ // (evicting the lowest-priority resident chunk once the pool fills).
+ std::vector kvf_slots;
+ if (kvf_paged) {
+ kvf_slots.resize((size_t)n_tokens);
+ bool ok = true;
+ for (int i = 0; i < n_tokens; i++) {
+ kvf_slots[(size_t)i] = kvflash_pager_.slot_for(kv_pos + i);
+ if (kvf_slots[(size_t)i] < 0) { ok = false; break; }
+ }
+ if (!ok) {
+ std::fprintf(stderr, "[kvflash] pooled prefill: slot alloc failed @%d\n", kv_pos);
+ set_last_error("kvflash: no evictable pool block");
+ return -1;
+ }
+ }
// Prefill always uses full attention (fa_window=0) so that all
// positions encode the complete context — critical for tool
@@ -911,10 +1043,26 @@ int Qwen35Backend::do_prefill(const std::vector & tokens,
/*fa_window=*/0,
/*last_token_logits_only=*/(start + n_tokens < prompt_len),
cfg_.kq_stride_pad,
- should_capture_moe_router())) {
+ should_capture_moe_router(),
+ /*kvflash_mask=*/kvf_paged)) {
std::fprintf(stderr, "prefill build @%d\n", kv_pos);
return -1;
}
+ if (kvf_paged) {
+ if (!sg_.kv_write_rows) {
+ std::fprintf(stderr, "[kvflash] pooled prefill requires the set_rows path\n");
+ return -1;
+ }
+ // [n_tokens, n_head_kv] ne0-major (see verify_batch).
+ std::vector rows((size_t)n_tokens * w_.n_head_kv);
+ for (int h = 0; h < w_.n_head_kv; h++) {
+ for (int i = 0; i < n_tokens; i++) {
+ rows[(size_t)h * n_tokens + i] = kvf_slots[(size_t)i];
+ }
+ }
+ ggml_backend_tensor_set(sg_.kv_write_rows, rows.data(), 0,
+ sizeof(int64_t) * rows.size());
+ }
// Embed
if (!w_.embedder.embed(tokens.data() + start, n_tokens, embed_buf.data())) {
@@ -936,7 +1084,34 @@ int Qwen35Backend::do_prefill(const std::vector & tokens,
sizeof(int32_t) * pos_buf.size());
// Mask — full attention during prefill (no windowing)
- if (sg_.attn_mask) {
+ if (sg_.attn_mask && kvf_paged) {
+ // Slot-space mask (same recipe as verify_batch): row q attends
+ // (a) the slots of resident chunks holding positions < kv_pos
+ // and (b) this chunk's own slots, causally.
+ constexpr uint16_t F16_ZERO = 0x0000, F16_NEG_INF = 0xFC00;
+ const size_t kvd = (size_t)sg_.attn_mask->ne[0];
+ const int q_pad = (int)sg_.attn_mask->ne[1];
+ std::vector mask_buf((size_t)kvd * q_pad, F16_NEG_INF);
+ const int ct = kvflash_pager_.chunk_tokens();
+ for (int c = 0; c < kvflash_pager_.n_chunks(); c++) {
+ const int blk = kvflash_pager_.block_of(c);
+ if (blk < 0) continue;
+ for (int i = 0; i < ct; i++) {
+ if ((int64_t)c * ct + i >= kv_pos) break;
+ mask_buf[(size_t)blk * ct + i] = F16_ZERO;
+ }
+ }
+ for (int q = 1; q < n_tokens; q++) {
+ std::memcpy(mask_buf.data() + (size_t)q * kvd, mask_buf.data(), kvd * 2);
+ }
+ for (int q = 0; q < n_tokens; q++) {
+ for (int i = 0; i <= q; i++) {
+ mask_buf[(size_t)q * kvd + kvf_slots[(size_t)i]] = F16_ZERO;
+ }
+ }
+ ggml_backend_tensor_set(sg_.attn_mask, mask_buf.data(), 0,
+ sizeof(uint16_t) * mask_buf.size());
+ } else if (sg_.attn_mask) {
const int win_start = 0;
const int kv_len = kv_pos + n_tokens - win_start;
std::vector mask_buf;
@@ -979,6 +1154,18 @@ int Qwen35Backend::do_prefill(const std::vector & tokens,
start += n_tokens;
}
+ if (kvflash_active()) {
+ if (kvf_paged) {
+ // The pager mapping was built live during the pooled prefill;
+ // only the history / hygiene parts of the sync apply.
+ kvflash_history_.assign(tokens.begin(), tokens.end());
+ kvflash_pager_.zero_free_blocks();
+ kvflash_mask_epoch_ = (uint64_t)-1;
+ } else {
+ kvflash_sync_prefill(committed, tokens, kv_offset);
+ }
+ }
+
// End-of-prefill snapshot: scoped disk-cache saves (auto/fixed policy)
// request snap_pos == prompt end, which never falls inside a chunk so the
// boundary branch above cannot fire. Taking the snapshot here changes
@@ -995,6 +1182,104 @@ int Qwen35Backend::do_prefill(const std::vector & tokens,
return committed;
}
+// ── kvflash helpers ─────────────────────────────────────────────────
+
+void Qwen35Backend::kvflash_sync_prefill(int committed,
+                                         const std::vector & tokens,
+                                         int kv_offset) {
+    // Rows written by prefill (or a snapshot restore) sit back to back in
+    // [0, committed). Start from an empty pager and claim one slot per
+    // position so the logical -> slot mapping is the identity again.
+    kvflash_pager_.reset();
+    int pos = 0;
+    while (pos < committed) {
+        const int got = kvflash_pager_.slot_for(pos);
+        // A freshly reset pager hands blocks out in order, so this is not
+        // expected while the prompt fits the pool. It is a tripwire for
+        // future changes to the hand-out order: reported, not fatal.
+        if (got != pos) {
+            std::fprintf(stderr, "[kvflash] prefill slot mismatch %d != %d\n", got, pos);
+        }
+        ++pos;
+    }
+
+    // Bring the token history in line with the cache contents.
+    if (kv_offset != 0) {
+        // Ids of the restored prefix are unknown. Note that resize() keeps
+        // any entries already stored below kv_offset and zero-fills only
+        // the entries it has to add.
+        kvflash_history_.resize((size_t)kv_offset, 0);
+        kvflash_history_.insert(kvflash_history_.end(), tokens.begin(), tokens.end());
+    } else {
+        kvflash_history_.assign(tokens.begin(), tokens.end());
+    }
+
+    // Free slots still carry rows from the previous request, and the
+    // maskless qwen35moe pipelined decode reads the whole padded pool
+    // span, so clear them before anything reads the pool.
+    kvflash_pager_.zero_free_blocks();
+    // Sentinel epoch: kvflash_upload_mask() rebuilds its host mirror when
+    // the pager epoch differs from this value.
+    kvflash_mask_epoch_ = (uint64_t)-1;
+}
+
+void Qwen35Backend::kvflash_upload_mask() {
+    auto * mask = sg_.attn_mask;
+    if (!mask) return;  // graph was built without a mask input
+
+    // Element count of the mask tensor (ne[0] x ne[1]).
+    const size_t n_cells = (size_t)mask->ne[0] * mask->ne[1];
+
+    // The host mirror is reusable only if it has the right size and was
+    // built for the pager's current epoch.
+    const bool mirror_current =
+        kvflash_mask_buf_.size() == n_cells &&
+        kvflash_pager_.epoch() == kvflash_mask_epoch_;
+    if (!mirror_current) {
+        kvflash_mask_buf_.assign(n_cells, F16_NEG_INF);
+        // Pager marks the attendable slots (q row 0).
+        kvflash_pager_.fill_slot_mask(kvflash_mask_buf_.data());
+        kvflash_mask_epoch_ = kvflash_pager_.epoch();
+    }
+
+    // The device copy must be refreshed before EVERY compute: graph
+    // execution reuses the input tensor's buffer region, so an upload
+    // from an earlier step reads back as garbage.
+    ggml_backend_tensor_set(mask, kvflash_mask_buf_.data(), 0,
+                            n_cells * sizeof(uint16_t));
+}
+
+// Make the drafter available as the residency scorer even when the pflash
+// compress path is not in use. With kvflash on and a drafter path
+// configured but compression off, nothing else would load the drafter and
+// the pool would silently fall back to recency-only LRU. The load is lazy
+// (first reselect that needs it) and the scorer is re-attached after a
+// draft-residency release has freed the drafter.
+// No-op when a scorer is already attached, no drafter path is configured,
+// or an earlier load attempt failed (failed loads are not retried).
+void Qwen35Backend::kvflash_ensure_scorer() {
+    if (kvflash_scorer_) return;
+    if (kvflash_drafter_path_.empty()) return;
+    if (kvflash_drafter_failed_) return;
+
+    if (!drafter_loaded_) {
+        // Synchronize the backend(s) before loading the drafter.
+        ggml_backend_synchronize(target_backend_);
+        if (draft_backend_) ggml_backend_synchronize(draft_backend_);
+        std::fprintf(stderr, "[kvflash] loading drafter for residency scoring: %s\n",
+                     kvflash_drafter_path_.c_str());
+        const bool loaded = load_drafter(kvflash_drafter_path_, /*gpu_layers=*/999,
+                                         cfg_.device.gpu, drafter_ctx_);
+        if (!loaded) {
+            std::fprintf(stderr,
+                         "[kvflash] drafter load failed (%s); staying on LRU residency\n",
+                         dflash27b_last_error());
+            kvflash_drafter_failed_ = true;  // remember, so we do not retry
+            return;
+        }
+        drafter_loaded_ = true;
+    }
+
+    kvflash_scorer_ = std::make_unique(&drafter_ctx_);
+    std::fprintf(stderr, "[kvflash] drafter scorer attached (tau=%d)\n", kvflash_tau_);
+}
+
+// Periodic drafter rescore + pager reselect, driven by the number of
+// tokens generated so far. Disabled when kvflash_tau_ <= 0. Any failure
+// (no scorer, scoring error) leaves the pager on its current policy.
+void Qwen35Backend::kvflash_maybe_reselect(int generated) {
+    if (kvflash_tau_ <= 0) return;
+
+    // Adaptive interval. Per the measurements recorded with this code, a
+    // rescore costs ~0.11 ms per history token (full 0.6B re-prefill:
+    // 0.9 s @8K, ~46 s bisected @256K) while decode runs at ~30 tok/s.
+    // Holding rescore overhead to ~15% of decode time works out to an
+    // interval of about history/45; the configured tau is the floor.
+    const int history_interval = (int)(kvflash_history_.size() / 45);
+    const int interval = std::max(kvflash_tau_, history_interval);
+    if (generated % interval != 0) return;
+
+    // Only touch (and possibly load) the drafter once a rescore is
+    // actually due, so the first tokens of the first request never pay
+    // for the load.
+    if (!kvflash_scorer_) {
+        kvflash_ensure_scorer();
+        if (!kvflash_scorer_) return;
+    }
+
+    const bool scored = kvflash_scorer_->score_chunks(
+        kvflash_history_, kvflash_pager_.chunk_tokens(), kvflash_scores_);
+    if (!scored) {
+        return;  // scorer failure -> keep LRU behavior this round
+    }
+
+    // Chunks without a score entry get a very large score (1e30).
+    kvflash_pager_.score_hook = [this](int chunk) {
+        return chunk < (int)kvflash_scores_.size() ? kvflash_scores_[chunk] : 1e30f;
+    };
+
+    const int page_events = kvflash_pager_.reselect();
+    if (page_events <= 0) return;
+    std::fprintf(stderr, "[kvflash] reselect @gen=%d: %d page events "
+                         "(resident %d/%d blocks)\n",
+                 generated, page_events, kvflash_pager_.resident_blocks(),
+                 kvflash_tokens_ / kvflash_pager_.chunk_tokens());
+}
+
bool Qwen35Backend::do_ar_decode(int committed, int n_gen,
std::vector & out_tokens,
const DaemonIO & io,
@@ -1127,6 +1412,7 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen,
maybe_force_close(first_tok, committed);
out_tokens.push_back(first_tok);
io.emit(first_tok);
+ if (kvflash_active()) kvflash_history_.push_back(first_tok);
if (IS_EOS_TOK(first_tok, w_)) return true;
committed++;
cache_.cur_pos = committed;
@@ -1141,24 +1427,39 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen,
int32_t pos4[4] = {committed, committed, committed, 0};
ggml_backend_tensor_set(sg_.positions, pos4, 0, sizeof(int32_t) * 4);
+ // kvflash: graph carries a slot-validity mask alongside the
+ // step-invariant set_rows write; the FA span clamps to the pool.
+ const bool pool = kvflash_active();
if (!build_target_step(sg_, w_, cache_, target_backend_,
/*kv_start=*/committed, /*n_tokens=*/1,
- /*with_mask=*/false, /*capture=*/false,
+ /*with_mask=*/pool, /*capture=*/false,
/*capture_delta_intermediate=*/false,
/*fa_window=*/0,
/*last_token_logits_only=*/false,
cfg_.kq_stride_pad,
- should_capture_moe_router())) {
+ should_capture_moe_router(),
+ /*kvflash_mask=*/pool)) {
return false;
}
- // Fill kv_write_rows with this step's cache slot (committed) for set_rows.
+ // Fill kv_write_rows with this step's cache slot for set_rows:
+ // the logical position directly, or its pool slot in kvflash mode.
if (sg_.kv_write_rows) {
const int n_head_kv = w_.n_head_kv;
- std::vector row_vals(n_head_kv, (int64_t)committed);
+ const int64_t slot = pool ? (int64_t)kvflash_pager_.slot_for(committed)
+ : (int64_t)committed;
+ if (pool && slot < 0) {
+ std::fprintf(stderr, "[kvflash] no pool slot at pos %d "
+ "(pool %d exhausted)\n",
+ committed, kvflash_tokens_);
+ set_last_error("kvflash: no evictable pool block");
+ return false;
+ }
+ std::vector row_vals(n_head_kv, slot);
ggml_backend_tensor_set(sg_.kv_write_rows, row_vals.data(), 0,
sizeof(int64_t) * n_head_kv);
}
+ if (pool) kvflash_upload_mask();
auto st = ggml_backend_graph_compute(target_backend_, sg_.gf);
if (st != GGML_STATUS_SUCCESS) return false;
@@ -1220,6 +1521,10 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen,
io.emit(next_tok);
committed++;
cache_.cur_pos = committed;
+ if (pool) {
+ kvflash_history_.push_back(next_tok);
+ kvflash_maybe_reselect((int)(out_tokens.size() - out_tokens_at_entry));
+ }
if (io.cancelled) break;
if (IS_EOS_TOK(next_tok, w_)) break;
@@ -1352,6 +1657,12 @@ bool Qwen35Backend::do_spec_decode(int committed, int n_gen,
// - draft model loaded and not parked
// - feature mirror initialized
// - greedy decoding (no logit processing) — spec decode uses argmax verification
+ // - kvflash: verify_batch is slot-mapped (Qwen35DFlashTarget pooled
+ // path), and that covers --ddtree too: in the daemon, ddtree_mode
+ // configures larger verify intermediates + fast_rollback, whose
+ // snapshot_kv/restore_kv only touch DeltaNet/conv state (pool-
+ // neutral); generation runs this same chain loop either way. The
+ // tree-verify graphs exist only in the test harness (test_dflash).
const bool can_spec = cfg_.draft_path
&& !draft_parked_
&& (cfg_.remote_draft.enabled()
diff --git a/server/src/qwen35/qwen35_backend.h b/server/src/qwen35/qwen35_backend.h
index 59a105fc9..0df4df036 100644
--- a/server/src/qwen35/qwen35_backend.h
+++ b/server/src/qwen35/qwen35_backend.h
@@ -21,6 +21,8 @@
#include "dflash_feature_ring.h"
#include "internal.h" // TargetWeights, TargetCache, DraftWeights, PrefixSnapshot
#include "qwen3/qwen3_drafter.h" // DrafterContext, load_drafter, free_drafter, drafter_score_and_compress
+#include "kvflash_pager.h" // bounded KV residency pool
+#include "kvflash_scorer.h" // chunk-relevance policy interface
#include "ggml.h"
#include "ggml-backend.h"
@@ -158,6 +160,40 @@ class Qwen35Backend : public ModelBackend {
// ── Configuration ────────────────────────────────────────────────
Qwen35Config cfg_;
+ // ── kvflash (bounded KV residency, FlashMemory-style) ────────────
+ // Active when kvflash_tokens_ > 0 (env DFLASH_KVFLASH / --kvflash):
+ // attention KV tensors are allocated at pool capacity, logical
+ // positions map to pool slots via kvflash_pager_, cold chunks page to
+ // host. Policy-agnostic: with no scorer the pager is LRU; when the
+ // pflash drafter is loaded it becomes the reselect scorer (rescored at
+ // least kvflash_tau_ decoded tokens apart). Spec decode stays available:
+ // verify_batch and hybrid_forward_batch are slot-mapped via the pager.
+ // Protected: the MoE subclass routes its pipelined decode loops and
+ // hybrid prefill through the same pager/history/reselect state.
+ KvFlashPager kvflash_pager_;
+ std::unique_ptr kvflash_scorer_;
+ std::vector kvflash_history_; // prompt + generated ids
+ std::vector kvflash_scores_; // latest chunk scores
+ std::vector kvflash_mask_buf_; // host mirror of slot mask
+ std::string kvflash_drafter_path_; // DFLASH_KVFLASH_DRAFTER
+ uint64_t kvflash_mask_epoch_ = (uint64_t)-1;
+ int kvflash_tokens_ = 0; // 0 = off
+ int kvflash_tau_ = 64;
+ bool kvflash_drafter_failed_ = false; // don't retry a failed load
+ // True when the bounded KV pool is enabled, i.e. kvflash_tokens_ > 0.
+ bool kvflash_active() const { return kvflash_tokens_ > 0; }
+ // Rebuild pager mapping after (re)prefill: positions [0, committed)
+ // occupy pool slots identity-mapped (prefill is contiguous).
+ void kvflash_sync_prefill(int committed, const std::vector & tokens,
+ int kv_offset);
+ // Upload the slot-validity mask (host rebuild on epoch change, device
+ // upload every step — the input's buffer region is reused by compute).
+ void kvflash_upload_mask();
+ // Drafter rescore + reselect every max(kvflash_tau_, history/45) generated tokens.
+ void kvflash_maybe_reselect(int generated);
+ // Attach the drafter scorer if a drafter path is configured and the
+ // scorer is missing (lazy-loads the drafter on first need; also heals
+ // after a residency release frees it). No-op without a path.
+ void kvflash_ensure_scorer();
+
private:
// ── GPU backends ─────────────────────────────────────────────────
ggml_backend_t target_backend_ = nullptr;
diff --git a/server/src/qwen35/qwen35_dflash_target.cpp b/server/src/qwen35/qwen35_dflash_target.cpp
index 65713d1bb..5af4490af 100644
--- a/server/src/qwen35/qwen35_dflash_target.cpp
+++ b/server/src/qwen35/qwen35_dflash_target.cpp
@@ -5,6 +5,8 @@
#include "step_graph.h"
#include "attn_masks.h"
+#include
+
namespace dflash::common {
Qwen35DFlashTarget::~Qwen35DFlashTarget() {
@@ -33,18 +35,53 @@ bool Qwen35DFlashTarget::verify_batch(
if (n_tokens <= 0) return false;
const int hidden = w_.n_embd;
- const bool need_mask = (kq_stride_pad_ > KQ_MASK_PAD) || (n_tokens > 1);
+ const bool pool = pager_ != nullptr;
+ const bool need_mask = pool || (kq_stride_pad_ > KQ_MASK_PAD) || (n_tokens > 1);
+
+ // kvflash: allocate slots for the verify block up front (may evict at
+ // a chunk boundary; protections keep sinks + the tail window safe).
+ std::vector slots;
+ if (pool) {
+ slots.resize(n_tokens);
+ for (int i = 0; i < n_tokens; i++) {
+ slots[i] = pager_->slot_for(base_pos + i);
+ if (slots[i] < 0) {
+ std::fprintf(stderr, "verify_batch: pool slot alloc failed @%d\n", base_pos + i);
+ return false;
+ }
+ }
+ }
if (!build_target_step(sg_, w_, cache_, backend_,
/*kv_start=*/base_pos, n_tokens,
need_mask, /*capture=*/true,
/*capture_delta_intermediate=*/false,
- fa_window_,
+ pool ? 0 : fa_window_,
/*last_token_logits_only=*/false,
- kq_stride_pad_)) {
+ kq_stride_pad_,
+ /*capture_moe_router=*/false,
+ /*kvflash_mask=*/pool)) {
std::fprintf(stderr, "verify_batch: build_target_step failed (base=%d n=%d)\n", base_pos, n_tokens);
return false;
}
+ if (pool && !sg_.kv_write_rows) {
+ std::fprintf(stderr, "verify_batch: kvflash requires set_rows path\n");
+ return false;
+ }
+ if (pool) {
+ // kv_write_rows is [n_tokens, n_head_kv] ne0-major: element
+ // (token i, head h) lives at i + h*n_tokens (set_rows asserts
+ // b->ne[1] == c->ne[0]). Getting this transposed scrambles
+ // per-head row targets for every multi-token write.
+ std::vector rows((size_t)n_tokens * w_.n_head_kv);
+ for (int h = 0; h < w_.n_head_kv; h++) {
+ for (int i = 0; i < n_tokens; i++) {
+ rows[(size_t)h * n_tokens + i] = slots[i];
+ }
+ }
+ ggml_backend_tensor_set(sg_.kv_write_rows, rows.data(), 0,
+ sizeof(int64_t) * rows.size());
+ }
// Embed input tokens and fill positions.
std::vector embed((size_t)n_tokens * hidden);
@@ -66,8 +103,35 @@ bool Qwen35DFlashTarget::verify_batch(
ggml_backend_tensor_set(sg_.positions, pos.data(), 0,
sizeof(int32_t) * pos.size());
- // Fill causal attention mask when present.
- if (sg_.attn_mask) {
+ // Fill the attention mask.
+ if (sg_.attn_mask && pool) {
+ // Slot-space mask: row q attends (a) slots of committed positions
+ // (pos < base_pos) of resident chunks — this exactly excludes
+ // slots holding rejected drafts from earlier rounds — and (b) the
+ // verify tokens' own slots, causally.
+ const size_t kvd = (size_t)sg_.attn_mask->ne[0];
+ const int q_pad = (int)sg_.attn_mask->ne[1];
+ std::vector mask_buf((size_t)kvd * q_pad, F16_NEG_INF);
+ const int ct = pager_->chunk_tokens();
+ for (int c = 0; c < pager_->n_chunks(); c++) {
+ const int blk = pager_->block_of(c);
+ if (blk < 0) continue;
+ for (int i = 0; i < ct; i++) {
+ if ((int64_t)c * ct + i >= base_pos) break;
+ mask_buf[(size_t)blk * ct + i] = F16_ZERO;
+ }
+ }
+ for (int q = 1; q < n_tokens; q++) {
+ std::memcpy(mask_buf.data() + (size_t)q * kvd, mask_buf.data(), kvd * 2);
+ }
+ for (int q = 0; q < n_tokens; q++) {
+ for (int i = 0; i <= q; i++) {
+ mask_buf[(size_t)q * kvd + slots[i]] = F16_ZERO;
+ }
+ }
+ ggml_backend_tensor_set(sg_.attn_mask, mask_buf.data(), 0,
+ sizeof(uint16_t) * mask_buf.size());
+ } else if (sg_.attn_mask) {
const int win_start = (fa_window_ > 0 && base_pos > fa_window_)
? (base_pos - fa_window_) : 0;
const int kv_len = base_pos + n_tokens - win_start;
diff --git a/server/src/qwen35/qwen35_dflash_target.h b/server/src/qwen35/qwen35_dflash_target.h
index 6a72e48b5..17ab8bf95 100644
--- a/server/src/qwen35/qwen35_dflash_target.h
+++ b/server/src/qwen35/qwen35_dflash_target.h
@@ -10,6 +10,7 @@
#include "internal.h" // TargetWeights, TargetCache, DraftWeights
#include "step_graph.h"
#include "graph_builders.h"
+#include "kvflash_pager.h"
#include "ggml.h"
#include "ggml-backend.h"
@@ -53,6 +54,14 @@ class Qwen35DFlashTarget : public DFlashTarget {
int mask_token_id() const override;
const std::vector & capture_layer_ids() const override;
+ // kvflash mode: verify writes are slot-mapped via the pager and the
+ // attention mask carries slot validity (resident committed positions
+ // only) plus causal structure among the verify tokens. Rejected draft
+ // tokens need no explicit rollback: their slots are excluded by the
+ // pos < base_pos validity rule on the next verify and get rewritten.
+ // Forces fa_window = 0 (logical windowing is meaningless in slot space).
+ // The pointer is stored as-is; a null pager (the default) leaves kvflash
+ // mode off. NOTE(review): the target does not appear to own the pager —
+ // confirm the caller keeps it alive for as long as verify_batch may run.
+ void set_kvflash_pager(KvFlashPager * pager) { pager_ = pager; }
+
private:
TargetWeights & w_;
TargetCache & cache_;
@@ -60,6 +69,7 @@ class Qwen35DFlashTarget : public DFlashTarget {
StepGraph & sg_;
int kq_stride_pad_;
int fa_window_;
+ KvFlashPager * pager_ = nullptr;
// Cached vector form of capture layer IDs (built once in constructor).
std::vector capture_ids_;
diff --git a/server/src/qwen35/qwen35_target_graph.cpp b/server/src/qwen35/qwen35_target_graph.cpp
index ed7fbe057..e0f7d8ecd 100644
--- a/server/src/qwen35/qwen35_target_graph.cpp
+++ b/server/src/qwen35/qwen35_target_graph.cpp
@@ -76,10 +76,11 @@ bool create_target_cache(const TargetWeights & w,
int max_verify_tokens,
ggml_backend_t backend,
TargetCache & out,
- bool prefill_only) {
+ bool prefill_only,
+ int ctx_alloc) {
return create_target_cache_partial(w, max_ctx, max_verify_tokens, backend,
out, prefill_only,
- 0, w.n_layer, true);
+ 0, w.n_layer, true, ctx_alloc);
}
bool create_target_cache_partial(const TargetWeights & w,
@@ -90,7 +91,8 @@ bool create_target_cache_partial(const TargetWeights & w,
bool prefill_only,
int layer_begin,
int layer_end,
- bool allocate_target_feat) {
+ bool allocate_target_feat,
+ int ctx_alloc) {
if (layer_begin < 0) layer_begin = 0;
if (layer_end < 0 || layer_end > w.n_layer) layer_end = w.n_layer;
if (layer_begin > layer_end) {
@@ -133,9 +135,14 @@ bool create_target_cache_partial(const TargetWeights & w,
const bool needs_256_stride =
kv_k_type == GGML_TYPE_TQ3_0 || kv_v_type == GGML_TYPE_TQ3_0;
+ // kvflash mode: attention tensors are allocated at the (smaller)
+ // physical pool capacity; logical positions are mapped to pool slots
+ // by KvFlashPager. The 256-stride rounding applies to whichever capacity
+ // is in effect.
+ const int ctx_phys = (ctx_alloc > 0 && ctx_alloc < max_ctx) ? ctx_alloc : max_ctx;
const int max_ctx_alloc = needs_256_stride
- ? ((max_ctx + 255) / 256) * 256
- : max_ctx;
+ ? ((ctx_phys + 255) / 256) * 256
+ : ctx_phys;
// ── Base context: KV cache + SSM/conv state + target_feat ────────
{
@@ -433,6 +440,62 @@ void restore_ssm_state(TargetCache & c) {
}
}
+// Allocate SSM/conv rollback snapshot tensors by mirroring the live recurrent
+// state tensors' shapes. The MoE hybrid spec-decode path sets up its DeltaNet
+// state in base_buf but never calls migrate_prefill_cache, so without this
+// snapshot_ssm_state/restore_ssm_state are silent no-ops (the _snap arrays are
+// empty/null) and rejected draft tokens leak permanently into the linear
+// recurrent state, collapsing generation.
+//
+// Idempotent: an existing rollback_ctx (from a prior request or
+// migrate_prefill_cache) is reused untouched.
+//
+// Returns true when snapshots are available or there is nothing to snapshot;
+// false (with set_last_error) when the context or its backend buffer cannot
+// be created. On failure the cache is left with no rollback_ctx and with all
+// snapshot slots null, so a later call can retry from scratch.
+bool ensure_ssm_snapshot(TargetCache & c, ggml_backend_t backend) {
+    if (c.rollback_ctx) return true;
+    const size_t n = c.ssm_state.size();
+    if (n == 0) return true;
+    c.ssm_state_snap.assign(n, nullptr);
+    c.conv_state_snap.assign(n, nullptr);
+
+    // One snapshot tensor per live state tensor; the count sizes the context.
+    size_t cnt = 0;
+    for (size_t i = 0; i < n; i++) {
+        if (c.ssm_state[i]) cnt++;
+        if (i < c.conv_state.size() && c.conv_state[i]) cnt++;
+    }
+    if (cnt == 0) return true;
+
+    // Metadata-only context (no_alloc): tensor data is placed in a backend
+    // buffer below. A few extra tensor overheads are reserved as slack.
+    ggml_init_params ip{};
+    ip.mem_size = (cnt + 8) * ggml_tensor_overhead();
+    ip.mem_buffer = nullptr;
+    ip.no_alloc = true;
+    c.rollback_ctx = ggml_init(ip);
+    if (!c.rollback_ctx) { set_last_error("ensure_ssm_snapshot ggml_init failed"); return false; }
+
+    for (size_t i = 0; i < n; i++) {
+        char name[64];
+        if (c.ssm_state[i]) {
+            ggml_tensor * t = c.ssm_state[i];
+            ggml_tensor * sn = ggml_new_tensor(c.rollback_ctx, t->type, ggml_n_dims(t), t->ne);
+            std::snprintf(name, sizeof(name), "ssm_state_snap_%zu", i);
+            ggml_set_name(sn, name);
+            c.ssm_state_snap[i] = sn;
+        }
+        if (i < c.conv_state.size() && c.conv_state[i]) {
+            ggml_tensor * t = c.conv_state[i];
+            ggml_tensor * cn = ggml_new_tensor(c.rollback_ctx, t->type, ggml_n_dims(t), t->ne);
+            std::snprintf(name, sizeof(name), "conv_state_snap_%zu", i);
+            ggml_set_name(cn, name);
+            c.conv_state_snap[i] = cn;
+        }
+    }
+
+    c.rollback_buf = ggml_backend_alloc_ctx_tensors(c.rollback_ctx, backend);
+    if (!c.rollback_buf) {
+        set_last_error("ensure_ssm_snapshot alloc_ctx_tensors failed");
+        // The snapshot tensors were created inside rollback_ctx. Clear the
+        // pointers before freeing it, otherwise the _snap arrays would keep
+        // non-null pointers into freed memory.
+        c.ssm_state_snap.assign(n, nullptr);
+        c.conv_state_snap.assign(n, nullptr);
+        ggml_free(c.rollback_ctx);
+        c.rollback_ctx = nullptr;
+        return false;
+    }
+    return true;
+}
+
// ─── Helpers ─────────────────────────────────────────────────────────
static ggml_tensor * build_swiglu_ffn(ggml_context * ctx, ggml_tensor * cur,
diff --git a/server/src/qwen35moe/qwen35moe_backend.cpp b/server/src/qwen35moe/qwen35moe_backend.cpp
index 6455eac52..8b40be9fa 100644
--- a/server/src/qwen35moe/qwen35moe_backend.cpp
+++ b/server/src/qwen35moe/qwen35moe_backend.cpp
@@ -469,6 +469,7 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
if (is_eos_tok(first_tok, target_weights())) return true;
committed++;
target_cache().cur_pos = committed;
+ if (kvflash_active()) kvflash_history_.push_back(first_tok);
}
// ── Ensure persistent pipelined state (built once, reused) ──
@@ -487,11 +488,23 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
act_cur.data(), 0, sizeof(float) * (size_t)hidden);
const auto embed_done = DecodeClock::now();
+ // kvflash: physical pool slot for this token's KV rows (may evict).
+ int kv_slot = -1;
+ if (kvflash_active()) {
+ kv_slot = kvflash_pager_.slot_for(committed);
+ if (kv_slot < 0) {
+ std::fprintf(stderr, "[kvflash] pipelined decode: no slot at pos %d\n",
+ committed);
+ return false;
+ }
+ }
+
PipelinedDecodeTelemetry tel;
if (!pipelined_decode_one_token(*pipe_state_, target_backend(), target_weights(),
target_cache(), *target_weights().moe_hybrid,
committed, cfg_.kq_stride_pad,
- hybrid_telemetry_ ? &tel : nullptr)) {
+ hybrid_telemetry_ ? &tel : nullptr,
+ kv_slot)) {
return false;
}
const auto layers_done = DecodeClock::now();
@@ -563,6 +576,10 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
io.emit(next_tok);
committed++;
target_cache().cur_pos = committed;
+ if (kvflash_active()) {
+ kvflash_history_.push_back(next_tok);
+ kvflash_maybe_reselect((int)out_tokens.size());
+ }
if (io.cancelled) break;
if (is_eos_tok(next_tok, target_weights())) break;
}
@@ -721,6 +738,19 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req,
const int prompt_len = (int)req.prompt.size();
const int prefill_chunk = std::min(128, prompt_len); // batch size per GPU compute
+ // kvflash: hybrid prefill writes rows identity-mapped, so the prompt must
+ // fit the pool with one chunk of decode headroom (same contract as the
+ // base do_prefill).
+ if (kvflash_active() &&
+ prompt_len > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) {
+ std::fprintf(stderr,
+ "[kvflash] hybrid prompt (%d) exceeds pool %d; raise --kvflash "
+ "or enable pflash compression\n", prompt_len, kvflash_tokens_);
+ result.error = "kvflash: prompt exceeds resident pool";
+ cleanup_graphs();
+ return result;
+ }
+
// Embed all prompt tokens
const int n_expert_used = target_weights().n_expert_used;
std::vector embed_all((size_t)prompt_len * (size_t)hidden);
@@ -957,6 +987,9 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req,
int committed = prompt_len;
target_cache().cur_pos = committed;
+ if (kvflash_active()) {
+ kvflash_sync_prefill(committed, req.prompt, /*kv_offset=*/0);
+ }
auto t_prefill_end = std::chrono::steady_clock::now();
result.prefill_s = std::chrono::duration(t_prefill_end - t_prefill_start).count();
@@ -990,7 +1023,9 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req,
if (req.n_gen > 0) {
auto t_decode_start = std::chrono::steady_clock::now();
- // Check if hybrid spec-decode is available
+ // Hybrid spec-decode runs on the pool: hybrid_forward_batch is
+ // slot-mapped (verify and replay both route through it) and the
+ // recurrent-state rollback is ssm snapshot/restore (pool-neutral).
const bool can_hybrid_spec = !req.force_ar_decode
&& cfg_.draft_path
&& !is_draft_parked()
@@ -1021,7 +1056,8 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req,
target_cache().last_tok = first_tok;
cleanup_graphs();
- if (!do_hybrid_spec_decode(committed, req.n_gen, result.tokens, out_io)) {
+ if (!do_hybrid_spec_decode(committed, req.n_gen, result.tokens, out_io,
+ &result.accept_rate)) {
result.error = "hybrid_spec_decode";
return result;
}
@@ -1057,6 +1093,7 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req,
if (!is_eos_tok(first_tok, target_weights())) {
committed++;
target_cache().cur_pos = committed;
+ if (kvflash_active()) kvflash_history_.push_back(first_tok);
// Pipelined decode loop
PipelinedDecodeTelemetry decode_tel_accum{};
@@ -1071,11 +1108,23 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req,
ggml_backend_tensor_set_async(target_backend(), pipe_state_->gpu_state.act_cur,
act_cur.data(), 0, sizeof(float) * (size_t)hidden);
+ // kvflash: pool slot for this token's KV rows (may evict)
+ int kv_slot = -1;
+ if (kvflash_active()) {
+ kv_slot = kvflash_pager_.slot_for(committed);
+ if (kv_slot < 0) {
+ result.error = "kvflash_slot";
+ cleanup_graphs();
+ return result;
+ }
+ }
+
PipelinedDecodeTelemetry tel;
if (!pipelined_decode_one_token(*pipe_state_, target_backend(), target_weights(),
target_cache(), *target_weights().moe_hybrid,
committed, cfg_.kq_stride_pad,
- hybrid_telemetry_ ? &tel : nullptr)) {
+ hybrid_telemetry_ ? &tel : nullptr,
+ kv_slot)) {
result.error = "decode";
cleanup_graphs();
return result;
@@ -1133,6 +1182,10 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req,
out_io.emit(next_tok);
committed++;
target_cache().cur_pos = committed;
+ if (kvflash_active()) {
+ kvflash_history_.push_back(next_tok);
+ kvflash_maybe_reselect((int)result.tokens.size());
+ }
if (out_io.cancelled) break;
if (is_eos_tok(next_tok, target_weights())) break;
}
@@ -1295,6 +1348,32 @@ GenerateResult Qwen35MoeBackend::restore_and_generate_impl(int slot,
return result;
}
+ // kvflash: the restored prefix + delta prefill land identity-mapped, so
+ // the full prompt must fit the pool (snapshots past the pool are never
+ // saved, but the delta can still overflow it).
+ if (kvflash_active() &&
+ prompt_len > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) {
+ std::fprintf(stderr,
+ "[kvflash] hybrid restore prompt (%d) exceeds pool %d; raise "
+ "--kvflash\n", prompt_len, kvflash_tokens_);
+ result.error = "kvflash: prompt exceeds resident pool";
+ out_io.emit(-1);
+ return result;
+ }
+
+ // kvflash: the delta prefill below runs the maskless pipelined forward
+ // over the padded pool span; map the restored prefix identity-style and
+ // zero stale free slots BEFORE any forward reads them.
+ if (kvflash_active()) {
+ kvflash_pager_.reset();
+ if (!kvflash_pager_.alloc_span(0, snap_pos)) {
+ result.error = "kvflash_slot";
+ out_io.emit(-1);
+ return result;
+ }
+ kvflash_pager_.zero_free_blocks();
+ }
+
const int hidden = target_weights().n_embd;
std::vector act_cur((size_t)hidden);
if (prompt_len > snap_pos) {
@@ -1314,6 +1393,17 @@ GenerateResult Qwen35MoeBackend::restore_and_generate_impl(int slot,
std::chrono::steady_clock::now() - t_prefill_start).count();
}
+ if (kvflash_active()) {
+ // Rebuild the pager mapping over the identity-mapped [0, committed).
+ // With the full prompt available the history carries real ids;
+ // restore-only generates keep an unknown-prefix history.
+ if (prompt_len == committed) {
+ kvflash_sync_prefill(committed, req.prompt, /*kv_offset=*/0);
+ } else {
+ kvflash_sync_prefill(committed, {}, /*kv_offset=*/committed);
+ }
+ }
+
if (req.n_gen > 0) {
if (target_cache().last_tok < 0) {
std::fprintf(stderr,
@@ -1457,6 +1547,29 @@ bool Qwen35MoeBackend::hybrid_forward_batch(
}
}
+ // kvflash: allocate the block's slots up front (may evict) and build
+ // the slot-mapped write rows + slot-space mask once; every layer's
+ // graph gets the same fills (verify and replay both land here, so all
+ // hybrid-spec KV writes are pool-routed).
+ const bool kvf = kvflash_active();
+ std::vector kvf_rows;
+ std::vector kvf_mask;
+ std::vector kvf_slots;
+ if (kvf) {
+ if (!kvflash_pager_.alloc_span(base_pos, n_tokens)) return false;
+ kvf_slots.resize((size_t)n_tokens);
+ for (int i = 0; i < n_tokens; ++i) {
+ kvf_slots[(size_t)i] = kvflash_pager_.slot_of(base_pos + i);
+ }
+ // [n_tokens, n_head_kv] ne0-major (see verify_batch).
+ kvf_rows.resize((size_t)n_tokens * target_weights().n_head_kv);
+ for (int h = 0; h < target_weights().n_head_kv; ++h) {
+ for (int i = 0; i < n_tokens; ++i) {
+ kvf_rows[(size_t)h * n_tokens + i] = kvf_slots[(size_t)i];
+ }
+ }
+ }
+
// Process layer-by-layer (same as prefill)
StepGraph prefn_sg;
ggml_gallocr_t ffn_hot_alloc = nullptr;
@@ -1466,17 +1579,23 @@ bool Qwen35MoeBackend::hybrid_forward_batch(
for (int il = 0; il < n_layer; ++il) {
auto & storage = target_weights().moe_hybrid->layers[(size_t)il];
- const bool with_mask = (cfg_.kq_stride_pad > KQ_MASK_PAD) || (n_tokens > 1);
+ const bool with_mask = kvf ||
+ (cfg_.kq_stride_pad > KQ_MASK_PAD) || (n_tokens > 1);
// Build pre-FFN graph (DeltaNet/attention + router) for all tokens
step_graph_free(prefn_sg);
if (!build_layer_prefn_step(prefn_sg, target_weights(), target_cache(), target_backend(),
il, /*kv_start=*/base_pos, n_tokens,
- with_mask, /*fa_window=*/0, cfg_.kq_stride_pad)) {
+ with_mask, /*fa_window=*/0, cfg_.kq_stride_pad,
+ /*kvflash=*/kvf)) {
step_graph_destroy(prefn_sg);
if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc);
return false;
}
+ if (prefn_sg.kv_write_rows) {
+ ggml_backend_tensor_set(prefn_sg.kv_write_rows, kvf_rows.data(), 0,
+ sizeof(int64_t) * kvf_rows.size());
+ }
// Upload embeddings
ggml_backend_tensor_set(prefn_sg.inp_embed, embed_all.data(), 0,
@@ -1496,7 +1615,36 @@ bool Qwen35MoeBackend::hybrid_forward_batch(
}
// Set causal mask
- if (prefn_sg.attn_mask) {
+ if (prefn_sg.attn_mask && kvf) {
+ // Slot-space mask (verify_batch recipe): committed resident
+ // positions (< base_pos) plus this block's own slots, causal.
+ // Built once, reused for every layer's graph.
+ if (kvf_mask.empty()) {
+ constexpr uint16_t F16_ZERO = 0x0000, F16_NEG_INF = 0xFC00;
+ const size_t kvd = (size_t)prefn_sg.attn_mask->ne[0];
+ const int q_pad = (int)prefn_sg.attn_mask->ne[1];
+ kvf_mask.assign(kvd * q_pad, F16_NEG_INF);
+ const int ct = kvflash_pager_.chunk_tokens();
+ for (int c = 0; c < kvflash_pager_.n_chunks(); c++) {
+ const int blk = kvflash_pager_.block_of(c);
+ if (blk < 0) continue;
+ for (int i = 0; i < ct; i++) {
+ if ((int64_t)c * ct + i >= base_pos) break;
+ kvf_mask[(size_t)blk * ct + i] = F16_ZERO;
+ }
+ }
+ for (int q = 1; q < n_tokens; q++) {
+ std::memcpy(kvf_mask.data() + (size_t)q * kvd, kvf_mask.data(), kvd * 2);
+ }
+ for (int q = 0; q < n_tokens; q++) {
+ for (int i = 0; i <= q; i++) {
+ kvf_mask[(size_t)q * kvd + kvf_slots[(size_t)i]] = F16_ZERO;
+ }
+ }
+ }
+ ggml_backend_tensor_set(prefn_sg.attn_mask, kvf_mask.data(), 0,
+ sizeof(uint16_t) * kvf_mask.size());
+ } else if (prefn_sg.attn_mask) {
const int kv_len = base_pos + n_tokens;
const int kv_pad_override = (int)prefn_sg.attn_mask->ne[0];
std::vector mask_buf;
@@ -1542,14 +1690,27 @@ bool Qwen35MoeBackend::hybrid_forward_batch(
std::vector ffn_batch_out;
bool ffn_ok = false;
- if (storage.cold_expert_ids.empty()) {
- // All-hot: use batched hot-only path
+ // Spark expert cache: pull the verify batch's selected cold experts into
+ // spare GPU slots (LRU) so the batched FFN serves them on-die — the SAME
+ // residency mechanism the AR pipelined path uses. Without this the verify
+ // re-evaluated cold experts on the CPU every step, which dominated its FFN
+ // time (the spec-decode-with-offloading inefficiency). After warmup the
+ // working set is resident and the CPU cold path is rarely taken.
+ const int n_route_slots = n_tokens * n_expert_used;
+ if (storage.cache_slots > 0 && !storage.cold_expert_ids.empty()) {
+ for (int i = 0; i < n_route_slots; ++i)
+ dflash::common::moe_hybrid_cache_swap_in(storage, chunk_selected[(size_t)i], target_backend());
+ }
+ const bool routed_all_hot = storage.cold_expert_ids.empty()
+ || storage.all_routed_are_hot(chunk_selected.data(), n_route_slots);
+ if (routed_all_hot) {
+ // All routed experts resident on GPU: fast batched hot-only path.
ffn_ok = eval_moe_hot_only_batched(
target_backend(), chunk_cfg, chunk_desc, storage,
chunk_post.data(), chunk_selected.data(), chunk_weights.data(),
n_tokens, ffn_batch_out, nullptr, &ffn_hot_alloc);
} else {
- // Mixed hot/cold: use hybrid path
+ // Cache full / residue still cold: hybrid path (remaining cold on CPU).
ffn_ok = eval_moe_hybrid_ffn_batched(
target_backend(), target_weights().moe_hybrid->cpu_backend,
chunk_cfg, chunk_desc, storage,
@@ -1619,29 +1780,13 @@ bool Qwen35MoeBackend::hybrid_forward_batch(
act_cur.assign(embed_all.data() + (size_t)(n_tokens - 1) * (size_t)hidden,
embed_all.data() + (size_t)n_tokens * (size_t)hidden);
- // Project ALL tokens to logits and get argmax for each
- const int vocab = target_weights().n_vocab;
+ // Project ALL tokens to logits and argmax ON THE GPU, reading back only
+ // n_tokens token ids instead of vocab*n_tokens floats. The host logits
+ // readback + host argmax was a large per-step D2H cost in the verify and
+ // replay forwards (vocab ~152k x n_tokens x 4B, twice per spec step).
argmax_out.resize(n_tokens);
-
StepGraph proj_sg;
- ggml_init_params ip{};
- ip.mem_size = 64 * 1024 * 1024;
- ip.mem_buffer = nullptr;
- ip.no_alloc = true;
- proj_sg.ctx = ggml_init(ip);
- if (!proj_sg.ctx) return false;
-
- proj_sg.hidden_input = ggml_new_tensor_2d(proj_sg.ctx, GGML_TYPE_F32, hidden, n_tokens);
- ggml_set_input(proj_sg.hidden_input);
- proj_sg.gf = ggml_new_graph_custom(proj_sg.ctx, 1024, false);
- ggml_tensor * normed = ggml_rms_norm(proj_sg.ctx, proj_sg.hidden_input, target_weights().rms_eps);
- normed = ggml_mul(proj_sg.ctx, normed, target_weights().out_norm);
- proj_sg.logits = ggml_mul_mat(proj_sg.ctx, target_weights().output, normed);
- ggml_set_output(proj_sg.logits);
- ggml_build_forward_expand(proj_sg.gf, proj_sg.logits);
- proj_sg.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(target_backend()));
- if (!ggml_gallocr_alloc_graph(proj_sg.alloc, proj_sg.gf)) {
- step_graph_destroy(proj_sg);
+ if (!build_lm_head_projection_step(proj_sg, target_weights(), target_backend(), n_tokens)) {
return false;
}
ggml_backend_tensor_set(proj_sg.hidden_input, embed_all.data(), 0,
@@ -1651,35 +1796,36 @@ bool Qwen35MoeBackend::hybrid_forward_batch(
step_graph_destroy(proj_sg);
return false;
}
-
- // Read logits and compute argmax per token
- std::vector logits_buf((size_t)vocab * (size_t)n_tokens);
- ggml_backend_tensor_get(proj_sg.logits, logits_buf.data(), 0,
- sizeof(float) * logits_buf.size());
+ ggml_backend_tensor_get(proj_sg.argmax_tokens, argmax_out.data(), 0,
+ sizeof(int32_t) * (size_t)n_tokens);
step_graph_destroy(proj_sg);
-
- for (int t = 0; t < n_tokens; ++t) {
- const float * tok_logits = logits_buf.data() + (size_t)t * (size_t)vocab;
- int32_t best_id = 0;
- float best_val = tok_logits[0];
- for (int j = 1; j < vocab; ++j) {
- if (tok_logits[j] > best_val) {
- best_val = tok_logits[j];
- best_id = j;
- }
- }
- argmax_out[t] = best_id;
- }
return true;
}
bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen,
std::vector & out_tokens,
- const DaemonIO & io) {
+ const DaemonIO & io,
+ float * accept_rate_out) {
const int hidden = target_weights().n_embd;
const int q_len = draft_weights().block_size;
if (q_len <= 0) return false;
+ // Verify width: cap how many draft tokens we actually verify. The batched
+ // verify's cost is dominated by the distinct experts its tokens touch
+ // (especially under --spark expert offload, where extra tokens stream extra
+ // cold experts). Tokens past the realized accept length are wasted, so
+ // capping the verify to a width above the typical accept length cuts that
+ // waste at no acceptance cost. Default = full draft block; tune via env.
+ // Verify-width control (see note above). DFLASH_VERIFY_WIDTH pins a fixed
+ // width; otherwise the width adapts to the realized accept length so chain
+ // decoding (low AL) verifies just a few tokens (cheap, especially under
+ // expert offload) while a high-AL draft still gets enough width.
+ const int forced_verify_width = [&]{
+ const char * e = std::getenv("DFLASH_VERIFY_WIDTH");
+ return e ? std::max(1, std::min(q_len, std::atoi(e))) : 0;
+ }();
+ int observed_max_accept = 1;
+
int32_t last_tok = target_cache().last_tok;
std::vector act_cur((size_t)hidden);
@@ -1696,10 +1842,22 @@ bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen,
int n_draft_steps = 0;
int n_accept_sum = 0;
+ // Allocate DeltaNet rollback snapshot tensors (no-op if already present).
+ // Without these, snapshot_ssm_state/restore_ssm_state silently do nothing
+ // and rejected draft tokens leak into the recurrent state, collapsing output.
+ if (!ensure_ssm_snapshot(target_cache(), target_backend())) {
+ std::fprintf(stderr, "[hybrid-spec] ensure_ssm_snapshot failed\n");
+ step_graph_destroy(draft_sg);
+ return false;
+ }
+
auto t_dec0 = std::chrono::steady_clock::now();
while (n_generated < n_gen) {
const int need_commit_budget = n_gen - n_generated;
+ const int verify_width = forced_verify_width > 0
+ ? forced_verify_width
+ : std::min(q_len, std::max(6, observed_max_accept + 2));
// 1. Build noise input for draft
noise_ids[0] = last_tok;
@@ -1785,9 +1943,9 @@ bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen,
// 4. Verify: snapshot recurrent state, then run ALL draft tokens batched
snapshot_ssm_state(target_cache());
- target_tok.resize(q_len);
+ target_tok.resize(verify_width);
bool verify_ok = hybrid_forward_batch(
- draft_tok.data(), q_len, committed,
+ draft_tok.data(), verify_width, committed,
act_cur, target_tok, /*capture_features=*/false);
if (!verify_ok) {
std::fprintf(stderr, "[hybrid-spec] verify failed\n");
@@ -1798,11 +1956,12 @@ bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen,
// 5. Acceptance: longest matching prefix
int accept_n = 1;
- for (int i = 0; i < q_len - 1; i++) {
+ for (int i = 0; i < verify_width - 1; i++) {
if (draft_tok[i + 1] == target_tok[i]) accept_n++;
else break;
}
- int bonus_tok = (accept_n < q_len) ? target_tok[accept_n - 1] : -1;
+ int bonus_tok = (accept_n < verify_width) ? target_tok[accept_n - 1] : -1;
+ observed_max_accept = std::max(observed_max_accept, accept_n);
int commit_n = accept_n + (bonus_tok >= 0 ? 1 : 0);
if (commit_n > need_commit_budget) {
commit_n = need_commit_budget;
@@ -1859,6 +2018,10 @@ bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen,
const double decode_s = std::chrono::duration(t_dec1 - t_dec0).count();
const int total_draft_pos = std::max(1, n_draft_steps * q_len);
const double accept_pct = 100.0 * (double)n_accept_sum / (double)total_draft_pos;
+ if (accept_rate_out) {
+ *accept_rate_out = total_draft_pos > 0
+ ? (float)((double)n_accept_sum / (double)total_draft_pos) : 0.0f;
+ }
std::fprintf(stderr, "[hybrid-spec] tokens=%d time=%.3f s speed=%.2f tok/s "
"steps=%d accepted=%d/%d (%.1f%%) avg_commit=%.2f AL=%.2f\n",
n_generated, decode_s,
diff --git a/server/src/qwen35moe/qwen35moe_backend.h b/server/src/qwen35moe/qwen35moe_backend.h
index ca154e405..d2f711a4c 100644
--- a/server/src/qwen35moe/qwen35moe_backend.h
+++ b/server/src/qwen35moe/qwen35moe_backend.h
@@ -61,7 +61,8 @@ class Qwen35MoeBackend : public Qwen35Backend {
// verify via hybrid forward (layer-by-layer with hot/cold FFN).
bool do_hybrid_spec_decode(int committed, int n_gen,
std::vector & out_tokens,
- const DaemonIO & io);
+ const DaemonIO & io,
+ float * accept_rate_out = nullptr);
// Run one token through hybrid forward, capturing features at capture layers.
// Returns the logits argmax token. Advances committed by 1.
diff --git a/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp b/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp
index 72cb03975..bfd4df479 100644
--- a/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp
+++ b/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp
@@ -314,12 +314,16 @@ bool pipelined_decode_one_token(
MoeHybridStorage & hybrid,
int kv_pos,
int kq_stride_pad,
- PipelinedDecodeTelemetry * tel) {
+ PipelinedDecodeTelemetry * tel,
+ int kv_slot) {
const int n_layer = state.n_layer;
const int n_embd = state.n_embd;
const int n_expert_used = state.n_expert_used;
ggml_backend_t cpu_be = hybrid.cpu_backend;
+ // Physical KV row for this token: kvflash pool slot, or the logical
+ // position itself. positions (RoPE) always carry the logical kv_pos.
+ const int kv_row = kv_slot >= 0 ? kv_slot : kv_pos;
if (tel) {
*tel = PipelinedDecodeTelemetry{};
@@ -503,7 +507,12 @@ bool pipelined_decode_one_token(
bool attn_cached_ok = false;
if (is_attn && !g_no_kvpad) {
auto & cpg = state.cached_prefn[(size_t)il];
- const int kv_win_needed = ((kv_pos + 1) + 255) & ~255;
+ // Clamp the baked FA span to the cache tensor's physical capacity:
+ // with kvflash the tensors are pool-sized, so the window stops
+ // growing at the pool (and the cached graph never rebuilds again).
+ const int kv_phys = (int)cache.attn_k[0]->ne[1];
+ const int kv_win_needed =
+ std::min(((kv_pos + 1) + 255) & ~255, kv_phys);
if (!cpg.valid() || cpg.kv_win < kv_win_needed) {
if (!build_cached_attn_prefn(cpg, backend, w, cache, il,
kv_win_needed, kq_stride_pad)) {
@@ -519,7 +528,7 @@ bool pipelined_decode_one_token(
ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.act_cur, cpg.inp_embed);
int32_t pos4[4] = {kv_pos, kv_pos, kv_pos, 0};
ggml_backend_tensor_set_async(backend, cpg.positions, pos4, 0, sizeof(pos4));
- std::vector row_vals((size_t)w.n_head_kv, (int64_t)kv_pos);
+ std::vector row_vals((size_t)w.n_head_kv, (int64_t)kv_row);
ggml_backend_tensor_set_async(backend, cpg.kv_write_rows, row_vals.data(), 0,
sizeof(int64_t) * row_vals.size());
@@ -536,7 +545,16 @@ bool pipelined_decode_one_token(
moe_weights_tensor = cpg.moe_weights;
} else if (is_attn || !state.cached_prefn[(size_t)il].valid()) {
// Attention layer (legacy/fallback) OR failed DeltaNet cache:
- // rebuild graph dynamically
+ // rebuild graph dynamically. The legacy path writes KV at the
+ // literal view offset kv_pos and cannot express a pool slot —
+ // refuse instead of corrupting the pool / running off its end.
+ if (is_attn && kv_slot >= 0) {
+ std::fprintf(stderr,
+ "[pipelined] kvflash requires the cached set_rows attn path "
+ "(layer %d cached-graph build failed)\n", il);
+ step_graph_destroy(dyn_sg);
+ return false;
+ }
if (!build_layer_prefn_step(dyn_sg, w, cache, backend,
il, kv_pos, /*n_tokens=*/1,
/*with_mask=*/false, /*fa_window=*/0, kq_stride_pad)) {
diff --git a/server/src/qwen35moe/qwen35moe_pipelined_decode.h b/server/src/qwen35moe/qwen35moe_pipelined_decode.h
index ae35c775f..64d3b6bab 100644
--- a/server/src/qwen35moe/qwen35moe_pipelined_decode.h
+++ b/server/src/qwen35moe/qwen35moe_pipelined_decode.h
@@ -197,14 +197,18 @@ bool init_pipelined_decode_state(
// Run one full token through the pipelined decode loop (all n_layer layers).
// On success, gpu_state.act_cur holds the final hidden state on GPU.
// selected_ids_out / weights_out: optional per-layer routing capture for telemetry.
+// kv_slot: physical KV row to write (kvflash pool slot); -1 = kv_pos (identity,
+// no pool). The FA span clamps to the cache tensor's physical capacity, so
+// pool-sized tensors bound the cached-graph window automatically.
bool pipelined_decode_one_token(
PipelinedDecodeState & state,
ggml_backend_t backend,
const TargetWeights & w,
TargetCache & cache,
MoeHybridStorage & hybrid,
- int kv_pos, // current KV position
+ int kv_pos, // current KV position (logical; drives RoPE)
int kq_stride_pad,
- PipelinedDecodeTelemetry * telemetry = nullptr);
+ PipelinedDecodeTelemetry * telemetry = nullptr,
+ int kv_slot = -1);
} // namespace dflash::common
diff --git a/server/src/server/server_main.cpp b/server/src/server/server_main.cpp
index bbe274dbc..36c28e400 100644
--- a/server/src/server/server_main.cpp
+++ b/server/src/server/server_main.cpp
@@ -408,6 +408,33 @@ int main(int argc, char ** argv) {
bargs.fast_rollback = true;
} else if (std::strcmp(argv[i], "--ddtree-budget") == 0 && i + 1 < argc) {
bargs.ddtree_budget = std::atoi(argv[++i]);
+ } else if (std::strcmp(argv[i], "--kvflash") == 0 && i + 1 < argc) {
+ // Bounded KV residency: attention KV lives in a fixed pool of N
+ // tokens; cold 64-token chunks page to host. Works with or
+ // without pflash (drafter becomes the reselect scorer when
+ // loaded; plain LRU otherwise). Forces AR decode.
+ ++i;
+ if (std::strcmp(argv[i], "auto") != 0 && std::atoi(argv[i]) <= 0) {
+ std::fprintf(stderr, "--kvflash expects a positive token count or "
+ "'auto', got '%s'\n", argv[i]);
+ return 1;
+ }
+ ::setenv("DFLASH_KVFLASH", argv[i], 1);
+ } else if (std::strcmp(argv[i], "--kvflash-policy") == 0 && i + 1 < argc) {
+ ++i;
+ if (std::strcmp(argv[i], "drafter") != 0 && std::strcmp(argv[i], "lru") != 0) {
+ std::fprintf(stderr, "--kvflash-policy expects 'drafter' or 'lru', got '%s'\n",
+ argv[i]);
+ return 1;
+ }
+ ::setenv("DFLASH_KVFLASH_POLICY", argv[i], 1);
+ } else if (std::strcmp(argv[i], "--kvflash-tau") == 0 && i + 1 < argc) {
+ if (std::atoi(argv[++i]) <= 0) {
+ std::fprintf(stderr, "--kvflash-tau expects a positive interval, got '%s'\n",
+ argv[i]);
+ return 1;
+ }
+ ::setenv("DFLASH_KVFLASH_TAU", argv[i], 1);
} else if (std::strcmp(argv[i], "--spark") == 0) {
spark_autotune = true;
} else if (std::strcmp(argv[i], "--spark-slots") == 0 && i + 1 < argc) {
@@ -459,6 +486,9 @@ int main(int argc, char ** argv) {
sconfig.pflash_keep_ratio = (float)std::atof(argv[++i]);
} else if (std::strcmp(argv[i], "--prefill-drafter") == 0 && i + 1 < argc) {
sconfig.pflash_drafter_path = argv[++i];
+ // kvflash reads this to lazy-attach the drafter as its
+ // residency scorer even when prefill compression is off.
+ ::setenv("DFLASH_KVFLASH_DRAFTER", argv[i], 1);
} else if (std::strcmp(argv[i], "--prefill-skip-park") == 0) {
sconfig.pflash_skip_park = true;
} else if (std::strcmp(argv[i], "--prefill-upstream-base") == 0 && i + 1 < argc) {
diff --git a/server/test/test_kvflash.cpp b/server/test/test_kvflash.cpp
new file mode 100644
index 000000000..3f3634ac6
--- /dev/null
+++ b/server/test/test_kvflash.cpp
@@ -0,0 +1,1082 @@
+// test_kvflash — verifies KVFlash, the bounded-resident-pool KV cache
+// (kvflash_pager.h).
+//
+// Runs against one loaded qwen35 target:
+//
+// A baseline: cache at LOGICAL context (default 131072), maskless decode
+// (production AR path shape). Reference tokens + baseline KV memory.
+// B relocation proof: small pool, chunks at SHUFFLED physical blocks,
+// explicit pool slot mask, teacher-forced replay of A. Argmax must
+// track A (position-independence + mask exactness).
+// C paging proof: pool ≪ prompt+gen, live eviction, bit-exact
+// page_out/page_in roundtrip, KV bytes vs A.
+// D reselect/recall: evicted chunk recalled via score_hook + reselect()
+// (the FlashMemory τ-step lookahead machinery); decode continues.
+// E performance profile: decode ms/step vs FA span — baseline at
+// 8K/32K/128K vs pool 1K/4K at 128K-logical — plus page-event and
+// mask-refill microbenchmarks.
+//
+// Usage:
+// test_kvflash <target.gguf> [--logical-ctx=N] [--pool-b=N] [--pool-c=N]
+// [--prompt=N] [--gen=N] [--skip-profile] [--no-mask]
+// modes: (default) verification suite A-F | --niah | --niah256 | --longab
+
+#include "dflash27b.h"
+#include "internal.h"
+#include "kvflash_pager.h"
+#include "attn_masks.h"
+#include "qwen3_drafter.h"
+#include "qwen3_kvflash_scorer.h"
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml-cuda.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace dflash::common;
+
+namespace {
+
+double now_ms() { // Current std::chrono::steady_clock reading as a double; only the difference between two calls is meaningful.
+ return std::chrono::duration( // NOTE(review): unit is presumably milliseconds per the function name; the duration's template arguments are not visible in this chunk
+ std::chrono::steady_clock::now().time_since_epoch()).count();
+}
+
+size_t kv_cache_bytes(const TargetCache & c) { // Total byte size of the attention K and V tensors held by cache c.
+ size_t n = 0;
+ for (auto * t : c.attn_k) if (t) n += ggml_nbytes(t); // null entries are skipped
+ for (auto * t : c.attn_v) if (t) n += ggml_nbytes(t); // same accumulation for the V tensors
+ return n;
+}
+
+size_t vram_used_now() { // Bytes currently in use on CUDA device 0, computed as total minus free.
+ size_t free_b = 0, total_b = 0;
+ ggml_backend_cuda_get_device_memory(0, &free_b, &total_b); // device index 0 is hard-coded, the same index main passes to ggml_backend_cuda_init
+ return total_b - free_b; // NOTE(review): a device-wide figure, so it presumably includes memory held by other processes — confirm before treating deltas as exact
+}
+
+// Single-token stepper over build_qwen35_graph with explicit control of:
+// * kv_write_rows — physical pool slot for the KV append
+// * positions — logical position (M-RoPE)
+// * span — FA window length (kv_start = span-1 in graph terms)
+// * attn_mask — optional [align32(span_padded), 32] f16 slot mask
+//
+// The graph arena and gallocr persist across rebuilds (same trick as
+// build_target_step) so identical topology lands at identical addresses
+// and the ggml-cuda CUDA-graph cache can replay decode steps.
+struct Stepper { // Single-token decode stepper: one ggml graph over build_qwen35_graph plus its input tensors and host staging buffers.
+ ggml_context * ctx = nullptr; // graph-construction context; freed and re-created by every build()
+ ggml_cgraph * gf = nullptr; // compute graph executed by step()
+ ggml_gallocr_t alloc = nullptr; // created on the first build() and reused by later rebuilds
+ ggml_tensor * inp_embed = nullptr; // F32 [n_embd, 1, 1] input: embedding of the current token
+ ggml_tensor * positions = nullptr; // I32 [4] input; step() writes {pos, pos, pos, 0}
+ ggml_tensor * attn_mask = nullptr; // F16 input mask; stays null when with_mask is false
+ ggml_tensor * kv_write_rows = nullptr; // I64 [1, n_head_kv] input: KV row written for each head
+ ggml_tensor * logits = nullptr; // logits output returned by build_qwen35_graph
+ ggml_tensor * argmax_tokens = nullptr; // ggml_argmax over logits; the only value step() reads back
+
+ const TargetWeights * w = nullptr; // borrowed in init(); destroy() does not free it
+ TargetCache * cache = nullptr; // borrowed in init(); destroy() does not free it
+ ggml_backend_t backend = nullptr; // borrowed in init(); destroy() does not free it
+ int span = 0; // FA window length; build() passes span - 1 as kv_start
+ bool with_mask = false; // whether build() creates attn_mask
+
+ std::vector arena; // backing memory handed to ggml_init as mem_buffer
+ std::vector embed_buf; // host staging for one token embedding (n_embd elements)
+ std::vector mask_buf; // host copy of the mask uploaded by refresh_mask()
+ uint64_t mask_epoch = (uint64_t)-1; // pager epoch of the last host rebuild; the initial value presumably never equals a real epoch, forcing the first rebuild
+ double mask_fill_ms_total = 0.0; // time accumulated inside refresh_mask(), uploads included
+ int mask_fills = 0; // number of host-side mask rebuilds (not uploads)
+
+ bool init(const TargetWeights & tw, TargetCache & tc, ggml_backend_t be,
+ int span_, bool with_mask_) { // Binds model/cache/backend, sizes the host buffers, then builds the graph; returns build()'s result.
+ w = &tw; cache = &tc; backend = be;
+ span = span_; with_mask = with_mask_;
+ embed_buf.resize(tw.n_embd);
+ arena.resize((size_t)512 * 1024 * 1024); // 512 Mi elements; build() passes arena.size() as ip.mem_size, so presumably bytes (element type not visible here)
+ return build();
+ }
+
+ bool build() { // (Re)builds the single-token graph inside arena and allocates it; returns false if ggml_init, graph construction or allocation fails.
+ if (ctx) { ggml_free(ctx); ctx = nullptr; } // drop the previous graph context before reusing arena
+ ggml_init_params ip{};
+ ip.mem_size = arena.size();
+ ip.mem_buffer = arena.data();
+ ip.no_alloc = true; // tensor data is not placed in arena; the gallocr call at the end allocates the graph
+ ctx = ggml_init(ip);
+ if (!ctx) return false;
+
+ const int hidden = w->n_embd;
+ inp_embed = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hidden, 1, 1);
+ ggml_set_input(inp_embed);
+ positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
+ ggml_set_input(positions);
+ kv_write_rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I64, 1, w->n_head_kv);
+ ggml_set_input(kv_write_rows);
+
+ attn_mask = nullptr;
+ if (with_mask) {
+ // FA span is padded to 256 on the step-invariant path; the mask
+ // kv dim must cover it.
+ const int span_padded = std::min(((span + 255) / 256) * 256,
+ (int)cache->attn_k[0]->ne[1]); // never wider than the cache tensor's ne[1]
+ attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F16,
+ align_up(span_padded, KQ_MASK_PAD),
+ align_up(1, KQ_MASK_PAD));
+ ggml_set_input(attn_mask);
+ mask_buf.assign((size_t)attn_mask->ne[0] * attn_mask->ne[1], F16_NEG_INF); // start fully masked
+ mask_epoch = (uint64_t)-1; // invalidate so the next refresh_mask() refills mask_buf
+ }
+
+ gf = ggml_new_graph_custom(ctx, 16384, false); // graph size 16384, grads argument false
+
+ QwenGraphInputs gi{};
+ gi.inp_embed = inp_embed;
+ gi.positions = positions;
+ gi.attn_mask = attn_mask;
+ gi.n_tokens = 1;
+ gi.kv_start = span - 1; // with n_tokens = 1, kv_start + n_tokens equals span
+ gi.capture_layers = false;
+ gi.kv_write_rows = kv_write_rows;
+
+ QwenGraphOutputs go = build_qwen35_graph(ctx, gf, *w, *cache, gi);
+ if (!go.logits) return false; // ctx stays allocated on this path; destroy() or the next build() frees it
+ logits = go.logits;
+ ggml_set_output(logits);
+ argmax_tokens = ggml_argmax(ctx, logits);
+ ggml_set_output(argmax_tokens);
+ ggml_build_forward_expand(gf, argmax_tokens); // argmax depends on logits, so this adds the forward pass to gf
+
+ if (!alloc) alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); // keep one allocator across rebuilds
+ return ggml_gallocr_alloc_graph(alloc, gf);
+ }
+
+ void refresh_mask(const KvFlashPager & pager) { // Refills mask_buf when pager.epoch() changed, then uploads it; no-op when there is no attn_mask.
+ if (!attn_mask) return;
+ const double t0 = now_ms();
+ if (pager.epoch() != mask_epoch) {
+ // Host-side rebuild only on residency change.
+ std::fill(mask_buf.begin(), mask_buf.end(), F16_NEG_INF);
+ pager.fill_slot_mask(mask_buf.data());
+ mask_epoch = pager.epoch();
+ mask_fills++;
+ }
+ // Upload EVERY step: the compute-buffer region backing this input
+ // tensor is reused by graph execution, so a stale upload reads as
+ // garbage (NaN logits) on the next step. Production prefill
+ // re-uploads its mask before every compute for the same reason.
+ ggml_backend_tensor_set(attn_mask, mask_buf.data(), 0,
+ mask_buf.size() * sizeof(uint16_t));
+ mask_fill_ms_total += now_ms() - t0; // covers both the optional rebuild and the upload
+ }
+
+ int32_t step(int32_t tok, int pos, int phys_slot) { // Runs one token: tok at logical position pos, KV written to row phys_slot; returns the argmax token id. Exits the process on embed or compute failure.
+ if (!w->embedder.embed(&tok, 1, embed_buf.data())) {
+ std::fprintf(stderr, "embed failed: tok=%d pos=%d (NaN logits upstream?)\n", tok, pos);
+ std::exit(1);
+ }
+ ggml_backend_tensor_set(inp_embed, embed_buf.data(), 0,
+ sizeof(float) * embed_buf.size());
+ int32_t p4[4] = { pos, pos, pos, 0 }; // logical position in the first three entries, 0 in the fourth
+ ggml_backend_tensor_set(positions, p4, 0, sizeof(int32_t) * 4);
+ std::vector rows(w->n_head_kv, (int64_t)phys_slot); // same physical row for every KV head
+ ggml_backend_tensor_set(kv_write_rows, rows.data(), 0,
+ sizeof(int64_t) * rows.size());
+ if (ggml_backend_graph_compute(backend, gf) != GGML_STATUS_SUCCESS) {
+ std::fprintf(stderr, "graph_compute failed pos=%d\n", pos);
+ std::exit(1);
+ }
+ int32_t next = 0;
+ ggml_backend_tensor_get(argmax_tokens, &next, 0, sizeof(int32_t)); // read back a single int32 token id
+ return next;
+ }
+
+ void destroy() { // Frees the allocator and the context; repeat calls are harmless because both pointers are nulled.
+ if (alloc) { ggml_gallocr_free(alloc); alloc = nullptr; }
+ if (ctx) { ggml_free(ctx); ctx = nullptr; }
+ }
+};
+
+std::vector make_prompt(int n, int vocab) { // Deterministic pseudo-random prompt of n token ids; the seed is fixed, so equal arguments yield equal ids.
+ std::vector p(n);
+ uint64_t s = 0x9E3779B97F4A7C15ull; // fixed seed
+ // Cap below the drafter vocab too (Qwen3-0.6B ~151K) so the same ids
+ // are scoreable by the indexer in run F.
+ const int cap = std::min(vocab, 100000);
+ for (int i = 0; i < n; i++) {
+ s = s * 6364136223846793005ull + 1442695040888963407ull; // multiply-add update of the 64-bit state (unsigned, so it wraps)
+ p[i] = (int32_t)(1000 + (s >> 33) % (uint64_t)(cap / 2)); // ids land in [1000, 1000 + cap/2); NOTE(review): cap / 2 is 0 when vocab < 2 (modulo by zero) — confirm callers never pass that
+ }
+ return p;
+}
+
+// Pooled chunked prefill: 64-token (one pager chunk) batched forwards with
+// slot-mapped set_rows writes and a resident+causal mask. This is the
+// prompt > pool path: prefill evicts like decode does. Graph is built once
+// (fixed topology) and reused for every chunk.
+struct BatchStepper { // Fixed-shape NB-token batched forward: the graph is built once in init() and re-run by step_chunk() for every chunk.
+ ggml_context * ctx = nullptr; // graph-construction context created in init()
+ ggml_cgraph * gf = nullptr; // compute graph executed by step_chunk()
+ ggml_gallocr_t alloc = nullptr; // graph allocator created in init()
+ ggml_tensor * inp_embed = nullptr; // F32 [n_embd, NB, 1] input: embeddings of the chunk's tokens
+ ggml_tensor * positions = nullptr; // I32 [4 * NB] input: four entries per token
+ ggml_tensor * attn_mask = nullptr; // F16 [align_up(pool), align_up(NB)] input mask
+ ggml_tensor * kv_write_rows = nullptr; // I64 [NB, n_head_kv] input: KV row per (token, head)
+ ggml_tensor * logits = nullptr; // logits output returned by build_qwen35_graph
+ ggml_tensor * argmax_tokens = nullptr; // ggml_argmax over logits; the value step_chunk() reads back
+
+ const TargetWeights * w = nullptr; // borrowed in init(); destroy() does not free it
+ TargetCache * cache = nullptr; // borrowed in init(); destroy() does not free it
+ ggml_backend_t backend = nullptr; // borrowed in init(); destroy() does not free it
+ int pool = 0; // pool size in tokens; sets the mask width and kv_start in init()
+ static constexpr int NB = 64; // tokens per chunk
+
+ std::vector arena; // backing memory handed to ggml_init as mem_buffer
+ std::vector embed_buf; // host staging for NB token embeddings (n_embd * NB elements)
+ std::vector mask_buf; // host copy of the mask, rebuilt for every chunk
+
+ bool init(const TargetWeights & tw, TargetCache & tc, ggml_backend_t be, int pool_) { // Builds and allocates the fixed graph; returns false on failure. Call once per instance: a second call would overwrite ctx/alloc without freeing them.
+ w = &tw; cache = &tc; backend = be; pool = pool_;
+ embed_buf.resize((size_t)tw.n_embd * NB);
+ arena.resize((size_t)512 * 1024 * 1024); // 512 Mi elements; passed as ip.mem_size below, so presumably bytes (element type not visible here)
+
+ ggml_init_params ip{};
+ ip.mem_size = arena.size();
+ ip.mem_buffer = arena.data();
+ ip.no_alloc = true; // tensor data is not placed in arena; the gallocr call at the end allocates the graph
+ ctx = ggml_init(ip);
+ if (!ctx) return false;
+
+ inp_embed = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, tw.n_embd, NB, 1);
+ ggml_set_input(inp_embed);
+ positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4 * NB);
+ ggml_set_input(positions);
+ kv_write_rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I64, NB, tw.n_head_kv);
+ ggml_set_input(kv_write_rows);
+ attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F16,
+ align_up(pool, KQ_MASK_PAD),
+ align_up(NB, KQ_MASK_PAD));
+ ggml_set_input(attn_mask);
+ mask_buf.assign((size_t)attn_mask->ne[0] * attn_mask->ne[1], F16_NEG_INF); // start fully masked
+
+ gf = ggml_new_graph_custom(ctx, 16384, false); // graph size 16384, grads argument false
+ QwenGraphInputs gi{};
+ gi.inp_embed = inp_embed;
+ gi.positions = positions;
+ gi.attn_mask = attn_mask;
+ gi.n_tokens = NB;
+ gi.kv_start = pool - NB; // span = whole pool
+ gi.kv_write_rows = kv_write_rows;
+ gi.last_token_logits_only = true; // presumably restricts logits to the final token; step_chunk() reads back one int32
+ QwenGraphOutputs go = build_qwen35_graph(ctx, gf, *w, *cache, gi);
+ if (!go.logits) return false; // ctx stays allocated on this path until destroy()
+ logits = go.logits;
+ ggml_set_output(logits);
+ argmax_tokens = ggml_argmax(ctx, logits);
+ ggml_set_output(argmax_tokens);
+ ggml_build_forward_expand(gf, argmax_tokens); // argmax depends on logits, so this adds the forward pass to gf
+ alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+ return ggml_gallocr_alloc_graph(alloc, gf);
+ }
+
+ // One 64-token chunk at logical [pos_base, pos_base+64). Allocates the
+ // chunk's block (evicting if needed), writes slot-mapped, masks
+ // resident slots + causal-within-chunk. Returns last-token argmax.
+ int32_t step_chunk(const int32_t * toks, int pos_base, KvFlashPager & pager) { // toks must point at NB token ids; exits the process on embed or compute failure
+ int slots[NB];
+ for (int i = 0; i < NB; i++) slots[i] = pager.slot_for(pos_base + i); // physical slot for each logical position of the chunk
+
+ if (!w->embedder.embed(toks, NB, embed_buf.data())) {
+ std::fprintf(stderr, "batch embed failed @%d\n", pos_base);
+ std::exit(1);
+ }
+ ggml_backend_tensor_set(inp_embed, embed_buf.data(), 0,
+ sizeof(float) * embed_buf.size());
+ std::vector p4((size_t)4 * NB);
+ for (int i = 0; i < NB; i++) {
+ p4[4 * i + 0] = p4[4 * i + 1] = p4[4 * i + 2] = pos_base + i; // logical position in the first three entries
+ p4[4 * i + 3] = 0; // fourth entry is always 0
+ }
+ ggml_backend_tensor_set(positions, p4.data(), 0, sizeof(int32_t) * p4.size());
+ // [n_tokens, n_head_kv] ne0-major: (token i, head h) at i + h*NB.
+ std::vector rows((size_t)NB * w->n_head_kv);
+ for (int h = 0; h < w->n_head_kv; h++) {
+ for (int i = 0; i < NB; i++) {
+ rows[(size_t)h * NB + i] = slots[i]; // every head writes token i to the same slot
+ }
+ }
+ ggml_backend_tensor_set(kv_write_rows, rows.data(), 0,
+ sizeof(int64_t) * rows.size());
+
+ // Mask: per q row, resident slots (excluding this chunk) attendable,
+ // this chunk's slots causal. Rebuilt + uploaded per chunk.
+ const size_t kvd = (size_t)attn_mask->ne[0]; // mask row length in elements
+ std::fill(mask_buf.begin(), mask_buf.end(), F16_NEG_INF);
+ pager.fill_slot_mask(mask_buf.data()); // row 0 base
+ const int this_block = slots[0] - slots[0] % NB; // first slot rounded down to a multiple of NB; assumes the chunk fills one NB-aligned run of slots — TODO confirm against KvFlashPager
+ for (int i = 0; i < NB; i++) mask_buf[(size_t)this_block + i] = F16_NEG_INF; // re-mask this chunk's own slots in the template row
+ for (int q = 1; q < NB; q++) {
+ std::memcpy(mask_buf.data() + (size_t)q * kvd, mask_buf.data(), kvd * 2); // replicate row 0 into row q; kvd * 2 bytes, i.e. 2 bytes per f16 entry
+ }
+ for (int q = 0; q < NB; q++) {
+ for (int i = 0; i <= q; i++) {
+ mask_buf[(size_t)q * kvd + slots[i]] = F16_ZERO; // query q may attend chunk tokens 0..q
+ }
+ }
+ ggml_backend_tensor_set(attn_mask, mask_buf.data(), 0,
+ mask_buf.size() * sizeof(uint16_t));
+
+ if (ggml_backend_graph_compute(backend, gf) != GGML_STATUS_SUCCESS) {
+ std::fprintf(stderr, "batch compute failed @%d\n", pos_base);
+ std::exit(1);
+ }
+ int32_t last = 0;
+ ggml_backend_tensor_get(argmax_tokens, &last, 0, sizeof(int32_t)); // read back a single int32 token id
+ return last;
+ }
+
+ void destroy() { // Frees the allocator and the context; repeat calls are harmless because both pointers are nulled.
+ if (alloc) { ggml_gallocr_free(alloc); alloc = nullptr; }
+ if (ctx) { ggml_free(ctx); ctx = nullptr; }
+ }
+};
+
+
+int arg_int(int argc, char ** argv, const char * key, int defv) { // Returns the integer following "key=" in the first matching argument, or defv when no argument matches.
+ const size_t kl = std::strlen(key);
+ for (int i = 2; i < argc; i++) { // starts at 2: main passes argv[1] to load_target_gguf as the model path
+ if (std::strncmp(argv[i], key, kl) == 0 && argv[i][kl] == '=') { // key must be followed directly by '=', so a longer option sharing the prefix does not match
+ return std::atoi(argv[i] + kl + 1); // atoi: non-numeric text yields 0 without an error
+ }
+ }
+ return defv;
+}
+
+bool arg_flag(int argc, char ** argv, const char * key) { // True when some argument equals key exactly.
+ for (int i = 2; i < argc; i++) if (std::strcmp(argv[i], key) == 0) return true; // starts at 2: argv[1] is skipped, matching arg_int
+ return false;
+}
+
+struct StepTimes { // Summary statistics filled by summarize(); values carry whatever unit the input samples use.
+ double p50 = 0, p95 = 0, mean = 0; // median, 95th-percentile sample, arithmetic mean; all 0 for empty input
+};
+
+StepTimes summarize(std::vector & ms) { // Computes p50/p95/mean of ms. Sorts the caller's vector in place; empty input returns all zeros.
+ StepTimes r;
+ if (ms.empty()) return r;
+ std::sort(ms.begin(), ms.end()); // ascending order, required by the index-based percentiles below
+ r.p50 = ms[ms.size() / 2]; // for an even count this picks the upper of the two middle samples
+ r.p95 = ms[(size_t)(ms.size() * 0.95)]; // truncated index; always below size() because 0.95 * size < size
+ for (double v : ms) r.mean += v;
+ r.mean /= ms.size();
+ return r;
+}
+
+} // namespace
+
+int main(int argc, char ** argv) {
+ if (argc < 2) {
+ std::fprintf(stderr, "usage: %s [--logical-ctx=N] [--pool-b=N] "
+ "[--pool-c=N] [--prompt=N] [--gen=N] [--skip-profile]\n", argv[0]);
+ return 2;
+ }
+ const int logical_ctx = arg_int(argc, argv, "--logical-ctx", 131072);
+ const int pool_b = arg_int(argc, argv, "--pool-b", 2048);
+ const int pool_c = arg_int(argc, argv, "--pool-c", 1024);
+ const int n_prompt = arg_int(argc, argv, "--prompt", 512);
+ const int n_gen = arg_int(argc, argv, "--gen", 1200);
+ const bool skip_prof = arg_flag(argc, argv, "--skip-profile");
+ // Explicit pool slot mask: exact exclusion of non-resident slots.
+ // ON by default (requires the per-step re-upload in refresh_mask: the
+ // mask input's compute-buffer region is clobbered by graph execution).
+ // --no-mask falls back to the zero-row approximation production's
+ // padded span uses.
+ const bool use_mask = !arg_flag(argc, argv, "--no-mask");
+ const int total = n_prompt + n_gen;
+ if (total > pool_b) {
+ std::fprintf(stderr, "config error: prompt+gen (%d) must fit pool-b (%d)\n", total, pool_b);
+ return 2;
+ }
+
+ ggml_backend_t backend = ggml_backend_cuda_init(0);
+ if (!backend) { std::fprintf(stderr, "cuda init failed\n"); return 1; }
+ const size_t vram0 = vram_used_now();
+
+ TargetWeights w;
+ if (!load_target_gguf(argv[1], backend, w)) {
+ std::fprintf(stderr, "load: %s\n", dflash27b_last_error());
+ return 1;
+ }
+ std::printf("[load] weights ok, vram_used=%.1f MiB\n",
+ (vram_used_now() - vram0) / 1048576.0);
+
+ // ── --longab: end-to-end long-prompt A/B (speed + accuracy) ─────
+ // For L in {32K, 64K, 128K}: full-cache baseline vs pool-4096 with
+ // drafter reselect. Measures prefill time, decode tok/s over a
+ // 240-token free run, and needle recall (depth 0.25, outside both
+ // the sinks and the LRU window).
+ if (arg_flag(argc, argv, "--longab")) {
+ // Drafter loads lazily, pool mode only: the full-cache baseline at
+ // 256K needs every byte (weights 15.3 GiB + KV 4.6 GiB).
+ DrafterContext dctx;
+ KvFlashDrafterScorer scorer(&dctx);
+ // Single-config mode (one process per config: the CUDA VMM pool
+ // grows monotonically across large-cache configs and aborts).
+ const int only_L = arg_int(argc, argv, "--longab-L", 0);
+ const int only_mode = arg_int(argc, argv, "--longab-mode", -1); // 0=full 1=pool
+ std::printf("\n%-7s %-10s %-9s %-9s %-9s %-9s %s\n",
+ "L", "mode", "prefill_s", "rescore_s", "dec_tok/s", "needle", "kv_vram");
+ for (int L : { 32768, 65536, 131072, 262144 }) {
+ if (only_L > 0 && L != only_L) continue;
+ for (int mode = 0; mode < 2; mode++) { // 0=baseline 1=pool
+ if (only_mode >= 0 && mode != only_mode) continue;
+ if (mode == 1 && !dctx.loaded &&
+ !load_drafter("/opt/lucebox/models/drafter/Qwen3-0.6B-BF16.gguf", 0, dctx)) {
+ std::fprintf(stderr, "drafter load failed\n");
+ return 1;
+ }
+ const int pool = mode == 0 ? L : 4096;
+ auto prompt = make_prompt(L, w.n_vocab);
+ std::vector needle(48);
+ uint64_t ns = 0xDEADBEEFCAFEull;
+ for (int i = 0; i < 48; i++) {
+ ns = ns * 6364136223846793005ull + 1442695040888963407ull;
+ needle[i] = (int32_t)(1000 + (ns >> 33) % 49000);
+ }
+ const int npos = ((int)(0.25 * (L - 512)) / 32) * 32;
+ for (int i = 0; i < 48; i++) prompt[npos + i] = needle[i];
+
+ TargetCache cache;
+ if (!create_target_cache(w, pool, 0, backend, cache, true)) return 1;
+ const double kv_mib = kv_cache_bytes(cache) / 1048576.0;
+ KvFlashPager pager;
+ KvFlashConfig pc; pc.pool_tokens = pool;
+ if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1;
+
+ double t0 = now_ms();
+ BatchStepper bs;
+ if (!bs.init(w, cache, backend, pool)) return 1;
+ for (int p = 0; p < L; p += 64) bs.step_chunk(prompt.data() + p, p, pager);
+ bs.destroy();
+ const double prefill_s = (now_ms() - t0) / 1000.0;
+
+ Stepper st;
+ if (!st.init(w, cache, backend, pool, mode == 1)) return 1;
+ int32_t next = -1;
+ for (int i = 0; i < 32; i++) {
+ const int slot = pager.slot_for(L + i);
+ st.refresh_mask(pager);
+ next = st.step(needle[i], L + i, slot);
+ }
+ double rescore_s = 0;
+ if (mode == 1) {
+ std::vector hist = prompt;
+ hist.insert(hist.end(), needle.begin(), needle.begin() + 32);
+ std::vector scores;
+ t0 = now_ms();
+ if (scorer.score_chunks(hist, pc.chunk_tokens, scores)) {
+ pager.score_hook = [&scores](int c) {
+ return c < (int)scores.size() ? scores[c] : 1e30f;
+ };
+ pager.reselect();
+ pager.score_hook = nullptr;
+ }
+ rescore_s = (now_ms() - t0) / 1000.0;
+ }
+ int match = 0;
+ for (int i = 0; i < 16; i++) {
+ if (next == needle[32 + i]) match++;
+ const int pos = L + 32 + i;
+ const int slot = pager.slot_for(pos);
+ st.refresh_mask(pager);
+ next = st.step(needle[32 + i], pos, slot);
+ }
+ t0 = now_ms();
+ for (int i = 0; i < 240; i++) { // timed free run
+ const int pos = L + 48 + i;
+ const int slot = pager.slot_for(pos);
+ st.refresh_mask(pager);
+ next = st.step(next, pos, slot);
+ }
+ const double tok_s = 240.0 / ((now_ms() - t0) / 1000.0);
+ std::printf("%-7d %-10s %-9.1f %-9.1f %-9.1f %d/16 %.0f MiB\n",
+ L, mode == 0 ? "full" : "pool4096",
+ prefill_s, rescore_s, tok_s, match, kv_mib);
+ std::fflush(stdout);
+ st.destroy();
+ free_target_cache(cache);
+ }
+ }
+ if (dctx.loaded) free_drafter(dctx);
+ free_target_weights(w);
+ ggml_backend_free(backend);
+ return 0;
+ }
+
+ // ── --niah256: native-max-context probe (262144 logical) ────────
+ // Pooled configs only: the fixed-span harness makes a full-pool
+ // control prefill take hours at 256K. The LRU row with the needle
+ // inside the recency window is the induction control (distance-free).
+ if (arg_flag(argc, argv, "--niah256")) {
+ DrafterContext dctx;
+ if (!load_drafter("/opt/lucebox/models/drafter/Qwen3-0.6B-BF16.gguf", 0, dctx)) {
+ std::fprintf(stderr, "drafter load failed\n");
+ return 1;
+ }
+ KvFlashDrafterScorer scorer(&dctx);
+ const int L = 262144, pool = 16384; // 6.25% residency
+ struct Cfg { const char * policy; double depth; }; // One probe row. policy: "lru" or "drafter" (only "drafter" triggers the rescore + reselect step, matched via strcmp); depth: fraction of (L - 512) at which the needle is written into the prompt.
+ const Cfg cfgs[] = {
+ {"lru", 0.97}, // in-window: induction control at 256K
+ {"lru", 0.50},
+ {"drafter", 0.10},
+ {"drafter", 0.50},
+ {"drafter", 0.90},
+ };
+ std::printf("\n%-7s %-6s %-8s %-6s %s\n", "L", "pool", "policy", "depth", "match/16");
+ for (const Cfg & cfg : cfgs) {
+ auto prompt = make_prompt(L, w.n_vocab);
+ std::vector needle(48);
+ uint64_t ns = 0xDEADBEEFCAFEull;
+ for (int i = 0; i < 48; i++) {
+ ns = ns * 6364136223846793005ull + 1442695040888963407ull;
+ needle[i] = (int32_t)(1000 + (ns >> 33) % 49000);
+ }
+ const int npos = ((int)(cfg.depth * (L - 512)) / 32) * 32;
+ for (int i = 0; i < 48; i++) prompt[npos + i] = needle[i];
+
+ TargetCache cache;
+ if (!create_target_cache(w, pool, 0, backend, cache, true)) return 1;
+ KvFlashPager pager;
+ KvFlashConfig pc; pc.pool_tokens = pool;
+ if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1;
+
+ const double t0 = now_ms();
+ BatchStepper bs;
+ if (!bs.init(w, cache, backend, pool)) return 1;
+ for (int p = 0; p < L; p += 64) bs.step_chunk(prompt.data() + p, p, pager);
+ bs.destroy();
+ std::printf("[256k] prefill %.1f s, host backing %.2f GiB\n",
+ (now_ms() - t0) / 1000.0,
+ pager.stats().host_bytes / 1073741824.0);
+
+ Stepper st;
+ if (!st.init(w, cache, backend, pool, true)) return 1;
+ int32_t next = -1;
+ for (int i = 0; i < 32; i++) {
+ const int slot = pager.slot_for(L + i);
+ st.refresh_mask(pager);
+ next = st.step(needle[i], L + i, slot);
+ }
+ if (std::strcmp(cfg.policy, "drafter") == 0) {
+ std::vector hist = prompt;
+ hist.insert(hist.end(), needle.begin(), needle.begin() + 32);
+ std::vector scores;
+ const double r0 = now_ms();
+ if (!scorer.score_chunks(hist, pc.chunk_tokens, scores)) {
+ std::printf("[256k] WARN rescore failed\n");
+ } else {
+ std::printf("[256k] rescore %.1f s\n", (now_ms() - r0) / 1000.0);
+ pager.score_hook = [&scores](int c) {
+ return c < (int)scores.size() ? scores[c] : 1e30f;
+ };
+ pager.reselect();
+ pager.score_hook = nullptr;
+ }
+ }
+ int match = 0;
+ for (int i = 0; i < 16; i++) {
+ if (next == needle[32 + i]) match++;
+ const int pos = L + 32 + i;
+ const int slot = pager.slot_for(pos);
+ st.refresh_mask(pager);
+ next = st.step(needle[32 + i], pos, slot);
+ }
+ std::printf("%-7d %-6d %-8s %-6.2f %d/16\n", L, pool, cfg.policy, cfg.depth, match);
+ std::fflush(stdout);
+ st.destroy();
+ free_target_cache(cache);
+ }
+ free_drafter(dctx);
+ free_target_weights(w);
+ ggml_backend_free(backend);
+ return 0;
+ }
+
+ if (arg_flag(argc, argv, "--niah")) {
+ DrafterContext dctx;
+ const bool have_drafter =
+ load_drafter("/opt/lucebox/models/drafter/Qwen3-0.6B-BF16.gguf", 0, dctx);
+ if (!have_drafter) std::printf("[niah] drafter unavailable, skipping drafter policy\n");
+ KvFlashDrafterScorer scorer(&dctx);
+ if (have_drafter) {
+ // Reserve the drafter's compute buffers at max context NOW,
+ // before target-side cache churn fragments the CUDA pool.
+ // Without this, 32K rescores OOM late in the sweep and the
+ // drafter policy silently degrades to LRU.
+ std::vector warm(33024, 1234);
+ std::vector tmp;
+ scorer.score_chunks(warm, 64, tmp);
+ }
+
+ const int Ls[] = { 8192, 32768 };
+ const double depths[] = { 0.10, 0.50, 0.90 };
+ std::printf("\n%-7s %-6s %-8s %-6s %s\n", "L", "pool", "policy", "depth", "match/16");
+ for (int L : Ls) {
+ const int pools[] = { L, L / 4, ((L / 10) / 256) * 256 };
+ for (int pi = 0; pi < 3; pi++) {
+ const int pool = pools[pi];
+ const char * policies[] = { "lru", "drafter" };
+ const int n_pol = (pi == 0) ? 1 : (have_drafter ? 2 : 1); // full pool: control only
+ for (int pol = 0; pol < n_pol; pol++) {
+ for (double depth : depths) {
+ // Needle: 48 unique-as-a-sequence tokens from the
+ // filler id range (matched embedding statistics).
+ // Query = first 32 (longer match = stronger
+ // induction), score the last 16.
+ auto prompt = make_prompt(L, w.n_vocab);
+ std::vector needle(48);
+ uint64_t ns = 0xDEADBEEFCAFEull;
+ for (int i = 0; i < 48; i++) {
+ ns = ns * 6364136223846793005ull + 1442695040888963407ull;
+ needle[i] = (int32_t)(1000 + (ns >> 33) % 49000);
+ }
+ const int npos = ((int)(depth * (L - 512)) / 32) * 32;
+ for (int i = 0; i < 48; i++) prompt[npos + i] = needle[i];
+
+ TargetCache cache;
+ if (!create_target_cache(w, pool, 0, backend, cache, true)) return 1;
+ KvFlashPager pager;
+ KvFlashConfig pc; pc.pool_tokens = pool;
+ if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1;
+
+ BatchStepper bs;
+ if (!bs.init(w, cache, backend, pool)) return 1;
+ for (int p = 0; p < L; p += 64) {
+ bs.step_chunk(prompt.data() + p, p, pager);
+ }
+ bs.destroy();
+
+ Stepper st;
+ if (!st.init(w, cache, backend, pool, true)) return 1;
+ int32_t next = -1;
+ for (int i = 0; i < 32; i++) { // query: needle prefix
+ const int slot = pager.slot_for(L + i);
+ st.refresh_mask(pager);
+ next = st.step(needle[i], L + i, slot);
+ }
+ if (pol == 1) { // drafter reselect
+ std::vector hist = prompt;
+ hist.insert(hist.end(), needle.begin(), needle.begin() + 32);
+ std::vector scores;
+ if (!scorer.score_chunks(hist, pc.chunk_tokens, scores)) {
+ std::printf("[niah] WARN: rescore failed (L=%d pool=%d)\n", L, pool);
+ } else {
+ pager.score_hook = [&scores](int c) {
+ return c < (int)scores.size() ? scores[c] : 1e30f;
+ };
+ pager.reselect();
+ pager.score_hook = nullptr;
+ }
+ }
+ int match = 0;
+ for (int i = 0; i < 16; i++) { // continuation
+ if (next == needle[32 + i]) match++;
+ // Teacher-force ground truth: one miss must not
+ // cascade; we measure per-position retrieval.
+ const int pos = L + 32 + i;
+ const int slot = pager.slot_for(pos);
+ st.refresh_mask(pager);
+ next = st.step(needle[32 + i], pos, slot);
+ }
+ std::printf("%-7d %-6d %-8s %-6.2f %d/16\n",
+ L, pool, pi == 0 ? "full" : policies[pol],
+ depth, match);
+ std::fflush(stdout);
+ st.destroy();
+ free_target_cache(cache);
+ }
+ }
+ }
+ }
+ if (have_drafter) free_drafter(dctx);
+ free_target_weights(w);
+ ggml_backend_free(backend);
+ return 0;
+ }
+
+ const auto prompt = make_prompt(n_prompt, w.n_vocab);
+ std::vector tokens_a;
+ size_t mem_a_kv = 0, mem_a_buf = 0, mem_a_vram = 0;
+ size_t mem_c_kv = 0, mem_c_buf = 0, mem_c_vram = 0;
+ int hard_failures = 0;
+
+ // ── Run A: baseline at logical context, maskless ────────────────
+ {
+ const size_t v_before = vram_used_now();
+ TargetCache cache;
+ if (!create_target_cache(w, logical_ctx, 0, backend, cache, /*prefill_only=*/true)) {
+ std::fprintf(stderr, "cache A: %s\n", dflash27b_last_error());
+ return 1;
+ }
+ mem_a_kv = kv_cache_bytes(cache);
+ mem_a_buf = ggml_backend_buffer_get_size(cache.base_buf);
+ mem_a_vram = vram_used_now() - v_before;
+ std::printf("[A] logical_ctx=%d kv=%.1f MiB base_buf=%.1f MiB vram_delta=%.1f MiB\n",
+ logical_ctx, mem_a_kv / 1048576.0, mem_a_buf / 1048576.0,
+ mem_a_vram / 1048576.0);
+
+ Stepper st;
+ int32_t next = -1;
+ const double t0 = now_ms();
+ for (int pos = 0; pos < total; pos++) {
+ // Production-like growing span: rebuild only when the padded
+ // span crosses a 256 boundary (mirrors do_ar_decode topology).
+ const int want_span = pos + 1;
+ if (!st.ctx || ((want_span + 255) / 256) != ((st.span + 255) / 256)) {
+ st.span = want_span;
+ if (!st.ctx) { if (!st.init(w, cache, backend, want_span, false)) return 1; }
+ else if (!st.build()) return 1;
+ }
+ const int32_t tok = pos < n_prompt ? prompt[pos]
+ : (tokens_a.push_back(next), next);
+ next = st.step(tok, pos, pos);
+ cache.cur_pos = pos + 1;
+ }
+ tokens_a.push_back(next);
+ std::printf("[A] decoded %zu tokens, %.1f tok/s overall\n",
+ tokens_a.size(), total / ((now_ms() - t0) / 1000.0));
+ st.destroy();
+ free_target_cache(cache);
+ }
+
+ // ── Run B: relocation + mask exactness, teacher-forced ──────────
+ {
+ TargetCache cache;
+ if (!create_target_cache(w, pool_b, 0, backend, cache, /*prefill_only=*/true)) {
+ std::fprintf(stderr, "cache B: %s\n", dflash27b_last_error());
+ return 1;
+ }
+ KvFlashPager pager;
+ KvFlashConfig pc;
+ pc.pool_tokens = pool_b;
+ if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1;
+ const int nb = pool_b / pc.chunk_tokens;
+ std::vector order(nb);
+ for (int i = 0; i < nb; i++) order[i] = i;
+ uint64_t s = 12345;
+ for (int i = nb - 1; i > 0; i--) {
+ s = s * 6364136223846793005ull + 1442695040888963407ull;
+ const int j = (int)((s >> 33) % (uint64_t)(i + 1));
+ std::swap(order[i], order[j]);
+ }
+ pager.set_block_order(order);
+
+ Stepper st;
+ if (!st.init(w, cache, backend, pool_b, use_mask)) return 1;
+ int mismatches = 0, first_mismatch = -1;
+ for (int pos = 0; pos < total; pos++) {
+ const int32_t tok = pos < n_prompt ? prompt[pos] : tokens_a[pos - n_prompt];
+ const int slot = pager.slot_for(pos);
+ st.refresh_mask(pager);
+ const int32_t next = st.step(tok, pos, slot);
+ const int ref_idx = pos - n_prompt + 1;
+ if (pos >= n_prompt - 1 && ref_idx < (int)tokens_a.size()) {
+ if (next != tokens_a[ref_idx]) {
+ mismatches++;
+ if (first_mismatch < 0) first_mismatch = pos;
+ }
+ }
+ }
+ const double rate = 100.0 * mismatches / (n_gen + 1);
+ std::printf("[B] shuffled+masked, pool=%d: %d/%d argmax mismatches (%.2f%%), first at pos %d; "
+ "mask refills=%d avg=%.3f ms\n",
+ pool_b, mismatches, n_gen + 1, rate, first_mismatch,
+ st.mask_fills, st.mask_fills ? st.mask_fill_ms_total / st.mask_fills : 0.0);
+ // Gate at 2%: the flip sources are the maskless zero-row softmax
+ // mass plus run-to-run fattn nondeterminism; both measured ~1%
+ // (10-14 flips/1201 across runs), so a 1% gate flaps on noise.
+ std::printf("%s relocation equivalence (threshold 2%%)\n", rate <= 2.0 ? "PASS" : "FAIL");
+ if (rate > 2.0) hard_failures++;
+ st.destroy();
+ free_target_cache(cache);
+ }
+
+ // ── Run C: live paging + roundtrip; D: reselect recall ──────────
+ {
+ const size_t v_before = vram_used_now();
+ TargetCache cache;
+ if (!create_target_cache(w, pool_c, 0, backend, cache, /*prefill_only=*/true)) {
+ std::fprintf(stderr, "cache C: %s\n", dflash27b_last_error());
+ return 1;
+ }
+ mem_c_kv = kv_cache_bytes(cache);
+ mem_c_buf = ggml_backend_buffer_get_size(cache.base_buf);
+ mem_c_vram = vram_used_now() - v_before;
+
+ KvFlashPager pager;
+ KvFlashConfig pc;
+ pc.pool_tokens = pool_c;
+ if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1;
+
+ Stepper st;
+ if (!st.init(w, cache, backend, pool_c, use_mask)) return 1;
+ int32_t next = -1;
+ for (int pos = 0; pos < n_prompt; pos++) {
+ const int slot = pager.slot_for(pos);
+ st.refresh_mask(pager);
+ next = st.step(prompt[pos], pos, slot);
+ cache.cur_pos = pos + 1;
+ }
+ { // bit-exact roundtrip on chunk 2
+ ggml_tensor * t = cache.attn_k[0];
+ const size_t seg = (size_t)pc.chunk_tokens * t->nb[1];
+ std::vector before(seg), after(seg);
+ ggml_backend_tensor_get(t, before.data(),
+ (size_t)pager.block_of(2) * pc.chunk_tokens * t->nb[1], seg);
+ if (!pager.page_out(2) || !pager.page_in(2)) {
+ std::fprintf(stderr, "roundtrip paging failed\n"); return 1;
+ }
+ ggml_backend_tensor_get(t, after.data(),
+ (size_t)pager.block_of(2) * pc.chunk_tokens * t->nb[1], seg);
+ const bool exact = std::memcmp(before.data(), after.data(), seg) == 0;
+ std::printf("%s page_out/page_in roundtrip bit-exact (chunk 2 -> block %d)\n",
+ exact ? "PASS" : "FAIL", pager.block_of(2));
+ if (!exact) hard_failures++;
+ }
+
+ std::vector tokens_c;
+ const double t0 = now_ms();
+ for (int pos = n_prompt; pos < total; pos++) {
+ tokens_c.push_back(next);
+ const int slot = pager.slot_for(pos);
+ st.refresh_mask(pager);
+ next = st.step(next, pos, slot);
+ cache.cur_pos = pos + 1;
+ }
+ tokens_c.push_back(next);
+ const double secs = (now_ms() - t0) / 1000.0;
+ int agree = 0;
+ while (agree < (int)tokens_c.size() && agree < (int)tokens_a.size() &&
+ tokens_c[agree] == tokens_a[agree]) agree++;
+ const auto & ps = pager.stats();
+ std::printf("[C] pool=%d masked: decode %.1f tok/s, page_outs=%" PRId64
+ " page_ins=%" PRId64 " host=%.1f MiB; baseline agreement %d tokens\n",
+ pool_c, n_gen / secs, ps.page_outs, ps.page_ins,
+ ps.host_bytes / 1048576.0, agree);
+ std::printf("PASS paged decode with eviction (%d evictions)\n", (int)ps.page_outs);
+
+ // ── Run D: τ-style reselect recall ──────────────────────────
+ {
+ int victim = -1; // earliest paged-out, non-sink chunk
+ for (int c = pc.sink_chunks; c < pager.n_chunks(); c++) {
+ if (!pager.is_resident(c)) { victim = c; break; }
+ }
+ if (victim < 0) {
+ std::printf("FAIL reselect demo: no paged-out chunk found\n");
+ hard_failures++;
+ } else {
+ // Score injection: the victim becomes the hottest chunk —
+ // stands in for a drafter rescore flagging recalled context.
+ pager.score_hook = [&](int c) { return c == victim ? 2.0f : 1.0f / (1 + c); };
+ const double r0 = now_ms();
+ const int events = pager.reselect();
+ const double r_ms = now_ms() - r0;
+ const bool back = pager.is_resident(victim);
+ std::printf("%s reselect recalled chunk %d (%d page events, %.2f ms)\n",
+ back ? "PASS" : "FAIL", victim, events, r_ms);
+ if (!back) hard_failures++;
+ // decode must continue cleanly after the residency change
+ pager.score_hook = nullptr;
+ for (int pos = total; pos < total + 64; pos++) {
+ const int slot = pager.slot_for(pos);
+ st.refresh_mask(pager);
+ next = st.step(next, pos, slot);
+ }
+ std::printf("PASS decode continues after reselect (64 tokens)\n");
+ }
+ }
+ st.destroy();
+ free_target_cache(cache);
+ }
+
+ // ── Run F: full LSA loop — drafter as Memory Indexer ────────────
+ // Prompt LARGER than the pool, so prefill itself evicts; then the
+ // FlashMemory inference paradigm end to end: every τ=64 decoded
+ // tokens the drafter rescores the whole sequence (tail attention =
+ // indexer query), score_hook gets the fresh chunk scores, and
+ // reselect() repages the pool. PASS requires at least one genuine
+ // drafter-driven recall of a chunk evicted earlier.
+ {
+ const char * drafter_path = "/opt/lucebox/models/drafter/Qwen3-0.6B-BF16.gguf";
+ DrafterContext dctx;
+ if (!load_drafter(drafter_path, 0, dctx)) {
+ std::printf("FAIL indexer run: drafter load failed (%s)\n", dflash27b_last_error());
+ hard_failures++;
+ } else {
+ const int n_prompt_f = 2048, n_gen_f = 768, pool_f = 1024, tau = 64;
+ const auto prompt_f = make_prompt(n_prompt_f, w.n_vocab);
+ TargetCache cache;
+ if (!create_target_cache(w, pool_f, 0, backend, cache, true)) return 1;
+ KvFlashPager pager;
+ KvFlashConfig pc;
+ pc.pool_tokens = pool_f;
+ if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1;
+ KvFlashDrafterScorer scorer(&dctx); // the production indexer plugin
+
+ Stepper st;
+ if (!st.init(w, cache, backend, pool_f, use_mask)) return 1;
+ std::vector all_ids = prompt_f;
+ int32_t next = -1;
+ for (int pos = 0; pos < n_prompt_f; pos++) {
+ const int slot = pager.slot_for(pos);
+ st.refresh_mask(pager);
+ next = st.step(prompt_f[pos], pos, slot);
+ }
+ const int64_t prefill_evictions = pager.stats().page_outs;
+
+ std::vector rescore_ms, reselect_ms;
+ int64_t recalls = 0;
+ std::vector scores;
+ const double t0 = now_ms();
+ for (int g = 0; g < n_gen_f; g++) {
+ const int pos = n_prompt_f + g;
+ if (g % tau == 0) {
+ double r0 = now_ms();
+ if (!scorer.score_chunks(all_ids, pc.chunk_tokens, scores)) {
+ std::fprintf(stderr, "scorer failed\n");
+ std::exit(1);
+ }
+ rescore_ms.push_back(now_ms() - r0);
+ pager.score_hook = [&scores](int c) {
+ return c < (int)scores.size() ? scores[c] : 1e30f;
+ };
+ r0 = now_ms();
+ const int64_t ins_before = pager.stats().page_ins;
+ pager.reselect();
+ reselect_ms.push_back(now_ms() - r0);
+ recalls += pager.stats().page_ins - ins_before;
+ }
+ const int slot = pager.slot_for(pos);
+ st.refresh_mask(pager);
+ next = st.step(next, pos, slot);
+ all_ids.push_back(next);
+ }
+ const double secs = (now_ms() - t0) / 1000.0;
+ StepTimes rs = summarize(rescore_ms), rsel = summarize(reselect_ms);
+ const auto & ps = pager.stats();
+ std::printf("[F] LSA loop: prompt=%d pool=%d gen=%d tau=%d -> %.1f tok/s "
+ "(prefill evicted %" PRId64 ")\n",
+ n_prompt_f, pool_f, n_gen_f, tau, n_gen_f / secs, prefill_evictions);
+ std::printf("[F] indexer rescore p50=%.1f ms (full 0.6B re-prefill, %zu calls); "
+ "reselect p50=%.2f ms; drafter-driven recalls=%" PRId64
+ "; total page_outs=%" PRId64 " page_ins=%" PRId64 "\n",
+ rs.p50, rescore_ms.size(), rsel.p50, recalls,
+ ps.page_outs, ps.page_ins);
+ std::printf("%s LSA loop: drafter-driven recall of evicted context (recalls >= 1)\n",
+ recalls >= 1 ? "PASS" : "FAIL");
+ if (recalls < 1) hard_failures++;
+ st.destroy();
+ free_target_cache(cache);
+ free_drafter(dctx);
+ }
+ }
+
+ // ── Run E: performance profile ──────────────────────────────────
+ if (!skip_prof) {
+ std::printf("\n=== DECODE PROFILE (64 timed steps each, junk KV, span = FA window) ===\n");
+ auto profile = [&](const char * tag, int alloc_ctx, int span, bool masked, // Prints one decode-profile row. tag: label used in the row and in error output; alloc_ctx: context size handed to create_target_cache (also the local pager's pool_tokens and the slot modulus); span, masked: forwarded to Stepper::init.
+ KvFlashPager * pager, int pos_base) { // pager: may be null, in which case a local pager is attached below when masked is set; pos_base: position passed to the first warmup step.
+ TargetCache cache;
+ if (!create_target_cache(w, alloc_ctx, 0, backend, cache, true)) { // no error return from this lambda: every failure below terminates the process
+ std::fprintf(stderr, "cache E(%s): %s\n", tag, dflash27b_last_error());
+ std::exit(1);
+ }
+ KvFlashPager local; // only attached on the masked-without-pager path below
+ if (masked && !pager) {
+ KvFlashConfig pc; pc.pool_tokens = alloc_ctx;
+ local.attach(pc, cache.attn_k, cache.attn_v); // NOTE(review): attach() result is ignored here, while the run sections earlier in this file check it and bail out — confirm this is intended
+ // mark whole pool resident so the mask is all-zero (worst
+ // case mask read, no -inf shortcut)
+ for (int p = 0; p < alloc_ctx; p += 64) local.slot_for(p); // stride 64 presumably matches the pager chunk size — TODO confirm against KvFlashConfig
+ pager = &local; // from here on the parameter points at the local pager
+ }
+ Stepper st;
+ if (!st.init(w, cache, backend, span, masked)) std::exit(1);
+ // warmup 8, then time 64 (refresh included: it is part of the
+ // real per-step cost in masked mode)
+ int32_t tok = 1000; // seed token id; each step's return value is fed into the next step
+ for (int i = 0; i < 8; i++) {
+ if (masked) st.refresh_mask(*pager);
+ tok = st.step(tok, pos_base + i, (i * 64) % alloc_ctx); // warmup passes 0, 64, ..., 448 (mod alloc_ctx) as the third argument (the slot at other call sites)
+ }
+ std::vector ms; // one elapsed-time sample per timed step (now_ms() deltas)
+ for (int i = 0; i < 64; i++) {
+ const double t0 = now_ms();
+ if (masked) st.refresh_mask(*pager);
+ tok = st.step(tok, pos_base + 8 + i, (8 * 64 + i) % alloc_ctx); // timed steps pass consecutive values starting at 512 (mod alloc_ctx), unlike the 64-strided warmup
+ ms.push_back(now_ms() - t0);
+ }
+ const StepTimes r = summarize(ms);
+ std::printf("%-28s span=%6d p50=%7.2f ms p95=%7.2f ms mean=%7.2f ms (%5.1f tok/s)\n",
+ tag, span, r.p50, r.p95, r.mean, 1000.0 / r.mean); // tok/s is derived from the mean step time
+ st.destroy(); // per-call cleanup: the stepper and cache do not outlive this row
+ free_target_cache(cache);
+ };
+ profile("baseline 8K", 8192, 8192, false, nullptr, 8192 - 72);
+ profile("baseline 32K", 32768, 32768, false, nullptr, 32768 - 72);
+ profile("baseline 128K", 131072, 131072, false, nullptr, 131072 - 72);
+ profile("pool 1K masked (128K logical)", 1024, 1024, true, nullptr, 130000);
+ profile("pool 1K maskless", 1024, 1024, false, nullptr, 130000);
+ profile("pool 4K masked (128K logical)", 4096, 4096, true, nullptr, 130000);
+
+ // Page-event microbench on a small pool.
+ {
+ TargetCache cache;
+ if (!create_target_cache(w, 1024, 0, backend, cache, true)) std::exit(1);
+ KvFlashPager pager;
+ KvFlashConfig pc; pc.pool_tokens = 1024;
+ pager.attach(pc, cache.attn_k, cache.attn_v);
+ for (int p = 0; p < 1024; p += 64) pager.slot_for(p);
+ std::vector out_ms, in_ms;
+ for (int rep = 0; rep < 32; rep++) {
+ const int c = 2 + (rep % 8);
+ double t0 = now_ms();
+ pager.page_out(c);
+ out_ms.push_back(now_ms() - t0);
+ t0 = now_ms();
+ pager.page_in(c);
+ in_ms.push_back(now_ms() - t0);
+ }
+ const StepTimes o = summarize(out_ms), i = summarize(in_ms);
+ std::printf("page_out: p50=%.2f ms p95=%.2f ms page_in: p50=%.2f ms p95=%.2f ms (per 64-token chunk, %zu KiB)\n",
+ o.p50, o.p95, i.p50, i.p95,
+ (size_t)(pager.stats().host_bytes / std::max(1, 8) / 1024));
+ free_target_cache(cache);
+ }
+ }
+
+ // ── Memory verdict ──────────────────────────────────────────────
+ const double red_kv = 100.0 * (1.0 - (double)mem_c_kv / (double)mem_a_kv);
+ std::printf("\n=== KV MEMORY ===\n");
+ std::printf("baseline (ctx %6d): kv=%9.1f MiB base_buf=%9.1f MiB vram_delta=%9.1f MiB\n",
+ logical_ctx, mem_a_kv / 1048576.0, mem_a_buf / 1048576.0, mem_a_vram / 1048576.0);
+ std::printf("pooled (pool %5d): kv=%9.1f MiB base_buf=%9.1f MiB vram_delta=%9.1f MiB\n",
+ pool_c, mem_c_kv / 1048576.0, mem_c_buf / 1048576.0, mem_c_vram / 1048576.0);
+ std::printf("attn-KV reduction: %.1f%%\n", red_kv);
+ std::printf("%s KV memory reduction >= 90%%\n", red_kv >= 90.0 ? "PASS" : "FAIL");
+ if (red_kv < 90.0) hard_failures++;
+
+ free_target_weights(w);
+ ggml_backend_free(backend);
+ std::printf("\n%s (%d hard failures)\n", hard_failures == 0 ? "ALL PASS" : "FAILED", hard_failures);
+ return hard_failures == 0 ? 0 : 1;
+}