diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 46919debe..e870ef385 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -128,8 +128,8 @@ jobs:
     needs: [uv-workspace]
     runs-on: [self-hosted, gpu, sm86]
     timeout-minutes: 30
-    # The box has a single physical GPU: serialize GPU jobs across PRs instead
-    # of letting concurrent runs clobber each other.
+    # Serialize CUDA jobs across PRs (one RTX 3090). The ROCm job has its
+    # own group: different physical GPU, no contention.
     concurrency:
       group: lucebox3-gpu-runner
       cancel-in-progress: false
@@ -197,15 +197,51 @@ jobs:
     needs: [uv-workspace]
     runs-on: [self-hosted, rocm, gfx1151]
     timeout-minutes: 20
-    # Same single box as gpu-tests: serialize GPU jobs across PRs.
+    # Serialize across PRs per GPU. NOT the same group as the CUDA job:
+    # the combo box has two distinct GPUs (RTX 3090 + Strix iGPU), and a
+    # shared group only holds one waiting job, so the Radeon leg was
+    # chronically displaced ("higher priority waiting request") by every
+    # new CUDA job entering the queue.
     concurrency:
-      group: lucebox3-gpu-runner
+      group: lucebox3-rocm-runner
       cancel-in-progress: false
     steps:
       - uses: actions/checkout@v4
 
+      - name: KFD health (diagnose instead of hanging)
+        # rocminfo on a wedged KFD blocks in uninterruptible sleep and eats
+        # the whole 20-minute job timeout. Probe with a hard timeout first,
+        # and when it hangs, dump the evidence (D-state holders, dmesg) so
+        # the job fails in seconds with a diagnosis instead of timing out silently.
+        run: |
+          # A wedged KFD puts rocminfo in UNINTERRUPTIBLE sleep: timeout(1)
+          # cannot kill it and a foreground wait blocks until the job
+          # timeout. Probe in the background (output to a file so no pipe
+          # keeps the step alive) and enforce the deadline in the shell.
+          /opt/rocm/bin/rocminfo > /tmp/rocminfo.out 2>&1 &
+          PROBE=$!
+          for i in $(seq 1 15); do
+            kill -0 $PROBE 2>/dev/null || break
+            sleep 1
+          done
+          if kill -0 $PROBE 2>/dev/null; then
+            echo "::error::rocminfo hung (likely D-state) — ROCm/KFD wedged; the box needs a reboot"
+            echo "--- probe state:"
+            ps -o pid,stat,wchan:32,comm -p $PROBE || true
+            echo "--- processes holding /dev/kfd:"
+            sudo fuser -v /dev/kfd 2>&1 || true
+            echo "--- D-state processes:"
+            ps -eo pid,user,stat,wchan:32,comm | awk '$3 ~ /D/' || true
+            echo "--- recent amdgpu/kfd dmesg:"
+            sudo dmesg 2>/dev/null | grep -iE "amdgpu|kfd" | tail -15 || true
+            kill -9 $PROBE 2>/dev/null || true
+            disown $PROBE 2>/dev/null || true
+            exit 1
+          fi
+          wait $PROBE && echo "KFD healthy" || { echo "::error::rocminfo exited non-zero"; cat /tmp/rocminfo.out | tail -5; exit 1; }
+
       - name: ROCm smoke (rocminfo sees gfx1151)
-        run: /opt/rocm/bin/rocminfo | grep -E "Name:|Marketing Name:" | grep -iE "gfx1151|Radeon 8060S"
+        run: cat /tmp/rocminfo.out | grep -E "Name:|Marketing Name:" | grep -iE "gfx1151|Radeon 8060S"
 
       - name: Build + run HIP vector-add on the Radeon 8060S
         # Self-contained HIP kernel correctness test (no model weights). This is
diff --git a/README.md b/README.md
index 0856e5375..59d3fdd96 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,10 @@ Each one is self-contained with setup instructions and benchmark notes.
   <a href="optimizations/spark/"><img src="assets/cards/spark_card.png" alt="Luce Spark MoE expert offload" width="46%"></a>
 </p>
 
+<p align="center">
+  <a href="optimizations/kvflash/"><img src="assets/cards/kvflash_card.png" alt="Luce KVFlash paged KV cache" width="46%"></a>
+</p>
+
 ---
 
 ## Supported Models & Drafters
@@ -276,6 +280,18 @@ DFLASH27B_KV_TQ3=1 \
 | `--kv-cache-dir <path>` | — | Persist prefix cache to disk |
 | `--kv-cache-budget N` | — | On-disk cache size cap |
 
+**Bounded KV residency (KVFlash)**
+
+Pages the attention KV cache through a fixed pool of GPU slots; cold 64-token chunks live in host RAM, bit-exact and recallable. Decode speed stops depending on context length and resident KV stays pool-sized at any context. Off by default; works on every model family. Drafter-scored residency is the default on every family: the server finds the Qwen3-0.6B drafter next to the model (or via `--prefill-drafter`) and lazy-loads it as the relevance scorer that decides which chunks stay resident — non-qwen targets (laguna, gemma4) bridge the tokenizer gap by re-tokenizing the context text for the drafter. LRU is the fallback when no drafter is present, or the explicit choice via `--kvflash-policy lru`. Per-model numbers in [Luce KVFlash →](optimizations/kvflash/README.md).
+
+| Flag / env | Default | Effect |
+|---|---|---|
+| `--kvflash <tokens\|auto>` | off | Resident pool size. `auto` sizes from the GPU: half of free VRAM after weights and reserves, at the model's KV density, capped where decode speed stays near the flat optimum (default 16384, override `DFLASH_KVFLASH_MAX_POOL`) and at `--max-ctx`. Explicit values are rounded to 256, clamped to `--max-ctx`, floored at the protected minimum so eviction always has a victim. |
+| `--kvflash-policy {drafter,lru}` | `drafter` | Residency policy. `lru` opts out of the drafter probe/load (recency-only paging, no extra VRAM). |
+| `--kvflash-tau N` | `64` | Reselect interval floor (drafter policy only); the effective interval grows with history to cap rescore overhead. |
+| `DFLASH_KVFLASH=N` | off | Env equivalent of `--kvflash`. |
+| `DFLASH_KVFLASH_TAU=N` | `64` | Env equivalent of `--kvflash-tau`. |
+
 **Thinking budget**
 
 | Flag | Default | Effect |
diff --git a/assets/cards/kvflash_card.png b/assets/cards/kvflash_card.png
new file mode 100644
index 000000000..1a8af70a3
--- /dev/null
+++ b/assets/cards/kvflash_card.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3f810ba8150b818309173d9c003f475b5ff41b8a3e6605772eea7ca086029b2
+size 2231695
diff --git a/optimizations/kvflash/DESIGN.md b/optimizations/kvflash/DESIGN.md
new file mode 100644
index 000000000..a8738eb27
--- /dev/null
+++ b/optimizations/kvflash/DESIGN.md
@@ -0,0 +1,272 @@
+# KVFlash design notes
+
+Mechanism details and tuning data behind [README.md](README.md); measured
+tables in [RESULTS.md](RESULTS.md).
+
+FlashMemory-style (arXiv 2606.09079) decode-time KV paging for the qwen35
+target, designed to compose with pflash. Goal: the GPU footprint of the
+full-attention KV cache is a hard O(pool) constant regardless of logical
+context length, with paged-out chunks recallable bit-exact from host.
+
+## Division of labor with pflash
+
+pflash and the pager own different resources and compose cleanly:
+
+| concern | owner |
+|---|---|
+| which prompt chunks the target ever elaborates | pflash (drafter scores, evict at prefill) |
+| which elaborated chunks occupy GPU slots | KvFlashPager (this module) |
+| prefill compute sparsity | pflash BSA kernels |
+| decode-time KV growth (generated tokens) | KvFlashPager (page out cold generated chunks) |
+
+pflash keeps the target from reading the huge context; the pager keeps
+what the target HAS elaborated inside a fixed VRAM budget and makes every
+eviction reversible. The drafter's chunk scores plug into
+`KvFlashPager::score_hook` as the residency policy (LRU fallback in the
+prototype).
+
+## Mechanism
+
+- Cache tensors are allocated at `pool_tokens` (e.g. 1024) instead of
+  `max_ctx` (e.g. 131072). That allocation delta IS the memory saving:
+  a mask over a full-size cache would save nothing.
+- Logical positions map to physical pool slots at 64-token chunk
+  granularity. The mapping rides the existing step-invariant
+  `ggml_set_rows` KV append (`kv_write_rows` carries the physical slot;
+  the `positions` input keeps the logical position for M-RoPE).
+- Decode FA spans the whole pool with an EXACT slot-validity mask
+  (`KvFlashPager::fill_slot_mask`): resident slots 0, free/paged-out -inf.
+  The host-side mask rebuilds only when the pager epoch moves; the device
+  upload happens before EVERY compute. That upload is mandatory, not an
+  optimization: input tensors live in the gallocr compute buffer, whose
+  regions are reused during graph execution, so a once-uploaded mask is
+  garbage by the next step (this masqueraded as a "fattn NaN kernel bug"
+  for a while — all-NaN logits from the second step on; production never
+  hit it because its prefill refills masks per chunk). `--no-mask` falls
+  back to maskless + zeroed freed slots (exp(-max) ~ 0, production's
+  padded-span approximation, measured ~1% argmax flips).
+- Page-out copies a chunk's quantized rows (per layer x K/V x head
+  segments) to a host backing store and zeroes the slots; page-in writes
+  them back. Quantized bytes + baked-in RoPE means the roundtrip is
+  bit-exact and relocation is position-independent.
+- Eviction protects sinks (first chunk) and the trailing window, mirroring
+  FlashMemory's always-resident floor (their last-8K + decoded window).
+  Unlike their sigmoid-threshold fetch (which leaks footprint at 500K,
+  their §3.3.1), a fixed slot pool is a hard budget by construction.
+- DeltaNet/conv recurrent state is fixed-size and never paged.
+
+## What the prototype verifies (test_kvflash)
+
+A. Baseline at logical ctx 128K: reference greedy sequence + KV bytes.
+B. Relocation proof: same workload in a small pool with SHUFFLED block
+   placement, teacher-forced — argmax must track the baseline.
+C. Live paging: pool ≪ prompt+gen, eviction engaged; bit-exact
+   page_out/page_in roundtrip; decode completes; KV bytes vs A ≥ 90% cut.
+
+## Reselect (τ-step lookahead)
+
+`KvFlashPager::reselect()` rebuilds the resident set as the top chunks (up to
+pool capacity) ranked by `score_hook` over all materialized chunks (resident or host-backed),
+keeping sinks and the trailing window unconditionally. Page-outs run
+first so recalls always find free blocks. This is the FlashMemory τ=64
+loop's mechanism; the production caller invokes it every τ decoded
+tokens with fresh drafter scores. Verified in test run D: an evicted
+chunk recalled by a score flip, decode continues across the residency
+change.
+
+## Measured (lucebox RTX 3090, Qwen3.6-27B Q4_K_M, Q8_0 KV, 2026-06-11)
+
+All gates PASS (exit 0). 64 timed steps per profile row, junk KV so the
+FA span traffic is bandwidth-realistic:
+
+| config | FA span | ms/step p50 | tok/s |
+|---|---|---|---|
+| baseline 8K   | 8192   | 35.1 | 28.5 |
+| baseline 32K  | 32768  | 30.1 | 33.1 |
+| baseline 128K | 131072 | 45.1 | 22.1 |
+| pool 1K @128K logical | 1024 | 25.1 | 39.6 |
+| pool 4K @128K logical | 4096 | 25.7 | 38.7 |
+
+- attn-KV memory: 2304.0 -> 18.0 MiB (99.2% cut); whole cache buffer
+  2653.6 -> 217.6 MiB, confirmed by VRAM deltas.
+- At 128K-logical decode the pool is 1.8x FASTER than the full cache
+  (45.1 -> 25.1 ms/step): FA cost is span-bound, the pool caps the span.
+- Paging: page_out p50 1.26 ms, page_in p50 0.63 ms per 64-token chunk
+  (~2.2 MiB, synchronous); 12 evictions over 1200 generated tokens
+  amortize to ~0.01 ms/token. reselect() recalling with 20 page events
+  took 21.3 ms — at τ=64 that is ~1% of decode time worst-case.
+- Relocation equivalence: 0.83% argmax flips over 1200 teacher-forced
+  tokens at shuffled placement (gate: ≤1%).
+- Open harness question: the C-loop (live eviction) measured ~34 ms/step
+  vs 25 ms for the identical config in the E-loop; suspected interaction
+  of sustained-load GPU clocks with run ordering, not paging cost (12
+  sync page events explain only ~0.01 ms/token). Re-measure under the
+  production decode loop during integration.
+
+## Full LSA loop (drafter as Memory Indexer) — measured
+
+Test run F implements the paper's complete inference paradigm with the
+pflash drafter (Qwen3-0.6B, `/opt/lucebox/models/drafter/`) standing in
+for the trained indexer: prompt (2048) larger than the pool (1024) so
+prefill itself evicts, then every τ=64 decoded tokens the drafter
+rescores the full sequence (tail attention = indexer query, chunk means
+via `drafter_chunk_scores`), `score_hook` receives the fresh scores, and
+`reselect()` repages the pool.
+
+Measured (RTX 3090, target Qwen3.6-27B Q4_K_M + drafter co-resident):
+- 31.2 tok/s with the loop active; 12 rescores over 768 generated tokens
+- 43 genuine drafter-driven recalls of previously evicted context
+- indexer rescore p50 = 245 ms (full 0.6B re-prefill at ~2-2.8K tokens —
+  ~12% decode overhead at τ=64; drops to ~ms once the drafter's own KV
+  is persisted and only the new τ tokens are pushed through it)
+- reselect p50 = 7.5 ms
+
+vs the paper: their indexer is a trained <0.1% projection head (cheaper
+queries, backbone-supervised labels); ours is the existing 0.6B drafter
+(training-free, already shipped for pflash). Their sigmoid threshold
+leaks footprint at scale (their §3.3.1); our fixed pool is a hard cap.
+
+## Production integration (daemon)
+
+The pool is wired into the qwen35 backend behind `--kvflash <tokens>`
+(env `DFLASH_KVFLASH`; rounded to a 256 multiple) + `--kvflash-tau <N>`
+(env `DFLASH_KVFLASH_TAU`, default 64). Pieces:
+
+- `create_target_cache(..., ctx_alloc)`: attention tensors allocated at
+  pool capacity; `cache.max_ctx` stays the logical bound.
+- `do_prefill`: prompts that fit the pool land identity-mapped
+  (`kvflash_sync_prefill` rebuilds the pager map per request/restore);
+  LARGER prompts switch to pooled chunked prefill — pager-chunk batches,
+  slot-mapped set_rows writes, a slot-space mask per chunk, live
+  eviction. Constant VRAM, linear time (qwen35 only so far).
+- `do_ar_decode`: `build_target_step(..., kvflash_mask=true)` keeps the
+  step-invariant set_rows write active alongside the slot mask;
+  `kv_write_rows` carries the pool slot; the mask uploads per step;
+  every τ generated tokens `kvflash_maybe_reselect` rescores + repages.
+- Policy is agnostic by construction: `KvFlashScorer` (common/) is the
+  interface; with no scorer the pager runs pure LRU (zero pflash
+  dependency). When pflash loads its drafter, `KvFlashDrafterScorer`
+  (qwen3/) attaches automatically and reselect becomes drafter-driven.
+- Spec decode (chain mode) runs ON the pool: verify_batch slot-maps the
+  draft block via per-token kv_write_rows and builds a slot-space mask
+  (resident committed positions + causal among draft tokens). Rejected
+  drafts need no rollback: the pos < base_pos validity rule excludes
+  their slots until the replay rewrites them. All four spec KV-write
+  sites (verify, both replays, stall-prefix) route through this one
+  function. Verified on the daemon: accept_rate 15.4-15.6% pooled vs
+  15.3% pool-off (matched avg_commit 3.47 vs 3.45), coherent output
+  through a mid-generation pool wrap with live eviction. DDTree's
+  tree-verify is not pool-aware yet and falls back to AR.
+- LAYOUT TRAP (cost a day of debugging): kv_write_rows is
+  [n_tokens, n_head_kv] ne0-major — element (token i, head h) lives at
+  i + h*n_tokens (ggml_set_rows asserts b->ne[1] == c->ne[0]). A
+  transposed fill scrambles per-head row targets for every multi-token
+  write while single-token fills (all entries equal) hide the bug
+  completely.
+- Post-generation snapshots are skipped once cur_pos exceeds the pool
+  (pooled snapshots need page-table serialization; prefill-time
+  snapshots still work).
+
+## Production smokes (dflash_server on lucebox 3090, 2026-06-11)
+
+1. WITHOUT pflash (agnostic LRU): `dflash_server <27B> --kvflash 1024`.
+   41-token prompt + 1400 generated = 1441 logical through a 1024-slot
+   pool (live LRU eviction mid-request). Coherent story end to end,
+   36.9 tok/s, clean finish. Second request (per-request pager reset) ok.
+2. WITH pflash: `--kvflash 2048 --prefill-compression always
+   --prefill-threshold 256 --prefill-drafter <Qwen3-0.6B>`. Compression
+   1468 -> 60 tokens, then `[kvflash] drafter scorer attached (tau=64)`
+   automatically; 400 coherent tokens answering from the compressed
+   context. Same binary, zero pflash-specific configuration on the pool.
+
+Ops note: the init banner is flushed now, but generally `nohup` +
+redirected stdout block-buffers printf output — kill the process (atexit
+flush) before concluding a code path didn't run.
+
+## Quality matrix (synthetic NIAH, needle recall /16, teacher-forced)
+
+| context | residency | LRU d=10/50/90% | drafter d=10/50/90% | control |
+|---|---|---|---|---|
+| 8K   | 25%   | 0 / 0 / 16 | 15 / 15 / 16 | 16/16 |
+| 8K   | 9%    | 0 / 0 / 0  | 15 / 15 / 15 | 16/16 |
+| 32K  | 25%   | 0 / 0 / 16 | 15 / 15 / 16 | 16/16 |
+| 32K  | 9%    | 0 / 0 / 0  | 15 / 15 / 15 | 15-16/16 |
+| 256K | 6.25% | 0 (d=0.5); 16/16 in-window | 14 / 15 / 15 | (in-window LRU = control) |
+
+Drafter-scored residency retains 88-100% of perfect needle recall at every
+depth down to 6-9% residency from 8K to the model's native 256K maximum;
+recency-only LRU retains zero outside its tail window. 256K logistics on
+the RTX 3090: ~6.5 min linear pooled prefill, 4.22 GiB host backing,
+~18 GiB VRAM total, 46 s bisected rescore (drafter forward ceiling ~65K
+per segment).
+
+## Tuned defaults (from the matrix)
+
+- Ship drafter scoring whenever a drafter is available; pure-LRU mode is
+  recency-only and must be documented as such.
+- Pool ~25% of expected context is the conservative default; 9% measured
+  safe for retrieval-style work.
+- tau adapts: rescore costs ~0.11 ms/history-token, so the effective
+  reselect interval is max(configured tau, history/45), capping rescore
+  overhead near 15% of decode time.
+
+## Per-architecture integration
+
+The pager core is architecture-blind; each backend routes its own KV writes
+and masks through it. What differs per arch:
+
+- **qwen35** (reference): masked set_rows decode, slot-mapped chain-spec
+  verify, drafter scorer auto-attach. Everything in RESULTS.md.
+- **qwen35moe** (Qwen3.6-35B-A3B): inherits the qwen35 path all-GPU. The
+  Spark hybrid pipelined decode keeps its per-layer cached CUDA graphs:
+  `pipelined_decode_one_token` takes a `kv_slot`, the cached FA span clamps
+  to the pool (so the graph stops rebuilding once the window hits pool
+  size), and the pool span stays MASKLESS like the rest of that path — the
+  pager zeroes freed blocks (page-out and `zero_free_blocks()` on request
+  reset), so evicted slots contribute exp(-max) ~ 0, production's own
+  padded-span approximation. Hybrid spec decode (literal-offset KV writes)
+  falls back to pipelined AR under kvflash.
+- **laguna**: ALL 40 layers pooled (full + SWA share the pager).
+  `laguna_step` / `laguna_step_hybrid` take a const pager; both masks are
+  built in SLOT space via `fill_slot_pos` (the causal / sliding-window
+  conditions evaluate on the position each slot holds). SWA exactness:
+  `tail_window_chunks >= sliding_window/64 + 1`, so positions inside the
+  window are never evicted. The per-layer hybrid decode fallback and
+  NO_KVPAD / PAD_CPY / no_mask ablations are refused under kvflash.
+- **gemma4**: pools FULL-attention layers only — SWA layers already use
+  sliding-window ring buffers and KV-reuse layers share their source's
+  tensors. The full mask is slot-space; the SWA ring path is untouched.
+  `--fa-window` (sparse full-attn) and kvflash are mutually exclusive.
+  DFlash spec verify is slot-mapped (gemma4_verify_batch gains set_rows
+  inputs + the slot-space causal mask; its KV-truncation rejection
+  semantics map directly onto the pool's validity rule). Measured:
+  identical acceptance pooled vs full (407/3104 = 13.1%, avg_commit
+  3.09, identical text).
+
+Policy: drafter-scored residency is the default on all four archs. The
+server probes for the Qwen3-0.6B next to the model (or --prefill-drafter)
+and lazy-loads it at the first reselect; `--kvflash-policy lru` opts out.
+qwen35/qwen35moe feed the drafter target ids directly; laguna/gemma4 use
+KvFlashCrossTokScorer (detokenize -> re-tokenize -> score -> map back by
+char spans; functional but untuned, see RESULTS). `--kvflash auto` sizes
+the pool from free VRAM at the model's KV density, capped at the decode
+speed knee (16384 default).
+
+Snapshots on laguna/gemma4 are refused once a chunk has relocated
+(page_outs > 0); identity-layout snapshots before that still work.
+
+## Follow-ups
+
+Done since the prototype: pooled chunked prefill in the qwen35 daemon
+(prompt > pool, eviction during prefill), spec-decode chain verify on the
+pool, VRAM-aware auto sizing, cross-tokenizer scoring for laguna/gemma4.
+
+Open:
+1. Drafter KV persistence for the indexer (incremental rescore: push
+   only the new τ tokens through the drafter; kills the ~240 ms re-prefill).
+2. Pooled chunked prefill for laguna/gemma4 (qwen35-only today).
+3. Pooled snapshot save/restore (serialize the page table + host store).
+4. Async paging on a copy stream (currently synchronous
+   ggml_backend_tensor_get/set between steps).
+5. Teacher-forced NIAH harness for non-qwen archs + cross-tok scorer
+   tuning (tail window, normalization).
diff --git a/optimizations/kvflash/README.md b/optimizations/kvflash/README.md
new file mode 100644
index 000000000..a54406453
--- /dev/null
+++ b/optimizations/kvflash/README.md
@@ -0,0 +1,133 @@
+<p align="left">
+  <a href="../README.md">← lucebox-hub</a>
+</p>
+
+<p align="center">
+  <img src="hero.png" width="600" />
+</p>
+
+<h1 align="center">Luce KVFlash</h1>
+
+<p align="center">
+  <strong>Lookahead sparse attention for dflash. Bounded KV residency on one GPU.</strong><br/>
+  The attention KV cache lives in a fixed pool of slots; cold 64-token chunks page to host RAM, bit-exact and recallable.
+  With pflash, its drafter doubles as a Memory Indexer that recalls the context the generation needs next.<br/>
+  Qwen3.6-27B Q4_K_M on a single RTX 3090: <strong>native 256K context at 38.6 tok/s with 72 MiB of resident KV</strong>,
+  needle recall 88-100% at 6% residency, harness accuracy unchanged (32/32 vs full cache).
+</p>
+
+---
+
+```
+                         decode tok/s   KV in VRAM (Q8_0)   needle /16 (d=25%)
+full cache  @  64K            27.8        1152 MiB        16/16
+full cache  @ 128K            19.6        2304 MiB        16/16
+full cache  @ 256K            13.1        4608 MiB        16/16
+KVFlash 4K  @  64K            38.6          72 MiB        14/16
+KVFlash 4K  @ 128K            38.6          72 MiB        14/16
+KVFlash 4K  @ 256K            38.6          72 MiB        15/16
+```
+
+Decode speed is flat at any context length (the per-step KV read is pool-sized,
+not context-sized), prefill is up to 2.8x faster, and a 256K prompt that costs
+4.6 GiB of VRAM as a full cache costs 72 MiB resident + 4.2 GiB of host RAM.
+(The full-cache 256K rows are measured, not extrapolated: they fit the 24 GB
+card only thanks to Q8_0 KV; with F16 KV the cache alone is 9.2 GiB and 256K
+does not fit at all.)
+
+## Usage
+
+```bash
+dflash_server model.gguf --max-ctx 32768 --kvflash auto           # one flag; drafter policy if a drafter is found, else LRU
+dflash_server model.gguf --max-ctx 32768 --kvflash auto \
+    --prefill-drafter qwen3-0.6b.gguf                             # drafter-scored residency
+dflash_server model.gguf --max-ctx 32768 --kvflash 8192           # explicit pool size
+```
+
+Drafter-scored residency is the DEFAULT policy on every model family:
+the server probes for `Qwen3-0.6B-BF16.gguf` next to the model (same
+dir, `drafter/`, `draft/`, then `/opt/lucebox/models/drafter/`) and
+lazy-loads it on the first reselect; `--prefill-drafter` overrides the
+location, prefill compression can stay off either way. Qwen-family
+targets feed the drafter their ids directly; laguna and gemma4 bridge
+the tokenizer gap with `KvFlashCrossTokScorer` (relevance is a property
+of the TEXT, so the target's history is detokenized, re-tokenized for
+the drafter, scored, and mapped back to chunk boundaries by character
+spans). LRU is the fallback when no drafter is found (the banner says
+which policy you got) or the explicit choice via `--kvflash-policy lru`.
+`auto` sizes the pool from the GPU, not a fixed fraction: half of the
+free VRAM left after weights (minus a reserve for compute buffers and
+the drafter), converted at the model's KV density, capped where decode
+speed stays near the flat optimum (16384 tokens by default,
+`DFLASH_KVFLASH_MAX_POOL` to override) and at `--max-ctx`. Bigger pools
+mean more resident chunks and fewer forced evictions of useful context;
+the cap keeps the per-step KV read small enough that decode stays near
+the small-pool speed.
+
+- `--kvflash <tokens|auto>`: resident pool size (rounded to a multiple of 256; clamped to
+  `--max-ctx`; floored at the protected minimum — 512 for qwen-family and
+  gemma4, larger on laguna where the SWA window stays resident — so
+  eviction always has a victim). Env: `DFLASH_KVFLASH`.
+- `--kvflash-tau <N>`: reselect interval floor (default 64; the effective
+  interval grows with history so rescore overhead stays ~15% of decode).
+  Env: `DFLASH_KVFLASH_TAU`.
+
+Sizing rule: without a drafter, pool >= prompt + generation headroom
+(LRU is recency-only memory — an undersized pool can evict the question
+itself). With pflash's drafter attached, 25% of the expected context is a
+conservative default and 6-9% is measured safe for retrieval workloads.
+
+## Model support
+
+`--kvflash` works on every architecture the daemon serves:
+
+| arch | models | decode path | policy | notes |
+|---|---|---|---|---|
+| qwen35 | Qwen3.5/3.6-27B | masked set_rows decode + slot-mapped spec verify | LRU or pflash drafter | reference integration; all RESULTS.md numbers |
+| qwen35moe | Qwen3.6-35B-A3B | pipelined hybrid decode (Spark) + all-GPU | LRU or pflash drafter | maskless pool span (zero-row approximation, same as production padding); hybrid spec falls back to AR |
+| laguna | Laguna-XS.2 | single-graph hybrid + all-GPU, slot-space full+SWA masks | LRU or drafter (cross-tok, untuned) | pager covers all 40 layers; protected tail >= sliding_window keeps SWA exact |
+| gemma4 | Gemma4 26B-A4B / 31B | masked decode + slot-mapped spec verify, slot-space full mask | LRU or drafter (cross-tok, untuned) | pools FULL-attention layers only (SWA layers already ring-buffer) |
+
+Non-qwen targets use the cross-tokenizer scorer (detokenize target ids,
+re-tokenize for the drafter, score, map back by char spans); the
+`KvFlashScorer` seam stays open for native indexers.
+
+## How it works
+
+- **Pool**: attention KV tensors are allocated at pool size; a pager maps
+  logical positions to slots at 64-token chunk granularity. Cold chunks
+  move to a host backing store (~0.6 ms/chunk) and return bit-exact.
+- **Mask**: attention spans the pool with a slot-validity mask, uploaded
+  before every compute. Exact, and free (25.10 vs 25.52 ms/step maskless).
+- **Reselect**: every tau decoded tokens the scorer re-ranks all chunks
+  (resident or host-backed) and `reselect()` repages the pool — the
+  lookahead loop from FlashMemory (arXiv 2606.09079), with the pflash
+  drafter standing in for their trained indexer, and a hard capacity cap
+  their threshold mechanism lacks.
+- **Spec decode**: chain-mode verify is slot-mapped (per-token
+  `kv_write_rows` + slot-space mask); rejected drafts need no rollback —
+  their slots are excluded by the validity rule until rewritten.
+  Acceptance parity with the full cache (15.4-15.6% vs 15.3%), with or
+  without the --ddtree configuration (fast rollback only snapshots
+  DeltaNet state, which is never pooled).
+- **Prefill**: prompts larger than the pool prefill in 64-token chunks at
+  constant VRAM (linear time; 256K in ~5.9 min on the 3090).
+
+Quality verdict (harness ground truth, base-vs-base control included):
+full results in [RESULTS.md](RESULTS.md). Outputs are not guaranteed
+byte-identical to the full cache on long generations (the masked kernel
+path rounds differently — a different deterministic lineage), but
+correctness is identical: 32/32 vs 32/32 across HumanEval, GSM, MATH, and
+agent suites.
+
+## Files
+
+- `server/src/common/kvflash_pager.h` — pool, page table, host store, reselect
+- `server/src/common/kvflash_scorer.h` — chunk-relevance policy interface
+- `server/src/qwen3/qwen3_kvflash_scorer.{h,cpp}` — pflash-drafter scorer
+  (tail attention; bisects on allocation pressure)
+- `server/src/qwen35/*` — cache `ctx_alloc`, masked pooled decode, slot-mapped
+  spec verify, daemon flags
+- `server/test/test_kvflash.cpp` — verification suite (A-F), `--niah`,
+  `--niah256`, `--longab`
+- [DESIGN.md](DESIGN.md) — mechanism details and tuning notes
diff --git a/optimizations/kvflash/RESULTS.md b/optimizations/kvflash/RESULTS.md
new file mode 100644
index 000000000..513412311
--- /dev/null
+++ b/optimizations/kvflash/RESULTS.md
@@ -0,0 +1,127 @@
+# KVFlash — measured results
+
+All numbers: single RTX 3090 (24 GB), Qwen3.6-27B Q4_K_M target, Q8_0 KV,
+Qwen3-0.6B pflash drafter as the scorer. June 2026, `test_kvflash` +
+`dflash_server` + `harness/benchmarks`.
+
+## End-to-end long-prompt A/B (`--longab`; needle depth 0.25, 240-token timed free run)
+
+| context | mode | prefill | decode tok/s | needle /16 | KV in VRAM (Q8_0) |
+|---|---|---|---|---|---|
+| 32K  | full    | 47.2 s  | 32.8 | 16 | 576 MiB |
+| 32K  | KVFlash 4K | 41.8 s | 29.0 | 15 | 72 MiB |
+| 64K  | full    | 130.6 s | 27.8 | 16 | 1152 MiB |
+| 64K  | KVFlash 4K | 87.5 s | **38.6** | 14 | **72 MiB** |
+| 128K | full    | 335.9 s | 19.6 | 16 | 2304 MiB |
+| 128K | KVFlash 4K | 177.8 s | **38.6** | 14 | **72 MiB** |
+| 256K | full    | 999.0 s | 13.1 | 16 | 4608 MiB |
+| 256K | KVFlash 4K | **354.9 s** | **38.6** | 15 | **72 MiB** |
+
+Decode is flat at 38.6 tok/s from 64K to native-max 256K (speedups over
+the full cache at 64K / 128K / 256K: 1.4x / 2.0x / 2.9x); prefill speedups
+at the same contexts: 1.5x / 1.9x / 2.8x. One drafter rescore runs per
+query, taking 9-70 s depending on context (bisected above the drafter's
+~65K single-pass ceiling).
+
+Note on the 256K full-cache row: it fits the 24 GB card only because the
+KV is Q8_0 (~15.3 GiB weights + 4.6 GiB KV ~ 21 GiB, measured, no OOM).
+With F16 KV the cache alone is 9.2 GiB and 256K does NOT fit; KVFlash is
+indifferent (72 MiB resident either way).
+
+## Retrieval quality vs residency (synthetic NIAH, teacher-forced /16)
+
+| context | residency | LRU (d=10/50/90%) | drafter (d=10/50/90%) | full control |
+|---|---|---|---|---|
+| 8K   | 25%   | 0 / 0 / 16 | 15 / 15 / 16 | 16/16 |
+| 8K   | 9%    | 0 / 0 / 0  | 15 / 15 / 15 | 16/16 |
+| 32K  | 25%   | 0 / 0 / 16 | 15 / 15 / 16 | 16/16 |
+| 32K  | 9%    | 0 / 0 / 0  | 15 / 15 / 15 | 15-16/16 |
+| 256K | 6.25% | 0 (d=0.5); 16/16 in-window | 14 / 15 / 15 | (in-window LRU = control) |
+
+Drafter-scored residency retains 88-100% of perfect recall at every depth
+down to 6-9% residency; recency-only LRU retains zero outside its tail
+window (mirrors FlashMemory's Recency-Only ablation).
+
+## Harness ground truth (pool sized per the heuristic, vs full cache)
+
+| suite | baseline pass | KVFlash pass | exact text match |
+|---|---|---|---|
+| HumanEval | 10/10 | **10/10** | 10/10 |
+| GSM       | 10/10 | **10/10** | 8/10 |
+| MATH      | 10/10 | **10/10** | 4/10 |
+| agent (to 24K prompts) | 6/6 | **6/6** | 2/6 |
+
+Base-vs-base control: 16/16 byte-identical — the stack is deterministic.
+Text drift under KVFlash is the masked decode kernel's different (equally
+deterministic) rounding lineage, not noise and not a correctness effect.
+
+## Spec decode (slot-mapped verify, daemon)
+
+| config | accept rate | avg_commit | output |
+|---|---|---|---|
+| qwen35 full cache, 2400 tok | 15.3% | 3.45 | coherent |
+| qwen35 KVFlash 2K, 1800 tok | 15.4% | 3.47 | coherent |
+| qwen35 KVFlash 2K, 2400 tok (live eviction mid-spec) | 15.6% | 3.49 | coherent |
+| qwen35 --ddtree full cache, 600 tok | 13.9% | 3.23 | coherent |
+| qwen35 --ddtree KVFlash 2K, 600 tok | 14.6% | 3.33 | coherent |
+| gemma4 full cache, 600 tok | 13.1% (407/3104) | 3.09 | coherent |
+| gemma4 KVFlash 2K, 600 tok | 13.1% (407/3104) | 3.09 | identical text to full |
+| qwen35moe A3B all-GPU --ddtree full cache, 500 tok | 11.5% | 2.84 | coherent |
+| qwen35moe A3B all-GPU --ddtree KVFlash 2K, 500 tok | 10.4% | 2.66 | coherent |
+
+## Microbenchmarks
+
+- Memory at 128K-logical: attn-KV 2304 -> 18 MiB (99.2%) with a 1K pool;
+  whole cache buffer 2654 -> 218 MiB, confirmed via VRAM deltas.
+- Exact slot mask is free: 25.10 ms/step masked vs 25.52 maskless.
+- Paging: page_out p50 1.27 ms / page_in 0.64 ms per 64-token chunk
+  (~2.2 MiB, synchronous); ~0.01 ms/token amortized at observed rates.
+- reselect() repaging 20 chunks: 21.3 ms.
+- Relocation equivalence (shuffled physical placement, teacher-forced
+  1200 tokens): ~99% argmax agreement; page_out/page_in roundtrip
+  bit-exact.
+
+## Multi-architecture smokes (pool 1024, --max-ctx 8192, ~1235 logical tokens, live LRU eviction mid-request, RTX 3090)
+
+| arch | model | mode | decode tok/s | output |
+|---|---|---|---|---|
+| qwen35 | Qwen3.6-27B Q4_K_M | all-GPU, masked pool | 37.4 | coherent |
+| qwen35moe | Qwen3.6-35B-A3B UD-Q4_K_M | Spark hybrid (9403 hot / 837 cold experts), pipelined decode | 101.6 | coherent |
+| laguna | Laguna-XS.2 Q4_K_M | Spark hybrid, single-graph decode, slot-space full+SWA masks | 137.1 | coherent |
+| gemma4 | Gemma4 26B-A4B UD-Q4_K_M | all-GPU, slot-space full mask, SWA rings untouched | 119.0 | coherent |
+
+Gemma4 control on the same build without the flag: 120.2 tok/s, no
+kvflash code engaged — the default path is unchanged.
+
+## Cross-tokenizer scorer (laguna/gemma4) — early result
+
+Stress A/B on gemma4 26B-A4B (pool 1024, needle at pos ~170, recital
+demanded ~1700 generated tokens later, beyond the SWA ring and the pool):
+LRU never recites and degenerates into filler repetition; the cross-tok
+drafter stays coherent for 1.9K tokens, reaches the recital, and recalls
+the correct prefix but not the exact code. Strictly better than LRU,
+not yet at the qwen-native scorer's 14-16/16; treat as functional but
+untuned (follow-up: teacher-forced NIAH harness for non-qwen archs,
+tail-window/normalization tuning).
+
+## Known limits
+
+- qwen35moe `--spark` (hybrid expert offload) speculative decode crashes
+  with a CUDA illegal-memory-access — a pre-existing bug in the hybrid
+  spec path (`do_hybrid_spec_decode`), independent of KVFlash (it crashes
+  with the full cache too). It was never exercisable before because no
+  A3B DFlash draft could be converted; the converter fix in this branch
+  now loads them, surfacing the crash. Tracked separately; `--spark`
+  spec falls back to pipelined AR under KVFlash. All-GPU MoE spec decode
+  (experts resident, no `--spark`) works on the pool — see the spec table.
+
+
+- The harness-only tree-verify graphs (test_dflash) are not pool-aware;
+  the daemon's spec decode, including the --ddtree configuration (chain
+  verify + fast rollback), runs fully on the pool.
+- Post-generation snapshots are skipped once cur_pos exceeds the pool
+  (pooled snapshots need page-table serialization).
+- Paging is synchronous (copy-stream overlap is a follow-up).
+- Memory-dense tasks needing the entire context at once (MRCR-style) are
+  a paradigm limit shared with FlashMemory; size the pool up for those.
+- 512K+ requires RoPE scaling (model native max is 256K) — memory-side
+  KVFlash already scales (host backing is the only growth).
diff --git a/optimizations/kvflash/hero.png b/optimizations/kvflash/hero.png
new file mode 100644
index 000000000..3fb3ce50e
--- /dev/null
+++ b/optimizations/kvflash/hero.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee1577f6ef97b030430041266532d39828749e1ef5868f58a0335955dcad9e7c
+size 2255374
diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
index 1ea6fd3fa..05d5add15 100644
--- a/server/CMakeLists.txt
+++ b/server/CMakeLists.txt
@@ -219,6 +219,7 @@ add_library(dflash_common STATIC
     src/draft/draft_safetensors_loader.cpp
     src/draft/draft_graph.cpp
     src/qwen3/qwen3_drafter.cpp
+    src/qwen3/qwen3_kvflash_scorer.cpp
     src/qwen3/qwen3_loader.cpp
     src/qwen3/qwen3_graph.cpp
     src/qwen3/qwen3_backend.cpp
@@ -724,6 +725,11 @@ if(DFLASH27B_TESTS)
         target_include_directories(test_generate PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
         target_link_libraries(test_generate PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_kvflash.cpp")  # build test_kvflash only when its source file is present
+        add_executable(test_kvflash test/test_kvflash.cpp)
+        target_include_directories(test_kvflash PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
+        target_link_libraries(test_kvflash PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
+    endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_restore_delta.cpp")
         add_executable(test_restore_delta test/test_restore_delta.cpp)
         target_include_directories(test_restore_delta PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
diff --git a/server/scripts/convert_dflash_to_gguf.py b/server/scripts/convert_dflash_to_gguf.py
index fae1be7e5..106c04540 100644
--- a/server/scripts/convert_dflash_to_gguf.py
+++ b/server/scripts/convert_dflash_to_gguf.py
@@ -39,7 +39,14 @@
 import gguf
 
 # ──────────────────────────────────────────────────────────────────────
-# DFlash 27B draft architecture constants
+# DFlash draft architecture constants — DEFAULTS ONLY.
+#
+# These are the qwen35-27B draft's values; they are used as a fallback when
+# the source model has no config.json. Any other draft (A3B, gemma, ...) has
+# a different head/dim/layer config, so the real scalars are read from the
+# source config.json + derived from the tensor shapes in load_arch(). A
+# converter that hardcoded these silently produced GGUFs with correct
+# weights but 27B metadata, which the strict draft loader then rejected.
 # ──────────────────────────────────────────────────────────────────────
 
 ARCH                = "qwen35-dflash-draft"
@@ -50,7 +57,7 @@
 HEAD_DIM            = 128
 INTERMEDIATE        = 17408
 VOCAB               = 248320
-N_TARGET_LAYERS     = 5            # fc projects 5*hidden -> hidden
+N_TARGET_LAYERS     = 5            # fc projects N_TARGET_LAYERS*hidden -> hidden
 ROPE_THETA          = 1_000_000.0
 RMS_EPS             = 1e-6
 MASK_TOKEN_ID       = 248070
@@ -58,6 +65,89 @@
 CTX_LEN             = 32768
 
 
+def load_arch(safetensors: Path, header: dict) -> dict:
+    """Resolve the draft's architecture scalars. config.json (next to the
+    safetensors) is authoritative for the transformer hparams; the tensor
+    shapes are authoritative for the rest, so the result always matches the
+    weights even when config.json is partial or absent."""
+    a = dict(hidden=HIDDEN, n_layer=N_LAYER, n_head=N_HEAD, n_head_kv=N_HEAD_KV,
+             head_dim=HEAD_DIM, intermediate=INTERMEDIATE, vocab=VOCAB,
+             n_target_layers=N_TARGET_LAYERS, rope_theta=ROPE_THETA,
+             rms_eps=RMS_EPS, mask_token_id=MASK_TOKEN_ID, block_size=BLOCK_SIZE,
+             ctx_len=CTX_LEN)
+
+    cfg_path = safetensors.parent / "config.json"
+    if cfg_path.exists():
+        c = json.loads(cfg_path.read_text())
+        def pick(*keys):
+            for k in keys:
+                if k in c and c[k] is not None:
+                    return c[k]
+            return None
+        for dst, val in (
+            ("hidden",       pick("hidden_size")),
+            ("n_layer",      pick("num_hidden_layers")),
+            ("n_head",       pick("num_attention_heads")),
+            ("n_head_kv",    pick("num_key_value_heads")),
+            ("head_dim",     pick("head_dim")),
+            ("intermediate", pick("intermediate_size")),
+            ("vocab",        pick("vocab_size")),
+            ("rope_theta",   pick("rope_theta")),
+            ("rms_eps",      pick("rms_norm_eps")),
+            ("n_target_layers", pick("n_target_layers", "num_target_layers")),
+            ("mask_token_id",   pick("mask_token_id")),
+            ("block_size",      pick("block_size", "draft_block_size")),
+            ("ctx_len",         pick("max_position_embeddings")),
+        ):
+            if val is not None:
+                a[dst] = val
+        print(f"[info] read arch from {cfg_path}")
+    else:
+        print(f"[warn] no config.json next to safetensors; using 27B defaults")
+
+    # Weights are ground truth — derive/verify from tensor shapes.
+    def shape_of(st_name):
+        e = header.get(st_name)
+        return e["shape"] if e else None
+
+    # hidden absent in config: k-proj is [n_head_kv*head_dim, hidden] -> ne[1].
+    k0 = shape_of("layers.0.self_attn.k_proj.weight")
+    if (not cfg_path.exists()) and k0:
+        a["hidden"] = k0[1]
+    # head_dim absent in config: derive from k-proj (n_head_kv * head_dim).
+    if k0 and a["n_head_kv"]:
+        derived_hd = k0[0] // a["n_head_kv"]
+        if not cfg_path.exists() or "head_dim" not in json.loads(cfg_path.read_text() if cfg_path.exists() else "{}"):
+            a["head_dim"] = derived_hd
+    # intermediate: ffn gate/up is [intermediate, hidden] — ne[0].
+    g0 = shape_of("layers.0.mlp.gate_proj.weight")
+    if g0:
+        a["intermediate"] = g0[0]
+    # n_target_layers: fc.weight is [hidden, n_target*hidden]; ne[0] (the
+    # larger dim) / hidden is the capture count the loader checks.
+    fc = shape_of("fc.weight")
+    if fc and a["hidden"]:
+        a["n_target_layers"] = max(fc) // a["hidden"]
+    # n_layer: count the actual blocks present.
+    n_blocks = 1 + max((int(n.split(".")[1]) for n in header
+                        if n.startswith("layers.") and n.split(".")[1].isdigit()),
+                       default=a["n_layer"] - 1)
+    a["n_layer"] = n_blocks
+
+    # Consistency check against the k-proj weight.
+    if k0:
+        exp_kv = a["n_head_kv"] * a["head_dim"]
+        if exp_kv != k0[0]:
+            print(f"[error] config n_head_kv*head_dim={exp_kv} != "
+                  f"k_proj.weight dim {k0[0]}; fix config.json", file=sys.stderr)
+            sys.exit(1)
+    print(f"[info] arch: hidden={a['hidden']} n_layer={a['n_layer']} "
+          f"n_head={a['n_head']} n_head_kv={a['n_head_kv']} "
+          f"head_dim={a['head_dim']} ff={a['intermediate']} vocab={a['vocab']} "
+          f"n_target_layers={a['n_target_layers']}")
+    return a
+
+
 # ──────────────────────────────────────────────────────────────────────
 # Tensor name mapping  —  DFlash safetensors -> llama.cpp GGUF
 # ──────────────────────────────────────────────────────────────────────
@@ -155,29 +245,30 @@ def main():
     n_entries = sum(1 for k in header if k != "__metadata__")
     print(f"[info]   {n_entries} tensor entries")
 
+    a = load_arch(args.safetensors, header)
+
     writer = gguf.GGUFWriter(args.out_gguf, ARCH)
 
-    # Architecture metadata
-    writer.add_string("general.name", "Qwen3.5-27B-DFlash-Draft")
-    writer.add_uint32(f"{ARCH}.context_length",          CTX_LEN)
-    writer.add_uint32(f"{ARCH}.embedding_length",        HIDDEN)
-    writer.add_uint32(f"{ARCH}.block_count",             N_LAYER)
-    writer.add_uint32(f"{ARCH}.feed_forward_length",     INTERMEDIATE)
-    writer.add_uint32(f"{ARCH}.attention.head_count",    N_HEAD)
-    writer.add_uint32(f"{ARCH}.attention.head_count_kv", N_HEAD_KV)
-    # llama.cpp uses key_length / value_length to override the default
-    # n_embd_head = n_embd / n_head heuristic (DFlash has n_embd=5120
-    # but head_dim=128 so n_head*head_dim=4096 != n_embd).
-    writer.add_uint32(f"{ARCH}.attention.key_length",    HEAD_DIM)
-    writer.add_uint32(f"{ARCH}.attention.value_length",  HEAD_DIM)
-    writer.add_uint32(f"{ARCH}.vocab_size",              VOCAB)
-    writer.add_float32(f"{ARCH}.attention.layer_norm_rms_epsilon", RMS_EPS)
-    writer.add_float32(f"{ARCH}.rope.freq_base",         ROPE_THETA)
+    # Architecture metadata (resolved from config.json + tensor shapes)
+    writer.add_string("general.name", f"DFlash-Draft-{a['hidden']}h-{a['n_layer']}L")
+    writer.add_uint32(f"{ARCH}.context_length",          a["ctx_len"])
+    writer.add_uint32(f"{ARCH}.embedding_length",        a["hidden"])
+    writer.add_uint32(f"{ARCH}.block_count",             a["n_layer"])
+    writer.add_uint32(f"{ARCH}.feed_forward_length",     a["intermediate"])
+    writer.add_uint32(f"{ARCH}.attention.head_count",    a["n_head"])
+    writer.add_uint32(f"{ARCH}.attention.head_count_kv", a["n_head_kv"])
+    # key_length / value_length override the n_embd/n_head heuristic, which
+    # is wrong for DFlash drafts (n_head*head_dim != n_embd).
+    writer.add_uint32(f"{ARCH}.attention.key_length",    a["head_dim"])
+    writer.add_uint32(f"{ARCH}.attention.value_length",  a["head_dim"])
+    writer.add_uint32(f"{ARCH}.vocab_size",              a["vocab"])
+    writer.add_float32(f"{ARCH}.attention.layer_norm_rms_epsilon", a["rms_eps"])
+    writer.add_float32(f"{ARCH}.rope.freq_base",         a["rope_theta"])
 
     # DFlash-specific hyperparameters
-    writer.add_uint32(f"{ARCH}.dflash.n_target_layers", N_TARGET_LAYERS)
-    writer.add_uint32(f"{ARCH}.dflash.block_size",      BLOCK_SIZE)
-    writer.add_uint32(f"{ARCH}.dflash.mask_token_id",   MASK_TOKEN_ID)
+    writer.add_uint32(f"{ARCH}.dflash.n_target_layers", a["n_target_layers"])
+    writer.add_uint32(f"{ARCH}.dflash.block_size",      a["block_size"])
+    writer.add_uint32(f"{ARCH}.dflash.mask_token_id",   a["mask_token_id"])
 
     # Walk + add tensors. Sort: dflash.* singletons first, then output_*,
     # then per-layer in numeric order — keeps the on-disk layout stable.
diff --git a/server/src/common/kvflash_pager.h b/server/src/common/kvflash_pager.h
new file mode 100644
index 000000000..1b4679db9
--- /dev/null
+++ b/server/src/common/kvflash_pager.h
@@ -0,0 +1,548 @@
+// KvFlashPager — KVFlash core: a bounded resident pool for the
+// full-attention KV cache (see optimizations/kvflash/).
+//
+// Lookahead-sparse-attention-style (FlashMemory, arXiv 2606.09079)
+// decode-time KV residency for the qwen35 target: the cache tensors are
+// allocated at POOL size (a fraction of the logical context), and this
+// class owns the mapping from logical token positions to physical pool
+// slots. Chunks (64 logical tokens) that fall cold are paged out to a
+// host backing store and their slots are reused; paged-out chunks remain
+// recallable bit-exact. GPU footprint is a hard O(pool) bound regardless
+// of logical context length.
+//
+// Policy-agnostic by design: with no scorer, eviction is LRU over
+// unprotected chunks (recency-only memory). A KvFlashScorer plugged into
+// `score_hook` upgrades eviction and reselect() to relevance-driven
+// residency; with pflash enabled, its drafter attaches automatically
+// (KvFlashDrafterScorer) and recalls cold context the generation needs.
+//
+// Correctness notes (why relocating rows is legal):
+//  * RoPE is baked into K rows at write time from the `positions` input,
+//    so a row's physical slot is semantically irrelevant.
+//  * Attention runs over the whole pool with a slot-validity mask
+//    (resident = 0, free/paged-out = -inf). The mask must be re-uploaded
+//    before EVERY compute: input tensors live in the gallocr compute
+//    buffer whose regions are reused during graph execution.
+//  * Freed slots are additionally zeroed (defense in depth; a zero K row
+//    contributes exp(-max) ~ 0, the same assumption the production
+//    stride-256 padded span relies on in maskless mode).
+//  * The FWHT K-rotation and KV quantization operate per-row; page-out /
+//    page-in moves raw quantized bytes and is therefore bit-exact.
+//
+// Scope: full-attention layers only. DeltaNet/conv recurrent state is
+// fixed-size, position-dependent in-place state and is never paged.
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <functional>
+#include <string>
+#include <vector>
+
+namespace dflash::common {
+
+struct KvFlashConfig {            // residency tunables consumed by KvFlashPager::attach()
+    int chunk_tokens       = 64;  // logical tokens per page (the paging granularity)
+    int pool_tokens        = 0;   // resident pool capacity in tokens; attach() requires a positive multiple of chunk_tokens
+    int sink_chunks        = 1;   // leading chunks never evicted (attention sinks)
+    int tail_window_chunks = 4;   // trailing chunks never evicted (local window)
+};
+
+struct KvFlashStats {         // pager counters exposed through KvFlashPager::stats()
+    int64_t page_outs  = 0;   // number of page_out events
+    int64_t page_ins   = 0;   // number of recalls from the host store
+    int64_t host_bytes = 0;   // backing store currently held on host (zeroed by reset())
+    int64_t moved_bytes = 0;  // cumulative D2H+H2D traffic
+};
+
+class KvFlashPager {
+public:
+    // Minimum pool for a config: sinks + trailing window stay resident
+    // unconditionally, so at least 2 more chunks are required (1 evictable
+    // victim + the partially filled append head) or eviction deadlocks and
+    // slot_for() starts failing once the pool fills.
+    static int min_pool_tokens(const KvFlashConfig & cfg) {
+        return (cfg.sink_chunks + cfg.tail_window_chunks + 2) * cfg.chunk_tokens;
+    }
+
+    // Bind the pager to the cache tensors and size the pool.
+    // `attn_k` / `attn_v` are the per-full-attention-layer cache tensors,
+    // each [head_dim, pool_tokens, n_head_kv]. All must share dims/types
+    // within their K/V group (segment sizes are taken from layer 0).
+    //
+    // Returns false — leaving the pager exactly as it was — when the config
+    // or tensors are unusable: non-positive chunk/pool size, negative
+    // sink/tail counts, pool not a multiple of the chunk size or below
+    // min_pool_tokens(), empty or mismatched tensor lists, a null tensor,
+    // or a tensor with fewer than pool_tokens rows. On success all
+    // mappings, stats, the LRU clock and the append head start fresh and
+    // the epoch advances so cached masks refill.
+    bool attach(const KvFlashConfig & cfg,
+                const std::vector<ggml_tensor *> & attn_k,
+                const std::vector<ggml_tensor *> & attn_v) {
+        // All validation happens before any member is written: a failed
+        // attach must not leave n_blocks_ > 0 (attached() == true) behind.
+        if (cfg.chunk_tokens <= 0) return false;   // also guards the modulo below
+        if (cfg.sink_chunks < 0 || cfg.tail_window_chunks < 0) return false;
+        if (cfg.pool_tokens <= 0 || cfg.pool_tokens % cfg.chunk_tokens != 0) return false;
+        if (cfg.pool_tokens < min_pool_tokens(cfg)) {
+            std::fprintf(stderr,
+                "kvflash: pool %d < minimum %d (%d sink + %d tail chunks must "
+                "leave an evictable block)\n",
+                cfg.pool_tokens, min_pool_tokens(cfg),
+                cfg.sink_chunks, cfg.tail_window_chunks);
+            return false;
+        }
+        if (attn_k.empty() || attn_k.size() != attn_v.size()) return false;
+        for (size_t l = 0; l < attn_k.size(); l++) {
+            const ggml_tensor * k = attn_k[l];
+            const ggml_tensor * v = attn_v[l];
+            if (!k || !v) return false;
+            // Every layer is copied/zeroed at pool-block offsets, so every
+            // tensor (not just layer-0 K) must hold the whole pool.
+            if (k->ne[1] < cfg.pool_tokens || v->ne[1] < cfg.pool_tokens) return false;
+        }
+
+        cfg_ = cfg;
+        attn_k_ = attn_k;
+        attn_v_ = attn_v;
+        n_blocks_ = cfg.pool_tokens / cfg.chunk_tokens;
+        const ggml_tensor * K0 = attn_k[0];
+        n_head_kv_ = (int)K0->ne[2];
+
+        // Per-(tensor, head) contiguous segment of chunk_tokens rows.
+        k_seg_bytes_ = (size_t)cfg.chunk_tokens * K0->nb[1];
+        v_seg_bytes_ = (size_t)cfg.chunk_tokens * attn_v[0]->nb[1];
+        chunk_bytes_ = (k_seg_bytes_ + v_seg_bytes_) * (size_t)n_head_kv_ * attn_k.size();
+        zero_buf_.assign(std::max(k_seg_bytes_, v_seg_bytes_), 0);
+
+        // Free list is a stack: pushing high-to-low hands out block 0 first.
+        free_blocks_.clear();
+        for (int b = n_blocks_ - 1; b >= 0; b--) free_blocks_.push_back(b);
+        chunks_.clear();
+        stats_ = {};
+        clock_ = 0;
+        cur_chunk_ = 0;   // a re-attach must not inherit the old append head
+        epoch_++;         // residency changed: cached slot masks are stale
+        return true;
+    }
+
+    // Override the order in which pool blocks are handed out (used e.g. by
+    // relocation tests that want shuffled placement). `order[i]` is the
+    // i-th block to hand out; the free list is a stack popped from the
+    // back, so the sequence is stored reversed.
+    void set_block_order(const std::vector<int> & order) {
+        free_blocks_.clear();
+        free_blocks_.reserve(order.size());
+        for (auto it = order.rbegin(); it != order.rend(); ++it) {
+            free_blocks_.push_back(*it);
+        }
+    }
+
+    // Start a new request / cache reset: forget every chunk mapping and
+    // its host backing and return all blocks to the free list. Only the
+    // host_bytes gauge is cleared — the cumulative counters survive — and
+    // the epoch moves forward so callers refill any cached mask.
+    void reset() {
+        ++epoch_;
+        cur_chunk_ = 0;
+        stats_.host_bytes = 0;
+        chunks_.clear();
+        free_blocks_.clear();
+        // Stack order: highest block pushed first so block 0 pops first.
+        for (int b = n_blocks_; b-- > 0; ) {
+            free_blocks_.push_back(b);
+        }
+    }
+
+    // Overwrite every block on the free list with zeros. reset() only drops
+    // mappings, so the previous request's bytes are still in the pool;
+    // maskless consumers (the qwen35moe pipelined decode reads the whole
+    // padded pool span with no slot mask) need those stale rows to
+    // dequantise to ~zero contribution. Masked consumers don't need this
+    // but it is cheap (pool-sized memset, sub-ms).
+    void zero_free_blocks() {
+        const size_t n_free = free_blocks_.size();
+        for (size_t i = 0; i < n_free; ++i) {
+            zero_block(free_blocks_[i]);
+        }
+    }
+
+    bool attached() const { return n_blocks_ > 0; }          // true when the block count set by attach() is positive
+    int pool_tokens() const { return cfg_.pool_tokens; }     // configured pool capacity, in tokens
+    int chunk_tokens() const { return cfg_.chunk_tokens; }   // configured page size, in tokens
+
+    // Optional external relevance score; higher = keep. Falls back to LRU.
+    std::function<float(int /*chunk*/)> score_hook;
+
+    // Reserve pool slots for the logical range [kv_start, kv_start + n_tok)
+    // before a forward step, evicting LRU/low-score chunks when the pool is
+    // full. Stops at the first position that cannot get a slot, prints a
+    // diagnostic to stderr and returns false; returns true when the whole
+    // span is mapped.
+    bool alloc_span(int kv_start, int n_tok) {
+        int i = 0;
+        while (i < n_tok) {
+            const int pos = kv_start + i;
+            if (slot_for(pos) < 0) {
+                std::fprintf(stderr, "[kvflash] no pool slot at pos %d "
+                                     "(pool %d exhausted)\n",
+                             pos, cfg_.pool_tokens);
+                return false;
+            }
+            ++i;
+        }
+        return true;
+    }
+
+    // Physical pool slot for logical position `pos`. Allocates (and, when
+    // the pool is full, evicts) at chunk granularity; a chunk that was
+    // paged out is restored from the host store. Call once per appended
+    // token, in logical order. Returns -1 when `pos` is negative or when no
+    // block can be freed.
+    int slot_for(int64_t pos) {
+        // Negative positions would truncate to chunk 0 with a negative
+        // remainder and yield a bogus slot; report them as a failure.
+        if (pos < 0) return -1;
+        const int c = (int)(pos / cfg_.chunk_tokens);
+        // cur_chunk_ tracks the append head only; a page_in of an older
+        // chunk must not shrink the protected tail window. It must advance
+        // BEFORE eviction (so the victim search protects the new tail), but
+        // a failed allocation must roll it back or the next eviction's tail
+        // window is computed from a chunk that never materialized.
+        const int prev_cur_chunk = cur_chunk_;
+        if (c > cur_chunk_) cur_chunk_ = c;
+        if ((int)chunks_.size() <= c) chunks_.resize(c + 1);
+        ChunkState & st = chunks_[c];
+        if (st.block < 0) {
+            if (!ensure_free_block()) {
+                cur_chunk_ = prev_cur_chunk;
+                return -1;
+            }
+            st.block = free_blocks_.back();
+            free_blocks_.pop_back();
+            epoch_++;
+            if (st.on_host) {              // recall: restore paged-out bytes
+                copy_chunk(c, st.block, /*to_host=*/false);
+                stats_.page_ins++;
+                stats_.moved_bytes += chunk_bytes_;
+            }
+        }
+        st.last_use = ++clock_;            // LRU touch
+        return st.block * cfg_.chunk_tokens + (int)(pos % cfg_.chunk_tokens);
+    }
+
+    // Force chunk `c` out of the pool: its bytes are copied to the host
+    // backing store, its slots are zeroed and its block returns to the free
+    // list. Returns false when `c` is out of range (including negative) or
+    // the chunk is not resident.
+    bool page_out(int c) {
+        if (c < 0 || c >= (int)chunks_.size() || chunks_[c].block < 0) return false;
+        ChunkState & st = chunks_[c];
+        if (!st.on_host) {
+            // First eviction of this chunk: allocate its host buffer and
+            // account for it once.
+            st.host_data.resize(chunk_bytes_);
+            stats_.host_bytes += (int64_t)chunk_bytes_;
+        }
+        copy_chunk(c, st.block, /*to_host=*/true);
+        zero_block(st.block);
+        st.on_host = true;
+        free_blocks_.push_back(st.block);
+        st.block = -1;
+        epoch_++;
+        stats_.page_outs++;
+        stats_.moved_bytes += chunk_bytes_;
+        return true;
+    }
+
+    // Recall chunk `c` into the pool (used by reselect / tests). Returns
+    // false when `c` is out of range (including negative), has no host
+    // backing, is already resident, or no block could be freed for it.
+    bool page_in(int c) {
+        if (c < 0 || c >= (int)chunks_.size()) return false;
+        if (!chunks_[c].on_host || chunks_[c].block >= 0) return false;
+        // slot_for() on the chunk's first position performs the allocation
+        // and the host -> pool copy.
+        return slot_for((int64_t)c * cfg_.chunk_tokens) >= 0;
+    }
+
+    // True when chunk `c` currently occupies a pool block. Out-of-range
+    // indices (including negative ones) are simply "not resident".
+    bool is_resident(int c) const {
+        return c >= 0 && c < (int)chunks_.size() && chunks_[c].block >= 0;
+    }
+
+    // Reports whether the pool is still in identity layout: each resident
+    // chunk c lives in block c and no chunk has been paged out to host.
+    // Identity-copy snapshots depend on this contract; it holds from
+    // reset() until the first eviction of the CURRENT request (cumulative
+    // stats do not).
+    bool is_identity() const {
+        const int n = (int)chunks_.size();
+        for (int idx = 0; idx < n; ++idx) {
+            const ChunkState & st = chunks_[idx];
+            const bool resident = st.block >= 0;
+            // Resident: must sit in its own block. Not resident: must never
+            // have been paged out.
+            const bool broken = resident ? (st.block != idx) : st.on_host;
+            if (broken) return false;
+        }
+        return true;
+    }
+    // Pool block currently holding chunk `c`, or -1 when the chunk is not
+    // resident or `c` is out of range (including negative).
+    int block_of(int c) const {
+        return (c >= 0 && c < (int)chunks_.size()) ? chunks_[c].block : -1;
+    }
+
+    // Const lookup (no alloc / LRU touch): physical slot currently holding
+    // logical `pos`, or -1 if its chunk is not resident or `pos` is
+    // negative. Callers that may need an allocation must use slot_for()
+    // beforehand.
+    int slot_of(int64_t pos) const {
+        // A negative pos would truncate to chunk 0 with a negative
+        // remainder and return a slot that does not belong to it.
+        if (pos < 0) return -1;
+        const int c = (int)(pos / cfg_.chunk_tokens);
+        if (c >= (int)chunks_.size() || chunks_[c].block < 0) return -1;
+        return chunks_[c].block * cfg_.chunk_tokens + (int)(pos % cfg_.chunk_tokens);
+    }
+
+    // Writes, for every pool slot, the logical position it currently holds
+    // (-1 for slots of free blocks). `dst` must hold pool_tokens entries.
+    // This lets callers build masks with POSITION semantics in slot space
+    // (causal / sliding-window): the mask condition is evaluated on
+    // dst[slot] instead of the column index.
+    void fill_slot_pos(int32_t * dst) const {
+        std::fill_n(dst, cfg_.pool_tokens, (int32_t)-1);
+        const int per_chunk = cfg_.chunk_tokens;
+        int next_chunk = 0;
+        for (const ChunkState & st : chunks_) {
+            const int c = next_chunk++;
+            if (st.block < 0) continue;
+            int32_t * row = dst + (size_t)st.block * per_chunk;
+            const int32_t first_pos = (int32_t)c * per_chunk;
+            for (int k = 0; k < per_chunk; ++k) row[k] = first_pos + k;
+        }
+    }
+    const KvFlashStats & stats() const { return stats_; }   // pager counters (see KvFlashStats)
+    int resident_blocks() const { return n_blocks_ - (int)free_blocks_.size(); }   // blocks not on the free list
+    int n_chunks() const { return (int)chunks_.size(); }   // chunk-table size (includes never-materialized entries)
+
+    // Bumped on every residency change (slot allocation / page_in in
+    // slot_for, page_out, reset). Callers cache the slot mask and refill
+    // only when the epoch moves.
+    uint64_t epoch() const { return epoch_; }
+
+    // Builds the F16 slot-validity mask for a single query row: slots of a
+    // resident chunk get 0, slots of free / paged-out blocks get -inf.
+    // `dst` must hold pool_tokens entries. Used as the FA mask so that
+    // non-resident slots are excluded exactly rather than through the
+    // zero-row ~exp(-max) approximation.
+    void fill_slot_mask(uint16_t * dst) const {
+        constexpr uint16_t F16_ZERO = 0x0000;
+        constexpr uint16_t F16_NEG_INF = 0xFC00;
+        // Start fully masked, then open up the blocks that hold a chunk.
+        std::fill_n(dst, cfg_.pool_tokens, F16_NEG_INF);
+        for (const ChunkState & st : chunks_) {
+            if (st.block < 0) continue;
+            uint16_t * row = dst + (size_t)st.block * cfg_.chunk_tokens;
+            std::fill_n(row, cfg_.chunk_tokens, F16_ZERO);
+        }
+    }
+
+    // Lookahead reselect (FlashMemory τ-step). Ranks ALL known chunks
+    // (resident or host-backed) by score_hook and makes the top n_blocks_
+    // of them the resident set. Sinks and the trailing window are pinned
+    // with a maximal score so they are always kept. Chunks falling out of
+    // the set are paged out first (which frees the blocks), then the wanted
+    // host-backed chunks are recalled. Returns the number of page events;
+    // 0 when no score_hook is installed. Call between decode steps.
+    int reselect() {
+        if (!score_hook) return 0;
+
+        struct Ranked { int chunk; float score; };
+        const int n_known = (int)chunks_.size();
+
+        std::vector<Ranked> ranked;
+        for (int c = 0; c < n_known; ++c) {
+            const ChunkState & st = chunks_[c];
+            const bool materialized = st.block >= 0 || st.on_host;
+            if (!materialized) continue;                    // never written: nothing to keep
+            const bool pinned = c < cfg_.sink_chunks ||
+                                c > cur_chunk_ - 1 - cfg_.tail_window_chunks;
+            const float score = pinned ? 3.4e38f : score_hook(c);
+            ranked.push_back({c, score});
+        }
+        std::sort(ranked.begin(), ranked.end(),
+                  [](const Ranked & lhs, const Ranked & rhs) { return lhs.score > rhs.score; });
+
+        // keep[c] == 1 for the best-scoring chunks, at most one per block.
+        std::vector<uint8_t> keep(chunks_.size(), 0);
+        const int n_keep = std::min((int)ranked.size(), n_blocks_);
+        for (int i = 0; i < n_keep; ++i) keep[ranked[i].chunk] = 1;
+
+        int events = 0;
+        // Pass 1: evict what is resident but unwanted, freeing blocks.
+        for (int c = 0; c < n_known; ++c) {
+            if (keep[c] || chunks_[c].block < 0) continue;
+            page_out(c);
+            ++events;
+        }
+        // Pass 2: recall what is wanted but currently only on host.
+        for (int c = 0; c < n_known; ++c) {
+            const bool recallable = keep[c] && chunks_[c].block < 0 && chunks_[c].on_host;
+            if (recallable && page_in(c)) ++events;
+        }
+        return events;
+    }
+
+private:
+    struct ChunkState {            // per-logical-chunk bookkeeping, indexed by chunk number
+        int      block = -1;       // pool block index, -1 = not resident
+        bool     on_host = false;  // backing store holds valid bytes
+        uint64_t last_use = 0;     // clock_ value at the latest slot_for() touch (LRU key)
+        std::vector<uint8_t> host_data;   // host copy of the chunk; sized to chunk_bytes_ on first page_out
+    };
+
+    // Guarantees the free list is non-empty, evicting one chunk if needed.
+    // Candidates are resident chunks outside the sink prefix and the
+    // protected tail window; the victim is the one with the lowest
+    // score_hook value, or — without a hook — the least recently used.
+    // Returns false when nothing is evictable.
+    bool ensure_free_block() {
+        if (!free_blocks_.empty()) return true;
+
+        int      pick       = -1;
+        float    pick_score = 0.f;
+        uint64_t pick_use   = 0;
+        const int n = (int)chunks_.size();
+        for (int c = 0; c < n; ++c) {
+            const bool resident = chunks_[c].block >= 0;
+            const bool is_sink  = c < cfg_.sink_chunks;
+            const bool in_tail  = c > cur_chunk_ - 1 - cfg_.tail_window_chunks;
+            if (!resident || is_sink || in_tail) continue;
+
+            if (score_hook) {
+                const float s = score_hook(c);
+                if (pick < 0 || s < pick_score) { pick = c; pick_score = s; }
+                continue;
+            }
+            // LRU fallback: smallest last_use wins.
+            const uint64_t used = chunks_[c].last_use;
+            if (pick < 0 || used < pick_use) { pick = c; pick_use = used; }
+        }
+        if (pick < 0) return false;
+        return page_out(pick);
+    }
+
+    // Move one chunk between pool block `block` and its host backing (to_host:
+    // tensor -> host_data). Order is fixed (layer-major, K then V, head-minor) so offsets are stable.
+    void copy_chunk(int c, int block, bool to_host) {
+        ChunkState & st = chunks_[c];
+        uint8_t * p = st.host_data.data();   // cursor; NOTE(review): assumes host_data is pre-sized for the whole chunk
+        for (size_t l = 0; l < attn_k_.size(); l++) {   // attn_v_ is indexed with the same l
+            for (int kv = 0; kv < 2; kv++) {            // kv == 0: K tensor, kv == 1: V tensor
+                ggml_tensor * t = kv == 0 ? attn_k_[l] : attn_v_[l];
+                const size_t seg = kv == 0 ? k_seg_bytes_ : v_seg_bytes_;
+                for (int h = 0; h < n_head_kv_; h++) {
+                    const size_t off = (size_t)block * cfg_.chunk_tokens * t->nb[1] + (size_t)h * t->nb[2];
+                    if (to_host) ggml_backend_tensor_get(t, p, off, seg);
+                    else         ggml_backend_tensor_set(t, p, off, seg);
+                    p += seg;   // segments are packed back to back in host_data
+                }
+            }
+        }
+    }
+
+    void zero_block(int block) {   // overwrite every K/V segment of pool block `block` from zero_buf_
+        for (size_t l = 0; l < attn_k_.size(); l++) {   // attn_v_ is indexed with the same l
+            for (int kv = 0; kv < 2; kv++) {            // kv == 0: K tensor, kv == 1: V tensor
+                ggml_tensor * t = kv == 0 ? attn_k_[l] : attn_v_[l];
+                const size_t seg = kv == 0 ? k_seg_bytes_ : v_seg_bytes_;
+                for (int h = 0; h < n_head_kv_; h++) {
+                    const size_t off = (size_t)block * cfg_.chunk_tokens * t->nb[1] + (size_t)h * t->nb[2];
+                    ggml_backend_tensor_set(t, zero_buf_.data(), off, seg);   // NOTE(review): assumes zero_buf_ holds >= seg bytes
+                }
+            }
+        }
+    }
+
+    KvFlashConfig cfg_;
+    std::vector<ggml_tensor *> attn_k_, attn_v_;
+    std::vector<ChunkState> chunks_;
+    std::vector<int> free_blocks_;
+    std::vector<uint8_t> zero_buf_;
+    KvFlashStats stats_;
+    size_t k_seg_bytes_ = 0, v_seg_bytes_ = 0, chunk_bytes_ = 0;
+    int n_blocks_ = 0, n_head_kv_ = 0, cur_chunk_ = 0;
+    uint64_t clock_ = 0;
+    uint64_t epoch_ = 0;
+};
+
+// ── Shared backend helpers ─────────────────────────────────────────────
+//
+// Every backend integration needs the same three steps: read the pool size
+// from the env, allocate slots ahead of each forward (alloc_span above),
+// and build slot-space inputs for the graph. The first and last live here
+// so the per-arch code reduces to wiring.
+
+// VRAM budget for "auto" pool sizing. Backends fill this AFTER the target
+// weights are on the GPU and BEFORE the cache is allocated, so free_bytes
+// reflects what the pool can actually use.
+struct KvFlashAutoBudget {
+    int64_t free_bytes      = 0;   // device free memory right now (<= 0: no budget supplied)
+    int64_t reserve_bytes   = 0;   // compute buffers + (if expected) drafter
+    int64_t bytes_per_token = 0;   // pooled attention KV density for this model (<= 0: no budget supplied)
+    // Decode cost grows with the FA span (= the pool), so cap the auto pool
+    // where speed stays near the small-pool point. Measured on the 27B/3090:
+    // 1K pool 39.6 tok/s, 4K 38.7; 16K extrapolates to ~31-33, still 1.7-2.4x
+    // the full cache at 128-256K. Override: DFLASH_KVFLASH_MAX_POOL.
+    int     speed_cap_tokens = 16384;   // upper bound on the "auto" pool, in tokens
+};
+
+// Pool size from DFLASH_KVFLASH for a backend with `cfg` protections:
+// 0 = off; otherwise rounded to a 256 multiple, floored at
+// min_pool_tokens(cfg) (eviction must keep a victim) and clamped to
+// `max_ctx` (a pool larger than the logical context is meaningless), with
+// warnings on both adjustments.
+//
+// "auto" sizes the pool from the GPU: half of (free VRAM - reserve) converted
+// to tokens at the model's KV density, capped at the speed point and max_ctx.
+// Big pools avoid relevance-crowding (more resident chunks = fewer forced
+// evictions of useful context); the speed cap keeps decode near the flat
+// optimum. An exhausted budget counts as 1 token so the floor below selects
+// the minimum pool: "auto" never silently turns into "off" when VRAM is tight.
+// Falls back to max_ctx/4 (scorer expected) or /2 (LRU) with no budget.
+inline int kvflash_pool_from_env(int max_ctx, const KvFlashConfig & cfg = {},
+                                 bool scorer_expected = false,
+                                 const KvFlashAutoBudget & budget = {}) {
+    const char * env = std::getenv("DFLASH_KVFLASH");
+    if (!env) return 0;
+    int tokens;
+    if (std::strcmp(env, "auto") == 0) {
+        int speed_cap = budget.speed_cap_tokens;
+        if (const char * mp = std::getenv("DFLASH_KVFLASH_MAX_POOL")) {
+            speed_cap = std::max(256, std::atoi(mp));
+        }
+        if (budget.bytes_per_token > 0 && budget.free_bytes > 0) {
+            const int64_t usable =
+                std::max<int64_t>(0, budget.free_bytes - budget.reserve_bytes) / 2;
+            const int64_t vram_tokens = usable / budget.bytes_per_token;   // 0 when the budget is exhausted
+            tokens = (int)std::min<int64_t>(std::max<int64_t>(1, vram_tokens),
+                                            std::min(max_ctx, speed_cap));
+            std::fprintf(stderr,
+                "[kvflash] auto pool: %d tokens (free %.1f GiB - reserve %.1f GiB, "
+                "%.1f KiB/token, caps: speed %d / max_ctx %d)\n",
+                tokens, budget.free_bytes / 1073741824.0,
+                budget.reserve_bytes / 1073741824.0,
+                budget.bytes_per_token / 1024.0, speed_cap, max_ctx);
+        } else {
+            tokens = max_ctx / (scorer_expected ? 4 : 2);
+            std::fprintf(stderr, "[kvflash] auto pool: %d tokens (%d%% of max_ctx %d, "
+                                 "no VRAM budget supplied)\n",
+                         tokens, scorer_expected ? 25 : 50, max_ctx);
+        }
+    } else {
+        tokens = std::atoi(env);
+    }
+    if (tokens <= 0) return 0;   // explicit 0, unparsable value, or max_ctx <= 0
+    tokens = ((tokens + 255) / 256) * 256;
+    const int floor_tokens =
+        ((KvFlashPager::min_pool_tokens(cfg) + 255) / 256) * 256;
+    if (tokens < floor_tokens) {
+        std::fprintf(stderr, "[kvflash] requested pool %d < minimum %d "
+                             "(%d sink + %d tail chunks must leave an "
+                             "evictable block); raising\n",
+                     tokens, floor_tokens, cfg.sink_chunks, cfg.tail_window_chunks);
+        tokens = floor_tokens;
+    }
+    if (tokens > max_ctx) {
+        std::fprintf(stderr, "[kvflash] requested pool %d > max_ctx %d; clamping "
+                             "(raise --max-ctx for a larger pool)\n",
+                     tokens, max_ctx);
+        tokens = (max_ctx / 256) * 256;
+    }
+    return tokens;
+}
+
+// Residency policy from DFLASH_KVFLASH_POLICY (--kvflash-policy): "lru"
+// forces recency-only paging (no drafter probe, no scorer); anything else
+// (default "drafter") means scored residency when a drafter is available.
+inline bool kvflash_policy_is_lru() {
+    const char * policy = std::getenv("DFLASH_KVFLASH_POLICY");   // unset => not LRU
+    return policy != nullptr && std::strcmp(policy, "lru") == 0;
+}
+
+// Locate the Qwen3-0.6B residency drafter: the explicit override
+// (DFLASH_KVFLASH_DRAFTER, set from --prefill-drafter), then the
+// well-known locations next to the target model, then the appliance path.
+// Returns "" when nothing is readable (callers fall back to LRU, loudly).
+inline std::string kvflash_find_drafter(const char * target_path) {
+    if (kvflash_policy_is_lru()) return "";
+    if (const char * forced = std::getenv("DFLASH_KVFLASH_DRAFTER")) return forced;  // override, returned unprobed
+    if (target_path == nullptr) return "";
+    const std::string target(target_path);
+    const size_t cut = target.rfind('/');
+    const std::string base = (cut == std::string::npos) ? "." : target.substr(0, cut);
+    const std::string probes[] = {
+        base + "/Qwen3-0.6B-BF16.gguf",
+        base + "/drafter/Qwen3-0.6B-BF16.gguf",
+        base + "/draft/Qwen3-0.6B-BF16.gguf",
+        "/opt/lucebox/models/drafter/Qwen3-0.6B-BF16.gguf",
+    };
+    for (const std::string & path : probes) {
+        std::FILE * fp = std::fopen(path.c_str(), "rb");   // readability probe only
+        if (fp == nullptr) continue;
+        std::fclose(fp);
+        std::fprintf(stderr, "[kvflash] found residency drafter: %s\n", path.c_str());
+        return path;
+    }
+    return "";
+}
+
+// Slot-space step inputs for masked consumers: the K/V append row for each
+// of this step's tokens, plus F32 causal (`mfull`) and sliding-window
+// (`mswa`, optional) masks of width `mk_w` whose conditions are evaluated
+// on the POSITION each pool slot holds (free slots stay -inf). The caller
+// must have alloc_span()'d [kv_start, kv_start + n_tok) first. The pager
+// zeroes freed slots, but the mask is what keeps relocation exact.
+inline bool kvflash_fill_rows_and_masks(
+    const KvFlashPager & pager,
+    int kv_start, int n_tok, int mk_w, int swa_window,
+    std::vector<int32_t> & rows,
+    std::vector<float> * mfull, std::vector<float> * mswa) {
+    rows.resize((size_t)n_tok);
+    for (int t = 0; t < n_tok; ++t) {
+        const int slot = pager.slot_of(kv_start + t);
+        if (slot < 0) {
+            std::fprintf(stderr, "[kvflash] no pool slot at pos %d "
+                                 "(alloc_span not called?)\n", kv_start + t);
+            return false;
+        }
+        rows[(size_t)t] = slot;
+    }
+    if (mfull == nullptr) return true;          // rows only: no masks requested
+    std::vector<int32_t> slot_pos((size_t)pager.pool_tokens(), -1);
+    pager.fill_slot_pos(slot_pos.data());
+    mfull->assign((size_t)mk_w * n_tok, -INFINITY);
+    if (mswa != nullptr) mswa->assign((size_t)mk_w * n_tok, -INFINITY);
+    const int n_cols = std::min(mk_w, (int)slot_pos.size());
+    for (int q = 0; q < n_tok; ++q) {
+        const int q_pos  = kv_start + q;
+        const int swa_lo = std::max(0, q_pos - swa_window + 1);
+        for (int col = 0; col < n_cols; ++col) {
+            const int held = slot_pos[(size_t)col];
+            if (held < 0 || held > q_pos) continue;     // free slot, or a position after q
+            (*mfull)[(size_t)q * mk_w + col] = 0.0f;
+            if (mswa != nullptr && held >= swa_lo) (*mswa)[(size_t)q * mk_w + col] = 0.0f;
+        }
+    }
+    return true;
+}
+
+} // namespace dflash::common
diff --git a/server/src/common/kvflash_scorer.h b/server/src/common/kvflash_scorer.h
new file mode 100644
index 000000000..407d94c6d
--- /dev/null
+++ b/server/src/common/kvflash_scorer.h
@@ -0,0 +1,33 @@
+// KvFlashScorer — pluggable chunk-relevance policy for KvFlashPager.
+//
+// The pager is policy-agnostic: with no scorer attached it evicts LRU and
+// never recalls. A scorer upgrades eviction and reselect() to relevance-
+// driven residency (FlashMemory's Memory Indexer role). This interface is
+// deliberately dependency-free so the pager runs without pflash, without a
+// drafter, and without any model beyond the target.
+//
+// Implementations:
+//   - (none)                 pure LRU + recency, zero dependencies
+//   - KvFlashDrafterScorer   qwen3/qwen3_kvflash_scorer.h — pflash drafter tail
+//                            attention (shared with pflash compression)
+
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+namespace dflash::common {
+
+struct KvFlashScorer {   // abstract interface: implementations override score_chunks()
+    virtual ~KvFlashScorer() = default;   // virtual so a scorer can be destroyed through this base type
+
+    // Fill out[c] with a relevance score (higher = keep resident) for each
+    // chunk_tokens-sized chunk of `ids` (the full token history: prompt +
+    // generated). Returns false on failure; the caller skips reselect for
+    // that round and the pager keeps its LRU behavior.
+    virtual bool score_chunks(const std::vector<int32_t> & ids,
+                              int chunk_tokens,
+                              std::vector<float> & out) = 0;   // NOTE(review): who sizes `out` is not stated here — confirm
+};
+
+} // namespace dflash::common
diff --git a/server/src/common/moe_hybrid_ffn_eval.cpp b/server/src/common/moe_hybrid_ffn_eval.cpp
index 12a854d37..6d106cfa5 100644
--- a/server/src/common/moe_hybrid_ffn_eval.cpp
+++ b/server/src/common/moe_hybrid_ffn_eval.cpp
@@ -39,8 +39,17 @@ static ggml_tensor * build_shared_expert_subgraph(
     ggml_tensor * shared = apply_scale2(ctx,
         ggml_mul_mat(ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s);
     if (desc.ffn_gate_inp_shexp) {
+        // The shared-expert gate is a single-row weight (M=1): out[0,n] = sum_k W[k]*inp[k,n].
+        // Computing it as ggml_mul_mat routes to cublas, and on the shipped CUDA 12.0
+        // cublasLt the M=1 heuristic selects a gemv/split-K reduce algorithm whose kernel
+        // is ABSENT from the library once N>1 (spec-decode verify/replay batches) — for
+        // BOTH F32 (cublasSgemm SSS) and F16 (cublasGemmEx HHH splitKreduce). That poisons
+        // the stream and surfaces as an illegal access in the next op. Compute the gate as
+        // broadcast elementwise-mul + sum_rows instead: identical math, ggml kernels only,
+        // no cublas. This is what unblocks single-pass full-batch verify.
+        ggml_tensor * gate_prod = ggml_mul(ctx, inp, desc.ffn_gate_inp_shexp);
         ggml_tensor * shared_gate = apply_scale2(ctx,
-            ggml_mul_mat(ctx, desc.ffn_gate_inp_shexp, inp), desc.ffn_gate_inp_shexp_s);
+            ggml_sum_rows(ctx, gate_prod), desc.ffn_gate_inp_shexp_s);
         shared_gate = ggml_sigmoid(ctx, shared_gate);
         shared = ggml_mul(ctx, shared, shared_gate);
     }
@@ -658,6 +667,57 @@ bool build_cached_hot_batched_graph(
     return true;
 }
 
+// Cached batched COLD routed graph (CPU backend, no shared expert). Mirror of
+// build_cached_hot_batched_graph for the cold expert stack; used by the mixed
+// batched path so spec-decode verify/replay reuse the graph instead of
+// rebuilding it every call.
+static bool build_cached_cold_batched_graph(
+    CachedHotBatchedGraph & out,
+    ggml_backend_t cpu_backend,
+    MoeHybridLayerStorage & storage,
+    const MoeLayerDesc & desc,
+    const MoeHybridConfig & cfg,
+    int n_tokens) {
+
+    out.free();   // drop whatever graph this cache slot held before rebuilding
+    out.n_tokens = n_tokens;
+    const int n_embd = cfg.n_embd;
+    const int n_used = cfg.n_expert_used;
+    const int n_ff_exp = cfg.n_ff_exp;
+
+    ggml_init_params ip{};
+    ip.mem_size = 128 * 1024 * 1024;
+    ip.mem_buffer = nullptr;
+    ip.no_alloc = true;   // tensor data is not allocated in ctx; the gallocr below allocates the graph
+    out.ctx = ggml_init(ip);
+    if (!out.ctx) return false;
+
+    // Graph inputs: activations [n_embd, n_tokens], expert ids and weights [n_used, n_tokens].
+    out.inp = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_embd, n_tokens);
+    ggml_set_input(out.inp);
+    out.sel = ggml_new_tensor_2d(out.ctx, GGML_TYPE_I32, n_used, n_tokens);
+    ggml_set_input(out.sel);
+    out.wts = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_used, n_tokens);
+    ggml_set_input(out.wts);
+
+    ggml_tensor * routed = nullptr;   // stays null when build_batched_routed_graph produces no output
+    build_batched_routed_graph(out.ctx,
+        storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold,
+        desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
+        out.inp, out.sel, out.wts, n_embd, n_ff_exp, n_used, n_tokens, &routed);
+    if (!routed) { out.free(); return false; }
+    out.output = routed;   // routed cold experts only: no shared-expert term is added here
+
+    out.gf = ggml_new_graph_custom(out.ctx, 4096, false);
+    ggml_set_output(out.output);
+    ggml_build_forward_expand(out.gf, out.output);
+    out.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend));   // NOTE(review): result is not null-checked
+    if (!ggml_gallocr_alloc_graph(out.alloc, out.gf)) {
+        out.free();   // leave the slot empty on allocation failure
+        return false;
+    }
+    return true;
+}
+
 bool eval_moe_hybrid_ffn_single(
     ggml_backend_t                  gpu_backend,
     const MoeHybridConfig &         cfg,
@@ -935,6 +995,25 @@ static bool mmq_full_batch_ok(const MoeHybridConfig & cfg, int n_tokens) {
     return cfg.mmq_safe_full_batch && n_tokens >= min_tokens;
 }
 
+// Sub-batch size for the reduced-hot-stack routed mul_mat_id. The MMQ path
+// (n_tokens > 8) illegal-accesses on a REDUCED expert stack for sparse/
+// imbalanced sub-64 batches (a genuine ggml-cuda MMQ mul_mat_id bug, observed
+// on sm_86 + gfx1151); the MMVQ-mmid path is stable. Q4_K MMVQ-mmid handles up
+// to 8 tokens on CUDA sm_80+ (MMVQ_MAX_BATCH_SIZE) and 4 on AMD. Earlier this
+// had to be 1 because the F32 shared-expert gate (cublasSgemm, M=1) also faulted
+// at N>1 on the shipped CUDA 12.0 cublasLt; that is now computed cublas-free
+// (mul + sum_rows), so sub-batch=8 is safe and validated on sm_86. Default to 8
+// on sm_80+ (CUDA), 1 elsewhere (proven single-token path on unvalidated archs);
+// env override tunes per arch without a rebuild.
+static int mmq_safe_sub_batch() {
+    static const int cached = []() -> int {   // resolved once, on first call
+        const char * override_env = std::getenv("DFLASH_MMQ_SUB_BATCH");
+        if (override_env != nullptr) return std::max(1, std::atoi(override_env));
+        return query_gpu_compute_sm() >= 80 ? 8 : 1;
+    }();
+    return cached;
+}
+
 static bool eval_moe_hybrid_ffn_batched_core(
     ggml_backend_t                  gpu_backend,
     ggml_backend_t                  cpu_backend,
@@ -956,6 +1035,74 @@ static bool eval_moe_hybrid_ffn_batched_core(
     out.assign((size_t)n_embd * (size_t)n_tokens, 0.0f);
     if (n_tokens <= 0) return true;
 
+    // ── Fast path: cached hot+cold batched graphs (spec-decode verify/replay) ──
+    // Mixed layers used to rebuild+free their hot and cold ggml graphs on every
+    // call; that graph churn (not the matmul) dominated the verify FFN time.
+    // Reuse per-n_tokens cached graphs so steady-state rebuilds nothing. Large
+    // prefill batches (n_tokens >= kMaxBatchedCache) fall through to the inline
+    // path below.
+    if (n_tokens > 0 && n_tokens < MoeHybridLayerStorage::kMaxBatchedCache) {
+        const int total_slots = n_used * n_tokens;
+        const int n_hot_stack = storage.gate_up_hot ? (int)storage.gate_up_hot->ne[2]
+                              : storage.gate_hot    ? (int)storage.gate_hot->ne[2] : 1;
+        const int n_cold_stack = std::max(1, (int)(storage.down_cold ? storage.down_cold->ne[2] : 1));
+        std::vector<int32_t> hot_sel(total_slots);
+        std::vector<float>   hot_wts(total_slots, 0.0f);
+        std::vector<int32_t> cold_sel(total_slots);
+        std::vector<float>   cold_wts(total_slots, 0.0f);
+        for (int i = 0; i < total_slots; ++i) { hot_sel[i] = i % n_hot_stack; cold_sel[i] = i % n_cold_stack; }
+        bool fp_has_cold = false;
+        for (int i = 0; i < total_slots; ++i) {
+            const int32_t gid = selected_ids[i];
+            if (gid < 0 || gid >= (int32_t)storage.hot_local_by_global.size()) continue;
+            const int32_t hl = storage.hot_local_by_global[(size_t)gid];
+            if (hl >= 0) { hot_sel[i] = hl; hot_wts[i] = selected_weights[i]; }
+            else {
+                const int32_t cl = storage.cold_local_by_global[(size_t)gid];
+                if (cl >= 0) { cold_sel[i] = cl; cold_wts[i] = selected_weights[i]; fp_has_cold = true; }
+            }
+        }
+
+        CachedHotBatchedGraph & hg = storage.hot_batched_mixed[n_tokens];
+        const bool hg_ok = (hg.valid() && hg.n_tokens == n_tokens)
+            || build_cached_hot_batched_graph(hg, gpu_backend, storage, desc, cfg, n_tokens);
+        CachedHotBatchedGraph * cg = nullptr;
+        bool cg_ok = true;
+        if (fp_has_cold) {
+            cg = &storage.cold_batched_mixed[n_tokens];
+            cg_ok = (cg->valid() && cg->n_tokens == n_tokens)
+                || build_cached_cold_batched_graph(*cg, cpu_backend, storage, desc, cfg, n_tokens);
+        }
+
+        if (hg_ok && cg_ok) {
+            // Hot (GPU, async): shared expert + routed hot (zero-weight dummy slots
+            // keep an all-cold batch's shared-expert contribution).
+            ggml_backend_tensor_set(hg.inp, cur_host, 0, sizeof(float) * (size_t)n_embd * (size_t)n_tokens);
+            ggml_backend_tensor_set(hg.sel, hot_sel.data(), 0, sizeof(int32_t) * (size_t)total_slots);
+            ggml_backend_tensor_set(hg.wts, hot_wts.data(), 0, sizeof(float) * (size_t)total_slots);
+            ggml_backend_graph_compute_async(gpu_backend, hg.gf);
+
+            std::vector<float> cold_partial;
+            if (cg) {
+                cold_partial.assign((size_t)n_embd * (size_t)n_tokens, 0.0f);
+                ggml_backend_tensor_set(cg->inp, cur_host, 0, sizeof(float) * (size_t)n_embd * (size_t)n_tokens);
+                ggml_backend_tensor_set(cg->sel, cold_sel.data(), 0, sizeof(int32_t) * (size_t)total_slots);
+                ggml_backend_tensor_set(cg->wts, cold_wts.data(), 0, sizeof(float) * (size_t)total_slots);
+                ggml_backend_graph_compute(cpu_backend, cg->gf);  // sync; overlaps the async hot GPU graph
+                ggml_backend_tensor_get(cg->output, cold_partial.data(), 0, sizeof(float) * (size_t)n_embd * (size_t)n_tokens);
+            }
+
+            ggml_backend_synchronize(gpu_backend);
+            ggml_backend_tensor_get(hg.output, out.data(), 0, sizeof(float) * (size_t)n_embd * (size_t)n_tokens);
+            if (cg) {
+                const size_t ntot = (size_t)n_embd * (size_t)n_tokens;
+                for (size_t i = 0; i < ntot; ++i) out[i] += cold_partial[i];
+            }
+            return true;
+        }
+        // build failed -> fall through to the inline rebuild path
+    }
+
     // ── Step 1: Partition routing into hot and cold ──
     // Dummy slots use weight 0.0 and are distributed evenly across all experts
     // to avoid pathological routing imbalance that triggers OOB in MMQ stream-k.
@@ -1175,15 +1322,15 @@ bool eval_moe_hot_only_batched(
     out.assign((size_t)n_embd * (size_t)n_tokens, 0.0f);
     if (n_tokens <= 0) return true;
 
-    // Workaround for ggml-cuda MMQ mul_mat_id bug on sm_75/gfx1151: when the
-    // hot stack is smaller than n_expert, slice into <=4-token sub-batches to
-    // route through the stable MMVQ path. Skipped on sm_80+ where MMQ is safe.
+    // Workaround for the ggml-cuda MMQ mul_mat_id stream-k fault on a REDUCED
+    // hot stack (sm_75/gfx1151 AND sm_86): slice sub-64 batches to a size the
+    // MMVQ-mmid path handles. See mmq_safe_sub_batch().
     const int n_hot_stack = storage.gate_up_hot ? (int)storage.gate_up_hot->ne[2]
                           : storage.gate_hot    ? (int)storage.gate_hot->ne[2]
                           : 0;
-    static const int MMQ_SAFE_SUB_BATCH = 4;
+    const int MMQ_SAFE_SUB_BATCH = mmq_safe_sub_batch();
     if (!mmq_full_batch_ok(cfg, n_tokens)
-        && n_hot_stack > 0 && n_hot_stack < cfg.n_expert && n_tokens > MMQ_SAFE_SUB_BATCH) {
+        && n_hot_stack > 0 && n_tokens > MMQ_SAFE_SUB_BATCH) {
         std::vector<float> sub_out;
         for (int t0 = 0; t0 < n_tokens; t0 += MMQ_SAFE_SUB_BATCH) {
             const int tc = std::min(MMQ_SAFE_SUB_BATCH, n_tokens - t0);
@@ -1234,7 +1381,7 @@ bool eval_moe_hot_only_batched(
     // ── Slow path: build graph (first call or size mismatch) ──
     // Try to build and cache for this n_tokens size.
     // Cache when: sub-batch size (legacy), full stack (all hot), or full-batch safe (sm_80+).
-    if (mmq_full_batch_ok(cfg, n_tokens) || n_tokens == MMQ_SAFE_SUB_BATCH
+    if (mmq_full_batch_ok(cfg, n_tokens) || n_tokens <= MMQ_SAFE_SUB_BATCH
         || (n_hot_stack == 0 || n_hot_stack >= cfg.n_expert)) {
         if (build_cached_hot_batched_graph(cached, gpu_backend, storage, desc, cfg, n_tokens)) {
             // Successfully cached — use it immediately
@@ -1350,9 +1497,9 @@ bool eval_moe_hybrid_ffn_batched(
     const int n_hot_stack = storage.gate_up_hot ? (int)storage.gate_up_hot->ne[2]
                           : storage.gate_hot    ? (int)storage.gate_hot->ne[2]
                           : 0;
-    static const int MMQ_SAFE_SUB_BATCH = 4;
+    const int MMQ_SAFE_SUB_BATCH = mmq_safe_sub_batch();
     if (!mmq_full_batch_ok(cfg, n_tokens)
-        && n_hot_stack > 0 && n_hot_stack < cfg.n_expert && n_tokens > MMQ_SAFE_SUB_BATCH) {
+        && n_hot_stack > 0 && n_tokens > MMQ_SAFE_SUB_BATCH) {
         const int n_embd = cfg.n_embd;
         const int n_used = cfg.n_expert_used;
         out.assign((size_t)n_embd * (size_t)n_tokens, 0.0f);
diff --git a/server/src/common/moe_hybrid_storage.cpp b/server/src/common/moe_hybrid_storage.cpp
index a8613b02a..4bf027400 100644
--- a/server/src/common/moe_hybrid_storage.cpp
+++ b/server/src/common/moe_hybrid_storage.cpp
@@ -130,6 +130,9 @@ MoeHybridStorage::~MoeHybridStorage() {
     for (auto & layer : layers) {
         layer.hot_graph.free();
         layer.cold_graph.free();
+        layer.hot_batched_graph.free();
+        for (auto & g : layer.hot_batched_mixed) g.free();
+        for (auto & g : layer.cold_batched_mixed) g.free();
         if (layer.hot_buf) {
             ggml_backend_buffer_free(layer.hot_buf);
             layer.hot_buf = nullptr;
diff --git a/server/src/common/moe_hybrid_storage.h b/server/src/common/moe_hybrid_storage.h
index 3485c69ff..d4a1d47d4 100644
--- a/server/src/common/moe_hybrid_storage.h
+++ b/server/src/common/moe_hybrid_storage.h
@@ -132,6 +132,17 @@ struct MoeHybridLayerStorage {
 
     // Cached batched hot-only graph for prefill sub-batches (n_tokens=4).
     CachedHotBatchedGraph hot_batched_graph;
+
+    // Per-n_tokens cached graphs for the MIXED (hot+cold) batched path. The
+    // all-hot path already caches via hot_batched_graph, but the mixed path used
+    // to rebuild+free its hot AND cold ggml graphs on every call — that churn
+    // dominated the spec-decode verify cost (many cold-bearing layers x
+    // sub-batches x steps). Cache per n_tokens (index 1..kMaxBatchedCache-1) so
+    // steady-state verify/replay rebuilds zero graphs. Large prefill batches
+    // (n_tokens >= kMaxBatchedCache) keep using the inline build.
+    static constexpr int kMaxBatchedCache = 9;  // covers spec sub-batch n_tokens 1..8
+    CachedHotBatchedGraph hot_batched_mixed[kMaxBatchedCache];
+    CachedHotBatchedGraph cold_batched_mixed[kMaxBatchedCache];
 };
 
 struct MoeHybridStorage {
diff --git a/server/src/draft/draft_gguf_loader.cpp b/server/src/draft/draft_gguf_loader.cpp
index 39d620ce6..f0dbd8eb9 100644
--- a/server/src/draft/draft_gguf_loader.cpp
+++ b/server/src/draft/draft_gguf_loader.cpp
@@ -368,19 +368,34 @@ bool load_draft_gguf(const std::string & path,
             set_last_error(err);
             return false;
         }
-        // fc: [n_target_layers*n_embd, n_embd] — ne[0] = n_target_layers*n_embd.
+        // fc: [n_capture_layers*n_embd, n_embd] — ne[0] counts the CAPTURE
+        // layers the fc consumes. Some draft GGUFs (gemma4) store the
+        // TARGET's layer count in dflash.n_target_layers instead of the
+        // capture count; per this file's own philosophy the weights are
+        // ground truth, so when fc disagrees but is an exact multiple of
+        // n_embd, derive the count from the tensor and warn. Fail only on
+        // a genuinely inconsistent shape.
         if (out.n_target_layers > 0) {
             const int64_t derived_fc_in  = out.fc->ne[0];
             const int64_t expected_fc_in = (int64_t)out.n_target_layers * out.n_embd;
             if (derived_fc_in != expected_fc_in) {
-                char buf[256];
-                std::snprintf(buf, sizeof(buf),
-                    "GGUF shape mismatch: dflash.fc.weight->ne[0]=%lld "
-                    "!= n_target_layers*n_embd=%d*%d=%lld",
-                    (long long)derived_fc_in,
-                    out.n_target_layers, out.n_embd, (long long)expected_fc_in);
-                set_last_error(buf);
-                return false;
+                if (out.n_embd > 0 && derived_fc_in % out.n_embd == 0) {
+                    const int derived_layers = (int)(derived_fc_in / out.n_embd);
+                    std::fprintf(stderr,
+                        "[draft] dflash.n_target_layers metadata (%d) != "
+                        "fc-derived capture count (%d); using the weights\n",
+                        out.n_target_layers, derived_layers);
+                    out.n_target_layers = derived_layers;
+                } else {
+                    char buf[256];
+                    std::snprintf(buf, sizeof(buf),
+                        "GGUF shape mismatch: dflash.fc.weight->ne[0]=%lld "
+                        "!= n_target_layers*n_embd=%d*%d=%lld",
+                        (long long)derived_fc_in,
+                        out.n_target_layers, out.n_embd, (long long)expected_fc_in);
+                    set_last_error(buf);
+                    return false;
+                }
             }
         }
     }
diff --git a/server/src/gemma4/gemma4_backend.cpp b/server/src/gemma4/gemma4_backend.cpp
index cfed37494..9e7f131a4 100644
--- a/server/src/gemma4/gemma4_backend.cpp
+++ b/server/src/gemma4/gemma4_backend.cpp
@@ -6,6 +6,7 @@
 
 #include "gemma4_backend.h"
 #include "dflash27b.h"
+#include "../qwen3/qwen3_kvflash_scorer.h"
 #include "common/sampler.h"
 #include "common/io_utils.h"
 #include "common/dflash_feature_ring.h"
@@ -49,11 +50,19 @@ bool Gemma4Backend::init() {
         return false;
     }
 
-    if (!create_gemma4_cache(backend_, w_, cfg_.device.max_ctx, cache_)) {
+    kvflash_read_config();
+    if (!create_gemma4_cache(backend_, w_, cfg_.device.max_ctx, cache_,
+                             kvflash_tokens_)) {
         std::fprintf(stderr, "[gemma4] cache alloc failed\n");
         return false;
     }
     cache_.fa_window = cfg_.fa_window;
+    if (kvflash_active() && cache_.fa_window > 0) {
+        std::fprintf(stderr, "[kvflash] --fa-window and --kvflash are mutually "
+                             "exclusive full-attention policies\n");
+        return false;
+    }
+    if (!kvflash_attach()) return false;
 
     // Load draft model for speculative decode.
     if (cfg_.draft_path && !load_decode_draft()) {
@@ -117,18 +126,22 @@ bool Gemma4Backend::unpark(const std::string & what) {
         }
 
         // Recreate KV cache
-        if (!create_gemma4_cache(backend_, w_, cfg_.device.max_ctx, cache_)) {
+        if (!create_gemma4_cache(backend_, w_, cfg_.device.max_ctx, cache_,
+                                 kvflash_tokens_)) {
             std::fprintf(stderr, "[gemma4] unpark: failed to recreate cache\n");
             free_gemma4_weights(w_);
             return false;
         }
         cache_.fa_window = cfg_.fa_window;
+        if (!kvflash_attach()) return false;
 
+        kvflash_drafter_failed_ = false;   // fresh VRAM: allow a retry
         parked_ = false;
         std::printf("[gemma4] unparked (VRAM restored)\n"); std::fflush(stdout);
         if (cfg_.draft_path && !draft_parked_ && draft_backend_) {
             delete dflash_target_;
             dflash_target_ = new Gemma4DFlashTarget(w_, cache_, backend_);
+        if (kvflash_active()) dflash_target_->set_kvflash_pager(&kvflash_pager_);
         }
     }
 
@@ -138,6 +151,118 @@ bool Gemma4Backend::unpark(const std::string & what) {
     return true;
 }
 
+// ── kvflash helpers ────────────────────────────────────────────────────
+
+void Gemma4Backend::kvflash_read_config() {  // fills kvflash_drafter_path_, kvflash_tokens_ and kvflash_tau_; init() calls it before the cache is created
+    if (std::getenv("DFLASH_KVFLASH")) {  // a drafter is only searched for when this env var is present
+        kvflash_drafter_path_ = kvflash_find_drafter(cfg_.model_path);
+    }
+    // "auto" sizes from the GPU (weights resident, cache not yet allocated):
+    // gemma4 pools the FULL-attention layers only (F16 cache); SWA rings are
+    // fixed-size and excluded from the density.
+    KvFlashAutoBudget kvf_budget;
+    {
+        size_t gpu_free = 0, gpu_total = 0;  // both stay 0 when the backend exposes no device
+        if (ggml_backend_dev_t dev = ggml_backend_get_device(backend_)) {
+            ggml_backend_dev_memory(dev, &gpu_free, &gpu_total);
+        }
+        int64_t bpt = 0;  // pooled cache bytes per token, summed over the full-attention layers
+        for (int il = 0; il < w_.n_layer; ++il) {
+            if (!gemma4_has_kv(w_, il) || gemma4_is_swa_layer(w_, il)) continue;  // skip layers without KV and SWA layers
+            bpt += (int64_t)gemma4_n_head_kv(w_, il) * 2 *  // x2: presumably one K row plus one V row per head
+                   (int64_t)ggml_row_size(GGML_TYPE_F16, gemma4_head_dim(w_, il));
+        }
+        kvf_budget.free_bytes      = (int64_t)gpu_free;
+        kvf_budget.bytes_per_token = bpt;
+        kvf_budget.reserve_bytes   = (int64_t)(1.5 * 1073741824.0) +  // 1.5 GiB base reserve (1073741824 = 2^30)
+            (kvflash_drafter_path_.empty() ? 0 : (int64_t)(1.7 * 1073741824.0));  // +1.7 GiB when a drafter path was found (presumably room for its weights)
+    }
+    kvflash_tokens_ = kvflash_pool_from_env(cfg_.device.max_ctx, KvFlashConfig{},
+                                            !kvflash_drafter_path_.empty(),
+                                            kvf_budget);
+    if (kvflash_tokens_ > 0) {  // kvflash is on: also resolve the reselect interval
+        const char * tau = std::getenv("DFLASH_KVFLASH_TAU");
+        kvflash_tau_ = std::max(1, tau ? std::atoi(tau) : 64);  // default 64, never below 1
+    }
+}
+
+// Drafter rescore + repage (FlashMemory tau loop) with the cross-tokenizer
+// scorer: gemma ids are detokenized and re-scored through the Qwen3-0.6B
+// drafter. Lazy: the drafter + tokenizers load on the first reselect that
+// needs them, never on a request's first tokens.
+void Gemma4Backend::kvflash_maybe_reselect(int generated) {  // `generated`: the caller's running count of emitted tokens
+    if (!kvflash_active() || kvflash_tau_ <= 0) return;
+    const int tau = std::max<int>(kvflash_tau_, (int)(kvflash_history_.size() / 45));  // effective interval: configured tau, or history/45 once that is larger
+    if (generated % tau != 0) return;  // only act on multiples of the effective interval
+    if (!kvflash_scorer_) {  // first use: build the scorer, loading the drafter if nobody has yet
+        if (kvflash_drafter_path_.empty() || kvflash_drafter_failed_) return;  // no drafter, or an earlier load failed: stay on LRU
+        if (!drafter_loaded_) {
+            ggml_backend_synchronize(backend_);  // NOTE(review): presumably drains pending target work before the load — confirm
+            std::fprintf(stderr, "[kvflash] loading drafter for residency scoring: %s\n",
+                         kvflash_drafter_path_.c_str());
+            if (!load_drafter(kvflash_drafter_path_, /*gpu_layers=*/999,
+                              cfg_.device.gpu, drafter_ctx_)) {
+                std::fprintf(stderr, "[kvflash] drafter load failed (%s); staying on "
+                                     "LRU residency\n", dflash27b_last_error());
+                kvflash_drafter_failed_ = true;  // remembered so the load is not retried every interval
+                return;
+            }
+            drafter_loaded_ = true;
+        }
+        kvflash_scorer_ = std::make_unique<KvFlashCrossTokScorer>(
+            &drafter_ctx_, cfg_.model_path, kvflash_drafter_path_);
+        std::fprintf(stderr, "[kvflash] cross-tokenizer drafter scorer attached "
+                             "(tau=%d)\n", kvflash_tau_);
+    }
+    if (!kvflash_scorer_->score_chunks(kvflash_history_, kvflash_pager_.chunk_tokens(),
+                                       kvflash_scores_)) {
+        return;  // scorer failure -> keep LRU behavior this round
+    }
+    kvflash_pager_.score_hook = [this](int c) {  // installed only for the reselect() call below
+        return c < (int)kvflash_scores_.size() ? kvflash_scores_[c] : 1e30f;  // chunks past the scored range get 1e30f (presumably "always keep" — confirm)
+    };
+    const int events = kvflash_pager_.reselect();
+    kvflash_pager_.score_hook = nullptr;  // detach the hook again once reselect() has returned
+    if (events > 0) {
+        std::fprintf(stderr, "[kvflash] reselect @gen=%d: %d page events\n",
+                     generated, events);
+    }
+}
+
+bool Gemma4Backend::kvflash_attach() {  // binds kvflash_pager_ to the cache's full-attention K/V tensors; false on pager failure
+    if (!kvflash_active()) return true;  // kvflash off: nothing to attach, and not an error
+    // Pool the FULL-attention layers only; SWA layers ring-buffer natively
+    // and KV-reuse layers share their source layer's tensors.
+    std::vector<ggml_tensor *> full_k, full_v;  // parallel lists: entry i of each belongs to the same layer
+    for (int il = 0; il < w_.n_layer; ++il) {
+        if (cache_.k[(size_t)il] && !gemma4_is_swa_layer(w_, il)) {  // layer has a K tensor and is not SWA
+            full_k.push_back(cache_.k[(size_t)il]);
+            full_v.push_back(cache_.v[(size_t)il]);
+        }
+    }
+    KvFlashConfig pc;  // all fields default except the pool size
+    pc.pool_tokens = kvflash_tokens_;
+    if (!kvflash_pager_.attach(pc, full_k, full_v)) {
+        std::fprintf(stderr, "kvflash: pager attach failed (pool=%d, "
+                             "full-attn layers=%zu)\n",
+                     kvflash_tokens_, full_k.size());
+        return false;
+    }
+    std::printf("[kvflash] resident pool %d tokens over %zu full-attn layers "
+                "(logical max_ctx %d, SWA ring %d), policy=%s\n",
+                kvflash_tokens_, full_k.size(), cfg_.device.max_ctx,
+                cache_.swa_size,
+                !kvflash_drafter_path_.empty()  // policy label follows only from whether a drafter path was found
+                    ? "drafter/cross-tok (attaches on first reselect)"
+                    : "lru (recency-only: no Qwen3-0.6B drafter found)");
+    std::fflush(stdout);
+    return true;
+}
+
+bool Gemma4Backend::kvflash_alloc_span(int kv_start, int n_tok) {  // reserve pool slots for [kv_start, kv_start + n_tok)
+    return kvflash_active() ? kvflash_pager_.alloc_span(kv_start, n_tok) : true;  // kvflash off: trivially succeeds
+}
+
 // ── Prefill ────────────────────────────────────────────────────────────
 
 int Gemma4Backend::do_prefill(const std::vector<int32_t> & tokens,
@@ -147,6 +272,19 @@ int Gemma4Backend::do_prefill(const std::vector<int32_t> & tokens,
     const int hidden = w_.n_embd;
     const int chunk = cfg_.chunk;
 
+    if (kvflash_active()) {
+        // Fresh request: rebuild the pager mapping. Restore paths land the
+        // prefix identity-mapped and pre-allocate [0, kv_offset) themselves.
+        if (kv_offset == 0) kvflash_pager_.reset();
+        if (kv_offset + n > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) {
+            std::fprintf(stderr,
+                "[kvflash] prompt (%d @ offset %d) exceeds pool %d; raise "
+                "--kvflash or enable pflash compression\n",
+                n, kv_offset, kvflash_tokens_);
+            return -1;
+        }
+    }
+
     std::vector<float> embed(chunk * hidden);
     std::vector<float> logits;
 
@@ -168,8 +306,10 @@ int Gemma4Backend::do_prefill(const std::vector<int32_t> & tokens,
         for (int i = 0; i < len * hidden; ++i) embed[i] *= scale;
 
         const int kv_pos = kv_offset + pos;
-        if (!gemma4_step(backend_, w_, cache_, embed.data(),
-                         tokens.data() + pos, len, kv_pos, logits)) {
+        if (!kvflash_alloc_span(kv_pos, len) ||
+            !gemma4_step(backend_, w_, cache_, embed.data(),
+                         tokens.data() + pos, len, kv_pos, logits,
+                         kvflash_active() ? &kvflash_pager_ : nullptr)) {
             std::fprintf(stderr, "[gemma4] prefill step failed at pos=%d\n", kv_pos);
             return -1;
         }
@@ -194,6 +334,15 @@ int Gemma4Backend::do_prefill(const std::vector<int32_t> & tokens,
         }
     }
 
+    if (kvflash_active()) {
+        if (kv_offset == 0) {
+            kvflash_history_.assign(tokens.begin(), tokens.end());
+        } else {
+            kvflash_history_.resize((size_t)kv_offset, 0);  // restored prefix ids unknown
+            kvflash_history_.insert(kvflash_history_.end(), tokens.begin(), tokens.end());
+        }
+    }
+
     return kv_offset + pos;
 }
 
@@ -285,8 +434,10 @@ bool Gemma4Backend::do_decode(int committed, int n_gen,
         float scale = std::sqrt((float)hidden);
         for (int j = 0; j < hidden; ++j) embed_buf[j] *= scale;
 
-        if (!gemma4_step(backend_, w_, cache_, embed_buf.data(),
-                         &tok, 1, committed, logits)) {
+        if (!kvflash_alloc_span(committed, 1) ||
+            !gemma4_step(backend_, w_, cache_, embed_buf.data(),
+                         &tok, 1, committed, logits,
+                         kvflash_active() ? &kvflash_pager_ : nullptr)) {
             return false;
         }
 
@@ -308,6 +459,10 @@ bool Gemma4Backend::do_decode(int committed, int n_gen,
         io.emit(next);
         committed++;
         cache_.cur_pos = committed;
+        if (kvflash_active()) {
+            kvflash_history_.push_back(next);
+            kvflash_maybe_reselect((int)out_tokens.size());
+        }
         if (io.cancelled) break;
 
         // Check EOS
@@ -323,7 +478,8 @@ bool Gemma4Backend::do_spec_decode(int committed, int n_gen,
                                     std::vector<int32_t> & out_tokens,
                                     const DaemonIO & io,
                                     const BudgetHook * budget_hook,
-                                    bool * forced_close_out) {
+                                    bool * forced_close_out,
+                                    float * accept_rate_out) {
     const int hidden = w_.n_embd;
     int32_t last_tok = cache_.last_tok;
 
@@ -553,6 +709,12 @@ bool Gemma4Backend::do_spec_decode(int committed, int n_gen,
                  n_draft_steps, n_accept_sum, total_draft_pos, accept_pct,
                  n_draft_steps > 0 ? (double)n_generated / (double)n_draft_steps : 0.0);
 
+    // Surface acceptance to the HTTP usage block (was silently 0.0, the
+    // same reporting-only gap as the layer-split path fixed in PR #321).
+    // Guarded: with zero drafted positions 0/0.0 would report NaN, not 0.
+    if (accept_rate_out) *accept_rate_out = total_draft_pos > 0
+        ? (float)(n_accept_sum / (double)total_draft_pos) : 0.0f;
+
     io.emit(-1);
     return true;
 }
@@ -607,7 +769,8 @@ GenerateResult Gemma4Backend::generate_impl(const GenerateRequest & req,
             result.spec_decode_ran = true;
             if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io,
                                 &req.budget_hook,
-                                &result.budget_forced_close)) {
+                                &result.budget_forced_close,
+                                &result.accept_rate)) {
                 result.error = "spec_decode";
                 return result;
             }
@@ -624,7 +787,8 @@ GenerateResult Gemma4Backend::generate_impl(const GenerateRequest & req,
             for (int j = 0; j < hidden; ++j) embed_buf[j] *= scale;
 
             if (!gemma4_step(backend_, w_, cache_, embed_buf.data(),
-                             &last_tok, 1, committed - 1, logits)) {
+                             &last_tok, 1, committed - 1, logits,
+                             kvflash_active() ? &kvflash_pager_ : nullptr)) {
                 result.error = "first logits";
                 return result;
             }
@@ -725,6 +889,22 @@ GenerateResult Gemma4Backend::restore_and_generate_impl(int slot,
     cache_.cur_pos = snap_pos;
     cache_.last_tok = snap.last_tok;
 
+    // kvflash: the restored prefix is identity-mapped; rebuild the pager
+    // mapping over [0, snap_pos) before the delta prefill extends it.
+    if (kvflash_active()) {
+        if (snap_pos > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) {
+            std::fprintf(stderr, "[kvflash] restored prefix (%d) exceeds pool %d\n",
+                         snap_pos, kvflash_tokens_);
+            result.error = "kvflash: prompt exceeds resident pool";
+            return result;
+        }
+        kvflash_pager_.reset();
+        if (!kvflash_alloc_span(0, snap_pos)) {
+            result.error = "kvflash_slot";
+            return result;
+        }
+    }
+
     // Set up sampler
     sampler_ = req.sampler;
     if (req.do_sample && sampler_.seed != 0) {
@@ -795,7 +975,8 @@ GenerateResult Gemma4Backend::restore_and_generate_impl(int slot,
             result.spec_decode_ran = true;
             if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io,
                                 &req.budget_hook,
-                                &result.budget_forced_close)) {
+                                &result.budget_forced_close,
+                                &result.accept_rate)) {
                 result.error = "spec_decode";
                 return result;
             }
@@ -812,7 +993,8 @@ GenerateResult Gemma4Backend::restore_and_generate_impl(int slot,
             for (int j = 0; j < hidden; ++j) embed_buf[j] *= scale;
 
             if (!gemma4_step(backend_, w_, cache_, embed_buf.data(),
-                             &last_tok, 1, committed - 1, logits)) {
+                             &last_tok, 1, committed - 1, logits,
+                             kvflash_active() ? &kvflash_pager_ : nullptr)) {
                 result.error = "first logits";
                 return result;
             }
@@ -867,6 +1049,13 @@ GenerateResult Gemma4Backend::restore_and_generate_impl(int slot,
 bool Gemma4Backend::snapshot_save(int slot) {
     if (parked_) return false;
     if (slot < 0 || slot >= PREFIX_SLOTS) return false;
+    // kvflash: snapshots copy rows [0, snap_pos) assuming identity layout,
+    // which breaks after the first page-out relocates a chunk.
+    if (kvflash_active() && !kvflash_pager_.is_identity()) {
+        std::fprintf(stderr, "[kvflash] snapshot skipped: pool has relocated "
+                             "chunks (page-table serialization not implemented)\n");
+        return false;
+    }
 
     auto & snap = snapshots_[slot];
     const int n_layer = cache_.n_layer;
@@ -1129,6 +1318,7 @@ bool Gemma4Backend::load_decode_draft() {
 
     delete dflash_target_;
     dflash_target_ = new Gemma4DFlashTarget(w_, cache_, backend_);
+    if (kvflash_active()) dflash_target_->set_kvflash_pager(&kvflash_pager_);  // verify writes must go through the pool
     draft_parked_ = false;
     std::printf("[gemma4] spec-decode ready: capture_layers=%d mirror_cap=%d\n",
                 n_capture, mirror_cap);
diff --git a/server/src/gemma4/gemma4_backend.h b/server/src/gemma4/gemma4_backend.h
index 7898e2359..6295496b9 100644
--- a/server/src/gemma4/gemma4_backend.h
+++ b/server/src/gemma4/gemma4_backend.h
@@ -12,6 +12,8 @@
 #include "gemma4_internal.h"
 #include "gemma4_dflash_target.h"
 #include "common/sampler.h"
+#include "../common/kvflash_pager.h"
+#include "../common/kvflash_scorer.h"
 #include "../qwen3/qwen3_drafter.h"
 
 #include "ggml.h"
@@ -99,6 +101,27 @@ class Gemma4Backend : public ModelBackend {
     static constexpr int PREFIX_SLOTS = 64;
     Gemma4Snapshot        snapshots_[PREFIX_SLOTS];
 
+    // ── kvflash (bounded KV residency; see common/kvflash_pager.h) ──
+    // Pools the FULL-attention layers only (SWA layers already ring-buffer).
+    // Drafter-scored residency by default via the cross-tokenizer bridge
+    // (KvFlashCrossTokScorer: gemma ids are detokenized and re-scored by
+    // the Qwen3-0.6B drafter); LRU is the fallback when no drafter is
+    // found or --kvflash-policy lru.
+    KvFlashPager                   kvflash_pager_;      // pager attached to the full-attention K/V tensors
+    std::unique_ptr<KvFlashScorer> kvflash_scorer_;     // created lazily inside kvflash_maybe_reselect()
+    std::vector<float>             kvflash_scores_;     // output of the last score_chunks() call, indexed by chunk
+    std::vector<int32_t>           kvflash_history_;   // prompt + generated ids
+    std::string                    kvflash_drafter_path_;  // empty = no drafter found (LRU residency)
+    int          kvflash_tokens_ = 0;     // 0 = off
+    int          kvflash_tau_    = 64;    // base reselect interval; DFLASH_KVFLASH_TAU overrides, min 1
+    bool         kvflash_drafter_failed_ = false;  // set when a drafter load fails; cleared again by unpark()
+    bool kvflash_active() const { return kvflash_tokens_ > 0; }
+    void kvflash_read_config();  // resolves drafter path, pool size and tau; called before the cache is created
+    bool kvflash_attach();       // binds kvflash_pager_ to the cache tensors; true when kvflash is off
+    bool kvflash_alloc_span(int kv_start, int n_tok);  // true when kvflash is off, else the pager's alloc_span() result
+    // Drafter rescore + repage every effective-tau generated tokens.
+    void kvflash_maybe_reselect(int generated);
+
     // Prefill prompt tokens in chunks, return absolute committed position.
     // kv_offset: starting KV cache position (0 for fresh prefill, snap_pos for restore).
     int do_prefill(const std::vector<int32_t> & tokens, const DaemonIO & io,
@@ -126,7 +149,8 @@ class Gemma4Backend : public ModelBackend {
                         std::vector<int32_t> & out_tokens,
                         const DaemonIO & io,
                         const BudgetHook * budget_hook = nullptr,
-                        bool * forced_close_out = nullptr);
+                        bool * forced_close_out = nullptr,
+                        float * accept_rate_out = nullptr);
 
     bool load_decode_draft();
     void free_decode_draft();
diff --git a/server/src/gemma4/gemma4_dflash_target.cpp b/server/src/gemma4/gemma4_dflash_target.cpp
index aebd0b096..7983ccfb3 100644
--- a/server/src/gemma4/gemma4_dflash_target.cpp
+++ b/server/src/gemma4/gemma4_dflash_target.cpp
@@ -1,6 +1,7 @@
 // Gemma4DFlashTarget — DFlashTarget adapter for Gemma4 iSWA models.
 
 #include "gemma4_dflash_target.h"
+#include "../common/kvflash_pager.h"
 #include "dflash27b.h"
 
 #include <algorithm>
@@ -53,11 +54,16 @@ bool Gemma4DFlashTarget::verify_batch(
     const float scale = std::sqrt((float)hidden);
     for (size_t i = 0; i < embed.size(); ++i) embed[i] *= scale;
 
+    // kvflash: allocate the verify block's slots up front (may evict).
+    if (pager_ && !pager_->alloc_span(base_pos, n_tokens)) {
+        return false;
+    }
+
     // Run verify (all-token argmax)
     std::vector<int32_t> argmax_buf;
     if (!gemma4_verify_batch(backend_, w_, cache_, embed.data(),
                               tokens.data(), n_tokens, base_pos,
-                              argmax_buf)) {
+                              argmax_buf, pager_)) {
         return false;
     }
 
diff --git a/server/src/gemma4/gemma4_dflash_target.h b/server/src/gemma4/gemma4_dflash_target.h
index 1d12079b0..aeed2feae 100644
--- a/server/src/gemma4/gemma4_dflash_target.h
+++ b/server/src/gemma4/gemma4_dflash_target.h
@@ -32,6 +32,10 @@ class Gemma4DFlashTarget : public DFlashTarget {
                       int & last_tok,
                       std::vector<int32_t> * all_argmax = nullptr) override;
 
+    // kvflash: route verify writes through the pool (slots allocated here,
+    // slot-space mask inside gemma4_verify_batch). Non-owning.
+    void set_kvflash_pager(class KvFlashPager * pager) { pager_ = pager; }
+
     bool snapshot_kv() override;
     bool restore_kv() override;
 
@@ -52,6 +56,7 @@ class Gemma4DFlashTarget : public DFlashTarget {
     Gemma4Weights & w_;
     Gemma4Cache & cache_;
     ggml_backend_t backend_;
+    class KvFlashPager * pager_ = nullptr;
 
     // Capture layer IDs (built once in constructor).
     std::vector<int> capture_ids_;
diff --git a/server/src/gemma4/gemma4_graph.cpp b/server/src/gemma4/gemma4_graph.cpp
index 7df5a5a9f..33f60ffb5 100644
--- a/server/src/gemma4/gemma4_graph.cpp
+++ b/server/src/gemma4/gemma4_graph.cpp
@@ -18,6 +18,7 @@
 #include "gemma4_internal.h"
 #include "common/ggml_graph_precision.h"
 #include "common/gpu_runtime_compat.h"
+#include "../common/kvflash_pager.h"
 #include "dflash27b.h"
 #include "flashprefill.h"
 
@@ -249,7 +250,10 @@ static ggml_tensor * build_gemma4_attn_block(
                                    ? (kv_start - fa_window) : 0;
     const int kv_len_raw = is_swa ? std::min(kv_start + n_tokens, cache_len)
                                   : (kv_start + n_tokens - full_win_start);
-    const int kv_len = (kv_len_raw + 255) & ~255;  // pad to 256 for CUDA FA
+    // Pad to 256 for CUDA FA, clamped to the tensor's physical capacity
+    // (kvflash pools allocate full layers below max_ctx; the slot mask keeps
+    // the clamped span exact).
+    const int kv_len = std::min((kv_len_raw + 255) & ~255, cache_len);
 
     ggml_tensor * Qfa = ggml_permute(ctx, Qcur, 0, 2, 1, 3);
     Qfa = ggml_cont(ctx, Qfa);
@@ -620,8 +624,14 @@ bool gemma4_step(
     const int32_t *         token_ids,
     int                     n_tokens,
     int                     kv_start,
-    std::vector<float> &    out_logits)
+    std::vector<float> &    out_logits,
+    const KvFlashPager *    kvflash)
 {
+    if (kvflash && cache.fa_window > 0) {
+        std::fprintf(stderr, "gemma4_step: kvflash and fa_window are mutually "
+                             "exclusive full-attention policies\n");
+        return false;
+    }
     // Allocate graph context. Persistent thread_local arena: rebuilt graphs
     // land at identical addresses every step, so the ggml-cuda CUDA-graph
     // cache (keyed on nodes[0], memcmps node properties) can replay the
@@ -662,9 +672,18 @@ bool gemma4_step(
     }
 
     // Attention masks (full + SWA)
-    // Full-attention mask: covers all positions [0, kv_start+n_tokens)
+    // Full-attention mask: covers all positions [0, kv_start+n_tokens),
+    // clamped to the full-layer tensor capacity (pool-sized under kvflash) —
+    // must agree with the FA span clamp in build_gemma4_attn_block.
+    int full_cap = cache.max_ctx;
+    for (int il = 0; il < (int)cache.k.size(); ++il) {
+        if (cache.k[(size_t)il] && !gemma4_is_swa_layer(w, il)) {
+            full_cap = (int)cache.k[(size_t)il]->ne[1];
+            break;
+        }
+    }
     const int kv_len_raw = kv_start + n_tokens;
-    const int kv_len_padded = (kv_len_raw + 255) & ~255;
+    const int kv_len_padded = std::min((kv_len_raw + 255) & ~255, full_cap);
     ggml_tensor * mk_full = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, kv_len_padded, n_tokens, 1, 1);
     ggml_set_input(mk_full);
     ggml_tensor * mk_full_f16 = ggml_cast(ctx, mk_full, GGML_TYPE_F16);
@@ -768,12 +787,32 @@ bool gemma4_step(
     std::vector<int32_t> pos((size_t)n_tokens);
     for (int i = 0; i < n_tokens; ++i) pos[i] = kv_start + i;
     ggml_backend_tensor_set(pp, pos.data(), 0, ggml_nbytes(pp));
+    if (!kvi_full && kvflash) {
+        std::fprintf(stderr, "gemma4_step: kvflash requires the set_rows path "
+                             "(DFLASH_GEMMA4_NO_KVPAD is incompatible)\n");
+        ggml_free(ctx);
+        return false;
+    }
+    std::vector<float> kvf_mfull;  // slot-space full mask (kvflash)
     if (kvi_full) {
-        // Full layers append at the absolute position; SWA layers at the ring
-        // slot. Per-token modular indices also land chunks that cross the
-        // ring wrap boundary correctly (the offset-view path wrote one
-        // contiguous block).
-        ggml_backend_tensor_set(kvi_full, pos.data(), 0, ggml_nbytes(kvi_full));
+        // Full layers append at the absolute position (or the kvflash pool
+        // slot); SWA layers at the ring slot. Per-token modular indices also
+        // land chunks that cross the ring wrap boundary correctly (the
+        // offset-view path wrote one contiguous block).
+        if (kvflash) {
+            // Rows + slot-space full mask in one pass (shared helper; the
+            // mask is uploaded below where the legacy path builds its own).
+            std::vector<int32_t> rows;
+            if (!kvflash_fill_rows_and_masks(*kvflash, kv_start, n_tokens,
+                                             kv_len_padded, /*swa_window=*/0,
+                                             rows, &kvf_mfull, nullptr)) {
+                ggml_free(ctx);
+                return false;
+            }
+            ggml_backend_tensor_set(kvi_full, rows.data(), 0, ggml_nbytes(kvi_full));
+        } else {
+            ggml_backend_tensor_set(kvi_full, pos.data(), 0, ggml_nbytes(kvi_full));
+        }
         GGML_ASSERT(swa_size > 0);
         std::vector<int32_t> ring((size_t)n_tokens);
         for (int i = 0; i < n_tokens; ++i) ring[i] = (kv_start + i) % swa_size;
@@ -785,12 +824,18 @@ bool gemma4_step(
         ggml_backend_tensor_set(tok_ids, token_ids, 0, (size_t)n_tokens * sizeof(int32_t));
     }
 
-    // Causal mask (full attention) — padded positions are masked with -inf
-    std::vector<float> mfull((size_t)kv_len_padded * n_tokens, -INFINITY);
-    for (int q = 0; q < n_tokens; ++q) {
-        const int abs_q = kv_start + q;
-        for (int k = 0; k <= abs_q && k < kv_len_raw; ++k) {
-            mfull[(size_t)q * kv_len_padded + k] = 0.0f;
+    // Causal mask (full attention) — padded positions are masked with -inf.
+    // kvflash: SLOT-space mask already built alongside the append rows.
+    std::vector<float> mfull;
+    if (kvflash) {
+        mfull = std::move(kvf_mfull);
+    } else {
+        mfull.assign((size_t)kv_len_padded * n_tokens, -INFINITY);
+        for (int q = 0; q < n_tokens; ++q) {
+            const int abs_q = kv_start + q;
+            for (int k = 0; k <= abs_q && k < kv_len_raw; ++k) {
+                mfull[(size_t)q * kv_len_padded + k] = 0.0f;
+            }
         }
     }
     ggml_backend_tensor_set(mk_full, mfull.data(), 0, ggml_nbytes(mk_full));
@@ -844,8 +889,14 @@ bool gemma4_verify_batch(
     const int32_t *         token_ids,
     int                     n_tokens,
     int                     kv_start,
-    std::vector<int32_t> &  out_argmax)
+    std::vector<int32_t> &  out_argmax,
+    const KvFlashPager *    kvflash)
 {
+    if (kvflash && cache.fa_window > 0) {
+        std::fprintf(stderr, "gemma4_verify_batch: kvflash and fa_window are "
+                             "mutually exclusive\n");
+        return false;
+    }
     ggml_init_params ip{};
     ip.mem_size = ggml_tensor_overhead() * 16384 + ggml_graph_overhead() + 16 * 1024 * 1024;
     ip.no_alloc = true;
@@ -865,9 +916,28 @@ bool gemma4_verify_batch(
         ggml_set_input(tok_ids);
     }
 
-    // Attention masks (padded)
+    // kvflash: full-layer writes must go through set_rows to land in pool
+    // slots; SWA ring rows ride the same mechanism (pos % swa_size).
+    ggml_tensor * kvi_full = nullptr, * kvi_swa = nullptr;
+    if (kvflash) {
+        kvi_full = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
+        ggml_set_input(kvi_full);
+        kvi_swa = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
+        ggml_set_input(kvi_swa);
+    }
+
+    // Attention masks (padded; full width clamps to the full-layer tensor
+    // capacity, which is pool-sized under kvflash — must agree with the FA
+    // span clamp in build_gemma4_attn_block)
+    int full_cap = cache.max_ctx;
+    for (int il = 0; il < (int)cache.k.size(); ++il) {
+        if (cache.k[(size_t)il] && !gemma4_is_swa_layer(w, il)) {
+            full_cap = (int)cache.k[(size_t)il]->ne[1];
+            break;
+        }
+    }
     const int kv_len_raw = kv_start + n_tokens;
-    const int kv_len_padded = (kv_len_raw + 255) & ~255;
+    const int kv_len_padded = std::min((kv_len_raw + 255) & ~255, full_cap);
     ggml_tensor * mk_full = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, kv_len_padded, n_tokens, 1, 1);
     ggml_set_input(mk_full);
     ggml_tensor * mk_full_f16 = ggml_cast(ctx, mk_full, GGML_TYPE_F16);
@@ -914,7 +984,8 @@ bool gemma4_verify_batch(
         }
         cur = build_gemma4_layer(ctx, gf, w, cache, il, cur, pp,
                                    mk_full_f16, mk_swa_f16, pl_input,
-                                   kv_start, n_tokens, cap_idx);
+                                   kv_start, n_tokens, cap_idx,
+                                   kvi_full, kvi_swa);
     }
 
     // Final norm
@@ -954,12 +1025,27 @@ bool gemma4_verify_batch(
         ggml_backend_tensor_set(tok_ids, token_ids, 0, (size_t)n_tokens * sizeof(int32_t));
     }
 
-    // Masks
-    std::vector<float> mfull((size_t)kv_len_padded * n_tokens, -INFINITY);
-    for (int q = 0; q < n_tokens; ++q) {
-        const int abs_q = kv_start + q;
-        for (int k = 0; k <= abs_q && k < kv_len_raw; ++k) {
-            mfull[(size_t)q * kv_len_padded + k] = 0.0f;
+    // Masks (kvflash: slot-space full mask + slot rows via the shared helper)
+    std::vector<float> mfull;
+    if (kvflash) {
+        std::vector<int32_t> rows;
+        if (!kvflash_fill_rows_and_masks(*kvflash, kv_start, n_tokens,
+                                         kv_len_padded, /*swa_window=*/0,
+                                         rows, &mfull, nullptr)) {
+            ggml_free(ctx);
+            return false;
+        }
+        ggml_backend_tensor_set(kvi_full, rows.data(), 0, ggml_nbytes(kvi_full));
+        std::vector<int32_t> ring((size_t)n_tokens);
+        for (int i = 0; i < n_tokens; ++i) ring[(size_t)i] = (kv_start + i) % swa_size;
+        ggml_backend_tensor_set(kvi_swa, ring.data(), 0, ggml_nbytes(kvi_swa));
+    } else {
+        mfull.assign((size_t)kv_len_padded * n_tokens, -INFINITY);
+        for (int q = 0; q < n_tokens; ++q) {
+            const int abs_q = kv_start + q;
+            for (int k = 0; k <= abs_q && k < kv_len_raw; ++k) {
+                mfull[(size_t)q * kv_len_padded + k] = 0.0f;
+            }
         }
     }
     ggml_backend_tensor_set(mk_full, mfull.data(), 0, ggml_nbytes(mk_full));
diff --git a/server/src/gemma4/gemma4_internal.h b/server/src/gemma4/gemma4_internal.h
index d1e0e9033..800f00101 100644
--- a/server/src/gemma4/gemma4_internal.h
+++ b/server/src/gemma4/gemma4_internal.h
@@ -188,14 +188,19 @@ struct Gemma4Cache {
     ggml_backend_buffer_t feat_buf = nullptr;
 };
 
+// `ctx_alloc` (kvflash): when > 0 and < max_ctx, FULL-attention layers' K/V
+// tensors are allocated at ctx_alloc rows (the resident pool); SWA layers
+// keep their sliding-window ring buffers (already bounded). cache.max_ctx
+// stays the logical bound. 0 = allocate full layers at max_ctx (default).
 bool  create_gemma4_cache(ggml_backend_t backend, const Gemma4Weights & w,
-                           int max_ctx, Gemma4Cache & out);
+                           int max_ctx, Gemma4Cache & out, int ctx_alloc = 0);
 bool  create_gemma4_cache_partial(ggml_backend_t backend,
                                   const Gemma4Weights & w,
                                   int max_ctx,
                                   int layer_begin,
                                   int layer_end,
-                                  Gemma4Cache & out);
+                                  Gemma4Cache & out,
+                                  int ctx_alloc = 0);
 void  free_gemma4_cache(Gemma4Cache & c);
 
 // Allocate target_feat ring buffer (call after draft load determines n_capture_layers).
@@ -221,6 +226,12 @@ void free_gemma4_snapshot(Gemma4Snapshot & s);
 // Returns logits for last token.
 // token_ids: raw token IDs needed for per-layer embedding lookup (may be nullptr
 //            if the model has no per-layer embeddings).
+// `kvflash`: optional bounded-residency pager over the FULL-attention KV
+// (see common/kvflash_pager.h). When set, full-layer append rows come from
+// the pager's slot mapping and the full mask is built in SLOT space; SWA
+// ring buffers are untouched. The caller must have allocated slots for
+// [kv_start, kv_start + n_tokens) via slot_for() beforehand. Requires the
+// set_rows path (refused under DFLASH_GEMMA4_NO_KVPAD) and fa_window == 0.
 bool gemma4_step(
     ggml_backend_t          backend,
     const Gemma4Weights &   w,
@@ -229,10 +240,17 @@ bool gemma4_step(
     const int32_t *         token_ids,
     int                     n_tokens,
     int                     kv_start,
-    std::vector<float> &    out_logits);
+    std::vector<float> &    out_logits,
+    const class KvFlashPager * kvflash = nullptr);
 
 // Verify batch: run forward pass returning argmax for ALL positions.
 // Used by DFlash speculative decode target.
+// `kvflash`: optional bounded-residency pager (caller must alloc_span()
+// [kv_start, kv_start+n_tokens) first). Full-layer writes go to pool slots
+// via set_rows with a slot-space causal mask; SWA ring writes/masks are
+// unchanged. Rejected draft slots hold future positions, so the validity
+// rule excludes them until the next verify rewrites them (KV truncation
+// semantics, same as the full cache).
 bool gemma4_verify_batch(
     ggml_backend_t          backend,
     const Gemma4Weights &   w,
@@ -241,7 +259,8 @@ bool gemma4_verify_batch(
     const int32_t *         token_ids,
     int                     n_tokens,
     int                     kv_start,
-    std::vector<int32_t> &  out_argmax);
+    std::vector<int32_t> &  out_argmax,
+    const class KvFlashPager * kvflash = nullptr);
 
 // Project hidden states through lm_head (out_norm + output + softcap + argmax).
 // Used by DFlash draft to convert draft hidden states to token IDs.
diff --git a/server/src/gemma4/gemma4_loader.cpp b/server/src/gemma4/gemma4_loader.cpp
index 00be4c8a8..c6fbb5c6b 100644
--- a/server/src/gemma4/gemma4_loader.cpp
+++ b/server/src/gemma4/gemma4_loader.cpp
@@ -475,9 +475,10 @@ void free_gemma4_weights(Gemma4Weights & w) {
 // ── Cache ──────────────────────────────────────────────────────────────
 
 bool create_gemma4_cache(ggml_backend_t backend, const Gemma4Weights & w,
-                          int max_ctx, Gemma4Cache & out) {
+                          int max_ctx, Gemma4Cache & out, int ctx_alloc) {
     return create_gemma4_cache_partial(
-        backend, w, max_ctx, /*layer_begin=*/0, /*layer_end=*/w.n_layer, out);
+        backend, w, max_ctx, /*layer_begin=*/0, /*layer_end=*/w.n_layer, out,
+        ctx_alloc);
 }
 
 bool create_gemma4_cache_partial(ggml_backend_t backend,
@@ -485,7 +486,8 @@ bool create_gemma4_cache_partial(ggml_backend_t backend,
                                   int max_ctx,
                                   int layer_begin,
                                   int layer_end,
-                                  Gemma4Cache & out) {
+                                  Gemma4Cache & out,
+                                  int ctx_alloc) {
     if (layer_begin < 0) layer_begin = 0;
     if (layer_end < 0) layer_end = w.n_layer;
     if (layer_begin > layer_end || layer_end > w.n_layer) return false;
@@ -521,6 +523,10 @@ bool create_gemma4_cache_partial(ggml_backend_t backend,
     const int swa_size = (w.sliding_window > 0 && w.sliding_window < max_ctx)
                              ? w.sliding_window : max_ctx;
 
+    // kvflash: FULL-attention layers at pool capacity; SWA ring buffers are
+    // already bounded and stay at swa_size.
+    const int full_phys = (ctx_alloc > 0 && ctx_alloc < max_ctx) ? ctx_alloc : max_ctx;
+
     // Determine KV source for each layer
     int last_kv_layer = -1;
     for (int il = 0; il < w.n_layer; ++il) {
@@ -529,7 +535,7 @@ bool create_gemma4_cache_partial(ggml_backend_t backend,
             const int D  = gemma4_head_dim(w, il);
             const int Hk = gemma4_n_head_kv(w, il);
             const bool is_swa = gemma4_is_swa_layer(w, il);
-            const int cache_len = is_swa ? swa_size : max_ctx;
+            const int cache_len = is_swa ? swa_size : full_phys;
             if (owned_layer) {
                 out.k[il] = ggml_new_tensor_3d(out.ctx, GGML_TYPE_F16, D, cache_len, Hk);
                 out.v[il] = ggml_new_tensor_3d(out.ctx, GGML_TYPE_F16, D, cache_len, Hk);
diff --git a/server/src/internal.h b/server/src/internal.h
index 3c9611326..125e9a24e 100644
--- a/server/src/internal.h
+++ b/server/src/internal.h
@@ -373,6 +373,8 @@ struct TargetCache {
 void snapshot_ssm_state(TargetCache & c);
 // Restore the SSM+conv state from the snapshot.
 void restore_ssm_state(TargetCache & c);
+// Allocate rollback snapshot tensors mirroring live ssm/conv state (MoE path).
+bool ensure_ssm_snapshot(TargetCache & c, ggml_backend_t backend);
 
 // ─── Cross-request prefix snapshot (Phase A) ──────────────────────
 //
@@ -471,12 +473,18 @@ bool restore_target_cache_chain(const PrefixSnapshot * thick,
 // When prefill_only is true, rollback tensors (snapshots, intermediates) are
 // skipped — saving ~1.4 GB on 48 DeltaNet layers. Use migrate_prefill_cache()
 // to promote the cache to a full decode cache after prefill.
+// `ctx_alloc` (0 = max_ctx): physical token capacity of the attention KV
+// tensors. When smaller than max_ctx, a KvFlashPager maps logical positions to
+// pool slots and pages cold chunks to host (bounded KV residency); the
+// logical context bound stays max_ctx. Recurrent (DeltaNet) state is
+// unaffected.
 bool create_target_cache(const TargetWeights & w,
                          int max_ctx,
                          int max_verify_tokens,
                          ggml_backend_t backend,
                          TargetCache & out,
-                         bool prefill_only = false);
+                         bool prefill_only = false,
+                         int ctx_alloc = 0);
 
 bool create_target_cache_partial(const TargetWeights & w,
                                  int max_ctx,
@@ -486,7 +494,8 @@ bool create_target_cache_partial(const TargetWeights & w,
                                  bool prefill_only,
                                  int layer_begin,
                                  int layer_end,
-                                 bool allocate_target_feat);
+                                 bool allocate_target_feat,
+                                 int ctx_alloc = 0);
 
 void free_target_cache(TargetCache & c);
 
diff --git a/server/src/laguna/laguna_backend.cpp b/server/src/laguna/laguna_backend.cpp
index ab75ef5a8..9631f7f76 100644
--- a/server/src/laguna/laguna_backend.cpp
+++ b/server/src/laguna/laguna_backend.cpp
@@ -8,6 +8,7 @@
 
 #include "laguna_backend.h"
 #include "laguna_internal.h"
+#include "qwen3/qwen3_kvflash_scorer.h"
 #include "dflash27b.h"
 
 #include <chrono>
@@ -68,16 +69,130 @@ bool LagunaBackend::init() {
 
     cache_.kv_k_type = args_.kv_type;
     cache_.kv_v_type = args_.kv_type;
-    if (!create_laguna_target_cache(w_, args_.max_ctx, backend_, cache_)) {
+    kvflash_read_config();
+    if (!create_laguna_target_cache(w_, args_.max_ctx, backend_, cache_,
+                                    kvflash_tokens_)) {
         std::fprintf(stderr, "cache failed: %s\n", dflash27b_last_error());
         free_laguna_target_weights(w_);
         ggml_backend_free(backend_); backend_ = nullptr;
         return false;
     }
+    if (!kvflash_attach()) {   // cache + weights already exist here: release them too
+        free_laguna_target_cache(cache_); free_laguna_target_weights(w_);  // before the backend goes away (same cleanup as unpark())
+        ggml_backend_free(backend_); backend_ = nullptr;
+        return false;
+    }
+    return true;
+}
+
+// ── kvflash helpers ─────────────────────────────────────────────────────
 
+// Laguna's pager protections: the trailing sliding_window span (+1 chunk
+// for the partially filled head) must stay resident so SWA attention stays
+// exact under paging. This drives both the pool floor and the attach config.
+KvFlashConfig LagunaBackend::kvflash_config() const {
+    KvFlashConfig pc;
+    pc.tail_window_chunks =
+        std::max(4, (w_.sliding_window + pc.chunk_tokens - 1) / pc.chunk_tokens + 1);
+    return pc;
+}
+
+void LagunaBackend::kvflash_read_config() {  // fills kvflash_drafter_path_, kvflash_tokens_, kvflash_tau_
+    if (std::getenv("DFLASH_KVFLASH")) {  // drafter lookup only when the env var is present
+        kvflash_drafter_path_ = kvflash_find_drafter(args_.target_path.c_str());
+    }
+    // "auto" sizes from the GPU (weights resident, cache not yet allocated):
+    // laguna pools ALL n_layer layers at the configured KV quant.
+    KvFlashAutoBudget kvf_budget;
+    {
+        size_t gpu_free = 0, gpu_total = 0;  // stay 0 when the backend reports no device
+        if (ggml_backend_dev_t dev = ggml_backend_get_device(backend_)) {
+            ggml_backend_dev_memory(dev, &gpu_free, &gpu_total);
+        }
+        kvf_budget.free_bytes      = (int64_t)gpu_free;
+        kvf_budget.bytes_per_token = (int64_t)w_.n_layer * w_.n_head_kv * 2 *  // x2: one K row and one V row
+            (int64_t)ggml_row_size(args_.kv_type, w_.head_dim);  // per layer, per KV head
+        kvf_budget.reserve_bytes   = (int64_t)(1.5 * 1073741824.0) +  // 1.5 GiB base reserve
+            (kvflash_drafter_path_.empty() ? 0 : (int64_t)(1.7 * 1073741824.0));  // +1.7 GiB when a drafter path was found
+    }
+    kvflash_tokens_ = kvflash_pool_from_env(args_.max_ctx, kvflash_config(),
+                                            !kvflash_drafter_path_.empty(),
+                                            kvf_budget);
+    if (kvflash_tokens_ > 0) {  // tau is only read when kvflash ends up active
+        const char * tau = std::getenv("DFLASH_KVFLASH_TAU");
+        kvflash_tau_ = std::max(1, tau ? std::atoi(tau) : 64);  // default 64 when unset; clamped to >= 1
+    }
+}
+
+// Drafter rescore + repage (FlashMemory tau loop) with the cross-tokenizer
+// scorer: laguna ids are detokenized and re-scored through the Qwen3-0.6B
+// drafter (relevance is text-level, so the tokenizer gap is bridged by
+// re-tokenization). Lazy: the drafter + tokenizers load on the first
+// reselect that needs them, never on a request's first tokens.
+void LagunaBackend::kvflash_maybe_reselect(const std::vector<int32_t> & history,
+                                           int generated) {  // generated: decode-loop count (callers pass s + 1)
+    if (!kvflash_active() || kvflash_tau_ <= 0) return;  // no-op unless kvflash is on
+    const int tau = std::max<int>(kvflash_tau_, (int)(history.size() / 45));  // effective tau: at least the configured value, grows with history length
+    if (generated % tau != 0) return;  // act only when `generated` is a multiple of the effective tau
+    if (!kvflash_scorer_) {  // first reselect that needs the scorer: build it lazily
+        if (kvflash_drafter_path_.empty() || kvflash_drafter_failed_) return;  // no drafter found, or an earlier load failed
+        if (!drafter_loaded_) {
+            ggml_backend_synchronize(backend_);
+            std::fprintf(stderr, "[kvflash] loading drafter for residency scoring: %s\n",
+                         kvflash_drafter_path_.c_str());
+            if (!load_drafter(kvflash_drafter_path_, /*gpu_layers=*/999,
+                              args_.device.gpu, drafter_ctx_)) {
+                std::fprintf(stderr, "[kvflash] drafter load failed (%s); staying on "
+                                     "LRU residency\n", dflash27b_last_error());
+                kvflash_drafter_failed_ = true;  // later calls return early instead of retrying the load
+                return;
+            }
+            drafter_loaded_ = true;
+        }
+        kvflash_scorer_ = std::make_unique<KvFlashCrossTokScorer>(
+            &drafter_ctx_, args_.target_path, kvflash_drafter_path_);
+        std::fprintf(stderr, "[kvflash] cross-tokenizer drafter scorer attached "
+                             "(tau=%d)\n", kvflash_tau_);
+    }
+    if (!kvflash_scorer_->score_chunks(history, kvflash_pager_.chunk_tokens(),
+                                       kvflash_scores_)) {
+        return;  // scorer failure -> keep LRU behavior this round
+    }
+    kvflash_pager_.score_hook = [this](int c) {  // hook reads kvflash_scores_ through `this`
+        return c < (int)kvflash_scores_.size() ? kvflash_scores_[c] : 1e30f;  // NOTE(review): 1e30f for chunks past the scored range presumably keeps them resident; confirm against the pager
+    };
+    const int events = kvflash_pager_.reselect();
+    kvflash_pager_.score_hook = nullptr;  // detach the hook once reselect() has run
+    if (events > 0) {  // log only when reselect() reported page events
+        std::fprintf(stderr, "[kvflash] reselect @gen=%d: %d page events\n",
+                     generated, events);
+    }
+}
+
+bool LagunaBackend::kvflash_attach() {
+    if (!kvflash_active()) return true;
+    KvFlashConfig pc = kvflash_config();
+    pc.pool_tokens = kvflash_tokens_;
+    if (!kvflash_pager_.attach(pc, cache_.attn_k, cache_.attn_v)) {
+        std::fprintf(stderr, "kvflash: pager attach failed (pool=%d)\n",
+                     kvflash_tokens_);
+        return false;
+    }
+    std::printf("[kvflash] resident pool %d tokens (logical max_ctx %d), "
+                "policy=%s, swa_tail=%d chunks\n",
+                kvflash_tokens_, args_.max_ctx,
+                !kvflash_drafter_path_.empty()
+                    ? "drafter/cross-tok (attaches on first reselect)"
+                    : "lru (recency-only: no Qwen3-0.6B drafter found)",
+                pc.tail_window_chunks);
+    std::fflush(stdout);
     return true;
 }
 
+bool LagunaBackend::kvflash_alloc_span(int kv_start, int n_tok) {
+    return !kvflash_active() || kvflash_pager_.alloc_span(kv_start, n_tok);
+}
+
 void LagunaBackend::print_ready_banner() const {
     std::printf("[laguna-daemon] ready vocab=%lld eos=%d eot=%d max_ctx=%d kv=%s chunk=%d\n",
                 (long long)w_.embedder.n_vocab, w_.eos_id, w_.eos_chat_id,
@@ -107,10 +222,17 @@ bool LagunaBackend::unpark(const std::string & what) {
         }
         cache_.kv_k_type = args_.kv_type;
         cache_.kv_v_type = args_.kv_type;
-        if (!create_laguna_target_cache(w_, args_.max_ctx, backend_, cache_)) {
+        if (!create_laguna_target_cache(w_, args_.max_ctx, backend_, cache_,
+                                        kvflash_tokens_)) {
             std::fprintf(stderr, "[unpark] cache: %s\n", dflash27b_last_error());
             return false;
         }
+        if (!kvflash_attach()) {
+            free_laguna_target_cache(cache_);
+            free_laguna_target_weights(w_);
+            return false;          // still parked, resources released
+        }
+        kvflash_drafter_failed_ = false;   // fresh VRAM: allow a retry
         target_parked_ = false;
         std::printf("[unpark] target restored\n"); std::fflush(stdout);
     }
@@ -132,6 +254,13 @@ bool LagunaBackend::ensure_slot(int slot) {
 }
 
 bool LagunaBackend::snapshot_save(int slot) {
+    // kvflash: snapshots copy rows assuming identity layout, which breaks
+    // after the first page-out relocates a chunk.
+    if (kvflash_active() && !kvflash_pager_.is_identity()) {
+        std::fprintf(stderr, "[kvflash] snapshot skipped: pool has relocated "
+                             "chunks (page-table serialization not implemented)\n");
+        return false;
+    }
     if (!ensure_slot(slot)) return false;
     if (!laguna_snapshot_save(cache_, snap_backend_, w_.n_layer,
                                w_.n_head_kv, w_.head_dim, snapshots_[slot])) {
@@ -189,7 +318,19 @@ GenerateResult LagunaBackend::generate_impl(const GenerateRequest & req,
         return result;
     }
 
+    // kvflash: prefill rows land identity-mapped, so the prompt must fit the
+    // pool with one chunk of decode headroom (decode then evicts LRU live).
+    if (kvflash_active() &&
+        N > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) {
+        std::fprintf(stderr, "[kvflash] prompt (%d) exceeds pool %d; raise "
+                             "--kvflash\n", N, kvflash_tokens_);
+        result.error = "kvflash: prompt exceeds resident pool";
+        return result;
+    }
+
     reset_laguna_target_cache(cache_);
+    if (kvflash_active()) kvflash_pager_.reset();
+    const KvFlashPager * kvf = kvflash_active() ? &kvflash_pager_ : nullptr;
 
     // ── Prefill ──
     std::vector<float> embed_pf((size_t)N * w_.n_embd);
@@ -205,15 +346,23 @@ GenerateResult LagunaBackend::generate_impl(const GenerateRequest & req,
     for (int c = 0; c < n_chunks && ok; ++c) {
         const int kv_start = c * args_.chunk;
         const int n_tok    = std::min(args_.chunk, N - c * args_.chunk);
-        ok = laguna_step(backend_, w_, cache_,
+        ok = kvflash_alloc_span(kv_start, n_tok) &&
+             laguna_step(backend_, w_, cache_,
                           embed_pf.data() + (size_t)kv_start * w_.n_embd,
-                          n_tok, kv_start, no_mask, last_logits);
+                          n_tok, kv_start, no_mask, last_logits, kvf);
     }
     if (!ok) { result.error = "prefill"; return result; }
     auto t_pf1 = std::chrono::steady_clock::now();
     result.prefill_s = std::chrono::duration<double>(t_pf1 - t_pf0).count();
 
     // ── Inline snapshot (if requested) ──
+    // kvflash: snapshots copy rows [0, snap_pos) assuming identity layout,
+    // which holds until the first page-out relocates a chunk.
+    if (kvflash_active() && req.snap_slot >= 0 &&
+        !kvflash_pager_.is_identity()) {
+        std::fprintf(stderr, "[kvflash] snapshot skipped: pool has relocated "
+                             "chunks (page-table serialization not implemented)\n");
+    } else
     if (req.snap_slot >= 0 && req.snap_pos > 0 && req.snap_pos <= N) {
         if (ensure_slot(req.snap_slot) &&
             laguna_snapshot_save(cache_, snap_backend_, w_.n_layer,
@@ -303,8 +452,10 @@ GenerateResult LagunaBackend::generate_impl(const GenerateRequest & req,
         }
         if (!w_.embedder.embed(&next_tok, 1, embed_step.data())) { ok = false; break; }
         std::vector<float> step_logits;
-        if (!laguna_step(backend_, w_, cache_, embed_step.data(), 1,
-                          cache_.cur_pos, no_mask, step_logits)) { ok = false; break; }
+        if (!kvflash_alloc_span(cache_.cur_pos, 1) ||
+            !laguna_step(backend_, w_, cache_, embed_step.data(), 1,
+                          cache_.cur_pos, no_mask, step_logits, kvf)) { ok = false; break; }
+        kvflash_maybe_reselect(history, s + 1);
         next_tok = pick(step_logits);
     }
     auto t_g1 = std::chrono::steady_clock::now();
@@ -342,6 +493,24 @@ GenerateResult LagunaBackend::restore_and_generate_impl(int slot,
         return result;
     }
 
+    // kvflash: restore lands rows identity-mapped; the full prompt (prefix +
+    // diff) must fit the pool. Rebuild the pager mapping over the prefix.
+    if (kvflash_active() &&
+        N > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) {
+        std::fprintf(stderr, "[kvflash] restore prompt (%d) exceeds pool %d; "
+                             "raise --kvflash\n", N, kvflash_tokens_);
+        result.error = "kvflash: prompt exceeds resident pool";
+        return result;
+    }
+    if (kvflash_active()) {
+        kvflash_pager_.reset();
+        if (!kvflash_alloc_span(0, prefix_len)) {
+            result.error = "kvflash_slot";
+            return result;
+        }
+    }
+    const KvFlashPager * kvf = kvflash_active() ? &kvflash_pager_ : nullptr;
+
     // Re-prefill diff tokens (or last cached token when diff is empty).
     if (prefix_len == N) {
         if (prefix_len <= 0) { result.error = "empty_diff"; return result; }
@@ -363,9 +532,10 @@ GenerateResult LagunaBackend::restore_and_generate_impl(int slot,
         const int off   = c * args_.chunk;
         const int n_tok = std::min(args_.chunk, diff_n - off);
         const int starts = kv_start + off;
-        ok = laguna_step(backend_, w_, cache_,
+        ok = kvflash_alloc_span(starts, n_tok) &&
+             laguna_step(backend_, w_, cache_,
                           embed_diff.data() + (size_t)off * w_.n_embd,
-                          n_tok, starts, no_mask, last_logits);
+                          n_tok, starts, no_mask, last_logits, kvf);
     }
     if (!ok) { result.error = "prefill"; return result; }
 
@@ -437,8 +607,10 @@ GenerateResult LagunaBackend::restore_and_generate_impl(int slot,
         if (out_io.cancelled) break;
         if (!w_.embedder.embed(&next_tok, 1, embed_step.data())) { ok = false; break; }
         std::vector<float> step_logits;
-        if (!laguna_step(backend_, w_, cache_, embed_step.data(), 1,
-                          cache_.cur_pos, no_mask, step_logits)) { ok = false; break; }
+        if (!kvflash_alloc_span(cache_.cur_pos, 1) ||
+            !laguna_step(backend_, w_, cache_, embed_step.data(), 1,
+                          cache_.cur_pos, no_mask, step_logits, kvf)) { ok = false; break; }
+        kvflash_maybe_reselect(history, s + 1);
         next_tok = pick(step_logits);
     }
     auto t_g1 = std::chrono::steady_clock::now();
@@ -1085,8 +1257,10 @@ bool LagunaBackend::hybrid_forward_one_token(int32_t tok, int kv_pos,
         static const bool _nm = (std::getenv("DFLASH_NO_MASK") != nullptr);
         static std::vector<float> _sg_logits;
         static std::vector<int32_t> _sg_sel;
+        if (!kvflash_alloc_span(kv_pos, 1)) return false;
         if (!laguna_step_hybrid(backend_, w_, cache_, act_cur.data(), 1, kv_pos, _nm,
-                                *moe_hybrid_, _sg_logits, &_sg_sel))
+                                *moe_hybrid_, _sg_logits, &_sg_sel,
+                                kvflash_active() ? &kvflash_pager_ : nullptr))
             return false;
         // Reactive cache warm + routing observe, POST-compute (off the
         // single-graph critical path): make each selected expert resident
@@ -1128,6 +1302,14 @@ bool LagunaBackend::hybrid_forward_one_token(int32_t tok, int kv_pos,
 
     // GPU-resident state for MoE layers
     GpuResidentState gpu_state;
+    // The per-layer fallback writes KV at literal view offsets (no set_rows),
+    // which a kvflash pool cannot express once chunks relocate.
+    if (kvflash_active()) {
+        std::fprintf(stderr, "[kvflash] laguna per-layer hybrid decode is not "
+                             "pool-aware; unset DFLASH_LAGUNA_NO_SINGLE_GRAPH\n");
+        return false;
+    }
+
     if (!init_gpu_resident_state(gpu_state, backend_, hidden)) return false;
     ggml_backend_tensor_set(gpu_state.act_cur, act_cur.data(), 0, sizeof(float) * (size_t)hidden);
 
@@ -1348,7 +1530,25 @@ GenerateResult LagunaBackend::generate_hybrid(const GenerateRequest & req,
         return result;
     }
 
+    // kvflash: hybrid prefill writes rows identity-mapped (legacy per-layer
+    // views), so the prompt must fit the pool; the pager mapping is built up
+    // front and stays identity through prefill (no eviction can trigger).
+    if (kvflash_active() &&
+        N > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) {
+        std::fprintf(stderr, "[kvflash] hybrid prompt (%d) exceeds pool %d; "
+                             "raise --kvflash\n", N, kvflash_tokens_);
+        result.error = "kvflash: prompt exceeds resident pool";
+        return result;
+    }
+
     reset_laguna_target_cache(cache_);
+    if (kvflash_active()) {
+        kvflash_pager_.reset();
+        if (!kvflash_alloc_span(0, N)) {
+            result.error = "kvflash_slot";
+            return result;
+        }
+    }
 
     // ── Hybrid Prefill: layer-by-layer pre-FFN + batched hybrid FFN ──
     const int hidden = w_.n_embd;
@@ -1652,6 +1852,7 @@ GenerateResult LagunaBackend::generate_hybrid(const GenerateRequest & req,
             break;
         }
         cache_.cur_pos++;
+        kvflash_maybe_reselect(history, s + 1);
 
         if (req.do_sample) {
             // For sampling, we need full logits — project from act_cur
diff --git a/server/src/laguna/laguna_backend.h b/server/src/laguna/laguna_backend.h
index 156c82e6b..881ad1abd 100644
--- a/server/src/laguna/laguna_backend.h
+++ b/server/src/laguna/laguna_backend.h
@@ -10,6 +10,8 @@
 #include "laguna_internal.h"
 #include "placement/placement_config.h"
 #include "qwen3_drafter.h"
+#include "kvflash_pager.h"
+#include "kvflash_scorer.h"
 #include "../common/moe_hybrid_ffn_eval.h"
 #include "../common/moe_hybrid_storage.h"
 #include "../common/moe_hybrid_routing_stats.h"
@@ -99,6 +101,34 @@ class LagunaBackend : public ModelBackend {
 
     bool ensure_slot(int slot);
 
+    // ── kvflash (bounded KV residency; see common/kvflash_pager.h) ──
+    // Drafter-scored residency by default: the Qwen3-0.6B drafter scores
+    // chunks through the cross-tokenizer bridge (KvFlashCrossTokScorer —
+    // relevance is text-level, so the target's ids are detokenized and
+    // re-tokenized for the drafter). LRU is the fallback when no drafter is
+    // found or --kvflash-policy lru. The pager covers ALL 40 layers; SWA
+    // exactness comes from a protected tail >= sliding_window.
+    KvFlashPager                   kvflash_pager_;
+    std::unique_ptr<KvFlashScorer> kvflash_scorer_;
+    std::vector<float>             kvflash_scores_;
+    std::string                    kvflash_drafter_path_;
+    int          kvflash_tokens_ = 0;     // 0 = off
+    int          kvflash_tau_    = 64;
+    bool         kvflash_drafter_failed_ = false;
+    bool kvflash_active() const { return kvflash_tokens_ > 0; }
+    // Drafter rescore + repage every effective-tau generated tokens
+    // (lazy-loads the drafter + cross-tokenizer scorer on first need).
+    void kvflash_maybe_reselect(const std::vector<int32_t> & history, int generated);
+    // Pager protections (SWA tail) shared by the floor and attach.
+    KvFlashConfig kvflash_config() const;
+    // Read DFLASH_KVFLASH and round/clamp; also resolves the drafter path and DFLASH_KVFLASH_TAU. Call before cache creation.
+    void kvflash_read_config();
+    // Attach the pager to the freshly created cache (init / unpark).
+    bool kvflash_attach();
+    // Allocate pool slots for [kv_start, kv_start+n_tok) (evicting LRU as
+    // needed) ahead of a laguna_step call. False if the pool is exhausted.
+    bool kvflash_alloc_span(int kv_start, int n_tok);
+
     // Hybrid mode helpers
     bool init_hybrid_mode();
     // Build hot/cold expert storage for `placement` by re-reading expert weights
diff --git a/server/src/laguna/laguna_internal.h b/server/src/laguna/laguna_internal.h
index ec09b6113..cc37d2051 100644
--- a/server/src/laguna/laguna_internal.h
+++ b/server/src/laguna/laguna_internal.h
@@ -168,16 +168,21 @@ struct LagunaTargetCache {
     std::vector<ggml_tensor *> attn_v;
 };
 
+// `ctx_alloc` (kvflash): when > 0 and < max_ctx, the per-layer K/V tensors
+// are allocated at ctx_alloc rows (the resident pool) while cache.max_ctx
+// keeps the logical bound. 0 = allocate at max_ctx (default).
 bool create_laguna_target_cache(const LagunaTargetWeights & w,
                                  int max_ctx,
                                  ggml_backend_t backend,
-                                 LagunaTargetCache & out);
+                                 LagunaTargetCache & out,
+                                 int ctx_alloc = 0);
 bool create_laguna_target_cache_partial(const LagunaTargetWeights & w,
                                          int max_ctx,
                                          ggml_backend_t backend,
                                          int layer_begin,
                                          int layer_end,
-                                         LagunaTargetCache & out);
+                                         LagunaTargetCache & out,
+                                         int ctx_alloc = 0);
 void free_laguna_target_cache(LagunaTargetCache & c);
 void reset_laguna_target_cache(LagunaTargetCache & c);
 
@@ -280,6 +285,12 @@ LagunaGraphOutputs build_laguna_graph(
 // `out_logits` : on success, resized to vocab and filled with last-token
 //             logits when in.output_last_only == true (default in this
 //             helper).
+// `kvflash`: optional bounded-residency pager (see common/kvflash_pager.h).
+// When set, the K/V append rows come from the pager's slot mapping and both
+// masks are built in SLOT space (causal / sliding-window conditions evaluated
+// on the position each slot holds). The caller must have allocated slots for
+// [kv_start, kv_start + n_tok) via slot_for() beforehand. Requires masks
+// (no_mask == false) and the kv_pad set_rows path; refused otherwise.
 bool laguna_step(
     ggml_backend_t              backend,
     const LagunaTargetWeights & w,
@@ -288,7 +299,8 @@ bool laguna_step(
     int                         n_tok,
     int                         kv_start,
     bool                        no_mask,
-    std::vector<float> &        out_logits);
+    std::vector<float> &        out_logits,
+    const class KvFlashPager *  kvflash = nullptr);
 
 // Forward decl (full definition in common/moe_hybrid_storage.h).
 struct MoeHybridStorage;
@@ -306,7 +318,8 @@ bool laguna_step_hybrid(
     bool                        no_mask,
     const MoeHybridStorage &    hyb,
     std::vector<float> &        out_logits,
-    std::vector<int32_t> *      out_selected = nullptr);
+    std::vector<int32_t> *      out_selected = nullptr,
+    const class KvFlashPager *  kvflash = nullptr);
 
 struct LagunaLayerStepGraph {
     ggml_context * ctx = nullptr;
diff --git a/server/src/laguna/laguna_target_graph.cpp b/server/src/laguna/laguna_target_graph.cpp
index 44b1b5cd7..c44d1ee32 100644
--- a/server/src/laguna/laguna_target_graph.cpp
+++ b/server/src/laguna/laguna_target_graph.cpp
@@ -19,6 +19,7 @@
 
 #include "laguna_internal.h"
 #include "../common/moe_hybrid_storage.h"
+#include "../common/kvflash_pager.h"
 #include "common/ggml_graph_precision.h"
 #include "internal.h"
 #include "dflash27b.h"
@@ -44,9 +45,11 @@ static constexpr float LAGUNA_EPS = 1e-6f;
 bool create_laguna_target_cache(const LagunaTargetWeights & w,
                                  int max_ctx,
                                  ggml_backend_t backend,
-                                 LagunaTargetCache & out) {
+                                 LagunaTargetCache & out,
+                                 int ctx_alloc) {
     return create_laguna_target_cache_partial(
-        w, max_ctx, backend, /*layer_begin=*/0, /*layer_end=*/w.n_layer, out);
+        w, max_ctx, backend, /*layer_begin=*/0, /*layer_end=*/w.n_layer, out,
+        ctx_alloc);
 }
 
 bool create_laguna_target_cache_partial(const LagunaTargetWeights & w,
@@ -54,7 +57,8 @@ bool create_laguna_target_cache_partial(const LagunaTargetWeights & w,
                                          ggml_backend_t backend,
                                          int layer_begin,
                                          int layer_end,
-                                         LagunaTargetCache & out) {
+                                         LagunaTargetCache & out,
+                                         int ctx_alloc) {
     if (layer_begin < 0) layer_begin = 0;
     if (layer_end < 0) layer_end = w.n_layer;
     if (layer_begin > layer_end || layer_end > w.n_layer) {
@@ -62,6 +66,9 @@ bool create_laguna_target_cache_partial(const LagunaTargetWeights & w,
         return false;
     }
 
+    // kvflash: tensors at pool capacity, logical bound stays max_ctx.
+    const int ctx_phys = (ctx_alloc > 0 && ctx_alloc < max_ctx) ? ctx_alloc : max_ctx;
+
     out.backend  = backend;
     out.max_ctx  = max_ctx;
     out.cur_pos  = 0;
@@ -88,10 +95,10 @@ bool create_laguna_target_cache_partial(const LagunaTargetWeights & w,
         if (il < layer_begin || il >= layer_end) continue;
         char nm[32];
         std::snprintf(nm, sizeof(nm), "k_l%d", il);
-        ggml_tensor * k = ggml_new_tensor_3d(out.base_ctx, k_type, w.head_dim, max_ctx, w.n_head_kv);
+        ggml_tensor * k = ggml_new_tensor_3d(out.base_ctx, k_type, w.head_dim, ctx_phys, w.n_head_kv);
         ggml_set_name(k, nm);
         std::snprintf(nm, sizeof(nm), "v_l%d", il);
-        ggml_tensor * v = ggml_new_tensor_3d(out.base_ctx, v_type, w.head_dim, max_ctx, w.n_head_kv);
+        ggml_tensor * v = ggml_new_tensor_3d(out.base_ctx, v_type, w.head_dim, ctx_phys, w.n_head_kv);
         ggml_set_name(v, nm);
         out.attn_k[il] = k;
         out.attn_v[il] = v;
@@ -978,8 +985,14 @@ bool laguna_step(
     int                         n_tok,
     int                         kv_start,
     bool                        no_mask,
-    std::vector<float> &        out_logits)
+    std::vector<float> &        out_logits,
+    const KvFlashPager *        kvflash)
 {
+    if (kvflash && no_mask) {
+        std::fprintf(stderr, "laguna_step: kvflash requires masks (slots are "
+                             "relocated; position-implicit masking is invalid)\n");
+        return false;
+    }
     // Same CUDA-graph-replay treatment as laguna_step_hybrid: persistent
     // arena (stable node addresses -> stable graph key), stride-padded KV
     // span, and set_rows K/V append (index is an input, so node properties
@@ -1056,6 +1069,25 @@ bool laguna_step(
     std::vector<int32_t> pos((size_t)n_tok);
     for (int i = 0; i < n_tok; ++i) pos[i] = kv_start + i;
     ggml_backend_tensor_set(pp, pos.data(), 0, ggml_nbytes(pp));
+
+    if (kvflash) {
+        if (!kvi) {
+            std::fprintf(stderr, "laguna_step: kvflash requires the kv_pad "
+                                 "set_rows path (NO_KVPAD / PAD_CPY are incompatible)\n");
+            ggml_free(ctx);
+            return false;
+        }
+        std::vector<int32_t> rows;
+        std::vector<float> mfull, mswa;
+        if (!kvflash_fill_rows_and_masks(*kvflash, kv_start, n_tok, mk_w,
+                                         w.sliding_window, rows, &mfull, &mswa)) {
+            ggml_free(ctx);
+            return false;
+        }
+        ggml_backend_tensor_set(kvi, rows.data(), 0, ggml_nbytes(kvi));
+        ggml_backend_tensor_set(mk_full, mfull.data(), 0, ggml_nbytes(mk_full));
+        ggml_backend_tensor_set(mk_swa, mswa.data(), 0, ggml_nbytes(mk_swa));
+    } else {
     if (kvi) {
         ggml_backend_tensor_set(kvi, pos.data(), 0, ggml_nbytes(kvi));
     }
@@ -1083,6 +1115,7 @@ bool laguna_step(
         }
         ggml_backend_tensor_set(mk_swa, mswa.data(), 0, ggml_nbytes(mk_swa));
     }
+    }
 
     if (ggml_backend_graph_compute(backend, gf) != GGML_STATUS_SUCCESS) {
         std::fprintf(stderr, "laguna_step: graph_compute failed\n");
@@ -1111,8 +1144,14 @@ bool laguna_step_hybrid(
     bool                        no_mask,
     const MoeHybridStorage &    hyb,
     std::vector<float> &        out_logits,
-    std::vector<int32_t> *      out_selected)
+    std::vector<int32_t> *      out_selected,
+    const KvFlashPager *        kvflash)
 {
+    if (kvflash && no_mask) {
+        std::fprintf(stderr, "laguna_step_hybrid: kvflash requires masks (slots "
+                             "are relocated; position-implicit masking is invalid)\n");
+        return false;
+    }
     // Persistent arena: rebuilt graphs land at IDENTICAL addresses every step.
     // The ggml-cuda CUDA-graph cache is keyed on nodes[0] and memcmps node
     // properties (incl. src data pointers); address stability across steps is
@@ -1209,6 +1248,25 @@ bool laguna_step_hybrid(
     std::vector<int32_t> pos((size_t)n_tok);
     for (int i = 0; i < n_tok; ++i) pos[i] = kv_start + i;
     ggml_backend_tensor_set(pp, pos.data(), 0, ggml_nbytes(pp));
+
+    if (kvflash) {
+        if (!kvi) {
+            std::fprintf(stderr, "laguna_step_hybrid: kvflash requires the kv_pad "
+                                 "set_rows path (NO_KVPAD / PAD_CPY are incompatible)\n");
+            ggml_free(ctx);
+            return false;
+        }
+        std::vector<int32_t> rows;
+        std::vector<float> mfull, mswa;
+        if (!kvflash_fill_rows_and_masks(*kvflash, kv_start, n_tok, mk_w,
+                                         w.sliding_window, rows, &mfull, &mswa)) {
+            ggml_free(ctx);
+            return false;
+        }
+        ggml_backend_tensor_set(kvi, rows.data(), 0, ggml_nbytes(kvi));
+        ggml_backend_tensor_set(mk_full, mfull.data(), 0, ggml_nbytes(mk_full));
+        ggml_backend_tensor_set(mk_swa, mswa.data(), 0, ggml_nbytes(mk_swa));
+    } else {
     if (kvi) {
         // set_rows row indices = absolute cache positions of this step's tokens
         ggml_backend_tensor_set(kvi, pos.data(), 0, ggml_nbytes(kvi));
@@ -1232,6 +1290,7 @@ bool laguna_step_hybrid(
         }
         ggml_backend_tensor_set(mk_swa, mswa.data(), 0, ggml_nbytes(mk_swa));
     }
+    }
 
     // Set ALL residency LUTs in two batched H2D copies from the hot stack mapping.
     std::vector<int32_t> lutbuf((size_t)n_expert * (size_t)n_moe);
diff --git a/server/src/qwen3/qwen3_kvflash_scorer.cpp b/server/src/qwen3/qwen3_kvflash_scorer.cpp
new file mode 100644
index 000000000..4dc00c7c9
--- /dev/null
+++ b/server/src/qwen3/qwen3_kvflash_scorer.cpp
@@ -0,0 +1,210 @@
+#include "qwen3_kvflash_scorer.h"
+
+#include "qwen3_drafter_model.h"
+#include "server/tokenizer.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+
+namespace dflash::common {
+
+namespace {
+
+constexpr int kLookahead  = 8;
+constexpr int kPoolKernel = 13;
+constexpr int kMinSegment = 4096;
+
+// Tail-attention token scores for `ids`: mean over the lookahead window of
+// the drafter's running-max, then AvgPool smoothing. Same math as
+// drafter_score_and_compress. Returns false if the forward fails or yields
+// fewer than kLookahead * S values (indexing below would run out of bounds).
+bool score_tokens_direct(DrafterContext & ctx, const std::vector<int32_t> & ids,
+                         std::vector<float> & out) {
+    const int S = (int)ids.size();
+    std::vector<float> running_max;
+    if (!forward_qwen3_drafter_model(ctx.weights, ids, kLookahead, running_max) ||
+        running_max.size() < (size_t)kLookahead * (size_t)S) return false;
+    std::vector<float> score((size_t)S, 0.0f);
+    for (int j = 0; j < S; j++) {   // running_max is read as [kLookahead][S]
+        float s = 0.0f;
+        for (int t = 0; t < kLookahead; t++) s += running_max[(size_t)t * S + j];
+        score[j] = s / kLookahead;
+    }
+    out.assign((size_t)S, 0.0f);
+    const int half = kPoolKernel / 2;
+    for (int j = 0; j < S; j++) {   // centered window, clipped at both ends
+        const int lo = std::max(0, j - half), hi = std::min(S - 1, j + half);
+        float s = 0.0f;
+        for (int k = lo; k <= hi; k++) s += score[k];
+        out[j] = s / (hi - lo + 1);
+    }
+    return true;
+}
+
+// In-place z-score: subtract the mean, divide by (population stddev + 1e-6).
+void z_normalize(float * v, size_t n) {
+    if (n == 0) return;
+    double mu = 0, ss = 0;
+    for (const float * p = v; p != v + n; ++p) mu += *p;
+    mu /= n;
+    for (const float * p = v; p != v + n; ++p) ss += (*p - mu) * (*p - mu);
+    const float scale = 1.0f / ((float)std::sqrt(ss / n) + 1e-6f);
+    for (float * p = v; p != v + n; ++p) *p = (float)((*p - mu) * scale);
+}
+
+// Allocation-failure-tolerant scoring. Try the full forward (z-normalized
+// on success); if it fails and S > kMinSegment, cut at S/2 and recurse on
+// both parts. The left part gets the sequence's last kLookahead ids
+// appended so it is scored against the TRUE query tail (those extra scores
+// are dropped when stitching); the right part already ends with them.
+// Normalization is per segment. The drafter's per-call buffers
+// (~10 KB/token) can fail on a fragmented CUDA heap at 32K+ even with ample
+// free VRAM; this trades exact cross-segment calibration for robustness.
+bool score_tokens_resilient(DrafterContext & ctx, const std::vector<int32_t> & ids,
+                            std::vector<float> & out) {
+    const int total = (int)ids.size();
+    if (score_tokens_direct(ctx, ids, out)) {
+        z_normalize(out.data(), out.size());
+        return true;
+    }
+    if (total <= kMinSegment) {
+        return false;
+    }
+    std::fprintf(stderr, "[kvflash-scorer] forward failed at S=%d, bisecting\n", total);
+    const int cut = total / 2;
+    const auto split_at = ids.begin() + cut;
+
+    // Left half followed by the query tail.
+    std::vector<int32_t> head_ids(ids.begin(), split_at);
+    head_ids.insert(head_ids.end(), ids.end() - kLookahead, ids.end());
+    std::vector<float> head_scores, rest_scores;
+    if (!score_tokens_resilient(ctx, head_ids, head_scores)) return false;
+    if (!score_tokens_resilient(ctx, std::vector<int32_t>(split_at, ids.end()),
+                                rest_scores)) return false;
+
+    out.assign((size_t)total, 0.0f);
+    std::copy_n(head_scores.begin(), cut, out.begin());   // tail scores dropped
+    std::copy_n(rest_scores.begin(), total - cut, out.begin() + cut);
+    return true;
+}
+
+} // namespace
+
+// Mean drafter relevance per `chunk_tokens`-sized chunk of `ids` (the last
+// chunk may be shorter). `out` is cleared first and stays empty on failure.
+bool KvFlashDrafterScorer::score_chunks(const std::vector<int32_t> & ids,
+                                   int chunk_tokens,
+                                   std::vector<float> & out) {
+    out.clear();
+    const int n_ids = (int)ids.size();
+    const bool usable = ctx_ && ctx_->loaded;
+    if (!usable || n_ids < kLookahead + 1 || chunk_tokens <= 0) return false;
+    // ids >= vocab_clamp_ are folded into [1000, vocab_clamp_) before scoring.
+    std::vector<int32_t> folded(ids);
+    if (vocab_clamp_ > 1001) {   // fold range must stay positive
+        const int32_t span = vocab_clamp_ - 1000;
+        for (int32_t & id : folded) if (id >= vocab_clamp_) id = 1000 + id % span;
+    }
+    std::vector<float> per_token;
+    if (!score_tokens_resilient(*ctx_, folded, per_token)) return false;
+
+    out.assign((size_t)((n_ids + chunk_tokens - 1) / chunk_tokens), 0.0f);
+    for (int begin = 0, c = 0; begin < n_ids; begin += chunk_tokens, c++) {
+        const int end = std::min(n_ids, begin + chunk_tokens);
+        float sum = 0.0f;
+        for (int j = begin; j < end; j++) sum += per_token[j];
+        out[c] = sum / (end - begin);
+    }
+    return true;
+}
+
+// ── KvFlashCrossTokScorer ───────────────────────────────────────────────
+
+struct KvFlashCrossTokScorer::Toks {   // definition of the header's opaque Toks
+    Tokenizer target;    // loaded from target_gguf_ in ensure_tokenizers()
+    Tokenizer drafter;   // loaded from drafter_gguf_ in ensure_tokenizers()
+};
+
+KvFlashCrossTokScorer::~KvFlashCrossTokScorer() { delete toks_; }   // toks_ may be null
+
+// Loads both tokenizers on first use. A failed load is remembered in
+// toks_failed_, so later calls return false without touching the files.
+bool KvFlashCrossTokScorer::ensure_tokenizers() {
+    if (toks_ != nullptr) return true;
+    if (toks_failed_) return false;
+    Toks * fresh = new Toks();
+    const bool loaded = fresh->target.load_from_gguf(target_gguf_.c_str()) &&
+                        fresh->drafter.load_from_gguf(drafter_gguf_.c_str());
+    if (loaded) { toks_ = fresh; return true; }
+    std::fprintf(stderr, "[kvflash] cross-tokenizer scorer: tokenizer load "
+                         "failed (%s / %s)\n",
+                 target_gguf_.c_str(), drafter_gguf_.c_str());
+    delete fresh;
+    toks_failed_ = true;
+    return false;
+}
+
+bool KvFlashCrossTokScorer::score_chunks(const std::vector<int32_t> & ids,
+                                         int chunk_tokens,
+                                         std::vector<float> & out) {
+    out.clear();
+    const int n_tgt = (int)ids.size();
+    const bool usable = ctx_ && ctx_->loaded;
+    if (!usable || n_tgt < kLookahead + 1 || chunk_tokens <= 0) return false;
+    if (!ensure_tokenizers()) return false;
+    // Step 1: rebuild the text one target id at a time, remembering where
+    // each id's decoded piece ends. Byte-level BPE pieces concatenate
+    // exactly, so these ends are exact; ids that decode to nothing
+    // (special/template tokens) simply repeat the previous end.
+    std::vector<int32_t> single(1);
+    std::vector<int32_t> piece_end;
+    piece_end.reserve((size_t)n_tgt);
+    std::string text;
+    text.reserve((size_t)n_tgt * 4);
+    for (const int32_t id : ids) {
+        single[0] = id;
+        text += toks_->target.decode(single);
+        piece_end.push_back((int32_t)text.size());
+    }
+
+    // Step 2: re-tokenize for the drafter; keep every drafter token's
+    // midpoint as a char offset into the concatenated per-token decodes.
+    const std::vector<int32_t> dids = toks_->drafter.encode(text);
+    const int n_drf = (int)dids.size();
+    if (n_drf < kLookahead + 1) return false;
+    std::vector<float> midpoint;
+    midpoint.reserve((size_t)n_drf);
+    size_t cursor = 0;
+    for (const int32_t id : dids) {
+        single[0] = id;
+        const size_t width = toks_->drafter.decode(single).size();
+        midpoint.push_back((float)cursor + (float)width * 0.5f);
+        cursor += width;
+    }
+
+    // Step 3: same tail-attention forward as the same-tokenizer scorer.
+    std::vector<float> dscore;
+    if (!score_tokens_resilient(*ctx_, dids, dscore)) return false;
+
+    // Step 4: walk chunks and drafter tokens together. A chunk collects
+    // the not-yet-consumed drafter tokens whose midpoint lies before the
+    // end of the chunk's last target token, and its score is their mean.
+    // A chunk that collects nothing stays at 0, i.e. z-score-neutral.
+    const int n_chunks = (n_tgt + chunk_tokens - 1) / chunk_tokens;
+    out.assign((size_t)n_chunks, 0.0f);
+    int next = 0;
+    for (int c = 0; c < n_chunks; c++) {
+        const int last_tok = std::min(n_tgt, (c + 1) * chunk_tokens) - 1;
+        const float limit = (float)piece_end[(size_t)last_tok];
+        const int first = next;
+        float sum = 0.0f;
+        while (next < n_drf && midpoint[(size_t)next] < limit) {
+            sum += dscore[(size_t)next++];
+        }
+        if (next > first) out[(size_t)c] = sum / (float)(next - first);
+    }
+    return true;
+}
+
+} // namespace dflash::common
diff --git a/server/src/qwen3/qwen3_kvflash_scorer.h b/server/src/qwen3/qwen3_kvflash_scorer.h
new file mode 100644
index 000000000..e0fda5074
--- /dev/null
+++ b/server/src/qwen3/qwen3_kvflash_scorer.h
@@ -0,0 +1,68 @@
+// KvFlashDrafterScorer — pflash drafter as the KV pager's Memory Indexer.
+//
+// Scores 64-token chunks with the same Liu Q-hook tail attention that
+// pflash compression uses (forward_qwen3_drafter_model), but returns the
+// per-chunk relevance scores instead of a compressed token list. The
+// DrafterContext is borrowed: the daemon shares its pflash drafter; the
+// pager itself never depends on this file (see common/kvflash_scorer.h).
+
+#pragma once
+
+#include "kvflash_scorer.h"
+#include "qwen3_drafter.h"
+
+#include <string>
+
+namespace dflash::common {
+
+class KvFlashDrafterScorer : public KvFlashScorer {
+public:
+    // `vocab_clamp`: ids >= clamp are folded into the drafter's vocab range
+    // before scoring. Needed when the target vocabulary is a superset of
+    // the drafter's (e.g. Qwen3.6 target + Qwen3-0.6B drafter); prompt ids
+    // tokenized for the target may be unembeddable by the drafter.
+    explicit KvFlashDrafterScorer(DrafterContext * ctx, int32_t vocab_clamp = 100000)
+        : ctx_(ctx), vocab_clamp_(vocab_clamp) {}
+
+    bool score_chunks(const std::vector<int32_t> & ids, int chunk_tokens,
+                      std::vector<float> & out) override;   // one score per chunk; false on failure
+
+private:
+    DrafterContext * ctx_;    // borrowed (see file header); never freed by this class
+    int32_t vocab_clamp_;     // ids >= this value are folded before scoring
+};
+
+// KvFlashCrossTokScorer — the same drafter scoring for targets that do NOT
+// share the Qwen tokenizer (laguna, gemma4). Relevance is a property of the
+// TEXT, so the bridge is re-tokenization: detokenize the target's history
+// (its own tokenizer, loaded from the target GGUF), tokenize the text with
+// the drafter's tokenizer (from the drafter GGUF), run the same tail-
+// attention forward, then map per-drafter-token scores back onto the
+// target's chunk boundaries by character spans. Tokenizers are host-only
+// and lazy-loaded on first score.
+class KvFlashCrossTokScorer : public KvFlashScorer {
+public:
+    KvFlashCrossTokScorer(DrafterContext * ctx,
+                          std::string target_gguf,
+                          std::string drafter_gguf)
+        : ctx_(ctx), target_gguf_(std::move(target_gguf)),
+          drafter_gguf_(std::move(drafter_gguf)) {}
+    ~KvFlashCrossTokScorer() override;   // defined in the .cpp, where Toks is complete
+    KvFlashCrossTokScorer(const KvFlashCrossTokScorer &) = delete;   // owns toks_ via raw pointer
+    KvFlashCrossTokScorer & operator=(const KvFlashCrossTokScorer &) = delete;
+
+    bool score_chunks(const std::vector<int32_t> & ids, int chunk_tokens,
+                      std::vector<float> & out) override;   // one score per target chunk; false on failure
+
+private:
+    bool ensure_tokenizers();   // lazy load; a failed load is latched in toks_failed_
+
+    DrafterContext * ctx_;      // not freed by this class
+    std::string target_gguf_, drafter_gguf_;   // GGUF paths the tokenizers are loaded from
+    // Pimpl to keep server/tokenizer.h out of backend headers.
+    struct Toks;
+    Toks * toks_ = nullptr;       // null until ensure_tokenizers() succeeds
+    bool   toks_failed_ = false;  // set after a failed load so it is not retried
+};
+
+} // namespace dflash::common
diff --git a/server/src/qwen35/graph_builders.cpp b/server/src/qwen35/graph_builders.cpp
index f41f94cc0..f6c963870 100644
--- a/server/src/qwen35/graph_builders.cpp
+++ b/server/src/qwen35/graph_builders.cpp
@@ -2,6 +2,7 @@
 
 #include "ggml-alloc.h"
 
+#include <algorithm>
 #include <cstdio>
 
 namespace dflash::common {
@@ -88,7 +89,9 @@ bool build_layer_prefn_step(
     int n_tokens,
     bool with_mask,
     int fa_window,
-    int kq_stride_pad) {
+    int kq_stride_pad,
+    bool kvflash) {
+    if (kvflash) with_mask = true;   // slot-space masking is mandatory on the pool
     step_graph_free(sg);
 
     ggml_init_params ip{};
@@ -109,20 +112,34 @@ bool build_layer_prefn_step(
         ggml_set_name(sg.positions, "positions");
         ggml_set_input(sg.positions);
         if (with_mask) {
-            const int max_win_len = cache.max_ctx + n_tokens;
+            // Mask width follows the PHYSICAL tensor capacity (pool-sized
+            // under kvflash) so it agrees with the FA span clamp inside
+            // build_full_attn_block.
+            int phys_ctx = cache.max_ctx;
+            for (ggml_tensor * t : cache.attn_k) {
+                if (t) { phys_ctx = std::min(phys_ctx, (int)t->ne[1]); break; }
+            }
+            const int max_win_len = phys_ctx + n_tokens;
             const int kv_pad = align_up(max_win_len, kq_stride_pad);
             const int q_pad  = align_up(n_tokens, KQ_MASK_PAD);
             sg.attn_mask = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_F16, kv_pad, q_pad);
             ggml_set_name(sg.attn_mask, "attn_mask");
             ggml_set_input(sg.attn_mask);
         }
+        if (kvflash) {
+            sg.kv_write_rows = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_I64,
+                                                  n_tokens, w.n_head_kv);
+            ggml_set_name(sg.kv_write_rows, "kv_write_rows");
+            ggml_set_input(sg.kv_write_rows);
+        }
     }
 
     sg.gf = ggml_new_graph_custom(sg.ctx, 16384, false);
     QwenLayerPrefnOutputs go = build_qwen35_layer_prefn(
         sg.ctx, sg.gf, w, cache, layer_idx,
         sg.inp_embed, sg.positions, sg.attn_mask,
-        kv_start, n_tokens, fa_window);
+        kv_start, n_tokens, fa_window,
+        sg.kv_write_rows);
     if (!go.residual || !go.post) return false;
     sg.ffn_residual = go.residual;
     sg.ffn_post = go.post;
@@ -236,7 +253,8 @@ bool build_target_step(
     int fa_window,
     bool last_token_logits_only,
     int kq_stride_pad,
-    bool capture_moe_router) {
+    bool capture_moe_router,
+    bool kvflash_mask) {
     step_graph_free(sg);
 
     // Persistent thread_local arena: rebuilt step graphs land at identical
@@ -266,7 +284,13 @@ bool build_target_step(
         // Use max_ctx for mask allocation so the gallocr buffer never needs to
         // grow as kv_start increases during generation.  The actual mask is
         // filled only up to kv_start + n_tokens; the excess is don't-care.
-        const int max_win_len = cache.max_ctx + n_tokens;
+        // kvflash mode: the physical span is the (smaller) pool capacity of
+        // the attention tensors, so size the mask from those instead.
+        int phys_ctx = cache.max_ctx;
+        for (auto * t : cache.attn_k) {
+            if (t) { phys_ctx = std::min(phys_ctx, (int)t->ne[1]); break; }
+        }
+        const int max_win_len = phys_ctx + n_tokens;
         const int kv_pad = align_up(max_win_len, kq_stride_pad);
         const int q_pad  = align_up(n_tokens, KQ_MASK_PAD);
         sg.attn_mask = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_F16, kv_pad, q_pad);
@@ -280,8 +304,16 @@ bool build_target_step(
     // DFLASH_QWEN35_NO_KVPAD=1 restores the legacy cpy append + exact-length
     // FA span (per-step node properties -> no CUDA-graph replay).
     static const bool g_no_kvpad = (std::getenv("DFLASH_QWEN35_NO_KVPAD") != nullptr);
-    const bool use_kv_write_rows = (!g_no_kvpad && n_tokens == 1 && fa_window == 0 &&
-                                    !with_mask && !capture && !capture_delta_intermediate);
+    // kvflash_mask: kvflash mode. The mask carries pool slot validity
+    // (uploaded by the caller before EVERY compute — the input's buffer
+    // region is reused by graph execution) and set_rows carries per-token
+    // physical slots, so the slot-mapped write stays active for masked,
+    // multi-token, and feature-capturing forwards (decode AND spec verify).
+    const bool use_kv_write_rows =
+        !g_no_kvpad && !capture_delta_intermediate &&
+        (kvflash_mask
+             ? (fa_window == 0)
+             : (n_tokens == 1 && fa_window == 0 && !with_mask && !capture));
     if (use_kv_write_rows) {
         sg.kv_write_rows = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_I64,
                                               n_tokens, w.n_head_kv);
diff --git a/server/src/qwen35/graph_builders.h b/server/src/qwen35/graph_builders.h
index 69a1e89e4..ca11a8169 100644
--- a/server/src/qwen35/graph_builders.h
+++ b/server/src/qwen35/graph_builders.h
@@ -40,6 +40,10 @@ bool build_layer_step(
     int fa_window = 0,
     int kq_stride_pad = KQ_MASK_PAD);
 
+// `kvflash`: pooled mode — KV rows go through a set_rows input
+// (sg.kv_write_rows, [n_tokens, n_head_kv] ne0-major slots) and the mask
+// (forced on) is sized to the PHYSICAL tensor capacity so the caller can
+// fill it in slot space. Caller allocates slots and fills rows + mask.
 bool build_layer_prefn_step(
     StepGraph & sg,
     const TargetWeights & w,
@@ -50,7 +54,8 @@ bool build_layer_prefn_step(
     int n_tokens,
     bool with_mask,
     int fa_window = 0,
-    int kq_stride_pad = KQ_MASK_PAD);
+    int kq_stride_pad = KQ_MASK_PAD,
+    bool kvflash = false);
 
 // Full layer graph for hybrid decode: pre-FFN + MoE FFN + shared + residual in one compute.
 // Output: sg.hidden_input = layer_output, sg.moe_selected = router selections.
@@ -67,6 +72,11 @@ bool build_hybrid_full_layer_step(
     int kq_stride_pad = KQ_MASK_PAD);
 
 // Full target forward: chain mode (all layers, logits + argmax output).
+//
+// `kvflash_mask`: kvflash pooled mode — keep the set_rows KV write active
+// even though a mask is requested (the mask carries pool-slot validity and
+// must be re-uploaded by the caller before every compute). Used by both
+// single-token decode and multi-token spec verify; requires fa_window == 0.
 bool build_target_step(
     StepGraph & sg,
     const TargetWeights & w,
@@ -80,7 +90,8 @@ bool build_target_step(
     int fa_window = 0,
     bool last_token_logits_only = false,
     int kq_stride_pad = KQ_MASK_PAD,
-    bool capture_moe_router = false);
+    bool capture_moe_router = false,
+    bool kvflash_mask = false);
 
 // Full target forward: DDTree tree-verify mode.
 bool build_target_step_tree(
diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp
index c22b37ed5..4feb08b03 100644
--- a/server/src/qwen35/qwen35_backend.cpp
+++ b/server/src/qwen35/qwen35_backend.cpp
@@ -10,6 +10,7 @@
 #include "common/io_utils.h"
 #include "common/restore_delta.h"
 #include "qwen3/qwen3_drafter.h"
+#include "qwen3/qwen3_kvflash_scorer.h"
 
 #include "ggml-cuda.h"
 #include "common/snapshot_backend.h"
@@ -26,6 +27,8 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
+#include "kv_quant.h"
+
 namespace dflash::common {
 
 namespace {
@@ -215,11 +218,63 @@ bool Qwen35Backend::init() {
     const int max_verify_tokens = cfg_.ddtree_mode
         ? std::max<int>(dw_.block_size, cfg_.ddtree_budget + 1)
         : dw_.block_size;
+    // kvflash (bounded residency): pool size from the env, rounded/floored/
+    // clamped by the shared reader (256-stride keeps FA vec-kernel
+    // eligibility; the floor keeps eviction from deadlocking).
+    // Drafter-scored residency is the DEFAULT policy: explicit
+    // --prefill-drafter first, then the well-known locations next to the
+    // model (Spark's pattern). LRU is the fallback when nothing is found
+    // (or the explicit choice via --kvflash-policy lru).
+    if (std::getenv("DFLASH_KVFLASH")) {
+        kvflash_drafter_path_ = kvflash_find_drafter(cfg_.target_path);
+    }
+    // "auto" sizes the pool from the GPU: weights are resident at this
+    // point and the cache is not yet allocated, so device-free minus a
+    // reserve (compute buffers + the drafter when expected) is what the
+    // pool can really use, converted at this model's pooled-KV density.
+    KvFlashAutoBudget kvf_budget;
+    {
+        size_t gpu_free = 0, gpu_total = 0;
+        if (ggml_backend_dev_t dev = ggml_backend_get_device(target_backend_)) {
+            ggml_backend_dev_memory(dev, &gpu_free, &gpu_total);
+        }
+        ggml_type kv_k = GGML_TYPE_Q8_0, kv_v = GGML_TYPE_Q8_0;
+        dflash::resolve_kv_types(kv_k, kv_v);
+        const int n_full = w_.n_layer / w_.full_attention_interval;
+        kvf_budget.free_bytes      = (int64_t)gpu_free;
+        kvf_budget.bytes_per_token = (int64_t)n_full * w_.n_head_kv *
+            (int64_t)(ggml_row_size(kv_k, w_.n_embd_head_k) +
+                      ggml_row_size(kv_v, w_.n_embd_head_v));
+        kvf_budget.reserve_bytes   = (int64_t)(1.5 * 1073741824.0) +
+            (kvflash_drafter_path_.empty() ? 0 : (int64_t)(1.7 * 1073741824.0));
+    }
+    kvflash_tokens_ = kvflash_pool_from_env(cfg_.device.max_ctx, KvFlashConfig{},
+                                            !kvflash_drafter_path_.empty(),
+                                            kvf_budget);
+    if (kvflash_tokens_ > 0) {
+        kvflash_tau_ = std::max(1, env_int_or_default("DFLASH_KVFLASH_TAU", 64));
+    }
     if (!create_target_cache(w_, cfg_.device.max_ctx, max_verify_tokens, target_backend_, cache_,
-                             /*prefill_only=*/true)) {
+                             /*prefill_only=*/true, /*ctx_alloc=*/kvflash_tokens_)) {
         std::fprintf(stderr, "cache: %s\n", dflash27b_last_error());
         return false;
     }
+    if (kvflash_active()) {
+        KvFlashConfig pc;
+        pc.pool_tokens = kvflash_tokens_;
+        if (!kvflash_pager_.attach(pc, cache_.attn_k, cache_.attn_v)) {
+            std::fprintf(stderr, "kvflash: pager attach failed (pool=%d)\n", kvflash_tokens_);
+            return false;
+        }
+        std::printf("[kvflash] resident pool %d tokens (logical max_ctx %d), "
+                    "tau=%d, policy=%s\n",
+                    kvflash_tokens_, cfg_.device.max_ctx, kvflash_tau_,
+                    !kvflash_drafter_path_.empty()
+                        ? "drafter (attaches on first reselect)"
+                        : "lru (recency-only: no Qwen3-0.6B drafter found "
+                          "next to the model or in --prefill-drafter)");
+        std::fflush(stdout);
+    }
 
     // Init feature mirror when draft model is available (needed for spec decode).
     // On single-GPU, this is an F32 conversion buffer; on split-GPU, a cross-device mirror.
@@ -290,6 +345,7 @@ bool Qwen35Backend::unpark(const std::string & what) {
             std::fprintf(stderr, "[unpark] target: %s\n", dflash27b_last_error());
             return false;
         }
+        kvflash_drafter_failed_ = false;   // fresh VRAM: allow a retry
         target_parked_ = false;
         std::printf("[unpark] target restored\n"); std::fflush(stdout);
     }
@@ -340,6 +396,22 @@ bool Qwen35Backend::unpark(const std::string & what) {
 
 bool Qwen35Backend::snapshot_save(int slot) {
     if (slot < 0 || slot >= PREFIX_SLOTS) return false;
+    // kvflash: snapshots right-size to cur_pos (a LOGICAL position that can
+    // exceed the physical pool once decode has paged) and copy rows assuming
+    // the identity layout, which pooled prefill / eviction breaks. Pooled
+    // snapshots need page-table serialization (follow-up); refuse them here.
+    if (kvflash_active() &&
+        (cache_.cur_pos > kvflash_tokens_ || !kvflash_pager_.is_identity())) {
+        static bool warned = false;   // log only the first refusal
+        if (!warned) {
+            std::fprintf(stderr, "[kvflash] snapshot skipped: cur_pos %d, pool %d, "
+                                 "%s layout (pooled snapshots are a follow-up)\n",
+                         cache_.cur_pos, kvflash_tokens_,
+                         kvflash_pager_.is_identity() ? "identity" : "relocated");
+            warned = true;
+        }
+        return false;
+    }
     PrefixSnapshot & snap = prefix_snapshots_[slot];
     return snapshot_target_cache(w_, cache_, snap_backend_, snap);
 }
@@ -488,6 +560,13 @@ ModelBackend::CompressResult Qwen35Backend::compress(const CompressRequest & req
         }
         drafter_loaded_ = true;
         std::fprintf(stderr, "[compress] drafter ready\n");
+        // pflash + kvflash synergy: the drafter doubles as the pool's
+        // Memory Indexer (tau-step reselect). Pager stays LRU without it.
+        if (kvflash_active() && !kvflash_scorer_) {
+            kvflash_scorer_ = std::make_unique<KvFlashDrafterScorer>(&drafter_ctx_);
+            std::fprintf(stderr, "[kvflash] drafter scorer attached (tau=%d)\n",
+                         kvflash_tau_);
+        }
     }
 
     result.compressed_ids = drafter_score_and_compress(
@@ -544,6 +623,8 @@ bool Qwen35Backend::handle_compress(const std::string & line, const DaemonIO & i
 
 void Qwen35Backend::free_drafter() {
     if (drafter_loaded_) {
+        // The kvflash scorer borrows drafter_ctx_; drop it first.
+        kvflash_scorer_.reset();
         // Drafter has its own backend — do a full free (weights + backend)
         dflash::common::free_drafter(drafter_ctx_);
         drafter_loaded_ = false;
@@ -579,6 +660,10 @@ DFlashTarget * Qwen35Backend::dflash_target() {
         dflash_target_ = std::make_unique<Qwen35DFlashTarget>(
             w_, cache_, target_backend_, sg_,
             cfg_.kq_stride_pad, cfg_.fa_window);
+        if (kvflash_active()) {
+            static_cast<Qwen35DFlashTarget *>(dflash_target_.get())
+                ->set_kvflash_pager(&kvflash_pager_);
+        }
     }
     return dflash_target_.get();
 }
@@ -856,6 +941,32 @@ int Qwen35Backend::do_prefill(const std::vector<int32_t> & tokens,
     const int prompt_len = (int)tokens.size();
     prefill_last_logits_valid_ = false;
 
+    // kvflash: a prompt that fits the pool prefills contiguously (identity
+    // mapping, normal chunking). A LARGER prompt switches to POOLED CHUNKED
+    // PREFILL: pager-chunk-sized batches whose KV rows are slot-mapped via
+    // set_rows, with a slot-space mask per chunk and live eviction as the
+    // pool fills (constant VRAM, linear time). Restore offsets are not
+    // supported in the pooled path (a relocated prefix cannot be restored
+    // identity-style in the first place).
+    const bool kvf_paged = kvflash_active() &&
+        kv_offset + prompt_len > kvflash_tokens_ - kvflash_pager_.chunk_tokens();
+    if (kvf_paged && kv_offset != 0) {
+        std::fprintf(stderr,
+            "[kvflash] restored prefix (%d) + prompt (%d) exceeds pool %d; "
+            "pooled prefill requires a fresh request\n",
+            kv_offset, prompt_len, kvflash_tokens_);
+        set_last_error("kvflash: restore + pooled prefill unsupported");
+        return -1;
+    }
+    if (kvf_paged) {
+        prefill_ubatch = kvflash_pager_.chunk_tokens();
+        kvflash_pager_.reset();
+        std::printf("[kvflash] pooled prefill: %d tokens through a %d-token pool "
+                    "(%d-token chunks, evicting)\n",
+                    prompt_len, kvflash_tokens_, prefill_ubatch);
+        std::fflush(stdout);
+    }
+
     // Skip KV-cache migration when resuming from a snapshot — the cache was
     // already migrated when the snapshot was taken; re-running migrate would
     // clobber the restored state.
@@ -887,18 +998,39 @@ int Qwen35Backend::do_prefill(const std::vector<int32_t> & tokens,
         // incl. the user message -> a different user msg restores garbage.)
         if (snap_slot >= 0 && snap_pos >= 0 &&
             kv_pos <= snap_pos && snap_pos < kv_pos + n_tokens) {
-            if (kv_pos > kv_offset) {   // skip a degenerate short-prefix snapshot
+            if (kv_pos > kv_offset && !kvf_paged) {   // skip degenerate / relocated
                 cache_.cur_pos = kv_pos;
                 if (snapshot_save(snap_slot)) {
                     std::printf("[snap] boundary slot=%d cur_pos=%d (req snap_pos=%d)\n",
                                 snap_slot, kv_pos, snap_pos);
                     std::fflush(stdout);
                 }
+            } else if (kvf_paged) {
+                std::fprintf(stderr, "[kvflash] boundary snapshot skipped: pooled "
+                                     "prefill relocates chunks\n");
             }
             snap_pos = -1;
             snap_slot = -1;
         }
-        const bool with_mask = (cfg_.kq_stride_pad > KQ_MASK_PAD) || (n_tokens > 1);
+        const bool with_mask = kvf_paged ||
+            (cfg_.kq_stride_pad > KQ_MASK_PAD) || (n_tokens > 1);
+
+        // kvflash pooled prefill: allocate this chunk's slots up front
+        // (evicting the lowest-priority resident chunk once the pool fills).
+        std::vector<int> kvf_slots;
+        if (kvf_paged) {
+            kvf_slots.resize((size_t)n_tokens);
+            bool ok = true;
+            for (int i = 0; i < n_tokens; i++) {
+                kvf_slots[(size_t)i] = kvflash_pager_.slot_for(kv_pos + i);
+                if (kvf_slots[(size_t)i] < 0) { ok = false; break; }
+            }
+            if (!ok) {
+                std::fprintf(stderr, "[kvflash] pooled prefill: slot alloc failed @%d\n", kv_pos);
+                set_last_error("kvflash: no evictable pool block");
+                return -1;
+            }
+        }
 
         // Prefill always uses full attention (fa_window=0) so that all
         // positions encode the complete context — critical for tool
@@ -911,10 +1043,26 @@ int Qwen35Backend::do_prefill(const std::vector<int32_t> & tokens,
                                /*fa_window=*/0,
                                /*last_token_logits_only=*/(start + n_tokens < prompt_len),
                                cfg_.kq_stride_pad,
-                               should_capture_moe_router())) {
+                               should_capture_moe_router(),
+                               /*kvflash_mask=*/kvf_paged)) {
             std::fprintf(stderr, "prefill build @%d\n", kv_pos);
             return -1;
         }
+        if (kvf_paged) {
+            if (!sg_.kv_write_rows) {
+                std::fprintf(stderr, "[kvflash] pooled prefill requires the set_rows path\n");
+                return -1;
+            }
+            // [n_tokens, n_head_kv] ne0-major (see verify_batch).
+            std::vector<int64_t> rows((size_t)n_tokens * w_.n_head_kv);
+            for (int h = 0; h < w_.n_head_kv; h++) {
+                for (int i = 0; i < n_tokens; i++) {
+                    rows[(size_t)h * n_tokens + i] = kvf_slots[(size_t)i];
+                }
+            }
+            ggml_backend_tensor_set(sg_.kv_write_rows, rows.data(), 0,
+                                    sizeof(int64_t) * rows.size());
+        }
 
         // Embed
         if (!w_.embedder.embed(tokens.data() + start, n_tokens, embed_buf.data())) {
@@ -936,7 +1084,34 @@ int Qwen35Backend::do_prefill(const std::vector<int32_t> & tokens,
                                 sizeof(int32_t) * pos_buf.size());
 
         // Mask — full attention during prefill (no windowing)
-        if (sg_.attn_mask) {
+        if (sg_.attn_mask && kvf_paged) {
+            // Slot-space mask (same recipe as verify_batch): row q attends
+            // (a) the slots of resident chunks holding positions < kv_pos
+            // and (b) this chunk's own slots, causally.
+            constexpr uint16_t F16_ZERO = 0x0000, F16_NEG_INF = 0xFC00;
+            const size_t kvd = (size_t)sg_.attn_mask->ne[0];
+            const int q_pad = (int)sg_.attn_mask->ne[1];
+            std::vector<uint16_t> mask_buf((size_t)kvd * q_pad, F16_NEG_INF);
+            const int ct = kvflash_pager_.chunk_tokens();
+            for (int c = 0; c < kvflash_pager_.n_chunks(); c++) {
+                const int blk = kvflash_pager_.block_of(c);
+                if (blk < 0) continue;
+                for (int i = 0; i < ct; i++) {
+                    if ((int64_t)c * ct + i >= kv_pos) break;
+                    mask_buf[(size_t)blk * ct + i] = F16_ZERO;
+                }
+            }
+            for (int q = 1; q < n_tokens; q++) {
+                std::memcpy(mask_buf.data() + (size_t)q * kvd, mask_buf.data(), kvd * 2);
+            }
+            for (int q = 0; q < n_tokens; q++) {
+                for (int i = 0; i <= q; i++) {
+                    mask_buf[(size_t)q * kvd + kvf_slots[(size_t)i]] = F16_ZERO;
+                }
+            }
+            ggml_backend_tensor_set(sg_.attn_mask, mask_buf.data(), 0,
+                                    sizeof(uint16_t) * mask_buf.size());
+        } else if (sg_.attn_mask) {
             const int win_start = 0;
             const int kv_len = kv_pos + n_tokens - win_start;
             std::vector<uint16_t> mask_buf;
@@ -979,6 +1154,18 @@ int Qwen35Backend::do_prefill(const std::vector<int32_t> & tokens,
         start += n_tokens;
     }
 
+    if (kvflash_active()) {
+        if (kvf_paged) {
+            // The pager mapping was built live during the pooled prefill;
+            // only the history / hygiene parts of the sync apply.
+            kvflash_history_.assign(tokens.begin(), tokens.end());
+            kvflash_pager_.zero_free_blocks();
+            kvflash_mask_epoch_ = (uint64_t)-1;
+        } else {
+            kvflash_sync_prefill(committed, tokens, kv_offset);
+        }
+    }
+
     // End-of-prefill snapshot: scoped disk-cache saves (auto/fixed policy)
     // request snap_pos == prompt end, which never falls inside a chunk so the
     // boundary branch above cannot fire. Taking the snapshot here changes
@@ -995,6 +1182,104 @@ int Qwen35Backend::do_prefill(const std::vector<int32_t> & tokens,
     return committed;
 }
 
+// ── kvflash helpers ─────────────────────────────────────────────────
+
+void Qwen35Backend::kvflash_sync_prefill(int committed,
+                                         const std::vector<int32_t> & tokens,
+                                         int kv_offset) {
+    // Rows written by prefill (or a snapshot restore) sit back to back at
+    // physical [0, committed). Empty the pager and request every position
+    // in order so each logical position lands on the same-numbered slot.
+    kvflash_pager_.reset();
+    for (int pos = 0; pos < committed; ++pos) {
+        const int got = kvflash_pager_.slot_for(pos);
+        if (got == pos) continue;
+        // Not expected while prompt <= pool (a fresh pager hands blocks out
+        // in order); log-only tripwire in case that order ever changes.
+        std::fprintf(stderr, "[kvflash] prefill slot mismatch %d != %d\n", got, pos);
+    }
+    // Token history: a non-zero kv_offset keeps (or zero-pads to) kv_offset
+    // leading entries, since restored prefix ids are unknown here.
+    if (kv_offset != 0) {
+        kvflash_history_.resize((size_t)kv_offset, 0);
+        kvflash_history_.insert(kvflash_history_.end(), tokens.begin(), tokens.end());
+    } else {
+        kvflash_history_.assign(tokens.begin(), tokens.end());
+    }
+    // Free slots still hold the previous request's rows, and the maskless
+    // qwen35moe pipelined decode reads the whole padded pool span.
+    kvflash_pager_.zero_free_blocks();
+    kvflash_mask_epoch_ = (uint64_t)-1;
+}
+
+void Qwen35Backend::kvflash_upload_mask() {
+    auto * const mask = sg_.attn_mask;
+    if (!mask) return;
+    const size_t n_elems = (size_t)mask->ne[0] * mask->ne[1];
+    // Rebuild the host mirror only on a size change or a new pager epoch.
+    if (kvflash_mask_buf_.size() != n_elems || kvflash_mask_epoch_ != kvflash_pager_.epoch()) {
+        kvflash_mask_buf_.assign(n_elems, F16_NEG_INF);
+        kvflash_pager_.fill_slot_mask(kvflash_mask_buf_.data());   // q row 0
+        kvflash_mask_epoch_ = kvflash_pager_.epoch();
+    }
+    // Upload on EVERY call: graph execution reuses the input's buffer region.
+    ggml_backend_tensor_set(mask, kvflash_mask_buf_.data(), 0, sizeof(uint16_t) * n_elems);
+}
+
+// Attach the drafter as the residency scorer outside the pflash compress
+// path: with `--kvflash --prefill-drafter <gguf>` but compression off, the
+// drafter would otherwise never load and the pool would silently run
+// recency-only LRU. Loads lazily on the first reselect that needs it (and
+// re-attaches after a draft-residency release frees the drafter).
+void Qwen35Backend::kvflash_ensure_scorer() {
+    // No-op when attached already, no drafter is configured, or a load failed.
+    if (kvflash_scorer_) return;
+    if (kvflash_drafter_failed_ || kvflash_drafter_path_.empty()) return;
+    if (!drafter_loaded_) {
+        ggml_backend_synchronize(target_backend_);
+        if (draft_backend_) ggml_backend_synchronize(draft_backend_);
+        std::fprintf(stderr, "[kvflash] loading drafter for residency scoring: %s\n",
+                     kvflash_drafter_path_.c_str());
+        if (!load_drafter(kvflash_drafter_path_, /*gpu_layers=*/999,
+                          cfg_.device.gpu, drafter_ctx_)) {
+            kvflash_drafter_failed_ = true;   // remembered: the load is not retried
+            std::fprintf(stderr, "[kvflash] drafter load failed (%s); staying on "
+                                 "LRU residency\n", dflash27b_last_error());
+            return;
+        }
+        drafter_loaded_ = true;
+    }
+    kvflash_scorer_ = std::make_unique<KvFlashDrafterScorer>(&drafter_ctx_);
+    std::fprintf(stderr, "[kvflash] drafter scorer attached (tau=%d)\n", kvflash_tau_);
+}
+
+void Qwen35Backend::kvflash_maybe_reselect(int generated) {
+    if (kvflash_tau_ <= 0) return;
+    // The rescore interval grows with history: one rescore is a full 0.6B
+    // re-prefill (~0.11 ms per history token; measured 0.9 s @8K, ~46 s
+    // bisected @256K) against ~30 tok/s decode. Holding that overhead near
+    // 15% of decode time gives history/45; the configured tau is the floor.
+    const int interval = std::max<int>(kvflash_tau_, (int)(kvflash_history_.size() / 45));
+    if (generated % interval != 0) return;
+    // Drafter loads lazily, only once a rescore is due (not on first tokens).
+    if (!kvflash_scorer_) {
+        kvflash_ensure_scorer();
+        if (!kvflash_scorer_) return;
+    }
+    if (!kvflash_scorer_->score_chunks(kvflash_history_, kvflash_pager_.chunk_tokens(), kvflash_scores_)) {
+        return;  // scorer failure -> keep LRU behavior this round
+    }
+    kvflash_pager_.score_hook = [this](int c) {
+        return c < (int)kvflash_scores_.size() ? kvflash_scores_[c] : 1e30f;
+    };
+    const int events = kvflash_pager_.reselect();
+    if (events <= 0) return;
+    std::fprintf(stderr, "[kvflash] reselect @gen=%d: %d page events "
+                 "(resident %d/%d blocks)\n",
+                 generated, events, kvflash_pager_.resident_blocks(),
+                 kvflash_tokens_ / kvflash_pager_.chunk_tokens());
+}
+
 bool Qwen35Backend::do_ar_decode(int committed, int n_gen,
                                   std::vector<int32_t> & out_tokens,
                                   const DaemonIO & io,
@@ -1127,6 +1412,7 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen,
         maybe_force_close(first_tok, committed);
         out_tokens.push_back(first_tok);
         io.emit(first_tok);
+        if (kvflash_active()) kvflash_history_.push_back(first_tok);
         if (IS_EOS_TOK(first_tok, w_)) return true;
         committed++;
         cache_.cur_pos = committed;
@@ -1141,24 +1427,39 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen,
         int32_t pos4[4] = {committed, committed, committed, 0};
         ggml_backend_tensor_set(sg_.positions, pos4, 0, sizeof(int32_t) * 4);
 
+        // kvflash: graph carries a slot-validity mask alongside the
+        // step-invariant set_rows write; the FA span clamps to the pool.
+        const bool pool = kvflash_active();
         if (!build_target_step(sg_, w_, cache_, target_backend_,
                                /*kv_start=*/committed, /*n_tokens=*/1,
-                               /*with_mask=*/false, /*capture=*/false,
+                               /*with_mask=*/pool, /*capture=*/false,
                                /*capture_delta_intermediate=*/false,
                                /*fa_window=*/0,
                                /*last_token_logits_only=*/false,
                                cfg_.kq_stride_pad,
-                               should_capture_moe_router())) {
+                               should_capture_moe_router(),
+                               /*kvflash_mask=*/pool)) {
             return false;
         }
 
-        // Fill kv_write_rows with this step's cache slot (committed) for set_rows.
+        // Fill kv_write_rows with this step's cache slot for set_rows:
+        // the logical position directly, or its pool slot in kvflash mode.
         if (sg_.kv_write_rows) {
             const int n_head_kv = w_.n_head_kv;
-            std::vector<int64_t> row_vals(n_head_kv, (int64_t)committed);
+            const int64_t slot = pool ? (int64_t)kvflash_pager_.slot_for(committed)
+                                      : (int64_t)committed;
+            if (pool && slot < 0) {
+                std::fprintf(stderr, "[kvflash] no pool slot at pos %d "
+                                     "(pool %d exhausted)\n",
+                             committed, kvflash_tokens_);
+                set_last_error("kvflash: no evictable pool block");
+                return false;
+            }
+            std::vector<int64_t> row_vals(n_head_kv, slot);
             ggml_backend_tensor_set(sg_.kv_write_rows, row_vals.data(), 0,
                                     sizeof(int64_t) * n_head_kv);
         }
+        if (pool) kvflash_upload_mask();
 
         auto st = ggml_backend_graph_compute(target_backend_, sg_.gf);
         if (st != GGML_STATUS_SUCCESS) return false;
@@ -1220,6 +1521,10 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen,
         io.emit(next_tok);
         committed++;
         cache_.cur_pos = committed;
+        if (pool) {
+            kvflash_history_.push_back(next_tok);
+            kvflash_maybe_reselect((int)(out_tokens.size() - out_tokens_at_entry));
+        }
         if (io.cancelled) break;
 
         if (IS_EOS_TOK(next_tok, w_)) break;
@@ -1352,6 +1657,12 @@ bool Qwen35Backend::do_spec_decode(int committed, int n_gen,
     // - draft model loaded and not parked
     // - feature mirror initialized
     // - greedy decoding (no logit processing) — spec decode uses argmax verification
+    // - kvflash: verify_batch is slot-mapped (Qwen35DFlashTarget pooled
+    //   path), and that covers --ddtree too: in the daemon, ddtree_mode
+    //   configures larger verify intermediates + fast_rollback, whose
+    //   snapshot_kv/restore_kv only touch DeltaNet/conv state (pool-
+    //   neutral); generation runs this same chain loop either way. The
+    //   tree-verify graphs exist only in the test harness (test_dflash).
     const bool can_spec = cfg_.draft_path
         && !draft_parked_
         && (cfg_.remote_draft.enabled()
diff --git a/server/src/qwen35/qwen35_backend.h b/server/src/qwen35/qwen35_backend.h
index 59a105fc9..0df4df036 100644
--- a/server/src/qwen35/qwen35_backend.h
+++ b/server/src/qwen35/qwen35_backend.h
@@ -21,6 +21,8 @@
 #include "dflash_feature_ring.h"
 #include "internal.h"         // TargetWeights, TargetCache, DraftWeights, PrefixSnapshot
 #include "qwen3/qwen3_drafter.h"  // DrafterContext, load_drafter, free_drafter, drafter_score_and_compress
+#include "kvflash_pager.h"         // bounded KV residency pool
+#include "kvflash_scorer.h"        // chunk-relevance policy interface
 
 #include "ggml.h"
 #include "ggml-backend.h"
@@ -158,6 +160,40 @@ class Qwen35Backend : public ModelBackend {
     // ── Configuration ────────────────────────────────────────────────
     Qwen35Config cfg_;
 
+    // ── kvflash (bounded KV residency, FlashMemory-style) ────────────
+    // Active when kvflash_tokens_ > 0 (env DFLASH_KVFLASH / --kvflash):
+    // attention KV tensors are allocated at pool capacity, logical
+    // positions map to pool slots via kvflash_pager_, cold chunks page to
+    // host. Policy-agnostic: with no scorer the pager is LRU; when the
+    // pflash drafter is loaded it becomes the reselect scorer (kvflash_tau_
+    // is the minimum rescore interval). Spec-decode verify is slot-mapped too.
+    // Protected: the MoE subclass routes its pipelined decode loops and
+    // hybrid prefill through the same pager/history/reselect state.
+    KvFlashPager                   kvflash_pager_;
+    std::unique_ptr<KvFlashScorer> kvflash_scorer_;
+    std::vector<int32_t>           kvflash_history_;     // prompt + generated ids
+    std::vector<float>             kvflash_scores_;      // latest chunk scores
+    std::vector<uint16_t>          kvflash_mask_buf_;    // host mirror of slot mask
+    std::string                    kvflash_drafter_path_; // DFLASH_KVFLASH_DRAFTER
+    uint64_t                       kvflash_mask_epoch_ = (uint64_t)-1;
+    int  kvflash_tokens_ = 0;                       // 0 = off
+    int  kvflash_tau_    = 64;
+    bool kvflash_drafter_failed_ = false;           // don't retry a failed load
+    bool kvflash_active() const { return kvflash_tokens_ > 0; }
+    // Rebuild pager mapping after (re)prefill: positions [0, committed)
+    // occupy pool slots identity-mapped (prefill is contiguous).
+    void kvflash_sync_prefill(int committed, const std::vector<int32_t> & tokens,
+                              int kv_offset);
+    // Upload the slot-validity mask (host rebuild on epoch change, device
+    // upload every step — the input's buffer region is reused by compute).
+    void kvflash_upload_mask();
+    // Drafter rescore + reselect every max(kvflash_tau_, history/45) generated tokens.
+    void kvflash_maybe_reselect(int generated);
+    // Attach the drafter scorer if a drafter path is configured and the
+    // scorer is missing (lazy-loads the drafter on first need; also heals
+    // after a residency release frees it). No-op without a path.
+    void kvflash_ensure_scorer();
+
 private:
     // ── GPU backends ─────────────────────────────────────────────────
     ggml_backend_t target_backend_ = nullptr;
diff --git a/server/src/qwen35/qwen35_dflash_target.cpp b/server/src/qwen35/qwen35_dflash_target.cpp
index 65713d1bb..5af4490af 100644
--- a/server/src/qwen35/qwen35_dflash_target.cpp
+++ b/server/src/qwen35/qwen35_dflash_target.cpp
@@ -5,6 +5,8 @@
 #include "step_graph.h"
 #include "attn_masks.h"
 
+#include <cstring>
+
 namespace dflash::common {
 
 Qwen35DFlashTarget::~Qwen35DFlashTarget() {
@@ -33,18 +35,53 @@ bool Qwen35DFlashTarget::verify_batch(
     if (n_tokens <= 0) return false;
 
     const int hidden = w_.n_embd;
-    const bool need_mask = (kq_stride_pad_ > KQ_MASK_PAD) || (n_tokens > 1);
+    const bool pool = pager_ != nullptr;
+    const bool need_mask = pool || (kq_stride_pad_ > KQ_MASK_PAD) || (n_tokens > 1);
+
+    // kvflash: allocate slots for the verify block up front (may evict at
+    // a chunk boundary; protections keep sinks + the tail window safe).
+    std::vector<int> slots;
+    if (pool) {
+        slots.resize(n_tokens);
+        for (int i = 0; i < n_tokens; i++) {
+            slots[i] = pager_->slot_for(base_pos + i);
+            if (slots[i] < 0) {
+                std::fprintf(stderr, "verify_batch: pool slot alloc failed @%d\n", base_pos + i);
+                return false;
+            }
+        }
+    }
 
     if (!build_target_step(sg_, w_, cache_, backend_,
                            /*kv_start=*/base_pos, n_tokens,
                            need_mask, /*capture=*/true,
                            /*capture_delta_intermediate=*/false,
-                           fa_window_,
+                           pool ? 0 : fa_window_,
                            /*last_token_logits_only=*/false,
-                           kq_stride_pad_)) {
+                           kq_stride_pad_,
+                           /*capture_moe_router=*/false,
+                           /*kvflash_mask=*/pool)) {
         std::fprintf(stderr, "verify_batch: build_target_step failed (base=%d n=%d)\n", base_pos, n_tokens);
         return false;
     }
+    if (pool && !sg_.kv_write_rows) {
+        std::fprintf(stderr, "verify_batch: kvflash requires set_rows path\n");
+        return false;
+    }
+    if (pool) {
+        // kv_write_rows is [n_tokens, n_head_kv] ne0-major: element
+        // (token i, head h) lives at i + h*n_tokens (set_rows asserts
+        // b->ne[1] == c->ne[0]). Getting this transposed scrambles
+        // per-head row targets for every multi-token write.
+        std::vector<int64_t> rows((size_t)n_tokens * w_.n_head_kv);
+        for (int h = 0; h < w_.n_head_kv; h++) {
+            for (int i = 0; i < n_tokens; i++) {
+                rows[(size_t)h * n_tokens + i] = slots[i];
+            }
+        }
+        ggml_backend_tensor_set(sg_.kv_write_rows, rows.data(), 0,
+                                sizeof(int64_t) * rows.size());
+    }
 
     // Embed input tokens and fill positions.
     std::vector<float> embed((size_t)n_tokens * hidden);
@@ -66,8 +103,35 @@ bool Qwen35DFlashTarget::verify_batch(
     ggml_backend_tensor_set(sg_.positions, pos.data(), 0,
                             sizeof(int32_t) * pos.size());
 
-    // Fill causal attention mask when present.
-    if (sg_.attn_mask) {
+    // Fill the attention mask.
+    if (sg_.attn_mask && pool) {
+        // Slot-space mask: row q attends (a) slots of committed positions
+        // (pos < base_pos) of resident chunks — this exactly excludes
+        // slots holding rejected drafts from earlier rounds — and (b) the
+        // verify tokens' own slots, causally.
+        const size_t kvd = (size_t)sg_.attn_mask->ne[0];
+        const int q_pad = (int)sg_.attn_mask->ne[1];
+        std::vector<uint16_t> mask_buf((size_t)kvd * q_pad, F16_NEG_INF);
+        const int ct = pager_->chunk_tokens();
+        for (int c = 0; c < pager_->n_chunks(); c++) {
+            const int blk = pager_->block_of(c);
+            if (blk < 0) continue;
+            for (int i = 0; i < ct; i++) {
+                if ((int64_t)c * ct + i >= base_pos) break;
+                mask_buf[(size_t)blk * ct + i] = F16_ZERO;
+            }
+        }
+        for (int q = 1; q < n_tokens; q++) {
+            std::memcpy(mask_buf.data() + (size_t)q * kvd, mask_buf.data(), kvd * 2);
+        }
+        for (int q = 0; q < n_tokens; q++) {
+            for (int i = 0; i <= q; i++) {
+                mask_buf[(size_t)q * kvd + slots[i]] = F16_ZERO;
+            }
+        }
+        ggml_backend_tensor_set(sg_.attn_mask, mask_buf.data(), 0,
+                                sizeof(uint16_t) * mask_buf.size());
+    } else if (sg_.attn_mask) {
         const int win_start = (fa_window_ > 0 && base_pos > fa_window_)
                                   ? (base_pos - fa_window_) : 0;
         const int kv_len = base_pos + n_tokens - win_start;
diff --git a/server/src/qwen35/qwen35_dflash_target.h b/server/src/qwen35/qwen35_dflash_target.h
index 6a72e48b5..17ab8bf95 100644
--- a/server/src/qwen35/qwen35_dflash_target.h
+++ b/server/src/qwen35/qwen35_dflash_target.h
@@ -10,6 +10,7 @@
 #include "internal.h"         // TargetWeights, TargetCache, DraftWeights
 #include "step_graph.h"
 #include "graph_builders.h"
+#include "kvflash_pager.h"
 
 #include "ggml.h"
 #include "ggml-backend.h"
@@ -53,6 +54,14 @@ class Qwen35DFlashTarget : public DFlashTarget {
     int mask_token_id() const override;
     const std::vector<int> & capture_layer_ids() const override;
 
+    // kvflash mode: verify writes are slot-mapped via the pager and the
+    // attention mask carries slot validity (resident committed positions
+    // only) plus causal structure among the verify tokens. Rejected draft
+    // tokens need no explicit rollback: their slots are excluded by the
+    // pos < base_pos validity rule on the next verify and get rewritten.
+    // Forces fa_window = 0 (logical windowing is meaningless in slot space).
+    void set_kvflash_pager(KvFlashPager * pager) { pager_ = pager; }
+
 private:
     TargetWeights & w_;
     TargetCache & cache_;
@@ -60,6 +69,7 @@ class Qwen35DFlashTarget : public DFlashTarget {
     StepGraph & sg_;
     int kq_stride_pad_;
     int fa_window_;
+    KvFlashPager * pager_ = nullptr;
 
     // Cached vector form of capture layer IDs (built once in constructor).
     std::vector<int> capture_ids_;
diff --git a/server/src/qwen35/qwen35_target_graph.cpp b/server/src/qwen35/qwen35_target_graph.cpp
index ed7fbe057..e0f7d8ecd 100644
--- a/server/src/qwen35/qwen35_target_graph.cpp
+++ b/server/src/qwen35/qwen35_target_graph.cpp
@@ -76,10 +76,11 @@ bool create_target_cache(const TargetWeights & w,
                          int max_verify_tokens,
                          ggml_backend_t backend,
                          TargetCache & out,
-                         bool prefill_only) {
+                         bool prefill_only,
+                         int ctx_alloc) {
     return create_target_cache_partial(w, max_ctx, max_verify_tokens, backend,
                                        out, prefill_only,
-                                       0, w.n_layer, true);
+                                       0, w.n_layer, true, ctx_alloc);
 }
 
 bool create_target_cache_partial(const TargetWeights & w,
@@ -90,7 +91,8 @@ bool create_target_cache_partial(const TargetWeights & w,
                                  bool prefill_only,
                                  int layer_begin,
                                  int layer_end,
-                                 bool allocate_target_feat) {
+                                 bool allocate_target_feat,
+                                 int ctx_alloc) {
     if (layer_begin < 0) layer_begin = 0;
     if (layer_end < 0 || layer_end > w.n_layer) layer_end = w.n_layer;
     if (layer_begin > layer_end) {
@@ -133,9 +135,14 @@ bool create_target_cache_partial(const TargetWeights & w,
 
     const bool needs_256_stride =
         kv_k_type == GGML_TYPE_TQ3_0 || kv_v_type == GGML_TYPE_TQ3_0;
+    // kvflash mode: attention tensors are allocated at the (smaller)
+    // physical pool capacity; logical positions are mapped to pool slots
+    // by KvFlashPager. The 256-stride rounding applies to whichever capacity
+    // is in effect.
+    const int ctx_phys = (ctx_alloc > 0 && ctx_alloc < max_ctx) ? ctx_alloc : max_ctx;
     const int max_ctx_alloc = needs_256_stride
-        ? ((max_ctx + 255) / 256) * 256
-        : max_ctx;
+        ? ((ctx_phys + 255) / 256) * 256
+        : ctx_phys;
 
     // ── Base context: KV cache + SSM/conv state + target_feat ────────
     {
@@ -433,6 +440,62 @@ void restore_ssm_state(TargetCache & c) {
     }
 }
 
+// Allocate SSM/conv rollback snapshot tensors by mirroring the live recurrent
+// state tensors' shapes. The MoE hybrid spec-decode path sets up its DeltaNet
+// state in base_buf but never calls migrate_prefill_cache, so without this
+// snapshot_ssm_state/restore_ssm_state are silent no-ops (the _snap arrays are
+// empty/null) and rejected draft tokens leak permanently into the linear
+// recurrent state, collapsing generation. Idempotent: reuses an existing
+// rollback_ctx (from a prior request or migrate_prefill_cache).
+// On failure returns false (last error set) and leaves the _snap arrays all-null.
+bool ensure_ssm_snapshot(TargetCache & c, ggml_backend_t backend) {
+    if (c.rollback_ctx) return true;
+    const size_t n = c.ssm_state.size();
+    if (n == 0) return true;
+    c.ssm_state_snap.assign(n, nullptr);
+    c.conv_state_snap.assign(n, nullptr);
+    size_t cnt = 0;   // live state tensors that need a snapshot twin
+    for (size_t i = 0; i < n; i++) {
+        if (c.ssm_state[i]) cnt++;
+        if (i < c.conv_state.size() && c.conv_state[i]) cnt++;
+    }
+    if (cnt == 0) return true;
+
+    ggml_init_params ip{};   // value-initialized: mem_buffer stays nullptr
+    ip.mem_size = (cnt + 8) * ggml_tensor_overhead();
+    ip.no_alloc = true;      // tensor data is allocated below into rollback_buf
+    c.rollback_ctx = ggml_init(ip);
+    if (!c.rollback_ctx) { set_last_error("ensure_ssm_snapshot ggml_init failed"); return false; }
+
+    for (size_t i = 0; i < n; i++) {
+        char name[64];
+        if (ggml_tensor * t = c.ssm_state[i]) {
+            ggml_tensor * sn = ggml_new_tensor(c.rollback_ctx, t->type, ggml_n_dims(t), t->ne);
+            std::snprintf(name, sizeof(name), "ssm_state_snap_%zu", i);
+            ggml_set_name(sn, name);
+            c.ssm_state_snap[i] = sn;
+        }
+        if (ggml_tensor * t = (i < c.conv_state.size()) ? c.conv_state[i] : nullptr) {
+            ggml_tensor * cn = ggml_new_tensor(c.rollback_ctx, t->type, ggml_n_dims(t), t->ne);
+            std::snprintf(name, sizeof(name), "conv_state_snap_%zu", i);
+            ggml_set_name(cn, name);
+            c.conv_state_snap[i] = cn;
+        }
+    }
+
+    c.rollback_buf = ggml_backend_alloc_ctx_tensors(c.rollback_ctx, backend);
+    if (!c.rollback_buf) {
+        set_last_error("ensure_ssm_snapshot alloc_ctx_tensors failed");
+        // Snap tensors live inside rollback_ctx: null them before freeing it.
+        c.ssm_state_snap.assign(n, nullptr);
+        c.conv_state_snap.assign(n, nullptr);
+        ggml_free(c.rollback_ctx);
+        c.rollback_ctx = nullptr;
+        return false;
+    }
+    return true;
+}
+
 // ─── Helpers ─────────────────────────────────────────────────────────
 
 static ggml_tensor * build_swiglu_ffn(ggml_context * ctx, ggml_tensor * cur,
diff --git a/server/src/qwen35moe/qwen35moe_backend.cpp b/server/src/qwen35moe/qwen35moe_backend.cpp
index 6455eac52..8b40be9fa 100644
--- a/server/src/qwen35moe/qwen35moe_backend.cpp
+++ b/server/src/qwen35moe/qwen35moe_backend.cpp
@@ -469,6 +469,7 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
         if (is_eos_tok(first_tok, target_weights())) return true;
         committed++;
         target_cache().cur_pos = committed;
+        if (kvflash_active()) kvflash_history_.push_back(first_tok);
     }
 
     // ── Ensure persistent pipelined state (built once, reused) ──
@@ -487,11 +488,23 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
                                       act_cur.data(), 0, sizeof(float) * (size_t)hidden);
         const auto embed_done = DecodeClock::now();
 
+        // kvflash: physical pool slot for this token's KV rows (may evict).
+        int kv_slot = -1;
+        if (kvflash_active()) {
+            kv_slot = kvflash_pager_.slot_for(committed);
+            if (kv_slot < 0) {
+                std::fprintf(stderr, "[kvflash] pipelined decode: no slot at pos %d\n",
+                             committed);
+                return false;
+            }
+        }
+
         PipelinedDecodeTelemetry tel;
         if (!pipelined_decode_one_token(*pipe_state_, target_backend(), target_weights(),
                                        target_cache(), *target_weights().moe_hybrid,
                                        committed, cfg_.kq_stride_pad,
-                                       hybrid_telemetry_ ? &tel : nullptr)) {
+                                       hybrid_telemetry_ ? &tel : nullptr,
+                                       kv_slot)) {
             return false;
         }
         const auto layers_done = DecodeClock::now();
@@ -563,6 +576,10 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
         io.emit(next_tok);
         committed++;
         target_cache().cur_pos = committed;
+        if (kvflash_active()) {
+            kvflash_history_.push_back(next_tok);
+            kvflash_maybe_reselect((int)out_tokens.size());
+        }
         if (io.cancelled) break;
         if (is_eos_tok(next_tok, target_weights())) break;
     }
@@ -721,6 +738,19 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req,
     const int prompt_len = (int)req.prompt.size();
     const int prefill_chunk = std::min(128, prompt_len); // batch size per GPU compute
 
+    // kvflash: hybrid prefill writes rows identity-mapped, so the prompt must
+    // fit the pool with one chunk of decode headroom (same contract as the
+    // base do_prefill).
+    if (kvflash_active() &&
+        prompt_len > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) {
+        std::fprintf(stderr,
+            "[kvflash] hybrid prompt (%d) exceeds pool %d; raise --kvflash "
+            "or enable pflash compression\n", prompt_len, kvflash_tokens_);
+        result.error = "kvflash: prompt exceeds resident pool";
+        cleanup_graphs();
+        return result;
+    }
+
     // Embed all prompt tokens
     const int n_expert_used = target_weights().n_expert_used;
     std::vector<float> embed_all((size_t)prompt_len * (size_t)hidden);
@@ -957,6 +987,9 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req,
 
     int committed = prompt_len;
     target_cache().cur_pos = committed;
+    if (kvflash_active()) {
+        kvflash_sync_prefill(committed, req.prompt, /*kv_offset=*/0);
+    }
     auto t_prefill_end = std::chrono::steady_clock::now();
     result.prefill_s = std::chrono::duration<double>(t_prefill_end - t_prefill_start).count();
 
@@ -990,7 +1023,9 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req,
     if (req.n_gen > 0) {
         auto t_decode_start = std::chrono::steady_clock::now();
 
-        // Check if hybrid spec-decode is available
+        // Hybrid spec-decode runs on the pool: hybrid_forward_batch is
+        // slot-mapped (verify and replay both route through it) and the
+        // recurrent-state rollback is ssm snapshot/restore (pool-neutral).
         const bool can_hybrid_spec = !req.force_ar_decode
             && cfg_.draft_path
             && !is_draft_parked()
@@ -1021,7 +1056,8 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req,
             target_cache().last_tok = first_tok;
 
             cleanup_graphs();
-            if (!do_hybrid_spec_decode(committed, req.n_gen, result.tokens, out_io)) {
+            if (!do_hybrid_spec_decode(committed, req.n_gen, result.tokens, out_io,
+                                       &result.accept_rate)) {
                 result.error = "hybrid_spec_decode";
                 return result;
             }
@@ -1057,6 +1093,7 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req,
             if (!is_eos_tok(first_tok, target_weights())) {
                 committed++;
                 target_cache().cur_pos = committed;
+                if (kvflash_active()) kvflash_history_.push_back(first_tok);
 
                 // Pipelined decode loop
                 PipelinedDecodeTelemetry decode_tel_accum{};
@@ -1071,11 +1108,23 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req,
                     ggml_backend_tensor_set_async(target_backend(), pipe_state_->gpu_state.act_cur,
                                                   act_cur.data(), 0, sizeof(float) * (size_t)hidden);
 
+                    // kvflash: pool slot for this token's KV rows (may evict)
+                    int kv_slot = -1;
+                    if (kvflash_active()) {
+                        kv_slot = kvflash_pager_.slot_for(committed);
+                        if (kv_slot < 0) {
+                            result.error = "kvflash_slot";
+                            cleanup_graphs();
+                            return result;
+                        }
+                    }
+
                     PipelinedDecodeTelemetry tel;
                     if (!pipelined_decode_one_token(*pipe_state_, target_backend(), target_weights(),
                                                     target_cache(), *target_weights().moe_hybrid,
                                                     committed, cfg_.kq_stride_pad,
-                                                    hybrid_telemetry_ ? &tel : nullptr)) {
+                                                    hybrid_telemetry_ ? &tel : nullptr,
+                                                    kv_slot)) {
                         result.error = "decode";
                         cleanup_graphs();
                         return result;
@@ -1133,6 +1182,10 @@ GenerateResult Qwen35MoeBackend::generate_impl(const GenerateRequest & req,
                     out_io.emit(next_tok);
                     committed++;
                     target_cache().cur_pos = committed;
+                    if (kvflash_active()) {
+                        kvflash_history_.push_back(next_tok);
+                        kvflash_maybe_reselect((int)result.tokens.size());
+                    }
                     if (out_io.cancelled) break;
                     if (is_eos_tok(next_tok, target_weights())) break;
                 }
@@ -1295,6 +1348,32 @@ GenerateResult Qwen35MoeBackend::restore_and_generate_impl(int slot,
         return result;
     }
 
+    // kvflash: the restored prefix + delta prefill land identity-mapped, so
+    // the full prompt must fit the pool (snapshots past the pool are never
+    // saved, but the delta can still overflow it).
+    if (kvflash_active() &&
+        prompt_len > kvflash_tokens_ - kvflash_pager_.chunk_tokens()) {
+        std::fprintf(stderr,
+            "[kvflash] hybrid restore prompt (%d) exceeds pool %d; raise "
+            "--kvflash\n", prompt_len, kvflash_tokens_);
+        result.error = "kvflash: prompt exceeds resident pool";
+        out_io.emit(-1);
+        return result;
+    }
+
+    // kvflash: the delta prefill below runs the maskless pipelined forward
+    // over the padded pool span; map the restored prefix identity-style and
+    // zero stale free slots BEFORE any forward reads them.
+    if (kvflash_active()) {
+        kvflash_pager_.reset();
+        if (!kvflash_pager_.alloc_span(0, snap_pos)) {
+            result.error = "kvflash_slot";
+            out_io.emit(-1);
+            return result;
+        }
+        kvflash_pager_.zero_free_blocks();
+    }
+
     const int hidden = target_weights().n_embd;
     std::vector<float> act_cur((size_t)hidden);
     if (prompt_len > snap_pos) {
@@ -1314,6 +1393,17 @@ GenerateResult Qwen35MoeBackend::restore_and_generate_impl(int slot,
             std::chrono::steady_clock::now() - t_prefill_start).count();
     }
 
+    if (kvflash_active()) {
+        // Rebuild the pager mapping over the identity-mapped [0, committed).
+        // With the full prompt available the history carries real ids;
+        // restore-only generates keep an unknown-prefix history.
+        if (prompt_len == committed) {
+            kvflash_sync_prefill(committed, req.prompt, /*kv_offset=*/0);
+        } else {
+            kvflash_sync_prefill(committed, {}, /*kv_offset=*/committed);
+        }
+    }
+
     if (req.n_gen > 0) {
         if (target_cache().last_tok < 0) {
             std::fprintf(stderr,
@@ -1457,6 +1547,29 @@ bool Qwen35MoeBackend::hybrid_forward_batch(
         }
     }
 
+    // kvflash: allocate the block's slots up front (may evict) and build
+    // the slot-mapped write rows + slot-space mask once; every layer's
+    // graph gets the same fills (verify and replay both land here, so all
+    // hybrid-spec KV writes are pool-routed).
+    const bool kvf = kvflash_active();
+    std::vector<int64_t> kvf_rows;
+    std::vector<uint16_t> kvf_mask;
+    std::vector<int> kvf_slots;
+    if (kvf) {
+        if (!kvflash_pager_.alloc_span(base_pos, n_tokens)) return false;
+        kvf_slots.resize((size_t)n_tokens);
+        for (int i = 0; i < n_tokens; ++i) {
+            kvf_slots[(size_t)i] = kvflash_pager_.slot_of(base_pos + i);
+        }
+        // [n_tokens, n_head_kv] ne0-major (see verify_batch).
+        kvf_rows.resize((size_t)n_tokens * target_weights().n_head_kv);
+        for (int h = 0; h < target_weights().n_head_kv; ++h) {
+            for (int i = 0; i < n_tokens; ++i) {
+                kvf_rows[(size_t)h * n_tokens + i] = kvf_slots[(size_t)i];
+            }
+        }
+    }
+
     // Process layer-by-layer (same as prefill)
     StepGraph prefn_sg;
     ggml_gallocr_t ffn_hot_alloc = nullptr;
@@ -1466,17 +1579,23 @@ bool Qwen35MoeBackend::hybrid_forward_batch(
     for (int il = 0; il < n_layer; ++il) {
         auto & storage = target_weights().moe_hybrid->layers[(size_t)il];
 
-        const bool with_mask = (cfg_.kq_stride_pad > KQ_MASK_PAD) || (n_tokens > 1);
+        const bool with_mask = kvf ||
+            (cfg_.kq_stride_pad > KQ_MASK_PAD) || (n_tokens > 1);
 
         // Build pre-FFN graph (DeltaNet/attention + router) for all tokens
         step_graph_free(prefn_sg);
         if (!build_layer_prefn_step(prefn_sg, target_weights(), target_cache(), target_backend(),
                                     il, /*kv_start=*/base_pos, n_tokens,
-                                    with_mask, /*fa_window=*/0, cfg_.kq_stride_pad)) {
+                                    with_mask, /*fa_window=*/0, cfg_.kq_stride_pad,
+                                    /*kvflash=*/kvf)) {
             step_graph_destroy(prefn_sg);
             if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc);
             return false;
         }
+        if (prefn_sg.kv_write_rows) {
+            ggml_backend_tensor_set(prefn_sg.kv_write_rows, kvf_rows.data(), 0,
+                                    sizeof(int64_t) * kvf_rows.size());
+        }
 
         // Upload embeddings
         ggml_backend_tensor_set(prefn_sg.inp_embed, embed_all.data(), 0,
@@ -1496,7 +1615,36 @@ bool Qwen35MoeBackend::hybrid_forward_batch(
         }
 
         // Set causal mask
-        if (prefn_sg.attn_mask) {
+        if (prefn_sg.attn_mask && kvf) {
+            // Slot-space mask (verify_batch recipe): committed resident
+            // positions (< base_pos) plus this block's own slots, causal.
+            // Built once, reused for every layer's graph.
+            if (kvf_mask.empty()) {
+                constexpr uint16_t F16_ZERO = 0x0000, F16_NEG_INF = 0xFC00;
+                const size_t kvd = (size_t)prefn_sg.attn_mask->ne[0];
+                const int q_pad = (int)prefn_sg.attn_mask->ne[1];
+                kvf_mask.assign(kvd * q_pad, F16_NEG_INF);
+                const int ct = kvflash_pager_.chunk_tokens();
+                for (int c = 0; c < kvflash_pager_.n_chunks(); c++) {
+                    const int blk = kvflash_pager_.block_of(c);
+                    if (blk < 0) continue;
+                    for (int i = 0; i < ct; i++) {
+                        if ((int64_t)c * ct + i >= base_pos) break;
+                        kvf_mask[(size_t)blk * ct + i] = F16_ZERO;
+                    }
+                }
+                for (int q = 1; q < n_tokens; q++) {
+                    std::memcpy(kvf_mask.data() + (size_t)q * kvd, kvf_mask.data(), kvd * 2);
+                }
+                for (int q = 0; q < n_tokens; q++) {
+                    for (int i = 0; i <= q; i++) {
+                        kvf_mask[(size_t)q * kvd + kvf_slots[(size_t)i]] = F16_ZERO;
+                    }
+                }
+            }
+            ggml_backend_tensor_set(prefn_sg.attn_mask, kvf_mask.data(), 0,
+                                    sizeof(uint16_t) * kvf_mask.size());
+        } else if (prefn_sg.attn_mask) {
             const int kv_len = base_pos + n_tokens;
             const int kv_pad_override = (int)prefn_sg.attn_mask->ne[0];
             std::vector<uint16_t> mask_buf;
@@ -1542,14 +1690,27 @@ bool Qwen35MoeBackend::hybrid_forward_batch(
         std::vector<float> ffn_batch_out;
         bool ffn_ok = false;
 
-        if (storage.cold_expert_ids.empty()) {
-            // All-hot: use batched hot-only path
+        // Spark expert cache: pull the verify batch's selected cold experts into
+        // spare GPU slots (LRU) so the batched FFN serves them on-die — the SAME
+        // residency mechanism the AR pipelined path uses. Without this the verify
+        // re-evaluated cold experts on the CPU every step, which dominated its FFN
+        // time (the spec-decode-with-offloading inefficiency). After warmup the
+        // working set is resident and the CPU cold path is rarely taken.
+        const int n_route_slots = n_tokens * n_expert_used;
+        if (storage.cache_slots > 0 && !storage.cold_expert_ids.empty()) {
+            for (int i = 0; i < n_route_slots; ++i)
+                dflash::common::moe_hybrid_cache_swap_in(storage, chunk_selected[(size_t)i], target_backend());
+        }
+        const bool routed_all_hot = storage.cold_expert_ids.empty()
+            || storage.all_routed_are_hot(chunk_selected.data(), n_route_slots);
+        if (routed_all_hot) {
+            // All routed experts resident on GPU: fast batched hot-only path.
             ffn_ok = eval_moe_hot_only_batched(
                 target_backend(), chunk_cfg, chunk_desc, storage,
                 chunk_post.data(), chunk_selected.data(), chunk_weights.data(),
                 n_tokens, ffn_batch_out, nullptr, &ffn_hot_alloc);
         } else {
-            // Mixed hot/cold: use hybrid path
+            // Cache full / residue still cold: hybrid path (remaining cold on CPU).
             ffn_ok = eval_moe_hybrid_ffn_batched(
                 target_backend(), target_weights().moe_hybrid->cpu_backend,
                 chunk_cfg, chunk_desc, storage,
@@ -1619,29 +1780,13 @@ bool Qwen35MoeBackend::hybrid_forward_batch(
     act_cur.assign(embed_all.data() + (size_t)(n_tokens - 1) * (size_t)hidden,
                    embed_all.data() + (size_t)n_tokens * (size_t)hidden);
 
-    // Project ALL tokens to logits and get argmax for each
-    const int vocab = target_weights().n_vocab;
+    // Project ALL tokens to logits and argmax ON THE GPU, reading back only
+    // n_tokens token ids instead of vocab*n_tokens floats. The host logits
+    // readback + host argmax was a large per-step D2H cost in the verify and
+    // replay forwards (vocab ~152k x n_tokens x 4B, twice per spec step).
     argmax_out.resize(n_tokens);
-
     StepGraph proj_sg;
-    ggml_init_params ip{};
-    ip.mem_size = 64 * 1024 * 1024;
-    ip.mem_buffer = nullptr;
-    ip.no_alloc = true;
-    proj_sg.ctx = ggml_init(ip);
-    if (!proj_sg.ctx) return false;
-
-    proj_sg.hidden_input = ggml_new_tensor_2d(proj_sg.ctx, GGML_TYPE_F32, hidden, n_tokens);
-    ggml_set_input(proj_sg.hidden_input);
-    proj_sg.gf = ggml_new_graph_custom(proj_sg.ctx, 1024, false);
-    ggml_tensor * normed = ggml_rms_norm(proj_sg.ctx, proj_sg.hidden_input, target_weights().rms_eps);
-    normed = ggml_mul(proj_sg.ctx, normed, target_weights().out_norm);
-    proj_sg.logits = ggml_mul_mat(proj_sg.ctx, target_weights().output, normed);
-    ggml_set_output(proj_sg.logits);
-    ggml_build_forward_expand(proj_sg.gf, proj_sg.logits);
-    proj_sg.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(target_backend()));
-    if (!ggml_gallocr_alloc_graph(proj_sg.alloc, proj_sg.gf)) {
-        step_graph_destroy(proj_sg);
+    if (!build_lm_head_projection_step(proj_sg, target_weights(), target_backend(), n_tokens)) {
         return false;
     }
     ggml_backend_tensor_set(proj_sg.hidden_input, embed_all.data(), 0,
@@ -1651,35 +1796,36 @@ bool Qwen35MoeBackend::hybrid_forward_batch(
         step_graph_destroy(proj_sg);
         return false;
     }
-
-    // Read logits and compute argmax per token
-    std::vector<float> logits_buf((size_t)vocab * (size_t)n_tokens);
-    ggml_backend_tensor_get(proj_sg.logits, logits_buf.data(), 0,
-                            sizeof(float) * logits_buf.size());
+    ggml_backend_tensor_get(proj_sg.argmax_tokens, argmax_out.data(), 0,
+                            sizeof(int32_t) * (size_t)n_tokens);
     step_graph_destroy(proj_sg);
-
-    for (int t = 0; t < n_tokens; ++t) {
-        const float * tok_logits = logits_buf.data() + (size_t)t * (size_t)vocab;
-        int32_t best_id = 0;
-        float best_val = tok_logits[0];
-        for (int j = 1; j < vocab; ++j) {
-            if (tok_logits[j] > best_val) {
-                best_val = tok_logits[j];
-                best_id = j;
-            }
-        }
-        argmax_out[t] = best_id;
-    }
     return true;
 }
 
 bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen,
                                               std::vector<int32_t> & out_tokens,
-                                              const DaemonIO & io) {
+                                              const DaemonIO & io,
+                                              float * accept_rate_out) {
     const int hidden = target_weights().n_embd;
     const int q_len = draft_weights().block_size;
     if (q_len <= 0) return false;
 
+    // Verify width: cap how many draft tokens we actually verify. The batched
+    // verify's cost is dominated by the distinct experts its tokens touch
+    // (especially under --spark expert offload, where extra tokens stream extra
+    // cold experts). Tokens past the realized accept length are wasted, so
+    // capping the verify to a width above the typical accept length cuts that
+    // waste at little or no acceptance cost.
+    // DFLASH_VERIFY_WIDTH pins a fixed width, clamped to [1, q_len] (a value
+    // that parses to <= 0 therefore pins width 1). When it is unset the width
+    // adapts per step: min(q_len, max(6, largest accept length so far + 2)),
+    // so a low-AL chain verifies few tokens while a high-AL draft gets more.
+    const int forced_verify_width = [&]{
+        const char * e = std::getenv("DFLASH_VERIFY_WIDTH");
+        return e ? std::max(1, std::min(q_len, std::atoi(e))) : 0;
+    }();
+    int observed_max_accept = 1;
+
     int32_t last_tok = target_cache().last_tok;
     std::vector<float> act_cur((size_t)hidden);
 
@@ -1696,10 +1842,22 @@ bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen,
     int n_draft_steps = 0;
     int n_accept_sum = 0;
 
+    // Allocate DeltaNet rollback snapshot tensors (no-op if already present).
+    // Without these, snapshot_ssm_state/restore_ssm_state silently do nothing
+    // and rejected draft tokens leak into the recurrent state, collapsing output.
+    if (!ensure_ssm_snapshot(target_cache(), target_backend())) {
+        std::fprintf(stderr, "[hybrid-spec] ensure_ssm_snapshot failed\n");
+        step_graph_destroy(draft_sg);
+        return false;
+    }
+
     auto t_dec0 = std::chrono::steady_clock::now();
 
     while (n_generated < n_gen) {
         const int need_commit_budget = n_gen - n_generated;
+        const int verify_width = forced_verify_width > 0
+            ? forced_verify_width
+            : std::min(q_len, std::max(6, observed_max_accept + 2));
 
         // 1. Build noise input for draft
         noise_ids[0] = last_tok;
@@ -1785,9 +1943,9 @@ bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen,
         // 4. Verify: snapshot recurrent state, then run ALL draft tokens batched
         snapshot_ssm_state(target_cache());
 
-        target_tok.resize(q_len);
+        target_tok.resize(verify_width);
         bool verify_ok = hybrid_forward_batch(
-            draft_tok.data(), q_len, committed,
+            draft_tok.data(), verify_width, committed,
             act_cur, target_tok, /*capture_features=*/false);
         if (!verify_ok) {
             std::fprintf(stderr, "[hybrid-spec] verify failed\n");
@@ -1798,11 +1956,12 @@ bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen,
 
         // 5. Acceptance: longest matching prefix
         int accept_n = 1;
-        for (int i = 0; i < q_len - 1; i++) {
+        for (int i = 0; i < verify_width - 1; i++) {
             if (draft_tok[i + 1] == target_tok[i]) accept_n++;
             else break;
         }
-        int bonus_tok = (accept_n < q_len) ? target_tok[accept_n - 1] : -1;
+        int bonus_tok = (accept_n < verify_width) ? target_tok[accept_n - 1] : -1;
+        observed_max_accept = std::max(observed_max_accept, accept_n);
         int commit_n = accept_n + (bonus_tok >= 0 ? 1 : 0);
         if (commit_n > need_commit_budget) {
             commit_n = need_commit_budget;
@@ -1859,6 +2018,10 @@ bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen,
     const double decode_s = std::chrono::duration<double>(t_dec1 - t_dec0).count();
     const int total_draft_pos = std::max(1, n_draft_steps * q_len);
     const double accept_pct = 100.0 * (double)n_accept_sum / (double)total_draft_pos;
+    if (accept_rate_out) {
+        *accept_rate_out = total_draft_pos > 0
+            ? (float)((double)n_accept_sum / (double)total_draft_pos) : 0.0f;
+    }
     std::fprintf(stderr, "[hybrid-spec] tokens=%d time=%.3f s speed=%.2f tok/s "
                  "steps=%d accepted=%d/%d (%.1f%%) avg_commit=%.2f AL=%.2f\n",
                  n_generated, decode_s,
diff --git a/server/src/qwen35moe/qwen35moe_backend.h b/server/src/qwen35moe/qwen35moe_backend.h
index ca154e405..d2f711a4c 100644
--- a/server/src/qwen35moe/qwen35moe_backend.h
+++ b/server/src/qwen35moe/qwen35moe_backend.h
@@ -61,7 +61,8 @@ class Qwen35MoeBackend : public Qwen35Backend {
     // verify via hybrid forward (layer-by-layer with hot/cold FFN).
     bool do_hybrid_spec_decode(int committed, int n_gen,
                                std::vector<int32_t> & out_tokens,
-                               const DaemonIO & io);
+                               const DaemonIO & io,
+                               float * accept_rate_out = nullptr);
 
     // Run one token through hybrid forward, capturing features at capture layers.
     // Returns the logits argmax token. Advances committed by 1.
diff --git a/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp b/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp
index 72cb03975..bfd4df479 100644
--- a/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp
+++ b/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp
@@ -314,12 +314,16 @@ bool pipelined_decode_one_token(
     MoeHybridStorage & hybrid,
     int kv_pos,
     int kq_stride_pad,
-    PipelinedDecodeTelemetry * tel) {
+    PipelinedDecodeTelemetry * tel,
+    int kv_slot) {
 
     const int n_layer = state.n_layer;
     const int n_embd = state.n_embd;
     const int n_expert_used = state.n_expert_used;
     ggml_backend_t cpu_be = hybrid.cpu_backend;
+    // Physical KV row for this token: kvflash pool slot, or the logical
+    // position itself. positions (RoPE) always carry the logical kv_pos.
+    const int kv_row = kv_slot >= 0 ? kv_slot : kv_pos;
 
     if (tel) {
         *tel = PipelinedDecodeTelemetry{};
@@ -503,7 +507,12 @@ bool pipelined_decode_one_token(
         bool attn_cached_ok = false;
         if (is_attn && !g_no_kvpad) {
             auto & cpg = state.cached_prefn[(size_t)il];
-            const int kv_win_needed = ((kv_pos + 1) + 255) & ~255;
+            // Clamp the baked FA span to the cache tensor's physical capacity:
+            // with kvflash the tensors are pool-sized, so the window stops
+            // growing at the pool (and the cached graph never rebuilds again).
+            const int kv_phys = (int)cache.attn_k[0]->ne[1];
+            const int kv_win_needed =
+                std::min(((kv_pos + 1) + 255) & ~255, kv_phys);
             if (!cpg.valid() || cpg.kv_win < kv_win_needed) {
                 if (!build_cached_attn_prefn(cpg, backend, w, cache, il,
                                              kv_win_needed, kq_stride_pad)) {
@@ -519,7 +528,7 @@ bool pipelined_decode_one_token(
             ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.act_cur, cpg.inp_embed);
             int32_t pos4[4] = {kv_pos, kv_pos, kv_pos, 0};
             ggml_backend_tensor_set_async(backend, cpg.positions, pos4, 0, sizeof(pos4));
-            std::vector<int64_t> row_vals((size_t)w.n_head_kv, (int64_t)kv_pos);
+            std::vector<int64_t> row_vals((size_t)w.n_head_kv, (int64_t)kv_row);
             ggml_backend_tensor_set_async(backend, cpg.kv_write_rows, row_vals.data(), 0,
                                           sizeof(int64_t) * row_vals.size());
 
@@ -536,7 +545,16 @@ bool pipelined_decode_one_token(
             moe_weights_tensor = cpg.moe_weights;
         } else if (is_attn || !state.cached_prefn[(size_t)il].valid()) {
             // Attention layer (legacy/fallback) OR failed DeltaNet cache:
-            // rebuild graph dynamically
+            // rebuild graph dynamically. The legacy path writes KV at the
+            // literal view offset kv_pos and cannot express a pool slot —
+            // refuse instead of corrupting the pool / running off its end.
+            if (is_attn && kv_slot >= 0) {
+                std::fprintf(stderr,
+                    "[pipelined] kvflash requires the cached set_rows attn path "
+                    "(layer %d cached-graph build failed)\n", il);
+                step_graph_destroy(dyn_sg);
+                return false;
+            }
             if (!build_layer_prefn_step(dyn_sg, w, cache, backend,
                                         il, kv_pos, /*n_tokens=*/1,
                                         /*with_mask=*/false, /*fa_window=*/0, kq_stride_pad)) {
diff --git a/server/src/qwen35moe/qwen35moe_pipelined_decode.h b/server/src/qwen35moe/qwen35moe_pipelined_decode.h
index ae35c775f..64d3b6bab 100644
--- a/server/src/qwen35moe/qwen35moe_pipelined_decode.h
+++ b/server/src/qwen35moe/qwen35moe_pipelined_decode.h
@@ -197,14 +197,18 @@ bool init_pipelined_decode_state(
 // Run one full token through the pipelined decode loop (all n_layer layers).
 // On success, gpu_state.act_cur holds the final hidden state on GPU.
 // selected_ids_out / weights_out: optional per-layer routing capture for telemetry.
+// kv_slot: physical KV row to write (kvflash pool slot); -1 = kv_pos (identity,
+// no pool). The FA span clamps to the cache tensor's physical capacity, so
+// pool-sized tensors bound the cached-graph window automatically.
 bool pipelined_decode_one_token(
     PipelinedDecodeState & state,
     ggml_backend_t backend,
     const TargetWeights & w,
     TargetCache & cache,
     MoeHybridStorage & hybrid,
-    int kv_pos,              // current KV position
+    int kv_pos,              // current KV position (logical; drives RoPE)
     int kq_stride_pad,
-    PipelinedDecodeTelemetry * telemetry = nullptr);
+    PipelinedDecodeTelemetry * telemetry = nullptr,
+    int kv_slot = -1);
 
 }  // namespace dflash::common
diff --git a/server/src/server/server_main.cpp b/server/src/server/server_main.cpp
index bbe274dbc..36c28e400 100644
--- a/server/src/server/server_main.cpp
+++ b/server/src/server/server_main.cpp
@@ -408,6 +408,33 @@ int main(int argc, char ** argv) {
             bargs.fast_rollback = true;
         } else if (std::strcmp(argv[i], "--ddtree-budget") == 0 && i + 1 < argc) {
             bargs.ddtree_budget = std::atoi(argv[++i]);
+        } else if (std::strcmp(argv[i], "--kvflash") == 0 && i + 1 < argc) {
+            // Bounded KV residency: attention KV lives in a fixed pool of N
+            // tokens; cold 64-token chunks page to host. Works with or
+            // without pflash (drafter becomes the reselect scorer when
+            // loaded; plain LRU otherwise). Forces AR decode.
+            ++i;
+            if (std::strcmp(argv[i], "auto") != 0 && std::atoi(argv[i]) <= 0) {
+                std::fprintf(stderr, "--kvflash expects a positive token count or "
+                                     "'auto', got '%s'\n", argv[i]);
+                return 1;
+            }
+            ::setenv("DFLASH_KVFLASH", argv[i], 1);
+        } else if (std::strcmp(argv[i], "--kvflash-policy") == 0 && i + 1 < argc) {
+            ++i;
+            if (std::strcmp(argv[i], "drafter") != 0 && std::strcmp(argv[i], "lru") != 0) {
+                std::fprintf(stderr, "--kvflash-policy expects 'drafter' or 'lru', got '%s'\n",
+                             argv[i]);
+                return 1;
+            }
+            ::setenv("DFLASH_KVFLASH_POLICY", argv[i], 1);
+        } else if (std::strcmp(argv[i], "--kvflash-tau") == 0 && i + 1 < argc) {
+            if (std::atoi(argv[++i]) <= 0) {
+                std::fprintf(stderr, "--kvflash-tau expects a positive interval, got '%s'\n",
+                             argv[i]);
+                return 1;
+            }
+            ::setenv("DFLASH_KVFLASH_TAU", argv[i], 1);
         } else if (std::strcmp(argv[i], "--spark") == 0) {
             spark_autotune = true;
         } else if (std::strcmp(argv[i], "--spark-slots") == 0 && i + 1 < argc) {
@@ -459,6 +486,9 @@ int main(int argc, char ** argv) {
             sconfig.pflash_keep_ratio = (float)std::atof(argv[++i]);
         } else if (std::strcmp(argv[i], "--prefill-drafter") == 0 && i + 1 < argc) {
             sconfig.pflash_drafter_path = argv[++i];
+            // kvflash reads this to lazy-attach the drafter as its
+            // residency scorer even when prefill compression is off.
+            ::setenv("DFLASH_KVFLASH_DRAFTER", argv[i], 1);
         } else if (std::strcmp(argv[i], "--prefill-skip-park") == 0) {
             sconfig.pflash_skip_park = true;
         } else if (std::strcmp(argv[i], "--prefill-upstream-base") == 0 && i + 1 < argc) {
diff --git a/server/test/test_kvflash.cpp b/server/test/test_kvflash.cpp
new file mode 100644
index 000000000..3f3634ac6
--- /dev/null
+++ b/server/test/test_kvflash.cpp
@@ -0,0 +1,1082 @@
+// test_kvflash — verifies KVFlash, the bounded-resident-pool KV cache
+// (kvflash_pager.h).
+//
+// Runs against one loaded qwen35 target:
+//
+//   A  baseline: cache at LOGICAL context (default 131072), maskless decode
+//      (production AR path shape). Reference tokens + baseline KV memory.
+//   B  relocation proof: small pool, chunks at SHUFFLED physical blocks,
+//      explicit pool slot mask, teacher-forced replay of A. Argmax must
+//      track A (position-independence + mask exactness).
+//   C  paging proof: pool ≪ prompt+gen, live eviction, bit-exact
+//      page_out/page_in roundtrip, KV bytes vs A.
+//   D  reselect/recall: evicted chunk recalled via score_hook + reselect()
+//      (the FlashMemory τ-step lookahead machinery); decode continues.
+//   E  performance profile: decode ms/step vs FA span — baseline at
+//      8K/32K/128K vs pool 1K/4K at 128K-logical — plus page-event and
+//      mask-refill microbenchmarks.
+//
+// Usage:
+//   test_kvflash <qwen35.gguf> [--logical-ctx=N] [--pool-b=N] [--pool-c=N]
+//                 [--prompt=N] [--gen=N] [--skip-profile] [--no-mask]
+//   modes: (default) verification suite A-F | --niah | --niah256 | --longab [--longab-L=N] [--longab-mode=0|1]
+
+#include "dflash27b.h"
+#include "internal.h"
+#include "kvflash_pager.h"
+#include "attn_masks.h"
+#include "qwen3_drafter.h"
+#include "qwen3_kvflash_scorer.h"
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml-cuda.h"
+
+#include <cinttypes>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+
+using namespace dflash::common;
+
+namespace {
+
+double now_ms() {  // steady_clock timestamp as fractional milliseconds
+    const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
+    return std::chrono::duration<double, std::milli>(since_epoch).count();
+}
+
+size_t kv_cache_bytes(const TargetCache & c) {
+    size_t total = 0;  // ggml_nbytes summed over every non-null K and V tensor
+    for (auto * k : c.attn_k) { if (k != nullptr) total += ggml_nbytes(k); }
+    for (auto * v : c.attn_v) { if (v != nullptr) total += ggml_nbytes(v); }
+    return total;
+}
+
+size_t vram_used_now() {  // bytes in use on CUDA device 0 (total - free)
+    size_t avail = 0, capacity = 0;
+    ggml_backend_cuda_get_device_memory(0, &avail, &capacity);
+    return capacity - avail;
+}
+
+// Single-token stepper over build_qwen35_graph with explicit control of:
+//   * kv_write_rows  — physical pool slot for the KV append
+//   * positions      — logical position (M-RoPE)
+//   * span           — FA window length (kv_start = span-1 in graph terms)
+//   * attn_mask      — optional [align32(span_padded), 32] f16 slot mask
+//
+// The graph arena and gallocr persist across rebuilds (same trick as
+// build_target_step) so identical topology lands at identical addresses
+// and the ggml-cuda CUDA-graph cache can replay decode steps.
+struct Stepper {
+    ggml_context *  ctx = nullptr;            // graph context carved out of `arena`
+    ggml_cgraph *   gf  = nullptr;
+    ggml_gallocr_t  alloc = nullptr;          // created once in build(), kept across rebuilds
+    ggml_tensor *   inp_embed = nullptr;      // input: one token embedding
+    ggml_tensor *   positions = nullptr;      // input: 4 x i32 position components
+    ggml_tensor *   attn_mask = nullptr;      // input: f16 slot mask, null unless with_mask
+    ggml_tensor *   kv_write_rows = nullptr;  // input: i64 write row per KV head
+    ggml_tensor *   logits = nullptr;
+    ggml_tensor *   argmax_tokens = nullptr;  // output read back by step()
+
+    const TargetWeights * w = nullptr;        // not owned
+    TargetCache * cache = nullptr;            // not owned
+    ggml_backend_t backend = nullptr;         // not owned
+    int span = 0;
+    bool with_mask = false;
+
+    std::vector<uint8_t> arena;               // backing memory handed to ggml_init
+    std::vector<float> embed_buf;             // host staging for the token embedding
+    std::vector<uint16_t> mask_buf;           // host copy of the f16 mask
+    uint64_t mask_epoch = (uint64_t)-1;       // sentinel meant to mismatch the pager epoch on first use
+    double mask_fill_ms_total = 0.0;          // wall time spent in refresh_mask(), upload included
+    int mask_fills = 0;                       // number of host-side mask rebuilds
+
+    bool init(const TargetWeights & tw, TargetCache & tc, ggml_backend_t be,
+              int span_, bool with_mask_) {
+        w = &tw; cache = &tc; backend = be;
+        span = span_; with_mask = with_mask_;
+        embed_buf.resize(tw.n_embd);
+        arena.resize((size_t)512 * 1024 * 1024);  // 512 MiB host arena (context runs with no_alloc)
+        return build();
+    }
+
+    bool build() {  // (re)builds the graph for the current `span`, reusing arena + gallocr
+        if (ctx) { ggml_free(ctx); ctx = nullptr; }
+        ggml_init_params ip{};
+        ip.mem_size   = arena.size();
+        ip.mem_buffer = arena.data();
+        ip.no_alloc   = true;
+        ctx = ggml_init(ip);
+        if (!ctx) return false;
+
+        const int hidden = w->n_embd;
+        inp_embed = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hidden, 1, 1);
+        ggml_set_input(inp_embed);
+        positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
+        ggml_set_input(positions);
+        kv_write_rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I64, 1, w->n_head_kv);
+        ggml_set_input(kv_write_rows);
+
+        attn_mask = nullptr;
+        if (with_mask) {
+            // FA span is padded to 256 on the step-invariant path; the mask
+            // kv dim must cover it.
+            const int span_padded = std::min(((span + 255) / 256) * 256,
+                                             (int)cache->attn_k[0]->ne[1]);
+            attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F16,
+                                           align_up(span_padded, KQ_MASK_PAD),
+                                           align_up(1, KQ_MASK_PAD));
+            ggml_set_input(attn_mask);
+            mask_buf.assign((size_t)attn_mask->ne[0] * attn_mask->ne[1], F16_NEG_INF);
+            mask_epoch = (uint64_t)-1;  // new tensor: force a host-side rebuild on next refresh
+        }
+
+        gf = ggml_new_graph_custom(ctx, 16384, false);
+
+        QwenGraphInputs gi{};
+        gi.inp_embed      = inp_embed;
+        gi.positions      = positions;
+        gi.attn_mask      = attn_mask;
+        gi.n_tokens       = 1;
+        gi.kv_start       = span - 1;
+        gi.capture_layers = false;
+        gi.kv_write_rows  = kv_write_rows;
+
+        QwenGraphOutputs go = build_qwen35_graph(ctx, gf, *w, *cache, gi);
+        if (!go.logits) return false;
+        logits = go.logits;
+        ggml_set_output(logits);
+        argmax_tokens = ggml_argmax(ctx, logits);
+        ggml_set_output(argmax_tokens);
+        ggml_build_forward_expand(gf, argmax_tokens);
+
+        if (!alloc) alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+        return ggml_gallocr_alloc_graph(alloc, gf);
+    }
+
+    void refresh_mask(const KvFlashPager & pager) {  // no-op when the graph has no mask input
+        if (!attn_mask) return;
+        const double t0 = now_ms();
+        if (pager.epoch() != mask_epoch) {
+            // Host-side rebuild only on residency change.
+            std::fill(mask_buf.begin(), mask_buf.end(), F16_NEG_INF);
+            pager.fill_slot_mask(mask_buf.data());
+            mask_epoch = pager.epoch();
+            mask_fills++;
+        }
+        // Upload EVERY step: the compute-buffer region backing this input
+        // tensor is reused by graph execution, so a stale upload reads as
+        // garbage (NaN logits) on the next step. Production prefill
+        // re-uploads its mask before every compute for the same reason.
+        ggml_backend_tensor_set(attn_mask, mask_buf.data(), 0,
+                                mask_buf.size() * sizeof(uint16_t));
+        mask_fill_ms_total += now_ms() - t0;
+    }
+
+    int32_t step(int32_t tok, int pos, int phys_slot) {  // returns the argmax token; exits the process on failure
+        if (!w->embedder.embed(&tok, 1, embed_buf.data())) {
+            std::fprintf(stderr, "embed failed: tok=%d pos=%d (NaN logits upstream?)\n", tok, pos);
+            std::exit(1);
+        }
+        ggml_backend_tensor_set(inp_embed, embed_buf.data(), 0,
+                                sizeof(float) * embed_buf.size());
+        int32_t p4[4] = { pos, pos, pos, 0 };  // three equal position components, fourth zero
+        ggml_backend_tensor_set(positions, p4, 0, sizeof(int32_t) * 4);
+        std::vector<int64_t> rows(w->n_head_kv, (int64_t)phys_slot);  // same slot for every KV head
+        ggml_backend_tensor_set(kv_write_rows, rows.data(), 0,
+                                sizeof(int64_t) * rows.size());
+        if (ggml_backend_graph_compute(backend, gf) != GGML_STATUS_SUCCESS) {
+            std::fprintf(stderr, "graph_compute failed pos=%d\n", pos);
+            std::exit(1);
+        }
+        int32_t next = 0;
+        ggml_backend_tensor_get(argmax_tokens, &next, 0, sizeof(int32_t));
+        return next;
+    }
+
+    void destroy() {  // frees the gallocr and context; safe to call more than once
+        if (alloc) { ggml_gallocr_free(alloc); alloc = nullptr; }
+        if (ctx)   { ggml_free(ctx); ctx = nullptr; }
+    }
+};
+
+std::vector<int32_t> make_prompt(int n, int vocab) {
+    std::vector<int32_t> p((size_t)(n > 0 ? n : 0));  // negative n -> empty prompt, not a huge alloc
+    uint64_t s = 0x9E3779B97F4A7C15ull;               // fixed seed: same ids on every call
+    // Ids land in [1000, 1000 + half); the cap stays below the drafter vocab (Qwen3-0.6B
+    // ~151K) so run F's indexer can score them. The modulus is clamped to >= 1 (no % 0).
+    const int half = std::min(vocab, 100000) / 2;
+    for (auto & id : p) {
+        s = s * 6364136223846793005ull + 1442695040888963407ull;
+        id = (int32_t)(1000 + (s >> 33) % (uint64_t)(half > 0 ? half : 1));
+    }
+    return p;
+}
+
+// Pooled chunked prefill: 64-token (one pager chunk) batched forwards with
+// slot-mapped set_rows writes and a resident+causal mask. This is the
+// prompt > pool path: prefill evicts like decode does. Graph is built once
+// (fixed topology) and reused for every chunk.
+struct BatchStepper {
+    ggml_context *  ctx = nullptr;            // graph context carved out of `arena`
+    ggml_cgraph *   gf  = nullptr;
+    ggml_gallocr_t  alloc = nullptr;
+    ggml_tensor *   inp_embed = nullptr;      // input: NB token embeddings
+    ggml_tensor *   positions = nullptr;      // input: 4 * NB i32 position components
+    ggml_tensor *   attn_mask = nullptr;      // input: f16 mask, padded pool x padded NB
+    ggml_tensor *   kv_write_rows = nullptr;  // input: i64 write row per (token, KV head)
+    ggml_tensor *   logits = nullptr;
+    ggml_tensor *   argmax_tokens = nullptr;  // output read back by step_chunk()
+
+    const TargetWeights * w = nullptr;        // not owned
+    TargetCache * cache = nullptr;            // not owned
+    ggml_backend_t backend = nullptr;         // not owned
+    int pool = 0;
+    static constexpr int NB = 64;          // tokens per chunk
+
+    std::vector<uint8_t> arena;               // backing memory handed to ggml_init
+    std::vector<float> embed_buf;             // host staging for NB embeddings
+    std::vector<uint16_t> mask_buf;           // host copy of the f16 mask
+
+    bool init(const TargetWeights & tw, TargetCache & tc, ggml_backend_t be, int pool_) {
+        w = &tw; cache = &tc; backend = be; pool = pool_;
+        embed_buf.resize((size_t)tw.n_embd * NB);
+        arena.resize((size_t)512 * 1024 * 1024);  // 512 MiB host arena (context runs with no_alloc)
+
+        ggml_init_params ip{};
+        ip.mem_size   = arena.size();
+        ip.mem_buffer = arena.data();
+        ip.no_alloc   = true;
+        ctx = ggml_init(ip);
+        if (!ctx) return false;
+
+        inp_embed = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, tw.n_embd, NB, 1);
+        ggml_set_input(inp_embed);
+        positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4 * NB);
+        ggml_set_input(positions);
+        kv_write_rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I64, NB, tw.n_head_kv);
+        ggml_set_input(kv_write_rows);
+        attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F16,
+                                       align_up(pool, KQ_MASK_PAD),
+                                       align_up(NB, KQ_MASK_PAD));
+        ggml_set_input(attn_mask);
+        mask_buf.assign((size_t)attn_mask->ne[0] * attn_mask->ne[1], F16_NEG_INF);
+
+        gf = ggml_new_graph_custom(ctx, 16384, false);
+        QwenGraphInputs gi{};
+        gi.inp_embed   = inp_embed;
+        gi.positions   = positions;
+        gi.attn_mask   = attn_mask;
+        gi.n_tokens    = NB;
+        gi.kv_start    = pool - NB;      // span = whole pool
+        gi.kv_write_rows = kv_write_rows;
+        gi.last_token_logits_only = true;
+        QwenGraphOutputs go = build_qwen35_graph(ctx, gf, *w, *cache, gi);
+        if (!go.logits) return false;
+        logits = go.logits;
+        ggml_set_output(logits);
+        argmax_tokens = ggml_argmax(ctx, logits);
+        ggml_set_output(argmax_tokens);
+        ggml_build_forward_expand(gf, argmax_tokens);
+        alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+        return ggml_gallocr_alloc_graph(alloc, gf);
+    }
+
+    // One 64-token chunk at logical [pos_base, pos_base+64). Allocates the
+    // chunk's block (evicting if needed), writes slot-mapped, masks
+    // resident slots + causal-within-chunk. Returns last-token argmax.
+    int32_t step_chunk(const int32_t * toks, int pos_base, KvFlashPager & pager) {
+        int slots[NB];  // physical slot of each token in this chunk
+        for (int i = 0; i < NB; i++) slots[i] = pager.slot_for(pos_base + i);
+
+        if (!w->embedder.embed(toks, NB, embed_buf.data())) {
+            std::fprintf(stderr, "batch embed failed @%d\n", pos_base);
+            std::exit(1);
+        }
+        ggml_backend_tensor_set(inp_embed, embed_buf.data(), 0,
+                                sizeof(float) * embed_buf.size());
+        std::vector<int32_t> p4((size_t)4 * NB);
+        for (int i = 0; i < NB; i++) {
+            p4[4 * i + 0] = p4[4 * i + 1] = p4[4 * i + 2] = pos_base + i;
+            p4[4 * i + 3] = 0;
+        }
+        ggml_backend_tensor_set(positions, p4.data(), 0, sizeof(int32_t) * p4.size());
+        // [n_tokens, n_head_kv] ne0-major: (token i, head h) at i + h*NB.
+        std::vector<int64_t> rows((size_t)NB * w->n_head_kv);
+        for (int h = 0; h < w->n_head_kv; h++) {
+            for (int i = 0; i < NB; i++) {
+                rows[(size_t)h * NB + i] = slots[i];
+            }
+        }
+        ggml_backend_tensor_set(kv_write_rows, rows.data(), 0,
+                                sizeof(int64_t) * rows.size());
+
+        // Mask: per q row, resident slots (excluding this chunk) attendable,
+        // this chunk's slots causal. Rebuilt + uploaded per chunk.
+        const size_t kvd = (size_t)attn_mask->ne[0];  // elements per mask row
+        std::fill(mask_buf.begin(), mask_buf.end(), F16_NEG_INF);
+        pager.fill_slot_mask(mask_buf.data());                    // row 0 base
+        const int this_block = slots[0] - slots[0] % NB;  // assumes the chunk fills one NB-aligned block — TODO confirm
+        for (int i = 0; i < NB; i++) mask_buf[(size_t)this_block + i] = F16_NEG_INF;
+        for (int q = 1; q < NB; q++) {
+            std::memcpy(mask_buf.data() + (size_t)q * kvd, mask_buf.data(), kvd * 2);  // 2 = bytes per uint16_t element
+        }
+        for (int q = 0; q < NB; q++) {
+            for (int i = 0; i <= q; i++) {  // causal: query q sees chunk tokens 0..q
+                mask_buf[(size_t)q * kvd + slots[i]] = F16_ZERO;
+            }
+        }
+        ggml_backend_tensor_set(attn_mask, mask_buf.data(), 0,
+                                mask_buf.size() * sizeof(uint16_t));
+
+        if (ggml_backend_graph_compute(backend, gf) != GGML_STATUS_SUCCESS) {
+            std::fprintf(stderr, "batch compute failed @%d\n", pos_base);
+            std::exit(1);
+        }
+        int32_t last = 0;
+        ggml_backend_tensor_get(argmax_tokens, &last, 0, sizeof(int32_t));
+        return last;
+    }
+
+    void destroy() {  // frees the gallocr and context; safe to call more than once
+        if (alloc) { ggml_gallocr_free(alloc); alloc = nullptr; }
+        if (ctx)   { ggml_free(ctx); ctx = nullptr; }
+    }
+};
+
+
+int arg_int(int argc, char ** argv, const char * key, int defv) {
+    const size_t key_len = std::strlen(key);
+    for (int idx = 2; idx < argc; ++idx) {  // argv[1] is the model path; first "key=value" wins
+        const char * opt = argv[idx];
+        if (std::strncmp(opt, key, key_len) != 0 || opt[key_len] != '=') continue;
+        return std::atoi(opt + key_len + 1);
+    }
+    return defv;  // key absent: caller's default
+}
+
+bool arg_flag(int argc, char ** argv, const char * key) {
+    for (int left = argc; left-- > 2; ) if (std::strcmp(argv[left], key) == 0) return true;
+    return false;  // exact match only; argv[0] and argv[1] are never considered
+}
+
+struct StepTimes {  // upper median, 95th percentile and mean of a sample set
+    double p50 = 0.0, p95 = 0.0, mean = 0.0;
+};
+
+// Sorts `ms` in place (the caller's vector is reordered); empty input yields all zeros.
+StepTimes summarize(std::vector<double> & ms) {
+    if (ms.empty()) return StepTimes{};
+    std::sort(ms.begin(), ms.end());
+    const size_t count = ms.size();
+    StepTimes out{ ms[count / 2], ms[(size_t)(count * 0.95)], 0.0 };
+    for (size_t k = 0; k < count; ++k) out.mean += ms[k];
+    out.mean /= count;
+    return out;
+}
+
+} // namespace
+
+int main(int argc, char ** argv) {
+    if (argc < 2) {
+        std::fprintf(stderr, "usage: %s <qwen35.gguf> [--logical-ctx=N] [--pool-b=N] "
+                             "[--pool-c=N] [--prompt=N] [--gen=N] [--skip-profile]\n", argv[0]);
+        return 2;
+    }
+    const int logical_ctx = arg_int(argc, argv, "--logical-ctx", 131072);
+    const int pool_b      = arg_int(argc, argv, "--pool-b", 2048);
+    const int pool_c      = arg_int(argc, argv, "--pool-c", 1024);
+    const int n_prompt    = arg_int(argc, argv, "--prompt", 512);
+    const int n_gen       = arg_int(argc, argv, "--gen", 1200);
+    const bool skip_prof  = arg_flag(argc, argv, "--skip-profile");
+    // Explicit pool slot mask: exact exclusion of non-resident slots.
+    // ON by default (requires the per-step re-upload in refresh_mask: the
+    // mask input's compute-buffer region is clobbered by graph execution).
+    // --no-mask falls back to the zero-row approximation production's
+    // padded span uses.
+    const bool use_mask   = !arg_flag(argc, argv, "--no-mask");
+    const int total       = n_prompt + n_gen;
+    if (total > pool_b) {
+        std::fprintf(stderr, "config error: prompt+gen (%d) must fit pool-b (%d)\n", total, pool_b);
+        return 2;
+    }
+
+    ggml_backend_t backend = ggml_backend_cuda_init(0);
+    if (!backend) { std::fprintf(stderr, "cuda init failed\n"); return 1; }
+    const size_t vram0 = vram_used_now();
+
+    TargetWeights w;
+    if (!load_target_gguf(argv[1], backend, w)) {
+        std::fprintf(stderr, "load: %s\n", dflash27b_last_error());
+        return 1;
+    }
+    std::printf("[load] weights ok, vram_used=%.1f MiB\n",
+                (vram_used_now() - vram0) / 1048576.0);
+
+    // ── --longab: end-to-end long-prompt A/B (speed + accuracy) ─────
+    // For L in {32K, 64K, 128K}: full-cache baseline vs pool-4096 with
+    // drafter reselect. Measures prefill time, decode tok/s over a
+    // 240-token free run, and needle recall (depth 0.25, outside both
+    // the sinks and the LRU window).
+    if (arg_flag(argc, argv, "--longab")) {
+        // Drafter loads lazily, pool mode only: the full-cache baseline at
+        // 256K needs every byte (weights 15.3 GiB + KV 4.6 GiB).
+        DrafterContext dctx;
+        KvFlashDrafterScorer scorer(&dctx);
+        // Single-config mode (one process per config: the CUDA VMM pool
+        // grows monotonically across large-cache configs and aborts).
+        const int only_L = arg_int(argc, argv, "--longab-L", 0);
+        const int only_mode = arg_int(argc, argv, "--longab-mode", -1); // 0=full 1=pool
+        std::printf("\n%-7s %-10s %-9s %-9s %-9s %-9s %s\n",
+                    "L", "mode", "prefill_s", "rescore_s", "dec_tok/s", "needle", "kv_vram");
+        for (int L : { 32768, 65536, 131072, 262144 }) {
+            if (only_L > 0 && L != only_L) continue;
+            for (int mode = 0; mode < 2; mode++) {           // 0=baseline 1=pool
+                if (only_mode >= 0 && mode != only_mode) continue;
+                if (mode == 1 && !dctx.loaded &&
+                    !load_drafter("/opt/lucebox/models/drafter/Qwen3-0.6B-BF16.gguf", 0, dctx)) {
+                    std::fprintf(stderr, "drafter load failed\n");
+                    return 1;
+                }
+                const int pool = mode == 0 ? L : 4096;
+                auto prompt = make_prompt(L, w.n_vocab);
+                std::vector<int32_t> needle(48);
+                uint64_t ns = 0xDEADBEEFCAFEull;
+                for (int i = 0; i < 48; i++) {
+                    ns = ns * 6364136223846793005ull + 1442695040888963407ull;
+                    needle[i] = (int32_t)(1000 + (ns >> 33) % 49000);
+                }
+                const int npos = ((int)(0.25 * (L - 512)) / 32) * 32;
+                for (int i = 0; i < 48; i++) prompt[npos + i] = needle[i];
+
+                TargetCache cache;
+                if (!create_target_cache(w, pool, 0, backend, cache, true)) return 1;
+                const double kv_mib = kv_cache_bytes(cache) / 1048576.0;
+                KvFlashPager pager;
+                KvFlashConfig pc; pc.pool_tokens = pool;
+                if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1;
+
+                double t0 = now_ms();
+                BatchStepper bs;
+                if (!bs.init(w, cache, backend, pool)) return 1;
+                for (int p = 0; p < L; p += 64) bs.step_chunk(prompt.data() + p, p, pager);
+                bs.destroy();
+                const double prefill_s = (now_ms() - t0) / 1000.0;
+
+                Stepper st;
+                if (!st.init(w, cache, backend, pool, mode == 1)) return 1;
+                int32_t next = -1;
+                for (int i = 0; i < 32; i++) {
+                    const int slot = pager.slot_for(L + i);
+                    st.refresh_mask(pager);
+                    next = st.step(needle[i], L + i, slot);
+                }
+                double rescore_s = 0;
+                if (mode == 1) {
+                    std::vector<int32_t> hist = prompt;
+                    hist.insert(hist.end(), needle.begin(), needle.begin() + 32);
+                    std::vector<float> scores;
+                    t0 = now_ms();
+                    if (scorer.score_chunks(hist, pc.chunk_tokens, scores)) {
+                        pager.score_hook = [&scores](int c) {
+                            return c < (int)scores.size() ? scores[c] : 1e30f;
+                        };
+                        pager.reselect();
+                        pager.score_hook = nullptr;
+                    }
+                    rescore_s = (now_ms() - t0) / 1000.0;
+                }
+                int match = 0;
+                for (int i = 0; i < 16; i++) {
+                    if (next == needle[32 + i]) match++;
+                    const int pos = L + 32 + i;
+                    const int slot = pager.slot_for(pos);
+                    st.refresh_mask(pager);
+                    next = st.step(needle[32 + i], pos, slot);
+                }
+                t0 = now_ms();
+                for (int i = 0; i < 240; i++) {              // timed free run
+                    const int pos = L + 48 + i;
+                    const int slot = pager.slot_for(pos);
+                    st.refresh_mask(pager);
+                    next = st.step(next, pos, slot);
+                }
+                const double tok_s = 240.0 / ((now_ms() - t0) / 1000.0);
+                std::printf("%-7d %-10s %-9.1f %-9.1f %-9.1f %d/16   %.0f MiB\n",
+                            L, mode == 0 ? "full" : "pool4096",
+                            prefill_s, rescore_s, tok_s, match, kv_mib);
+                std::fflush(stdout);
+                st.destroy();
+                free_target_cache(cache);
+            }
+        }
+        if (dctx.loaded) free_drafter(dctx);
+        free_target_weights(w);
+        ggml_backend_free(backend);
+        return 0;
+    }
+
+    // ── --niah256: native-max-context probe (262144 logical) ────────
+    // Pooled configs only: the fixed-span harness makes a full-pool
+    // control prefill take hours at 256K. The LRU row with the needle
+    // inside the recency window is the induction control (distance-free).
+    if (arg_flag(argc, argv, "--niah256")) {
+        DrafterContext dctx;
+        if (!load_drafter("/opt/lucebox/models/drafter/Qwen3-0.6B-BF16.gguf", 0, dctx)) {
+            std::fprintf(stderr, "drafter load failed\n");
+            return 1;
+        }
+        KvFlashDrafterScorer scorer(&dctx);
+        const int L = 262144, pool = 16384;          // 6.25% residency
+        struct Cfg { const char * policy; double depth; };
+        const Cfg cfgs[] = {
+            {"lru",     0.97},   // in-window: induction control at 256K
+            {"lru",     0.50},
+            {"drafter", 0.10},
+            {"drafter", 0.50},
+            {"drafter", 0.90},
+        };
+        std::printf("\n%-7s %-6s %-8s %-6s %s\n", "L", "pool", "policy", "depth", "match/16");
+        for (const Cfg & cfg : cfgs) {
+            auto prompt = make_prompt(L, w.n_vocab);
+            std::vector<int32_t> needle(48);
+            uint64_t ns = 0xDEADBEEFCAFEull;
+            for (int i = 0; i < 48; i++) {
+                ns = ns * 6364136223846793005ull + 1442695040888963407ull;
+                needle[i] = (int32_t)(1000 + (ns >> 33) % 49000);
+            }
+            const int npos = ((int)(cfg.depth * (L - 512)) / 32) * 32;
+            for (int i = 0; i < 48; i++) prompt[npos + i] = needle[i];
+
+            TargetCache cache;
+            if (!create_target_cache(w, pool, 0, backend, cache, true)) return 1;
+            KvFlashPager pager;
+            KvFlashConfig pc; pc.pool_tokens = pool;
+            if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1;
+
+            const double t0 = now_ms();
+            BatchStepper bs;
+            if (!bs.init(w, cache, backend, pool)) return 1;
+            for (int p = 0; p < L; p += 64) bs.step_chunk(prompt.data() + p, p, pager);
+            bs.destroy();
+            std::printf("[256k] prefill %.1f s, host backing %.2f GiB\n",
+                        (now_ms() - t0) / 1000.0,
+                        pager.stats().host_bytes / 1073741824.0);
+
+            Stepper st;
+            if (!st.init(w, cache, backend, pool, true)) return 1;
+            int32_t next = -1;
+            for (int i = 0; i < 32; i++) {
+                const int slot = pager.slot_for(L + i);
+                st.refresh_mask(pager);
+                next = st.step(needle[i], L + i, slot);
+            }
+            if (std::strcmp(cfg.policy, "drafter") == 0) {
+                std::vector<int32_t> hist = prompt;
+                hist.insert(hist.end(), needle.begin(), needle.begin() + 32);
+                std::vector<float> scores;
+                const double r0 = now_ms();
+                if (!scorer.score_chunks(hist, pc.chunk_tokens, scores)) {
+                    std::printf("[256k] WARN rescore failed\n");
+                } else {
+                    std::printf("[256k] rescore %.1f s\n", (now_ms() - r0) / 1000.0);
+                    pager.score_hook = [&scores](int c) {
+                        return c < (int)scores.size() ? scores[c] : 1e30f;
+                    };
+                    pager.reselect();
+                    pager.score_hook = nullptr;
+                }
+            }
+            int match = 0;
+            for (int i = 0; i < 16; i++) {
+                if (next == needle[32 + i]) match++;
+                const int pos = L + 32 + i;
+                const int slot = pager.slot_for(pos);
+                st.refresh_mask(pager);
+                next = st.step(needle[32 + i], pos, slot);
+            }
+            std::printf("%-7d %-6d %-8s %-6.2f %d/16\n", L, pool, cfg.policy, cfg.depth, match);
+            std::fflush(stdout);
+            st.destroy();
+            free_target_cache(cache);
+        }
+        free_drafter(dctx);
+        free_target_weights(w);
+        ggml_backend_free(backend);
+        return 0;
+    }
+
+    if (arg_flag(argc, argv, "--niah")) {
+        DrafterContext dctx;
+        const bool have_drafter =
+            load_drafter("/opt/lucebox/models/drafter/Qwen3-0.6B-BF16.gguf", 0, dctx);
+        if (!have_drafter) std::printf("[niah] drafter unavailable, skipping drafter policy\n");
+        KvFlashDrafterScorer scorer(&dctx);
+        if (have_drafter) {
+            // Reserve the drafter's compute buffers at max context NOW,
+            // before target-side cache churn fragments the CUDA pool.
+            // Without this, 32K rescores OOM late in the sweep and the
+            // drafter policy silently degrades to LRU.
+            std::vector<int32_t> warm(33024, 1234);
+            std::vector<float> tmp;
+            scorer.score_chunks(warm, 64, tmp);
+        }
+
+        const int Ls[] = { 8192, 32768 };
+        const double depths[] = { 0.10, 0.50, 0.90 };
+        std::printf("\n%-7s %-6s %-8s %-6s %s\n", "L", "pool", "policy", "depth", "match/16");
+        for (int L : Ls) {
+            const int pools[] = { L, L / 4, ((L / 10) / 256) * 256 };
+            for (int pi = 0; pi < 3; pi++) {
+                const int pool = pools[pi];
+                const char * policies[] = { "lru", "drafter" };
+                const int n_pol = (pi == 0) ? 1 : (have_drafter ? 2 : 1); // full pool: control only
+                for (int pol = 0; pol < n_pol; pol++) {
+                    for (double depth : depths) {
+                        // Needle: 48 unique-as-a-sequence tokens from the
+                        // filler id range (matched embedding statistics).
+                        // Query = first 32 (longer match = stronger
+                        // induction), score the last 16.
+                        auto prompt = make_prompt(L, w.n_vocab);
+                        std::vector<int32_t> needle(48);
+                        uint64_t ns = 0xDEADBEEFCAFEull;
+                        for (int i = 0; i < 48; i++) {
+                            ns = ns * 6364136223846793005ull + 1442695040888963407ull;
+                            needle[i] = (int32_t)(1000 + (ns >> 33) % 49000);
+                        }
+                        const int npos = ((int)(depth * (L - 512)) / 32) * 32;
+                        for (int i = 0; i < 48; i++) prompt[npos + i] = needle[i];
+
+                        TargetCache cache;
+                        if (!create_target_cache(w, pool, 0, backend, cache, true)) return 1;
+                        KvFlashPager pager;
+                        KvFlashConfig pc; pc.pool_tokens = pool;
+                        if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1;
+
+                        BatchStepper bs;
+                        if (!bs.init(w, cache, backend, pool)) return 1;
+                        for (int p = 0; p < L; p += 64) {
+                            bs.step_chunk(prompt.data() + p, p, pager);
+                        }
+                        bs.destroy();
+
+                        Stepper st;
+                        if (!st.init(w, cache, backend, pool, true)) return 1;
+                        int32_t next = -1;
+                        for (int i = 0; i < 32; i++) {       // query: needle prefix
+                            const int slot = pager.slot_for(L + i);
+                            st.refresh_mask(pager);
+                            next = st.step(needle[i], L + i, slot);
+                        }
+                        if (pol == 1) {                       // drafter reselect
+                            std::vector<int32_t> hist = prompt;
+                            hist.insert(hist.end(), needle.begin(), needle.begin() + 32);
+                            std::vector<float> scores;
+                            if (!scorer.score_chunks(hist, pc.chunk_tokens, scores)) {
+                                std::printf("[niah] WARN: rescore failed (L=%d pool=%d)\n", L, pool);
+                            } else {
+                                pager.score_hook = [&scores](int c) {
+                                    return c < (int)scores.size() ? scores[c] : 1e30f;
+                                };
+                                pager.reselect();
+                                pager.score_hook = nullptr;
+                            }
+                        }
+                        int match = 0;
+                        for (int i = 0; i < 16; i++) {        // continuation
+                            if (next == needle[32 + i]) match++;
+                            // Teacher-force ground truth: one miss must not
+                            // cascade; we measure per-position retrieval.
+                            const int pos = L + 32 + i;
+                            const int slot = pager.slot_for(pos);
+                            st.refresh_mask(pager);
+                            next = st.step(needle[32 + i], pos, slot);
+                        }
+                        std::printf("%-7d %-6d %-8s %-6.2f %d/16\n",
+                                    L, pool, pi == 0 ? "full" : policies[pol],
+                                    depth, match);
+                        std::fflush(stdout);
+                        st.destroy();
+                        free_target_cache(cache);
+                    }
+                }
+            }
+        }
+        if (have_drafter) free_drafter(dctx);
+        free_target_weights(w);
+        ggml_backend_free(backend);
+        return 0;
+    }
+
+    const auto prompt = make_prompt(n_prompt, w.n_vocab);
+    std::vector<int32_t> tokens_a;
+    size_t mem_a_kv = 0, mem_a_buf = 0, mem_a_vram = 0;
+    size_t mem_c_kv = 0, mem_c_buf = 0, mem_c_vram = 0;
+    int hard_failures = 0;
+
+    // ── Run A: baseline at logical context, maskless ────────────────
+    {
+        const size_t v_before = vram_used_now();
+        TargetCache cache;
+        if (!create_target_cache(w, logical_ctx, 0, backend, cache, /*prefill_only=*/true)) {
+            std::fprintf(stderr, "cache A: %s\n", dflash27b_last_error());
+            return 1;
+        }
+        mem_a_kv = kv_cache_bytes(cache);
+        mem_a_buf = ggml_backend_buffer_get_size(cache.base_buf);
+        mem_a_vram = vram_used_now() - v_before;
+        std::printf("[A] logical_ctx=%d  kv=%.1f MiB  base_buf=%.1f MiB  vram_delta=%.1f MiB\n",
+                    logical_ctx, mem_a_kv / 1048576.0, mem_a_buf / 1048576.0,
+                    mem_a_vram / 1048576.0);
+
+        Stepper st;
+        int32_t next = -1;
+        const double t0 = now_ms();
+        for (int pos = 0; pos < total; pos++) {
+            // Production-like growing span: rebuild only when the padded
+            // span crosses a 256 boundary (mirrors do_ar_decode topology).
+            const int want_span = pos + 1;
+            if (!st.ctx || ((want_span + 255) / 256) != ((st.span + 255) / 256)) {
+                st.span = want_span;
+                if (!st.ctx) { if (!st.init(w, cache, backend, want_span, false)) return 1; }
+                else if (!st.build()) return 1;
+            }
+            const int32_t tok = pos < n_prompt ? prompt[pos]
+                              : (tokens_a.push_back(next), next);
+            next = st.step(tok, pos, pos);
+            cache.cur_pos = pos + 1;
+        }
+        tokens_a.push_back(next);
+        std::printf("[A] decoded %zu tokens, %.1f tok/s overall\n",
+                    tokens_a.size(), total / ((now_ms() - t0) / 1000.0));
+        st.destroy();
+        free_target_cache(cache);
+    }
+
+    // ── Run B: relocation + mask exactness, teacher-forced ──────────
+    {
+        TargetCache cache;
+        if (!create_target_cache(w, pool_b, 0, backend, cache, /*prefill_only=*/true)) {
+            std::fprintf(stderr, "cache B: %s\n", dflash27b_last_error());
+            return 1;
+        }
+        KvFlashPager pager;
+        KvFlashConfig pc;
+        pc.pool_tokens = pool_b;
+        if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1;
+        const int nb = pool_b / pc.chunk_tokens;
+        std::vector<int> order(nb);
+        for (int i = 0; i < nb; i++) order[i] = i;
+        uint64_t s = 12345;
+        for (int i = nb - 1; i > 0; i--) {
+            s = s * 6364136223846793005ull + 1442695040888963407ull;
+            const int j = (int)((s >> 33) % (uint64_t)(i + 1));
+            std::swap(order[i], order[j]);
+        }
+        pager.set_block_order(order);
+
+        Stepper st;
+        if (!st.init(w, cache, backend, pool_b, use_mask)) return 1;
+        int mismatches = 0, first_mismatch = -1;
+        for (int pos = 0; pos < total; pos++) {
+            const int32_t tok = pos < n_prompt ? prompt[pos] : tokens_a[pos - n_prompt];
+            const int slot = pager.slot_for(pos);
+            st.refresh_mask(pager);
+            const int32_t next = st.step(tok, pos, slot);
+            const int ref_idx = pos - n_prompt + 1;
+            if (pos >= n_prompt - 1 && ref_idx < (int)tokens_a.size()) {
+                if (next != tokens_a[ref_idx]) {
+                    mismatches++;
+                    if (first_mismatch < 0) first_mismatch = pos;
+                }
+            }
+        }
+        const double rate = 100.0 * mismatches / (n_gen + 1);
+        std::printf("[B] shuffled+masked, pool=%d: %d/%d argmax mismatches (%.2f%%), first at pos %d; "
+                    "mask refills=%d avg=%.3f ms\n",
+                    pool_b, mismatches, n_gen + 1, rate, first_mismatch,
+                    st.mask_fills, st.mask_fills ? st.mask_fill_ms_total / st.mask_fills : 0.0);
+        // Gate at 2%: the flip sources are the maskless zero-row softmax
+        // mass plus run-to-run fattn nondeterminism; both measured ~1%
+        // (10-14 flips/1201 across runs), so a 1% gate flaps on noise.
+        std::printf("%s relocation equivalence (threshold 2%%)\n", rate <= 2.0 ? "PASS" : "FAIL");
+        if (rate > 2.0) hard_failures++;
+        st.destroy();
+        free_target_cache(cache);
+    }
+
+    // ── Run C: live paging + roundtrip; D: reselect recall ──────────
+    {
+        const size_t v_before = vram_used_now();
+        TargetCache cache;
+        if (!create_target_cache(w, pool_c, 0, backend, cache, /*prefill_only=*/true)) {
+            std::fprintf(stderr, "cache C: %s\n", dflash27b_last_error());
+            return 1;
+        }
+        mem_c_kv = kv_cache_bytes(cache);
+        mem_c_buf = ggml_backend_buffer_get_size(cache.base_buf);
+        mem_c_vram = vram_used_now() - v_before;
+
+        KvFlashPager pager;
+        KvFlashConfig pc;
+        pc.pool_tokens = pool_c;
+        if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1;
+
+        Stepper st;
+        if (!st.init(w, cache, backend, pool_c, use_mask)) return 1;
+        int32_t next = -1;
+        for (int pos = 0; pos < n_prompt; pos++) {
+            const int slot = pager.slot_for(pos);
+            st.refresh_mask(pager);
+            next = st.step(prompt[pos], pos, slot);
+            cache.cur_pos = pos + 1;
+        }
+        {   // bit-exact roundtrip on chunk 2
+            ggml_tensor * t = cache.attn_k[0];
+            const size_t seg = (size_t)pc.chunk_tokens * t->nb[1];
+            std::vector<uint8_t> before(seg), after(seg);
+            ggml_backend_tensor_get(t, before.data(),
+                (size_t)pager.block_of(2) * pc.chunk_tokens * t->nb[1], seg);
+            if (!pager.page_out(2) || !pager.page_in(2)) {
+                std::fprintf(stderr, "roundtrip paging failed\n"); return 1;
+            }
+            ggml_backend_tensor_get(t, after.data(),
+                (size_t)pager.block_of(2) * pc.chunk_tokens * t->nb[1], seg);
+            const bool exact = std::memcmp(before.data(), after.data(), seg) == 0;
+            std::printf("%s page_out/page_in roundtrip bit-exact (chunk 2 -> block %d)\n",
+                        exact ? "PASS" : "FAIL", pager.block_of(2));
+            if (!exact) hard_failures++;
+        }
+
+        std::vector<int32_t> tokens_c;
+        const double t0 = now_ms();
+        for (int pos = n_prompt; pos < total; pos++) {
+            tokens_c.push_back(next);
+            const int slot = pager.slot_for(pos);
+            st.refresh_mask(pager);
+            next = st.step(next, pos, slot);
+            cache.cur_pos = pos + 1;
+        }
+        tokens_c.push_back(next);
+        const double secs = (now_ms() - t0) / 1000.0;
+        int agree = 0;
+        while (agree < (int)tokens_c.size() && agree < (int)tokens_a.size() &&
+               tokens_c[agree] == tokens_a[agree]) agree++;
+        const auto & ps = pager.stats();
+        std::printf("[C] pool=%d masked: decode %.1f tok/s, page_outs=%" PRId64
+                    " page_ins=%" PRId64 " host=%.1f MiB; baseline agreement %d tokens\n",
+                    pool_c, n_gen / secs, ps.page_outs, ps.page_ins,
+                    ps.host_bytes / 1048576.0, agree);
+        std::printf("PASS paged decode with eviction (%d evictions)\n", (int)ps.page_outs);
+
+        // ── Run D: τ-style reselect recall ──────────────────────────
+        {
+            int victim = -1;   // earliest paged-out, non-sink chunk
+            for (int c = pc.sink_chunks; c < pager.n_chunks(); c++) {
+                if (!pager.is_resident(c)) { victim = c; break; }
+            }
+            if (victim < 0) {
+                std::printf("FAIL reselect demo: no paged-out chunk found\n");
+                hard_failures++;
+            } else {
+                // Score injection: the victim becomes the hottest chunk —
+                // stands in for a drafter rescore flagging recalled context.
+                pager.score_hook = [&](int c) { return c == victim ? 2.0f : 1.0f / (1 + c); };
+                const double r0 = now_ms();
+                const int events = pager.reselect();
+                const double r_ms = now_ms() - r0;
+                const bool back = pager.is_resident(victim);
+                std::printf("%s reselect recalled chunk %d (%d page events, %.2f ms)\n",
+                            back ? "PASS" : "FAIL", victim, events, r_ms);
+                if (!back) hard_failures++;
+                // decode must continue cleanly after the residency change
+                pager.score_hook = nullptr;
+                for (int pos = total; pos < total + 64; pos++) {
+                    const int slot = pager.slot_for(pos);
+                    st.refresh_mask(pager);
+                    next = st.step(next, pos, slot);
+                }
+                std::printf("PASS decode continues after reselect (64 tokens)\n");
+            }
+        }
+        st.destroy();
+        free_target_cache(cache);
+    }
+
+    // ── Run F: full LSA loop — drafter as Memory Indexer ────────────
+    // Prompt LARGER than the pool, so prefill itself evicts; then the
+    // FlashMemory inference paradigm end to end: every τ=64 decoded
+    // tokens the drafter rescores the whole sequence (tail attention =
+    // indexer query), score_hook gets the fresh chunk scores, and
+    // reselect() repages the pool. PASS requires at least one genuine
+    // drafter-driven recall of a chunk evicted earlier.
+    {
+        const char * drafter_path = "/opt/lucebox/models/drafter/Qwen3-0.6B-BF16.gguf";
+        DrafterContext dctx;
+        if (!load_drafter(drafter_path, 0, dctx)) {
+            std::printf("FAIL indexer run: drafter load failed (%s)\n", dflash27b_last_error());
+            hard_failures++;
+        } else {
+            const int n_prompt_f = 2048, n_gen_f = 768, pool_f = 1024, tau = 64;
+            const auto prompt_f = make_prompt(n_prompt_f, w.n_vocab);
+            TargetCache cache;
+            if (!create_target_cache(w, pool_f, 0, backend, cache, true)) return 1;
+            KvFlashPager pager;
+            KvFlashConfig pc;
+            pc.pool_tokens = pool_f;
+            if (!pager.attach(pc, cache.attn_k, cache.attn_v)) return 1;
+            KvFlashDrafterScorer scorer(&dctx);   // the production indexer plugin
+
+            Stepper st;
+            if (!st.init(w, cache, backend, pool_f, use_mask)) return 1;
+            std::vector<int32_t> all_ids = prompt_f;
+            int32_t next = -1;
+            for (int pos = 0; pos < n_prompt_f; pos++) {
+                const int slot = pager.slot_for(pos);
+                st.refresh_mask(pager);
+                next = st.step(prompt_f[pos], pos, slot);
+            }
+            const int64_t prefill_evictions = pager.stats().page_outs;
+
+            std::vector<double> rescore_ms, reselect_ms;
+            int64_t recalls = 0;
+            std::vector<float> scores;
+            const double t0 = now_ms();
+            for (int g = 0; g < n_gen_f; g++) {
+                const int pos = n_prompt_f + g;
+                if (g % tau == 0) {
+                    double r0 = now_ms();
+                    if (!scorer.score_chunks(all_ids, pc.chunk_tokens, scores)) {
+                        std::fprintf(stderr, "scorer failed\n");
+                        std::exit(1);
+                    }
+                    rescore_ms.push_back(now_ms() - r0);
+                    pager.score_hook = [&scores](int c) {
+                        return c < (int)scores.size() ? scores[c] : 1e30f;
+                    };
+                    r0 = now_ms();
+                    const int64_t ins_before = pager.stats().page_ins;
+                    pager.reselect();
+                    reselect_ms.push_back(now_ms() - r0);
+                    recalls += pager.stats().page_ins - ins_before;
+                }
+                const int slot = pager.slot_for(pos);
+                st.refresh_mask(pager);
+                next = st.step(next, pos, slot);
+                all_ids.push_back(next);
+            }
+            const double secs = (now_ms() - t0) / 1000.0;
+            StepTimes rs = summarize(rescore_ms), rsel = summarize(reselect_ms);
+            const auto & ps = pager.stats();
+            std::printf("[F] LSA loop: prompt=%d pool=%d gen=%d tau=%d -> %.1f tok/s "
+                        "(prefill evicted %" PRId64 ")\n",
+                        n_prompt_f, pool_f, n_gen_f, tau, n_gen_f / secs, prefill_evictions);
+            std::printf("[F] indexer rescore p50=%.1f ms (full 0.6B re-prefill, %zu calls); "
+                        "reselect p50=%.2f ms; drafter-driven recalls=%" PRId64
+                        "; total page_outs=%" PRId64 " page_ins=%" PRId64 "\n",
+                        rs.p50, rescore_ms.size(), rsel.p50, recalls,
+                        ps.page_outs, ps.page_ins);
+            std::printf("%s LSA loop: drafter-driven recall of evicted context (recalls >= 1)\n",
+                        recalls >= 1 ? "PASS" : "FAIL");
+            if (recalls < 1) hard_failures++;
+            st.destroy();
+            free_target_cache(cache);
+            free_drafter(dctx);
+        }
+    }
+
+    // ── Run E: performance profile ──────────────────────────────────
+    if (!skip_prof) {
+        std::printf("\n=== DECODE PROFILE (64 timed steps each, junk KV, span = FA window) ===\n");
+        auto profile = [&](const char * tag, int alloc_ctx, int span, bool masked,   // one profile row: fresh cache, 8 warmup + 64 timed steps, prints p50/p95/mean
+                           KvFlashPager * pager, int pos_base) {                    // pager may be nullptr; pos_base is the position of the first warmup step
+            TargetCache cache;
+            if (!create_target_cache(w, alloc_ctx, 0, backend, cache, true)) {
+                std::fprintf(stderr, "cache E(%s): %s\n", tag, dflash27b_last_error());
+                std::exit(1);   // allocation failure terminates the whole process, not just this row
+            }
+            KvFlashPager local;   // lambda-scope lifetime, so `pager = &local` below stays valid for every later use
+            if (masked && !pager) {   // masked mode without a caller-supplied pager: build one over this cache
+                KvFlashConfig pc; pc.pool_tokens = alloc_ctx;
+                local.attach(pc, cache.attn_k, cache.attn_v);   // NOTE(review): return value ignored here, while Runs B/C/F check attach() — confirm intended
+                // mark whole pool resident so the mask is all-zero (worst
+                // case mask read, no -inf shortcut)
+                for (int p = 0; p < alloc_ctx; p += 64) local.slot_for(p);   // NOTE(review): stride 64 presumably equals pc.chunk_tokens — confirm
+                pager = &local;
+            }
+            Stepper st;
+            if (!st.init(w, cache, backend, span, masked)) std::exit(1);
+            // warmup 8, then time 64 (refresh included: it is part of the
+            // real per-step cost in masked mode)
+            int32_t tok = 1000;   // arbitrary seed token; each step feeds its result into the next
+            for (int i = 0; i < 8; i++) {
+                if (masked) st.refresh_mask(*pager);   // pager is non-null whenever masked is true (see block above)
+                tok = st.step(tok, pos_base + i, (i * 64) % alloc_ctx);   // warmup: slot argument strides by 64
+            }
+            std::vector<double> ms;   // per-step wall time of the timed steps, in ms
+            for (int i = 0; i < 64; i++) {
+                const double t0 = now_ms();
+                if (masked) st.refresh_mask(*pager);
+                tok = st.step(tok, pos_base + 8 + i, (8 * 64 + i) % alloc_ctx);   // timed: slot advances by 1 from 512 — NOTE(review): differs from the warmup stride; confirm intended
+                ms.push_back(now_ms() - t0);
+            }
+            const StepTimes r = summarize(ms);
+            std::printf("%-28s span=%6d  p50=%7.2f ms  p95=%7.2f ms  mean=%7.2f ms  (%5.1f tok/s)\n",
+                        tag, span, r.p50, r.p95, r.mean, 1000.0 / r.mean);   // tok/s is derived from the mean ms per step
+            st.destroy();
+            free_target_cache(cache);
+        };
+        profile("baseline 8K",   8192,   8192,   false, nullptr, 8192 - 72);
+        profile("baseline 32K",  32768,  32768,  false, nullptr, 32768 - 72);
+        profile("baseline 128K", 131072, 131072, false, nullptr, 131072 - 72);
+        profile("pool 1K masked (128K logical)", 1024, 1024, true,  nullptr, 130000);
+        profile("pool 1K maskless",              1024, 1024, false, nullptr, 130000);
+        profile("pool 4K masked (128K logical)", 4096, 4096, true,  nullptr, 130000);
+
+        // Page-event microbench on a small pool.
+        {
+            TargetCache cache;
+            if (!create_target_cache(w, 1024, 0, backend, cache, true)) std::exit(1);
+            KvFlashPager pager;
+            KvFlashConfig pc; pc.pool_tokens = 1024;
+            pager.attach(pc, cache.attn_k, cache.attn_v);
+            for (int p = 0; p < 1024; p += 64) pager.slot_for(p);
+            std::vector<double> out_ms, in_ms;
+            for (int rep = 0; rep < 32; rep++) {
+                const int c = 2 + (rep % 8);
+                double t0 = now_ms();
+                pager.page_out(c);
+                out_ms.push_back(now_ms() - t0);
+                t0 = now_ms();
+                pager.page_in(c);
+                in_ms.push_back(now_ms() - t0);
+            }
+            const StepTimes o = summarize(out_ms), i = summarize(in_ms);
+            std::printf("page_out: p50=%.2f ms  p95=%.2f ms   page_in: p50=%.2f ms  p95=%.2f ms  (per 64-token chunk, %zu KiB)\n",
+                        o.p50, o.p95, i.p50, i.p95,
+                        (size_t)(pager.stats().host_bytes / std::max<int64_t>(1, 8) / 1024));
+            free_target_cache(cache);
+        }
+    }
+
+    // ── Memory verdict ──────────────────────────────────────────────
+    const double red_kv = 100.0 * (1.0 - (double)mem_c_kv / (double)mem_a_kv);
+    std::printf("\n=== KV MEMORY ===\n");
+    std::printf("baseline  (ctx %6d): kv=%9.1f MiB  base_buf=%9.1f MiB  vram_delta=%9.1f MiB\n",
+                logical_ctx, mem_a_kv / 1048576.0, mem_a_buf / 1048576.0, mem_a_vram / 1048576.0);
+    std::printf("pooled    (pool %5d): kv=%9.1f MiB  base_buf=%9.1f MiB  vram_delta=%9.1f MiB\n",
+                pool_c, mem_c_kv / 1048576.0, mem_c_buf / 1048576.0, mem_c_vram / 1048576.0);
+    std::printf("attn-KV reduction: %.1f%%\n", red_kv);
+    std::printf("%s KV memory reduction >= 90%%\n", red_kv >= 90.0 ? "PASS" : "FAIL");
+    if (red_kv < 90.0) hard_failures++;
+
+    free_target_weights(w);
+    ggml_backend_free(backend);
+    std::printf("\n%s (%d hard failures)\n", hard_failures == 0 ? "ALL PASS" : "FAILED", hard_failures);
+    return hard_failures == 0 ? 0 : 1;
+}