Luce-Org · davide221 · Jun 12, 2026 · Jun 10, 2026 · Jun 11, 2026 · May 27, 2026
diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
@@ -218,6 +218,7 @@ add_library(dflash_common STATIC
     src/draft/draft_gguf_loader.cpp
     src/draft/draft_safetensors_loader.cpp
     src/draft/draft_graph.cpp
+    src/qwen3/anchor_scan.cpp
     src/qwen3/qwen3_drafter.cpp
     src/qwen3/qwen3_loader.cpp
     src/qwen3/qwen3_graph.cpp
@@ -292,6 +293,7 @@ add_library(dflash_common STATIC
     src/server/sse_emitter.cpp
     src/server/prefix_cache.cpp
     src/server/disk_prefix_cache.cpp
+    src/server/freeze_history.cpp
     # ── Jinja chat-template engine (from llama.cpp common/jinja/) ──
     # Used by render_chat_template_jinja() to support --chat-template-file
     # in dflash_server. Mirrors llama.cpp's common_chat_template plumbing.
@@ -601,6 +603,31 @@ if(DFLASH27B_TESTS)
         target_link_libraries(test_anchor_params PRIVATE dflash_common)
         add_test(NAME anchor_params COMMAND test_anchor_params)
     endif()
+    # Score-range unit test: built from its single source file with src/common
+    # on the include path; no library is linked.
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_early_exit_score_range.cpp")
+        add_executable(test_drafter_early_exit_score_range test/test_drafter_early_exit_score_range.cpp)
+        target_include_directories(test_drafter_early_exit_score_range
+            PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src/common)
+        add_test(NAME test_drafter_early_exit_score_range COMMAND test_drafter_early_exit_score_range)
+    endif()
+    # Anchor-scan unit test: anchor_scan.cpp is compiled straight into the test
+    # binary, so no library is linked.
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_anchor_transitive.cpp")
+        add_executable(test_anchor_transitive test/test_anchor_transitive.cpp src/qwen3/anchor_scan.cpp)
+        target_include_directories(test_anchor_transitive
+            PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src/qwen3)
+        add_test(NAME test_anchor_transitive COMMAND test_anchor_transitive)
+    endif()
+    # Warm-path regression test: built from its single source file with
+    # src/common on the include path; no library is linked.
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_warm_path_regression.cpp")
+        add_executable(test_drafter_warm_path_regression test/test_drafter_warm_path_regression.cpp)
+        target_include_directories(test_drafter_warm_path_regression
+            PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src/common)
+        add_test(NAME test_drafter_warm_path_regression COMMAND test_drafter_warm_path_regression)
+    endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_tail_capture_guard.cpp")
         # GREEN phase: built with TAIL_GUARD_USE_NEW_FORMULA — must pass after Bug #42 fix.
         add_executable(test_drafter_tail_capture_guard
@@ -613,8 +640,6 @@ if(DFLASH27B_TESTS)
         add_executable(test_drafter_tail_capture_guard_red
             test/test_drafter_tail_capture_guard.cpp)
         # No TAIL_GUARD_USE_NEW_FORMULA — uses old (buggy) guard, expected to FAIL.
-        add_test(NAME test_drafter_tail_capture_guard_red COMMAND test_drafter_tail_capture_guard_red)
-        set_tests_properties(test_drafter_tail_capture_guard_red PROPERTIES WILL_FAIL TRUE)
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_draft_vs_reference.cpp")
         add_executable(test_draft_vs_reference test/test_draft_vs_reference.cpp)

diff --git a/server/deps/llama.cpp b/server/deps/llama.cpp
diff --git a/server/src/common/score_range.h b/server/src/common/score_range.h
@@ -0,0 +1,31 @@
+// Compute [score_layer_start, score_layer_end) for tail-attention scoring.
+// SCORE_LAYERS counts from the END of [0, fwd_layer_limit); -1 (or any value <= 0 or >= n_layer) = all computed layers.
+#pragma once
+
+#include <algorithm>
+
+namespace dflash::common {
+
+// Half-open interval [start, end) of layer indices used for scoring.
+struct ScoreRange {
+    int start; // first layer scored (inclusive)
+    int end;   // one past the last layer scored (exclusive)
+    // Number of layers covered by the interval.
+    int count() const { return end - start; }
+    // True when the interval covers no layer at all.
+    bool empty() const { return !(start < end); }
+};
+
+// Picks the layers to score out of the computed prefix [0, fwd_layer_limit).
+//   n_layer         - only decides whether score_layers is a real restriction
+//   score_layers    - number of trailing layers to score; values <= 0 or
+//                     >= n_layer select every computed layer
+//   fwd_layer_limit - exclusive end of the computed layers
+// The result always has start <= end, and end is always fwd_layer_limit.
+inline ScoreRange compute_score_range(int n_layer, int score_layers, int fwd_layer_limit) {
+    const bool restrict_to_tail = score_layers > 0 && score_layers < n_layer;
+    // Never request more trailing layers than were actually computed.
+    int first = restrict_to_tail
+        ? fwd_layer_limit - std::min(score_layers, fwd_layer_limit)
+        : 0;
+    // Keeps start <= end even when the limit itself is negative.
+    first = std::min(first, fwd_layer_limit);
+    return ScoreRange{ first, fwd_layer_limit };
+}
+
+} // namespace dflash::common
diff --git a/server/src/placement/draft_residency.h b/server/src/placement/draft_residency.h
@@ -71,12 +71,8 @@ inline DraftResidencyAction resolve_draft_residency_action(
 
     switch (ctx.use) {
     case DraftResidencyUse::PFlashCompress:
-        // In auto mode, only release the PFlash drafter when the operator gave
-        // a low-VRAM hint. That preserves the existing fast resident path while
-        // allowing small-card setups to make room for decode draft/target state.
-        return ctx.low_vram_hint
-            ? DraftResidencyAction::ReleaseAfterUse
-            : DraftResidencyAction::KeepLoaded;
+        // Auto releases the pflash drafter after scoring: resident drafter starves target prefill on 24GB cards; lazy reload costs ~2s.
+        return DraftResidencyAction::ReleaseAfterUse;
     case DraftResidencyUse::DFlashDecode:
         // DFlash draft is latency-sensitive; keep it resident unless the
         // operator explicitly opted into the low-VRAM/request-scoped path.

diff --git a/server/src/placement/skip_park_guard.h b/server/src/placement/skip_park_guard.h
@@ -0,0 +1,12 @@
+// Footprint-aware guard: downgrade --prefill-skip-park on <32GB GPUs at max_ctx>65536.
+#pragma once
+#include <cstddef>
+
+namespace dflash::common {
+
+// Decides whether a requested --prefill-skip-park may stay enabled.
+// Returns true only when it was requested AND either the GPU reports at least
+// 32 GiB of total memory or max_ctx is at most 65536; false in every other
+// case (including when it was never requested).
+// NOTE(review): the threshold is exactly 32 * 2^30 bytes; confirm that the
+// total reported for nominal "32GB" cards actually reaches it.
+inline bool skip_park_allowed(bool requested, size_t total_vram_bytes, int max_ctx) {
+    if (!requested) {
+        return false;
+    }
+    const bool roomy_gpu     = total_vram_bytes >= 32ull*1024*1024*1024;
+    const bool short_context = max_ctx <= 65536;
+    return roomy_gpu || short_context;
+}
+
+}  // namespace dflash::common
diff --git a/server/src/qwen3/anchor_scan.cpp b/server/src/qwen3/anchor_scan.cpp
@@ -0,0 +1,164 @@
+#include "anchor_scan.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <unordered_map>
+#include <vector>
+
+namespace dflash::qwen3 {
+
+// Sets forced[c] = 1 for every chunk c in [chunk - radius, chunk + radius],
+// with the window clipped to the valid chunk range [0, n_chunks).
+static void force_neighborhood(std::vector<uint8_t>& forced, int n_chunks,
+                                int chunk, int radius) {
+    const int last = std::min(chunk + radius, n_chunks - 1);
+    int cur = std::max(chunk - radius, 0);
+    // A window that lies entirely outside the chunk range leaves cur > last.
+    while (cur <= last) {
+        forced[(size_t)cur] = 1;
+        ++cur;
+    }
+}
+
+// Marks every chunk (plus its anchor_radius neighborhood) in which a body
+// n-gram that also occurs in `query_pool` starts.
+//
+//   ids        - token sequence; only ids[0..body_end) is searched
+//   body_end   - exclusive end of the searchable body (clamped to ids.size())
+//   query_pool - tokens whose cfg.ngram-long windows are looked up in the body
+//   cfg        - ngram, chunk_size, anchor_radius and max_anchor_hits are read
+//   forced     - in-out chunk flags, one entry per chunk; hits are OR-merged
+//
+// A window that matches more than cfg.max_anchor_hits body positions is
+// treated as too common and forces nothing.  Degenerate input (ngram,
+// chunk_size or max_anchor_hits <= 0, or a body shorter than one n-gram)
+// leaves `forced` untouched.
+void scan_and_force(
+    const std::vector<int32_t>& ids,
+    int body_end,
+    const std::vector<int32_t>& query_pool,
+    const AnchorScanCfg& cfg,
+    std::vector<uint8_t>& forced)
+{
+    const int ngram = cfg.ngram;
+    // chunk_size is a divisor below; a non-positive hit budget can never force.
+    if (ngram <= 0 || cfg.chunk_size <= 0 || cfg.max_anchor_hits <= 0) return;
+
+    const int n_chunks = (int)forced.size();
+    // Never trust body_end beyond the tokens that actually exist.
+    const int body = std::min(body_end, (int)ids.size());
+    // Last start position at which a whole n-gram still fits inside the body.
+    const int last_start = body - ngram;
+    if (last_start < 0) return;
+    const int pool_n = (int)query_pool.size();
+
+    // Reused across windows; holds every hit, so none is dropped no matter
+    // how large max_anchor_hits is.
+    std::vector<int> hit_pos;
+    for (int qi = 0; qi + ngram <= pool_n; ++qi) {
+        const auto needle = query_pool.begin() + qi;
+        hit_pos.clear();
+        bool too_common = false;
+        for (int p = 0; p <= last_start; ++p) {
+            if (!std::equal(needle, needle + ngram, ids.begin() + p)) continue;
+            // One hit over budget is enough to reject the window; stop early.
+            if ((int)hit_pos.size() == cfg.max_anchor_hits) {
+                too_common = true;
+                break;
+            }
+            hit_pos.push_back(p);
+        }
+        if (too_common) continue;
+        for (int p : hit_pos) {
+            force_neighborhood(forced, n_chunks,
+                               p / cfg.chunk_size,
+                               cfg.anchor_radius);
+        }
+    }
+}
+
+// Returns how many entries of `forced` are non-zero.
+static int count_set(const std::vector<uint8_t>& forced) {
+    const auto is_set = [](uint8_t flag) { return flag != 0; };
+    return (int)std::count_if(forced.begin(), forced.end(), is_set);
+}
+
+// Transitive anchor scan: after an initial scan_and_force pass, repeatedly
+// (1) links forced chunks to other chunks through rare tokens and
+// (2) appends the tokens of newly-forced chunks to the query pool and
+//     re-scans, until nothing new is forced, max_iters rounds have run, or
+//     cfg.max_forced_count would be exceeded.
+//
+//   ids                - token sequence; the body is ids[0..body_end)
+//   body_end           - exclusive end of the body (clamped to [0, ids.size()])
+//   initial_query_pool - tokens used for the first scan; copied, never modified
+//   cfg                - see AnchorScanCfg; chunk_size <= 0 makes this a no-op
+//   max_iters          - upper bound on cascade rounds
+//   forced             - in-out chunk flags, one entry per chunk
+//
+// Chunks already set in `forced` on entry are not added to the pool; only
+// chunks forced during this call expand it.  A step that pushes the number of
+// forced chunks above cfg.max_forced_count is rolled back and ends the cascade.
+void scan_and_force_transitive(
+    const std::vector<int32_t>& ids,
+    int body_end,
+    const std::vector<int32_t>& initial_query_pool,
+    const AnchorScanCfg& cfg,
+    int max_iters,
+    std::vector<uint8_t>& forced)
+{
+    // chunk_size is used as a divisor throughout.
+    if (cfg.chunk_size <= 0) return;
+
+    const int n_chunks = (int)forced.size();
+    // Clamp so that no index below can leave `ids`.
+    const int body = std::max(0, std::min(body_end, (int)ids.size()));
+    const bool rare_enabled = cfg.rare_token_max_freq > 0;
+
+    // Body positions of every token that occurs at most rare_token_max_freq
+    // times in the body.  Only built when the rare-token pass is enabled.
+    std::unordered_map<int32_t, std::vector<int>> rare_positions;
+    if (rare_enabled) {
+        std::unordered_map<int32_t, int> body_freq;
+        body_freq.reserve((size_t)body);
+        for (int j = 0; j < body; ++j) ++body_freq[ids[(size_t)j]];
+        for (const auto& kv : body_freq) {
+            if (kv.second <= cfg.rare_token_max_freq) rare_positions[kv.first];
+        }
+        for (int p = 0; p < body; ++p) {
+            auto slot = rare_positions.find(ids[(size_t)p]);
+            if (slot != rare_positions.end()) slot->second.push_back(p);
+        }
+    }
+
+    auto pool = initial_query_pool;
+
+    // Chunks whose tokens are already accounted for in `pool`.  Taken before
+    // pass-1 so that the chunks pass-1 finds are fed into the pool by the
+    // first cascade round.
+    std::vector<uint8_t> pooled = forced;
+
+    // Pass-1: initial scan; skip the cascade if it already found enough.
+    scan_and_force(ids, body, pool, cfg, forced);
+    const int gained_pass1 = count_set(forced) - count_set(pooled);
+    if (cfg.cascade_min_anchor_count > 0 && gained_pass1 >= cfg.cascade_min_anchor_count) {
+        return;
+    }
+
+    // Chunks the next rare-token pass starts from: everything forced so far
+    // for the first round, afterwards whatever the previous re-scan added.
+    std::vector<int> rare_seeds;
+    if (rare_enabled) {
+        for (int c = 0; c < n_chunks; ++c) {
+            if (forced[(size_t)c]) rare_seeds.push_back(c);
+        }
+    }
+
+    for (int it = 0; it < max_iters; ++it) {
+        const std::vector<uint8_t> round_start = forced;
+
+        // Rare-token worklist: follows multi-hop links within a single round,
+        // because every chunk it forces is appended to the same worklist.
+        if (rare_enabled) {
+            std::vector<int> worklist;
+            worklist.swap(rare_seeds);
+            for (size_t wi = 0; wi < worklist.size(); ++wi) {
+                const int c = worklist[wi];
+                const int s = c * cfg.chunk_size;
+                const int e = std::min(body, (c + 1) * cfg.chunk_size);
+                for (int j = s; j < e; ++j) {
+                    const auto link = rare_positions.find(ids[(size_t)j]);
+                    if (link == rare_positions.end()) continue;
+                    for (int p : link->second) {
+                        const int target_c = p / cfg.chunk_size;
+                        // The body may extend past the last tracked chunk.
+                        if (target_c >= n_chunks) continue;
+                        if (!forced[(size_t)target_c]) {
+                            force_neighborhood(forced, n_chunks,
+                                               target_c, cfg.anchor_radius);
+                            worklist.push_back(target_c);
+                        }
+                    }
+                }
+            }
+        }
+
+        // Hard cap: undo this round's rare-token gains and stop.
+        if (count_set(forced) > cfg.max_forced_count) {
+            forced = round_start;
+            break;
+        }
+
+        // Expand the pool with every chunk forced since the last expansion.
+        bool pool_grew = false;
+        for (int c = 0; c < n_chunks; ++c) {
+            if (!forced[(size_t)c] || pooled[(size_t)c]) continue;
+            pool_grew = true;
+            const int s = c * cfg.chunk_size;
+            const int e = std::min((int)ids.size(), (c + 1) * cfg.chunk_size);
+            for (int j = s; j < e; ++j) pool.push_back(ids[(size_t)j]);
+        }
+        if (!pool_grew) break;  // fixed point: nothing new to search for
+
+        // `pooled` doubles as the pre-scan snapshot for rollback and for
+        // working out what the re-scan added.
+        pooled = forced;
+        scan_and_force(ids, body, pool, cfg, forced);
+
+        if (count_set(forced) > cfg.max_forced_count) {
+            forced = pooled;
+            break;
+        }
+
+        // Hand the chunks this re-scan added to the next rare-token pass.
+        if (rare_enabled) {
+            for (int c = 0; c < n_chunks; ++c) {
+                if (forced[(size_t)c] && !pooled[(size_t)c]) rare_seeds.push_back(c);
+            }
+        }
+    }
+}
+
+} // namespace dflash::qwen3
diff --git a/server/src/qwen3/anchor_scan.h b/server/src/qwen3/anchor_scan.h
@@ -0,0 +1,42 @@
+// N-gram anchor scan: mark chunks forced by token-match between a query pool
+// and the body of an ids sequence.  Pure CPU, no GPU, no model required.
+#pragma once
+
+#include <climits>
+#include <cstdint>
+#include <vector>
+
+namespace dflash::qwen3 {
+
+// Tuning knobs shared by scan_and_force and scan_and_force_transitive.
+// The first three fields have no default and must be set by the caller.
+struct AnchorScanCfg {
+    int chunk_size;                     // tokens per chunk: token position p belongs to chunk p / chunk_size
+    int anchor_radius;                  // chunks forced on each side of a hit chunk (clipped to the chunk range)
+    int max_anchor_hits;                // an n-gram matching more body positions than this forces nothing
+    int ngram = 4;                      // length, in tokens, of the windows compared between pool and body
+    int rare_token_max_freq = 8;        // tokens appearing <= this many times in body count as rare (<= 0 disables the rare-token pass)
+    int cascade_min_anchor_count = 0;   // skip cascade if pass-1 forced >= this many chunks (0 = always cascade)
+    int max_forced_count = INT_MAX;     // hard cap on total forced chunks; a cascade step exceeding it is rolled back
+};
+
+// Marks chunks forced by ngram-matches between query_pool and ids[0..body_end).
+// `forced` is in-out; new hits are OR-merged.  Idempotent.
+//
+//   ids        - token sequence that contains the body
+//   body_end   - exclusive end of the body within ids
+//   query_pool - tokens whose cfg.ngram-long windows are searched for in the body
+//   cfg        - reads ngram, chunk_size, anchor_radius and max_anchor_hits
+//   forced     - one flag per chunk; its size defines the number of chunks
+//
+// Each matching body position forces the chunk containing it together with
+// cfg.anchor_radius chunks on either side.  A window that matches more than
+// cfg.max_anchor_hits positions forces nothing.
+void scan_and_force(
+    const std::vector<int32_t>& ids,
+    int body_end,
+    const std::vector<int32_t>& query_pool,
+    const AnchorScanCfg& cfg,
+    std::vector<uint8_t>& forced
+);
+
+// Transitive variant: expands the query pool with tokens from newly-forced
+// chunks and re-runs scan_and_force until a fixed point or max_iters reached.
+//
+//   ids                - token sequence that contains the body
+//   body_end           - exclusive end of the body within ids
+//   initial_query_pool - pool used for the first scan; the function works on a copy
+//   cfg                - in addition to the fields scan_and_force reads:
+//                          rare_token_max_freq > 0 enables linking chunks that
+//                            share a rare token;
+//                          cascade_min_anchor_count > 0 returns right after the
+//                            first scan if it newly forced at least that many chunks;
+//                          max_forced_count bounds the total: a cascade step that
+//                            exceeds it is undone and ends the cascade
+//   max_iters          - upper bound on cascade rounds after the first scan
+//   forced             - one flag per chunk, in-out
+void scan_and_force_transitive(
+    const std::vector<int32_t>& ids,
+    int body_end,
+    const std::vector<int32_t>& initial_query_pool,
+    const AnchorScanCfg& cfg,
+    int max_iters,
+    std::vector<uint8_t>& forced
+);
+
+} // namespace dflash::qwen3
diff --git a/server/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp
@@ -18,6 +18,7 @@
 #include "qwen3/anchor_params.h"
 #include "common/backend_precision.h"
 #include "internal.h"
+#include "anchor_scan.h"
 
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -65,6 +66,13 @@ static int env_int(const char * name, int fallback) {
     return fallback;
 }
 
+// Reads environment variable `name` as a float.
+// Returns `def` when the variable is unset or std::stof rejects its value
+// (any exception from the conversion is swallowed on purpose).
+static float env_float(const char * name, float def) {
+    const char * raw = std::getenv(name);
+    if (raw == nullptr) {
+        return def;
+    }
+    try {
+        return std::stof(raw);
+    } catch (...) {
+        // Best-effort parse: fall back to the default instead of failing.
+        return def;
+    }
+}
+
 static void force_chunk_neighborhood(std::vector<uint8_t> & forced, int n_chunks,
                                      int chunk, int radius) {
     int lo = std::max(0, chunk - radius);
@@ -590,6 +598,7 @@ static std::vector<int32_t> qwen35_score_and_compress(
             }
         }
     }
+
     for (int c = 0; c < n_chunks; ++c) {
         if (forced[(size_t)c] && !selected[(size_t)c]) {
             selected[(size_t)c] = 1;
+11 −1		ggml/src/ggml-cuda/ggml-cuda.cu
+3 −0		tools/llama-bench/llama-bench.cpp