Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 27 additions & 2 deletions server/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ add_library(dflash_common STATIC
src/draft/draft_gguf_loader.cpp
src/draft/draft_safetensors_loader.cpp
src/draft/draft_graph.cpp
src/qwen3/anchor_scan.cpp
src/qwen3/qwen3_drafter.cpp
src/qwen3/qwen3_loader.cpp
src/qwen3/qwen3_graph.cpp
Expand Down Expand Up @@ -292,6 +293,7 @@ add_library(dflash_common STATIC
src/server/sse_emitter.cpp
src/server/prefix_cache.cpp
src/server/disk_prefix_cache.cpp
src/server/freeze_history.cpp
# ── Jinja chat-template engine (from llama.cpp common/jinja/) ──
# Used by render_chat_template_jinja() to support --chat-template-file
# in dflash_server. Mirrors llama.cpp's common_chat_template plumbing.
Expand Down Expand Up @@ -601,6 +603,31 @@ if(DFLASH27B_TESTS)
target_link_libraries(test_anchor_params PRIVATE dflash_common)
add_test(NAME anchor_params COMMAND test_anchor_params)
endif()
# Optional unit test: only configured when its source file is present.
# Built from the single test source with src/common on the include path;
# it is not linked against dflash_common.
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_early_exit_score_range.cpp")
add_executable(test_drafter_early_exit_score_range
test/test_drafter_early_exit_score_range.cpp)
target_include_directories(test_drafter_early_exit_score_range PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src/common)
add_test(NAME test_drafter_early_exit_score_range
COMMAND test_drafter_early_exit_score_range)
endif()
# Optional unit test: only configured when its source file is present.
# src/qwen3/anchor_scan.cpp is compiled directly into the test executable
# (no link against dflash_common); src/qwen3 is added to the include path.
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_anchor_transitive.cpp")
add_executable(test_anchor_transitive
test/test_anchor_transitive.cpp
src/qwen3/anchor_scan.cpp)
target_include_directories(test_anchor_transitive PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src/qwen3)
add_test(NAME test_anchor_transitive
COMMAND test_anchor_transitive)
endif()
# Optional unit test: only configured when its source file is present.
# Built from the single test source with src/common on the include path;
# it is not linked against dflash_common.
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_warm_path_regression.cpp")
add_executable(test_drafter_warm_path_regression
test/test_drafter_warm_path_regression.cpp)
target_include_directories(test_drafter_warm_path_regression PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src/common)
add_test(NAME test_drafter_warm_path_regression
COMMAND test_drafter_warm_path_regression)
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_tail_capture_guard.cpp")
# GREEN phase: built with TAIL_GUARD_USE_NEW_FORMULA — must pass after Bug #42 fix.
add_executable(test_drafter_tail_capture_guard
Expand All @@ -613,8 +640,6 @@ if(DFLASH27B_TESTS)
add_executable(test_drafter_tail_capture_guard_red
test/test_drafter_tail_capture_guard.cpp)
# No TAIL_GUARD_USE_NEW_FORMULA — uses old (buggy) guard, expected to FAIL.
add_test(NAME test_drafter_tail_capture_guard_red COMMAND test_drafter_tail_capture_guard_red)
set_tests_properties(test_drafter_tail_capture_guard_red PROPERTIES WILL_FAIL TRUE)
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_draft_vs_reference.cpp")
add_executable(test_draft_vs_reference test/test_draft_vs_reference.cpp)
Expand Down
2 changes: 1 addition & 1 deletion server/deps/llama.cpp
31 changes: 31 additions & 0 deletions server/src/common/score_range.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// Compute [score_layer_start, score_layer_end) for tail-attention scoring.
// SCORE_LAYERS counts from the END of [0, fwd_layer_limit); any value <= 0 (e.g. -1) or >= n_layer selects all computed layers.
#pragma once

#include <algorithm>

namespace dflash::common {

// Half-open interval [start, end) of layer indices used for scoring.
struct ScoreRange {
    int start;  // first scored layer (inclusive)
    int end;    // one past the last scored layer (exclusive)

    // Number of layers in the interval (end - start).
    int count() const { return end - start; }
    // True when the interval holds no layer.
    bool empty() const { return !(start < end); }
};

// Picks the layers to score among the computed layers [0, fwd_layer_limit).
//
// When 0 < score_layers < n_layer, the last `score_layers` computed layers are
// selected (all of them if fewer were computed). Any other score_layers value
// selects every computed layer. The returned `start` never exceeds `end`, and
// `end` is always fwd_layer_limit.
inline ScoreRange compute_score_range(int n_layer, int score_layers, int fwd_layer_limit) {
    const bool tail_only = (score_layers > 0) && (score_layers < n_layer);

    int first = 0;
    if (tail_only) {
        first = fwd_layer_limit - std::min(score_layers, fwd_layer_limit);
    }
    // Keeps the interval well-formed (start <= end) for a non-positive limit.
    return ScoreRange{ std::min(first, fwd_layer_limit), fwd_layer_limit };
}

} // namespace dflash::common
8 changes: 2 additions & 6 deletions server/src/placement/draft_residency.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,8 @@ inline DraftResidencyAction resolve_draft_residency_action(

switch (ctx.use) {
case DraftResidencyUse::PFlashCompress:
// In auto mode, only release the PFlash drafter when the operator gave
// a low-VRAM hint. That preserves the existing fast resident path while
// allowing small-card setups to make room for decode draft/target state.
return ctx.low_vram_hint
? DraftResidencyAction::ReleaseAfterUse
: DraftResidencyAction::KeepLoaded;
// Auto releases the pflash drafter after scoring: resident drafter starves target prefill on 24GB cards; lazy reload costs ~2s.
return DraftResidencyAction::ReleaseAfterUse;
case DraftResidencyUse::DFlashDecode:
// DFlash draft is latency-sensitive; keep it resident unless the
// operator explicitly opted into the low-VRAM/request-scoped path.
Expand Down
12 changes: 12 additions & 0 deletions server/src/placement/skip_park_guard.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Footprint-aware guard: downgrade --prefill-skip-park on <32GB GPUs at max_ctx>65536.
#pragma once
#include <cstddef>

namespace dflash::common {

// Decides whether a requested --prefill-skip-park is honoured.
//
// Yields true only if it was requested AND either the GPU reports at least
// 32 GiB of total VRAM or max_ctx is at most 65536. Everything else yields
// false: the option was not requested, or dual-residency is considered unsafe
// (VMM VA-fragmentation risk).
inline bool skip_park_allowed(bool requested, size_t total_vram_bytes, int max_ctx) {
    constexpr unsigned long long kMinVramBytes = 32ull * 1024 * 1024 * 1024;
    constexpr int kMaxGuardedCtx = 65536;

    if (!requested) {
        return false;
    }
    if (total_vram_bytes >= kMinVramBytes) {
        return true;
    }
    return max_ctx <= kMaxGuardedCtx;
}

} // namespace dflash::common
164 changes: 164 additions & 0 deletions server/src/qwen3/anchor_scan.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
#include "anchor_scan.h"

#include <algorithm>
#include <cstdint>
#include <unordered_map>
#include <vector>

namespace dflash::qwen3 {

// Marks `chunk` and every chunk within `radius` of it as forced. The window is
// clipped to the valid index range [0, n_chunks); an empty window is a no-op.
static void force_neighborhood(std::vector<uint8_t>& forced, int n_chunks,
                               int chunk, int radius) {
    const int first = std::max(0, chunk - radius);
    const int last  = std::min(n_chunks - 1, chunk + radius);
    if (first > last) {
        return;
    }
    std::fill(forced.begin() + first, forced.begin() + last + 1, uint8_t{1});
}

void scan_and_force(
const std::vector<int32_t>& ids,
int body_end,
const std::vector<int32_t>& query_pool,
const AnchorScanCfg& cfg,
std::vector<uint8_t>& forced)
{
const int n_chunks = (int)forced.size();
const int ngram = cfg.ngram;
const int search_end = std::max(0, body_end - ngram);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1: search_end clamping to 0 causes one invalid n-gram comparison when body_end < ngram, risking out-of-bounds reads and boundary violations.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At server/src/qwen3/anchor_scan.cpp, line 27:

<comment>`search_end` clamping to 0 causes one invalid n-gram comparison when `body_end < ngram`, risking out-of-bounds reads and boundary violations.</comment>

<file context>
@@ -0,0 +1,164 @@
+{
+    const int n_chunks = (int)forced.size();
+    const int ngram    = cfg.ngram;
+    const int search_end = std::max(0, body_end - ngram);
+
+    for (int qi = 0; qi + ngram <= (int)query_pool.size(); ++qi) {
</file context>


for (int qi = 0; qi + ngram <= (int)query_pool.size(); ++qi) {
int hits = 0;
int hit_pos[8];
for (int p = 0; p <= search_end && hits <= cfg.max_anchor_hits; ++p) {
bool same = true;
for (int k = 0; k < ngram; ++k) {
if (ids[(size_t)p + k] != query_pool[(size_t)qi + k]) {
same = false;
break;
}
}
if (same) {
if (hits < 8) hit_pos[hits] = p;
++hits;
}
}
if (hits > 0 && hits <= cfg.max_anchor_hits) {
for (int i = 0; i < hits && i < 8; ++i) {
force_neighborhood(forced, n_chunks,
hit_pos[i] / cfg.chunk_size,
cfg.anchor_radius);
}
}
}
}

// Returns how many entries of `forced` are non-zero.
static int count_set(const std::vector<uint8_t>& forced) {
    const auto n_set = std::count_if(forced.begin(), forced.end(),
                                     [](uint8_t flag) { return flag != 0; });
    return static_cast<int>(n_set);
}

void scan_and_force_transitive(
const std::vector<int32_t>& ids,
int body_end,
const std::vector<int32_t>& initial_query_pool,
const AnchorScanCfg& cfg,
int max_iters,
std::vector<uint8_t>& forced)
{
auto pool = initial_query_pool;
const int n_chunks = (int)forced.size();

// Precompute token frequencies and rare-token position index.
std::unordered_map<int32_t, int> body_freq;
body_freq.reserve((size_t)body_end);
for (int j = 0; j < body_end; ++j) ++body_freq[ids[(size_t)j]];

std::unordered_map<int32_t, std::vector<int>> rare_positions;
if (cfg.rare_token_max_freq > 0) {
for (auto& kv : body_freq) {
if (kv.second <= cfg.rare_token_max_freq) {
rare_positions[kv.first] = {};
}
}
for (int p = 0; p < body_end; ++p) {
auto it = rare_positions.find(ids[(size_t)p]);
if (it != rare_positions.end()) it->second.push_back(p);
}
}

// Pass-1: initial scan; gate on cascade if enough anchors already found.
const int count_before_pass1 = count_set(forced);
scan_and_force(ids, body_end, pool, cfg, forced);
const int gained_pass1 = count_set(forced) - count_before_pass1;

if (cfg.cascade_min_anchor_count > 0 && gained_pass1 >= cfg.cascade_min_anchor_count) {
return;
}

// Cascade loop: expand pool with tokens from newly-forced chunks and re-scan.
std::vector<uint8_t> prev_forced;
for (int it = 0; it < max_iters; ++it) {
prev_forced = forced;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1: Transitive cascade loop exits early due to comparing forced against an immediately copied snapshot, so subsequent expansion/rescan iterations are skipped.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At server/src/qwen3/anchor_scan.cpp, line 103:

<comment>Transitive cascade loop exits early due to comparing `forced` against an immediately copied snapshot, so subsequent expansion/rescan iterations are skipped.</comment>

<file context>
@@ -0,0 +1,164 @@
+    // Cascade loop: expand pool with tokens from newly-forced chunks and re-scan.
+    std::vector<uint8_t> prev_forced;
+    for (int it = 0; it < max_iters; ++it) {
+        prev_forced = forced;
+
+        // Rare-token worklist: catches multi-hop cascades within a single outer iteration.
</file context>


// Rare-token worklist: catches multi-hop cascades within a single outer iteration.
if (cfg.rare_token_max_freq > 0) {
std::vector<int> worklist;
for (int c = 0; c < n_chunks; ++c) {
if (forced[c] && !prev_forced[c]) worklist.push_back(c);
}
// First iteration: seed from all pass-1 results.
if (it == 0) {
worklist.clear();
for (int c = 0; c < n_chunks; ++c) {
if (forced[c]) worklist.push_back(c);
}
}
for (int wi = 0; wi < (int)worklist.size(); ++wi) {
int c = worklist[wi];
int s = c * cfg.chunk_size;
int e = std::min(body_end, (c + 1) * cfg.chunk_size);
for (int j = s; j < e; ++j) {
auto it2 = rare_positions.find(ids[(size_t)j]);
if (it2 == rare_positions.end()) continue;
for (int p : it2->second) {
int target_c = p / cfg.chunk_size;
if (!forced[(size_t)target_c]) {
force_neighborhood(forced, n_chunks,
target_c, cfg.anchor_radius);
worklist.push_back(target_c);
}
}
}
}
}

// Hard cap: revert and stop if exceeded.
if (count_set(forced) > cfg.max_forced_count) {
forced = prev_forced;
break;
}

if (forced == prev_forced) break;

// Expand pool with tokens from newly-forced chunks, then 4-gram re-scan.
for (int c = 0; c < n_chunks; ++c) {
if (forced[c] && !prev_forced[c]) {
int s = c * cfg.chunk_size;
int e = std::min((int)ids.size(), (c + 1) * cfg.chunk_size);
for (int j = s; j < e; ++j) pool.push_back(ids[j]);
}
}

prev_forced = forced;
scan_and_force(ids, body_end, pool, cfg, forced);

if (count_set(forced) > cfg.max_forced_count) {
forced = prev_forced;
break;
}
}
}

} // namespace dflash::qwen3
42 changes: 42 additions & 0 deletions server/src/qwen3/anchor_scan.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// N-gram anchor scan: mark chunks forced by token-match between a query pool
// and the body of an ids sequence. Pure CPU, no GPU, no model required.
#pragma once

#include <climits>
#include <cstdint>
#include <vector>

namespace dflash::qwen3 {

// Tuning knobs for scan_and_force() and scan_and_force_transitive().
// The first three fields have no meaningful default and are expected to be
// set by the caller; they are zero-initialized so that a default-constructed
// config never carries indeterminate values.
struct AnchorScanCfg {
    int chunk_size = 0;                 // tokens per chunk: a match at token p maps to chunk p / chunk_size (must be > 0 to scan)
    int anchor_radius = 0;              // chunks forced on either side of a matched chunk
    int max_anchor_hits = 0;            // an n-gram counts only if it occurs 1..max_anchor_hits times in the body
    int ngram = 4;                      // number of consecutive tokens compared per match
    int rare_token_max_freq = 8;        // tokens appearing <= this many times in body count as rare
    int cascade_min_anchor_count = 0;   // skip cascade if pass-1 forced >= this many chunks (0 = always cascade)
    int max_forced_count = INT_MAX;     // hard cap on total forced chunks; checked during the cascade only, pass-1 results are not capped
};

// Marks chunks forced by ngram-matches between query_pool and ids[0..body_end).
// `forced` is in-out; new hits are OR-merged. Idempotent.
//
//   ids        - token sequence; only the body prefix is searched
//   body_end   - exclusive end of the body inside `ids`
//   query_pool - every window of cfg.ngram consecutive tokens is one query
//   cfg        - a window counts only if it matches 1..cfg.max_anchor_hits
//                times in the body; each recorded match (at most 8 per
//                window) at token p forces chunk p / cfg.chunk_size and
//                cfg.anchor_radius chunks on either side
//   forced     - one flag per chunk; its size defines the number of chunks
void scan_and_force(
const std::vector<int32_t>& ids,
int body_end,
const std::vector<int32_t>& query_pool,
const AnchorScanCfg& cfg,
std::vector<uint8_t>& forced
);

// Transitive variant: expands the query pool with tokens from newly-forced
// chunks and re-runs scan_and_force until a fixed point or max_iters reached.
//
// One plain scan_and_force pass with initial_query_pool always runs first.
// The cascade is skipped when cfg.cascade_min_anchor_count > 0 and that pass
// newly forced at least that many chunks. With cfg.rare_token_max_freq > 0
// the cascade additionally forces chunks that share a body-rare token with a
// forced chunk. A cascade step that pushes the number of forced chunks above
// cfg.max_forced_count is rolled back and ends the cascade.
//
//   max_iters - upper bound on the number of cascade rounds
//   forced    - in-out, one flag per chunk
void scan_and_force_transitive(
const std::vector<int32_t>& ids,
int body_end,
const std::vector<int32_t>& initial_query_pool,
const AnchorScanCfg& cfg,
int max_iters,
std::vector<uint8_t>& forced
);

} // namespace dflash::qwen3
9 changes: 9 additions & 0 deletions server/src/qwen3/qwen3_drafter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "qwen3/anchor_params.h"
#include "common/backend_precision.h"
#include "internal.h"
#include "anchor_scan.h"

#include "ggml.h"
#include "ggml-alloc.h"
Expand Down Expand Up @@ -65,6 +66,13 @@ static int env_int(const char * name, int fallback) {
return fallback;
}

// Reads environment variable `name` as a float via std::stof.
// Returns `def` when the variable is unset or std::stof throws (no parsable
// number, value out of range). Parsing is best-effort by design: trailing
// characters after a valid number are accepted, as std::stof does.
static float env_float(const char * name, float def) {
    const char * raw = std::getenv(name);
    if (raw == nullptr) {
        return def;
    }
    try {
        return std::stof(raw);
    } catch (...) {
        // Deliberate fallback: a malformed value must not abort startup.
        return def;
    }
}

static void force_chunk_neighborhood(std::vector<uint8_t> & forced, int n_chunks,
int chunk, int radius) {
int lo = std::max(0, chunk - radius);
Expand Down Expand Up @@ -590,6 +598,7 @@ static std::vector<int32_t> qwen35_score_and_compress(
}
}
}

for (int c = 0; c < n_chunks; ++c) {
if (forced[(size_t)c] && !selected[(size_t)c]) {
selected[(size_t)c] = 1;
Expand Down
Loading
Loading