-
Notifications
You must be signed in to change notification settings - Fork 224
compose: FlowKV aged-history compression + drafter residency fix — 1.72x vs disk-cache baseline at <=64K #372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
731561d
0efdc33
cefa3ca
6a84805
3fc6882
2ae98c0
637fbda
1c562eb
e542e90
de774d2
26e0ee3
6a58498
f8227e6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| +11 −1 | ggml/src/ggml-cuda/ggml-cuda.cu | |
| +3 −0 | tools/llama-bench/llama-bench.cpp |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,31 @@ | ||
| // Compute [score_layer_start, score_layer_end) for tail-attention scoring. | ||
| // SCORE_LAYERS counts from the END of [0, fwd_layer_limit); -1 = all computed layers. | ||
| #pragma once | ||
|
|
||
| #include <algorithm> | ||
|
|
||
namespace dflash::common {

// Half-open interval [start, end) of layer indices selected for scoring.
struct ScoreRange {
    int  start;  // inclusive
    int  end;    // exclusive
    // Number of layers in the interval (end - start).
    int  count() const { return end - start; }
    // True when the interval selects no layers.
    bool empty() const { return start >= end; }
};

// Returns the scoring layer range within [0, fwd_layer_limit).
//
//   n_layer         - total number of layers.
//   score_layers    - how many layers to score, counted back from the end of
//                     the computed layers. Values <= 0 (e.g. -1) or >= n_layer
//                     select every computed layer.
//   fwd_layer_limit - number of layers actually computed (exclusive upper bound).
//
// The limit is clamped to [0, n_layer], so a negative or oversized limit can
// never produce a negative interval or one that runs past the last layer.
inline ScoreRange compute_score_range(int n_layer, int score_layers, int fwd_layer_limit) {
    const int limit = std::max(0, std::min(fwd_layer_limit, n_layer));

    int start = 0;
    if (score_layers > 0 && score_layers < n_layer) {
        // Take the last `score_layers` computed layers, or all of them when
        // fewer than that were computed.
        start = limit - std::min(score_layers, limit);
    }
    return { start, limit };
}

} // namespace dflash::common
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,12 @@ | ||
| // Footprint-aware guard: downgrade --prefill-skip-park on <32GB GPUs at max_ctx>65536. | ||
| #pragma once | ||
| #include <cstddef> | ||
|
|
||
| namespace dflash::common { | ||
|
|
||
// Decides whether a requested skip-park may stay enabled.
// Returns false when it was not requested, or when dual-residency is unsafe
// (VMM VA-fragmentation risk): total VRAM below 32 GiB together with
// max_ctx above 65536. Returns true otherwise.
// NOTE(review): the threshold is exactly 32 GiB; a card whose reported total
// is slightly below that counts as "small" here - confirm this is intended.
inline bool skip_park_allowed(bool requested, size_t total_vram_bytes, int max_ctx) {
    if (!requested) {
        return false;
    }
    const bool large_gpu = total_vram_bytes >= 32ull*1024*1024*1024;
    const bool small_ctx = max_ctx <= 65536;
    return large_gpu || small_ctx;
}
|
|
||
| } // namespace dflash::common |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,164 @@ | ||
| #include "anchor_scan.h" | ||
|
|
||
| #include <algorithm> | ||
| #include <cstdint> | ||
| #include <unordered_map> | ||
| #include <vector> | ||
|
|
||
| namespace dflash::qwen3 { | ||
|
|
||
// Marks `chunk` and every chunk within `radius` of it in `forced`.
// The marked span is clipped to [0, n_chunks); nothing is written when the
// clipped span is empty.
static void force_neighborhood(std::vector<uint8_t>& forced, int n_chunks,
                               int chunk, int radius) {
    const int first = std::max(0, chunk - radius);
    const int last  = std::min(n_chunks - 1, chunk + radius);
    if (first > last) {
        return;
    }
    std::fill(forced.begin() + first, forced.begin() + last + 1, (uint8_t)1);
}
|
|
||
| void scan_and_force( | ||
| const std::vector<int32_t>& ids, | ||
| int body_end, | ||
| const std::vector<int32_t>& query_pool, | ||
| const AnchorScanCfg& cfg, | ||
| std::vector<uint8_t>& forced) | ||
| { | ||
| const int n_chunks = (int)forced.size(); | ||
| const int ngram = cfg.ngram; | ||
| const int search_end = std::max(0, body_end - ngram); | ||
|
|
||
| for (int qi = 0; qi + ngram <= (int)query_pool.size(); ++qi) { | ||
| int hits = 0; | ||
| int hit_pos[8]; | ||
| for (int p = 0; p <= search_end && hits <= cfg.max_anchor_hits; ++p) { | ||
| bool same = true; | ||
| for (int k = 0; k < ngram; ++k) { | ||
| if (ids[(size_t)p + k] != query_pool[(size_t)qi + k]) { | ||
| same = false; | ||
| break; | ||
| } | ||
| } | ||
| if (same) { | ||
| if (hits < 8) hit_pos[hits] = p; | ||
| ++hits; | ||
| } | ||
| } | ||
| if (hits > 0 && hits <= cfg.max_anchor_hits) { | ||
| for (int i = 0; i < hits && i < 8; ++i) { | ||
| force_neighborhood(forced, n_chunks, | ||
| hit_pos[i] / cfg.chunk_size, | ||
| cfg.anchor_radius); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
// Returns how many entries of `forced` are non-zero.
static int count_set(const std::vector<uint8_t>& forced) {
    const auto marked = std::count_if(forced.begin(), forced.end(),
                                      [](uint8_t flag) { return flag != 0; });
    return (int)marked;
}
|
|
||
| void scan_and_force_transitive( | ||
| const std::vector<int32_t>& ids, | ||
| int body_end, | ||
| const std::vector<int32_t>& initial_query_pool, | ||
| const AnchorScanCfg& cfg, | ||
| int max_iters, | ||
| std::vector<uint8_t>& forced) | ||
| { | ||
| auto pool = initial_query_pool; | ||
| const int n_chunks = (int)forced.size(); | ||
|
|
||
| // Precompute token frequencies and rare-token position index. | ||
| std::unordered_map<int32_t, int> body_freq; | ||
| body_freq.reserve((size_t)body_end); | ||
| for (int j = 0; j < body_end; ++j) ++body_freq[ids[(size_t)j]]; | ||
|
|
||
| std::unordered_map<int32_t, std::vector<int>> rare_positions; | ||
| if (cfg.rare_token_max_freq > 0) { | ||
| for (auto& kv : body_freq) { | ||
| if (kv.second <= cfg.rare_token_max_freq) { | ||
| rare_positions[kv.first] = {}; | ||
| } | ||
| } | ||
| for (int p = 0; p < body_end; ++p) { | ||
| auto it = rare_positions.find(ids[(size_t)p]); | ||
| if (it != rare_positions.end()) it->second.push_back(p); | ||
| } | ||
| } | ||
|
|
||
| // Pass-1: initial scan; gate on cascade if enough anchors already found. | ||
| const int count_before_pass1 = count_set(forced); | ||
| scan_and_force(ids, body_end, pool, cfg, forced); | ||
| const int gained_pass1 = count_set(forced) - count_before_pass1; | ||
|
|
||
| if (cfg.cascade_min_anchor_count > 0 && gained_pass1 >= cfg.cascade_min_anchor_count) { | ||
| return; | ||
| } | ||
|
|
||
| // Cascade loop: expand pool with tokens from newly-forced chunks and re-scan. | ||
| std::vector<uint8_t> prev_forced; | ||
| for (int it = 0; it < max_iters; ++it) { | ||
| prev_forced = forced; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. P1: Transitive cascade loop exits early due to comparing Prompt for AI agents |
||
|
|
||
| // Rare-token worklist: catches multi-hop cascades within a single outer iteration. | ||
| if (cfg.rare_token_max_freq > 0) { | ||
| std::vector<int> worklist; | ||
| for (int c = 0; c < n_chunks; ++c) { | ||
| if (forced[c] && !prev_forced[c]) worklist.push_back(c); | ||
| } | ||
| // First iteration: seed from all pass-1 results. | ||
| if (it == 0) { | ||
| worklist.clear(); | ||
| for (int c = 0; c < n_chunks; ++c) { | ||
| if (forced[c]) worklist.push_back(c); | ||
| } | ||
| } | ||
| for (int wi = 0; wi < (int)worklist.size(); ++wi) { | ||
| int c = worklist[wi]; | ||
| int s = c * cfg.chunk_size; | ||
| int e = std::min(body_end, (c + 1) * cfg.chunk_size); | ||
| for (int j = s; j < e; ++j) { | ||
| auto it2 = rare_positions.find(ids[(size_t)j]); | ||
| if (it2 == rare_positions.end()) continue; | ||
| for (int p : it2->second) { | ||
| int target_c = p / cfg.chunk_size; | ||
| if (!forced[(size_t)target_c]) { | ||
| force_neighborhood(forced, n_chunks, | ||
| target_c, cfg.anchor_radius); | ||
| worklist.push_back(target_c); | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Hard cap: revert and stop if exceeded. | ||
| if (count_set(forced) > cfg.max_forced_count) { | ||
| forced = prev_forced; | ||
| break; | ||
| } | ||
|
|
||
| if (forced == prev_forced) break; | ||
|
|
||
| // Expand pool with tokens from newly-forced chunks, then 4-gram re-scan. | ||
| for (int c = 0; c < n_chunks; ++c) { | ||
| if (forced[c] && !prev_forced[c]) { | ||
| int s = c * cfg.chunk_size; | ||
| int e = std::min((int)ids.size(), (c + 1) * cfg.chunk_size); | ||
| for (int j = s; j < e; ++j) pool.push_back(ids[j]); | ||
| } | ||
| } | ||
|
|
||
| prev_forced = forced; | ||
| scan_and_force(ids, body_end, pool, cfg, forced); | ||
|
|
||
| if (count_set(forced) > cfg.max_forced_count) { | ||
| forced = prev_forced; | ||
| break; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| } // namespace dflash::qwen3 | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,42 @@ | ||
| // N-gram anchor scan: mark chunks forced by token-match between a query pool | ||
| // and the body of an ids sequence. Pure CPU, no GPU, no model required. | ||
| #pragma once | ||
|
|
||
| #include <climits> | ||
| #include <cstdint> | ||
| #include <vector> | ||
|
|
||
| namespace dflash::qwen3 { | ||
|
|
||
// Configuration for the n-gram anchor scan.
// Every field has a default, so a default-constructed config never holds
// indeterminate values; chunk_size must still be set to a positive value
// before scanning.
struct AnchorScanCfg {
    int chunk_size = 0;                // tokens per chunk; body position p belongs to chunk p / chunk_size
    int anchor_radius = 0;             // chunks forced on each side of a hit chunk
    int max_anchor_hits = 0;           // an n-gram matching more body positions than this is ignored
    int ngram = 4;                     // number of consecutive tokens compared per match
    int rare_token_max_freq = 8;       // tokens appearing <= this many times in body count as rare
    int cascade_min_anchor_count = 0;  // skip cascade if pass-1 forced >= this many chunks (0 = always cascade)
    int max_forced_count = INT_MAX;    // cap on total forced chunks: a cascade step that exceeds it is
                                       // reverted; pass-1 results are kept even if they exceed it
};
|
|
||
| // Marks chunks forced by ngram-matches between query_pool and ids[0..body_end). | ||
| // `forced` is in-out; new hits are OR-merged. Idempotent. | ||
| void scan_and_force( | ||
| const std::vector<int32_t>& ids, | ||
| int body_end, | ||
| const std::vector<int32_t>& query_pool, | ||
| const AnchorScanCfg& cfg, | ||
| std::vector<uint8_t>& forced | ||
| ); | ||
|
|
||
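| // Transitive variant: expands the query pool with tokens from newly-forced | ||
| // chunks and re-runs scan_and_force until a fixed point is reached, max_iters | ||
| // rounds have run, or cfg.max_forced_count would be exceeded. The cascade is | ||
| // skipped when pass-1 alone gains cfg.cascade_min_anchor_count chunks. | ||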
| void scan_and_force_transitive( | ||
| const std::vector<int32_t>& ids, | ||
| int body_end, | ||
| const std::vector<int32_t>& initial_query_pool, | ||
| const AnchorScanCfg& cfg, | ||
| int max_iters, | ||
| std::vector<uint8_t>& forced | ||
| ); | ||
|
|
||
| } // namespace dflash::qwen3 | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
P1:
`search_end` clamping to 0 causes one invalid n-gram comparison when `body_end < ngram`, risking out-of-bounds reads and boundary violations. Prompt for AI agents