From cc7f9e54be54dc15db3f2044444c459f244a21aa Mon Sep 17 00:00:00 2001 From: Audrey Tang Date: Fri, 15 May 2026 08:04:21 -0400 Subject: [PATCH] fix(kv-cache): guard never-hit entries from immediate self-eviction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A fresh cold-save written at the start of a new conversation is, by construction, smaller and less token-dense per byte than a mature multi-thousand-token snapshot from a long-running session. The existing `(hits + 1) * tokens / file_size` eviction score correctly captures the long-run "value per byte stored" ranking, but it has one pathology under budget pressure: on a saturated cache the new cold-save is the lowest-scoring entry the moment it is written, so the eviction loop reaps it on the same wall-clock second before its session's next turn can issue its first lookup. The cold-save infrastructure (cold-prefix boundary alignment, BPE boundary trim, SHA-keyed text lookup on subsequent requests) is then effectively dead under any real workload that uses the default 4 GB budget. Observed before this change on a multi-turn workload at the default budget pressure (2500-token initial prompt + 3 small turns): turn 1 cold: 7.1s turn 2: 6.5s turn 3: 6.5s Log on every turn: kv cache stored tokens=2048 trimmed=… reason=cold size=49.78 MiB kv cache evicted reason=disk-cache-full tokens=2048 hits=0 size=49.78 MiB i.e. each cold-save is created and reaped in the same second. Turn 2 falls back to a full cold prefill. This commit adds a narrow guard. A never-hit entry whose last_used is within KV_EVICTION_FRESH_GUARD_SECONDS (60s) gets a +INF eviction score, so the eviction loop cannot pick it until either: 1. it ages past the guard window without ever being hit, in which case it returns to normal scoring and is freely evictable; or 2. it is hit at least once, in which case the existing (hits + 1) factor already protects it. The eviction loop additionally short-circuits when victim_score == DBL_MAX: if every entry in the cache is guard-protected, no entry is evicted and the cache overruns budget transiently until the guard window expires. That excursion is bounded by the guard duration and made explicit rather than silently looping forever or evicting a guarded entry. After this change on the same workload at the same 8 GB budget that was already saturated by old snapshots: turn 1 cold: 7.1s turn 2: 2.1s turn 3: 2.0s (3.2x speedup) Log on every turn: kv cache hit text tokens=2048 text=… load=5-6 ms file=… Turns 2+ now load the just-written cold-save's 2048-token prefix from disk in ~6 ms and only prefill the new tokens beyond it, instead of re-running the full prefill from token zero. The new test test_kv_cache_eviction_guards_just_written_never_hit_entry reproduces the failure mode: an old high-density entry and a fresh low-density entry compete for a one-entry budget. Under the old score the fresh entry would be evicted; under the guard the fresh entry survives and the old entry is evicted, matching the empirical fix. The two existing eviction tests (values_fresh_snapshots, keeps_aligned_continued_frontiers) continue to pass: they use ancient last_used timestamps (100, 200, 300) so the guard window never applies and the score reverts to the original formula. Honest scope statement: this fixes single-session multi-turn under budget pressure where fresh cold-saves would otherwise self-evict before their first lookup. Concurrent-session pathologies (many guard-protected entries piling up at once, briefly exceeding budget until their windows expire) are bounded by the guard duration and accepted as a transient over-budget excursion, not an unbounded leak. Co-Authored-By: Claude Opus 4.7 (1M context) --- ds4_server.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 5 deletions(-) diff --git a/ds4_server.c b/ds4_server.c index 8e48cd91..da14009b 100644 --- a/ds4_server.c +++ b/ds4_server.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -8086,6 +8087,7 @@ static void apply_anthropic_stream_tool_ids(tool_calls *calls, #define KV_CACHE_DEFAULT_BOUNDARY_ALIGN_TOKENS 2048 #define KV_CACHE_DEFAULT_CONTINUED_INTERVAL_TOKENS 10000 #define KV_CACHE_DEFAULT_MB 4096 +#define KV_EVICTION_FRESH_GUARD_SECONDS 60u #define KV_EXT_TOOL_MAP (1u << 0) #define KV_EXT_RESPONSES_VISIBLE (1u << 1) #define KV_EXT_THINKING_VISIBLE (1u << 2) @@ -8644,7 +8646,8 @@ static void kv_cache_restore_tool_memory_for_messages(server *s, const chat_msgs id_list_free(&wanted); } -static double kv_entry_eviction_score(const kv_entry *e, const ds4_tokens *live) { +static double kv_entry_eviction_score(const kv_entry *e, const ds4_tokens *live, + uint64_t now_seconds) { if (!e || e->file_size == 0) return 0.0; (void)live; /* @@ -8652,9 +8655,28 @@ static double kv_entry_eviction_score(const kv_entry *e, const ds4_tokens *live) * it may be the only copy of the session that is about to be evicted from * RAM. Continued checkpoints are deliberately aligned restart frontiers, * so do not demote them just because they are a prefix of the live session. - * Use hits+1 for eviction value so a just-written checkpoint does not get - * deleted immediately just because its persisted hit counter is still 0. + * + * Plain `(hits + 1) * tokens / file_size` rewards entries with high token + * coverage per stored byte. That is the right long-run ranking, but it + * has one pathology: a fresh cold-save written at the start of a new + * conversation is, by construction, smaller and less token-dense per byte + * than a mature multi-thousand-token snapshot, so the formula values it + * lowest right when its lookup is about to happen. Under budget pressure + * the eviction loop then reaps it on the same wall-clock second it was + * written, before the session's next turn issues its first lookup. + * + * Guard exactly that case: a never-hit entry whose last_used (== created) + * is within KV_EVICTION_FRESH_GUARD_SECONDS gets a +INF score so the + * eviction loop can never pick it. The condition is narrow on purpose — + * any entry that has been hit once is already protected by the (hits + 1) + * factor, any entry older than the guard window is back under normal + * scoring, and any entry that ages past the window without being hit was + * never going to be hit anyway. */ + if (e->hits == 0 && now_seconds >= e->last_used && + now_seconds - e->last_used < KV_EVICTION_FRESH_GUARD_SECONDS) { + return DBL_MAX; + } return ((double)e->hits + 1.0) * (double)e->tokens / (double)e->file_size; } @@ -8663,11 +8685,12 @@ static void kv_cache_evict(kv_disk_cache *kc, const ds4_tokens *live) { kv_cache_refresh(kc); uint64_t total = 0; for (int i = 0; i < kc->len; i++) total += kc->entry[i].file_size; + const uint64_t now_seconds = (uint64_t)time(NULL); while (total > kc->budget_bytes && kc->len > 0) { int victim = 0; - double victim_score = kv_entry_eviction_score(&kc->entry[0], live); + double victim_score = kv_entry_eviction_score(&kc->entry[0], live, now_seconds); for (int i = 1; i < kc->len; i++) { - double score = kv_entry_eviction_score(&kc->entry[i], live); + double score = kv_entry_eviction_score(&kc->entry[i], live, now_seconds); if (score < victim_score || (score == victim_score && kc->entry[i].last_used < kc->entry[victim].last_used)) { @@ -8675,6 +8698,7 @@ static void kv_cache_evict(kv_disk_cache *kc, const ds4_tokens *live) { victim_score = score; } } + if (victim_score == DBL_MAX) break; /* nothing evictable; let cache overrun until guard expires */ kv_entry e = kc->entry[victim]; if (unlink(e.path) == 0) { server_log(DS4_LOG_KVCACHE, @@ -14292,6 +14316,57 @@ static void test_kv_cache_eviction_keeps_aligned_continued_frontiers(void) { rmdir(dir); } +static void test_kv_cache_eviction_guards_just_written_never_hit_entry(void) { + /* A fresh cold-save written at the start of a new conversation is, by + * construction, smaller and less token-dense per byte than a mature + * snapshot. Without the fresh-entry guard the plain `(hits+1)*tokens/size` + * score values it lowest right when its lookup is about to happen, so the + * eviction loop reaps it on the same wall-clock second it was written. + * + * This test sets up that exact pathology: an old high-density entry that + * dominates score-by-formula, and a freshly-written low-density entry that + * has the higher real-world value. Under tight budget the eviction loop + * must evict the old high-density entry, keeping the fresh one. + */ + char tmpl[] = "/tmp/ds4-kv-fresh-guard-test.XXXXXX"; + char *dir = mkdtemp(tmpl); + TEST_ASSERT(dir != NULL); + if (!dir) return; + + const char *old_sha = "1111111111111111111111111111111111111111"; + const char *fresh_sha = "2222222222222222222222222222222222222222"; + /* Old: high token density (4096 tokens packed in 4096 payload bytes), aged. */ + test_kv_stub_file(dir, old_sha, KV_REASON_UNKNOWN, 4096, 0, 100, 4096); + /* Fresh: lower token density (2048 tokens in 4096 payload bytes), just written. */ + const uint64_t now = (uint64_t)time(NULL); + test_kv_stub_file(dir, fresh_sha, KV_REASON_COLD, 2048, 0, now, 4096); + + char old_name[44], fresh_name[44]; + snprintf(old_name, sizeof(old_name), "%.40s.kv", old_sha); + snprintf(fresh_name, sizeof(fresh_name), "%.40s.kv", fresh_sha); + char *old_path = path_join(dir, old_name); + char *fresh_path = path_join(dir, fresh_name); + + kv_disk_cache kc = {0}; + kc.enabled = true; + kc.dir = xstrdup(dir); + kc.opt = kv_cache_default_options(); + /* Budget large enough for exactly one entry. */ + kc.budget_bytes = (KV_CACHE_FIXED_HEADER + 4u + 4096u) + 16u; + kv_cache_evict(&kc, NULL); + + /* Fresh entry must survive even though its plain score is lower. */ + TEST_ASSERT(access(old_path, F_OK) != 0); + TEST_ASSERT(access(fresh_path, F_OK) == 0); + + kv_cache_close(&kc); + unlink(old_path); + unlink(fresh_path); + free(old_path); + free(fresh_path); + rmdir(dir); +} + static void test_thinking_checkpoint_canonical_matches_future_prompt(void) { /* Simulate: user sends a single message, thinking mode on, no tools. * Model generates reasoning + content. The next request will drop the @@ -14629,6 +14704,7 @@ static void ds4_server_unit_tests_run(void) { test_kv_cache_lookup_uses_longest_text_prefix(); test_kv_cache_eviction_values_fresh_snapshots(); test_kv_cache_eviction_keeps_aligned_continued_frontiers(); + test_kv_cache_eviction_guards_just_written_never_hit_entry(); } #ifndef DS4_SERVER_TEST_NO_MAIN