From 403e598dfd20d17f0ad8ad91149ec961fc8ebe11 Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Tue, 26 May 2026 12:07:58 -0400 Subject: [PATCH] feat(cpp-server): thinking-budget v2 + multi-dialect reasoning + model-card sidecars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds per-request thinking-budget controls, multi-dialect reasoning emission, and JSON model-card sidecars to the native C++ dflash server. ## Thinking-budget mechanism - Level 2 BudgetHook: backend AR / spec-decode injects the model's `</think>` close-token sequence once when `n_gen − committed ≤ hard_limit_reply_budget`. KV-continuous, mid-stream, in-process — applies uniformly to streaming and non-streaming requests. - Degenerate-decode watchdog: detects post-close repetition / runaway decode and aborts cleanly, surfacing the flag in finish_details and bench output. - Sidecar `thinking_terminator_hint` field: per-family close-token text (e.g. `</think>\n\n` for qwen3, `<channel|>\n\n` for gemma4) resolved at startup from the model card; `think_close_token_ids` is populated by tokenizing the hint, so the BudgetHook is family-aware without hardcoded archs. - Resolution order documented in spec §3: CLI > sidecar > family fallback > hard fallback. Hard fallback `hard_limit_reply_budget` defaults to 4096 (raised from 512); terse models should override down via sidecar. ## Multi-dialect reasoning emission - SSE emitter splits reasoning ↔ content for OpenAI Chat (`reasoning_content` delta), Anthropic (thinking/text content blocks with separate lifecycle), and Responses API (reasoning stripped per Codex r1 P2). - Qwen3.6 `<think>` / `</think>` special token ids are forwarded as text into the emitter; gemma4 `<|channel>` / `<channel|>` are mapped onto the same channel so all archs share one state machine. 
- `first_content_token_index()` derives the natural-close split from the REASONING→CONTENT transition; leading `<think>` opener is detected before fci capture so thinking_tokens accounts correctly for the Qwen3.6 streamed-thinking path. ## /props endpoint - Wholesale model_card (verbatim sidecar JSON, validates against share/model_cards/_schema.json) + budget_envelope (effective think_max_tokens, default_max_tokens, hard_limit_reply_budget, effort_tiers, model_card_source label) + runtime fields (chunk, target_device, draft_device, speculative_enabled, fa_window, ddtree_budget, kv_cache_k/v, runtime_backend). Captured by server_main at startup so the handler doesn't crack BackendArgs. ## Model-card sidecars - share/model_cards/{qwen3.6-27b,gemma-4-26b-a4b-it,gemma-4-31b-it, laguna-xs.2}.json — each ships max_tokens, complex_problem_max_tokens, hard_limit_reply_budget, thinking_terminator_hint, sampling defaults, and reasoning_effort_tiers as applicable. - share/model_cards/_schema.json — JSON Schema for sidecar validation, exercised by server_main loader and shipped for third-party authors. - model_card.cpp resolver: keyword + family-fallback lookup, with startup banner reporting the resolved `model_card_source` so operators can confirm which envelope is in force. ## Spec docs - docs/specs/thinking-budget.md — mechanism, resolution order, CLI surface, finish_details/close_kind contract. - docs/specs/model-cards.md — sidecar field reference, family fallback table, ship/override guidance. - docs/specs/props-endpoint.md + docs/specs/openapi-props.yaml — shape and OpenAPI 3 schema for the /props payload. ## Tests - test_server_unit gains coverage of the SSE emitter reasoning split (OpenAI / Anthropic / Responses), first_content_token_index across the natural-close, never-closed, content-only, and Qwen3.6 streamed-thinking paths, /props body shape (wholesale sidecar + family-fallback null), and usage.timings across all three response shapes. 1542 assertions, 0 failures. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dflash/CMakeLists.txt | 5 +- dflash/src/common/model_backend.h | 48 ++ dflash/src/gemma4/gemma4_backend.cpp | 149 +++- dflash/src/gemma4/gemma4_backend.h | 18 +- dflash/src/laguna/laguna_backend.cpp | 88 ++ dflash/src/qwen35/qwen35_backend.cpp | 228 +++++- dflash/src/qwen35/qwen35_backend.h | 29 +- dflash/src/server/chat_template.cpp | 78 +- dflash/src/server/http_server.cpp | 775 +++++++++++++++--- dflash/src/server/http_server.h | 110 ++- dflash/src/server/model_card.cpp | 362 +++++++++ dflash/src/server/model_card.h | 119 +++ dflash/src/server/prefix_cache.cpp | 35 +- dflash/src/server/prefix_cache.h | 41 + dflash/src/server/server_main.cpp | 324 +++++++- dflash/src/server/sse_emitter.cpp | 92 ++- dflash/src/server/sse_emitter.h | 67 +- dflash/src/server/tool_memory.h | 12 + dflash/test/test_server_unit.cpp | 482 ++++++++++- docs/specs/model-cards.md | 392 +++++++++ docs/specs/openapi-props.yaml | 929 ++++++++++++++++++++++ docs/specs/props-endpoint.md | 650 +++++++++++++++ docs/specs/thinking-budget.md | 577 ++++++++++++++ share/model_cards/README.md | 77 ++ share/model_cards/_schema.json | 85 ++ share/model_cards/gemma-4-26b-a4b-it.json | 23 + share/model_cards/gemma-4-31b-it.json | 23 + share/model_cards/laguna-xs.2.json | 19 + share/model_cards/qwen3.6-27b.json | 24 + 29 files changed, 5672 insertions(+), 189 deletions(-) create mode 100644 dflash/src/server/model_card.cpp create mode 100644 dflash/src/server/model_card.h create mode 100644 docs/specs/model-cards.md create mode 100644 docs/specs/openapi-props.yaml create mode 100644 docs/specs/props-endpoint.md create mode 100644 docs/specs/thinking-budget.md create mode 100644 share/model_cards/README.md create mode 100644 share/model_cards/_schema.json create mode 100644 share/model_cards/gemma-4-26b-a4b-it.json create mode 100644 share/model_cards/gemma-4-31b-it.json create mode 100644 share/model_cards/laguna-xs.2.json create mode 100644 
share/model_cards/qwen3.6-27b.json diff --git a/dflash/CMakeLists.txt b/dflash/CMakeLists.txt index 0fcfc7382..f8e81f837 100644 --- a/dflash/CMakeLists.txt +++ b/dflash/CMakeLists.txt @@ -655,6 +655,7 @@ if(DFLASH27B_TESTS) add_executable(dflash_server src/server/server_main.cpp src/server/http_server.cpp + src/server/model_card.cpp ) target_include_directories(dflash_server PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) if(DFLASH27B_GPU_BACKEND STREQUAL "hip") @@ -720,7 +721,9 @@ if(DFLASH27B_TESTS) if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_server_unit.cpp") add_executable(test_server_unit test/test_server_unit.cpp) - target_sources(test_server_unit PRIVATE src/server/http_server.cpp) + target_sources(test_server_unit PRIVATE + src/server/http_server.cpp + src/server/model_card.cpp) target_include_directories(test_server_unit PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) if(DFLASH27B_GPU_BACKEND STREQUAL "hip") target_compile_definitions(test_server_unit PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP) diff --git a/dflash/src/common/model_backend.h b/dflash/src/common/model_backend.h index 4b3105757..9238f2fa3 100644 --- a/dflash/src/common/model_backend.h +++ b/dflash/src/common/model_backend.h @@ -46,6 +46,39 @@ struct DaemonIO { // ─── Generate request/result ──────────────────────────────────────────── +// Thinking-budget force-close hook. Mirrors antirez/ds4 ds4_eval.c's +// hard_limit_reply_budget semantics: when the budget remaining (n_gen +// minus tokens committed so far) falls to hard_limit_remaining, the +// next sampled tokens get overridden with close_token_ids in order, +// giving the model the remaining budget to write a visible answer +// after the injected close-tag sequence. +// +// Single vs multi-token close: +// Qwen3.6: </think> is one added_token (id 248069). close_token_ids +// has size 1. One override + budget_close_injected=true. +// DeepSeek/laguna: </think> tokenizes to 3 ordinary tokens +// ([1718, 37947, 32] for DS-V3). close_token_ids has +// size 3. 
Three consecutive overrides, then resume. +// +// This is "Level 2" of our thinking-budget migration: in-process +// mid-stream force-close, KV-continuous. Beats Level 1's phase-2 +// reprompt because the model never sees a fresh prefill — its KV +// state continues naturally after the injected close. +// +// Current implementation: AR-decode only. When budget_hook is set, +// backends MAY route generation through their AR path (skipping spec +// decode) — the perf trade-off is acceptable since this only kicks in +// for thinking-enabled requests. Spec-decode integration is a follow-up. +struct BudgetHook { + // Multi-token close sequence injected when `(n_gen - committed)` + // drops to `hard_limit_remaining`. For Qwen3.x this is the + // canonical "Considering the limited time..." summarize-and-stop + // lead-in (tokenized at server startup); for non-qwen arches it's + // a single close-tag token. Empty = hook disabled. + std::vector close_token_ids; + int hard_limit_remaining = 0; +}; + struct GenerateRequest { std::vector prompt; int n_gen = 0; @@ -65,6 +98,8 @@ struct GenerateRequest { // When non-null, the spec decode loop uses these as draft overrides, // bypassing draft model computation for covered positions. const std::vector * hint_tokens = nullptr; + // Optional thinking-budget hook — see BudgetHook docs above. + BudgetHook budget_hook; }; struct GenerateResult { @@ -73,6 +108,19 @@ struct GenerateResult { std::vector tokens; double prefill_s = 0.0; double decode_s = 0.0; + // True when the backend's Level 2 hook injected the close + // sequence during this generation (vs. the model self-closing). The + // server uses this to attribute close_kind correctly: if the model + // produced </think> naturally we report "natural"; if the hook fired + // we report "hard". Without this flag, decoding the phase-1 token + // stream and grepping for "</think>" cannot distinguish the two + // (the injected close decodes identically). 
+ bool budget_forced_close = false; + // True iff the AR decode loop's post-close watchdog detected an n-gram + // repetition loop and broke out early. Caller surfaces this so clients + // can mark the answer as unreliable rather than treating the + // (truncated) content as a clean response. + bool degenerate_decode_close = false; }; // ─── Backend interface ────────────────────────────────────────────────── diff --git a/dflash/src/gemma4/gemma4_backend.cpp b/dflash/src/gemma4/gemma4_backend.cpp index cd4befa13..19b708f67 100644 --- a/dflash/src/gemma4/gemma4_backend.cpp +++ b/dflash/src/gemma4/gemma4_backend.cpp @@ -261,14 +261,84 @@ int Gemma4Backend::do_prefill(const std::vector & tokens, bool Gemma4Backend::do_decode(int committed, int n_gen, std::vector & out_tokens, - const DaemonIO & io) { + const DaemonIO & io, + const BudgetHook & budget_hook, + bool * forced_close_out) { const int hidden = w_.n_embd; const int vocab = w_.n_vocab; std::vector embed_buf(hidden); std::vector logits; + // Budget force-close state — same shape as qwen35's maybe_force_close. + // See dflash/src/common/model_backend.h BudgetHook docs for the + // single- vs multi-token close-tag semantics. + bool budget_close_started = false; + int close_inject_pos = 0; + // Capture entry KV position so the budget check is in the + // "generated since entry" frame, not the absolute KV frame. + // committed_now (KV position) = prompt_len + tokens_generated; n_gen + // is the gen-only count (or remaining-budget remap from the + // spec-decode tail-off). Without this entry capture, force-close + // fires prompt_len tokens early on prompted requests and goes + // negative immediately after a tail-off. (Mirrors qwen35 fix 5c785f0 + // — same bug since this lambda was ported verbatim.) 
+ const int committed_at_entry = committed; + auto maybe_force_close = [&](int32_t & tok, int committed_now) { + if (budget_hook.close_token_ids.empty()) return; + if (budget_close_started && + close_inject_pos < (int)budget_hook.close_token_ids.size()) + { + int32_t inj = budget_hook.close_token_ids[close_inject_pos]; + std::fprintf(stderr, + "[budget-hook] gemma4 close-seq continue %d/%zu: overriding " + "sampled token %d with %d\n", + close_inject_pos + 1, + budget_hook.close_token_ids.size(), tok, inj); + tok = inj; + close_inject_pos++; + return; + } + if (budget_close_started) return; + const int generated = committed_now - committed_at_entry; + int remaining = n_gen - generated; + if (remaining <= budget_hook.hard_limit_remaining) { + int32_t first_close = budget_hook.close_token_ids.front(); + if (tok == first_close) { + budget_close_started = true; + close_inject_pos = 1; + return; + } + std::fprintf(stderr, + "[budget-hook] gemma4 force-close at committed=%d/%d " + "(remaining=%d <= hard_limit=%d): overriding token %d " + "with close[0]=%d (seq len %zu)\n", + committed_now, n_gen, remaining, + budget_hook.hard_limit_remaining, tok, first_close, + budget_hook.close_token_ids.size()); + tok = first_close; + budget_close_started = true; + close_inject_pos = 1; + if (forced_close_out) *forced_close_out = true; + } + }; + for (int i = 0; i < n_gen; ++i) { - int32_t tok = out_tokens.back(); + // Seed for this iteration's embed step: + // - Normal case: previous iteration just pushed a sampled + // token onto out_tokens; we re-embed it to advance KV + + // produce next-token logits. + // - Empty case (spec-decode tail-off at iter 0): no prior + // iteration ran, so use cache_.last_tok — that's the + // prefill argmax that spec-decode would have consumed as + // its initial seed. Mirrors qwen35's initial_emitted=1 + // pattern; without this, out_tokens.back() on an empty + // vector is UB. 
(Codex r2 P2 follow-up: the previous fix + // pushed last_tok onto out_tokens here in the caller, but + // that grew out_tokens by an uncounted extra token and the + // caller's `result.tokens.size()` over-counted against the + // budget. Reading from cache instead keeps the budget + // honest.) + int32_t tok = out_tokens.empty() ? cache_.last_tok : out_tokens.back(); // Embed single token w_.embedder.embed(&tok, 1, embed_buf.data()); @@ -292,6 +362,7 @@ bool Gemma4Backend::do_decode(int committed, int n_gen, if (logits[j] > best) { best = logits[j]; next = j; } } } + maybe_force_close(next, committed); out_tokens.push_back(next); io.emit(next); @@ -310,7 +381,9 @@ bool Gemma4Backend::do_decode(int committed, int n_gen, bool Gemma4Backend::do_spec_decode(int committed, int n_gen, std::vector & out_tokens, - const DaemonIO & io) { + const DaemonIO & io, + const BudgetHook * budget_hook, + bool * forced_close_out) { const int hidden = w_.n_embd; int32_t last_tok = cache_.last_tok; @@ -336,6 +409,60 @@ bool Gemma4Backend::do_spec_decode(int committed, int n_gen, while (n_generated < n_gen) { const int need_commit_budget = n_gen - n_generated; + // Budget tail-off: when remaining budget is within one spec-decode + // batch of the force-close threshold, hand off to do_decode for the + // tail. AR handles the close-token override cleanly; spec-decode's + // verify-and-accept loop can't safely inject a token mid-batch + // without rewriting KV. + // + // Gemma4's do_decode reads `out_tokens.back()` as the seed each + // iter. After the first spec-decode iteration the most-recently- + // committed token is on out_tokens, but on a small-budget request + // (budget_tokens <= reply_budget + q_len) tail-off can fire on + // iter 0 before out_tokens has been seeded. Codex review flagged + // the resulting UB on out_tokens.back(); we set cache_.last_tok + // and let do_decode pick it up when out_tokens is empty. 
+ // + // Budget accounting (codex r2 P2): in the previous patch we + // also push_back'd last_tok before calling do_decode, which + // grew out_tokens by an extra token outside the budget — the + // caller (http_server) then saw `result.tokens.size() == + // need_commit_budget + 1` and double-counted that seed against + // the budget. Mirror qwen35 instead: cache the seed via + // cache_.last_tok, leave out_tokens untouched, and have + // do_decode read the seed from cache when out_tokens is empty + // (initial_emitted=1 path below). That keeps the budget honest + // and matches the symmetry between qwen35 and gemma4 backends. + if (budget_hook && !budget_hook->close_token_ids.empty()) { + int hard = budget_hook->hard_limit_remaining; + if (need_commit_budget <= hard + q_len) { + std::fprintf(stderr, + "[budget-hook] gemma4 spec-decode tail-off at " + "committed=%d/%d (remaining=%d, hard_limit=%d, " + "batch=%d) — switching to AR\n", + committed, n_gen, need_commit_budget, hard, q_len); + step_graph_destroy(draft_sg); + cache_.last_tok = last_tok; // do_decode reads this when out_tokens empty + BudgetHook tail_hook = *budget_hook; + int ar_n_gen = need_commit_budget; + bool ok = do_decode(committed, ar_n_gen, out_tokens, io, + tail_hook, forced_close_out); + auto t_dec1 = std::chrono::steady_clock::now(); + const double decode_s = std::chrono::duration(t_dec1 - t_dec0).count(); + const int total_draft_pos = std::max(1, n_draft_steps * q_len); + const double accept_pct = 100.0 * (double)n_accept_sum / (double)total_draft_pos; + std::fprintf(stderr, + "[gemma4-spec] tail-off-stats tokens=%d time=%.3f s " + "speed=%.2f tok/s steps=%d accepted=%d/%d (%.1f%%)\n", + n_generated, decode_s, + n_generated > 0 ? n_generated / decode_s : 0.0, + n_draft_steps, n_accept_sum, total_draft_pos, + accept_pct); + io.emit(-1); + return ok; + } + } + // 1. 
Build noise input: [last_tok, MASK, MASK, ..., MASK] noise_ids[0] = last_tok; for (int i = 1; i < q_len; i++) noise_ids[i] = target->mask_token_id(); @@ -529,7 +656,9 @@ GenerateResult Gemma4Backend::generate(const GenerateRequest & req, && !sampler_.needs_logit_processing(); if (can_spec) { - if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io)) { + if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io, + &req.budget_hook, + &result.budget_forced_close)) { result.error = "spec_decode"; return result; } @@ -578,7 +707,9 @@ GenerateResult Gemma4Backend::generate(const GenerateRequest & req, } if (req.n_gen > 1) { - if (!do_decode(committed, req.n_gen - 1, result.tokens, out_io)) { + if (!do_decode(committed, req.n_gen - 1, result.tokens, out_io, + req.budget_hook, + &result.budget_forced_close)) { result.error = "decode"; return result; } @@ -694,7 +825,9 @@ GenerateResult Gemma4Backend::restore_and_generate(int slot, && sampler_.temp == 0.0f; if (can_spec) { - if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io)) { + if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io, + &req.budget_hook, + &result.budget_forced_close)) { result.error = "spec_decode"; return result; } @@ -743,7 +876,9 @@ GenerateResult Gemma4Backend::restore_and_generate(int slot, } if (req.n_gen > 1) { - if (!do_decode(committed, req.n_gen - 1, result.tokens, out_io)) { + if (!do_decode(committed, req.n_gen - 1, result.tokens, out_io, + req.budget_hook, + &result.budget_forced_close)) { result.error = "decode"; return result; } diff --git a/dflash/src/gemma4/gemma4_backend.h b/dflash/src/gemma4/gemma4_backend.h index 35da51dea..4a92a607d 100644 --- a/dflash/src/gemma4/gemma4_backend.h +++ b/dflash/src/gemma4/gemma4_backend.h @@ -105,14 +105,28 @@ class Gemma4Backend : public ModelBackend { int kv_offset = 0); // Autoregressive decode loop. 
+ // budget_hook (when close_token_ids is non-empty) overrides the next + // sampled token(s) with the close-tag sequence once (n_gen - committed) + // <= hard_limit. Mirrors qwen35's do_ar_decode. For Gemma4 the close + // tag is typically `` (single token in the gemma4 vocab). + // forced_close_out, when non-null, is set to true iff the hook injected + // the close sequence (vs. the model self-closing). See qwen35_backend.h + // for full rationale. bool do_decode(int committed, int n_gen, std::vector & out_tokens, - const DaemonIO & io); + const DaemonIO & io, + const BudgetHook & budget_hook = {}, + bool * forced_close_out = nullptr); // DFlash speculative decode loop. + // When budget_hook is non-null and (n_gen - generated) falls within + // hard_limit + batch headroom, breaks out and tails via do_decode so + // the force-close override fires cleanly with KV state intact. bool do_spec_decode(int committed, int n_gen, std::vector & out_tokens, - const DaemonIO & io); + const DaemonIO & io, + const BudgetHook * budget_hook = nullptr, + bool * forced_close_out = nullptr); }; } // namespace dflash::common diff --git a/dflash/src/laguna/laguna_backend.cpp b/dflash/src/laguna/laguna_backend.cpp index ca64ab943..d6108e4e0 100644 --- a/dflash/src/laguna/laguna_backend.cpp +++ b/dflash/src/laguna/laguna_backend.cpp @@ -214,9 +214,54 @@ GenerateResult LagunaBackend::generate(const GenerateRequest & req, int next_tok = pick(last_logits); result.tokens.reserve(req.n_gen); + // Budget force-close state — see model_backend.h BudgetHook docs. + // Mirrors qwen35/do_ar_decode's maybe_force_close. Laguna has no + // spec-decode path so this is the only override site. 
+ const BudgetHook & budget_hook = req.budget_hook; + bool budget_close_started = false; + int close_inject_pos = 0; + auto maybe_force_close = [&](int32_t & tok, int committed_now) { + if (budget_hook.close_token_ids.empty()) return; + if (budget_close_started && + close_inject_pos < (int)budget_hook.close_token_ids.size()) + { + int32_t inj = budget_hook.close_token_ids[close_inject_pos]; + std::fprintf(stderr, + "[budget-hook] laguna close-seq continue %d/%zu: overriding " + "sampled token %d with %d\n", + close_inject_pos + 1, + budget_hook.close_token_ids.size(), tok, inj); + tok = inj; + close_inject_pos++; + return; + } + if (budget_close_started) return; + int remaining = req.n_gen - committed_now; + if (remaining <= budget_hook.hard_limit_remaining) { + int32_t first_close = budget_hook.close_token_ids.front(); + if (tok == first_close) { + budget_close_started = true; + close_inject_pos = 1; + return; + } + std::fprintf(stderr, + "[budget-hook] laguna force-close at committed=%d/%d " + "(remaining=%d <= hard_limit=%d): overriding token %d " + "with close[0]=%d (seq len %zu)\n", + committed_now, req.n_gen, remaining, + budget_hook.hard_limit_remaining, tok, first_close, + budget_hook.close_token_ids.size()); + tok = first_close; + budget_close_started = true; + close_inject_pos = 1; + result.budget_forced_close = true; + } + }; + std::vector embed_step((size_t)w_.n_embd); auto t_g0 = std::chrono::steady_clock::now(); for (int s = 0; s < req.n_gen; ++s) { + maybe_force_close(next_tok, s); if (next_tok == w_.eos_id || next_tok == w_.eos_chat_id) break; result.tokens.push_back(next_tok); history.push_back(next_tok); @@ -307,9 +352,52 @@ GenerateResult LagunaBackend::restore_and_generate(int slot, }; int next_tok = pick(last_logits); + + const BudgetHook & budget_hook = req.budget_hook; + bool budget_close_started = false; + int close_inject_pos = 0; + auto maybe_force_close = [&](int32_t & tok, int committed_now) { + if (budget_hook.close_token_ids.empty()) 
return; + if (budget_close_started && + close_inject_pos < (int)budget_hook.close_token_ids.size()) + { + int32_t inj = budget_hook.close_token_ids[close_inject_pos]; + std::fprintf(stderr, + "[budget-hook] laguna(restore) close-seq continue %d/%zu: " + "overriding sampled token %d with %d\n", + close_inject_pos + 1, + budget_hook.close_token_ids.size(), tok, inj); + tok = inj; + close_inject_pos++; + return; + } + if (budget_close_started) return; + int remaining = req.n_gen - committed_now; + if (remaining <= budget_hook.hard_limit_remaining) { + int32_t first_close = budget_hook.close_token_ids.front(); + if (tok == first_close) { + budget_close_started = true; + close_inject_pos = 1; + return; + } + std::fprintf(stderr, + "[budget-hook] laguna(restore) force-close at " + "committed=%d/%d (remaining=%d <= hard_limit=%d): " + "overriding token %d with close[0]=%d (seq len %zu)\n", + committed_now, req.n_gen, remaining, + budget_hook.hard_limit_remaining, tok, first_close, + budget_hook.close_token_ids.size()); + tok = first_close; + budget_close_started = true; + close_inject_pos = 1; + result.budget_forced_close = true; + } + }; + std::vector embed_step((size_t)w_.n_embd); auto t_g0 = std::chrono::steady_clock::now(); for (int s = 0; s < req.n_gen; ++s) { + maybe_force_close(next_tok, s); if (next_tok == w_.eos_id || next_tok == w_.eos_chat_id) break; history.push_back(next_tok); result.tokens.push_back(next_tok); diff --git a/dflash/src/qwen35/qwen35_backend.cpp b/dflash/src/qwen35/qwen35_backend.cpp index 8f72e8f16..c4b4da62c 100644 --- a/dflash/src/qwen35/qwen35_backend.cpp +++ b/dflash/src/qwen35/qwen35_backend.cpp @@ -550,7 +550,16 @@ GenerateResult Qwen35Backend::generate(const GenerateRequest & req, // Decode (speculative) if (req.n_gen > 0) { auto t_decode_start = std::chrono::steady_clock::now(); - if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io, req.hint_tokens)) { + // Pass the budget hook into spec-decode. 
When token count nears + // the budget edge, do_spec_decode breaks out and tails off via + // AR with the hook still active — force-close fires correctly + // without sacrificing spec-decode throughput for the bulk of + // generation. Most requests never hit the tail because the + // model closes naturally well before the budget edge. + if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io, + req.hint_tokens, &req.budget_hook, + &result.budget_forced_close, + &result.degenerate_decode_close)) { result.error = "decode"; return result; } @@ -611,7 +620,16 @@ GenerateResult Qwen35Backend::restore_and_generate(int slot, // Decode if (req.n_gen > 0) { auto t_decode_start = std::chrono::steady_clock::now(); - if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io, req.hint_tokens)) { + // Pass the budget hook into spec-decode. When token count nears + // the budget edge, do_spec_decode breaks out and tails off via + // AR with the hook still active — force-close fires correctly + // without sacrificing spec-decode throughput for the bulk of + // generation. Most requests never hit the tail because the + // model closes naturally well before the budget edge. + if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io, + req.hint_tokens, &req.budget_hook, + &result.budget_forced_close, + &result.degenerate_decode_close)) { result.error = "decode"; return result; } @@ -767,9 +785,96 @@ int Qwen35Backend::do_prefill(const std::vector & tokens, bool Qwen35Backend::do_ar_decode(int committed, int n_gen, std::vector & out_tokens, - const DaemonIO & io) { + const DaemonIO & io, + const BudgetHook & budget_hook, + bool * forced_close_out, + bool * degenerate_close_out) { + // Budget hook state. + // - budget_close_started: true once we've begun injecting the close + // sequence. Prevents re-triggering on continued forward generation. + // - close_inject_pos: index into budget_hook.close_token_ids for the + // NEXT token to inject. 
While < close_token_ids.size(), each + // iteration overrides the sampled token with the corresponding + // close-sequence token (single-token close = 1 override and done; + // multi-token close like DeepSeek/laguna [1718,37947,32] = 3 + // consecutive overrides). Once equal to close_token_ids.size(), + // normal sampling resumes (model writes visible answer). + bool budget_close_started = false; + int close_inject_pos = 0; + // Capture entry KV position so the budget check is in the + // "generated since entry" frame, not the absolute KV frame. + // n_gen is the gen-only count (or the remaining-budget remap done by + // spec-decode tail-off); subtracting committed_now (absolute KV = + // prompt_len + tokens generated this call) directly would treat + // prompt-length tokens as if they were generated output, firing + // force-close prompt_len tokens early on prompted requests and + // potentially going negative after spec-decode tail-off. + const int committed_at_entry = committed; + auto maybe_force_close = [&](int32_t & tok, int committed_now) { + if (budget_hook.close_token_ids.empty()) return; + + // Continue an already-started multi-token close sequence. + if (budget_close_started && + close_inject_pos < (int)budget_hook.close_token_ids.size()) + { + int32_t inj = budget_hook.close_token_ids[close_inject_pos]; + std::fprintf(stderr, + "[budget-hook] close-seq continue %d/%zu: overriding " + "sampled token %d with %d\n", + close_inject_pos + 1, + budget_hook.close_token_ids.size(), tok, inj); + tok = inj; + close_inject_pos++; + return; + } + + // Already injected the full sequence — no further overrides. + if (budget_close_started) return; + + // Check if budget has tightened to the force-close trigger. + // generated = tokens produced in THIS do_ar_decode call; + // remaining = budget headroom, measured against n_gen (the + // requested gen count or tail-off remap, never against the + // absolute KV position which would mis-count the prompt). 
+ const int generated = committed_now - committed_at_entry; + int remaining = n_gen - generated; + if (remaining <= budget_hook.hard_limit_remaining) { + // Don't trigger if the model already sampled the first close + // token naturally — avoids a redundant override. + int32_t first_close = budget_hook.close_token_ids.front(); + if (tok == first_close) { + // Model self-closed at the boundary; consume that token + // as the first of the sequence so we still inject the + // remaining members (multi-token case) but don't double-emit. + budget_close_started = true; + close_inject_pos = 1; + std::fprintf(stderr, + "[budget-hook] model self-emitted close[0]=%d at " + "committed=%d/%d (remaining=%d <= hard_limit=%d); " + "consuming as start of close sequence (%zu total)\n", + first_close, committed_now, n_gen, remaining, + budget_hook.hard_limit_remaining, + budget_hook.close_token_ids.size()); + return; + } + std::fprintf(stderr, + "[budget-hook] force-close at committed=%d/%d (remaining=%d " + "<= hard_limit=%d): overriding sampled token %d with close[0]=%d " + "(seq len %zu)\n", + committed_now, n_gen, remaining, + budget_hook.hard_limit_remaining, tok, first_close, + budget_hook.close_token_ids.size()); + tok = first_close; + budget_close_started = true; + close_inject_pos = 1; + if (forced_close_out) *forced_close_out = true; + } + }; if (n_gen <= 0) return true; + auto t_dec0_ar = std::chrono::steady_clock::now(); + const size_t out_tokens_at_entry = out_tokens.size(); + const int hidden = w_.n_embd; const int vocab = w_.n_vocab; std::vector logits_buf(vocab); @@ -779,7 +884,17 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen, // First token: consume the final prefill position. Do not derive this // offset from committed/KV position: restore paths can prefill a delta at // nonzero KV offsets, and committed then no longer describes chunk size. - { + // + // Continuation mode: when out_tokens is non-empty, a previous decode + // path (e.g. 
spec-decode tail-off) already committed tokens and emitted + // them. Skip the first-token block — `committed` and `cache_.last_tok` + // are already pointing at the most recently committed token, and the + // main loop below uses out_tokens.back() as the embed input which IS + // that token. Without this skip we'd duplicate the last token in + // out_tokens, double-emit it, and advance committed past the actual + // KV state. + const int initial_emitted = out_tokens.empty() ? 1 : 0; + if (initial_emitted == 1) { int32_t first_tok; if (sampler_.needs_logit_processing()) { if (!prefill_last_logits_valid_) return false; @@ -790,6 +905,7 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen, } else { first_tok = cache_.last_tok; } + maybe_force_close(first_tok, committed); out_tokens.push_back(first_tok); io.emit(first_tok); if (IS_EOS_TOK(first_tok, w_)) return true; @@ -798,7 +914,7 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen, } // AR decode loop for remaining tokens - for (int i = 1; i < n_gen; i++) { + for (int i = initial_emitted; i < n_gen; i++) { int32_t tok = out_tokens.back(); if (!w_.embedder.embed(&tok, 1, embed_buf)) return false; @@ -833,6 +949,8 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen, } } + maybe_force_close(next_tok, committed); + out_tokens.push_back(next_tok); io.emit(next_tok); committed++; @@ -840,7 +958,53 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen, if (io.cancelled) break; if (IS_EOS_TOK(next_tok, w_)) break; + + // Degenerate-decode watchdog. Once we're past the budget-hook's + // close sequence (model in post-`` content phase), watch + // for repetition loops. The aime2025-02 case at think_max=4k + // produces a ~50-token phrase that repeats verbatim until + // max_tokens — pure waste. + // + // Sweep several common loop periods. For each period P we check + // if the last P tokens equal the previous P tokens (one full + // repeat). 
One match is enough; the model has already burned 2P + // tokens at that point and isn't getting out. The minimum-3 + // bar would catch tighter cycles but waits ~3P tokens to fire, + // which is wasteful for P ≥ 32. Periods are tuned to common + // failure modes: short loops (16-24) for "we have X, X, X" + // patterns, longer (48-64) for full-sentence restates like the + // aime02 case. + if (budget_close_started && close_inject_pos >= (int)budget_hook.close_token_ids.size()) + { + // Sweep contiguous periods 12..80. Any P where the last P + // tokens equal the previous P tokens means a loop of that + // period. Stop early on first match. Fixed periods missed + // the aime02 case which loops with period ~50; dense sweep + // covers any period in this range. + auto end = out_tokens.end(); + const int avail = (int)out_tokens.size(); + for (int P = 12; P <= 80; P++) { + if (avail < 2 * P) break; // larger P also won't have data + if (std::equal(end - 2*P, end - P, end - P)) { + std::fprintf(stderr, + "[degenerate-decode] post-close period=%d repeated — " + "breaking AR loop at committed=%d, content_tokens=%zu\n", + P, committed, + out_tokens.size() - out_tokens_at_entry); + if (degenerate_close_out) *degenerate_close_out = true; + goto degenerate_break; + } + } + } + if (false) { degenerate_break: break; } } + + auto t_dec1_ar = std::chrono::steady_clock::now(); + const double ar_decode_s = std::chrono::duration(t_dec1_ar - t_dec0_ar).count(); + const int ar_tokens = (int)(out_tokens.size() - out_tokens_at_entry); + std::fprintf(stderr, "[ar-decode] tokens=%d time=%.3f s speed=%.2f tok/s\n", + ar_tokens, ar_decode_s, + ar_tokens > 0 && ar_decode_s > 0 ? 
ar_tokens / ar_decode_s : 0.0); return true; } @@ -882,7 +1046,10 @@ bool Qwen35Backend::sync_remote_draft_features(int start_pos, int n_tokens) { bool Qwen35Backend::do_spec_decode(int committed, int n_gen, std::vector & out_tokens, const DaemonIO & io, - const std::vector * hint_tokens) { + const std::vector * hint_tokens, + const BudgetHook * budget_hook, + bool * forced_close_out, + bool * degenerate_close_out) { const int hidden = w_.n_embd; // First token: use the argmax that do_prefill already sampled and stored. @@ -906,8 +1073,11 @@ bool Qwen35Backend::do_spec_decode(int committed, int n_gen, if (!can_spec) { // AR fallback consumes the final prefill position itself, then advances - // one token at a time. - bool ok = do_ar_decode(committed, n_gen, out_tokens, io); + // one token at a time. Pass the budget hook through so force-close + // still fires when spec-decode is unavailable. + bool ok = do_ar_decode(committed, n_gen, out_tokens, io, + budget_hook ? *budget_hook : BudgetHook{}, + forced_close_out, degenerate_close_out); io.emit(-1); return ok; } @@ -939,6 +1109,48 @@ bool Qwen35Backend::do_spec_decode(int committed, int n_gen, while (n_generated < n_gen) { const int need_commit_budget = n_gen - n_generated; + // Budget tail-off: when remaining budget is within the spec-decode + // batch size of the force-close threshold, hand off to AR for the + // tail. AR handles the close-token override cleanly; spec-decode's + // verify-and-accept loop can't safely inject a token mid-batch + // without a KV-state rewrite. + // + // IMPORTANT: cache_.last_tok is set during do_prefill (line 701) + // and NEVER updated by the spec-decode commit loop — local + // `last_tok` here is the authoritative most-recently-committed + // token. Sync it into cache_.last_tok before handing off so AR's + // `first_tok = cache_.last_tok` seed is correct. 
Without this + // sync, AR would re-seed from the prefill's last argmax (stale + // by `n_generated` positions) and produce garbage continuation. + if (budget_hook && !budget_hook->close_token_ids.empty()) { + int hard = budget_hook->hard_limit_remaining; + // Tail when remaining <= hard + one spec-decode batch worth of + // headroom. Ensures the force-close fires within the AR tail + // rather than after a final spec-decode batch overshoots. + if (need_commit_budget <= hard + q_len) { + std::fprintf(stderr, + "[budget-hook] spec-decode tail-off at committed=%d/%d " + "(remaining=%d, hard_limit=%d, batch=%d) — switching to AR\n", + committed, n_gen, need_commit_budget, hard, q_len); + step_graph_destroy(draft_sg); + cache_.last_tok = last_tok; // sync spec-decode → AR seed + // Build a fresh hook keyed off this call's local n_gen + // (the remaining decode budget) so force-close fires once + // remaining <= hard_limit relative to AR's loop counter, + // not relative to the global decode budget. Without this + // remap force-close would either fire on every iter + // (negative remaining_for_hook) or never (positive but + // misscaled). + BudgetHook tail_hook = *budget_hook; + int ar_n_gen = need_commit_budget; + bool ok = do_ar_decode(committed, ar_n_gen, out_tokens, io, + tail_hook, forced_close_out, + degenerate_close_out); + io.emit(-1); + return ok; + } + } + // 1. Build noise input for draft noise_ids[0] = last_tok; for (int i = 1; i < q_len; i++) noise_ids[i] = target->mask_token_id(); diff --git a/dflash/src/qwen35/qwen35_backend.h b/dflash/src/qwen35/qwen35_backend.h index 277a8d3d5..c3031abbb 100644 --- a/dflash/src/qwen35/qwen35_backend.h +++ b/dflash/src/qwen35/qwen35_backend.h @@ -176,15 +176,40 @@ class Qwen35Backend : public ModelBackend { int kv_offset = 0); // Speculative decode loop: draft → verify → accept until EOS/max. 
+ // When budget_hook is non-null and (n_gen - generated) drops to the + // hard-limit boundary, breaks out of the spec-decode loop and tails + // off via do_ar_decode so the force-close override fires cleanly + // with KV state intact. Spec-decode itself can't safely inject the + // close token mid-batch (verify-and-accept assumes the sampled + // tokens are the ones that got committed), so the boundary switch + // is the simplest correct integration. bool do_spec_decode(int committed, int n_gen, std::vector & out_tokens, const DaemonIO & io, - const std::vector * hint_tokens = nullptr); + const std::vector * hint_tokens = nullptr, + const BudgetHook * budget_hook = nullptr, + bool * forced_close_out = nullptr, + bool * degenerate_close_out = nullptr); // AR decode fallback (no draft model or sampling mode). + // budget_hook (when close_token_ids is non-empty) overrides the next + // sampled token(s) with the close-tag sequence once (n_gen - generated) + // <= hard_limit. For Qwen3.x, close_token_ids is the canonical + // "Considering the limited time..." summarize-and-stop lead-in (24 + // tokens including the closing think tag); for non-qwen arches it's a single + // close-tag token. Mirrors the trained pathway documented in the + // Qwen3 technical report (arXiv 2505.09388). + // forced_close_out, when non-null, is set to true iff the hook injected + // the close sequence (vs. the model self-closing at the boundary). The + // server uses this to attribute close_kind=hard correctly — decoding + // the token stream and grepping for the closing think tag cannot distinguish an + // injected close from a natural one because the bytes are identical.
bool do_ar_decode(int committed, int n_gen, std::vector & out_tokens, - const DaemonIO & io); + const DaemonIO & io, + const BudgetHook & budget_hook = {}, + bool * forced_close_out = nullptr, + bool * degenerate_close_out = nullptr); bool sync_remote_draft_features(int start_pos, int n_tokens); diff --git a/dflash/src/server/chat_template.cpp b/dflash/src/server/chat_template.cpp index 72e7bcd87..199253bdc 100644 --- a/dflash/src/server/chat_template.cpp +++ b/dflash/src/server/chat_template.cpp @@ -131,6 +131,16 @@ std::string render_chat_template( // Qwen3 thinking disabled: inject closed think block so the // model skips reasoning and generates the answer directly. result += "\n\n\n\n"; + } else { + // Qwen3.6 enable_thinking: pre-open the thinking block so the + // model actually enters reasoning mode. Verified against the + // official Qwen3.6 chat_template.jinja: + // enable_thinking=true → suffix `assistant\n\n` + // enable_thinking=false → suffix `assistant\n\n\n\n\n` + // Without this prefix, Qwen3.6 stays in non-thinking mode + // even when the client opts in, defeating the thinking-budget + // mechanism entirely. + result += "\n"; } } break; @@ -163,17 +173,63 @@ std::string render_chat_template( } case ChatFormat::GEMMA4: { - // Gemma4 format: - // <|turn>user\n{msg}\n<|turn>model\n - // System messages are prepended to the first user message. + // Gemma4 format (see the chat template embedded in the GGUF + // metadata of google/gemma-4-26B-A4B-it): + // + // + // <|turn>system + // [<|think|>\n ← if enable_thinking] + // {system content} + // + // <|turn>user + // {msg} + // <|turn>model + // [<|channel>thought\n ← if NOT enable_thinking] + // + // The trailing channel-thought guard is the same trick Qwen3 + // uses (`\n\n\n\n`): when thinking is disabled + // we pre-fill an empty thought channel so the model SKIPS + // emitting its own. 
Without this, Gemma4 self-emits + // `<|channel>thought\n…` which then partially leaks + // into the visible content because the channel tokens were + // never opened from the prompt side. + // + // The `<|think|>` opener at the start of the system turn is + // the inverse: it signals "this conversation is in thinking + // mode" so the model's channel sequence routes to reasoning. + const bool has_system = !messages.empty() && messages[0].role == "system"; + const bool emit_system_turn = enable_thinking || has_system || has_tools; result = ""; - std::string system_content; + size_t start_idx = 0; - if (!messages.empty() && messages[0].role == "system") { + std::string system_content; + if (has_system) { system_content = messages[0].content; start_idx = 1; } + // System turn — emitted when there's actual system content OR + // we need somewhere to put the `<|think|>` opener. + if (emit_system_turn) { + result += "<|turn>system\n"; + if (enable_thinking) { + // Per the GGUF chat template: "Inject Thinking token at + // the very top of the FIRST system turn". + result += "<|think|>\n"; + } + if (!system_content.empty()) { + result += system_content; + } + // TODO: tool definitions block (`<|tool>…`) goes here + // when tools_json is non-empty. Out of scope for the + // budget-signaling fix. + (void)tools_json; + result += "\n"; + } + + // User/assistant turns. Unlike the previous implementation we + // don't prepend system content to the first user message — the + // system turn above already carries it (or there isn't one). for (size_t i = start_idx; i < messages.size(); i++) { const auto & msg = messages[i]; std::string role = msg.role; @@ -182,16 +238,18 @@ std::string render_chat_template( result += "<|turn>"; result += role; result += '\n'; - // Inject system content at the start of the first user message. 
- if (i == start_idx && !system_content.empty() && msg.role == "user") { - result += system_content; - result += "\n\n"; - } result += msg.content; result += "\n"; } if (add_generation_prompt) { result += "<|turn>model\n"; + if (!enable_thinking) { + // Empty thought-channel guard: model will skip its own + // `<|channel>thought…` block since this one + // already sits in the prompt. Matches the GGUF + // template's "if not enable_thinking" branch. + result += "<|channel>thought\n"; + } } break; } diff --git a/dflash/src/server/http_server.cpp b/dflash/src/server/http_server.cpp index f339c52bb..6da5a5138 100644 --- a/dflash/src/server/http_server.cpp +++ b/dflash/src/server/http_server.cpp @@ -24,6 +24,35 @@ namespace dflash::common { +// ─── /props constants ─────────────────────────────────────────────────── +// +// SERVER_NAME / SERVER_VERSION mirror the Python server's identity strings +// so cross-server consumers (autotune, dashboards) see a stable +// `build_info` shape. Bump PROPS_SCHEMA on breaking changes only: +// - field renamed +// - field removed +// - existing field's semantics change (units, nullability, type) +// Do NOT bump for additive changes (new fields, new sections). +// +// Matches dflash/scripts/server.py:175 (PROPS_SCHEMA constant). +static constexpr int kPropsSchema = 2; +static constexpr char kServerName[] = "luce-dflash"; +#ifndef DFLASH_SERVER_VERSION +#define DFLASH_SERVER_VERSION "0.0.0+cpp" +#endif + +// API endpoint registry served by /props. Keep in sync with the route +// handlers in handle_client() and route_request(). 
+static const std::vector kApiEndpoints = { + "GET /health", + "GET /props", + "GET /v1/models", + "POST /v1/chat/completions", + "POST /v1/messages", + "POST /v1/messages/count_tokens", + "POST /v1/responses", +}; + // ─── Utilities ────────────────────────────────────────────────────────── static std::string generate_id(const char * prefix) { @@ -34,6 +63,9 @@ static std::string generate_id(const char * prefix) { return buf; } +// Logging helpers shared by route_request() / worker_loop(). Kept static +// (file-scope) so they don't leak into the public ABI; the chat lifecycle +// logs that use them are part of #270's request-tracing instrumentation. static const char * api_format_name(ApiFormat format) { switch (format) { case ApiFormat::OPENAI_CHAT: return "chat"; @@ -47,6 +79,258 @@ static size_t json_array_size(const json & value) { return value.is_array() ? value.size() : 0; } +// Build the /props response body. Matches dflash/scripts/server.py:1221-1312 +// key-for-key so cross-server diffs stay clean. The Python version is the +// reference impl; if a key drifts here, update it there too (or document the +// intentional difference in docs/specs/thinking-budget.md). +// +// Non-static so unit tests can call it directly (declared in http_server.h). +json build_props_body(const ServerConfig & config, + const PrefixCache & prefix_cache, + const ToolMemory & tool_memory) { + // arch-gated capabilities (mirrors Python _capabilities()). + const bool is_qwen = (config.arch.rfind("qwen", 0) == 0); + const bool reasoning_supported = is_qwen; + const bool speculative_supported = is_qwen; + const bool tools_supported = is_qwen; + + auto pcs = prefix_cache.stats(); + auto pcfs = prefix_cache.full_stats(); + auto tms = tool_memory.stats(); + + const bool pflash_enabled = + (config.pflash_mode != ServerConfig::PflashMode::OFF); + // speculative_mode reports the *active* path, not arch capability. 
A + // Qwen-family model started without --ddtree has the capability but no + // active speculative decode, so it must report "off" — otherwise clients + // see `speculative_mode == "dflash"` paired with `speculative.enabled == + // false` and the two contradict (codex review feedback on 8d6ff04). + std::string speculative_mode; + if (pflash_enabled) speculative_mode = "pflash"; + else if (config.speculative_enabled) speculative_mode = "dflash"; + else speculative_mode = "off"; + + // Spec §4.2: the five-tier vocabulary (low | medium | high | x-high | max) + // all activate the phase-1 envelope. Advertise the full set when the + // arch supports reasoning so clients can negotiate the higher tiers. + json reasoning_efforts = json::array(); + if (reasoning_supported) { + reasoning_efforts.push_back("low"); + reasoning_efforts.push_back("medium"); + reasoning_efforts.push_back("high"); + reasoning_efforts.push_back("x-high"); + reasoning_efforts.push_back("max"); + } + + json server = { + {"name", kServerName}, + {"version", DFLASH_SERVER_VERSION}, + {"props_schema", kPropsSchema}, + }; + + json pflash; + if (!pflash_enabled) { + pflash = { + {"enabled", false}, + {"mode", "off"}, + {"threshold", nullptr}, + {"keep_ratio", nullptr}, + {"drafter_gguf", nullptr}, + {"skip_park", nullptr}, + {"bsa_enabled", nullptr}, + {"bsa_alpha", nullptr}, + {"lm_head_fix", nullptr}, + }; + } else { + const char * bsa_env = std::getenv("DFLASH_FP_USE_BSA"); + const char * alpha_env = std::getenv("DFLASH_FP_ALPHA"); + const char * lmfix_env = std::getenv("DFLASH27B_LM_HEAD_FIX"); + json bsa_alpha = nullptr; + if (alpha_env && *alpha_env) { + try { bsa_alpha = std::stod(alpha_env); } + catch (const std::exception &) { bsa_alpha = nullptr; } + } + std::string mode_str = + (config.pflash_mode == ServerConfig::PflashMode::AUTO) ? "auto" : + (config.pflash_mode == ServerConfig::PflashMode::ALWAYS) ? 
"always" : "off"; + pflash = { + {"enabled", true}, + {"mode", mode_str}, + {"threshold", config.pflash_threshold}, + {"keep_ratio", config.pflash_keep_ratio}, + {"drafter_gguf", config.pflash_drafter_path.empty() + ? json(nullptr) + : json(config.pflash_drafter_path)}, + {"skip_park", config.pflash_skip_park}, + {"bsa_enabled", (bsa_env != nullptr && *bsa_env && std::strcmp(bsa_env, "0") != 0)}, + {"bsa_alpha", bsa_alpha}, + {"lm_head_fix", (lmfix_env != nullptr && *lmfix_env && std::strcmp(lmfix_env, "0") != 0)}, + }; + } + + // Reflect actual sampler defaults the server applies when a request + // omits the field — these come from the loaded model card's sampling + // section (spec §3.3), not from a hard-coded greedy fallback. Clients + // that read /props to pick their sampling shape were getting greedy + // here regardless of what the model card said, which caused gemma4 + // benchmarks to silently run at temp=0 (degenerate-decode collapse) + // when the model card specifies temp=1.0/top_p=0.95/top_k=64. + const auto & smp = config.sampler_defaults; + json body = { + {"default_generation_settings", { + {"n_ctx", config.max_ctx}, + {"temperature", smp.has_temperature ? smp.temperature : 0.0f}, + {"top_p", smp.has_top_p ? smp.top_p : 1.0f}, + {"top_k", smp.has_top_k ? smp.top_k : 0}, + {"min_p", smp.has_min_p ? smp.min_p : 0.0f}, + {"repeat_penalty", smp.has_repetition_penalty ? smp.repetition_penalty : 1.0f}, + }}, + {"model_alias", config.model_name}, + {"model_path", config.model_path}, + {"build_info", std::string(kServerName) + " v" DFLASH_SERVER_VERSION + " props_schema=" + std::to_string(kPropsSchema)}, + {"speculative_mode", speculative_mode}, + {"server", server}, + {"model", { + {"arch", config.arch}, + {"draft_path", config.draft_path.empty() ? json(nullptr) : json(config.draft_path)}, + {"tokenizer_id", config.tokenizer_id.empty() ? json(nullptr) : json(config.tokenizer_id)}, + }}, + {"runtime", { + {"backend", config.runtime_backend.empty() ? 
"cuda" : config.runtime_backend}, + {"fa_window", config.fa_window}, + {"kv_cache_k", config.kv_cache_k}, + {"kv_cache_v", config.kv_cache_v}, + {"lazy_draft", config.lazy_draft}, + {"target_sharding", config.target_sharding}, + // Prefill chunk size (bargs.chunk). Surfaced so snapshot + // tooling captures the full config — bench consumers + // (dflash/scripts/bench_http_capability.py) read + // /props.runtime wholesale into result.json.server_info. + {"chunk", config.chunk}, + // Device placement strings (e.g. "auto:0", "cuda:0"). Empty + // string when no draft model is loaded. + {"target_device", config.target_device}, + {"draft_device", config.draft_device.empty() ? json(nullptr) : json(config.draft_device)}, + }}, + {"reasoning", { + {"supported", reasoning_supported}, + {"default", nullptr}, + {"supported_efforts", reasoning_efforts}, + }}, + // `model_card`: 1:1 with the on-disk sidecar JSON when one was + // loaded; null when family fallback or hard fallback was used. + // Validates against share/model_cards/_schema.json. The `source` + // field here is the upstream model-card URL (authored in the + // sidecar) — NOT a filepath. See spec §4.9. + {"model_card", config.model_card_json.is_null() + ? json(nullptr) + : config.model_card_json}, + // `budget_envelope`: runtime-resolved values driving the + // thinking-budget envelope. May differ from the authored card + // values because of CLI overrides and max_ctx-based tier clamping + // (spec §3.5). Always emitted regardless of model_card source. + // See spec §4.2. 
+ {"budget_envelope", { + {"model_card_source", config.model_card_source_label}, + {"default_max_tokens", config.default_max_tokens}, + {"hard_limit_reply_budget", config.hard_limit_reply_budget}, + {"think_max_tokens", config.think_max_tokens}, + {"effort_tiers", { + {"low", config.effort_tiers.low}, + {"medium", config.effort_tiers.medium}, + {"high", config.effort_tiers.high}, + {"x-high", config.effort_tiers.x_high}, + {"max", config.effort_tiers.max}, + }}, + }}, + {"speculative", { + {"enabled", config.speculative_enabled}, + {"ddtree_budget", config.speculative_enabled + ? json(config.ddtree_budget) : json(nullptr)}, + }}, + {"sampling", { + {"capabilities", { + {"supports_temperature", true}, + {"supports_top_p", true}, + {"supports_top_k", true}, + {"supports_frequency_penalty", true}, + {"supports_seed", true}, + }}, + }}, + {"pflash", pflash}, + {"prefix_cache", { + {"capacity", pcs.capacity}, + {"in_use", pcs.in_use}, + {"lifetime_hits", pcs.lifetime_hits}, + }}, + {"full_cache", { + {"enabled", pcfs.enabled}, + {"capacity", pcfs.capacity}, + {"in_use", pcfs.in_use}, + {"disk_bytes", pcfs.disk_bytes}, + {"lifetime_hits", pcfs.lifetime_hits}, + }}, + {"tool_replay", { + {"max_entries", tms.max_entries}, + {"max_bytes", tms.max_bytes}, + {"current_entries", tms.current_entries}, + {"current_bytes", tms.current_bytes}, + }}, + // The C++ daemon is linked in-process; if /props is responding, + // the daemon is alive by construction. + {"daemon", {{"alive", true}}}, + {"api", {{"endpoints", kApiEndpoints}}}, + // Capability flags surfaced for clients that don't want to crack + // open `reasoning` / `speculative` / etc. — matches the Python + // server's _capabilities() helper. 
+        {"capabilities", {
+            {"reasoning_supported", reasoning_supported},
+            {"speculative_supported", speculative_supported},
+            {"tools_supported", tools_supported},
+        }},
+    };
+    return body;
+}
+
+// Normalize Anthropic's `system` field (top-level on /v1/messages and
+// /v1/messages/count_tokens) into a leading `{role:"system", content:...}`
+// entry on `messages`. Accepts either a flat string or an array of typed
+// blocks (`[{type:"text", text:"..."}]`), and strips any
+// `x-anthropic-billing-header:`-prefixed block injected by Claude Code so
+// it never reaches the model or the token counter.
+//
+// Side-effect: prepends a system message to `messages` when the body has
+// a non-empty `system` field after billing-header filtering. No-op
+// otherwise (absent, null, "", or only a billing header). Both endpoints
+// share this helper so token counting and generation can't drift.
+static void normalize_anthropic_system(const json & body, json & messages) {
+    if (!body.contains("system")) return;
+    json sys_content = body["system"];
+    if (sys_content.is_array()) {
+        json filtered = json::array();
+        for (const auto & block : sys_content) {
+            if (block.is_object() && block.value("type", "") == "text") {
+                std::string text = block.value("text", "");
+                if (text.rfind("x-anthropic-billing-header:", 0) == 0) {
+                    continue; // skip Claude Code billing header block
+                }
+            }
+            filtered.push_back(block);
+        }
+        sys_content = std::move(filtered);
+    } else if (sys_content.is_string()) {
+        const std::string s = sys_content.get<std::string>();
+        // json::empty() is false for every string (even ""), so a blank or
+        // billing-header-only string has to be dropped explicitly here.
+        if (s.empty() || s.rfind("x-anthropic-billing-header:", 0) == 0) return;
+    }
+    if (!sys_content.empty()) {
+        json sys_msg = {{"role", "system"}, {"content", sys_content}};
+        messages.insert(messages.begin(), sys_msg);
+    }
+}
+
 json parse_responses_arguments(const json & item) {
     if (!item.contains("arguments")) return json::object();
     const auto & arguments = item["arguments"];
@@ -356,6 +640,15 @@ void HttpServer::handle_client(int fd) {
         return;
     }
 
+    //
Introspection: server config + cache stats + arch + capabilities. + // Matches dflash/scripts/server.py:1221-1312 key-for-key. + if (hr.method == "GET" && hr.path == "/props") { + json body = build_props_body(config_, prefix_cache_, tool_memory_); + send_response(fd, 200, "application/json", body.dump() + "\n"); + ::close(fd); + return; + } + // Models endpoint. if (hr.method == "GET" && hr.path == "/v1/models") { // Codex sends ?client_version= — serve the Codex-specific schema. @@ -366,9 +659,31 @@ void HttpServer::handle_client(int fd) { {"display_name", config_.model_name}, {"description", "Local DFlash speculative-decoding server"}, {"default_reasoning_level", "low"}, + // Spec §4.2: every tier activates the phase-1 envelope; + // the difference is the budget cap selected from the + // model card's effort_tiers. Descriptions surface the + // resolved cap so clients can pick a tier purposefully. {"supported_reasoning_levels", json::array({ - {{"effort", "low"}, {"description", "No thinking"}}, - {{"effort", "medium"}, {"description", "Thinking enabled"}}, + {{"effort", "low"}, + {"description", "Phase-1 budget at the model card's low tier (" + + std::to_string(config_.effort_tiers.low) + + " tokens)"}}, + {{"effort", "medium"}, + {"description", "Phase-1 budget at the model card's medium tier (" + + std::to_string(config_.effort_tiers.medium) + + " tokens)"}}, + {{"effort", "high"}, + {"description", "Phase-1 budget at the model card's standard recommendation (" + + std::to_string(config_.effort_tiers.high) + + " tokens)"}}, + {{"effort", "x-high"}, + {"description", "Phase-1 budget between high and the complex-problem ceiling (" + + std::to_string(config_.effort_tiers.x_high) + + " tokens)"}}, + {{"effort", "max"}, + {"description", "Phase-1 budget at the model card's complex-problem ceiling (" + + std::to_string(config_.effort_tiers.max) + + " tokens)"}}, })}, {"shell_type", "shell_command"}, {"visibility", "list"}, @@ -421,27 +736,49 @@ bool 
HttpServer::route_request(int fd, const HttpRequest & hr) { // Common fields. req.stream = body.value("stream", false); req.model = body.value("model", config_.model_name); + // Default when client omits all three: use --default-max-tokens + // (16000, matches ds4_eval.c). Codex review flagged that + // --default-max-tokens was previously a dead flag because the + // parser read config_.max_tokens (legacy 4096) instead. The new + // default protects thinking-budget requests that omit max_tokens + // from being capped at 4096 — thinking alone can consume that, + // leaving no headroom for the visible reply. req.max_output = body.value("max_tokens", body.value("max_output_tokens", - body.value("max_completion_tokens", config_.max_tokens))); + body.value("max_completion_tokens", config_.default_max_tokens))); + // Spec §4.4: clamp request max_tokens to --default-max-tokens. + if (req.max_output > config_.default_max_tokens) { + std::fprintf(stderr, + "[server] max_tokens=%d clamped to default_max_tokens=%d\n", + req.max_output, config_.default_max_tokens); + req.max_output = config_.default_max_tokens; + } - // Sampler parameters. - req.sampler.temp = body.value("temperature", 0.0f); - req.sampler.top_p = body.value("top_p", 1.0f); - req.sampler.top_k = body.value("top_k", 0); + // Sampler parameters. When the request omits a value, fall back to + // the model card's sampling defaults (spec §3.3); when the card + // doesn't supply one either, use the hard-coded default. + const auto & sd = config_.sampler_defaults; + req.sampler.temp = body.value("temperature", + sd.has_temperature ? sd.temperature : 0.0f); + req.sampler.top_p = body.value("top_p", + sd.has_top_p ? sd.top_p : 1.0f); + req.sampler.top_k = body.value("top_k", + sd.has_top_k ? sd.top_k : 0); if (body.contains("seed")) { req.sampler.seed = body["seed"].get(); } // OpenAI-style additive penalties. 
req.sampler.freq_pen = body.value("frequency_penalty", 0.0f); - req.sampler.pres_pen = body.value("presence_penalty", 0.0f); + req.sampler.pres_pen = body.value("presence_penalty", + sd.has_presence_penalty ? sd.presence_penalty : 0.0f); // HuggingFace-style multiplicative repetition penalty (also used by // vLLM, llama.cpp, etc.). Accepts both "repetition_penalty" and // the shorter "rep_pen" for daemon compatibility. req.sampler.rep_pen = body.value("repetition_penalty", - body.value("rep_pen", 1.0f)); + body.value("rep_pen", + sd.has_repetition_penalty ? sd.repetition_penalty : 1.0f)); if (body.contains("rep_window")) { req.sampler.rep_window = body["rep_window"].get(); } @@ -479,41 +816,25 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { } } + // count_tokens shares Anthropic's message parsing; flag so we + // short-circuit before enqueueing the generation job. + bool count_tokens_only = false; + if (hr.path == "/v1/chat/completions") { req.format = ApiFormat::OPENAI_CHAT; req.response_id = generate_id("chatcmpl"); req.messages = body["messages"]; + } else if (hr.path == "/v1/messages/count_tokens") { + req.format = ApiFormat::ANTHROPIC; + req.response_id = generate_id("count"); + req.messages = body.value("messages", json::array()); + normalize_anthropic_system(body, req.messages); + count_tokens_only = true; } else if (hr.path == "/v1/messages") { req.format = ApiFormat::ANTHROPIC; req.response_id = generate_id("msg"); req.messages = body["messages"]; - if (body.contains("system")) { - // Anthropic puts system as a top-level field. - // Strip billing header blocks injected by Claude Code. 
- json sys_content = body["system"]; - if (sys_content.is_array()) { - json filtered = json::array(); - for (const auto & block : sys_content) { - if (block.is_object() && block.value("type", "") == "text") { - std::string text = block.value("text", ""); - if (text.rfind("x-anthropic-billing-header:", 0) == 0) { - continue; // skip billing header block - } - } - filtered.push_back(block); - } - sys_content = std::move(filtered); - } else if (sys_content.is_string()) { - std::string s = sys_content.get(); - if (s.rfind("x-anthropic-billing-header:", 0) == 0) { - sys_content = ""; - } - } - if (!sys_content.empty()) { - json sys_msg = {{"role", "system"}, {"content", sys_content}}; - req.messages.insert(req.messages.begin(), sys_msg); - } - } + normalize_anthropic_system(body, req.messages); } else if (hr.path == "/v1/responses") { req.format = ApiFormat::RESPONSES; req.response_id = generate_id("resp"); @@ -543,21 +864,54 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { // DFlash acceptance rates; clients opt in explicitly). bool enable_thinking = false; - // OpenAI Responses API: "reasoning" field + // Track which fields the request explicitly set, so we can apply + // §4.3 combined precedence: thinking.budget_tokens beats + // reasoning.effort for the phase-1 cap, but the effort tier still + // selects defaults for any unspecified thinking.* field. + int request_budget_tokens = -1; // from thinking.budget_tokens + int request_reply_budget = -1; // from thinking.reply_budget + int effort_phase1_cap = -1; // from reasoning.effort lookup + bool effort_set = false; + + // OpenAI Responses API: "reasoning" field. Spec §4.2. if (body.contains("reasoning")) { auto & r = body["reasoning"]; if (r.contains("effort")) { - std::string effort = r.value("effort", "low"); - enable_thinking = (effort != "low"); + std::string effort = r.value("effort", "high"); + // Five-tier vocabulary (spec §4.2). Unknown → high. 
+ int tier_value = config_.effort_tiers.high; + if (effort == "low") tier_value = config_.effort_tiers.low; + else if (effort == "medium") tier_value = config_.effort_tiers.medium; + else if (effort == "high") tier_value = config_.effort_tiers.high; + else if (effort == "x-high") tier_value = config_.effort_tiers.x_high; + else if (effort == "max") tier_value = config_.effort_tiers.max; + // else: unknown tier → fall back to high (no error). + + effort_phase1_cap = tier_value; + effort_set = true; + enable_thinking = true; + // Spec §4.2: reasoning.effort activates the budget envelope. + req.thinking_opt_in = true; } else { enable_thinking = true; } } - // Anthropic: "thinking" field + // Anthropic-style: "thinking" field. Presence-as-opt-in: any + // request that sends this field has opted in to the thinking-budget + // envelope (and will see a `finish_details` block on the response). if (body.contains("thinking")) { auto & th = body["thinking"]; if (th.contains("type")) { - enable_thinking = (th.value("type", "") == "enabled"); + std::string type = th.value("type", ""); + enable_thinking = (type == "enabled"); + req.thinking_opt_in = (type == "enabled"); + } + // Spec §4.1 fields. Clamp to server ceilings (§4.4). + if (th.contains("budget_tokens") && th["budget_tokens"].is_number_integer()) { + request_budget_tokens = th["budget_tokens"].get(); + } + if (th.contains("reply_budget") && th["reply_budget"].is_number_integer()) { + request_reply_budget = th["reply_budget"].get(); } } // Direct: chat_template_kwargs.enable_thinking @@ -570,6 +924,55 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { req.thinking_enabled = enable_thinking; + // Spec §4.3 combined precedence + §4.4 clamping. + // Phase-1 cap: + // thinking.budget_tokens (if set) wins over reasoning.effort. + // Either is clamped to think_max_tokens. 
+ if (request_budget_tokens >= 0) { + int eff = std::min(request_budget_tokens, config_.think_max_tokens); + if (request_budget_tokens > config_.think_max_tokens) { + std::fprintf(stderr, + "[server] thinking.budget_tokens=%d clamped to " + "think_max_tokens=%d\n", + request_budget_tokens, config_.think_max_tokens); + } + req.per_req_phase1_cap = eff; + } else if (effort_set) { + // Spec §4.4: when reasoning.effort is set, the effective phase-1 + // cap is min(effort_tier_value, request.max_tokens - + // hard_limit_reply_budget). The effort tier value can legitimately + // exceed default_max_tokens (e.g. Qwen3.6 max=81408 with + // default=32768) — clients that want that full budget must pass + // an explicit max_tokens. Otherwise we narrow silently to fit. + const int max_output_phase1_room = std::max(0, + req.max_output - config_.hard_limit_reply_budget); + int eff = std::min(effort_phase1_cap, max_output_phase1_room); + if (effort_phase1_cap > max_output_phase1_room) { + // Info-level: this is normal when clients use a tier name but + // don't pass an explicit max_tokens. Not a warning. + std::fprintf(stderr, + "[server] reasoning.effort tier=%d narrowed to %d " + "(max_tokens=%d - hard_limit_reply_budget=%d); " + "pass a larger max_tokens to use the full tier budget\n", + effort_phase1_cap, eff, + req.max_output, config_.hard_limit_reply_budget); + } + req.per_req_phase1_cap = eff; + } + // Reply budget: + if (request_reply_budget >= 0) { + int eff = std::min(request_reply_budget, config_.hard_limit_reply_budget); + if (request_reply_budget > config_.hard_limit_reply_budget) { + std::fprintf(stderr, + "[server] thinking.reply_budget=%d clamped to " + "hard_limit_reply_budget=%d\n", + request_reply_budget, config_.hard_limit_reply_budget); + } + req.per_req_reply_budget = eff; + } + // (effort tier doesn't influence reply_budget — spec §4.2: "the reply + // reserve falls back to --hard-limit-reply-budget".) + // Serialize tools JSON for template injection. 
std::string tools_json; if (req.tools.is_array() && !req.tools.empty()) { @@ -612,15 +1015,13 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { tools_json); } req.prompt_tokens = tokenizer_.encode(rendered); - // Detect if prompt ends with (model will start in reasoning mode). - if (enable_thinking) { - size_t end = rendered.size(); - while (end > 0 && (rendered[end-1] == ' ' || rendered[end-1] == '\n' || - rendered[end-1] == '\r' || rendered[end-1] == '\t')) - end--; - if (end >= 7 && rendered.compare(end - 7, 7, "") == 0) { - req.started_in_thinking = true; - } + + // count_tokens: short-circuit after tokenization. Skip generation + // entirely — Anthropic's contract is just `{"input_tokens": N}`. + if (count_tokens_only) { + json resp = {{"input_tokens", (int)req.prompt_tokens.size()}}; + send_response(fd, 200, "application/json", resp.dump() + "\n"); + return true; } } catch (const std::exception & e) { @@ -721,7 +1122,7 @@ void HttpServer::worker_loop() { // Create SSE emitter for streaming state machine. SseEmitter emitter(req.format, req.response_id, req.model, (int)req.prompt_tokens.size(), req.tools, - &tool_memory_, req.started_in_thinking, + &tool_memory_, req.stop_sequences); // Emit initial SSE events. @@ -829,13 +1230,64 @@ void HttpServer::worker_loop() { } // Build generate request. + // + // Thinking-budget v2 (Level 2): when caller opts in via + // `thinking:{type:enabled}`, cap n_gen at think_max + reply_budget + // so the BudgetHook fires at the boundary, mid-stream, with KV + // state intact. Applies uniformly to streaming and non-streaming + // requests — the BudgetHook lives inside do_ar_decode / + // do_spec_decode and injects close tokens at the budget edge + // regardless of how the server is delivering the result. + const bool budget_active = req.thinking_opt_in; + // Effective think cap: per-request value (already clamped to + // config_.think_max_tokens above) wins over the server-wide + // think_max_tokens. 
Then both must fit inside the combined + // max_output. Spec §4.4 + §5.3. + const int effective_think_ceiling = (req.per_req_phase1_cap >= 0) + ? req.per_req_phase1_cap + : config_.think_max_tokens; + // The effective per-request reply budget is the operator's choice + // (CLI / sidecar / per-request override). The AR loop force-closes + // when `n_gen - generated <= eff_reply`, which means n_gen must + // include BOTH the think budget AND the reply reserve. Without the + // `+ eff_reply` term, force-close fires immediately when + // `eff_reply == effective_think_ceiling` (e.g. think_max=4096, + // hard_limit=4096 → remaining starts at 4096, condition fires + // before the model emits a single thinking token). Spec §4.4. + const int eff_reply_for_n_gen = (req.per_req_reply_budget >= 0) + ? req.per_req_reply_budget + : config_.hard_limit_reply_budget; + const int n_gen_cap = budget_active + ? std::min(effective_think_ceiling + eff_reply_for_n_gen, req.max_output) + : req.max_output; + GenerateRequest gen_req; gen_req.prompt = effective_prompt; - gen_req.n_gen = req.max_output; + gen_req.n_gen = n_gen_cap; gen_req.sampler = req.sampler; gen_req.do_sample = req.sampler.needs_logit_processing(); gen_req.stream = false; // we handle streaming via on_token callback + // Level 2 force-close: when thinking is opted in, the server is + // configured with a hard-limit reply budget, and we resolved the + // close-tag sequence at startup, wire the BudgetHook so the + // backend's AR decode injects `` at the budget boundary. + // The model gets to write the visible answer in-stream rather than + // running unbounded. + // + // hard_limit_remaining is the per-request reply_budget when set + // (already clamped to config_.hard_limit_reply_budget above), else + // the server default. Spec §4.4 + §5.3. + if (budget_active && !config_.think_close_token_ids.empty() && + config_.hard_limit_reply_budget > 0) + { + int eff_reply_budget = (req.per_req_reply_budget >= 0) + ? 
req.per_req_reply_budget + : config_.hard_limit_reply_budget; + gen_req.budget_hook.close_token_ids = config_.think_close_token_ids; + gen_req.budget_hook.hard_limit_remaining = eff_reply_budget; + } + // Tool call hint generation: pre-tokenize predictable structural tokens // to accelerate spec decode when tool_choice constrains the output. std::vector hint_tokens_storage; @@ -976,6 +1428,23 @@ void HttpServer::worker_loop() { return true; } + // Qwen3.6 thinking tokens: (id 248068) and (id 248069) + // are SINGLE special tokens in the added_tokens vocab. Without this + // mapping they hit the generic "skip <...>" filter below and get + // silently dropped — which means the emitter never sees the + // reasoning→content transition and stuffs everything into + // reasoning_content with empty visible content. Forward the text + // form into the emitter so parse_reasoning() can split correctly. + if (raw == "" || raw == "") { + if (req.stream) { + auto chunks = emitter.emit_token( + raw == "" ? "\n" : ""); + for (const auto & chunk : chunks) + if (!send_all(fd, chunk.data(), chunk.size())) { client_disconnected = true; return false; } + } + return true; + } + // Skip other special tokens (starting with <|, or any <...> except byte-fallback) if (raw.size() >= 2 && raw[0] == '<' && raw[1] == '|') return true; if (raw.size() >= 2 && raw[0] == '<' && raw.back() == '>') { @@ -1070,9 +1539,24 @@ void HttpServer::worker_loop() { } } + // close_kind reflects the Level 2 BudgetHook outcome: "hard" when + // the backend's AR/spec decode injected the close-token sequence + // at the budget boundary, "natural" when the model self-closed + // (or the request never opted in). Emitted as part of + // finish_details for thinking-budget callers. + std::string close_kind = + (req.thinking_opt_in && result.budget_forced_close) + ? "hard" + : "natural"; + // Finalize. 
+ // Per-request wall-clock timings forwarded to the response's + // `usage.timings` (OpenAI Chat usage chunk, Anthropic + // message_delta usage, Responses response.completed usage). + // See docs/specs/thinking-budget.md §6.3. + GenTimings gen_timings{ result.prefill_s, result.decode_s }; if (req.stream && !client_disconnected) { - auto final_chunks = emitter.emit_finish(completion_tokens); + auto final_chunks = emitter.emit_finish(completion_tokens, &gen_timings); for (const auto & chunk : final_chunks) { if (!send_all(fd, chunk.data(), chunk.size())) { client_disconnected = true; @@ -1082,30 +1566,73 @@ void HttpServer::worker_loop() { } else if (!req.stream && !client_disconnected) { // Non-streaming: build complete response using emitter state. // Feed all tokens through emitter (skip specials like streaming path). - for (int32_t tok : result.tokens) { - const std::string & raw = tokenizer_.raw_token(tok); - if (tok == tokenizer_.eos_id()) continue; - if (tok == tokenizer_.eos_chat_id()) continue; - // Gemma4 channel → think mapping - if (raw == "<|channel>") { emitter.emit_token(""); continue; } - if (raw == "") { emitter.emit_token("\n"); continue; } - if (raw.size() >= 2 && raw[0] == '<' && raw[1] == '|') continue; - if (raw.size() >= 2 && raw[0] == '<' && raw.back() == '>') { - if (!(raw.size() == 6 && raw[1] == '0' && raw[2] == 'x')) - continue; + auto feed_tokens = [&](const std::vector & toks) -> bool { + for (int32_t tok : toks) { + const std::string & raw = tokenizer_.raw_token(tok); + if (tok == tokenizer_.eos_id()) continue; + if (tok == tokenizer_.eos_chat_id()) continue; + // Gemma4 channel → think mapping + if (raw == "<|channel>") { emitter.emit_token(""); continue; } + if (raw == "") { emitter.emit_token("\n"); continue; } + // Qwen3.6 thinking tokens (id 248068 / 248069) — must + // forward as text so the emitter transitions + // reasoning→content. 
Without this the generic <...> + // strip below drops them silently, leaving content + // empty and the model's whole answer wedged in + // reasoning_content. Mirrors the streaming-path fix + // above. + if (raw == "") { emitter.emit_token(""); continue; } + if (raw == "") { emitter.emit_token("\n"); continue; } + if (raw.size() >= 2 && raw[0] == '<' && raw[1] == '|') continue; + if (raw.size() >= 2 && raw[0] == '<' && raw.back() == '>') { + if (!(raw.size() == 6 && raw[1] == '0' && raw[2] == 'x')) + continue; + } + std::string text = tokenizer_.token_text(tok); + emitter.emit_token(text); + if (emitter.stop_hit()) return false; } - std::string text = tokenizer_.token_text(tok); - emitter.emit_token(text); - if (emitter.stop_hit()) break; - } - emitter.emit_finish((int)result.tokens.size()); + return true; + }; + + feed_tokens(result.tokens); + const int total_completion_tokens = (int)result.tokens.size(); + emitter.emit_finish(total_completion_tokens); + + // Derive per-mode token counts from the emitter's REASONING + // → CONTENT transition. first_content_token_index() returns + // the emit_token index that first ran with mode == CONTENT; + // tokens before that index were emitted while the emitter + // was in REASONING (the ``-carrying token itself + // lands in REASONING and the NEXT token is the first + // CONTENT). EOS/special tokens are skipped by feed_tokens + // above, so emit_token_count() may be smaller than + // result.tokens.size(); the remainder counts as + // unattributed (e.g., TOOL_BUFFER). + const int fci = emitter.first_content_token_index(); + const int emitted = emitter.emit_token_count(); + const int reasoning_tokens_emitted = + fci < 0 ? emitted : fci; + const int content_tokens_emitted = + fci < 0 ? 
0 : emitted - fci; json resp; switch (req.format) { case ApiFormat::OPENAI_CHAT: { json msg = {{"role", "assistant"}, {"content", emitter.accumulated_text()}}; if (!emitter.reasoning_text().empty()) { - msg["reasoning_content"] = emitter.reasoning_text(); + // Multi-dialect reasoning emission — same text, three keys. + // See docs/specs/thinking-budget.md "Response shape — + // multi-dialect aliasing". + // reasoning_content : DeepSeek R1 / dflash primary + // reasoning : OpenRouter / Anthropic-gateway flat + // reasoning_details : typed-block list; single block. + const std::string & rt = emitter.reasoning_text(); + msg["reasoning_content"] = rt; + msg["reasoning"] = rt; + msg["reasoning_details"] = json::array({ + {{"type", "reasoning.text"}, {"text", rt}} + }); } if (!emitter.tool_calls().empty()) { json tcs = json::array(); @@ -1116,21 +1643,76 @@ void HttpServer::worker_loop() { } msg["tool_calls"] = tcs; } + // finish_reason: emitter only knows about "stop" / "tool_calls" + // (EOS / tool-call detection). It can't see that the daemon + // hit the n_gen cap. Compute "length" here from the + // committed-token count vs the n_gen cap. + // OpenAI/Anthropic clients (open-webui, Cline) gate retry + // logic on finish_reason="length". + std::string effective_finish_reason = emitter.finish_reason(); + if (effective_finish_reason == "stop") { + bool at_cap = (int)result.tokens.size() >= n_gen_cap; + if (at_cap) { + effective_finish_reason = "length"; + } + } + json choice = { + {"index", 0}, {"message", msg}, + {"finish_reason", effective_finish_reason} + }; + // finish_details — mirrors ds4_eval.c's eval_think_close_info. + // Emitted when the caller opted in to the thinking-budget + // envelope via `thinking:{type:enabled}`. close_kind reflects + // whether the model self-closed the thinking block ("natural") + // or the BudgetHook force-closed it at the budget boundary + // ("hard"). See docs/specs/thinking-budget.md "v2 design". 
+ if (req.thinking_opt_in) { + // thinking_tokens / content_tokens come from the + // emitter's REASONING→CONTENT transition tracking; + // total_tokens is the raw committed-token count. + choice["finish_details"] = { + {"close_kind", close_kind}, + {"thinking_tokens", reasoning_tokens_emitted}, + {"content_tokens", content_tokens_emitted}, + {"total_tokens", total_completion_tokens}, + }; + // Honest signaling: when the post-close watchdog + // detected an n-gram repetition loop and aborted + // generation, surface a sibling flag so callers know + // the answer is unreliable. finish_reason stays + // "length" (SDK-safe per the truncation-signaling + // convention: OpenAI/Anthropic/Gemini all collapse + // budget-class events to one closed enum and put + // richer signal in sidecar fields). + if (result.degenerate_decode_close) { + choice["finish_details"]["degenerate_decode"] = true; + } + } + // usage.completion_tokens_details.reasoning_tokens — OpenAI + // o1/o3 standard location, also OR's normalized shape. Mirrors + // finish_details.thinking_tokens; kept in sync. + // usage.timings — per-request prefill / decode wall clock + // (always emitted; additive to OpenAI shape, ignored by + // clients that don't recognize it). See spec §6.3. + json chat_usage = { + {"prompt_tokens", (int)req.prompt_tokens.size()}, + {"completion_tokens", total_completion_tokens}, + {"total_tokens", (int)req.prompt_tokens.size() + total_completion_tokens}, + {"completion_tokens_details", { + // Match finish_details.thinking_tokens + // (emitter-tracked split). 
+ {"reasoning_tokens", reasoning_tokens_emitted} + }}, + {"timings", build_timings_json(gen_timings, total_completion_tokens)} + }; resp = { {"id", req.response_id}, {"object", "chat.completion"}, {"created", std::chrono::duration_cast( std::chrono::system_clock::now().time_since_epoch()).count()}, {"model", req.model}, - {"choices", json::array({{ - {"index", 0}, {"message", msg}, - {"finish_reason", emitter.finish_reason()} - }})}, - {"usage", { - {"prompt_tokens", (int)req.prompt_tokens.size()}, - {"completion_tokens", (int)result.tokens.size()}, - {"total_tokens", (int)(req.prompt_tokens.size() + result.tokens.size())} - }} + {"choices", json::array({choice})}, + {"usage", chat_usage} }; break; } @@ -1170,15 +1752,28 @@ void HttpServer::worker_loop() { }); } } + // stop_reason: Anthropic's analog of finish_reason. Same + // length-vs-EOS distinction as OpenAI — Cline / Anthropic + // SDK gate retry on stop_reason=="max_tokens". + std::string anthropic_stop_reason; + { + std::string er = emitter.finish_reason(); + bool at_cap = (int)result.tokens.size() >= n_gen_cap; + if (er == "tool_calls") anthropic_stop_reason = "tool_use"; + else if (at_cap) anthropic_stop_reason = "max_tokens"; + else anthropic_stop_reason = "end_turn"; + } + json anth_usage = { + {"input_tokens", (int)req.prompt_tokens.size()}, + {"output_tokens", total_completion_tokens}, + {"timings", build_timings_json(gen_timings, total_completion_tokens)} + }; resp = { {"id", req.response_id}, {"type", "message"}, {"role", "assistant"}, {"model", req.model}, {"content", content}, - {"stop_reason", emitter.finish_reason() == "stop" ? 
"end_turn" : "tool_use"}, - {"usage", { - {"input_tokens", (int)req.prompt_tokens.size()}, - {"output_tokens", (int)result.tokens.size()} - }} + {"stop_reason", anthropic_stop_reason}, + {"usage", anth_usage} }; break; } @@ -1202,15 +1797,17 @@ void HttpServer::worker_loop() { }})} }); } + json resp_usage = { + {"input_tokens", (int)req.prompt_tokens.size()}, + {"output_tokens", total_completion_tokens}, + {"total_tokens", (int)req.prompt_tokens.size() + total_completion_tokens}, + {"timings", build_timings_json(gen_timings, total_completion_tokens)} + }; resp = { {"id", req.response_id}, {"object", "response"}, {"status", "completed"}, {"model", req.model}, {"output", output}, - {"usage", { - {"input_tokens", (int)req.prompt_tokens.size()}, - {"output_tokens", (int)result.tokens.size()}, - {"total_tokens", (int)(req.prompt_tokens.size() + result.tokens.size())} - }} + {"usage", resp_usage} }; break; } diff --git a/dflash/src/server/http_server.h b/dflash/src/server/http_server.h index efabb47e9..a537418a2 100644 --- a/dflash/src/server/http_server.h +++ b/dflash/src/server/http_server.h @@ -20,6 +20,7 @@ #include "api_types.h" #include "placement/remote_draft_config.h" #include "common/pflash_drafter_ipc.h" +#include "model_card.h" #include #include @@ -45,12 +46,99 @@ struct ServerJob; struct ServerConfig { std::string host = "0.0.0.0"; int port = 8080; - int max_tokens = 4096; // default max output tokens + int max_tokens = 4096; // default max output tokens (legacy alias for default_max_tokens) int max_ctx = 0; // 0 = use backend's DevicePlacement default (8192) bool enable_cors = true; std::string model_name = "dflash"; int prefix_cache_cap = 32; // prefix cache slots (0 disables) + // Thinking-budget v2. Applied when a request opts in via + // `thinking: {type: "enabled"}` or `reasoning: {effort: ...}`. 
+ // think_max_tokens caps phase-1 reasoning generation; the combined + // (reasoning + content) cap is the request's max_tokens, defaulting + // to default_max_tokens when omitted. The defaults below are the + // hard fallback (antirez/ds4 ds4_eval.c reference values); at startup + // server_main may raise them by loading share/model_cards/.json + // when a sidecar matches the loaded model. CLI flags override both. + // See docs/specs/thinking-budget.md §3 for resolution order. + int think_max_tokens = 15488; // = default_max_tokens - hard_limit_reply_budget + int default_max_tokens = 16000; + // Level 2 force-close (in-process, KV-continuous). When > 0 AND the + // request opted into thinking, the backend's AR decode overrides + // the next sampled token with `` once (n_gen - committed) + // <= hard_limit_reply_budget. 0 disables the hook. + // + // Default 4096. The original 512 came from ds4_eval.c, which sized + // for DeepSeek-V4-flash's terse style. For most models that's far + // too small — Qwen3.6 restates work after `` (needs ~4k); + // Gemma 4 after the channel-thought force-close + transition cue + // writes a clean coordinate-geometry proof for AIME (~2-4k tokens). + // Without priors on a specific model, 4096 is the safer default + // — bench results from gemma4-26b-thinking-control-2026-05-25 + // showed every force-closed thinking probe getting truncated + // mid-answer at 512 reply tokens. + int hard_limit_reply_budget = 4096; + + // Token IDs resolved at server startup for the model's + // close-tag sequence. Single special token for Qwen3.6 (id 248069); + // multiple tokens for DeepSeek/laguna ([1718, 37947, 32]). When + // non-empty, used as BudgetHook.close_token_ids. server_main + // populates this from the tokenizer after loading; HttpServer just + // forwards into GenerateRequest.budget_hook when thinking is opted in. + std::vector think_close_token_ids; + + // Phase-1 budgets per `reasoning.effort` tier (spec §4.2). 
Selected + // by the request parser when `reasoning.effort` is present. Each + // value is itself capped at `think_max_tokens` at startup. + // Populated by server_main from the resolved model card; CLI flags + // (--reasoning-effort-) override individual tiers. + EffortTiers effort_tiers; + + // Sampler defaults from the model card (spec §3.3). Used to fill + // values the request body did not specify. has_* fields distinguish + // "card supplied a value" from "C++ default". HttpServer reads these + // in the request parser; CLI does not currently override. + SamplingDefaults sampler_defaults; + + // Operator-facing tag for the startup banner: e.g. + // "share/model_cards/qwen3.6-27b.json", "family:qwen35", "hard-fallback". + // Surfaced at /props.budget_envelope.model_card_source per + // docs/specs/props-endpoint.md §4.2. + std::string model_card_source_label; + + // Cached on startup by server_main after resolve_model_card. Null + // (`.is_null()` returns true) when family or hard fallback was used. + // Exposed verbatim under /props.model_card; validates against + // share/model_cards/_schema.json. See docs/specs/props-endpoint.md + // §4.9 and docs/specs/model-cards.md. + nlohmann::json model_card_json = nullptr; + + // /props introspection inputs — captured at startup by server_main so + // the /props handler doesn't need to crack open BackendArgs or env. + // Matches dflash/scripts/server.py:1221-1312 field-for-field. + std::string arch; // detected model arch (qwen35/36, laguna, gemma4, ...) + std::string model_path; // bargs.model_path + std::string draft_path; // bargs.draft_path (empty if no draft) + std::string tokenizer_id; // tokenizer name from GGUF metadata (best-effort) + std::string kv_cache_k; // effective KV K type ("q4_0", "tq3_0", "f16", ...) 
+ std::string kv_cache_v; // effective KV V type + std::string runtime_backend; // "cuda" | "hip" | "cpu" + int fa_window = 0; + int ddtree_budget = 0; + bool speculative_enabled = false; + bool target_sharding = false; + // Prefill chunk size (bargs.chunk). Exposed at /props.runtime.chunk so + // bench/snapshot tooling can capture the full server config — needed + // because pre-c35a8a4 snapshots had no /props capture and post-hoc + // forensics on which chunk was used are otherwise impossible. See + // dflash/docs/specs/props-endpoint.md §4.5. + int chunk = 0; + // Resolved device placement strings (e.g. "auto:0", "cuda:0"). Sourced + // from placement_device_name(bargs.device / bargs.draft_device) in + // server_main after CLI parse. + std::string target_device; + std::string draft_device; + // PFlash (speculative prefill compression) enum class PflashMode { OFF, AUTO, ALWAYS }; PflashMode pflash_mode = PflashMode::OFF; @@ -96,11 +184,29 @@ struct ParsedRequest { std::string response_id; // Thinking/reasoning state bool thinking_enabled = true; - bool started_in_thinking = false; + // True when the request opted in to the thinking-budget envelope via + // `thinking: {type: "enabled"}`. Distinct from thinking_enabled (which + // can be set via the chat template kwarg alone). When true, the response + // includes a `finish_details` block. Mirrors server.py:2271 conditional. + bool thinking_opt_in = false; + // Per-request thinking-budget envelope (spec §4). Populated from + // `thinking.budget_tokens` and `thinking.reply_budget`, or selected + // from server-configured effort tiers when `reasoning.effort` is set. + // -1 = not set; the server falls back to its global think_max_tokens / + // hard_limit_reply_budget. Values are already clamped to those ceilings. + int per_req_phase1_cap = -1; + int per_req_reply_budget = -1; // Stop sequences (OpenAI "stop" + Anthropic "stop_sequences") std::vector stop_sequences; }; +// Build the /props response body. 
Exposed (non-static) so unit tests +// can assert on its shape without spinning up a real socket. See +// docs/specs/props-endpoint.md for the wire contract. +json build_props_body(const ServerConfig & config, + const PrefixCache & prefix_cache, + const ToolMemory & tool_memory); + // ─── HTTP server ──────────────────────────────────────────────────────── class HttpServer { public: diff --git a/dflash/src/server/model_card.cpp b/dflash/src/server/model_card.cpp new file mode 100644 index 000000000..fe3376ccb --- /dev/null +++ b/dflash/src/server/model_card.cpp @@ -0,0 +1,362 @@ +// Model card resolution. See model_card.h and docs/specs/thinking-budget.md §3. + +#include "model_card.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dflash::common { + +using json = nlohmann::json; + +// ── Helpers ───────────────────────────────────────────────────────────── + +std::string normalize_model_card_stem(const std::string & general_name) { + std::string out; + out.reserve(general_name.size()); + for (char c : general_name) { + const unsigned char uc = static_cast(c); + if (uc == ' ' || uc == '\t' || uc == '_') { + out.push_back('-'); + } else if (uc >= 'A' && uc <= 'Z') { + out.push_back(static_cast(uc - 'A' + 'a')); + } else if ((uc >= 'a' && uc <= 'z') || + (uc >= '0' && uc <= '9') || + uc == '.' 
|| uc == '-') { + out.push_back(static_cast(uc)); + } + // else: silently drop punctuation + } + return out; +} + +static bool file_exists(const std::string & path) { + struct stat st{}; + return ::stat(path.c_str(), &st) == 0 && S_ISREG(st.st_mode); +} + +static std::string self_bin_dir() { + char buf[4096]; + ssize_t n = ::readlink("/proc/self/exe", buf, sizeof(buf) - 1); + if (n <= 0) return {}; + buf[n] = '\0'; + std::string path(buf); + auto slash = path.find_last_of('/'); + if (slash == std::string::npos) return {}; + return path.substr(0, slash); +} + +// Find share/model_cards/ directory. Search order (spec §1 implementation note): +// (a) `repo_root_hint`/share/model_cards/ (if hint non-empty) +// (b) /../share/model_cards/ (install layout) +// (c) /share/model_cards/ (build layout) +// (d) ./share/model_cards/ (cwd, dev runs) +// (e) $DFLASH_MODEL_CARDS_DIR (explicit override) +// +// Returns first hit or empty string. Each candidate is logged to stderr so +// operators can see which path was probed. 
+static std::string find_model_cards_dir(const std::string & repo_root_hint) { + std::vector candidates; + if (!repo_root_hint.empty()) { + candidates.push_back(repo_root_hint + "/share/model_cards"); + } + std::string bd = self_bin_dir(); + if (!bd.empty()) { + candidates.push_back(bd + "/../share/model_cards"); + candidates.push_back(bd + "/share/model_cards"); + } + candidates.push_back("share/model_cards"); + if (const char * envp = std::getenv("DFLASH_MODEL_CARDS_DIR")) { + candidates.push_back(envp); + } + + for (const auto & c : candidates) { + struct stat st{}; + if (::stat(c.c_str(), &st) == 0 && S_ISDIR(st.st_mode)) { + std::fprintf(stderr, "[model_card] using cards dir: %s\n", c.c_str()); + return c; + } + } + std::fprintf(stderr, + "[model_card] no share/model_cards/ directory found; " + "tried %zu candidate(s)\n", candidates.size()); + return {}; +} + +// Compute the effort tiers from max_tokens + complex_problem_max_tokens +// using the spec §3.3 formula. `think_max` and `complex_think_max` are +// derived in caller. +static EffortTiers compute_default_tiers(int think_max, int complex_think_max) { + EffortTiers t; + t.low = (int)(think_max * 0.125 + 0.5); + t.medium = (int)(think_max * 0.5 + 0.5); + t.high = think_max; + // x-high midpoint of think_max and complex_think_max + t.x_high = (think_max + complex_think_max) / 2; + t.max = complex_think_max; + return t; +} + +// Apply spec §3.5 monotone-ordering invariant. Clamps to monotone +// non-decreasing order and warns if the sidecar (or computed tiers) +// violate the invariant. +// +// The absolute-ceiling invariant (max ≤ max_ctx − hard_limit_reply_budget) +// is enforced separately in server_main.cpp once max_ctx has been +// resolved from the backend / CLI — model_card resolution runs before +// that, and the card itself doesn't know the operator's runtime ceiling. 
+// Force the effort tiers to be monotone non-decreasing in the order
+// low <= medium <= high <= x-high <= max. Any tier that is smaller than
+// its predecessor is raised to the predecessor's value and a warning is
+// written to stderr. `source` only labels the warning (the card's
+// source label); `t.low` itself is never modified.
+static void enforce_tier_invariants(EffortTiers & t,
+                                    const std::string & source) {
+    auto clamp_one = [&](int prev, int & v, const char * tier) {
+        if (v < prev) {
+            std::fprintf(stderr,
+                "[model_card] %s: effort_tiers.%s=%d < previous tier %d; "
+                "clamping up\n", source.c_str(), tier, v, prev);
+            v = prev;
+        }
+    };
+    // Order matters: each call sees the already-clamped predecessor.
+    clamp_one(t.low,    t.medium, "medium");
+    clamp_one(t.medium, t.high,   "high");
+    clamp_one(t.high,   t.x_high, "x-high");
+    clamp_one(t.x_high, t.max,    "max");
+}
+
+// ── Sidecar parsing ─────────────────────────────────────────────────────
+
+// Read the JSON sidecar at `path` and copy every recognised, correctly
+// typed field into `out`.
+//
+// Returns true when the file was opened and parsed as a JSON object.
+// Returns false, with a short reason in `err`, when the file cannot be
+// opened, is not valid JSON, or its top-level value is not an object.
+// Missing required fields only produce a warning on stderr. Fields that
+// are present but have an unexpected JSON type are skipped without a
+// message, leaving the corresponding `out` member at its prior value.
+static bool load_sidecar(const std::string & path, ModelCard & out, std::string & err) {
+    std::ifstream f(path);
+    if (!f.is_open()) {
+        err = std::string("open: ") + std::strerror(errno);
+        return false;
+    }
+    std::stringstream ss;
+    ss << f.rdbuf();
+    json j;
+    try {
+        j = json::parse(ss.str());
+    } catch (const std::exception & e) {
+        err = std::string("parse: ") + e.what();
+        return false;
+    }
+
+    // A sidecar must be a JSON object. Any other top-level value (null,
+    // array, scalar) parses cleanly but carries no fields; accepting it
+    // would report the card as resolved from this path while every value
+    // is really a struct default, and would suppress the family fallback.
+    // Treat it as a parse failure so the caller falls through.
+    if (!j.is_object()) {
+        err = "parse: top-level JSON value is not an object";
+        return false;
+    }
+
+    // Stash the parsed sidecar verbatim on the ModelCard so the HTTP
+    // server can re-emit it under /props.model_card. See spec §4.9.
+    out.raw_json = j;
+
+    // Schema sanity check — the four required fields per
+    // share/model_cards/_schema.json. We warn but DO NOT fail-start;
+    // operators may have a partial card (e.g. only max_tokens) and the
+    // remaining fields keep their ModelCard defaults. The JSON Schema at
+    // share/model_cards/_schema.json catches typos earlier (CI /
+    // author-facing validation).
+    static const char * const kRequiredFields[] = {
+        "name", "source", "verified_at", "max_tokens"
+    };
+    for (const char * field : kRequiredFields) {
+        if (!j.contains(field)) {
+            std::fprintf(stderr,
+                "[model_card] %s: missing required field '%s' "
+                "(see share/model_cards/_schema.json)\n",
+                path.c_str(), field);
+        }
+    }
+
+    if (j.contains("max_tokens") && j["max_tokens"].is_number_integer()) {
+        out.max_tokens = j["max_tokens"].get<int>();
+    }
+    if (j.contains("complex_problem_max_tokens") &&
+        j["complex_problem_max_tokens"].is_number_integer()) {
+        out.complex_problem_max_tokens = j["complex_problem_max_tokens"].get<int>();
+    }
+    if (j.contains("hard_limit_reply_budget") &&
+        j["hard_limit_reply_budget"].is_number_integer()) {
+        // Per-model override of the post-`</think>` reserved reply budget.
+        // Verbose-post-close models (Qwen3.6) want 2k-4k; terse models
+        // (DeepSeek-V4-flash style) can override down towards the
+        // original 512 ds4_eval.c value.
+        // See docs/specs/thinking-budget.md §3.3.
+        out.hard_limit_reply_budget = j["hard_limit_reply_budget"].get<int>();
+    }
+    if (j.contains("thinking_marker") &&
+        j["thinking_marker"].is_string()) {
+        out.thinking_marker = j["thinking_marker"].get<std::string>();
+    }
+    if (j.contains("thinking_terminator_hint") &&
+        j["thinking_terminator_hint"].is_string()) {
+        out.thinking_terminator_hint =
+            j["thinking_terminator_hint"].get<std::string>();
+    }
+
+    if (j.contains("sampling") && j["sampling"].is_object()) {
+        const auto & s = j["sampling"];
+        // Each picker writes the value and raises the matching `has_*`
+        // flag only when the key exists with a usable numeric type.
+        auto pick_f = [&](const char * k, float & v, bool & has) {
+            if (s.contains(k) && s[k].is_number()) {
+                v = s[k].get<float>(); has = true;
+            }
+        };
+        auto pick_i = [&](const char * k, int & v, bool & has) {
+            if (s.contains(k) && s[k].is_number_integer()) {
+                v = s[k].get<int>(); has = true;
+            }
+        };
+        pick_f("temperature",        out.sampling.temperature,        out.sampling.has_temperature);
+        pick_f("top_p",              out.sampling.top_p,              out.sampling.has_top_p);
+        pick_i("top_k",              out.sampling.top_k,              out.sampling.has_top_k);
+        pick_f("min_p",              out.sampling.min_p,              out.sampling.has_min_p);
+        pick_f("presence_penalty",   out.sampling.presence_penalty,   out.sampling.has_presence_penalty);
+        pick_f("repetition_penalty", out.sampling.repetition_penalty, out.sampling.has_repetition_penalty);
+    }
+
+    if (j.contains("reasoning_effort_tiers") && j["reasoning_effort_tiers"].is_object()) {
+        const auto & rt = j["reasoning_effort_tiers"];
+        auto pick = [&](const char * k, int & v) {
+            if (rt.contains(k) && rt[k].is_number_integer()) v = rt[k].get<int>();
+        };
+        pick("low",    out.effort_tiers.low);
+        pick("medium", out.effort_tiers.medium);
+        pick("high",   out.effort_tiers.high);
+        pick("x-high", out.effort_tiers.x_high);
+        pick("max",    out.effort_tiers.max);
+    }
+
+    return true;
+}
+
+// ── Per-family fallback table ───────────────────────────────────────────
+
+// Fill `out` with built-in defaults for the architecture string `arch`.
+// Returns true when `arch` is one of the known families (qwen3 / qwen35 /
+// qwen36, gemma4, laguna) and false otherwise, in which case `out` is
+// left untouched. Only the qwen branch assigns hard_limit_reply_budget;
+// the other branches leave whatever value `out` already holds.
+static bool family_fallback(const std::string & arch, ModelCard & out) {
+    // Coarse safety net when no sidecar matches. Values are conservative
+    // and intentionally not aspirational — operators are expected to ship
+    // a sidecar for production models. See spec §3.1.
+    if (arch == "qwen35" || arch == "qwen36" || arch == "qwen3") {
+        out.max_tokens = 32768;
+        out.complex_problem_max_tokens = 0;
+        // Qwen3.x is verbose post-`</think>` — restates derivation in the
+        // visible area before writing the answer line. The 512 default
+        // from ds4_eval.c (DeepSeek terse-style) clips this pattern. See
+        // docs/specs/thinking-budget.md §3.3.
+        out.hard_limit_reply_budget = 4096;
+        out.source_label = "family:" + arch;
+        return true;
+    }
+    if (arch == "gemma4") {
+        // Gemma4 verified value: see Gemma model card; conservative
+        // 16384 keeps us inside published recommendations.
+        out.max_tokens = 16384;
+        out.complex_problem_max_tokens = 0;
+        out.source_label = "family:gemma4";
+        return true;
+    }
+    if (arch == "laguna") {
+        // Laguna (DeepSeek-V3-derivative) — same conservative ceiling
+        // as the Qwen family until a verified card lands.
+        out.max_tokens = 32768;
+        out.complex_problem_max_tokens = 0;
+        out.source_label = "family:laguna";
+        return true;
+    }
+    return false;
+}
+
+// ── Public entry point ──────────────────────────────────────────────────
+
+// Resolve the model card for a loaded GGUF.
+//
+// Sources are tried in order: (1) a JSON sidecar named after the
+// normalised `general_name` inside the model-cards directory located via
+// `repo_root_hint`, (2) the per-family table keyed on
+// `general_architecture`, (3) the hard fallback. After a source is chosen
+// the derived quantities are computed: `think_max_tokens`, any effort
+// tier the source left at <= 0, the x-high / max collapse when no
+// complex_problem_max_tokens is given, and finally tier monotonicity.
+// Progress and fall-through reasons are logged to stderr. `gguf_path` is
+// accepted but not used.
+ModelCard resolve_model_card(const std::string & gguf_path,
+                             const std::string & general_name,
+                             const std::string & general_architecture,
+                             const std::string & repo_root_hint) {
+    (void)gguf_path;  // currently unused; kept in signature for future cards
+                      // keyed by file hash or path-based overrides.
+
+    ModelCard card;
+
+    // Try sidecar first.
+    bool resolved = false;
+    if (!general_name.empty()) {
+        std::string stem = normalize_model_card_stem(general_name);
+        std::string dir  = find_model_cards_dir(repo_root_hint);
+        if (!dir.empty() && !stem.empty()) {
+            std::string path = dir + "/" + stem + ".json";
+            std::fprintf(stderr,
+                "[model_card] probing sidecar: %s (from general.name='%s')\n",
+                path.c_str(), general_name.c_str());
+            if (file_exists(path)) {
+                std::string err;
+                // Load into a scratch card so a failed load cannot leave
+                // half-applied values in `card`.
+                ModelCard sidecar;
+                if (load_sidecar(path, sidecar, err)) {
+                    sidecar.source_label = path;
+                    card = sidecar;
+                    resolved = true;
+                } else {
+                    std::fprintf(stderr,
+                        "[model_card] sidecar parse failed (%s): %s — "
+                        "falling through\n", path.c_str(), err.c_str());
+                }
+            } else {
+                std::fprintf(stderr,
+                    "[model_card] sidecar not found at %s\n", path.c_str());
+            }
+        }
+    }
+
+    // Family fallback.
+    if (!resolved) {
+        if (family_fallback(general_architecture, card)) {
+            std::fprintf(stderr,
+                "[model_card] using family fallback for arch='%s'\n",
+                general_architecture.c_str());
+            resolved = true;
+        }
+    }
+
+    // Hard fallback.
+    if (!resolved) {
+        std::fprintf(stderr,
+            "[model_card] using hard fallback (no sidecar, no family match "
+            "for arch='%s')\n", general_architecture.c_str());
+        card.source_label = "hard-fallback";
+        card.max_tokens = 16000;
+        card.complex_problem_max_tokens = 0;
+    }
+
+    // Derive think_max_tokens and missing tier values. A negative reply
+    // budget is treated as 0 before it is subtracted.
+    if (card.hard_limit_reply_budget < 0) card.hard_limit_reply_budget = 0;
+    card.think_max_tokens = std::max(0, card.max_tokens - card.hard_limit_reply_budget);
+
+    int complex_think_max = card.complex_problem_max_tokens > 0
+        ? std::max(0, card.complex_problem_max_tokens - card.hard_limit_reply_budget)
+        : card.think_max_tokens;
+
+    // For each tier not explicitly set, fill via §3.3 formula.
+    EffortTiers computed = compute_default_tiers(card.think_max_tokens, complex_think_max);
+    if (card.effort_tiers.low    <= 0) card.effort_tiers.low    = computed.low;
+    if (card.effort_tiers.medium <= 0) card.effort_tiers.medium = computed.medium;
+    if (card.effort_tiers.high   <= 0) card.effort_tiers.high   = computed.high;
+    if (card.effort_tiers.x_high <= 0) card.effort_tiers.x_high = computed.x_high;
+    if (card.effort_tiers.max    <= 0) card.effort_tiers.max    = computed.max;
+
+    // If complex_problem_max_tokens is unspecified, collapse x-high and max
+    // to high (spec §3.3 last paragraph).
+    if (card.complex_problem_max_tokens <= 0) {
+        if (card.effort_tiers.x_high > card.effort_tiers.high) card.effort_tiers.x_high = card.effort_tiers.high;
+        if (card.effort_tiers.max    > card.effort_tiers.high) card.effort_tiers.max    = card.effort_tiers.high;
+    }
+
+    // Enforce monotone non-decreasing tiers. The absolute ceiling
+    // (max ≤ max_ctx − hard_limit_reply_budget) is applied later in
+    // server_main once max_ctx has been resolved.
+    enforce_tier_invariants(card.effort_tiers, card.source_label);
+
+    return card;
+}
+
+}  // namespace dflash::common
diff --git a/dflash/src/server/model_card.h b/dflash/src/server/model_card.h
new file mode 100644
index 000000000..bb82abea9
--- /dev/null
+++ b/dflash/src/server/model_card.h
@@ -0,0 +1,119 @@
+// Model card resolution for thinking-budget v2.
+//
+// At server startup, look up the loaded GGUF's recommended defaults from
+// (in order): a JSON sidecar at share/model_cards/<stem>.json,
+// a per-family fallback table, or the hard fallback (antirez/ds4 reference
+// values). See docs/specs/thinking-budget.md §3 for the full resolution
+// order and field semantics.
+//
+// The resolved ModelCard is consumed by server_main.cpp, which copies the
+// values into ServerConfig but only for fields the operator did NOT
+// override via CLI. CLI flags always win.
+
+#pragma once
+
+#include <string>
+
+#include <nlohmann/json.hpp>
+
+namespace dflash::common {
+
+// Phase-1 reasoning budgets per `reasoning.effort` tier. A value <= 0
+// means "not set"; the resolver fills it in. See spec §3.3, §4.2.
+struct EffortTiers {
+    int low = 0;
+    int medium = 0;
+    int high = 0;
+    int x_high = 0;
+    int max = 0;
+};
+
+// Sampler defaults from the model card. Each field's `has_*` companion
+// records whether the sidecar actually supplied a value, so the request
+// parser can know to fall back to its hard-coded default vs. apply this
+// one when the request omits the field.
+struct SamplingDefaults {
+    float temperature = 1.0f;
+    float top_p = 0.95f;
+    int top_k = 20;
+    float min_p = 0.0f;
+    float presence_penalty = 0.0f;
+    float repetition_penalty = 1.0f;
+
+    bool has_temperature = false;
+    bool has_top_p = false;
+    bool has_top_k = false;
+    bool has_min_p = false;
+    bool has_presence_penalty = false;
+    bool has_repetition_penalty = false;
+};
+
+// Resolved model card. `source_label` is a short, operator-facing tag
+// describing where the values came from; useful for the startup banner.
+struct ModelCard {
+    // One of: the sidecar path ("…/model_cards/<stem>.json"), "family:<arch>", "hard-fallback".
+    std::string source_label;
+
+    int max_tokens = 16000;              // spec §3.4 hard fallback
+    int complex_problem_max_tokens = 0;  // 0 = not specified
+    SamplingDefaults sampling;
+    EffortTiers effort_tiers;
+    // Bumped from 512 to 4096 on 2026-05-25. The original ds4_eval.c
+    // value was sized for DeepSeek-V4-flash's terse style but silently
+    // truncated almost every other model mid-answer. Terse sidecars can
+    // override down to 512-1024; verbose math/code models keep 4096.
+    int hard_limit_reply_budget = 4096;
+
+    // Two distinct concepts for thinking-budget control:
+    //
+    // (a) `thinking_marker` — the parse-side terminator. Bytes that signal
+    //     end-of-thinking to *us* (bench parser, chat template, response
+    //     formatter). If empty, arch-default applies: `</think>` for
+    //     qwen3-family, `<channel|>` for gemma4, `</think>` elsewhere.
+    // (b) `thinking_terminator_hint` — the inject-side directive. What we
+    //     tell the *model* when the budget hook fires. Free-form text;
+    //     the server tokenizes it and overrides sampled tokens with this
+    //     sequence at the budget boundary VERBATIM (no auto-append of
+    //     marker — operator includes it if they want guaranteed close).
+    //     Per Qwen3 tech report (arXiv 2505.09388) Qwen3.x's canonical
+    //     trained hint is the "Considering the limited time by the user…"
+    //     lead-in with `</think>` embedded. Gemma4's documented working
+    //     hint is the bare `<channel|>\n\n` transition cue (the trailing
+    //     newlines mirror Qwen3's no-think template suffix, giving gemma
+    //     the same trained transition cue — see
+    //     dflash/docs/experiments/gemma4-26b-thinking-control-2026-05-25.md).
+    std::string thinking_marker;
+    std::string thinking_terminator_hint;
+
+    // Phase-1 ceiling derived from `max_tokens - hard_limit_reply_budget`
+    // (the spec's `think_max`, §3.3). Default matches the defaults above.
+    int think_max_tokens = 11904;  // 16000 - 4096
+
+    // Raw parsed sidecar JSON, populated on successful sidecar load.
+    // Null (`raw_json.is_null() == true`) when family fallback or hard
+    // fallback was used. Exposed verbatim under `/props.model_card`
+    // (see docs/specs/props-endpoint.md §4.9).
+    nlohmann::json raw_json = nullptr;
+};
+
+// Normalize a GGUF `general.name` value to a model-card filename stem.
+// "Qwen3.6 27B" -> "qwen3.6-27b". Lowercases, replaces spaces with `-`, +// strips characters outside `[a-z0-9.-]`. Exposed for tests/banner. +std::string normalize_model_card_stem(const std::string & general_name); + +// Resolve the model card for the loaded GGUF. +// +// Search order (spec §3.1): +// 1. share/model_cards/.json +// 2. Per-family fallback table keyed on general_architecture +// 3. Hard fallback (antirez/ds4 reference values) +// +// `repo_root_hint` is an optional explicit directory to search for +// share/model_cards/; pass empty to use auto-discovery (binary's parent, +// then cwd, then $DFLASH_MODEL_CARDS_DIR). +ModelCard resolve_model_card(const std::string & gguf_path, + const std::string & general_name, + const std::string & general_architecture, + const std::string & repo_root_hint = ""); + +} // namespace dflash::common diff --git a/dflash/src/server/prefix_cache.cpp b/dflash/src/server/prefix_cache.cpp index 26657b276..f42177c4f 100644 --- a/dflash/src/server/prefix_cache.cpp +++ b/dflash/src/server/prefix_cache.cpp @@ -271,6 +271,7 @@ std::pair PrefixCache::lookup(const std::vector & prompt_ids) } if (best_slot >= 0) { + lifetime_hits_.fetch_add(1, std::memory_order_relaxed); std::fprintf(stderr, "[pc] lookup hit slot=%d prefix_len=%d (of %zu total)\n", best_slot, best_len, prompt_ids.size()); } @@ -314,12 +315,16 @@ void PrefixCache::confirm_inline_snap(int slot, int target_cut, // Evict the reserved entry (if any). 
if (has_pending_evict_) { int idx = find_entry(pending_evict_key_); - if (idx >= 0) entries_.erase(entries_.begin() + idx); + if (idx >= 0) { + entries_.erase(entries_.begin() + idx); + entries_size_count_.fetch_sub(1, std::memory_order_relaxed); + } has_pending_evict_ = false; } auto key = hash_prefix(prompt_ids.data(), target_cut); entries_.push_back({key, slot}); + entries_size_count_.fetch_add(1, std::memory_order_relaxed); std::fprintf(stderr, "[pc] inline-snap committed slot=%d prefix_len=%d\n", slot, target_cut); } @@ -328,7 +333,10 @@ void PrefixCache::abort_inline_snap(int /*slot*/) { if (disabled_) return; if (has_pending_evict_) { int idx = find_entry(pending_evict_key_); - if (idx >= 0) entries_.erase(entries_.begin() + idx); + if (idx >= 0) { + entries_.erase(entries_.begin() + idx); + entries_size_count_.fetch_sub(1, std::memory_order_relaxed); + } has_pending_evict_ = false; } } @@ -337,6 +345,7 @@ void PrefixCache::mark_all_cleared() { if (disabled_) return; int n = (int)entries_.size(); entries_.clear(); + entries_size_count_.store(0, std::memory_order_relaxed); next_slot_ = 0; has_pending_evict_ = false; std::fprintf(stderr, "[pc] all-cleared — dropped %d LRU entries\n", n); @@ -378,6 +387,7 @@ std::pair PrefixCache::lookup_full(const std::vector & prompt int slot = e.slot; int cur_ids_len = e.cur_ids_len; move_full_to_end(idx); + full_lifetime_hits_.fetch_add(1, std::memory_order_relaxed); std::fprintf(stderr, "[pc] full-cache hit slot=%d cur_ids_len=%d\n", slot, cur_ids_len); @@ -412,7 +422,10 @@ void PrefixCache::confirm_full_snap(int slot, if (full_has_pending_evict_) { int idx = find_full_entry(full_pending_evict_key_); - if (idx >= 0) full_entries_.erase(full_entries_.begin() + idx); + if (idx >= 0) { + full_entries_.erase(full_entries_.begin() + idx); + full_entries_size_count_.fetch_sub(1, std::memory_order_relaxed); + } full_has_pending_evict_ = false; } @@ -425,6 +438,7 @@ void PrefixCache::confirm_full_snap(int slot, 
std::chrono::steady_clock::now().time_since_epoch()).count(); entry.hits = 0; full_entries_.push_back({key, std::move(entry)}); + full_entries_size_count_.fetch_add(1, std::memory_order_relaxed); std::fprintf(stderr, "[pc] full-cache committed slot=%d cur_ids_len=%d\n", slot, cur_ids_len); @@ -435,4 +449,19 @@ void PrefixCache::abort_full_snap(int /*slot*/) { full_has_pending_evict_ = false; } +PrefixCache::InlineStats PrefixCache::stats() const { + if (disabled_) return {0, 0, 0}; + return {cap_, + (int)entries_size_count_.load(std::memory_order_relaxed), + lifetime_hits_.load(std::memory_order_relaxed)}; +} + +PrefixCache::FullStats PrefixCache::full_stats() const { + if (full_disabled_) return {false, 0, 0, 0, 0}; + return {true, full_cap_, + (int)full_entries_size_count_.load(std::memory_order_relaxed), + full_disk_bytes_.load(std::memory_order_relaxed), + full_lifetime_hits_.load(std::memory_order_relaxed)}; +} + } // namespace dflash::common diff --git a/dflash/src/server/prefix_cache.h b/dflash/src/server/prefix_cache.h index b9ec001ff..b0494d9c0 100644 --- a/dflash/src/server/prefix_cache.h +++ b/dflash/src/server/prefix_cache.h @@ -15,6 +15,7 @@ #include "tokenizer.h" #include +#include #include #include #include @@ -105,6 +106,31 @@ class PrefixCache { // Abort reservation. void abort_full_snap(int slot); + // ── Introspection (for /props) ────────────────────────────────── + + struct InlineStats { + int capacity; + int in_use; + int64_t lifetime_hits; + }; + struct FullStats { + bool enabled; + int capacity; + int in_use; + int64_t disk_bytes; + int64_t lifetime_hits; + }; + + // Lockless snapshot for /props. Every published field — hit + // counters, disk-bytes, AND the two in-use counts — is mirrored to + // an std::atomic that the daemon thread updates alongside the + // backing vector. /props reads those atomics with + // memory_order_relaxed, so the cross-thread read is well-defined + // under the C++ memory model. 
Used for an ops dashboard; not safe + // for control-flow decisions. + InlineStats stats() const; + FullStats full_stats() const; + private: bool disabled_ = true; int cap_ = 0; @@ -134,6 +160,21 @@ class PrefixCache { std::vector full_entries_; PrefixHash full_pending_evict_key_{}; bool full_has_pending_evict_ = false; + // Atomic so /props can read them from a client thread without + // tearing across the daemon thread's increments. Relaxed ordering + // is sufficient — no synchronization with other state required. + std::atomic lifetime_hits_{0}; // inline cache hits + std::atomic full_lifetime_hits_{0}; // full-compress cache hits + std::atomic full_disk_bytes_{0}; // best-effort snapshot of disk usage + // Atomic mirrors of `entries_.size()` and `full_entries_.size()`. + // The vectors themselves are mutated only on the daemon thread + // under the daemon's serialised request loop, but `/props` reads + // happen from the client thread — calling `.size()` there is a + // data race per the C++ memory model. Bump these alongside every + // push_back / erase / clear so the public introspection counters + // stay well-defined. (Codex r1 P2 follow-up.) 
+ std::atomic entries_size_count_{0}; // mirrors entries_.size() + std::atomic full_entries_size_count_{0}; // mirrors full_entries_.size() // Helpers int find_entry(const PrefixHash & h) const; diff --git a/dflash/src/server/server_main.cpp b/dflash/src/server/server_main.cpp index ab766a46a..ca9cd1da4 100644 --- a/dflash/src/server/server_main.cpp +++ b/dflash/src/server/server_main.cpp @@ -14,11 +14,14 @@ #include "http_server.h" #include "chat_template.h" +#include "model_card.h" #include "common/backend_factory.h" #include "common/gguf_inspect.h" #include "common/peer_access.h" #include "placement/pflash_placement.h" +#include "gguf.h" + #include #include #include @@ -136,7 +139,9 @@ static void print_usage(const char * prog) { " --port Listen port (default: 8080)\n" " --host Bind address (default: 0.0.0.0)\n" " --max-ctx Max context length (default: 131072)\n" - " --max-tokens Default max output tokens (default: 4096)\n" + " --max-tokens Default max output tokens (legacy alias for\n" + " --default-max-tokens; loses to --default-max-tokens\n" + " when both are passed)\n" " --target-device Target device (default: auto:0)\n" " --draft-device Draft device (default: auto:0)\n" " --draft-ipc-bin Remote backend IPC daemon for mixed backends\n" @@ -148,9 +153,30 @@ static void print_usage(const char * prog) { " --chunk Chunked-prefill chunk size (default: 512)\n" " --fa-window Flash-attention sliding window (default: 0=full)\n" " --model-name Model name for /v1/models (default: dflash)\n" + " --prefix-cache-slots Prefix cache slots (default: 32, 0 disables)\n" " --ddtree Enable DDTree speculative decode\n" " --ddtree-budget DDTree budget (default: 64)\n" " --no-cors Disable CORS headers\n" + " --think-max-tokens Phase-1 reasoning cap when a request opts in\n" + " via thinking:{type:enabled} (default: 15488 =\n" + " default_max_tokens - hard_limit_reply_budget;\n" + " may be raised by share/model_cards/.json)\n" + " --default-max-tokens Combined cap when request omits 
max_tokens\n" + " (default: 16000, matches antirez/ds4 ds4_eval.c;\n" + " may be raised by share/model_cards/.json)\n" + " --hard-limit-reply-budget \n" + " Level 2 force-close: when this many tokens\n" + " remain (of the combined cap), inject \n" + " so the model gets that budget to write the\n" + " visible answer. Mirrors ds4_eval.c's\n" + " hard_limit_reply_budget. 0 disables. (default: 4096)\n" + " --reasoning-effort-low Phase-1 budget when request asks effort=low\n" + " --reasoning-effort-medium Phase-1 budget when request asks effort=medium\n" + " --reasoning-effort-high Phase-1 budget when request asks effort=high\n" + " --reasoning-effort-x-high Phase-1 budget when request asks effort=x-high\n" + " --reasoning-effort-max Phase-1 budget when request asks effort=max\n" + " Defaults come from share/model_cards/.json;\n" + " see docs/specs/thinking-budget.md §3.\n" "\n" "KV cache:\n" " --cache-type-k KV cache K type (f16,bf16,q4_0,q4_1,q5_0,q5_1,q8_0,tq3_0)\n" @@ -200,6 +226,29 @@ int main(int argc, char ** argv) { bool target_device_seen = false; bool target_devices_seen = false; + // Track which thinking-budget tunables the operator set via CLI. + // Those values win over the model card (spec §3.1: "Explicit CLI + // flag" is the first source in the resolution order). Anything not + // overridden is taken from the resolved ModelCard after backend load. + struct CliOverrides { + bool think_max_tokens = false; + bool default_max_tokens = false; + bool hard_limit_reply_budget = false; + bool effort_low = false; + bool effort_medium = false; + bool effort_high = false; + bool effort_x_high = false; + bool effort_max = false; + } cli_set; + + // Track whether the operator passed the legacy --max-tokens alias. 
+ // When set and --default-max-tokens is NOT also passed, --max-tokens + // wins over the model card for default_max_tokens (it was a documented + // CLI flag before the thinking-budget v2 work, and shipped deployments + // rely on it actually capping output). + bool legacy_max_tokens_set = false; + int legacy_max_tokens_val = 0; + for (int i = 2; i < argc; i++) { if (std::strcmp(argv[i], "--draft") == 0 && i + 1 < argc) { bargs.draft_path = argv[++i]; @@ -212,7 +261,12 @@ int main(int argc, char ** argv) { sconfig.max_ctx = v; bargs.device.max_ctx = v; } else if (std::strcmp(argv[i], "--max-tokens") == 0 && i + 1 < argc) { - sconfig.max_tokens = std::atoi(argv[++i]); + // Legacy alias for --default-max-tokens. Resolved after the + // arg-parse loop so an explicit --default-max-tokens still wins + // regardless of CLI order. + legacy_max_tokens_val = std::atoi(argv[++i]); + legacy_max_tokens_set = true; + sconfig.max_tokens = legacy_max_tokens_val; } else if (std::strcmp(argv[i], "--target-device") == 0 && i + 1 < argc) { if (target_devices_seen) { std::fprintf(stderr, "[server] --target-device conflicts with --target-devices\n"); @@ -261,6 +315,8 @@ int main(int argc, char ** argv) { bargs.fa_window = std::atoi(argv[++i]); } else if (std::strcmp(argv[i], "--model-name") == 0 && i + 1 < argc) { sconfig.model_name = argv[++i]; + } else if (std::strcmp(argv[i], "--prefix-cache-slots") == 0 && i + 1 < argc) { + sconfig.prefix_cache_cap = std::atoi(argv[++i]); } else if (std::strcmp(argv[i], "--ddtree") == 0) { bargs.ddtree_mode = true; bargs.fast_rollback = true; @@ -268,6 +324,30 @@ int main(int argc, char ** argv) { bargs.ddtree_budget = std::atoi(argv[++i]); } else if (std::strcmp(argv[i], "--no-cors") == 0) { sconfig.enable_cors = false; + } else if (std::strcmp(argv[i], "--think-max-tokens") == 0 && i + 1 < argc) { + sconfig.think_max_tokens = std::atoi(argv[++i]); + cli_set.think_max_tokens = true; + } else if (std::strcmp(argv[i], "--default-max-tokens") == 0 
&& i + 1 < argc) { + sconfig.default_max_tokens = std::atoi(argv[++i]); + cli_set.default_max_tokens = true; + } else if (std::strcmp(argv[i], "--hard-limit-reply-budget") == 0 && i + 1 < argc) { + sconfig.hard_limit_reply_budget = std::atoi(argv[++i]); + cli_set.hard_limit_reply_budget = true; + } else if (std::strcmp(argv[i], "--reasoning-effort-low") == 0 && i + 1 < argc) { + sconfig.effort_tiers.low = std::atoi(argv[++i]); + cli_set.effort_low = true; + } else if (std::strcmp(argv[i], "--reasoning-effort-medium") == 0 && i + 1 < argc) { + sconfig.effort_tiers.medium = std::atoi(argv[++i]); + cli_set.effort_medium = true; + } else if (std::strcmp(argv[i], "--reasoning-effort-high") == 0 && i + 1 < argc) { + sconfig.effort_tiers.high = std::atoi(argv[++i]); + cli_set.effort_high = true; + } else if (std::strcmp(argv[i], "--reasoning-effort-x-high") == 0 && i + 1 < argc) { + sconfig.effort_tiers.x_high = std::atoi(argv[++i]); + cli_set.effort_x_high = true; + } else if (std::strcmp(argv[i], "--reasoning-effort-max") == 0 && i + 1 < argc) { + sconfig.effort_tiers.max = std::atoi(argv[++i]); + cli_set.effort_max = true; } else if (std::strcmp(argv[i], "--prefill-compression") == 0 && i + 1 < argc) { const char * mode = argv[++i]; if (std::strcmp(mode, "auto") == 0) @@ -464,6 +544,126 @@ int main(int argc, char ** argv) { return 2; } + // ── Thinking-budget v2: resolve model card and apply to ServerConfig ── + // Read general.name + general.architecture directly from the GGUF. + // This is best-effort; if the file can't be opened (corruption, removed + // after backend init) we fall through to hard-fallback defaults via + // resolve_model_card(...). 
+ std::string general_name; + std::string general_arch = arch; // fall back to detect_arch() result + { + gguf_init_params gip{}; + gip.no_alloc = true; + gip.ctx = nullptr; + gguf_context * gctx = gguf_init_from_file(bargs.model_path, gip); + if (gctx) { + int64_t name_id = gguf_find_key(gctx, "general.name"); + if (name_id >= 0) { + const char * v = gguf_get_val_str(gctx, name_id); + if (v) general_name = v; + } + int64_t arch_id = gguf_find_key(gctx, "general.architecture"); + if (arch_id >= 0) { + const char * v = gguf_get_val_str(gctx, arch_id); + if (v) general_arch = v; + } + gguf_free(gctx); + } + std::fprintf(stderr, + "[server] gguf meta: general.name='%s' general.architecture='%s'\n", + general_name.c_str(), general_arch.c_str()); + } + + ModelCard card = resolve_model_card( + bargs.model_path ? bargs.model_path : "", + general_name, + general_arch, + /*repo_root_hint=*/""); + + // Apply each tunable to sconfig only if the operator did NOT set it + // via CLI. CLI always wins (spec §3.1 source #1). + // + // --max-tokens is a documented legacy alias for --default-max-tokens + // and beats the card; --default-max-tokens still wins over it when + // both are passed (the more specific flag). + if (!cli_set.default_max_tokens) { + if (legacy_max_tokens_set) { + sconfig.default_max_tokens = legacy_max_tokens_val; + cli_set.default_max_tokens = true; + } else { + sconfig.default_max_tokens = card.max_tokens; + } + } + if (!cli_set.hard_limit_reply_budget) { + sconfig.hard_limit_reply_budget = card.hard_limit_reply_budget; + } + if (!cli_set.think_max_tokens) { + // Recompute from possibly-updated combined cap + reply budget so + // the invariant (think_max = default_max - hard_limit) holds when + // the operator overrode one but not the other. 
+ sconfig.think_max_tokens = std::max(0, + sconfig.default_max_tokens - sconfig.hard_limit_reply_budget); + // But if the card itself specified a smaller think_max_tokens + // (because complex tiers ride above default_max_tokens — see + // spec §3.3), respect that as a floor on the ceiling. + // Practically: card.think_max_tokens is just (max_tokens - reply), + // so this collapses to the same value when neither was overridden. + if (card.think_max_tokens > 0 && + card.think_max_tokens < sconfig.think_max_tokens) { + sconfig.think_max_tokens = card.think_max_tokens; + } + } + // Effort tiers: per-tier CLI override. We pre-stored the CLI value + // into sconfig.effort_tiers above; for any tier the operator didn't + // set, take the card's value. + if (!cli_set.effort_low) sconfig.effort_tiers.low = card.effort_tiers.low; + if (!cli_set.effort_medium) sconfig.effort_tiers.medium = card.effort_tiers.medium; + if (!cli_set.effort_high) sconfig.effort_tiers.high = card.effort_tiers.high; + if (!cli_set.effort_x_high) sconfig.effort_tiers.x_high = card.effort_tiers.x_high; + if (!cli_set.effort_max) sconfig.effort_tiers.max = card.effort_tiers.max; + + // Sampler defaults — currently no CLI surface; always take from card. + sconfig.sampler_defaults = card.sampling; + + sconfig.model_card_source_label = card.source_label; + // Stash the raw sidecar JSON (or null on family/hard fallback) so + // /props.model_card can re-emit it verbatim. See + // docs/specs/props-endpoint.md §4.9. + sconfig.model_card_json = card.raw_json; + + // Spec §3.5 invariant: each effort tier must fit under the server's + // absolute ceiling, which is `max_ctx - hard_limit_reply_budget` (the + // most tokens any single request — including its phase-1 portion — + // can occupy while still leaving the reply-reserve headroom). 
+ // + // This is intentionally *not* clamped to think_max_tokens / default_ + // max_tokens: effort tiers are phase-1 budgets, and the card's + // complex_problem_max_tokens can legitimately exceed default_max_tokens + // (Qwen3.6's card says max=81408 with default=32768). A request that + // wants to use such a tier must also pass an explicit max_tokens large + // enough to cover it (see spec §4.4); the request parser narrows the + // effective phase-1 cap when max_tokens is smaller. + const int tier_ceiling = std::max(0, + sconfig.max_ctx - sconfig.hard_limit_reply_budget); + std::fprintf(stderr, + "[server] effort-tier ceiling = max_ctx(%d) - hard_limit_reply_budget(%d) = %d\n", + sconfig.max_ctx, sconfig.hard_limit_reply_budget, tier_ceiling); + auto clamp_tier = [&](const char * name, int & v) { + if (tier_ceiling > 0 && v > tier_ceiling) { + std::fprintf(stderr, + "[server] reasoning-effort %s=%d clamped to " + "max_ctx - hard_limit_reply_budget = %d\n", + name, v, tier_ceiling); + v = tier_ceiling; + } + if (v < 0) v = 0; + }; + clamp_tier("low", sconfig.effort_tiers.low); + clamp_tier("medium", sconfig.effort_tiers.medium); + clamp_tier("high", sconfig.effort_tiers.high); + clamp_tier("x-high", sconfig.effort_tiers.x_high); + clamp_tier("max", sconfig.effort_tiers.max); + // Start HTTP server. std::fprintf(stderr, "\n"); std::fprintf(stderr, "[server] ╭─── Configuration ───────────────────────────────────╮\n"); @@ -473,7 +673,32 @@ int main(int argc, char ** argv) { std::fprintf(stderr, "[server] │ draft = %s\n", bargs.draft_path ? bargs.draft_path : "(none)"); std::fprintf(stderr, "[server] │ model_name = %s\n", sconfig.model_name.c_str()); std::fprintf(stderr, "[server] │ max_ctx = %d\n", sconfig.max_ctx); - std::fprintf(stderr, "[server] │ max_tokens = %d\n", sconfig.max_tokens); + // max_tokens default for requests that omit the field. The request + // parser reads default_max_tokens (16000), NOT sconfig.max_tokens + // (legacy 4096). 
Print default_max_tokens so the banner doesn't lie. + std::fprintf(stderr, "[server] │ model_card = %s\n", + sconfig.model_card_source_label.empty() + ? "(unresolved)" : sconfig.model_card_source_label.c_str()); + auto src_of = [&](bool cli_overridden) { + return cli_overridden ? "from CLI" : sconfig.model_card_source_label.c_str(); + }; + std::fprintf(stderr, "[server] │ max_tokens = %d (%s)\n", + sconfig.default_max_tokens, src_of(cli_set.default_max_tokens)); + std::fprintf(stderr, "[server] │ think_max_tokens= %d (%s)\n", + sconfig.think_max_tokens, src_of(cli_set.think_max_tokens)); + std::fprintf(stderr, "[server] │ hard_limit_reply= %d (%s)\n", + sconfig.hard_limit_reply_budget, + src_of(cli_set.hard_limit_reply_budget)); + std::fprintf(stderr, "[server] │ effort tiers = low=%d (%s)\n", + sconfig.effort_tiers.low, src_of(cli_set.effort_low)); + std::fprintf(stderr, "[server] │ medium=%d (%s)\n", + sconfig.effort_tiers.medium, src_of(cli_set.effort_medium)); + std::fprintf(stderr, "[server] │ high=%d (%s)\n", + sconfig.effort_tiers.high, src_of(cli_set.effort_high)); + std::fprintf(stderr, "[server] │ x-high=%d (%s)\n", + sconfig.effort_tiers.x_high, src_of(cli_set.effort_x_high)); + std::fprintf(stderr, "[server] │ max=%d (%s)\n", + sconfig.effort_tiers.max, src_of(cli_set.effort_max)); std::fprintf(stderr, "[server] │ target_device = %s\n", placement_device_name(bargs.device).c_str()); if (bargs.device.is_layer_split()) { @@ -504,6 +729,7 @@ int main(int argc, char ** argv) { std::fprintf(stderr, "[server] │ fa_window = %d\n", bargs.fa_window); std::fprintf(stderr, "[server] │ ddtree = %s\n", bargs.ddtree_mode ? "ON" : "off"); std::fprintf(stderr, "[server] │ ddtree_budget = %d\n", bargs.ddtree_budget); + std::fprintf(stderr, "[server] │ prefix_cache = %d slots\n", sconfig.prefix_cache_cap); std::fprintf(stderr, "[server] │ cors = %s\n", sconfig.enable_cors ? 
"ON" : "off"); std::fprintf(stderr, "[server] │ cache_type_k = %s\n", #ifdef GGML_USE_HIP @@ -536,6 +762,98 @@ int main(int argc, char ** argv) { } std::fprintf(stderr, "[server] ╰─────────────────────────────────────────────────────╯\n\n"); + // Populate /props introspection fields. These are runtime config snaps + // — the /props handler reads them lockless from config_ so they need to + // be set BEFORE the HttpServer constructor copies sconfig. + sconfig.arch = arch; + sconfig.model_path = bargs.model_path ? bargs.model_path : ""; + sconfig.draft_path = bargs.draft_path ? bargs.draft_path : ""; + sconfig.fa_window = bargs.fa_window; + sconfig.ddtree_budget = bargs.ddtree_budget; + sconfig.speculative_enabled = bargs.ddtree_mode; + sconfig.target_sharding = bargs.device.is_layer_split(); + // KV type: report the operator's choice if set, else the auto-default + // the daemon picks. Matches the printed table above. + sconfig.kv_cache_k = cache_type_k.empty() +#ifdef GGML_USE_HIP + ? "q4_0" +#else + ? (sconfig.max_ctx > 6144 ? "tq3_0" : "q4_0") +#endif + : cache_type_k; + sconfig.kv_cache_v = cache_type_v.empty() +#ifdef GGML_USE_HIP + ? "q4_0" +#else + ? (sconfig.max_ctx > 6144 ? "tq3_0" : "q4_0") +#endif + : cache_type_v; + sconfig.runtime_backend = +#ifdef GGML_USE_HIP + "hip"; +#else + "cuda"; +#endif + sconfig.chunk = bargs.chunk; + sconfig.target_device = placement_device_name(bargs.device); + sconfig.draft_device = bargs.draft_path + ? placement_device_name(bargs.draft_device) + : std::string(); + // Tokenizer ID: best-effort. The Tokenizer class doesn't currently + // expose the GGUF metadata key it was loaded from, so leave empty + // and let /props report null. (Add a getter on Tokenizer later.) + + // Resolve the Level 2 force-close sequence. Two concepts, both sourced + // from the model card sidecar (see model_card.h for semantics): + // - marker: bytes that signal end-of-thinking to *us* (parsers). 
+ // Arch default if sidecar doesn't override: `` for qwen, + // `` for gemma4, `` for everything else. + // - hint: directive injected to tell the *model* to wrap up. Taken + // verbatim — the operator decides whether to include the marker + // at the end. Empty hint → inject just the marker (bare close). + // + // We do NOT auto-append the marker to the hint. Reasoning models have + // varied trained pathways; some respond to a directive followed by the + // marker (Qwen3.x: trained "Considering the limited time..." lead-in), + // others to just a transition cue after the marker (gemma4: `\n\n` + // — see dflash/docs/experiments/gemma4-26b-thinking-control-2026-05-25.md + // for the empirical finding that the `\n\n` mirrors Qwen3's no-think + // template suffix and gives gemma4 the trained "now answer" cue, where + // a bare `` left it mid-derivation). For each arch ship the + // right `thinking_terminator_hint` in its sidecar; for new arches the + // bare-marker fallback is safe but suboptimal. See spec §5.3. + if (sconfig.hard_limit_reply_budget > 0) { + std::string marker = card.thinking_marker; + if (marker.empty()) { + marker = (arch == "gemma4") ? "" : ""; + } + const std::string close_text = card.thinking_terminator_hint.empty() + ? marker + : card.thinking_terminator_hint; + auto close_ids = tokenizer.encode(close_text); + if (!close_ids.empty()) { + sconfig.think_close_token_ids = close_ids; + const char * src = card.thinking_terminator_hint.empty() + ? "marker-only" : "sidecar-hint"; + std::fprintf(stderr, + "[server] level-2 force-close (%s, %zu chars → %zu tokens, " + "hard_limit_reply_budget = %d)\n", + src, close_text.size(), close_ids.size(), + sconfig.hard_limit_reply_budget); + std::fprintf(stderr, + "[server] level-2 force-close token ids: "); + for (size_t i = 0; i < std::min(close_ids.size(), 16); ++i) { + std::fprintf(stderr, "%s%d", i ? 
"," : "", close_ids[i]); + } + if (close_ids.size() > 16) std::fprintf(stderr, ",..."); + std::fprintf(stderr, "\n"); + } else { + std::fprintf(stderr, + "[server] level-2 force-close DISABLED: text %.40s... " + "tokenizes to empty.\n", close_text.c_str()); + } + } + HttpServer server(*backend, tokenizer, sconfig); server.set_chat_format(chat_format_for_arch(arch)); g_server = &server; diff --git a/dflash/src/server/sse_emitter.cpp b/dflash/src/server/sse_emitter.cpp index 36a7a3864..604f11a73 100644 --- a/dflash/src/server/sse_emitter.cpp +++ b/dflash/src/server/sse_emitter.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include namespace dflash::common { @@ -48,6 +49,25 @@ static int64_t unix_timestamp() { std::chrono::system_clock::now().time_since_epoch()).count(); } +// Round `x` to 1 decimal place. JSON serialization of doubles can emit +// 17 significant digits which is noisy in client logs and bench output; +// caller-side rounding keeps the wire format stable across runs. +static double round1(double x) { + return std::round(x * 10.0) / 10.0; +} + +json build_timings_json(const GenTimings & t, int completion_tokens) { + const double prefill_ms = round1(t.prefill_s * 1000.0); + const double decode_ms = round1(t.decode_s * 1000.0); + const double tps = t.decode_s > 0.0 + ? round1((double)completion_tokens / t.decode_s) : 0.0; + return json{ + {"prefill_ms", prefill_ms}, + {"decode_ms", decode_ms}, + {"decode_tokens_per_sec", tps} + }; +} + // ─── Constructor ──────────────────────────────────────────────────────── SseEmitter::SseEmitter(ApiFormat format, @@ -56,7 +76,6 @@ SseEmitter::SseEmitter(ApiFormat format, int prompt_tokens, const json & tools, ToolMemory * tool_memory, - bool started_in_thinking, const std::vector & stop_sequences) : format_(format) , request_id_(request_id) @@ -64,8 +83,8 @@ SseEmitter::SseEmitter(ApiFormat format, , prompt_tokens_(prompt_tokens) , tools_(tools) , tool_memory_(tool_memory) - , mode_(started_in_thinking ? 
StreamMode::REASONING : StreamMode::CONTENT) - , active_kind_(started_in_thinking ? "thinking" : "text") + , mode_(StreamMode::CONTENT) + , active_kind_("text") , stop_sequences_(stop_sequences) , created_at_(unix_timestamp()) , msg_item_id_(gen_item_id()) @@ -194,6 +213,32 @@ std::vector SseEmitter::emit_start() { std::vector SseEmitter::emit_token(const std::string & raw_piece) { if (stop_hit_) return {}; // already stopped + // Track the first emit_token call whose mode-on-entry is CONTENT — + // that's the first token attributed to the visible reply. Mode-on- + // entry matters because a token whose text *contains* `` + // arrives while mode is still REASONING; the transition fires + // mid-emit. The token AFTER that transition is the first content + // token. Captured here so http_server can compute the natural-close + // split without a parallel bump_count loop. + // + // Exception: a leading `` opener (Qwen3.6's thinking-enabled + // first token, or the synthesized `<|channel>` → `` map for + // gemma4) arrives while mode is still CONTENT — the emitter's + // default — but the piece immediately transitions to REASONING. + // Capturing fci=0 in that case would misreport thinking_tokens as 0 + // for any streamed-thinking response. Detect the `` opener + // here (lookahead in the unsanitized piece, before the state + // machine runs) and skip the capture so it can fire on a later + // CONTENT-mode token after the natural close. + const bool entry_is_think_opener = + mode_ == StreamMode::CONTENT && + raw_piece.find(THINK_OPEN) != std::string::npos; + if (first_content_token_index_ < 0 && mode_ == StreamMode::CONTENT && + !entry_is_think_opener) { + first_content_token_index_ = emit_token_count_; + } + emit_token_count_++; + // Sanitize input to prevent json::dump() from throwing on invalid UTF-8. 
std::string piece = utf8_sanitize(raw_piece); std::vector out; @@ -247,7 +292,6 @@ std::vector SseEmitter::emit_token(const std::string & raw_piece) { if (mode_ == StreamMode::REASONING) { // Strip leading tag from reasoning (ds4 pattern). - // When started_in_thinking=true, the model may echo again. // The model may emit whitespace before , so we skip leading // whitespace first, then check for the tag. if (!checked_think_prefix_) { @@ -430,7 +474,8 @@ void SseEmitter::emit_content_delta(std::vector & out, // ─── emit_finish ──────────────────────────────────────────────────────── -std::vector SseEmitter::emit_finish(int completion_tokens) { +std::vector SseEmitter::emit_finish(int completion_tokens, + const GenTimings * timings) { std::vector out; // Flush remaining window @@ -570,16 +615,21 @@ std::vector SseEmitter::emit_finish(int completion_tokens) { case ApiFormat::OPENAI_CHAT: { // Finish reason chunk out.push_back(format_openai_delta(json::object(), fr.c_str())); - // Usage chunk + // Usage chunk — includes timings sub-object when caller supplied + // generation wall-clock breakdown (see spec §6.3). + json usage_body = { + {"prompt_tokens", prompt_tokens_}, + {"completion_tokens", completion_tokens}, + {"total_tokens", prompt_tokens_ + completion_tokens} + }; + if (timings) { + usage_body["timings"] = build_timings_json(*timings, completion_tokens); + } json usage = { {"id", request_id_}, {"object", "chat.completion.chunk"}, {"created", created_at_}, {"model", model_name_}, {"choices", json::array()}, - {"usage", { - {"prompt_tokens", prompt_tokens_}, - {"completion_tokens", completion_tokens}, - {"total_tokens", prompt_tokens_ + completion_tokens} - }} + {"usage", usage_body} }; out.push_back(sse_data(usage.dump())); out.push_back(sse_data("[DONE]")); @@ -603,10 +653,14 @@ std::vector SseEmitter::emit_finish(int completion_tokens) { // tool_result back), else "end_turn". 
Stop-sequence hits also report // "end_turn" (Anthropic has no dedicated reason for that case). const char * stop_reason = tool_calls_.empty() ? "end_turn" : "tool_use"; + json anth_usage = {{"output_tokens", completion_tokens}}; + if (timings) { + anth_usage["timings"] = build_timings_json(*timings, completion_tokens); + } json msg_delta = { {"type", "message_delta"}, {"delta", {{"stop_reason", stop_reason}, {"stop_sequence", nullptr}}}, - {"usage", {{"output_tokens", completion_tokens}}} + {"usage", anth_usage} }; out.push_back(sse_event("message_delta", msg_delta.dump())); // message_stop @@ -659,17 +713,21 @@ std::vector SseEmitter::emit_finish(int completion_tokens) { } // response.completed + json resp_usage = { + {"input_tokens", prompt_tokens_}, + {"output_tokens", completion_tokens}, + {"total_tokens", prompt_tokens_ + completion_tokens} + }; + if (timings) { + resp_usage["timings"] = build_timings_json(*timings, completion_tokens); + } json shell = { {"id", request_id_}, {"object", "response"}, {"created_at", created_at_}, {"status", "completed"}, {"model", model_name_}, {"output", final_output}, {"output_text", accumulated_content_}, - {"usage", { - {"input_tokens", prompt_tokens_}, - {"output_tokens", completion_tokens}, - {"total_tokens", prompt_tokens_ + completion_tokens} - }} + {"usage", resp_usage} }; out.push_back(format_responses_event("response.completed", {{"response", shell}})); break; diff --git a/dflash/src/server/sse_emitter.h b/dflash/src/server/sse_emitter.h index 9efdac296..4710b8d45 100644 --- a/dflash/src/server/sse_emitter.h +++ b/dflash/src/server/sse_emitter.h @@ -31,6 +31,26 @@ using SseEventFn = std::function & stop_sequences = {}); // Emit the initial SSE events (role delta, message_start, etc.) @@ -51,8 +70,14 @@ class SseEmitter { std::vector emit_token(const std::string & piece); // Flush remaining buffered content and emit final events. - // `completion_tokens` is the total token count. 
- std::vector emit_finish(int completion_tokens); + // `completion_tokens` is the total token count. `timings`, when + // non-null, is folded into the terminal `usage` block (OpenAI: + // usage chunk; Anthropic: message_delta usage; Responses: + // response.completed usage). Pass nullptr to suppress, matching + // the pre-timings API for unit tests that don't exercise that + // shape. + std::vector emit_finish(int completion_tokens, + const GenTimings * timings = nullptr); // Get the finish_reason for non-streaming responses. std::string finish_reason() const; @@ -69,6 +94,34 @@ class SseEmitter { // Get the reasoning text (after emit_finish). const std::string & reasoning_text() const { return reasoning_text_; } + // Current stream mode — callers tracking per-mode token counts use + // this to attribute a token to either REASONING or CONTENT. Sampled + // before each emit_token() call so tokens that span a + // transition are attributed to the mode they entered with. + StreamMode mode() const { return mode_; } + + // Zero-based index of the first emit_token() call that produced + // CONTENT-mode output (i.e., the first token after the model's + // natural ``). Returns -1 if the model never closed + // `` and the emitter stayed in REASONING for the whole + // stream. + // + // Callers use this to split a single phase-1 token vector into + // its reasoning prefix and content suffix when the model + // self-closed mid-stream: `finish_details.thinking_tokens` = + // first_content_token_index() (or the full size if -1), the + // remainder counts as content. Equivalent to the per-call + // bump_count(mode()) tracking but pushed into the emitter so + // both streaming and non-streaming response builders can read + // the same split. (Codex r1 P2 follow-up.) + int first_content_token_index() const { return first_content_token_index_; } + + // Total number of emit_token() calls observed so far. 
Used in + // tandem with first_content_token_index() to compute the + // content-token count without depending on the caller's own + // counter; the difference is the natural-close content suffix. + int emit_token_count() const { return emit_token_count_; } + private: // Format helpers std::string format_openai_delta(const json & delta, const char * finish = nullptr); @@ -104,6 +157,14 @@ class SseEmitter { // Strip leading tag from reasoning (ds4 pattern). bool checked_think_prefix_ = false; + // Track the index (in emit_token calls) at which CONTENT mode + // first started, and the total emit_token call count. Used by + // http_server to derive thinking/content token counts from the + // emitter's REASONING → CONTENT transition. See + // first_content_token_index() docs. + int first_content_token_index_ = -1; + int emit_token_count_ = 0; + // Stop sequences support std::vector stop_sequences_; size_t stop_holdback_ = 0; // max length of any stop sequence diff --git a/dflash/src/server/tool_memory.h b/dflash/src/server/tool_memory.h index 0ff92ac0e..5144e229f 100644 --- a/dflash/src/server/tool_memory.h +++ b/dflash/src/server/tool_memory.h @@ -31,6 +31,18 @@ class ToolMemory { bool disabled() const { return max_entries_ == 0 || max_bytes_ == 0; } + // Snapshot for /props. Two successive reads under the same thread, + // matching the Python implementation's "may tear by one entry" semantics. 
+ struct Stats { + size_t max_entries; + size_t max_bytes; + size_t current_entries; + size_t current_bytes; + }; + Stats stats() const { + return {max_entries_, max_bytes_, by_id_.size(), total_bytes_}; + } + private: struct Block { size_t size_bytes; diff --git a/dflash/test/test_server_unit.cpp b/dflash/test/test_server_unit.cpp index 0e116d0d4..c0cab6d5a 100644 --- a/dflash/test/test_server_unit.cpp +++ b/dflash/test/test_server_unit.cpp @@ -102,10 +102,9 @@ static json weather_tools() { }); } -static SseEmitter make_emitter(ApiFormat fmt, bool thinking = false, - json tools = json::array()) { +static SseEmitter make_emitter(ApiFormat fmt, json tools = json::array()) { return SseEmitter(fmt, "test_id_001", "test-model", 10, - tools, nullptr, thinking); + tools, nullptr); } // Concatenate all SSE chunks into a single string. @@ -319,10 +318,14 @@ static void test_parse_tool_allowed_filter() { static void test_emitter_reasoning_split_openai() { // Feed reasoning + content through emitter, verify split. - auto em = make_emitter(ApiFormat::OPENAI_CHAT, true); + // Model emits the opening as its first token (Qwen3.6 path + // — the streaming on_token lambda maps the special id to + // emit_token("")). + auto em = make_emitter(ApiFormat::OPENAI_CHAT); em.emit_start(); - // Feed reasoning tokens + // Open reasoning, feed reasoning tokens + em.emit_token(""); em.emit_token("Let me think about this..."); // Close thinking and start content em.emit_token(""); @@ -337,12 +340,106 @@ static void test_emitter_reasoning_split_openai() { TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); } +// SseEmitter::emit_token_count() / accumulated text accessors drive +// http_server's finish_details accounting on the natural-close path +// (model self-closes mid-stream). Each test feeds tokens +// one-per-call so the emit_token index is straightforward to reason +// about. 
+static void test_emitter_first_content_index_natural_close() { + // Emit reasoning tokens (with explicit open + + // close), then content tokens. The emit_token_count() reflects + // all delivered tokens; the reasoning/content split is also + // recoverable from accumulated_text / reasoning_text. + auto em = make_emitter(ApiFormat::OPENAI_CHAT); + em.emit_start(); + TEST_ASSERT(em.emit_token_count() == 0); + + em.emit_token(""); + em.emit_token("reasoning1"); + em.emit_token("reasoning2"); + em.emit_token("end"); + em.emit_token("content1"); + em.emit_token("content2"); + em.emit_finish(6); + + TEST_ASSERT(em.emit_token_count() == 6); + // Reasoning + content text both populated. + TEST_ASSERT(!em.reasoning_text().empty()); + TEST_ASSERT(em.accumulated_text().find("content1") != std::string::npos); + TEST_ASSERT(em.accumulated_text().find("content2") != std::string::npos); +} + +static void test_emitter_first_content_index_never_closed() { + // Model opens then emits reasoning only — never closes + // . All produced text lands in reasoning_text; visible + // accumulated_text stays empty. + auto em = make_emitter(ApiFormat::OPENAI_CHAT); + em.emit_start(); + + em.emit_token(""); + em.emit_token("reasoning never closes"); + em.emit_token("still thinking"); + em.emit_finish(3); + + TEST_ASSERT(em.emit_token_count() == 3); + TEST_ASSERT(em.reasoning_text().find("reasoning") != std::string::npos); + TEST_ASSERT(em.accumulated_text().empty()); +} + +static void test_emitter_first_content_index_content_only() { + // Non-thinking request: emitter starts in CONTENT mode, so the + // very first emit_token lands at index 0. 
+ auto em = make_emitter(ApiFormat::OPENAI_CHAT); + em.emit_start(); + em.emit_token("immediate content"); + em.emit_finish(1); + + TEST_ASSERT(em.first_content_token_index() == 0); + TEST_ASSERT(em.emit_token_count() == 1); +} + +static void test_emitter_first_content_index_qwen36_streaming_thinking() { + // Regression: when the chat template emits a leading `` token + // (Qwen3.6 thinking-enabled path, or gemma4 `<|channel>` → `` + // map) the emitter starts in CONTENT mode by default and naively + // captured first_content_token_index_=0 on the first emit_token + // call, before the state machine transitioned to REASONING. Result: + // finish_details.thinking_tokens misreported as 0 for any streamed- + // thinking response. Fix: detect the `` opener up-front and + // defer the fci capture until a true CONTENT-mode token arrives. + auto em = make_emitter(ApiFormat::OPENAI_CHAT); + em.emit_start(); + + // Mirror http_server's on_token mapping: the special id is + // forwarded as a standalone "" piece, followed by reasoning + // text, the closing "" piece, then the answer. + em.emit_token(""); + em.emit_token("reasoning step 1"); + em.emit_token("reasoning step 2"); + em.emit_token("\n"); + em.emit_token("answer text"); + em.emit_finish(5); + + // fci must point at the first true content token, NOT 0. + TEST_ASSERT(em.first_content_token_index() > 0); + // Reasoning text populated, leading stripped. + TEST_ASSERT(!em.reasoning_text().empty()); + TEST_ASSERT(em.reasoning_text().find("") == std::string::npos); + // Content text populated. + TEST_ASSERT(em.accumulated_text().find("answer") != std::string::npos); + // emit_token_count - fci should be the content-suffix size + // (>0 means at least one content-mode token was attributed). + TEST_ASSERT(em.emit_token_count() - em.first_content_token_index() > 0); +} + static void test_emitter_reasoning_strips_leading_think_tag() { - // When started_in_thinking=true, model may echo . 
- auto em = make_emitter(ApiFormat::OPENAI_CHAT, true); + // Model emits leading whitespace + as one token, then + // continues thinking. The leading--with-whitespace-prefix + // strip ensures the reasoning text doesn't contain the open tag. + auto em = make_emitter(ApiFormat::OPENAI_CHAT); em.emit_start(); - // Model echoes \n\n before actual reasoning + // Model emits \n\n before actual reasoning em.emit_token("\n\nActual reasoning here"); em.emit_token(""); em.emit_token("Content"); @@ -355,7 +452,7 @@ static void test_emitter_reasoning_strips_leading_think_tag() { } static void test_emitter_content_only_no_thinking() { - auto em = make_emitter(ApiFormat::OPENAI_CHAT, false); + auto em = make_emitter(ApiFormat::OPENAI_CHAT); em.emit_start(); em.emit_token("Hello, world!"); em.emit_finish(5); @@ -366,7 +463,7 @@ static void test_emitter_content_only_no_thinking() { static void test_emitter_tool_buffer_detection() { // When the emitter sees , it should buffer and parse tools. - auto em = make_emitter(ApiFormat::OPENAI_CHAT, false, weather_tools()); + auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); em.emit_start(); em.emit_token("\n" "\n" @@ -398,7 +495,7 @@ static void test_emitter_anthropic_tool_use_blocks() { {"properties", {{"city", {{"type", "string"}}}}}}} }); SseEmitter em(ApiFormat::ANTHROPIC, "req_id", "test-model", 10, - tools, nullptr, /*thinking=*/false); + tools, nullptr); (void)em.emit_start(); // Feed Qwen3 XML tool call in chunks so the holdback buffer flushes; // parser will detect .... 
@@ -423,7 +520,7 @@ static void test_emitter_anthropic_tool_use_blocks() { } static void test_emitter_bare_function_tool_buffer_detection() { - auto em = make_emitter(ApiFormat::OPENAI_CHAT, false, weather_tools()); + auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); em.emit_start(); em.emit_token("\n" "\n" @@ -442,7 +539,7 @@ static void test_emitter_bare_function_tool_buffer_detection() { } static void test_emitter_does_not_leak_malformed_tool_xml() { - auto em = make_emitter(ApiFormat::OPENAI_CHAT, false, weather_tools()); + auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); em.emit_start(); em.emit_token("Let me list files.\n\n"); em.emit_token("\n" @@ -459,7 +556,7 @@ static void test_emitter_does_not_leak_malformed_tool_xml() { } static void test_emitter_parses_tool_call_missing_outer_close() { - auto em = make_emitter(ApiFormat::OPENAI_CHAT, false, weather_tools()); + auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); em.emit_start(); em.emit_token("\n" "\n" @@ -480,7 +577,7 @@ static void test_emitter_parses_tool_call_missing_outer_close() { } static void test_emitter_no_tools_keeps_tool_like_text() { - auto em = make_emitter(ApiFormat::OPENAI_CHAT, false); + auto em = make_emitter(ApiFormat::OPENAI_CHAT); em.emit_start(); em.emit_token("\n" "ls\n" @@ -493,7 +590,7 @@ static void test_emitter_no_tools_keeps_tool_like_text() { static void test_emitter_anthropic_structure() { // Verify Anthropic format emits proper event sequence. 
- auto em = make_emitter(ApiFormat::ANTHROPIC, false); + auto em = make_emitter(ApiFormat::ANTHROPIC); auto start = em.emit_start(); std::string start_str = concat(start); @@ -517,7 +614,7 @@ static void test_emitter_anthropic_structure() { } static void test_emitter_responses_structure() { - auto em = make_emitter(ApiFormat::RESPONSES, false); + auto em = make_emitter(ApiFormat::RESPONSES); auto start = em.emit_start(); std::string start_str = concat(start); @@ -543,7 +640,7 @@ static void test_emitter_responses_bare_function_tool_call() { }} }}); SseEmitter em(ApiFormat::RESPONSES, "resp_test_001", "test-model", 10, - tools, nullptr, false); + tools, nullptr); em.emit_start(); em.emit_token("\n\n\n\ngit pull\n"); em.emit_token("\n\n"); @@ -561,7 +658,7 @@ static void test_emitter_responses_bare_function_tool_call() { } static void test_emitter_streaming_openai_has_done() { - auto em = make_emitter(ApiFormat::OPENAI_CHAT, false); + auto em = make_emitter(ApiFormat::OPENAI_CHAT); em.emit_start(); em.emit_token("Hello"); auto finish = em.emit_finish(3); @@ -572,7 +669,7 @@ static void test_emitter_streaming_openai_has_done() { static void test_emitter_nonstreaming_accumulates() { // Non-streaming: tokens fed through emitter, accumulated_text() has all content. 
- auto em = make_emitter(ApiFormat::OPENAI_CHAT, false); + auto em = make_emitter(ApiFormat::OPENAI_CHAT); em.emit_token("Hello "); em.emit_token("world"); em.emit_finish(5); @@ -582,20 +679,20 @@ static void test_emitter_nonstreaming_accumulates() { } static void test_emitter_anthropic_thinking_blocks() { - auto em = make_emitter(ApiFormat::ANTHROPIC, true); + auto em = make_emitter(ApiFormat::ANTHROPIC); auto start = em.emit_start(); std::string start_str = concat(start); - TEST_ASSERT(start_str.find("thinking") != std::string::npos); - - // Feed reasoning - em.emit_token("Reasoning about the problem at length here..."); - em.emit_token(""); - em.emit_token("The answer is clear now."); + // Model opens , emits reasoning, closes, emits content. + auto t1 = em.emit_token(""); + auto t2 = em.emit_token("Reasoning about the problem at length here..."); + auto t3 = em.emit_token(""); + auto t4 = em.emit_token("The answer is clear now."); auto finish = em.emit_finish(20); - std::string all = start_str + concat(finish); + std::string all = start_str + concat(t1) + concat(t2) + concat(t3) + + concat(t4) + concat(finish); - // Should have both thinking and text blocks + // Should have both thinking and text blocks somewhere in the stream TEST_ASSERT(all.find("thinking") != std::string::npos); TEST_ASSERT(!em.reasoning_text().empty()); TEST_ASSERT(!em.accumulated_text().empty()); @@ -605,15 +702,15 @@ static void test_emitter_anthropic_thinking_blocks() { // Stop sequences tests // ═══════════════════════════════════════════════════════════════════════ -static SseEmitter make_emitter_with_stops(ApiFormat fmt, bool thinking, +static SseEmitter make_emitter_with_stops(ApiFormat fmt, const std::vector & stops) { return SseEmitter(fmt, "test_id_001", "test-model", 10, - json::array(), nullptr, thinking, stops); + json::array(), nullptr, stops); } static void test_stop_sequence_basic() { // Stop sequence should truncate content at the match point. 
- auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT, false, {"STOP"}); + auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT,{"STOP"}); em.emit_token("Hello "); em.emit_token("world "); em.emit_token("STOP"); @@ -629,7 +726,7 @@ static void test_stop_sequence_basic() { static void test_stop_sequence_mid_token() { // Stop sequence may span multiple tokens due to holdback buffering. - auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT, false, {"END"}); + auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT,{"END"}); em.emit_token("Go "); em.emit_token("to the E"); em.emit_token("ND now"); @@ -643,7 +740,7 @@ static void test_stop_sequence_mid_token() { static void test_stop_sequence_multiple() { // Multiple stop sequences — earliest match wins. - auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT, false, {"AAA", "BB"}); + auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT,{"AAA", "BB"}); em.emit_token("xBBy"); TEST_ASSERT(em.stop_hit()); @@ -653,7 +750,7 @@ static void test_stop_sequence_multiple() { static void test_stop_sequence_no_match() { // No stop sequence hit — normal operation. - auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT, false, {"NOMATCH"}); + auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT,{"NOMATCH"}); em.emit_token("Hello world this is a long text"); em.emit_finish(10); @@ -663,7 +760,7 @@ static void test_stop_sequence_no_match() { static void test_stop_sequence_empty_list() { // Empty stop list — no effect. - auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT, false, {}); + auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT,{}); em.emit_token("Hello STOP world"); em.emit_finish(5); @@ -673,7 +770,7 @@ static void test_stop_sequence_empty_list() { static void test_stop_sequence_finish_reason() { // finish_reason should be "stop" when stop sequence hit. 
- auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT, false, {"END"}); + auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT,{"END"}); em.emit_token("content END more"); TEST_ASSERT(em.stop_hit()); @@ -683,7 +780,7 @@ static void test_stop_sequence_finish_reason() { static void test_stop_sequence_streaming_output() { // Streaming: verify the [DONE] is still emitted after stop. - auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT, false, {"HALT"}); + auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT,{"HALT"}); auto start = em.emit_start(); em.emit_token("some text HALT rest"); @@ -696,7 +793,7 @@ static void test_stop_sequence_streaming_output() { static void test_stop_sequence_anthropic_format() { // Anthropic format should emit end_turn stop_reason. - auto em = make_emitter_with_stops(ApiFormat::ANTHROPIC, false, {"DONE"}); + auto em = make_emitter_with_stops(ApiFormat::ANTHROPIC, {"DONE"}); em.emit_start(); em.emit_token("This is content DONE rest"); @@ -708,8 +805,10 @@ static void test_stop_sequence_anthropic_format() { } static void test_stop_sequence_in_reasoning_mode() { - // Stop sequence in reasoning mode should still stop. - auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT, true, {"CUTOFF"}); + // Stop sequence in reasoning mode should still stop. Model opens + // first to enter REASONING. + auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT, {"CUTOFF"}); + em.emit_token(""); em.emit_token("Thinking deeply about this CUTOFF answer"); TEST_ASSERT(em.stop_hit()); @@ -721,7 +820,7 @@ static void test_stop_sequence_in_reasoning_mode() { static void test_stop_sequence_holdback_extends() { // With a long stop sequence, holdback buffer should extend to prevent // emitting text that's part of a stop sequence. 
- auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT, false, + auto em = make_emitter_with_stops(ApiFormat::OPENAI_CHAT, {"LONGSTOPSEQUENCE"}); // Feed text token by token — the holdback should prevent premature emission em.emit_token("prefix "); @@ -1759,6 +1858,288 @@ static void test_sampler_needs_logit_processing() { TEST_ASSERT(!cfg.needs_logit_processing()); } +// ═══════════════════════════════════════════════════════════════════════ +// /props body shape tests (model-free) +// +// Verify build_props_body's new wholesale-sidecar `model_card` + new +// `budget_envelope` section per docs/specs/props-endpoint.md §4.9 / §4.X. +// ═══════════════════════════════════════════════════════════════════════ + +static ServerConfig make_props_config_with_sidecar(const json & sidecar) { + ServerConfig cfg; + cfg.arch = "qwen35"; + cfg.model_path = "/tmp/fake/model.gguf"; + cfg.model_card_source_label = "share/model_cards/qwen3.6-27b.json"; + cfg.model_card_json = sidecar; + cfg.default_max_tokens = 32768; + cfg.hard_limit_reply_budget = 512; + cfg.think_max_tokens = 32256; + cfg.effort_tiers.low = 4032; + cfg.effort_tiers.medium = 16128; + cfg.effort_tiers.high = 32256; + cfg.effort_tiers.x_high = 56832; + cfg.effort_tiers.max = 81408; + return cfg; +} + +static void test_props_model_card_wholesale_sidecar() { + // When a sidecar was loaded, /props.model_card should be the parsed + // sidecar JSON verbatim — *all* fields from the file, not just the + // five budget-derived ones from the pre-refactor shape. 
+ json sidecar = { + {"name", "Qwen3.6 27B"}, + {"source", "https://huggingface.co/Qwen/Qwen3.6-27B"}, + {"verified_at", "2026-05-23"}, + {"max_tokens", 32768}, + {"complex_problem_max_tokens", 81920}, + {"sampling", { + {"temperature", 1.0}, + {"top_p", 0.95}, + {"top_k", 20}, + }}, + {"reasoning_effort_tiers", { + {"low", 4032}, + {"medium", 16128}, + {"high", 32256}, + {"x-high", 56832}, + {"max", 81408}, + }}, + {"notes", "test card"}, + }; + ServerConfig cfg = make_props_config_with_sidecar(sidecar); + Tokenizer tok; + PrefixCache pc(0, tok); + ToolMemory tm; + json body = build_props_body(cfg, pc, tm); + + TEST_ASSERT(body.contains("model_card")); + TEST_ASSERT(!body["model_card"].is_null()); + // `source` is the upstream URL, NOT the filepath. The filepath label + // moved to budget_envelope.model_card_source post-refactor. + TEST_ASSERT(body["model_card"]["source"].get<std::string>() == + "https://huggingface.co/Qwen/Qwen3.6-27B"); + TEST_ASSERT(body["model_card"]["name"].get<std::string>() == "Qwen3.6 27B"); + TEST_ASSERT(body["model_card"]["max_tokens"].get<int>() == 32768); + TEST_ASSERT(body["model_card"]["complex_problem_max_tokens"].get<int>() == 81920); + TEST_ASSERT(body["model_card"].contains("sampling")); + TEST_ASSERT(body["model_card"].contains("reasoning_effort_tiers")); + TEST_ASSERT(body["model_card"]["notes"].get<std::string>() == "test card"); + // The pre-refactor `think_max_tokens` / `hard_limit_reply_budget` + // keys are NOT in the wholesale shape — they moved to budget_envelope. + TEST_ASSERT(!body["model_card"].contains("think_max_tokens")); + TEST_ASSERT(!body["model_card"].contains("hard_limit_reply_budget")); +} + +static void test_props_model_card_null_on_family_fallback() { + // When family or hard fallback was used (no sidecar), /props.model_card + // is JSON null. The budget_envelope still carries the resolved values. 
+ ServerConfig cfg; + cfg.arch = "qwen35"; + cfg.model_card_source_label = "family:qwen35"; + cfg.model_card_json = nullptr; // no sidecar parsed + cfg.default_max_tokens = 32768; + cfg.hard_limit_reply_budget = 512; + cfg.think_max_tokens = 32256; + Tokenizer tok; + PrefixCache pc(0, tok); + ToolMemory tm; + json body = build_props_body(cfg, pc, tm); + + TEST_ASSERT(body.contains("model_card")); + TEST_ASSERT(body["model_card"].is_null()); + // budget_envelope still present and carries the family-fallback label. + TEST_ASSERT(body.contains("budget_envelope")); + TEST_ASSERT(body["budget_envelope"]["model_card_source"].get<std::string>() == + "family:qwen35"); + TEST_ASSERT(body["budget_envelope"]["default_max_tokens"].get<int>() == 32768); +} + +static void test_props_budget_envelope_shape() { + // budget_envelope is always present with all five fields and the + // expected effort_tiers vocabulary (low|medium|high|x-high|max). + // Values mirror ServerConfig regardless of what the sidecar carried. + json sidecar = { + {"name", "Qwen3.6 27B"}, + {"source", "https://huggingface.co/Qwen/Qwen3.6-27B"}, + {"verified_at", "2026-05-23"}, + {"max_tokens", 32768}, + }; + ServerConfig cfg = make_props_config_with_sidecar(sidecar); + // Simulate CLI override: budget_envelope reflects the runtime value, + // which may diverge from the sidecar (here, 16000 != sidecar 32768). 
+ cfg.default_max_tokens = 16000; + cfg.hard_limit_reply_budget = 512; + cfg.think_max_tokens = 15488; + cfg.effort_tiers.low = 100; + cfg.effort_tiers.medium = 200; + cfg.effort_tiers.high = 300; + cfg.effort_tiers.x_high = 400; + cfg.effort_tiers.max = 500; + + Tokenizer tok; + PrefixCache pc(0, tok); + ToolMemory tm; + json body = build_props_body(cfg, pc, tm); + + TEST_ASSERT(body.contains("budget_envelope")); + const json & be = body["budget_envelope"]; + TEST_ASSERT(be["model_card_source"].get<std::string>() == + "share/model_cards/qwen3.6-27b.json"); + TEST_ASSERT(be["default_max_tokens"].get<int>() == 16000); + TEST_ASSERT(be["hard_limit_reply_budget"].get<int>() == 512); + TEST_ASSERT(be["think_max_tokens"].get<int>() == 15488); + TEST_ASSERT(be["effort_tiers"]["low"].get<int>() == 100); + TEST_ASSERT(be["effort_tiers"]["medium"].get<int>() == 200); + TEST_ASSERT(be["effort_tiers"]["high"].get<int>() == 300); + TEST_ASSERT(be["effort_tiers"]["x-high"].get<int>() == 400); + TEST_ASSERT(be["effort_tiers"]["max"].get<int>() == 500); + + // Sanity: budget_envelope can diverge from model_card.max_tokens + // (CLI override case). Verifies the two sections aren't a tautology. + TEST_ASSERT(body["model_card"]["max_tokens"].get<int>() == 32768); + TEST_ASSERT(be["default_max_tokens"].get<int>() == 16000); + + // Sanity: props_schema bumped to 2 (breaking change). + TEST_ASSERT(body["server"]["props_schema"].get<int>() == 2); +} + +// ─── /props.runtime captures full config (§4.16) ────────────────────── +// Snapshot/bench tooling reads /props.runtime wholesale into +// result.json.server_info; this test pins the field set so additions +// elsewhere don't accidentally drop a knob we depend on for forensics. 
+static void test_props_runtime_shape() { + ServerConfig cfg = make_props_config_with_sidecar(json{ + {"name", "Qwen3.6 27B"}, + {"source", "https://huggingface.co/Qwen/Qwen3.6-27B"}, + {"verified_at", "2026-05-23"}, + {"max_tokens", 32768}, + }); + cfg.runtime_backend = "cuda"; + cfg.fa_window = 2048; + cfg.kv_cache_k = "tq3_0"; + cfg.kv_cache_v = "tq3_0"; + cfg.lazy_draft = false; + cfg.target_sharding = false; + cfg.chunk = 512; + cfg.target_device = "auto:0"; + cfg.draft_device = "auto:0"; + + Tokenizer tok; + PrefixCache pc(0, tok); + ToolMemory tm; + json body = build_props_body(cfg, pc, tm); + + TEST_ASSERT(body.contains("runtime")); + const json & rt = body["runtime"]; + TEST_ASSERT(rt["backend"].get<std::string>() == "cuda"); + TEST_ASSERT(rt["fa_window"].get<int>() == 2048); + TEST_ASSERT(rt["kv_cache_k"].get<std::string>() == "tq3_0"); + TEST_ASSERT(rt["kv_cache_v"].get<std::string>() == "tq3_0"); + TEST_ASSERT(rt["lazy_draft"].get<bool>() == false); + TEST_ASSERT(rt["target_sharding"].get<bool>() == false); + TEST_ASSERT(rt["chunk"].get<int>() == 512); + TEST_ASSERT(rt["target_device"].get<std::string>() == "auto:0"); + TEST_ASSERT(rt["draft_device"].get<std::string>() == "auto:0"); + + // draft_device is null when no draft model is loaded. + cfg.draft_device.clear(); + body = build_props_body(cfg, pc, tm); + TEST_ASSERT(body["runtime"]["draft_device"].is_null()); +} + +// ═══════════════════════════════════════════════════════════════════════ +// usage.timings — per-request prefill / decode wall-clock breakdown +// surfaced under usage.timings (spec §6.3). Tests cover all three +// response shapes plus the zero-decode_s div-by-zero guard. +// ═══════════════════════════════════════════════════════════════════════ + +static void test_usage_timings_openai_chat_streaming() { + // OpenAI Chat streaming: the terminal usage chunk (just before + // data: [DONE]) carries `timings.{prefill_ms, decode_ms, + // decode_tokens_per_sec}` when timings are passed to emit_finish. 
+ auto em = make_emitter(ApiFormat::OPENAI_CHAT); + em.emit_start(); + em.emit_token("Hello world"); + + GenTimings t{0.2345, 2.4567}; // 234.5 ms / 2456.7 ms + auto finish = em.emit_finish(/*completion_tokens*/ 100, &t); + std::string finish_str = concat(finish); + + TEST_ASSERT(finish_str.find("\"timings\"") != std::string::npos); + TEST_ASSERT(finish_str.find("\"prefill_ms\":234.5") != std::string::npos); + TEST_ASSERT(finish_str.find("\"decode_ms\":2456.7") != std::string::npos); + // 100 / 2.4567 = 40.7050... → rounds to 40.7 + TEST_ASSERT(finish_str.find("\"decode_tokens_per_sec\":40.7") != std::string::npos); + TEST_ASSERT(finish_str.find("[DONE]") != std::string::npos); +} + +static void test_usage_timings_anthropic_streaming() { + // Anthropic streaming: message_delta.usage gains a `timings` + // sibling alongside `output_tokens`. + auto em = make_emitter(ApiFormat::ANTHROPIC); + em.emit_start(); + em.emit_token("ok"); + GenTimings t{0.05, 0.5}; // 50.0 ms / 500.0 ms + auto finish = em.emit_finish(/*completion_tokens*/ 10, &t); + std::string finish_str = concat(finish); + + TEST_ASSERT(finish_str.find("\"timings\"") != std::string::npos); + TEST_ASSERT(finish_str.find("\"prefill_ms\":50.0") != std::string::npos); + TEST_ASSERT(finish_str.find("\"decode_ms\":500.0") != std::string::npos); + // 10 / 0.5 = 20.0 + TEST_ASSERT(finish_str.find("\"decode_tokens_per_sec\":20.0") != std::string::npos); +} + +static void test_usage_timings_responses_streaming() { + // Responses streaming: response.completed.usage gains `timings`. 
+ auto em = make_emitter(ApiFormat::RESPONSES); + em.emit_start(); + em.emit_token("done"); + GenTimings t{0.1, 1.0}; + auto finish = em.emit_finish(/*completion_tokens*/ 25, &t); + std::string finish_str = concat(finish); + + TEST_ASSERT(finish_str.find("\"timings\"") != std::string::npos); + TEST_ASSERT(finish_str.find("\"prefill_ms\":100.0") != std::string::npos); + TEST_ASSERT(finish_str.find("\"decode_ms\":1000.0") != std::string::npos); + // 25 / 1.0 = 25.0 + TEST_ASSERT(finish_str.find("\"decode_tokens_per_sec\":25.0") != std::string::npos); +} + +static void test_usage_timings_zero_decode_no_div_by_zero() { + // decode_s == 0 (prefill-only / no tokens generated path): emit + // decode_tokens_per_sec = 0.0 without div-by-zero. + GenTimings t{0.123, 0.0}; + json j = build_timings_json(t, /*completion_tokens*/ 42); + TEST_ASSERT(j["prefill_ms"].get<double>() == 123.0); + TEST_ASSERT(j["decode_ms"].get<double>() == 0.0); + TEST_ASSERT(j["decode_tokens_per_sec"].get<double>() == 0.0); + + // Also exercise via OpenAI streaming path — finite JSON output, no NaN/Inf. + auto em = make_emitter(ApiFormat::OPENAI_CHAT); + em.emit_start(); + auto finish = em.emit_finish(/*completion_tokens*/ 0, &t); + std::string finish_str = concat(finish); + TEST_ASSERT(finish_str.find("\"decode_tokens_per_sec\":0.0") != std::string::npos); + // No NaN / Inf serialization leak. + TEST_ASSERT(finish_str.find("inf") == std::string::npos); + TEST_ASSERT(finish_str.find("nan") == std::string::npos); +} + +static void test_usage_timings_omitted_when_null() { + // Backward compat: emit_finish(n) (no timings) emits the legacy + // usage block — no `timings` key. Guards the SDK-facing default + // for callers that don't yet wire timings through. 
+ auto em = make_emitter(ApiFormat::OPENAI_CHAT); + em.emit_start(); + em.emit_token("x"); + auto finish = em.emit_finish(3); // no timings arg + std::string finish_str = concat(finish); + + TEST_ASSERT(finish_str.find("\"timings\"") == std::string::npos); + TEST_ASSERT(finish_str.find("[DONE]") != std::string::npos); +} + int main() { std::fprintf(stderr, "══════════════════════════════════════════\n"); std::fprintf(stderr, " Server Unit Tests\n"); @@ -1792,6 +2173,10 @@ int main() { std::fprintf(stderr, "\n── SSE Emitter ──\n"); RUN_TEST(test_emitter_reasoning_split_openai); + RUN_TEST(test_emitter_first_content_index_natural_close); + RUN_TEST(test_emitter_first_content_index_never_closed); + RUN_TEST(test_emitter_first_content_index_content_only); + RUN_TEST(test_emitter_first_content_index_qwen36_streaming_thinking); RUN_TEST(test_emitter_reasoning_strips_leading_think_tag); RUN_TEST(test_emitter_content_only_no_thinking); RUN_TEST(test_emitter_tool_buffer_detection); @@ -1885,6 +2270,19 @@ int main() { RUN_TEST(test_sampler_temp_zero_with_penalties_uses_argmax); RUN_TEST(test_sampler_needs_logit_processing); + std::fprintf(stderr, "\n── /props body shape ──\n"); + RUN_TEST(test_props_model_card_wholesale_sidecar); + RUN_TEST(test_props_model_card_null_on_family_fallback); + RUN_TEST(test_props_budget_envelope_shape); + RUN_TEST(test_props_runtime_shape); + + std::fprintf(stderr, "\n── usage.timings ──\n"); + RUN_TEST(test_usage_timings_openai_chat_streaming); + RUN_TEST(test_usage_timings_anthropic_streaming); + RUN_TEST(test_usage_timings_responses_streaming); + RUN_TEST(test_usage_timings_zero_decode_no_div_by_zero); + RUN_TEST(test_usage_timings_omitted_when_null); + std::fprintf(stderr, "\n══════════════════════════════════════════\n"); std::fprintf(stderr, " Results: %d assertions, %d failures\n", test_count, test_failures); diff --git a/docs/specs/model-cards.md b/docs/specs/model-cards.md new file mode 100644 index 000000000..d44980f6a --- /dev/null 
+++ b/docs/specs/model-cards.md @@ -0,0 +1,392 @@ +# Model card sidecars + +A design spec for the per-model JSON sidecars at `share/model_cards/`. +A sidecar carries values transcribed from the upstream model's +HuggingFace card (or equivalent) so the server can apply +model-appropriate defaults at startup without operator hand-tuning. + +This spec covers the file layout, name lookup, JSON schema, field +semantics, resolution order, validation, and authoring workflow. + +## 1. Background + +GGUF metadata is sparse on inference-time recommendations. A given +GGUF file reliably exposes: + +- `general.architecture` — the model family +- `general.name` — a human-readable identifier (varies by source) +- `tokenizer.*` — vocab + special-token ids +- A small handful of sampler hints (e.g. `general.sampling.top_p`) + +What it does **not** carry: recommended `max_tokens`, recommended +thinking budget, recommended temperature for benchmark workloads, +or any reasoning-specific knob. Those typically live in the +HuggingFace README + `generation_config.json`, which the GGUF +conversion drops. + +To run a model at its trained recommendations without forcing every +operator to memorize them, the server reads a sidecar JSON at +startup. The sidecar is the canonical record of "what does the +upstream card say about this model" and is the primary source of +inference-time defaults. + +Sidecars are **source-controlled**, **reviewable**, and **bundled +into the runtime image**. Adding a new model = adding a JSON file. +No recompile required. + +### Server runtime exposure + +Loaded sidecars are exposed wholesale at `/props.model_card` (1:1 +with the on-disk JSON; `null` when family or hard fallback was +used). The runtime-resolved budget knobs the server will actually +apply — which may differ from the authored card values due to CLI +overrides or `max_ctx`-based effort-tier clamping — appear at +`/props.budget_envelope`. 
See +[`docs/specs/props-endpoint.md`](props-endpoint.md) §4.2 and §4.10 +for the wire shape. + +## 2. File location and lookup + +Sidecars live at: + +``` +share/model_cards/<name>.json +``` + +The server resolves `<name>` from the loaded GGUF's `general.name` +metadata, normalized per these rules: + +1. Lowercase ASCII letters. +2. Replace spaces, tabs, and underscores (`_`) with `-`. +3. Strip any character not in `[a-z0-9.-]`. +4. Append `.json`. + +Examples: + +| GGUF `general.name` | Normalized filename | +|---|---| +| `Qwen3.6-27B` | `qwen3.6-27b.json` | +| `Qwen3.6 27B` | `qwen3.6-27b.json` | +| `Foo_Bar` | `foo-bar.json` | +| `Laguna-XS.2` | `laguna-xs.2.json` | +| `DeepSeek-V4-Flash` | `deepseek-v4-flash.json` | + +### Cards directory search path + +The server probes (in order, matching +`find_model_cards_dir` in `dflash/src/server/model_card.cpp`): + +1. `<explicit_root>/share/model_cards/` — an optional explicit + directory passed by the embedding application (e.g. tests). Not + exposed via a CLI flag today. +2. `<exe_dir>/../share/model_cards/` (install layout — binary in + `bin/`, sidecars in `share/`, resolved via `/proc/self/exe`). +3. `<exe_dir>/share/model_cards/` (build-tree layout — sidecars + shipped next to the binary). +4. `share/model_cards/` in the current working directory (dev runs). +5. `$DFLASH_MODEL_CARDS_DIR` if set (last-resort fallback / escape hatch; only reached when none of the above exist). + +The first directory that **exists** wins (the implementation does +*not* re-probe further candidates if the chosen directory lacks a +matching `<name>.json`). If none exist, or if the chosen directory +has no matching card, the server falls through to the per-family +fallback table (see §4) and reports the chosen source in the +startup banner. + +## 3. JSON Schema + +Sidecar files are validated against +[`share/model_cards/_schema.json`](../../share/model_cards/_schema.json) +(draft 2020-12). The schema is the authoritative shape; this +section is the human-readable summary. 
+ +### 3.1 Required fields + +| Field | Type | Purpose | +|---|---|---| +| `name` | string | Display name. Informational — the filename is what matters for lookup. | +| `source` | string (URI) | URL of the upstream card the values were transcribed from. | +| `verified_at` | string (ISO 8601 date, `YYYY-MM-DD`) | When the values were last checked against the source. | +| `max_tokens` | integer ≥ 1 | The card's standard recommended combined cap (reasoning + reply). Drives `default_max_tokens` when no CLI override is set. | + +### 3.2 Optional fields + +| Field | Type | Purpose | +|---|---|---| +| `download_urls` | object | Map of variant → URL (e.g. `{"Q4_K_M": "https://...", "bf16": "https://..."}`). For operator convenience; the server does not download automatically. | +| `complex_problem_max_tokens` | integer ≥ 1 | The card's recommendation for hard reasoning / benchmark workloads. Drives `x-high` and `max` effort tiers above the standard cap. Omit if the card has no separate complex-problem recommendation. | +| `hard_limit_reply_budget` | integer ≥ 256 | Tokens reserved post-`` for the visible answer phase. Default 512 (terse-style models, per `ds4_eval.c`). Bump for verbose models that restate work after force-close (Qwen3.6 ships 4096). Drives both `think_max_tokens = max_tokens − hard_limit_reply_budget` and the force-close trigger inside `do_ar_decode` / `do_spec_decode`. See `docs/specs/thinking-budget.md` §3.3 for the resolution chain and the failure mode this guards against. | +| `sampling` | object | Default sampler params. Used to fill values the request body did not specify. Allowed fields: `temperature` (float ≥ 0), `top_p` (float in 0..1), `top_k` (int ≥ 0), `min_p` (float in 0..1), `presence_penalty` (float), `repetition_penalty` (float). | +| `reasoning_effort_tiers` | object | Explicit phase-1 budgets per `reasoning.effort` tier. 
Keys: `low`, `medium`, `high`, `x-high`, `max` (all optional integers ≥ 1; the schema rejects `0`, and the resolver treats any value `≤ 0` as missing and substitutes the computed default per §5.4). Missing keys fall through to the computed defaults in `docs/specs/thinking-budget.md` §3.3. | +| `notes` | string | Free-form provenance / caveats. Useful when the card omits values and the sidecar author has to pick defaults from related models or domain knowledge. | + +### 3.3 Forbidden + +Root-level `additionalProperties` is `false`. Typos like +`reasoning_effort_tier` (singular) are caught at startup with a +warning. This keeps sidecars from accumulating undocumented fields +that look meaningful but aren't. + +## 4. Resolution order + +For each tunable, the server consults sources in this order. The +first source supplying a value wins: + +1. **Explicit CLI flag** (`--think-max-tokens N`, + `--default-max-tokens N`, `--reasoning-effort-high N`, etc.). +2. **Model card sidecar** (this file). +3. **Per-family fallback table**, built into the C++ server, keyed + on `general.architecture` (e.g. `qwen35`, `qwen36`, `qwen3`, + `gemma4`, `laguna`). A coarse safety net for known families when + no sidecar matches. Current family entries set `max_tokens` to + 32768 (qwen*, laguna) or 16384 (gemma4), with + `complex_problem_max_tokens=0`. +4. **Hard fallback**, matching `antirez/ds4 ds4_eval.c`'s reference + values: `max_tokens=16000`, `hard_limit_reply_budget=512`, + `think_max_tokens = max_tokens − hard_limit_reply_budget = 15488`. + These also match the `ServerConfig` defaults in + `dflash/src/server/http_server.h`. 
+ +The startup banner prints each tunable's value and which source +supplied it, e.g.: + +``` +[server] │ max_tokens = 32768 (share/model_cards/qwen3.6-27b.json) +[server] │ think_max_tokens= 32256 (share/model_cards/qwen3.6-27b.json) +[server] │ effort tiers = low=4032 (share/model_cards/qwen3.6-27b.json) +[server] │ medium=16128 (share/model_cards/qwen3.6-27b.json) +[server] │ ... +``` + +When a CLI flag overrides a sidecar value, the banner says +`(from CLI)` next to that line. When a value is derived (e.g. +`think_max_tokens` computed from `default_max_tokens` and +`hard_limit_reply_budget`), the banner notes the derivation. + +## 5. Field semantics + +### 5.1 `max_tokens` and `complex_problem_max_tokens` + +`max_tokens` is the card's *standard* recommended combined cap. +The server applies it as the default value for `default_max_tokens` +(used when a request omits `max_tokens`). + +`complex_problem_max_tokens`, if present, is the card's +recommendation for hard reasoning / benchmarking workloads. The +server does **not** raise `default_max_tokens` to this value — +operators who want to bench at this scale must pass it explicitly +(e.g. `--max-tokens 81920` on the bench request, or +`--default-max-tokens 81920` on the server). What the field +*does* drive is the upper end of the `reasoning.effort` tier +ladder (see §5.4). + +The effective phase-1 cap derived from these fields is +`think_max_tokens = max_tokens − hard_limit_reply_budget`, unless +overridden by `--think-max-tokens`. + +### 5.2 `sampling` + +Sampler defaults from the card. When a request omits a sampler +field, the server uses the sidecar value. When the request +specifies a field, the request wins (the sidecar is a default, not +a ceiling). + +The `sampling` object is forwarded directly into the model's +sampler stack; field names match the OpenAI / Anthropic wire +shape. + +There is no CLI override for individual sampling fields today. 
+Operators who need to force a sampler value should patch the +sidecar (and update `verified_at`) or set it on the request. + +### 5.3 `download_urls` + +A convenience map. Keys are variant names (`Q4_K_M`, `bf16`, etc.); +values are URLs the operator can `wget`. The server does not +download anything — this is purely documentation. If the model is +gated, the URL still appears in the sidecar; access control +happens at the HF / mirror layer. + +### 5.4 `reasoning_effort_tiers` + +When present, the sidecar's `reasoning_effort_tiers` object +provides explicit phase-1 budgets per OpenAI-Responses-style +`reasoning.effort` value. Keys are `low`, `medium`, `high`, +`x-high`, `max`. Each value is an integer ≥ 1 (the schema rejects `0`; see §3.2) representing the +phase-1 cap (in tokens) at that tier. + +Tiers are an extension to OpenAI Responses' 3-tier +(`low | medium | high`) vocabulary. The two extra tiers (`x-high`, +`max`) let clients opt in to the card's complex-problem budget when +the prompt warrants it. See `docs/specs/thinking-budget.md` §4.2. + +If the field is missing, the server computes defaults from +`max_tokens` + `complex_problem_max_tokens`: + +| Tier | Default formula | +|---|---| +| `low` | `round(think_max × 0.125)` | +| `medium` | `round(think_max × 0.5)` | +| `high` | `think_max` | +| `x-high` | `(think_max + complex_think_max) / 2` (integer division — truncates) | +| `max` | `complex_think_max` | + +Where `think_max = max_tokens − hard_limit_reply_budget` and +`complex_think_max = complex_problem_max_tokens − hard_limit_reply_budget` +(falling back to `think_max` when the card has no complex +recommendation, in which case `x-high` and `max` collapse to +`high`). + +Rounding note: `low` and `medium` use nearest-integer rounding +(`int(x + 0.5)`); `x-high` uses C++ integer division (truncation +toward zero). For odd or non-divisible `think_max` values this +produces deterministic but distinct off-by-one outcomes; see +`compute_default_tiers` in `dflash/src/server/model_card.cpp`. 
+ +The `reasoning_effort_tiers` field exists because the ratio-based +defaults don't fit every model. A smaller model that caps at 8192 +tokens has a very different tier curve from Qwen3.6's 32768/81920 +envelope, and the model card author can pick more useful values +than a global formula. + +Partial overrides are supported: a sidecar can supply only +`low` and `high`, leaving the others to the computed defaults. + +## 6. Validation + +At startup, after `resolve_model_card` selects a source: + +1. **JSON shape**: parsed with `nlohmann::json`. Malformed JSON → log + error, fall through to family fallback, **do not** fail-start. +2. **Required-field check**: the loader logs a per-field error + message for each missing required field (`name`, `source`, + `verified_at`, `max_tokens`) but does **not** abandon the + sidecar — fields it could parse are kept; missing fields take + their `ModelCard` struct defaults (e.g. `max_tokens` defaults to + `16000`, matching the hard fallback). Authoritative + required/optional checking belongs in CI via + `share/model_cards/_schema.json`, not at server start. Operators + who want strict validation should run the schema validator in + their deployment pipeline. +3. **Tier monotonicity**: tier values are clamped to be + non-decreasing in `low ≤ medium ≤ high ≤ x-high ≤ max` + (`enforce_tier_invariants`). Any violation is logged and clamped + up to the previous tier. +4. **Absolute tier ceiling**: each effort tier is clamped to + `max_ctx − hard_limit_reply_budget` once `max_ctx` is resolved + from the backend / CLI (server_main.cpp, after + `resolve_model_card`). Any violation is logged and clamped down. +5. **Banner**: print the resolved source label, max_tokens, + think_max_tokens, effort tiers, and per-tier provenance + (`from CLI` vs sidecar/family label). + +Validation never fails the server. The design priority is "operator +can always start the server even with a missing/bad sidecar." 
The +fallback chain (family table → hard fallback) guarantees the server +has working defaults regardless of sidecar quality. Stricter +checks (e.g. `complex_problem_max_tokens ≥ max_tokens`, +`additionalProperties: false` on the root) live in +`share/model_cards/_schema.json` and run at CI time so bad sidecars +don't land on `main`. + +## 7. Examples + +### 7.1 Reasoning-capable, with complex-problem recommendation + +[`share/model_cards/qwen3.6-27b.json`](../../share/model_cards/qwen3.6-27b.json): + +```json +{ + "name": "Qwen3.6 27B", + "source": "https://huggingface.co/Qwen/Qwen3.6-27B", + "verified_at": "2026-05-23", + "max_tokens": 32768, + "complex_problem_max_tokens": 81920, + "sampling": { + "temperature": 1.0, + "top_p": 0.95, + "top_k": 20, + "min_p": 0.0, + "presence_penalty": 0.0, + "repetition_penalty": 1.0 + }, + "reasoning_effort_tiers": { + "low": 4032, + "medium": 16128, + "high": 32256, + "x-high": 56832, + "max": 81408 + } +} +``` + +### 7.2 Non-reasoning, no complex-problem mode + +[`share/model_cards/laguna-xs.2.json`](../../share/model_cards/laguna-xs.2.json): + +```json +{ + "name": "Laguna-XS.2", + "source": "https://huggingface.co/Lucebox/Laguna-XS.2-GGUF", + "verified_at": "2026-05-24", + "download_urls": { + "Q4_K_M": "https://huggingface.co/Lucebox/Laguna-XS.2-GGUF/resolve/main/laguna-xs2-Q4_K_M.gguf", + "bf16": "https://huggingface.co/Lucebox/Laguna-XS.2-GGUF/resolve/main/laguna-xs2-bf16.gguf" + }, + "notes": "Non-reasoning MoE code model. Card omits sampler params; defaults below are code-model-typical.", + "max_tokens": 4096, + "sampling": { + "temperature": 0.6, + "top_p": 0.95, + "top_k": 50, + "min_p": 0.0, + "presence_penalty": 0.0, + "repetition_penalty": 1.0 + } +} +``` + +Note the absence of `complex_problem_max_tokens` and +`reasoning_effort_tiers` — the server computes the §5.4 default tiers: +`x-high` and `max` collapse to `high` = `think_max` (`3584`), while +`low` / `medium` scale to `448` / `1792`. That is fine for a non-reasoning +model since `reasoning.effort` is rarely sent against it. 
+ +## 8. Authoring a new sidecar + +1. Find the upstream model card (HuggingFace README + + `generation_config.json` if available). +2. Note the recommended `max_tokens` (or equivalent). Note any + separate recommendation for hard reasoning / benchmarking. +3. Author the JSON file in `share/model_cards/`. Set `source` to + the URL you used and `verified_at` to today's ISO date. +4. Validate against the schema: + ```bash + python -m jsonschema -i share/model_cards/<name>.json \ + share/model_cards/_schema.json + ``` +5. Open a PR. The CI check runs the schema validation against all + sidecars under `share/model_cards/`. +6. Once merged, the docker image build picks up the new sidecar + automatically. + +When the upstream card changes, bump `verified_at` and review the +fields. A stale `verified_at` (e.g. > 6 months old) is a hint that +the values might be out of date. + +## 9. Out of scope + +- **Automatic GGUF download.** `download_urls` is operator-facing + documentation; the server doesn't fetch. +- **Per-effort sampler overrides.** Sampling is a single object + applied to all effort tiers. There is no way to say "use temp 0.6 + for `effort=high` but 1.0 for `effort=low`." +- **Multi-model deployment.** The server loads one model at a time; + the sidecar resolution happens once at startup. There is no + per-request model picker. +- **Card aging / staleness alerts.** `verified_at` is informational + only; the server doesn't refuse a stale card. A dashboard or CI + job could surface stale entries — out of scope here. +- **Variants beyond GGUF.** The server only loads GGUF. Non-GGUF + variants (safetensors, MLX, etc.) get sidecars only if/when those + loaders ship. 
diff --git a/docs/specs/openapi-props.yaml b/docs/specs/openapi-props.yaml new file mode 100644 index 000000000..71e3c64c4 --- /dev/null +++ b/docs/specs/openapi-props.yaml @@ -0,0 +1,929 @@ +openapi: 3.1.0 + +info: + title: dflash_server /props endpoint + version: "2" + summary: Capability and configuration introspection for dflash_server. + description: | + `GET /props` returns enough JSON for a dashboard, a deployment + healthcheck, or a client SDK to know what this server can do + and how it's configured. See `docs/specs/props-endpoint.md` + for the human-readable spec, including schema-versioning rules. + + The integer reported as `server.props_schema` (and as the + trailing token of `build_info`) bumps when the response shape + changes in a backward-incompatible way. The current schema is + `2`. + + Schema `2` (breaking change vs. `1`): `model_card` is now the + wholesale on-disk sidecar JSON (or `null` when family / hard + fallback was used), validating against + `share/model_cards/_schema.json`. The runtime-resolved budget + knobs (`default_max_tokens`, `hard_limit_reply_budget`, + `think_max_tokens`, `effort_tiers`) moved out of `model_card` + into a new top-level `budget_envelope` section, alongside a + `model_card_source` lookup-hit label. + contact: + name: lucebox-hub + url: https://github.com/easel/lucebox-hub + +# The numeric value matches `server.props_schema` and the +# `props_schema=` token in `build_info`. Bumps on breaking +# response-shape changes; additive changes keep the same value. +x-props-schema: 2 + +servers: + - url: http://localhost:8080 + description: Default dflash_server bind. + +paths: + /props: + get: + operationId: getProps + summary: Server configuration and capability snapshot. 
+ description: | + Returns a JSON document describing the server's HTTP + endpoints, loaded model, model-card-derived budgets, + reasoning / speculative / tools capabilities, sampler + capability flags, prefix-cache state, full-cache state, + tool-replay state, and PFlash configuration. Never blocks + on the worker thread; the response is a snapshot of + atomic counters and config state. + tags: + - introspection + responses: + "200": + description: Capability / configuration snapshot. + content: + application/json: + schema: + $ref: "#/components/schemas/PropsResponse" + examples: + qwen36_27b_dflash: + summary: Qwen3.6-27B with DDTree speculative decode enabled. + value: + api: + endpoints: + - "GET /health" + - "GET /props" + - "GET /v1/models" + - "POST /v1/chat/completions" + - "POST /v1/messages" + - "POST /v1/messages/count_tokens" + - "POST /v1/responses" + budget_envelope: + model_card_source: "share/model_cards/qwen3.6-27b.json" + default_max_tokens: 32768 + hard_limit_reply_budget: 512 + think_max_tokens: 32256 + effort_tiers: + low: 4032 + medium: 16128 + high: 32256 + x-high: 56832 + max: 81408 + build_info: "luce-dflash v0.0.0+cpp props_schema=2" + capabilities: + reasoning_supported: true + speculative_supported: true + tools_supported: true + daemon: + alive: true + default_generation_settings: + min_p: 0.0 + n_ctx: 98304 + repeat_penalty: 1.0 + temperature: 0.0 + top_k: 0 + top_p: 1.0 + full_cache: + capacity: 0 + disk_bytes: 0 + enabled: false + in_use: 0 + lifetime_hits: 0 + model: + arch: "qwen35" + draft_path: "/.../dflash-draft-3.6-q8_0.gguf" + tokenizer_id: null + model_alias: "dflash" + model_card: + name: "Qwen3.6 27B" + source: "https://huggingface.co/Qwen/Qwen3.6-27B" + verified_at: "2026-05-23" + max_tokens: 32768 + complex_problem_max_tokens: 81920 + sampling: + temperature: 1.0 + top_p: 0.95 + top_k: 20 + min_p: 0.0 + presence_penalty: 0.0 + repetition_penalty: 1.0 + reasoning_effort_tiers: + low: 4032 + medium: 16128 + high: 32256 + 
x-high: 56832 + max: 81408 + model_path: "/.../Qwen3.6-27B-Q4_K_M.gguf" + pflash: + bsa_alpha: null + bsa_enabled: null + drafter_gguf: null + enabled: false + keep_ratio: null + lm_head_fix: null + mode: "off" + skip_park: null + threshold: null + prefix_cache: + capacity: 0 + in_use: 0 + lifetime_hits: 0 + reasoning: + default: null + supported: true + supported_efforts: + - "low" + - "medium" + - "high" + - "x-high" + - "max" + runtime: + backend: "cuda" + fa_window: 2048 + kv_cache_k: "q4_0" + kv_cache_v: "q4_0" + lazy_draft: false + target_sharding: false + chunk: 512 + target_device: "auto:0" + draft_device: "auto:0" + sampling: + capabilities: + supports_frequency_penalty: true + supports_seed: true + supports_temperature: true + supports_top_k: true + supports_top_p: true + server: + name: "luce-dflash" + props_schema: 2 + version: "0.0.0+cpp" + speculative: + enabled: true + ddtree_budget: 22 + speculative_mode: "dflash" + tool_replay: + current_bytes: 0 + current_entries: 0 + max_bytes: 67108864 + max_entries: 50000 + +components: + schemas: + + PropsResponse: + description: Top-level /props body. All fields are required. + type: object + required: + - api + - budget_envelope + - build_info + - capabilities + - daemon + - default_generation_settings + - full_cache + - model + - model_alias + - model_card + - model_path + - pflash + - prefix_cache + - reasoning + - runtime + - sampling + - server + - speculative + - speculative_mode + - tool_replay + additionalProperties: true + properties: + api: + $ref: "#/components/schemas/Api" + budget_envelope: + $ref: "#/components/schemas/BudgetEnvelope" + build_info: + type: string + description: | + Single-string identity: `<name> v<version> + props_schema=<N>`. Matches the structured `server` + object. Bumps `props_schema` on breaking changes. 
+ example: "luce-dflash v0.0.0+cpp props_schema=2" + capabilities: + $ref: "#/components/schemas/Capabilities" + daemon: + $ref: "#/components/schemas/Daemon" + default_generation_settings: + $ref: "#/components/schemas/DefaultGenerationSettings" + full_cache: + $ref: "#/components/schemas/FullCache" + model: + $ref: "#/components/schemas/Model" + model_alias: + type: string + description: | + The value clients should pass as the `model` field in + chat/responses requests. Defaults to `"dflash"`; + overridable with `--model-name`. + example: "dflash" + model_card: + oneOf: + - $ref: "#/components/schemas/ModelCard" + - type: "null" + description: | + Wholesale on-disk sidecar JSON when a sidecar matched + the loaded GGUF, or `null` when the server fell through + to a per-family or hard fallback (in which case the + resolved budget knobs still appear under + `budget_envelope`). + model_path: + type: string + description: Absolute filesystem path of the loaded target GGUF. + example: "/.../Qwen3.6-27B-Q4_K_M.gguf" + pflash: + $ref: "#/components/schemas/Pflash" + prefix_cache: + $ref: "#/components/schemas/PrefixCache" + reasoning: + $ref: "#/components/schemas/Reasoning" + runtime: + $ref: "#/components/schemas/Runtime" + sampling: + $ref: "#/components/schemas/Sampling" + server: + $ref: "#/components/schemas/Server" + speculative: + $ref: "#/components/schemas/Speculative" + speculative_mode: + type: string + description: | + Which speculative-decode path is currently active. + Derived: `"pflash"` when pflash is enabled, + `"dflash"` when DDTree is enabled, else `"off"`. + enum: ["off", "dflash", "pflash"] + example: "dflash" + tool_replay: + $ref: "#/components/schemas/ToolReplay" + + Api: + description: HTTP endpoint registry. + type: object + required: [endpoints] + properties: + endpoints: + type: array + description: | + Method-and-path strings for every route the server + actually handles. Unordered. 
New endpoints appear here + without bumping props_schema (additive change). + items: + type: string + example: + - "GET /health" + - "GET /props" + - "GET /v1/models" + - "POST /v1/chat/completions" + - "POST /v1/messages" + - "POST /v1/messages/count_tokens" + - "POST /v1/responses" + + Capabilities: + description: | + Coarse-grained boolean feature flags. Each corresponds to a + major client capability gated on the loaded model arch. + type: object + required: + - reasoning_supported + - speculative_supported + - tools_supported + properties: + reasoning_supported: + type: boolean + description: | + Server accepts `thinking:{type:"enabled"}` and + `reasoning:{effort:...}` request fields. If false, + those fields are silently ignored. + example: true + speculative_supported: + type: boolean + description: DDTree speculative decode is wired up for the loaded backend. + example: true + tools_supported: + type: boolean + description: | + Server accepts `tools` and `tool_choice` request + fields and emits `tool_calls` blocks. + example: true + + Daemon: + description: Daemon-thread liveness flag. + type: object + required: [alive] + properties: + alive: + type: boolean + description: | + True when the model backend is loaded and ready. + Healthchecks should treat false as a failure even + when HTTP returns 200. + example: true + + DefaultGenerationSettings: + description: | + Server-wide sampler defaults applied when a request omits + the corresponding field. Uses llama.cpp wire conventions + (`repeat_penalty`, not `repetition_penalty`). Does NOT + reflect the loaded model card's `sampling` block — those + are applied at request-parse time. + type: object + required: + - min_p + - n_ctx + - repeat_penalty + - temperature + - top_k + - top_p + properties: + min_p: + type: number + format: double + minimum: 0.0 + maximum: 1.0 + description: Sampler min-p threshold. 
+ example: 0.0 + n_ctx: + type: integer + minimum: 1 + description: Maximum prompt+output context length (`--max-ctx`). + example: 98304 + repeat_penalty: + type: number + format: double + description: llama.cpp-style repetition penalty. + example: 1.0 + temperature: + type: number + format: double + minimum: 0.0 + description: Sampler temperature default. + example: 0.0 + top_k: + type: integer + minimum: 0 + description: Sampler top-k cutoff. 0 = disabled. + example: 0 + top_p: + type: number + format: double + minimum: 0.0 + maximum: 1.0 + description: Sampler top-p cutoff. + example: 1.0 + + FullCache: + description: | + Full-prompt (disk-backed) cache state. Atomic-snapshot + counters; per-field tear-free but the set is not + internally consistent. Not safe for control flow. + type: object + required: + - capacity + - disk_bytes + - enabled + - in_use + - lifetime_hits + properties: + capacity: + type: integer + minimum: 0 + description: Configured slot capacity. 0 when disabled. + example: 0 + disk_bytes: + type: integer + format: int64 + minimum: 0 + description: Bytes currently used on disk. + example: 0 + enabled: + type: boolean + description: True when the disk-backed cache is configured. + example: false + in_use: + type: integer + minimum: 0 + description: Slots currently occupied. + example: 0 + lifetime_hits: + type: integer + format: int64 + minimum: 0 + description: Cumulative hit count since server start. + example: 0 + + Model: + description: Loaded model metadata. + type: object + required: + - arch + - draft_path + - tokenizer_id + properties: + arch: + type: string + description: | + Normalized `general.architecture` value from the loaded + GGUF (e.g. `qwen35`, `qwen36`, `gemma4`, `laguna`). + example: "qwen35" + draft_path: + type: ["string", "null"] + description: | + Filesystem path of the loaded speculative-decode draft + GGUF; `null` when no draft is loaded. 
+ example: "/.../dflash-draft-3.6-q8_0.gguf" + tokenizer_id: + type: ["string", "null"] + description: | + Best-effort tokenizer family hint from GGUF metadata + (e.g. `qwen3`). `null` when unknown. + example: null + + ModelCard: + description: | + Wholesale on-disk model-card sidecar JSON. Validates 1:1 + against `share/model_cards/_schema.json` — the same shape + the sidecar author wrote in `share/model_cards/.json`. + + Note: the `source` field here is the **upstream model-card + URL** (e.g. the HuggingFace card the sidecar was + transcribed from), NOT a filepath. The lookup-hit label + (which sidecar / family / hard-fallback the server matched + on) lives at `budget_envelope.model_card_source`. + + See `docs/specs/model-cards.md` for the sidecar format and + resolution order, and `docs/specs/props-endpoint.md` §4.10 + for how this section is wired up at /props. + type: object + additionalProperties: false + required: [name, source, verified_at, max_tokens] + properties: + name: + type: string + description: Display name (informational). + example: "Qwen3.6 27B" + source: + type: string + format: uri + description: URL of the upstream model card the values were transcribed from. + example: "https://huggingface.co/Qwen/Qwen3.6-27B" + verified_at: + type: string + format: date + description: ISO 8601 date the values were last checked against the source. + example: "2026-05-23" + max_tokens: + type: integer + minimum: 1 + description: | + Card's standard recommended combined cap. Drives + `budget_envelope.default_max_tokens` when no CLI + override is set. + example: 32768 + complex_problem_max_tokens: + type: integer + minimum: 1 + description: | + Card's recommendation for hard reasoning / + benchmark workloads. Drives `x-high` and `max` + effort tiers in `budget_envelope.effort_tiers`. + example: 81920 + sampling: + type: object + additionalProperties: false + description: Recommended sampler defaults from the card. 
+ properties: + temperature: { type: number } + top_p: { type: number } + top_k: { type: integer } + min_p: { type: number } + presence_penalty: { type: number } + repetition_penalty: { type: number } + reasoning_effort_tiers: + type: object + additionalProperties: false + description: | + Card-authored phase-1 budgets per + `reasoning.effort` tier. May differ from + `budget_envelope.effort_tiers` because the latter is + clamped to `max_ctx - hard_limit_reply_budget` at + startup. + properties: + low: { type: integer, minimum: 1 } + medium: { type: integer, minimum: 1 } + high: { type: integer, minimum: 1 } + x-high: { type: integer, minimum: 1 } + max: { type: integer, minimum: 1 } + download_urls: + type: object + description: Optional map of variant tag (e.g. `Q4_K_M`, `bf16`) to GGUF download URL. + additionalProperties: + type: string + format: uri + notes: + type: string + description: Free-form provenance / caveats. + + BudgetEnvelope: + description: | + Runtime-resolved budget knobs driving the thinking-budget + envelope. Always present, regardless of whether a sidecar + loaded. May differ from the authored `model_card` values + because of CLI overrides and `max_ctx`-based tier clamping + (spec §3.5). See props-endpoint.md §4.2. + type: object + required: + - default_max_tokens + - effort_tiers + - hard_limit_reply_budget + - model_card_source + - think_max_tokens + properties: + model_card_source: + type: string + description: | + Lookup-hit label identifying where the resolved values + came from: one of `share/model_cards/.json`, + `family:`, or `hard-fallback`. Always present; + matches the startup-banner value. + example: "share/model_cards/qwen3.6-27b.json" + default_max_tokens: + type: integer + minimum: 1 + description: | + Effective combined cap (reasoning + reply) applied when + a request omits `max_tokens`. May diverge from + `model_card.max_tokens` if the operator passed + `--max-tokens` / `--default-max-tokens`. 
+ example: 32768 + hard_limit_reply_budget: + type: integer + minimum: 0 + description: Effective reply-reserve ceiling in tokens. + example: 512 + think_max_tokens: + type: integer + minimum: 0 + description: | + Effective phase-1 (reasoning) ceiling. Derived as + `default_max_tokens - hard_limit_reply_budget` unless + `--think-max-tokens` overrides. + example: 32256 + effort_tiers: + $ref: "#/components/schemas/EffortTiers" + + EffortTiers: + description: | + Per-`reasoning.effort` phase-1 budgets. Five fields + (`low`, `medium`, `high`, `x-high`, `max`), monotone + non-decreasing, each clamped to `max_ctx - + hard_limit_reply_budget` at startup. Surfaced under + `budget_envelope.effort_tiers`. + type: object + required: [low, medium, high, x-high, max] + properties: + low: + type: integer + minimum: 0 + example: 4032 + medium: + type: integer + minimum: 0 + example: 16128 + high: + type: integer + minimum: 0 + example: 32256 + x-high: + type: integer + minimum: 0 + example: 56832 + max: + type: integer + minimum: 0 + example: 81408 + + Pflash: + description: | + PFlash (speculative prefill compression) state. When + `enabled = false` and/or `mode = "off"`, all other + configuration fields are `null`. + type: object + required: + - bsa_alpha + - bsa_enabled + - drafter_gguf + - enabled + - keep_ratio + - lm_head_fix + - mode + - skip_park + - threshold + properties: + bsa_alpha: + type: ["number", "null"] + format: double + description: BSA alpha tunable (env `DFLASH_FP_ALPHA`). + example: null + bsa_enabled: + type: ["boolean", "null"] + description: BSA toggle (env `DFLASH_FP_USE_BSA`). + example: null + drafter_gguf: + type: ["string", "null"] + description: Path to the compression drafter GGUF. + example: null + enabled: + type: boolean + description: True when `mode != "off"`. + example: false + keep_ratio: + type: ["number", "null"] + format: float + description: Fraction of tokens retained after compression. 
+ example: null + lm_head_fix: + type: ["boolean", "null"] + description: Backend-specific lm_head_fix tunable (env `DFLASH27B_LM_HEAD_FIX`). + example: null + mode: + type: string + enum: ["off", "auto", "always"] + description: Activation mode for PFlash. + example: "off" + skip_park: + type: ["boolean", "null"] + description: Whether to skip park/unpark (large-VRAM GPUs). + example: null + threshold: + type: ["integer", "null"] + minimum: 0 + description: Token-count threshold for AUTO mode. + example: null + + PrefixCache: + description: | + Inline prefix cache (system-prompt KV reuse). Atomic + non-strict snapshot; `capacity = 0` means the cache is + disabled. + type: object + required: [capacity, in_use, lifetime_hits] + properties: + capacity: + type: integer + minimum: 0 + description: Configured slot count. + example: 0 + in_use: + type: integer + minimum: 0 + description: Slots currently occupied. + example: 0 + lifetime_hits: + type: integer + format: int64 + minimum: 0 + description: Cumulative hit count since server start. + example: 0 + + Reasoning: + description: | + Reasoning / thinking-budget capability. See + docs/specs/thinking-budget.md §4 for per-tier semantics. + type: object + required: + - default + - supported + - supported_efforts + properties: + default: + type: ["string", "null"] + enum: [null, "low", "medium", "high", "x-high", "max"] + description: | + Effort tier the server applies when a request enables + thinking without specifying `effort`. `null` means + no default. + example: null + supported: + type: boolean + description: | + Server accepts `reasoning.effort` and + `thinking:{type:"enabled"}` request fields. + example: true + supported_efforts: + type: array + description: | + Effort tier values the server will recognize. + dflash_server lists all five; other servers may + list a subset. 
+ items: + type: string + enum: ["low", "medium", "high", "x-high", "max"] + example: ["low", "medium", "high", "x-high", "max"] + + Runtime: + description: | + Runtime knobs resolved at startup (compute backend, KV + cache dtypes, FA window, sharding, prefill chunk, device + placement). New fields appear here without bumping + props_schema. + type: object + required: + - backend + - fa_window + - kv_cache_k + - kv_cache_v + - lazy_draft + - target_sharding + - chunk + - target_device + - draft_device + properties: + backend: + type: string + enum: ["cuda", "hip", "cpu"] + description: Active compute backend. + example: "cuda" + fa_window: + type: integer + minimum: 0 + description: Sliding-window attention window in tokens. + example: 2048 + kv_cache_k: + type: string + description: | + Effective KV cache K-tensor dtype (`q4_0`, `tq3_0`, + `f16`, ...). Empty when not resolved. + example: "q4_0" + kv_cache_v: + type: string + description: Effective KV cache V-tensor dtype. + example: "q4_0" + lazy_draft: + type: boolean + description: True when the decode draft is parked when idle. + example: false + target_sharding: + type: boolean + description: True when the target model is sharded across multiple GPUs. + example: false + chunk: + type: integer + minimum: 0 + description: | + Prefill chunk size in tokens (`bargs.chunk`). Determines + how prompt tokens are batched into the target model + during prefill. Captured so bench/snapshot tooling has + a full record of the chunk size used. + example: 512 + target_device: + type: string + description: | + Resolved target-model device placement (e.g. `"auto:0"`, + `"cuda:0"`). Empty string when no resolution was made. + example: "auto:0" + draft_device: + type: + - string + - "null" + description: | + Resolved draft-model device placement, or `null` when no + draft model is loaded. + example: "auto:0" + + Sampling: + description: | + Sampler capability advertisement. 
Currently only nests + `capabilities`; future revisions may add a `defaults` + object alongside. + type: object + required: [capabilities] + properties: + capabilities: + $ref: "#/components/schemas/SamplingCapabilities" + + SamplingCapabilities: + description: | + Which sampler request fields the server honors. A false + value means the field is silently ignored. + type: object + required: + - supports_frequency_penalty + - supports_seed + - supports_temperature + - supports_top_k + - supports_top_p + properties: + supports_frequency_penalty: + type: boolean + example: true + supports_seed: + type: boolean + example: true + supports_temperature: + type: boolean + example: true + supports_top_k: + type: boolean + example: true + supports_top_p: + type: boolean + example: true + + Server: + description: | + Structured server identity. The `props_schema` integer + also appears in `build_info`. Bumps on breaking changes. + type: object + required: [name, props_schema, version] + properties: + name: + type: string + description: Server identity string. + example: "luce-dflash" + props_schema: + type: integer + minimum: 1 + description: | + Integer schema version. Bumps when the response shape + changes in a backward-incompatible way (see §5 of + props-endpoint.md). Current value is `2`. + example: 2 + version: + type: string + description: Build version string (semver + build tag). + example: "0.0.0+cpp" + + Speculative: + description: | + DDTree speculative-decode runtime state. Always emitted; + `ddtree_budget` is `null` when `enabled` is false. + type: object + required: [enabled, ddtree_budget] + properties: + enabled: + type: boolean + description: | + True when DDTree speculative-decode is active for + this server. + example: true + ddtree_budget: + type: ["integer", "null"] + minimum: 0 + description: Current DDTree budget (`--ddtree-budget`). + example: 22 + + ToolReplay: + description: | + In-memory tool-call replay cache state. 
Same atomic / + non-strict-snapshot semantics as the other cache + sections. + type: object + required: + - current_bytes + - current_entries + - max_bytes + - max_entries + properties: + current_bytes: + type: integer + format: int64 + minimum: 0 + description: Bytes currently occupied. + example: 0 + current_entries: + type: integer + minimum: 0 + description: Entries currently cached. + example: 0 + max_bytes: + type: integer + format: int64 + minimum: 0 + description: Configured byte budget. + example: 67108864 + max_entries: + type: integer + minimum: 0 + description: Configured entry budget. + example: 50000 diff --git a/docs/specs/props-endpoint.md b/docs/specs/props-endpoint.md new file mode 100644 index 000000000..b71a4fb73 --- /dev/null +++ b/docs/specs/props-endpoint.md @@ -0,0 +1,650 @@ +# `/props` endpoint + +A design spec for `dflash_server`'s `/props` capability-advertising +endpoint. `/props` is the operator-facing introspection surface: a +single GET that returns enough JSON for a dashboard, a deployment +healthcheck, or a client SDK to know what this server can do and +how it's configured. + +This spec covers the URL contract, response shape, per-field +semantics, schema versioning, and backward-compatibility rules. + +## 1. Background + +There is no industry-standard "what can this LLM server do?" +endpoint. OpenAI exposes `/v1/models` for the model list; Anthropic +exposes nothing equivalent; llama.cpp has `/props` historically as +a server-state snapshot. 
dflash_server's `/props` extends that +tradition with structured capability advertising for: + +- The set of HTTP endpoints exposed +- Reasoning / thinking-budget capability (which effort tiers, what + budgets) +- Speculative-decode + tool-calling capability +- The loaded model card and its derived settings +- Prefix cache + full cache occupancy +- PFlash (speculative prefill compression) state +- Default sampler params and context length + +The intent is "everything an operator needs to verify a deployment +is configured correctly without having to read the startup banner +or grep the process listing." + +## 2. Request + +``` +GET /props HTTP/1.1 +``` + +| Aspect | Detail | +|---|---| +| Method | `GET` | +| Path | `/props` | +| Auth | None. Same posture as `/health` — accessible without bearer token so deployment probes work. | +| Caching | Response is generated per-request and is not cacheable. Fields may change between requests (e.g. `prefix_cache.in_use`, `full_cache.lifetime_hits`). | +| CORS | Allowed by default. Operators can disable with `--no-cors`. | + +`/props` never blocks on the worker thread. It reads atomic +counters and config snapshots only. A long-running generate +request will not delay a `/props` response. + +## 3. Response + +`Content-Type: application/json`. 
Top-level structure: + +```json +{ + "api": { "endpoints": [ … ] }, + "budget_envelope": { … }, + "build_info": "luce-dflash v<version> props_schema=<n>", + "capabilities": { … }, + "daemon": { "alive": true }, + "default_generation_settings": { … }, + "full_cache": { … }, + "model": { … }, + "model_alias": "<string>", + "model_card": { … } | null, + "model_path": "<string>", + "pflash": { … }, + "prefix_cache": { … }, + "reasoning": { … }, + "runtime": { … }, + "sampling": { "capabilities": { … } }, + "server": { "name": "<string>", "version": "<string>", "props_schema": <int> }, + "speculative": { "enabled": <bool>, "ddtree_budget": <int> | null }, + "speculative_mode": "off" | "dflash" | "pflash", + "tool_replay": { … } +} +``` + +All top-level keys are required and always emitted by +`build_props_body`. Optional nested fields may be `null` when the +corresponding feature is disabled (e.g. `pflash.threshold` when +`pflash.enabled = false`; `speculative.ddtree_budget` when +`speculative.enabled = false`). `model_card` itself is `null` when +the server fell through to family or hard fallback (no sidecar +matched). + +## 4. Per-section field semantics + +### 4.1 `api` + +```json +"api": { + "endpoints": [ + "GET /health", + "GET /props", + "GET /v1/models", + "POST /v1/chat/completions", + "POST /v1/messages", + "POST /v1/messages/count_tokens", + "POST /v1/responses" + ] +} +``` + +`endpoints` is an unordered list of method-and-path strings the +server actually handles. Extension points (e.g. tool-call +endpoints, embedding endpoints) appear here when implemented. + +### 4.2 `budget_envelope` + +```json +"budget_envelope": { + "model_card_source": "share/model_cards/qwen3.6-27b.json", + "default_max_tokens": 32768, + "hard_limit_reply_budget": 512, + "think_max_tokens": 32256, + "effort_tiers": { + "low": 4032, + "medium": 16128, + "high": 32256, + "x-high": 56832, + "max": 81408 + } +} +``` + +The runtime-resolved budget knobs driving the thinking-budget +envelope. 
These are always present, even when no sidecar was loaded +(see §4.10 `model_card`). They may differ from the authored card +values because of CLI overrides (`--default-max-tokens`, +`--think-max-tokens`, `--reasoning-effort-<tier>`) and the +absolute-tier ceiling clamping (spec §3.5). + +- `model_card_source` — string, always present. The lookup hit that + produced these values: one of `share/model_cards/<name>.json`, + `family:<family>`, or `hard-fallback`. Matches the startup banner. +- `default_max_tokens` — effective combined cap (reasoning + reply) + applied when a request omits `max_tokens`. May diverge from + `model_card.max_tokens` if the operator passed `--max-tokens` or + `--default-max-tokens`. +- `hard_limit_reply_budget` — effective reply-reserve ceiling in + tokens. +- `think_max_tokens` — effective phase-1 (reasoning) ceiling. + Derived as `default_max_tokens − hard_limit_reply_budget` unless + `--think-max-tokens` overrides. +- `effort_tiers` — phase-1 budgets per `reasoning.effort` tier. + Resolved from the card's `reasoning_effort_tiers` (if present), + then computed from `max_tokens` / `complex_problem_max_tokens` per + spec §3.3, then clamped to `max_ctx − hard_limit_reply_budget` + per §3.5. May differ from `model_card.reasoning_effort_tiers` + because of that clamp. + +`budget_envelope` is the source of truth for what the server will +actually do with a request; `model_card` (§4.10) is the source of +truth for what the authored card says. + +### 4.3 `build_info` + +``` +"build_info": "luce-dflash v0.0.0+cpp props_schema=2" +``` + +A single string carrying: server name, build version, and the +**`props_schema` version**. Schema version bumps when the response +shape changes in a non-backward-compatible way (see §5). + +### 4.4 `capabilities` + +```json +"capabilities": { + "reasoning_supported": true, + "speculative_supported": true, + "tools_supported": true +} +``` + +Coarse-grained boolean feature flags. 
Each corresponds to a major +client capability: + +- `reasoning_supported` — server accepts `thinking:{type:"enabled"}` + and `reasoning:{effort:...}` request fields. If `false`, those + fields are silently ignored. +- `speculative_supported` — DDTree speculative decode is wired up + for the loaded backend. If `false`, requests run pure AR. +- `tools_supported` — server accepts `tools` and `tool_choice` + fields and emits `tool_calls` blocks. If `false`, those fields + are ignored. + +### 4.5 `daemon` + +```json +"daemon": { "alive": true } +``` + +`alive` is `true` if the model backend is loaded and ready. A +`false` value indicates the server is up but the daemon thread +crashed or is restarting. Healthchecks should treat `false` as a +failure even if HTTP returns 200. + +### 4.6 `default_generation_settings` + +```json +"default_generation_settings": { + "min_p": 0.0, + "n_ctx": 98304, + "repeat_penalty": 1.0, + "temperature": 0.0, + "top_k": 0, + "top_p": 1.0 +} +``` + +The default sampler values the server applies when a request omits +the corresponding field. `n_ctx` is the maximum prompt+output +context length (= `--max-ctx`). + +Field names use llama.cpp conventions (`repeat_penalty`, not +`repetition_penalty`) for compatibility with `/props` consumers +written against llama-server. + +These values are the server's hard-coded sampler defaults +(`temperature=0.0`, `top_p=1.0`, `top_k=0`, `min_p=0.0`, +`repeat_penalty=1.0`) and **do not** reflect the loaded model +card's `sampling` block. The model card's sampler defaults are +applied at request-parse time when a request omits a sampler +field; `/props` only carries the server-wide knobs. To see the +card's sampler values, read the sidecar JSON or the startup +banner. + +### 4.7 `full_cache` + +```json +"full_cache": { + "capacity": 0, + "disk_bytes": 0, + "enabled": false, + "in_use": 0, + "lifetime_hits": 0 +} +``` + +The full-prompt cache (disk-backed). 
When `enabled = false`, +`capacity`, `in_use`, `lifetime_hits`, and `disk_bytes` are all +zero and ignored. + +Counters use atomic loads (`std::memory_order_relaxed`); the +snapshot is tear-free per field but the set of fields is **not +internally consistent** — e.g. `in_use` and `lifetime_hits` may +correspond to slightly different points in wall time. Acceptable +for an introspection report; not safe for control-flow decisions. + +### 4.8 `model` + +```json +"model": { + "arch": "qwen35", + "draft_path": "/path/to/draft.gguf" | null, + "tokenizer_id": "qwen3" | null +} +``` + +`arch` is the `general.architecture` value from the loaded GGUF, +normalized. `draft_path` is the speculative-decode draft model +path, or `null` when no draft is loaded. `tokenizer_id` is a +best-effort tokenizer family hint from GGUF metadata. + +### 4.9 `model_alias` and `model_path` + +`model_alias` is the value clients should pass as the `model` field +in chat/responses requests (defaults to `"dflash"`; override with +`--model-name`). + +`model_path` is the absolute filesystem path of the loaded target +GGUF. Useful for "which weights is this server actually serving" +checks. + +### 4.10 `model_card` + +```json +"model_card": { + "name": "Qwen3.6 27B", + "source": "https://huggingface.co/Qwen/Qwen3.6-27B", + "verified_at": "2026-05-23", + "max_tokens": 32768, + "complex_problem_max_tokens": 81920, + "sampling": { + "temperature": 1.0, + "top_p": 0.95, + "top_k": 20, + "min_p": 0.0, + "presence_penalty": 0.0, + "repetition_penalty": 1.0 + }, + "reasoning_effort_tiers": { + "low": 4032, + "medium": 16128, + "high": 32256, + "x-high": 56832, + "max": 81408 + } +} +``` + +The on-disk model-card sidecar that was loaded for this run, +emitted **verbatim** (1:1 with the JSON in +`share/model_cards/.json`). 
The shape validates against +[`share/model_cards/_schema.json`](../../share/model_cards/_schema.json) — +all of `name`, `source`, `verified_at`, `max_tokens`, +`complex_problem_max_tokens`, `sampling`, `reasoning_effort_tiers`, +`download_urls`, and `notes` appear here exactly as the sidecar +author wrote them. + +`model_card` is `null` when the server fell through to a per-family +fallback or the hard fallback (no `share/model_cards/<name>.json` +matched the loaded GGUF's `general.name`). In that case the resolved +budget knobs still appear under `budget_envelope` (§4.2), tagged +with `model_card_source` = `"family:<family>"` or `"hard-fallback"`. + +The `source` field inside `model_card` is the **upstream model-card +URL** (e.g. the HuggingFace card the sidecar was transcribed from), +NOT the filepath the server loaded. The filepath / lookup-hit label +lives at `budget_envelope.model_card_source` (§4.2). This split +keeps the on-disk sidecar contract pure (authored JSON, schema-validated) +and the runtime-resolution metadata in its own section. + +For the runtime-resolved budget values (`default_max_tokens`, +`think_max_tokens`, effort tiers post-clamp) the server will actually +apply, see `budget_envelope` (§4.2). + +See [`docs/specs/model-cards.md`](model-cards.md) for the sidecar +format and resolution order. + +### 4.11 `pflash` + +```json +"pflash": { + "bsa_alpha": null, + "bsa_enabled": null, + "drafter_gguf": null, + "enabled": false, + "keep_ratio": null, + "lm_head_fix": null, + "mode": "off", + "skip_park": null, + "threshold": null +} +``` + +PFlash (speculative prefill compression) state. When `enabled = +false` and/or `mode = "off"`, all other fields are `null`. 
When +enabled, fields carry the runtime configuration: + +- `mode` — `"off" | "auto" | "always"` +- `threshold` — token-count threshold for AUTO mode +- `keep_ratio` — fraction of tokens retained after compression +- `drafter_gguf` — path to the compression drafter GGUF +- `skip_park` — whether to skip park/unpark (large-VRAM GPUs) +- `bsa_enabled` / `bsa_alpha` / `lm_head_fix` — backend-specific + PFlash tunables + +### 4.12 `prefix_cache` + +```json +"prefix_cache": { + "capacity": 0, + "in_use": 0, + "lifetime_hits": 0 +} +``` + +The inline prefix cache (system-prompt KV reuse). Same atomic / +non-strictly-consistent semantics as `full_cache` (§4.7). +`capacity = 0` means the cache is disabled. + +### 4.13 `reasoning` + +```json +"reasoning": { + "default": null | "low" | "medium" | "high" | "x-high" | "max", + "supported": true, + "supported_efforts": ["low", "medium", "high", "x-high", "max"] +} +``` + +Reasoning capability: + +- `supported` — does the server accept `reasoning.effort` and + `thinking:{type:"enabled"}` request fields. When `false`, those + fields are silently ignored (and the rest of this section can be + ignored). +- `supported_efforts` — the full set of effort tier values the + server will recognize. dflash_server always lists all five + (`low`, `medium`, `high`, `x-high`, `max`); other servers may + list a subset. +- `default` — when set, the effort tier the server will apply if + a request enables thinking without specifying `effort`. `null` + means no default (request must specify). + +See [`docs/specs/thinking-budget.md`](thinking-budget.md) §4 for +the per-tier semantics. + +### 4.14 `sampling.capabilities` + +```json +"sampling": { + "capabilities": { + "supports_frequency_penalty": true, + "supports_seed": true, + "supports_temperature": true, + "supports_top_k": true, + "supports_top_p": true + } +} +``` + +Which sampler fields the server honors on requests. A `false` +value means the field is silently ignored in request bodies. 
+Clients can use this to skip sending unsupported fields and avoid +confusion when behavior doesn't match the request. + +The `sampling` object intentionally nests `capabilities` for +future expansion (e.g. `sampling.defaults`, though that lives in +§4.6 today). + +### 4.15 `speculative` + +```json +"speculative": { + "enabled": true, + "ddtree_budget": 22 +} +``` + +Speculative-decode runtime state. This section is always emitted +(see §3), even when `capabilities.speculative_supported = false`. + +- `enabled` — `true` when DDTree speculative decode is active for + this server. +- `ddtree_budget` — current DDTree budget (= `--ddtree-budget`); + `null` when `enabled = false`. + +Future fields: `accept_rate`, `lookahead_depth`, `draft_model_id` +— added as the speculative-decode surface grows. + +### 4.16 `runtime` + +```json +"runtime": { + "backend": "cuda", + "fa_window": 2048, + "kv_cache_k": "q4_0", + "kv_cache_v": "q4_0", + "lazy_draft": false, + "target_sharding": false, + "chunk": 512, + "target_device": "auto:0", + "draft_device": "auto:0" +} +``` + +Runtime knobs resolved at startup. These reflect the effective +configuration the server is running with — CLI overrides, model- +card-driven defaults, and binary fallback defaults are all +collapsed into one snapshot. Bench/snapshot tooling reads this +wholesale into `result.json.server_info` so post-hoc forensics on +configuration drift between runs is possible. + +- `backend` — active compute backend: `"cuda" | "hip" | "cpu"`. +- `fa_window` — sliding-window attention window in tokens. +- `kv_cache_k` / `kv_cache_v` — effective KV cache dtypes (e.g. + `"q4_0"`, `"tq3_0"`, `"f16"`). Operator's CLI choice when set, + otherwise the binary's auto-default (`tq3_0` when + `max_ctx > 6144`, else `q4_0`, on CUDA). +- `lazy_draft` — whether the decode draft is parked when idle. +- `target_sharding` — true when the target model is layer-split + across multiple GPUs. +- `chunk` — prefill chunk size in tokens. Determines how prompt + tokens are batched into the target model during prefill. +- `target_device` — resolved target-model device placement string + (e.g. 
`"auto:0"`, `"cuda:0"`). +- `draft_device` — resolved draft-model device placement, or + `null` when no draft model is loaded. + +## 5. Schema versioning + +`build_info` includes `props_schema=<n>`. The integer `n` bumps +when the response shape changes in a way that breaks existing +clients. The current schema is `2`. + +### 5.0 Changelog + +- **`2`** — `model_card` is now the wholesale on-disk sidecar JSON + (or `null` when family/hard fallback was used). Runtime-resolved + budget knobs that used to live under `model_card` + (`hard_limit_reply_budget`, `think_max_tokens`, `effort_tiers`, + effective `max_tokens`) moved to a new top-level `budget_envelope` + section. The `source` field inside `model_card` is now the + upstream URL from the sidecar; the lookup-hit filepath / label + lives at `budget_envelope.model_card_source`. +- **`1`** — Initial schema. + +### 5.1 Non-breaking changes (no version bump) + +- Adding a new top-level section or a new field inside an existing + section. +- Adding a new entry to `api.endpoints` or `reasoning.supported_efforts`. +- Loosening field bounds (e.g. extending an enum's allowed values). + +Clients are required to ignore unknown fields. The schema version +does not bump for additive changes. + +### 5.2 Breaking changes (bump `props_schema`) + +- Removing a field or section. +- Renaming a field. +- Changing a field's type. +- Tightening field bounds in a way that may invalidate previously + valid values. + +After a bump, the server may continue to emit the old shape under a +compat flag for one minor version; consult the changelog when the +version increments. + +## 6. 
Example: full response + +```json +{ + "api": { + "endpoints": [ + "GET /health", + "GET /props", + "GET /v1/models", + "POST /v1/chat/completions", + "POST /v1/messages", + "POST /v1/messages/count_tokens", + "POST /v1/responses" + ] + }, + "budget_envelope": { + "model_card_source": "share/model_cards/qwen3.6-27b.json", + "default_max_tokens": 32768, + "hard_limit_reply_budget": 512, + "think_max_tokens": 32256, + "effort_tiers": { + "low": 4032, + "medium": 16128, + "high": 32256, + "x-high": 56832, + "max": 81408 + } + }, + "build_info": "luce-dflash v0.0.0+cpp props_schema=2", + "capabilities": { + "reasoning_supported": true, + "speculative_supported": true, + "tools_supported": true + }, + "daemon": { "alive": true }, + "default_generation_settings": { + "min_p": 0.0, + "n_ctx": 98304, + "repeat_penalty": 1.0, + "temperature": 1.0, + "top_k": 20, + "top_p": 0.95 + }, + "full_cache": { + "capacity": 0, + "disk_bytes": 0, + "enabled": false, + "in_use": 0, + "lifetime_hits": 0 + }, + "model": { + "arch": "qwen35", + "draft_path": "/.../dflash-draft-3.6-q8_0.gguf", + "tokenizer_id": "qwen3" + }, + "model_alias": "dflash", + "model_card": { + "name": "Qwen3.6 27B", + "source": "https://huggingface.co/Qwen/Qwen3.6-27B", + "verified_at": "2026-05-23", + "max_tokens": 32768, + "complex_problem_max_tokens": 81920, + "sampling": { + "temperature": 1.0, + "top_p": 0.95, + "top_k": 20, + "min_p": 0.0, + "presence_penalty": 0.0, + "repetition_penalty": 1.0 + }, + "reasoning_effort_tiers": { + "low": 4032, + "medium": 16128, + "high": 32256, + "x-high": 56832, + "max": 81408 + } + }, + "model_path": "/.../Qwen3.6-27B-Q4_K_M.gguf", + "pflash": { + "bsa_alpha": null, + "bsa_enabled": null, + "drafter_gguf": null, + "enabled": false, + "keep_ratio": null, + "lm_head_fix": null, + "mode": "off", + "skip_park": null, + "threshold": null + }, + "prefix_cache": { + "capacity": 0, + "in_use": 0, + "lifetime_hits": 0 + }, + "reasoning": { + "default": null, + "supported": true, + 
"supported_efforts": ["low", "medium", "high", "x-high", "max"] + }, + "sampling": { + "capabilities": { + "supports_frequency_penalty": true, + "supports_seed": true, + "supports_temperature": true, + "supports_top_k": true, + "supports_top_p": true + } + }, + "speculative": { + "ddtree_budget": 22 + } +} +``` + +## 7. Out of scope + +- **Authentication.** Same posture as `/health`: open. A future + version may add an opt-in auth requirement, but that's a separate + spec. +- **Per-client capability negotiation.** `/props` is a static + advertise; there is no `GET /props?for_client=X` form. +- **Server-pushed updates.** `/props` is request-response. There is + no streaming variant. +- **Runtime metrics** (tokens/sec, accept rate, queue depth). + Those belong in a dedicated `/metrics` endpoint (Prometheus + format), not in `/props`. The hits/usage counters here are + *configuration* state, not *performance* metrics. +- **Multi-model.** The server loads one model; `/props` describes + it. A multi-model server is out of scope for this design. diff --git a/docs/specs/thinking-budget.md b/docs/specs/thinking-budget.md new file mode 100644 index 000000000..bd4c6735f --- /dev/null +++ b/docs/specs/thinking-budget.md @@ -0,0 +1,577 @@ +# Thinking budget — separate think vs reply token caps + +A design spec for `dflash_server`'s handling of "thinking" requests: +prompts where the model is expected to produce an internal reasoning +trace before its visible reply. The spec covers the request opt-in +(including per-request budget controls), the configuration surface, +the two close strategies (Level 1 and Level 2), the multi-dialect +response shape, and the close-kind taxonomy. + +## 1. Background + +A reasoning-capable model wraps its internal scratch work in a +delimited block — by convention `` for Qwen-family +chat templates, and equivalent tags for other architectures. The +text inside is the **reasoning trace**; the text after `` is +the **visible reply**. 
+ +A single combined token cap (`max_tokens` on the wire) is not enough +to control these requests: + +- On hard reasoning prompts the model can spend its entire budget + inside the `<think>` block and never emit `</think>`. The response + arrives with no parseable answer. +- Even when the model does close `<think>` on its own, a tight cap + can leave it with no remaining tokens to write the actual answer. + +We need two independent caps — one on reasoning length and one on +the combined output — plus a server-side mechanism that *forces* +the model out of `<think>` if the reasoning cap is reached without +the model self-closing. That contract is the **thinking budget**. + +## 2. Terminology + +- **Phase 1 — reasoning.** Generation between the opening `<think>` + and the model's `</think>`. Output is reasoning text. +- **Phase 2 — content.** Generation after `</think>`. Output is the + visible reply. +- **Budget envelope.** The set of caps a thinking-enabled request + agrees to be governed by: phase-1 cap, combined cap, and reply- + budget reserve. See §3. +- **Close kind.** How `</think>` ended up in the stream. See §7. + +## 3. Configuration + +Server-side configuration establishes the **ceilings** that bound +every request's budget envelope. Per-request fields (see §4) may +request *tighter* values than the ceilings, but never looser — this +gives operators an unconditional resource-protection guarantee while +letting clients tune for their use case (short chat vs. deep +reasoning). + +### 3.1 Configuration sources + +The server resolves each knob from the first source that provides a +value, in this order: + +1. **Explicit CLI flag** (e.g. `--think-max-tokens N`). +2. **Model card sidecar.** A JSON file at + `share/model_cards/<name>.json`, where `<name>` is the loaded + GGUF's `general.name` metadata normalized to lowercase with + spaces replaced by `-`. Carries values from the upstream model + card (HuggingFace README or `generation_config.json`). +3. 
**Per-family fallback table**, built into the C++ server, keyed + on the detected architecture (e.g. `qwen35`, `gemma4`, `laguna`). + A coarse safety net for known families when no sidecar is shipped. +4. **Hard fallback**: + `default_max_tokens=16000`, `hard_limit_reply_budget=4096` + (raised from 512 on 2026-05-25 — see `hard_limit_reply_budget` row in + the sidecar field table below for why), + `think_max_tokens = default_max_tokens − hard_limit_reply_budget`. + +The resolution is reported in the startup banner so operators can +see which source supplied each value. + +### 3.2 Server CLI + +``` +dflash_server \ + --think-max-tokens 32256 \ # Phase-1 ceiling + --default-max-tokens 32768 \ # Combined ceiling when the + # request omits max_tokens + --hard-limit-reply-budget 4096 \ # Reply-reserve ceiling (default) + --reasoning-effort-low 4032 \ # Phase-1 budget for effort=low + --reasoning-effort-medium 16128 \ # Phase-1 budget for effort=medium + --reasoning-effort-high 32256 \ # Phase-1 budget for effort=high + --reasoning-effort-x-high 56832 \ # Phase-1 budget for effort=x-high + --reasoning-effort-max 81408 \ # Phase-1 budget for effort=max + # (each capped at --max-ctx − --hard-limit-reply-budget) +``` + +CLI flags always win. Omit any flag to take the value from the +model card sidecar, family fallback, or hard fallback in turn. + +### 3.3 Model card sidecar + +Each known model has a sidecar JSON at +`share/model_cards/.json`. 
The file carries values transcribed +from the upstream model card so future-us can reason about +provenance: + +```json +{ + "name": "Qwen3.6 27B", + "source": "https://huggingface.co/Qwen/Qwen3.6-27B", + "verified_at": "2026-05-23", + "max_tokens": 32768, + "complex_problem_max_tokens": 81920, + "sampling": {"temperature": 1.0, "top_p": 0.95, "top_k": 20}, + "reasoning_effort_tiers": { + "low": 4032, + "medium": 16128, + "high": 32256, + "x-high": 56832, + "max": 81408 + } +} +``` + +Fields: + +| Field | Meaning | +|---|---| +| `name` | Display name. Informational; the filename is what matters for lookup. | +| `source` | URL of the upstream card we transcribed. | +| `verified_at` | ISO date the values were last checked against the source. | +| `max_tokens` | The card's standard recommended combined cap. Drives `default_max_tokens`. | +| `complex_problem_max_tokens` | Optional. The card's recommendation for hard reasoning / benchmark workloads. Drives the `x-high` and `max` effort tiers, which sit *above* `default_max_tokens` when this field is present — they are admissible as long as they fit under `max_ctx − hard_limit_reply_budget`. If omitted, both collapse to the `high` tier value. | +| `hard_limit_reply_budget` | Optional. Tokens reserved post-`` for the visible answer phase, used both to derive `think_max_tokens = max_tokens − hard_limit_reply_budget` and as the force-close trigger inside `do_ar_decode` / `do_spec_decode` (when `n_gen − generated ≤ hard_limit_reply_budget`, the engine overrides the next sampled token with ``). Default 4096 (raised from 512 on 2026-05-25). The original 512 came from `ds4_eval.c`, sized for DeepSeek-V4-flash's terse style, but it silently truncated almost every other model mid-answer — bench results from `dflash/docs/experiments/gemma4-26b-thinking-control-2026-05-25.md` showed every force-closed thinking probe getting cut off mid-coordinate-geometry-proof at 512. 
Without priors on a specific model, 4096 is the safer default; terse models should override down. Qwen3.6, Gemma 4 26B, Gemma 4 31B all ship 4096 in their sidecars. | +| `sampling` | Recommended sampler params. Used as defaults when the request doesn't pin sampler values. | +| `reasoning_effort_tiers` | Explicit phase-1 budgets per tier. Override any computed default. Whichever tiers are present win; missing tiers fall through to the computed defaults below. | + +If the sidecar omits `reasoning_effort_tiers`, tier values are +computed from `max_tokens` and `complex_problem_max_tokens`: + +| Tier | Default formula | +|---|---| +| `low` | `think_max × 0.125` | +| `medium` | `think_max × 0.5` | +| `high` | `think_max × 1.0` (= ceiling derived from `max_tokens`) | +| `x-high` | `(think_max + complex_think_max) / 2` | +| `max` | `complex_think_max` | + +Where `think_max = max_tokens − hard_limit_reply_budget` and +`complex_think_max = complex_problem_max_tokens − hard_limit_reply_budget` +(or `think_max` if the card has no complex recommendation). + +The explicit `reasoning_effort_tiers` field exists because the +ratio-based defaults don't fit every model. A smaller model that +caps at 8192 tokens has a very different tier curve from Qwen3.6's +32768/81920 envelope, and the model card author is in a better +position to pick sensible numbers than a global formula. + +For the Qwen3.6 example above (`max_tokens=32768`, +`complex_problem_max_tokens=81920`), the resolved tiers are +`low=4032, medium=16128, high=32256, x-high=56832, max=81408`. +The `x-high` and `max` values exceed `default_max_tokens`, but they +are *phase-1 budgets* — clients that want to use them in full must +also pass an explicit `max_tokens` ≥ `tier_value + +hard_limit_reply_budget` on the request. With smaller `max_tokens`, +the request parser narrows the effective phase-1 cap to fit (see +§4.4). 
The tiers stay distinct rather than collapsing to `high` +because the ceiling that bounds them is `max_ctx`, not +`default_max_tokens`. + +### 3.4 Hard fallback + +When no sidecar matches the loaded model and no family fallback +applies, the server uses the `antirez/ds4 ds4_eval.c` reference +values, with the reply reserve raised from 512 to 4096 (see §3.1): + +| Knob | Hard fallback | Role | +|---|---|---| +| `--default-max-tokens` | 16000 | Combined ceiling | +| `--hard-limit-reply-budget` | 4096 | Reply-reserve ceiling | +| `--think-max-tokens` | 11904 | Phase-1 ceiling (= 16000 − 4096) | + +Effort tiers in this configuration: `low=1488`, `medium=5952`, +`high=11904`, `x-high=11904`, `max=11904` (the last two collapse to +`high` because no `complex_problem_max_tokens` is defined). + +### 3.5 Effort-tier invariants + +The server enforces these invariants at startup and clamps with a +warning if violated: + +- `low ≤ medium ≤ high ≤ x-high ≤ max` +- `max ≤ max_ctx − hard_limit_reply_budget` + +The server's `--max-ctx` is the absolute ceiling for any single +request — including its phase-1 portion. Effort tiers are *phase-1 +budgets*, not combined budgets; a tier value larger than +`default_max_tokens` is well-defined. It just means a client that +wants to use that tier's full budget needs to pass an explicit +`max_tokens` ≥ `tier_value + hard_limit_reply_budget` on the +request. With smaller `max_tokens`, the request parser narrows the +effective phase-1 cap to `min(tier_value, request.max_tokens − +hard_limit_reply_budget)` (see §4.4). + +A request that asks for an effort tier exceeding the model's +ceiling (e.g. `effort: "max"` on a model whose card has no +`complex_problem_max_tokens`) gets the `high` value with no error. + +## 4. Request shape + +There are two equivalent ways a client opts into the budget envelope. +Both unlock Level 1, Level 2, and `finish_details` emission. 
+ +### 4.1 Anthropic-style `thinking` + +```json +{ + "model": "...", + "messages": [...], + "max_tokens": 16000, + "thinking": { + "type": "enabled", + "budget_tokens": 4000, + "reply_budget": 300 + } +} +``` + +| Field | Meaning | +|---|---| +| `thinking.type` | `"enabled"` activates the envelope; anything else (or omitting `thinking`) keeps the legacy single-cap behaviour. | +| `thinking.budget_tokens` | Optional. Client-preferred phase-1 cap. Effective value = `min(budget_tokens, --think-max-tokens)`. Omit to use the server default. | +| `thinking.reply_budget` | Optional. Client-preferred reply reserve for Level 2 force-close. Effective value = `min(reply_budget, --hard-limit-reply-budget)`. Omit to use the server default. | + +### 4.2 OpenAI Responses-style `reasoning.effort` + +```json +{ + "model": "...", + "input": "...", + "reasoning": {"effort": "medium"} +} +``` + +| Field | Meaning | +|---|---| +| `reasoning.effort` | One of `"low"`, `"medium"`, `"high"`, `"x-high"`, or `"max"`. Each value selects a server-configured phase-1 budget (see §3) and activates the envelope. | + +`reasoning.effort` is the simpler shape for clients that don't want +to pick a token number. The effective phase-1 budget is the +`--reasoning-effort-<tier>` value at the chosen tier; the reply +reserve falls back to `--hard-limit-reply-budget`. + +The five-tier vocabulary is a dflash extension to the +OpenAI Responses three-tier (`low | medium | high`) standard. +Clients that send only OpenAI-standard values continue to work; the +extra tiers (`x-high`, `max`) let clients opt in to the model card's +complex-problem budget when the prompt warrants it. + +An unknown tier value falls back to `high` rather than erroring, so +clients that send a future tier (e.g. `"ultra"`) get sensible +behaviour instead of a 400. + +### 4.3 Combining the two + +If a request sets **both** `thinking.budget_tokens` and +`reasoning.effort`, `thinking.budget_tokens` wins (it is the more +specific control). 
The effort tier still selects defaults for any +unspecified `thinking.*` fields. This keeps mixed-dialect clients +predictable and lets per-request fine-tuning sit on top of a coarse +effort knob. + +### 4.4 Clamping rules + +All per-request budget fields clamp to the server ceiling — clients +can ask for *less* than the operator-configured ceiling but never +*more*: + +| Per-request field | Clamp | +|---|---| +| `thinking.budget_tokens` | `min(requested, --think-max-tokens)` | +| `thinking.reply_budget` | `min(requested, --hard-limit-reply-budget)` | +| `max_tokens` (combined cap) | `min(requested, --default-max-tokens)` | + +The server emits a single per-request log line whenever a clamp +fires, recording requested-vs-effective values for both fields. No +error response — clamping is silent at the wire to preserve OpenAI/ +Anthropic protocol compatibility. + +When `reasoning.effort` is set, the request's effective phase-1 +cap is `min(effort_tier_value, request.max_tokens − +hard_limit_reply_budget)`. The effort tier value is the server +configuration looked up from the resolved model card (or CLI +override); the per-request narrowing accommodates clients that +choose a tier (e.g. `"max"`) without also passing an explicit +`max_tokens`. If the client wants to use the full tier budget, +they must also pass a large enough `max_tokens` — otherwise the +effective cap silently narrows to what fits in `max_tokens` after +reserving the reply budget. This narrowing is logged once at info +level (not a warning — it is normal and expected behaviour). + +### 4.5 Why client-side controls are bounded, not full overrides + +A previous design allowed clients to override the server budget +entirely. The footgun was: middleboxes that did not understand the +new fields silently dropped them, leaving requests to hit the +server's combined `max_tokens` as their only cap — invariably +truncating mid-reasoning and producing artificially low quality +numbers in cross-server benchmarks. 
+ +Clamping to the server ceiling resolves this asymmetrically: if a +middlebox drops the per-request field, the server falls back to its +configured default (which is a reasonable production policy), not to +the much-larger combined cap. Clients still get useful behaviour; +nobody silently truncates mid-thought. + +## 5. Close strategies + +When a request opts into the budget envelope the server uses one of +two strategies to ensure the response contains a visible reply, with +Level 2 preferred where the backend supports it (see §5.2). Both are independent of the model architecture +in their contract; their implementation differs per backend. + +### 5.1 Level 1 — phase-2 reprompt + +When the daemon finishes phase-1 generation and `</think>` did not +appear in the stream, the server constructs a fresh prompt: + +``` +<original prompt> +<think> +<phase-1 reasoning> +</think> +Final answer: +``` + +It then runs a second daemon call against that prompt for at most +`max_tokens − phase1_emitted` more tokens and appends the result as +the visible reply. + +Level 1 works on any backend; it does not require sampling-loop +integration. Its cost is one extra prefill of the phase-1 reasoning, +which dominates for long traces. + +### 5.2 Level 2 — in-process force-close + +When supported by the backend (currently Qwen3.5/3.6, Gemma4, Laguna), +the server avoids the phase-2 reprompt by overriding sampling in the +generation loop: + +- Track the number of tokens generated since entry to the AR loop. +- When `(n_gen − generated) ≤ --hard-limit-reply-budget`, the + remaining headroom is dedicated to the visible reply. Override the + next sampled token with the tokenizer's `</think>` close-tag. +- Close tags that tokenize to multiple ids (e.g. DeepSeek/Laguna, + where `</think>` is `[1718, 37947, 32]`) are injected as a multi- + token sequence: each subsequent loop iteration overrides one more + token until the sequence is complete. Single-token close tags + (Qwen3.6 `</think>` = id 248069) finish in one override. +- After the close sequence, normal sampling resumes. 
The model + continues from a still-hot KV cache and writes the visible reply + naturally, with `--hard-limit-reply-budget` tokens of headroom. + +Level 2 is strictly cheaper than Level 1 (no reprompt, no second +prefill, KV cache preserved) and produces a higher-quality reply +because the model's reasoning context is still in-frame when it +writes the answer. + +When a Level 2-capable backend serves a thinking-enabled request, +Level 2 fires first. Level 1 remains as a fallback for backends that +do not yet implement the BudgetHook, and for safety in case Level 2 +encounters an unexpected state. + +### 5.3 Budget arithmetic + +In Level 2 the budget check runs against tokens **generated in the +current AR loop**, not against the absolute KV position: + +``` +generated = committed_now − committed_at_entry +remaining = n_gen − generated +if remaining ≤ effective_reply_budget: force-close +``` + +Where `effective_reply_budget` is the per-request `thinking.reply_budget` +clamped to `--hard-limit-reply-budget` (see §4.4), and `n_gen` is the +effective phase-1 cap: `thinking.budget_tokens` clamped to +`--think-max-tokens` if set, otherwise the `reasoning.effort` tier value +narrowed by `request.max_tokens − hard_limit_reply_budget` (see §4.4). + +The generated-since-entry frame matters because `committed_now` +includes the prompt length and any tokens already committed before +AR took over (e.g. when the spec-decode path tails off into AR for +the final stretch). Without the offset the check would fire +`prompt_len` tokens early and could go negative after spec-decode +tail-off, force-closing immediately as AR began. + +## 6. Response shape + +### 6.1 Reasoning text — multi-dialect aliases + +Different reasoning-capable APIs put the reasoning trace under +different keys. There is no agreed-upon standard; each provider +picked one shape and tooling has fragmented around it. 
+ +| API | Reasoning text field | Reasoning-token count field | +|---|---|---| +| OpenAI o1/o3 | not exposed (tokens are hidden) | `usage.completion_tokens_details.reasoning_tokens` | +| Anthropic Claude | `content[]: {type:"thinking", thinking:"...", signature:"..."}` (typed block) | `usage.thinking_tokens` | +| DeepSeek R1 | `message.reasoning_content` (flat string) | inferred from totals | +| Qwen3 native | inline `...` in `message.content` | not exposed | +| OpenRouter | `message.reasoning` (flat) + `message.reasoning_details[]` (typed-block list) | `usage.completion_tokens_details.reasoning_tokens` | + +dflash_server emits the reasoning text under **all** of the flat- +string names plus the typed-block list, and the OpenAI-shaped token +count, so any client written against any of these shapes works +without per-server remapping: + +```json +{ + "choices": [{ + "message": { + "role": "assistant", + "content": "Final visible answer.", + "reasoning_content": "Phase-1 reasoning text…", + "reasoning": "Phase-1 reasoning text…", + "reasoning_details": [ + {"type": "reasoning.text", "text": "Phase-1 reasoning text…"} + ] + }, + "finish_reason": "stop", + "finish_details": { + "close_kind": "natural", + "thinking_tokens": 8421, + "content_tokens": 312, + "total_tokens": 8733 + } + }], + "usage": { + "prompt_tokens": 201, + "completion_tokens": 8733, + "total_tokens": 8934, + "completion_tokens_details": { + "reasoning_tokens": 8421 + } + } +} +``` + +Field semantics: + +- **`message.content`** — the visible reply (post-`` text). + Standard OpenAI Chat Completions field. +- **`message.reasoning_content`** — flat string with the full + reasoning text. DeepSeek R1 convention. Primary field; tooling + that knows only one of these field names should know this one. +- **`message.reasoning`** — same string as `reasoning_content`, + under OpenRouter's normalized name. +- **`message.reasoning_details`** — a list of typed reasoning + blocks. 
Today always exactly one `{type:"reasoning.text", text:…}` + block carrying the full reasoning. The list shape leaves room to + add phase-1/phase-2 splits, Anthropic-style signature fields, or + per-stage metadata in a future version without breaking clients. +- **`usage.completion_tokens_details.reasoning_tokens`** — count of + tokens attributed to reasoning. Matches OpenAI o1/o3's location + and OpenRouter's normalization. +- **`finish_details`** — see §6.2. + +The three `message.*` reasoning fields carry the same reasoning text. They +are emitted together; clients should not assume they will diverge. + +### 6.2 `finish_details` + +When a request opts into the budget envelope, the response carries +an additional `finish_details` object alongside the standard OpenAI +`finish_reason`: + +```json +"finish_details": { + "close_kind": "natural" | "hard", + "thinking_tokens": <int>, + "content_tokens": <int>, + "total_tokens": <int> +} +``` + +- `close_kind` — see §7. +- `thinking_tokens` — tokens generated while the model was inside + the `<think>` block. Equal to `usage.completion_tokens_details.reasoning_tokens`. +- `content_tokens` — tokens generated for the visible reply, summed + across phase-1 (post-`</think>` if the model self-closed early) + and phase-2 (Level 1 reprompt output). +- `total_tokens` — `thinking_tokens + content_tokens`. + +`finish_reason` continues to follow OpenAI semantics +(`stop` / `length` / `tool_calls`). `finish_details` is additive: +clients that don't know about it ignore it. + +`finish_details` is omitted when the request did not opt into the +budget envelope (neither `thinking:{type:"enabled"}` nor `reasoning.effort` was set). 
+ +### 6.3 Response timings + +When the server completes a request, the response's `usage` block +carries a `timings` object with per-request performance metrics: + +```json +"usage": { + "prompt_tokens": 256, + "completion_tokens": 1024, + "total_tokens": 1280, + "completion_tokens_details": { "reasoning_tokens": 512 }, + "timings": { + "prefill_ms": 234.5, + "decode_ms": 2456.7, + "decode_tokens_per_sec": 416.8 + } +} +``` + +- `prefill_ms` — wall time spent processing the input prompt before + generating the first output token (KV cache fill). Excludes queue + and scheduling overhead. +- `decode_ms` — wall time spent generating output tokens + (`completion_tokens` of them). Includes speculative-decode overhead. +- `decode_tokens_per_sec` — `completion_tokens / (decode_ms * 0.001)`. + The model's effective throughput on this request. Emitted as + `0.0` when `decode_ms` is zero (prefill-only / count-tokens paths) + rather than `null` / `NaN` so JSON parsers don't trip. + +These fields are emitted on every response (OpenAI Chat Completions, +Anthropic Messages, OpenAI Responses), regardless of whether the +thinking-budget envelope was opted into. Additive to the OpenAI / +Anthropic shape; clients that don't know the field ignore it. For +streaming requests, `timings` appears in the terminal usage chunk +(OpenAI), the `message_delta.usage` event (Anthropic), and the +`response.completed.usage` payload (Responses). + +## 7. Close-kind taxonomy + +`finish_details.close_kind` records how the `<think>` block ended. +The current taxonomy is: + +| Value | Meaning | +|---|---| +| `natural` | The model emitted `</think>` on its own, either before reaching the phase-1 cap or before Level 2 had to force-close. | +| `hard` | The phase-1 cap was reached without a model-emitted `</think>`. Either Level 2 force-closed the block in-loop (preserving KV) or Level 1 ran the phase-2 reprompt. 
| + +A third value `soft` is reserved for a future voluntary-close +mechanism (logit-biasing the model toward `</think>` as the cap +approaches, before forcing it). Reserved so consumers can switch on +the value without an exhaustive-match warning when a future server +version adds it; not emitted today. + +## 8. Streaming + +Streaming responses (`stream: true`) honor the same configuration +knobs and emit the same reasoning text via the format-appropriate +SSE deltas (OpenAI `delta.reasoning_content`, Anthropic +`content_block_delta` with `thinking_delta`, OpenRouter +`delta.reasoning`). + +`finish_details` is emitted in the final chunk for OpenAI Chat and +in the terminal `message_delta` event for Anthropic. + +## 9. Out of scope + +- **Per-request budget *override* (unclamped).** §4 describes the + bounded form: clients can request *tighter* budgets than the + server-configured ceiling, never looser. Allowing full override + would re-create the silent-truncation footgun of middleboxes that + drop unknown fields. +- **Soft close-kind / soft-budget hint.** The mechanism (logit bias + to nudge `</think>` selection before the hard cap) is sketched in + §7 but not specified. +- **Per-token close-info metadata.** The upstream reference exposes + `(token_index, remaining_budget, rank)` for the close event. The + current `finish_details` reports aggregate counts only. +- **Phase-1/phase-2 split inside `reasoning_details`.** Today the + list always carries exactly one block. A future version may add + per-phase blocks (`[{phase:1, …}, {phase:2, …}]`) — the typed-list + shape was chosen specifically to allow this without breaking + clients. 
diff --git a/share/model_cards/README.md b/share/model_cards/README.md new file mode 100644 index 000000000..762132161 --- /dev/null +++ b/share/model_cards/README.md @@ -0,0 +1,77 @@ +# Model cards + +Sidecar JSON files carrying per-model defaults transcribed from the +upstream model card (typically the HuggingFace README + +`generation_config.json`). + +`dflash_server` reads these at startup to set sensible +`--default-max-tokens`, `--think-max-tokens`, sampler, and +`reasoning.effort` tier values for the loaded model. The CLI +flags still override anything here. See +[docs/specs/thinking-budget.md §3](../../docs/specs/thinking-budget.md) +for the resolution order. + +## Lookup + +The server normalises the loaded GGUF's `general.name` metadata to +lowercase with spaces replaced by `-`, then looks for +`share/model_cards/.json`. A missing file falls back to +the per-family table built into the server, then to the hard +fallback (`antirez/ds4 ds4_eval.c` reference values). + +## Adding a new card + +1. Find the upstream model card (HuggingFace README + + `generation_config.json`). +2. Note the recommended `max_tokens` (or equivalent), and any + separate recommendation for hard reasoning / benchmarking + workloads. +3. Author a JSON file in this directory. Set `source` to the URL + you used and `verified_at` to today's ISO date. +4. The file is bundled into the Docker image and read at server + startup. No recompile needed. + +## Fields + +See [docs/specs/thinking-budget.md §3.3](../../docs/specs/thinking-budget.md) +for the full field reference. + +| Field | Required | Notes | +|---|---|---| +| `name` | yes | Display name; informational. | +| `source` | yes | URL of the upstream card. | +| `verified_at` | yes | ISO date these values were last checked. | +| `max_tokens` | yes | The card's standard recommendation. | +| `download_urls` | no | Map of variant tag (e.g. `Q4_K_M`, `bf16`) to GGUF download URL. Used by deployment tooling. 
| +| `complex_problem_max_tokens` | no | For hard reasoning / benchmarking. Used to compute `x-high` and `max` effort tiers. | +| `sampling` | no | Recommended sampler defaults. | +| `reasoning_effort_tiers` | no | Explicit per-tier phase-1 budgets. Overrides any computed defaults. Use this when the ratio-based defaults don't fit the model. | +| `notes` | no | Free-form notes about provenance, caveats, or non-card-derived choices. | + +## Validating a sidecar + +The schema for these files lives at [`_schema.json`](_schema.json) +(JSON Schema draft 2020-12). Any author-facing JSON Schema validator +works; a couple of examples: + +```bash +# Python (stdlib + jsonschema) +python -m pip install jsonschema +python -c "import json, jsonschema; \ + schema=json.load(open('share/model_cards/_schema.json')); \ + doc=json.load(open('share/model_cards/qwen3.6-27b.json')); \ + jsonschema.Draft202012Validator(schema).validate(doc); print('OK')" + +# Node (ajv-cli) +npx --yes ajv-cli@5 validate \ + -s share/model_cards/_schema.json \ + -d share/model_cards/qwen3.6-27b.json \ + --spec=draft2020 +``` + +`additionalProperties: false` is set at the root, so typos in field +names (e.g. `verified_on` instead of `verified_at`) surface as +validation errors instead of being silently ignored by `dflash_server`. +The server itself does a runtime sanity check for the four required +fields when loading a sidecar and warns (does not fail-start) when one +is missing. diff --git a/share/model_cards/_schema.json b/share/model_cards/_schema.json new file mode 100644 index 000000000..3fc204cb4 --- /dev/null +++ b/share/model_cards/_schema.json @@ -0,0 +1,85 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://github.com/lucebox/lucebox-hub/blob/main/share/model_cards/_schema.json", + "title": "dflash model card sidecar", + "description": "Sidecar JSON carrying per-model defaults transcribed from the upstream model card. 
Read by dflash_server at startup to set --default-max-tokens, --think-max-tokens, sampler, and reasoning.effort tier values. See docs/specs/thinking-budget.md §3.3 for the full field reference.", + "type": "object", + "additionalProperties": false, + "required": ["name", "source", "verified_at", "max_tokens"], + "properties": { + "name": { + "type": "string", + "description": "Display name. Informational; the filename is what matters for lookup." + }, + "source": { + "type": "string", + "format": "uri", + "description": "URL of the upstream model card we transcribed." + }, + "verified_at": { + "type": "string", + "format": "date", + "description": "ISO 8601 date (YYYY-MM-DD) the values were last checked against the source." + }, + "max_tokens": { + "type": "integer", + "minimum": 1, + "description": "The card's standard recommended combined cap. Drives default_max_tokens." + }, + "download_urls": { + "type": "object", + "description": "Optional. Map of variant tag (e.g. Q4_K_M, bf16) to GGUF download URL. Used by deployment tooling to resolve a sidecar to a concrete artifact.", + "additionalProperties": { + "type": "string", + "format": "uri" + } + }, + "complex_problem_max_tokens": { + "type": "integer", + "minimum": 1, + "description": "Optional. Card's recommendation for hard reasoning / benchmark workloads. Drives x-high and max effort tiers." + }, + "hard_limit_reply_budget": { + "type": "integer", + "minimum": 256, + "description": "Optional. Tokens reserved post-`` for the visible answer phase. When `(n_gen - generated) <= hard_limit_reply_budget` during decode, the engine injects `thinking_terminator_hint` (or the bare `thinking_marker` if no hint is set). Default 4096 (raised from 512 on 2026-05-25 — the original was sized for DeepSeek-V4-flash's terse style but silently truncated almost every other model mid-answer). Terse models can override down to 512-1024; verbose math/code models keep 4096. 
Drives both `think_max_tokens = max_tokens - hard_limit_reply_budget` and the force-close trigger in do_ar_decode." + }, + "thinking_marker": { + "type": "string", + "description": "Optional. Bytes that signal end-of-thinking to PARSERS — bench grader, chat template, response formatter, started_in_thinking detection. If empty, the server uses arch defaults (`</think>` for qwen3-family, `<channel|>` for gemma4, `</think>` elsewhere). Sidecar override exists for arches we haven't enumerated yet; for current arches leave empty." + }, + "thinking_terminator_hint": { + "type": "string", + "description": "Optional. Trained directive injected mid-stream when the budget hook fires. Tells the MODEL to wrap up. Tokenized at startup; the AR decode loop overrides sampled tokens with this sequence verbatim when `(n_gen - generated) <= hard_limit_reply_budget`. The server does NOT auto-append the marker — if the operator wants the close marker in the inject, they include it in the hint. This lets callers test whether a hint alone induces self-close (model emits marker itself) vs a full directive+marker (we force the close). For Qwen3.x the canonical hint per the Qwen3 technical report (arXiv 2505.09388) DOES include the marker: 'Considering the limited time by the user, I have to give the solution based on the thinking directly now.\\n</think>\\n\\n'. If empty, the server emits the bare `thinking_marker` only — safe but the model may continue derivation in content space." + }, + "sampling": { + "type": "object", + "description": "Optional. Recommended sampler defaults; used when the request omits a field.", + "additionalProperties": false, + "properties": { + "temperature": { "type": "number" }, + "top_p": { "type": "number" }, + "top_k": { "type": "integer" }, + "min_p": { "type": "number" }, + "presence_penalty": { "type": "number" }, + "repetition_penalty": { "type": "number" } + } + }, + "reasoning_effort_tiers": { + "type": "object", + "description": "Optional. Explicit per-tier phase-1 budgets. 
Overrides any computed default. Use when ratio-based defaults don't fit the model.", + "additionalProperties": false, + "properties": { + "low": { "type": "integer", "minimum": 1 }, + "medium": { "type": "integer", "minimum": 1 }, + "high": { "type": "integer", "minimum": 1 }, + "x-high": { "type": "integer", "minimum": 1 }, + "max": { "type": "integer", "minimum": 1 } + } + }, + "notes": { + "type": "string", + "description": "Optional. Free-form notes about provenance, caveats, or non-card-derived choices." + } + } +} diff --git a/share/model_cards/gemma-4-26b-a4b-it.json b/share/model_cards/gemma-4-26b-a4b-it.json new file mode 100644 index 000000000..2d43d4b96 --- /dev/null +++ b/share/model_cards/gemma-4-26b-a4b-it.json @@ -0,0 +1,23 @@ +{ + "name": "Gemma 4 26B-A4B IT", + "source": "https://huggingface.co/google/gemma-4-26B-A4B-it", + "verified_at": "2026-05-24", + "download_urls": { + "Q4_K_M-target": "https://huggingface.co/bartowski/google_gemma-4-26B-A4B-it-GGUF/resolve/main/google_gemma-4-26B-A4B-it-Q4_K_M.gguf", + "Q5_K_M-target": "https://huggingface.co/bartowski/google_gemma-4-26B-A4B-it-GGUF/resolve/main/google_gemma-4-26B-A4B-it-Q5_K_M.gguf", + "Q8_0-target": "https://huggingface.co/bartowski/google_gemma-4-26B-A4B-it-GGUF/resolve/main/google_gemma-4-26B-A4B-it-Q8_0.gguf", + "DFlash-draft": "https://huggingface.co/Lucebox/gemma-4-26B-A4B-it-DFlash-GGUF/resolve/main/gemma-4-26B-A4B-it-DFlash-q8_0.gguf" + }, + "notes": "Gemma 4 26B-A4B IT — multimodal MoE (25.2B total / 3.8B active / 128 experts, 8 active + 1 shared). Reasoning-capable via `<|think|>` token at start of system prompt. Native 256K context. 
Target GGUFs are community quantizations by bartowski; the DFlash drafter is published by Lucebox.", + "max_tokens": 16384, + "hard_limit_reply_budget": 4096, + "thinking_terminator_hint": "\n\n", + "sampling": { + "temperature": 1.0, + "top_p": 0.95, + "top_k": 64, + "min_p": 0.0, + "presence_penalty": 0.0, + "repetition_penalty": 1.0 + } +} diff --git a/share/model_cards/gemma-4-31b-it.json b/share/model_cards/gemma-4-31b-it.json new file mode 100644 index 000000000..023b079e4 --- /dev/null +++ b/share/model_cards/gemma-4-31b-it.json @@ -0,0 +1,23 @@ +{ + "name": "Gemma 4 31B IT", + "source": "https://huggingface.co/google/gemma-4-31B-it", + "verified_at": "2026-05-24", + "download_urls": { + "Q4_K_M-target": "https://huggingface.co/bartowski/google_gemma-4-31B-it-GGUF/resolve/main/google_gemma-4-31B-it-Q4_K_M.gguf", + "Q5_K_M-target": "https://huggingface.co/bartowski/google_gemma-4-31B-it-GGUF/resolve/main/google_gemma-4-31B-it-Q5_K_M.gguf", + "Q8_0-target": "https://huggingface.co/bartowski/google_gemma-4-31B-it-GGUF/resolve/main/google_gemma-4-31B-it-Q8_0.gguf", + "DFlash-draft": "https://huggingface.co/Lucebox/gemma-4-31B-it-DFlash-GGUF/resolve/main/gemma-4-31B-it-DFlash-q8_0.gguf" + }, + "notes": "Gemma 4 31B IT — multimodal dense (30.7B params). Reasoning-capable via `<|think|>` token (or `enable_thinking=True` chat template kwarg). Native 256K context. 
Target GGUFs are community quantizations by bartowski; the DFlash drafter is published by Lucebox.", + "max_tokens": 16384, + "hard_limit_reply_budget": 4096, + "thinking_terminator_hint": "\n\n", + "sampling": { + "temperature": 1.0, + "top_p": 0.95, + "top_k": 64, + "min_p": 0.0, + "presence_penalty": 0.0, + "repetition_penalty": 1.0 + } +} diff --git a/share/model_cards/laguna-xs.2.json b/share/model_cards/laguna-xs.2.json new file mode 100644 index 000000000..bc0dda85a --- /dev/null +++ b/share/model_cards/laguna-xs.2.json @@ -0,0 +1,19 @@ +{ + "name": "Laguna-XS.2", + "source": "https://huggingface.co/Lucebox/Laguna-XS.2-GGUF", + "verified_at": "2026-05-24", + "download_urls": { + "Q4_K_M": "https://huggingface.co/Lucebox/Laguna-XS.2-GGUF/resolve/main/laguna-xs2-Q4_K_M.gguf", + "bf16": "https://huggingface.co/Lucebox/Laguna-XS.2-GGUF/resolve/main/laguna-xs2-bf16.gguf" + }, + "notes": "Non-reasoning MoE code model (3B active / 33B total). Card does not specify generation params or a complex-problem mode. context: native 4096, PFlash-extended to 131072. Sampling defaults below are code-model-typical (not from card). 
general.name has not been verified against a loaded GGUF — confirm and rename file if needed.", + "max_tokens": 4096, + "sampling": { + "temperature": 0.6, + "top_p": 0.95, + "top_k": 50, + "min_p": 0.0, + "presence_penalty": 0.0, + "repetition_penalty": 1.0 + } +} diff --git a/share/model_cards/qwen3.6-27b.json b/share/model_cards/qwen3.6-27b.json new file mode 100644 index 000000000..94094dddf --- /dev/null +++ b/share/model_cards/qwen3.6-27b.json @@ -0,0 +1,24 @@ +{ + "name": "Qwen3.6 27B", + "source": "https://huggingface.co/Qwen/Qwen3.6-27B", + "verified_at": "2026-05-25", + "max_tokens": 32768, + "complex_problem_max_tokens": 81920, + "hard_limit_reply_budget": 4096, + "thinking_terminator_hint": "Considering the limited time by the user, I have to give the solution based on the thinking directly now.\n</think>\n\n", + "sampling": { + "temperature": 1.0, + "top_p": 0.95, + "top_k": 20, + "min_p": 0.0, + "presence_penalty": 0.0, + "repetition_penalty": 1.0 + }, + "reasoning_effort_tiers": { + "low": 4032, + "medium": 16128, + "high": 32256, + "x-high": 56832, + "max": 81408 + } +} \ No newline at end of file