Luce-Org · davide221 · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/dflash/CMakeLists.txt b/dflash/CMakeLists.txt
@@ -688,6 +688,7 @@ if(DFLASH27B_TESTS)
         add_executable(dflash_server
             src/server/server_main.cpp
             src/server/http_server.cpp
+            src/server/model_card.cpp
         )
         target_include_directories(dflash_server PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
         if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
@@ -753,7 +754,9 @@ if(DFLASH27B_TESTS)
 
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_server_unit.cpp")
         add_executable(test_server_unit test/test_server_unit.cpp)
-        target_sources(test_server_unit PRIVATE src/server/http_server.cpp)
+        target_sources(test_server_unit PRIVATE
+            src/server/http_server.cpp
+            src/server/model_card.cpp)
         target_include_directories(test_server_unit PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
         if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
             target_compile_definitions(test_server_unit PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP)

diff --git a/dflash/src/common/model_backend.h b/dflash/src/common/model_backend.h
@@ -46,6 +46,39 @@ struct DaemonIO {
 
 // ─── Generate request/result ────────────────────────────────────────────
 
+// Thinking-budget force-close hook. Mirrors antirez/ds4 ds4_eval.c's
+// hard_limit_reply_budget semantics: when the budget remaining (n_gen
+// minus tokens generated so far in this call, not the absolute KV
+// position) falls to hard_limit_remaining, the next sampled tokens are
+// overridden with close_token_ids in order, leaving the model the rest
+// of the budget to write a visible answer after the injected close.
+//
+// Single vs multi-token close:
+//   Qwen3.6: </think> is one added_token (id 248069). close_token_ids
+//            has size 1. One override + budget_close_injected=true.
+//   DeepSeek/laguna: </think> tokenizes to 3 ordinary tokens
+//            ([1718, 37947, 32] for DS-V3). close_token_ids has
+//            size 3. Three consecutive overrides, then resume.
+//
+// This is "Level 2" of our thinking-budget migration: in-process
+// mid-stream force-close, KV-continuous. Beats Level 1's phase-2
+// reprompt because the model never sees a fresh prefill — its KV
+// state continues naturally after the injected close.
+//
+// Current implementation: AR-decode only. When budget_hook is set,
+// backends MAY route generation through their AR path (skipping spec
+// decode) — the perf trade-off is acceptable since this only kicks in
+// for thinking-enabled requests. Spec-decode integration is a follow-up.
+struct BudgetHook {
+    // Close sequence injected in order once the remaining budget drops to
+    // `hard_limit_remaining`. Empty = hook disabled. For Qwen3.x: the
+    // canonical "Considering the limited time..." summarize-and-stop lead-in
+    // (tokenized at server startup); for non-qwen arches: one close-tag token.
+    // NOTE(review): disagrees with the size-1/size-3 examples above — confirm.
+    std::vector<int32_t> close_token_ids;
+    int                  hard_limit_remaining = 0;  // force-close threshold, in tokens of remaining budget
+};
+
 struct GenerateRequest {
     std::vector<int32_t>       prompt;
     int                        n_gen       = 0;
@@ -65,6 +98,8 @@ struct GenerateRequest {
     // When non-null, the spec decode loop uses these as draft overrides,
     // bypassing draft model computation for covered positions.
     const std::vector<int32_t> * hint_tokens = nullptr;
+    // Optional thinking-budget hook — see BudgetHook docs above.
+    BudgetHook                 budget_hook;
 };
 
 struct GenerateResult {
@@ -73,6 +108,19 @@ struct GenerateResult {
     std::vector<int32_t>       tokens;
     double                     prefill_s   = 0.0;
     double                     decode_s    = 0.0;
+    // True when the backend's Level 2 hook injected the </think> close
+    // sequence during this generation (vs. the model self-closing). The
+    // server uses this to attribute close_kind correctly: if the model
+    // produced </think> naturally we report "natural"; if the hook fired
+    // we report "hard". Without this flag, decoding the phase-1 token
+    // stream and grepping for "</think>" cannot distinguish the two
+    // (the injected close decodes identically).
+    bool                       budget_forced_close = false;
+    // True iff the AR decode loop's post-close watchdog detected an n-gram
+    // repetition loop and broke out early. Caller surfaces this so clients
+    // can mark the answer as unreliable rather than treating the
+    // (truncated) content as a clean response.
+    bool                       degenerate_decode_close = false;
 };
 
 // ─── Backend interface ──────────────────────────────────────────────────

diff --git a/dflash/src/gemma4/gemma4_backend.cpp b/dflash/src/gemma4/gemma4_backend.cpp
@@ -261,14 +261,84 @@ int Gemma4Backend::do_prefill(const std::vector<int32_t> & tokens,
 
 bool Gemma4Backend::do_decode(int committed, int n_gen,
                                std::vector<int32_t> & out_tokens,
-                               const DaemonIO & io) {
+                               const DaemonIO & io,
+                               const BudgetHook & budget_hook,
+                               bool * forced_close_out) {
     const int hidden = w_.n_embd;
     const int vocab  = w_.n_vocab;
     std::vector<float> embed_buf(hidden);
     std::vector<float> logits;
 
+    // Budget force-close state — same shape as qwen35's maybe_force_close.
+    // See dflash/src/common/model_backend.h BudgetHook docs for the
+    // single- vs multi-token close-tag semantics.
+    bool budget_close_started = false;
+    int  close_inject_pos     = 0;
+    // Capture entry KV position so the budget check is in the
+    // "generated since entry" frame, not the absolute KV frame.
+    // committed_now (KV position) = prompt_len + tokens_generated; n_gen
+    // is the gen-only count (or remaining-budget remap from the
+    // spec-decode tail-off). Without this entry capture, force-close
+    // fires prompt_len tokens early on prompted requests and goes
+    // negative immediately after a tail-off. (Mirrors qwen35 fix 5c785f0
+    // — same bug since this lambda was ported verbatim.)
+    const int committed_at_entry = committed;
+    auto maybe_force_close = [&](int32_t & tok, int committed_now) {
+        if (budget_hook.close_token_ids.empty()) return;
+        if (budget_close_started &&
+            close_inject_pos < (int)budget_hook.close_token_ids.size())
+        {
+            int32_t inj = budget_hook.close_token_ids[close_inject_pos];
+            std::fprintf(stderr,
+                "[budget-hook] gemma4 close-seq continue %d/%zu: overriding "
+                "sampled token %d with %d\n",
+                close_inject_pos + 1,
+                budget_hook.close_token_ids.size(), tok, inj);
+            tok = inj;
+            close_inject_pos++;
+            return;
+        }
+        if (budget_close_started) return;
+        const int generated = committed_now - committed_at_entry;
+        int remaining = n_gen - generated;
+        if (remaining <= budget_hook.hard_limit_remaining) {
+            int32_t first_close = budget_hook.close_token_ids.front();
+            if (tok == first_close) {
+                budget_close_started = true;
+                close_inject_pos = 1;
+                return;
+            }
+            std::fprintf(stderr,
+                "[budget-hook] gemma4 force-close at committed=%d/%d "
+                "(remaining=%d <= hard_limit=%d): overriding token %d "
+                "with close[0]=%d (seq len %zu)\n",
+                committed_now, n_gen, remaining,
+                budget_hook.hard_limit_remaining, tok, first_close,
+                budget_hook.close_token_ids.size());
+            tok = first_close;
+            budget_close_started = true;
+            close_inject_pos = 1;
+            if (forced_close_out) *forced_close_out = true;
+        }
+    };
+
     for (int i = 0; i < n_gen; ++i) {
-        int32_t tok = out_tokens.back();
+        // Seed for this iteration's embed step:
+        //  - Normal case: previous iteration just pushed a sampled
+        //    token onto out_tokens; we re-embed it to advance KV +
+        //    produce next-token logits.
+        //  - Empty case (spec-decode tail-off at iter 0): no prior
+        //    iteration ran, so use cache_.last_tok — that's the
+        //    prefill argmax that spec-decode would have consumed as
+        //    its initial seed. Mirrors qwen35's initial_emitted=1
+        //    pattern; without this, out_tokens.back() on an empty
+        //    vector is UB. (Codex r2 P2 follow-up: the previous fix
+        //    pushed last_tok onto out_tokens here in the caller, but
+        //    that grew out_tokens by an uncounted extra token and the
+        //    caller's `result.tokens.size()` over-counted against the
+        //    budget. Reading from cache instead keeps the budget
+        //    honest.)
+        int32_t tok = out_tokens.empty() ? cache_.last_tok : out_tokens.back();
 
         // Embed single token
         w_.embedder.embed(&tok, 1, embed_buf.data());
@@ -292,6 +362,7 @@ bool Gemma4Backend::do_decode(int committed, int n_gen,
                 if (logits[j] > best) { best = logits[j]; next = j; }
             }
         }
+        maybe_force_close(next, committed);
 
         out_tokens.push_back(next);
         io.emit(next);
@@ -310,7 +381,9 @@ bool Gemma4Backend::do_decode(int committed, int n_gen,
 
 bool Gemma4Backend::do_spec_decode(int committed, int n_gen,
                                     std::vector<int32_t> & out_tokens,
-                                    const DaemonIO & io) {
+                                    const DaemonIO & io,
+                                    const BudgetHook * budget_hook,
+                                    bool * forced_close_out) {
     const int hidden = w_.n_embd;
     int32_t last_tok = cache_.last_tok;
 
@@ -336,6 +409,60 @@ bool Gemma4Backend::do_spec_decode(int committed, int n_gen,
     while (n_generated < n_gen) {
         const int need_commit_budget = n_gen - n_generated;
 
+        // Budget tail-off: when remaining budget is within one spec-decode
+        // batch of the force-close threshold, hand off to do_decode for the
+        // tail. AR handles the close-token override cleanly; spec-decode's
+        // verify-and-accept loop can't safely inject a token mid-batch
+        // without rewriting KV.
+        //
+        // Gemma4's do_decode reads `out_tokens.back()` as the seed each
+        // iter. After the first spec-decode iteration the most-recently-
+        // committed token is on out_tokens, but on a small-budget request
+        // (budget_tokens <= reply_budget + q_len) tail-off can fire on
+        // iter 0 before out_tokens has been seeded. Codex review flagged
+        // the resulting UB on out_tokens.back(); we set cache_.last_tok
+        // and let do_decode pick it up when out_tokens is empty.
+        //
+        // Budget accounting (codex r2 P2): in the previous patch we
+        // also push_back'd last_tok before calling do_decode, which
+        // grew out_tokens by an extra token outside the budget — the
+        // caller (http_server) then saw `result.tokens.size() ==
+        // need_commit_budget + 1` and double-counted that seed against
+        // the budget. Mirror qwen35 instead: cache the seed via
+        // cache_.last_tok, leave out_tokens untouched, and have do_decode
+        // read the seed from cache when out_tokens is empty (the
+        // `out_tokens.empty() ? cache_.last_tok : ...` seed line there).
+        // That keeps the budget honest and keeps qwen35 and gemma4 symmetric.
+        if (budget_hook && !budget_hook->close_token_ids.empty()) {
+            int hard = budget_hook->hard_limit_remaining;
+            if (need_commit_budget <= hard + q_len) {
+                std::fprintf(stderr,
+                    "[budget-hook] gemma4 spec-decode tail-off at "
+                    "committed=%d/%d (remaining=%d, hard_limit=%d, "
+                    "batch=%d) — switching to AR\n",
+                    committed, n_gen, need_commit_budget, hard, q_len);
+                step_graph_destroy(draft_sg);
+                cache_.last_tok = last_tok;  // do_decode reads this when out_tokens empty
+                BudgetHook tail_hook = *budget_hook;
+                int ar_n_gen = need_commit_budget;
+                bool ok = do_decode(committed, ar_n_gen, out_tokens, io,
+                                    tail_hook, forced_close_out);
+                auto t_dec1 = std::chrono::steady_clock::now();
+                const double decode_s = std::chrono::duration<double>(t_dec1 - t_dec0).count();
+                const int total_draft_pos = std::max(1, n_draft_steps * q_len);
+                const double accept_pct = 100.0 * (double)n_accept_sum / (double)total_draft_pos;
+                std::fprintf(stderr,
+                    "[gemma4-spec] tail-off-stats tokens=%d time=%.3f s "
+                    "speed=%.2f tok/s steps=%d accepted=%d/%d (%.1f%%)\n",
+                    n_generated, decode_s,
+                    n_generated > 0 ? n_generated / decode_s : 0.0,
+                    n_draft_steps, n_accept_sum, total_draft_pos,
+                    accept_pct);
+                io.emit(-1);
+                return ok;
+            }
+        }
+
         // 1. Build noise input: [last_tok, MASK, MASK, ..., MASK]
         noise_ids[0] = last_tok;
         for (int i = 1; i < q_len; i++) noise_ids[i] = target->mask_token_id();
@@ -529,7 +656,9 @@ GenerateResult Gemma4Backend::generate(const GenerateRequest & req,
             && !sampler_.needs_logit_processing();
 
         if (can_spec) {
-            if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io)) {
+            if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io,
+                                &req.budget_hook,
+                                &result.budget_forced_close)) {
                 result.error = "spec_decode";
                 return result;
             }
@@ -578,7 +707,9 @@ GenerateResult Gemma4Backend::generate(const GenerateRequest & req,
             }
 
             if (req.n_gen > 1) {
-                if (!do_decode(committed, req.n_gen - 1, result.tokens, out_io)) {
+                if (!do_decode(committed, req.n_gen - 1, result.tokens, out_io,
+                               req.budget_hook,
+                               &result.budget_forced_close)) {
                     result.error = "decode";
                     return result;
                 }
@@ -694,7 +825,9 @@ GenerateResult Gemma4Backend::restore_and_generate(int slot,
             && sampler_.temp == 0.0f;
 
         if (can_spec) {
-            if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io)) {
+            if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io,
+                                &req.budget_hook,
+                                &result.budget_forced_close)) {
                 result.error = "spec_decode";
                 return result;
             }
@@ -743,7 +876,9 @@ GenerateResult Gemma4Backend::restore_and_generate(int slot,
             }
 
             if (req.n_gen > 1) {
-                if (!do_decode(committed, req.n_gen - 1, result.tokens, out_io)) {
+                if (!do_decode(committed, req.n_gen - 1, result.tokens, out_io,
+                               req.budget_hook,
+                               &result.budget_forced_close)) {
                     result.error = "decode";
                     return result;
                 }

diff --git a/dflash/src/gemma4/gemma4_backend.h b/dflash/src/gemma4/gemma4_backend.h
@@ -105,14 +105,28 @@ class Gemma4Backend : public ModelBackend {
                    int kv_offset = 0);
 
     // Autoregressive decode loop.
+    // budget_hook (when close_token_ids is non-empty) overrides the next
+    // sampled token(s) with the close-tag sequence once the remaining budget
+    // (n_gen minus tokens generated since entry) <= hard_limit_remaining.
+    // Mirrors qwen35's do_ar_decode. For Gemma4 the close tag is typically
+    // `<channel|>` (single token in the gemma4 vocab). forced_close_out, when
+    // non-null, is set to true iff the hook injected the close sequence (vs.
+    // the model self-closing). See qwen35_backend.h for full rationale.
     bool do_decode(int committed, int n_gen,
                    std::vector<int32_t> & out_tokens,
-                   const DaemonIO & io);
+                   const DaemonIO & io,
+                   const BudgetHook & budget_hook = {},
+                   bool * forced_close_out = nullptr);
 
     // DFlash speculative decode loop.
+    // When budget_hook is non-null with non-empty close_token_ids and
+    // (n_gen - generated) <= hard_limit_remaining + q_len (one batch of
+    // headroom), hands the tail to do_decode so the override runs in AR.
     bool do_spec_decode(int committed, int n_gen,
                         std::vector<int32_t> & out_tokens,
-                        const DaemonIO & io);
+                        const DaemonIO & io,
+                        const BudgetHook * budget_hook = nullptr,
+                        bool * forced_close_out = nullptr);
 };
 
 }  // namespace dflash::common