Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion dflash/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -688,6 +688,7 @@ if(DFLASH27B_TESTS)
add_executable(dflash_server
src/server/server_main.cpp
src/server/http_server.cpp
src/server/model_card.cpp
)
target_include_directories(dflash_server PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
Expand Down Expand Up @@ -753,7 +754,9 @@ if(DFLASH27B_TESTS)

if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_server_unit.cpp")
add_executable(test_server_unit test/test_server_unit.cpp)
target_sources(test_server_unit PRIVATE src/server/http_server.cpp)
target_sources(test_server_unit PRIVATE
src/server/http_server.cpp
src/server/model_card.cpp)
target_include_directories(test_server_unit PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
target_compile_definitions(test_server_unit PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP)
Expand Down
48 changes: 48 additions & 0 deletions dflash/src/common/model_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,39 @@ struct DaemonIO {

// ─── Generate request/result ────────────────────────────────────────────

// Thinking-budget force-close hook. Mirrors antirez/ds4 ds4_eval.c's
// hard_limit_reply_budget semantics: when the budget remaining (n_gen
// minus tokens committed so far) falls to hard_limit_remaining, the
// next sampled tokens get overridden with close_token_ids in order,
// giving the model the remaining budget to write a visible answer
// after the injected close-tag sequence.
//
// Single vs multi-token close:
// Qwen3.6: </think> is one added_token (id 248069). close_token_ids
// has size 1. One override + budget_close_injected=true.
// DeepSeek/laguna: </think> tokenizes to 3 ordinary tokens
// ([1718, 37947, 32] for DS-V3). close_token_ids has
// size 3. Three consecutive overrides, then resume.
//
// This is "Level 2" of our thinking-budget migration: in-process
// mid-stream force-close, KV-continuous. Beats Level 1's phase-2
// reprompt because the model never sees a fresh prefill — its KV
// state continues naturally after the injected close.
//
// Current implementation: AR-decode only. When budget_hook is set,
// backends MAY route generation through their AR path (skipping spec
// decode) — the perf trade-off is acceptable since this only kicks in
// for thinking-enabled requests. Spec-decode integration is a follow-up.
struct BudgetHook {
    // Token ids written over the sampled tokens, in order, once the
    // remaining generation budget `(n_gen - committed)` is at or below
    // `hard_limit_remaining`. An empty vector disables the hook.
    //
    // NOTE(review): the earlier wording of this field comment said the
    // Qwen3.x sequence is the canonical "Considering the limited
    // time..." summarize-and-stop lead-in (tokenized at server startup)
    // and that non-qwen arches use a single close-tag token, while the
    // block comment preceding this struct describes a size-1 sequence
    // for Qwen3.6 and a 3-token sequence for DeepSeek/laguna. Confirm
    // which description is current.
    std::vector<int32_t> close_token_ids{};

    // Remaining-budget threshold, in tokens, at which injection of
    // `close_token_ids` begins.
    int hard_limit_remaining{0};
};

struct GenerateRequest {
std::vector<int32_t> prompt;
int n_gen = 0;
Expand All @@ -65,6 +98,8 @@ struct GenerateRequest {
// When non-null, the spec decode loop uses these as draft overrides,
// bypassing draft model computation for covered positions.
const std::vector<int32_t> * hint_tokens = nullptr;
// Optional thinking-budget hook — see BudgetHook docs above.
BudgetHook budget_hook;
};

struct GenerateResult {
Expand All @@ -73,6 +108,19 @@ struct GenerateResult {
std::vector<int32_t> tokens;
double prefill_s = 0.0;
double decode_s = 0.0;
// True when the backend's Level 2 hook injected the </think> close
// sequence during this generation (vs. the model self-closing). The
// server uses this to attribute close_kind correctly: if the model
// produced </think> naturally we report "natural"; if the hook fired
// we report "hard". Without this flag, decoding the phase-1 token
// stream and grepping for "</think>" cannot distinguish the two
// (the injected close decodes identically).
bool budget_forced_close = false;
// True iff the AR decode loop's post-close watchdog detected an n-gram
// repetition loop and broke out early. Caller surfaces this so clients
// can mark the answer as unreliable rather than treating the
// (truncated) content as a clean response.
bool degenerate_decode_close = false;
};

// ─── Backend interface ──────────────────────────────────────────────────
Expand Down
149 changes: 142 additions & 7 deletions dflash/src/gemma4/gemma4_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -261,14 +261,84 @@ int Gemma4Backend::do_prefill(const std::vector<int32_t> & tokens,

bool Gemma4Backend::do_decode(int committed, int n_gen,
std::vector<int32_t> & out_tokens,
const DaemonIO & io) {
const DaemonIO & io,
const BudgetHook & budget_hook,
bool * forced_close_out) {
const int hidden = w_.n_embd;
const int vocab = w_.n_vocab;
std::vector<float> embed_buf(hidden);
std::vector<float> logits;

// Budget force-close state — same shape as qwen35's maybe_force_close.
// See dflash/src/common/model_backend.h BudgetHook docs for the
// single- vs multi-token close-tag semantics.
bool budget_close_started = false;
int close_inject_pos = 0;
// Capture entry KV position so the budget check is in the
// "generated since entry" frame, not the absolute KV frame.
// committed_now (KV position) = prompt_len + tokens_generated; n_gen
// is the gen-only count (or remaining-budget remap from the
// spec-decode tail-off). Without this entry capture, force-close
// fires prompt_len tokens early on prompted requests and goes
// negative immediately after a tail-off. (Mirrors qwen35 fix 5c785f0
// — same bug since this lambda was ported verbatim.)
const int committed_at_entry = committed;
// Budget force-close hook, applied to each freshly sampled token before
// it is committed.
//
//   tok           - sampled token; overwritten in place when the hook
//                   injects a close token.
//   committed_now - current absolute KV position (same frame as
//                   committed_at_entry).
//
// Behaviour, in order:
//   1. close_token_ids empty            -> hook disabled, no-op.
//   2. close sequence started, not done -> overwrite tok with the next
//                                          close token.
//   3. close sequence fully emitted     -> no-op.
//   4. otherwise, once
//        n_gen - (committed_now - committed_at_entry)
//          <= hard_limit_remaining
//      start the sequence with close_token_ids.front().
//
// *forced_close_out (when non-null) is set whenever the hook actually
// replaces a sampled token with a different close token.
auto maybe_force_close = [&](int32_t & tok, int committed_now) {
    if (budget_hook.close_token_ids.empty()) return;

    const int close_len = static_cast<int>(budget_hook.close_token_ids.size());
    if (budget_close_started && close_inject_pos < close_len) {
        const int32_t inj = budget_hook.close_token_ids[close_inject_pos];
        std::fprintf(stderr,
            "[budget-hook] gemma4 close-seq continue %d/%zu: overriding "
            "sampled token %d with %d\n",
            close_inject_pos + 1,
            budget_hook.close_token_ids.size(), tok, inj);
        // A multi-token sequence can be entered through the "model
        // sampled close[0] by itself" path below, which leaves
        // *forced_close_out untouched. If a later sampled token differs
        // from the close token, the hook (not the model) is completing
        // the close, so it must be attributed as forced.
        if (tok != inj && forced_close_out) *forced_close_out = true;
        tok = inj;
        ++close_inject_pos;
        return;
    }
    if (budget_close_started) return;

    // Budget check runs in the "generated since entry" frame; n_gen is a
    // generation-only count while committed_now is an absolute position.
    const int generated = committed_now - committed_at_entry;
    const int remaining = n_gen - generated;
    if (remaining > budget_hook.hard_limit_remaining) return;

    const int32_t first_close = budget_hook.close_token_ids.front();
    if (tok == first_close) {
        // Model produced the first close token on its own: nothing to
        // override yet, but any remaining close tokens are still driven
        // by the continuation branch above.
        budget_close_started = true;
        close_inject_pos = 1;
        return;
    }

    // Report `generated` against n_gen (same frame); the absolute KV
    // position is kept as a separate field for correlation with other
    // logs.
    std::fprintf(stderr,
        "[budget-hook] gemma4 force-close at committed=%d generated=%d/%d "
        "(remaining=%d <= hard_limit=%d): overriding token %d "
        "with close[0]=%d (seq len %zu)\n",
        committed_now, generated, n_gen, remaining,
        budget_hook.hard_limit_remaining, tok, first_close,
        budget_hook.close_token_ids.size());
    tok = first_close;
    budget_close_started = true;
    close_inject_pos = 1;
    if (forced_close_out) *forced_close_out = true;
};

for (int i = 0; i < n_gen; ++i) {
int32_t tok = out_tokens.back();
// Seed for this iteration's embed step:
// - Normal case: previous iteration just pushed a sampled
// token onto out_tokens; we re-embed it to advance KV +
// produce next-token logits.
// - Empty case (spec-decode tail-off at iter 0): no prior
// iteration ran, so use cache_.last_tok — that's the
// prefill argmax that spec-decode would have consumed as
// its initial seed. Mirrors qwen35's initial_emitted=1
// pattern; without this, out_tokens.back() on an empty
// vector is UB. (Codex r2 P2 follow-up: the previous fix
// pushed last_tok onto out_tokens here in the caller, but
// that grew out_tokens by an uncounted extra token and the
// caller's `result.tokens.size()` over-counted against the
// budget. Reading from cache instead keeps the budget
// honest.)
int32_t tok = out_tokens.empty() ? cache_.last_tok : out_tokens.back();

// Embed single token
w_.embedder.embed(&tok, 1, embed_buf.data());
Expand All @@ -292,6 +362,7 @@ bool Gemma4Backend::do_decode(int committed, int n_gen,
if (logits[j] > best) { best = logits[j]; next = j; }
}
}
maybe_force_close(next, committed);

out_tokens.push_back(next);
io.emit(next);
Expand All @@ -310,7 +381,9 @@ bool Gemma4Backend::do_decode(int committed, int n_gen,

bool Gemma4Backend::do_spec_decode(int committed, int n_gen,
std::vector<int32_t> & out_tokens,
const DaemonIO & io) {
const DaemonIO & io,
const BudgetHook * budget_hook,
bool * forced_close_out) {
const int hidden = w_.n_embd;
int32_t last_tok = cache_.last_tok;

Expand All @@ -336,6 +409,60 @@ bool Gemma4Backend::do_spec_decode(int committed, int n_gen,
while (n_generated < n_gen) {
const int need_commit_budget = n_gen - n_generated;

// Budget tail-off: when remaining budget is within one spec-decode
// batch of the force-close threshold, hand off to do_decode for the
// tail. AR handles the close-token override cleanly; spec-decode's
// verify-and-accept loop can't safely inject a token mid-batch
// without rewriting KV.
//
// Gemma4's do_decode reads `out_tokens.back()` as the seed each
// iter. After the first spec-decode iteration the most-recently-
// committed token is on out_tokens, but on a small-budget request
// (budget_tokens <= reply_budget + q_len) tail-off can fire on
// iter 0 before out_tokens has been seeded. Codex review flagged
// the resulting UB on out_tokens.back(); we set cache_.last_tok
// and let do_decode pick it up when out_tokens is empty.
//
// Budget accounting (codex r2 P2): in the previous patch we
// also push_back'd last_tok before calling do_decode, which
// grew out_tokens by an extra token outside the budget — the
// caller (http_server) then saw `result.tokens.size() ==
// need_commit_budget + 1` and double-counted that seed against
// the budget. Mirror qwen35 instead: cache the seed via
// cache_.last_tok, leave out_tokens untouched, and have
// do_decode read the seed from cache when out_tokens is empty
// (initial_emitted=1 path below). That keeps the budget honest
// and matches the symmetry between qwen35 and gemma4 backends.
if (budget_hook && !budget_hook->close_token_ids.empty()) {
int hard = budget_hook->hard_limit_remaining;
if (need_commit_budget <= hard + q_len) {
std::fprintf(stderr,
"[budget-hook] gemma4 spec-decode tail-off at "
"committed=%d/%d (remaining=%d, hard_limit=%d, "
"batch=%d) — switching to AR\n",
committed, n_gen, need_commit_budget, hard, q_len);
step_graph_destroy(draft_sg);
cache_.last_tok = last_tok; // do_decode reads this when out_tokens empty
BudgetHook tail_hook = *budget_hook;
int ar_n_gen = need_commit_budget;
bool ok = do_decode(committed, ar_n_gen, out_tokens, io,
tail_hook, forced_close_out);
auto t_dec1 = std::chrono::steady_clock::now();
const double decode_s = std::chrono::duration<double>(t_dec1 - t_dec0).count();
const int total_draft_pos = std::max(1, n_draft_steps * q_len);
const double accept_pct = 100.0 * (double)n_accept_sum / (double)total_draft_pos;
std::fprintf(stderr,
"[gemma4-spec] tail-off-stats tokens=%d time=%.3f s "
"speed=%.2f tok/s steps=%d accepted=%d/%d (%.1f%%)\n",
n_generated, decode_s,
n_generated > 0 ? n_generated / decode_s : 0.0,
n_draft_steps, n_accept_sum, total_draft_pos,
accept_pct);
io.emit(-1);
return ok;
}
}

// 1. Build noise input: [last_tok, MASK, MASK, ..., MASK]
noise_ids[0] = last_tok;
for (int i = 1; i < q_len; i++) noise_ids[i] = target->mask_token_id();
Expand Down Expand Up @@ -529,7 +656,9 @@ GenerateResult Gemma4Backend::generate(const GenerateRequest & req,
&& !sampler_.needs_logit_processing();

if (can_spec) {
if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io)) {
if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io,
&req.budget_hook,
&result.budget_forced_close)) {
result.error = "spec_decode";
return result;
}
Expand Down Expand Up @@ -578,7 +707,9 @@ GenerateResult Gemma4Backend::generate(const GenerateRequest & req,
}

if (req.n_gen > 1) {
if (!do_decode(committed, req.n_gen - 1, result.tokens, out_io)) {
if (!do_decode(committed, req.n_gen - 1, result.tokens, out_io,
req.budget_hook,
&result.budget_forced_close)) {
result.error = "decode";
return result;
}
Expand Down Expand Up @@ -694,7 +825,9 @@ GenerateResult Gemma4Backend::restore_and_generate(int slot,
&& sampler_.temp == 0.0f;

if (can_spec) {
if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io)) {
if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io,
&req.budget_hook,
&result.budget_forced_close)) {
result.error = "spec_decode";
return result;
}
Expand Down Expand Up @@ -743,7 +876,9 @@ GenerateResult Gemma4Backend::restore_and_generate(int slot,
}

if (req.n_gen > 1) {
if (!do_decode(committed, req.n_gen - 1, result.tokens, out_io)) {
if (!do_decode(committed, req.n_gen - 1, result.tokens, out_io,
req.budget_hook,
&result.budget_forced_close)) {
result.error = "decode";
return result;
}
Expand Down
18 changes: 16 additions & 2 deletions dflash/src/gemma4/gemma4_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,28 @@ class Gemma4Backend : public ModelBackend {
int kv_offset = 0);

// Autoregressive decode loop.
// budget_hook (when close_token_ids is non-empty) overrides the next
// sampled token(s) with the close-tag sequence once
// (n_gen - tokens generated since this call began) <=
// hard_limit_remaining. `committed` is the absolute KV position at
// entry (prompt_len + tokens already generated), so it is not
// subtracted from n_gen directly. Mirrors qwen35's do_ar_decode. For
// Gemma4 the close tag is typically `<channel|>` (single token in the
// gemma4 vocab).
// forced_close_out, when non-null, is set to true iff the hook injected
// the close sequence (vs. the model self-closing). See qwen35_backend.h
// for full rationale.
bool do_decode(int committed, int n_gen,
std::vector<int32_t> & out_tokens,
const DaemonIO & io);
const DaemonIO & io,
const BudgetHook & budget_hook = {},
bool * forced_close_out = nullptr);

// DFlash speculative decode loop.
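// When budget_hook is non-null with a non-empty close_token_ids and the
// remaining budget (n_gen - generated) is <= hard_limit_remaining +
// q_len (one spec-decode batch of headroom), stops speculating and
// hands the rest of the generation to do_decode so the force-close
// override fires cleanly with KV state intact.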
bool do_spec_decode(int committed, int n_gen,
std::vector<int32_t> & out_tokens,
const DaemonIO & io);
const DaemonIO & io,
const BudgetHook * budget_hook = nullptr,
bool * forced_close_out = nullptr);
};

} // namespace dflash::common
Loading
Loading