From b3163f4391d29ca0f0f966e0b1111914a6de792a Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Mon, 25 May 2026 20:27:10 -0400 Subject: [PATCH] feat(gemma4): wire prefill/decode timing into GenerateResult (mirrors qwen35 3b80fa8) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gemma4Backend's generate() and restore_and_generate() never populated result.prefill_s or result.decode_s, so usage.timings.{prefill_ms, decode_ms, decode_tokens_per_sec} surfaced as 0.0 for every gemma4 request. Bench tooling that aggregates per-case decode rates was silently falling back to wall-time math, conflating prefill + HTTP overhead with decode. Add steady_clock measurements around the do_prefill call and around the entire decode block (both spec-decode and AR fallback paths) in both entry points. Same shape as qwen35's instrumentation from commit 3b80fa8. Verified on bragi (RTX 5090 Laptop) with gemma-4-31b-it Q4_K_M: prompt=30 comp=400 wall=19.66s timings: prefill_ms=117.4 decode_ms=19534.5 decode_tps=20.5 (The earlier "2.5 tok/s" number for 31b was a request queued behind an in-flight sweep — wall-based math gave a wrong rate. The server's ar_decode log line already showed ~19 tok/s consistently; this commit makes the same number flow through to the API response.) --- server/src/gemma4/gemma4_backend.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/server/src/gemma4/gemma4_backend.cpp b/server/src/gemma4/gemma4_backend.cpp index 19b708f67..d415feee9 100644 --- a/server/src/gemma4/gemma4_backend.cpp +++ b/server/src/gemma4/gemma4_backend.cpp @@ -632,7 +632,13 @@ GenerateResult Gemma4Backend::generate(const GenerateRequest & req, cache_.cur_pos = 0; + // Timing: prefill wall-clock — surfaced in usage.timings.prefill_ms + // by http_server.cpp via the GenTimings struct from sse_emitter. + // Mirrors qwen35_backend.cpp's instrumentation (commit 3b80fa8). + auto t_prefill_start = std::chrono::steady_clock::now(); const int committed = do_prefill(req.prompt, out_io, /*kv_offset=*/0); + result.prefill_s = std::chrono::duration( + std::chrono::steady_clock::now() - t_prefill_start).count(); if (committed < 0) { result.error = "prefill"; return result; @@ -648,6 +654,7 @@ GenerateResult Gemma4Backend::generate(const GenerateRequest & req, cache_.cur_pos = committed; } + auto t_decode_start = std::chrono::steady_clock::now(); if (req.n_gen > 0) { // Try speculative decode if draft is available and temp==0 const bool can_spec = dflash_target_ @@ -719,6 +726,8 @@ GenerateResult Gemma4Backend::generate(const GenerateRequest & req, } else { out_io.emit(-1); } + result.decode_s = std::chrono::duration( + std::chrono::steady_clock::now() - t_decode_start).count(); result.ok = true; return result; } @@ -784,6 +793,10 @@ GenerateResult Gemma4Backend::restore_and_generate(int slot, const int prompt_len = (int)req.prompt.size(); int committed = snap_pos; + // Timing: prefill wall-clock for the delta. Same instrumentation as + // generate() — only the delta range is prefilled; the snap-pos prefix + // is already in cache. Surface via result.prefill_s for usage.timings. + auto t_prefill_start = std::chrono::steady_clock::now(); if (prompt_len > snap_pos) { // Compute delta (tokens after the snapshot) std::vector delta(req.prompt.begin() + snap_pos, req.prompt.end()); @@ -798,6 +811,8 @@ GenerateResult Gemma4Backend::restore_and_generate(int slot, return result; } // else: prompt_len == snap_pos → no delta, committed stays at snap_pos + result.prefill_s = std::chrono::duration( + std::chrono::steady_clock::now() - t_prefill_start).count(); // Inline snapshot at snap_pos for prefix cache (new snap from this request) if (req.snap_slot >= 0 && req.snap_pos > 0 && req.snap_pos <= committed) { @@ -818,6 +833,7 @@ GenerateResult Gemma4Backend::restore_and_generate(int slot, } // Generate + auto t_decode_start = std::chrono::steady_clock::now(); if (req.n_gen > 0) { const bool can_spec = dflash_target_ && !draft_parked_ @@ -888,6 +904,8 @@ GenerateResult Gemma4Backend::restore_and_generate(int slot, } else { out_io.emit(-1); } + result.decode_s = std::chrono::duration( + std::chrono::steady_clock::now() - t_decode_start).count(); result.ok = true; return result; }