From b3163f4391d29ca0f0f966e0b1111914a6de792a Mon Sep 17 00:00:00 2001
From: Erik LaBianca <erik.labianca@gmail.com>
Date: Mon, 25 May 2026 20:27:10 -0400
Subject: [PATCH] feat(gemma4): wire prefill/decode timing into GenerateResult
 (mirrors qwen35 3b80fa8)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Gemma4Backend's generate() and restore_and_generate() never populated
result.prefill_s or result.decode_s, so usage.timings.{prefill_ms,
decode_ms, decode_tokens_per_sec} surfaced as 0.0 for every gemma4
request. Bench tooling that aggregates per-case decode rates was
silently falling back to wall-time math, conflating prefill + HTTP
overhead with decode.

Add steady_clock measurements around the do_prefill call and around
the entire decode block (both spec-decode and AR fallback paths) in
both entry points. Same shape as qwen35's instrumentation from
commit 3b80fa8.

Verified on bragi (RTX 5090 Laptop) with gemma-4-31b-it Q4_K_M:
  prompt=30 comp=400 wall=19.66s
  timings: prefill_ms=117.4 decode_ms=19534.5 decode_tps=20.5

(The earlier "2.5 tok/s" number for 31b was a request queued behind an
in-flight sweep — wall-based math gave a wrong rate. The server's
ar_decode log line already showed ~19 tok/s consistently; this commit
makes the same number flow through to the API response.)
---
 server/src/gemma4/gemma4_backend.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
diff --git a/server/src/gemma4/gemma4_backend.cpp b/server/src/gemma4/gemma4_backend.cpp
index 19b708f67..d415feee9 100644
--- a/server/src/gemma4/gemma4_backend.cpp
+++ b/server/src/gemma4/gemma4_backend.cpp
@@ -632,7 +632,13 @@ GenerateResult Gemma4Backend::generate(const GenerateRequest & req,
 
     cache_.cur_pos = 0;
 
+    // Timing: prefill wall-clock — surfaced in usage.timings.prefill_ms
+    // by http_server.cpp via the GenTimings struct from sse_emitter.
+    // Mirrors qwen35_backend.cpp's instrumentation (commit 3b80fa8).
+    auto t_prefill_start = std::chrono::steady_clock::now();
     const int committed = do_prefill(req.prompt, out_io, /*kv_offset=*/0);
+    result.prefill_s = std::chrono::duration<double>(
+        std::chrono::steady_clock::now() - t_prefill_start).count();
     if (committed < 0) {
         result.error = "prefill";
         return result;
@@ -648,6 +654,7 @@ GenerateResult Gemma4Backend::generate(const GenerateRequest & req,
         cache_.cur_pos = committed;
     }
 
+    auto t_decode_start = std::chrono::steady_clock::now();
     if (req.n_gen > 0) {
         // Try speculative decode if draft is available and temp==0
         const bool can_spec = dflash_target_
@@ -719,6 +726,8 @@ GenerateResult Gemma4Backend::generate(const GenerateRequest & req,
     } else {
         out_io.emit(-1);
     }
+    result.decode_s = std::chrono::duration<double>(
+        std::chrono::steady_clock::now() - t_decode_start).count();
     result.ok = true;
     return result;
 }
@@ -784,6 +793,10 @@ GenerateResult Gemma4Backend::restore_and_generate(int slot,
     const int prompt_len = (int)req.prompt.size();
     int committed = snap_pos;
 
+    // Timing: prefill wall-clock for the delta. Same instrumentation as
+    // generate() — only the delta range is prefilled; the snap-pos prefix
+    // is already in cache. Surface via result.prefill_s for usage.timings.
+    auto t_prefill_start = std::chrono::steady_clock::now();
     if (prompt_len > snap_pos) {
         // Compute delta (tokens after the snapshot)
         std::vector<int32_t> delta(req.prompt.begin() + snap_pos, req.prompt.end());
@@ -798,6 +811,8 @@ GenerateResult Gemma4Backend::restore_and_generate(int slot,
         return result;
     }
     // else: prompt_len == snap_pos → no delta, committed stays at snap_pos
+    result.prefill_s = std::chrono::duration<double>(
+        std::chrono::steady_clock::now() - t_prefill_start).count();
 
     // Inline snapshot at snap_pos for prefix cache (new snap from this request)
     if (req.snap_slot >= 0 && req.snap_pos > 0 && req.snap_pos <= committed) {
@@ -818,6 +833,7 @@ GenerateResult Gemma4Backend::restore_and_generate(int slot,
     }
 
     // Generate
+    auto t_decode_start = std::chrono::steady_clock::now();
     if (req.n_gen > 0) {
         const bool can_spec = dflash_target_
             && !draft_parked_
@@ -888,6 +904,8 @@ GenerateResult Gemma4Backend::restore_and_generate(int slot,
     } else {
         out_io.emit(-1);
     }
+    result.decode_s = std::chrono::duration<double>(
+        std::chrono::steady_clock::now() - t_decode_start).count();
     result.ok = true;
     return result;
 }