WaffleBits · WaffleBits · Jun 26, 2026 · Jun 24, 2026 · Jun 25, 2026 · Jun 26, 2026
diff --git a/README.md b/README.md
@@ -13,14 +13,17 @@ replayable scheduling traces, and canary/shadow release decisions.
 - Conservative paged-KV reservations that prevent admitted requests from
   exceeding declared capacity.
 - Round-robin decode scheduling so active requests make measurable progress.
-- Deterministic workload replay with a machine-readable trace fingerprint.
+- Deterministic workload replay with a machine-readable trace fingerprint,
+  queue-pressure summary, active-capacity summary, and KV-pressure summary.
 - Baseline/candidate release validation with `promote`, `hold`, and `rollback`
   outcomes.
 - Backend mirror normalization for vLLM/SGLang-style serving observations
   before the release gate runs.
 - Exact output checks, model-aware numeric tolerances for backend drift,
   per-segment release summaries, error-rate deltas, p95 latency regression
-  policy, tests, and CI.
+  policy, time-to-first-token (TTFT) and decode-token p95 checks, KV
+  memory-pressure reporting, model-version transitions, token-trace
+  fingerprints, structured triage owner hints, tests, and CI.
 
 ## Quick Start
 
@@ -31,6 +34,10 @@ cargo run --release -- replay \
   --input fixtures/workload.json \
   --output artifacts/workload-replay.json
 
+cargo run --release -- replay \
+  --input fixtures/workload_pressure.json \
+  --output artifacts/workload-pressure-replay.json
+
 cargo run --release -- gate \
   --input fixtures/release_gate_safe.json \
   --output artifacts/release-gate-promote.json
@@ -53,12 +60,20 @@ an added error produces `rollback`.
 The numeric-tolerance fixture produces `promote` while reporting four tolerated
 numeric comparisons across a baseline-runtime to candidate-runtime segment.
 The backend-mirror fixture converts vLLM/SGLang-style request observations into
-the same release gate and produces `promote` with a vLLM to SGLang segment.
+the same release gate and produces `promote` with a vLLM to SGLang segment,
+model-version transition metadata, queue depth, KV memory pressure, TTFT, and
+decode-token p95 telemetry.
 
 The checked workload fixture completes four requests in 11 scheduler ticks,
-peaks at 12 of 20 KV pages, returns all pages on completion, and emits trace
+peaks at 12 of 20 KV pages, records three queue-pressure ticks, records three
+active-capacity ticks, returns all pages on completion, and emits trace
 fingerprint `394166dc24d38b6c`.
 
+The pressure fixture completes eight mixed-priority requests in 27 scheduler
+ticks, records a maximum queue depth of five, reaches all three active slots,
+peaks at 13 of 15 KV pages, reports 86.666667% peak KV pressure, and returns
+all pages on completion.
+
 ## Runtime Model
 
 Each request declares prompt length, maximum output length, priority, and
@@ -74,8 +89,9 @@ Every tick records:
 - queued and active counts; and
 - used KV pages.
 
-The replay report includes a stable trace fingerprint, peak KV pages, total
-ticks, and completion count.
+The replay report includes a stable trace fingerprint, peak KV pages, peak KV
+pressure percentage, maximum queued and active request counts, queue-pressure
+ticks, active-capacity ticks, total ticks, and completion count.
 
 ## Backend Mirror Adapter
 
@@ -88,21 +104,28 @@ output token IDs, explicit output fingerprints, and optional numeric output
 vectors. Successful observations must carry output material so correctness
 checks remain auditable. Token IDs and numeric vectors are converted into
 stable FNV-1a fingerprints when an engine-specific fingerprint is not supplied.
+Observations may also carry model version, queue depth, KV page usage, TTFT,
+decode-token latencies, and token-trace fingerprints. Those fields let the gate
+surface rollout context and hold a candidate when latency or memory-pressure
+telemetry exceeds policy thresholds, even if output correctness is intact.
 
 ## Release Policy
 
 The gate joins mirrored baseline and candidate observations by request ID.
 Outputs can be validated either by exact fingerprint or by a configured
 numeric tolerance scoped to model, candidate backend, and accelerator. Reports
 include aggregate metrics plus segment summaries so hardware/backend-specific
-regressions remain visible.
+regressions remain visible. Hold and rollback reports also include structured
+triage items that name the failed signal, the recommended response, an owner
+hint, and the next investigation action.
 
 | Signal | Response |
 |---|---|
 | Output mismatch above policy | `rollback` |
 | Numeric drift above model/backend policy | `rollback` |
 | Error-rate increase above policy | `rollback` |
 | p95 latency regression above policy | `hold` |
+| TTFT, decode-token p95, or memory-pressure regression above policy | `hold` |
 | Missing or insufficient matched traffic | `hold` |
 | Complete evidence within policy | `promote` |
 

diff --git a/artifacts/backend-mirror-report.json b/artifacts/backend-mirror-report.json
@@ -1,5 +1,5 @@
 {
-  "schema_version": 2,
+  "schema_version": 3,
   "decision": "promote",
   "matched_requests": 4,
   "baseline_requests": 4,
@@ -17,22 +17,45 @@
   "baseline_p95_latency_ms": 28.0,
   "candidate_p95_latency_ms": 27.2,
   "p95_latency_regression_pct": -2.857143,
+  "baseline_p95_ttft_ms": 9.0,
+  "candidate_p95_ttft_ms": 8.5,
+  "ttft_regression_pct": -5.555556,
+  "baseline_decode_token_p95_ms": 7.5,
+  "candidate_decode_token_p95_ms": 7.0,
+  "decode_token_p95_regression_pct": -6.666667,
+  "max_candidate_queue_depth": 6,
+  "max_candidate_memory_pressure_pct": 60.0,
+  "token_trace_pairs": 4,
+  "token_trace_mismatch_rate": 0.0,
   "segments": [
     {
       "model": "decoder-7b",
       "baseline_backend": "vllm",
       "candidate_backend": "sglang",
       "accelerator": "h100",
+      "baseline_model_version": "decoder-7b@baseline-2026-06-24",
+      "candidate_model_version": "decoder-7b@candidate-2026-06-24",
       "matched_requests": 4,
       "output_mismatch_rate": 0.0,
       "baseline_error_rate": 0.0,
       "candidate_error_rate": 0.0,
       "baseline_p95_latency_ms": 28.0,
       "candidate_p95_latency_ms": 27.2,
-      "p95_latency_regression_pct": -2.857143
+      "p95_latency_regression_pct": -2.857143,
+      "baseline_p95_ttft_ms": 9.0,
+      "candidate_p95_ttft_ms": 8.5,
+      "ttft_regression_pct": -5.555556,
+      "baseline_decode_token_p95_ms": 7.5,
+      "candidate_decode_token_p95_ms": 7.0,
+      "decode_token_p95_regression_pct": -6.666667,
+      "max_candidate_queue_depth": 6,
+      "max_candidate_memory_pressure_pct": 60.0,
+      "token_trace_pairs": 4,
+      "token_trace_mismatch_rate": 0.0
     }
   ],
+  "triage": [],
   "reasons": [
-    "candidate stayed within correctness, reliability, and latency policy"
+    "candidate stayed within correctness, reliability, latency, and telemetry policy"
   ]
 }
diff --git a/artifacts/release-gate-numeric-tolerance.json b/artifacts/release-gate-numeric-tolerance.json
@@ -1,5 +1,5 @@
 {
-  "schema_version": 2,
+  "schema_version": 3,
   "decision": "promote",
   "matched_requests": 4,
   "baseline_requests": 4,
@@ -17,22 +17,45 @@
   "baseline_p95_latency_ms": 28.0,
   "candidate_p95_latency_ms": 27.6,
   "p95_latency_regression_pct": -1.428571,
+  "baseline_p95_ttft_ms": null,
+  "candidate_p95_ttft_ms": null,
+  "ttft_regression_pct": null,
+  "baseline_decode_token_p95_ms": null,
+  "candidate_decode_token_p95_ms": null,
+  "decode_token_p95_regression_pct": null,
+  "max_candidate_queue_depth": null,
+  "max_candidate_memory_pressure_pct": null,
+  "token_trace_pairs": 0,
+  "token_trace_mismatch_rate": 0.0,
   "segments": [
     {
       "model": "decoder-7b",
       "baseline_backend": "baseline-runtime",
       "candidate_backend": "candidate-runtime",
       "accelerator": "h100",
+      "baseline_model_version": "unspecified",
+      "candidate_model_version": "unspecified",
       "matched_requests": 4,
       "output_mismatch_rate": 0.0,
       "baseline_error_rate": 0.0,
       "candidate_error_rate": 0.0,
       "baseline_p95_latency_ms": 28.0,
       "candidate_p95_latency_ms": 27.6,
-      "p95_latency_regression_pct": -1.428571
+      "p95_latency_regression_pct": -1.428571,
+      "baseline_p95_ttft_ms": null,
+      "candidate_p95_ttft_ms": null,
+      "ttft_regression_pct": null,
+      "baseline_decode_token_p95_ms": null,
+      "candidate_decode_token_p95_ms": null,
+      "decode_token_p95_regression_pct": null,
+      "max_candidate_queue_depth": null,
+      "max_candidate_memory_pressure_pct": null,
+      "token_trace_pairs": 0,
+      "token_trace_mismatch_rate": 0.0
     }
   ],
+  "triage": [],
   "reasons": [
-    "candidate stayed within correctness, reliability, and latency policy"
+    "candidate stayed within correctness, reliability, latency, and telemetry policy"
   ]
 }
diff --git a/artifacts/release-gate-promote.json b/artifacts/release-gate-promote.json
@@ -1,5 +1,5 @@
 {
-  "schema_version": 2,
+  "schema_version": 3,
   "decision": "promote",
   "matched_requests": 4,
   "baseline_requests": 4,
@@ -17,22 +17,45 @@
   "baseline_p95_latency_ms": 16.0,
   "candidate_p95_latency_ms": 16.7,
   "p95_latency_regression_pct": 4.375,
+  "baseline_p95_ttft_ms": null,
+  "candidate_p95_ttft_ms": null,
+  "ttft_regression_pct": null,
+  "baseline_decode_token_p95_ms": null,
+  "candidate_decode_token_p95_ms": null,
+  "decode_token_p95_regression_pct": null,
+  "max_candidate_queue_depth": null,
+  "max_candidate_memory_pressure_pct": null,
+  "token_trace_pairs": 0,
+  "token_trace_mismatch_rate": 0.0,
   "segments": [
     {
       "model": "unspecified",
       "baseline_backend": "unspecified",
       "candidate_backend": "unspecified",
       "accelerator": "unspecified",
+      "baseline_model_version": "unspecified",
+      "candidate_model_version": "unspecified",
       "matched_requests": 4,
       "output_mismatch_rate": 0.0,
       "baseline_error_rate": 0.0,
       "candidate_error_rate": 0.0,
       "baseline_p95_latency_ms": 16.0,
       "candidate_p95_latency_ms": 16.7,
-      "p95_latency_regression_pct": 4.375
+      "p95_latency_regression_pct": 4.375,
+      "baseline_p95_ttft_ms": null,
+      "candidate_p95_ttft_ms": null,
+      "ttft_regression_pct": null,
+      "baseline_decode_token_p95_ms": null,
+      "candidate_decode_token_p95_ms": null,
+      "decode_token_p95_regression_pct": null,
+      "max_candidate_queue_depth": null,
+      "max_candidate_memory_pressure_pct": null,
+      "token_trace_pairs": 0,
+      "token_trace_mismatch_rate": 0.0
     }
   ],
+  "triage": [],
   "reasons": [
-    "candidate stayed within correctness, reliability, and latency policy"
+    "candidate stayed within correctness, reliability, latency, and telemetry policy"
   ]
 }
diff --git a/artifacts/release-gate-rollback.json b/artifacts/release-gate-rollback.json
@@ -1,5 +1,5 @@
 {
-  "schema_version": 2,
+  "schema_version": 3,
   "decision": "rollback",
   "matched_requests": 4,
   "baseline_requests": 4,
@@ -17,19 +17,57 @@
   "baseline_p95_latency_ms": 16.0,
   "candidate_p95_latency_ms": 14.5,
   "p95_latency_regression_pct": -9.375,
+  "baseline_p95_ttft_ms": null,
+  "candidate_p95_ttft_ms": null,
+  "ttft_regression_pct": null,
+  "baseline_decode_token_p95_ms": null,
+  "candidate_decode_token_p95_ms": null,
+  "decode_token_p95_regression_pct": null,
+  "max_candidate_queue_depth": null,
+  "max_candidate_memory_pressure_pct": null,
+  "token_trace_pairs": 0,
+  "token_trace_mismatch_rate": 0.0,
   "segments": [
     {
       "model": "unspecified",
       "baseline_backend": "unspecified",
       "candidate_backend": "unspecified",
       "accelerator": "unspecified",
+      "baseline_model_version": "unspecified",
+      "candidate_model_version": "unspecified",
       "matched_requests": 4,
       "output_mismatch_rate": 0.333333,
       "baseline_error_rate": 0.0,
       "candidate_error_rate": 0.25,
       "baseline_p95_latency_ms": 16.0,
       "candidate_p95_latency_ms": 14.5,
-      "p95_latency_regression_pct": -9.375
+      "p95_latency_regression_pct": -9.375,
+      "baseline_p95_ttft_ms": null,
+      "candidate_p95_ttft_ms": null,
+      "ttft_regression_pct": null,
+      "baseline_decode_token_p95_ms": null,
+      "candidate_decode_token_p95_ms": null,
+      "decode_token_p95_regression_pct": null,
+      "max_candidate_queue_depth": null,
+      "max_candidate_memory_pressure_pct": null,
+      "token_trace_pairs": 0,
+      "token_trace_mismatch_rate": 0.0
+    }
+  ],
+  "triage": [
+    {
+      "signal": "output_correctness",
+      "response": "rollback",
+      "owner_hint": "model_runtime_correctness",
+      "action": "block the candidate and compare output fingerprints or numeric tolerance scope",
+      "reason": "output mismatch rate 0.3333 exceeded 0.0000"
+    },
+    {
+      "signal": "serving_reliability",
+      "response": "rollback",
+      "owner_hint": "serving_runtime",
+      "action": "block the candidate and inspect candidate errors before rerunning the gate",
+      "reason": "error-rate increase 0.2500 exceeded 0.0100"
     }
   ],
   "reasons": [