From cf99d784ac6c9065159987059eec3d06bded770a Mon Sep 17 00:00:00 2001 From: WaffleBits <19478926+WaffleBits@users.noreply.github.com> Date: Wed, 24 Jun 2026 17:33:00 -0400 Subject: [PATCH 1/3] Add inference telemetry release gates --- README.md | 12 +- artifacts/backend-mirror-report.json | 26 +- artifacts/release-gate-numeric-tolerance.json | 26 +- artifacts/release-gate-promote.json | 26 +- artifacts/release-gate-rollback.json | 24 +- docs/ARCHITECTURE.md | 23 +- docs/RELEASE_VALIDATION.md | 25 +- fixtures/backend_mirror_vllm_sglang.json | 67 ++++- src/adapter.rs | 79 +++++- src/lib.rs | 2 +- src/release.rs | 265 +++++++++++++++++- tests/adapter.rs | 129 +++++++++ tests/release.rs | 11 +- 13 files changed, 681 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index f88985f..4a38267 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,8 @@ replayable scheduling traces, and canary/shadow release decisions. before the release gate runs. - Exact output checks, model-aware numeric tolerances for backend drift, per-segment release summaries, error-rate deltas, p95 latency regression - policy, tests, and CI. + policy, TTFT and decode-token p95 checks, KV memory-pressure reporting, + model-version transitions, token-trace fingerprints, tests, and CI. ## Quick Start @@ -53,7 +54,9 @@ an added error produces `rollback`. The numeric-tolerance fixture produces `promote` while reporting four tolerated numeric comparisons across a baseline-runtime to candidate-runtime segment. The backend-mirror fixture converts vLLM/SGLang-style request observations into -the same release gate and produces `promote` with a vLLM to SGLang segment. +the same release gate and produces `promote` with a vLLM to SGLang segment, +model-version transition metadata, queue depth, KV memory pressure, TTFT, and +decode-token p95 telemetry. The checked workload fixture completes four requests in 11 scheduler ticks, peaks at 12 of 20 KV pages, returns all pages on completion, and emits trace @@ -88,6 +91,10 @@ output token IDs, explicit output fingerprints, and optional numeric output vectors. Successful observations must carry output material so correctness checks remain auditable. Token IDs and numeric vectors are converted into stable FNV-1a fingerprints when an engine-specific fingerprint is not supplied. +Observations may also carry model version, queue depth, KV page usage, TTFT, +decode-token latencies, and token-trace fingerprints. Those fields let the gate +surface rollout context and hold a candidate when latency or memory-pressure +telemetry crosses policy even if output correctness is intact. ## Release Policy @@ -103,6 +110,7 @@ regressions remain visible. | Numeric drift above model/backend policy | `rollback` | | Error-rate increase above policy | `rollback` | | p95 latency regression above policy | `hold` | +| TTFT, decode-token p95, or memory-pressure regression above policy | `hold` | | Missing or insufficient matched traffic | `hold` | | Complete evidence within policy | `promote` | diff --git a/artifacts/backend-mirror-report.json b/artifacts/backend-mirror-report.json index 1e4891d..1f5b3fe 100644 --- a/artifacts/backend-mirror-report.json +++ b/artifacts/backend-mirror-report.json @@ -17,22 +17,44 @@ "baseline_p95_latency_ms": 28.0, "candidate_p95_latency_ms": 27.2, "p95_latency_regression_pct": -2.857143, + "baseline_p95_ttft_ms": 9.0, + "candidate_p95_ttft_ms": 8.5, + "ttft_regression_pct": -5.555556, + "baseline_decode_token_p95_ms": 7.5, + "candidate_decode_token_p95_ms": 7.0, + "decode_token_p95_regression_pct": -6.666667, + "max_candidate_queue_depth": 6, + "max_candidate_memory_pressure_pct": 60.0, + "token_trace_pairs": 4, + "token_trace_mismatch_rate": 0.0, "segments": [ { "model": "decoder-7b", "baseline_backend": "vllm", "candidate_backend": "sglang", "accelerator": "h100", + "baseline_model_version": "decoder-7b@baseline-2026-06-24", + "candidate_model_version": "decoder-7b@candidate-2026-06-24", "matched_requests": 4, "output_mismatch_rate": 0.0, "baseline_error_rate": 0.0, "candidate_error_rate": 0.0, "baseline_p95_latency_ms": 28.0, "candidate_p95_latency_ms": 27.2, - "p95_latency_regression_pct": -2.857143 + "p95_latency_regression_pct": -2.857143, + "baseline_p95_ttft_ms": 9.0, + "candidate_p95_ttft_ms": 8.5, + "ttft_regression_pct": -5.555556, + "baseline_decode_token_p95_ms": 7.5, + "candidate_decode_token_p95_ms": 7.0, + "decode_token_p95_regression_pct": -6.666667, + "max_candidate_queue_depth": 6, + "max_candidate_memory_pressure_pct": 60.0, + "token_trace_pairs": 4, + "token_trace_mismatch_rate": 0.0 } ], "reasons": [ - "candidate stayed within correctness, reliability, and latency policy" + "candidate stayed within correctness, reliability, latency, and telemetry policy" ] } diff --git a/artifacts/release-gate-numeric-tolerance.json b/artifacts/release-gate-numeric-tolerance.json index 6b1b169..a1d9026 100644 --- a/artifacts/release-gate-numeric-tolerance.json +++ b/artifacts/release-gate-numeric-tolerance.json @@ -17,22 +17,44 @@ "baseline_p95_latency_ms": 28.0, "candidate_p95_latency_ms": 27.6, "p95_latency_regression_pct": -1.428571, + "baseline_p95_ttft_ms": null, + "candidate_p95_ttft_ms": null, + "ttft_regression_pct": null, + "baseline_decode_token_p95_ms": null, + "candidate_decode_token_p95_ms": null, + "decode_token_p95_regression_pct": null, + "max_candidate_queue_depth": null, + "max_candidate_memory_pressure_pct": null, + "token_trace_pairs": 0, + "token_trace_mismatch_rate": 0.0, "segments": [ { "model": "decoder-7b", "baseline_backend": "baseline-runtime", "candidate_backend": "candidate-runtime", "accelerator": "h100", + "baseline_model_version": "unspecified", + "candidate_model_version": "unspecified", "matched_requests": 4, "output_mismatch_rate": 0.0, "baseline_error_rate": 0.0, "candidate_error_rate": 0.0, "baseline_p95_latency_ms": 28.0, "candidate_p95_latency_ms": 27.6, - "p95_latency_regression_pct": -1.428571 + "p95_latency_regression_pct": -1.428571, + "baseline_p95_ttft_ms": null, + "candidate_p95_ttft_ms": null, + "ttft_regression_pct": null, + "baseline_decode_token_p95_ms": null, + "candidate_decode_token_p95_ms": null, + "decode_token_p95_regression_pct": null, + "max_candidate_queue_depth": null, + "max_candidate_memory_pressure_pct": null, + "token_trace_pairs": 0, + "token_trace_mismatch_rate": 0.0 } ], "reasons": [ - "candidate stayed within correctness, reliability, and latency policy" + "candidate stayed within correctness, reliability, latency, and telemetry policy" ] } diff --git a/artifacts/release-gate-promote.json b/artifacts/release-gate-promote.json index b02dc13..446229e 100644 --- a/artifacts/release-gate-promote.json +++ b/artifacts/release-gate-promote.json @@ -17,22 +17,44 @@ "baseline_p95_latency_ms": 16.0, "candidate_p95_latency_ms": 16.7, "p95_latency_regression_pct": 4.375, + "baseline_p95_ttft_ms": null, + "candidate_p95_ttft_ms": null, + "ttft_regression_pct": null, + "baseline_decode_token_p95_ms": null, + "candidate_decode_token_p95_ms": null, + "decode_token_p95_regression_pct": null, + "max_candidate_queue_depth": null, + "max_candidate_memory_pressure_pct": null, + "token_trace_pairs": 0, + "token_trace_mismatch_rate": 0.0, "segments": [ { "model": "unspecified", "baseline_backend": "unspecified", "candidate_backend": "unspecified", "accelerator": "unspecified", + "baseline_model_version": "unspecified", + "candidate_model_version": "unspecified", "matched_requests": 4, "output_mismatch_rate": 0.0, "baseline_error_rate": 0.0, "candidate_error_rate": 0.0, "baseline_p95_latency_ms": 16.0, "candidate_p95_latency_ms": 16.7, - "p95_latency_regression_pct": 4.375 + "p95_latency_regression_pct": 4.375, + "baseline_p95_ttft_ms": null, + "candidate_p95_ttft_ms": null, + "ttft_regression_pct": null, + "baseline_decode_token_p95_ms": null, + "candidate_decode_token_p95_ms": null, + "decode_token_p95_regression_pct": null, + "max_candidate_queue_depth": null, + "max_candidate_memory_pressure_pct": null, + "token_trace_pairs": 0, + "token_trace_mismatch_rate": 0.0 } ], "reasons": [ - "candidate stayed within correctness, reliability, and latency policy" + "candidate stayed within correctness, reliability, latency, and telemetry policy" ] } diff --git a/artifacts/release-gate-rollback.json b/artifacts/release-gate-rollback.json index 1b5fb75..8d30230 100644 --- a/artifacts/release-gate-rollback.json +++ b/artifacts/release-gate-rollback.json @@ -17,19 +17,41 @@ "baseline_p95_latency_ms": 16.0, "candidate_p95_latency_ms": 14.5, "p95_latency_regression_pct": -9.375, + "baseline_p95_ttft_ms": null, + "candidate_p95_ttft_ms": null, + "ttft_regression_pct": null, + "baseline_decode_token_p95_ms": null, + "candidate_decode_token_p95_ms": null, + "decode_token_p95_regression_pct": null, + "max_candidate_queue_depth": null, + "max_candidate_memory_pressure_pct": null, + "token_trace_pairs": 0, + "token_trace_mismatch_rate": 0.0, "segments": [ { "model": "unspecified", "baseline_backend": "unspecified", "candidate_backend": "unspecified", "accelerator": "unspecified", + "baseline_model_version": "unspecified", + "candidate_model_version": "unspecified", "matched_requests": 4, "output_mismatch_rate": 0.333333, "baseline_error_rate": 0.0, "candidate_error_rate": 0.25, "baseline_p95_latency_ms": 16.0, "candidate_p95_latency_ms": 14.5, - "p95_latency_regression_pct": -9.375 + "p95_latency_regression_pct": -9.375, + "baseline_p95_ttft_ms": null, + "candidate_p95_ttft_ms": null, + "ttft_regression_pct": null, + "baseline_decode_token_p95_ms": null, + "candidate_decode_token_p95_ms": null, + "decode_token_p95_regression_pct": null, + "max_candidate_queue_depth": null, + "max_candidate_memory_pressure_pct": null, + "token_trace_pairs": 0, + "token_trace_mismatch_rate": 0.0 } ], "reasons": [ diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index ea3fc7c..dc79fa5 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -39,13 +39,15 @@ It computes: - model/backend-scoped numeric drift when fingerprints differ; - error-rate increase; - successful-request p95 latency; and -- candidate p95 regression; and -- segment summaries by model, baseline backend, candidate backend, and - accelerator. +- candidate p95 regression; +- TTFT p95 and decode-token p95 regression; +- candidate queue depth and KV memory pressure; and +- segment summaries by model, model version, baseline backend, candidate + backend, and accelerator. Correctness, numeric drift, or reliability regressions produce `rollback`. -Latency regressions or incomplete evidence produce `hold`. A complete candidate -within policy produces `promote`. +Latency, token-path, memory-pressure regressions, or incomplete evidence produce +`hold`. A complete candidate within policy produces `promote`. This is a local validation component, not a deployment controller. Production integration would obtain observations from mirrored traffic, canary @@ -59,8 +61,9 @@ keeps ingestion concerns separate from rollout decisions. The adapter currently accepts compact vLLM/SGLang-style request summaries: request ID, latency, health, model, backend, accelerator, output token IDs, -optional explicit fingerprints, and optional numeric vectors. If an engine does -not provide a fingerprint, the adapter computes a stable FNV-1a fingerprint -from token IDs or numeric values. Successful observations without output -material are rejected so a candidate cannot be promoted from latency-only -evidence. +optional explicit fingerprints, optional numeric vectors, model version, queue +depth, KV page usage, TTFT, decode-token latencies, and optional token-trace +fingerprints. If an engine does not provide an output fingerprint, the adapter +computes a stable FNV-1a fingerprint from token IDs or numeric values. +Successful observations without output material are rejected so a candidate +cannot be promoted from latency-only evidence. diff --git a/docs/RELEASE_VALIDATION.md b/docs/RELEASE_VALIDATION.md index a6e1c2e..d44c082 100644 --- a/docs/RELEASE_VALIDATION.md +++ b/docs/RELEASE_VALIDATION.md @@ -10,13 +10,17 @@ Use `promote` when: - output fingerprints stay within the configured mismatch budget; - configured numeric tolerances cover any expected model/backend output drift; - the candidate error-rate increase stays within policy; and -- candidate p95 latency stays within the configured regression budget. +- candidate p95 latency, TTFT p95, decode-token p95, and KV memory pressure + stay within the configured regression budgets. ## Hold Use `hold` when the evidence is incomplete or performance needs investigation. Examples include missing shadow requests, too few matched samples, no successful output pairs, or a p95 latency regression without a correctness failure. +The same response is used for excessive TTFT regression, decode-token p95 +regression, or candidate KV memory pressure because those are operational +signals that need investigation before rollout. ## Rollback @@ -45,6 +49,23 @@ The report includes: - segment summaries by model, baseline backend, candidate backend, and accelerator. +## Operational Telemetry + +Mirrored observations can include rollout context and token-path telemetry: + +- model version; +- queue depth; +- KV pages used and available; +- time to first token; +- per-token decode latencies; and +- token-trace fingerprints. + +The gate reports aggregate and per-segment TTFT p95, decode-token p95, maximum +candidate queue depth, maximum candidate memory pressure, and token-trace +mismatch rate. Correct outputs with excessive latency or memory pressure produce +`hold`, not `rollback`, because the evidence points to performance or capacity +risk rather than a correctness failure. + ## Backend Mirror Adapter The `mirror` command normalizes request observations from backend-specific @@ -53,7 +74,7 @@ baseline/candidate comparisons such as vLLM versus SGLang, or a current runtime versus a candidate runtime behind shadow traffic. Each observation records request ID, latency, health, model, backend, -accelerator, and output material. Engines may provide their own +accelerator, output material, and optional operational telemetry. Engines may provide their own `output_fingerprint`; otherwise the adapter hashes output token IDs or numeric output vectors with a stable FNV-1a fingerprint. Successful observations without output material are rejected because the release gate cannot audit diff --git a/fixtures/backend_mirror_vllm_sglang.json b/fixtures/backend_mirror_vllm_sglang.json index 4a4da3b..3f73ed7 100644 --- a/fixtures/backend_mirror_vllm_sglang.json +++ b/fixtures/backend_mirror_vllm_sglang.json @@ -4,6 +4,9 @@ "max_output_mismatch_rate": 0.0, "max_error_rate_increase": 0.01, "max_p95_latency_regression_pct": 10.0, + "max_ttft_regression_pct": 10.0, + "max_decode_token_p95_regression_pct": 10.0, + "max_candidate_memory_pressure_pct": 90.0, "max_numeric_drift_rate": 0.0, "numeric_tolerances": [] }, @@ -16,25 +19,49 @@ "request_id": "prompt-a", "latency_ms": 18.0, "ok": true, - "output_token_ids": [101, 1402, 13] + "output_token_ids": [101, 1402, 13], + "model_version": "decoder-7b@baseline-2026-06-24", + "queue_depth": 2, + "kv_pages_used": 8, + "kv_pages_capacity": 20, + "time_to_first_token_ms": 6.0, + "decode_token_latencies_ms": [4.0, 4.5] }, { "request_id": "prompt-b", "latency_ms": 21.0, "ok": true, - "output_token_ids": [205, 778, 990] + "output_token_ids": [205, 778, 990], + "model_version": "decoder-7b@baseline-2026-06-24", + "queue_depth": 3, + "kv_pages_used": 9, + "kv_pages_capacity": 20, + "time_to_first_token_ms": 7.0, + "decode_token_latencies_ms": [5.0, 5.5] }, { "request_id": "prompt-c", "latency_ms": 24.0, "ok": true, - "output_token_ids": [42, 42, 7] + "output_token_ids": [42, 42, 7], + "model_version": "decoder-7b@baseline-2026-06-24", + "queue_depth": 4, + "kv_pages_used": 10, + "kv_pages_capacity": 20, + "time_to_first_token_ms": 8.0, + "decode_token_latencies_ms": [6.0, 6.5] }, { "request_id": "prompt-d", "latency_ms": 28.0, "ok": true, - "output_token_ids": [301, 302, 303, 2] + "output_token_ids": [301, 302, 303, 2], + "model_version": "decoder-7b@baseline-2026-06-24", + "queue_depth": 5, + "kv_pages_used": 11, + "kv_pages_capacity": 20, + "time_to_first_token_ms": 9.0, + "decode_token_latencies_ms": [6.5, 7.0, 7.5] } ] }, @@ -47,25 +74,49 @@ "request_id": "prompt-a", "latency_ms": 17.5, "ok": true, - "output_token_ids": [101, 1402, 13] + "output_token_ids": [101, 1402, 13], + "model_version": "decoder-7b@candidate-2026-06-24", + "queue_depth": 3, + "kv_pages_used": 9, + "kv_pages_capacity": 20, + "time_to_first_token_ms": 5.5, + "decode_token_latencies_ms": [3.8, 4.0] }, { "request_id": "prompt-b", "latency_ms": 20.6, "ok": true, - "output_token_ids": [205, 778, 990] + "output_token_ids": [205, 778, 990], + "model_version": "decoder-7b@candidate-2026-06-24", + "queue_depth": 4, + "kv_pages_used": 10, + "kv_pages_capacity": 20, + "time_to_first_token_ms": 6.5, + "decode_token_latencies_ms": [4.7, 5.0] }, { "request_id": "prompt-c", "latency_ms": 23.5, "ok": true, - "output_token_ids": [42, 42, 7] + "output_token_ids": [42, 42, 7], + "model_version": "decoder-7b@candidate-2026-06-24", + "queue_depth": 5, + "kv_pages_used": 11, + "kv_pages_capacity": 20, + "time_to_first_token_ms": 7.5, + "decode_token_latencies_ms": [5.8, 6.0] }, { "request_id": "prompt-d", "latency_ms": 27.2, "ok": true, - "output_token_ids": [301, 302, 303, 2] + "output_token_ids": [301, 302, 303, 2], + "model_version": "decoder-7b@candidate-2026-06-24", + "queue_depth": 6, + "kv_pages_used": 12, + "kv_pages_capacity": 20, + "time_to_first_token_ms": 8.5, + "decode_token_latencies_ms": [6.2, 6.5, 7.0] } ] } diff --git a/src/adapter.rs b/src/adapter.rs index a782c4d..041bfab 100644 --- a/src/adapter.rs +++ b/src/adapter.rs @@ -3,7 +3,7 @@ use std::fmt; use serde::{Deserialize, Serialize}; -use crate::release::{GateInput, GateThresholds, Observation}; +use crate::release::{GateInput, GateThresholds, Observation, OperationalTelemetry}; #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct BackendMirrorInput { @@ -36,6 +36,20 @@ pub struct BackendObservation { pub output_values: Option>, #[serde(default)] pub error: Option, + #[serde(default)] + pub model_version: Option, + #[serde(default)] + pub queue_depth: Option, + #[serde(default)] + pub kv_pages_used: Option, + #[serde(default)] + pub kv_pages_capacity: Option, + #[serde(default)] + pub time_to_first_token_ms: Option, + #[serde(default)] + pub decode_token_latencies_ms: Vec, + #[serde(default)] + pub token_trace_fingerprint: Option, } #[derive(Debug, Clone, PartialEq, Eq)] @@ -94,6 +108,7 @@ fn normalize_observation( observation.request_id ))); } + validate_operational_telemetry(observation)?; let ok = observation.ok.unwrap_or_else(|| { observation @@ -112,9 +127,71 @@ fn normalize_observation( backend: Some(set.backend.clone()), accelerator: set.accelerator.clone(), output_values: observation.output_values.clone(), + telemetry: operational_telemetry(observation), }) } +fn validate_operational_telemetry(observation: &BackendObservation) -> Result<(), AdapterError> { + if let Some(ttft) = observation.time_to_first_token_ms + && (!ttft.is_finite() || ttft < 0.0) + { + return Err(AdapterError::new(format!( + "request {} has invalid time_to_first_token_ms", + observation.request_id + ))); + } + + for latency in &observation.decode_token_latencies_ms { + if !latency.is_finite() || *latency < 0.0 { + return Err(AdapterError::new(format!( + "request {} has invalid decode_token_latencies_ms", + observation.request_id + ))); + } + } + + match (observation.kv_pages_used, observation.kv_pages_capacity) { + (Some(_), Some(0)) => Err(AdapterError::new(format!( + "request {} has kv_pages_capacity of zero", + observation.request_id + ))), + (Some(used), Some(capacity)) if used > capacity => Err(AdapterError::new(format!( + "request {} has kv_pages_used greater than kv_pages_capacity", + observation.request_id + ))), + _ => Ok(()), + } +} + +fn operational_telemetry(observation: &BackendObservation) -> OperationalTelemetry { + OperationalTelemetry { + model_version: observation.model_version.clone(), + queue_depth: observation.queue_depth, + kv_pages_used: observation.kv_pages_used, + kv_pages_capacity: observation.kv_pages_capacity, + time_to_first_token_ms: observation.time_to_first_token_ms, + decode_token_latencies_ms: observation.decode_token_latencies_ms.clone(), + token_trace_fingerprint: token_trace_fingerprint(observation), + } +} + +fn token_trace_fingerprint(observation: &BackendObservation) -> Option { + if let Some(fingerprint) = observation.token_trace_fingerprint.as_ref() + && !fingerprint.trim().is_empty() + { + return Some(fingerprint.clone()); + } + + if observation.output_token_ids.is_empty() { + None + } else { + Some(format!( + "token-trace-fnv64:{:016x}", + hash_i64_values(&observation.output_token_ids) + )) + } +} + fn output_fingerprint(observation: &BackendObservation, ok: bool) -> Result { if let Some(fingerprint) = observation.output_fingerprint.as_ref() && !fingerprint.trim().is_empty() diff --git a/src/lib.rs b/src/lib.rs index 67a26f5..056e1ac 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,7 +8,7 @@ pub use adapter::{ }; pub use release::{ GateDecision, GateInput, GateReport, GateThresholds, NumericTolerance, Observation, - SegmentReport, evaluate_release, + OperationalTelemetry, SegmentReport, evaluate_release, }; pub use scheduler::{ ReplayInput, ReplayReport, RequestSpec, RuntimeError, Scheduler, SchedulerConfig, TickTrace, diff --git a/src/release.rs b/src/release.rs index a1a45e1..9763c33 100644 --- a/src/release.rs +++ b/src/release.rs @@ -16,6 +16,37 @@ pub struct Observation { pub accelerator: Option, #[serde(default)] pub output_values: Option>, + #[serde(default)] + pub telemetry: OperationalTelemetry, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)] +pub struct OperationalTelemetry { + #[serde(default)] + pub model_version: Option, + #[serde(default)] + pub queue_depth: Option, + #[serde(default)] + pub kv_pages_used: Option, + #[serde(default)] + pub kv_pages_capacity: Option, + #[serde(default)] + pub time_to_first_token_ms: Option, + #[serde(default)] + pub decode_token_latencies_ms: Vec, + #[serde(default)] + pub token_trace_fingerprint: Option, +} + +impl OperationalTelemetry { + fn memory_pressure_pct(&self) -> Option { + match (self.kv_pages_used, self.kv_pages_capacity) { + (Some(used), Some(capacity)) if capacity > 0 => { + Some(round6((used as f64 / capacity as f64) * 100.0)) + } + _ => None, + } + } } #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] @@ -63,6 +94,12 @@ pub struct GateThresholds { pub max_output_mismatch_rate: f64, pub max_error_rate_increase: f64, pub max_p95_latency_regression_pct: f64, + #[serde(default = "default_max_ttft_regression_pct")] + pub max_ttft_regression_pct: f64, + #[serde(default = "default_max_decode_token_p95_regression_pct")] + pub max_decode_token_p95_regression_pct: f64, + #[serde(default = "default_max_candidate_memory_pressure_pct")] + pub max_candidate_memory_pressure_pct: f64, #[serde(default)] pub max_numeric_drift_rate: f64, #[serde(default)] @@ -76,12 +113,27 @@ impl Default for GateThresholds { max_output_mismatch_rate: 0.0, max_error_rate_increase: 0.01, max_p95_latency_regression_pct: 10.0, + max_ttft_regression_pct: default_max_ttft_regression_pct(), + max_decode_token_p95_regression_pct: default_max_decode_token_p95_regression_pct(), + max_candidate_memory_pressure_pct: default_max_candidate_memory_pressure_pct(), max_numeric_drift_rate: 0.0, numeric_tolerances: Vec::new(), } } } +fn default_max_ttft_regression_pct() -> f64 { + 10.0 +} + +fn default_max_decode_token_p95_regression_pct() -> f64 { + 10.0 +} + +fn default_max_candidate_memory_pressure_pct() -> f64 { + 95.0 +} + #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct GateInput { #[serde(default)] @@ -104,6 +156,8 @@ pub struct SegmentReport { pub baseline_backend: String, pub candidate_backend: String, pub accelerator: String, + pub baseline_model_version: String, + pub candidate_model_version: String, pub matched_requests: usize, pub output_mismatch_rate: f64, pub baseline_error_rate: f64, @@ -111,6 +165,16 @@ pub struct SegmentReport { pub baseline_p95_latency_ms: Option, pub candidate_p95_latency_ms: Option, pub p95_latency_regression_pct: Option, + pub baseline_p95_ttft_ms: Option, + pub candidate_p95_ttft_ms: Option, + pub ttft_regression_pct: Option, + pub baseline_decode_token_p95_ms: Option, + pub candidate_decode_token_p95_ms: Option, + pub decode_token_p95_regression_pct: Option, + pub max_candidate_queue_depth: Option, + pub max_candidate_memory_pressure_pct: Option, + pub token_trace_pairs: usize, + pub token_trace_mismatch_rate: f64, } #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] @@ -133,6 +197,16 @@ pub struct GateReport { pub baseline_p95_latency_ms: Option, pub candidate_p95_latency_ms: Option, pub p95_latency_regression_pct: Option, + pub baseline_p95_ttft_ms: Option, + pub candidate_p95_ttft_ms: Option, + pub ttft_regression_pct: Option, + pub baseline_decode_token_p95_ms: Option, + pub candidate_decode_token_p95_ms: Option, + pub decode_token_p95_regression_pct: Option, + pub max_candidate_queue_depth: Option, + pub max_candidate_memory_pressure_pct: Option, + pub token_trace_pairs: usize, + pub token_trace_mismatch_rate: f64, pub segments: Vec, pub reasons: Vec, } @@ -236,6 +310,67 @@ pub fn evaluate_release(input: &GateInput) -> GateReport { ); let p95_latency_regression_pct = latency_regression_pct(baseline_p95_latency_ms, candidate_p95_latency_ms); + let baseline_p95_ttft_ms = percentile95( + matched + .iter() + .filter(|(baseline, _)| baseline.ok) + .filter_map(|(baseline, _)| baseline.telemetry.time_to_first_token_ms), + ); + let candidate_p95_ttft_ms = percentile95( + matched + .iter() + .filter(|(_, candidate)| candidate.ok) + .filter_map(|(_, candidate)| candidate.telemetry.time_to_first_token_ms), + ); + let ttft_regression_pct = latency_regression_pct(baseline_p95_ttft_ms, candidate_p95_ttft_ms); + let baseline_decode_token_p95_ms = + percentile95(matched.iter().filter(|(baseline, _)| baseline.ok).flat_map( + |(baseline, _)| baseline.telemetry.decode_token_latencies_ms.iter().copied(), + )); + let candidate_decode_token_p95_ms = percentile95( + matched + .iter() + .filter(|(_, candidate)| candidate.ok) + .flat_map(|(_, candidate)| { + candidate + .telemetry + .decode_token_latencies_ms + .iter() + .copied() + }), + ); + let decode_token_p95_regression_pct = + latency_regression_pct(baseline_decode_token_p95_ms, candidate_decode_token_p95_ms); + let max_candidate_queue_depth = matched + .iter() + .filter_map(|(_, candidate)| candidate.telemetry.queue_depth) + .max(); + let max_candidate_memory_pressure_pct = max_comparison_value( + matched + .iter() + .filter_map(|(_, candidate)| candidate.telemetry.memory_pressure_pct()) + .map(Some), + ); + let token_trace_pairs = matched + .iter() + .filter(|(baseline, candidate)| { + baseline.telemetry.token_trace_fingerprint.is_some() + && candidate.telemetry.token_trace_fingerprint.is_some() + }) + .count(); + let token_trace_mismatches = matched + .iter() + .filter(|(baseline, candidate)| { + match ( + &baseline.telemetry.token_trace_fingerprint, + &candidate.telemetry.token_trace_fingerprint, + ) { + (Some(baseline_trace), Some(candidate_trace)) => baseline_trace != candidate_trace, + _ => false, + } + }) + .count(); + let token_trace_mismatch_rate = ratio(token_trace_mismatches, token_trace_pairs); let mut reasons = Vec::new(); let insufficient_coverage = matched_requests < input.thresholds.min_matched_requests @@ -285,13 +420,50 @@ pub fn evaluate_release(input: &GateInput) -> GateReport { input.thresholds.max_p95_latency_regression_pct )); } + let ttft_failed = ttft_regression_pct + .is_some_and(|regression| regression > input.thresholds.max_ttft_regression_pct); + if ttft_failed { + reasons.push(format!( + "TTFT p95 regression {:.2}% exceeded {:.2}%", + ttft_regression_pct.unwrap_or_default(), + input.thresholds.max_ttft_regression_pct + )); + } + let decode_token_latency_failed = decode_token_p95_regression_pct.is_some_and(|regression| { + regression > input.thresholds.max_decode_token_p95_regression_pct + }); + if decode_token_latency_failed { + reasons.push(format!( + "decode-token p95 regression {:.2}% exceeded {:.2}%", + decode_token_p95_regression_pct.unwrap_or_default(), + input.thresholds.max_decode_token_p95_regression_pct + )); + } + let memory_pressure_failed = max_candidate_memory_pressure_pct + .is_some_and(|pressure| pressure > input.thresholds.max_candidate_memory_pressure_pct); + if memory_pressure_failed { + reasons.push(format!( + "candidate memory pressure {:.2}% exceeded {:.2}%", + max_candidate_memory_pressure_pct.unwrap_or_default(), + input.thresholds.max_candidate_memory_pressure_pct + )); + } let decision = if correctness_failed || numeric_correctness_failed || reliability_failed { GateDecision::Rollback - } else if insufficient_coverage || comparable_outputs.is_empty() || latency_failed { + } else if insufficient_coverage + || comparable_outputs.is_empty() + || latency_failed + || ttft_failed + || decode_token_latency_failed + || memory_pressure_failed + { GateDecision::Hold } else { - reasons.push("candidate stayed within correctness, reliability, and latency policy".into()); + reasons.push( + "candidate stayed within correctness, reliability, latency, and telemetry policy" + .into(), + ); GateDecision::Promote }; @@ -314,6 +486,16 @@ pub fn evaluate_release(input: &GateInput) -> GateReport { baseline_p95_latency_ms, candidate_p95_latency_ms, p95_latency_regression_pct, + baseline_p95_ttft_ms, + candidate_p95_ttft_ms, + ttft_regression_pct, + baseline_decode_token_p95_ms, + candidate_decode_token_p95_ms, + decode_token_p95_regression_pct, + max_candidate_queue_depth, + max_candidate_memory_pressure_pct, + token_trace_pairs, + token_trace_mismatch_rate, segments: segment_reports(&matched, &matched_comparisons), reasons, } @@ -429,6 +611,8 @@ struct SegmentKey { baseline_backend: String, candidate_backend: String, accelerator: String, + baseline_model_version: String, + candidate_model_version: String, } #[derive(Debug, Default)] @@ -440,6 +624,14 @@ struct SegmentAccumulator { output_mismatches: usize, baseline_latencies: Vec, candidate_latencies: Vec, + baseline_ttft_latencies: Vec, + candidate_ttft_latencies: Vec, + baseline_decode_token_latencies: Vec, + candidate_decode_token_latencies: Vec, + candidate_queue_depths: Vec, + candidate_memory_pressure_pct: Vec, + token_trace_pairs: usize, + token_trace_mismatches: usize, } fn segment_reports( @@ -466,9 +658,42 @@ fn segment_reports( } if baseline.ok { entry.baseline_latencies.push(baseline.latency_ms); + if let Some(ttft) = baseline.telemetry.time_to_first_token_ms { + entry.baseline_ttft_latencies.push(ttft); + } + entry + .baseline_decode_token_latencies + .extend(baseline.telemetry.decode_token_latencies_ms.iter().copied()); } if candidate.ok { entry.candidate_latencies.push(candidate.latency_ms); + if let Some(ttft) = candidate.telemetry.time_to_first_token_ms { + entry.candidate_ttft_latencies.push(ttft); + } + entry.candidate_decode_token_latencies.extend( + candidate + .telemetry + .decode_token_latencies_ms + .iter() + .copied(), + ); + } + if let Some(queue_depth) = candidate.telemetry.queue_depth { + entry.candidate_queue_depths.push(queue_depth); + } + if let Some(memory_pressure_pct) = candidate.telemetry.memory_pressure_pct() { + entry + .candidate_memory_pressure_pct + .push(memory_pressure_pct); + } + if let (Some(baseline_trace), Some(candidate_trace)) = ( + &baseline.telemetry.token_trace_fingerprint, + &candidate.telemetry.token_trace_fingerprint, + ) { + entry.token_trace_pairs += 1; + if baseline_trace != candidate_trace { + entry.token_trace_mismatches += 1; + } } } @@ -478,11 +703,21 @@ fn segment_reports( let baseline_p95_latency_ms = percentile95(accumulator.baseline_latencies.into_iter()); let candidate_p95_latency_ms = percentile95(accumulator.candidate_latencies.into_iter()); + let baseline_p95_ttft_ms = + percentile95(accumulator.baseline_ttft_latencies.into_iter()); + let candidate_p95_ttft_ms = + percentile95(accumulator.candidate_ttft_latencies.into_iter()); + let baseline_decode_token_p95_ms = + percentile95(accumulator.baseline_decode_token_latencies.into_iter()); + let candidate_decode_token_p95_ms = + percentile95(accumulator.candidate_decode_token_latencies.into_iter()); SegmentReport { model: key.model, baseline_backend: key.baseline_backend, candidate_backend: key.candidate_backend, accelerator: key.accelerator, + baseline_model_version: key.baseline_model_version, + candidate_model_version: key.candidate_model_version, matched_requests: accumulator.matched_requests, output_mismatch_rate: ratio( accumulator.output_mismatches, @@ -502,6 +737,30 @@ fn segment_reports( baseline_p95_latency_ms, candidate_p95_latency_ms, ), + baseline_p95_ttft_ms, + candidate_p95_ttft_ms, + ttft_regression_pct: latency_regression_pct( + baseline_p95_ttft_ms, + candidate_p95_ttft_ms, + ), + baseline_decode_token_p95_ms, + candidate_decode_token_p95_ms, + decode_token_p95_regression_pct: latency_regression_pct( + baseline_decode_token_p95_ms, + candidate_decode_token_p95_ms, + ), + max_candidate_queue_depth: accumulator.candidate_queue_depths.into_iter().max(), + max_candidate_memory_pressure_pct: max_comparison_value( + accumulator + .candidate_memory_pressure_pct + .into_iter() + .map(Some), + ), + token_trace_pairs: accumulator.token_trace_pairs, + token_trace_mismatch_rate: ratio( + accumulator.token_trace_mismatches, + accumulator.token_trace_pairs, + ), } }) .collect() @@ -531,6 +790,8 @@ fn segment_key(baseline: &Observation, candidate: &Observation) -> SegmentKey { .or(baseline.accelerator.as_ref()) .cloned() .unwrap_or_else(|| "unspecified".into()), + baseline_model_version: optional_label(&baseline.telemetry.model_version), + candidate_model_version: optional_label(&candidate.telemetry.model_version), } } diff --git a/tests/adapter.rs b/tests/adapter.rs index 84cfddd..e6893c5 100644 --- a/tests/adapter.rs +++ b/tests/adapter.rs @@ -21,6 +21,40 @@ fn token_observation(id: &str, latency_ms: f64, token_ids: &[i64]) -> BackendObs output_token_ids: token_ids.to_vec(), output_values: None, error: None, + model_version: None, + queue_depth: None, + kv_pages_used: None, + kv_pages_capacity: None, + time_to_first_token_ms: None, + decode_token_latencies_ms: Vec::new(), + token_trace_fingerprint: None, + } +} + +fn telemetry_observation( + id: &str, + latency_ms: f64, + token_ids: &[i64], + queue_depth: usize, + kv_pages_used: usize, + ttft_ms: f64, + decode_token_latencies_ms: &[f64], +) -> BackendObservation { + BackendObservation { + request_id: id.into(), + latency_ms, + ok: Some(true), + output_fingerprint: None, + output_token_ids: token_ids.to_vec(), + output_values: None, + error: None, + model_version: Some("decoder-7b@2026-06-24".into()), + queue_depth: Some(queue_depth), + kv_pages_used: Some(kv_pages_used), + kv_pages_capacity: Some(20), + time_to_first_token_ms: Some(ttft_ms), + decode_token_latencies_ms: decode_token_latencies_ms.to_vec(), + token_trace_fingerprint: None, } } @@ -33,6 +67,13 @@ fn numeric_observation(id: &str, latency_ms: f64, values: &[f64]) -> BackendObse output_token_ids: Vec::new(), output_values: Some(values.to_vec()), error: None, + model_version: None, + queue_depth: None, + kv_pages_used: None, + kv_pages_capacity: None, + time_to_first_token_ms: None, + decode_token_latencies_ms: Vec::new(), + token_trace_fingerprint: None, } } @@ -73,6 +114,84 @@ fn converts_vllm_and_sglang_mirrors_into_release_gate_input() { assert_eq!(report.segments[0].candidate_backend, "sglang"); } +#[test] +fn carries_operational_telemetry_into_release_report() { + let input = BackendMirrorInput { + thresholds: GateThresholds::default(), + baseline: backend_set( + "vllm", + vec![ + telemetry_observation("prompt-a", 18.0, &[101, 1402, 13], 2, 8, 6.0, &[4.0, 4.5]), + telemetry_observation("prompt-b", 21.0, &[205, 778, 990], 3, 9, 7.0, &[5.0, 5.5]), + telemetry_observation("prompt-c", 24.0, &[42, 42, 7], 4, 10, 8.0, &[6.0, 6.5]), + ], + ), + candidate: backend_set( + "sglang", + vec![ + telemetry_observation("prompt-a", 17.5, &[101, 1402, 13], 4, 10, 5.5, &[3.8, 4.0]), + telemetry_observation("prompt-b", 20.6, &[205, 778, 990], 5, 11, 6.5, &[4.7, 5.0]), + telemetry_observation("prompt-c", 23.5, &[42, 42, 7], 6, 12, 7.5, &[5.8, 6.0]), + ], + ), + }; + + let report = evaluate_release(&mirror_to_gate_input(&input).expect("valid mirror input")); + + assert_eq!(report.decision, GateDecision::Promote); + assert_eq!(report.max_candidate_queue_depth, Some(6)); + assert_eq!(report.max_candidate_memory_pressure_pct, Some(60.0)); + assert_eq!(report.token_trace_pairs, 3); + assert_eq!(report.token_trace_mismatch_rate, 0.0); + assert_eq!(report.candidate_p95_ttft_ms, Some(7.5)); + assert_eq!(report.candidate_decode_token_p95_ms, Some(6.0)); +} + +#[test] +fn holds_when_candidate_telemetry_exceeds_policy() { + let input = BackendMirrorInput { + thresholds: GateThresholds { + max_ttft_regression_pct: 10.0, + max_decode_token_p95_regression_pct: 10.0, + max_candidate_memory_pressure_pct: 90.0, + ..GateThresholds::default() + }, + baseline: backend_set( + "vllm", + vec![ + telemetry_observation("prompt-a", 18.0, &[101], 2, 8, 6.0, &[4.0]), + telemetry_observation("prompt-b", 21.0, &[205], 3, 9, 7.0, &[5.0]), + telemetry_observation("prompt-c", 24.0, &[42], 4, 10, 8.0, &[6.0]), + ], + ), + candidate: backend_set( + "sglang", + vec![ + telemetry_observation("prompt-a", 18.0, &[101], 7, 19, 10.0, &[8.0]), + telemetry_observation("prompt-b", 21.0, &[205], 8, 19, 11.0, &[9.0]), + telemetry_observation("prompt-c", 24.0, &[42], 9, 19, 12.0, &[10.0]), + ], + ), + }; + + let report = evaluate_release(&mirror_to_gate_input(&input).expect("valid mirror input")); + + assert_eq!(report.decision, GateDecision::Hold); + assert!(report.reasons.iter().any(|reason| reason.contains("TTFT"))); + assert!( + report + .reasons + .iter() + .any(|reason| reason.contains("decode-token")) + ); + assert!( + report + .reasons + .iter() + .any(|reason| reason.contains("memory pressure")) + ); +} + #[test] fn numeric_mirror_outputs_use_model_backend_tolerance() { let input = BackendMirrorInput { @@ -81,6 +200,9 @@ fn numeric_mirror_outputs_use_model_backend_tolerance() { max_output_mismatch_rate: 0.0, max_error_rate_increase: 0.01, max_p95_latency_regression_pct: 10.0, + max_ttft_regression_pct: 10.0, + max_decode_token_p95_regression_pct: 10.0, + max_candidate_memory_pressure_pct: 95.0, max_numeric_drift_rate: 0.0, numeric_tolerances: vec![NumericTolerance { model: Some("decoder-7b".into()), @@ -130,6 +252,13 @@ fn rejects_successful_observation_without_output_material() { output_token_ids: Vec::new(), output_values: None, error: None, + model_version: None, + queue_depth: None, + kv_pages_used: None, + kv_pages_capacity: None, + time_to_first_token_ms: None, + decode_token_latencies_ms: Vec::new(), + token_trace_fingerprint: None, }], ), candidate: backend_set("sglang", vec![token_observation("prompt-a", 17.5, &[1])]), diff --git a/tests/release.rs b/tests/release.rs index a434f62..845b380 100644 --- a/tests/release.rs +++ b/tests/release.rs @@ -1,5 +1,6 @@ use rust_inference_runtime::{ - GateDecision, GateInput, GateThresholds, NumericTolerance, Observation, evaluate_release, + GateDecision, GateInput, GateThresholds, NumericTolerance, Observation, OperationalTelemetry, + evaluate_release, }; fn observation(id: &str, fingerprint: &str, latency_ms: f64, ok: bool) -> Observation { @@ -12,6 +13,7 @@ fn observation(id: &str, fingerprint: &str, latency_ms: f64, ok: bool) -> Observ backend: None, accelerator: None, output_values: None, + telemetry: OperationalTelemetry::default(), } } @@ -31,6 +33,7 @@ fn numeric_observation( backend: Some(backend.into()), accelerator: Some("h100".into()), output_values: Some(values.to_vec()), + telemetry: OperationalTelemetry::default(), } } @@ -101,6 +104,9 @@ fn promotes_candidate_with_numeric_outputs_inside_model_tolerance() { max_output_mismatch_rate: 0.0, max_error_rate_increase: 0.01, max_p95_latency_regression_pct: 10.0, + max_ttft_regression_pct: 10.0, + max_decode_token_p95_regression_pct: 10.0, + max_candidate_memory_pressure_pct: 95.0, max_numeric_drift_rate: 0.0, numeric_tolerances: vec![NumericTolerance { model: Some("decoder-7b".into()), @@ -139,6 +145,9 @@ fn rolls_back_numeric_outputs_outside_model_tolerance() { max_output_mismatch_rate: 0.0, max_error_rate_increase: 0.01, max_p95_latency_regression_pct: 10.0, + max_ttft_regression_pct: 10.0, + max_decode_token_p95_regression_pct: 10.0, + max_candidate_memory_pressure_pct: 95.0, max_numeric_drift_rate: 0.0, numeric_tolerances: vec![NumericTolerance { model: Some("decoder-7b".into()), From d2409c932f88ed1ff9e2511374767a537e7cdd0f Mon Sep 17 00:00:00 2001 From: WaffleBits <19478926+WaffleBits@users.noreply.github.com> Date: Thu, 25 Jun 2026 10:07:38 -0400 Subject: [PATCH 2/3] Add workload pressure replay summary --- README.md | 20 +- artifacts/workload-pressure-replay.json | 373 ++++++++++++++++++++++++ artifacts/workload-replay.json | 9 +- docs/ARCHITECTURE.md | 6 + fixtures/workload_pressure.json | 67 +++++ src/scheduler.rs | 59 +++- tests/scheduler.rs | 27 ++ 7 files changed, 553 insertions(+), 8 deletions(-) create mode 100644 artifacts/workload-pressure-replay.json create mode 100644 fixtures/workload_pressure.json diff --git a/README.md b/README.md index 4a38267..dc29469 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,8 @@ replayable scheduling traces, and canary/shadow release decisions. - Conservative paged-KV reservations that prevent admitted requests from exceeding declared capacity. - Round-robin decode scheduling so active requests make measurable progress. -- Deterministic workload replay with a machine-readable trace fingerprint. +- Deterministic workload replay with a machine-readable trace fingerprint, + queue-pressure summary, active-capacity summary, and KV-pressure summary. - Baseline/candidate release validation with `promote`, `hold`, and `rollback` outcomes. - Backend mirror normalization for vLLM/SGLang-style serving observations @@ -32,6 +33,10 @@ cargo run --release -- replay \ --input fixtures/workload.json \ --output artifacts/workload-replay.json +cargo run --release -- replay \ + --input fixtures/workload_pressure.json \ + --output artifacts/workload-pressure-replay.json + cargo run --release -- gate \ --input fixtures/release_gate_safe.json \ --output artifacts/release-gate-promote.json @@ -59,9 +64,15 @@ model-version transition metadata, queue depth, KV memory pressure, TTFT, and decode-token p95 telemetry. The checked workload fixture completes four requests in 11 scheduler ticks, -peaks at 12 of 20 KV pages, returns all pages on completion, and emits trace +peaks at 12 of 20 KV pages, records three queued-pressure ticks, records three +active-capacity ticks, returns all pages on completion, and emits trace fingerprint `394166dc24d38b6c`. +The pressure fixture completes eight mixed-priority requests in 27 scheduler +ticks, records a maximum queue depth of five, reaches all three active slots, +peaks at 13 of 15 KV pages, reports 86.666667% peak KV pressure, and returns +all pages on completion. + ## Runtime Model Each request declares prompt length, maximum output length, priority, and @@ -77,8 +88,9 @@ Every tick records: - queued and active counts; and - used KV pages. -The replay report includes a stable trace fingerprint, peak KV pages, total -ticks, and completion count. +The replay report includes a stable trace fingerprint, peak KV pages, peak KV +pressure percentage, maximum queued and active request counts, queue-pressure +ticks, active-capacity ticks, total ticks, and completion count. ## Backend Mirror Adapter diff --git a/artifacts/workload-pressure-replay.json b/artifacts/workload-pressure-replay.json new file mode 100644 index 0000000..0e7c4f5 --- /dev/null +++ b/artifacts/workload-pressure-replay.json @@ -0,0 +1,373 @@ +{ + "schema_version": 2, + "config": { + "max_active_requests": 3, + "decode_batch_size": 2, + "max_prefill_tokens_per_tick": 96, + "kv_page_tokens": 16, + "total_kv_pages": 15 + }, + "request_count": 8, + "completed_requests": 8, + "total_ticks": 27, + "max_queued_requests": 5, + "max_active_requests": 3, + "peak_kv_pages": 13, + "peak_kv_pressure_pct": 86.666667, + "ticks_with_queue_pressure": 19, + "ticks_at_active_capacity": 6, + "queue_pressure_ratio": 0.703704, + "active_capacity_ratio": 0.222222, + "final_kv_pages": 0, + "trace_fingerprint": "81dcc3b8c6005eba", + "ticks": [ + { + "tick": 0, + "admitted": [ + "interactive-001", + "interactive-002" + ], + "decoded": [ + "interactive-001", + "interactive-002" + ], + "completed": [], + "queued_requests": 1, + "active_requests": 2, + "used_kv_pages": 7 + }, + { + "tick": 1, + "admitted": [ + "interactive-003" + ], + "decoded": [ + "interactive-001", + "interactive-002" + ], + "completed": [], + "queued_requests": 2, + "active_requests": 3, + "used_kv_pages": 10 + }, + { + "tick": 2, + "admitted": [], + "decoded": [ + "interactive-003", + "interactive-001" + ], + "completed": [], + "queued_requests": 3, + "active_requests": 3, + "used_kv_pages": 10 + }, + { + "tick": 3, + "admitted": [], + "decoded": [ + "interactive-002", + "interactive-003" + ], + "completed": [], + "queued_requests": 4, + "active_requests": 3, + "used_kv_pages": 10 + }, + { + "tick": 4, + "admitted": [], + "decoded": [ + "interactive-001", + "interactive-002" + ], + "completed": [], + "queued_requests": 5, + "active_requests": 3, + "used_kv_pages": 10 + }, + { + "tick": 5, + "admitted": [], + "decoded": [ + "interactive-003", + "interactive-001" + ], + "completed": [ + "interactive-001" + ], + "queued_requests": 5, + "active_requests": 2, + "used_kv_pages": 7 + }, + { + "tick": 6, + "admitted": [ + "interactive-004" + ], + "decoded": [ + "interactive-002", + "interactive-003" + ], + "completed": [ + "interactive-002", + "interactive-003" + ], + "queued_requests": 4, + "active_requests": 1, + "used_kv_pages": 3 + }, + { + "tick": 7, + "admitted": [ + "interactive-005" + ], + "decoded": [ + "interactive-004", + "interactive-005" + ], + "completed": [], + "queued_requests": 3, + "active_requests": 2, + "used_kv_pages": 6 + }, + { + "tick": 8, + "admitted": [ + "batch-001" + ], + "decoded": [ + "interactive-004", + "interactive-005" + ], + "completed": [], + "queued_requests": 2, + "active_requests": 3, + "used_kv_pages": 12 + }, + { + "tick": 9, + "admitted": [], + "decoded": [ + "batch-001", + "interactive-004" + ], + "completed": [], + "queued_requests": 2, + "active_requests": 3, + "used_kv_pages": 12 + }, + { + "tick": 10, + "admitted": [], + "decoded": [ + "interactive-005", + "batch-001" + ], + "completed": [ + "interactive-005" + ], + "queued_requests": 2, + "active_requests": 2, + "used_kv_pages": 9 + }, + { + "tick": 11, + "admitted": [ + "batch-003" + ], + "decoded": [ + "interactive-004", + "batch-001" + ], + "completed": [ + "interactive-004" + ], + "queued_requests": 1, + "active_requests": 2, + "used_kv_pages": 12 + }, + { + "tick": 12, + "admitted": [], + "decoded": [ + "batch-003", + "batch-001" + ], + "completed": [], + "queued_requests": 1, + "active_requests": 2, + "used_kv_pages": 12 + }, + { + "tick": 13, + "admitted": [], + "decoded": [ + "batch-003", + "batch-001" + ], + "completed": [], + "queued_requests": 1, + "active_requests": 2, + "used_kv_pages": 12 + }, + { + "tick": 14, + "admitted": [], + "decoded": [ + "batch-003", + "batch-001" + ], + "completed": [], + "queued_requests": 1, + "active_requests": 2, + "used_kv_pages": 12 + }, + { + "tick": 15, + "admitted": [], + "decoded": [ + "batch-003", + "batch-001" + ], + "completed": [], + "queued_requests": 1, + "active_requests": 2, + "used_kv_pages": 12 + }, + { + "tick": 16, + "admitted": [], + "decoded": [ + "batch-003", + "batch-001" + ], + "completed": [], + "queued_requests": 1, + "active_requests": 2, + "used_kv_pages": 12 + }, + { + "tick": 17, + "admitted": [], + "decoded": [ + "batch-003", + "batch-001" + ], + "completed": [], + "queued_requests": 1, + "active_requests": 2, + "used_kv_pages": 12 + }, + { + "tick": 18, + "admitted": [], + "decoded": [ + "batch-003", + "batch-001" + ], + "completed": [ + "batch-001" + ], + "queued_requests": 1, + "active_requests": 1, + "used_kv_pages": 6 + }, + { + "tick": 19, + "admitted": [ + "batch-002" + ], + "decoded": [ + "batch-003", + "batch-002" + ], + "completed": [], + "queued_requests": 0, + "active_requests": 2, + "used_kv_pages": 13 + }, + { + "tick": 20, + "admitted": [], + "decoded": [ + "batch-003", + "batch-002" + ], + "completed": [ + "batch-003" + ], + "queued_requests": 0, + "active_requests": 1, + "used_kv_pages": 7 + }, + { + "tick": 21, + "admitted": [], + "decoded": [ + "batch-002" + ], + "completed": [], + "queued_requests": 0, + "active_requests": 1, + "used_kv_pages": 7 + }, + { + "tick": 22, + "admitted": [], + "decoded": [ + "batch-002" + ], + "completed": [], + "queued_requests": 0, + "active_requests": 1, + "used_kv_pages": 7 + }, + { + "tick": 23, + "admitted": [], + "decoded": [ + "batch-002" + ], + "completed": [], + "queued_requests": 0, + "active_requests": 1, + "used_kv_pages": 7 + }, + { + "tick": 24, + "admitted": [], + "decoded": [ + "batch-002" + ], + "completed": [], + "queued_requests": 0, + "active_requests": 1, + "used_kv_pages": 7 + }, + { + "tick": 25, + "admitted": [], + "decoded": [ + "batch-002" + ], + "completed": [], + "queued_requests": 0, + "active_requests": 1, + "used_kv_pages": 7 + }, + { + "tick": 26, + "admitted": [], + "decoded": [ + "batch-002" + ], + "completed": [ + "batch-002" + ], + "queued_requests": 0, + "active_requests": 0, + "used_kv_pages": 0 + } + ] +} diff --git a/artifacts/workload-replay.json b/artifacts/workload-replay.json index 7f30a3d..17d8a4a 100644 --- a/artifacts/workload-replay.json +++ b/artifacts/workload-replay.json @@ -1,5 +1,5 @@ { - "schema_version": 1, + "schema_version": 2, "config": { "max_active_requests": 3, "decode_batch_size": 2, @@ -10,7 +10,14 @@ "request_count": 4, "completed_requests": 4, "total_ticks": 11, + "max_queued_requests": 1, + "max_active_requests": 3, "peak_kv_pages": 12, + "peak_kv_pressure_pct": 60.0, + "ticks_with_queue_pressure": 3, + "ticks_at_active_capacity": 3, + "queue_pressure_ratio": 0.272727, + "active_capacity_ratio": 0.272727, "final_kv_pages": 0, "trace_fingerprint": "394166dc24d38b6c", "ticks": [ diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index dc79fa5..87a36fb 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -29,6 +29,12 @@ runner preserves input order for equal arrival times and emits a stable FNV-1a fingerprint over the serialized trace. Replaying the same input should produce the same admission, decode, completion, and memory sequence. +The replay report also summarizes workload pressure: maximum queued requests, +maximum active requests, peak KV pages, peak KV pressure percentage, ticks with +queued requests, ticks at active capacity, and pressure ratios over the full +run. Those fields make stress fixtures reviewable without requiring a reader to +manually inspect every tick. + ## Release Gate The release evaluator joins baseline and candidate observations by request ID. diff --git a/fixtures/workload_pressure.json b/fixtures/workload_pressure.json new file mode 100644 index 0000000..22b8be5 --- /dev/null +++ b/fixtures/workload_pressure.json @@ -0,0 +1,67 @@ +{ + "config": { + "max_active_requests": 3, + "decode_batch_size": 2, + "max_prefill_tokens_per_tick": 96, + "kv_page_tokens": 16, + "total_kv_pages": 15 + }, + "requests": [ + { + "id": "interactive-001", + "prompt_tokens": 32, + "max_output_tokens": 5, + "priority": 9, + "arrival_tick": 0 + }, + { + "id": "interactive-002", + "prompt_tokens": 48, + "max_output_tokens": 5, + "priority": 9, + "arrival_tick": 0 + }, + { + "id": "batch-001", + "prompt_tokens": 80, + "max_output_tokens": 10, + "priority": 1, + "arrival_tick": 0 + }, + { + "id": "interactive-003", + "prompt_tokens": 32, + "max_output_tokens": 4, + "priority": 8, + "arrival_tick": 1 + }, + { + "id": "batch-002", + "prompt_tokens": 96, + "max_output_tokens": 8, + "priority": 1, + "arrival_tick": 1 + }, + { + "id": "interactive-004", + "prompt_tokens": 40, + "max_output_tokens": 4, + "priority": 8, + "arrival_tick": 2 + }, + { + "id": "batch-003", + "prompt_tokens": 72, + "max_output_tokens": 9, + "priority": 1, + "arrival_tick": 3 + }, + { + "id": "interactive-005", + "prompt_tokens": 32, + "max_output_tokens": 3, + "priority": 7, + "arrival_tick": 4 + } + ] +} diff --git a/src/scheduler.rs b/src/scheduler.rs index fb46204..7785d3a 100644 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -276,14 +276,21 @@ pub struct ReplayInput { pub requests: Vec, } -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct ReplayReport { pub schema_version: u32, pub config: SchedulerConfig, pub request_count: usize, pub completed_requests: usize, pub total_ticks: u64, + pub max_queued_requests: usize, + pub max_active_requests: usize, pub peak_kv_pages: u32, + pub peak_kv_pressure_pct: f64, + pub ticks_with_queue_pressure: u64, + pub ticks_at_active_capacity: u64, + pub queue_pressure_ratio: f64, + pub active_capacity_ratio: f64, pub final_kv_pages: u32, pub trace_fingerprint: String, pub ticks: Vec, @@ -320,16 +327,42 @@ pub fn run_replay(input: ReplayInput) -> Result { .map(|tick| tick.used_kv_pages) .max() .unwrap_or(0); + let max_queued_requests = ticks + .iter() + .map(|tick| tick.queued_requests) + .max() + .unwrap_or(0); + let max_active_requests = ticks + .iter() + .map(|tick| tick.active_requests) + .max() + .unwrap_or(0); + let ticks_with_queue_pressure = + ticks.iter().filter(|tick| tick.queued_requests > 0).count() as u64; + let max_active_capacity = input.config.max_active_requests; + let total_kv_pages = input.config.total_kv_pages; + let ticks_at_active_capacity = ticks + .iter() + .filter(|tick| tick.active_requests >= max_active_capacity) + .count() as u64; + let total_ticks = scheduler.current_tick(); let trace_bytes = serde_json::to_vec(&ticks) .map_err(|error| RuntimeError::Serialization(error.to_string()))?; Ok(ReplayReport { - schema_version: 1, + schema_version: 2, config: input.config, request_count, completed_requests: scheduler.completed_count(), - total_ticks: scheduler.current_tick(), + total_ticks, + max_queued_requests, + max_active_requests, peak_kv_pages, + peak_kv_pressure_pct: pressure_pct(peak_kv_pages, total_kv_pages), + ticks_with_queue_pressure, + ticks_at_active_capacity, + queue_pressure_ratio: ratio(ticks_with_queue_pressure, total_ticks), + active_capacity_ratio: ratio(ticks_at_active_capacity, total_ticks), final_kv_pages: 0, trace_fingerprint: fnv1a64_hex(&trace_bytes), ticks, @@ -345,6 +378,26 @@ fn fnv1a64_hex(bytes: &[u8]) -> String { format!("{hash:016x}") } +fn pressure_pct(used: u32, capacity: u32) -> f64 { + if capacity == 0 { + 0.0 + } else { + round6((f64::from(used) / f64::from(capacity)) * 100.0) + } +} + +fn ratio(numerator: u64, denominator: u64) -> f64 { + if denominator == 0 { + 0.0 + } else { + round6(numerator as f64 / denominator as f64) + } +} + +fn round6(value: f64) -> f64 { + (value * 1_000_000.0).round() / 1_000_000.0 +} + #[derive(Debug, Clone, PartialEq, Eq)] pub enum RuntimeError { InvalidConfig(String), diff --git a/tests/scheduler.rs b/tests/scheduler.rs index f47cd7a..fffc8e4 100644 --- a/tests/scheduler.rs +++ b/tests/scheduler.rs @@ -38,6 +38,11 @@ fn replay_is_deterministic() { assert_eq!(first, second); assert_eq!(first.ticks[0].admitted, vec!["urgent", "normal"]); assert_eq!(first.completed_requests, 2); + assert_eq!(first.max_queued_requests, 0); + assert_eq!(first.max_active_requests, 2); + assert_eq!(first.peak_kv_pressure_pct, 62.5); + assert_eq!(first.queue_pressure_ratio, 0.0); + assert_eq!(first.active_capacity_ratio, 0.333333); assert_eq!(first.final_kv_pages, 0); } @@ -102,3 +107,25 @@ fn round_robin_decode_prevents_active_request_starvation() { let decoded: Vec = (0..3).flat_map(|_| scheduler.step().decoded).collect(); assert_eq!(decoded, vec!["a", "b", "c"]); } + +#[test] +fn pressure_replay_reports_queue_and_capacity_summary() { + let input: ReplayInput = + serde_json::from_str(include_str!("../fixtures/workload_pressure.json")).unwrap(); + + let report = run_replay(input).unwrap(); + + assert_eq!(report.schema_version, 2); + assert_eq!(report.request_count, report.completed_requests); + assert!(report.max_queued_requests >= 3); + assert_eq!( + report.max_active_requests, + report.config.max_active_requests + ); + assert!(report.peak_kv_pressure_pct > 80.0); + assert!(report.ticks_with_queue_pressure > 0); + assert!(report.ticks_at_active_capacity > 0); + assert!(report.queue_pressure_ratio > 0.0); + assert!(report.active_capacity_ratio > 0.0); + assert_eq!(report.final_kv_pages, 0); +} From cd4d3b2c8b9268e5e807899f6ab15c56bfe64d85 Mon Sep 17 00:00:00 2001 From: WaffleBits <19478926+WaffleBits@users.noreply.github.com> Date: Fri, 26 Jun 2026 09:05:49 -0400 Subject: [PATCH 3/3] Add release gate triage output --- README.md | 7 +- artifacts/backend-mirror-report.json | 3 +- artifacts/release-gate-numeric-tolerance.json | 3 +- artifacts/release-gate-promote.json | 3 +- artifacts/release-gate-rollback.json | 18 +- docs/ARCHITECTURE.md | 2 + docs/RELEASE_VALIDATION.md | 13 ++ src/lib.rs | 2 +- src/release.rs | 179 ++++++++++++++---- tests/adapter.rs | 15 ++ tests/release.rs | 22 +++ 11 files changed, 222 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index dc29469..44d3e92 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,8 @@ replayable scheduling traces, and canary/shadow release decisions. - Exact output checks, model-aware numeric tolerances for backend drift, per-segment release summaries, error-rate deltas, p95 latency regression policy, TTFT and decode-token p95 checks, KV memory-pressure reporting, - model-version transitions, token-trace fingerprints, tests, and CI. + model-version transitions, token-trace fingerprints, structured triage + owner hints, tests, and CI. ## Quick Start @@ -114,7 +115,9 @@ The gate joins mirrored baseline and candidate observations by request ID. Outputs can be validated either by exact fingerprint or by a configured numeric tolerance scoped to model, candidate backend, and accelerator. Reports include aggregate metrics plus segment summaries so hardware/backend-specific -regressions remain visible. +regressions remain visible. Hold and rollback reports also include structured +triage items that name the failed signal, the recommended response, an owner +hint, and the next investigation action. | Signal | Response | |---|---| diff --git a/artifacts/backend-mirror-report.json b/artifacts/backend-mirror-report.json index 1f5b3fe..fac4fd0 100644 --- a/artifacts/backend-mirror-report.json +++ b/artifacts/backend-mirror-report.json @@ -1,5 +1,5 @@ { - "schema_version": 2, + "schema_version": 3, "decision": "promote", "matched_requests": 4, "baseline_requests": 4, @@ -54,6 +54,7 @@ "token_trace_mismatch_rate": 0.0 } ], + "triage": [], "reasons": [ "candidate stayed within correctness, reliability, latency, and telemetry policy" ] diff --git a/artifacts/release-gate-numeric-tolerance.json b/artifacts/release-gate-numeric-tolerance.json index a1d9026..0507ae5 100644 --- a/artifacts/release-gate-numeric-tolerance.json +++ b/artifacts/release-gate-numeric-tolerance.json @@ -1,5 +1,5 @@ { - "schema_version": 2, + "schema_version": 3, "decision": "promote", "matched_requests": 4, "baseline_requests": 4, @@ -54,6 +54,7 @@ "token_trace_mismatch_rate": 0.0 } ], + "triage": [], "reasons": [ "candidate stayed within correctness, reliability, latency, and telemetry policy" ] diff --git a/artifacts/release-gate-promote.json b/artifacts/release-gate-promote.json index 446229e..9936d06 100644 --- a/artifacts/release-gate-promote.json +++ b/artifacts/release-gate-promote.json @@ -1,5 +1,5 @@ { - "schema_version": 2, + "schema_version": 3, "decision": "promote", "matched_requests": 4, "baseline_requests": 4, @@ -54,6 +54,7 @@ "token_trace_mismatch_rate": 0.0 } ], + "triage": [], "reasons": [ "candidate stayed within correctness, reliability, latency, and telemetry policy" ] diff --git a/artifacts/release-gate-rollback.json b/artifacts/release-gate-rollback.json index 8d30230..8a6a625 100644 --- a/artifacts/release-gate-rollback.json +++ b/artifacts/release-gate-rollback.json @@ -1,5 +1,5 @@ { - "schema_version": 2, + "schema_version": 3, "decision": "rollback", "matched_requests": 4, "baseline_requests": 4, @@ -54,6 +54,22 @@ "token_trace_mismatch_rate": 0.0 } ], + "triage": [ + { + "signal": "output_correctness", + "response": "rollback", + "owner_hint": "model_runtime_correctness", + "action": "block the candidate and compare output fingerprints or numeric tolerance scope", + "reason": "output mismatch rate 0.3333 exceeded 0.0000" + }, + { + "signal": "serving_reliability", + "response": "rollback", + "owner_hint": "serving_runtime", + "action": "block the candidate and inspect candidate errors before rerunning the gate", + "reason": "error-rate increase 0.2500 exceeded 0.0100" + } + ], "reasons": [ "output mismatch rate 0.3333 exceeded 0.0000", "error-rate increase 0.2500 exceeded 0.0100" diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 87a36fb..990c333 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -54,6 +54,8 @@ It computes: Correctness, numeric drift, or reliability regressions produce `rollback`. Latency, token-path, memory-pressure regressions, or incomplete evidence produce `hold`. A complete candidate within policy produces `promote`. +Hold and rollback reports include structured triage items so CI or rollout +tooling can route the failed signal to a likely owner without parsing prose. This is a local validation component, not a deployment controller. Production integration would obtain observations from mirrored traffic, canary diff --git a/docs/RELEASE_VALIDATION.md b/docs/RELEASE_VALIDATION.md index d44c082..a67b638 100644 --- a/docs/RELEASE_VALIDATION.md +++ b/docs/RELEASE_VALIDATION.md @@ -66,6 +66,19 @@ mismatch rate. Correct outputs with excessive latency or memory pressure produce `hold`, not `rollback`, because the evidence points to performance or capacity risk rather than a correctness failure. +## Triage Output + +Every hold or rollback reason is mirrored into a structured `triage` item. +Each item records: + +- the failed signal; +- the recommended response; +- an owner hint; and +- the next investigation action. + +This keeps the human-readable `reasons` field intact while making deploy-gate +output easier to route through CI, incident tooling, or rollout dashboards. + ## Backend Mirror Adapter The `mirror` command normalizes request observations from backend-specific diff --git a/src/lib.rs b/src/lib.rs index 056e1ac..2f8e644 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,7 +8,7 @@ pub use adapter::{ }; pub use release::{ GateDecision, GateInput, GateReport, GateThresholds, NumericTolerance, Observation, - OperationalTelemetry, SegmentReport, evaluate_release, + OperationalTelemetry, SegmentReport, TriageItem, evaluate_release, }; pub use scheduler::{ ReplayInput, ReplayReport, RequestSpec, RuntimeError, Scheduler, SchedulerConfig, TickTrace, diff --git a/src/release.rs b/src/release.rs index 9763c33..62e997b 100644 --- a/src/release.rs +++ b/src/release.rs @@ -150,6 +150,15 @@ pub enum GateDecision { Rollback, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct TriageItem { + pub signal: String, + pub response: GateDecision, + pub owner_hint: String, + pub action: String, + pub reason: String, +} + #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct SegmentReport { pub model: String, @@ -208,6 +217,7 @@ pub struct GateReport { pub token_trace_pairs: usize, pub token_trace_mismatch_rate: f64, pub segments: Vec, + pub triage: Vec, pub reasons: Vec, } @@ -373,80 +383,153 @@ pub fn evaluate_release(input: &GateInput) -> GateReport { let token_trace_mismatch_rate = ratio(token_trace_mismatches, token_trace_pairs); let mut reasons = Vec::new(); + let mut triage = Vec::new(); let insufficient_coverage = matched_requests < input.thresholds.min_matched_requests || matched_requests != baseline_requests || matched_requests != candidate_requests; if insufficient_coverage { - reasons.push(format!( - "matched {matched_requests} requests; need at least {} with complete coverage", - input.thresholds.min_matched_requests - )); + add_triage( + &mut reasons, + &mut triage, + "evidence_coverage", + GateDecision::Hold, + "release_validation", + "rerun mirrored traffic with complete baseline and candidate request coverage", + format!( + "matched {matched_requests} requests; need at least {} with complete coverage", + input.thresholds.min_matched_requests + ), + ); } if comparable_outputs.is_empty() { - reasons.push("no successful request pairs were available for output comparison".into()); + add_triage( + &mut reasons, + &mut triage, + "output_comparison_coverage", + GateDecision::Hold, + "release_validation", + "collect successful mirrored request pairs with auditable output material", + "no successful request pairs were available for output comparison".into(), + ); } let correctness_failed = output_mismatch_rate > input.thresholds.max_output_mismatch_rate; if correctness_failed { - reasons.push(format!( - "output mismatch rate {:.4} exceeded {:.4}", - output_mismatch_rate, input.thresholds.max_output_mismatch_rate - )); + add_triage( + &mut reasons, + &mut triage, + "output_correctness", + GateDecision::Rollback, + "model_runtime_correctness", + "block the candidate and compare output fingerprints or numeric tolerance scope", + format!( + "output mismatch rate {:.4} exceeded {:.4}", + output_mismatch_rate, input.thresholds.max_output_mismatch_rate + ), + ); } let numeric_correctness_failed = numeric_drift_rate > input.thresholds.max_numeric_drift_rate && numeric_pairs > 0; if numeric_correctness_failed { - reasons.push(format!( - "numeric drift rate {:.4} exceeded {:.4}", - numeric_drift_rate, input.thresholds.max_numeric_drift_rate - )); + add_triage( + &mut reasons, + &mut triage, + "numeric_correctness", + GateDecision::Rollback, + "accelerator_backend_correctness", + "block the candidate and review model, backend, and accelerator tolerance policy", + format!( + "numeric drift rate {:.4} exceeded {:.4}", + numeric_drift_rate, input.thresholds.max_numeric_drift_rate + ), + ); } let reliability_failed = error_rate_increase > input.thresholds.max_error_rate_increase; if reliability_failed { - reasons.push(format!( - "error-rate increase {:.4} exceeded {:.4}", - error_rate_increase, input.thresholds.max_error_rate_increase - )); + add_triage( + &mut reasons, + &mut triage, + "serving_reliability", + GateDecision::Rollback, + "serving_runtime", + "block the candidate and inspect candidate errors before rerunning the gate", + format!( + "error-rate increase {:.4} exceeded {:.4}", + error_rate_increase, input.thresholds.max_error_rate_increase + ), + ); } let latency_failed = p95_latency_regression_pct .is_some_and(|regression| regression > input.thresholds.max_p95_latency_regression_pct); if latency_failed { - reasons.push(format!( - "p95 latency regression {:.2}% exceeded {:.2}%", - p95_latency_regression_pct.unwrap_or_default(), - input.thresholds.max_p95_latency_regression_pct - )); + add_triage( + &mut reasons, + &mut triage, + "request_latency", + GateDecision::Hold, + "runtime_performance", + "profile scheduler and backend request path before rollout", + format!( + "p95 latency regression {:.2}% exceeded {:.2}%", + p95_latency_regression_pct.unwrap_or_default(), + input.thresholds.max_p95_latency_regression_pct + ), + ); } let ttft_failed = ttft_regression_pct .is_some_and(|regression| regression > input.thresholds.max_ttft_regression_pct); if ttft_failed { - reasons.push(format!( - "TTFT p95 regression {:.2}% exceeded {:.2}%", - ttft_regression_pct.unwrap_or_default(), - input.thresholds.max_ttft_regression_pct - )); + add_triage( + &mut reasons, + &mut triage, + "time_to_first_token", + GateDecision::Hold, + "prefill_runtime", + "inspect prefill queueing and first-token path before rollout", + format!( + "TTFT p95 regression {:.2}% exceeded {:.2}%", + ttft_regression_pct.unwrap_or_default(), + input.thresholds.max_ttft_regression_pct + ), + ); } let decode_token_latency_failed = decode_token_p95_regression_pct.is_some_and(|regression| { regression > input.thresholds.max_decode_token_p95_regression_pct }); if decode_token_latency_failed { - reasons.push(format!( - "decode-token p95 regression {:.2}% exceeded {:.2}%", - decode_token_p95_regression_pct.unwrap_or_default(), - input.thresholds.max_decode_token_p95_regression_pct - )); + add_triage( + &mut reasons, + &mut triage, + "decode_token_latency", + GateDecision::Hold, + "decode_runtime", + "inspect decode loop, batching policy, and backend saturation before rollout", + format!( + "decode-token p95 regression {:.2}% exceeded {:.2}%", + decode_token_p95_regression_pct.unwrap_or_default(), + input.thresholds.max_decode_token_p95_regression_pct + ), + ); } let memory_pressure_failed = max_candidate_memory_pressure_pct .is_some_and(|pressure| pressure > input.thresholds.max_candidate_memory_pressure_pct); if memory_pressure_failed { - reasons.push(format!( - "candidate memory pressure {:.2}% exceeded {:.2}%", - max_candidate_memory_pressure_pct.unwrap_or_default(), - input.thresholds.max_candidate_memory_pressure_pct - )); + add_triage( + &mut reasons, + &mut triage, + "kv_memory_pressure", + GateDecision::Hold, + "runtime_capacity", + "inspect KV reservation, queue pressure, and capacity policy before rollout", + format!( + "candidate memory pressure {:.2}% exceeded {:.2}%", + max_candidate_memory_pressure_pct.unwrap_or_default(), + input.thresholds.max_candidate_memory_pressure_pct + ), + ); } let decision = if correctness_failed || numeric_correctness_failed || reliability_failed { @@ -468,7 +551,7 @@ pub fn evaluate_release(input: &GateInput) -> GateReport { }; GateReport { - schema_version: 2, + schema_version: 3, decision, matched_requests, baseline_requests, @@ -497,10 +580,30 @@ pub fn evaluate_release(input: &GateInput) -> GateReport { token_trace_pairs, token_trace_mismatch_rate, segments: segment_reports(&matched, &matched_comparisons), + triage, reasons, } } +fn add_triage( + reasons: &mut Vec, + triage: &mut Vec, + signal: &str, + response: GateDecision, + owner_hint: &str, + action: &str, + reason: String, +) { + reasons.push(reason.clone()); + triage.push(TriageItem { + signal: signal.into(), + response, + owner_hint: owner_hint.into(), + action: action.into(), + reason, + }); +} + #[derive(Debug, Clone, Copy)] struct OutputComparison { matches: bool, diff --git a/tests/adapter.rs b/tests/adapter.rs index e6893c5..9c91247 100644 --- a/tests/adapter.rs +++ b/tests/adapter.rs @@ -190,6 +190,21 @@ fn holds_when_candidate_telemetry_exceeds_policy() { .iter() .any(|reason| reason.contains("memory pressure")) ); + assert!( + report.triage.iter().any( + |item| item.signal == "time_to_first_token" && item.owner_hint == "prefill_runtime" + ) + ); + assert!( + report.triage.iter().any( + |item| item.signal == "decode_token_latency" && item.owner_hint == "decode_runtime" + ) + ); + assert!( + report.triage.iter().any( + |item| item.signal == "kv_memory_pressure" && item.owner_hint == "runtime_capacity" + ) + ); } #[test] diff --git a/tests/release.rs b/tests/release.rs index 845b380..6191c70 100644 --- a/tests/release.rs +++ b/tests/release.rs @@ -60,6 +60,7 @@ fn promotes_candidate_within_policy() { }); assert_eq!(report.decision, GateDecision::Promote); + assert!(report.triage.is_empty()); } #[test] @@ -77,6 +78,13 @@ fn rolls_back_output_mismatch() { assert_eq!(report.decision, GateDecision::Rollback); assert_eq!(report.output_mismatch_rate, 0.25); + let triage = report + .triage + .iter() + .find(|item| item.signal == "output_correctness") + .expect("output mismatch should produce correctness triage"); + assert_eq!(triage.response, GateDecision::Rollback); + assert_eq!(triage.owner_hint, "model_runtime_correctness"); } #[test] @@ -94,6 +102,13 @@ fn holds_latency_regression_for_investigation() { assert_eq!(report.decision, GateDecision::Hold); assert!(report.p95_latency_regression_pct.unwrap() > 10.0); + let triage = report + .triage + .iter() + .find(|item| item.signal == "request_latency") + .expect("latency regression should produce performance triage"); + assert_eq!(triage.response, GateDecision::Hold); + assert_eq!(triage.owner_hint, "runtime_performance"); } #[test] @@ -172,6 +187,13 @@ fn rolls_back_numeric_outputs_outside_model_tolerance() { assert_eq!(report.decision, GateDecision::Rollback); assert_eq!(report.output_mismatch_rate, 0.333333); assert_eq!(report.numeric_drift_rate, 0.333333); + assert!( + report + .triage + .iter() + .any(|item| item.signal == "numeric_correctness" + && item.owner_hint == "accelerator_backend_correctness") + ); assert!( report .reasons