diff --git a/README.md b/README.md index 44d3e92..bb816f1 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,8 @@ replayable scheduling traces, and canary/shadow release decisions. exceeding declared capacity. - Round-robin decode scheduling so active requests make measurable progress. - Deterministic workload replay with a machine-readable trace fingerprint, - queue-pressure summary, active-capacity summary, and KV-pressure summary. + queue-pressure summary, active-capacity summary, KV-pressure summary, and + replay-level capacity envelope. - Baseline/candidate release validation with `promote`, `hold`, and `rollback` outcomes. - Backend mirror normalization for vLLM/SGLang-style serving observations @@ -65,14 +66,17 @@ model-version transition metadata, queue depth, KV memory pressure, TTFT, and decode-token p95 telemetry. The checked workload fixture completes four requests in 11 scheduler ticks, +accounts for 224 prompt tokens, 18 decode tokens, and 18 reserved KV pages, peaks at 12 of 20 KV pages, records three queued-pressure ticks, records three -active-capacity ticks, returns all pages on completion, and emits trace -fingerprint `394166dc24d38b6c`. +active-capacity ticks, reports 0.818182 decode-capacity utilization, returns +all pages on completion, and emits trace fingerprint `b454ea97ea75ee90`. The pressure fixture completes eight mixed-priority requests in 27 scheduler ticks, records a maximum queue depth of five, reaches all three active slots, -peaks at 13 of 15 KV pages, reports 86.666667% peak KV pressure, and returns -all pages on completion. +accounts for 432 prompt tokens, 48 decode tokens, and 35 reserved KV pages, +peaks at 13 of 15 KV pages, reports 86.666667% peak KV pressure, records +0.888889 decode-capacity utilization and 0.595062 KV-page occupancy, and +returns all pages on completion. ## Runtime Model @@ -85,13 +89,17 @@ order with a configurable batch width. Every tick records: - admitted request IDs; +- admitted prefill tokens; - decoded and completed request IDs; +- decoded token count; - queued and active counts; and - used KV pages. The replay report includes a stable trace fingerprint, peak KV pages, peak KV pressure percentage, maximum queued and active request counts, queue-pressure -ticks, active-capacity ticks, total ticks, and completion count. +ticks, active-capacity ticks, total prompt and decode tokens, total reserved KV +pages, declared prefill/decode/KV capacity, utilization ratios, total ticks, +and completion count. ## Backend Mirror Adapter diff --git a/artifacts/workload-pressure-replay.json b/artifacts/workload-pressure-replay.json index 0e7c4f5..f53a53e 100644 --- a/artifacts/workload-pressure-replay.json +++ b/artifacts/workload-pressure-replay.json @@ -1,5 +1,5 @@ { - "schema_version": 2, + "schema_version": 3, "config": { "max_active_requests": 3, "decode_batch_size": 2, @@ -9,6 +9,9 @@ }, "request_count": 8, "completed_requests": 8, + "total_prompt_tokens": 432, + "total_decode_tokens": 48, + "total_reserved_kv_pages": 35, "total_ticks": 27, "max_queued_requests": 5, "max_active_requests": 3, @@ -16,10 +19,16 @@ "peak_kv_pressure_pct": 86.666667, "ticks_with_queue_pressure": 19, "ticks_at_active_capacity": 6, + "declared_prefill_capacity_tokens": 2592, + "declared_decode_capacity_tokens": 54, + "kv_page_capacity_ticks": 405, "queue_pressure_ratio": 0.703704, "active_capacity_ratio": 0.222222, + "prefill_capacity_utilization": 0.166667, + "decode_capacity_utilization": 0.888889, + "kv_page_occupancy_ratio": 0.595062, "final_kv_pages": 0, - "trace_fingerprint": "81dcc3b8c6005eba", + "trace_fingerprint": "fe52cca977038f01", "ticks": [ { "tick": 0, @@ -27,10 +36,12 @@ "interactive-001", "interactive-002" ], + "admitted_prefill_tokens": 80, "decoded": [ "interactive-001", "interactive-002" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 1, "active_requests": 2, @@ -41,10 +52,12 @@ "admitted": [ "interactive-003" ], + "admitted_prefill_tokens": 32, "decoded": [ "interactive-001", "interactive-002" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 2, "active_requests": 3, @@ -53,10 +66,12 @@ { "tick": 2, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "interactive-003", "interactive-001" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 3, "active_requests": 3, @@ -65,10 +80,12 @@ { "tick": 3, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "interactive-002", "interactive-003" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 4, "active_requests": 3, @@ -77,10 +94,12 @@ { "tick": 4, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "interactive-001", "interactive-002" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 5, "active_requests": 3, @@ -89,10 +108,12 @@ { "tick": 5, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "interactive-003", "interactive-001" ], + "decoded_tokens": 2, "completed": [ "interactive-001" ], @@ -105,10 +126,12 @@ "admitted": [ "interactive-004" ], + "admitted_prefill_tokens": 40, "decoded": [ "interactive-002", "interactive-003" ], + "decoded_tokens": 2, "completed": [ "interactive-002", "interactive-003" @@ -122,10 +145,12 @@ "admitted": [ "interactive-005" ], + "admitted_prefill_tokens": 32, "decoded": [ "interactive-004", "interactive-005" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 3, "active_requests": 2, @@ -136,10 +161,12 @@ "admitted": [ "batch-001" ], + "admitted_prefill_tokens": 80, "decoded": [ "interactive-004", "interactive-005" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 2, "active_requests": 3, @@ -148,10 +175,12 @@ { "tick": 9, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-001", "interactive-004" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 2, "active_requests": 3, @@ -160,10 +189,12 @@ { "tick": 10, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "interactive-005", "batch-001" ], + "decoded_tokens": 2, "completed": [ "interactive-005" ], @@ -176,10 +207,12 @@ "admitted": [ "batch-003" ], + "admitted_prefill_tokens": 72, "decoded": [ "interactive-004", "batch-001" ], + "decoded_tokens": 2, "completed": [ "interactive-004" ], @@ -190,10 +223,12 @@ { "tick": 12, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-003", "batch-001" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 1, "active_requests": 2, @@ -202,10 +237,12 @@ { "tick": 13, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-003", "batch-001" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 1, "active_requests": 2, @@ -214,10 +251,12 @@ { "tick": 14, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-003", "batch-001" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 1, "active_requests": 2, @@ -226,10 +265,12 @@ { "tick": 15, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-003", "batch-001" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 1, "active_requests": 2, @@ -238,10 +279,12 @@ { "tick": 16, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-003", "batch-001" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 1, "active_requests": 2, @@ -250,10 +293,12 @@ { "tick": 17, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-003", "batch-001" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 1, "active_requests": 2, @@ -262,10 +307,12 @@ { "tick": 18, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-003", "batch-001" ], + "decoded_tokens": 2, "completed": [ "batch-001" ], @@ -278,10 +325,12 @@ "admitted": [ "batch-002" ], + "admitted_prefill_tokens": 96, "decoded": [ "batch-003", "batch-002" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 0, "active_requests": 2, @@ -290,10 +339,12 @@ { "tick": 20, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-003", "batch-002" ], + "decoded_tokens": 2, "completed": [ "batch-003" ], @@ -304,9 +355,11 @@ { "tick": 21, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-002" ], + "decoded_tokens": 1, "completed": [], "queued_requests": 0, "active_requests": 1, @@ -315,9 +368,11 @@ { "tick": 22, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-002" ], + "decoded_tokens": 1, "completed": [], "queued_requests": 0, "active_requests": 1, @@ -326,9 +381,11 @@ { "tick": 23, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-002" ], + "decoded_tokens": 1, "completed": [], "queued_requests": 0, "active_requests": 1, @@ -337,9 +394,11 @@ { "tick": 24, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-002" ], + "decoded_tokens": 1, "completed": [], "queued_requests": 0, "active_requests": 1, @@ -348,9 +407,11 @@ { "tick": 25, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-002" ], + "decoded_tokens": 1, "completed": [], "queued_requests": 0, "active_requests": 1, @@ -359,9 +420,11 @@ { "tick": 26, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-002" ], + "decoded_tokens": 1, "completed": [ "batch-002" ], diff --git a/artifacts/workload-replay.json b/artifacts/workload-replay.json index 17d8a4a..8acdba5 100644 --- a/artifacts/workload-replay.json +++ b/artifacts/workload-replay.json @@ -1,5 +1,5 @@ { - "schema_version": 2, + "schema_version": 3, "config": { "max_active_requests": 3, "decode_batch_size": 2, @@ -9,6 +9,9 @@ }, "request_count": 4, "completed_requests": 4, + "total_prompt_tokens": 224, + "total_decode_tokens": 18, + "total_reserved_kv_pages": 18, "total_ticks": 11, "max_queued_requests": 1, "max_active_requests": 3, @@ -16,10 +19,16 @@ "peak_kv_pressure_pct": 60.0, "ticks_with_queue_pressure": 3, "ticks_at_active_capacity": 3, + "declared_prefill_capacity_tokens": 1056, + "declared_decode_capacity_tokens": 22, + "kv_page_capacity_ticks": 220, "queue_pressure_ratio": 0.272727, "active_capacity_ratio": 0.272727, + "prefill_capacity_utilization": 0.212121, + "decode_capacity_utilization": 0.818182, + "kv_page_occupancy_ratio": 0.4, "final_kv_pages": 0, - "trace_fingerprint": "394166dc24d38b6c", + "trace_fingerprint": "b454ea97ea75ee90", "ticks": [ { "tick": 0, @@ -27,10 +36,12 @@ "interactive-001", "batch-001" ], + "admitted_prefill_tokens": 96, "decoded": [ "interactive-001", "batch-001" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 0, "active_requests": 2, @@ -41,10 +52,12 @@ "admitted": [ "interactive-002" ], + "admitted_prefill_tokens": 48, "decoded": [ "interactive-001", "batch-001" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 0, "active_requests": 3, @@ -53,10 +66,12 @@ { "tick": 2, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "interactive-002", "interactive-001" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 1, "active_requests": 3, @@ -65,10 +80,12 @@ { "tick": 3, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-001", "interactive-002" ], + "decoded_tokens": 2, "completed": [], "queued_requests": 1, "active_requests": 3, @@ -77,10 +94,12 @@ { "tick": 4, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "interactive-001", "batch-001" ], + "decoded_tokens": 2, "completed": [ "interactive-001" ], @@ -93,10 +112,12 @@ "admitted": [ "batch-002" ], + "admitted_prefill_tokens": 80, "decoded": [ "interactive-002", "batch-001" ], + "decoded_tokens": 2, "completed": [ "interactive-002" ], @@ -107,10 +128,12 @@ { "tick": 6, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-002", "batch-001" ], + "decoded_tokens": 2, "completed": [ "batch-001" ], @@ -121,9 +144,11 @@ { "tick": 7, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-002" ], + "decoded_tokens": 1, "completed": [], "queued_requests": 0, "active_requests": 1, @@ -132,9 +157,11 @@ { "tick": 8, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-002" ], + "decoded_tokens": 1, "completed": [], "queued_requests": 0, "active_requests": 1, @@ -143,9 +170,11 @@ { "tick": 9, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-002" ], + "decoded_tokens": 1, "completed": [], "queued_requests": 0, "active_requests": 1, @@ -154,9 +183,11 @@ { "tick": 10, "admitted": [], + "admitted_prefill_tokens": 0, "decoded": [ "batch-002" ], + "decoded_tokens": 1, "completed": [ "batch-002" ], diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 990c333..01689bb 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -29,11 +29,12 @@ runner preserves input order for equal arrival times and emits a stable FNV-1a fingerprint over the serialized trace. Replaying the same input should produce the same admission, decode, completion, and memory sequence. -The replay report also summarizes workload pressure: maximum queued requests, -maximum active requests, peak KV pages, peak KV pressure percentage, ticks with -queued requests, ticks at active capacity, and pressure ratios over the full -run. Those fields make stress fixtures reviewable without requiring a reader to -manually inspect every tick. +The replay report also summarizes workload pressure and capacity use: maximum +queued requests, maximum active requests, peak KV pages, peak KV pressure +percentage, ticks with queued requests, ticks at active capacity, total prompt +and decode tokens, total reserved KV pages, declared prefill/decode/KV capacity, +and utilization ratios over the full run. Those fields make stress fixtures +reviewable without requiring a reader to manually inspect every tick. ## Release Gate diff --git a/src/scheduler.rs b/src/scheduler.rs index 7785d3a..182119e 100644 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -115,7 +115,9 @@ struct ActiveRequest { pub struct TickTrace { pub tick: u64, pub admitted: Vec, + pub admitted_prefill_tokens: u32, pub decoded: Vec, + pub decoded_tokens: u32, pub completed: Vec, pub queued_requests: usize, pub active_requests: usize, @@ -172,15 +174,17 @@ impl Scheduler { let mut trace = TickTrace { tick: self.tick, admitted: Vec::new(), + admitted_prefill_tokens: 0, decoded: Vec::new(), + decoded_tokens: 0, completed: Vec::new(), queued_requests: 0, active_requests: 0, used_kv_pages: 0, }; - self.admit_requests(&mut trace.admitted); - self.decode_requests(&mut trace.decoded, &mut trace.completed); + trace.admitted_prefill_tokens = self.admit_requests(&mut trace.admitted); + trace.decoded_tokens = self.decode_requests(&mut trace.decoded, &mut trace.completed); trace.queued_requests = self.queue.len(); trace.active_requests = self.active.len(); @@ -197,10 +201,11 @@ impl Scheduler { self.completed_ids.len() } - fn admit_requests(&mut self, admitted: &mut Vec) { + fn admit_requests(&mut self, admitted: &mut Vec) -> u32 { let mut remaining_prefill = self.config.max_prefill_tokens_per_tick; let mut deferred = Vec::new(); let candidates = self.queue.len(); + let mut admitted_prefill_tokens = 0; for _ in 0..candidates { if self.active.len() >= self.config.max_active_requests { @@ -218,6 +223,7 @@ impl Scheduler { if pages_fit && prefill_fits { remaining_prefill = remaining_prefill.saturating_sub(candidate.spec.prompt_tokens); + admitted_prefill_tokens += candidate.spec.prompt_tokens; self.used_kv_pages += pages; let id = candidate.spec.id.clone(); self.active.insert( @@ -236,9 +242,10 @@ impl Scheduler { } self.queue.extend(deferred); + admitted_prefill_tokens } - fn decode_requests(&mut self, decoded: &mut Vec, completed: &mut Vec) { + fn decode_requests(&mut self, decoded: &mut Vec, completed: &mut Vec) -> u32 { let batch_size = self.config.decode_batch_size.min(self.decode_order.len()); for _ in 0..batch_size { @@ -267,6 +274,8 @@ impl Scheduler { self.decode_order.push_back(id); } } + + decoded.len() as u32 } } @@ -282,6 +291,9 @@ pub struct ReplayReport { pub config: SchedulerConfig, pub request_count: usize, pub completed_requests: usize, + pub total_prompt_tokens: u64, + pub total_decode_tokens: u64, + pub total_reserved_kv_pages: u64, pub total_ticks: u64, pub max_queued_requests: usize, pub max_active_requests: usize, @@ -289,8 +301,14 @@ pub struct ReplayReport { pub peak_kv_pressure_pct: f64, pub ticks_with_queue_pressure: u64, pub ticks_at_active_capacity: u64, + pub declared_prefill_capacity_tokens: u64, + pub declared_decode_capacity_tokens: u64, + pub kv_page_capacity_ticks: u64, pub queue_pressure_ratio: f64, pub active_capacity_ratio: f64, + pub prefill_capacity_utilization: f64, + pub decode_capacity_utilization: f64, + pub kv_page_occupancy_ratio: f64, pub final_kv_pages: u32, pub trace_fingerprint: String, pub ticks: Vec, @@ -300,6 +318,18 @@ pub fn run_replay(input: ReplayInput) -> Result { let mut scheduler = Scheduler::new(input.config.clone())?; let request_count = input.requests.len(); let mut pending: Vec<(usize, RequestSpec)> = input.requests.into_iter().enumerate().collect(); + let total_prompt_tokens = pending + .iter() + .map(|(_, request)| u64::from(request.prompt_tokens)) + .sum(); + let total_decode_tokens = pending + .iter() + .map(|(_, request)| u64::from(request.max_output_tokens)) + .sum(); + let total_reserved_kv_pages = pending + .iter() + .map(|(_, request)| u64::from(request.reserved_pages(input.config.kv_page_tokens))) + .sum(); pending.sort_by(|(left_index, left), (right_index, right)| { left.arrival_tick .cmp(&right.arrival_tick) @@ -346,14 +376,23 @@ pub fn run_replay(input: ReplayInput) -> Result { .filter(|tick| tick.active_requests >= max_active_capacity) .count() as u64; let total_ticks = scheduler.current_tick(); + let declared_prefill_capacity_tokens = + total_ticks.saturating_mul(u64::from(input.config.max_prefill_tokens_per_tick)); + let declared_decode_capacity_tokens = + total_ticks.saturating_mul(input.config.decode_batch_size as u64); + let kv_page_capacity_ticks = total_ticks.saturating_mul(u64::from(total_kv_pages)); + let occupied_kv_page_ticks = ticks.iter().map(|tick| u64::from(tick.used_kv_pages)).sum(); let trace_bytes = serde_json::to_vec(&ticks) .map_err(|error| RuntimeError::Serialization(error.to_string()))?; Ok(ReplayReport { - schema_version: 2, + schema_version: 3, config: input.config, request_count, completed_requests: scheduler.completed_count(), + total_prompt_tokens, + total_decode_tokens, + total_reserved_kv_pages, total_ticks, max_queued_requests, max_active_requests, @@ -361,8 +400,14 @@ pub fn run_replay(input: ReplayInput) -> Result { peak_kv_pressure_pct: pressure_pct(peak_kv_pages, total_kv_pages), ticks_with_queue_pressure, ticks_at_active_capacity, + declared_prefill_capacity_tokens, + declared_decode_capacity_tokens, + kv_page_capacity_ticks, queue_pressure_ratio: ratio(ticks_with_queue_pressure, total_ticks), active_capacity_ratio: ratio(ticks_at_active_capacity, total_ticks), + prefill_capacity_utilization: ratio(total_prompt_tokens, declared_prefill_capacity_tokens), + decode_capacity_utilization: ratio(total_decode_tokens, declared_decode_capacity_tokens), + kv_page_occupancy_ratio: ratio(occupied_kv_page_ticks, kv_page_capacity_ticks), final_kv_pages: 0, trace_fingerprint: fnv1a64_hex(&trace_bytes), ticks, diff --git a/tests/scheduler.rs b/tests/scheduler.rs index fffc8e4..19ed705 100644 --- a/tests/scheduler.rs +++ b/tests/scheduler.rs @@ -38,11 +38,21 @@ fn replay_is_deterministic() { assert_eq!(first, second); assert_eq!(first.ticks[0].admitted, vec!["urgent", "normal"]); assert_eq!(first.completed_requests, 2); + assert_eq!(first.schema_version, 3); + assert_eq!(first.total_prompt_tokens, 48); + assert_eq!(first.total_decode_tokens, 5); + assert_eq!(first.total_reserved_kv_pages, 5); assert_eq!(first.max_queued_requests, 0); assert_eq!(first.max_active_requests, 2); assert_eq!(first.peak_kv_pressure_pct, 62.5); + assert_eq!(first.declared_prefill_capacity_tokens, 144); + assert_eq!(first.declared_decode_capacity_tokens, 6); + assert_eq!(first.kv_page_capacity_ticks, 24); assert_eq!(first.queue_pressure_ratio, 0.0); assert_eq!(first.active_capacity_ratio, 0.333333); + assert_eq!(first.prefill_capacity_utilization, 0.333333); + assert_eq!(first.decode_capacity_utilization, 0.833333); + assert!(first.kv_page_occupancy_ratio > 0.0); assert_eq!(first.final_kv_pages, 0); } @@ -115,8 +125,11 @@ fn pressure_replay_reports_queue_and_capacity_summary() { let report = run_replay(input).unwrap(); - assert_eq!(report.schema_version, 2); + assert_eq!(report.schema_version, 3); assert_eq!(report.request_count, report.completed_requests); + assert_eq!(report.total_prompt_tokens, 432); + assert_eq!(report.total_decode_tokens, 48); + assert_eq!(report.total_reserved_kv_pages, 35); assert!(report.max_queued_requests >= 3); assert_eq!( report.max_active_requests, @@ -127,5 +140,8 @@ fn pressure_replay_reports_queue_and_capacity_summary() { assert!(report.ticks_at_active_capacity > 0); assert!(report.queue_pressure_ratio > 0.0); assert!(report.active_capacity_ratio > 0.0); + assert!(report.prefill_capacity_utilization > 0.0); + assert!(report.decode_capacity_utilization > 0.0); + assert!(report.kv_page_occupancy_ratio > 0.0); assert_eq!(report.final_kv_pages, 0); }