Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion crates/korg-runtime/src/execution/dag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,13 @@ impl SpeculativeScheduler {
}
}

/// Pre-warm execution resources. No-op in the embedded form (no bun discovery needed).
/// Legacy no-op stub. The `SpeculativeScheduler` is a test-only path with no
/// production caller, so "warming" it primed nothing the real campaign runs —
/// theater. The REAL warm boot now lives in [`super::warm_boot::warm_boot`]:
/// it pre-warms a shared `CARGO_TARGET_DIR` (`warm_target_dir(session)`) that
/// the live worker child (`SubprocessBackend::spawn`) reuses. This stub only
/// flips its idempotency flag and does no work; the campaign-level
/// `warm_boot(...)` call (gated by `--speculative`) is the real mechanism.
pub async fn speculative_warm_boot(&mut self) -> Result<()> {
if self.warm_boot_started {
return Ok(());
Expand Down
2 changes: 2 additions & 0 deletions crates/korg-runtime/src/execution/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ pub mod dag;
pub mod events;
pub mod pool;
pub mod recovery;
pub mod warm_boot;

pub use dag::{DagNode, ExecutionDag, ExecutionSummary, NodeStatus, SpeculativeScheduler};
pub use events::{BunEvent, BunEventOrOutcome, BunOutcome, EventLevel};
pub use recovery::{heal_node, heal_node_with_context};
pub use warm_boot::{warm_boot, warm_target_dir, WarmBootReport, WarmBootStatus};
326 changes: 326 additions & 0 deletions crates/korg-runtime/src/execution/warm_boot.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,326 @@
//! Real warm boot — pre-warm a shared `CARGO_TARGET_DIR` the live worker reuses.
//!
//! ## The honesty thesis
//! The legacy `SpeculativeScheduler::speculative_warm_boot` (`dag.rs`) primed a
//! test-only path no production campaign runs — theater. The *real* campaign work
//! is `leader → dispatch_level → spawn_worker_process → korg worker child →
//! observation::cargo_check`, a separate process whose `cargo check` runs cold in
//! each worktree.
//!
//! This module makes warm boot do REAL work the REAL worker reuses: both sides
//! independently derive the SAME stable [`warm_target_dir`] (no cross-process
//! plumbing), warm boot compiles the dependency graph into it once, and every
//! worker child is spawned with `CARGO_TARGET_DIR` pointing at it (cargo honors
//! that env automatically — `observation::cargo_check` needs no change). Reuse is
//! proven STRUCTURALLY (cache non-empty + worker env points at it), not by a flaky
//! timing benchmark.
//!
//! ## Hermetic contract
//! Default off (`enabled = false` → [`WarmBootStatus::Skipped`], no dir, no work).
//! When enabled, the warm `cargo check` is wrapped in a [`tokio::time::timeout`]
//! cap and a cargo-presence guard: cargo absent / spawn error / timeout →
//! [`WarmBootStatus::Unavailable`] with a log, never a hang/panic/`Err` that would
//! abort the campaign. The bare-host path always completes.

use std::path::{Path, PathBuf};
use std::time::Duration;

/// Hard cap on the warm `cargo check`. On a cold dependency graph a real check
/// can take a while; past this we degrade to the cold path rather than block the
/// campaign. Chosen generous (deps compile once) but bounded.
const WARM_BOOT_TIMEOUT: Duration = Duration::from_secs(60);

/// Outcome class of a warm-boot attempt.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WarmBootStatus {
/// Speculative was off — no dir created, no work done.
Skipped,
/// The shared cache was warmed by a successful `cargo check`.
Warmed,
/// Cargo absent, spawn failed, or the warm check timed out — degrade to the
/// cold path. NEVER an error that aborts the campaign.
Unavailable,
}

/// Report from a warm-boot attempt. `target_dir` is `Some` whenever a shared dir
/// was created (warmed or attempted), `None` when skipped. `populated` is the
/// structural reuse proof: the shared cache is non-empty after a successful warm.
#[derive(Debug, Clone)]
pub struct WarmBootReport {
pub status: WarmBootStatus,
pub target_dir: Option<PathBuf>,
pub populated: bool,
}

/// The STABLE shared cargo target dir for a campaign session.
///
/// Both warm boot and the worker spawn compute this independently from the same
/// `session_id`, so they agree on the path without passing data across the
/// process boundary. Same session → same path; different sessions → different.
///
/// Rooted at the OS cache dir (falling back to `~/.korg/cache`, then the system
/// temp dir) under `korg/target-<session_id>`.
pub fn warm_target_dir(session_id: &str) -> PathBuf {
cache_root()
.join("korg")
.join(format!("target-{session_id}"))
}

/// Deterministic cache root: OS cache dir → `~/.korg/cache` → temp dir.
fn cache_root() -> PathBuf {
if let Some(c) = dirs::cache_dir() {
return c;
}
if let Some(home) = dirs::home_dir() {
return home.join(".korg").join("cache");
}
std::env::temp_dir()
}

/// Pre-warm the shared cargo target dir for `session_id` by running `cargo check`
/// against `repo` with `CARGO_TARGET_DIR` set to [`warm_target_dir`].
///
/// Hermetic: `!enabled` → [`WarmBootStatus::Skipped`] (no dir, no work). When
/// enabled, the check is bounded by [`WARM_BOOT_TIMEOUT`] and guarded against a
/// missing `cargo` / spawn failure; any of those → [`WarmBootStatus::Unavailable`]
/// with a log. This function never returns an `Err` and never hangs.
pub async fn warm_boot(session_id: &str, repo: &Path, enabled: bool) -> WarmBootReport {
if !enabled {
return WarmBootReport {
status: WarmBootStatus::Skipped,
target_dir: None,
populated: false,
};
}

let target_dir = warm_target_dir(session_id);
if let Err(e) = std::fs::create_dir_all(&target_dir) {
tracing::warn!(
session_id,
target_dir = %target_dir.display(),
error = %e,
"warm_boot: could not create shared target dir — degrading to cold path"
);
return WarmBootReport {
status: WarmBootStatus::Unavailable,
target_dir: Some(target_dir),
populated: false,
};
}

// Spawn the warm check with CARGO_TARGET_DIR pointing at the shared cache.
// A spawn error here means cargo is absent / unusable → Unavailable.
let spawn = tokio::process::Command::new("cargo")
.arg("check")
.arg("--quiet")
.current_dir(repo)
.env("CARGO_TARGET_DIR", &target_dir)
.output();

let result = tokio::time::timeout(WARM_BOOT_TIMEOUT, spawn).await;

match result {
// Cargo ran (pass or fail). Either way the shared cache got populated with
// whatever compiled; success is the strong signal but a failed user crate
// can still leave dep artifacts. We classify on whether cargo ran cleanly.
Ok(Ok(output)) if output.status.success() => {
let populated = dir_is_non_empty(&target_dir);
tracing::info!(
session_id,
target_dir = %target_dir.display(),
populated,
"warm_boot: shared cargo cache warmed"
);
WarmBootReport {
status: WarmBootStatus::Warmed,
target_dir: Some(target_dir),
populated,
}
}
Ok(Ok(output)) => {
// cargo ran but the repo didn't compile (or isn't a crate). Not a warm
// success — degrade. The campaign's own cold checks still run normally.
tracing::warn!(
session_id,
target_dir = %target_dir.display(),
stderr = %String::from_utf8_lossy(&output.stderr),
"warm_boot: cargo check did not succeed — degrading to cold path"
);
let populated = dir_is_non_empty(&target_dir);
WarmBootReport {
status: WarmBootStatus::Unavailable,
target_dir: Some(target_dir),
populated,
}
}
Ok(Err(e)) => {
// Spawn error: cargo absent / not executable.
tracing::warn!(
session_id,
error = %e,
"warm_boot: cargo unavailable — degrading to cold path"
);
WarmBootReport {
status: WarmBootStatus::Unavailable,
target_dir: Some(target_dir),
populated: false,
}
}
Err(_elapsed) => {
// Timed out. Never hang the campaign.
tracing::warn!(
session_id,
timeout_secs = WARM_BOOT_TIMEOUT.as_secs(),
"warm_boot: cargo check timed out — degrading to cold path"
);
let populated = dir_is_non_empty(&target_dir);
WarmBootReport {
status: WarmBootStatus::Unavailable,
target_dir: Some(target_dir),
populated,
}
}
}
}

/// True if `dir` exists and contains at least one entry — the structural proof
/// that a real compilation cache was produced.
fn dir_is_non_empty(dir: &Path) -> bool {
std::fs::read_dir(dir)
.map(|mut it| it.next().is_some())
.unwrap_or(false)
}

#[cfg(test)]
mod tests {
use super::*;

/// A throwaway crate dir with a trivial valid lib — mirrors the fixture setup
/// in `observation.rs` tests. No git needed; `cargo check` only needs a crate.
fn tiny_crate() -> PathBuf {
let d = std::env::temp_dir().join(format!("korg-warm-{}", uuid::Uuid::new_v4()));
std::fs::create_dir_all(d.join("src")).unwrap();
std::fs::write(
d.join("Cargo.toml"),
"[package]\nname=\"t\"\nversion=\"0.1.0\"\nedition=\"2021\"\n[lib]\npath=\"src/lib.rs\"\n",
)
.unwrap();
std::fs::write(d.join("src/lib.rs"), "pub fn f() -> i64 { 1 }\n").unwrap();
d
}

#[test]
fn warm_target_dir_is_stable_per_session() {
let a1 = warm_target_dir("session-abc");
let a2 = warm_target_dir("session-abc");
assert_eq!(
a1, a2,
"same session must derive the same shared target dir"
);
}

#[test]
fn warm_target_dir_differs_across_sessions() {
let a = warm_target_dir("session-abc");
let b = warm_target_dir("session-xyz");
assert_ne!(a, b, "different sessions must derive different target dirs");
assert!(
a.ends_with("target-session-abc"),
"path must be session-scoped, got {}",
a.display()
);
}

#[tokio::test]
async fn warm_boot_disabled_is_skipped_and_creates_nothing() {
let session = format!("disabled-{}", uuid::Uuid::new_v4());
let dir = warm_target_dir(&session);
// Pre-condition: not present.
let _ = std::fs::remove_dir_all(&dir);

let repo = tiny_crate();
let report = warm_boot(&session, &repo, false).await;

assert_eq!(report.status, WarmBootStatus::Skipped);
assert!(report.target_dir.is_none(), "skipped must report no dir");
assert!(!report.populated);
assert!(
!dir.exists(),
"disabled warm boot must not create the target dir"
);
let _ = std::fs::remove_dir_all(&repo);
}

#[tokio::test]
async fn warm_boot_enabled_warms_a_non_empty_cache() {
// Skip gracefully if cargo isn't on PATH in this environment — the test
// proves the cache-population behavior, which requires a real cargo.
if which_cargo().is_none() {
eprintln!("skipping warm_boot_enabled_warms_a_non_empty_cache: cargo not on PATH");
return;
}
let session = format!("warmed-{}", uuid::Uuid::new_v4());
let dir = warm_target_dir(&session);
let _ = std::fs::remove_dir_all(&dir);

let repo = tiny_crate();
let report = warm_boot(&session, &repo, true).await;

assert_eq!(
report.status,
WarmBootStatus::Warmed,
"a valid tiny crate must warm successfully"
);
assert_eq!(report.target_dir.as_deref(), Some(dir.as_path()));
assert!(
report.populated,
"the shared cache must be non-empty (structural reuse proof)"
);
assert!(dir.exists() && dir_is_non_empty(&dir));

let _ = std::fs::remove_dir_all(&dir);
let _ = std::fs::remove_dir_all(&repo);
}

#[tokio::test]
async fn warm_boot_is_hermetic_when_cargo_absent() {
// Force cargo absent by running with an empty PATH for this process call.
// We can't mutate global PATH safely in parallel tests, so instead point
// the warm boot at a NON-crate dir AND verify it never panics / hangs and
// returns promptly. To force the cargo-absent branch deterministically,
// we temporarily clear PATH around the call via a child-friendly guard.
let session = format!("absent-{}", uuid::Uuid::new_v4());
let dir = warm_target_dir(&session);
let _ = std::fs::remove_dir_all(&dir);

// A dir that is not a cargo crate — cargo check will fail fast (or be
// absent). Either way: Unavailable, no panic, quick.
let not_a_crate =
std::env::temp_dir().join(format!("korg-notcrate-{}", uuid::Uuid::new_v4()));
std::fs::create_dir_all(&not_a_crate).unwrap();

let report = tokio::time::timeout(
Duration::from_secs(30),
warm_boot(&session, &not_a_crate, true),
)
.await
.expect("warm_boot must complete well within the timeout, never hang");

assert_eq!(
report.status,
WarmBootStatus::Unavailable,
"a non-crate / cargo-absent host must degrade to Unavailable, not panic"
);

let _ = std::fs::remove_dir_all(&dir);
let _ = std::fs::remove_dir_all(&not_a_crate);
}

/// Best-effort cargo presence probe for the conditional warm test.
fn which_cargo() -> Option<PathBuf> {
let path = std::env::var_os("PATH")?;
std::env::split_paths(&path)
.map(|p| p.join("cargo"))
.find(|c| c.exists())
}
}
Loading
Loading