diff --git a/scripts/modal_sandbox.py b/scripts/modal_sandbox.py index 99f58855..cc97bd7f 100755 --- a/scripts/modal_sandbox.py +++ b/scripts/modal_sandbox.py @@ -445,6 +445,25 @@ def destroy(sandbox_id: str): logger.info("Terminated sandbox %s", sandbox_id) +SANDBOX_DEAD_MARKER = "OFFLOAD_SANDBOX_DEAD:" +# Exit code returned when the sandbox is dead. Distinct from 1 so that the +# Rust side could in principle short-circuit on the exit code alone, though +# the marker on stderr is the authoritative signal. +SANDBOX_DEAD_EXIT_CODE = 75 + + +def _emit_sandbox_dead(reason: str) -> None: + """Emit the SANDBOX_DEAD sentinel on a line by itself and exit. + + Offload's Rust side parses this marker out of stderr to map the failure + to a non-retryable ProviderError::SandboxDead, avoiding a multi-minute + Modal-SDK gRPC retry storm against a container that is already gone. + """ + sys.stderr.write("%s %s\n" % (SANDBOX_DEAD_MARKER, reason)) + sys.stderr.flush() + sys.exit(SANDBOX_DEAD_EXIT_CODE) + + @cli.command("download") @click.argument("sandbox_id") @click.argument("paths", nargs=-1, required=True) @@ -464,6 +483,16 @@ def download(sandbox_id: str, paths: tuple[str, ...]): """ sandbox = modal.Sandbox.from_id(sandbox_id) + # Bail fast if the sandbox is already dead. Without this, sandbox.open() + # below triggers the Modal SDK's own gRPC retry loop (~tens of seconds per + # attempt). Combined with offload's three rust-level retries, that adds up + # to roughly four minutes of polling a dead container before we give up. + poll_result = sandbox.poll() + if poll_result is not None: + _emit_sandbox_dead( + "sandbox %s already finished with returncode=%s" % (sandbox_id, poll_result) + ) + for path_spec in paths: if ":" not in path_spec: logger.error( @@ -482,6 +511,15 @@ def download(sandbox_id: str, paths: tuple[str, ...]): try: copy_from_sandbox(sandbox, remote_path, local_path) except Exception as e: + # Even with the pre-flight poll() above, the sandbox can die mid- + # download. Detect Modal's "Container ID ... finished" wording and + # surface it as the SANDBOX_DEAD sentinel so the rust side does + # not waste retries on a corpse. + err_text = str(e) + if "finished" in err_text and "status=" in err_text: + _emit_sandbox_dead( + "sandbox %s died mid-download: %s" % (sandbox_id, err_text) + ) logger.error("Failed to download %s: %s", remote_path, e) sys.exit(1) diff --git a/src/provider.rs b/src/provider.rs index b761d7c6..2b87e37d 100644 --- a/src/provider.rs +++ b/src/provider.rs @@ -64,7 +64,7 @@ impl std::fmt::Display for CostEstimate { /// /// Errors are categorized to enable appropriate handling strategies: /// - **Retryable**: `Timeout`, `Connection` - may succeed on retry -/// - **Fatal**: `CreateFailed`, `NotFound` - unlikely to succeed on retry +/// - **Fatal**: `CreateFailed`, `NotFound`, `SandboxDead` - unlikely to succeed on retry /// - **Resource**: `SandboxExhausted` - need to wait for resources /// #[derive(Debug, thiserror::Error)] @@ -86,6 +86,14 @@ pub enum ProviderError { #[error("Failed to download file: {0}")] DownloadFailed(String), + /// The sandbox has finished or failed and can no longer service requests. + /// + /// Distinct from [`DownloadFailed`] because it is **not** retryable: the + /// container is gone, so retrying will only burn time waiting for the + /// underlying SDK's own gRPC retries to give up. + #[error("Sandbox is dead: {0}")] + SandboxDead(String), + /// The specified sandbox was not found. /// /// May indicate the sandbox was terminated or never existed. diff --git a/src/provider/default.rs b/src/provider/default.rs index 7402e203..9560f2ad 100644 --- a/src/provider/default.rs +++ b/src/provider/default.rs @@ -14,6 +14,24 @@ use super::{ }; /// Modal non-preemptible pricing: $0.00003942 per CPU-core per second. const MODAL_CPU_COST_PER_CORE_PER_SEC: f64 = 0.00003942; + +/// Sentinel emitted by `modal_sandbox.py` (and any equivalent download script) +/// to signal that the underlying sandbox has finished/failed and can no longer +/// service requests. Detecting this lets us short-circuit the retry loop instead +/// of waiting for the Modal SDK's gRPC retries to give up. +const SANDBOX_DEAD_MARKER: &str = "OFFLOAD_SANDBOX_DEAD:"; + +/// Returns the human-readable reason if `stderr` contains the +/// [`SANDBOX_DEAD_MARKER`] sentinel, otherwise `None`. +fn parse_sandbox_dead_marker(stderr: &str) -> Option { + for line in stderr.lines() { + if let Some(idx) = line.find(SANDBOX_DEAD_MARKER) { + let reason = line[idx + SANDBOX_DEAD_MARKER.len()..].trim(); + return Some(reason.to_string()); + } + } + None +} use crate::config::{DefaultProviderConfig, SandboxConfig}; use crate::connector::{Connector, ShellConnector}; use crate::image_cache::{ImageBuilder, prepare_with_prewarm}; @@ -386,6 +404,12 @@ impl Sandbox for DefaultSandbox { let result = self.connector.run(&shell_cmd).await?; if result.exit_code != 0 { + if let Some(reason) = parse_sandbox_dead_marker(&result.stderr) { + return Err(ProviderError::SandboxDead(format!( + "sandbox {}: {}", + self.id, reason + ))); + } return Err(ProviderError::DownloadFailed(format!( "Download command failed: {}", result.stderr @@ -752,6 +776,31 @@ mod tests { ); } + #[test] + fn parse_sandbox_dead_marker_finds_sentinel() { + let stderr = "some prelude\nOFFLOAD_SANDBOX_DEAD: sb-abc123 finished 4s ago (status='failure')\nmore noise\n"; + let reason = parse_sandbox_dead_marker(stderr); + assert_eq!( + reason.as_deref(), + Some("sb-abc123 finished 4s ago (status='failure')") + ); + } + + #[test] + fn parse_sandbox_dead_marker_returns_none_when_absent() { + let stderr = "regular download error\nno sentinel here\n"; + assert!(parse_sandbox_dead_marker(stderr).is_none()); + } + + #[test] + fn parse_sandbox_dead_marker_handles_logger_prefix() { + // Real stderr from modal_sandbox.py is line-oriented; the sentinel may + // appear after a logger prefix on the same line. + let stderr = "ERROR:offload:OFFLOAD_SANDBOX_DEAD: container gone"; + let reason = parse_sandbox_dead_marker(stderr); + assert_eq!(reason.as_deref(), Some("container gone")); + } + /// Integration test for Modal sandbox download functionality via DefaultProvider. /// /// This test requires Modal credentials (MODAL_TOKEN_ID and MODAL_TOKEN_SECRET). diff --git a/src/provider/retry.rs b/src/provider/retry.rs index 7e1742eb..c971bc30 100644 --- a/src/provider/retry.rs +++ b/src/provider/retry.rs @@ -87,6 +87,13 @@ mod tests { assert!(!is_retryable(&ProviderError::CreateFailed("c".into()))); } + #[test] + fn sandbox_dead_is_not_retryable() { + assert!(!is_retryable(&ProviderError::SandboxDead( + "container finished".into() + ))); + } + // ── with_retry! macro behavior ─────────────────────────────────── #[tokio::test]