Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions scripts/modal_sandbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,25 @@ def destroy(sandbox_id: str):
logger.info("Terminated sandbox %s", sandbox_id)


SANDBOX_DEAD_MARKER = "OFFLOAD_SANDBOX_DEAD:"
# Exit code returned when the sandbox is dead. Distinct from 1 so that the
# Rust side could in principle short-circuit on the exit code alone, though
# the marker on stderr is the authoritative signal.
SANDBOX_DEAD_EXIT_CODE = 75


def _emit_sandbox_dead(reason: str) -> None:
"""Emit the SANDBOX_DEAD sentinel on a line by itself and exit.

Offload's Rust side parses this marker out of stderr to map the failure
to a non-retryable ProviderError::SandboxDead, avoiding a multi-minute
Modal-SDK gRPC retry storm against a container that is already gone.
"""
sys.stderr.write("%s %s\n" % (SANDBOX_DEAD_MARKER, reason))
sys.stderr.flush()
sys.exit(SANDBOX_DEAD_EXIT_CODE)


@cli.command("download")
@click.argument("sandbox_id")
@click.argument("paths", nargs=-1, required=True)
Expand All @@ -464,6 +483,16 @@ def download(sandbox_id: str, paths: tuple[str, ...]):
"""
sandbox = modal.Sandbox.from_id(sandbox_id)

# Bail fast if the sandbox is already dead. Without this, sandbox.open()
# below triggers the Modal SDK's own gRPC retry loop (~tens of seconds per
# attempt). Combined with offload's three rust-level retries, that adds up
# to roughly four minutes of polling a dead container before we give up.
poll_result = sandbox.poll()
if poll_result is not None:
_emit_sandbox_dead(
"sandbox %s already finished with returncode=%s" % (sandbox_id, poll_result)
)

for path_spec in paths:
if ":" not in path_spec:
logger.error(
Expand All @@ -482,6 +511,15 @@ def download(sandbox_id: str, paths: tuple[str, ...]):
try:
copy_from_sandbox(sandbox, remote_path, local_path)
except Exception as e:
# Even with the pre-flight poll() above, the sandbox can die mid-
# download. Detect Modal's "Container ID ... finished" wording and
# surface it as the SANDBOX_DEAD sentinel so the rust side does
# not waste retries on a corpse.
err_text = str(e)
if "finished" in err_text and "status=" in err_text:
_emit_sandbox_dead(
"sandbox %s died mid-download: %s" % (sandbox_id, err_text)
)
logger.error("Failed to download %s: %s", remote_path, e)
sys.exit(1)

Expand Down
10 changes: 9 additions & 1 deletion src/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ impl std::fmt::Display for CostEstimate {
///
/// Errors are categorized to enable appropriate handling strategies:
/// - **Retryable**: `Timeout`, `Connection` - may succeed on retry
/// - **Fatal**: `CreateFailed`, `NotFound` - unlikely to succeed on retry
/// - **Fatal**: `CreateFailed`, `NotFound`, `SandboxDead` - unlikely to succeed on retry
/// - **Resource**: `SandboxExhausted` - need to wait for resources
///
#[derive(Debug, thiserror::Error)]
Expand All @@ -86,6 +86,14 @@ pub enum ProviderError {
#[error("Failed to download file: {0}")]
DownloadFailed(String),

/// The sandbox has finished or failed and can no longer service requests.
///
/// Distinct from [`DownloadFailed`] because it is **not** retryable: the
/// container is gone, so retrying will only burn time waiting for the
/// underlying SDK's own gRPC retries to give up.
#[error("Sandbox is dead: {0}")]
SandboxDead(String),

/// The specified sandbox was not found.
///
/// May indicate the sandbox was terminated or never existed.
Expand Down
49 changes: 49 additions & 0 deletions src/provider/default.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,24 @@ use super::{
};
/// Modal non-preemptible pricing: $0.00003942 per CPU-core per second.
const MODAL_CPU_COST_PER_CORE_PER_SEC: f64 = 0.00003942;

/// Sentinel emitted by `modal_sandbox.py` (and any equivalent download script)
/// to signal that the underlying sandbox has finished/failed and can no longer
/// service requests. Detecting this lets us short-circuit the retry loop instead
/// of waiting for the Modal SDK's gRPC retries to give up.
const SANDBOX_DEAD_MARKER: &str = "OFFLOAD_SANDBOX_DEAD:";

/// Returns the human-readable reason if `stderr` contains the
/// [`SANDBOX_DEAD_MARKER`] sentinel, otherwise `None`.
fn parse_sandbox_dead_marker(stderr: &str) -> Option<String> {
for line in stderr.lines() {
if let Some(idx) = line.find(SANDBOX_DEAD_MARKER) {
let reason = line[idx + SANDBOX_DEAD_MARKER.len()..].trim();
return Some(reason.to_string());
}
}
None
}
use crate::config::{DefaultProviderConfig, SandboxConfig};
use crate::connector::{Connector, ShellConnector};
use crate::image_cache::{ImageBuilder, prepare_with_prewarm};
Expand Down Expand Up @@ -386,6 +404,12 @@ impl Sandbox for DefaultSandbox {
let result = self.connector.run(&shell_cmd).await?;

if result.exit_code != 0 {
if let Some(reason) = parse_sandbox_dead_marker(&result.stderr) {
return Err(ProviderError::SandboxDead(format!(
"sandbox {}: {}",
self.id, reason
)));
}
return Err(ProviderError::DownloadFailed(format!(
"Download command failed: {}",
result.stderr
Expand Down Expand Up @@ -752,6 +776,31 @@ mod tests {
);
}

#[test]
fn parse_sandbox_dead_marker_finds_sentinel() {
let stderr = "some prelude\nOFFLOAD_SANDBOX_DEAD: sb-abc123 finished 4s ago (status='failure')\nmore noise\n";
let reason = parse_sandbox_dead_marker(stderr);
assert_eq!(
reason.as_deref(),
Some("sb-abc123 finished 4s ago (status='failure')")
);
}

#[test]
fn parse_sandbox_dead_marker_returns_none_when_absent() {
let stderr = "regular download error\nno sentinel here\n";
assert!(parse_sandbox_dead_marker(stderr).is_none());
}

#[test]
fn parse_sandbox_dead_marker_handles_logger_prefix() {
// Real stderr from modal_sandbox.py is line-oriented; the sentinel may
// appear after a logger prefix on the same line.
let stderr = "ERROR:offload:OFFLOAD_SANDBOX_DEAD: container gone";
let reason = parse_sandbox_dead_marker(stderr);
assert_eq!(reason.as_deref(), Some("container gone"));
}

/// Integration test for Modal sandbox download functionality via DefaultProvider.
///
/// This test requires Modal credentials (MODAL_TOKEN_ID and MODAL_TOKEN_SECRET).
Expand Down
7 changes: 7 additions & 0 deletions src/provider/retry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,13 @@ mod tests {
assert!(!is_retryable(&ProviderError::CreateFailed("c".into())));
}

#[test]
fn sandbox_dead_is_not_retryable() {
assert!(!is_retryable(&ProviderError::SandboxDead(
"container finished".into()
)));
}

// ── with_retry! macro behavior ───────────────────────────────────

#[tokio::test]
Expand Down
Loading