diff --git a/Cargo.lock b/Cargo.lock index a04f6fd5f..6c9f2f08f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1276,6 +1276,16 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +[[package]] +name = "filetime" +version = "0.2.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c287a33c7f0a620c38e641e7f60827713987b3c0f26e8ddc9462cc69cf75759" +dependencies = [ + "cfg-if", + "libc", +] + [[package]] name = "flate2" version = "1.1.9" @@ -2217,6 +2227,7 @@ dependencies = [ "clap", "cron", "emojis", + "flate2", "futures-util", "hex", "http 1.4.2", @@ -2232,6 +2243,7 @@ dependencies = [ "serde_json", "serenity", "sha2 0.10.9", + "tar", "tempfile", "tokio", "tokio-rustls 0.25.0", @@ -3357,6 +3369,16 @@ dependencies = [ "syn", ] +[[package]] +name = "tar" +version = "0.4.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6221d9a6003c78398e3b239969f352578258df48c8eb051caadae0015bc840" +dependencies = [ + "filetime", + "libc", +] + [[package]] name = "tempfile" version = "3.27.0" diff --git a/crates/openab-core/Cargo.toml b/crates/openab-core/Cargo.toml index 01005c1bf..8d10eee84 100644 --- a/crates/openab-core/Cargo.toml +++ b/crates/openab-core/Cargo.toml @@ -40,6 +40,8 @@ aws-sdk-secretsmanager = { version = "1", optional = true } aws-sdk-s3 = { version = "1", optional = true } aws-config = { version = "1", optional = true } zip = { version = "2", default-features = false, features = ["deflate"], optional = true } +flate2 = { version = "1", optional = true } +tar = { version = "0.4.45", default-features = false, optional = true } aws-sigv4 = { version = "1", optional = true } aws-credential-types = { version = "1", optional = true } urlencoding = { version = "2", optional = true } @@ -55,5 +57,5 @@ discord = ["dep:serenity"] slack = [] secrets-aws = ["dep:aws-sdk-secretsmanager", "dep:aws-config"] 
config-s3 = ["dep:aws-sdk-s3", "dep:aws-config"] -pre-seed = ["dep:aws-sdk-s3", "dep:aws-config", "dep:zip", "dep:hex"] +pre-seed = ["dep:aws-sdk-s3", "dep:aws-config", "dep:zip", "dep:hex", "dep:flate2", "dep:tar"] agentcore = ["dep:aws-config", "dep:aws-sigv4", "dep:aws-credential-types", "dep:urlencoding", "dep:hex", "dep:http", "dep:rustls", "dep:tokio-rustls", "dep:webpki-roots"] diff --git a/crates/openab-core/src/pre_seed.rs b/crates/openab-core/src/pre_seed.rs index 576344f9c..c5695391e 100644 --- a/crates/openab-core/src/pre_seed.rs +++ b/crates/openab-core/src/pre_seed.rs @@ -164,16 +164,24 @@ async fn download_and_extract( Ok(()) } -/// Extract zip to a temp directory with budget enforcement, then move into target. +/// Extract archive to a temp directory with budget enforcement, then move into target. +/// Supports zip and gzipped tarball formats (detected via magic bytes). /// Checks deadline cooperatively before each file operation. -fn extract_and_apply(data: &[u8], target: &Path, deadline: Instant) -> anyhow::Result<()> { +fn extract_and_apply( + data: &[u8], + target: &Path, + deadline: Instant, +) -> anyhow::Result<()> { let temp_dir = tempfile::tempdir_in(target.parent().unwrap_or(target))?; - extract_zip_with_limits(data, temp_dir.path(), deadline)?; + if data.starts_with(&[0x1f, 0x8b]) { + extract_tarball_with_limits(data, temp_dir.path(), deadline)?; + } else { + extract_zip_with_limits(data, temp_dir.path(), deadline)?; + } // Check deadline before applying to target if Instant::now() >= deadline { - // temp_dir drops and cleans up automatically anyhow::bail!("hooks.pre_seed: timed out before applying to target"); } @@ -256,6 +264,62 @@ fn extract_zip_budgeted( Ok(()) } +/// Extract a .tar.gz/.tgz archive with cooperative deadline checks and size budget. 
+///
+/// Entries that `unpack_in` declines to extract (it returns `Ok(false)` for paths
+/// containing `..`) are skipped entirely, so no follow-up filesystem operation is
+/// performed for them.
+///
+/// # Errors
+/// Fails when the archive cannot be read or unpacked, holds more than
+/// `DEFAULT_MAX_FILE_COUNT` entries, declares more than `DEFAULT_MAX_EXTRACTED_BYTES`
+/// of payload, or when `deadline` has passed at one of the periodic checks.
+fn extract_tarball_with_limits(data: &[u8], dest: &Path, deadline: Instant) -> anyhow::Result<()> {
+    use flate2::read::GzDecoder;
+
+    let decoder = GzDecoder::new(data);
+    let mut archive = tar::Archive::new(decoder);
+    archive.set_preserve_permissions(false);
+
+    let mut file_count: usize = 0;
+    let mut total_extracted: u64 = 0;
+
+    for entry in archive.entries()? {
+        let mut entry = entry?;
+
+        file_count += 1;
+        if file_count > DEFAULT_MAX_FILE_COUNT {
+            anyhow::bail!(
+                "hooks.pre_seed: tarball contains too many entries ({file_count}, max {DEFAULT_MAX_FILE_COUNT})"
+            );
+        }
+
+        // Cooperative deadline check every 10 files
+        if file_count % 10 == 0 && Instant::now() >= deadline {
+            anyhow::bail!("hooks.pre_seed: timed out during tarball extraction at entry {file_count}");
+        }
+
+        // Size budget. The declared size comes from the (untrusted) archive header, so
+        // saturate instead of wrapping: a forged huge size must trip the limit rather
+        // than overflow back under it.
+        total_extracted = total_extracted.saturating_add(entry.size());
+        if total_extracted > DEFAULT_MAX_EXTRACTED_BYTES {
+            anyhow::bail!(
+                "hooks.pre_seed: extracted size exceeds limit ({total_extracted} > {DEFAULT_MAX_EXTRACTED_BYTES})"
+            );
+        }
+
+        // `unpack_in` reports `false` when it refused to extract the entry (e.g. a `..`
+        // component). Nothing was written, so nothing may be touched afterwards.
+        if !entry.unpack_in(dest)? {
+            continue;
+        }
+
+        // Manually set permissions (strip suid/sgid/sticky, like zip path)
+        #[cfg(unix)]
+        {
+            use std::os::unix::fs::PermissionsExt;
+            use std::path::Component;
+
+            if let Ok(path) = entry.path() {
+                // Rebuild the on-disk location from the normal components only, as
+                // `unpack_in` does. `dest.join(path)` would discard `dest` for an
+                // absolute entry name and point at an arbitrary host path.
+                let mut out_path = dest.to_path_buf();
+                for part in path.components() {
+                    if let Component::Normal(name) = part {
+                        out_path.push(name);
+                    }
+                }
+
+                // `symlink_metadata` does not follow symlinks, so a symlink entry that
+                // points outside `dest` never has its target's mode changed.
+                let is_regular_file = std::fs::symlink_metadata(&out_path)
+                    .map(|meta| meta.file_type().is_file())
+                    .unwrap_or(false);
+                if is_regular_file {
+                    let mode = entry.header().mode().unwrap_or(0o644) & 0o0777;
+                    // Best effort: a failed chmod does not abort the extraction.
+                    let _ = std::fs::set_permissions(
+                        &out_path,
+                        std::fs::Permissions::from_mode(mode),
+                    );
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
 /// Recursively move files from src directory into dst directory.
 /// Checks deadline cooperatively.
fn move_recursive(src: &Path, dst: &Path, deadline: Instant) -> anyhow::Result<()> { @@ -490,4 +554,101 @@ mod tests { "should fail on file count limit" ); } + + #[test] + fn extract_tarball_basic() { + use flate2::write::GzEncoder; + use flate2::Compression; + + let dir = tempfile::tempdir().unwrap(); + let deadline = Instant::now() + std::time::Duration::from_secs(60); + + let buf = Vec::new(); + let enc = GzEncoder::new(buf, Compression::default()); + let mut builder = tar::Builder::new(enc); + + let mut header = tar::Header::new_gnu(); + header.set_size(5); + header.set_mode(0o644); + builder + .append_data(&mut header, "hello.txt", &b"world"[..]) + .unwrap(); + + let mut header2 = tar::Header::new_gnu(); + header2.set_size(14); + header2.set_mode(0o644); + builder + .append_data(&mut header2, "sub/nested.txt", &b"nested content"[..]) + .unwrap(); + + let enc = builder.into_inner().unwrap(); + let tarball_bytes = enc.finish().unwrap(); + + extract_tarball_with_limits(&tarball_bytes, dir.path(), deadline).unwrap(); + + assert_eq!( + std::fs::read_to_string(dir.path().join("hello.txt")).unwrap(), + "world" + ); + assert_eq!( + std::fs::read_to_string(dir.path().join("sub/nested.txt")).unwrap(), + "nested content" + ); + } + + #[test] + fn extract_and_apply_detects_tarball_via_magic_bytes() { + use flate2::write::GzEncoder; + use flate2::Compression; + + let target = tempfile::tempdir().unwrap(); + let deadline = Instant::now() + std::time::Duration::from_secs(60); + + let buf = Vec::new(); + let enc = GzEncoder::new(buf, Compression::default()); + let mut builder = tar::Builder::new(enc); + let mut header = tar::Header::new_gnu(); + header.set_size(5); + header.set_mode(0o644); + builder + .append_data(&mut header, "hello.txt", &b"world"[..]) + .unwrap(); + let enc = builder.into_inner().unwrap(); + let tarball_bytes = enc.finish().unwrap(); + + // Magic bytes detection — no URI needed + extract_and_apply(&tarball_bytes, target.path(), deadline).unwrap(); + 
assert_eq!( + std::fs::read_to_string(target.path().join("hello.txt")).unwrap(), + "world" + ); + } + + #[test] + fn extract_tarball_respects_deadline() { + use flate2::write::GzEncoder; + use flate2::Compression; + + let dir = tempfile::tempdir().unwrap(); + let expired = Instant::now() - std::time::Duration::from_secs(1); + + let buf = Vec::new(); + let enc = GzEncoder::new(buf, Compression::default()); + let mut builder = tar::Builder::new(enc); + // Create > 10 files to trigger deadline check + for i in 0..11 { + let mut header = tar::Header::new_gnu(); + header.set_size(1); + header.set_mode(0o644); + builder + .append_data(&mut header, format!("f{i}.txt"), &b"x"[..]) + .unwrap(); + } + let enc = builder.into_inner().unwrap(); + let tarball_bytes = enc.finish().unwrap(); + + let result = extract_tarball_with_limits(&tarball_bytes, dir.path(), expired); + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("timed out")); + } } diff --git a/docs/config-reference.md b/docs/config-reference.md index 2c8ba482b..eae3fc352 100644 --- a/docs/config-reference.md +++ b/docs/config-reference.md @@ -243,15 +243,15 @@ Lifecycle hooks that run at specific points during the container lifecycle. See ### `[hooks.pre_seed]` -Downloads and extracts zip archives from S3 before `pre_boot`. Seeds the agent environment with configs, tools, and shared memory without requiring AWS CLI in the image. +Downloads and extracts archives from S3 before `pre_boot`. Seeds the agent environment with configs, tools, and shared memory without requiring AWS CLI in the image. > **Feature flag:** requires the `pre-seed` feature (opt-in, not in default). Enable with `--features pre-seed`. | Key | Type | Default | Description | |-----|------|---------|-------------| -| `sources` | string[] | `[]` | S3 URIs of zip archives (`s3://bucket/key.zip`). Max 5. Extracted in order; later layers overwrite earlier ones. 
| +| `sources` | string[] | `[]` | S3 URIs of archives (`.zip`, `.tar.gz`, `.tgz`). Max 5. Extracted in order; later layers overwrite earlier ones. | | `target` | string | `$HOME` | Extraction target directory. | -| `max_bytes` | u64 | `104857600` | Max compressed zip size in bytes (100 MiB). Rejects downloads exceeding this. | +| `max_bytes` | u64 | `104857600` | Max compressed archive size in bytes (100 MiB). Rejects downloads exceeding this. | | `timeout_seconds` | u64 | `300` | Per-source download+extract timeout in seconds. | | `on_failure` | string | `"abort"` | `"abort"` exits openab; `"warn"` logs and continues. | | `region` | string | — | Override AWS region for S3 access. | @@ -265,9 +265,9 @@ environment variables, shared credentials, IRSA / EKS Pod Identity, ECS task rol ```toml [hooks.pre_seed] sources = [ - "s3://my-bucket/base-env.zip", + "s3://my-bucket/base-env.tar.gz", "s3://my-bucket/shared-memory.zip", - "s3://my-bucket/agent-overrides.zip", + "s3://my-bucket/agent-overrides.tgz", ] timeout_seconds = 300 on_failure = "abort" diff --git a/docs/hooks.md b/docs/hooks.md index db1961a5d..164b7a135 100644 --- a/docs/hooks.md +++ b/docs/hooks.md @@ -10,13 +10,13 @@ hooks.pre_seed → hooks.pre_boot → (agent running) → hooks.pre_shutdown | Phase | Purpose | Config | Action Type | |-------|---------|--------|-------------| -| `pre_seed` | Download & extract S3 zip archives to seed the environment | `[hooks.pre_seed]` | Built-in S3 download + unzip | +| `pre_seed` | Download & extract S3 archives to seed the environment | `[hooks.pre_seed]` | Built-in S3 download + extract | | `pre_boot` | Run custom setup scripts before agent pool creation | `[hooks.pre_boot]` | User script | | `pre_shutdown` | Run custom cleanup scripts after pool shutdown | `[hooks.pre_shutdown]` | User script | ## Pre-Seed Phase -The `pre_seed` phase runs **before** `pre_boot`. It downloads zip archives from S3 and extracts them into the agent's home directory (or a custom target). 
This eliminates the need for users to install AWS CLI and write download scripts in `pre_boot`. +The `pre_seed` phase runs **before** `pre_boot`. It downloads archives from S3 and extracts them into the agent's home directory (or a custom target). Supported formats: `.zip`, `.tar.gz`, and `.tgz` (auto-detected via magic bytes). This eliminates the need for users to install AWS CLI and write download scripts in `pre_boot`. > **Feature flag:** requires the `pre-seed` feature (opt-in, not in default). @@ -25,12 +25,12 @@ The `pre_seed` phase runs **before** `pre_boot`. It downloads zip archives from ```toml [hooks.pre_seed] sources = [ - "s3://my-bucket/base-env.zip", + "s3://my-bucket/base-env.tar.gz", "s3://my-bucket/shared-memory.zip", - "s3://my-bucket/agent-overrides.zip", + "s3://my-bucket/agent-overrides.tgz", ] # target = "/home/agent" # default: $HOME -# max_bytes = 104857600 # max compressed size per zip (default: 100 MiB) +# max_bytes = 104857600 # max compressed size per archive (default: 100 MiB) # timeout_seconds = 300 # per-source timeout (default: 300) # on_failure = "abort" # "abort" or "warn" (default: "abort") # region = "us-west-2" # optional: override AWS region @@ -41,9 +41,9 @@ sources = [ | Field | Type | Default | Description | |-------|------|---------|-------------| -| `sources` | string[] | `[]` | S3 URIs of zip archives. Max 5. Extracted in order. | +| `sources` | string[] | `[]` | S3 URIs of archives (`.zip`, `.tar.gz`, `.tgz`). Max 5. Extracted in order. | | `target` | string | `$HOME` | Extraction target directory. | -| `max_bytes` | u64 | `104857600` | Max compressed zip size in bytes (100 MiB). | +| `max_bytes` | u64 | `104857600` | Max compressed archive size in bytes (100 MiB). | | `timeout_seconds` | u64 | `300` | Per-source download+extract timeout. | | `on_failure` | string | `"abort"` | `"abort"` exits openab; `"warn"` logs and continues. | | `region` | string | — | Override AWS region. 
| @@ -67,14 +67,15 @@ Layer 1 (first) ─── base layer 1. **S3-native checksum (automatic)**: if the object was uploaded with `--checksum-algorithm SHA256`, OpenAB automatically verifies it on download — no config needed 2. **User-provided `sha256s` (optional)**: explicit checksums in config for additional defense-in-depth - **Size cap**: downloads exceeding `max_bytes` are rejected before extraction -- **Atomic extraction**: zips are first extracted to a temp directory, then moved into target — if extraction fails, target is not corrupted. Note: the move phase is per-file; if it fails mid-way with `on_failure = "warn"`, the target may be partially updated. -- **Zip Slip prevention**: uses `enclosed_name()` to block path traversal attacks +- **Atomic extraction**: archives are first extracted to a temp directory, then moved into target — if extraction fails, target is not corrupted. Note: the move phase is per-file; if it fails mid-way with `on_failure = "warn"`, the target may be partially updated. +- **Path traversal prevention**: zip uses `enclosed_name()`; tarball uses `unpack_in()` which rejects `..` escapes +- **Permission hardening**: suid/sgid/sticky bits are stripped from extracted files ### Constraints - Maximum **5** sources - Only `s3://` URIs supported -- Only `.zip` format supported +- Supported formats: `.zip`, `.tar.gz`, `.tgz` (auto-detected via gzip magic bytes) - Uses the standard AWS credential chain (IRSA, ECS task role, env vars) - Optional `region`/`endpoint_url` override for LocalStack or VPC endpoints