Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion crates/openab-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ aws-sdk-secretsmanager = { version = "1", optional = true }
aws-sdk-s3 = { version = "1", optional = true }
aws-config = { version = "1", optional = true }
zip = { version = "2", default-features = false, features = ["deflate"], optional = true }
flate2 = { version = "1", optional = true }
tar = { version = "0.4.45", default-features = false, optional = true }
aws-sigv4 = { version = "1", optional = true }
aws-credential-types = { version = "1", optional = true }
urlencoding = { version = "2", optional = true }
Expand All @@ -55,5 +57,5 @@ discord = ["dep:serenity"]
slack = []
secrets-aws = ["dep:aws-sdk-secretsmanager", "dep:aws-config"]
config-s3 = ["dep:aws-sdk-s3", "dep:aws-config"]
pre-seed = ["dep:aws-sdk-s3", "dep:aws-config", "dep:zip", "dep:hex"]
pre-seed = ["dep:aws-sdk-s3", "dep:aws-config", "dep:zip", "dep:hex", "dep:flate2", "dep:tar"]
agentcore = ["dep:aws-config", "dep:aws-sigv4", "dep:aws-credential-types", "dep:urlencoding", "dep:hex", "dep:http", "dep:rustls", "dep:tokio-rustls", "dep:webpki-roots"]
169 changes: 165 additions & 4 deletions crates/openab-core/src/pre_seed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -164,16 +164,24 @@ async fn download_and_extract(
Ok(())
}

/// Extract zip to a temp directory with budget enforcement, then move into target.
/// Extract archive to a temp directory with budget enforcement, then move into target.
/// Supports zip and gzipped tarball formats (detected via magic bytes).
/// Checks deadline cooperatively before each file operation.
fn extract_and_apply(data: &[u8], target: &Path, deadline: Instant) -> anyhow::Result<()> {
fn extract_and_apply(
data: &[u8],
target: &Path,
deadline: Instant,
) -> anyhow::Result<()> {
let temp_dir = tempfile::tempdir_in(target.parent().unwrap_or(target))?;

extract_zip_with_limits(data, temp_dir.path(), deadline)?;
if data.starts_with(&[0x1f, 0x8b]) {
extract_tarball_with_limits(data, temp_dir.path(), deadline)?;
} else {
extract_zip_with_limits(data, temp_dir.path(), deadline)?;
}

// Check deadline before applying to target
if Instant::now() >= deadline {
// temp_dir drops and cleans up automatically
anyhow::bail!("hooks.pre_seed: timed out before applying to target");
}

Expand Down Expand Up @@ -256,6 +264,62 @@ fn extract_zip_budgeted(
Ok(())
}

/// Extract a .tar.gz/.tgz archive with cooperative deadline checks and size budget.
///
/// `data` is the complete gzip-compressed tarball, `dest` is the directory the
/// entries are unpacked into, and `deadline` is the instant after which
/// extraction gives up.
///
/// # Errors
///
/// Returns an error when:
/// - the archive holds more than `DEFAULT_MAX_FILE_COUNT` entries,
/// - the sum of the entry sizes exceeds `DEFAULT_MAX_EXTRACTED_BYTES`,
/// - the deadline has passed at one of the periodic checks (every 10th entry),
/// - decoding the archive or writing an entry fails.
///
/// Entries that `unpack_in` declines to write (it returns `Ok(false)`) are
/// skipped and left untouched.
fn extract_tarball_with_limits(data: &[u8], dest: &Path, deadline: Instant) -> anyhow::Result<()> {
    use flate2::read::GzDecoder;

    let decoder = GzDecoder::new(data);
    let mut archive = tar::Archive::new(decoder);
    archive.set_preserve_permissions(false);

    let mut file_count: usize = 0;
    let mut total_extracted: u64 = 0;

    for entry in archive.entries()? {
        let mut entry = entry?;

        file_count += 1;
        if file_count > DEFAULT_MAX_FILE_COUNT {
            anyhow::bail!(
                "hooks.pre_seed: tarball contains too many entries ({file_count}, max {DEFAULT_MAX_FILE_COUNT})"
            );
        }

        // Cooperative deadline check every 10 files
        if file_count % 10 == 0 && Instant::now() >= deadline {
            anyhow::bail!("hooks.pre_seed: timed out during tarball extraction at entry {file_count}");
        }

        // Size budget. The size comes from the archive, so saturate instead of
        // letting a crafted value overflow the running total (which would panic
        // in debug builds and wrap past the limit in release builds).
        total_extracted = total_extracted.saturating_add(entry.size());
        if total_extracted > DEFAULT_MAX_EXTRACTED_BYTES {
            anyhow::bail!(
                "hooks.pre_seed: extracted size exceeds limit ({total_extracted} > {DEFAULT_MAX_EXTRACTED_BYTES})"
            );
        }

        // `unpack_in` returns `Ok(false)` when it did not write the entry.
        // Nothing was created, so there is nothing to chmod, and touching the
        // entry's raw path could reach a file outside `dest`.
        if !entry.unpack_in(dest)? {
            continue;
        }

        // Manually set permissions (strip suid/sgid/sticky, like zip path)
        #[cfg(unix)]
        {
            use std::os::unix::fs::PermissionsExt;
            use std::path::Component;

            if let Ok(path) = entry.path() {
                // Rebuild the on-disk location from the plain name components
                // only. `dest.join(path)` would discard `dest` for an absolute
                // entry path and chmod a file elsewhere on the system.
                let mut out_path = dest.to_path_buf();
                out_path.extend(path.components().filter_map(|component| match component {
                    Component::Normal(part) => Some(part),
                    _ => None,
                }));

                // `symlink_metadata` does not follow symlinks, so a symlink
                // entry cannot redirect the chmod to its target.
                let is_regular_file = std::fs::symlink_metadata(&out_path)
                    .map(|meta| meta.file_type().is_file())
                    .unwrap_or(false);
                if is_regular_file {
                    let mode = entry.header().mode().unwrap_or(0o644) & 0o0777;
                    // Best effort: a failed chmod does not abort the extraction.
                    let _ = std::fs::set_permissions(
                        &out_path,
                        std::fs::Permissions::from_mode(mode),
                    );
                }
            }
        }
    }

    Ok(())
}

/// Recursively move files from src directory into dst directory.
/// Checks deadline cooperatively.
fn move_recursive(src: &Path, dst: &Path, deadline: Instant) -> anyhow::Result<()> {
Expand Down Expand Up @@ -490,4 +554,101 @@ mod tests {
"should fail on file count limit"
);
}

/// A gzipped tarball with a top-level file and a nested file extracts both
/// files with their original contents.
#[test]
fn extract_tarball_basic() {
    use flate2::write::GzEncoder;
    use flate2::Compression;

    // (path inside the archive, file contents, declared size)
    let fixtures: [(&str, &str, u64); 2] = [
        ("hello.txt", "world", 5),
        ("sub/nested.txt", "nested content", 14),
    ];

    let mut builder = tar::Builder::new(GzEncoder::new(Vec::new(), Compression::default()));
    for &(name, text, size) in fixtures.iter() {
        let mut header = tar::Header::new_gnu();
        header.set_size(size);
        header.set_mode(0o644);
        builder
            .append_data(&mut header, name, text.as_bytes())
            .unwrap();
    }
    let tarball_bytes = builder.into_inner().unwrap().finish().unwrap();

    let out_dir = tempfile::tempdir().unwrap();
    let deadline = Instant::now() + std::time::Duration::from_secs(60);
    extract_tarball_with_limits(&tarball_bytes, out_dir.path(), deadline).unwrap();

    for &(name, text, _) in fixtures.iter() {
        let extracted = std::fs::read_to_string(out_dir.path().join(name)).unwrap();
        assert_eq!(extracted, text);
    }
}

/// `extract_and_apply` is handed raw gzipped-tarball bytes with no file name
/// or URI, and the archived file must still end up in the target directory.
#[test]
fn extract_and_apply_detects_tarball_via_magic_bytes() {
    use flate2::write::GzEncoder;
    use flate2::Compression;

    let payload = "world";

    // Build a one-file gzipped tarball in memory.
    let mut header = tar::Header::new_gnu();
    header.set_size(5);
    header.set_mode(0o644);
    let mut builder = tar::Builder::new(GzEncoder::new(Vec::new(), Compression::default()));
    builder
        .append_data(&mut header, "hello.txt", payload.as_bytes())
        .unwrap();
    let tarball_bytes = builder.into_inner().unwrap().finish().unwrap();

    let target = tempfile::tempdir().unwrap();
    let deadline = Instant::now() + std::time::Duration::from_secs(60);

    // Magic bytes detection — no URI needed
    extract_and_apply(&tarball_bytes, target.path(), deadline).unwrap();

    let extracted = std::fs::read_to_string(target.path().join("hello.txt")).unwrap();
    assert_eq!(extracted, payload);
}

/// With a deadline that is already in the past, extracting an 11-entry
/// tarball must fail with a "timed out" error.
#[test]
fn extract_tarball_respects_deadline() {
    use flate2::write::GzEncoder;
    use flate2::Compression;

    let mut builder = tar::Builder::new(GzEncoder::new(Vec::new(), Compression::default()));
    // Create > 10 files to trigger deadline check
    (0..11).for_each(|i| {
        let mut header = tar::Header::new_gnu();
        header.set_size(1);
        header.set_mode(0o644);
        builder
            .append_data(&mut header, format!("f{i}.txt"), &b"x"[..])
            .unwrap();
    });
    let tarball_bytes = builder.into_inner().unwrap().finish().unwrap();

    let out_dir = tempfile::tempdir().unwrap();
    let expired = Instant::now() - std::time::Duration::from_secs(1);

    let message = extract_tarball_with_limits(&tarball_bytes, out_dir.path(), expired)
        .unwrap_err()
        .to_string();
    assert!(message.contains("timed out"));
}
}
10 changes: 5 additions & 5 deletions docs/config-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -243,15 +243,15 @@ Lifecycle hooks that run at specific points during the container lifecycle. See

### `[hooks.pre_seed]`

Downloads and extracts zip archives from S3 before `pre_boot`. Seeds the agent environment with configs, tools, and shared memory without requiring AWS CLI in the image.
Downloads and extracts archives from S3 before `pre_boot`. Seeds the agent environment with configs, tools, and shared memory without requiring AWS CLI in the image.

> **Feature flag:** requires the `pre-seed` feature (opt-in, not in default). Enable with `--features pre-seed`.

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| `sources` | string[] | `[]` | S3 URIs of zip archives (`s3://bucket/key.zip`). Max 5. Extracted in order; later layers overwrite earlier ones. |
| `sources` | string[] | `[]` | S3 URIs of archives (`.zip`, `.tar.gz`, `.tgz`). Max 5. Extracted in order; later layers overwrite earlier ones. |
| `target` | string | `$HOME` | Extraction target directory. |
| `max_bytes` | u64 | `104857600` | Max compressed zip size in bytes (100 MiB). Rejects downloads exceeding this. |
| `max_bytes` | u64 | `104857600` | Max compressed archive size in bytes (100 MiB). Rejects downloads exceeding this. |
| `timeout_seconds` | u64 | `300` | Per-source download+extract timeout in seconds. |
| `on_failure` | string | `"abort"` | `"abort"` exits openab; `"warn"` logs and continues. |
| `region` | string | — | Override AWS region for S3 access. |
Expand All @@ -265,9 +265,9 @@ environment variables, shared credentials, IRSA / EKS Pod Identity, ECS task rol
```toml
[hooks.pre_seed]
sources = [
"s3://my-bucket/base-env.zip",
"s3://my-bucket/base-env.tar.gz",
"s3://my-bucket/shared-memory.zip",
"s3://my-bucket/agent-overrides.zip",
"s3://my-bucket/agent-overrides.tgz",
]
timeout_seconds = 300
on_failure = "abort"
Expand Down
21 changes: 11 additions & 10 deletions docs/hooks.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ hooks.pre_seed → hooks.pre_boot → (agent running) → hooks.pre_shutdown

| Phase | Purpose | Config | Action Type |
|-------|---------|--------|-------------|
| `pre_seed` | Download & extract S3 zip archives to seed the environment | `[hooks.pre_seed]` | Built-in S3 download + unzip |
| `pre_seed` | Download & extract S3 archives to seed the environment | `[hooks.pre_seed]` | Built-in S3 download + extract |
| `pre_boot` | Run custom setup scripts before agent pool creation | `[hooks.pre_boot]` | User script |
| `pre_shutdown` | Run custom cleanup scripts after pool shutdown | `[hooks.pre_shutdown]` | User script |

## Pre-Seed Phase

The `pre_seed` phase runs **before** `pre_boot`. It downloads zip archives from S3 and extracts them into the agent's home directory (or a custom target). This eliminates the need for users to install AWS CLI and write download scripts in `pre_boot`.
The `pre_seed` phase runs **before** `pre_boot`. It downloads archives from S3 and extracts them into the agent's home directory (or a custom target). Supported formats: `.zip`, `.tar.gz`, and `.tgz` (auto-detected via magic bytes). This eliminates the need for users to install AWS CLI and write download scripts in `pre_boot`.

> **Feature flag:** requires the `pre-seed` feature (opt-in, not in default).

Expand All @@ -25,12 +25,12 @@ The `pre_seed` phase runs **before** `pre_boot`. It downloads zip archives from
```toml
[hooks.pre_seed]
sources = [
"s3://my-bucket/base-env.zip",
"s3://my-bucket/base-env.tar.gz",
"s3://my-bucket/shared-memory.zip",
"s3://my-bucket/agent-overrides.zip",
"s3://my-bucket/agent-overrides.tgz",
]
# target = "/home/agent" # default: $HOME
# max_bytes = 104857600 # max compressed size per zip (default: 100 MiB)
# max_bytes = 104857600 # max compressed size per archive (default: 100 MiB)
# timeout_seconds = 300 # per-source timeout (default: 300)
# on_failure = "abort" # "abort" or "warn" (default: "abort")
# region = "us-west-2" # optional: override AWS region
Expand All @@ -41,9 +41,9 @@ sources = [

| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `sources` | string[] | `[]` | S3 URIs of zip archives. Max 5. Extracted in order. |
| `sources` | string[] | `[]` | S3 URIs of archives (`.zip`, `.tar.gz`, `.tgz`). Max 5. Extracted in order. |
| `target` | string | `$HOME` | Extraction target directory. |
| `max_bytes` | u64 | `104857600` | Max compressed zip size in bytes (100 MiB). |
| `max_bytes` | u64 | `104857600` | Max compressed archive size in bytes (100 MiB). |
| `timeout_seconds` | u64 | `300` | Per-source download+extract timeout. |
| `on_failure` | string | `"abort"` | `"abort"` exits openab; `"warn"` logs and continues. |
| `region` | string | — | Override AWS region. |
Expand All @@ -67,14 +67,15 @@ Layer 1 (first) ─── base layer
1. **S3-native checksum (automatic)**: if the object was uploaded with `--checksum-algorithm SHA256`, OpenAB automatically verifies it on download — no config needed
2. **User-provided `sha256s` (optional)**: explicit checksums in config for additional defense-in-depth
- **Size cap**: downloads exceeding `max_bytes` are rejected before extraction
- **Atomic extraction**: zips are first extracted to a temp directory, then moved into target — if extraction fails, target is not corrupted. Note: the move phase is per-file; if it fails mid-way with `on_failure = "warn"`, the target may be partially updated.
- **Zip Slip prevention**: uses `enclosed_name()` to block path traversal attacks
- **Atomic extraction**: archives are first extracted to a temp directory, then moved into target — if extraction fails, target is not corrupted. Note: the move phase is per-file; if it fails mid-way with `on_failure = "warn"`, the target may be partially updated.
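- **Path traversal prevention**: zip uses `enclosed_name()`; tarball uses `unpack_in()`, which refuses to write any entry whose path would escape the extraction directory (for example via `..`) — such entries are skipped rather than extracted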
- **Permission hardening**: suid/sgid/sticky bits are stripped from extracted files

### Constraints

- Maximum **5** sources
- Only `s3://` URIs supported
- Only `.zip` format supported
- Supported formats: `.zip`, `.tar.gz`, `.tgz` (auto-detected via gzip magic bytes)
- Uses the standard AWS credential chain (IRSA, ECS task role, env vars)
- Optional `region`/`endpoint_url` override for LocalStack or VPC endpoints

Expand Down
Loading