diff --git a/.githooks/pre-commit b/.githooks/pre-commit new file mode 100755 index 0000000..9857579 --- /dev/null +++ b/.githooks/pre-commit @@ -0,0 +1,27 @@ +#!/bin/bash +set -e + +# Skip hooks if SKIP_GIT_HOOKS=1 is set +if [ "${SKIP_GIT_HOOKS:-0}" = "1" ]; then + echo "[pre-commit] Skipped (SKIP_GIT_HOOKS=1)" + exit 0 +fi + +echo "╔══════════════════════════════════════╗" +echo "║ pre-commit quality gate ║" +echo "╚══════════════════════════════════════╝" + +# 1. Format check +echo "" +echo "▶ Checking formatting (cargo +nightly fmt)..." +cargo +nightly fmt --all -- --check +echo " ✓ Formatting OK" + +# 2. Clippy lint +echo "" +echo "▶ Running clippy..." +cargo +nightly clippy --all-targets -- -D warnings +echo " ✓ Clippy OK" + +echo "" +echo "✅ pre-commit passed" diff --git a/.githooks/pre-push b/.githooks/pre-push new file mode 100755 index 0000000..5257348 --- /dev/null +++ b/.githooks/pre-push @@ -0,0 +1,39 @@ +#!/bin/bash +set -e + +# Skip hooks if SKIP_GIT_HOOKS=1 is set +if [ "${SKIP_GIT_HOOKS:-0}" = "1" ]; then + echo "[pre-push] Skipped (SKIP_GIT_HOOKS=1)" + exit 0 +fi + +echo "╔══════════════════════════════════════╗" +echo "║ pre-push quality gate ║" +echo "╚══════════════════════════════════════╝" + +# 1. Format check +echo "" +echo "▶ Checking formatting (cargo +nightly fmt)..." +cargo +nightly fmt --all -- --check +echo " ✓ Formatting OK" + +# 2. Clippy lint +echo "" +echo "▶ Running clippy..." +cargo +nightly clippy --all-targets -- -D warnings +echo " ✓ Clippy OK" + +# 3. Tests +echo "" +echo "▶ Running tests..." +cargo +nightly test --release -- --test-threads=$(nproc 2>/dev/null || echo 4) +echo " ✓ Tests OK" + +# 4. Release build +echo "" +echo "▶ Building release binary..." 
+cargo +nightly build --release +echo " ✓ Build OK" + +echo "" +echo "✅ pre-push passed — safe to push" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bb30f85..aac2e42 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,9 +17,9 @@ env: RUST_BACKTRACE: short jobs: - lint: - name: Lint & Format - runs-on: blacksmith-4vcpu-ubuntu-2404 + check: + name: Format + Lint + Test + Build + runs-on: blacksmith-32vcpu-ubuntu-2404 steps: - uses: actions/checkout@v4 @@ -38,9 +38,9 @@ jobs: ~/.cargo/registry ~/.cargo/git target - key: lint-${{ runner.os }}-nightly-${{ hashFiles('**/Cargo.lock') }} + key: check-${{ runner.os }}-nightly-${{ hashFiles('**/Cargo.lock') }} restore-keys: | - lint-${{ runner.os }}-nightly- + check-${{ runner.os }}-nightly- - name: Check formatting run: cargo +nightly fmt --all -- --check @@ -48,28 +48,10 @@ jobs: - name: Clippy run: cargo +nightly clippy --all-targets -- -D warnings - build: - name: Build (nightly, release, 32 threads) - runs-on: blacksmith-32vcpu-ubuntu-2404 - steps: - - uses: actions/checkout@v4 - - - name: Install mold linker - run: sudo apt-get update -qq && sudo apt-get install -y -qq mold clang - - - name: Install Rust nightly - uses: dtolnay/rust-toolchain@nightly - - - name: Cache cargo registry + target - uses: actions/cache@v4 - with: - path: | - ~/.cargo/registry - ~/.cargo/git - target - key: build-${{ runner.os }}-nightly-${{ hashFiles('**/Cargo.lock') }} - restore-keys: | - build-${{ runner.os }}-nightly- + - name: Run tests + run: cargo +nightly test --release -j $(nproc) -- --test-threads=$(nproc) + env: + RUST_LOG: warn - name: Build release run: cargo +nightly build --release -j $(nproc) @@ -83,38 +65,10 @@ jobs: path: target/release/term-executor retention-days: 7 - test: - name: Tests (nightly, 32 threads) - runs-on: blacksmith-32vcpu-ubuntu-2404 - steps: - - uses: actions/checkout@v4 - - - name: Install mold linker - run: sudo apt-get update -qq && sudo apt-get install 
-y -qq mold clang - - - name: Install Rust nightly - uses: dtolnay/rust-toolchain@nightly - - - name: Cache cargo registry + target - uses: actions/cache@v4 - with: - path: | - ~/.cargo/registry - ~/.cargo/git - target - key: test-${{ runner.os }}-nightly-${{ hashFiles('**/Cargo.lock') }} - restore-keys: | - test-${{ runner.os }}-nightly- - - - name: Run tests - run: cargo +nightly test --release -j $(nproc) -- --test-threads=$(nproc) - env: - RUST_LOG: warn - docker: name: Docker build runs-on: blacksmith-32vcpu-ubuntu-2404 - needs: [lint, build, test] + needs: [check] if: github.ref == 'refs/heads/main' steps: - uses: actions/checkout@v4 @@ -138,3 +92,31 @@ jobs: tags: | ghcr.io/${{ steps.lower.outputs.repo }}:latest ghcr.io/${{ steps.lower.outputs.repo }}:${{ github.sha }} + + release: + name: Semantic Release + runs-on: ubuntu-latest + needs: [check] + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + permissions: + contents: write + issues: write + pull-requests: write + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + persist-credentials: false + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: 22 + + - name: Install semantic-release + run: npm install -g semantic-release @semantic-release/changelog @semantic-release/git @semantic-release/exec + + - name: Run semantic-release + run: npx semantic-release + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.releaserc.json b/.releaserc.json new file mode 100644 index 0000000..5c1bbe8 --- /dev/null +++ b/.releaserc.json @@ -0,0 +1,28 @@ +{ + "branches": ["main"], + "tagFormat": "v${version}", + "plugins": [ + "@semantic-release/commit-analyzer", + "@semantic-release/release-notes-generator", + [ + "@semantic-release/changelog", + { + "changelogFile": "CHANGELOG.md" + } + ], + [ + "@semantic-release/exec", + { + "prepareCmd": "echo ${nextRelease.version} > VERSION && sed -i '0,/^version = \".*\"/s//version = \"${nextRelease.version}\"/'
Cargo.toml" + } + ], + [ + "@semantic-release/git", + { + "assets": ["VERSION", "CHANGELOG.md", "Cargo.toml"], + "message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}" + } + ], + "@semantic-release/github" + ] +} diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..8886dd3 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,160 @@ +# AGENTS.md — term-executor + +## Project Purpose + +**term-executor** is a remote evaluation executor for the [term-challenge](https://github.com/PlatformNetwork/term-challenge) platform. It runs as a containerized Rust service on [Basilica](https://basilica.ai) that receives agent code submissions, executes them against a cloned task repository, runs validation test scripts, and reports pass/fail results. It is the core compute backend that evaluates AI agent coding challenges. + +## Architecture Overview + +This is a **single-crate Rust binary** (`term-executor`) built with Axum. There are no sub-crates or workspaces. + +### Data Flow + +``` +Platform Server → POST /evaluate → term-executor + 1. Download task archive (.tar.gz / .zip) from task_url + 2. Parse workspace.yaml, prompt.md, tests/ + 3. git clone the target repository at base_commit + 4. Run install commands (pip install, etc.) + 5. Write & execute agent code in the repo + 6. Write test source files into the repo + 7. Run test scripts (bash), collect exit codes + 8. 
Return results via GET /evaluate/{id} +``` + +### Module Map + +| File | Responsibility | +|---|---| +| `src/main.rs` | Entry point — bootstraps config, session manager, executor, Axum server, reaper tasks | +| `src/config.rs` | `Config` struct loaded from environment variables with defaults | +| `src/handlers.rs` | Axum route handlers: `/health`, `/status`, `/metrics`, `/evaluate`, `/evaluate/{id}`, `/evaluations` | +| `src/auth.rs` | Bearer token authentication middleware and `check_token()` helper | +| `src/executor.rs` | Core evaluation engine — spawns async tasks that clone repos, run agents, run tests | +| `src/session.rs` | `SessionManager` with `DashMap`, `Session`, `EvalResult`, `EvalStatus`, `EvalStep` types | +| `src/task.rs` | Task archive download/extraction (zip/tar.gz), `workspace.yaml` parsing, test file loading | +| `src/metrics.rs` | Atomic counter-based Prometheus metrics (total, passed, failed, active, duration) | +| `src/cleanup.rs` | Work directory removal, stale session reaping, process group killing | + +### Key Shared State (via `Arc`) + +- `AppState` (in `handlers.rs`) holds `Config`, `SessionManager`, `Metrics`, `Executor`, `Semaphore` +- `SessionManager` uses `DashMap<String, Arc<Session>>` for lock-free concurrent access +- `Semaphore` controls max concurrent evaluations (default: 4) + +## Tech Stack + +- **Language**: Rust (edition 2021, nightly toolchain for fmt/clippy) +- **Async Runtime**: Tokio (full features + process) +- **Web Framework**: Axum 0.7 with Tower middleware +- **HTTP Client**: reqwest 0.12 (for downloading task archives) +- **Serialization**: serde + serde_json + serde_yaml +- **Concurrency**: `DashMap` 6, `parking_lot` 0.12, `tokio::sync::Semaphore` +- **Archive Handling**: `flate2` + `tar` (tar.gz), `zip` 2 (zip) +- **Error Handling**: `anyhow` 1 + `thiserror` 2 +- **Logging**: `tracing` + `tracing-subscriber` with env-filter +- **Build Tooling**: `mold` linker via `.cargo/config.toml`, `clang` as linker driver +- **Container**:
Multi-stage Dockerfile — `rust:1.93-slim-bookworm` builder → `debian:bookworm-slim` runtime +- **CI**: GitHub Actions on Blacksmith 32-vCPU runners (the semantic-release job runs on `ubuntu-latest`), nightly Rust + +## CRITICAL RULES + +1. **Always use `cargo +nightly fmt --all` before committing.** The CI enforces `--check` and will reject unformatted code. The project uses the nightly formatter exclusively. + +2. **All clippy warnings are errors.** Run `cargo +nightly clippy --all-targets -- -D warnings` locally. CI runs the same command and will fail on any warning. + +3. **Never expose secrets in logs or responses.** The `AUTH_TOKEN` environment variable is sensitive. Auth failures log only the `x-forwarded-for` header, never the token value. Follow this pattern for any new secrets. + +4. **All process execution MUST have timeouts.** Every call to `run_cmd`/`run_shell` in `src/executor.rs` takes a `Duration` timeout. Never spawn a child process without a timeout — agent code is untrusted and may hang forever. + +5. **Output MUST be truncated.** The `truncate_output()` function in `src/executor.rs` caps output at `MAX_OUTPUT` (1MB). Any new command output capture must use this function to prevent memory exhaustion from malicious agent output. + +6. **Shared state must use `Arc` + lock-free structures.** `SessionManager` uses `DashMap` (not `Mutex`). Metrics use `AtomicU64`. New shared state should follow these patterns — never use `std::sync::Mutex` for hot-path data. + +7. **Semaphore must gate evaluation capacity.** The `Semaphore` in `AppState` limits concurrent evaluations to `MAX_CONCURRENT_EVALS`. Any new evaluation path must acquire a permit before spawning work. + +8. **Session cleanup is mandatory.** Every evaluation must clean up its work directory in `src/executor.rs` (the `Cleanup` step). The stale session reaper in `src/cleanup.rs` is a safety net, not a primary mechanism. + +9.
**Error handling: use `anyhow::Result` for internal logic, `(StatusCode, String)` for HTTP responses.** Handler functions in `src/handlers.rs` return `Result`. Internal executor/task functions return `anyhow::Result`. + +10. **All new fields on serialized structs must use `#[serde(default)]` or `Option`.** The `EvalRequest`, `EvalResult`, and `WorkspaceConfig` structs are deserialized from external input. Missing fields must not break deserialization. + +## DO / DO NOT + +### DO +- Write unit tests for all new public functions (see existing `#[cfg(test)]` modules in every file) +- Use `tracing::info!`/`warn!`/`error!` for logging (not `println!`) +- Add new routes in `src/handlers.rs` via the `router()` function +- Use `tokio::fs` for async file I/O in the executor pipeline +- Keep the Dockerfile minimal — runtime image has no compilers or language runtimes +- Use conventional commits (`feat:`, `fix:`, `perf:`, `chore:`, etc.) + +### DO NOT +- Do NOT add `unsafe` code — there is none in this project and it should stay that way +- Do NOT add synchronous blocking I/O in async functions — use `tokio::task::spawn_blocking` for CPU-heavy work (see `extract_archive` in `src/task.rs`) +- Do NOT store large data (agent output, test output) in memory without truncation +- Do NOT add new dependencies without justification — the binary must stay small for container deployment +- Do NOT use `unwrap()` in production code paths — use `?` or `context()` from anyhow. 
`unwrap()` is only acceptable in tests and infallible cases (like parsing a known-good string) +- Do NOT modify `.cargo/config.toml` — it configures the mold linker for fast builds + +## Build & Test Commands + +```bash +# Build (debug) +cargo build + +# Build (release, matches CI) +cargo +nightly build --release -j $(nproc) + +# Run tests +cargo test + +# Run tests (release, matches CI) +cargo +nightly test --release -j $(nproc) -- --test-threads=$(nproc) + +# Format (required before commit) +cargo +nightly fmt --all + +# Format check (what CI runs) +cargo +nightly fmt --all -- --check + +# Lint (required before commit) +cargo +nightly clippy --all-targets -- -D warnings + +# Run locally +AUTH_TOKEN=test PORT=8080 cargo run + +# Docker build +docker build -t term-executor . +``` + +## Git Hooks + +The `.githooks/` directory contains automated quality gates: + +### pre-commit +- Runs `cargo +nightly fmt --all -- --check` to enforce formatting +- Runs `cargo +nightly clippy --all-targets -- -D warnings` to enforce lint +- Skip with `SKIP_GIT_HOOKS=1 git commit ...` + +### pre-push +- Runs format check, clippy, full test suite, and release build +- This is the full quality gate matching CI +- Skip with `SKIP_GIT_HOOKS=1 git push ...` + +Both hooks are activated via `git config core.hooksPath .githooks`. + +## Environment Variables + +| Variable | Default | Description | +|---|---|---| +| `PORT` | `8080` | HTTP listen port | +| `AUTH_TOKEN` | *(none)* | Bearer token for `/evaluate`. 
If unset, auth is disabled | +| `SESSION_TTL_SECS` | `1800` | Max session lifetime before reaping | +| `MAX_CONCURRENT_EVALS` | `4` | Maximum parallel evaluations | +| `CLONE_TIMEOUT_SECS` | `120` | Git clone timeout | +| `AGENT_TIMEOUT_SECS` | `600` | Agent execution timeout | +| `TEST_TIMEOUT_SECS` | `300` | Test suite timeout | +| `MAX_AGENT_CODE_BYTES` | `5242880` | Max agent code payload (5MB) | +| `MAX_OUTPUT_BYTES` | `1048576` | Max captured output per command (1MB) | +| `WORKSPACE_BASE` | `/tmp/sessions` | Base directory for session workspaces | diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..6e8bf73 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.1.0 diff --git a/src/AGENTS.md b/src/AGENTS.md new file mode 100644 index 0000000..2ed2326 --- /dev/null +++ b/src/AGENTS.md @@ -0,0 +1,92 @@ +# AGENTS.md — src/ (term-executor core) + +This is a single-crate binary. All source files live in `src/` with no sub-modules or nested directories. + +## Module Dependency Graph + +``` +main.rs + ├── config.rs (Config::from_env) + ├── handlers.rs (Axum router + AppState) + │ ├── auth.rs (check_token for /evaluate) + │ ├── executor.rs (spawned from evaluate handler) + │ │ ├── task.rs (download, extract, parse) + │ │ ├── session.rs (EvalResult mutation) + │ │ └── cleanup.rs (work dir removal) + │ ├── metrics.rs (Prometheus rendering) + │ └── session.rs (SessionManager CRUD) + ├── session.rs (reaper_loop spawned from main) + └── cleanup.rs (reap_stale_sessions spawned from main) +``` + +## File-by-File Guide + +### `main.rs` +- Entry point. Initializes tracing, config, session manager, metrics, executor, semaphore. +- Creates `AppState`, builds Axum router, spawns background tasks (session reaper, stale dir reaper). +- Binds to `0.0.0.0:{PORT}` with graceful shutdown on SIGTERM/CTRL+C. +- **Convention**: Background tasks are spawned with `tokio::spawn` and run indefinitely. + +### `config.rs` +- `Config` struct with all environment-driven settings. 
+- `Config::from_env()` reads env vars with `env_parse()` helper (returns default on missing/invalid). +- `Config::print_banner()` logs a formatted startup banner. +- **Convention**: Add new config fields here, with a `DEFAULT_*` constant and an env var name. Always provide a sensible default. + +### `handlers.rs` +- Defines `AppState` struct (all fields `Arc`-wrapped for sharing). +- `router()` builds the Axum `Router` with all routes and shared state. +- Route handlers: `health`, `status`, `metrics`, `evaluate`, `get_eval`, `list_evals`. +- `evaluate` handler does: auth check → payload validation → capacity check → session creation → executor spawn. +- **Convention**: Return `Result` from handlers that can fail. Use `Json(serde_json::json!({...}))` for responses. + +### `auth.rs` +- `auth_middleware` — Axum middleware (currently unused in router, auth is inline in `evaluate`). +- `check_token(auth_header, expected)` — simple Bearer token comparison used by `evaluate` handler. +- `inject_request_id` — adds `x-request-id` UUID header to responses. +- **Convention**: Auth is optional — if `AUTH_TOKEN` env var is unset, `/evaluate` is open. + +### `executor.rs` +- `Executor::spawn_eval(session)` — spawns a tokio task that runs the full evaluation pipeline. +- `run_eval(config, session, cancel_rx)` — orchestrates: download → clone → install → agent → tests → cleanup. +- `run_cmd(argv, cwd, timeout, env)` / `run_shell(shell_cmd, cwd, timeout, env)` — process execution with timeout. +- `truncate_output(raw)` — caps output at 1MB. +- `agent_extension(language)` / `agent_runner(language, script_path)` — maps language strings to file extensions and runner commands. +- **Convention**: Every phase checks `cancel_rx` for cancellation. Every process has a timeout. Output is always truncated. + +### `session.rs` +- `EvalRequest`, `EvalStatus` (enum), `EvalStep` (enum), `TaskTestResult`, `EvalResult` — core data types. 
+- `Session` — holds id, request, result (`Arc`-wrapped `EvalResult`), created_at, cancel channel. +- `SessionManager` — `DashMap`-backed session store with create/get/remove/list/mark operations. +- `reaper_loop()` — runs every 60s, removes sessions older than TTL, sends cancel signal. +- **Convention**: All enums use `#[serde(rename_all = "snake_case")]`. Session IDs are UUID v4 strings. + +### `task.rs` +- `download_and_extract(url, dest)` — HTTP GET → bytes → extract (zip or tar.gz) in a blocking task. +- `parse_task(task_dir)` — reads `workspace.yaml`, `prompt.md`, `tests/` directory, `checks.txt`. +- `find_task_root(base)` — locates `workspace.yaml` in extracted archive (direct or one level nested). +- `WorkspaceConfig` — deserialized from `workspace.yaml` (repo, version, base_commit, install, language). +- `SweForgeTask` — parsed task with workspace config, prompt text, test scripts, test source files. +- **Convention**: `.sh` files in `tests/` are test scripts (executed); all other files are source files (written to repo). Archive size capped at 100MB. + +### `metrics.rs` +- `Metrics` — atomic counters for evals total/passed/failed/cancelled/active/duration_sum. +- `start_eval()` / `finish_eval(passed, duration_ms)` / `cancel_eval()` — counter operations. +- `render_prometheus()` — formats counters as Prometheus text exposition format. +- **Convention**: All counters are `AtomicU64` with `Ordering::Relaxed`. Metrics are exposed at `GET /metrics`. + +### `cleanup.rs` +- `remove_work_dir(path)` — async directory removal (logs warning on failure, never panics). +- `kill_process_group(pgid)` — best-effort `kill -9` on a process group. +- `reap_stale_sessions(base, max_age_secs)` — scans workspace base, removes dirs older than TTL. +- **Convention**: Cleanup functions are fire-and-forget. They log but never return errors. + +## Testing + +Every module has a `#[cfg(test)] mod tests` block.
Tests use: +- `#[test]` for sync unit tests +- `#[tokio::test]` for async tests +- `tempfile::tempdir()` for filesystem tests +- No external test fixtures or mock servers needed + +Run all tests: `cargo test` or `cargo +nightly test --release`