Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .githooks/pre-commit
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
#
# pre-commit — fast local quality gate.
#
# Runs the nightly formatter in --check mode and clippy with warnings denied.
# Any failing step aborts the hook with a non-zero exit status, which makes
# git abandon the commit.
#
# Environment:
#   SKIP_GIT_HOOKS  set to 1 to bypass this hook entirely.

# -e: stop at the first failing step; -u: treat unset variables as errors;
# pipefail: a failure in any pipeline stage fails the whole pipeline.
set -euo pipefail

# Print a diagnostic to stderr and abort the hook.
fail() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Skip hooks if SKIP_GIT_HOOKS=1 is set
if [ "${SKIP_GIT_HOOKS:-0}" = "1" ]; then
  echo "[pre-commit] Skipped (SKIP_GIT_HOOKS=1)"
  exit 0
fi

echo "╔══════════════════════════════════════╗"
echo "║ pre-commit quality gate ║"
echo "╚══════════════════════════════════════╝"

# 1. Format check (read-only: --check reports diffs without rewriting files)
echo ""
echo "▶ Checking formatting (cargo +nightly fmt)..."
cargo +nightly fmt --all -- --check \
  || fail " ✗ Formatting check failed — run: cargo +nightly fmt --all"
echo " ✓ Formatting OK"

# 2. Clippy lint (-D warnings turns every warning into an error)
echo ""
echo "▶ Running clippy..."
cargo +nightly clippy --all-targets -- -D warnings \
  || fail " ✗ Clippy reported warnings — fix them before committing"
echo " ✓ Clippy OK"

echo ""
echo "✅ pre-commit passed"
39 changes: 39 additions & 0 deletions .githooks/pre-push
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
#
# pre-push — full local quality gate.
#
# Runs, in order: format check, clippy, the test suite in release mode, and a
# release build, all with the nightly toolchain. Any failing step aborts the
# hook with a non-zero exit status, which makes git abandon the push.
#
# Environment:
#   SKIP_GIT_HOOKS  set to 1 to bypass this hook entirely.

# -e: stop at the first failing step; -u: treat unset variables as errors;
# pipefail: a failure in any pipeline stage fails the whole pipeline.
set -euo pipefail

# Print a diagnostic to stderr and abort the hook.
fail() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Skip hooks if SKIP_GIT_HOOKS=1 is set
if [ "${SKIP_GIT_HOOKS:-0}" = "1" ]; then
  echo "[pre-push] Skipped (SKIP_GIT_HOOKS=1)"
  exit 0
fi

echo "╔══════════════════════════════════════╗"
echo "║ pre-push quality gate ║"
echo "╚══════════════════════════════════════╝"

# 1. Format check (read-only: --check reports diffs without rewriting files)
echo ""
echo "▶ Checking formatting (cargo +nightly fmt)..."
cargo +nightly fmt --all -- --check \
  || fail " ✗ Formatting check failed — run: cargo +nightly fmt --all"
echo " ✓ Formatting OK"

# 2. Clippy lint (-D warnings turns every warning into an error)
echo ""
echo "▶ Running clippy..."
cargo +nightly clippy --all-targets -- -D warnings \
  || fail " ✗ Clippy reported warnings — fix them before pushing"
echo " ✓ Clippy OK"

# 3. Tests
echo ""
echo "▶ Running tests..."
# One test thread per CPU; fall back to 4 where `nproc` is unavailable.
# Computed once and passed quoted so the value can never be word-split.
test_threads="$(nproc 2>/dev/null || echo 4)"
cargo +nightly test --release -- --test-threads="${test_threads}" \
  || fail " ✗ Tests failed"
echo " ✓ Tests OK"

# 4. Release build
echo ""
echo "▶ Building release binary..."
cargo +nightly build --release \
  || fail " ✗ Release build failed"
echo " ✓ Build OK"

echo ""
echo "✅ pre-push passed — safe to push"
94 changes: 38 additions & 56 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ env:
RUST_BACKTRACE: short

jobs:
lint:
name: Lint & Format
runs-on: blacksmith-4vcpu-ubuntu-2404
check:
name: Format + Lint + Test + Build
runs-on: blacksmith-32vcpu-ubuntu-2404
steps:
- uses: actions/checkout@v4

Expand All @@ -38,38 +38,20 @@ jobs:
~/.cargo/registry
~/.cargo/git
target
key: lint-${{ runner.os }}-nightly-${{ hashFiles('**/Cargo.lock') }}
key: check-${{ runner.os }}-nightly-${{ hashFiles('**/Cargo.lock') }}
restore-keys: |
lint-${{ runner.os }}-nightly-
check-${{ runner.os }}-nightly-

- name: Check formatting
run: cargo +nightly fmt --all -- --check

- name: Clippy
run: cargo +nightly clippy --all-targets -- -D warnings

build:
name: Build (nightly, release, 32 threads)
runs-on: blacksmith-32vcpu-ubuntu-2404
steps:
- uses: actions/checkout@v4

- name: Install mold linker
run: sudo apt-get update -qq && sudo apt-get install -y -qq mold clang

- name: Install Rust nightly
uses: dtolnay/rust-toolchain@nightly

- name: Cache cargo registry + target
uses: actions/cache@v4
with:
path: |
~/.cargo/registry
~/.cargo/git
target
key: build-${{ runner.os }}-nightly-${{ hashFiles('**/Cargo.lock') }}
restore-keys: |
build-${{ runner.os }}-nightly-
- name: Run tests
run: cargo +nightly test --release -j $(nproc) -- --test-threads=$(nproc)
env:
RUST_LOG: warn

- name: Build release
run: cargo +nightly build --release -j $(nproc)
Expand All @@ -83,38 +65,10 @@ jobs:
path: target/release/term-executor
retention-days: 7

test:
name: Tests (nightly, 32 threads)
runs-on: blacksmith-32vcpu-ubuntu-2404
steps:
- uses: actions/checkout@v4

- name: Install mold linker
run: sudo apt-get update -qq && sudo apt-get install -y -qq mold clang

- name: Install Rust nightly
uses: dtolnay/rust-toolchain@nightly

- name: Cache cargo registry + target
uses: actions/cache@v4
with:
path: |
~/.cargo/registry
~/.cargo/git
target
key: test-${{ runner.os }}-nightly-${{ hashFiles('**/Cargo.lock') }}
restore-keys: |
test-${{ runner.os }}-nightly-

- name: Run tests
run: cargo +nightly test --release -j $(nproc) -- --test-threads=$(nproc)
env:
RUST_LOG: warn

docker:
name: Docker build
runs-on: blacksmith-32vcpu-ubuntu-2404
needs: [lint, build, test]
needs: [check]
if: github.ref == 'refs/heads/main'
steps:
- uses: actions/checkout@v4
Expand All @@ -138,3 +92,31 @@ jobs:
tags: |
ghcr.io/${{ steps.lower.outputs.repo }}:latest
ghcr.io/${{ steps.lower.outputs.repo }}:${{ github.sha }}

release:
  # Cuts a release with semantic-release. Runs only for pushes to main and
  # only after the `check` job has succeeded.
  name: Semantic Release
  runs-on: ubuntu-latest
  needs: [check]
  if: github.ref == 'refs/heads/main' && github.event_name == 'push'
  # The release flow pushes a commit/tag and creates a GitHub release, and the
  # GitHub plugin may comment on issues and pull requests.
  permissions:
    contents: write
    issues: write
    pull-requests: write
  steps:
    - uses: actions/checkout@v4
      with:
        # Full history and tags, so the previous release tag and all commits
        # since it are available for analysis.
        fetch-depth: 0
        # Do not leave the checkout credential in the local git config;
        # the release step authenticates via GITHUB_TOKEN from its env.
        persist-credentials: false

    - name: Setup Node.js
      uses: actions/setup-node@v4
      with:
        node-version: 22

    # Major versions are pinned so a new upstream major cannot silently
    # change release behaviour; bump them deliberately.
    - name: Install semantic-release
      run: >-
        npm install -g
        semantic-release@24
        @semantic-release/changelog@6
        @semantic-release/git@10
        @semantic-release/exec@6

    - name: Run semantic-release
      run: npx semantic-release
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
28 changes: 28 additions & 0 deletions .releaserc.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
  "branches": ["main"],
  "tagFormat": "v${version}",
  "plugins": [
    "@semantic-release/commit-analyzer",
    "@semantic-release/release-notes-generator",
    [
      "@semantic-release/changelog",
      {
        "changelogFile": "CHANGELOG.md"
      }
    ],
    [
      "@semantic-release/exec",
      {
        "prepareCmd": "echo ${nextRelease.version} > VERSION && sed -i '0,/^version = \".*\"/s//version = \"${nextRelease.version}\"/' Cargo.toml"
      }
    ],
    [
      "@semantic-release/git",
      {
        "assets": ["VERSION", "CHANGELOG.md", "Cargo.toml"],
        "message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}"
      }
    ],
    "@semantic-release/github"
  ]
}
160 changes: 160 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# AGENTS.md — term-executor

## Project Purpose

**term-executor** is a remote evaluation executor for the [term-challenge](https://github.com/PlatformNetwork/term-challenge) platform. It runs as a containerized Rust service on [Basilica](https://basilica.ai) that receives agent code submissions, executes them against a cloned task repository, runs validation test scripts, and reports pass/fail results. It is the core compute backend that evaluates AI agent coding challenges.

## Architecture Overview

This is a **single-crate Rust binary** (`term-executor`) built with Axum. There are no sub-crates or workspaces.

### Data Flow

```
Platform Server → POST /evaluate → term-executor
1. Download task archive (.tar.gz / .zip) from task_url
2. Parse workspace.yaml, prompt.md, tests/
3. git clone the target repository at base_commit
4. Run install commands (pip install, etc.)
5. Write & execute agent code in the repo
6. Write test source files into the repo
7. Run test scripts (bash), collect exit codes
8. Return results via GET /evaluate/{id}
```

### Module Map

| File | Responsibility |
|---|---|
| `src/main.rs` | Entry point — bootstraps config, session manager, executor, Axum server, reaper tasks |
| `src/config.rs` | `Config` struct loaded from environment variables with defaults |
| `src/handlers.rs` | Axum route handlers: `/health`, `/status`, `/metrics`, `/evaluate`, `/evaluate/{id}`, `/evaluations` |
| `src/auth.rs` | Bearer token authentication middleware and `check_token()` helper |
| `src/executor.rs` | Core evaluation engine — spawns async tasks that clone repos, run agents, run tests |
| `src/session.rs` | `SessionManager` with `DashMap`, `Session`, `EvalResult`, `EvalStatus`, `EvalStep` types |
| `src/task.rs` | Task archive download/extraction (zip/tar.gz), `workspace.yaml` parsing, test file loading |
| `src/metrics.rs` | Atomic counter-based Prometheus metrics (total, passed, failed, active, duration) |
| `src/cleanup.rs` | Work directory removal, stale session reaping, process group killing |

### Key Shared State (via `Arc`)

- `AppState` (in `handlers.rs`) holds `Config`, `SessionManager`, `Metrics`, `Executor`, `Semaphore`
- `SessionManager` uses `DashMap<String, Arc<Session>>` for sharded, low-contention concurrent access (DashMap locks per shard internally; it is not strictly lock-free)
- `Semaphore` controls max concurrent evaluations (default: 4)

## Tech Stack

- **Language**: Rust (edition 2021; nightly toolchain for fmt, clippy, tests, and release builds in CI and the git hooks)
- **Async Runtime**: Tokio (full features + process)
- **Web Framework**: Axum 0.7 with Tower middleware
- **HTTP Client**: reqwest 0.12 (for downloading task archives)
- **Serialization**: serde + serde_json + serde_yaml
- **Concurrency**: `DashMap` 6, `parking_lot` 0.12, `tokio::sync::Semaphore`
- **Archive Handling**: `flate2` + `tar` (tar.gz), `zip` 2 (zip)
- **Error Handling**: `anyhow` 1 + `thiserror` 2
- **Logging**: `tracing` + `tracing-subscriber` with env-filter
- **Build Tooling**: `mold` linker via `.cargo/config.toml`, `clang` as linker driver
- **Container**: Multi-stage Dockerfile — `rust:1.93-slim-bookworm` builder → `debian:bookworm-slim` runtime
- **CI**: GitHub Actions — `check` and `docker` jobs on Blacksmith 32 vCPU runners, `release` (semantic-release) on `ubuntu-latest`; nightly Rust

## CRITICAL RULES

1. **Always use `cargo +nightly fmt --all` before committing.** The CI enforces `--check` and will reject unformatted code. The project uses the nightly formatter exclusively.

2. **All clippy warnings are errors.** Run `cargo +nightly clippy --all-targets -- -D warnings` locally. CI runs the same command and will fail on any warning.

3. **Never expose secrets in logs or responses.** The `AUTH_TOKEN` environment variable is sensitive. Auth failures log only the `x-forwarded-for` header, never the token value. Follow this pattern for any new secrets.

4. **All process execution MUST have timeouts.** Every call to `run_cmd`/`run_shell` in `src/executor.rs` takes a `Duration` timeout. Never spawn a child process without a timeout — agent code is untrusted and may hang forever.

5. **Output MUST be truncated.** The `truncate_output()` function in `src/executor.rs` caps output at `MAX_OUTPUT` (1MB). Any new command output capture must use this function to prevent memory exhaustion from malicious agent output.

6. **Shared state must use `Arc` + low-contention concurrent structures.** `SessionManager` uses `DashMap` (not `Mutex<HashMap>`). Metrics use `AtomicU64`. New shared state should follow these patterns — never use `std::sync::Mutex` for hot-path data.

7. **Semaphore must gate evaluation capacity.** The `Semaphore` in `AppState` limits concurrent evaluations to `MAX_CONCURRENT_EVALS`. Any new evaluation path must acquire a permit before spawning work.

8. **Session cleanup is mandatory.** Every evaluation must clean up its work directory in `src/executor.rs` (the `Cleanup` step). The stale session reaper in `src/cleanup.rs` is a safety net, not a primary mechanism.

9. **Error handling: use `anyhow::Result` for internal logic, `(StatusCode, String)` for HTTP responses.** Handler functions in `src/handlers.rs` return `Result<impl IntoResponse, (StatusCode, String)>`. Internal executor/task functions return `anyhow::Result<T>`.

10. **All new fields on serialized structs must use `#[serde(default)]` or `Option<T>`.** The `EvalRequest`, `EvalResult`, and `WorkspaceConfig` structs are deserialized from external input. Missing fields must not break deserialization.

## DO / DO NOT

### DO
- Write unit tests for all new public functions (see existing `#[cfg(test)]` modules in every file)
- Use `tracing::info!`/`warn!`/`error!` for logging (not `println!`)
- Add new routes in `src/handlers.rs` via the `router()` function
- Use `tokio::fs` for async file I/O in the executor pipeline
- Keep the Dockerfile minimal — runtime image has no compilers or language runtimes
- Use conventional commits (`feat:`, `fix:`, `perf:`, `chore:`, etc.)

### DO NOT
- Do NOT add `unsafe` code — there is none in this project and it should stay that way
- Do NOT add synchronous blocking I/O in async functions — move blocking or CPU-heavy work onto `tokio::task::spawn_blocking` (see `extract_archive` in `src/task.rs`)
- Do NOT store large data (agent output, test output) in memory without truncation
- Do NOT add new dependencies without justification — the binary must stay small for container deployment
- Do NOT use `unwrap()` in production code paths — use `?` or `context()` from anyhow. `unwrap()` is only acceptable in tests and infallible cases (like parsing a known-good string)
- Do NOT modify `.cargo/config.toml` — it configures the mold linker for fast builds

## Build & Test Commands

```bash
# Build (debug)
cargo build

# Build (release, matches CI)
cargo +nightly build --release -j $(nproc)

# Run tests
cargo test

# Run tests (release, matches CI)
cargo +nightly test --release -j $(nproc) -- --test-threads=$(nproc)

# Format (required before commit)
cargo +nightly fmt --all

# Format check (what CI runs)
cargo +nightly fmt --all -- --check

# Lint (required before commit)
cargo +nightly clippy --all-targets -- -D warnings

# Run locally
AUTH_TOKEN=test PORT=8080 cargo run

# Docker build
docker build -t term-executor .
```

## Git Hooks

The `.githooks/` directory contains automated quality gates:

### pre-commit
- Runs `cargo +nightly fmt --all -- --check` to enforce formatting
- Runs `cargo +nightly clippy --all-targets -- -D warnings` to enforce lint
- Skip with `SKIP_GIT_HOOKS=1 git commit ...`

### pre-push
- Runs format check, clippy, full test suite, and release build
- This is the full quality gate matching CI
- Skip with `SKIP_GIT_HOOKS=1 git push ...`

Activate both hooks by running `git config core.hooksPath .githooks` in your clone.

## Environment Variables

| Variable | Default | Description |
|---|---|---|
| `PORT` | `8080` | HTTP listen port |
| `AUTH_TOKEN` | *(none)* | Bearer token for `/evaluate`. If unset, auth is disabled |
| `SESSION_TTL_SECS` | `1800` | Max session lifetime before reaping |
| `MAX_CONCURRENT_EVALS` | `4` | Maximum parallel evaluations |
| `CLONE_TIMEOUT_SECS` | `120` | Git clone timeout |
| `AGENT_TIMEOUT_SECS` | `600` | Agent execution timeout |
| `TEST_TIMEOUT_SECS` | `300` | Test suite timeout |
| `MAX_AGENT_CODE_BYTES` | `5242880` | Max agent code payload (5MB) |
| `MAX_OUTPUT_BYTES` | `1048576` | Max captured output per command (1MB) |
| `WORKSPACE_BASE` | `/tmp/sessions` | Base directory for session workspaces |
1 change: 1 addition & 0 deletions VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.1.0
Loading