diff --git a/.githooks/pre-commit b/.githooks/pre-commit
new file mode 100755
index 0000000..9857579
--- /dev/null
+++ b/.githooks/pre-commit
@@ -0,0 +1,27 @@
+#!/bin/bash
+# pre-commit hook: runs the rustfmt check and clippy before a commit is created.
+# `set -e` makes the first failing command end the hook with a non-zero status.
+set -e
+
+# Escape hatch: SKIP_GIT_HOOKS=1 bypasses every check below.
+if [ "${SKIP_GIT_HOOKS:-0}" = "1" ]; then
+    echo "[pre-commit] Skipped (SKIP_GIT_HOOKS=1)"
+    exit 0
+fi
+
+printf '%s\n' \
+    "╔══════════════════════════════════════╗" \
+    "║       pre-commit quality gate        ║" \
+    "╚══════════════════════════════════════╝"
+
+# Step 1: formatting must already be clean (--check only reports, never rewrites).
+printf '\n%s\n' "▶ Checking formatting (cargo +nightly fmt)..."
+cargo +nightly fmt --all -- --check
+echo "  ✓ Formatting OK"
+
+# Step 2: clippy over all targets with every warning promoted to an error.
+printf '\n%s\n' "▶ Running clippy..."
+cargo +nightly clippy --all-targets -- -D warnings
+echo "  ✓ Clippy OK"
+
+printf '\n%s\n' "✅ pre-commit passed"
diff --git a/.githooks/pre-push b/.githooks/pre-push
new file mode 100755
index 0000000..5257348
--- /dev/null
+++ b/.githooks/pre-push
@@ -0,0 +1,39 @@
+#!/bin/bash
+set -e
+
+# Skip hooks if SKIP_GIT_HOOKS=1 is set
+if [ "${SKIP_GIT_HOOKS:-0}" = "1" ]; then
+    echo "[pre-push] Skipped (SKIP_GIT_HOOKS=1)"
+    exit 0
+fi
+
+echo "╔══════════════════════════════════════╗"
+echo "║        pre-push quality gate         ║"
+echo "╚══════════════════════════════════════╝"
+
+# 1. Format check
+echo ""
+echo "▶ Checking formatting (cargo +nightly fmt)..."
+cargo +nightly fmt --all -- --check
+echo "  ✓ Formatting OK"
+
+# 2. Clippy lint
+echo ""
+echo "▶ Running clippy..."
+cargo +nightly clippy --all-targets -- -D warnings
+echo "  ✓ Clippy OK"
+
+# 3. Tests
+echo ""
+echo "▶ Running tests..."
+cargo +nightly test --release -- --test-threads=$(nproc 2>/dev/null || echo 4)
+echo "  ✓ Tests OK"
+
+# 4. Release build
+echo ""
+echo "▶ Building release binary..."
+cargo +nightly build --release
+echo "  ✓ Build OK"
+
+echo ""
+echo "✅ pre-push passed — safe to push"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bb30f85..aac2e42 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -17,9 +17,9 @@ env:
   RUST_BACKTRACE: short
 
 jobs:
-  lint:
-    name: Lint & Format
-    runs-on: blacksmith-4vcpu-ubuntu-2404
+  check:
+    name: Format + Lint + Test + Build
+    runs-on: blacksmith-32vcpu-ubuntu-2404
     steps:
       - uses: actions/checkout@v4
 
@@ -38,9 +38,9 @@ jobs:
             ~/.cargo/registry
             ~/.cargo/git
             target
-          key: lint-${{ runner.os }}-nightly-${{ hashFiles('**/Cargo.lock') }}
+          key: check-${{ runner.os }}-nightly-${{ hashFiles('**/Cargo.lock') }}
           restore-keys: |
-            lint-${{ runner.os }}-nightly-
+            check-${{ runner.os }}-nightly-
 
       - name: Check formatting
         run: cargo +nightly fmt --all -- --check
@@ -48,28 +48,10 @@ jobs:
       - name: Clippy
         run: cargo +nightly clippy --all-targets -- -D warnings
 
-  build:
-    name: Build (nightly, release, 32 threads)
-    runs-on: blacksmith-32vcpu-ubuntu-2404
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Install mold linker
-        run: sudo apt-get update -qq && sudo apt-get install -y -qq mold clang
-
-      - name: Install Rust nightly
-        uses: dtolnay/rust-toolchain@nightly
-
-      - name: Cache cargo registry + target
-        uses: actions/cache@v4
-        with:
-          path: |
-            ~/.cargo/registry
-            ~/.cargo/git
-            target
-          key: build-${{ runner.os }}-nightly-${{ hashFiles('**/Cargo.lock') }}
-          restore-keys: |
-            build-${{ runner.os }}-nightly-
+      - name: Run tests
+        run: cargo +nightly test --release -j $(nproc) -- --test-threads=$(nproc)
+        env:
+          RUST_LOG: warn
 
       - name: Build release
         run: cargo +nightly build --release -j $(nproc)
@@ -83,38 +65,10 @@ jobs:
           path: target/release/term-executor
           retention-days: 7
 
-  test:
-    name: Tests (nightly, 32 threads)
-    runs-on: blacksmith-32vcpu-ubuntu-2404
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Install mold linker
-        run: sudo apt-get update -qq && sudo apt-get install -y -qq mold clang
-
-      - name: Install Rust nightly
-        uses: dtolnay/rust-toolchain@nightly
-
-      - name: Cache cargo registry + target
-        uses: actions/cache@v4
-        with:
-          path: |
-            ~/.cargo/registry
-            ~/.cargo/git
-            target
-          key: test-${{ runner.os }}-nightly-${{ hashFiles('**/Cargo.lock') }}
-          restore-keys: |
-            test-${{ runner.os }}-nightly-
-
-      - name: Run tests
-        run: cargo +nightly test --release -j $(nproc) -- --test-threads=$(nproc)
-        env:
-          RUST_LOG: warn
-
   docker:
     name: Docker build
     runs-on: blacksmith-32vcpu-ubuntu-2404
-    needs: [lint, build, test]
+    needs: [check]
     if: github.ref == 'refs/heads/main'
     steps:
       - uses: actions/checkout@v4
@@ -138,3 +92,31 @@ jobs:
           tags: |
             ghcr.io/${{ steps.lower.outputs.repo }}:latest
             ghcr.io/${{ steps.lower.outputs.repo }}:${{ github.sha }}
+
+  release:  # runs semantic-release once the `check` job has succeeded
+    name: Semantic Release
+    runs-on: ubuntu-latest
+    needs: [check]
+    if: github.ref == 'refs/heads/main' && github.event_name == 'push'  # pushes to main only
+    permissions:
+      contents: write  # push the release commit/tag and publish the GitHub release (see .releaserc.json)
+      issues: write  # presumably for @semantic-release/github issue comments; confirm
+      pull-requests: write  # presumably for @semantic-release/github PR comments; confirm
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # full history instead of a shallow clone
+          persist-credentials: false  # keep the checkout token out of git config; GITHUB_TOKEN is passed via env below
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: 22
+
+      - name: Install semantic-release  # core package plus the plugins listed in .releaserc.json
+        run: npm install -g semantic-release @semantic-release/changelog @semantic-release/git @semantic-release/exec
+
+      - name: Run semantic-release
+        run: npx semantic-release
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}  # workflow token, scoped by `permissions` above
diff --git a/.releaserc.json b/.releaserc.json
new file mode 100644
index 0000000..5c1bbe8
--- /dev/null
+++ b/.releaserc.json
@@ -0,0 +1,28 @@
+{
+  "branches": ["main"],
+  "tagFormat": "v${version}",
+  "plugins": [
+    "@semantic-release/commit-analyzer",
+    "@semantic-release/release-notes-generator",
+    [
+      "@semantic-release/changelog",
+      {
+        "changelogFile": "CHANGELOG.md"
+      }
+    ],
+    [
+      "@semantic-release/exec",
+      {
+        "prepareCmd": "echo ${nextRelease.version} > VERSION && sed -i '0,/^version = \".*\"/s//version = \"${nextRelease.version}\"/' Cargo.toml"
+      }
+    ],
+    [
+      "@semantic-release/git",
+      {
+        "assets": ["VERSION", "CHANGELOG.md", "Cargo.toml"],
+        "message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}"
+      }
+    ],
+    "@semantic-release/github"
+  ]
+}
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..8886dd3
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,160 @@
+# AGENTS.md — term-executor
+
+## Project Purpose
+
+**term-executor** is a remote evaluation executor for the [term-challenge](https://github.com/PlatformNetwork/term-challenge) platform. It runs as a containerized Rust service on [Basilica](https://basilica.ai) that receives agent code submissions, executes them against a cloned task repository, runs validation test scripts, and reports pass/fail results. It is the core compute backend that evaluates AI agent coding challenges.
+
+## Architecture Overview
+
+This is a **single-crate Rust binary** (`term-executor`) built with Axum. There are no sub-crates or workspaces.
+
+### Data Flow
+
+```
+Platform Server → POST /evaluate → term-executor
+  1. Download task archive (.tar.gz / .zip) from task_url
+  2. Parse workspace.yaml, prompt.md, tests/
+  3. git clone the target repository at base_commit
+  4. Run install commands (pip install, etc.)
+  5. Write & execute agent code in the repo
+  6. Write test source files into the repo
+  7. Run test scripts (bash), collect exit codes
+  8. Return results via GET /evaluate/{id}
+```
+
+### Module Map
+
+| File | Responsibility |
+|---|---|
+| `src/main.rs` | Entry point — bootstraps config, session manager, executor, Axum server, reaper tasks |
+| `src/config.rs` | `Config` struct loaded from environment variables with defaults |
+| `src/handlers.rs` | Axum route handlers: `/health`, `/status`, `/metrics`, `/evaluate`, `/evaluate/{id}`, `/evaluations` |
+| `src/auth.rs` | Bearer token authentication middleware and `check_token()` helper |
+| `src/executor.rs` | Core evaluation engine — spawns async tasks that clone repos, run agents, run tests |
+| `src/session.rs` | `SessionManager` with `DashMap`, `Session`, `EvalResult`, `EvalStatus`, `EvalStep` types |
+| `src/task.rs` | Task archive download/extraction (zip/tar.gz), `workspace.yaml` parsing, test file loading |
+| `src/metrics.rs` | Atomic counter-based Prometheus metrics (total, passed, failed, cancelled, active, duration) |
+| `src/cleanup.rs` | Work directory removal, stale session reaping, process group killing |
+
+### Key Shared State (via `Arc`)
+
+- `AppState` (in `handlers.rs`) holds `Config`, `SessionManager`, `Metrics`, `Executor`, `Semaphore`
+- `SessionManager` uses `DashMap<String, Arc<Session>>` for lock-free concurrent access
+- `Semaphore` controls max concurrent evaluations (default: 4)
+
+## Tech Stack
+
+- **Language**: Rust (edition 2021, nightly toolchain for fmt/clippy)
+- **Async Runtime**: Tokio (full features + process)
+- **Web Framework**: Axum 0.7 with Tower middleware
+- **HTTP Client**: reqwest 0.12 (for downloading task archives)
+- **Serialization**: serde + serde_json + serde_yaml
+- **Concurrency**: `DashMap` 6, `parking_lot` 0.12, `tokio::sync::Semaphore`
+- **Archive Handling**: `flate2` + `tar` (tar.gz), `zip` 2 (zip)
+- **Error Handling**: `anyhow` 1 + `thiserror` 2
+- **Logging**: `tracing` + `tracing-subscriber` with env-filter
+- **Build Tooling**: `mold` linker via `.cargo/config.toml`, `clang` as linker driver
+- **Container**: Multi-stage Dockerfile — `rust:1.93-slim-bookworm` builder → `debian:bookworm-slim` runtime
+- **CI**: GitHub Actions on Blacksmith 32-vCPU runners (the release job uses `ubuntu-latest`), nightly Rust
+
+## CRITICAL RULES
+
+1. **Always use `cargo +nightly fmt --all` before committing.** The CI enforces `--check` and will reject unformatted code. The project uses the nightly formatter exclusively.
+
+2. **All clippy warnings are errors.** Run `cargo +nightly clippy --all-targets -- -D warnings` locally. CI runs the same command and will fail on any warning.
+
+3. **Never expose secrets in logs or responses.** The `AUTH_TOKEN` environment variable is sensitive. Auth failures log only the `x-forwarded-for` header, never the token value. Follow this pattern for any new secrets.
+
+4. **All process execution MUST have timeouts.** Every call to `run_cmd`/`run_shell` in `src/executor.rs` takes a `Duration` timeout. Never spawn a child process without a timeout — agent code is untrusted and may hang forever.
+
+5. **Output MUST be truncated.** The `truncate_output()` function in `src/executor.rs` caps output at `MAX_OUTPUT` (1MB). Any new command output capture must use this function to prevent memory exhaustion from malicious agent output.
+
+6. **Shared state must use `Arc` + lock-free structures.** `SessionManager` uses `DashMap` (not `Mutex<HashMap>`). Metrics use `AtomicU64`. New shared state should follow these patterns — never use `std::sync::Mutex` for hot-path data.
+
+7. **Semaphore must gate evaluation capacity.** The `Semaphore` in `AppState` limits concurrent evaluations to `MAX_CONCURRENT_EVALS`. Any new evaluation path must acquire a permit before spawning work.
+
+8. **Session cleanup is mandatory.** Every evaluation must clean up its work directory in `src/executor.rs` (the `Cleanup` step). The stale session reaper in `src/cleanup.rs` is a safety net, not a primary mechanism.
+
+9. **Error handling: use `anyhow::Result` for internal logic, `(StatusCode, String)` for HTTP responses.** Handler functions in `src/handlers.rs` return `Result<impl IntoResponse, (StatusCode, String)>`. Internal executor/task functions return `anyhow::Result<T>`.
+
+10. **All new fields on serialized structs must use `#[serde(default)]` or `Option<T>`.** The `EvalRequest`, `EvalResult`, and `WorkspaceConfig` structs are deserialized from external input. Missing fields must not break deserialization.
+
+## DO / DO NOT
+
+### DO
+- Write unit tests for all new public functions (see existing `#[cfg(test)]` modules in every file)
+- Use `tracing::info!`/`warn!`/`error!` for logging (not `println!`)
+- Add new routes in `src/handlers.rs` via the `router()` function
+- Use `tokio::fs` for async file I/O in the executor pipeline
+- Keep the Dockerfile minimal — runtime image has no compilers or language runtimes
+- Use conventional commits (`feat:`, `fix:`, `perf:`, `chore:`, etc.)
+
+### DO NOT
+- Do NOT add `unsafe` code — there is none in this project and it should stay that way
+- Do NOT add synchronous blocking I/O in async functions — use `tokio::task::spawn_blocking` for CPU-heavy work (see `extract_archive` in `src/task.rs`)
+- Do NOT store large data (agent output, test output) in memory without truncation
+- Do NOT add new dependencies without justification — the binary must stay small for container deployment
+- Do NOT use `unwrap()` in production code paths — use `?` or `context()` from anyhow. `unwrap()` is only acceptable in tests and infallible cases (like parsing a known-good string)
+- Do NOT modify `.cargo/config.toml` — it configures the mold linker for fast builds
+
+## Build & Test Commands
+
+```bash
+# Build (debug)
+cargo build
+
+# Build (release, matches CI)
+cargo +nightly build --release -j $(nproc)
+
+# Run tests
+cargo test
+
+# Run tests (release, matches CI)
+cargo +nightly test --release -j $(nproc) -- --test-threads=$(nproc)
+
+# Format (required before commit)
+cargo +nightly fmt --all
+
+# Format check (what CI runs)
+cargo +nightly fmt --all -- --check
+
+# Lint (required before commit)
+cargo +nightly clippy --all-targets -- -D warnings
+
+# Run locally
+AUTH_TOKEN=test PORT=8080 cargo run
+
+# Docker build
+docker build -t term-executor .
+```
+
+## Git Hooks
+
+The `.githooks/` directory contains automated quality gates:
+
+### pre-commit
+- Runs `cargo +nightly fmt --all -- --check` to enforce formatting
+- Runs `cargo +nightly clippy --all-targets -- -D warnings` to enforce lint
+- Skip with `SKIP_GIT_HOOKS=1 git commit ...`
+
+### pre-push
+- Runs format check, clippy, full test suite, and release build
+- This is the full quality gate matching CI
+- Skip with `SKIP_GIT_HOOKS=1 git push ...`
+
+Activate both hooks by running `git config core.hooksPath .githooks` once per clone.
+
+## Environment Variables
+
+| Variable | Default | Description |
+|---|---|---|
+| `PORT` | `8080` | HTTP listen port |
+| `AUTH_TOKEN` | *(none)* | Bearer token for `/evaluate`. If unset, auth is disabled |
+| `SESSION_TTL_SECS` | `1800` | Max session lifetime before reaping |
+| `MAX_CONCURRENT_EVALS` | `4` | Maximum parallel evaluations |
+| `CLONE_TIMEOUT_SECS` | `120` | Git clone timeout |
+| `AGENT_TIMEOUT_SECS` | `600` | Agent execution timeout |
+| `TEST_TIMEOUT_SECS` | `300` | Test suite timeout |
+| `MAX_AGENT_CODE_BYTES` | `5242880` | Max agent code payload (5MB) |
+| `MAX_OUTPUT_BYTES` | `1048576` | Max captured output per command (1MB) |
+| `WORKSPACE_BASE` | `/tmp/sessions` | Base directory for session workspaces |
diff --git a/VERSION b/VERSION
new file mode 100644
index 0000000..6e8bf73
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+0.1.0
diff --git a/src/AGENTS.md b/src/AGENTS.md
new file mode 100644
index 0000000..2ed2326
--- /dev/null
+++ b/src/AGENTS.md
@@ -0,0 +1,92 @@
+# AGENTS.md — src/ (term-executor core)
+
+This is a single-crate binary. All source files live directly in `src/`; there are no nested module directories.
+
+## Module Dependency Graph
+
+```
+main.rs
+  ├── config.rs      (Config::from_env)
+  ├── handlers.rs     (Axum router + AppState)
+  │     ├── auth.rs   (check_token for /evaluate)
+  │     ├── executor.rs (spawned from evaluate handler)
+  │     │     ├── task.rs (download, extract, parse)
+  │     │     ├── session.rs (EvalResult mutation)
+  │     │     └── cleanup.rs (work dir removal)
+  │     ├── metrics.rs (Prometheus rendering)
+  │     └── session.rs (SessionManager CRUD)
+  ├── session.rs      (reaper_loop spawned from main)
+  └── cleanup.rs      (reap_stale_sessions spawned from main)
+```
+
+## File-by-File Guide
+
+### `main.rs`
+- Entry point. Initializes tracing, config, session manager, metrics, executor, semaphore.
+- Creates `AppState`, builds Axum router, spawns background tasks (session reaper, stale dir reaper).
+- Binds to `0.0.0.0:{PORT}` with graceful shutdown on SIGTERM/CTRL+C.
+- **Convention**: Background tasks are spawned with `tokio::spawn` and run indefinitely.
+
+### `config.rs`
+- `Config` struct with all environment-driven settings.
+- `Config::from_env()` reads env vars with `env_parse()` helper (returns default on missing/invalid).
+- `Config::print_banner()` logs a formatted startup banner.
+- **Convention**: Add new config fields here, with a `DEFAULT_*` constant and an env var name. Always provide a sensible default.
+
+### `handlers.rs`
+- Defines `AppState` struct (all fields `Arc`-wrapped for sharing).
+- `router()` builds the Axum `Router` with all routes and shared state.
+- Route handlers: `health`, `status`, `metrics`, `evaluate`, `get_eval`, `list_evals`.
+- `evaluate` handler does: auth check → payload validation → capacity check → session creation → executor spawn.
+- **Convention**: Return `Result<impl IntoResponse, (StatusCode, String)>` from handlers that can fail. Use `Json(serde_json::json!({...}))` for responses.
+
+### `auth.rs`
+- `auth_middleware` — Axum middleware (currently unused in router, auth is inline in `evaluate`).
+- `check_token(auth_header, expected)` — simple Bearer token comparison used by `evaluate` handler.
+- `inject_request_id` — adds `x-request-id` UUID header to responses.
+- **Convention**: Auth is optional — if `AUTH_TOKEN` env var is unset, `/evaluate` is open.
+
+### `executor.rs`
+- `Executor::spawn_eval(session)` — spawns a tokio task that runs the full evaluation pipeline.
+- `run_eval(config, session, cancel_rx)` — orchestrates: download → clone → install → agent → tests → cleanup.
+- `run_cmd(argv, cwd, timeout, env)` / `run_shell(shell_cmd, cwd, timeout, env)` — process execution with timeout.
+- `truncate_output(raw)` — caps output at 1MB.
+- `agent_extension(language)` / `agent_runner(language, script_path)` — maps language strings to file extensions and runner commands.
+- **Convention**: Every phase checks `cancel_rx` for cancellation. Every process has a timeout. Output is always truncated.
+
+### `session.rs`
+- `EvalRequest`, `EvalStatus` (enum), `EvalStep` (enum), `TaskTestResult`, `EvalResult` — core data types.
+- `Session` — holds id, request, result (`Arc<Mutex<EvalResult>>`), created_at, cancel channel.
+- `SessionManager` — `DashMap`-backed session store with create/get/remove/list/mark operations.
+- `reaper_loop()` — runs every 60s, removes sessions older than TTL, sends cancel signal.
+- **Convention**: All enums use `#[serde(rename_all = "snake_case")]`. Session IDs are UUID v4 strings.
+
+### `task.rs`
+- `download_and_extract(url, dest)` — HTTP GET → bytes → extract (zip or tar.gz) in a blocking task.
+- `parse_task(task_dir)` — reads `workspace.yaml`, `prompt.md`, `tests/` directory, `checks.txt`.
+- `find_task_root(base)` — locates `workspace.yaml` in extracted archive (direct or one level nested).
+- `WorkspaceConfig` — deserialized from `workspace.yaml` (repo, version, base_commit, install, language).
+- `SweForgeTask` — parsed task with workspace config, prompt text, test scripts, test source files.
+- **Convention**: `.sh` files in `tests/` are test scripts (executed); all other files are source files (written to repo). Archive size capped at 100MB.
+
+### `metrics.rs`
+- `Metrics` — atomic counters for evals total/passed/failed/cancelled/active/duration_sum.
+- `start_eval()` / `finish_eval(passed, duration_ms)` / `cancel_eval()` — counter operations.
+- `render_prometheus()` — formats counters as Prometheus text exposition format.
+- **Convention**: All counters are `AtomicU64` with `Ordering::Relaxed`. Metrics are exposed at `GET /metrics`.
+
+### `cleanup.rs`
+- `remove_work_dir(path)` — async directory removal (logs warning on failure, never panics).
+- `kill_process_group(pgid)` — best-effort `kill -9` on a process group.
+- `reap_stale_sessions(base, max_age_secs)` — scans workspace base, removes dirs older than TTL.
+- **Convention**: Cleanup functions are fire-and-forget. They log but never return errors.
+
+## Testing
+
+Every module has a `#[cfg(test)] mod tests` block. Tests use:
+- `#[test]` for sync unit tests
+- `#[tokio::test]` for async tests
+- `tempfile::tempdir()` for filesystem tests
+- No external test fixtures or mock servers needed
+
+Run all tests: `cargo test` or `cargo +nightly test --release`