16 changes: 16 additions & 0 deletions .githooks/pre-commit
@@ -0,0 +1,16 @@
#!/bin/bash
set -e

# Skip hooks when SKIP_GIT_HOOKS=1 is set
if [ "${SKIP_GIT_HOOKS:-0}" = "1" ]; then
echo "=== pre-commit: skipped (SKIP_GIT_HOOKS=1) ==="
exit 0
fi

echo "=== pre-commit: checking formatting ==="
cargo fmt --all -- --check

echo "=== pre-commit: running clippy ==="
cargo clippy --all-targets --all-features -- -D warnings

echo "=== pre-commit: all checks passed ==="
22 changes: 22 additions & 0 deletions .githooks/pre-push
@@ -0,0 +1,22 @@
#!/bin/bash
set -e

# Skip hooks when SKIP_GIT_HOOKS=1 is set
if [ "${SKIP_GIT_HOOKS:-0}" = "1" ]; then
echo "=== pre-push: skipped (SKIP_GIT_HOOKS=1) ==="
exit 0
fi

echo "=== pre-push: checking formatting ==="
cargo fmt --all -- --check

echo "=== pre-push: running clippy ==="
cargo clippy --all-targets --all-features -- -D warnings

echo "=== pre-push: running tests ==="
cargo test --lib

echo "=== pre-push: building ==="
cargo build

echo "=== pre-push: all checks passed ==="
21 changes: 21 additions & 0 deletions .releaserc.json
@@ -0,0 +1,21 @@
{
  "branches": ["main"],
  "plugins": [
    "@semantic-release/commit-analyzer",
    "@semantic-release/release-notes-generator",
    [
      "@semantic-release/exec",
      {
        "prepareCmd": "sed -i 's/^version = \".*\"/version = \"${nextRelease.version}\"/' Cargo.toml && echo '${nextRelease.version}' > VERSION"
      }
    ],
    [
      "@semantic-release/git",
      {
        "assets": ["Cargo.toml", "VERSION"],
        "message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}"
      }
    ],
    "@semantic-release/github"
  ]
}
169 changes: 169 additions & 0 deletions AGENTS.md
@@ -0,0 +1,169 @@
# AGENTS.md — swe-forge

## Project Purpose

**swe-forge** is a high-performance SWE-bench dataset generator and evaluation harness written in Rust. It mines real GitHub pull requests from GH Archive, enriches them via the GitHub API, uses LLMs (via OpenRouter) to classify difficulty and generate test specifications through an agentic multi-turn loop, and exports SWE-bench-compatible task instances. It also includes a Docker-isolated evaluation harness that runs external coding agents on generated tasks and verifies their solutions.

## Architecture Overview

swe-forge is a single Rust binary crate (`src/main.rs`) with a library (`src/lib.rs`) organized into these modules:

```
src/
├── main.rs # CLI entry point (tokio async runtime)
├── lib.rs # Public module declarations
├── cli/ # Clap-based CLI (commands: generate, evaluate, swe mine/harness/validate/export)
├── swe/ # Core mining pipeline (GH Archive → enrich → filter → classify → extract → test gen → export)
│ ├── gharchive.rs # GH Archive HTTP ingestion (gzip → JSON events)
│ ├── enricher.rs # GitHub API PR enrichment (title, body, diff, files)
│ ├── filters.rs # Pre-filter (bots, org repos, language, stars)
│ ├── extractor.rs # Git clone + diff patch extraction
│ ├── test_generator.rs # Agentic multi-turn LLM test generation (up to 200 turns)
│ ├── quality.rs # LLM-based quality scoring
│ ├── prompt_rewriter.rs # PR body → agent prompt (strip test plan leaks)
│ ├── harness.rs # Docker-isolated evaluation harness
│ ├── docker_sandbox.rs # Docker sandbox for test generation
│ ├── orchestrator.rs # End-to-end pipeline orchestrator
│ ├── pipeline.rs # Streaming pipeline with chunk processing
│ └── pr_cache.rs # JSONL-based PR deduplication cache
├── llm/ # LLM integration layer
│ ├── litellm.rs # OpenAI-compatible API client (function calling, tools)
│ ├── providers/ # Provider implementations (OpenRouter)
│ ├── router.rs # Multi-model routing (cost-optimized, round-robin)
│ ├── cache.rs # Prompt caching for multi-conversation efficiency
│ └── cost.rs # Usage tracking with daily/monthly budgets
├── agents/ # Task validation agents (Docker-based verification)
├── execution/ # Docker execution layer (bollard crate, container lifecycle)
├── docker/ # Dockerfile/docker-compose generation
├── export/ # Parquet dataset export + HuggingFace Hub upload
├── runner/ # Agent runner for benchmark evaluation
├── difficulty/ # Difficulty levels, resource limits, scoring
├── anti_hardcoding/ # Canary strings, sealed parameters, contamination detection
├── utils/ # JSON extraction from LLM responses
└── error.rs # Typed error hierarchy (thiserror)
```

### Data Flow

```
GH Archive (hourly dumps, 8x concurrent)
→ Pre-filter (merged PRs, no bots, org repos)
→ GitHub API enrichment (3x concurrent, rate-limited 5000/h)
→ Local filter (language, stars, files changed)
→ LLM pre-classification (10x concurrent, title+body only)
→ Patch extraction (git clone + diff, 3x concurrent)
→ Agentic test generation (Codex-style multi-turn, 3x concurrent)
→ Quality scoring (LLM-based)
→ Export (workspace.yaml + prompt.md + checks.txt)
```
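
A minimal sketch of how a bounded stage like the 3x enrichment step can be wired with `futures::stream`; `enrich_pr` is a hypothetical stand-in for the real stage function:

```rust
use futures::stream::{self, StreamExt};

/// Hypothetical stage function standing in for GitHub API enrichment.
async fn enrich_pr(url: String) -> String {
    // The real stage would call the GitHub API here.
    url
}

/// Process inputs with at most 3 futures in flight, matching the 3x cap.
async fn run_stage(urls: Vec<String>) -> Vec<String> {
    stream::iter(urls)
        .map(enrich_pr)
        .buffer_unordered(3) // bounded concurrency
        .collect()
        .await
}
```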

## Tech Stack

| Component | Technology |
|-----------|-----------|
| Language | Rust (edition 2021, nightly toolchain) |
| Async runtime | Tokio (full features) |
| CLI framework | Clap 4 (derive mode) |
| HTTP client | reqwest 0.13 (rustls) |
| Docker | bollard 0.16 (SSL) |
| Serialization | serde + serde_json + serde_yaml |
| Database | SQLx 0.7 (Postgres + SQLite, migrations) |
| Data export | Apache Arrow 54 + Parquet 54 |
| Caching | Redis 0.24 (tokio-comp) |
| Templating | Tera 1.20 |
| Error handling | thiserror 2.0 + anyhow 1.0 |
| Logging | tracing + tracing-subscriber (env-filter) |
| Linker | mold (via `.cargo/config.toml`) |
| LLM provider | OpenRouter (OpenAI-compatible function calling) |

## Build & Test Commands

```bash
# Build (debug)
cargo build

# Build (release, optimized)
cargo build --release

# Run all tests
cargo test

# Run tests (release mode, parallel)
cargo test --release -- --test-threads=$(nproc)

# Lint
cargo clippy --all-targets --all-features -- -D warnings

# Format check
cargo fmt --all -- --check

# Format fix
cargo fmt --all

# Run doc tests
cargo test --doc

# Run the CLI
cargo run -- swe mine --help
cargo run -- swe harness --help
```

## Environment Variables

| Variable | Required | Description |
|----------|----------|-------------|
| `OPENROUTER_API_KEY` | Yes (runtime) | OpenRouter API key for LLM calls |
| `GITHUB_TOKEN` | Yes (runtime) | GitHub PAT for PR enrichment |
| `RUST_LOG` | No | Log level: `error`, `warn`, `info`, `debug`, `trace` |

## Git Hooks

Git hooks are in `.githooks/` and activated via `git config core.hooksPath .githooks`.

- **pre-commit**: Runs `cargo fmt --all -- --check` and `cargo clippy --all-targets --all-features -- -D warnings`
- **pre-push**: Runs format check + clippy + `cargo test --lib` + `cargo build`
- Both hooks can be skipped with `SKIP_GIT_HOOKS=1`

## CRITICAL RULES

1. **All errors must use typed error enums from `src/error.rs`** — Never use `unwrap()` or `expect()` in library code. Use `anyhow::Result` only in `main.rs` and CLI commands. Library modules must return typed errors via `thiserror` (`RegistryError`, `GeneratorError`, `LlmError`, `DockerError`, `ExportError`, `ValidationError`, `TemplateError`).

2. **All LLM interactions must use function calling (`tools` + `tool_choice`)** — Never parse free-form LLM text. Use the OpenAI-compatible `tools` array with `tool_choice: "required"` for structured JSON output. See `src/llm/litellm.rs` for the `ToolDefinition`, `ToolChoice`, and `ToolCallInfo` types, and the request sketch after this list.

3. **Never leak test plans into agent prompts** — The `prompt_rewriter.rs` module strips test-specific information from PR bodies before generating `prompt.md`. Any new prompt generation code must ensure `fail_to_pass` and `pass_to_pass` test commands are never visible to the agent being evaluated.

4. **Docker containers must have resource limits** — All container creation must use `apply_resource_limits()` from `src/docker/resources.rs`. Difficulty-based limits are enforced: memory (512MB–4GB), CPU (1–4 cores), timeouts (5–30 min). Never create containers without limits.

5. **Respect GitHub API rate limits (5000 req/h)** — The pipeline processes candidates in chunks of 30. Each candidate needs ~2 API calls for enrichment. Never add unbounded concurrent GitHub API calls. Use the existing concurrency limits (enrichment: 3x, pre-classification: 10x, deep processing: 3x).

6. **All async code must be `Send + Sync` compatible** — The codebase uses `Arc<dyn LlmProvider>` extensively. Trait objects must be `Send + Sync`. Never introduce `Rc`, `RefCell`, or non-Send types in async contexts.

7. **Serde rename conventions must be `snake_case`** — All serializable enums use `#[serde(rename_all = "snake_case")]`. Task status, difficulty levels, and all API-facing types must follow this convention for YAML/JSON compatibility.

8. **Anti-hardcoding mechanisms must be preserved** — The `anti_hardcoding/` module provides canary strings, sealed parameters, and process validation. Never bypass contamination detection. Any new task generation must embed canary strings via `CanaryConfig::generate()`.

9. **Use `tracing` for all logging, never `println!`** — All log output must use `tracing::{info, warn, debug, error, trace}` macros. The log level is controlled by `RUST_LOG` env var or `--log-level` CLI arg.

10. **Parquet/Arrow exports must preserve schema** — The `export/parquet_writer.rs` module defines the schema for dataset export. Never change field types or remove fields from the Parquet schema without updating `read_parquet` and `write_parquet` together.
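
Rule 2's request shape, as a minimal sketch. The JSON follows the public OpenAI function-calling schema that the crate's `ToolDefinition`/`ToolChoice` types wrap; the `classify_difficulty` tool and its parameters are hypothetical:

```rust
use serde_json::{json, Value};

/// Sketch of a function-calling request (rule 2). The `classify_difficulty`
/// tool is a hypothetical example, not one of the crate's real tools.
fn classification_request(pr_title: &str) -> Value {
    json!({
        "model": "openai/gpt-5.2-codex:nitro",
        "messages": [{ "role": "user", "content": format!("Classify this PR: {pr_title}") }],
        "tools": [{
            "type": "function",
            "function": {
                "name": "classify_difficulty",
                "description": "Classify the difficulty of a pull request",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "difficulty": { "type": "string", "enum": ["easy", "medium", "hard"] }
                    },
                    "required": ["difficulty"]
                }
            }
        }],
        // Force a structured tool call instead of free-form text.
        "tool_choice": "required"
    })
}
```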

## DO's

- Use `anyhow::Result` for CLI command handlers in `src/cli/commands.rs`
- Use typed `thiserror` errors for all library module boundaries
- Add `#[cfg(test)] mod tests` blocks in the same file for unit tests
- Use `tokio::spawn` for concurrent work, `futures::stream` for bounded concurrency
- Follow the existing pattern of `mod.rs` re-exporting public types
- Use `Arc<dyn LlmProvider>` for LLM provider abstraction
- Add doc comments (`///`) to all public types and functions
- Use `BTreeMap` (not `HashMap`) for deterministic serialization in `SweTask` (sketch below)
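
A minimal sketch of the `BTreeMap` pattern from the last item above; the fields are placeholders, not the real `SweTask` definition:

```rust
use serde::Serialize;
use std::collections::BTreeMap;

/// BTreeMap iterates in sorted key order, so serialized YAML/JSON is
/// byte-stable across runs; a HashMap would shuffle the keys.
#[derive(Serialize)]
struct SweTaskSketch {
    id: String,
    meta: BTreeMap<String, String>,
}
```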

## DON'Ts

- Don't use `unwrap()` or `expect()` in library code — use `?` operator
- Don't add new direct dependencies without checking if an existing dep covers the use case
- Don't use `println!` or `eprintln!` — use `tracing` macros
- Don't create Docker containers without resource limits
- Don't make unbounded concurrent API calls — always use semaphore or stream limits (a semaphore sketch follows this list)
- Don't store secrets (API keys, tokens) in code or config files
- Don't change the `workspace.yaml` schema without updating the harness parser
- Don't bypass the PR deduplication cache (`pr_cache.rs`) — it prevents reprocessing
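
The semaphore pattern for the unbounded-calls rule above; `fetch_pr` is a hypothetical request function:

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

async fn fetch_pr(_url: &str) {
    // The GitHub API call would go here.
}

/// Keep at most 3 requests in flight, matching the enrichment cap.
async fn fetch_all(urls: Vec<String>) {
    let permits = Arc::new(Semaphore::new(3));
    let mut handles = Vec::new();
    for url in urls {
        let permits = Arc::clone(&permits);
        handles.push(tokio::spawn(async move {
            // Waits here whenever 3 permits are already taken.
            let Ok(_permit) = permits.acquire_owned().await else { return };
            fetch_pr(&url).await;
        }));
    }
    for handle in handles {
        let _ = handle.await;
    }
}
```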
1 change: 1 addition & 0 deletions VERSION
@@ -0,0 +1 @@
0.1.0
31 changes: 31 additions & 0 deletions src/agents/AGENTS.md
@@ -0,0 +1,31 @@
# AGENTS.md — src/agents/

## Purpose

Task validation and Docker-based verification agents. These agents validate generated benchmark tasks for correctness, execute them in Docker containers, and score difficulty.

## Module Structure

| File | Responsibility |
|------|---------------|
| `mod.rs` | Re-exports |
| `docker_validator.rs` | `DockerValidatorAgent` — builds and runs tasks in Docker, validates output |
| `task_validator.rs` | `TaskValidatorAgent` — validates task ideas and assesses feasibility |
| `task_executor.rs` | `TaskExecutorAgent` — generates synthetic tasks with anti-memorization, difficulty scoring, verification specs |
| `error.rs` | `AgentError` enum, `AgentResult<T>` type alias |

## Key Types

- `DockerValidatorAgent` / `DockerValidatorConfig` / `DockerValidationResult`
- `TaskValidatorAgent` / `TaskValidatorConfig` / `ValidationAssessment` / `TaskIdea`
- `TaskExecutorAgent` / `TaskExecutorConfig` / `SyntheticTask` / `TaskMetadata`
- `AntiMemorizationConfig` — Config for anti-hardcoding in generated tasks
- `DifficultyScoring` — Difficulty assessment with scoring criteria
- `HiddenSolution` — Solution hidden from the agent during evaluation
- `VerificationSpec` — Specification for verifying task outputs

## Rules

- Docker validation must always use resource limits from `src/docker/resources.rs`
- Task executor must embed `AntiMemorizationConfig` canary strings
- All agent errors must use `AgentError` from `error.rs`
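
A sketch of the resource-limit rule using bollard types; the real limits come from `apply_resource_limits()` in `src/docker/resources.rs`, and the numbers below are just the documented "easy" tier (512 MB, 1 core):

```rust
use bollard::container::Config;
use bollard::models::HostConfig;

/// Never create a container without limits (see the first rule above).
fn limited_config(image: &str) -> Config<String> {
    Config {
        image: Some(image.to_string()),
        host_config: Some(HostConfig {
            memory: Some(512 * 1024 * 1024), // bytes
            nano_cpus: Some(1_000_000_000),  // one CPU core
            ..Default::default()
        }),
        ..Default::default()
    }
}
```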
36 changes: 36 additions & 0 deletions src/anti_hardcoding/AGENTS.md
@@ -0,0 +1,36 @@
# AGENTS.md — src/anti_hardcoding/

## Purpose

Anti-hardcoding mechanisms to ensure benchmark integrity. These mechanisms detect whether models have memorized benchmark content (contamination), prevent pre-computation of answers, and validate that agents follow a genuine problem-solving process.

## Module Structure

| File | Responsibility |
|------|---------------|
| `mod.rs` | `AntiHardcodingVerifier` — unified verifier combining all mechanisms; `VerificationResult` |
| `canary.rs` | `CanaryConfig` — unique identifiers embedded in tasks for contamination detection |
| `sealed.rs` | `SealedParameters` / `SealedData` — encrypted parameters revealed only at verification time |
| `process_validation.rs` | `ProcessTracer` / `ProcessValidationConfig` — validates command execution patterns |

## Key Types

- `AntiHardcodingVerifier` — Combines canary + process validation
- `CanaryConfig` — Generated via `CanaryConfig::generate(task_id, seed)`
- `ContaminationResult` — `contaminated`, `partial_match`, `confidence`
- `SealedParameters` / `SealedData` / `SealError`
- `ProcessTracer` / `ProcessValidationConfig` / `CommandExecution`
- `VerificationResult` — `valid`, `score`, `contamination`, `process_validation`, `issues`

## Scoring

- Confirmed contamination: 90% score penalty (`score *= 0.1`)
- Partial match: 30% penalty (`score *= 0.7`)
- High confidence (>0.5): up to 20% additional penalty
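
A sketch of how these penalties compose; the `ContaminationResult` fields match the key types above, and scaling the high-confidence penalty linearly is an assumption:

```rust
struct ContaminationResult {
    contaminated: bool,
    partial_match: bool,
    confidence: f64,
}

/// Apply the documented penalties to a raw score in [0, 1].
fn apply_contamination_penalty(mut score: f64, c: &ContaminationResult) -> f64 {
    if c.contaminated {
        score *= 0.1; // confirmed contamination: 90% penalty
    } else if c.partial_match {
        score *= 0.7; // partial match: 30% penalty
    }
    if c.confidence > 0.5 {
        // Up to 20% additional penalty, scaled by confidence (assumption).
        score *= 1.0 - 0.2 * c.confidence;
    }
    score
}
```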

## Rules

- Every generated task must embed a canary via `CanaryConfig::generate()`
- Never bypass contamination detection in the verification pipeline
- Process validation patterns use regex — test patterns before deploying
- `required_pattern` must match at least one recorded `CommandExecution`
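
A minimal sketch of the `required_pattern` check, assuming executions are recorded as plain command strings:

```rust
use regex::Regex;

/// True when `required_pattern` matches at least one recorded command.
/// The real `CommandExecution` type likely carries more than the string.
fn required_pattern_satisfied(required_pattern: &str, commands: &[String]) -> bool {
    match Regex::new(required_pattern) {
        // Test patterns before deploying: a malformed regex fails closed here.
        Ok(re) => commands.iter().any(|cmd| re.is_match(cmd)),
        Err(_) => false,
    }
}
```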
31 changes: 31 additions & 0 deletions src/cli/AGENTS.md
@@ -0,0 +1,31 @@
# AGENTS.md — src/cli/

## Purpose

Clap-based CLI interface. Defines all commands, argument parsing, and command dispatch.

## Module Structure

| File | Responsibility |
|------|---------------|
| `mod.rs` | Re-exports `parse_cli`, `run`, `run_with_cli` |
| `commands.rs` | `Cli` struct (Clap derive), `Commands` enum, all subcommand args, command handlers |

## Commands

| Command | Description |
|---------|-------------|
| `swe-forge generate` (alias: `gen`) | Generate SWE DataForge tasks from real GitHub PRs |
| `swe-forge evaluate` (alias: `eval`) | Evaluate generated tasks using an autonomous agent |
| `swe-forge swe mine` | Mine real PRs and export SWE-style tasks |
| `swe-forge swe harness` | Run evaluation harness on generated tasks |
| `swe-forge swe validate` | Validate generated SWE workspaces |
| `swe-forge swe export` | Export SWE workspaces to dataset format |

## Rules

- Use `anyhow::Result` for command handler return types
- Default model constant: `DEFAULT_MODEL = "openai/gpt-5.2-codex:nitro"`
- Default output dirs: `./generated-datasets` (generate), `./generated-swe` (swe mine)
- Global `--log-level` arg controls tracing filter
- API keys come from env vars or CLI args (env var takes precedence)
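
A sketch of the precedence rule above; `api_key_flag` stands in for the hypothetical Clap field:

```rust
/// The environment variable wins; the CLI flag is only a fallback.
fn resolve_openrouter_key(api_key_flag: Option<String>) -> Option<String> {
    std::env::var("OPENROUTER_API_KEY").ok().or(api_key_flag)
}
```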
27 changes: 17 additions & 10 deletions src/cli/commands.rs
@@ -737,12 +737,12 @@ async fn run_swe_mine_command(args: SweMineArgs) -> anyhow::Result<()> {
         None => None,
     };
 
-    let (effective_max_tasks, effective_difficulty_filter) = if let Some(ref dt) = difficulty_targets
-    {
-        (dt.total_tasks(), None)
-    } else {
-        (args.max_tasks, args.difficulty.clone())
-    };
+    let (effective_max_tasks, effective_difficulty_filter) =
+        if let Some(ref dt) = difficulty_targets {
+            (dt.total_tasks(), None)
+        } else {
+            (args.max_tasks, args.difficulty.clone())
+        };
 
     let hf_upload = match (&args.hf_repo, &args.hf_token) {
         (Some(repo), Some(token)) => Some(crate::export::HfUploadConfig {
@@ -819,7 +819,8 @@ async fn run_swe_mine_command(args: SweMineArgs) -> anyhow::Result<()> {
     );
 
     // Show per-difficulty breakdown
-    let mut per_level: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
+    let mut per_level: std::collections::HashMap<String, usize> =
+        std::collections::HashMap::new();
     for task in &result.tasks {
         if task.quality_passed {
             let level = task
@@ -868,10 +868,16 @@ async fn run_swe_load_command(args: SweLoadArgs) -> anyhow::Result<()> {
     }
 
     // Compute stats
-    let mut by_difficulty: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
-    let mut by_language: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
+    let mut by_difficulty: std::collections::HashMap<String, usize> =
+        std::collections::HashMap::new();
+    let mut by_language: std::collections::HashMap<String, usize> =
+        std::collections::HashMap::new();
     for task in &tasks {
-        let diff = task.meta.get("difficulty").cloned().unwrap_or_else(|| "unknown".to_string());
+        let diff = task
+            .meta
+            .get("difficulty")
+            .cloned()
+            .unwrap_or_else(|| "unknown".to_string());
         *by_difficulty.entry(diff).or_insert(0) += 1;
         *by_language.entry(task.language.clone()).or_insert(0) += 1;
     }