diff --git a/.githooks/pre-commit b/.githooks/pre-commit new file mode 100755 index 0000000..0d7e06c --- /dev/null +++ b/.githooks/pre-commit @@ -0,0 +1,16 @@ +#!/bin/bash +set -e + +# Skip hooks when SKIP_GIT_HOOKS=1 is set +if [ "${SKIP_GIT_HOOKS:-0}" = "1" ]; then + echo "=== pre-commit: skipped (SKIP_GIT_HOOKS=1) ===" + exit 0 +fi + +echo "=== pre-commit: checking formatting ===" +cargo fmt --all -- --check + +echo "=== pre-commit: running clippy ===" +cargo clippy --all-targets --all-features -- -D warnings + +echo "=== pre-commit: all checks passed ===" diff --git a/.githooks/pre-push b/.githooks/pre-push new file mode 100755 index 0000000..20be0a7 --- /dev/null +++ b/.githooks/pre-push @@ -0,0 +1,22 @@ +#!/bin/bash +set -e + +# Skip hooks when SKIP_GIT_HOOKS=1 is set +if [ "${SKIP_GIT_HOOKS:-0}" = "1" ]; then + echo "=== pre-push: skipped (SKIP_GIT_HOOKS=1) ===" + exit 0 +fi + +echo "=== pre-push: checking formatting ===" +cargo fmt --all -- --check + +echo "=== pre-push: running clippy ===" +cargo clippy --all-targets --all-features -- -D warnings + +echo "=== pre-push: running tests ===" +cargo test --lib + +echo "=== pre-push: building ===" +cargo build + +echo "=== pre-push: all checks passed ===" diff --git a/.releaserc.json b/.releaserc.json new file mode 100644 index 0000000..0486f9b --- /dev/null +++ b/.releaserc.json @@ -0,0 +1,21 @@ +{ + "branches": ["main"], + "plugins": [ + "@semantic-release/commit-analyzer", + "@semantic-release/release-notes-generator", + [ + "@semantic-release/exec", + { + "prepareCmd": "sed -i 's/^version = \".*\"/version = \"${nextRelease.version}\"/' Cargo.toml && echo '${nextRelease.version}' > VERSION" + } + ], + [ + "@semantic-release/git", + { + "assets": ["Cargo.toml", "VERSION"], + "message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}" + } + ], + "@semantic-release/github" + ] +} diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..9fe3885 --- /dev/null +++ b/AGENTS.md 
@@ -0,0 +1,169 @@ +# AGENTS.md — swe-forge + +## Project Purpose + +**swe-forge** is a high-performance SWE-bench dataset generator and evaluation harness written in Rust. It mines real GitHub pull requests from GH Archive, enriches them via the GitHub API, uses LLMs (via OpenRouter) to classify difficulty and generate test specifications through an agentic multi-turn loop, and exports SWE-bench-compatible task instances. It also includes a Docker-isolated evaluation harness that runs external coding agents on generated tasks and verifies their solutions. + +## Architecture Overview + +swe-forge is a single Rust binary crate (`src/main.rs`) with a library (`src/lib.rs`) organized into these modules: + +``` +src/ +├── main.rs # CLI entry point (tokio async runtime) +├── lib.rs # Public module declarations +├── cli/ # Clap-based CLI (commands: generate, evaluate, swe mine/harness/validate/export) +├── swe/ # Core mining pipeline (GH Archive → enrich → filter → classify → extract → test gen → export) +│ ├── gharchive.rs # GH Archive HTTP ingestion (gzip → JSON events) +│ ├── enricher.rs # GitHub API PR enrichment (title, body, diff, files) +│ ├── filters.rs # Pre-filter (bots, org repos, language, stars) +│ ├── extractor.rs # Git clone + diff patch extraction +│ ├── test_generator.rs # Agentic multi-turn LLM test generation (up to 200 turns) +│ ├── quality.rs # LLM-based quality scoring +│ ├── prompt_rewriter.rs # PR body → agent prompt (strip test plan leaks) +│ ├── harness.rs # Docker-isolated evaluation harness +│ ├── docker_sandbox.rs # Docker sandbox for test generation +│ ├── orchestrator.rs # End-to-end pipeline orchestrator +│ ├── pipeline.rs # Streaming pipeline with chunk processing +│ └── pr_cache.rs # JSONL-based PR deduplication cache +├── llm/ # LLM integration layer +│ ├── litellm.rs # OpenAI-compatible API client (function calling, tools) +│ ├── providers/ # Provider implementations (OpenRouter) +│ ├── router.rs # Multi-model routing (cost-optimized, 
round-robin) +│ ├── cache.rs # Prompt caching for multi-conversation efficiency +│ └── cost.rs # Usage tracking with daily/monthly budgets +├── agents/ # Task validation agents (Docker-based verification) +├── execution/ # Docker execution layer (bollard crate, container lifecycle) +├── docker/ # Dockerfile/docker-compose generation +├── export/ # Parquet dataset export + HuggingFace Hub upload +├── runner/ # Agent runner for benchmark evaluation +├── difficulty/ # Difficulty levels, resource limits, scoring +├── anti_hardcoding/ # Canary strings, sealed parameters, contamination detection +├── utils/ # JSON extraction from LLM responses +└── error.rs # Typed error hierarchy (thiserror) +``` + +### Data Flow + +``` +GH Archive (hourly dumps, 8x concurrent) + → Pre-filter (merged PRs, no bots, org repos) + → GitHub API enrichment (3x concurrent, rate-limited 5000/h) + → Local filter (language, stars, files changed) + → LLM pre-classification (10x concurrent, title+body only) + → Patch extraction (git clone + diff, 3x concurrent) + → Agentic test generation (Codex-style multi-turn, 3x concurrent) + → Quality scoring (LLM-based) + → Export (workspace.yaml + prompt.md + checks.txt) +``` + +## Tech Stack + +| Component | Technology | +|-----------|-----------| +| Language | Rust (edition 2021, nightly toolchain) | +| Async runtime | Tokio (full features) | +| CLI framework | Clap 4 (derive mode) | +| HTTP client | reqwest 0.13 (rustls) | +| Docker | bollard 0.16 (SSL) | +| Serialization | serde + serde_json + serde_yaml | +| Database | SQLx 0.7 (Postgres + SQLite, migrations) | +| Data export | Apache Arrow 54 + Parquet 54 | +| Caching | Redis 0.24 (tokio-comp) | +| Templating | Tera 1.20 | +| Error handling | thiserror 2.0 + anyhow 1.0 | +| Logging | tracing + tracing-subscriber (env-filter) | +| Linker | mold (via `.cargo/config.toml`) | +| LLM provider | OpenRouter (OpenAI-compatible function calling) | + +## Build & Test Commands + +```bash +# Build (debug) +cargo 
build + +# Build (release, optimized) +cargo build --release + +# Run all tests +cargo test + +# Run tests (release mode, parallel) +cargo test --release -- --test-threads=$(nproc) + +# Lint +cargo clippy --all-targets --all-features -- -D warnings + +# Format check +cargo fmt --all -- --check + +# Format fix +cargo fmt --all + +# Run doc tests +cargo test --doc + +# Run the CLI +cargo run -- swe mine --help +cargo run -- swe harness --help +``` + +## Environment Variables + +| Variable | Required | Description | +|----------|----------|-------------| +| `OPENROUTER_API_KEY` | Yes (runtime) | OpenRouter API key for LLM calls | +| `GITHUB_TOKEN` | Yes (runtime) | GitHub PAT for PR enrichment | +| `RUST_LOG` | No | Log level: `error`, `warn`, `info`, `debug`, `trace` | + +## Git Hooks + +Git hooks are in `.githooks/` and activated via `git config core.hooksPath .githooks`. + +- **pre-commit**: Runs `cargo fmt --all -- --check` and `cargo clippy --all-targets --all-features -- -D warnings` +- **pre-push**: Runs format check + clippy + `cargo test --lib` + `cargo build` +- Both hooks can be skipped with `SKIP_GIT_HOOKS=1` + +## CRITICAL RULES + +1. **All errors must use typed error enums from `src/error.rs`** — Never use `unwrap()` or `expect()` in library code. Use `anyhow::Result` only in `main.rs` and CLI commands. Library modules must return typed errors via `thiserror` (`RegistryError`, `GeneratorError`, `LlmError`, `DockerError`, `ExportError`, `ValidationError`, `TemplateError`). + +2. **All LLM interactions must use function calling (`tools` + `tool_choice`)** — Never parse free-form LLM text. Use OpenAI-compatible `tools` array with `tool_choice: "required"` for structured JSON output. See `src/llm/litellm.rs` for `ToolDefinition`, `ToolChoice`, and `ToolCallInfo` types. + +3. **Never leak test plans into agent prompts** — The `prompt_rewriter.rs` module strips test-specific information from PR bodies before generating `prompt.md`. 
Any new prompt generation code must ensure `fail_to_pass` and `pass_to_pass` test commands are never visible to the agent being evaluated. + +4. **Docker containers must have resource limits** — All container creation must use `apply_resource_limits()` from `src/docker/resources.rs`. Difficulty-based limits are enforced: memory (512MB–4GB), CPU (1–4 cores), timeouts (5–30 min). Never create containers without limits. + +5. **Respect GitHub API rate limits (5000 req/h)** — The pipeline processes candidates in chunks of 30. Each candidate needs ~2 API calls for enrichment. Never add unbounded concurrent GitHub API calls. Use the existing concurrency limits (enrichment: 3x, pre-classification: 10x, deep processing: 3x). + +6. **All async code must be `Send + Sync` compatible** — The codebase uses `Arc` extensively. Trait objects must be `Send + Sync`. Never introduce `Rc`, `RefCell`, or non-Send types in async contexts. + +7. **Serde rename conventions must be `snake_case`** — All serializable enums use `#[serde(rename_all = "snake_case")]`. Task status, difficulty levels, and all API-facing types must follow this convention for YAML/JSON compatibility. + +8. **Anti-hardcoding mechanisms must be preserved** — The `anti_hardcoding/` module provides canary strings, sealed parameters, and process validation. Never bypass contamination detection. Any new task generation must embed canary strings via `CanaryConfig::generate()`. + +9. **Use `tracing` for all logging, never `println!`** — All log output must use `tracing::{info, warn, debug, error, trace}` macros. The log level is controlled by `RUST_LOG` env var or `--log-level` CLI arg. + +10. **Parquet/Arrow exports must preserve schema** — The `export/parquet_writer.rs` module defines the schema for dataset export. Never change field types or remove fields from the Parquet schema without updating `read_parquet` and `write_parquet` together. 
+ +## DO's + +- Use `anyhow::Result` for CLI command handlers in `src/cli/commands.rs` +- Use typed `thiserror` errors for all library module boundaries +- Add `#[cfg(test)] mod tests` blocks in the same file for unit tests +- Use `tokio::spawn` for concurrent work, `futures::stream` for bounded concurrency +- Follow the existing pattern of `mod.rs` re-exporting public types +- Use `Arc` for LLM provider abstraction +- Add doc comments (`///`) to all public types and functions +- Use `BTreeMap` (not `HashMap`) for deterministic serialization in `SweTask` + +## DON'Ts + +- Don't use `unwrap()` or `expect()` in library code — use `?` operator +- Don't add new direct dependencies without checking if an existing dep covers the use case +- Don't use `println!` or `eprintln!` — use `tracing` macros +- Don't create Docker containers without resource limits +- Don't make unbounded concurrent API calls — always use semaphore or stream limits +- Don't store secrets (API keys, tokens) in code or config files +- Don't change the `workspace.yaml` schema without updating the harness parser +- Don't bypass the PR deduplication cache (`pr_cache.rs`) — it prevents reprocessing diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..6e8bf73 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.1.0 diff --git a/src/agents/AGENTS.md b/src/agents/AGENTS.md new file mode 100644 index 0000000..1640ef4 --- /dev/null +++ b/src/agents/AGENTS.md @@ -0,0 +1,31 @@ +# AGENTS.md — src/agents/ + +## Purpose + +Task validation and Docker-based verification agents. These agents validate generated benchmark tasks for correctness, execute them in Docker containers, and score difficulty. 
+ +## Module Structure + +| File | Responsibility | +|------|---------------| +| `mod.rs` | Re-exports | +| `docker_validator.rs` | `DockerValidatorAgent` — builds and runs tasks in Docker, validates output | +| `task_validator.rs` | `TaskValidatorAgent` — validates task ideas and assesses feasibility | +| `task_executor.rs` | `TaskExecutorAgent` — generates synthetic tasks with anti-memorization, difficulty scoring, verification specs | +| `error.rs` | `AgentError` enum, `AgentResult` type alias | + +## Key Types + +- `DockerValidatorAgent` / `DockerValidatorConfig` / `DockerValidationResult` +- `TaskValidatorAgent` / `TaskValidatorConfig` / `ValidationAssessment` / `TaskIdea` +- `TaskExecutorAgent` / `TaskExecutorConfig` / `SyntheticTask` / `TaskMetadata` +- `AntiMemorizationConfig` — Config for anti-hardcoding in generated tasks +- `DifficultyScoring` — Difficulty assessment with scoring criteria +- `HiddenSolution` — Solution hidden from the agent during evaluation +- `VerificationSpec` — Specification for verifying task outputs + +## Rules + +- Docker validation must always use resource limits from `src/docker/resources.rs` +- Task executor must embed `AntiMemorizationConfig` canary strings +- All agent errors must use `AgentError` from `error.rs` diff --git a/src/anti_hardcoding/AGENTS.md b/src/anti_hardcoding/AGENTS.md new file mode 100644 index 0000000..abbbe48 --- /dev/null +++ b/src/anti_hardcoding/AGENTS.md @@ -0,0 +1,36 @@ +# AGENTS.md — src/anti_hardcoding/ + +## Purpose + +Anti-hardcoding mechanisms to ensure benchmark integrity. Detects if models have memorized benchmarks (contamination), prevents pre-computation of answers, and validates that agents follow proper problem-solving processes. 
+ +## Module Structure + +| File | Responsibility | +|------|---------------| +| `mod.rs` | `AntiHardcodingVerifier` — unified verifier combining all mechanisms; `VerificationResult` | +| `canary.rs` | `CanaryConfig` — unique identifiers embedded in tasks for contamination detection | +| `sealed.rs` | `SealedParameters` / `SealedData` — encrypted parameters revealed only at verification time | +| `process_validation.rs` | `ProcessTracer` / `ProcessValidationConfig` — validates command execution patterns | + +## Key Types + +- `AntiHardcodingVerifier` — Combines canary + process validation +- `CanaryConfig` — Generated via `CanaryConfig::generate(task_id, seed)` +- `ContaminationResult` — `contaminated`, `partial_match`, `confidence` +- `SealedParameters` / `SealedData` / `SealError` +- `ProcessTracer` / `ProcessValidationConfig` / `CommandExecution` +- `VerificationResult` — `valid`, `score`, `contamination`, `process_validation`, `issues` + +## Scoring + +- Confirmed contamination: 90% score penalty (`score *= 0.1`) +- Partial match: 30% penalty (`score *= 0.7`) +- High confidence (>0.5): up to 20% additional penalty + +## Rules + +- Every generated task must embed a canary via `CanaryConfig::generate()` +- Never bypass contamination detection in the verification pipeline +- Process validation patterns use regex — test patterns before deploying +- `required_pattern` must match at least one recorded `CommandExecution` diff --git a/src/cli/AGENTS.md b/src/cli/AGENTS.md new file mode 100644 index 0000000..e509577 --- /dev/null +++ b/src/cli/AGENTS.md @@ -0,0 +1,31 @@ +# AGENTS.md — src/cli/ + +## Purpose + +Clap-based CLI interface. Defines all commands, argument parsing, and command dispatch. 
+ +## Module Structure + +| File | Responsibility | +|------|---------------| +| `mod.rs` | Re-exports `parse_cli`, `run`, `run_with_cli` | +| `commands.rs` | `Cli` struct (Clap derive), `Commands` enum, all subcommand args, command handlers | + +## Commands + +| Command | Description | +|---------|-------------| +| `swe-forge generate` (alias: `gen`) | Generate SWE DataForge tasks from real GitHub PRs | +| `swe-forge evaluate` (alias: `eval`) | Evaluate generated tasks using an autonomous agent | +| `swe-forge swe mine` | Mine real PRs and export SWE-style tasks | +| `swe-forge swe harness` | Run evaluation harness on generated tasks | +| `swe-forge swe validate` | Validate generated SWE workspaces | +| `swe-forge swe export` | Export SWE workspaces to dataset format | + +## Rules + +- Use `anyhow::Result` for command handler return types +- Default model constant: `DEFAULT_MODEL = "openai/gpt-5.2-codex:nitro"` +- Default output dirs: `./generated-datasets` (generate), `./generated-swe` (swe mine) +- Global `--log-level` arg controls tracing filter +- API keys come from env vars or CLI args (env var takes precedence) diff --git a/src/cli/commands.rs b/src/cli/commands.rs index 8320da7..22b3a16 100644 --- a/src/cli/commands.rs +++ b/src/cli/commands.rs @@ -737,12 +737,12 @@ async fn run_swe_mine_command(args: SweMineArgs) -> anyhow::Result<()> { None => None, }; - let (effective_max_tasks, effective_difficulty_filter) = if let Some(ref dt) = difficulty_targets - { - (dt.total_tasks(), None) - } else { - (args.max_tasks, args.difficulty.clone()) - }; + let (effective_max_tasks, effective_difficulty_filter) = + if let Some(ref dt) = difficulty_targets { + (dt.total_tasks(), None) + } else { + (args.max_tasks, args.difficulty.clone()) + }; let hf_upload = match (&args.hf_repo, &args.hf_token) { (Some(repo), Some(token)) => Some(crate::export::HfUploadConfig { @@ -819,7 +819,8 @@ async fn run_swe_mine_command(args: SweMineArgs) -> anyhow::Result<()> { ); // Show 
per-difficulty breakdown - let mut per_level: std::collections::HashMap = std::collections::HashMap::new(); + let mut per_level: std::collections::HashMap = + std::collections::HashMap::new(); for task in &result.tasks { if task.quality_passed { let level = task @@ -868,10 +869,16 @@ async fn run_swe_load_command(args: SweLoadArgs) -> anyhow::Result<()> { } // Compute stats - let mut by_difficulty: std::collections::HashMap = std::collections::HashMap::new(); - let mut by_language: std::collections::HashMap = std::collections::HashMap::new(); + let mut by_difficulty: std::collections::HashMap = + std::collections::HashMap::new(); + let mut by_language: std::collections::HashMap = + std::collections::HashMap::new(); for task in &tasks { - let diff = task.meta.get("difficulty").cloned().unwrap_or_else(|| "unknown".to_string()); + let diff = task + .meta + .get("difficulty") + .cloned() + .unwrap_or_else(|| "unknown".to_string()); *by_difficulty.entry(diff).or_insert(0) += 1; *by_language.entry(task.language.clone()).or_insert(0) += 1; } diff --git a/src/difficulty/AGENTS.md b/src/difficulty/AGENTS.md new file mode 100644 index 0000000..bd83657 --- /dev/null +++ b/src/difficulty/AGENTS.md @@ -0,0 +1,28 @@ +# AGENTS.md — src/difficulty/ + +## Purpose + +Difficulty classification system. Defines difficulty levels (Easy/Medium/Hard), resource limits per level, scoring calculations, and time/step expectations. 
## Module Structure + +| File | Responsibility | +|------|---------------| +| `mod.rs` | `DifficultyLevel` enum, score ranges, time ranges, command step ranges, resource limits | + +## Key Types + +- `DifficultyLevel` — `Easy`, `Medium`, `Hard` (serde: `snake_case`) + +## Difficulty Ranges + +| Level | Score | Time | Steps | Success Rate | +|-------|-------|------|-------|-------------| +| Easy | 0.0–0.33 | 3–6 min | 5–10 | 90% | +| Medium | 0.34–0.66 | 8–15 min | 10–25 | 70% | +| Hard | 0.67–1.0 | 15–60 min | 25–50 | 50% | + +## Rules + +- Always use `#[serde(rename_all = "snake_case")]` for `DifficultyLevel` (matches the project-wide serde convention; for these single-word variants it serializes identically to `lowercase`) +- Score ranges, time ranges, and step ranges are authoritative — don't change without updating all consumers diff --git a/src/docker/AGENTS.md b/src/docker/AGENTS.md new file mode 100644 index 0000000..866e022 --- /dev/null +++ b/src/docker/AGENTS.md @@ -0,0 +1,30 @@ +# AGENTS.md — src/docker/ + +## Purpose + +Docker environment generation — produces Dockerfiles, docker-compose.yaml, and container configurations for benchmark task execution. Separate from `src/execution/` which handles runtime container management. 
+ +## Module Structure + +| File | Responsibility | +|------|---------------| +| `mod.rs` | `DockerEnvironment` struct, re-exports | +| `dockerfile.rs` | `DockerfileBuilder` — generates Dockerfiles with base image selection (`python`, `node`, `rust`, `ubuntu`) | +| `compose.rs` | `ComposeBuilder` — generates docker-compose.yaml with optional database/cache/webserver services | +| `resources.rs` | `ResourceLimits`, `ContainerConfig`, `VolumeMount`, `NetworkMode` — security and resource config | + +## Key Types + +- `DockerEnvironment` — Complete Docker setup (Dockerfile + compose + container config) +- `DockerfileBuilder` / `DockerfileConfig` — Dockerfile generation +- `ComposeBuilder` / `ComposeConfig` / `ComposeService` — docker-compose generation +- `ResourceLimits` — Memory, CPU, PID limits per difficulty +- `ContainerConfig` — Name, image, limits, env vars, volumes, network mode +- Base images: `BASE_PYTHON`, `BASE_NODE`, `BASE_RUST`, `BASE_UBUNTU`, `BASE_MULTI_LANG` + +## Rules + +- Always use `apply_resource_limits(&difficulty)` when creating containers +- Network mode is difficulty-dependent (`network_mode_from_difficulty()`) +- Volumes must use `create_secure_volumes()` for isolation +- Base image selection via `select_base_image()` based on language diff --git a/src/execution/AGENTS.md b/src/execution/AGENTS.md new file mode 100644 index 0000000..bdadf6d --- /dev/null +++ b/src/execution/AGENTS.md @@ -0,0 +1,29 @@ +# AGENTS.md — src/execution/ + +## Purpose + +Docker execution layer using the `bollard` crate. Manages container lifecycle (create → start → exec → cleanup), resource limits, and task execution isolation. 
+ +## Module Structure + +| File | Responsibility | +|------|---------------| +| `mod.rs` | Re-exports, lifecycle documentation | +| `container.rs` | `Container` struct with state machine (`PENDING → CREATING → RUNNING → COMPLETED/FAILED/TIMEOUT → CLEANUP`) | +| `docker_client.rs` | `DockerClient` wrapper around `bollard::Docker` | +| `resources.rs` | `ExecutionLimits` — difficulty-based resource limits | + +## Key Types + +- `Container` — Stateful container with `start()`, `exec()`, `cleanup()` methods +- `ContainerStatus` — State enum tracking container lifecycle +- `ExecResult` — stdout, stderr, exit code from container exec +- `DockerClient` — Thin wrapper for Docker API operations +- `ExecutionLimits` — Memory, CPU, timeout, network limits per difficulty + +## Rules + +- Container states follow: `PENDING → CREATING → RUNNING → COMPLETED/FAILED/TIMEOUT → CLEANUP` +- Always call `cleanup()` after use — containers must not leak +- Use `get_execution_limits()` to get difficulty-appropriate limits +- All container operations are async (bollard is tokio-based) diff --git a/src/export/AGENTS.md b/src/export/AGENTS.md new file mode 100644 index 0000000..6d5e9ef --- /dev/null +++ b/src/export/AGENTS.md @@ -0,0 +1,26 @@ +# AGENTS.md — src/export/ + +## Purpose + +Dataset export module. Writes SWE task data to Apache Parquet format and uploads to HuggingFace Hub. 
+ +## Module Structure + +| File | Responsibility | +|------|---------------| +| `mod.rs` | Re-exports | +| `dataset.rs` | `DatasetManager` — load, download, and manage datasets; `DatasetConfig`, `DatasetSummary` | +| `parquet_writer.rs` | `write_parquet()`, `read_parquet()`, `write_parquet_bytes()` — Arrow/Parquet serialization | +| `hf_uploader.rs` | `HfUploader` — HuggingFace Hub API upload with `HfUploadConfig` | + +## Key Types + +- `DatasetManager` / `DatasetConfig` / `DatasetSummary` +- `HfUploader` / `HfUploadConfig` +- `write_parquet(tasks, path)` / `read_parquet(path)` — core I/O functions + +## Rules + +- Never change Parquet schema fields without updating both `write_parquet` and `read_parquet` +- HuggingFace upload requires `HF_TOKEN` environment variable +- Parquet uses snappy + zstd compression (configured in `Cargo.toml` features) diff --git a/src/export/dataset.rs b/src/export/dataset.rs index 5398f92..4a00c68 100644 --- a/src/export/dataset.rs +++ b/src/export/dataset.rs @@ -156,14 +156,21 @@ impl DatasetManager { let mut all_tasks = Vec::new(); let mut entries: Vec<_> = std::fs::read_dir(&data_dir)? 
.filter_map(|e| e.ok()) - .filter(|e| e.path().extension().map(|x| x == "parquet").unwrap_or(false)) + .filter(|e| { + e.path() + .extension() + .map(|x| x == "parquet") + .unwrap_or(false) + }) .collect(); entries.sort_by_key(|e| e.file_name()); for entry in &entries { match parquet_writer::read_parquet(&entry.path()) { Ok(tasks) => all_tasks.extend(tasks), - Err(e) => tracing::warn!(path = %entry.path().display(), error = %e, "Failed to read shard"), + Err(e) => { + tracing::warn!(path = %entry.path().display(), error = %e, "Failed to read shard") + } } } @@ -188,9 +195,14 @@ impl DatasetManager { // Upload combined + splits to HF if let Some(ref uploader) = self.uploader { - let combined_bytes = std::fs::read(self.config.output_dir.join("train.parquet"))?; + let combined_bytes = + std::fs::read(self.config.output_dir.join("train.parquet"))?; let _ = uploader - .upload_file("train.parquet", &combined_bytes, "Add combined train.parquet") + .upload_file( + "train.parquet", + &combined_bytes, + "Add combined train.parquet", + ) .await; for (diff, _) in &by_diff { @@ -373,7 +385,12 @@ pub fn load_dataset(path: &Path) -> anyhow::Result> { let mut all_tasks = Vec::new(); let mut entries: Vec<_> = std::fs::read_dir(path)? .filter_map(|e| e.ok()) - .filter(|e| e.path().extension().map(|x| x == "parquet").unwrap_or(false)) + .filter(|e| { + e.path() + .extension() + .map(|x| x == "parquet") + .unwrap_or(false) + }) .collect(); entries.sort_by_key(|e| e.file_name()); @@ -384,7 +401,10 @@ pub fn load_dataset(path: &Path) -> anyhow::Result> { return Ok(all_tasks); } - anyhow::bail!("Path is neither a parquet file nor a directory: {}", path.display()); + anyhow::bail!( + "Path is neither a parquet file nor a directory: {}", + path.display() + ); } /// Download a dataset from HuggingFace and return the tasks. 
@@ -406,10 +426,7 @@ pub async fn download_dataset( tracing::info!(repo = repo_id, file = %filename, "Downloading dataset from HuggingFace"); let client = reqwest::Client::new(); - let resp = client - .get(&url) - .send() - .await?; + let resp = client.get(&url).send().await?; if !resp.status().is_success() { anyhow::bail!( diff --git a/src/export/hf_uploader.rs b/src/export/hf_uploader.rs index ec98c91..7770329 100644 --- a/src/export/hf_uploader.rs +++ b/src/export/hf_uploader.rs @@ -110,10 +110,7 @@ impl HfUploader { HF_API_BASE, self.config.repo_id ); - let encoded = base64::Engine::encode( - &base64::engine::general_purpose::STANDARD, - content, - ); + let encoded = base64::Engine::encode(&base64::engine::general_purpose::STANDARD, content); let body = CommitRequest { summary: commit_message.to_string(), @@ -140,7 +137,10 @@ impl HfUploader { repo = %self.config.repo_id, "Uploaded file to HF" ); - self.uploaded_files.lock().await.push(path_in_repo.to_string()); + self.uploaded_files + .lock() + .await + .push(path_in_repo.to_string()); Ok(()) } else { let status = resp.status(); @@ -157,7 +157,8 @@ impl HfUploader { commit_message: &str, ) -> anyhow::Result<()> { let content = std::fs::read(local_path)?; - self.upload_file(path_in_repo, &content, commit_message).await + self.upload_file(path_in_repo, &content, commit_message) + .await } /// Upload multiple files in a single commit (more efficient). 
@@ -178,10 +179,8 @@ impl HfUploader { let actions: Vec = files .iter() .map(|(path, content)| { - let encoded = base64::Engine::encode( - &base64::engine::general_purpose::STANDARD, - content, - ); + let encoded = + base64::Engine::encode(&base64::engine::general_purpose::STANDARD, content); CommitAction { action: "file".to_string(), path: path.to_string(), diff --git a/src/export/parquet_writer.rs b/src/export/parquet_writer.rs index 7274136..ec837c9 100644 --- a/src/export/parquet_writer.rs +++ b/src/export/parquet_writer.rs @@ -80,11 +80,7 @@ pub fn tasks_to_record_batch(tasks: &[SweTask]) -> anyhow::Result { created_at.append_value(task.created_at.to_rfc3339()); - let ver = task - .meta - .get("version") - .cloned() - .unwrap_or_default(); + let ver = task.meta.get("version").cloned().unwrap_or_default(); if ver.is_empty() { version.append_null(); } else { @@ -109,15 +105,15 @@ pub fn tasks_to_record_batch(tasks: &[SweTask]) -> anyhow::Result { language.append_value(&task.language); - let diff_label = task - .meta - .get("difficulty") - .cloned() - .unwrap_or_else(|| match task.difficulty_score { - 0..=1 => "easy".to_string(), - 2 => "medium".to_string(), - _ => "hard".to_string(), - }); + let diff_label = + task.meta + .get("difficulty") + .cloned() + .unwrap_or_else(|| match task.difficulty_score { + 0..=1 => "easy".to_string(), + 2 => "medium".to_string(), + _ => "hard".to_string(), + }); difficulty.append_value(&diff_label); difficulty_score.append_value(task.difficulty_score); @@ -219,9 +215,17 @@ pub fn read_parquet(input_path: &Path) -> anyhow::Result> { batch .column_by_name(name) .and_then(|col| col.as_any().downcast_ref::()) - .map(|arr| (0..num_rows).map(|i| { - if arr.is_null(i) { None } else { Some(arr.value(i).to_string()) } - }).collect()) + .map(|arr| { + (0..num_rows) + .map(|i| { + if arr.is_null(i) { + None + } else { + Some(arr.value(i).to_string()) + } + }) + .collect() + }) .unwrap_or_else(|| vec![None; num_rows]) }; @@ -241,15 +245,27 @@ 
pub fn read_parquet(input_path: &Path) -> anyhow::Result> { let difficulty_scores: Vec = batch .column_by_name("difficulty_score") .and_then(|col| col.as_any().downcast_ref::()) - .map(|arr| (0..num_rows).map(|i| if arr.is_null(i) { 1 } else { arr.value(i) }).collect()) + .map(|arr| { + (0..num_rows) + .map(|i| if arr.is_null(i) { 1 } else { arr.value(i) }) + .collect() + }) .unwrap_or_else(|| vec![1; num_rows]); let quality_scores: Vec> = batch .column_by_name("quality_score") .and_then(|col| col.as_any().downcast_ref::()) - .map(|arr| (0..num_rows).map(|i| { - if arr.is_null(i) { None } else { Some(arr.value(i)) } - }).collect()) + .map(|arr| { + (0..num_rows) + .map(|i| { + if arr.is_null(i) { + None + } else { + Some(arr.value(i)) + } + }) + .collect() + }) .unwrap_or_else(|| vec![None; num_rows]); for i in 0..num_rows { @@ -259,8 +275,12 @@ pub fn read_parquet(input_path: &Path) -> anyhow::Result> { continue; } - let f2p_str = fail_to_passes[i].clone().unwrap_or_else(|| "[]".to_string()); - let p2p_str = pass_to_passes[i].clone().unwrap_or_else(|| "[]".to_string()); + let f2p_str = fail_to_passes[i] + .clone() + .unwrap_or_else(|| "[]".to_string()); + let p2p_str = pass_to_passes[i] + .clone() + .unwrap_or_else(|| "[]".to_string()); let fail_to_pass: Vec = serde_json::from_str(&f2p_str).unwrap_or_default(); let pass_to_pass: Vec = serde_json::from_str(&p2p_str).unwrap_or_default(); @@ -276,7 +296,9 @@ pub fn read_parquet(input_path: &Path) -> anyhow::Result> { task.test_patch = test_patches[i].clone().unwrap_or_default(); task.prompt = problem_statements[i].clone().unwrap_or_default(); task.original_pr_body = hints[i].clone().unwrap_or_default(); - task.language = languages[i].clone().unwrap_or_else(|| "unknown".to_string()); + task.language = languages[i] + .clone() + .unwrap_or_else(|| "unknown".to_string()); task.difficulty_score = difficulty_scores[i]; task.quality_score = quality_scores[i]; task.quality_passed = true; @@ -318,7 +340,8 @@ mod tests { 
task.quality_passed = true; task.fail_to_pass = vec!["pytest tests/test_x.py::test_fix".to_string()]; task.pass_to_pass = vec!["pytest tests/test_x.py::test_other".to_string()]; - task.meta.insert("difficulty".to_string(), "medium".to_string()); + task.meta + .insert("difficulty".to_string(), "medium".to_string()); task } diff --git a/src/llm/AGENTS.md b/src/llm/AGENTS.md new file mode 100644 index 0000000..d1c456b --- /dev/null +++ b/src/llm/AGENTS.md @@ -0,0 +1,38 @@ +# AGENTS.md — src/llm/ + +## Purpose + +LLM integration layer providing an OpenAI-compatible API client with function calling, multi-model routing, prompt caching, and cost tracking. All LLM interactions in swe-forge go through this module. + +## Module Structure + +| File | Responsibility | +|------|---------------| +| `mod.rs` | Re-exports, module docs, usage examples | +| `litellm.rs` | Core API client (`LiteLlmClient`), request/response types, `LlmProvider` trait | +| `providers/openrouter.rs` | OpenRouter provider implementation | +| `router.rs` | `MultiModelRouter` with strategies: `CostOptimized`, `RoundRobin`, `CapabilityBased` | +| `cache.rs` | `PromptCache` for multi-conversation prompt caching (content hashing) | +| `cost.rs` | `CostTracker` with daily/monthly budgets, usage recording | + +## Key Types + +- `LlmProvider` (trait) — `async fn generate(&self, request: GenerationRequest) -> Result` +- `LiteLlmClient` — Direct OpenAI-compatible HTTP client +- `OpenRouterProvider` — OpenRouter-specific provider +- `GenerationRequest` — Messages + model + tools + tool_choice + temperature +- `GenerationResponse` — Choices with `ToolCallInfo` for function calling +- `ToolDefinition` — JSON Schema function definition for `tools` array +- `ToolChoice` — `Auto`, `None`, `Required`, `Named(String)` +- `Message` — `system`, `user`, `assistant`, `tool` roles +- `MultiModelRouter` — Routes requests across providers by strategy +- `PromptCache` / `SharedPromptCache` — Thread-safe prompt caching (`Arc>`) 
+- `CostTracker` — Atomic cost tracking with budget enforcement + +## Rules + +- Always use `tools` + `tool_choice: "required"` for structured output — never parse free-form text +- Provider trait objects must be `Send + Sync` (used as `Arc`) +- Default model: `openai/gpt-5.2-codex:nitro` (set in `src/cli/commands.rs`) +- Cost tracking is optional but should be used when available +- Cache keys are content hashes (`sha2`) — not message indices diff --git a/src/runner/AGENTS.md b/src/runner/AGENTS.md new file mode 100644 index 0000000..d3074c0 --- /dev/null +++ b/src/runner/AGENTS.md @@ -0,0 +1,39 @@ +# AGENTS.md — src/runner/ + +## Purpose + +Agent runner infrastructure for benchmark evaluation. Spawns external AI agents against benchmark tasks in isolated sandboxes, captures outputs, and verifies results. + +## Module Structure + +| File | Responsibility | +|------|---------------| +| `mod.rs` | Re-exports, architecture docs | +| `config.rs` | `RunConfig` — task path, agent type, timeout, environment | +| `executor.rs` | `AgentRunner` — spawns agent process, captures output, records metadata | +| `result.rs` | `RunResult`, `RunStatus`, `ExecutionTrace`, `TokenUsage` | +| `sandbox.rs` | `Sandbox` / `SandboxConfig` — isolated execution environment | +| `verifier.rs` | `Verifier` — loads `task.yaml`, runs checks, produces `VerificationResult` with scores | +| `agents/baseagent.rs` | Base agent adapter implementation | +| `agents/generic.rs` | Generic agent adapter for external commands | +| `agents/mod.rs` | `AgentAdapter` trait, `AgentType` enum | + +## Key Types + +- `AgentRunner` / `RunConfig` — Run an agent against a task +- `RunResult` / `RunStatus` — Execution result with status and traces +- `Sandbox` / `SandboxConfig` / `SandboxError` — Isolated environment +- `Verifier` / `VerificationResult` / `CheckResult` — Output verification +- `AgentAdapter` (trait) / `AgentType` — Agent abstraction + +## Data Flow + +``` +Task (prompt.md) → AgentRunner → Agent Process 
→ Output Directory → Verifier +``` + +## Rules + +- Agent timeout is configurable (default 600s) — always enforce it +- Sandbox must isolate agent from host filesystem +- Verifier loads checks from `task.yaml` — schema must match diff --git a/src/swe/AGENTS.md b/src/swe/AGENTS.md new file mode 100644 index 0000000..2b554f0 --- /dev/null +++ b/src/swe/AGENTS.md @@ -0,0 +1,54 @@ +# AGENTS.md — src/swe/ + +## Purpose + +Core SWE mining pipeline. Fetches merged pull requests from GH Archive, enriches them via GitHub API, classifies difficulty with LLMs, extracts patches via git clone, generates test specifications through an agentic multi-turn loop, scores quality, and exports SWE-bench-compatible task instances. + +## Module Structure + +| File | Responsibility | +|------|---------------| +| `mod.rs` | `SweTask` struct, `SweTaskStatus` enum, re-exports | +| `gharchive.rs` | HTTP client for GH Archive hourly event dumps (gzip → JSON) | +| `enricher.rs` | GitHub API enrichment (PR metadata, diff, files, 3x concurrent) | +| `filters.rs` | Pre-filter (merged PRs, no bots, org repos, language, stars) | +| `extractor.rs` | Git clone + `git diff` patch extraction | +| `test_generator.rs` | Agentic multi-turn LLM test generation (up to 200 turns, `shell` + `submit_tests` tools) | +| `quality.rs` | LLM-based quality scoring and difficulty classification | +| `prompt_rewriter.rs` | Strips test plan leaks from PR body → `prompt.md` | +| `harness.rs` | Docker-isolated evaluation harness (sanity check → agent run → verify) | +| `docker_sandbox.rs` | Docker sandbox for test generation phase | +| `orchestrator.rs` | End-to-end pipeline orchestrator with `DifficultyTargets` | +| `pipeline.rs` | Streaming pipeline with chunk processing (batches of 30) | +| `pr_cache.rs` | JSONL-based PR deduplication cache | + +## Key Types + +- `SweTask` — Central task struct with patch, tests, metadata, quality score +- `SweTaskStatus` — `Candidate → Rejected | Ready → Exported → Validated` +- 
`GhArchiveClient` / `GhArchiveEvent` — GH Archive ingestion +- `EnrichedPullRequest` — GitHub API enriched PR data +- `ExtractedPatch` / `PatchExtractor` — Git diff extraction +- `TestGenerator` / `TestFile` — Agentic test generation +- `QualityScorer` / `QualityAssessment` — LLM quality gate +- `HarnessConfig` / `HarnessResult` / `HarnessSummary` — Evaluation harness +- `SwePipeline` / `SwePipelineEvent` — Streaming pipeline +- `SweOrchestrator` / `SweOrchestratorConfig` — Orchestrator + +## Concurrency Limits + +| Stage | Concurrency | Rate Limit | +|-------|-------------|------------| +| GH Archive fetch | 8 | None | +| GitHub enrichment | 3 | 5000 req/h | +| LLM pre-classification | 10 | OpenRouter | +| Patch extraction | 3 | None | +| Test generation | 3 | OpenRouter | + +## Rules + +- Never leak `fail_to_pass` / `pass_to_pass` into `prompt.md` — use `prompt_rewriter.rs` +- Always check `pr_cache` before processing a PR to avoid duplicates +- Process candidates in chunks of 30 to respect GitHub rate limits +- All LLM calls must use function calling (`tools` + `tool_choice: "required"`) +- Harness statuses: `resolved`, `unresolved`, `agent_error`, `test_error`, `setup_error`, `sanity_fail` diff --git a/src/swe/docker_sandbox.rs b/src/swe/docker_sandbox.rs index 3998d3c..4541969 100644 --- a/src/swe/docker_sandbox.rs +++ b/src/swe/docker_sandbox.rs @@ -152,7 +152,15 @@ impl DockerSandbox { let result = tokio::time::timeout( std::time::Duration::from_millis(timeout_ms), Command::new("docker") - .args(["exec", "-w", "/repo", &self.container_name, "bash", "-c", cmd]) + .args([ + "exec", + "-w", + "/repo", + &self.container_name, + "bash", + "-c", + cmd, + ]) .stdout(Stdio::piped()) .stderr(Stdio::piped()) .output(), diff --git a/src/swe/enricher.rs b/src/swe/enricher.rs index 3f2b363..c7fe9bc 100644 --- a/src/swe/enricher.rs +++ b/src/swe/enricher.rs @@ -296,14 +296,8 @@ async fn fetch_pr_files_info( if let Some(path) = file.get("filename").and_then(Value::as_str) 
{ info.file_paths.push(path.to_string()); } - info.added_lines += file - .get("additions") - .and_then(Value::as_u64) - .unwrap_or(0) as usize; - info.removed_lines += file - .get("deletions") - .and_then(Value::as_u64) - .unwrap_or(0) as usize; + info.added_lines += file.get("additions").and_then(Value::as_u64).unwrap_or(0) as usize; + info.removed_lines += file.get("deletions").and_then(Value::as_u64).unwrap_or(0) as usize; } Ok(info) } diff --git a/src/swe/extractor.rs b/src/swe/extractor.rs index 17ac86b..1548c06 100644 --- a/src/swe/extractor.rs +++ b/src/swe/extractor.rs @@ -208,13 +208,19 @@ impl PatchExtractor { (Some(base), Some(merge)) if !base.is_empty() && !merge.is_empty() => { // Fetch the merge commit (shallow clone may not have it) sandbox - .exec(&format!("git fetch origin {} --depth=1 2>&1", merge), 60_000) + .exec( + &format!("git fetch origin {} --depth=1 2>&1", merge), + 60_000, + ) .await; format!("{base}..{merge}") } (_, Some(merge)) if !merge.is_empty() => { sandbox - .exec(&format!("git fetch origin {} --depth=1 2>&1", merge), 60_000) + .exec( + &format!("git fetch origin {} --depth=1 2>&1", merge), + 60_000, + ) .await; merge.to_string() } @@ -231,10 +237,7 @@ impl PatchExtractor { sandbox.destroy().await; if result.exit_code != 0 { - anyhow::bail!( - "git show failed in Docker: {}", - &result.stderr - ); + anyhow::bail!("git show failed in Docker: {}", &result.stderr); } Ok(result.stdout) diff --git a/src/swe/orchestrator.rs b/src/swe/orchestrator.rs index 61d8ca7..3ae2185 100644 --- a/src/swe/orchestrator.rs +++ b/src/swe/orchestrator.rs @@ -44,15 +44,24 @@ impl DifficultyTargets { })?; let level = level.trim().to_lowercase(); if !matches!(level.as_str(), "easy" | "medium" | "hard") { - anyhow::bail!("Unknown difficulty level '{}'. Use easy, medium, or hard.", level); + anyhow::bail!( + "Unknown difficulty level '{}'. 
Use easy, medium, or hard.", + level + ); } let count: usize = count.trim().parse().map_err(|_| { - anyhow::anyhow!("Invalid count '{}' for difficulty '{}'", count.trim(), level) + anyhow::anyhow!( + "Invalid count '{}' for difficulty '{}'", + count.trim(), + level + ) })?; targets.insert(level, count); } if targets.is_empty() { - anyhow::bail!("No valid difficulty targets found. Use format: easy:50,medium:50,hard:50"); + anyhow::bail!( + "No valid difficulty targets found. Use format: easy:50,medium:50,hard:50" + ); } Ok(Self { targets }) } @@ -120,19 +129,20 @@ impl SweOrchestrator { pub async fn mine(&self) -> anyhow::Result { let is_multi = self.config.difficulty_targets.is_some(); - let (max_tasks, candidate_multiplier) = if let Some(ref targets) = self.config.difficulty_targets { - let total = targets.total_tasks(); - let has_hard = targets.targets.contains_key("hard"); - let mult = if has_hard { 200 } else { 100 }; - tracing::info!(?targets, total, "Starting multi-difficulty mining"); - (total, mult) - } else if self.config.difficulty_filter.as_deref() == Some("hard") { - (self.config.max_tasks, 200) - } else if self.config.difficulty_filter.is_some() { - (self.config.max_tasks, 100) - } else { - (self.config.max_tasks, 50) - }; + let (max_tasks, candidate_multiplier) = + if let Some(ref targets) = self.config.difficulty_targets { + let total = targets.total_tasks(); + let has_hard = targets.targets.contains_key("hard"); + let mult = if has_hard { 200 } else { 100 }; + tracing::info!(?targets, total, "Starting multi-difficulty mining"); + (total, mult) + } else if self.config.difficulty_filter.as_deref() == Some("hard") { + (self.config.max_tasks, 200) + } else if self.config.difficulty_filter.is_some() { + (self.config.max_tasks, 100) + } else { + (self.config.max_tasks, 50) + }; let pipeline_config = SwePipelineConfig { min_stars: self.config.min_stars, @@ -142,7 +152,11 @@ impl SweOrchestrator { once: self.config.once, validate_docker: 
self.config.validate_docker, skip_prs: self.config.skip_prs.clone(), - difficulty_filter: if is_multi { None } else { self.config.difficulty_filter.clone() }, + difficulty_filter: if is_multi { + None + } else { + self.config.difficulty_filter.clone() + }, difficulty_targets: self.config.difficulty_targets.clone(), cache: self.config.cache.clone(), mining_image: self.config.mining_image.clone(), @@ -172,7 +186,12 @@ impl SweOrchestrator { let pipeline = crate::swe::pipeline::SwePipeline::new(&pipeline_config, self.llm.clone())?; let run: SwePipelineRunResult = pipeline - .run_full(&pipeline_config, None, Some(export_config), dataset_handle.clone()) + .run_full( + &pipeline_config, + None, + Some(export_config), + dataset_handle.clone(), + ) .await?; // Finalize dataset: flush remaining shard, write combined parquet, upload splits @@ -199,7 +218,11 @@ impl SweOrchestrator { let mut per_level: HashMap = HashMap::new(); for task in &tasks { if task.quality_passed { - let level = task.meta.get("difficulty").cloned().unwrap_or_else(|| "unknown".to_string()); + let level = task + .meta + .get("difficulty") + .cloned() + .unwrap_or_else(|| "unknown".to_string()); *per_level.entry(level).or_insert(0) += 1; } } @@ -218,5 +241,3 @@ impl SweOrchestrator { }) } } - - diff --git a/src/swe/pipeline.rs b/src/swe/pipeline.rs index 7c5f602..9deb2a0 100644 --- a/src/swe/pipeline.rs +++ b/src/swe/pipeline.rs @@ -710,9 +710,10 @@ impl SwePipeline { // Check completion: multi-target mode or single mode if let Some(ref targets) = difficulty_targets { let counts = per_difficulty_completed.lock().await; - let all_met = targets.targets.iter().all(|(level, "a)| { - counts.get(level).copied().unwrap_or(0) >= quota - }); + let all_met = targets + .targets + .iter() + .all(|(level, "a)| counts.get(level).copied().unwrap_or(0) >= quota); if all_met && once { tracing::info!("All difficulty targets met, stopping pool"); break; diff --git a/src/swe/pr_cache.rs b/src/swe/pr_cache.rs index 
52884d1..a3e8c60 100644 --- a/src/swe/pr_cache.rs +++ b/src/swe/pr_cache.rs @@ -108,15 +108,13 @@ impl PrCache { } pub async fn get(&self, repo: &str, pr: u64) -> Option { - let row = sqlx::query( - "SELECT * FROM pr_cache WHERE repo = ?1 AND pr_number = ?2", - ) - .bind(repo) - .bind(pr as i64) - .fetch_optional(&self.pool) - .await - .ok() - .flatten()?; + let row = sqlx::query("SELECT * FROM pr_cache WHERE repo = ?1 AND pr_number = ?2") + .bind(repo) + .bind(pr as i64) + .fetch_optional(&self.pool) + .await + .ok() + .flatten()?; Some(PrCacheEntry { repo: row.get("repo"), @@ -129,12 +127,16 @@ impl PrCache { stars: row.get::, _>("stars").map(|v| v as u32), base_sha: row.get("base_sha"), merge_sha: row.get("merge_sha"), - files_changed: row.get::, _>("files_changed").map(|v| v as usize), + files_changed: row + .get::, _>("files_changed") + .map(|v| v as usize), has_org: row.get::, _>("has_org").map(|v| v != 0), triage_difficulty: row.get("triage_difficulty"), patch: row.get("patch"), test_patch: row.get("test_patch"), - difficulty_score: row.get::, _>("difficulty_score").map(|v| v as u8), + difficulty_score: row + .get::, _>("difficulty_score") + .map(|v| v as u8), quality_score: row.get("quality_score"), quality_passed: row.get::, _>("quality_passed").map(|v| v != 0), status: row.get("status"), @@ -199,15 +201,13 @@ impl PrCache { /// Returns true if this PR should be skipped (already exported or rejected). 
pub async fn should_skip(&self, repo: &str, pr: u64) -> bool { - let row = sqlx::query( - "SELECT status FROM pr_cache WHERE repo = ?1 AND pr_number = ?2", - ) - .bind(repo) - .bind(pr as i64) - .fetch_optional(&self.pool) - .await - .ok() - .flatten(); + let row = sqlx::query("SELECT status FROM pr_cache WHERE repo = ?1 AND pr_number = ?2") + .bind(repo) + .bind(pr as i64) + .fetch_optional(&self.pool) + .await + .ok() + .flatten(); match row { Some(r) => { @@ -417,7 +417,10 @@ mod tests { ..Default::default() }; cache.upsert(&entry).await.unwrap(); - cache.mark_rejected("owner/repo", 2, "too easy").await.unwrap(); + cache + .mark_rejected("owner/repo", 2, "too easy") + .await + .unwrap(); assert!(cache.should_skip("owner/repo", 2).await); } diff --git a/src/swe/test_generator.rs b/src/swe/test_generator.rs index 71fb86d..abf940b 100644 --- a/src/swe/test_generator.rs +++ b/src/swe/test_generator.rs @@ -344,12 +344,7 @@ impl TestGenerator { // --- Dual-commit validation: apply patch, re-run tests --- let patch_validation = self - .validate_on_pr_commit( - sandbox, - &task.patch, - &submit, - &all_files, - ) + .validate_on_pr_commit(sandbox, &task.patch, &submit, &all_files) .await; match patch_validation { @@ -399,8 +394,10 @@ impl TestGenerator { task.meta.insert("test_files".to_string(), json); } } - task.meta - .insert("test_generation".to_string(), "agentic-docker".to_string()); + task.meta.insert( + "test_generation".to_string(), + "agentic-docker".to_string(), + ); return Ok(()); } ToolResult::Error(err) => { @@ -424,7 +421,8 @@ impl TestGenerator { anyhow::bail!( "Agentic test generation failed for {}: exhausted {} turns without submitting", - task.id, MAX_AGENT_TURNS + task.id, + MAX_AGENT_TURNS ) } @@ -593,16 +591,33 @@ enum ToolResult { fn reject_string_matching_tests(files: &[TestFile]) -> Option { let patterns: &[(&str, &str)] = &[ // Python source-reading patterns - (r#"open\([^)]*\)\.read"#, "open().read() used to read source files"), - 
(r#"Path\([^)]*\)\.read_text"#, "Path().read_text() used to read source files"), - (r#"\.read\(\)[^;]*assert.*\bin\b"#, ".read() + assert...in (string-matching)"), + ( + r#"open\([^)]*\)\.read"#, + "open().read() used to read source files", + ), + ( + r#"Path\([^)]*\)\.read_text"#, + "Path().read_text() used to read source files", + ), + ( + r#"\.read\(\)[^;]*assert.*\bin\b"#, + ".read() + assert...in (string-matching)", + ), // JavaScript/TypeScript source-reading patterns - (r#"readFileSync\("#, "readFileSync() used to read source files"), + ( + r#"readFileSync\("#, + "readFileSync() used to read source files", + ), (r#"readFile\("#, "readFile() used to read source files"), // Combined read + assert patterns - (r#"assert.*\bin\s+(source|content|text|code|file_content|src|contents)"#, - "assert...in source/content (string-matching on file content)"), - (r#"\.(includes|contains)\(['""]"#, ".includes()/.contains() on source content"), + ( + r#"assert.*\bin\s+(source|content|text|code|file_content|src|contents)"#, + "assert...in source/content (string-matching on file content)", + ), + ( + r#"\.(includes|contains)\(['""]"#, + ".includes()/.contains() on source content", + ), ]; let mut violations = Vec::new(); diff --git a/src/utils/AGENTS.md b/src/utils/AGENTS.md new file mode 100644 index 0000000..12c5e91 --- /dev/null +++ b/src/utils/AGENTS.md @@ -0,0 +1,28 @@ +# AGENTS.md — src/utils/ + +## Purpose + +Shared utility functions used across modules, primarily for extracting structured JSON from LLM responses. 
+ +## Module Structure + +| File | Responsibility | +|------|---------------| +| `mod.rs` | Re-exports | +| `json_extraction.rs` | JSON extraction from LLM responses: code blocks, regex, brace matching, truncation detection | + +## Key Functions + +- `extract_json_from_response(text)` — Primary extraction (tries code blocks, then regex, then brace matching) +- `try_extract_json_from_response(text)` — Returns `Option` instead of `Result` +- `extract_from_json_code_block(text)` — Extracts from ` ```json ... ``` ` blocks +- `extract_from_generic_code_block(text)` — Extracts from ` ``` ... ``` ` blocks +- `extract_json_with_regex(text)` — Regex-based JSON object extraction +- `find_matching_brace(text, start)` / `find_matching_bracket(text, start)` — Balanced delimiter matching +- `detect_truncated_json(text)` — Detects incomplete JSON responses +- `analyze_json_structure(text)` — Returns `JsonStructureAnalysis` with depth, key count, etc. + +## Rules + +- Prefer function calling over JSON extraction — these are fallback utilities +- `JsonExtractionError` should be used for all extraction failures