diff --git a/.gitignore b/.gitignore
index f1ac09f1..53753de8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -41,6 +41,7 @@ berkeley-function-call-leaderboard/bfcl_eval/scripts/ground_truth_conversation/
# Ignore lock files
berkeley-function-call-leaderboard/**/*.lock
+.claude/
.direnv/
.venv
diff --git a/algorithmic/README.md b/algorithmic/README.md
index 8737c55e..9299a7e3 100644
--- a/algorithmic/README.md
+++ b/algorithmic/README.md
@@ -97,6 +97,107 @@ sky launch -c algo-judge algorithmic/sky-judge.yaml --idle-minutes-to-autostop 1
frontier eval algorithmic 1 solution.cpp --judge-url http://$(sky status --ip algo-judge):8081
```
+### Agent Evaluation
+
+Agent mode lets an AI agent solve problems iteratively — reading the statement, writing code, compiling, testing against samples, and refining — rather than generating a single-shot solution.
+
+Agents use the [Claude Agent SDK](https://github.com/anthropics/claude-agent-sdk-python) (Claude Code as a library). The agent gets a temporary copy of the problem directory with full tool access (shell, file I/O, compilation).
+
+#### Model naming convention
+
+Append `-agent` to any Claude model name to trigger agent mode:
+
+```
+claude-sonnet-4-5-20250514-agent # Agent mode with Sonnet 4.5
+claude-opus-4-6-20250610-agent # Agent mode with Opus 4.6
+```
+
+The `-agent` suffix is stripped before passing the model to the SDK. The model prefix for output files includes `agent` (e.g., `claude4.5sonnetagent.cpp`), so agent and single-shot results never collide.
+
+#### Running agent evaluation
+
+```bash
+cd algorithmic/scripts
+
+# Single model, all problems
+python generate_solutions.py \
+ --model claude-sonnet-4-5-20250514-agent \
+ --judge-url http://localhost:8081
+
+# Subset of problems, custom budget
+python generate_solutions.py \
+ --model claude-sonnet-4-5-20250514-agent \
+ --problems 0,1,2,3 \
+ --agent-timeout 1800 \
+ --agent-cost-limit 30 \
+ --judge-url http://localhost:8081
+
+# Multiple variants per problem
+python generate_solutions.py \
+ --model claude-sonnet-4-5-20250514-agent \
+ --indices 3 \
+ --judge-url http://localhost:8081
+```
+
+**Agent-specific CLI flags:**
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--agent-timeout` | 10800 (3 hours) | Wall-clock timeout per problem in seconds |
+| `--agent-cost-limit` | none (no limit) | Max cost per problem in USD |
+
+#### Output files
+
+For each problem/variant, agent mode produces three files:
+
+```
+solutions/{problem_id}/
+├── claude4.5sonnetagent.cpp # Extracted C++ solution
+├── claude4.5sonnetagent.meta.json # Run metadata (cost, tokens, turns, status)
+└── (in generation_logs/)
+ └── claude4.5sonnetagent_*.transcript.jsonl # Full agent transcript
+```
+
+**meta.json** fields:
+- `tokens_in` / `tokens_out` — total token usage
+- `cost_usd` — total API cost
+- `time_seconds` — wall-clock time
+- `turns` — number of agentic turns (tool-use round trips)
+- `status` — `success`, `timeout`, `cost_limit`, or `error`
+
+#### Prerequisites
+
+1. **Claude Agent SDK**: `pip install claude-agent-sdk` (or `uv sync` if already in project deps)
+2. **Claude Code CLI**: Must be installed and authenticated (`claude --version`)
+3. **Judge server**: Running and accessible (see [Judge Server Configuration](#judge-server-configuration))
+4. **g++**: Available in PATH for the agent to compile solutions
+
+#### How it works
+
+1. The problem directory is copied to a temp working directory (concurrent-safe)
+2. `testlib.h` is automatically copied from `judge/include/` if present (needed for interactive problems)
+3. The agent receives a structured prompt with the problem path and workflow guidance
+4. The agent iterates: reads the problem, writes code, compiles, tests against samples, and refines
+5. On completion (or timeout), `solution.cpp` is extracted from the working directory
+6. The temp directory is cleaned up; solution + metadata are saved
+
+#### Interactive problems
+
+Problems with `interactor.cc` (instead of `chk.cc`) are interactive — the solution communicates with a judge interactor via stdin/stdout. The agent prompt instructs it to:
+
+1. Compile the interactor using `g++ -std=gnu++17 -I. interactor.cc -o interactor`
+2. Test locally via pipes (e.g., `mkfifo pipe; ./solution < pipe | ./interactor > pipe`)
+3. Rely on the `testlib.h` that is provided automatically in the working directory
+
+Interactive problems are harder for agents because local testing requires building a pipe harness, which agents sometimes skip or get wrong.
+
+#### Known limitations
+
+- **No extended thinking**: The Claude Agent SDK does not currently expose extended thinking controls. If it becomes configurable, enabling extended thinking may improve complex algorithmic reasoning.
+- **Rewrite tendency**: Agents sometimes rewrite solutions from scratch after failures, losing working logic. The prompt mitigates this but doesn't eliminate it.
+- **Interactive testing**: Agents frequently skip local testing for interactive problems, submitting untested code.
+- **Algorithm ceiling**: For problems requiring non-trivial algorithmic insight (advanced DP, flow, geometry), agent iteration doesn't compensate for model capability gaps.
+
### Creating Problems
> For contributing problems to Frontier-CS (detailed file formats, CI requirements), see [CONTRIBUTING.md](../CONTRIBUTING.md#algorithmic-problems).
diff --git a/algorithmic/scripts/generate_solutions.py b/algorithmic/scripts/generate_solutions.py
index 93a6f260..6d695b96 100644
--- a/algorithmic/scripts/generate_solutions.py
+++ b/algorithmic/scripts/generate_solutions.py
@@ -17,6 +17,7 @@
import time
import argparse
import re
+import json
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
@@ -54,6 +55,7 @@ class GenerationTask:
variant_index: int
solution_name: str
total_variants: int = 1
+ problem_dir: Optional[str] = None # Set for agent models
class AlgorithmicJudgeClient:
@@ -298,15 +300,25 @@ def main():
parser.add_argument("--concurrency", type=int, default=4,
help="Maximum parallel generations")
+ # Agent-specific parameters
+ parser.add_argument("--agent-timeout", type=float, default=10800.0,
+ help="Agent timeout in seconds (default: 10800 = 3 hours)")
+ parser.add_argument("--agent-cost-limit", type=float, default=None,
+ help="Agent max cost per problem in USD (default: no limit)")
+
+
args = parser.parse_args()
# Output directory for algorithmic solutions
output_dir = algo_dir / "solutions"
+ # Detect if all models are agent-only (no judge needed)
+ all_agent = args.models and all(m.endswith("-agent") for m in args.models)
+
# Initialize judge client
judge = AlgorithmicJudgeClient(args.judge_url)
- if not judge.is_available():
+ if not all_agent and not judge.is_available():
print(f"{red('ERROR:')} Judge server not available at {args.judge_url}")
print("Start the judge with: cd algorithmic && docker compose up -d")
sys.exit(1)
@@ -390,7 +402,15 @@ def main():
skipped: List[str] = []
for problem_id in problem_ids:
- statement = judge.get_problem_statement(problem_id)
+ # Resolve problem directory
+ problem_dir_path = algo_dir / "problems" / problem_id
+
+ if all_agent:
+ # Agent mode reads statement from local file; no judge needed
+ stmt_path = problem_dir_path / "statement.txt"
+ statement = stmt_path.read_text(encoding="utf-8") if stmt_path.exists() else ""
+ else:
+ statement = judge.get_problem_statement(problem_id)
if not statement:
print(f"{yellow('WARNING:')} Could not get statement for problem {problem_id}")
continue
@@ -399,6 +419,7 @@ def main():
model_prefix = get_model_prefix(model)
provider = detect_provider(model)
reasoning = is_reasoning_model(model)
+ is_agent = model.endswith("-agent")
for variant_idx in solution_indices:
# Nested format: {problem}/{model}.cpp or {problem}/{model}_{variant}.cpp
@@ -428,6 +449,7 @@ def main():
variant_index=variant_idx,
solution_name=sol_filename,
total_variants=len(solution_indices),
+ problem_dir=str(problem_dir_path) if is_agent else None,
))
# Print plan
@@ -498,14 +520,41 @@ def execute_task(task: GenerationTask) -> Tuple[str, str, Optional[str], str, Op
failed_path = get_failed_path(sol_path)
try:
- code = generate_code(
- task.statement,
- model=task.model,
- api_key=api_key,
- log_file=log_file,
- is_reasoning_model=task.reasoning_model,
- timeout=args.timeout,
- )
+ if task.problem_dir is not None:
+ # Agent mode
+ from frontier_cs.gen.agent_interface import generate_agent_solution
+
+ base_model = task.model.removesuffix("-agent")
+ transcript_path = logs_dir / task.solution_name.replace(".cpp", f"_{timestamp}.transcript.jsonl")
+ transcript_path.parent.mkdir(parents=True, exist_ok=True)
+
+ code, metadata = generate_agent_solution(
+ problem_dir=task.problem_dir,
+ model=base_model,
+ api_key=api_key,
+ cost_limit=args.agent_cost_limit,
+ timeout=args.agent_timeout,
+ transcript_path=transcript_path,
+ )
+
+ # Save metadata alongside solution
+ meta_path = sol_path.with_suffix(".meta.json")
+ meta_path.parent.mkdir(parents=True, exist_ok=True)
+ meta_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
+ print(f" {dim('meta:')} {meta_path}")
+ else:
+ # Single-shot mode (existing)
+ code = generate_code(
+ task.statement,
+ model=task.model,
+ api_key=api_key,
+ log_file=log_file,
+ is_reasoning_model=task.reasoning_model,
+ timeout=args.timeout,
+ )
+
+ if not code:
+ raise RuntimeError("No solution code produced")
# Save solution to nested directory
sol_path.parent.mkdir(parents=True, exist_ok=True)
diff --git a/pyproject.toml b/pyproject.toml
index a7389580..62ae3466 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,6 +5,7 @@ description = "Evaluation framework for Frontier-CS problems"
requires-python = ">=3.11"
dependencies = [
"anthropic>=0.74.0",
+ "claude-agent-sdk>=0.1.0",
"colorlog>=6.10.1",
"datasets>=4.4.1",
"google-genai>=1.55.0",
@@ -28,3 +29,8 @@ package = true
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
+
+[dependency-groups]
+dev = [
+ "pytest>=9.0.3",
+]
diff --git a/src/frontier_cs/gen/agent_constants.py b/src/frontier_cs/gen/agent_constants.py
new file mode 100644
index 00000000..1b80194c
--- /dev/null
+++ b/src/frontier_cs/gen/agent_constants.py
@@ -0,0 +1,345 @@
+"""Prompt templates, shell scripts, and CLAUDE.md content for agent eval.
+
+All large string constants live here to keep agent_interface.py focused on logic.
+"""
+
+# ---------------------------------------------------------------------------
+# Shell scripts (used in parity=False mode only)
+# ---------------------------------------------------------------------------
+
+TEST_ALL_SH = r"""#!/bin/bash
+set -e
+echo "=== Compiling solution.cpp ==="
+g++ -std=gnu++17 -O2 -o solution solution.cpp
+echo "=== Compilation OK ==="
+
+# Compile checker if available (special judge)
+USE_CHECKER=0
+if [ -f "chk.cc" ]; then
+ echo "=== Compiling special judge (chk.cc) ==="
+ if g++ -std=gnu++17 -O2 -I. chk.cc -o checker 2>/dev/null; then
+ USE_CHECKER=1
+ echo "=== Checker compiled OK — using it instead of diff ==="
+ else
+ echo "=== Checker compilation failed — falling back to diff ==="
+ fi
+fi
+
+passed=0; failed=0; total=0
+for inf in testdata/*.in; do
+ [ -f "$inf" ] || continue
+ id=$(basename "$inf" .in)
+ ans="testdata/${id}.ans"
+ [ -f "$ans" ] || continue
+ total=$((total + 1))
+
+ # Run with timeout
+ if timeout 15 ./solution < "$inf" > "my_${id}.out" 2>"my_${id}.err"; then
+ if [ "$USE_CHECKER" -eq 1 ]; then
+ # Special judge: ./checker