FrontierCS · andylizf · Apr 6, 2026 · Apr 6, 2026 · Apr 6, 2026 · Apr 6, 2026
diff --git a/.gitignore b/.gitignore
@@ -41,6 +41,7 @@ berkeley-function-call-leaderboard/bfcl_eval/scripts/ground_truth_conversation/
 # Ignore lock files
 berkeley-function-call-leaderboard/**/*.lock
 
+.claude/
 .direnv/
 .venv
 

diff --git a/algorithmic/README.md b/algorithmic/README.md
@@ -97,6 +97,107 @@ sky launch -c algo-judge algorithmic/sky-judge.yaml --idle-minutes-to-autostop 1
 frontier eval algorithmic 1 solution.cpp --judge-url http://$(sky status --ip algo-judge):8081
 ```
 
+### Agent Evaluation
+
+Agent mode lets an AI agent solve problems iteratively — reading the statement, writing code, compiling, testing against samples, and refining — rather than generating a single-shot solution.
+
+Agents use the [Claude Agent SDK](https://github.com/anthropics/claude-agent-sdk-python) (Claude Code as a library). The agent gets a temporary copy of the problem directory with full tool access (shell, file I/O, compilation).
+
+#### Model naming convention
+
+Append `-agent` to any Claude model name to trigger agent mode:
+
+```
+claude-sonnet-4-5-20250514-agent   # Agent mode with Sonnet 4.5
+claude-opus-4-6-20250610-agent     # Agent mode with Opus 4.6
+```
+
+The `-agent` suffix is stripped before passing the model to the SDK. The model prefix for output files includes `agent` (e.g., `claude4.5sonnetagent.cpp`), so agent and single-shot results never collide.
+
+#### Running agent evaluation
+
+```bash
+cd algorithmic/scripts
+
+# Single model, all problems
+python generate_solutions.py \
+  --model claude-sonnet-4-5-20250514-agent \
+  --judge-url http://localhost:8081
+
+# Subset of problems, custom budget
+python generate_solutions.py \
+  --model claude-sonnet-4-5-20250514-agent \
+  --problems 0,1,2,3 \
+  --agent-timeout 1800 \
+  --agent-cost-limit 30 \
+  --judge-url http://localhost:8081
+
+# Multiple variants per problem
+python generate_solutions.py \
+  --model claude-sonnet-4-5-20250514-agent \
+  --indices 3 \
+  --judge-url http://localhost:8081
+```
+
+**Agent-specific CLI flags:**
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--agent-timeout` | 10800 (3 hours) | Wall-clock timeout per problem in seconds |
+| `--agent-cost-limit` | none (no limit) | Max cost per problem in USD |
+
+#### Output files
+
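+For each problem/variant, agent mode produces three files: the solution and its metadata in the problem's solutions directory, plus a timestamped transcript in the generation logs directory: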
+
+```
+solutions/{problem_id}/
+├── claude4.5sonnetagent.cpp           # Extracted C++ solution
+├── claude4.5sonnetagent.meta.json     # Run metadata (cost, tokens, turns, status)
+└── (in generation_logs/)
+    └── claude4.5sonnetagent_*.transcript.jsonl  # Full agent transcript
+```
+
+**meta.json** fields:
+- `tokens_in` / `tokens_out` — total token usage
+- `cost_usd` — total API cost
+- `time_seconds` — wall-clock time
+- `turns` — number of agentic turns (tool-use round trips)
+- `status` — `success`, `timeout`, `cost_limit`, or `error`
+
+#### Prerequisites
+
+1. **Claude Agent SDK**: `pip install claude-agent-sdk` (or `uv sync` if already in project deps)
+2. **Claude Code CLI**: Must be installed and authenticated (`claude --version`)
+3. **Judge server**: Running and accessible (see [Judge Server Configuration](#judge-server-configuration))
+4. **g++**: Available in PATH for the agent to compile solutions
+
+#### How it works
+
+1. The problem directory is copied to a temp working directory (concurrent-safe)
+2. `testlib.h` is automatically copied from `judge/include/` if present (needed for interactive problems)
+3. The agent receives a structured prompt with the problem path and workflow guidance
+4. The agent iterates: reads the problem, writes code, compiles, tests against samples, and refines
+5. On completion (or timeout), `solution.cpp` is extracted from the working directory
+6. The temp directory is cleaned up; solution + metadata are saved
+
+#### Interactive problems
+
+Problems with `interactor.cc` (instead of `chk.cc`) are interactive — the solution communicates with a judge interactor via stdin/stdout. The agent prompt instructs it to:
+
+1. Compile the interactor using `g++ -std=gnu++17 -I. interactor.cc -o interactor`
+2. Test locally via pipes (e.g., `mkfifo pipe; ./solution < pipe | ./interactor > pipe`)
+3. Use the `testlib.h` that is provided automatically in the working directory
+
+Interactive problems are harder for agents because local testing requires building a pipe harness, which agents sometimes skip or get wrong.
+
+#### Known limitations
+
+- **No extended thinking**: The Claude Agent SDK does not currently expose extended thinking controls, so agents run without it. If the SDK adds support, enabling it may improve complex algorithmic reasoning.
+- **Rewrite tendency**: Agents sometimes rewrite solutions from scratch after failures, losing working logic. The prompt mitigates this but doesn't eliminate it.
+- **Interactive testing**: Agents frequently skip local testing for interactive problems, submitting untested code.
+- **Algorithm ceiling**: For problems requiring non-trivial algorithmic insight (advanced DP, flow, geometry), agent iteration doesn't compensate for model capability gaps.
+
 ### Creating Problems
 
 > For contributing problems to Frontier-CS (detailed file formats, CI requirements), see [CONTRIBUTING.md](../CONTRIBUTING.md#algorithmic-problems).

diff --git a/algorithmic/scripts/generate_solutions.py b/algorithmic/scripts/generate_solutions.py
@@ -17,6 +17,7 @@
 import time
 import argparse
 import re
+import json
 from pathlib import Path
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
@@ -54,6 +55,7 @@ class GenerationTask:
     variant_index: int
     solution_name: str
     total_variants: int = 1
+    problem_dir: Optional[str] = None  # Set for agent models
 
 
 class AlgorithmicJudgeClient:
@@ -298,15 +300,25 @@ def main():
     parser.add_argument("--concurrency", type=int, default=4,
                         help="Maximum parallel generations")
 
+    # Agent-specific parameters
+    parser.add_argument("--agent-timeout", type=float, default=10800.0,
+                        help="Agent timeout in seconds (default: 10800 = 3 hours)")
+    parser.add_argument("--agent-cost-limit", type=float, default=None,
+                        help="Agent max cost per problem in USD (default: no limit)")
+
+
     args = parser.parse_args()
 
     # Output directory for algorithmic solutions
     output_dir = algo_dir / "solutions"
 
+    # Detect if all models are agent-only (no judge needed)
+    all_agent = args.models and all(m.endswith("-agent") for m in args.models)
+
     # Initialize judge client
     judge = AlgorithmicJudgeClient(args.judge_url)
 
-    if not judge.is_available():
+    if not all_agent and not judge.is_available():
         print(f"{red('ERROR:')} Judge server not available at {args.judge_url}")
         print("Start the judge with: cd algorithmic && docker compose up -d")
         sys.exit(1)
@@ -390,7 +402,15 @@ def main():
     skipped: List[str] = []
 
     for problem_id in problem_ids:
-        statement = judge.get_problem_statement(problem_id)
+        # Resolve problem directory
+        problem_dir_path = algo_dir / "problems" / problem_id
+
+        if all_agent:
+            # Agent mode reads statement from local file; no judge needed
+            stmt_path = problem_dir_path / "statement.txt"
+            statement = stmt_path.read_text(encoding="utf-8") if stmt_path.exists() else ""
+        else:
+            statement = judge.get_problem_statement(problem_id)
         if not statement:
             print(f"{yellow('WARNING:')} Could not get statement for problem {problem_id}")
             continue
@@ -399,6 +419,7 @@ def main():
             model_prefix = get_model_prefix(model)
             provider = detect_provider(model)
             reasoning = is_reasoning_model(model)
+            is_agent = model.endswith("-agent")
 
             for variant_idx in solution_indices:
                 # Nested format: {problem}/{model}.cpp or {problem}/{model}_{variant}.cpp
@@ -428,6 +449,7 @@ def main():
                     variant_index=variant_idx,
                     solution_name=sol_filename,
                     total_variants=len(solution_indices),
+                    problem_dir=str(problem_dir_path) if is_agent else None,
                 ))
 
     # Print plan
@@ -498,14 +520,41 @@ def execute_task(task: GenerationTask) -> Tuple[str, str, Optional[str], str, Op
         failed_path = get_failed_path(sol_path)
 
         try:
-            code = generate_code(
-                task.statement,
-                model=task.model,
-                api_key=api_key,
-                log_file=log_file,
-                is_reasoning_model=task.reasoning_model,
-                timeout=args.timeout,
-            )
+            if task.problem_dir is not None:
+                # Agent mode
+                from frontier_cs.gen.agent_interface import generate_agent_solution
+
+                base_model = task.model.removesuffix("-agent")
+                transcript_path = logs_dir / task.solution_name.replace(".cpp", f"_{timestamp}.transcript.jsonl")
+                transcript_path.parent.mkdir(parents=True, exist_ok=True)
+
+                code, metadata = generate_agent_solution(
+                    problem_dir=task.problem_dir,
+                    model=base_model,
+                    api_key=api_key,
+                    cost_limit=args.agent_cost_limit,
+                    timeout=args.agent_timeout,
+                    transcript_path=transcript_path,
+                )
+
+                # Save metadata alongside solution
+                meta_path = sol_path.with_suffix(".meta.json")
+                meta_path.parent.mkdir(parents=True, exist_ok=True)
+                meta_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
+                print(f"  {dim('meta:')} {meta_path}")
+            else:
+                # Single-shot mode (existing)
+                code = generate_code(
+                    task.statement,
+                    model=task.model,
+                    api_key=api_key,
+                    log_file=log_file,
+                    is_reasoning_model=task.reasoning_model,
+                    timeout=args.timeout,
+                )
+
+            if not code:
+                raise RuntimeError("No solution code produced")
 
             # Save solution to nested directory
             sol_path.parent.mkdir(parents=True, exist_ok=True)

diff --git a/pyproject.toml b/pyproject.toml
@@ -5,6 +5,7 @@ description = "Evaluation framework for Frontier-CS problems"
 requires-python = ">=3.11"
 dependencies = [
     "anthropic>=0.74.0",
+    "claude-agent-sdk>=0.1.0",
     "colorlog>=6.10.1",
     "datasets>=4.4.1",
     "google-genai>=1.55.0",
@@ -28,3 +29,8 @@ package = true
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
+
+[dependency-groups]
+dev = [
+    "pytest>=9.0.3",
+]