From 509f5cb9d2ac7abfe30b5e5a49f5c429684fab33 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 6 Apr 2026 11:40:48 +0000 Subject: [PATCH 01/16] feat: add claude-agent-sdk dependency for agent eval --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index a7389580..317700e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,7 @@ description = "Evaluation framework for Frontier-CS problems" requires-python = ">=3.11" dependencies = [ "anthropic>=0.74.0", + "claude-agent-sdk>=0.1.0", "colorlog>=6.10.1", "datasets>=4.4.1", "google-genai>=1.55.0", From c8ee4aaadf61dcd62a89ebd44ab266e476dafd7c Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 6 Apr 2026 11:44:27 +0000 Subject: [PATCH 02/16] feat: handle -agent model suffix in model prefix and provider detection --- src/frontier_cs/models.py | 26 ++++++++++++++++++-------- tests/test_agent_interface.py | 25 +++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 8 deletions(-) create mode 100644 tests/test_agent_interface.py diff --git a/src/frontier_cs/models.py b/src/frontier_cs/models.py index bec6640c..03fd4b83 100644 --- a/src/frontier_cs/models.py +++ b/src/frontier_cs/models.py @@ -42,6 +42,12 @@ def get_model_prefix(model: str) -> str: """ original = model + # Strip and track -agent suffix + agent_suffix = "" + if model.endswith("-agent"): + agent_suffix = "agent" + model = model.removesuffix("-agent") + # Remove provider prefix if present (e.g., 'gemini/gemini-2.5-pro' -> 'gemini-2.5-pro') if "/" in model: model = model.split("/", 1)[1] @@ -51,21 +57,21 @@ def get_model_prefix(model: str) -> str: # Handle GPT-5 variants # Keep 'gpt-5.1', 'gpt-5.2' etc. distinct so their artifacts prefix correctly if model_lower.startswith("gpt-5.2") or model_lower.startswith("gpt5.2"): - return "gpt5.2" + return "gpt5.2" + agent_suffix if model_lower.startswith("gpt-5.1") or model_lower.startswith("gpt5.1"): - return "gpt5.1" + return "gpt5.1" + agent_suffix if model_lower.startswith("gpt-5") or model_lower.startswith("gpt5"): - return "gpt5" + return "gpt5" + agent_suffix # Handle Gemini 2.5 Pro variants if "gemini-2.5-pro" in model_lower or "gemini2.5pro" in model_lower: - return "gemini2.5pro" + return "gemini2.5pro" + agent_suffix # Handle other Gemini variants (e.g., gemini-1.5-pro -> gemini1.5pro) gemini_match = re.match(r"gemini-?(\d+\.?\d*)-?pro", model_lower) if gemini_match: version = gemini_match.group(1) - return f"gemini{version}pro" + return f"gemini{version}pro" + agent_suffix # Handle Claude variants (e.g., claude-sonnet-4-5-20250929 -> claude4.5sonnet) claude_match = re.match(r"claude-([a-z]+)-(\d+)-(\d+)", model_lower) @@ -73,19 +79,19 @@ def get_model_prefix(model: str) -> str: family = claude_match.group(1) major = claude_match.group(2) minor = claude_match.group(3) - return f"claude{major}.{minor}{family}" + return f"claude{major}.{minor}{family}" + agent_suffix # Handle Grok variants - keep 'fast' and 'reasoning' in the prefix if "grok" in model_lower: sanitized = re.sub(r"[^a-zA-Z0-9]+", "", model_lower) if sanitized: - return sanitized + return sanitized + agent_suffix # Default: sanitize by removing all non-alphanumeric characters sanitized = re.sub(r"[^a-zA-Z0-9]+", "", model_lower) if not sanitized: raise ValueError(f"Unable to derive model prefix from '{original}'") - return sanitized + return sanitized + agent_suffix def normalize_solution_name(name: str) -> str: @@ -217,6 +223,10 @@ def detect_provider(model: str) -> str: Returns: Provider name: 'openai', 
'google', 'anthropic', 'xai', 'deepseek', 'openrouter' """ + # Strip agent suffix before detection + if model.endswith("-agent"): + model = model.removesuffix("-agent") + normalized = model.strip() if "/" in normalized: provider_hint, actual_model = normalized.split("/", 1) diff --git a/tests/test_agent_interface.py b/tests/test_agent_interface.py new file mode 100644 index 00000000..96c19e05 --- /dev/null +++ b/tests/test_agent_interface.py @@ -0,0 +1,25 @@ +"""Tests for agent model name handling and agent_interface.""" + +from frontier_cs.models import get_model_prefix, detect_provider, is_reasoning_model + + +def test_agent_model_prefix(): + """Agent model prefix includes 'agent' suffix.""" + assert get_model_prefix("claude-opus-4-6-agent") == "claude4.6opusagent" + assert get_model_prefix("claude-sonnet-4-5-agent") == "claude4.5sonnetagent" + + +def test_agent_model_prefix_does_not_collide_with_single_shot(): + """Agent prefix must differ from single-shot prefix.""" + assert get_model_prefix("claude-opus-4-6-agent") != get_model_prefix("claude-opus-4-6") + + +def test_agent_detect_provider(): + """Agent models detect as anthropic provider.""" + assert detect_provider("claude-opus-4-6-agent") == "anthropic" + assert detect_provider("claude-sonnet-4-5-agent") == "anthropic" + + +def test_agent_is_not_reasoning_model(): + """Agent models are not reasoning models.""" + assert is_reasoning_model("claude-opus-4-6-agent") is False From f208dfa90184f8ea600f620833428c827e2fbd90 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 6 Apr 2026 11:47:03 +0000 Subject: [PATCH 03/16] =?UTF-8?q?feat:=20add=20agent=5Finterface.py=20?= =?UTF-8?q?=E2=80=94=20core=20agent=20runner=20with=20logging=20and=20extr?= =?UTF-8?q?action?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/frontier_cs/gen/agent_interface.py | 313 +++++++++++++++++++++++++ tests/test_agent_interface.py | 57 +++++ 2 files changed, 370 insertions(+) create mode 100644 src/frontier_cs/gen/agent_interface.py diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py new file mode 100644 index 00000000..97dc09ba --- /dev/null +++ b/src/frontier_cs/gen/agent_interface.py @@ -0,0 +1,313 @@ +"""Agent-based solution generation using Claude Agent SDK. + +This module handles the full agent lifecycle for solving competitive programming +problems: prompt construction, Agent SDK invocation with streaming, JSONL transcript +logging, live monitoring, timeout/cost control, and solution extraction. + +Agent models are identified by a "-agent" suffix (e.g., "claude-opus-4-6-agent"). +They are treated as distinct "models" in the gen pipeline — no special routing needed +downstream. +""" + +import asyncio +import json +import logging +import sys +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, Optional, Tuple + +logger = logging.getLogger(__name__) + +# Default budget limits +DEFAULT_COST_LIMIT_USD = 20.0 +DEFAULT_TIMEOUT_SECONDS = 1200 # 20 minutes + + +def build_agent_prompt(problem_dir: str) -> str: + """Construct the prompt given to the agent. + + Args: + problem_dir: Absolute path to the problem directory. + + Returns: + The prompt string for the agent. + """ + return f"""You are solving a competitive programming problem. 
+ +Problem directory: {problem_dir} +- Read statement.txt for the problem description +- testdata/ contains sample test cases (*.in, *.ans), but these are only a subset +- Your solution will be evaluated against a larger hidden test suite +- You can compile with g++, run against the available samples, and iterate +- config.yaml has time/memory limits — respect them in your solution + +Submit your final solution as solution.cpp in the current working directory.""" + + +def extract_solution_cpp(workdir: Path) -> str: + """Extract solution.cpp from the agent working directory. + + Looks for solution.cpp first, then falls back to any .cpp file. + + Args: + workdir: The agent's working directory. + + Returns: + The C++ source code, or empty string if not found. + """ + # Primary: solution.cpp + sol = workdir / "solution.cpp" + if sol.is_file(): + return sol.read_text(encoding="utf-8") + + # Fallback: any .cpp file (agent might have used a different name) + cpp_files = list(workdir.glob("*.cpp")) + if cpp_files: + # Pick the most recently modified one + newest = max(cpp_files, key=lambda p: p.stat().st_mtime) + return newest.read_text(encoding="utf-8") + + return "" + + +def build_metadata( + *, + tokens_in: int, + tokens_out: int, + cost_usd: float, + time_seconds: float, + turns: int, + status: str, +) -> Dict[str, Any]: + """Build the metadata dict for an agent run. + + Args: + tokens_in: Total input tokens consumed. + tokens_out: Total output tokens consumed. + cost_usd: Total cost in USD. + time_seconds: Wall-clock time in seconds. + turns: Number of agentic turns (tool-use round trips). + status: One of "success", "timeout", "cost_limit", "error". + + Returns: + Metadata dictionary. + """ + return { + "tokens_in": tokens_in, + "tokens_out": tokens_out, + "cost_usd": round(cost_usd, 4), + "time_seconds": round(time_seconds, 2), + "turns": turns, + "status": status, + } + + +@dataclass +class TranscriptLogger: + """Writes JSONL transcript of agent events, flushed per event.""" + + path: Path + _file: Any = field(default=None, init=False, repr=False) + + def open(self) -> None: + self.path.parent.mkdir(parents=True, exist_ok=True) + self._file = open(self.path, "w", encoding="utf-8") + + def log(self, event: Dict[str, Any]) -> None: + if self._file is None: + return + event["_ts"] = time.time() + self._file.write(json.dumps(event, default=str) + "\n") + self._file.flush() + + def close(self) -> None: + if self._file is not None: + self._file.close() + self._file = None + + +async def run_agent( + problem_dir: str, + model: str, + *, + cost_limit: float = DEFAULT_COST_LIMIT_USD, + timeout: float = DEFAULT_TIMEOUT_SECONDS, + transcript_path: Optional[Path] = None, +) -> Tuple[str, Dict[str, Any]]: + """Run the agent to solve a problem. + + Args: + problem_dir: Absolute path to the problem directory. + model: Base model name (without -agent suffix). + cost_limit: Maximum cost in USD. + timeout: Maximum wall-clock time in seconds. + transcript_path: Path for JSONL transcript log. None to skip. + + Returns: + Tuple of (cpp_code, metadata_dict). 
+ """ + from claude_agent_sdk import query, ClaudeAgentOptions + from claude_agent_sdk.types import StreamEvent + + prompt = build_agent_prompt(problem_dir) + workdir = Path(problem_dir) + + options = ClaudeAgentOptions( + model=model, + cwd=str(workdir), + max_budget_usd=cost_limit, + permission_mode="bypassPermissions", + include_partial_messages=True, + ) + + # Set up transcript logging + transcript = TranscriptLogger(transcript_path) if transcript_path else None + if transcript: + transcript.open() + + start_time = time.time() + status = "success" + num_turns = 0 + total_cost: Optional[float] = None + usage_in = 0 + usage_out = 0 + + try: + async def _run(): + nonlocal num_turns, total_cost, usage_in, usage_out + + async for message in query(prompt=prompt, options=options): + # Import here to check types + from claude_agent_sdk import AssistantMessage, ResultMessage + + if isinstance(message, StreamEvent): + event = message.event + event_type = event.get("type", "") + + # Log every event + if transcript: + transcript.log({"type": "stream_event", "event": event}) + + # Live monitoring: tool calls + if event_type == "content_block_start": + cb = event.get("content_block", {}) + if cb.get("type") == "tool_use": + tool = cb.get("name", "?") + elapsed = time.time() - start_time + print( + f" [{elapsed:6.1f}s] [turn {num_turns}] {tool}", + flush=True, + ) + + elif isinstance(message, AssistantMessage): + num_turns += 1 + if transcript: + tools_used = [ + b.name + for b in message.content + if hasattr(b, "name") + ] + transcript.log({ + "type": "assistant_turn", + "turn": num_turns, + "tools": tools_used, + "model": message.model, + }) + + # Per-message usage tracking + if message.usage: + usage_in += message.usage.get("input_tokens", 0) + usage_out += message.usage.get("output_tokens", 0) + + # Periodic cost summary to stderr + elapsed = time.time() - start_time + print( + f" [{elapsed:6.1f}s] turn {num_turns}, " + f"{usage_in // 1000}K in / {usage_out // 1000}K out", + file=sys.stderr, + flush=True, + ) + + elif isinstance(message, ResultMessage): + total_cost = message.total_cost_usd + if message.usage: + usage_in = message.usage.get("input_tokens", usage_in) + usage_out = message.usage.get("output_tokens", usage_out) + num_turns = message.num_turns + if transcript: + transcript.log({ + "type": "result", + "cost_usd": total_cost, + "num_turns": num_turns, + "duration_ms": message.duration_ms, + "stop_reason": message.stop_reason, + "is_error": message.is_error, + }) + + await asyncio.wait_for(_run(), timeout=timeout) + + except asyncio.TimeoutError: + status = "timeout" + logger.warning("Agent timed out after %.0fs", timeout) + except Exception as e: + status = "error" + logger.error("Agent error: %s", e) + if transcript: + transcript.log({"type": "error", "error": str(e)}) + finally: + if transcript: + transcript.close() + + elapsed = time.time() - start_time + + # Extract solution (best-effort even on timeout/error) + code = extract_solution_cpp(workdir) + if not code and status == "success": + status = "error" + logger.error("Agent completed but no .cpp file found in %s", workdir) + + metadata = build_metadata( + tokens_in=usage_in, + tokens_out=usage_out, + cost_usd=total_cost if total_cost is not None else 0.0, + time_seconds=elapsed, + turns=num_turns, + status=status, + ) + + return code, metadata + + +def generate_agent_solution( + problem_dir: str, + model: str, + *, + cost_limit: float = DEFAULT_COST_LIMIT_USD, + timeout: float = DEFAULT_TIMEOUT_SECONDS, + transcript_path: 
Optional[Path] = None, +) -> Tuple[str, Dict[str, Any]]: + """Synchronous wrapper for run_agent. + + This is the main entry point called from generate_solutions.py. + + Args: + problem_dir: Absolute path to the problem directory. + model: Base model name (without -agent suffix). + cost_limit: Maximum cost in USD. + timeout: Maximum wall-clock time in seconds. + transcript_path: Path for JSONL transcript log. + + Returns: + Tuple of (cpp_code, metadata_dict). + """ + return asyncio.run( + run_agent( + problem_dir, + model, + cost_limit=cost_limit, + timeout=timeout, + transcript_path=transcript_path, + ) + ) diff --git a/tests/test_agent_interface.py b/tests/test_agent_interface.py index 96c19e05..be163ba4 100644 --- a/tests/test_agent_interface.py +++ b/tests/test_agent_interface.py @@ -23,3 +23,60 @@ def test_agent_detect_provider(): def test_agent_is_not_reasoning_model(): """Agent models are not reasoning models.""" assert is_reasoning_model("claude-opus-4-6-agent") is False + + +import json +import tempfile +from pathlib import Path + + +def test_build_agent_prompt(): + """Agent prompt includes problem dir and key instructions.""" + from frontier_cs.gen.agent_interface import build_agent_prompt + + prompt = build_agent_prompt("/tmp/fake_problem") + assert "/tmp/fake_problem" in prompt + assert "statement.txt" in prompt + assert "testdata/" in prompt + assert "hidden test suite" in prompt + assert "solution.cpp" in prompt + + +def test_extract_cpp_from_workdir(): + """Extract solution.cpp from agent working directory.""" + from frontier_cs.gen.agent_interface import extract_solution_cpp + + with tempfile.TemporaryDirectory() as tmpdir: + sol_path = Path(tmpdir) / "solution.cpp" + sol_path.write_text('#include \nint main() { return 0; }') + code = extract_solution_cpp(Path(tmpdir)) + assert "#include " in code + + +def test_extract_cpp_missing(): + """Return empty string if no solution.cpp found.""" + from frontier_cs.gen.agent_interface import extract_solution_cpp + + with tempfile.TemporaryDirectory() as tmpdir: + code = extract_solution_cpp(Path(tmpdir)) + assert code == "" + + +def test_build_metadata(): + """Build metadata dict from agent run results.""" + from frontier_cs.gen.agent_interface import build_metadata + + meta = build_metadata( + tokens_in=100000, + tokens_out=25000, + cost_usd=5.50, + time_seconds=300.5, + turns=15, + status="success", + ) + assert meta["tokens_in"] == 100000 + assert meta["tokens_out"] == 25000 + assert meta["cost_usd"] == 5.50 + assert meta["time_seconds"] == 300.5 + assert meta["turns"] == 15 + assert meta["status"] == "success" From e385340b8e5fb63102ebb29232b6712318e7a3c7 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 6 Apr 2026 11:53:23 +0000 Subject: [PATCH 04/16] feat: integrate agent mode into generate_solutions.py Add agent model support to the solution generation pipeline: - Detect -agent suffix models and store problem_dir in GenerationTask - Add --agent-timeout and --agent-cost-limit CLI arguments - Branch execute_task to call generate_agent_solution for agent models - Save .meta.json alongside generated .cpp solutions - Add import json for metadata serialization --- algorithmic/scripts/generate_solutions.py | 55 +++++++++++++++++++---- 1 file changed, 47 insertions(+), 8 deletions(-) diff --git a/algorithmic/scripts/generate_solutions.py b/algorithmic/scripts/generate_solutions.py index 93a6f260..7d01b5b6 100644 --- a/algorithmic/scripts/generate_solutions.py +++ b/algorithmic/scripts/generate_solutions.py @@ -17,6 +17,7 @@ import 
time import argparse import re +import json from pathlib import Path from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass @@ -54,6 +55,7 @@ class GenerationTask: variant_index: int solution_name: str total_variants: int = 1 + problem_dir: Optional[str] = None # Set for agent models class AlgorithmicJudgeClient: @@ -298,6 +300,12 @@ def main(): parser.add_argument("--concurrency", type=int, default=4, help="Maximum parallel generations") + # Agent-specific parameters + parser.add_argument("--agent-timeout", type=float, default=1200.0, + help="Agent timeout in seconds (default: 1200 = 20 min)") + parser.add_argument("--agent-cost-limit", type=float, default=20.0, + help="Agent max cost per problem in USD (default: 20)") + args = parser.parse_args() # Output directory for algorithmic solutions @@ -395,10 +403,14 @@ def main(): print(f"{yellow('WARNING:')} Could not get statement for problem {problem_id}") continue + # Resolve problem directory for agent models + problem_dir_path = algo_dir / "problems" / problem_id + for model in models_list: model_prefix = get_model_prefix(model) provider = detect_provider(model) reasoning = is_reasoning_model(model) + is_agent = model.endswith("-agent") for variant_idx in solution_indices: # Nested format: {problem}/{model}.cpp or {problem}/{model}_{variant}.cpp @@ -428,6 +440,7 @@ def main(): variant_index=variant_idx, solution_name=sol_filename, total_variants=len(solution_indices), + problem_dir=str(problem_dir_path) if is_agent else None, )) # Print plan @@ -498,14 +511,40 @@ def execute_task(task: GenerationTask) -> Tuple[str, str, Optional[str], str, Op failed_path = get_failed_path(sol_path) try: - code = generate_code( - task.statement, - model=task.model, - api_key=api_key, - log_file=log_file, - is_reasoning_model=task.reasoning_model, - timeout=args.timeout, - ) + if task.problem_dir is not None: + # Agent mode + from frontier_cs.gen.agent_interface import generate_agent_solution + + base_model = task.model.removesuffix("-agent") + transcript_path = logs_dir / task.solution_name.replace(".cpp", f"_{timestamp}.transcript.jsonl") + transcript_path.parent.mkdir(parents=True, exist_ok=True) + + code, metadata = generate_agent_solution( + problem_dir=task.problem_dir, + model=base_model, + cost_limit=args.agent_cost_limit, + timeout=args.agent_timeout, + transcript_path=transcript_path, + ) + + # Save metadata alongside solution + meta_path = sol_path.with_suffix(".meta.json") + meta_path.parent.mkdir(parents=True, exist_ok=True) + meta_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8") + print(f" {dim('meta:')} {meta_path}") + else: + # Single-shot mode (existing) + code = generate_code( + task.statement, + model=task.model, + api_key=api_key, + log_file=log_file, + is_reasoning_model=task.reasoning_model, + timeout=args.timeout, + ) + + if not code: + raise RuntimeError("No solution code produced") # Save solution to nested directory sol_path.parent.mkdir(parents=True, exist_ok=True) From 99fb5c05f2671beafbdc5f2763be0338e60324e5 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 6 Apr 2026 13:07:45 +0000 Subject: [PATCH 05/16] fix: use temp workdir and improve token tracking in agent_interface - Copy problem dir to temp directory so agent doesn't pollute originals - Makes concurrent runs on same problem safe - Track token usage from streaming message_delta events (only reliable source when timeout kills run before ResultMessage arrives) - Clean up temp dir after extraction --- 
src/frontier_cs/gen/agent_interface.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index 97dc09ba..f87adf7a 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -12,7 +12,9 @@ import asyncio import json import logging +import shutil import sys +import tempfile import time from dataclasses import dataclass, field from pathlib import Path @@ -151,8 +153,13 @@ async def run_agent( from claude_agent_sdk import query, ClaudeAgentOptions from claude_agent_sdk.types import StreamEvent - prompt = build_agent_prompt(problem_dir) - workdir = Path(problem_dir) + # Copy problem dir to a temp working directory to avoid polluting the original. + # This also makes concurrent runs on the same problem safe. + tmpdir = tempfile.mkdtemp(prefix="agent_eval_") + workdir = Path(tmpdir) / "problem" + shutil.copytree(problem_dir, workdir) + + prompt = build_agent_prompt(str(workdir)) options = ClaudeAgentOptions( model=model, @@ -201,6 +208,14 @@ async def _run(): flush=True, ) + # Track token usage from streaming message_delta events. + # This is the only reliable source when timeout kills + # the run before ResultMessage arrives. + if event_type == "message_delta": + delta_usage = event.get("usage", {}) + if delta_usage.get("output_tokens"): + usage_out = delta_usage["output_tokens"] + elif isinstance(message, AssistantMessage): num_turns += 1 if transcript: @@ -268,6 +283,9 @@ async def _run(): status = "error" logger.error("Agent completed but no .cpp file found in %s", workdir) + # Clean up temp directory + shutil.rmtree(tmpdir, ignore_errors=True) + metadata = build_metadata( tokens_in=usage_in, tokens_out=usage_out, From f54d370b467de84a2a0c93880e53069bd12387dd Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Thu, 16 Apr 2026 02:56:58 +0000 Subject: [PATCH 06/16] feat: add prompt construction, test scripts, and sample I/O embedding for agent eval Build dynamic agent prompts from problem config (time/memory limits, subtask counts, interactive vs standard). Write test_all.sh and run_interactive.sh into agent workdir. Embed small sample I/O directly in prompt. Add CLAUDE.md with solving strategy guidance. --- algorithmic/README.md | 101 +++++ src/frontier_cs/gen/agent_interface.py | 497 +++++++++++++++++++++++-- tests/test_agent_interface.py | 171 ++++++++- 3 files changed, 730 insertions(+), 39 deletions(-) diff --git a/algorithmic/README.md b/algorithmic/README.md index 8737c55e..9299a7e3 100644 --- a/algorithmic/README.md +++ b/algorithmic/README.md @@ -97,6 +97,107 @@ sky launch -c algo-judge algorithmic/sky-judge.yaml --idle-minutes-to-autostop 1 frontier eval algorithmic 1 solution.cpp --judge-url http://$(sky status --ip algo-judge):8081 ``` +### Agent Evaluation + +Agent mode lets an AI agent solve problems iteratively — reading the statement, writing code, compiling, testing against samples, and refining — rather than generating a single-shot solution. + +Agents use the [Claude Agent SDK](https://github.com/anthropic/claude-agent-sdk) (Claude Code as a library). The agent gets a temporary copy of the problem directory with full tool access (shell, file I/O, compilation). 
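
Agent generation can also be driven programmatically through `generate_agent_solution`, the same entry point that `generate_solutions.py` calls. The sketch below is illustrative: the problem path is a placeholder, and the keyword arguments mirror the CLI flags.

```python
from pathlib import Path

from frontier_cs.gen.agent_interface import generate_agent_solution

# Placeholder path: point this at a real problem directory.
code, meta = generate_agent_solution(
    problem_dir="/abs/path/to/algorithmic/problems/0",
    model="claude-sonnet-4-5",  # base model; the "-agent" suffix is already stripped
    cost_limit=20.0,            # --agent-cost-limit (USD)
    timeout=1200.0,             # --agent-timeout (seconds)
    transcript_path=Path("generation_logs/example.transcript.jsonl"),
)
print(meta["status"], meta["cost_usd"], meta["turns"])
```

The returned `code` string is the extracted `solution.cpp` contents (empty on failure), and `meta` is the same dict that gets written to `.meta.json`.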
+ +#### Model naming convention + +Append `-agent` to any Claude model name to trigger agent mode: + +``` +claude-sonnet-4-5-20250514-agent # Agent mode with Sonnet 4.5 +claude-opus-4-6-20250610-agent # Agent mode with Opus 4.6 +``` + +The `-agent` suffix is stripped before passing the model to the SDK. The model prefix for output files includes `agent` (e.g., `claude4.5sonnetagent.cpp`), so agent and single-shot results never collide. + +#### Running agent evaluation + +```bash +cd algorithmic/scripts + +# Single model, all problems +python generate_solutions.py \ + --model claude-sonnet-4-5-20250514-agent \ + --judge-url http://localhost:8081 + +# Subset of problems, custom budget +python generate_solutions.py \ + --model claude-sonnet-4-5-20250514-agent \ + --problems 0,1,2,3 \ + --agent-timeout 1800 \ + --agent-cost-limit 30 \ + --judge-url http://localhost:8081 + +# Multiple variants per problem +python generate_solutions.py \ + --model claude-sonnet-4-5-20250514-agent \ + --indices 3 \ + --judge-url http://localhost:8081 +``` + +**Agent-specific CLI flags:** + +| Flag | Default | Description | +|------|---------|-------------| +| `--agent-timeout` | 1200 (20 min) | Wall-clock timeout per problem in seconds | +| `--agent-cost-limit` | 20.0 | Max cost per problem in USD | + +#### Output files + +For each problem/variant, agent mode produces three files: + +``` +solutions/{problem_id}/ +├── claude4.5sonnetagent.cpp # Extracted C++ solution +├── claude4.5sonnetagent.meta.json # Run metadata (cost, tokens, turns, status) +└── (in generation_logs/) + └── claude4.5sonnetagent_*.transcript.jsonl # Full agent transcript +``` + +**meta.json** fields: +- `tokens_in` / `tokens_out` — total token usage +- `cost_usd` — total API cost +- `time_seconds` — wall-clock time +- `turns` — number of agentic turns (tool-use round trips) +- `status` — `success`, `timeout`, `cost_limit`, or `error` + +#### Prerequisites + +1. **Claude Agent SDK**: `pip install claude-agent-sdk` (or `uv sync` if already in project deps) +2. **Claude Code CLI**: Must be installed and authenticated (`claude --version`) +3. **Judge server**: Running and accessible (see [Judge Server Configuration](#judge-server-configuration)) +4. **g++**: Available in PATH for the agent to compile solutions + +#### How it works + +1. The problem directory is copied to a temp working directory (concurrent-safe) +2. `testlib.h` is automatically copied from `judge/include/` if present (needed for interactive problems) +3. The agent receives a structured prompt with the problem path and workflow guidance +4. The agent iterates: reads the problem, writes code, compiles, tests against samples, and refines +5. On completion (or timeout), `solution.cpp` is extracted from the working directory +6. The temp directory is cleaned up; solution + metadata are saved + +#### Interactive problems + +Problems with `interactor.cc` (instead of `chk.cc`) are interactive — the solution communicates with a judge interactor via stdin/stdout. The agent prompt instructs it to: + +1. Compile the interactor using `g++ -std=gnu++17 -I. interactor.cc -o interactor` +2. Test locally via pipes (e.g., `mkfifo pipe; ./solution < pipe | ./interactor > pipe`) +3. `testlib.h` is provided automatically in the working directory + +Interactive problems are harder for agents because local testing requires building a pipe harness, which agents sometimes skip or get wrong. 
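
If the shell harness misbehaves, the same check can be driven from Python with ordinary pipes, which avoids fifo open-ordering issues entirely. This is a minimal sketch that assumes the testlib-style `./interactor <input> <output> <answer>` invocation used by `run_interactive.sh`, shown for sample 1:

```python
import subprocess

sol = subprocess.Popen(["./solution"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
inter = subprocess.Popen(
    ["./interactor", "testdata/1.in", "/dev/null", "testdata/1.ans"],
    stdin=sol.stdout,   # interactor reads what the solution prints
    stdout=sol.stdin,   # and feeds its replies back to the solution
)
# Close the parent's copies of the pipe ends so EOF propagates when either side exits.
sol.stdout.close()
sol.stdin.close()
inter.wait(timeout=120)  # raises TimeoutExpired on a deadlock (missing flush, wrong protocol)
sol.wait(timeout=120)
# testlib convention: exit 0 = accepted, 1 = wrong answer, 2 = presentation error
print("accepted" if inter.returncode == 0 else f"interactor exit {inter.returncode}")
```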
+ +#### Known limitations + +- **No extended thinking**: The Claude Agent SDK does not currently expose extended thinking controls. Enabling it may improve complex algorithmic reasoning. +- **Rewrite tendency**: Agents sometimes rewrite solutions from scratch after failures, losing working logic. The prompt mitigates this but doesn't eliminate it. +- **Interactive testing**: Agents frequently skip local testing for interactive problems, submitting untested code. +- **Algorithm ceiling**: For problems requiring non-trivial algorithmic insight (advanced DP, flow, geometry), agent iteration doesn't compensate for model capability gaps. + ### Creating Problems > For contributing problems to Frontier-CS (detailed file formats, CI requirements), see [CONTRIBUTING.md](../CONTRIBUTING.md#algorithmic-problems). diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index f87adf7a..138ba6fa 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -12,13 +12,17 @@ import asyncio import json import logging +import os import shutil +import stat import sys import tempfile import time from dataclasses import dataclass, field from pathlib import Path -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple + +import yaml logger = logging.getLogger(__name__) @@ -26,9 +30,209 @@ DEFAULT_COST_LIMIT_USD = 20.0 DEFAULT_TIMEOUT_SECONDS = 1200 # 20 minutes +# Max size of sample I/O to embed directly in the prompt (bytes). +# Larger inputs are left for the agent to read from disk. +_MAX_EMBED_SIZE = 4096 + + +def _read_problem_config(problem_dir: str) -> Dict[str, Any]: + """Read and parse config.yaml from a problem directory.""" + config_path = Path(problem_dir) / "config.yaml" + if config_path.is_file(): + return yaml.safe_load(config_path.read_text(encoding="utf-8")) or {} + return {} + + +def _collect_samples(problem_dir: str) -> List[Dict[str, str]]: + """Collect sample test cases from testdata/, sorted by number. + + Returns list of dicts with keys 'id', 'input', 'answer'. + Only includes samples where both .in and .ans exist and are small enough to embed. + """ + testdata = Path(problem_dir) / "testdata" + if not testdata.is_dir(): + return [] + + samples = [] + in_files = sorted(testdata.glob("*.in"), key=lambda p: int(p.stem) if p.stem.isdigit() else 0) + for in_file in in_files: + ans_file = in_file.with_suffix(".ans") + if not ans_file.is_file(): + continue + if in_file.stat().st_size > _MAX_EMBED_SIZE or ans_file.stat().st_size > _MAX_EMBED_SIZE: + continue + samples.append({ + "id": in_file.stem, + "input": in_file.read_text(encoding="utf-8"), + "answer": ans_file.read_text(encoding="utf-8"), + }) + return samples + + +def _format_samples(samples: List[Dict[str, str]], is_interactive: bool) -> str: + """Format sample test cases for inclusion in the prompt.""" + if not samples: + return "" + parts = ["\n## Sample test cases (embedded for convenience)\n"] + note = " (interactor judge input — NOT your stdin)" if is_interactive else "" + for s in samples: + parts.append(f"### Sample {s['id']}{note}") + parts.append(f"Input:\n```\n{s['input'].rstrip()}\n```") + parts.append(f"Expected output:\n```\n{s['answer'].rstrip()}\n```\n") + return "\n".join(parts) + + +# Shell script: compile solution.cpp and test against all sample cases. +# If chk.cc exists (special judge), uses it for verification instead of diff. 
_TEST_ALL_SH = r"""#!/bin/bash
+set -e
+echo "=== Compiling solution.cpp ==="
+g++ -std=gnu++17 -O2 -o solution solution.cpp
+echo "=== Compilation OK ==="
+
+# Compile checker if available (special judge)
+USE_CHECKER=0
+if [ -f "chk.cc" ]; then
+  echo "=== Compiling special judge (chk.cc) ==="
+  if g++ -std=gnu++17 -O2 -I. chk.cc -o checker 2>/dev/null; then
+    USE_CHECKER=1
+    echo "=== Checker compiled OK — using it instead of diff ==="
+  else
+    echo "=== Checker compilation failed — falling back to diff ==="
+  fi
+fi
+
+passed=0; failed=0; total=0
+for inf in testdata/*.in; do
+  [ -f "$inf" ] || continue
+  id=$(basename "$inf" .in)
+  ans="testdata/${id}.ans"
+  [ -f "$ans" ] || continue
+  total=$((total + 1))
+
+  # Run with timeout
+  if timeout 15 ./solution < "$inf" > "my_${id}.out" 2>"my_${id}.err"; then
+    if [ "$USE_CHECKER" -eq 1 ]; then
+      # Special judge: ./checker <input> <output> <answer>
+      checker_out=$(./checker "$inf" "my_${id}.out" "$ans" 2>&1) && chk_rc=$? || chk_rc=$?
+      if [ $chk_rc -eq 0 ]; then
+        echo "  Sample $id: PASS (checker: $checker_out)"
+        passed=$((passed + 1))
+      else
+        echo "  Sample $id: WRONG ANSWER (checker exit $chk_rc)"
+        echo "    Checker output: $checker_out"
+        failed=$((failed + 1))
+      fi
+    else
+      # Diff-based comparison (normalize whitespace)
+      if diff -q <(tr -s '[:space:]' '\n' < "my_${id}.out" | sed '/^$/d') \
+                 <(tr -s '[:space:]' '\n' < "$ans" | sed '/^$/d') >/dev/null 2>&1; then
+        echo "  Sample $id: PASS"
+        passed=$((passed + 1))
+      else
+        echo "  Sample $id: WRONG ANSWER"
+        echo "    Expected (first 5 lines):"
+        head -5 "$ans" | sed 's/^/      /'
+        echo "    Got (first 5 lines):"
+        head -5 "my_${id}.out" | sed 's/^/      /'
+        failed=$((failed + 1))
+      fi
+    fi
+  else
+    rc=$?
+    echo "  Sample $id: RUNTIME ERROR or TLE (exit $rc)"
+    [ -s "my_${id}.err" ] && head -3 "my_${id}.err" | sed 's/^/    stderr: /'
+    failed=$((failed + 1))
+  fi
+done
+
+echo "=== Results: $passed/$total passed ==="
+[ "$failed" -eq 0 ] && exit 0 || exit 1
+"""
+
+# Shell script: test solution against an interactor using named pipes.
+_RUN_INTERACTIVE_SH = r"""#!/bin/bash
+# Usage: ./run_interactive.sh [sample_id]   (default: 1)
+# Compiles solution.cpp and interactor.cc, then tests via pipe.
+# Exit codes: 0=accepted, 1=wrong answer, 2=presentation error, 3=build error, 4=timeout/crash
+
+SAMPLE=${1:-1}
+INF="testdata/${SAMPLE}.in"
+ANSF="testdata/${SAMPLE}.ans"
+
+if [ ! -f "$INF" ]; then
+  echo "Error: $INF not found"
+  exit 3
+fi
+
+# Compile only if binaries are missing or sources are newer
+if [ ! -f ./solution ] || [ solution.cpp -nt ./solution ]; then
+  echo "=== Compiling solution.cpp ==="
+  g++ -std=gnu++17 -O2 -o solution solution.cpp || { echo "Compilation failed"; exit 3; }
+fi
+
+if [ ! -f ./interactor ] || [ interactor.cc -nt ./interactor ]; then
+  echo "=== Compiling interactor ==="
+  g++ -std=gnu++17 -O2 -I. interactor.cc -o interactor || { echo "Interactor compilation failed"; exit 3; }
+fi
+
+# Create named pipes in current dir (avoids /tmp permission issues)
+PIPE_S2I=".pipe_s2i_$$"
+PIPE_I2S=".pipe_i2s_$$"
+rm -f "$PIPE_S2I" "$PIPE_I2S"
+mkfifo "$PIPE_S2I" "$PIPE_I2S"
+
+cleanup() { rm -f "$PIPE_S2I" "$PIPE_I2S" inter_stderr.tmp sol_stderr.tmp; }
+trap cleanup EXIT
+
+echo "=== Running sample $SAMPLE ==="
+
+# interactor: reads from solution's stdout via pipe, writes to solution's stdin via pipe
+# testlib interactors: argv = <inf> <ouf> [ans]
+# We use /dev/null for ouf (output file) since we only care about exit code
+timeout 120 ./interactor "$INF" /dev/null "$ANSF" < "$PIPE_S2I" > "$PIPE_I2S" 2>inter_stderr.tmp &
+INTER_PID=$!
+
+# Open the write end of $PIPE_S2I before the read end of $PIPE_I2S: a fifo open
+# blocks until the peer end is opened, so if both processes opened their read
+# ends first they would deadlock before exchanging a byte.
+timeout 120 ./solution > "$PIPE_S2I" < "$PIPE_I2S" 2>sol_stderr.tmp &
+SOL_PID=$!
+
+# Wait for both processes
+INTER_EXIT=0; SOL_EXIT=0
+wait $INTER_PID 2>/dev/null || INTER_EXIT=$?
+wait $SOL_PID 2>/dev/null || SOL_EXIT=$?
+
+# Report results
+if [ $INTER_EXIT -eq 0 ]; then
+  echo "  Sample $SAMPLE: ACCEPTED (interactor exit 0)"
+  [ -s inter_stderr.tmp ] && head -2 inter_stderr.tmp | sed 's/^/    interactor: /'
+  exit 0
+elif [ $INTER_EXIT -eq 1 ]; then
+  echo "  Sample $SAMPLE: WRONG ANSWER (interactor exit 1)"
+  [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/    interactor: /'
+  exit 1
+elif [ $INTER_EXIT -eq 2 ]; then
+  echo "  Sample $SAMPLE: PRESENTATION ERROR (interactor exit 2)"
+  [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/    interactor: /'
+  exit 2
+elif [ $INTER_EXIT -eq 124 ] || [ $INTER_EXIT -eq 137 ]; then
+  echo "  Sample $SAMPLE: TIMEOUT (120s exceeded)"
+  echo "    This usually means your solution deadlocked (missing flush? wrong protocol?)"
+  [ -s sol_stderr.tmp ] && head -3 sol_stderr.tmp | sed 's/^/    solution stderr: /'
+  exit 4
+else
+  echo "  Sample $SAMPLE: UNKNOWN (interactor exit $INTER_EXIT, solution exit $SOL_EXIT)"
+  [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/    interactor: /'
+  [ -s sol_stderr.tmp ] && head -3 sol_stderr.tmp | sed 's/^/    solution: /'
+  exit 4
+fi
+"""
+
 
 def build_agent_prompt(problem_dir: str) -> str:
-    """Construct the prompt given to the agent.
+    """Construct a problem-aware prompt for the agent.
+
+    Reads config.yaml to detect problem type (interactive vs standard, SPJ),
+    embeds small sample I/O directly, and provides tailored workflow guidance.
 
     Args:
         problem_dir: Absolute path to the problem directory.
@@ -36,40 +240,237 @@
     Returns:
         The prompt string for the agent.
     """
-    return f"""You are solving a competitive programming problem.
+    config = _read_problem_config(problem_dir)
+    is_interactive = config.get("type") == "interactive"
+    has_checker = "checker" in config
+    time_limit = config.get("time", "?")
+    memory_limit = config.get("memory", "?")
+    subtasks = config.get("subtasks", [])
+    total_cases = sum(s.get("n_cases", 0) for s in subtasks) if subtasks else "?"
+    samples = _collect_samples(problem_dir)
+
+    # Base info
+    parts = [f"""You are solving a competitive programming problem.
Problem directory: {problem_dir} -- Read statement.txt for the problem description -- testdata/ contains sample test cases (*.in, *.ans), but these are only a subset -- Your solution will be evaluated against a larger hidden test suite -- You can compile with g++, run against the available samples, and iterate -- config.yaml has time/memory limits — respect them in your solution - -Submit your final solution as solution.cpp in the current working directory.""" +- Read statement.txt for the full problem description +- Time limit: {time_limit}, Memory limit: {memory_limit} +- Total hidden test cases: {total_cases} (your score = fraction passed) +- testdata/ contains sample test cases — these are a SUBSET of the hidden tests"""] + + # Problem type specific guidance + if is_interactive: + parts.append(""" +## Problem type: INTERACTIVE + +This is an interactive problem. Your solution communicates with a judge interactor +via stdin/stdout. You do NOT read from files — you read responses from the interactor +and write queries/answers to stdout. + +Key files provided: +- interactor.cc — the judge interactor (uses testlib.h, both provided) +- testdata/*.in — interactor input seeds (NOT your stdin) + +**CRITICAL for interactive problems:** +- You MUST flush stdout after EVERY output line: use `cout << endl;` or `cout << flush;` +- Read the interactor source code to understand the exact protocol (what it sends, what it expects) +- Count your queries carefully against the stated limit + +**Testing interactive solutions locally:** +Use the provided `./run_interactive.sh` script: +```bash +./run_interactive.sh 1 # Test with sample 1 +./run_interactive.sh 2 # Test with sample 2 +# Run all samples: +for i in testdata/*.in; do ./run_interactive.sh $(basename $i .in); done +``` + +If `run_interactive.sh` times out (exit code 4), it usually means a deadlock: +- Missing `flush` / `endl` on your output +- Reading when the interactor expects you to write, or vice versa +- Exceeding the query limit (interactor stops responding) + +**Fallback testing:** If the shell script doesn't work, write a Python wrapper: +```python +import subprocess, os +proc_sol = subprocess.Popen(['./solution'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) +proc_int = subprocess.Popen(['./interactor', 'testdata/1.in', '/dev/null', 'testdata/1.ans'], + stdin=proc_sol.stdout, stdout=proc_sol.stdin) +proc_int.wait(); proc_sol.wait() +print(f"interactor exit: {proc_int.returncode}") +``` + +IMPORTANT: You MUST test your solution locally before finalizing. Do NOT submit untested code.""") + else: + checker_note = "" + if has_checker: + checker_note = """ +Note: This problem has a SPECIAL JUDGE (chk.cc) — multiple valid outputs may be accepted. +`test_all.sh` will automatically compile and use the checker for validation. +If the checker reports PASS but the output looks different from the .ans file, that's fine.""" + + parts.append(f""" +## Problem type: {"SPECIAL JUDGE (multiple valid outputs accepted)" if has_checker else "STANDARD"} + +**Testing your solution locally:** +Use the provided `./test_all.sh` script: +```bash +./test_all.sh # Compiles solution.cpp and runs against ALL samples +``` +This compiles, runs each sample, and compares output. Always run this before finalizing.{checker_note}""") + + # Scoring context + parts.append(f""" +## Scoring + +Your score is the fraction of hidden test cases passed (0-100%). 
+- There are {total_cases} hidden test cases total +- Partial credit counts — passing 7/10 cases = 70% score +- A correct-but-slow solution that passes small cases is MUCH better than a broken fast one +- Prioritize CORRECTNESS over optimality. Get a working solution first, then optimize.""") + + # Embed samples if small enough + sample_text = _format_samples(samples, is_interactive) + if sample_text: + parts.append(sample_text) + elif samples: + parts.append("\n(Sample inputs are large — read them from testdata/ directory.)\n") + + # Workflow + parts.append("""## Workflow + +1. Read the FULL problem statement carefully. Re-read the constraints and edge cases. +2. Read ALL sample test cases and understand the expected I/O format. +3. Design your algorithm. Think about time complexity vs the constraints. +4. Write a SIMPLE correct solution first — brute force is fine for a first version. +5. Compile and test against ALL samples using the provided test script. +6. If samples fail: debug by examining the diff, don't just rewrite everything. +7. Once samples pass: think about edge cases and whether your algorithm handles large inputs. +8. Optimize only after correctness is established. + +**Critical rules:** +- Do NOT rewrite your solution from scratch more than once. Incremental edits preserve working logic. +- Do NOT skip local testing. Every change must be tested before you move on. +- Do NOT submit without running test_all.sh (or run_interactive.sh for interactive). +- If you TLE on large cases, profile the bottleneck — don't simplify the entire algorithm. + +**Retreat strategy — know when to simplify:** +- If you've been debugging the SAME bug for more than 5 edit-test cycles without progress, + STOP and switch to a fundamentally simpler approach. A correct brute-force that passes + small cases is worth more than a broken optimized solution that passes nothing. +- If your approach is off by a small constant (e.g., exceeding a limit by 1), consider whether + a completely different algorithm would avoid the issue rather than patching endlessly. +- Remember: partial credit exists. A solution scoring 30% is infinitely better than 0%. + When in doubt, submit what works even if it's suboptimal. 
+ +Submit your final solution as solution.cpp in the current working directory.""") + + return "\n".join(parts) + + +def _write_helper_scripts(workdir: Path, is_interactive: bool) -> None: + """Write test helper scripts to the agent's working directory.""" + # Always provide test_all.sh for non-interactive + test_all = workdir / "test_all.sh" + test_all.write_text(_TEST_ALL_SH, encoding="utf-8") + test_all.chmod(test_all.stat().st_mode | stat.S_IEXEC) + + if is_interactive: + run_inter = workdir / "run_interactive.sh" + run_inter.write_text(_RUN_INTERACTIVE_SH, encoding="utf-8") + run_inter.chmod(run_inter.stat().st_mode | stat.S_IEXEC) + + +def _write_workdir_claude_md(workdir: Path, is_interactive: bool) -> None: + """Write a CLAUDE.md to the workdir so Claude Code picks up behavioral guidance.""" + lines = [ + "# Agent Eval — Working Directory", + "", + "You are solving a competitive programming problem in this directory.", + "", + "## Rules", + "", + "- Your ONLY deliverable is `solution.cpp` in this directory.", + "- Use C++17 (g++ -std=gnu++17).", + "- Always compile with `-O2` for performance testing.", + "- Test against ALL sample cases before considering your solution done.", + "- Read the problem statement COMPLETELY before writing any code.", + "", + "## Testing", + "", + ] + if is_interactive: + lines += [ + "This is an INTERACTIVE problem. Use `./run_interactive.sh N` to test sample N.", + "Do NOT skip interactive testing — protocol bugs are the #1 failure mode.", + "", + "### Interactive protocol checklist", + "- `cout << endl;` or `cout << flush;` after EVERY line you output", + "- Read the interactor source code to know the exact send/receive order", + "- Count queries against the stated limit", + "- If run_interactive.sh times out: you likely have a deadlock (missing flush or wrong protocol)", + "- Fallback: write a Python subprocess wrapper if the shell script fails", + "", + ] + else: + lines += [ + "Use `./test_all.sh` to compile and test against all samples.", + "If chk.cc exists, test_all.sh uses it as a special judge automatically.", + "Fix any failing samples before moving on to optimization.", + "", + ] + lines += [ + "## Common mistakes to avoid", + "", + "- Forgetting to flush stdout in interactive problems", + "- Off-by-one errors in array indexing (0-indexed vs 1-indexed)", + "- Integer overflow — use `long long` for anything that could exceed 2^31", + "- Reading input in the wrong order or format", + "- Not handling the edge case where N=1 or the input is minimal", + "- Rewriting the entire solution when a small fix would work", + "", + "## When to retreat", + "", + "- If you've edited and tested 5+ times for the same bug without progress, STOP.", + "- Switch to a simpler algorithm that is guaranteed correct, even if slower.", + "- A correct brute-force scoring 30% beats a broken clever solution scoring 0%.", + "- Partial credit is real: every test case you pass counts.", + "", + ] + (workdir / "CLAUDE.md").write_text("\n".join(lines), encoding="utf-8") def extract_solution_cpp(workdir: Path) -> str: """Extract solution.cpp from the agent working directory. - Looks for solution.cpp first, then falls back to any .cpp file. + Searches for solution.cpp in the workdir, its parent (the tmpdir root), + and recursively. Falls back to any .cpp file that looks like a solution. Args: - workdir: The agent's working directory. + workdir: The agent's working directory (typically tmpdir/problem). Returns: The C++ source code, or empty string if not found. 
""" - # Primary: solution.cpp - sol = workdir / "solution.cpp" - if sol.is_file(): - return sol.read_text(encoding="utf-8") - - # Fallback: any .cpp file (agent might have used a different name) - cpp_files = list(workdir.glob("*.cpp")) - if cpp_files: - # Pick the most recently modified one - newest = max(cpp_files, key=lambda p: p.stat().st_mtime) - return newest.read_text(encoding="utf-8") + # Search these directories in priority order + search_dirs = [workdir, workdir.parent] + + for d in search_dirs: + sol = d / "solution.cpp" + if sol.is_file(): + return sol.read_text(encoding="utf-8") + + # Fallback: any .cpp file in workdir or parent (excluding problem-provided files) + problem_files = {p.name for p in workdir.glob("**/*.cpp") + if p.stat().st_mtime < workdir.stat().st_mtime} + for d in search_dirs: + cpp_files = [ + p for p in d.glob("*.cpp") + if p.name not in problem_files and p.name != "chk.cc" + ] + if cpp_files: + newest = max(cpp_files, key=lambda p: p.stat().st_mtime) + return newest.read_text(encoding="utf-8") return "" @@ -82,6 +483,8 @@ def build_metadata( time_seconds: float, turns: int, status: str, + model: str, + prompt: str, ) -> Dict[str, Any]: """Build the metadata dict for an agent run. @@ -92,11 +495,15 @@ def build_metadata( time_seconds: Wall-clock time in seconds. turns: Number of agentic turns (tool-use round trips). status: One of "success", "timeout", "cost_limit", "error". + model: The model name passed to the agent SDK. + prompt: The full prompt sent to the agent. Returns: Metadata dictionary. """ return { + "model": model, + "prompt": prompt, "tokens_in": tokens_in, "tokens_out": tokens_out, "cost_usd": round(cost_usd, 4), @@ -153,12 +560,35 @@ async def run_agent( from claude_agent_sdk import query, ClaudeAgentOptions from claude_agent_sdk.types import StreamEvent + # Claude Code CLI uses short model names, not full API model IDs. + # Map common API IDs to CLI-accepted names. + CLI_MODEL_MAP = { + "claude-sonnet-4-5-20250514": "claude-sonnet-4-5", + "claude-sonnet-4-6-20250610": "claude-sonnet-4-6", + "claude-opus-4-6-20250610": "claude-opus-4-6", + "claude-haiku-4-5-20251001": "claude-haiku-4-5", + } + model = CLI_MODEL_MAP.get(model, model) + + # Read problem config before copying to detect type. + config = _read_problem_config(problem_dir) + is_interactive = config.get("type") == "interactive" + # Copy problem dir to a temp working directory to avoid polluting the original. # This also makes concurrent runs on the same problem safe. tmpdir = tempfile.mkdtemp(prefix="agent_eval_") workdir = Path(tmpdir) / "problem" shutil.copytree(problem_dir, workdir) + # Provide testlib.h so agents can compile interactors/checkers for local testing. + testlib_src = Path(problem_dir).parent.parent / "judge" / "include" / "testlib.h" + if testlib_src.is_file(): + shutil.copy2(testlib_src, workdir / "testlib.h") + + # Write helper scripts and CLAUDE.md into workdir. 
+ _write_helper_scripts(workdir, is_interactive) + _write_workdir_claude_md(workdir, is_interactive) + prompt = build_agent_prompt(str(workdir)) options = ClaudeAgentOptions( @@ -248,9 +678,12 @@ async def _run(): elif isinstance(message, ResultMessage): total_cost = message.total_cost_usd if message.usage: - usage_in = message.usage.get("input_tokens", usage_in) - usage_out = message.usage.get("output_tokens", usage_out) - num_turns = message.num_turns + usage_in = max(usage_in, message.usage.get("input_tokens", 0)) + usage_out = max(usage_out, message.usage.get("output_tokens", 0)) + # SDK may send multiple ResultMessages (main run + follow-ups). + # Keep the highest turn count to avoid a follow-up (turns=1) + # clobbering the real value. + num_turns = max(num_turns, message.num_turns) if transcript: transcript.log({ "type": "result", @@ -267,8 +700,14 @@ async def _run(): status = "timeout" logger.warning("Agent timed out after %.0fs", timeout) except Exception as e: - status = "error" - logger.error("Agent error: %s", e) + # Claude CLI often exits with code 1 after a successful run. + # If we already received a ResultMessage (total_cost is set), + # treat this as a successful completion, not an error. + if total_cost is not None: + logger.info("Agent completed (post-result CLI exit: %s)", e) + else: + status = "error" + logger.error("Agent error: %s", e) if transcript: transcript.log({"type": "error", "error": str(e)}) finally: @@ -293,6 +732,8 @@ async def _run(): time_seconds=elapsed, turns=num_turns, status=status, + model=model, + prompt=prompt, ) return code, metadata diff --git a/tests/test_agent_interface.py b/tests/test_agent_interface.py index be163ba4..0c771522 100644 --- a/tests/test_agent_interface.py +++ b/tests/test_agent_interface.py @@ -26,20 +26,97 @@ def test_agent_is_not_reasoning_model(): import json +import os import tempfile from pathlib import Path -def test_build_agent_prompt(): - """Agent prompt includes problem dir and key instructions.""" +def _make_problem_dir(tmpdir: str, *, interactive: bool = False, samples: int = 2) -> Path: + """Create a minimal problem directory for testing.""" + pdir = Path(tmpdir) / "problems" / "0" + pdir.mkdir(parents=True) + (pdir / "statement.txt").write_text("# Test Problem\nSolve it.\n") + + config = { + "type": "interactive" if interactive else "default", + "time": "1s", + "memory": "256m", + "subtasks": [{"score": 100, "n_cases": 3}], + } + if interactive: + config["interactor"] = "interactor.cc" + (pdir / "interactor.cc").write_text("// interactor\n") + else: + config["checker"] = "chk.cc" + + import yaml + (pdir / "config.yaml").write_text(yaml.dump(config)) + + testdata = pdir / "testdata" + testdata.mkdir() + for i in range(1, samples + 1): + (testdata / f"{i}.in").write_text(f"{i}\n") + (testdata / f"{i}.ans").write_text(f"{i * 2}\n") + + # testlib.h at judge/include/ level + judge_inc = Path(tmpdir) / "judge" / "include" + judge_inc.mkdir(parents=True, exist_ok=True) + (judge_inc / "testlib.h").write_text("// testlib stub\n") + + return pdir + + +def test_build_agent_prompt_standard(): + """Standard problem prompt includes test script and scoring info.""" + from frontier_cs.gen.agent_interface import build_agent_prompt + + with tempfile.TemporaryDirectory() as tmpdir: + pdir = _make_problem_dir(tmpdir) + prompt = build_agent_prompt(str(pdir)) + assert "test_all.sh" in prompt + assert "STANDARD" in prompt or "SPECIAL JUDGE" in prompt + assert "solution.cpp" in prompt + assert "Scoring" in prompt + assert "fraction" in 
prompt.lower() or "partial" in prompt.lower() + # Samples should be embedded (they're tiny) + assert "Sample 1" in prompt + + +def test_build_agent_prompt_interactive(): + """Interactive problem prompt includes interactor guidance and run_interactive.sh.""" + from frontier_cs.gen.agent_interface import build_agent_prompt + + with tempfile.TemporaryDirectory() as tmpdir: + pdir = _make_problem_dir(tmpdir, interactive=True) + prompt = build_agent_prompt(str(pdir)) + assert "INTERACTIVE" in prompt + assert "run_interactive.sh" in prompt + assert "flush" in prompt.lower() or "pipe" in prompt.lower() + + +def test_build_agent_prompt_embeds_small_samples(): + """Small samples are embedded directly in the prompt.""" from frontier_cs.gen.agent_interface import build_agent_prompt - prompt = build_agent_prompt("/tmp/fake_problem") - assert "/tmp/fake_problem" in prompt - assert "statement.txt" in prompt - assert "testdata/" in prompt - assert "hidden test suite" in prompt - assert "solution.cpp" in prompt + with tempfile.TemporaryDirectory() as tmpdir: + pdir = _make_problem_dir(tmpdir, samples=2) + prompt = build_agent_prompt(str(pdir)) + # The sample content should appear in the prompt + assert "Sample 1" in prompt + assert "Sample 2" in prompt + + +def test_build_agent_prompt_skips_large_samples(): + """Large samples are NOT embedded in the prompt.""" + from frontier_cs.gen.agent_interface import build_agent_prompt, _MAX_EMBED_SIZE + + with tempfile.TemporaryDirectory() as tmpdir: + pdir = _make_problem_dir(tmpdir, samples=1) + # Make the input file larger than the embed threshold + (pdir / "testdata" / "1.in").write_text("x" * (_MAX_EMBED_SIZE + 1)) + prompt = build_agent_prompt(str(pdir)) + # Should NOT contain the embedded content + assert "Sample 1" not in prompt def test_extract_cpp_from_workdir(): @@ -47,18 +124,38 @@ def test_extract_cpp_from_workdir(): from frontier_cs.gen.agent_interface import extract_solution_cpp with tempfile.TemporaryDirectory() as tmpdir: - sol_path = Path(tmpdir) / "solution.cpp" + workdir = Path(tmpdir) / "problem" + workdir.mkdir() + sol_path = workdir / "solution.cpp" sol_path.write_text('#include \nint main() { return 0; }') - code = extract_solution_cpp(Path(tmpdir)) + code = extract_solution_cpp(workdir) assert "#include " in code +def test_extract_cpp_from_parent(): + """Extract solution.cpp when agent writes it to tmpdir root instead of workdir.""" + from frontier_cs.gen.agent_interface import extract_solution_cpp + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) / "problem" + workdir.mkdir() + # Agent wrote solution.cpp in the parent (tmpdir), not in workdir + sol_path = Path(tmpdir) / "solution.cpp" + sol_path.write_text('#include \nint main() {}') + code = extract_solution_cpp(workdir) + assert "#include " in code + + def test_extract_cpp_missing(): """Return empty string if no solution.cpp found.""" from frontier_cs.gen.agent_interface import extract_solution_cpp with tempfile.TemporaryDirectory() as tmpdir: - code = extract_solution_cpp(Path(tmpdir)) + # Use a nested dir to mimic real layout (tmpdir/problem) and avoid + # picking up stray .cpp files from the system /tmp. 
+ workdir = Path(tmpdir) / "problem" + workdir.mkdir() + code = extract_solution_cpp(workdir) assert code == "" @@ -73,6 +170,8 @@ def test_build_metadata(): time_seconds=300.5, turns=15, status="success", + model="claude-sonnet-4-5", + prompt="You are solving a competitive programming problem.", ) assert meta["tokens_in"] == 100000 assert meta["tokens_out"] == 25000 @@ -80,3 +179,53 @@ def test_build_metadata(): assert meta["time_seconds"] == 300.5 assert meta["turns"] == 15 assert meta["status"] == "success" + assert meta["model"] == "claude-sonnet-4-5" + assert meta["prompt"] == "You are solving a competitive programming problem." + + +def test_write_helper_scripts_standard(): + """Standard problem gets test_all.sh but not run_interactive.sh.""" + from frontier_cs.gen.agent_interface import _write_helper_scripts + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) + _write_helper_scripts(workdir, is_interactive=False) + assert (workdir / "test_all.sh").is_file() + assert os.access(workdir / "test_all.sh", os.X_OK) + assert not (workdir / "run_interactive.sh").is_file() + + +def test_write_helper_scripts_interactive(): + """Interactive problem gets both scripts.""" + from frontier_cs.gen.agent_interface import _write_helper_scripts + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) + _write_helper_scripts(workdir, is_interactive=True) + assert (workdir / "test_all.sh").is_file() + assert (workdir / "run_interactive.sh").is_file() + assert os.access(workdir / "run_interactive.sh", os.X_OK) + + +def test_write_workdir_claude_md_standard(): + """CLAUDE.md for standard problems mentions test_all.sh.""" + from frontier_cs.gen.agent_interface import _write_workdir_claude_md + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) + _write_workdir_claude_md(workdir, is_interactive=False) + content = (workdir / "CLAUDE.md").read_text() + assert "test_all.sh" in content + assert "run_interactive.sh" not in content + + +def test_write_workdir_claude_md_interactive(): + """CLAUDE.md for interactive problems mentions flush and run_interactive.sh.""" + from frontier_cs.gen.agent_interface import _write_workdir_claude_md + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) + _write_workdir_claude_md(workdir, is_interactive=True) + content = (workdir / "CLAUDE.md").read_text() + assert "run_interactive.sh" in content + assert "flush" in content From 170e7794d0fcdaf84dd6f87d08c6ca524ebe03fb Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Thu, 16 Apr 2026 04:06:23 +0000 Subject: [PATCH 07/16] feat: add parity mode for Harbor alignment and infra_git_hash tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parity mode (--parity flag) strips all test data, helper scripts, checker, and interactor from the agent workspace — matching the Harbor adapter setup where agents must self-test via brute-force cross-validation (对拍). 
Changes: - agent_interface.py: parity-aware prompt, workspace setup, CLAUDE.md, _get_infra_git_hash(), and enriched build_metadata (timestamp, parity flag) - generate_solutions.py: --parity CLI argument - tests: parity prompt validation (standard + interactive) - docs: solutions repo separation plan (infra_git_hash in meta.json) - .gitignore: exclude .claude/ directory - pyproject.toml: add pytest dev dependency --- .gitignore | 1 + algorithmic/scripts/generate_solutions.py | 3 + docs/solutions-repo-separation.md | 43 ++++++ pyproject.toml | 5 + src/frontier_cs/gen/agent_interface.py | 171 ++++++++++++++++++++-- tests/test_agent_interface.py | 31 ++++ 6 files changed, 241 insertions(+), 13 deletions(-) create mode 100644 docs/solutions-repo-separation.md diff --git a/.gitignore b/.gitignore index f1ac09f1..53753de8 100644 --- a/.gitignore +++ b/.gitignore @@ -41,6 +41,7 @@ berkeley-function-call-leaderboard/bfcl_eval/scripts/ground_truth_conversation/ # Ignore lock files berkeley-function-call-leaderboard/**/*.lock +.claude/ .direnv/ .venv diff --git a/algorithmic/scripts/generate_solutions.py b/algorithmic/scripts/generate_solutions.py index 7d01b5b6..c9181137 100644 --- a/algorithmic/scripts/generate_solutions.py +++ b/algorithmic/scripts/generate_solutions.py @@ -305,6 +305,8 @@ def main(): help="Agent timeout in seconds (default: 1200 = 20 min)") parser.add_argument("--agent-cost-limit", type=float, default=20.0, help="Agent max cost per problem in USD (default: 20)") + parser.add_argument("--parity", action="store_true", + help="Harbor parity mode: no test data or helper scripts given to agent") args = parser.parse_args() @@ -525,6 +527,7 @@ def execute_task(task: GenerationTask) -> Tuple[str, str, Optional[str], str, Op cost_limit=args.agent_cost_limit, timeout=args.agent_timeout, transcript_path=transcript_path, + parity=args.parity, ) # Save metadata alongside solution diff --git a/docs/solutions-repo-separation.md b/docs/solutions-repo-separation.md new file mode 100644 index 00000000..f64a201e --- /dev/null +++ b/docs/solutions-repo-separation.md @@ -0,0 +1,43 @@ +# Solutions Repo Separation + +## Problem + +Infra code (agent_interface.py, generate_solutions.py) and generated solutions (.cpp, .meta.json) live in the same repo. This causes: + +- Can't freely rebase/restructure infra without worrying about losing uncommitted solutions +- Git diffs polluted by large generated files +- No traceability — can't tell which version of infra generated a given solution + +## Decision + +1. **Move solutions to `FrontierCS/Frontier-CS-Result`** (already exists for storing results). +2. **Add `infra_git_hash` to `.meta.json`** so each solution records which commit of this repo generated it. +3. **Keep existing naming**: `{model_prefix}.cpp`, `{model_prefix}_{variant}.cpp`. Indices (`_0`, `_1`, `_2`) remain multi-variant within a single run. +4. **Version via git commits** in the result repo. Re-running overwrites files, but commit before re-running to preserve history. 
+ +## meta.json additions + +```json +{ + "model": "claude-sonnet-4-5-20250514", + "cost_usd": 0.55, + "time_seconds": 337, + "turns": 59, + "tokens_in": 125000, + "tokens_out": 18000, + "status": "success", + "infra_git_hash": "f54d370b", + "timestamp": "2026-04-15T14:30:22Z" +} +``` + +## What stays in this repo + +- `src/frontier_cs/gen/` — generation and agent infra code +- `algorithmic/problems/` — problem definitions +- `algorithmic/judge/` — judge server + +## What moves to Frontier-CS-Result + +- `algorithmic/solutions/` — all generated solution files +- `algorithmic/AGENT_EVAL_RESULTS.md` — eval result summaries diff --git a/pyproject.toml b/pyproject.toml index 317700e5..62ae3466 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,3 +29,8 @@ package = true [build-system] requires = ["hatchling"] build-backend = "hatchling.build" + +[dependency-groups] +dev = [ + "pytest>=9.0.3", +] diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index 138ba6fa..4bc4e879 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -15,6 +15,7 @@ import os import shutil import stat +import subprocess import sys import tempfile import time @@ -228,14 +229,18 @@ def _format_samples(samples: List[Dict[str, str]], is_interactive: bool) -> str: """ -def build_agent_prompt(problem_dir: str) -> str: +def build_agent_prompt(problem_dir: str, *, parity: bool = False) -> str: """Construct a problem-aware prompt for the agent. Reads config.yaml to detect problem type (interactive vs standard, SPJ), embeds small sample I/O directly, and provides tailored workflow guidance. + In parity mode, no test data or helper scripts are referenced — the agent + must write its own tests. This matches the Harbor adapter setup. + Args: problem_dir: Absolute path to the problem directory. + parity: If True, build a prompt that assumes no test data or scripts. Returns: The prompt string for the agent. @@ -247,6 +252,13 @@ def build_agent_prompt(problem_dir: str) -> str: memory_limit = config.get("memory", "?") subtasks = config.get("subtasks", []) total_cases = sum(s.get("n_cases", 0) for s in subtasks) if subtasks else "?" + + if parity: + return _build_parity_prompt( + problem_dir, config, is_interactive, has_checker, + time_limit, memory_limit, total_cases, + ) + samples = _collect_samples(problem_dir) # Base info @@ -368,6 +380,92 @@ def build_agent_prompt(problem_dir: str) -> str: return "\n".join(parts) +def _build_parity_prompt( + problem_dir: str, + config: Dict[str, Any], + is_interactive: bool, + has_checker: bool, + time_limit: str, + memory_limit: str, + total_cases: Any, +) -> str: + """Build a prompt for parity mode (no test data, no helper scripts). + + Matches the Harbor adapter setup: agent gets only the problem statement + and must write its own tests. + """ + parts = [f"""You are solving a competitive programming problem. + +Problem directory: {problem_dir} +- Read statement.txt for the full problem description +- Time limit: {time_limit}, Memory limit: {memory_limit} +- Total test cases: {total_cases} (your score = fraction passed) +- Scoring is partial: 0-100% based on test cases passed"""] + + if is_interactive: + parts.append(""" +## Problem type: INTERACTIVE + +This is an interactive problem. Your solution communicates with a hidden judge +via stdin/stdout. You do NOT read from files. 
+ +**CRITICAL:** +- Flush stdout after EVERY output line: `cout << endl;` or `cout << flush;` +- Read the problem statement carefully for the exact query/response protocol +- Count your queries against the stated limit""") + elif has_checker: + parts.append(""" +## Problem type: SPECIAL JUDGE + +This problem accepts multiple valid outputs. Your solution will be checked by +a special judge, not by exact string matching.""") + else: + parts.append(""" +## Problem type: STANDARD + +Your output must match the expected output exactly (whitespace-normalized).""") + + parts.append(f""" +## Scoring + +Your score is the fraction of test cases passed (0-100%). +- There are {total_cases} test cases total +- Partial credit counts — passing 7/10 cases = 70% score +- A correct-but-slow solution that passes small cases is MUCH better than a broken fast one +- Prioritize CORRECTNESS over optimality + +## Self-testing + +No test data or test scripts are provided. You must validate your own solution: + +1. **Write a brute-force reference solution** (even if O(n!) or exponential) that you are + confident is correct for small inputs. +2. **Write a random test generator** that produces valid inputs within the problem constraints. +3. **Cross-validate (对拍):** Run both solutions on hundreds of random small inputs and compare + outputs. Fix any discrepancies by debugging your main solution against the brute-force. +4. **Stress test:** Generate larger random inputs to check for TLE, MLE, or crashes. +5. **Edge cases:** Manually test minimum inputs (N=1, empty, etc.) and boundary values. + +This self-testing approach is standard competitive programming practice. Do NOT skip it. + +## Workflow + +1. Read the FULL problem statement carefully. Re-read the constraints and edge cases. +2. Understand the I/O format from the examples in the problem statement. +3. Design your algorithm. Think about time complexity vs the constraints. +4. Write a SIMPLE correct solution first — brute force is fine for a first version. +5. Write a separate brute-force and test generator, then cross-validate. +6. Once confident in correctness: optimize for performance if needed. +7. Stress test with larger inputs before finalizing. + +**Retreat strategy:** If stuck on the same bug for 5+ cycles, switch to a simpler +algorithm. A correct brute-force scoring 30% beats a broken solution scoring 0%. 
+ +Submit your final solution as solution.cpp in the current working directory.""") + + return "\n".join(parts) + + def _write_helper_scripts(workdir: Path, is_interactive: bool) -> None: """Write test helper scripts to the agent's working directory.""" # Always provide test_all.sh for non-interactive @@ -381,7 +479,7 @@ def _write_helper_scripts(workdir: Path, is_interactive: bool) -> None: run_inter.chmod(run_inter.stat().st_mode | stat.S_IEXEC) -def _write_workdir_claude_md(workdir: Path, is_interactive: bool) -> None: +def _write_workdir_claude_md(workdir: Path, is_interactive: bool, *, parity: bool = False) -> None: """Write a CLAUDE.md to the workdir so Claude Code picks up behavioral guidance.""" lines = [ "# Agent Eval — Working Directory", @@ -393,13 +491,27 @@ def _write_workdir_claude_md(workdir: Path, is_interactive: bool) -> None: "- Your ONLY deliverable is `solution.cpp` in this directory.", "- Use C++17 (g++ -std=gnu++17).", "- Always compile with `-O2` for performance testing.", - "- Test against ALL sample cases before considering your solution done.", "- Read the problem statement COMPLETELY before writing any code.", "", "## Testing", "", ] - if is_interactive: + if parity: + lines += [ + "No test data or test scripts are provided.", + "Write your own brute-force solution + random test generator to cross-validate.", + "This is standard competitive programming practice (对拍).", + "", + ] + if is_interactive: + lines += [ + "This is an INTERACTIVE problem.", + "- `cout << endl;` or `cout << flush;` after EVERY line you output", + "- Read the problem statement to understand the exact protocol", + "- Count queries against the stated limit", + "", + ] + elif is_interactive: lines += [ "This is an INTERACTIVE problem. Use `./run_interactive.sh N` to test sample N.", "Do NOT skip interactive testing — protocol bugs are the #1 failure mode.", @@ -475,6 +587,19 @@ def extract_solution_cpp(workdir: Path) -> str: return "" +def _get_infra_git_hash() -> str: + """Get the current git commit hash of this repo (infra code).""" + try: + result = subprocess.run( + ["git", "rev-parse", "--short", "HEAD"], + capture_output=True, text=True, timeout=5, + cwd=Path(__file__).parent, + ) + return result.stdout.strip() if result.returncode == 0 else "unknown" + except Exception: + return "unknown" + + def build_metadata( *, tokens_in: int, @@ -485,6 +610,7 @@ def build_metadata( status: str, model: str, prompt: str, + parity: bool = False, ) -> Dict[str, Any]: """Build the metadata dict for an agent run. @@ -497,6 +623,7 @@ def build_metadata( status: One of "success", "timeout", "cost_limit", "error". model: The model name passed to the agent SDK. prompt: The full prompt sent to the agent. + parity: Whether this run used parity mode. Returns: Metadata dictionary. @@ -510,6 +637,9 @@ def build_metadata( "time_seconds": round(time_seconds, 2), "turns": turns, "status": status, + "infra_git_hash": _get_infra_git_hash(), + "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), + "parity": parity, } @@ -544,6 +674,7 @@ async def run_agent( cost_limit: float = DEFAULT_COST_LIMIT_USD, timeout: float = DEFAULT_TIMEOUT_SECONDS, transcript_path: Optional[Path] = None, + parity: bool = False, ) -> Tuple[str, Dict[str, Any]]: """Run the agent to solve a problem. @@ -553,6 +684,7 @@ async def run_agent( cost_limit: Maximum cost in USD. timeout: Maximum wall-clock time in seconds. transcript_path: Path for JSONL transcript log. None to skip. 
+        parity: If True, strip test data and helper scripts (Harbor parity mode).

     Returns:
         Tuple of (cpp_code, metadata_dict).
@@ -578,18 +710,27 @@ async def run_agent(
     # This also makes concurrent runs on the same problem safe.
     tmpdir = tempfile.mkdtemp(prefix="agent_eval_")
     workdir = Path(tmpdir) / "problem"
-    shutil.copytree(problem_dir, workdir)

-    # Provide testlib.h so agents can compile interactors/checkers for local testing.
-    testlib_src = Path(problem_dir).parent.parent / "judge" / "include" / "testlib.h"
-    if testlib_src.is_file():
-        shutil.copy2(testlib_src, workdir / "testlib.h")
+    if parity:
+        # Parity mode: copy only statement.txt, config.yaml, and tag.txt — no
+        # test data, no checker, no interactor. Agent must self-test.
+        workdir.mkdir(parents=True)
+        for fname in ("statement.txt", "config.yaml", "tag.txt"):
+            src = Path(problem_dir) / fname
+            if src.is_file():
+                shutil.copy2(src, workdir / fname)
+    else:
+        shutil.copytree(problem_dir, workdir)
+        # Provide testlib.h so agents can compile interactors/checkers for local testing.
+        testlib_src = Path(problem_dir).parent.parent / "judge" / "include" / "testlib.h"
+        if testlib_src.is_file():
+            shutil.copy2(testlib_src, workdir / "testlib.h")
+        # Write helper scripts for local testing.
+        _write_helper_scripts(workdir, is_interactive)

-    # Write helper scripts and CLAUDE.md into workdir.
-    _write_helper_scripts(workdir, is_interactive)
-    _write_workdir_claude_md(workdir, is_interactive)
+    _write_workdir_claude_md(workdir, is_interactive, parity=parity)

-    prompt = build_agent_prompt(str(workdir))
+    prompt = build_agent_prompt(str(workdir), parity=parity)

     options = ClaudeAgentOptions(
         model=model,
@@ -734,6 +875,7 @@ async def _run():
             status=status,
             model=model,
             prompt=prompt,
+            parity=parity,
         )

     return code, metadata
@@ -746,6 +888,7 @@ def generate_agent_solution(
     cost_limit: float = DEFAULT_COST_LIMIT_USD,
     timeout: float = DEFAULT_TIMEOUT_SECONDS,
     transcript_path: Optional[Path] = None,
+    parity: bool = False,
 ) -> Tuple[str, Dict[str, Any]]:
     """Synchronous wrapper for run_agent.

@@ -757,6 +900,7 @@ def generate_agent_solution(
         cost_limit: Maximum cost in USD.
         timeout: Maximum wall-clock time in seconds.
         transcript_path: Path for JSONL transcript log.
+        parity: If True, strip test data and helper scripts (Harbor parity mode).

     Returns:
         Tuple of (cpp_code, metadata_dict).
@@ -768,5 +912,6 @@ def generate_agent_solution( cost_limit=cost_limit, timeout=timeout, transcript_path=transcript_path, + parity=parity, ) ) diff --git a/tests/test_agent_interface.py b/tests/test_agent_interface.py index 0c771522..176bef6f 100644 --- a/tests/test_agent_interface.py +++ b/tests/test_agent_interface.py @@ -119,6 +119,37 @@ def test_build_agent_prompt_skips_large_samples(): assert "Sample 1" not in prompt +def test_build_agent_prompt_parity_no_test_refs(): + """Parity mode prompt has no references to test scripts or test data.""" + from frontier_cs.gen.agent_interface import build_agent_prompt + + with tempfile.TemporaryDirectory() as tmpdir: + pdir = _make_problem_dir(tmpdir, samples=2) + prompt = build_agent_prompt(str(pdir), parity=True) + assert "test_all.sh" not in prompt + assert "run_interactive.sh" not in prompt + assert "testdata/" not in prompt + assert "Sample 1" not in prompt + assert "chk.cc" not in prompt + assert "interactor.cc" not in prompt + # Should mention self-testing + assert "brute-force" in prompt.lower() or "brute force" in prompt.lower() + assert "solution.cpp" in prompt + + +def test_build_agent_prompt_parity_interactive(): + """Parity mode interactive prompt mentions flush but not interactor source.""" + from frontier_cs.gen.agent_interface import build_agent_prompt + + with tempfile.TemporaryDirectory() as tmpdir: + pdir = _make_problem_dir(tmpdir, interactive=True) + prompt = build_agent_prompt(str(pdir), parity=True) + assert "INTERACTIVE" in prompt + assert "flush" in prompt.lower() + assert "run_interactive.sh" not in prompt + assert "interactor.cc" not in prompt + + def test_extract_cpp_from_workdir(): """Extract solution.cpp from agent working directory.""" from frontier_cs.gen.agent_interface import extract_solution_cpp From dd0a6339af6843b8e9c01e27697e4b35966e9ee6 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Thu, 16 Apr 2026 04:12:17 +0000 Subject: [PATCH 08/16] revert: remove infra_git_hash and timestamp from build_metadata These belong to the solutions repo separation effort, which is docs-only for now. Removed _get_infra_git_hash(), subprocess import, and the infra_git_hash/timestamp/parity fields from build_metadata(). --- src/frontier_cs/gen/agent_interface.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index 4bc4e879..b3145866 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -15,7 +15,6 @@ import os import shutil import stat -import subprocess import sys import tempfile import time @@ -587,19 +586,6 @@ def extract_solution_cpp(workdir: Path) -> str: return "" -def _get_infra_git_hash() -> str: - """Get the current git commit hash of this repo (infra code).""" - try: - result = subprocess.run( - ["git", "rev-parse", "--short", "HEAD"], - capture_output=True, text=True, timeout=5, - cwd=Path(__file__).parent, - ) - return result.stdout.strip() if result.returncode == 0 else "unknown" - except Exception: - return "unknown" - - def build_metadata( *, tokens_in: int, @@ -610,7 +596,6 @@ def build_metadata( status: str, model: str, prompt: str, - parity: bool = False, ) -> Dict[str, Any]: """Build the metadata dict for an agent run. @@ -623,7 +608,6 @@ def build_metadata( status: One of "success", "timeout", "cost_limit", "error". model: The model name passed to the agent SDK. prompt: The full prompt sent to the agent. - parity: Whether this run used parity mode. 
Returns: Metadata dictionary. @@ -637,9 +621,6 @@ def build_metadata( "time_seconds": round(time_seconds, 2), "turns": turns, "status": status, - "infra_git_hash": _get_infra_git_hash(), - "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), - "parity": parity, } @@ -875,7 +856,6 @@ async def _run(): status=status, model=model, prompt=prompt, - parity=parity, ) return code, metadata From e95fb8f3e94ad0e974ff58a20371feacc8607bf5 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Thu, 16 Apr 2026 04:51:22 +0000 Subject: [PATCH 09/16] fix: make parity mode the default and remove solutions-repo-separation doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agent always runs without test data — no --parity flag needed. The solutions repo separation plan is not ready to commit. --- algorithmic/scripts/generate_solutions.py | 4 +-- docs/solutions-repo-separation.md | 43 ----------------------- src/frontier_cs/gen/agent_interface.py | 8 ++--- tests/test_agent_interface.py | 12 +++---- 4 files changed, 11 insertions(+), 56 deletions(-) delete mode 100644 docs/solutions-repo-separation.md diff --git a/algorithmic/scripts/generate_solutions.py b/algorithmic/scripts/generate_solutions.py index c9181137..d2b9ed33 100644 --- a/algorithmic/scripts/generate_solutions.py +++ b/algorithmic/scripts/generate_solutions.py @@ -305,8 +305,7 @@ def main(): help="Agent timeout in seconds (default: 1200 = 20 min)") parser.add_argument("--agent-cost-limit", type=float, default=20.0, help="Agent max cost per problem in USD (default: 20)") - parser.add_argument("--parity", action="store_true", - help="Harbor parity mode: no test data or helper scripts given to agent") + args = parser.parse_args() @@ -527,7 +526,6 @@ def execute_task(task: GenerationTask) -> Tuple[str, str, Optional[str], str, Op cost_limit=args.agent_cost_limit, timeout=args.agent_timeout, transcript_path=transcript_path, - parity=args.parity, ) # Save metadata alongside solution diff --git a/docs/solutions-repo-separation.md b/docs/solutions-repo-separation.md deleted file mode 100644 index f64a201e..00000000 --- a/docs/solutions-repo-separation.md +++ /dev/null @@ -1,43 +0,0 @@ -# Solutions Repo Separation - -## Problem - -Infra code (agent_interface.py, generate_solutions.py) and generated solutions (.cpp, .meta.json) live in the same repo. This causes: - -- Can't freely rebase/restructure infra without worrying about losing uncommitted solutions -- Git diffs polluted by large generated files -- No traceability — can't tell which version of infra generated a given solution - -## Decision - -1. **Move solutions to `FrontierCS/Frontier-CS-Result`** (already exists for storing results). -2. **Add `infra_git_hash` to `.meta.json`** so each solution records which commit of this repo generated it. -3. **Keep existing naming**: `{model_prefix}.cpp`, `{model_prefix}_{variant}.cpp`. Indices (`_0`, `_1`, `_2`) remain multi-variant within a single run. -4. **Version via git commits** in the result repo. Re-running overwrites files, but commit before re-running to preserve history. 
- -## meta.json additions - -```json -{ - "model": "claude-sonnet-4-5-20250514", - "cost_usd": 0.55, - "time_seconds": 337, - "turns": 59, - "tokens_in": 125000, - "tokens_out": 18000, - "status": "success", - "infra_git_hash": "f54d370b", - "timestamp": "2026-04-15T14:30:22Z" -} -``` - -## What stays in this repo - -- `src/frontier_cs/gen/` — generation and agent infra code -- `algorithmic/problems/` — problem definitions -- `algorithmic/judge/` — judge server - -## What moves to Frontier-CS-Result - -- `algorithmic/solutions/` — all generated solution files -- `algorithmic/AGENT_EVAL_RESULTS.md` — eval result summaries diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index b3145866..05f11671 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -228,7 +228,7 @@ def _format_samples(samples: List[Dict[str, str]], is_interactive: bool) -> str: """ -def build_agent_prompt(problem_dir: str, *, parity: bool = False) -> str: +def build_agent_prompt(problem_dir: str, *, parity: bool = True) -> str: """Construct a problem-aware prompt for the agent. Reads config.yaml to detect problem type (interactive vs standard, SPJ), @@ -478,7 +478,7 @@ def _write_helper_scripts(workdir: Path, is_interactive: bool) -> None: run_inter.chmod(run_inter.stat().st_mode | stat.S_IEXEC) -def _write_workdir_claude_md(workdir: Path, is_interactive: bool, *, parity: bool = False) -> None: +def _write_workdir_claude_md(workdir: Path, is_interactive: bool, *, parity: bool = True) -> None: """Write a CLAUDE.md to the workdir so Claude Code picks up behavioral guidance.""" lines = [ "# Agent Eval — Working Directory", @@ -655,7 +655,7 @@ async def run_agent( cost_limit: float = DEFAULT_COST_LIMIT_USD, timeout: float = DEFAULT_TIMEOUT_SECONDS, transcript_path: Optional[Path] = None, - parity: bool = False, + parity: bool = True, ) -> Tuple[str, Dict[str, Any]]: """Run the agent to solve a problem. @@ -868,7 +868,7 @@ def generate_agent_solution( cost_limit: float = DEFAULT_COST_LIMIT_USD, timeout: float = DEFAULT_TIMEOUT_SECONDS, transcript_path: Optional[Path] = None, - parity: bool = False, + parity: bool = True, ) -> Tuple[str, Dict[str, Any]]: """Synchronous wrapper for run_agent. 
diff --git a/tests/test_agent_interface.py b/tests/test_agent_interface.py index 176bef6f..34f27037 100644 --- a/tests/test_agent_interface.py +++ b/tests/test_agent_interface.py @@ -72,7 +72,7 @@ def test_build_agent_prompt_standard(): with tempfile.TemporaryDirectory() as tmpdir: pdir = _make_problem_dir(tmpdir) - prompt = build_agent_prompt(str(pdir)) + prompt = build_agent_prompt(str(pdir), parity=False) assert "test_all.sh" in prompt assert "STANDARD" in prompt or "SPECIAL JUDGE" in prompt assert "solution.cpp" in prompt @@ -88,7 +88,7 @@ def test_build_agent_prompt_interactive(): with tempfile.TemporaryDirectory() as tmpdir: pdir = _make_problem_dir(tmpdir, interactive=True) - prompt = build_agent_prompt(str(pdir)) + prompt = build_agent_prompt(str(pdir), parity=False) assert "INTERACTIVE" in prompt assert "run_interactive.sh" in prompt assert "flush" in prompt.lower() or "pipe" in prompt.lower() @@ -100,7 +100,7 @@ def test_build_agent_prompt_embeds_small_samples(): with tempfile.TemporaryDirectory() as tmpdir: pdir = _make_problem_dir(tmpdir, samples=2) - prompt = build_agent_prompt(str(pdir)) + prompt = build_agent_prompt(str(pdir), parity=False) # The sample content should appear in the prompt assert "Sample 1" in prompt assert "Sample 2" in prompt @@ -114,7 +114,7 @@ def test_build_agent_prompt_skips_large_samples(): pdir = _make_problem_dir(tmpdir, samples=1) # Make the input file larger than the embed threshold (pdir / "testdata" / "1.in").write_text("x" * (_MAX_EMBED_SIZE + 1)) - prompt = build_agent_prompt(str(pdir)) + prompt = build_agent_prompt(str(pdir), parity=False) # Should NOT contain the embedded content assert "Sample 1" not in prompt @@ -244,7 +244,7 @@ def test_write_workdir_claude_md_standard(): with tempfile.TemporaryDirectory() as tmpdir: workdir = Path(tmpdir) - _write_workdir_claude_md(workdir, is_interactive=False) + _write_workdir_claude_md(workdir, is_interactive=False, parity=False) content = (workdir / "CLAUDE.md").read_text() assert "test_all.sh" in content assert "run_interactive.sh" not in content @@ -256,7 +256,7 @@ def test_write_workdir_claude_md_interactive(): with tempfile.TemporaryDirectory() as tmpdir: workdir = Path(tmpdir) - _write_workdir_claude_md(workdir, is_interactive=True) + _write_workdir_claude_md(workdir, is_interactive=True, parity=False) content = (workdir / "CLAUDE.md").read_text() assert "run_interactive.sh" in content assert "flush" in content From 1549c534d114116d8640d1942a99ba1925ead8bf Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Thu, 16 Apr 2026 05:17:32 +0000 Subject: [PATCH 10/16] refactor: extract prompt templates and scripts to agent_constants.py Move all large string constants (prompt templates, shell scripts, CLAUDE.md content) out of agent_interface.py into a dedicated constants module. --- src/frontier_cs/gen/agent_constants.py | 384 ++++++++++++++++++++++ src/frontier_cs/gen/agent_interface.py | 435 ++++--------------------- 2 files changed, 440 insertions(+), 379 deletions(-) create mode 100644 src/frontier_cs/gen/agent_constants.py diff --git a/src/frontier_cs/gen/agent_constants.py b/src/frontier_cs/gen/agent_constants.py new file mode 100644 index 00000000..6b23d915 --- /dev/null +++ b/src/frontier_cs/gen/agent_constants.py @@ -0,0 +1,384 @@ +"""Prompt templates, shell scripts, and CLAUDE.md content for agent eval. + +All large string constants live here to keep agent_interface.py focused on logic. 
+""" + +# --------------------------------------------------------------------------- +# Shell scripts (used in parity=False mode only) +# --------------------------------------------------------------------------- + +TEST_ALL_SH = r"""#!/bin/bash +set -e +echo "=== Compiling solution.cpp ===" +g++ -std=gnu++17 -O2 -o solution solution.cpp +echo "=== Compilation OK ===" + +# Compile checker if available (special judge) +USE_CHECKER=0 +if [ -f "chk.cc" ]; then + echo "=== Compiling special judge (chk.cc) ===" + if g++ -std=gnu++17 -O2 -I. chk.cc -o checker 2>/dev/null; then + USE_CHECKER=1 + echo "=== Checker compiled OK — using it instead of diff ===" + else + echo "=== Checker compilation failed — falling back to diff ===" + fi +fi + +passed=0; failed=0; total=0 +for inf in testdata/*.in; do + [ -f "$inf" ] || continue + id=$(basename "$inf" .in) + ans="testdata/${id}.ans" + [ -f "$ans" ] || continue + total=$((total + 1)) + + # Run with timeout + if timeout 15 ./solution < "$inf" > "my_${id}.out" 2>"my_${id}.err"; then + if [ "$USE_CHECKER" -eq 1 ]; then + # Special judge: ./checker + checker_out=$(./checker "$inf" "my_${id}.out" "$ans" 2>&1) && chk_rc=$? || chk_rc=$? + if [ $chk_rc -eq 0 ]; then + echo " Sample $id: PASS (checker: $checker_out)" + passed=$((passed + 1)) + else + echo " Sample $id: WRONG ANSWER (checker exit $chk_rc)" + echo " Checker output: $checker_out" + failed=$((failed + 1)) + fi + else + # Diff-based comparison (normalize whitespace) + if diff -q <(tr -s '[:space:]' '\n' < "my_${id}.out" | sed '/^$/d') \ + <(tr -s '[:space:]' '\n' < "$ans" | sed '/^$/d') >/dev/null 2>&1; then + echo " Sample $id: PASS" + passed=$((passed + 1)) + else + echo " Sample $id: WRONG ANSWER" + echo " Expected (first 5 lines):" + head -5 "$ans" | sed 's/^/ /' + echo " Got (first 5 lines):" + head -5 "my_${id}.out" | sed 's/^/ /' + failed=$((failed + 1)) + fi + fi + else + rc=$? + echo " Sample $id: RUNTIME ERROR or TLE (exit $rc)" + [ -s "my_${id}.err" ] && head -3 "my_${id}.err" | sed 's/^/ stderr: /' + failed=$((failed + 1)) + fi +done + +echo "=== Results: $passed/$total passed ===" +[ "$failed" -eq 0 ] && exit 0 || exit 1 +""" + +RUN_INTERACTIVE_SH = r"""#!/bin/bash +# Usage: ./run_interactive.sh [sample_id] (default: 1) +# Compiles solution.cpp and interactor.cc, then tests via pipe. +# Exit codes: 0=accepted, 1=wrong answer, 2=presentation error, 3=build error, 4=timeout/crash + +SAMPLE=${1:-1} +INF="testdata/${SAMPLE}.in" +ANSF="testdata/${SAMPLE}.ans" + +if [ ! -f "$INF" ]; then + echo "Error: $INF not found" + exit 3 +fi + +# Compile only if binaries are missing or sources are newer +if [ ! -f ./solution ] || [ solution.cpp -nt ./solution ]; then + echo "=== Compiling solution.cpp ===" + g++ -std=gnu++17 -O2 -o solution solution.cpp || { echo "Compilation failed"; exit 3; } +fi + +if [ ! -f ./interactor ] || [ interactor.cc -nt ./interactor ]; then + echo "=== Compiling interactor ===" + g++ -std=gnu++17 -O2 -I. 
interactor.cc -o interactor || { echo "Interactor compilation failed"; exit 3; } +fi + +# Create named pipes in current dir (avoids /tmp permission issues) +PIPE_S2I=".pipe_s2i_$$" +PIPE_I2S=".pipe_i2s_$$" +rm -f "$PIPE_S2I" "$PIPE_I2S" +mkfifo "$PIPE_S2I" "$PIPE_I2S" + +cleanup() { rm -f "$PIPE_S2I" "$PIPE_I2S" inter_stderr.tmp sol_stderr.tmp; } +trap cleanup EXIT + +echo "=== Running sample $SAMPLE ===" + +# interactor: reads from solution's stdout via pipe, writes to solution's stdin via pipe +# testlib interactors: argv = [ans] +# We use /dev/null for ouf (output file) since we only care about exit code +timeout 120 ./interactor "$INF" /dev/null "$ANSF" < "$PIPE_S2I" > "$PIPE_I2S" 2>inter_stderr.tmp & +INTER_PID=$! + +timeout 120 ./solution < "$PIPE_I2S" > "$PIPE_S2I" 2>sol_stderr.tmp & +SOL_PID=$! + +# Wait for both processes +INTER_EXIT=0; SOL_EXIT=0 +wait $INTER_PID 2>/dev/null || INTER_EXIT=$? +wait $SOL_PID 2>/dev/null || SOL_EXIT=$? + +# Report results +if [ $INTER_EXIT -eq 0 ]; then + echo " Sample $SAMPLE: ACCEPTED (interactor exit 0)" + [ -s inter_stderr.tmp ] && head -2 inter_stderr.tmp | sed 's/^/ interactor: /' + exit 0 +elif [ $INTER_EXIT -eq 1 ]; then + echo " Sample $SAMPLE: WRONG ANSWER (interactor exit 1)" + [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/ interactor: /' + exit 1 +elif [ $INTER_EXIT -eq 2 ]; then + echo " Sample $SAMPLE: PRESENTATION ERROR (interactor exit 2)" + [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/ interactor: /' + exit 2 +elif [ $INTER_EXIT -eq 124 ] || [ $INTER_EXIT -eq 137 ]; then + echo " Sample $SAMPLE: TIMEOUT (120s exceeded)" + echo " This usually means your solution deadlocked (missing flush? wrong protocol?)" + [ -s sol_stderr.tmp ] && head -3 sol_stderr.tmp | sed 's/^/ solution stderr: /' + exit 4 +else + echo " Sample $SAMPLE: UNKNOWN (interactor exit $INTER_EXIT, solution exit $SOL_EXIT)" + [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/ interactor: /' + [ -s sol_stderr.tmp ] && head -3 sol_stderr.tmp | sed 's/^/ solution: /' + exit 4 +fi +""" + +# --------------------------------------------------------------------------- +# Prompt templates +# --------------------------------------------------------------------------- + +# Parity prompt (default): agent gets NO test data, must self-test +PARITY_PROMPT = """You are solving a competitive programming problem. + +Problem directory: {problem_dir} +- Read statement.txt for the full problem description +- Time limit: {time_limit}, Memory limit: {memory_limit} +- Total test cases: {total_cases} (your score = fraction passed) +- Scoring is partial: 0-100% based on test cases passed""" + +PARITY_INTERACTIVE_SECTION = """ +## Problem type: INTERACTIVE + +This is an interactive problem. Your solution communicates with a hidden judge +via stdin/stdout. You do NOT read from files. + +**CRITICAL:** +- Flush stdout after EVERY output line: `cout << endl;` or `cout << flush;` +- Read the problem statement carefully for the exact query/response protocol +- Count your queries against the stated limit""" + +PARITY_SPJ_SECTION = """ +## Problem type: SPECIAL JUDGE + +This problem accepts multiple valid outputs. Your solution will be checked by +a special judge, not by exact string matching.""" + +PARITY_STANDARD_SECTION = """ +## Problem type: STANDARD + +Your output must match the expected output exactly (whitespace-normalized).""" + +PARITY_SCORING_AND_WORKFLOW = """ +## Scoring + +Your score is the fraction of test cases passed (0-100%). 
+- There are {total_cases} test cases total +- Partial credit counts — passing 7/10 cases = 70% score +- A correct-but-slow solution that passes small cases is MUCH better than a broken fast one +- Prioritize CORRECTNESS over optimality + +## Self-testing + +No test data or test scripts are provided. You must validate your own solution: + +1. **Write a brute-force reference solution** (even if O(n!) or exponential) that you are + confident is correct for small inputs. +2. **Write a random test generator** that produces valid inputs within the problem constraints. +3. **Cross-validate (对拍):** Run both solutions on hundreds of random small inputs and compare + outputs. Fix any discrepancies by debugging your main solution against the brute-force. +4. **Stress test:** Generate larger random inputs to check for TLE, MLE, or crashes. +5. **Edge cases:** Manually test minimum inputs (N=1, empty, etc.) and boundary values. + +This self-testing approach is standard competitive programming practice. Do NOT skip it. + +## Workflow + +1. Read the FULL problem statement carefully. Re-read the constraints and edge cases. +2. Understand the I/O format from the examples in the problem statement. +3. Design your algorithm. Think about time complexity vs the constraints. +4. Write a SIMPLE correct solution first — brute force is fine for a first version. +5. Write a separate brute-force and test generator, then cross-validate. +6. Once confident in correctness: optimize for performance if needed. +7. Stress test with larger inputs before finalizing. + +**Retreat strategy:** If stuck on the same bug for 5+ cycles, switch to a simpler +algorithm. A correct brute-force scoring 30% beats a broken solution scoring 0%. + +Submit your final solution as solution.cpp in the current working directory.""" + +# Full-access prompt (parity=False): agent gets test data and helper scripts +FULL_ACCESS_PROMPT = """You are solving a competitive programming problem. + +Problem directory: {problem_dir} +- Read statement.txt for the full problem description +- Time limit: {time_limit}, Memory limit: {memory_limit} +- Total hidden test cases: {total_cases} (your score = fraction passed) +- testdata/ contains sample test cases — these are a SUBSET of the hidden tests""" + +FULL_ACCESS_INTERACTIVE_SECTION = """ +## Problem type: INTERACTIVE + +This is an interactive problem. Your solution communicates with a judge interactor +via stdin/stdout. You do NOT read from files — you read responses from the interactor +and write queries/answers to stdout. 
+ +Key files provided: +- interactor.cc — the judge interactor (uses testlib.h, both provided) +- testdata/*.in — interactor input seeds (NOT your stdin) + +**CRITICAL for interactive problems:** +- You MUST flush stdout after EVERY output line: use `cout << endl;` or `cout << flush;` +- Read the interactor source code to understand the exact protocol (what it sends, what it expects) +- Count your queries carefully against the stated limit + +**Testing interactive solutions locally:** +Use the provided `./run_interactive.sh` script: +```bash +./run_interactive.sh 1 # Test with sample 1 +./run_interactive.sh 2 # Test with sample 2 +# Run all samples: +for i in testdata/*.in; do ./run_interactive.sh $(basename $i .in); done +``` + +If `run_interactive.sh` times out (exit code 4), it usually means a deadlock: +- Missing `flush` / `endl` on your output +- Reading when the interactor expects you to write, or vice versa +- Exceeding the query limit (interactor stops responding) + +**Fallback testing:** If the shell script doesn't work, write a Python wrapper: +```python +import subprocess, os +proc_sol = subprocess.Popen(['./solution'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) +proc_int = subprocess.Popen(['./interactor', 'testdata/1.in', '/dev/null', 'testdata/1.ans'], + stdin=proc_sol.stdout, stdout=proc_sol.stdin) +proc_int.wait(); proc_sol.wait() +print(f"interactor exit: {{proc_int.returncode}}") +``` + +IMPORTANT: You MUST test your solution locally before finalizing. Do NOT submit untested code.""" + +FULL_ACCESS_STANDARD_SECTION = """ +## Problem type: {problem_type} + +**Testing your solution locally:** +Use the provided `./test_all.sh` script: +```bash +./test_all.sh # Compiles solution.cpp and runs against ALL samples +``` +This compiles, runs each sample, and compares output. Always run this before finalizing.{checker_note}""" + +FULL_ACCESS_SCORING_SECTION = """ +## Scoring + +Your score is the fraction of hidden test cases passed (0-100%). +- There are {total_cases} hidden test cases total +- Partial credit counts — passing 7/10 cases = 70% score +- A correct-but-slow solution that passes small cases is MUCH better than a broken fast one +- Prioritize CORRECTNESS over optimality. Get a working solution first, then optimize.""" + +FULL_ACCESS_WORKFLOW = """## Workflow + +1. Read the FULL problem statement carefully. Re-read the constraints and edge cases. +2. Read ALL sample test cases and understand the expected I/O format. +3. Design your algorithm. Think about time complexity vs the constraints. +4. Write a SIMPLE correct solution first — brute force is fine for a first version. +5. Compile and test against ALL samples using the provided test script. +6. If samples fail: debug by examining the diff, don't just rewrite everything. +7. Once samples pass: think about edge cases and whether your algorithm handles large inputs. +8. Optimize only after correctness is established. + +**Critical rules:** +- Do NOT rewrite your solution from scratch more than once. Incremental edits preserve working logic. +- Do NOT skip local testing. Every change must be tested before you move on. +- Do NOT submit without running test_all.sh (or run_interactive.sh for interactive). +- If you TLE on large cases, profile the bottleneck — don't simplify the entire algorithm. + +**Retreat strategy — know when to simplify:** +- If you've been debugging the SAME bug for more than 5 edit-test cycles without progress, + STOP and switch to a fundamentally simpler approach. 
A correct brute-force that passes + small cases is worth more than a broken optimized solution that passes nothing. +- If your approach is off by a small constant (e.g., exceeding a limit by 1), consider whether + a completely different algorithm would avoid the issue rather than patching endlessly. +- Remember: partial credit exists. A solution scoring 30% is infinitely better than 0%. + When in doubt, submit what works even if it's suboptimal. + +Submit your final solution as solution.cpp in the current working directory.""" + +# --------------------------------------------------------------------------- +# CLAUDE.md content +# --------------------------------------------------------------------------- + +CLAUDE_MD_HEADER = """# Agent Eval — Working Directory + +You are solving a competitive programming problem in this directory. + +## Rules + +- Your ONLY deliverable is `solution.cpp` in this directory. +- Use C++17 (g++ -std=gnu++17). +- Always compile with `-O2` for performance testing. +- Read the problem statement COMPLETELY before writing any code. + +## Testing +""" + +CLAUDE_MD_PARITY_TESTING = """No test data or test scripts are provided. +Write your own brute-force solution + random test generator to cross-validate. +This is standard competitive programming practice (对拍). +""" + +CLAUDE_MD_PARITY_INTERACTIVE = """This is an INTERACTIVE problem. +- `cout << endl;` or `cout << flush;` after EVERY line you output +- Read the problem statement to understand the exact protocol +- Count queries against the stated limit +""" + +CLAUDE_MD_FULL_INTERACTIVE = """This is an INTERACTIVE problem. Use `./run_interactive.sh N` to test sample N. +Do NOT skip interactive testing — protocol bugs are the #1 failure mode. + +### Interactive protocol checklist +- `cout << endl;` or `cout << flush;` after EVERY line you output +- Read the interactor source code to know the exact send/receive order +- Count queries against the stated limit +- If run_interactive.sh times out: you likely have a deadlock (missing flush or wrong protocol) +- Fallback: write a Python subprocess wrapper if the shell script fails +""" + +CLAUDE_MD_FULL_STANDARD = """Use `./test_all.sh` to compile and test against all samples. +If chk.cc exists, test_all.sh uses it as a special judge automatically. +Fix any failing samples before moving on to optimization. +""" + +CLAUDE_MD_FOOTER = """ +## Common mistakes to avoid + +- Forgetting to flush stdout in interactive problems +- Off-by-one errors in array indexing (0-indexed vs 1-indexed) +- Integer overflow — use `long long` for anything that could exceed 2^31 +- Reading input in the wrong order or format +- Not handling the edge case where N=1 or the input is minimal +- Rewriting the entire solution when a small fix would work + +## When to retreat + +- If you've edited and tested 5+ times for the same bug without progress, STOP. +- Switch to a simpler algorithm that is guaranteed correct, even if slower. +- A correct brute-force scoring 30% beats a broken clever solution scoring 0%. +- Partial credit is real: every test case you pass counts. 
+""" diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index 05f11671..907a53eb 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -24,6 +24,27 @@ import yaml +from frontier_cs.gen.agent_constants import ( + CLAUDE_MD_FOOTER, + CLAUDE_MD_FULL_INTERACTIVE, + CLAUDE_MD_FULL_STANDARD, + CLAUDE_MD_HEADER, + CLAUDE_MD_PARITY_INTERACTIVE, + CLAUDE_MD_PARITY_TESTING, + FULL_ACCESS_INTERACTIVE_SECTION, + FULL_ACCESS_PROMPT, + FULL_ACCESS_SCORING_SECTION, + FULL_ACCESS_STANDARD_SECTION, + FULL_ACCESS_WORKFLOW, + PARITY_INTERACTIVE_SECTION, + PARITY_PROMPT, + PARITY_SCORING_AND_WORKFLOW, + PARITY_SPJ_SECTION, + PARITY_STANDARD_SECTION, + RUN_INTERACTIVE_SH, + TEST_ALL_SH, +) + logger = logging.getLogger(__name__) # Default budget limits @@ -82,150 +103,6 @@ def _format_samples(samples: List[Dict[str, str]], is_interactive: bool) -> str: return "\n".join(parts) -# Shell script: compile solution.cpp and test against all sample cases. -# If chk.cc exists (special judge), uses it for verification instead of diff. -_TEST_ALL_SH = r"""#!/bin/bash -set -e -echo "=== Compiling solution.cpp ===" -g++ -std=gnu++17 -O2 -o solution solution.cpp -echo "=== Compilation OK ===" - -# Compile checker if available (special judge) -USE_CHECKER=0 -if [ -f "chk.cc" ]; then - echo "=== Compiling special judge (chk.cc) ===" - if g++ -std=gnu++17 -O2 -I. chk.cc -o checker 2>/dev/null; then - USE_CHECKER=1 - echo "=== Checker compiled OK — using it instead of diff ===" - else - echo "=== Checker compilation failed — falling back to diff ===" - fi -fi - -passed=0; failed=0; total=0 -for inf in testdata/*.in; do - [ -f "$inf" ] || continue - id=$(basename "$inf" .in) - ans="testdata/${id}.ans" - [ -f "$ans" ] || continue - total=$((total + 1)) - - # Run with timeout - if timeout 15 ./solution < "$inf" > "my_${id}.out" 2>"my_${id}.err"; then - if [ "$USE_CHECKER" -eq 1 ]; then - # Special judge: ./checker - checker_out=$(./checker "$inf" "my_${id}.out" "$ans" 2>&1) && chk_rc=$? || chk_rc=$? - if [ $chk_rc -eq 0 ]; then - echo " Sample $id: PASS (checker: $checker_out)" - passed=$((passed + 1)) - else - echo " Sample $id: WRONG ANSWER (checker exit $chk_rc)" - echo " Checker output: $checker_out" - failed=$((failed + 1)) - fi - else - # Diff-based comparison (normalize whitespace) - if diff -q <(tr -s '[:space:]' '\n' < "my_${id}.out" | sed '/^$/d') \ - <(tr -s '[:space:]' '\n' < "$ans" | sed '/^$/d') >/dev/null 2>&1; then - echo " Sample $id: PASS" - passed=$((passed + 1)) - else - echo " Sample $id: WRONG ANSWER" - echo " Expected (first 5 lines):" - head -5 "$ans" | sed 's/^/ /' - echo " Got (first 5 lines):" - head -5 "my_${id}.out" | sed 's/^/ /' - failed=$((failed + 1)) - fi - fi - else - rc=$? - echo " Sample $id: RUNTIME ERROR or TLE (exit $rc)" - [ -s "my_${id}.err" ] && head -3 "my_${id}.err" | sed 's/^/ stderr: /' - failed=$((failed + 1)) - fi -done - -echo "=== Results: $passed/$total passed ===" -[ "$failed" -eq 0 ] && exit 0 || exit 1 -""" - -# Shell script: test solution against an interactor using named pipes. -_RUN_INTERACTIVE_SH = r"""#!/bin/bash -# Usage: ./run_interactive.sh [sample_id] (default: 1) -# Compiles solution.cpp and interactor.cc, then tests via pipe. -# Exit codes: 0=accepted, 1=wrong answer, 2=presentation error, 3=build error, 4=timeout/crash - -SAMPLE=${1:-1} -INF="testdata/${SAMPLE}.in" -ANSF="testdata/${SAMPLE}.ans" - -if [ ! 
-f "$INF" ]; then - echo "Error: $INF not found" - exit 3 -fi - -# Compile only if binaries are missing or sources are newer -if [ ! -f ./solution ] || [ solution.cpp -nt ./solution ]; then - echo "=== Compiling solution.cpp ===" - g++ -std=gnu++17 -O2 -o solution solution.cpp || { echo "Compilation failed"; exit 3; } -fi - -if [ ! -f ./interactor ] || [ interactor.cc -nt ./interactor ]; then - echo "=== Compiling interactor ===" - g++ -std=gnu++17 -O2 -I. interactor.cc -o interactor || { echo "Interactor compilation failed"; exit 3; } -fi - -# Create named pipes in current dir (avoids /tmp permission issues) -PIPE_S2I=".pipe_s2i_$$" -PIPE_I2S=".pipe_i2s_$$" -rm -f "$PIPE_S2I" "$PIPE_I2S" -mkfifo "$PIPE_S2I" "$PIPE_I2S" - -cleanup() { rm -f "$PIPE_S2I" "$PIPE_I2S" inter_stderr.tmp sol_stderr.tmp; } -trap cleanup EXIT - -echo "=== Running sample $SAMPLE ===" - -# interactor: reads from solution's stdout via pipe, writes to solution's stdin via pipe -# testlib interactors: argv = [ans] -# We use /dev/null for ouf (output file) since we only care about exit code -timeout 120 ./interactor "$INF" /dev/null "$ANSF" < "$PIPE_S2I" > "$PIPE_I2S" 2>inter_stderr.tmp & -INTER_PID=$! - -timeout 120 ./solution < "$PIPE_I2S" > "$PIPE_S2I" 2>sol_stderr.tmp & -SOL_PID=$! - -# Wait for both processes -INTER_EXIT=0; SOL_EXIT=0 -wait $INTER_PID 2>/dev/null || INTER_EXIT=$? -wait $SOL_PID 2>/dev/null || SOL_EXIT=$? - -# Report results -if [ $INTER_EXIT -eq 0 ]; then - echo " Sample $SAMPLE: ACCEPTED (interactor exit 0)" - [ -s inter_stderr.tmp ] && head -2 inter_stderr.tmp | sed 's/^/ interactor: /' - exit 0 -elif [ $INTER_EXIT -eq 1 ]; then - echo " Sample $SAMPLE: WRONG ANSWER (interactor exit 1)" - [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/ interactor: /' - exit 1 -elif [ $INTER_EXIT -eq 2 ]; then - echo " Sample $SAMPLE: PRESENTATION ERROR (interactor exit 2)" - [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/ interactor: /' - exit 2 -elif [ $INTER_EXIT -eq 124 ] || [ $INTER_EXIT -eq 137 ]; then - echo " Sample $SAMPLE: TIMEOUT (120s exceeded)" - echo " This usually means your solution deadlocked (missing flush? wrong protocol?)" - [ -s sol_stderr.tmp ] && head -3 sol_stderr.tmp | sed 's/^/ solution stderr: /' - exit 4 -else - echo " Sample $SAMPLE: UNKNOWN (interactor exit $INTER_EXIT, solution exit $SOL_EXIT)" - [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/ interactor: /' - [ -s sol_stderr.tmp ] && head -3 sol_stderr.tmp | sed 's/^/ solution: /' - exit 4 -fi -""" def build_agent_prompt(problem_dir: str, *, parity: bool = True) -> str: @@ -260,121 +137,35 @@ def build_agent_prompt(problem_dir: str, *, parity: bool = True) -> str: samples = _collect_samples(problem_dir) - # Base info - parts = [f"""You are solving a competitive programming problem. - -Problem directory: {problem_dir} -- Read statement.txt for the full problem description -- Time limit: {time_limit}, Memory limit: {memory_limit} -- Total hidden test cases: {total_cases} (your score = fraction passed) -- testdata/ contains sample test cases — these are a SUBSET of the hidden tests"""] + parts = [FULL_ACCESS_PROMPT.format( + problem_dir=problem_dir, time_limit=time_limit, + memory_limit=memory_limit, total_cases=total_cases, + )] - # Problem type specific guidance if is_interactive: - parts.append(""" -## Problem type: INTERACTIVE - -This is an interactive problem. Your solution communicates with a judge interactor -via stdin/stdout. 
You do NOT read from files — you read responses from the interactor -and write queries/answers to stdout. - -Key files provided: -- interactor.cc — the judge interactor (uses testlib.h, both provided) -- testdata/*.in — interactor input seeds (NOT your stdin) - -**CRITICAL for interactive problems:** -- You MUST flush stdout after EVERY output line: use `cout << endl;` or `cout << flush;` -- Read the interactor source code to understand the exact protocol (what it sends, what it expects) -- Count your queries carefully against the stated limit - -**Testing interactive solutions locally:** -Use the provided `./run_interactive.sh` script: -```bash -./run_interactive.sh 1 # Test with sample 1 -./run_interactive.sh 2 # Test with sample 2 -# Run all samples: -for i in testdata/*.in; do ./run_interactive.sh $(basename $i .in); done -``` - -If `run_interactive.sh` times out (exit code 4), it usually means a deadlock: -- Missing `flush` / `endl` on your output -- Reading when the interactor expects you to write, or vice versa -- Exceeding the query limit (interactor stops responding) - -**Fallback testing:** If the shell script doesn't work, write a Python wrapper: -```python -import subprocess, os -proc_sol = subprocess.Popen(['./solution'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) -proc_int = subprocess.Popen(['./interactor', 'testdata/1.in', '/dev/null', 'testdata/1.ans'], - stdin=proc_sol.stdout, stdout=proc_sol.stdin) -proc_int.wait(); proc_sol.wait() -print(f"interactor exit: {proc_int.returncode}") -``` - -IMPORTANT: You MUST test your solution locally before finalizing. Do NOT submit untested code.""") + parts.append(FULL_ACCESS_INTERACTIVE_SECTION) else: checker_note = "" if has_checker: - checker_note = """ -Note: This problem has a SPECIAL JUDGE (chk.cc) — multiple valid outputs may be accepted. -`test_all.sh` will automatically compile and use the checker for validation. -If the checker reports PASS but the output looks different from the .ans file, that's fine.""" - - parts.append(f""" -## Problem type: {"SPECIAL JUDGE (multiple valid outputs accepted)" if has_checker else "STANDARD"} - -**Testing your solution locally:** -Use the provided `./test_all.sh` script: -```bash -./test_all.sh # Compiles solution.cpp and runs against ALL samples -``` -This compiles, runs each sample, and compares output. Always run this before finalizing.{checker_note}""") - - # Scoring context - parts.append(f""" -## Scoring - -Your score is the fraction of hidden test cases passed (0-100%). -- There are {total_cases} hidden test cases total -- Partial credit counts — passing 7/10 cases = 70% score -- A correct-but-slow solution that passes small cases is MUCH better than a broken fast one -- Prioritize CORRECTNESS over optimality. 
Get a working solution first, then optimize.""") - - # Embed samples if small enough + checker_note = ("\nNote: This problem has a SPECIAL JUDGE (chk.cc) — " + "multiple valid outputs may be accepted.\n" + "`test_all.sh` will automatically compile and use the " + "checker for validation.\nIf the checker reports PASS but " + "the output looks different from the .ans file, that's fine.") + problem_type = "SPECIAL JUDGE (multiple valid outputs accepted)" if has_checker else "STANDARD" + parts.append(FULL_ACCESS_STANDARD_SECTION.format( + problem_type=problem_type, checker_note=checker_note, + )) + + parts.append(FULL_ACCESS_SCORING_SECTION.format(total_cases=total_cases)) + sample_text = _format_samples(samples, is_interactive) if sample_text: parts.append(sample_text) elif samples: parts.append("\n(Sample inputs are large — read them from testdata/ directory.)\n") - # Workflow - parts.append("""## Workflow - -1. Read the FULL problem statement carefully. Re-read the constraints and edge cases. -2. Read ALL sample test cases and understand the expected I/O format. -3. Design your algorithm. Think about time complexity vs the constraints. -4. Write a SIMPLE correct solution first — brute force is fine for a first version. -5. Compile and test against ALL samples using the provided test script. -6. If samples fail: debug by examining the diff, don't just rewrite everything. -7. Once samples pass: think about edge cases and whether your algorithm handles large inputs. -8. Optimize only after correctness is established. - -**Critical rules:** -- Do NOT rewrite your solution from scratch more than once. Incremental edits preserve working logic. -- Do NOT skip local testing. Every change must be tested before you move on. -- Do NOT submit without running test_all.sh (or run_interactive.sh for interactive). -- If you TLE on large cases, profile the bottleneck — don't simplify the entire algorithm. - -**Retreat strategy — know when to simplify:** -- If you've been debugging the SAME bug for more than 5 edit-test cycles without progress, - STOP and switch to a fundamentally simpler approach. A correct brute-force that passes - small cases is worth more than a broken optimized solution that passes nothing. -- If your approach is off by a small constant (e.g., exceeding a limit by 1), consider whether - a completely different algorithm would avoid the issue rather than patching endlessly. -- Remember: partial credit exists. A solution scoring 30% is infinitely better than 0%. - When in doubt, submit what works even if it's suboptimal. - -Submit your final solution as solution.cpp in the current working directory.""") + parts.append(FULL_ACCESS_WORKFLOW) return "\n".join(parts) @@ -393,162 +184,48 @@ def _build_parity_prompt( Matches the Harbor adapter setup: agent gets only the problem statement and must write its own tests. """ - parts = [f"""You are solving a competitive programming problem. - -Problem directory: {problem_dir} -- Read statement.txt for the full problem description -- Time limit: {time_limit}, Memory limit: {memory_limit} -- Total test cases: {total_cases} (your score = fraction passed) -- Scoring is partial: 0-100% based on test cases passed"""] + parts = [PARITY_PROMPT.format( + problem_dir=problem_dir, time_limit=time_limit, + memory_limit=memory_limit, total_cases=total_cases, + )] if is_interactive: - parts.append(""" -## Problem type: INTERACTIVE - -This is an interactive problem. Your solution communicates with a hidden judge -via stdin/stdout. You do NOT read from files. 
- -**CRITICAL:** -- Flush stdout after EVERY output line: `cout << endl;` or `cout << flush;` -- Read the problem statement carefully for the exact query/response protocol -- Count your queries against the stated limit""") + parts.append(PARITY_INTERACTIVE_SECTION) elif has_checker: - parts.append(""" -## Problem type: SPECIAL JUDGE - -This problem accepts multiple valid outputs. Your solution will be checked by -a special judge, not by exact string matching.""") + parts.append(PARITY_SPJ_SECTION) else: - parts.append(""" -## Problem type: STANDARD - -Your output must match the expected output exactly (whitespace-normalized).""") - - parts.append(f""" -## Scoring + parts.append(PARITY_STANDARD_SECTION) -Your score is the fraction of test cases passed (0-100%). -- There are {total_cases} test cases total -- Partial credit counts — passing 7/10 cases = 70% score -- A correct-but-slow solution that passes small cases is MUCH better than a broken fast one -- Prioritize CORRECTNESS over optimality - -## Self-testing - -No test data or test scripts are provided. You must validate your own solution: - -1. **Write a brute-force reference solution** (even if O(n!) or exponential) that you are - confident is correct for small inputs. -2. **Write a random test generator** that produces valid inputs within the problem constraints. -3. **Cross-validate (对拍):** Run both solutions on hundreds of random small inputs and compare - outputs. Fix any discrepancies by debugging your main solution against the brute-force. -4. **Stress test:** Generate larger random inputs to check for TLE, MLE, or crashes. -5. **Edge cases:** Manually test minimum inputs (N=1, empty, etc.) and boundary values. - -This self-testing approach is standard competitive programming practice. Do NOT skip it. - -## Workflow - -1. Read the FULL problem statement carefully. Re-read the constraints and edge cases. -2. Understand the I/O format from the examples in the problem statement. -3. Design your algorithm. Think about time complexity vs the constraints. -4. Write a SIMPLE correct solution first — brute force is fine for a first version. -5. Write a separate brute-force and test generator, then cross-validate. -6. Once confident in correctness: optimize for performance if needed. -7. Stress test with larger inputs before finalizing. - -**Retreat strategy:** If stuck on the same bug for 5+ cycles, switch to a simpler -algorithm. A correct brute-force scoring 30% beats a broken solution scoring 0%. 
- -Submit your final solution as solution.cpp in the current working directory.""") + parts.append(PARITY_SCORING_AND_WORKFLOW.format(total_cases=total_cases)) return "\n".join(parts) def _write_helper_scripts(workdir: Path, is_interactive: bool) -> None: """Write test helper scripts to the agent's working directory.""" - # Always provide test_all.sh for non-interactive test_all = workdir / "test_all.sh" - test_all.write_text(_TEST_ALL_SH, encoding="utf-8") + test_all.write_text(TEST_ALL_SH, encoding="utf-8") test_all.chmod(test_all.stat().st_mode | stat.S_IEXEC) if is_interactive: run_inter = workdir / "run_interactive.sh" - run_inter.write_text(_RUN_INTERACTIVE_SH, encoding="utf-8") + run_inter.write_text(RUN_INTERACTIVE_SH, encoding="utf-8") run_inter.chmod(run_inter.stat().st_mode | stat.S_IEXEC) def _write_workdir_claude_md(workdir: Path, is_interactive: bool, *, parity: bool = True) -> None: """Write a CLAUDE.md to the workdir so Claude Code picks up behavioral guidance.""" - lines = [ - "# Agent Eval — Working Directory", - "", - "You are solving a competitive programming problem in this directory.", - "", - "## Rules", - "", - "- Your ONLY deliverable is `solution.cpp` in this directory.", - "- Use C++17 (g++ -std=gnu++17).", - "- Always compile with `-O2` for performance testing.", - "- Read the problem statement COMPLETELY before writing any code.", - "", - "## Testing", - "", - ] + parts = [CLAUDE_MD_HEADER] if parity: - lines += [ - "No test data or test scripts are provided.", - "Write your own brute-force solution + random test generator to cross-validate.", - "This is standard competitive programming practice (对拍).", - "", - ] + parts.append(CLAUDE_MD_PARITY_TESTING) if is_interactive: - lines += [ - "This is an INTERACTIVE problem.", - "- `cout << endl;` or `cout << flush;` after EVERY line you output", - "- Read the problem statement to understand the exact protocol", - "- Count queries against the stated limit", - "", - ] + parts.append(CLAUDE_MD_PARITY_INTERACTIVE) elif is_interactive: - lines += [ - "This is an INTERACTIVE problem. 
Use `./run_interactive.sh N` to test sample N.", - "Do NOT skip interactive testing — protocol bugs are the #1 failure mode.", - "", - "### Interactive protocol checklist", - "- `cout << endl;` or `cout << flush;` after EVERY line you output", - "- Read the interactor source code to know the exact send/receive order", - "- Count queries against the stated limit", - "- If run_interactive.sh times out: you likely have a deadlock (missing flush or wrong protocol)", - "- Fallback: write a Python subprocess wrapper if the shell script fails", - "", - ] + parts.append(CLAUDE_MD_FULL_INTERACTIVE) else: - lines += [ - "Use `./test_all.sh` to compile and test against all samples.", - "If chk.cc exists, test_all.sh uses it as a special judge automatically.", - "Fix any failing samples before moving on to optimization.", - "", - ] - lines += [ - "## Common mistakes to avoid", - "", - "- Forgetting to flush stdout in interactive problems", - "- Off-by-one errors in array indexing (0-indexed vs 1-indexed)", - "- Integer overflow — use `long long` for anything that could exceed 2^31", - "- Reading input in the wrong order or format", - "- Not handling the edge case where N=1 or the input is minimal", - "- Rewriting the entire solution when a small fix would work", - "", - "## When to retreat", - "", - "- If you've edited and tested 5+ times for the same bug without progress, STOP.", - "- Switch to a simpler algorithm that is guaranteed correct, even if slower.", - "- A correct brute-force scoring 30% beats a broken clever solution scoring 0%.", - "- Partial credit is real: every test case you pass counts.", - "", - ] - (workdir / "CLAUDE.md").write_text("\n".join(lines), encoding="utf-8") + parts.append(CLAUDE_MD_FULL_STANDARD) + parts.append(CLAUDE_MD_FOOTER) + (workdir / "CLAUDE.md").write_text("\n".join(parts), encoding="utf-8") def extract_solution_cpp(workdir: Path) -> str: From a1f9fae324e40f5e1d39a72ff38d527ec09df550 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Thu, 16 Apr 2026 17:08:47 +0000 Subject: [PATCH 11/16] refactor: move workflow/testing guidance from prompt to CLAUDE.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prompt (initial message) is now lean — only problem-specific info (path, type, limits). CLAUDE.md carries persistent guidance that survives context compaction: self-testing methodology, workflow steps, common mistakes, retreat strategy. 
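A sketch of the resulting prompt shape (the problem path is hypothetical; the
assertions mirror the parity tests updated in this patch):

```python
from frontier_cs.gen.agent_interface import build_agent_prompt

# Parity mode: the initial message now carries only problem-specific
# facts (directory, limits, case count, problem type) and closes by
# pointing the agent at CLAUDE.md and statement.txt in its workdir.
prompt = build_agent_prompt("/work/problems/p123", parity=True)  # hypothetical path
assert "CLAUDE.md" in prompt
assert "statement.txt" in prompt
assert "test_all.sh" not in prompt  # parity prompts never reference helper scripts
```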
--- src/frontier_cs/gen/agent_constants.py | 265 +++++++++++-------------- src/frontier_cs/gen/agent_interface.py | 37 ++-- tests/test_agent_interface.py | 49 +++-- 3 files changed, 162 insertions(+), 189 deletions(-) diff --git a/src/frontier_cs/gen/agent_constants.py b/src/frontier_cs/gen/agent_constants.py index 6b23d915..060fe419 100644 --- a/src/frontier_cs/gen/agent_constants.py +++ b/src/frontier_cs/gen/agent_constants.py @@ -150,7 +150,7 @@ """ # --------------------------------------------------------------------------- -# Prompt templates +# Prompt templates (initial message — problem-specific, may get compacted) # --------------------------------------------------------------------------- # Parity prompt (default): agent gets NO test data, must self-test @@ -159,19 +159,14 @@ Problem directory: {problem_dir} - Read statement.txt for the full problem description - Time limit: {time_limit}, Memory limit: {memory_limit} -- Total test cases: {total_cases} (your score = fraction passed) -- Scoring is partial: 0-100% based on test cases passed""" +- Total test cases: {total_cases} +- Scoring is partial: 0-100% based on fraction of test cases passed""" PARITY_INTERACTIVE_SECTION = """ ## Problem type: INTERACTIVE This is an interactive problem. Your solution communicates with a hidden judge -via stdin/stdout. You do NOT read from files. - -**CRITICAL:** -- Flush stdout after EVERY output line: `cout << endl;` or `cout << flush;` -- Read the problem statement carefully for the exact query/response protocol -- Count your queries against the stated limit""" +via stdin/stdout. You do NOT read from files.""" PARITY_SPJ_SECTION = """ ## Problem type: SPECIAL JUDGE @@ -184,43 +179,9 @@ Your output must match the expected output exactly (whitespace-normalized).""" -PARITY_SCORING_AND_WORKFLOW = """ -## Scoring - -Your score is the fraction of test cases passed (0-100%). -- There are {total_cases} test cases total -- Partial credit counts — passing 7/10 cases = 70% score -- A correct-but-slow solution that passes small cases is MUCH better than a broken fast one -- Prioritize CORRECTNESS over optimality - -## Self-testing - -No test data or test scripts are provided. You must validate your own solution: - -1. **Write a brute-force reference solution** (even if O(n!) or exponential) that you are - confident is correct for small inputs. -2. **Write a random test generator** that produces valid inputs within the problem constraints. -3. **Cross-validate (对拍):** Run both solutions on hundreds of random small inputs and compare - outputs. Fix any discrepancies by debugging your main solution against the brute-force. -4. **Stress test:** Generate larger random inputs to check for TLE, MLE, or crashes. -5. **Edge cases:** Manually test minimum inputs (N=1, empty, etc.) and boundary values. - -This self-testing approach is standard competitive programming practice. Do NOT skip it. - -## Workflow - -1. Read the FULL problem statement carefully. Re-read the constraints and edge cases. -2. Understand the I/O format from the examples in the problem statement. -3. Design your algorithm. Think about time complexity vs the constraints. -4. Write a SIMPLE correct solution first — brute force is fine for a first version. -5. Write a separate brute-force and test generator, then cross-validate. -6. Once confident in correctness: optimize for performance if needed. -7. Stress test with larger inputs before finalizing. - -**Retreat strategy:** If stuck on the same bug for 5+ cycles, switch to a simpler -algorithm. 
A correct brute-force scoring 30% beats a broken solution scoring 0%. - -Submit your final solution as solution.cpp in the current working directory.""" +PARITY_TAIL = """ +Read the CLAUDE.md in this directory for compilation, testing, and workflow guidance. +Begin by reading the full problem statement in statement.txt.""" # Full-access prompt (parity=False): agent gets test data and helper scripts FULL_ACCESS_PROMPT = """You are solving a competitive programming problem. @@ -228,7 +189,8 @@ Problem directory: {problem_dir} - Read statement.txt for the full problem description - Time limit: {time_limit}, Memory limit: {memory_limit} -- Total hidden test cases: {total_cases} (your score = fraction passed) +- Total hidden test cases: {total_cases} +- Scoring is partial: 0-100% based on fraction of test cases passed - testdata/ contains sample test cases — these are a SUBSET of the hidden tests""" FULL_ACCESS_INTERACTIVE_SECTION = """ @@ -240,145 +202,148 @@ Key files provided: - interactor.cc — the judge interactor (uses testlib.h, both provided) -- testdata/*.in — interactor input seeds (NOT your stdin) +- testdata/*.in — interactor input seeds (NOT your stdin)""" -**CRITICAL for interactive problems:** -- You MUST flush stdout after EVERY output line: use `cout << endl;` or `cout << flush;` -- Read the interactor source code to understand the exact protocol (what it sends, what it expects) -- Count your queries carefully against the stated limit +FULL_ACCESS_STANDARD_SECTION = """ +## Problem type: {problem_type} -**Testing interactive solutions locally:** -Use the provided `./run_interactive.sh` script: -```bash -./run_interactive.sh 1 # Test with sample 1 -./run_interactive.sh 2 # Test with sample 2 -# Run all samples: -for i in testdata/*.in; do ./run_interactive.sh $(basename $i .in); done -``` +Use `./test_all.sh` to compile and test against all samples.{checker_note}""" -If `run_interactive.sh` times out (exit code 4), it usually means a deadlock: -- Missing `flush` / `endl` on your output -- Reading when the interactor expects you to write, or vice versa -- Exceeding the query limit (interactor stops responding) - -**Fallback testing:** If the shell script doesn't work, write a Python wrapper: -```python -import subprocess, os -proc_sol = subprocess.Popen(['./solution'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) -proc_int = subprocess.Popen(['./interactor', 'testdata/1.in', '/dev/null', 'testdata/1.ans'], - stdin=proc_sol.stdout, stdout=proc_sol.stdin) -proc_int.wait(); proc_sol.wait() -print(f"interactor exit: {{proc_int.returncode}}") -``` +FULL_ACCESS_TAIL = """ +Read the CLAUDE.md in this directory for compilation, testing, and workflow guidance. +Begin by reading the full problem statement in statement.txt.""" -IMPORTANT: You MUST test your solution locally before finalizing. Do NOT submit untested code.""" +# --------------------------------------------------------------------------- +# CLAUDE.md content (persistent system context — survives compaction) +# --------------------------------------------------------------------------- -FULL_ACCESS_STANDARD_SECTION = """ -## Problem type: {problem_type} +CLAUDE_MD_PARITY = """# Competitive Programming — Agent Workspace + +## Deliverable + +Write your solution to `solution.cpp` in this directory. Nothing else is evaluated. 
+ +## Compilation -**Testing your solution locally:** -Use the provided `./test_all.sh` script: ```bash -./test_all.sh # Compiles solution.cpp and runs against ALL samples +g++ -std=gnu++17 -O2 -o solution solution.cpp ``` -This compiles, runs each sample, and compares output. Always run this before finalizing.{checker_note}""" -FULL_ACCESS_SCORING_SECTION = """ ## Scoring -Your score is the fraction of hidden test cases passed (0-100%). -- There are {total_cases} hidden test cases total -- Partial credit counts — passing 7/10 cases = 70% score -- A correct-but-slow solution that passes small cases is MUCH better than a broken fast one -- Prioritize CORRECTNESS over optimality. Get a working solution first, then optimize.""" - -FULL_ACCESS_WORKFLOW = """## Workflow +Your score = fraction of test cases passed (0-100%). +- Partial credit counts — passing 7/10 cases = 70% +- A correct-but-slow solution passing small cases is MUCH better than a broken fast one +- Prioritize CORRECTNESS over optimality -1. Read the FULL problem statement carefully. Re-read the constraints and edge cases. -2. Read ALL sample test cases and understand the expected I/O format. -3. Design your algorithm. Think about time complexity vs the constraints. -4. Write a SIMPLE correct solution first — brute force is fine for a first version. -5. Compile and test against ALL samples using the provided test script. -6. If samples fail: debug by examining the diff, don't just rewrite everything. -7. Once samples pass: think about edge cases and whether your algorithm handles large inputs. -8. Optimize only after correctness is established. +## Self-testing (no test data provided) -**Critical rules:** -- Do NOT rewrite your solution from scratch more than once. Incremental edits preserve working logic. -- Do NOT skip local testing. Every change must be tested before you move on. -- Do NOT submit without running test_all.sh (or run_interactive.sh for interactive). -- If you TLE on large cases, profile the bottleneck — don't simplify the entire algorithm. +You must validate your own solution: -**Retreat strategy — know when to simplify:** -- If you've been debugging the SAME bug for more than 5 edit-test cycles without progress, - STOP and switch to a fundamentally simpler approach. A correct brute-force that passes - small cases is worth more than a broken optimized solution that passes nothing. -- If your approach is off by a small constant (e.g., exceeding a limit by 1), consider whether - a completely different algorithm would avoid the issue rather than patching endlessly. -- Remember: partial credit exists. A solution scoring 30% is infinitely better than 0%. - When in doubt, submit what works even if it's suboptimal. +1. **Brute-force reference**: Write a simple, obviously correct solution (even O(n!) is fine) +2. **Random test generator**: Produce valid inputs within the problem constraints +3. **Cross-validate (对拍)**: Run both solutions on hundreds of random small inputs, compare outputs. + Fix any discrepancy by debugging your main solution against the brute-force. +4. **Stress test**: Generate larger inputs to check for TLE/MLE/crashes +5. **Edge cases**: Test minimum inputs (N=1, empty) and boundary values -Submit your final solution as solution.cpp in the current working directory.""" +Do NOT skip self-testing. This is standard competitive programming practice. 
-# --------------------------------------------------------------------------- -# CLAUDE.md content -# --------------------------------------------------------------------------- +## Workflow -CLAUDE_MD_HEADER = """# Agent Eval — Working Directory +1. Read the FULL problem statement. Re-read constraints and edge cases. +2. Understand the I/O format from the examples in the statement. +3. Design your algorithm. Consider time complexity vs constraints. +4. Write a SIMPLE correct solution first — brute force is fine initially. +5. Write brute-force + generator, then cross-validate. +6. Optimize for performance only after correctness is confirmed. +7. Stress test with larger inputs before finalizing. -You are solving a competitive programming problem in this directory. +## Common mistakes -## Rules +- Integer overflow — use `long long` for anything that could exceed 2^31 +- Off-by-one errors in array indexing (0-indexed vs 1-indexed) +- Reading input in the wrong order or format +- Not handling N=1 or minimal input edge cases -- Your ONLY deliverable is `solution.cpp` in this directory. -- Use C++17 (g++ -std=gnu++17). -- Always compile with `-O2` for performance testing. -- Read the problem statement COMPLETELY before writing any code. +## When to retreat -## Testing +- If you've been debugging the SAME bug for 5+ edit-test cycles, STOP. +- Switch to a simpler algorithm that is guaranteed correct, even if slower. +- A correct brute-force scoring 30% beats a broken optimized solution scoring 0%. +- Do NOT rewrite from scratch more than once. Incremental edits preserve working logic. """ -CLAUDE_MD_PARITY_TESTING = """No test data or test scripts are provided. -Write your own brute-force solution + random test generator to cross-validate. -This is standard competitive programming practice (对拍). -""" +CLAUDE_MD_PARITY_INTERACTIVE_ADDENDUM = """ +## Interactive problem -CLAUDE_MD_PARITY_INTERACTIVE = """This is an INTERACTIVE problem. -- `cout << endl;` or `cout << flush;` after EVERY line you output -- Read the problem statement to understand the exact protocol -- Count queries against the stated limit +- Flush stdout after EVERY output line: `cout << endl;` or `cout << flush;` +- Read the problem statement carefully for the exact query/response protocol +- Count your queries against the stated limit +- For self-testing: simulate the interactor in a separate program and connect via pipes """ -CLAUDE_MD_FULL_INTERACTIVE = """This is an INTERACTIVE problem. Use `./run_interactive.sh N` to test sample N. -Do NOT skip interactive testing — protocol bugs are the #1 failure mode. +CLAUDE_MD_FULL_ACCESS = """# Competitive Programming — Agent Workspace -### Interactive protocol checklist -- `cout << endl;` or `cout << flush;` after EVERY line you output -- Read the interactor source code to know the exact send/receive order -- Count queries against the stated limit -- If run_interactive.sh times out: you likely have a deadlock (missing flush or wrong protocol) -- Fallback: write a Python subprocess wrapper if the shell script fails -""" +## Deliverable + +Write your solution to `solution.cpp` in this directory. Nothing else is evaluated. + +## Compilation + +```bash +g++ -std=gnu++17 -O2 -o solution solution.cpp +``` + +## Scoring + +Your score = fraction of hidden test cases passed (0-100%). 
+- Partial credit counts — passing 7/10 cases = 70% +- A correct-but-slow solution passing small cases is MUCH better than a broken fast one +- Prioritize CORRECTNESS over optimality + +## Testing + +Use the provided test scripts to validate before finalizing: + +```bash +./test_all.sh # Standard problems: compile + test all samples +./run_interactive.sh 1 # Interactive problems: test sample 1 +``` -CLAUDE_MD_FULL_STANDARD = """Use `./test_all.sh` to compile and test against all samples. If chk.cc exists, test_all.sh uses it as a special judge automatically. -Fix any failing samples before moving on to optimization. -""" -CLAUDE_MD_FOOTER = """ -## Common mistakes to avoid +## Workflow + +1. Read the FULL problem statement. Re-read constraints and edge cases. +2. Read ALL sample test cases and understand expected I/O format. +3. Design your algorithm. Consider time complexity vs constraints. +4. Write a SIMPLE correct solution first — brute force is fine initially. +5. Compile and test against ALL samples using the provided script. +6. If samples fail: debug by examining the diff, don't just rewrite everything. +7. Once samples pass: consider edge cases and large input performance. +8. Optimize only after correctness is established. + +## Interactive problems + +- Flush stdout after EVERY output line: `cout << endl;` or `cout << flush;` +- Read the interactor source code to understand the exact protocol +- Count queries against the stated limit +- If run_interactive.sh times out: likely a deadlock (missing flush or wrong protocol) +- Fallback: write a Python subprocess wrapper if the shell script doesn't work + +## Common mistakes - Forgetting to flush stdout in interactive problems -- Off-by-one errors in array indexing (0-indexed vs 1-indexed) - Integer overflow — use `long long` for anything that could exceed 2^31 +- Off-by-one errors in array indexing (0-indexed vs 1-indexed) - Reading input in the wrong order or format -- Not handling the edge case where N=1 or the input is minimal -- Rewriting the entire solution when a small fix would work +- Not handling N=1 or minimal input edge cases ## When to retreat -- If you've edited and tested 5+ times for the same bug without progress, STOP. +- If you've been debugging the SAME bug for 5+ edit-test cycles, STOP. - Switch to a simpler algorithm that is guaranteed correct, even if slower. -- A correct brute-force scoring 30% beats a broken clever solution scoring 0%. -- Partial credit is real: every test case you pass counts. +- A correct brute-force scoring 30% beats a broken optimized solution scoring 0%. +- Do NOT rewrite from scratch more than once. Incremental edits preserve working logic. 
""" diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index 907a53eb..d7790a99 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -25,22 +25,18 @@ import yaml from frontier_cs.gen.agent_constants import ( - CLAUDE_MD_FOOTER, - CLAUDE_MD_FULL_INTERACTIVE, - CLAUDE_MD_FULL_STANDARD, - CLAUDE_MD_HEADER, - CLAUDE_MD_PARITY_INTERACTIVE, - CLAUDE_MD_PARITY_TESTING, + CLAUDE_MD_FULL_ACCESS, + CLAUDE_MD_PARITY, + CLAUDE_MD_PARITY_INTERACTIVE_ADDENDUM, FULL_ACCESS_INTERACTIVE_SECTION, FULL_ACCESS_PROMPT, - FULL_ACCESS_SCORING_SECTION, FULL_ACCESS_STANDARD_SECTION, - FULL_ACCESS_WORKFLOW, + FULL_ACCESS_TAIL, PARITY_INTERACTIVE_SECTION, PARITY_PROMPT, - PARITY_SCORING_AND_WORKFLOW, PARITY_SPJ_SECTION, PARITY_STANDARD_SECTION, + PARITY_TAIL, RUN_INTERACTIVE_SH, TEST_ALL_SH, ) @@ -148,24 +144,19 @@ def build_agent_prompt(problem_dir: str, *, parity: bool = True) -> str: checker_note = "" if has_checker: checker_note = ("\nNote: This problem has a SPECIAL JUDGE (chk.cc) — " - "multiple valid outputs may be accepted.\n" - "`test_all.sh` will automatically compile and use the " - "checker for validation.\nIf the checker reports PASS but " - "the output looks different from the .ans file, that's fine.") + "multiple valid outputs may be accepted.") problem_type = "SPECIAL JUDGE (multiple valid outputs accepted)" if has_checker else "STANDARD" parts.append(FULL_ACCESS_STANDARD_SECTION.format( problem_type=problem_type, checker_note=checker_note, )) - parts.append(FULL_ACCESS_SCORING_SECTION.format(total_cases=total_cases)) - sample_text = _format_samples(samples, is_interactive) if sample_text: parts.append(sample_text) elif samples: parts.append("\n(Sample inputs are large — read them from testdata/ directory.)\n") - parts.append(FULL_ACCESS_WORKFLOW) + parts.append(FULL_ACCESS_TAIL) return "\n".join(parts) @@ -196,7 +187,7 @@ def _build_parity_prompt( else: parts.append(PARITY_STANDARD_SECTION) - parts.append(PARITY_SCORING_AND_WORKFLOW.format(total_cases=total_cases)) + parts.append(PARITY_TAIL) return "\n".join(parts) @@ -215,17 +206,13 @@ def _write_helper_scripts(workdir: Path, is_interactive: bool) -> None: def _write_workdir_claude_md(workdir: Path, is_interactive: bool, *, parity: bool = True) -> None: """Write a CLAUDE.md to the workdir so Claude Code picks up behavioral guidance.""" - parts = [CLAUDE_MD_HEADER] if parity: - parts.append(CLAUDE_MD_PARITY_TESTING) + content = CLAUDE_MD_PARITY if is_interactive: - parts.append(CLAUDE_MD_PARITY_INTERACTIVE) - elif is_interactive: - parts.append(CLAUDE_MD_FULL_INTERACTIVE) + content += CLAUDE_MD_PARITY_INTERACTIVE_ADDENDUM else: - parts.append(CLAUDE_MD_FULL_STANDARD) - parts.append(CLAUDE_MD_FOOTER) - (workdir / "CLAUDE.md").write_text("\n".join(parts), encoding="utf-8") + content = CLAUDE_MD_FULL_ACCESS + (workdir / "CLAUDE.md").write_text(content, encoding="utf-8") def extract_solution_cpp(workdir: Path) -> str: diff --git a/tests/test_agent_interface.py b/tests/test_agent_interface.py index 34f27037..6265c5f7 100644 --- a/tests/test_agent_interface.py +++ b/tests/test_agent_interface.py @@ -75,23 +75,20 @@ def test_build_agent_prompt_standard(): prompt = build_agent_prompt(str(pdir), parity=False) assert "test_all.sh" in prompt assert "STANDARD" in prompt or "SPECIAL JUDGE" in prompt - assert "solution.cpp" in prompt - assert "Scoring" in prompt - assert "fraction" in prompt.lower() or "partial" in prompt.lower() + assert "partial" in prompt.lower() # 
Samples should be embedded (they're tiny) assert "Sample 1" in prompt def test_build_agent_prompt_interactive(): - """Interactive problem prompt includes interactor guidance and run_interactive.sh.""" + """Interactive problem prompt includes interactor guidance.""" from frontier_cs.gen.agent_interface import build_agent_prompt with tempfile.TemporaryDirectory() as tmpdir: pdir = _make_problem_dir(tmpdir, interactive=True) prompt = build_agent_prompt(str(pdir), parity=False) assert "INTERACTIVE" in prompt - assert "run_interactive.sh" in prompt - assert "flush" in prompt.lower() or "pipe" in prompt.lower() + assert "interactor.cc" in prompt def test_build_agent_prompt_embeds_small_samples(): @@ -132,20 +129,19 @@ def test_build_agent_prompt_parity_no_test_refs(): assert "Sample 1" not in prompt assert "chk.cc" not in prompt assert "interactor.cc" not in prompt - # Should mention self-testing - assert "brute-force" in prompt.lower() or "brute force" in prompt.lower() - assert "solution.cpp" in prompt + # Prompt is lean — delegates to CLAUDE.md + assert "CLAUDE.md" in prompt + assert "statement.txt" in prompt def test_build_agent_prompt_parity_interactive(): - """Parity mode interactive prompt mentions flush but not interactor source.""" + """Parity mode interactive prompt identifies type but delegates details to CLAUDE.md.""" from frontier_cs.gen.agent_interface import build_agent_prompt with tempfile.TemporaryDirectory() as tmpdir: pdir = _make_problem_dir(tmpdir, interactive=True) prompt = build_agent_prompt(str(pdir), parity=True) assert "INTERACTIVE" in prompt - assert "flush" in prompt.lower() assert "run_interactive.sh" not in prompt assert "interactor.cc" not in prompt @@ -247,16 +243,41 @@ def test_write_workdir_claude_md_standard(): _write_workdir_claude_md(workdir, is_interactive=False, parity=False) content = (workdir / "CLAUDE.md").read_text() assert "test_all.sh" in content - assert "run_interactive.sh" not in content + assert "solution.cpp" in content def test_write_workdir_claude_md_interactive(): - """CLAUDE.md for interactive problems mentions flush and run_interactive.sh.""" + """CLAUDE.md for interactive problems mentions flush.""" from frontier_cs.gen.agent_interface import _write_workdir_claude_md with tempfile.TemporaryDirectory() as tmpdir: workdir = Path(tmpdir) _write_workdir_claude_md(workdir, is_interactive=True, parity=False) content = (workdir / "CLAUDE.md").read_text() - assert "run_interactive.sh" in content assert "flush" in content + + +def test_write_workdir_claude_md_parity(): + """Parity CLAUDE.md has self-testing guidance, no test script refs.""" + from frontier_cs.gen.agent_interface import _write_workdir_claude_md + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) + _write_workdir_claude_md(workdir, is_interactive=False, parity=True) + content = (workdir / "CLAUDE.md").read_text() + assert "brute-force" in content.lower() or "brute force" in content.lower() + assert "solution.cpp" in content + assert "test_all.sh" not in content + assert "run_interactive.sh" not in content + + +def test_write_workdir_claude_md_parity_interactive(): + """Parity interactive CLAUDE.md has flush guidance.""" + from frontier_cs.gen.agent_interface import _write_workdir_claude_md + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) + _write_workdir_claude_md(workdir, is_interactive=True, parity=True) + content = (workdir / "CLAUDE.md").read_text() + assert "flush" in content + assert "run_interactive.sh" not in content From 
b31db0a14877feeb471b71d8ae0d4b14957474ee Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Fri, 17 Apr 2026 04:17:16 +0000 Subject: [PATCH 12/16] refactor: soften scoring and retreat guidance to give agents more judgment room --- src/frontier_cs/gen/agent_constants.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/frontier_cs/gen/agent_constants.py b/src/frontier_cs/gen/agent_constants.py index 060fe419..6dcd9d9f 100644 --- a/src/frontier_cs/gen/agent_constants.py +++ b/src/frontier_cs/gen/agent_constants.py @@ -233,8 +233,7 @@ Your score = fraction of test cases passed (0-100%). - Partial credit counts — passing 7/10 cases = 70% -- A correct-but-slow solution passing small cases is MUCH better than a broken fast one -- Prioritize CORRECTNESS over optimality +- Start with a correct solution, then optimize. A working brute-force is a good fallback, but aim for the efficient solution when you can. ## Self-testing (no test data provided) @@ -268,8 +267,7 @@ ## When to retreat -- If you've been debugging the SAME bug for 5+ edit-test cycles, STOP. -- Switch to a simpler algorithm that is guaranteed correct, even if slower. +- If you're going in circles on the same bug with no new insight, consider falling back to a simpler algorithm. - A correct brute-force scoring 30% beats a broken optimized solution scoring 0%. - Do NOT rewrite from scratch more than once. Incremental edits preserve working logic. """ @@ -299,8 +297,7 @@ Your score = fraction of hidden test cases passed (0-100%). - Partial credit counts — passing 7/10 cases = 70% -- A correct-but-slow solution passing small cases is MUCH better than a broken fast one -- Prioritize CORRECTNESS over optimality +- Start with a correct solution, then optimize. A working brute-force is a good fallback, but aim for the efficient solution when you can. ## Testing @@ -342,8 +339,7 @@ ## When to retreat -- If you've been debugging the SAME bug for 5+ edit-test cycles, STOP. -- Switch to a simpler algorithm that is guaranteed correct, even if slower. +- If you're going in circles on the same bug with no new insight, consider falling back to a simpler algorithm. - A correct brute-force scoring 30% beats a broken optimized solution scoring 0%. - Do NOT rewrite from scratch more than once. Incremental edits preserve working logic. """ From 198505043f6cc4dffd30f7a154e6ded136e9ccf9 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Fri, 17 Apr 2026 11:31:37 +0000 Subject: [PATCH 13/16] feat: align timeout and cost limit with Harbor adapter defaults --- src/frontier_cs/gen/agent_interface.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index d7790a99..1cee7176 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -43,9 +43,9 @@ logger = logging.getLogger(__name__) -# Default budget limits -DEFAULT_COST_LIMIT_USD = 20.0 -DEFAULT_TIMEOUT_SECONDS = 1200 # 20 minutes +# Default budget limits — aligned with Harbor adapter (task.toml agent.timeout_sec=3600) +DEFAULT_COST_LIMIT_USD = None # None = no limit; Harbor relies on timeout, not cost cap +DEFAULT_TIMEOUT_SECONDS = 3600 # 1 hour, matching Harbor # Max size of sample I/O to embed directly in the prompt (bytes). # Larger inputs are left for the agent to read from disk. 
@@ -316,7 +316,7 @@ async def run_agent( problem_dir: str, model: str, *, - cost_limit: float = DEFAULT_COST_LIMIT_USD, + cost_limit: Optional[float] = DEFAULT_COST_LIMIT_USD, timeout: float = DEFAULT_TIMEOUT_SECONDS, transcript_path: Optional[Path] = None, parity: bool = True, @@ -326,7 +326,7 @@ async def run_agent( Args: problem_dir: Absolute path to the problem directory. model: Base model name (without -agent suffix). - cost_limit: Maximum cost in USD. + cost_limit: Maximum cost in USD. None = no limit. timeout: Maximum wall-clock time in seconds. transcript_path: Path for JSONL transcript log. None to skip. parity: If True, strip test data and helper scripts (Harbor parity mode). @@ -529,7 +529,7 @@ def generate_agent_solution( problem_dir: str, model: str, *, - cost_limit: float = DEFAULT_COST_LIMIT_USD, + cost_limit: Optional[float] = DEFAULT_COST_LIMIT_USD, timeout: float = DEFAULT_TIMEOUT_SECONDS, transcript_path: Optional[Path] = None, parity: bool = True, @@ -541,7 +541,7 @@ def generate_agent_solution( Args: problem_dir: Absolute path to the problem directory. model: Base model name (without -agent suffix). - cost_limit: Maximum cost in USD. + cost_limit: Maximum cost in USD. None = no limit. timeout: Maximum wall-clock time in seconds. transcript_path: Path for JSONL transcript log. parity: If True, strip test data and helper scripts (Harbor parity mode). From 7a30fd957318a62462ca0309185ff5c560993495 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Fri, 17 Apr 2026 11:32:31 +0000 Subject: [PATCH 14/16] fix: align CLI defaults for agent-timeout and agent-cost-limit with Harbor --- algorithmic/scripts/generate_solutions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/algorithmic/scripts/generate_solutions.py b/algorithmic/scripts/generate_solutions.py index d2b9ed33..64c9bf58 100644 --- a/algorithmic/scripts/generate_solutions.py +++ b/algorithmic/scripts/generate_solutions.py @@ -301,10 +301,10 @@ def main(): help="Maximum parallel generations") # Agent-specific parameters - parser.add_argument("--agent-timeout", type=float, default=1200.0, - help="Agent timeout in seconds (default: 1200 = 20 min)") - parser.add_argument("--agent-cost-limit", type=float, default=20.0, - help="Agent max cost per problem in USD (default: 20)") + parser.add_argument("--agent-timeout", type=float, default=3600.0, + help="Agent timeout in seconds (default: 3600 = 1 hour)") + parser.add_argument("--agent-cost-limit", type=float, default=None, + help="Agent max cost per problem in USD (default: no limit)") args = parser.parse_args() From e1b965c5a604e498562015084e358edb0b0cf494 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 27 Apr 2026 12:04:59 +0000 Subject: [PATCH 15/16] fix: pass api_key per-run and skip judge when all models are agents - generate_solutions.py: detect all-agent runs and skip judge availability check; read statement from local file instead of judge API in agent mode; pass api_key through to generate_agent_solution for key pool rotation - agent_interface.py: add api_key parameter to run_agent/generate_agent_solution, forwarded to SDK subprocess env for per-run key rotation - api_keys.py: only count API-level errors (rate limit, 5xx, auth) toward backoff; application-level failures (agent timeout, no solution) no longer penalize the key --- algorithmic/scripts/generate_solutions.py | 19 ++++++++++++++----- src/frontier_cs/gen/agent_interface.py | 13 +++++++++++++ src/frontier_cs/gen/api_keys.py | 18 ++++++++++++++++-- 3 files changed, 43 insertions(+), 7 
deletions(-) diff --git a/algorithmic/scripts/generate_solutions.py b/algorithmic/scripts/generate_solutions.py index 64c9bf58..58af43e7 100644 --- a/algorithmic/scripts/generate_solutions.py +++ b/algorithmic/scripts/generate_solutions.py @@ -312,10 +312,13 @@ def main(): # Output directory for algorithmic solutions output_dir = algo_dir / "solutions" + # Detect if all models are agent-only (no judge needed) + all_agent = args.models and all(m.endswith("-agent") for m in args.models) + # Initialize judge client judge = AlgorithmicJudgeClient(args.judge_url) - if not judge.is_available(): + if not all_agent and not judge.is_available(): print(f"{red('ERROR:')} Judge server not available at {args.judge_url}") print("Start the judge with: cd algorithmic && docker compose up -d") sys.exit(1) @@ -399,14 +402,19 @@ def main(): skipped: List[str] = [] for problem_id in problem_ids: - statement = judge.get_problem_statement(problem_id) + # Resolve problem directory + problem_dir_path = algo_dir / "problems" / problem_id + + if all_agent: + # Agent mode reads statement from local file; no judge needed + stmt_path = problem_dir_path / "statement.txt" + statement = stmt_path.read_text(encoding="utf-8") if stmt_path.exists() else "" + else: + statement = judge.get_problem_statement(problem_id) if not statement: print(f"{yellow('WARNING:')} Could not get statement for problem {problem_id}") continue - # Resolve problem directory for agent models - problem_dir_path = algo_dir / "problems" / problem_id - for model in models_list: model_prefix = get_model_prefix(model) provider = detect_provider(model) @@ -523,6 +531,7 @@ def execute_task(task: GenerationTask) -> Tuple[str, str, Optional[str], str, Op code, metadata = generate_agent_solution( problem_dir=task.problem_dir, model=base_model, + api_key=api_key, cost_limit=args.agent_cost_limit, timeout=args.agent_timeout, transcript_path=transcript_path, diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index 1cee7176..7ca4e28f 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -316,6 +316,7 @@ async def run_agent( problem_dir: str, model: str, *, + api_key: Optional[str] = None, cost_limit: Optional[float] = DEFAULT_COST_LIMIT_USD, timeout: float = DEFAULT_TIMEOUT_SECONDS, transcript_path: Optional[Path] = None, @@ -326,6 +327,9 @@ async def run_agent( Args: problem_dir: Absolute path to the problem directory. model: Base model name (without -agent suffix). + api_key: Anthropic API key. If provided, passed to the SDK subprocess + via env (per-run), allowing pool-managed key rotation. If None, + the SDK falls back to inheriting ANTHROPIC_API_KEY from the parent. cost_limit: Maximum cost in USD. None = no limit. timeout: Maximum wall-clock time in seconds. transcript_path: Path for JSONL transcript log. None to skip. 
@@ -377,9 +381,14 @@ async def run_agent( prompt = build_agent_prompt(str(workdir), parity=parity) + sdk_env: Dict[str, str] = {} + if api_key: + sdk_env["ANTHROPIC_API_KEY"] = api_key + options = ClaudeAgentOptions( model=model, cwd=str(workdir), + env=sdk_env, max_budget_usd=cost_limit, permission_mode="bypassPermissions", include_partial_messages=True, @@ -529,6 +538,7 @@ def generate_agent_solution( problem_dir: str, model: str, *, + api_key: Optional[str] = None, cost_limit: Optional[float] = DEFAULT_COST_LIMIT_USD, timeout: float = DEFAULT_TIMEOUT_SECONDS, transcript_path: Optional[Path] = None, @@ -541,6 +551,8 @@ def generate_agent_solution( Args: problem_dir: Absolute path to the problem directory. model: Base model name (without -agent suffix). + api_key: Anthropic API key (passed to SDK subprocess env, per-run). + If None, the SDK inherits ANTHROPIC_API_KEY from the parent process. cost_limit: Maximum cost in USD. None = no limit. timeout: Maximum wall-clock time in seconds. transcript_path: Path for JSONL transcript log. @@ -553,6 +565,7 @@ def generate_agent_solution( run_agent( problem_dir, model, + api_key=api_key, cost_limit=cost_limit, timeout=timeout, transcript_path=transcript_path, diff --git a/src/frontier_cs/gen/api_keys.py b/src/frontier_cs/gen/api_keys.py index 2fd67680..db9d3759 100644 --- a/src/frontier_cs/gen/api_keys.py +++ b/src/frontier_cs/gen/api_keys.py @@ -114,23 +114,37 @@ def report_success(self, idx: Optional[int]) -> None: state["backoff_until"] = 0.0 def report_failure(self, idx: Optional[int], error: Optional[str]) -> None: - """Report failed API call for a key.""" + """Report failed task that used this key. + + Only API-key-related errors (auth, rate limit, server-side) trigger + disable/backoff. Application-level errors (e.g. agent timeout, no + solution produced) leave the key untouched, since they say nothing + about key health. + """ if idx is None: return with self._lock: if not (0 <= idx < len(self._states)): return state = self._states[idx] - state["failures"] += 1 reason = (error or "").lower() + fatal_markers = ("invalid", "unauthorized", "forbidden", "permission", "auth") if any(marker in reason for marker in fatal_markers): if not state["disabled"]: logger.warning(f"Disabling API key for {self.name}: invalid/unauthorized") state["disabled"] = True state["backoff_until"] = float("inf") + state["failures"] += 1 return + api_markers = ("rate limit", "rate_limit", "429", "503", "502", "500", + "overloaded", "quota", "throttle", "connection") + if not any(marker in reason for marker in api_markers): + # Not an API/key issue — leave key state unchanged. 
+ return + + state["failures"] += 1 delay: int = min(600, 60 * state["failures"]) state["backoff_until"] = max(state["backoff_until"], time.time() + delay) logger.info(f"Backing off {delay:.0f}s for {self.name} key (failures={state['failures']})") From 6b6fdb2a7aacb68b19947ff21ebb39497689cd84 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 27 Apr 2026 12:08:22 +0000 Subject: [PATCH 16/16] fix: align timeout default (10800s) and prompt wording with Harbor adapter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - DEFAULT_TIMEOUT_SECONDS: 3600 → 10800 to match Harbor task.toml - CLI --agent-timeout default: 3600 → 10800 - PARITY_TAIL / FULL_ACCESS_TAIL: drop stale article "the" before CLAUDE.md to match Harbor's "Read AGENT.md" phrasing --- algorithmic/scripts/generate_solutions.py | 4 ++-- src/frontier_cs/gen/agent_constants.py | 4 ++-- src/frontier_cs/gen/agent_interface.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/algorithmic/scripts/generate_solutions.py b/algorithmic/scripts/generate_solutions.py index 58af43e7..6d695b96 100644 --- a/algorithmic/scripts/generate_solutions.py +++ b/algorithmic/scripts/generate_solutions.py @@ -301,8 +301,8 @@ def main(): help="Maximum parallel generations") # Agent-specific parameters - parser.add_argument("--agent-timeout", type=float, default=3600.0, - help="Agent timeout in seconds (default: 3600 = 1 hour)") + parser.add_argument("--agent-timeout", type=float, default=10800.0, + help="Agent timeout in seconds (default: 10800 = 3 hours)") parser.add_argument("--agent-cost-limit", type=float, default=None, help="Agent max cost per problem in USD (default: no limit)") diff --git a/src/frontier_cs/gen/agent_constants.py b/src/frontier_cs/gen/agent_constants.py index 6dcd9d9f..1b80194c 100644 --- a/src/frontier_cs/gen/agent_constants.py +++ b/src/frontier_cs/gen/agent_constants.py @@ -180,7 +180,7 @@ Your output must match the expected output exactly (whitespace-normalized).""" PARITY_TAIL = """ -Read the CLAUDE.md in this directory for compilation, testing, and workflow guidance. +Read CLAUDE.md in this directory for compilation, testing, and workflow guidance. Begin by reading the full problem statement in statement.txt.""" # Full-access prompt (parity=False): agent gets test data and helper scripts @@ -210,7 +210,7 @@ Use `./test_all.sh` to compile and test against all samples.{checker_note}""" FULL_ACCESS_TAIL = """ -Read the CLAUDE.md in this directory for compilation, testing, and workflow guidance. +Read CLAUDE.md in this directory for compilation, testing, and workflow guidance. Begin by reading the full problem statement in statement.txt.""" # --------------------------------------------------------------------------- diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index 7ca4e28f..26f0d9f8 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -43,9 +43,9 @@ logger = logging.getLogger(__name__) -# Default budget limits — aligned with Harbor adapter (task.toml agent.timeout_sec=3600) +# Default budget limits — aligned with Harbor adapter (task.toml agent.timeout_sec=10800) DEFAULT_COST_LIMIT_USD = None # None = no limit; Harbor relies on timeout, not cost cap -DEFAULT_TIMEOUT_SECONDS = 3600 # 1 hour, matching Harbor +DEFAULT_TIMEOUT_SECONDS = 10800 # 3 hours, matching Harbor # Max size of sample I/O to embed directly in the prompt (bytes). 
# Larger inputs are left for the agent to read from disk.
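With the series applied, the agent entry point defaults to Harbor's limits: a
10800 s timeout and no cost cap. A minimal driver sketch, with hypothetical
paths and the API key left to the environment:

```python
from pathlib import Path

from frontier_cs.gen.agent_interface import generate_agent_solution

# Sketch only: the problem and log paths below are illustrative assumptions.
code, metadata = generate_agent_solution(
    problem_dir="/work/problems/p123",        # hypothetical problem directory
    model="claude-opus-4-6",                  # base model name, without the -agent suffix
    api_key=None,                             # None: SDK inherits ANTHROPIC_API_KEY
    transcript_path=Path("logs/p123.jsonl"),  # JSONL transcript of the agent run
    parity=True,                              # Harbor parity: no test data or helper scripts
)
Path("solution.cpp").write_text(code, encoding="utf-8")
```

cost_limit and timeout are omitted here, so the module defaults apply (no cap,
10800 s); pass them explicitly to override a single run.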