From 509f5cb9d2ac7abfe30b5e5a49f5c429684fab33 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 6 Apr 2026 11:40:48 +0000 Subject: [PATCH 01/16] feat: add claude-agent-sdk dependency for agent eval --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index a7389580..317700e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,7 @@ description = "Evaluation framework for Frontier-CS problems" requires-python = ">=3.11" dependencies = [ "anthropic>=0.74.0", + "claude-agent-sdk>=0.1.0", "colorlog>=6.10.1", "datasets>=4.4.1", "google-genai>=1.55.0", From c8ee4aaadf61dcd62a89ebd44ab266e476dafd7c Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 6 Apr 2026 11:44:27 +0000 Subject: [PATCH 02/16] feat: handle -agent model suffix in model prefix and provider detection --- src/frontier_cs/models.py | 26 ++++++++++++++++++-------- tests/test_agent_interface.py | 25 +++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 8 deletions(-) create mode 100644 tests/test_agent_interface.py diff --git a/src/frontier_cs/models.py b/src/frontier_cs/models.py index bec6640c..03fd4b83 100644 --- a/src/frontier_cs/models.py +++ b/src/frontier_cs/models.py @@ -42,6 +42,12 @@ def get_model_prefix(model: str) -> str: """ original = model + # Strip and track -agent suffix + agent_suffix = "" + if model.endswith("-agent"): + agent_suffix = "agent" + model = model.removesuffix("-agent") + # Remove provider prefix if present (e.g., 'gemini/gemini-2.5-pro' -> 'gemini-2.5-pro') if "/" in model: model = model.split("/", 1)[1] @@ -51,21 +57,21 @@ def get_model_prefix(model: str) -> str: # Handle GPT-5 variants # Keep 'gpt-5.1', 'gpt-5.2' etc. distinct so their artifacts prefix correctly if model_lower.startswith("gpt-5.2") or model_lower.startswith("gpt5.2"): - return "gpt5.2" + return "gpt5.2" + agent_suffix if model_lower.startswith("gpt-5.1") or model_lower.startswith("gpt5.1"): - return "gpt5.1" + return "gpt5.1" + agent_suffix if model_lower.startswith("gpt-5") or model_lower.startswith("gpt5"): - return "gpt5" + return "gpt5" + agent_suffix # Handle Gemini 2.5 Pro variants if "gemini-2.5-pro" in model_lower or "gemini2.5pro" in model_lower: - return "gemini2.5pro" + return "gemini2.5pro" + agent_suffix # Handle other Gemini variants (e.g., gemini-1.5-pro -> gemini1.5pro) gemini_match = re.match(r"gemini-?(\d+\.?\d*)-?pro", model_lower) if gemini_match: version = gemini_match.group(1) - return f"gemini{version}pro" + return f"gemini{version}pro" + agent_suffix # Handle Claude variants (e.g., claude-sonnet-4-5-20250929 -> claude4.5sonnet) claude_match = re.match(r"claude-([a-z]+)-(\d+)-(\d+)", model_lower) @@ -73,19 +79,19 @@ def get_model_prefix(model: str) -> str: family = claude_match.group(1) major = claude_match.group(2) minor = claude_match.group(3) - return f"claude{major}.{minor}{family}" + return f"claude{major}.{minor}{family}" + agent_suffix # Handle Grok variants - keep 'fast' and 'reasoning' in the prefix if "grok" in model_lower: sanitized = re.sub(r"[^a-zA-Z0-9]+", "", model_lower) if sanitized: - return sanitized + return sanitized + agent_suffix # Default: sanitize by removing all non-alphanumeric characters sanitized = re.sub(r"[^a-zA-Z0-9]+", "", model_lower) if not sanitized: raise ValueError(f"Unable to derive model prefix from '{original}'") - return sanitized + return sanitized + agent_suffix def normalize_solution_name(name: str) -> str: @@ -217,6 +223,10 @@ def detect_provider(model: str) -> str: Returns: Provider name: 'openai', 
'google', 'anthropic', 'xai', 'deepseek', 'openrouter' """ + # Strip agent suffix before detection + if model.endswith("-agent"): + model = model.removesuffix("-agent") + normalized = model.strip() if "/" in normalized: provider_hint, actual_model = normalized.split("/", 1) diff --git a/tests/test_agent_interface.py b/tests/test_agent_interface.py new file mode 100644 index 00000000..96c19e05 --- /dev/null +++ b/tests/test_agent_interface.py @@ -0,0 +1,25 @@ +"""Tests for agent model name handling and agent_interface.""" + +from frontier_cs.models import get_model_prefix, detect_provider, is_reasoning_model + + +def test_agent_model_prefix(): + """Agent model prefix includes 'agent' suffix.""" + assert get_model_prefix("claude-opus-4-6-agent") == "claude4.6opusagent" + assert get_model_prefix("claude-sonnet-4-5-agent") == "claude4.5sonnetagent" + + +def test_agent_model_prefix_does_not_collide_with_single_shot(): + """Agent prefix must differ from single-shot prefix.""" + assert get_model_prefix("claude-opus-4-6-agent") != get_model_prefix("claude-opus-4-6") + + +def test_agent_detect_provider(): + """Agent models detect as anthropic provider.""" + assert detect_provider("claude-opus-4-6-agent") == "anthropic" + assert detect_provider("claude-sonnet-4-5-agent") == "anthropic" + + +def test_agent_is_not_reasoning_model(): + """Agent models are not reasoning models.""" + assert is_reasoning_model("claude-opus-4-6-agent") is False From f208dfa90184f8ea600f620833428c827e2fbd90 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 6 Apr 2026 11:47:03 +0000 Subject: [PATCH 03/16] =?UTF-8?q?feat:=20add=20agent=5Finterface.py=20?= =?UTF-8?q?=E2=80=94=20core=20agent=20runner=20with=20logging=20and=20extr?= =?UTF-8?q?action?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/frontier_cs/gen/agent_interface.py | 313 +++++++++++++++++++++++++ tests/test_agent_interface.py | 57 +++++ 2 files changed, 370 insertions(+) create mode 100644 src/frontier_cs/gen/agent_interface.py diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py new file mode 100644 index 00000000..97dc09ba --- /dev/null +++ b/src/frontier_cs/gen/agent_interface.py @@ -0,0 +1,313 @@ +"""Agent-based solution generation using Claude Agent SDK. + +This module handles the full agent lifecycle for solving competitive programming +problems: prompt construction, Agent SDK invocation with streaming, JSONL transcript +logging, live monitoring, timeout/cost control, and solution extraction. + +Agent models are identified by a "-agent" suffix (e.g., "claude-opus-4-6-agent"). +They are treated as distinct "models" in the gen pipeline — no special routing needed +downstream. +""" + +import asyncio +import json +import logging +import sys +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, Optional, Tuple + +logger = logging.getLogger(__name__) + +# Default budget limits +DEFAULT_COST_LIMIT_USD = 20.0 +DEFAULT_TIMEOUT_SECONDS = 1200 # 20 minutes + + +def build_agent_prompt(problem_dir: str) -> str: + """Construct the prompt given to the agent. + + Args: + problem_dir: Absolute path to the problem directory. + + Returns: + The prompt string for the agent. + """ + return f"""You are solving a competitive programming problem. 
+ +Problem directory: {problem_dir} +- Read statement.txt for the problem description +- testdata/ contains sample test cases (*.in, *.ans), but these are only a subset +- Your solution will be evaluated against a larger hidden test suite +- You can compile with g++, run against the available samples, and iterate +- config.yaml has time/memory limits — respect them in your solution + +Submit your final solution as solution.cpp in the current working directory.""" + + +def extract_solution_cpp(workdir: Path) -> str: + """Extract solution.cpp from the agent working directory. + + Looks for solution.cpp first, then falls back to any .cpp file. + + Args: + workdir: The agent's working directory. + + Returns: + The C++ source code, or empty string if not found. + """ + # Primary: solution.cpp + sol = workdir / "solution.cpp" + if sol.is_file(): + return sol.read_text(encoding="utf-8") + + # Fallback: any .cpp file (agent might have used a different name) + cpp_files = list(workdir.glob("*.cpp")) + if cpp_files: + # Pick the most recently modified one + newest = max(cpp_files, key=lambda p: p.stat().st_mtime) + return newest.read_text(encoding="utf-8") + + return "" + + +def build_metadata( + *, + tokens_in: int, + tokens_out: int, + cost_usd: float, + time_seconds: float, + turns: int, + status: str, +) -> Dict[str, Any]: + """Build the metadata dict for an agent run. + + Args: + tokens_in: Total input tokens consumed. + tokens_out: Total output tokens consumed. + cost_usd: Total cost in USD. + time_seconds: Wall-clock time in seconds. + turns: Number of agentic turns (tool-use round trips). + status: One of "success", "timeout", "cost_limit", "error". + + Returns: + Metadata dictionary. + """ + return { + "tokens_in": tokens_in, + "tokens_out": tokens_out, + "cost_usd": round(cost_usd, 4), + "time_seconds": round(time_seconds, 2), + "turns": turns, + "status": status, + } + + +@dataclass +class TranscriptLogger: + """Writes JSONL transcript of agent events, flushed per event.""" + + path: Path + _file: Any = field(default=None, init=False, repr=False) + + def open(self) -> None: + self.path.parent.mkdir(parents=True, exist_ok=True) + self._file = open(self.path, "w", encoding="utf-8") + + def log(self, event: Dict[str, Any]) -> None: + if self._file is None: + return + event["_ts"] = time.time() + self._file.write(json.dumps(event, default=str) + "\n") + self._file.flush() + + def close(self) -> None: + if self._file is not None: + self._file.close() + self._file = None + + +async def run_agent( + problem_dir: str, + model: str, + *, + cost_limit: float = DEFAULT_COST_LIMIT_USD, + timeout: float = DEFAULT_TIMEOUT_SECONDS, + transcript_path: Optional[Path] = None, +) -> Tuple[str, Dict[str, Any]]: + """Run the agent to solve a problem. + + Args: + problem_dir: Absolute path to the problem directory. + model: Base model name (without -agent suffix). + cost_limit: Maximum cost in USD. + timeout: Maximum wall-clock time in seconds. + transcript_path: Path for JSONL transcript log. None to skip. + + Returns: + Tuple of (cpp_code, metadata_dict). 
+ """ + from claude_agent_sdk import query, ClaudeAgentOptions + from claude_agent_sdk.types import StreamEvent + + prompt = build_agent_prompt(problem_dir) + workdir = Path(problem_dir) + + options = ClaudeAgentOptions( + model=model, + cwd=str(workdir), + max_budget_usd=cost_limit, + permission_mode="bypassPermissions", + include_partial_messages=True, + ) + + # Set up transcript logging + transcript = TranscriptLogger(transcript_path) if transcript_path else None + if transcript: + transcript.open() + + start_time = time.time() + status = "success" + num_turns = 0 + total_cost: Optional[float] = None + usage_in = 0 + usage_out = 0 + + try: + async def _run(): + nonlocal num_turns, total_cost, usage_in, usage_out + + async for message in query(prompt=prompt, options=options): + # Import here to check types + from claude_agent_sdk import AssistantMessage, ResultMessage + + if isinstance(message, StreamEvent): + event = message.event + event_type = event.get("type", "") + + # Log every event + if transcript: + transcript.log({"type": "stream_event", "event": event}) + + # Live monitoring: tool calls + if event_type == "content_block_start": + cb = event.get("content_block", {}) + if cb.get("type") == "tool_use": + tool = cb.get("name", "?") + elapsed = time.time() - start_time + print( + f" [{elapsed:6.1f}s] [turn {num_turns}] {tool}", + flush=True, + ) + + elif isinstance(message, AssistantMessage): + num_turns += 1 + if transcript: + tools_used = [ + b.name + for b in message.content + if hasattr(b, "name") + ] + transcript.log({ + "type": "assistant_turn", + "turn": num_turns, + "tools": tools_used, + "model": message.model, + }) + + # Per-message usage tracking + if message.usage: + usage_in += message.usage.get("input_tokens", 0) + usage_out += message.usage.get("output_tokens", 0) + + # Periodic cost summary to stderr + elapsed = time.time() - start_time + print( + f" [{elapsed:6.1f}s] turn {num_turns}, " + f"{usage_in // 1000}K in / {usage_out // 1000}K out", + file=sys.stderr, + flush=True, + ) + + elif isinstance(message, ResultMessage): + total_cost = message.total_cost_usd + if message.usage: + usage_in = message.usage.get("input_tokens", usage_in) + usage_out = message.usage.get("output_tokens", usage_out) + num_turns = message.num_turns + if transcript: + transcript.log({ + "type": "result", + "cost_usd": total_cost, + "num_turns": num_turns, + "duration_ms": message.duration_ms, + "stop_reason": message.stop_reason, + "is_error": message.is_error, + }) + + await asyncio.wait_for(_run(), timeout=timeout) + + except asyncio.TimeoutError: + status = "timeout" + logger.warning("Agent timed out after %.0fs", timeout) + except Exception as e: + status = "error" + logger.error("Agent error: %s", e) + if transcript: + transcript.log({"type": "error", "error": str(e)}) + finally: + if transcript: + transcript.close() + + elapsed = time.time() - start_time + + # Extract solution (best-effort even on timeout/error) + code = extract_solution_cpp(workdir) + if not code and status == "success": + status = "error" + logger.error("Agent completed but no .cpp file found in %s", workdir) + + metadata = build_metadata( + tokens_in=usage_in, + tokens_out=usage_out, + cost_usd=total_cost if total_cost is not None else 0.0, + time_seconds=elapsed, + turns=num_turns, + status=status, + ) + + return code, metadata + + +def generate_agent_solution( + problem_dir: str, + model: str, + *, + cost_limit: float = DEFAULT_COST_LIMIT_USD, + timeout: float = DEFAULT_TIMEOUT_SECONDS, + transcript_path: 
Optional[Path] = None, +) -> Tuple[str, Dict[str, Any]]: + """Synchronous wrapper for run_agent. + + This is the main entry point called from generate_solutions.py. + + Args: + problem_dir: Absolute path to the problem directory. + model: Base model name (without -agent suffix). + cost_limit: Maximum cost in USD. + timeout: Maximum wall-clock time in seconds. + transcript_path: Path for JSONL transcript log. + + Returns: + Tuple of (cpp_code, metadata_dict). + """ + return asyncio.run( + run_agent( + problem_dir, + model, + cost_limit=cost_limit, + timeout=timeout, + transcript_path=transcript_path, + ) + ) diff --git a/tests/test_agent_interface.py b/tests/test_agent_interface.py index 96c19e05..be163ba4 100644 --- a/tests/test_agent_interface.py +++ b/tests/test_agent_interface.py @@ -23,3 +23,60 @@ def test_agent_detect_provider(): def test_agent_is_not_reasoning_model(): """Agent models are not reasoning models.""" assert is_reasoning_model("claude-opus-4-6-agent") is False + + +import json +import tempfile +from pathlib import Path + + +def test_build_agent_prompt(): + """Agent prompt includes problem dir and key instructions.""" + from frontier_cs.gen.agent_interface import build_agent_prompt + + prompt = build_agent_prompt("/tmp/fake_problem") + assert "/tmp/fake_problem" in prompt + assert "statement.txt" in prompt + assert "testdata/" in prompt + assert "hidden test suite" in prompt + assert "solution.cpp" in prompt + + +def test_extract_cpp_from_workdir(): + """Extract solution.cpp from agent working directory.""" + from frontier_cs.gen.agent_interface import extract_solution_cpp + + with tempfile.TemporaryDirectory() as tmpdir: + sol_path = Path(tmpdir) / "solution.cpp" + sol_path.write_text('#include \nint main() { return 0; }') + code = extract_solution_cpp(Path(tmpdir)) + assert "#include " in code + + +def test_extract_cpp_missing(): + """Return empty string if no solution.cpp found.""" + from frontier_cs.gen.agent_interface import extract_solution_cpp + + with tempfile.TemporaryDirectory() as tmpdir: + code = extract_solution_cpp(Path(tmpdir)) + assert code == "" + + +def test_build_metadata(): + """Build metadata dict from agent run results.""" + from frontier_cs.gen.agent_interface import build_metadata + + meta = build_metadata( + tokens_in=100000, + tokens_out=25000, + cost_usd=5.50, + time_seconds=300.5, + turns=15, + status="success", + ) + assert meta["tokens_in"] == 100000 + assert meta["tokens_out"] == 25000 + assert meta["cost_usd"] == 5.50 + assert meta["time_seconds"] == 300.5 + assert meta["turns"] == 15 + assert meta["status"] == "success" From e385340b8e5fb63102ebb29232b6712318e7a3c7 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 6 Apr 2026 11:53:23 +0000 Subject: [PATCH 04/16] feat: integrate agent mode into generate_solutions.py Add agent model support to the solution generation pipeline: - Detect -agent suffix models and store problem_dir in GenerationTask - Add --agent-timeout and --agent-cost-limit CLI arguments - Branch execute_task to call generate_agent_solution for agent models - Save .meta.json alongside generated .cpp solutions - Add import json for metadata serialization --- algorithmic/scripts/generate_solutions.py | 55 +++++++++++++++++++---- 1 file changed, 47 insertions(+), 8 deletions(-) diff --git a/algorithmic/scripts/generate_solutions.py b/algorithmic/scripts/generate_solutions.py index 93a6f260..7d01b5b6 100644 --- a/algorithmic/scripts/generate_solutions.py +++ b/algorithmic/scripts/generate_solutions.py @@ -17,6 +17,7 @@ import 
time import argparse import re +import json from pathlib import Path from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass @@ -54,6 +55,7 @@ class GenerationTask: variant_index: int solution_name: str total_variants: int = 1 + problem_dir: Optional[str] = None # Set for agent models class AlgorithmicJudgeClient: @@ -298,6 +300,12 @@ def main(): parser.add_argument("--concurrency", type=int, default=4, help="Maximum parallel generations") + # Agent-specific parameters + parser.add_argument("--agent-timeout", type=float, default=1200.0, + help="Agent timeout in seconds (default: 1200 = 20 min)") + parser.add_argument("--agent-cost-limit", type=float, default=20.0, + help="Agent max cost per problem in USD (default: 20)") + args = parser.parse_args() # Output directory for algorithmic solutions @@ -395,10 +403,14 @@ def main(): print(f"{yellow('WARNING:')} Could not get statement for problem {problem_id}") continue + # Resolve problem directory for agent models + problem_dir_path = algo_dir / "problems" / problem_id + for model in models_list: model_prefix = get_model_prefix(model) provider = detect_provider(model) reasoning = is_reasoning_model(model) + is_agent = model.endswith("-agent") for variant_idx in solution_indices: # Nested format: {problem}/{model}.cpp or {problem}/{model}_{variant}.cpp @@ -428,6 +440,7 @@ def main(): variant_index=variant_idx, solution_name=sol_filename, total_variants=len(solution_indices), + problem_dir=str(problem_dir_path) if is_agent else None, )) # Print plan @@ -498,14 +511,40 @@ def execute_task(task: GenerationTask) -> Tuple[str, str, Optional[str], str, Op failed_path = get_failed_path(sol_path) try: - code = generate_code( - task.statement, - model=task.model, - api_key=api_key, - log_file=log_file, - is_reasoning_model=task.reasoning_model, - timeout=args.timeout, - ) + if task.problem_dir is not None: + # Agent mode + from frontier_cs.gen.agent_interface import generate_agent_solution + + base_model = task.model.removesuffix("-agent") + transcript_path = logs_dir / task.solution_name.replace(".cpp", f"_{timestamp}.transcript.jsonl") + transcript_path.parent.mkdir(parents=True, exist_ok=True) + + code, metadata = generate_agent_solution( + problem_dir=task.problem_dir, + model=base_model, + cost_limit=args.agent_cost_limit, + timeout=args.agent_timeout, + transcript_path=transcript_path, + ) + + # Save metadata alongside solution + meta_path = sol_path.with_suffix(".meta.json") + meta_path.parent.mkdir(parents=True, exist_ok=True) + meta_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8") + print(f" {dim('meta:')} {meta_path}") + else: + # Single-shot mode (existing) + code = generate_code( + task.statement, + model=task.model, + api_key=api_key, + log_file=log_file, + is_reasoning_model=task.reasoning_model, + timeout=args.timeout, + ) + + if not code: + raise RuntimeError("No solution code produced") # Save solution to nested directory sol_path.parent.mkdir(parents=True, exist_ok=True) From 99fb5c05f2671beafbdc5f2763be0338e60324e5 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 6 Apr 2026 13:07:45 +0000 Subject: [PATCH 05/16] fix: use temp workdir and improve token tracking in agent_interface - Copy problem dir to temp directory so agent doesn't pollute originals - Makes concurrent runs on same problem safe - Track token usage from streaming message_delta events (only reliable source when timeout kills run before ResultMessage arrives) - Clean up temp dir after extraction --- 
src/frontier_cs/gen/agent_interface.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index 97dc09ba..f87adf7a 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -12,7 +12,9 @@ import asyncio import json import logging +import shutil import sys +import tempfile import time from dataclasses import dataclass, field from pathlib import Path @@ -151,8 +153,13 @@ async def run_agent( from claude_agent_sdk import query, ClaudeAgentOptions from claude_agent_sdk.types import StreamEvent - prompt = build_agent_prompt(problem_dir) - workdir = Path(problem_dir) + # Copy problem dir to a temp working directory to avoid polluting the original. + # This also makes concurrent runs on the same problem safe. + tmpdir = tempfile.mkdtemp(prefix="agent_eval_") + workdir = Path(tmpdir) / "problem" + shutil.copytree(problem_dir, workdir) + + prompt = build_agent_prompt(str(workdir)) options = ClaudeAgentOptions( model=model, @@ -201,6 +208,14 @@ async def _run(): flush=True, ) + # Track token usage from streaming message_delta events. + # This is the only reliable source when timeout kills + # the run before ResultMessage arrives. + if event_type == "message_delta": + delta_usage = event.get("usage", {}) + if delta_usage.get("output_tokens"): + usage_out = delta_usage["output_tokens"] + elif isinstance(message, AssistantMessage): num_turns += 1 if transcript: @@ -268,6 +283,9 @@ async def _run(): status = "error" logger.error("Agent completed but no .cpp file found in %s", workdir) + # Clean up temp directory + shutil.rmtree(tmpdir, ignore_errors=True) + metadata = build_metadata( tokens_in=usage_in, tokens_out=usage_out, From f54d370b467de84a2a0c93880e53069bd12387dd Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Thu, 16 Apr 2026 02:56:58 +0000 Subject: [PATCH 06/16] feat: add prompt construction, test scripts, and sample I/O embedding for agent eval Build dynamic agent prompts from problem config (time/memory limits, subtask counts, interactive vs standard). Write test_all.sh and run_interactive.sh into agent workdir. Embed small sample I/O directly in prompt. Add CLAUDE.md with solving strategy guidance. --- algorithmic/README.md | 101 +++++ src/frontier_cs/gen/agent_interface.py | 497 +++++++++++++++++++++++-- tests/test_agent_interface.py | 171 ++++++++- 3 files changed, 730 insertions(+), 39 deletions(-) diff --git a/algorithmic/README.md b/algorithmic/README.md index 8737c55e..9299a7e3 100644 --- a/algorithmic/README.md +++ b/algorithmic/README.md @@ -97,6 +97,107 @@ sky launch -c algo-judge algorithmic/sky-judge.yaml --idle-minutes-to-autostop 1 frontier eval algorithmic 1 solution.cpp --judge-url http://$(sky status --ip algo-judge):8081 ``` +### Agent Evaluation + +Agent mode lets an AI agent solve problems iteratively — reading the statement, writing code, compiling, testing against samples, and refining — rather than generating a single-shot solution. + +Agents use the [Claude Agent SDK](https://github.com/anthropic/claude-agent-sdk) (Claude Code as a library). The agent gets a temporary copy of the problem directory with full tool access (shell, file I/O, compilation). 
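
Agent generation can also be driven programmatically through `generate_agent_solution`, the same entry point that `generate_solutions.py` calls. The sketch below is illustrative: the problem path is a placeholder, and the keyword arguments mirror the CLI flags.

```python
from pathlib import Path

from frontier_cs.gen.agent_interface import generate_agent_solution

# Placeholder path: point this at a real problem directory.
code, meta = generate_agent_solution(
    problem_dir="/abs/path/to/algorithmic/problems/0",
    model="claude-sonnet-4-5",  # base model; the "-agent" suffix is already stripped
    cost_limit=20.0,            # --agent-cost-limit (USD)
    timeout=1200.0,             # --agent-timeout (seconds)
    transcript_path=Path("generation_logs/example.transcript.jsonl"),
)
print(meta["status"], meta["cost_usd"], meta["turns"])
```

The returned `code` string is the extracted `solution.cpp` contents (empty on failure), and `meta` is the same dict that gets written to `.meta.json`.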
+ +#### Model naming convention + +Append `-agent` to any Claude model name to trigger agent mode: + +``` +claude-sonnet-4-5-20250514-agent # Agent mode with Sonnet 4.5 +claude-opus-4-6-20250610-agent # Agent mode with Opus 4.6 +``` + +The `-agent` suffix is stripped before passing the model to the SDK. The model prefix for output files includes `agent` (e.g., `claude4.5sonnetagent.cpp`), so agent and single-shot results never collide. + +#### Running agent evaluation + +```bash +cd algorithmic/scripts + +# Single model, all problems +python generate_solutions.py \ + --model claude-sonnet-4-5-20250514-agent \ + --judge-url http://localhost:8081 + +# Subset of problems, custom budget +python generate_solutions.py \ + --model claude-sonnet-4-5-20250514-agent \ + --problems 0,1,2,3 \ + --agent-timeout 1800 \ + --agent-cost-limit 30 \ + --judge-url http://localhost:8081 + +# Multiple variants per problem +python generate_solutions.py \ + --model claude-sonnet-4-5-20250514-agent \ + --indices 3 \ + --judge-url http://localhost:8081 +``` + +**Agent-specific CLI flags:** + +| Flag | Default | Description | +|------|---------|-------------| +| `--agent-timeout` | 1200 (20 min) | Wall-clock timeout per problem in seconds | +| `--agent-cost-limit` | 20.0 | Max cost per problem in USD | + +#### Output files + +For each problem/variant, agent mode produces three files: + +``` +solutions/{problem_id}/ +├── claude4.5sonnetagent.cpp # Extracted C++ solution +├── claude4.5sonnetagent.meta.json # Run metadata (cost, tokens, turns, status) +└── (in generation_logs/) + └── claude4.5sonnetagent_*.transcript.jsonl # Full agent transcript +``` + +**meta.json** fields: +- `tokens_in` / `tokens_out` — total token usage +- `cost_usd` — total API cost +- `time_seconds` — wall-clock time +- `turns` — number of agentic turns (tool-use round trips) +- `status` — `success`, `timeout`, `cost_limit`, or `error` + +#### Prerequisites + +1. **Claude Agent SDK**: `pip install claude-agent-sdk` (or `uv sync` if already in project deps) +2. **Claude Code CLI**: Must be installed and authenticated (`claude --version`) +3. **Judge server**: Running and accessible (see [Judge Server Configuration](#judge-server-configuration)) +4. **g++**: Available in PATH for the agent to compile solutions + +#### How it works + +1. The problem directory is copied to a temp working directory (concurrent-safe) +2. `testlib.h` is automatically copied from `judge/include/` if present (needed for interactive problems) +3. The agent receives a structured prompt with the problem path and workflow guidance +4. The agent iterates: reads the problem, writes code, compiles, tests against samples, and refines +5. On completion (or timeout), `solution.cpp` is extracted from the working directory +6. The temp directory is cleaned up; solution + metadata are saved + +#### Interactive problems + +Problems with `interactor.cc` (instead of `chk.cc`) are interactive — the solution communicates with a judge interactor via stdin/stdout. The agent prompt instructs it to: + +1. Compile the interactor using `g++ -std=gnu++17 -I. interactor.cc -o interactor` +2. Test locally via pipes (e.g., `mkfifo pipe; ./solution < pipe | ./interactor > pipe`) +3. `testlib.h` is provided automatically in the working directory + +Interactive problems are harder for agents because local testing requires building a pipe harness, which agents sometimes skip or get wrong. 
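
If the shell harness misbehaves, the same check can be driven from Python with ordinary pipes, which avoids fifo open-ordering issues entirely. This is a minimal sketch that assumes the testlib-style `./interactor <input> <output> <answer>` invocation used by `run_interactive.sh`, shown for sample 1:

```python
import subprocess

sol = subprocess.Popen(["./solution"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
inter = subprocess.Popen(
    ["./interactor", "testdata/1.in", "/dev/null", "testdata/1.ans"],
    stdin=sol.stdout,   # interactor reads what the solution prints
    stdout=sol.stdin,   # and feeds its replies back to the solution
)
# Close the parent's copies of the pipe ends so EOF propagates when either side exits.
sol.stdout.close()
sol.stdin.close()
inter.wait(timeout=120)  # raises TimeoutExpired on a deadlock (missing flush, wrong protocol)
sol.wait(timeout=120)
# testlib convention: exit 0 = accepted, 1 = wrong answer, 2 = presentation error
print("accepted" if inter.returncode == 0 else f"interactor exit {inter.returncode}")
```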
+ +#### Known limitations + +- **No extended thinking**: The Claude Agent SDK does not currently expose extended thinking controls. Enabling it may improve complex algorithmic reasoning. +- **Rewrite tendency**: Agents sometimes rewrite solutions from scratch after failures, losing working logic. The prompt mitigates this but doesn't eliminate it. +- **Interactive testing**: Agents frequently skip local testing for interactive problems, submitting untested code. +- **Algorithm ceiling**: For problems requiring non-trivial algorithmic insight (advanced DP, flow, geometry), agent iteration doesn't compensate for model capability gaps. + ### Creating Problems > For contributing problems to Frontier-CS (detailed file formats, CI requirements), see [CONTRIBUTING.md](../CONTRIBUTING.md#algorithmic-problems). diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index f87adf7a..138ba6fa 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -12,13 +12,17 @@ import asyncio import json import logging +import os import shutil +import stat import sys import tempfile import time from dataclasses import dataclass, field from pathlib import Path -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple + +import yaml logger = logging.getLogger(__name__) @@ -26,9 +30,209 @@ DEFAULT_COST_LIMIT_USD = 20.0 DEFAULT_TIMEOUT_SECONDS = 1200 # 20 minutes +# Max size of sample I/O to embed directly in the prompt (bytes). +# Larger inputs are left for the agent to read from disk. +_MAX_EMBED_SIZE = 4096 + + +def _read_problem_config(problem_dir: str) -> Dict[str, Any]: + """Read and parse config.yaml from a problem directory.""" + config_path = Path(problem_dir) / "config.yaml" + if config_path.is_file(): + return yaml.safe_load(config_path.read_text(encoding="utf-8")) or {} + return {} + + +def _collect_samples(problem_dir: str) -> List[Dict[str, str]]: + """Collect sample test cases from testdata/, sorted by number. + + Returns list of dicts with keys 'id', 'input', 'answer'. + Only includes samples where both .in and .ans exist and are small enough to embed. + """ + testdata = Path(problem_dir) / "testdata" + if not testdata.is_dir(): + return [] + + samples = [] + in_files = sorted(testdata.glob("*.in"), key=lambda p: int(p.stem) if p.stem.isdigit() else 0) + for in_file in in_files: + ans_file = in_file.with_suffix(".ans") + if not ans_file.is_file(): + continue + if in_file.stat().st_size > _MAX_EMBED_SIZE or ans_file.stat().st_size > _MAX_EMBED_SIZE: + continue + samples.append({ + "id": in_file.stem, + "input": in_file.read_text(encoding="utf-8"), + "answer": ans_file.read_text(encoding="utf-8"), + }) + return samples + + +def _format_samples(samples: List[Dict[str, str]], is_interactive: bool) -> str: + """Format sample test cases for inclusion in the prompt.""" + if not samples: + return "" + parts = ["\n## Sample test cases (embedded for convenience)\n"] + note = " (interactor judge input — NOT your stdin)" if is_interactive else "" + for s in samples: + parts.append(f"### Sample {s['id']}{note}") + parts.append(f"Input:\n```\n{s['input'].rstrip()}\n```") + parts.append(f"Expected output:\n```\n{s['answer'].rstrip()}\n```\n") + return "\n".join(parts) + + +# Shell script: compile solution.cpp and test against all sample cases. +# If chk.cc exists (special judge), uses it for verification instead of diff. 
_TEST_ALL_SH = r"""#!/bin/bash
+set -e
+echo "=== Compiling solution.cpp ==="
+g++ -std=gnu++17 -O2 -o solution solution.cpp
+echo "=== Compilation OK ==="
+
+# Compile checker if available (special judge)
+USE_CHECKER=0
+if [ -f "chk.cc" ]; then
+  echo "=== Compiling special judge (chk.cc) ==="
+  if g++ -std=gnu++17 -O2 -I. chk.cc -o checker 2>/dev/null; then
+    USE_CHECKER=1
+    echo "=== Checker compiled OK — using it instead of diff ==="
+  else
+    echo "=== Checker compilation failed — falling back to diff ==="
+  fi
+fi
+
+passed=0; failed=0; total=0
+for inf in testdata/*.in; do
+  [ -f "$inf" ] || continue
+  id=$(basename "$inf" .in)
+  ans="testdata/${id}.ans"
+  [ -f "$ans" ] || continue
+  total=$((total + 1))
+
+  # Run with timeout
+  if timeout 15 ./solution < "$inf" > "my_${id}.out" 2>"my_${id}.err"; then
+    if [ "$USE_CHECKER" -eq 1 ]; then
+      # Special judge: ./checker <input> <output> <answer>
+      checker_out=$(./checker "$inf" "my_${id}.out" "$ans" 2>&1) && chk_rc=$? || chk_rc=$?
+      if [ $chk_rc -eq 0 ]; then
+        echo "  Sample $id: PASS (checker: $checker_out)"
+        passed=$((passed + 1))
+      else
+        echo "  Sample $id: WRONG ANSWER (checker exit $chk_rc)"
+        echo "    Checker output: $checker_out"
+        failed=$((failed + 1))
+      fi
+    else
+      # Diff-based comparison (normalize whitespace)
+      if diff -q <(tr -s '[:space:]' '\n' < "my_${id}.out" | sed '/^$/d') \
+                 <(tr -s '[:space:]' '\n' < "$ans" | sed '/^$/d') >/dev/null 2>&1; then
+        echo "  Sample $id: PASS"
+        passed=$((passed + 1))
+      else
+        echo "  Sample $id: WRONG ANSWER"
+        echo "    Expected (first 5 lines):"
+        head -5 "$ans" | sed 's/^/      /'
+        echo "    Got (first 5 lines):"
+        head -5 "my_${id}.out" | sed 's/^/      /'
+        failed=$((failed + 1))
+      fi
+    fi
+  else
+    rc=$?
+    echo "  Sample $id: RUNTIME ERROR or TLE (exit $rc)"
+    [ -s "my_${id}.err" ] && head -3 "my_${id}.err" | sed 's/^/    stderr: /'
+    failed=$((failed + 1))
+  fi
+done
+
+echo "=== Results: $passed/$total passed ==="
+[ "$failed" -eq 0 ] && exit 0 || exit 1
+"""
+
+# Shell script: test solution against an interactor using named pipes.
+_RUN_INTERACTIVE_SH = r"""#!/bin/bash
+# Usage: ./run_interactive.sh [sample_id]   (default: 1)
+# Compiles solution.cpp and interactor.cc, then tests via pipe.
+# Exit codes: 0=accepted, 1=wrong answer, 2=presentation error, 3=build error, 4=timeout/crash
+
+SAMPLE=${1:-1}
+INF="testdata/${SAMPLE}.in"
+ANSF="testdata/${SAMPLE}.ans"
+
+if [ ! -f "$INF" ]; then
+  echo "Error: $INF not found"
+  exit 3
+fi
+
+# Compile only if binaries are missing or sources are newer
+if [ ! -f ./solution ] || [ solution.cpp -nt ./solution ]; then
+  echo "=== Compiling solution.cpp ==="
+  g++ -std=gnu++17 -O2 -o solution solution.cpp || { echo "Compilation failed"; exit 3; }
+fi
+
+if [ ! -f ./interactor ] || [ interactor.cc -nt ./interactor ]; then
+  echo "=== Compiling interactor ==="
+  g++ -std=gnu++17 -O2 -I. interactor.cc -o interactor || { echo "Interactor compilation failed"; exit 3; }
+fi
+
+# Create named pipes in current dir (avoids /tmp permission issues)
+PIPE_S2I=".pipe_s2i_$$"
+PIPE_I2S=".pipe_i2s_$$"
+rm -f "$PIPE_S2I" "$PIPE_I2S"
+mkfifo "$PIPE_S2I" "$PIPE_I2S"
+
+cleanup() { rm -f "$PIPE_S2I" "$PIPE_I2S" inter_stderr.tmp sol_stderr.tmp; }
+trap cleanup EXIT
+
+echo "=== Running sample $SAMPLE ==="
+
+# interactor: reads from solution's stdout via pipe, writes to solution's stdin via pipe
+# testlib interactors: argv = <inf> <ouf> [ans]
+# We use /dev/null for ouf (output file) since we only care about exit code
+timeout 120 ./interactor "$INF" /dev/null "$ANSF" < "$PIPE_S2I" > "$PIPE_I2S" 2>inter_stderr.tmp &
+INTER_PID=$!
+
+# Open the write end of $PIPE_S2I before the read end of $PIPE_I2S: a fifo open
+# blocks until the peer end is opened, so if both processes opened their read
+# ends first they would deadlock before exchanging a byte.
+timeout 120 ./solution > "$PIPE_S2I" < "$PIPE_I2S" 2>sol_stderr.tmp &
+SOL_PID=$!
+
+# Wait for both processes
+INTER_EXIT=0; SOL_EXIT=0
+wait $INTER_PID 2>/dev/null || INTER_EXIT=$?
+wait $SOL_PID 2>/dev/null || SOL_EXIT=$?
+
+# Report results
+if [ $INTER_EXIT -eq 0 ]; then
+  echo "  Sample $SAMPLE: ACCEPTED (interactor exit 0)"
+  [ -s inter_stderr.tmp ] && head -2 inter_stderr.tmp | sed 's/^/    interactor: /'
+  exit 0
+elif [ $INTER_EXIT -eq 1 ]; then
+  echo "  Sample $SAMPLE: WRONG ANSWER (interactor exit 1)"
+  [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/    interactor: /'
+  exit 1
+elif [ $INTER_EXIT -eq 2 ]; then
+  echo "  Sample $SAMPLE: PRESENTATION ERROR (interactor exit 2)"
+  [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/    interactor: /'
+  exit 2
+elif [ $INTER_EXIT -eq 124 ] || [ $INTER_EXIT -eq 137 ]; then
+  echo "  Sample $SAMPLE: TIMEOUT (120s exceeded)"
+  echo "    This usually means your solution deadlocked (missing flush? wrong protocol?)"
+  [ -s sol_stderr.tmp ] && head -3 sol_stderr.tmp | sed 's/^/    solution stderr: /'
+  exit 4
+else
+  echo "  Sample $SAMPLE: UNKNOWN (interactor exit $INTER_EXIT, solution exit $SOL_EXIT)"
+  [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/    interactor: /'
+  [ -s sol_stderr.tmp ] && head -3 sol_stderr.tmp | sed 's/^/    solution: /'
+  exit 4
+fi
+"""
+
 
 def build_agent_prompt(problem_dir: str) -> str:
-    """Construct the prompt given to the agent.
+    """Construct a problem-aware prompt for the agent.
+
+    Reads config.yaml to detect problem type (interactive vs standard, SPJ),
+    embeds small sample I/O directly, and provides tailored workflow guidance.
 
     Args:
         problem_dir: Absolute path to the problem directory.
@@ -36,40 +240,237 @@
     Returns:
         The prompt string for the agent.
     """
-    return f"""You are solving a competitive programming problem.
+    config = _read_problem_config(problem_dir)
+    is_interactive = config.get("type") == "interactive"
+    has_checker = "checker" in config
+    time_limit = config.get("time", "?")
+    memory_limit = config.get("memory", "?")
+    subtasks = config.get("subtasks", [])
+    total_cases = sum(s.get("n_cases", 0) for s in subtasks) if subtasks else "?"
+    samples = _collect_samples(problem_dir)
+
+    # Base info
+    parts = [f"""You are solving a competitive programming problem.
Problem directory: {problem_dir} -- Read statement.txt for the problem description -- testdata/ contains sample test cases (*.in, *.ans), but these are only a subset -- Your solution will be evaluated against a larger hidden test suite -- You can compile with g++, run against the available samples, and iterate -- config.yaml has time/memory limits — respect them in your solution - -Submit your final solution as solution.cpp in the current working directory.""" +- Read statement.txt for the full problem description +- Time limit: {time_limit}, Memory limit: {memory_limit} +- Total hidden test cases: {total_cases} (your score = fraction passed) +- testdata/ contains sample test cases — these are a SUBSET of the hidden tests"""] + + # Problem type specific guidance + if is_interactive: + parts.append(""" +## Problem type: INTERACTIVE + +This is an interactive problem. Your solution communicates with a judge interactor +via stdin/stdout. You do NOT read from files — you read responses from the interactor +and write queries/answers to stdout. + +Key files provided: +- interactor.cc — the judge interactor (uses testlib.h, both provided) +- testdata/*.in — interactor input seeds (NOT your stdin) + +**CRITICAL for interactive problems:** +- You MUST flush stdout after EVERY output line: use `cout << endl;` or `cout << flush;` +- Read the interactor source code to understand the exact protocol (what it sends, what it expects) +- Count your queries carefully against the stated limit + +**Testing interactive solutions locally:** +Use the provided `./run_interactive.sh` script: +```bash +./run_interactive.sh 1 # Test with sample 1 +./run_interactive.sh 2 # Test with sample 2 +# Run all samples: +for i in testdata/*.in; do ./run_interactive.sh $(basename $i .in); done +``` + +If `run_interactive.sh` times out (exit code 4), it usually means a deadlock: +- Missing `flush` / `endl` on your output +- Reading when the interactor expects you to write, or vice versa +- Exceeding the query limit (interactor stops responding) + +**Fallback testing:** If the shell script doesn't work, write a Python wrapper: +```python +import subprocess, os +proc_sol = subprocess.Popen(['./solution'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) +proc_int = subprocess.Popen(['./interactor', 'testdata/1.in', '/dev/null', 'testdata/1.ans'], + stdin=proc_sol.stdout, stdout=proc_sol.stdin) +proc_int.wait(); proc_sol.wait() +print(f"interactor exit: {proc_int.returncode}") +``` + +IMPORTANT: You MUST test your solution locally before finalizing. Do NOT submit untested code.""") + else: + checker_note = "" + if has_checker: + checker_note = """ +Note: This problem has a SPECIAL JUDGE (chk.cc) — multiple valid outputs may be accepted. +`test_all.sh` will automatically compile and use the checker for validation. +If the checker reports PASS but the output looks different from the .ans file, that's fine.""" + + parts.append(f""" +## Problem type: {"SPECIAL JUDGE (multiple valid outputs accepted)" if has_checker else "STANDARD"} + +**Testing your solution locally:** +Use the provided `./test_all.sh` script: +```bash +./test_all.sh # Compiles solution.cpp and runs against ALL samples +``` +This compiles, runs each sample, and compares output. Always run this before finalizing.{checker_note}""") + + # Scoring context + parts.append(f""" +## Scoring + +Your score is the fraction of hidden test cases passed (0-100%). 
+- There are {total_cases} hidden test cases total +- Partial credit counts — passing 7/10 cases = 70% score +- A correct-but-slow solution that passes small cases is MUCH better than a broken fast one +- Prioritize CORRECTNESS over optimality. Get a working solution first, then optimize.""") + + # Embed samples if small enough + sample_text = _format_samples(samples, is_interactive) + if sample_text: + parts.append(sample_text) + elif samples: + parts.append("\n(Sample inputs are large — read them from testdata/ directory.)\n") + + # Workflow + parts.append("""## Workflow + +1. Read the FULL problem statement carefully. Re-read the constraints and edge cases. +2. Read ALL sample test cases and understand the expected I/O format. +3. Design your algorithm. Think about time complexity vs the constraints. +4. Write a SIMPLE correct solution first — brute force is fine for a first version. +5. Compile and test against ALL samples using the provided test script. +6. If samples fail: debug by examining the diff, don't just rewrite everything. +7. Once samples pass: think about edge cases and whether your algorithm handles large inputs. +8. Optimize only after correctness is established. + +**Critical rules:** +- Do NOT rewrite your solution from scratch more than once. Incremental edits preserve working logic. +- Do NOT skip local testing. Every change must be tested before you move on. +- Do NOT submit without running test_all.sh (or run_interactive.sh for interactive). +- If you TLE on large cases, profile the bottleneck — don't simplify the entire algorithm. + +**Retreat strategy — know when to simplify:** +- If you've been debugging the SAME bug for more than 5 edit-test cycles without progress, + STOP and switch to a fundamentally simpler approach. A correct brute-force that passes + small cases is worth more than a broken optimized solution that passes nothing. +- If your approach is off by a small constant (e.g., exceeding a limit by 1), consider whether + a completely different algorithm would avoid the issue rather than patching endlessly. +- Remember: partial credit exists. A solution scoring 30% is infinitely better than 0%. + When in doubt, submit what works even if it's suboptimal. 
+ +Submit your final solution as solution.cpp in the current working directory.""") + + return "\n".join(parts) + + +def _write_helper_scripts(workdir: Path, is_interactive: bool) -> None: + """Write test helper scripts to the agent's working directory.""" + # Always provide test_all.sh for non-interactive + test_all = workdir / "test_all.sh" + test_all.write_text(_TEST_ALL_SH, encoding="utf-8") + test_all.chmod(test_all.stat().st_mode | stat.S_IEXEC) + + if is_interactive: + run_inter = workdir / "run_interactive.sh" + run_inter.write_text(_RUN_INTERACTIVE_SH, encoding="utf-8") + run_inter.chmod(run_inter.stat().st_mode | stat.S_IEXEC) + + +def _write_workdir_claude_md(workdir: Path, is_interactive: bool) -> None: + """Write a CLAUDE.md to the workdir so Claude Code picks up behavioral guidance.""" + lines = [ + "# Agent Eval — Working Directory", + "", + "You are solving a competitive programming problem in this directory.", + "", + "## Rules", + "", + "- Your ONLY deliverable is `solution.cpp` in this directory.", + "- Use C++17 (g++ -std=gnu++17).", + "- Always compile with `-O2` for performance testing.", + "- Test against ALL sample cases before considering your solution done.", + "- Read the problem statement COMPLETELY before writing any code.", + "", + "## Testing", + "", + ] + if is_interactive: + lines += [ + "This is an INTERACTIVE problem. Use `./run_interactive.sh N` to test sample N.", + "Do NOT skip interactive testing — protocol bugs are the #1 failure mode.", + "", + "### Interactive protocol checklist", + "- `cout << endl;` or `cout << flush;` after EVERY line you output", + "- Read the interactor source code to know the exact send/receive order", + "- Count queries against the stated limit", + "- If run_interactive.sh times out: you likely have a deadlock (missing flush or wrong protocol)", + "- Fallback: write a Python subprocess wrapper if the shell script fails", + "", + ] + else: + lines += [ + "Use `./test_all.sh` to compile and test against all samples.", + "If chk.cc exists, test_all.sh uses it as a special judge automatically.", + "Fix any failing samples before moving on to optimization.", + "", + ] + lines += [ + "## Common mistakes to avoid", + "", + "- Forgetting to flush stdout in interactive problems", + "- Off-by-one errors in array indexing (0-indexed vs 1-indexed)", + "- Integer overflow — use `long long` for anything that could exceed 2^31", + "- Reading input in the wrong order or format", + "- Not handling the edge case where N=1 or the input is minimal", + "- Rewriting the entire solution when a small fix would work", + "", + "## When to retreat", + "", + "- If you've edited and tested 5+ times for the same bug without progress, STOP.", + "- Switch to a simpler algorithm that is guaranteed correct, even if slower.", + "- A correct brute-force scoring 30% beats a broken clever solution scoring 0%.", + "- Partial credit is real: every test case you pass counts.", + "", + ] + (workdir / "CLAUDE.md").write_text("\n".join(lines), encoding="utf-8") def extract_solution_cpp(workdir: Path) -> str: """Extract solution.cpp from the agent working directory. - Looks for solution.cpp first, then falls back to any .cpp file. + Searches for solution.cpp in the workdir, its parent (the tmpdir root), + and recursively. Falls back to any .cpp file that looks like a solution. Args: - workdir: The agent's working directory. + workdir: The agent's working directory (typically tmpdir/problem). Returns: The C++ source code, or empty string if not found. 
""" - # Primary: solution.cpp - sol = workdir / "solution.cpp" - if sol.is_file(): - return sol.read_text(encoding="utf-8") - - # Fallback: any .cpp file (agent might have used a different name) - cpp_files = list(workdir.glob("*.cpp")) - if cpp_files: - # Pick the most recently modified one - newest = max(cpp_files, key=lambda p: p.stat().st_mtime) - return newest.read_text(encoding="utf-8") + # Search these directories in priority order + search_dirs = [workdir, workdir.parent] + + for d in search_dirs: + sol = d / "solution.cpp" + if sol.is_file(): + return sol.read_text(encoding="utf-8") + + # Fallback: any .cpp file in workdir or parent (excluding problem-provided files) + problem_files = {p.name for p in workdir.glob("**/*.cpp") + if p.stat().st_mtime < workdir.stat().st_mtime} + for d in search_dirs: + cpp_files = [ + p for p in d.glob("*.cpp") + if p.name not in problem_files and p.name != "chk.cc" + ] + if cpp_files: + newest = max(cpp_files, key=lambda p: p.stat().st_mtime) + return newest.read_text(encoding="utf-8") return "" @@ -82,6 +483,8 @@ def build_metadata( time_seconds: float, turns: int, status: str, + model: str, + prompt: str, ) -> Dict[str, Any]: """Build the metadata dict for an agent run. @@ -92,11 +495,15 @@ def build_metadata( time_seconds: Wall-clock time in seconds. turns: Number of agentic turns (tool-use round trips). status: One of "success", "timeout", "cost_limit", "error". + model: The model name passed to the agent SDK. + prompt: The full prompt sent to the agent. Returns: Metadata dictionary. """ return { + "model": model, + "prompt": prompt, "tokens_in": tokens_in, "tokens_out": tokens_out, "cost_usd": round(cost_usd, 4), @@ -153,12 +560,35 @@ async def run_agent( from claude_agent_sdk import query, ClaudeAgentOptions from claude_agent_sdk.types import StreamEvent + # Claude Code CLI uses short model names, not full API model IDs. + # Map common API IDs to CLI-accepted names. + CLI_MODEL_MAP = { + "claude-sonnet-4-5-20250514": "claude-sonnet-4-5", + "claude-sonnet-4-6-20250610": "claude-sonnet-4-6", + "claude-opus-4-6-20250610": "claude-opus-4-6", + "claude-haiku-4-5-20251001": "claude-haiku-4-5", + } + model = CLI_MODEL_MAP.get(model, model) + + # Read problem config before copying to detect type. + config = _read_problem_config(problem_dir) + is_interactive = config.get("type") == "interactive" + # Copy problem dir to a temp working directory to avoid polluting the original. # This also makes concurrent runs on the same problem safe. tmpdir = tempfile.mkdtemp(prefix="agent_eval_") workdir = Path(tmpdir) / "problem" shutil.copytree(problem_dir, workdir) + # Provide testlib.h so agents can compile interactors/checkers for local testing. + testlib_src = Path(problem_dir).parent.parent / "judge" / "include" / "testlib.h" + if testlib_src.is_file(): + shutil.copy2(testlib_src, workdir / "testlib.h") + + # Write helper scripts and CLAUDE.md into workdir. 
+ _write_helper_scripts(workdir, is_interactive) + _write_workdir_claude_md(workdir, is_interactive) + prompt = build_agent_prompt(str(workdir)) options = ClaudeAgentOptions( @@ -248,9 +678,12 @@ async def _run(): elif isinstance(message, ResultMessage): total_cost = message.total_cost_usd if message.usage: - usage_in = message.usage.get("input_tokens", usage_in) - usage_out = message.usage.get("output_tokens", usage_out) - num_turns = message.num_turns + usage_in = max(usage_in, message.usage.get("input_tokens", 0)) + usage_out = max(usage_out, message.usage.get("output_tokens", 0)) + # SDK may send multiple ResultMessages (main run + follow-ups). + # Keep the highest turn count to avoid a follow-up (turns=1) + # clobbering the real value. + num_turns = max(num_turns, message.num_turns) if transcript: transcript.log({ "type": "result", @@ -267,8 +700,14 @@ async def _run(): status = "timeout" logger.warning("Agent timed out after %.0fs", timeout) except Exception as e: - status = "error" - logger.error("Agent error: %s", e) + # Claude CLI often exits with code 1 after a successful run. + # If we already received a ResultMessage (total_cost is set), + # treat this as a successful completion, not an error. + if total_cost is not None: + logger.info("Agent completed (post-result CLI exit: %s)", e) + else: + status = "error" + logger.error("Agent error: %s", e) if transcript: transcript.log({"type": "error", "error": str(e)}) finally: @@ -293,6 +732,8 @@ async def _run(): time_seconds=elapsed, turns=num_turns, status=status, + model=model, + prompt=prompt, ) return code, metadata diff --git a/tests/test_agent_interface.py b/tests/test_agent_interface.py index be163ba4..0c771522 100644 --- a/tests/test_agent_interface.py +++ b/tests/test_agent_interface.py @@ -26,20 +26,97 @@ def test_agent_is_not_reasoning_model(): import json +import os import tempfile from pathlib import Path -def test_build_agent_prompt(): - """Agent prompt includes problem dir and key instructions.""" +def _make_problem_dir(tmpdir: str, *, interactive: bool = False, samples: int = 2) -> Path: + """Create a minimal problem directory for testing.""" + pdir = Path(tmpdir) / "problems" / "0" + pdir.mkdir(parents=True) + (pdir / "statement.txt").write_text("# Test Problem\nSolve it.\n") + + config = { + "type": "interactive" if interactive else "default", + "time": "1s", + "memory": "256m", + "subtasks": [{"score": 100, "n_cases": 3}], + } + if interactive: + config["interactor"] = "interactor.cc" + (pdir / "interactor.cc").write_text("// interactor\n") + else: + config["checker"] = "chk.cc" + + import yaml + (pdir / "config.yaml").write_text(yaml.dump(config)) + + testdata = pdir / "testdata" + testdata.mkdir() + for i in range(1, samples + 1): + (testdata / f"{i}.in").write_text(f"{i}\n") + (testdata / f"{i}.ans").write_text(f"{i * 2}\n") + + # testlib.h at judge/include/ level + judge_inc = Path(tmpdir) / "judge" / "include" + judge_inc.mkdir(parents=True, exist_ok=True) + (judge_inc / "testlib.h").write_text("// testlib stub\n") + + return pdir + + +def test_build_agent_prompt_standard(): + """Standard problem prompt includes test script and scoring info.""" + from frontier_cs.gen.agent_interface import build_agent_prompt + + with tempfile.TemporaryDirectory() as tmpdir: + pdir = _make_problem_dir(tmpdir) + prompt = build_agent_prompt(str(pdir)) + assert "test_all.sh" in prompt + assert "STANDARD" in prompt or "SPECIAL JUDGE" in prompt + assert "solution.cpp" in prompt + assert "Scoring" in prompt + assert "fraction" in 
prompt.lower() or "partial" in prompt.lower() + # Samples should be embedded (they're tiny) + assert "Sample 1" in prompt + + +def test_build_agent_prompt_interactive(): + """Interactive problem prompt includes interactor guidance and run_interactive.sh.""" + from frontier_cs.gen.agent_interface import build_agent_prompt + + with tempfile.TemporaryDirectory() as tmpdir: + pdir = _make_problem_dir(tmpdir, interactive=True) + prompt = build_agent_prompt(str(pdir)) + assert "INTERACTIVE" in prompt + assert "run_interactive.sh" in prompt + assert "flush" in prompt.lower() or "pipe" in prompt.lower() + + +def test_build_agent_prompt_embeds_small_samples(): + """Small samples are embedded directly in the prompt.""" from frontier_cs.gen.agent_interface import build_agent_prompt - prompt = build_agent_prompt("/tmp/fake_problem") - assert "/tmp/fake_problem" in prompt - assert "statement.txt" in prompt - assert "testdata/" in prompt - assert "hidden test suite" in prompt - assert "solution.cpp" in prompt + with tempfile.TemporaryDirectory() as tmpdir: + pdir = _make_problem_dir(tmpdir, samples=2) + prompt = build_agent_prompt(str(pdir)) + # The sample content should appear in the prompt + assert "Sample 1" in prompt + assert "Sample 2" in prompt + + +def test_build_agent_prompt_skips_large_samples(): + """Large samples are NOT embedded in the prompt.""" + from frontier_cs.gen.agent_interface import build_agent_prompt, _MAX_EMBED_SIZE + + with tempfile.TemporaryDirectory() as tmpdir: + pdir = _make_problem_dir(tmpdir, samples=1) + # Make the input file larger than the embed threshold + (pdir / "testdata" / "1.in").write_text("x" * (_MAX_EMBED_SIZE + 1)) + prompt = build_agent_prompt(str(pdir)) + # Should NOT contain the embedded content + assert "Sample 1" not in prompt def test_extract_cpp_from_workdir(): @@ -47,18 +124,38 @@ def test_extract_cpp_from_workdir(): from frontier_cs.gen.agent_interface import extract_solution_cpp with tempfile.TemporaryDirectory() as tmpdir: - sol_path = Path(tmpdir) / "solution.cpp" + workdir = Path(tmpdir) / "problem" + workdir.mkdir() + sol_path = workdir / "solution.cpp" sol_path.write_text('#include \nint main() { return 0; }') - code = extract_solution_cpp(Path(tmpdir)) + code = extract_solution_cpp(workdir) assert "#include " in code +def test_extract_cpp_from_parent(): + """Extract solution.cpp when agent writes it to tmpdir root instead of workdir.""" + from frontier_cs.gen.agent_interface import extract_solution_cpp + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) / "problem" + workdir.mkdir() + # Agent wrote solution.cpp in the parent (tmpdir), not in workdir + sol_path = Path(tmpdir) / "solution.cpp" + sol_path.write_text('#include \nint main() {}') + code = extract_solution_cpp(workdir) + assert "#include " in code + + def test_extract_cpp_missing(): """Return empty string if no solution.cpp found.""" from frontier_cs.gen.agent_interface import extract_solution_cpp with tempfile.TemporaryDirectory() as tmpdir: - code = extract_solution_cpp(Path(tmpdir)) + # Use a nested dir to mimic real layout (tmpdir/problem) and avoid + # picking up stray .cpp files from the system /tmp. 
+ workdir = Path(tmpdir) / "problem" + workdir.mkdir() + code = extract_solution_cpp(workdir) assert code == "" @@ -73,6 +170,8 @@ def test_build_metadata(): time_seconds=300.5, turns=15, status="success", + model="claude-sonnet-4-5", + prompt="You are solving a competitive programming problem.", ) assert meta["tokens_in"] == 100000 assert meta["tokens_out"] == 25000 @@ -80,3 +179,53 @@ def test_build_metadata(): assert meta["time_seconds"] == 300.5 assert meta["turns"] == 15 assert meta["status"] == "success" + assert meta["model"] == "claude-sonnet-4-5" + assert meta["prompt"] == "You are solving a competitive programming problem." + + +def test_write_helper_scripts_standard(): + """Standard problem gets test_all.sh but not run_interactive.sh.""" + from frontier_cs.gen.agent_interface import _write_helper_scripts + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) + _write_helper_scripts(workdir, is_interactive=False) + assert (workdir / "test_all.sh").is_file() + assert os.access(workdir / "test_all.sh", os.X_OK) + assert not (workdir / "run_interactive.sh").is_file() + + +def test_write_helper_scripts_interactive(): + """Interactive problem gets both scripts.""" + from frontier_cs.gen.agent_interface import _write_helper_scripts + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) + _write_helper_scripts(workdir, is_interactive=True) + assert (workdir / "test_all.sh").is_file() + assert (workdir / "run_interactive.sh").is_file() + assert os.access(workdir / "run_interactive.sh", os.X_OK) + + +def test_write_workdir_claude_md_standard(): + """CLAUDE.md for standard problems mentions test_all.sh.""" + from frontier_cs.gen.agent_interface import _write_workdir_claude_md + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) + _write_workdir_claude_md(workdir, is_interactive=False) + content = (workdir / "CLAUDE.md").read_text() + assert "test_all.sh" in content + assert "run_interactive.sh" not in content + + +def test_write_workdir_claude_md_interactive(): + """CLAUDE.md for interactive problems mentions flush and run_interactive.sh.""" + from frontier_cs.gen.agent_interface import _write_workdir_claude_md + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) + _write_workdir_claude_md(workdir, is_interactive=True) + content = (workdir / "CLAUDE.md").read_text() + assert "run_interactive.sh" in content + assert "flush" in content From 170e7794d0fcdaf84dd6f87d08c6ca524ebe03fb Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Thu, 16 Apr 2026 04:06:23 +0000 Subject: [PATCH 07/16] feat: add parity mode for Harbor alignment and infra_git_hash tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parity mode (--parity flag) strips all test data, helper scripts, checker, and interactor from the agent workspace — matching the Harbor adapter setup where agents must self-test via brute-force cross-validation (对拍). 
Changes: - agent_interface.py: parity-aware prompt, workspace setup, CLAUDE.md, _get_infra_git_hash(), and enriched build_metadata (timestamp, parity flag) - generate_solutions.py: --parity CLI argument - tests: parity prompt validation (standard + interactive) - docs: solutions repo separation plan (infra_git_hash in meta.json) - .gitignore: exclude .claude/ directory - pyproject.toml: add pytest dev dependency --- .gitignore | 1 + algorithmic/scripts/generate_solutions.py | 3 + docs/solutions-repo-separation.md | 43 ++++++ pyproject.toml | 5 + src/frontier_cs/gen/agent_interface.py | 171 ++++++++++++++++++++-- tests/test_agent_interface.py | 31 ++++ 6 files changed, 241 insertions(+), 13 deletions(-) create mode 100644 docs/solutions-repo-separation.md diff --git a/.gitignore b/.gitignore index f1ac09f1..53753de8 100644 --- a/.gitignore +++ b/.gitignore @@ -41,6 +41,7 @@ berkeley-function-call-leaderboard/bfcl_eval/scripts/ground_truth_conversation/ # Ignore lock files berkeley-function-call-leaderboard/**/*.lock +.claude/ .direnv/ .venv diff --git a/algorithmic/scripts/generate_solutions.py b/algorithmic/scripts/generate_solutions.py index 7d01b5b6..c9181137 100644 --- a/algorithmic/scripts/generate_solutions.py +++ b/algorithmic/scripts/generate_solutions.py @@ -305,6 +305,8 @@ def main(): help="Agent timeout in seconds (default: 1200 = 20 min)") parser.add_argument("--agent-cost-limit", type=float, default=20.0, help="Agent max cost per problem in USD (default: 20)") + parser.add_argument("--parity", action="store_true", + help="Harbor parity mode: no test data or helper scripts given to agent") args = parser.parse_args() @@ -525,6 +527,7 @@ def execute_task(task: GenerationTask) -> Tuple[str, str, Optional[str], str, Op cost_limit=args.agent_cost_limit, timeout=args.agent_timeout, transcript_path=transcript_path, + parity=args.parity, ) # Save metadata alongside solution diff --git a/docs/solutions-repo-separation.md b/docs/solutions-repo-separation.md new file mode 100644 index 00000000..f64a201e --- /dev/null +++ b/docs/solutions-repo-separation.md @@ -0,0 +1,43 @@ +# Solutions Repo Separation + +## Problem + +Infra code (agent_interface.py, generate_solutions.py) and generated solutions (.cpp, .meta.json) live in the same repo. This causes: + +- Can't freely rebase/restructure infra without worrying about losing uncommitted solutions +- Git diffs polluted by large generated files +- No traceability — can't tell which version of infra generated a given solution + +## Decision + +1. **Move solutions to `FrontierCS/Frontier-CS-Result`** (already exists for storing results). +2. **Add `infra_git_hash` to `.meta.json`** so each solution records which commit of this repo generated it. +3. **Keep existing naming**: `{model_prefix}.cpp`, `{model_prefix}_{variant}.cpp`. Indices (`_0`, `_1`, `_2`) remain multi-variant within a single run. +4. **Version via git commits** in the result repo. Re-running overwrites files, but commit before re-running to preserve history. 
+ +## meta.json additions + +```json +{ + "model": "claude-sonnet-4-5-20250514", + "cost_usd": 0.55, + "time_seconds": 337, + "turns": 59, + "tokens_in": 125000, + "tokens_out": 18000, + "status": "success", + "infra_git_hash": "f54d370b", + "timestamp": "2026-04-15T14:30:22Z" +} +``` + +## What stays in this repo + +- `src/frontier_cs/gen/` — generation and agent infra code +- `algorithmic/problems/` — problem definitions +- `algorithmic/judge/` — judge server + +## What moves to Frontier-CS-Result + +- `algorithmic/solutions/` — all generated solution files +- `algorithmic/AGENT_EVAL_RESULTS.md` — eval result summaries diff --git a/pyproject.toml b/pyproject.toml index 317700e5..62ae3466 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,3 +29,8 @@ package = true [build-system] requires = ["hatchling"] build-backend = "hatchling.build" + +[dependency-groups] +dev = [ + "pytest>=9.0.3", +] diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index 138ba6fa..4bc4e879 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -15,6 +15,7 @@ import os import shutil import stat +import subprocess import sys import tempfile import time @@ -228,14 +229,18 @@ def _format_samples(samples: List[Dict[str, str]], is_interactive: bool) -> str: """ -def build_agent_prompt(problem_dir: str) -> str: +def build_agent_prompt(problem_dir: str, *, parity: bool = False) -> str: """Construct a problem-aware prompt for the agent. Reads config.yaml to detect problem type (interactive vs standard, SPJ), embeds small sample I/O directly, and provides tailored workflow guidance. + In parity mode, no test data or helper scripts are referenced — the agent + must write its own tests. This matches the Harbor adapter setup. + Args: problem_dir: Absolute path to the problem directory. + parity: If True, build a prompt that assumes no test data or scripts. Returns: The prompt string for the agent. @@ -247,6 +252,13 @@ def build_agent_prompt(problem_dir: str) -> str: memory_limit = config.get("memory", "?") subtasks = config.get("subtasks", []) total_cases = sum(s.get("n_cases", 0) for s in subtasks) if subtasks else "?" + + if parity: + return _build_parity_prompt( + problem_dir, config, is_interactive, has_checker, + time_limit, memory_limit, total_cases, + ) + samples = _collect_samples(problem_dir) # Base info @@ -368,6 +380,92 @@ def build_agent_prompt(problem_dir: str) -> str: return "\n".join(parts) +def _build_parity_prompt( + problem_dir: str, + config: Dict[str, Any], + is_interactive: bool, + has_checker: bool, + time_limit: str, + memory_limit: str, + total_cases: Any, +) -> str: + """Build a prompt for parity mode (no test data, no helper scripts). + + Matches the Harbor adapter setup: agent gets only the problem statement + and must write its own tests. + """ + parts = [f"""You are solving a competitive programming problem. + +Problem directory: {problem_dir} +- Read statement.txt for the full problem description +- Time limit: {time_limit}, Memory limit: {memory_limit} +- Total test cases: {total_cases} (your score = fraction passed) +- Scoring is partial: 0-100% based on test cases passed"""] + + if is_interactive: + parts.append(""" +## Problem type: INTERACTIVE + +This is an interactive problem. Your solution communicates with a hidden judge +via stdin/stdout. You do NOT read from files. 
+ +**CRITICAL:** +- Flush stdout after EVERY output line: `cout << endl;` or `cout << flush;` +- Read the problem statement carefully for the exact query/response protocol +- Count your queries against the stated limit""") + elif has_checker: + parts.append(""" +## Problem type: SPECIAL JUDGE + +This problem accepts multiple valid outputs. Your solution will be checked by +a special judge, not by exact string matching.""") + else: + parts.append(""" +## Problem type: STANDARD + +Your output must match the expected output exactly (whitespace-normalized).""") + + parts.append(f""" +## Scoring + +Your score is the fraction of test cases passed (0-100%). +- There are {total_cases} test cases total +- Partial credit counts — passing 7/10 cases = 70% score +- A correct-but-slow solution that passes small cases is MUCH better than a broken fast one +- Prioritize CORRECTNESS over optimality + +## Self-testing + +No test data or test scripts are provided. You must validate your own solution: + +1. **Write a brute-force reference solution** (even if O(n!) or exponential) that you are + confident is correct for small inputs. +2. **Write a random test generator** that produces valid inputs within the problem constraints. +3. **Cross-validate (对拍):** Run both solutions on hundreds of random small inputs and compare + outputs. Fix any discrepancies by debugging your main solution against the brute-force. +4. **Stress test:** Generate larger random inputs to check for TLE, MLE, or crashes. +5. **Edge cases:** Manually test minimum inputs (N=1, empty, etc.) and boundary values. + +This self-testing approach is standard competitive programming practice. Do NOT skip it. + +## Workflow + +1. Read the FULL problem statement carefully. Re-read the constraints and edge cases. +2. Understand the I/O format from the examples in the problem statement. +3. Design your algorithm. Think about time complexity vs the constraints. +4. Write a SIMPLE correct solution first — brute force is fine for a first version. +5. Write a separate brute-force and test generator, then cross-validate. +6. Once confident in correctness: optimize for performance if needed. +7. Stress test with larger inputs before finalizing. + +**Retreat strategy:** If stuck on the same bug for 5+ cycles, switch to a simpler +algorithm. A correct brute-force scoring 30% beats a broken solution scoring 0%. 
+ +Submit your final solution as solution.cpp in the current working directory.""") + + return "\n".join(parts) + + def _write_helper_scripts(workdir: Path, is_interactive: bool) -> None: """Write test helper scripts to the agent's working directory.""" # Always provide test_all.sh for non-interactive @@ -381,7 +479,7 @@ def _write_helper_scripts(workdir: Path, is_interactive: bool) -> None: run_inter.chmod(run_inter.stat().st_mode | stat.S_IEXEC) -def _write_workdir_claude_md(workdir: Path, is_interactive: bool) -> None: +def _write_workdir_claude_md(workdir: Path, is_interactive: bool, *, parity: bool = False) -> None: """Write a CLAUDE.md to the workdir so Claude Code picks up behavioral guidance.""" lines = [ "# Agent Eval — Working Directory", @@ -393,13 +491,27 @@ def _write_workdir_claude_md(workdir: Path, is_interactive: bool) -> None: "- Your ONLY deliverable is `solution.cpp` in this directory.", "- Use C++17 (g++ -std=gnu++17).", "- Always compile with `-O2` for performance testing.", - "- Test against ALL sample cases before considering your solution done.", "- Read the problem statement COMPLETELY before writing any code.", "", "## Testing", "", ] - if is_interactive: + if parity: + lines += [ + "No test data or test scripts are provided.", + "Write your own brute-force solution + random test generator to cross-validate.", + "This is standard competitive programming practice (对拍).", + "", + ] + if is_interactive: + lines += [ + "This is an INTERACTIVE problem.", + "- `cout << endl;` or `cout << flush;` after EVERY line you output", + "- Read the problem statement to understand the exact protocol", + "- Count queries against the stated limit", + "", + ] + elif is_interactive: lines += [ "This is an INTERACTIVE problem. Use `./run_interactive.sh N` to test sample N.", "Do NOT skip interactive testing — protocol bugs are the #1 failure mode.", @@ -475,6 +587,19 @@ def extract_solution_cpp(workdir: Path) -> str: return "" +def _get_infra_git_hash() -> str: + """Get the current git commit hash of this repo (infra code).""" + try: + result = subprocess.run( + ["git", "rev-parse", "--short", "HEAD"], + capture_output=True, text=True, timeout=5, + cwd=Path(__file__).parent, + ) + return result.stdout.strip() if result.returncode == 0 else "unknown" + except Exception: + return "unknown" + + def build_metadata( *, tokens_in: int, @@ -485,6 +610,7 @@ def build_metadata( status: str, model: str, prompt: str, + parity: bool = False, ) -> Dict[str, Any]: """Build the metadata dict for an agent run. @@ -497,6 +623,7 @@ def build_metadata( status: One of "success", "timeout", "cost_limit", "error". model: The model name passed to the agent SDK. prompt: The full prompt sent to the agent. + parity: Whether this run used parity mode. Returns: Metadata dictionary. @@ -510,6 +637,9 @@ def build_metadata( "time_seconds": round(time_seconds, 2), "turns": turns, "status": status, + "infra_git_hash": _get_infra_git_hash(), + "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), + "parity": parity, } @@ -544,6 +674,7 @@ async def run_agent( cost_limit: float = DEFAULT_COST_LIMIT_USD, timeout: float = DEFAULT_TIMEOUT_SECONDS, transcript_path: Optional[Path] = None, + parity: bool = False, ) -> Tuple[str, Dict[str, Any]]: """Run the agent to solve a problem. @@ -553,6 +684,7 @@ async def run_agent( cost_limit: Maximum cost in USD. timeout: Maximum wall-clock time in seconds. transcript_path: Path for JSONL transcript log. None to skip. 
+        parity: If True, strip test data and helper scripts (Harbor parity mode).

     Returns:
         Tuple of (cpp_code, metadata_dict).
@@ -578,18 +710,27 @@ async def run_agent(
     # This also makes concurrent runs on the same problem safe.
     tmpdir = tempfile.mkdtemp(prefix="agent_eval_")
     workdir = Path(tmpdir) / "problem"
-    shutil.copytree(problem_dir, workdir)

-    # Provide testlib.h so agents can compile interactors/checkers for local testing.
-    testlib_src = Path(problem_dir).parent.parent / "judge" / "include" / "testlib.h"
-    if testlib_src.is_file():
-        shutil.copy2(testlib_src, workdir / "testlib.h")
+    if parity:
+        # Parity mode: copy only statement.txt, config.yaml, and tag.txt — no
+        # test data, no checker, no interactor. Agent must self-test.
+        workdir.mkdir(parents=True)
+        for fname in ("statement.txt", "config.yaml", "tag.txt"):
+            src = Path(problem_dir) / fname
+            if src.is_file():
+                shutil.copy2(src, workdir / fname)
+    else:
+        shutil.copytree(problem_dir, workdir)
+        # Provide testlib.h so agents can compile interactors/checkers for local testing.
+        testlib_src = Path(problem_dir).parent.parent / "judge" / "include" / "testlib.h"
+        if testlib_src.is_file():
+            shutil.copy2(testlib_src, workdir / "testlib.h")
+        # Write helper scripts for local testing.
+        _write_helper_scripts(workdir, is_interactive)

-    # Write helper scripts and CLAUDE.md into workdir.
-    _write_helper_scripts(workdir, is_interactive)
-    _write_workdir_claude_md(workdir, is_interactive)
+    _write_workdir_claude_md(workdir, is_interactive, parity=parity)

-    prompt = build_agent_prompt(str(workdir))
+    prompt = build_agent_prompt(str(workdir), parity=parity)

     options = ClaudeAgentOptions(
         model=model,
@@ -734,6 +875,7 @@ async def _run():
             status=status,
             model=model,
             prompt=prompt,
+            parity=parity,
         )

     return code, metadata
@@ -746,6 +888,7 @@ def generate_agent_solution(
     cost_limit: float = DEFAULT_COST_LIMIT_USD,
     timeout: float = DEFAULT_TIMEOUT_SECONDS,
     transcript_path: Optional[Path] = None,
+    parity: bool = False,
 ) -> Tuple[str, Dict[str, Any]]:
     """Synchronous wrapper for run_agent.

@@ -757,6 +900,7 @@ def generate_agent_solution(
         cost_limit: Maximum cost in USD.
         timeout: Maximum wall-clock time in seconds.
         transcript_path: Path for JSONL transcript log.
+        parity: If True, strip test data and helper scripts (Harbor parity mode).

     Returns:
         Tuple of (cpp_code, metadata_dict).
@@ -768,5 +912,6 @@ def generate_agent_solution( cost_limit=cost_limit, timeout=timeout, transcript_path=transcript_path, + parity=parity, ) ) diff --git a/tests/test_agent_interface.py b/tests/test_agent_interface.py index 0c771522..176bef6f 100644 --- a/tests/test_agent_interface.py +++ b/tests/test_agent_interface.py @@ -119,6 +119,37 @@ def test_build_agent_prompt_skips_large_samples(): assert "Sample 1" not in prompt +def test_build_agent_prompt_parity_no_test_refs(): + """Parity mode prompt has no references to test scripts or test data.""" + from frontier_cs.gen.agent_interface import build_agent_prompt + + with tempfile.TemporaryDirectory() as tmpdir: + pdir = _make_problem_dir(tmpdir, samples=2) + prompt = build_agent_prompt(str(pdir), parity=True) + assert "test_all.sh" not in prompt + assert "run_interactive.sh" not in prompt + assert "testdata/" not in prompt + assert "Sample 1" not in prompt + assert "chk.cc" not in prompt + assert "interactor.cc" not in prompt + # Should mention self-testing + assert "brute-force" in prompt.lower() or "brute force" in prompt.lower() + assert "solution.cpp" in prompt + + +def test_build_agent_prompt_parity_interactive(): + """Parity mode interactive prompt mentions flush but not interactor source.""" + from frontier_cs.gen.agent_interface import build_agent_prompt + + with tempfile.TemporaryDirectory() as tmpdir: + pdir = _make_problem_dir(tmpdir, interactive=True) + prompt = build_agent_prompt(str(pdir), parity=True) + assert "INTERACTIVE" in prompt + assert "flush" in prompt.lower() + assert "run_interactive.sh" not in prompt + assert "interactor.cc" not in prompt + + def test_extract_cpp_from_workdir(): """Extract solution.cpp from agent working directory.""" from frontier_cs.gen.agent_interface import extract_solution_cpp From dd0a6339af6843b8e9c01e27697e4b35966e9ee6 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Thu, 16 Apr 2026 04:12:17 +0000 Subject: [PATCH 08/16] revert: remove infra_git_hash and timestamp from build_metadata These belong to the solutions repo separation effort, which is docs-only for now. Removed _get_infra_git_hash(), subprocess import, and the infra_git_hash/timestamp/parity fields from build_metadata(). --- src/frontier_cs/gen/agent_interface.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index 4bc4e879..b3145866 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -15,7 +15,6 @@ import os import shutil import stat -import subprocess import sys import tempfile import time @@ -587,19 +586,6 @@ def extract_solution_cpp(workdir: Path) -> str: return "" -def _get_infra_git_hash() -> str: - """Get the current git commit hash of this repo (infra code).""" - try: - result = subprocess.run( - ["git", "rev-parse", "--short", "HEAD"], - capture_output=True, text=True, timeout=5, - cwd=Path(__file__).parent, - ) - return result.stdout.strip() if result.returncode == 0 else "unknown" - except Exception: - return "unknown" - - def build_metadata( *, tokens_in: int, @@ -610,7 +596,6 @@ def build_metadata( status: str, model: str, prompt: str, - parity: bool = False, ) -> Dict[str, Any]: """Build the metadata dict for an agent run. @@ -623,7 +608,6 @@ def build_metadata( status: One of "success", "timeout", "cost_limit", "error". model: The model name passed to the agent SDK. prompt: The full prompt sent to the agent. - parity: Whether this run used parity mode. 
Returns: Metadata dictionary. @@ -637,9 +621,6 @@ def build_metadata( "time_seconds": round(time_seconds, 2), "turns": turns, "status": status, - "infra_git_hash": _get_infra_git_hash(), - "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), - "parity": parity, } @@ -875,7 +856,6 @@ async def _run(): status=status, model=model, prompt=prompt, - parity=parity, ) return code, metadata From e95fb8f3e94ad0e974ff58a20371feacc8607bf5 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Thu, 16 Apr 2026 04:51:22 +0000 Subject: [PATCH 09/16] fix: make parity mode the default and remove solutions-repo-separation doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agent always runs without test data — no --parity flag needed. The solutions repo separation plan is not ready to commit. --- algorithmic/scripts/generate_solutions.py | 4 +-- docs/solutions-repo-separation.md | 43 ----------------------- src/frontier_cs/gen/agent_interface.py | 8 ++--- tests/test_agent_interface.py | 12 +++---- 4 files changed, 11 insertions(+), 56 deletions(-) delete mode 100644 docs/solutions-repo-separation.md diff --git a/algorithmic/scripts/generate_solutions.py b/algorithmic/scripts/generate_solutions.py index c9181137..d2b9ed33 100644 --- a/algorithmic/scripts/generate_solutions.py +++ b/algorithmic/scripts/generate_solutions.py @@ -305,8 +305,7 @@ def main(): help="Agent timeout in seconds (default: 1200 = 20 min)") parser.add_argument("--agent-cost-limit", type=float, default=20.0, help="Agent max cost per problem in USD (default: 20)") - parser.add_argument("--parity", action="store_true", - help="Harbor parity mode: no test data or helper scripts given to agent") + args = parser.parse_args() @@ -527,7 +526,6 @@ def execute_task(task: GenerationTask) -> Tuple[str, str, Optional[str], str, Op cost_limit=args.agent_cost_limit, timeout=args.agent_timeout, transcript_path=transcript_path, - parity=args.parity, ) # Save metadata alongside solution diff --git a/docs/solutions-repo-separation.md b/docs/solutions-repo-separation.md deleted file mode 100644 index f64a201e..00000000 --- a/docs/solutions-repo-separation.md +++ /dev/null @@ -1,43 +0,0 @@ -# Solutions Repo Separation - -## Problem - -Infra code (agent_interface.py, generate_solutions.py) and generated solutions (.cpp, .meta.json) live in the same repo. This causes: - -- Can't freely rebase/restructure infra without worrying about losing uncommitted solutions -- Git diffs polluted by large generated files -- No traceability — can't tell which version of infra generated a given solution - -## Decision - -1. **Move solutions to `FrontierCS/Frontier-CS-Result`** (already exists for storing results). -2. **Add `infra_git_hash` to `.meta.json`** so each solution records which commit of this repo generated it. -3. **Keep existing naming**: `{model_prefix}.cpp`, `{model_prefix}_{variant}.cpp`. Indices (`_0`, `_1`, `_2`) remain multi-variant within a single run. -4. **Version via git commits** in the result repo. Re-running overwrites files, but commit before re-running to preserve history. 
- -## meta.json additions - -```json -{ - "model": "claude-sonnet-4-5-20250514", - "cost_usd": 0.55, - "time_seconds": 337, - "turns": 59, - "tokens_in": 125000, - "tokens_out": 18000, - "status": "success", - "infra_git_hash": "f54d370b", - "timestamp": "2026-04-15T14:30:22Z" -} -``` - -## What stays in this repo - -- `src/frontier_cs/gen/` — generation and agent infra code -- `algorithmic/problems/` — problem definitions -- `algorithmic/judge/` — judge server - -## What moves to Frontier-CS-Result - -- `algorithmic/solutions/` — all generated solution files -- `algorithmic/AGENT_EVAL_RESULTS.md` — eval result summaries diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index b3145866..05f11671 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -228,7 +228,7 @@ def _format_samples(samples: List[Dict[str, str]], is_interactive: bool) -> str: """ -def build_agent_prompt(problem_dir: str, *, parity: bool = False) -> str: +def build_agent_prompt(problem_dir: str, *, parity: bool = True) -> str: """Construct a problem-aware prompt for the agent. Reads config.yaml to detect problem type (interactive vs standard, SPJ), @@ -478,7 +478,7 @@ def _write_helper_scripts(workdir: Path, is_interactive: bool) -> None: run_inter.chmod(run_inter.stat().st_mode | stat.S_IEXEC) -def _write_workdir_claude_md(workdir: Path, is_interactive: bool, *, parity: bool = False) -> None: +def _write_workdir_claude_md(workdir: Path, is_interactive: bool, *, parity: bool = True) -> None: """Write a CLAUDE.md to the workdir so Claude Code picks up behavioral guidance.""" lines = [ "# Agent Eval — Working Directory", @@ -655,7 +655,7 @@ async def run_agent( cost_limit: float = DEFAULT_COST_LIMIT_USD, timeout: float = DEFAULT_TIMEOUT_SECONDS, transcript_path: Optional[Path] = None, - parity: bool = False, + parity: bool = True, ) -> Tuple[str, Dict[str, Any]]: """Run the agent to solve a problem. @@ -868,7 +868,7 @@ def generate_agent_solution( cost_limit: float = DEFAULT_COST_LIMIT_USD, timeout: float = DEFAULT_TIMEOUT_SECONDS, transcript_path: Optional[Path] = None, - parity: bool = False, + parity: bool = True, ) -> Tuple[str, Dict[str, Any]]: """Synchronous wrapper for run_agent. 
diff --git a/tests/test_agent_interface.py b/tests/test_agent_interface.py index 176bef6f..34f27037 100644 --- a/tests/test_agent_interface.py +++ b/tests/test_agent_interface.py @@ -72,7 +72,7 @@ def test_build_agent_prompt_standard(): with tempfile.TemporaryDirectory() as tmpdir: pdir = _make_problem_dir(tmpdir) - prompt = build_agent_prompt(str(pdir)) + prompt = build_agent_prompt(str(pdir), parity=False) assert "test_all.sh" in prompt assert "STANDARD" in prompt or "SPECIAL JUDGE" in prompt assert "solution.cpp" in prompt @@ -88,7 +88,7 @@ def test_build_agent_prompt_interactive(): with tempfile.TemporaryDirectory() as tmpdir: pdir = _make_problem_dir(tmpdir, interactive=True) - prompt = build_agent_prompt(str(pdir)) + prompt = build_agent_prompt(str(pdir), parity=False) assert "INTERACTIVE" in prompt assert "run_interactive.sh" in prompt assert "flush" in prompt.lower() or "pipe" in prompt.lower() @@ -100,7 +100,7 @@ def test_build_agent_prompt_embeds_small_samples(): with tempfile.TemporaryDirectory() as tmpdir: pdir = _make_problem_dir(tmpdir, samples=2) - prompt = build_agent_prompt(str(pdir)) + prompt = build_agent_prompt(str(pdir), parity=False) # The sample content should appear in the prompt assert "Sample 1" in prompt assert "Sample 2" in prompt @@ -114,7 +114,7 @@ def test_build_agent_prompt_skips_large_samples(): pdir = _make_problem_dir(tmpdir, samples=1) # Make the input file larger than the embed threshold (pdir / "testdata" / "1.in").write_text("x" * (_MAX_EMBED_SIZE + 1)) - prompt = build_agent_prompt(str(pdir)) + prompt = build_agent_prompt(str(pdir), parity=False) # Should NOT contain the embedded content assert "Sample 1" not in prompt @@ -244,7 +244,7 @@ def test_write_workdir_claude_md_standard(): with tempfile.TemporaryDirectory() as tmpdir: workdir = Path(tmpdir) - _write_workdir_claude_md(workdir, is_interactive=False) + _write_workdir_claude_md(workdir, is_interactive=False, parity=False) content = (workdir / "CLAUDE.md").read_text() assert "test_all.sh" in content assert "run_interactive.sh" not in content @@ -256,7 +256,7 @@ def test_write_workdir_claude_md_interactive(): with tempfile.TemporaryDirectory() as tmpdir: workdir = Path(tmpdir) - _write_workdir_claude_md(workdir, is_interactive=True) + _write_workdir_claude_md(workdir, is_interactive=True, parity=False) content = (workdir / "CLAUDE.md").read_text() assert "run_interactive.sh" in content assert "flush" in content From 1549c534d114116d8640d1942a99ba1925ead8bf Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Thu, 16 Apr 2026 05:17:32 +0000 Subject: [PATCH 10/16] refactor: extract prompt templates and scripts to agent_constants.py Move all large string constants (prompt templates, shell scripts, CLAUDE.md content) out of agent_interface.py into a dedicated constants module. --- src/frontier_cs/gen/agent_constants.py | 384 ++++++++++++++++++++++ src/frontier_cs/gen/agent_interface.py | 435 ++++--------------------- 2 files changed, 440 insertions(+), 379 deletions(-) create mode 100644 src/frontier_cs/gen/agent_constants.py diff --git a/src/frontier_cs/gen/agent_constants.py b/src/frontier_cs/gen/agent_constants.py new file mode 100644 index 00000000..6b23d915 --- /dev/null +++ b/src/frontier_cs/gen/agent_constants.py @@ -0,0 +1,384 @@ +"""Prompt templates, shell scripts, and CLAUDE.md content for agent eval. + +All large string constants live here to keep agent_interface.py focused on logic. 
+""" + +# --------------------------------------------------------------------------- +# Shell scripts (used in parity=False mode only) +# --------------------------------------------------------------------------- + +TEST_ALL_SH = r"""#!/bin/bash +set -e +echo "=== Compiling solution.cpp ===" +g++ -std=gnu++17 -O2 -o solution solution.cpp +echo "=== Compilation OK ===" + +# Compile checker if available (special judge) +USE_CHECKER=0 +if [ -f "chk.cc" ]; then + echo "=== Compiling special judge (chk.cc) ===" + if g++ -std=gnu++17 -O2 -I. chk.cc -o checker 2>/dev/null; then + USE_CHECKER=1 + echo "=== Checker compiled OK — using it instead of diff ===" + else + echo "=== Checker compilation failed — falling back to diff ===" + fi +fi + +passed=0; failed=0; total=0 +for inf in testdata/*.in; do + [ -f "$inf" ] || continue + id=$(basename "$inf" .in) + ans="testdata/${id}.ans" + [ -f "$ans" ] || continue + total=$((total + 1)) + + # Run with timeout + if timeout 15 ./solution < "$inf" > "my_${id}.out" 2>"my_${id}.err"; then + if [ "$USE_CHECKER" -eq 1 ]; then + # Special judge: ./checker + checker_out=$(./checker "$inf" "my_${id}.out" "$ans" 2>&1) && chk_rc=$? || chk_rc=$? + if [ $chk_rc -eq 0 ]; then + echo " Sample $id: PASS (checker: $checker_out)" + passed=$((passed + 1)) + else + echo " Sample $id: WRONG ANSWER (checker exit $chk_rc)" + echo " Checker output: $checker_out" + failed=$((failed + 1)) + fi + else + # Diff-based comparison (normalize whitespace) + if diff -q <(tr -s '[:space:]' '\n' < "my_${id}.out" | sed '/^$/d') \ + <(tr -s '[:space:]' '\n' < "$ans" | sed '/^$/d') >/dev/null 2>&1; then + echo " Sample $id: PASS" + passed=$((passed + 1)) + else + echo " Sample $id: WRONG ANSWER" + echo " Expected (first 5 lines):" + head -5 "$ans" | sed 's/^/ /' + echo " Got (first 5 lines):" + head -5 "my_${id}.out" | sed 's/^/ /' + failed=$((failed + 1)) + fi + fi + else + rc=$? + echo " Sample $id: RUNTIME ERROR or TLE (exit $rc)" + [ -s "my_${id}.err" ] && head -3 "my_${id}.err" | sed 's/^/ stderr: /' + failed=$((failed + 1)) + fi +done + +echo "=== Results: $passed/$total passed ===" +[ "$failed" -eq 0 ] && exit 0 || exit 1 +""" + +RUN_INTERACTIVE_SH = r"""#!/bin/bash +# Usage: ./run_interactive.sh [sample_id] (default: 1) +# Compiles solution.cpp and interactor.cc, then tests via pipe. +# Exit codes: 0=accepted, 1=wrong answer, 2=presentation error, 3=build error, 4=timeout/crash + +SAMPLE=${1:-1} +INF="testdata/${SAMPLE}.in" +ANSF="testdata/${SAMPLE}.ans" + +if [ ! -f "$INF" ]; then + echo "Error: $INF not found" + exit 3 +fi + +# Compile only if binaries are missing or sources are newer +if [ ! -f ./solution ] || [ solution.cpp -nt ./solution ]; then + echo "=== Compiling solution.cpp ===" + g++ -std=gnu++17 -O2 -o solution solution.cpp || { echo "Compilation failed"; exit 3; } +fi + +if [ ! -f ./interactor ] || [ interactor.cc -nt ./interactor ]; then + echo "=== Compiling interactor ===" + g++ -std=gnu++17 -O2 -I. 
interactor.cc -o interactor || { echo "Interactor compilation failed"; exit 3; } +fi + +# Create named pipes in current dir (avoids /tmp permission issues) +PIPE_S2I=".pipe_s2i_$$" +PIPE_I2S=".pipe_i2s_$$" +rm -f "$PIPE_S2I" "$PIPE_I2S" +mkfifo "$PIPE_S2I" "$PIPE_I2S" + +cleanup() { rm -f "$PIPE_S2I" "$PIPE_I2S" inter_stderr.tmp sol_stderr.tmp; } +trap cleanup EXIT + +echo "=== Running sample $SAMPLE ===" + +# interactor: reads from solution's stdout via pipe, writes to solution's stdin via pipe +# testlib interactors: argv = [ans] +# We use /dev/null for ouf (output file) since we only care about exit code +timeout 120 ./interactor "$INF" /dev/null "$ANSF" < "$PIPE_S2I" > "$PIPE_I2S" 2>inter_stderr.tmp & +INTER_PID=$! + +timeout 120 ./solution < "$PIPE_I2S" > "$PIPE_S2I" 2>sol_stderr.tmp & +SOL_PID=$! + +# Wait for both processes +INTER_EXIT=0; SOL_EXIT=0 +wait $INTER_PID 2>/dev/null || INTER_EXIT=$? +wait $SOL_PID 2>/dev/null || SOL_EXIT=$? + +# Report results +if [ $INTER_EXIT -eq 0 ]; then + echo " Sample $SAMPLE: ACCEPTED (interactor exit 0)" + [ -s inter_stderr.tmp ] && head -2 inter_stderr.tmp | sed 's/^/ interactor: /' + exit 0 +elif [ $INTER_EXIT -eq 1 ]; then + echo " Sample $SAMPLE: WRONG ANSWER (interactor exit 1)" + [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/ interactor: /' + exit 1 +elif [ $INTER_EXIT -eq 2 ]; then + echo " Sample $SAMPLE: PRESENTATION ERROR (interactor exit 2)" + [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/ interactor: /' + exit 2 +elif [ $INTER_EXIT -eq 124 ] || [ $INTER_EXIT -eq 137 ]; then + echo " Sample $SAMPLE: TIMEOUT (120s exceeded)" + echo " This usually means your solution deadlocked (missing flush? wrong protocol?)" + [ -s sol_stderr.tmp ] && head -3 sol_stderr.tmp | sed 's/^/ solution stderr: /' + exit 4 +else + echo " Sample $SAMPLE: UNKNOWN (interactor exit $INTER_EXIT, solution exit $SOL_EXIT)" + [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/ interactor: /' + [ -s sol_stderr.tmp ] && head -3 sol_stderr.tmp | sed 's/^/ solution: /' + exit 4 +fi +""" + +# --------------------------------------------------------------------------- +# Prompt templates +# --------------------------------------------------------------------------- + +# Parity prompt (default): agent gets NO test data, must self-test +PARITY_PROMPT = """You are solving a competitive programming problem. + +Problem directory: {problem_dir} +- Read statement.txt for the full problem description +- Time limit: {time_limit}, Memory limit: {memory_limit} +- Total test cases: {total_cases} (your score = fraction passed) +- Scoring is partial: 0-100% based on test cases passed""" + +PARITY_INTERACTIVE_SECTION = """ +## Problem type: INTERACTIVE + +This is an interactive problem. Your solution communicates with a hidden judge +via stdin/stdout. You do NOT read from files. + +**CRITICAL:** +- Flush stdout after EVERY output line: `cout << endl;` or `cout << flush;` +- Read the problem statement carefully for the exact query/response protocol +- Count your queries against the stated limit""" + +PARITY_SPJ_SECTION = """ +## Problem type: SPECIAL JUDGE + +This problem accepts multiple valid outputs. Your solution will be checked by +a special judge, not by exact string matching.""" + +PARITY_STANDARD_SECTION = """ +## Problem type: STANDARD + +Your output must match the expected output exactly (whitespace-normalized).""" + +PARITY_SCORING_AND_WORKFLOW = """ +## Scoring + +Your score is the fraction of test cases passed (0-100%). 
+- There are {total_cases} test cases total +- Partial credit counts — passing 7/10 cases = 70% score +- A correct-but-slow solution that passes small cases is MUCH better than a broken fast one +- Prioritize CORRECTNESS over optimality + +## Self-testing + +No test data or test scripts are provided. You must validate your own solution: + +1. **Write a brute-force reference solution** (even if O(n!) or exponential) that you are + confident is correct for small inputs. +2. **Write a random test generator** that produces valid inputs within the problem constraints. +3. **Cross-validate (对拍):** Run both solutions on hundreds of random small inputs and compare + outputs. Fix any discrepancies by debugging your main solution against the brute-force. +4. **Stress test:** Generate larger random inputs to check for TLE, MLE, or crashes. +5. **Edge cases:** Manually test minimum inputs (N=1, empty, etc.) and boundary values. + +This self-testing approach is standard competitive programming practice. Do NOT skip it. + +## Workflow + +1. Read the FULL problem statement carefully. Re-read the constraints and edge cases. +2. Understand the I/O format from the examples in the problem statement. +3. Design your algorithm. Think about time complexity vs the constraints. +4. Write a SIMPLE correct solution first — brute force is fine for a first version. +5. Write a separate brute-force and test generator, then cross-validate. +6. Once confident in correctness: optimize for performance if needed. +7. Stress test with larger inputs before finalizing. + +**Retreat strategy:** If stuck on the same bug for 5+ cycles, switch to a simpler +algorithm. A correct brute-force scoring 30% beats a broken solution scoring 0%. + +Submit your final solution as solution.cpp in the current working directory.""" + +# Full-access prompt (parity=False): agent gets test data and helper scripts +FULL_ACCESS_PROMPT = """You are solving a competitive programming problem. + +Problem directory: {problem_dir} +- Read statement.txt for the full problem description +- Time limit: {time_limit}, Memory limit: {memory_limit} +- Total hidden test cases: {total_cases} (your score = fraction passed) +- testdata/ contains sample test cases — these are a SUBSET of the hidden tests""" + +FULL_ACCESS_INTERACTIVE_SECTION = """ +## Problem type: INTERACTIVE + +This is an interactive problem. Your solution communicates with a judge interactor +via stdin/stdout. You do NOT read from files — you read responses from the interactor +and write queries/answers to stdout. 
+ +Key files provided: +- interactor.cc — the judge interactor (uses testlib.h, both provided) +- testdata/*.in — interactor input seeds (NOT your stdin) + +**CRITICAL for interactive problems:** +- You MUST flush stdout after EVERY output line: use `cout << endl;` or `cout << flush;` +- Read the interactor source code to understand the exact protocol (what it sends, what it expects) +- Count your queries carefully against the stated limit + +**Testing interactive solutions locally:** +Use the provided `./run_interactive.sh` script: +```bash +./run_interactive.sh 1 # Test with sample 1 +./run_interactive.sh 2 # Test with sample 2 +# Run all samples: +for i in testdata/*.in; do ./run_interactive.sh $(basename $i .in); done +``` + +If `run_interactive.sh` times out (exit code 4), it usually means a deadlock: +- Missing `flush` / `endl` on your output +- Reading when the interactor expects you to write, or vice versa +- Exceeding the query limit (interactor stops responding) + +**Fallback testing:** If the shell script doesn't work, write a Python wrapper: +```python +import subprocess, os +proc_sol = subprocess.Popen(['./solution'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) +proc_int = subprocess.Popen(['./interactor', 'testdata/1.in', '/dev/null', 'testdata/1.ans'], + stdin=proc_sol.stdout, stdout=proc_sol.stdin) +proc_int.wait(); proc_sol.wait() +print(f"interactor exit: {{proc_int.returncode}}") +``` + +IMPORTANT: You MUST test your solution locally before finalizing. Do NOT submit untested code.""" + +FULL_ACCESS_STANDARD_SECTION = """ +## Problem type: {problem_type} + +**Testing your solution locally:** +Use the provided `./test_all.sh` script: +```bash +./test_all.sh # Compiles solution.cpp and runs against ALL samples +``` +This compiles, runs each sample, and compares output. Always run this before finalizing.{checker_note}""" + +FULL_ACCESS_SCORING_SECTION = """ +## Scoring + +Your score is the fraction of hidden test cases passed (0-100%). +- There are {total_cases} hidden test cases total +- Partial credit counts — passing 7/10 cases = 70% score +- A correct-but-slow solution that passes small cases is MUCH better than a broken fast one +- Prioritize CORRECTNESS over optimality. Get a working solution first, then optimize.""" + +FULL_ACCESS_WORKFLOW = """## Workflow + +1. Read the FULL problem statement carefully. Re-read the constraints and edge cases. +2. Read ALL sample test cases and understand the expected I/O format. +3. Design your algorithm. Think about time complexity vs the constraints. +4. Write a SIMPLE correct solution first — brute force is fine for a first version. +5. Compile and test against ALL samples using the provided test script. +6. If samples fail: debug by examining the diff, don't just rewrite everything. +7. Once samples pass: think about edge cases and whether your algorithm handles large inputs. +8. Optimize only after correctness is established. + +**Critical rules:** +- Do NOT rewrite your solution from scratch more than once. Incremental edits preserve working logic. +- Do NOT skip local testing. Every change must be tested before you move on. +- Do NOT submit without running test_all.sh (or run_interactive.sh for interactive). +- If you TLE on large cases, profile the bottleneck — don't simplify the entire algorithm. + +**Retreat strategy — know when to simplify:** +- If you've been debugging the SAME bug for more than 5 edit-test cycles without progress, + STOP and switch to a fundamentally simpler approach. 
A correct brute-force that passes + small cases is worth more than a broken optimized solution that passes nothing. +- If your approach is off by a small constant (e.g., exceeding a limit by 1), consider whether + a completely different algorithm would avoid the issue rather than patching endlessly. +- Remember: partial credit exists. A solution scoring 30% is infinitely better than 0%. + When in doubt, submit what works even if it's suboptimal. + +Submit your final solution as solution.cpp in the current working directory.""" + +# --------------------------------------------------------------------------- +# CLAUDE.md content +# --------------------------------------------------------------------------- + +CLAUDE_MD_HEADER = """# Agent Eval — Working Directory + +You are solving a competitive programming problem in this directory. + +## Rules + +- Your ONLY deliverable is `solution.cpp` in this directory. +- Use C++17 (g++ -std=gnu++17). +- Always compile with `-O2` for performance testing. +- Read the problem statement COMPLETELY before writing any code. + +## Testing +""" + +CLAUDE_MD_PARITY_TESTING = """No test data or test scripts are provided. +Write your own brute-force solution + random test generator to cross-validate. +This is standard competitive programming practice (对拍). +""" + +CLAUDE_MD_PARITY_INTERACTIVE = """This is an INTERACTIVE problem. +- `cout << endl;` or `cout << flush;` after EVERY line you output +- Read the problem statement to understand the exact protocol +- Count queries against the stated limit +""" + +CLAUDE_MD_FULL_INTERACTIVE = """This is an INTERACTIVE problem. Use `./run_interactive.sh N` to test sample N. +Do NOT skip interactive testing — protocol bugs are the #1 failure mode. + +### Interactive protocol checklist +- `cout << endl;` or `cout << flush;` after EVERY line you output +- Read the interactor source code to know the exact send/receive order +- Count queries against the stated limit +- If run_interactive.sh times out: you likely have a deadlock (missing flush or wrong protocol) +- Fallback: write a Python subprocess wrapper if the shell script fails +""" + +CLAUDE_MD_FULL_STANDARD = """Use `./test_all.sh` to compile and test against all samples. +If chk.cc exists, test_all.sh uses it as a special judge automatically. +Fix any failing samples before moving on to optimization. +""" + +CLAUDE_MD_FOOTER = """ +## Common mistakes to avoid + +- Forgetting to flush stdout in interactive problems +- Off-by-one errors in array indexing (0-indexed vs 1-indexed) +- Integer overflow — use `long long` for anything that could exceed 2^31 +- Reading input in the wrong order or format +- Not handling the edge case where N=1 or the input is minimal +- Rewriting the entire solution when a small fix would work + +## When to retreat + +- If you've edited and tested 5+ times for the same bug without progress, STOP. +- Switch to a simpler algorithm that is guaranteed correct, even if slower. +- A correct brute-force scoring 30% beats a broken clever solution scoring 0%. +- Partial credit is real: every test case you pass counts. 
+""" diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index 05f11671..907a53eb 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -24,6 +24,27 @@ import yaml +from frontier_cs.gen.agent_constants import ( + CLAUDE_MD_FOOTER, + CLAUDE_MD_FULL_INTERACTIVE, + CLAUDE_MD_FULL_STANDARD, + CLAUDE_MD_HEADER, + CLAUDE_MD_PARITY_INTERACTIVE, + CLAUDE_MD_PARITY_TESTING, + FULL_ACCESS_INTERACTIVE_SECTION, + FULL_ACCESS_PROMPT, + FULL_ACCESS_SCORING_SECTION, + FULL_ACCESS_STANDARD_SECTION, + FULL_ACCESS_WORKFLOW, + PARITY_INTERACTIVE_SECTION, + PARITY_PROMPT, + PARITY_SCORING_AND_WORKFLOW, + PARITY_SPJ_SECTION, + PARITY_STANDARD_SECTION, + RUN_INTERACTIVE_SH, + TEST_ALL_SH, +) + logger = logging.getLogger(__name__) # Default budget limits @@ -82,150 +103,6 @@ def _format_samples(samples: List[Dict[str, str]], is_interactive: bool) -> str: return "\n".join(parts) -# Shell script: compile solution.cpp and test against all sample cases. -# If chk.cc exists (special judge), uses it for verification instead of diff. -_TEST_ALL_SH = r"""#!/bin/bash -set -e -echo "=== Compiling solution.cpp ===" -g++ -std=gnu++17 -O2 -o solution solution.cpp -echo "=== Compilation OK ===" - -# Compile checker if available (special judge) -USE_CHECKER=0 -if [ -f "chk.cc" ]; then - echo "=== Compiling special judge (chk.cc) ===" - if g++ -std=gnu++17 -O2 -I. chk.cc -o checker 2>/dev/null; then - USE_CHECKER=1 - echo "=== Checker compiled OK — using it instead of diff ===" - else - echo "=== Checker compilation failed — falling back to diff ===" - fi -fi - -passed=0; failed=0; total=0 -for inf in testdata/*.in; do - [ -f "$inf" ] || continue - id=$(basename "$inf" .in) - ans="testdata/${id}.ans" - [ -f "$ans" ] || continue - total=$((total + 1)) - - # Run with timeout - if timeout 15 ./solution < "$inf" > "my_${id}.out" 2>"my_${id}.err"; then - if [ "$USE_CHECKER" -eq 1 ]; then - # Special judge: ./checker - checker_out=$(./checker "$inf" "my_${id}.out" "$ans" 2>&1) && chk_rc=$? || chk_rc=$? - if [ $chk_rc -eq 0 ]; then - echo " Sample $id: PASS (checker: $checker_out)" - passed=$((passed + 1)) - else - echo " Sample $id: WRONG ANSWER (checker exit $chk_rc)" - echo " Checker output: $checker_out" - failed=$((failed + 1)) - fi - else - # Diff-based comparison (normalize whitespace) - if diff -q <(tr -s '[:space:]' '\n' < "my_${id}.out" | sed '/^$/d') \ - <(tr -s '[:space:]' '\n' < "$ans" | sed '/^$/d') >/dev/null 2>&1; then - echo " Sample $id: PASS" - passed=$((passed + 1)) - else - echo " Sample $id: WRONG ANSWER" - echo " Expected (first 5 lines):" - head -5 "$ans" | sed 's/^/ /' - echo " Got (first 5 lines):" - head -5 "my_${id}.out" | sed 's/^/ /' - failed=$((failed + 1)) - fi - fi - else - rc=$? - echo " Sample $id: RUNTIME ERROR or TLE (exit $rc)" - [ -s "my_${id}.err" ] && head -3 "my_${id}.err" | sed 's/^/ stderr: /' - failed=$((failed + 1)) - fi -done - -echo "=== Results: $passed/$total passed ===" -[ "$failed" -eq 0 ] && exit 0 || exit 1 -""" - -# Shell script: test solution against an interactor using named pipes. -_RUN_INTERACTIVE_SH = r"""#!/bin/bash -# Usage: ./run_interactive.sh [sample_id] (default: 1) -# Compiles solution.cpp and interactor.cc, then tests via pipe. -# Exit codes: 0=accepted, 1=wrong answer, 2=presentation error, 3=build error, 4=timeout/crash - -SAMPLE=${1:-1} -INF="testdata/${SAMPLE}.in" -ANSF="testdata/${SAMPLE}.ans" - -if [ ! 
-f "$INF" ]; then - echo "Error: $INF not found" - exit 3 -fi - -# Compile only if binaries are missing or sources are newer -if [ ! -f ./solution ] || [ solution.cpp -nt ./solution ]; then - echo "=== Compiling solution.cpp ===" - g++ -std=gnu++17 -O2 -o solution solution.cpp || { echo "Compilation failed"; exit 3; } -fi - -if [ ! -f ./interactor ] || [ interactor.cc -nt ./interactor ]; then - echo "=== Compiling interactor ===" - g++ -std=gnu++17 -O2 -I. interactor.cc -o interactor || { echo "Interactor compilation failed"; exit 3; } -fi - -# Create named pipes in current dir (avoids /tmp permission issues) -PIPE_S2I=".pipe_s2i_$$" -PIPE_I2S=".pipe_i2s_$$" -rm -f "$PIPE_S2I" "$PIPE_I2S" -mkfifo "$PIPE_S2I" "$PIPE_I2S" - -cleanup() { rm -f "$PIPE_S2I" "$PIPE_I2S" inter_stderr.tmp sol_stderr.tmp; } -trap cleanup EXIT - -echo "=== Running sample $SAMPLE ===" - -# interactor: reads from solution's stdout via pipe, writes to solution's stdin via pipe -# testlib interactors: argv = [ans] -# We use /dev/null for ouf (output file) since we only care about exit code -timeout 120 ./interactor "$INF" /dev/null "$ANSF" < "$PIPE_S2I" > "$PIPE_I2S" 2>inter_stderr.tmp & -INTER_PID=$! - -timeout 120 ./solution < "$PIPE_I2S" > "$PIPE_S2I" 2>sol_stderr.tmp & -SOL_PID=$! - -# Wait for both processes -INTER_EXIT=0; SOL_EXIT=0 -wait $INTER_PID 2>/dev/null || INTER_EXIT=$? -wait $SOL_PID 2>/dev/null || SOL_EXIT=$? - -# Report results -if [ $INTER_EXIT -eq 0 ]; then - echo " Sample $SAMPLE: ACCEPTED (interactor exit 0)" - [ -s inter_stderr.tmp ] && head -2 inter_stderr.tmp | sed 's/^/ interactor: /' - exit 0 -elif [ $INTER_EXIT -eq 1 ]; then - echo " Sample $SAMPLE: WRONG ANSWER (interactor exit 1)" - [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/ interactor: /' - exit 1 -elif [ $INTER_EXIT -eq 2 ]; then - echo " Sample $SAMPLE: PRESENTATION ERROR (interactor exit 2)" - [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/ interactor: /' - exit 2 -elif [ $INTER_EXIT -eq 124 ] || [ $INTER_EXIT -eq 137 ]; then - echo " Sample $SAMPLE: TIMEOUT (120s exceeded)" - echo " This usually means your solution deadlocked (missing flush? wrong protocol?)" - [ -s sol_stderr.tmp ] && head -3 sol_stderr.tmp | sed 's/^/ solution stderr: /' - exit 4 -else - echo " Sample $SAMPLE: UNKNOWN (interactor exit $INTER_EXIT, solution exit $SOL_EXIT)" - [ -s inter_stderr.tmp ] && head -3 inter_stderr.tmp | sed 's/^/ interactor: /' - [ -s sol_stderr.tmp ] && head -3 sol_stderr.tmp | sed 's/^/ solution: /' - exit 4 -fi -""" def build_agent_prompt(problem_dir: str, *, parity: bool = True) -> str: @@ -260,121 +137,35 @@ def build_agent_prompt(problem_dir: str, *, parity: bool = True) -> str: samples = _collect_samples(problem_dir) - # Base info - parts = [f"""You are solving a competitive programming problem. - -Problem directory: {problem_dir} -- Read statement.txt for the full problem description -- Time limit: {time_limit}, Memory limit: {memory_limit} -- Total hidden test cases: {total_cases} (your score = fraction passed) -- testdata/ contains sample test cases — these are a SUBSET of the hidden tests"""] + parts = [FULL_ACCESS_PROMPT.format( + problem_dir=problem_dir, time_limit=time_limit, + memory_limit=memory_limit, total_cases=total_cases, + )] - # Problem type specific guidance if is_interactive: - parts.append(""" -## Problem type: INTERACTIVE - -This is an interactive problem. Your solution communicates with a judge interactor -via stdin/stdout. 
You do NOT read from files — you read responses from the interactor -and write queries/answers to stdout. - -Key files provided: -- interactor.cc — the judge interactor (uses testlib.h, both provided) -- testdata/*.in — interactor input seeds (NOT your stdin) - -**CRITICAL for interactive problems:** -- You MUST flush stdout after EVERY output line: use `cout << endl;` or `cout << flush;` -- Read the interactor source code to understand the exact protocol (what it sends, what it expects) -- Count your queries carefully against the stated limit - -**Testing interactive solutions locally:** -Use the provided `./run_interactive.sh` script: -```bash -./run_interactive.sh 1 # Test with sample 1 -./run_interactive.sh 2 # Test with sample 2 -# Run all samples: -for i in testdata/*.in; do ./run_interactive.sh $(basename $i .in); done -``` - -If `run_interactive.sh` times out (exit code 4), it usually means a deadlock: -- Missing `flush` / `endl` on your output -- Reading when the interactor expects you to write, or vice versa -- Exceeding the query limit (interactor stops responding) - -**Fallback testing:** If the shell script doesn't work, write a Python wrapper: -```python -import subprocess, os -proc_sol = subprocess.Popen(['./solution'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) -proc_int = subprocess.Popen(['./interactor', 'testdata/1.in', '/dev/null', 'testdata/1.ans'], - stdin=proc_sol.stdout, stdout=proc_sol.stdin) -proc_int.wait(); proc_sol.wait() -print(f"interactor exit: {proc_int.returncode}") -``` - -IMPORTANT: You MUST test your solution locally before finalizing. Do NOT submit untested code.""") + parts.append(FULL_ACCESS_INTERACTIVE_SECTION) else: checker_note = "" if has_checker: - checker_note = """ -Note: This problem has a SPECIAL JUDGE (chk.cc) — multiple valid outputs may be accepted. -`test_all.sh` will automatically compile and use the checker for validation. -If the checker reports PASS but the output looks different from the .ans file, that's fine.""" - - parts.append(f""" -## Problem type: {"SPECIAL JUDGE (multiple valid outputs accepted)" if has_checker else "STANDARD"} - -**Testing your solution locally:** -Use the provided `./test_all.sh` script: -```bash -./test_all.sh # Compiles solution.cpp and runs against ALL samples -``` -This compiles, runs each sample, and compares output. Always run this before finalizing.{checker_note}""") - - # Scoring context - parts.append(f""" -## Scoring - -Your score is the fraction of hidden test cases passed (0-100%). -- There are {total_cases} hidden test cases total -- Partial credit counts — passing 7/10 cases = 70% score -- A correct-but-slow solution that passes small cases is MUCH better than a broken fast one -- Prioritize CORRECTNESS over optimality. 
Get a working solution first, then optimize.""") - - # Embed samples if small enough + checker_note = ("\nNote: This problem has a SPECIAL JUDGE (chk.cc) — " + "multiple valid outputs may be accepted.\n" + "`test_all.sh` will automatically compile and use the " + "checker for validation.\nIf the checker reports PASS but " + "the output looks different from the .ans file, that's fine.") + problem_type = "SPECIAL JUDGE (multiple valid outputs accepted)" if has_checker else "STANDARD" + parts.append(FULL_ACCESS_STANDARD_SECTION.format( + problem_type=problem_type, checker_note=checker_note, + )) + + parts.append(FULL_ACCESS_SCORING_SECTION.format(total_cases=total_cases)) + sample_text = _format_samples(samples, is_interactive) if sample_text: parts.append(sample_text) elif samples: parts.append("\n(Sample inputs are large — read them from testdata/ directory.)\n") - # Workflow - parts.append("""## Workflow - -1. Read the FULL problem statement carefully. Re-read the constraints and edge cases. -2. Read ALL sample test cases and understand the expected I/O format. -3. Design your algorithm. Think about time complexity vs the constraints. -4. Write a SIMPLE correct solution first — brute force is fine for a first version. -5. Compile and test against ALL samples using the provided test script. -6. If samples fail: debug by examining the diff, don't just rewrite everything. -7. Once samples pass: think about edge cases and whether your algorithm handles large inputs. -8. Optimize only after correctness is established. - -**Critical rules:** -- Do NOT rewrite your solution from scratch more than once. Incremental edits preserve working logic. -- Do NOT skip local testing. Every change must be tested before you move on. -- Do NOT submit without running test_all.sh (or run_interactive.sh for interactive). -- If you TLE on large cases, profile the bottleneck — don't simplify the entire algorithm. - -**Retreat strategy — know when to simplify:** -- If you've been debugging the SAME bug for more than 5 edit-test cycles without progress, - STOP and switch to a fundamentally simpler approach. A correct brute-force that passes - small cases is worth more than a broken optimized solution that passes nothing. -- If your approach is off by a small constant (e.g., exceeding a limit by 1), consider whether - a completely different algorithm would avoid the issue rather than patching endlessly. -- Remember: partial credit exists. A solution scoring 30% is infinitely better than 0%. - When in doubt, submit what works even if it's suboptimal. - -Submit your final solution as solution.cpp in the current working directory.""") + parts.append(FULL_ACCESS_WORKFLOW) return "\n".join(parts) @@ -393,162 +184,48 @@ def _build_parity_prompt( Matches the Harbor adapter setup: agent gets only the problem statement and must write its own tests. """ - parts = [f"""You are solving a competitive programming problem. - -Problem directory: {problem_dir} -- Read statement.txt for the full problem description -- Time limit: {time_limit}, Memory limit: {memory_limit} -- Total test cases: {total_cases} (your score = fraction passed) -- Scoring is partial: 0-100% based on test cases passed"""] + parts = [PARITY_PROMPT.format( + problem_dir=problem_dir, time_limit=time_limit, + memory_limit=memory_limit, total_cases=total_cases, + )] if is_interactive: - parts.append(""" -## Problem type: INTERACTIVE - -This is an interactive problem. Your solution communicates with a hidden judge -via stdin/stdout. You do NOT read from files. 
- -**CRITICAL:** -- Flush stdout after EVERY output line: `cout << endl;` or `cout << flush;` -- Read the problem statement carefully for the exact query/response protocol -- Count your queries against the stated limit""") + parts.append(PARITY_INTERACTIVE_SECTION) elif has_checker: - parts.append(""" -## Problem type: SPECIAL JUDGE - -This problem accepts multiple valid outputs. Your solution will be checked by -a special judge, not by exact string matching.""") + parts.append(PARITY_SPJ_SECTION) else: - parts.append(""" -## Problem type: STANDARD - -Your output must match the expected output exactly (whitespace-normalized).""") - - parts.append(f""" -## Scoring + parts.append(PARITY_STANDARD_SECTION) -Your score is the fraction of test cases passed (0-100%). -- There are {total_cases} test cases total -- Partial credit counts — passing 7/10 cases = 70% score -- A correct-but-slow solution that passes small cases is MUCH better than a broken fast one -- Prioritize CORRECTNESS over optimality - -## Self-testing - -No test data or test scripts are provided. You must validate your own solution: - -1. **Write a brute-force reference solution** (even if O(n!) or exponential) that you are - confident is correct for small inputs. -2. **Write a random test generator** that produces valid inputs within the problem constraints. -3. **Cross-validate (对拍):** Run both solutions on hundreds of random small inputs and compare - outputs. Fix any discrepancies by debugging your main solution against the brute-force. -4. **Stress test:** Generate larger random inputs to check for TLE, MLE, or crashes. -5. **Edge cases:** Manually test minimum inputs (N=1, empty, etc.) and boundary values. - -This self-testing approach is standard competitive programming practice. Do NOT skip it. - -## Workflow - -1. Read the FULL problem statement carefully. Re-read the constraints and edge cases. -2. Understand the I/O format from the examples in the problem statement. -3. Design your algorithm. Think about time complexity vs the constraints. -4. Write a SIMPLE correct solution first — brute force is fine for a first version. -5. Write a separate brute-force and test generator, then cross-validate. -6. Once confident in correctness: optimize for performance if needed. -7. Stress test with larger inputs before finalizing. - -**Retreat strategy:** If stuck on the same bug for 5+ cycles, switch to a simpler -algorithm. A correct brute-force scoring 30% beats a broken solution scoring 0%. 
- -Submit your final solution as solution.cpp in the current working directory.""") + parts.append(PARITY_SCORING_AND_WORKFLOW.format(total_cases=total_cases)) return "\n".join(parts) def _write_helper_scripts(workdir: Path, is_interactive: bool) -> None: """Write test helper scripts to the agent's working directory.""" - # Always provide test_all.sh for non-interactive test_all = workdir / "test_all.sh" - test_all.write_text(_TEST_ALL_SH, encoding="utf-8") + test_all.write_text(TEST_ALL_SH, encoding="utf-8") test_all.chmod(test_all.stat().st_mode | stat.S_IEXEC) if is_interactive: run_inter = workdir / "run_interactive.sh" - run_inter.write_text(_RUN_INTERACTIVE_SH, encoding="utf-8") + run_inter.write_text(RUN_INTERACTIVE_SH, encoding="utf-8") run_inter.chmod(run_inter.stat().st_mode | stat.S_IEXEC) def _write_workdir_claude_md(workdir: Path, is_interactive: bool, *, parity: bool = True) -> None: """Write a CLAUDE.md to the workdir so Claude Code picks up behavioral guidance.""" - lines = [ - "# Agent Eval — Working Directory", - "", - "You are solving a competitive programming problem in this directory.", - "", - "## Rules", - "", - "- Your ONLY deliverable is `solution.cpp` in this directory.", - "- Use C++17 (g++ -std=gnu++17).", - "- Always compile with `-O2` for performance testing.", - "- Read the problem statement COMPLETELY before writing any code.", - "", - "## Testing", - "", - ] + parts = [CLAUDE_MD_HEADER] if parity: - lines += [ - "No test data or test scripts are provided.", - "Write your own brute-force solution + random test generator to cross-validate.", - "This is standard competitive programming practice (对拍).", - "", - ] + parts.append(CLAUDE_MD_PARITY_TESTING) if is_interactive: - lines += [ - "This is an INTERACTIVE problem.", - "- `cout << endl;` or `cout << flush;` after EVERY line you output", - "- Read the problem statement to understand the exact protocol", - "- Count queries against the stated limit", - "", - ] + parts.append(CLAUDE_MD_PARITY_INTERACTIVE) elif is_interactive: - lines += [ - "This is an INTERACTIVE problem. 
Use `./run_interactive.sh N` to test sample N.", - "Do NOT skip interactive testing — protocol bugs are the #1 failure mode.", - "", - "### Interactive protocol checklist", - "- `cout << endl;` or `cout << flush;` after EVERY line you output", - "- Read the interactor source code to know the exact send/receive order", - "- Count queries against the stated limit", - "- If run_interactive.sh times out: you likely have a deadlock (missing flush or wrong protocol)", - "- Fallback: write a Python subprocess wrapper if the shell script fails", - "", - ] + parts.append(CLAUDE_MD_FULL_INTERACTIVE) else: - lines += [ - "Use `./test_all.sh` to compile and test against all samples.", - "If chk.cc exists, test_all.sh uses it as a special judge automatically.", - "Fix any failing samples before moving on to optimization.", - "", - ] - lines += [ - "## Common mistakes to avoid", - "", - "- Forgetting to flush stdout in interactive problems", - "- Off-by-one errors in array indexing (0-indexed vs 1-indexed)", - "- Integer overflow — use `long long` for anything that could exceed 2^31", - "- Reading input in the wrong order or format", - "- Not handling the edge case where N=1 or the input is minimal", - "- Rewriting the entire solution when a small fix would work", - "", - "## When to retreat", - "", - "- If you've edited and tested 5+ times for the same bug without progress, STOP.", - "- Switch to a simpler algorithm that is guaranteed correct, even if slower.", - "- A correct brute-force scoring 30% beats a broken clever solution scoring 0%.", - "- Partial credit is real: every test case you pass counts.", - "", - ] - (workdir / "CLAUDE.md").write_text("\n".join(lines), encoding="utf-8") + parts.append(CLAUDE_MD_FULL_STANDARD) + parts.append(CLAUDE_MD_FOOTER) + (workdir / "CLAUDE.md").write_text("\n".join(parts), encoding="utf-8") def extract_solution_cpp(workdir: Path) -> str: From a1f9fae324e40f5e1d39a72ff38d527ec09df550 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Thu, 16 Apr 2026 17:08:47 +0000 Subject: [PATCH 11/16] refactor: move workflow/testing guidance from prompt to CLAUDE.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prompt (initial message) is now lean — only problem-specific info (path, type, limits). CLAUDE.md carries persistent guidance that survives context compaction: self-testing methodology, workflow steps, common mistakes, retreat strategy. 
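A sketch of the resulting prompt shape (the problem path is hypothetical; the
assertions mirror the parity tests updated in this patch):

```python
from frontier_cs.gen.agent_interface import build_agent_prompt

# Parity mode: the initial message now carries only problem-specific
# facts (directory, limits, case count, problem type) and closes by
# pointing the agent at CLAUDE.md and statement.txt in its workdir.
prompt = build_agent_prompt("/work/problems/p123", parity=True)  # hypothetical path
assert "CLAUDE.md" in prompt
assert "statement.txt" in prompt
assert "test_all.sh" not in prompt  # parity prompts never reference helper scripts
```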
--- src/frontier_cs/gen/agent_constants.py | 265 +++++++++++-------------- src/frontier_cs/gen/agent_interface.py | 37 ++-- tests/test_agent_interface.py | 49 +++-- 3 files changed, 162 insertions(+), 189 deletions(-) diff --git a/src/frontier_cs/gen/agent_constants.py b/src/frontier_cs/gen/agent_constants.py index 6b23d915..060fe419 100644 --- a/src/frontier_cs/gen/agent_constants.py +++ b/src/frontier_cs/gen/agent_constants.py @@ -150,7 +150,7 @@ """ # --------------------------------------------------------------------------- -# Prompt templates +# Prompt templates (initial message — problem-specific, may get compacted) # --------------------------------------------------------------------------- # Parity prompt (default): agent gets NO test data, must self-test @@ -159,19 +159,14 @@ Problem directory: {problem_dir} - Read statement.txt for the full problem description - Time limit: {time_limit}, Memory limit: {memory_limit} -- Total test cases: {total_cases} (your score = fraction passed) -- Scoring is partial: 0-100% based on test cases passed""" +- Total test cases: {total_cases} +- Scoring is partial: 0-100% based on fraction of test cases passed""" PARITY_INTERACTIVE_SECTION = """ ## Problem type: INTERACTIVE This is an interactive problem. Your solution communicates with a hidden judge -via stdin/stdout. You do NOT read from files. - -**CRITICAL:** -- Flush stdout after EVERY output line: `cout << endl;` or `cout << flush;` -- Read the problem statement carefully for the exact query/response protocol -- Count your queries against the stated limit""" +via stdin/stdout. You do NOT read from files.""" PARITY_SPJ_SECTION = """ ## Problem type: SPECIAL JUDGE @@ -184,43 +179,9 @@ Your output must match the expected output exactly (whitespace-normalized).""" -PARITY_SCORING_AND_WORKFLOW = """ -## Scoring - -Your score is the fraction of test cases passed (0-100%). -- There are {total_cases} test cases total -- Partial credit counts — passing 7/10 cases = 70% score -- A correct-but-slow solution that passes small cases is MUCH better than a broken fast one -- Prioritize CORRECTNESS over optimality - -## Self-testing - -No test data or test scripts are provided. You must validate your own solution: - -1. **Write a brute-force reference solution** (even if O(n!) or exponential) that you are - confident is correct for small inputs. -2. **Write a random test generator** that produces valid inputs within the problem constraints. -3. **Cross-validate (对拍):** Run both solutions on hundreds of random small inputs and compare - outputs. Fix any discrepancies by debugging your main solution against the brute-force. -4. **Stress test:** Generate larger random inputs to check for TLE, MLE, or crashes. -5. **Edge cases:** Manually test minimum inputs (N=1, empty, etc.) and boundary values. - -This self-testing approach is standard competitive programming practice. Do NOT skip it. - -## Workflow - -1. Read the FULL problem statement carefully. Re-read the constraints and edge cases. -2. Understand the I/O format from the examples in the problem statement. -3. Design your algorithm. Think about time complexity vs the constraints. -4. Write a SIMPLE correct solution first — brute force is fine for a first version. -5. Write a separate brute-force and test generator, then cross-validate. -6. Once confident in correctness: optimize for performance if needed. -7. Stress test with larger inputs before finalizing. - -**Retreat strategy:** If stuck on the same bug for 5+ cycles, switch to a simpler -algorithm. 
A correct brute-force scoring 30% beats a broken solution scoring 0%. - -Submit your final solution as solution.cpp in the current working directory.""" +PARITY_TAIL = """ +Read the CLAUDE.md in this directory for compilation, testing, and workflow guidance. +Begin by reading the full problem statement in statement.txt.""" # Full-access prompt (parity=False): agent gets test data and helper scripts FULL_ACCESS_PROMPT = """You are solving a competitive programming problem. @@ -228,7 +189,8 @@ Problem directory: {problem_dir} - Read statement.txt for the full problem description - Time limit: {time_limit}, Memory limit: {memory_limit} -- Total hidden test cases: {total_cases} (your score = fraction passed) +- Total hidden test cases: {total_cases} +- Scoring is partial: 0-100% based on fraction of test cases passed - testdata/ contains sample test cases — these are a SUBSET of the hidden tests""" FULL_ACCESS_INTERACTIVE_SECTION = """ @@ -240,145 +202,148 @@ Key files provided: - interactor.cc — the judge interactor (uses testlib.h, both provided) -- testdata/*.in — interactor input seeds (NOT your stdin) +- testdata/*.in — interactor input seeds (NOT your stdin)""" -**CRITICAL for interactive problems:** -- You MUST flush stdout after EVERY output line: use `cout << endl;` or `cout << flush;` -- Read the interactor source code to understand the exact protocol (what it sends, what it expects) -- Count your queries carefully against the stated limit +FULL_ACCESS_STANDARD_SECTION = """ +## Problem type: {problem_type} -**Testing interactive solutions locally:** -Use the provided `./run_interactive.sh` script: -```bash -./run_interactive.sh 1 # Test with sample 1 -./run_interactive.sh 2 # Test with sample 2 -# Run all samples: -for i in testdata/*.in; do ./run_interactive.sh $(basename $i .in); done -``` +Use `./test_all.sh` to compile and test against all samples.{checker_note}""" -If `run_interactive.sh` times out (exit code 4), it usually means a deadlock: -- Missing `flush` / `endl` on your output -- Reading when the interactor expects you to write, or vice versa -- Exceeding the query limit (interactor stops responding) - -**Fallback testing:** If the shell script doesn't work, write a Python wrapper: -```python -import subprocess, os -proc_sol = subprocess.Popen(['./solution'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) -proc_int = subprocess.Popen(['./interactor', 'testdata/1.in', '/dev/null', 'testdata/1.ans'], - stdin=proc_sol.stdout, stdout=proc_sol.stdin) -proc_int.wait(); proc_sol.wait() -print(f"interactor exit: {{proc_int.returncode}}") -``` +FULL_ACCESS_TAIL = """ +Read the CLAUDE.md in this directory for compilation, testing, and workflow guidance. +Begin by reading the full problem statement in statement.txt.""" -IMPORTANT: You MUST test your solution locally before finalizing. Do NOT submit untested code.""" +# --------------------------------------------------------------------------- +# CLAUDE.md content (persistent system context — survives compaction) +# --------------------------------------------------------------------------- -FULL_ACCESS_STANDARD_SECTION = """ -## Problem type: {problem_type} +CLAUDE_MD_PARITY = """# Competitive Programming — Agent Workspace + +## Deliverable + +Write your solution to `solution.cpp` in this directory. Nothing else is evaluated. 
+ +## Compilation -**Testing your solution locally:** -Use the provided `./test_all.sh` script: ```bash -./test_all.sh # Compiles solution.cpp and runs against ALL samples +g++ -std=gnu++17 -O2 -o solution solution.cpp ``` -This compiles, runs each sample, and compares output. Always run this before finalizing.{checker_note}""" -FULL_ACCESS_SCORING_SECTION = """ ## Scoring -Your score is the fraction of hidden test cases passed (0-100%). -- There are {total_cases} hidden test cases total -- Partial credit counts — passing 7/10 cases = 70% score -- A correct-but-slow solution that passes small cases is MUCH better than a broken fast one -- Prioritize CORRECTNESS over optimality. Get a working solution first, then optimize.""" - -FULL_ACCESS_WORKFLOW = """## Workflow +Your score = fraction of test cases passed (0-100%). +- Partial credit counts — passing 7/10 cases = 70% +- A correct-but-slow solution passing small cases is MUCH better than a broken fast one +- Prioritize CORRECTNESS over optimality -1. Read the FULL problem statement carefully. Re-read the constraints and edge cases. -2. Read ALL sample test cases and understand the expected I/O format. -3. Design your algorithm. Think about time complexity vs the constraints. -4. Write a SIMPLE correct solution first — brute force is fine for a first version. -5. Compile and test against ALL samples using the provided test script. -6. If samples fail: debug by examining the diff, don't just rewrite everything. -7. Once samples pass: think about edge cases and whether your algorithm handles large inputs. -8. Optimize only after correctness is established. +## Self-testing (no test data provided) -**Critical rules:** -- Do NOT rewrite your solution from scratch more than once. Incremental edits preserve working logic. -- Do NOT skip local testing. Every change must be tested before you move on. -- Do NOT submit without running test_all.sh (or run_interactive.sh for interactive). -- If you TLE on large cases, profile the bottleneck — don't simplify the entire algorithm. +You must validate your own solution: -**Retreat strategy — know when to simplify:** -- If you've been debugging the SAME bug for more than 5 edit-test cycles without progress, - STOP and switch to a fundamentally simpler approach. A correct brute-force that passes - small cases is worth more than a broken optimized solution that passes nothing. -- If your approach is off by a small constant (e.g., exceeding a limit by 1), consider whether - a completely different algorithm would avoid the issue rather than patching endlessly. -- Remember: partial credit exists. A solution scoring 30% is infinitely better than 0%. - When in doubt, submit what works even if it's suboptimal. +1. **Brute-force reference**: Write a simple, obviously correct solution (even O(n!) is fine) +2. **Random test generator**: Produce valid inputs within the problem constraints +3. **Cross-validate (对拍)**: Run both solutions on hundreds of random small inputs, compare outputs. + Fix any discrepancy by debugging your main solution against the brute-force. +4. **Stress test**: Generate larger inputs to check for TLE/MLE/crashes +5. **Edge cases**: Test minimum inputs (N=1, empty) and boundary values -Submit your final solution as solution.cpp in the current working directory.""" +Do NOT skip self-testing. This is standard competitive programming practice. 
-# --------------------------------------------------------------------------- -# CLAUDE.md content -# --------------------------------------------------------------------------- +## Workflow -CLAUDE_MD_HEADER = """# Agent Eval — Working Directory +1. Read the FULL problem statement. Re-read constraints and edge cases. +2. Understand the I/O format from the examples in the statement. +3. Design your algorithm. Consider time complexity vs constraints. +4. Write a SIMPLE correct solution first — brute force is fine initially. +5. Write brute-force + generator, then cross-validate. +6. Optimize for performance only after correctness is confirmed. +7. Stress test with larger inputs before finalizing. -You are solving a competitive programming problem in this directory. +## Common mistakes -## Rules +- Integer overflow — use `long long` for anything that could exceed 2^31 +- Off-by-one errors in array indexing (0-indexed vs 1-indexed) +- Reading input in the wrong order or format +- Not handling N=1 or minimal input edge cases -- Your ONLY deliverable is `solution.cpp` in this directory. -- Use C++17 (g++ -std=gnu++17). -- Always compile with `-O2` for performance testing. -- Read the problem statement COMPLETELY before writing any code. +## When to retreat -## Testing +- If you've been debugging the SAME bug for 5+ edit-test cycles, STOP. +- Switch to a simpler algorithm that is guaranteed correct, even if slower. +- A correct brute-force scoring 30% beats a broken optimized solution scoring 0%. +- Do NOT rewrite from scratch more than once. Incremental edits preserve working logic. """ -CLAUDE_MD_PARITY_TESTING = """No test data or test scripts are provided. -Write your own brute-force solution + random test generator to cross-validate. -This is standard competitive programming practice (对拍). -""" +CLAUDE_MD_PARITY_INTERACTIVE_ADDENDUM = """ +## Interactive problem -CLAUDE_MD_PARITY_INTERACTIVE = """This is an INTERACTIVE problem. -- `cout << endl;` or `cout << flush;` after EVERY line you output -- Read the problem statement to understand the exact protocol -- Count queries against the stated limit +- Flush stdout after EVERY output line: `cout << endl;` or `cout << flush;` +- Read the problem statement carefully for the exact query/response protocol +- Count your queries against the stated limit +- For self-testing: simulate the interactor in a separate program and connect via pipes """ -CLAUDE_MD_FULL_INTERACTIVE = """This is an INTERACTIVE problem. Use `./run_interactive.sh N` to test sample N. -Do NOT skip interactive testing — protocol bugs are the #1 failure mode. +CLAUDE_MD_FULL_ACCESS = """# Competitive Programming — Agent Workspace -### Interactive protocol checklist -- `cout << endl;` or `cout << flush;` after EVERY line you output -- Read the interactor source code to know the exact send/receive order -- Count queries against the stated limit -- If run_interactive.sh times out: you likely have a deadlock (missing flush or wrong protocol) -- Fallback: write a Python subprocess wrapper if the shell script fails -""" +## Deliverable + +Write your solution to `solution.cpp` in this directory. Nothing else is evaluated. + +## Compilation + +```bash +g++ -std=gnu++17 -O2 -o solution solution.cpp +``` + +## Scoring + +Your score = fraction of hidden test cases passed (0-100%). 
+- Partial credit counts — passing 7/10 cases = 70% +- A correct-but-slow solution passing small cases is MUCH better than a broken fast one +- Prioritize CORRECTNESS over optimality + +## Testing + +Use the provided test scripts to validate before finalizing: + +```bash +./test_all.sh # Standard problems: compile + test all samples +./run_interactive.sh 1 # Interactive problems: test sample 1 +``` -CLAUDE_MD_FULL_STANDARD = """Use `./test_all.sh` to compile and test against all samples. If chk.cc exists, test_all.sh uses it as a special judge automatically. -Fix any failing samples before moving on to optimization. -""" -CLAUDE_MD_FOOTER = """ -## Common mistakes to avoid +## Workflow + +1. Read the FULL problem statement. Re-read constraints and edge cases. +2. Read ALL sample test cases and understand expected I/O format. +3. Design your algorithm. Consider time complexity vs constraints. +4. Write a SIMPLE correct solution first — brute force is fine initially. +5. Compile and test against ALL samples using the provided script. +6. If samples fail: debug by examining the diff, don't just rewrite everything. +7. Once samples pass: consider edge cases and large input performance. +8. Optimize only after correctness is established. + +## Interactive problems + +- Flush stdout after EVERY output line: `cout << endl;` or `cout << flush;` +- Read the interactor source code to understand the exact protocol +- Count queries against the stated limit +- If run_interactive.sh times out: likely a deadlock (missing flush or wrong protocol) +- Fallback: write a Python subprocess wrapper if the shell script doesn't work + +## Common mistakes - Forgetting to flush stdout in interactive problems -- Off-by-one errors in array indexing (0-indexed vs 1-indexed) - Integer overflow — use `long long` for anything that could exceed 2^31 +- Off-by-one errors in array indexing (0-indexed vs 1-indexed) - Reading input in the wrong order or format -- Not handling the edge case where N=1 or the input is minimal -- Rewriting the entire solution when a small fix would work +- Not handling N=1 or minimal input edge cases ## When to retreat -- If you've edited and tested 5+ times for the same bug without progress, STOP. +- If you've been debugging the SAME bug for 5+ edit-test cycles, STOP. - Switch to a simpler algorithm that is guaranteed correct, even if slower. -- A correct brute-force scoring 30% beats a broken clever solution scoring 0%. -- Partial credit is real: every test case you pass counts. +- A correct brute-force scoring 30% beats a broken optimized solution scoring 0%. +- Do NOT rewrite from scratch more than once. Incremental edits preserve working logic. 
""" diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index 907a53eb..d7790a99 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -25,22 +25,18 @@ import yaml from frontier_cs.gen.agent_constants import ( - CLAUDE_MD_FOOTER, - CLAUDE_MD_FULL_INTERACTIVE, - CLAUDE_MD_FULL_STANDARD, - CLAUDE_MD_HEADER, - CLAUDE_MD_PARITY_INTERACTIVE, - CLAUDE_MD_PARITY_TESTING, + CLAUDE_MD_FULL_ACCESS, + CLAUDE_MD_PARITY, + CLAUDE_MD_PARITY_INTERACTIVE_ADDENDUM, FULL_ACCESS_INTERACTIVE_SECTION, FULL_ACCESS_PROMPT, - FULL_ACCESS_SCORING_SECTION, FULL_ACCESS_STANDARD_SECTION, - FULL_ACCESS_WORKFLOW, + FULL_ACCESS_TAIL, PARITY_INTERACTIVE_SECTION, PARITY_PROMPT, - PARITY_SCORING_AND_WORKFLOW, PARITY_SPJ_SECTION, PARITY_STANDARD_SECTION, + PARITY_TAIL, RUN_INTERACTIVE_SH, TEST_ALL_SH, ) @@ -148,24 +144,19 @@ def build_agent_prompt(problem_dir: str, *, parity: bool = True) -> str: checker_note = "" if has_checker: checker_note = ("\nNote: This problem has a SPECIAL JUDGE (chk.cc) — " - "multiple valid outputs may be accepted.\n" - "`test_all.sh` will automatically compile and use the " - "checker for validation.\nIf the checker reports PASS but " - "the output looks different from the .ans file, that's fine.") + "multiple valid outputs may be accepted.") problem_type = "SPECIAL JUDGE (multiple valid outputs accepted)" if has_checker else "STANDARD" parts.append(FULL_ACCESS_STANDARD_SECTION.format( problem_type=problem_type, checker_note=checker_note, )) - parts.append(FULL_ACCESS_SCORING_SECTION.format(total_cases=total_cases)) - sample_text = _format_samples(samples, is_interactive) if sample_text: parts.append(sample_text) elif samples: parts.append("\n(Sample inputs are large — read them from testdata/ directory.)\n") - parts.append(FULL_ACCESS_WORKFLOW) + parts.append(FULL_ACCESS_TAIL) return "\n".join(parts) @@ -196,7 +187,7 @@ def _build_parity_prompt( else: parts.append(PARITY_STANDARD_SECTION) - parts.append(PARITY_SCORING_AND_WORKFLOW.format(total_cases=total_cases)) + parts.append(PARITY_TAIL) return "\n".join(parts) @@ -215,17 +206,13 @@ def _write_helper_scripts(workdir: Path, is_interactive: bool) -> None: def _write_workdir_claude_md(workdir: Path, is_interactive: bool, *, parity: bool = True) -> None: """Write a CLAUDE.md to the workdir so Claude Code picks up behavioral guidance.""" - parts = [CLAUDE_MD_HEADER] if parity: - parts.append(CLAUDE_MD_PARITY_TESTING) + content = CLAUDE_MD_PARITY if is_interactive: - parts.append(CLAUDE_MD_PARITY_INTERACTIVE) - elif is_interactive: - parts.append(CLAUDE_MD_FULL_INTERACTIVE) + content += CLAUDE_MD_PARITY_INTERACTIVE_ADDENDUM else: - parts.append(CLAUDE_MD_FULL_STANDARD) - parts.append(CLAUDE_MD_FOOTER) - (workdir / "CLAUDE.md").write_text("\n".join(parts), encoding="utf-8") + content = CLAUDE_MD_FULL_ACCESS + (workdir / "CLAUDE.md").write_text(content, encoding="utf-8") def extract_solution_cpp(workdir: Path) -> str: diff --git a/tests/test_agent_interface.py b/tests/test_agent_interface.py index 34f27037..6265c5f7 100644 --- a/tests/test_agent_interface.py +++ b/tests/test_agent_interface.py @@ -75,23 +75,20 @@ def test_build_agent_prompt_standard(): prompt = build_agent_prompt(str(pdir), parity=False) assert "test_all.sh" in prompt assert "STANDARD" in prompt or "SPECIAL JUDGE" in prompt - assert "solution.cpp" in prompt - assert "Scoring" in prompt - assert "fraction" in prompt.lower() or "partial" in prompt.lower() + assert "partial" in prompt.lower() # 
Samples should be embedded (they're tiny) assert "Sample 1" in prompt def test_build_agent_prompt_interactive(): - """Interactive problem prompt includes interactor guidance and run_interactive.sh.""" + """Interactive problem prompt includes interactor guidance.""" from frontier_cs.gen.agent_interface import build_agent_prompt with tempfile.TemporaryDirectory() as tmpdir: pdir = _make_problem_dir(tmpdir, interactive=True) prompt = build_agent_prompt(str(pdir), parity=False) assert "INTERACTIVE" in prompt - assert "run_interactive.sh" in prompt - assert "flush" in prompt.lower() or "pipe" in prompt.lower() + assert "interactor.cc" in prompt def test_build_agent_prompt_embeds_small_samples(): @@ -132,20 +129,19 @@ def test_build_agent_prompt_parity_no_test_refs(): assert "Sample 1" not in prompt assert "chk.cc" not in prompt assert "interactor.cc" not in prompt - # Should mention self-testing - assert "brute-force" in prompt.lower() or "brute force" in prompt.lower() - assert "solution.cpp" in prompt + # Prompt is lean — delegates to CLAUDE.md + assert "CLAUDE.md" in prompt + assert "statement.txt" in prompt def test_build_agent_prompt_parity_interactive(): - """Parity mode interactive prompt mentions flush but not interactor source.""" + """Parity mode interactive prompt identifies type but delegates details to CLAUDE.md.""" from frontier_cs.gen.agent_interface import build_agent_prompt with tempfile.TemporaryDirectory() as tmpdir: pdir = _make_problem_dir(tmpdir, interactive=True) prompt = build_agent_prompt(str(pdir), parity=True) assert "INTERACTIVE" in prompt - assert "flush" in prompt.lower() assert "run_interactive.sh" not in prompt assert "interactor.cc" not in prompt @@ -247,16 +243,41 @@ def test_write_workdir_claude_md_standard(): _write_workdir_claude_md(workdir, is_interactive=False, parity=False) content = (workdir / "CLAUDE.md").read_text() assert "test_all.sh" in content - assert "run_interactive.sh" not in content + assert "solution.cpp" in content def test_write_workdir_claude_md_interactive(): - """CLAUDE.md for interactive problems mentions flush and run_interactive.sh.""" + """CLAUDE.md for interactive problems mentions flush.""" from frontier_cs.gen.agent_interface import _write_workdir_claude_md with tempfile.TemporaryDirectory() as tmpdir: workdir = Path(tmpdir) _write_workdir_claude_md(workdir, is_interactive=True, parity=False) content = (workdir / "CLAUDE.md").read_text() - assert "run_interactive.sh" in content assert "flush" in content + + +def test_write_workdir_claude_md_parity(): + """Parity CLAUDE.md has self-testing guidance, no test script refs.""" + from frontier_cs.gen.agent_interface import _write_workdir_claude_md + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) + _write_workdir_claude_md(workdir, is_interactive=False, parity=True) + content = (workdir / "CLAUDE.md").read_text() + assert "brute-force" in content.lower() or "brute force" in content.lower() + assert "solution.cpp" in content + assert "test_all.sh" not in content + assert "run_interactive.sh" not in content + + +def test_write_workdir_claude_md_parity_interactive(): + """Parity interactive CLAUDE.md has flush guidance.""" + from frontier_cs.gen.agent_interface import _write_workdir_claude_md + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) + _write_workdir_claude_md(workdir, is_interactive=True, parity=True) + content = (workdir / "CLAUDE.md").read_text() + assert "flush" in content + assert "run_interactive.sh" not in content From 
b31db0a14877feeb471b71d8ae0d4b14957474ee Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Fri, 17 Apr 2026 04:17:16 +0000 Subject: [PATCH 12/16] refactor: soften scoring and retreat guidance to give agents more judgment room --- src/frontier_cs/gen/agent_constants.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/frontier_cs/gen/agent_constants.py b/src/frontier_cs/gen/agent_constants.py index 060fe419..6dcd9d9f 100644 --- a/src/frontier_cs/gen/agent_constants.py +++ b/src/frontier_cs/gen/agent_constants.py @@ -233,8 +233,7 @@ Your score = fraction of test cases passed (0-100%). - Partial credit counts — passing 7/10 cases = 70% -- A correct-but-slow solution passing small cases is MUCH better than a broken fast one -- Prioritize CORRECTNESS over optimality +- Start with a correct solution, then optimize. A working brute-force is a good fallback, but aim for the efficient solution when you can. ## Self-testing (no test data provided) @@ -268,8 +267,7 @@ ## When to retreat -- If you've been debugging the SAME bug for 5+ edit-test cycles, STOP. -- Switch to a simpler algorithm that is guaranteed correct, even if slower. +- If you're going in circles on the same bug with no new insight, consider falling back to a simpler algorithm. - A correct brute-force scoring 30% beats a broken optimized solution scoring 0%. - Do NOT rewrite from scratch more than once. Incremental edits preserve working logic. """ @@ -299,8 +297,7 @@ Your score = fraction of hidden test cases passed (0-100%). - Partial credit counts — passing 7/10 cases = 70% -- A correct-but-slow solution passing small cases is MUCH better than a broken fast one -- Prioritize CORRECTNESS over optimality +- Start with a correct solution, then optimize. A working brute-force is a good fallback, but aim for the efficient solution when you can. ## Testing @@ -342,8 +339,7 @@ ## When to retreat -- If you've been debugging the SAME bug for 5+ edit-test cycles, STOP. -- Switch to a simpler algorithm that is guaranteed correct, even if slower. +- If you're going in circles on the same bug with no new insight, consider falling back to a simpler algorithm. - A correct brute-force scoring 30% beats a broken optimized solution scoring 0%. - Do NOT rewrite from scratch more than once. Incremental edits preserve working logic. """ From 198505043f6cc4dffd30f7a154e6ded136e9ccf9 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Fri, 17 Apr 2026 11:31:37 +0000 Subject: [PATCH 13/16] feat: align timeout and cost limit with Harbor adapter defaults --- src/frontier_cs/gen/agent_interface.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index d7790a99..1cee7176 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -43,9 +43,9 @@ logger = logging.getLogger(__name__) -# Default budget limits -DEFAULT_COST_LIMIT_USD = 20.0 -DEFAULT_TIMEOUT_SECONDS = 1200 # 20 minutes +# Default budget limits — aligned with Harbor adapter (task.toml agent.timeout_sec=3600) +DEFAULT_COST_LIMIT_USD = None # None = no limit; Harbor relies on timeout, not cost cap +DEFAULT_TIMEOUT_SECONDS = 3600 # 1 hour, matching Harbor # Max size of sample I/O to embed directly in the prompt (bytes). # Larger inputs are left for the agent to read from disk. 
@@ -316,7 +316,7 @@ async def run_agent( problem_dir: str, model: str, *, - cost_limit: float = DEFAULT_COST_LIMIT_USD, + cost_limit: Optional[float] = DEFAULT_COST_LIMIT_USD, timeout: float = DEFAULT_TIMEOUT_SECONDS, transcript_path: Optional[Path] = None, parity: bool = True, @@ -326,7 +326,7 @@ async def run_agent( Args: problem_dir: Absolute path to the problem directory. model: Base model name (without -agent suffix). - cost_limit: Maximum cost in USD. + cost_limit: Maximum cost in USD. None = no limit. timeout: Maximum wall-clock time in seconds. transcript_path: Path for JSONL transcript log. None to skip. parity: If True, strip test data and helper scripts (Harbor parity mode). @@ -529,7 +529,7 @@ def generate_agent_solution( problem_dir: str, model: str, *, - cost_limit: float = DEFAULT_COST_LIMIT_USD, + cost_limit: Optional[float] = DEFAULT_COST_LIMIT_USD, timeout: float = DEFAULT_TIMEOUT_SECONDS, transcript_path: Optional[Path] = None, parity: bool = True, @@ -541,7 +541,7 @@ def generate_agent_solution( Args: problem_dir: Absolute path to the problem directory. model: Base model name (without -agent suffix). - cost_limit: Maximum cost in USD. + cost_limit: Maximum cost in USD. None = no limit. timeout: Maximum wall-clock time in seconds. transcript_path: Path for JSONL transcript log. parity: If True, strip test data and helper scripts (Harbor parity mode). From 7a30fd957318a62462ca0309185ff5c560993495 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Fri, 17 Apr 2026 11:32:31 +0000 Subject: [PATCH 14/16] fix: align CLI defaults for agent-timeout and agent-cost-limit with Harbor --- algorithmic/scripts/generate_solutions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/algorithmic/scripts/generate_solutions.py b/algorithmic/scripts/generate_solutions.py index d2b9ed33..64c9bf58 100644 --- a/algorithmic/scripts/generate_solutions.py +++ b/algorithmic/scripts/generate_solutions.py @@ -301,10 +301,10 @@ def main(): help="Maximum parallel generations") # Agent-specific parameters - parser.add_argument("--agent-timeout", type=float, default=1200.0, - help="Agent timeout in seconds (default: 1200 = 20 min)") - parser.add_argument("--agent-cost-limit", type=float, default=20.0, - help="Agent max cost per problem in USD (default: 20)") + parser.add_argument("--agent-timeout", type=float, default=3600.0, + help="Agent timeout in seconds (default: 3600 = 1 hour)") + parser.add_argument("--agent-cost-limit", type=float, default=None, + help="Agent max cost per problem in USD (default: no limit)") args = parser.parse_args() From e1b965c5a604e498562015084e358edb0b0cf494 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 27 Apr 2026 12:04:59 +0000 Subject: [PATCH 15/16] fix: pass api_key per-run and skip judge when all models are agents - generate_solutions.py: detect all-agent runs and skip judge availability check; read statement from local file instead of judge API in agent mode; pass api_key through to generate_agent_solution for key pool rotation - agent_interface.py: add api_key parameter to run_agent/generate_agent_solution, forwarded to SDK subprocess env for per-run key rotation - api_keys.py: only count API-level errors (rate limit, 5xx, auth) toward backoff; application-level failures (agent timeout, no solution) no longer penalize the key --- algorithmic/scripts/generate_solutions.py | 19 ++++++++++++++----- src/frontier_cs/gen/agent_interface.py | 13 +++++++++++++ src/frontier_cs/gen/api_keys.py | 18 ++++++++++++++++-- 3 files changed, 43 insertions(+), 7 
deletions(-) diff --git a/algorithmic/scripts/generate_solutions.py b/algorithmic/scripts/generate_solutions.py index 64c9bf58..58af43e7 100644 --- a/algorithmic/scripts/generate_solutions.py +++ b/algorithmic/scripts/generate_solutions.py @@ -312,10 +312,13 @@ def main(): # Output directory for algorithmic solutions output_dir = algo_dir / "solutions" + # Detect if all models are agent-only (no judge needed) + all_agent = args.models and all(m.endswith("-agent") for m in args.models) + # Initialize judge client judge = AlgorithmicJudgeClient(args.judge_url) - if not judge.is_available(): + if not all_agent and not judge.is_available(): print(f"{red('ERROR:')} Judge server not available at {args.judge_url}") print("Start the judge with: cd algorithmic && docker compose up -d") sys.exit(1) @@ -399,14 +402,19 @@ def main(): skipped: List[str] = [] for problem_id in problem_ids: - statement = judge.get_problem_statement(problem_id) + # Resolve problem directory + problem_dir_path = algo_dir / "problems" / problem_id + + if all_agent: + # Agent mode reads statement from local file; no judge needed + stmt_path = problem_dir_path / "statement.txt" + statement = stmt_path.read_text(encoding="utf-8") if stmt_path.exists() else "" + else: + statement = judge.get_problem_statement(problem_id) if not statement: print(f"{yellow('WARNING:')} Could not get statement for problem {problem_id}") continue - # Resolve problem directory for agent models - problem_dir_path = algo_dir / "problems" / problem_id - for model in models_list: model_prefix = get_model_prefix(model) provider = detect_provider(model) @@ -523,6 +531,7 @@ def execute_task(task: GenerationTask) -> Tuple[str, str, Optional[str], str, Op code, metadata = generate_agent_solution( problem_dir=task.problem_dir, model=base_model, + api_key=api_key, cost_limit=args.agent_cost_limit, timeout=args.agent_timeout, transcript_path=transcript_path, diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index 1cee7176..7ca4e28f 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -316,6 +316,7 @@ async def run_agent( problem_dir: str, model: str, *, + api_key: Optional[str] = None, cost_limit: Optional[float] = DEFAULT_COST_LIMIT_USD, timeout: float = DEFAULT_TIMEOUT_SECONDS, transcript_path: Optional[Path] = None, @@ -326,6 +327,9 @@ async def run_agent( Args: problem_dir: Absolute path to the problem directory. model: Base model name (without -agent suffix). + api_key: Anthropic API key. If provided, passed to the SDK subprocess + via env (per-run), allowing pool-managed key rotation. If None, + the SDK falls back to inheriting ANTHROPIC_API_KEY from the parent. cost_limit: Maximum cost in USD. None = no limit. timeout: Maximum wall-clock time in seconds. transcript_path: Path for JSONL transcript log. None to skip. 
@@ -377,9 +381,14 @@ async def run_agent( prompt = build_agent_prompt(str(workdir), parity=parity) + sdk_env: Dict[str, str] = {} + if api_key: + sdk_env["ANTHROPIC_API_KEY"] = api_key + options = ClaudeAgentOptions( model=model, cwd=str(workdir), + env=sdk_env, max_budget_usd=cost_limit, permission_mode="bypassPermissions", include_partial_messages=True, @@ -529,6 +538,7 @@ def generate_agent_solution( problem_dir: str, model: str, *, + api_key: Optional[str] = None, cost_limit: Optional[float] = DEFAULT_COST_LIMIT_USD, timeout: float = DEFAULT_TIMEOUT_SECONDS, transcript_path: Optional[Path] = None, @@ -541,6 +551,8 @@ def generate_agent_solution( Args: problem_dir: Absolute path to the problem directory. model: Base model name (without -agent suffix). + api_key: Anthropic API key (passed to SDK subprocess env, per-run). + If None, the SDK inherits ANTHROPIC_API_KEY from the parent process. cost_limit: Maximum cost in USD. None = no limit. timeout: Maximum wall-clock time in seconds. transcript_path: Path for JSONL transcript log. @@ -553,6 +565,7 @@ def generate_agent_solution( run_agent( problem_dir, model, + api_key=api_key, cost_limit=cost_limit, timeout=timeout, transcript_path=transcript_path, diff --git a/src/frontier_cs/gen/api_keys.py b/src/frontier_cs/gen/api_keys.py index 2fd67680..db9d3759 100644 --- a/src/frontier_cs/gen/api_keys.py +++ b/src/frontier_cs/gen/api_keys.py @@ -114,23 +114,37 @@ def report_success(self, idx: Optional[int]) -> None: state["backoff_until"] = 0.0 def report_failure(self, idx: Optional[int], error: Optional[str]) -> None: - """Report failed API call for a key.""" + """Report failed task that used this key. + + Only API-key-related errors (auth, rate limit, server-side) trigger + disable/backoff. Application-level errors (e.g. agent timeout, no + solution produced) leave the key untouched, since they say nothing + about key health. + """ if idx is None: return with self._lock: if not (0 <= idx < len(self._states)): return state = self._states[idx] - state["failures"] += 1 reason = (error or "").lower() + fatal_markers = ("invalid", "unauthorized", "forbidden", "permission", "auth") if any(marker in reason for marker in fatal_markers): if not state["disabled"]: logger.warning(f"Disabling API key for {self.name}: invalid/unauthorized") state["disabled"] = True state["backoff_until"] = float("inf") + state["failures"] += 1 return + api_markers = ("rate limit", "rate_limit", "429", "503", "502", "500", + "overloaded", "quota", "throttle", "connection") + if not any(marker in reason for marker in api_markers): + # Not an API/key issue — leave key state unchanged. 
+ return + + state["failures"] += 1 delay: int = min(600, 60 * state["failures"]) state["backoff_until"] = max(state["backoff_until"], time.time() + delay) logger.info(f"Backing off {delay:.0f}s for {self.name} key (failures={state['failures']})") From 6b6fdb2a7aacb68b19947ff21ebb39497689cd84 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 27 Apr 2026 12:08:22 +0000 Subject: [PATCH 16/16] fix: align timeout default (10800s) and prompt wording with Harbor adapter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - DEFAULT_TIMEOUT_SECONDS: 3600 → 10800 to match Harbor task.toml - CLI --agent-timeout default: 3600 → 10800 - PARITY_TAIL / FULL_ACCESS_TAIL: drop stale article "the" before CLAUDE.md to match Harbor's "Read AGENT.md" phrasing --- algorithmic/scripts/generate_solutions.py | 4 ++-- src/frontier_cs/gen/agent_constants.py | 4 ++-- src/frontier_cs/gen/agent_interface.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/algorithmic/scripts/generate_solutions.py b/algorithmic/scripts/generate_solutions.py index 58af43e7..6d695b96 100644 --- a/algorithmic/scripts/generate_solutions.py +++ b/algorithmic/scripts/generate_solutions.py @@ -301,8 +301,8 @@ def main(): help="Maximum parallel generations") # Agent-specific parameters - parser.add_argument("--agent-timeout", type=float, default=3600.0, - help="Agent timeout in seconds (default: 3600 = 1 hour)") + parser.add_argument("--agent-timeout", type=float, default=10800.0, + help="Agent timeout in seconds (default: 10800 = 3 hours)") parser.add_argument("--agent-cost-limit", type=float, default=None, help="Agent max cost per problem in USD (default: no limit)") diff --git a/src/frontier_cs/gen/agent_constants.py b/src/frontier_cs/gen/agent_constants.py index 6dcd9d9f..1b80194c 100644 --- a/src/frontier_cs/gen/agent_constants.py +++ b/src/frontier_cs/gen/agent_constants.py @@ -180,7 +180,7 @@ Your output must match the expected output exactly (whitespace-normalized).""" PARITY_TAIL = """ -Read the CLAUDE.md in this directory for compilation, testing, and workflow guidance. +Read CLAUDE.md in this directory for compilation, testing, and workflow guidance. Begin by reading the full problem statement in statement.txt.""" # Full-access prompt (parity=False): agent gets test data and helper scripts @@ -210,7 +210,7 @@ Use `./test_all.sh` to compile and test against all samples.{checker_note}""" FULL_ACCESS_TAIL = """ -Read the CLAUDE.md in this directory for compilation, testing, and workflow guidance. +Read CLAUDE.md in this directory for compilation, testing, and workflow guidance. Begin by reading the full problem statement in statement.txt.""" # --------------------------------------------------------------------------- diff --git a/src/frontier_cs/gen/agent_interface.py b/src/frontier_cs/gen/agent_interface.py index 7ca4e28f..26f0d9f8 100644 --- a/src/frontier_cs/gen/agent_interface.py +++ b/src/frontier_cs/gen/agent_interface.py @@ -43,9 +43,9 @@ logger = logging.getLogger(__name__) -# Default budget limits — aligned with Harbor adapter (task.toml agent.timeout_sec=3600) +# Default budget limits — aligned with Harbor adapter (task.toml agent.timeout_sec=10800) DEFAULT_COST_LIMIT_USD = None # None = no limit; Harbor relies on timeout, not cost cap -DEFAULT_TIMEOUT_SECONDS = 3600 # 1 hour, matching Harbor +DEFAULT_TIMEOUT_SECONDS = 10800 # 3 hours, matching Harbor # Max size of sample I/O to embed directly in the prompt (bytes). 
# Larger inputs are left for the agent to read from disk.
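With the series applied, the agent entry point defaults to Harbor's limits: a
10800 s timeout and no cost cap. A minimal driver sketch, with hypothetical
paths and the API key left to the environment:

```python
from pathlib import Path

from frontier_cs.gen.agent_interface import generate_agent_solution

# Sketch only: the problem and log paths below are illustrative assumptions.
code, metadata = generate_agent_solution(
    problem_dir="/work/problems/p123",        # hypothetical problem directory
    model="claude-opus-4-6",                  # base model name, without the -agent suffix
    api_key=None,                             # None: SDK inherits ANTHROPIC_API_KEY
    transcript_path=Path("logs/p123.jsonl"),  # JSONL transcript of the agent run
    parity=True,                              # Harbor parity: no test data or helper scripts
)
Path("solution.cpp").write_text(code, encoding="utf-8")
```

cost_limit and timeout are omitted here, so the module defaults apply (no cap,
10800 s); pass them explicitly to override a single run.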