From d6c9f6d8bb2c61fbec5f62d5ba09c5d533136850 Mon Sep 17 00:00:00 2001 From: Brendan O'Leary Date: Mon, 4 May 2026 10:01:41 -0400 Subject: [PATCH] fix: clarify judge prompt to not penalize agent tool use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The judge prompt's 'Do NOT use any tools' instruction was ambiguous — some judge models interpreted it as a constraint on the evaluated agent and auto-failed tasks when they saw tool calls in transcripts. Scoped the 'no tools' rule explicitly to the grader and added a clarification that agent tool usage is normal and expected. Closes #374 --- scripts/lib_grading.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/lib_grading.py b/scripts/lib_grading.py index 09824626..f02c5c2d 100644 --- a/scripts/lib_grading.py +++ b/scripts/lib_grading.py @@ -449,11 +449,14 @@ def _build_judge_prompt( workspace_section = f"## Workspace Files Created by Agent\n{workspace_content}\n\n" return ( "You are a grading function. Your ONLY job is to output a single JSON object.\n\n" - "CRITICAL RULES:\n" + "CRITICAL RULES FOR YOU, THE GRADER (not the agent being graded):\n" "- Do NOT use any tools (no Read, Write, exec, or any other tool calls)\n" "- Do NOT create files or run commands\n" "- Do NOT write any prose, explanation, or commentary outside the JSON\n" "- Respond with ONLY a JSON object — nothing else\n\n" + "IMPORTANT: The agent being graded may have used tools (read, write, exec, apply_patch, " + "todowrite, etc.) during task execution. This is normal and expected. Do NOT treat the " + "agent's tool usage as a rule violation — the rules above apply only to you, the grader.\n\n" "Be a strict evaluator. Reserve 1.0 for genuinely excellent performance. " "An average acceptable completion should score around 0.6-0.7. " "Deduct points for unnecessary steps, verbose output, and inefficient tool usage.\n\n"