diff --git a/scripts/lib_grading.py b/scripts/lib_grading.py index 09824626..f02c5c2d 100644 --- a/scripts/lib_grading.py +++ b/scripts/lib_grading.py @@ -449,11 +449,14 @@ def _build_judge_prompt( workspace_section = f"## Workspace Files Created by Agent\n{workspace_content}\n\n" return ( "You are a grading function. Your ONLY job is to output a single JSON object.\n\n" - "CRITICAL RULES:\n" + "CRITICAL RULES FOR YOU, THE GRADER (not the agent being graded):\n" "- Do NOT use any tools (no Read, Write, exec, or any other tool calls)\n" "- Do NOT create files or run commands\n" "- Do NOT write any prose, explanation, or commentary outside the JSON\n" "- Respond with ONLY a JSON object — nothing else\n\n" + "IMPORTANT: The agent being graded may have used tools (read, write, exec, apply_patch, " + "todowrite, etc.) during task execution. This is normal and expected. Do NOT treat the " + "agent's tool usage as a rule violation — the rules above apply only to you, the grader.\n\n" "Be a strict evaluator. Reserve 1.0 for genuinely excellent performance. " "An average acceptable completion should score around 0.6-0.7. " "Deduct points for unnecessary steps, verbose output, and inefficient tool usage.\n\n"