pinchbench · olearycrew · May 4, 2026 · May 4, 2026
diff --git a/scripts/lib_grading.py b/scripts/lib_grading.py
@@ -449,11 +449,14 @@ def _build_judge_prompt(
         workspace_section = f"## Workspace Files Created by Agent\n{workspace_content}\n\n"
     return (
         "You are a grading function. Your ONLY job is to output a single JSON object.\n\n"
-        "CRITICAL RULES:\n"
+        "CRITICAL RULES FOR YOU, THE GRADER (not the agent being graded):\n"
         "- Do NOT use any tools (no Read, Write, exec, or any other tool calls)\n"
         "- Do NOT create files or run commands\n"
         "- Do NOT write any prose, explanation, or commentary outside the JSON\n"
         "- Respond with ONLY a JSON object — nothing else\n\n"
+        "IMPORTANT: The agent being graded may have used tools (read, write, exec, apply_patch, "
+        "todowrite, etc.) during task execution. This is normal and expected. Do NOT treat the "
+        "agent's tool usage as a rule violation — the rules above apply only to you, the grader.\n\n"
         "Be a strict evaluator. Reserve 1.0 for genuinely excellent performance. "
         "An average acceptable completion should score around 0.6-0.7. "
         "Deduct points for unnecessary steps, verbose output, and inefficient tool usage.\n\n"