scripts/benchmark.py (18 changes: 15 additions & 3 deletions)
@@ -22,8 +22,9 @@
import subprocess
import sys
import time
from concurrent.futures import Future, ThreadPoolExecutor
from pathlib import Path
from typing import Dict, List, Optional, Any
from typing import Any, Dict, List, Optional, Tuple

from lib_agent import (
cleanup_agent_sessions,
@@ -33,7 +34,7 @@
slugify_model,
validate_openrouter_model,
)
from lib_grading import GradeResult, grade_task
from lib_grading import DEFAULT_JUDGE_TIMEOUT_SECONDS, GradeResult, grade_task
from lib_tasks import Task, TaskLoader


@@ -270,6 +271,16 @@ def _parse_args() -> argparse.Namespace:
default=-0.5,
help="Slope (%%/run) below which regression is flagged (default: -0.5)",
)
parser.add_argument(
"--no-parallel-judge",
action="store_true",
help="Disable parallel judge execution (grade synchronously after each task)",
)
parser.add_argument(
"--no-judge-cache",
action="store_true",
help="Disable judge response caching (forces fresh evaluation)",
)
args = parser.parse_args()

# Validate --trend-window
@@ -842,7 +853,8 @@ def _write_incremental_results():
}
try:
grade_kwargs = dict(
task=task, execution_result=result, skill_dir=skill_dir, verbose=args.verbose
task=task, execution_result=result, skill_dir=skill_dir, verbose=args.verbose,
use_judge_cache=not args.no_judge_cache,
)
if args.judge:
grade_kwargs["judge_model"] = args.judge
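The parallel-judge path itself is outside this excerpt, but the new concurrent.futures import together with the --no-parallel-judge flag suggests judge calls get submitted to a thread pool while the benchmark keeps running tasks. A rough sketch of that shape, purely for orientation; the helper name, signature, and worker count are assumptions, not code from this PR:

```python
from concurrent.futures import Future, ThreadPoolExecutor


# Hypothetical helper: not part of the diff shown above.
def _grade_all(pairs, grade_fn, parallel: bool, max_workers: int = 4):
    """Grade every (task, result) pair, optionally overlapping judge calls."""
    if not parallel:
        # --no-parallel-judge: grade synchronously after each task finishes.
        return [grade_fn(task, result) for task, result in pairs]
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures: list[Future] = [
            pool.submit(grade_fn, task, result) for task, result in pairs
        ]
        # Collect in submission order so grades line up with the task list.
        return [future.result() for future in futures]
```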
scripts/lib_grading.py (60 changes: 59 additions & 1 deletion)
@@ -4,6 +4,7 @@

from __future__ import annotations

import hashlib
import json
import logging
import os
@@ -19,6 +20,38 @@

logger = logging.getLogger(__name__)

JUDGE_CACHE_DIR = Path(".judge_cache")


def _cache_key(task_id: str, transcript: list) -> str:
Review comment (Contributor):
CRITICAL: Cache key does not include the judge model or backend

The cache key is derived only from task_id and the transcript. If a user runs the benchmark with --judge model-A and the results get cached, then re-runs with --judge model-B, the cache silently returns the stale model-A grades. This defeats the purpose of comparing judge models.

judge_model and judge_backend should both be part of the cache key. Since _cache_key is a private helper called from _grade_llm_judge (which does receive judge_model and judge_backend), the function signature needs to accept those additional inputs.

Suggested change
-def _cache_key(task_id: str, transcript: list) -> str:
+def _cache_key(task_id: str, transcript: list, judge_model: str = "", judge_backend: str = "") -> str:
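A minimal sketch of what the helper could look like under that suggestion, keeping the existing hashing scheme; the exact payload layout is illustrative, not prescriptive:

```python
# json and hashlib are already imported at the top of lib_grading.py.
def _cache_key(task_id: str, transcript: list, judge_model: str = "", judge_backend: str = "") -> str:
    """Generate cache key from task ID, transcript, and judge configuration."""
    # Folding the judge configuration into the hashed payload means switching
    # --judge (or the backend) can never resurface another judge's cached grade.
    payload = json.dumps(
        {
            "transcript": transcript,
            "judge_model": judge_model,
            "judge_backend": judge_backend,
        },
        sort_keys=True,
    )
    hash_hex = hashlib.sha256(payload.encode()).hexdigest()[:16]
    return f"{task_id}_{hash_hex}"
```

The callers (_get_cached_grade and _save_grade_to_cache) would need to pass the same two values through, and entries cached under the old key shape would simply miss, which is the desired behavior here.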

"""Generate cache key from task ID and transcript."""
transcript_str = json.dumps(transcript, sort_keys=True)
hash_hex = hashlib.sha256(transcript_str.encode()).hexdigest()[:16]
return f"{task_id}_{hash_hex}"


def _get_cached_grade(task_id: str, transcript: list) -> dict | None:
"""Retrieve cached grade result if available."""
key = _cache_key(task_id, transcript)
cache_path = JUDGE_CACHE_DIR / f"{key}.json"
if cache_path.exists():
try:
return json.loads(cache_path.read_text())
except (json.JSONDecodeError, OSError):
return None
return None


def _save_grade_to_cache(task_id: str, transcript: list, grade: GradeResult):
"""Save grade result to cache for future reruns."""
key = _cache_key(task_id, transcript)
JUDGE_CACHE_DIR.mkdir(exist_ok=True)
cache_path = JUDGE_CACHE_DIR / f"{key}.json"
try:
cache_path.write_text(json.dumps(grade.to_dict(), indent=2))
except OSError:
pass # Cache write failure is non-fatal


DEFAULT_JUDGE_MODEL = "openrouter/anthropic/claude-opus-4.5"
DEFAULT_JUDGE_AGENT_PREFIX = "bench-judge"
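Taken together, the three helpers above give a simple file-per-grade cache under .judge_cache/, with files named {task_id}_{16-char sha256 prefix}.json. A small illustration of the round trip they enable; the concrete values are made up, and GradeResult is constructed with the same fields the module uses elsewhere:

```python
grade = GradeResult(
    task_id="demo-task",
    score=0.75,
    max_score=1.0,
    grading_type="llm_judge",
    breakdown={"correctness": 0.75},
    notes="example run",
)
_save_grade_to_cache("demo-task", [], grade)  # writes .judge_cache/demo-task_<hash>.json
cached = _get_cached_grade("demo-task", [])   # later run: returns the dict form, or None on a miss
```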
@@ -55,6 +88,7 @@ def grade_task(
judge_timeout_seconds: float = DEFAULT_JUDGE_TIMEOUT_SECONDS,
judge_backend: str = "openclaw",
verbose: bool = False,
use_judge_cache: bool = True,
) -> GradeResult:
grading_type = task.grading_type
if verbose:
@@ -76,6 +110,7 @@
judge_backend=judge_backend,
skill_dir=skill_dir,
verbose=verbose,
use_judge_cache=use_judge_cache,
)
if verbose:
logger.info(" [VERBOSE] LLM judge breakdown: %s", result.breakdown)
@@ -91,6 +126,7 @@
judge_backend=judge_backend,
skill_dir=skill_dir,
verbose=verbose,
use_judge_cache=use_judge_cache,
)
return _combine_grades(task, auto_result, llm_result)
raise ValueError(f"Unknown grading type: {grading_type}")
@@ -188,10 +224,26 @@ def _grade_llm_judge(
judge_backend: str = "openclaw",
skill_dir: Optional[Path] = None,
verbose: bool = False,
use_judge_cache: bool = True,
) -> GradeResult:
transcript = execution_result.get("transcript", [])
execution_status = execution_result.get("status", "unknown")

# Check cache first
if use_judge_cache:
cached = _get_cached_grade(task.task_id, transcript)
if cached:
if verbose:
logger.info(" [VERBOSE] Using cached judge result for %s", task.task_id)
return GradeResult(
task_id=cached["task_id"],
score=cached["score"],
max_score=cached["max_score"],
grading_type=cached["grading_type"],
breakdown=cached["breakdown"],
notes=cached["notes"],
)

if not transcript and execution_status != "success":
if verbose:
logger.info(
@@ -304,14 +356,20 @@
task.task_id,
raw_parsed,
)
return GradeResult(
grade = GradeResult(
task_id=task.task_id,
score=float(total) if total is not None else 0.0,
max_score=1.0,
grading_type="llm_judge",
breakdown=_normalize_score_dict(breakdown),
notes=str(notes) if notes is not None else "",
)

# Save to cache if enabled
if use_judge_cache:
_save_grade_to_cache(task.task_id, transcript, grade)

return grade


def _combine_grades(task: Task, auto_result: GradeResult, llm_result: GradeResult) -> GradeResult:
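End to end, the flag flow is small: --no-judge-cache in benchmark.py becomes use_judge_cache=not args.no_judge_cache inside grade_kwargs, grade_task forwards it to _grade_llm_judge, and a repeat run over an identical task and transcript is served from .judge_cache/ instead of calling the judge again. A compact sketch of that call, assuming the surrounding benchmark loop already has task, result, skill_dir, and args in scope:

```python
grade = grade_task(
    task=task,
    execution_result=result,
    skill_dir=skill_dir,
    use_judge_cache=not args.no_judge_cache,  # --no-judge-cache forces a fresh judge evaluation
)
# On a rerun with caching enabled and an unchanged transcript, _grade_llm_judge
# rebuilds GradeResult from the cached JSON without invoking the judge model.
```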