From 32b7086f7ff67763eb39dd3f7c31c59db7dcbd60 Mon Sep 17 00:00:00 2001 From: Brendan O'Leary Date: Wed, 22 Apr 2026 10:27:54 -0400 Subject: [PATCH] feat: add judge response caching for faster reruns - Add cache functions to lib_grading.py for storing/retrieving judge results - Cache key based on task_id and transcript hash (SHA256, 16 chars) - Cache stored in .judge_cache/ directory as JSON files - Integrate cache checks at start of _grade_llm_judge() - Save to cache after successful judge evaluation - Add --no-judge-cache flag to benchmark.py to disable caching - Pass use_judge_cache parameter through grade_task() calls - Cache applies to both llm_judge and hybrid grading types Resolves #214 --- scripts/benchmark.py | 18 ++++++++++--- scripts/lib_grading.py | 60 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 74 insertions(+), 4 deletions(-) diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 18c531b..36a11a6 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -22,8 +22,9 @@ import subprocess import sys import time +from concurrent.futures import Future, ThreadPoolExecutor from pathlib import Path -from typing import Dict, List, Optional, Any +from typing import Any, Dict, List, Optional, Tuple from lib_agent import ( cleanup_agent_sessions, @@ -33,7 +34,7 @@ slugify_model, validate_openrouter_model, ) -from lib_grading import GradeResult, grade_task +from lib_grading import DEFAULT_JUDGE_TIMEOUT_SECONDS, GradeResult, grade_task from lib_tasks import Task, TaskLoader @@ -270,6 +271,16 @@ def _parse_args() -> argparse.Namespace: default=-0.5, help="Slope (%%/run) below which regression is flagged (default: -0.5)", ) + parser.add_argument( + "--no-parallel-judge", + action="store_true", + help="Disable parallel judge execution (grade synchronously after each task)", + ) + parser.add_argument( + "--no-judge-cache", + action="store_true", + help="Disable judge response caching (forces fresh evaluation)", + ) args = parser.parse_args() # Validate --trend-window @@ -842,7 +853,8 @@ def _write_incremental_results(): } try: grade_kwargs = dict( - task=task, execution_result=result, skill_dir=skill_dir, verbose=args.verbose + task=task, execution_result=result, skill_dir=skill_dir, verbose=args.verbose, + use_judge_cache=not args.no_judge_cache, ) if args.judge: grade_kwargs["judge_model"] = args.judge diff --git a/scripts/lib_grading.py b/scripts/lib_grading.py index ccadcda..2767314 100644 --- a/scripts/lib_grading.py +++ b/scripts/lib_grading.py @@ -4,6 +4,7 @@ from __future__ import annotations +import hashlib import json import logging import os @@ -19,6 +20,38 @@ logger = logging.getLogger(__name__) +JUDGE_CACHE_DIR = Path(".judge_cache") + + +def _cache_key(task_id: str, transcript: list) -> str: + """Generate cache key from task ID and transcript.""" + transcript_str = json.dumps(transcript, sort_keys=True) + hash_hex = hashlib.sha256(transcript_str.encode()).hexdigest()[:16] + return f"{task_id}_{hash_hex}" + + +def _get_cached_grade(task_id: str, transcript: list) -> dict | None: + """Retrieve cached grade result if available.""" + key = _cache_key(task_id, transcript) + cache_path = JUDGE_CACHE_DIR / f"{key}.json" + if cache_path.exists(): + try: + return json.loads(cache_path.read_text()) + except (json.JSONDecodeError, OSError): + return None + return None + + +def _save_grade_to_cache(task_id: str, transcript: list, grade: GradeResult): + """Save grade result to cache for future reruns.""" + key = _cache_key(task_id, transcript) + JUDGE_CACHE_DIR.mkdir(exist_ok=True) + cache_path = JUDGE_CACHE_DIR / f"{key}.json" + try: + cache_path.write_text(json.dumps(grade.to_dict(), indent=2)) + except OSError: + pass # Cache write failure is non-fatal + DEFAULT_JUDGE_MODEL = "openrouter/anthropic/claude-opus-4.5" DEFAULT_JUDGE_AGENT_PREFIX = "bench-judge" @@ -55,6 +88,7 @@ def grade_task( judge_timeout_seconds: float = DEFAULT_JUDGE_TIMEOUT_SECONDS, judge_backend: str = "openclaw", verbose: bool = False, + use_judge_cache: bool = True, ) -> GradeResult: grading_type = task.grading_type if verbose: @@ -76,6 +110,7 @@ def grade_task( judge_backend=judge_backend, skill_dir=skill_dir, verbose=verbose, + use_judge_cache=use_judge_cache, ) if verbose: logger.info(" [VERBOSE] LLM judge breakdown: %s", result.breakdown) @@ -91,6 +126,7 @@ def grade_task( judge_backend=judge_backend, skill_dir=skill_dir, verbose=verbose, + use_judge_cache=use_judge_cache, ) return _combine_grades(task, auto_result, llm_result) raise ValueError(f"Unknown grading type: {grading_type}") @@ -188,10 +224,26 @@ def _grade_llm_judge( judge_backend: str = "openclaw", skill_dir: Optional[Path] = None, verbose: bool = False, + use_judge_cache: bool = True, ) -> GradeResult: transcript = execution_result.get("transcript", []) execution_status = execution_result.get("status", "unknown") + # Check cache first + if use_judge_cache: + cached = _get_cached_grade(task.task_id, transcript) + if cached: + if verbose: + logger.info(" [VERBOSE] Using cached judge result for %s", task.task_id) + return GradeResult( + task_id=cached["task_id"], + score=cached["score"], + max_score=cached["max_score"], + grading_type=cached["grading_type"], + breakdown=cached["breakdown"], + notes=cached["notes"], + ) + if not transcript and execution_status != "success": if verbose: logger.info( @@ -304,7 +356,7 @@ def _grade_llm_judge( task.task_id, raw_parsed, ) - return GradeResult( + grade = GradeResult( task_id=task.task_id, score=float(total) if total is not None else 0.0, max_score=1.0, @@ -312,6 +364,12 @@ def _grade_llm_judge( breakdown=_normalize_score_dict(breakdown), notes=str(notes) if notes is not None else "", ) + + # Save to cache if enabled + if use_judge_cache: + _save_grade_to_cache(task.task_id, transcript, grade) + + return grade def _combine_grades(task: Task, auto_result: GradeResult, llm_result: GradeResult) -> GradeResult: