diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 499b491..f9a535b 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -37,7 +37,14 @@
     validate_openrouter_model,
 )
 from lib_axiom import init_axiom
-from lib_grading import DEFAULT_JUDGE_TIMEOUT_SECONDS, GradeResult, grade_task
+from lib_grading import (
+    DEFAULT_JUDGE_TIMEOUT_SECONDS,
+    GradeResult,
+    grade_task,
+    set_judge_cache_dir,
+    get_judge_cache_stats,
+    clear_judge_cache,
+)
 from lib_tasks import Task, TaskLoader
 
 
@@ -187,6 +194,11 @@ def _parse_args() -> argparse.Namespace:
         default="all",
         help='Tasks to run: "all", "automated-only", a category name (e.g. "coding"), or comma-separated task IDs',
     )
+    parser.add_argument(
+        "--core",
+        action="store_true",
+        help="Run only core tasks (21 representative tasks for quick benchmarking)",
+    )
     parser.add_argument(
         "--output-dir",
         default="results",
@@ -261,6 +273,16 @@ def _parse_args() -> argparse.Namespace:
         action="store_true",
         help="Disable parallel judge execution (grade synchronously after each task)",
     )
+    parser.add_argument(
+        "--no-judge-cache",
+        action="store_true",
+        help="Disable judge result caching (re-grade even if transcript+rubric unchanged)",
+    )
+    parser.add_argument(
+        "--clear-judge-cache",
+        action="store_true",
+        help="Clear the judge cache before running",
+    )
     parser.add_argument(
         "--trend",
         action="store_true",
@@ -741,6 +763,16 @@ def main():
         logger.error("Upload failed: %s", exc)
         sys.exit(1)
 
+    # Initialize judge cache
+    if not args.no_judge_cache:
+        cache_dir = Path(args.output_dir) / ".judge_cache"
+        set_judge_cache_dir(cache_dir)
+        if args.clear_judge_cache:
+            clear_judge_cache()
+            logger.info("🗑️ Judge cache cleared")
+    else:
+        logger.info("📦 Judge caching disabled")
+
     logger.info("🔧 Initializing BenchmarkRunner...")
     runner = BenchmarkRunner(tasks_dir)
 
@@ -775,6 +807,16 @@ def main():
         cleanup_agent_sessions(agent_id)
 
     task_ids = _select_task_ids(runner.tasks, args.suite, runner.task_loader.category_map)
+
+    # Handle --core flag: use core tasks from manifest
+    if args.core:
+        core_task_ids = runner.task_loader.core_tasks
+        if not core_task_ids:
+            logger.warning("⚠️ No core tasks defined in manifest.yaml, running all tasks")
+        else:
+            task_ids = core_task_ids
+            logger.info(f"🎯 Core mode: running {len(core_task_ids)} representative tasks")
+
     results = []
     grades_by_task_id = {}
     sanity_task_id = "task_sanity"
@@ -1152,6 +1194,19 @@ def _build_and_write_results():
     score_pct = (total_score / max_score * 100) if max_score > 0 else 0
     logger.info("📊 Final score: %.2f/%.0f (%.1f%%)", total_score, max_score, score_pct)
 
+    # Log judge cache stats
+    if not args.no_judge_cache:
+        cache_stats = get_judge_cache_stats()
+        if cache_stats["hits"] > 0 or cache_stats["misses"] > 0:
+            hit_rate = cache_stats["hits"] / (cache_stats["hits"] + cache_stats["misses"]) * 100
+            logger.info(
+                "📦 Judge cache: %d hits, %d misses (%.0f%% hit rate, %d entries)",
+                cache_stats["hits"],
+                cache_stats["misses"],
+                hit_rate,
+                cache_stats["entries"],
+            )
+
     logger.info("Saved results to %s", output_path)
     _log_category_summary(task_entries, tasks_by_id, category_order)
     _log_efficiency_summary(efficiency, grades_by_task_id)
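
Note: a minimal sketch of the cache lifecycle the benchmark.py changes above wire
together, using only names introduced in this patch; the directory value and the
printed stats are illustrative, and it assumes the patched lib_grading is importable:

    from pathlib import Path
    from lib_grading import (
        clear_judge_cache,
        get_judge_cache_stats,
        set_judge_cache_dir,
    )

    # Point the cache at a directory; this creates it and loads judge_cache.json if present.
    set_judge_cache_dir(Path("results") / ".judge_cache")

    # ... grading runs here; lib_grading records hits and misses internally ...

    print(get_judge_cache_stats())  # e.g. {"entries": 12, "hits": 9, "misses": 3}
    clear_judge_cache()             # empties memory and deletes judge_cache.json
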
diff --git a/scripts/lib_grading.py b/scripts/lib_grading.py
index f02c5c2..60746ec 100644
--- a/scripts/lib_grading.py
+++ b/scripts/lib_grading.py
@@ -4,6 +4,7 @@
 
 from __future__ import annotations
 
+import hashlib
 import json
 import logging
 import os
@@ -24,6 +25,71 @@
 DEFAULT_JUDGE_AGENT_PREFIX = "bench-judge"
 DEFAULT_JUDGE_TIMEOUT_SECONDS = 300
 
+# Judge result cache: maps cache_key -> GradeResult dict
+# Cache key = hash of (task_id, transcript_summary, rubric, judge_model)
+_judge_cache: Dict[str, Dict[str, Any]] = {}
+_judge_cache_dir: Optional[Path] = None
+
+
+def set_judge_cache_dir(cache_dir: Path) -> None:
+    """Set the directory for persistent judge cache storage."""
+    global _judge_cache_dir
+    _judge_cache_dir = cache_dir
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    _load_judge_cache()
+
+
+def _load_judge_cache() -> None:
+    """Load judge cache from disk."""
+    global _judge_cache
+    if _judge_cache_dir is None:
+        return
+    cache_file = _judge_cache_dir / "judge_cache.json"
+    if cache_file.exists():
+        try:
+            _judge_cache = json.loads(cache_file.read_text(encoding="utf-8"))
+            logger.info(f"📦 Loaded judge cache with {len(_judge_cache)} entries")
+        except Exception as e:
+            logger.warning(f"Failed to load judge cache: {e}")
+            _judge_cache = {}
+
+
+def _save_judge_cache() -> None:
+    """Persist judge cache to disk."""
+    if _judge_cache_dir is None:
+        return
+    cache_file = _judge_cache_dir / "judge_cache.json"
+    try:
+        cache_file.write_text(json.dumps(_judge_cache, indent=2), encoding="utf-8")
+    except Exception as e:
+        logger.warning(f"Failed to save judge cache: {e}")
+
+
+def _compute_cache_key(task_id: str, transcript: str, rubric: str, model: str) -> str:
+    """Compute a cache key from grading inputs."""
+    content = f"{task_id}|{transcript}|{rubric}|{model}"
+    return hashlib.sha256(content.encode()).hexdigest()[:16]
+
+
+def get_judge_cache_stats() -> Dict[str, int]:
+    """Return cache statistics."""
+    return {
+        "entries": len(_judge_cache),
+        "hits": getattr(get_judge_cache_stats, "_hits", 0),
+        "misses": getattr(get_judge_cache_stats, "_misses", 0),
+    }
+
+
+def clear_judge_cache() -> None:
+    """Clear the in-memory and on-disk judge cache."""
+    global _judge_cache
+    _judge_cache = {}
+    if _judge_cache_dir is not None:
+        cache_file = _judge_cache_dir / "judge_cache.json"
+        if cache_file.exists():
+            cache_file.unlink()
+    logger.info("🗑️ Judge cache cleared")
+
 
 @dataclass
 class GradeResult:
@@ -220,6 +286,26 @@ def _grade_llm_judge(
         workspace_content[:500],
     )
     rubric = task.llm_judge_rubric or _format_grading_criteria(task)
+
+    # Check cache before calling judge
+    cache_key = _compute_cache_key(task.task_id, transcript_summary, rubric, judge_model)
+    if cache_key in _judge_cache:
+        cached = _judge_cache[cache_key]
+        if verbose:
+            logger.info(" [VERBOSE] Cache HIT for %s (key=%s)", task.task_id, cache_key[:8])
+        get_judge_cache_stats._hits = getattr(get_judge_cache_stats, "_hits", 0) + 1
+        return GradeResult(
+            task_id=task.task_id,
+            score=cached["score"],
+            max_score=cached["max_score"],
+            grading_type="llm_judge",
+            breakdown=cached.get("breakdown", {}),
+            notes=cached.get("notes", "") + " [cached]",
+        )
+    get_judge_cache_stats._misses = getattr(get_judge_cache_stats, "_misses", 0) + 1
+    if verbose:
+        logger.info(" [VERBOSE] Cache MISS for %s (key=%s)", task.task_id, cache_key[:8])
+
     prompt = _build_judge_prompt(task, transcript_summary, rubric, workspace_content)
 
     max_judge_attempts = 2
@@ -304,7 +390,8 @@
             task.task_id,
             raw_parsed,
         )
-    return GradeResult(
+
+    result = GradeResult(
         task_id=task.task_id,
         score=float(total) if total is not None else 0.0,
         max_score=1.0,
@@ -312,6 +399,18 @@
         breakdown=_normalize_score_dict(breakdown),
         notes=str(notes) if notes is not None else "",
     )
+
+    # Cache successful results (only if we got a valid score)
+    if total is not None:
+        _judge_cache[cache_key] = {
+            "score": result.score,
+            "max_score": result.max_score,
+            "breakdown": result.breakdown,
+            "notes": result.notes,
+        }
+        _save_judge_cache()
+
+    return result
 
 
 def _combine_grades(task: Task, auto_result: GradeResult, llm_result: GradeResult) -> GradeResult:
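
Note: the cache key is the first 16 hex digits (64 bits) of a SHA-256 over the
pipe-joined grading inputs, so any change to the task ID, transcript, rubric, or
judge model yields a different key and forces a re-grade. A standalone sketch
mirroring _compute_cache_key; the input values are made up for illustration:

    import hashlib

    def compute_cache_key(task_id: str, transcript: str, rubric: str, model: str) -> str:
        # Same scheme as the patch: pipe-join the inputs, hash, truncate to 16 hex chars.
        content = f"{task_id}|{transcript}|{rubric}|{model}"
        return hashlib.sha256(content.encode()).hexdigest()[:16]

    k1 = compute_cache_key("task_email", "transcript text", "rubric v1", "judge-model")
    k2 = compute_cache_key("task_email", "transcript text", "rubric v2", "judge-model")
    assert k1 != k2  # editing the rubric invalidates the cached grade

At 64 bits the truncation keeps accidental collisions negligible for a cache of this
size, though a literal "|" inside one input could in principle alias two distinct
input tuples.
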
diff --git a/scripts/lib_tasks.py b/scripts/lib_tasks.py
index 0ae0f22..4eda809 100644
--- a/scripts/lib_tasks.py
+++ b/scripts/lib_tasks.py
@@ -80,6 +80,7 @@ def __init__(self, tasks_dir: Path):
         self.tasks_dir = tasks_dir
         self.category_map: Dict[str, str] = {}  # task_id -> category from manifest
        self.categories: List[str] = []  # ordered list of category names
+        self.core_tasks: List[str] = []  # core task IDs for quick benchmark runs
         logger.info(f"Initialized TaskLoader with directory: {tasks_dir}")
 
     def load_all_tasks(self) -> List[Task]:
@@ -141,12 +142,13 @@ def _load_from_manifest(self, manifest_path: Path) -> List[Task]:
     def _parse_categorized_manifest(self, manifest: Dict[str, Any]) -> List[str]:
         """Parse the categorized manifest format and return an ordered task list.
 
-        Populates ``self.category_map`` and ``self.categories``. Tasks listed
-        in ``run_first`` are placed at the front of the returned list while
-        preserving their category membership.
+        Populates ``self.category_map``, ``self.categories``, and ``self.core_tasks``.
+        Tasks listed in ``run_first`` are placed at the front of the returned list
+        while preserving their category membership.
         """
         run_first: List[str] = manifest.get("run_first", [])
         categories: Dict[str, List[str]] = manifest.get("categories", {})
+        self.core_tasks = manifest.get("core", [])
         self.categories = list(categories.keys())
         self.category_map = {}
 
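
Note: `core` is read as a flat list of task IDs, independent of `categories` and
`run_first`; when absent it defaults to an empty list. A minimal sketch of how the
new key is consumed (assumes PyYAML, which the .yaml manifest implies):

    import yaml

    with open("tasks/manifest.yaml", encoding="utf-8") as fh:
        manifest = yaml.safe_load(fh)

    core_tasks = manifest.get("core", [])  # same default as _parse_categorized_manifest
    print(len(core_tasks), "core tasks")   # 21 with the list below
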
diff --git a/tasks/manifest.yaml b/tasks/manifest.yaml
index 1a80e2a..62e9d62 100644
--- a/tasks/manifest.yaml
+++ b/tasks/manifest.yaml
@@ -8,10 +8,53 @@
 #
 # The `run_first` list contains tasks that must execute before all others
 # (in the order listed), regardless of which category they belong to.
+#
+# The `core` list defines a smaller subset of representative tasks for quick
+# benchmark runs. Use the --core flag to run only these tasks. Selection criteria:
+#   - At least one task per category
+#   - Prefer automated grading for speed
+#   - Include a mix of difficulty levels
+#   - Cover key capability areas
 
 run_first:
   - task_sanity
 
+# Core tasks (21) for quick benchmark runs, about 20% of the full suite.
+# Run with: benchmark.py --core
+core:
+  # productivity (2)
+  - task_sanity                   # automated - baseline/smoke test
+  - task_calendar                 # automated - tool use
+  # research (2)
+  - task_stock                    # automated - web search
+  - task_market_research          # hybrid - deeper research
+  # writing (2)
+  - task_email                    # llm_judge - short form
+  - task_humanizer                # hybrid - text transformation
+  # coding (3)
+  - task_weather                  # automated - simple coding
+  - task_shell_command_generator  # automated - command gen
+  - task_multi_file_refactoring   # automated - code changes
+  # analysis (2)
+  - task_summary                  # llm_judge - comprehension
+  - task_spreadsheet_summary      # hybrid - data analysis
+  # csv_analysis (2)
+  - task_csv_stock_trend          # hybrid - time series
+  - task_csv_iris_summary         # hybrid - stats
+  # log_analysis (2)
+  - task_log_apache_top_errors    # hybrid - pattern finding
+  - task_log_syslog_boot          # hybrid - system logs
+  # meeting_analysis (2)
+  - task_meeting_tldr             # hybrid - summarization
+  - task_meeting_council_votes    # hybrid - extraction
+  # memory (1)
+  - task_memory                   # automated - memory ops
+  # skills (2)
+  - task_files                    # automated - file ops
+  - task_skill_search             # automated - skill discovery
+  # integrations (1)
+  - task_gws_email_triage         # hybrid - integration test
+
 categories:
   productivity:
     - task_sanity
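
Note: nothing in this patch validates that each `core` entry names a real task; a
typo would only surface at run time as a missing task ID. A quick manifest check
along these lines could catch that earlier (a sketch assuming PyYAML; not part of
the patch):

    import yaml

    with open("tasks/manifest.yaml", encoding="utf-8") as fh:
        manifest = yaml.safe_load(fh)

    # Every core task should appear in some category.
    known = {t for tasks in manifest.get("categories", {}).values() for t in tasks}
    unknown = [t for t in manifest.get("core", []) if t not in known]
    assert not unknown, f"core tasks missing from categories: {unknown}"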