57 changes: 56 additions & 1 deletion scripts/benchmark.py
@@ -37,7 +37,14 @@
validate_openrouter_model,
)
from lib_axiom import init_axiom
-from lib_grading import DEFAULT_JUDGE_TIMEOUT_SECONDS, GradeResult, grade_task
+from lib_grading import (
+    DEFAULT_JUDGE_TIMEOUT_SECONDS,
+    GradeResult,
+    grade_task,
+    set_judge_cache_dir,
+    get_judge_cache_stats,
+    clear_judge_cache,
+)
from lib_tasks import Task, TaskLoader


@@ -187,6 +194,11 @@ def _parse_args() -> argparse.Namespace:
default="all",
help='Tasks to run: "all", "automated-only", a category name (e.g. "coding"), or comma-separated task IDs',
)
parser.add_argument(
"--core",
action="store_true",
help="Run only core tasks (~25 representative tasks for quick benchmarking)",
)
parser.add_argument(
"--output-dir",
default="results",
@@ -261,6 +273,16 @@ def _parse_args() -> argparse.Namespace:
action="store_true",
help="Disable parallel judge execution (grade synchronously after each task)",
)
parser.add_argument(
"--no-judge-cache",
action="store_true",
help="Disable judge result caching (re-grade even if transcript+rubric unchanged)",
)
parser.add_argument(
"--clear-judge-cache",
action="store_true",
help="Clear the judge cache before running",
)
parser.add_argument(
"--trend",
action="store_true",
@@ -741,6 +763,16 @@ def main():
logger.error("Upload failed: %s", exc)
sys.exit(1)

# Initialize judge cache
if not args.no_judge_cache:
cache_dir = Path(args.output_dir) / ".judge_cache"
set_judge_cache_dir(cache_dir)
if args.clear_judge_cache:
clear_judge_cache()
logger.info("🗑️ Judge cache cleared")
else:
logger.info("📦 Judge caching disabled")

logger.info("🔧 Initializing BenchmarkRunner...")
runner = BenchmarkRunner(tasks_dir)

@@ -775,6 +807,16 @@ def main():
cleanup_agent_sessions(agent_id)

task_ids = _select_task_ids(runner.tasks, args.suite, runner.task_loader.category_map)

# Handle --core flag: use core tasks from manifest
if args.core:
core_task_ids = runner.task_loader.core_tasks
if not core_task_ids:
logger.warning("⚠️ No core tasks defined in manifest.yaml, running all tasks")
else:
task_ids = core_task_ids
logger.info(f"🎯 Core mode: running {len(core_task_ids)} representative tasks")

results = []
grades_by_task_id = {}
sanity_task_id = "task_sanity"
@@ -1152,6 +1194,19 @@ def _build_and_write_results():
score_pct = (total_score / max_score * 100) if max_score > 0 else 0
logger.info("📊 Final score: %.2f/%.0f (%.1f%%)", total_score, max_score, score_pct)

# Log judge cache stats
if not args.no_judge_cache:
cache_stats = get_judge_cache_stats()
if cache_stats["hits"] > 0 or cache_stats["misses"] > 0:
hit_rate = cache_stats["hits"] / (cache_stats["hits"] + cache_stats["misses"]) * 100
logger.info(
"📦 Judge cache: %d hits, %d misses (%.0f%% hit rate, %d entries)",
cache_stats["hits"],
cache_stats["misses"],
hit_rate,
cache_stats["entries"],
)

logger.info("Saved results to %s", output_path)
_log_category_summary(task_entries, tasks_by_id, category_order)
_log_efficiency_summary(efficiency, grades_by_task_id)
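Taken together, the new flags compose cleanly: --core restricts the run to the manifest's core subset, --clear-judge-cache wipes <output-dir>/.judge_cache before grading, and --no-judge-cache skips cache reads and writes entirely. Worth noting: the cache key covers the transcript summary, rubric, and judge model but not the judge prompt template, so clearing the cache is the safe reset after changing _build_judge_prompt.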
101 changes: 100 additions & 1 deletion scripts/lib_grading.py
@@ -4,6 +4,7 @@

from __future__ import annotations

import hashlib
import json
import logging
import os
@@ -24,6 +25,71 @@
DEFAULT_JUDGE_AGENT_PREFIX = "bench-judge"
DEFAULT_JUDGE_TIMEOUT_SECONDS = 300

# Judge result cache: maps cache_key -> GradeResult dict
# Cache key = hash of (task_id, transcript_summary, rubric, judge_model)
_judge_cache: Dict[str, Dict[str, Any]] = {}
_judge_cache_dir: Optional[Path] = None


def set_judge_cache_dir(cache_dir: Path) -> None:
"""Set the directory for persistent judge cache storage."""
global _judge_cache_dir
_judge_cache_dir = cache_dir
cache_dir.mkdir(parents=True, exist_ok=True)
_load_judge_cache()


def _load_judge_cache() -> None:
"""Load judge cache from disk."""
global _judge_cache
if _judge_cache_dir is None:
return
cache_file = _judge_cache_dir / "judge_cache.json"
if cache_file.exists():
try:
_judge_cache = json.loads(cache_file.read_text(encoding="utf-8"))
logger.info(f"📦 Loaded judge cache with {len(_judge_cache)} entries")
except Exception as e:
logger.warning(f"Failed to load judge cache: {e}")
_judge_cache = {}


def _save_judge_cache() -> None:
"""Persist judge cache to disk."""
if _judge_cache_dir is None:
return
cache_file = _judge_cache_dir / "judge_cache.json"
try:
cache_file.write_text(json.dumps(_judge_cache, indent=2), encoding="utf-8")
except Exception as e:
logger.warning(f"Failed to save judge cache: {e}")


def _compute_cache_key(task_id: str, transcript: str, rubric: str, model: str) -> str:
"""Compute a cache key from grading inputs."""
content = f"{task_id}|{transcript}|{rubric}|{model}"
return hashlib.sha256(content.encode()).hexdigest()[:16]

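As a standalone illustration (task and model names hypothetical), the key is a 16-hex-character SHA-256 prefix over the pipe-joined inputs, so any change to the transcript, rubric, or judge model yields a different key:

import hashlib

def cache_key(task_id: str, transcript: str, rubric: str, model: str) -> str:
    # Mirrors _compute_cache_key above: pipe-join the inputs, hash, keep 64 bits of hex.
    content = f"{task_id}|{transcript}|{rubric}|{model}"
    return hashlib.sha256(content.encode()).hexdigest()[:16]

k1 = cache_key("task_email", "transcript v1", "rubric", "judge-a")
k2 = cache_key("task_email", "transcript v2", "rubric", "judge-a")
assert k1 != k2 and len(k1) == 16  # edited transcript -> cache miss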

def get_judge_cache_stats() -> Dict[str, int]:
"""Return cache statistics."""
return {
"entries": len(_judge_cache),
"hits": getattr(get_judge_cache_stats, "_hits", 0),
"misses": getattr(get_judge_cache_stats, "_misses", 0),
}

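The hit/miss counters live as attributes on the stats function itself rather than as module globals; a minimal standalone sketch of the pattern:

from typing import Dict

def stats() -> Dict[str, int]:
    # getattr with a default means the counters read as 0 before first use.
    return {"hits": getattr(stats, "_hits", 0), "misses": getattr(stats, "_misses", 0)}

stats._hits = getattr(stats, "_hits", 0) + 1  # done on each cache hit
assert stats() == {"hits": 1, "misses": 0}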

def clear_judge_cache() -> None:
"""Clear the in-memory and on-disk judge cache."""
global _judge_cache
_judge_cache = {}
if _judge_cache_dir is not None:
cache_file = _judge_cache_dir / "judge_cache.json"
if cache_file.exists():
cache_file.unlink()
logger.info("🗑️ Judge cache cleared")

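For reference, the persisted judge_cache.json is a flat mapping from cache key to cached grade fields; an illustrative entry (all values hypothetical):

example_cache = {
    "3f2a9c0d1b4e5f6a": {  # 16-hex-char key from _compute_cache_key
        "score": 0.85,
        "max_score": 1.0,
        "breakdown": {"accuracy": 0.9, "completeness": 0.8},
        "notes": "Solid answer; one criterion partially met.",
    }
}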

@dataclass
class GradeResult:
@@ -220,6 +286,26 @@ def _grade_llm_judge(
workspace_content[:500],
)
rubric = task.llm_judge_rubric or _format_grading_criteria(task)

# Check cache before calling judge
cache_key = _compute_cache_key(task.task_id, transcript_summary, rubric, judge_model)
if cache_key in _judge_cache:
cached = _judge_cache[cache_key]
if verbose:
logger.info(" [VERBOSE] Cache HIT for %s (key=%s)", task.task_id, cache_key[:8])
get_judge_cache_stats._hits = getattr(get_judge_cache_stats, "_hits", 0) + 1
return GradeResult(
task_id=task.task_id,
score=cached["score"],
max_score=cached["max_score"],
grading_type="llm_judge",
breakdown=cached.get("breakdown", {}),
notes=cached.get("notes", "") + " [cached]",
)
get_judge_cache_stats._misses = getattr(get_judge_cache_stats, "_misses", 0) + 1
if verbose:
logger.info(" [VERBOSE] Cache MISS for %s (key=%s)", task.task_id, cache_key[:8])

prompt = _build_judge_prompt(task, transcript_summary, rubric, workspace_content)

max_judge_attempts = 2
@@ -304,14 +390,27 @@ def _grade_llm_judge(
task.task_id,
raw_parsed,
)
-    return GradeResult(
+    result = GradeResult(
task_id=task.task_id,
score=float(total) if total is not None else 0.0,
max_score=1.0,
grading_type="llm_judge",
breakdown=_normalize_score_dict(breakdown),
notes=str(notes) if notes is not None else "",
)

# Cache successful results (only if we got a valid score)
if total is not None:
_judge_cache[cache_key] = {
"score": result.score,
"max_score": result.max_score,
"breakdown": result.breakdown,
"notes": result.notes,
}
_save_judge_cache()

return result


def _combine_grades(task: Task, auto_result: GradeResult, llm_result: GradeResult) -> GradeResult:
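Pulling the pieces together, a minimal sketch of how benchmark.py drives this cache API (output directory illustrative):

from pathlib import Path
from lib_grading import set_judge_cache_dir, get_judge_cache_stats, clear_judge_cache

set_judge_cache_dir(Path("results") / ".judge_cache")  # creates the dir, loads prior entries
# ... run tasks; grade_task() consults and populates the cache internally ...
print(get_judge_cache_stats())  # e.g. {'entries': 12, 'hits': 9, 'misses': 3}
clear_judge_cache()             # wipes both the in-memory dict and judge_cache.json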
8 changes: 5 additions & 3 deletions scripts/lib_tasks.py
@@ -80,6 +80,7 @@ def __init__(self, tasks_dir: Path):
self.tasks_dir = tasks_dir
self.category_map: Dict[str, str] = {} # task_id -> category from manifest
self.categories: List[str] = [] # ordered list of category names
self.core_tasks: List[str] = [] # core task IDs for quick benchmark runs
logger.info(f"Initialized TaskLoader with directory: {tasks_dir}")

def load_all_tasks(self) -> List[Task]:
Expand Down Expand Up @@ -141,12 +142,13 @@ def _load_from_manifest(self, manifest_path: Path) -> List[Task]:
def _parse_categorized_manifest(self, manifest: Dict[str, Any]) -> List[str]:
"""Parse the categorized manifest format and return an ordered task list.

-        Populates ``self.category_map`` and ``self.categories``. Tasks listed
-        in ``run_first`` are placed at the front of the returned list while
-        preserving their category membership.
+        Populates ``self.category_map``, ``self.categories``, and ``self.core_tasks``.
+        Tasks listed in ``run_first`` are placed at the front of the returned list
+        while preserving their category membership.
"""
run_first: List[str] = manifest.get("run_first", [])
categories: Dict[str, List[str]] = manifest.get("categories", {})
self.core_tasks = manifest.get("core", [])

self.categories = list(categories.keys())
self.category_map = {}
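A condensed sketch of the manifest fields _parse_categorized_manifest now consumes (dict contents illustrative):

manifest = {
    "run_first": ["task_sanity"],
    "core": ["task_sanity", "task_calendar"],
    "categories": {"productivity": ["task_sanity", "task_calendar"]},
}
core_tasks = manifest.get("core", [])  # absent key -> [], so --core falls back to all tasks
assert core_tasks == ["task_sanity", "task_calendar"]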
43 changes: 43 additions & 0 deletions tasks/manifest.yaml
@@ -8,10 +8,53 @@
#
# The `run_first` list contains tasks that must execute before all others
# (in the order listed), regardless of which category they belong to.
#
# The `core` list defines a smaller subset of representative tasks for quick
# benchmark runs. Use the --core flag to run only these tasks. Selection criteria:
# - At least one task per category
# - Prefer automated grading for speed
# - Include a mix of difficulty levels
# - Cover key capability areas

run_first:
- task_sanity

# Core tasks (~25) for quick benchmark runs. ~20% of full suite.
# Run with: benchmark.py --core
core:
# productivity (2)
- task_sanity # automated - baseline/smoke test
- task_calendar # automated - tool use
# research (2)
- task_stock # automated - web search
- task_market_research # hybrid - deeper research
# writing (2)
- task_email # llm_judge - short form
- task_humanizer # hybrid - text transformation
# coding (3)
- task_weather # automated - simple coding
- task_shell_command_generator # automated - command gen
- task_multi_file_refactoring # automated - code changes
# analysis (2)
- task_summary # llm_judge - comprehension
- task_spreadsheet_summary # hybrid - data analysis
# csv_analysis (2)
- task_csv_stock_trend # hybrid - time series
- task_csv_iris_summary # hybrid - stats
# log_analysis (2)
- task_log_apache_top_errors # hybrid - pattern finding
- task_log_syslog_boot # hybrid - system logs
# meeting_analysis (2)
- task_meeting_tldr # hybrid - summarization
- task_meeting_council_votes # hybrid - extraction
# memory (1)
- task_memory # automated - memory ops
# skills (2)
- task_files # automated - file ops
- task_skill_search # automated - skill discovery
# integrations (1)
- task_gws_email_triage # hybrid - integration test

categories:
productivity:
- task_sanity