57 changes: 56 additions & 1 deletion scripts/benchmark.py
@@ -37,7 +37,14 @@
validate_openrouter_model,
)
from lib_axiom import init_axiom
-from lib_grading import DEFAULT_JUDGE_TIMEOUT_SECONDS, GradeResult, grade_task
+from lib_grading import (
+    DEFAULT_JUDGE_TIMEOUT_SECONDS,
+    GradeResult,
+    grade_task,
+    set_judge_cache_dir,
+    get_judge_cache_stats,
+    clear_judge_cache,
+)
from lib_tasks import Task, TaskLoader


@@ -187,6 +194,11 @@ def _parse_args() -> argparse.Namespace:
default="all",
help='Tasks to run: "all", "automated-only", a category name (e.g. "coding"), or comma-separated task IDs',
)
parser.add_argument(
"--core",
action="store_true",
help="Run only core tasks (~25 representative tasks for quick benchmarking)",
)
parser.add_argument(
"--output-dir",
default="results",
@@ -261,6 +273,16 @@ def _parse_args() -> argparse.Namespace:
action="store_true",
help="Disable parallel judge execution (grade synchronously after each task)",
)
parser.add_argument(
"--no-judge-cache",
action="store_true",
help="Disable judge result caching (re-grade even if transcript+rubric unchanged)",
)
parser.add_argument(
"--clear-judge-cache",
action="store_true",
help="Clear the judge cache before running",
)
parser.add_argument(
"--trend",
action="store_true",
@@ -741,6 +763,16 @@ def main():
logger.error("Upload failed: %s", exc)
sys.exit(1)

# Initialize judge cache
if not args.no_judge_cache:
cache_dir = Path(args.output_dir) / ".judge_cache"
set_judge_cache_dir(cache_dir)
if args.clear_judge_cache:
clear_judge_cache()
logger.info("🗑️ Judge cache cleared")
else:
logger.info("📦 Judge caching disabled")

logger.info("🔧 Initializing BenchmarkRunner...")
runner = BenchmarkRunner(tasks_dir)

@@ -775,6 +807,16 @@ def main():
cleanup_agent_sessions(agent_id)

task_ids = _select_task_ids(runner.tasks, args.suite, runner.task_loader.category_map)

# Handle --core flag: use core tasks from manifest
if args.core:
core_task_ids = runner.task_loader.core_tasks
if not core_task_ids:
logger.warning("⚠️ No core tasks defined in manifest.yaml, running all tasks")
else:
task_ids = core_task_ids
logger.info(f"🎯 Core mode: running {len(core_task_ids)} representative tasks")

results = []
grades_by_task_id = {}
sanity_task_id = "task_sanity"
@@ -1152,6 +1194,19 @@ def _build_and_write_results():
score_pct = (total_score / max_score * 100) if max_score > 0 else 0
logger.info("📊 Final score: %.2f/%.0f (%.1f%%)", total_score, max_score, score_pct)

# Log judge cache stats
if not args.no_judge_cache:
cache_stats = get_judge_cache_stats()
if cache_stats["hits"] > 0 or cache_stats["misses"] > 0:
hit_rate = cache_stats["hits"] / (cache_stats["hits"] + cache_stats["misses"]) * 100
logger.info(
"📦 Judge cache: %d hits, %d misses (%.0f%% hit rate, %d entries)",
cache_stats["hits"],
cache_stats["misses"],
hit_rate,
cache_stats["entries"],
)

logger.info("Saved results to %s", output_path)
_log_category_summary(task_entries, tasks_by_id, category_order)
_log_efficiency_summary(efficiency, grades_by_task_id)
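Taken together, the new flags compose cleanly: --core restricts the run to the manifest's core subset, --clear-judge-cache wipes <output-dir>/.judge_cache before grading, and --no-judge-cache skips cache reads and writes entirely. Worth noting: the cache key covers the transcript summary, rubric, and judge model but not the judge prompt template, so clearing the cache is the safe reset after changing _build_judge_prompt.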
101 changes: 100 additions & 1 deletion scripts/lib_grading.py
@@ -4,6 +4,7 @@

from __future__ import annotations

import hashlib
import json
import logging
import os
@@ -24,6 +25,71 @@
DEFAULT_JUDGE_AGENT_PREFIX = "bench-judge"
DEFAULT_JUDGE_TIMEOUT_SECONDS = 300

# Judge result cache: maps cache_key -> GradeResult dict
# Cache key = hash of (task_id, transcript_summary, rubric, judge_model)
_judge_cache: Dict[str, Dict[str, Any]] = {}
_judge_cache_dir: Optional[Path] = None


def set_judge_cache_dir(cache_dir: Path) -> None:
"""Set the directory for persistent judge cache storage."""
global _judge_cache_dir
_judge_cache_dir = cache_dir
cache_dir.mkdir(parents=True, exist_ok=True)
_load_judge_cache()


def _load_judge_cache() -> None:
"""Load judge cache from disk."""
global _judge_cache
if _judge_cache_dir is None:
return
cache_file = _judge_cache_dir / "judge_cache.json"
if cache_file.exists():
try:
_judge_cache = json.loads(cache_file.read_text(encoding="utf-8"))
logger.info(f"📦 Loaded judge cache with {len(_judge_cache)} entries")
except Exception as e:
logger.warning(f"Failed to load judge cache: {e}")
_judge_cache = {}


def _save_judge_cache() -> None:
"""Persist judge cache to disk."""
if _judge_cache_dir is None:
return
cache_file = _judge_cache_dir / "judge_cache.json"
try:
cache_file.write_text(json.dumps(_judge_cache, indent=2), encoding="utf-8")
except Exception as e:
logger.warning(f"Failed to save judge cache: {e}")


def _compute_cache_key(task_id: str, transcript: str, rubric: str, model: str) -> str:
"""Compute a cache key from grading inputs."""
content = f"{task_id}|{transcript}|{rubric}|{model}"
return hashlib.sha256(content.encode()).hexdigest()[:16]

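As a standalone illustration (task and model names hypothetical), the key is a 16-hex-character SHA-256 prefix over the pipe-joined inputs, so any change to the transcript, rubric, or judge model yields a different key:

import hashlib

def cache_key(task_id: str, transcript: str, rubric: str, model: str) -> str:
    # Mirrors _compute_cache_key above: pipe-join the inputs, hash, keep 64 bits of hex.
    content = f"{task_id}|{transcript}|{rubric}|{model}"
    return hashlib.sha256(content.encode()).hexdigest()[:16]

k1 = cache_key("task_email", "transcript v1", "rubric", "judge-a")
k2 = cache_key("task_email", "transcript v2", "rubric", "judge-a")
assert k1 != k2 and len(k1) == 16  # edited transcript -> cache miss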

def get_judge_cache_stats() -> Dict[str, int]:
"""Return cache statistics."""
return {
"entries": len(_judge_cache),
"hits": getattr(get_judge_cache_stats, "_hits", 0),
"misses": getattr(get_judge_cache_stats, "_misses", 0),
}

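The hit/miss counters live as attributes on the stats function itself rather than as module globals; a minimal standalone sketch of the pattern:

from typing import Dict

def stats() -> Dict[str, int]:
    # getattr with a default means the counters read as 0 before first use.
    return {"hits": getattr(stats, "_hits", 0), "misses": getattr(stats, "_misses", 0)}

stats._hits = getattr(stats, "_hits", 0) + 1  # done on each cache hit
assert stats() == {"hits": 1, "misses": 0}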

def clear_judge_cache() -> None:
"""Clear the in-memory and on-disk judge cache."""
global _judge_cache
_judge_cache = {}
if _judge_cache_dir is not None:
cache_file = _judge_cache_dir / "judge_cache.json"
if cache_file.exists():
cache_file.unlink()
logger.info("🗑️ Judge cache cleared")

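For reference, the persisted judge_cache.json is a flat mapping from cache key to cached grade fields; an illustrative entry (all values hypothetical):

example_cache = {
    "3f2a9c0d1b4e5f6a": {  # 16-hex-char key from _compute_cache_key
        "score": 0.85,
        "max_score": 1.0,
        "breakdown": {"accuracy": 0.9, "completeness": 0.8},
        "notes": "Solid answer; one criterion partially met.",
    }
}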

@dataclass
class GradeResult:
@@ -220,6 +286,26 @@ def _grade_llm_judge(
workspace_content[:500],
)
rubric = task.llm_judge_rubric or _format_grading_criteria(task)

# Check cache before calling judge
cache_key = _compute_cache_key(task.task_id, transcript_summary, rubric, judge_model)
if cache_key in _judge_cache:
cached = _judge_cache[cache_key]
if verbose:
logger.info(" [VERBOSE] Cache HIT for %s (key=%s)", task.task_id, cache_key[:8])
get_judge_cache_stats._hits = getattr(get_judge_cache_stats, "_hits", 0) + 1
return GradeResult(
task_id=task.task_id,
score=cached["score"],
max_score=cached["max_score"],
grading_type="llm_judge",
breakdown=cached.get("breakdown", {}),
notes=cached.get("notes", "") + " [cached]",
)
get_judge_cache_stats._misses = getattr(get_judge_cache_stats, "_misses", 0) + 1
if verbose:
logger.info(" [VERBOSE] Cache MISS for %s (key=%s)", task.task_id, cache_key[:8])

prompt = _build_judge_prompt(task, transcript_summary, rubric, workspace_content)

max_judge_attempts = 2
@@ -304,14 +390,27 @@ def _grade_llm_judge(
task.task_id,
raw_parsed,
)
-    return GradeResult(
+    result = GradeResult(
task_id=task.task_id,
score=float(total) if total is not None else 0.0,
max_score=1.0,
grading_type="llm_judge",
breakdown=_normalize_score_dict(breakdown),
notes=str(notes) if notes is not None else "",
)

# Cache successful results (only if we got a valid score)
if total is not None:
_judge_cache[cache_key] = {
"score": result.score,
"max_score": result.max_score,
"breakdown": result.breakdown,
"notes": result.notes,
}
_save_judge_cache()

return result


def _combine_grades(task: Task, auto_result: GradeResult, llm_result: GradeResult) -> GradeResult:
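Pulling the pieces together, a minimal sketch of how benchmark.py drives this cache API (output directory illustrative):

from pathlib import Path
from lib_grading import set_judge_cache_dir, get_judge_cache_stats, clear_judge_cache

set_judge_cache_dir(Path("results") / ".judge_cache")  # creates the dir, loads prior entries
# ... run tasks; grade_task() consults and populates the cache internally ...
print(get_judge_cache_stats())  # e.g. {'entries': 12, 'hits': 9, 'misses': 3}
clear_judge_cache()             # wipes both the in-memory dict and judge_cache.json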
8 changes: 5 additions & 3 deletions scripts/lib_tasks.py
@@ -80,6 +80,7 @@ def __init__(self, tasks_dir: Path):
self.tasks_dir = tasks_dir
self.category_map: Dict[str, str] = {} # task_id -> category from manifest
self.categories: List[str] = [] # ordered list of category names
self.core_tasks: List[str] = [] # core task IDs for quick benchmark runs
logger.info(f"Initialized TaskLoader with directory: {tasks_dir}")

def load_all_tasks(self) -> List[Task]:
Expand Down Expand Up @@ -141,12 +142,13 @@ def _load_from_manifest(self, manifest_path: Path) -> List[Task]:
def _parse_categorized_manifest(self, manifest: Dict[str, Any]) -> List[str]:
"""Parse the categorized manifest format and return an ordered task list.

-        Populates ``self.category_map`` and ``self.categories``. Tasks listed
-        in ``run_first`` are placed at the front of the returned list while
-        preserving their category membership.
+        Populates ``self.category_map``, ``self.categories``, and ``self.core_tasks``.
+        Tasks listed in ``run_first`` are placed at the front of the returned list
+        while preserving their category membership.
"""
run_first: List[str] = manifest.get("run_first", [])
categories: Dict[str, List[str]] = manifest.get("categories", {})
self.core_tasks = manifest.get("core", [])

self.categories = list(categories.keys())
self.category_map = {}
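A condensed sketch of the manifest fields _parse_categorized_manifest now consumes (dict contents illustrative):

manifest = {
    "run_first": ["task_sanity"],
    "core": ["task_sanity", "task_calendar"],
    "categories": {"productivity": ["task_sanity", "task_calendar"]},
}
core_tasks = manifest.get("core", [])  # absent key -> [], so --core falls back to all tasks
assert core_tasks == ["task_sanity", "task_calendar"]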
43 changes: 43 additions & 0 deletions tasks/manifest.yaml
@@ -8,10 +8,53 @@
#
# The `run_first` list contains tasks that must execute before all others
# (in the order listed), regardless of which category they belong to.
#
# The `core` list defines a smaller subset of representative tasks for quick
# benchmark runs. Use the --core flag to run only these tasks. Selection criteria:
# - At least one task per category
# - Prefer automated grading for speed
# - Include a mix of difficulty levels
# - Cover key capability areas

run_first:
- task_sanity

# Core tasks (~25) for quick benchmark runs. ~20% of full suite.
# Run with: benchmark.py --core
core:
# productivity (2)
- task_sanity # automated - baseline/smoke test
- task_calendar # automated - tool use
# research (2)
- task_stock # automated - web search
- task_market_research # hybrid - deeper research
# writing (2)
- task_email # llm_judge - short form
- task_humanizer # hybrid - text transformation
# coding (3)
- task_weather # automated - simple coding
- task_shell_command_generator # automated - command gen
- task_multi_file_refactoring # automated - code changes
# analysis (2)
- task_summary # llm_judge - comprehension
- task_spreadsheet_summary # hybrid - data analysis
# csv_analysis (2)
- task_csv_stock_trend # hybrid - time series
- task_csv_iris_summary # hybrid - stats
# log_analysis (2)
- task_log_apache_top_errors # hybrid - pattern finding
- task_log_syslog_boot # hybrid - system logs
# meeting_analysis (2)
- task_meeting_tldr # hybrid - summarization
- task_meeting_council_votes # hybrid - extraction
# memory (1)
- task_memory # automated - memory ops
# skills (2)
- task_files # automated - file ops
- task_skill_search # automated - skill discovery
# integrations (1)
- task_gws_email_triage # hybrid - integration test

categories:
productivity:
- task_sanity