diff --git a/README.md b/README.md index b23eb68..9f47e0a 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Leaderboard](https://img.shields.io/badge/leaderboard-pinchbench.com-blue)](https://pinchbench.com) [![License](https://img.shields.io/badge/license-MIT-green)](LICENSE) -![Tasks](https://img.shields.io/badge/tasks-53-orange) +![Tasks](https://img.shields.io/badge/tasks-123-orange) > **Note:** This repository contains the benchmark skill/tasks. It is NOT the source of official leaderboard results. To add models to the official results, modify [pinchbench/scripts/default-models.yml](https://github.com/pinchbench/scripts/blob/main/default-models.yml). @@ -47,7 +47,7 @@ cd skill ## What Gets Tested -PinchBench includes 53 tasks across real-world categories: +PinchBench includes 123 tasks across real-world categories: | Category | Tasks | What's tested | | ---------------- | --------------------------------------- | ---------------------------------------- | diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 61cf867..5028616 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -18,9 +18,11 @@ import logging import os import re +import shutil import statistics import subprocess import sys +import tempfile import time from concurrent.futures import Future, ThreadPoolExecutor from pathlib import Path @@ -823,13 +825,15 @@ def _write_incremental_results(): pending_grade_result: Optional[Dict[str, Any]] = None pending_grade_task_num: int = 0 + pending_grade_snapshot_dir: Optional[str] = None + if use_parallel_judge: judge_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="judge") logger.info("Parallel judge execution enabled") def _wait_for_pending_grade() -> None: """Wait for any pending background grade to complete and record it.""" - nonlocal pending_grade_future, pending_grade_task, pending_grade_result, pending_grade_task_num + nonlocal pending_grade_future, pending_grade_task, pending_grade_result, pending_grade_task_num, pending_grade_snapshot_dir if pending_grade_future is None: return @@ -883,6 +887,14 @@ def _wait_for_pending_grade() -> None: "max": max(task_scores), } + # Clean up workspace snapshot temp directory + if pending_grade_snapshot_dir is not None: + try: + shutil.rmtree(pending_grade_snapshot_dir, ignore_errors=True) + except Exception: + pass + pending_grade_snapshot_dir = None + pending_grade_future = None pending_grade_task = None pending_grade_result = None @@ -955,6 +967,22 @@ def _wait_for_pending_grade() -> None: ) if can_parallelize: + # Snapshot the workspace directory so the background grader + # reads a stable copy even after the main thread rebuilds the + # workspace for the next task (fixes race condition #365). + workspace_path = result.get("workspace", "") + if workspace_path and os.path.isdir(workspace_path): + snapshot_dir = tempfile.mkdtemp(prefix="pinchbench_grade_") + shutil.copytree(workspace_path, os.path.join(snapshot_dir, "ws"), dirs_exist_ok=True) + snapshot_workspace = os.path.join(snapshot_dir, "ws") + # Create a shallow copy of execution_result with the snapshot path + grade_result_copy = dict(result) + grade_result_copy["workspace"] = snapshot_workspace + grade_kwargs["execution_result"] = grade_result_copy + pending_grade_snapshot_dir = snapshot_dir + else: + pending_grade_snapshot_dir = None + # Submit grading to background thread pending_grade_future = judge_executor.submit(grade_task, **grade_kwargs) pending_grade_task = task