From 117de6f8d47a0073c516ae00230b10392f76b95c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 4 May 2026 16:08:26 +0000 Subject: [PATCH 1/2] chore: update task count in README [skip ci] --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b23eb68..9f47e0a 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Leaderboard](https://img.shields.io/badge/leaderboard-pinchbench.com-blue)](https://pinchbench.com) [![License](https://img.shields.io/badge/license-MIT-green)](LICENSE) -![Tasks](https://img.shields.io/badge/tasks-53-orange) +![Tasks](https://img.shields.io/badge/tasks-123-orange) > **Note:** This repository contains the benchmark skill/tasks. It is NOT the source of official leaderboard results. To add models to the official results, modify [pinchbench/scripts/default-models.yml](https://github.com/pinchbench/scripts/blob/main/default-models.yml). @@ -47,7 +47,7 @@ cd skill ## What Gets Tested -PinchBench includes 53 tasks across real-world categories: +PinchBench includes 123 tasks across real-world categories: | Category | Tasks | What's tested | | ---------------- | --------------------------------------- | ---------------------------------------- | From f39516ecb864839fe189871b488c47ac2713bcf7 Mon Sep 17 00:00:00 2001 From: scuttlebot Date: Mon, 4 May 2026 16:18:17 +0000 Subject: [PATCH 2/2] fix: snapshot workspace before parallel grading to prevent race condition When parallel grading is enabled, grade_task() runs in a background thread while the main thread prepares the next task's workspace. The background grader then reads the rebuilt workspace instead of the completed task's files, causing transcript mismatches and incorrect automated scores. Fix: before submitting grade_task to the ThreadPoolExecutor, deep-copy the workspace directory into an isolated temp directory via shutil.copytree. A shallow copy of execution_result with the snapshot path is passed to the background grader. The snapshot is cleaned up in _wait_for_pending_grade() after grading completes. The synchronous grading path is unchanged. Closes #365 --- scripts/benchmark.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 61cf867..5028616 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -18,9 +18,11 @@ import logging import os import re +import shutil import statistics import subprocess import sys +import tempfile import time from concurrent.futures import Future, ThreadPoolExecutor from pathlib import Path @@ -823,13 +825,15 @@ def _write_incremental_results(): pending_grade_result: Optional[Dict[str, Any]] = None pending_grade_task_num: int = 0 + pending_grade_snapshot_dir: Optional[str] = None + if use_parallel_judge: judge_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="judge") logger.info("Parallel judge execution enabled") def _wait_for_pending_grade() -> None: """Wait for any pending background grade to complete and record it.""" - nonlocal pending_grade_future, pending_grade_task, pending_grade_result, pending_grade_task_num + nonlocal pending_grade_future, pending_grade_task, pending_grade_result, pending_grade_task_num, pending_grade_snapshot_dir if pending_grade_future is None: return @@ -883,6 +887,14 @@ def _wait_for_pending_grade() -> None: "max": max(task_scores), } + # Clean up workspace snapshot temp directory + if pending_grade_snapshot_dir is not None: + try: + shutil.rmtree(pending_grade_snapshot_dir, ignore_errors=True) + except Exception: + pass + pending_grade_snapshot_dir = None + pending_grade_future = None pending_grade_task = None pending_grade_result = None @@ -955,6 +967,22 @@ def _wait_for_pending_grade() -> None: ) if can_parallelize: + # Snapshot the workspace directory so the background grader + # reads a stable copy even after the main thread rebuilds the + # workspace for the next task (fixes race condition #365). + workspace_path = result.get("workspace", "") + if workspace_path and os.path.isdir(workspace_path): + snapshot_dir = tempfile.mkdtemp(prefix="pinchbench_grade_") + shutil.copytree(workspace_path, os.path.join(snapshot_dir, "ws"), dirs_exist_ok=True) + snapshot_workspace = os.path.join(snapshot_dir, "ws") + # Create a shallow copy of execution_result with the snapshot path + grade_result_copy = dict(result) + grade_result_copy["workspace"] = snapshot_workspace + grade_kwargs["execution_result"] = grade_result_copy + pending_grade_snapshot_dir = snapshot_dir + else: + pending_grade_snapshot_dir = None + # Submit grading to background thread pending_grade_future = judge_executor.submit(grade_task, **grade_kwargs) pending_grade_task = task