Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

[![Leaderboard](https://img.shields.io/badge/leaderboard-pinchbench.com-blue)](https://pinchbench.com)
[![License](https://img.shields.io/badge/license-MIT-green)](LICENSE)
<!-- task-count-badge -->![Tasks](https://img.shields.io/badge/tasks-53-orange)<!-- /task-count-badge -->
<!-- task-count-badge -->![Tasks](https://img.shields.io/badge/tasks-123-orange)<!-- /task-count-badge -->

> **Note:** This repository contains the benchmark skill/tasks. It is NOT the source of official leaderboard results. To add models to the official results, modify [pinchbench/scripts/default-models.yml](https://github.com/pinchbench/scripts/blob/main/default-models.yml).

Expand Down Expand Up @@ -47,7 +47,7 @@ cd skill

## What Gets Tested

<!-- task-count-text -->PinchBench includes 53 tasks across real-world categories:<!-- /task-count-text -->
<!-- task-count-text -->PinchBench includes 123 tasks across real-world categories:<!-- /task-count-text -->

| Category | Tasks | What's tested |
| ---------------- | --------------------------------------- | ---------------------------------------- |
Expand Down
30 changes: 29 additions & 1 deletion scripts/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,11 @@
import logging
import os
import re
import shutil
import statistics
import subprocess
import sys
import tempfile
import time
from concurrent.futures import Future, ThreadPoolExecutor
from pathlib import Path
Expand Down Expand Up @@ -823,13 +825,15 @@ def _write_incremental_results():
pending_grade_result: Optional[Dict[str, Any]] = None
pending_grade_task_num: int = 0

pending_grade_snapshot_dir: Optional[str] = None

if use_parallel_judge:
judge_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="judge")
logger.info("Parallel judge execution enabled")

def _wait_for_pending_grade() -> None:
"""Wait for any pending background grade to complete and record it."""
nonlocal pending_grade_future, pending_grade_task, pending_grade_result, pending_grade_task_num
nonlocal pending_grade_future, pending_grade_task, pending_grade_result, pending_grade_task_num, pending_grade_snapshot_dir
if pending_grade_future is None:
return

Expand Down Expand Up @@ -883,6 +887,14 @@ def _wait_for_pending_grade() -> None:
"max": max(task_scores),
}

# Clean up workspace snapshot temp directory
if pending_grade_snapshot_dir is not None:
try:
shutil.rmtree(pending_grade_snapshot_dir, ignore_errors=True)
except Exception:
pass
pending_grade_snapshot_dir = None

pending_grade_future = None
pending_grade_task = None
pending_grade_result = None
Expand Down Expand Up @@ -955,6 +967,22 @@ def _wait_for_pending_grade() -> None:
)

if can_parallelize:
# Snapshot the workspace directory so the background grader
# reads a stable copy even after the main thread rebuilds the
# workspace for the next task (fixes race condition #365).
workspace_path = result.get("workspace", "")
if workspace_path and os.path.isdir(workspace_path):
snapshot_dir = tempfile.mkdtemp(prefix="pinchbench_grade_")
shutil.copytree(workspace_path, os.path.join(snapshot_dir, "ws"), dirs_exist_ok=True)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WARNING: shutil.copytree is not wrapped in a try/except here. If it raises (e.g. disk full, broken symlink, permission error), the exception propagates uncaught through the task loop and crashes the entire benchmark run — losing all results collected so far. The temp dir created on the line above would also be leaked.

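Consider wrapping the snapshot block in a try/except that falls back to grading against the original result (accepting the small race-condition risk) rather than aborting the run. If mkdtemp succeeded before the failure, the fallback should also remove the partially populated temp directory (e.g. shutil.rmtree(snapshot_dir, ignore_errors=True)) so that it is not leaked; the sketch below omits that cleanup: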

try:
    snapshot_dir = tempfile.mkdtemp(prefix="pinchbench_grade_")
    shutil.copytree(workspace_path, os.path.join(snapshot_dir, "ws"), dirs_exist_ok=True)
    snapshot_workspace = os.path.join(snapshot_dir, "ws")
    grade_result_copy = dict(result)
    grade_result_copy["workspace"] = snapshot_workspace
    grade_kwargs["execution_result"] = grade_result_copy
    pending_grade_snapshot_dir = snapshot_dir
except Exception as exc:
    logger.warning("Workspace snapshot failed, grading with live path: %s", exc)
    pending_grade_snapshot_dir = None

snapshot_workspace = os.path.join(snapshot_dir, "ws")
# Create a shallow copy of execution_result with the snapshot path
grade_result_copy = dict(result)
grade_result_copy["workspace"] = snapshot_workspace
grade_kwargs["execution_result"] = grade_result_copy
pending_grade_snapshot_dir = snapshot_dir
else:
pending_grade_snapshot_dir = None

# Submit grading to background thread
pending_grade_future = judge_executor.submit(grade_task, **grade_kwargs)
pending_grade_task = task
Expand Down
Loading