diff --git a/README.md b/README.md
index b23eb68..9f47e0a 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 
 [![Leaderboard](https://img.shields.io/badge/leaderboard-pinchbench.com-blue)](https://pinchbench.com)
 [![License](https://img.shields.io/badge/license-MIT-green)](LICENSE)
-<!-- task-count-badge -->![Tasks](https://img.shields.io/badge/tasks-53-orange)<!-- /task-count-badge -->
+<!-- task-count-badge -->![Tasks](https://img.shields.io/badge/tasks-123-orange)<!-- /task-count-badge -->
 
 > **Note:** This repository contains the benchmark skill/tasks. It is NOT the source of official leaderboard results. To add models to the official results, modify [pinchbench/scripts/default-models.yml](https://github.com/pinchbench/scripts/blob/main/default-models.yml).
 
@@ -47,7 +47,7 @@ cd skill
 
 ## What Gets Tested
 
-<!-- task-count-text -->PinchBench includes 53 tasks across real-world categories:<!-- /task-count-text -->
+<!-- task-count-text -->PinchBench includes 123 tasks across real-world categories:<!-- /task-count-text -->
 
 | Category         | Tasks                                   | What's tested                            |
 | ---------------- | --------------------------------------- | ---------------------------------------- |
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 61cf867..5028616 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -18,9 +18,11 @@
 import logging
 import os
 import re
+import shutil
 import statistics
 import subprocess
 import sys
+import tempfile
 import time
 from concurrent.futures import Future, ThreadPoolExecutor
 from pathlib import Path
@@ -823,13 +825,15 @@ def _write_incremental_results():
     pending_grade_result: Optional[Dict[str, Any]] = None
     pending_grade_task_num: int = 0
 
+    pending_grade_snapshot_dir: Optional[str] = None
+
     if use_parallel_judge:
         judge_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="judge")
         logger.info("Parallel judge execution enabled")
 
     def _wait_for_pending_grade() -> None:
         """Wait for any pending background grade to complete and record it."""
-        nonlocal pending_grade_future, pending_grade_task, pending_grade_result, pending_grade_task_num
+        nonlocal pending_grade_future, pending_grade_task, pending_grade_result, pending_grade_task_num, pending_grade_snapshot_dir
         if pending_grade_future is None:
             return
 
@@ -883,6 +887,14 @@ def _wait_for_pending_grade() -> None:
             "max": max(task_scores),
         }
 
+        # Clean up workspace snapshot temp directory
+        if pending_grade_snapshot_dir is not None:
+            try:
+                shutil.rmtree(pending_grade_snapshot_dir, ignore_errors=True)
+            except Exception:
+                pass
+            pending_grade_snapshot_dir = None
+
         pending_grade_future = None
         pending_grade_task = None
         pending_grade_result = None
@@ -955,6 +967,22 @@ def _wait_for_pending_grade() -> None:
             )
 
             if can_parallelize:
+                # Snapshot the workspace so the background grader reads a stable copy
+                # even after the main thread rebuilds it for the next task (fixes race
+                # condition #365); _wait_for_pending_grade() removes the snapshot later.
+                workspace_path = result.get("workspace", "")
+                if workspace_path and os.path.isdir(workspace_path):
+                    snapshot_dir = tempfile.mkdtemp(prefix="pinchbench_grade_")
+                    shutil.copytree(workspace_path, os.path.join(snapshot_dir, "ws"), dirs_exist_ok=True)
+                    snapshot_workspace = os.path.join(snapshot_dir, "ws")
+                    # Create a shallow copy of execution_result with the snapshot path
+                    grade_result_copy = dict(result)
+                    grade_result_copy["workspace"] = snapshot_workspace
+                    grade_kwargs["execution_result"] = grade_result_copy
+                    pending_grade_snapshot_dir = snapshot_dir
+                else:
+                    pending_grade_snapshot_dir = None
+
                 # Submit grading to background thread
                 pending_grade_future = judge_executor.submit(grade_task, **grade_kwargs)
                 pending_grade_task = task