Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,16 @@ export PINCHBENCH_OFFICIAL_KEY=your_official_key
| Flag | Description |
| ------------------------ | ----------------------------------------------------------------------------- |
| `--model MODEL` | Model to test (e.g., `openrouter/anthropic/claude-sonnet-4`) |
| `--judge MODEL` | Judge model for LLM grading; uses direct API when set (see below) |
| `--judge MODEL` | Judge model for LLM grading; uses direct API when set (see below) |
| `--suite SUITE` | `all`, `automated-only`, or comma-separated task IDs |
| `--runs N` | Number of runs per task for averaging |
| `--timeout-multiplier N` | Scale timeouts for slower models |
| `--thinking LEVEL` | Reasoning depth: `off`, `minimal`, `low`, `medium`, `high`, `xhigh`, `adaptive` |
| `--output-dir DIR` | Where to save results (default: `results/`) |
| `--no-upload` | Skip uploading to leaderboard |
| `--register` | Request an API token for submissions |
| `--upload FILE` | Upload a previous results JSON |
| `--official-key KEY` | Mark submission as official (or use `PINCHBENCH_OFFICIAL_KEY` env var) |
| `--official-key KEY` | Mark submission as official (or use `PINCHBENCH_OFFICIAL_KEY` env var) |

### Judge

Expand Down
15 changes: 15 additions & 0 deletions scripts/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
ModelValidationError,
slugify_model,
validate_openrouter_model,
VALID_THINKING_LEVELS,
)
from lib_axiom import init_axiom
from lib_grading import (
Expand Down Expand Up @@ -283,6 +284,12 @@ def _parse_args() -> argparse.Namespace:
action="store_true",
help="Clear the judge cache before running",
)
parser.add_argument(
"--thinking",
type=str,
default=None,
help="Thinking level for reasoning depth (off, minimal, low, medium, high, xhigh, adaptive)",
)
parser.add_argument(
"--trend",
action="store_true",
Expand All @@ -307,6 +314,13 @@ def _parse_args() -> argparse.Namespace:
if args.trend_window < 2:
parser.error("--trend-window must be >= 2")

# Validate --thinking
if args.thinking and args.thinking not in VALID_THINKING_LEVELS:
parser.error(
f"Invalid thinking level '{args.thinking}'. "
f"Valid levels: {', '.join(VALID_THINKING_LEVELS)}"
)

return args


Expand Down Expand Up @@ -993,6 +1007,7 @@ def _wait_for_pending_grade() -> None:
skill_dir=skill_dir,
output_dir=Path(args.output_dir) / f"{run_id}_transcripts",
verbose=args.verbose,
thinking_level=args.thinking,
)
except Exception as exc:
execution_error = str(exc)
Expand Down
10 changes: 10 additions & 0 deletions scripts/lib_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ class ModelValidationError(Exception):
MAX_OPENCLAW_MESSAGE_CHARS = int(os.environ.get("PINCHBENCH_MAX_MSG_CHARS", "8000"))
JUDGE_MAX_MSG_CHARS = int(os.environ.get("PINCHBENCH_JUDGE_MAX_MSG_CHARS", "3000"))

# Valid thinking levels for OpenClaw reasoning depth. Stored as a tuple so the
# accepted set of --thinking values is immutable; used by benchmark.py to
# validate the CLI flag (keep in sync with the --thinking help text there).
VALID_THINKING_LEVELS = ("off", "minimal", "low", "medium", "high", "xhigh", "adaptive")


def _coerce_subprocess_output(value: Any) -> str:
if value is None:
Expand Down Expand Up @@ -772,10 +775,13 @@ def execute_openclaw_task(
skill_dir: Path,
output_dir: Optional[Path] = None,
verbose: bool = False,
thinking_level: Optional[str] = None,
) -> Dict[str, Any]:
logger.info("🤖 Agent [%s] starting task: %s", agent_id, task.task_id)
logger.info(" Task: %s", task.name)
logger.info(" Category: %s", task.category)
if thinking_level:
logger.info(" Thinking: %s", thinking_level)
if verbose:
logger.info(
" Prompt: %s", task.prompt[:500] + "..." if len(task.prompt) > 500 else task.prompt
Expand Down Expand Up @@ -862,6 +868,8 @@ def execute_openclaw_task(
]
if use_local:
cmd.insert(2, "--local")
if thinking_level:
cmd.extend(["--thinking", thinking_level])
result = subprocess.run(
cmd,
capture_output=True,
Expand Down Expand Up @@ -899,6 +907,8 @@ def execute_openclaw_task(
]
if use_local:
cmd.insert(2, "--local")
if thinking_level:
cmd.extend(["--thinking", thinking_level])
result = subprocess.run(
cmd,
capture_output=True,
Expand Down
Loading