diff --git a/README.md b/README.md
index b209a1f9..66d8a979 100644
--- a/README.md
+++ b/README.md
@@ -94,15 +94,16 @@ export PINCHBENCH_OFFICIAL_KEY=your_official_key
 | Flag                     | Description                                                                     |
 | ------------------------ | ------------------------------------------------------------------------------- |
 | `--model MODEL`          | Model to test (e.g., `openrouter/anthropic/claude-sonnet-4`)                   |
-| `--judge MODEL`          | Judge model for LLM grading; uses direct API when set (see below)              |
+| `--judge MODEL`          | Judge model for LLM grading; uses direct API when set (see below)              |
 | `--suite SUITE`          | `all`, `automated-only`, or comma-separated task IDs                           |
 | `--runs N`               | Number of runs per task for averaging                                          |
 | `--timeout-multiplier N` | Scale timeouts for slower models                                               |
+| `--thinking LEVEL`       | Reasoning depth: `off`, `minimal`, `low`, `medium`, `high`, `xhigh`, `adaptive` |
 | `--output-dir DIR`       | Where to save results (default: `results/`)                                    |
 | `--no-upload`            | Skip uploading to leaderboard                                                  |
 | `--register`             | Request an API token for submissions                                           |
 | `--upload FILE`          | Upload a previous results JSON                                                 |
-| `--official-key KEY`     | Mark submission as official (or use `PINCHBENCH_OFFICIAL_KEY` env var)         |
+| `--official-key KEY`     | Mark submission as official (or use `PINCHBENCH_OFFICIAL_KEY` env var)         |
 
 ### Judge
 
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index f9a535b4..3106567b 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -35,6 +35,7 @@
     ModelValidationError,
     slugify_model,
     validate_openrouter_model,
+    VALID_THINKING_LEVELS,
 )
 from lib_axiom import init_axiom
 from lib_grading import (
@@ -283,6 +284,12 @@ def _parse_args() -> argparse.Namespace:
         action="store_true",
         help="Clear the judge cache before running",
     )
+    parser.add_argument(
+        "--thinking",
+        type=str,
+        default=None,
+        help="Thinking level for reasoning depth (off, minimal, low, medium, high, xhigh, adaptive)",
+    )
     parser.add_argument(
         "--trend",
         action="store_true",
@@ -307,6 +314,13 @@ def _parse_args() -> argparse.Namespace:
     if args.trend_window < 2:
         parser.error("--trend-window must be >= 2")
 
+    # Validate --thinking
+    if args.thinking and args.thinking not in VALID_THINKING_LEVELS:
+        parser.error(
+            f"Invalid thinking level '{args.thinking}'. "
+            f"Valid levels: {', '.join(VALID_THINKING_LEVELS)}"
+        )
+
     return args
 
 
@@ -993,6 +1007,7 @@ def _wait_for_pending_grade() -> None:
             skill_dir=skill_dir,
             output_dir=Path(args.output_dir) / f"{run_id}_transcripts",
             verbose=args.verbose,
+            thinking_level=args.thinking,
         )
     except Exception as exc:
         execution_error = str(exc)
diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py
index 5f165426..62070298 100644
--- a/scripts/lib_agent.py
+++ b/scripts/lib_agent.py
@@ -33,6 +33,9 @@ class ModelValidationError(Exception):
 MAX_OPENCLAW_MESSAGE_CHARS = int(os.environ.get("PINCHBENCH_MAX_MSG_CHARS", "8000"))
 JUDGE_MAX_MSG_CHARS = int(os.environ.get("PINCHBENCH_JUDGE_MAX_MSG_CHARS", "3000"))
 
+# Valid thinking levels for OpenClaw reasoning depth
+VALID_THINKING_LEVELS = ("off", "minimal", "low", "medium", "high", "xhigh", "adaptive")
+
 
 def _coerce_subprocess_output(value: Any) -> str:
     if value is None:
@@ -772,10 +775,13 @@ def execute_openclaw_task(
     skill_dir: Path,
     output_dir: Optional[Path] = None,
     verbose: bool = False,
+    thinking_level: Optional[str] = None,
 ) -> Dict[str, Any]:
     logger.info("🤖 Agent [%s] starting task: %s", agent_id, task.task_id)
     logger.info("   Task: %s", task.name)
     logger.info("   Category: %s", task.category)
+    if thinking_level:
+        logger.info("   Thinking: %s", thinking_level)
     if verbose:
         logger.info(
             "   Prompt: %s", task.prompt[:500] + "..." if len(task.prompt) > 500 else task.prompt
         )
@@ -862,6 +868,8 @@ def execute_openclaw_task(
     ]
     if use_local:
         cmd.insert(2, "--local")
+    if thinking_level:
+        cmd.extend(["--thinking", thinking_level])
     result = subprocess.run(
         cmd,
         capture_output=True,
@@ -899,6 +907,8 @@ def execute_openclaw_task(
     ]
     if use_local:
         cmd.insert(2, "--local")
+    if thinking_level:
+        cmd.extend(["--thinking", thinking_level])
     result = subprocess.run(
         cmd,
         capture_output=True,