Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,16 @@ export PINCHBENCH_OFFICIAL_KEY=your_official_key
| Flag | Description |
| ------------------------ | ----------------------------------------------------------------------------- |
| `--model MODEL` | Model to test (e.g., `openrouter/anthropic/claude-sonnet-4`) |
| `--judge MODEL` | Judge model for LLM grading; uses direct API when set (see below) |
| `--judge MODEL` | Judge model for LLM grading; uses direct API when set (see below) |
| `--suite SUITE` | `all`, `automated-only`, or comma-separated task IDs |
| `--runs N` | Number of runs per task for averaging |
| `--timeout-multiplier N` | Scale timeouts for slower models |
| `--thinking LEVEL` | Reasoning depth: `off`, `minimal`, `low`, `medium`, `high`, `xhigh`, `adaptive` |
| `--output-dir DIR` | Where to save results (default: `results/`) |
| `--no-upload` | Skip uploading to leaderboard |
| `--register` | Request an API token for submissions |
| `--upload FILE` | Upload a previous results JSON |
| `--official-key KEY` | Mark submission as official (or use `PINCHBENCH_OFFICIAL_KEY` env var) |
| `--official-key KEY` | Mark submission as official (or use `PINCHBENCH_OFFICIAL_KEY` env var) |

### Judge

Expand Down
15 changes: 15 additions & 0 deletions scripts/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
ModelValidationError,
slugify_model,
validate_openrouter_model,
VALID_THINKING_LEVELS,
)
from lib_axiom import init_axiom
from lib_grading import (
Expand Down Expand Up @@ -283,6 +284,12 @@ def _parse_args() -> argparse.Namespace:
action="store_true",
help="Clear the judge cache before running",
)
parser.add_argument(
"--thinking",
type=str,
default=None,
help="Thinking level for reasoning depth (off, minimal, low, medium, high, xhigh, adaptive)",
)
parser.add_argument(
"--trend",
action="store_true",
Expand All @@ -307,6 +314,13 @@ def _parse_args() -> argparse.Namespace:
if args.trend_window < 2:
parser.error("--trend-window must be >= 2")

# Validate --thinking
if args.thinking and args.thinking not in VALID_THINKING_LEVELS:
parser.error(
f"Invalid thinking level '{args.thinking}'. "
f"Valid levels: {', '.join(VALID_THINKING_LEVELS)}"
)

return args


Expand Down Expand Up @@ -993,6 +1007,7 @@ def _wait_for_pending_grade() -> None:
skill_dir=skill_dir,
output_dir=Path(args.output_dir) / f"{run_id}_transcripts",
verbose=args.verbose,
thinking_level=args.thinking,
)
except Exception as exc:
execution_error = str(exc)
Expand Down
10 changes: 10 additions & 0 deletions scripts/lib_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ class ModelValidationError(Exception):
MAX_OPENCLAW_MESSAGE_CHARS = int(os.environ.get("PINCHBENCH_MAX_MSG_CHARS", "8000"))
JUDGE_MAX_MSG_CHARS = int(os.environ.get("PINCHBENCH_JUDGE_MAX_MSG_CHARS", "3000"))

# Valid thinking levels for OpenClaw reasoning depth. Stored as a tuple so the
# accepted set of --thinking values is immutable; used by benchmark.py to
# validate the CLI flag (keep in sync with the --thinking help text there).
VALID_THINKING_LEVELS = ("off", "minimal", "low", "medium", "high", "xhigh", "adaptive")


def _coerce_subprocess_output(value: Any) -> str:
if value is None:
Expand Down Expand Up @@ -772,10 +775,13 @@ def execute_openclaw_task(
skill_dir: Path,
output_dir: Optional[Path] = None,
verbose: bool = False,
thinking_level: Optional[str] = None,
) -> Dict[str, Any]:
logger.info("🤖 Agent [%s] starting task: %s", agent_id, task.task_id)
logger.info(" Task: %s", task.name)
logger.info(" Category: %s", task.category)
if thinking_level:
logger.info(" Thinking: %s", thinking_level)
if verbose:
logger.info(
" Prompt: %s", task.prompt[:500] + "..." if len(task.prompt) > 500 else task.prompt
Expand Down Expand Up @@ -862,6 +868,8 @@ def execute_openclaw_task(
]
if use_local:
cmd.insert(2, "--local")
if thinking_level:
cmd.extend(["--thinking", thinking_level])
result = subprocess.run(
cmd,
capture_output=True,
Expand Down Expand Up @@ -899,6 +907,8 @@ def execute_openclaw_task(
]
if use_local:
cmd.insert(2, "--local")
if thinking_level:
cmd.extend(["--thinking", thinking_level])
result = subprocess.run(
cmd,
capture_output=True,
Expand Down
Loading