Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,10 @@ cd benchmarks/appworld && ./eval.sh --task 82e2fac_1
# Compare via dispatcher
./scripts/compare.sh --benchmark bpo --runs 3
./scripts/compare.sh --benchmark m3 --runs 2

# .env overrides model profile (.env wins; defaults to gpt-oss base if no --model-profile)
./scripts/eval.sh --benchmark bpo --dotenv
./scripts/eval.sh --benchmark bpo --model-profile gpt4o --dotenv
```

### Common flags
Expand All @@ -355,6 +359,7 @@ Flags accepted by every `eval.sh` (and forwarded by every `compare.sh`):
| `--task <id>...` | Run only the listed task(s) (numeric IDs, task names, or — for AppWorld — task UUIDs). |
| `--agent cuga\|react` | Pick agent. `cuga` is the default; `react` runs the lightweight ReAct baseline. Not all benchmarks support both (see Agent Selection above). |
| `--model-profile <name>` | Pick model profile (`gpt-oss`, `gpt4o`, `gpt4.1`, `opus4.5`). Default comes from `.env`. |
| `--dotenv` | After applying the model profile, re-read `.env` and force-export every variable it contains. `.env` values override the profile. If no `--model-profile` is given, defaults to `gpt-oss` as the base. |
| `--no-bundle` | Skip reproducibility bundle creation. |
| `--bundle-zip` | Zip the bundle for sharing. |

Expand Down
9 changes: 7 additions & 2 deletions benchmarks/appworld/compare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ AGENTS="${AGENTS:-}"
COMPARE_AGENTS="${COMPARE_AGENTS:-false}"
NO_BUNDLE="${NO_BUNDLE:-false}"
BUNDLE_ZIP="${BUNDLE_ZIP:-false}"
USE_DOTENV="${USE_DOTENV:-false}"
FORWARDED_ARGS=()

# Parse arguments
Expand Down Expand Up @@ -79,6 +80,10 @@ while [[ $idx -lt ${#ARGS[@]} ]]; do
BUNDLE_ZIP=true
idx=$((idx+1))
;;
--dotenv)
USE_DOTENV=true
idx=$((idx+1))
;;
*)
FORWARDED_ARGS+=("${ARGS[$idx]}")
idx=$((idx+1))
Expand Down Expand Up @@ -161,8 +166,8 @@ for config in "${CONFIGS[@]}"; do
echo -e "${CYAN:-}Configuration: ${config}${NC:-}"
echo -e "${BLUE:-}══════════════════════════════════════════════════════════════${NC:-}"

if type apply_model_profile &>/dev/null; then
apply_model_profile "$model"
if type apply_model_config &>/dev/null; then
apply_model_config "$model"
fi

# Snapshot existing result files before this config's runs
Expand Down
13 changes: 9 additions & 4 deletions benchmarks/bpo/compare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ COMPARE_AGENTS="${COMPARE_AGENTS:-false}"
COMPARE_POLICIES=false
NO_BUNDLE="${NO_BUNDLE:-false}"
BUNDLE_ZIP="${BUNDLE_ZIP:-false}"
USE_DOTENV="${USE_DOTENV:-false}"
FORWARDED_ARGS=()

# Parse arguments
Expand Down Expand Up @@ -95,6 +96,10 @@ while [[ $idx -lt ${#ARGS[@]} ]]; do
BUNDLE_ZIP=true
idx=$((idx+1))
;;
--dotenv)
USE_DOTENV=true
idx=$((idx+1))
;;
--dry-run)
DRY_RUN=true
idx=$((idx+1))
Expand Down Expand Up @@ -214,10 +219,10 @@ for config in "${CONFIGS[@]}"; do
echo -e "${CYAN:-}Configuration: ${config}${NC:-}"
echo -e "${BLUE:-}══════════════════════════════════════════════════════════════${NC:-}"

# Apply model profile
if type apply_model_profile &>/dev/null; then
if ! apply_model_profile "$model"; then
echo -e "${RED:-}Error: Failed to apply model profile '$model'${NC:-}"
# Apply model config (profile + optional .env overrides)
if type apply_model_config &>/dev/null; then
if ! apply_model_config "$model"; then
echo -e "${RED:-}Error: Failed to apply model config '$model'${NC:-}"
echo -e "${YELLOW:-}Valid profiles: gpt-oss, gpt4o, gpt4.1, opus4.5${NC:-}"
exit 1
fi
Expand Down
65 changes: 64 additions & 1 deletion benchmarks/helpers/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ COMPARE_AGENTS="${COMPARE_AGENTS:-false}"
NO_BUNDLE="${NO_BUNDLE:-false}"
BUNDLE_ZIP="${BUNDLE_ZIP:-false}"
FORWARDED_ARGS=()
USE_DOTENV="${USE_DOTENV:-false}"

parse_common_args() {
local args=("$@")
Expand Down Expand Up @@ -193,6 +194,10 @@ parse_common_args() {
BUNDLE_ZIP=true
idx=$((idx+1))
;;
--dotenv)
USE_DOTENV=true
idx=$((idx+1))
;;
--help|-h)
# Let the caller handle --help
FORWARDED_ARGS+=("$arg")
Expand Down Expand Up @@ -249,9 +254,67 @@ apply_model_cli_overrides_if_set() {
fi
}

# Re-read a .env file with force-export semantics so its variables win over a
# previously-applied model profile.
#
# Arguments:
#   $1 - optional path to the env file (used by tests). When empty or omitted,
#        defaults to <project_root>/.env derived from BASH_SOURCE[0].
# Outputs:
#   A header line plus one "KEY=value" line per exported variable on stdout.
#   Values of variables whose name looks like a credential (key, token,
#   secret, password, credential) are masked so they do not leak into
#   terminal scrollback or captured logs. The exported value is never masked.
# Returns:
#   0 when the file is missing (a warning is printed) or fully processed.
#
# Accepted line forms: KEY=value, export KEY=value, KEY="value", KEY='value',
# each optionally followed by " # comment". Blank lines, comment lines and
# anything that is not an assignment are skipped. Escaped or nested quotes are
# not interpreted; a value with unbalanced quotes is exported literally.
apply_dotenv_model_overrides() {
  local helpers_dir env_file
  helpers_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  env_file="${1:-$helpers_dir/../../.env}"

  if [[ ! -f "$env_file" ]]; then
    printf '%bWarning: --dotenv specified but .env not found at %s%b\n' \
      "${YELLOW:-}" "$env_file" "${NC:-}"
    return 0
  fi

  # Regexes are kept in variables so the quote characters they contain need
  # no escaping inside [[ =~ ]].
  local assign_re='^[[:space:]]*(export[[:space:]]+)?([A-Za-z_][A-Za-z0-9_]*)=(.*)$'
  local dquoted_re='^"([^"]*)"[[:space:]]*(#.*)?$'
  local squoted_re="^'([^']*)'[[:space:]]*(#.*)?\$"

  printf '%b✓%b .env overrides (--dotenv):\n' "${GREEN:-}" "${NC:-}"

  local line key val shown
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Comment lines and blank lines cannot match an assignment, so a single
    # test filters them out together with any other unsupported syntax.
    [[ "$line" =~ $assign_re ]] || continue
    key="${BASH_REMATCH[2]}"
    val="${BASH_REMATCH[3]}"

    if [[ "$val" =~ $dquoted_re ]] || [[ "$val" =~ $squoted_re ]]; then
      # Properly quoted value: keep exactly what is between the matching
      # quotes (a '#' inside is data) and drop any trailing comment.
      val="${BASH_REMATCH[1]}"
    else
      # Unquoted value: strip an inline " # comment", then trailing whitespace
      # (which also removes a stray carriage return from CRLF files).
      val="${val%%[[:space:]]#*}"
      val="${val%"${val##*[![:space:]]}"}"
    fi

    # Never print credentials; only the displayed text is masked.
    shown="$val"
    case "$key" in
      *[Kk][Ee][Yy]* | *[Tt][Oo][Kk][Ee][Nn]* | *[Ss][Ee][Cc][Rr][Ee][Tt]* | \
        *[Pp][Aa][Ss][Ss][Ww]* | *[Cc][Rr][Ee][Dd][Ee][Nn][Tt][Ii][Aa][Ll]*)
        shown='********'
        ;;
    esac

    # printf '%s' prints the value verbatim, unlike echo -e which would
    # interpret backslash sequences contained in it.
    printf ' %b↳%b %s=%s\n' "${GREEN:-}" "${NC:-}" "$key" "$shown"
    export "$key=$val"
  done < "$env_file"
}

# Apply a model profile and, when USE_DOTENV=true, layer the .env overrides on
# top of it.
#
# Arguments:
#   $1 - profile name; may be empty. With USE_DOTENV=true an empty profile
#        falls back to gpt-oss as the base.
#   $2 - optional env file path, forwarded to apply_dotenv_model_overrides
#        (tests pass a temp file instead of the real .env).
# Returns:
#   1 when loading or applying the profile fails; otherwise 0, or the status
#   of apply_dotenv_model_overrides when the .env layer runs.
apply_model_config() {
  local base_profile="${1:-}"
  local dotenv_path="${2:-}"

  if [[ -z "$base_profile" ]]; then
    # No profile requested: nothing to do unless .env layering needs a base.
    [[ "${USE_DOTENV:-false}" == "true" ]] || return 0
    base_profile="gpt-oss"
  fi

  _ensure_model_profiles_loaded || return 1
  apply_model_profile "$base_profile" || return 1

  # The .env layer goes last so its values win over the profile.
  [[ "${USE_DOTENV:-false}" == "true" ]] || return 0
  apply_dotenv_model_overrides "$dotenv_path"
}

# Apply profile then CLI overrides. Call after load_env.sh and arg parsing.
#
# Steps run in order; a failure in either of the first two returns 1
# immediately. The function's own status is that of
# apply_model_cli_overrides_if_set, which runs last so that CLI overrides are
# applied after the profile (and after any .env layer added by
# apply_model_config when USE_DOTENV=true).
#
# NOTE(review): both apply_model_profile_if_set and apply_model_config appear
# below; this looks like the before/after lines of a single change shown
# together — confirm that only one of them is meant to run.
finalize_model_config() {
apply_model_profile_if_set || return 1
# Profile plus optional .env overrides for the profile named by MODEL_PROFILE.
apply_model_config "$MODEL_PROFILE" || return 1
apply_model_cli_overrides_if_set
}

Expand Down
117 changes: 117 additions & 0 deletions benchmarks/helpers/tests/test_model_config.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#!/usr/bin/env bash
# Unit tests for apply_dotenv_model_overrides and apply_model_config.
# Run: bash benchmarks/helpers/tests/test_model_config.sh
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

PASS=0; FAIL=0

# Compare an expected value with an actual one and record the outcome.
#   $1 - test label
#   $2 - expected value
#   $3 - actual value
# Prints the verdict and increments the global PASS or FAIL counter.
assert_eq() {
  local label="${1}" want="${2}" got="${3}"

  if [[ "$want" != "$got" ]]; then
    printf ' FAIL: %s\n want: %s\n got: %s\n' "$label" "$want" "$got"
    FAIL=$((FAIL+1))
    return
  fi

  printf ' PASS: %s\n' "$label"
  PASS=$((PASS+1))
}

# ─── apply_dotenv_model_overrides ────────────────────────────────────────────

echo "apply_dotenv_model_overrides"

# NOTE: diagnostic output (echo lines from apply_dotenv_model_overrides /
# apply_model_profile) is redirected to /dev/null so only the final echo
# survives into $result. In the red phase (before implementation) the whole
# script will abort with "command not found" — that is the expected failure.

# overrides existing vars from a supplied env file
result=$(
source "$SCRIPT_DIR/../common.sh"
TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT
printf 'MODEL_NAME=my-model\nOPENAI_BASE_URL=https://custom\n' > "$TMP"
export MODEL_NAME=original; export OPENAI_BASE_URL=original
apply_dotenv_model_overrides "$TMP" > /dev/null 2>&1
echo "$MODEL_NAME|$OPENAI_BASE_URL"
)
assert_eq "overrides MODEL_NAME and OPENAI_BASE_URL" "my-model|https://custom" "$result"

# no-op when file does not exist — prints warning, does not error
result=$(
source "$SCRIPT_DIR/../common.sh"
export MODEL_NAME=original
apply_dotenv_model_overrides "/no/such/file.env" > /dev/null 2>&1
echo "$MODEL_NAME"
)
assert_eq "no-op when .env missing" "original" "$result"

# strips surrounding quotes and inline comments
result=$(
source "$SCRIPT_DIR/../common.sh"
TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT
printf 'MODEL_NAME="quoted-model"\nOPENAI_BASE_URL=https://x # comment\n' > "$TMP"
apply_dotenv_model_overrides "$TMP" > /dev/null 2>&1
echo "$MODEL_NAME|$OPENAI_BASE_URL"
)
assert_eq "strips quotes and inline comments" "quoted-model|https://x" "$result"

# handles export-prefixed lines
result=$(
source "$SCRIPT_DIR/../common.sh"
TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT
printf 'export MODEL_NAME=export-style\n' > "$TMP"
apply_dotenv_model_overrides "$TMP" > /dev/null 2>&1
echo "$MODEL_NAME"
)
assert_eq "handles export-prefixed lines" "export-style" "$result"

# ─── apply_model_config ───────────────────────────────────────────────────────

echo "apply_model_config"

# USE_DOTENV=false: behaves exactly like apply_model_profile, no .env re-read
result=$(
source "$SCRIPT_DIR/../common.sh"
export USE_DOTENV=false
apply_model_config "gpt-oss" > /dev/null 2>&1
echo "$MODEL_NAME"
)
assert_eq "USE_DOTENV=false: MODEL_NAME from profile" "openai/gpt-oss-120b" "$result"

# USE_DOTENV=true with profile: profile runs first, then .env overrides MODEL_NAME
result=$(
source "$SCRIPT_DIR/../common.sh"
TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT
printf 'MODEL_NAME=dotenv-override\n' > "$TMP"
export USE_DOTENV=true
apply_model_config "gpt-oss" "$TMP" > /dev/null 2>&1
echo "$MODEL_NAME"
)
assert_eq "USE_DOTENV=true: .env wins over profile" "dotenv-override" "$result"

# USE_DOTENV=true, .env does NOT set MODEL_NAME: profile value is kept
result=$(
source "$SCRIPT_DIR/../common.sh"
TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT
printf 'SOME_OTHER_VAR=x\n' > "$TMP"
export USE_DOTENV=true
apply_model_config "gpt-oss" "$TMP" > /dev/null 2>&1
echo "$MODEL_NAME"
)
assert_eq "USE_DOTENV=true: profile value kept when .env omits var" "openai/gpt-oss-120b" "$result"

# USE_DOTENV=true, no profile: defaults to gpt-oss base
result=$(
source "$SCRIPT_DIR/../common.sh"
TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT
printf 'SOME_OTHER_VAR=x\n' > "$TMP"
export USE_DOTENV=true
apply_model_config "" "$TMP" > /dev/null 2>&1
echo "$MODEL_NAME"
)
assert_eq "USE_DOTENV=true, no profile: defaults to gpt-oss" "openai/gpt-oss-120b" "$result"

# ─── Summary ─────────────────────────────────────────────────────────────────

echo ""
echo "Results: $PASS passed, $FAIL failed"
[[ $FAIL -eq 0 ]]
9 changes: 7 additions & 2 deletions benchmarks/m3/compare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ COMPARE_POLICIES="${COMPARE_POLICIES:-false}"
GLOBAL_NO_POLICIES="${GLOBAL_NO_POLICIES:-false}"
NO_BUNDLE="${NO_BUNDLE:-false}"
BUNDLE_ZIP="${BUNDLE_ZIP:-false}"
USE_DOTENV="${USE_DOTENV:-false}"
FORWARDED_ARGS=()

# Parse arguments
Expand Down Expand Up @@ -96,6 +97,10 @@ while [[ $idx -lt ${#ARGS[@]} ]]; do
BUNDLE_ZIP=true
idx=$((idx+1))
;;
--dotenv)
USE_DOTENV=true
idx=$((idx+1))
;;
--dry-run)
DRY_RUN=true
idx=$((idx+1))
Expand Down Expand Up @@ -441,8 +446,8 @@ for config in "${CONFIGS[@]}"; do
echo -e "${CYAN:-}Configuration: ${config}${NC:-}"
echo -e "${BLUE:-}══════════════════════════════════════════════════════════════${NC:-}"

if type apply_model_profile &>/dev/null; then
apply_model_profile "$model"
if type apply_model_config &>/dev/null; then
apply_model_config "$model"
fi

# Per-config extra args (e.g., --no-policies when comparing policy modes).
Expand Down
9 changes: 7 additions & 2 deletions benchmarks/oak_health_insurance/compare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ AGENTS="${AGENTS:-}"
COMPARE_AGENTS="${COMPARE_AGENTS:-false}"
NO_BUNDLE="${NO_BUNDLE:-false}"
BUNDLE_ZIP="${BUNDLE_ZIP:-false}"
USE_DOTENV="${USE_DOTENV:-false}"
FORWARDED_ARGS=()

# Parse arguments
Expand Down Expand Up @@ -76,6 +77,10 @@ while [[ $idx -lt ${#ARGS[@]} ]]; do
BUNDLE_ZIP=true
idx=$((idx+1))
;;
--dotenv)
USE_DOTENV=true
idx=$((idx+1))
;;
--dry-run)
DRY_RUN=true
idx=$((idx+1))
Expand Down Expand Up @@ -155,8 +160,8 @@ for model in "${MODEL_LIST[@]}"; do
echo -e "${CYAN:-}Model: ${model}${NC:-}"
echo -e "${BLUE:-}══════════════════════════════════════════════════════════════${NC:-}"

if type apply_model_profile &>/dev/null; then
apply_model_profile "$model"
if type apply_model_config &>/dev/null; then
apply_model_config "$model"
fi

# Snapshot existing result files and trajectory folders before this model's runs
Expand Down
3 changes: 2 additions & 1 deletion scripts/compare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ for arg in "${FORWARDED_ARGS[@]}"; do
echo " --runs <N> Number of runs (default: 1)"
echo " --output, -o <file> Save comparison report to file"
echo " --model-profile <name> Model profile (gpt-oss, gpt4o, gpt4.1, opus4.5)"
echo " --dotenv Use .env values to override the model profile"
echo " --dry-run Print what would be run without executing"
echo " --no-bundle Skip reproducibility bundle creation"
echo " --verbose, -v Enable verbose output"
Expand Down Expand Up @@ -95,7 +96,7 @@ finalize_model_config
check_langfuse_env

# Export common variables
export RUNS OUTPUT_FILE DRY_RUN NO_BUNDLE BUNDLE_ZIP MODEL_PROFILE VERBOSE AGENT AGENTS COMPARE_AGENTS
export RUNS OUTPUT_FILE DRY_RUN NO_BUNDLE BUNDLE_ZIP MODEL_PROFILE VERBOSE AGENT AGENTS COMPARE_AGENTS USE_DOTENV

# Banner: when comparing multiple agents, show the agent list instead of the singular AGENT.
BANNER_AGENT_LABEL="$AGENT"
Expand Down
3 changes: 2 additions & 1 deletion scripts/eval.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ for arg in "${FORWARDED_ARGS[@]}"; do
echo " --benchmark, -b <name> Benchmark to run (required)"
echo " --agent <name> Agent to run (cuga, react; default: cuga)"
echo " --model-profile <name> Model profile (gpt-oss, gpt4o, gpt4.1, opus4.5)"
echo " --dotenv Use .env values to override the model profile"
echo " --verbose, -v Enable verbose output"
echo " --no-bundle Skip reproducibility bundle creation"
echo " --bundle-zip Create zip archive of bundle"
Expand Down Expand Up @@ -86,7 +87,7 @@ finalize_model_config
check_langfuse_env

# Export common variables for the benchmark script
export NO_BUNDLE BUNDLE_ZIP MODEL_PROFILE VERBOSE AGENT
export NO_BUNDLE BUNDLE_ZIP MODEL_PROFILE VERBOSE AGENT USE_DOTENV

echo -e "${BLUE}╔════════════════════════════════════════════════════════════╗${NC}"
echo -e "${BLUE}║ Evaluation: ${BENCHMARK} [${AGENT}]$(printf '%*s' $((34 - ${#BENCHMARK} - ${#AGENT})) '')║${NC}"
Expand Down
Loading