From bb61735b405af3c3ae86b3f0b0c24c0baf250f65 Mon Sep 17 00:00:00 2001 From: Harold Ship Date: Tue, 9 Jun 2026 14:33:40 +0300 Subject: [PATCH 01/12] test(model-config): add bash unit tests for apply_dotenv_model_overrides and apply_model_config --- benchmarks/helpers/tests/test_model_config.sh | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100755 benchmarks/helpers/tests/test_model_config.sh diff --git a/benchmarks/helpers/tests/test_model_config.sh b/benchmarks/helpers/tests/test_model_config.sh new file mode 100755 index 0000000..413eb20 --- /dev/null +++ b/benchmarks/helpers/tests/test_model_config.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash +# Unit tests for apply_dotenv_model_overrides and apply_model_config. +# Run: bash benchmarks/helpers/tests/test_model_config.sh +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +PASS=0; FAIL=0 + +assert_eq() { + if [[ "${2}" == "${3}" ]]; then + echo " PASS: ${1}"; PASS=$((PASS+1)) + else + echo " FAIL: ${1}"; echo " want: ${2}"; echo " got: ${3}"; FAIL=$((FAIL+1)) + fi +} + +# ─── apply_dotenv_model_overrides ──────────────────────────────────────────── + +echo "apply_dotenv_model_overrides" + +# NOTE: diagnostic output (echo lines from apply_dotenv_model_overrides / +# apply_model_profile) is redirected to /dev/null so only the final echo +# survives into $result. In the red phase (before implementation) the whole +# script will abort with "command not found" — that is the expected failure. 
+ +# overrides existing vars from a supplied env file +result=$( + source "$SCRIPT_DIR/../common.sh" + TMP=$(mktemp); trap "rm -f $TMP" EXIT + printf 'MODEL_NAME=my-model\nOPENAI_BASE_URL=https://custom\n' > "$TMP" + export MODEL_NAME=original; export OPENAI_BASE_URL=original + apply_dotenv_model_overrides "$TMP" > /dev/null + echo "$MODEL_NAME|$OPENAI_BASE_URL" +) +assert_eq "overrides MODEL_NAME and OPENAI_BASE_URL" "my-model|https://custom" "$result" + +# no-op when file does not exist — prints warning, does not error +result=$( + source "$SCRIPT_DIR/../common.sh" + export MODEL_NAME=original + apply_dotenv_model_overrides "/no/such/file.env" > /dev/null 2>&1 + echo "$MODEL_NAME" +) +assert_eq "no-op when .env missing" "original" "$result" + +# strips surrounding quotes and inline comments +result=$( + source "$SCRIPT_DIR/../common.sh" + TMP=$(mktemp); trap "rm -f $TMP" EXIT + printf 'MODEL_NAME="quoted-model"\nOPENAI_BASE_URL=https://x # comment\n' > "$TMP" + apply_dotenv_model_overrides "$TMP" > /dev/null + echo "$MODEL_NAME|$OPENAI_BASE_URL" +) +assert_eq "strips quotes and inline comments" "quoted-model|https://x" "$result" + +# handles export-prefixed lines +result=$( + source "$SCRIPT_DIR/../common.sh" + TMP=$(mktemp); trap "rm -f $TMP" EXIT + printf 'export MODEL_NAME=export-style\n' > "$TMP" + apply_dotenv_model_overrides "$TMP" > /dev/null + echo "$MODEL_NAME" +) +assert_eq "handles export-prefixed lines" "export-style" "$result" + +# ─── apply_model_config ─────────────────────────────────────────────────────── + +echo "apply_model_config" + +# USE_DOTENV=false: behaves exactly like apply_model_profile, no .env re-read +result=$( + source "$SCRIPT_DIR/../common.sh" + export USE_DOTENV=false + apply_model_config "gpt-oss" > /dev/null + echo "$MODEL_NAME" +) +assert_eq "USE_DOTENV=false: MODEL_NAME from profile" "openai/gpt-oss-120b" "$result" + +# USE_DOTENV=true with profile: profile runs first, then .env overrides MODEL_NAME +result=$( + source 
"$SCRIPT_DIR/../common.sh" + TMP=$(mktemp); trap "rm -f $TMP" EXIT + printf 'MODEL_NAME=dotenv-override\n' > "$TMP" + export USE_DOTENV=true + apply_model_config "gpt-oss" "$TMP" > /dev/null + echo "$MODEL_NAME" +) +assert_eq "USE_DOTENV=true: .env wins over profile" "dotenv-override" "$result" + +# USE_DOTENV=true, .env does NOT set MODEL_NAME: profile value is kept +result=$( + source "$SCRIPT_DIR/../common.sh" + TMP=$(mktemp); trap "rm -f $TMP" EXIT + printf 'SOME_OTHER_VAR=x\n' > "$TMP" + export USE_DOTENV=true + apply_model_config "gpt-oss" "$TMP" > /dev/null + echo "$MODEL_NAME" +) +assert_eq "USE_DOTENV=true: profile value kept when .env omits var" "openai/gpt-oss-120b" "$result" + +# USE_DOTENV=true, no profile: defaults to gpt-oss base +result=$( + source "$SCRIPT_DIR/../common.sh" + TMP=$(mktemp); trap "rm -f $TMP" EXIT + printf 'SOME_OTHER_VAR=x\n' > "$TMP" + export USE_DOTENV=true + apply_model_config "" "$TMP" > /dev/null + echo "$MODEL_NAME" +) +assert_eq "USE_DOTENV=true, no profile: defaults to gpt-oss" "openai/gpt-oss-120b" "$result" + +# ─── Summary ───────────────────────────────────────────────────────────────── + +echo "" +echo "Results: $PASS passed, $FAIL failed" +[[ $FAIL -eq 0 ]] From 812655fd95ac57158aa9c860c9206b8c7eb9e661 Mon Sep 17 00:00:00 2001 From: Harold Ship Date: Tue, 9 Jun 2026 14:36:32 +0300 Subject: [PATCH 02/12] test(model-config): suppress stderr in all test function calls --- benchmarks/helpers/tests/test_model_config.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarks/helpers/tests/test_model_config.sh b/benchmarks/helpers/tests/test_model_config.sh index 413eb20..b099d23 100755 --- a/benchmarks/helpers/tests/test_model_config.sh +++ b/benchmarks/helpers/tests/test_model_config.sh @@ -30,7 +30,7 @@ result=$( TMP=$(mktemp); trap "rm -f $TMP" EXIT printf 'MODEL_NAME=my-model\nOPENAI_BASE_URL=https://custom\n' > "$TMP" export MODEL_NAME=original; export OPENAI_BASE_URL=original - 
apply_dotenv_model_overrides "$TMP" > /dev/null + apply_dotenv_model_overrides "$TMP" > /dev/null 2>&1 echo "$MODEL_NAME|$OPENAI_BASE_URL" ) assert_eq "overrides MODEL_NAME and OPENAI_BASE_URL" "my-model|https://custom" "$result" @@ -49,7 +49,7 @@ result=$( source "$SCRIPT_DIR/../common.sh" TMP=$(mktemp); trap "rm -f $TMP" EXIT printf 'MODEL_NAME="quoted-model"\nOPENAI_BASE_URL=https://x # comment\n' > "$TMP" - apply_dotenv_model_overrides "$TMP" > /dev/null + apply_dotenv_model_overrides "$TMP" > /dev/null 2>&1 echo "$MODEL_NAME|$OPENAI_BASE_URL" ) assert_eq "strips quotes and inline comments" "quoted-model|https://x" "$result" @@ -59,7 +59,7 @@ result=$( source "$SCRIPT_DIR/../common.sh" TMP=$(mktemp); trap "rm -f $TMP" EXIT printf 'export MODEL_NAME=export-style\n' > "$TMP" - apply_dotenv_model_overrides "$TMP" > /dev/null + apply_dotenv_model_overrides "$TMP" > /dev/null 2>&1 echo "$MODEL_NAME" ) assert_eq "handles export-prefixed lines" "export-style" "$result" @@ -72,7 +72,7 @@ echo "apply_model_config" result=$( source "$SCRIPT_DIR/../common.sh" export USE_DOTENV=false - apply_model_config "gpt-oss" > /dev/null + apply_model_config "gpt-oss" > /dev/null 2>&1 echo "$MODEL_NAME" ) assert_eq "USE_DOTENV=false: MODEL_NAME from profile" "openai/gpt-oss-120b" "$result" @@ -83,7 +83,7 @@ result=$( TMP=$(mktemp); trap "rm -f $TMP" EXIT printf 'MODEL_NAME=dotenv-override\n' > "$TMP" export USE_DOTENV=true - apply_model_config "gpt-oss" "$TMP" > /dev/null + apply_model_config "gpt-oss" "$TMP" > /dev/null 2>&1 echo "$MODEL_NAME" ) assert_eq "USE_DOTENV=true: .env wins over profile" "dotenv-override" "$result" @@ -94,7 +94,7 @@ result=$( TMP=$(mktemp); trap "rm -f $TMP" EXIT printf 'SOME_OTHER_VAR=x\n' > "$TMP" export USE_DOTENV=true - apply_model_config "gpt-oss" "$TMP" > /dev/null + apply_model_config "gpt-oss" "$TMP" > /dev/null 2>&1 echo "$MODEL_NAME" ) assert_eq "USE_DOTENV=true: profile value kept when .env omits var" "openai/gpt-oss-120b" "$result" @@ -105,7 
+105,7 @@ result=$( TMP=$(mktemp); trap "rm -f $TMP" EXIT printf 'SOME_OTHER_VAR=x\n' > "$TMP" export USE_DOTENV=true - apply_model_config "" "$TMP" > /dev/null + apply_model_config "" "$TMP" > /dev/null 2>&1 echo "$MODEL_NAME" ) assert_eq "USE_DOTENV=true, no profile: defaults to gpt-oss" "openai/gpt-oss-120b" "$result" From 1047fa9843f31b18011c64504634040819f0d87d Mon Sep 17 00:00:00 2001 From: Harold Ship Date: Tue, 9 Jun 2026 14:37:11 +0300 Subject: [PATCH 03/12] feat(model-config): add USE_DOTENV global and --dotenv arg parsing --- benchmarks/helpers/common.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmarks/helpers/common.sh b/benchmarks/helpers/common.sh index a2436bd..e9f94da 100755 --- a/benchmarks/helpers/common.sh +++ b/benchmarks/helpers/common.sh @@ -131,6 +131,7 @@ COMPARE_AGENTS="${COMPARE_AGENTS:-false}" NO_BUNDLE="${NO_BUNDLE:-false}" BUNDLE_ZIP="${BUNDLE_ZIP:-false}" FORWARDED_ARGS=() +USE_DOTENV="${USE_DOTENV:-false}" parse_common_args() { local args=("$@") @@ -193,6 +194,10 @@ parse_common_args() { BUNDLE_ZIP=true idx=$((idx+1)) ;; + --dotenv) + USE_DOTENV=true + idx=$((idx+1)) + ;; --help|-h) # Let the caller handle --help FORWARDED_ARGS+=("$arg") From 4a7848d0ffa760e0643a2d02113c44f6296a9f22 Mon Sep 17 00:00:00 2001 From: Harold Ship Date: Tue, 9 Jun 2026 14:38:28 +0300 Subject: [PATCH 04/12] feat(model-config): add apply_dotenv_model_overrides --- benchmarks/helpers/common.sh | 40 ++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/benchmarks/helpers/common.sh b/benchmarks/helpers/common.sh index e9f94da..1782b62 100755 --- a/benchmarks/helpers/common.sh +++ b/benchmarks/helpers/common.sh @@ -254,6 +254,46 @@ apply_model_cli_overrides_if_set() { fi } +# Re-read .env with force-export semantics so .env vars win over a +# previously-applied model profile. Accepts an optional path argument for +# testability; defaults to <repo-root>/.env derived from BASH_SOURCE[0]. 
+apply_dotenv_model_overrides() { + local helpers_dir env_file + helpers_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + env_file="${1:-$helpers_dir/../../.env}" + + if [ ! -f "$env_file" ]; then + echo -e "${YELLOW}Warning: --dotenv specified but .env not found at $env_file${NC}" + return 0 + fi + + echo -e "${GREEN}✓${NC} .env overrides (--dotenv):" + local line key val + while IFS= read -r line || [[ -n "$line" ]]; do + [[ "$line" =~ ^[[:space:]]*# ]] && continue + [[ -z "${line//[[:space:]]/}" ]] && continue + if [[ "$line" =~ ^[[:space:]]*export[[:space:]]+([A-Za-z_][A-Za-z0-9_]*)=(.*) ]]; then + key="${BASH_REMATCH[1]}" + val="${BASH_REMATCH[2]}" + elif [[ "$line" =~ ^[[:space:]]*([A-Za-z_][A-Za-z0-9_]*)=(.*) ]]; then + key="${BASH_REMATCH[1]}" + val="${BASH_REMATCH[2]}" + else + continue + fi + # Strip inline comments from unquoted values + if [[ "$val" != \"*\" && "$val" != \'*\' ]]; then + val="${val%%[[:space:]]#*}" + val="${val%"${val##*[![:space:]]}"}" + fi + # Strip surrounding quotes + val="${val#\"}" ; val="${val%\"}" + val="${val#\'}" ; val="${val%\'}" + echo -e " ${GREEN}↳${NC} $key=$val" + export "$key=$val" + done < "$env_file" +} + # Apply profile then CLI overrides. Call after load_env.sh and arg parsing. finalize_model_config() { apply_model_profile_if_set || return 1 From c1bbb28256bd254b61bddbaafd401120bb8f117d Mon Sep 17 00:00:00 2001 From: Harold Ship Date: Tue, 9 Jun 2026 14:39:57 +0300 Subject: [PATCH 05/12] feat(model-config): add apply_model_config and update finalize_model_config --- benchmarks/helpers/common.sh | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/benchmarks/helpers/common.sh b/benchmarks/helpers/common.sh index 1782b62..49f7884 100755 --- a/benchmarks/helpers/common.sh +++ b/benchmarks/helpers/common.sh @@ -294,9 +294,27 @@ apply_dotenv_model_overrides() { done < "$env_file" } +# Apply a model profile and, when USE_DOTENV=true, layer .env overrides on top. 
+# With no profile and USE_DOTENV=true, defaults to gpt-oss as the base. +# env_file is optional; used by tests to supply a temp file instead of the real .env. +apply_model_config() { + local profile="${1:-}" + local env_file="${2:-}" + if [[ "${USE_DOTENV:-false}" == "true" && -z "$profile" ]]; then + profile="gpt-oss" + fi + if [[ -n "$profile" ]]; then + _ensure_model_profiles_loaded || return 1 + apply_model_profile "$profile" || return 1 + fi + if [[ "${USE_DOTENV:-false}" == "true" ]]; then + apply_dotenv_model_overrides "$env_file" + fi +} + # Apply profile then CLI overrides. Call after load_env.sh and arg parsing. finalize_model_config() { - apply_model_profile_if_set || return 1 + apply_model_config "$MODEL_PROFILE" || return 1 apply_model_cli_overrides_if_set } From cb5dbf51e96f97957b6decca81cc4a5e58e46b91 Mon Sep 17 00:00:00 2001 From: Harold Ship Date: Tue, 9 Jun 2026 14:40:56 +0300 Subject: [PATCH 06/12] feat(model-config): expose --dotenv in top-level eval/compare scripts --- scripts/compare.sh | 3 ++- scripts/eval.sh | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/compare.sh b/scripts/compare.sh index 1712f17..b34491e 100755 --- a/scripts/compare.sh +++ b/scripts/compare.sh @@ -34,6 +34,7 @@ for arg in "${FORWARDED_ARGS[@]}"; do echo " --runs Number of runs (default: 1)" echo " --output, -o Save comparison report to file" echo " --model-profile Model profile (gpt-oss, gpt4o, gpt4.1, opus4.5)" + echo " --dotenv Use .env values to override the model profile" echo " --dry-run Print what would be run without executing" echo " --no-bundle Skip reproducibility bundle creation" echo " --verbose, -v Enable verbose output" @@ -95,7 +96,7 @@ finalize_model_config check_langfuse_env # Export common variables -export RUNS OUTPUT_FILE DRY_RUN NO_BUNDLE BUNDLE_ZIP MODEL_PROFILE VERBOSE AGENT AGENTS COMPARE_AGENTS +export RUNS OUTPUT_FILE DRY_RUN NO_BUNDLE BUNDLE_ZIP MODEL_PROFILE VERBOSE AGENT AGENTS COMPARE_AGENTS USE_DOTENV # 
Banner: when comparing multiple agents, show the agent list instead of the singular AGENT. BANNER_AGENT_LABEL="$AGENT" diff --git a/scripts/eval.sh b/scripts/eval.sh index c973b03..0be74aa 100755 --- a/scripts/eval.sh +++ b/scripts/eval.sh @@ -30,6 +30,7 @@ for arg in "${FORWARDED_ARGS[@]}"; do echo " --benchmark, -b Benchmark to run (required)" echo " --agent Agent to run (cuga, react; default: cuga)" echo " --model-profile Model profile (gpt-oss, gpt4o, gpt4.1, opus4.5)" + echo " --dotenv Use .env values to override the model profile" echo " --verbose, -v Enable verbose output" echo " --no-bundle Skip reproducibility bundle creation" echo " --bundle-zip Create zip archive of bundle" @@ -86,7 +87,7 @@ finalize_model_config check_langfuse_env # Export common variables for the benchmark script -export NO_BUNDLE BUNDLE_ZIP MODEL_PROFILE VERBOSE AGENT +export NO_BUNDLE BUNDLE_ZIP MODEL_PROFILE VERBOSE AGENT USE_DOTENV echo -e "${BLUE}╔════════════════════════════════════════════════════════════╗${NC}" echo -e "${BLUE}║ Evaluation: ${BENCHMARK} [${AGENT}]$(printf '%*s' $((34 - ${#BENCHMARK} - ${#AGENT})) '')║${NC}" From fd33430a3c377c04ecb09c8800b652837d36c44a Mon Sep 17 00:00:00 2001 From: Harold Ship Date: Tue, 9 Jun 2026 14:43:52 +0300 Subject: [PATCH 07/12] feat(bpo): honour --dotenv in per-config compare loop --- benchmarks/bpo/compare.sh | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/benchmarks/bpo/compare.sh b/benchmarks/bpo/compare.sh index a99bef0..a285bc8 100755 --- a/benchmarks/bpo/compare.sh +++ b/benchmarks/bpo/compare.sh @@ -51,6 +51,7 @@ COMPARE_AGENTS="${COMPARE_AGENTS:-false}" COMPARE_POLICIES=false NO_BUNDLE="${NO_BUNDLE:-false}" BUNDLE_ZIP="${BUNDLE_ZIP:-false}" +USE_DOTENV="${USE_DOTENV:-false}" FORWARDED_ARGS=() # Parse arguments @@ -95,6 +96,10 @@ while [[ $idx -lt ${#ARGS[@]} ]]; do BUNDLE_ZIP=true idx=$((idx+1)) ;; + --dotenv) + USE_DOTENV=true + idx=$((idx+1)) + ;; --dry-run) DRY_RUN=true idx=$((idx+1)) @@ 
-214,10 +219,10 @@ for config in "${CONFIGS[@]}"; do echo -e "${CYAN:-}Configuration: ${config}${NC:-}" echo -e "${BLUE:-}══════════════════════════════════════════════════════════════${NC:-}" - # Apply model profile - if type apply_model_profile &>/dev/null; then - if ! apply_model_profile "$model"; then - echo -e "${RED:-}Error: Failed to apply model profile '$model'${NC:-}" + # Apply model config (profile + optional .env overrides) + if type apply_model_config &>/dev/null; then + if ! apply_model_config "$model"; then + echo -e "${RED:-}Error: Failed to apply model config '$model'${NC:-}" echo -e "${YELLOW:-}Valid profiles: gpt-oss, gpt4o, gpt4.1, opus4.5${NC:-}" exit 1 fi From d7714df657590be3ef11ea41cc1fa178f123c878 Mon Sep 17 00:00:00 2001 From: Harold Ship Date: Tue, 9 Jun 2026 14:46:46 +0300 Subject: [PATCH 08/12] feat(m3): honour --dotenv in per-config compare loop --- benchmarks/m3/compare.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/benchmarks/m3/compare.sh b/benchmarks/m3/compare.sh index 8141108..c106d1c 100755 --- a/benchmarks/m3/compare.sh +++ b/benchmarks/m3/compare.sh @@ -48,6 +48,7 @@ COMPARE_POLICIES="${COMPARE_POLICIES:-false}" GLOBAL_NO_POLICIES="${GLOBAL_NO_POLICIES:-false}" NO_BUNDLE="${NO_BUNDLE:-false}" BUNDLE_ZIP="${BUNDLE_ZIP:-false}" +USE_DOTENV="${USE_DOTENV:-false}" FORWARDED_ARGS=() # Parse arguments @@ -96,6 +97,10 @@ while [[ $idx -lt ${#ARGS[@]} ]]; do BUNDLE_ZIP=true idx=$((idx+1)) ;; + --dotenv) + USE_DOTENV=true + idx=$((idx+1)) + ;; --dry-run) DRY_RUN=true idx=$((idx+1)) @@ -441,8 +446,8 @@ for config in "${CONFIGS[@]}"; do echo -e "${CYAN:-}Configuration: ${config}${NC:-}" echo -e "${BLUE:-}══════════════════════════════════════════════════════════════${NC:-}" - if type apply_model_profile &>/dev/null; then - apply_model_profile "$model" + if type apply_model_config &>/dev/null; then + apply_model_config "$model" fi # Per-config extra args (e.g., --no-policies when comparing policy modes). 
From debcf621d78e1b9cecad10faedcde0ba8b6d5c47 Mon Sep 17 00:00:00 2001 From: Harold Ship Date: Tue, 9 Jun 2026 14:49:27 +0300 Subject: [PATCH 09/12] feat(appworld): honour --dotenv in per-config compare loop --- benchmarks/appworld/compare.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/benchmarks/appworld/compare.sh b/benchmarks/appworld/compare.sh index 18076ba..6d2473e 100755 --- a/benchmarks/appworld/compare.sh +++ b/benchmarks/appworld/compare.sh @@ -35,6 +35,7 @@ AGENTS="${AGENTS:-}" COMPARE_AGENTS="${COMPARE_AGENTS:-false}" NO_BUNDLE="${NO_BUNDLE:-false}" BUNDLE_ZIP="${BUNDLE_ZIP:-false}" +USE_DOTENV="${USE_DOTENV:-false}" FORWARDED_ARGS=() # Parse arguments @@ -79,6 +80,10 @@ while [[ $idx -lt ${#ARGS[@]} ]]; do BUNDLE_ZIP=true idx=$((idx+1)) ;; + --dotenv) + USE_DOTENV=true + idx=$((idx+1)) + ;; *) FORWARDED_ARGS+=("${ARGS[$idx]}") idx=$((idx+1)) @@ -161,8 +166,8 @@ for config in "${CONFIGS[@]}"; do echo -e "${CYAN:-}Configuration: ${config}${NC:-}" echo -e "${BLUE:-}══════════════════════════════════════════════════════════════${NC:-}" - if type apply_model_profile &>/dev/null; then - apply_model_profile "$model" + if type apply_model_config &>/dev/null; then + apply_model_config "$model" fi # Snapshot existing result files before this config's runs From a28b714ed3eda22195b2db019c827318cea7522d Mon Sep 17 00:00:00 2001 From: Harold Ship Date: Tue, 9 Jun 2026 14:50:37 +0300 Subject: [PATCH 10/12] feat(oak): honour --dotenv in per-model compare loop --- benchmarks/oak_health_insurance/compare.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/benchmarks/oak_health_insurance/compare.sh b/benchmarks/oak_health_insurance/compare.sh index e27c47d..57ec651 100755 --- a/benchmarks/oak_health_insurance/compare.sh +++ b/benchmarks/oak_health_insurance/compare.sh @@ -36,6 +36,7 @@ AGENTS="${AGENTS:-}" COMPARE_AGENTS="${COMPARE_AGENTS:-false}" NO_BUNDLE="${NO_BUNDLE:-false}" BUNDLE_ZIP="${BUNDLE_ZIP:-false}" 
+USE_DOTENV="${USE_DOTENV:-false}" FORWARDED_ARGS=() # Parse arguments @@ -76,6 +77,10 @@ while [[ $idx -lt ${#ARGS[@]} ]]; do BUNDLE_ZIP=true idx=$((idx+1)) ;; + --dotenv) + USE_DOTENV=true + idx=$((idx+1)) + ;; --dry-run) DRY_RUN=true idx=$((idx+1)) @@ -155,8 +160,8 @@ for model in "${MODEL_LIST[@]}"; do echo -e "${CYAN:-}Model: ${model}${NC:-}" echo -e "${BLUE:-}══════════════════════════════════════════════════════════════${NC:-}" - if type apply_model_profile &>/dev/null; then - apply_model_profile "$model" + if type apply_model_config &>/dev/null; then + apply_model_config "$model" fi # Snapshot existing result files and trajectory folders before this model's runs From 6c0c8e9c1ba08819105474db5efee791ef967609 Mon Sep 17 00:00:00 2001 From: Harold Ship Date: Tue, 9 Jun 2026 14:52:03 +0300 Subject: [PATCH 11/12] docs: document --dotenv flag in README --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index e980d72..1b3717e 100644 --- a/README.md +++ b/README.md @@ -342,6 +342,10 @@ cd benchmarks/appworld && ./eval.sh --task 82e2fac_1 # Compare via dispatcher ./scripts/compare.sh --benchmark bpo --runs 3 ./scripts/compare.sh --benchmark m3 --runs 2 + +# .env overrides model profile (.env wins; defaults to gpt-oss base if no --model-profile) +./scripts/eval.sh --benchmark bpo --dotenv +./scripts/eval.sh --benchmark bpo --model-profile gpt4o --dotenv ``` ### Common flags @@ -355,6 +359,7 @@ Flags accepted by every `eval.sh` (and forwarded by every `compare.sh`): | `--task ...` | Run only the listed task(s) (numeric IDs, task names, or — for AppWorld — task UUIDs). | | `--agent cuga\|react` | Pick agent. `cuga` is the default; `react` runs the lightweight ReAct baseline. Not all benchmarks support both (see Agent Selection above). | | `--model-profile ` | Pick model profile (`gpt-oss`, `gpt4o`, `gpt4.1`, `opus4.5`). Default comes from `.env`. 
| +| `--dotenv` | After applying the model profile, re-read `.env` and force-export every variable it contains. `.env` values override the profile. If no `--model-profile` is given, defaults to `gpt-oss` as the base. | | `--no-bundle` | Skip reproducibility bundle creation. | | `--bundle-zip` | Zip the bundle for sharing. | From 6f4be7f281e06d6adb3baf45fd0af567d1091ac1 Mon Sep 17 00:00:00 2001 From: Harold Ship Date: Tue, 9 Jun 2026 17:47:57 +0300 Subject: [PATCH 12/12] fix(test): defer $TMP expansion in trap commands (SC2064) --- benchmarks/helpers/tests/test_model_config.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/helpers/tests/test_model_config.sh b/benchmarks/helpers/tests/test_model_config.sh index b099d23..ee5a61e 100755 --- a/benchmarks/helpers/tests/test_model_config.sh +++ b/benchmarks/helpers/tests/test_model_config.sh @@ -27,7 +27,7 @@ echo "apply_dotenv_model_overrides" # overrides existing vars from a supplied env file result=$( source "$SCRIPT_DIR/../common.sh" - TMP=$(mktemp); trap "rm -f $TMP" EXIT + TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT printf 'MODEL_NAME=my-model\nOPENAI_BASE_URL=https://custom\n' > "$TMP" export MODEL_NAME=original; export OPENAI_BASE_URL=original apply_dotenv_model_overrides "$TMP" > /dev/null 2>&1 @@ -47,7 +47,7 @@ assert_eq "no-op when .env missing" "original" "$result" # strips surrounding quotes and inline comments result=$( source "$SCRIPT_DIR/../common.sh" - TMP=$(mktemp); trap "rm -f $TMP" EXIT + TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT printf 'MODEL_NAME="quoted-model"\nOPENAI_BASE_URL=https://x # comment\n' > "$TMP" apply_dotenv_model_overrides "$TMP" > /dev/null 2>&1 echo "$MODEL_NAME|$OPENAI_BASE_URL" @@ -57,7 +57,7 @@ assert_eq "strips quotes and inline comments" "quoted-model|https://x" "$result" # handles export-prefixed lines result=$( source "$SCRIPT_DIR/../common.sh" - TMP=$(mktemp); trap "rm -f $TMP" EXIT + TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT printf 'export 
MODEL_NAME=export-style\n' > "$TMP" apply_dotenv_model_overrides "$TMP" > /dev/null 2>&1 echo "$MODEL_NAME" @@ -80,7 +80,7 @@ assert_eq "USE_DOTENV=false: MODEL_NAME from profile" "openai/gpt-oss-120b" "$re # USE_DOTENV=true with profile: profile runs first, then .env overrides MODEL_NAME result=$( source "$SCRIPT_DIR/../common.sh" - TMP=$(mktemp); trap "rm -f $TMP" EXIT + TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT printf 'MODEL_NAME=dotenv-override\n' > "$TMP" export USE_DOTENV=true apply_model_config "gpt-oss" "$TMP" > /dev/null 2>&1 @@ -91,7 +91,7 @@ assert_eq "USE_DOTENV=true: .env wins over profile" "dotenv-override" "$result" # USE_DOTENV=true, .env does NOT set MODEL_NAME: profile value is kept result=$( source "$SCRIPT_DIR/../common.sh" - TMP=$(mktemp); trap "rm -f $TMP" EXIT + TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT printf 'SOME_OTHER_VAR=x\n' > "$TMP" export USE_DOTENV=true apply_model_config "gpt-oss" "$TMP" > /dev/null 2>&1 @@ -102,7 +102,7 @@ assert_eq "USE_DOTENV=true: profile value kept when .env omits var" "openai/gpt- # USE_DOTENV=true, no profile: defaults to gpt-oss base result=$( source "$SCRIPT_DIR/../common.sh" - TMP=$(mktemp); trap "rm -f $TMP" EXIT + TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT printf 'SOME_OTHER_VAR=x\n' > "$TMP" export USE_DOTENV=true apply_model_config "" "$TMP" > /dev/null 2>&1