cuga-project · haroldship · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026
diff --git a/README.md b/README.md
@@ -342,6 +342,10 @@ cd benchmarks/appworld && ./eval.sh --task 82e2fac_1
 # Compare via dispatcher
 ./scripts/compare.sh --benchmark bpo --runs 3
 ./scripts/compare.sh --benchmark m3 --runs 2
+
+# --dotenv: apply the model profile first, then let .env values override it (base profile is gpt-oss when --model-profile is omitted)
+./scripts/eval.sh --benchmark bpo --dotenv
+./scripts/eval.sh --benchmark bpo --model-profile gpt4o --dotenv
 ```
 
 ### Common flags
@@ -355,6 +359,7 @@ Flags accepted by every `eval.sh` (and forwarded by every `compare.sh`):
 | `--task <id>...` | Run only the listed task(s) (numeric IDs, task names, or — for AppWorld — task UUIDs). |
 | `--agent cuga\|react` | Pick agent. `cuga` is the default; `react` runs the lightweight ReAct baseline. Not all benchmarks support both (see Agent Selection above). |
 | `--model-profile <name>` | Pick model profile (`gpt-oss`, `gpt4o`, `gpt4.1`, `opus4.5`). Default comes from `.env`. |
+| `--dotenv` | After applying the model profile, re-read `.env` and force-export every variable it contains. `.env` values override the profile. If no `--model-profile` is given, defaults to `gpt-oss` as the base. |
 | `--no-bundle` | Skip reproducibility bundle creation. |
 | `--bundle-zip` | Zip the bundle for sharing. |
 

diff --git a/benchmarks/appworld/compare.sh b/benchmarks/appworld/compare.sh
@@ -35,6 +35,7 @@ AGENTS="${AGENTS:-}"
 COMPARE_AGENTS="${COMPARE_AGENTS:-false}"
 NO_BUNDLE="${NO_BUNDLE:-false}"
 BUNDLE_ZIP="${BUNDLE_ZIP:-false}"
+USE_DOTENV="${USE_DOTENV:-false}"
 FORWARDED_ARGS=()
 
 # Parse arguments
@@ -79,6 +80,10 @@ while [[ $idx -lt ${#ARGS[@]} ]]; do
             BUNDLE_ZIP=true
             idx=$((idx+1))
             ;;
+        --dotenv)
+            USE_DOTENV=true
+            idx=$((idx+1))
+            ;;
         *)
             FORWARDED_ARGS+=("${ARGS[$idx]}")
             idx=$((idx+1))
@@ -161,8 +166,8 @@ for config in "${CONFIGS[@]}"; do
     echo -e "${CYAN:-}Configuration: ${config}${NC:-}"
     echo -e "${BLUE:-}══════════════════════════════════════════════════════════════${NC:-}"
 
-    if type apply_model_profile &>/dev/null; then
-        apply_model_profile "$model"
+    if type apply_model_config &>/dev/null; then
+        apply_model_config "$model"
     fi
 
     # Snapshot existing result files before this config's runs

diff --git a/benchmarks/bpo/compare.sh b/benchmarks/bpo/compare.sh
@@ -51,6 +51,7 @@ COMPARE_AGENTS="${COMPARE_AGENTS:-false}"
 COMPARE_POLICIES=false
 NO_BUNDLE="${NO_BUNDLE:-false}"
 BUNDLE_ZIP="${BUNDLE_ZIP:-false}"
+USE_DOTENV="${USE_DOTENV:-false}"
 FORWARDED_ARGS=()
 
 # Parse arguments
@@ -95,6 +96,10 @@ while [[ $idx -lt ${#ARGS[@]} ]]; do
             BUNDLE_ZIP=true
             idx=$((idx+1))
             ;;
+        --dotenv)
+            USE_DOTENV=true
+            idx=$((idx+1))
+            ;;
         --dry-run)
             DRY_RUN=true
             idx=$((idx+1))
@@ -214,10 +219,10 @@ for config in "${CONFIGS[@]}"; do
     echo -e "${CYAN:-}Configuration: ${config}${NC:-}"
     echo -e "${BLUE:-}══════════════════════════════════════════════════════════════${NC:-}"
 
-    # Apply model profile
-    if type apply_model_profile &>/dev/null; then
-        if ! apply_model_profile "$model"; then
-            echo -e "${RED:-}Error: Failed to apply model profile '$model'${NC:-}"
+    # Apply model config (profile + optional .env overrides)
+    if type apply_model_config &>/dev/null; then
+        if ! apply_model_config "$model"; then
+            echo -e "${RED:-}Error: Failed to apply model config '$model'${NC:-}"
             echo -e "${YELLOW:-}Valid profiles: gpt-oss, gpt4o, gpt4.1, opus4.5${NC:-}"
             exit 1
         fi

diff --git a/benchmarks/helpers/common.sh b/benchmarks/helpers/common.sh
@@ -131,6 +131,7 @@ COMPARE_AGENTS="${COMPARE_AGENTS:-false}"
 NO_BUNDLE="${NO_BUNDLE:-false}"
 BUNDLE_ZIP="${BUNDLE_ZIP:-false}"
 FORWARDED_ARGS=()
+USE_DOTENV="${USE_DOTENV:-false}"
 
 parse_common_args() {
     local args=("$@")
@@ -193,6 +194,10 @@ parse_common_args() {
                 BUNDLE_ZIP=true
                 idx=$((idx+1))
                 ;;
+            --dotenv)
+                USE_DOTENV=true
+                idx=$((idx+1))
+                ;;
             --help|-h)
                 # Let the caller handle --help
                 FORWARDED_ARGS+=("$arg")
@@ -249,9 +254,67 @@ apply_model_cli_overrides_if_set() {
     fi
 }
 
+# Re-read a dotenv file and export every assignment in it, so .env values win
+# over a previously applied model profile.
+#   $1  optional file path (tests pass a temp file); defaults to
+#       <project_root>/.env, located relative to BASH_SOURCE[0].
+# Accepts KEY=v and `export KEY=v`; any other line (blank, comment, malformed)
+# is skipped. A missing file only warns (on stderr) and returns 0.
+apply_dotenv_model_overrides() {
+    local helpers_dir env_file line key val shown
+    # Regexes live in variables so their quote characters are matched literally.
+    local assign_re='^[[:space:]]*(export[[:space:]]+)?([A-Za-z_][A-Za-z0-9_]*)=(.*)$'
+    local dq_re='^"(.*)"([[:space:]]+#.*)?[[:space:]]*$'
+    local sq_re="^'(.*)'([[:space:]]+#.*)?[[:space:]]*\$"
+    helpers_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+    env_file="${1:-$helpers_dir/../../.env}"
+    if [[ ! -f "$env_file" ]]; then
+        echo -e "${YELLOW:-}Warning: --dotenv specified but .env not found at $env_file${NC:-}" >&2
+        return 0
+    fi
+    echo -e "${GREEN:-}✓${NC:-} .env overrides (--dotenv):"
+    while IFS= read -r line || [[ -n "$line" ]]; do
+        [[ "$line" =~ $assign_re ]] || continue
+        key="${BASH_REMATCH[2]}" val="${BASH_REMATCH[3]}"
+        if [[ "$val" =~ $dq_re || "$val" =~ $sq_re ]]; then
+            # Matching quote pair, optionally followed by a comment: keep the inside.
+            val="${BASH_REMATCH[1]}"
+        else
+            # Unquoted: drop an inline " # comment", then trailing whitespace.
+            val="${val%%[[:space:]]#*}"
+            val="${val%"${val##*[![:space:]]}"}"
+        fi
+        # Never log credentials: .env may hold API keys next to model settings.
+        case "$key" in
+            *KEY | *TOKEN | *SECRET* | *PASSWORD*) shown='***' ;;
+            *) shown="$val" ;;
+        esac
+        printf '  %b↳%b %s=%s\n' "${GREEN:-}" "${NC:-}" "$key" "$shown"
+        export "$key=$val"
+    done < "$env_file"
+}
+
+# Layered model configuration: the named profile first, then (only when
+# USE_DOTENV=true) the .env overrides on top of it. When USE_DOTENV=true and
+# no profile is named, gpt-oss serves as the base profile.
+#   $1  profile name (may be empty)
+#   $2  optional dotenv path; tests use it to substitute a temp file
+apply_model_config() {
+    local env_file="${2:-}"
+    local profile="${1:-}"
+    # Fall back to gpt-oss only in the dotenv-without-profile case.
+    [[ "${USE_DOTENV:-false}" != "true" || -n "$profile" ]] || profile="gpt-oss"
+    if [[ -n "$profile" ]]; then
+        _ensure_model_profiles_loaded || return 1
+        apply_model_profile "$profile" || return 1
+    fi
+    [[ "${USE_DOTENV:-false}" == "true" ]] || return 0
+    apply_dotenv_model_overrides "$env_file"
+}
+
 # Apply profile then CLI overrides. Call after load_env.sh and arg parsing.
 finalize_model_config() {
-    apply_model_profile_if_set || return 1
+    apply_model_config "${MODEL_PROFILE:-}" || return 1  # ":-" tolerates an unset MODEL_PROFILE under `set -u`; empty means "no profile"
     apply_model_cli_overrides_if_set
 }
 

diff --git a/benchmarks/helpers/tests/test_model_config.sh b/benchmarks/helpers/tests/test_model_config.sh
@@ -0,0 +1,117 @@
+#!/usr/bin/env bash
+# Unit tests for apply_dotenv_model_overrides and apply_model_config.
+# Run: bash benchmarks/helpers/tests/test_model_config.sh
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+PASS=0; FAIL=0
+
+# assert_eq <label> <want> <got>: print PASS/FAIL and bump the matching counter.
+assert_eq() {
+    local label="$1" want="$2" got="$3"
+    [[ "$want" == "$got" ]] && { echo "  PASS: $label"; PASS=$((PASS+1)); return 0; }
+    printf '  FAIL: %s\n        want: %s\n        got:  %s\n' "$label" "$want" "$got"
+    FAIL=$((FAIL+1))
+}
+
+# ─── apply_dotenv_model_overrides ────────────────────────────────────────────
+
+echo "apply_dotenv_model_overrides"
+
+# NOTE: every case runs inside a $( ... ) subshell. Diagnostic output from the
+# helpers under test is redirected to /dev/null, so only the final echo is
+# captured in $result. In the red phase (before implementation) the helpers do
+# not exist yet and the cases fail with "command not found" — that is expected.
+
+# Case: values from the supplied env file replace already-exported variables
+result=$(
+    source "$SCRIPT_DIR/../common.sh"
+    TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT
+    printf 'MODEL_NAME=my-model\nOPENAI_BASE_URL=https://custom\n' > "$TMP"
+    export MODEL_NAME=original; export OPENAI_BASE_URL=original
+    apply_dotenv_model_overrides "$TMP" > /dev/null 2>&1
+    echo "$MODEL_NAME|$OPENAI_BASE_URL"
+)
+assert_eq "overrides MODEL_NAME and OPENAI_BASE_URL" "my-model|https://custom" "$result"
+
+# Case: missing file — the exported value must stay as it was and the call must not fail
+result=$(
+    source "$SCRIPT_DIR/../common.sh"
+    export MODEL_NAME=original
+    apply_dotenv_model_overrides "/no/such/file.env" > /dev/null 2>&1
+    echo "$MODEL_NAME"
+)
+assert_eq "no-op when .env missing" "original" "$result"
+
+# Case: surrounding double quotes and a trailing " # comment" are stripped from values
+result=$(
+    source "$SCRIPT_DIR/../common.sh"
+    TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT
+    printf 'MODEL_NAME="quoted-model"\nOPENAI_BASE_URL=https://x # comment\n' > "$TMP"
+    apply_dotenv_model_overrides "$TMP" > /dev/null 2>&1
+    echo "$MODEL_NAME|$OPENAI_BASE_URL"
+)
+assert_eq "strips quotes and inline comments" "quoted-model|https://x" "$result"
+
+# Case: lines written as "export KEY=value" are accepted like plain KEY=value
+result=$(
+    source "$SCRIPT_DIR/../common.sh"
+    TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT
+    printf 'export MODEL_NAME=export-style\n' > "$TMP"
+    apply_dotenv_model_overrides "$TMP" > /dev/null 2>&1
+    echo "$MODEL_NAME"
+)
+assert_eq "handles export-prefixed lines" "export-style" "$result"
+
+# ─── apply_model_config ───────────────────────────────────────────────────────
+
+echo "apply_model_config"
+
+# Case: USE_DOTENV=false — MODEL_NAME must be the value set by the gpt-oss profile
+result=$(
+    source "$SCRIPT_DIR/../common.sh"
+    export USE_DOTENV=false
+    apply_model_config "gpt-oss" > /dev/null 2>&1
+    echo "$MODEL_NAME"
+)
+assert_eq "USE_DOTENV=false: MODEL_NAME from profile" "openai/gpt-oss-120b" "$result"
+
+# Case: USE_DOTENV=true with a profile — MODEL_NAME from the env file must win
+result=$(
+    source "$SCRIPT_DIR/../common.sh"
+    TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT
+    printf 'MODEL_NAME=dotenv-override\n' > "$TMP"
+    export USE_DOTENV=true
+    apply_model_config "gpt-oss" "$TMP" > /dev/null 2>&1
+    echo "$MODEL_NAME"
+)
+assert_eq "USE_DOTENV=true: .env wins over profile" "dotenv-override" "$result"
+
+# Case: USE_DOTENV=true, env file lacks MODEL_NAME — the profile value must stay
+result=$(
+    source "$SCRIPT_DIR/../common.sh"
+    TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT
+    printf 'SOME_OTHER_VAR=x\n' > "$TMP"
+    export USE_DOTENV=true
+    apply_model_config "gpt-oss" "$TMP" > /dev/null 2>&1
+    echo "$MODEL_NAME"
+)
+assert_eq "USE_DOTENV=true: profile value kept when .env omits var" "openai/gpt-oss-120b" "$result"
+
+# Case: USE_DOTENV=true with an empty profile argument — gpt-oss is used as the base
+result=$(
+    source "$SCRIPT_DIR/../common.sh"
+    TMP=$(mktemp); trap 'rm -f "$TMP"' EXIT
+    printf 'SOME_OTHER_VAR=x\n' > "$TMP"
+    export USE_DOTENV=true
+    apply_model_config "" "$TMP" > /dev/null 2>&1
+    echo "$MODEL_NAME"
+)
+assert_eq "USE_DOTENV=true, no profile: defaults to gpt-oss" "openai/gpt-oss-120b" "$result"
+
+# ─── Summary ─────────────────────────────────────────────────────────────────
+
+echo ""
+echo "Results: $PASS passed, $FAIL failed"
+[[ $FAIL -eq 0 ]]  # last command: the script's exit status is non-zero when any case failed
diff --git a/benchmarks/m3/compare.sh b/benchmarks/m3/compare.sh
@@ -48,6 +48,7 @@ COMPARE_POLICIES="${COMPARE_POLICIES:-false}"
 GLOBAL_NO_POLICIES="${GLOBAL_NO_POLICIES:-false}"
 NO_BUNDLE="${NO_BUNDLE:-false}"
 BUNDLE_ZIP="${BUNDLE_ZIP:-false}"
+USE_DOTENV="${USE_DOTENV:-false}"
 FORWARDED_ARGS=()
 
 # Parse arguments
@@ -96,6 +97,10 @@ while [[ $idx -lt ${#ARGS[@]} ]]; do
             BUNDLE_ZIP=true
             idx=$((idx+1))
             ;;
+        --dotenv)
+            USE_DOTENV=true
+            idx=$((idx+1))
+            ;;
         --dry-run)
             DRY_RUN=true
             idx=$((idx+1))
@@ -441,8 +446,8 @@ for config in "${CONFIGS[@]}"; do
     echo -e "${CYAN:-}Configuration: ${config}${NC:-}"
     echo -e "${BLUE:-}══════════════════════════════════════════════════════════════${NC:-}"
 
-    if type apply_model_profile &>/dev/null; then
-        apply_model_profile "$model"
+    if type apply_model_config &>/dev/null; then
+        apply_model_config "$model"
     fi
 
     # Per-config extra args (e.g., --no-policies when comparing policy modes).

diff --git a/benchmarks/oak_health_insurance/compare.sh b/benchmarks/oak_health_insurance/compare.sh
@@ -36,6 +36,7 @@ AGENTS="${AGENTS:-}"
 COMPARE_AGENTS="${COMPARE_AGENTS:-false}"
 NO_BUNDLE="${NO_BUNDLE:-false}"
 BUNDLE_ZIP="${BUNDLE_ZIP:-false}"
+USE_DOTENV="${USE_DOTENV:-false}"
 FORWARDED_ARGS=()
 
 # Parse arguments
@@ -76,6 +77,10 @@ while [[ $idx -lt ${#ARGS[@]} ]]; do
             BUNDLE_ZIP=true
             idx=$((idx+1))
             ;;
+        --dotenv)
+            USE_DOTENV=true
+            idx=$((idx+1))
+            ;;
         --dry-run)
             DRY_RUN=true
             idx=$((idx+1))
@@ -155,8 +160,8 @@ for model in "${MODEL_LIST[@]}"; do
     echo -e "${CYAN:-}Model: ${model}${NC:-}"
     echo -e "${BLUE:-}══════════════════════════════════════════════════════════════${NC:-}"
 
-    if type apply_model_profile &>/dev/null; then
-        apply_model_profile "$model"
+    if type apply_model_config &>/dev/null; then
+        apply_model_config "$model"
     fi
 
     # Snapshot existing result files and trajectory folders before this model's runs

diff --git a/scripts/compare.sh b/scripts/compare.sh
@@ -34,6 +34,7 @@ for arg in "${FORWARDED_ARGS[@]}"; do
         echo "  --runs <N>                Number of runs (default: 1)"
         echo "  --output, -o <file>       Save comparison report to file"
         echo "  --model-profile <name>    Model profile (gpt-oss, gpt4o, gpt4.1, opus4.5)"
+        echo "  --dotenv                  Use .env values to override the model profile"
         echo "  --dry-run                 Print what would be run without executing"
         echo "  --no-bundle               Skip reproducibility bundle creation"
         echo "  --verbose, -v             Enable verbose output"
@@ -95,7 +96,7 @@ finalize_model_config
 check_langfuse_env
 
 # Export common variables
-export RUNS OUTPUT_FILE DRY_RUN NO_BUNDLE BUNDLE_ZIP MODEL_PROFILE VERBOSE AGENT AGENTS COMPARE_AGENTS
+export RUNS OUTPUT_FILE DRY_RUN NO_BUNDLE BUNDLE_ZIP MODEL_PROFILE VERBOSE AGENT AGENTS COMPARE_AGENTS USE_DOTENV
 
 # Banner: when comparing multiple agents, show the agent list instead of the singular AGENT.
 BANNER_AGENT_LABEL="$AGENT"

diff --git a/scripts/eval.sh b/scripts/eval.sh
@@ -30,6 +30,7 @@ for arg in "${FORWARDED_ARGS[@]}"; do
         echo "  --benchmark, -b <name>    Benchmark to run (required)"
         echo "  --agent <name>            Agent to run (cuga, react; default: cuga)"
         echo "  --model-profile <name>    Model profile (gpt-oss, gpt4o, gpt4.1, opus4.5)"
+        echo "  --dotenv                  Use .env values to override the model profile"
         echo "  --verbose, -v             Enable verbose output"
         echo "  --no-bundle               Skip reproducibility bundle creation"
         echo "  --bundle-zip              Create zip archive of bundle"
@@ -86,7 +87,7 @@ finalize_model_config
 check_langfuse_env
 
 # Export common variables for the benchmark script
-export NO_BUNDLE BUNDLE_ZIP MODEL_PROFILE VERBOSE AGENT
+export NO_BUNDLE BUNDLE_ZIP MODEL_PROFILE VERBOSE AGENT USE_DOTENV
 
 echo -e "${BLUE}╔════════════════════════════════════════════════════════════╗${NC}"
 echo -e "${BLUE}║  Evaluation: ${BENCHMARK} [${AGENT}]$(printf '%*s' $((34 - ${#BENCHMARK} - ${#AGENT})) '')║${NC}"